From 3a6efdd03c46c5ba08e43880d34260c02dd9999b Mon Sep 17 00:00:00 2001 From: Romain D <90720+Artefact2@users.noreply.github.com> Date: Mon, 18 Mar 2024 09:04:41 +0100 Subject: [PATCH 01/28] convert : use f32 outtype for bf16 tensors (#6106) The old behaviour is to use f16, but bf16 to f16 is not a lossless conversion. Change the outtype to f32 to default to a lossless conversion. --- convert.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/convert.py b/convert.py index 161430f3e..817cb6612 100755 --- a/convert.py +++ b/convert.py @@ -1167,9 +1167,9 @@ class OutputFile: def pick_output_type(model: LazyModel, output_type_str: str | None) -> GGMLFileType: wq_type = model[gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.ATTN_Q].format(bid=0) + ".weight"].data_type - if output_type_str == "f32" or (output_type_str is None and wq_type == DT_F32): + if output_type_str == "f32" or (output_type_str is None and wq_type in (DT_F32, DT_BF16)): return GGMLFileType.AllF32 - if output_type_str == "f16" or (output_type_str is None and wq_type in (DT_F16, DT_BF16)): + if output_type_str == "f16" or (output_type_str is None and wq_type == DT_F16): return GGMLFileType.MostlyF16 if output_type_str == "q8_0": return GGMLFileType.MostlyQ8_0 From 9b03719ad712e2dc36c5c0c20f352bf3e4bda332 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Th=C3=A9rence?= <13496987+Royalphax@users.noreply.github.com> Date: Mon, 18 Mar 2024 09:17:00 +0100 Subject: [PATCH 02/28] convert : add support for CamembertModel architecture (#6119) Adding support for CamembertModel architecture used by: https://huggingface.co/dangvantuan/sentence-camembert-large --- convert-hf-to-gguf.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/convert-hf-to-gguf.py b/convert-hf-to-gguf.py index cf1f98d66..1e49d56c1 100755 --- a/convert-hf-to-gguf.py +++ b/convert-hf-to-gguf.py @@ -1634,7 +1634,7 @@ in chat mode so that the conversation can end normally.") self.post_write_tensors(tensor_map, name, data_torch) -@Model.register("BertModel") +@Model.register("BertModel", "CamembertModel") class BertModel(Model): model_arch = gguf.MODEL_ARCH.BERT From 496bc79bc2b79bfd6124b8687a8dbd6a646e9b06 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?DAN=E2=84=A2?= Date: Mon, 18 Mar 2024 04:27:44 -0400 Subject: [PATCH 03/28] common : tidy-up argument parsing (#6105) * Tidy-up argument parsing. * Missing ref.
* common : minor * common : add static classifier --------- Co-authored-by: Georgi Gerganov --- common/common.cpp | 2057 +++++++++++++++++++++++---------------------- 1 file changed, 1035 insertions(+), 1022 deletions(-) diff --git a/common/common.cpp b/common/common.cpp index 2f5d965d6..919182862 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -154,6 +154,1040 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) { return result; } +static bool gpt_params_find_arg(int argc, char ** argv, gpt_params & params, int & i, bool & invalid_param) { + std::string arg = argv[i]; + llama_sampling_params& sparams = params.sparams; + + if (arg == "-s" || arg == "--seed") { + if (++i >= argc) { + invalid_param = true; + return true; + } + params.seed = std::stoul(argv[i]); + return true; + } + if (arg == "-t" || arg == "--threads") { + if (++i >= argc) { + invalid_param = true; + return true; + } + params.n_threads = std::stoi(argv[i]); + if (params.n_threads <= 0) { + params.n_threads = std::thread::hardware_concurrency(); + } + return true; + } + if (arg == "-tb" || arg == "--threads-batch") { + if (++i >= argc) { + invalid_param = true; + return true; + } + params.n_threads_batch = std::stoi(argv[i]); + if (params.n_threads_batch <= 0) { + params.n_threads_batch = std::thread::hardware_concurrency(); + } + return true; + } + if (arg == "-td" || arg == "--threads-draft") { + if (++i >= argc) { + invalid_param = true; + return true; + } + params.n_threads_draft = std::stoi(argv[i]); + if (params.n_threads_draft <= 0) { + params.n_threads_draft = std::thread::hardware_concurrency(); + } + return true; + } + if (arg == "-tbd" || arg == "--threads-batch-draft") { + if (++i >= argc) { + invalid_param = true; + return true; + } + params.n_threads_batch_draft = std::stoi(argv[i]); + if (params.n_threads_batch_draft <= 0) { + params.n_threads_batch_draft = std::thread::hardware_concurrency(); + } + return true; + } + if (arg == "-p" || arg == "--prompt") { + if (++i >= argc) { + invalid_param = true; + return true; + } + params.prompt = argv[i]; + return true; + } + if (arg == "-e" || arg == "--escape") { + params.escape = true; + return true; + } + if (arg == "--prompt-cache") { + if (++i >= argc) { + invalid_param = true; + return true; + } + params.path_prompt_cache = argv[i]; + return true; + } + if (arg == "--prompt-cache-all") { + params.prompt_cache_all = true; + return true; + } + if (arg == "--prompt-cache-ro") { + params.prompt_cache_ro = true; + return true; + } + if (arg == "-bf" || arg == "--binary-file") { + if (++i >= argc) { + invalid_param = true; + return true; + } + std::ifstream file(argv[i], std::ios::binary); + if (!file) { + fprintf(stderr, "error: failed to open file '%s'\n", argv[i]); + invalid_param = true; + return true; + } + // store the external file name in params + params.prompt_file = argv[i]; + std::ostringstream ss; + ss << file.rdbuf(); + params.prompt = ss.str(); + fprintf(stderr, "Read %zu bytes from binary file %s\n", params.prompt.size(), argv[i]); + return true; + } + if (arg == "-f" || arg == "--file") { + if (++i >= argc) { + invalid_param = true; + return true; + } + std::ifstream file(argv[i]); + if (!file) { + fprintf(stderr, "error: failed to open file '%s'\n", argv[i]); + invalid_param = true; + return true; + } + // store the external file name in params + params.prompt_file = argv[i]; + std::copy(std::istreambuf_iterator<char>(file), std::istreambuf_iterator<char>(), back_inserter(params.prompt)); + if (!params.prompt.empty() &&
params.prompt.back() == '\n') { + params.prompt.pop_back(); + } + return true; + } + if (arg == "-n" || arg == "--n-predict") { + if (++i >= argc) { + invalid_param = true; + return true; + } + params.n_predict = std::stoi(argv[i]); + return true; + } + if (arg == "--top-k") { + if (++i >= argc) { + invalid_param = true; + return true; + } + sparams.top_k = std::stoi(argv[i]); + return true; + } + if (arg == "-c" || arg == "--ctx-size") { + if (++i >= argc) { + invalid_param = true; + return true; + } + params.n_ctx = std::stoi(argv[i]); + return true; + } + if (arg == "--grp-attn-n" || arg == "-gan") { + if (++i >= argc) { + invalid_param = true; + return true; + } + params.grp_attn_n = std::stoi(argv[i]); + return true; + } + if (arg == "--grp-attn-w" || arg == "-gaw") { + if (++i >= argc) { + invalid_param = true; + return true; + } + params.grp_attn_w = std::stoi(argv[i]); + return true; + } + if (arg == "--rope-freq-base") { + if (++i >= argc) { + invalid_param = true; + return true; + } + params.rope_freq_base = std::stof(argv[i]); + return true; + } + if (arg == "--rope-freq-scale") { + if (++i >= argc) { + invalid_param = true; + return true; + } + params.rope_freq_scale = std::stof(argv[i]); + return true; + } + if (arg == "--rope-scaling") { + if (++i >= argc) { + invalid_param = true; + return true; + } + std::string value(argv[i]); + /**/ if (value == "none") { params.rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_NONE; } + else if (value == "linear") { params.rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_LINEAR; } + else if (value == "yarn") { params.rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_YARN; } + else { invalid_param = true; } + return true; + } + if (arg == "--rope-scale") { + if (++i >= argc) { + invalid_param = true; + return true; + } + params.rope_freq_scale = 1.0f / std::stof(argv[i]); + return true; + } + if (arg == "--yarn-orig-ctx") { + if (++i >= argc) { + invalid_param = true; + return true; + } + params.yarn_orig_ctx = std::stoi(argv[i]); + return true; + } + if (arg == "--yarn-ext-factor") { + if (++i >= argc) { + invalid_param = true; + return true; + } + params.yarn_ext_factor = std::stof(argv[i]); + return true; + } + if (arg == "--yarn-attn-factor") { + if (++i >= argc) { + invalid_param = true; + return true; + } + params.yarn_attn_factor = std::stof(argv[i]); + return true; + } + if (arg == "--yarn-beta-fast") { + if (++i >= argc) { + invalid_param = true; + return true; + } + params.yarn_beta_fast = std::stof(argv[i]); + return true; + } + if (arg == "--yarn-beta-slow") { + if (++i >= argc) { + invalid_param = true; + return true; + } + params.yarn_beta_slow = std::stof(argv[i]); + return true; + } + if (arg == "--pooling") { + if (++i >= argc) { + invalid_param = true; + return true; + } + std::string value(argv[i]); + /**/ if (value == "none") { params.pooling_type = LLAMA_POOLING_TYPE_NONE; } + else if (value == "mean") { params.pooling_type = LLAMA_POOLING_TYPE_MEAN; } + else if (value == "cls") { params.pooling_type = LLAMA_POOLING_TYPE_CLS; } + else { invalid_param = true; } + return true; + } + if (arg == "--defrag-thold" || arg == "-dt") { + if (++i >= argc) { + invalid_param = true; + return true; + } + params.defrag_thold = std::stof(argv[i]); + return true; + } + if (arg == "--samplers") { + if (++i >= argc) { + invalid_param = true; + return true; + } + const auto sampler_names = string_split(argv[i], ';'); + sparams.samplers_sequence = sampler_types_from_names(sampler_names, true); + return true; + } + if (arg == "--sampling-seq") { + if (++i 
>= argc) { + invalid_param = true; + return true; + } + sparams.samplers_sequence = sampler_types_from_chars(argv[i]); + return true; + } + if (arg == "--top-p") { + if (++i >= argc) { + invalid_param = true; + return true; + } + sparams.top_p = std::stof(argv[i]); + return true; + } + if (arg == "--min-p") { + if (++i >= argc) { + invalid_param = true; + return true; + } + sparams.min_p = std::stof(argv[i]); + return true; + } + if (arg == "--temp") { + if (++i >= argc) { + invalid_param = true; + return true; + } + sparams.temp = std::stof(argv[i]); + sparams.temp = std::max(sparams.temp, 0.0f); + return true; + } + if (arg == "--tfs") { + if (++i >= argc) { + invalid_param = true; + return true; + } + sparams.tfs_z = std::stof(argv[i]); + return true; + } + if (arg == "--typical") { + if (++i >= argc) { + invalid_param = true; + return true; + } + sparams.typical_p = std::stof(argv[i]); + return true; + } + if (arg == "--repeat-last-n") { + if (++i >= argc) { + invalid_param = true; + return true; + } + sparams.penalty_last_n = std::stoi(argv[i]); + sparams.n_prev = std::max(sparams.n_prev, sparams.penalty_last_n); + return true; + } + if (arg == "--repeat-penalty") { + if (++i >= argc) { + invalid_param = true; + return true; + } + sparams.penalty_repeat = std::stof(argv[i]); + return true; + } + if (arg == "--frequency-penalty") { + if (++i >= argc) { + invalid_param = true; + return true; + } + sparams.penalty_freq = std::stof(argv[i]); + return true; + } + if (arg == "--presence-penalty") { + if (++i >= argc) { + invalid_param = true; + return true; + } + sparams.penalty_present = std::stof(argv[i]); + return true; + } + if (arg == "--dynatemp-range") { + if (++i >= argc) { + invalid_param = true; + return true; + } + sparams.dynatemp_range = std::stof(argv[i]); + return true; + } + if (arg == "--dynatemp-exp") { + if (++i >= argc) { + invalid_param = true; + return true; + } + sparams.dynatemp_exponent = std::stof(argv[i]); + return true; + } + if (arg == "--mirostat") { + if (++i >= argc) { + invalid_param = true; + return true; + } + sparams.mirostat = std::stoi(argv[i]); + return true; + } + if (arg == "--mirostat-lr") { + if (++i >= argc) { + invalid_param = true; + return true; + } + sparams.mirostat_eta = std::stof(argv[i]); + return true; + } + if (arg == "--mirostat-ent") { + if (++i >= argc) { + invalid_param = true; + return true; + } + sparams.mirostat_tau = std::stof(argv[i]); + return true; + } + if (arg == "--cfg-negative-prompt") { + if (++i >= argc) { + invalid_param = true; + return true; + } + sparams.cfg_negative_prompt = argv[i]; + return true; + } + if (arg == "--cfg-negative-prompt-file") { + if (++i >= argc) { + invalid_param = true; + return true; + } + std::ifstream file(argv[i]); + if (!file) { + fprintf(stderr, "error: failed to open file '%s'\n", argv[i]); + invalid_param = true; + return true; + } + std::copy(std::istreambuf_iterator<char>(file), std::istreambuf_iterator<char>(), back_inserter(sparams.cfg_negative_prompt)); + if (!sparams.cfg_negative_prompt.empty() && sparams.cfg_negative_prompt.back() == '\n') { + sparams.cfg_negative_prompt.pop_back(); + } + return true; + } + if (arg == "--cfg-scale") { + if (++i >= argc) { + invalid_param = true; + return true; + } + sparams.cfg_scale = std::stof(argv[i]); + return true; + } + if (arg == "-b" || arg == "--batch-size") { + if (++i >= argc) { + invalid_param = true; + return true; + } + params.n_batch = std::stoi(argv[i]); + return true; + } + if (arg == "-ub" || arg == "--ubatch-size") { + if (++i >= argc) { +
invalid_param = true; + return true; + } + params.n_ubatch = std::stoi(argv[i]); + return true; + } + if (arg == "--keep") { + if (++i >= argc) { + invalid_param = true; + return true; + } + params.n_keep = std::stoi(argv[i]); + return true; + } + if (arg == "--draft") { + if (++i >= argc) { + invalid_param = true; + return true; + } + params.n_draft = std::stoi(argv[i]); + return true; + } + if (arg == "--chunks") { + if (++i >= argc) { + invalid_param = true; + return true; + } + params.n_chunks = std::stoi(argv[i]); + return true; + } + if (arg == "-np" || arg == "--parallel") { + if (++i >= argc) { + invalid_param = true; + return true; + } + params.n_parallel = std::stoi(argv[i]); + return true; + } + if (arg == "-ns" || arg == "--sequences") { + if (++i >= argc) { + invalid_param = true; + return true; + } + params.n_sequences = std::stoi(argv[i]); + return true; + } + if (arg == "--p-split" || arg == "-ps") { + if (++i >= argc) { + invalid_param = true; + return true; + } + params.p_split = std::stof(argv[i]); + return true; + } + if (arg == "-m" || arg == "--model") { + if (++i >= argc) { + invalid_param = true; + return true; + } + params.model = argv[i]; + return true; + } + if (arg == "-mu" || arg == "--model-url") { + if (++i >= argc) { + invalid_param = true; + return true; + } + params.model_url = argv[i]; + return true; + } + if (arg == "-md" || arg == "--model-draft") { + if (++i >= argc) { + invalid_param = true; + return true; + } + params.model_draft = argv[i]; + return true; + } + if (arg == "-a" || arg == "--alias") { + if (++i >= argc) { + invalid_param = true; + return true; + } + params.model_alias = argv[i]; + return true; + } + if (arg == "--lora") { + if (++i >= argc) { + invalid_param = true; + return true; + } + params.lora_adapter.emplace_back(argv[i], 1.0f); + params.use_mmap = false; + return true; + } + if (arg == "--lora-scaled") { + if (++i >= argc) { + invalid_param = true; + return true; + } + const char* lora_adapter = argv[i]; + if (++i >= argc) { + invalid_param = true; + return true; + } + params.lora_adapter.emplace_back(lora_adapter, std::stof(argv[i])); + params.use_mmap = false; + return true; + } + if (arg == "--lora-base") { + if (++i >= argc) { + invalid_param = true; + return true; + } + params.lora_base = argv[i]; + return true; + } + if (arg == "--control-vector") { + if (++i >= argc) { + invalid_param = true; + return true; + } + params.control_vectors.push_back({ 1.0f, argv[i], }); + return true; + } + if (arg == "--control-vector-scaled") { + if (++i >= argc) { + invalid_param = true; + return true; + } + const char* fname = argv[i]; + if (++i >= argc) { + invalid_param = true; + return true; + } + params.control_vectors.push_back({ std::stof(argv[i]), fname, }); + return true; + } + if (arg == "--control-vector-layer-range") { + if (++i >= argc) { + invalid_param = true; + return true; + } + params.control_vector_layer_start = std::stoi(argv[i]); + if (++i >= argc) { + invalid_param = true; + return true; + } + params.control_vector_layer_end = std::stoi(argv[i]); + return true; + } + if (arg == "--mmproj") { + if (++i >= argc) { + invalid_param = true; + return true; + } + params.mmproj = argv[i]; + return true; + } + if (arg == "--image") { + if (++i >= argc) { + invalid_param = true; + return true; + } + params.image = argv[i]; + return true; + } + if (arg == "-i" || arg == "--interactive") { + params.interactive = true; + return true; + } + if (arg == "--embedding") { + params.embedding = true; + return true; + } + if (arg == 
"--interactive-first") { + params.interactive_first = true; + return true; + } + if (arg == "-ins" || arg == "--instruct") { + params.instruct = true; + return true; + } + if (arg == "-cml" || arg == "--chatml") { + params.chatml = true; + return true; + } + if (arg == "--infill") { + params.infill = true; + return true; + } + if (arg == "-dkvc" || arg == "--dump-kv-cache") { + params.dump_kv_cache = true; + return true; + } + if (arg == "-nkvo" || arg == "--no-kv-offload") { + params.no_kv_offload = true; + return true; + } + if (arg == "-ctk" || arg == "--cache-type-k") { + params.cache_type_k = argv[++i]; + return true; + } + if (arg == "-ctv" || arg == "--cache-type-v") { + params.cache_type_v = argv[++i]; + return true; + } + if (arg == "--multiline-input") { + params.multiline_input = true; + return true; + } + if (arg == "--simple-io") { + params.simple_io = true; + return true; + } + if (arg == "-cb" || arg == "--cont-batching") { + params.cont_batching = true; + return true; + } + if (arg == "--color") { + params.use_color = true; + return true; + } + if (arg == "--mlock") { + params.use_mlock = true; + return true; + } + if (arg == "--gpu-layers" || arg == "-ngl" || arg == "--n-gpu-layers") { + if (++i >= argc) { + invalid_param = true; + return true; + } + params.n_gpu_layers = std::stoi(argv[i]); + if (!llama_supports_gpu_offload()) { + fprintf(stderr, "warning: not compiled with GPU offload support, --n-gpu-layers option will be ignored\n"); + fprintf(stderr, "warning: see main README.md for information on enabling GPU BLAS support\n"); + } + return true; + } + if (arg == "--gpu-layers-draft" || arg == "-ngld" || arg == "--n-gpu-layers-draft") { + if (++i >= argc) { + invalid_param = true; + return true; + } + params.n_gpu_layers_draft = std::stoi(argv[i]); + if (!llama_supports_gpu_offload()) { + fprintf(stderr, "warning: not compiled with GPU offload support, --n-gpu-layers-draft option will be ignored\n"); + fprintf(stderr, "warning: see main README.md for information on enabling GPU BLAS support\n"); + } + return true; + } + if (arg == "--main-gpu" || arg == "-mg") { + if (++i >= argc) { + invalid_param = true; + return true; + } + params.main_gpu = std::stoi(argv[i]); +#ifndef GGML_USE_CUBLAS_SYCL + fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS/SYCL. Setting the main GPU has no effect.\n"); +#endif // GGML_USE_CUBLAS_SYCL + return true; + } + if (arg == "--split-mode" || arg == "-sm") { + if (++i >= argc) { + invalid_param = true; + return true; + } + std::string arg_next = argv[i]; + if (arg_next == "none") { + params.split_mode = LLAMA_SPLIT_MODE_NONE; + } + else if (arg_next == "layer") { + params.split_mode = LLAMA_SPLIT_MODE_LAYER; + } + else if (arg_next == "row") { +#ifdef GGML_USE_SYCL + fprintf(stderr, "warning: The split mode value:[row] is not supported by llama.cpp with SYCL. It's developing.\nExit!\n"); + exit(1); +#endif // GGML_USE_SYCL + params.split_mode = LLAMA_SPLIT_MODE_ROW; + } + else { + invalid_param = true; + return true; + } +#ifndef GGML_USE_CUBLAS_SYCL + fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS/SYCL. 
Setting the split mode has no effect.\n"); +#endif // GGML_USE_CUBLAS_SYCL + return true; + } + if (arg == "--tensor-split" || arg == "-ts") { + if (++i >= argc) { + invalid_param = true; + return true; + } + std::string arg_next = argv[i]; + + // split string by , and / + const std::regex regex{ R"([,/]+)" }; + std::sregex_token_iterator it{ arg_next.begin(), arg_next.end(), regex, -1 }; + std::vector<std::string> split_arg{ it, {} }; + if (split_arg.size() >= llama_max_devices()) { + invalid_param = true; + return true; + } + for (size_t i = 0; i < llama_max_devices(); ++i) { + if (i < split_arg.size()) { + params.tensor_split[i] = std::stof(split_arg[i]); + } + else { + params.tensor_split[i] = 0.0f; + } + } +#ifndef GGML_USE_CUBLAS_SYCL_VULKAN + fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS/SYCL/Vulkan. Setting a tensor split has no effect.\n"); +#endif // GGML_USE_CUBLAS_SYCL + return true; + } + if (arg == "--no-mmap") { + params.use_mmap = false; + return true; + } + if (arg == "--numa") { + if (++i >= argc) { + invalid_param = true; + return true; + } + std::string value(argv[i]); + /**/ if (value == "distribute" || value == "") { params.numa = GGML_NUMA_STRATEGY_DISTRIBUTE; } + else if (value == "isolate") { params.numa = GGML_NUMA_STRATEGY_ISOLATE; } + else if (value == "numactl") { params.numa = GGML_NUMA_STRATEGY_NUMACTL; } + else { invalid_param = true; } + return true; + } + if (arg == "--verbose-prompt") { + params.verbose_prompt = true; + return true; + } + if (arg == "--no-display-prompt") { + params.display_prompt = false; + return true; + } + if (arg == "-r" || arg == "--reverse-prompt") { + if (++i >= argc) { + invalid_param = true; + return true; + } + params.antiprompt.emplace_back(argv[i]); + return true; + } + if (arg == "-ld" || arg == "--logdir") { + if (++i >= argc) { + invalid_param = true; + return true; + } + params.logdir = argv[i]; + + if (params.logdir.back() != DIRECTORY_SEPARATOR) { + params.logdir += DIRECTORY_SEPARATOR; + } + return true; + } + if (arg == "--save-all-logits" || arg == "--kl-divergence-base") { + if (++i >= argc) { + invalid_param = true; + return true; + } + params.logits_file = argv[i]; + return true; + } + if (arg == "--perplexity" || arg == "--all-logits") { + params.logits_all = true; + return true; + } + if (arg == "--ppl-stride") { + if (++i >= argc) { + invalid_param = true; + return true; + } + params.ppl_stride = std::stoi(argv[i]); + return true; + } + if (arg == "-ptc" || arg == "--print-token-count") { + if (++i >= argc) { + invalid_param = true; + return true; + } + params.n_print = std::stoi(argv[i]); + return true; + } + if (arg == "--ppl-output-type") { + if (++i >= argc) { + invalid_param = true; + return true; + } + params.ppl_output_type = std::stoi(argv[i]); + return true; + } + if (arg == "--hellaswag") { + params.hellaswag = true; + return true; + } + if (arg == "--hellaswag-tasks") { + if (++i >= argc) { + invalid_param = true; + return true; + } + params.hellaswag_tasks = std::stoi(argv[i]); + return true; + } + if (arg == "--winogrande") { + params.winogrande = true; + return true; + } + if (arg == "--winogrande-tasks") { + if (++i >= argc) { + invalid_param = true; + return true; + } + params.winogrande_tasks = std::stoi(argv[i]); + return true; + } + if (arg == "--multiple-choice") { + params.multiple_choice = true; + return true; + } + if (arg == "--multiple-choice-tasks") { + if (++i >= argc) { + invalid_param = true; + return true; + } + params.multiple_choice_tasks = std::stoi(argv[i]); + return true;
+ } + if (arg == "--kl-divergence") { + params.kl_divergence = true; + return true; + } + if (arg == "--ignore-eos") { + params.ignore_eos = true; + return true; + } + if (arg == "--no-penalize-nl") { + sparams.penalize_nl = false; + return true; + } + if (arg == "-l" || arg == "--logit-bias") { + if (++i >= argc) { + invalid_param = true; + return true; + } + std::stringstream ss(argv[i]); + llama_token key; + char sign; + std::string value_str; + try { + if (ss >> key && ss >> sign && std::getline(ss, value_str) && (sign == '+' || sign == '-')) { + sparams.logit_bias[key] = std::stof(value_str) * ((sign == '-') ? -1.0f : 1.0f); + } + else { + throw std::exception(); + } + } + catch (const std::exception&) { + invalid_param = true; + return true; + } + return true; + } + if (arg == "-h" || arg == "--help") { + return false; + } + if (arg == "--version") { + fprintf(stderr, "version: %d (%s)\n", LLAMA_BUILD_NUMBER, LLAMA_COMMIT); + fprintf(stderr, "built with %s for %s\n", LLAMA_COMPILER, LLAMA_BUILD_TARGET); + exit(0); + } + if (arg == "--random-prompt") { + params.random_prompt = true; + return true; + } + if (arg == "--in-prefix-bos") { + params.input_prefix_bos = true; + return true; + } + if (arg == "--in-prefix") { + if (++i >= argc) { + invalid_param = true; + return true; + } + params.input_prefix = argv[i]; + return true; + } + if (arg == "--in-suffix") { + if (++i >= argc) { + invalid_param = true; + return true; + } + params.input_suffix = argv[i]; + return true; + } + if (arg == "--grammar") { + if (++i >= argc) { + invalid_param = true; + return true; + } + sparams.grammar = argv[i]; + return true; + } + if (arg == "--grammar-file") { + if (++i >= argc) { + invalid_param = true; + return true; + } + std::ifstream file(argv[i]); + if (!file) { + fprintf(stderr, "error: failed to open file '%s'\n", argv[i]); + invalid_param = true; + return true; + } + std::copy( + std::istreambuf_iterator<char>(file), + std::istreambuf_iterator<char>(), + std::back_inserter(sparams.grammar) + ); + return true; + } + if (arg == "--override-kv") { + if (++i >= argc) { + invalid_param = true; + return true; + } + char* sep = strchr(argv[i], '='); + if (sep == nullptr || sep - argv[i] >= 128) { + fprintf(stderr, "error: Malformed KV override: %s\n", argv[i]); + invalid_param = true; + return true; + } + struct llama_model_kv_override kvo; + std::strncpy(kvo.key, argv[i], sep - argv[i]); + kvo.key[sep - argv[i]] = 0; + sep++; + if (strncmp(sep, "int:", 4) == 0) { + sep += 4; + kvo.tag = LLAMA_KV_OVERRIDE_TYPE_INT; + kvo.int_value = std::atol(sep); + } + else if (strncmp(sep, "float:", 6) == 0) { + sep += 6; + kvo.tag = LLAMA_KV_OVERRIDE_TYPE_FLOAT; + kvo.float_value = std::atof(sep); + } + else if (strncmp(sep, "bool:", 5) == 0) { + sep += 5; + kvo.tag = LLAMA_KV_OVERRIDE_TYPE_BOOL; + if (std::strcmp(sep, "true") == 0) { + kvo.bool_value = true; + } + else if (std::strcmp(sep, "false") == 0) { + kvo.bool_value = false; + } + else { + fprintf(stderr, "error: Invalid boolean value for KV override: %s\n", argv[i]); + invalid_param = true; + return true; + } + } + else { + fprintf(stderr, "error: Invalid type for KV override: %s\n", argv[i]); + invalid_param = true; + return true; + } + params.kv_overrides.push_back(kvo); + return true; + } +#ifndef LOG_DISABLE_LOGS + // Parse args for logging parameters + if (log_param_single_parse(argv[i])) { + // Do nothing, log_param_single_parse automatically does its thing + // and returns if a match was found and parsed.
+ return true; + } + if (log_param_pair_parse( /*check_but_dont_parse*/ true, argv[i])) { + // We have a matching known parameter requiring an argument, + // now we need to check if there is anything after this argv + // and flag invalid_param or parse it. + if (++i >= argc) { + invalid_param = true; + return true; + } + if (!log_param_pair_parse( /*check_but_dont_parse*/ false, argv[i - 1], argv[i])) { + invalid_param = true; + return true; + } + return true; + } + // End of Parse args for logging parameters +#endif // LOG_DISABLE_LOGS + + return false; +} + bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) { bool invalid_param = false; std::string arg; @@ -166,1028 +1200,7 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) { std::replace(arg.begin(), arg.end(), '_', '-'); } - bool arg_found = false; - if (arg == "-s" || arg == "--seed") { - arg_found = true; - if (++i >= argc) { - invalid_param = true; - break; - } - params.seed = std::stoul(argv[i]); - } - if (arg == "-t" || arg == "--threads") { - arg_found = true; - if (++i >= argc) { - invalid_param = true; - break; - } - params.n_threads = std::stoi(argv[i]); - if (params.n_threads <= 0) { - params.n_threads = std::thread::hardware_concurrency(); - } - } - if (arg == "-tb" || arg == "--threads-batch") { - arg_found = true; - if (++i >= argc) { - invalid_param = true; - break; - } - params.n_threads_batch = std::stoi(argv[i]); - if (params.n_threads_batch <= 0) { - params.n_threads_batch = std::thread::hardware_concurrency(); - } - } - if (arg == "-td" || arg == "--threads-draft") { - arg_found = true; - if (++i >= argc) { - invalid_param = true; - break; - } - params.n_threads_draft = std::stoi(argv[i]); - if (params.n_threads_draft <= 0) { - params.n_threads_draft = std::thread::hardware_concurrency(); - } - } - if (arg == "-tbd" || arg == "--threads-batch-draft") { - arg_found = true; - if (++i >= argc) { - invalid_param = true; - break; - } - params.n_threads_batch_draft = std::stoi(argv[i]); - if (params.n_threads_batch_draft <= 0) { - params.n_threads_batch_draft = std::thread::hardware_concurrency(); - } - } - if (arg == "-p" || arg == "--prompt") { - arg_found = true; - if (++i >= argc) { - invalid_param = true; - break; - } - params.prompt = argv[i]; - } - if (arg == "-e" || arg == "--escape") { - arg_found = true; - params.escape = true; - } - if (arg == "--prompt-cache") { - arg_found = true; - if (++i >= argc) { - invalid_param = true; - break; - } - params.path_prompt_cache = argv[i]; - } - if (arg == "--prompt-cache-all") { - arg_found = true; - params.prompt_cache_all = true; - } - if (arg == "--prompt-cache-ro") { - arg_found = true; - params.prompt_cache_ro = true; - } - if (arg == "-bf" || arg == "--binary-file") { - arg_found = true; - if (++i >= argc) { - invalid_param = true; - break; - } - std::ifstream file(argv[i], std::ios::binary); - if (!file) { - fprintf(stderr, "error: failed to open file '%s'\n", argv[i]); - invalid_param = true; - break; - } - // store the external file name in params - params.prompt_file = argv[i]; - std::ostringstream ss; - ss << file.rdbuf(); - params.prompt = ss.str(); - fprintf(stderr, "Read %zu bytes from binary file %s\n", params.prompt.size(), argv[i]); - } - if (arg == "-f" || arg == "--file") { - arg_found = true; - if (++i >= argc) { - invalid_param = true; - break; - } - std::ifstream file(argv[i]); - if (!file) { - fprintf(stderr, "error: failed to open file '%s'\n", argv[i]); - invalid_param = true; - break; - } - // store the 
external file name in params - params.prompt_file = argv[i]; - std::copy(std::istreambuf_iterator<char>(file), std::istreambuf_iterator<char>(), back_inserter(params.prompt)); - if (!params.prompt.empty() && params.prompt.back() == '\n') { - params.prompt.pop_back(); - } - } - if (arg == "-n" || arg == "--n-predict") { - arg_found = true; - if (++i >= argc) { - invalid_param = true; - break; - } - params.n_predict = std::stoi(argv[i]); - } - if (arg == "--top-k") { - arg_found = true; - if (++i >= argc) { - invalid_param = true; - break; - } - sparams.top_k = std::stoi(argv[i]); - } - if (arg == "-c" || arg == "--ctx-size") { - arg_found = true; - if (++i >= argc) { - invalid_param = true; - break; - } - params.n_ctx = std::stoi(argv[i]); - } - if (arg == "--grp-attn-n" || arg == "-gan") { - arg_found = true; - if (++i >= argc) { - invalid_param = true; - break; - } - - params.grp_attn_n = std::stoi(argv[i]); - } - if (arg == "--grp-attn-w" || arg == "-gaw") { - arg_found = true; - if (++i >= argc) { - invalid_param = true; - break; - } - - params.grp_attn_w = std::stoi(argv[i]); - } - if (arg == "--rope-freq-base") { - arg_found = true; - if (++i >= argc) { - invalid_param = true; - break; - } - params.rope_freq_base = std::stof(argv[i]); - } - if (arg == "--rope-freq-scale") { - arg_found = true; - if (++i >= argc) { - invalid_param = true; - break; - } - params.rope_freq_scale = std::stof(argv[i]); - } - if (arg == "--rope-scaling") { - arg_found = true; - if (++i >= argc) { - invalid_param = true; - break; - } - std::string value(argv[i]); - /**/ if (value == "none") { params.rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_NONE; } - else if (value == "linear") { params.rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_LINEAR; } - else if (value == "yarn") { params.rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_YARN; } - else { invalid_param = true; break; } - } - if (arg == "--rope-scale") { - arg_found = true; - if (++i >= argc) { - invalid_param = true; - break; - } - params.rope_freq_scale = 1.0f/std::stof(argv[i]); - } - if (arg == "--yarn-orig-ctx") { - arg_found = true; - if (++i >= argc) { - invalid_param = true; - break; - } - params.yarn_orig_ctx = std::stoi(argv[i]); - } - if (arg == "--yarn-ext-factor") { - arg_found = true; - if (++i >= argc) { - invalid_param = true; - break; - } - params.yarn_ext_factor = std::stof(argv[i]); - } - if (arg == "--yarn-attn-factor") { - arg_found = true; - if (++i >= argc) { - invalid_param = true; - break; - } - params.yarn_attn_factor = std::stof(argv[i]); - } - if (arg == "--yarn-beta-fast") { - arg_found = true; - if (++i >= argc) { - invalid_param = true; - break; - } - params.yarn_beta_fast = std::stof(argv[i]); - } - if (arg == "--yarn-beta-slow") { - arg_found = true; - if (++i >= argc) { - invalid_param = true; - break; - } - params.yarn_beta_slow = std::stof(argv[i]); - } - if (arg == "--pooling") { - arg_found = true; - if (++i >= argc) { - invalid_param = true; - break; - } - std::string value(argv[i]); - /**/ if (value == "none") { params.pooling_type = LLAMA_POOLING_TYPE_NONE; } - else if (value == "mean") { params.pooling_type = LLAMA_POOLING_TYPE_MEAN; } - else if (value == "cls") { params.pooling_type = LLAMA_POOLING_TYPE_CLS; } - else { invalid_param = true; break; } - } - if (arg == "--defrag-thold" || arg == "-dt") { - arg_found = true; - if (++i >= argc) { - invalid_param = true; - break; - } - params.defrag_thold = std::stof(argv[i]); - } - if (arg == "--samplers") { - arg_found = true; - if (++i >= argc) { - invalid_param = true; - break; - }
- const auto sampler_names = string_split(argv[i], ';'); - sparams.samplers_sequence = sampler_types_from_names(sampler_names, true); - } - if (arg == "--sampling-seq") { - arg_found = true; - if (++i >= argc) { - invalid_param = true; - break; - } - sparams.samplers_sequence = sampler_types_from_chars(argv[i]); - } - if (arg == "--top-p") { - arg_found = true; - if (++i >= argc) { - invalid_param = true; - break; - } - sparams.top_p = std::stof(argv[i]); - } - if (arg == "--min-p") { - arg_found = true; - if (++i >= argc) { - invalid_param = true; - break; - } - sparams.min_p = std::stof(argv[i]); - } - if (arg == "--temp") { - arg_found = true; - if (++i >= argc) { - invalid_param = true; - break; - } - sparams.temp = std::stof(argv[i]); - sparams.temp = std::max(sparams.temp, 0.0f); - } - if (arg == "--tfs") { - arg_found = true; - if (++i >= argc) { - invalid_param = true; - break; - } - sparams.tfs_z = std::stof(argv[i]); - } - if (arg == "--typical") { - arg_found = true; - if (++i >= argc) { - invalid_param = true; - break; - } - sparams.typical_p = std::stof(argv[i]); - } - if (arg == "--repeat-last-n") { - arg_found = true; - if (++i >= argc) { - invalid_param = true; - break; - } - sparams.penalty_last_n = std::stoi(argv[i]); - sparams.n_prev = std::max(sparams.n_prev, sparams.penalty_last_n); - } - if (arg == "--repeat-penalty") { - arg_found = true; - if (++i >= argc) { - invalid_param = true; - break; - } - sparams.penalty_repeat = std::stof(argv[i]); - } - if (arg == "--frequency-penalty") { - arg_found = true; - if (++i >= argc) { - invalid_param = true; - break; - } - sparams.penalty_freq = std::stof(argv[i]); - } - if (arg == "--presence-penalty") { - arg_found = true; - if (++i >= argc) { - invalid_param = true; - break; - } - sparams.penalty_present = std::stof(argv[i]); - } - if (arg == "--dynatemp-range") { - arg_found = true; - if (++i >= argc) { - invalid_param = true; - break; - } - sparams.dynatemp_range = std::stof(argv[i]); - } - if (arg == "--dynatemp-exp") { - arg_found = true; - if (++i >= argc) { - invalid_param = true; - break; - } - sparams.dynatemp_exponent = std::stof(argv[i]); - } - if (arg == "--mirostat") { - arg_found = true; - if (++i >= argc) { - invalid_param = true; - break; - } - sparams.mirostat = std::stoi(argv[i]); - } - if (arg == "--mirostat-lr") { - arg_found = true; - if (++i >= argc) { - invalid_param = true; - break; - } - sparams.mirostat_eta = std::stof(argv[i]); - } - if (arg == "--mirostat-ent") { - arg_found = true; - if (++i >= argc) { - invalid_param = true; - break; - } - sparams.mirostat_tau = std::stof(argv[i]); - } - if (arg == "--cfg-negative-prompt") { - arg_found = true; - if (++i >= argc) { - invalid_param = true; - break; - } - sparams.cfg_negative_prompt = argv[i]; - } - if (arg == "--cfg-negative-prompt-file") { - arg_found = true; - if (++i >= argc) { - invalid_param = true; - break; - } - std::ifstream file(argv[i]); - if (!file) { - fprintf(stderr, "error: failed to open file '%s'\n", argv[i]); - invalid_param = true; - break; - } - std::copy(std::istreambuf_iterator<char>(file), std::istreambuf_iterator<char>(), back_inserter(sparams.cfg_negative_prompt)); - if (!sparams.cfg_negative_prompt.empty() && sparams.cfg_negative_prompt.back() == '\n') { - sparams.cfg_negative_prompt.pop_back(); - } - } - if (arg == "--cfg-scale") { - arg_found = true; - if (++i >= argc) { - invalid_param = true; - break; - } - sparams.cfg_scale = std::stof(argv[i]); - } - if (arg == "-b" || arg == "--batch-size") { - arg_found = true; - if (++i >=
argc) { - invalid_param = true; - break; - } - params.n_batch = std::stoi(argv[i]); - } - if (arg == "-ub" || arg == "--ubatch-size") { - arg_found = true; - if (++i >= argc) { - invalid_param = true; - break; - } - params.n_ubatch = std::stoi(argv[i]); - } - if (arg == "--keep") { - arg_found = true; - if (++i >= argc) { - invalid_param = true; - break; - } - params.n_keep = std::stoi(argv[i]); - } - if (arg == "--draft") { - arg_found = true; - if (++i >= argc) { - invalid_param = true; - break; - } - params.n_draft = std::stoi(argv[i]); - } - if (arg == "--chunks") { - arg_found = true; - if (++i >= argc) { - invalid_param = true; - break; - } - params.n_chunks = std::stoi(argv[i]); - } - if (arg == "-np" || arg == "--parallel") { - arg_found = true; - if (++i >= argc) { - invalid_param = true; - break; - } - params.n_parallel = std::stoi(argv[i]); - } - if (arg == "-ns" || arg == "--sequences") { - arg_found = true; - if (++i >= argc) { - invalid_param = true; - break; - } - params.n_sequences = std::stoi(argv[i]); - } - if (arg == "--p-split" || arg == "-ps") { - arg_found = true; - if (++i >= argc) { - invalid_param = true; - break; - } - params.p_split = std::stof(argv[i]); - } - if (arg == "-m" || arg == "--model") { - arg_found = true; - if (++i >= argc) { - invalid_param = true; - break; - } - params.model = argv[i]; - } - if (arg == "-mu" || arg == "--model-url") { - if (++i >= argc) { - invalid_param = true; - break; - } - params.model_url = argv[i]; - } - if (arg == "-md" || arg == "--model-draft") { - arg_found = true; - if (++i >= argc) { - invalid_param = true; - break; - } - params.model_draft = argv[i]; - } - if (arg == "-a" || arg == "--alias") { - arg_found = true; - if (++i >= argc) { - invalid_param = true; - break; - } - params.model_alias = argv[i]; - } - if (arg == "--lora") { - arg_found = true; - if (++i >= argc) { - invalid_param = true; - break; - } - params.lora_adapter.emplace_back(argv[i], 1.0f); - params.use_mmap = false; - } - if (arg == "--lora-scaled") { - arg_found = true; - if (++i >= argc) { - invalid_param = true; - break; - } - const char * lora_adapter = argv[i]; - if (++i >= argc) { - invalid_param = true; - break; - } - params.lora_adapter.emplace_back(lora_adapter, std::stof(argv[i])); - params.use_mmap = false; - } - if (arg == "--lora-base") { - arg_found = true; - if (++i >= argc) { - invalid_param = true; - break; - } - params.lora_base = argv[i]; - } - if (arg == "--control-vector") { - arg_found = true; - if (++i >= argc) { - invalid_param = true; - break; - } - params.control_vectors.push_back({ 1.0f, argv[i], }); - } - if (arg == "--control-vector-scaled") { - arg_found = true; - if (++i >= argc) { - invalid_param = true; - break; - } - const char * fname = argv[i]; - if (++i >= argc) { - invalid_param = true; - break; - } - params.control_vectors.push_back({ std::stof(argv[i]), fname, }); - } - if (arg == "--control-vector-layer-range") { - arg_found = true; - if (++i >= argc) { - invalid_param = true; - break; - } - params.control_vector_layer_start = std::stoi(argv[i]); - if (++i >= argc) { - invalid_param = true; - break; - } - params.control_vector_layer_end = std::stoi(argv[i]); - } - if (arg == "--mmproj") { - arg_found = true; - if (++i >= argc) { - invalid_param = true; - break; - } - params.mmproj = argv[i]; - } - if (arg == "--image") { - arg_found = true; - if (++i >= argc) { - invalid_param = true; - break; - } - params.image = argv[i]; - } - if (arg == "-i" || arg == "--interactive") { - arg_found = true; - 
params.interactive = true; - } - if (arg == "--embedding") { - arg_found = true; - params.embedding = true; - } - if (arg == "--interactive-first") { - arg_found = true; - params.interactive_first = true; - } - if (arg == "-ins" || arg == "--instruct") { - arg_found = true; - params.instruct = true; - } - if (arg == "-cml" || arg == "--chatml") { - arg_found = true; - params.chatml = true; - } - if (arg == "--infill") { - arg_found = true; - params.infill = true; - } - if (arg == "-dkvc" || arg == "--dump-kv-cache") { - arg_found = true; - params.dump_kv_cache = true; - } - if (arg == "-nkvo" || arg == "--no-kv-offload") { - arg_found = true; - params.no_kv_offload = true; - } - if (arg == "-ctk" || arg == "--cache-type-k") { - arg_found = true; - params.cache_type_k = argv[++i]; - } - if (arg == "-ctv" || arg == "--cache-type-v") { - arg_found = true; - params.cache_type_v = argv[++i]; - } - if (arg == "--multiline-input") { - arg_found = true; - params.multiline_input = true; - } - if (arg == "--simple-io") { - arg_found = true; - params.simple_io = true; - } - if (arg == "-cb" || arg == "--cont-batching") { - arg_found = true; - params.cont_batching = true; - } - if (arg == "--color") { - arg_found = true; - params.use_color = true; - } - if (arg == "--mlock") { - arg_found = true; - params.use_mlock = true; - } - if (arg == "--gpu-layers" || arg == "-ngl" || arg == "--n-gpu-layers") { - arg_found = true; - if (++i >= argc) { - invalid_param = true; - break; - } - params.n_gpu_layers = std::stoi(argv[i]); - if (!llama_supports_gpu_offload()) { - fprintf(stderr, "warning: not compiled with GPU offload support, --n-gpu-layers option will be ignored\n"); - fprintf(stderr, "warning: see main README.md for information on enabling GPU BLAS support\n"); - } - } - if (arg == "--gpu-layers-draft" || arg == "-ngld" || arg == "--n-gpu-layers-draft") { - arg_found = true; - if (++i >= argc) { - invalid_param = true; - break; - } - params.n_gpu_layers_draft = std::stoi(argv[i]); - if (!llama_supports_gpu_offload()) { - fprintf(stderr, "warning: not compiled with GPU offload support, --n-gpu-layers-draft option will be ignored\n"); - fprintf(stderr, "warning: see main README.md for information on enabling GPU BLAS support\n"); - } - } - if (arg == "--main-gpu" || arg == "-mg") { - arg_found = true; - if (++i >= argc) { - invalid_param = true; - break; - } - params.main_gpu = std::stoi(argv[i]); -#ifndef GGML_USE_CUBLAS_SYCL - fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS/SYCL. Setting the main GPU has no effect.\n"); -#endif // GGML_USE_CUBLAS_SYCL - } - if (arg == "--split-mode" || arg == "-sm") { - arg_found = true; - if (++i >= argc) { - invalid_param = true; - break; - } - std::string arg_next = argv[i]; - if (arg_next == "none") { - params.split_mode = LLAMA_SPLIT_MODE_NONE; - } else if (arg_next == "layer") { - params.split_mode = LLAMA_SPLIT_MODE_LAYER; - } else if (arg_next == "row") { -#ifdef GGML_USE_SYCL - fprintf(stderr, "warning: The split mode value:[row] is not supported by llama.cpp with SYCL. It's developing.\nExit!\n"); - exit(1); -#endif // GGML_USE_SYCL - params.split_mode = LLAMA_SPLIT_MODE_ROW; - } else { - invalid_param = true; - break; - } -#ifndef GGML_USE_CUBLAS_SYCL - fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS/SYCL. 
Setting the split mode has no effect.\n"); -#endif // GGML_USE_CUBLAS_SYCL - - } - if (arg == "--tensor-split" || arg == "-ts") { - arg_found = true; - if (++i >= argc) { - invalid_param = true; - break; - } - std::string arg_next = argv[i]; - - // split string by , and / - const std::regex regex{R"([,/]+)"}; - std::sregex_token_iterator it{arg_next.begin(), arg_next.end(), regex, -1}; - std::vector<std::string> split_arg{it, {}}; - if (split_arg.size() >= llama_max_devices()) { - invalid_param = true; - break; - } - for (size_t i = 0; i < llama_max_devices(); ++i) { - if (i < split_arg.size()) { - params.tensor_split[i] = std::stof(split_arg[i]); - } else { - params.tensor_split[i] = 0.0f; - } - } -#ifndef GGML_USE_CUBLAS_SYCL_VULKAN - fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS/SYCL/Vulkan. Setting a tensor split has no effect.\n"); -#endif // GGML_USE_CUBLAS_SYCL - } - if (arg == "--no-mmap") { - arg_found = true; - params.use_mmap = false; - } - if (arg == "--numa") { - arg_found = true; - if (++i >= argc) { - invalid_param = true; - break; - } - std::string value(argv[i]); - /**/ if (value == "distribute" || value == "") { params.numa = GGML_NUMA_STRATEGY_DISTRIBUTE; } - else if (value == "isolate") { params.numa = GGML_NUMA_STRATEGY_ISOLATE; } - else if (value == "numactl") { params.numa = GGML_NUMA_STRATEGY_NUMACTL; } - else { invalid_param = true; break; } - } - if (arg == "--verbose-prompt") { - arg_found = true; - params.verbose_prompt = true; - } - if (arg == "--no-display-prompt") { - arg_found = true; - params.display_prompt = false; - } - if (arg == "-r" || arg == "--reverse-prompt") { - arg_found = true; - if (++i >= argc) { - invalid_param = true; - break; - } - params.antiprompt.emplace_back(argv[i]); - } - if (arg == "-ld" || arg == "--logdir") { - arg_found = true; - if (++i >= argc) { - invalid_param = true; - break; - } - params.logdir = argv[i]; - - if (params.logdir.back() != DIRECTORY_SEPARATOR) { - params.logdir += DIRECTORY_SEPARATOR; - } - } - if (arg == "--save-all-logits" || arg == "--kl-divergence-base") { - arg_found = true; - if (++i >= argc) { - invalid_param = true; - break; - } - params.logits_file = argv[i]; - } - if (arg == "--perplexity" || arg == "--all-logits") { - arg_found = true; - params.logits_all = true; - } - if (arg == "--ppl-stride") { - arg_found = true; - if (++i >= argc) { - invalid_param = true; - break; - } - params.ppl_stride = std::stoi(argv[i]); - } - if (arg == "-ptc" || arg == "--print-token-count") { - arg_found = true; - if (++i >= argc) { - invalid_param = true; - break; - } - params.n_print = std::stoi(argv[i]); - } - if (arg == "--ppl-output-type") { - arg_found = true; - if (++i >= argc) { - invalid_param = true; - break; - } - params.ppl_output_type = std::stoi(argv[i]); - } - if (arg == "--hellaswag") { - arg_found = true; - params.hellaswag = true; - } - if (arg == "--hellaswag-tasks") { - arg_found = true; - if (++i >= argc) { - invalid_param = true; - break; - } - params.hellaswag_tasks = std::stoi(argv[i]); - } - if (arg == "--winogrande") { - arg_found = true; - params.winogrande = true; - } - if (arg == "--winogrande-tasks") { - arg_found = true; - if (++i >= argc) { - invalid_param = true; - break; - } - params.winogrande_tasks = std::stoi(argv[i]); - } - if (arg == "--multiple-choice") { - arg_found = true; - params.multiple_choice = true; - } - if (arg == "--multiple-choice-tasks") { - arg_found = true; - if (++i >= argc) { - invalid_param = true; - break; - } - params.multiple_choice_tasks =
std::stoi(argv[i]); - } - if (arg == "--kl-divergence") { - arg_found = true; - params.kl_divergence = true; - } - if (arg == "--ignore-eos") { - arg_found = true; - params.ignore_eos = true; - } - if (arg == "--no-penalize-nl") { - arg_found = true; - sparams.penalize_nl = false; - } - if (arg == "-l" || arg == "--logit-bias") { - arg_found = true; - if (++i >= argc) { - invalid_param = true; - break; - } - std::stringstream ss(argv[i]); - llama_token key; - char sign; - std::string value_str; - try { - if (ss >> key && ss >> sign && std::getline(ss, value_str) && (sign == '+' || sign == '-')) { - sparams.logit_bias[key] = std::stof(value_str) * ((sign == '-') ? -1.0f : 1.0f); - } else { - throw std::exception(); - } - } catch (const std::exception&) { - invalid_param = true; - break; - } - } - if (arg == "-h" || arg == "--help") { - arg_found = true; - return false; - } - if (arg == "--version") { - arg_found = true; - fprintf(stderr, "version: %d (%s)\n", LLAMA_BUILD_NUMBER, LLAMA_COMMIT); - fprintf(stderr, "built with %s for %s\n", LLAMA_COMPILER, LLAMA_BUILD_TARGET); - exit(0); - } - if (arg == "--random-prompt") { - arg_found = true; - params.random_prompt = true; - } - if (arg == "--in-prefix-bos") { - arg_found = true; - params.input_prefix_bos = true; - } - if (arg == "--in-prefix") { - arg_found = true; - if (++i >= argc) { - invalid_param = true; - break; - } - params.input_prefix = argv[i]; - } - if (arg == "--in-suffix") { - arg_found = true; - if (++i >= argc) { - invalid_param = true; - break; - } - params.input_suffix = argv[i]; - } - if (arg == "--grammar") { - arg_found = true; - if (++i >= argc) { - invalid_param = true; - break; - } - sparams.grammar = argv[i]; - } - if (arg == "--grammar-file") { - arg_found = true; - if (++i >= argc) { - invalid_param = true; - break; - } - std::ifstream file(argv[i]); - if (!file) { - fprintf(stderr, "error: failed to open file '%s'\n", argv[i]); - invalid_param = true; - break; - } - std::copy( - std::istreambuf_iterator<char>(file), - std::istreambuf_iterator<char>(), - std::back_inserter(sparams.grammar) - ); - } - if (arg == "--override-kv") { - arg_found = true; - if (++i >= argc) { - invalid_param = true; - break; - } - char * sep = strchr(argv[i], '='); - if (sep == nullptr || sep - argv[i] >= 128) { - fprintf(stderr, "error: Malformed KV override: %s\n", argv[i]); - invalid_param = true; - break; - } - struct llama_model_kv_override kvo; - std::strncpy(kvo.key, argv[i], sep - argv[i]); - kvo.key[sep - argv[i]] = 0; - sep++; - if (strncmp(sep, "int:", 4) == 0) { - sep += 4; - kvo.tag = LLAMA_KV_OVERRIDE_TYPE_INT; - kvo.int_value = std::atol(sep); - } else if (strncmp(sep, "float:", 6) == 0) { - sep += 6; - kvo.tag = LLAMA_KV_OVERRIDE_TYPE_FLOAT; - kvo.float_value = std::atof(sep); - } else if (strncmp(sep, "bool:", 5) == 0) { - sep += 5; - kvo.tag = LLAMA_KV_OVERRIDE_TYPE_BOOL; - if (std::strcmp(sep, "true") == 0) { - kvo.bool_value = true; - } else if (std::strcmp(sep, "false") == 0) { - kvo.bool_value = false; - } else { - fprintf(stderr, "error: Invalid boolean value for KV override: %s\n", argv[i]); - invalid_param = true; - break; - } - } else { - fprintf(stderr, "error: Invalid type for KV override: %s\n", argv[i]); - invalid_param = true; - break; - } - params.kv_overrides.push_back(kvo); -#ifndef LOG_DISABLE_LOGS - // Parse args for logging parameters - } - if ( log_param_single_parse( argv[i] ) ) { - arg_found = true; - // Do nothing, log_param_single_parse automatically does its thing - // and returns if a match was found and
parsed. - } - if ( log_param_pair_parse( /*check_but_dont_parse*/ true, argv[i] ) ) { - arg_found = true; - // We have a matching known parameter requiring an argument, - // now we need to check if there is anything after this argv - // and flag invalid_param or parse it. - if (++i >= argc) { - invalid_param = true; - break; - } - if( !log_param_pair_parse( /*check_but_dont_parse*/ false, argv[i-1], argv[i]) ) { - invalid_param = true; - break; - } - // End of Parse args for logging parameters -#endif // LOG_DISABLE_LOGS - } - - if (!arg_found) { + if (!gpt_params_find_arg(argc, argv, params, i, invalid_param)) { throw std::invalid_argument("error: unknown argument: " + arg); } } From 2bf8d0f7c4cc1235755ad06961ca761e458c5e55 Mon Sep 17 00:00:00 2001 From: slaren Date: Mon, 18 Mar 2024 11:03:04 +0100 Subject: [PATCH 04/28] backend : offload large batches to GPU (#6083) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * backend : offload large batches to GPU * fix hip * code cleanup * fix CUDA split buffers * Update ggml-backend-impl.h Co-authored-by: Johannes Gäßler * cuda : fix memset without set_device * imatrix : remove sched affix from weight names * sched : add a new split if the current one has too many inputs reduce max inputs per split more cleanup * update backends ggml-ci --------- Co-authored-by: Johannes Gäßler --- examples/imatrix/imatrix.cpp | 32 ++- examples/llama-bench/llama-bench.cpp | 4 +- ggml-alloc.c | 10 +- ggml-backend-impl.h | 5 + ggml-backend.c | 278 ++++++++++++++----------- ggml-backend.h | 8 +- ggml-cuda.cu | 297 +++++++++------------------ ggml-cuda.h | 21 +- ggml-kompute.cpp | 1 + ggml-metal.m | 1 + ggml-sycl.cpp | 1 + ggml-vulkan.cpp | 1 + ggml.c | 19 +- llama.cpp | 67 +++--- 14 files changed, 349 insertions(+), 396 deletions(-) diff --git a/examples/imatrix/imatrix.cpp b/examples/imatrix/imatrix.cpp index f21bc48f3..ea79b9062 100644 --- a/examples/imatrix/imatrix.cpp +++ b/examples/imatrix/imatrix.cpp @@ -56,13 +56,31 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void * const struct ggml_tensor * src0 = t->src[0]; const struct ggml_tensor * src1 = t->src[1]; + std::string wname; + { + // remove any prefix and suffixes from the name + // CUDA0#blk.0.attn_k.weight#0 => blk.0.attn_k.weight + const char * p = strchr(src0->name, '#'); + if (p != NULL) { + p = p + 1; + const char * q = strchr(p, '#'); + if (q != NULL) { + wname = std::string(p, q - p); + } else { + wname = p; + } + } else { + wname = src0->name; + } + } + // when ask is true, the scheduler wants to know if we are interested in data from this tensor // if we return true, a follow-up call will be made with ask=false in which we can do the actual collection if (ask) { if (t->op == GGML_OP_MUL_MAT_ID) return true; // collect all indirect matrix multiplications if (t->op != GGML_OP_MUL_MAT) return false; if (src1->ne[1] < 16 || src1->type != GGML_TYPE_F32) return false; - if (!(strncmp(src0->name, "blk.", 4) == 0 || (m_params.collect_output_weight && strcmp(src0->name, "output.weight") == 0))) return false; + if (!(wname.substr(0, 4) == "blk." 
|| (m_params.collect_output_weight && wname == "output.weight"))) return false; return true; } @@ -94,12 +112,12 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void * // this is necessary to guarantee equal number of "ncall" for each tensor for (int ex = 0; ex < n_as; ++ex) { src0 = t->src[2 + ex]; - auto& e = m_stats[src0->name]; + auto& e = m_stats[wname]; if (e.values.empty()) { e.values.resize(src1->ne[0], 0); } else if (e.values.size() != (size_t)src1->ne[0]) { - fprintf(stderr, "Oops: inconsistent size for %s (%d vs %d)\n", src0->name, (int)e.values.size(), (int)src1->ne[0]); + fprintf(stderr, "Oops: inconsistent size for %s (%d vs %d)\n", wname.c_str(), (int)e.values.size(), (int)src1->ne[0]); exit(1); //GGML_ASSERT(false); } // NOTE: since we select top-k experts, the number of calls for the expert tensors will be k times larger @@ -107,7 +125,7 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void * //if (idx == t->src[0]->ne[0] - 1) ++e.ncall; ++e.ncall; if (m_params.verbosity > 1) { - printf("%s[%d]: %32s, %s, %5d x %5d, %d\n", __func__, m_last_call, src0->name, ggml_op_name(t->op), (int)src1->ne[0], (int)src1->ne[1], (int)src1->type); + printf("%s[%d]: %32s, %s, %5d x %5d, %d\n", __func__, m_last_call, wname.c_str(), ggml_op_name(t->op), (int)src1->ne[0], (int)src1->ne[1], (int)src1->type); } for (int row = 0; row < (int)src1->ne[1]; ++row) { const int excur = m_ids[row*n_as + idx]; @@ -129,17 +147,17 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void * } } } else { - auto& e = m_stats[src0->name]; + auto& e = m_stats[wname]; if (e.values.empty()) { e.values.resize(src1->ne[0], 0); } else if (e.values.size() != (size_t)src1->ne[0]) { - fprintf(stderr, "Oops: inconsistent size for %s (%d vs %d)\n", src0->name, (int)e.values.size(), (int)src1->ne[0]); + fprintf(stderr, "Oops: inconsistent size for %s (%d vs %d)\n", wname.c_str(), (int)e.values.size(), (int)src1->ne[0]); exit(1); //GGML_ASSERT(false); } ++e.ncall; if (m_params.verbosity > 1) { - printf("%s[%d]: %32s, %s, %5d x %5d, %d\n", __func__, m_last_call, src0->name, ggml_op_name(t->op), (int)src1->ne[0], (int)src1->ne[1], (int)src1->type); + printf("%s[%d]: %32s, %s, %5d x %5d, %d\n", __func__, m_last_call, wname.c_str(), ggml_op_name(t->op), (int)src1->ne[0], (int)src1->ne[1], (int)src1->type); } for (int row = 0; row < (int)src1->ne[1]; ++row) { const float * x = data + row * src1->ne[0]; diff --git a/examples/llama-bench/llama-bench.cpp b/examples/llama-bench/llama-bench.cpp index 32eea7869..4cb230804 100644 --- a/examples/llama-bench/llama-bench.cpp +++ b/examples/llama-bench/llama-bench.cpp @@ -114,10 +114,10 @@ static std::string get_cpu_info() { static std::string get_gpu_info() { std::string id; #ifdef GGML_USE_CUBLAS - int count = ggml_cuda_get_device_count(); + int count = ggml_backend_cuda_get_device_count(); for (int i = 0; i < count; i++) { char buf[128]; - ggml_cuda_get_device_description(i, buf, sizeof(buf)); + ggml_backend_cuda_get_device_description(i, buf, sizeof(buf)); id += buf; if (i < count - 1) { id += "/"; diff --git a/ggml-alloc.c b/ggml-alloc.c index 8ac1d3e51..643b2e55f 100644 --- a/ggml-alloc.c +++ b/ggml-alloc.c @@ -548,7 +548,11 @@ static void ggml_gallocr_alloc_graph_impl(ggml_gallocr_t galloc, struct ggml_cgr for (int i = 0; i < graph->n_nodes; i++) { struct ggml_tensor * node = graph->nodes[i]; - if (ggml_is_view(node)) { + // TODO: better way to add external dependencies + // GGML_OP_NONE does not appear 
normally in the graph nodes, but is used by ggml-backend to add dependencies to + // control when some tensors are allocated and freed. in this case, the dependencies are in `src`, but the node + // itself is never used and should not be considered a dependency + if (ggml_is_view(node) && node->op != GGML_OP_NONE) { struct ggml_tensor * view_src = node->view_src; ggml_gallocr_hash_get(galloc, view_src)->n_views += 1; } @@ -565,8 +569,8 @@ static void ggml_gallocr_alloc_graph_impl(ggml_gallocr_t galloc, struct ggml_cgr ggml_gallocr_hash_get(galloc, src)->n_children += 1; - // allocate explicit inputs and leafs - if (src->flags & GGML_TENSOR_FLAG_INPUT || src->op == GGML_OP_NONE) { + // allocate explicit inputs + if (src->flags & GGML_TENSOR_FLAG_INPUT) { ggml_gallocr_allocate_node(galloc, src, get_node_buffer_id(node_buffer_ids, i)); } } diff --git a/ggml-backend-impl.h b/ggml-backend-impl.h index e475e20e5..f121e1de4 100644 --- a/ggml-backend-impl.h +++ b/ggml-backend-impl.h @@ -103,6 +103,11 @@ extern "C" { // check if the backend supports an operation bool (*GGML_CALL supports_op)(ggml_backend_t backend, const struct ggml_tensor * op); + // check if the backend wants to run an operation, even if the weights are allocated in a CPU buffer + // these should be expensive operations with large batch sizes that may benefit from running on this backend + // even if the weight has to be copied from the CPU temporarily + bool (*GGML_CALL offload_op)(ggml_backend_t backend, const struct ggml_tensor * op); + // (optional) event synchronization ggml_backend_event_t (*GGML_CALL event_new) (ggml_backend_t backend); void (*GGML_CALL event_free) (ggml_backend_event_t event); diff --git a/ggml-backend.c b/ggml-backend.c index 31f8d5a6d..9f0084df7 100644 --- a/ggml-backend.c +++ b/ggml-backend.c @@ -278,7 +278,7 @@ enum ggml_status ggml_backend_graph_compute(ggml_backend_t backend, struct ggml_ return err; } -bool ggml_backend_graph_compute_async(ggml_backend_t backend, struct ggml_cgraph * cgraph) { +enum ggml_status ggml_backend_graph_compute_async(ggml_backend_t backend, struct ggml_cgraph * cgraph) { return backend->iface.graph_compute(backend, cgraph); } @@ -286,6 +286,13 @@ bool ggml_backend_supports_op(ggml_backend_t backend, const struct ggml_tensor * return backend->iface.supports_op(backend, op); } +bool ggml_backend_offload_op(ggml_backend_t backend, const struct ggml_tensor * op) { + if (backend->iface.offload_op != NULL) { + return backend->iface.offload_op(backend, op); + } + return false; +} + // backend copy static bool ggml_are_same_layout(const struct ggml_tensor * a, const struct ggml_tensor * b) { @@ -761,6 +768,10 @@ GGML_CALL static ggml_backend_graph_plan_t ggml_backend_cpu_graph_plan_create(gg if (cpu_plan->cplan.work_size > 0) { cpu_plan->cplan.work_data = malloc(cpu_plan->cplan.work_size); + if (cpu_plan->cplan.work_data == NULL) { + free(cpu_plan); + return NULL; + } } cpu_plan->cplan.abort_callback = cpu_ctx->abort_callback; @@ -834,6 +845,7 @@ static struct ggml_backend_i cpu_backend_i = { /* .graph_plan_compute = */ ggml_backend_cpu_graph_plan_compute, /* .graph_compute = */ ggml_backend_cpu_graph_compute, /* .supports_op = */ ggml_backend_cpu_supports_op, + /* .offload_op = */ NULL, /* .event_new = */ NULL, /* .event_free = */ NULL, /* .event_record = */ NULL, @@ -999,11 +1011,11 @@ static bool ggml_is_view_op(enum ggml_op op) { #endif #ifndef GGML_SCHED_MAX_SPLITS -#define GGML_SCHED_MAX_SPLITS 256 +#define GGML_SCHED_MAX_SPLITS 2048 #endif #ifndef 
GGML_SCHED_MAX_SPLIT_INPUTS -#define GGML_SCHED_MAX_SPLIT_INPUTS 16 +#define GGML_SCHED_MAX_SPLIT_INPUTS 4 #endif #ifndef GGML_SCHED_MAX_COPIES @@ -1043,8 +1055,9 @@ struct ggml_backend_sched { struct ggml_cgraph * graph; // graph splits - struct ggml_backend_sched_split splits[GGML_SCHED_MAX_SPLITS]; + struct ggml_backend_sched_split * splits; int n_splits; + int splits_capacity; // pipeline parallelism support int n_copies; @@ -1114,40 +1127,48 @@ static int ggml_backend_sched_backend_id_from_cur(ggml_backend_sched_t sched, st // TODO: use supports_op to check if the backend supports the op // assign pre-allocated nodes to their backend - // dst - int cur_backend = ggml_backend_sched_backend_from_buffer(sched, tensor); - if (cur_backend != -1) { + int cur_backend_id = ggml_backend_sched_backend_from_buffer(sched, tensor); + if (cur_backend_id != -1) { SET_CAUSE(tensor, "1.dst"); - return cur_backend; + return cur_backend_id; } // view_src if (tensor->view_src != NULL) { - cur_backend = ggml_backend_sched_backend_from_buffer(sched, tensor->view_src); - if (cur_backend != -1) { + cur_backend_id = ggml_backend_sched_backend_from_buffer(sched, tensor->view_src); + if (cur_backend_id != -1) { SET_CAUSE(tensor, "1.vsrc"); - return cur_backend; + return cur_backend_id; } } - // input + // graph input if (tensor->flags & GGML_TENSOR_FLAG_INPUT) { - cur_backend = sched->n_backends - 1; // last backend (assumed CPU) + cur_backend_id = sched->n_backends - 1; // last backend (assumed CPU) SET_CAUSE(tensor, "1.inp"); - return cur_backend; + return cur_backend_id; } // assign nodes that use weights to the backend of the weights + // operations with weights are preferably run on the same backend as the weights for (int i = 0; i < GGML_MAX_SRC; i++) { const struct ggml_tensor * src = tensor->src[i]; if (src == NULL) { continue; } if (src->buffer != NULL && src->buffer->usage == GGML_BACKEND_BUFFER_USAGE_WEIGHTS) { - int src_backend = ggml_backend_sched_backend_from_buffer(sched, src); - // operations with weights are always run on the same backend as the weights + int src_backend_id = ggml_backend_sched_backend_from_buffer(sched, src); + // check if a backend with higher prio wants to offload the op + if (src_backend_id == sched->n_backends - 1) { + for (int b = 0; b < src_backend_id; b++) { + if (ggml_backend_offload_op(sched->backends[b], tensor)) { + SET_CAUSE(tensor, "1.off"); + return b; + } + } + } SET_CAUSE(tensor, "1.wgt%d", i); - return src_backend; + return src_backend_id; } } @@ -1227,28 +1248,31 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg // pass 1: assign backends to ops with pre-allocated inputs for (int i = 0; i < graph->n_leafs; i++) { struct ggml_tensor * leaf = graph->leafs[i]; - if (tensor_backend_id(leaf) != -1) { + int * leaf_backend_id = &tensor_backend_id(leaf); + if (*leaf_backend_id != -1) { // do not overwrite user assignments continue; } - tensor_backend_id(leaf) = ggml_backend_sched_backend_id_from_cur(sched, leaf); + *leaf_backend_id = ggml_backend_sched_backend_id_from_cur(sched, leaf); } for (int i = 0; i < graph->n_nodes; i++) { struct ggml_tensor * node = graph->nodes[i]; - if (tensor_backend_id(node) != -1) { + int * node_backend_id = &tensor_backend_id(node); + if (*node_backend_id != -1) { // do not overwrite user assignments continue; } - tensor_backend_id(node) = ggml_backend_sched_backend_id_from_cur(sched, node); + *node_backend_id = ggml_backend_sched_backend_id_from_cur(sched, node); // src for (int j = 0; j < 
GGML_MAX_SRC; j++) { struct ggml_tensor * src = node->src[j]; if (src == NULL) { continue; } - if (tensor_backend_id(src) == -1) { - tensor_backend_id(src) = ggml_backend_sched_backend_id_from_cur(sched, src); + int * src_backend_id = &tensor_backend_id(src); + if (*src_backend_id == -1) { + *src_backend_id = ggml_backend_sched_backend_id_from_cur(sched, src); } } } @@ -1270,21 +1294,20 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg if (ggml_is_view_op(node->op)) { continue; } - int tensor_backend_id = tensor_backend_id(node); - if (tensor_backend_id != -1) { - if (tensor_backend_id == sched->n_backends - 1) { + int * node_backend_id = &tensor_backend_id(node); + if (*node_backend_id != -1) { + if (*node_backend_id == sched->n_backends - 1) { // skip cpu (lowest prio backend) cur_backend_id = -1; } else { - cur_backend_id = tensor_backend_id; + cur_backend_id = *node_backend_id; } } else { - tensor_backend_id(node) = cur_backend_id; + *node_backend_id = cur_backend_id; SET_CAUSE(node, "2.2"); } } } - // pass 2.1 expand gpu up { int cur_backend_id = -1; @@ -1293,22 +1316,20 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg if (ggml_is_view_op(node->op)) { continue; } - int tensor_backend_id = tensor_backend_id(node); - if (tensor_backend_id != -1) { - if (tensor_backend_id == sched->n_backends - 1) { + int * node_backend_id = &tensor_backend_id(node); + if (*node_backend_id != -1) { + if (*node_backend_id == sched->n_backends - 1) { // skip cpu (lowest prio backend) cur_backend_id = -1; } else { - cur_backend_id = tensor_backend_id; + cur_backend_id = *node_backend_id; } } else { - tensor_backend_id(node) = cur_backend_id; + *node_backend_id = cur_backend_id; SET_CAUSE(node, "2.1"); } } } - - // pass 2.4 expand rest down { int cur_backend_id = -1; @@ -1317,16 +1338,16 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg if (ggml_is_view_op(node->op)) { continue; } - int tensor_backend_id = tensor_backend_id(node); - if (tensor_backend_id != -1) { - cur_backend_id = tensor_backend_id; + int * node_backend_id = &tensor_backend_id(node); + if (*node_backend_id != -1) { + cur_backend_id = *node_backend_id; } else { - tensor_backend_id(node) = cur_backend_id; + *node_backend_id = cur_backend_id; SET_CAUSE(node, "2.4"); } } } - // pass 2.3 expand rest up + // pass 2.3 expand rest up { int cur_backend_id = -1; for (int i = graph->n_nodes - 1; i >= 0; i--) { @@ -1334,11 +1355,11 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg if (ggml_is_view_op(node->op)) { continue; } - int tensor_backend_id = tensor_backend_id(node); - if (tensor_backend_id != -1) { - cur_backend_id = tensor_backend_id; + int * node_backend_id = &tensor_backend_id(node); + if (*node_backend_id != -1) { + cur_backend_id = *node_backend_id; } else { - tensor_backend_id(node) = cur_backend_id; + *node_backend_id = cur_backend_id; SET_CAUSE(node, "2.3"); } } @@ -1351,9 +1372,9 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg // pass 3: assign backends to remaining src from dst and view_src for (int i = 0; i < graph->n_nodes; i++) { struct ggml_tensor * node = graph->nodes[i]; - int cur_backend_id = tensor_backend_id(node); - if (node->view_src != NULL && cur_backend_id == -1) { - cur_backend_id = tensor_backend_id(node) = tensor_backend_id(node->view_src); + int * cur_backend_id = &tensor_backend_id(node); + if (node->view_src != NULL && *cur_backend_id == -1) { 
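                // note: a view owns no data of its own — it aliases view_src's buffer,
                // so it must be assigned to the same backend as its view_src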
+ *cur_backend_id = tensor_backend_id(node->view_src); SET_CAUSE(node, "3.vsrc"); } for (int j = 0; j < GGML_MAX_SRC; j++) { @@ -1361,14 +1382,14 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg if (src == NULL) { continue; } - int src_backend_id = tensor_backend_id(src); - if (src_backend_id == -1) { + int * src_backend_id = &tensor_backend_id(src); + if (*src_backend_id == -1) { if (src->view_src != NULL) { // views are always on the same backend as the source - tensor_backend_id(src) = tensor_backend_id(src->view_src); + *src_backend_id = tensor_backend_id(src->view_src); SET_CAUSE(src, "3.vsrc"); } else { - tensor_backend_id(src) = cur_backend_id; + *src_backend_id = *cur_backend_id; SET_CAUSE(src, "3.cur"); } } @@ -1380,19 +1401,20 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg // pass 4: split graph, find tensors that need to be copied { - int cur_split = 0; + int i_split = 0; + struct ggml_backend_sched_split * split = &sched->splits[0]; // find the backend of the first split, skipping view ops for (int i = 0; i < graph->n_nodes; i++) { struct ggml_tensor * node = graph->nodes[i]; if (!ggml_is_view_op(node->op)) { - sched->splits[0].backend_id = tensor_backend_id(node); + split->backend_id = tensor_backend_id(node); break; } } - sched->splits[0].i_start = 0; - sched->splits[0].n_inputs = 0; - memset(sched->splits[0].inputs, 0, sizeof(sched->splits[0].inputs)); //HACK - int cur_backend_id = sched->splits[0].backend_id; + split->i_start = 0; + split->n_inputs = 0; + memset(split->inputs, 0, sizeof(split->inputs)); //HACK + int cur_backend_id = split->backend_id; for (int i = 0; i < graph->n_nodes; i++) { struct ggml_tensor * node = graph->nodes[i]; @@ -1400,18 +1422,54 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg continue; } - int tensor_backend_id = tensor_backend_id(node); + const int node_backend_id = tensor_backend_id(node); - GGML_ASSERT(tensor_backend_id != -1); // all nodes should be assigned by now + GGML_ASSERT(node_backend_id != -1); // all nodes should be assigned by now - if (tensor_backend_id != cur_backend_id) { - sched->splits[cur_split].i_end = i; - cur_split++; - GGML_ASSERT(cur_split < GGML_SCHED_MAX_SPLITS); - sched->splits[cur_split].backend_id = tensor_backend_id; - sched->splits[cur_split].i_start = i; - sched->splits[cur_split].n_inputs = 0; - cur_backend_id = tensor_backend_id; + // check if we should start a new split based on the sources of the current node + bool need_new_split = false; + if (node_backend_id == cur_backend_id && split->n_inputs > 0) { + for (int j = 0; j < GGML_MAX_SRC; j++) { + struct ggml_tensor * src = node->src[j]; + if (src == NULL) { + continue; + } + // check if a weight is on a different backend + // by starting a new split, the memory of the previously offloaded weights can be reused + if (src->buffer != NULL && src->buffer->usage == GGML_BACKEND_BUFFER_USAGE_WEIGHTS) { + int src_backend_id = tensor_backend_id(src); + if (src_backend_id != -1 && src_backend_id != cur_backend_id) { + need_new_split = true; + break; + } + } + // check if the split has too many inputs + if (split->n_inputs == GGML_SCHED_MAX_SPLIT_INPUTS) { + const size_t id = hash_id(src); + int src_backend_id = sched->tensor_backend_id[id]; + if (src_backend_id != cur_backend_id && sched->tensor_copies[hash_id(src)][cur_backend_id][0] == NULL) { + //printf("starting new split because of too many inputs: node %s, input %s\n", node->name, src->name); + 
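                        // split->inputs is a fixed array of GGML_SCHED_MAX_SPLIT_INPUTS
                        // entries, and this source would still need a copy; close the
                        // split here rather than overflow it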
need_new_split = true; + break; + } + } + } + } + + if (node_backend_id != cur_backend_id || need_new_split) { + split->i_end = i; + i_split++; + if (i_split >= sched->splits_capacity) { + sched->splits_capacity *= 2; + sched->splits = realloc(sched->splits, sched->splits_capacity * sizeof(struct ggml_backend_sched_split)); + GGML_ASSERT(sched->splits != NULL); + } + GGML_ASSERT(i_split < GGML_SCHED_MAX_SPLITS); + split = &sched->splits[i_split]; + split->backend_id = node_backend_id; + split->i_start = i; + split->n_inputs = 0; + cur_backend_id = node_backend_id; } // find inputs that are not on the same backend @@ -1421,10 +1479,10 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg continue; } - int src_backend_id = tensor_backend_id(src); + const int src_backend_id = tensor_backend_id(src); assert(src_backend_id != -1); // all inputs should be assigned by now - if (src->flags & GGML_TENSOR_FLAG_INPUT) { + if (src->flags & GGML_TENSOR_FLAG_INPUT && sched->n_copies > 1) { size_t id = hash_id(src); if (sched->tensor_copies[id][src_backend_id][0] == NULL) { ggml_backend_t backend = sched->backends[src_backend_id]; @@ -1441,7 +1499,6 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg ggml_set_output(tensor_copy); // prevent ggml-alloc from overwriting the tensor } sched->tensor_copies[id][src_backend_id][c] = tensor_copy; - tensor_backend_id(tensor_copy) = src_backend_id; SET_CAUSE(tensor_copy, "4.cpy"); } int n_graph_inputs = sched->n_graph_inputs++; @@ -1450,9 +1507,9 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg } } - if (src_backend_id != tensor_backend_id) { + if (src_backend_id != node_backend_id) { // create a copy of the input in the split's backend - size_t id = hash_id(src); + const size_t id = hash_id(src); if (sched->tensor_copies[id][cur_backend_id][0] == NULL) { ggml_backend_t backend = sched->backends[cur_backend_id]; for (int c = 0; c < sched->n_copies; c++) { @@ -1463,76 +1520,42 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg ggml_set_output(tensor_copy); // prevent ggml-alloc from overwriting the tensor } sched->tensor_copies[id][cur_backend_id][c] = tensor_copy; - tensor_backend_id(tensor_copy) = cur_backend_id; SET_CAUSE(tensor_copy, "4.cpy"); } - int n_inputs = sched->splits[cur_split].n_inputs++; + int n_inputs = split->n_inputs++; GGML_ASSERT(n_inputs < GGML_SCHED_MAX_SPLIT_INPUTS); - sched->splits[cur_split].inputs[n_inputs] = src; + split->inputs[n_inputs] = src; } node->src[j] = sched->tensor_copies[id][cur_backend_id][sched->cur_copy]; } } } - sched->splits[cur_split].i_end = graph->n_nodes; - sched->n_splits = cur_split + 1; + split->i_end = graph->n_nodes; + sched->n_splits = i_split + 1; } #ifdef DEBUG_PASS4 fprintf(stderr, "PASS 4 ASSIGNMENTS\n"); ggml_backend_sched_print_assignments(sched, graph); #endif -#ifndef NDEBUG - // sanity check: all sources should have the same backend as the node - for (int i = 0; i < graph->n_nodes; i++) { - struct ggml_tensor * node = graph->nodes[i]; - ggml_backend_t tensor_backend = ggml_backend_sched_get_tensor_backend(sched, node); - if (tensor_backend == NULL) { - fprintf(stderr, "!!!!!!! %s has no backend\n", node->name); - } - if (node->view_src != NULL && tensor_backend != ggml_backend_sched_get_tensor_backend(sched, node->view_src)) { - fprintf(stderr, "!!!!!!! %s has backend %s, view_src %s has backend %s\n", - node->name, tensor_backend ? 
ggml_backend_name(tensor_backend) : "NULL", - node->view_src->name, ggml_backend_sched_get_tensor_backend(sched, node->view_src) ? - ggml_backend_name(ggml_backend_sched_get_tensor_backend(sched, node->view_src)) : "NULL"); - } - for (int j = 0; j < GGML_MAX_SRC; j++) { - struct ggml_tensor * src = node->src[j]; - if (src == NULL) { - continue; - } - ggml_backend_t src_backend = ggml_backend_sched_get_tensor_backend(sched, src); - if (src_backend != tensor_backend /* && src_backend != NULL */) { - fprintf(stderr, "!!!! %s has backend %s, src %d (%s) has backend %s\n", - node->name, tensor_backend ? ggml_backend_name(tensor_backend) : "NULL", - j, src->name, src_backend ? ggml_backend_name(src_backend) : "NULL"); - } - if (src->view_src != NULL && src_backend != ggml_backend_sched_get_tensor_backend(sched, src->view_src)) { - fprintf(stderr, "!!!!!!! [src] %s has backend %s, view_src %s has backend %s\n", - src->name, src_backend ? ggml_backend_name(src_backend) : "NULL", - src->view_src->name, ggml_backend_sched_get_tensor_backend(sched, src->view_src) ? - ggml_backend_name(ggml_backend_sched_get_tensor_backend(sched, src->view_src)) : "NULL"); - } - } - } - fflush(stderr); -#endif - // create copies of the graph for each split // TODO: avoid this copy - struct ggml_cgraph * graph_copy = ggml_new_graph_custom(sched->ctx, graph->n_nodes + sched->n_splits*GGML_SCHED_MAX_SPLIT_INPUTS, false); + struct ggml_cgraph * graph_copy = ggml_new_graph_custom(sched->ctx, graph->n_nodes + sched->n_splits*GGML_SCHED_MAX_SPLIT_INPUTS*2, false); for (int i = 0; i < sched->n_splits; i++) { struct ggml_backend_sched_split * split = &sched->splits[i]; split->graph = ggml_graph_view(graph, split->i_start, split->i_end); // add inputs to the graph copy so that they are allocated by ggml-alloc at the start of the split for (int j = 0; j < split->n_inputs; j++) { + assert(graph_copy->size > (graph_copy->n_nodes + 1)); + struct ggml_tensor * input = split->inputs[j]; - struct ggml_tensor * input_cpy = sched->tensor_copies[hash_id(input)][split->backend_id][sched->cur_copy]; + const size_t input_id = hash_id(input); + struct ggml_tensor * input_cpy = sched->tensor_copies[input_id][split->backend_id][sched->cur_copy]; // add a dependency to the input source so that it is not freed before the copy is done struct ggml_tensor * input_dep = ggml_view_tensor(sched->ctx, input); input_dep->src[0] = input; - sched->node_backend_ids[graph_copy->n_nodes] = tensor_backend_id(input); + sched->node_backend_ids[graph_copy->n_nodes] = sched->tensor_backend_id[input_id]; graph_copy->nodes[graph_copy->n_nodes++] = input_dep; // add a dependency to the input copy so that it is allocated at the start of the split @@ -1541,6 +1564,7 @@ static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct gg } for (int j = split->i_start; j < split->i_end; j++) { + assert(graph_copy->size > graph_copy->n_nodes); sched->node_backend_ids[graph_copy->n_nodes] = tensor_backend_id(graph->nodes[j]); graph_copy->nodes[graph_copy->n_nodes++] = graph->nodes[j]; } @@ -1625,13 +1649,12 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t s } ggml_backend_tensor_copy(input, input_cpy); } else { + // wait for the split backend to finish using the input before overwriting it if (sched->events[split_backend_id][sched->cur_copy] != NULL) { ggml_backend_event_wait(split_backend, sched->events[split_backend_id][sched->cur_copy]); } else { ggml_backend_synchronize(split_backend); - 
ggml_backend_synchronize(input_backend); } - ggml_backend_tensor_copy_async(input_backend, split_backend, input, input_cpy); } } @@ -1701,17 +1724,21 @@ ggml_backend_sched_t ggml_backend_sched_new( struct ggml_backend_sched * sched = calloc(sizeof(struct ggml_backend_sched), 1); // initialize hash table - sched->hash_set = ggml_hash_set_new(graph_size + GGML_SCHED_MAX_SPLITS*GGML_SCHED_MAX_SPLIT_INPUTS); + sched->hash_set = ggml_hash_set_new(graph_size); sched->tensor_backend_id = calloc(sizeof(sched->tensor_backend_id[0]), sched->hash_set.size); sched->tensor_copies = calloc(sizeof(sched->tensor_copies[0]), sched->hash_set.size); - sched->node_backend_ids = calloc(sizeof(sched->node_backend_ids[0]), graph_size); - sched->leaf_backend_ids = calloc(sizeof(sched->leaf_backend_ids[0]), graph_size); + + const size_t nodes_size = graph_size + GGML_SCHED_MAX_SPLITS*GGML_SCHED_MAX_SPLIT_INPUTS*2; + sched->node_backend_ids = calloc(sizeof(sched->node_backend_ids[0]), nodes_size); + sched->leaf_backend_ids = calloc(sizeof(sched->leaf_backend_ids[0]), nodes_size); sched->n_backends = n_backends; sched->n_copies = parallel ? GGML_SCHED_MAX_COPIES : 1; - GGML_ASSERT(sched->n_copies <= GGML_SCHED_MAX_COPIES); + const int initial_splits_capacity = 16; + sched->splits = calloc(sizeof(sched->splits[0]), initial_splits_capacity); + sched->splits_capacity = initial_splits_capacity; for (int b = 0; b < n_backends; b++) { sched->backends[b] = backends[b]; @@ -1742,6 +1769,7 @@ void ggml_backend_sched_free(ggml_backend_sched_t sched) { } ggml_gallocr_free(sched->galloc); ggml_free(sched->ctx); + free(sched->splits); free(sched->hash_set.keys); free(sched->tensor_backend_id); free(sched->tensor_copies); @@ -1762,6 +1790,8 @@ void ggml_backend_sched_reset(ggml_backend_sched_t sched) { } bool ggml_backend_sched_reserve(ggml_backend_sched_t sched, struct ggml_cgraph * measure_graph) { + GGML_ASSERT((int)sched->hash_set.size >= measure_graph->n_nodes); + ggml_backend_sched_split_graph(sched, measure_graph); // TODO: extract this to a separate function @@ -1776,7 +1806,7 @@ bool ggml_backend_sched_reserve(ggml_backend_sched_t sched, struct ggml_cgraph * } bool ggml_backend_sched_alloc_graph(ggml_backend_sched_t sched, struct ggml_cgraph * graph) { - GGML_ASSERT((int)sched->hash_set.size >= graph->n_nodes + GGML_SCHED_MAX_SPLITS*GGML_SCHED_MAX_SPLIT_INPUTS); + GGML_ASSERT((int)sched->hash_set.size >= graph->n_nodes); ggml_backend_sched_split_graph(sched, graph); diff --git a/ggml-backend.h b/ggml-backend.h index 099d9c258..422457ab6 100644 --- a/ggml-backend.h +++ b/ggml-backend.h @@ -70,11 +70,11 @@ extern "C" { GGML_API ggml_backend_graph_plan_t ggml_backend_graph_plan_create(ggml_backend_t backend, struct ggml_cgraph * cgraph); GGML_API void ggml_backend_graph_plan_free (ggml_backend_t backend, ggml_backend_graph_plan_t plan); - GGML_API enum ggml_status ggml_backend_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan); - GGML_API enum ggml_status ggml_backend_graph_compute (ggml_backend_t backend, struct ggml_cgraph * cgraph); - - GGML_API bool ggml_backend_graph_compute_async(ggml_backend_t backend, struct ggml_cgraph * cgraph); + GGML_API enum ggml_status ggml_backend_graph_plan_compute (ggml_backend_t backend, ggml_backend_graph_plan_t plan); + GGML_API enum ggml_status ggml_backend_graph_compute (ggml_backend_t backend, struct ggml_cgraph * cgraph); + GGML_API enum ggml_status ggml_backend_graph_compute_async(ggml_backend_t backend, struct ggml_cgraph * cgraph); GGML_API bool 
ggml_backend_supports_op(ggml_backend_t backend, const struct ggml_tensor * op); + GGML_API bool ggml_backend_offload_op(ggml_backend_t backend, const struct ggml_tensor * op); // tensor copy between different backends GGML_API void ggml_backend_tensor_copy(struct ggml_tensor * src, struct ggml_tensor * dst); diff --git a/ggml-cuda.cu b/ggml-cuda.cu index db595409a..139025588 100644 --- a/ggml-cuda.cu +++ b/ggml-cuda.cu @@ -82,6 +82,10 @@ #define cudaGetDeviceProperties hipGetDeviceProperties #define cudaGetErrorString hipGetErrorString #define cudaGetLastError hipGetLastError +#define cudaHostRegister hipHostRegister +#define cudaHostRegisterPortable hipHostRegisterPortable +#define cudaHostRegisterReadOnly hipHostRegisterReadOnly +#define cudaHostUnregister hipHostUnregister #define cudaLaunchHostFunc hipLaunchHostFunc #ifdef GGML_HIP_UMA #define cudaMalloc hipMallocManaged @@ -7787,11 +7791,7 @@ struct cuda_pool_alloc { static bool g_cublas_loaded = false; -GGML_CALL bool ggml_cublas_loaded(void) { - return g_cublas_loaded; -} - -GGML_CALL void ggml_init_cublas() { +static void ggml_init_cublas() { static bool initialized = false; if (!initialized) { @@ -7880,7 +7880,7 @@ GGML_CALL void ggml_init_cublas() { } } -GGML_CALL void * ggml_cuda_host_malloc(size_t size) { +static void * ggml_cuda_host_malloc(size_t size) { if (getenv("GGML_CUDA_NO_PINNED") != nullptr) { return nullptr; } @@ -7890,7 +7890,7 @@ GGML_CALL void * ggml_cuda_host_malloc(size_t size) { if (err != cudaSuccess) { // clear the error cudaGetLastError(); - fprintf(stderr, "WARNING: failed to allocate %.2f MB of pinned memory: %s\n", + fprintf(stderr, "%s: warning: failed to allocate %.2f MiB of pinned memory: %s\n", __func__, size/1024.0/1024.0, cudaGetErrorString(err)); return nullptr; } @@ -7898,7 +7898,7 @@ GGML_CALL void * ggml_cuda_host_malloc(size_t size) { return ptr; } -GGML_CALL void ggml_cuda_host_free(void * ptr) { +static void ggml_cuda_host_free(void * ptr) { CUDA_CHECK(cudaFreeHost(ptr)); } @@ -9036,21 +9036,13 @@ static void ggml_cuda_op_soft_max( // positions tensor float * src2_dd = nullptr; - cuda_pool_alloc src2_f; ggml_tensor * src2 = dst->src[2]; const bool use_src2 = src2 != nullptr; if (use_src2) { - const bool src2_on_device = src2->backend == GGML_BACKEND_TYPE_GPU; - - if (src2_on_device) { - ggml_tensor_extra_gpu * src2_extra = (ggml_tensor_extra_gpu *) src2->extra; - src2_dd = (float *) src2_extra->data_device[g_main_device]; - } else { - src2_dd = src2_f.alloc(ggml_nelements(src2)); - CUDA_CHECK(ggml_cuda_cpy_tensor_2d(src2_dd, src2, 0, 0, 0, 1, main_stream)); - } + ggml_tensor_extra_gpu * src2_extra = (ggml_tensor_extra_gpu *) src2->extra; + src2_dd = (float *) src2_extra->data_device[g_main_device]; } soft_max_f32_cuda(src0_dd, src1 ? src1_dd : nullptr, src2_dd, dst_dd, ne00, nrows_x, nrows_y, scale, max_bias, main_stream); @@ -9107,55 +9099,24 @@ static void ggml_cuda_op_flatten(const ggml_tensor * src0, const ggml_tensor * s ggml_tensor_extra_gpu * src1_extra = use_src1 ? 
(ggml_tensor_extra_gpu *) src1->extra : nullptr; ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra; - const bool src0_on_device = src0->backend == GGML_BACKEND_TYPE_GPU || src0->backend == GGML_BACKEND_TYPE_GPU_SPLIT; - const bool src1_on_device = use_src1 && src1->backend == GGML_BACKEND_TYPE_GPU; - const bool dst_on_device = dst->backend == GGML_BACKEND_TYPE_GPU; - // dd = data device float * src0_ddf = nullptr; float * src1_ddf = nullptr; float * dst_ddf = nullptr; - cuda_pool_alloc src0_f; - cuda_pool_alloc src1_f; - cuda_pool_alloc dst_f; - ggml_cuda_set_device(g_main_device); cudaStream_t main_stream = g_cudaStreams[g_main_device][0]; - if (src0_on_device) { - src0_ddf = (float *) src0_extra->data_device[g_main_device]; - } else { - src0_ddf = src0_f.alloc(ggml_nelements(src0)); - CUDA_CHECK(ggml_cuda_cpy_tensor_2d(src0_ddf, src0, 0, 0, 0, nrows0, main_stream)); - } + src0_ddf = (float *) src0_extra->data_device[g_main_device]; if (use_src1) { - if (src1_on_device) { - src1_ddf = (float *) src1_extra->data_device[g_main_device]; - } else { - src1_ddf = src1_f.alloc(ggml_nelements(src1)); - CUDA_CHECK(ggml_cuda_cpy_tensor_2d(src1_ddf, src1, 0, 0, 0, nrows1, main_stream)); - } - } - if (dst_on_device) { - dst_ddf = (float *) dst_extra->data_device[g_main_device]; - } else { - dst_ddf = dst_f.alloc(ggml_nelements(dst)); + src1_ddf = (float *) src1_extra->data_device[g_main_device]; } + dst_ddf = (float *) dst_extra->data_device[g_main_device]; // do the computation op(src0, src1, dst, src0_ddf, src1_ddf, dst_ddf, main_stream); CUDA_CHECK(cudaGetLastError()); - - // copy dst to host if necessary - if (!dst_on_device) { - CUDA_CHECK(cudaMemcpyAsync(dst->data, dst_ddf, ggml_nbytes(dst), cudaMemcpyDeviceToHost, main_stream)); - } - - if (dst->backend == GGML_BACKEND_TYPE_CPU) { - CUDA_CHECK(cudaDeviceSynchronize()); - } } static void ggml_cuda_set_peer_access(const int n_tokens) { @@ -9251,7 +9212,6 @@ static void ggml_cuda_op_mul_mat( ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra; ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra; - const bool src0_on_device = src0->backend == GGML_BACKEND_TYPE_GPU || src0->backend == GGML_BACKEND_TYPE_GPU_SPLIT; const bool src0_is_contiguous = ggml_is_contiguous(src0); const bool src1_is_contiguous = ggml_is_contiguous(src1); @@ -9322,13 +9282,13 @@ static void ggml_cuda_op_mul_mat( used_devices++; - const bool src1_on_device = src1->backend == GGML_BACKEND_TYPE_GPU && id == g_main_device; - const bool dst_on_device = dst->backend == GGML_BACKEND_TYPE_GPU && id == g_main_device; + const bool src1_on_device = id == g_main_device; // TODO: check from buffer + const bool dst_on_device = id == g_main_device; ggml_cuda_set_device(id); cudaStream_t stream = g_cudaStreams[id][0]; - if (src0_on_device && src0_is_contiguous) { + if (src0_is_contiguous) { dev[id].src0_dd = (char *) src0_extra->data_device[id]; } else { dev[id].src0_dd = dev[id].src0_dd_alloc.alloc(ggml_nbytes(src0)); @@ -9374,8 +9334,8 @@ static void ggml_cuda_op_mul_mat( continue; } - const bool src1_on_device = src1->backend == GGML_BACKEND_TYPE_GPU && id == g_main_device; - const bool dst_on_device = dst->backend == GGML_BACKEND_TYPE_GPU && id == g_main_device; + const bool src1_on_device = id == g_main_device; // TODO: check from buffer + const bool dst_on_device = id == g_main_device; const int64_t row_diff = dev[id].row_high - dev[id].row_low; ggml_cuda_set_device(id); @@ -9400,12 +9360,12 @@ static void 
ggml_cuda_op_mul_mat( // the main device memory buffer can be on VRAM scratch, with space for all partial results // in that case an offset on dst_ddf_i is needed - if (dst->backend == GGML_BACKEND_TYPE_GPU && id == g_main_device) { + if (id == g_main_device) { dst_dd_i += dev[id].row_low; // offset is 0 if no tensor split } // copy src0, src1 to device if necessary - if (src1->backend == GGML_BACKEND_TYPE_GPU && src1_is_contiguous) { + if (src1_is_contiguous) { if (id != g_main_device) { if (convert_src1_to_q8_1) { char * src1_ddq_i_source = dev[g_main_device].src1_ddq + src1_ddq_i_offset; @@ -9418,19 +9378,19 @@ static void ggml_cuda_op_mul_mat( src1_ncols*ne10*sizeof(float), stream)); } } - } else if (src1->backend == GGML_BACKEND_TYPE_CPU || (src1_on_device && !src1_is_contiguous)) { + } else if (src1_on_device && !src1_is_contiguous) { CUDA_CHECK(ggml_cuda_cpy_tensor_2d( src1_ddf_i, src1, i03, i02, src1_col_0, src1_col_0+src1_ncols, stream)); } else { GGML_ASSERT(false); } - if (convert_src1_to_q8_1 && (src1->backend == GGML_BACKEND_TYPE_CPU || !src1_is_contiguous)) { + if (convert_src1_to_q8_1 && !src1_is_contiguous) { quantize_row_q8_1_cuda(src1_ddf_i, src1_ddq_i, ne10, src1_ncols, src1_padded_col_size, stream); CUDA_CHECK(cudaGetLastError()); } - if (src1_col_0 == 0 && (!src0_on_device || !src0_is_contiguous) && i02 % i02_divisor == 0) { + if (src1_col_0 == 0 && !src0_is_contiguous && i02 % i02_divisor == 0) { CUDA_CHECK(ggml_cuda_cpy_tensor_2d(src0_dd_i, src0, i03, i02/i02_divisor, dev[id].row_low, dev[id].row_high, stream)); } @@ -9441,17 +9401,7 @@ static void ggml_cuda_op_mul_mat( // copy dst to host or other device if necessary if (!dst_on_device) { - void * dst_off_device; - cudaMemcpyKind kind; - if (dst->backend == GGML_BACKEND_TYPE_CPU) { - dst_off_device = dst->data; - kind = cudaMemcpyDeviceToHost; - } else if (dst->backend == GGML_BACKEND_TYPE_GPU) { - dst_off_device = dst_extra->data_device[g_main_device]; - kind = cudaMemcpyDeviceToDevice; - } else { - GGML_ASSERT(false); - } + void * dst_off_device = dst_extra->data_device[g_main_device]; if (split) { // src0 = weight matrix is saved as a transposed matrix for better memory layout. // dst is NOT transposed. 
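// Illustrative note, not part of the diff: with a row split, device `id` computes
// rows [dev[id].row_low, dev[id].row_high) of dst for the current src1 column block,
// and the gather in the hunk below writes that slice into the main device buffer at
//
//   float * dhf_dst_i = (float *) ((char *) dst_off_device + i02*nb2 + i03*nb3); // pick the 2D slice
//   dhf_dst_i += src1_col_0*ne0 + dev[id].row_low;                               // column block + row offset
//
// so once every device has copied its partial result, the slices tile the full
// ne0 x ne1 matrix on the main device. The copy becomes unconditionally
// device-to-device because, after this patch, dst always lives in a backend buffer.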
@@ -9462,28 +9412,26 @@ static void ggml_cuda_op_mul_mat( GGML_ASSERT(dst->nb[1] == ne0*sizeof(float)); dhf_dst_i += src1_col_0*ne0 + dev[id].row_low; #if !defined(GGML_USE_HIPBLAS) - if (kind == cudaMemcpyDeviceToDevice) { - // cudaMemcpy2DAsync may fail with copies between vmm pools of different devices - cudaMemcpy3DPeerParms p = {}; - p.dstDevice = g_main_device; - p.dstPtr = make_cudaPitchedPtr(dhf_dst_i, ne0*sizeof(float), row_diff, src1_ncols); - p.srcDevice = id; - p.srcPtr = make_cudaPitchedPtr(dst_dd_i, row_diff*sizeof(float), row_diff, src1_ncols); - p.extent = make_cudaExtent(row_diff*sizeof(float), src1_ncols, 1); - CUDA_CHECK(cudaMemcpy3DPeerAsync(&p, stream)); - } else + // cudaMemcpy2DAsync may fail with copies between vmm pools of different devices + cudaMemcpy3DPeerParms p = {}; + p.dstDevice = g_main_device; + p.dstPtr = make_cudaPitchedPtr(dhf_dst_i, ne0*sizeof(float), row_diff, src1_ncols); + p.srcDevice = id; + p.srcPtr = make_cudaPitchedPtr(dst_dd_i, row_diff*sizeof(float), row_diff, src1_ncols); + p.extent = make_cudaExtent(row_diff*sizeof(float), src1_ncols, 1); + CUDA_CHECK(cudaMemcpy3DPeerAsync(&p, stream)); +#else + // HIP does not support cudaMemcpy3DPeerAsync or vmm pools + CUDA_CHECK(cudaMemcpy2DAsync(dhf_dst_i, ne0*sizeof(float), + dst_dd_i, row_diff*sizeof(float), + row_diff*sizeof(float), src1_ncols, + cudaMemcpyDeviceToDevice, stream)); #endif - { - CUDA_CHECK(cudaMemcpy2DAsync(dhf_dst_i, ne0*sizeof(float), - dst_dd_i, row_diff*sizeof(float), - row_diff*sizeof(float), src1_ncols, - kind, stream)); - } } else { float * dhf_dst_i = (float *) ((char *) dst_off_device + i02*nb2 + i03*nb3); GGML_ASSERT(dst->nb[1] == ne0*sizeof(float)); dhf_dst_i += src1_col_0*ne0; - CUDA_CHECK(cudaMemcpyAsync(dhf_dst_i, dst_dd_i, src1_ncols*ne0*sizeof(float), kind, stream)); + CUDA_CHECK(cudaMemcpyAsync(dhf_dst_i, dst_dd_i, src1_ncols*ne0*sizeof(float), cudaMemcpyDeviceToDevice, stream)); } } @@ -9510,11 +9458,6 @@ static void ggml_cuda_op_mul_mat( } } } - - if (dst->backend == GGML_BACKEND_TYPE_CPU) { - ggml_cuda_set_device(g_main_device); - CUDA_CHECK(cudaDeviceSynchronize()); - } } static void ggml_cuda_repeat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { @@ -9599,36 +9542,19 @@ static void ggml_cuda_pad(const ggml_tensor * src0, const ggml_tensor * src1, gg static void ggml_cuda_arange(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra; - const bool dst_on_device = dst->backend == GGML_BACKEND_TYPE_GPU; - // dd = data device float * src0_ddf = nullptr; float * src1_ddf = nullptr; float * dst_ddf = nullptr; - cuda_pool_alloc dst_f; - ggml_cuda_set_device(g_main_device); cudaStream_t main_stream = g_cudaStreams[g_main_device][0]; - if (dst_on_device) { - dst_ddf = (float *) dst_extra->data_device[g_main_device]; - } else { - dst_ddf = dst_f.alloc(ggml_nelements(dst)); - } + dst_ddf = (float *) dst_extra->data_device[g_main_device]; // do the computation ggml_cuda_op_arange(src0, src1, dst, src0_ddf, src1_ddf, dst_ddf, main_stream); CUDA_CHECK(cudaGetLastError()); - - // copy dst to host if necessary - if (!dst_on_device) { - CUDA_CHECK(cudaMemcpyAsync(dst->data, dst_ddf, ggml_nbytes(dst), cudaMemcpyDeviceToHost, main_stream)); - } - - if (dst->backend == GGML_BACKEND_TYPE_CPU) { - CUDA_CHECK(cudaDeviceSynchronize()); - } } static void ggml_cuda_timestep_embedding(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { @@ -9639,21 +9565,6 
@@ static void ggml_cuda_rms_norm(const ggml_tensor * src0, const ggml_tensor * src ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_rms_norm); } -GGML_CALL bool ggml_cuda_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst) { - if (!g_cublas_loaded) return false; - - const int64_t ne10 = src1->ne[0]; - - const int64_t ne0 = dst->ne[0]; - const int64_t ne1 = dst->ne[1]; - - // TODO: find the optimal values for these - return (src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type)) && - src1->type == GGML_TYPE_F32 && - dst->type == GGML_TYPE_F32 && - (ne0 >= 32 && ne1 >= 32 && ne10 >= 32); -} - static void ggml_cuda_mul_mat_vec_p021(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst){ GGML_ASSERT(ggml_is_permuted(src0) && ggml_is_permuted(src1)); GGML_ASSERT(src0->backend != GGML_BACKEND_TYPE_GPU_SPLIT); @@ -9891,11 +9802,6 @@ static void ggml_cuda_mul_mat_batched_cublas(const ggml_tensor * src0, const ggm } static void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { - const bool all_on_device = - (src0->backend == GGML_BACKEND_TYPE_GPU || src0->backend == GGML_BACKEND_TYPE_GPU_SPLIT) && - (src1->backend == GGML_BACKEND_TYPE_GPU) && - ( dst->backend == GGML_BACKEND_TYPE_GPU); - const bool split = src0->backend == GGML_BACKEND_TYPE_GPU_SPLIT; int64_t min_compute_capability = INT_MAX; @@ -9972,13 +9878,13 @@ static void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1 //printf("src0 is contiguous %d, transposed %d, type = %s, name = %s\n", ggml_is_contiguous(src0), ggml_is_transposed(src0), ggml_type_name(src0->type), src0->name); //printf("src1 is contiguous %d, transposed %d, type = %s, name = %s\n", ggml_is_contiguous(src1), ggml_is_transposed(src1), ggml_type_name(src1->type), src1->name); - if (!split && all_on_device && !fp16_performance_good && src0->type == GGML_TYPE_F16 && ggml_is_permuted(src0) && ggml_is_permuted(src1) && src1->ne[1] == 1) { + if (!split && !fp16_performance_good && src0->type == GGML_TYPE_F16 && ggml_is_permuted(src0) && ggml_is_permuted(src1) && src1->ne[1] == 1) { // KQ single-batch ggml_cuda_mul_mat_vec_p021(src0, src1, dst); - } else if (!split && all_on_device && !fp16_performance_good && src0->type == GGML_TYPE_F16 && !ggml_is_contiguous(src0) && !ggml_is_transposed(src1) && src1->ne[1] == 1) { + } else if (!split && !fp16_performance_good && src0->type == GGML_TYPE_F16 && !ggml_is_contiguous(src0) && !ggml_is_transposed(src1) && src1->ne[1] == 1) { // KQV single-batch ggml_cuda_mul_mat_vec_nc(src0, src1, dst); - } else if (!split && all_on_device && fp16_performance_good && src0->type == GGML_TYPE_F16 && !ggml_is_transposed(src0) && !ggml_is_transposed(src1) && src1->ne[2]*src1->ne[3] > 1) { + } else if (!split && fp16_performance_good && src0->type == GGML_TYPE_F16 && !ggml_is_transposed(src0) && !ggml_is_transposed(src1) && src1->ne[2]*src1->ne[3] > 1) { // KQ + KQV multi-batch ggml_cuda_mul_mat_batched_cublas(src0, src1, dst); } else if (use_dequantize_mul_mat_vec) { @@ -10178,6 +10084,7 @@ static void ggml_cuda_mul_mat_id(const ggml_tensor * src0, const ggml_tensor * s ggml_cuda_mul_mat_id_cublas(dst); // TODO: mmq/mmv support #endif + cudaStream_t stream = g_cudaStreams[g_main_device][0]; const size_t nb11 = src1->nb[1]; const size_t nb1 = dst->nb[1]; @@ -10187,16 +10094,9 @@ static void ggml_cuda_mul_mat_id(const ggml_tensor * src0, const ggml_tensor * s const int32_t n_as = 
((int32_t *) dst->op_params)[1]; std::vector ids_host(ggml_nbytes(ids)); - - cudaStream_t stream = g_cudaStreams[g_main_device][0]; - - if (ids->backend == GGML_BACKEND_TYPE_GPU) { - const char * ids_dev = (const char *)((const ggml_tensor_extra_gpu *)ids->extra)->data_device[g_main_device]; - CUDA_CHECK(cudaMemcpyAsync(ids_host.data(), ids_dev, ggml_nbytes(ids), cudaMemcpyDeviceToHost, stream)); - CUDA_CHECK(cudaStreamSynchronize(stream)); - } else { - memcpy(ids_host.data(), ids->data, ggml_nbytes(ids)); - } + const char * ids_dev = (const char *)((const ggml_tensor_extra_gpu *)ids->extra)->data_device[g_main_device]; + CUDA_CHECK(cudaMemcpyAsync(ids_host.data(), ids_dev, ggml_nbytes(ids), cudaMemcpyDeviceToHost, stream)); + CUDA_CHECK(cudaStreamSynchronize(stream)); const ggml_tensor_extra_gpu * src1_extra = (const ggml_tensor_extra_gpu *) src1->extra; const ggml_tensor_extra_gpu * dst_extra = (const ggml_tensor_extra_gpu *) dst->extra; @@ -10213,20 +10113,11 @@ static void ggml_cuda_mul_mat_id(const ggml_tensor * src0, const ggml_tensor * s src1_row.extra = &src1_row_extra; dst_row.extra = &dst_row_extra; - char * src1_original = src1->backend == GGML_BACKEND_TYPE_CPU ? - (char *) src1->data : (char *) src1_extra->data_device[g_main_device]; - char * dst_original = dst->backend == GGML_BACKEND_TYPE_CPU ? - (char *) dst->data : (char *) dst_extra->data_device[g_main_device]; + char * src1_original = (char *) src1_extra->data_device[g_main_device]; + char * dst_original = (char *) dst_extra->data_device[g_main_device]; if (src1->ne[1] == 1) { - GGML_ASSERT(src1->backend == GGML_BACKEND_TYPE_GPU); - GGML_ASSERT(dst->backend == GGML_BACKEND_TYPE_GPU); - for (int64_t i01 = 0; i01 < ids->ne[1]; i01++) { - //int32_t row_id; - //CUDA_CHECK(cudaMemcpyAsync(&row_id, ids_dev + i01*ids->nb[1] + id*ids->nb[0], sizeof(int32_t), cudaMemcpyDeviceToHost, g_cudaStreams[g_main_device][0])); - //CUDA_CHECK(cudaStreamSynchronize(g_cudaStreams[g_main_device][0])); - const int32_t row_id = *(const int32_t *) (ids_host.data() + i01*ids->nb[1] + id*ids->nb[0]); GGML_ASSERT(row_id >= 0 && row_id < n_as); @@ -10248,11 +10139,6 @@ static void ggml_cuda_mul_mat_id(const ggml_tensor * src0, const ggml_tensor * s src1_row_extra.data_device[g_main_device] = src1_contiguous.get(); dst_row_extra.data_device[g_main_device] = dst_contiguous.get(); - const cudaMemcpyKind src1_kind = src1->backend == GGML_BACKEND_TYPE_CPU ? - cudaMemcpyHostToDevice : cudaMemcpyDeviceToDevice; - const cudaMemcpyKind dst_kind = dst->backend == GGML_BACKEND_TYPE_CPU ? 
- cudaMemcpyDeviceToHost : cudaMemcpyDeviceToDevice; - for (int32_t row_id = 0; row_id < n_as; ++row_id) { const struct ggml_tensor * src0_row = dst->src[row_id + 2]; @@ -10267,7 +10153,7 @@ static void ggml_cuda_mul_mat_id(const ggml_tensor * src0, const ggml_tensor * s GGML_ASSERT(row_id >= 0 && row_id < n_as); CUDA_CHECK(cudaMemcpyAsync(src1_contiguous.get() + num_src1_rows*nb11, src1_original + i01*nb11, - nb11, src1_kind, stream)); + nb11, cudaMemcpyDeviceToDevice, stream)); num_src1_rows++; } @@ -10299,15 +10185,11 @@ static void ggml_cuda_mul_mat_id(const ggml_tensor * src0, const ggml_tensor * s GGML_ASSERT(row_id >= 0 && row_id < n_as); CUDA_CHECK(cudaMemcpyAsync(dst_original + i01*nb1, dst_contiguous.get() + num_src1_rows*nb1, - nb1, dst_kind, stream)); + nb1, cudaMemcpyDeviceToDevice, stream)); num_src1_rows++; } } } - - if (dst->backend == GGML_BACKEND_TYPE_CPU) { - CUDA_CHECK(cudaStreamSynchronize(stream)); - } } static void ggml_cuda_scale(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { @@ -10435,7 +10317,7 @@ static size_t ggml_nbytes_split(const struct ggml_tensor * tensor, int nrows_spl return nrows_split*ggml_row_size(tensor->type, tensor->ne[0]); } -GGML_CALL static void ggml_cuda_set_main_device(const int main_device) { +static void ggml_cuda_set_main_device(const int main_device) { if (main_device >= g_device_count) { fprintf(stderr, "warning: cannot set main_device=%d because there are only %d devices. Using device %d instead.\n", main_device, g_device_count, g_main_device); @@ -10450,18 +10332,9 @@ GGML_CALL static void ggml_cuda_set_main_device(const int main_device) { } } -GGML_CALL bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor) { +static bool ggml_cuda_compute_forward(struct ggml_tensor * tensor) { if (!g_cublas_loaded) return false; - ggml_cuda_func_t func; - const bool any_on_device = tensor->backend == GGML_BACKEND_TYPE_GPU - || (tensor->src[0] != nullptr && (tensor->src[0]->backend == GGML_BACKEND_TYPE_GPU || tensor->src[0]->backend == GGML_BACKEND_TYPE_GPU_SPLIT)) - || (tensor->src[1] != nullptr && tensor->src[1]->backend == GGML_BACKEND_TYPE_GPU); - - if (!any_on_device && tensor->op != GGML_OP_MUL_MAT && tensor->op != GGML_OP_MUL_MAT_ID) { - return false; - } - if (tensor->op == GGML_OP_MUL_MAT) { if (tensor->src[0]->ne[3] != tensor->src[1]->ne[3]) { #ifndef NDEBUG @@ -10471,6 +10344,8 @@ GGML_CALL bool ggml_cuda_compute_forward(struct ggml_compute_params * params, st } } + ggml_cuda_func_t func; + switch (tensor->op) { case GGML_OP_REPEAT: func = ggml_cuda_repeat; @@ -10548,15 +10423,9 @@ GGML_CALL bool ggml_cuda_compute_forward(struct ggml_compute_params * params, st func = ggml_cuda_rms_norm; break; case GGML_OP_MUL_MAT: - if (!any_on_device && !ggml_cuda_can_mul_mat(tensor->src[0], tensor->src[1], tensor)) { - return false; - } func = ggml_cuda_mul_mat; break; case GGML_OP_MUL_MAT_ID: - if (!any_on_device && !ggml_cuda_can_mul_mat(tensor->src[2], tensor->src[1], tensor)) { - return false; - } func = ggml_cuda_mul_mat_id; break; case GGML_OP_SCALE: @@ -10613,17 +10482,11 @@ GGML_CALL bool ggml_cuda_compute_forward(struct ggml_compute_params * params, st ggml_cuda_set_peer_access(tensor->src[1]->ne[1]); } - if (params->ith != 0) { - return true; - } - if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) { - return true; - } func(tensor->src[0], tensor->src[1], tensor); return true; } -GGML_CALL int ggml_cuda_get_device_count() { +static int 
ggml_cuda_get_device_count() { int device_count; if (cudaGetDeviceCount(&device_count) != cudaSuccess) { return 0; @@ -10631,7 +10494,7 @@ GGML_CALL int ggml_cuda_get_device_count() { return device_count; } -GGML_CALL void ggml_cuda_get_device_description(int device, char * description, size_t description_size) { +static void ggml_cuda_get_device_description(int device, char * description, size_t description_size) { cudaDeviceProp prop; CUDA_CHECK(cudaGetDeviceProperties(&prop, device)); snprintf(description, description_size, "%s", prop.name); @@ -10736,6 +10599,7 @@ GGML_CALL static void ggml_backend_cuda_buffer_init_tensor(ggml_backend_buffer_t size_t padded_size = ggml_backend_buft_get_alloc_size(buffer->buft, tensor); if (padded_size > original_size && tensor->view_src == nullptr) { + ggml_cuda_set_device(ctx->device); CUDA_CHECK(cudaMemset((char *)tensor->data + original_size, 0, padded_size - original_size)); } } @@ -10873,6 +10737,8 @@ static ggml_backend_buffer_type_i ggml_backend_cuda_buffer_type_interface = { }; GGML_CALL ggml_backend_buffer_type_t ggml_backend_cuda_buffer_type(int device) { + ggml_init_cublas(); + // FIXME: this is not thread safe if (device >= ggml_backend_cuda_get_device_count()) { return nullptr; @@ -11157,6 +11023,8 @@ static ggml_backend_buffer_type_i ggml_backend_cuda_split_buffer_type_interface }; GGML_CALL ggml_backend_buffer_type_t ggml_backend_cuda_split_buffer_type(const float * tensor_split) { + ggml_init_cublas(); + // FIXME: this is not thread safe static std::map, struct ggml_backend_buffer_type> buft_map; @@ -11348,9 +11216,6 @@ GGML_CALL static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t ggml_cuda_set_main_device(cuda_ctx->device); - ggml_compute_params params = {}; - params.type = GGML_TASK_TYPE_COMPUTE; - params.ith = 0; for (int i = 0; i < cgraph->n_nodes; i++) { ggml_tensor * node = cgraph->nodes[i]; @@ -11372,7 +11237,7 @@ GGML_CALL static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t } #endif - bool ok = ggml_cuda_compute_forward(¶ms, node); + bool ok = ggml_cuda_compute_forward(node); if (!ok) { fprintf(stderr, "%s: error: op not supported %s (%s)\n", __func__, node->name, ggml_op_name(node->op)); } @@ -11509,6 +11374,14 @@ GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, cons UNUSED(backend); } +GGML_CALL static bool ggml_backend_cuda_offload_op(ggml_backend_t backend, const ggml_tensor * op) { + const int min_batch_size = 32; + + return op->ne[1] >= min_batch_size && op->op != GGML_OP_GET_ROWS; + + UNUSED(backend); +} + static ggml_backend_event_t ggml_backend_cuda_event_new(ggml_backend_t backend) { ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context; @@ -11571,6 +11444,7 @@ static ggml_backend_i ggml_backend_cuda_interface = { /* .graph_plan_compute = */ NULL, /* .graph_compute = */ ggml_backend_cuda_graph_compute, /* .supports_op = */ ggml_backend_cuda_supports_op, + /* .offload_op = */ ggml_backend_cuda_offload_op, /* .event_new = */ ggml_backend_cuda_event_new, /* .event_free = */ ggml_backend_cuda_event_free, /* .event_record = */ ggml_backend_cuda_event_record, @@ -11584,7 +11458,7 @@ static ggml_guid_t ggml_backend_cuda_guid() { } GGML_CALL ggml_backend_t ggml_backend_cuda_init(int device) { - ggml_init_cublas(); // TODO: remove from ggml.c + ggml_init_cublas(); if (device < 0 || device >= ggml_cuda_get_device_count()) { fprintf(stderr, "%s: error: invalid device %d\n", __func__, device); @@ -11627,6 +11501,31 @@ GGML_CALL void 
ggml_backend_cuda_get_device_memory(int device, size_t * free, si CUDA_CHECK(cudaMemGetInfo(free, total)); } +GGML_CALL bool ggml_backend_cuda_register_host_buffer(void * buffer, size_t size) { + if (getenv("GGML_CUDA_NO_PINNED") != nullptr) { + return false; + } + + cudaError_t err = cudaHostRegister(buffer, size, cudaHostRegisterPortable | cudaHostRegisterReadOnly); + if (err != cudaSuccess) { + // clear the error + cudaGetLastError(); + + fprintf(stderr, "%s: warning: failed to register %.2f MiB of pinned memory: %s\n", __func__, + size/1024.0/1024.0, cudaGetErrorString(err)); + return false; + } + return true; +} + +GGML_CALL void ggml_backend_cuda_unregister_host_buffer(void * buffer) { + cudaError_t err = cudaHostUnregister(buffer); + if (err != cudaSuccess) { + // clear the error + cudaGetLastError(); + } +} + // backend registry GGML_CALL static ggml_backend_t ggml_backend_reg_cuda_init(const char * params, void * user_data) { ggml_backend_t cuda_backend = ggml_backend_cuda_init((int) (intptr_t) user_data); diff --git a/ggml-cuda.h b/ggml-cuda.h index b1ebd61d7..5eb4af40f 100644 --- a/ggml-cuda.h +++ b/ggml-cuda.h @@ -17,29 +17,17 @@ extern "C" { #define GGML_CUDA_MAX_DEVICES 16 -// Always success. To check if CUDA is actually loaded, use `ggml_cublas_loaded`. -GGML_API GGML_CALL void ggml_init_cublas(void); - -// Returns `true` if there are available CUDA devices and cublas loads successfully; otherwise, it returns `false`. -GGML_API GGML_CALL bool ggml_cublas_loaded(void); - -GGML_API GGML_CALL void * ggml_cuda_host_malloc(size_t size); -GGML_API GGML_CALL void ggml_cuda_host_free(void * ptr); - -GGML_API GGML_CALL bool ggml_cuda_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst); -GGML_API GGML_CALL bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor); - -GGML_API GGML_CALL int ggml_cuda_get_device_count(void); -GGML_API GGML_CALL void ggml_cuda_get_device_description(int device, char * description, size_t description_size); - // backend API GGML_API GGML_CALL ggml_backend_t ggml_backend_cuda_init(int device); GGML_API GGML_CALL bool ggml_backend_is_cuda(ggml_backend_t backend); +// device buffer GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_cuda_buffer_type(int device); + // split tensor buffer that splits matrices by rows across multiple devices GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_cuda_split_buffer_type(const float * tensor_split); + // pinned host buffer for use with the CPU backend for faster copies between CPU and GPU GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_cuda_host_buffer_type(void); @@ -47,6 +35,9 @@ GGML_API GGML_CALL int ggml_backend_cuda_get_device_count(void); GGML_API GGML_CALL void ggml_backend_cuda_get_device_description(int device, char * description, size_t description_size); GGML_API GGML_CALL void ggml_backend_cuda_get_device_memory(int device, size_t * free, size_t * total); +GGML_API GGML_CALL bool ggml_backend_cuda_register_host_buffer(void * buffer, size_t size); +GGML_API GGML_CALL void ggml_backend_cuda_unregister_host_buffer(void * buffer); + #ifdef __cplusplus } #endif diff --git a/ggml-kompute.cpp b/ggml-kompute.cpp index 4caf2c9e7..81dd50678 100644 --- a/ggml-kompute.cpp +++ b/ggml-kompute.cpp @@ -1951,6 +1951,7 @@ static struct ggml_backend_i kompute_backend_i = { /* .graph_plan_compute = */ NULL, /* .graph_compute = */ ggml_backend_kompute_graph_compute, /* .supports_op = */ 
ggml_backend_kompute_supports_op, + /* .offload_op = */ NULL, /* .event_new = */ NULL, /* .event_free = */ NULL, /* .event_record = */ NULL, diff --git a/ggml-metal.m b/ggml-metal.m index c3451a79b..109e5fe6b 100644 --- a/ggml-metal.m +++ b/ggml-metal.m @@ -2837,6 +2837,7 @@ static struct ggml_backend_i ggml_backend_metal_i = { /* .graph_plan_compute = */ NULL, /* .graph_compute = */ ggml_backend_metal_graph_compute, /* .supports_op = */ ggml_backend_metal_supports_op, + /* .offload_op = */ NULL, /* .event_new = */ NULL, /* .event_free = */ NULL, /* .event_record = */ NULL, diff --git a/ggml-sycl.cpp b/ggml-sycl.cpp index 6dc5eb20c..d51f23b41 100644 --- a/ggml-sycl.cpp +++ b/ggml-sycl.cpp @@ -17390,6 +17390,7 @@ static ggml_backend_i ggml_backend_sycl_interface = { /* .graph_plan_compute = */ NULL, /* .graph_compute = */ ggml_backend_sycl_graph_compute, /* .supports_op = */ ggml_backend_sycl_supports_op, + /* .offload_op = */ NULL, /* .event_new = */ NULL, /* .event_free = */ NULL, /* .event_record = */ NULL, diff --git a/ggml-vulkan.cpp b/ggml-vulkan.cpp index 698b31496..cbceaa19f 100644 --- a/ggml-vulkan.cpp +++ b/ggml-vulkan.cpp @@ -5699,6 +5699,7 @@ static ggml_backend_i ggml_backend_vk_interface = { /* .graph_plan_compute = */ NULL, /* .graph_compute = */ ggml_backend_vk_graph_compute, /* .supports_op = */ ggml_backend_vk_supports_op, + /* .offload_op = */ NULL, /* .event_new = */ NULL, /* .event_free = */ NULL, /* .event_record = */ NULL, diff --git a/ggml.c b/ggml.c index fa23cb3c4..1d5854960 100644 --- a/ggml.c +++ b/ggml.c @@ -282,8 +282,6 @@ inline static void * ggml_calloc(size_t num, size_t size) { #else #include #endif -#elif defined(GGML_USE_CUBLAS) -#include "ggml-cuda.h" #elif defined(GGML_USE_CLBLAST) #include "ggml-opencl.h" #elif defined(GGML_USE_VULKAN) @@ -2640,9 +2638,7 @@ struct ggml_context * ggml_init(struct ggml_init_params params) { GGML_PRINT_DEBUG("%s: g_state initialized in %f ms\n", __func__, (t_end - t_start)/1000.0f); } -#if defined(GGML_USE_CUBLAS) - ggml_init_cublas(); -#elif defined(GGML_USE_CLBLAST) +#if defined(GGML_USE_CLBLAST) ggml_cl_init(); #elif defined(GGML_USE_VULKAN) ggml_vk_init_cpu_assist(); @@ -11105,7 +11101,6 @@ static void ggml_compute_forward_out_prod_f32( // nb01 >= nb00 - src0 is not transposed // compute by src0 rows - // TODO: #if defined(GGML_USE_CUBLAS) ggml_cuda_out_prod // TODO: #if defined(GGML_USE_CLBLAST) #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) @@ -11305,7 +11300,6 @@ static void ggml_compute_forward_out_prod_q_f32( // nb01 >= nb00 - src0 is not transposed // compute by src0 rows - // TODO: #if defined(GGML_USE_CUBLAS) ggml_cuda_out_prod // TODO: #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CLBLAST) if (params->type == GGML_TASK_TYPE_INIT) { @@ -16051,14 +16045,7 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm return; } -#ifdef GGML_USE_CUBLAS - bool skip_cpu = ggml_cuda_compute_forward(params, tensor); - if (skip_cpu) { - return; - } - GGML_ASSERT(tensor->src[0] == NULL || tensor->src[0]->backend == GGML_BACKEND_TYPE_CPU); - GGML_ASSERT(tensor->src[1] == NULL || tensor->src[1]->backend == GGML_BACKEND_TYPE_CPU); -#elif defined(GGML_USE_VULKAN) +#if defined(GGML_USE_VULKAN) const bool skip_cpu = ggml_vk_compute_forward_cpu_assist(params, tensor); #ifdef GGML_VULKAN_CHECK_RESULTS if (skip_cpu) { @@ -16070,7 +16057,7 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm } GGML_ASSERT(tensor->src[0] == 
NULL || tensor->src[0]->backend == GGML_BACKEND_TYPE_CPU); GGML_ASSERT(tensor->src[1] == NULL || tensor->src[1]->backend == GGML_BACKEND_TYPE_CPU); -#endif // GGML_USE_CUBLAS +#endif // GGML_USE_VULKAN #ifdef GGML_USE_SYCL bool skip_cpu = ggml_sycl_compute_forward(params, tensor); diff --git a/llama.cpp b/llama.cpp index e4db288dd..b8bef6daf 100644 --- a/llama.cpp +++ b/llama.cpp @@ -2040,6 +2040,11 @@ struct llama_model { ggml_free(ctx); } for (ggml_backend_buffer_t buf : bufs) { +#ifdef GGML_USE_CUBLAS + if (ggml_backend_buffer_get_type(buf) == ggml_backend_cpu_buffer_type()) { + ggml_backend_cuda_unregister_host_buffer(ggml_backend_buffer_get_base(buf)); + } +#endif ggml_backend_buffer_free(buf); } } @@ -5033,6 +5038,13 @@ static bool llm_load_tensors( size_t first, last; ml.get_mapping_range(&first, &last, ctx); buf = ggml_backend_cpu_buffer_from_ptr((char *) ml.mapping->addr + first, last - first); +#ifdef GGML_USE_CUBLAS + if (n_layer >= n_gpu_layers) { + ggml_backend_cuda_register_host_buffer( + ggml_backend_buffer_get_base(buf), + ggml_backend_buffer_get_size(buf)); + } +#endif } #ifdef GGML_USE_METAL else if (ml.use_mmap && buft == ggml_backend_metal_buffer_type()) { @@ -8231,7 +8243,6 @@ struct llm_build_context { cur = llm_build_kv(ctx0, model, hparams, kv_self, gf, model.layers[il].wo, model.layers[il].bo, Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); - cb(cur, "kqv_out", il); } struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); @@ -8601,12 +8612,15 @@ static struct ggml_cgraph * llama_build_graph( } // norm may be automatically assigned to the backend of the previous layer, increasing data transfer between backends - // to fix this, we assign the norm layer manually to the backend of its layer - if (il != -1 && strcmp(name, "norm") == 0) { - for (auto * backend : lctx.backends) { - if (ggml_backend_buft_supports_backend(lctx.model.buft_layer[il].buft, backend)) { - ggml_backend_sched_set_tensor_backend(lctx.sched, cur, backend); - break; + // FIXME: fix in ggml_backend_sched + const bool full_offload = lctx.model.n_gpu_layers > (int)lctx.model.hparams.n_layer; + if (batch.n_tokens < 32 || full_offload) { + if (il != -1 && strcmp(name, "norm") == 0) { + for (auto * backend : lctx.backends) { + if (ggml_backend_buft_supports_backend(lctx.model.buft_layer[il].buft, backend)) { + ggml_backend_sched_set_tensor_backend(lctx.sched, cur, backend); + break; + } } } } @@ -13107,27 +13121,25 @@ struct llama_context * llama_new_context_with_model( ctx->backends.push_back(ctx->backend_metal); } #elif defined(GGML_USE_CUBLAS) - if (model->n_gpu_layers > 0) { + if (model->split_mode == LLAMA_SPLIT_MODE_NONE || model->split_mode == LLAMA_SPLIT_MODE_ROW) { // with split_mode LLAMA_SPLIT_MODE_NONE or LLAMA_SPLIT_MODE_ROW, only the main GPU backend is used - if (model->split_mode == LLAMA_SPLIT_MODE_NONE || model->split_mode == LLAMA_SPLIT_MODE_ROW) { - ggml_backend_t backend = ggml_backend_cuda_init(model->main_gpu); + ggml_backend_t backend = ggml_backend_cuda_init(model->main_gpu); + if (backend == nullptr) { + LLAMA_LOG_ERROR("%s: failed to initialize CUDA%d backend\n", __func__, model->main_gpu); + llama_free(ctx); + return nullptr; + } + ctx->backends.push_back(backend); + } else { + // LLAMA_SPLIT_MODE_LAYER requires a backend for each GPU + for (int device = 0; device < ggml_backend_cuda_get_device_count(); ++device) { + ggml_backend_t backend = ggml_backend_cuda_init(device); if (backend == nullptr) { - 
LLAMA_LOG_ERROR("%s: failed to initialize CUDA%d backend\n", __func__, model->main_gpu); + LLAMA_LOG_ERROR("%s: failed to initialize CUDA%d backend\n", __func__, device); llama_free(ctx); return nullptr; } ctx->backends.push_back(backend); - } else { - // LLAMA_SPLIT_MODE_LAYER requires a backend for each GPU - for (int device = 0; device < ggml_backend_cuda_get_device_count(); ++device) { - ggml_backend_t backend = ggml_backend_cuda_init(device); - if (backend == nullptr) { - LLAMA_LOG_ERROR("%s: failed to initialize CUDA%d backend\n", __func__, device); - llama_free(ctx); - return nullptr; - } - ctx->backends.push_back(backend); - } } } #elif defined(GGML_USE_VULKAN) @@ -13285,14 +13297,17 @@ struct llama_context * llama_new_context_with_model( ggml_backend_t backend = ctx->backends[i]; ggml_backend_buffer_type_t buft = backend_buft[i]; size_t size = ggml_backend_sched_get_buffer_size(ctx->sched, backend); - LLAMA_LOG_INFO("%s: %10s compute buffer size = %8.2f MiB\n", __func__, - ggml_backend_buft_name(buft), - size / 1024.0 / 1024.0); + if (size > 1) { + LLAMA_LOG_INFO("%s: %10s compute buffer size = %8.2f MiB\n", __func__, + ggml_backend_buft_name(buft), + size / 1024.0 / 1024.0); + } } // note: the number of splits during measure is higher than during inference due to the kv shift int n_splits = ggml_backend_sched_get_n_splits(ctx->sched); - LLAMA_LOG_INFO("%s: graph splits: %d\n", __func__, n_splits); + LLAMA_LOG_INFO("%s: graph nodes = %d\n", __func__, gf->n_nodes); + LLAMA_LOG_INFO("%s: graph splits = %d\n", __func__, n_splits); } } From 4f6d1337ca5a409dc74aca8c479b7c34408a69c0 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Mon, 18 Mar 2024 13:45:27 +0200 Subject: [PATCH 05/28] ci : temporarily disable sanitizer builds (#6128) --- .github/workflows/build.yml | 68 ++++++++++++++++++------------------ .github/workflows/server.yml | 6 ++-- 2 files changed, 37 insertions(+), 37 deletions(-) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 945df42f8..992c34a03 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -98,40 +98,40 @@ jobs: cd build ctest -L main --verbose --timeout 900 - ubuntu-latest-cmake-sanitizer: - runs-on: ubuntu-latest - - continue-on-error: true - - strategy: - matrix: - sanitizer: [ADDRESS, THREAD, UNDEFINED] - build_type: [Debug, Release] - - steps: - - name: Clone - id: checkout - uses: actions/checkout@v3 - - - name: Dependencies - id: depends - run: | - sudo apt-get update - sudo apt-get install build-essential - - - name: Build - id: cmake_build - run: | - mkdir build - cd build - cmake .. -DLLAMA_FATAL_WARNINGS=ON -DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} - cmake --build . --config ${{ matrix.build_type }} -j $(nproc) - - - name: Test - id: cmake_test - run: | - cd build - ctest -L main --verbose --timeout 900 +# ubuntu-latest-cmake-sanitizer: +# runs-on: ubuntu-latest +# +# continue-on-error: true +# +# strategy: +# matrix: +# sanitizer: [ADDRESS, THREAD, UNDEFINED] +# build_type: [Debug, Release] +# +# steps: +# - name: Clone +# id: checkout +# uses: actions/checkout@v3 +# +# - name: Dependencies +# id: depends +# run: | +# sudo apt-get update +# sudo apt-get install build-essential +# +# - name: Build +# id: cmake_build +# run: | +# mkdir build +# cd build +# cmake .. -DLLAMA_FATAL_WARNINGS=ON -DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} +# cmake --build .
--config ${{ matrix.build_type }} -j $(nproc) +# +# - name: Test +# id: cmake_test +# run: | +# cd build +# ctest -L main --verbose --timeout 900 ubuntu-latest-cmake-mpi: runs-on: ubuntu-latest diff --git a/.github/workflows/server.yml b/.github/workflows/server.yml index 4ea09115a..65ca7d9ca 100644 --- a/.github/workflows/server.yml +++ b/.github/workflows/server.yml @@ -24,13 +24,13 @@ jobs: strategy: matrix: - sanitizer: [ADDRESS, THREAD, UNDEFINED] + # TODO: temporarily disabled due to linux kernel issues + #sanitizer: [ADDRESS, THREAD, UNDEFINED] + sanitizer: [UNDEFINED] build_type: [Debug] include: - build_type: Release sanitizer: "" - - build_type: Debug - sanitizer: THREAD disabled_on_pr: true fail-fast: false # While -DLLAMA_SANITIZE_THREAD=ON is broken From ac9ee6a4ad740bc1ee484ede43e9f92b5af244c1 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Mon, 18 Mar 2024 13:45:38 +0200 Subject: [PATCH 06/28] ci : disable stale issue messages (#6126) --- .github/workflows/close-issue.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/.github/workflows/close-issue.yml b/.github/workflows/close-issue.yml index 2682f308c..eaffd074d 100644 --- a/.github/workflows/close-issue.yml +++ b/.github/workflows/close-issue.yml @@ -15,7 +15,6 @@ jobs: days-before-issue-stale: 30 days-before-issue-close: 14 stale-issue-label: "stale" - stale-issue-message: "This issue is stale because it has been open for 30 days with no activity." close-issue-message: "This issue was closed because it has been inactive for 14 days since being marked as stale." days-before-pr-stale: -1 days-before-pr-close: -1 From 5e1b7f94a03e0b3b8e4578625bbdadc7bbd2b93c Mon Sep 17 00:00:00 2001 From: slaren Date: Mon, 18 Mar 2024 16:33:44 +0100 Subject: [PATCH 07/28] backend : set max split inputs to GGML_MAX_SRC (#6137) --- ggml-backend.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ggml-backend.c b/ggml-backend.c index 9f0084df7..6026570ae 100644 --- a/ggml-backend.c +++ b/ggml-backend.c @@ -1015,7 +1015,7 @@ static bool ggml_is_view_op(enum ggml_op op) { #endif #ifndef GGML_SCHED_MAX_SPLIT_INPUTS -#define GGML_SCHED_MAX_SPLIT_INPUTS 4 +#define GGML_SCHED_MAX_SPLIT_INPUTS GGML_MAX_SRC #endif #ifndef GGML_SCHED_MAX_COPIES From 104f5e0fc156d48476258295457cafeec2a2af10 Mon Sep 17 00:00:00 2001 From: Felix Date: Mon, 18 Mar 2024 16:40:22 +0100 Subject: [PATCH 08/28] clip : fix memory leak (#6138) --- examples/llava/clip.cpp | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/examples/llava/clip.cpp b/examples/llava/clip.cpp index a0ed82d7e..690bca2eb 100644 --- a/examples/llava/clip.cpp +++ b/examples/llava/clip.cpp @@ -497,7 +497,6 @@ struct clip_ctx { // memory buffers to evaluate the model ggml_backend_buffer_t params_buffer = NULL; - ggml_backend_buffer_t compute_buffer = NULL; ggml_backend_t backend = NULL; ggml_gallocr_t compute_alloc = NULL; @@ -1676,6 +1675,9 @@ void clip_free(clip_ctx * ctx) { ggml_free(ctx->ctx_data); gguf_free(ctx->ctx_gguf); + ggml_backend_buffer_free(ctx->params_buffer); + ggml_backend_free(ctx->backend); + ggml_gallocr_free(ctx->compute_alloc); delete ctx; } From d199ca79f279e84ebe27caafe0aa59c461d88969 Mon Sep 17 00:00:00 2001 From: Jared Van Bortel Date: Mon, 18 Mar 2024 12:49:02 -0400 Subject: [PATCH 09/28] mpt : implement backwards compatibility with duped output tensor (#6139) --- llama.cpp | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/llama.cpp b/llama.cpp
@@ -540,6 +540,7 @@ static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES { { LLM_TENSOR_TOKEN_EMBD, "token_embd" }, { LLM_TENSOR_OUTPUT_NORM, "output_norm" }, + { LLM_TENSOR_OUTPUT, "output"}, { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" }, { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" }, { LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv" }, @@ -4300,9 +4301,9 @@ static bool llm_load_tensors( { model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}); model.output_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}); - if (gguf_find_tensor(ml.ctx_gguf, tn(LLM_TENSOR_OUTPUT, "weight").c_str()) >= 0) { - model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}); - } else { + + model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, false); + if (!model.output) { model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}); // needs to be on GPU ml.n_created--; // artificial tensor ml.size_data += ggml_nbytes(model.output); @@ -4507,10 +4508,12 @@ static bool llm_load_tensors( model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}); model.output_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, false); - // same as tok_embd, duplicated to allow offloading - model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}); - ml.n_created--; // artificial tensor - ml.size_data += ggml_nbytes(model.output); + model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, false); + if (!model.output) { + model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}); // needs to be on GPU + ml.n_created--; // artificial tensor + ml.size_data += ggml_nbytes(model.output); + } } for (int i = 0; i < n_layer; ++i) { From 2d15886bb092c3b780c676b5cc57ff3337af9c83 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Sun, 17 Mar 2024 06:37:44 +0000 Subject: [PATCH 10/28] flake.lock: Update MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Flake lock file updates: • Updated input 'nixpkgs': 'github:NixOS/nixpkgs/9df3e30ce24fd28c7b3e2de0d986769db5d6225d' (2024-03-06) → 'github:NixOS/nixpkgs/d691274a972b3165335d261cc4671335f5c67de9' (2024-03-14) --- flake.lock | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/flake.lock b/flake.lock index f9865d5e4..80de76dbf 100644 --- a/flake.lock +++ b/flake.lock @@ -20,11 +20,11 @@ }, "nixpkgs": { "locked": { - "lastModified": 1709703039, - "narHash": "sha256-6hqgQ8OK6gsMu1VtcGKBxKQInRLHtzulDo9Z5jxHEFY=", + "lastModified": 1710451336, + "narHash": "sha256-pP86Pcfu3BrAvRO7R64x7hs+GaQrjFes+mEPowCfkxY=", + "owner": "NixOS", "repo": "nixpkgs", - "rev": "9df3e30ce24fd28c7b3e2de0d986769db5d6225d", + "rev": "d691274a972b3165335d261cc4671335f5c67de9", "type": "github" }, "original": { From 4c28b8252907561165827125d2d1a4bad6926ac6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?DAN=E2=84=A2?= Date: Tue, 19 Mar 2024 01:59:36 -0400 Subject: [PATCH 11/28] common : print usage on '-h' and '--help' (#6145) --- common/common.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/common/common.cpp b/common/common.cpp index 919182862..5f10718ec 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -1056,7 +1056,8 @@ static bool
gpt_params_find_arg(int argc, char ** argv, gpt_params & params, int return true; } if (arg == "-h" || arg == "--help") { - return false; + gpt_print_usage(argc, argv, gpt_params()); + exit(0); } if (arg == "--version") { fprintf(stderr, "version: %d (%s)\n", LLAMA_BUILD_NUMBER, LLAMA_COMMIT); From 970a48060ab9a6cc67aa063870323781c2a7bd7d Mon Sep 17 00:00:00 2001 From: slaren Date: Tue, 19 Mar 2024 09:06:54 +0100 Subject: [PATCH 12/28] ci : exempt some labels from being tagged as stale (#6140) --- .github/workflows/close-issue.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/close-issue.yml b/.github/workflows/close-issue.yml index eaffd074d..a151c6780 100644 --- a/.github/workflows/close-issue.yml +++ b/.github/workflows/close-issue.yml @@ -12,6 +12,7 @@ jobs: steps: - uses: actions/stale@v5 with: + exempt-issue-labels: "refactor,help wanted,good first issue,research" days-before-issue-stale: 30 days-before-issue-close: 14 stale-issue-label: "stale" From b80cf3b2d1dee0ad325f7a794fecc66befce7336 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Tue, 19 Mar 2024 10:21:54 +0200 Subject: [PATCH 13/28] common : disable repeat penalties by default (#6127) --- common/sampling.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/common/sampling.h b/common/sampling.h index 48b2459d1..79a998be8 100644 --- a/common/sampling.h +++ b/common/sampling.h @@ -32,13 +32,13 @@ typedef struct llama_sampling_params { float dynatemp_range = 0.00f; // 0.0 = disabled float dynatemp_exponent = 1.00f; // controls how entropy maps to temperature in dynamic temperature sampler int32_t penalty_last_n = 64; // last n tokens to penalize (0 = disable penalty, -1 = context size) - float penalty_repeat = 1.10f; // 1.0 = disabled + float penalty_repeat = 1.00f; // 1.0 = disabled float penalty_freq = 0.00f; // 0.0 = disabled float penalty_present = 0.00f; // 0.0 = disabled int32_t mirostat = 0; // 0 = disabled, 1 = mirostat, 2 = mirostat 2.0 float mirostat_tau = 5.00f; // target entropy float mirostat_eta = 0.10f; // learning rate - bool penalize_nl = true; // consider newlines as a repeatable token + bool penalize_nl = false; // consider newlines as a repeatable token std::vector samplers_sequence = { llama_sampler_type::TOP_K, From d0d5de42e5a65865b5fddb6f5c785083539b74c3 Mon Sep 17 00:00:00 2001 From: Pierrick Hymbert Date: Tue, 19 Mar 2024 12:05:44 +0100 Subject: [PATCH 14/28] gguf-split: split and merge gguf per batch of tensors (#6135) * gguf-split: split and merge gguf files per tensor * gguf-split: build with make toolchain * gguf-split: rename `--split-tensors-size` to `--split-max-tensors`. 
Set the general.split_count KV in all splits * split : minor style + fix compile warnings * gguf-split: remove --upload not implemented --------- Co-authored-by: Georgi Gerganov --- Makefile | 4 + examples/CMakeLists.txt | 1 + examples/gguf-split/CMakeLists.txt | 5 + examples/gguf-split/README.md | 9 + examples/gguf-split/gguf-split.cpp | 491 +++++++++++++++++++++++++++++ 5 files changed, 510 insertions(+) create mode 100644 examples/gguf-split/CMakeLists.txt create mode 100644 examples/gguf-split/README.md create mode 100644 examples/gguf-split/gguf-split.cpp diff --git a/Makefile b/Makefile index 838daf5c0..1daad45ed 100644 --- a/Makefile +++ b/Makefile @@ -753,6 +753,10 @@ gguf: examples/gguf/gguf.cpp ggml.o $(OBJS) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) +gguf-split: examples/gguf-split/gguf-split.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS) + $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) + $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) + train-text-from-scratch: examples/train-text-from-scratch/train-text-from-scratch.cpp ggml.o llama.o $(COMMON_DEPS) train.o $(OBJS) $(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<) $(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS) diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt index e762cf8b9..b59cc65bf 100644 --- a/examples/CMakeLists.txt +++ b/examples/CMakeLists.txt @@ -21,6 +21,7 @@ else() add_subdirectory(embedding) add_subdirectory(finetune) add_subdirectory(gritlm) + add_subdirectory(gguf-split) add_subdirectory(infill) add_subdirectory(llama-bench) add_subdirectory(llava) diff --git a/examples/gguf-split/CMakeLists.txt b/examples/gguf-split/CMakeLists.txt new file mode 100644 index 000000000..828e62435 --- /dev/null +++ b/examples/gguf-split/CMakeLists.txt @@ -0,0 +1,5 @@ +set(TARGET gguf-split) +add_executable(${TARGET} gguf-split.cpp) +install(TARGETS ${TARGET} RUNTIME) +target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) +target_compile_features(${TARGET} PRIVATE cxx_std_11) diff --git a/examples/gguf-split/README.md b/examples/gguf-split/README.md new file mode 100644 index 000000000..ddb1f7649 --- /dev/null +++ b/examples/gguf-split/README.md @@ -0,0 +1,9 @@ +## GGUF split Example + +CLI to split / merge GGUF files. + +**Command line options:** + +- `--split`: split a GGUF into multiple GGUF files (default operation). +- `--split-max-tensors`: maximum number of tensors in each split (default: 128). +- `--merge`: merge multiple GGUF files into a single GGUF.
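To illustrate the shard sizing and naming scheme this patch introduces, here is a minimal standalone C++ sketch. The `"%s-%05d-of-%05d.gguf"` format string and the default of 128 tensors per split come straight from the implementation that follows; the model name and tensor count are made-up example values:

```cpp
#include <cmath>
#include <cstdio>
#include <string>

// mirrors SPLIT_FILENAME_FORMAT ("%s-%05d-of-%05d.gguf") from gguf-split.cpp;
// shards are numbered starting from 1, so i_split is offset by one
static std::string split_file_name(const std::string & prefix, int i_split, int n_split) {
    char buf[256];
    snprintf(buf, sizeof(buf), "%s-%05d-of-%05d.gguf", prefix.c_str(), i_split + 1, n_split);
    return buf;
}

int main() {
    const int n_tensors       = 291; // hypothetical tensor count of a model
    const int n_split_tensors = 128; // default of --split-max-tensors
    const int n_split = (int) std::ceil(1. * n_tensors / n_split_tensors);

    for (int i_split = 0; i_split < n_split; ++i_split) {
        printf("%s\n", split_file_name("llama-2-7b.Q4_0", i_split, n_split).c_str());
    }
    // prints:
    //   llama-2-7b.Q4_0-00001-of-00003.gguf
    //   llama-2-7b.Q4_0-00002-of-00003.gguf
    //   llama-2-7b.Q4_0-00003-of-00003.gguf
    return 0;
}
```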
diff --git a/examples/gguf-split/gguf-split.cpp b/examples/gguf-split/gguf-split.cpp new file mode 100644 index 000000000..5d7040ab5 --- /dev/null +++ b/examples/gguf-split/gguf-split.cpp @@ -0,0 +1,491 @@ +#include "llama.h" +#include "ggml.h" +#include "common.h" + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +enum split_operation : uint8_t { + SPLIT_OP_SPLIT, + SPLIT_OP_MERGE, +}; + +static const char * const LLM_KV_GENERAL_SPLIT_I_SPLIT = "general.split"; +static const char * const LLM_KV_GENERAL_SPLIT_N_SPLIT = "general.split_count"; + +static const int SPLIT_FILENAME_MAX = 256; + +static const char * const SPLIT_FILENAME_FORMAT = "%s-%05d-of-%05d.gguf"; + +struct split_params { + split_operation operation = SPLIT_OP_SPLIT; + int n_split_tensors = 128; + std::string input; + std::string output; +}; + +static void split_print_usage(const char * executable) { + const split_params default_params; + printf("\n"); + printf("usage: %s [options] GGUF_IN GGUF_OUT\n", executable); + printf("\n"); + printf("Apply a GGUF operation on IN to OUT."); + printf("\n"); + printf("options:\n"); + printf(" -h, --help show this help message and exit\n"); + printf(" --version show version and build info\n"); + printf(" --split split GGUF to multiple GGUF (default)\n"); + printf(" --split-max-tensors max tensors in each split: default(%d)\n", default_params.n_split_tensors); + printf(" --merge merge multiple GGUF to a single GGUF\n"); + printf("\n"); +} + +static bool split_params_parse_ex(int argc, const char ** argv, split_params & params) { + std::string arg; + const std::string arg_prefix = "--"; + bool invalid_param = false; + + int arg_idx = 1; + for (; arg_idx < argc && strncmp(argv[arg_idx], "--", 2) == 0; arg_idx++) { + arg = argv[arg_idx]; + if (arg.compare(0, arg_prefix.size(), arg_prefix) == 0) { + std::replace(arg.begin(), arg.end(), '_', '-'); + } + + bool arg_found = false; + if (arg == "-h" || arg == "--help") { + split_print_usage(argv[0]); + exit(0); + } + if (arg == "--version") { + fprintf(stderr, "version: %d (%s)\n", LLAMA_BUILD_NUMBER, LLAMA_COMMIT); + fprintf(stderr, "built with %s for %s\n", LLAMA_COMPILER, LLAMA_BUILD_TARGET); + exit(0); + } + + if (arg == "--merge") { + arg_found = true; + params.operation = SPLIT_OP_MERGE; + } + if (arg == "--split") { + arg_found = true; + params.operation = SPLIT_OP_SPLIT; + } + if (arg == "--split-max-tensors") { + if (++arg_idx >= argc) { + invalid_param = true; + break; + } + arg_found = true; + params.n_split_tensors = atoi(argv[arg_idx]); + } + + if (!arg_found) { + throw std::invalid_argument("error: unknown argument: " + arg); + } + } + + if (invalid_param) { + throw std::invalid_argument("error: invalid parameter for argument: " + arg); + } + + if (argc - arg_idx < 2) { + printf("%s: bad arguments\n", argv[0]); + split_print_usage(argv[0]); + return false; + } + + params.input = argv[arg_idx++]; + params.output = argv[arg_idx++]; + + return true; +} + +static bool split_params_parse(int argc, const char ** argv, split_params & params) { + bool result = true; + try { + if (!split_params_parse_ex(argc, argv, params)) { + split_print_usage(argv[0]); + exit(1); + } + } + catch (const std::invalid_argument & ex) { + fprintf(stderr, "%s\n", ex.what()); + split_print_usage(argv[0]); + exit(1); + } + return result; +} + +static void zeros(std::ofstream & file, size_t n) { + char zero = 0; + for (size_t i = 0; i < n; ++i) { + file.write(&zero, 1); + } +} + 
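+// note: each shard is written in two passes: split_start() reserves a zero-filled
+// placeholder for the GGUF metadata (via zeros() above), next_tensor() appends the
+// padded tensor data, and split_end() seeks back to the start of the file and
+// overwrites the placeholder with the final metadata
+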
+static std::string split_file_name(const std::string & path, int i_split, int n_split) { + char f_split[SPLIT_FILENAME_MAX] = {0}; + snprintf(f_split, sizeof(f_split), SPLIT_FILENAME_FORMAT, path.c_str(), i_split + 1, n_split); + return std::string(f_split); +} + +struct split_strategy { + const split_params params; + std::ifstream & f_input; + struct gguf_context * ctx_gguf; + struct ggml_context * ctx_meta = NULL; + const int n_tensors; + + const int n_split; + int i_split = 0; + + int i_tensor = 0; + + std::vector read_data; + + struct gguf_context * ctx_out; + std::ofstream fout; + + split_strategy(const split_params & params, + std::ifstream & f_input, + struct gguf_context * ctx_gguf, + struct ggml_context * ctx_meta) : + params(params), + f_input(f_input), + ctx_gguf(ctx_gguf), + ctx_meta(ctx_meta), + n_tensors(gguf_get_n_tensors(ctx_gguf)), + n_split(std::ceil(1. * n_tensors / params.n_split_tensors)) { + } + + bool should_split() const { + return i_tensor < n_tensors && i_tensor % params.n_split_tensors == 0; + } + + void split_start() { + ctx_out = gguf_init_empty(); + + // Save all metadata in first split only + if (i_split == 0) { + gguf_set_kv(ctx_out, ctx_gguf); + } + gguf_set_val_u8(ctx_out, LLM_KV_GENERAL_SPLIT_I_SPLIT, i_split); + gguf_set_val_u8(ctx_out, LLM_KV_GENERAL_SPLIT_N_SPLIT, n_split); + + // populate the original tensors, so we get an initial metadata + for (int i = i_split * params.n_split_tensors; i < n_tensors && i < (i_split + 1) * params.n_split_tensors; ++i) { + struct ggml_tensor * meta = ggml_get_tensor(ctx_meta, gguf_get_tensor_name(ctx_gguf, i)); + gguf_add_tensor(ctx_out, meta); + } + + auto split_name = split_file_name(params.output, i_split, n_split); + + fprintf(stderr, "%s: %s ...", __func__, split_name.c_str()); + fout = std::ofstream(split_name, std::ios::binary); + fout.exceptions(std::ofstream::failbit); // fail fast on write errors + + auto meta_size = gguf_get_meta_size(ctx_out); + + // placeholder for the meta data + ::zeros(fout, meta_size); + + i_split++; + } + + void next_tensor() { + const char * t_name = gguf_get_tensor_name(ctx_gguf, i_tensor); + struct ggml_tensor * t = ggml_get_tensor(ctx_meta, t_name); + auto n_bytes = ggml_nbytes(t); + + if (read_data.size() < n_bytes) { + read_data.resize(n_bytes); + } + + auto offset = gguf_get_data_offset(ctx_gguf) + gguf_get_tensor_offset(ctx_gguf, i_tensor); + f_input.seekg(offset); + f_input.read((char *)read_data.data(), n_bytes); + + t->data = read_data.data(); + + // write tensor data + padding + fout.write((const char *)t->data, n_bytes); + zeros(fout, GGML_PAD(n_bytes, GGUF_DEFAULT_ALIGNMENT) - n_bytes); + + i_tensor++; + } + + void split_end() { + // go back to beginning of file and write the updated metadata + fout.seekp(0); + std::vector data(gguf_get_meta_size(ctx_out)); + gguf_get_meta_data(ctx_out, data.data()); + fout.write((const char *)data.data(), data.size()); + + fout.close(); + gguf_free(ctx_out); + + fprintf(stderr, "\033[3Ddone\n"); + } +}; + +static void gguf_split(const split_params & split_params) { + struct ggml_context * ctx_meta = NULL; + + struct gguf_init_params params = { + /*.no_alloc = */ true, + /*.ctx = */ &ctx_meta, + }; + + std::ifstream f_input(split_params.input.c_str(), std::ios::binary); + if (!f_input.is_open()) { + fprintf(stderr, "%s: failed to open input GGUF from %s\n", __func__, split_params.input.c_str()); + exit(1); + } + + auto * ctx_gguf = gguf_init_from_file(split_params.input.c_str(), params); + if (!ctx_gguf) { + fprintf(stderr, "%s: failed 
to load input GGUF from %s\n", __func__, split_params.input.c_str()); + exit(1); + } + + split_strategy strategy(split_params, f_input, ctx_gguf, ctx_meta); + fprintf(stderr, "%s: %s -> %s (%d tensors per file)\n", + __func__, split_params.input.c_str(), + split_file_name(split_params.output, strategy.i_split, strategy.n_split).c_str(), + split_params.n_split_tensors); + + strategy.split_start(); + + while (strategy.i_tensor < strategy.n_tensors) { + strategy.next_tensor(); + if (strategy.should_split()) { + strategy.split_end(); + strategy.split_start(); + } + } + strategy.split_end(); + + gguf_free(ctx_gguf); + f_input.close(); + + fprintf(stderr, "%s: %d gguf split written with a total of %d tensors.\n", + __func__, strategy.n_split, strategy.n_tensors); +} + +static void gguf_merge(const split_params & split_params) { + fprintf(stderr, "%s: %s -> %s\n", + __func__, split_params.input.c_str(), + split_params.output.c_str()); + int n_split = 1; + int total_tensors = 0; + + auto * ctx_out = gguf_init_empty(); + std::ofstream fout(split_params.output.c_str(), std::ios::binary); + fout.exceptions(std::ofstream::failbit); // fail fast on write errors + + std::vector read_data; + std::vector ctx_metas; + std::vector ctx_ggufs; + + std::string split_prefix; + + // First pass to find KV and tensors metadata + for (int i_split = 0; i_split < n_split; i_split++) { + struct ggml_context * ctx_meta = NULL; + + struct gguf_init_params params = { + /*.no_alloc = */ true, + /*.ctx = */ &ctx_meta, + }; + + auto split_name = split_params.input; + if (i_split > 0) { + split_name = split_file_name(split_prefix, i_split, n_split); + } + fprintf(stderr, "%s: reading metadata %s ...", __func__, split_name.c_str()); + + auto * ctx_gguf = gguf_init_from_file(split_name.c_str(), params); + if (!ctx_gguf) { + fprintf(stderr, "\n%s: failed to load input GGUF from %s\n", __func__, split_params.input.c_str()); + exit(1); + } + ctx_ggufs.push_back(ctx_gguf); + ctx_metas.push_back(ctx_meta); + + if (i_split == 0) { + auto key_n_split = gguf_find_key(ctx_gguf, LLM_KV_GENERAL_SPLIT_N_SPLIT); + if (key_n_split < 0) { + fprintf(stderr, + "\n%s: input file does not contain %s metadata\n", + __func__, + LLM_KV_GENERAL_SPLIT_N_SPLIT); + gguf_free(ctx_gguf); + gguf_free(ctx_out); + fout.close(); + exit(1); + } + + n_split = gguf_get_val_u8(ctx_gguf, key_n_split); + if (n_split < 1) { + fprintf(stderr, + "\n%s: input file does not contain a valid split count %d\n", + __func__, + n_split); + gguf_free(ctx_gguf); + gguf_free(ctx_out); + fout.close(); + exit(1); + } + + // Do not trigger merge if we try to merge again the output + gguf_set_val_u8(ctx_out, LLM_KV_GENERAL_SPLIT_N_SPLIT, 0); + + // Set metadata from the first split + gguf_set_kv(ctx_out, ctx_gguf); + } + + // Verify the file naming + { + int i_split_file = 0; + int n_split_file = 0; + const char * i_split_format = "-00000-of-00000.gguf"; + + if (split_name.size() < strlen(i_split_format)) { + fprintf(stderr, "\n%s: unexpected input file name: %s\n", __func__, split_params.input.c_str()); + for (auto * _ctx_gguf : ctx_ggufs) { + gguf_free(_ctx_gguf); + } + gguf_free(ctx_out); + fout.close(); + exit(1); + } + + split_prefix = split_name.substr(0, split_name.size() - strlen(i_split_format)); + + const char * split_name_c_str = split_name.c_str(); + int n_part = sscanf(&split_name_c_str[0] + split_prefix.size(), "-%d-of-%d", &i_split_file, &n_split_file); + + if (n_part != 2 || i_split_file - 1 != i_split || n_split_file != n_split) { + fprintf(stderr, "\n%s: 
unexpected input file name: %s" + " i_split=%d i_split_file=%d" + " n_split=%d n_split_file=%d\n", __func__, + split_params.input.c_str(), + i_split, i_split_file, + n_split, n_split_file); + for (auto * _ctx_gguf : ctx_ggufs) { + gguf_free(_ctx_gguf); + } + gguf_free(ctx_out); + fout.close(); + exit(1); + } + } + + auto n_tensors = gguf_get_n_tensors(ctx_gguf); + for (int i_tensor = 0; i_tensor < n_tensors; i_tensor++) { + const char * t_name = gguf_get_tensor_name(ctx_gguf, i_tensor); + struct ggml_tensor * t = ggml_get_tensor(ctx_meta, t_name); + gguf_add_tensor(ctx_out, t); + } + total_tensors += n_tensors; + + fprintf(stderr, "\033[3Ddone\n"); + } + + // placeholder for the meta data + { + auto meta_size = gguf_get_meta_size(ctx_out); + ::zeros(fout, meta_size); + } + + // Write tensors data + for (int i_split = 0; i_split < n_split; i_split++) { + auto split_name = split_file_name(split_prefix, i_split, n_split); + std::ifstream f_input(split_name.c_str(), std::ios::binary); + if (!f_input.is_open()) { + fprintf(stderr, "%s: failed to open input GGUF from %s\n", __func__, split_name.c_str()); + for (auto * _ctx_gguf : ctx_ggufs) { + gguf_free(_ctx_gguf); + } + gguf_free(ctx_out); + fout.close(); + exit(1); + } + fprintf(stderr, "%s: writing tensors %s ...", __func__, split_name.c_str()); + + auto * ctx_gguf = ctx_ggufs[i_split]; + auto * ctx_meta = ctx_metas[i_split]; + + auto n_tensors = gguf_get_n_tensors(ctx_gguf); + for (int i_tensor = 0; i_tensor < n_tensors; i_tensor++) { + const char * t_name = gguf_get_tensor_name(ctx_gguf, i_tensor); + struct ggml_tensor * t = ggml_get_tensor(ctx_meta, t_name); + + auto n_bytes = ggml_nbytes(t); + + if (read_data.size() < n_bytes) { + read_data.resize(n_bytes); + } + + auto offset = gguf_get_data_offset(ctx_gguf) + gguf_get_tensor_offset(ctx_gguf, i_tensor); + f_input.seekg(offset); + f_input.read((char *)read_data.data(), n_bytes); + + // write tensor data + padding + fout.write((const char *)read_data.data(), n_bytes); + zeros(fout, GGML_PAD(n_bytes, GGUF_DEFAULT_ALIGNMENT) - n_bytes); + } + + gguf_free(ctx_gguf); + ggml_free(ctx_meta); + f_input.close(); + fprintf(stderr, "\033[3Ddone\n"); + } + + { + // go back to beginning of file and write the updated metadata + fout.seekp(0); + std::vector data(gguf_get_meta_size(ctx_out)); + gguf_get_meta_data(ctx_out, data.data()); + fout.write((const char *)data.data(), data.size()); + + fout.close(); + gguf_free(ctx_out); + } + + fprintf(stderr, "%s: %s merged from %d split with %d tensors.\n", + __func__, split_params.output.c_str(), n_split, total_tensors); +} + +int main(int argc, const char ** argv) { + if (argc < 3) { + split_print_usage(argv[0]); + } + + split_params params; + split_params_parse(argc, argv, params); + + switch (params.operation) { + case SPLIT_OP_SPLIT: gguf_split(params); + break; + case SPLIT_OP_MERGE: gguf_merge(params); + break; + default:split_print_usage(argv[0]); + exit(1); + } + + return 0; +} From d8b009a9456bf5284376149f3deb09300a37701a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?DAN=E2=84=A2?= Date: Tue, 19 Mar 2024 12:16:09 -0400 Subject: [PATCH 15/28] Remove unneeded header file.
(#6158) --- examples/gguf-split/gguf-split.cpp | 2 -- 1 file changed, 2 deletions(-) diff --git a/examples/gguf-split/gguf-split.cpp b/examples/gguf-split/gguf-split.cpp index 5d7040ab5..8e12e6493 100644 --- a/examples/gguf-split/gguf-split.cpp +++ b/examples/gguf-split/gguf-split.cpp @@ -13,9 +13,7 @@ #include #include -#include #include -#include enum split_operation : uint8_t { SPLIT_OP_SPLIT, SPLIT_OP_MERGE, }; From d26e8b669dbf1f5f5a0afe4d2d885e86cf566302 Mon Sep 17 00:00:00 2001 From: Abhilash Majumder <30946547+abhilash1910@users.noreply.github.com> Date: Wed, 20 Mar 2024 08:28:49 +0530 Subject: [PATCH 16/28] increase igpu cluster limit (#6159) --- ggml-sycl.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ggml-sycl.h b/ggml-sycl.h index c549a64a1..1c9d52115 100644 --- a/ggml-sycl.h +++ b/ggml-sycl.h @@ -13,7 +13,7 @@ extern "C" { #endif -#define GGML_SYCL_MAX_DEVICES 16 +#define GGML_SYCL_MAX_DEVICES 48 #define GGML_SYCL_NAME "SYCL" GGML_API void ggml_init_sycl(void); From 6c0b287748327741b113d7d6018b68c63039b1c5 Mon Sep 17 00:00:00 2001 From: Neo Zhang Jianyu Date: Wed, 20 Mar 2024 11:21:41 +0800 Subject: [PATCH 17/28] update readme sycl for new update (#6151) * update readme sycl for new update * Update README-sycl.md Co-authored-by: Abhilash Majumder <30946547+abhilash1910@users.noreply.github.com> * Update README-sycl.md Co-authored-by: Abhilash Majumder <30946547+abhilash1910@users.noreply.github.com> * Update README-sycl.md Co-authored-by: Abhilash Majumder <30946547+abhilash1910@users.noreply.github.com> * Update README-sycl.md Co-authored-by: Abhilash Majumder <30946547+abhilash1910@users.noreply.github.com> * Update README-sycl.md Co-authored-by: AidanBeltonS <87009434+AidanBeltonS@users.noreply.github.com> * Update README-sycl.md Co-authored-by: AidanBeltonS <87009434+AidanBeltonS@users.noreply.github.com> * update by review comments * update w64devkit link * update for verify device id part * Update README-sycl.md Co-authored-by: Meng, Hengyu --------- Co-authored-by: Abhilash Majumder <30946547+abhilash1910@users.noreply.github.com> Co-authored-by: AidanBeltonS <87009434+AidanBeltonS@users.noreply.github.com> Co-authored-by: Meng, Hengyu --- README-sycl.md | 129 ++++++++++++++++++++++--------- examples/sycl/win-run-llama2.bat | 2 - 2 files changed, 93 insertions(+), 38 deletions(-) diff --git a/README-sycl.md b/README-sycl.md index 9359a9490..32adfda47 100644 --- a/README-sycl.md +++ b/README-sycl.md @@ -29,6 +29,7 @@ For Intel CPU, recommend to use llama.cpp for X86 (Intel MKL building). ## News - 2024.3 + - A new baseline is ready: [tag b2437](https://github.com/ggerganov/llama.cpp/tree/b2437). - Support multiple cards: **--split-mode**: [none|layer]; not support [row], it's on developing. - Support to assign main GPU by **--main-gpu**, replace $GGML_SYCL_DEVICE. - Support detecting all GPUs with level-zero and same top **Max compute units**. @@ -81,7 +82,7 @@ For dGPU, please make sure the device memory is enough. For llama-2-7b.Q4_0, rec |-|-|-| |Ampere Series| Support| A100| -### oneMKL +### oneMKL for CUDA The current oneMKL release does not contain the oneMKL cuBlas backend. As a result for Nvidia GPU's oneMKL must be built from source.
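The device tables quoted in the documentation below come from llama.cpp's startup log. For readers unfamiliar with SYCL, roughly the same information can be listed with a few lines of standalone SYCL 2020 code; this is only an illustrative sketch, not the enumeration code the project actually uses:

```cpp
#include <sycl/sycl.hpp>
#include <cstdio>

int main() {
    // enumerate every device visible to the SYCL runtime, similar to the
    // "found N SYCL devices" table printed by llama.cpp at startup
    int id = 0;
    for (const auto & dev : sycl::device::get_devices()) {
        printf("|%2d| %-45s| %10zu| %15zu|\n",
               id++,
               dev.get_info<sycl::info::device::name>().c_str(),
               (size_t) dev.get_info<sycl::info::device::max_compute_units>(),
               (size_t) dev.get_info<sycl::info::device::global_mem_size>());
    }
    return 0;
}
```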
@@ -254,16 +255,16 @@ Run without parameter: Check the ID in startup log, like: ``` -found 4 SYCL devices: - Device 0: Intel(R) Arc(TM) A770 Graphics, compute capability 1.3, - max compute_units 512, max work group size 1024, max sub group size 32, global mem size 16225243136 - Device 1: Intel(R) FPGA Emulation Device, compute capability 1.2, - max compute_units 24, max work group size 67108864, max sub group size 64, global mem size 67065057280 - Device 2: 13th Gen Intel(R) Core(TM) i7-13700K, compute capability 3.0, - max compute_units 24, max work group size 8192, max sub group size 64, global mem size 67065057280 - Device 3: Intel(R) Arc(TM) A770 Graphics, compute capability 3.0, - max compute_units 512, max work group size 1024, max sub group size 32, global mem size 16225243136 - +found 6 SYCL devices: +| | | |Compute |Max compute|Max work|Max sub| | +|ID| Device Type| Name|capability|units |group |group |Global mem size| +|--|------------------|---------------------------------------------|----------|-----------|--------|-------|---------------| +| 0|[level_zero:gpu:0]| Intel(R) Arc(TM) A770 Graphics| 1.3| 512| 1024| 32| 16225243136| +| 1|[level_zero:gpu:1]| Intel(R) UHD Graphics 770| 1.3| 32| 512| 32| 53651849216| +| 2| [opencl:gpu:0]| Intel(R) Arc(TM) A770 Graphics| 3.0| 512| 1024| 32| 16225243136| +| 3| [opencl:gpu:1]| Intel(R) UHD Graphics 770| 3.0| 32| 512| 32| 53651849216| +| 4| [opencl:cpu:0]| 13th Gen Intel(R) Core(TM) i7-13700K| 3.0| 24| 8192| 64| 67064815616| +| 5| [opencl:acc:0]| Intel(R) FPGA Emulation Device| 1.2| 24|67108864| 64| 67064815616| ``` |Attribute|Note| @@ -271,12 +272,35 @@ found 4 SYCL devices: |compute capability 1.3|Level-zero running time, recommended | |compute capability 3.0|OpenCL running time, slower than level-zero in most cases| -4. Set device ID and execute llama.cpp +4. Device selection and execution of llama.cpp -Set device ID = 0 by **GGML_SYCL_DEVICE=0** +There are two device selection modes: + +- Single device: Use one device assigned by user. +- Multiple devices: Automatically choose the devices with the same biggest Max compute units. + +|Device selection|Parameter| +|-|-| +|Single device|--split-mode none --main-gpu DEVICE_ID | +|Multiple devices|--split-mode layer (default)| + +Examples: + +- Use device 0: ```sh -GGML_SYCL_DEVICE=0 ./build/bin/main -m models/llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 400 -e -ngl 33 +ZES_ENABLE_SYSMAN=1 ./build/bin/main -m models/llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 400 -e -ngl 33 -sm none -mg 0 +``` +or run by script: + +```sh +./examples/sycl/run_llama2.sh 0 +``` + +- Use multiple devices: + +```sh +ZES_ENABLE_SYSMAN=1 ./build/bin/main -m models/llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 400 -e -ngl 33 -sm layer ``` or run by script: @@ -289,12 +313,18 @@ Note: - By default, mmap is used to read model file. In some cases, it leads to the hang issue. Recommend to use parameter **--no-mmap** to disable mmap() to skip this issue. -5. Check the device ID in output +5. Verify the device ID in output + +Verify to see if the selected GPU is shown in the output, like: -Like: ``` -Using device **0** (Intel(R) Arc(TM) A770 Graphics) as main device +detect 1 SYCL GPUs: [0] with top Max compute units:512 ``` +Or +``` +use 1 SYCL GPUs: [0] with Max compute units:512 +``` + ## Windows @@ -355,7 +385,7 @@ a. Download & install cmake for Windows: https://cmake.org/download/ b. 
Download & install mingw-w64 make for Windows provided by w64devkit -- Download the latest fortran version of [w64devkit](https://github.com/skeeto/w64devkit/releases). +- Download the 1.19.0 version of [w64devkit](https://github.com/skeeto/w64devkit/releases/download/v1.19.0/w64devkit-1.19.0.zip). - Extract `w64devkit` on your pc. @@ -430,15 +460,16 @@ build\bin\main.exe Check the ID in startup log, like: ``` -found 4 SYCL devices: - Device 0: Intel(R) Arc(TM) A770 Graphics, compute capability 1.3, - max compute_units 512, max work group size 1024, max sub group size 32, global mem size 16225243136 - Device 1: Intel(R) FPGA Emulation Device, compute capability 1.2, - max compute_units 24, max work group size 67108864, max sub group size 64, global mem size 67065057280 - Device 2: 13th Gen Intel(R) Core(TM) i7-13700K, compute capability 3.0, - max compute_units 24, max work group size 8192, max sub group size 64, global mem size 67065057280 - Device 3: Intel(R) Arc(TM) A770 Graphics, compute capability 3.0, - max compute_units 512, max work group size 1024, max sub group size 32, global mem size 16225243136 +found 6 SYCL devices: +| | | |Compute |Max compute|Max work|Max sub| | +|ID| Device Type| Name|capability|units |group |group |Global mem size| +|--|------------------|---------------------------------------------|----------|-----------|--------|-------|---------------| +| 0|[level_zero:gpu:0]| Intel(R) Arc(TM) A770 Graphics| 1.3| 512| 1024| 32| 16225243136| +| 1|[level_zero:gpu:1]| Intel(R) UHD Graphics 770| 1.3| 32| 512| 32| 53651849216| +| 2| [opencl:gpu:0]| Intel(R) Arc(TM) A770 Graphics| 3.0| 512| 1024| 32| 16225243136| +| 3| [opencl:gpu:1]| Intel(R) UHD Graphics 770| 3.0| 32| 512| 32| 53651849216| +| 4| [opencl:cpu:0]| 13th Gen Intel(R) Core(TM) i7-13700K| 3.0| 24| 8192| 64| 67064815616| +| 5| [opencl:acc:0]| Intel(R) FPGA Emulation Device| 1.2| 24|67108864| 64| 67064815616| ``` @@ -447,13 +478,31 @@ found 4 SYCL devices: |compute capability 1.3|Level-zero running time, recommended | |compute capability 3.0|OpenCL running time, slower than level-zero in most cases| -4. Set device ID and execute llama.cpp -Set device ID = 0 by **set GGML_SYCL_DEVICE=0** +4. Device selection and execution of llama.cpp + +There are two device selection modes: + +- Single device: Use one device assigned by user. +- Multiple devices: Automatically choose the devices with the same biggest Max compute units. + +|Device selection|Parameter| +|-|-| +|Single device|--split-mode none --main-gpu DEVICE_ID | +|Multiple devices|--split-mode layer (default)| + +Examples: + +- Use device 0: ``` -set GGML_SYCL_DEVICE=0 -build\bin\main.exe -m models\llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:\nStep 1:" -n 400 -e -ngl 33 -s 0 +build\bin\main.exe -m models\llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:\nStep 1:" -n 400 -e -ngl 33 -s 0 -sm none -mg 0 +``` + +- Use multiple devices: + +``` +build\bin\main.exe -m models\llama-2-7b.Q4_0.gguf -p "Building a website can be done in 10 simple steps:\nStep 1:" -n 400 -e -ngl 33 -s 0 -sm layer ``` or run by script: @@ -466,11 +515,17 @@ Note: - By default, mmap is used to read model file. In some cases, it leads to the hang issue. Recommend to use parameter **--no-mmap** to disable mmap() to skip this issue. -5. Check the device ID in output -Like: +5. 
Verify the device ID in output + +Verify to see if the selected GPU is shown in the output, like: + ``` -Using device **0** (Intel(R) Arc(TM) A770 Graphics) as main device +detect 1 SYCL GPUs: [0] with top Max compute units:512 +``` +Or +``` +use 1 SYCL GPUs: [0] with Max compute units:512 ``` ## Environment Variable @@ -489,7 +544,6 @@ Using device **0** (Intel(R) Arc(TM) A770 Graphics) as main device |Name|Value|Function| |-|-|-| -|GGML_SYCL_DEVICE|0 (default) or 1|Set the device id used. Check the device ids by default running output| |GGML_SYCL_DEBUG|0 (default) or 1|Enable log function by macro: GGML_SYCL_DEBUG| |ZES_ENABLE_SYSMAN| 0 (default) or 1|Support to get free memory of GPU by sycl::aspect::ext_intel_free_memory.
Recommended to use when --split-mode = layer| @@ -507,6 +561,9 @@ Using device **0** (Intel(R) Arc(TM) A770 Graphics) as main device ## Q&A +Note: please add prefix **[SYCL]** in issue title, so that we will check it as soon as possible. + + - Error: `error while loading shared libraries: libsycl.so.7: cannot open shared object file: No such file or directory`. Miss to enable oneAPI running environment. @@ -538,4 +595,4 @@ Using device **0** (Intel(R) Arc(TM) A770 Graphics) as main device ## Todo -- Support multiple cards. +- Support row layer split for multiple card runs. diff --git a/examples/sycl/win-run-llama2.bat b/examples/sycl/win-run-llama2.bat index cf621c675..1d4d7d2cd 100644 --- a/examples/sycl/win-run-llama2.bat +++ b/examples/sycl/win-run-llama2.bat @@ -6,8 +6,6 @@ set INPUT2="Building a website can be done in 10 simple steps:\nStep 1:" @call "C:\Program Files (x86)\Intel\oneAPI\setvars.bat" intel64 --force -set GGML_SYCL_DEVICE=0 -rem set GGML_SYCL_DEBUG=1 .\build\bin\main.exe -m models\llama-2-7b.Q4_0.gguf -p %INPUT2% -n 400 -e -ngl 33 -s 0 From bd60d82d0cc8b6852ec535495a5042dbdf05de24 Mon Sep 17 00:00:00 2001 From: Jared Van Bortel Date: Wed, 20 Mar 2024 01:33:49 -0400 Subject: [PATCH 18/28] server tests : more pythonic process management; fix bare `except:` (#6146) * server tests : remove seemingly redundant newlines in print() * server tests : use built-in subprocess features, not os.kill and psutil * server tests : do not catch e.g. SystemExit; use print_exc * server tests: handle TimeoutExpired exception * server tests: fix connect on dual-stack systems * server: tests: add new tokens regex on windows generated following new repeat penalties default changed in (#6127) * server: tests: remove the hack on windows since now we get the good socket family * server: tests: add new tokens regex following new repeat penalties default changed in (#6127) * server: tests: add new tokens regex following new repeat penalties default changed in (#6127) --------- Co-authored-by: Pierrick HYMBERT --- examples/server/tests/features/environment.py | 74 ++++++------------- examples/server/tests/features/server.feature | 14 ++-- examples/server/tests/features/steps/steps.py | 33 +++++---- examples/server/tests/requirements.txt | 1 - 4 files changed, 46 insertions(+), 76 deletions(-) diff --git a/examples/server/tests/features/environment.py b/examples/server/tests/features/environment.py index 82104e920..e7845dc2f 100644 --- a/examples/server/tests/features/environment.py +++ b/examples/server/tests/features/environment.py @@ -5,15 +5,14 @@ import sys import time import traceback from contextlib import closing - -import psutil +from subprocess import TimeoutExpired def before_scenario(context, scenario): context.debug = 'DEBUG' in os.environ and os.environ['DEBUG'] == 'ON' if context.debug: - print("DEBUG=ON\n") - print(f"\x1b[33;42mStarting new scenario: {scenario.name}!\x1b[0m\n") + print("DEBUG=ON") + print(f"\x1b[33;42mStarting new scenario: {scenario.name}!\x1b[0m") port = 8080 if 'PORT' in os.environ: port = int(os.environ['PORT']) @@ -27,60 +26,40 @@ def after_scenario(context, scenario): return if scenario.status == "failed": if 'GITHUB_ACTIONS' in os.environ: - print(f"\x1b[33;101mSCENARIO FAILED: {scenario.name} server logs:\x1b[0m\n\n") + print(f"\x1b[33;101mSCENARIO FAILED: {scenario.name} server logs:\x1b[0m\n") if os.path.isfile('llama.log'): with closing(open('llama.log', 'r')) as f: for line in f: print(line) if not is_server_listening(context.server_fqdn, 
context.server_port): - print("\x1b[33;101mERROR: Server stopped listening\x1b[0m\n") + print("\x1b[33;101mERROR: Server stopped listening\x1b[0m") - if not pid_exists(context.server_process.pid): + if context.server_process.poll() is not None: assert False, f"Server not running pid={context.server_process.pid} ..." - server_graceful_shutdown(context) + server_graceful_shutdown(context) # SIGINT - # Wait few for socket to free up - time.sleep(0.05) + try: + context.server_process.wait(0.5) + except TimeoutExpired: + print(f"server still alive after 500ms, force-killing pid={context.server_process.pid} ...") + context.server_process.kill() # SIGKILL + context.server_process.wait() - attempts = 0 - while pid_exists(context.server_process.pid) or is_server_listening(context.server_fqdn, context.server_port): - server_kill(context) + while is_server_listening(context.server_fqdn, context.server_port): time.sleep(0.1) - attempts += 1 - if attempts > 5: - server_kill_hard(context) - except: - exc = sys.exception() - print("error in after scenario: \n") - print(exc) - print("*** print_tb: \n") - traceback.print_tb(exc.__traceback__, file=sys.stdout) + except Exception: + print("ignoring error in after_scenario:") + traceback.print_exc(file=sys.stdout) def server_graceful_shutdown(context): - print(f"shutting down server pid={context.server_process.pid} ...\n") + print(f"shutting down server pid={context.server_process.pid} ...") if os.name == 'nt': - os.kill(context.server_process.pid, signal.CTRL_C_EVENT) + interrupt = signal.CTRL_C_EVENT else: - os.kill(context.server_process.pid, signal.SIGINT) - - -def server_kill(context): - print(f"killing server pid={context.server_process.pid} ...\n") - context.server_process.kill() - - -def server_kill_hard(context): - pid = context.server_process.pid - path = context.server_path - - print(f"Server dangling exits, hard killing force {pid}={path}...\n") - try: - psutil.Process(pid).kill() - except psutil.NoSuchProcess: - return False - return True + interrupt = signal.SIGINT + context.server_process.send_signal(interrupt) def is_server_listening(server_fqdn, server_port): @@ -88,14 +67,5 @@ def is_server_listening(server_fqdn, server_port): result = sock.connect_ex((server_fqdn, server_port)) _is_server_listening = result == 0 if _is_server_listening: - print(f"server is listening on {server_fqdn}:{server_port}...\n") + print(f"server is listening on {server_fqdn}:{server_port}...") return _is_server_listening - - -def pid_exists(pid): - try: - psutil.Process(pid) - except psutil.NoSuchProcess: - return False - return True - diff --git a/examples/server/tests/features/server.feature b/examples/server/tests/features/server.feature index 7448986e7..45a988db6 100644 --- a/examples/server/tests/features/server.feature +++ b/examples/server/tests/features/server.feature @@ -35,9 +35,9 @@ Feature: llama.cpp server And metric llamacpp:tokens_predicted is Examples: Prompts - | prompt | n_predict | re_content | n_prompt | n_predicted | truncated | - | I believe the meaning of life is | 8 | (read\|going)+ | 18 | 8 | not | - | Write a joke about AI from a very long prompt which will not be truncated | 256 | (princesses\|everyone\|kids)+ | 46 | 64 | not | + | prompt | n_predict | re_content | n_prompt | n_predicted | truncated | + | I believe the meaning of life is | 8 | (read\|going)+ | 18 | 8 | not | + | Write a joke about AI from a very long prompt which will not be truncated | 256 | (princesses\|everyone\|kids\|Anna\|forest)+ | 46 | 64 | not | Scenario: 
Completion prompt truncated Given a prompt: @@ -48,7 +48,7 @@ Feature: llama.cpp server Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum. """ And a completion request with no api error - Then 64 tokens are predicted matching fun|Annaks|popcorns|pictry + Then 64 tokens are predicted matching fun|Annaks|popcorns|pictry|bowl And the completion is truncated And 109 prompt tokens are processed @@ -65,9 +65,9 @@ Feature: llama.cpp server And the completion is truncated Examples: Prompts - | model | system_prompt | user_prompt | max_tokens | re_content | n_prompt | n_predicted | enable_streaming | truncated | - | llama-2 | Book | What is the best book | 8 | (Here\|what)+ | 77 | 8 | disabled | not | - | codellama70b | You are a coding assistant. | Write the fibonacci function in c++. | 128 | (thanks\|happy\|bird)+ | -1 | 64 | enabled | | + | model | system_prompt | user_prompt | max_tokens | re_content | n_prompt | n_predicted | enable_streaming | truncated | + | llama-2 | Book | What is the best book | 8 | (Here\|what)+ | 77 | 8 | disabled | not | + | codellama70b | You are a coding assistant. | Write the fibonacci function in c++. | 128 | (thanks\|happy\|bird\|Annabyear)+ | -1 | 64 | enabled | | Scenario: Tokenize / Detokenize diff --git a/examples/server/tests/features/steps/steps.py b/examples/server/tests/features/steps/steps.py index 9e348d5fc..40c97001a 100644 --- a/examples/server/tests/features/steps/steps.py +++ b/examples/server/tests/features/steps/steps.py @@ -66,7 +66,7 @@ def step_server_config(context, server_fqdn, server_port): def step_download_hf_model(context, hf_file, hf_repo): context.model_file = hf_hub_download(repo_id=hf_repo, filename=hf_file) if context.debug: - print(f"model file: {context.model_file}\n") + print(f"model file: {context.model_file}") @step('a model file {model_file}') @@ -137,9 +137,12 @@ def step_start_server(context): if 'GITHUB_ACTIONS' in os.environ: max_attempts *= 2 + addrs = socket.getaddrinfo(context.server_fqdn, context.server_port, type=socket.SOCK_STREAM) + family, typ, proto, _, sockaddr = addrs[0] + while True: - with closing(socket.socket(socket.AF_INET, socket.SOCK_STREAM)) as sock: - result = sock.connect_ex((context.server_fqdn, context.server_port)) + with closing(socket.socket(family, typ, proto)) as sock: + result = sock.connect_ex(sockaddr) if result == 0: print("\x1b[33;46mserver started!\x1b[0m") return @@ -209,7 +212,7 @@ async def step_request_completion(context, api_error): user_api_key=context.user_api_key) context.tasks_result.append(completion) if context.debug: - print(f"Completion response: {completion}\n") + print(f"Completion response: {completion}") if expect_api_error: assert completion == 401, f"completion must be an 401 status code: {completion}" @@ -354,7 +357,7 @@ def step_prompt_passkey(context, passkey, i_pos): prompt += context.prompt_junk_suffix if context.debug: passkey_highlight = "\x1b[33m" + passkey + "\x1b[0m" - print(f"Passkey challenge:\n```{prompt.replace(passkey, passkey_highlight)}```\n") + print(f"Passkey challenge:\n```{prompt.replace(passkey, passkey_highlight)}```") context.prompts.append(context.prompt_prefix + prompt + context.prompt_suffix) context.n_prompts = len(context.prompts) @@ -363,7 +366,7 @@ def step_prompt_passkey(context, passkey, i_pos): @async_run_until_complete async def step_oai_chat_completions(context, api_error): if context.debug: - print(f"Submitting OAI compatible completions request...\n") + print(f"Submitting OAI 
compatible completions request...") expect_api_error = api_error == 'raised' completion = await oai_chat_completions(context.prompts.pop(), context.system_prompt, @@ -508,12 +511,12 @@ async def step_all_embeddings_are_the_same(context): embedding1 = np.array(embeddings[i]) embedding2 = np.array(embeddings[j]) if context.debug: - print(f"embedding1: {embedding1[-8:]}\n") - print(f"embedding2: {embedding2[-8:]}\n") + print(f"embedding1: {embedding1[-8:]}") + print(f"embedding2: {embedding2[-8:]}") similarity = np.dot(embedding1, embedding2) / (np.linalg.norm(embedding1) * np.linalg.norm(embedding2)) msg = f"Similarity between {i} and {j}: {similarity:.10f}" if context.debug: - print(f"{msg}\n") + print(f"{msg}") assert np.isclose(similarity, 1.0, rtol=1e-05, atol=1e-08, equal_nan=False), msg @@ -630,7 +633,7 @@ async def step_prometheus_metrics_exported(context): metrics_raw = await metrics_response.text() metric_exported = False if context.debug: - print(f"/metrics answer:\n{metrics_raw}\n") + print(f"/metrics answer:\n{metrics_raw}") context.metrics = {} for metric in parser.text_string_to_metric_families(metrics_raw): match metric.name: @@ -932,7 +935,7 @@ def assert_n_tokens_predicted(completion_response, expected_predicted_n=None, re last_match = end highlighted += content[last_match:] if 'DEBUG' in os.environ and os.environ['DEBUG'] == 'ON': - print(f"Checking completion response: {highlighted}\n") + print(f"Checking completion response: {highlighted}") assert last_match > 0, f'/{re_content}/ must match ```{highlighted}```' if expected_predicted_n and expected_predicted_n > 0: assert n_predicted == expected_predicted_n, (f'invalid number of tokens predicted:' @@ -942,7 +945,7 @@ def assert_n_tokens_predicted(completion_response, expected_predicted_n=None, re async def gather_tasks_results(context): n_tasks = len(context.concurrent_tasks) if context.debug: - print(f"Waiting for all {n_tasks} tasks results...\n") + print(f"Waiting for all {n_tasks} tasks results...") for task_no in range(n_tasks): context.tasks_result.append(await context.concurrent_tasks.pop()) n_completions = len(context.tasks_result) @@ -959,7 +962,7 @@ async def wait_for_health_status(context, slots_processing=None, expected_slots=None): if context.debug: - print(f"Starting checking for health for expected_health_status={expected_health_status}\n") + print(f"Starting checking for health for expected_health_status={expected_health_status}") interval = 0.5 counter = 0 if 'GITHUB_ACTIONS' in os.environ: @@ -1048,8 +1051,6 @@ def start_server_background(context): if 'LLAMA_SERVER_BIN_PATH' in os.environ: context.server_path = os.environ['LLAMA_SERVER_BIN_PATH'] server_listen_addr = context.server_fqdn - if os.name == 'nt': - server_listen_addr = '0.0.0.0' server_args = [ '--host', server_listen_addr, '--port', context.server_port, @@ -1088,7 +1089,7 @@ def start_server_background(context): server_args.append('--verbose') if 'SERVER_LOG_FORMAT_JSON' not in os.environ: server_args.extend(['--log-format', "text"]) - print(f"starting server with: {context.server_path} {server_args}\n") + print(f"starting server with: {context.server_path} {server_args}") flags = 0 if 'nt' == os.name: flags |= subprocess.DETACHED_PROCESS diff --git a/examples/server/tests/requirements.txt b/examples/server/tests/requirements.txt index c2c960102..2e4f42ad2 100644 --- a/examples/server/tests/requirements.txt +++ b/examples/server/tests/requirements.txt @@ -3,5 +3,4 @@ behave~=1.2.6 huggingface_hub~=0.20.3 numpy~=1.24.4 openai~=0.25.0 
-psutil~=5.9.8
 prometheus-client~=0.20.0

From 47cc7a7bf9793f81a6dfe7c2096c499403f64dd6 Mon Sep 17 00:00:00 2001
From: Karthick
Date: Wed, 20 Mar 2024 16:32:34 +0530
Subject: [PATCH 19/28] Server: Handle n_keep parameter in the request (#6174)

---
 examples/server/utils.hpp | 1 +
 1 file changed, 1 insertion(+)

diff --git a/examples/server/utils.hpp b/examples/server/utils.hpp
index 2ddb2cd21..cc5fce691 100644
--- a/examples/server/utils.hpp
+++ b/examples/server/utils.hpp
@@ -371,6 +371,7 @@ static json oaicompat_completion_params_parse(
     llama_params["repeat_last_n"] = json_value(body, "repeat_last_n", default_sparams.penalty_last_n);
     llama_params["ignore_eos"] = json_value(body, "ignore_eos", false);
     llama_params["tfs_z"] = json_value(body, "tfs_z", default_sparams.tfs_z);
+    llama_params["n_keep"] = json_value(body, "n_keep", 0);

     if (body.count("grammar") != 0) {
         llama_params["grammar"] = json_value(body, "grammar", json::object());
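For context, this patch only forwards `n_keep` from the request body into the server's sampling parameters. A minimal sketch of a client exercising it, assuming a llama.cpp server listening on localhost:8080 (the URL, model name, and prompt are illustrative):

```python
import requests

# Hypothetical local server; /v1/chat/completions is the OAI-compatible route
# whose body is parsed by oaicompat_completion_params_parse().
resp = requests.post(
    "http://localhost:8080/v1/chat/completions",
    json={
        "model": "local",
        "messages": [{"role": "user", "content": "Summarize this log file..."}],
        "max_tokens": 64,
        # New in this patch: n_keep is passed through to the server, which
        # (in llama.cpp) marks how many initial prompt tokens to retain when
        # the context is shifted.
        "n_keep": 32,
    },
)
print(resp.json()["choices"][0]["message"]["content"])
```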
From f8c4e745e1e728204ab26dbadf52853545e6789c Mon Sep 17 00:00:00 2001
From: Ziang Wu <97337387+ZiangWu-77@users.noreply.github.com>
Date: Wed, 20 Mar 2024 19:20:37 +0800
Subject: [PATCH 20/28] llava : add a MobileVLM_V2-1.7B backup (#6152)

* Add MobileVLM_V2 backup

* Update MobileVLM-README.md

* Update examples/llava/MobileVLM-README.md

Co-authored-by: Georgi Gerganov

* Update examples/llava/convert-image-encoder-to-gguf.py

Co-authored-by: Georgi Gerganov

* clip : fix whitespace

---------

Co-authored-by: Georgi Gerganov
---
 examples/llava/MobileVLM-README.md             | 14 ++++++-
 examples/llava/clip.cpp                        | 42 ++++++++++++++++++-
 .../llava/convert-image-encoder-to-gguf.py     |  9 ++--
 3 files changed, 59 insertions(+), 6 deletions(-)

diff --git a/examples/llava/MobileVLM-README.md b/examples/llava/MobileVLM-README.md
index 9eba791da..c1f361d17 100644
--- a/examples/llava/MobileVLM-README.md
+++ b/examples/llava/MobileVLM-README.md
@@ -1,11 +1,13 @@
 # MobileVLM

-Currently this implementation supports [MobileVLM-v1.7](https://huggingface.co/mtgv/MobileVLM-1.7B) variants.
+Currently this implementation supports [MobileVLM-1.7B](https://huggingface.co/mtgv/MobileVLM-1.7B) / [MobileVLM_V2-1.7B](https://huggingface.co/mtgv/MobileVLM_V2-1.7B) variants.

 for more information, please go to [Meituan-AutoML/MobileVLM](https://github.com/Meituan-AutoML/MobileVLM)

 The implementation is based on llava, and is compatible with llava and mobileVLM. The usage is basically same as llava.

+Note: the overall model-inference process is the same for both **MobileVLM** and **MobileVLM_V2**, but the model-conversion process differs slightly. Using **MobileVLM-1.7B** as an example, the conversion step that differs is shown below.
+
 ## Usage
 Build with cmake or run `make llava-cli` to build it.

@@ -34,7 +36,7 @@ git clone https://huggingface.co/openai/clip-vit-large-patch14-336
 python ./examples/llava/llava-surgery.py -m path/to/MobileVLM-1.7B
 ```

-3. Use `convert-image-encoder-to-gguf.py` with `--projector-type ldp` to convert the LLaVA image encoder to GGUF:
+3. Use `convert-image-encoder-to-gguf.py` with `--projector-type ldp` (for **V2** the arg is `--projector-type ldpv2`) to convert the LLaVA image encoder to GGUF:

 ```sh
 python ./examples/llava/convert-image-encoder-to-gguf \
@@ -44,6 +46,14 @@ python ./examples/llava/convert-image-encoder-to-gguf \
     --projector-type ldp
 ```

+```sh
+python ./examples/llava/convert-image-encoder-to-gguf \
+    -m path/to/clip-vit-large-patch14-336 \
+    --llava-projector path/to/MobileVLM-1.7B_V2/llava.projector \
+    --output-dir path/to/MobileVLM-1.7B_V2 \
+    --projector-type ldpv2
+```
+
 4. Use `convert.py` to convert the LLaMA part of LLaVA to GGUF:

 ```sh
diff --git a/examples/llava/clip.cpp b/examples/llava/clip.cpp
index 690bca2eb..6d7b4950f 100644
--- a/examples/llava/clip.cpp
+++ b/examples/llava/clip.cpp
@@ -119,6 +119,7 @@ static std::string format(const char * fmt, ...) {
 #define TN_LLAVA_PROJ "mm.%d.%s"
 #define TN_MVLM_PROJ_MLP "mm.model.mlp.%d.%s"
 #define TN_MVLM_PROJ_BLOCK "mm.model.mb_block.%d.block.%d.%s"
+#define TN_MVLM_PROJ_PEG "mm.model.peg.%d.%s"
 #define TN_IMAGE_NEWLINE "model.image_newline"

@@ -126,12 +127,14 @@ enum projector_type {
     PROJECTOR_TYPE_MLP,
     PROJECTOR_TYPE_MLP_NORM,
     PROJECTOR_TYPE_LDP,
+    PROJECTOR_TYPE_LDPV2,
     PROJECTOR_TYPE_UNKNOWN,
 };

 static std::map PROJECTOR_TYPE_NAMES = {
     { PROJECTOR_TYPE_MLP, "mlp" },
     { PROJECTOR_TYPE_LDP, "ldp" },
+    { PROJECTOR_TYPE_LDPV2, "ldpv2"},
 };

@@ -807,6 +810,29 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
         }
         embeddings = block_1;
     }
+    else if (ctx->proj_type == PROJECTOR_TYPE_LDPV2)
+    {
+        int n_patch = 24;
+        struct ggml_tensor * mlp_0 = ggml_mul_mat(ctx0, model.mm_model_mlp_0_w, embeddings);
+        mlp_0 = ggml_add(ctx0, mlp_0, model.mm_model_mlp_0_b);
+        mlp_0 = ggml_gelu(ctx0, mlp_0);
+        struct ggml_tensor * mlp_2 = ggml_mul_mat(ctx0, model.mm_model_mlp_2_w, mlp_0);
+        mlp_2 = ggml_add(ctx0, mlp_2, model.mm_model_mlp_2_b);
+        // mlp_2 ne = [2048, 576, 1, 1]
+        // // AVG Pool Layer 2*2, strides = 2
+        mlp_2 = ggml_cont(ctx0, ggml_permute(ctx0, mlp_2, 1, 0, 2, 3));
+        // mlp_2 ne = [576, 2048, 1, 1]
+        mlp_2 = ggml_reshape_4d(ctx0, mlp_2, n_patch, n_patch, mlp_2->ne[1], mlp_2->ne[2]);
+        // mlp_2 ne [24, 24, 2048, 1]
+        mlp_2 = ggml_pool_2d(ctx0, mlp_2, GGML_OP_POOL_AVG, 2, 2, 2, 2, 0, 0);
+        // weight ne = [3, 3, 2048, 1]
+        struct ggml_tensor * peg_0 = ggml_conv_depthwise_2d(ctx0, model.mm_model_peg_0_w, mlp_2, 1, 1, 1, 1, 1, 1);
+        peg_0 = ggml_add(ctx0, peg_0, mlp_2);
+        peg_0 = ggml_cont(ctx0, ggml_permute(ctx0, peg_0, 1, 2, 0, 3));
+        peg_0 = ggml_add(ctx0, peg_0, model.mm_model_peg_0_b);
+        peg_0 = ggml_reshape_3d(ctx0, peg_0, peg_0->ne[0], peg_0->ne[1] * peg_0->ne[2], peg_0->ne[3]);
+        embeddings = peg_0;
+    }
     else {
         GGML_ASSERT(false);
     }
@@ -1177,7 +1203,18 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
         vision_model.mm_model_block_2_block_2_0_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 2, 2, "0.weight"));
         vision_model.mm_model_block_2_block_2_1_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 2, 2, "1.weight"));
         vision_model.mm_model_block_2_block_2_1_b = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 2, 2, "1.bias"));
-    } else {
+    }
+    else if (new_clip->proj_type == PROJECTOR_TYPE_LDPV2)
+    {
+        // MobileVLM_V2 projection
+        vision_model.mm_model_mlp_0_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_MLP, 0, "weight"));
+        vision_model.mm_model_mlp_0_b = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_MLP, 0, "bias"));
+        vision_model.mm_model_mlp_2_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_MLP, 2, "weight"));
+        vision_model.mm_model_mlp_2_b = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_MLP, 2, "bias"));
+        vision_model.mm_model_peg_0_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_PEG, 0, "weight"));
+        vision_model.mm_model_peg_0_b = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_PEG, 0, "bias"));
+    }
+    else {
         std::string proj_type = PROJECTOR_TYPE_NAMES[new_clip->proj_type];
         throw std::runtime_error(format("%s: don't support projector with: %s currently\n", __func__, proj_type.c_str()));
     }
@@ -1966,6 +2003,9 @@ int clip_n_mmproj_embd(const struct clip_ctx * ctx) {
     if (ctx->proj_type == PROJECTOR_TYPE_LDP) {
         return ctx->vision_model.mm_model_block_1_block_2_1_b->ne[0];
     }
+    if (ctx->proj_type == PROJECTOR_TYPE_LDPV2) {
+        return ctx->vision_model.mm_model_peg_0_b->ne[0];
+    }
     if (ctx->proj_type == PROJECTOR_TYPE_MLP) {
         return ctx->vision_model.mm_2_b->ne[0];
     }
diff --git a/examples/llava/convert-image-encoder-to-gguf.py b/examples/llava/convert-image-encoder-to-gguf.py
index c69f89ac2..b00bf7c6d 100644
--- a/examples/llava/convert-image-encoder-to-gguf.py
+++ b/examples/llava/convert-image-encoder-to-gguf.py
@@ -1,6 +1,7 @@
 import argparse
 import os
 import json
+import re

 import torch
 import numpy as np
@@ -38,9 +39,11 @@ def should_skip_tensor(name: str, has_text: bool, has_vision: bool, has_llava: b
 def get_tensor_name(name: str) -> str:
     if "projection" in name:
         return name
-    if "mm_projector" in name:
-        return name.replace("model.mm_projector", "mm")
+    name = name.replace("model.mm_projector", "mm")
+    name = re.sub(r'mm\.mlp\.mlp', 'mm.model.mlp', name, count=1)
+    name = re.sub(r'mm\.peg\.peg', 'mm.model.peg', name, count=1)
+    return name

     return name.replace("text_model", "t").replace("vision_model", "v").replace("encoder.layers", "blk").replace("embeddings.", "").replace("_proj", "").replace("self_attn.", "attn_").replace("layer_norm", "ln").replace("layernorm", "ln").replace("mlp.fc1", "ffn_down").replace("mlp.fc2", "ffn_up").replace("embedding", "embd").replace("final", "post").replace("layrnorm", "ln")

@@ -83,7 +86,7 @@ ap.add_argument("--clip-model-is-vision", action="store_true", required=False,
 ap.add_argument("--clip-model-is-openclip", action="store_true", required=False,
                 help="The clip model is from openclip (for ViT-SO400M type))")
 ap.add_argument("--llava-projector", help="Path to llava.projector file. If specified, save an image encoder for LLaVA models.")
-ap.add_argument("--projector-type", help="Type of projector. Possible values: mlp, ldp", choices=["mlp", "ldp"], default="mlp")
+ap.add_argument("--projector-type", help="Type of projector. Possible values: mlp, ldp, ldpv2", choices=["mlp", "ldp", "ldpv2"], default="mlp")
 ap.add_argument("-o", "--output-dir", help="Directory to save GGUF files. Default is the original model directory", default=None)
 # Example --image_mean 0.48145466 0.4578275 0.40821073 --image_std 0.26862954 0.26130258 0.27577711
 # Example --image_mean 0.5 0.5 0.5 --image_std 0.5 0.5 0.5
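For orientation, the ggml graph added above for the LDPv2 projector corresponds roughly to the following PyTorch sketch: an MLP with GELU, a 2x2 average pool over the 24x24 patch grid, and a depthwise 3x3 convolution acting as a positional encoding generator (PEG) with a residual connection. The class name and dimensions here are assumptions taken from the shape comments in the patch, not part of the patch itself:

```python
import torch
import torch.nn as nn

class LDPv2Projector(nn.Module):
    # Rough, hypothetical PyTorch equivalent of the PROJECTOR_TYPE_LDPV2 graph:
    # Linear -> GELU -> Linear, then avg-pool 2x2 (stride 2), then a depthwise
    # 3x3 conv (PEG) added back onto its input.
    def __init__(self, clip_dim=1024, llm_dim=2048):
        super().__init__()
        self.mlp = nn.Sequential(
            nn.Linear(clip_dim, llm_dim), nn.GELU(), nn.Linear(llm_dim, llm_dim)
        )
        self.pool = nn.AvgPool2d(2, stride=2)
        self.peg = nn.Conv2d(llm_dim, llm_dim, 3, padding=1, groups=llm_dim)

    def forward(self, x):                 # x: [B, 576, clip_dim] (24x24 patches)
        x = self.mlp(x)                   # [B, 576, llm_dim]
        b, n, c = x.shape
        side = int(n ** 0.5)              # 24
        x = x.transpose(1, 2).reshape(b, c, side, side)
        x = self.pool(x)                  # [B, llm_dim, 12, 12]
        x = self.peg(x) + x               # residual PEG
        return x.flatten(2).transpose(1, 2)  # [B, 144, llm_dim]
```

The pooling step is why the V2 projector emits 144 image tokens instead of 576, which is the main practical difference from the LDP path.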
From d795988d9e09f75412efe2134512914c42780ad5 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov
Date: Wed, 20 Mar 2024 13:29:49 +0200
Subject: [PATCH 21/28] Revert "llava : add a MobileVLM_V2-1.7B backup (#6152)"

This reverts commit f8c4e745e1e728204ab26dbadf52853545e6789c.
---
 examples/llava/MobileVLM-README.md             | 14 +------
 examples/llava/clip.cpp                        | 42 +------------------
 .../llava/convert-image-encoder-to-gguf.py     |  9 ++--
 3 files changed, 6 insertions(+), 59 deletions(-)

diff --git a/examples/llava/MobileVLM-README.md b/examples/llava/MobileVLM-README.md
index c1f361d17..9eba791da 100644
--- a/examples/llava/MobileVLM-README.md
+++ b/examples/llava/MobileVLM-README.md
@@ -1,13 +1,11 @@
 # MobileVLM

-Currently this implementation supports [MobileVLM-1.7B](https://huggingface.co/mtgv/MobileVLM-1.7B) / [MobileVLM_V2-1.7B](https://huggingface.co/mtgv/MobileVLM_V2-1.7B) variants.
+Currently this implementation supports [MobileVLM-v1.7](https://huggingface.co/mtgv/MobileVLM-1.7B) variants.

 for more information, please go to [Meituan-AutoML/MobileVLM](https://github.com/Meituan-AutoML/MobileVLM)

 The implementation is based on llava, and is compatible with llava and mobileVLM. The usage is basically same as llava.

-Note: the overall model-inference process is the same for both **MobileVLM** and **MobileVLM_V2**, but the model-conversion process differs slightly. Using **MobileVLM-1.7B** as an example, the conversion step that differs is shown below.
-
 ## Usage
 Build with cmake or run `make llava-cli` to build it.

@@ -36,7 +34,7 @@ git clone https://huggingface.co/openai/clip-vit-large-patch14-336
 python ./examples/llava/llava-surgery.py -m path/to/MobileVLM-1.7B
 ```

-3. Use `convert-image-encoder-to-gguf.py` with `--projector-type ldp` (for **V2** the arg is `--projector-type ldpv2`) to convert the LLaVA image encoder to GGUF:
+3. Use `convert-image-encoder-to-gguf.py` with `--projector-type ldp` to convert the LLaVA image encoder to GGUF:

 ```sh
 python ./examples/llava/convert-image-encoder-to-gguf \
@@ -46,14 +44,6 @@ python ./examples/llava/convert-image-encoder-to-gguf \
     --projector-type ldp
 ```

-```sh
-python ./examples/llava/convert-image-encoder-to-gguf \
-    -m path/to/clip-vit-large-patch14-336 \
-    --llava-projector path/to/MobileVLM-1.7B_V2/llava.projector \
-    --output-dir path/to/MobileVLM-1.7B_V2 \
-    --projector-type ldpv2
-```
-
 4. Use `convert.py` to convert the LLaMA part of LLaVA to GGUF:

 ```sh
diff --git a/examples/llava/clip.cpp b/examples/llava/clip.cpp
index 6d7b4950f..690bca2eb 100644
--- a/examples/llava/clip.cpp
+++ b/examples/llava/clip.cpp
@@ -119,7 +119,6 @@ static std::string format(const char * fmt, ...) {
 #define TN_LLAVA_PROJ "mm.%d.%s"
 #define TN_MVLM_PROJ_MLP "mm.model.mlp.%d.%s"
 #define TN_MVLM_PROJ_BLOCK "mm.model.mb_block.%d.block.%d.%s"
-#define TN_MVLM_PROJ_PEG "mm.model.peg.%d.%s"
 #define TN_IMAGE_NEWLINE "model.image_newline"

@@ -127,14 +126,12 @@ enum projector_type {
     PROJECTOR_TYPE_MLP,
     PROJECTOR_TYPE_MLP_NORM,
     PROJECTOR_TYPE_LDP,
-    PROJECTOR_TYPE_LDPV2,
     PROJECTOR_TYPE_UNKNOWN,
 };

 static std::map PROJECTOR_TYPE_NAMES = {
     { PROJECTOR_TYPE_MLP, "mlp" },
     { PROJECTOR_TYPE_LDP, "ldp" },
-    { PROJECTOR_TYPE_LDPV2, "ldpv2"},
 };

@@ -810,29 +807,6 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
         }
         embeddings = block_1;
     }
-    else if (ctx->proj_type == PROJECTOR_TYPE_LDPV2)
-    {
-        int n_patch = 24;
-        struct ggml_tensor * mlp_0 = ggml_mul_mat(ctx0, model.mm_model_mlp_0_w, embeddings);
-        mlp_0 = ggml_add(ctx0, mlp_0, model.mm_model_mlp_0_b);
-        mlp_0 = ggml_gelu(ctx0, mlp_0);
-        struct ggml_tensor * mlp_2 = ggml_mul_mat(ctx0, model.mm_model_mlp_2_w, mlp_0);
-        mlp_2 = ggml_add(ctx0, mlp_2, model.mm_model_mlp_2_b);
-        // mlp_2 ne = [2048, 576, 1, 1]
-        // // AVG Pool Layer 2*2, strides = 2
-        mlp_2 = ggml_cont(ctx0, ggml_permute(ctx0, mlp_2, 1, 0, 2, 3));
-        // mlp_2 ne = [576, 2048, 1, 1]
-        mlp_2 = ggml_reshape_4d(ctx0, mlp_2, n_patch, n_patch, mlp_2->ne[1], mlp_2->ne[2]);
-        // mlp_2 ne [24, 24, 2048, 1]
-        mlp_2 = ggml_pool_2d(ctx0, mlp_2, GGML_OP_POOL_AVG, 2, 2, 2, 2, 0, 0);
-        // weight ne = [3, 3, 2048, 1]
-        struct ggml_tensor * peg_0 = ggml_conv_depthwise_2d(ctx0, model.mm_model_peg_0_w, mlp_2, 1, 1, 1, 1, 1, 1);
-        peg_0 = ggml_add(ctx0, peg_0, mlp_2);
-        peg_0 = ggml_cont(ctx0, ggml_permute(ctx0, peg_0, 1, 2, 0, 3));
-        peg_0 = ggml_add(ctx0, peg_0, model.mm_model_peg_0_b);
-        peg_0 = ggml_reshape_3d(ctx0, peg_0, peg_0->ne[0], peg_0->ne[1] * peg_0->ne[2], peg_0->ne[3]);
-        embeddings = peg_0;
-    }
     else {
         GGML_ASSERT(false);
     }
@@ -1203,18 +1177,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
         vision_model.mm_model_block_2_block_2_0_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 2, 2, "0.weight"));
         vision_model.mm_model_block_2_block_2_1_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 2, 2, "1.weight"));
         vision_model.mm_model_block_2_block_2_1_b = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 2, 2, "1.bias"));
-    }
-    else if (new_clip->proj_type == PROJECTOR_TYPE_LDPV2)
-    {
-        // MobileVLM_V2 projection
-        vision_model.mm_model_mlp_0_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_MLP, 0, "weight"));
-        vision_model.mm_model_mlp_0_b = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_MLP, 0, "bias"));
-        vision_model.mm_model_mlp_2_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_MLP, 2, "weight"));
-        vision_model.mm_model_mlp_2_b = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_MLP, 2, "bias"));
-        vision_model.mm_model_peg_0_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_PEG, 0, "weight"));
-        vision_model.mm_model_peg_0_b = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_PEG, 0, "bias"));
-    }
-    else {
+    } else {
         std::string proj_type = PROJECTOR_TYPE_NAMES[new_clip->proj_type];
         throw std::runtime_error(format("%s: don't support projector with: %s currently\n", __func__, proj_type.c_str()));
     }
@@ -2003,9 +1966,6 @@ int clip_n_mmproj_embd(const struct clip_ctx * ctx) {
     if (ctx->proj_type == PROJECTOR_TYPE_LDP) {
         return ctx->vision_model.mm_model_block_1_block_2_1_b->ne[0];
     }
-    if (ctx->proj_type == PROJECTOR_TYPE_LDPV2) {
-        return ctx->vision_model.mm_model_peg_0_b->ne[0];
-    }
     if (ctx->proj_type == PROJECTOR_TYPE_MLP) {
         return ctx->vision_model.mm_2_b->ne[0];
     }
diff --git a/examples/llava/convert-image-encoder-to-gguf.py b/examples/llava/convert-image-encoder-to-gguf.py
index b00bf7c6d..c69f89ac2 100644
--- a/examples/llava/convert-image-encoder-to-gguf.py
+++ b/examples/llava/convert-image-encoder-to-gguf.py
@@ -1,7 +1,6 @@
 import argparse
 import os
 import json
-import re

 import torch
 import numpy as np
@@ -39,11 +38,9 @@ def should_skip_tensor(name: str, has_text: bool, has_vision: bool, has_llava: b
 def get_tensor_name(name: str) -> str:
     if "projection" in name:
         return name
+    if "mm_projector" in name:
-    name = name.replace("model.mm_projector", "mm")
-    name = re.sub(r'mm\.mlp\.mlp', 'mm.model.mlp', name, count=1)
-    name = re.sub(r'mm\.peg\.peg', 'mm.model.peg', name, count=1)
-    return name
+        return name.replace("model.mm_projector", "mm")

     return name.replace("text_model", "t").replace("vision_model", "v").replace("encoder.layers", "blk").replace("embeddings.", "").replace("_proj", "").replace("self_attn.", "attn_").replace("layer_norm", "ln").replace("layernorm", "ln").replace("mlp.fc1", "ffn_down").replace("mlp.fc2", "ffn_up").replace("embedding", "embd").replace("final", "post").replace("layrnorm", "ln")

@@ -86,7 +83,7 @@ ap.add_argument("--clip-model-is-vision", action="store_true", required=False,
 ap.add_argument("--clip-model-is-openclip", action="store_true", required=False,
                 help="The clip model is from openclip (for ViT-SO400M type))")
 ap.add_argument("--llava-projector", help="Path to llava.projector file. If specified, save an image encoder for LLaVA models.")
-ap.add_argument("--projector-type", help="Type of projector. Possible values: mlp, ldp, ldpv2", choices=["mlp", "ldp", "ldpv2"], default="mlp")
+ap.add_argument("--projector-type", help="Type of projector. Possible values: mlp, ldp", choices=["mlp", "ldp"], default="mlp")
 ap.add_argument("-o", "--output-dir", help="Directory to save GGUF files. Default is the original model directory", default=None)
 # Example --image_mean 0.48145466 0.4578275 0.40821073 --image_std 0.26862954 0.26130258 0.27577711
 # Example --image_mean 0.5 0.5 0.5 --image_std 0.5 0.5 0.5

From bc0baab2ea8f806961dd3fb9f534cfeb59a2e1fc Mon Sep 17 00:00:00 2001
From: Georgi Gerganov
Date: Wed, 20 Mar 2024 14:14:32 +0200
Subject: [PATCH 22/28] server : allow to override -ngl in tests (#6170)

---
 examples/server/tests/features/steps/steps.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/examples/server/tests/features/steps/steps.py b/examples/server/tests/features/steps/steps.py
index 40c97001a..42cdd8985 100644
--- a/examples/server/tests/features/steps/steps.py
+++ b/examples/server/tests/features/steps/steps.py
@@ -24,12 +24,16 @@ from prometheus_client import parser
 def step_server_config(context, server_fqdn, server_port):
     context.server_fqdn = server_fqdn
     context.server_port = int(server_port)
+    context.n_gpu_layer = None
     if 'PORT' in os.environ:
         context.server_port = int(os.environ['PORT'])
         print(f"$PORT set, overriding server port with to {context.server_port}")
     if 'FQDN' in os.environ:
         context.server_fqdn = os.environ['FQDN']
         print(f"$FQDN set, overriding server fqdn with to {context.server_fqdn}")
+    if 'N_GPU_LAYERS' in os.environ:
+        context.n_gpu_layer = int(os.environ['N_GPU_LAYERS'])
+        print(f"$N_GPU_LAYERS set, overriding n_gpu_layer with to {context.n_gpu_layer}")

     context.base_url = f'http://{context.server_fqdn}:{context.server_port}'

@@ -41,7 +45,6 @@ def step_server_config(context, server_fqdn, server_port):
     context.n_ctx = None
     context.n_ga = None
     context.n_ga_w = None
-    context.n_gpu_layer = None
     context.n_predict = None
     context.n_prompts = 0
     context.n_server_predict = None

From 6b7e76d28cdf4361740054140708c71828453522 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov
Date: Wed, 20 Mar 2024 14:17:34 +0200
Subject: [PATCH 23/28] gitignore : ignore curl-related files

---
 .gitignore | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/.gitignore b/.gitignore
index 1ad8d929b..fbeb9223c 100644
--- a/.gitignore
+++ b/.gitignore
@@ -12,6 +12,8 @@
 *.dot
 *.bat
 *.metallib
+*.etag
+*.lastModified
 .DS_Store
 .build/
 .cache/
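With the N_GPU_LAYERS override in place, a test run with GPU offload could be driven as in this hedged sketch (the layer count, port, and feature-file path are illustrative, not taken from the patch):

```python
import os
import subprocess

# Hypothetical invocation of the behave-based server tests; N_GPU_LAYERS is
# the new override added in PATCH 22, while PORT was already supported.
env = dict(os.environ, N_GPU_LAYERS="33", PORT="8080")
subprocess.run(
    ["behave", "features/server.feature"],
    cwd="examples/server/tests",
    env=env,
    check=True,  # raise if any scenario fails
)
```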
From 91f8ad167dcd24b54615b468d9dd764ebe1d37ad Mon Sep 17 00:00:00 2001
From: Xuan Son Nguyen
Date: Wed, 20 Mar 2024 13:30:36 +0100
Subject: [PATCH 24/28] Server: version bump for httplib and json (#6169)

* server: version bump for httplib and json

* fix build

* bring back content_length
---
 examples/server/httplib.h | 1807 +++++++++++++++++++++++++------------
 examples/server/json.hpp  | 1664 +++++++++++++++++++---------------
 2 files changed, 2156 insertions(+), 1315 deletions(-)

diff --git a/examples/server/httplib.h b/examples/server/httplib.h
index 28746000c..f360bd93e 100644
--- a/examples/server/httplib.h
+++ b/examples/server/httplib.h
@@ -1,14 +1,14 @@
 //
 //  httplib.h
 //
-//  Copyright (c) 2023 Yuji Hirose. All rights reserved.
+//  Copyright (c) 2024 Yuji Hirose. All rights reserved.
 //  MIT License
 //

 #ifndef CPPHTTPLIB_HTTPLIB_H
 #define CPPHTTPLIB_HTTPLIB_H

-#define CPPHTTPLIB_VERSION "0.12.2"
+#define CPPHTTPLIB_VERSION "0.15.3"

 /*
  * Configuration
@@ -82,6 +82,10 @@
 #define CPPHTTPLIB_FORM_URL_ENCODED_PAYLOAD_MAX_LENGTH 8192
 #endif

+#ifndef CPPHTTPLIB_RANGE_MAX_COUNT
+#define CPPHTTPLIB_RANGE_MAX_COUNT 1024
+#endif
+
 #ifndef CPPHTTPLIB_TCP_NODELAY
 #define CPPHTTPLIB_TCP_NODELAY false
 #endif
@@ -160,10 +164,6 @@ using ssize_t = long;
 #define WSA_FLAG_NO_HANDLE_INHERIT 0x80
 #endif

-#ifndef strcasecmp
-#define strcasecmp _stricmp
-#endif // strcasecmp
-
 using socket_t = SOCKET;
 #ifdef CPPHTTPLIB_USE_POLL
 #define poll(fds, nfds, timeout) WSAPoll(fds, nfds, timeout)
@@ -172,9 +172,15 @@ using socket_t = SOCKET;
 #else // not _WIN32

 #include
-#ifndef _AIX
+#if !defined(_AIX) && !defined(__MVS__)
 #include
 #endif
+#ifdef __MVS__
+#include
+#ifndef NI_MAXHOST
+#define NI_MAXHOST 1025
+#endif
+#endif
 #include
 #include
 #include
@@ -187,6 +193,7 @@ using socket_t = SOCKET;
 #endif
 #include
 #include
+#include
 #include
 #include
 #include
@@ -207,6 +214,7 @@ using socket_t = int;
 #include
 #include
 #include
+#include
 #include
 #include
 #include
@@ -223,6 +231,9 @@ using socket_t = int;
 #include
 #include
 #include
+#include
+#include
+#include

 #ifdef CPPHTTPLIB_OPENSSL_SUPPORT
 #ifdef _WIN32
@@ -237,7 +248,6 @@ using socket_t = int;

 #ifdef _MSC_VER
 #pragma comment(lib, "crypt32.lib")
-#pragma comment(lib, "cryptui.lib")
 #endif
 #elif defined(CPPHTTPLIB_USE_CERTS_FROM_MACOSX_KEYCHAIN) && defined(__APPLE__)
 #include
@@ -259,10 +269,8 @@ using socket_t = int;
 #include
 #include

-#if OPENSSL_VERSION_NUMBER < 0x1010100fL
-#error Sorry, OpenSSL versions prior to 1.1.1 are not supported
-#elif OPENSSL_VERSION_NUMBER < 0x30000000L
-#define SSL_get1_peer_certificate SSL_get_peer_certificate
+#if OPENSSL_VERSION_NUMBER < 0x30000000L
+#error Sorry, OpenSSL versions prior to 3.0.0 are not supported
 #endif
 #endif
@@ -321,7 +329,7 @@ struct scope_exit {
   explicit scope_exit(std::function &&f)
       : exit_function(std::move(f)), execute_on_destruction{true} {}

-  scope_exit(scope_exit &&rhs)
+  scope_exit(scope_exit &&rhs) noexcept
      : exit_function(std::move(rhs.exit_function)),
        execute_on_destruction{rhs.execute_on_destruction} {
    rhs.release();
@@ -344,6 +352,81 @@ private:

 } // namespace detail

+enum StatusCode {
+  // Information responses
+  Continue_100 = 100,
+  SwitchingProtocol_101 = 101,
+  Processing_102 = 102,
+  EarlyHints_103 = 103,
+
+  // Successful responses
+  OK_200 = 200,
+  Created_201 = 201,
+  Accepted_202 = 202,
+  NonAuthoritativeInformation_203 = 203,
+  NoContent_204 = 204,
+  ResetContent_205 = 205,
+  PartialContent_206 = 206,
+  MultiStatus_207 = 207,
+  AlreadyReported_208 = 208,
+  IMUsed_226 = 226,
+
+  // Redirection messages
+  MultipleChoices_300 = 300,
+  MovedPermanently_301 = 301,
+  Found_302 = 302,
+  SeeOther_303 = 303,
+  NotModified_304 = 304,
+  UseProxy_305 = 305,
+  unused_306 = 306,
+  TemporaryRedirect_307 = 307,
+  PermanentRedirect_308 = 308,
+
+  // Client error responses
+  BadRequest_400 = 400,
+  Unauthorized_401 = 401,
+  PaymentRequired_402 = 402,
+  Forbidden_403 = 403,
+  NotFound_404 = 404,
+  MethodNotAllowed_405 = 405,
+  NotAcceptable_406 = 406,
+  ProxyAuthenticationRequired_407 = 407,
+  RequestTimeout_408 = 408,
+  Conflict_409 = 409,
+  Gone_410 = 410,
+  LengthRequired_411 = 411,
+  PreconditionFailed_412 = 412,
+  PayloadTooLarge_413 = 413,
+  UriTooLong_414 = 414,
+  UnsupportedMediaType_415 = 415,
+  RangeNotSatisfiable_416 = 416,
+  ExpectationFailed_417 = 417,
+  ImATeapot_418 = 418,
+  MisdirectedRequest_421 = 421,
+  UnprocessableContent_422 = 422,
+  Locked_423 = 423,
+  FailedDependency_424 = 424,
+  TooEarly_425 = 425,
+  UpgradeRequired_426 = 426,
+  PreconditionRequired_428 = 428,
+  TooManyRequests_429 = 429,
+  RequestHeaderFieldsTooLarge_431 = 431,
+  UnavailableForLegalReasons_451 = 451,
+
+  // Server error responses
+  InternalServerError_500 = 500,
+  NotImplemented_501 = 501,
+  BadGateway_502 = 502,
+  ServiceUnavailable_503 = 503,
+  GatewayTimeout_504 = 504,
+  HttpVersionNotSupported_505 = 505,
+  VariantAlsoNegotiates_506 = 506,
+  InsufficientStorage_507 = 507,
+  LoopDetected_508 = 508,
+  NotExtended_510 = 510,
+  NetworkAuthenticationRequired_511 = 511,
+};
+
 using Headers = std::multimap;

 using Params = std::multimap;
@@ -373,17 +456,18 @@ public:
   DataSink &operator=(DataSink &&) = delete;

   std::function write;
+  std::function is_writable;
   std::function done;
   std::function done_with_trailer;
   std::ostream os;

 private:
-  class data_sink_streambuf : public std::streambuf {
+  class data_sink_streambuf final : public std::streambuf {
   public:
     explicit data_sink_streambuf(DataSink &sink) : sink_(sink) {}

   protected:
-    std::streamsize xsputn(const char *s, std::streamsize n) {
+    std::streamsize xsputn(const char *s, std::streamsize n) override {
       sink_.write(s, static_cast(n));
       return n;
     }
@@ -465,6 +549,7 @@ struct Request {
   MultipartFormDataMap files;
   Ranges ranges;
   Match matches;
+  std::unordered_map path_params;

   // for client
   ResponseHandler response_handler;
@@ -476,8 +561,7 @@ struct Request {

   bool has_header(const std::string &key) const;
   std::string get_header_value(const std::string &key, size_t id = 0) const;
-  template
-  T get_header_value(const std::string &key, size_t id = 0) const;
+  uint64_t get_header_value_u64(const std::string &key, size_t id = 0) const;
   size_t get_header_value_count(const std::string &key) const;
   void set_header(const std::string &key, const std::string &val);
@@ -509,14 +593,14 @@ struct Response {

   bool has_header(const std::string &key) const;
   std::string get_header_value(const std::string &key, size_t id = 0) const;
-  template
-  T get_header_value(const std::string &key, size_t id = 0) const;
+  uint64_t get_header_value_u64(const std::string &key, size_t id = 0) const;
   size_t get_header_value_count(const std::string &key) const;
   void set_header(const std::string &key, const std::string &val);

-  void set_redirect(const std::string &url, int status = 302);
+  void set_redirect(const std::string &url, int status = StatusCode::Found_302);
   void set_content(const char *s, size_t n, const std::string &content_type);
   void set_content(const std::string &s, const std::string &content_type);
+  void set_content(std::string &&s, const std::string &content_type);

   void set_content_provider(
       size_t length, const std::string &content_type, ContentProvider provider,
@@ -573,15 +657,16 @@ public:
   TaskQueue() = default;
   virtual ~TaskQueue() = default;

-  virtual void enqueue(std::function fn) = 0;
+  virtual bool enqueue(std::function fn) = 0;
   virtual void shutdown() = 0;

   virtual void on_idle() {}
 };

-class ThreadPool : public TaskQueue {
+class ThreadPool final : public TaskQueue {
 public:
-  explicit ThreadPool(size_t n) : shutdown_(false) {
+  explicit ThreadPool(size_t n, size_t mqr = 0)
+      : shutdown_(false), max_queued_requests_(mqr) {
     while (n) {
       threads_.emplace_back(worker(*this));
       n--;
@@ -591,13 +676,17 @@ public:
   ThreadPool(const ThreadPool &) = delete;
   ~ThreadPool() override = default;

-  void enqueue(std::function fn) override {
+  bool enqueue(std::function fn) override {
     {
       std::unique_lock lock(mutex_);
+      if (max_queued_requests_ > 0 && jobs_.size() >= max_queued_requests_) {
+        return false;
+      }
       jobs_.push_back(std::move(fn));
     }

     cond_.notify_one();
+    return true;
   }

   void shutdown() override {
@@ -647,6 +736,7 @@ private:

   std::list> jobs_;
   bool shutdown_;
+  size_t max_queued_requests_ = 0;

   std::condition_variable cond_;
   std::mutex mutex_;
@@ -658,6 +748,82 @@ using SocketOptions = std::function;

 void default_socket_options(socket_t sock);

+const char *status_message(int status);
+
+std::string get_bearer_token_auth(const Request &req);
+
+namespace detail {
+
+class MatcherBase {
+public:
+  virtual ~MatcherBase() = default;
+
+  // Match request path and populate its matches and
+  virtual bool match(Request &request) const = 0;
+};
+
+/**
+ * Captures parameters in request path and stores them in Request::path_params
+ *
+ * Capture name is a substring of a pattern from : to /.
+ * The rest of the pattern is matched agains the request path directly
+ * Parameters are captured starting from the next character after
+ * the end of the last matched static pattern fragment until the next /.
+ *
+ * Example pattern:
+ * "/path/fragments/:capture/more/fragments/:second_capture"
+ * Static fragments:
+ * "/path/fragments/", "more/fragments/"
+ *
+ * Given the following request path:
+ * "/path/fragments/:1/more/fragments/:2"
+ * the resulting capture will be
+ * {{"capture", "1"}, {"second_capture", "2"}}
+ */
+class PathParamsMatcher final : public MatcherBase {
+public:
+  PathParamsMatcher(const std::string &pattern);
+
+  bool match(Request &request) const override;
+
+private:
+  static constexpr char marker = ':';
+  // Treat segment separators as the end of path parameter capture
+  // Does not need to handle query parameters as they are parsed before path
+  // matching
+  static constexpr char separator = '/';
+
+  // Contains static path fragments to match against, excluding the '/' after
+  // path params
+  // Fragments are separated by path params
+  std::vector static_fragments_;
+  // Stores the names of the path parameters to be used as keys in the
+  // Request::path_params map
+  std::vector param_names_;
+};
+
+/**
+ * Performs std::regex_match on request path
+ * and stores the result in Request::matches
+ *
+ * Note that regex match is performed directly on the whole request.
+ * This means that wildcard patterns may match multiple path segments with /:
+ * "/begin/(.*)/end" will match both "/begin/middle/end" and "/begin/1/2/end".
+ */
+class RegexMatcher final : public MatcherBase {
+public:
+  RegexMatcher(const std::string &pattern) : regex_(pattern) {}
+
+  bool match(Request &request) const override;
+
+private:
+  std::regex regex_;
+};
+
+ssize_t write_headers(Stream &strm, const Headers &headers);
+
+} // namespace detail
+
 class Server {
 public:
   using Handler = std::function;
@@ -702,6 +868,7 @@ public:
   bool remove_mount_point(const std::string &mount_point);
   Server &set_file_extension_and_mimetype_mapping(const std::string &ext,
                                                   const std::string &mime);
+  Server &set_default_file_mimetype(const std::string &mime);
   Server &set_file_request_handler(Handler handler);

   Server &set_error_handler(HandlerWithResponse handler);
@@ -718,6 +885,8 @@ public:
   Server &set_socket_options(SocketOptions socket_options);

   Server &set_default_headers(Headers headers);
+  Server &
+  set_header_writer(std::function const &writer);

   Server &set_keep_alive_max_count(size_t count);
   Server &set_keep_alive_timeout(time_t sec);
@@ -765,9 +934,14 @@ protected:
   size_t payload_max_length_ = CPPHTTPLIB_PAYLOAD_MAX_LENGTH;

 private:
-  using Handlers = std::vector>;
+  using Handlers =
+      std::vector, Handler>>;
   using HandlersForContentReader =
-      std::vector>;
+      std::vector,
+                            HandlerWithContentReader>>;
+
+  static std::unique_ptr
+  make_matcher(const std::string &pattern);

   socket_t create_server_socket(const std::string &host, int port,
                                 int socket_flags,
@@ -778,16 +952,16 @@ private:
   bool routing(Request &req, Response &res, Stream &strm);
   bool handle_file_request(const Request &req, Response &res,
                            bool head = false);
-  bool dispatch_request(Request &req, Response &res, const Handlers &handlers);
-  bool
-  dispatch_request_for_content_reader(Request &req, Response &res,
-                                      ContentReader content_reader,
-                                      const HandlersForContentReader &handlers);
+  bool dispatch_request(Request &req, Response &res,
+                        const Handlers &handlers) const;
+  bool dispatch_request_for_content_reader(
+      Request &req, Response &res, ContentReader content_reader,
+      const HandlersForContentReader &handlers) const;

-  bool parse_request_line(const char *s, Request &req);
+  bool parse_request_line(const char *s, Request &req) const;
   void apply_ranges(const Request &req, Response &res,
-                    std::string &content_type, std::string &boundary);
-  bool write_response(Stream &strm, bool close_connection, const Request &req,
+                    std::string &content_type, std::string &boundary) const;
+  bool write_response(Stream &strm, bool close_connection, Request &req,
                       Response &res);
   bool write_response_with_content(Stream &strm, bool close_connection,
                                    const Request &req, Response &res);
@@ -806,21 +980,23 @@ private:
   bool read_content_core(Stream &strm, Request &req, Response &res,
                          ContentReceiver receiver,
                          MultipartContentHeader multipart_header,
-                         ContentReceiver multipart_receiver);
+                         ContentReceiver multipart_receiver) const;

   virtual bool process_and_close_socket(socket_t sock);

+  std::atomic is_running_{false};
+  std::atomic done_{false};
+
   struct MountPointEntry {
     std::string mount_point;
     std::string base_dir;
     Headers headers;
   };
   std::vector base_dirs_;
-
-  std::atomic is_running_{false};
-  std::atomic done_{false};
   std::map file_extension_and_mimetype_map_;
+  std::string default_file_mimetype_ = "application/octet-stream";
   Handler file_request_handler_;
+
   Handlers get_handlers_;
   Handlers post_handlers_;
   HandlersForContentReader post_handlers_for_content_reader_;
@@ -831,18 +1007,22 @@ private:
   Handlers delete_handlers_;
   HandlersForContentReader delete_handlers_for_content_reader_;
   Handlers options_handlers_;
+
   HandlerWithResponse error_handler_;
   ExceptionHandler exception_handler_;
   HandlerWithResponse pre_routing_handler_;
   Handler post_routing_handler_;
-  Logger logger_;
   Expect100ContinueHandler expect_100_continue_handler_;
+  Logger logger_;
+
   int address_family_ = AF_UNSPEC;
   bool tcp_nodelay_ = CPPHTTPLIB_TCP_NODELAY;
   SocketOptions socket_options_ = default_socket_options;

   Headers default_headers_;
+  std::function header_writer_ =
+      detail::write_headers;
 };

 enum class Error {
@@ -860,17 +1040,19 @@
   UnsupportedMultipartBoundaryChars,
   Compression,
   ConnectionTimeout,
+  ProxyConnection,

   // For internal use only
   SSLPeerCouldBeClosed_,
 };

-std::string to_string(const Error error);
+std::string to_string(Error error);

 std::ostream &operator<<(std::ostream &os, const Error &obj);

 class Result {
 public:
+  Result() = default;
   Result(std::unique_ptr &&res, Error err,
          Headers &&request_headers = Headers{})
       : res_(std::move(res)), err_(err),
         request_headers_(std::move(request_headers)) {}
@@ -893,13 +1075,13 @@ public:
   bool has_request_header(const std::string &key) const;
   std::string get_request_header_value(const std::string &key,
                                        size_t id = 0) const;
-  template
-  T get_request_header_value(const std::string &key, size_t id = 0) const;
+  uint64_t get_request_header_value_u64(const std::string &key,
+                                        size_t id = 0) const;
   size_t get_request_header_value_count(const std::string &key) const;

 private:
   std::unique_ptr res_;
-  Error err_;
+  Error err_ = Error::Unknown;
   Headers request_headers_;
 };
@@ -1059,16 +1241,21 @@ public:
   bool send(Request &req, Response &res, Error &error);
   Result send(const Request &req);

-  size_t is_socket_open() const;
-
-  socket_t socket() const;
-
   void stop();

+  std::string host() const;
+  int port() const;
+
+  size_t is_socket_open() const;
+  socket_t socket() const;
+
   void set_hostname_addr_map(std::map addr_map);

   void set_default_headers(Headers headers);

+  void
+  set_header_writer(std::function const &writer);
+
   void set_address_family(int family);
   void set_tcp_nodelay(bool on);
   void set_socket_options(SocketOptions socket_options);
@@ -1117,6 +1304,7 @@ public:
   void set_ca_cert_path(const std::string &ca_cert_file_path,
                         const std::string &ca_cert_dir_path = std::string());
   void set_ca_cert_store(X509_STORE *ca_cert_store);
+  X509_STORE *create_ca_cert_store(const char *ca_cert, std::size_t size) const;
 #endif

 #ifdef CPPHTTPLIB_OPENSSL_SUPPORT
@@ -1145,14 +1333,14 @@ protected:
   // Also, shutdown_ssl and close_socket should also NOT be called concurrently
   // with a DIFFERENT thread sending requests using that socket.
   virtual void shutdown_ssl(Socket &socket, bool shutdown_gracefully);
-  void shutdown_socket(Socket &socket);
+  void shutdown_socket(Socket &socket) const;
   void close_socket(Socket &socket);

   bool process_request(Stream &strm, Request &req, Response &res,
                        bool close_connection, Error &error);

   bool write_content_with_provider(Stream &strm, const Request &req,
-                                   Error &error);
+                                   Error &error) const;

   void copy_settings(const ClientImpl &rhs);
@@ -1177,6 +1365,10 @@ protected:
   // Default headers
   Headers default_headers_;

+  // Header writer
+  std::function header_writer_ =
+      detail::write_headers;
+
   // Settings
   std::string client_cert_path_;
   std::string client_key_path_;
@@ -1239,7 +1431,8 @@ private:
   Result send_(Request &&req);

   socket_t create_client_socket(Error &error) const;
-  bool read_response_line(Stream &strm, const Request &req, Response &res);
+  bool read_response_line(Stream &strm, const Request &req,
+                          Response &res) const;
   bool write_request(Stream &strm, Request &req, bool close_connection,
                      Error &error);
   bool redirect(Request &req, Response &res, Error &error);
@@ -1258,7 +1451,7 @@ private:
                                       const std::string &content_type);
   ContentProviderWithoutLength get_multipart_content_provider(
       const std::string &boundary, const MultipartFormDataItems &items,
-      const MultipartFormDataProviderItems &provider_items);
+      const MultipartFormDataProviderItems &provider_items) const;

   std::string adjust_host_string(const std::string &host) const;
@@ -1431,16 +1624,21 @@ public:
   bool send(Request &req, Response &res, Error &error);
   Result send(const Request &req);

-  size_t is_socket_open() const;
-
-  socket_t socket() const;
-
   void stop();

+  std::string host() const;
+  int port() const;
+
+  size_t is_socket_open() const;
+  socket_t socket() const;
+
   void set_hostname_addr_map(std::map addr_map);

   void set_default_headers(Headers headers);

+  void
+  set_header_writer(std::function const &writer);
+
   void set_address_family(int family);
   void set_tcp_nodelay(bool on);
   void set_socket_options(SocketOptions socket_options);
@@ -1497,6 +1695,7 @@ public:
                         const std::string &ca_cert_dir_path = std::string());

   void set_ca_cert_store(X509_STORE *ca_cert_store);
+  void load_ca_cert_store(const char *ca_cert, std::size_t size);

   long get_openssl_verify_result() const;
@@ -1538,7 +1737,7 @@ private:
   std::mutex ctx_mutex_;
 };

-class SSLClient : public ClientImpl {
+class SSLClient final : public ClientImpl {
 public:
   explicit SSLClient(const std::string &host);

@@ -1546,16 +1745,19 @@ public:
   explicit SSLClient(const std::string &host, int port,
                      const std::string &client_cert_path,
-                     const std::string &client_key_path);
+                     const std::string &client_key_path,
+                     const std::string &private_key_password = std::string());

   explicit SSLClient(const std::string &host, int port, X509 *client_cert,
-                     EVP_PKEY *client_key);
+                     EVP_PKEY *client_key,
+                     const std::string &private_key_password = std::string());

   ~SSLClient() override;

   bool is_valid() const override;

   void set_ca_cert_store(X509_STORE *ca_cert_store);
+  void load_ca_cert_store(const char *ca_cert, std::size_t size);

   long get_openssl_verify_result() const;
@@ -1564,7 +1766,7 @@ public:
 private:
   bool create_and_connect_socket(Socket &socket, Error &error) override;
   void shutdown_ssl(Socket &socket, bool shutdown_gracefully) override;
-  void shutdown_ssl_impl(Socket &socket, bool shutdown_socket);
+  void shutdown_ssl_impl(Socket &socket, bool shutdown_gracefully);

   bool process_socket(const Socket &socket,
                       std::function callback) override;
@@ -1608,15 +1810,9 @@ inline void duration_to_sec_and_usec(const T &duration, U callback) {
   callback(static_cast(sec), static_cast(usec));
 }

-template
-inline T get_header_value(const Headers & /*headers*/,
-                          const std::string & /*key*/, size_t /*id*/ = 0,
-                          uint64_t /*def*/ = 0) {}
-
-template <>
-inline uint64_t get_header_value(const Headers &headers,
-                                 const std::string &key, size_t id,
-                                 uint64_t def) {
+inline uint64_t get_header_value_u64(const Headers &headers,
+                                     const std::string &key, size_t id,
+                                     uint64_t def) {
   auto rng = headers.equal_range(key);
   auto it = rng.first;
   std::advance(it, static_cast(id));
@@ -1628,14 +1824,14 @@

 } // namespace detail

-template
-inline T Request::get_header_value(const std::string &key, size_t id) const {
-  return detail::get_header_value(headers, key, id, 0);
+inline uint64_t Request::get_header_value_u64(const std::string &key,
+                                              size_t id) const {
+  return detail::get_header_value_u64(headers, key, id, 0);
 }

-template
-inline T Response::get_header_value(const std::string &key, size_t id) const {
-  return detail::get_header_value(headers, key, id, 0);
+inline uint64_t Response::get_header_value_u64(const std::string &key,
+                                               size_t id) const {
+  return detail::get_header_value_u64(headers, key, id, 0);
 }

 template
@@ -1665,21 +1861,106 @@ inline ssize_t Stream::write_format(const char *fmt, const Args &...args) {
 inline void default_socket_options(socket_t sock) {
   int yes = 1;
 #ifdef _WIN32
-  setsockopt(sock, SOL_SOCKET, SO_REUSEADDR, reinterpret_cast(&yes),
-             sizeof(yes));
+  setsockopt(sock, SOL_SOCKET, SO_REUSEADDR,
+             reinterpret_cast(&yes), sizeof(yes));
   setsockopt(sock, SOL_SOCKET, SO_EXCLUSIVEADDRUSE,
-             reinterpret_cast(&yes), sizeof(yes));
+             reinterpret_cast(&yes), sizeof(yes));
 #else
 #ifdef SO_REUSEPORT
-  setsockopt(sock, SOL_SOCKET, SO_REUSEPORT, reinterpret_cast(&yes),
-             sizeof(yes));
+  setsockopt(sock, SOL_SOCKET, SO_REUSEPORT,
+             reinterpret_cast(&yes), sizeof(yes));
 #else
-  setsockopt(sock, SOL_SOCKET, SO_REUSEADDR, reinterpret_cast(&yes),
-             sizeof(yes));
+  setsockopt(sock, SOL_SOCKET, SO_REUSEADDR,
+             reinterpret_cast(&yes), sizeof(yes));
 #endif
 #endif
 }

+inline const char *status_message(int status) {
+  switch (status) {
+  case StatusCode::Continue_100: return "Continue";
+  case StatusCode::SwitchingProtocol_101: return "Switching Protocol";
+  case StatusCode::Processing_102: return "Processing";
+  case StatusCode::EarlyHints_103: return "Early Hints";
+  case StatusCode::OK_200: return "OK";
+  case StatusCode::Created_201: return "Created";
+  case StatusCode::Accepted_202: return "Accepted";
+  case StatusCode::NonAuthoritativeInformation_203:
+    return "Non-Authoritative Information";
+  case StatusCode::NoContent_204: return "No Content";
+  case StatusCode::ResetContent_205: return "Reset Content";
+  case StatusCode::PartialContent_206: return "Partial Content";
+  case StatusCode::MultiStatus_207: return "Multi-Status";
+  case StatusCode::AlreadyReported_208: return "Already Reported";
+  case StatusCode::IMUsed_226: return "IM Used";
+  case StatusCode::MultipleChoices_300: return "Multiple Choices";
+  case StatusCode::MovedPermanently_301: return "Moved Permanently";
+  case StatusCode::Found_302: return "Found";
+  case StatusCode::SeeOther_303: return "See Other";
+  case StatusCode::NotModified_304: return "Not Modified";
+  case StatusCode::UseProxy_305: return "Use Proxy";
+  case StatusCode::unused_306: return "unused";
+  case StatusCode::TemporaryRedirect_307: return "Temporary Redirect";
+  case StatusCode::PermanentRedirect_308: return "Permanent Redirect";
+  case StatusCode::BadRequest_400: return "Bad Request";
+  case StatusCode::Unauthorized_401: return "Unauthorized";
+  case StatusCode::PaymentRequired_402: return "Payment Required";
+  case StatusCode::Forbidden_403: return "Forbidden";
+  case StatusCode::NotFound_404: return "Not Found";
+  case StatusCode::MethodNotAllowed_405: return "Method Not Allowed";
+  case StatusCode::NotAcceptable_406: return "Not Acceptable";
+  case StatusCode::ProxyAuthenticationRequired_407:
+    return "Proxy Authentication Required";
+  case StatusCode::RequestTimeout_408: return "Request Timeout";
+  case StatusCode::Conflict_409: return "Conflict";
+  case StatusCode::Gone_410: return "Gone";
+  case StatusCode::LengthRequired_411: return "Length Required";
+  case StatusCode::PreconditionFailed_412: return "Precondition Failed";
+  case StatusCode::PayloadTooLarge_413: return "Payload Too Large";
+  case StatusCode::UriTooLong_414: return "URI Too Long";
+  case StatusCode::UnsupportedMediaType_415: return "Unsupported Media Type";
+  case StatusCode::RangeNotSatisfiable_416: return "Range Not Satisfiable";
+  case StatusCode::ExpectationFailed_417: return "Expectation Failed";
+  case StatusCode::ImATeapot_418: return "I'm a teapot";
+  case StatusCode::MisdirectedRequest_421: return "Misdirected Request";
+  case StatusCode::UnprocessableContent_422: return "Unprocessable Content";
+  case StatusCode::Locked_423: return "Locked";
+  case StatusCode::FailedDependency_424: return "Failed Dependency";
+  case StatusCode::TooEarly_425: return "Too Early";
+  case StatusCode::UpgradeRequired_426: return "Upgrade Required";
+  case StatusCode::PreconditionRequired_428: return "Precondition Required";
+  case StatusCode::TooManyRequests_429: return "Too Many Requests";
+  case StatusCode::RequestHeaderFieldsTooLarge_431:
+    return "Request Header Fields Too Large";
+  case StatusCode::UnavailableForLegalReasons_451:
+    return "Unavailable For Legal Reasons";
+  case StatusCode::NotImplemented_501: return "Not Implemented";
+  case StatusCode::BadGateway_502: return "Bad Gateway";
+  case StatusCode::ServiceUnavailable_503: return "Service Unavailable";
+  case StatusCode::GatewayTimeout_504: return "Gateway Timeout";
+  case StatusCode::HttpVersionNotSupported_505:
+    return "HTTP Version Not Supported";
+  case StatusCode::VariantAlsoNegotiates_506: return "Variant Also Negotiates";
+  case StatusCode::InsufficientStorage_507: return "Insufficient Storage";
+  case StatusCode::LoopDetected_508: return "Loop Detected";
+  case StatusCode::NotExtended_510: return "Not Extended";
+  case StatusCode::NetworkAuthenticationRequired_511:
+    return "Network Authentication Required";
+
+  default:
+  case StatusCode::InternalServerError_500: return "Internal Server Error";
+  }
+}
+
+inline std::string get_bearer_token_auth(const Request &req) {
+  if (req.has_header("Authorization")) {
+    static std::string BearerHeaderPrefix = "Bearer ";
+    return req.get_header_value("Authorization")
+        .substr(BearerHeaderPrefix.length());
+  }
+  return "";
+}
+
 template
 inline Server &
 Server::set_read_timeout(const std::chrono::duration &duration) {
@@ -1720,6 +2001,7 @@ inline std::string to_string(const Error error) {
     return "Unsupported HTTP multipart boundary characters";
   case Error::Compression: return "Compression failed";
   case Error::ConnectionTimeout: return "Connection timed out";
+  case Error::ProxyConnection: return "Proxy connection failed";
   case Error::Unknown: return "Unknown";
   default: break;
   }
@@ -1733,10 +2015,9 @@ inline std::ostream &operator<<(std::ostream &os, const Error &obj) {
   return os;
 }

-template
-inline T Result::get_request_header_value(const std::string &key,
-                                          size_t id) const {
-  return detail::get_header_value(request_headers_, key, id, 0);
+inline uint64_t Result::get_request_header_value_u64(const std::string &key,
+                                                     size_t id) const {
+  return detail::get_header_value_u64(request_headers_, key, id, 0);
 }

 template
@@ -1790,7 +2071,7 @@
 void hosted_at(const std::string &hostname, std::vector &addrs);

 std::string append_query_params(const std::string &path, const Params &params);

-std::pair make_range_header(Ranges ranges);
+std::pair make_range_header(const Ranges &ranges);

 std::pair
 make_basic_authentication_header(const std::string &username,
@@ -1810,6 +2091,9 @@
 std::string trim_copy(const std::string &s);

 void split(const char *b, const char *e, char d,
            std::function fn);

+void split(const char *b, const char *e, char d, size_t m,
+           std::function fn);
+
 bool process_client_socket(socket_t sock, time_t read_timeout_sec,
                            time_t read_timeout_usec, time_t write_timeout_sec,
                            time_t write_timeout_usec,
@@ -1844,7 +2128,7 @@
 enum class EncodingType { None = 0, Gzip, Brotli };

 EncodingType encoding_type(const Request &req, const Response &res);

-class BufferStream : public Stream {
+class BufferStream final : public Stream {
 public:
   BufferStream() = default;
   ~BufferStream() override = default;
@@ -1884,19 +2168,19 @@ public:
                         Callback callback) = 0;
 };

-class nocompressor : public compressor {
+class nocompressor final : public compressor {
 public:
-  virtual ~nocompressor() = default;
+  ~nocompressor() override = default;

   bool compress(const char *data, size_t data_length, bool /*last*/,
                 Callback callback) override;
 };

 #ifdef CPPHTTPLIB_ZLIB_SUPPORT
-class gzip_compressor : public compressor {
+class gzip_compressor final : public compressor {
 public:
   gzip_compressor();
-  ~gzip_compressor();
+  ~gzip_compressor() override;

   bool compress(const char *data, size_t data_length, bool last,
                 Callback callback) override;
@@ -1906,10 +2190,10 @@ private:
   z_stream strm_;
 };

-class gzip_decompressor : public decompressor {
+class gzip_decompressor final : public decompressor {
 public:
   gzip_decompressor();
-  ~gzip_decompressor();
+  ~gzip_decompressor() override;

   bool is_valid() const override;

@@ -1923,7 +2207,7 @@ private:
 #endif

 #ifdef CPPHTTPLIB_BROTLI_SUPPORT
-class brotli_compressor : public compressor {
+class brotli_compressor final : public compressor {
 public:
   brotli_compressor();
   ~brotli_compressor();
@@ -1935,7 +2219,7 @@ private:
   BrotliEncoderState *state_ = nullptr;
 };

-class brotli_decompressor : public decompressor {
+class brotli_decompressor final : public decompressor {
 public:
   brotli_decompressor();
   ~brotli_decompressor();
@@ -1972,6 +2256,29 @@ private:
   std::string glowable_buffer_;
 };

+class mmap {
+public:
+  mmap(const char *path);
+  ~mmap();
+
+  bool open(const char *path);
+  void close();
+
+  bool is_open() const;
+  size_t size() const;
+  const char *data() const;
+
+private:
+#if defined(_WIN32)
+  HANDLE hFile_;
+  HANDLE hMapping_;
+#else
+  int fd_;
+#endif
+  size_t size_;
+  void *addr_;
+};
+
 } // namespace detail

 // ----------------------------------------------------------------------------
@@ -2003,7 +2310,7 @@ inline bool from_hex_to_i(const std::string &s, size_t i, size_t cnt,
   val = 0;
   for (; cnt; i++, cnt--) {
     if (!s[i]) { return false; }
-    int v = 0;
+    auto v = 0;
     if (is_hex(s[i], v)) {
       val = val * 16 + v;
     } else {
@@ -2014,7 +2321,7 @@
 }

 inline std::string from_i_to_hex(size_t n) {
-  const char *charset = "0123456789abcdef";
+  static const auto charset = "0123456789abcdef";
   std::string ret;
   do {
     ret = charset[n & 15] + ret;
@@ -2025,7 +2332,7 @@
 inline size_t to_utf8(int code, char *buff) {
   if (code < 0x0080) {
-    buff[0] = (code & 0x7F);
+    buff[0] = static_cast(code & 0x7F);
     return 1;
   } else if (code < 0x0800) {
     buff[0] = static_cast(0xC0 | ((code >> 6) & 0x1F));
@@ -2064,8 +2371,8 @@ inline std::string base64_encode(const std::string &in) {
   std::string out;
   out.reserve(in.size());

-  int val = 0;
-  int valb = -6;
+  auto val = 0;
+  auto valb = -6;

   for (auto c : in) {
     val = (val << 8) + static_cast(c);
@@ -2112,6 +2419,11 @@ inline bool is_valid_path(const std::string &path) {
     // Read component
     auto beg = i;
     while (i < path.size() && path[i] != '/') {
+      if (path[i] == '\0') {
+        return false;
+      } else if (path[i] == '\\') {
+        return false;
+      }
       i++;
     }
@@ -2196,7 +2508,7 @@ inline std::string decode_url(const std::string &s,
   for (size_t i = 0; i < s.size(); i++) {
     if (s[i] == '%' && i + 1 < s.size()) {
       if (s[i + 1] == 'u') {
-        int val = 0;
+        auto val = 0;
         if (from_hex_to_i(s, i + 2, 4, val)) {
           // 4 digits Unicode codes
           char buff[4];
@@ -2207,7 +2519,7 @@
           result += s[i];
         }
       } else {
-        int val = 0;
+        auto val = 0;
         if (from_hex_to_i(s, i + 1, 2, val)) {
           // 2 digits hex codes
           result += static_cast(val);
@@ -2260,16 +2572,30 @@ inline std::string trim_copy(const std::string &s) {
   return s.substr(r.first, r.second - r.first);
 }

+inline std::string trim_double_quotes_copy(const std::string &s) {
+  if (s.length() >= 2 && s.front() == '"' && s.back() == '"') {
+    return s.substr(1, s.size() - 2);
+  }
+  return s;
+}
+
 inline void split(const char *b, const char *e, char d,
                   std::function fn) {
+  return split(b, e, d, (std::numeric_limits::max)(), std::move(fn));
+}
+
+inline void split(const char *b, const char *e, char d, size_t m,
+                  std::function fn) {
   size_t i = 0;
   size_t beg = 0;
+  size_t count = 1;
   while (e ? (b + i < e) : (b[i] != '\0')) {
-    if (b[i] == d) {
+    if (b[i] == d && count < m) {
       auto r = trim(b, e, beg, i);
       if (r.first < r.second) { fn(&b[r.first], &b[r.second]); }
       beg = i + 1;
+      count++;
     }
     i++;
   }
@@ -2345,6 +2671,105 @@ inline void stream_line_reader::append(char c) {
   }
 }

+inline mmap::mmap(const char *path)
+#if defined(_WIN32)
+    : hFile_(NULL), hMapping_(NULL)
+#else
+    : fd_(-1)
+#endif
+      ,
+      size_(0), addr_(nullptr) {
+  open(path);
+}
+
+inline mmap::~mmap() { close(); }
+
+inline bool mmap::open(const char *path) {
+  close();
+
+#if defined(_WIN32)
+  std::wstring wpath;
+  for (size_t i = 0; i < strlen(path); i++) {
+    wpath += path[i];
+  }
+
+  hFile_ = ::CreateFile2(wpath.c_str(), GENERIC_READ, FILE_SHARE_READ,
+                         OPEN_EXISTING, NULL);
+
+  if (hFile_ == INVALID_HANDLE_VALUE) { return false; }
+
+  LARGE_INTEGER size{};
+  if (!::GetFileSizeEx(hFile_, &size)) { return false; }
+  size_ = static_cast(size.QuadPart);
+
+  hMapping_ =
+      ::CreateFileMappingFromApp(hFile_, NULL, PAGE_READONLY, size_, NULL);
+
+  if (hMapping_ == NULL) {
+    close();
+    return false;
+  }
+
+  addr_ = ::MapViewOfFileFromApp(hMapping_, FILE_MAP_READ, 0, 0);
+#else
+  fd_ = ::open(path, O_RDONLY);
+  if (fd_ == -1) { return false; }
+
+  struct stat sb;
+  if (fstat(fd_, &sb) == -1) {
+    close();
+    return false;
+  }
+  size_ = static_cast(sb.st_size);
+
+  addr_ = ::mmap(NULL, size_, PROT_READ, MAP_PRIVATE, fd_, 0);
+#endif
+
+  if (addr_ == nullptr) {
+    close();
+    return false;
+  }
+
+  return true;
+}
+
+inline bool mmap::is_open() const { return addr_ != nullptr; }
+
+inline size_t mmap::size() const { return size_; }
+
+inline const char *mmap::data() const {
+  return static_cast(addr_);
+}
+
+inline void mmap::close() {
+#if defined(_WIN32)
+  if (addr_) {
+    ::UnmapViewOfFile(addr_);
+    addr_ = nullptr;
+  }
+
+  if (hMapping_) {
+    ::CloseHandle(hMapping_);
+    hMapping_ = NULL;
+  }
+
+  if (hFile_ != INVALID_HANDLE_VALUE) {
+    ::CloseHandle(hFile_);
+    hFile_ = INVALID_HANDLE_VALUE;
+  }
+#else
+  if (addr_ != nullptr) {
+    munmap(addr_, size_);
+    addr_ = nullptr;
+  }
+
+  if (fd_ != -1) {
+    ::close(fd_);
+    fd_ = -1;
+  }
+#endif
+  size_ = 0;
+}
 inline int close_socket(socket_t sock) {
 #ifdef _WIN32
   return closesocket(sock);
@@ -2354,7 +2779,7 @@
 }

 template inline ssize_t handle_EINTR(T fn) {
-  ssize_t res = false;
+  ssize_t res = 0;
   while (true) {
     res = fn();
     if (res < 0 && errno == EINTR) { continue; }
@@ -2399,7 +2824,7 @@ inline ssize_t select_read(socket_t sock, time_t sec, time_t usec) {
   return handle_EINTR([&]() { return poll(&pfd_read, 1, timeout); });
 #else
 #ifndef _WIN32
-  if (sock >= FD_SETSIZE) { return 1; }
+  if (sock >= FD_SETSIZE) { return -1; }
 #endif

   fd_set fds;
@@ -2427,7 +2852,7 @@ inline ssize_t select_write(socket_t sock, time_t sec, time_t usec) {
   return handle_EINTR([&]() { return poll(&pfd_read, 1, timeout); });
 #else
 #ifndef _WIN32
-  if (sock >= FD_SETSIZE) { return 1; }
+  if (sock >= FD_SETSIZE) { return -1; }
 #endif

   fd_set fds;
@@ -2458,7 +2883,7 @@ inline Error wait_until_socket_is_ready(socket_t sock, time_t sec,
   if (poll_res == 0) { return Error::ConnectionTimeout; }

   if (poll_res > 0 && pfd_read.revents & (POLLIN | POLLOUT)) {
-    int error = 0;
+    auto error = 0;
     socklen_t len = sizeof(error);
     auto res = getsockopt(sock, SOL_SOCKET, SO_ERROR,
                           reinterpret_cast(&error), &len);
@@ -2490,7 +2915,7 @@ inline Error wait_until_socket_is_ready(socket_t sock, time_t sec,
   if (ret == 0) { return Error::ConnectionTimeout; }

   if (ret > 0 && (FD_ISSET(sock, &fdsr) || FD_ISSET(sock, &fdsw))) {
-    int error = 0;
+    auto error = 0;
     socklen_t len = sizeof(error);
     auto res = getsockopt(sock, SOL_SOCKET, SO_ERROR,
                           reinterpret_cast(&error), &len);
@@ -2512,7 +2937,7 @@ inline bool is_socket_alive(socket_t sock) {
   return detail::read_socket(sock, &buf[0], sizeof(buf), MSG_PEEK) > 0;
 }

-class SocketStream : public Stream {
+class SocketStream final : public Stream {
 public:
   SocketStream(socket_t sock, time_t read_timeout_sec, time_t read_timeout_usec,
                time_t write_timeout_sec, time_t write_timeout_usec);
@@ -2537,11 +2962,11 @@ private:
   size_t read_buff_off_ = 0;
   size_t read_buff_content_size_ = 0;

-  static const size_t read_buff_size_ = 1024 * 4;
+  static const size_t read_buff_size_ = 1024l * 4;
 };

 #ifdef CPPHTTPLIB_OPENSSL_SUPPORT
-class SSLSocketStream : public Stream {
+class SSLSocketStream final : public Stream {
 public:
   SSLSocketStream(socket_t sock, SSL *ssl, time_t read_timeout_sec,
                   time_t read_timeout_usec, time_t write_timeout_sec,
@@ -2666,7 +3091,7 @@ socket_t create_socket(const std::string &host, const std::string &ip, int port,
 #ifndef _WIN32
   if (hints.ai_family == AF_UNIX) {
     const auto addrlen = host.length();
-    if (addrlen > sizeof(sockaddr_un::sun_path)) return INVALID_SOCKET;
+    if (addrlen > sizeof(sockaddr_un::sun_path)) { return INVALID_SOCKET; }

     auto sock = socket(hints.ai_family, hints.ai_socktype, hints.ai_protocol);
     if (sock != INVALID_SOCKET) {
@@ -2735,17 +3160,27 @@ socket_t create_socket(const std::string &host, const std::string &ip, int port,
 #endif

     if (tcp_nodelay) {
-      int yes = 1;
-      setsockopt(sock, IPPROTO_TCP, TCP_NODELAY, reinterpret_cast(&yes),
-                 sizeof(yes));
+      auto yes = 1;
+#ifdef _WIN32
+      setsockopt(sock, IPPROTO_TCP, TCP_NODELAY,
+                 reinterpret_cast(&yes), sizeof(yes));
+#else
+      setsockopt(sock, IPPROTO_TCP, TCP_NODELAY,
+                 reinterpret_cast(&yes), sizeof(yes));
+#endif
     }

     if (socket_options) { socket_options(sock); }

     if (rp->ai_family == AF_INET6) {
-      int no = 0;
-      setsockopt(sock, IPPROTO_IPV6, IPV6_V6ONLY, reinterpret_cast(&no),
-                 sizeof(no));
+      auto no = 0;
+#ifdef _WIN32
+      setsockopt(sock, IPPROTO_IPV6, IPV6_V6ONLY,
+                 reinterpret_cast(&no), sizeof(no));
+#else
+      setsockopt(sock, IPPROTO_IPV6, IPV6_V6ONLY,
+                 reinterpret_cast(&no), sizeof(no));
+#endif
     }

     // bind or connect
@@ -2804,7 +3239,7 @@ inline bool bind_ip_address(socket_t sock, const std::string &host) {
   return ret;
 }

-#if !defined _WIN32 && !defined ANDROID && !defined _AIX
+#if !defined _WIN32 && !defined ANDROID && !defined _AIX && !defined __MVS__
 #define USE_IF2IP
 #endif
@@ -2860,7 +3295,7 @@ inline socket_t create_client_socket(
 #ifdef USE_IF2IP
           auto ip_from_if = if2ip(address_family, intf);
           if (ip_from_if.empty()) { ip_from_if = intf; }
-          if (!bind_ip_address(sock2, ip_from_if.c_str())) {
+          if (!bind_ip_address(sock2, ip_from_if)) {
             error = Error::BindIPAddress;
             return false;
           }
@@ -2888,13 +3323,14 @@ inline socket_t create_client_socket(
 #ifdef _WIN32
           auto timeout = static_cast(read_timeout_sec * 1000 +
                                       read_timeout_usec / 1000);
-          setsockopt(sock2, SOL_SOCKET, SO_RCVTIMEO, (char *)&timeout,
-                     sizeof(timeout));
+          setsockopt(sock2, SOL_SOCKET, SO_RCVTIMEO,
+                     reinterpret_cast(&timeout), sizeof(timeout));
 #else
           timeval tv;
           tv.tv_sec = static_cast(read_timeout_sec);
           tv.tv_usec = static_cast(read_timeout_usec);
-          setsockopt(sock2, SOL_SOCKET, SO_RCVTIMEO, (char *)&tv, sizeof(tv));
+          setsockopt(sock2, SOL_SOCKET, SO_RCVTIMEO,
+                     reinterpret_cast(&tv), sizeof(tv));
 #endif
         }
         {
 #ifdef _WIN32
           auto timeout = static_cast(write_timeout_sec * 1000 +
                                       write_timeout_usec / 1000);
-          setsockopt(sock2, SOL_SOCKET, SO_SNDTIMEO, (char *)&timeout,
-                     sizeof(timeout));
+          setsockopt(sock2, SOL_SOCKET, SO_SNDTIMEO,
+                     reinterpret_cast(&timeout), sizeof(timeout));
 #else
           timeval tv;
           tv.tv_sec = static_cast(write_timeout_sec);
           tv.tv_usec = static_cast(write_timeout_usec);
-          setsockopt(sock2, SOL_SOCKET, SO_SNDTIMEO, (char *)&tv, sizeof(tv));
+          setsockopt(sock2, SOL_SOCKET, SO_SNDTIMEO,
+                     reinterpret_cast(&tv), sizeof(tv));
 #endif
         }
@@ -3008,18 +3445,20 @@ inline constexpr unsigned int operator"" _t(const char *s, size_t l) {

 } // namespace udl

-inline const char *
+inline std::string
 find_content_type(const std::string &path,
-                  const std::map &user_data) {
+                  const std::map &user_data,
+                  const std::string &default_content_type) {
   auto ext = file_extension(path);

   auto it = user_data.find(ext);
-  if (it != user_data.end()) { return it->second.c_str(); }
+  if (it != user_data.end()) { return it->second; }

   using udl::operator""_t;

   switch (str2tag(ext)) {
-  default: return nullptr;
+  default: return default_content_type;
+
   case "css"_t: return "text/css";
   case "csv"_t: return "text/csv";
   case "htm"_t:
@@ -3072,76 +3511,6 @@
   }
 }

-inline const char *status_message(int status) {
-  switch (status) {
-  case 100: return "Continue";
-  case 101: return "Switching Protocol";
-  case 102: return "Processing";
-  case 103: return "Early Hints";
-  case 200: return "OK";
-  case 201: return "Created";
-  case 202: return "Accepted";
-  case 203: return "Non-Authoritative Information";
-  case 204: return "No Content";
-  case 205: return "Reset Content";
-  case 206: return "Partial Content";
-  case 207: return "Multi-Status";
-  case 208: return "Already Reported";
-  case 226: return "IM Used";
-  case 300: return "Multiple Choice";
-  case 301: return "Moved Permanently";
-  case 302: return "Found";
-  case 303: return "See Other";
-  case 304: return "Not Modified";
-  case 305: return "Use Proxy";
-  case 306: return "unused";
-  case 307: return "Temporary Redirect";
-  case 308: return "Permanent Redirect";
-  case 400: return "Bad Request";
-  case 401: return "Unauthorized";
-  case 402: return "Payment Required";
-  case 403: return "Forbidden";
-  case 404: return "Not Found";
-  case 405: return "Method Not Allowed";
-  case 406: return "Not Acceptable";
-  case 407: return "Proxy Authentication Required";
-  case 408: return "Request Timeout";
-  case 409: return "Conflict";
-  case 410: return "Gone";
-  case 411: return "Length Required";
-  case 412: return "Precondition Failed";
-  case 413: return "Payload Too Large";
-  case 414: return "URI Too Long";
-  case 415: return "Unsupported Media Type";
-  case 416: return "Range Not Satisfiable";
-  case 417: return "Expectation Failed";
-  case 418: return "I'm a teapot";
-  case 421: return "Misdirected Request";
-  case 422: return "Unprocessable Entity";
-  case 423: return "Locked";
-  case 424: return "Failed Dependency";
-  case 425: return "Too Early";
-  case 426: return "Upgrade Required";
-  case 428: return "Precondition Required";
-  case 429: return "Too Many Requests";
-  case 431: return "Request Header Fields Too Large";
-  case 451: return "Unavailable For Legal Reasons";
-  case 501: return "Not Implemented";
-  case 502: return "Bad Gateway";
-  case 503: return "Service Unavailable";
-  case 504: return "Gateway Timeout";
-  case 505: return "HTTP Version Not Supported";
-  case 506: return "Variant Also Negotiates";
-  case 507: return "Insufficient Storage";
-  case 508: return "Loop Detected";
-  case 510: return "Not Extended";
-  case 511: return "Network Authentication Required";
-
-  default:
-  case 500: return "Internal Server Error";
-  }
-}
-
 inline bool can_compress_content_type(const std::string &content_type) {
   using udl::operator""_t;
@@ -3218,7 +3587,7 @@ inline bool gzip_compressor::compress(const char *data, size_t data_length,
     data += strm_.avail_in;

     auto flush = (last && data_length == 0) ? Z_FINISH : Z_NO_FLUSH;
-    int ret = Z_OK;
+    auto ret = Z_OK;

     std::array buff{};
     do {
@@ -3262,7 +3631,7 @@ inline bool gzip_decompressor::decompress(const char *data, size_t data_length,
                                           Callback callback) {
   assert(is_valid_);

-  int ret = Z_OK;
+  auto ret = Z_OK;

   do {
     constexpr size_t max_avail_in =
@@ -3276,16 +3645,12 @@ inline bool gzip_decompressor::decompress(const char *data, size_t data_length,
     data += strm_.avail_in;

     std::array buff{};
-    while (strm_.avail_in > 0) {
+    while (strm_.avail_in > 0 && ret == Z_OK) {
       strm_.avail_out = static_cast(buff.size());
       strm_.next_out = reinterpret_cast(buff.data());

-      auto prev_avail_in = strm_.avail_in;
-
       ret = inflate(&strm_, Z_NO_FLUSH);
-      if (prev_avail_in - strm_.avail_in == 0) { return false; }
-
       assert(ret != Z_STREAM_ERROR);
       switch (ret) {
       case Z_NEED_DICT:
@@ -3298,7 +3663,7 @@ inline bool gzip_decompressor::decompress(const char *data, size_t data_length,
       }
     }

-    if (ret != Z_OK && ret != Z_STREAM_END) return false;
+    if (ret != Z_OK && ret != Z_STREAM_END) { return false; }

   } while (data_length > 0);
@@ -3367,7 +3732,7 @@ inline bool brotli_decompressor::decompress(const char *data,
     return 0;
   }

-  const uint8_t *next_in = (const uint8_t *)data;
+  auto next_in = reinterpret_cast(data);
   size_t avail_in = data_length;
   size_t total_out;
@@ -3437,6 +3802,9 @@ inline bool parse_header(const char *beg, const char *end, T fn) {
   }

   if (p < end) {
+    auto key_len = key_end - beg;
+    if (!key_len) { return false; }
+
     auto key = std::string(beg, key_end);
     auto val = compare_case_ignore(key, "Location") ?
std::string(p, end) @@ -3526,11 +3894,7 @@ inline bool read_content_without_length(Stream &strm, uint64_t r = 0; for (;;) { auto n = strm.read(buf, CPPHTTPLIB_RECV_BUFSIZ); - if (n < 0) { - return false; - } else if (n == 0) { - return true; - } + if (n <= 0) { return true; } if (!out(buf, static_cast(n), r, 0)) { return false; } r += static_cast(n); @@ -3566,7 +3930,7 @@ inline bool read_content_chunked(Stream &strm, T &x, if (!line_reader.getline()) { return false; } - if (strcmp(line_reader.ptr(), "\r\n")) { return false; } + if (strcmp(line_reader.ptr(), "\r\n") != 0) { return false; } if (!line_reader.getline()) { return false; } } @@ -3576,7 +3940,7 @@ inline bool read_content_chunked(Stream &strm, T &x, // Trailer if (!line_reader.getline()) { return false; } - while (strcmp(line_reader.ptr(), "\r\n")) { + while (strcmp(line_reader.ptr(), "\r\n") != 0) { if (line_reader.size() > CPPHTTPLIB_HEADER_MAX_LENGTH) { return false; } // Exclude line terminator @@ -3595,8 +3959,8 @@ inline bool read_content_chunked(Stream &strm, T &x, } inline bool is_chunked_transfer_encoding(const Headers &headers) { - return !strcasecmp(get_header_value(headers, "Transfer-Encoding", 0, ""), - "chunked"); + return compare_case_ignore( + get_header_value(headers, "Transfer-Encoding", 0, ""), "chunked"); } template @@ -3611,14 +3975,14 @@ bool prepare_content_receiver(T &x, int &status, #ifdef CPPHTTPLIB_ZLIB_SUPPORT decompressor = detail::make_unique(); #else - status = 415; + status = StatusCode::UnsupportedMediaType_415; return false; #endif } else if (encoding.find("br") != std::string::npos) { #ifdef CPPHTTPLIB_BROTLI_SUPPORT decompressor = detail::make_unique(); #else - status = 415; + status = StatusCode::UnsupportedMediaType_415; return false; #endif } @@ -3634,7 +3998,7 @@ bool prepare_content_receiver(T &x, int &status, }; return callback(std::move(out)); } else { - status = 500; + status = StatusCode::InternalServerError_500; return false; } } @@ -3662,7 +4026,7 @@ bool read_content(Stream &strm, T &x, size_t payload_max_length, int &status, } else if (!has_header(x.headers, "Content-Length")) { ret = read_content_without_length(strm, out); } else { - auto len = get_header_value(x.headers, "Content-Length"); + auto len = get_header_value_u64(x.headers, "Content-Length", 0, 0); if (len > payload_max_length) { exceed_payload_max_length = true; skip_content_with_length(strm, len); @@ -3672,7 +4036,10 @@ bool read_content(Stream &strm, T &x, size_t payload_max_length, int &status, } } - if (!ret) { status = exceed_payload_max_length ? 413 : 400; } + if (!ret) { + status = exceed_payload_max_length ? 
StatusCode::PayloadTooLarge_413 + : StatusCode::BadRequest_400; + } return ret; }); } // namespace detail @@ -3720,6 +4087,8 @@ inline bool write_content(Stream &strm, const ContentProvider &content_provider, return ok; }; + data_sink.is_writable = [&]() -> bool { return strm.is_writable(); }; + while (offset < end_offset && !is_shutting_down()) { if (!strm.is_writable()) { error = Error::Write; @@ -3764,6 +4133,8 @@ write_content_without_length(Stream &strm, return ok; }; + data_sink.is_writable = [&]() -> bool { return strm.is_writable(); }; + data_sink.done = [&](void) { data_available = false; }; while (data_available && !is_shutting_down()) { @@ -3814,6 +4185,8 @@ write_content_chunked(Stream &strm, const ContentProvider &content_provider, return ok; }; + data_sink.is_writable = [&]() -> bool { return strm.is_writable(); }; + auto done_with_trailer = [&](const Headers *trailer) { if (!ok) { return; } @@ -3898,7 +4271,8 @@ inline bool redirect(T &cli, Request &req, Response &res, new_req.path = path; new_req.redirect_count_ -= 1; - if (res.status == 303 && (req.method != "GET" && req.method != "HEAD")) { + if (res.status == StatusCode::SeeOther_303 && + (req.method != "GET" && req.method != "HEAD")) { new_req.method = "GET"; new_req.body.clear(); new_req.headers.clear(); @@ -3910,7 +4284,8 @@ inline bool redirect(T &cli, Request &req, Response &res, if (ret) { req = new_req; res = new_res; - res.location = location; + + if (res.location.empty()) { res.location = location; } } return ret; } @@ -3957,14 +4332,34 @@ inline bool parse_multipart_boundary(const std::string &content_type, if (pos == std::string::npos) { return false; } auto end = content_type.find(';', pos); auto beg = pos + strlen(boundary_keyword); - boundary = content_type.substr(beg, end - beg); - if (boundary.length() >= 2 && boundary.front() == '"' && - boundary.back() == '"') { - boundary = boundary.substr(1, boundary.size() - 2); - } + boundary = trim_double_quotes_copy(content_type.substr(beg, end - beg)); return !boundary.empty(); } +inline void parse_disposition_params(const std::string &s, Params ¶ms) { + std::set cache; + split(s.data(), s.data() + s.size(), ';', [&](const char *b, const char *e) { + std::string kv(b, e); + if (cache.find(kv) != cache.end()) { return; } + cache.insert(kv); + + std::string key; + std::string val; + split(b, e, '=', [&](const char *b2, const char *e2) { + if (key.empty()) { + key.assign(b2, e2); + } else { + val.assign(b2, e2); + } + }); + + if (!key.empty()) { + params.emplace(trim_double_quotes_copy((key)), + trim_double_quotes_copy((val))); + } + }); +} + #ifdef CPPHTTPLIB_NO_EXCEPTIONS inline bool parse_range_header(const std::string &s, Ranges &ranges) { #else @@ -3975,9 +4370,9 @@ inline bool parse_range_header(const std::string &s, Ranges &ranges) try { if (std::regex_match(s, m, re_first_range)) { auto pos = static_cast(m.position(1)); auto len = static_cast(m.length(1)); - bool all_valid_ranges = true; + auto all_valid_ranges = true; split(&s[pos], &s[pos + len], ',', [&](const char *b, const char *e) { - if (!all_valid_ranges) return; + if (!all_valid_ranges) { return; } static auto re_another_range = std::regex(R"(\s*(\d*)-(\d*))"); std::cmatch cm; if (std::regex_match(b, e, cm, re_another_range)) { @@ -4022,11 +4417,6 @@ public: bool parse(const char *buf, size_t n, const ContentReceiver &content_callback, const MultipartContentHeader &header_callback) { - // TODO: support 'filename*' - static const std::regex re_content_disposition( - 
R"~(^Content-Disposition:\s*form-data;\s*name="(.*?)"(?:;\s*filename="(.*?)")?(?:;\s*filename\*=\S+)?\s*$)~", - std::regex_constants::icase); - buf_append(buf, n); while (buf_size() > 0) { @@ -4059,18 +4449,54 @@ public: break; } - static const std::string header_name = "content-type:"; const auto header = buf_head(pos); - if (start_with_case_ignore(header, header_name)) { - file_.content_type = trim_copy(header.substr(header_name.size())); + + if (!parse_header(header.data(), header.data() + header.size(), + [&](std::string &&, std::string &&) {})) { + is_valid_ = false; + return false; + } + + static const std::string header_content_type = "Content-Type:"; + + if (start_with_case_ignore(header, header_content_type)) { + file_.content_type = + trim_copy(header.substr(header_content_type.size())); } else { + static const std::regex re_content_disposition( + R"~(^Content-Disposition:\s*form-data;\s*(.*)$)~", + std::regex_constants::icase); + std::smatch m; if (std::regex_match(header, m, re_content_disposition)) { - file_.name = m[1]; - file_.filename = m[2]; - } else { - is_valid_ = false; - return false; + Params params; + parse_disposition_params(m[1], params); + + auto it = params.find("name"); + if (it != params.end()) { + file_.name = it->second; + } else { + is_valid_ = false; + return false; + } + + it = params.find("filename"); + if (it != params.end()) { file_.filename = it->second; } + + it = params.find("filename*"); + if (it != params.end()) { + // Only allow UTF-8 enconnding... + static const std::regex re_rfc5987_encoding( + R"~(^UTF-8''(.+?)$)~", std::regex_constants::icase); + + std::smatch m2; + if (std::regex_match(it->second, m2, re_rfc5987_encoding)) { + file_.filename = decode_url(m2[1], false); // override... + } else { + is_valid_ = false; + return false; + } + } } } buf_erase(pos + crlf_.size()); @@ -4108,9 +4534,9 @@ public: buf_erase(crlf_.size()); state_ = 1; } else { - if (dash_crlf_.size() > buf_size()) { return true; } - if (buf_start_with(dash_crlf_)) { - buf_erase(dash_crlf_.size()); + if (dash_.size() > buf_size()) { return true; } + if (buf_start_with(dash_)) { + buf_erase(dash_.size()); is_valid_ = true; buf_erase(buf_size()); // Remove epilogue } else { @@ -4143,7 +4569,6 @@ private: const std::string dash_ = "--"; const std::string crlf_ = "\r\n"; - const std::string dash_crlf_ = "--\r\n"; std::string boundary_; std::string dash_boundary_crlf_; std::string crlf_dash_boundary_; @@ -4230,28 +4655,32 @@ inline std::string to_lower(const char *beg, const char *end) { return out; } -inline std::string make_multipart_data_boundary() { +inline std::string random_string(size_t length) { static const char data[] = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"; // std::random_device might actually be deterministic on some // platforms, but due to lack of support in the c++ standard library, // doing better requires either some ugly hacks or breaking portability. 
- std::random_device seed_gen; + static std::random_device seed_gen; // Request 128 bits of entropy for initialization - std::seed_seq seed_sequence{seed_gen(), seed_gen(), seed_gen(), seed_gen()}; - std::mt19937 engine(seed_sequence); + static std::seed_seq seed_sequence{seed_gen(), seed_gen(), seed_gen(), + seed_gen()}; - std::string result = "--cpp-httplib-multipart-data-"; + static std::mt19937 engine(seed_sequence); - for (auto i = 0; i < 16; i++) { + std::string result; + for (size_t i = 0; i < length; i++) { result += data[engine() % (sizeof(data) - 1)]; } - return result; } +inline std::string make_multipart_data_boundary() { + return "--cpp-httplib-multipart-data-" + detail::random_string(16); +} + inline bool is_multipart_boundary_chars_valid(const std::string &boundary) { auto valid = true; for (size_t i = 0; i < boundary.size(); i++) { @@ -4304,48 +4733,97 @@ serialize_multipart_formdata(const MultipartFormDataItems &items, body += item.content + serialize_multipart_formdata_item_end(); } - if (finish) body += serialize_multipart_formdata_finish(boundary); + if (finish) { body += serialize_multipart_formdata_finish(boundary); } return body; } +inline bool range_error(Request &req, Response &res) { + if (!req.ranges.empty() && 200 <= res.status && res.status < 300) { + ssize_t contant_len = static_cast( + res.content_length_ ? res.content_length_ : res.body.size()); + + ssize_t prev_first_pos = -1; + ssize_t prev_last_pos = -1; + size_t overwrapping_count = 0; + + // NOTE: The following Range check is based on '14.2. Range' in RFC 9110 + // 'HTTP Semantics' to avoid potential denial-of-service attacks. + // https://www.rfc-editor.org/rfc/rfc9110#section-14.2 + + // Too many ranges + if (req.ranges.size() > CPPHTTPLIB_RANGE_MAX_COUNT) { return true; } + + for (auto &r : req.ranges) { + auto &first_pos = r.first; + auto &last_pos = r.second; + + if (first_pos == -1 && last_pos == -1) { + first_pos = 0; + last_pos = contant_len; + } + + if (first_pos == -1) { + first_pos = contant_len - last_pos; + last_pos = contant_len - 1; + } + + if (last_pos == -1) { last_pos = contant_len - 1; } + + // Range must be within content length + if (!(0 <= first_pos && first_pos <= last_pos && + last_pos <= contant_len - 1)) { + return true; + } + + // Ranges must be in ascending order + if (first_pos <= prev_first_pos) { return true; } + + // Request must not have more than two overlapping ranges + if (first_pos <= prev_last_pos) { + overwrapping_count++; + if (overwrapping_count > 2) { return true; } + } + + prev_first_pos = (std::max)(prev_first_pos, first_pos); + prev_last_pos = (std::max)(prev_last_pos, last_pos); + } + } + + return false; +} + inline std::pair -get_range_offset_and_length(const Request &req, size_t content_length, - size_t index) { - auto r = req.ranges[index]; +get_range_offset_and_length(Range r, size_t content_length) { + (void)(content_length); // patch to get rid of "unused parameter" on release build + assert(r.first != -1 && r.second != -1); + assert(0 <= r.first && r.first < static_cast(content_length)); + assert(r.first <= r.second && + r.second < static_cast(content_length)); - if (r.first == -1 && r.second == -1) { - return std::make_pair(0, content_length); - } - - auto slen = static_cast(content_length); - - if (r.first == -1) { - r.first = (std::max)(static_cast(0), slen - r.second); - r.second = slen - 1; - } - - if (r.second == -1) { r.second = slen - 1; } return std::make_pair(r.first, static_cast(r.second - r.first) + 1); } -inline std::string 
make_content_range_header_field(size_t offset, size_t length, - size_t content_length) { +inline std::string make_content_range_header_field( + const std::pair &offset_and_length, size_t content_length) { + auto st = offset_and_length.first; + auto ed = st + offset_and_length.second - 1; + std::string field = "bytes "; - field += std::to_string(offset); + field += std::to_string(st); field += "-"; - field += std::to_string(offset + length - 1); + field += std::to_string(ed); field += "/"; field += std::to_string(content_length); return field; } template -bool process_multipart_ranges_data(const Request &req, Response &res, +bool process_multipart_ranges_data(const Request &req, const std::string &boundary, const std::string &content_type, - SToken stoken, CToken ctoken, - Content content) { + size_t content_length, SToken stoken, + CToken ctoken, Content content) { for (size_t i = 0; i < req.ranges.size(); i++) { ctoken("--"); stoken(boundary); @@ -4356,50 +4834,51 @@ bool process_multipart_ranges_data(const Request &req, Response &res, ctoken("\r\n"); } - auto offsets = get_range_offset_and_length(req, res.body.size(), i); - auto offset = offsets.first; - auto length = offsets.second; + auto offset_and_length = + get_range_offset_and_length(req.ranges[i], content_length); ctoken("Content-Range: "); - stoken(make_content_range_header_field(offset, length, res.body.size())); + stoken(make_content_range_header_field(offset_and_length, content_length)); ctoken("\r\n"); ctoken("\r\n"); - if (!content(offset, length)) { return false; } + + if (!content(offset_and_length.first, offset_and_length.second)) { + return false; + } ctoken("\r\n"); } ctoken("--"); stoken(boundary); - ctoken("--\r\n"); + ctoken("--"); return true; } -inline bool make_multipart_ranges_data(const Request &req, Response &res, +inline void make_multipart_ranges_data(const Request &req, Response &res, const std::string &boundary, const std::string &content_type, + size_t content_length, std::string &data) { - return process_multipart_ranges_data( - req, res, boundary, content_type, + process_multipart_ranges_data( + req, boundary, content_type, content_length, [&](const std::string &token) { data += token; }, [&](const std::string &token) { data += token; }, [&](size_t offset, size_t length) { - if (offset < res.body.size()) { - data += res.body.substr(offset, length); - return true; - } - return false; + assert(offset + length <= content_length); + data += res.body.substr(offset, length); + return true; }); } -inline size_t -get_multipart_ranges_data_length(const Request &req, Response &res, - const std::string &boundary, - const std::string &content_type) { +inline size_t get_multipart_ranges_data_length(const Request &req, + const std::string &boundary, + const std::string &content_type, + size_t content_length) { size_t data_length = 0; process_multipart_ranges_data( - req, res, boundary, content_type, + req, boundary, content_type, content_length, [&](const std::string &token) { data_length += token.size(); }, [&](const std::string &token) { data_length += token.size(); }, [&](size_t /*offset*/, size_t length) { @@ -4411,13 +4890,13 @@ get_multipart_ranges_data_length(const Request &req, Response &res, } template -inline bool write_multipart_ranges_data(Stream &strm, const Request &req, - Response &res, - const std::string &boundary, - const std::string &content_type, - const T &is_shutting_down) { +inline bool +write_multipart_ranges_data(Stream &strm, const Request &req, Response &res, + const std::string &boundary, + 
const std::string &content_type, + size_t content_length, const T &is_shutting_down) { return process_multipart_ranges_data( - req, res, boundary, content_type, + req, boundary, content_type, content_length, [&](const std::string &token) { strm.write(token); }, [&](const std::string &token) { strm.write(token); }, [&](size_t offset, size_t length) { @@ -4426,18 +4905,6 @@ inline bool write_multipart_ranges_data(Stream &strm, const Request &req, }); } -inline std::pair -get_range_offset_and_length(const Request &req, const Response &res, - size_t index) { - auto r = req.ranges[index]; - - if (r.second == -1) { - r.second = static_cast(res.content_length_) - 1; - } - - return std::make_pair(r.first, r.second - r.first + 1); -} - inline bool expect_content(const Request &req) { if (req.method == "POST" || req.method == "PUT" || req.method == "PATCH" || req.method == "PRI" || req.method == "DELETE") { @@ -4471,7 +4938,7 @@ inline std::string message_digest(const std::string &s, const EVP_MD *algo) { std::stringstream ss; for (auto i = 0u; i < hash_length; ++i) { ss << std::hex << std::setw(2) << std::setfill('0') - << (unsigned int)hash[i]; + << static_cast(hash[i]); } return ss.str(); @@ -4564,7 +5031,7 @@ inline bool retrieve_root_certs_from_keychain(CFObjectPtr &certs) { inline bool add_certs_to_x509_store(CFArrayRef certs, X509_STORE *store) { auto result = false; - for (int i = 0; i < CFArrayGetCount(certs); ++i) { + for (auto i = 0; i < CFArrayGetCount(certs); ++i) { const auto cert = reinterpret_cast( CFArrayGetValueAtIndex(certs, i)); @@ -4707,7 +5174,7 @@ inline bool parse_www_authenticate(const Response &res, s = s.substr(pos + 1); auto beg = std::sregex_iterator(s.begin(), s.end(), re); for (auto i = beg; i != std::sregex_iterator(); ++i) { - auto m = *i; + const auto &m = *i; auto key = s.substr(static_cast(m.position(1)), static_cast(m.length(1))); auto val = m.length(2) > 0 @@ -4724,20 +5191,6 @@ inline bool parse_www_authenticate(const Response &res, return false; } -// https://stackoverflow.com/questions/440133/how-do-i-create-a-random-alpha-numeric-string-in-c/440240#answer-440240 -inline std::string random_string(size_t length) { - auto randchar = []() -> char { - const char charset[] = "0123456789" - "ABCDEFGHIJKLMNOPQRSTUVWXYZ" - "abcdefghijklmnopqrstuvwxyz"; - const size_t max_index = (sizeof(charset) - 1); - return charset[static_cast(std::rand()) % max_index]; - }; - std::string str(length, 0); - std::generate_n(str.begin(), length, randchar); - return str; -} - class ContentProviderAdapter { public: explicit ContentProviderAdapter( @@ -4782,7 +5235,7 @@ inline void hosted_at(const std::string &hostname, const auto &addr = *reinterpret_cast(rp->ai_addr); std::string ip; - int dummy = -1; + auto dummy = -1; if (detail::get_ip_and_port(addr, sizeof(struct sockaddr_storage), ip, dummy)) { addrs.push_back(ip); @@ -4802,10 +5255,11 @@ inline std::string append_query_params(const std::string &path, } // Header utilities -inline std::pair make_range_header(Ranges ranges) { +inline std::pair +make_range_header(const Ranges &ranges) { std::string field = "bytes="; auto i = 0; - for (auto r : ranges) { + for (const auto &r : ranges) { if (i != 0) { field += ", "; } if (r.first != -1) { field += std::to_string(r.first); } field += '-'; @@ -4924,7 +5378,7 @@ inline void Response::set_redirect(const std::string &url, int stat) { if (300 <= stat && stat < 400) { this->status = stat; } else { - this->status = 302; + this->status = StatusCode::Found_302; } } } @@ -4943,13 +5397,22 @@ 
inline void Response::set_content(const std::string &s, set_content(s.data(), s.size(), content_type); } +inline void Response::set_content(std::string &&s, + const std::string &content_type) { + body = std::move(s); + + auto rng = headers.equal_range("Content-Type"); + headers.erase(rng.first, rng.second); + set_header("Content-Type", content_type); +} + inline void Response::set_content_provider( size_t in_length, const std::string &content_type, ContentProvider provider, ContentProviderResourceReleaser resource_releaser) { set_header("Content-Type", content_type); content_length_ = in_length; if (in_length > 0) { content_provider_ = std::move(provider); } - content_provider_resource_releaser_ = resource_releaser; + content_provider_resource_releaser_ = std::move(resource_releaser); is_chunked_content_provider_ = false; } @@ -4959,7 +5422,7 @@ inline void Response::set_content_provider( set_header("Content-Type", content_type); content_length_ = 0; content_provider_ = detail::ContentProviderAdapter(std::move(provider)); - content_provider_resource_releaser_ = resource_releaser; + content_provider_resource_releaser_ = std::move(resource_releaser); is_chunked_content_provider_ = false; } @@ -4969,7 +5432,7 @@ inline void Response::set_chunked_content_provider( set_header("Content-Type", content_type); content_length_ = 0; content_provider_ = detail::ContentProviderAdapter(std::move(provider)); - content_provider_resource_releaser_ = resource_releaser; + content_provider_resource_releaser_ = std::move(resource_releaser); is_chunked_content_provider_ = true; } @@ -5010,7 +5473,7 @@ inline SocketStream::SocketStream(socket_t sock, time_t read_timeout_sec, write_timeout_sec_(write_timeout_sec), write_timeout_usec_(write_timeout_usec), read_buff_(read_buff_size_, 0) {} -inline SocketStream::~SocketStream() {} +inline SocketStream::~SocketStream() = default; inline bool SocketStream::is_readable() const { return select_read(sock_, read_timeout_sec_, read_timeout_usec_) > 0; @@ -5120,6 +5583,99 @@ inline socket_t BufferStream::socket() const { return 0; } inline const std::string &BufferStream::get_buffer() const { return buffer; } +inline PathParamsMatcher::PathParamsMatcher(const std::string &pattern) { + // One past the last ending position of a path param substring + std::size_t last_param_end = 0; + +#ifndef CPPHTTPLIB_NO_EXCEPTIONS + // Needed to ensure that parameter names are unique during matcher + // construction + // If exceptions are disabled, only last duplicate path + // parameter will be set + std::unordered_set param_name_set; +#endif + + while (true) { + const auto marker_pos = pattern.find(marker, last_param_end); + if (marker_pos == std::string::npos) { break; } + + static_fragments_.push_back( + pattern.substr(last_param_end, marker_pos - last_param_end)); + + const auto param_name_start = marker_pos + 1; + + auto sep_pos = pattern.find(separator, param_name_start); + if (sep_pos == std::string::npos) { sep_pos = pattern.length(); } + + auto param_name = + pattern.substr(param_name_start, sep_pos - param_name_start); + +#ifndef CPPHTTPLIB_NO_EXCEPTIONS + if (param_name_set.find(param_name) != param_name_set.cend()) { + std::string msg = "Encountered path parameter '" + param_name + + "' multiple times in route pattern '" + pattern + "'."; + throw std::invalid_argument(msg); + } +#endif + + param_names_.push_back(std::move(param_name)); + + last_param_end = sep_pos + 1; + } + + if (last_param_end < pattern.length()) { + 
static_fragments_.push_back(pattern.substr(last_param_end)); + } +} + +inline bool PathParamsMatcher::match(Request &request) const { + request.matches = std::smatch(); + request.path_params.clear(); + request.path_params.reserve(param_names_.size()); + + // One past the position at which the path matched the pattern last time + std::size_t starting_pos = 0; + for (size_t i = 0; i < static_fragments_.size(); ++i) { + const auto &fragment = static_fragments_[i]; + + if (starting_pos + fragment.length() > request.path.length()) { + return false; + } + + // Avoid unnecessary allocation by using strncmp instead of substr + + // comparison + if (std::strncmp(request.path.c_str() + starting_pos, fragment.c_str(), + fragment.length()) != 0) { + return false; + } + + starting_pos += fragment.length(); + + // Should only happen when we have a static fragment after a param + // Example: '/users/:id/subscriptions' + // The 'subscriptions' fragment here does not have a corresponding param + if (i >= param_names_.size()) { continue; } + + auto sep_pos = request.path.find(separator, starting_pos); + if (sep_pos == std::string::npos) { sep_pos = request.path.length(); } + + const auto ¶m_name = param_names_[i]; + + request.path_params.emplace( + param_name, request.path.substr(starting_pos, sep_pos - starting_pos)); + + // Mark everythin up to '/' as matched + starting_pos = sep_pos + 1; + } + // Returns false if the path is longer than the pattern + return starting_pos >= request.path.length(); +} + +inline bool RegexMatcher::match(Request &request) const { + request.path_params.clear(); + return std::regex_match(request.path, request.matches, regex_); +} + } // namespace detail // HTTP server implementation @@ -5131,69 +5687,72 @@ inline Server::Server() #endif } -inline Server::~Server() {} +inline Server::~Server() = default; + +inline std::unique_ptr +Server::make_matcher(const std::string &pattern) { + if (pattern.find("/:") != std::string::npos) { + return detail::make_unique(pattern); + } else { + return detail::make_unique(pattern); + } +} inline Server &Server::Get(const std::string &pattern, Handler handler) { - get_handlers_.push_back( - std::make_pair(std::regex(pattern), std::move(handler))); + get_handlers_.emplace_back(make_matcher(pattern), std::move(handler)); return *this; } inline Server &Server::Post(const std::string &pattern, Handler handler) { - post_handlers_.push_back( - std::make_pair(std::regex(pattern), std::move(handler))); + post_handlers_.emplace_back(make_matcher(pattern), std::move(handler)); return *this; } inline Server &Server::Post(const std::string &pattern, HandlerWithContentReader handler) { - post_handlers_for_content_reader_.push_back( - std::make_pair(std::regex(pattern), std::move(handler))); + post_handlers_for_content_reader_.emplace_back(make_matcher(pattern), + std::move(handler)); return *this; } inline Server &Server::Put(const std::string &pattern, Handler handler) { - put_handlers_.push_back( - std::make_pair(std::regex(pattern), std::move(handler))); + put_handlers_.emplace_back(make_matcher(pattern), std::move(handler)); return *this; } inline Server &Server::Put(const std::string &pattern, HandlerWithContentReader handler) { - put_handlers_for_content_reader_.push_back( - std::make_pair(std::regex(pattern), std::move(handler))); + put_handlers_for_content_reader_.emplace_back(make_matcher(pattern), + std::move(handler)); return *this; } inline Server &Server::Patch(const std::string &pattern, Handler handler) { - patch_handlers_.push_back( - 
std::make_pair(std::regex(pattern), std::move(handler))); + patch_handlers_.emplace_back(make_matcher(pattern), std::move(handler)); return *this; } inline Server &Server::Patch(const std::string &pattern, HandlerWithContentReader handler) { - patch_handlers_for_content_reader_.push_back( - std::make_pair(std::regex(pattern), std::move(handler))); + patch_handlers_for_content_reader_.emplace_back(make_matcher(pattern), + std::move(handler)); return *this; } inline Server &Server::Delete(const std::string &pattern, Handler handler) { - delete_handlers_.push_back( - std::make_pair(std::regex(pattern), std::move(handler))); + delete_handlers_.emplace_back(make_matcher(pattern), std::move(handler)); return *this; } inline Server &Server::Delete(const std::string &pattern, HandlerWithContentReader handler) { - delete_handlers_for_content_reader_.push_back( - std::make_pair(std::regex(pattern), std::move(handler))); + delete_handlers_for_content_reader_.emplace_back(make_matcher(pattern), + std::move(handler)); return *this; } inline Server &Server::Options(const std::string &pattern, Handler handler) { - options_handlers_.push_back( - std::make_pair(std::regex(pattern), std::move(handler))); + options_handlers_.emplace_back(make_matcher(pattern), std::move(handler)); return *this; } @@ -5231,6 +5790,11 @@ Server::set_file_extension_and_mimetype_mapping(const std::string &ext, return *this; } +inline Server &Server::set_default_file_mimetype(const std::string &mime) { + default_file_mimetype_ = mime; + return *this; +} + inline Server &Server::set_file_request_handler(Handler handler) { file_request_handler_ = std::move(handler); return *this; @@ -5272,7 +5836,6 @@ inline Server &Server::set_logger(Logger logger) { inline Server & Server::set_expect_100_continue_handler(Expect100ContinueHandler handler) { expect_100_continue_handler_ = std::move(handler); - return *this; } @@ -5296,6 +5859,12 @@ inline Server &Server::set_default_headers(Headers headers) { return *this; } +inline Server &Server::set_header_writer( + std::function const &writer) { + header_writer_ = writer; + return *this; +} + inline Server &Server::set_keep_alive_max_count(size_t count) { keep_alive_max_count_ = count; return *this; @@ -5331,8 +5900,7 @@ inline Server &Server::set_payload_max_length(size_t length) { inline bool Server::bind_to_port(const std::string &host, int port, int socket_flags) { - if (bind_internal(host, port, socket_flags) < 0) return false; - return true; + return bind_internal(host, port, socket_flags) >= 0; } inline int Server::bind_to_any_port(const std::string &host, int socket_flags) { return bind_internal(host, 0, socket_flags); @@ -5366,7 +5934,7 @@ inline void Server::stop() { } } -inline bool Server::parse_request_line(const char *s, Request &req) { +inline bool Server::parse_request_line(const char *s, Request &req) const { auto len = strlen(s); if (len < 2 || s[len - 2] != '\r' || s[len - 1] != '\n') { return false; } len -= 2; @@ -5407,7 +5975,7 @@ inline bool Server::parse_request_line(const char *s, Request &req) { size_t count = 0; detail::split(req.target.data(), req.target.data() + req.target.size(), '?', - [&](const char *b, const char *e) { + 2, [&](const char *b, const char *e) { switch (count) { case 0: req.path = detail::decode_url(std::string(b, e), false); @@ -5430,7 +5998,10 @@ inline bool Server::parse_request_line(const char *s, Request &req) { } inline bool Server::write_response(Stream &strm, bool close_connection, - const Request &req, Response &res) { + Request &req, 
Response &res) { + // NOTE: `req.ranges` should be empty, otherwise it will be applied + // incorrectly to the error content. + req.ranges.clear(); return write_response_core(strm, close_connection, req, res, false); } @@ -5486,11 +6057,11 @@ inline bool Server::write_response_core(Stream &strm, bool close_connection, detail::BufferStream bstrm; if (!bstrm.write_format("HTTP/1.1 %d %s\r\n", res.status, - detail::status_message(res.status))) { + status_message(res.status))) { return false; } - if (!detail::write_headers(bstrm, res.headers)) { return false; } + if (!header_writer_(bstrm, res.headers)) { return false; } // Flush buffer auto &data = bstrm.get_buffer(); @@ -5508,7 +6079,6 @@ inline bool Server::write_response_core(Stream &strm, bool close_connection, if (write_content_with_provider(strm, req, res, boundary, content_type)) { res.content_provider_success_ = true; } else { - res.content_provider_success_ = false; ret = false; } } @@ -5533,15 +6103,16 @@ Server::write_content_with_provider(Stream &strm, const Request &req, return detail::write_content(strm, res.content_provider_, 0, res.content_length_, is_shutting_down); } else if (req.ranges.size() == 1) { - auto offsets = - detail::get_range_offset_and_length(req, res.content_length_, 0); - auto offset = offsets.first; - auto length = offsets.second; - return detail::write_content(strm, res.content_provider_, offset, length, - is_shutting_down); + auto offset_and_length = detail::get_range_offset_and_length( + req.ranges[0], res.content_length_); + + return detail::write_content(strm, res.content_provider_, + offset_and_length.first, + offset_and_length.second, is_shutting_down); } else { return detail::write_multipart_ranges_data( - strm, req, res, boundary, content_type, is_shutting_down); + strm, req, res, boundary, content_type, res.content_length_, + is_shutting_down); } } else { if (res.is_chunked_content_provider_) { @@ -5598,7 +6169,7 @@ inline bool Server::read_content(Stream &strm, Request &req, Response &res) { const auto &content_type = req.get_header_value("Content-Type"); if (!content_type.find("application/x-www-form-urlencoded")) { if (req.body.size() > CPPHTTPLIB_FORM_URL_ENCODED_PAYLOAD_MAX_LENGTH) { - res.status = 413; // NOTE: should be 414? + res.status = StatusCode::PayloadTooLarge_413; // NOTE: should be 414? 
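
// Worked example of the range normalization earlier in this change, for a
// hypothetical content_length of 1000:
//
//   bytes=0-99  -> first=0, last=99                  -> offset 0,   length 100
//   bytes=-100  -> suffix form: first=900, last=999  -> offset 900, length 100
//   bytes=500-  -> open end: last=999                -> offset 500, length 500
//
// range_error() performs this normalization and flags descending or heavily
// overlapping range sets so the caller can answer 416;
// get_range_offset_and_length() then simply returns {first, last - first + 1}
// for each normalized pair, which is what the single-range branch of
// write_content_with_provider above feeds to detail::write_content.
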
return false; } detail::parse_query_text(req.body, req.params); @@ -5617,10 +6188,11 @@ inline bool Server::read_content_with_content_receiver( std::move(multipart_receiver)); } -inline bool Server::read_content_core(Stream &strm, Request &req, Response &res, - ContentReceiver receiver, - MultipartContentHeader multipart_header, - ContentReceiver multipart_receiver) { +inline bool +Server::read_content_core(Stream &strm, Request &req, Response &res, + ContentReceiver receiver, + MultipartContentHeader multipart_header, + ContentReceiver multipart_receiver) const { detail::MultipartFormDataParser multipart_form_data_parser; ContentReceiverWithProgress out; @@ -5628,7 +6200,7 @@ inline bool Server::read_content_core(Stream &strm, Request &req, Response &res, const auto &content_type = req.get_header_value("Content-Type"); std::string boundary; if (!detail::parse_multipart_boundary(content_type, boundary)) { - res.status = 400; + res.status = StatusCode::BadRequest_400; return false; } @@ -5664,7 +6236,7 @@ inline bool Server::read_content_core(Stream &strm, Request &req, Response &res, if (req.is_multipart_form_data()) { if (!multipart_form_data_parser.is_valid()) { - res.status = 400; + res.status = StatusCode::BadRequest_400; return false; } } @@ -5683,17 +6255,26 @@ inline bool Server::handle_file_request(const Request &req, Response &res, if (path.back() == '/') { path += "index.html"; } if (detail::is_file(path)) { - detail::read_file(path, res.body); - auto type = - detail::find_content_type(path, file_extension_and_mimetype_map_); - if (type) { res.set_header("Content-Type", type); } for (const auto &kv : entry.headers) { - res.set_header(kv.first.c_str(), kv.second); + res.set_header(kv.first, kv.second); } - res.status = req.has_header("Range") ? 
206 : 200; + + auto mm = std::make_shared(path.c_str()); + if (!mm->is_open()) { return false; } + + res.set_content_provider( + mm->size(), + detail::find_content_type(path, file_extension_and_mimetype_map_, + default_file_mimetype_), + [mm](size_t offset, size_t length, DataSink &sink) -> bool { + sink.write(mm->data() + offset, length); + return true; + }); + if (!head && file_request_handler_) { file_request_handler_(req, res); } + return true; } } @@ -5789,13 +6370,14 @@ inline bool Server::listen_internal() { #ifdef _WIN32 auto timeout = static_cast(read_timeout_sec_ * 1000 + read_timeout_usec_ / 1000); - setsockopt(sock, SOL_SOCKET, SO_RCVTIMEO, (char *)&timeout, - sizeof(timeout)); + setsockopt(sock, SOL_SOCKET, SO_RCVTIMEO, + reinterpret_cast(&timeout), sizeof(timeout)); #else timeval tv; tv.tv_sec = static_cast(read_timeout_sec_); tv.tv_usec = static_cast(read_timeout_usec_); - setsockopt(sock, SOL_SOCKET, SO_RCVTIMEO, (char *)&tv, sizeof(tv)); + setsockopt(sock, SOL_SOCKET, SO_RCVTIMEO, + reinterpret_cast(&tv), sizeof(tv)); #endif } { @@ -5803,17 +6385,22 @@ inline bool Server::listen_internal() { #ifdef _WIN32 auto timeout = static_cast(write_timeout_sec_ * 1000 + write_timeout_usec_ / 1000); - setsockopt(sock, SOL_SOCKET, SO_SNDTIMEO, (char *)&timeout, - sizeof(timeout)); + setsockopt(sock, SOL_SOCKET, SO_SNDTIMEO, + reinterpret_cast(&timeout), sizeof(timeout)); #else timeval tv; tv.tv_sec = static_cast(write_timeout_sec_); tv.tv_usec = static_cast(write_timeout_usec_); - setsockopt(sock, SOL_SOCKET, SO_SNDTIMEO, (char *)&tv, sizeof(tv)); + setsockopt(sock, SOL_SOCKET, SO_SNDTIMEO, + reinterpret_cast(&tv), sizeof(tv)); #endif } - task_queue->enqueue([this, sock]() { process_and_close_socket(sock); }); + if (!task_queue->enqueue( + [this, sock]() { process_and_close_socket(sock); })) { + detail::shutdown_socket(sock); + detail::close_socket(sock); + } } task_queue->shutdown(); @@ -5829,7 +6416,7 @@ inline bool Server::routing(Request &req, Response &res, Stream &strm) { } // File handler - bool is_head_request = req.method == "HEAD"; + auto is_head_request = req.method == "HEAD"; if ((req.method == "GET" || is_head_request) && handle_file_request(req, res, is_head_request)) { return true; @@ -5895,17 +6482,17 @@ inline bool Server::routing(Request &req, Response &res, Stream &strm) { return dispatch_request(req, res, patch_handlers_); } - res.status = 400; + res.status = StatusCode::BadRequest_400; return false; } inline bool Server::dispatch_request(Request &req, Response &res, - const Handlers &handlers) { + const Handlers &handlers) const { for (const auto &x : handlers) { - const auto &pattern = x.first; + const auto &matcher = x.first; const auto &handler = x.second; - if (std::regex_match(req.path, req.matches, pattern)) { + if (matcher->match(req)) { handler(req, res); return true; } @@ -5915,18 +6502,18 @@ inline bool Server::dispatch_request(Request &req, Response &res, inline void Server::apply_ranges(const Request &req, Response &res, std::string &content_type, - std::string &boundary) { + std::string &boundary) const { if (req.ranges.size() > 1) { - boundary = detail::make_multipart_data_boundary(); - auto it = res.headers.find("Content-Type"); if (it != res.headers.end()) { content_type = it->second; res.headers.erase(it); } - res.headers.emplace("Content-Type", - "multipart/byteranges; boundary=" + boundary); + boundary = detail::make_multipart_data_boundary(); + + res.set_header("Content-Type", + "multipart/byteranges; boundary=" + boundary); } auto type = 
detail::encoding_type(req, res); @@ -5937,16 +6524,17 @@ inline void Server::apply_ranges(const Request &req, Response &res, if (req.ranges.empty()) { length = res.content_length_; } else if (req.ranges.size() == 1) { - auto offsets = - detail::get_range_offset_and_length(req, res.content_length_, 0); - auto offset = offsets.first; - length = offsets.second; + auto offset_and_length = detail::get_range_offset_and_length( + req.ranges[0], res.content_length_); + + length = offset_and_length.second; + auto content_range = detail::make_content_range_header_field( - offset, length, res.content_length_); + offset_and_length, res.content_length_); res.set_header("Content-Range", content_range); } else { - length = detail::get_multipart_ranges_data_length(req, res, boundary, - content_type); + length = detail::get_multipart_ranges_data_length( + req, boundary, content_type, res.content_length_); } res.set_header("Content-Length", std::to_string(length)); } else { @@ -5965,28 +6553,22 @@ inline void Server::apply_ranges(const Request &req, Response &res, if (req.ranges.empty()) { ; } else if (req.ranges.size() == 1) { - auto offsets = - detail::get_range_offset_and_length(req, res.body.size(), 0); - auto offset = offsets.first; - auto length = offsets.second; + auto offset_and_length = + detail::get_range_offset_and_length(req.ranges[0], res.body.size()); + auto offset = offset_and_length.first; + auto length = offset_and_length.second; + auto content_range = detail::make_content_range_header_field( - offset, length, res.body.size()); + offset_and_length, res.body.size()); res.set_header("Content-Range", content_range); - if (offset < res.body.size()) { - res.body = res.body.substr(offset, length); - } else { - res.body.clear(); - res.status = 416; - } + + assert(offset + length <= res.body.size()); + res.body = res.body.substr(offset, length); } else { std::string data; - if (detail::make_multipart_ranges_data(req, res, boundary, content_type, - data)) { - res.body.swap(data); - } else { - res.body.clear(); - res.status = 416; - } + detail::make_multipart_ranges_data(req, res, boundary, content_type, + res.body.size(), data); + res.body.swap(data); } if (type != detail::EncodingType::None) { @@ -6025,12 +6607,12 @@ inline void Server::apply_ranges(const Request &req, Response &res, inline bool Server::dispatch_request_for_content_reader( Request &req, Response &res, ContentReader content_reader, - const HandlersForContentReader &handlers) { + const HandlersForContentReader &handlers) const { for (const auto &x : handlers) { - const auto &pattern = x.first; + const auto &matcher = x.first; const auto &handler = x.second; - if (std::regex_match(req.path, req.matches, pattern)) { + if (matcher->match(req)) { handler(req, res, content_reader); return true; } @@ -6050,15 +6632,10 @@ Server::process_request(Stream &strm, bool close_connection, if (!line_reader.getline()) { return false; } Request req; + Response res; - res.version = "HTTP/1.1"; - - for (const auto &header : default_headers_) { - if (res.headers.find(header.first) == res.headers.end()) { - res.headers.insert(header); - } - } + res.headers = default_headers_; #ifdef _WIN32 // TODO: Increase FD_SETSIZE statically (libzmq), dynamically (MySQL). 
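
The matcher selection added earlier in this patch (Server::make_matcher)
compiles each route pattern once: into a PathParamsMatcher when the pattern
contains "/:", otherwise into the old regex-based matcher. A minimal usage
sketch, assuming only the public cpp-httplib server API (route paths and
port are made up):

    #include "httplib.h"

    int main() {
      httplib::Server svr;

      // Contains "/:", so it is compiled into a PathParamsMatcher.
      svr.Get("/users/:id",
              [](const httplib::Request &req, httplib::Response &res) {
        res.set_content("user " + req.path_params.at("id"), "text/plain");
      });

      // No "/:" marker, so this still goes through the regex matcher.
      svr.Get(R"(/numbers/(\d+))",
              [](const httplib::Request &req, httplib::Response &res) {
        res.set_content(req.matches[1].str(), "text/plain");
      });

      svr.listen("0.0.0.0", 8080);
    }

Path parameters avoid regex matching entirely: each static fragment is
compared with strncmp, and the text between separators is copied into
req.path_params.
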
@@ -6068,7 +6645,7 @@ Server::process_request(Stream &strm, bool close_connection, if (strm.socket() >= FD_SETSIZE) { Headers dummy; detail::read_headers(strm, dummy); - res.status = 500; + res.status = StatusCode::InternalServerError_500; return write_response(strm, close_connection, req, res); } #endif @@ -6078,14 +6655,14 @@ Server::process_request(Stream &strm, bool close_connection, if (line_reader.size() > CPPHTTPLIB_REQUEST_URI_MAX_LENGTH) { Headers dummy; detail::read_headers(strm, dummy); - res.status = 414; + res.status = StatusCode::UriTooLong_414; return write_response(strm, close_connection, req, res); } // Request line and headers if (!parse_request_line(line_reader.ptr(), req) || !detail::read_headers(strm, req.headers)) { - res.status = 400; + res.status = StatusCode::BadRequest_400; return write_response(strm, close_connection, req, res); } @@ -6109,7 +6686,7 @@ Server::process_request(Stream &strm, bool close_connection, if (req.has_header("Range")) { const auto &range_header_value = req.get_header_value("Range"); if (!detail::parse_range_header(range_header_value, req.ranges)) { - res.status = 416; + res.status = StatusCode::RangeNotSatisfiable_416; return write_response(strm, close_connection, req, res); } } @@ -6117,22 +6694,22 @@ Server::process_request(Stream &strm, bool close_connection, if (setup_request) { setup_request(req); } if (req.get_header_value("Expect") == "100-continue") { - auto status = 100; + int status = StatusCode::Continue_100; if (expect_100_continue_handler_) { status = expect_100_continue_handler_(req, res); } switch (status) { - case 100: - case 417: + case StatusCode::Continue_100: + case StatusCode::ExpectationFailed_417: strm.write_format("HTTP/1.1 %d %s\r\n\r\n", status, - detail::status_message(status)); + status_message(status)); break; default: return write_response(strm, close_connection, req, res); } } - // Rounting - bool routed = false; + // Routing + auto routed = false; #ifdef CPPHTTPLIB_NO_EXCEPTIONS routed = routing(req, res, strm); #else @@ -6144,7 +6721,7 @@ Server::process_request(Stream &strm, bool close_connection, exception_handler_(req, res, ep); routed = true; } else { - res.status = 500; + res.status = StatusCode::InternalServerError_500; std::string val; auto s = e.what(); for (size_t i = 0; s[i]; i++) { @@ -6162,17 +6739,29 @@ Server::process_request(Stream &strm, bool close_connection, exception_handler_(req, res, ep); routed = true; } else { - res.status = 500; + res.status = StatusCode::InternalServerError_500; res.set_header("EXCEPTION_WHAT", "UNKNOWN"); } } #endif - if (routed) { - if (res.status == -1) { res.status = req.ranges.empty() ? 200 : 206; } + if (res.status == -1) { + res.status = req.ranges.empty() ? 
StatusCode::OK_200 + : StatusCode::PartialContent_206; + } + + if (detail::range_error(req, res)) { + res.body.clear(); + res.content_length_ = 0; + res.content_provider_ = nullptr; + res.status = StatusCode::RangeNotSatisfiable_416; + return write_response(strm, close_connection, req, res); + } + return write_response_with_content(strm, close_connection, req, res); } else { - if (res.status == -1) { res.status = 404; } + if (res.status == -1) { res.status = StatusCode::NotFound_404; } + return write_response(strm, close_connection, req, res); } } @@ -6272,7 +6861,7 @@ inline socket_t ClientImpl::create_client_socket(Error &error) const { // Check is custom IP specified for host_ std::string ip; auto it = addr_map_.find(host_); - if (it != addr_map_.end()) ip = it->second; + if (it != addr_map_.end()) { ip = it->second; } return detail::create_client_socket( host_, ip, port_, address_family_, tcp_nodelay_, socket_options_, @@ -6297,7 +6886,7 @@ inline void ClientImpl::shutdown_ssl(Socket & /*socket*/, socket_requests_are_from_thread_ == std::this_thread::get_id()); } -inline void ClientImpl::shutdown_socket(Socket &socket) { +inline void ClientImpl::shutdown_socket(Socket &socket) const { if (socket.sock == INVALID_SOCKET) { return; } detail::shutdown_socket(socket.sock); } @@ -6322,7 +6911,7 @@ inline void ClientImpl::close_socket(Socket &socket) { } inline bool ClientImpl::read_response_line(Stream &strm, const Request &req, - Response &res) { + Response &res) const { std::array buf{}; detail::stream_line_reader line_reader(strm, buf.data(), buf.size()); @@ -6330,9 +6919,9 @@ inline bool ClientImpl::read_response_line(Stream &strm, const Request &req, if (!line_reader.getline()) { return false; } #ifdef CPPHTTPLIB_ALLOW_LF_AS_LINE_TERMINATOR - const static std::regex re("(HTTP/1\\.[01]) (\\d{3})(?: (.*?))?\r\n"); -#else const static std::regex re("(HTTP/1\\.[01]) (\\d{3})(?: (.*?))?\r?\n"); +#else + const static std::regex re("(HTTP/1\\.[01]) (\\d{3})(?: (.*?))?\r\n"); #endif std::cmatch m; @@ -6344,7 +6933,7 @@ inline bool ClientImpl::read_response_line(Stream &strm, const Request &req, res.reason = std::string(m[3]); // Ignore '100 Continue' - while (res.status == 100) { + while (res.status == StatusCode::Continue_100) { if (!line_reader.getline()) { return false; } // CRLF if (!line_reader.getline()) { return false; } // next response line @@ -6492,15 +7081,31 @@ inline bool ClientImpl::handle_request(Stream &strm, Request &req, if (!ret) { return false; } + if (res.get_header_value("Connection") == "close" || + (res.version == "HTTP/1.0" && res.reason != "Connection established")) { + // TODO this requires a not-entirely-obvious chain of calls to be correct + // for this to be safe. + + // This is safe to call because handle_request is only called by send_ + // which locks the request mutex during the process. It would be a bug + // to call it from a different thread since it's a thread-safety issue + // to do these things to the socket if another thread is using the socket. 
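
// The 401/407 retry below is driven purely by client-side configuration.
// A minimal usage sketch, assuming the public cpp-httplib client API and
// made-up host and credentials:
//
//   httplib::Client cli("example.com");
//   cli.set_digest_auth("user", "pass");          // answers 401 challenges
//   cli.set_proxy_digest_auth("puser", "ppass");  // answers 407 challenges
//   auto res = cli.Get("/protected");
//
// Both setters require building with CPPHTTPLIB_OPENSSL_SUPPORT, since the
// digest response is hashed with OpenSSL's EVP message digests, and the
// retry is capped by authorization_count_ as checked below.
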
+ std::lock_guard<std::mutex> guard(socket_mutex_); + shutdown_ssl(socket_, true); + shutdown_socket(socket_); + close_socket(socket_); + } + if (300 < res.status && res.status < 400 && follow_location_) { req = req_save; ret = redirect(req, res, error); } #ifdef CPPHTTPLIB_OPENSSL_SUPPORT - if ((res.status == 401 || res.status == 407) && + if ((res.status == StatusCode::Unauthorized_401 || + res.status == StatusCode::ProxyAuthenticationRequired_407) && req.authorization_count_ < 5) { - auto is_proxy = res.status == 407; + auto is_proxy = res.status == StatusCode::ProxyAuthenticationRequired_407; const auto &username = is_proxy ? proxy_digest_auth_username_ : digest_auth_username_; const auto &password = @@ -6571,7 +7176,7 @@ inline bool ClientImpl::redirect(Request &req, Response &res, Error &error) { } else { if (next_scheme == "https") { #ifdef CPPHTTPLIB_OPENSSL_SUPPORT - SSLClient cli(next_host.c_str(), next_port); + SSLClient cli(next_host, next_port); cli.copy_settings(*this); if (ca_cert_store_) { cli.set_ca_cert_store(ca_cert_store_); } return detail::redirect(cli, req, res, path, location, error); @@ -6579,7 +7184,7 @@ inline bool ClientImpl::redirect(Request &req, Response &res, Error &error) { return false; #endif } else { - ClientImpl cli(next_host.c_str(), next_port); + ClientImpl cli(next_host, next_port); cli.copy_settings(*this); return detail::redirect(cli, req, res, path, location, error); } @@ -6588,7 +7193,7 @@ inline bool ClientImpl::redirect(Request &req, Response &res, Error &error) { inline bool ClientImpl::write_content_with_provider(Stream &strm, const Request &req, - Error &error) { + Error &error) const { auto is_shutting_down = []() { return false; }; if (req.is_chunked_content_provider_) { @@ -6616,32 +7221,32 @@ inline bool ClientImpl::write_request(Stream &strm, Request &req, // Prepare additional headers if (close_connection) { if (!req.has_header("Connection")) { - req.headers.emplace("Connection", "close"); + req.set_header("Connection", "close"); } } if (!req.has_header("Host")) { if (is_ssl()) { if (port_ == 443) { - req.headers.emplace("Host", host_); + req.set_header("Host", host_); } else { - req.headers.emplace("Host", host_and_port_); + req.set_header("Host", host_and_port_); } } else { if (port_ == 80) { - req.headers.emplace("Host", host_); + req.set_header("Host", host_); } else { - req.headers.emplace("Host", host_and_port_); + req.set_header("Host", host_and_port_); } } } - if (!req.has_header("Accept")) { req.headers.emplace("Accept", "*/*"); } + if (!req.has_header("Accept")) { req.set_header("Accept", "*/*"); } #ifndef CPPHTTPLIB_NO_DEFAULT_USER_AGENT if (!req.has_header("User-Agent")) { auto agent = std::string("cpp-httplib/") + CPPHTTPLIB_VERSION; - req.headers.emplace("User-Agent", agent); + req.set_header("User-Agent", agent); } #endif @@ -6650,23 +7255,23 @@ inline bool ClientImpl::write_request(Stream &strm, Request &req, if (!req.is_chunked_content_provider_) { if (!req.has_header("Content-Length")) { auto length = std::to_string(req.content_length_); - req.headers.emplace("Content-Length", length); + req.set_header("Content-Length", length); } } } else { if (req.method == "POST" || req.method == "PUT" || req.method == "PATCH") { - req.headers.emplace("Content-Length", "0"); + req.set_header("Content-Length", "0"); } } } else { if (!req.has_header("Content-Type")) { - req.headers.emplace("Content-Type", "text/plain"); + req.set_header("Content-Type", "text/plain"); } if (!req.has_header("Content-Length")) { auto length = 
std::to_string(req.body.size()); - req.headers.emplace("Content-Length", length); + req.set_header("Content-Length", length); } } @@ -6706,7 +7311,7 @@ inline bool ClientImpl::write_request(Stream &strm, Request &req, const auto &path = url_encode_ ? detail::encode_url(req.path) : req.path; bstrm.write_format("%s %s HTTP/1.1\r\n", req.method.c_str(), path.c_str()); - detail::write_headers(bstrm, req.headers); + header_writer_(bstrm, req.headers); // Flush buffer auto &data = bstrm.get_buffer(); @@ -6734,12 +7339,10 @@ inline std::unique_ptr<Response> ClientImpl::send_with_content_provider( ContentProvider content_provider, ContentProviderWithoutLength content_provider_without_length, const std::string &content_type, Error &error) { - if (!content_type.empty()) { - req.headers.emplace("Content-Type", content_type); - } + if (!content_type.empty()) { req.set_header("Content-Type", content_type); } #ifdef CPPHTTPLIB_ZLIB_SUPPORT - if (compress_) { req.headers.emplace("Content-Encoding", "gzip"); } + if (compress_) { req.set_header("Content-Encoding", "gzip"); } #endif #ifdef CPPHTTPLIB_ZLIB_SUPPORT @@ -6800,10 +7403,9 @@ inline std::unique_ptr<Response> ClientImpl::send_with_content_provider( req.content_provider_ = detail::ContentProviderAdapter( std::move(content_provider_without_length)); req.is_chunked_content_provider_ = true; - req.headers.emplace("Transfer-Encoding", "chunked"); + req.set_header("Transfer-Encoding", "chunked"); } else { req.body.assign(body, content_length); - ; } } @@ -6864,7 +7466,8 @@ inline bool ClientImpl::process_request(Stream &strm, Request &req, } // Body - if ((res.status != 204) && req.method != "HEAD" && req.method != "CONNECT") { + if ((res.status != StatusCode::NoContent_204) && req.method != "HEAD" && + req.method != "CONNECT") { auto redirect = 300 < res.status && res.status < 400 && follow_location_; if (req.response_handler && !redirect) { @@ -6909,24 +7512,6 @@ inline bool ClientImpl::process_request(Stream &strm, Request &req, } } - if (res.get_header_value("Connection") == "close" || - (res.version == "HTTP/1.0" && res.reason != "Connection established")) { - // TODO this requires a not-entirely-obvious chain of calls to be correct - // for this to be safe. Maybe a code refactor (such as moving this out to - // the send function and getting rid of the recursiveness of the mutex) - // could make this more obvious. - - // This is safe to call because process_request is only called by - // handle_request which is only called by send, which locks the request - // mutex during the process. It would be a bug to call it from a different - // thread since it's a thread-safety issue to do these things to the socket - // if another thread is using the socket. 
- std::lock_guard<std::mutex> guard(socket_mutex_); - shutdown_ssl(socket_, true); - shutdown_socket(socket_); - close_socket(socket_); - } - // Log if (logger_) { logger_(req, res); } @@ -6935,13 +7520,14 @@ inline ContentProviderWithoutLength ClientImpl::get_multipart_content_provider( const std::string &boundary, const MultipartFormDataItems &items, - const MultipartFormDataProviderItems &provider_items) { - size_t cur_item = 0, cur_start = 0; + const MultipartFormDataProviderItems &provider_items) const { + size_t cur_item = 0; + size_t cur_start = 0; // cur_item and cur_start are copied to within the std::function and maintain // state between successive calls return [&, cur_item, cur_start](size_t offset, DataSink &sink) mutable -> bool { - if (!offset && items.size()) { + if (!offset && !items.empty()) { sink.os << detail::serialize_multipart_formdata(items, boundary, false); return true; } else if (cur_item < provider_items.size()) { @@ -6954,12 +7540,13 @@ inline ContentProviderWithoutLength ClientImpl::get_multipart_content_provider( } DataSink cur_sink; - bool has_data = true; + auto has_data = true; cur_sink.write = sink.write; cur_sink.done = [&]() { has_data = false; }; - if (!provider_items[cur_item].provider(offset - cur_start, cur_sink)) + if (!provider_items[cur_item].provider(offset - cur_start, cur_sink)) { return false; + } if (!has_data) { sink.os << detail::serialize_multipart_formdata_item_end(); @@ -7078,14 +7665,15 @@ inline Result ClientImpl::Get(const std::string &path, const Params &params, if (params.empty()) { return Get(path, headers); } std::string path_with_query = append_query_params(path, params); - return Get(path_with_query.c_str(), headers, progress); + return Get(path_with_query, headers, std::move(progress)); } inline Result ClientImpl::Get(const std::string &path, const Params &params, const Headers &headers, ContentReceiver content_receiver, Progress progress) { - return Get(path, params, headers, nullptr, content_receiver, progress); + return Get(path, params, headers, nullptr, std::move(content_receiver), + std::move(progress)); } inline Result ClientImpl::Get(const std::string &path, const Params &params, @@ -7094,12 +7682,13 @@ inline Result ClientImpl::Get(const std::string &path, const Params &params, ContentReceiver content_receiver, Progress progress) { if (params.empty()) { - return Get(path, headers, response_handler, content_receiver, progress); + return Get(path, headers, std::move(response_handler), + std::move(content_receiver), std::move(progress)); } std::string path_with_query = append_query_params(path, params); - return Get(path_with_query.c_str(), headers, response_handler, - content_receiver, progress); + return Get(path_with_query, headers, std::move(response_handler), + std::move(content_receiver), std::move(progress)); } inline Result ClientImpl::Head(const std::string &path) { @@ -7201,7 +7790,7 @@ inline Result ClientImpl::Post(const std::string &path, const Headers &headers, const auto &content_type = detail::serialize_multipart_formdata_get_content_type(boundary); const auto &body = detail::serialize_multipart_formdata(items, boundary); - return Post(path, headers, body, content_type.c_str()); + return Post(path, headers, body, content_type); } inline Result ClientImpl::Post(const std::string &path, const Headers &headers, @@ -7214,7 +7803,7 @@ inline Result ClientImpl::Post(const std::string &path, const Headers &headers, const auto &content_type = 
detail::serialize_multipart_formdata_get_content_type(boundary); const auto &body = detail::serialize_multipart_formdata(items, boundary); - return Post(path, headers, body, content_type.c_str()); + return Post(path, headers, body, content_type); } inline Result @@ -7423,9 +8012,7 @@ inline Result ClientImpl::Delete(const std::string &path, req.headers = headers; req.path = path; - if (!content_type.empty()) { - req.headers.emplace("Content-Type", content_type); - } + if (!content_type.empty()) { req.set_header("Content-Type", content_type); } req.body.assign(body, content_length); return send_(std::move(req)); @@ -7458,13 +8045,6 @@ inline Result ClientImpl::Options(const std::string &path, return send_(std::move(req)); } -inline size_t ClientImpl::is_socket_open() const { - std::lock_guard<std::mutex> guard(socket_mutex_); - return socket_.is_open(); -} - -inline socket_t ClientImpl::socket() const { return socket_.sock; } - inline void ClientImpl::stop() { std::lock_guard<std::mutex> guard(socket_mutex_); @@ -7488,6 +8068,17 @@ inline void ClientImpl::stop() { close_socket(socket_); } +inline std::string ClientImpl::host() const { return host_; } + +inline int ClientImpl::port() const { return port_; } + +inline size_t ClientImpl::is_socket_open() const { + std::lock_guard<std::mutex> guard(socket_mutex_); + return socket_.is_open(); +} + +inline socket_t ClientImpl::socket() const { return socket_.sock; } + inline void ClientImpl::set_connection_timeout(time_t sec, time_t usec) { connection_timeout_sec_ = sec; connection_timeout_usec_ = usec; @@ -7536,6 +8127,11 @@ inline void ClientImpl::set_default_headers(Headers headers) { default_headers_ = std::move(headers); } +inline void ClientImpl::set_header_writer( + std::function<ssize_t(Stream &, Headers &)> const &writer) { + header_writer_ = writer; +} + inline void ClientImpl::set_address_family(int family) { address_family_ = family; } @@ -7575,9 +8171,7 @@ inline void ClientImpl::set_proxy_digest_auth(const std::string &username, proxy_digest_auth_username_ = username; proxy_digest_auth_password_ = password; } -#endif -#ifdef CPPHTTPLIB_OPENSSL_SUPPORT inline void ClientImpl::set_ca_cert_path(const std::string &ca_cert_file_path, const std::string &ca_cert_dir_path) { ca_cert_file_path_ = ca_cert_file_path; @@ -7589,9 +8183,34 @@ inline void ClientImpl::set_ca_cert_store(X509_STORE *ca_cert_store) { ca_cert_store_ = ca_cert_store; } } -#endif -#ifdef CPPHTTPLIB_OPENSSL_SUPPORT +inline X509_STORE *ClientImpl::create_ca_cert_store(const char *ca_cert, + std::size_t size) const { + auto mem = BIO_new_mem_buf(ca_cert, static_cast<int>(size)); + if (!mem) { return nullptr; } + + auto inf = PEM_X509_INFO_read_bio(mem, nullptr, nullptr, nullptr); + if (!inf) { + BIO_free_all(mem); + return nullptr; + } + + auto cts = X509_STORE_new(); + if (cts) { + for (auto i = 0; i < static_cast<int>(sk_X509_INFO_num(inf)); i++) { + auto itmp = sk_X509_INFO_value(inf, i); + if (!itmp) { continue; } + + if (itmp->x509) { X509_STORE_add_cert(cts, itmp->x509); } + if (itmp->crl) { X509_STORE_add_crl(cts, itmp->crl); } + } + } + + sk_X509_INFO_pop_free(inf, X509_INFO_free); + BIO_free_all(mem); + return cts; +} + inline void ClientImpl::enable_server_certificate_verification(bool enabled) { server_certificate_verification_ = enabled; } @@ -7655,7 +8274,7 @@ bool ssl_connect_or_accept_nonblocking(socket_t sock, SSL *ssl, U ssl_connect_or_accept, time_t timeout_sec, time_t timeout_usec) { - int res = 0; + auto res = 0; while ((res = ssl_connect_or_accept(ssl)) != 1) { auto err = SSL_get_error(ssl, res); switch (err) { @@ -7718,7 
+8337,7 @@ inline SSLSocketStream::SSLSocketStream(socket_t sock, SSL *ssl, SSL_clear_mode(ssl, SSL_MODE_AUTO_RETRY); } -inline SSLSocketStream::~SSLSocketStream() {} +inline SSLSocketStream::~SSLSocketStream() = default; inline bool SSLSocketStream::is_readable() const { return detail::select_read(sock_, read_timeout_sec_, read_timeout_usec_) > 0; @@ -7736,7 +8355,7 @@ inline ssize_t SSLSocketStream::read(char *ptr, size_t size) { auto ret = SSL_read(ssl_, ptr, static_cast<int>(size)); if (ret < 0) { auto err = SSL_get_error(ssl_, ret); - int n = 1000; + auto n = 1000; #ifdef _WIN32 while (--n >= 0 && (err == SSL_ERROR_WANT_READ || (err == SSL_ERROR_SYSCALL && @@ -7769,7 +8388,7 @@ inline ssize_t SSLSocketStream::write(const char *ptr, size_t size) { auto ret = SSL_write(ssl_, ptr, static_cast<int>(handle_size)); if (ret < 0) { auto err = SSL_get_error(ssl_, ret); - int n = 1000; + auto n = 1000; #ifdef _WIN32 while (--n >= 0 && (err == SSL_ERROR_WANT_WRITE || (err == SSL_ERROR_SYSCALL && @@ -7822,10 +8441,10 @@ inline SSLServer::SSLServer(const char *cert_path, const char *private_key_path, SSL_CTX_set_min_proto_version(ctx_, TLS1_1_VERSION); - // add default password callback before opening encrypted private key if (private_key_password != nullptr && (private_key_password[0] != '\0')) { - SSL_CTX_set_default_passwd_cb_userdata(ctx_, - (char *)private_key_password); + SSL_CTX_set_default_passwd_cb_userdata( + ctx_, + reinterpret_cast<void *>(const_cast<char *>(private_key_password))); } if (SSL_CTX_use_certificate_chain_file(ctx_, cert_path) != 1 || @@ -7927,16 +8546,23 @@ inline SSLClient::SSLClient(const std::string &host, int port) inline SSLClient::SSLClient(const std::string &host, int port, const std::string &client_cert_path, - const std::string &client_key_path) + const std::string &client_key_path, + const std::string &private_key_password) : ClientImpl(host, port, client_cert_path, client_key_path) { ctx_ = SSL_CTX_new(TLS_client_method()); detail::split(&host_[0], &host_[host_.size()], '.', [&](const char *b, const char *e) { - host_components_.emplace_back(std::string(b, e)); + host_components_.emplace_back(b, e); }); if (!client_cert_path.empty() && !client_key_path.empty()) { + if (!private_key_password.empty()) { + SSL_CTX_set_default_passwd_cb_userdata( + ctx_, reinterpret_cast<void *>( + const_cast<char *>(private_key_password.c_str()))); + } + if (SSL_CTX_use_certificate_file(ctx_, client_cert_path.c_str(), SSL_FILETYPE_PEM) != 1 || SSL_CTX_use_PrivateKey_file(ctx_, client_key_path.c_str(), @@ -7948,16 +8574,23 @@ inline SSLClient::SSLClient(const std::string &host, int port, inline SSLClient::SSLClient(const std::string &host, int port, - X509 *client_cert, EVP_PKEY *client_key) + X509 *client_cert, EVP_PKEY *client_key, + const std::string &private_key_password) : ClientImpl(host, port) { ctx_ = SSL_CTX_new(TLS_client_method()); detail::split(&host_[0], &host_[host_.size()], '.', [&](const char *b, const char *e) { - host_components_.emplace_back(std::string(b, e)); + host_components_.emplace_back(b, e); }); if (client_cert != nullptr && client_key != nullptr) { + if (!private_key_password.empty()) { + SSL_CTX_set_default_passwd_cb_userdata( + ctx_, reinterpret_cast<void *>( + const_cast<char *>(private_key_password.c_str()))); + } + if (SSL_CTX_use_certificate(ctx_, client_cert) != 1 || SSL_CTX_use_PrivateKey(ctx_, client_key) != 1) { SSL_CTX_free(ctx_); @@ -7989,6 +8622,11 @@ inline void SSLClient::set_ca_cert_store(X509_STORE *ca_cert_store) { } } +inline void SSLClient::load_ca_cert_store(const char *ca_cert, + 
std::size_t size) { + set_ca_cert_store(ClientImpl::create_ca_cert_store(ca_cert, size)); +} + inline long SSLClient::get_openssl_verify_result() const { return verify_result_; } @@ -8003,14 +8641,14 @@ inline bool SSLClient::create_and_connect_socket(Socket &socket, Error &error) { inline bool SSLClient::connect_with_proxy(Socket &socket, Response &res, bool &success, Error &error) { success = true; - Response res2; + Response proxy_res; if (!detail::process_client_socket( socket.sock, read_timeout_sec_, read_timeout_usec_, write_timeout_sec_, write_timeout_usec_, [&](Stream &strm) { Request req2; req2.method = "CONNECT"; req2.path = host_and_port_; - return process_request(strm, req2, res2, false, error); + return process_request(strm, req2, proxy_res, false, error); })) { // Thread-safe to close everything because we are assuming there are no // requests in flight @@ -8021,12 +8659,12 @@ inline bool SSLClient::connect_with_proxy(Socket &socket, Response &res, return false; } - if (res2.status == 407) { + if (proxy_res.status == StatusCode::ProxyAuthenticationRequired_407) { if (!proxy_digest_auth_username_.empty() && !proxy_digest_auth_password_.empty()) { std::map<std::string, std::string> auth; - if (detail::parse_www_authenticate(res2, auth, true)) { - Response res3; + if (detail::parse_www_authenticate(proxy_res, auth, true)) { + proxy_res = Response(); if (!detail::process_client_socket( socket.sock, read_timeout_sec_, read_timeout_usec_, write_timeout_sec_, write_timeout_usec_, [&](Stream &strm) { @@ -8037,7 +8675,7 @@ inline bool SSLClient::connect_with_proxy(Socket &socket, Response &res, req3, auth, 1, detail::random_string(10), proxy_digest_auth_username_, proxy_digest_auth_password_, true)); - return process_request(strm, req3, res3, false, error); + return process_request(strm, req3, proxy_res, false, error); })) { // Thread-safe to close everything because we are assuming there are // no requests in flight @@ -8048,17 +8686,28 @@ inline bool SSLClient::connect_with_proxy(Socket &socket, Response &res, return false; } } - } else { - res = res2; - return false; } } + // If status code is not 200, proxy request is failed. 
+ // Set error to ProxyConnection and return proxy response + // as the response of the request + if (proxy_res.status != StatusCode::OK_200) { + error = Error::ProxyConnection; + res = std::move(proxy_res); + // Thread-safe to close everything because we are assuming there are + // no requests in flight + shutdown_ssl(socket, true); + shutdown_socket(socket); + close_socket(socket); + return false; + } + return true; } inline bool SSLClient::load_certs() { - bool ret = true; + auto ret = true; std::call_once(initialize_cert_, [&]() { std::lock_guard<std::mutex> guard(ctx_mutex_); @@ -8134,7 +8783,11 @@ inline bool SSLClient::initialize_ssl(Socket &socket, Error &error) { return true; }, [&](SSL *ssl2) { - SSL_set_tlsext_host_name(ssl2, host_.c_str()); + // NOTE: Direct call instead of using the OpenSSL macro to suppress + // -Wold-style-cast warning + // SSL_set_tlsext_host_name(ssl2, host_.c_str()); + SSL_ctrl(ssl2, SSL_CTRL_SET_TLSEXT_HOSTNAME, TLSEXT_NAMETYPE_host_name, + static_cast<void *>(const_cast<char *>(host_.c_str()))); return true; }); @@ -8208,8 +8861,8 @@ SSLClient::verify_host_with_subject_alt_name(X509 *server_cert) const { auto type = GEN_DNS; - struct in6_addr addr6; - struct in_addr addr; + struct in6_addr addr6 {}; + struct in_addr addr {}; size_t addr_len = 0; #ifndef __MINGW32__ @@ -8234,8 +8887,9 @@ SSLClient::verify_host_with_subject_alt_name(X509 *server_cert) const { for (decltype(count) i = 0; i < count && !dsn_matched; i++) { auto val = sk_GENERAL_NAME_value(alt_names, i); if (val->type == type) { - auto name = (const char *)ASN1_STRING_get0_data(val->d.ia5); - auto name_len = (size_t)ASN1_STRING_length(val->d.ia5); + auto name = + reinterpret_cast<const char *>(ASN1_STRING_get0_data(val->d.ia5)); + auto name_len = static_cast<size_t>(ASN1_STRING_length(val->d.ia5)); switch (type) { case GEN_DNS: dsn_matched = check_host_name(name, name_len); break; @@ -8253,7 +8907,8 @@ SSLClient::verify_host_with_subject_alt_name(X509 *server_cert) const { if (dsn_matched || ip_matched) { ret = true; } } - GENERAL_NAMES_free((STACK_OF(GENERAL_NAME) *)alt_names); + GENERAL_NAMES_free(const_cast<STACK_OF(GENERAL_NAME) *>( + reinterpret_cast<const STACK_OF(GENERAL_NAME) *>(alt_names))); return ret; } @@ -8282,7 +8937,7 @@ inline bool SSLClient::check_host_name(const char *pattern, std::vector<std::string> pattern_components; detail::split(&pattern[0], &pattern[pattern_len], '.', [&](const char *b, const char *e) { - pattern_components.emplace_back(std::string(b, e)); + pattern_components.emplace_back(b, e); }); if (host_components_.size() != pattern_components.size()) { return false; } @@ -8361,7 +9016,7 @@ inline Client::Client(const std::string &host, int port, : cli_(detail::make_unique<ClientImpl>(host, port, client_cert_path, client_key_path)) {} -inline Client::~Client() {} +inline Client::~Client() = default; inline bool Client::is_valid() const { return cli_ != nullptr && cli_->is_valid(); @@ -8421,19 +9076,20 @@ inline Result Client::Get(const std::string &path, const Headers &headers, } inline Result Client::Get(const std::string &path, const Params &params, const Headers &headers, Progress progress) { - return cli_->Get(path, params, headers, progress); + return cli_->Get(path, params, headers, std::move(progress)); } inline Result Client::Get(const std::string &path, const Params &params, const Headers &headers, ContentReceiver content_receiver, Progress progress) { - return cli_->Get(path, params, headers, content_receiver, progress); + return cli_->Get(path, params, headers, std::move(content_receiver), + std::move(progress)); } inline Result Client::Get(const std::string &path, const Params &params, const 
Headers &headers, ResponseHandler response_handler, ContentReceiver content_receiver, Progress progress) { - return cli_->Get(path, params, headers, response_handler, content_receiver, - progress); + return cli_->Get(path, params, headers, std::move(response_handler), + std::move(content_receiver), std::move(progress)); } inline Result Client::Head(const std::string &path) { return cli_->Head(path); } @@ -8665,12 +9321,16 @@ inline bool Client::send(Request &req, Response &res, Error &error) { inline Result Client::send(const Request &req) { return cli_->send(req); } +inline void Client::stop() { cli_->stop(); } + +inline std::string Client::host() const { return cli_->host(); } + +inline int Client::port() const { return cli_->port(); } + inline size_t Client::is_socket_open() const { return cli_->is_socket_open(); } inline socket_t Client::socket() const { return cli_->socket(); } -inline void Client::stop() { cli_->stop(); } - inline void Client::set_hostname_addr_map(std::map<std::string, std::string> addr_map) { cli_->set_hostname_addr_map(std::move(addr_map)); @@ -8680,6 +9340,11 @@ inline void Client::set_default_headers(Headers headers) { cli_->set_default_headers(std::move(headers)); } +inline void Client::set_header_writer( + std::function<ssize_t(Stream &, Headers &)> const &writer) { + cli_->set_header_writer(writer); +} + inline void Client::set_address_family(int family) { cli_->set_address_family(family); } @@ -8754,7 +9419,9 @@ inline void Client::enable_server_certificate_verification(bool enabled) { } #endif -inline void Client::set_logger(Logger logger) { cli_->set_logger(logger); } +inline void Client::set_logger(Logger logger) { + cli_->set_logger(std::move(logger)); +} #ifdef CPPHTTPLIB_OPENSSL_SUPPORT inline void Client::set_ca_cert_path(const std::string &ca_cert_file_path, @@ -8770,6 +9437,10 @@ inline void Client::set_ca_cert_store(X509_STORE *ca_cert_store) { } } +inline void Client::load_ca_cert_store(const char *ca_cert, std::size_t size) { + set_ca_cert_store(cli_->create_ca_cert_store(ca_cert, size)); +} + inline long Client::get_openssl_verify_result() const { if (is_ssl_) { return static_cast<SSLClient &>(*cli_).get_openssl_verify_result(); diff --git a/examples/server/json.hpp b/examples/server/json.hpp index ea945f346..a858728c4 100644 --- a/examples/server/json.hpp +++ b/examples/server/json.hpp @@ -1,9 +1,9 @@ // __ _____ _____ _____ // __| | __| | | | JSON for Modern C++ -// | | |__ | | | | | | version 3.11.2 +// | | |__ | | | | | | version 3.11.3 // |_____|_____|_____|_|___| https://github.com/nlohmann/json // -// SPDX-FileCopyrightText: 2013-2022 Niels Lohmann +// SPDX-FileCopyrightText: 2013-2023 Niels Lohmann // SPDX-License-Identifier: MIT /****************************************************************************\ @@ -27,7 +27,6 @@ #endif // JSON_NO_IO #include <iterator> // random_access_iterator_tag #include <memory> // unique_ptr -#include <numeric> // accumulate #include <string> // string, stoi, to_string #include <utility> // declval, forward, move, pair, swap #include <vector> // vector @@ -35,10 +34,10 @@ // #include // __ _____ _____ _____ // __| | __| | | | JSON for Modern C++ -// | | |__ | | | | | | version 3.11.2 +// | | |__ | | | | | | version 3.11.3 // |_____|_____|_____|_|___| https://github.com/nlohmann/json // -// SPDX-FileCopyrightText: 2013-2022 Niels Lohmann +// SPDX-FileCopyrightText: 2013-2023 Niels Lohmann // SPDX-License-Identifier: MIT @@ -48,10 +47,10 @@ // #include // __ _____ _____ _____ // __| | __| | | | JSON for Modern C++ -// | | |__ | | | | | | version 3.11.2 +// |_____|_____|_____|_|___| 
https://github.com/nlohmann/json // -// SPDX-FileCopyrightText: 2013-2022 Niels Lohmann +// SPDX-FileCopyrightText: 2013-2023 Niels Lohmann // SPDX-License-Identifier: MIT @@ -60,7 +59,7 @@ #ifndef JSON_SKIP_LIBRARY_VERSION_CHECK #if defined(NLOHMANN_JSON_VERSION_MAJOR) && defined(NLOHMANN_JSON_VERSION_MINOR) && defined(NLOHMANN_JSON_VERSION_PATCH) - #if NLOHMANN_JSON_VERSION_MAJOR != 3 || NLOHMANN_JSON_VERSION_MINOR != 11 || NLOHMANN_JSON_VERSION_PATCH != 2 + #if NLOHMANN_JSON_VERSION_MAJOR != 3 || NLOHMANN_JSON_VERSION_MINOR != 11 || NLOHMANN_JSON_VERSION_PATCH != 3 #warning "Already included a different version of the library!" #endif #endif @@ -68,7 +67,7 @@ #define NLOHMANN_JSON_VERSION_MAJOR 3 // NOLINT(modernize-macro-to-enum) #define NLOHMANN_JSON_VERSION_MINOR 11 // NOLINT(modernize-macro-to-enum) -#define NLOHMANN_JSON_VERSION_PATCH 2 // NOLINT(modernize-macro-to-enum) +#define NLOHMANN_JSON_VERSION_PATCH 3 // NOLINT(modernize-macro-to-enum) #ifndef JSON_DIAGNOSTICS #define JSON_DIAGNOSTICS 0 @@ -150,10 +149,10 @@ // #include // __ _____ _____ _____ // __| | __| | | | JSON for Modern C++ -// | | |__ | | | | | | version 3.11.2 +// | | |__ | | | | | | version 3.11.3 // |_____|_____|_____|_|___| https://github.com/nlohmann/json // -// SPDX-FileCopyrightText: 2013-2022 Niels Lohmann +// SPDX-FileCopyrightText: 2013-2023 Niels Lohmann // SPDX-License-Identifier: MIT @@ -173,16 +172,19 @@ // #include // __ _____ _____ _____ // __| | __| | | | JSON for Modern C++ -// | | |__ | | | | | | version 3.11.2 +// | | |__ | | | | | | version 3.11.3 // |_____|_____|_____|_|___| https://github.com/nlohmann/json // -// SPDX-FileCopyrightText: 2013-2022 Niels Lohmann +// SPDX-FileCopyrightText: 2013-2023 Niels Lohmann // SPDX-License-Identifier: MIT #include <cstddef> // nullptr_t #include <exception> // exception +#if JSON_DIAGNOSTICS + #include <numeric> // accumulate +#endif #include <stdexcept> // runtime_error #include <string> // to_string #include <vector> // vector @@ -190,10 +192,10 @@ // #include // __ _____ _____ _____ // __| | __| | | | JSON for Modern C++ -// | | |__ | | | | | | version 3.11.2 +// | | |__ | | | | | | version 3.11.3 // |_____|_____|_____|_|___| https://github.com/nlohmann/json // -// SPDX-FileCopyrightText: 2013-2022 Niels Lohmann +// SPDX-FileCopyrightText: 2013-2023 Niels Lohmann // SPDX-License-Identifier: MIT @@ -206,10 +208,10 @@ // #include // __ _____ _____ _____ // __| | __| | | | JSON for Modern C++ -// | | |__ | | | | | | version 3.11.2 +// | | |__ | | | | | | version 3.11.3 // |_____|_____|_____|_|___| https://github.com/nlohmann/json // -// SPDX-FileCopyrightText: 2013-2022 Niels Lohmann +// SPDX-FileCopyrightText: 2013-2023 Niels Lohmann // SPDX-License-Identifier: MIT @@ -218,10 +220,10 @@ // #include // __ _____ _____ _____ // __| | __| | | | JSON for Modern C++ -// | | |__ | | | | | | version 3.11.2 +// | | |__ | | | | | | version 3.11.3 // |_____|_____|_____|_|___| https://github.com/nlohmann/json // -// SPDX-FileCopyrightText: 2013-2022 Niels Lohmann +// SPDX-FileCopyrightText: 2013-2023 Niels Lohmann // SPDX-License-Identifier: MIT @@ -231,10 +233,10 @@ // #include // __ _____ _____ _____ // __| | __| | | | JSON for Modern C++ -// | | |__ | | | | | | version 3.11.2 +// | | |__ | | | | | | version 3.11.3 // |_____|_____|_____|_|___| https://github.com/nlohmann/json // -// SPDX-FileCopyrightText: 2013-2022 Niels Lohmann +// SPDX-FileCopyrightText: 2013-2023 Niels Lohmann // SPDX-License-Identifier: MIT @@ -318,10 +320,10 @@ NLOHMANN_JSON_NAMESPACE_END // __ _____ _____ _____ // __| | __| | | | JSON for Modern C++ 
-// | | |__ | | | | | | version 3.11.2 +// | | |__ | | | | | | version 3.11.3 // |_____|_____|_____|_|___| https://github.com/nlohmann/json // -// SPDX-FileCopyrightText: 2013-2022 Niels Lohmann +// SPDX-FileCopyrightText: 2013-2023 Niels Lohmann // SPDX-FileCopyrightText: 2016-2021 Evan Nemerson // SPDX-License-Identifier: MIT @@ -2483,6 +2485,14 @@ JSON_HEDLEY_DIAGNOSTIC_POP #endif #endif +#ifndef JSON_HAS_STATIC_RTTI + #if !defined(_HAS_STATIC_RTTI) || _HAS_STATIC_RTTI != 0 + #define JSON_HAS_STATIC_RTTI 1 + #else + #define JSON_HAS_STATIC_RTTI 0 + #endif +#endif + #ifdef JSON_HAS_CPP_17 #define JSON_INLINE_VARIABLE inline #else @@ -2590,12 +2600,13 @@ JSON_HEDLEY_DIAGNOSTIC_POP class NumberUnsignedType, class NumberFloatType, \ template class AllocatorType, \ template class JSONSerializer, \ - class BinaryType> + class BinaryType, \ + class CustomBaseClass> #define NLOHMANN_BASIC_JSON_TPL \ basic_json + AllocatorType, JSONSerializer, BinaryType, CustomBaseClass> // Macros to simplify conversion from/to types @@ -2745,7 +2756,10 @@ JSON_HEDLEY_DIAGNOSTIC_POP #define NLOHMANN_DEFINE_TYPE_INTRUSIVE_WITH_DEFAULT(Type, ...) \ friend void to_json(nlohmann::json& nlohmann_json_j, const Type& nlohmann_json_t) { NLOHMANN_JSON_EXPAND(NLOHMANN_JSON_PASTE(NLOHMANN_JSON_TO, __VA_ARGS__)) } \ - friend void from_json(const nlohmann::json& nlohmann_json_j, Type& nlohmann_json_t) { Type nlohmann_json_default_obj; NLOHMANN_JSON_EXPAND(NLOHMANN_JSON_PASTE(NLOHMANN_JSON_FROM_WITH_DEFAULT, __VA_ARGS__)) } + friend void from_json(const nlohmann::json& nlohmann_json_j, Type& nlohmann_json_t) { const Type nlohmann_json_default_obj{}; NLOHMANN_JSON_EXPAND(NLOHMANN_JSON_PASTE(NLOHMANN_JSON_FROM_WITH_DEFAULT, __VA_ARGS__)) } + +#define NLOHMANN_DEFINE_TYPE_INTRUSIVE_ONLY_SERIALIZE(Type, ...) \ + friend void to_json(nlohmann::json& nlohmann_json_j, const Type& nlohmann_json_t) { NLOHMANN_JSON_EXPAND(NLOHMANN_JSON_PASTE(NLOHMANN_JSON_TO, __VA_ARGS__)) } /*! @brief macro @@ -2756,10 +2770,12 @@ JSON_HEDLEY_DIAGNOSTIC_POP inline void to_json(nlohmann::json& nlohmann_json_j, const Type& nlohmann_json_t) { NLOHMANN_JSON_EXPAND(NLOHMANN_JSON_PASTE(NLOHMANN_JSON_TO, __VA_ARGS__)) } \ inline void from_json(const nlohmann::json& nlohmann_json_j, Type& nlohmann_json_t) { NLOHMANN_JSON_EXPAND(NLOHMANN_JSON_PASTE(NLOHMANN_JSON_FROM, __VA_ARGS__)) } +#define NLOHMANN_DEFINE_TYPE_NON_INTRUSIVE_ONLY_SERIALIZE(Type, ...) \ + inline void to_json(nlohmann::json& nlohmann_json_j, const Type& nlohmann_json_t) { NLOHMANN_JSON_EXPAND(NLOHMANN_JSON_PASTE(NLOHMANN_JSON_TO, __VA_ARGS__)) } + #define NLOHMANN_DEFINE_TYPE_NON_INTRUSIVE_WITH_DEFAULT(Type, ...) \ inline void to_json(nlohmann::json& nlohmann_json_j, const Type& nlohmann_json_t) { NLOHMANN_JSON_EXPAND(NLOHMANN_JSON_PASTE(NLOHMANN_JSON_TO, __VA_ARGS__)) } \ - inline void from_json(const nlohmann::json& nlohmann_json_j, Type& nlohmann_json_t) { Type nlohmann_json_default_obj; NLOHMANN_JSON_EXPAND(NLOHMANN_JSON_PASTE(NLOHMANN_JSON_FROM_WITH_DEFAULT, __VA_ARGS__)) } - + inline void from_json(const nlohmann::json& nlohmann_json_j, Type& nlohmann_json_t) { const Type nlohmann_json_default_obj{}; NLOHMANN_JSON_EXPAND(NLOHMANN_JSON_PASTE(NLOHMANN_JSON_FROM_WITH_DEFAULT, __VA_ARGS__)) } // inspired from https://stackoverflow.com/a/26745591 // allows to call any std function as if (e.g. 
with begin): @@ -2923,10 +2939,10 @@ NLOHMANN_JSON_NAMESPACE_END // #include // __ _____ _____ _____ // __| | __| | | | JSON for Modern C++ -// | | |__ | | | | | | version 3.11.2 +// | | |__ | | | | | | version 3.11.3 // |_____|_____|_____|_|___| https://github.com/nlohmann/json // -// SPDX-FileCopyrightText: 2013-2022 Niels Lohmann +// SPDX-FileCopyrightText: 2013-2023 Niels Lohmann // SPDX-License-Identifier: MIT @@ -2998,10 +3014,10 @@ NLOHMANN_JSON_NAMESPACE_END // #include // __ _____ _____ _____ // __| | __| | | | JSON for Modern C++ -// | | |__ | | | | | | version 3.11.2 +// | | |__ | | | | | | version 3.11.3 // |_____|_____|_____|_|___| https://github.com/nlohmann/json // -// SPDX-FileCopyrightText: 2013-2022 Niels Lohmann +// SPDX-FileCopyrightText: 2013-2023 Niels Lohmann // SPDX-License-Identifier: MIT @@ -3040,10 +3056,10 @@ NLOHMANN_JSON_NAMESPACE_END // #include // __ _____ _____ _____ // __| | __| | | | JSON for Modern C++ -// | | |__ | | | | | | version 3.11.2 +// | | |__ | | | | | | version 3.11.3 // |_____|_____|_____|_|___| https://github.com/nlohmann/json // -// SPDX-FileCopyrightText: 2013-2022 Niels Lohmann +// SPDX-FileCopyrightText: 2013-2023 Niels Lohmann // SPDX-FileCopyrightText: 2018 The Abseil Authors // SPDX-License-Identifier: MIT @@ -3214,10 +3230,10 @@ NLOHMANN_JSON_NAMESPACE_END // #include // __ _____ _____ _____ // __| | __| | | | JSON for Modern C++ -// | | |__ | | | | | | version 3.11.2 +// | | |__ | | | | | | version 3.11.3 // |_____|_____|_____|_|___| https://github.com/nlohmann/json // -// SPDX-FileCopyrightText: 2013-2022 Niels Lohmann +// SPDX-FileCopyrightText: 2013-2023 Niels Lohmann // SPDX-License-Identifier: MIT @@ -3226,14 +3242,15 @@ NLOHMANN_JSON_NAMESPACE_END #include // false_type, is_constructible, is_integral, is_same, true_type #include // declval #include // tuple +#include // char_traits // #include // __ _____ _____ _____ // __| | __| | | | JSON for Modern C++ -// | | |__ | | | | | | version 3.11.2 +// | | |__ | | | | | | version 3.11.3 // |_____|_____|_____|_|___| https://github.com/nlohmann/json // -// SPDX-FileCopyrightText: 2013-2022 Niels Lohmann +// SPDX-FileCopyrightText: 2013-2023 Niels Lohmann // SPDX-License-Identifier: MIT @@ -3298,10 +3315,10 @@ NLOHMANN_JSON_NAMESPACE_END // #include // __ _____ _____ _____ // __| | __| | | | JSON for Modern C++ -// | | |__ | | | | | | version 3.11.2 +// | | |__ | | | | | | version 3.11.3 // |_____|_____|_____|_|___| https://github.com/nlohmann/json // -// SPDX-FileCopyrightText: 2013-2022 Niels Lohmann +// SPDX-FileCopyrightText: 2013-2023 Niels Lohmann // SPDX-License-Identifier: MIT @@ -3318,10 +3335,10 @@ NLOHMANN_JSON_NAMESPACE_END // #include // __ _____ _____ _____ // __| | __| | | | JSON for Modern C++ -// | | |__ | | | | | | version 3.11.2 +// | | |__ | | | | | | version 3.11.3 // |_____|_____|_____|_|___| https://github.com/nlohmann/json // -// SPDX-FileCopyrightText: 2013-2022 Niels Lohmann +// SPDX-FileCopyrightText: 2013-2023 Niels Lohmann // SPDX-License-Identifier: MIT @@ -3342,10 +3359,10 @@ NLOHMANN_JSON_NAMESPACE_END // #include // __ _____ _____ _____ // __| | __| | | | JSON for Modern C++ -// | | |__ | | | | | | version 3.11.2 +// | | |__ | | | | | | version 3.11.3 // |_____|_____|_____|_|___| https://github.com/nlohmann/json // -// SPDX-FileCopyrightText: 2013-2022 Niels Lohmann +// SPDX-FileCopyrightText: 2013-2023 Niels Lohmann // SPDX-License-Identifier: MIT #ifndef INCLUDE_NLOHMANN_JSON_FWD_HPP_ @@ -3389,7 +3406,8 @@ NLOHMANN_JSON_NAMESPACE_END template class 
AllocatorType = std::allocator, template class JSONSerializer = adl_serializer, - class BinaryType = std::vector> + class BinaryType = std::vector, // cppcheck-suppress syntaxError + class CustomBaseClass = void> class basic_json; /// @brief JSON Pointer defines a string syntax for identifying a specific value within a JSON document @@ -3577,6 +3595,63 @@ struct actual_object_comparator template using actual_object_comparator_t = typename actual_object_comparator::type; +///////////////// +// char_traits // +///////////////// + +// Primary template of char_traits calls std char_traits +template +struct char_traits : std::char_traits +{}; + +// Explicitly define char traits for unsigned char since it is not standard +template<> +struct char_traits : std::char_traits +{ + using char_type = unsigned char; + using int_type = uint64_t; + + // Redefine to_int_type function + static int_type to_int_type(char_type c) noexcept + { + return static_cast(c); + } + + static char_type to_char_type(int_type i) noexcept + { + return static_cast(i); + } + + static constexpr int_type eof() noexcept + { + return static_cast(EOF); + } +}; + +// Explicitly define char traits for signed char since it is not standard +template<> +struct char_traits : std::char_traits +{ + using char_type = signed char; + using int_type = uint64_t; + + // Redefine to_int_type function + static int_type to_int_type(char_type c) noexcept + { + return static_cast(c); + } + + static char_type to_char_type(int_type i) noexcept + { + return static_cast(i); + } + + static constexpr int_type eof() noexcept + { + return static_cast(EOF); + } +}; + /////////////////// // is_ functions // /////////////////// @@ -3613,7 +3688,6 @@ template struct is_default_constructible> : conjunction...> {}; - template struct is_constructible : std::is_constructible {}; @@ -3629,7 +3703,6 @@ struct is_constructible> : is_default_constructible struct is_constructible> : is_default_constructible> {}; - template struct is_iterator_traits : std::false_type {}; @@ -4039,7 +4112,6 @@ struct value_in_range_of_impl2 } }; - template struct value_in_range_of_impl2 { @@ -4138,10 +4210,10 @@ NLOHMANN_JSON_NAMESPACE_END // #include // __ _____ _____ _____ // __| | __| | | | JSON for Modern C++ -// | | |__ | | | | | | version 3.11.2 +// | | |__ | | | | | | version 3.11.3 // |_____|_____|_____|_|___| https://github.com/nlohmann/json // -// SPDX-FileCopyrightText: 2013-2022 Niels Lohmann +// SPDX-FileCopyrightText: 2013-2023 Niels Lohmann // SPDX-License-Identifier: MIT @@ -4165,28 +4237,28 @@ inline std::size_t concat_length() } template -inline std::size_t concat_length(const char* cstr, Args&& ... rest); +inline std::size_t concat_length(const char* cstr, const Args& ... rest); template -inline std::size_t concat_length(const StringType& str, Args&& ... rest); +inline std::size_t concat_length(const StringType& str, const Args& ... rest); template -inline std::size_t concat_length(const char /*c*/, Args&& ... rest) +inline std::size_t concat_length(const char /*c*/, const Args& ... rest) { - return 1 + concat_length(std::forward(rest)...); + return 1 + concat_length(rest...); } template -inline std::size_t concat_length(const char* cstr, Args&& ... rest) +inline std::size_t concat_length(const char* cstr, const Args& ... rest) { // cppcheck-suppress ignoredReturnValue - return ::strlen(cstr) + concat_length(std::forward(rest)...); + return ::strlen(cstr) + concat_length(rest...); } template -inline std::size_t concat_length(const StringType& str, Args&& ... 
rest) +inline std::size_t concat_length(const StringType& str, const Args& ... rest) { - return str.size() + concat_length(std::forward(rest)...); + return str.size() + concat_length(rest...); } template @@ -4277,7 +4349,7 @@ template inline OutStringType concat(Args && ... args) { OutStringType str; - str.reserve(concat_length(std::forward(args)...)); + str.reserve(concat_length(args...)); concat_into(str, std::forward(args)...); return str; } @@ -4286,7 +4358,6 @@ inline OutStringType concat(Args && ... args) NLOHMANN_JSON_NAMESPACE_END - NLOHMANN_JSON_NAMESPACE_BEGIN namespace detail { @@ -4334,9 +4405,9 @@ class exception : public std::exception { case value_t::array: { - for (std::size_t i = 0; i < current->m_parent->m_value.array->size(); ++i) + for (std::size_t i = 0; i < current->m_parent->m_data.m_value.array->size(); ++i) { - if (¤t->m_parent->m_value.array->operator[](i) == current) + if (¤t->m_parent->m_data.m_value.array->operator[](i) == current) { tokens.emplace_back(std::to_string(i)); break; @@ -4347,7 +4418,7 @@ class exception : public std::exception case value_t::object: { - for (const auto& element : *current->m_parent->m_value.object) + for (const auto& element : *current->m_parent->m_data.m_value.object) { if (&element.second == current) { @@ -4410,17 +4481,17 @@ class parse_error : public exception template::value, int> = 0> static parse_error create(int id_, const position_t& pos, const std::string& what_arg, BasicJsonContext context) { - std::string w = concat(exception::name("parse_error", id_), "parse error", - position_string(pos), ": ", exception::diagnostics(context), what_arg); + const std::string w = concat(exception::name("parse_error", id_), "parse error", + position_string(pos), ": ", exception::diagnostics(context), what_arg); return {id_, pos.chars_read_total, w.c_str()}; } template::value, int> = 0> static parse_error create(int id_, std::size_t byte_, const std::string& what_arg, BasicJsonContext context) { - std::string w = concat(exception::name("parse_error", id_), "parse error", - (byte_ != 0 ? (concat(" at byte ", std::to_string(byte_))) : ""), - ": ", exception::diagnostics(context), what_arg); + const std::string w = concat(exception::name("parse_error", id_), "parse error", + (byte_ != 0 ? 
(concat(" at byte ", std::to_string(byte_))) : ""), + ": ", exception::diagnostics(context), what_arg); return {id_, byte_, w.c_str()}; } @@ -4454,7 +4525,7 @@ class invalid_iterator : public exception template::value, int> = 0> static invalid_iterator create(int id_, const std::string& what_arg, BasicJsonContext context) { - std::string w = concat(exception::name("invalid_iterator", id_), exception::diagnostics(context), what_arg); + const std::string w = concat(exception::name("invalid_iterator", id_), exception::diagnostics(context), what_arg); return {id_, w.c_str()}; } @@ -4472,7 +4543,7 @@ class type_error : public exception template::value, int> = 0> static type_error create(int id_, const std::string& what_arg, BasicJsonContext context) { - std::string w = concat(exception::name("type_error", id_), exception::diagnostics(context), what_arg); + const std::string w = concat(exception::name("type_error", id_), exception::diagnostics(context), what_arg); return {id_, w.c_str()}; } @@ -4489,7 +4560,7 @@ class out_of_range : public exception template::value, int> = 0> static out_of_range create(int id_, const std::string& what_arg, BasicJsonContext context) { - std::string w = concat(exception::name("out_of_range", id_), exception::diagnostics(context), what_arg); + const std::string w = concat(exception::name("out_of_range", id_), exception::diagnostics(context), what_arg); return {id_, w.c_str()}; } @@ -4506,7 +4577,7 @@ class other_error : public exception template::value, int> = 0> static other_error create(int id_, const std::string& what_arg, BasicJsonContext context) { - std::string w = concat(exception::name("other_error", id_), exception::diagnostics(context), what_arg); + const std::string w = concat(exception::name("other_error", id_), exception::diagnostics(context), what_arg); return {id_, w.c_str()}; } @@ -4525,10 +4596,10 @@ NLOHMANN_JSON_NAMESPACE_END // #include // __ _____ _____ _____ // __| | __| | | | JSON for Modern C++ -// | | |__ | | | | | | version 3.11.2 +// | | |__ | | | | | | version 3.11.3 // |_____|_____|_____|_|___| https://github.com/nlohmann/json // -// SPDX-FileCopyrightText: 2013-2022 Niels Lohmann +// SPDX-FileCopyrightText: 2013-2023 Niels Lohmann // SPDX-License-Identifier: MIT @@ -4549,10 +4620,10 @@ NLOHMANN_JSON_NAMESPACE_END // #include // __ _____ _____ _____ // __| | __| | | | JSON for Modern C++ -// | | |__ | | | | | | version 3.11.2 +// | | |__ | | | | | | version 3.11.3 // |_____|_____|_____|_|___| https://github.com/nlohmann/json // -// SPDX-FileCopyrightText: 2013-2022 Niels Lohmann +// SPDX-FileCopyrightText: 2013-2023 Niels Lohmann // SPDX-License-Identifier: MIT @@ -5055,10 +5126,10 @@ NLOHMANN_JSON_NAMESPACE_END // #include // __ _____ _____ _____ // __| | __| | | | JSON for Modern C++ -// | | |__ | | | | | | version 3.11.2 +// | | |__ | | | | | | version 3.11.3 // |_____|_____|_____|_|___| https://github.com/nlohmann/json // -// SPDX-FileCopyrightText: 2013-2022 Niels Lohmann +// SPDX-FileCopyrightText: 2013-2023 Niels Lohmann // SPDX-License-Identifier: MIT @@ -5075,10 +5146,10 @@ NLOHMANN_JSON_NAMESPACE_END // #include // __ _____ _____ _____ // __| | __| | | | JSON for Modern C++ -// | | |__ | | | | | | version 3.11.2 +// | | |__ | | | | | | version 3.11.3 // |_____|_____|_____|_|___| https://github.com/nlohmann/json // -// SPDX-FileCopyrightText: 2013-2022 Niels Lohmann +// SPDX-FileCopyrightText: 2013-2023 Niels Lohmann // SPDX-License-Identifier: MIT @@ -5147,10 +5218,10 @@ template class iteration_proxy_value // older GCCs are a 
bit fussy and require explicit noexcept specifiers on defaulted functions iteration_proxy_value(iteration_proxy_value&&) noexcept(std::is_nothrow_move_constructible::value - && std::is_nothrow_move_constructible::value) = default; + && std::is_nothrow_move_constructible::value) = default; // NOLINT(hicpp-noexcept-move,performance-noexcept-move-constructor,cppcoreguidelines-noexcept-move-operations) iteration_proxy_value& operator=(iteration_proxy_value&&) noexcept(std::is_nothrow_move_assignable::value - && std::is_nothrow_move_assignable::value) = default; + && std::is_nothrow_move_assignable::value) = default; // NOLINT(hicpp-noexcept-move,performance-noexcept-move-constructor,cppcoreguidelines-noexcept-move-operations) ~iteration_proxy_value() = default; /// dereference operator (needed for range-based for) @@ -5297,11 +5368,11 @@ namespace std #pragma clang diagnostic ignored "-Wmismatched-tags" #endif template -class tuple_size<::nlohmann::detail::iteration_proxy_value> +class tuple_size<::nlohmann::detail::iteration_proxy_value> // NOLINT(cert-dcl58-cpp) : public std::integral_constant {}; template -class tuple_element> +class tuple_element> // NOLINT(cert-dcl58-cpp) { public: using type = decltype( @@ -5340,7 +5411,7 @@ namespace detail /* * Note all external_constructor<>::construct functions need to call - * j.m_value.destroy(j.m_type) to avoid a memory leak in case j contains an + * j.m_data.m_value.destroy(j.m_data.m_type) to avoid a memory leak in case j contains an * allocated value (e.g., a string). See bug issue * https://github.com/nlohmann/json/issues/2865 for more information. */ @@ -5353,9 +5424,9 @@ struct external_constructor template static void construct(BasicJsonType& j, typename BasicJsonType::boolean_t b) noexcept { - j.m_value.destroy(j.m_type); - j.m_type = value_t::boolean; - j.m_value = b; + j.m_data.m_value.destroy(j.m_data.m_type); + j.m_data.m_type = value_t::boolean; + j.m_data.m_value = b; j.assert_invariant(); } }; @@ -5366,18 +5437,18 @@ struct external_constructor template static void construct(BasicJsonType& j, const typename BasicJsonType::string_t& s) { - j.m_value.destroy(j.m_type); - j.m_type = value_t::string; - j.m_value = s; + j.m_data.m_value.destroy(j.m_data.m_type); + j.m_data.m_type = value_t::string; + j.m_data.m_value = s; j.assert_invariant(); } template static void construct(BasicJsonType& j, typename BasicJsonType::string_t&& s) { - j.m_value.destroy(j.m_type); - j.m_type = value_t::string; - j.m_value = std::move(s); + j.m_data.m_value.destroy(j.m_data.m_type); + j.m_data.m_type = value_t::string; + j.m_data.m_value = std::move(s); j.assert_invariant(); } @@ -5386,9 +5457,9 @@ struct external_constructor int > = 0 > static void construct(BasicJsonType& j, const CompatibleStringType& str) { - j.m_value.destroy(j.m_type); - j.m_type = value_t::string; - j.m_value.string = j.template create(str); + j.m_data.m_value.destroy(j.m_data.m_type); + j.m_data.m_type = value_t::string; + j.m_data.m_value.string = j.template create(str); j.assert_invariant(); } }; @@ -5399,18 +5470,18 @@ struct external_constructor template static void construct(BasicJsonType& j, const typename BasicJsonType::binary_t& b) { - j.m_value.destroy(j.m_type); - j.m_type = value_t::binary; - j.m_value = typename BasicJsonType::binary_t(b); + j.m_data.m_value.destroy(j.m_data.m_type); + j.m_data.m_type = value_t::binary; + j.m_data.m_value = typename BasicJsonType::binary_t(b); j.assert_invariant(); } template static void construct(BasicJsonType& j, typename 
BasicJsonType::binary_t&& b) { - j.m_value.destroy(j.m_type); - j.m_type = value_t::binary; - j.m_value = typename BasicJsonType::binary_t(std::move(b)); + j.m_data.m_value.destroy(j.m_data.m_type); + j.m_data.m_type = value_t::binary; + j.m_data.m_value = typename BasicJsonType::binary_t(std::move(b)); j.assert_invariant(); } }; @@ -5421,9 +5492,9 @@ struct external_constructor template static void construct(BasicJsonType& j, typename BasicJsonType::number_float_t val) noexcept { - j.m_value.destroy(j.m_type); - j.m_type = value_t::number_float; - j.m_value = val; + j.m_data.m_value.destroy(j.m_data.m_type); + j.m_data.m_type = value_t::number_float; + j.m_data.m_value = val; j.assert_invariant(); } }; @@ -5434,9 +5505,9 @@ struct external_constructor template static void construct(BasicJsonType& j, typename BasicJsonType::number_unsigned_t val) noexcept { - j.m_value.destroy(j.m_type); - j.m_type = value_t::number_unsigned; - j.m_value = val; + j.m_data.m_value.destroy(j.m_data.m_type); + j.m_data.m_type = value_t::number_unsigned; + j.m_data.m_value = val; j.assert_invariant(); } }; @@ -5447,9 +5518,9 @@ struct external_constructor template static void construct(BasicJsonType& j, typename BasicJsonType::number_integer_t val) noexcept { - j.m_value.destroy(j.m_type); - j.m_type = value_t::number_integer; - j.m_value = val; + j.m_data.m_value.destroy(j.m_data.m_type); + j.m_data.m_type = value_t::number_integer; + j.m_data.m_value = val; j.assert_invariant(); } }; @@ -5460,9 +5531,9 @@ struct external_constructor template static void construct(BasicJsonType& j, const typename BasicJsonType::array_t& arr) { - j.m_value.destroy(j.m_type); - j.m_type = value_t::array; - j.m_value = arr; + j.m_data.m_value.destroy(j.m_data.m_type); + j.m_data.m_type = value_t::array; + j.m_data.m_value = arr; j.set_parents(); j.assert_invariant(); } @@ -5470,9 +5541,9 @@ struct external_constructor template static void construct(BasicJsonType& j, typename BasicJsonType::array_t&& arr) { - j.m_value.destroy(j.m_type); - j.m_type = value_t::array; - j.m_value = std::move(arr); + j.m_data.m_value.destroy(j.m_data.m_type); + j.m_data.m_type = value_t::array; + j.m_data.m_value = std::move(arr); j.set_parents(); j.assert_invariant(); } @@ -5485,9 +5556,9 @@ struct external_constructor using std::begin; using std::end; - j.m_value.destroy(j.m_type); - j.m_type = value_t::array; - j.m_value.array = j.template create(begin(arr), end(arr)); + j.m_data.m_value.destroy(j.m_data.m_type); + j.m_data.m_type = value_t::array; + j.m_data.m_value.array = j.template create(begin(arr), end(arr)); j.set_parents(); j.assert_invariant(); } @@ -5495,14 +5566,14 @@ struct external_constructor template static void construct(BasicJsonType& j, const std::vector& arr) { - j.m_value.destroy(j.m_type); - j.m_type = value_t::array; - j.m_value = value_t::array; - j.m_value.array->reserve(arr.size()); + j.m_data.m_value.destroy(j.m_data.m_type); + j.m_data.m_type = value_t::array; + j.m_data.m_value = value_t::array; + j.m_data.m_value.array->reserve(arr.size()); for (const bool x : arr) { - j.m_value.array->push_back(x); - j.set_parent(j.m_value.array->back()); + j.m_data.m_value.array->push_back(x); + j.set_parent(j.m_data.m_value.array->back()); } j.assert_invariant(); } @@ -5511,13 +5582,13 @@ struct external_constructor enable_if_t::value, int> = 0> static void construct(BasicJsonType& j, const std::valarray& arr) { - j.m_value.destroy(j.m_type); - j.m_type = value_t::array; - j.m_value = value_t::array; - 
j.m_value.array->resize(arr.size()); + j.m_data.m_value.destroy(j.m_data.m_type); + j.m_data.m_type = value_t::array; + j.m_data.m_value = value_t::array; + j.m_data.m_value.array->resize(arr.size()); if (arr.size() > 0) { - std::copy(std::begin(arr), std::end(arr), j.m_value.array->begin()); + std::copy(std::begin(arr), std::end(arr), j.m_data.m_value.array->begin()); } j.set_parents(); j.assert_invariant(); @@ -5530,9 +5601,9 @@ struct external_constructor template static void construct(BasicJsonType& j, const typename BasicJsonType::object_t& obj) { - j.m_value.destroy(j.m_type); - j.m_type = value_t::object; - j.m_value = obj; + j.m_data.m_value.destroy(j.m_data.m_type); + j.m_data.m_type = value_t::object; + j.m_data.m_value = obj; j.set_parents(); j.assert_invariant(); } @@ -5540,9 +5611,9 @@ struct external_constructor template static void construct(BasicJsonType& j, typename BasicJsonType::object_t&& obj) { - j.m_value.destroy(j.m_type); - j.m_type = value_t::object; - j.m_value = std::move(obj); + j.m_data.m_value.destroy(j.m_data.m_type); + j.m_data.m_type = value_t::object; + j.m_data.m_value = std::move(obj); j.set_parents(); j.assert_invariant(); } @@ -5554,9 +5625,9 @@ struct external_constructor using std::begin; using std::end; - j.m_value.destroy(j.m_type); - j.m_type = value_t::object; - j.m_value.object = j.template create(begin(obj), end(obj)); + j.m_data.m_value.destroy(j.m_data.m_type); + j.m_data.m_type = value_t::object; + j.m_data.m_value.object = j.template create(begin(obj), end(obj)); j.set_parents(); j.assert_invariant(); } @@ -5626,7 +5697,8 @@ template::type; - external_constructor::construct(j, static_cast(e)); + static constexpr value_t integral_value_t = std::is_unsigned::value ? value_t::number_unsigned : value_t::number_integer; + external_constructor::construct(j, static_cast(e)); } #endif // JSON_DISABLE_ENUM_SERIALIZATION @@ -5796,10 +5868,10 @@ NLOHMANN_JSON_NAMESPACE_END // #include // __ _____ _____ _____ // __| | __| | | | JSON for Modern C++ -// | | |__ | | | | | | version 3.11.2 +// | | |__ | | | | | | version 3.11.3 // |_____|_____|_____|_|___| https://github.com/nlohmann/json // -// SPDX-FileCopyrightText: 2013-2022 Niels Lohmann +// SPDX-FileCopyrightText: 2013-2023 Niels Lohmann // SPDX-License-Identifier: MIT @@ -5908,10 +5980,10 @@ NLOHMANN_JSON_NAMESPACE_END // #include // __ _____ _____ _____ // __| | __| | | | JSON for Modern C++ -// | | |__ | | | | | | version 3.11.2 +// | | |__ | | | | | | version 3.11.3 // |_____|_____|_____|_|___| https://github.com/nlohmann/json // -// SPDX-FileCopyrightText: 2013-2022 Niels Lohmann +// SPDX-FileCopyrightText: 2013-2023 Niels Lohmann // SPDX-License-Identifier: MIT @@ -6041,10 +6113,10 @@ NLOHMANN_JSON_NAMESPACE_END // #include // __ _____ _____ _____ // __| | __| | | | JSON for Modern C++ -// | | |__ | | | | | | version 3.11.2 +// | | |__ | | | | | | version 3.11.3 // |_____|_____|_____|_|___| https://github.com/nlohmann/json // -// SPDX-FileCopyrightText: 2013-2022 Niels Lohmann +// SPDX-FileCopyrightText: 2013-2023 Niels Lohmann // SPDX-License-Identifier: MIT @@ -6067,10 +6139,10 @@ NLOHMANN_JSON_NAMESPACE_END // #include // __ _____ _____ _____ // __| | __| | | | JSON for Modern C++ -// | | |__ | | | | | | version 3.11.2 +// | | |__ | | | | | | version 3.11.3 // |_____|_____|_____|_|___| https://github.com/nlohmann/json // -// SPDX-FileCopyrightText: 2013-2022 Niels Lohmann +// SPDX-FileCopyrightText: 2013-2023 Niels Lohmann // SPDX-License-Identifier: MIT @@ -6094,6 +6166,8 @@ 
NLOHMANN_JSON_NAMESPACE_END // #include +// #include + NLOHMANN_JSON_NAMESPACE_BEGIN namespace detail @@ -6140,7 +6214,6 @@ class file_input_adapter std::FILE* m_file; }; - /*! Input adapter for a (caching) istream. Ignores a UFT Byte Order Mark at beginning of input. Does not support changing the underlying std::streambuf @@ -6214,16 +6287,16 @@ class iterator_input_adapter : current(std::move(first)), end(std::move(last)) {} - typename std::char_traits::int_type get_character() + typename char_traits::int_type get_character() { if (JSON_HEDLEY_LIKELY(current != end)) { - auto result = std::char_traits::to_int_type(*current); + auto result = char_traits::to_int_type(*current); std::advance(current, 1); return result; } - return std::char_traits::eof(); + return char_traits::eof(); } private: @@ -6239,7 +6312,6 @@ class iterator_input_adapter } }; - template struct wide_string_input_helper; @@ -6363,7 +6435,7 @@ struct wide_string_input_helper } }; -// Wraps another input apdater to convert wide character types into individual bytes. +// Wraps another input adapter to convert wide character types into individual bytes. template class wide_string_input_adapter { @@ -6408,7 +6480,6 @@ class wide_string_input_adapter std::size_t utf8_bytes_filled = 0; }; - template struct iterator_input_adapter_factory { @@ -6565,10 +6636,10 @@ NLOHMANN_JSON_NAMESPACE_END // #include // __ _____ _____ _____ // __| | __| | | | JSON for Modern C++ -// | | |__ | | | | | | version 3.11.2 +// | | |__ | | | | | | version 3.11.3 // |_____|_____|_____|_|___| https://github.com/nlohmann/json // -// SPDX-FileCopyrightText: 2013-2022 Niels Lohmann +// SPDX-FileCopyrightText: 2013-2023 Niels Lohmann // SPDX-License-Identifier: MIT @@ -6710,7 +6781,6 @@ struct json_sax virtual ~json_sax() = default; }; - namespace detail { /*! 
@@ -6812,7 +6882,7 @@ class json_sax_dom_parser JSON_ASSERT(ref_stack.back()->is_object()); // add null at given key and store the reference for later - object_element = &(ref_stack.back()->m_value.object->operator[](val)); + object_element = &(ref_stack.back()->m_data.m_value.object->operator[](val)); return true; } @@ -6887,8 +6957,8 @@ class json_sax_dom_parser if (ref_stack.back()->is_array()) { - ref_stack.back()->m_value.array->emplace_back(std::forward(v)); - return &(ref_stack.back()->m_value.array->back()); + ref_stack.back()->m_data.m_value.array->emplace_back(std::forward(v)); + return &(ref_stack.back()->m_data.m_value.array->back()); } JSON_ASSERT(ref_stack.back()->is_object()); @@ -7007,7 +7077,7 @@ class json_sax_dom_callback_parser // add discarded value at given key and store the reference for later if (keep && ref_stack.back()) { - object_element = &(ref_stack.back()->m_value.object->operator[](val) = discarded); + object_element = &(ref_stack.back()->m_data.m_value.object->operator[](val) = discarded); } return true; @@ -7092,7 +7162,7 @@ class json_sax_dom_callback_parser // remove discarded value if (!keep && !ref_stack.empty() && ref_stack.back()->is_array()) { - ref_stack.back()->m_value.array->pop_back(); + ref_stack.back()->m_data.m_value.array->pop_back(); } return true; @@ -7159,7 +7229,7 @@ class json_sax_dom_callback_parser if (ref_stack.empty()) { root = std::move(value); - return {true, &root}; + return {true, & root}; } // skip this value if we already decided to skip the parent @@ -7175,8 +7245,8 @@ class json_sax_dom_callback_parser // array if (ref_stack.back()->is_array()) { - ref_stack.back()->m_value.array->emplace_back(std::move(value)); - return {true, &(ref_stack.back()->m_value.array->back())}; + ref_stack.back()->m_data.m_value.array->emplace_back(std::move(value)); + return {true, & (ref_stack.back()->m_data.m_value.array->back())}; } // object @@ -7201,9 +7271,9 @@ class json_sax_dom_callback_parser /// stack to model hierarchy of values std::vector ref_stack {}; /// stack to manage which values to keep - std::vector keep_stack {}; + std::vector keep_stack {}; // NOLINT(readability-redundant-member-init) /// stack to manage which object keys to keep - std::vector key_keep_stack {}; + std::vector key_keep_stack {}; // NOLINT(readability-redundant-member-init) /// helper to hold the reference for the next object element BasicJsonType* object_element = nullptr; /// whether a syntax error occurred @@ -7298,10 +7368,10 @@ NLOHMANN_JSON_NAMESPACE_END // #include // __ _____ _____ _____ // __| | __| | | | JSON for Modern C++ -// | | |__ | | | | | | version 3.11.2 +// | | |__ | | | | | | version 3.11.3 // |_____|_____|_____|_|___| https://github.com/nlohmann/json // -// SPDX-FileCopyrightText: 2013-2022 Niels Lohmann +// SPDX-FileCopyrightText: 2013-2023 Niels Lohmann // SPDX-License-Identifier: MIT @@ -7322,6 +7392,8 @@ NLOHMANN_JSON_NAMESPACE_END // #include +// #include + NLOHMANN_JSON_NAMESPACE_BEGIN namespace detail @@ -7416,7 +7488,7 @@ class lexer : public lexer_base using number_float_t = typename BasicJsonType::number_float_t; using string_t = typename BasicJsonType::string_t; using char_type = typename InputAdapterType::char_type; - using char_int_type = typename std::char_traits::int_type; + using char_int_type = typename char_traits::int_type; public: using token_type = typename lexer_base::token_type; @@ -7523,7 +7595,7 @@ class lexer : public lexer_base for (auto range = ranges.begin(); range != ranges.end(); ++range) { get(); - if 
(JSON_HEDLEY_LIKELY(*range <= current && current <= *(++range))) + if (JSON_HEDLEY_LIKELY(*range <= current && current <= *(++range))) // NOLINT(bugprone-inc-dec-in-conditions) { add(current); } @@ -7566,7 +7638,7 @@ class lexer : public lexer_base switch (get()) { // end of file while parsing string - case std::char_traits::eof(): + case char_traits::eof(): { error_message = "invalid string: missing closing quote"; return token_type::parse_error; @@ -8155,7 +8227,7 @@ class lexer : public lexer_base { case '\n': case '\r': - case std::char_traits::eof(): + case char_traits::eof(): case '\0': return true; @@ -8172,7 +8244,7 @@ class lexer : public lexer_base { switch (get()) { - case std::char_traits::eof(): + case char_traits::eof(): case '\0': { error_message = "invalid comment; missing closing '*/'"; @@ -8601,10 +8673,10 @@ scan_number_done: token_type scan_literal(const char_type* literal_text, const std::size_t length, token_type return_type) { - JSON_ASSERT(std::char_traits::to_char_type(current) == literal_text[0]); + JSON_ASSERT(char_traits::to_char_type(current) == literal_text[0]); for (std::size_t i = 1; i < length; ++i) { - if (JSON_HEDLEY_UNLIKELY(std::char_traits::to_char_type(get()) != literal_text[i])) + if (JSON_HEDLEY_UNLIKELY(char_traits::to_char_type(get()) != literal_text[i])) { error_message = "invalid literal"; return token_type::parse_error; @@ -8622,7 +8694,7 @@ scan_number_done: { token_buffer.clear(); token_string.clear(); - token_string.push_back(std::char_traits::to_char_type(current)); + token_string.push_back(char_traits::to_char_type(current)); } /* @@ -8630,7 +8702,7 @@ scan_number_done: This function provides the interface to the used input adapter. It does not throw in case the input reached EOF, but returns a - `std::char_traits::eof()` in that case. Stores the scanned characters + `char_traits::eof()` in that case. Stores the scanned characters for use in error messages. 
@return character read from the input @@ -8650,9 +8722,9 @@ scan_number_done: current = ia.get_character(); } - if (JSON_HEDLEY_LIKELY(current != std::char_traits::eof())) + if (JSON_HEDLEY_LIKELY(current != char_traits::eof())) { - token_string.push_back(std::char_traits::to_char_type(current)); + token_string.push_back(char_traits::to_char_type(current)); } if (current == '\n') @@ -8691,7 +8763,7 @@ scan_number_done: --position.chars_read_current_line; } - if (JSON_HEDLEY_LIKELY(current != std::char_traits::eof())) + if (JSON_HEDLEY_LIKELY(current != char_traits::eof())) { JSON_ASSERT(!token_string.empty()); token_string.pop_back(); @@ -8885,7 +8957,7 @@ scan_number_done: // end of input (the null byte is needed when parsing from // string literals) case '\0': - case std::char_traits::eof(): + case char_traits::eof(): return token_type::end_of_input; // error @@ -8903,7 +8975,7 @@ scan_number_done: const bool ignore_comments = false; /// the current character - char_int_type current = std::char_traits::eof(); + char_int_type current = char_traits::eof(); /// whether the next get() call should just return current bool next_unget = false; @@ -8937,10 +9009,10 @@ NLOHMANN_JSON_NAMESPACE_END // #include // __ _____ _____ _____ // __| | __| | | | JSON for Modern C++ -// | | |__ | | | | | | version 3.11.2 +// | | |__ | | | | | | version 3.11.3 // |_____|_____|_____|_|___| https://github.com/nlohmann/json // -// SPDX-FileCopyrightText: 2013-2022 Niels Lohmann +// SPDX-FileCopyrightText: 2013-2023 Niels Lohmann // SPDX-License-Identifier: MIT @@ -9129,7 +9201,6 @@ static inline bool little_endianness(int num = 1) noexcept return *reinterpret_cast(&num) == 1; } - /////////////////// // binary reader // /////////////////// @@ -9147,7 +9218,7 @@ class binary_reader using binary_t = typename BasicJsonType::binary_t; using json_sax_t = SAX; using char_type = typename InputAdapterType::char_type; - using char_int_type = typename std::char_traits::int_type; + using char_int_type = typename char_traits::int_type; public: /*! @@ -9220,7 +9291,7 @@ class binary_reader get(); } - if (JSON_HEDLEY_UNLIKELY(current != std::char_traits::eof())) + if (JSON_HEDLEY_UNLIKELY(current != char_traits::eof())) { return sax->parse_error(chars_read, get_token_string(), parse_error::create(110, chars_read, exception_message(input_format, concat("expected end of input; last byte: 0x", get_token_string()), "value"), nullptr)); @@ -9303,7 +9374,7 @@ class binary_reader exception_message(input_format_t::bson, concat("string length must be at least 1, is ", std::to_string(len)), "string"), nullptr)); } - return get_string(input_format_t::bson, len - static_cast(1), result) && get() != std::char_traits::eof(); + return get_string(input_format_t::bson, len - static_cast(1), result) && get() != char_traits::eof(); } /*! @@ -9404,7 +9475,7 @@ class binary_reader { std::array cr{{}}; static_cast((std::snprintf)(cr.data(), cr.size(), "%.2hhX", static_cast(element_type))); // NOLINT(cppcoreguidelines-pro-type-vararg,hicpp-vararg) - std::string cr_str{cr.data()}; + const std::string cr_str{cr.data()}; return sax->parse_error(element_type_parse_position, cr_str, parse_error::create(114, element_type_parse_position, concat("Unsupported BSON record type 0x", cr_str), nullptr)); } @@ -9497,7 +9568,7 @@ class binary_reader switch (get_char ? 
get() : current) { // EOF - case std::char_traits::eof(): + case char_traits::eof(): return unexpect_eof(input_format_t::cbor, "value"); // Integer 0x00..0x17 (0..23) @@ -10272,7 +10343,7 @@ class binary_reader switch (get()) { // EOF - case std::char_traits::eof(): + case char_traits::eof(): return unexpect_eof(input_format_t::msgpack, "value"); // positive fixint @@ -11339,7 +11410,7 @@ class binary_reader exception_message(input_format, concat("expected '#' after type information; last byte: 0x", last_token), "size"), nullptr)); } - bool is_error = get_ubjson_size_value(result.first, is_ndarray); + const bool is_error = get_ubjson_size_value(result.first, is_ndarray); if (input_format == input_format_t::bjdata && is_ndarray) { if (inside_ndarray) @@ -11354,7 +11425,7 @@ class binary_reader if (current == '#') { - bool is_error = get_ubjson_size_value(result.first, is_ndarray); + const bool is_error = get_ubjson_size_value(result.first, is_ndarray); if (input_format == input_format_t::bjdata && is_ndarray) { return sax->parse_error(chars_read, get_token_string(), parse_error::create(112, chars_read, @@ -11374,7 +11445,7 @@ class binary_reader { switch (prefix) { - case std::char_traits::eof(): // EOF + case char_traits::eof(): // EOF return unexpect_eof(input_format, "value"); case 'T': // true @@ -11819,7 +11890,7 @@ class binary_reader This function provides the interface to the used input adapter. It does not throw in case the input reached EOF, but returns a -'ve valued - `std::char_traits::eof()` in that case. + `char_traits::eof()` in that case. @return character read from the input */ @@ -11961,7 +12032,7 @@ class binary_reader JSON_HEDLEY_NON_NULL(3) bool unexpect_eof(const input_format_t format, const char* context) const { - if (JSON_HEDLEY_UNLIKELY(current == std::char_traits::eof())) + if (JSON_HEDLEY_UNLIKELY(current == char_traits::eof())) { return sax->parse_error(chars_read, "", parse_error::create(110, chars_read, exception_message(format, "unexpected end of input", context), nullptr)); @@ -12028,7 +12099,7 @@ class binary_reader InputAdapterType ia; /// the current character - char_int_type current = std::char_traits::eof(); + char_int_type current = char_traits::eof(); /// the number of characters read std::size_t chars_read = 0; @@ -12090,10 +12161,10 @@ NLOHMANN_JSON_NAMESPACE_END // #include // __ _____ _____ _____ // __| | __| | | | JSON for Modern C++ -// | | |__ | | | | | | version 3.11.2 +// | | |__ | | | | | | version 3.11.3 // |_____|_____|_____|_|___| https://github.com/nlohmann/json // -// SPDX-FileCopyrightText: 2013-2022 Niels Lohmann +// SPDX-FileCopyrightText: 2013-2023 Niels Lohmann // SPDX-License-Identifier: MIT @@ -12439,13 +12510,25 @@ class parser m_lexer.get_token_string(), parse_error::create(101, m_lexer.get_position(), exception_message(token_type::uninitialized, "value"), nullptr)); } + case token_type::end_of_input: + { + if (JSON_HEDLEY_UNLIKELY(m_lexer.get_position().chars_read_total == 1)) + { + return sax->parse_error(m_lexer.get_position(), + m_lexer.get_token_string(), + parse_error::create(101, m_lexer.get_position(), + "attempting to parse an empty input; check that your input string or stream contains the expected JSON", nullptr)); + } + return sax->parse_error(m_lexer.get_position(), + m_lexer.get_token_string(), + parse_error::create(101, m_lexer.get_position(), exception_message(token_type::literal_or_value, "value"), nullptr)); + } case token_type::uninitialized: case token_type::end_array: case token_type::end_object: case 
token_type::name_separator: case token_type::value_separator: - case token_type::end_of_input: case token_type::literal_or_value: default: // the last token was unexpected { @@ -12607,10 +12690,10 @@ NLOHMANN_JSON_NAMESPACE_END // #include // __ _____ _____ _____ // __| | __| | | | JSON for Modern C++ -// | | |__ | | | | | | version 3.11.2 +// | | |__ | | | | | | version 3.11.3 // |_____|_____|_____|_|___| https://github.com/nlohmann/json // -// SPDX-FileCopyrightText: 2013-2022 Niels Lohmann +// SPDX-FileCopyrightText: 2013-2023 Niels Lohmann // SPDX-License-Identifier: MIT @@ -12620,10 +12703,10 @@ NLOHMANN_JSON_NAMESPACE_END // #include // __ _____ _____ _____ // __| | __| | | | JSON for Modern C++ -// | | |__ | | | | | | version 3.11.2 +// | | |__ | | | | | | version 3.11.3 // |_____|_____|_____|_|___| https://github.com/nlohmann/json // -// SPDX-FileCopyrightText: 2013-2022 Niels Lohmann +// SPDX-FileCopyrightText: 2013-2023 Niels Lohmann // SPDX-License-Identifier: MIT @@ -12779,10 +12862,10 @@ NLOHMANN_JSON_NAMESPACE_END // #include // __ _____ _____ _____ // __| | __| | | | JSON for Modern C++ -// | | |__ | | | | | | version 3.11.2 +// | | |__ | | | | | | version 3.11.3 // |_____|_____|_____|_|___| https://github.com/nlohmann/json // -// SPDX-FileCopyrightText: 2013-2022 Niels Lohmann +// SPDX-FileCopyrightText: 2013-2023 Niels Lohmann // SPDX-License-Identifier: MIT @@ -12887,7 +12970,7 @@ class iter_impl // NOLINT(cppcoreguidelines-special-member-functions,hicpp-speci { JSON_ASSERT(m_object != nullptr); - switch (m_object->m_type) + switch (m_object->m_data.m_type) { case value_t::object: { @@ -12984,17 +13067,17 @@ class iter_impl // NOLINT(cppcoreguidelines-special-member-functions,hicpp-speci { JSON_ASSERT(m_object != nullptr); - switch (m_object->m_type) + switch (m_object->m_data.m_type) { case value_t::object: { - m_it.object_iterator = m_object->m_value.object->begin(); + m_it.object_iterator = m_object->m_data.m_value.object->begin(); break; } case value_t::array: { - m_it.array_iterator = m_object->m_value.array->begin(); + m_it.array_iterator = m_object->m_data.m_value.array->begin(); break; } @@ -13028,17 +13111,17 @@ class iter_impl // NOLINT(cppcoreguidelines-special-member-functions,hicpp-speci { JSON_ASSERT(m_object != nullptr); - switch (m_object->m_type) + switch (m_object->m_data.m_type) { case value_t::object: { - m_it.object_iterator = m_object->m_value.object->end(); + m_it.object_iterator = m_object->m_data.m_value.object->end(); break; } case value_t::array: { - m_it.array_iterator = m_object->m_value.array->end(); + m_it.array_iterator = m_object->m_data.m_value.array->end(); break; } @@ -13067,17 +13150,17 @@ class iter_impl // NOLINT(cppcoreguidelines-special-member-functions,hicpp-speci { JSON_ASSERT(m_object != nullptr); - switch (m_object->m_type) + switch (m_object->m_data.m_type) { case value_t::object: { - JSON_ASSERT(m_it.object_iterator != m_object->m_value.object->end()); + JSON_ASSERT(m_it.object_iterator != m_object->m_data.m_value.object->end()); return m_it.object_iterator->second; } case value_t::array: { - JSON_ASSERT(m_it.array_iterator != m_object->m_value.array->end()); + JSON_ASSERT(m_it.array_iterator != m_object->m_data.m_value.array->end()); return *m_it.array_iterator; } @@ -13111,17 +13194,17 @@ class iter_impl // NOLINT(cppcoreguidelines-special-member-functions,hicpp-speci { JSON_ASSERT(m_object != nullptr); - switch (m_object->m_type) + switch (m_object->m_data.m_type) { case value_t::object: { - 
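The new end_of_input case above distinguishes "nothing was read at all" (chars_read_total == 1) from a premature end of input, so an empty string or stream now gets an actionable message instead of the generic one. A small illustration of the resulting behavior; json::parse and json::parse_error are the library's documented API, not shown in this patch:

    #include <iostream>
    #include <nlohmann/json.hpp>

    int main()
    {
        try
        {
            auto j = nlohmann::json::parse("");  // empty input
        }
        catch (const nlohmann::json::parse_error& e)
        {
            // with the hunk above, the message reads:
            // "attempting to parse an empty input; check that your input
            //  string or stream contains the expected JSON"
            std::cerr << e.what() << '\n';
        }
    }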
JSON_ASSERT(m_it.object_iterator != m_object->m_value.object->end()); + JSON_ASSERT(m_it.object_iterator != m_object->m_data.m_value.object->end()); return &(m_it.object_iterator->second); } case value_t::array: { - JSON_ASSERT(m_it.array_iterator != m_object->m_value.array->end()); + JSON_ASSERT(m_it.array_iterator != m_object->m_data.m_value.array->end()); return &*m_it.array_iterator; } @@ -13164,7 +13247,7 @@ class iter_impl // NOLINT(cppcoreguidelines-special-member-functions,hicpp-speci { JSON_ASSERT(m_object != nullptr); - switch (m_object->m_type) + switch (m_object->m_data.m_type) { case value_t::object: { @@ -13215,7 +13298,7 @@ class iter_impl // NOLINT(cppcoreguidelines-special-member-functions,hicpp-speci { JSON_ASSERT(m_object != nullptr); - switch (m_object->m_type) + switch (m_object->m_data.m_type) { case value_t::object: { @@ -13262,7 +13345,7 @@ class iter_impl // NOLINT(cppcoreguidelines-special-member-functions,hicpp-speci JSON_ASSERT(m_object != nullptr); - switch (m_object->m_type) + switch (m_object->m_data.m_type) { case value_t::object: return (m_it.object_iterator == other.m_it.object_iterator); @@ -13307,7 +13390,7 @@ class iter_impl // NOLINT(cppcoreguidelines-special-member-functions,hicpp-speci JSON_ASSERT(m_object != nullptr); - switch (m_object->m_type) + switch (m_object->m_data.m_type) { case value_t::object: JSON_THROW(invalid_iterator::create(213, "cannot compare order of object iterators", m_object)); @@ -13363,7 +13446,7 @@ class iter_impl // NOLINT(cppcoreguidelines-special-member-functions,hicpp-speci { JSON_ASSERT(m_object != nullptr); - switch (m_object->m_type) + switch (m_object->m_data.m_type) { case value_t::object: JSON_THROW(invalid_iterator::create(209, "cannot use offsets with object iterators", m_object)); @@ -13442,7 +13525,7 @@ class iter_impl // NOLINT(cppcoreguidelines-special-member-functions,hicpp-speci { JSON_ASSERT(m_object != nullptr); - switch (m_object->m_type) + switch (m_object->m_data.m_type) { case value_t::object: JSON_THROW(invalid_iterator::create(209, "cannot use offsets with object iterators", m_object)); @@ -13471,7 +13554,7 @@ class iter_impl // NOLINT(cppcoreguidelines-special-member-functions,hicpp-speci { JSON_ASSERT(m_object != nullptr); - switch (m_object->m_type) + switch (m_object->m_data.m_type) { case value_t::object: JSON_THROW(invalid_iterator::create(208, "cannot use operator[] for object iterators", m_object)); @@ -13541,10 +13624,10 @@ NLOHMANN_JSON_NAMESPACE_END // #include // __ _____ _____ _____ // __| | __| | | | JSON for Modern C++ -// | | |__ | | | | | | version 3.11.2 +// | | |__ | | | | | | version 3.11.3 // |_____|_____|_____|_|___| https://github.com/nlohmann/json // -// SPDX-FileCopyrightText: 2013-2022 Niels Lohmann +// SPDX-FileCopyrightText: 2013-2023 Niels Lohmann // SPDX-License-Identifier: MIT @@ -13673,13 +13756,55 @@ NLOHMANN_JSON_NAMESPACE_END // #include +// #include +// __ _____ _____ _____ +// __| | __| | | | JSON for Modern C++ +// | | |__ | | | | | | version 3.11.3 +// |_____|_____|_____|_|___| https://github.com/nlohmann/json +// +// SPDX-FileCopyrightText: 2013-2023 Niels Lohmann +// SPDX-License-Identifier: MIT + + + +#include // conditional, is_same + +// #include + + +NLOHMANN_JSON_NAMESPACE_BEGIN +namespace detail +{ + +/*! +@brief Default base class of the @ref basic_json class. 
+ +So that the correct implementations of the copy / move ctors / assign operators +of @ref basic_json do not require complex case distinctions +(no base class / custom base class used as customization point), +@ref basic_json always has a base class. +By default, this class is used because it is empty and thus has no effect +on the behavior of @ref basic_json. +*/ +struct json_default_base {}; + +template +using json_base_class = typename std::conditional < + std::is_same::value, + json_default_base, + T + >::type; + +} // namespace detail +NLOHMANN_JSON_NAMESPACE_END + // #include // __ _____ _____ _____ // __| | __| | | | JSON for Modern C++ -// | | |__ | | | | | | version 3.11.2 +// | | |__ | | | | | | version 3.11.3 // |_____|_____|_____|_|___| https://github.com/nlohmann/json // -// SPDX-FileCopyrightText: 2013-2022 Niels Lohmann +// SPDX-FileCopyrightText: 2013-2023 Niels Lohmann // SPDX-License-Identifier: MIT @@ -13911,7 +14036,7 @@ class json_pointer const char* p = s.c_str(); char* p_end = nullptr; errno = 0; // strtoull doesn't reset errno - unsigned long long res = std::strtoull(p, &p_end, 10); // NOLINT(runtime/int) + const unsigned long long res = std::strtoull(p, &p_end, 10); // NOLINT(runtime/int) if (p == p_end // invalid input or empty string || errno == ERANGE // out of range || JSON_HEDLEY_UNLIKELY(static_cast(p_end - p) != s.size())) // incomplete read @@ -14067,7 +14192,7 @@ class json_pointer if (reference_token == "-") { // explicitly treat "-" as index beyond the end - ptr = &ptr->operator[](ptr->m_value.array->size()); + ptr = &ptr->operator[](ptr->m_data.m_value.array->size()); } else { @@ -14119,7 +14244,7 @@ class json_pointer { // "-" always fails the range check JSON_THROW(detail::out_of_range::create(402, detail::concat( - "array index '-' (", std::to_string(ptr->m_value.array->size()), + "array index '-' (", std::to_string(ptr->m_data.m_value.array->size()), ") is out of range"), ptr)); } @@ -14176,7 +14301,7 @@ class json_pointer if (JSON_HEDLEY_UNLIKELY(reference_token == "-")) { // "-" cannot be used for const access - JSON_THROW(detail::out_of_range::create(402, detail::concat("array index '-' (", std::to_string(ptr->m_value.array->size()), ") is out of range"), ptr)); + JSON_THROW(detail::out_of_range::create(402, detail::concat("array index '-' (", std::to_string(ptr->m_data.m_value.array->size()), ") is out of range"), ptr)); } // use unchecked array access @@ -14226,7 +14351,7 @@ class json_pointer { // "-" always fails the range check JSON_THROW(detail::out_of_range::create(402, detail::concat( - "array index '-' (", std::to_string(ptr->m_value.array->size()), + "array index '-' (", std::to_string(ptr->m_data.m_value.array->size()), ") is out of range"), ptr)); } @@ -14421,7 +14546,7 @@ class json_pointer { case detail::value_t::array: { - if (value.m_value.array->empty()) + if (value.m_data.m_value.array->empty()) { // flatten empty array as null result[reference_string] = nullptr; @@ -14429,10 +14554,10 @@ class json_pointer else { // iterate array and use index as reference string - for (std::size_t i = 0; i < value.m_value.array->size(); ++i) + for (std::size_t i = 0; i < value.m_data.m_value.array->size(); ++i) { flatten(detail::concat(reference_string, '/', std::to_string(i)), - value.m_value.array->operator[](i), result); + value.m_data.m_value.array->operator[](i), result); } } break; @@ -14440,7 +14565,7 @@ class json_pointer case detail::value_t::object: { - if (value.m_value.object->empty()) + if (value.m_data.m_value.object->empty()) { 
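Per the comment above, json_base_class is a customization point: basic_json in 3.11.3 gains a trailing CustomBaseClass template parameter defaulting to void, which json_base_class maps onto the empty json_default_base. A sketch of what this enables, assuming the documented parameter order and defaults (only the customization point itself appears in this patch, so treat the instantiation below as illustrative):

    #include <cstdint>
    #include <nlohmann/json.hpp>

    // user-supplied base: every JSON value carries one extra flag
    struct visitable
    {
        bool visited = false;
    };

    using annotated_json = nlohmann::basic_json<
        std::map, std::vector, std::string, bool,
        std::int64_t, std::uint64_t, double, std::allocator,
        nlohmann::adl_serializer, std::vector<std::uint8_t>,
        visitable>;  // the new final template argument

    int main()
    {
        annotated_json j = 42;
        j.visited = true;  // base-class members are visible on the value
    }

Because a base class is injected unconditionally (empty by default), the copy/move special members of basic_json need no case distinction, which is exactly the rationale the comment gives.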
// flatten empty object as null result[reference_string] = nullptr; @@ -14448,7 +14573,7 @@ class json_pointer else { // iterate object and use keys as reference string - for (const auto& element : *value.m_value.object) + for (const auto& element : *value.m_data.m_value.object) { flatten(detail::concat(reference_string, '/', detail::escape(element.first)), element.second, result); } @@ -14495,7 +14620,7 @@ class json_pointer BasicJsonType result; // iterate the JSON object values - for (const auto& element : *value.m_value.object) + for (const auto& element : *value.m_data.m_value.object) { if (JSON_HEDLEY_UNLIKELY(!element.second.is_primitive())) { @@ -14671,10 +14796,10 @@ NLOHMANN_JSON_NAMESPACE_END // #include // __ _____ _____ _____ // __| | __| | | | JSON for Modern C++ -// | | |__ | | | | | | version 3.11.2 +// | | |__ | | | | | | version 3.11.3 // |_____|_____|_____|_|___| https://github.com/nlohmann/json // -// SPDX-FileCopyrightText: 2013-2022 Niels Lohmann +// SPDX-FileCopyrightText: 2013-2023 Niels Lohmann // SPDX-License-Identifier: MIT @@ -14763,10 +14888,10 @@ NLOHMANN_JSON_NAMESPACE_END // #include // __ _____ _____ _____ // __| | __| | | | JSON for Modern C++ -// | | |__ | | | | | | version 3.11.2 +// | | |__ | | | | | | version 3.11.3 // |_____|_____|_____|_|___| https://github.com/nlohmann/json // -// SPDX-FileCopyrightText: 2013-2022 Niels Lohmann +// SPDX-FileCopyrightText: 2013-2023 Niels Lohmann // SPDX-License-Identifier: MIT @@ -14789,10 +14914,10 @@ NLOHMANN_JSON_NAMESPACE_END // #include // __ _____ _____ _____ // __| | __| | | | JSON for Modern C++ -// | | |__ | | | | | | version 3.11.2 +// | | |__ | | | | | | version 3.11.3 // |_____|_____|_____|_|___| https://github.com/nlohmann/json // -// SPDX-FileCopyrightText: 2013-2022 Niels Lohmann +// SPDX-FileCopyrightText: 2013-2023 Niels Lohmann // SPDX-License-Identifier: MIT @@ -14978,7 +15103,7 @@ class binary_writer { case value_t::object: { - write_bson_object(*j.m_value.object); + write_bson_object(*j.m_data.m_value.object); break; } @@ -15013,7 +15138,7 @@ class binary_writer case value_t::boolean: { - oa->write_character(j.m_value.boolean + oa->write_character(j.m_data.m_value.boolean ? to_char_type(0xF5) : to_char_type(0xF4)); break; @@ -15021,42 +15146,42 @@ class binary_writer case value_t::number_integer: { - if (j.m_value.number_integer >= 0) + if (j.m_data.m_value.number_integer >= 0) { // CBOR does not differentiate between positive signed // integers and unsigned integers. Therefore, we used the // code from the value_t::number_unsigned case here. 
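The flatten hunks above show the scheme: array indices and object keys become JSON Pointer reference strings, and empty containers collapse to null since they have no addressable leaves. For example, using the library's public flatten() helper:

    #include <iostream>
    #include <nlohmann/json.hpp>

    int main()
    {
        nlohmann::json j = {{"a", {1, 2}}, {"b", nlohmann::json::object()}};

        // array elements become "/a/0", "/a/1"; the empty object becomes null
        std::cout << j.flatten().dump() << '\n';
        // {"/a/0":1,"/a/1":2,"/b":null}
    }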
- if (j.m_value.number_integer <= 0x17) + if (j.m_data.m_value.number_integer <= 0x17) { - write_number(static_cast(j.m_value.number_integer)); + write_number(static_cast(j.m_data.m_value.number_integer)); } - else if (j.m_value.number_integer <= (std::numeric_limits::max)()) + else if (j.m_data.m_value.number_integer <= (std::numeric_limits::max)()) { oa->write_character(to_char_type(0x18)); - write_number(static_cast(j.m_value.number_integer)); + write_number(static_cast(j.m_data.m_value.number_integer)); } - else if (j.m_value.number_integer <= (std::numeric_limits::max)()) + else if (j.m_data.m_value.number_integer <= (std::numeric_limits::max)()) { oa->write_character(to_char_type(0x19)); - write_number(static_cast(j.m_value.number_integer)); + write_number(static_cast(j.m_data.m_value.number_integer)); } - else if (j.m_value.number_integer <= (std::numeric_limits::max)()) + else if (j.m_data.m_value.number_integer <= (std::numeric_limits::max)()) { oa->write_character(to_char_type(0x1A)); - write_number(static_cast(j.m_value.number_integer)); + write_number(static_cast(j.m_data.m_value.number_integer)); } else { oa->write_character(to_char_type(0x1B)); - write_number(static_cast(j.m_value.number_integer)); + write_number(static_cast(j.m_data.m_value.number_integer)); } } else { // The conversions below encode the sign in the first // byte, and the value is converted to a positive number. - const auto positive_number = -1 - j.m_value.number_integer; - if (j.m_value.number_integer >= -24) + const auto positive_number = -1 - j.m_data.m_value.number_integer; + if (j.m_data.m_value.number_integer >= -24) { write_number(static_cast(0x20 + positive_number)); } @@ -15086,52 +15211,52 @@ class binary_writer case value_t::number_unsigned: { - if (j.m_value.number_unsigned <= 0x17) + if (j.m_data.m_value.number_unsigned <= 0x17) { - write_number(static_cast(j.m_value.number_unsigned)); + write_number(static_cast(j.m_data.m_value.number_unsigned)); } - else if (j.m_value.number_unsigned <= (std::numeric_limits::max)()) + else if (j.m_data.m_value.number_unsigned <= (std::numeric_limits::max)()) { oa->write_character(to_char_type(0x18)); - write_number(static_cast(j.m_value.number_unsigned)); + write_number(static_cast(j.m_data.m_value.number_unsigned)); } - else if (j.m_value.number_unsigned <= (std::numeric_limits::max)()) + else if (j.m_data.m_value.number_unsigned <= (std::numeric_limits::max)()) { oa->write_character(to_char_type(0x19)); - write_number(static_cast(j.m_value.number_unsigned)); + write_number(static_cast(j.m_data.m_value.number_unsigned)); } - else if (j.m_value.number_unsigned <= (std::numeric_limits::max)()) + else if (j.m_data.m_value.number_unsigned <= (std::numeric_limits::max)()) { oa->write_character(to_char_type(0x1A)); - write_number(static_cast(j.m_value.number_unsigned)); + write_number(static_cast(j.m_data.m_value.number_unsigned)); } else { oa->write_character(to_char_type(0x1B)); - write_number(static_cast(j.m_value.number_unsigned)); + write_number(static_cast(j.m_data.m_value.number_unsigned)); } break; } case value_t::number_float: { - if (std::isnan(j.m_value.number_float)) + if (std::isnan(j.m_data.m_value.number_float)) { // NaN is 0xf97e00 in CBOR oa->write_character(to_char_type(0xF9)); oa->write_character(to_char_type(0x7E)); oa->write_character(to_char_type(0x00)); } - else if (std::isinf(j.m_value.number_float)) + else if (std::isinf(j.m_data.m_value.number_float)) { // Infinity is 0xf97c00, -Infinity is 0xf9fc00 
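The CBOR integer tiers above always pick the shortest encoding: values up to 0x17 live in the initial byte, the 0x18/0x19/0x1A/0x1B prefixes introduce 8/16/32/64-bit big-endian payloads, and negative values are stored as -1 - n under major type 1. Worked bytes via the documented to_cbor helper:

    #include <cassert>
    #include <cstdint>
    #include <vector>
    #include <nlohmann/json.hpp>

    int main()
    {
        using nlohmann::json;

        // 23 fits the immediate range 0x00..0x17
        assert(json::to_cbor(json(23)) == std::vector<std::uint8_t>({0x17}));

        // 500 needs the 16-bit form: 0x19 prefix + big-endian payload
        assert(json::to_cbor(json(500)) == std::vector<std::uint8_t>({0x19, 0x01, 0xF4}));

        // -500 is stored as -1 - (-500) = 499 = 0x01F3 under major type 1
        assert(json::to_cbor(json(-500)) == std::vector<std::uint8_t>({0x39, 0x01, 0xF3}));
    }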
oa->write_character(to_char_type(0xf9)); - oa->write_character(j.m_value.number_float > 0 ? to_char_type(0x7C) : to_char_type(0xFC)); + oa->write_character(j.m_data.m_value.number_float > 0 ? to_char_type(0x7C) : to_char_type(0xFC)); oa->write_character(to_char_type(0x00)); } else { - write_compact_float(j.m_value.number_float, detail::input_format_t::cbor); + write_compact_float(j.m_data.m_value.number_float, detail::input_format_t::cbor); } break; } @@ -15139,7 +15264,7 @@ class binary_writer case value_t::string: { // step 1: write control byte and the string length - const auto N = j.m_value.string->size(); + const auto N = j.m_data.m_value.string->size(); if (N <= 0x17) { write_number(static_cast(0x60 + N)); @@ -15169,15 +15294,15 @@ class binary_writer // step 2: write the string oa->write_characters( - reinterpret_cast(j.m_value.string->c_str()), - j.m_value.string->size()); + reinterpret_cast(j.m_data.m_value.string->c_str()), + j.m_data.m_value.string->size()); break; } case value_t::array: { // step 1: write control byte and the array size - const auto N = j.m_value.array->size(); + const auto N = j.m_data.m_value.array->size(); if (N <= 0x17) { write_number(static_cast(0x80 + N)); @@ -15206,7 +15331,7 @@ class binary_writer // LCOV_EXCL_STOP // step 2: write each element - for (const auto& el : *j.m_value.array) + for (const auto& el : *j.m_data.m_value.array) { write_cbor(el); } @@ -15215,32 +15340,32 @@ class binary_writer case value_t::binary: { - if (j.m_value.binary->has_subtype()) + if (j.m_data.m_value.binary->has_subtype()) { - if (j.m_value.binary->subtype() <= (std::numeric_limits::max)()) + if (j.m_data.m_value.binary->subtype() <= (std::numeric_limits::max)()) { write_number(static_cast(0xd8)); - write_number(static_cast(j.m_value.binary->subtype())); + write_number(static_cast(j.m_data.m_value.binary->subtype())); } - else if (j.m_value.binary->subtype() <= (std::numeric_limits::max)()) + else if (j.m_data.m_value.binary->subtype() <= (std::numeric_limits::max)()) { write_number(static_cast(0xd9)); - write_number(static_cast(j.m_value.binary->subtype())); + write_number(static_cast(j.m_data.m_value.binary->subtype())); } - else if (j.m_value.binary->subtype() <= (std::numeric_limits::max)()) + else if (j.m_data.m_value.binary->subtype() <= (std::numeric_limits::max)()) { write_number(static_cast(0xda)); - write_number(static_cast(j.m_value.binary->subtype())); + write_number(static_cast(j.m_data.m_value.binary->subtype())); } - else if (j.m_value.binary->subtype() <= (std::numeric_limits::max)()) + else if (j.m_data.m_value.binary->subtype() <= (std::numeric_limits::max)()) { write_number(static_cast(0xdb)); - write_number(static_cast(j.m_value.binary->subtype())); + write_number(static_cast(j.m_data.m_value.binary->subtype())); } } // step 1: write control byte and the binary array size - const auto N = j.m_value.binary->size(); + const auto N = j.m_data.m_value.binary->size(); if (N <= 0x17) { write_number(static_cast(0x40 + N)); @@ -15270,7 +15395,7 @@ class binary_writer // step 2: write each element oa->write_characters( - reinterpret_cast(j.m_value.binary->data()), + reinterpret_cast(j.m_data.m_value.binary->data()), N); break; @@ -15279,7 +15404,7 @@ class binary_writer case value_t::object: { // step 1: write control byte and the object size - const auto N = j.m_value.object->size(); + const auto N = j.m_data.m_value.object->size(); if (N <= 0x17) { write_number(static_cast(0xA0 + N)); @@ -15308,7 +15433,7 @@ class binary_writer // LCOV_EXCL_STOP // step 2: 
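Per the binary hunks above, a subtyped binary value is wrapped in a CBOR tag (0xD8 through 0xDB depending on the subtype's width) before the 0x40-family byte-string header. A sketch of the expected bytes, assuming to_cbor and the json::binary factory from the public API; the byte values are inferred from the hunks, so treat them as illustrative:

    #include <cassert>
    #include <cstdint>
    #include <vector>
    #include <nlohmann/json.hpp>

    int main()
    {
        using nlohmann::json;

        // subtype 42 fits one byte -> tag 0xD8, then 0x43 = byte string of 3
        const auto bytes = json::to_cbor(json::binary({0x01, 0x02, 0x03}, 42));
        assert(bytes == std::vector<std::uint8_t>({0xD8, 0x2A, 0x43, 0x01, 0x02, 0x03}));
    }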
write each element - for (const auto& el : *j.m_value.object) + for (const auto& el : *j.m_data.m_value.object) { write_cbor(el.first); write_cbor(el.second); @@ -15337,7 +15462,7 @@ class binary_writer case value_t::boolean: // true and false { - oa->write_character(j.m_value.boolean + oa->write_character(j.m_data.m_value.boolean ? to_char_type(0xC3) : to_char_type(0xC2)); break; @@ -15345,75 +15470,75 @@ class binary_writer case value_t::number_integer: { - if (j.m_value.number_integer >= 0) + if (j.m_data.m_value.number_integer >= 0) { // MessagePack does not differentiate between positive // signed integers and unsigned integers. Therefore, we used // the code from the value_t::number_unsigned case here. - if (j.m_value.number_unsigned < 128) + if (j.m_data.m_value.number_unsigned < 128) { // positive fixnum - write_number(static_cast(j.m_value.number_integer)); + write_number(static_cast(j.m_data.m_value.number_integer)); } - else if (j.m_value.number_unsigned <= (std::numeric_limits::max)()) + else if (j.m_data.m_value.number_unsigned <= (std::numeric_limits::max)()) { // uint 8 oa->write_character(to_char_type(0xCC)); - write_number(static_cast(j.m_value.number_integer)); + write_number(static_cast(j.m_data.m_value.number_integer)); } - else if (j.m_value.number_unsigned <= (std::numeric_limits::max)()) + else if (j.m_data.m_value.number_unsigned <= (std::numeric_limits::max)()) { // uint 16 oa->write_character(to_char_type(0xCD)); - write_number(static_cast(j.m_value.number_integer)); + write_number(static_cast(j.m_data.m_value.number_integer)); } - else if (j.m_value.number_unsigned <= (std::numeric_limits::max)()) + else if (j.m_data.m_value.number_unsigned <= (std::numeric_limits::max)()) { // uint 32 oa->write_character(to_char_type(0xCE)); - write_number(static_cast(j.m_value.number_integer)); + write_number(static_cast(j.m_data.m_value.number_integer)); } - else if (j.m_value.number_unsigned <= (std::numeric_limits::max)()) + else if (j.m_data.m_value.number_unsigned <= (std::numeric_limits::max)()) { // uint 64 oa->write_character(to_char_type(0xCF)); - write_number(static_cast(j.m_value.number_integer)); + write_number(static_cast(j.m_data.m_value.number_integer)); } } else { - if (j.m_value.number_integer >= -32) + if (j.m_data.m_value.number_integer >= -32) { // negative fixnum - write_number(static_cast(j.m_value.number_integer)); + write_number(static_cast(j.m_data.m_value.number_integer)); } - else if (j.m_value.number_integer >= (std::numeric_limits::min)() && - j.m_value.number_integer <= (std::numeric_limits::max)()) + else if (j.m_data.m_value.number_integer >= (std::numeric_limits::min)() && + j.m_data.m_value.number_integer <= (std::numeric_limits::max)()) { // int 8 oa->write_character(to_char_type(0xD0)); - write_number(static_cast(j.m_value.number_integer)); + write_number(static_cast(j.m_data.m_value.number_integer)); } - else if (j.m_value.number_integer >= (std::numeric_limits::min)() && - j.m_value.number_integer <= (std::numeric_limits::max)()) + else if (j.m_data.m_value.number_integer >= (std::numeric_limits::min)() && + j.m_data.m_value.number_integer <= (std::numeric_limits::max)()) { // int 16 oa->write_character(to_char_type(0xD1)); - write_number(static_cast(j.m_value.number_integer)); + write_number(static_cast(j.m_data.m_value.number_integer)); } - else if (j.m_value.number_integer >= (std::numeric_limits::min)() && - j.m_value.number_integer <= (std::numeric_limits::max)()) + else if (j.m_data.m_value.number_integer >= 
(std::numeric_limits::min)() && + j.m_data.m_value.number_integer <= (std::numeric_limits::max)()) { // int 32 oa->write_character(to_char_type(0xD2)); - write_number(static_cast(j.m_value.number_integer)); + write_number(static_cast(j.m_data.m_value.number_integer)); } - else if (j.m_value.number_integer >= (std::numeric_limits::min)() && - j.m_value.number_integer <= (std::numeric_limits::max)()) + else if (j.m_data.m_value.number_integer >= (std::numeric_limits::min)() && + j.m_data.m_value.number_integer <= (std::numeric_limits::max)()) { // int 64 oa->write_character(to_char_type(0xD3)); - write_number(static_cast(j.m_value.number_integer)); + write_number(static_cast(j.m_data.m_value.number_integer)); } } break; @@ -15421,48 +15546,48 @@ class binary_writer case value_t::number_unsigned: { - if (j.m_value.number_unsigned < 128) + if (j.m_data.m_value.number_unsigned < 128) { // positive fixnum - write_number(static_cast(j.m_value.number_integer)); + write_number(static_cast(j.m_data.m_value.number_integer)); } - else if (j.m_value.number_unsigned <= (std::numeric_limits::max)()) + else if (j.m_data.m_value.number_unsigned <= (std::numeric_limits::max)()) { // uint 8 oa->write_character(to_char_type(0xCC)); - write_number(static_cast(j.m_value.number_integer)); + write_number(static_cast(j.m_data.m_value.number_integer)); } - else if (j.m_value.number_unsigned <= (std::numeric_limits::max)()) + else if (j.m_data.m_value.number_unsigned <= (std::numeric_limits::max)()) { // uint 16 oa->write_character(to_char_type(0xCD)); - write_number(static_cast(j.m_value.number_integer)); + write_number(static_cast(j.m_data.m_value.number_integer)); } - else if (j.m_value.number_unsigned <= (std::numeric_limits::max)()) + else if (j.m_data.m_value.number_unsigned <= (std::numeric_limits::max)()) { // uint 32 oa->write_character(to_char_type(0xCE)); - write_number(static_cast(j.m_value.number_integer)); + write_number(static_cast(j.m_data.m_value.number_integer)); } - else if (j.m_value.number_unsigned <= (std::numeric_limits::max)()) + else if (j.m_data.m_value.number_unsigned <= (std::numeric_limits::max)()) { // uint 64 oa->write_character(to_char_type(0xCF)); - write_number(static_cast(j.m_value.number_integer)); + write_number(static_cast(j.m_data.m_value.number_integer)); } break; } case value_t::number_float: { - write_compact_float(j.m_value.number_float, detail::input_format_t::msgpack); + write_compact_float(j.m_data.m_value.number_float, detail::input_format_t::msgpack); break; } case value_t::string: { // step 1: write control byte and the string length - const auto N = j.m_value.string->size(); + const auto N = j.m_data.m_value.string->size(); if (N <= 31) { // fixstr @@ -15489,15 +15614,15 @@ class binary_writer // step 2: write the string oa->write_characters( - reinterpret_cast(j.m_value.string->c_str()), - j.m_value.string->size()); + reinterpret_cast(j.m_data.m_value.string->c_str()), + j.m_data.m_value.string->size()); break; } case value_t::array: { // step 1: write control byte and the array size - const auto N = j.m_value.array->size(); + const auto N = j.m_data.m_value.array->size(); if (N <= 15) { // fixarray @@ -15517,7 +15642,7 @@ class binary_writer } // step 2: write each element - for (const auto& el : *j.m_value.array) + for (const auto& el : *j.m_data.m_value.array) { write_msgpack(el); } @@ -15528,10 +15653,10 @@ class binary_writer { // step 0: determine if the binary type has a set subtype to // determine whether or not to use the ext or fixext types - const bool 
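MessagePack gets the same smallest-encoding treatment as CBOR in the cases above: 0..127 and -32..-1 are single "fixnum" bytes, and wider values receive a typed header (the 0xCC family for unsigned, 0xD0 family for signed). Concretely, via the documented to_msgpack helper:

    #include <cassert>
    #include <cstdint>
    #include <vector>
    #include <nlohmann/json.hpp>

    int main()
    {
        using nlohmann::json;

        // positive fixnum: the value is the byte
        assert(json::to_msgpack(json(7)) == std::vector<std::uint8_t>({0x07}));

        // negative fixnum: -32..-1 stored as the two's-complement byte
        assert(json::to_msgpack(json(-5)) == std::vector<std::uint8_t>({0xFB}));

        // 500 is too wide for a fixnum -> uint 16 header 0xCD
        assert(json::to_msgpack(json(500)) == std::vector<std::uint8_t>({0xCD, 0x01, 0xF4}));
    }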
use_ext = j.m_value.binary->has_subtype(); + const bool use_ext = j.m_data.m_value.binary->has_subtype(); // step 1: write control byte and the byte string length - const auto N = j.m_value.binary->size(); + const auto N = j.m_data.m_value.binary->size(); if (N <= (std::numeric_limits::max)()) { std::uint8_t output_type{}; @@ -15576,18 +15701,18 @@ class binary_writer } else if (N <= (std::numeric_limits::max)()) { - std::uint8_t output_type = use_ext - ? 0xC8 // ext 16 - : 0xC5; // bin 16 + const std::uint8_t output_type = use_ext + ? 0xC8 // ext 16 + : 0xC5; // bin 16 oa->write_character(to_char_type(output_type)); write_number(static_cast(N)); } else if (N <= (std::numeric_limits::max)()) { - std::uint8_t output_type = use_ext - ? 0xC9 // ext 32 - : 0xC6; // bin 32 + const std::uint8_t output_type = use_ext + ? 0xC9 // ext 32 + : 0xC6; // bin 32 oa->write_character(to_char_type(output_type)); write_number(static_cast(N)); @@ -15596,12 +15721,12 @@ class binary_writer // step 1.5: if this is an ext type, write the subtype if (use_ext) { - write_number(static_cast(j.m_value.binary->subtype())); + write_number(static_cast(j.m_data.m_value.binary->subtype())); } // step 2: write the byte string oa->write_characters( - reinterpret_cast(j.m_value.binary->data()), + reinterpret_cast(j.m_data.m_value.binary->data()), N); break; @@ -15610,7 +15735,7 @@ class binary_writer case value_t::object: { // step 1: write control byte and the object size - const auto N = j.m_value.object->size(); + const auto N = j.m_data.m_value.object->size(); if (N <= 15) { // fixmap @@ -15630,7 +15755,7 @@ class binary_writer } // step 2: write each element - for (const auto& el : *j.m_value.object) + for (const auto& el : *j.m_data.m_value.object) { write_msgpack(el.first); write_msgpack(el.second); @@ -15670,7 +15795,7 @@ class binary_writer { if (add_prefix) { - oa->write_character(j.m_value.boolean + oa->write_character(j.m_data.m_value.boolean ? 
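The use_ext flag above is the entire ext-versus-bin decision: a binary value carrying a subtype goes out as a MessagePack ext/fixext record that preserves the subtype, otherwise as a plain bin record. Assuming the json::binary factory and to_msgpack from the public API:

    #include <cassert>
    #include <cstdint>
    #include <vector>
    #include <nlohmann/json.hpp>

    int main()
    {
        using nlohmann::json;

        // no subtype -> bin 8: 0xC4, length, payload
        assert(json::to_msgpack(json::binary({0xCA, 0xFE}))
               == std::vector<std::uint8_t>({0xC4, 0x02, 0xCA, 0xFE}));

        // subtype 42 with a 2-byte payload -> fixext 2: 0xD5, type, payload
        assert(json::to_msgpack(json::binary({0xCA, 0xFE}, 42))
               == std::vector<std::uint8_t>({0xD5, 0x2A, 0xCA, 0xFE}));
    }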
to_char_type('T') : to_char_type('F')); } @@ -15679,19 +15804,19 @@ class binary_writer case value_t::number_integer: { - write_number_with_ubjson_prefix(j.m_value.number_integer, add_prefix, use_bjdata); + write_number_with_ubjson_prefix(j.m_data.m_value.number_integer, add_prefix, use_bjdata); break; } case value_t::number_unsigned: { - write_number_with_ubjson_prefix(j.m_value.number_unsigned, add_prefix, use_bjdata); + write_number_with_ubjson_prefix(j.m_data.m_value.number_unsigned, add_prefix, use_bjdata); break; } case value_t::number_float: { - write_number_with_ubjson_prefix(j.m_value.number_float, add_prefix, use_bjdata); + write_number_with_ubjson_prefix(j.m_data.m_value.number_float, add_prefix, use_bjdata); break; } @@ -15701,10 +15826,10 @@ class binary_writer { oa->write_character(to_char_type('S')); } - write_number_with_ubjson_prefix(j.m_value.string->size(), true, use_bjdata); + write_number_with_ubjson_prefix(j.m_data.m_value.string->size(), true, use_bjdata); oa->write_characters( - reinterpret_cast(j.m_value.string->c_str()), - j.m_value.string->size()); + reinterpret_cast(j.m_data.m_value.string->c_str()), + j.m_data.m_value.string->size()); break; } @@ -15716,7 +15841,7 @@ class binary_writer } bool prefix_required = true; - if (use_type && !j.m_value.array->empty()) + if (use_type && !j.m_data.m_value.array->empty()) { JSON_ASSERT(use_count); const CharType first_prefix = ubjson_prefix(j.front(), use_bjdata); @@ -15739,10 +15864,10 @@ class binary_writer if (use_count) { oa->write_character(to_char_type('#')); - write_number_with_ubjson_prefix(j.m_value.array->size(), true, use_bjdata); + write_number_with_ubjson_prefix(j.m_data.m_value.array->size(), true, use_bjdata); } - for (const auto& el : *j.m_value.array) + for (const auto& el : *j.m_data.m_value.array) { write_ubjson(el, use_count, use_type, prefix_required, use_bjdata); } @@ -15762,7 +15887,7 @@ class binary_writer oa->write_character(to_char_type('[')); } - if (use_type && !j.m_value.binary->empty()) + if (use_type && !j.m_data.m_value.binary->empty()) { JSON_ASSERT(use_count); oa->write_character(to_char_type('$')); @@ -15772,21 +15897,21 @@ class binary_writer if (use_count) { oa->write_character(to_char_type('#')); - write_number_with_ubjson_prefix(j.m_value.binary->size(), true, use_bjdata); + write_number_with_ubjson_prefix(j.m_data.m_value.binary->size(), true, use_bjdata); } if (use_type) { oa->write_characters( - reinterpret_cast(j.m_value.binary->data()), - j.m_value.binary->size()); + reinterpret_cast(j.m_data.m_value.binary->data()), + j.m_data.m_value.binary->size()); } else { - for (size_t i = 0; i < j.m_value.binary->size(); ++i) + for (size_t i = 0; i < j.m_data.m_value.binary->size(); ++i) { oa->write_character(to_char_type('U')); - oa->write_character(j.m_value.binary->data()[i]); + oa->write_character(j.m_data.m_value.binary->data()[i]); } } @@ -15800,9 +15925,9 @@ class binary_writer case value_t::object: { - if (use_bjdata && j.m_value.object->size() == 3 && j.m_value.object->find("_ArrayType_") != j.m_value.object->end() && j.m_value.object->find("_ArraySize_") != j.m_value.object->end() && j.m_value.object->find("_ArrayData_") != j.m_value.object->end()) + if (use_bjdata && j.m_data.m_value.object->size() == 3 && j.m_data.m_value.object->find("_ArrayType_") != j.m_data.m_value.object->end() && j.m_data.m_value.object->find("_ArraySize_") != j.m_data.m_value.object->end() && j.m_data.m_value.object->find("_ArrayData_") != j.m_data.m_value.object->end()) { - if 
(!write_bjdata_ndarray(*j.m_value.object, use_count, use_type)) // decode bjdata ndarray in the JData format (https://github.com/NeuroJSON/jdata) + if (!write_bjdata_ndarray(*j.m_data.m_value.object, use_count, use_type)) // decode bjdata ndarray in the JData format (https://github.com/NeuroJSON/jdata) { break; } @@ -15814,7 +15939,7 @@ class binary_writer } bool prefix_required = true; - if (use_type && !j.m_value.object->empty()) + if (use_type && !j.m_data.m_value.object->empty()) { JSON_ASSERT(use_count); const CharType first_prefix = ubjson_prefix(j.front(), use_bjdata); @@ -15837,10 +15962,10 @@ class binary_writer if (use_count) { oa->write_character(to_char_type('#')); - write_number_with_ubjson_prefix(j.m_value.object->size(), true, use_bjdata); + write_number_with_ubjson_prefix(j.m_data.m_value.object->size(), true, use_bjdata); } - for (const auto& el : *j.m_value.object) + for (const auto& el : *j.m_data.m_value.object) { write_number_with_ubjson_prefix(el.first.size(), true, use_bjdata); oa->write_characters( @@ -15990,19 +16115,19 @@ class binary_writer void write_bson_unsigned(const string_t& name, const BasicJsonType& j) { - if (j.m_value.number_unsigned <= static_cast((std::numeric_limits::max)())) + if (j.m_data.m_value.number_unsigned <= static_cast((std::numeric_limits::max)())) { write_bson_entry_header(name, 0x10 /* int32 */); - write_number(static_cast(j.m_value.number_unsigned), true); + write_number(static_cast(j.m_data.m_value.number_unsigned), true); } - else if (j.m_value.number_unsigned <= static_cast((std::numeric_limits::max)())) + else if (j.m_data.m_value.number_unsigned <= static_cast((std::numeric_limits::max)())) { write_bson_entry_header(name, 0x12 /* int64 */); - write_number(static_cast(j.m_value.number_unsigned), true); + write_number(static_cast(j.m_data.m_value.number_unsigned), true); } else { - JSON_THROW(out_of_range::create(407, concat("integer number ", std::to_string(j.m_value.number_unsigned), " cannot be represented by BSON as it does not fit int64"), &j)); + JSON_THROW(out_of_range::create(407, concat("integer number ", std::to_string(j.m_data.m_value.number_unsigned), " cannot be represented by BSON as it does not fit int64"), &j)); } } @@ -16083,13 +16208,13 @@ class binary_writer switch (j.type()) { case value_t::object: - return header_size + calc_bson_object_size(*j.m_value.object); + return header_size + calc_bson_object_size(*j.m_data.m_value.object); case value_t::array: - return header_size + calc_bson_array_size(*j.m_value.array); + return header_size + calc_bson_array_size(*j.m_data.m_value.array); case value_t::binary: - return header_size + calc_bson_binary_size(*j.m_value.binary); + return header_size + calc_bson_binary_size(*j.m_data.m_value.binary); case value_t::boolean: return header_size + 1ul; @@ -16098,13 +16223,13 @@ class binary_writer return header_size + 8ul; case value_t::number_integer: - return header_size + calc_bson_integer_size(j.m_value.number_integer); + return header_size + calc_bson_integer_size(j.m_data.m_value.number_integer); case value_t::number_unsigned: - return header_size + calc_bson_unsigned_size(j.m_value.number_unsigned); + return header_size + calc_bson_unsigned_size(j.m_data.m_value.number_unsigned); case value_t::string: - return header_size + calc_bson_string_size(*j.m_value.string); + return header_size + calc_bson_string_size(*j.m_data.m_value.string); case value_t::null: return header_size + 0ul; @@ -16130,28 +16255,28 @@ class binary_writer switch (j.type()) { case value_t::object: - 
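The '$' (type) and '#' (count) markers negotiated above implement UBJSON's optimized container format: when every element shares a prefix, the type is written once up front, and both the per-element markers and the closing bracket are dropped. Using the documented to_ubjson(j, use_size, use_type) signature:

    #include <cassert>
    #include <cstdint>
    #include <vector>
    #include <nlohmann/json.hpp>

    int main()
    {
        using nlohmann::json;
        const json j = {1, 2, 3};

        // default form: one 'i' (int8) marker per element, explicit ']'
        assert(json::to_ubjson(j) == std::vector<std::uint8_t>(
            {'[', 'i', 0x01, 'i', 0x02, 'i', 0x03, ']'}));

        // optimized form: type and count factored out, no closing ']'
        assert(json::to_ubjson(j, true, true) == std::vector<std::uint8_t>(
            {'[', '$', 'i', '#', 'i', 0x03, 0x01, 0x02, 0x03}));
    }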
return write_bson_object_entry(name, *j.m_value.object); + return write_bson_object_entry(name, *j.m_data.m_value.object); case value_t::array: - return write_bson_array(name, *j.m_value.array); + return write_bson_array(name, *j.m_data.m_value.array); case value_t::binary: - return write_bson_binary(name, *j.m_value.binary); + return write_bson_binary(name, *j.m_data.m_value.binary); case value_t::boolean: - return write_bson_boolean(name, j.m_value.boolean); + return write_bson_boolean(name, j.m_data.m_value.boolean); case value_t::number_float: - return write_bson_double(name, j.m_value.number_float); + return write_bson_double(name, j.m_data.m_value.number_float); case value_t::number_integer: - return write_bson_integer(name, j.m_value.number_integer); + return write_bson_integer(name, j.m_data.m_value.number_integer); case value_t::number_unsigned: return write_bson_unsigned(name, j); case value_t::string: - return write_bson_string(name, *j.m_value.string); + return write_bson_string(name, *j.m_data.m_value.string); case value_t::null: return write_bson_null(name); @@ -16173,8 +16298,8 @@ class binary_writer */ static std::size_t calc_bson_object_size(const typename BasicJsonType::object_t& value) { - std::size_t document_size = std::accumulate(value.begin(), value.end(), static_cast(0), - [](size_t result, const typename BasicJsonType::object_t::value_type & el) + const std::size_t document_size = std::accumulate(value.begin(), value.end(), static_cast(0), + [](size_t result, const typename BasicJsonType::object_t::value_type & el) { return result += calc_bson_element_size(el.first, el.second); }); @@ -16424,35 +16549,35 @@ class binary_writer return 'Z'; case value_t::boolean: - return j.m_value.boolean ? 'T' : 'F'; + return j.m_data.m_value.boolean ? 
'T' : 'F'; case value_t::number_integer: { - if ((std::numeric_limits::min)() <= j.m_value.number_integer && j.m_value.number_integer <= (std::numeric_limits::max)()) + if ((std::numeric_limits::min)() <= j.m_data.m_value.number_integer && j.m_data.m_value.number_integer <= (std::numeric_limits::max)()) { return 'i'; } - if ((std::numeric_limits::min)() <= j.m_value.number_integer && j.m_value.number_integer <= (std::numeric_limits::max)()) + if ((std::numeric_limits::min)() <= j.m_data.m_value.number_integer && j.m_data.m_value.number_integer <= (std::numeric_limits::max)()) { return 'U'; } - if ((std::numeric_limits::min)() <= j.m_value.number_integer && j.m_value.number_integer <= (std::numeric_limits::max)()) + if ((std::numeric_limits::min)() <= j.m_data.m_value.number_integer && j.m_data.m_value.number_integer <= (std::numeric_limits::max)()) { return 'I'; } - if (use_bjdata && ((std::numeric_limits::min)() <= j.m_value.number_integer && j.m_value.number_integer <= (std::numeric_limits::max)())) + if (use_bjdata && ((std::numeric_limits::min)() <= j.m_data.m_value.number_integer && j.m_data.m_value.number_integer <= (std::numeric_limits::max)())) { return 'u'; } - if ((std::numeric_limits::min)() <= j.m_value.number_integer && j.m_value.number_integer <= (std::numeric_limits::max)()) + if ((std::numeric_limits::min)() <= j.m_data.m_value.number_integer && j.m_data.m_value.number_integer <= (std::numeric_limits::max)()) { return 'l'; } - if (use_bjdata && ((std::numeric_limits::min)() <= j.m_value.number_integer && j.m_value.number_integer <= (std::numeric_limits::max)())) + if (use_bjdata && ((std::numeric_limits::min)() <= j.m_data.m_value.number_integer && j.m_data.m_value.number_integer <= (std::numeric_limits::max)())) { return 'm'; } - if ((std::numeric_limits::min)() <= j.m_value.number_integer && j.m_value.number_integer <= (std::numeric_limits::max)()) + if ((std::numeric_limits::min)() <= j.m_data.m_value.number_integer && j.m_data.m_value.number_integer <= (std::numeric_limits::max)()) { return 'L'; } @@ -16462,35 +16587,35 @@ class binary_writer case value_t::number_unsigned: { - if (j.m_value.number_unsigned <= static_cast((std::numeric_limits::max)())) + if (j.m_data.m_value.number_unsigned <= static_cast((std::numeric_limits::max)())) { return 'i'; } - if (j.m_value.number_unsigned <= static_cast((std::numeric_limits::max)())) + if (j.m_data.m_value.number_unsigned <= static_cast((std::numeric_limits::max)())) { return 'U'; } - if (j.m_value.number_unsigned <= static_cast((std::numeric_limits::max)())) + if (j.m_data.m_value.number_unsigned <= static_cast((std::numeric_limits::max)())) { return 'I'; } - if (use_bjdata && j.m_value.number_unsigned <= static_cast((std::numeric_limits::max)())) + if (use_bjdata && j.m_data.m_value.number_unsigned <= static_cast((std::numeric_limits::max)())) { return 'u'; } - if (j.m_value.number_unsigned <= static_cast((std::numeric_limits::max)())) + if (j.m_data.m_value.number_unsigned <= static_cast((std::numeric_limits::max)())) { return 'l'; } - if (use_bjdata && j.m_value.number_unsigned <= static_cast((std::numeric_limits::max)())) + if (use_bjdata && j.m_data.m_value.number_unsigned <= static_cast((std::numeric_limits::max)())) { return 'm'; } - if (j.m_value.number_unsigned <= static_cast((std::numeric_limits::max)())) + if (j.m_data.m_value.number_unsigned <= static_cast((std::numeric_limits::max)())) { return 'L'; } - if (use_bjdata && j.m_value.number_unsigned <= (std::numeric_limits::max)()) + if (use_bjdata && 
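ubjson_prefix above always selects the narrowest marker that can hold the value ('i', 'U', 'I', 'l', 'L', with 'u'/'m'/'M' reserved for the BJData flavor), so a single integer's encoding depends only on its magnitude:

    #include <cassert>
    #include <cstdint>
    #include <vector>
    #include <nlohmann/json.hpp>

    int main()
    {
        using nlohmann::json;

        // 100 fits int8 -> 'i'
        assert(json::to_ubjson(json(100)) == std::vector<std::uint8_t>({'i', 100}));

        // 300 needs int16 -> 'I' + big-endian payload
        assert(json::to_ubjson(json(300)) == std::vector<std::uint8_t>({'I', 0x01, 0x2C}));
    }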
j.m_data.m_value.number_unsigned <= (std::numeric_limits::max)()) { return 'M'; } @@ -16499,7 +16624,7 @@ class binary_writer } case value_t::number_float: - return get_ubjson_float_prefix(j.m_value.number_float); + return get_ubjson_float_prefix(j.m_data.m_value.number_float); case value_t::string: return 'S'; @@ -16548,7 +16673,7 @@ class binary_writer std::size_t len = (value.at(key).empty() ? 0 : 1); for (const auto& el : value.at(key)) { - len *= static_cast(el.m_value.number_unsigned); + len *= static_cast(el.m_data.m_value.number_unsigned); } key = "_ArrayData_"; @@ -16570,70 +16695,70 @@ class binary_writer { for (const auto& el : value.at(key)) { - write_number(static_cast(el.m_value.number_unsigned), true); + write_number(static_cast(el.m_data.m_value.number_unsigned), true); } } else if (dtype == 'i') { for (const auto& el : value.at(key)) { - write_number(static_cast(el.m_value.number_integer), true); + write_number(static_cast(el.m_data.m_value.number_integer), true); } } else if (dtype == 'u') { for (const auto& el : value.at(key)) { - write_number(static_cast(el.m_value.number_unsigned), true); + write_number(static_cast(el.m_data.m_value.number_unsigned), true); } } else if (dtype == 'I') { for (const auto& el : value.at(key)) { - write_number(static_cast(el.m_value.number_integer), true); + write_number(static_cast(el.m_data.m_value.number_integer), true); } } else if (dtype == 'm') { for (const auto& el : value.at(key)) { - write_number(static_cast(el.m_value.number_unsigned), true); + write_number(static_cast(el.m_data.m_value.number_unsigned), true); } } else if (dtype == 'l') { for (const auto& el : value.at(key)) { - write_number(static_cast(el.m_value.number_integer), true); + write_number(static_cast(el.m_data.m_value.number_integer), true); } } else if (dtype == 'M') { for (const auto& el : value.at(key)) { - write_number(static_cast(el.m_value.number_unsigned), true); + write_number(static_cast(el.m_data.m_value.number_unsigned), true); } } else if (dtype == 'L') { for (const auto& el : value.at(key)) { - write_number(static_cast(el.m_value.number_integer), true); + write_number(static_cast(el.m_data.m_value.number_integer), true); } } else if (dtype == 'd') { for (const auto& el : value.at(key)) { - write_number(static_cast(el.m_value.number_float), true); + write_number(static_cast(el.m_data.m_value.number_float), true); } } else if (dtype == 'D') { for (const auto& el : value.at(key)) { - write_number(static_cast(el.m_value.number_float), true); + write_number(static_cast(el.m_data.m_value.number_float), true); } } return false; @@ -16757,11 +16882,11 @@ NLOHMANN_JSON_NAMESPACE_END // #include // __ _____ _____ _____ // __| | __| | | | JSON for Modern C++ -// | | |__ | | | | | | version 3.11.2 +// | | |__ | | | | | | version 3.11.3 // |_____|_____|_____|_|___| https://github.com/nlohmann/json // // SPDX-FileCopyrightText: 2008-2009 Björn Hoehrmann -// SPDX-FileCopyrightText: 2013-2022 Niels Lohmann +// SPDX-FileCopyrightText: 2013-2023 Niels Lohmann // SPDX-License-Identifier: MIT @@ -16782,11 +16907,11 @@ NLOHMANN_JSON_NAMESPACE_END // #include // __ _____ _____ _____ // __| | __| | | | JSON for Modern C++ -// | | |__ | | | | | | version 3.11.2 +// | | |__ | | | | | | version 3.11.3 // |_____|_____|_____|_|___| https://github.com/nlohmann/json // // SPDX-FileCopyrightText: 2009 Florian Loitsch -// SPDX-FileCopyrightText: 2013-2022 Niels Lohmann +// SPDX-FileCopyrightText: 2013-2023 Niels Lohmann // SPDX-License-Identifier: MIT @@ -17692,7 +17817,7 @@ void 
grisu2(char* buf, int& len, int& decimal_exponent, FloatType value) // NB: If the neighbors are computed for single-precision numbers, there is a single float // (7.0385307e-26f) which can't be recovered using strtod. The resulting double precision // value is off by 1 ulp. -#if 0 +#if 0 // NOLINT(readability-avoid-unconditional-preprocessor-if) const boundaries w = compute_boundaries(static_cast(value)); #else const boundaries w = compute_boundaries(value); @@ -17994,11 +18119,11 @@ class serializer const unsigned int indent_step, const unsigned int current_indent = 0) { - switch (val.m_type) + switch (val.m_data.m_type) { case value_t::object: { - if (val.m_value.object->empty()) + if (val.m_data.m_value.object->empty()) { o->write_characters("{}", 2); return; @@ -18016,8 +18141,8 @@ class serializer } // first n-1 elements - auto i = val.m_value.object->cbegin(); - for (std::size_t cnt = 0; cnt < val.m_value.object->size() - 1; ++cnt, ++i) + auto i = val.m_data.m_value.object->cbegin(); + for (std::size_t cnt = 0; cnt < val.m_data.m_value.object->size() - 1; ++cnt, ++i) { o->write_characters(indent_string.c_str(), new_indent); o->write_character('\"'); @@ -18028,8 +18153,8 @@ class serializer } // last element - JSON_ASSERT(i != val.m_value.object->cend()); - JSON_ASSERT(std::next(i) == val.m_value.object->cend()); + JSON_ASSERT(i != val.m_data.m_value.object->cend()); + JSON_ASSERT(std::next(i) == val.m_data.m_value.object->cend()); o->write_characters(indent_string.c_str(), new_indent); o->write_character('\"'); dump_escaped(i->first, ensure_ascii); @@ -18045,8 +18170,8 @@ class serializer o->write_character('{'); // first n-1 elements - auto i = val.m_value.object->cbegin(); - for (std::size_t cnt = 0; cnt < val.m_value.object->size() - 1; ++cnt, ++i) + auto i = val.m_data.m_value.object->cbegin(); + for (std::size_t cnt = 0; cnt < val.m_data.m_value.object->size() - 1; ++cnt, ++i) { o->write_character('\"'); dump_escaped(i->first, ensure_ascii); @@ -18056,8 +18181,8 @@ class serializer } // last element - JSON_ASSERT(i != val.m_value.object->cend()); - JSON_ASSERT(std::next(i) == val.m_value.object->cend()); + JSON_ASSERT(i != val.m_data.m_value.object->cend()); + JSON_ASSERT(std::next(i) == val.m_data.m_value.object->cend()); o->write_character('\"'); dump_escaped(i->first, ensure_ascii); o->write_characters("\":", 2); @@ -18071,7 +18196,7 @@ class serializer case value_t::array: { - if (val.m_value.array->empty()) + if (val.m_data.m_value.array->empty()) { o->write_characters("[]", 2); return; @@ -18089,8 +18214,8 @@ class serializer } // first n-1 elements - for (auto i = val.m_value.array->cbegin(); - i != val.m_value.array->cend() - 1; ++i) + for (auto i = val.m_data.m_value.array->cbegin(); + i != val.m_data.m_value.array->cend() - 1; ++i) { o->write_characters(indent_string.c_str(), new_indent); dump(*i, true, ensure_ascii, indent_step, new_indent); @@ -18098,9 +18223,9 @@ class serializer } // last element - JSON_ASSERT(!val.m_value.array->empty()); + JSON_ASSERT(!val.m_data.m_value.array->empty()); o->write_characters(indent_string.c_str(), new_indent); - dump(val.m_value.array->back(), true, ensure_ascii, indent_step, new_indent); + dump(val.m_data.m_value.array->back(), true, ensure_ascii, indent_step, new_indent); o->write_character('\n'); o->write_characters(indent_string.c_str(), current_indent); @@ -18111,16 +18236,16 @@ class serializer o->write_character('['); // first n-1 elements - for (auto i = val.m_value.array->cbegin(); - i != val.m_value.array->cend() - 1; 
++i) + for (auto i = val.m_data.m_value.array->cbegin(); + i != val.m_data.m_value.array->cend() - 1; ++i) { dump(*i, false, ensure_ascii, indent_step, current_indent); o->write_character(','); } // last element - JSON_ASSERT(!val.m_value.array->empty()); - dump(val.m_value.array->back(), false, ensure_ascii, indent_step, current_indent); + JSON_ASSERT(!val.m_data.m_value.array->empty()); + dump(val.m_data.m_value.array->back(), false, ensure_ascii, indent_step, current_indent); o->write_character(']'); } @@ -18131,7 +18256,7 @@ class serializer case value_t::string: { o->write_character('\"'); - dump_escaped(*val.m_value.string, ensure_ascii); + dump_escaped(*val.m_data.m_value.string, ensure_ascii); o->write_character('\"'); return; } @@ -18153,24 +18278,24 @@ class serializer o->write_characters("\"bytes\": [", 10); - if (!val.m_value.binary->empty()) + if (!val.m_data.m_value.binary->empty()) { - for (auto i = val.m_value.binary->cbegin(); - i != val.m_value.binary->cend() - 1; ++i) + for (auto i = val.m_data.m_value.binary->cbegin(); + i != val.m_data.m_value.binary->cend() - 1; ++i) { dump_integer(*i); o->write_characters(", ", 2); } - dump_integer(val.m_value.binary->back()); + dump_integer(val.m_data.m_value.binary->back()); } o->write_characters("],\n", 3); o->write_characters(indent_string.c_str(), new_indent); o->write_characters("\"subtype\": ", 11); - if (val.m_value.binary->has_subtype()) + if (val.m_data.m_value.binary->has_subtype()) { - dump_integer(val.m_value.binary->subtype()); + dump_integer(val.m_data.m_value.binary->subtype()); } else { @@ -18184,21 +18309,21 @@ class serializer { o->write_characters("{\"bytes\":[", 10); - if (!val.m_value.binary->empty()) + if (!val.m_data.m_value.binary->empty()) { - for (auto i = val.m_value.binary->cbegin(); - i != val.m_value.binary->cend() - 1; ++i) + for (auto i = val.m_data.m_value.binary->cbegin(); + i != val.m_data.m_value.binary->cend() - 1; ++i) { dump_integer(*i); o->write_character(','); } - dump_integer(val.m_value.binary->back()); + dump_integer(val.m_data.m_value.binary->back()); } o->write_characters("],\"subtype\":", 12); - if (val.m_value.binary->has_subtype()) + if (val.m_data.m_value.binary->has_subtype()) { - dump_integer(val.m_value.binary->subtype()); + dump_integer(val.m_data.m_value.binary->subtype()); o->write_character('}'); } else @@ -18211,7 +18336,7 @@ class serializer case value_t::boolean: { - if (val.m_value.boolean) + if (val.m_data.m_value.boolean) { o->write_characters("true", 4); } @@ -18224,19 +18349,19 @@ class serializer case value_t::number_integer: { - dump_integer(val.m_value.number_integer); + dump_integer(val.m_data.m_value.number_integer); return; } case value_t::number_unsigned: { - dump_integer(val.m_value.number_unsigned); + dump_integer(val.m_data.m_value.number_unsigned); return; } case value_t::number_float: { - dump_float(val.m_value.number_float); + dump_float(val.m_data.m_value.number_float); return; } @@ -18810,8 +18935,8 @@ class serializer ? 
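Binary values have no literal form in JSON text, so the serializer hunks above emit them as a {"bytes":[...],"subtype":...} object, with subtype null when none was set. For instance:

    #include <iostream>
    #include <nlohmann/json.hpp>

    int main()
    {
        using nlohmann::json;

        json j;
        j["typed"]   = json::binary({1, 2, 3}, 5);
        j["untyped"] = json::binary({1, 2, 3});

        // {"typed":{"bytes":[1,2,3],"subtype":5},
        //  "untyped":{"bytes":[1,2,3],"subtype":null}}
        std::cout << j.dump() << '\n';
    }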
(byte & 0x3fu) | (codep << 6u) : (0xFFu >> type) & (byte); - std::size_t index = 256u + static_cast(state) * 16u + static_cast(type); - JSON_ASSERT(index < 400); + const std::size_t index = 256u + static_cast(state) * 16u + static_cast(type); + JSON_ASSERT(index < utf8d.size()); state = utf8d[index]; return state; } @@ -18878,10 +19003,10 @@ NLOHMANN_JSON_NAMESPACE_END // #include // __ _____ _____ _____ // __| | __| | | | JSON for Modern C++ -// | | |__ | | | | | | version 3.11.2 +// | | |__ | | | | | | version 3.11.3 // |_____|_____|_____|_|___| https://github.com/nlohmann/json // -// SPDX-FileCopyrightText: 2013-2022 Niels Lohmann +// SPDX-FileCopyrightText: 2013-2023 Niels Lohmann // SPDX-License-Identifier: MIT @@ -18998,7 +19123,7 @@ template , template::value, int> = 0> - T & at(KeyType && key) + T & at(KeyType && key) // NOLINT(cppcoreguidelines-missing-std-forward) { for (auto it = this->begin(); it != this->end(); ++it) { @@ -19026,7 +19151,7 @@ template , template::value, int> = 0> - const T & at(KeyType && key) const + const T & at(KeyType && key) const // NOLINT(cppcoreguidelines-missing-std-forward) { for (auto it = this->begin(); it != this->end(); ++it) { @@ -19060,7 +19185,7 @@ template , template::value, int> = 0> - size_type erase(KeyType && key) + size_type erase(KeyType && key) // NOLINT(cppcoreguidelines-missing-std-forward) { for (auto it = this->begin(); it != this->end(); ++it) { @@ -19151,7 +19276,7 @@ template , template::value, int> = 0> - size_type count(KeyType && key) const + size_type count(KeyType && key) const // NOLINT(cppcoreguidelines-missing-std-forward) { for (auto it = this->begin(); it != this->end(); ++it) { @@ -19177,7 +19302,7 @@ template , template::value, int> = 0> - iterator find(KeyType && key) + iterator find(KeyType && key) // NOLINT(cppcoreguidelines-missing-std-forward) { for (auto it = this->begin(); it != this->end(); ++it) { @@ -19240,7 +19365,9 @@ NLOHMANN_JSON_NAMESPACE_END #if defined(JSON_HAS_CPP_17) - #include + #if JSON_HAS_STATIC_RTTI + #include + #endif #include #endif @@ -19271,6 +19398,7 @@ The invariants are checked by member function assert_invariant(). 
*/ NLOHMANN_BASIC_JSON_TPL_DECLARATION class basic_json // NOLINT(cppcoreguidelines-special-member-functions,hicpp-special-member-functions) + : public ::nlohmann::detail::json_base_class { private: template friend struct detail::external_constructor; @@ -19297,6 +19425,7 @@ class basic_json // NOLINT(cppcoreguidelines-special-member-functions,hicpp-spec /// workaround type for MSVC using basic_json_t = NLOHMANN_BASIC_JSON_TPL; + using json_base_class_t = ::nlohmann::detail::json_base_class; JSON_PRIVATE_UNLESS_TESTED: // convenience aliases for types residing in namespace detail; @@ -19368,7 +19497,6 @@ class basic_json // NOLINT(cppcoreguidelines-special-member-functions,hicpp-spec /// @} - ///////////////////// // container types // ///////////////////// @@ -19410,7 +19538,6 @@ class basic_json // NOLINT(cppcoreguidelines-special-member-functions,hicpp-spec /// @} - /// @brief returns the allocator associated with the container /// @sa https://json.nlohmann.me/api/basic_json/get_allocator/ static allocator_type get_allocator() @@ -19425,7 +19552,7 @@ class basic_json // NOLINT(cppcoreguidelines-special-member-functions,hicpp-spec { basic_json result; - result["copyright"] = "(C) 2013-2022 Niels Lohmann"; + result["copyright"] = "(C) 2013-2023 Niels Lohmann"; result["name"] = "JSON for Modern C++"; result["url"] = "https://github.com/nlohmann/json"; result["version"]["string"] = @@ -19473,7 +19600,6 @@ class basic_json // NOLINT(cppcoreguidelines-special-member-functions,hicpp-spec result["compiler"] = {{"family", "unknown"}, {"version", "unknown"}}; #endif - #if defined(_MSVC_LANG) result["compiler"]["c++"] = std::to_string(_MSVC_LANG); #elif defined(__cplusplus) @@ -19484,7 +19610,6 @@ class basic_json // NOLINT(cppcoreguidelines-special-member-functions,hicpp-spec return result; } - /////////////////////////// // JSON value data types // /////////////////////////// @@ -19692,7 +19817,7 @@ class basic_json // NOLINT(cppcoreguidelines-special-member-functions,hicpp-spec object = nullptr; // silence warning, see #821 if (JSON_HEDLEY_UNLIKELY(t == value_t::null)) { - JSON_THROW(other_error::create(500, "961c151d2e87f2686a955a9be24d316f1362bf21 3.11.2", nullptr)); // LCOV_EXCL_LINE + JSON_THROW(other_error::create(500, "961c151d2e87f2686a955a9be24d316f1362bf21 3.11.3", nullptr)); // LCOV_EXCL_LINE } break; } @@ -19731,6 +19856,16 @@ class basic_json // NOLINT(cppcoreguidelines-special-member-functions,hicpp-spec void destroy(value_t t) { + if ( + (t == value_t::object && object == nullptr) || + (t == value_t::array && array == nullptr) || + (t == value_t::string && string == nullptr) || + (t == value_t::binary && binary == nullptr) + ) + { + //not initialized (e.g. 
due to exception in the ctor) + return; + } if (t == value_t::array || t == value_t::object) { // flatten the current json_value to a heap-allocated stack @@ -19761,18 +19896,18 @@ class basic_json // NOLINT(cppcoreguidelines-special-member-functions,hicpp-spec // its children to the stack to be processed later if (current_item.is_array()) { - std::move(current_item.m_value.array->begin(), current_item.m_value.array->end(), std::back_inserter(stack)); + std::move(current_item.m_data.m_value.array->begin(), current_item.m_data.m_value.array->end(), std::back_inserter(stack)); - current_item.m_value.array->clear(); + current_item.m_data.m_value.array->clear(); } else if (current_item.is_object()) { - for (auto&& it : *current_item.m_value.object) + for (auto&& it : *current_item.m_data.m_value.object) { stack.push_back(std::move(it.second)); } - current_item.m_value.object->clear(); + current_item.m_data.m_value.object->clear(); } // it's now safe that current_item get destructed @@ -19849,10 +19984,10 @@ class basic_json // NOLINT(cppcoreguidelines-special-member-functions,hicpp-spec */ void assert_invariant(bool check_parents = true) const noexcept { - JSON_ASSERT(m_type != value_t::object || m_value.object != nullptr); - JSON_ASSERT(m_type != value_t::array || m_value.array != nullptr); - JSON_ASSERT(m_type != value_t::string || m_value.string != nullptr); - JSON_ASSERT(m_type != value_t::binary || m_value.binary != nullptr); + JSON_ASSERT(m_data.m_type != value_t::object || m_data.m_value.object != nullptr); + JSON_ASSERT(m_data.m_type != value_t::array || m_data.m_value.array != nullptr); + JSON_ASSERT(m_data.m_type != value_t::string || m_data.m_value.string != nullptr); + JSON_ASSERT(m_data.m_type != value_t::binary || m_data.m_value.binary != nullptr); #if JSON_DIAGNOSTICS JSON_TRY @@ -19871,11 +20006,11 @@ class basic_json // NOLINT(cppcoreguidelines-special-member-functions,hicpp-spec void set_parents() { #if JSON_DIAGNOSTICS - switch (m_type) + switch (m_data.m_type) { case value_t::array: { - for (auto& element : *m_value.array) + for (auto& element : *m_data.m_value.array) { element.m_parent = this; } @@ -19884,7 +20019,7 @@ class basic_json // NOLINT(cppcoreguidelines-special-member-functions,hicpp-spec case value_t::object: { - for (auto& element : *m_value.object) + for (auto& element : *m_data.m_value.object) { element.second.m_parent = this; } @@ -19925,7 +20060,7 @@ class basic_json // NOLINT(cppcoreguidelines-special-member-functions,hicpp-spec { // see https://github.com/nlohmann/json/issues/2838 JSON_ASSERT(type() == value_t::array); - if (JSON_HEDLEY_UNLIKELY(m_value.array->capacity() != old_capacity)) + if (JSON_HEDLEY_UNLIKELY(m_data.m_value.array->capacity() != old_capacity)) { // capacity has changed: update all parents set_parents(); @@ -19981,7 +20116,7 @@ class basic_json // NOLINT(cppcoreguidelines-special-member-functions,hicpp-spec /// @brief create an empty value with a given type /// @sa https://json.nlohmann.me/api/basic_json/basic_json/ basic_json(const value_t v) - : m_type(v), m_value(v) + : m_data(v) { assert_invariant(); } @@ -20055,12 +20190,12 @@ class basic_json // NOLINT(cppcoreguidelines-special-member-functions,hicpp-spec *this = nullptr; break; case value_t::discarded: - m_type = value_t::discarded; + m_data.m_type = value_t::discarded; break; default: // LCOV_EXCL_LINE JSON_ASSERT(false); // NOLINT(cert-dcl03-c,hicpp-static-assert,misc-static-assert) LCOV_EXCL_LINE } - JSON_ASSERT(m_type == val.type()); + JSON_ASSERT(m_data.m_type == 
val.type()); set_parents(); assert_invariant(); } @@ -20076,7 +20211,10 @@ class basic_json // NOLINT(cppcoreguidelines-special-member-functions,hicpp-spec bool is_an_object = std::all_of(init.begin(), init.end(), [](const detail::json_ref& element_ref) { - return element_ref->is_array() && element_ref->size() == 2 && (*element_ref)[0].is_string(); + // The cast is to ensure op[size_type] is called, bearing in mind size_type may not be int; + // (many string types can be constructed from 0 via its null-pointer guise, so we get a + // broken call to op[key_type], the wrong semantics and a 4804 warning on Windows) + return element_ref->is_array() && element_ref->size() == 2 && (*element_ref)[static_cast(0)].is_string(); }); // adjust type if type deduction is not wanted @@ -20098,22 +20236,22 @@ class basic_json // NOLINT(cppcoreguidelines-special-member-functions,hicpp-spec if (is_an_object) { // the initializer list is a list of pairs -> create object - m_type = value_t::object; - m_value = value_t::object; + m_data.m_type = value_t::object; + m_data.m_value = value_t::object; for (auto& element_ref : init) { auto element = element_ref.moved_or_copied(); - m_value.object->emplace( - std::move(*((*element.m_value.array)[0].m_value.string)), - std::move((*element.m_value.array)[1])); + m_data.m_value.object->emplace( + std::move(*((*element.m_data.m_value.array)[0].m_data.m_value.string)), + std::move((*element.m_data.m_value.array)[1])); } } else { // the initializer list describes an array -> create array - m_type = value_t::array; - m_value.array = create(init.begin(), init.end()); + m_data.m_type = value_t::array; + m_data.m_value.array = create(init.begin(), init.end()); } set_parents(); @@ -20126,8 +20264,8 @@ class basic_json // NOLINT(cppcoreguidelines-special-member-functions,hicpp-spec static basic_json binary(const typename binary_t::container_type& init) { auto res = basic_json(); - res.m_type = value_t::binary; - res.m_value = init; + res.m_data.m_type = value_t::binary; + res.m_data.m_value = init; return res; } @@ -20137,8 +20275,8 @@ class basic_json // NOLINT(cppcoreguidelines-special-member-functions,hicpp-spec static basic_json binary(const typename binary_t::container_type& init, typename binary_t::subtype_type subtype) { auto res = basic_json(); - res.m_type = value_t::binary; - res.m_value = binary_t(init, subtype); + res.m_data.m_type = value_t::binary; + res.m_data.m_value = binary_t(init, subtype); return res; } @@ -20148,8 +20286,8 @@ class basic_json // NOLINT(cppcoreguidelines-special-member-functions,hicpp-spec static basic_json binary(typename binary_t::container_type&& init) { auto res = basic_json(); - res.m_type = value_t::binary; - res.m_value = std::move(init); + res.m_data.m_type = value_t::binary; + res.m_data.m_value = std::move(init); return res; } @@ -20159,8 +20297,8 @@ class basic_json // NOLINT(cppcoreguidelines-special-member-functions,hicpp-spec static basic_json binary(typename binary_t::container_type&& init, typename binary_t::subtype_type subtype) { auto res = basic_json(); - res.m_type = value_t::binary; - res.m_value = binary_t(std::move(init), subtype); + res.m_data.m_type = value_t::binary; + res.m_data.m_value = binary_t(std::move(init), subtype); return res; } @@ -20182,10 +20320,9 @@ class basic_json // NOLINT(cppcoreguidelines-special-member-functions,hicpp-spec /// @brief construct an array with count copies of given value /// @sa https://json.nlohmann.me/api/basic_json/basic_json/ - basic_json(size_type cnt, const basic_json& val) - 
: m_type(value_t::array) + basic_json(size_type cnt, const basic_json& val): + m_data{cnt, val} { - m_value.array = create(cnt, val); set_parents(); assert_invariant(); } @@ -20207,10 +20344,10 @@ class basic_json // NOLINT(cppcoreguidelines-special-member-functions,hicpp-spec } // copy type from first iterator - m_type = first.m_object->m_type; + m_data.m_type = first.m_object->m_data.m_type; // check if iterator range is complete for primitive values - switch (m_type) + switch (m_data.m_type) { case value_t::boolean: case value_t::number_float: @@ -20235,55 +20372,55 @@ class basic_json // NOLINT(cppcoreguidelines-special-member-functions,hicpp-spec break; } - switch (m_type) + switch (m_data.m_type) { case value_t::number_integer: { - m_value.number_integer = first.m_object->m_value.number_integer; + m_data.m_value.number_integer = first.m_object->m_data.m_value.number_integer; break; } case value_t::number_unsigned: { - m_value.number_unsigned = first.m_object->m_value.number_unsigned; + m_data.m_value.number_unsigned = first.m_object->m_data.m_value.number_unsigned; break; } case value_t::number_float: { - m_value.number_float = first.m_object->m_value.number_float; + m_data.m_value.number_float = first.m_object->m_data.m_value.number_float; break; } case value_t::boolean: { - m_value.boolean = first.m_object->m_value.boolean; + m_data.m_value.boolean = first.m_object->m_data.m_value.boolean; break; } case value_t::string: { - m_value = *first.m_object->m_value.string; + m_data.m_value = *first.m_object->m_data.m_value.string; break; } case value_t::object: { - m_value.object = create(first.m_it.object_iterator, - last.m_it.object_iterator); + m_data.m_value.object = create(first.m_it.object_iterator, + last.m_it.object_iterator); break; } case value_t::array: { - m_value.array = create(first.m_it.array_iterator, - last.m_it.array_iterator); + m_data.m_value.array = create(first.m_it.array_iterator, + last.m_it.array_iterator); break; } case value_t::binary: { - m_value = *first.m_object->m_value.binary; + m_data.m_value = *first.m_object->m_data.m_value.binary; break; } @@ -20297,7 +20434,6 @@ class basic_json // NOLINT(cppcoreguidelines-special-member-functions,hicpp-spec assert_invariant(); } - /////////////////////////////////////// // other constructors and destructor // /////////////////////////////////////// @@ -20310,58 +20446,59 @@ class basic_json // NOLINT(cppcoreguidelines-special-member-functions,hicpp-spec /// @brief copy constructor /// @sa https://json.nlohmann.me/api/basic_json/basic_json/ basic_json(const basic_json& other) - : m_type(other.m_type) + : json_base_class_t(other) { + m_data.m_type = other.m_data.m_type; // check of passed value is valid other.assert_invariant(); - switch (m_type) + switch (m_data.m_type) { case value_t::object: { - m_value = *other.m_value.object; + m_data.m_value = *other.m_data.m_value.object; break; } case value_t::array: { - m_value = *other.m_value.array; + m_data.m_value = *other.m_data.m_value.array; break; } case value_t::string: { - m_value = *other.m_value.string; + m_data.m_value = *other.m_data.m_value.string; break; } case value_t::boolean: { - m_value = other.m_value.boolean; + m_data.m_value = other.m_data.m_value.boolean; break; } case value_t::number_integer: { - m_value = other.m_value.number_integer; + m_data.m_value = other.m_data.m_value.number_integer; break; } case value_t::number_unsigned: { - m_value = other.m_value.number_unsigned; + m_data.m_value = other.m_data.m_value.number_unsigned; break; } case 
value_t::number_float: { - m_value = other.m_value.number_float; + m_data.m_value = other.m_data.m_value.number_float; break; } case value_t::binary: { - m_value = *other.m_value.binary; + m_data.m_value = *other.m_data.m_value.binary; break; } @@ -20378,15 +20515,15 @@ class basic_json // NOLINT(cppcoreguidelines-special-member-functions,hicpp-spec /// @brief move constructor /// @sa https://json.nlohmann.me/api/basic_json/basic_json/ basic_json(basic_json&& other) noexcept - : m_type(std::move(other.m_type)), - m_value(std::move(other.m_value)) + : json_base_class_t(std::forward(other)), + m_data(std::move(other.m_data)) { // check that passed value is valid other.assert_invariant(false); // invalidate payload - other.m_type = value_t::null; - other.m_value = {}; + other.m_data.m_type = value_t::null; + other.m_data.m_value = {}; set_parents(); assert_invariant(); @@ -20398,15 +20535,17 @@ class basic_json // NOLINT(cppcoreguidelines-special-member-functions,hicpp-spec std::is_nothrow_move_constructible::value&& std::is_nothrow_move_assignable::value&& std::is_nothrow_move_constructible::value&& - std::is_nothrow_move_assignable::value + std::is_nothrow_move_assignable::value&& + std::is_nothrow_move_assignable::value ) { // check that passed value is valid other.assert_invariant(); using std::swap; - swap(m_type, other.m_type); - swap(m_value, other.m_value); + swap(m_data.m_type, other.m_data.m_type); + swap(m_data.m_value, other.m_data.m_value); + json_base_class_t::operator=(std::move(other)); set_parents(); assert_invariant(); @@ -20418,7 +20557,6 @@ class basic_json // NOLINT(cppcoreguidelines-special-member-functions,hicpp-spec ~basic_json() noexcept { assert_invariant(false); - m_value.destroy(m_type); } /// @} @@ -20458,7 +20596,7 @@ class basic_json // NOLINT(cppcoreguidelines-special-member-functions,hicpp-spec /// @sa https://json.nlohmann.me/api/basic_json/type/ constexpr value_t type() const noexcept { - return m_type; + return m_data.m_type; } /// @brief return whether type is primitive @@ -20479,14 +20617,14 @@ class basic_json // NOLINT(cppcoreguidelines-special-member-functions,hicpp-spec /// @sa https://json.nlohmann.me/api/basic_json/is_null/ constexpr bool is_null() const noexcept { - return m_type == value_t::null; + return m_data.m_type == value_t::null; } /// @brief return whether value is a boolean /// @sa https://json.nlohmann.me/api/basic_json/is_boolean/ constexpr bool is_boolean() const noexcept { - return m_type == value_t::boolean; + return m_data.m_type == value_t::boolean; } /// @brief return whether value is a number @@ -20500,63 +20638,63 @@ class basic_json // NOLINT(cppcoreguidelines-special-member-functions,hicpp-spec /// @sa https://json.nlohmann.me/api/basic_json/is_number_integer/ constexpr bool is_number_integer() const noexcept { - return m_type == value_t::number_integer || m_type == value_t::number_unsigned; + return m_data.m_type == value_t::number_integer || m_data.m_type == value_t::number_unsigned; } /// @brief return whether value is an unsigned integer number /// @sa https://json.nlohmann.me/api/basic_json/is_number_unsigned/ constexpr bool is_number_unsigned() const noexcept { - return m_type == value_t::number_unsigned; + return m_data.m_type == value_t::number_unsigned; } /// @brief return whether value is a floating-point number /// @sa https://json.nlohmann.me/api/basic_json/is_number_float/ constexpr bool is_number_float() const noexcept { - return m_type == value_t::number_float; + return m_data.m_type == value_t::number_float; } 
/// @brief return whether value is an object /// @sa https://json.nlohmann.me/api/basic_json/is_object/ constexpr bool is_object() const noexcept { - return m_type == value_t::object; + return m_data.m_type == value_t::object; } /// @brief return whether value is an array /// @sa https://json.nlohmann.me/api/basic_json/is_array/ constexpr bool is_array() const noexcept { - return m_type == value_t::array; + return m_data.m_type == value_t::array; } /// @brief return whether value is a string /// @sa https://json.nlohmann.me/api/basic_json/is_string/ constexpr bool is_string() const noexcept { - return m_type == value_t::string; + return m_data.m_type == value_t::string; } /// @brief return whether value is a binary array /// @sa https://json.nlohmann.me/api/basic_json/is_binary/ constexpr bool is_binary() const noexcept { - return m_type == value_t::binary; + return m_data.m_type == value_t::binary; } /// @brief return whether value is discarded /// @sa https://json.nlohmann.me/api/basic_json/is_discarded/ constexpr bool is_discarded() const noexcept { - return m_type == value_t::discarded; + return m_data.m_type == value_t::discarded; } /// @brief return the type of the JSON value (implicit) /// @sa https://json.nlohmann.me/api/basic_json/operator_value_t/ constexpr operator value_t() const noexcept { - return m_type; + return m_data.m_type; } /// @} @@ -20571,7 +20709,7 @@ class basic_json // NOLINT(cppcoreguidelines-special-member-functions,hicpp-spec { if (JSON_HEDLEY_LIKELY(is_boolean())) { - return m_value.boolean; + return m_data.m_value.boolean; } JSON_THROW(type_error::create(302, detail::concat("type must be boolean, but is ", type_name()), this)); @@ -20580,97 +20718,97 @@ class basic_json // NOLINT(cppcoreguidelines-special-member-functions,hicpp-spec /// get a pointer to the value (object) object_t* get_impl_ptr(object_t* /*unused*/) noexcept { - return is_object() ? m_value.object : nullptr; + return is_object() ? m_data.m_value.object : nullptr; } /// get a pointer to the value (object) constexpr const object_t* get_impl_ptr(const object_t* /*unused*/) const noexcept { - return is_object() ? m_value.object : nullptr; + return is_object() ? m_data.m_value.object : nullptr; } /// get a pointer to the value (array) array_t* get_impl_ptr(array_t* /*unused*/) noexcept { - return is_array() ? m_value.array : nullptr; + return is_array() ? m_data.m_value.array : nullptr; } /// get a pointer to the value (array) constexpr const array_t* get_impl_ptr(const array_t* /*unused*/) const noexcept { - return is_array() ? m_value.array : nullptr; + return is_array() ? m_data.m_value.array : nullptr; } /// get a pointer to the value (string) string_t* get_impl_ptr(string_t* /*unused*/) noexcept { - return is_string() ? m_value.string : nullptr; + return is_string() ? m_data.m_value.string : nullptr; } /// get a pointer to the value (string) constexpr const string_t* get_impl_ptr(const string_t* /*unused*/) const noexcept { - return is_string() ? m_value.string : nullptr; + return is_string() ? m_data.m_value.string : nullptr; } /// get a pointer to the value (boolean) boolean_t* get_impl_ptr(boolean_t* /*unused*/) noexcept { - return is_boolean() ? &m_value.boolean : nullptr; + return is_boolean() ? &m_data.m_value.boolean : nullptr; } /// get a pointer to the value (boolean) constexpr const boolean_t* get_impl_ptr(const boolean_t* /*unused*/) const noexcept { - return is_boolean() ? &m_value.boolean : nullptr; + return is_boolean() ? 
&m_data.m_value.boolean : nullptr; } /// get a pointer to the value (integer number) number_integer_t* get_impl_ptr(number_integer_t* /*unused*/) noexcept { - return is_number_integer() ? &m_value.number_integer : nullptr; + return is_number_integer() ? &m_data.m_value.number_integer : nullptr; } /// get a pointer to the value (integer number) constexpr const number_integer_t* get_impl_ptr(const number_integer_t* /*unused*/) const noexcept { - return is_number_integer() ? &m_value.number_integer : nullptr; + return is_number_integer() ? &m_data.m_value.number_integer : nullptr; } /// get a pointer to the value (unsigned number) number_unsigned_t* get_impl_ptr(number_unsigned_t* /*unused*/) noexcept { - return is_number_unsigned() ? &m_value.number_unsigned : nullptr; + return is_number_unsigned() ? &m_data.m_value.number_unsigned : nullptr; } /// get a pointer to the value (unsigned number) constexpr const number_unsigned_t* get_impl_ptr(const number_unsigned_t* /*unused*/) const noexcept { - return is_number_unsigned() ? &m_value.number_unsigned : nullptr; + return is_number_unsigned() ? &m_data.m_value.number_unsigned : nullptr; } /// get a pointer to the value (floating-point number) number_float_t* get_impl_ptr(number_float_t* /*unused*/) noexcept { - return is_number_float() ? &m_value.number_float : nullptr; + return is_number_float() ? &m_data.m_value.number_float : nullptr; } /// get a pointer to the value (floating-point number) constexpr const number_float_t* get_impl_ptr(const number_float_t* /*unused*/) const noexcept { - return is_number_float() ? &m_value.number_float : nullptr; + return is_number_float() ? &m_data.m_value.number_float : nullptr; } /// get a pointer to the value (binary) binary_t* get_impl_ptr(binary_t* /*unused*/) noexcept { - return is_binary() ? m_value.binary : nullptr; + return is_binary() ? m_data.m_value.binary : nullptr; } /// get a pointer to the value (binary) constexpr const binary_t* get_impl_ptr(const binary_t* /*unused*/) const noexcept { - return is_binary() ? m_value.binary : nullptr; + return is_binary() ? m_data.m_value.binary : nullptr; } /*! 
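All of the accessor hunks in this patch, like the get_impl_ptr family above, are the same mechanical change: direct uses of m_type and m_value are routed through a new m_data member. The payoff is the nested data struct added near the end of the patch, which ties the type tag and the value union into one RAII unit, so a constructor that throws partway through can no longer skip cleanup of an already-allocated payload. A minimal standalone sketch of that pattern (simplified, hypothetical names, not the library's real types):

#include <string>

class value {
    enum class kind { null, string };

    union payload {
        std::string* str;
        payload() : str(nullptr) {} // start in a safe, destroyable state
    };

    // Owns the tag and the payload together; its destructor runs during
    // stack unwinding even if the enclosing class's constructor throws.
    struct data {
        kind k = kind::null;
        payload p;
        ~data() {
            // Guard against a payload that was never allocated, e.g. because
            // an exception interrupted the enclosing constructor (this
            // mirrors the null checks the patch adds in destroy()).
            if (k == kind::string && p.str != nullptr) {
                delete p.str;
            }
        }
    };

    data d;

public:
    explicit value(const std::string& s) {
        d.k = kind::string;
        d.p.str = new std::string(s); // if this throws, ~data() still runs
    }
};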
@@ -21053,7 +21191,7 @@ class basic_json // NOLINT(cppcoreguidelines-special-member-functions,hicpp-spec #if defined(JSON_HAS_CPP_17) && (defined(__GNUC__) || (defined(_MSC_VER) && _MSC_VER >= 1910 && _MSC_VER <= 1914)) detail::negation>, #endif -#if defined(JSON_HAS_CPP_17) +#if defined(JSON_HAS_CPP_17) && JSON_HAS_STATIC_RTTI detail::negation>, #endif detail::is_detected_lazy @@ -21090,7 +21228,6 @@ class basic_json // NOLINT(cppcoreguidelines-special-member-functions,hicpp-spec /// @} - //////////////////// // element access // //////////////////// @@ -21108,7 +21245,7 @@ class basic_json // NOLINT(cppcoreguidelines-special-member-functions,hicpp-spec { JSON_TRY { - return set_parent(m_value.array->at(idx)); + return set_parent(m_data.m_value.array->at(idx)); } JSON_CATCH (std::out_of_range&) { @@ -21131,7 +21268,7 @@ class basic_json // NOLINT(cppcoreguidelines-special-member-functions,hicpp-spec { JSON_TRY { - return m_value.array->at(idx); + return m_data.m_value.array->at(idx); } JSON_CATCH (std::out_of_range&) { @@ -21155,8 +21292,8 @@ class basic_json // NOLINT(cppcoreguidelines-special-member-functions,hicpp-spec JSON_THROW(type_error::create(304, detail::concat("cannot use at() with ", type_name()), this)); } - auto it = m_value.object->find(key); - if (it == m_value.object->end()) + auto it = m_data.m_value.object->find(key); + if (it == m_data.m_value.object->end()) { JSON_THROW(out_of_range::create(403, detail::concat("key '", key, "' not found"), this)); } @@ -21175,8 +21312,8 @@ class basic_json // NOLINT(cppcoreguidelines-special-member-functions,hicpp-spec JSON_THROW(type_error::create(304, detail::concat("cannot use at() with ", type_name()), this)); } - auto it = m_value.object->find(std::forward(key)); - if (it == m_value.object->end()) + auto it = m_data.m_value.object->find(std::forward(key)); + if (it == m_data.m_value.object->end()) { JSON_THROW(out_of_range::create(403, detail::concat("key '", string_t(std::forward(key)), "' not found"), this)); } @@ -21193,8 +21330,8 @@ class basic_json // NOLINT(cppcoreguidelines-special-member-functions,hicpp-spec JSON_THROW(type_error::create(304, detail::concat("cannot use at() with ", type_name()), this)); } - auto it = m_value.object->find(key); - if (it == m_value.object->end()) + auto it = m_data.m_value.object->find(key); + if (it == m_data.m_value.object->end()) { JSON_THROW(out_of_range::create(403, detail::concat("key '", key, "' not found"), this)); } @@ -21213,8 +21350,8 @@ class basic_json // NOLINT(cppcoreguidelines-special-member-functions,hicpp-spec JSON_THROW(type_error::create(304, detail::concat("cannot use at() with ", type_name()), this)); } - auto it = m_value.object->find(std::forward(key)); - if (it == m_value.object->end()) + auto it = m_data.m_value.object->find(std::forward(key)); + if (it == m_data.m_value.object->end()) { JSON_THROW(out_of_range::create(403, detail::concat("key '", string_t(std::forward(key)), "' not found"), this)); } @@ -21228,8 +21365,8 @@ class basic_json // NOLINT(cppcoreguidelines-special-member-functions,hicpp-spec // implicitly convert null value to an empty array if (is_null()) { - m_type = value_t::array; - m_value.array = create(); + m_data.m_type = value_t::array; + m_data.m_value.array = create(); assert_invariant(); } @@ -21237,17 +21374,17 @@ class basic_json // NOLINT(cppcoreguidelines-special-member-functions,hicpp-spec if (JSON_HEDLEY_LIKELY(is_array())) { // fill up array with null values if given idx is outside range - if (idx >= m_value.array->size()) + if (idx 
>= m_data.m_value.array->size()) { #if JSON_DIAGNOSTICS // remember array size & capacity before resizing - const auto old_size = m_value.array->size(); - const auto old_capacity = m_value.array->capacity(); + const auto old_size = m_data.m_value.array->size(); + const auto old_capacity = m_data.m_value.array->capacity(); #endif - m_value.array->resize(idx + 1); + m_data.m_value.array->resize(idx + 1); #if JSON_DIAGNOSTICS - if (JSON_HEDLEY_UNLIKELY(m_value.array->capacity() != old_capacity)) + if (JSON_HEDLEY_UNLIKELY(m_data.m_value.array->capacity() != old_capacity)) { // capacity has changed: update all parents set_parents(); @@ -21261,7 +21398,7 @@ class basic_json // NOLINT(cppcoreguidelines-special-member-functions,hicpp-spec assert_invariant(); } - return m_value.array->operator[](idx); + return m_data.m_value.array->operator[](idx); } JSON_THROW(type_error::create(305, detail::concat("cannot use operator[] with a numeric argument with ", type_name()), this)); @@ -21274,7 +21411,7 @@ class basic_json // NOLINT(cppcoreguidelines-special-member-functions,hicpp-spec // const operator[] only works for arrays if (JSON_HEDLEY_LIKELY(is_array())) { - return m_value.array->operator[](idx); + return m_data.m_value.array->operator[](idx); } JSON_THROW(type_error::create(305, detail::concat("cannot use operator[] with a numeric argument with ", type_name()), this)); @@ -21287,15 +21424,15 @@ class basic_json // NOLINT(cppcoreguidelines-special-member-functions,hicpp-spec // implicitly convert null value to an empty object if (is_null()) { - m_type = value_t::object; - m_value.object = create(); + m_data.m_type = value_t::object; + m_data.m_value.object = create(); assert_invariant(); } // operator[] only works for objects if (JSON_HEDLEY_LIKELY(is_object())) { - auto result = m_value.object->emplace(std::move(key), nullptr); + auto result = m_data.m_value.object->emplace(std::move(key), nullptr); return set_parent(result.first->second); } @@ -21309,8 +21446,8 @@ class basic_json // NOLINT(cppcoreguidelines-special-member-functions,hicpp-spec // const operator[] only works for objects if (JSON_HEDLEY_LIKELY(is_object())) { - auto it = m_value.object->find(key); - JSON_ASSERT(it != m_value.object->end()); + auto it = m_data.m_value.object->find(key); + JSON_ASSERT(it != m_data.m_value.object->end()); return it->second; } @@ -21340,15 +21477,15 @@ class basic_json // NOLINT(cppcoreguidelines-special-member-functions,hicpp-spec // implicitly convert null value to an empty object if (is_null()) { - m_type = value_t::object; - m_value.object = create(); + m_data.m_type = value_t::object; + m_data.m_value.object = create(); assert_invariant(); } // operator[] only works for objects if (JSON_HEDLEY_LIKELY(is_object())) { - auto result = m_value.object->emplace(std::forward(key), nullptr); + auto result = m_data.m_value.object->emplace(std::forward(key), nullptr); return set_parent(result.first->second); } @@ -21364,8 +21501,8 @@ class basic_json // NOLINT(cppcoreguidelines-special-member-functions,hicpp-spec // const operator[] only works for objects if (JSON_HEDLEY_LIKELY(is_object())) { - auto it = m_value.object->find(std::forward(key)); - JSON_ASSERT(it != m_value.object->end()); + auto it = m_data.m_value.object->find(std::forward(key)); + JSON_ASSERT(it != m_data.m_value.object->end()); return it->second; } @@ -21602,7 +21739,7 @@ class basic_json // NOLINT(cppcoreguidelines-special-member-functions,hicpp-spec IteratorType result = end(); - switch (m_type) + switch (m_data.m_type) { case 
value_t::boolean: case value_t::number_float: @@ -21619,32 +21756,32 @@ class basic_json // NOLINT(cppcoreguidelines-special-member-functions,hicpp-spec if (is_string()) { AllocatorType alloc; - std::allocator_traits::destroy(alloc, m_value.string); - std::allocator_traits::deallocate(alloc, m_value.string, 1); - m_value.string = nullptr; + std::allocator_traits::destroy(alloc, m_data.m_value.string); + std::allocator_traits::deallocate(alloc, m_data.m_value.string, 1); + m_data.m_value.string = nullptr; } else if (is_binary()) { AllocatorType alloc; - std::allocator_traits::destroy(alloc, m_value.binary); - std::allocator_traits::deallocate(alloc, m_value.binary, 1); - m_value.binary = nullptr; + std::allocator_traits::destroy(alloc, m_data.m_value.binary); + std::allocator_traits::deallocate(alloc, m_data.m_value.binary, 1); + m_data.m_value.binary = nullptr; } - m_type = value_t::null; + m_data.m_type = value_t::null; assert_invariant(); break; } case value_t::object: { - result.m_it.object_iterator = m_value.object->erase(pos.m_it.object_iterator); + result.m_it.object_iterator = m_data.m_value.object->erase(pos.m_it.object_iterator); break; } case value_t::array: { - result.m_it.array_iterator = m_value.array->erase(pos.m_it.array_iterator); + result.m_it.array_iterator = m_data.m_value.array->erase(pos.m_it.array_iterator); break; } @@ -21672,7 +21809,7 @@ class basic_json // NOLINT(cppcoreguidelines-special-member-functions,hicpp-spec IteratorType result = end(); - switch (m_type) + switch (m_data.m_type) { case value_t::boolean: case value_t::number_float: @@ -21690,33 +21827,33 @@ class basic_json // NOLINT(cppcoreguidelines-special-member-functions,hicpp-spec if (is_string()) { AllocatorType alloc; - std::allocator_traits::destroy(alloc, m_value.string); - std::allocator_traits::deallocate(alloc, m_value.string, 1); - m_value.string = nullptr; + std::allocator_traits::destroy(alloc, m_data.m_value.string); + std::allocator_traits::deallocate(alloc, m_data.m_value.string, 1); + m_data.m_value.string = nullptr; } else if (is_binary()) { AllocatorType alloc; - std::allocator_traits::destroy(alloc, m_value.binary); - std::allocator_traits::deallocate(alloc, m_value.binary, 1); - m_value.binary = nullptr; + std::allocator_traits::destroy(alloc, m_data.m_value.binary); + std::allocator_traits::deallocate(alloc, m_data.m_value.binary, 1); + m_data.m_value.binary = nullptr; } - m_type = value_t::null; + m_data.m_type = value_t::null; assert_invariant(); break; } case value_t::object: { - result.m_it.object_iterator = m_value.object->erase(first.m_it.object_iterator, + result.m_it.object_iterator = m_data.m_value.object->erase(first.m_it.object_iterator, last.m_it.object_iterator); break; } case value_t::array: { - result.m_it.array_iterator = m_value.array->erase(first.m_it.array_iterator, + result.m_it.array_iterator = m_data.m_value.array->erase(first.m_it.array_iterator, last.m_it.array_iterator); break; } @@ -21741,7 +21878,7 @@ class basic_json // NOLINT(cppcoreguidelines-special-member-functions,hicpp-spec JSON_THROW(type_error::create(307, detail::concat("cannot use erase() with ", type_name()), this)); } - return m_value.object->erase(std::forward(key)); + return m_data.m_value.object->erase(std::forward(key)); } template < typename KeyType, detail::enable_if_t < @@ -21754,10 +21891,10 @@ class basic_json // NOLINT(cppcoreguidelines-special-member-functions,hicpp-spec JSON_THROW(type_error::create(307, detail::concat("cannot use erase() with ", type_name()), this)); } - const 
auto it = m_value.object->find(std::forward(key)); - if (it != m_value.object->end()) + const auto it = m_data.m_value.object->find(std::forward(key)); + if (it != m_data.m_value.object->end()) { - m_value.object->erase(it); + m_data.m_value.object->erase(it); return 1; } return 0; @@ -21795,7 +21932,7 @@ class basic_json // NOLINT(cppcoreguidelines-special-member-functions,hicpp-spec JSON_THROW(out_of_range::create(401, detail::concat("array index ", std::to_string(idx), " is out of range"), this)); } - m_value.array->erase(m_value.array->begin() + static_cast(idx)); + m_data.m_value.array->erase(m_data.m_value.array->begin() + static_cast(idx)); } else { @@ -21805,7 +21942,6 @@ class basic_json // NOLINT(cppcoreguidelines-special-member-functions,hicpp-spec /// @} - //////////// // lookup // //////////// @@ -21821,7 +21957,7 @@ class basic_json // NOLINT(cppcoreguidelines-special-member-functions,hicpp-spec if (is_object()) { - result.m_it.object_iterator = m_value.object->find(key); + result.m_it.object_iterator = m_data.m_value.object->find(key); } return result; @@ -21835,7 +21971,7 @@ class basic_json // NOLINT(cppcoreguidelines-special-member-functions,hicpp-spec if (is_object()) { - result.m_it.object_iterator = m_value.object->find(key); + result.m_it.object_iterator = m_data.m_value.object->find(key); } return result; @@ -21851,7 +21987,7 @@ class basic_json // NOLINT(cppcoreguidelines-special-member-functions,hicpp-spec if (is_object()) { - result.m_it.object_iterator = m_value.object->find(std::forward(key)); + result.m_it.object_iterator = m_data.m_value.object->find(std::forward(key)); } return result; @@ -21867,7 +22003,7 @@ class basic_json // NOLINT(cppcoreguidelines-special-member-functions,hicpp-spec if (is_object()) { - result.m_it.object_iterator = m_value.object->find(std::forward(key)); + result.m_it.object_iterator = m_data.m_value.object->find(std::forward(key)); } return result; @@ -21878,7 +22014,7 @@ class basic_json // NOLINT(cppcoreguidelines-special-member-functions,hicpp-spec size_type count(const typename object_t::key_type& key) const { // return 0 for all nonobject types - return is_object() ? m_value.object->count(key) : 0; + return is_object() ? m_data.m_value.object->count(key) : 0; } /// @brief returns the number of occurrences of a key in a JSON object @@ -21888,14 +22024,14 @@ class basic_json // NOLINT(cppcoreguidelines-special-member-functions,hicpp-spec size_type count(KeyType && key) const { // return 0 for all nonobject types - return is_object() ? m_value.object->count(std::forward(key)) : 0; + return is_object() ? 
m_data.m_value.object->count(std::forward(key)) : 0; } /// @brief check the existence of an element in a JSON object /// @sa https://json.nlohmann.me/api/basic_json/contains/ bool contains(const typename object_t::key_type& key) const { - return is_object() && m_value.object->find(key) != m_value.object->end(); + return is_object() && m_data.m_value.object->find(key) != m_data.m_value.object->end(); } /// @brief check the existence of an element in a JSON object @@ -21904,7 +22040,7 @@ class basic_json // NOLINT(cppcoreguidelines-special-member-functions,hicpp-spec detail::is_usable_as_basic_json_key_type::value, int> = 0> bool contains(KeyType && key) const { - return is_object() && m_value.object->find(std::forward(key)) != m_value.object->end(); + return is_object() && m_data.m_value.object->find(std::forward(key)) != m_data.m_value.object->end(); } /// @brief check the existence of an element in a JSON object given a JSON pointer @@ -21923,7 +22059,6 @@ class basic_json // NOLINT(cppcoreguidelines-special-member-functions,hicpp-spec /// @} - /////////////// // iterators // /////////////// @@ -22062,7 +22197,6 @@ class basic_json // NOLINT(cppcoreguidelines-special-member-functions,hicpp-spec /// @} - ////////////// // capacity // ////////////// @@ -22074,7 +22208,7 @@ class basic_json // NOLINT(cppcoreguidelines-special-member-functions,hicpp-spec /// @sa https://json.nlohmann.me/api/basic_json/empty/ bool empty() const noexcept { - switch (m_type) + switch (m_data.m_type) { case value_t::null: { @@ -22085,13 +22219,13 @@ class basic_json // NOLINT(cppcoreguidelines-special-member-functions,hicpp-spec case value_t::array: { // delegate call to array_t::empty() - return m_value.array->empty(); + return m_data.m_value.array->empty(); } case value_t::object: { // delegate call to object_t::empty() - return m_value.object->empty(); + return m_data.m_value.object->empty(); } case value_t::string: @@ -22113,7 +22247,7 @@ class basic_json // NOLINT(cppcoreguidelines-special-member-functions,hicpp-spec /// @sa https://json.nlohmann.me/api/basic_json/size/ size_type size() const noexcept { - switch (m_type) + switch (m_data.m_type) { case value_t::null: { @@ -22124,13 +22258,13 @@ class basic_json // NOLINT(cppcoreguidelines-special-member-functions,hicpp-spec case value_t::array: { // delegate call to array_t::size() - return m_value.array->size(); + return m_data.m_value.array->size(); } case value_t::object: { // delegate call to object_t::size() - return m_value.object->size(); + return m_data.m_value.object->size(); } case value_t::string: @@ -22152,18 +22286,18 @@ class basic_json // NOLINT(cppcoreguidelines-special-member-functions,hicpp-spec /// @sa https://json.nlohmann.me/api/basic_json/max_size/ size_type max_size() const noexcept { - switch (m_type) + switch (m_data.m_type) { case value_t::array: { // delegate call to array_t::max_size() - return m_value.array->max_size(); + return m_data.m_value.array->max_size(); } case value_t::object: { // delegate call to object_t::max_size() - return m_value.object->max_size(); + return m_data.m_value.object->max_size(); } case value_t::null: @@ -22184,7 +22318,6 @@ class basic_json // NOLINT(cppcoreguidelines-special-member-functions,hicpp-spec /// @} - /////////////// // modifiers // /////////////// @@ -22196,53 +22329,53 @@ class basic_json // NOLINT(cppcoreguidelines-special-member-functions,hicpp-spec /// @sa https://json.nlohmann.me/api/basic_json/clear/ void clear() noexcept { - switch (m_type) + switch (m_data.m_type) { case 
value_t::number_integer: { - m_value.number_integer = 0; + m_data.m_value.number_integer = 0; break; } case value_t::number_unsigned: { - m_value.number_unsigned = 0; + m_data.m_value.number_unsigned = 0; break; } case value_t::number_float: { - m_value.number_float = 0.0; + m_data.m_value.number_float = 0.0; break; } case value_t::boolean: { - m_value.boolean = false; + m_data.m_value.boolean = false; break; } case value_t::string: { - m_value.string->clear(); + m_data.m_value.string->clear(); break; } case value_t::binary: { - m_value.binary->clear(); + m_data.m_value.binary->clear(); break; } case value_t::array: { - m_value.array->clear(); + m_data.m_value.array->clear(); break; } case value_t::object: { - m_value.object->clear(); + m_data.m_value.object->clear(); break; } @@ -22266,15 +22399,15 @@ class basic_json // NOLINT(cppcoreguidelines-special-member-functions,hicpp-spec // transform null object into an array if (is_null()) { - m_type = value_t::array; - m_value = value_t::array; + m_data.m_type = value_t::array; + m_data.m_value = value_t::array; assert_invariant(); } // add element to array (move semantics) - const auto old_capacity = m_value.array->capacity(); - m_value.array->push_back(std::move(val)); - set_parent(m_value.array->back(), old_capacity); + const auto old_capacity = m_data.m_value.array->capacity(); + m_data.m_value.array->push_back(std::move(val)); + set_parent(m_data.m_value.array->back(), old_capacity); // if val is moved from, basic_json move constructor marks it null, so we do not call the destructor } @@ -22299,15 +22432,15 @@ class basic_json // NOLINT(cppcoreguidelines-special-member-functions,hicpp-spec // transform null object into an array if (is_null()) { - m_type = value_t::array; - m_value = value_t::array; + m_data.m_type = value_t::array; + m_data.m_value = value_t::array; assert_invariant(); } // add element to array - const auto old_capacity = m_value.array->capacity(); - m_value.array->push_back(val); - set_parent(m_value.array->back(), old_capacity); + const auto old_capacity = m_data.m_value.array->capacity(); + m_data.m_value.array->push_back(val); + set_parent(m_data.m_value.array->back(), old_capacity); } /// @brief add an object to an array @@ -22331,13 +22464,13 @@ class basic_json // NOLINT(cppcoreguidelines-special-member-functions,hicpp-spec // transform null object into an object if (is_null()) { - m_type = value_t::object; - m_value = value_t::object; + m_data.m_type = value_t::object; + m_data.m_value = value_t::object; assert_invariant(); } // add element to object - auto res = m_value.object->insert(val); + auto res = m_data.m_value.object->insert(val); set_parent(res.first->second); } @@ -22387,15 +22520,15 @@ class basic_json // NOLINT(cppcoreguidelines-special-member-functions,hicpp-spec // transform null object into an array if (is_null()) { - m_type = value_t::array; - m_value = value_t::array; + m_data.m_type = value_t::array; + m_data.m_value = value_t::array; assert_invariant(); } // add element to array (perfect forwarding) - const auto old_capacity = m_value.array->capacity(); - m_value.array->emplace_back(std::forward(args)...); - return set_parent(m_value.array->back(), old_capacity); + const auto old_capacity = m_data.m_value.array->capacity(); + m_data.m_value.array->emplace_back(std::forward(args)...); + return set_parent(m_data.m_value.array->back(), old_capacity); } /// @brief add an object to an object if key does not exist @@ -22412,13 +22545,13 @@ class basic_json // 
NOLINT(cppcoreguidelines-special-member-functions,hicpp-spec // transform null object into an object if (is_null()) { - m_type = value_t::object; - m_value = value_t::object; + m_data.m_type = value_t::object; + m_data.m_value = value_t::object; assert_invariant(); } // add element to array (perfect forwarding) - auto res = m_value.object->emplace(std::forward(args)...); + auto res = m_data.m_value.object->emplace(std::forward(args)...); set_parent(res.first->second); // create result iterator and set iterator to the result of emplace @@ -22436,14 +22569,14 @@ class basic_json // NOLINT(cppcoreguidelines-special-member-functions,hicpp-spec iterator insert_iterator(const_iterator pos, Args&& ... args) { iterator result(this); - JSON_ASSERT(m_value.array != nullptr); + JSON_ASSERT(m_data.m_value.array != nullptr); - auto insert_pos = std::distance(m_value.array->begin(), pos.m_it.array_iterator); - m_value.array->insert(pos.m_it.array_iterator, std::forward(args)...); - result.m_it.array_iterator = m_value.array->begin() + insert_pos; + auto insert_pos = std::distance(m_data.m_value.array->begin(), pos.m_it.array_iterator); + m_data.m_value.array->insert(pos.m_it.array_iterator, std::forward(args)...); + result.m_it.array_iterator = m_data.m_value.array->begin() + insert_pos; // This could have been written as: - // result.m_it.array_iterator = m_value.array->insert(pos.m_it.array_iterator, cnt, val); + // result.m_it.array_iterator = m_data.m_value.array->insert(pos.m_it.array_iterator, cnt, val); // but the return value of insert is missing in GCC 4.8, so it is written this way instead. set_parents(); @@ -22570,7 +22703,7 @@ class basic_json // NOLINT(cppcoreguidelines-special-member-functions,hicpp-spec JSON_THROW(invalid_iterator::create(202, "iterators first and last must point to objects", this)); } - m_value.object->insert(first.m_it.object_iterator, last.m_it.object_iterator); + m_data.m_value.object->insert(first.m_it.object_iterator, last.m_it.object_iterator); } /// @brief updates a JSON object from another object, overwriting existing keys @@ -22587,8 +22720,8 @@ class basic_json // NOLINT(cppcoreguidelines-special-member-functions,hicpp-spec // implicitly convert null value to an empty object if (is_null()) { - m_type = value_t::object; - m_value.object = create(); + m_data.m_type = value_t::object; + m_data.m_value.object = create(); assert_invariant(); } @@ -22613,16 +22746,16 @@ class basic_json // NOLINT(cppcoreguidelines-special-member-functions,hicpp-spec { if (merge_objects && it.value().is_object()) { - auto it2 = m_value.object->find(it.key()); - if (it2 != m_value.object->end()) + auto it2 = m_data.m_value.object->find(it.key()); + if (it2 != m_data.m_value.object->end()) { it2->second.update(it.value(), true); continue; } } - m_value.object->operator[](it.key()) = it.value(); + m_data.m_value.object->operator[](it.key()) = it.value(); #if JSON_DIAGNOSTICS - m_value.object->operator[](it.key()).m_parent = this; + m_data.m_value.object->operator[](it.key()).m_parent = this; #endif } } @@ -22632,12 +22765,12 @@ class basic_json // NOLINT(cppcoreguidelines-special-member-functions,hicpp-spec void swap(reference other) noexcept ( std::is_nothrow_move_constructible::value&& std::is_nothrow_move_assignable::value&& - std::is_nothrow_move_constructible::value&& + std::is_nothrow_move_constructible::value&& // NOLINT(cppcoreguidelines-noexcept-swap,performance-noexcept-swap) std::is_nothrow_move_assignable::value ) { - std::swap(m_type, other.m_type); - std::swap(m_value, 
other.m_value); + std::swap(m_data.m_type, other.m_data.m_type); + std::swap(m_data.m_value, other.m_data.m_value); set_parents(); other.set_parents(); @@ -22649,7 +22782,7 @@ class basic_json // NOLINT(cppcoreguidelines-special-member-functions,hicpp-spec friend void swap(reference left, reference right) noexcept ( std::is_nothrow_move_constructible::value&& std::is_nothrow_move_assignable::value&& - std::is_nothrow_move_constructible::value&& + std::is_nothrow_move_constructible::value&& // NOLINT(cppcoreguidelines-noexcept-swap,performance-noexcept-swap) std::is_nothrow_move_assignable::value ) { @@ -22658,13 +22791,13 @@ class basic_json // NOLINT(cppcoreguidelines-special-member-functions,hicpp-spec /// @brief exchanges the values /// @sa https://json.nlohmann.me/api/basic_json/swap/ - void swap(array_t& other) // NOLINT(bugprone-exception-escape) + void swap(array_t& other) // NOLINT(bugprone-exception-escape,cppcoreguidelines-noexcept-swap,performance-noexcept-swap) { // swap only works for arrays if (JSON_HEDLEY_LIKELY(is_array())) { using std::swap; - swap(*(m_value.array), other); + swap(*(m_data.m_value.array), other); } else { @@ -22674,13 +22807,13 @@ class basic_json // NOLINT(cppcoreguidelines-special-member-functions,hicpp-spec /// @brief exchanges the values /// @sa https://json.nlohmann.me/api/basic_json/swap/ - void swap(object_t& other) // NOLINT(bugprone-exception-escape) + void swap(object_t& other) // NOLINT(bugprone-exception-escape,cppcoreguidelines-noexcept-swap,performance-noexcept-swap) { // swap only works for objects if (JSON_HEDLEY_LIKELY(is_object())) { using std::swap; - swap(*(m_value.object), other); + swap(*(m_data.m_value.object), other); } else { @@ -22690,13 +22823,13 @@ class basic_json // NOLINT(cppcoreguidelines-special-member-functions,hicpp-spec /// @brief exchanges the values /// @sa https://json.nlohmann.me/api/basic_json/swap/ - void swap(string_t& other) // NOLINT(bugprone-exception-escape) + void swap(string_t& other) // NOLINT(bugprone-exception-escape,cppcoreguidelines-noexcept-swap,performance-noexcept-swap) { // swap only works for strings if (JSON_HEDLEY_LIKELY(is_string())) { using std::swap; - swap(*(m_value.string), other); + swap(*(m_data.m_value.string), other); } else { @@ -22706,13 +22839,13 @@ class basic_json // NOLINT(cppcoreguidelines-special-member-functions,hicpp-spec /// @brief exchanges the values /// @sa https://json.nlohmann.me/api/basic_json/swap/ - void swap(binary_t& other) // NOLINT(bugprone-exception-escape) + void swap(binary_t& other) // NOLINT(bugprone-exception-escape,cppcoreguidelines-noexcept-swap,performance-noexcept-swap) { // swap only works for strings if (JSON_HEDLEY_LIKELY(is_binary())) { using std::swap; - swap(*(m_value.binary), other); + swap(*(m_data.m_value.binary), other); } else { @@ -22728,7 +22861,7 @@ class basic_json // NOLINT(cppcoreguidelines-special-member-functions,hicpp-spec if (JSON_HEDLEY_LIKELY(is_binary())) { using std::swap; - swap(*(m_value.binary), other); + swap(*(m_data.m_value.binary), other); } else { @@ -22756,31 +22889,31 @@ class basic_json // NOLINT(cppcoreguidelines-special-member-functions,hicpp-spec switch (lhs_type) \ { \ case value_t::array: \ - return (*lhs.m_value.array) op (*rhs.m_value.array); \ + return (*lhs.m_data.m_value.array) op (*rhs.m_data.m_value.array); \ \ case value_t::object: \ - return (*lhs.m_value.object) op (*rhs.m_value.object); \ + return (*lhs.m_data.m_value.object) op (*rhs.m_data.m_value.object); \ \ case value_t::null: \ return 
(null_result); \ \ case value_t::string: \ - return (*lhs.m_value.string) op (*rhs.m_value.string); \ + return (*lhs.m_data.m_value.string) op (*rhs.m_data.m_value.string); \ \ case value_t::boolean: \ - return (lhs.m_value.boolean) op (rhs.m_value.boolean); \ + return (lhs.m_data.m_value.boolean) op (rhs.m_data.m_value.boolean); \ \ case value_t::number_integer: \ - return (lhs.m_value.number_integer) op (rhs.m_value.number_integer); \ + return (lhs.m_data.m_value.number_integer) op (rhs.m_data.m_value.number_integer); \ \ case value_t::number_unsigned: \ - return (lhs.m_value.number_unsigned) op (rhs.m_value.number_unsigned); \ + return (lhs.m_data.m_value.number_unsigned) op (rhs.m_data.m_value.number_unsigned); \ \ case value_t::number_float: \ - return (lhs.m_value.number_float) op (rhs.m_value.number_float); \ + return (lhs.m_data.m_value.number_float) op (rhs.m_data.m_value.number_float); \ \ case value_t::binary: \ - return (*lhs.m_value.binary) op (*rhs.m_value.binary); \ + return (*lhs.m_data.m_value.binary) op (*rhs.m_data.m_value.binary); \ \ case value_t::discarded: \ default: \ @@ -22789,27 +22922,27 @@ class basic_json // NOLINT(cppcoreguidelines-special-member-functions,hicpp-spec } \ else if (lhs_type == value_t::number_integer && rhs_type == value_t::number_float) \ { \ - return static_cast(lhs.m_value.number_integer) op rhs.m_value.number_float; \ + return static_cast(lhs.m_data.m_value.number_integer) op rhs.m_data.m_value.number_float; \ } \ else if (lhs_type == value_t::number_float && rhs_type == value_t::number_integer) \ { \ - return lhs.m_value.number_float op static_cast(rhs.m_value.number_integer); \ + return lhs.m_data.m_value.number_float op static_cast(rhs.m_data.m_value.number_integer); \ } \ else if (lhs_type == value_t::number_unsigned && rhs_type == value_t::number_float) \ { \ - return static_cast(lhs.m_value.number_unsigned) op rhs.m_value.number_float; \ + return static_cast(lhs.m_data.m_value.number_unsigned) op rhs.m_data.m_value.number_float; \ } \ else if (lhs_type == value_t::number_float && rhs_type == value_t::number_unsigned) \ { \ - return lhs.m_value.number_float op static_cast(rhs.m_value.number_unsigned); \ + return lhs.m_data.m_value.number_float op static_cast(rhs.m_data.m_value.number_unsigned); \ } \ else if (lhs_type == value_t::number_unsigned && rhs_type == value_t::number_integer) \ { \ - return static_cast(lhs.m_value.number_unsigned) op rhs.m_value.number_integer; \ + return static_cast(lhs.m_data.m_value.number_unsigned) op rhs.m_data.m_value.number_integer; \ } \ else if (lhs_type == value_t::number_integer && rhs_type == value_t::number_unsigned) \ { \ - return lhs.m_value.number_integer op static_cast(rhs.m_value.number_unsigned); \ + return lhs.m_data.m_value.number_integer op static_cast(rhs.m_data.m_value.number_unsigned); \ } \ else if(compares_unordered(lhs, rhs))\ {\ @@ -22826,8 +22959,8 @@ class basic_json // NOLINT(cppcoreguidelines-special-member-functions,hicpp-spec // an operation is computed as an odd number of inverses of others static bool compares_unordered(const_reference lhs, const_reference rhs, bool inverse = false) noexcept { - if ((lhs.is_number_float() && std::isnan(lhs.m_value.number_float) && rhs.is_number()) - || (rhs.is_number_float() && std::isnan(rhs.m_value.number_float) && lhs.is_number())) + if ((lhs.is_number_float() && std::isnan(lhs.m_data.m_value.number_float) && rhs.is_number()) + || (rhs.is_number_float() && std::isnan(rhs.m_data.m_value.number_float) && lhs.is_number())) { return true; } 
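The compares_unordered helper above is what gives these operators IEEE-754 semantics for NaN: any comparison in which one operand is a NaN number is treated as unordered, so even the operators that are "computed as an odd number of inverses of others" return false rather than the inverse of <. A standalone illustration (not part of the patch):

#include <cmath>
#include <nlohmann/json.hpp>

int main() {
    const nlohmann::json a = std::nan(""); // a number_float holding NaN
    const nlohmann::json b = 1.0;

    const bool self_eq = (a == a); // false: NaN is not equal to itself
    const bool lt      = (a <  b); // false: unordered
    const bool ge      = (a >= b); // false as well, because of the
                                   // compares_unordered check; a plain
                                   // !(a < b) would wrongly yield true
    return (self_eq || lt || ge) ? 1 : 0; // returns 0: all three are false
}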
@@ -23171,7 +23304,6 @@ class basic_json // NOLINT(cppcoreguidelines-special-member-functions,hicpp-spec #endif // JSON_NO_IO /// @} - ///////////////////// // deserialization // ///////////////////// @@ -23328,7 +23460,7 @@ class basic_json // NOLINT(cppcoreguidelines-special-member-functions,hicpp-spec JSON_HEDLEY_RETURNS_NON_NULL const char* type_name() const noexcept { - switch (m_type) + switch (m_data.m_type) { case value_t::null: return "null"; @@ -23352,17 +23484,43 @@ class basic_json // NOLINT(cppcoreguidelines-special-member-functions,hicpp-spec } } - JSON_PRIVATE_UNLESS_TESTED: ////////////////////// // member variables // ////////////////////// - /// the type of the current element - value_t m_type = value_t::null; + struct data + { + /// the type of the current element + value_t m_type = value_t::null; - /// the value of the current element - json_value m_value = {}; + /// the value of the current element + json_value m_value = {}; + + data(const value_t v) + : m_type(v), m_value(v) + { + } + + data(size_type cnt, const basic_json& val) + : m_type(value_t::array) + { + m_value.array = create(cnt, val); + } + + data() noexcept = default; + data(data&&) noexcept = default; + data(const data&) noexcept = delete; + data& operator=(data&&) noexcept = delete; + data& operator=(const data&) noexcept = delete; + + ~data() noexcept + { + m_value.destroy(m_type); + } + }; + + data m_data = {}; #if JSON_DIAGNOSTICS /// a pointer to a parent value (for debugging purposes) @@ -23543,7 +23701,6 @@ class basic_json // NOLINT(cppcoreguidelines-special-member-functions,hicpp-spec return from_cbor(ptr, ptr + len, strict, allow_exceptions, tag_handler); } - JSON_HEDLEY_WARN_UNUSED_RESULT JSON_HEDLEY_DEPRECATED_FOR(3.8.0, from_cbor(ptr, ptr + len)) static basic_json from_cbor(detail::span_input_adapter&& i, @@ -23667,7 +23824,6 @@ class basic_json // NOLINT(cppcoreguidelines-special-member-functions,hicpp-spec return res ? result : basic_json(value_t::discarded); } - /// @brief create a JSON value from an input in BJData format /// @sa https://json.nlohmann.me/api/basic_json/from_bjdata/ template @@ -23890,7 +24046,7 @@ class basic_json // NOLINT(cppcoreguidelines-special-member-functions,hicpp-spec } // make sure the top element of the pointer exists - json_pointer top_pointer = ptr.top(); + json_pointer const top_pointer = ptr.top(); if (top_pointer != ptr) { result.at(top_pointer); @@ -23902,7 +24058,7 @@ class basic_json // NOLINT(cppcoreguidelines-special-member-functions,hicpp-spec // parent must exist when performing patch add per RFC6902 specs basic_json& parent = result.at(ptr); - switch (parent.m_type) + switch (parent.m_data.m_type) { case value_t::null: case value_t::object: @@ -23948,7 +24104,7 @@ class basic_json // NOLINT(cppcoreguidelines-special-member-functions,hicpp-spec }; // wrapper for "remove" operation; remove value at ptr - const auto operation_remove = [this, &result](json_pointer & ptr) + const auto operation_remove = [this, & result](json_pointer & ptr) { // get reference to parent of JSON pointer ptr const auto last_path = ptr.back(); @@ -23991,13 +24147,13 @@ class basic_json // NOLINT(cppcoreguidelines-special-member-functions,hicpp-spec bool string_type) -> basic_json & { // find value - auto it = val.m_value.object->find(member); + auto it = val.m_data.m_value.object->find(member); // context-sensitive error message - const auto error_msg = (op == "op") ? "operation" : detail::concat("operation '", op, '\''); + const auto error_msg = (op == "op") ? 
"operation" : detail::concat("operation '", op, '\''); // NOLINT(bugprone-unused-local-non-trivial-variable) // check if desired value is present - if (JSON_HEDLEY_UNLIKELY(it == val.m_value.object->end())) + if (JSON_HEDLEY_UNLIKELY(it == val.m_data.m_value.object->end())) { // NOLINTNEXTLINE(performance-inefficient-string-concatenation) JSON_THROW(parse_error::create(105, 0, detail::concat(error_msg, " must have member '", member, "'"), &val)); @@ -24052,7 +24208,7 @@ class basic_json // NOLINT(cppcoreguidelines-special-member-functions,hicpp-spec json_pointer from_ptr(from_path); // the "from" location must exist - use at() - basic_json v = result.at(from_ptr); + basic_json const v = result.at(from_ptr); // The move operation is functionally identical to a // "remove" operation on the "from" location, followed @@ -24069,7 +24225,7 @@ class basic_json // NOLINT(cppcoreguidelines-special-member-functions,hicpp-spec const json_pointer from_ptr(from_path); // the "from" location must exist - use at() - basic_json v = result.at(from_ptr); + basic_json const v = result.at(from_ptr); // The copy is functionally identical to an "add" // operation at the target location using the value @@ -24311,7 +24467,11 @@ inline namespace json_literals /// @brief user-defined string literal for JSON values /// @sa https://json.nlohmann.me/api/basic_json/operator_literal_json/ JSON_HEDLEY_NON_NULL(1) -inline nlohmann::json operator "" _json(const char* s, std::size_t n) +#if !defined(JSON_HEDLEY_GCC_VERSION) || JSON_HEDLEY_GCC_VERSION_CHECK(4,9,0) + inline nlohmann::json operator ""_json(const char* s, std::size_t n) +#else + inline nlohmann::json operator "" _json(const char* s, std::size_t n) +#endif { return nlohmann::json::parse(s, s + n); } @@ -24319,7 +24479,11 @@ inline nlohmann::json operator "" _json(const char* s, std::size_t n) /// @brief user-defined string literal for JSON pointer /// @sa https://json.nlohmann.me/api/basic_json/operator_literal_json_pointer/ JSON_HEDLEY_NON_NULL(1) -inline nlohmann::json::json_pointer operator "" _json_pointer(const char* s, std::size_t n) +#if !defined(JSON_HEDLEY_GCC_VERSION) || JSON_HEDLEY_GCC_VERSION_CHECK(4,9,0) + inline nlohmann::json::json_pointer operator ""_json_pointer(const char* s, std::size_t n) +#else + inline nlohmann::json::json_pointer operator "" _json_pointer(const char* s, std::size_t n) +#endif { return nlohmann::json::json_pointer(std::string(s, n)); } @@ -24338,7 +24502,7 @@ namespace std // NOLINT(cert-dcl58-cpp) /// @brief hash value for JSON objects /// @sa https://json.nlohmann.me/api/basic_json/std_hash/ NLOHMANN_BASIC_JSON_TPL_DECLARATION -struct hash +struct hash // NOLINT(cert-dcl58-cpp) { std::size_t operator()(const nlohmann::NLOHMANN_BASIC_JSON_TPL& j) const { @@ -24371,8 +24535,8 @@ struct less< ::nlohmann::detail::value_t> // do not remove the space after '<', /// @brief exchanges the values of two JSON objects /// @sa https://json.nlohmann.me/api/basic_json/std_swap/ NLOHMANN_BASIC_JSON_TPL_DECLARATION -inline void swap(nlohmann::NLOHMANN_BASIC_JSON_TPL& j1, nlohmann::NLOHMANN_BASIC_JSON_TPL& j2) noexcept( // NOLINT(readability-inconsistent-declaration-parameter-name) - is_nothrow_move_constructible::value&& // NOLINT(misc-redundant-expression) +inline void swap(nlohmann::NLOHMANN_BASIC_JSON_TPL& j1, nlohmann::NLOHMANN_BASIC_JSON_TPL& j2) noexcept( // NOLINT(readability-inconsistent-declaration-parameter-name, cert-dcl58-cpp) + is_nothrow_move_constructible::value&& // 
NOLINT(misc-redundant-expression,cppcoreguidelines-noexcept-swap,performance-noexcept-swap) is_nothrow_move_assignable::value) { j1.swap(j2); @@ -24383,17 +24547,22 @@ inline void swap(nlohmann::NLOHMANN_BASIC_JSON_TPL& j1, nlohmann::NLOHMANN_BASIC } // namespace std #if JSON_USE_GLOBAL_UDLS - using nlohmann::literals::json_literals::operator "" _json; // NOLINT(misc-unused-using-decls,google-global-names-in-headers) - using nlohmann::literals::json_literals::operator "" _json_pointer; //NOLINT(misc-unused-using-decls,google-global-names-in-headers) + #if !defined(JSON_HEDLEY_GCC_VERSION) || JSON_HEDLEY_GCC_VERSION_CHECK(4,9,0) + using nlohmann::literals::json_literals::operator ""_json; // NOLINT(misc-unused-using-decls,google-global-names-in-headers) + using nlohmann::literals::json_literals::operator ""_json_pointer; //NOLINT(misc-unused-using-decls,google-global-names-in-headers) + #else + using nlohmann::literals::json_literals::operator "" _json; // NOLINT(misc-unused-using-decls,google-global-names-in-headers) + using nlohmann::literals::json_literals::operator "" _json_pointer; //NOLINT(misc-unused-using-decls,google-global-names-in-headers) + #endif #endif // #include // __ _____ _____ _____ // __| | __| | | | JSON for Modern C++ -// | | |__ | | | | | | version 3.11.2 +// | | |__ | | | | | | version 3.11.3 // |_____|_____|_____|_|___| https://github.com/nlohmann/json // -// SPDX-FileCopyrightText: 2013-2022 Niels Lohmann +// SPDX-FileCopyrightText: 2013-2023 Niels Lohmann // SPDX-License-Identifier: MIT @@ -24428,16 +24597,17 @@ inline void swap(nlohmann::NLOHMANN_BASIC_JSON_TPL& j1, nlohmann::NLOHMANN_BASIC #undef JSON_HAS_EXPERIMENTAL_FILESYSTEM #undef JSON_HAS_THREE_WAY_COMPARISON #undef JSON_HAS_RANGES + #undef JSON_HAS_STATIC_RTTI #undef JSON_USE_LEGACY_DISCARDED_VALUE_COMPARISON #endif // #include // __ _____ _____ _____ // __| | __| | | | JSON for Modern C++ -// | | |__ | | | | | | version 3.11.2 +// | | |__ | | | | | | version 3.11.3 // |_____|_____|_____|_|___| https://github.com/nlohmann/json // -// SPDX-FileCopyrightText: 2013-2022 Niels Lohmann +// SPDX-FileCopyrightText: 2013-2023 Niels Lohmann // SPDX-License-Identifier: MIT From ccf58aa3ec4d20b10162ba40898dc038ad4c3fad Mon Sep 17 00:00:00 2001 From: slaren Date: Wed, 20 Mar 2024 14:42:59 +0100 Subject: [PATCH 25/28] cuda : refactor to remove global resources (#6170) * cuda : refactor to remove global resources --- ggml-cuda.cu | 3536 +++++++++++++++++++++++++------------------------- 1 file changed, 1760 insertions(+), 1776 deletions(-) diff --git a/ggml-cuda.cu b/ggml-cuda.cu index 139025588..e70b79c30 100644 --- a/ggml-cuda.cu +++ b/ggml-cuda.cu @@ -12,6 +12,7 @@ #include "ggml-common.h" #include +#include #include #include #include @@ -19,12 +20,13 @@ #include #include #include +#include +#include +#include #include #include #include #include -#include -#include // stringize macro for converting __CUDA_ARCH_LIST__ (list of integers) to string #define STRINGIZE_IMPL(...) 
#__VA_ARGS__ @@ -52,6 +54,7 @@ #define __shfl_xor_sync(mask, var, laneMask, width) __shfl_xor(var, laneMask, width) #define cublasComputeType_t hipblasDatatype_t //deprecated, new hipblasComputeType_t not in 5.6 #define cublasCreate hipblasCreate +#define cublasDestroy hipblasDestroy #define cublasGemmEx hipblasGemmEx #define cublasGemmBatchedEx hipblasGemmBatchedEx #define cublasGemmStridedBatchedEx hipblasGemmStridedBatchedEx @@ -108,6 +111,7 @@ #define cudaOccupancyMaxPotentialBlockSize hipOccupancyMaxPotentialBlockSize #define cudaSetDevice hipSetDevice #define cudaStreamCreateWithFlags hipStreamCreateWithFlags +#define cudaStreamDestroy hipStreamDestroy #define cudaStreamFireAndForget hipStreamFireAndForget #define cudaStreamNonBlocking hipStreamNonBlocking #define cudaStreamPerThread hipStreamPerThread @@ -151,8 +155,6 @@ #define CC_RDNA2 (CC_OFFSET_AMD + 1030) #define CC_RDNA3 (CC_OFFSET_AMD + 1100) -#define GGML_CUDA_MAX_NODES 8192 - // define this if you want to always fallback to MMQ kernels and not use cuBLAS for matrix multiplication // on modern hardware, using cuBLAS is recommended as it utilizes F16 tensor cores which are very performant // for large computational tasks. the drawback is that this requires some extra amount of VRAM: @@ -170,6 +172,1214 @@ #define MMVQ_MAX_BATCH_SIZE 8 // max batch size to use MMVQ kernels #define MMQ_MAX_BATCH_SIZE 32 // max batch size to use MMQ kernels when tensor cores are available +#define MATRIX_ROW_PADDING 512 // last row of quant. matrices is a multiple of this to avoid out-of-bounds memory accesses + + +#if defined(_MSC_VER) +#pragma warning(disable: 4244 4267) // possible loss of data +#endif + +static_assert(sizeof(half) == sizeof(ggml_fp16_t), "wrong fp16 size"); + +[[noreturn]] +static void ggml_cuda_error(const char * stmt, const char * func, const char * file, const int line, const char * msg) { + int id = -1; // in case cudaGetDevice fails + cudaGetDevice(&id); + + fprintf(stderr, "CUDA error: %s\n", msg); + fprintf(stderr, " current device: %d, in function %s at %s:%d\n", id, func, file, line); + fprintf(stderr, " %s\n", stmt); + // abort with GGML_ASSERT to get a stack trace + GGML_ASSERT(!"CUDA error"); +} + +#define CUDA_CHECK_GEN(err, success, error_fn) \ + do { \ + auto err_ = (err); \ + if (err_ != (success)) { \ + ggml_cuda_error(#err, __func__, __FILE__, __LINE__, error_fn(err_)); \ + } \ + } while (0) + +#define CUDA_CHECK(err) CUDA_CHECK_GEN(err, cudaSuccess, cudaGetErrorString) + +#if CUDART_VERSION >= 12000 + static const char * cublas_get_error_str(const cublasStatus_t err) { + return cublasGetStatusString(err); + } +#else + static const char * cublas_get_error_str(const cublasStatus_t err) { + switch (err) { + case CUBLAS_STATUS_SUCCESS: return "CUBLAS_STATUS_SUCCESS"; + case CUBLAS_STATUS_NOT_INITIALIZED: return "CUBLAS_STATUS_NOT_INITIALIZED"; + case CUBLAS_STATUS_ALLOC_FAILED: return "CUBLAS_STATUS_ALLOC_FAILED"; + case CUBLAS_STATUS_INVALID_VALUE: return "CUBLAS_STATUS_INVALID_VALUE"; + case CUBLAS_STATUS_ARCH_MISMATCH: return "CUBLAS_STATUS_ARCH_MISMATCH"; + case CUBLAS_STATUS_MAPPING_ERROR: return "CUBLAS_STATUS_MAPPING_ERROR"; + case CUBLAS_STATUS_EXECUTION_FAILED: return "CUBLAS_STATUS_EXECUTION_FAILED"; + case CUBLAS_STATUS_INTERNAL_ERROR: return "CUBLAS_STATUS_INTERNAL_ERROR"; + case CUBLAS_STATUS_NOT_SUPPORTED: return "CUBLAS_STATUS_NOT_SUPPORTED"; + default: return "unknown error"; + } + } +#endif // CUDART_VERSION >= 12000 + +#define CUBLAS_CHECK(err) CUDA_CHECK_GEN(err, CUBLAS_STATUS_SUCCESS, 
cublas_get_error_str) + +#if !defined(GGML_USE_HIPBLAS) +static const char * cu_get_error_str(CUresult err) { + const char * err_str; + cuGetErrorString(err, &err_str); + return err_str; +} +#define CU_CHECK(err) CUDA_CHECK_GEN(err, CUDA_SUCCESS, cu_get_error_str) +#endif + +#if CUDART_VERSION >= 11100 +#define GGML_CUDA_ASSUME(x) __builtin_assume(x) +#else +#define GGML_CUDA_ASSUME(x) +#endif // CUDART_VERSION >= 11100 + + +#define GGML_CUDA_MAX_STREAMS 8 + +struct ggml_tensor_extra_gpu { + void * data_device[GGML_CUDA_MAX_DEVICES]; // 1 pointer for each device for split tensors + cudaEvent_t events[GGML_CUDA_MAX_DEVICES][GGML_CUDA_MAX_STREAMS]; // events for synchronizing multiple GPUs +}; + +// this is faster on Windows +// probably because the Windows CUDA libraries forget to make this check before invoking the drivers +static void ggml_cuda_set_device(const int device) { + int current_device; + CUDA_CHECK(cudaGetDevice(&current_device)); + + if (device == current_device) { + return; + } + + CUDA_CHECK(cudaSetDevice(device)); +} + +static int ggml_cuda_get_device() { + int id; + CUDA_CHECK(cudaGetDevice(&id)); + return id; +} + +struct ggml_cuda_device_info { + int device_count; + + struct cuda_device_info { + int cc; // compute capability + size_t smpb; // max. shared memory per block + bool vmm; // virtual memory support + size_t vmm_granularity; // granularity of virtual memory + size_t total_vram; + }; + + cuda_device_info devices[GGML_CUDA_MAX_DEVICES] = {}; + + std::array<float, GGML_CUDA_MAX_DEVICES> default_tensor_split = {}; +}; + +static ggml_cuda_device_info ggml_cuda_init() { +#ifdef __HIP_PLATFORM_AMD__ + // Workaround for a rocBLAS bug when using multiple graphics cards: + // https://github.com/ROCmSoftwarePlatform/rocBLAS/issues/1346 + rocblas_initialize(); + CUDA_CHECK(cudaDeviceSynchronize()); +#endif + + ggml_cuda_device_info info = {}; + + if (cudaGetDeviceCount(&info.device_count) != cudaSuccess) { + fprintf(stderr, "%s: no " GGML_CUDA_NAME " devices found, " GGML_CUDA_NAME " will be disabled\n", __func__); + return info; + } + + GGML_ASSERT(info.device_count <= GGML_CUDA_MAX_DEVICES); + + int64_t total_vram = 0; +#if defined(GGML_CUDA_FORCE_MMQ) + fprintf(stderr, "%s: GGML_CUDA_FORCE_MMQ: yes\n", __func__); +#else + fprintf(stderr, "%s: GGML_CUDA_FORCE_MMQ: no\n", __func__); +#endif +#if defined(CUDA_USE_TENSOR_CORES) + fprintf(stderr, "%s: CUDA_USE_TENSOR_CORES: yes\n", __func__); +#else + fprintf(stderr, "%s: CUDA_USE_TENSOR_CORES: no\n", __func__); +#endif + fprintf(stderr, "%s: found %d " GGML_CUDA_NAME " devices:\n", __func__, info.device_count); + for (int id = 0; id < info.device_count; ++id) { + int device_vmm = 0; + +#if !defined(GGML_USE_HIPBLAS) + CUdevice device; + CU_CHECK(cuDeviceGet(&device, id)); + CU_CHECK(cuDeviceGetAttribute(&device_vmm, CU_DEVICE_ATTRIBUTE_VIRTUAL_MEMORY_MANAGEMENT_SUPPORTED, device)); + + if (device_vmm) { + CUmemAllocationProp alloc_prop = {}; + alloc_prop.type = CU_MEM_ALLOCATION_TYPE_PINNED; + alloc_prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE; + alloc_prop.location.id = id; + CU_CHECK(cuMemGetAllocationGranularity(&info.devices[id].vmm_granularity, &alloc_prop, CU_MEM_ALLOC_GRANULARITY_RECOMMENDED)); + } +#endif // !defined(GGML_USE_HIPBLAS) + info.devices[id].vmm = !!device_vmm; + + cudaDeviceProp prop; + CUDA_CHECK(cudaGetDeviceProperties(&prop, id)); + fprintf(stderr, " Device %d: %s, compute capability %d.%d, VMM: %s\n", id, prop.name, prop.major, prop.minor, device_vmm ?
"yes" : "no"); + + info.default_tensor_split[id] = total_vram; + total_vram += prop.totalGlobalMem; + +#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) + info.devices[id].cc = 100*prop.major + 10*prop.minor + CC_OFFSET_AMD; +#else + info.devices[id].cc = 100*prop.major + 10*prop.minor; +#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) + info.devices[id].smpb = prop.sharedMemPerBlock; + } + + for (int id = 0; id < info.device_count; ++id) { + info.default_tensor_split[id] /= total_vram; + } + + // configure logging to stdout + // CUBLAS_CHECK(cublasLoggerConfigure(1, 1, 0, nullptr)); + + return info; +} + +static const ggml_cuda_device_info & get_cuda_global_info() { + static ggml_cuda_device_info info = ggml_cuda_init(); + return info; +} + +// #define DEBUG_CUDA_MALLOC + +// buffer pool for cuda (legacy) +struct ggml_cuda_pool { + virtual ~ggml_cuda_pool() = default; + + virtual void * alloc(size_t size, size_t * actual_size) = 0; + virtual void free(void * ptr, size_t size) = 0; + + ggml_cuda_pool() = default; + ggml_cuda_pool(const ggml_cuda_pool &) = delete; + ggml_cuda_pool(ggml_cuda_pool &&) = delete; + ggml_cuda_pool& operator=(const ggml_cuda_pool &) = delete; + ggml_cuda_pool& operator=(ggml_cuda_pool &&) = delete; +}; + +struct ggml_cuda_pool_leg : public ggml_cuda_pool { + static const int MAX_BUFFERS = 256; + + int device; + struct ggml_cuda_buffer { + void * ptr = nullptr; + size_t size = 0; + }; + + ggml_cuda_buffer buffer_pool[MAX_BUFFERS] = {}; + size_t pool_size = 0; + + explicit ggml_cuda_pool_leg(int device) : + device(device) { + } + + ~ggml_cuda_pool_leg() { + ggml_cuda_set_device(device); + for (int i = 0; i < MAX_BUFFERS; ++i) { + ggml_cuda_buffer & b = buffer_pool[i]; + if (b.ptr != nullptr) { + CUDA_CHECK(cudaFree(b.ptr)); + pool_size -= b.size; + } + } + GGML_ASSERT(pool_size == 0); + } + + void * alloc(size_t size, size_t * actual_size) override { +#ifdef DEBUG_CUDA_MALLOC + int nnz = 0; + size_t max_size = 0; +#endif + size_t best_diff = 1ull << 36; + int ibest = -1; + for (int i = 0; i < MAX_BUFFERS; ++i) { + ggml_cuda_buffer& b = buffer_pool[i]; + if (b.ptr != nullptr) { +#ifdef DEBUG_CUDA_MALLOC + ++nnz; + if (b.size > max_size) max_size = b.size; +#endif + if (b.size >= size) { + size_t diff = b.size - size; + if (diff < best_diff) { + best_diff = diff; + ibest = i; + if (!best_diff) { + void * ptr = b.ptr; + *actual_size = b.size; + b.ptr = nullptr; + b.size = 0; + return ptr; + } + } + } + } + } + if (ibest >= 0) { + ggml_cuda_buffer& b = buffer_pool[ibest]; + void * ptr = b.ptr; + *actual_size = b.size; + b.ptr = nullptr; + b.size = 0; + return ptr; + } + void * ptr; + size_t look_ahead_size = (size_t) (1.05 * size); + look_ahead_size = 256 * ((look_ahead_size + 255)/256); + ggml_cuda_set_device(device); + CUDA_CHECK(cudaMalloc((void **) &ptr, look_ahead_size)); + *actual_size = look_ahead_size; + pool_size += look_ahead_size; +#ifdef DEBUG_CUDA_MALLOC + fprintf(stderr, "%s[%d]: %d buffers, max_size = %u MB, pool_size = %u MB, requested %u MB\n", __func__, device, nnz, + (uint32_t)(max_size/1024/1024), (uint32_t)(pool_size/1024/1024), (uint32_t)(size/1024/1024)); +#endif + return ptr; + } + + void free(void * ptr, size_t size) override { + for (int i = 0; i < MAX_BUFFERS; ++i) { + ggml_cuda_buffer& b = buffer_pool[i]; + if (b.ptr == nullptr) { + b.ptr = ptr; + b.size = size; + return; + } + } + fprintf(stderr, "WARNING: cuda buffer pool full, increase MAX_CUDA_BUFFERS\n"); + ggml_cuda_set_device(device); + 
CUDA_CHECK(cudaFree(ptr)); + pool_size -= size; + } +}; + +// pool with virtual memory +#if !defined(GGML_USE_HIPBLAS) +struct ggml_cuda_pool_vmm : public ggml_cuda_pool { + static const size_t CUDA_POOL_VMM_MAX_SIZE = 1ull << 35; // 32 GB + + int device; + CUdeviceptr pool_addr = 0; + size_t pool_used = 0; + size_t pool_size = 0; + size_t granularity; + + explicit ggml_cuda_pool_vmm(int device) : + device(device), + granularity(get_cuda_global_info().devices[device].vmm_granularity) { + } + + ~ggml_cuda_pool_vmm() { + if (pool_addr != 0) { + CU_CHECK(cuMemUnmap(pool_addr, pool_size)); + CU_CHECK(cuMemAddressFree(pool_addr, CUDA_POOL_VMM_MAX_SIZE)); + } + } + + void * alloc(size_t size, size_t * actual_size) override { + // round up the allocation size to the alignment to ensure that all allocations are aligned for all data types + const size_t alignment = 128; + size = alignment * ((size + alignment - 1) / alignment); + + size_t avail = pool_size - pool_used; + + if (size > avail) { + // round up to the next multiple of the granularity + size_t reserve_size = size - avail; + reserve_size = granularity * ((reserve_size + granularity - 1) / granularity); + + GGML_ASSERT(pool_size + reserve_size <= CUDA_POOL_VMM_MAX_SIZE); + + // allocate more physical memory + CUmemAllocationProp prop = {}; + prop.type = CU_MEM_ALLOCATION_TYPE_PINNED; + prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE; + prop.location.id = device; + CUmemGenericAllocationHandle handle; + CU_CHECK(cuMemCreate(&handle, reserve_size, &prop, 0)); + + // reserve virtual address space (if not already reserved) + if (pool_addr == 0) { + CU_CHECK(cuMemAddressReserve(&pool_addr, CUDA_POOL_VMM_MAX_SIZE, 0, 0, 0)); + } + + // map at the end of the pool + CU_CHECK(cuMemMap(pool_addr + pool_size, reserve_size, 0, handle, 0)); + + // the memory allocation handle is no longer needed after mapping + CU_CHECK(cuMemRelease(handle)); + + // set access + CUmemAccessDesc access = {}; + access.location.type = CU_MEM_LOCATION_TYPE_DEVICE; + access.location.id = device; + access.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE; + CU_CHECK(cuMemSetAccess(pool_addr + pool_size, reserve_size, &access, 1)); + + // add to the pool + pool_size += reserve_size; + + //printf("cuda pool[%d]: size increased to %llu MB (reserved %llu MB)\n", + // id, (unsigned long long) (pool_size[id]/1024/1024), + // (unsigned long long) (reserve_size/1024/1024)); + } + + GGML_ASSERT(pool_addr != 0); + + void * ptr = (void *) (pool_addr + pool_used); + *actual_size = size; + pool_used += size; + +#ifdef DEBUG_CUDA_MALLOC + printf("cuda pool[%d]: allocated %llu bytes at %llx\n", device, (unsigned long long) size, ptr); +#endif + + return ptr; + } + + void free(void * ptr, size_t size) override { +#ifdef DEBUG_CUDA_MALLOC + printf("cuda pool[%d]: freed %llu bytes at %llx\n", device, (unsigned long long) size, ptr); +#endif + + pool_used -= size; + + // all deallocations must be in reverse order of the allocations + GGML_ASSERT(ptr == (void *) (pool_addr + pool_used)); + } +}; +#endif // !defined(GGML_USE_HIPBLAS) + +template +struct ggml_cuda_pool_alloc { + ggml_cuda_pool * pool = nullptr; + T * ptr = nullptr; + size_t actual_size = 0; + + ggml_cuda_pool_alloc() = default; + + explicit ggml_cuda_pool_alloc(ggml_cuda_pool & pool) : pool(&pool) { + } + + ggml_cuda_pool_alloc(ggml_cuda_pool & pool, size_t size) : pool(&pool) { + alloc(size); + } + + ~ggml_cuda_pool_alloc() { + if (ptr != nullptr) { + pool->free(ptr, actual_size); + } + } + + // size is in number of elements + T * 
alloc(size_t size) { + GGML_ASSERT(pool != nullptr); + GGML_ASSERT(ptr == nullptr); + ptr = (T *) pool->alloc(size * sizeof(T), &this->actual_size); + return ptr; + } + + T * alloc(ggml_cuda_pool & pool, size_t size) { + this->pool = &pool; + return alloc(size); + } + + T * get() { + return ptr; + } + + ggml_cuda_pool_alloc(const ggml_cuda_pool_alloc &) = delete; + ggml_cuda_pool_alloc(ggml_cuda_pool_alloc &&) = delete; + ggml_cuda_pool_alloc& operator=(const ggml_cuda_pool_alloc &) = delete; + ggml_cuda_pool_alloc& operator=(ggml_cuda_pool_alloc &&) = delete; +}; + + +// backend interface + +struct ggml_backend_cuda_context { + int device; + std::string name; + cudaEvent_t copy_event = nullptr; + + cudaStream_t streams[GGML_CUDA_MAX_DEVICES][GGML_CUDA_MAX_STREAMS] = { { nullptr } }; + cublasHandle_t cublas_handles[GGML_CUDA_MAX_DEVICES] = {nullptr}; + + explicit ggml_backend_cuda_context(int device) : + device(device), + name(GGML_CUDA_NAME + std::to_string(device)) { + } + + ~ggml_backend_cuda_context() { + if (copy_event != nullptr) { + CUDA_CHECK(cudaEventDestroy(copy_event)); + } + for (int i = 0; i < GGML_CUDA_MAX_DEVICES; ++i) { + for (int j = 0; j < GGML_CUDA_MAX_STREAMS; ++j) { + if (streams[i][j] != nullptr) { + CUDA_CHECK(cudaStreamDestroy(streams[i][j])); + } + } + if (cublas_handles[i] != nullptr) { + CUBLAS_CHECK(cublasDestroy(cublas_handles[i])); + } + } + } + + cudaStream_t stream(int device, int stream) { + if (streams[device][stream] == nullptr) { + ggml_cuda_set_device(device); + CUDA_CHECK(cudaStreamCreateWithFlags(&streams[device][stream], cudaStreamNonBlocking)); + } + return streams[device][stream]; + } + + cudaStream_t stream() { + return stream(device, 0); + } + + cublasHandle_t cublas_handle(int device) { + if (cublas_handles[device] == nullptr) { + ggml_cuda_set_device(device); + CUBLAS_CHECK(cublasCreate(&cublas_handles[device])); + CUBLAS_CHECK(cublasSetMathMode(cublas_handles[device], CUBLAS_TF32_TENSOR_OP_MATH)); + } + return cublas_handles[device]; + } + + cublasHandle_t cublas_handle() { + return cublas_handle(device); + } + + // pool + std::unique_ptr<ggml_cuda_pool> pools[GGML_CUDA_MAX_DEVICES]; + + static std::unique_ptr<ggml_cuda_pool> new_pool_for_device(int device) { +#if !defined(GGML_USE_HIPBLAS) + if (get_cuda_global_info().devices[device].vmm) { + return std::unique_ptr<ggml_cuda_pool>(new ggml_cuda_pool_vmm(device)); + } +#endif + return std::unique_ptr<ggml_cuda_pool>(new ggml_cuda_pool_leg(device)); + } + + ggml_cuda_pool & pool(int device) { + if (pools[device] == nullptr) { + pools[device] = new_pool_for_device(device); + } + return *pools[device]; + } + + ggml_cuda_pool & pool() { + return pool(device); + } +}; + +// cuda buffer + +struct ggml_backend_cuda_buffer_context { + int device; + void * dev_ptr = nullptr; + std::string name; + + ggml_backend_cuda_buffer_context(int device, void * dev_ptr) : + device(device), dev_ptr(dev_ptr), + name(GGML_CUDA_NAME + std::to_string(device)) { + } + + ~ggml_backend_cuda_buffer_context() { + CUDA_CHECK(cudaFree(dev_ptr)); + } +}; + +GGML_CALL static const char * ggml_backend_cuda_buffer_get_name(ggml_backend_buffer_t buffer) { + ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context; + return ctx->name.c_str(); +} + +GGML_CALL static bool ggml_backend_buffer_is_cuda(ggml_backend_buffer_t buffer) { + return buffer->iface.get_name == ggml_backend_cuda_buffer_get_name; +} + +GGML_CALL static void ggml_backend_cuda_buffer_free_buffer(ggml_backend_buffer_t buffer) { + ggml_backend_cuda_buffer_context * ctx =
(ggml_backend_cuda_buffer_context *)buffer->context; + delete ctx; +} + +GGML_CALL static void * ggml_backend_cuda_buffer_get_base(ggml_backend_buffer_t buffer) { + ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context; + return ctx->dev_ptr; +} + +GGML_CALL static void ggml_backend_cuda_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) { + ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context; + + if (tensor->view_src != NULL && tensor->view_offs == 0) { + assert(tensor->view_src->buffer->buft == buffer->buft); + tensor->backend = tensor->view_src->backend; + tensor->extra = tensor->view_src->extra; + return; + } + + if (ggml_is_quantized(tensor->type)) { + // initialize padding to 0 to avoid possible NaN values + size_t original_size = ggml_nbytes(tensor); + size_t padded_size = ggml_backend_buft_get_alloc_size(buffer->buft, tensor); + + if (padded_size > original_size && tensor->view_src == nullptr) { + ggml_cuda_set_device(ctx->device); + CUDA_CHECK(cudaMemset((char *)tensor->data + original_size, 0, padded_size - original_size)); + } + } +} + +GGML_CALL static void ggml_backend_cuda_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) { + ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context; + + ggml_cuda_set_device(ctx->device); + CUDA_CHECK(cudaMemcpyAsync((char *)tensor->data + offset, data, size, cudaMemcpyHostToDevice, cudaStreamPerThread)); + CUDA_CHECK(cudaStreamSynchronize(cudaStreamPerThread)); +} + +GGML_CALL static void ggml_backend_cuda_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset, size_t size) { + ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context; + + ggml_cuda_set_device(ctx->device); + CUDA_CHECK(cudaMemcpyAsync(data, (const char *)tensor->data + offset, size, cudaMemcpyDeviceToHost, cudaStreamPerThread)); + CUDA_CHECK(cudaStreamSynchronize(cudaStreamPerThread)); +} + +GGML_CALL static bool ggml_backend_cuda_buffer_cpy_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * src, ggml_tensor * dst) { + if (ggml_backend_buffer_is_cuda(src->buffer)) { + ggml_backend_cuda_buffer_context * src_ctx = (ggml_backend_cuda_buffer_context *)src->buffer->context; + ggml_backend_cuda_buffer_context * dst_ctx = (ggml_backend_cuda_buffer_context *)dst->buffer->context; + if (src_ctx->device == dst_ctx->device) { + CUDA_CHECK(cudaMemcpyAsync(dst->data, src->data, ggml_nbytes(src), cudaMemcpyDeviceToDevice, cudaStreamPerThread)); + } else { + CUDA_CHECK(cudaMemcpyPeerAsync(dst->data, dst_ctx->device, src->data, src_ctx->device, ggml_nbytes(src), cudaStreamPerThread)); + } + CUDA_CHECK(cudaStreamSynchronize(cudaStreamPerThread)); + return true; + } + return false; + + GGML_UNUSED(buffer); +} + +GGML_CALL static void ggml_backend_cuda_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) { + ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context; + + ggml_cuda_set_device(ctx->device); + CUDA_CHECK(cudaDeviceSynchronize()); + CUDA_CHECK(cudaMemset(ctx->dev_ptr, value, buffer->size)); + CUDA_CHECK(cudaDeviceSynchronize()); +} + +static ggml_backend_buffer_i ggml_backend_cuda_buffer_interface = { + /* .get_name = */ ggml_backend_cuda_buffer_get_name, + /* .free_buffer = */ ggml_backend_cuda_buffer_free_buffer, + /* .get_base = */ 
ggml_backend_cuda_buffer_get_base, + /* .init_tensor = */ ggml_backend_cuda_buffer_init_tensor, + /* .set_tensor = */ ggml_backend_cuda_buffer_set_tensor, + /* .get_tensor = */ ggml_backend_cuda_buffer_get_tensor, + /* .cpy_tensor = */ ggml_backend_cuda_buffer_cpy_tensor, + /* .clear = */ ggml_backend_cuda_buffer_clear, + /* .reset = */ NULL, +}; + +// cuda buffer type +struct ggml_backend_cuda_buffer_type_context { + int device; + std::string name; +}; + +GGML_CALL static const char * ggml_backend_cuda_buffer_type_name(ggml_backend_buffer_type_t buft) { + ggml_backend_cuda_buffer_type_context * ctx = (ggml_backend_cuda_buffer_type_context *)buft->context; + + return ctx->name.c_str(); +} + +GGML_CALL static ggml_backend_buffer_t ggml_backend_cuda_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) { + ggml_backend_cuda_buffer_type_context * buft_ctx = (ggml_backend_cuda_buffer_type_context *)buft->context; + + ggml_cuda_set_device(buft_ctx->device); + + size = std::max(size, (size_t)1); // cudaMalloc returns null for size 0 + + void * dev_ptr; + cudaError_t err = cudaMalloc(&dev_ptr, size); + if (err != cudaSuccess) { + fprintf(stderr, "%s: allocating %.2f MiB on device %d: cudaMalloc failed: %s\n", __func__, size/1024.0/1024.0, buft_ctx->device, cudaGetErrorString(err)); + return nullptr; + } + + ggml_backend_cuda_buffer_context * ctx = new ggml_backend_cuda_buffer_context(buft_ctx->device, dev_ptr); + + return ggml_backend_buffer_init(buft, ggml_backend_cuda_buffer_interface, ctx, size); +} + +GGML_CALL static size_t ggml_backend_cuda_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) { + return 128; + + GGML_UNUSED(buft); +} + +GGML_CALL static size_t ggml_backend_cuda_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, const ggml_tensor * tensor) { + size_t size = ggml_nbytes(tensor); + int64_t ne0 = tensor->ne[0]; + + if (ggml_is_quantized(tensor->type)) { + if (ne0 % MATRIX_ROW_PADDING != 0) { + size += ggml_row_size(tensor->type, MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING); + } + } + + return size; + + GGML_UNUSED(buft); +} + +GGML_CALL static bool ggml_backend_cuda_buffer_type_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) { + if (!ggml_backend_is_cuda(backend)) { + return false; + } + + ggml_backend_cuda_buffer_type_context * buft_ctx = (ggml_backend_cuda_buffer_type_context *)buft->context; + ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context; + + return buft_ctx->device == cuda_ctx->device; +} + +static ggml_backend_buffer_type_i ggml_backend_cuda_buffer_type_interface = { + /* .get_name = */ ggml_backend_cuda_buffer_type_name, + /* .alloc_buffer = */ ggml_backend_cuda_buffer_type_alloc_buffer, + /* .get_alignment = */ ggml_backend_cuda_buffer_type_get_alignment, + /* .get_max_size = */ NULL, // defaults to SIZE_MAX + /* .get_alloc_size = */ ggml_backend_cuda_buffer_type_get_alloc_size, + /* .supports_backend = */ ggml_backend_cuda_buffer_type_supports_backend, + /* .is_host = */ NULL, +}; + +GGML_CALL ggml_backend_buffer_type_t ggml_backend_cuda_buffer_type(int device) { + static std::mutex mutex; + std::lock_guard lock(mutex); + + if (device >= ggml_backend_cuda_get_device_count()) { + return nullptr; + } + + static ggml_backend_buffer_type ggml_backend_cuda_buffer_types[GGML_CUDA_MAX_DEVICES]; + + static bool ggml_backend_cuda_buffer_type_initialized = false; + + if (!ggml_backend_cuda_buffer_type_initialized) { + for (int i = 0; i < GGML_CUDA_MAX_DEVICES; i++) { + 
ggml_backend_cuda_buffer_types[i] = { + /* .iface = */ ggml_backend_cuda_buffer_type_interface, + /* .context = */ new ggml_backend_cuda_buffer_type_context{i, GGML_CUDA_NAME + std::to_string(i)}, + }; + } + ggml_backend_cuda_buffer_type_initialized = true; + } + + return &ggml_backend_cuda_buffer_types[device]; +} + +// cuda split buffer + +static int64_t get_row_rounding(ggml_type type, const std::array & tensor_split) { + int64_t min_compute_capability = INT_MAX; + int64_t max_compute_capability = INT_MIN; + for (int id = 0; id < ggml_backend_cuda_get_device_count(); ++id) { + if (tensor_split[id] < (id + 1 < ggml_backend_cuda_get_device_count() ? tensor_split[id + 1] : 1.0f)) { + if (min_compute_capability > get_cuda_global_info().devices[id].cc) { + min_compute_capability = get_cuda_global_info().devices[id].cc; + } + if (max_compute_capability < get_cuda_global_info().devices[id].cc) { + max_compute_capability = get_cuda_global_info().devices[id].cc; + } + } + } + +#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) + switch(type) { + case GGML_TYPE_Q4_0: + case GGML_TYPE_Q4_1: + case GGML_TYPE_Q5_0: + case GGML_TYPE_Q5_1: + case GGML_TYPE_Q8_0: + return max_compute_capability >= CC_RDNA2 ? 128 : 64; + case GGML_TYPE_F16: + case GGML_TYPE_F32: + return 1; + case GGML_TYPE_Q2_K: + return max_compute_capability >= CC_RDNA2 ? 128 : 32; + case GGML_TYPE_Q3_K: + return min_compute_capability < CC_RDNA2 ? 128 : 64; + case GGML_TYPE_Q4_K: + case GGML_TYPE_Q5_K: + case GGML_TYPE_Q6_K: + case GGML_TYPE_IQ2_XXS: + case GGML_TYPE_IQ2_XS: + case GGML_TYPE_IQ2_S: + case GGML_TYPE_IQ3_XXS: + case GGML_TYPE_IQ1_S: + case GGML_TYPE_IQ4_NL: + case GGML_TYPE_IQ4_XS: + case GGML_TYPE_IQ3_S: + return max_compute_capability >= CC_RDNA2 ? 128 : 64; + default: + GGML_ASSERT(false); + } +#else + switch(type) { + case GGML_TYPE_Q4_0: + case GGML_TYPE_Q4_1: + return max_compute_capability >= CC_VOLTA ? 128 : 64; + case GGML_TYPE_Q5_0: + case GGML_TYPE_Q5_1: + case GGML_TYPE_Q8_0: + return 64; + case GGML_TYPE_F16: + case GGML_TYPE_F32: + return 1; + case GGML_TYPE_Q2_K: + case GGML_TYPE_Q3_K: + case GGML_TYPE_Q4_K: + case GGML_TYPE_Q5_K: + case GGML_TYPE_IQ2_XXS: + case GGML_TYPE_IQ2_XS: + case GGML_TYPE_IQ2_S: + case GGML_TYPE_IQ3_XXS: + case GGML_TYPE_IQ1_S: + case GGML_TYPE_IQ4_NL: + case GGML_TYPE_IQ4_XS: + case GGML_TYPE_IQ3_S: + return max_compute_capability >= CC_VOLTA ? 128 : 64; + case GGML_TYPE_Q6_K: + return 64; + default: + GGML_ASSERT(false); + } +#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) +} + +static void get_row_split(int64_t * row_low, int64_t * row_high, const ggml_tensor * tensor, const std::array & tensor_split, int id) { + const int64_t nrows = ggml_nrows(tensor); + const int64_t rounding = get_row_rounding(tensor->type, tensor_split); + + *row_low = id == 0 ? 
0 : nrows*tensor_split[id]; + *row_low -= *row_low % rounding; + + if (id == ggml_backend_cuda_get_device_count() - 1) { + *row_high = nrows; + } else { + *row_high = nrows*tensor_split[id + 1]; + *row_high -= *row_high % rounding; + } +} + +static size_t ggml_nbytes_split(const struct ggml_tensor * tensor, int nrows_split) { + static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function"); + + return nrows_split*ggml_row_size(tensor->type, tensor->ne[0]); +} + +struct ggml_backend_cuda_split_buffer_type_context { + std::array tensor_split; +}; + +struct ggml_backend_cuda_split_buffer_context { + ~ggml_backend_cuda_split_buffer_context() { + for (ggml_tensor_extra_gpu * extra : tensor_extras) { + for (int id = 0; id < GGML_CUDA_MAX_DEVICES; ++id) { + for (int64_t is = 0; is < GGML_CUDA_MAX_STREAMS; ++is) { + if (extra->events[id][is] != nullptr) { + CUDA_CHECK(cudaEventDestroy(extra->events[id][is])); + } + } + if (extra->data_device[id] != nullptr) { + CUDA_CHECK(cudaFree(extra->data_device[id])); + } + } + delete extra; + } + } + + std::vector tensor_extras; +}; + +GGML_CALL static const char * ggml_backend_cuda_split_buffer_get_name(ggml_backend_buffer_t buffer) { + return GGML_CUDA_NAME "_Split"; + + GGML_UNUSED(buffer); +} + +static bool ggml_backend_buffer_is_cuda_split(ggml_backend_buffer_t buffer) { + return buffer->iface.get_name == ggml_backend_cuda_split_buffer_get_name; + GGML_UNUSED(ggml_backend_buffer_is_cuda_split); // only used in debug builds currently, avoid unused function warning in release builds +} + +GGML_CALL static void ggml_backend_cuda_split_buffer_free_buffer(ggml_backend_buffer_t buffer) { + ggml_backend_cuda_split_buffer_context * ctx = (ggml_backend_cuda_split_buffer_context *)buffer->context; + delete ctx; +} + +GGML_CALL static void * ggml_backend_cuda_split_buffer_get_base(ggml_backend_buffer_t buffer) { + // the pointers are stored in the tensor extras, this is just a dummy address and never dereferenced + return (void *)0x1000; + + GGML_UNUSED(buffer); +} + +GGML_CALL static void ggml_backend_cuda_split_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) { + GGML_ASSERT(tensor->view_src == nullptr); // views of split tensors are not supported + + ggml_backend_cuda_split_buffer_context * ctx = (ggml_backend_cuda_split_buffer_context *)buffer->context; + ggml_backend_cuda_split_buffer_type_context * buft_ctx = (ggml_backend_cuda_split_buffer_type_context *)buffer->buft->context; + + const int64_t ne0 = tensor->ne[0]; + + ggml_tensor_extra_gpu * extra = new ggml_tensor_extra_gpu{}; + ctx->tensor_extras.push_back(extra); + + for (int id = 0; id < ggml_backend_cuda_get_device_count(); ++id) { + int64_t row_low, row_high; + get_row_split(&row_low, &row_high, tensor, buft_ctx->tensor_split, id); + + int64_t nrows_split = row_high - row_low; + if (nrows_split == 0) { + continue; + } + + size_t size = ggml_nbytes_split(tensor, nrows_split); + const size_t original_size = size; + + // pad last row to a multiple of 512 elements to avoid out-of-bounds memory accesses + if (ne0 % MATRIX_ROW_PADDING != 0) { + size += ggml_row_size(tensor->type, MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING); + } + + // FIXME: do not crash if cudaMalloc fails + // currently, init_tensor cannot fail, it needs to be fixed in ggml-backend first + ggml_cuda_set_device(id); + char * buf; + CUDA_CHECK(cudaMalloc(&buf, size)); + + // set padding to 0 to avoid possible NaN values + if (size > original_size) { + CUDA_CHECK(cudaMemset(buf + original_size, 
0, size - original_size)); + } + + extra->data_device[id] = buf; + + for (int64_t is = 0; is < GGML_CUDA_MAX_STREAMS; ++is) { + CUDA_CHECK(cudaEventCreateWithFlags(&extra->events[id][is], cudaEventDisableTiming)); + } + } + tensor->extra = extra; +} + +GGML_CALL static void ggml_backend_cuda_split_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) { + // split tensors must always be set in their entirety at once + GGML_ASSERT(offset == 0); + GGML_ASSERT(size == ggml_nbytes(tensor)); + + ggml_backend_cuda_split_buffer_type_context * buft_ctx = (ggml_backend_cuda_split_buffer_type_context *)buffer->buft->context; + + const int64_t ne0 = tensor->ne[0]; + const size_t nb1 = tensor->nb[1]; + ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *)tensor->extra; + + for (int id = 0; id < ggml_backend_cuda_get_device_count(); ++id) { + int64_t row_low, row_high; + get_row_split(&row_low, &row_high, tensor, buft_ctx->tensor_split, id); + + int64_t nrows_split = row_high - row_low; + if (nrows_split == 0) { + continue; + } + + const size_t offset_split = row_low*nb1; + size_t size = ggml_nbytes_split(tensor, nrows_split); + const size_t original_size = size; + + // pad last row to a multiple of 512 elements to avoid out-of-bounds memory accesses + if (ne0 % MATRIX_ROW_PADDING != 0) { + size += ggml_row_size(tensor->type, MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING); + } + + const char * buf_host = (const char *)data + offset_split; + CUDA_CHECK(cudaMemcpyAsync(extra->data_device[id], buf_host, original_size, cudaMemcpyHostToDevice, cudaStreamPerThread)); + } + + for (int id = 0; id < ggml_backend_cuda_get_device_count(); ++id) { + CUDA_CHECK(cudaStreamSynchronize(cudaStreamPerThread)); + } +} + +GGML_CALL static void ggml_backend_cuda_split_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset, size_t size) { + // split tensors must always be set in their entirety at once + GGML_ASSERT(offset == 0); + GGML_ASSERT(size == ggml_nbytes(tensor)); + + ggml_backend_cuda_split_buffer_type_context * buft_ctx = (ggml_backend_cuda_split_buffer_type_context *)buffer->buft->context; + + const int64_t ne0 = tensor->ne[0]; + const size_t nb1 = tensor->nb[1]; + ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *)tensor->extra; + + for (int id = 0; id < ggml_backend_cuda_get_device_count(); ++id) { + int64_t row_low, row_high; + get_row_split(&row_low, &row_high, tensor, buft_ctx->tensor_split, id); + + int64_t nrows_split = row_high - row_low; + if (nrows_split == 0) { + continue; + } + + const size_t offset_split = row_low*nb1; + size_t size = ggml_nbytes_split(tensor, nrows_split); + const size_t original_size = size; + + // pad last row to a multiple of 512 elements to avoid out-of-bounds memory accesses + if (ne0 % MATRIX_ROW_PADDING != 0) { + size += ggml_row_size(tensor->type, MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING); + } + + char * buf_host = (char *)data + offset_split; + CUDA_CHECK(cudaMemcpyAsync(buf_host, extra->data_device[id], original_size, cudaMemcpyDeviceToHost, cudaStreamPerThread)); + } + + for (int id = 0; id < ggml_backend_cuda_get_device_count(); ++id) { + CUDA_CHECK(cudaStreamSynchronize(cudaStreamPerThread)); + } +} + +GGML_CALL static void ggml_backend_cuda_split_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) { + GGML_UNUSED(buffer); + GGML_UNUSED(value); +} + +static struct ggml_backend_buffer_i ggml_backend_cuda_split_buffer_interface = { + /* 
.get_name = */ ggml_backend_cuda_split_buffer_get_name, + /* .free_buffer = */ ggml_backend_cuda_split_buffer_free_buffer, + /* .get_base = */ ggml_backend_cuda_split_buffer_get_base, + /* .init_tensor = */ ggml_backend_cuda_split_buffer_init_tensor, + /* .set_tensor = */ ggml_backend_cuda_split_buffer_set_tensor, + /* .get_tensor = */ ggml_backend_cuda_split_buffer_get_tensor, + /* .cpy_tensor = */ NULL, + /* .clear = */ ggml_backend_cuda_split_buffer_clear, + /* .reset = */ NULL, +}; + +// cuda split buffer type + +GGML_CALL static const char * ggml_backend_cuda_split_buffer_type_name(ggml_backend_buffer_type_t buft) { + return GGML_CUDA_NAME "_Split"; + + GGML_UNUSED(buft); +} + +GGML_CALL static ggml_backend_buffer_t ggml_backend_cuda_split_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) { + // since we don't know the exact split after rounding, we cannot allocate the device buffers at this point + // instead, we allocate them for each tensor separately in init_tensor + // however, the size still represents the maximum cumulative size of all the device buffers after the tensors are allocated, + // as returned by get_alloc_size. this limit is enforced during tensor allocation by ggml-alloc, so it must be correct. + ggml_backend_cuda_split_buffer_context * ctx = new ggml_backend_cuda_split_buffer_context(); + + return ggml_backend_buffer_init(buft, ggml_backend_cuda_split_buffer_interface, ctx, size); +} + +GGML_CALL static size_t ggml_backend_cuda_split_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) { + return 128; + + GGML_UNUSED(buft); +} + +GGML_CALL static size_t ggml_backend_cuda_split_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, const ggml_tensor * tensor) { + ggml_backend_cuda_split_buffer_type_context * ctx = (ggml_backend_cuda_split_buffer_type_context *)buft->context; + + size_t total_size = 0; + + const int64_t ne0 = tensor->ne[0]; + + for (int id = 0; id < ggml_backend_cuda_get_device_count(); ++id) { + int64_t row_low, row_high; + get_row_split(&row_low, &row_high, tensor, ctx->tensor_split, id); + + int64_t nrows_split = row_high - row_low; + if (nrows_split == 0) { + continue; + } + + total_size += ggml_nbytes_split(tensor, nrows_split); + + // pad last row to a multiple of 512 elements to avoid out-of-bounds memory accesses + if (ne0 % MATRIX_ROW_PADDING != 0) { + total_size += ggml_row_size(tensor->type, MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING); + } + } + + return total_size; +} + +GGML_CALL static bool ggml_backend_cuda_split_buffer_type_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) { + return ggml_backend_is_cuda(backend); + + GGML_UNUSED(buft); +} + +GGML_CALL static bool ggml_backend_cuda_split_buffer_type_is_host(ggml_backend_buffer_type_t buft) { + return false; + + GGML_UNUSED(buft); +} + +static ggml_backend_buffer_type_i ggml_backend_cuda_split_buffer_type_interface = { + /* .get_name = */ ggml_backend_cuda_split_buffer_type_name, + /* .alloc_buffer = */ ggml_backend_cuda_split_buffer_type_alloc_buffer, + /* .get_alignment = */ ggml_backend_cuda_split_buffer_type_get_alignment, + /* .get_max_size = */ NULL, // defaults to SIZE_MAX + /* .get_alloc_size = */ ggml_backend_cuda_split_buffer_type_get_alloc_size, + /* .supports_backend = */ ggml_backend_cuda_split_buffer_type_supports_backend, + /* .is_host = */ ggml_backend_cuda_split_buffer_type_is_host, +}; + +GGML_CALL ggml_backend_buffer_type_t ggml_backend_cuda_split_buffer_type(const float * tensor_split) { + static std::mutex 
mutex; + std::lock_guard<std::mutex> lock(mutex); + + static std::map<std::array<float, GGML_CUDA_MAX_DEVICES>, struct ggml_backend_buffer_type> buft_map; + + std::array<float, GGML_CUDA_MAX_DEVICES> tensor_split_arr = {}; + + bool all_zero = tensor_split == nullptr || std::all_of(tensor_split, tensor_split + GGML_CUDA_MAX_DEVICES, [](float x) { return x == 0.0f; }); + if (all_zero) { + tensor_split_arr = get_cuda_global_info().default_tensor_split; + } else { + float split_sum = 0.0f; + for (int i = 0; i < ggml_backend_cuda_get_device_count(); ++i) { + tensor_split_arr[i] = split_sum; + split_sum += tensor_split[i]; + } + for (int i = 0; i < ggml_backend_cuda_get_device_count(); ++i) { + tensor_split_arr[i] /= split_sum; + } + } + + auto it = buft_map.find(tensor_split_arr); + if (it != buft_map.end()) { + return &it->second; + } + + struct ggml_backend_buffer_type buft { + /* .iface = */ ggml_backend_cuda_split_buffer_type_interface, + /* .context = */ new ggml_backend_cuda_split_buffer_type_context{tensor_split_arr}, + }; + + auto result = buft_map.emplace(tensor_split_arr, buft); + return &result.first->second; +} + +// host buffer type + +GGML_CALL static const char * ggml_backend_cuda_host_buffer_type_name(ggml_backend_buffer_type_t buft) { + return GGML_CUDA_NAME "_Host"; + + GGML_UNUSED(buft); +} + +GGML_CALL static const char * ggml_backend_cuda_host_buffer_name(ggml_backend_buffer_t buffer) { + return GGML_CUDA_NAME "_Host"; + + GGML_UNUSED(buffer); +} + +GGML_CALL static void ggml_backend_cuda_host_buffer_free_buffer(ggml_backend_buffer_t buffer) { + CUDA_CHECK(cudaFreeHost(buffer->context)); +} + +static void * ggml_cuda_host_malloc(size_t size) { + if (getenv("GGML_CUDA_NO_PINNED") != nullptr) { + return nullptr; + } + + void * ptr = nullptr; + cudaError_t err = cudaMallocHost((void **) &ptr, size); + if (err != cudaSuccess) { + // clear the error + cudaGetLastError(); + fprintf(stderr, "%s: warning: failed to allocate %.2f MiB of pinned memory: %s\n", __func__, + size/1024.0/1024.0, cudaGetErrorString(err)); + return nullptr; + } + + return ptr; +} + +GGML_CALL static ggml_backend_buffer_t ggml_backend_cuda_host_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) { + void * ptr = ggml_cuda_host_malloc(size); + + if (ptr == nullptr) { + // fallback to cpu buffer + return ggml_backend_buft_alloc_buffer(ggml_backend_cpu_buffer_type(), size); + } + + ggml_backend_buffer_t buffer = ggml_backend_cpu_buffer_from_ptr(ptr, size); + buffer->buft = buft; + buffer->iface.get_name = ggml_backend_cuda_host_buffer_name; + buffer->iface.free_buffer = ggml_backend_cuda_host_buffer_free_buffer; + + return buffer; +} + +GGML_CALL ggml_backend_buffer_type_t ggml_backend_cuda_host_buffer_type() { + static struct ggml_backend_buffer_type ggml_backend_cuda_buffer_type_host = { + /* .iface = */ { + /* .get_name = */ ggml_backend_cuda_host_buffer_type_name, + /* .alloc_buffer = */ ggml_backend_cuda_host_buffer_type_alloc_buffer, + /* .get_alignment = */ ggml_backend_cpu_buffer_type()->iface.get_alignment, + /* .get_max_size = */ NULL, // defaults to SIZE_MAX + /* .get_alloc_size = */ ggml_backend_cpu_buffer_type()->iface.get_alloc_size, + /* .supports_backend = */ ggml_backend_cpu_buffer_type()->iface.supports_backend, + /* .is_host = */ ggml_backend_cpu_buffer_type()->iface.is_host, + }, + /* .context = */ nullptr, + }; + + return &ggml_backend_cuda_buffer_type_host; +} + +//static bool ggml_backend_buffer_is_cuda_host(ggml_backend_buffer_t buffer) { +// return buffer->buft->iface.get_name == ggml_backend_cuda_host_buffer_type_name; +//} + + + +/// kernels + +
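[editor's note] The tensor_split handling added above is easy to misread: the user passes per-device proportions, but the buffer type stores cumulative start fractions in [0, 1), which get_row_split later multiplies by the row count. A minimal, hedged sketch of that prefix-sum normalization follows; MAX_DEVICES, normalize_split, and the sample inputs are illustrative and not taken from the patch.

#include <array>
#include <cstdio>

constexpr int MAX_DEVICES = 4;

// each device id gets the cumulative fraction of rows that precede it,
// normalized so the fractions span [0, 1) and the last device ends at 1.0
static std::array<float, MAX_DEVICES> normalize_split(const float (&split)[MAX_DEVICES], int device_count) {
    std::array<float, MAX_DEVICES> out = {};
    float sum = 0.0f;
    for (int i = 0; i < device_count; ++i) { // prefix sums: start fraction per device
        out[i] = sum;
        sum += split[i];
    }
    for (int i = 0; i < device_count; ++i) { // normalize by the total proportion
        out[i] /= sum;
    }
    return out;
}

int main() {
    const float split[MAX_DEVICES] = {1.0f, 3.0f, 0.0f, 0.0f};
    const auto out = normalize_split(split, 2);
    std::printf("device 0 starts at %.2f, device 1 starts at %.2f\n", out[0], out[1]);
    // -> device 0 starts at 0.00, device 1 starts at 0.25,
    //    i.e. a 1:3 split gives device 1 the rows in [25%, 100%)
    return 0;
}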
#if defined(GGML_USE_HIPBLAS) #define __CUDA_ARCH__ 1300 @@ -253,71 +1463,6 @@ static __device__ __forceinline__ int __dp4a(const int a, const int b, int c) { } #endif // defined(GGML_USE_HIPBLAS) -#if defined(_MSC_VER) -#pragma warning(disable: 4244 4267) // possible loss of data -#endif - -static_assert(sizeof(half) == sizeof(ggml_fp16_t), "wrong fp16 size"); - -[[noreturn]] -static void ggml_cuda_error(const char * stmt, const char * func, const char * file, const int line, const char * msg) { - int id = -1; // in case cudaGetDevice fails - cudaGetDevice(&id); - - fprintf(stderr, "CUDA error: %s\n", msg); - fprintf(stderr, " current device: %d, in function %s at %s:%d\n", id, func, file, line); - fprintf(stderr, " %s\n", stmt); - // abort with GGML_ASSERT to get a stack trace - GGML_ASSERT(!"CUDA error"); -} - -#define CUDA_CHECK_GEN(err, success, error_fn) \ - do { \ - auto err_ = (err); \ - if (err_ != (success)) { \ - ggml_cuda_error(#err, __func__, __FILE__, __LINE__, error_fn(err_)); \ - } \ - } while (0) - -#define CUDA_CHECK(err) CUDA_CHECK_GEN(err, cudaSuccess, cudaGetErrorString) - -#if CUDART_VERSION >= 12000 - static const char * cublas_get_error_str(const cublasStatus_t err) { - return cublasGetStatusString(err); - } -#else - static const char * cublas_get_error_str(const cublasStatus_t err) { - switch (err) { - case CUBLAS_STATUS_SUCCESS: return "CUBLAS_STATUS_SUCCESS"; - case CUBLAS_STATUS_NOT_INITIALIZED: return "CUBLAS_STATUS_NOT_INITIALIZED"; - case CUBLAS_STATUS_ALLOC_FAILED: return "CUBLAS_STATUS_ALLOC_FAILED"; - case CUBLAS_STATUS_INVALID_VALUE: return "CUBLAS_STATUS_INVALID_VALUE"; - case CUBLAS_STATUS_ARCH_MISMATCH: return "CUBLAS_STATUS_ARCH_MISMATCH"; - case CUBLAS_STATUS_MAPPING_ERROR: return "CUBLAS_STATUS_MAPPING_ERROR"; - case CUBLAS_STATUS_EXECUTION_FAILED: return "CUBLAS_STATUS_EXECUTION_FAILED"; - case CUBLAS_STATUS_INTERNAL_ERROR: return "CUBLAS_STATUS_INTERNAL_ERROR"; - case CUBLAS_STATUS_NOT_SUPPORTED: return "CUBLAS_STATUS_NOT_SUPPORTED"; - default: return "unknown error"; - } - } -#endif // CUDART_VERSION >= 12000 - -#define CUBLAS_CHECK(err) CUDA_CHECK_GEN(err, CUBLAS_STATUS_SUCCESS, cublas_get_error_str) - -#if !defined(GGML_USE_HIPBLAS) -static const char * cu_get_error_str(CUresult err) { - const char * err_str; - cuGetErrorString(err, &err_str); - return err_str; -} -#define CU_CHECK(err) CUDA_CHECK_GEN(err, CUDA_SUCCESS, cu_get_error_str) -#endif - -#if CUDART_VERSION >= 11100 -#define GGML_CUDA_ASSUME(x) __builtin_assume(x) -#else -#define GGML_CUDA_ASSUME(x) -#endif // CUDART_VERSION >= 11100 #ifdef GGML_CUDA_F16 typedef half dfloat; // dequantize float @@ -363,12 +1508,17 @@ typedef to_t_cuda_t to_fp16_cuda_t; typedef void (*dequantize_kernel_t)(const void * vx, const int ib, const int iqs, dfloat2 & v); typedef void (*dot_kernel_k_t)(const void * __restrict__ vx, const int ib, const int iqs, const float * __restrict__ y, float & v); typedef void (*cpy_kernel_t)(const char * cx, char * cdst); -typedef void (*ggml_cuda_func_t)(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst); + +typedef void (*ggml_cuda_func_t)(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst); + typedef void (*ggml_cuda_op_mul_mat_t)( + ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const char * src0_dd_i, const float * src1_ddf_i, const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low, const int64_t row_high, const int64_t src1_ncols, 
const int64_t src1_padded_row_size, cudaStream_t stream); + typedef void (*ggml_cuda_op_flatten_t)( + ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream); @@ -382,7 +1532,6 @@ typedef float (*vec_dot_q_mul_mat_cuda_t)( const int * __restrict__ y_qs, const half2 * __restrict__ y_ms, const int & i, const int & j, const int & k); #define WARP_SIZE 32 -#define MATRIX_ROW_PADDING 512 // last row of quant. matrices is a multiple of this to avoid out-of-bounds memory accesses #define CUDA_GELU_BLOCK_SIZE 256 #define CUDA_SILU_BLOCK_SIZE 256 @@ -432,42 +1581,6 @@ static_assert(K_QUANTS_PER_ITERATION == 1 || K_QUANTS_PER_ITERATION == 2, "K_QUA #define MUL_MAT_SRC1_COL_STRIDE 128 -#define MAX_STREAMS 8 -static cudaStream_t g_cudaStreams[GGML_CUDA_MAX_DEVICES][MAX_STREAMS] = { { nullptr } }; - -struct ggml_tensor_extra_gpu { - void * data_device[GGML_CUDA_MAX_DEVICES]; // 1 pointer for each device for split tensors - cudaEvent_t events[GGML_CUDA_MAX_DEVICES][MAX_STREAMS]; // events for synchronizing multiple GPUs -}; - -// this is faster on Windows -// probably because the Windows CUDA libraries forget to make this check before invoking the drivers -static void ggml_cuda_set_device(const int device) { - int current_device; - CUDA_CHECK(cudaGetDevice(¤t_device)); - - if (device == current_device) { - return; - } - - CUDA_CHECK(cudaSetDevice(device)); -} - -static int g_device_count = -1; -static int g_main_device = 0; -static std::array g_default_tensor_split = {}; - -struct cuda_device_capabilities { - int cc; // compute capability - size_t smpb; // max. shared memory per block - bool vmm; // virtual memory support - size_t vmm_granularity; // granularity of virtual memory -}; - -static cuda_device_capabilities g_device_caps[GGML_CUDA_MAX_DEVICES] = { {0, 0, false, 0} }; - -static cublasHandle_t g_cublas_handles[GGML_CUDA_MAX_DEVICES] = {nullptr}; - [[noreturn]] static __device__ void no_device_code( const char * file_name, const int line, const char * function_name, const int arch, const char * arch_list) { @@ -475,20 +1588,21 @@ static __device__ void no_device_code( #if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) printf("%s:%d: ERROR: HIP kernel %s has no device code compatible with HIP arch %d.\n", file_name, line, function_name, arch); - (void) arch_list; + GGML_UNUSED(arch_list); #else printf("%s:%d: ERROR: CUDA kernel %s has no device code compatible with CUDA arch %d. 
ggml-cuda.cu was compiled for: %s\n", file_name, line, function_name, arch, arch_list); #endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) __trap(); - (void) no_device_code; // suppress unused function warning + GGML_UNUSED(no_device_code); // suppress unused function warning } #ifdef __CUDA_ARCH__ #define NO_DEVICE_CODE no_device_code(__FILE__, __LINE__, __FUNCTION__, __CUDA_ARCH__, STRINGIZE(__CUDA_ARCH_LIST__)) #else -#define NO_DEVICE_CODE GGML_ASSERT(false && "NO_DEVICE_CODE not valid in host code.") +//#define NO_DEVICE_CODE GGML_ASSERT(false && "NO_DEVICE_CODE not valid in host code.") +#define NO_DEVICE_CODE #endif // __CUDA_ARCH__ static __device__ __forceinline__ float warp_reduce_sum(float x) { @@ -517,7 +1631,7 @@ static __device__ __forceinline__ half2 warp_reduce_sum(half2 a) { } return a; #else - (void) a; + GGML_UNUSED(a); NO_DEVICE_CODE; #endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL } @@ -539,7 +1653,7 @@ static __device__ __forceinline__ float warp_reduce_max(float x) { // } // return x; //#else -// (void) x; +// GGML_UNUSED(x); // NO_DEVICE_CODE; //#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL && CUDART_VERSION >= CUDART_HMAX //} @@ -2293,7 +3407,9 @@ static __global__ void dequantize_block_q8_0_f16(const void * __restrict__ vx, h y2[iy/2 + threadIdx.x] = __hmul2(make_half2(qs.x, qs.y), __half2half2(d)); } #else - (void) vx; (void) y; (void) k; + GGML_UNUSED(vx); + GGML_UNUSED(y); + GGML_UNUSED(k); NO_DEVICE_CODE; #endif // __CUDA_ARCH__ >= CC_PASCAL } @@ -2853,7 +3969,8 @@ static __device__ __forceinline__ float vec_dot_q4_0_q8_1( } template static __device__ __forceinline__ void allocate_tiles_q4_0(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) { - (void)x_qh; (void)x_sc; + GGML_UNUSED(x_qh); + GGML_UNUSED(x_sc); __shared__ int tile_x_qs[mmq_y * (WARP_SIZE) + mmq_y]; __shared__ float tile_x_d[mmq_y * (WARP_SIZE/QI4_0) + mmq_y/QI4_0]; @@ -2865,7 +3982,7 @@ template static __device__ __forceinline__ void allocate_tiles_q4_0( template static __device__ __forceinline__ void load_tiles_q4_0( const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh, int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) { - (void)x_qh; (void)x_sc; + GGML_UNUSED(x_qh); GGML_UNUSED(x_sc); GGML_CUDA_ASSUME(i_offset >= 0); GGML_CUDA_ASSUME(i_offset < nwarps); GGML_CUDA_ASSUME(k >= 0); @@ -2912,7 +4029,7 @@ template static __device__ __forceinlin static __device__ __forceinline__ float vec_dot_q4_0_q8_1_mul_mat( const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc, const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) { - (void)x_qh; (void)x_sc; + GGML_UNUSED(x_qh); GGML_UNUSED(x_sc); const int kyqs = k % (QI8_1/2) + QI8_1 * (k / (QI8_1/2)); const float * x_dmf = (const float *) x_dm; @@ -2949,7 +4066,7 @@ static __device__ __forceinline__ float vec_dot_q4_1_q8_1( } template static __device__ __forceinline__ void allocate_tiles_q4_1(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) { - (void)x_qh; (void)x_sc; + GGML_UNUSED(x_qh); GGML_UNUSED(x_sc); __shared__ int tile_x_qs[mmq_y * (WARP_SIZE) + + mmq_y]; __shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE/QI4_1) + mmq_y/QI4_1]; @@ -2961,7 +4078,7 @@ template static __device__ 
__forceinline__ void allocate_tiles_q4_1( template static __device__ __forceinline__ void load_tiles_q4_1( const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh, int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) { - (void)x_qh; (void)x_sc; + GGML_UNUSED(x_qh); GGML_UNUSED(x_sc); GGML_CUDA_ASSUME(i_offset >= 0); GGML_CUDA_ASSUME(i_offset < nwarps); @@ -3006,7 +4123,7 @@ template static __device__ __forceinlin static __device__ __forceinline__ float vec_dot_q4_1_q8_1_mul_mat( const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc, const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) { - (void)x_qh; (void)x_sc; + GGML_UNUSED(x_qh); GGML_UNUSED(x_sc); const int kyqs = k % (QI8_1/2) + QI8_1 * (k / (QI8_1/2)); @@ -3044,7 +4161,7 @@ static __device__ __forceinline__ float vec_dot_q5_0_q8_1( } template static __device__ __forceinline__ void allocate_tiles_q5_0(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) { - (void)x_qh; (void)x_sc; + GGML_UNUSED(x_qh); GGML_UNUSED(x_sc); __shared__ int tile_x_ql[mmq_y * (2*WARP_SIZE) + mmq_y]; __shared__ float tile_x_d[mmq_y * (WARP_SIZE/QI5_0) + mmq_y/QI5_0]; @@ -3056,7 +4173,7 @@ template static __device__ __forceinline__ void allocate_tiles_q5_0( template static __device__ __forceinline__ void load_tiles_q5_0( const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh, int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) { - (void)x_qh; (void)x_sc; + GGML_UNUSED(x_qh); GGML_UNUSED(x_sc); GGML_CUDA_ASSUME(i_offset >= 0); GGML_CUDA_ASSUME(i_offset < nwarps); @@ -3121,7 +4238,7 @@ template static __device__ __forceinlin static __device__ __forceinline__ float vec_dot_q5_0_q8_1_mul_mat( const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc, const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) { - (void)x_qh; (void)x_sc; + GGML_UNUSED(x_qh); GGML_UNUSED(x_sc); const int kyqs = k % (QI8_1/2) + QI8_1 * (k / (QI8_1/2)); const int index_bx = i * (WARP_SIZE/QI5_0) + i/QI5_0 + k/QI5_0; @@ -3161,7 +4278,7 @@ static __device__ __forceinline__ float vec_dot_q5_1_q8_1( } template static __device__ __forceinline__ void allocate_tiles_q5_1(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) { - (void)x_qh; (void)x_sc; + GGML_UNUSED(x_qh); GGML_UNUSED(x_sc); __shared__ int tile_x_ql[mmq_y * (2*WARP_SIZE) + mmq_y]; __shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE/QI5_1) + mmq_y/QI5_1]; @@ -3173,7 +4290,7 @@ template static __device__ __forceinline__ void allocate_tiles_q5_1( template static __device__ __forceinline__ void load_tiles_q5_1( const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh, int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) { - (void)x_qh; (void)x_sc; + GGML_UNUSED(x_qh); GGML_UNUSED(x_sc); GGML_CUDA_ASSUME(i_offset >= 0); GGML_CUDA_ASSUME(i_offset < nwarps); @@ -3235,7 +4352,7 @@ template static __device__ __forceinlin static __device__ __forceinline__ float vec_dot_q5_1_q8_1_mul_mat( const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const 
int * __restrict__ x_sc, const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) { - (void)x_qh; (void)x_sc; + GGML_UNUSED(x_qh); GGML_UNUSED(x_sc); const int kyqs = k % (QI8_1/2) + QI8_1 * (k / (QI8_1/2)); const int index_bx = i * (WARP_SIZE/QI5_1) + + i/QI5_1 + k/QI5_1; @@ -3270,7 +4387,7 @@ static __device__ __forceinline__ float vec_dot_q8_0_q8_1( } template static __device__ __forceinline__ void allocate_tiles_q8_0(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) { - (void)x_qh; (void)x_sc; + GGML_UNUSED(x_qh); GGML_UNUSED(x_sc); __shared__ int tile_x_qs[mmq_y * (WARP_SIZE) + mmq_y]; __shared__ float tile_x_d[mmq_y * (WARP_SIZE/QI8_0) + mmq_y/QI8_0]; @@ -3282,7 +4399,7 @@ template static __device__ __forceinline__ void allocate_tiles_q8_0( template static __device__ __forceinline__ void load_tiles_q8_0( const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh, int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) { - (void)x_qh; (void)x_sc; + GGML_UNUSED(x_qh); GGML_UNUSED(x_sc); GGML_CUDA_ASSUME(i_offset >= 0); GGML_CUDA_ASSUME(i_offset < nwarps); @@ -3328,7 +4445,7 @@ template static __device__ __forceinlin static __device__ __forceinline__ float vec_dot_q8_0_q8_1_mul_mat( const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc, const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) { - (void)x_qh; (void)x_sc; + GGML_UNUSED(x_qh); GGML_UNUSED(x_sc); const float * x_dmf = (const float *) x_dm; const float * y_df = (const float *) y_ds; @@ -3362,7 +4479,7 @@ static __device__ __forceinline__ float vec_dot_q2_K_q8_1( } template static __device__ __forceinline__ void allocate_tiles_q2_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) { - (void)x_qh; + GGML_UNUSED(x_qh); __shared__ int tile_x_ql[mmq_y * (WARP_SIZE) + mmq_y]; __shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE/QI2_K) + mmq_y/QI2_K]; @@ -3376,7 +4493,7 @@ template static __device__ __forceinline__ void allocate_tiles_q2_K( template static __device__ __forceinline__ void load_tiles_q2_K( const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh, int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) { - (void)x_qh; + GGML_UNUSED(x_qh); GGML_CUDA_ASSUME(i_offset >= 0); GGML_CUDA_ASSUME(i_offset < nwarps); @@ -3434,7 +4551,7 @@ template static __device__ __forceinlin static __device__ __forceinline__ float vec_dot_q2_K_q8_1_mul_mat( const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc, const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) { - (void)x_qh; + GGML_UNUSED(x_qh); const int kbx = k / QI2_K; const int ky = (k % QI2_K) * QR2_K; @@ -3703,7 +4820,7 @@ static __device__ __forceinline__ float vec_dot_q4_K_q8_1( } template static __device__ __forceinline__ void allocate_tiles_q4_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) { - (void)x_qh; + GGML_UNUSED(x_qh); __shared__ int tile_x_ql[mmq_y * (WARP_SIZE) + mmq_y]; __shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE/QI4_K) + mmq_y/QI4_K]; @@ -3717,7 +4834,7 @@ template static __device__ __forceinline__ void allocate_tiles_q4_K( template static __device__ 
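All the allocate_tiles_q* functions above follow one shape: statically sized shared-memory tiles handed back through the out-parameters the mul_mat kernel expects. A condensed sketch, with 8 standing in for the per-type QI constant; the extra "+ mmq_y" element per row is padding, presumably to stagger rows and reduce shared-memory bank conflicts (an assumption — the patch does not say).

#define WARP_SIZE 32

template <int mmq_y>
static __device__ __forceinline__ void allocate_tiles_sketch(
        int ** x_ql, float ** x_d) {
    // one padded row of quantized values per tile row, plus per-block scales
    __shared__ int   tile_x_qs[mmq_y * WARP_SIZE + mmq_y];
    __shared__ float tile_x_d [mmq_y * (WARP_SIZE/8) + mmq_y/8];

    *x_ql = tile_x_qs;
    *x_d  = tile_x_d;
}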
__forceinline__ void load_tiles_q4_K( const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh, int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) { - (void)x_qh; + GGML_UNUSED(x_qh); GGML_CUDA_ASSUME(i_offset >= 0); GGML_CUDA_ASSUME(i_offset < nwarps); @@ -3787,7 +4904,7 @@ template static __device__ __forceinlin static __device__ __forceinline__ float vec_dot_q4_K_q8_1_mul_mat( const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc, const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) { - (void)x_qh; + GGML_UNUSED(x_qh); const uint8_t * sc = ((const uint8_t *) &x_sc[i * (WARP_SIZE/8) + i/8 + k/16]) + 2*((k % 16) / 8); @@ -3886,7 +5003,7 @@ static __device__ __forceinline__ float vec_dot_q5_K_q8_1( } template static __device__ __forceinline__ void allocate_tiles_q5_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) { - (void)x_qh; + GGML_UNUSED(x_qh); __shared__ int tile_x_ql[mmq_y * (2*WARP_SIZE) + mmq_y]; __shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE/QI5_K) + mmq_y/QI5_K]; @@ -3900,7 +5017,7 @@ template static __device__ __forceinline__ void allocate_tiles_q5_K( template static __device__ __forceinline__ void load_tiles_q5_K( const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh, int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) { - (void)x_qh; + GGML_UNUSED(x_qh); GGML_CUDA_ASSUME(i_offset >= 0); GGML_CUDA_ASSUME(i_offset < nwarps); @@ -3981,7 +5098,7 @@ template static __device__ __forceinlin static __device__ __forceinline__ float vec_dot_q5_K_q8_1_mul_mat( const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc, const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) { - (void)x_qh; + GGML_UNUSED(x_qh); const uint8_t * sc = ((const uint8_t *) &x_sc[i * (WARP_SIZE/8) + i/8 + k/16]) + 2 * ((k % 16) / 8); @@ -4018,7 +5135,7 @@ static __device__ __forceinline__ float vec_dot_q6_K_q8_1( } template static __device__ __forceinline__ void allocate_tiles_q6_K(int ** x_ql, half2 ** x_dm, int ** x_qh, int ** x_sc) { - (void)x_qh; + GGML_UNUSED(x_qh); __shared__ int tile_x_ql[mmq_y * (2*WARP_SIZE) + mmq_y]; __shared__ half2 tile_x_dm[mmq_y * (WARP_SIZE/QI6_K) + mmq_y/QI6_K]; @@ -4032,7 +5149,7 @@ template static __device__ __forceinline__ void allocate_tiles_q6_K( template static __device__ __forceinline__ void load_tiles_q6_K( const void * __restrict__ vx, int * __restrict__ x_ql, half2 * __restrict__ x_dm, int * __restrict__ x_qh, int * __restrict__ x_sc, const int & i_offset, const int & i_max, const int & k, const int & blocks_per_row) { - (void)x_qh; + GGML_UNUSED(x_qh); GGML_CUDA_ASSUME(i_offset >= 0); GGML_CUDA_ASSUME(i_offset < nwarps); @@ -4104,7 +5221,7 @@ template static __device__ __forceinlin static __device__ __forceinline__ float vec_dot_q6_K_q8_1_mul_mat( const int * __restrict__ x_ql, const half2 * __restrict__ x_dm, const int * __restrict__ x_qh, const int * __restrict__ x_sc, const int * __restrict__ y_qs, const half2 * __restrict__ y_ds, const int & i, const int & j, const int & k) { - (void)x_qh; + GGML_UNUSED(x_qh); const float * x_dmf = (const float *) x_dm; const float * y_df = (const float *) y_ds; @@ 
-4199,12 +5316,12 @@ static __device__ __forceinline__ float vec_dot_iq2_xs_q8_1( const float d = (float)bq2->d * __low2float(bq8_1[ib32].ds) * 0.25f; return d * ((0.5f + ls1) * sumi1 + (0.5f + ls2) * sumi2); #else - (void) ksigns64; + GGML_UNUSED(ksigns64); assert(false); return 0.f; #endif #else - (void) ksigns64; + GGML_UNUSED(ksigns64); assert(false); return 0.f; #endif @@ -4247,12 +5364,12 @@ static __device__ __forceinline__ float vec_dot_iq2_s_q8_1( const float d = (float)bq2->d * __low2float(bq8_1[ib32].ds) * 0.25f; return d * ((0.5f + ls1) * sumi1 + (0.5f + ls2) * sumi2); #else - (void) ksigns64; + GGML_UNUSED(ksigns64); assert(false); return 0.f; #endif #else - (void) ksigns64; + GGML_UNUSED(ksigns64); assert(false); return 0.f; #endif @@ -4654,7 +5771,7 @@ template static __global__ void load_tiles_q4_0, VDR_Q4_0_Q8_1_MMQ, vec_dot_q4_0_q8_1_mul_mat> (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); #else - (void) vec_dot_q4_0_q8_1_mul_mat; + GGML_UNUSED(vec_dot_q4_0_q8_1_mul_mat); NO_DEVICE_CODE; #endif // __CUDA_ARCH__ >= CC_VOLTA } @@ -4723,7 +5840,7 @@ template static __global__ void load_tiles_q4_1, VDR_Q4_1_Q8_1_MMQ, vec_dot_q4_1_q8_1_mul_mat> (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); #else - (void) vec_dot_q4_1_q8_1_mul_mat; + GGML_UNUSED(vec_dot_q4_1_q8_1_mul_mat); NO_DEVICE_CODE; #endif // __CUDA_ARCH__ >= CC_VOLTA } @@ -4790,7 +5907,7 @@ template static __global__ void load_tiles_q5_0, VDR_Q5_0_Q8_1_MMQ, vec_dot_q5_0_q8_1_mul_mat> (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); #else - (void) vec_dot_q5_0_q8_1_mul_mat; + GGML_UNUSED(vec_dot_q5_0_q8_1_mul_mat); NO_DEVICE_CODE; #endif // __CUDA_ARCH__ >= CC_VOLTA } @@ -4857,7 +5974,7 @@ mul_mat_q5_1( load_tiles_q5_1, VDR_Q5_1_Q8_1_MMQ, vec_dot_q5_1_q8_1_mul_mat> (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); #else - (void) vec_dot_q5_1_q8_1_mul_mat; + GGML_UNUSED(vec_dot_q5_1_q8_1_mul_mat); NO_DEVICE_CODE; #endif // __CUDA_ARCH__ >= CC_VOLTA } @@ -4924,7 +6041,7 @@ template static __global__ void load_tiles_q8_0, VDR_Q8_0_Q8_1_MMQ, vec_dot_q8_0_q8_1_mul_mat> (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); #else - (void) vec_dot_q8_0_q8_1_mul_mat; + GGML_UNUSED(vec_dot_q8_0_q8_1_mul_mat); NO_DEVICE_CODE; #endif // __CUDA_ARCH__ >= CC_VOLTA } @@ -4991,7 +6108,7 @@ mul_mat_q2_K( load_tiles_q2_K, VDR_Q2_K_Q8_1_MMQ, vec_dot_q2_K_q8_1_mul_mat> (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); #else - (void) vec_dot_q2_K_q8_1_mul_mat; + GGML_UNUSED(vec_dot_q2_K_q8_1_mul_mat); NO_DEVICE_CODE; #endif // __CUDA_ARCH__ >= CC_VOLTA } @@ -5060,7 +6177,7 @@ template static __global__ void load_tiles_q3_K, VDR_Q3_K_Q8_1_MMQ, vec_dot_q3_K_q8_1_mul_mat> (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); #else - (void) vec_dot_q3_K_q8_1_mul_mat; + GGML_UNUSED(vec_dot_q3_K_q8_1_mul_mat); NO_DEVICE_CODE; #endif // __CUDA_ARCH__ >= CC_VOLTA } @@ -5129,7 +6246,7 @@ template static __global__ void load_tiles_q4_K, VDR_Q4_K_Q8_1_MMQ, vec_dot_q4_K_q8_1_mul_mat> (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); #else - (void) vec_dot_q4_K_q8_1_mul_mat; + GGML_UNUSED(vec_dot_q4_K_q8_1_mul_mat); NO_DEVICE_CODE; #endif // __CUDA_ARCH__ >= CC_VOLTA } @@ -5196,7 +6313,7 @@ mul_mat_q5_K( load_tiles_q5_K, VDR_Q5_K_Q8_1_MMQ, vec_dot_q5_K_q8_1_mul_mat> (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); #else - (void) vec_dot_q5_K_q8_1_mul_mat; + GGML_UNUSED(vec_dot_q5_K_q8_1_mul_mat); NO_DEVICE_CODE; #endif // __CUDA_ARCH__ >= CC_VOLTA } 
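Every mul_mat_q* kernel above repeats the same architecture gate: compile the real body only for __CUDA_ARCH__ >= CC_VOLTA, and make the fallback branch consume its parameters with GGML_UNUSED before trapping. A self-contained sketch, with __trap() standing in for the file's NO_DEVICE_CODE macro:

#define GGML_UNUSED(x) (void)(x)
#define CC_VOLTA 700

static __global__ void gated_kernel_sketch(const float * x, float * dst) {
#if __CUDA_ARCH__ >= CC_VOLTA
    dst[threadIdx.x] = x[threadIdx.x];  // fast path for supported archs
#else
    GGML_UNUSED(x);                     // silence unused-parameter warnings
    GGML_UNUSED(dst);
    __trap();                           // unsupported arch: abort the kernel
#endif
}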
@@ -5265,7 +6382,7 @@ template static __global__ void load_tiles_q6_K, VDR_Q6_K_Q8_1_MMQ, vec_dot_q6_K_q8_1_mul_mat> (vx, vy, dst, ncols_x, nrows_x, ncols_y, nrows_y, nrows_dst); #else - (void) vec_dot_q6_K_q8_1_mul_mat; + GGML_UNUSED(vec_dot_q6_K_q8_1_mul_mat); NO_DEVICE_CODE; #endif // __CUDA_ARCH__ >= CC_VOLTA } @@ -6082,46 +7199,48 @@ static __global__ void pool2d_nchw_kernel( const int kh, const int kw, const int sh, const int sw, const int ph, const int pw, const int parallel_elements, const Ti* src, To* dst, const enum ggml_op_pool op) { - int idx = threadIdx.x + blockIdx.x * blockDim.x; - if (idx >= parallel_elements) { - return; - } + int idx = threadIdx.x + blockIdx.x * blockDim.x; + if (idx >= parallel_elements) { + return; + } - const int I_HW = ih * iw; - const int O_HW = oh * ow; - const int nc = idx / O_HW; - const int cur_oh = idx % O_HW / ow; - const int cur_ow = idx % O_HW % ow; - const Ti* i_ptr = src + nc * I_HW; - To* o_ptr = dst + nc * O_HW; - const int start_h = cur_oh * sh - ph; - const int bh = max(0, start_h); - const int eh = min(ih, start_h + kh); - const int start_w = cur_ow * sw - pw; - const int bw = max(0, start_w); - const int ew = min(iw, start_w + kw); - const To scale = 1. / (kh * kw); - To res = 0; + const int I_HW = ih * iw; + const int O_HW = oh * ow; + const int nc = idx / O_HW; + const int cur_oh = idx % O_HW / ow; + const int cur_ow = idx % O_HW % ow; + const Ti* i_ptr = src + nc * I_HW; + To* o_ptr = dst + nc * O_HW; + const int start_h = cur_oh * sh - ph; + const int bh = max(0, start_h); + const int eh = min(ih, start_h + kh); + const int start_w = cur_ow * sw - pw; + const int bw = max(0, start_w); + const int ew = min(iw, start_w + kw); + const To scale = 1. / (kh * kw); + To res = 0; - switch (op) { - case GGML_OP_POOL_AVG: res = 0; break; - case GGML_OP_POOL_MAX: res = -FLT_MAX; break; - } + switch (op) { + case GGML_OP_POOL_AVG: res = 0; break; + case GGML_OP_POOL_MAX: res = -FLT_MAX; break; + default: assert(false); + } - for (int i = bh; i < eh; i += 1) { - for (int j = bw; j < ew; j += 1) { - #if __CUDA_ARCH__ >= 350 - Ti cur = __ldg(i_ptr + i * iw + j); - #else - Ti cur = i_ptr[i * iw + j]; - #endif - switch (op) { - case GGML_OP_POOL_AVG: res += cur * scale; break; - case GGML_OP_POOL_MAX: res = max(res, (To)cur); break; - } + for (int i = bh; i < eh; i += 1) { + for (int j = bw; j < ew; j += 1) { +#if __CUDA_ARCH__ >= 350 + Ti cur = __ldg(i_ptr + i * iw + j); +#else + Ti cur = i_ptr[i * iw + j]; +#endif + switch (op) { + case GGML_OP_POOL_AVG: res += cur * scale; break; + case GGML_OP_POOL_MAX: res = max(res, (To)cur); break; + default: assert(false); } } - o_ptr[cur_oh * ow + cur_ow] = res; + } + o_ptr[cur_oh * ow + cur_ow] = res; } template @@ -6155,7 +7274,7 @@ static void get_rows_cuda(const ggml_tensor * src0, const ggml_tensor * src1, gg /* nb00,*/ nb01, nb02, nb03, s10, s11, s12/*, s13*/); - (void) dst; + GGML_UNUSED(dst); } template @@ -6187,7 +7306,7 @@ static void get_rows_cuda_float(const ggml_tensor * src0, const ggml_tensor * sr /* nb00,*/ nb01, nb02, nb03, s10, s11, s12/*, s13*/); - (void) dst; + GGML_UNUSED(dst); } template @@ -6580,7 +7699,7 @@ static to_fp16_cuda_t ggml_get_to_fp16_cuda(ggml_type type) { return dequantize_block_cuda; case GGML_TYPE_Q8_0: CUDA_CHECK(cudaGetDevice(&id)); - if (g_device_caps[id].cc >= CC_PASCAL) { + if (get_cuda_global_info().devices[id].cc >= CC_PASCAL) { return dequantize_block_q8_0_f16_cuda; } return dequantize_block_cuda; @@ -6773,7 +7892,7 @@ static void mul_mat_vec_q_cuda( int64_t 
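The re-indented pool2d_nchw_kernel above reads its input through __ldg on compute capability 3.5 and newer, which routes the load through the read-only data cache; older architectures fall back to a plain load. The guard can be factored like this:

template <typename Ti>
static __device__ __forceinline__ Ti load_readonly(const Ti * p) {
#if __CUDA_ARCH__ >= 350
    return __ldg(p);  // read-only data cache path
#else
    return *p;        // plain load on pre-Kepler-GK110 hardware
#endif
}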
nwarps = 1; int64_t rows_per_cuda_block = 1; - if (g_device_caps[id].cc < CC_RDNA2) { // NVIDIA and AMD older than RDNA2 + if (get_cuda_global_info().devices[id].cc < CC_RDNA2) { // NVIDIA and AMD older than RDNA2 switch(ncols_y) { case 1: nwarps = 4; @@ -6846,7 +7965,7 @@ static void ggml_mul_mat_q4_0_q8_1_cuda( int id; CUDA_CHECK(cudaGetDevice(&id)); - const int compute_capability = g_device_caps[id].cc; + const int compute_capability = get_cuda_global_info().devices[id].cc; int mmq_x, mmq_y, nwarps; if (compute_capability >= CC_RDNA2) { @@ -6891,7 +8010,7 @@ static void ggml_mul_mat_q4_1_q8_1_cuda( int id; CUDA_CHECK(cudaGetDevice(&id)); - const int compute_capability = g_device_caps[id].cc; + const int compute_capability = get_cuda_global_info().devices[id].cc; int mmq_x, mmq_y, nwarps; if (compute_capability >= CC_RDNA2) { @@ -6936,7 +8055,7 @@ static void ggml_mul_mat_q5_0_q8_1_cuda( int id; CUDA_CHECK(cudaGetDevice(&id)); - const int compute_capability = g_device_caps[id].cc; + const int compute_capability = get_cuda_global_info().devices[id].cc; int mmq_x, mmq_y, nwarps; if (compute_capability >= CC_RDNA2) { @@ -6981,7 +8100,7 @@ static void ggml_mul_mat_q5_1_q8_1_cuda( int id; CUDA_CHECK(cudaGetDevice(&id)); - const int compute_capability = g_device_caps[id].cc; + const int compute_capability = get_cuda_global_info().devices[id].cc; int mmq_x, mmq_y, nwarps; if (compute_capability >= CC_RDNA2) { @@ -7026,7 +8145,7 @@ static void ggml_mul_mat_q8_0_q8_1_cuda( int id; CUDA_CHECK(cudaGetDevice(&id)); - const int compute_capability = g_device_caps[id].cc; + const int compute_capability = get_cuda_global_info().devices[id].cc; int mmq_x, mmq_y, nwarps; if (compute_capability >= CC_RDNA2) { @@ -7071,7 +8190,7 @@ static void ggml_mul_mat_q2_K_q8_1_cuda( int id; CUDA_CHECK(cudaGetDevice(&id)); - const int compute_capability = g_device_caps[id].cc; + const int compute_capability = get_cuda_global_info().devices[id].cc; int mmq_x, mmq_y, nwarps; if (compute_capability >= CC_RDNA2) { @@ -7118,7 +8237,7 @@ static void ggml_mul_mat_q3_K_q8_1_cuda( int id; CUDA_CHECK(cudaGetDevice(&id)); - const int compute_capability = g_device_caps[id].cc; + const int compute_capability = get_cuda_global_info().devices[id].cc; int mmq_x, mmq_y, nwarps; if (compute_capability >= CC_RDNA2) { @@ -7164,7 +8283,7 @@ static void ggml_mul_mat_q4_K_q8_1_cuda( int id; CUDA_CHECK(cudaGetDevice(&id)); - const int compute_capability = g_device_caps[id].cc; + const int compute_capability = get_cuda_global_info().devices[id].cc; int mmq_x, mmq_y, nwarps; if (compute_capability >= CC_RDNA2) { @@ -7209,7 +8328,7 @@ static void ggml_mul_mat_q5_K_q8_1_cuda( int id; CUDA_CHECK(cudaGetDevice(&id)); - const int compute_capability = g_device_caps[id].cc; + const int compute_capability = get_cuda_global_info().devices[id].cc; int mmq_x, mmq_y, nwarps; if (compute_capability >= CC_RDNA2) { @@ -7254,7 +8373,7 @@ static void ggml_mul_mat_q6_K_q8_1_cuda( int id; CUDA_CHECK(cudaGetDevice(&id)); - const int compute_capability = g_device_caps[id].cc; + const int compute_capability = get_cuda_global_info().devices[id].cc; int mmq_x, mmq_y, nwarps; if (compute_capability >= CC_RDNA2) { @@ -7506,7 +8625,7 @@ static void soft_max_f32_cuda(const float * x, const float * mask, const float * const float m0 = powf(2.0f, -(max_bias ) / n_head_log2); const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_head_log2); - if (shmem < g_device_caps[g_main_device].smpb) { + if (shmem < get_cuda_global_info().devices[ggml_cuda_get_device()].smpb) { switch 
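Each ggml_mul_mat_q*_q8_1_cuda launcher above reads the device's cached compute capability and picks tile sizes (mmq_x, mmq_y, nwarps) from it. The shape of that dispatch, with placeholder constants rather than the tuned values from the patch (CC_RDNA2 is the project's constant; the 1030 here is only a stand-in):

struct launch_cfg { int mmq_x, mmq_y, nwarps; };

static launch_cfg pick_mmq_cfg(const int compute_capability) {
    const int CC_RDNA2 = 1030; // placeholder for the project's constant
    if (compute_capability >= CC_RDNA2) {
        return {64, 128, 8};   // placeholder tuning for newer hardware
    }
    return {32, 64, 4};        // placeholder tuning for older hardware
}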
(ncols_x) { case 32: soft_max_f32<<>>(x, mask, pos, dst, ncols_x, nrows_y, scale, max_bias, m0, m1, n_head_log2); @@ -7543,7 +8662,7 @@ static void soft_max_f32_cuda(const float * x, const float * mask, const float * } template -static void im2col_cuda(const float* x, T* dst, +static void im2col_cuda(const float * x, T* dst, int64_t IW, int64_t IH, int64_t OW, int64_t OH, int64_t KW, int64_t KH, int64_t IC, int64_t batch, int64_t batch_offset, int64_t offset_delta, int s0,int s1,int p0,int p1,int d0,int d1, cudaStream_t stream) { @@ -7553,373 +8672,11 @@ static void im2col_cuda(const float* x, T* dst, im2col_kernel<<>>(x, dst, batch_offset, offset_delta, IC, IW, IH, OH, OW, KW, KH, parallel_elements, (IC * KH * KW), s0, s1, p0, p1, d0, d1); } -// buffer pool for cuda -#define MAX_CUDA_BUFFERS 256 - -struct scoped_spin_lock { - std::atomic_flag& lock; - scoped_spin_lock(std::atomic_flag& lock) : lock(lock) { - while (lock.test_and_set(std::memory_order_acquire)) { - ; // spin - } - } - ~scoped_spin_lock() { - lock.clear(std::memory_order_release); - } - scoped_spin_lock(const scoped_spin_lock&) = delete; - scoped_spin_lock& operator=(const scoped_spin_lock&) = delete; -}; - -static std::atomic_flag g_cuda_pool_lock = ATOMIC_FLAG_INIT; - -// #define DEBUG_CUDA_MALLOC -struct ggml_cuda_buffer { - void * ptr = nullptr; - size_t size = 0; -}; - -static ggml_cuda_buffer g_cuda_buffer_pool[GGML_CUDA_MAX_DEVICES][MAX_CUDA_BUFFERS]; -static size_t g_cuda_pool_size[GGML_CUDA_MAX_DEVICES] = {0}; - -static void * ggml_cuda_pool_malloc_leg(int device, size_t size, size_t * actual_size) { - scoped_spin_lock lock(g_cuda_pool_lock); -#ifdef DEBUG_CUDA_MALLOC - int nnz = 0; - size_t max_size = 0; -#endif - size_t best_diff = 1ull << 36; - int ibest = -1; - for (int i = 0; i < MAX_CUDA_BUFFERS; ++i) { - ggml_cuda_buffer& b = g_cuda_buffer_pool[device][i]; - if (b.ptr != nullptr) { -#ifdef DEBUG_CUDA_MALLOC - ++nnz; - if (b.size > max_size) max_size = b.size; -#endif - if (b.size >= size) { - size_t diff = b.size - size; - if (diff < best_diff) { - best_diff = diff; - ibest = i; - if (!best_diff) { - void * ptr = b.ptr; - *actual_size = b.size; - b.ptr = nullptr; - b.size = 0; - return ptr; - } - } - } - } - } - if (ibest >= 0) { - ggml_cuda_buffer& b = g_cuda_buffer_pool[device][ibest]; - void * ptr = b.ptr; - *actual_size = b.size; - b.ptr = nullptr; - b.size = 0; - return ptr; - } - void * ptr; - size_t look_ahead_size = (size_t) (1.05 * size); - look_ahead_size = 256 * ((look_ahead_size + 255)/256); - ggml_cuda_set_device(device); - CUDA_CHECK(cudaMalloc((void **) &ptr, look_ahead_size)); - *actual_size = look_ahead_size; - g_cuda_pool_size[device] += look_ahead_size; -#ifdef DEBUG_CUDA_MALLOC - fprintf(stderr, "%s[%d]: %d buffers, max_size = %u MB, pool_size = %u MB, requested %u MB\n", __func__, device, nnz, - (uint32_t)(max_size/1024/1024), (uint32_t)(g_cuda_pool_size[device]/1024/1024), (uint32_t)(size/1024/1024)); -#endif - return ptr; -} - -static void ggml_cuda_pool_free_leg(int device, void * ptr, size_t size) { - scoped_spin_lock lock(g_cuda_pool_lock); - - for (int i = 0; i < MAX_CUDA_BUFFERS; ++i) { - ggml_cuda_buffer& b = g_cuda_buffer_pool[device][i]; - if (b.ptr == nullptr) { - b.ptr = ptr; - b.size = size; - return; - } - } - fprintf(stderr, "WARNING: cuda buffer pool full, increase MAX_CUDA_BUFFERS\n"); - ggml_cuda_set_device(device); - CUDA_CHECK(cudaFree(ptr)); - g_cuda_pool_size[device] -= size; -} - -#if !defined(GGML_USE_HIPBLAS) -// pool with virtual memory -static CUdeviceptr 
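The legacy pool deleted above does a best-fit search over MAX_CUDA_BUFFERS slots and, on a miss, over-allocates so that slightly larger follow-up requests can be served from the pool instead of hitting cudaMalloc again. That sizing policy in isolation:

#include <cstddef>

// Pad fresh allocations by 5% and round up to a 256-byte multiple,
// mirroring the look_ahead_size computation in ggml_cuda_pool_malloc_leg.
static size_t pool_look_ahead_size(const size_t size) {
    size_t look_ahead = (size_t) (1.05 * size);
    return 256 * ((look_ahead + 255) / 256);
}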
g_cuda_pool_addr[GGML_CUDA_MAX_DEVICES] = {0}; -static size_t g_cuda_pool_used[GGML_CUDA_MAX_DEVICES] = {0}; -static const size_t CUDA_POOL_VMM_MAX_SIZE = 1ull << 35; // 32 GB - -static void * ggml_cuda_pool_malloc_vmm(int device, size_t size, size_t * actual_size) { - scoped_spin_lock lock(g_cuda_pool_lock); - - // round up the allocation size to the alignment to ensure that all allocations are aligned for all data types - const size_t alignment = 128; - size = alignment * ((size + alignment - 1) / alignment); - - size_t avail = g_cuda_pool_size[device] - g_cuda_pool_used[device]; - - if (size > avail) { - // round up to the next multiple of the granularity - size_t reserve_size = size - avail; - const size_t granularity = g_device_caps[device].vmm_granularity; - reserve_size = granularity * ((reserve_size + granularity - 1) / granularity); - - GGML_ASSERT(g_cuda_pool_size[device] + reserve_size <= CUDA_POOL_VMM_MAX_SIZE); - - // allocate more physical memory - CUmemAllocationProp prop = {}; - prop.type = CU_MEM_ALLOCATION_TYPE_PINNED; - prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE; - prop.location.id = device; - CUmemGenericAllocationHandle handle; - CU_CHECK(cuMemCreate(&handle, reserve_size, &prop, 0)); - - // reserve virtual address space (if not already reserved) - if (g_cuda_pool_addr[device] == 0) { - CU_CHECK(cuMemAddressReserve(&g_cuda_pool_addr[device], CUDA_POOL_VMM_MAX_SIZE, 0, 0, 0)); - } - - // map at the end of the pool - CU_CHECK(cuMemMap(g_cuda_pool_addr[device] + g_cuda_pool_size[device], reserve_size, 0, handle, 0)); - - // the memory allocation handle is no longer needed after mapping - CU_CHECK(cuMemRelease(handle)); - - // set access - CUmemAccessDesc access = {}; - access.location.type = CU_MEM_LOCATION_TYPE_DEVICE; - access.location.id = device; - access.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE; - CU_CHECK(cuMemSetAccess(g_cuda_pool_addr[device] + g_cuda_pool_size[device], reserve_size, &access, 1)); - - // add to the pool - g_cuda_pool_size[device] += reserve_size; - - //printf("cuda pool[%d]: size increased to %llu MB (reserved %llu MB)\n", - // id, (unsigned long long) (g_cuda_pool_size[id]/1024/1024), - // (unsigned long long) (reserve_size/1024/1024)); - } - - GGML_ASSERT(g_cuda_pool_addr[device] != 0); - - void * ptr = (void *) (g_cuda_pool_addr[device] + g_cuda_pool_used[device]); - *actual_size = size; - g_cuda_pool_used[device] += size; - -#ifdef DEBUG_CUDA_MALLOC - printf("cuda pool[%d]: allocated %llu bytes at %llx\n", device, (unsigned long long) size, ptr); -#endif - - return ptr; -} - -static void ggml_cuda_pool_free_vmm(int device, void * ptr, size_t size) { - scoped_spin_lock lock(g_cuda_pool_lock); - -#ifdef DEBUG_CUDA_MALLOC - printf("cuda pool[%d]: freed %llu bytes at %llx\n", device, (unsigned long long) size, ptr); -#endif - - g_cuda_pool_used[device] -= size; - - // all deallocations must be in reverse order of the allocations - GGML_ASSERT(ptr == (void *) (g_cuda_pool_addr[device] + g_cuda_pool_used[device])); -} - -static void * ggml_cuda_pool_malloc(int device, size_t size, size_t * actual_size) { - if (g_device_caps[device].vmm) { - return ggml_cuda_pool_malloc_vmm(device, size, actual_size); - } else { - return ggml_cuda_pool_malloc_leg(device, size, actual_size); - } -} - -static void ggml_cuda_pool_free(int device, void * ptr, size_t size) { - if (g_device_caps[device].vmm) { - ggml_cuda_pool_free_vmm(device, ptr, size); - } else { - ggml_cuda_pool_free_leg(device, ptr, size); - } -} -#else -#define ggml_cuda_pool_malloc 
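The virtual-memory pool removed here reserves one large address range per device up front and maps physical chunks onto its tail on demand; because it tracks only a single used-bytes offset, frees must come back in reverse allocation order. A condensed sketch of the growth step, following the driver-API calls in the removed code (error checking elided):

#include <cuda.h>
#include <cstddef>

static CUdeviceptr pool_addr = 0;
static size_t      pool_size = 0;
static const size_t POOL_MAX = 1ull << 35; // 32 GB of address space

static void pool_grow(const int device, const size_t reserve_size) {
    // allocate more physical memory on the device
    CUmemAllocationProp prop = {};
    prop.type          = CU_MEM_ALLOCATION_TYPE_PINNED;
    prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
    prop.location.id   = device;
    CUmemGenericAllocationHandle handle;
    cuMemCreate(&handle, reserve_size, &prop, 0);

    // reserve the virtual range once, on first growth
    if (pool_addr == 0) {
        cuMemAddressReserve(&pool_addr, POOL_MAX, 0, 0, 0);
    }

    // map the new chunk at the end of the pool; the handle is not
    // needed once mapped
    cuMemMap(pool_addr + pool_size, reserve_size, 0, handle, 0);
    cuMemRelease(handle);

    // make the mapped range readable and writable from the device
    CUmemAccessDesc access = {};
    access.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
    access.location.id   = device;
    access.flags         = CU_MEM_ACCESS_FLAGS_PROT_READWRITE;
    cuMemSetAccess(pool_addr + pool_size, reserve_size, &access, 1);

    pool_size += reserve_size;
}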
ggml_cuda_pool_malloc_leg -#define ggml_cuda_pool_free ggml_cuda_pool_free_leg -#endif // !defined(GGML_USE_HIPBLAS) - -template -struct cuda_pool_alloc { - int device = -1; - T * ptr = nullptr; - size_t actual_size = 0; - - // size is in number of elements - T * alloc(size_t size) { - GGML_ASSERT(ptr == nullptr); - CUDA_CHECK(cudaGetDevice(&device)); - ptr = (T *) ggml_cuda_pool_malloc(device, size * sizeof(T), &this->actual_size); - return ptr; - } - - cuda_pool_alloc(size_t size) { - alloc(size); - } - - ~cuda_pool_alloc() { - if (ptr != nullptr) { - ggml_cuda_pool_free(device, ptr, actual_size); - } - } - - T * get() { - return ptr; - } - - cuda_pool_alloc() = default; - cuda_pool_alloc(const cuda_pool_alloc &) = delete; - cuda_pool_alloc(cuda_pool_alloc &&) = delete; - cuda_pool_alloc& operator=(const cuda_pool_alloc &) = delete; - cuda_pool_alloc& operator=(cuda_pool_alloc &&) = delete; -}; - -static bool g_cublas_loaded = false; - -static void ggml_init_cublas() { - static bool initialized = false; - - if (!initialized) { - -#ifdef __HIP_PLATFORM_AMD__ - // Workaround for a rocBLAS bug when using multiple graphics cards: - // https://github.com/ROCmSoftwarePlatform/rocBLAS/issues/1346 - rocblas_initialize(); - CUDA_CHECK(cudaDeviceSynchronize()); -#endif - - if (cudaGetDeviceCount(&g_device_count) != cudaSuccess) { - initialized = true; - g_cublas_loaded = false; - fprintf(stderr, "%s: no " GGML_CUDA_NAME " devices found, " GGML_CUDA_NAME " will be disabled\n", __func__); - return; - } - - GGML_ASSERT(g_device_count <= GGML_CUDA_MAX_DEVICES); - int64_t total_vram = 0; -#if defined(GGML_CUDA_FORCE_MMQ) - fprintf(stderr, "%s: GGML_CUDA_FORCE_MMQ: yes\n", __func__); -#else - fprintf(stderr, "%s: GGML_CUDA_FORCE_MMQ: no\n", __func__); -#endif -#if defined(CUDA_USE_TENSOR_CORES) - fprintf(stderr, "%s: CUDA_USE_TENSOR_CORES: yes\n", __func__); -#else - fprintf(stderr, "%s: CUDA_USE_TENSOR_CORES: no\n", __func__); -#endif - fprintf(stderr, "%s: found %d " GGML_CUDA_NAME " devices:\n", __func__, g_device_count); - for (int id = 0; id < g_device_count; ++id) { - int device_vmm = 0; - -#if !defined(GGML_USE_HIPBLAS) - CUdevice device; - CU_CHECK(cuDeviceGet(&device, id)); - CU_CHECK(cuDeviceGetAttribute(&device_vmm, CU_DEVICE_ATTRIBUTE_VIRTUAL_MEMORY_MANAGEMENT_SUPPORTED, device)); - - if (device_vmm) { - CUmemAllocationProp alloc_prop = {}; - alloc_prop.type = CU_MEM_ALLOCATION_TYPE_PINNED; - alloc_prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE; - alloc_prop.location.id = id; - CU_CHECK(cuMemGetAllocationGranularity(&g_device_caps[id].vmm_granularity, &alloc_prop, CU_MEM_ALLOC_GRANULARITY_RECOMMENDED)); - } -#endif // !defined(GGML_USE_HIPBLAS) - g_device_caps[id].vmm = !!device_vmm; - - cudaDeviceProp prop; - CUDA_CHECK(cudaGetDeviceProperties(&prop, id)); - fprintf(stderr, " Device %d: %s, compute capability %d.%d, VMM: %s\n", id, prop.name, prop.major, prop.minor, device_vmm ? 
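cuda_pool_alloc, deleted above, is an RAII wrapper over the pool; its replacement ggml_cuda_pool_alloc keeps the same shape but is constructed from an explicit pool (ctx.pool()) instead of reaching for globals. The essential shape, with hypothetical pool_malloc/pool_free stand-ins backed by the C heap so the sketch is self-contained:

#include <cstddef>
#include <cstdlib>

static void * pool_malloc(size_t size)              { return std::malloc(size); } // stand-in
static void   pool_free(void * ptr, size_t /*sz*/)  { std::free(ptr); }           // stand-in

template <typename T>
struct pool_alloc_sketch {
    T *    ptr  = nullptr;
    size_t size = 0;

    // size is in number of elements, as in the original
    explicit pool_alloc_sketch(size_t n) : size(n * sizeof(T)) {
        ptr = (T *) pool_malloc(size);
    }
    ~pool_alloc_sketch() {
        if (ptr != nullptr) {
            pool_free(ptr, size); // release back to the pool on scope exit
        }
    }
    T * get() { return ptr; }

    // ownership must never be duplicated
    pool_alloc_sketch(const pool_alloc_sketch &)             = delete;
    pool_alloc_sketch & operator=(const pool_alloc_sketch &) = delete;
};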
"yes" : "no"); - - g_default_tensor_split[id] = total_vram; - total_vram += prop.totalGlobalMem; - -#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) - g_device_caps[id].cc = 100*prop.major + 10*prop.minor + CC_OFFSET_AMD; -#else - g_device_caps[id].cc = 100*prop.major + 10*prop.minor; -#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) - g_device_caps[id].smpb = prop.sharedMemPerBlock; - } - for (int id = 0; id < g_device_count; ++id) { - g_default_tensor_split[id] /= total_vram; - } - - for (int id = 0; id < g_device_count; ++id) { - ggml_cuda_set_device(id); - - // create cuda streams - for (int is = 0; is < MAX_STREAMS; ++is) { - CUDA_CHECK(cudaStreamCreateWithFlags(&g_cudaStreams[id][is], cudaStreamNonBlocking)); - } - - // create cublas handle - CUBLAS_CHECK(cublasCreate(&g_cublas_handles[id])); - CUBLAS_CHECK(cublasSetMathMode(g_cublas_handles[id], CUBLAS_TF32_TENSOR_OP_MATH)); - } - - // configure logging to stdout - // CUBLAS_CHECK(cublasLoggerConfigure(1, 1, 0, nullptr)); - - initialized = true; - g_cublas_loaded = true; - } -} - -static void * ggml_cuda_host_malloc(size_t size) { - if (getenv("GGML_CUDA_NO_PINNED") != nullptr) { - return nullptr; - } - - void * ptr = nullptr; - cudaError_t err = cudaMallocHost((void **) &ptr, size); - if (err != cudaSuccess) { - // clear the error - cudaGetLastError(); - fprintf(stderr, "%s: warning: failed to allocate %.2f MiB of pinned memory: %s\n", __func__, - size/1024.0/1024.0, cudaGetErrorString(err)); - return nullptr; - } - - return ptr; -} - -static void ggml_cuda_host_free(void * ptr) { - CUDA_CHECK(cudaFreeHost(ptr)); -} - static cudaError_t ggml_cuda_cpy_tensor_2d( void * dst, const struct ggml_tensor * src, int64_t i3, int64_t i2, int64_t i1_low, int64_t i1_high, cudaStream_t stream) { - cudaMemcpyKind kind; - char * src_ptr; - if (src->backend == GGML_BACKEND_TYPE_CPU) { - kind = cudaMemcpyHostToDevice; - src_ptr = (char *) src->data; - } else if (src->backend == GGML_BACKEND_TYPE_GPU || src->backend == GGML_BACKEND_TYPE_GPU_SPLIT) { - GGML_ASSERT(src->backend != GGML_BACKEND_TYPE_GPU_SPLIT || (i1_low == 0 && i1_high == src->ne[1])); - kind = cudaMemcpyDeviceToDevice; - ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) src->extra; - int id; - CUDA_CHECK(cudaGetDevice(&id)); - src_ptr = (char *) extra->data_device[id]; - } else { - GGML_ASSERT(false); - } + GGML_ASSERT(ggml_backend_buffer_is_cuda(src->buffer)); + char * src_ptr = (char *) src->data; char * dst_ptr = (char *) dst; const int64_t ne0 = src->ne[0]; @@ -7934,25 +8691,30 @@ static cudaError_t ggml_cuda_cpy_tensor_2d( const char * x = src_ptr + i1_low*nb1 + i2*nb2 + i3*nb3; if (nb0 == ts && nb1 == ts*ne0/bs) { - return cudaMemcpyAsync(dst_ptr, x, i1_diff*nb1, kind, stream); + return cudaMemcpyAsync(dst_ptr, x, i1_diff*nb1, cudaMemcpyDeviceToDevice, stream); } else if (nb0 == ts) { - return cudaMemcpy2DAsync(dst_ptr, ts*ne0/bs, x, nb1, ts*ne0/bs, i1_diff, kind, stream); + return cudaMemcpy2DAsync(dst_ptr, ts*ne0/bs, x, nb1, ts*ne0/bs, i1_diff, cudaMemcpyDeviceToDevice, stream); } else { for (int64_t i1 = 0; i1 < i1_diff; i1++) { const void * rx = (const void *) ((const char *) x + i1*nb1); void * rd = (void *) (dst_ptr + i1*ts*ne0/bs); // pretend the row is a matrix with cols=1 - cudaError_t r = cudaMemcpy2DAsync(rd, ts/bs, rx, nb0, ts/bs, ne0, kind, stream); - if (r != cudaSuccess) return r; + cudaError_t r = cudaMemcpy2DAsync(rd, ts/bs, rx, nb0, ts/bs, ne0, cudaMemcpyDeviceToDevice, stream); + if (r != cudaSuccess) { + return r; + } 
} return cudaSuccess; } } static void ggml_cuda_op_get_rows( + ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const float * src0_d, const float * src1_d, float * dst_d, cudaStream_t stream) { + GGML_UNUSED(ctx); + GGML_ASSERT(src1->type == GGML_TYPE_I32); GGML_ASSERT(dst->type == GGML_TYPE_F32); @@ -7994,9 +8756,12 @@ static void ggml_cuda_op_get_rows( template static void ggml_cuda_op_bin_bcast( + ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) { + GGML_UNUSED(ctx); + GGML_ASSERT(src1->type == GGML_TYPE_F32); if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) { @@ -8013,26 +8778,31 @@ static void ggml_cuda_op_bin_bcast( } static void ggml_cuda_op_repeat( + ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const float * src0_d, const float * src1_d, float * dst_d, cudaStream_t main_stream) { - ggml_cuda_op_bin_bcast>(dst, src0, dst, nullptr, src0_d, dst_d, main_stream); + ggml_cuda_op_bin_bcast>(ctx, dst, src0, dst, nullptr, src0_d, dst_d, main_stream); - (void) src1; - (void) src1_d; + GGML_UNUSED(src1); + GGML_UNUSED(src1_d); } static void ggml_cuda_op_add( + ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) { - ggml_cuda_op_bin_bcast>(src0, src1, dst, src0_dd, src1_dd, dst_dd, main_stream); + ggml_cuda_op_bin_bcast>(ctx, src0, src1, dst, src0_dd, src1_dd, dst_dd, main_stream); } static void ggml_cuda_op_acc( + ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) { + GGML_UNUSED(ctx); + GGML_ASSERT(src0->type == GGML_TYPE_F32); GGML_ASSERT(src1->type == GGML_TYPE_F32); GGML_ASSERT( dst->type == GGML_TYPE_F32); @@ -8045,125 +8815,135 @@ static void ggml_cuda_op_acc( acc_f32_cuda(src0_dd, src1_dd, dst_dd, ggml_nelements(dst), src1->ne[0], src1->ne[1], src1->ne[2], nb1, nb2, offset, main_stream); - (void) dst; + GGML_UNUSED(dst); } static void ggml_cuda_op_mul( + ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) { - ggml_cuda_op_bin_bcast>(src0, src1, dst, src0_dd, src1_dd, dst_dd, main_stream); + ggml_cuda_op_bin_bcast>(ctx, src0, src1, dst, src0_dd, src1_dd, dst_dd, main_stream); } static void ggml_cuda_op_div( + ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) { - ggml_cuda_op_bin_bcast>(src0, src1, dst, src0_dd, src1_dd, dst_dd, main_stream); + ggml_cuda_op_bin_bcast>(ctx, src0, src1, dst, src0_dd, src1_dd, dst_dd, main_stream); } static void ggml_cuda_op_gelu( + ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) { - + GGML_UNUSED(ctx); GGML_ASSERT(src0->type == GGML_TYPE_F32); GGML_ASSERT( dst->type == GGML_TYPE_F32); gelu_f32_cuda(src0_dd, dst_dd, ggml_nelements(src0), main_stream); - (void) src1; - (void) dst; - (void) src1_dd; 
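With the source tensor now guaranteed to live in a CUDA buffer, ggml_cuda_cpy_tensor_2d above collapses to a ladder of device-to-device copies: one flat cudaMemcpyAsync when rows are packed, one strided cudaMemcpy2DAsync when only the row stride differs, and a per-row loop otherwise. The first two rungs in isolation:

#include <cuda_runtime.h>
#include <cstdint>

static cudaError_t copy_rows_sketch(
        char * dst, const char * src, const size_t row_bytes,
        const size_t src_stride, const int64_t nrows, cudaStream_t stream) {
    if (src_stride == row_bytes) {
        // rows are packed: one flat device-to-device copy
        return cudaMemcpyAsync(dst, src, row_bytes*nrows,
                               cudaMemcpyDeviceToDevice, stream);
    }
    // rows are padded: a strided 2D copy keeps it to a single call
    return cudaMemcpy2DAsync(dst, row_bytes, src, src_stride,
                             row_bytes, nrows, cudaMemcpyDeviceToDevice, stream);
}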
+ GGML_UNUSED(src1); + GGML_UNUSED(dst); + GGML_UNUSED(src1_dd); } static void ggml_cuda_op_silu( + ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) { - + GGML_UNUSED(ctx); GGML_ASSERT(src0->type == GGML_TYPE_F32); GGML_ASSERT( dst->type == GGML_TYPE_F32); silu_f32_cuda(src0_dd, dst_dd, ggml_nelements(src0), main_stream); - (void) src1; - (void) dst; - (void) src1_dd; + GGML_UNUSED(src1); + GGML_UNUSED(dst); + GGML_UNUSED(src1_dd); } static void ggml_cuda_op_gelu_quick( + ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) { - + GGML_UNUSED(ctx); GGML_ASSERT(src0->type == GGML_TYPE_F32); GGML_ASSERT( dst->type == GGML_TYPE_F32); gelu_quick_f32_cuda(src0_dd, dst_dd, ggml_nelements(src0), main_stream); - (void) src1; - (void) dst; - (void) src1_dd; + GGML_UNUSED(src1); + GGML_UNUSED(dst); + GGML_UNUSED(src1_dd); } static void ggml_cuda_op_tanh( + ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) { - + GGML_UNUSED(ctx); GGML_ASSERT(src0->type == GGML_TYPE_F32); GGML_ASSERT( dst->type == GGML_TYPE_F32); tanh_f32_cuda(src0_dd, dst_dd, ggml_nelements(src0), main_stream); - (void) src1; - (void) dst; - (void) src1_dd; + GGML_UNUSED(src1); + GGML_UNUSED(dst); + GGML_UNUSED(src1_dd); } static void ggml_cuda_op_relu( + ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) { - + GGML_UNUSED(ctx); GGML_ASSERT(src0->type == GGML_TYPE_F32); GGML_ASSERT( dst->type == GGML_TYPE_F32); relu_f32_cuda(src0_dd, dst_dd, ggml_nelements(src0), main_stream); - (void) src1; - (void) dst; - (void) src1_dd; + GGML_UNUSED(src1); + GGML_UNUSED(dst); + GGML_UNUSED(src1_dd); } static void ggml_cuda_op_hardsigmoid( + ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) { - + GGML_UNUSED(ctx); GGML_ASSERT(src0->type == GGML_TYPE_F32); GGML_ASSERT( dst->type == GGML_TYPE_F32); hardsigmoid_f32_cuda(src0_dd, dst_dd, ggml_nelements(src0), main_stream); - (void) src1; - (void) dst; - (void) src1_dd; + GGML_UNUSED(src1); + GGML_UNUSED(dst); + GGML_UNUSED(src1_dd); } static void ggml_cuda_op_hardswish( + ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) { - + GGML_UNUSED(ctx); GGML_ASSERT(src0->type == GGML_TYPE_F32); GGML_ASSERT( dst->type == GGML_TYPE_F32); hardswish_f32_cuda(src0_dd, dst_dd, ggml_nelements(src0), main_stream); - (void) src1; - (void) dst; - (void) src1_dd; + GGML_UNUSED(src1); + GGML_UNUSED(dst); + GGML_UNUSED(src1_dd); } static void ggml_cuda_op_leaky_relu( + ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) { - + GGML_UNUSED(ctx); GGML_ASSERT(src0->type == GGML_TYPE_F32); GGML_ASSERT( dst->type == GGML_TYPE_F32); @@ -8172,29 +8952,31 @@ 
static void ggml_cuda_op_leaky_relu( leaky_relu_f32_cuda(src0_dd, dst_dd, ggml_nelements(src0), negative_slope, main_stream); - (void) src1; - (void) dst; - (void) src1_dd; + GGML_UNUSED(src1); + GGML_UNUSED(dst); + GGML_UNUSED(src1_dd); } static void ggml_cuda_op_sqr( + ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) { - + GGML_UNUSED(ctx); GGML_ASSERT(src0->type == GGML_TYPE_F32); GGML_ASSERT( dst->type == GGML_TYPE_F32); sqr_f32_cuda(src0_dd, dst_dd, ggml_nelements(src0), main_stream); - (void) src1; - (void) dst; - (void) src1_dd; + GGML_UNUSED(src1); + GGML_UNUSED(dst); + GGML_UNUSED(src1_dd); } static void ggml_cuda_op_norm( + ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) { - + GGML_UNUSED(ctx); GGML_ASSERT(src0->type == GGML_TYPE_F32); GGML_ASSERT( dst->type == GGML_TYPE_F32); @@ -8206,15 +8988,16 @@ static void ggml_cuda_op_norm( norm_f32_cuda(src0_dd, dst_dd, ne00, nrows, eps, main_stream); - (void) src1; - (void) dst; - (void) src1_dd; + GGML_UNUSED(src1); + GGML_UNUSED(dst); + GGML_UNUSED(src1_dd); } static void ggml_cuda_op_group_norm( + ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) { - + GGML_UNUSED(ctx); GGML_ASSERT(src0->type == GGML_TYPE_F32); GGML_ASSERT( dst->type == GGML_TYPE_F32); @@ -8222,15 +9005,16 @@ static void ggml_cuda_op_group_norm( int group_size = src0->ne[0] * src0->ne[1] * ((src0->ne[2] + num_groups - 1) / num_groups); group_norm_f32_cuda(src0_dd, dst_dd, num_groups * src0->ne[3], group_size, ggml_nelements(src0), main_stream); - (void) src1; - (void) dst; - (void) src1_dd; + GGML_UNUSED(src1); + GGML_UNUSED(dst); + GGML_UNUSED(src1_dd); } static void ggml_cuda_op_concat( + ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) { - + GGML_UNUSED(ctx); GGML_ASSERT(src0->type == GGML_TYPE_F32); GGML_ASSERT(src1->type == GGML_TYPE_F32); GGML_ASSERT(dst->type == GGML_TYPE_F32); @@ -8239,14 +9023,15 @@ static void ggml_cuda_op_concat( concat_f32_cuda(src0_dd + i3 * (src0->nb[3] / 4), src1_dd + i3 * (src1->nb[3] / 4), dst_dd + i3 * (dst->nb[3] / 4), dst->ne[0], dst->ne[1], dst->ne[2], src0->ne[2], main_stream); } - (void) src1; - (void) dst; + GGML_UNUSED(src1); + GGML_UNUSED(dst); } static void ggml_cuda_op_upscale( + ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) { - + GGML_UNUSED(ctx); GGML_ASSERT(src0->type == GGML_TYPE_F32); GGML_ASSERT(dst->type == GGML_TYPE_F32); GGML_ASSERT(src0->ne[3] == 1 && dst->ne[3] == 1); // just 3D tensors @@ -8255,15 +9040,16 @@ static void ggml_cuda_op_upscale( upscale_f32_cuda(src0_dd, dst_dd, src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3], scale_factor, main_stream); - (void) src1; - (void) dst; - (void) src1_dd; + GGML_UNUSED(src1); + GGML_UNUSED(dst); + GGML_UNUSED(src1_dd); } static void ggml_cuda_op_pad( + ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const 
float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) { - + GGML_UNUSED(ctx); GGML_ASSERT(src0->type == GGML_TYPE_F32); GGML_ASSERT(dst->type == GGML_TYPE_F32); GGML_ASSERT(src0->ne[3] == 1 && dst->ne[3] == 1); // just 3D tensors @@ -8272,15 +9058,16 @@ static void ggml_cuda_op_pad( src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3], dst->ne[0], dst->ne[1], dst->ne[2], dst->ne[3], main_stream); - (void) src1; - (void) dst; - (void) src1_dd; + GGML_UNUSED(src1); + GGML_UNUSED(dst); + GGML_UNUSED(src1_dd); } static void ggml_cuda_op_arange( + ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) { - + GGML_UNUSED(ctx); GGML_ASSERT(dst->type == GGML_TYPE_F32); float start; @@ -8295,16 +9082,17 @@ static void ggml_cuda_op_arange( arange_f32_cuda(dst_dd, dst->ne[0], start, step, main_stream); - (void) src0; - (void) src1; - (void) src0_dd; - (void) src1_dd; + GGML_UNUSED(src0); + GGML_UNUSED(src1); + GGML_UNUSED(src0_dd); + GGML_UNUSED(src1_dd); } static void ggml_cuda_op_timestep_embedding( + ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) { - + GGML_UNUSED(ctx); GGML_ASSERT(src0->type == GGML_TYPE_F32); GGML_ASSERT(dst->type == GGML_TYPE_F32); @@ -8313,15 +9101,16 @@ static void ggml_cuda_op_timestep_embedding( timestep_embedding_f32_cuda(src0_dd, dst_dd, src0->ne[0], dst->nb[1], dim, max_period, main_stream); - (void) src1; - (void) dst; - (void) src1_dd; + GGML_UNUSED(src1); + GGML_UNUSED(dst); + GGML_UNUSED(src1_dd); } static void ggml_cuda_op_rms_norm( + ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) { - + GGML_UNUSED(ctx); GGML_ASSERT(src0->type == GGML_TYPE_F32); GGML_ASSERT( dst->type == GGML_TYPE_F32); @@ -8333,12 +9122,13 @@ static void ggml_cuda_op_rms_norm( rms_norm_f32_cuda(src0_dd, dst_dd, ne00, nrows, eps, main_stream); - (void) src1; - (void) dst; - (void) src1_dd; + GGML_UNUSED(src1); + GGML_UNUSED(dst); + GGML_UNUSED(src1_dd); } static void ggml_cuda_op_mul_mat_q( + ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const char * src0_dd_i, const float * src1_ddf_i, const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low, const int64_t row_high, const int64_t src1_ncols, const int64_t src1_padded_row_size, cudaStream_t stream) { @@ -8352,12 +9142,11 @@ static void ggml_cuda_op_mul_mat_q( const int64_t row_diff = row_high - row_low; - int id; - CUDA_CHECK(cudaGetDevice(&id)); + int id = ggml_cuda_get_device(); // the main device has a larger memory buffer to hold the results from all GPUs // nrows_dst == nrows of the matrix that the kernel writes into - const int64_t nrows_dst = dst->backend == GGML_BACKEND_TYPE_GPU && id == g_main_device ? ne0 : row_diff; + const int64_t nrows_dst = id == ctx.device ? 
ne0 : row_diff; switch (src0->type) { case GGML_TYPE_Q4_0: @@ -8395,104 +9184,13 @@ static void ggml_cuda_op_mul_mat_q( break; } - (void) src1; - (void) dst; - (void) src1_ddf_i; -} - -static int64_t get_row_rounding(ggml_type type, const std::array & tensor_split) { - int64_t min_compute_capability = INT_MAX; - int64_t max_compute_capability = INT_MIN; - for (int id = 0; id < g_device_count; ++id) { - if (tensor_split[id] < (id + 1 < g_device_count ? tensor_split[id + 1] : 1.0f)) { - if (min_compute_capability > g_device_caps[id].cc) { - min_compute_capability = g_device_caps[id].cc; - } - if (max_compute_capability < g_device_caps[id].cc) { - max_compute_capability = g_device_caps[id].cc; - } - } - } - -#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) - switch(type) { - case GGML_TYPE_Q4_0: - case GGML_TYPE_Q4_1: - case GGML_TYPE_Q5_0: - case GGML_TYPE_Q5_1: - case GGML_TYPE_Q8_0: - return max_compute_capability >= CC_RDNA2 ? 128 : 64; - case GGML_TYPE_F16: - case GGML_TYPE_F32: - return 1; - case GGML_TYPE_Q2_K: - return max_compute_capability >= CC_RDNA2 ? 128 : 32; - case GGML_TYPE_Q3_K: - return min_compute_capability < CC_RDNA2 ? 128 : 64; - case GGML_TYPE_Q4_K: - case GGML_TYPE_Q5_K: - case GGML_TYPE_Q6_K: - case GGML_TYPE_IQ2_XXS: - case GGML_TYPE_IQ2_XS: - case GGML_TYPE_IQ2_S: - case GGML_TYPE_IQ3_XXS: - case GGML_TYPE_IQ1_S: - case GGML_TYPE_IQ4_NL: - case GGML_TYPE_IQ4_XS: - case GGML_TYPE_IQ3_S: - return max_compute_capability >= CC_RDNA2 ? 128 : 64; - default: - GGML_ASSERT(false); - } -#else - switch(type) { - case GGML_TYPE_Q4_0: - case GGML_TYPE_Q4_1: - return max_compute_capability >= CC_VOLTA ? 128 : 64; - case GGML_TYPE_Q5_0: - case GGML_TYPE_Q5_1: - case GGML_TYPE_Q8_0: - return 64; - case GGML_TYPE_F16: - case GGML_TYPE_F32: - return 1; - case GGML_TYPE_Q2_K: - case GGML_TYPE_Q3_K: - case GGML_TYPE_Q4_K: - case GGML_TYPE_Q5_K: - case GGML_TYPE_IQ2_XXS: - case GGML_TYPE_IQ2_XS: - case GGML_TYPE_IQ2_S: - case GGML_TYPE_IQ3_XXS: - case GGML_TYPE_IQ1_S: - case GGML_TYPE_IQ4_NL: - case GGML_TYPE_IQ4_XS: - case GGML_TYPE_IQ3_S: - return max_compute_capability >= CC_VOLTA ? 128 : 64; - case GGML_TYPE_Q6_K: - return 64; - default: - GGML_ASSERT(false); - } -#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) -} - -static void get_row_split(int64_t * row_low, int64_t * row_high, const ggml_tensor * tensor, const std::array & tensor_split, int id) { - const int64_t nrows = ggml_nrows(tensor); - const int64_t rounding = get_row_rounding(tensor->type, tensor_split); - - *row_low = id == 0 ? 0 : nrows*tensor_split[id]; - *row_low -= *row_low % rounding; - - if (id == g_device_count - 1) { - *row_high = nrows; - } else { - *row_high = nrows*tensor_split[id + 1]; - *row_high -= *row_high % rounding; - } + GGML_UNUSED(src1); + GGML_UNUSED(dst); + GGML_UNUSED(src1_ddf_i); } static void ggml_cuda_op_mul_mat_vec_q( + ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const char * src0_dd_i, const float * src1_ddf_i, const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low, const int64_t row_high, const int64_t src1_ncols, const int64_t src1_padded_row_size, cudaStream_t stream) { @@ -8510,7 +9208,7 @@ static void ggml_cuda_op_mul_mat_vec_q( // the main device has a larger memory buffer to hold the results from all GPUs // nrows_dst == nrows of the matrix that the kernel writes into - const int64_t nrows_dst = dst->backend == GGML_BACKEND_TYPE_GPU && id == g_main_device ? 
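get_row_split, deleted above, partitioned a tensor's rows across devices from cumulative tensor_split fractions, snapping each boundary down to the type-dependent rounding from get_row_rounding so quantized blocks never straddle devices. Its logic in isolation:

#include <cstdint>

static void row_split_sketch(int64_t * row_low, int64_t * row_high,
                             const int64_t nrows, const float * tensor_split,
                             const int id, const int device_count,
                             const int64_t rounding) {
    // lower bound from this device's cumulative fraction, snapped down
    *row_low  = id == 0 ? 0 : (int64_t)(nrows*tensor_split[id]);
    *row_low -= *row_low % rounding;

    if (id == device_count - 1) {
        *row_high = nrows; // last device takes whatever remains
    } else {
        *row_high  = (int64_t)(nrows*tensor_split[id + 1]);
        *row_high -= *row_high % rounding;
    }
}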
ne0 : row_diff; + const int64_t nrows_dst = id == ctx.device ? ne0 : row_diff; switch (src0->type) { case GGML_TYPE_Q4_0: @@ -8590,18 +9288,19 @@ static void ggml_cuda_op_mul_mat_vec_q( break; } - (void) src1; - (void) dst; - (void) src1_ddf_i; - (void) src1_ncols; - (void) src1_padded_row_size; + GGML_UNUSED(src1); + GGML_UNUSED(dst); + GGML_UNUSED(src1_ddf_i); + GGML_UNUSED(src1_ncols); + GGML_UNUSED(src1_padded_row_size); } static void ggml_cuda_op_dequantize_mul_mat_vec( + ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const char * src0_dd_i, const float * src1_ddf_i, const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low, const int64_t row_high, const int64_t src1_ncols, const int64_t src1_padded_row_size, cudaStream_t stream) { - + GGML_UNUSED(ctx); const int64_t ne00 = src0->ne[0]; const int64_t row_diff = row_high - row_low; @@ -8666,14 +9365,15 @@ static void ggml_cuda_op_dequantize_mul_mat_vec( break; } - (void) src1; - (void) dst; - (void) src1_ddq_i; - (void) src1_ncols; - (void) src1_padded_row_size; + GGML_UNUSED(src1); + GGML_UNUSED(dst); + GGML_UNUSED(src1_ddq_i); + GGML_UNUSED(src1_ncols); + GGML_UNUSED(src1_padded_row_size); } static void ggml_cuda_op_mul_mat_cublas( + ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const char * src0_dd_i, const float * src1_ddf_i, const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low, const int64_t row_high, const int64_t src1_ncols, const int64_t src1_padded_row_size, cudaStream_t stream) { @@ -8689,19 +9389,17 @@ static void ggml_cuda_op_mul_mat_cublas( const int64_t row_diff = row_high - row_low; - int id; - CUDA_CHECK(cudaGetDevice(&id)); + int id = ggml_cuda_get_device(); // the main device has a larger memory buffer to hold the results from all GPUs // ldc == nrows of the matrix that cuBLAS writes into - int ldc = dst->backend == GGML_BACKEND_TYPE_GPU && id == g_main_device ? ne0 : row_diff; + int ldc = id == ctx.device ? ne0 : row_diff; - const int compute_capability = g_device_caps[id].cc; + const int compute_capability = get_cuda_global_info().devices[id].cc; if (compute_capability >= CC_VOLTA && (src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type)) && ggml_is_contiguous(src0) && row_diff == src0->ne[1] && dst->op_params[0] == GGML_PREC_DEFAULT) { - //printf("this branch\n"); // convert src0 and src1 to fp16, multiply as fp16, convert dst to fp32 - cuda_pool_alloc src0_as_f16; + ggml_cuda_pool_alloc src0_as_f16(ctx.pool()); if (src0->type != GGML_TYPE_F16) { const to_fp16_cuda_t to_fp16_cuda = ggml_get_to_fp16_cuda(src0->type); GGML_ASSERT(to_fp16_cuda != nullptr); @@ -8711,7 +9409,7 @@ static void ggml_cuda_op_mul_mat_cublas( } const half * src0_ptr = src0->type == GGML_TYPE_F16 ? (const half *) src0_dd_i : src0_as_f16.get(); - cuda_pool_alloc src1_as_f16; + ggml_cuda_pool_alloc src1_as_f16(ctx.pool()); if (src1->type != GGML_TYPE_F16) { const to_fp16_cuda_t to_fp16_cuda = ggml_get_to_fp16_cuda(src1->type); GGML_ASSERT(to_fp16_cuda != nullptr); @@ -8720,14 +9418,14 @@ static void ggml_cuda_op_mul_mat_cublas( to_fp16_cuda(src1_ddf_i, src1_as_f16.get(), ne, stream); } const half * src1_ptr = src1->type == GGML_TYPE_F16 ? 
(const half *) src1_ddf_i : src1_as_f16.get(); - cuda_pool_alloc dst_f16(row_diff*src1_ncols); + ggml_cuda_pool_alloc dst_f16(ctx.pool(), row_diff*src1_ncols); const half alpha_f16 = 1.0f; const half beta_f16 = 0.0f; - CUBLAS_CHECK(cublasSetStream(g_cublas_handles[id], stream)); + CUBLAS_CHECK(cublasSetStream(ctx.cublas_handle(id), stream)); CUBLAS_CHECK( - cublasGemmEx(g_cublas_handles[id], CUBLAS_OP_T, CUBLAS_OP_N, + cublasGemmEx(ctx.cublas_handle(id), CUBLAS_OP_T, CUBLAS_OP_N, row_diff, src1_ncols, ne10, &alpha_f16, src0_ptr, CUDA_R_16F, ne00, src1_ptr, CUDA_R_16F, ne10, @@ -8738,8 +9436,8 @@ static void ggml_cuda_op_mul_mat_cublas( const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(GGML_TYPE_F16); to_fp32_cuda(dst_f16.get(), dst_dd_i, row_diff*src1_ncols, stream); } else { - cuda_pool_alloc src0_ddq_as_f32; - cuda_pool_alloc src1_ddq_as_f32; + ggml_cuda_pool_alloc src0_ddq_as_f32(ctx.pool(id)); + ggml_cuda_pool_alloc src1_ddq_as_f32(ctx.pool(id)); if (src0->type != GGML_TYPE_F32) { const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(src0->type); @@ -8760,24 +9458,25 @@ static void ggml_cuda_op_mul_mat_cublas( const float alpha = 1.0f; const float beta = 0.0f; - CUBLAS_CHECK(cublasSetStream(g_cublas_handles[id], stream)); + CUBLAS_CHECK(cublasSetStream(ctx.cublas_handle(id), stream)); CUBLAS_CHECK( - cublasSgemm(g_cublas_handles[id], CUBLAS_OP_T, CUBLAS_OP_N, + cublasSgemm(ctx.cublas_handle(id), CUBLAS_OP_T, CUBLAS_OP_N, row_diff, src1_ncols, ne10, &alpha, src0_ddf_i, ne00, src1_ddf1_i, ne10, &beta, dst_dd_i, ldc)); } - (void) dst; - (void) src1_ddq_i; - (void) src1_padded_row_size; + GGML_UNUSED(dst); + GGML_UNUSED(src1_ddq_i); + GGML_UNUSED(src1_padded_row_size); } static void ggml_cuda_op_rope( + ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) { - + GGML_UNUSED(ctx); GGML_ASSERT(src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16); GGML_ASSERT( dst->type == GGML_TYPE_F32 || dst->type == GGML_TYPE_F16); GGML_ASSERT(src0->type == dst->type); @@ -8849,15 +9548,16 @@ static void ggml_cuda_op_rope( } } - (void) src1; - (void) dst; - (void) src1_dd; + GGML_UNUSED(src1); + GGML_UNUSED(dst); + GGML_UNUSED(src1_dd); } static void ggml_cuda_op_alibi( + ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) { - + GGML_UNUSED(ctx); GGML_ASSERT(src0->type == GGML_TYPE_F32); GGML_ASSERT( dst->type == GGML_TYPE_F32); @@ -8881,14 +9581,15 @@ static void ggml_cuda_op_alibi( alibi_f32_cuda(src0_dd, dst_dd, ne00, nrows, ne01, n_heads_log2_floor, m0, m1, main_stream); - (void) src1; - (void) src1_dd; + GGML_UNUSED(src1); + GGML_UNUSED(src1_dd); } static void ggml_cuda_op_pool2d( + ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) { - + GGML_UNUSED(ctx); GGML_ASSERT(src0->type == GGML_TYPE_F32); GGML_ASSERT( dst->type == GGML_TYPE_F32); @@ -8914,14 +9615,15 @@ static void ggml_cuda_op_pool2d( dim3 block_nums(num_blocks); pool2d_nchw_kernel<<>>(IH, IW, OH, OW, k1, k0, s1, s0, p1, p0, parallel_elements, src0_dd, dst_dd, op); - (void) src1; - (void) src1_dd; + GGML_UNUSED(src1); + GGML_UNUSED(src1_dd); } static void ggml_cuda_op_im2col( + ggml_backend_cuda_context & ctx, 
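The fp16 branch of ggml_cuda_op_mul_mat_cublas above converts both operands to half, runs cublasGemmEx with 16-bit storage, and converts the result back to f32 afterwards. A condensed sketch with pools and conversions elided; CUBLAS_COMPUTE_16F is assumed as the compute type, and the dimensions follow the call in the patch (A is transposed, B is not, lda == ldb == k):

#include <cublas_v2.h>
#include <cuda_fp16.h>

static void gemm_f16_sketch(
        cublasHandle_t handle, cudaStream_t stream,
        const half * A, const half * B, half * C,
        const int m, const int n, const int k) {
    const half alpha = 1.0f;
    const half beta  = 0.0f;

    // bind the handle to the op's stream before launching, as in the patch
    cublasSetStream(handle, stream);
    cublasGemmEx(handle, CUBLAS_OP_T, CUBLAS_OP_N,
                 m, n, k,
                 &alpha, A, CUDA_R_16F, k,
                         B, CUDA_R_16F, k,
                 &beta,  C, CUDA_R_16F, m,
                 CUBLAS_COMPUTE_16F, CUBLAS_GEMM_DEFAULT_TENSOR_OP);
}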
const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) { - + GGML_UNUSED(ctx); GGML_ASSERT(src0->type == GGML_TYPE_F16); GGML_ASSERT(src1->type == GGML_TYPE_F32); GGML_ASSERT( dst->type == GGML_TYPE_F16 || dst->type == GGML_TYPE_F32); @@ -8955,14 +9657,15 @@ static void ggml_cuda_op_im2col( im2col_cuda(src1_dd, (float*) dst_dd, IW, IH, OW, OH, KW, KH, IC, batch, batch_offset, delta_offset, s0, s1, p0, p1, d0, d1, main_stream); } - (void) src0; - (void) src0_dd; + GGML_UNUSED(src0); + GGML_UNUSED(src0_dd); } static void ggml_cuda_op_sum_rows( + ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) { - + GGML_UNUSED(ctx); GGML_ASSERT(src0->type == GGML_TYPE_F32); GGML_ASSERT( dst->type == GGML_TYPE_F32); @@ -8971,15 +9674,16 @@ static void ggml_cuda_op_sum_rows( sum_rows_f32_cuda(src0_dd, dst_dd, ncols, nrows, main_stream); - (void) src1; - (void) dst; - (void) src1_dd; + GGML_UNUSED(src1); + GGML_UNUSED(dst); + GGML_UNUSED(src1_dd); } static void ggml_cuda_op_argsort( + ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) { - + GGML_UNUSED(ctx); GGML_ASSERT(src0->type == GGML_TYPE_F32); GGML_ASSERT( dst->type == GGML_TYPE_I32); @@ -8990,15 +9694,16 @@ static void ggml_cuda_op_argsort( argsort_f32_i32_cuda(src0_dd, (int *)dst_dd, ncols, nrows, order, main_stream); - (void) src1; - (void) dst; - (void) src1_dd; + GGML_UNUSED(src1); + GGML_UNUSED(dst); + GGML_UNUSED(src1_dd); } static void ggml_cuda_op_diag_mask_inf( + ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) { - + GGML_UNUSED(ctx); GGML_ASSERT(src0->type == GGML_TYPE_F32); GGML_ASSERT( dst->type == GGML_TYPE_F32); @@ -9010,15 +9715,16 @@ static void ggml_cuda_op_diag_mask_inf( diag_mask_inf_f32_cuda(src0_dd, dst_dd, ne00, nrows0, ne01, n_past, main_stream); - (void) src1; - (void) dst; - (void) src1_dd; + GGML_UNUSED(src1); + GGML_UNUSED(dst); + GGML_UNUSED(src1_dd); } static void ggml_cuda_op_soft_max( + ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) { - + GGML_UNUSED(ctx); GGML_ASSERT(src0->type == GGML_TYPE_F32); GGML_ASSERT( dst->type == GGML_TYPE_F32); @@ -9041,17 +9747,17 @@ static void ggml_cuda_op_soft_max( const bool use_src2 = src2 != nullptr; if (use_src2) { - ggml_tensor_extra_gpu * src2_extra = (ggml_tensor_extra_gpu *) src2->extra; - src2_dd = (float *) src2_extra->data_device[g_main_device]; + src2_dd = (float *)src2->data; } soft_max_f32_cuda(src0_dd, src1 ? 
src1_dd : nullptr, src2_dd, dst_dd, ne00, nrows_x, nrows_y, scale, max_bias, main_stream); } static void ggml_cuda_op_scale( + ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) { - + GGML_UNUSED(ctx); GGML_ASSERT(src0->type == GGML_TYPE_F32); GGML_ASSERT( dst->type == GGML_TYPE_F32); @@ -9061,15 +9767,16 @@ static void ggml_cuda_op_scale( scale_f32_cuda(src0_dd, dst_dd, scale, ggml_nelements(src0), main_stream); CUDA_CHECK(cudaGetLastError()); - (void) src1; - (void) dst; - (void) src1_dd; + GGML_UNUSED(src1); + GGML_UNUSED(dst); + GGML_UNUSED(src1_dd); } static void ggml_cuda_op_clamp( + ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const float * src0_dd, const float * src1_dd, float * dst_dd, cudaStream_t main_stream) { - + GGML_UNUSED(ctx); GGML_ASSERT(src0->type == GGML_TYPE_F32); GGML_ASSERT( dst->type == GGML_TYPE_F32); @@ -9081,45 +9788,30 @@ static void ggml_cuda_op_clamp( clamp_f32_cuda(src0_dd, dst_dd, min, max, ggml_nelements(src0), main_stream); CUDA_CHECK(cudaGetLastError()); - (void) src1; - (void) dst; - (void) src1_dd; + GGML_UNUSED(src1); + GGML_UNUSED(dst); + GGML_UNUSED(src1_dd); } -static void ggml_cuda_op_flatten(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const ggml_cuda_op_flatten_t op) { - const int64_t nrows0 = ggml_nrows(src0); - - const bool use_src1 = src1 != nullptr; - const int64_t nrows1 = use_src1 ? ggml_nrows(src1) : 1; - - GGML_ASSERT(!use_src1 || src1->backend != GGML_BACKEND_TYPE_GPU_SPLIT); - GGML_ASSERT( dst->backend != GGML_BACKEND_TYPE_GPU_SPLIT); - - ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra; - ggml_tensor_extra_gpu * src1_extra = use_src1 ? (ggml_tensor_extra_gpu *) src1->extra : nullptr; - ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra; +// TODO: remove this function +static void ggml_cuda_op_flatten(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const ggml_cuda_op_flatten_t op) { + GGML_ASSERT(!src0 || ggml_backend_buffer_is_cuda(src0->buffer)); + GGML_ASSERT(!src1 || ggml_backend_buffer_is_cuda(src1->buffer)); + GGML_ASSERT( ggml_backend_buffer_is_cuda(dst->buffer)); // dd = data device - float * src0_ddf = nullptr; - float * src1_ddf = nullptr; - float * dst_ddf = nullptr; + float * src0_ddf = src0 ? (float *) src0->data : nullptr; + float * src1_ddf = src1 ? 
(float *) src1->data : nullptr; + float * dst_ddf = (float *) dst->data; - ggml_cuda_set_device(g_main_device); - cudaStream_t main_stream = g_cudaStreams[g_main_device][0]; - - src0_ddf = (float *) src0_extra->data_device[g_main_device]; - - if (use_src1) { - src1_ddf = (float *) src1_extra->data_device[g_main_device]; - } - dst_ddf = (float *) dst_extra->data_device[g_main_device]; + ggml_cuda_set_device(ctx.device); // do the computation - op(src0, src1, dst, src0_ddf, src1_ddf, dst_ddf, main_stream); + op(ctx, src0, src1, dst, src0_ddf, src1_ddf, dst_ddf, ctx.stream()); CUDA_CHECK(cudaGetLastError()); } -static void ggml_cuda_set_peer_access(const int n_tokens) { +static void ggml_cuda_set_peer_access(const int n_tokens, int main_device) { static bool peer_access_enabled = false; const bool enable_peer_access = n_tokens <= GGML_CUDA_PEER_MAX_BATCH_SIZE; @@ -9129,19 +9821,19 @@ static void ggml_cuda_set_peer_access(const int n_tokens) { } #ifdef NDEBUG - for (int id = 0; id < g_device_count; ++id) { + for (int id = 0; id < ggml_backend_cuda_get_device_count(); ++id) { ggml_cuda_set_device(id); CUDA_CHECK(cudaDeviceSynchronize()); } - for (int id = 0; id < g_device_count; ++id) { + for (int id = 0; id < ggml_backend_cuda_get_device_count(); ++id) { ggml_cuda_set_device(id); - for (int id_other = 0; id_other < g_device_count; ++id_other) { + for (int id_other = 0; id_other < ggml_backend_cuda_get_device_count(); ++id_other) { if (id == id_other) { continue; } - if (id != g_main_device && id_other != g_main_device) { + if (id != main_device && id_other != main_device) { continue; } @@ -9165,14 +9857,12 @@ static void ggml_cuda_set_peer_access(const int n_tokens) { #endif // NDEBUG peer_access_enabled = enable_peer_access; + + GGML_UNUSED(main_device); } -// FIXME: move this somewhere else -struct ggml_backend_cuda_split_buffer_type_context { - std::array<float, GGML_CUDA_MAX_DEVICES> tensor_split; -}; - static void ggml_cuda_op_mul_mat( + ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, ggml_cuda_op_mul_mat_t op, const bool convert_src1_to_q8_1) { @@ -9195,8 +9885,11 @@ static void ggml_cuda_op_mul_mat( const int nb2 = dst->nb[2]; const int nb3 = dst->nb[3]; - GGML_ASSERT(dst->backend != GGML_BACKEND_TYPE_GPU_SPLIT); - GGML_ASSERT(src1->backend != GGML_BACKEND_TYPE_GPU_SPLIT); + GGML_ASSERT(ggml_backend_buffer_is_cuda(dst->buffer)); + GGML_ASSERT(ggml_backend_buffer_is_cuda(src1->buffer)); + ggml_backend_cuda_buffer_context * src1_ctx = (ggml_backend_cuda_buffer_context *) src1->buffer->context; + ggml_backend_cuda_buffer_context * dst_ctx = (ggml_backend_cuda_buffer_context *) dst->buffer->context; + GGML_ASSERT(src1->type == GGML_TYPE_F32 || (src1->ne[2] == 1 && src1->ne[3] == 1)); GGML_ASSERT(ne12 >= ne02 && ne12 % ne02 == 0); @@ -9208,33 +9901,30 @@ static void ggml_cuda_op_mul_mat( const size_t q8_1_ts = sizeof(block_q8_1); const size_t q8_1_bs = QK8_1; - ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra; - ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra; - ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra; - const bool src0_is_contiguous = ggml_is_contiguous(src0); const bool src1_is_contiguous = ggml_is_contiguous(src1); const int64_t src1_padded_col_size = GGML_PAD(ne10, MATRIX_ROW_PADDING); - const bool split = src0->backend == GGML_BACKEND_TYPE_GPU_SPLIT; + const bool split = ggml_backend_buffer_is_cuda_split(src0->buffer); GGML_ASSERT(!(split && ne02 > 1)); GGML_ASSERT(!(split &&
ne03 > 1)); GGML_ASSERT(!(split && ne02 < ne12)); + ggml_tensor_extra_gpu * src0_extra = split ? (ggml_tensor_extra_gpu *) src0->extra : nullptr; + + std::array<float, GGML_CUDA_MAX_DEVICES> tensor_split; if (split) { - // TODO: check that src0->buffer->buft is a split buffer type, replace GGML_BACKEND_TYPE_GPU_SPLIT check - // GGML_ASSERT(src0->buffer != nullptr && src0->buffer->buft == ...); ggml_backend_cuda_split_buffer_type_context * buft_ctx = (ggml_backend_cuda_split_buffer_type_context *) src0->buffer->buft->context; tensor_split = buft_ctx->tensor_split; } struct dev_data { - cuda_pool_alloc<char> src0_dd_alloc; - cuda_pool_alloc<float> src1_ddf_alloc; - cuda_pool_alloc<char> src1_ddq_alloc; - cuda_pool_alloc<float> dst_dd_alloc; + ggml_cuda_pool_alloc<char> src0_dd_alloc; + ggml_cuda_pool_alloc<float> src1_ddf_alloc; + ggml_cuda_pool_alloc<char> src1_ddq_alloc; + ggml_cuda_pool_alloc<float> dst_dd_alloc; char * src0_dd = nullptr; float * src1_ddf = nullptr; // float @@ -9249,7 +9939,7 @@ static void ggml_cuda_op_mul_mat( int used_devices = 0; - for (int id = 0; id < g_device_count; ++id) { + for (int id = 0; id < ggml_backend_cuda_get_device_count(); ++id) { // by default, use all rows dev[id].row_low = 0; dev[id].row_high = ne01; @@ -9266,7 +9956,7 @@ static void ggml_cuda_op_mul_mat( } } - if (id != g_device_count - 1) { + if (id != ggml_backend_cuda_get_device_count() - 1) { dev[id].row_high = ne01*tensor_split[id + 1]; if (dev[id].row_high < ne01) { dev[id].row_high -= dev[id].row_high % rounding; @@ -9275,33 +9965,33 @@ static void ggml_cuda_op_mul_mat( } } - for (int id = 0; id < g_device_count; ++id) { - if ((!split && id != g_main_device) || dev[id].row_low == dev[id].row_high) { + for (int id = 0; id < ggml_backend_cuda_get_device_count(); ++id) { + if ((!split && id != ctx.device) || dev[id].row_low == dev[id].row_high) { continue; } used_devices++; - const bool src1_on_device = id == g_main_device; // TODO: check from buffer - const bool dst_on_device = id == g_main_device; + const bool src1_on_device = id == src1_ctx->device; + const bool dst_on_device = id == dst_ctx->device; ggml_cuda_set_device(id); - cudaStream_t stream = g_cudaStreams[id][0]; + cudaStream_t stream = ctx.stream(id, 0); if (src0_is_contiguous) { - dev[id].src0_dd = (char *) src0_extra->data_device[id]; + dev[id].src0_dd = split ? (char *) src0_extra->data_device[id] : (char *) src0->data; } else { - dev[id].src0_dd = dev[id].src0_dd_alloc.alloc(ggml_nbytes(src0)); + dev[id].src0_dd = dev[id].src0_dd_alloc.alloc(ctx.pool(id), ggml_nbytes(src0)); } if (src1_on_device && src1_is_contiguous) { - dev[id].src1_ddf = (float *) src1_extra->data_device[id]; + dev[id].src1_ddf = (float *) src1->data; } else { - dev[id].src1_ddf = dev[id].src1_ddf_alloc.alloc(ggml_nelements(src1)); + dev[id].src1_ddf = dev[id].src1_ddf_alloc.alloc(ctx.pool(id), ggml_nelements(src1)); } if (convert_src1_to_q8_1) { - dev[id].src1_ddq = dev[id].src1_ddq_alloc.alloc(nrows1*src1_padded_col_size*q8_1_ts/q8_1_bs); + dev[id].src1_ddq = dev[id].src1_ddq_alloc.alloc(ctx.pool(id), nrows1*src1_padded_col_size*q8_1_ts/q8_1_bs); if (src1_on_device && src1_is_contiguous) { quantize_row_q8_1_cuda(dev[id].src1_ddf, dev[id].src1_ddq, ne10, nrows1, src1_padded_col_size, stream); @@ -9310,40 +10000,40 @@ static void ggml_cuda_op_mul_mat( } if (dst_on_device) { - dev[id].dst_dd = (float *) dst_extra->data_device[id]; + dev[id].dst_dd = (float *) dst->data; } else { const size_t size_dst_ddf = split ?
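// [editor's note] The loop above derives each device's [row_low, row_high) slice of
// src0 from the cumulative fractions in tensor_split, rounding boundaries down to
// the quantization block granularity. A self-contained worked example under assumed
// values (the real code additionally special-cases device 0 and the last device):

#include <cstdint>
#include <cstdio>

int main() {
    const int64_t ne01     = 4096;                // rows of src0 (assumed)
    const float   split[3] = {0.0f, 0.5f, 0.75f}; // cumulative start fractions (assumed)
    const int64_t rounding = 32;                  // block granularity (assumed)

    for (int id = 0; id < 3; ++id) {
        int64_t row_low  = id == 0 ? 0    : (int64_t) (ne01*split[id]);
        row_low         -= row_low  % rounding;
        int64_t row_high = id == 2 ? ne01 : (int64_t) (ne01*split[id + 1]);
        row_high        -= row_high % rounding;
        printf("device %d: rows [%lld, %lld)\n", id, (long long) row_low, (long long) row_high);
    }
    return 0; // prints [0, 2048), [2048, 3072), [3072, 4096)
}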
(dev[id].row_high - dev[id].row_low)*ne1 : ggml_nelements(dst); - dev[id].dst_dd = dev[id].dst_dd_alloc.alloc(size_dst_ddf); + dev[id].dst_dd = dev[id].dst_dd_alloc.alloc(ctx.pool(id), size_dst_ddf); } } // if multiple devices are used they need to wait for the main device // here an event is recorded that signals that the main device has finished calculating the input data if (split && used_devices > 1) { - ggml_cuda_set_device(g_main_device); - CUDA_CHECK(cudaEventRecord(src0_extra->events[g_main_device][0], g_cudaStreams[g_main_device][0])); + ggml_cuda_set_device(ctx.device); + CUDA_CHECK(cudaEventRecord(src0_extra->events[ctx.device][0], ctx.stream())); } const int64_t src1_col_stride = split && used_devices > 1 ? MUL_MAT_SRC1_COL_STRIDE : ne11; for (int64_t src1_col_0 = 0; src1_col_0 < ne11; src1_col_0 += src1_col_stride) { - const int64_t is = split ? (src1_col_0/src1_col_stride) % MAX_STREAMS : 0; + const int64_t is = split ? (src1_col_0/src1_col_stride) % GGML_CUDA_MAX_STREAMS : 0; const int64_t src1_ncols = src1_col_0 + src1_col_stride > ne11 ? ne11 - src1_col_0 : src1_col_stride; - for (int id = 0; id < g_device_count; ++id) { - if ((!split && id != g_main_device) || dev[id].row_low == dev[id].row_high) { + for (int id = 0; id < ggml_backend_cuda_get_device_count(); ++id) { + if ((!split && id != ctx.device) || dev[id].row_low == dev[id].row_high) { continue; } - const bool src1_on_device = id == g_main_device; // TODO: check from buffer - const bool dst_on_device = id == g_main_device; + const bool src1_on_device = id == src1_ctx->device; + const bool dst_on_device = id == dst_ctx->device; const int64_t row_diff = dev[id].row_high - dev[id].row_low; ggml_cuda_set_device(id); - cudaStream_t stream = g_cudaStreams[id][is]; + cudaStream_t stream = ctx.stream(id, is); // wait for main GPU data if necessary - if (split && (id != g_main_device || is != 0)) { - CUDA_CHECK(cudaStreamWaitEvent(stream, src0_extra->events[g_main_device][0], 0)); + if (split && (id != ctx.device || is != 0)) { + CUDA_CHECK(cudaStreamWaitEvent(stream, src0_extra->events[ctx.device][0], 0)); } for (int64_t i0 = 0; i0 < ne13*ne12; ++i0) { @@ -9360,21 +10050,21 @@ static void ggml_cuda_op_mul_mat( // the main device memory buffer can be on VRAM scratch, with space for all partial results // in that case an offset on dst_ddf_i is needed - if (id == g_main_device) { + if (id == ctx.device) { dst_dd_i += dev[id].row_low; // offset is 0 if no tensor split } // copy src0, src1 to device if necessary if (src1_is_contiguous) { - if (id != g_main_device) { + if (id != ctx.device) { if (convert_src1_to_q8_1) { - char * src1_ddq_i_source = dev[g_main_device].src1_ddq + src1_ddq_i_offset; - CUDA_CHECK(cudaMemcpyPeerAsync(src1_ddq_i, id, src1_ddq_i_source, g_main_device, + char * src1_ddq_i_source = dev[ctx.device].src1_ddq + src1_ddq_i_offset; + CUDA_CHECK(cudaMemcpyPeerAsync(src1_ddq_i, id, src1_ddq_i_source, ctx.device, src1_ncols*src1_padded_col_size*q8_1_ts/q8_1_bs, stream)); } else { - float * src1_ddf_i_source = (float *) src1_extra->data_device[g_main_device]; + float * src1_ddf_i_source = (float *) src1->data; src1_ddf_i_source += (i0*ne11 + src1_col_0) * ne10; - CUDA_CHECK(cudaMemcpyPeerAsync(src1_ddf_i, id, src1_ddf_i_source, g_main_device, + CUDA_CHECK(cudaMemcpyPeerAsync(src1_ddf_i, id, src1_ddf_i_source, ctx.device, src1_ncols*ne10*sizeof(float), stream)); } } @@ -9395,13 +10085,13 @@ static void ggml_cuda_op_mul_mat( } // do the computation - op(src0, src1, dst, src0_dd_i, src1_ddf_i, src1_ddq_i, dst_dd_i, 
+ op(ctx, src0, src1, dst, src0_dd_i, src1_ddf_i, src1_ddq_i, dst_dd_i, dev[id].row_low, dev[id].row_high, src1_ncols, src1_padded_col_size, stream); CUDA_CHECK(cudaGetLastError()); // copy dst to host or other device if necessary if (!dst_on_device) { - void * dst_off_device = dst_extra->data_device[g_main_device]; + void * dst_off_device = dst->data; if (split) { // src0 = weight matrix is saved as a transposed matrix for better memory layout. // dst is NOT transposed. @@ -9414,7 +10104,7 @@ static void ggml_cuda_op_mul_mat( #if !defined(GGML_USE_HIPBLAS) // cudaMemcpy2DAsync may fail with copies between vmm pools of different devices cudaMemcpy3DPeerParms p = {}; - p.dstDevice = g_main_device; + p.dstDevice = ctx.device; p.dstPtr = make_cudaPitchedPtr(dhf_dst_i, ne0*sizeof(float), row_diff, src1_ncols); p.srcDevice = id; p.srcPtr = make_cudaPitchedPtr(dst_dd_i, row_diff*sizeof(float), row_diff, src1_ncols); @@ -9436,7 +10126,7 @@ static void ggml_cuda_op_mul_mat( } // add event for the main device to wait on until other device is done - if (split && (id != g_main_device || is != 0)) { + if (split && (id != ctx.device || is != 0)) { CUDA_CHECK(cudaEventRecord(src0_extra->events[id][is], stream)); } } @@ -9444,130 +10134,116 @@ static void ggml_cuda_op_mul_mat( } // main device waits for all other devices to be finished - if (split && g_device_count > 1) { + if (split && ggml_backend_cuda_get_device_count() > 1) { int64_t is_max = (ne11 + MUL_MAT_SRC1_COL_STRIDE - 1) / MUL_MAT_SRC1_COL_STRIDE; - is_max = is_max <= MAX_STREAMS ? is_max : MAX_STREAMS; + is_max = is_max <= GGML_CUDA_MAX_STREAMS ? is_max : GGML_CUDA_MAX_STREAMS; - ggml_cuda_set_device(g_main_device); - for (int id = 0; id < g_device_count; ++id) { + ggml_cuda_set_device(ctx.device); + for (int id = 0; id < ggml_backend_cuda_get_device_count(); ++id) { if (dev[id].row_low == dev[id].row_high) { continue; } for (int64_t is = 0; is < is_max; ++is) { - CUDA_CHECK(cudaStreamWaitEvent(g_cudaStreams[g_main_device][0], src0_extra->events[id][is], 0)); + CUDA_CHECK(cudaStreamWaitEvent(ctx.stream(), src0_extra->events[id][is], 0)); } } } } -static void ggml_cuda_repeat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { - ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_repeat); +static void ggml_cuda_repeat(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + ggml_cuda_op_flatten(ctx, src0, src1, dst, ggml_cuda_op_repeat); } -static void ggml_cuda_get_rows(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { - ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_get_rows); +static void ggml_cuda_get_rows(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + ggml_cuda_op_flatten(ctx, src0, src1, dst, ggml_cuda_op_get_rows); } -static void ggml_cuda_add(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { - ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_add); +static void ggml_cuda_add(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + ggml_cuda_op_flatten(ctx, src0, src1, dst, ggml_cuda_op_add); } -static void ggml_cuda_acc(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { - ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_acc); +static void ggml_cuda_acc(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + 
ggml_cuda_op_flatten(ctx, src0, src1, dst, ggml_cuda_op_acc); } -static void ggml_cuda_mul(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { - ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_mul); +static void ggml_cuda_mul(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + ggml_cuda_op_flatten(ctx, src0, src1, dst, ggml_cuda_op_mul); } -static void ggml_cuda_div(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { - ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_div); +static void ggml_cuda_div(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + ggml_cuda_op_flatten(ctx, src0, src1, dst, ggml_cuda_op_div); } -static void ggml_cuda_gelu(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { - ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_gelu); +static void ggml_cuda_gelu(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + ggml_cuda_op_flatten(ctx, src0, src1, dst, ggml_cuda_op_gelu); } -static void ggml_cuda_silu(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { - ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_silu); +static void ggml_cuda_silu(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + ggml_cuda_op_flatten(ctx, src0, src1, dst, ggml_cuda_op_silu); } -static void ggml_cuda_gelu_quick(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { - ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_gelu_quick); +static void ggml_cuda_gelu_quick(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + ggml_cuda_op_flatten(ctx, src0, src1, dst, ggml_cuda_op_gelu_quick); } -static void ggml_cuda_tanh(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { - ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_tanh); +static void ggml_cuda_tanh(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + ggml_cuda_op_flatten(ctx, src0, src1, dst, ggml_cuda_op_tanh); } -static void ggml_cuda_relu(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { - ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_relu); +static void ggml_cuda_relu(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + ggml_cuda_op_flatten(ctx, src0, src1, dst, ggml_cuda_op_relu); } -static void ggml_cuda_hardsigmoid(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { - ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_hardsigmoid); +static void ggml_cuda_hardsigmoid(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + ggml_cuda_op_flatten(ctx, src0, src1, dst, ggml_cuda_op_hardsigmoid); } -static void ggml_cuda_hardswish(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { - ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_hardswish); +static void ggml_cuda_hardswish(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + ggml_cuda_op_flatten(ctx, src0, src1, dst, ggml_cuda_op_hardswish); } -static void ggml_cuda_leaky_relu(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { - ggml_cuda_op_flatten(src0, src1, dst, 
ggml_cuda_op_leaky_relu); +static void ggml_cuda_leaky_relu(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + ggml_cuda_op_flatten(ctx, src0, src1, dst, ggml_cuda_op_leaky_relu); } -static void ggml_cuda_sqr(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { - ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_sqr); +static void ggml_cuda_sqr(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + ggml_cuda_op_flatten(ctx, src0, src1, dst, ggml_cuda_op_sqr); } -static void ggml_cuda_norm(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { - ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_norm); +static void ggml_cuda_norm(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + ggml_cuda_op_flatten(ctx, src0, src1, dst, ggml_cuda_op_norm); } -static void ggml_cuda_group_norm(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { - ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_group_norm); +static void ggml_cuda_group_norm(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + ggml_cuda_op_flatten(ctx, src0, src1, dst, ggml_cuda_op_group_norm); } -static void ggml_cuda_concat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { - ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_concat); +static void ggml_cuda_concat(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + ggml_cuda_op_flatten(ctx, src0, src1, dst, ggml_cuda_op_concat); } -static void ggml_cuda_upscale(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { - ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_upscale); +static void ggml_cuda_upscale(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + ggml_cuda_op_flatten(ctx, src0, src1, dst, ggml_cuda_op_upscale); } -static void ggml_cuda_pad(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { - ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_pad); +static void ggml_cuda_pad(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + ggml_cuda_op_flatten(ctx, src0, src1, dst, ggml_cuda_op_pad); } -static void ggml_cuda_arange(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { - ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra; - - // dd = data device - float * src0_ddf = nullptr; - float * src1_ddf = nullptr; - float * dst_ddf = nullptr; - - ggml_cuda_set_device(g_main_device); - cudaStream_t main_stream = g_cudaStreams[g_main_device][0]; - - dst_ddf = (float *) dst_extra->data_device[g_main_device]; - - // do the computation - ggml_cuda_op_arange(src0, src1, dst, src0_ddf, src1_ddf, dst_ddf, main_stream); - CUDA_CHECK(cudaGetLastError()); +static void ggml_cuda_arange(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + ggml_cuda_op_flatten(ctx, src0, src1, dst, ggml_cuda_op_arange); } -static void ggml_cuda_timestep_embedding(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { - ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_timestep_embedding); +static void ggml_cuda_timestep_embedding(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const 
ggml_tensor * src1, ggml_tensor * dst) { + ggml_cuda_op_flatten(ctx, src0, src1, dst, ggml_cuda_op_timestep_embedding); } -static void ggml_cuda_rms_norm(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { - ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_rms_norm); +static void ggml_cuda_rms_norm(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + ggml_cuda_op_flatten(ctx, src0, src1, dst, ggml_cuda_op_rms_norm); } -static void ggml_cuda_mul_mat_vec_p021(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst){ +static void ggml_cuda_mul_mat_vec_p021(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst){ GGML_ASSERT(ggml_is_permuted(src0) && ggml_is_permuted(src1)); - GGML_ASSERT(src0->backend != GGML_BACKEND_TYPE_GPU_SPLIT); + GGML_ASSERT(ggml_backend_buffer_is_cuda(src0->buffer)); GGML_ASSERT(src0->nb[0] <= src0->nb[1] && src0->nb[2] <= src0->nb[3]); // 0213 permutation GGML_ASSERT(src1->nb[0] <= src1->nb[1] && src1->nb[2] <= src1->nb[3]); // 0213 permutation GGML_ASSERT(src0->type == GGML_TYPE_F16); @@ -9579,26 +10255,21 @@ static void ggml_cuda_mul_mat_vec_p021(const ggml_tensor * src0, const ggml_tens const int64_t ne12 = src1->ne[2]; - ggml_cuda_set_device(g_main_device); - cudaStream_t main_stream = g_cudaStreams[g_main_device][0]; + ggml_cuda_set_device(ctx.device); + cudaStream_t main_stream = ctx.stream(); - ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra; - void * src0_ddq = src0_extra->data_device[g_main_device]; - - ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra; - float * src1_ddf = (float *) src1_extra->data_device[g_main_device]; - - ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra; - float * dst_ddf = (float *) dst_extra->data_device[g_main_device]; + void * src0_ddq = src0->data; + float * src1_ddf = (float *) src1->data; + float * dst_ddf = (float *) dst->data; ggml_mul_mat_p021_f16_f32_cuda(src0_ddq, src1_ddf, dst_ddf, ne00, ne01, ne02, ne12, main_stream); } -static void ggml_cuda_mul_mat_vec_nc(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst){ +static void ggml_cuda_mul_mat_vec_nc(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst){ GGML_ASSERT(!ggml_is_transposed(src0)); GGML_ASSERT(!ggml_is_transposed(src1)); GGML_ASSERT(!ggml_is_permuted(src0)); - GGML_ASSERT(src0->backend != GGML_BACKEND_TYPE_GPU_SPLIT); + GGML_ASSERT(ggml_backend_buffer_is_cuda(src0->buffer)); GGML_ASSERT(src0->type == GGML_TYPE_F16); GGML_ASSERT(src1->type == GGML_TYPE_F32); @@ -9611,17 +10282,12 @@ static void ggml_cuda_mul_mat_vec_nc(const ggml_tensor * src0, const ggml_tensor const int64_t ne12 = src1->ne[2]; - ggml_cuda_set_device(g_main_device); - cudaStream_t main_stream = g_cudaStreams[g_main_device][0]; + ggml_cuda_set_device(ctx.device); + cudaStream_t main_stream = ctx.stream(); - ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra; - void * src0_ddq = src0_extra->data_device[g_main_device]; - - ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra; - float * src1_ddf = (float *) src1_extra->data_device[g_main_device]; - - ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra; - float * dst_ddf = (float *) dst_extra->data_device[g_main_device]; + void * src0_ddq = src0->data; + float * src1_ddf = (float *) src1->data; + float * 
dst_ddf = (float *) dst->data; const int64_t row_stride_x = nb01 / sizeof(half); const int64_t channel_stride_x = nb02 / sizeof(half); @@ -9653,34 +10319,29 @@ static __global__ void k_compute_batched_ptrs( ptrs_dst[0*ne23 + i12 + i13*ne12] = ( char *) dst + i12*nbd2 + i13*nbd3; } -static void ggml_cuda_mul_mat_batched_cublas(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { +static void ggml_cuda_mul_mat_batched_cublas(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { GGML_ASSERT(!ggml_is_transposed(src0)); GGML_ASSERT(!ggml_is_transposed(src1)); - GGML_ASSERT(src0->backend != GGML_BACKEND_TYPE_GPU_SPLIT); + GGML_ASSERT(ggml_backend_buffer_is_cuda(src0->buffer)); GGML_ASSERT(src0->type == GGML_TYPE_F16); GGML_TENSOR_BINARY_OP_LOCALS const int64_t ne_dst = ggml_nelements(dst); - ggml_cuda_set_device(g_main_device); - cudaStream_t main_stream = g_cudaStreams[g_main_device][0]; + ggml_cuda_set_device(ctx.device); + cudaStream_t main_stream = ctx.stream(); - CUBLAS_CHECK(cublasSetStream(g_cublas_handles[g_main_device], main_stream)); + CUBLAS_CHECK(cublasSetStream(ctx.cublas_handle(), main_stream)); - ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra; - void * src0_ddq = src0_extra->data_device[g_main_device]; + void * src0_ddq = src0->data; half * src0_f16 = (half *) src0_ddq; - - ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra; - float * src1_ddf = (float *) src1_extra->data_device[g_main_device]; - - ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra; - float * dst_ddf = (float *) dst_extra->data_device[g_main_device]; + float * src1_ddf = (float *) src1->data; + float * dst_ddf = (float *) dst->data; // convert src1 to fp16 - cuda_pool_alloc<half> src1_f16_alloc; + ggml_cuda_pool_alloc<half> src1_f16_alloc(ctx.pool()); if (src1->type != GGML_TYPE_F16) { const to_fp16_cuda_t to_fp16_cuda = ggml_get_to_fp16_cuda(src1->type); const int64_t ne_src1 = ggml_nelements(src1); @@ -9690,7 +10351,7 @@ static void ggml_cuda_mul_mat_batched_cublas(const ggm } half * src1_f16 = src1->type == GGML_TYPE_F16 ?
(half *) src1_ddf : src1_f16_alloc.get(); - cuda_pool_alloc<half> dst_f16; + ggml_cuda_pool_alloc<half> dst_f16(ctx.pool()); char * dst_t; cublasComputeType_t cu_compute_type = CUBLAS_COMPUTE_16F; @@ -9755,7 +10416,7 @@ static void ggml_cuda_mul_mat_batched_cublas(const ggm // there is no broadcast and src0, src1 are contiguous across dims 2, 3 // use cublasGemmStridedBatchedEx CUBLAS_CHECK( - cublasGemmStridedBatchedEx(g_cublas_handles[g_main_device], CUBLAS_OP_T, CUBLAS_OP_N, + cublasGemmStridedBatchedEx(ctx.cublas_handle(), CUBLAS_OP_T, CUBLAS_OP_N, ne01, ne11, ne10, alpha, (const char *) src0_f16, CUDA_R_16F, nb01/nb00, nb02/nb00, // strideA (const char *) src1_f16, CUDA_R_16F, nb11/nb10, nb12/nb10, // strideB @@ -9767,8 +10428,8 @@ static void ggml_cuda_mul_mat_batched_cublas(const ggm // use cublasGemmBatchedEx const int ne23 = ne12*ne13; - cuda_pool_alloc<const void *> ptrs_src(2*ne23); - cuda_pool_alloc< void *> ptrs_dst(1*ne23); + ggml_cuda_pool_alloc<const void *> ptrs_src(ctx.pool(), 2*ne23); + ggml_cuda_pool_alloc< void *> ptrs_dst(ctx.pool(), 1*ne23); dim3 block_dims(ne13, ne12); k_compute_batched_ptrs<<<1, block_dims, 0, main_stream>>>( @@ -9784,7 +10445,7 @@ static void ggml_cuda_mul_mat_batched_cublas(const ggm CUDA_CHECK(cudaGetLastError()); CUBLAS_CHECK( - cublasGemmBatchedEx(g_cublas_handles[g_main_device], CUBLAS_OP_T, CUBLAS_OP_N, + cublasGemmBatchedEx(ctx.cublas_handle(), CUBLAS_OP_T, CUBLAS_OP_N, ne01, ne11, ne10, alpha, (const void **) (ptrs_src.get() + 0*ne23), CUDA_R_16F, nb01/nb00, (const void **) (ptrs_src.get() + 1*ne23), CUDA_R_16F, nb11/nb10, @@ -9801,8 +10462,8 @@ static void ggml_cuda_mul_mat_batched_cublas(const ggm } } -static void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { - const bool split = src0->backend == GGML_BACKEND_TYPE_GPU_SPLIT; +static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + const bool split = ggml_backend_buffer_is_cuda_split(src0->buffer); int64_t min_compute_capability = INT_MAX; @@ -9810,22 +10471,22 @@ static void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1 if (split) { ggml_backend_cuda_split_buffer_type_context * buft_ctx = (ggml_backend_cuda_split_buffer_type_context *) src0->buffer->buft->context; auto & tensor_split = buft_ctx->tensor_split; - for (int id = 0; id < g_device_count; ++id) { + for (int id = 0; id < ggml_backend_cuda_get_device_count(); ++id) { // skip devices that are not going to do any work: - if (tensor_split[id] >= (id + 1 < g_device_count ? tensor_split[id + 1] : 1.0f)) { + if (tensor_split[id] >= (id + 1 < ggml_backend_cuda_get_device_count() ?
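// [editor's note] When the batch is broadcast or non-contiguous, the code above
// falls back to cublasGemmBatchedEx, which consumes arrays of per-matrix pointers
// filled on the device by k_compute_batched_ptrs. The host-side equivalent of that
// indexing, shown for readability (names mirror the kernel; r2 = ne12/ne02 and
// r3 = ne13/ne03 are the broadcast ratios, and strides are assumed to be in bytes):

#include <cstdint>
#include <cstddef>

static void compute_batched_ptrs_host(
        const char * src0, const char * src1, char * dst,
        const void ** ptrs_src, void ** ptrs_dst,
        int64_t ne12, int64_t ne13, int64_t ne23,
        size_t nb02, size_t nb03, size_t nb12, size_t nb13,
        size_t nbd2, size_t nbd3, int64_t r2, int64_t r3) {
    for (int64_t i13 = 0; i13 < ne13; ++i13) {
        for (int64_t i12 = 0; i12 < ne12; ++i12) {
            const int64_t i03 = i13 / r3; // src0 batch index after broadcast
            const int64_t i02 = i12 / r2;
            ptrs_src[0*ne23 + i12 + i13*ne12] = src0 + i02*nb02 + i03*nb03; // one GEMM's A
            ptrs_src[1*ne23 + i12 + i13*ne12] = src1 + i12*nb12 + i13*nb13; // one GEMM's B
            ptrs_dst[0*ne23 + i12 + i13*ne12] = dst  + i12*nbd2 + i13*nbd3; // one GEMM's C
        }
    }
}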
tensor_split[id + 1] : 1.0f)) { continue; } - if (min_compute_capability > g_device_caps[id].cc) { - min_compute_capability = g_device_caps[id].cc; + if (min_compute_capability > get_cuda_global_info().devices[id].cc) { + min_compute_capability = get_cuda_global_info().devices[id].cc; } - if (g_device_caps[id].cc == 610) { + if (get_cuda_global_info().devices[id].cc == 610) { any_pascal_with_slow_fp16 = true; } } } else { - min_compute_capability = g_device_caps[g_main_device].cc; - any_pascal_with_slow_fp16 = g_device_caps[g_main_device].cc == 610; + min_compute_capability = get_cuda_global_info().devices[ctx.device].cc; + any_pascal_with_slow_fp16 = get_cuda_global_info().devices[ctx.device].cc == 610; } // check data types and tensor shapes for custom matrix multiplication kernels: @@ -9880,21 +10541,21 @@ static void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1 if (!split && !fp16_performance_good && src0->type == GGML_TYPE_F16 && ggml_is_permuted(src0) && ggml_is_permuted(src1) && src1->ne[1] == 1) { // KQ single-batch - ggml_cuda_mul_mat_vec_p021(src0, src1, dst); + ggml_cuda_mul_mat_vec_p021(ctx, src0, src1, dst); } else if (!split && !fp16_performance_good && src0->type == GGML_TYPE_F16 && !ggml_is_contiguous(src0) && !ggml_is_transposed(src1) && src1->ne[1] == 1) { // KQV single-batch - ggml_cuda_mul_mat_vec_nc(src0, src1, dst); + ggml_cuda_mul_mat_vec_nc(ctx, src0, src1, dst); } else if (!split && fp16_performance_good && src0->type == GGML_TYPE_F16 && !ggml_is_transposed(src0) && !ggml_is_transposed(src1) && src1->ne[2]*src1->ne[3] > 1) { // KQ + KQV multi-batch - ggml_cuda_mul_mat_batched_cublas(src0, src1, dst); + ggml_cuda_mul_mat_batched_cublas(ctx, src0, src1, dst); } else if (use_dequantize_mul_mat_vec) { - ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_dequantize_mul_mat_vec, false); + ggml_cuda_op_mul_mat(ctx, src0, src1, dst, ggml_cuda_op_dequantize_mul_mat_vec, false); } else if (use_mul_mat_vec_q) { - ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_mul_mat_vec_q, true); + ggml_cuda_op_mul_mat(ctx, src0, src1, dst, ggml_cuda_op_mul_mat_vec_q, true); } else if (use_mul_mat_q) { - ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_mul_mat_q, true); + ggml_cuda_op_mul_mat(ctx, src0, src1, dst, ggml_cuda_op_mul_mat_q, true); } else { - ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_mul_mat_cublas, false); + ggml_cuda_op_mul_mat(ctx, src0, src1, dst, ggml_cuda_op_mul_mat_cublas, false); } } @@ -10079,12 +10740,12 @@ static void ggml_cuda_mul_mat_id_cublas(ggml_tensor * dst) { } #endif -static void ggml_cuda_mul_mat_id(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { +static void ggml_cuda_mul_mat_id(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { #if 0 ggml_cuda_mul_mat_id_cublas(dst); // TODO: mmq/mmv support #endif - cudaStream_t stream = g_cudaStreams[g_main_device][0]; + cudaStream_t stream = ctx.stream(); const size_t nb11 = src1->nb[1]; const size_t nb1 = dst->nb[1]; @@ -10094,27 +10755,15 @@ static void ggml_cuda_mul_mat_id(const ggml_tensor * src0, const ggml_tensor * s const int32_t n_as = ((int32_t *) dst->op_params)[1]; std::vector<char> ids_host(ggml_nbytes(ids)); - const char * ids_dev = (const char *)((const ggml_tensor_extra_gpu *)ids->extra)->data_device[g_main_device]; + const char * ids_dev = (const char *) ids->data; CUDA_CHECK(cudaMemcpyAsync(ids_host.data(), ids_dev, ggml_nbytes(ids), cudaMemcpyDeviceToHost, stream));
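// [editor's note] The expert ids are copied to the host because routing is decided
// on the CPU: for every row of src1, the id selects one of the n_as expert matrices
// stored in dst->src[row_id + 2], and that pair goes through the regular
// ggml_cuda_mul_mat path. A simplified sketch of the per-row dispatch that follows
// (the id extraction below drops the offset handling of the real code):

for (int64_t i01 = 0; i01 < ids->ne[1]; i01++) {
    const int32_t row_id = *(const int32_t *) (ids_host.data() + i01*ids->nb[1]); // simplified
    const ggml_tensor * src0_row = dst->src[row_id + 2]; // weight matrix of the chosen expert
    // ... point src1_row/dst_row at row i01, then ggml_cuda_mul_mat(ctx, src0_row, &src1_row, &dst_row);
}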
CUDA_CHECK(cudaStreamSynchronize(stream)); - const ggml_tensor_extra_gpu * src1_extra = (const ggml_tensor_extra_gpu *) src1->extra; - const ggml_tensor_extra_gpu * dst_extra = (const ggml_tensor_extra_gpu *) dst->extra; - - ggml_tensor_extra_gpu src1_row_extra; - ggml_tensor_extra_gpu dst_row_extra; - ggml_tensor src1_row = *src1; ggml_tensor dst_row = *dst; - src1_row.backend = GGML_BACKEND_TYPE_GPU; - dst_row.backend = GGML_BACKEND_TYPE_GPU; - - src1_row.extra = &src1_row_extra; - dst_row.extra = &dst_row_extra; - - char * src1_original = (char *) src1_extra->data_device[g_main_device]; - char * dst_original = (char *) dst_extra->data_device[g_main_device]; + char * src1_original = (char *) src1->data; + char * dst_original = (char *) dst->data; if (src1->ne[1] == 1) { for (int64_t i01 = 0; i01 < ids->ne[1]; i01++) { @@ -10124,20 +10773,17 @@ static void ggml_cuda_mul_mat_id(const ggml_tensor * src0, const ggml_tensor * s const struct ggml_tensor * src0_row = dst->src[row_id + 2]; - src1_row_extra.data_device[g_main_device] = src1_original + i01*src1->nb[1]; - src1_row.data = (char *) src1->data + i01*src1->nb[1]; // TODO why is this set? + src1_row.data = src1_original + i01*src1->nb[1]; + dst_row.data = dst_original + i01*dst->nb[1]; - dst_row_extra.data_device[g_main_device] = dst_original + i01*dst->nb[1]; - dst_row.data = (char *) dst->data + i01*dst->nb[1]; // TODO why is this set? - - ggml_cuda_mul_mat(src0_row, &src1_row, &dst_row); + ggml_cuda_mul_mat(ctx, src0_row, &src1_row, &dst_row); } } else { - cuda_pool_alloc<char> src1_contiguous(sizeof(float)*ggml_nelements(src1)); - cuda_pool_alloc<char> dst_contiguous(sizeof(float)*ggml_nelements(dst)); + ggml_cuda_pool_alloc<char> src1_contiguous(ctx.pool(), sizeof(float)*ggml_nelements(src1)); + ggml_cuda_pool_alloc<char> dst_contiguous(ctx.pool(), sizeof(float)*ggml_nelements(dst)); - src1_row_extra.data_device[g_main_device] = src1_contiguous.get(); - dst_row_extra.data_device[g_main_device] = dst_contiguous.get(); + src1_row.data = src1_contiguous.get(); + dst_row.data = dst_contiguous.get(); for (int32_t row_id = 0; row_id < n_as; ++row_id) { const struct ggml_tensor * src0_row = dst->src[row_id + 2]; @@ -10172,7 +10818,7 @@ static void ggml_cuda_mul_mat_id(const ggml_tensor * src0, const ggml_tensor * s dst_row.nb[2] = num_src1_rows*nb1; dst_row.nb[3] = num_src1_rows*nb1; - ggml_cuda_mul_mat(src0_row, &src1_row, &dst_row); + ggml_cuda_mul_mat(ctx, src0_row, &src1_row, &dst_row); num_src1_rows = 0; for (int64_t i01 = 0; i01 < ids->ne[1]; i01++) { @@ -10192,20 +10838,19 @@ static void ggml_cuda_mul_mat_id(const ggml_tensor * src0, const ggml_tensor * s } } -static void ggml_cuda_scale(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { - ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_scale); +static void ggml_cuda_scale(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + ggml_cuda_op_flatten(ctx, src0, src1, dst, ggml_cuda_op_scale); } -static void ggml_cuda_clamp(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { - ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_clamp); +static void ggml_cuda_clamp(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + ggml_cuda_op_flatten(ctx, src0, src1, dst, ggml_cuda_op_clamp); } -static void ggml_cuda_cpy(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { +static void ggml_cuda_cpy(ggml_backend_cuda_context & ctx, const ggml_tensor *
src0, const ggml_tensor * src1, ggml_tensor * dst) { const int64_t ne = ggml_nelements(src0); GGML_ASSERT(ne == ggml_nelements(src1)); - GGML_ASSERT(src0->backend == GGML_BACKEND_TYPE_GPU); - GGML_ASSERT(src1->backend == GGML_BACKEND_TYPE_GPU); + GGML_ASSERT(ggml_backend_buffer_is_cuda(src0->buffer)); GGML_ASSERT(ggml_nbytes(src0) <= INT_MAX); GGML_ASSERT(ggml_nbytes(src1) <= INT_MAX); @@ -10232,14 +10877,11 @@ static void ggml_cuda_cpy(const ggml_tensor * src0, const ggml_tensor * src1, gg const int64_t nb12 = src1->nb[2]; const int64_t nb13 = src1->nb[3]; - ggml_cuda_set_device(g_main_device); - cudaStream_t main_stream = g_cudaStreams[g_main_device][0]; + ggml_cuda_set_device(ctx.device); + cudaStream_t main_stream = ctx.stream(); - const ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra; - const ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra; - - char * src0_ddc = (char *) src0_extra->data_device[g_main_device]; - char * src1_ddc = (char *) src1_extra->data_device[g_main_device]; + char * src0_ddc = (char *) src0->data; + char * src1_ddc = (char *) src1->data; if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32) { ggml_cpy_f32_f32_cuda (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream); @@ -10261,87 +10903,61 @@ static void ggml_cuda_cpy(const ggml_tensor * src0, const ggml_tensor * src1, gg GGML_ASSERT(false); } - (void) dst; + GGML_UNUSED(dst); } -static void ggml_cuda_dup(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { +static void ggml_cuda_dup(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { // TODO: why do we pass dst as src1 here? 
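// [editor's note] Answering the TODO above: ggml_cuda_cpy copies src0 into src1
// (for GGML_OP_CPY the graph's dst is merely a view of src1, which is why the dst
// argument is unused by the copy path, see GGML_UNUSED(dst) above). GGML_OP_DUP
// has no src1, so the wrapper passes its own output tensor in src1's slot and
// reuses the same copy kernels unchanged.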
- ggml_cuda_cpy(src0, dst, nullptr); - (void) src1; + ggml_cuda_cpy(ctx, src0, dst, nullptr); + GGML_UNUSED(src1); } -static void ggml_cuda_diag_mask_inf(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { - ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_diag_mask_inf); +static void ggml_cuda_diag_mask_inf(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + ggml_cuda_op_flatten(ctx, src0, src1, dst, ggml_cuda_op_diag_mask_inf); } -static void ggml_cuda_soft_max(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { - ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_soft_max); +static void ggml_cuda_soft_max(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + ggml_cuda_op_flatten(ctx, src0, src1, dst, ggml_cuda_op_soft_max); } -static void ggml_cuda_rope(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { +static void ggml_cuda_rope(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { GGML_ASSERT(ggml_is_contiguous(src0)); // TODO: this restriction is temporary until non-cont support is implemented - ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_rope); + ggml_cuda_op_flatten(ctx, src0, src1, dst, ggml_cuda_op_rope); } -static void ggml_cuda_alibi(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { - ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_alibi); +static void ggml_cuda_alibi(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + ggml_cuda_op_flatten(ctx, src0, src1, dst, ggml_cuda_op_alibi); } -static void ggml_cuda_pool2d(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { - ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_pool2d); +static void ggml_cuda_pool2d(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + ggml_cuda_op_flatten(ctx, src0, src1, dst, ggml_cuda_op_pool2d); } -static void ggml_cuda_im2col(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { - ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_im2col); +static void ggml_cuda_im2col(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + ggml_cuda_op_flatten(ctx, src0, src1, dst, ggml_cuda_op_im2col); } -static void ggml_cuda_sum_rows(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { +static void ggml_cuda_sum_rows(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { GGML_ASSERT(ggml_is_contiguous(src0)); - ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_sum_rows); + ggml_cuda_op_flatten(ctx, src0, src1, dst, ggml_cuda_op_sum_rows); } -static void ggml_cuda_argsort(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { +static void ggml_cuda_argsort(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { GGML_ASSERT(ggml_is_contiguous(src0)); - ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_argsort); + ggml_cuda_op_flatten(ctx, src0, src1, dst, ggml_cuda_op_argsort); } -static void ggml_cuda_nop(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { - (void) src0; - (void) src1; - (void) dst; +static void ggml_cuda_nop(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor 
* src1, ggml_tensor * dst) { + GGML_UNUSED(ctx); + GGML_UNUSED(src0); + GGML_UNUSED(src1); + GGML_UNUSED(dst); } -static size_t ggml_nbytes_split(const struct ggml_tensor * tensor, int nrows_split) { - static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function"); - - return nrows_split*ggml_row_size(tensor->type, tensor->ne[0]); -} - -static void ggml_cuda_set_main_device(const int main_device) { - if (main_device >= g_device_count) { - fprintf(stderr, "warning: cannot set main_device=%d because there are only %d devices. Using device %d instead.\n", - main_device, g_device_count, g_main_device); - return; - } - - if (g_main_device != main_device && g_device_count > 1) { - g_main_device = main_device; - //cudaDeviceProp prop; - //CUDA_CHECK(cudaGetDeviceProperties(&prop, g_main_device)); - //fprintf(stderr, "%s: using device %d (%s) as main device\n", __func__, g_main_device, prop.name); - } -} - -static bool ggml_cuda_compute_forward(struct ggml_tensor * tensor) { - if (!g_cublas_loaded) return false; - - if (tensor->op == GGML_OP_MUL_MAT) { - if (tensor->src[0]->ne[3] != tensor->src[1]->ne[3]) { -#ifndef NDEBUG - fprintf(stderr, "%s: cannot compute %s: src0->ne[3] = %" PRId64 ", src1->ne[3] = %" PRId64 " - fallback to CPU\n", __func__, tensor->name, tensor->src[0]->ne[3], tensor->src[1]->ne[3]); -#endif - return false; - } +static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct ggml_tensor * tensor) { + // FIXME: where should this be? + if (tensor->src[0] != nullptr && ggml_backend_buffer_is_cuda_split(tensor->src[0]->buffer)) { + ggml_cuda_set_peer_access(tensor->src[1]->ne[1], ctx.device); } ggml_cuda_func_t func; @@ -10423,7 +11039,12 @@ static bool ggml_cuda_compute_forward(struct ggml_tensor * tensor) { func = ggml_cuda_rms_norm; break; case GGML_OP_MUL_MAT: - func = ggml_cuda_mul_mat; + if (tensor->src[0]->ne[3] != tensor->src[1]->ne[3]) { + fprintf(stderr, "%s: cannot compute %s: src0->ne[3] = %" PRId64 ", src1->ne[3] = %" PRId64 " - fallback to CPU\n", __func__, tensor->name, tensor->src[0]->ne[3], tensor->src[1]->ne[3]); + return false; + } else { + func = ggml_cuda_mul_mat; + } break; case GGML_OP_MUL_MAT_ID: func = ggml_cuda_mul_mat_id; @@ -10478,640 +11099,13 @@ static bool ggml_cuda_compute_forward(struct ggml_tensor * tensor) { return false; } - if (tensor->src[0] != nullptr && tensor->src[0]->backend == GGML_BACKEND_TYPE_GPU_SPLIT) { - ggml_cuda_set_peer_access(tensor->src[1]->ne[1]); - } - - func(tensor->src[0], tensor->src[1], tensor); + func(ctx, tensor->src[0], tensor->src[1], tensor); return true; } -static int ggml_cuda_get_device_count() { - int device_count; - if (cudaGetDeviceCount(&device_count) != cudaSuccess) { - return 0; - } - return device_count; -} - -static void ggml_cuda_get_device_description(int device, char * description, size_t description_size) { - cudaDeviceProp prop; - CUDA_CHECK(cudaGetDeviceProperties(&prop, device)); - snprintf(description, description_size, "%s", prop.name); -} //////////////////////////////////////////////////////////////////////////////// -// backend interface - -#define UNUSED GGML_UNUSED - -struct ggml_backend_cuda_context { - explicit ggml_backend_cuda_context(int device) : - device(device), - name(GGML_CUDA_NAME + std::to_string(device)) { - } - - ~ggml_backend_cuda_context() { - if (copy_event != nullptr) { - CUDA_CHECK(cudaEventDestroy(copy_event)); - } - } - - int device; - std::string name; - cudaEvent_t copy_event = nullptr; -}; - -// cuda buffer - -struct 
ggml_backend_cuda_buffer_context { - int device; - void * dev_ptr = nullptr; - ggml_tensor_extra_gpu * temp_tensor_extras = nullptr; - size_t temp_tensor_extra_index = 0; - std::string name; - - ggml_backend_cuda_buffer_context(int device, void * dev_ptr) : - device(device), dev_ptr(dev_ptr), - name(GGML_CUDA_NAME + std::to_string(device)) { - } - - ~ggml_backend_cuda_buffer_context() { - delete[] temp_tensor_extras; - } - - ggml_tensor_extra_gpu * ggml_cuda_alloc_temp_tensor_extra() { - // TODO: remove GGML_CUDA_MAX_NODES, allocate dynamically and reuse in backend_buffer_reset - if (temp_tensor_extras == nullptr) { - temp_tensor_extras = new ggml_tensor_extra_gpu[GGML_CUDA_MAX_NODES]; - } - - size_t alloc_index = temp_tensor_extra_index; - temp_tensor_extra_index = (temp_tensor_extra_index + 1) % GGML_CUDA_MAX_NODES; - ggml_tensor_extra_gpu * extra = &temp_tensor_extras[alloc_index]; - memset(extra, 0, sizeof(*extra)); - - return extra; - } -}; - -GGML_CALL static const char * ggml_backend_cuda_buffer_get_name(ggml_backend_buffer_t buffer) { - ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context; - return ctx->name.c_str(); -} - -GGML_CALL static bool ggml_backend_buffer_is_cuda(ggml_backend_buffer_t buffer) { - return buffer->iface.get_name == ggml_backend_cuda_buffer_get_name; -} - -GGML_CALL static void ggml_backend_cuda_buffer_free_buffer(ggml_backend_buffer_t buffer) { - ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context; - CUDA_CHECK(cudaFree(ctx->dev_ptr)); - delete ctx; -} - -GGML_CALL static void * ggml_backend_cuda_buffer_get_base(ggml_backend_buffer_t buffer) { - ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context; - return ctx->dev_ptr; -} - -GGML_CALL static void ggml_backend_cuda_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) { - ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context; - - if (tensor->view_src != NULL && tensor->view_offs == 0) { - assert(tensor->view_src->buffer->buft == buffer->buft); - tensor->backend = tensor->view_src->backend; - tensor->extra = tensor->view_src->extra; - return; - } - - ggml_tensor_extra_gpu * extra = ctx->ggml_cuda_alloc_temp_tensor_extra(); - - extra->data_device[ctx->device] = tensor->data; - - tensor->backend = GGML_BACKEND_TYPE_GPU; - tensor->extra = extra; - - if (ggml_is_quantized(tensor->type)) { - // initialize padding to 0 to avoid possible NaN values - size_t original_size = ggml_nbytes(tensor); - size_t padded_size = ggml_backend_buft_get_alloc_size(buffer->buft, tensor); - - if (padded_size > original_size && tensor->view_src == nullptr) { - ggml_cuda_set_device(ctx->device); - CUDA_CHECK(cudaMemset((char *)tensor->data + original_size, 0, padded_size - original_size)); - } - } -} - -GGML_CALL static void ggml_backend_cuda_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) { - GGML_ASSERT(tensor->backend == GGML_BACKEND_TYPE_GPU); - - ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context; - - ggml_cuda_set_device(ctx->device); - CUDA_CHECK(cudaMemcpyAsync((char *)tensor->data + offset, data, size, cudaMemcpyHostToDevice, cudaStreamPerThread)); - CUDA_CHECK(cudaStreamSynchronize(cudaStreamPerThread)); -} - -GGML_CALL static void ggml_backend_cuda_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t 
offset, size_t size) { - GGML_ASSERT(tensor->backend == GGML_BACKEND_TYPE_GPU); - - ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context; - - ggml_cuda_set_device(ctx->device); - CUDA_CHECK(cudaMemcpyAsync(data, (const char *)tensor->data + offset, size, cudaMemcpyDeviceToHost, cudaStreamPerThread)); - CUDA_CHECK(cudaStreamSynchronize(cudaStreamPerThread)); -} - -GGML_CALL static bool ggml_backend_cuda_buffer_cpy_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * src, ggml_tensor * dst) { - if (ggml_backend_buffer_is_cuda(src->buffer)) { - ggml_backend_cuda_buffer_context * src_ctx = (ggml_backend_cuda_buffer_context *)src->buffer->context; - ggml_backend_cuda_buffer_context * dst_ctx = (ggml_backend_cuda_buffer_context *)dst->buffer->context; - if (src_ctx->device == dst_ctx->device) { - CUDA_CHECK(cudaMemcpyAsync(dst->data, src->data, ggml_nbytes(src), cudaMemcpyDeviceToDevice, cudaStreamPerThread)); - } else { - CUDA_CHECK(cudaMemcpyPeerAsync(dst->data, dst_ctx->device, src->data, src_ctx->device, ggml_nbytes(src), cudaStreamPerThread)); - } - CUDA_CHECK(cudaStreamSynchronize(cudaStreamPerThread)); - return true; - } - return false; - - UNUSED(buffer); -} - -GGML_CALL static void ggml_backend_cuda_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) { - ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context; - - ggml_cuda_set_device(ctx->device); - CUDA_CHECK(cudaDeviceSynchronize()); - CUDA_CHECK(cudaMemset(ctx->dev_ptr, value, buffer->size)); - CUDA_CHECK(cudaDeviceSynchronize()); -} - -static ggml_backend_buffer_i ggml_backend_cuda_buffer_interface = { - /* .get_name = */ ggml_backend_cuda_buffer_get_name, - /* .free_buffer = */ ggml_backend_cuda_buffer_free_buffer, - /* .get_base = */ ggml_backend_cuda_buffer_get_base, - /* .init_tensor = */ ggml_backend_cuda_buffer_init_tensor, - /* .set_tensor = */ ggml_backend_cuda_buffer_set_tensor, - /* .get_tensor = */ ggml_backend_cuda_buffer_get_tensor, - /* .cpy_tensor = */ ggml_backend_cuda_buffer_cpy_tensor, - /* .clear = */ ggml_backend_cuda_buffer_clear, - /* .reset = */ NULL, -}; - -// cuda buffer type -struct ggml_backend_cuda_buffer_type_context { - int device; - std::string name; -}; - -GGML_CALL static const char * ggml_backend_cuda_buffer_type_name(ggml_backend_buffer_type_t buft) { - ggml_backend_cuda_buffer_type_context * ctx = (ggml_backend_cuda_buffer_type_context *)buft->context; - - return ctx->name.c_str(); -} - -GGML_CALL static ggml_backend_buffer_t ggml_backend_cuda_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) { - ggml_backend_cuda_buffer_type_context * buft_ctx = (ggml_backend_cuda_buffer_type_context *)buft->context; - - ggml_cuda_set_device(buft_ctx->device); - - size = std::max(size, (size_t)1); // cudaMalloc returns null for size 0 - - void * dev_ptr; - cudaError_t err = cudaMalloc(&dev_ptr, size); - if (err != cudaSuccess) { - fprintf(stderr, "%s: allocating %.2f MiB on device %d: cudaMalloc failed: %s\n", __func__, size/1024.0/1024.0, buft_ctx->device, cudaGetErrorString(err)); - return nullptr; - } - - ggml_backend_cuda_buffer_context * ctx = new ggml_backend_cuda_buffer_context(buft_ctx->device, dev_ptr); - - return ggml_backend_buffer_init(buft, ggml_backend_cuda_buffer_interface, ctx, size); -} - -GGML_CALL static size_t ggml_backend_cuda_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) { - return 128; - - UNUSED(buft); -} - -GGML_CALL static size_t 
ggml_backend_cuda_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, const ggml_tensor * tensor) { - size_t size = ggml_nbytes(tensor); - int64_t ne0 = tensor->ne[0]; - - if (ggml_is_quantized(tensor->type)) { - if (ne0 % MATRIX_ROW_PADDING != 0) { - size += ggml_row_size(tensor->type, MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING); - } - } - - return size; - - UNUSED(buft); -} - -GGML_CALL static bool ggml_backend_cuda_buffer_type_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) { - if (!ggml_backend_is_cuda(backend)) { - return false; - } - - ggml_backend_cuda_buffer_type_context * buft_ctx = (ggml_backend_cuda_buffer_type_context *)buft->context; - ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context; - - return buft_ctx->device == cuda_ctx->device; -} - -static ggml_backend_buffer_type_i ggml_backend_cuda_buffer_type_interface = { - /* .get_name = */ ggml_backend_cuda_buffer_type_name, - /* .alloc_buffer = */ ggml_backend_cuda_buffer_type_alloc_buffer, - /* .get_alignment = */ ggml_backend_cuda_buffer_type_get_alignment, - /* .get_max_size = */ NULL, // defaults to SIZE_MAX - /* .get_alloc_size = */ ggml_backend_cuda_buffer_type_get_alloc_size, - /* .supports_backend = */ ggml_backend_cuda_buffer_type_supports_backend, - /* .is_host = */ NULL, -}; - -GGML_CALL ggml_backend_buffer_type_t ggml_backend_cuda_buffer_type(int device) { - ggml_init_cublas(); - - // FIXME: this is not thread safe - if (device >= ggml_backend_cuda_get_device_count()) { - return nullptr; - } - - static ggml_backend_buffer_type ggml_backend_cuda_buffer_types[GGML_CUDA_MAX_DEVICES]; - - static bool ggml_backend_cuda_buffer_type_initialized = false; - - if (!ggml_backend_cuda_buffer_type_initialized) { - for (int i = 0; i < GGML_CUDA_MAX_DEVICES; i++) { - ggml_backend_cuda_buffer_types[i] = { - /* .iface = */ ggml_backend_cuda_buffer_type_interface, - /* .context = */ new ggml_backend_cuda_buffer_type_context{i, GGML_CUDA_NAME + std::to_string(i)}, - }; - } - ggml_backend_cuda_buffer_type_initialized = true; - } - - return &ggml_backend_cuda_buffer_types[device]; -} - -// cuda split buffer - -struct ggml_backend_cuda_split_buffer_context { - ~ggml_backend_cuda_split_buffer_context() { - for (ggml_tensor_extra_gpu * extra : tensor_extras) { - for (int id = 0; id < g_device_count; ++id) { - for (int64_t is = 0; is < MAX_STREAMS; ++is) { - if (extra->events[id][is] != nullptr) { - CUDA_CHECK(cudaEventDestroy(extra->events[id][is])); - } - } - if (extra->data_device[id] != nullptr) { - CUDA_CHECK(cudaFree(extra->data_device[id])); - } - } - delete extra; - } - } - - std::vector tensor_extras; -}; - -GGML_CALL static const char * ggml_backend_cuda_split_buffer_get_name(ggml_backend_buffer_t buffer) { - return GGML_CUDA_NAME "_Split"; - - UNUSED(buffer); -} - -static bool ggml_backend_buffer_is_cuda_split(ggml_backend_buffer_t buffer) { - return buffer->iface.get_name == ggml_backend_cuda_split_buffer_get_name; - UNUSED(ggml_backend_buffer_is_cuda_split); // only used in debug builds currently, avoid unused function warning in release builds -} - -GGML_CALL static void ggml_backend_cuda_split_buffer_free_buffer(ggml_backend_buffer_t buffer) { - ggml_backend_cuda_split_buffer_context * ctx = (ggml_backend_cuda_split_buffer_context *)buffer->context; - delete ctx; -} - -GGML_CALL static void * ggml_backend_cuda_split_buffer_get_base(ggml_backend_buffer_t buffer) { - // the pointers are stored in the tensor extras, this is just a dummy address and never 
dereferenced - return (void *)0x1000; - - UNUSED(buffer); -} - -GGML_CALL static void ggml_backend_cuda_split_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) { - GGML_ASSERT(tensor->view_src == nullptr); // views of split tensors are not supported - - ggml_backend_cuda_split_buffer_context * ctx = (ggml_backend_cuda_split_buffer_context *)buffer->context; - ggml_backend_cuda_split_buffer_type_context * buft_ctx = (ggml_backend_cuda_split_buffer_type_context *)buffer->buft->context; - - const int64_t ne0 = tensor->ne[0]; - - ggml_tensor_extra_gpu * extra = new ggml_tensor_extra_gpu{}; - - ctx->tensor_extras.push_back(extra); - - for (int id = 0; id < g_device_count; ++id) { - int64_t row_low, row_high; - get_row_split(&row_low, &row_high, tensor, buft_ctx->tensor_split, id); - - int64_t nrows_split = row_high - row_low; - if (nrows_split == 0) { - continue; - } - - size_t size = ggml_nbytes_split(tensor, nrows_split); - const size_t original_size = size; - - // pad last row to a multiple of 512 elements to avoid out-of-bounds memory accesses - if (ne0 % MATRIX_ROW_PADDING != 0) { - size += ggml_row_size(tensor->type, MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING); - } - - // FIXME: do not crash if cudaMalloc fails - // currently, init_tensor cannot fail, it needs to be fixed in ggml-backend first - ggml_cuda_set_device(id); - char * buf; - CUDA_CHECK(cudaMalloc(&buf, size)); - - // set padding to 0 to avoid possible NaN values - if (size > original_size) { - CUDA_CHECK(cudaMemset(buf + original_size, 0, size - original_size)); - } - - extra->data_device[id] = buf; - - for (int64_t is = 0; is < MAX_STREAMS; ++is) { - CUDA_CHECK(cudaEventCreateWithFlags(&extra->events[id][is], cudaEventDisableTiming)); - } - } - tensor->backend = GGML_BACKEND_TYPE_GPU_SPLIT; - tensor->extra = extra; -} - -GGML_CALL static void ggml_backend_cuda_split_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) { - // split tensors must always be set in their entirety at once - GGML_ASSERT(offset == 0); - GGML_ASSERT(size == ggml_nbytes(tensor)); - - ggml_backend_cuda_split_buffer_type_context * buft_ctx = (ggml_backend_cuda_split_buffer_type_context *)buffer->buft->context; - - const int64_t ne0 = tensor->ne[0]; - const size_t nb1 = tensor->nb[1]; - ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *)tensor->extra; - - for (int id = 0; id < g_device_count; ++id) { - int64_t row_low, row_high; - get_row_split(&row_low, &row_high, tensor, buft_ctx->tensor_split, id); - - int64_t nrows_split = row_high - row_low; - if (nrows_split == 0) { - continue; - } - - const size_t offset_split = row_low*nb1; - size_t size = ggml_nbytes_split(tensor, nrows_split); - const size_t original_size = size; - - // pad last row to a multiple of 512 elements to avoid out-of-bounds memory accesses - if (ne0 % MATRIX_ROW_PADDING != 0) { - size += ggml_row_size(tensor->type, MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING); - } - - const char * buf_host = (const char *)data + offset_split; - CUDA_CHECK(cudaMemcpyAsync(extra->data_device[id], buf_host, original_size, cudaMemcpyHostToDevice, cudaStreamPerThread)); - } - - for (int id = 0; id < g_device_count; ++id) { - CUDA_CHECK(cudaStreamSynchronize(cudaStreamPerThread)); - } -} - -GGML_CALL static void ggml_backend_cuda_split_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset, size_t size) { - // split tensors must always be set in their entirety at 
once - GGML_ASSERT(offset == 0); - GGML_ASSERT(size == ggml_nbytes(tensor)); - - ggml_backend_cuda_split_buffer_type_context * buft_ctx = (ggml_backend_cuda_split_buffer_type_context *)buffer->buft->context; - - const int64_t ne0 = tensor->ne[0]; - const size_t nb1 = tensor->nb[1]; - ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *)tensor->extra; - - for (int id = 0; id < g_device_count; ++id) { - int64_t row_low, row_high; - get_row_split(&row_low, &row_high, tensor, buft_ctx->tensor_split, id); - - int64_t nrows_split = row_high - row_low; - if (nrows_split == 0) { - continue; - } - - const size_t offset_split = row_low*nb1; - size_t size = ggml_nbytes_split(tensor, nrows_split); - const size_t original_size = size; - - // pad last row to a multiple of 512 elements to avoid out-of-bounds memory accesses - if (ne0 % MATRIX_ROW_PADDING != 0) { - size += ggml_row_size(tensor->type, MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING); - } - - char * buf_host = (char *)data + offset_split; - CUDA_CHECK(cudaMemcpyAsync(buf_host, extra->data_device[id], original_size, cudaMemcpyDeviceToHost, cudaStreamPerThread)); - } - - for (int id = 0; id < g_device_count; ++id) { - CUDA_CHECK(cudaStreamSynchronize(cudaStreamPerThread)); - } -} - -GGML_CALL static void ggml_backend_cuda_split_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) { - UNUSED(buffer); - UNUSED(value); -} - -static struct ggml_backend_buffer_i ggml_backend_cuda_split_buffer_interface = { - /* .get_name = */ ggml_backend_cuda_split_buffer_get_name, - /* .free_buffer = */ ggml_backend_cuda_split_buffer_free_buffer, - /* .get_base = */ ggml_backend_cuda_split_buffer_get_base, - /* .init_tensor = */ ggml_backend_cuda_split_buffer_init_tensor, - /* .set_tensor = */ ggml_backend_cuda_split_buffer_set_tensor, - /* .get_tensor = */ ggml_backend_cuda_split_buffer_get_tensor, - /* .cpy_tensor = */ NULL, - /* .clear = */ ggml_backend_cuda_split_buffer_clear, - /* .reset = */ NULL, -}; - -// cuda split buffer type - -GGML_CALL static const char * ggml_backend_cuda_split_buffer_type_name(ggml_backend_buffer_type_t buft) { - return GGML_CUDA_NAME "_Split"; - - UNUSED(buft); -} - -GGML_CALL static ggml_backend_buffer_t ggml_backend_cuda_split_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) { - // since we don't know the exact split after rounding, we cannot allocate the device buffers at this point - // instead, we allocate them for each tensor separately in init_tensor - // however, the size still represents the maximum cumulative size of all the device buffers after the tensors are allocated, - // as returned by get_alloc_size. this limit is enforced during tensor allocation by ggml-alloc, so it must be correct. 
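The upper bound described in the comment above comes from the row-padding rule used throughout this file: each device's slice of a split tensor is sized as if its last row were padded to a multiple of MATRIX_ROW_PADDING (512) elements, matching the padding that init_tensor and get_alloc_size apply. A simplified stand-alone sketch of that arithmetic, with `slice_bytes` and `row_bytes` standing in for ggml_nbytes_split and ggml_row_size:

```cpp
#include <cstddef>
#include <cstdint>

// Sketch of the split-buffer sizing rule (simplified, not the real helpers):
// pad the last row of each device's slice to a multiple of MATRIX_ROW_PADDING
// elements so row-wise kernels can read whole padded rows without going
// out of bounds. `slice_bytes` stands in for ggml_nbytes_split(tensor, nrows)
// and `row_bytes(n)` for ggml_row_size(type, n).
constexpr int64_t MATRIX_ROW_PADDING = 512;

size_t padded_slice_size(size_t slice_bytes, int64_t ne0,
                         size_t (*row_bytes)(int64_t n)) {
    size_t size = slice_bytes;
    if (ne0 % MATRIX_ROW_PADDING != 0) {
        size += row_bytes(MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING);
    }
    return size; // always >= the bytes actually copied in set/get_tensor
}
```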
- ggml_backend_cuda_split_buffer_context * ctx = new ggml_backend_cuda_split_buffer_context(); - - return ggml_backend_buffer_init(buft, ggml_backend_cuda_split_buffer_interface, ctx, size); -} - -GGML_CALL static size_t ggml_backend_cuda_split_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) { - return 128; - - UNUSED(buft); -} - -GGML_CALL static size_t ggml_backend_cuda_split_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, const ggml_tensor * tensor) { - ggml_backend_cuda_split_buffer_type_context * ctx = (ggml_backend_cuda_split_buffer_type_context *)buft->context; - - size_t total_size = 0; - - const int64_t ne0 = tensor->ne[0]; - - for (int id = 0; id < g_device_count; ++id) { - int64_t row_low, row_high; - get_row_split(&row_low, &row_high, tensor, ctx->tensor_split, id); - - int64_t nrows_split = row_high - row_low; - if (nrows_split == 0) { - continue; - } - - total_size += ggml_nbytes_split(tensor, nrows_split); - - // pad last row to a multiple of 512 elements to avoid out-of-bounds memory accesses - if (ne0 % MATRIX_ROW_PADDING != 0) { - total_size += ggml_row_size(tensor->type, MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING); - } - } - - return total_size; -} - -GGML_CALL static bool ggml_backend_cuda_split_buffer_type_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) { - return ggml_backend_is_cuda(backend); - - UNUSED(buft); -} - -GGML_CALL static bool ggml_backend_cuda_split_buffer_type_is_host(ggml_backend_buffer_type_t buft) { - return false; - - UNUSED(buft); -} - -static ggml_backend_buffer_type_i ggml_backend_cuda_split_buffer_type_interface = { - /* .get_name = */ ggml_backend_cuda_split_buffer_type_name, - /* .alloc_buffer = */ ggml_backend_cuda_split_buffer_type_alloc_buffer, - /* .get_alignment = */ ggml_backend_cuda_split_buffer_type_get_alignment, - /* .get_max_size = */ NULL, // defaults to SIZE_MAX - /* .get_alloc_size = */ ggml_backend_cuda_split_buffer_type_get_alloc_size, - /* .supports_backend = */ ggml_backend_cuda_split_buffer_type_supports_backend, - /* .is_host = */ ggml_backend_cuda_split_buffer_type_is_host, -}; - -GGML_CALL ggml_backend_buffer_type_t ggml_backend_cuda_split_buffer_type(const float * tensor_split) { - ggml_init_cublas(); - - // FIXME: this is not thread safe - static std::map, struct ggml_backend_buffer_type> buft_map; - - std::array tensor_split_arr = {}; - - bool all_zero = tensor_split == nullptr || std::all_of(tensor_split, tensor_split + GGML_CUDA_MAX_DEVICES, [](float x) { return x == 0.0f; }); - if (all_zero) { - tensor_split_arr = g_default_tensor_split; - } else { - float split_sum = 0.0f; - for (int i = 0; i < g_device_count; ++i) { - tensor_split_arr[i] = split_sum; - split_sum += tensor_split[i]; - } - for (int i = 0; i < g_device_count; ++i) { - tensor_split_arr[i] /= split_sum; - } - } - - auto it = buft_map.find(tensor_split_arr); - if (it != buft_map.end()) { - return &it->second; - } - - struct ggml_backend_buffer_type buft { - /* .iface = */ ggml_backend_cuda_split_buffer_type_interface, - /* .context = */ new ggml_backend_cuda_split_buffer_type_context{tensor_split_arr}, - }; - - auto result = buft_map.emplace(tensor_split_arr, buft); - return &result.first->second; -} - -// host buffer type - -GGML_CALL static const char * ggml_backend_cuda_host_buffer_type_name(ggml_backend_buffer_type_t buft) { - return GGML_CUDA_NAME "_Host"; - - UNUSED(buft); -} - -GGML_CALL static const char * ggml_backend_cuda_host_buffer_name(ggml_backend_buffer_t buffer) { - return GGML_CUDA_NAME 
"_Host"; - - UNUSED(buffer); -} - -GGML_CALL static void ggml_backend_cuda_host_buffer_free_buffer(ggml_backend_buffer_t buffer) { - ggml_cuda_host_free(buffer->context); -} - -GGML_CALL static ggml_backend_buffer_t ggml_backend_cuda_host_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) { - void * ptr = ggml_cuda_host_malloc(size); - - if (ptr == nullptr) { - // fallback to cpu buffer - return ggml_backend_buft_alloc_buffer(ggml_backend_cpu_buffer_type(), size); - } - - ggml_backend_buffer_t buffer = ggml_backend_cpu_buffer_from_ptr(ptr, size); - buffer->buft = buft; - buffer->iface.get_name = ggml_backend_cuda_host_buffer_name; - buffer->iface.free_buffer = ggml_backend_cuda_host_buffer_free_buffer; - - return buffer; -} - -GGML_CALL ggml_backend_buffer_type_t ggml_backend_cuda_host_buffer_type() { - static struct ggml_backend_buffer_type ggml_backend_cuda_buffer_type_host = { - /* .iface = */ { - /* .get_name = */ ggml_backend_cuda_host_buffer_type_name, - /* .alloc_buffer = */ ggml_backend_cuda_host_buffer_type_alloc_buffer, - /* .get_alignment = */ ggml_backend_cpu_buffer_type()->iface.get_alignment, - /* .get_max_size = */ NULL, // defaults to SIZE_MAX - /* .get_alloc_size = */ ggml_backend_cpu_buffer_type()->iface.get_alloc_size, - /* .supports_backend = */ ggml_backend_cpu_buffer_type()->iface.supports_backend, - /* .is_host = */ ggml_backend_cpu_buffer_type()->iface.is_host, - }, - /* .context = */ nullptr, - }; - - return &ggml_backend_cuda_buffer_type_host; -} - -//static bool ggml_backend_buffer_is_cuda_host(ggml_backend_buffer_t buffer) { -// return buffer->buft->iface.get_name == ggml_backend_cuda_host_buffer_type_name; -//} // backend @@ -11139,9 +11133,8 @@ GGML_CALL static void ggml_backend_cuda_set_tensor_async(ggml_backend_t backend, ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer; GGML_ASSERT(buf->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device) && "unsupported buffer type"); - GGML_ASSERT(tensor->backend == GGML_BACKEND_TYPE_GPU); - CUDA_CHECK(cudaMemcpyAsync((char *)tensor->data + offset, data, size, cudaMemcpyHostToDevice, g_cudaStreams[cuda_ctx->device][0])); + CUDA_CHECK(cudaMemcpyAsync((char *)tensor->data + offset, data, size, cudaMemcpyHostToDevice, cuda_ctx->stream())); } GGML_CALL static void ggml_backend_cuda_get_tensor_async(ggml_backend_t backend, const ggml_tensor * tensor, void * data, size_t offset, size_t size) { @@ -11149,9 +11142,8 @@ GGML_CALL static void ggml_backend_cuda_get_tensor_async(ggml_backend_t backend, ggml_backend_buffer_t buf = tensor->view_src ? 
tensor->view_src->buffer : tensor->buffer; GGML_ASSERT(buf->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device) && "unsupported buffer type"); - GGML_ASSERT(tensor->backend == GGML_BACKEND_TYPE_GPU); - CUDA_CHECK(cudaMemcpyAsync(data, (const char *)tensor->data + offset, size, cudaMemcpyDeviceToHost, g_cudaStreams[cuda_ctx->device][0])); + CUDA_CHECK(cudaMemcpyAsync(data, (const char *)tensor->data + offset, size, cudaMemcpyDeviceToHost, cuda_ctx->stream())); } GGML_CALL static bool ggml_backend_cuda_cpy_tensor_async(ggml_backend_t backend_src, ggml_backend_t backend_dst, const ggml_tensor * src, ggml_tensor * dst) { @@ -11186,19 +11178,19 @@ GGML_CALL static bool ggml_backend_cuda_cpy_tensor_async(ggml_backend_t backend_ // copy on src stream if (cuda_ctx_src->device == cuda_ctx_dst->device) { - CUDA_CHECK(cudaMemcpyAsync(dst->data, src->data, ggml_nbytes(dst), cudaMemcpyDeviceToDevice, g_cudaStreams[cuda_ctx_dst->device][0])); + CUDA_CHECK(cudaMemcpyAsync(dst->data, src->data, ggml_nbytes(dst), cudaMemcpyDeviceToDevice, cuda_ctx_dst->stream())); } else { - CUDA_CHECK(cudaMemcpyPeerAsync(dst->data, cuda_ctx_dst->device, src->data, cuda_ctx_src->device, ggml_nbytes(dst), g_cudaStreams[cuda_ctx_src->device][0])); + CUDA_CHECK(cudaMemcpyPeerAsync(dst->data, cuda_ctx_dst->device, src->data, cuda_ctx_src->device, ggml_nbytes(dst), cuda_ctx_src->stream())); } // record event on src stream - CUDA_CHECK(cudaEventRecord(cuda_ctx_src->copy_event, g_cudaStreams[cuda_ctx_src->device][0])); + CUDA_CHECK(cudaEventRecord(cuda_ctx_src->copy_event, cuda_ctx_src->stream())); // wait on dst stream for the copy to complete - CUDA_CHECK(cudaStreamWaitEvent(g_cudaStreams[cuda_ctx_dst->device][0], cuda_ctx_src->copy_event, 0)); + CUDA_CHECK(cudaStreamWaitEvent(cuda_ctx_dst->stream(), cuda_ctx_src->copy_event, 0)); } else { // src and dst are on the same backend - CUDA_CHECK(cudaMemcpyAsync(dst->data, src->data, ggml_nbytes(dst), cudaMemcpyDeviceToDevice, g_cudaStreams[cuda_ctx_dst->device][0])); + CUDA_CHECK(cudaMemcpyAsync(dst->data, src->data, ggml_nbytes(dst), cudaMemcpyDeviceToDevice, cuda_ctx_dst->stream())); } return true; } @@ -11206,15 +11198,15 @@ GGML_CALL static bool ggml_backend_cuda_cpy_tensor_async(ggml_backend_t backend_ GGML_CALL static void ggml_backend_cuda_synchronize(ggml_backend_t backend) { ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context; - CUDA_CHECK(cudaStreamSynchronize(g_cudaStreams[cuda_ctx->device][0])); + CUDA_CHECK(cudaStreamSynchronize(cuda_ctx->stream())); - UNUSED(backend); + GGML_UNUSED(backend); } GGML_CALL static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) { ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context; - ggml_cuda_set_main_device(cuda_ctx->device); + ggml_cuda_set_device(cuda_ctx->device); for (int i = 0; i < cgraph->n_nodes; i++) { ggml_tensor * node = cgraph->nodes[i]; @@ -11224,20 +11216,15 @@ GGML_CALL static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t } #ifndef NDEBUG - assert(node->backend == GGML_BACKEND_TYPE_GPU || node->backend == GGML_BACKEND_TYPE_GPU_SPLIT); assert(node->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device)); - assert(node->extra != nullptr); - for (int j = 0; j < GGML_MAX_SRC; j++) { if (node->src[j] != nullptr) { - assert(node->src[j]->backend == GGML_BACKEND_TYPE_GPU || node->src[j]->backend == GGML_BACKEND_TYPE_GPU_SPLIT); assert(node->src[j]->buffer->buft == 
ggml_backend_cuda_buffer_type(cuda_ctx->device) || ggml_backend_buffer_is_cuda_split(node->src[j]->buffer)); - assert(node->src[j]->extra != nullptr); } } #endif - bool ok = ggml_cuda_compute_forward(node); + bool ok = ggml_cuda_compute_forward(*cuda_ctx, node); if (!ok) { fprintf(stderr, "%s: error: op not supported %s (%s)\n", __func__, node->name, ggml_op_name(node->op)); } @@ -11371,7 +11358,7 @@ GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, cons return false; } - UNUSED(backend); + GGML_UNUSED(backend); } GGML_CALL static bool ggml_backend_cuda_offload_op(ggml_backend_t backend, const ggml_tensor * op) { @@ -11379,7 +11366,7 @@ GGML_CALL static bool ggml_backend_cuda_offload_op(ggml_backend_t backend, const return op->ne[1] >= min_batch_size && op->op != GGML_OP_GET_ROWS; - UNUSED(backend); + GGML_UNUSED(backend); } static ggml_backend_event_t ggml_backend_cuda_event_new(ggml_backend_t backend) { @@ -11405,14 +11392,14 @@ static void ggml_backend_cuda_event_free(ggml_backend_event_t event) { static void ggml_backend_cuda_event_record(ggml_backend_event_t event) { ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)event->backend->context; - CUDA_CHECK(cudaEventRecord((cudaEvent_t)event->context, g_cudaStreams[cuda_ctx->device][0])); + CUDA_CHECK(cudaEventRecord((cudaEvent_t)event->context, cuda_ctx->stream())); } static void ggml_backend_cuda_event_wait(ggml_backend_t backend, ggml_backend_event_t event) { ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context; if (ggml_backend_is_cuda(event->backend)) { - CUDA_CHECK(cudaStreamWaitEvent(g_cudaStreams[cuda_ctx->device][0], (cudaEvent_t)event->context, 0)); + CUDA_CHECK(cudaStreamWaitEvent(cuda_ctx->stream(), (cudaEvent_t)event->context, 0)); } else { #if 0 // untested @@ -11421,7 +11408,7 @@ static void ggml_backend_cuda_event_wait(ggml_backend_t backend, ggml_backend_ev ggml_backend_event_synchronize(event); }; - CUDA_CHECK(cudaLaunchHostFunc(g_cudaStreams[cuda_ctx->device][0], wait_fn, event)); + CUDA_CHECK(cudaLaunchHostFunc(cuda_ctx->stream(), wait_fn, event)); #endif GGML_ASSERT(false); } @@ -11458,16 +11445,11 @@ static ggml_guid_t ggml_backend_cuda_guid() { } GGML_CALL ggml_backend_t ggml_backend_cuda_init(int device) { - ggml_init_cublas(); - - if (device < 0 || device >= ggml_cuda_get_device_count()) { + if (device < 0 || device >= ggml_backend_cuda_get_device_count()) { fprintf(stderr, "%s: error: invalid device %d\n", __func__, device); return nullptr; } - // not strictly necessary, but it may reduce the overhead of the first graph_compute - ggml_cuda_set_main_device(device); - ggml_backend_cuda_context * ctx = new ggml_backend_cuda_context(device); if (ctx == nullptr) { fprintf(stderr, "%s: error: failed to allocate context\n", __func__); @@ -11488,11 +11470,13 @@ GGML_CALL bool ggml_backend_is_cuda(ggml_backend_t backend) { } GGML_CALL int ggml_backend_cuda_get_device_count() { - return ggml_cuda_get_device_count(); + return get_cuda_global_info().device_count; } GGML_CALL void ggml_backend_cuda_get_device_description(int device, char * description, size_t description_size) { - ggml_cuda_get_device_description(device, description, description_size); + cudaDeviceProp prop; + CUDA_CHECK(cudaGetDeviceProperties(&prop, device)); + snprintf(description, description_size, "%s", prop.name); } GGML_CALL void ggml_backend_cuda_get_device_memory(int device, size_t * free, size_t * total) { @@ -11531,13 +11515,13 @@ GGML_CALL static ggml_backend_t 
ggml_backend_reg_cuda_init(const char * params, ggml_backend_t cuda_backend = ggml_backend_cuda_init((int) (intptr_t) user_data); return cuda_backend; - UNUSED(params); + GGML_UNUSED(params); } extern "C" GGML_CALL int ggml_backend_cuda_reg_devices(); GGML_CALL int ggml_backend_cuda_reg_devices() { - int device_count = ggml_cuda_get_device_count(); + int device_count = ggml_backend_cuda_get_device_count(); //int device_count = 1; // DEBUG: some tools require delaying CUDA initialization for (int i = 0; i < device_count; i++) { char name[128]; From 272935b281fee5c683e3d6d1eb580b84553cf503 Mon Sep 17 00:00:00 2001 From: Ziang Wu <97337387+ZiangWu-77@users.noreply.github.com> Date: Wed, 20 Mar 2024 23:02:32 +0800 Subject: [PATCH 26/28] llava : add MobileVLM_V2 backup (#6175) * Add MobileVLM_V2 backup * Update MobileVLM-README.md * Update examples/llava/MobileVLM-README.md Co-authored-by: Georgi Gerganov * Update examples/llava/convert-image-encoder-to-gguf.py Co-authored-by: Georgi Gerganov * clip : fix whitespace * fix deifinition mistake in clip.cpp --------- Co-authored-by: Georgi Gerganov --- examples/llava/MobileVLM-README.md | 14 +++++- examples/llava/clip.cpp | 50 ++++++++++++++++++- .../llava/convert-image-encoder-to-gguf.py | 9 ++-- 3 files changed, 67 insertions(+), 6 deletions(-) diff --git a/examples/llava/MobileVLM-README.md b/examples/llava/MobileVLM-README.md index 9eba791da..c1f361d17 100644 --- a/examples/llava/MobileVLM-README.md +++ b/examples/llava/MobileVLM-README.md @@ -1,11 +1,13 @@ # MobileVLM -Currently this implementation supports [MobileVLM-v1.7](https://huggingface.co/mtgv/MobileVLM-1.7B) variants. +Currently this implementation supports [MobileVLM-1.7B](https://huggingface.co/mtgv/MobileVLM-1.7B) / [MobileVLM_V2-1.7B](https://huggingface.co/mtgv/MobileVLM_V2-1.7B) variants. for more information, please go to [Meituan-AutoML/MobileVLM](https://github.com/Meituan-AutoML/MobileVLM) The implementation is based on llava, and is compatible with llava and mobileVLM. The usage is basically same as llava. +Notice: The overall process of model inference for both **MobilVLM** and **MobilVLM_V2** models is the same, but the process of model conversion is a little different. Therefore, using MobiVLM as an example, the different conversion step will be shown. + ## Usage Build with cmake or run `make llava-cli` to build it. @@ -34,7 +36,7 @@ git clone https://huggingface.co/openai/clip-vit-large-patch14-336 python ./examples/llava/llava-surgery.py -m path/to/MobileVLM-1.7B ``` -3. Use `convert-image-encoder-to-gguf.py` with `--projector-type ldp` to convert the LLaVA image encoder to GGUF: +3. Use `convert-image-encoder-to-gguf.py` with `--projector-type ldp` (for **V2** the arg is `--projector-type ldpv2`) to convert the LLaVA image encoder to GGUF: ```sh python ./examples/llava/convert-image-encoder-to-gguf \ @@ -44,6 +46,14 @@ python ./examples/llava/convert-image-encoder-to-gguf \ --projector-type ldp ``` +```sh +python ./examples/llava/convert-image-encoder-to-gguf \ + -m path/to/clip-vit-large-patch14-336 \ + --llava-projector path/to/MobileVLM-1.7B_V2/llava.projector \ + --output-dir path/to/MobileVLM-1.7B_V2 \ + --projector-type ldpv2 +``` + 4. Use `convert.py` to convert the LLaMA part of LLaVA to GGUF: ```sh diff --git a/examples/llava/clip.cpp b/examples/llava/clip.cpp index 690bca2eb..48caafa87 100644 --- a/examples/llava/clip.cpp +++ b/examples/llava/clip.cpp @@ -119,6 +119,7 @@ static std::string format(const char * fmt, ...) 
{
 #define TN_LLAVA_PROJ        "mm.%d.%s"
 #define TN_MVLM_PROJ_MLP     "mm.model.mlp.%d.%s"
 #define TN_MVLM_PROJ_BLOCK   "mm.model.mb_block.%d.block.%d.%s"
+#define TN_MVLM_PROJ_PEG     "mm.model.peg.%d.%s"
 #define TN_IMAGE_NEWLINE     "model.image_newline"
 
@@ -126,12 +127,14 @@ enum projector_type {
     PROJECTOR_TYPE_MLP,
     PROJECTOR_TYPE_MLP_NORM,
     PROJECTOR_TYPE_LDP,
+    PROJECTOR_TYPE_LDPV2,
     PROJECTOR_TYPE_UNKNOWN,
 };
 
 static std::map<projector_type, std::string> PROJECTOR_TYPE_NAMES = {
     { PROJECTOR_TYPE_MLP, "mlp" },
     { PROJECTOR_TYPE_LDP, "ldp" },
+    { PROJECTOR_TYPE_LDPV2, "ldpv2"},
 };
 
@@ -475,6 +478,14 @@ struct clip_vision_model {
     struct ggml_tensor * mm_model_block_2_block_2_0_w;
     struct ggml_tensor * mm_model_block_2_block_2_1_w;
     struct ggml_tensor * mm_model_block_2_block_2_1_b;
+
+    // MobileVLM_V2 projection
+    struct ggml_tensor * mm_model_mlp_0_w;
+    struct ggml_tensor * mm_model_mlp_0_b;
+    struct ggml_tensor * mm_model_mlp_2_w;
+    struct ggml_tensor * mm_model_mlp_2_b;
+    struct ggml_tensor * mm_model_peg_0_w;
+    struct ggml_tensor * mm_model_peg_0_b;
 };
 
 struct clip_ctx {
@@ -807,6 +818,29 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
         }
         embeddings = block_1;
     }
+    else if (ctx->proj_type == PROJECTOR_TYPE_LDPV2)
+    {
+        int n_patch = 24;
+        struct ggml_tensor * mlp_0 = ggml_mul_mat(ctx0, model.mm_model_mlp_0_w, embeddings);
+        mlp_0 = ggml_add(ctx0, mlp_0, model.mm_model_mlp_0_b);
+        mlp_0 = ggml_gelu(ctx0, mlp_0);
+        struct ggml_tensor * mlp_2 = ggml_mul_mat(ctx0, model.mm_model_mlp_2_w, mlp_0);
+        mlp_2 = ggml_add(ctx0, mlp_2, model.mm_model_mlp_2_b);
+        // mlp_2 ne = [2048, 576, 1, 1]
+        // AVG Pool Layer 2*2, strides = 2
+        mlp_2 = ggml_cont(ctx0, ggml_permute(ctx0, mlp_2, 1, 0, 2, 3));
+        // mlp_2 ne = [576, 2048, 1, 1]
+        mlp_2 = ggml_reshape_4d(ctx0, mlp_2, n_patch, n_patch, mlp_2->ne[1], mlp_2->ne[2]);
+        // mlp_2 ne = [24, 24, 2048, 1]
+        mlp_2 = ggml_pool_2d(ctx0, mlp_2, GGML_OP_POOL_AVG, 2, 2, 2, 2, 0, 0);
+        // weight ne = [3, 3, 2048, 1]
+        struct ggml_tensor * peg_0 = ggml_conv_depthwise_2d(ctx0, model.mm_model_peg_0_w, mlp_2, 1, 1, 1, 1, 1, 1);
+        peg_0 = ggml_add(ctx0, peg_0, mlp_2);
+        peg_0 = ggml_cont(ctx0, ggml_permute(ctx0, peg_0, 1, 2, 0, 3));
+        peg_0 = ggml_add(ctx0, peg_0, model.mm_model_peg_0_b);
+        peg_0 = ggml_reshape_3d(ctx0, peg_0, peg_0->ne[0], peg_0->ne[1] * peg_0->ne[2], peg_0->ne[3]);
+        embeddings = peg_0;
+    }
     else {
         GGML_ASSERT(false);
     }
@@ -1177,7 +1211,18 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
         vision_model.mm_model_block_2_block_2_0_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 2, 2, "0.weight"));
         vision_model.mm_model_block_2_block_2_1_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 2, 2, "1.weight"));
         vision_model.mm_model_block_2_block_2_1_b = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_BLOCK, 2, 2, "1.bias"));
-    } else {
+    }
+    else if (new_clip->proj_type == PROJECTOR_TYPE_LDPV2)
+    {
+        // MobileVLM_V2 projection
+        vision_model.mm_model_mlp_0_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_MLP, 0, "weight"));
+        vision_model.mm_model_mlp_0_b = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_MLP, 0, "bias"));
+        vision_model.mm_model_mlp_2_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_MLP, 2, "weight"));
+        vision_model.mm_model_mlp_2_b = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_MLP, 2, "bias"));
+        vision_model.mm_model_peg_0_w = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_PEG, 0, "weight"));
+        vision_model.mm_model_peg_0_b = get_tensor(new_clip->ctx_data, format(TN_MVLM_PROJ_PEG, 0, "bias"));
+    }
+    else {
         std::string proj_type = PROJECTOR_TYPE_NAMES[new_clip->proj_type];
         throw std::runtime_error(format("%s: don't support projector with: %s currently\n", __func__, proj_type.c_str()));
     }
@@ -1966,6 +2011,9 @@ int clip_n_mmproj_embd(const struct clip_ctx * ctx) {
     if (ctx->proj_type == PROJECTOR_TYPE_LDP) {
         return ctx->vision_model.mm_model_block_1_block_2_1_b->ne[0];
     }
+    if (ctx->proj_type == PROJECTOR_TYPE_LDPV2) {
+        return ctx->vision_model.mm_model_peg_0_b->ne[0];
+    }
     if (ctx->proj_type == PROJECTOR_TYPE_MLP) {
         return ctx->vision_model.mm_2_b->ne[0];
     }
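The net effect of the LDPv2 projector added above is easiest to follow as a shape trace. A minimal stand-alone sketch of the arithmetic, assuming the 24x24-patch, 2048-dimension setting that the graph hardcodes via `n_patch = 24` (ggml `ne[]` order, fastest-varying dimension first):

```cpp
#include <cstdint>
#include <cstdio>

int main() {
    // mlp_2 ne = [2048, 576]: 576 patch embeddings of size 2048 after the MLP
    int64_t n_embd  = 2048;
    int64_t n_patch = 24;               // 24 * 24 = 576, the CLIP patch grid
    int64_t pooled  = n_patch / 2;      // 2x2 average pool, stride 2 -> 12
    // the PEG depthwise 3x3 convolution uses padding 1, so the residual
    // addition keeps the [12, 12, 2048] shape; the final permute + reshape
    // flattens the grid back into a token sequence
    int64_t n_tokens = pooled * pooled; // 144
    printf("LDPv2 output: %lld tokens of size %lld\n",
           (long long) n_tokens, (long long) n_embd);
    return 0;
}
```

The pooling step is the point of the design: it quarters the number of visual tokens handed to the language model while the PEG residual restores positional information.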
diff --git a/examples/llava/convert-image-encoder-to-gguf.py b/examples/llava/convert-image-encoder-to-gguf.py
index c69f89ac2..b00bf7c6d 100644
--- a/examples/llava/convert-image-encoder-to-gguf.py
+++ b/examples/llava/convert-image-encoder-to-gguf.py
@@ -1,6 +1,7 @@
 import argparse
 import os
 import json
+import re
 
 import torch
 import numpy as np
@@ -38,9 +39,11 @@ def should_skip_tensor(name: str, has_text: bool, has_vision: bool, has_llava: b
 
 def get_tensor_name(name: str) -> str:
     if "projection" in name:
         return name
-    if "mm_projector" in name:
-        return name.replace("model.mm_projector", "mm")
+    name = name.replace("model.mm_projector", "mm")
+    name = re.sub(r'mm\.mlp\.mlp', 'mm.model.mlp', name, count=1)
+    name = re.sub(r'mm\.peg\.peg', 'mm.model.peg', name, count=1)
+    return name
 
     return name.replace("text_model", "t").replace("vision_model", "v").replace("encoder.layers", "blk").replace("embeddings.", "").replace("_proj", "").replace("self_attn.", "attn_").replace("layer_norm", "ln").replace("layernorm", "ln").replace("mlp.fc1", "ffn_down").replace("mlp.fc2", "ffn_up").replace("embedding", "embd").replace("final", "post").replace("layrnorm", "ln")
 
@@ -83,7 +86,7 @@ ap.add_argument("--clip-model-is-vision", action="store_true", required=False,
 ap.add_argument("--clip-model-is-openclip", action="store_true", required=False, help="The clip model is from openclip (for ViT-SO400M type))")
 ap.add_argument("--llava-projector", help="Path to llava.projector file. If specified, save an image encoder for LLaVA models.")
-ap.add_argument("--projector-type", help="Type of projector. Possible values: mlp, ldp", choices=["mlp", "ldp"], default="mlp")
+ap.add_argument("--projector-type", help="Type of projector. Possible values: mlp, ldp, ldpv2", choices=["mlp", "ldp", "ldpv2"], default="mlp")
 ap.add_argument("-o", "--output-dir", help="Directory to save GGUF files. Default is the original model directory", default=None)
 # Example --image_mean 0.48145466 0.4578275 0.40821073 --image_std 0.26862954 0.26130258 0.27577711
 # Example --image_mean 0.5 0.5 0.5 --image_std 0.5 0.5 0.5
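The get_tensor_name change above is what lines the MobileVLM_V2 checkpoint names up with the TN_MVLM_PROJ_MLP and TN_MVLM_PROJ_PEG templates added to clip.cpp in this patch. The same mapping, transcribed to C++ purely for illustration (the example tensor names are hypothetical):

```cpp
#include <iostream>
#include <regex>
#include <string>

// C++ transcription of the renaming done in convert-image-encoder-to-gguf.py,
// so projector tensors match clip.cpp's "mm.model.mlp.%d.%s" and
// "mm.model.peg.%d.%s" templates. Illustrative sketch only.
static std::string map_projector_name(std::string name) {
    name = std::regex_replace(name, std::regex("model\\.mm_projector"), "mm");
    name = std::regex_replace(name, std::regex("mm\\.mlp\\.mlp"), "mm.model.mlp",
                              std::regex_constants::format_first_only);
    name = std::regex_replace(name, std::regex("mm\\.peg\\.peg"), "mm.model.peg",
                              std::regex_constants::format_first_only);
    return name;
}

int main() {
    // hypothetical checkpoint names -> GGUF names expected by clip.cpp
    std::cout << map_projector_name("model.mm_projector.mlp.mlp.0.weight") << "\n"; // mm.model.mlp.0.weight
    std::cout << map_projector_name("model.mm_projector.peg.peg.0.bias") << "\n";   // mm.model.peg.0.bias
    return 0;
}
```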
From f9c7ba34476ffc4f13ae2cdb1aec493a16eb8d47 Mon Sep 17 00:00:00 2001
From: Ziang Wu <97337387+ZiangWu-77@users.noreply.github.com>
Date: Wed, 20 Mar 2024 23:29:51 +0800
Subject: [PATCH 27/28] llava : update MobileVLM-README.md (#6180)

---
 examples/llava/MobileVLM-README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/llava/MobileVLM-README.md b/examples/llava/MobileVLM-README.md
index c1f361d17..4d5fef020 100644
--- a/examples/llava/MobileVLM-README.md
+++ b/examples/llava/MobileVLM-README.md
@@ -6,7 +6,7 @@ for more information, please go to [Meituan-AutoML/MobileVLM](https://github.com
 
 The implementation is based on llava, and is compatible with llava and mobileVLM. The usage is basically same as llava.
 
-Notice: The overall process of model inference for both **MobilVLM** and **MobilVLM_V2** models is the same, but the process of model conversion is a little different. Therefore, using MobiVLM as an example, the different conversion step will be shown.
+Notice: The overall process of model inference for both **MobileVLM** and **MobileVLM_V2** models is the same, but the process of model conversion is a little different. Therefore, using MobileVLM as an example, the different conversion step will be shown.
 
 ## Usage
 Build with cmake or run `make llava-cli` to build it.

From 1c51f98adcbad40e3c41f0a6ffadeb723190b417 Mon Sep 17 00:00:00 2001
From: slaren
Date: Wed, 20 Mar 2024 21:03:26 +0100
Subject: [PATCH 28/28] cuda : print the returned error when CUDA initialization fails (#6185)

---
 ggml-cuda.cu | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/ggml-cuda.cu b/ggml-cuda.cu
index e70b79c30..24af9f671 100644
--- a/ggml-cuda.cu
+++ b/ggml-cuda.cu
@@ -294,8 +294,9 @@ static ggml_cuda_device_info ggml_cuda_init() {
 
     ggml_cuda_device_info info = {};
 
-    if (cudaGetDeviceCount(&info.device_count) != cudaSuccess) {
-        fprintf(stderr, "%s: no " GGML_CUDA_NAME " devices found, " GGML_CUDA_NAME " will be disabled\n", __func__);
+    cudaError_t err = cudaGetDeviceCount(&info.device_count);
+    if (err != cudaSuccess) {
+        fprintf(stderr, "%s: failed to initialize " GGML_CUDA_NAME ": %s\n", __func__, cudaGetErrorString(err));
         return info;
     }
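The error paths in this series all funnel through the CUDA_CHECK pattern seen above. Its definition is not part of these patches; a typical shape for such a macro, given here as a sketch rather than the project's actual definition, is:

```cpp
#include <cstdio>
#include <cstdlib>
#include <cuda_runtime.h>

// Sketch of a CUDA_CHECK-style macro (not ggml's exact definition): turn
// every failed CUDA runtime call into a loud abort with file/line context,
// instead of letting the error surface later as a confusing downstream failure.
#define CUDA_CHECK(expr)                                                     \
    do {                                                                     \
        cudaError_t err_ = (expr);                                           \
        if (err_ != cudaSuccess) {                                           \
            fprintf(stderr, "CUDA error %s at %s:%d: %s\n",                  \
                    cudaGetErrorName(err_), __FILE__, __LINE__,              \
                    cudaGetErrorString(err_));                               \
            exit(1);                                                         \
        }                                                                    \
    } while (0)
```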