diff --git a/.github/workflows/nix-flake-update.yml b/.github/workflows/nix-flake-update.yml index fa9360841..3a6a96e26 100644 --- a/.github/workflows/nix-flake-update.yml +++ b/.github/workflows/nix-flake-update.yml @@ -19,4 +19,4 @@ jobs: pr-labels: | nix pr-reviewers: philiptaron,SomeoneSerge - token: ${{ secrets.GITHUB_TOKEN }} + token: ${{ secrets.FLAKE_TOKEN }} diff --git a/.gitignore b/.gitignore index cf1b692e9..fba207045 100644 --- a/.gitignore +++ b/.gitignore @@ -43,6 +43,7 @@ models-mnt /embedding /gguf /gguf-llama-simple +/imatrix /infill /libllama.so /llama-bench diff --git a/Makefile b/Makefile index 4c7e175bf..05fe9a0f6 100644 --- a/Makefile +++ b/Makefile @@ -1,6 +1,6 @@ # Define the default target now so that it is always the first target BUILD_TARGETS = \ - main quantize quantize-stats perplexity embedding vdot q8dot train-text-from-scratch convert-llama2c-to-ggml \ + main quantize quantize-stats perplexity imatrix embedding vdot q8dot train-text-from-scratch convert-llama2c-to-ggml \ simple batched batched-bench save-load-state server gguf llama-bench libllava.a llava-cli baby-llama beam-search \ speculative infill tokenize benchmark-matmult parallel finetune export-lora lookahead lookup passkey tests/test-c.o @@ -614,6 +614,9 @@ quantize-stats: examples/quantize-stats/quantize-stats.cpp build-info.o ggml. perplexity: examples/perplexity/perplexity.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS) $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) +imatrix: examples/imatrix/imatrix.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS) + $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) + embedding: examples/embedding/embedding.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS) $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) diff --git a/Package.swift b/Package.swift index 583e2e276..37524edee 100644 --- a/Package.swift +++ b/Package.swift @@ -14,7 +14,7 @@ let package = Package( .library(name: "llama", targets: ["llama"]), ], dependencies: [ - .package(url: "https://github.com/ggerganov/ggml.git", .branch("master")) + .package(url: "https://github.com/ggerganov/ggml.git", .branch("release")) ], targets: [ .target( diff --git a/common/common.cpp b/common/common.cpp index 4e89fe516..322b9f91e 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -543,9 +543,8 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) { invalid_param = true; break; } -#ifdef LLAMA_SUPPORTS_GPU_OFFLOAD params.n_gpu_layers = std::stoi(argv[i]); -#else +#ifndef LLAMA_SUPPORTS_GPU_OFFLOAD fprintf(stderr, "warning: not compiled with GPU offload support, --n-gpu-layers option will be ignored\n"); fprintf(stderr, "warning: see main README.md for information on enabling GPU BLAS support\n"); #endif @@ -554,9 +553,8 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) { invalid_param = true; break; } -#ifdef LLAMA_SUPPORTS_GPU_OFFLOAD params.n_gpu_layers_draft = std::stoi(argv[i]); -#else +#ifndef LLAMA_SUPPORTS_GPU_OFFLOAD fprintf(stderr, "warning: not compiled with GPU offload support, --n-gpu-layers-draft option will be ignored\n"); fprintf(stderr, "warning: see main README.md for information on enabling GPU BLAS support\n"); #endif @@ -565,25 +563,44 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) { invalid_param = true; break; } -#ifdef GGML_USE_CUBLAS params.main_gpu = std::stoi(argv[i]); -#else - fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS. It is not possible to set a main GPU.\n"); -#endif +#ifndef GGML_USE_CUBLAS + fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS. Setting the main GPU has no effect.\n"); +#endif // GGML_USE_CUBLAS + } else if (arg == "--split-mode" || arg == "-sm") { + if (++i >= argc) { + invalid_param = true; + break; + } + std::string arg_next = argv[i]; + if (arg_next == "none") { + params.split_mode = LLAMA_SPLIT_NONE; + } else if (arg_next == "layer") { + params.split_mode = LLAMA_SPLIT_LAYER; + } else if (arg_next == "row") { + params.split_mode = LLAMA_SPLIT_ROW; + } else { + invalid_param = true; + break; + } +#ifndef GGML_USE_CUBLAS + fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS. Setting the split mode has no effect.\n"); +#endif // GGML_USE_CUBLAS } else if (arg == "--tensor-split" || arg == "-ts") { if (++i >= argc) { invalid_param = true; break; } -#ifdef GGML_USE_CUBLAS std::string arg_next = argv[i]; // split string by , and / const std::regex regex{R"([,/]+)"}; std::sregex_token_iterator it{arg_next.begin(), arg_next.end(), regex, -1}; std::vector split_arg{it, {}}; - GGML_ASSERT(split_arg.size() <= LLAMA_MAX_DEVICES); - + if (split_arg.size() >= LLAMA_MAX_DEVICES) { + invalid_param = true; + break; + } for (size_t i = 0; i < LLAMA_MAX_DEVICES; ++i) { if (i < split_arg.size()) { params.tensor_split[i] = std::stof(split_arg[i]); @@ -591,14 +608,8 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) { params.tensor_split[i] = 0.0f; } } -#else - fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS. It is not possible to set a tensor split.\n"); -#endif // GGML_USE_CUBLAS - } else if (arg == "--no-mul-mat-q" || arg == "-nommq") { -#ifdef GGML_USE_CUBLAS - params.mul_mat_q = false; -#else - fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS. Disabling mul_mat_q kernels has no effect.\n"); +#ifndef GGML_USE_CUBLAS + fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS. Setting a tensor split has no effect.\n"); #endif // GGML_USE_CUBLAS } else if (arg == "--no-mmap") { params.use_mmap = false; @@ -630,6 +641,12 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) { break; } params.ppl_stride = std::stoi(argv[i]); + } else if (arg == "-ptc" || arg == "--print-token-count") { + if (++i >= argc) { + invalid_param = true; + break; + } + params.n_print = std::stoi(argv[i]); } else if (arg == "--ppl-output-type") { if (++i >= argc) { invalid_param = true; @@ -812,7 +829,7 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) { printf("\n"); printf("options:\n"); printf(" -h, --help show this help message and exit\n"); - printf(" --version show version and build info\n"); + printf(" --version show version and build info\n"); printf(" -i, --interactive run in interactive mode\n"); printf(" --interactive-first run in interactive mode and wait for input right away\n"); printf(" -ins, --instruct run in instruction mode (use with Alpaca models)\n"); @@ -909,14 +926,15 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) { printf(" number of layers to store in VRAM\n"); printf(" -ngld N, --n-gpu-layers-draft N\n"); printf(" number of layers to store in VRAM for the draft model\n"); - printf(" -ts SPLIT --tensor-split SPLIT\n"); - printf(" how to split tensors across multiple GPUs, comma-separated list of proportions, e.g. 3,1\n"); - printf(" -mg i, --main-gpu i the GPU to use for scratch and small tensors\n"); -#ifdef GGML_USE_CUBLAS - printf(" -nommq, --no-mul-mat-q\n"); - printf(" use " GGML_CUBLAS_NAME " instead of custom mul_mat_q " GGML_CUDA_NAME " kernels.\n"); - printf(" Not recommended since this is both slower and uses more VRAM.\n"); -#endif // GGML_USE_CUBLAS + printf(" -sm SPLIT_MODE, --split-mode SPLIT_MODE\n"); + printf(" how to split the model across multiple GPUs, one of:\n"); + printf(" - none: use one GPU only\n"); + printf(" - layer (default): split layers and KV across GPUs\n"); + printf(" - row: split rows across GPUs\n"); + printf(" -ts SPLIT, --tensor-split SPLIT\n"); + printf(" fraction of the model to offload to each GPU, comma-separated list of proportions, e.g. 3,1\n"); + printf(" -mg i, --main-gpu i the GPU to use for the model (with split-mode = none),\n"); + printf(" or for intermediate results and KV (with split-mode = row) (default: %d)\n", params.main_gpu); #endif printf(" -gan N, --grp-attn-n N\n"); printf(" group-attention factor (default: %d)\n", params.grp_attn_n); @@ -944,6 +962,8 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) { printf(" --override-kv KEY=TYPE:VALUE\n"); printf(" advanced option to override model metadata by key. may be specified multiple times.\n"); printf(" types: int, float, bool. example: --override-kv tokenizer.ggml.add_bos_token=bool:false\n"); + printf(" -ptc N, --print-token-count N\n"); + printf(" print token count every N tokens (default: %d)\n", params.n_print); printf("\n"); #ifndef LOG_DISABLE_LOGS log_print_usage(); @@ -1033,6 +1053,7 @@ struct llama_model_params llama_model_params_from_gpt_params(const gpt_params & mparams.n_gpu_layers = params.n_gpu_layers; } mparams.main_gpu = params.main_gpu; + mparams.split_mode = params.split_mode; mparams.tensor_split = params.tensor_split; mparams.use_mmap = params.use_mmap; mparams.use_mlock = params.use_mlock; @@ -1047,6 +1068,9 @@ struct llama_model_params llama_model_params_from_gpt_params(const gpt_params & } static ggml_type kv_cache_type_from_str(const std::string & s) { + if (s == "f32") { + return GGML_TYPE_F32; + } if (s == "f16") { return GGML_TYPE_F16; } diff --git a/common/common.h b/common/common.h index e2bbfc258..f29be5b5a 100644 --- a/common/common.h +++ b/common/common.h @@ -59,11 +59,13 @@ struct gpt_params { float p_split = 0.1f; // speculative decoding split probability int32_t n_gpu_layers = -1; // number of layers to store in VRAM (-1 - use default) int32_t n_gpu_layers_draft = -1; // number of layers to store in VRAM for the draft model (-1 - use default) + llama_split_mode split_mode = LLAMA_SPLIT_LAYER; // how to split the model across GPUs int32_t main_gpu = 0; // the GPU that is used for scratch and small tensors float tensor_split[LLAMA_MAX_DEVICES] = {0}; // how split tensors should be distributed across GPUs int32_t n_beams = 0; // if non-zero then use beam search of given width. int32_t grp_attn_n = 1; // group-attention factor int32_t grp_attn_w = 512; // group-attention width + int32_t n_print = -1; // print token count every n tokens (-1 = disabled) float rope_freq_base = 0.0f; // RoPE base frequency float rope_freq_scale = 0.0f; // RoPE frequency scaling factor float yarn_ext_factor = -1.0f; // YaRN extrapolation mix factor @@ -242,4 +244,3 @@ void dump_kv_cache_view(const llama_kv_cache_view & view, int row_size = 80); // Dump the KV cache view showing individual sequences in each cell (long output). void dump_kv_cache_view_seqs(const llama_kv_cache_view & view, int row_size = 40); - diff --git a/convert-hf-to-gguf.py b/convert-hf-to-gguf.py index 203eaf64b..a1c79fd47 100755 --- a/convert-hf-to-gguf.py +++ b/convert-hf-to-gguf.py @@ -817,10 +817,17 @@ class PersimmonModel(Model): hidden_size = self.hparams["hidden_size"] self.gguf_writer.add_name('persimmon-8b-chat') + self.gguf_writer.add_context_length(self.hparams["max_position_embeddings"]) self.gguf_writer.add_embedding_length(hidden_size) self.gguf_writer.add_block_count(block_count) self.gguf_writer.add_feed_forward_length(self.hparams["intermediate_size"]) - self.gguf_writer.add_rope_dimension_count(hidden_size // head_count) + + # NOTE: not sure about this change - why does the model not have a rope dimension count when it is smaller + # than the head size? + # ref: https://github.com/ggerganov/llama.cpp/pull/4889 + # self.gguf_writer.add_rope_dimension_count(hidden_size // head_count) + self.gguf_writer.add_rope_dimension_count(hidden_size // head_count // 2) + self.gguf_writer.add_head_count(head_count) self.gguf_writer.add_head_count_kv(head_count_kv) self.gguf_writer.add_rope_freq_base(self.hparams["rope_theta"]) diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt index 0c71cbdf7..fa127a3aa 100644 --- a/examples/CMakeLists.txt +++ b/examples/CMakeLists.txt @@ -36,6 +36,7 @@ else() add_subdirectory(lookahead) add_subdirectory(lookup) add_subdirectory(train-text-from-scratch) + add_subdirectory(imatrix) if (LLAMA_METAL) add_subdirectory(metal) endif() diff --git a/examples/batched-bench/batched-bench.cpp b/examples/batched-bench/batched-bench.cpp index 57596ed98..7924db267 100644 --- a/examples/batched-bench/batched-bench.cpp +++ b/examples/batched-bench/batched-bench.cpp @@ -88,7 +88,10 @@ int main(int argc, char ** argv) { llama_model_params model_params = llama_model_default_params(); + const std::vector t_split (LLAMA_MAX_DEVICES, 0.0f); + model_params.n_gpu_layers = n_gpu_layers; + model_params.tensor_split = t_split.data(); llama_model * model = llama_load_model_from_file(params.model.c_str(), model_params); diff --git a/examples/export-lora/export-lora.cpp b/examples/export-lora/export-lora.cpp index 58fbe204d..4cd5d99bb 100644 --- a/examples/export-lora/export-lora.cpp +++ b/examples/export-lora/export-lora.cpp @@ -245,9 +245,8 @@ static struct lora_data * load_lora(struct lora_info * info) { params_ggml.no_alloc = true; result->ctx = ggml_init(params_ggml); - uint32_t LLAMA_FILE_MAGIC_LORA = 0x67676C61; // 'ggla' uint32_t magic = file.read_u32(); - if (magic != LLAMA_FILE_MAGIC_LORA) { + if (magic != LLAMA_FILE_MAGIC_GGLA) { die_fmt("unexpected lora header file magic in '%s'", info->filename.c_str()); } uint32_t version = file.read_u32(); diff --git a/examples/imatrix/CMakeLists.txt b/examples/imatrix/CMakeLists.txt new file mode 100644 index 000000000..d688a1620 --- /dev/null +++ b/examples/imatrix/CMakeLists.txt @@ -0,0 +1,5 @@ +set(TARGET imatrix) +add_executable(${TARGET} imatrix.cpp) +install(TARGETS ${TARGET} RUNTIME) +target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) +target_compile_features(${TARGET} PRIVATE cxx_std_11) diff --git a/examples/imatrix/imatrix.cpp b/examples/imatrix/imatrix.cpp new file mode 100644 index 000000000..1461bc963 --- /dev/null +++ b/examples/imatrix/imatrix.cpp @@ -0,0 +1,380 @@ +#include "common.h" +#include "llama.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#if defined(_MSC_VER) +#pragma warning(disable: 4244 4267) // possible loss of data +#endif + +struct Stats { + std::vector values; + int ncall = 0; +}; + +struct StatParams { + std::string ofile = "imatrix.dat"; + int n_output_frequency = 10; + int verbosity = 1; + bool collect_output_weight = false; +}; + +class IMatrixCollector { +public: + IMatrixCollector() = default; + void set_parameters(StatParams&& params) { m_params = std::move(params); } + void collect_imatrix(const struct ggml_tensor * src0, const struct ggml_tensor * src1); + void save_imatrix() const; +private: + std::unordered_map m_stats; + StatParams m_params; + std::mutex m_mutex; + int m_last_call = 0; +}; + +void IMatrixCollector::collect_imatrix(const struct ggml_tensor * src0, const struct ggml_tensor * src1) { + if (src1->ne[1] < 16 || src1->type != GGML_TYPE_F32) return; + if (!(strncmp(src0->name, "blk.", 4) == 0 || (m_params.collect_output_weight && strcmp(src0->name, "output.weight") == 0))) return; + std::lock_guard lock(m_mutex); + auto& e = m_stats[src0->name]; + if (e.values.empty()) { + e.values.resize(src1->ne[0], 0); + } + else if (e.values.size() != (size_t)src1->ne[0]) { + fprintf(stderr, "Oops: inconsistent size for %s (%d vs %d)\n", src0->name, (int)e.values.size(), (int)src1->ne[0]); + exit(1); //GGML_ASSERT(false); + } + ++e.ncall; + if (m_params.verbosity > 1) { + printf("%s[%d]: %s, %d x %d, %d\n",__func__,m_last_call,src0->name,(int)src1->ne[0],(int)src1->ne[1],(int)src1->type); + } + for (int row = 0; row < (int)src1->ne[1]; ++row) { + const float * x = (const float *)src1->data + row * src1->ne[0]; + for (int j = 0; j < (int)src1->ne[0]; ++j) { + e.values[j] += x[j]*x[j]; + } + } + if (e.ncall > m_last_call) { + m_last_call = e.ncall; + if (m_last_call % m_params.n_output_frequency == 0) { + save_imatrix(); + } + } +} + +void IMatrixCollector::save_imatrix() const { + const char * fname = m_params.ofile.empty() ? "imatrix.dat" : m_params.ofile.c_str(); + std::ofstream out(fname, std::ios::binary); + int n_entries = m_stats.size(); + out.write((const char*)&n_entries, sizeof(n_entries)); + for (auto& p : m_stats) { + int len = p.first.size(); + out.write((const char*)&len, sizeof(len)); + out.write(p.first.c_str(), len); + out.write((const char*)&p.second.ncall, sizeof(p.second.ncall)); + int nval = p.second.values.size(); + out.write((const char*)&nval, sizeof(nval)); + if (nval > 0) out.write((const char*)p.second.values.data(), nval*sizeof(float)); + } + if (m_params.verbosity > 0) { + fprintf(stderr, "\n%s: stored collected data after %d chunks in %s\n",__func__,m_last_call,fname); + } +} + +static IMatrixCollector g_collector; + +static void ik_collect_imatrix(const struct ggml_tensor * src0, const struct ggml_tensor * src1) { + g_collector.collect_imatrix(src0, src1); +} + + +struct results_log_softmax { + double log_softmax; + float logit; + float prob; +}; + +static std::vector softmax(const std::vector& logits) { + std::vector probs(logits.size()); + float max_logit = logits[0]; + for (float v : logits) { + max_logit = std::max(max_logit, v); + } + double sum_exp = 0.0; + for (size_t i = 0; i < logits.size(); i++) { + // Subtract the maximum logit value from the current logit value for numerical stability + const float logit = logits[i] - max_logit; + const float exp_logit = expf(logit); + sum_exp += exp_logit; + probs[i] = exp_logit; + } + for (size_t i = 0; i < probs.size(); i++) { + probs[i] /= sum_exp; + } + return probs; +} + +static results_log_softmax log_softmax(int n_vocab, const float * logits, int tok) { + float max_logit = logits[0]; + for (int i = 1; i < n_vocab; ++i) { + max_logit = std::max(max_logit, logits[i]); + } + double sum_exp = 0.0; + for (int i = 0; i < n_vocab; ++i) { + sum_exp += expf(logits[i] - max_logit); + } + return {logits[tok] - max_logit - log(sum_exp), logits[tok], expf(logits[tok] - max_logit) / (float) sum_exp}; +} + +static void process_logits( + int n_vocab, const float * logits, const int * tokens, int n_token, std::vector & workers, + double & nll, double & nll2, float * logit_history, float * prob_history +) { + std::mutex mutex; + int counter = 0; + auto compute = [&mutex, &counter, &nll, &nll2, logit_history, prob_history, n_vocab, logits, tokens, n_token] () { + double local_nll = 0; + double local_nll2 = 0; + while (true) { + std::unique_lock lock(mutex); + int i = counter++; + if (i >= n_token) { + nll += local_nll; nll2 += local_nll2; + break; + } + lock.unlock(); + const results_log_softmax results = log_softmax(n_vocab, logits + i*n_vocab, tokens[i+1]); + const double v = -results.log_softmax; + local_nll += v; + local_nll2 += v*v; + + logit_history[i] = results.logit; + prob_history[i] = results.prob; + } + }; + for (auto & w : workers) { + w = std::thread(compute); + } + compute(); + for (auto & w : workers) { + w.join(); + } +} + +static bool compute_imatrix(llama_context * ctx, const gpt_params & params) { + + const bool add_bos = llama_should_add_bos_token(llama_get_model(ctx)); + const int n_ctx = llama_n_ctx(ctx); + + auto tim1 = std::chrono::high_resolution_clock::now(); + fprintf(stderr, "%s: tokenizing the input ..\n", __func__); + + std::vector tokens = ::llama_tokenize(ctx, params.prompt, add_bos); + + auto tim2 = std::chrono::high_resolution_clock::now(); + fprintf(stderr, "%s: tokenization took %g ms\n",__func__,1e-3*std::chrono::duration_cast(tim2-tim1).count()); + + if (int(tokens.size()) < 2*n_ctx) { + fprintf(stderr, "%s: you need at least %d tokens for a context of %d tokens\n",__func__,2*n_ctx, + n_ctx); + fprintf(stderr, "%s: the data file you provided tokenizes to only %zu tokens\n",__func__,tokens.size()); + return false; + } + + std::vector logit_history; + logit_history.resize(tokens.size()); + + std::vector prob_history; + prob_history.resize(tokens.size()); + + const int n_chunk_max = tokens.size() / n_ctx; + + const int n_chunk = params.n_chunks < 0 ? n_chunk_max : std::min(params.n_chunks, n_chunk_max); + const int n_vocab = llama_n_vocab(llama_get_model(ctx)); + const int n_batch = params.n_batch; + + int count = 0; + double nll = 0.0; + double nll2 = 0.0; + + fprintf(stderr, "%s: computing over %d chunks with batch_size %d\n", __func__, n_chunk, n_batch); + + std::vector workers(std::thread::hardware_concurrency() - 1); + + for (int i = 0; i < n_chunk; ++i) { + const int start = i * n_ctx; + const int end = start + n_ctx; + + const int num_batches = (n_ctx + n_batch - 1) / n_batch; + + std::vector logits; + + const auto t_start = std::chrono::high_resolution_clock::now(); + + // clear the KV cache + llama_kv_cache_clear(ctx); + + for (int j = 0; j < num_batches; ++j) { + const int batch_start = start + j * n_batch; + const int batch_size = std::min(end - batch_start, n_batch); + + // save original token and restore it after eval + const auto token_org = tokens[batch_start]; + + // add BOS token for the first batch of each chunk + if (add_bos && j == 0) { + tokens[batch_start] = llama_token_bos(llama_get_model(ctx)); + } + + if (llama_decode(ctx, llama_batch_get_one(tokens.data() + batch_start, batch_size, j * n_batch, 0))) { + fprintf(stderr, "%s : failed to eval\n", __func__); + return false; + } + + // restore the original token in case it was set to BOS + tokens[batch_start] = token_org; + + const auto * batch_logits = llama_get_logits(ctx); + logits.insert(logits.end(), batch_logits, batch_logits + batch_size * n_vocab); + } + + const auto t_end = std::chrono::high_resolution_clock::now(); + + if (i == 0) { + const float t_total = std::chrono::duration(t_end - t_start).count(); + fprintf(stderr, "%s: %.2f seconds per pass - ETA ", __func__, t_total); + int total_seconds = (int)(t_total * n_chunk); + if (total_seconds >= 60*60) { + fprintf(stderr, "%d hours ", total_seconds / (60*60)); + total_seconds = total_seconds % (60*60); + } + fprintf(stderr, "%.2f minutes\n", total_seconds / 60.0); + } + + const int first = n_ctx/2; + process_logits(n_vocab, logits.data() + first*n_vocab, tokens.data() + start + first, n_ctx - 1 - first, + workers, nll, nll2, logit_history.data() + start + first, prob_history.data() + start + first); + count += n_ctx - first - 1; + + printf("[%d]%.4lf,", i + 1, std::exp(nll / count)); + fflush(stdout); + } + printf("\n"); + + nll2 /= count; + nll /= count; + const double ppl = exp(nll); + nll2 -= nll * nll; + if (nll2 > 0) { + nll2 = sqrt(nll2/(count-1)); + printf("Final estimate: PPL = %.4lf +/- %.5lf\n", ppl, nll2*ppl); + } else { + printf("Unexpected negative standard deviation of log(prob)\n"); + } + + return true; +} + +int main(int argc, char ** argv) { + + StatParams sparams; + std::vector args; + args.push_back(argv[0]); + int iarg = 1; + for (; iarg < argc-1; ++iarg) { + std::string arg{argv[iarg]}; + if (arg == "-o" || arg == "--output-file") { + sparams.ofile = argv[++iarg]; + } + else if (arg == "-ofreq" || arg == "--output-frequency") { + sparams.n_output_frequency = std::stoi(argv[++iarg]); + } + else if (arg == "-ow" || arg == "--output-weight") { + sparams.collect_output_weight = std::stoi(argv[++iarg]); + } + else if (arg == "--verbosity") { + sparams.verbosity = std::stoi(argv[++iarg]); + } else { + args.push_back(argv[iarg]); + } + } + if (iarg < argc) { + args.push_back(argv[iarg]); + } + + gpt_params params; + params.n_batch = 512; + if (!gpt_params_parse(args.size(), args.data(), params)) { + return 1; + } + + g_collector.set_parameters(std::move(sparams)); + + ggml_set_imatrix_collection(ik_collect_imatrix); + + params.logits_all = true; + params.n_batch = std::min(params.n_batch, params.n_ctx); + + print_build_info(); + + if (params.seed == LLAMA_DEFAULT_SEED) { + params.seed = time(NULL); + } + + fprintf(stderr, "%s: seed = %u\n", __func__, params.seed); + + std::mt19937 rng(params.seed); + if (params.random_prompt) { + params.prompt = gpt_random_prompt(rng); + } + + llama_backend_init(params.numa); + + llama_model * model; + llama_context * ctx; + + // load the model and apply lora adapter, if any + std::tie(model, ctx) = llama_init_from_gpt_params(params); + if (model == NULL) { + fprintf(stderr, "%s: error: unable to load model\n", __func__); + return 1; + } + + const int n_ctx_train = llama_n_ctx_train(model); + if (params.n_ctx > n_ctx_train) { + fprintf(stderr, "%s: warning: model was trained on only %d context tokens (%d specified)\n", + __func__, n_ctx_train, params.n_ctx); + } + + // print system information + { + fprintf(stderr, "\n"); + fprintf(stderr, "%s\n", get_system_info(params).c_str()); + } + + bool OK = compute_imatrix(ctx, params); + if (!OK) { + return 1; + } + + g_collector.save_imatrix(); + + llama_print_timings(ctx); + + llama_free(ctx); + llama_free_model(model); + + llama_backend_free(); + + return 0; +} diff --git a/examples/llama-bench/llama-bench.cpp b/examples/llama-bench/llama-bench.cpp index 7f7186cde..97325b5bd 100644 --- a/examples/llama-bench/llama-bench.cpp +++ b/examples/llama-bench/llama-bench.cpp @@ -128,6 +128,25 @@ static std::string get_gpu_info() { // command line params enum output_formats {CSV, JSON, MARKDOWN, SQL}; +static const char * output_format_str(output_formats format) { + switch (format) { + case CSV: return "csv"; + case JSON: return "json"; + case MARKDOWN: return "md"; + case SQL: return "sql"; + default: GGML_ASSERT(!"invalid output format"); + } +} + +static const char * split_mode_str(llama_split_mode mode) { + switch (mode) { + case LLAMA_SPLIT_NONE: return "none"; + case LLAMA_SPLIT_LAYER: return "layer"; + case LLAMA_SPLIT_ROW: return "row"; + default: GGML_ASSERT(!"invalid split mode"); + } +} + struct cmd_params { std::vector model; std::vector n_prompt; @@ -137,6 +156,7 @@ struct cmd_params { std::vector type_v; std::vector n_threads; std::vector n_gpu_layers; + std::vector split_mode; std::vector main_gpu; std::vector no_kv_offload; std::vector mul_mat_q; @@ -155,6 +175,7 @@ static const cmd_params cmd_params_defaults = { /* type_v */ {GGML_TYPE_F16}, /* n_threads */ {get_num_physical_cores()}, /* n_gpu_layers */ {99}, + /* split_mode */ {LLAMA_SPLIT_LAYER}, /* main_gpu */ {0}, /* no_kv_offload */ {false}, /* mul_mat_q */ {true}, @@ -169,21 +190,22 @@ static void print_usage(int /* argc */, char ** argv) { printf("\n"); printf("options:\n"); printf(" -h, --help\n"); - printf(" -m, --model (default: %s)\n", join(cmd_params_defaults.model, ",").c_str()); - printf(" -p, --n-prompt (default: %s)\n", join(cmd_params_defaults.n_prompt, ",").c_str()); - printf(" -n, --n-gen (default: %s)\n", join(cmd_params_defaults.n_gen, ",").c_str()); - printf(" -b, --batch-size (default: %s)\n", join(cmd_params_defaults.n_batch, ",").c_str()); - printf(" -ctk , --cache-type-k (default: %s)\n", join(transform_to_str(cmd_params_defaults.type_k, ggml_type_name), ",").c_str()); - printf(" -ctv , --cache-type-v (default: %s)\n", join(transform_to_str(cmd_params_defaults.type_v, ggml_type_name), ",").c_str()); - printf(" -t, --threads (default: %s)\n", join(cmd_params_defaults.n_threads, ",").c_str()); - printf(" -ngl, --n-gpu-layers (default: %s)\n", join(cmd_params_defaults.n_gpu_layers, ",").c_str()); - printf(" -mg, --main-gpu (default: %s)\n", join(cmd_params_defaults.main_gpu, ",").c_str()); - printf(" -nkvo, --no-kv-offload <0|1> (default: %s)\n", join(cmd_params_defaults.no_kv_offload, ",").c_str()); - printf(" -mmq, --mul-mat-q <0|1> (default: %s)\n", join(cmd_params_defaults.mul_mat_q, ",").c_str()); - printf(" -ts, --tensor_split \n"); - printf(" -r, --repetitions (default: %d)\n", cmd_params_defaults.reps); - printf(" -o, --output (default: %s)\n", cmd_params_defaults.output_format == CSV ? "csv" : cmd_params_defaults.output_format == JSON ? "json" : cmd_params_defaults.output_format == MARKDOWN ? "md" : "sql"); - printf(" -v, --verbose (default: %s)\n", cmd_params_defaults.verbose ? "1" : "0"); + printf(" -m, --model (default: %s)\n", join(cmd_params_defaults.model, ",").c_str()); + printf(" -p, --n-prompt (default: %s)\n", join(cmd_params_defaults.n_prompt, ",").c_str()); + printf(" -n, --n-gen (default: %s)\n", join(cmd_params_defaults.n_gen, ",").c_str()); + printf(" -b, --batch-size (default: %s)\n", join(cmd_params_defaults.n_batch, ",").c_str()); + printf(" -ctk , --cache-type-k (default: %s)\n", join(transform_to_str(cmd_params_defaults.type_k, ggml_type_name), ",").c_str()); + printf(" -ctv , --cache-type-v (default: %s)\n", join(transform_to_str(cmd_params_defaults.type_v, ggml_type_name), ",").c_str()); + printf(" -t, --threads (default: %s)\n", join(cmd_params_defaults.n_threads, ",").c_str()); + printf(" -ngl, --n-gpu-layers (default: %s)\n", join(cmd_params_defaults.n_gpu_layers, ",").c_str()); + printf(" -sm, --split-mode (default: %s)\n", join(transform_to_str(cmd_params_defaults.split_mode, split_mode_str), ",").c_str()); + printf(" -mg, --main-gpu (default: %s)\n", join(cmd_params_defaults.main_gpu, ",").c_str()); + printf(" -nkvo, --no-kv-offload <0|1> (default: %s)\n", join(cmd_params_defaults.no_kv_offload, ",").c_str()); + printf(" -mmq, --mul-mat-q <0|1> (default: %s)\n", join(cmd_params_defaults.mul_mat_q, ",").c_str()); + printf(" -ts, --tensor_split (default: 0)\n"); + printf(" -r, --repetitions (default: %d)\n", cmd_params_defaults.reps); + printf(" -o, --output (default: %s)\n", output_format_str(cmd_params_defaults.output_format)); + printf(" -v, --verbose (default: %s)\n", cmd_params_defaults.verbose ? "1" : "0"); printf("\n"); printf("Multiple values can be given for each parameter by separating them with ',' or by specifying the parameter multiple times.\n"); } @@ -306,6 +328,28 @@ static cmd_params parse_cmd_params(int argc, char ** argv) { } auto p = split(argv[i], split_delim); params.n_gpu_layers.insert(params.n_gpu_layers.end(), p.begin(), p.end()); + } else if (arg == "-sm" || arg == "--split-mode") { + if (++i >= argc) { + invalid_param = true; + break; + } + auto p = split(argv[i], split_delim); + std::vector modes; + for (const auto & m : p) { + llama_split_mode mode; + if (m == "none") { + mode = LLAMA_SPLIT_NONE; + } else if (m == "layer") { + mode = LLAMA_SPLIT_LAYER; + } else if (m == "row") { + mode = LLAMA_SPLIT_ROW; + } else { + invalid_param = true; + break; + } + modes.push_back(mode); + } + params.split_mode.insert(params.split_mode.end(), modes.begin(), modes.end()); } else if (arg == "-mg" || arg == "--main-gpu") { if (++i >= argc) { invalid_param = true; @@ -392,6 +436,7 @@ static cmd_params parse_cmd_params(int argc, char ** argv) { if (params.type_k.empty()) { params.type_k = cmd_params_defaults.type_k; } if (params.type_v.empty()) { params.type_v = cmd_params_defaults.type_v; } if (params.n_gpu_layers.empty()) { params.n_gpu_layers = cmd_params_defaults.n_gpu_layers; } + if (params.split_mode.empty()) { params.split_mode = cmd_params_defaults.split_mode; } if (params.main_gpu.empty()) { params.main_gpu = cmd_params_defaults.main_gpu; } if (params.no_kv_offload.empty()){ params.no_kv_offload = cmd_params_defaults.no_kv_offload; } if (params.mul_mat_q.empty()) { params.mul_mat_q = cmd_params_defaults.mul_mat_q; } @@ -410,6 +455,7 @@ struct cmd_params_instance { ggml_type type_v; int n_threads; int n_gpu_layers; + llama_split_mode split_mode; int main_gpu; bool no_kv_offload; bool mul_mat_q; @@ -419,6 +465,7 @@ struct cmd_params_instance { llama_model_params mparams = llama_model_default_params(); mparams.n_gpu_layers = n_gpu_layers; + mparams.split_mode = split_mode; mparams.main_gpu = main_gpu; mparams.tensor_split = tensor_split.data(); @@ -428,6 +475,7 @@ struct cmd_params_instance { bool equal_mparams(const cmd_params_instance & other) const { return model == other.model && n_gpu_layers == other.n_gpu_layers && + split_mode == other.split_mode && main_gpu == other.main_gpu && tensor_split == other.tensor_split; } @@ -446,45 +494,13 @@ struct cmd_params_instance { } }; -static std::vector get_cmd_params_instances_int(const cmd_params & params, int n_gen, int n_prompt) { - std::vector instances; - - for (const auto & m : params.model) - for (const auto & nl : params.n_gpu_layers) - for (const auto & mg : params.main_gpu) - for (const auto & ts : params.tensor_split) - for (const auto & nb : params.n_batch) - for (const auto & tk : params.type_k) - for (const auto & tv : params.type_v) - for (const auto & mmq : params.mul_mat_q) - for (const auto & nkvo : params.no_kv_offload) - for (const auto & nt : params.n_threads) { - cmd_params_instance instance = { - /* .model = */ m, - /* .n_prompt = */ n_prompt, - /* .n_gen = */ n_gen, - /* .n_batch = */ nb, - /* .type_k = */ tk, - /* .type_v = */ tv, - /* .n_threads = */ nt, - /* .n_gpu_layers = */ nl, - /* .main_gpu = */ mg, - /* .no_kv_offload= */ nkvo, - /* .mul_mat_q = */ mmq, - /* .tensor_split = */ ts, - }; - instances.push_back(instance); - } - return instances; -} - static std::vector get_cmd_params_instances(const cmd_params & params) { std::vector instances; -#if 1 // this ordering minimizes the number of times that each model needs to be reloaded for (const auto & m : params.model) for (const auto & nl : params.n_gpu_layers) + for (const auto & sm : params.split_mode) for (const auto & mg : params.main_gpu) for (const auto & ts : params.tensor_split) for (const auto & nb : params.n_batch) @@ -506,6 +522,7 @@ static std::vector get_cmd_params_instances(const cmd_param /* .type_v = */ tv, /* .n_threads = */ nt, /* .n_gpu_layers = */ nl, + /* .split_mode = */ sm, /* .main_gpu = */ mg, /* .no_kv_offload= */ nkvo, /* .mul_mat_q = */ mmq, @@ -527,6 +544,7 @@ static std::vector get_cmd_params_instances(const cmd_param /* .type_v = */ tv, /* .n_threads = */ nt, /* .n_gpu_layers = */ nl, + /* .split_mode = */ sm, /* .main_gpu = */ mg, /* .no_kv_offload= */ nkvo, /* .mul_mat_q = */ mmq, @@ -535,24 +553,6 @@ static std::vector get_cmd_params_instances(const cmd_param instances.push_back(instance); } } -#else - // this ordering separates the prompt and generation tests - for (const auto & n_prompt : params.n_prompt) { - if (n_prompt == 0) { - continue; - } - auto instances_prompt = get_cmd_params_instances_int(params, 0, n_prompt); - instances.insert(instances.end(), instances_prompt.begin(), instances_prompt.end()); - } - - for (const auto & n_gen : params.n_gen) { - if (n_gen == 0) { - continue; - } - auto instances_gen = get_cmd_params_instances_int(params, n_gen, 0); - instances.insert(instances.end(), instances_gen.begin(), instances_gen.end()); - } -#endif return instances; } @@ -576,6 +576,7 @@ struct test { ggml_type type_k; ggml_type type_v; int n_gpu_layers; + llama_split_mode split_mode; int main_gpu; bool no_kv_offload; bool mul_mat_q; @@ -597,6 +598,7 @@ struct test { type_k = inst.type_k; type_v = inst.type_v; n_gpu_layers = inst.n_gpu_layers; + split_mode = inst.split_mode; main_gpu = inst.main_gpu; no_kv_offload = inst.no_kv_offload; mul_mat_q = inst.mul_mat_q; @@ -660,7 +662,8 @@ struct test { "cpu_info", "gpu_info", "model_filename", "model_type", "model_size", "model_n_params", "n_batch", "n_threads", "type_k", "type_v", - "n_gpu_layers", "main_gpu", "no_kv_offload", + "n_gpu_layers", "split_mode", + "main_gpu", "no_kv_offload", "mul_mat_q", "tensor_split", "n_prompt", "n_gen", "test_time", "avg_ns", "stddev_ns", @@ -711,7 +714,8 @@ struct test { cpu_info, gpu_info, model_filename, model_type, std::to_string(model_size), std::to_string(model_n_params), std::to_string(n_batch), std::to_string(n_threads), ggml_type_name(type_k), ggml_type_name(type_v), - std::to_string(n_gpu_layers), std::to_string(main_gpu), std::to_string(no_kv_offload), + std::to_string(n_gpu_layers), split_mode_str(split_mode), + std::to_string(main_gpu), std::to_string(no_kv_offload), std::to_string(mul_mat_q), tensor_split_str, std::to_string(n_prompt), std::to_string(n_gen), test_time, std::to_string(avg_ns()), std::to_string(stdev_ns()), @@ -867,6 +871,9 @@ struct markdown_printer : public printer { if (field == "n_gpu_layers") { return "ngl"; } + if (field == "split_mode") { + return "sm"; + } if (field == "n_threads") { return "threads"; } @@ -907,6 +914,9 @@ struct markdown_printer : public printer { if (params.main_gpu.size() > 1 || params.main_gpu != cmd_params_defaults.main_gpu) { fields.push_back("main_gpu"); } + if (params.split_mode.size() > 1 || params.split_mode != cmd_params_defaults.split_mode) { + fields.push_back("split_mode"); + } if (params.mul_mat_q.size() > 1 || params.mul_mat_q != cmd_params_defaults.mul_mat_q) { fields.push_back("mul_mat_q"); } diff --git a/examples/llama.swiftui/llama.swiftui.xcodeproj/project.pbxproj b/examples/llama.swiftui/llama.swiftui.xcodeproj/project.pbxproj index a8848a49f..3950b9e9d 100644 --- a/examples/llama.swiftui/llama.swiftui.xcodeproj/project.pbxproj +++ b/examples/llama.swiftui/llama.swiftui.xcodeproj/project.pbxproj @@ -8,6 +8,7 @@ /* Begin PBXBuildFile section */ 549479CB2AC9E16000E0F78B /* Metal.framework in Frameworks */ = {isa = PBXBuildFile; fileRef = 549479CA2AC9E16000E0F78B /* Metal.framework */; }; + 79E1D9CD2B4CD16E005F8E46 /* InputButton.swift in Sources */ = {isa = PBXBuildFile; fileRef = 79E1D9CC2B4CD16E005F8E46 /* InputButton.swift */; }; 7FA3D2B32B2EA2F600543F92 /* DownloadButton.swift in Sources */ = {isa = PBXBuildFile; fileRef = 7FA3D2B22B2EA2F600543F92 /* DownloadButton.swift */; }; 8A1C83772AC328BD0096AF73 /* llama_swiftuiApp.swift in Sources */ = {isa = PBXBuildFile; fileRef = 8A1C83762AC328BD0096AF73 /* llama_swiftuiApp.swift */; }; 8A1C83792AC328BD0096AF73 /* ContentView.swift in Sources */ = {isa = PBXBuildFile; fileRef = 8A1C83782AC328BD0096AF73 /* ContentView.swift */; }; @@ -22,6 +23,7 @@ /* Begin PBXFileReference section */ 549479CA2AC9E16000E0F78B /* Metal.framework */ = {isa = PBXFileReference; lastKnownFileType = wrapper.framework; name = Metal.framework; path = System/Library/Frameworks/Metal.framework; sourceTree = SDKROOT; }; + 79E1D9CC2B4CD16E005F8E46 /* InputButton.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = InputButton.swift; sourceTree = ""; }; 7FA3D2B22B2EA2F600543F92 /* DownloadButton.swift */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.swift; path = DownloadButton.swift; sourceTree = ""; }; 8A1C83732AC328BD0096AF73 /* llama.swiftui.app */ = {isa = PBXFileReference; explicitFileType = wrapper.application; includeInIndex = 0; path = llama.swiftui.app; sourceTree = BUILT_PRODUCTS_DIR; }; 8A1C83762AC328BD0096AF73 /* llama_swiftuiApp.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = llama_swiftuiApp.swift; sourceTree = ""; }; @@ -119,6 +121,7 @@ 7FA3D2B22B2EA2F600543F92 /* DownloadButton.swift */, 8A1C83782AC328BD0096AF73 /* ContentView.swift */, F1FE20E12B465EC900B45541 /* LoadCustomButton.swift */, + 79E1D9CC2B4CD16E005F8E46 /* InputButton.swift */, ); path = UI; sourceTree = ""; @@ -213,6 +216,7 @@ 8A1C83792AC328BD0096AF73 /* ContentView.swift in Sources */, 8A1C83772AC328BD0096AF73 /* llama_swiftuiApp.swift in Sources */, 7FA3D2B32B2EA2F600543F92 /* DownloadButton.swift in Sources */, + 79E1D9CD2B4CD16E005F8E46 /* InputButton.swift in Sources */, ); runOnlyForDeploymentPostprocessing = 0; }; @@ -345,7 +349,7 @@ CLANG_ENABLE_MODULES = YES; CODE_SIGN_STYLE = Automatic; CURRENT_PROJECT_VERSION = 1; - DEVELOPMENT_TEAM = STLSG3FG8Q; + DEVELOPMENT_TEAM = K5UQJPP73A; ENABLE_PREVIEWS = YES; GENERATE_INFOPLIST_FILE = YES; INFOPLIST_KEY_UIApplicationSceneManifest_Generation = YES; @@ -377,7 +381,7 @@ CLANG_ENABLE_MODULES = YES; CODE_SIGN_STYLE = Automatic; CURRENT_PROJECT_VERSION = 1; - DEVELOPMENT_TEAM = STLSG3FG8Q; + DEVELOPMENT_TEAM = K5UQJPP73A; ENABLE_PREVIEWS = YES; GENERATE_INFOPLIST_FILE = YES; INFOPLIST_KEY_UIApplicationSceneManifest_Generation = YES; diff --git a/examples/llama.swiftui/llama.swiftui/Models/LlamaState.swift b/examples/llama.swiftui/llama.swiftui/Models/LlamaState.swift index 17cb5b9dd..5bde18917 100644 --- a/examples/llama.swiftui/llama.swiftui/Models/LlamaState.swift +++ b/examples/llama.swiftui/llama.swiftui/Models/LlamaState.swift @@ -1,9 +1,19 @@ import Foundation +struct Model: Identifiable { + var id = UUID() + var name: String + var url: String + var filename: String + var status: String? +} + @MainActor class LlamaState: ObservableObject { @Published var messageLog = "" @Published var cacheCleared = false + @Published var downloadedModels: [Model] = [] + @Published var undownloadedModels: [Model] = [] let NS_PER_S = 1_000_000_000.0 private var llamaContext: LlamaContext? @@ -13,23 +23,102 @@ class LlamaState: ObservableObject { } init() { + loadModelsFromDisk() + loadDefaultModels() + } + + private func loadModelsFromDisk() { + do { + let documentsURL = getDocumentsDirectory() + let modelURLs = try FileManager.default.contentsOfDirectory(at: documentsURL, includingPropertiesForKeys: nil, options: [.skipsHiddenFiles, .skipsSubdirectoryDescendants]) + for modelURL in modelURLs { + let modelName = modelURL.deletingPathExtension().lastPathComponent + downloadedModels.append(Model(name: modelName, url: "", filename: modelURL.lastPathComponent, status: "downloaded")) + } + } catch { + print("Error loading models from disk: \(error)") + } + } + + private func loadDefaultModels() { do { try loadModel(modelUrl: defaultModelUrl) } catch { messageLog += "Error!\n" } + + for model in defaultModels { + let fileURL = getDocumentsDirectory().appendingPathComponent(model.filename) + if FileManager.default.fileExists(atPath: fileURL.path) { + + } else { + var undownloadedModel = model + undownloadedModel.status = "download" + undownloadedModels.append(undownloadedModel) + } + } } + func getDocumentsDirectory() -> URL { + let paths = FileManager.default.urls(for: .documentDirectory, in: .userDomainMask) + return paths[0] + } + private let defaultModels: [Model] = [ + Model(name: "TinyLlama-1.1B (Q4_0, 0.6 GiB)",url: "https://huggingface.co/TheBloke/TinyLlama-1.1B-1T-OpenOrca-GGUF/resolve/main/tinyllama-1.1b-1t-openorca.Q4_0.gguf?download=true",filename: "tinyllama-1.1b-1t-openorca.Q4_0.gguf", status: "download"), + Model( + name: "TinyLlama-1.1B Chat (Q8_0, 1.1 GiB)", + url: "https://huggingface.co/TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF/resolve/main/tinyllama-1.1b-chat-v1.0.Q8_0.gguf?download=true", + filename: "tinyllama-1.1b-chat-v1.0.Q8_0.gguf", status: "download" + ), + + Model( + name: "TinyLlama-1.1B (F16, 2.2 GiB)", + url: "https://huggingface.co/ggml-org/models/resolve/main/tinyllama-1.1b/ggml-model-f16.gguf?download=true", + filename: "tinyllama-1.1b-f16.gguf", status: "download" + ), + + Model( + name: "Phi-2.7B (Q4_0, 1.6 GiB)", + url: "https://huggingface.co/ggml-org/models/resolve/main/phi-2/ggml-model-q4_0.gguf?download=true", + filename: "phi-2-q4_0.gguf", status: "download" + ), + + Model( + name: "Phi-2.7B (Q8_0, 2.8 GiB)", + url: "https://huggingface.co/ggml-org/models/resolve/main/phi-2/ggml-model-q8_0.gguf?download=true", + filename: "phi-2-q8_0.gguf", status: "download" + ), + + Model( + name: "Mistral-7B-v0.1 (Q4_0, 3.8 GiB)", + url: "https://huggingface.co/TheBloke/Mistral-7B-v0.1-GGUF/resolve/main/mistral-7b-v0.1.Q4_0.gguf?download=true", + filename: "mistral-7b-v0.1.Q4_0.gguf", status: "download" + ), + Model( + name: "OpenHermes-2.5-Mistral-7B (Q3_K_M, 3.52 GiB)", + url: "https://huggingface.co/TheBloke/OpenHermes-2.5-Mistral-7B-GGUF/resolve/main/openhermes-2.5-mistral-7b.Q3_K_M.gguf?download=true", + filename: "openhermes-2.5-mistral-7b.Q3_K_M.gguf", status: "download" + ) + ] func loadModel(modelUrl: URL?) throws { if let modelUrl { messageLog += "Loading model...\n" llamaContext = try LlamaContext.create_context(path: modelUrl.path()) messageLog += "Loaded model \(modelUrl.lastPathComponent)\n" + + // Assuming that the model is successfully loaded, update the downloaded models + updateDownloadedModels(modelName: modelUrl.lastPathComponent, status: "downloaded") } else { messageLog += "Load a model from the list below\n" } } + + private func updateDownloadedModels(modelName: String, status: String) { + undownloadedModels.removeAll { $0.name == modelName } + } + + func complete(text: String) async { guard let llamaContext else { return diff --git a/examples/llama.swiftui/llama.swiftui/UI/ContentView.swift b/examples/llama.swiftui/llama.swiftui/UI/ContentView.swift index 7c81ea256..30c2dc431 100644 --- a/examples/llama.swiftui/llama.swiftui/UI/ContentView.swift +++ b/examples/llama.swiftui/llama.swiftui/UI/ContentView.swift @@ -2,115 +2,57 @@ import SwiftUI struct ContentView: View { @StateObject var llamaState = LlamaState() - @State private var multiLineText = "" - - private static func cleanupModelCaches() { - // Delete all models (*.gguf) - let fileManager = FileManager.default - let documentsUrl = FileManager.default.urls(for: .documentDirectory, in: .userDomainMask)[0] - do { - let fileURLs = try fileManager.contentsOfDirectory(at: documentsUrl, includingPropertiesForKeys: nil) - for fileURL in fileURLs { - if fileURL.pathExtension == "gguf" { - try fileManager.removeItem(at: fileURL) - } - } - } catch { - print("Error while enumerating files \(documentsUrl.path): \(error.localizedDescription)") - } - } + @State private var showingHelp = false // To track if Help Sheet should be shown var body: some View { - VStack { - ScrollView(.vertical, showsIndicators: true) { - Text(llamaState.messageLog) - .font(.system(size: 12)) - .frame(maxWidth: .infinity, alignment: .leading) + NavigationView { + VStack { + ScrollView(.vertical, showsIndicators: true) { + Text(llamaState.messageLog) + .font(.system(size: 12)) + .frame(maxWidth: .infinity, alignment: .leading) + .padding() + .onTapGesture { + UIApplication.shared.sendAction(#selector(UIResponder.resignFirstResponder), to: nil, from: nil, for: nil) + } + } + + TextEditor(text: $multiLineText) + .frame(height: 80) + .padding() + .border(Color.gray, width: 0.5) + + HStack { + Button("Send") { + sendText() + } + + Button("Bench") { + bench() + } + + Button("Clear") { + clear() + } + + Button("Copy") { + UIPasteboard.general.string = llamaState.messageLog + } + } + .buttonStyle(.bordered) .padding() - .onTapGesture { - UIApplication.shared.sendAction(#selector(UIResponder.resignFirstResponder), to: nil, from: nil, for: nil) - } - } - TextEditor(text: $multiLineText) - .frame(height: 80) + NavigationLink(destination: DrawerView(llamaState: llamaState)) { + Text("View Models") + } .padding() - .border(Color.gray, width: 0.5) - HStack { - Button("Send") { - sendText() - } - - Button("Bench") { - bench() - } - - Button("Clear") { - clear() - } - - Button("Copy") { - UIPasteboard.general.string = llamaState.messageLog - } - }.buttonStyle(.bordered) - - VStack(alignment: .leading) { - DownloadButton( - llamaState: llamaState, - modelName: "TinyLlama-1.1B (Q4_0, 0.6 GiB)", - modelUrl: "https://huggingface.co/TheBloke/TinyLlama-1.1B-1T-OpenOrca-GGUF/resolve/main/tinyllama-1.1b-1t-openorca.Q4_0.gguf?download=true", - filename: "tinyllama-1.1b-1t-openorca.Q4_0.gguf" - ) - - DownloadButton( - llamaState: llamaState, - modelName: "TinyLlama-1.1B (Q8_0, 1.1 GiB)", - modelUrl: "https://huggingface.co/TheBloke/TinyLlama-1.1B-1T-OpenOrca-GGUF/resolve/main/tinyllama-1.1b-1t-openorca.Q8_0.gguf?download=true", - filename: "tinyllama-1.1b-1t-openorca.Q8_0.gguf" - ) - - DownloadButton( - llamaState: llamaState, - modelName: "TinyLlama-1.1B (F16, 2.2 GiB)", - modelUrl: "https://huggingface.co/ggml-org/models/resolve/main/tinyllama-1.1b/ggml-model-f16.gguf?download=true", - filename: "tinyllama-1.1b-f16.gguf" - ) - - DownloadButton( - llamaState: llamaState, - modelName: "Phi-2.7B (Q4_0, 1.6 GiB)", - modelUrl: "https://huggingface.co/ggml-org/models/resolve/main/phi-2/ggml-model-q4_0.gguf?download=true", - filename: "phi-2-q4_0.gguf" - ) - - DownloadButton( - llamaState: llamaState, - modelName: "Phi-2.7B (Q8_0, 2.8 GiB)", - modelUrl: "https://huggingface.co/ggml-org/models/resolve/main/phi-2/ggml-model-q8_0.gguf?download=true", - filename: "phi-2-q8_0.gguf" - ) - - DownloadButton( - llamaState: llamaState, - modelName: "Mistral-7B-v0.1 (Q4_0, 3.8 GiB)", - modelUrl: "https://huggingface.co/TheBloke/Mistral-7B-v0.1-GGUF/resolve/main/mistral-7b-v0.1.Q4_0.gguf?download=true", - filename: "mistral-7b-v0.1.Q4_0.gguf" - ) - - Button("Clear downloaded models") { - ContentView.cleanupModelCaches() - llamaState.cacheCleared = true - } - - LoadCustomButton(llamaState: llamaState) } - .padding(.top, 4) - .font(.system(size: 12)) - .frame(maxWidth: .infinity, alignment: .leading) + .padding() + .navigationBarTitle("Model Settings", displayMode: .inline) + } - .padding() } func sendText() { @@ -131,8 +73,73 @@ struct ContentView: View { await llamaState.clear() } } + struct DrawerView: View { + + @ObservedObject var llamaState: LlamaState + @State private var showingHelp = false + func delete(at offsets: IndexSet) { + offsets.forEach { offset in + let model = llamaState.downloadedModels[offset] + let fileURL = getDocumentsDirectory().appendingPathComponent(model.filename) + do { + try FileManager.default.removeItem(at: fileURL) + } catch { + print("Error deleting file: \(error)") + } + } + + // Remove models from downloadedModels array + llamaState.downloadedModels.remove(atOffsets: offsets) + } + + func getDocumentsDirectory() -> URL { + let paths = FileManager.default.urls(for: .documentDirectory, in: .userDomainMask) + return paths[0] + } + var body: some View { + List { + Section(header: Text("Download Models From Hugging Face")) { + HStack { + InputButton(llamaState: llamaState) + } + } + Section(header: Text("Downloaded Models")) { + ForEach(llamaState.downloadedModels) { model in + DownloadButton(llamaState: llamaState, modelName: model.name, modelUrl: model.url, filename: model.filename) + } + .onDelete(perform: delete) + } + Section(header: Text("Default Models")) { + ForEach(llamaState.undownloadedModels) { model in + DownloadButton(llamaState: llamaState, modelName: model.name, modelUrl: model.url, filename: model.filename) + } + } + + } + .listStyle(GroupedListStyle()) + .navigationBarTitle("Model Settings", displayMode: .inline).toolbar { + ToolbarItem(placement: .navigationBarTrailing) { + Button("Help") { + showingHelp = true + } + } + }.sheet(isPresented: $showingHelp) { // Sheet for help modal + VStack(alignment: .leading) { + VStack(alignment: .leading) { + Text("1. Make sure the model is in GGUF Format") + .padding() + Text("2. Copy the download link of the quantized model") + .padding() + } + Spacer() + } + } + } + } } -//#Preview { -// ContentView() -//} +struct ContentView_Previews: PreviewProvider { + static var previews: some View { + ContentView() + } +} diff --git a/examples/llama.swiftui/llama.swiftui/UI/DownloadButton.swift b/examples/llama.swiftui/llama.swiftui/UI/DownloadButton.swift index c9f322ca1..4584d6eaa 100644 --- a/examples/llama.swiftui/llama.swiftui/UI/DownloadButton.swift +++ b/examples/llama.swiftui/llama.swiftui/UI/DownloadButton.swift @@ -53,6 +53,8 @@ struct DownloadButton: View { llamaState.cacheCleared = false + let model = Model(name: modelName, url: modelUrl, filename: filename, status: "downloaded") + llamaState.downloadedModels.append(model) status = "downloaded" } } catch let err { diff --git a/examples/llama.swiftui/llama.swiftui/UI/InputButton.swift b/examples/llama.swiftui/llama.swiftui/UI/InputButton.swift new file mode 100644 index 000000000..c5ffbad4e --- /dev/null +++ b/examples/llama.swiftui/llama.swiftui/UI/InputButton.swift @@ -0,0 +1,131 @@ +import SwiftUI + +struct InputButton: View { + @ObservedObject var llamaState: LlamaState + @State private var inputLink: String = "" + @State private var status: String = "download" + @State private var filename: String = "" + + @State private var downloadTask: URLSessionDownloadTask? + @State private var progress = 0.0 + @State private var observation: NSKeyValueObservation? + + private static func extractModelInfo(from link: String) -> (modelName: String, filename: String)? { + guard let url = URL(string: link), + let lastPathComponent = url.lastPathComponent.components(separatedBy: ".").first, + let modelName = lastPathComponent.components(separatedBy: "-").dropLast().joined(separator: "-").removingPercentEncoding, + let filename = lastPathComponent.removingPercentEncoding else { + return nil + } + + return (modelName, filename) + } + + private static func getFileURL(filename: String) -> URL { + FileManager.default.urls(for: .documentDirectory, in: .userDomainMask)[0].appendingPathComponent(filename) + } + + private func download() { + guard let extractedInfo = InputButton.extractModelInfo(from: inputLink) else { + // Handle invalid link or extraction failure + return + } + + let (modelName, filename) = extractedInfo + self.filename = filename // Set the state variable + + status = "downloading" + print("Downloading model \(modelName) from \(inputLink)") + guard let url = URL(string: inputLink) else { return } + let fileURL = InputButton.getFileURL(filename: filename) + + downloadTask = URLSession.shared.downloadTask(with: url) { temporaryURL, response, error in + if let error = error { + print("Error: \(error.localizedDescription)") + return + } + + guard let response = response as? HTTPURLResponse, (200...299).contains(response.statusCode) else { + print("Server error!") + return + } + + do { + if let temporaryURL = temporaryURL { + try FileManager.default.copyItem(at: temporaryURL, to: fileURL) + print("Writing to \(filename) completed") + + llamaState.cacheCleared = false + + let model = Model(name: modelName, url: self.inputLink, filename: filename, status: "downloaded") + llamaState.downloadedModels.append(model) + status = "downloaded" + } + } catch let err { + print("Error: \(err.localizedDescription)") + } + } + + observation = downloadTask?.progress.observe(\.fractionCompleted) { progress, _ in + self.progress = progress.fractionCompleted + } + + downloadTask?.resume() + } + + var body: some View { + VStack { + HStack { + TextField("Paste Quantized Download Link", text: $inputLink) + .textFieldStyle(RoundedBorderTextFieldStyle()) + + Button(action: { + downloadTask?.cancel() + status = "download" + }) { + Text("Cancel") + } + } + + if status == "download" { + Button(action: download) { + Text("Download Custom Model") + } + } else if status == "downloading" { + Button(action: { + downloadTask?.cancel() + status = "download" + }) { + Text("Downloading \(Int(progress * 100))%") + } + } else if status == "downloaded" { + Button(action: { + let fileURL = InputButton.getFileURL(filename: self.filename) + if !FileManager.default.fileExists(atPath: fileURL.path) { + download() + return + } + do { + try llamaState.loadModel(modelUrl: fileURL) + } catch let err { + print("Error: \(err.localizedDescription)") + } + }) { + Text("Load Custom Model") + } + } else { + Text("Unknown status") + } + } + .onDisappear() { + downloadTask?.cancel() + } + .onChange(of: llamaState.cacheCleared) { newValue in + if newValue { + downloadTask?.cancel() + let fileURL = InputButton.getFileURL(filename: self.filename) + status = FileManager.default.fileExists(atPath: fileURL.path) ? "downloaded" : "download" + } + } + } +} diff --git a/examples/main/main.cpp b/examples/main/main.cpp index 19bad56cd..0024f16f6 100644 --- a/examples/main/main.cpp +++ b/examples/main/main.cpp @@ -508,7 +508,7 @@ int main(int argc, char ** argv) { while ((n_remain != 0 && !is_antiprompt) || params.interactive) { // predict if (!embd.empty()) { - // Note: n_ctx - 4 here is to match the logic for commandline prompt handling via + // Note: (n_ctx - 4) here is to match the logic for commandline prompt handling via // --prompt or --file which uses the same value. int max_embd_size = n_ctx - 4; @@ -658,6 +658,10 @@ int main(int argc, char ** argv) { n_past += n_eval; LOG("n_past = %d\n", n_past); + // Display total tokens alongside total time + if (params.n_print > 0 && n_past % params.n_print == 0) { + LOG_TEE("\n\033[31mTokens consumed so far = %d / %d \033[0m\n", n_past, n_ctx); + } } if (!embd.empty() && !path_session.empty()) { diff --git a/examples/quantize/quantize.cpp b/examples/quantize/quantize.cpp index d27ea5e91..f878f6911 100644 --- a/examples/quantize/quantize.cpp +++ b/examples/quantize/quantize.cpp @@ -18,6 +18,7 @@ static const std::vector QUANT_OPTIONS = { { "Q5_0", LLAMA_FTYPE_MOSTLY_Q5_0, " 4.33G, +0.0683 ppl @ LLaMA-v1-7B", }, { "Q5_1", LLAMA_FTYPE_MOSTLY_Q5_1, " 4.70G, +0.0349 ppl @ LLaMA-v1-7B", }, { "Q2_K", LLAMA_FTYPE_MOSTLY_Q2_K, " 2.63G, +0.6717 ppl @ LLaMA-v1-7B", }, + { "Q2_K_S", LLAMA_FTYPE_MOSTLY_Q2_K_S, " 2.16G, +9.0634 ppl @ LLaMA-v1-7B", }, { "Q3_K", LLAMA_FTYPE_MOSTLY_Q3_K_M, "alias for Q3_K_M" }, { "Q3_K_S", LLAMA_FTYPE_MOSTLY_Q3_K_S, " 2.75G, +0.5551 ppl @ LLaMA-v1-7B", }, { "Q3_K_M", LLAMA_FTYPE_MOSTLY_Q3_K_M, " 3.07G, +0.2496 ppl @ LLaMA-v1-7B", }, diff --git a/examples/server/README.md b/examples/server/README.md index d85a14f89..fd3034b99 100644 --- a/examples/server/README.md +++ b/examples/server/README.md @@ -23,7 +23,8 @@ Command line options: - `--host`: Set the hostname or ip address to listen. Default `127.0.0.1`. - `--port`: Set the port to listen. Default: `8080`. - `--path`: path from which to serve static files (default examples/server/public) -- `--api-key`: Set an api key for request authorization. By default the server responds to every request. With an api key set, the requests must have the Authorization header set with the api key as Bearer token. +- `--api-key`: Set an api key for request authorization. By default the server responds to every request. With an api key set, the requests must have the Authorization header set with the api key as Bearer token. May be used multiple times to enable multiple valid keys. +- `--api-key-file`: path to file containing api keys delimited by new lines. If set, requests must include one of the keys for access. May be used in conjunction with `--api-key`'s. - `--embedding`: Enable embedding extraction, Default: disabled. - `-np N`, `--parallel N`: Set the number of slots for process requests (default: 1) - `-cb`, `--cont-batching`: enable continuous batching (a.k.a dynamic batching) (default: disabled) @@ -110,6 +111,10 @@ node index.js ``` ## API Endpoints +- **GET** `/health`: Returns the current state of the server: + - `{"status": "loading model"}` if the model is still being loaded. + - `{"status": "error"}` if the model failed to load. + - `{"status": "ok"}` if the model is successfully loaded and the server is ready for further requests mentioned below. - **POST** `/completion`: Given a `prompt`, it returns the predicted completion. diff --git a/examples/server/server.cpp b/examples/server/server.cpp index 6c7fcd176..c1ab8f9dc 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -26,6 +26,7 @@ #include #include #include +#include #ifndef SERVER_VERBOSE #define SERVER_VERBOSE 1 @@ -38,7 +39,7 @@ using json = nlohmann::json; struct server_params { std::string hostname = "127.0.0.1"; - std::string api_key; + std::vector api_keys; std::string public_path = "examples/server/public"; int32_t port = 8080; int32_t read_timeout = 600; @@ -146,9 +147,15 @@ static std::vector base64_decode(const std::string & encoded_string) // parallel // +enum server_state { + SERVER_STATE_LOADING_MODEL, // Server is starting up, model not fully loaded yet + SERVER_STATE_READY, // Server is ready and model is loaded + SERVER_STATE_ERROR // An error occurred, load_model failed +}; + enum task_type { - COMPLETION_TASK, - CANCEL_TASK + TASK_TYPE_COMPLETION, + TASK_TYPE_CANCEL, }; struct task_server { @@ -1395,11 +1402,11 @@ struct llama_server_context task.data = std::move(data); task.infill_mode = infill; task.embedding_mode = embedding; - task.type = COMPLETION_TASK; + task.type = TASK_TYPE_COMPLETION; task.multitask_id = multitask_id; // when a completion task's prompt array is not a singleton, we split it into multiple requests - if (task.data.at("prompt").size() > 1) + if (task.data.count("prompt") && task.data.at("prompt").size() > 1) { lock.unlock(); // entering new func scope return split_multiprompt_task(task); @@ -1517,7 +1524,7 @@ struct llama_server_context std::unique_lock lock(mutex_tasks); task_server task; task.id = id_gen++; - task.type = CANCEL_TASK; + task.type = TASK_TYPE_CANCEL; task.target_id = task_id; queue_tasks.push_back(task); condition_tasks.notify_one(); @@ -1553,7 +1560,7 @@ struct llama_server_context queue_tasks.erase(queue_tasks.begin()); switch (task.type) { - case COMPLETION_TASK: { + case TASK_TYPE_COMPLETION: { llama_client_slot *slot = get_slot(json_value(task.data, "slot_id", -1)); if (slot == nullptr) { @@ -1570,9 +1577,9 @@ struct llama_server_context slot->reset(); - slot->infill = task.infill_mode; - slot->embedding = task.embedding_mode; - slot->task_id = task.id; + slot->infill = task.infill_mode; + slot->embedding = task.embedding_mode; + slot->task_id = task.id; slot->multitask_id = task.multitask_id; if (!launch_slot_with_data(slot, task.data)) @@ -1582,7 +1589,7 @@ struct llama_server_context break; } } break; - case CANCEL_TASK: { // release slot linked with the task id + case TASK_TYPE_CANCEL: { // release slot linked with the task id for (auto & slot : slots) { if (slot.task_id == task.target_id) @@ -1724,7 +1731,8 @@ struct llama_server_context const bool has_prompt = slot.prompt.is_array() || (slot.prompt.is_string() && !slot.prompt.get().empty()) || !slot.images.empty(); // empty prompt passed -> release the slot and send empty response - if (slot.state == IDLE && slot.command == LOAD_PROMPT && !has_prompt) + // note: infill mode allows empty prompt + if (slot.state == IDLE && slot.command == LOAD_PROMPT && !has_prompt && !slot.infill) { slot.release(); slot.print_timings(); @@ -1997,12 +2005,15 @@ static void server_print_usage(const char *argv0, const gpt_params ¶ms, #ifdef LLAMA_SUPPORTS_GPU_OFFLOAD printf(" -ngl N, --n-gpu-layers N\n"); printf(" number of layers to store in VRAM\n"); + printf(" -sm SPLIT_MODE, --split-mode SPLIT_MODE\n"); + printf(" how to split the model across multiple GPUs, one of:\n"); + printf(" - none: use one GPU only\n"); + printf(" - layer (default): split layers and KV across GPUs\n"); + printf(" - row: split rows across GPUs\n"); printf(" -ts SPLIT --tensor-split SPLIT\n"); - printf(" how to split tensors across multiple GPUs, comma-separated list of proportions, e.g. 3,1\n"); - printf(" -mg i, --main-gpu i the GPU to use for scratch and small tensors\n"); - printf(" -nommq, --no-mul-mat-q\n"); - printf(" use cuBLAS instead of custom mul_mat_q CUDA kernels.\n"); - printf(" Not recommended since this is both slower and uses more VRAM.\n"); + printf(" fraction of the model to offload to each GPU, comma-separated list of proportions, e.g. 3,1\n"); + printf(" -mg i, --main-gpu i the GPU to use for the model (with split-mode = none),\n"); + printf(" or for intermediate results and KV (with split-mode = row)\n"); #endif printf(" -m FNAME, --model FNAME\n"); printf(" model path (default: %s)\n", params.model.c_str()); @@ -2014,6 +2025,7 @@ static void server_print_usage(const char *argv0, const gpt_params ¶ms, printf(" --port PORT port to listen (default (default: %d)\n", sparams.port); printf(" --path PUBLIC_PATH path from which to serve static files (default %s)\n", sparams.public_path.c_str()); printf(" --api-key API_KEY optional api key to enhance server security. If set, requests must include this key for access.\n"); + printf(" --api-key-file FNAME path to file containing api keys delimited by new lines. If set, requests must include one of the keys for access.\n"); printf(" -to N, --timeout N server read/write timeout in seconds (default: %d)\n", sparams.read_timeout); printf(" --embedding enable embedding vector output (default: %s)\n", params.embedding ? "enabled" : "disabled"); printf(" -np N, --parallel N number of slots for process requests (default: %d)\n", params.n_parallel); @@ -2074,7 +2086,28 @@ static void server_params_parse(int argc, char **argv, server_params &sparams, invalid_param = true; break; } - sparams.api_key = argv[i]; + sparams.api_keys.push_back(argv[i]); + } + else if (arg == "--api-key-file") + { + if (++i >= argc) + { + invalid_param = true; + break; + } + std::ifstream key_file(argv[i]); + if (!key_file) { + fprintf(stderr, "error: failed to open file '%s'\n", argv[i]); + invalid_param = true; + break; + } + std::string key; + while (std::getline(key_file, key)) { + if (key.size() > 0) { + sparams.api_keys.push_back(key); + } + } + key_file.close(); } else if (arg == "--timeout" || arg == "-to") { @@ -2223,6 +2256,33 @@ static void server_params_parse(int argc, char **argv, server_params &sparams, "See main README.md for information on enabling GPU BLAS support", {{"n_gpu_layers", params.n_gpu_layers}}); #endif + } + else if (arg == "--split-mode" || arg == "-sm") + { + if (++i >= argc) { + invalid_param = true; + break; + } + std::string arg_next = argv[i]; + if (arg_next == "none") + { + params.split_mode = LLAMA_SPLIT_NONE; + } + else if (arg_next == "layer") + { + params.split_mode = LLAMA_SPLIT_LAYER; + } + else if (arg_next == "row") + { + params.split_mode = LLAMA_SPLIT_ROW; + } + else { + invalid_param = true; + break; + } +#ifndef GGML_USE_CUBLAS + fprintf(stderr, "warning: llama.cpp was compiled without cuBLAS. Setting the split mode has no effect.\n"); +#endif // GGML_USE_CUBLAS } else if (arg == "--tensor-split" || arg == "-ts") { @@ -2453,7 +2513,6 @@ static void server_params_parse(int argc, char **argv, server_params &sparams, } } - static std::string random_string() { static const std::string str("0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"); @@ -2509,7 +2568,7 @@ json oaicompat_completion_params_parse( // // https://platform.openai.com/docs/api-reference/chat/create llama_sampling_params default_sparams; - llama_params["model"] = json_value(body, "model", std::string("uknown")); + llama_params["model"] = json_value(body, "model", std::string("unknown")); llama_params["prompt"] = format_chatml(body["messages"]); // OpenAI 'messages' to llama.cpp 'prompt' llama_params["cache_prompt"] = json_value(body, "cache_prompt", false); llama_params["temperature"] = json_value(body, "temperature", 0.0); @@ -2581,8 +2640,8 @@ static json format_final_response_oaicompat(const json &request, const task_resu {"object", streaming ? "chat.completion.chunk" : "chat.completion"}, {"usage", json{{"completion_tokens", num_tokens_predicted}, - {"prompt_tokens", num_prompt_tokens}, - {"total_tokens", num_tokens_predicted + num_prompt_tokens}}}, + {"prompt_tokens", num_prompt_tokens}, + {"total_tokens", num_tokens_predicted + num_prompt_tokens}}}, {"id", gen_chatcmplid()}}; if (server_verbose) { @@ -2790,20 +2849,131 @@ int main(int argc, char **argv) {"system_info", llama_print_system_info()}, }); - // load the model - if (!llama.load_model(params)) + httplib::Server svr; + + std::atomic state{SERVER_STATE_LOADING_MODEL}; + + svr.set_default_headers({{"Server", "llama.cpp"}}); + + // CORS preflight + svr.Options(R"(.*)", [](const httplib::Request &req, httplib::Response &res) { + res.set_header("Access-Control-Allow-Origin", req.get_header_value("Origin")); + res.set_header("Access-Control-Allow-Credentials", "true"); + res.set_header("Access-Control-Allow-Methods", "POST"); + res.set_header("Access-Control-Allow-Headers", "*"); + }); + + svr.Get("/health", [&](const httplib::Request&, httplib::Response& res) { + server_state current_state = state.load(); + switch(current_state) { + case SERVER_STATE_READY: + res.set_content(R"({"status": "ok"})", "application/json"); + res.status = 200; // HTTP OK + break; + case SERVER_STATE_LOADING_MODEL: + res.set_content(R"({"status": "loading model"})", "application/json"); + res.status = 503; // HTTP Service Unavailable + break; + case SERVER_STATE_ERROR: + res.set_content(R"({"status": "error", "error": "Model failed to load"})", "application/json"); + res.status = 500; // HTTP Internal Server Error + break; + } + }); + + svr.set_logger(log_server_request); + + svr.set_exception_handler([](const httplib::Request &, httplib::Response &res, std::exception_ptr ep) + { + const char fmt[] = "500 Internal Server Error\n%s"; + char buf[BUFSIZ]; + try + { + std::rethrow_exception(std::move(ep)); + } + catch (std::exception &e) + { + snprintf(buf, sizeof(buf), fmt, e.what()); + } + catch (...) + { + snprintf(buf, sizeof(buf), fmt, "Unknown Exception"); + } + res.set_content(buf, "text/plain; charset=utf-8"); + res.status = 500; + }); + + svr.set_error_handler([](const httplib::Request &, httplib::Response &res) + { + if (res.status == 401) + { + res.set_content("Unauthorized", "text/plain; charset=utf-8"); + } + if (res.status == 400) + { + res.set_content("Invalid request", "text/plain; charset=utf-8"); + } + else if (res.status == 404) + { + res.set_content("File Not Found", "text/plain; charset=utf-8"); + res.status = 404; + } + }); + + // set timeouts and change hostname and port + svr.set_read_timeout (sparams.read_timeout); + svr.set_write_timeout(sparams.write_timeout); + + if (!svr.bind_to_port(sparams.hostname, sparams.port)) { + fprintf(stderr, "\ncouldn't bind to server socket: hostname=%s port=%d\n\n", sparams.hostname.c_str(), sparams.port); return 1; } - llama.initialize(); + // Set the base directory for serving static files + svr.set_base_dir(sparams.public_path); - httplib::Server svr; + // to make it ctrl+clickable: + LOG_TEE("\nllama server listening at http://%s:%d\n\n", sparams.hostname.c_str(), sparams.port); + + std::unordered_map log_data; + log_data["hostname"] = sparams.hostname; + log_data["port"] = std::to_string(sparams.port); + + if (sparams.api_keys.size() == 1) { + log_data["api_key"] = "api_key: ****" + sparams.api_keys[0].substr(sparams.api_keys[0].length() - 4); + } else if (sparams.api_keys.size() > 1) { + log_data["api_key"] = "api_key: " + std::to_string(sparams.api_keys.size()) + " keys loaded"; + } + + LOG_INFO("HTTP server listening", log_data); + // run the HTTP server in a thread - see comment below + std::thread t([&]() + { + if (!svr.listen_after_bind()) + { + state.store(SERVER_STATE_ERROR); + return 1; + } + + return 0; + }); + + // load the model + if (!llama.load_model(params)) + { + state.store(SERVER_STATE_ERROR); + return 1; + } else { + llama.initialize(); + state.store(SERVER_STATE_READY); + LOG_INFO("model loaded", {}); + } // Middleware for API key validation auto validate_api_key = [&sparams](const httplib::Request &req, httplib::Response &res) -> bool { // If API key is not set, skip validation - if (sparams.api_key.empty()) { + if (sparams.api_keys.empty()) { return true; } @@ -2812,7 +2982,7 @@ int main(int argc, char **argv) std::string prefix = "Bearer "; if (auth_header.substr(0, prefix.size()) == prefix) { std::string received_api_key = auth_header.substr(prefix.size()); - if (received_api_key == sparams.api_key) { + if (std::find(sparams.api_keys.begin(), sparams.api_keys.end(), received_api_key) != sparams.api_keys.end()) { return true; // API key is valid } } @@ -2826,10 +2996,6 @@ int main(int argc, char **argv) return false; }; - svr.set_default_headers({{"Server", "llama.cpp"}, - {"Access-Control-Allow-Origin", "*"}, - {"Access-Control-Allow-Headers", "content-type"}}); - // this is only called if no index.html is found in the public --path svr.Get("/", [](const httplib::Request &, httplib::Response &res) { @@ -2858,9 +3024,9 @@ int main(int argc, char **argv) return false; }); - svr.Get("/props", [&llama](const httplib::Request & /*req*/, httplib::Response &res) + svr.Get("/props", [&llama](const httplib::Request & req, httplib::Response &res) { - res.set_header("Access-Control-Allow-Origin", "*"); + res.set_header("Access-Control-Allow-Origin", req.get_header_value("Origin")); json data = { { "user_name", llama.name_user.c_str() }, { "assistant_name", llama.name_assistant.c_str() } @@ -2870,6 +3036,7 @@ int main(int argc, char **argv) svr.Post("/completion", [&llama, &validate_api_key](const httplib::Request &req, httplib::Response &res) { + res.set_header("Access-Control-Allow-Origin", req.get_header_value("Origin")); if (!validate_api_key(req, res)) { return; } @@ -2937,10 +3104,9 @@ int main(int argc, char **argv) } }); - - - svr.Get("/v1/models", [¶ms](const httplib::Request&, httplib::Response& res) + svr.Get("/v1/models", [¶ms](const httplib::Request& req, httplib::Response& res) { + res.set_header("Access-Control-Allow-Origin", req.get_header_value("Origin")); std::time_t t = std::time(0); json models = { @@ -2958,9 +3124,11 @@ int main(int argc, char **argv) res.set_content(models.dump(), "application/json; charset=utf-8"); }); + // TODO: add mount point without "/v1" prefix -- how? svr.Post("/v1/chat/completions", [&llama, &validate_api_key](const httplib::Request &req, httplib::Response &res) { + res.set_header("Access-Control-Allow-Origin", req.get_header_value("Origin")); if (!validate_api_key(req, res)) { return; } @@ -3034,6 +3202,7 @@ int main(int argc, char **argv) svr.Post("/infill", [&llama, &validate_api_key](const httplib::Request &req, httplib::Response &res) { + res.set_header("Access-Control-Allow-Origin", req.get_header_value("Origin")); if (!validate_api_key(req, res)) { return; } @@ -3106,6 +3275,7 @@ int main(int argc, char **argv) svr.Post("/tokenize", [&llama](const httplib::Request &req, httplib::Response &res) { + res.set_header("Access-Control-Allow-Origin", req.get_header_value("Origin")); const json body = json::parse(req.body); std::vector tokens; if (body.count("content") != 0) @@ -3118,6 +3288,7 @@ int main(int argc, char **argv) svr.Post("/detokenize", [&llama](const httplib::Request &req, httplib::Response &res) { + res.set_header("Access-Control-Allow-Origin", req.get_header_value("Origin")); const json body = json::parse(req.body); std::string content; if (body.count("tokens") != 0) @@ -3132,6 +3303,7 @@ int main(int argc, char **argv) svr.Post("/embedding", [&llama](const httplib::Request &req, httplib::Response &res) { + res.set_header("Access-Control-Allow-Origin", req.get_header_value("Origin")); const json body = json::parse(req.body); json prompt; if (body.count("content") != 0) @@ -3157,81 +3329,6 @@ int main(int argc, char **argv) return res.set_content(result.result_json.dump(), "application/json; charset=utf-8"); }); - svr.set_logger(log_server_request); - - svr.set_exception_handler([](const httplib::Request &, httplib::Response &res, std::exception_ptr ep) - { - const char fmt[] = "500 Internal Server Error\n%s"; - char buf[BUFSIZ]; - try - { - std::rethrow_exception(std::move(ep)); - } - catch (std::exception &e) - { - snprintf(buf, sizeof(buf), fmt, e.what()); - } - catch (...) - { - snprintf(buf, sizeof(buf), fmt, "Unknown Exception"); - } - res.set_content(buf, "text/plain; charset=utf-8"); - res.status = 500; - }); - - svr.set_error_handler([](const httplib::Request &, httplib::Response &res) - { - if (res.status == 401) - { - res.set_content("Unauthorized", "text/plain; charset=utf-8"); - } - if (res.status == 400) - { - res.set_content("Invalid request", "text/plain; charset=utf-8"); - } - else if (res.status == 404) - { - res.set_content("File Not Found", "text/plain; charset=utf-8"); - res.status = 404; - } - }); - - // set timeouts and change hostname and port - svr.set_read_timeout (sparams.read_timeout); - svr.set_write_timeout(sparams.write_timeout); - - if (!svr.bind_to_port(sparams.hostname, sparams.port)) - { - fprintf(stderr, "\ncouldn't bind to server socket: hostname=%s port=%d\n\n", sparams.hostname.c_str(), sparams.port); - return 1; - } - - // Set the base directory for serving static files - svr.set_base_dir(sparams.public_path); - - // to make it ctrl+clickable: - LOG_TEE("\nllama server listening at http://%s:%d\n\n", sparams.hostname.c_str(), sparams.port); - - std::unordered_map log_data; - log_data["hostname"] = sparams.hostname; - log_data["port"] = std::to_string(sparams.port); - - if (!sparams.api_key.empty()) { - log_data["api_key"] = "api_key: ****" + sparams.api_key.substr(sparams.api_key.length() - 4); - } - - LOG_INFO("HTTP server listening", log_data); - // run the HTTP server in a thread - see comment below - std::thread t([&]() - { - if (!svr.listen_after_bind()) - { - return 1; - } - - return 0; - }); - // GG: if I put the main loop inside a thread, it crashes on the first request when build in Debug!? // "Bus error: 10" - this is on macOS, it does not crash on Linux //std::thread t2([&]() diff --git a/ggml-alloc.c b/ggml-alloc.c index a27dd54b0..89b85d348 100644 --- a/ggml-alloc.c +++ b/ggml-alloc.c @@ -102,8 +102,6 @@ void ggml_tallocr_alloc(ggml_tallocr_t alloc, struct ggml_tensor * tensor) { } } - AT_PRINTF("block %d\n", best_fit_block); - if (best_fit_block == -1) { // the last block is our last resort struct free_block * block = &alloc->free_blocks[alloc->n_free_blocks - 1]; @@ -117,6 +115,7 @@ void ggml_tallocr_alloc(ggml_tallocr_t alloc, struct ggml_tensor * tensor) { return; } } + struct free_block * block = &alloc->free_blocks[best_fit_block]; void * addr = block->addr; block->addr = (char*)block->addr + size; @@ -129,6 +128,8 @@ void ggml_tallocr_alloc(ggml_tallocr_t alloc, struct ggml_tensor * tensor) { } } + AT_PRINTF("block %d, addr %p\n", best_fit_block, addr); + tensor->data = addr; tensor->buffer = alloc->buffer; if (!alloc->measure) { @@ -229,6 +230,7 @@ void ggml_tallocr_reset(ggml_tallocr_t alloc) { alloc->free_blocks[0].size = SIZE_MAX/2; // restrict maximum size of a measure allocator to half size_t max to avoid overflows } else { alloc->free_blocks[0].size = ggml_backend_buffer_get_size(alloc->buffer) - align_offset; + ggml_backend_buffer_reset(alloc->buffer); } } @@ -263,9 +265,9 @@ ggml_tallocr_t ggml_tallocr_new_measure(size_t alignment) { return alloc; } -ggml_tallocr_t ggml_tallocr_new_measure_from_backend(struct ggml_backend * backend) { +ggml_tallocr_t ggml_tallocr_new_measure_from_buft(struct ggml_backend_buffer_type * buft) { // create a backend buffer to get the correct tensor allocation sizes - ggml_backend_buffer_t buffer = ggml_backend_alloc_buffer(backend, 1); + ggml_backend_buffer_t buffer = ggml_backend_buft_alloc_buffer(buft, 1); // TODO: move alloc initialization to a common ggml_tallocr_new_impl function ggml_tallocr_t alloc = ggml_tallocr_new_from_buffer(buffer); @@ -275,13 +277,22 @@ ggml_tallocr_t ggml_tallocr_new_measure_from_backend(struct ggml_backend * backe return alloc; } -ggml_tallocr_t ggml_tallocr_new_from_backend(struct ggml_backend * backend, size_t size) { - ggml_backend_buffer_t buffer = ggml_backend_alloc_buffer(backend, size); +ggml_tallocr_t ggml_tallocr_new_measure_from_backend(struct ggml_backend * backend) { + return ggml_tallocr_new_measure_from_buft(ggml_backend_get_default_buffer_type(backend)); +} + +ggml_tallocr_t ggml_tallocr_new_from_buft(struct ggml_backend_buffer_type * buft, size_t size) { + // create a backend buffer to get the correct tensor allocation sizes + ggml_backend_buffer_t buffer = ggml_backend_buft_alloc_buffer(buft, size); ggml_tallocr_t alloc = ggml_tallocr_new_from_buffer(buffer); alloc->buffer_owned = true; return alloc; } +ggml_tallocr_t ggml_tallocr_new_from_backend(struct ggml_backend * backend, size_t size) { + return ggml_tallocr_new_from_buft(ggml_backend_get_default_buffer_type(backend), size); +} + ggml_tallocr_t ggml_tallocr_new_from_buffer(struct ggml_backend_buffer * buffer) { ggml_tallocr_t alloc = (ggml_tallocr_t)malloc(sizeof(struct ggml_tallocr)); @@ -779,10 +790,21 @@ ggml_backend_buffer_t ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_conte if (nbytes == 0) { // all the tensors in the context are already allocated +#ifndef NDEBUG + fprintf(stderr, "%s: all tensors in the context are already allocated\n", __func__); +#endif return NULL; } ggml_backend_buffer_t buffer = ggml_backend_buft_alloc_buffer(buft, nbytes); + if (buffer == NULL) { + // failed to allocate buffer +#ifndef NDEBUG + fprintf(stderr, "%s: failed to allocate buffer\n", __func__); +#endif + return NULL; + } + ggml_tallocr_t tallocr = ggml_tallocr_new_from_buffer(buffer); for (struct ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) { diff --git a/ggml-alloc.h b/ggml-alloc.h index 64a412468..4e5997521 100644 --- a/ggml-alloc.h +++ b/ggml-alloc.h @@ -52,8 +52,10 @@ typedef struct ggml_tallocr * ggml_tallocr_t; GGML_API ggml_tallocr_t ggml_tallocr_new(void * data, size_t size, size_t alignment); GGML_API ggml_tallocr_t ggml_tallocr_new_measure(size_t alignment); -GGML_API ggml_tallocr_t ggml_tallocr_new_from_buffer(struct ggml_backend_buffer * buffer); +GGML_API ggml_tallocr_t ggml_tallocr_new_from_buft(struct ggml_backend_buffer_type * buft, size_t size); GGML_API ggml_tallocr_t ggml_tallocr_new_from_backend(struct ggml_backend * backend, size_t size); // allocates an owned buffer +GGML_API ggml_tallocr_t ggml_tallocr_new_from_buffer(struct ggml_backend_buffer * buffer); +GGML_API ggml_tallocr_t ggml_tallocr_new_measure_from_buft(struct ggml_backend_buffer_type * buft); GGML_API ggml_tallocr_t ggml_tallocr_new_measure_from_backend(struct ggml_backend * backend); GGML_API struct ggml_backend_buffer * ggml_tallocr_get_buffer(ggml_tallocr_t talloc); diff --git a/ggml-backend-impl.h b/ggml-backend-impl.h index ca21b4743..1db32901f 100644 --- a/ggml-backend-impl.h +++ b/ggml-backend-impl.h @@ -16,9 +16,10 @@ extern "C" { typedef void * ggml_backend_buffer_type_context_t; struct ggml_backend_buffer_type_i { + const char * (*get_name) (ggml_backend_buffer_type_t buft); ggml_backend_buffer_t (*alloc_buffer) (ggml_backend_buffer_type_t buft, size_t size); size_t (*get_alignment) (ggml_backend_buffer_type_t buft); // tensor alignment - size_t (*get_alloc_size) (ggml_backend_buffer_type_t buft, struct ggml_tensor * tensor); // data size needed to allocate the tensor, including padding + size_t (*get_alloc_size) (ggml_backend_buffer_type_t buft, const struct ggml_tensor * tensor); // data size needed to allocate the tensor, including padding bool (*supports_backend)(ggml_backend_buffer_type_t buft, ggml_backend_t backend); // check if the buffer type is usable by the backend // check if tensor data is in host memory // should be equivalent to supports_backend(buft, ggml_backend_cpu_init()) @@ -34,16 +35,15 @@ extern "C" { typedef void * ggml_backend_buffer_context_t; struct ggml_backend_buffer_i { - void (*free_buffer) (ggml_backend_buffer_t buffer); - //void (*reset) (ggml_backend_buffer_t buffer); // reset any internal state due to tensor initialization, such as tensor extras - void * (*get_base) (ggml_backend_buffer_t buffer); - void (*init_tensor) (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor); - void (*set_tensor) (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size); - void (*get_tensor) (ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size); - // (optional) copy tensor between different buffer-type, allow for single-copy tranfers - void (*cpy_tensor_from)(ggml_backend_buffer_t buffer, struct ggml_tensor * src, struct ggml_tensor * dst); - void (*cpy_tensor_to) (ggml_backend_buffer_t buffer, struct ggml_tensor * src, struct ggml_tensor * dst); - void (*clear) (ggml_backend_buffer_t buffer, uint8_t value); + const char * (*get_name) (ggml_backend_buffer_t buffer); + void (*free_buffer)(ggml_backend_buffer_t buffer); + void * (*get_base) (ggml_backend_buffer_t buffer); + void (*init_tensor)(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor); + void (*set_tensor) (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size); + void (*get_tensor) (ggml_backend_buffer_t buffer, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size); + bool (*cpy_tensor) (ggml_backend_buffer_t buffer, const struct ggml_tensor * src, struct ggml_tensor * dst); // dst is in the buffer, src may be in any buffer + void (*clear) (ggml_backend_buffer_t buffer, uint8_t value); + void (*reset) (ggml_backend_buffer_t buffer); // reset any internal state due to tensor initialization, such as tensor extras }; struct ggml_backend_buffer { @@ -51,6 +51,7 @@ extern "C" { ggml_backend_buffer_type_t buft; ggml_backend_buffer_context_t context; size_t size; + enum ggml_backend_buffer_usage usage; }; ggml_backend_buffer_t ggml_backend_buffer_init( @@ -59,6 +60,8 @@ extern "C" { ggml_backend_buffer_context_t context, size_t size); + // do not use directly, use ggml_backend_tensor_copy instead + bool ggml_backend_buffer_copy_tensor(const struct ggml_tensor * src, struct ggml_tensor * dst); // // Backend @@ -74,22 +77,20 @@ extern "C" { // buffer allocation ggml_backend_buffer_type_t (*get_default_buffer_type)(ggml_backend_t backend); - // (optional) asynchroneous tensor data access + // (optional) asynchronous tensor data access void (*set_tensor_async)(ggml_backend_t backend, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size); void (*get_tensor_async)(ggml_backend_t backend, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size); + bool (*cpy_tensor_async)(ggml_backend_t backend, const struct ggml_tensor * src, struct ggml_tensor * dst); - // (optional) asynchroneous tensor copy - void (*cpy_tensor_from_async)(ggml_backend_t backend, struct ggml_tensor * src, struct ggml_tensor * dst); - void (*cpy_tensor_to_async) (ggml_backend_t backend, struct ggml_tensor * src, struct ggml_tensor * dst); - + // (optional) complete all pending operations void (*synchronize)(ggml_backend_t backend); // compute graph with a plan - ggml_backend_graph_plan_t (*graph_plan_create) (ggml_backend_t backend, struct ggml_cgraph * cgraph); + ggml_backend_graph_plan_t (*graph_plan_create) (ggml_backend_t backend, const struct ggml_cgraph * cgraph); void (*graph_plan_free) (ggml_backend_t backend, ggml_backend_graph_plan_t plan); void (*graph_plan_compute)(ggml_backend_t backend, ggml_backend_graph_plan_t plan); - // compute graph without a plan + // compute graph without a plan (async) bool (*graph_compute)(ggml_backend_t backend, struct ggml_cgraph * cgraph); // check if the backend supports an operation @@ -102,7 +103,6 @@ extern "C" { ggml_backend_context_t context; }; - // // Backend registry // diff --git a/ggml-backend.c b/ggml-backend.c index 53e741cb8..4c2d8b0b2 100644 --- a/ggml-backend.c +++ b/ggml-backend.c @@ -15,6 +15,10 @@ // backend buffer type +const char * ggml_backend_buft_name(ggml_backend_buffer_type_t buft) { + return buft->iface.get_name(buft); +} + ggml_backend_buffer_t ggml_backend_buft_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) { return buft->iface.alloc_buffer(buft, size); } @@ -58,11 +62,16 @@ ggml_backend_buffer_t ggml_backend_buffer_init( /* .buft = */ buft, /* .context = */ context, /* .size = */ size, + /* .usage = */ GGML_BACKEND_BUFFER_USAGE_ANY }; return buffer; } +const char * ggml_backend_buffer_name(ggml_backend_buffer_t buffer) { + return buffer->iface.get_name(buffer); +} + void ggml_backend_buffer_free(ggml_backend_buffer_t buffer) { if (buffer == NULL) { return; @@ -94,11 +103,11 @@ void ggml_backend_buffer_init_tensor(ggml_backend_buffer_t buffer, struct ggml_t } size_t ggml_backend_buffer_get_alignment (ggml_backend_buffer_t buffer) { - return ggml_backend_buft_get_alignment(ggml_backend_buffer_type(buffer)); + return ggml_backend_buft_get_alignment(ggml_backend_buffer_get_type(buffer)); } size_t ggml_backend_buffer_get_alloc_size(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) { - return ggml_backend_buft_get_alloc_size(ggml_backend_buffer_type(buffer), tensor); + return ggml_backend_buft_get_alloc_size(ggml_backend_buffer_get_type(buffer), tensor); } void ggml_backend_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) { @@ -106,13 +115,31 @@ void ggml_backend_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) { } bool ggml_backend_buffer_is_host(ggml_backend_buffer_t buffer) { - return ggml_backend_buft_is_host(ggml_backend_buffer_type(buffer)); + return ggml_backend_buft_is_host(ggml_backend_buffer_get_type(buffer)); } -ggml_backend_buffer_type_t ggml_backend_buffer_type(ggml_backend_buffer_t buffer) { +void ggml_backend_buffer_set_usage(ggml_backend_buffer_t buffer, enum ggml_backend_buffer_usage usage) { + buffer->usage = usage; +} + +ggml_backend_buffer_type_t ggml_backend_buffer_get_type(ggml_backend_buffer_t buffer) { return buffer->buft; } +void ggml_backend_buffer_reset(ggml_backend_buffer_t buffer) { + if (buffer->iface.reset) { + buffer->iface.reset(buffer); + } +} + +bool ggml_backend_buffer_copy_tensor(const struct ggml_tensor * src, struct ggml_tensor * dst) { + ggml_backend_buffer_t dst_buf = dst->view_src ? dst->view_src->buffer : dst->buffer; + if (dst_buf->iface.cpy_tensor) { + return src->buffer->iface.cpy_tensor(dst_buf, src, dst); + } + return false; +} + // backend const char * ggml_backend_name(ggml_backend_t backend) { @@ -146,30 +173,42 @@ void ggml_backend_tensor_set_async(ggml_backend_t backend, struct ggml_tensor * GGML_ASSERT(tensor->data != NULL && "tensor not allocated"); GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor write out of bounds"); - backend->iface.set_tensor_async(backend, tensor, data, offset, size); + if (backend->iface.set_tensor_async == NULL) { + ggml_backend_tensor_set(tensor, data, offset, size); + } else { + backend->iface.set_tensor_async(backend, tensor, data, offset, size); + } } void ggml_backend_tensor_get_async(ggml_backend_t backend, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) { GGML_ASSERT(tensor->data != NULL && "tensor not allocated"); GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor read out of bounds"); - backend->iface.get_tensor_async(backend, tensor, data, offset, size); + if (backend->iface.get_tensor_async == NULL) { + ggml_backend_tensor_get(tensor, data, offset, size); + } else { + backend->iface.get_tensor_async(backend, tensor, data, offset, size); + } } void ggml_backend_tensor_set(struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) { + ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer; + GGML_ASSERT(tensor->data != NULL && "tensor not allocated"); - GGML_ASSERT(tensor->buffer != NULL && "tensor buffer not set"); + GGML_ASSERT(buf != NULL && "tensor buffer not set"); GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor write out of bounds"); - tensor->buffer->iface.set_tensor(tensor->buffer, tensor, data, offset, size); + tensor->buffer->iface.set_tensor(buf, tensor, data, offset, size); } void ggml_backend_tensor_get(const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) { + ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer; + GGML_ASSERT(tensor->data != NULL && "tensor not allocated"); GGML_ASSERT(tensor->buffer != NULL && "tensor buffer not set"); GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor read out of bounds"); - tensor->buffer->iface.get_tensor(tensor->buffer, tensor, data, offset, size); + tensor->buffer->iface.get_tensor(buf, tensor, data, offset, size); } void ggml_backend_synchronize(ggml_backend_t backend) { @@ -190,19 +229,10 @@ void ggml_backend_graph_plan_free(ggml_backend_t backend, ggml_backend_graph_pla void ggml_backend_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan) { backend->iface.graph_plan_compute(backend, plan); - - // TODO: optional sync - ggml_backend_synchronize(backend); } bool ggml_backend_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) { - if (!backend->iface.graph_compute(backend, cgraph)) { - return false; - } - - // TODO: optional sync - ggml_backend_synchronize(backend); - return true; + return backend->iface.graph_compute(backend, cgraph); } bool ggml_backend_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) { @@ -227,28 +257,20 @@ static bool ggml_are_same_layout(const struct ggml_tensor * a, const struct ggml } void ggml_backend_tensor_copy(struct ggml_tensor * src, struct ggml_tensor * dst) { - //printf("src: %s ne: [%d %d %d %d] nb: [%d %d %d %d]\n", src->name, (int)src->ne[0], (int)src->ne[1], (int)src->ne[2], (int)src->ne[3], (int)src->nb[0], (int)src->nb[1], (int)src->nb[2], (int)src->nb[3]); - //printf("dst: %s ne: [%d %d %d %d] nb: [%d %d %d %d]\n", dst->name, (int)dst->ne[0], (int)dst->ne[1], (int)dst->ne[2], (int)dst->ne[3], (int)dst->nb[0], (int)dst->nb[1], (int)dst->nb[2], (int)dst->nb[3]); GGML_ASSERT(ggml_are_same_layout(src, dst) && "cannot copy tensors with different layouts"); - // fprintf(stderr, "cpy tensor %s from %s to %s (%lu bytes)\n", src->name, ggml_backend_name(src->backend), ggml_backend_name(dst->backend), ggml_nbytes(src)); - if (src == dst) { return; } - // TODO: allow backends to support copy to/from same backend - - if (dst->buffer->iface.cpy_tensor_from != NULL) { - dst->buffer->iface.cpy_tensor_from(dst->buffer, src, dst); - } else if (src->buffer->iface.cpy_tensor_to != NULL) { - src->buffer->iface.cpy_tensor_to(src->buffer, src, dst); - } else { - // shouldn't be hit when copying from/to CPU - #ifndef NDEBUG - fprintf(stderr, "ggml_backend_tensor_copy: neither cpy_tensor_from nor cpy_tensor_to " - "are implemented for %s and %s, falling back to get/set\n", src->name, dst->name); - #endif + if (ggml_backend_buffer_is_host(src->buffer)) { + ggml_backend_tensor_set(dst, src->data, 0, ggml_nbytes(src)); + } else if (ggml_backend_buffer_is_host(dst->buffer)) { + ggml_backend_tensor_get(src, dst->data, 0, ggml_nbytes(src)); + } else if (!ggml_backend_buffer_copy_tensor(src, dst)) { +#ifndef NDEBUG + fprintf(stderr, "%s: warning: slow copy from %s to %s\n", __func__, ggml_backend_buffer_name(src->buffer), ggml_backend_buffer_name(dst->buffer)); +#endif size_t nbytes = ggml_nbytes(src); void * data = malloc(nbytes); ggml_backend_tensor_get(src, data, 0, nbytes); @@ -257,6 +279,31 @@ void ggml_backend_tensor_copy(struct ggml_tensor * src, struct ggml_tensor * dst } } +void ggml_backend_tensor_copy_async(ggml_backend_t backend, struct ggml_tensor * src, struct ggml_tensor * dst) { + GGML_ASSERT(ggml_are_same_layout(src, dst) && "cannot copy tensors with different layouts"); + + if (src == dst) { + return; + } + + if (ggml_backend_buft_supports_backend(src->buffer->buft, backend) && ggml_backend_buft_supports_backend(dst->buffer->buft, backend)) { + if (backend->iface.cpy_tensor_async != NULL) { + if (backend->iface.cpy_tensor_async(backend, src, dst)) { + return; + } + } + } + + size_t nbytes = ggml_nbytes(src); + if (ggml_backend_buffer_is_host(src->buffer)) { + ggml_backend_tensor_set_async(backend, dst, src->data, 0, nbytes); + } + else { + ggml_backend_tensor_copy(src, dst); + } +} + + // backend registry #define GGML_MAX_BACKENDS_REG 16 @@ -392,6 +439,12 @@ ggml_backend_buffer_t ggml_backend_reg_alloc_buffer(size_t i, size_t size) { // backend CPU +static const char * ggml_backend_cpu_buffer_name(ggml_backend_buffer_t buffer) { + return "CPU"; + + GGML_UNUSED(buffer); +} + static void * ggml_backend_cpu_buffer_get_base(ggml_backend_buffer_t buffer) { return (void *)buffer->context; } @@ -412,14 +465,12 @@ static void ggml_backend_cpu_buffer_get_tensor(ggml_backend_buffer_t buffer, con GGML_UNUSED(buffer); } -static void ggml_backend_cpu_buffer_cpy_tensor_from(ggml_backend_buffer_t buffer, struct ggml_tensor * src, struct ggml_tensor * dst) { - ggml_backend_tensor_get(src, dst->data, 0, ggml_nbytes(src)); - - GGML_UNUSED(buffer); -} - -static void ggml_backend_cpu_buffer_cpy_tensor_to(ggml_backend_buffer_t buffer, struct ggml_tensor * src, struct ggml_tensor * dst) { - ggml_backend_tensor_set(dst, src->data, 0, ggml_nbytes(src)); +static bool ggml_backend_cpu_buffer_cpy_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * src, struct ggml_tensor * dst) { + if (ggml_backend_buffer_is_host(src->buffer)) { + memcpy(dst->data, src->data, ggml_nbytes(src)); + return true; + } + return false; GGML_UNUSED(buffer); } @@ -429,30 +480,38 @@ static void ggml_backend_cpu_buffer_clear(ggml_backend_buffer_t buffer, uint8_t } static struct ggml_backend_buffer_i cpu_backend_buffer_i = { + /* .get_name = */ ggml_backend_cpu_buffer_name, /* .free_buffer = */ ggml_backend_cpu_buffer_free_buffer, /* .get_base = */ ggml_backend_cpu_buffer_get_base, /* .init_tensor = */ NULL, // no initialization required /* .set_tensor = */ ggml_backend_cpu_buffer_set_tensor, /* .get_tensor = */ ggml_backend_cpu_buffer_get_tensor, - /* .cpy_tensor_from = */ ggml_backend_cpu_buffer_cpy_tensor_from, - /* .cpy_tensor_to = */ ggml_backend_cpu_buffer_cpy_tensor_to, + /* .cpy_tensor = */ ggml_backend_cpu_buffer_cpy_tensor, /* .clear = */ ggml_backend_cpu_buffer_clear, + /* .reset = */ NULL, }; // for buffers from ptr, free is not called static struct ggml_backend_buffer_i cpu_backend_buffer_i_from_ptr = { + /* .get_name = */ ggml_backend_cpu_buffer_name, /* .free_buffer = */ NULL, // ptr is not owned by the buffer, so it does not need to be freed /* .get_base = */ ggml_backend_cpu_buffer_get_base, /* .init_tensor = */ NULL, // no initialization required /* .set_tensor = */ ggml_backend_cpu_buffer_set_tensor, /* .get_tensor = */ ggml_backend_cpu_buffer_get_tensor, - /* .cpy_tensor_from = */ ggml_backend_cpu_buffer_cpy_tensor_from, - /* .cpy_tensor_to = */ ggml_backend_cpu_buffer_cpy_tensor_to, + /* .cpy_tensor = */ ggml_backend_cpu_buffer_cpy_tensor, /* .clear = */ ggml_backend_cpu_buffer_clear, + /* .reset = */ NULL, }; static const size_t TENSOR_ALIGNMENT = 64; // should be enough for AVX 512 +static const char * ggml_backend_cpu_buffer_type_get_name(ggml_backend_buffer_type_t buft) { + return "CPU"; + + GGML_UNUSED(buft); +} + static ggml_backend_buffer_t ggml_backend_cpu_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) { size += TENSOR_ALIGNMENT; // malloc may return an address that is not aligned void * data = malloc(size); // TODO: maybe use GGML_ALIGNED_MALLOC? @@ -483,6 +542,7 @@ static bool ggml_backend_cpu_buffer_type_is_host(ggml_backend_buffer_type_t buft ggml_backend_buffer_type_t ggml_backend_cpu_buffer_type(void) { static struct ggml_backend_buffer_type ggml_backend_cpu_buffer_type = { /* .iface = */ { + /* .get_name = */ ggml_backend_cpu_buffer_type_get_name, /* .alloc_buffer = */ ggml_backend_cpu_buffer_type_alloc_buffer, /* .get_alignment = */ ggml_backend_cpu_buffer_type_get_alignment, /* .get_alloc_size = */ NULL, // defaults to ggml_nbytes @@ -501,6 +561,18 @@ ggml_backend_buffer_type_t ggml_backend_cpu_buffer_type(void) { #include +static const char * ggml_backend_cpu_hbm_buffer_type_get_name(ggml_backend_buffer_type_t buft) { + return "CPU_HBM"; + + GGML_UNUSED(buft); +} + +static const char * ggml_backend_cpu_hbm_buffer_get_name(ggml_backend_buffer_t buf) { + return "CPU_HBM"; + + GGML_UNUSED(buf); +} + static void ggml_backend_cpu_hbm_buffer_free_buffer(ggml_backend_buffer_t buffer) { hbw_free(buffer->context); } @@ -514,17 +586,18 @@ static ggml_backend_buffer_t ggml_backend_cpu_hbm_buffer_type_alloc_buffer(ggml_ return NULL; } - // FIXME: this is a hack to avoid having to implement a new buffer type ggml_backend_buffer_t buffer = ggml_backend_cpu_buffer_from_ptr(ptr, size); buffer->buft = buft; + buffer->iface.get_name = ggml_backend_cpu_hbm_buffer_get_name; buffer->iface.free_buffer = ggml_backend_cpu_hbm_buffer_free_buffer; return buffer; } -ggml_backend_buffer_type_t ggml_backend_cpu_hbm_buffer_type() { +ggml_backend_buffer_type_t ggml_backend_cpu_hbm_buffer_type(void) { static struct ggml_backend_buffer_type ggml_backend_cpu_buffer_type_hbm = { /* .iface = */ { + /* .get_name = */ ggml_backend_cpu_hbm_buffer_type_get_name, /* .alloc_buffer = */ ggml_backend_cpu_hbm_buffer_type_alloc_buffer, /* .get_alignment = */ ggml_backend_cpu_buffer_type_get_alignment, /* .get_alloc_size = */ NULL, // defaults to ggml_nbytes @@ -568,7 +641,7 @@ struct ggml_backend_plan_cpu { struct ggml_cgraph cgraph; }; -static ggml_backend_graph_plan_t ggml_backend_cpu_graph_plan_create(ggml_backend_t backend, struct ggml_cgraph * cgraph) { +static ggml_backend_graph_plan_t ggml_backend_cpu_graph_plan_create(ggml_backend_t backend, const struct ggml_cgraph * cgraph) { struct ggml_backend_cpu_context * cpu_ctx = (struct ggml_backend_cpu_context *)backend->context; struct ggml_backend_plan_cpu * cpu_plan = malloc(sizeof(struct ggml_backend_plan_cpu)); @@ -634,8 +707,7 @@ static struct ggml_backend_i cpu_backend_i = { /* .get_default_buffer_type = */ ggml_backend_cpu_get_default_buffer_type, /* .set_tensor_async = */ NULL, /* .get_tensor_async = */ NULL, - /* .cpy_tensor_from_async = */ NULL, - /* .cpy_tensor_to_async = */ NULL, + /* .cpy_tensor_async = */ NULL, /* .synchronize = */ NULL, /* .graph_plan_create = */ ggml_backend_cpu_graph_plan_create, /* .graph_plan_free = */ ggml_backend_cpu_graph_plan_free, @@ -661,7 +733,7 @@ ggml_backend_t ggml_backend_cpu_init(void) { } bool ggml_backend_is_cpu(ggml_backend_t backend) { - return backend->iface.get_name == ggml_backend_cpu_name; + return backend && backend->iface.get_name == ggml_backend_cpu_name; } void ggml_backend_cpu_set_n_threads(ggml_backend_t backend_cpu, int n_threads) { @@ -685,7 +757,7 @@ static ggml_backend_t ggml_backend_reg_cpu_init(const char * params, void * user // scheduler -#define GGML_MAX_BACKENDS 4 +#define GGML_MAX_BACKENDS 16 #define GGML_MAX_SPLITS 256 #define GGML_MAX_SPLIT_INPUTS 16 @@ -695,21 +767,29 @@ struct ggml_backend_sched_split { int i_end; struct ggml_tensor * inputs[GGML_MAX_SPLIT_INPUTS]; int n_inputs; + // graph view of this split struct ggml_cgraph graph; }; struct ggml_backend_sched { + bool is_reset; // true if the scheduler has been reset since the last graph split + int n_backends; ggml_backend_t backends[GGML_MAX_BACKENDS]; + ggml_backend_buffer_type_t bufts[GGML_MAX_BACKENDS]; ggml_tallocr_t tallocs[GGML_MAX_BACKENDS]; ggml_gallocr_t galloc; + // hash keys of the nodes in the graph struct ggml_hash_set hash_set; - ggml_tallocr_t * node_talloc; // [hash_set.size] - struct ggml_tensor * (* node_copies)[GGML_MAX_BACKENDS]; // [hash_set.size][GGML_MAX_BACKENDS] + // hash values (arrays of [hash_set.size]) + ggml_tallocr_t * node_talloc; // tallocr assigned to each node (indirectly this is the backend) + struct ggml_tensor * (* node_copies)[GGML_MAX_BACKENDS]; // copies of each node for each destination backend + // copy of the graph with modified inputs struct ggml_cgraph * graph; + struct ggml_backend_sched_split splits[GGML_MAX_SPLITS]; int n_splits; @@ -750,14 +830,22 @@ static int sched_allocr_prio(ggml_backend_sched_t sched, ggml_tallocr_t allocr) return INT_MAX; } -static ggml_backend_t get_buffer_backend(ggml_backend_sched_t sched, ggml_backend_buffer_t buffer) { +static ggml_tallocr_t sched_allocr_from_buffer(ggml_backend_sched_t sched, ggml_backend_buffer_t buffer) { if (buffer == NULL) { return NULL; } + + // check if this is already allocate in a allocr buffer (from user manual allocations) + for (int i = 0; i < sched->n_backends; i++) { + if (ggml_tallocr_get_buffer(sched->tallocs[i]) == buffer) { + return sched->tallocs[i]; + } + } + // find highest prio backend that supports the buffer type for (int i = 0; i < sched->n_backends; i++) { if (ggml_backend_buft_supports_backend(buffer->buft, sched->backends[i])) { - return sched->backends[i]; + return sched->tallocs[i]; } } GGML_ASSERT(false && "tensor buffer type not supported by any backend"); @@ -767,7 +855,6 @@ static ggml_backend_t get_allocr_backend(ggml_backend_sched_t sched, ggml_talloc if (allocr == NULL) { return NULL; } - // find highest prio backend that supports the buffer type for (int i = 0; i < sched->n_backends; i++) { if (sched->tallocs[i] == allocr) { return sched->backends[i]; @@ -777,7 +864,7 @@ static ggml_backend_t get_allocr_backend(ggml_backend_sched_t sched, ggml_talloc } #if 0 -static char causes[GGML_DEFAULT_GRAPH_SIZE*8 + GGML_MAX_SPLITS*GGML_MAX_SPLIT_INPUTS][128]; // debug, remove +static char causes[GGML_DEFAULT_GRAPH_SIZE*16 + GGML_MAX_SPLITS*GGML_MAX_SPLIT_INPUTS][128]; // debug only #define SET_CAUSE(node, ...) sprintf(causes[hash_id(node)], __VA_ARGS__) #define GET_CAUSE(node) causes[hash_id(node)] #else @@ -786,45 +873,37 @@ static char causes[GGML_DEFAULT_GRAPH_SIZE*8 + GGML_MAX_SPLITS*GGML_MAX_SPLIT_IN #endif // returns the backend that should be used for the node based on the current locations -static ggml_backend_t sched_backend_from_cur(ggml_backend_sched_t sched, struct ggml_tensor * node) { - // if the dst tensor is already allocated in a buffer, we must assume that it is critical to keep it there - // ie. kv cache updates - // note that this doesn't allow fallback to CPU. need to add output tensors to the splits to copy the data back to the original backend. +static ggml_tallocr_t sched_allocr_from_cur(ggml_backend_sched_t sched, struct ggml_tensor * node) { + // assign pre-allocated nodes to their backend // dst - ggml_backend_t cur_backend = get_buffer_backend(sched, node->buffer); - if (cur_backend != NULL) { + ggml_tallocr_t cur_allocr = sched_allocr_from_buffer(sched, node->buffer); + if (cur_allocr != NULL) { SET_CAUSE(node, "1.dst"); - return cur_backend; + return cur_allocr; } - // view_src - if (node->view_src != NULL && get_buffer_backend(sched, node->view_src->buffer) != NULL) { - SET_CAUSE(node, "1.vsrc"); - return get_buffer_backend(sched, node->view_src->buffer); + if (node->view_src != NULL) { + cur_allocr = sched_allocr_from_buffer(sched, node->view_src->buffer); + if (cur_allocr != NULL) { + SET_CAUSE(node, "1.vsrc"); + return cur_allocr; + } } - - // src - int cur_prio = INT_MAX; - size_t cur_size = 0; - + // assign nodes that use weights to the backend of the weights for (int i = 0; i < GGML_MAX_SRC; i++) { const struct ggml_tensor * src = node->src[i]; if (src == NULL) { break; } - ggml_backend_t src_backend = get_buffer_backend(sched, src->buffer); - if (src_backend != NULL) { - int src_prio = sched_backend_prio(sched, src_backend); - size_t src_size = ggml_nbytes(src); - if (src_prio < cur_prio && src_size >= cur_size) { - cur_prio = src_prio; - cur_size = src_size; - cur_backend = src_backend; - SET_CAUSE(node, "1.src%d", i); - } + if (src->buffer != NULL && src->buffer->usage == GGML_BACKEND_BUFFER_USAGE_WEIGHTS) { + ggml_tallocr_t src_allocr = sched_allocr_from_buffer(sched, src->buffer); + // operations with weights are always run on the same backend as the weights + SET_CAUSE(node, "1.wgt%d", i); + return src_allocr; } } - return cur_backend; + + return NULL; } static char * fmt_size(size_t size) { @@ -857,7 +936,7 @@ static void sched_print_assignments(ggml_backend_sched_t sched, struct ggml_cgra } ggml_tallocr_t node_allocr = node_allocr(node); ggml_backend_t node_backend = node_allocr ? get_allocr_backend(sched, node_allocr) : NULL; // FIXME: - fprintf(stderr, "node #%3d (%10.10s): %20.20s (%4.4s) [%4.4s %8.8s]:", i, ggml_op_name(node->op), node->name, + fprintf(stderr, "node #%3d (%10.10s): %20.20s (%5.5s) [%5.5s %8.8s]:", i, ggml_op_name(node->op), node->name, fmt_size(ggml_nbytes(node)), node_allocr ? ggml_backend_name(node_backend) : "NULL", GET_CAUSE(node)); for (int j = 0; j < GGML_MAX_SRC; j++) { struct ggml_tensor * src = node->src[j]; @@ -866,7 +945,7 @@ static void sched_print_assignments(ggml_backend_sched_t sched, struct ggml_cgra } ggml_tallocr_t src_allocr = node_allocr(src); ggml_backend_t src_backend = src_allocr ? get_allocr_backend(sched, src_allocr) : NULL; - fprintf(stderr, " %20.20s (%4.4s) [%4.4s %8.8s]", src->name, + fprintf(stderr, " %20.20s (%5.5s) [%5.5s %8.8s]", src->name, fmt_size(ggml_nbytes(src)), src_backend ? ggml_backend_name(src_backend) : "NULL", GET_CAUSE(src)); } fprintf(stderr, "\n"); @@ -882,15 +961,17 @@ static struct ggml_tensor * ggml_dup_tensor_layout(struct ggml_context * ctx, co return dup; } + +//#define DEBUG_PASS1 +//#define DEBUG_PASS2 +//#define DEBUG_PASS3 +//#define DEBUG_PASS4 + // assigns backends to ops and splits the graph into subgraphs that can be computed on the same backend -// TODO: merge passes static void sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * graph) { - // reset state - size_t hash_size = sched->hash_set.size; - memset(sched->hash_set.keys, 0, sizeof(sched->hash_set.keys[0]) * hash_size); - memset(sched->node_talloc, 0, sizeof(sched->node_talloc[0]) * hash_size); - memset(sched->node_copies, 0, sizeof(sched->node_copies[0]) * hash_size); + // reset splits sched->n_splits = 0; + sched->is_reset = false; struct ggml_init_params params = { /* .mem_size = */ sizeof(sched->context_buffer), @@ -898,26 +979,22 @@ static void sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * g /* .no_alloc = */ true }; - if (sched->ctx != NULL) { - ggml_free(sched->ctx); - } + ggml_free(sched->ctx); sched->ctx = ggml_init(params); + if (sched->ctx == NULL) { + fprintf(stderr, "%s: failed to initialize context\n", __func__); + GGML_ASSERT(false); + } - // pass 1: assign backends to ops with allocated inputs + // pass 1: assign backends to ops with pre-allocated inputs for (int i = 0; i < graph->n_leafs; i++) { struct ggml_tensor * leaf = graph->leafs[i]; if (node_allocr(leaf) != NULL) { // do not overwrite user assignments continue; } - ggml_backend_t leaf_backend = get_buffer_backend(sched, leaf->buffer); - if (leaf_backend == NULL && leaf->view_src != NULL) { - leaf_backend = get_buffer_backend(sched, leaf->view_src->buffer); - } - if (leaf_backend != NULL) { - node_allocr(leaf) = ggml_backend_sched_get_tallocr(sched, leaf_backend); - } + node_allocr(leaf) = sched_allocr_from_cur(sched, leaf); } for (int i = 0; i < graph->n_nodes; i++) { @@ -926,50 +1003,102 @@ static void sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * g // do not overwrite user assignments continue; } - ggml_backend_t node_backend = sched_backend_from_cur(sched, node); - if (node_backend != NULL) { - node_allocr(node) = ggml_backend_sched_get_tallocr(sched, node_backend); + node_allocr(node) = sched_allocr_from_cur(sched, node); + // src + for (int j = 0; j < GGML_MAX_SRC; j++) { + struct ggml_tensor * src = node->src[j]; + if (src == NULL) { + break; + } + if (node_allocr(src) == NULL) { + node_allocr(src) = sched_allocr_from_cur(sched, src); + } } } - //printf("PASS 1 ASSIGNMENTS\n"); sched_print_assignments(sched, graph); +#ifdef DEBUG_PASS1 + fprintf(stderr, "PASS 1 ASSIGNMENTS\n"); sched_print_assignments(sched, graph); +#endif - // pass 2: assign backends to ops from current assignments - // TODO: - // - reuse sched_backend_from_cur - for (int i = 0; i < graph->n_nodes; i++) { - struct ggml_tensor * node = graph->nodes[i]; - ggml_tallocr_t node_allocr = node_allocr(node); - if (node_allocr == NULL) { - int cur_prio = INT_MAX; - size_t cur_size = 0; - for (int j = 0; j < GGML_MAX_SRC; j++) { - struct ggml_tensor * src = node->src[j]; - if (src == NULL) { - break; - } - ggml_tallocr_t src_allocr = node_allocr(src); - if (src_allocr != NULL) { - int src_prio = sched_allocr_prio(sched, src_allocr); - size_t src_size = ggml_nbytes(src); - if (src_prio < cur_prio && src_size >= cur_size) { - cur_prio = src_prio; - cur_size = src_size; - node_allocr = src_allocr; - SET_CAUSE(node, "2.src%d", j); - } - } + // pass 2: expand current backend assignments + // assign the same backend to adjacent nodes + // expand gpu backends (i.e. non last prio) up and down, ignoring cpu (the lowest priority backend) + // thus, cpu will never be used unless weights are on cpu, or there are no gpu ops between cpu ops + + // pass 2.1 expand gpu up + { + ggml_tallocr_t cur_allocr = NULL; + for (int i = graph->n_nodes - 1; i >= 0; i--) { + struct ggml_tensor * node = graph->nodes[i]; + if (ggml_is_view_op(node->op)) { + continue; } + ggml_tallocr_t node_allocr = node_allocr(node); if (node_allocr != NULL) { - node_allocr(node) = node_allocr; + if (sched_allocr_prio(sched, node_allocr) == sched->n_backends - 1) { + // skip cpu (lowest prio backend) + cur_allocr = NULL; + } else { + cur_allocr = node_allocr; + } + } else { + node_allocr(node) = cur_allocr; + SET_CAUSE(node, "2.1"); } } } - //printf("PASS 2 ASSIGNMENTS\n"); sched_print_assignments(sched, graph); - // pass 3: assign backends to remaining src from dst (should only be leafs) + // pass 2.2 expand gpu down + { + ggml_tallocr_t cur_allocr = NULL; + for (int i = 0; i < graph->n_nodes; i++) { + struct ggml_tensor * node = graph->nodes[i]; + if (ggml_is_view_op(node->op)) { + continue; + } + ggml_tallocr_t node_allocr = node_allocr(node); + if (node_allocr != NULL) { + if (sched_allocr_prio(sched, node_allocr) == sched->n_backends - 1) { + // skip cpu (lowest prio backend) + cur_allocr = NULL; + } else { + cur_allocr = node_allocr; + } + } else { + node_allocr(node) = cur_allocr; + SET_CAUSE(node, "2.2"); + } + } + } + + // pass 2.3 expand rest up + { + ggml_tallocr_t cur_allocr = NULL; + for (int i = graph->n_nodes - 1; i >= 0; i--) { + struct ggml_tensor * node = graph->nodes[i]; + if (ggml_is_view_op(node->op)) { + continue; + } + ggml_tallocr_t node_allocr = node_allocr(node); + if (node_allocr != NULL) { + cur_allocr = node_allocr; + } else { + node_allocr(node) = cur_allocr; + SET_CAUSE(node, "2.3"); + } + } + } +#ifdef DEBUG_PASS2 + fprintf(stderr, "PASS 2 ASSIGNMENTS\n"); sched_print_assignments(sched, graph); +#endif + + // pass 3: assign backends to remaining src from dst and view_src for (int i = 0; i < graph->n_nodes; i++) { struct ggml_tensor * node = graph->nodes[i]; - ggml_tallocr_t node_allocr = node_allocr(node); + ggml_tallocr_t cur_allocr = node_allocr(node); + if (node->view_src != NULL && cur_allocr == NULL) { + cur_allocr = node_allocr(node) = node_allocr(node->view_src); + SET_CAUSE(node, "3.vsrc"); + } for (int j = 0; j < GGML_MAX_SRC; j++) { struct ggml_tensor * src = node->src[j]; if (src == NULL) { @@ -977,81 +1106,105 @@ static void sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * g } ggml_tallocr_t src_allocr = node_allocr(src); if (src_allocr == NULL) { - node_allocr(src) = node_allocr; + if (src->view_src != NULL) { + // views are always on the same backend as the source + node_allocr(src) = node_allocr(src->view_src); + SET_CAUSE(src, "3.vsrc"); + } else { + node_allocr(src) = cur_allocr; + SET_CAUSE(src, "3.cur"); + } } } } - //printf("PASS 3 ASSIGNMENTS\n"); sched_print_assignments(sched, graph); +#ifdef DEBUG_PASS3 + fprintf(stderr, "PASS 3 ASSIGNMENTS\n"); sched_print_assignments(sched, graph); +#endif // pass 4: split graph, find tensors that need to be copied - // TODO: - // - when switching from a less preferred backend to a more preferred backend, check if it is possible to move the switch to an earlier point for the same cost - // find first backend - int cur_split = 0; - for (int i = 0; i < graph->n_nodes; i++) { - struct ggml_tensor * node = graph->nodes[i]; - if (node->view_src == NULL) { - sched->splits[0].tallocr = node_allocr(node); - break; - } - } - sched->splits[0].i_start = 0; - sched->splits[0].n_inputs = 0; - memset(sched->splits[0].inputs, 0, sizeof(sched->splits[0].inputs)); //HACK - ggml_tallocr_t cur_allocr = sched->splits[0].tallocr; - size_t cur_backend_id = sched_allocr_prio(sched, cur_allocr); - for (int i = 0; i < graph->n_nodes; i++) { - struct ggml_tensor * node = graph->nodes[i]; - - if (ggml_is_view_op(node->op)) { - continue; - } - - ggml_tallocr_t node_allocr = node_allocr(node); - - if (node_allocr != cur_allocr) { - sched->splits[cur_split].i_end = i; - cur_split++; - GGML_ASSERT(cur_split < GGML_MAX_SPLITS); - sched->splits[cur_split].tallocr = node_allocr; - sched->splits[cur_split].i_start = i; - sched->splits[cur_split].n_inputs = 0; - memset(sched->splits[cur_split].inputs, 0, sizeof(sched->splits[cur_split].inputs)); //HACK - cur_allocr = node_allocr; - cur_backend_id = sched_allocr_prio(sched, cur_allocr); - } - - // find inputs that are not on the same backend - for (int j = 0; j < GGML_MAX_SRC; j++) { - struct ggml_tensor * src = node->src[j]; - if (src == NULL) { + { + int cur_split = 0; + // find the backend of the first split, skipping view ops + for (int i = 0; i < graph->n_nodes; i++) { + struct ggml_tensor * node = graph->nodes[i]; + if (!ggml_is_view_op(node->op)) { + sched->splits[0].tallocr = node_allocr(node); break; } - ggml_tallocr_t src_allocr = node_allocr(src); - if (src_allocr != node_allocr) { - int n_inputs = sched->splits[cur_split].n_inputs++; - GGML_ASSERT(n_inputs < GGML_MAX_SPLIT_INPUTS); - sched->splits[cur_split].inputs[n_inputs] = (struct ggml_tensor *)src; + } + sched->splits[0].i_start = 0; + sched->splits[0].n_inputs = 0; + memset(sched->splits[0].inputs, 0, sizeof(sched->splits[0].inputs)); //HACK + ggml_tallocr_t cur_allocr = sched->splits[0].tallocr; + size_t cur_backend_id = sched_allocr_prio(sched, cur_allocr); + for (int i = 0; i < graph->n_nodes; i++) { + struct ggml_tensor * node = graph->nodes[i]; - // create copies - size_t id = hash_id(src); - if (sched->node_copies[id][cur_backend_id] == NULL) { - struct ggml_tensor * tensor_copy = ggml_dup_tensor_layout(sched->ctx, src); - sched->node_copies[id][cur_backend_id] = tensor_copy; - node_allocr(tensor_copy) = cur_allocr; - ggml_backend_t backend = get_allocr_backend(sched, cur_allocr); - ggml_format_name(tensor_copy, "%s#%s", ggml_backend_name(backend), src->name); + if (ggml_is_view_op(node->op)) { + continue; + } + + ggml_tallocr_t node_allocr = node_allocr(node); + + if (node_allocr != cur_allocr) { + sched->splits[cur_split].i_end = i; + cur_split++; + GGML_ASSERT(cur_split < GGML_MAX_SPLITS); + sched->splits[cur_split].tallocr = node_allocr; + sched->splits[cur_split].i_start = i; + sched->splits[cur_split].n_inputs = 0; + cur_allocr = node_allocr; + cur_backend_id = sched_allocr_prio(sched, cur_allocr); + } + + // find inputs that are not on the same backend + for (int j = 0; j < GGML_MAX_SRC; j++) { + struct ggml_tensor * src = node->src[j]; + if (src == NULL) { + break; + } + ggml_tallocr_t src_allocr = node_allocr(src); + GGML_ASSERT(src_allocr != NULL); // all inputs should be assigned by now + if (src_allocr != node_allocr) { + // check if the input is already in the split + bool found = false; + for (int k = 0; k < sched->splits[cur_split].n_inputs; k++) { + if (sched->splits[cur_split].inputs[k] == src) { + found = true; + break; + } + } + + if (!found) { + int n_inputs = sched->splits[cur_split].n_inputs++; + //printf("split %d input %d: %s (%s)\n", cur_split, n_inputs, src->name, ggml_backend_name(get_allocr_backend(sched, src_allocr))); + GGML_ASSERT(n_inputs < GGML_MAX_SPLIT_INPUTS); + sched->splits[cur_split].inputs[n_inputs] = src; + } + + // create a copy of the input in the split's backend + size_t id = hash_id(src); + if (sched->node_copies[id][cur_backend_id] == NULL) { + ggml_backend_t backend = get_allocr_backend(sched, cur_allocr); + struct ggml_tensor * tensor_copy = ggml_dup_tensor_layout(sched->ctx, src); + ggml_format_name(tensor_copy, "%s#%s", ggml_backend_name(backend), src->name); + + sched->node_copies[id][cur_backend_id] = tensor_copy; + node_allocr(tensor_copy) = cur_allocr; + SET_CAUSE(tensor_copy, "4.cpy"); + } + node->src[j] = sched->node_copies[id][cur_backend_id]; } - node->src[j] = sched->node_copies[id][cur_backend_id]; } } + sched->splits[cur_split].i_end = graph->n_nodes; + sched->n_splits = cur_split + 1; } - sched->splits[cur_split].i_end = graph->n_nodes; - sched->n_splits = cur_split + 1; +#ifdef DEBUG_PASS4 + fprintf(stderr, "PASS 4 ASSIGNMENTS\n"); sched_print_assignments(sched, graph); +#endif - //fprintf(stderr, "PASS 4 ASSIGNMENTS\n"); sched_print_assignments(sched, graph); fflush(stdout); - -#if 1 +#ifndef NDEBUG // sanity check: all sources should have the same backend as the node for (int i = 0; i < graph->n_nodes; i++) { struct ggml_tensor * node = graph->nodes[i]; @@ -1059,6 +1212,11 @@ static void sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * g if (node_allocr == NULL) { fprintf(stderr, "!!!!!!! %s has no backend\n", node->name); } + if (node->view_src != NULL && node_allocr != node_allocr(node->view_src)) { + fprintf(stderr, "!!!!!!! %s has backend %s, view_src %s has backend %s\n", + node->name, node_allocr ? ggml_backend_name(get_allocr_backend(sched, node_allocr)) : "NULL", + node->view_src->name, node_allocr(node->view_src) ? ggml_backend_name(get_allocr_backend(sched, node_allocr(node->view_src))) : "NULL"); + } for (int j = 0; j < GGML_MAX_SRC; j++) { struct ggml_tensor * src = node->src[j]; if (src == NULL) { @@ -1070,8 +1228,14 @@ static void sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * g node->name, node_allocr ? ggml_backend_name(get_allocr_backend(sched, node_allocr)) : "NULL", j, src->name, src_allocr ? ggml_backend_name(get_allocr_backend(sched, src_allocr)) : "NULL"); } + if (src->view_src != NULL && src_allocr != node_allocr(src->view_src)) { + fprintf(stderr, "!!!!!!! [src] %s has backend %s, view_src %s has backend %s\n", + src->name, src_allocr ? ggml_backend_name(get_allocr_backend(sched, src_allocr)) : "NULL", + src->view_src->name, node_allocr(src->view_src) ? ggml_backend_name(get_allocr_backend(sched, node_allocr(src->view_src))) : "NULL"); + } } } + fflush(stderr); #endif // create copies of the graph for each split @@ -1085,6 +1249,8 @@ static void sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * g for (int j = 0; j < split->n_inputs; j++) { struct ggml_tensor * input = split->inputs[j]; struct ggml_tensor * input_cpy = sched->node_copies[hash_id(input)][sched_allocr_prio(sched, split->tallocr)]; + // add a dependency to the input source so that it is not freed before the copy is done + GGML_ASSERT(input_cpy->src[0] == NULL || input_cpy->src[0] == input); input_cpy->src[0] = input; graph_copy->nodes[graph_copy->n_nodes++] = input_cpy; } @@ -1119,24 +1285,16 @@ static void sched_compute_splits(ggml_backend_sched_t sched) { uint64_t copy_start_us = ggml_time_us(); for (int j = 0; j < split->n_inputs; j++) { struct ggml_tensor * input = split->inputs[j]; - struct ggml_tensor * input_cpy = sched->node_copies[hash_id(input)][sched_backend_prio(sched, split_backend)]; - if (input->buffer == NULL) { - if (input->view_src == NULL) { - fprintf(stderr, "input %s has no buffer and no view_src\n", input->name); - exit(1); - } - // FIXME: may need to use the sched buffer instead - ggml_backend_view_init(input->view_src->buffer, input); - } - if (input_cpy->buffer == NULL) { - fprintf(stderr, "input_cpy %s has no buffer\n", input_cpy->name); - exit(1); - } - //GGML_ASSERT(input->buffer->backend != input_cpy->buffer->backend); - //GGML_ASSERT(input_cpy->buffer->backend == split_backend); - ggml_backend_tensor_copy(input, input_cpy); + struct ggml_tensor * input_cpy = sched->node_copies[hash_id(input)][split_backend_id]; + + GGML_ASSERT(input->buffer != NULL); + GGML_ASSERT(input_cpy->buffer != NULL); + + // TODO: avoid this copy if it was already copied in a previous split, and the input didn't change + // this is important to avoid copying constants such as KQ_mask and inp_pos multiple times + ggml_backend_tensor_copy_async(split_backend, input, input_cpy); } - // ggml_backend_synchronize(split_backend); + //ggml_backend_synchronize(split_backend); // necessary to measure copy time int64_t copy_end_us = ggml_time_us(); copy_us[split_backend_id] += copy_end_us - copy_start_us; @@ -1148,7 +1306,7 @@ static void sched_compute_splits(ggml_backend_sched_t sched) { uint64_t compute_start_us = ggml_time_us(); ggml_backend_graph_compute(split_backend, &split->graph); - // ggml_backend_synchronize(split_backend); + //ggml_backend_synchronize(split_backend); // necessary to measure compute time uint64_t compute_end_us = ggml_time_us(); compute_us[split_backend_id] += compute_end_us - compute_start_us; } @@ -1168,26 +1326,41 @@ static void sched_reset(ggml_backend_sched_t sched) { for (int i = 0; i < sched->n_backends; i++) { ggml_tallocr_reset(sched->tallocs[i]); } + // reset state for the next run + size_t hash_size = sched->hash_set.size; + memset(sched->hash_set.keys, 0, sizeof(sched->hash_set.keys[0]) * hash_size); + memset(sched->node_talloc, 0, sizeof(sched->node_talloc[0]) * hash_size); + memset(sched->node_copies, 0, sizeof(sched->node_copies[0]) * hash_size); + + sched->is_reset = true; } -ggml_backend_sched_t ggml_backend_sched_new(ggml_backend_t * backends, int n_backends) { +ggml_backend_sched_t ggml_backend_sched_new(ggml_backend_t * backends, ggml_backend_buffer_type_t * bufts, int n_backends, size_t graph_size) { + GGML_ASSERT(n_backends > 0); GGML_ASSERT(n_backends <= GGML_MAX_BACKENDS); - struct ggml_backend_sched * sched = malloc(sizeof(struct ggml_backend_sched)); - memset(sched, 0, sizeof(struct ggml_backend_sched)); + struct ggml_backend_sched * sched = calloc(sizeof(struct ggml_backend_sched), 1); + + // initialize hash table + sched->hash_set = ggml_hash_set_new(graph_size + GGML_MAX_SPLITS*GGML_MAX_SPLIT_INPUTS); + sched->node_talloc = calloc(sizeof(sched->node_talloc[0]) * sched->hash_set.size, 1); + sched->node_copies = calloc(sizeof(sched->node_copies[0]) * sched->hash_set.size, 1); sched->n_backends = n_backends; for (int i = 0; i < n_backends; i++) { sched->backends[i] = backends[i]; + sched->bufts[i] = bufts ? bufts[i] : ggml_backend_get_default_buffer_type(backends[i]); } sched->galloc = ggml_gallocr_new(); // init measure allocs for each backend for (int i = 0; i < n_backends; i++) { - sched->tallocs[i] = ggml_tallocr_new_measure_from_backend(backends[i]); + sched->tallocs[i] = ggml_tallocr_new_measure_from_buft(sched->bufts[i]); } + sched_reset(sched); + return sched; } @@ -1199,6 +1372,7 @@ void ggml_backend_sched_free(ggml_backend_sched_t sched) { ggml_tallocr_free(sched->tallocs[i]); } ggml_gallocr_free(sched->galloc); + ggml_free(sched->ctx); free(sched->hash_set.keys); free(sched->node_talloc); free(sched->node_copies); @@ -1206,12 +1380,7 @@ void ggml_backend_sched_free(ggml_backend_sched_t sched) { } void ggml_backend_sched_init_measure(ggml_backend_sched_t sched, struct ggml_cgraph * measure_graph) { - // initialize hash tables - size_t hash_size = measure_graph->visited_hash_table.size + GGML_MAX_SPLITS*GGML_MAX_SPLIT_INPUTS; - sched->hash_set.size = hash_size; - sched->hash_set.keys = malloc(sizeof(sched->hash_set.keys[0]) * hash_size); - sched->node_talloc = malloc(sizeof(sched->node_talloc[0]) * hash_size); - sched->node_copies = malloc(sizeof(sched->node_copies[0]) * hash_size); + GGML_ASSERT(ggml_tallocr_is_measure(sched->tallocs[0])); // can only be initialized once sched_split_graph(sched, measure_graph); sched_alloc_splits(sched); @@ -1220,28 +1389,41 @@ void ggml_backend_sched_init_measure(ggml_backend_sched_t sched, struct ggml_cgr for (int i = 0; i < sched->n_backends; i++) { size_t size = ggml_tallocr_max_size(sched->tallocs[i]); ggml_tallocr_free(sched->tallocs[i]); - sched->tallocs[i] = ggml_tallocr_new_from_backend(sched->backends[i], size); + sched->tallocs[i] = ggml_tallocr_new_from_buft(sched->bufts[i], size); } sched_reset(sched); } void ggml_backend_sched_graph_compute(ggml_backend_sched_t sched, struct ggml_cgraph * graph) { - GGML_ASSERT(sched->hash_set.size >= graph->visited_hash_table.size + GGML_MAX_SPLITS*GGML_MAX_SPLIT_INPUTS); + GGML_ASSERT((int)sched->hash_set.size >= graph->n_nodes + GGML_MAX_SPLITS*GGML_MAX_SPLIT_INPUTS); + + if (!sched->is_reset) { + sched_reset(sched); + } sched_split_graph(sched, graph); sched_alloc_splits(sched); sched_compute_splits(sched); +} + +void ggml_backend_sched_reset(ggml_backend_sched_t sched) { sched_reset(sched); } +int ggml_backend_sched_get_n_splits(ggml_backend_sched_t sched) { + return sched->n_splits; +} + ggml_tallocr_t ggml_backend_sched_get_tallocr(ggml_backend_sched_t sched, ggml_backend_t backend) { int backend_index = sched_backend_prio(sched, backend); + GGML_ASSERT(backend_index >= 0 && backend_index < sched->n_backends); return sched->tallocs[backend_index]; } ggml_backend_buffer_t ggml_backend_sched_get_buffer(ggml_backend_sched_t sched, ggml_backend_t backend) { int backend_index = sched_backend_prio(sched, backend); + GGML_ASSERT(backend_index >= 0 && backend_index < sched->n_backends); return ggml_tallocr_get_buffer(sched->tallocs[backend_index]); } @@ -1251,10 +1433,19 @@ void ggml_backend_sched_set_node_backend(ggml_backend_sched_t sched, struct ggml node_allocr(node) = sched->tallocs[backend_index]; } +ggml_backend_t ggml_backend_sched_get_node_backend(ggml_backend_sched_t sched, struct ggml_tensor * node) { + ggml_tallocr_t allocr = node_allocr(node); + if (allocr == NULL) { + return NULL; + } + return get_allocr_backend(sched, allocr); +} + // utils + void ggml_backend_view_init(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) { GGML_ASSERT(tensor->buffer == NULL); - //GGML_ASSERT(tensor->data == NULL); // views of pre-allocted tensors may have the data set, but still need to be initialized + //GGML_ASSERT(tensor->data == NULL); // views of pre-allocated tensors may have the data set in ggml_new_tensor, but still need to be initialized by the backend GGML_ASSERT(tensor->view_src != NULL); GGML_ASSERT(tensor->view_src->buffer != NULL); GGML_ASSERT(tensor->view_src->data != NULL); @@ -1320,6 +1511,7 @@ static void graph_init_tensor(struct ggml_hash_set hash_set, struct ggml_tensor struct ggml_tensor * dst = node_copies[id]; if (dst->view_src != NULL) { + graph_init_tensor(hash_set, node_copies, node_init, src->view_src); ggml_backend_view_init(dst->view_src->buffer, dst); } else { @@ -1353,6 +1545,21 @@ struct ggml_backend_graph_copy ggml_backend_graph_copy(ggml_backend_t backend, s struct ggml_context * ctx_allocated = ggml_init(params); struct ggml_context * ctx_unallocated = ggml_init(params); + if (ctx_allocated == NULL || ctx_unallocated == NULL) { + fprintf(stderr, "failed to allocate context for graph copy\n"); + free(hash_set.keys); + free(node_copies); + free(node_init); + ggml_free(ctx_allocated); + ggml_free(ctx_unallocated); + return (struct ggml_backend_graph_copy) { + /* .buffer = */ NULL, + /* .ctx_allocated = */ NULL, + /* .ctx_unallocated = */ NULL, + /* .graph = */ NULL, + }; + } + // dup nodes for (int i = 0; i < graph->n_nodes; i++) { struct ggml_tensor * node = graph->nodes[i]; @@ -1361,6 +1568,20 @@ struct ggml_backend_graph_copy ggml_backend_graph_copy(ggml_backend_t backend, s // allocate nodes ggml_backend_buffer_t buffer = ggml_backend_alloc_ctx_tensors(ctx_allocated, backend); + if (buffer == NULL) { + fprintf(stderr, "failed to allocate buffer for graph copy\n"); + free(hash_set.keys); + free(node_copies); + free(node_init); + ggml_free(ctx_allocated); + ggml_free(ctx_unallocated); + return (struct ggml_backend_graph_copy) { + /* .buffer = */ NULL, + /* .ctx_allocated = */ NULL, + /* .ctx_unallocated = */ NULL, + /* .graph = */ NULL, + }; + } //printf("copy buffer size: %zu MB\n", ggml_backend_buffer_get_size(buffer) / 1024 / 1024); @@ -1397,8 +1618,12 @@ void ggml_backend_graph_copy_free(struct ggml_backend_graph_copy copy) { ggml_free(copy.ctx_unallocated); } -void ggml_backend_compare_graph_backend(ggml_backend_t backend1, ggml_backend_t backend2, struct ggml_cgraph * graph, ggml_backend_eval_callback callback, void * user_data) { +bool ggml_backend_compare_graph_backend(ggml_backend_t backend1, ggml_backend_t backend2, struct ggml_cgraph * graph, ggml_backend_eval_callback callback, void * user_data) { struct ggml_backend_graph_copy copy = ggml_backend_graph_copy(backend2, graph); + if (copy.buffer == NULL) { + return false; + } + struct ggml_cgraph * g1 = graph; struct ggml_cgraph * g2 = copy.graph; @@ -1428,4 +1653,6 @@ void ggml_backend_compare_graph_backend(ggml_backend_t backend1, ggml_backend_t } ggml_backend_graph_copy_free(copy); + + return true; } diff --git a/ggml-backend.h b/ggml-backend.h index 85ff67b0e..4eb244af1 100644 --- a/ggml-backend.h +++ b/ggml-backend.h @@ -17,22 +17,31 @@ extern "C" { // // buffer type - GGML_API ggml_backend_buffer_t ggml_backend_buft_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size); - GGML_API size_t ggml_backend_buft_get_alignment (ggml_backend_buffer_type_t buft); - GGML_API size_t ggml_backend_buft_get_alloc_size(ggml_backend_buffer_type_t buft, struct ggml_tensor * tensor); - GGML_API bool ggml_backend_buft_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend); - GGML_API bool ggml_backend_buft_is_host (ggml_backend_buffer_type_t buft); + GGML_API const char * ggml_backend_buft_name (ggml_backend_buffer_type_t buft); + GGML_API ggml_backend_buffer_t ggml_backend_buft_alloc_buffer (ggml_backend_buffer_type_t buft, size_t size); + GGML_API size_t ggml_backend_buft_get_alignment (ggml_backend_buffer_type_t buft); + GGML_API size_t ggml_backend_buft_get_alloc_size (ggml_backend_buffer_type_t buft, struct ggml_tensor * tensor); + GGML_API bool ggml_backend_buft_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend); + GGML_API bool ggml_backend_buft_is_host (ggml_backend_buffer_type_t buft); // buffer - GGML_API void ggml_backend_buffer_free (ggml_backend_buffer_t buffer); - GGML_API void * ggml_backend_buffer_get_base (ggml_backend_buffer_t buffer); - GGML_API size_t ggml_backend_buffer_get_size (ggml_backend_buffer_t buffer); - GGML_API void ggml_backend_buffer_init_tensor (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor); - GGML_API size_t ggml_backend_buffer_get_alignment (ggml_backend_buffer_t buffer); - GGML_API size_t ggml_backend_buffer_get_alloc_size(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor); - GGML_API void ggml_backend_buffer_clear (ggml_backend_buffer_t buffer, uint8_t value); - GGML_API bool ggml_backend_buffer_is_host (ggml_backend_buffer_t buffer); - GGML_API ggml_backend_buffer_type_t ggml_backend_buffer_type(ggml_backend_buffer_t buffer); + enum ggml_backend_buffer_usage { + GGML_BACKEND_BUFFER_USAGE_ANY = 0, + GGML_BACKEND_BUFFER_USAGE_WEIGHTS = 1, + }; + + GGML_API const char * ggml_backend_buffer_name (ggml_backend_buffer_t buffer); + GGML_API void ggml_backend_buffer_free (ggml_backend_buffer_t buffer); + GGML_API void * ggml_backend_buffer_get_base (ggml_backend_buffer_t buffer); + GGML_API size_t ggml_backend_buffer_get_size (ggml_backend_buffer_t buffer); + GGML_API void ggml_backend_buffer_init_tensor (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor); + GGML_API size_t ggml_backend_buffer_get_alignment (ggml_backend_buffer_t buffer); + GGML_API size_t ggml_backend_buffer_get_alloc_size(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor); + GGML_API void ggml_backend_buffer_clear (ggml_backend_buffer_t buffer, uint8_t value); + GGML_API bool ggml_backend_buffer_is_host (ggml_backend_buffer_t buffer); + GGML_API void ggml_backend_buffer_set_usage (ggml_backend_buffer_t buffer, enum ggml_backend_buffer_usage usage); + GGML_API ggml_backend_buffer_type_t ggml_backend_buffer_get_type (ggml_backend_buffer_t buffer); + GGML_API void ggml_backend_buffer_reset (ggml_backend_buffer_t buffer); // // Backend @@ -140,23 +149,24 @@ extern "C" { typedef struct ggml_backend_sched * ggml_backend_sched_t; // Initialize a backend scheduler - GGML_API ggml_backend_sched_t ggml_backend_sched_new(ggml_backend_t * backends, int n_backends); - - GGML_API void ggml_backend_sched_free(ggml_backend_sched_t sched); - + GGML_API ggml_backend_sched_t ggml_backend_sched_new(ggml_backend_t * backends, ggml_backend_buffer_type_t * bufts, int n_backends, size_t graph_size); + GGML_API void ggml_backend_sched_free(ggml_backend_sched_t sched); // Initialize backend buffers from a measure graph - GGML_API void ggml_backend_sched_init_measure(ggml_backend_sched_t sched, struct ggml_cgraph * measure_graph); + GGML_API void ggml_backend_sched_init_measure(ggml_backend_sched_t sched, struct ggml_cgraph * measure_graph); + // Get the number of splits of the last graph + GGML_API int ggml_backend_sched_get_n_splits(ggml_backend_sched_t sched); GGML_API ggml_tallocr_t ggml_backend_sched_get_tallocr(ggml_backend_sched_t sched, ggml_backend_t backend); GGML_API ggml_backend_buffer_t ggml_backend_sched_get_buffer (ggml_backend_sched_t sched, ggml_backend_t backend); - GGML_API void ggml_backend_sched_set_node_backend(ggml_backend_sched_t sched, struct ggml_tensor * node, ggml_backend_t backend); + GGML_API void ggml_backend_sched_set_node_backend(ggml_backend_sched_t sched, struct ggml_tensor * node, ggml_backend_t backend); + GGML_API ggml_backend_t ggml_backend_sched_get_node_backend(ggml_backend_sched_t sched, struct ggml_tensor * node); - // Allocate a graph on the backend scheduler - GGML_API void ggml_backend_sched_graph_compute( - ggml_backend_sched_t sched, - struct ggml_cgraph * graph); + // Allocate and compute graph on the backend scheduler + GGML_API void ggml_backend_sched_graph_compute(ggml_backend_sched_t sched, struct ggml_cgraph * graph); + // Reset all assignments and allocators - must be called before using the sched allocators to allocate inputs + GGML_API void ggml_backend_sched_reset(ggml_backend_sched_t sched); // // Utils @@ -176,7 +186,7 @@ extern "C" { typedef bool (*ggml_backend_eval_callback)(int node_index, struct ggml_tensor * t1, struct ggml_tensor * t2, void * user_data); // Compare the output of two backends - GGML_API void ggml_backend_compare_graph_backend(ggml_backend_t backend1, ggml_backend_t backend2, struct ggml_cgraph * graph, ggml_backend_eval_callback callback, void * user_data); + GGML_API bool ggml_backend_compare_graph_backend(ggml_backend_t backend1, ggml_backend_t backend2, struct ggml_cgraph * graph, ggml_backend_eval_callback callback, void * user_data); // Tensor initialization GGML_API void ggml_backend_tensor_alloc(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, void * addr); diff --git a/ggml-cuda.cu b/ggml-cuda.cu index e26260a35..2db50437c 100644 --- a/ggml-cuda.cu +++ b/ggml-cuda.cu @@ -8,8 +8,13 @@ #include #include #include +#include #include - +#include +#include +#include "ggml-cuda.h" +#include "ggml.h" +#include "ggml-backend-impl.h" #if defined(GGML_USE_HIPBLAS) #include @@ -77,6 +82,7 @@ #define cudaMemcpyKind hipMemcpyKind #define cudaMemset hipMemset #define cudaMemsetAsync hipMemsetAsync +#define cudaMemGetInfo hipMemGetInfo #define cudaOccupancyMaxPotentialBlockSize hipOccupancyMaxPotentialBlockSize #define cudaSetDevice hipSetDevice #define cudaStreamCreateWithFlags hipStreamCreateWithFlags @@ -112,9 +118,7 @@ #endif // defined(GGML_USE_HIPBLAS) -#include "ggml-cuda.h" -#include "ggml.h" -#include "ggml-backend-impl.h" +#define CUDART_HMAX 11070 // CUDA 11.7, min. ver. for which __hmax and __hmax2 are known to work (may be higher than needed) #define CC_PASCAL 600 #define MIN_CC_DP4A 610 // minimum compute capability for __dp4a, an intrinsic for byte-wise dot products @@ -486,6 +490,15 @@ typedef struct { } block_iq2_xxs; static_assert(sizeof(block_iq2_xxs) == sizeof(ggml_fp16_t) + QK_K/8*sizeof(uint16_t), "wrong iq2_xxs block size/padding"); +#define QR2_XS 8 +#define QI2_XS (QK_K / (4*QR2_XS)) +typedef struct { + half d; + uint16_t qs[QK_K/8]; + uint8_t scales[QK_K/32]; +} block_iq2_xs; +static_assert(sizeof(block_iq2_xs) == sizeof(ggml_fp16_t) + QK_K/8*sizeof(uint16_t) + QK_K/32, "wrong iq2_xs block size/padding"); + #define WARP_SIZE 32 #define MATRIX_ROW_PADDING 512 // last row of quant. matrices is a multiple of this to avoid out-of-bounds memory accesses @@ -553,7 +566,7 @@ static void ggml_cuda_set_device(const int device) { static int g_device_count = -1; static int g_main_device = 0; -static float g_tensor_split[GGML_CUDA_MAX_DEVICES] = {0}; +static std::array g_default_tensor_split = {}; struct cuda_device_capabilities { int cc; // compute capability @@ -564,10 +577,6 @@ struct cuda_device_capabilities { static cuda_device_capabilities g_device_caps[GGML_CUDA_MAX_DEVICES] = { {0, 0, false, 0} }; -static void * g_scratch_buffer = nullptr; -static size_t g_scratch_size = 0; // disabled by default -static size_t g_scratch_offset = 0; - static cublasHandle_t g_cublas_handles[GGML_CUDA_MAX_DEVICES] = {nullptr}; [[noreturn]] @@ -596,16 +605,16 @@ static __device__ __forceinline__ float2 warp_reduce_sum(float2 a) { } static __device__ __forceinline__ half2 warp_reduce_sum(half2 a) { -#if __CUDA_ARCH__ < CC_PASCAL || (defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) - (void) a; - bad_arch(); -#else +#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL #pragma unroll for (int mask = 16; mask > 0; mask >>= 1) { a = __hadd2(a, __shfl_xor_sync(0xffffffff, a, mask, 32)); } return a; -#endif // __CUDA_ARCH__ < CC_PASCAL || (defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) +#else + (void) a; + bad_arch(); +#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL } static __device__ __forceinline__ float warp_reduce_max(float x) { @@ -617,16 +626,16 @@ static __device__ __forceinline__ float warp_reduce_max(float x) { } static __device__ __forceinline__ half2 warp_reduce_max(half2 x) { -#if __CUDA_ARCH__ < CC_PASCAL || (defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) - (void) x; - bad_arch(); -#else +#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL && CUDART_VERSION >= CUDART_HMAX #pragma unroll for (int mask = 16; mask > 0; mask >>= 1) { x = __hmax2(x, __shfl_xor_sync(0xffffffff, x, mask, 32)); } return x; -#endif // __CUDA_ARCH__ < CC_PASCAL || (defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) +#else + (void) x; + bad_arch(); +#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL && CUDART_VERSION >= CUDART_HMAX } static __device__ __forceinline__ float op_repeat(const float a, const float b) { @@ -1328,7 +1337,7 @@ static __global__ void dequantize_block_q6_K(const void * __restrict__ vx, dst_t #endif } -static const __device__ uint64_t kgrid_iq2xxs[256] = { +static const __device__ uint64_t iq2xxs_grid[256] = { 0x0808080808080808, 0x080808080808082b, 0x0808080808081919, 0x0808080808082b08, 0x0808080808082b2b, 0x0808080808190819, 0x0808080808191908, 0x08080808082b0808, 0x08080808082b082b, 0x08080808082b2b08, 0x08080808082b2b2b, 0x0808080819080819, @@ -1395,6 +1404,137 @@ static const __device__ uint64_t kgrid_iq2xxs[256] = { 0x2b2b082b08080808, 0x2b2b190808192b08, 0x2b2b2b0819190808, 0x2b2b2b1908081908, }; +static const __device__ uint64_t iq2xs_grid[512] = { + 0x0808080808080808, 0x080808080808082b, 0x0808080808081919, 0x0808080808082b08, + 0x0808080808082b2b, 0x0808080808190819, 0x0808080808191908, 0x080808080819192b, + 0x0808080808192b19, 0x08080808082b0808, 0x08080808082b082b, 0x08080808082b1919, + 0x08080808082b2b08, 0x0808080819080819, 0x0808080819081908, 0x080808081908192b, + 0x0808080819082b19, 0x0808080819190808, 0x080808081919082b, 0x0808080819191919, + 0x0808080819192b08, 0x08080808192b0819, 0x08080808192b1908, 0x080808082b080808, + 0x080808082b08082b, 0x080808082b081919, 0x080808082b082b08, 0x080808082b190819, + 0x080808082b191908, 0x080808082b192b19, 0x080808082b2b0808, 0x0808081908080819, + 0x0808081908081908, 0x080808190808192b, 0x0808081908082b19, 0x0808081908190808, + 0x080808190819082b, 0x0808081908191919, 0x0808081908192b08, 0x0808081908192b2b, + 0x08080819082b0819, 0x08080819082b1908, 0x0808081919080808, 0x080808191908082b, + 0x0808081919081919, 0x0808081919082b08, 0x0808081919190819, 0x0808081919191908, + 0x08080819192b0808, 0x08080819192b2b08, 0x080808192b080819, 0x080808192b081908, + 0x080808192b190808, 0x0808082b08080808, 0x0808082b0808082b, 0x0808082b08081919, + 0x0808082b08082b08, 0x0808082b08190819, 0x0808082b08191908, 0x0808082b082b0808, + 0x0808082b19080819, 0x0808082b19081908, 0x0808082b19190808, 0x0808082b19191919, + 0x0808082b2b080808, 0x0808082b2b082b2b, 0x0808190808080819, 0x0808190808081908, + 0x080819080808192b, 0x0808190808082b19, 0x0808190808190808, 0x080819080819082b, + 0x0808190808191919, 0x0808190808192b08, 0x08081908082b0819, 0x08081908082b1908, + 0x0808190819080808, 0x080819081908082b, 0x0808190819081919, 0x0808190819082b08, + 0x0808190819190819, 0x0808190819191908, 0x080819081919192b, 0x08081908192b0808, + 0x080819082b080819, 0x080819082b081908, 0x080819082b190808, 0x0808191908080808, + 0x080819190808082b, 0x0808191908081919, 0x0808191908082b08, 0x0808191908190819, + 0x0808191908191908, 0x08081919082b0808, 0x0808191919080819, 0x0808191919081908, + 0x0808191919190808, 0x08081919192b0819, 0x080819192b080808, 0x0808192b08080819, + 0x0808192b08081908, 0x0808192b08190808, 0x0808192b082b192b, 0x0808192b19080808, + 0x0808192b1908082b, 0x0808192b2b081908, 0x08082b0808080808, 0x08082b080808082b, + 0x08082b0808081919, 0x08082b0808082b08, 0x08082b0808082b2b, 0x08082b0808190819, + 0x08082b0808191908, 0x08082b08082b0808, 0x08082b08082b1919, 0x08082b0819080819, + 0x08082b0819081908, 0x08082b0819190808, 0x08082b0819192b08, 0x08082b082b080808, + 0x08082b082b2b0808, 0x08082b082b2b2b2b, 0x08082b1908080819, 0x08082b1908081908, + 0x08082b1908190808, 0x08082b1919080808, 0x08082b192b080819, 0x08082b192b082b19, + 0x08082b2b08080808, 0x08082b2b082b0808, 0x08082b2b082b2b08, 0x08082b2b2b19192b, + 0x08082b2b2b2b0808, 0x0819080808080819, 0x0819080808081908, 0x081908080808192b, + 0x0819080808082b19, 0x0819080808190808, 0x081908080819082b, 0x0819080808191919, + 0x0819080808192b08, 0x08190808082b0819, 0x08190808082b1908, 0x0819080819080808, + 0x081908081908082b, 0x0819080819081919, 0x0819080819082b08, 0x0819080819190819, + 0x0819080819191908, 0x08190808192b0808, 0x08190808192b2b2b, 0x081908082b080819, + 0x081908082b081908, 0x081908082b190808, 0x0819081908080808, 0x081908190808082b, + 0x0819081908081919, 0x0819081908082b08, 0x0819081908190819, 0x0819081908191908, + 0x08190819082b0808, 0x0819081919080819, 0x0819081919081908, 0x0819081919190808, + 0x081908192b080808, 0x081908192b191908, 0x081908192b19192b, 0x0819082b08080819, + 0x0819082b08081908, 0x0819082b0808192b, 0x0819082b08190808, 0x0819082b19080808, + 0x0819082b192b0808, 0x0819190808080808, 0x081919080808082b, 0x0819190808081919, + 0x0819190808082b08, 0x0819190808190819, 0x0819190808191908, 0x08191908082b0808, + 0x0819190819080819, 0x0819190819081908, 0x0819190819082b19, 0x0819190819190808, + 0x08191908192b1908, 0x081919082b080808, 0x0819191908080819, 0x0819191908081908, + 0x0819191908190808, 0x0819191919080808, 0x0819192b08080808, 0x0819192b08191908, + 0x0819192b19082b19, 0x08192b0808080819, 0x08192b0808081908, 0x08192b0808190808, + 0x08192b080819082b, 0x08192b0819080808, 0x08192b0819191908, 0x08192b082b08192b, + 0x08192b1908080808, 0x08192b1908081919, 0x08192b19192b192b, 0x08192b2b19190819, + 0x08192b2b2b2b2b19, 0x082b080808080808, 0x082b08080808082b, 0x082b080808081919, + 0x082b080808082b08, 0x082b080808082b2b, 0x082b080808190819, 0x082b080808191908, + 0x082b0808082b0808, 0x082b080819080819, 0x082b080819081908, 0x082b080819190808, + 0x082b08082b080808, 0x082b08082b2b0808, 0x082b081908080819, 0x082b081908081908, + 0x082b081908190808, 0x082b081919080808, 0x082b081919082b08, 0x082b0819192b1919, + 0x082b082b08080808, 0x082b082b082b082b, 0x082b082b2b080808, 0x082b082b2b2b2b08, + 0x082b190808080819, 0x082b190808081908, 0x082b190808190808, 0x082b1908082b2b19, + 0x082b190819080808, 0x082b191908080808, 0x082b191919080819, 0x082b19191919082b, + 0x082b19192b192b19, 0x082b192b08080819, 0x082b192b08192b2b, 0x082b192b2b2b192b, + 0x082b2b0808080808, 0x082b2b0808082b08, 0x082b2b0808082b2b, 0x082b2b08082b0808, + 0x082b2b0819191919, 0x082b2b082b082b08, 0x082b2b082b2b082b, 0x082b2b19192b2b08, + 0x082b2b192b190808, 0x082b2b2b08082b08, 0x082b2b2b082b0808, 0x082b2b2b2b08082b, + 0x082b2b2b2b082b08, 0x082b2b2b2b082b2b, 0x1908080808080819, 0x1908080808081908, + 0x190808080808192b, 0x1908080808082b19, 0x1908080808190808, 0x190808080819082b, + 0x1908080808191919, 0x1908080808192b08, 0x19080808082b0819, 0x19080808082b1908, + 0x1908080819080808, 0x190808081908082b, 0x1908080819081919, 0x1908080819082b08, + 0x1908080819082b2b, 0x1908080819190819, 0x1908080819191908, 0x19080808192b0808, + 0x19080808192b1919, 0x190808082b080819, 0x190808082b081908, 0x190808082b190808, + 0x1908081908080808, 0x190808190808082b, 0x1908081908081919, 0x1908081908082b08, + 0x1908081908190819, 0x1908081908191908, 0x19080819082b0808, 0x1908081919080819, + 0x1908081919081908, 0x1908081919190808, 0x190808192b080808, 0x190808192b081919, + 0x190808192b2b082b, 0x1908082b08080819, 0x1908082b08081908, 0x1908082b08190808, + 0x1908082b0819082b, 0x1908082b082b2b19, 0x1908082b19080808, 0x1908190808080808, + 0x190819080808082b, 0x1908190808081919, 0x1908190808082b08, 0x1908190808190819, + 0x1908190808191908, 0x1908190808192b19, 0x19081908082b0808, 0x1908190819080819, + 0x1908190819081908, 0x1908190819190808, 0x190819082b080808, 0x190819082b191908, + 0x1908191908080819, 0x1908191908081908, 0x1908191908190808, 0x19081919082b1908, + 0x1908191919080808, 0x190819192b192b2b, 0x1908192b08080808, 0x1908192b08082b2b, + 0x1908192b19081908, 0x1908192b19190808, 0x19082b0808080819, 0x19082b0808081908, + 0x19082b0808190808, 0x19082b0819080808, 0x19082b0819081919, 0x19082b0819191908, + 0x19082b08192b082b, 0x19082b1908080808, 0x19082b1908190819, 0x19082b1919081908, + 0x19082b1919190808, 0x19082b19192b2b19, 0x19082b2b08081908, 0x1919080808080808, + 0x191908080808082b, 0x1919080808081919, 0x1919080808082b08, 0x1919080808190819, + 0x1919080808191908, 0x19190808082b0808, 0x19190808082b2b08, 0x1919080819080819, + 0x1919080819081908, 0x1919080819190808, 0x191908082b080808, 0x1919081908080819, + 0x1919081908081908, 0x1919081908190808, 0x1919081908191919, 0x1919081919080808, + 0x191908191908082b, 0x1919082b08080808, 0x1919082b19081908, 0x1919082b2b2b2b2b, + 0x1919190808080819, 0x1919190808081908, 0x1919190808190808, 0x19191908082b0819, + 0x1919190819080808, 0x19191908192b0808, 0x191919082b080819, 0x191919082b2b0819, + 0x1919191908080808, 0x1919191908082b08, 0x191919192b080808, 0x191919192b082b08, + 0x1919192b082b0819, 0x1919192b192b2b08, 0x1919192b2b2b0819, 0x19192b0808080808, + 0x19192b0808191908, 0x19192b0819080819, 0x19192b0819190808, 0x19192b082b192b19, + 0x19192b1908192b2b, 0x19192b1919080808, 0x19192b191908082b, 0x19192b2b2b081919, + 0x192b080808080819, 0x192b080808081908, 0x192b080808190808, 0x192b080819080808, + 0x192b080819191908, 0x192b0808192b082b, 0x192b08082b08192b, 0x192b08082b2b2b19, + 0x192b081908080808, 0x192b082b082b1908, 0x192b082b19082b2b, 0x192b082b2b19082b, + 0x192b190808080808, 0x192b19080819192b, 0x192b191908190808, 0x192b191919080808, + 0x192b191919081919, 0x192b19192b2b1908, 0x192b2b0808080819, 0x192b2b08192b2b2b, + 0x192b2b19082b1919, 0x192b2b2b0808192b, 0x192b2b2b19191908, 0x192b2b2b192b082b, + 0x2b08080808080808, 0x2b0808080808082b, 0x2b08080808081919, 0x2b08080808082b08, + 0x2b08080808190819, 0x2b08080808191908, 0x2b080808082b0808, 0x2b080808082b2b2b, + 0x2b08080819080819, 0x2b08080819081908, 0x2b08080819190808, 0x2b0808082b080808, + 0x2b0808082b08082b, 0x2b0808082b2b2b08, 0x2b0808082b2b2b2b, 0x2b08081908080819, + 0x2b08081908081908, 0x2b0808190808192b, 0x2b08081908190808, 0x2b08081919080808, + 0x2b08081919190819, 0x2b08081919192b19, 0x2b08082b08080808, 0x2b08082b082b0808, + 0x2b08082b2b080808, 0x2b08082b2b08082b, 0x2b08082b2b2b0808, 0x2b08082b2b2b2b08, + 0x2b08190808080819, 0x2b08190808081908, 0x2b08190808190808, 0x2b0819080819082b, + 0x2b08190808191919, 0x2b08190819080808, 0x2b081908192b0808, 0x2b0819082b082b19, + 0x2b08191908080808, 0x2b08191919081908, 0x2b0819192b2b1919, 0x2b08192b08192b08, + 0x2b08192b192b2b2b, 0x2b082b0808080808, 0x2b082b0808082b08, 0x2b082b08082b1919, + 0x2b082b0819192b2b, 0x2b082b082b080808, 0x2b082b082b08082b, 0x2b082b082b2b2b08, + 0x2b082b190808192b, 0x2b082b2b082b082b, 0x2b082b2b2b080808, 0x2b082b2b2b082b08, + 0x2b082b2b2b19192b, 0x2b082b2b2b2b2b08, 0x2b19080808080819, 0x2b19080808081908, + 0x2b19080808190808, 0x2b19080819080808, 0x2b1908081919192b, 0x2b1908082b081908, + 0x2b19081908080808, 0x2b190819082b082b, 0x2b190819192b1908, 0x2b19082b1919192b, + 0x2b19082b2b082b19, 0x2b19190808080808, 0x2b19190808081919, 0x2b19190819081908, + 0x2b19190819190808, 0x2b19190819192b08, 0x2b191919082b2b19, 0x2b1919192b190808, + 0x2b1919192b19082b, 0x2b19192b19080819, 0x2b192b0819190819, 0x2b192b082b2b192b, + 0x2b192b1919082b19, 0x2b192b2b08191919, 0x2b192b2b192b0808, 0x2b2b080808080808, + 0x2b2b08080808082b, 0x2b2b080808082b08, 0x2b2b080808082b2b, 0x2b2b0808082b0808, + 0x2b2b0808082b2b2b, 0x2b2b08082b2b0808, 0x2b2b081919190819, 0x2b2b081919192b19, + 0x2b2b08192b2b192b, 0x2b2b082b08080808, 0x2b2b082b0808082b, 0x2b2b082b08082b08, + 0x2b2b082b082b2b2b, 0x2b2b082b2b080808, 0x2b2b082b2b2b0808, 0x2b2b190819080808, + 0x2b2b19082b191919, 0x2b2b192b192b1919, 0x2b2b192b2b192b08, 0x2b2b2b0808082b2b, + 0x2b2b2b08082b0808, 0x2b2b2b08082b082b, 0x2b2b2b08082b2b08, 0x2b2b2b082b2b0808, + 0x2b2b2b082b2b2b08, 0x2b2b2b1908081908, 0x2b2b2b192b081908, 0x2b2b2b192b08192b, + 0x2b2b2b2b082b2b08, 0x2b2b2b2b082b2b2b, 0x2b2b2b2b2b190819, 0x2b2b2b2b2b2b2b2b, +}; + static const __device__ uint8_t ksigns_iq2xs[128] = { 0, 129, 130, 3, 132, 5, 6, 135, 136, 9, 10, 139, 12, 141, 142, 15, 144, 17, 18, 147, 20, 149, 150, 23, 24, 153, 154, 27, 156, 29, 30, 159, @@ -1439,7 +1579,7 @@ static __global__ void dequantize_block_iq2_xxs(const void * __restrict__ vx, ds dst_t * y = yy + i*QK_K + 32*ib + 8*il; const uint16_t * q2 = x[i].qs + 4*ib; const uint8_t * aux8 = (const uint8_t *)q2; - const uint8_t * grid = (const uint8_t *)(kgrid_iq2xxs + aux8[il]); + const uint8_t * grid = (const uint8_t *)(iq2xxs_grid + aux8[il]); const uint32_t aux32 = q2[2] | (q2[3] << 16); const float d = (float)x[i].d * (0.5f + (aux32 >> 28)) * 0.25f; const uint8_t signs = ksigns_iq2xs[(aux32 >> 7*il) & 127]; @@ -1450,6 +1590,28 @@ static __global__ void dequantize_block_iq2_xxs(const void * __restrict__ vx, ds } +template +static __global__ void dequantize_block_iq2_xs(const void * __restrict__ vx, dst_t * __restrict__ yy) { + + const int i = blockIdx.x; + const block_iq2_xs * x = (const block_iq2_xs *) vx; + + const int tid = threadIdx.x; +#if QK_K == 256 + const int il = tid/8; // 0...3 + const int ib = tid%8; // 0...7 + dst_t * y = yy + i*QK_K + 32*ib + 8*il; + const uint16_t * q2 = x[i].qs + 4*ib; + const uint8_t * grid = (const uint8_t *)(iq2xs_grid + (q2[il] & 511)); + const float d = (float)x[i].d * (0.5f + ((x[i].scales[ib] >> 4*(il/2)) & 0xf)) * 0.25f; + const uint8_t signs = ksigns_iq2xs[q2[il] >> 9]; + for (int j = 0; j < 8; ++j) y[j] = d * grid[j] * (signs & kmask_iq2xs[j] ? -1.f : 1.f); +#else + assert(false); +#endif + +} + static __global__ void dequantize_mul_mat_vec_q2_k(const void * __restrict__ vx, const float * __restrict__ yy, float * __restrict__ dst, const int ncols, int nrows) { static_assert(16%K_QUANTS_PER_ITERATION == 0, "16 must be divisible by K_QUANTS_PER_ITERATION"); @@ -3996,7 +4158,7 @@ static __device__ __forceinline__ float vec_dot_iq2_xxs_q8_1( uint32_t aux32 = q2[2] | (q2[3] << 16); int sumi = 0; for (int l = 0; l < 4; ++l) { - const uint8_t * grid = (const uint8_t *)(kgrid_iq2xxs + aux8[l]); + const uint8_t * grid = (const uint8_t *)(iq2xxs_grid + aux8[l]); const uint8_t signs = ksigns_iq2xs[aux32 & 127]; for (int j = 0; j < 8; ++j) { sumi += q8[j] * grid[j] * (signs & kmask_iq2xs[j] ? -1 : 1); @@ -4012,8 +4174,8 @@ static __device__ __forceinline__ float vec_dot_iq2_xxs_q8_1( const int il = iqs%2; const uint16_t * q2 = bq2->qs + 4*ib32; const uint8_t * aux8 = (const uint8_t *)q2; - const uint8_t * grid1 = (const uint8_t *)(kgrid_iq2xxs + aux8[2*il+0]); - const uint8_t * grid2 = (const uint8_t *)(kgrid_iq2xxs + aux8[2*il+1]); + const uint8_t * grid1 = (const uint8_t *)(iq2xxs_grid + aux8[2*il+0]); + const uint8_t * grid2 = (const uint8_t *)(iq2xxs_grid + aux8[2*il+1]); const uint32_t aux32 = q2[2] | (q2[3] << 16); const float d = (float)bq2->d * (0.5f + (aux32 >> 28)) * (float)bq8_1[ib32].ds.x * 0.25f; const uint8_t signs1 = ksigns_iq2xs[(aux32 >> 14*il) & 127]; @@ -4032,6 +4194,42 @@ static __device__ __forceinline__ float vec_dot_iq2_xxs_q8_1( #endif } +static __device__ __forceinline__ float vec_dot_iq2_xs_q8_1( + const void * __restrict__ vbq, const block_q8_1 * __restrict__ bq8_1, const int & iqs) { +#if QK_K == 256 + const block_iq2_xs * bq2 = (const block_iq2_xs *) vbq; + + const int ib32 = iqs; + const uint16_t * q2 = bq2->qs + 4*ib32; + const int8_t * q8 = bq8_1[ib32].qs; + const uint8_t ls1 = bq2->scales[ib32] & 0xf; + const uint8_t ls2 = bq2->scales[ib32] >> 4; + int sumi1 = 0; + for (int l = 0; l < 2; ++l) { + const uint8_t * grid = (const uint8_t *)(iq2xs_grid + (q2[l] & 511)); + const uint8_t signs = ksigns_iq2xs[q2[l] >> 9]; + for (int j = 0; j < 8; ++j) { + sumi1 += q8[j] * grid[j] * (signs & kmask_iq2xs[j] ? -1 : 1); + } + q8 += 8; + } + int sumi2 = 0; + for (int l = 2; l < 4; ++l) { + const uint8_t * grid = (const uint8_t *)(iq2xs_grid + (q2[l] & 511)); + const uint8_t signs = ksigns_iq2xs[q2[l] >> 9]; + for (int j = 0; j < 8; ++j) { + sumi2 += q8[j] * grid[j] * (signs & kmask_iq2xs[j] ? -1 : 1); + } + q8 += 8; + } + const float d = (float)bq2->d * (float)bq8_1[ib32].ds.x * 0.25f; + return d * ((0.5f + ls1) * sumi1 + (0.5f + ls2) * sumi2); +#else + assert(false); + return 0.f; +#endif +} + template static __device__ __forceinline__ void mul_mat_q( @@ -5415,7 +5613,7 @@ static __global__ void diag_mask_inf_f32(const float * x, float * dst, const int template static __global__ void soft_max_f16(const float * x, const float * y, float * dst, const int ncols_par, const int nrows_y, const float scale) { -#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL +#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL && CUDART_VERSION >= CUDART_HMAX const int ncols_data = ncols_template == 0 ? ncols_par : ncols_template; const int ncols_smem = GGML_PAD(ncols_data, 2*WARP_SIZE)/2; @@ -5540,7 +5738,7 @@ static __global__ void soft_max_f16(const float * x, const float * y, float * ds #else (void) x; (void) y; (void) dst; (void) ncols_par; (void) nrows_y; (void) scale; bad_arch(); -#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL +#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL && CUDART_VERSION >= CUDART_HMAX } template @@ -6035,6 +6233,12 @@ static void dequantize_row_iq2_xxs_cuda(const void * vx, dst_t * y, const int k, dequantize_block_iq2_xxs<<>>(vx, y); } +template +static void dequantize_row_iq2_xs_cuda(const void * vx, dst_t * y, const int k, cudaStream_t stream) { + const int nb = k / QK_K; + dequantize_block_iq2_xs<<>>(vx, y); +} + template static void convert_unary_cuda(const void * __restrict__ vx, dst_t * __restrict__ y, const int k, cudaStream_t stream) { const int num_blocks = (k + CUDA_DEQUANTIZE_BLOCK_SIZE - 1) / CUDA_DEQUANTIZE_BLOCK_SIZE; @@ -6065,6 +6269,8 @@ static to_fp16_cuda_t ggml_get_to_fp16_cuda(ggml_type type) { return dequantize_row_q6_K_cuda; case GGML_TYPE_IQ2_XXS: return dequantize_row_iq2_xxs_cuda; + case GGML_TYPE_IQ2_XS: + return dequantize_row_iq2_xs_cuda; case GGML_TYPE_F32: return convert_unary_cuda; default: @@ -6096,6 +6302,8 @@ static to_fp32_cuda_t ggml_get_to_fp32_cuda(ggml_type type) { return dequantize_row_q6_K_cuda; case GGML_TYPE_IQ2_XXS: return dequantize_row_iq2_xxs_cuda; + case GGML_TYPE_IQ2_XS: + return dequantize_row_iq2_xs_cuda; case GGML_TYPE_F16: return convert_unary_cuda; default: @@ -6299,6 +6507,15 @@ static void mul_mat_vec_iq2_xxs_q8_1_cuda(const void * vx, const void * vy, floa <<>>(vx, vy, dst, ncols, nrows); } +static void mul_mat_vec_iq2_xs_q8_1_cuda(const void * vx, const void * vy, float * dst, const int ncols, const int nrows, cudaStream_t stream) { + GGML_ASSERT(ncols % QK_K == 0); + const int block_num_y = (nrows + GGML_CUDA_MMV_Y - 1) / GGML_CUDA_MMV_Y; + const dim3 block_nums(block_num_y, 1, 1); + const dim3 block_dims(WARP_SIZE, GGML_CUDA_MMV_Y, 1); + mul_mat_vec_q + <<>>(vx, vy, dst, ncols, nrows); +} + static void ggml_mul_mat_q4_0_q8_1_cuda( const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x, const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) { @@ -7329,8 +7546,9 @@ void ggml_init_cublas() { CUDA_CHECK(cudaGetDeviceProperties(&prop, id)); fprintf(stderr, " Device %d: %s, compute capability %d.%d, VMM: %s\n", id, prop.name, prop.major, prop.minor, device_vmm ? "yes" : "no"); - g_tensor_split[id] = total_vram; + g_default_tensor_split[id] = total_vram; total_vram += prop.totalGlobalMem; + #if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) g_device_caps[id].cc = 100*prop.major + 10*prop.minor + CC_OFFSET_AMD; #else @@ -7339,7 +7557,7 @@ void ggml_init_cublas() { g_device_caps[id].smpb = prop.sharedMemPerBlock; } for (int id = 0; id < g_device_count; ++id) { - g_tensor_split[id] /= total_vram; + g_default_tensor_split[id] /= total_vram; } for (int id = 0; id < g_device_count; ++id) { @@ -7363,30 +7581,6 @@ void ggml_init_cublas() { } } -void ggml_cuda_set_tensor_split(const float * tensor_split) { - if (tensor_split == nullptr) { - return; - } - bool all_zero = true; - for (int i = 0; i < g_device_count; ++i) { - if (tensor_split[i] != 0.0f) { - all_zero = false; - break; - } - } - if (all_zero) { - return; - } - float split_sum = 0.0f; - for (int i = 0; i < g_device_count; ++i) { - g_tensor_split[i] = split_sum; - split_sum += tensor_split[i]; - } - for (int i = 0; i < g_device_count; ++i) { - g_tensor_split[i] /= split_sum; - } -} - void * ggml_cuda_host_malloc(size_t size) { if (getenv("GGML_CUDA_NO_PINNED") != nullptr) { return nullptr; @@ -7838,11 +8032,11 @@ static void ggml_cuda_op_mul_mat_q( (void) src1_ddf_i; } -static int64_t get_row_rounding(ggml_type type) { +static int64_t get_row_rounding(ggml_type type, const std::array & tensor_split) { int64_t min_compute_capability = INT_MAX; int64_t max_compute_capability = INT_MIN; for (int id = 0; id < g_device_count; ++id) { - if (g_tensor_split[id] < (id + 1 < g_device_count ? g_tensor_split[id + 1] : 1.0f)) { + if (tensor_split[id] < (id + 1 < g_device_count ? tensor_split[id + 1] : 1.0f)) { if (min_compute_capability > g_device_caps[id].cc) { min_compute_capability = g_device_caps[id].cc; } @@ -7871,6 +8065,7 @@ static int64_t get_row_rounding(ggml_type type) { case GGML_TYPE_Q5_K: case GGML_TYPE_Q6_K: case GGML_TYPE_IQ2_XXS: + case GGML_TYPE_IQ2_XS: return max_compute_capability >= CC_RDNA2 ? 128 : 64; default: GGML_ASSERT(false); @@ -7892,6 +8087,7 @@ static int64_t get_row_rounding(ggml_type type) { case GGML_TYPE_Q4_K: case GGML_TYPE_Q5_K: case GGML_TYPE_IQ2_XXS: + case GGML_TYPE_IQ2_XS: return max_compute_capability >= CC_VOLTA ? 128 : 64; case GGML_TYPE_Q6_K: return 64; @@ -7901,6 +8097,21 @@ static int64_t get_row_rounding(ggml_type type) { #endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) } +static void get_row_split(int64_t * row_low, int64_t * row_high, const ggml_tensor * tensor, const std::array & tensor_split, int id) { + const int64_t nrows = ggml_nrows(tensor); + const int64_t rounding = get_row_rounding(tensor->type, tensor_split); + + *row_low = id == 0 ? 0 : nrows*tensor_split[id]; + *row_low -= *row_low % rounding; + + if (id == g_device_count - 1) { + *row_high = nrows; + } else { + *row_high = nrows*tensor_split[id + 1]; + *row_high -= *row_high % rounding; + } +} + static void ggml_cuda_op_mul_mat_vec_q( const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const char * src0_dd_i, const float * src1_ddf_i, const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low, const int64_t row_high, const int64_t src1_ncols, @@ -7945,6 +8156,9 @@ static void ggml_cuda_op_mul_mat_vec_q( case GGML_TYPE_IQ2_XXS: mul_mat_vec_iq2_xxs_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream); break; + case GGML_TYPE_IQ2_XS: + mul_mat_vec_iq2_xs_q8_1_cuda(src0_dd_i, src1_ddq_i, dst_dd_i, ne00, row_diff, stream); + break; default: GGML_ASSERT(false); break; @@ -8352,15 +8566,15 @@ static void ggml_cuda_op_soft_max( float scale = 1.0f; memcpy(&scale, dst->op_params, sizeof(float)); -#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) - const bool use_f16_soft_max = false; -#else +#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && CUDART_VERSION >= CUDART_HMAX #ifdef GGML_CUDA_F16 const bool use_f16_soft_max = true; #else const bool use_f16_soft_max = false; #endif // GGML_CUDA_F16 -#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) +#else + const bool use_f16_soft_max = false; +#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) && CUDART_VERSION >= CUDART_HMAX if (use_f16_soft_max) { soft_max_f16_cuda(src0_dd, src1 ? src1_dd : nullptr, dst_dd, ne00, nrows_x, nrows_y, scale, main_stream); @@ -8515,6 +8729,11 @@ static void ggml_cuda_set_peer_access(const int n_tokens) { peer_access_enabled = enable_peer_access; } +// FIXME: move this somewhere else +struct ggml_backend_cuda_split_buffer_type_context { + std::array tensor_split; +}; + static void ggml_cuda_op_mul_mat( const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, ggml_cuda_op_mul_mat_t op, const bool convert_src1_to_q8_1) { @@ -8566,6 +8785,14 @@ static void ggml_cuda_op_mul_mat( GGML_ASSERT(!(split && ne03 > 1)); GGML_ASSERT(!(split && ne02 < ne12)); + std::array tensor_split; + if (split) { + // TODO: check that src0->buffer->buft is a split buffer type, replace GGML_BACKEND_GPU_SPLIT check + // GGML_ASSERT(src0->buffer != nullptr && src0->buffer->buft == ...); + ggml_backend_cuda_split_buffer_type_context * buft_ctx = (ggml_backend_cuda_split_buffer_type_context *) src0->buffer->buft->context; + tensor_split = buft_ctx->tensor_split; + } + struct dev_data { cuda_pool_alloc src0_dd_alloc; cuda_pool_alloc src1_ddf_alloc; @@ -8593,17 +8820,17 @@ static void ggml_cuda_op_mul_mat( // for multi GPU, get the row boundaries from tensor split // and round to mul_mat_q tile sizes if (split) { - const int64_t rounding = get_row_rounding(src0->type); + const int64_t rounding = get_row_rounding(src0->type, tensor_split); if (id != 0) { - dev[id].row_low = ne01*g_tensor_split[id]; + dev[id].row_low = ne01*tensor_split[id]; if (dev[id].row_low < ne01) { dev[id].row_low -= dev[id].row_low % rounding; } } if (id != g_device_count - 1) { - dev[id].row_high = ne01*g_tensor_split[id + 1]; + dev[id].row_high = ne01*tensor_split[id + 1]; if (dev[id].row_high < ne01) { dev[id].row_high -= dev[id].row_high % rounding; } @@ -9149,10 +9376,17 @@ static void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1 const bool split = src0->backend == GGML_BACKEND_GPU_SPLIT; int64_t min_compute_capability = INT_MAX; - for (int id = 0; id < g_device_count; ++id) { - if (min_compute_capability > g_device_caps[id].cc && g_tensor_split[id] < (id + 1 < g_device_count ? g_tensor_split[id + 1] : 1.0f)) { - min_compute_capability = g_device_caps[id].cc; + + if (split) { + ggml_backend_cuda_split_buffer_type_context * buft_ctx = (ggml_backend_cuda_split_buffer_type_context *) src0->buffer->buft->context; + auto & tensor_split = buft_ctx->tensor_split; + for (int id = 0; id < g_device_count; ++id) { + if (min_compute_capability > g_device_caps[id].cc && tensor_split[id] < (id + 1 < g_device_count ? tensor_split[id + 1] : 1.0f)) { + min_compute_capability = g_device_caps[id].cc; + } } + } else { + min_compute_capability = g_device_caps[g_main_device].cc; } #if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) @@ -9191,7 +9425,7 @@ static void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1 } else if (!split && all_on_device && !fp16_performance_good && src0->type == GGML_TYPE_F16 && !ggml_is_contiguous(src0) && !ggml_is_transposed(src1) && src1->ne[1] == 1) { // KQV single-batch ggml_cuda_mul_mat_vec_nc(src0, src1, dst); - } else if (!split && all_on_device && fp16_performance_good && src0->type == GGML_TYPE_F16 && !ggml_is_transposed(src0) && !ggml_is_transposed(src1)) { + } else if (!split && all_on_device && fp16_performance_good && src0->type == GGML_TYPE_F16 && !ggml_is_transposed(src0) && !ggml_is_transposed(src1) && src1->ne[2]*src1->ne[3] > 1) { // KQ + KQV multi-batch ggml_cuda_mul_mat_mat_batched_cublas(src0, src1, dst); } else if (src0->type == GGML_TYPE_F32) { @@ -9653,247 +9887,7 @@ static size_t ggml_nbytes_split(const struct ggml_tensor * tensor, int nrows_spl return nrows_split*ggml_row_size(tensor->type, tensor->ne[0]); } -void ggml_cuda_transform_tensor(void * data, struct ggml_tensor * tensor) { - const int64_t nrows = ggml_nrows(tensor); - - const int64_t ne0 = tensor->ne[0]; - - const size_t nb1 = tensor->nb[1]; - - ggml_backend_type backend = tensor->backend; - ggml_tensor_extra_gpu * extra = new struct ggml_tensor_extra_gpu; - memset(extra, 0, sizeof(*extra)); - - for (int id = 0; id < g_device_count; ++id) { - if (backend == GGML_BACKEND_GPU && id != g_main_device) { - continue; - } - - ggml_cuda_set_device(id); - - int64_t row_low, row_high; - if (backend == GGML_BACKEND_GPU) { - row_low = 0; - row_high = nrows; - } else if (backend == GGML_BACKEND_GPU_SPLIT) { - const int64_t rounding = get_row_rounding(tensor->type); - - row_low = id == 0 ? 0 : nrows*g_tensor_split[id]; - row_low -= row_low % rounding; - - if (id == g_device_count - 1) { - row_high = nrows; - } else { - row_high = nrows*g_tensor_split[id + 1]; - row_high -= row_high % rounding; - } - } else { - GGML_ASSERT(false); - } - if (row_low == row_high) { - continue; - } - - int64_t nrows_split = row_high - row_low; - - const size_t offset_split = row_low*nb1; - size_t size = ggml_nbytes_split(tensor, nrows_split); - const size_t original_size = size; - - // pad last row to a multiple of 512 elements to avoid out-of-bounds memory accesses - if (ne0 % MATRIX_ROW_PADDING != 0) { - size += ggml_row_size(tensor->type, MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING); - } - - char * buf; - CUDA_CHECK(cudaMalloc(&buf, size)); - char * buf_host = (char *)data + offset_split; - - // set padding to 0 to avoid possible NaN values - if (size > original_size) { - CUDA_CHECK(cudaMemset(buf + original_size, 0, size - original_size)); - } - - CUDA_CHECK(cudaMemcpy(buf, buf_host, original_size, cudaMemcpyHostToDevice)); - - extra->data_device[id] = buf; - - if (backend == GGML_BACKEND_GPU_SPLIT) { - for (int64_t is = 0; is < MAX_STREAMS; ++is) { - CUDA_CHECK(cudaEventCreateWithFlags(&extra->events[id][is], cudaEventDisableTiming)); - } - } - } - - tensor->extra = extra; -} - -void ggml_cuda_free_data(struct ggml_tensor * tensor) { - if (!tensor || !tensor->extra || (tensor->backend != GGML_BACKEND_GPU && tensor->backend != GGML_BACKEND_GPU_SPLIT) ) { - return; - } - - ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) tensor->extra; - - for (int id = 0; id < g_device_count; ++id) { - ggml_cuda_set_device(id); - if (extra->data_device[id] != nullptr) { - CUDA_CHECK(cudaFree(extra->data_device[id])); - } - - for (int64_t is = 0; is < MAX_STREAMS; ++is) { - if (extra->events[id][is] != nullptr) { - CUDA_CHECK(cudaEventDestroy(extra->events[id][is])); - } - } - } - - delete extra; -} - -static ggml_tensor_extra_gpu * g_temp_tensor_extras = nullptr; -static size_t g_temp_tensor_extra_index = 0; - -static ggml_tensor_extra_gpu * ggml_cuda_alloc_temp_tensor_extra() { - if (g_temp_tensor_extras == nullptr) { - g_temp_tensor_extras = new ggml_tensor_extra_gpu[GGML_CUDA_MAX_NODES]; - } - - size_t alloc_index = g_temp_tensor_extra_index; - g_temp_tensor_extra_index = (g_temp_tensor_extra_index + 1) % GGML_CUDA_MAX_NODES; - ggml_tensor_extra_gpu * extra = &g_temp_tensor_extras[alloc_index]; - memset(extra, 0, sizeof(*extra)); - - return extra; -} - -static void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scratch, bool force_inplace, bool no_alloc) { - if (scratch && g_scratch_size == 0) { - return; - } - - tensor->backend = GGML_BACKEND_GPU; - - // recursively assign CUDA buffers until a compute tensor is found - if (tensor->src[0] != nullptr && tensor->src[0]->backend == GGML_BACKEND_CPU) { - const ggml_op src0_op = tensor->src[0]->op; - if (src0_op == GGML_OP_RESHAPE || src0_op == GGML_OP_TRANSPOSE || src0_op == GGML_OP_VIEW || src0_op == GGML_OP_PERMUTE) { - ggml_cuda_assign_buffers_impl(tensor->src[0], scratch, force_inplace, no_alloc); - } - } - if (tensor->op == GGML_OP_CPY && tensor->src[1]->backend == GGML_BACKEND_CPU) { - ggml_cuda_assign_buffers_impl(tensor->src[1], scratch, force_inplace, no_alloc); - } - - if (scratch && no_alloc) { - return; - } - - ggml_tensor_extra_gpu * extra; - - const bool inplace = (tensor->src[0] != nullptr && tensor->src[0]->data == tensor->data) || - tensor->op == GGML_OP_VIEW || - force_inplace; - const size_t size = ggml_nbytes(tensor); - - ggml_cuda_set_device(g_main_device); - if (inplace && (tensor->src[0]->backend == GGML_BACKEND_GPU || tensor->src[0]->backend == GGML_BACKEND_GPU_SPLIT)) { - ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu * ) tensor->src[0]->extra; - char * src0_ddc = (char *) src0_extra->data_device[g_main_device]; - size_t offset = 0; - if (tensor->op == GGML_OP_VIEW) { - memcpy(&offset, tensor->op_params, sizeof(size_t)); - } - extra = ggml_cuda_alloc_temp_tensor_extra(); - extra->data_device[g_main_device] = src0_ddc + offset; - } else if (tensor->op == GGML_OP_CPY) { - ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu * ) tensor->src[1]->extra; - void * src1_ddv = src1_extra->data_device[g_main_device]; - extra = ggml_cuda_alloc_temp_tensor_extra(); - extra->data_device[g_main_device] = src1_ddv; - } else if (scratch) { - GGML_ASSERT(size <= g_scratch_size); - if (g_scratch_offset + size > g_scratch_size) { - g_scratch_offset = 0; - } - - char * data = (char *) g_scratch_buffer; - if (data == nullptr) { - CUDA_CHECK(cudaMalloc(&data, g_scratch_size)); - g_scratch_buffer = data; - } - extra = ggml_cuda_alloc_temp_tensor_extra(); - extra->data_device[g_main_device] = data + g_scratch_offset; - - g_scratch_offset += size; - - GGML_ASSERT(g_scratch_offset <= g_scratch_size); - } else { // allocate new buffers outside of scratch - void * data; - CUDA_CHECK(cudaMalloc(&data, size)); - CUDA_CHECK(cudaMemset(data, 0, size)); - extra = new ggml_tensor_extra_gpu; - memset(extra, 0, sizeof(*extra)); - extra->data_device[g_main_device] = data; - } - - tensor->extra = extra; -} - -void ggml_cuda_assign_scratch_offset(struct ggml_tensor * tensor, size_t offset) { - if (g_scratch_size == 0) { - return; - } - if (g_scratch_buffer == nullptr) { - ggml_cuda_set_device(g_main_device); - CUDA_CHECK(cudaMalloc(&g_scratch_buffer, g_scratch_size)); - } - - ggml_tensor_extra_gpu * extra = ggml_cuda_alloc_temp_tensor_extra(); - - const bool inplace = tensor->view_src != nullptr; - - if (inplace && (tensor->view_src->backend == GGML_BACKEND_GPU || tensor->view_src->backend == GGML_BACKEND_GPU_SPLIT)) { - ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu * ) tensor->view_src->extra; - char * src0_ddc = (char *) src0_extra->data_device[g_main_device]; - size_t view_offset = 0; - if (tensor->op == GGML_OP_VIEW) { - memcpy(&view_offset, tensor->op_params, sizeof(size_t)); - } - extra->data_device[g_main_device] = src0_ddc + view_offset; - } else { - extra->data_device[g_main_device] = (char *) g_scratch_buffer + offset; - } - - tensor->extra = extra; -} - -void ggml_cuda_copy_to_device(struct ggml_tensor * tensor) { - GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU); - GGML_ASSERT(ggml_is_contiguous(tensor)); - - ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) tensor->extra; - ggml_cuda_set_device(g_main_device); - CUDA_CHECK(cudaMemcpy(extra->data_device[g_main_device], tensor->data, ggml_nbytes(tensor), cudaMemcpyHostToDevice)); -} - -void ggml_cuda_assign_buffers(struct ggml_tensor * tensor) { - ggml_cuda_assign_buffers_impl(tensor, true, false, false); -} - -void ggml_cuda_assign_buffers_no_alloc(struct ggml_tensor * tensor) { - ggml_cuda_assign_buffers_impl(tensor, true, false, true); -} - -void ggml_cuda_assign_buffers_no_scratch(struct ggml_tensor * tensor) { - ggml_cuda_assign_buffers_impl(tensor, false, false, false); -} - -void ggml_cuda_assign_buffers_force_inplace(struct ggml_tensor * tensor) { - ggml_cuda_assign_buffers_impl(tensor, false, true, false); -} - -void ggml_cuda_set_main_device(const int main_device) { +static void ggml_cuda_set_main_device(const int main_device) { if (main_device >= g_device_count) { fprintf(stderr, "warning: cannot set main_device=%d because there are only %d devices. Using device %d instead.\n", main_device, g_device_count, g_main_device); @@ -9902,30 +9896,12 @@ void ggml_cuda_set_main_device(const int main_device) { if (g_main_device != main_device && g_device_count > 1) { g_main_device = main_device; - cudaDeviceProp prop; - CUDA_CHECK(cudaGetDeviceProperties(&prop, g_main_device)); - fprintf(stderr, "%s: using device %d (%s) as main device\n", __func__, g_main_device, prop.name); + //cudaDeviceProp prop; + //CUDA_CHECK(cudaGetDeviceProperties(&prop, g_main_device)); + //fprintf(stderr, "%s: using device %d (%s) as main device\n", __func__, g_main_device, prop.name); } } -void ggml_cuda_set_scratch_size(const size_t scratch_size) { - // this is a hack to not completely break llama.cpp when using multiple models or contexts simultaneously - // it still won't always work as expected, but it's better than nothing - if (scratch_size > g_scratch_size) { - ggml_cuda_free_scratch(); - } - g_scratch_size = std::max(g_scratch_size, scratch_size); -} - -void ggml_cuda_free_scratch() { - if (g_scratch_buffer == nullptr) { - return; - } - - CUDA_CHECK(cudaFree(g_scratch_buffer)); - g_scratch_buffer = nullptr; -} - bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor) { if (!g_cublas_loaded) return false; @@ -10104,21 +10080,31 @@ void ggml_cuda_get_device_description(int device, char * description, size_t des #define UNUSED GGML_UNUSED +struct ggml_backend_cuda_context { + int device; + std::string name; +}; + // cuda buffer -struct ggml_backend_buffer_context_cuda { +struct ggml_backend_cuda_buffer_context { int device; void * dev_ptr = nullptr; ggml_tensor_extra_gpu * temp_tensor_extras = nullptr; size_t temp_tensor_extra_index = 0; + std::string name; - ggml_backend_buffer_context_cuda(int device, void * dev_ptr) : device(device), dev_ptr(dev_ptr) {} + ggml_backend_cuda_buffer_context(int device, void * dev_ptr) : + device(device), dev_ptr(dev_ptr), + name(GGML_CUDA_NAME + std::to_string(device)) { + } - ~ggml_backend_buffer_context_cuda() { + ~ggml_backend_cuda_buffer_context() { delete[] temp_tensor_extras; } ggml_tensor_extra_gpu * ggml_cuda_alloc_temp_tensor_extra() { + // TODO: remove GGML_CUDA_MAX_NODES, allocate dynamically and reuse in backend_buffer_reset if (temp_tensor_extras == nullptr) { temp_tensor_extras = new ggml_tensor_extra_gpu[GGML_CUDA_MAX_NODES]; } @@ -10132,19 +10118,28 @@ struct ggml_backend_buffer_context_cuda { } }; +static const char * ggml_backend_cuda_buffer_get_name(ggml_backend_buffer_t buffer) { + ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context; + return ctx->name.c_str(); +} + +static bool ggml_backend_buffer_is_cuda(ggml_backend_buffer_t buffer) { + return buffer->iface.get_name == ggml_backend_cuda_buffer_get_name; +} + static void ggml_backend_cuda_buffer_free_buffer(ggml_backend_buffer_t buffer) { - ggml_backend_buffer_context_cuda * ctx = (ggml_backend_buffer_context_cuda *)buffer->context; + ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context; CUDA_CHECK(cudaFree(ctx->dev_ptr)); delete ctx; } static void * ggml_backend_cuda_buffer_get_base(ggml_backend_buffer_t buffer) { - ggml_backend_buffer_context_cuda * ctx = (ggml_backend_buffer_context_cuda *)buffer->context; + ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context; return ctx->dev_ptr; } static void ggml_backend_cuda_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) { - ggml_backend_buffer_context_cuda * ctx = (ggml_backend_buffer_context_cuda *)buffer->context; + ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context; if (tensor->view_src != NULL && tensor->view_offs == 0) { assert(tensor->view_src->buffer->buft == buffer->buft); @@ -10173,67 +10168,98 @@ static void ggml_backend_cuda_buffer_init_tensor(ggml_backend_buffer_t buffer, g CUDA_CHECK(cudaMemsetAsync((char *)tensor->data + original_size, 0, padded_size - original_size, g_cudaStreams[ctx->device][0])); } } - - UNUSED(buffer); } static void ggml_backend_cuda_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) { GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU); - ggml_backend_buffer_context_cuda * ctx = (ggml_backend_buffer_context_cuda *)buffer->context; + ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context; ggml_cuda_set_device(ctx->device); CUDA_CHECK(cudaDeviceSynchronize()); - CUDA_CHECK(cudaMemcpy((char *)tensor->data + offset, data, size, cudaMemcpyHostToDevice)); + CUDA_CHECK(cudaDeviceSynchronize()); } static void ggml_backend_cuda_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset, size_t size) { GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU); - ggml_backend_buffer_context_cuda * ctx = (ggml_backend_buffer_context_cuda *)buffer->context; + ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context; ggml_cuda_set_device(ctx->device); CUDA_CHECK(cudaDeviceSynchronize()); - CUDA_CHECK(cudaMemcpy(data, (const char *)tensor->data + offset, size, cudaMemcpyDeviceToHost)); + CUDA_CHECK(cudaDeviceSynchronize()); +} + +static bool ggml_backend_cuda_buffer_cpy_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * src, ggml_tensor * dst) { + if (ggml_backend_buffer_is_cuda(src->buffer)) { + ggml_backend_cuda_buffer_context * src_ctx = (ggml_backend_cuda_buffer_context *)src->buffer->context; + ggml_backend_cuda_buffer_context * dst_ctx = (ggml_backend_cuda_buffer_context *)buffer->context; + + ggml_cuda_set_device(src_ctx->device); + CUDA_CHECK(cudaDeviceSynchronize()); + ggml_cuda_set_device(dst_ctx->device); + CUDA_CHECK(cudaDeviceSynchronize()); + CUDA_CHECK(cudaMemcpy((char *)dst->data, (const char *)src->data, ggml_nbytes(src), cudaMemcpyDeviceToDevice)); + CUDA_CHECK(cudaDeviceSynchronize()); + + return true; + } + return false; } static void ggml_backend_cuda_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) { - ggml_backend_buffer_context_cuda * ctx = (ggml_backend_buffer_context_cuda *)buffer->context; + ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context; ggml_cuda_set_device(ctx->device); CUDA_CHECK(cudaDeviceSynchronize()); - CUDA_CHECK(cudaMemset(ctx->dev_ptr, value, buffer->size)); + CUDA_CHECK(cudaDeviceSynchronize()); } -static struct ggml_backend_buffer_i cuda_backend_buffer_interface = { +static ggml_backend_buffer_i ggml_backend_cuda_buffer_interface = { + /* .get_name = */ ggml_backend_cuda_buffer_get_name, /* .free_buffer = */ ggml_backend_cuda_buffer_free_buffer, /* .get_base = */ ggml_backend_cuda_buffer_get_base, /* .init_tensor = */ ggml_backend_cuda_buffer_init_tensor, /* .set_tensor = */ ggml_backend_cuda_buffer_set_tensor, /* .get_tensor = */ ggml_backend_cuda_buffer_get_tensor, - /* .cpy_tensor_from = */ NULL, - /* .cpy_tensor_to = */ NULL, + /* .cpy_tensor = */ ggml_backend_cuda_buffer_cpy_tensor, /* .clear = */ ggml_backend_cuda_buffer_clear, + /* .reset = */ NULL, }; // cuda buffer type -static ggml_backend_buffer_t ggml_backend_cuda_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) { - int device = (int) (intptr_t) buft->context; +struct ggml_backend_cuda_buffer_type_context { + int device; + std::string name; +}; - ggml_cuda_set_device(device); +static const char * ggml_backend_cuda_buffer_type_name(ggml_backend_buffer_type_t buft) { + ggml_backend_cuda_buffer_type_context * ctx = (ggml_backend_cuda_buffer_type_context *)buft->context; + + return ctx->name.c_str(); +} + +static ggml_backend_buffer_t ggml_backend_cuda_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) { + ggml_backend_cuda_buffer_type_context * buft_ctx = (ggml_backend_cuda_buffer_type_context *)buft->context; + + ggml_cuda_set_device(buft_ctx->device); size = std::max(size, (size_t)1); // cudaMalloc returns null for size 0 void * dev_ptr; - CUDA_CHECK(cudaMalloc(&dev_ptr, size)); + cudaError_t err = cudaMalloc(&dev_ptr, size); + if (err != cudaSuccess) { + fprintf(stderr, "%s: allocating %.2f MiB on device %d: cudaMalloc failed: %s\n", __func__, size/1024.0/1024.0, buft_ctx->device, cudaGetErrorString(err)); + return nullptr; + } - ggml_backend_buffer_context_cuda * ctx = new ggml_backend_buffer_context_cuda(device, dev_ptr); + ggml_backend_cuda_buffer_context * ctx = new ggml_backend_cuda_buffer_context(buft_ctx->device, dev_ptr); - return ggml_backend_buffer_init(buft, cuda_backend_buffer_interface, ctx, size); + return ggml_backend_buffer_init(buft, ggml_backend_cuda_buffer_interface, ctx, size); } static size_t ggml_backend_cuda_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) { @@ -10242,7 +10268,7 @@ static size_t ggml_backend_cuda_buffer_type_get_alignment(ggml_backend_buffer_ty UNUSED(buft); } -static size_t ggml_backend_cuda_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, ggml_tensor * tensor) { +static size_t ggml_backend_cuda_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, const ggml_tensor * tensor) { int64_t row_low = 0; int64_t row_high = ggml_nrows(tensor); int64_t nrows_split = row_high - row_low; @@ -10263,21 +10289,32 @@ static size_t ggml_backend_cuda_buffer_type_get_alloc_size(ggml_backend_buffer_t } static bool ggml_backend_cuda_buffer_type_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) { - return ggml_backend_is_cuda(backend); + if (!ggml_backend_is_cuda(backend)) { + return false; + } - UNUSED(buft); + ggml_backend_cuda_buffer_type_context * buft_ctx = (ggml_backend_cuda_buffer_type_context *)buft->context; + ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context; + + return buft_ctx->device == cuda_ctx->device; } static ggml_backend_buffer_type_i ggml_backend_cuda_buffer_type_interface = { + /* .get_name = */ ggml_backend_cuda_buffer_type_name, /* .alloc_buffer = */ ggml_backend_cuda_buffer_type_alloc_buffer, /* .get_alignment = */ ggml_backend_cuda_buffer_type_get_alignment, /* .get_alloc_size = */ ggml_backend_cuda_buffer_type_get_alloc_size, /* .supports_backend = */ ggml_backend_cuda_buffer_type_supports_backend, - /* .is_host = */ nullptr, + /* .is_host = */ NULL, }; ggml_backend_buffer_type_t ggml_backend_cuda_buffer_type(int device) { - static struct ggml_backend_buffer_type ggml_backend_cuda_buffer_types[GGML_CUDA_MAX_DEVICES]; + // FIXME: this is not thread safe + if (device >= ggml_backend_cuda_get_device_count()) { + return nullptr; + } + + static ggml_backend_buffer_type ggml_backend_cuda_buffer_types[GGML_CUDA_MAX_DEVICES]; static bool ggml_backend_cuda_buffer_type_initialized = false; @@ -10285,7 +10322,7 @@ ggml_backend_buffer_type_t ggml_backend_cuda_buffer_type(int device) { for (int i = 0; i < GGML_CUDA_MAX_DEVICES; i++) { ggml_backend_cuda_buffer_types[i] = { /* .iface = */ ggml_backend_cuda_buffer_type_interface, - /* .context = */ (ggml_backend_buffer_type_context_t) (intptr_t) i, + /* .context = */ new ggml_backend_cuda_buffer_type_context{i, GGML_CUDA_NAME + std::to_string(i)}, }; } ggml_backend_cuda_buffer_type_initialized = true; @@ -10294,8 +10331,306 @@ ggml_backend_buffer_type_t ggml_backend_cuda_buffer_type(int device) { return &ggml_backend_cuda_buffer_types[device]; } +// cuda split buffer + +struct ggml_backend_cuda_split_buffer_context { + ~ggml_backend_cuda_split_buffer_context() { + for (ggml_tensor_extra_gpu * extra : tensor_extras) { + for (int id = 0; id < g_device_count; ++id) { + for (int64_t is = 0; is < MAX_STREAMS; ++is) { + if (extra->events[id][is] != nullptr) { + CUDA_CHECK(cudaEventDestroy(extra->events[id][is])); + } + } + if (extra->data_device[id] != nullptr) { + CUDA_CHECK(cudaFree(extra->data_device[id])); + } + } + delete extra; + } + } + + std::vector tensor_extras; +}; + +static const char * ggml_backend_cuda_split_buffer_get_name(ggml_backend_buffer_t buffer) { + return GGML_CUDA_NAME "_Split"; + + UNUSED(buffer); +} + +// unused at the moment +//static bool ggml_backend_buffer_is_cuda_split(ggml_backend_buffer_t buffer) { +// return buffer->iface.get_name == ggml_backend_cuda_split_buffer_get_name; +//} + +static void ggml_backend_cuda_split_buffer_free_buffer(ggml_backend_buffer_t buffer) { + ggml_backend_cuda_split_buffer_context * ctx = (ggml_backend_cuda_split_buffer_context *)buffer->context; + delete ctx; +} + +static void * ggml_backend_cuda_split_buffer_get_base(ggml_backend_buffer_t buffer) { + // the pointers are stored in the tensor extras, this is just a dummy address and never dereferenced + return (void *)0x1000; + + UNUSED(buffer); +} + +static void ggml_backend_cuda_split_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) { + GGML_ASSERT(tensor->view_src == nullptr); // views of split tensors are not supported + + ggml_backend_cuda_split_buffer_context * ctx = (ggml_backend_cuda_split_buffer_context *)buffer->context; + ggml_backend_cuda_split_buffer_type_context * buft_ctx = (ggml_backend_cuda_split_buffer_type_context *)buffer->buft->context; + + const int64_t ne0 = tensor->ne[0]; + + ggml_tensor_extra_gpu * extra = new ggml_tensor_extra_gpu{}; + + ctx->tensor_extras.push_back(extra); + + for (int id = 0; id < g_device_count; ++id) { + int64_t row_low, row_high; + get_row_split(&row_low, &row_high, tensor, buft_ctx->tensor_split, id); + + int64_t nrows_split = row_high - row_low; + if (nrows_split == 0) { + continue; + } + + size_t size = ggml_nbytes_split(tensor, nrows_split); + const size_t original_size = size; + + // pad last row to a multiple of 512 elements to avoid out-of-bounds memory accesses + if (ne0 % MATRIX_ROW_PADDING != 0) { + size += ggml_row_size(tensor->type, MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING); + } + + // FIXME: do not crash if cudaMalloc fails + // currently, init_tensor cannot fail, it needs to be fixed in ggml-backend first + ggml_cuda_set_device(id); + char * buf; + CUDA_CHECK(cudaMalloc(&buf, size)); + + // set padding to 0 to avoid possible NaN values + if (size > original_size) { + CUDA_CHECK(cudaMemset(buf + original_size, 0, size - original_size)); + } + + extra->data_device[id] = buf; + + for (int64_t is = 0; is < MAX_STREAMS; ++is) { + CUDA_CHECK(cudaEventCreateWithFlags(&extra->events[id][is], cudaEventDisableTiming)); + } + } + tensor->backend = GGML_BACKEND_GPU_SPLIT; + tensor->extra = extra; +} + +static void ggml_backend_cuda_split_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) { + // split tensors must always be set in their entirety at once + GGML_ASSERT(offset == 0); + GGML_ASSERT(size == ggml_nbytes(tensor)); + + ggml_backend_cuda_split_buffer_type_context * buft_ctx = (ggml_backend_cuda_split_buffer_type_context *)buffer->buft->context; + + const int64_t ne0 = tensor->ne[0]; + const size_t nb1 = tensor->nb[1]; + ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *)tensor->extra; + + for (int id = 0; id < g_device_count; ++id) { + int64_t row_low, row_high; + get_row_split(&row_low, &row_high, tensor, buft_ctx->tensor_split, id); + + int64_t nrows_split = row_high - row_low; + if (nrows_split == 0) { + continue; + } + + const size_t offset_split = row_low*nb1; + size_t size = ggml_nbytes_split(tensor, nrows_split); + const size_t original_size = size; + + // pad last row to a multiple of 512 elements to avoid out-of-bounds memory accesses + if (ne0 % MATRIX_ROW_PADDING != 0) { + size += ggml_row_size(tensor->type, MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING); + } + + const char * buf_host = (const char *)data + offset_split; + CUDA_CHECK(cudaMemcpy(extra->data_device[id], buf_host, original_size, cudaMemcpyHostToDevice)); + } +} + +static void ggml_backend_cuda_split_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset, size_t size) { + // split tensors must always be set in their entirety at once + GGML_ASSERT(offset == 0); + GGML_ASSERT(size == ggml_nbytes(tensor)); + + ggml_backend_cuda_split_buffer_type_context * buft_ctx = (ggml_backend_cuda_split_buffer_type_context *)buffer->buft->context; + + const int64_t ne0 = tensor->ne[0]; + const size_t nb1 = tensor->nb[1]; + ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *)tensor->extra; + + for (int id = 0; id < g_device_count; ++id) { + int64_t row_low, row_high; + get_row_split(&row_low, &row_high, tensor, buft_ctx->tensor_split, id); + + int64_t nrows_split = row_high - row_low; + if (nrows_split == 0) { + continue; + } + + const size_t offset_split = row_low*nb1; + size_t size = ggml_nbytes_split(tensor, nrows_split); + const size_t original_size = size; + + // pad last row to a multiple of 512 elements to avoid out-of-bounds memory accesses + if (ne0 % MATRIX_ROW_PADDING != 0) { + size += ggml_row_size(tensor->type, MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING); + } + + char * buf_host = (char *)data + offset_split; + CUDA_CHECK(cudaMemcpy(buf_host, extra->data_device[id], original_size, cudaMemcpyDeviceToHost)); + } +} + +static void ggml_backend_cuda_split_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) { + UNUSED(buffer); + UNUSED(value); +} + +static struct ggml_backend_buffer_i ggml_backend_cuda_split_buffer_interface = { + /* .get_name = */ ggml_backend_cuda_split_buffer_get_name, + /* .free_buffer = */ ggml_backend_cuda_split_buffer_free_buffer, + /* .get_base = */ ggml_backend_cuda_split_buffer_get_base, + /* .init_tensor = */ ggml_backend_cuda_split_buffer_init_tensor, + /* .set_tensor = */ ggml_backend_cuda_split_buffer_set_tensor, + /* .get_tensor = */ ggml_backend_cuda_split_buffer_get_tensor, + /* .cpy_tensor = */ NULL, + /* .clear = */ ggml_backend_cuda_split_buffer_clear, + /* .reset = */ NULL, +}; + +// cuda split buffer type + +static const char * ggml_backend_cuda_split_buffer_type_name(ggml_backend_buffer_type_t buft) { + return GGML_CUDA_NAME "_Split"; + + UNUSED(buft); +} + +static ggml_backend_buffer_t ggml_backend_cuda_split_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) { + // since we don't know the exact split after rounding, we cannot allocate the device buffers at this point + // instead, we allocate them for each tensor separately in init_tensor + // however, the size still represents the maximum cumulative size of all the device buffers after the tensors are allocated, + // as returned by get_alloc_size. this limit is enforced during tensor allocation by ggml-alloc, so it must be correct. + ggml_backend_cuda_split_buffer_context * ctx = new ggml_backend_cuda_split_buffer_context(); + + return ggml_backend_buffer_init(buft, ggml_backend_cuda_split_buffer_interface, ctx, size); +} + +static size_t ggml_backend_cuda_split_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) { + return 128; + + UNUSED(buft); +} + +static size_t ggml_backend_cuda_split_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, const ggml_tensor * tensor) { + ggml_backend_cuda_split_buffer_type_context * ctx = (ggml_backend_cuda_split_buffer_type_context *)buft->context; + + size_t total_size = 0; + + const int64_t ne0 = tensor->ne[0]; + + for (int id = 0; id < g_device_count; ++id) { + int64_t row_low, row_high; + get_row_split(&row_low, &row_high, tensor, ctx->tensor_split, id); + + int64_t nrows_split = row_high - row_low; + if (nrows_split == 0) { + continue; + } + + total_size += ggml_nbytes_split(tensor, nrows_split); + + // pad last row to a multiple of 512 elements to avoid out-of-bounds memory accesses + if (ne0 % MATRIX_ROW_PADDING != 0) { + total_size += ggml_row_size(tensor->type, MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING); + } + } + + return total_size; +} + +static bool ggml_backend_cuda_split_buffer_type_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) { + return ggml_backend_is_cuda(backend); + + UNUSED(buft); +} + +static bool ggml_backend_cuda_split_buffer_type_is_host(ggml_backend_buffer_type_t buft) { + return false; + + UNUSED(buft); +} + +static ggml_backend_buffer_type_i ggml_backend_cuda_split_buffer_type_interface = { + /* .get_name = */ ggml_backend_cuda_split_buffer_type_name, + /* .alloc_buffer = */ ggml_backend_cuda_split_buffer_type_alloc_buffer, + /* .get_alignment = */ ggml_backend_cuda_split_buffer_type_get_alignment, + /* .get_alloc_size = */ ggml_backend_cuda_split_buffer_type_get_alloc_size, + /* .supports_backend = */ ggml_backend_cuda_split_buffer_type_supports_backend, + /* .is_host = */ ggml_backend_cuda_split_buffer_type_is_host, +}; + +ggml_backend_buffer_type_t ggml_backend_cuda_split_buffer_type(const float * tensor_split) { + // FIXME: this is not thread safe + static std::map, struct ggml_backend_buffer_type> buft_map; + + std::array tensor_split_arr = {}; + + bool all_zero = tensor_split == nullptr || std::all_of(tensor_split, tensor_split + GGML_CUDA_MAX_DEVICES, [](float x) { return x == 0.0f; }); + if (all_zero) { + tensor_split_arr = g_default_tensor_split; + } else { + float split_sum = 0.0f; + for (int i = 0; i < g_device_count; ++i) { + tensor_split_arr[i] = split_sum; + split_sum += tensor_split[i]; + } + for (int i = 0; i < g_device_count; ++i) { + tensor_split_arr[i] /= split_sum; + } + } + + auto it = buft_map.find(tensor_split_arr); + if (it != buft_map.end()) { + return &it->second; + } + + struct ggml_backend_buffer_type buft { + /* .iface = */ ggml_backend_cuda_split_buffer_type_interface, + /* .context = */ new ggml_backend_cuda_split_buffer_type_context{tensor_split_arr}, + }; + + auto result = buft_map.emplace(tensor_split_arr, buft); + return &result.first->second; +} + // host buffer type +static const char * ggml_backend_cuda_host_buffer_type_name(ggml_backend_buffer_type_t buft) { + return GGML_CUDA_NAME "_Host"; + + UNUSED(buft); +} + +static const char * ggml_backend_cuda_host_buffer_name(ggml_backend_buffer_t buffer) { + return GGML_CUDA_NAME "_Host"; + + UNUSED(buffer); +} + static void ggml_backend_cuda_host_buffer_free_buffer(ggml_backend_buffer_t buffer) { ggml_cuda_host_free(buffer->context); } @@ -10308,9 +10643,9 @@ static ggml_backend_buffer_t ggml_backend_cuda_host_buffer_type_alloc_buffer(ggm return ggml_backend_buft_alloc_buffer(ggml_backend_cpu_buffer_type(), size); } - // FIXME: this is a hack to avoid having to implement a new buffer type ggml_backend_buffer_t buffer = ggml_backend_cpu_buffer_from_ptr(ptr, size); buffer->buft = buft; + buffer->iface.get_name = ggml_backend_cuda_host_buffer_name; buffer->iface.free_buffer = ggml_backend_cuda_host_buffer_free_buffer; return buffer; @@ -10319,6 +10654,7 @@ static ggml_backend_buffer_t ggml_backend_cuda_host_buffer_type_alloc_buffer(ggm ggml_backend_buffer_type_t ggml_backend_cuda_host_buffer_type() { static struct ggml_backend_buffer_type ggml_backend_cuda_buffer_type_host = { /* .iface = */ { + /* .get_name = */ ggml_backend_cuda_host_buffer_type_name, /* .alloc_buffer = */ ggml_backend_cuda_host_buffer_type_alloc_buffer, /* .get_alignment = */ ggml_backend_cpu_buffer_type()->iface.get_alignment, /* .get_alloc_size = */ ggml_backend_cpu_buffer_type()->iface.get_alloc_size, @@ -10333,31 +10669,27 @@ ggml_backend_buffer_type_t ggml_backend_cuda_host_buffer_type() { // backend -struct ggml_backend_context_cuda { - int device; -}; - static const char * ggml_backend_cuda_name(ggml_backend_t backend) { - return GGML_CUDA_NAME; + ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context; - UNUSED(backend); + return cuda_ctx->name.c_str(); } static void ggml_backend_cuda_free(ggml_backend_t backend) { - ggml_backend_context_cuda * cuda_ctx = (ggml_backend_context_cuda *)backend->context; + ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context; delete cuda_ctx; delete backend; } static ggml_backend_buffer_type_t ggml_backend_cuda_get_default_buffer_type(ggml_backend_t backend) { - ggml_backend_context_cuda * cuda_ctx = (ggml_backend_context_cuda *)backend->context; + ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context; return ggml_backend_cuda_buffer_type(cuda_ctx->device); } static void ggml_backend_cuda_set_tensor_async(ggml_backend_t backend, ggml_tensor * tensor, const void * data, size_t offset, size_t size) { - ggml_backend_context_cuda * cuda_ctx = (ggml_backend_context_cuda *)backend->context; + ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context; GGML_ASSERT(tensor->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device) && "unsupported buffer type"); GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU); @@ -10366,7 +10698,7 @@ static void ggml_backend_cuda_set_tensor_async(ggml_backend_t backend, ggml_tens } static void ggml_backend_cuda_get_tensor_async(ggml_backend_t backend, const ggml_tensor * tensor, void * data, size_t offset, size_t size) { - ggml_backend_context_cuda * cuda_ctx = (ggml_backend_context_cuda *)backend->context; + ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context; GGML_ASSERT(tensor->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device) && "unsupported buffer type"); GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU); @@ -10374,39 +10706,27 @@ static void ggml_backend_cuda_get_tensor_async(ggml_backend_t backend, const ggm CUDA_CHECK(cudaMemcpyAsync(data, (const char *)tensor->data + offset, size, cudaMemcpyDeviceToHost, g_cudaStreams[cuda_ctx->device][0])); } +static bool ggml_backend_cuda_cpy_tensor_async(ggml_backend_t backend, const ggml_tensor * src, ggml_tensor * dst) { + ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context; + + if (dst->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device) && ggml_backend_buffer_is_cuda(src->buffer)) { + CUDA_CHECK(cudaMemcpyAsync(dst->data, src->data, ggml_nbytes(dst), cudaMemcpyDeviceToDevice, g_cudaStreams[cuda_ctx->device][0])); + return true; + } + + return false; +} + static void ggml_backend_cuda_synchronize(ggml_backend_t backend) { - ggml_backend_context_cuda * cuda_ctx = (ggml_backend_context_cuda *)backend->context; + ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context; CUDA_CHECK(cudaStreamSynchronize(g_cudaStreams[cuda_ctx->device][0])); UNUSED(backend); } -static ggml_backend_graph_plan_t ggml_backend_cuda_graph_plan_create(ggml_backend_t backend, ggml_cgraph * cgraph) { - GGML_ASSERT(!"not implemented"); - - return nullptr; - - UNUSED(backend); - UNUSED(cgraph); -} - -static void ggml_backend_cuda_graph_plan_free(ggml_backend_t backend, ggml_backend_graph_plan_t plan) { - GGML_ASSERT(!"not implemented"); - - UNUSED(backend); - UNUSED(plan); -} - -static void ggml_backend_cuda_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan) { - GGML_ASSERT(!"not implemented"); - - UNUSED(backend); - UNUSED(plan); -} - static bool ggml_backend_cuda_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) { - ggml_backend_context_cuda * cuda_ctx = (ggml_backend_context_cuda *)backend->context; + ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context; ggml_cuda_set_main_device(cuda_ctx->device); @@ -10416,53 +10736,31 @@ static bool ggml_backend_cuda_graph_compute(ggml_backend_t backend, ggml_cgraph for (int i = 0; i < cgraph->n_nodes; i++) { ggml_tensor * node = cgraph->nodes[i]; - if (node->op == GGML_OP_RESHAPE || node->op == GGML_OP_TRANSPOSE || node->op == GGML_OP_VIEW || node->op == GGML_OP_PERMUTE) + if (node->op == GGML_OP_RESHAPE || node->op == GGML_OP_TRANSPOSE || node->op == GGML_OP_VIEW || node->op == GGML_OP_PERMUTE || node->op == GGML_OP_NONE) { continue; + } - assert(node->backend == GGML_BACKEND_GPU); +#ifndef NDEBUG + assert(node->backend == GGML_BACKEND_GPU || node->backend == GGML_BACKEND_GPU_SPLIT); assert(node->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device)); assert(node->extra != nullptr); for (int j = 0; j < GGML_MAX_SRC; j++) { if (node->src[j] != nullptr) { - assert(node->src[j]->backend == GGML_BACKEND_GPU); + assert(node->src[j]->backend == GGML_BACKEND_GPU || node->src[j]->backend == GGML_BACKEND_GPU_SPLIT); assert(node->src[j]->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device)); assert(node->src[j]->extra != nullptr); } } +#endif bool ok = ggml_cuda_compute_forward(¶ms, node); if (!ok) { fprintf(stderr, "%s: error: op not supported %s (%s)\n", __func__, node->name, ggml_op_name(node->op)); } GGML_ASSERT(ok); - -#if 0 - if (node->type == GGML_TYPE_F32) { - cudaDeviceSynchronize(); - std::vector tmp(ggml_nelements(node), 0.0f); - cudaMemcpy(tmp.data(), node->data, ggml_nelements(node)*sizeof(float), cudaMemcpyDeviceToHost); - printf("\n%s (%s) (%s %s) (%s %s): ", node->name, ggml_op_name(node->op), - ggml_type_name(node->src[0]->type), - node->src[1] ? ggml_type_name(node->src[1]->type) : "none", - node->src[0]->name, - node->src[1] ? node->src[1]->name : "none"); - double sum = 0.0; - double sq_sum = 0.0; - for (int i = 0; i < ggml_nelements(node); i++) { - printf("%f ", tmp[i]); - sum += tmp[i]; - sq_sum += tmp[i]*tmp[i]; - } - printf("\n"); - printf("sum: %f, ", sum); - printf("sq_sum: %f\n", sq_sum); - } -#endif } - UNUSED(backend); - return true; } @@ -10577,18 +10875,17 @@ static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, const ggml_ten UNUSED(backend); } -static ggml_backend_i cuda_backend_i = { +static ggml_backend_i ggml_backend_cuda_interface = { /* .get_name = */ ggml_backend_cuda_name, /* .free = */ ggml_backend_cuda_free, /* .get_default_buffer_type = */ ggml_backend_cuda_get_default_buffer_type, /* .set_tensor_async = */ ggml_backend_cuda_set_tensor_async, /* .get_tensor_async = */ ggml_backend_cuda_get_tensor_async, - /* .cpy_tensor_from_async = */ NULL, - /* .cpy_tensor_to_async = */ NULL, + /* .cpy_tensor_async = */ ggml_backend_cuda_cpy_tensor_async, /* .synchronize = */ ggml_backend_cuda_synchronize, - /* .graph_plan_create = */ ggml_backend_cuda_graph_plan_create, - /* .graph_plan_free = */ ggml_backend_cuda_graph_plan_free, - /* .graph_plan_compute = */ ggml_backend_cuda_graph_plan_compute, + /* .graph_plan_create = */ NULL, + /* .graph_plan_free = */ NULL, + /* .graph_plan_compute = */ NULL, /* .graph_compute = */ ggml_backend_cuda_graph_compute, /* .supports_op = */ ggml_backend_cuda_supports_op, }; @@ -10604,12 +10901,13 @@ ggml_backend_t ggml_backend_cuda_init(int device) { // not strictly necessary, but it may reduce the overhead of the first graph_compute ggml_cuda_set_main_device(device); - ggml_backend_context_cuda * ctx = new ggml_backend_context_cuda { - /* .device = */ device + ggml_backend_cuda_context * ctx = new ggml_backend_cuda_context { + /* .device = */ device, + /* .name = */ GGML_CUDA_NAME + std::to_string(device), }; ggml_backend_t cuda_backend = new ggml_backend { - /* .interface = */ cuda_backend_i, + /* .interface = */ ggml_backend_cuda_interface, /* .context = */ ctx }; @@ -10617,9 +10915,24 @@ ggml_backend_t ggml_backend_cuda_init(int device) { } bool ggml_backend_is_cuda(ggml_backend_t backend) { - return backend->iface.get_name == ggml_backend_cuda_name; + return backend && backend->iface.get_name == ggml_backend_cuda_name; } +int ggml_backend_cuda_get_device_count() { + return ggml_cuda_get_device_count(); +} + +void ggml_backend_cuda_get_device_description(int device, char * description, size_t description_size) { + ggml_cuda_get_device_description(device, description, description_size); +} + +void ggml_backend_cuda_get_device_memory(int device, size_t * free, size_t * total) { + ggml_cuda_set_device(device); + + CUDA_CHECK(cudaMemGetInfo(free, total)); +} + +// backend registry static ggml_backend_t ggml_backend_reg_cuda_init(const char * params, void * user_data) { ggml_backend_t cuda_backend = ggml_backend_cuda_init((int) (intptr_t) user_data); return cuda_backend; diff --git a/ggml-cuda.h b/ggml-cuda.h index cdb0c0c41..d19cbf3fd 100644 --- a/ggml-cuda.h +++ b/ggml-cuda.h @@ -27,22 +27,6 @@ GGML_API void * ggml_cuda_host_malloc(size_t size); GGML_API void ggml_cuda_host_free(void * ptr); GGML_API bool ggml_cuda_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst); -GGML_API void ggml_cuda_set_tensor_split(const float * tensor_split); -GGML_API void ggml_cuda_transform_tensor(void * data, struct ggml_tensor * tensor); -GGML_API void ggml_cuda_free_data(struct ggml_tensor * tensor); - -GGML_API void ggml_cuda_assign_buffers(struct ggml_tensor * tensor); -GGML_API void ggml_cuda_assign_buffers_no_scratch(struct ggml_tensor * tensor); -GGML_API void ggml_cuda_assign_buffers_force_inplace(struct ggml_tensor * tensor); - -GGML_API void ggml_cuda_assign_buffers_no_alloc(struct ggml_tensor * tensor); -GGML_API void ggml_cuda_assign_scratch_offset(struct ggml_tensor * tensor, size_t offset); -GGML_API void ggml_cuda_copy_to_device(struct ggml_tensor * tensor); - -GGML_API void ggml_cuda_set_main_device(int main_device); -GGML_API void ggml_cuda_set_mul_mat_q(bool mul_mat_q); -GGML_API void ggml_cuda_set_scratch_size(size_t scratch_size); -GGML_API void ggml_cuda_free_scratch(void); GGML_API bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor); GGML_API int ggml_cuda_get_device_count(void); @@ -52,13 +36,17 @@ GGML_API void ggml_cuda_get_device_description(int device, char * description, GGML_API ggml_backend_t ggml_backend_cuda_init(int device); GGML_API bool ggml_backend_is_cuda(ggml_backend_t backend); -GGML_API int ggml_backend_cuda_get_device(ggml_backend_t backend); GGML_API ggml_backend_buffer_type_t ggml_backend_cuda_buffer_type(int device); - -// pinned host buffer for use with CPU backend for faster copies between CPU and GPU +// split tensor buffer that splits matrices by rows across multiple devices +GGML_API ggml_backend_buffer_type_t ggml_backend_cuda_split_buffer_type(const float * tensor_split); +// pinned host buffer for use with the CPU backend for faster copies between CPU and GPU GGML_API ggml_backend_buffer_type_t ggml_backend_cuda_host_buffer_type(void); +GGML_API int ggml_backend_cuda_get_device_count(void); +GGML_API void ggml_backend_cuda_get_device_description(int device, char * description, size_t description_size); +GGML_API void ggml_backend_cuda_get_device_memory(int device, size_t * free, size_t * total); + #ifdef __cplusplus } #endif diff --git a/ggml-impl.h b/ggml-impl.h index 2faced080..2c58075ac 100644 --- a/ggml-impl.h +++ b/ggml-impl.h @@ -228,6 +228,8 @@ inline static float ggml_lookup_fp16_to_fp32(ggml_fp16_t f) { #define GGML_HASHTABLE_FULL ((size_t)-1) #define GGML_HASHTABLE_ALREADY_EXISTS ((size_t)-2) +struct ggml_hash_set ggml_hash_set_new(size_t size); + bool ggml_hash_contains (const struct ggml_hash_set hash_set, struct ggml_tensor * key); // returns GGML_HASHTABLE_FULL if table is full, otherwise the current index of the key or where it should be inserted diff --git a/ggml-metal.m b/ggml-metal.m index 6c2a8d04e..c03624073 100644 --- a/ggml-metal.m +++ b/ggml-metal.m @@ -89,6 +89,7 @@ struct ggml_metal_context { GGML_METAL_DECL_KERNEL(get_rows_q6_K); GGML_METAL_DECL_KERNEL(get_rows_i32); GGML_METAL_DECL_KERNEL(get_rows_iq2_xxs); + GGML_METAL_DECL_KERNEL(get_rows_iq2_xs); GGML_METAL_DECL_KERNEL(rms_norm); GGML_METAL_DECL_KERNEL(group_norm); GGML_METAL_DECL_KERNEL(norm); @@ -108,6 +109,7 @@ struct ggml_metal_context { GGML_METAL_DECL_KERNEL(mul_mv_q5_K_f32); GGML_METAL_DECL_KERNEL(mul_mv_q6_K_f32); GGML_METAL_DECL_KERNEL(mul_mv_iq2_xxs_f32); + GGML_METAL_DECL_KERNEL(mul_mv_iq2_xs_f32); GGML_METAL_DECL_KERNEL(mul_mv_id_f32_f32); //GGML_METAL_DECL_KERNEL(mul_mv_id_f16_f16); GGML_METAL_DECL_KERNEL(mul_mv_id_f16_f32); @@ -124,6 +126,7 @@ struct ggml_metal_context { GGML_METAL_DECL_KERNEL(mul_mv_id_q5_K_f32); GGML_METAL_DECL_KERNEL(mul_mv_id_q6_K_f32); GGML_METAL_DECL_KERNEL(mul_mv_id_iq2_xxs_f32); + GGML_METAL_DECL_KERNEL(mul_mv_id_iq2_xs_f32); GGML_METAL_DECL_KERNEL(mul_mm_f32_f32); GGML_METAL_DECL_KERNEL(mul_mm_f16_f32); GGML_METAL_DECL_KERNEL(mul_mm_q4_0_f32); @@ -137,6 +140,7 @@ struct ggml_metal_context { GGML_METAL_DECL_KERNEL(mul_mm_q5_K_f32); GGML_METAL_DECL_KERNEL(mul_mm_q6_K_f32); GGML_METAL_DECL_KERNEL(mul_mm_iq2_xxs_f32); + GGML_METAL_DECL_KERNEL(mul_mm_iq2_xs_f32); GGML_METAL_DECL_KERNEL(mul_mm_id_f32_f32); GGML_METAL_DECL_KERNEL(mul_mm_id_f16_f32); GGML_METAL_DECL_KERNEL(mul_mm_id_q4_0_f32); @@ -150,6 +154,7 @@ struct ggml_metal_context { GGML_METAL_DECL_KERNEL(mul_mm_id_q5_K_f32); GGML_METAL_DECL_KERNEL(mul_mm_id_q6_K_f32); GGML_METAL_DECL_KERNEL(mul_mm_id_iq2_xxs_f32); + GGML_METAL_DECL_KERNEL(mul_mm_id_iq2_xs_f32); GGML_METAL_DECL_KERNEL(rope_f32); GGML_METAL_DECL_KERNEL(rope_f16); GGML_METAL_DECL_KERNEL(alibi_f32); @@ -385,6 +390,7 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) { GGML_METAL_ADD_KERNEL(get_rows_q6_K); GGML_METAL_ADD_KERNEL(get_rows_i32); GGML_METAL_ADD_KERNEL(get_rows_iq2_xxs); + GGML_METAL_ADD_KERNEL(get_rows_iq2_xs); GGML_METAL_ADD_KERNEL(rms_norm); GGML_METAL_ADD_KERNEL(group_norm); GGML_METAL_ADD_KERNEL(norm); @@ -404,6 +410,7 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) { GGML_METAL_ADD_KERNEL(mul_mv_q5_K_f32); GGML_METAL_ADD_KERNEL(mul_mv_q6_K_f32); GGML_METAL_ADD_KERNEL(mul_mv_iq2_xxs_f32); + GGML_METAL_ADD_KERNEL(mul_mv_iq2_xs_f32); GGML_METAL_ADD_KERNEL(mul_mv_id_f32_f32); //GGML_METAL_ADD_KERNEL(mul_mv_id_f16_f16); GGML_METAL_ADD_KERNEL(mul_mv_id_f16_f32); @@ -420,6 +427,7 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) { GGML_METAL_ADD_KERNEL(mul_mv_id_q5_K_f32); GGML_METAL_ADD_KERNEL(mul_mv_id_q6_K_f32); GGML_METAL_ADD_KERNEL(mul_mv_id_iq2_xxs_f32); + GGML_METAL_ADD_KERNEL(mul_mv_id_iq2_xs_f32); if ([ctx->device supportsFamily:MTLGPUFamilyApple7]) { GGML_METAL_ADD_KERNEL(mul_mm_f32_f32); GGML_METAL_ADD_KERNEL(mul_mm_f16_f32); @@ -434,6 +442,7 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) { GGML_METAL_ADD_KERNEL(mul_mm_q5_K_f32); GGML_METAL_ADD_KERNEL(mul_mm_q6_K_f32); GGML_METAL_ADD_KERNEL(mul_mm_iq2_xxs_f32); + GGML_METAL_ADD_KERNEL(mul_mm_iq2_xs_f32); GGML_METAL_ADD_KERNEL(mul_mm_id_f32_f32); GGML_METAL_ADD_KERNEL(mul_mm_id_f16_f32); GGML_METAL_ADD_KERNEL(mul_mm_id_q4_0_f32); @@ -447,6 +456,7 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) { GGML_METAL_ADD_KERNEL(mul_mm_id_q5_K_f32); GGML_METAL_ADD_KERNEL(mul_mm_id_q6_K_f32); GGML_METAL_ADD_KERNEL(mul_mm_id_iq2_xxs_f32); + GGML_METAL_ADD_KERNEL(mul_mm_id_iq2_xs_f32); } GGML_METAL_ADD_KERNEL(rope_f32); GGML_METAL_ADD_KERNEL(rope_f16); @@ -513,6 +523,7 @@ void ggml_metal_free(struct ggml_metal_context * ctx) { GGML_METAL_DEL_KERNEL(get_rows_q6_K); GGML_METAL_DEL_KERNEL(get_rows_i32); GGML_METAL_DEL_KERNEL(get_rows_iq2_xxs); + GGML_METAL_DEL_KERNEL(get_rows_iq2_xs); GGML_METAL_DEL_KERNEL(rms_norm); GGML_METAL_DEL_KERNEL(group_norm); GGML_METAL_DEL_KERNEL(norm); @@ -532,6 +543,7 @@ void ggml_metal_free(struct ggml_metal_context * ctx) { GGML_METAL_DEL_KERNEL(mul_mv_q5_K_f32); GGML_METAL_DEL_KERNEL(mul_mv_q6_K_f32); GGML_METAL_DEL_KERNEL(mul_mv_iq2_xxs_f32); + GGML_METAL_DEL_KERNEL(mul_mv_iq2_xs_f32); GGML_METAL_DEL_KERNEL(mul_mv_id_f32_f32); //GGML_METAL_DEL_KERNEL(mul_mv_id_f16_f16); GGML_METAL_DEL_KERNEL(mul_mv_id_f16_f32); @@ -548,6 +560,7 @@ void ggml_metal_free(struct ggml_metal_context * ctx) { GGML_METAL_DEL_KERNEL(mul_mv_id_q5_K_f32); GGML_METAL_DEL_KERNEL(mul_mv_id_q6_K_f32); GGML_METAL_DEL_KERNEL(mul_mv_id_iq2_xxs_f32); + GGML_METAL_DEL_KERNEL(mul_mv_id_iq2_xs_f32); if ([ctx->device supportsFamily:MTLGPUFamilyApple7]) { GGML_METAL_DEL_KERNEL(mul_mm_f32_f32); GGML_METAL_DEL_KERNEL(mul_mm_f16_f32); @@ -562,6 +575,7 @@ void ggml_metal_free(struct ggml_metal_context * ctx) { GGML_METAL_DEL_KERNEL(mul_mm_q5_K_f32); GGML_METAL_DEL_KERNEL(mul_mm_q6_K_f32); GGML_METAL_DEL_KERNEL(mul_mm_iq2_xxs_f32); + GGML_METAL_DEL_KERNEL(mul_mm_iq2_xs_f32); GGML_METAL_DEL_KERNEL(mul_mm_id_f32_f32); GGML_METAL_DEL_KERNEL(mul_mm_id_f16_f32); GGML_METAL_DEL_KERNEL(mul_mm_id_q4_0_f32); @@ -575,6 +589,7 @@ void ggml_metal_free(struct ggml_metal_context * ctx) { GGML_METAL_DEL_KERNEL(mul_mm_id_q5_K_f32); GGML_METAL_DEL_KERNEL(mul_mm_id_q6_K_f32); GGML_METAL_DEL_KERNEL(mul_mm_id_iq2_xxs_f32); + GGML_METAL_DEL_KERNEL(mul_mm_id_iq2_xs_f32); } GGML_METAL_DEL_KERNEL(rope_f32); GGML_METAL_DEL_KERNEL(rope_f16); @@ -1067,6 +1082,10 @@ bool ggml_metal_graph_compute( GGML_ASSERT(!"unsupported op"); } +#ifndef GGML_METAL_NDEBUG + [encoder pushDebugGroup:[NSString stringWithCString:ggml_op_desc(dst) encoding:NSUTF8StringEncoding]]; +#endif + const int64_t ne00 = src0 ? src0->ne[0] : 0; const int64_t ne01 = src0 ? src0->ne[1] : 0; const int64_t ne02 = src0 ? src0->ne[2] : 0; @@ -1557,6 +1576,7 @@ bool ggml_metal_graph_compute( case GGML_TYPE_Q5_K: [encoder setComputePipelineState:ctx->pipeline_mul_mm_q5_K_f32]; break; case GGML_TYPE_Q6_K: [encoder setComputePipelineState:ctx->pipeline_mul_mm_q6_K_f32]; break; case GGML_TYPE_IQ2_XXS: [encoder setComputePipelineState:ctx->pipeline_mul_mm_iq2_xxs_f32]; break; + case GGML_TYPE_IQ2_XS : [encoder setComputePipelineState:ctx->pipeline_mul_mm_iq2_xs_f32]; break; default: GGML_ASSERT(false && "MUL MAT-MAT not implemented"); } [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; @@ -1675,6 +1695,12 @@ bool ggml_metal_graph_compute( nth1 = 16; [encoder setComputePipelineState:ctx->pipeline_mul_mv_iq2_xxs_f32]; } break; + case GGML_TYPE_IQ2_XS: + { + nth0 = 4; + nth1 = 16; + [encoder setComputePipelineState:ctx->pipeline_mul_mv_iq2_xs_f32]; + } break; default: { GGML_METAL_LOG_ERROR("Asserting on type %d\n", (int)src0t); @@ -1708,12 +1734,12 @@ bool ggml_metal_graph_compute( if (src0t == GGML_TYPE_Q4_0 || src0t == GGML_TYPE_Q4_1 || src0t == GGML_TYPE_Q5_0 || src0t == GGML_TYPE_Q5_1 || src0t == GGML_TYPE_Q8_0 || - //src0t == GGML_TYPE_IQ2_XXS || src0t == GGML_TYPE_Q2_K) { // || src0t == GGML_TYPE_Q4_K) { [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 7)/8, ne11, ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)]; } - else if (src0t == GGML_TYPE_IQ2_XXS) { - [encoder setThreadgroupMemoryLength:(256*8+128) atIndex:0]; + else if (src0t == GGML_TYPE_IQ2_XXS || src0t == GGML_TYPE_IQ2_XS) { + const int mem_size = src0t == GGML_TYPE_IQ2_XXS ? 256*8+128 : 512*8+128; + [encoder setThreadgroupMemoryLength:mem_size atIndex:0]; [encoder dispatchThreadgroups:MTLSizeMake((ne01 + 7)/8, ne11, ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)]; } else if (src0t == GGML_TYPE_Q4_K) { @@ -1806,6 +1832,7 @@ bool ggml_metal_graph_compute( case GGML_TYPE_Q5_K: [encoder setComputePipelineState:ctx->pipeline_mul_mm_id_q5_K_f32]; break; case GGML_TYPE_Q6_K: [encoder setComputePipelineState:ctx->pipeline_mul_mm_id_q6_K_f32]; break; case GGML_TYPE_IQ2_XXS: [encoder setComputePipelineState:ctx->pipeline_mul_mm_id_iq2_xxs_f32]; break; + case GGML_TYPE_IQ2_XS : [encoder setComputePipelineState:ctx->pipeline_mul_mm_id_iq2_xs_f32]; break; default: GGML_ASSERT(false && "MUL_MAT_ID not implemented"); } [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; @@ -1927,6 +1954,12 @@ bool ggml_metal_graph_compute( nth1 = 16; [encoder setComputePipelineState:ctx->pipeline_mul_mv_id_iq2_xxs_f32]; } break; + case GGML_TYPE_IQ2_XS: + { + nth0 = 4; + nth1 = 16; + [encoder setComputePipelineState:ctx->pipeline_mul_mv_id_iq2_xs_f32]; + } break; default: { GGML_METAL_LOG_ERROR("Asserting on type %d\n", (int)src2t); @@ -1976,12 +2009,12 @@ bool ggml_metal_graph_compute( if (src2t == GGML_TYPE_Q4_0 || src2t == GGML_TYPE_Q4_1 || src2t == GGML_TYPE_Q5_0 || src2t == GGML_TYPE_Q5_1 || src2t == GGML_TYPE_Q8_0 || - //src2t == GGML_TYPE_IQ2_XXS || src2t == GGML_TYPE_Q2_K) { // || src2t == GGML_TYPE_Q4_K) { [encoder dispatchThreadgroups:MTLSizeMake((ne21 + 7)/8, _ne1, ne01*ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)]; } - else if (src2t == GGML_TYPE_IQ2_XXS) { - [encoder setThreadgroupMemoryLength:(256*8+128) atIndex:0]; + else if (src2t == GGML_TYPE_IQ2_XXS || src2t == GGML_TYPE_IQ2_XS) { + const int mem_size = src2t == GGML_TYPE_IQ2_XXS ? 256*8+128 : 512*8+128; + [encoder setThreadgroupMemoryLength:mem_size atIndex:0]; [encoder dispatchThreadgroups:MTLSizeMake((ne21 + 7)/8, _ne1, ne01*ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)]; } else if (src2t == GGML_TYPE_Q4_K) { @@ -2022,6 +2055,7 @@ bool ggml_metal_graph_compute( case GGML_TYPE_Q6_K: [encoder setComputePipelineState:ctx->pipeline_get_rows_q6_K]; break; case GGML_TYPE_I32: [encoder setComputePipelineState:ctx->pipeline_get_rows_i32]; break; case GGML_TYPE_IQ2_XXS: [encoder setComputePipelineState:ctx->pipeline_get_rows_iq2_xxs]; break; + case GGML_TYPE_IQ2_XS : [encoder setComputePipelineState:ctx->pipeline_get_rows_iq2_xs]; break; default: GGML_ASSERT(false && "not implemented"); } @@ -2423,6 +2457,10 @@ bool ggml_metal_graph_compute( GGML_ASSERT(false); } } + +#ifndef GGML_METAL_NDEBUG + [encoder popDebugGroup]; +#endif } if (encoder != nil) { @@ -2482,10 +2520,10 @@ static void ggml_backend_metal_free_device(void) { } } -static void * ggml_backend_metal_buffer_get_base(ggml_backend_buffer_t buffer) { - struct ggml_backend_metal_buffer_context * ctx = (struct ggml_backend_metal_buffer_context *)buffer->context; +static const char * ggml_backend_metal_buffer_get_name(ggml_backend_buffer_t buffer) { + return "Metal"; - return ctx->all_data; + UNUSED(buffer); } static void ggml_backend_metal_buffer_free_buffer(ggml_backend_buffer_t buffer) { @@ -2503,6 +2541,12 @@ static void ggml_backend_metal_buffer_free_buffer(ggml_backend_buffer_t buffer) free(ctx); } +static void * ggml_backend_metal_buffer_get_base(ggml_backend_buffer_t buffer) { + struct ggml_backend_metal_buffer_context * ctx = (struct ggml_backend_metal_buffer_context *)buffer->context; + + return ctx->all_data; +} + static void ggml_backend_metal_buffer_set_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) { memcpy((char *)tensor->data + offset, data, size); @@ -2515,14 +2559,12 @@ static void ggml_backend_metal_buffer_get_tensor(ggml_backend_buffer_t buffer, c UNUSED(buffer); } -static void ggml_backend_metal_buffer_cpy_tensor_from(ggml_backend_buffer_t buffer, struct ggml_tensor * src, struct ggml_tensor * dst) { - ggml_backend_tensor_get(src, dst->data, 0, ggml_nbytes(src)); - - UNUSED(buffer); -} - -static void ggml_backend_metal_buffer_cpy_tensor_to(ggml_backend_buffer_t buffer, struct ggml_tensor * src, struct ggml_tensor * dst) { - ggml_backend_tensor_set(dst, src->data, 0, ggml_nbytes(src)); +static bool ggml_backend_metal_buffer_cpy_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * src, struct ggml_tensor * dst) { + if (ggml_backend_buffer_is_host(src->buffer)) { + memcpy(dst->data, src->data, ggml_nbytes(src)); + return true; + } + return false; UNUSED(buffer); } @@ -2534,18 +2576,25 @@ static void ggml_backend_metal_buffer_clear(ggml_backend_buffer_t buffer, uint8_ } static struct ggml_backend_buffer_i ggml_backend_metal_buffer_i = { + /* .get_name = */ ggml_backend_metal_buffer_get_name, /* .free_buffer = */ ggml_backend_metal_buffer_free_buffer, /* .get_base = */ ggml_backend_metal_buffer_get_base, /* .init_tensor = */ NULL, /* .set_tensor = */ ggml_backend_metal_buffer_set_tensor, /* .get_tensor = */ ggml_backend_metal_buffer_get_tensor, - /* .cpy_tensor_from = */ ggml_backend_metal_buffer_cpy_tensor_from, - /* .cpy_tensor_to = */ ggml_backend_metal_buffer_cpy_tensor_to, + /* .cpy_tensor = */ ggml_backend_metal_buffer_cpy_tensor, /* .clear = */ ggml_backend_metal_buffer_clear, + /* .reset = */ NULL, }; // default buffer type +static const char * ggml_backend_metal_buffer_type_get_name(ggml_backend_buffer_type_t buft) { + return "Metal"; + + UNUSED(buft); +} + static ggml_backend_buffer_t ggml_backend_metal_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) { struct ggml_backend_metal_buffer_context * ctx = malloc(sizeof(struct ggml_backend_metal_buffer_context)); @@ -2618,6 +2667,7 @@ static bool ggml_backend_metal_buffer_type_is_host(ggml_backend_buffer_type_t bu ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(void) { static struct ggml_backend_buffer_type ggml_backend_buffer_type_metal = { /* .iface = */ { + /* .get_name = */ ggml_backend_metal_buffer_type_get_name, /* .alloc_buffer = */ ggml_backend_metal_buffer_type_alloc_buffer, /* .get_alignment = */ ggml_backend_metal_buffer_type_get_alignment, /* .get_alloc_size = */ NULL, // defaults to ggml_nbytes @@ -2641,6 +2691,14 @@ ggml_backend_buffer_t ggml_backend_metal_buffer_from_ptr(void * data, size_t siz ctx->n_buffers = 0; const size_t size_page = sysconf(_SC_PAGESIZE); + + // page-align the data ptr + { + const uintptr_t offs = (uintptr_t) data % size_page; + data = (void *) ((char *) data - offs); + size += offs; + } + size_t size_aligned = size; if ((size_aligned % size_page) != 0) { size_aligned += (size_page - (size_aligned % size_page)); @@ -2741,14 +2799,13 @@ static bool ggml_backend_metal_supports_op(ggml_backend_t backend, const struct UNUSED(backend); } -static struct ggml_backend_i metal_backend_i = { +static struct ggml_backend_i ggml_backend_metal_i = { /* .get_name = */ ggml_backend_metal_name, /* .free = */ ggml_backend_metal_free, /* .get_default_buffer_type = */ ggml_backend_metal_get_default_buffer_type, /* .set_tensor_async = */ NULL, /* .get_tensor_async = */ NULL, - /* .cpy_tensor_from_async = */ NULL, - /* .cpy_tensor_to_async = */ NULL, + /* .cpy_tensor_async = */ NULL, /* .synchronize = */ NULL, /* .graph_plan_create = */ NULL, /* .graph_plan_free = */ NULL, @@ -2767,7 +2824,7 @@ ggml_backend_t ggml_backend_metal_init(void) { ggml_backend_t metal_backend = malloc(sizeof(struct ggml_backend)); *metal_backend = (struct ggml_backend) { - /* .interface = */ metal_backend_i, + /* .interface = */ ggml_backend_metal_i, /* .context = */ ctx, }; @@ -2775,7 +2832,7 @@ ggml_backend_t ggml_backend_metal_init(void) { } bool ggml_backend_is_metal(ggml_backend_t backend) { - return backend->iface.get_name == ggml_backend_metal_name; + return backend && backend->iface.get_name == ggml_backend_metal_name; } void ggml_backend_metal_set_n_cb(ggml_backend_t backend, int n_cb) { diff --git a/ggml-metal.metal b/ggml-metal.metal index 229efb8b6..029578dc5 100644 --- a/ggml-metal.metal +++ b/ggml-metal.metal @@ -2452,6 +2452,13 @@ typedef struct { } block_iq2_xxs; // 66 bytes / block for QK_K = 256, so 2.0625 bpw +typedef struct { + half d; + uint16_t qs[QK_K/8]; + uint8_t scales[QK_K/32]; +} block_iq2_xs; +// 74 bytes / block for QK_K = 256, so 2.3125 bpw + //====================================== dot products ========================= void kernel_mul_mv_q2_K_f32_impl( @@ -3476,7 +3483,7 @@ kernel void kernel_mul_mv_q6_K_f32( // ======================= "True" 2-bit -constexpr constant static uint64_t kgrid_iq2xxs[256] = { +constexpr constant static uint64_t iq2xxs_grid[256] = { 0x0808080808080808, 0x080808080808082b, 0x0808080808081919, 0x0808080808082b08, 0x0808080808082b2b, 0x0808080808190819, 0x0808080808191908, 0x08080808082b0808, 0x08080808082b082b, 0x08080808082b2b08, 0x08080808082b2b2b, 0x0808080819080819, @@ -3543,6 +3550,137 @@ constexpr constant static uint64_t kgrid_iq2xxs[256] = { 0x2b2b082b08080808, 0x2b2b190808192b08, 0x2b2b2b0819190808, 0x2b2b2b1908081908, }; +constexpr constant static uint64_t iq2xs_grid[512] = { + 0x0808080808080808, 0x080808080808082b, 0x0808080808081919, 0x0808080808082b08, + 0x0808080808082b2b, 0x0808080808190819, 0x0808080808191908, 0x080808080819192b, + 0x0808080808192b19, 0x08080808082b0808, 0x08080808082b082b, 0x08080808082b1919, + 0x08080808082b2b08, 0x0808080819080819, 0x0808080819081908, 0x080808081908192b, + 0x0808080819082b19, 0x0808080819190808, 0x080808081919082b, 0x0808080819191919, + 0x0808080819192b08, 0x08080808192b0819, 0x08080808192b1908, 0x080808082b080808, + 0x080808082b08082b, 0x080808082b081919, 0x080808082b082b08, 0x080808082b190819, + 0x080808082b191908, 0x080808082b192b19, 0x080808082b2b0808, 0x0808081908080819, + 0x0808081908081908, 0x080808190808192b, 0x0808081908082b19, 0x0808081908190808, + 0x080808190819082b, 0x0808081908191919, 0x0808081908192b08, 0x0808081908192b2b, + 0x08080819082b0819, 0x08080819082b1908, 0x0808081919080808, 0x080808191908082b, + 0x0808081919081919, 0x0808081919082b08, 0x0808081919190819, 0x0808081919191908, + 0x08080819192b0808, 0x08080819192b2b08, 0x080808192b080819, 0x080808192b081908, + 0x080808192b190808, 0x0808082b08080808, 0x0808082b0808082b, 0x0808082b08081919, + 0x0808082b08082b08, 0x0808082b08190819, 0x0808082b08191908, 0x0808082b082b0808, + 0x0808082b19080819, 0x0808082b19081908, 0x0808082b19190808, 0x0808082b19191919, + 0x0808082b2b080808, 0x0808082b2b082b2b, 0x0808190808080819, 0x0808190808081908, + 0x080819080808192b, 0x0808190808082b19, 0x0808190808190808, 0x080819080819082b, + 0x0808190808191919, 0x0808190808192b08, 0x08081908082b0819, 0x08081908082b1908, + 0x0808190819080808, 0x080819081908082b, 0x0808190819081919, 0x0808190819082b08, + 0x0808190819190819, 0x0808190819191908, 0x080819081919192b, 0x08081908192b0808, + 0x080819082b080819, 0x080819082b081908, 0x080819082b190808, 0x0808191908080808, + 0x080819190808082b, 0x0808191908081919, 0x0808191908082b08, 0x0808191908190819, + 0x0808191908191908, 0x08081919082b0808, 0x0808191919080819, 0x0808191919081908, + 0x0808191919190808, 0x08081919192b0819, 0x080819192b080808, 0x0808192b08080819, + 0x0808192b08081908, 0x0808192b08190808, 0x0808192b082b192b, 0x0808192b19080808, + 0x0808192b1908082b, 0x0808192b2b081908, 0x08082b0808080808, 0x08082b080808082b, + 0x08082b0808081919, 0x08082b0808082b08, 0x08082b0808082b2b, 0x08082b0808190819, + 0x08082b0808191908, 0x08082b08082b0808, 0x08082b08082b1919, 0x08082b0819080819, + 0x08082b0819081908, 0x08082b0819190808, 0x08082b0819192b08, 0x08082b082b080808, + 0x08082b082b2b0808, 0x08082b082b2b2b2b, 0x08082b1908080819, 0x08082b1908081908, + 0x08082b1908190808, 0x08082b1919080808, 0x08082b192b080819, 0x08082b192b082b19, + 0x08082b2b08080808, 0x08082b2b082b0808, 0x08082b2b082b2b08, 0x08082b2b2b19192b, + 0x08082b2b2b2b0808, 0x0819080808080819, 0x0819080808081908, 0x081908080808192b, + 0x0819080808082b19, 0x0819080808190808, 0x081908080819082b, 0x0819080808191919, + 0x0819080808192b08, 0x08190808082b0819, 0x08190808082b1908, 0x0819080819080808, + 0x081908081908082b, 0x0819080819081919, 0x0819080819082b08, 0x0819080819190819, + 0x0819080819191908, 0x08190808192b0808, 0x08190808192b2b2b, 0x081908082b080819, + 0x081908082b081908, 0x081908082b190808, 0x0819081908080808, 0x081908190808082b, + 0x0819081908081919, 0x0819081908082b08, 0x0819081908190819, 0x0819081908191908, + 0x08190819082b0808, 0x0819081919080819, 0x0819081919081908, 0x0819081919190808, + 0x081908192b080808, 0x081908192b191908, 0x081908192b19192b, 0x0819082b08080819, + 0x0819082b08081908, 0x0819082b0808192b, 0x0819082b08190808, 0x0819082b19080808, + 0x0819082b192b0808, 0x0819190808080808, 0x081919080808082b, 0x0819190808081919, + 0x0819190808082b08, 0x0819190808190819, 0x0819190808191908, 0x08191908082b0808, + 0x0819190819080819, 0x0819190819081908, 0x0819190819082b19, 0x0819190819190808, + 0x08191908192b1908, 0x081919082b080808, 0x0819191908080819, 0x0819191908081908, + 0x0819191908190808, 0x0819191919080808, 0x0819192b08080808, 0x0819192b08191908, + 0x0819192b19082b19, 0x08192b0808080819, 0x08192b0808081908, 0x08192b0808190808, + 0x08192b080819082b, 0x08192b0819080808, 0x08192b0819191908, 0x08192b082b08192b, + 0x08192b1908080808, 0x08192b1908081919, 0x08192b19192b192b, 0x08192b2b19190819, + 0x08192b2b2b2b2b19, 0x082b080808080808, 0x082b08080808082b, 0x082b080808081919, + 0x082b080808082b08, 0x082b080808082b2b, 0x082b080808190819, 0x082b080808191908, + 0x082b0808082b0808, 0x082b080819080819, 0x082b080819081908, 0x082b080819190808, + 0x082b08082b080808, 0x082b08082b2b0808, 0x082b081908080819, 0x082b081908081908, + 0x082b081908190808, 0x082b081919080808, 0x082b081919082b08, 0x082b0819192b1919, + 0x082b082b08080808, 0x082b082b082b082b, 0x082b082b2b080808, 0x082b082b2b2b2b08, + 0x082b190808080819, 0x082b190808081908, 0x082b190808190808, 0x082b1908082b2b19, + 0x082b190819080808, 0x082b191908080808, 0x082b191919080819, 0x082b19191919082b, + 0x082b19192b192b19, 0x082b192b08080819, 0x082b192b08192b2b, 0x082b192b2b2b192b, + 0x082b2b0808080808, 0x082b2b0808082b08, 0x082b2b0808082b2b, 0x082b2b08082b0808, + 0x082b2b0819191919, 0x082b2b082b082b08, 0x082b2b082b2b082b, 0x082b2b19192b2b08, + 0x082b2b192b190808, 0x082b2b2b08082b08, 0x082b2b2b082b0808, 0x082b2b2b2b08082b, + 0x082b2b2b2b082b08, 0x082b2b2b2b082b2b, 0x1908080808080819, 0x1908080808081908, + 0x190808080808192b, 0x1908080808082b19, 0x1908080808190808, 0x190808080819082b, + 0x1908080808191919, 0x1908080808192b08, 0x19080808082b0819, 0x19080808082b1908, + 0x1908080819080808, 0x190808081908082b, 0x1908080819081919, 0x1908080819082b08, + 0x1908080819082b2b, 0x1908080819190819, 0x1908080819191908, 0x19080808192b0808, + 0x19080808192b1919, 0x190808082b080819, 0x190808082b081908, 0x190808082b190808, + 0x1908081908080808, 0x190808190808082b, 0x1908081908081919, 0x1908081908082b08, + 0x1908081908190819, 0x1908081908191908, 0x19080819082b0808, 0x1908081919080819, + 0x1908081919081908, 0x1908081919190808, 0x190808192b080808, 0x190808192b081919, + 0x190808192b2b082b, 0x1908082b08080819, 0x1908082b08081908, 0x1908082b08190808, + 0x1908082b0819082b, 0x1908082b082b2b19, 0x1908082b19080808, 0x1908190808080808, + 0x190819080808082b, 0x1908190808081919, 0x1908190808082b08, 0x1908190808190819, + 0x1908190808191908, 0x1908190808192b19, 0x19081908082b0808, 0x1908190819080819, + 0x1908190819081908, 0x1908190819190808, 0x190819082b080808, 0x190819082b191908, + 0x1908191908080819, 0x1908191908081908, 0x1908191908190808, 0x19081919082b1908, + 0x1908191919080808, 0x190819192b192b2b, 0x1908192b08080808, 0x1908192b08082b2b, + 0x1908192b19081908, 0x1908192b19190808, 0x19082b0808080819, 0x19082b0808081908, + 0x19082b0808190808, 0x19082b0819080808, 0x19082b0819081919, 0x19082b0819191908, + 0x19082b08192b082b, 0x19082b1908080808, 0x19082b1908190819, 0x19082b1919081908, + 0x19082b1919190808, 0x19082b19192b2b19, 0x19082b2b08081908, 0x1919080808080808, + 0x191908080808082b, 0x1919080808081919, 0x1919080808082b08, 0x1919080808190819, + 0x1919080808191908, 0x19190808082b0808, 0x19190808082b2b08, 0x1919080819080819, + 0x1919080819081908, 0x1919080819190808, 0x191908082b080808, 0x1919081908080819, + 0x1919081908081908, 0x1919081908190808, 0x1919081908191919, 0x1919081919080808, + 0x191908191908082b, 0x1919082b08080808, 0x1919082b19081908, 0x1919082b2b2b2b2b, + 0x1919190808080819, 0x1919190808081908, 0x1919190808190808, 0x19191908082b0819, + 0x1919190819080808, 0x19191908192b0808, 0x191919082b080819, 0x191919082b2b0819, + 0x1919191908080808, 0x1919191908082b08, 0x191919192b080808, 0x191919192b082b08, + 0x1919192b082b0819, 0x1919192b192b2b08, 0x1919192b2b2b0819, 0x19192b0808080808, + 0x19192b0808191908, 0x19192b0819080819, 0x19192b0819190808, 0x19192b082b192b19, + 0x19192b1908192b2b, 0x19192b1919080808, 0x19192b191908082b, 0x19192b2b2b081919, + 0x192b080808080819, 0x192b080808081908, 0x192b080808190808, 0x192b080819080808, + 0x192b080819191908, 0x192b0808192b082b, 0x192b08082b08192b, 0x192b08082b2b2b19, + 0x192b081908080808, 0x192b082b082b1908, 0x192b082b19082b2b, 0x192b082b2b19082b, + 0x192b190808080808, 0x192b19080819192b, 0x192b191908190808, 0x192b191919080808, + 0x192b191919081919, 0x192b19192b2b1908, 0x192b2b0808080819, 0x192b2b08192b2b2b, + 0x192b2b19082b1919, 0x192b2b2b0808192b, 0x192b2b2b19191908, 0x192b2b2b192b082b, + 0x2b08080808080808, 0x2b0808080808082b, 0x2b08080808081919, 0x2b08080808082b08, + 0x2b08080808190819, 0x2b08080808191908, 0x2b080808082b0808, 0x2b080808082b2b2b, + 0x2b08080819080819, 0x2b08080819081908, 0x2b08080819190808, 0x2b0808082b080808, + 0x2b0808082b08082b, 0x2b0808082b2b2b08, 0x2b0808082b2b2b2b, 0x2b08081908080819, + 0x2b08081908081908, 0x2b0808190808192b, 0x2b08081908190808, 0x2b08081919080808, + 0x2b08081919190819, 0x2b08081919192b19, 0x2b08082b08080808, 0x2b08082b082b0808, + 0x2b08082b2b080808, 0x2b08082b2b08082b, 0x2b08082b2b2b0808, 0x2b08082b2b2b2b08, + 0x2b08190808080819, 0x2b08190808081908, 0x2b08190808190808, 0x2b0819080819082b, + 0x2b08190808191919, 0x2b08190819080808, 0x2b081908192b0808, 0x2b0819082b082b19, + 0x2b08191908080808, 0x2b08191919081908, 0x2b0819192b2b1919, 0x2b08192b08192b08, + 0x2b08192b192b2b2b, 0x2b082b0808080808, 0x2b082b0808082b08, 0x2b082b08082b1919, + 0x2b082b0819192b2b, 0x2b082b082b080808, 0x2b082b082b08082b, 0x2b082b082b2b2b08, + 0x2b082b190808192b, 0x2b082b2b082b082b, 0x2b082b2b2b080808, 0x2b082b2b2b082b08, + 0x2b082b2b2b19192b, 0x2b082b2b2b2b2b08, 0x2b19080808080819, 0x2b19080808081908, + 0x2b19080808190808, 0x2b19080819080808, 0x2b1908081919192b, 0x2b1908082b081908, + 0x2b19081908080808, 0x2b190819082b082b, 0x2b190819192b1908, 0x2b19082b1919192b, + 0x2b19082b2b082b19, 0x2b19190808080808, 0x2b19190808081919, 0x2b19190819081908, + 0x2b19190819190808, 0x2b19190819192b08, 0x2b191919082b2b19, 0x2b1919192b190808, + 0x2b1919192b19082b, 0x2b19192b19080819, 0x2b192b0819190819, 0x2b192b082b2b192b, + 0x2b192b1919082b19, 0x2b192b2b08191919, 0x2b192b2b192b0808, 0x2b2b080808080808, + 0x2b2b08080808082b, 0x2b2b080808082b08, 0x2b2b080808082b2b, 0x2b2b0808082b0808, + 0x2b2b0808082b2b2b, 0x2b2b08082b2b0808, 0x2b2b081919190819, 0x2b2b081919192b19, + 0x2b2b08192b2b192b, 0x2b2b082b08080808, 0x2b2b082b0808082b, 0x2b2b082b08082b08, + 0x2b2b082b082b2b2b, 0x2b2b082b2b080808, 0x2b2b082b2b2b0808, 0x2b2b190819080808, + 0x2b2b19082b191919, 0x2b2b192b192b1919, 0x2b2b192b2b192b08, 0x2b2b2b0808082b2b, + 0x2b2b2b08082b0808, 0x2b2b2b08082b082b, 0x2b2b2b08082b2b08, 0x2b2b2b082b2b0808, + 0x2b2b2b082b2b2b08, 0x2b2b2b1908081908, 0x2b2b2b192b081908, 0x2b2b2b192b08192b, + 0x2b2b2b2b082b2b08, 0x2b2b2b2b082b2b2b, 0x2b2b2b2b2b190819, 0x2b2b2b2b2b2b2b2b, +}; + constexpr constant static uint8_t ksigns_iq2xs[128] = { 0, 129, 130, 3, 132, 5, 6, 135, 136, 9, 10, 139, 12, 141, 142, 15, 144, 17, 18, 147, 20, 149, 150, 23, 24, 153, 154, 27, 156, 29, 30, 159, @@ -3600,7 +3738,7 @@ void kernel_mul_mv_iq2_xxs_f32_impl( { int nval = 4; int pos = (32*sgitg + tiisg)*nval; - for (int i = 0; i < nval; ++i) values[pos + i] = kgrid_iq2xxs[pos + i]; + for (int i = 0; i < nval; ++i) values[pos + i] = iq2xxs_grid[pos + i]; nval = 2; pos = (32*sgitg + tiisg)*nval; for (int i = 0; i < nval; ++i) shared_signs[pos+i] = ksigns_iq2xs[pos+i]; @@ -3689,6 +3827,149 @@ kernel void kernel_mul_mv_iq2_xxs_f32( kernel_mul_mv_iq2_xxs_f32_impl(src0, src1, dst, ne00, ne01, ne02, ne10, ne12, ne0, ne1, r2, r3, shared_values, tgpig, tiisg, sgitg); } +void kernel_mul_mv_iq2_xs_f32_impl( + device const void * src0, + device const float * src1, + device float * dst, + constant int64_t & ne00, + constant int64_t & ne01, + constant int64_t & ne02, + constant int64_t & ne10, + constant int64_t & ne12, + constant int64_t & ne0, + constant int64_t & ne1, + constant uint & r2, + constant uint & r3, + threadgroup int8_t * shared_values [[threadgroup(0)]], + uint3 tgpig[[threadgroup_position_in_grid]], + uint tiisg[[thread_index_in_simdgroup]], + uint sgitg[[simdgroup_index_in_threadgroup]]) { + + const int nb = ne00/QK_K; + const int r0 = tgpig.x; + const int r1 = tgpig.y; + const int im = tgpig.z; + + const int first_row = (r0 * N_SIMDGROUP + sgitg) * N_DST; + const int ib_row = first_row * nb; + + const uint i12 = im%ne12; + const uint i13 = im/ne12; + + const uint offset0 = (i12/r2)*(nb*ne01) + (i13/r3)*(nb*ne01*ne02); + + device const block_iq2_xs * x = (device const block_iq2_xs *) src0 + ib_row + offset0; + device const float * y = (device const float *) src1 + r1*ne10 + im*ne00*ne1; + + float yl[32]; + float sumf[N_DST]={0.f}, all_sum; + + const int nb32 = nb * (QK_K / 32); + + threadgroup uint64_t * values = (threadgroup uint64_t *)shared_values; + threadgroup uint8_t * shared_signs = (threadgroup uint8_t *)(values + 512); + { + int nval = 8; + int pos = (32*sgitg + tiisg)*nval; + for (int i = 0; i < nval; ++i) values[pos + i] = iq2xs_grid[pos + i]; + nval = 2; + pos = (32*sgitg + tiisg)*nval; + for (int i = 0; i < nval; ++i) shared_signs[pos+i] = ksigns_iq2xs[pos+i]; + threadgroup_barrier(mem_flags::mem_threadgroup); + } + +#if QK_K == 256 + const int ix = tiisg; + + device const float * y4 = y + 32 * ix; + + for (int ib32 = ix; ib32 < nb32; ib32 += 32) { + + for (int i = 0; i < 32; ++i) { + yl[i] = y4[i]; + } + + const int ibl = ib32 / (QK_K / 32); + const int ib = ib32 % (QK_K / 32); + + device const block_iq2_xs * xr = x + ibl; + device const uint16_t * q2 = xr->qs + 4 * ib; + device const uint8_t * sc = xr->scales + ib; + device const half * dh = &xr->d; + + for (int row = 0; row < N_DST; row++) { + + const float db = dh[0]; + const uint8_t ls1 = sc[0] & 0xf; + const uint8_t ls2 = sc[0] >> 4; + const float d1 = db * (0.5f + ls1); + const float d2 = db * (0.5f + ls2); + + float sum1 = 0, sum2 = 0; + for (int l = 0; l < 2; ++l) { + const threadgroup uint8_t * grid = (const threadgroup uint8_t *)(values + (q2[l] & 511)); + const uint8_t signs = shared_signs[(q2[l] >> 9)]; + for (int j = 0; j < 8; ++j) { + sum1 += yl[8*l + j] * grid[j] * (signs & kmask_iq2xs[j] ? -1.f : 1.f); + } + } + for (int l = 2; l < 4; ++l) { + const threadgroup uint8_t * grid = (const threadgroup uint8_t *)(values + (q2[l] & 511)); + const uint8_t signs = shared_signs[(q2[l] >> 9)]; + for (int j = 0; j < 8; ++j) { + sum2 += yl[8*l + j] * grid[j] * (signs & kmask_iq2xs[j] ? -1.f : 1.f); + } + } + sumf[row] += d1 * sum1 + d2 * sum2; + + dh += nb*sizeof(block_iq2_xs)/2; + q2 += nb*sizeof(block_iq2_xs)/2; + sc += nb*sizeof(block_iq2_xs); + } + + y4 += 32 * 32; + } +#else + // TODO +#endif + + for (int row = 0; row < N_DST; ++row) { + all_sum = simd_sum(sumf[row]); + if (tiisg == 0) { + dst[r1*ne0 + im*ne0*ne1 + first_row + row] = all_sum * 0.25f; + } + } +} + +[[host_name("kernel_mul_mv_iq2_xs_f32")]] +kernel void kernel_mul_mv_iq2_xs_f32( + device const void * src0, + device const float * src1, + device float * dst, + constant int64_t & ne00, + constant int64_t & ne01, + constant int64_t & ne02, + constant uint64_t & nb00, + constant uint64_t & nb01, + constant uint64_t & nb02, + constant int64_t & ne10, + constant int64_t & ne11, + constant int64_t & ne12, + constant uint64_t & nb10, + constant uint64_t & nb11, + constant uint64_t & nb12, + constant int64_t & ne0, + constant int64_t & ne1, + constant uint & r2, + constant uint & r3, + threadgroup int8_t * shared_values [[threadgroup(0)]], + uint3 tgpig[[threadgroup_position_in_grid]], + uint tiisg[[thread_index_in_simdgroup]], + uint sgitg[[simdgroup_index_in_threadgroup]]) { + + kernel_mul_mv_iq2_xs_f32_impl(src0, src1, dst, ne00, ne01, ne02, ne10, ne12, ne0, ne1, r2, r3, shared_values, tgpig, tiisg, sgitg); +} + //============================= templates and their specializations ============================= // NOTE: this is not dequantizing - we are simply fitting the template @@ -3973,18 +4254,39 @@ void dequantize_iq2_xxs(device const block_iq2_xxs * xb, short il, thread type4x const uint32_t aux32_s = q2[2] | (q2[3] << 16); thread const uint8_t * aux8 = (thread const uint8_t *)&aux32_g; const float dl = d * (0.5f + (aux32_s >> 28)) * 0.25f; - constant uint8_t * grid = (constant uint8_t *)(kgrid_iq2xxs + aux8[2*il+0]); + constant uint8_t * grid = (constant uint8_t *)(iq2xxs_grid + aux8[2*il+0]); uint8_t signs = ksigns_iq2xs[(aux32_s >> 14*il) & 127]; for (int i = 0; i < 8; ++i) { reg[i/4][i%4] = dl * grid[i] * (signs & kmask_iq2xs[i] ? -1.f : 1.f); } - grid = (constant uint8_t *)(kgrid_iq2xxs + aux8[2*il+1]); + grid = (constant uint8_t *)(iq2xxs_grid + aux8[2*il+1]); signs = ksigns_iq2xs[(aux32_s >> (14*il+7)) & 127]; for (int i = 0; i < 8; ++i) { reg[2+i/4][i%4] = dl * grid[i] * (signs & kmask_iq2xs[i] ? -1.f : 1.f); } } +template +void dequantize_iq2_xs(device const block_iq2_xs * xb, short il, thread type4x4 & reg) { + // il is 0...15 for QK_K = 256 => index of block of 32 is il/2 + const float d = xb->d; + const int ib32 = il/2; + il = il%2; + // il = 0 or 1. il = 0 processes the first 16 quants in a block of 32, il = 1 the second 16 + device const uint16_t * q2 = xb->qs + 4*ib32; + const float dl = d * (0.5f + ((xb->scales[ib32] >> 4*il) & 0xf)) * 0.25f; + constant uint8_t * grid = (constant uint8_t *)(iq2xs_grid + (q2[2*il+0] & 511)); + uint8_t signs = ksigns_iq2xs[q2[2*il+0] >> 9]; + for (int i = 0; i < 8; ++i) { + reg[i/4][i%4] = dl * grid[i] * (signs & kmask_iq2xs[i] ? -1.f : 1.f); + } + grid = (constant uint8_t *)(iq2xs_grid + (q2[2*il+1] & 511)); + signs = ksigns_iq2xs[q2[2*il+1] >> 9]; + for (int i = 0; i < 8; ++i) { + reg[2+i/4][i%4] = dl * grid[i] * (signs & kmask_iq2xs[i] ? -1.f : 1.f); + } +} + template kernel void kernel_get_rows( device const void * src0, @@ -4525,6 +4827,7 @@ template [[host_name("kernel_get_rows_q4_K")]] kernel get_rows_t kernel_get_rows template [[host_name("kernel_get_rows_q5_K")]] kernel get_rows_t kernel_get_rows; template [[host_name("kernel_get_rows_q6_K")]] kernel get_rows_t kernel_get_rows; template [[host_name("kernel_get_rows_iq2_xxs")]] kernel get_rows_t kernel_get_rows; +template [[host_name("kernel_get_rows_iq2_xs")]] kernel get_rows_t kernel_get_rows; // // matrix-matrix multiplication @@ -4562,6 +4865,7 @@ template [[host_name("kernel_mul_mm_q4_K_f32")]] kernel mat_mm_t kernel_mul_mm; template [[host_name("kernel_mul_mm_q6_K_f32")]] kernel mat_mm_t kernel_mul_mm; template [[host_name("kernel_mul_mm_iq2_xxs_f32")]] kernel mat_mm_t kernel_mul_mm; +template [[host_name("kernel_mul_mm_iq2_xs_f32")]] kernel mat_mm_t kernel_mul_mm; // // indirect matrix-matrix multiplication @@ -4611,6 +4915,7 @@ template [[host_name("kernel_mul_mm_id_q4_K_f32")]] kernel mat_mm_id_t kernel_mu template [[host_name("kernel_mul_mm_id_q5_K_f32")]] kernel mat_mm_id_t kernel_mul_mm_id; template [[host_name("kernel_mul_mm_id_q6_K_f32")]] kernel mat_mm_id_t kernel_mul_mm_id; template [[host_name("kernel_mul_mm_id_iq2_xxs_f32")]] kernel mat_mm_id_t kernel_mul_mm_id; +template [[host_name("kernel_mul_mm_id_iq2_xs_f32")]] kernel mat_mm_id_t kernel_mul_mm_id; // // matrix-vector multiplication @@ -5448,3 +5753,68 @@ kernel void kernel_mul_mv_id_iq2_xxs_f32( tiisg, sgitg); } + +[[host_name("kernel_mul_mv_id_iq2_xs_f32")]] +kernel void kernel_mul_mv_id_iq2_xs_f32( + device const char * ids, + device const char * src1, + device float * dst, + constant uint64_t & nbi1, + constant int64_t & ne00, + constant int64_t & ne01, + constant int64_t & ne02, + constant uint64_t & nb00, + constant uint64_t & nb01, + constant uint64_t & nb02, + constant int64_t & ne10, + constant int64_t & ne11, + constant int64_t & ne12, + constant int64_t & ne13, + constant uint64_t & nb10, + constant uint64_t & nb11, + constant uint64_t & nb12, + constant int64_t & ne0, + constant int64_t & ne1, + constant uint64_t & nb1, + constant uint & r2, + constant uint & r3, + constant int & idx, + device const char * src00, + device const char * src01, + device const char * src02, + device const char * src03, + device const char * src04, + device const char * src05, + device const char * src06, + device const char * src07, + threadgroup int8_t * shared_values [[threadgroup(0)]], + uint3 tgpig[[threadgroup_position_in_grid]], + uint tiitg[[thread_index_in_threadgroup]], + uint tiisg[[thread_index_in_simdgroup]], + uint sgitg[[simdgroup_index_in_threadgroup]]) { + device const char * src0[8] = {src00, src01, src02, src03, src04, src05, src06, src07}; + + const int64_t bid = tgpig.z/(ne12*ne13); + + tgpig.z = tgpig.z%(ne12*ne13); + + const int32_t id = ((device int32_t *) (ids + bid*nbi1))[idx]; + + kernel_mul_mv_iq2_xs_f32_impl( + src0[id], + (device const float *) (src1 + bid*nb11), + dst + bid*ne0, + ne00, + ne01, + ne02, + ne10, + ne12, + ne0, + ne1, + r2, + r3, + shared_values, + tgpig, + tiisg, + sgitg); +} diff --git a/ggml-opencl.cpp b/ggml-opencl.cpp index 496f9cdca..2bb93638f 100644 --- a/ggml-opencl.cpp +++ b/ggml-opencl.cpp @@ -1,5 +1,6 @@ #include "ggml.h" #include "ggml-opencl.h" +#include "ggml-backend-impl.h" #include #include @@ -10,7 +11,7 @@ #include #include -#define CL_TARGET_OPENCL_VERSION 110 +#define CL_TARGET_OPENCL_VERSION 120 #include #if defined(_MSC_VER) @@ -929,6 +930,12 @@ static cl_program build_program_from_source(cl_context ctx, cl_device_id dev, co } void ggml_cl_init(void) { + static bool initialized = false; + if (initialized) { + return; + } + initialized = true; + cl_int err; struct cl_device; @@ -1483,8 +1490,8 @@ static void ggml_cl_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * sr } else { d_X = ggml_cl_pool_malloc(sizeof(float) * x_ne, &x_size); } - cl_mem d_Y = ggml_cl_pool_malloc(sizeof(float) * y_ne, &y_size); - cl_mem d_D = ggml_cl_pool_malloc(sizeof(float) * d_ne, &d_size); + cl_mem d_Y = src1->backend == GGML_BACKEND_GPU ? (cl_mem) src1->extra : ggml_cl_pool_malloc(sizeof(float) * y_ne, &y_size); + cl_mem d_D = dst->backend == GGML_BACKEND_GPU ? (cl_mem) dst->extra : ggml_cl_pool_malloc(sizeof(float) * d_ne, &d_size); size_t x_offset = 0; @@ -1501,7 +1508,9 @@ static void ggml_cl_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * sr for (int64_t i12 = i02 * r2, e12 = i12 + r2; i12 < e12; i12++) { // copy src1 to device - CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Y, 0, src1, i13, i12, NULL)); + if (src1->backend == GGML_BACKEND_CPU) { + CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Y, 0, src1, i13, i12, NULL)); + } CL_CHECK(clFinish(queue)); @@ -1522,8 +1531,10 @@ static void ggml_cl_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * sr } // copy dst to host - float * d = (float *) ((char *) dst->data + i12*nb2 + i13*nb3); - CL_CHECK(clEnqueueReadBuffer(queue, d_D, true, 0, sizeof(float) * d_ne, d, 1, &ev_sgemm, NULL)); + if (dst->backend == GGML_BACKEND_CPU) { + float * d = (float *) ((char *) dst->data + i12*nb2 + i13*nb3); + CL_CHECK(clEnqueueReadBuffer(queue, d_D, true, 0, sizeof(float) * d_ne, d, 1, &ev_sgemm, NULL)); + } } } } @@ -1532,8 +1543,12 @@ static void ggml_cl_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * sr if (src0->backend != GGML_BACKEND_GPU) { ggml_cl_pool_free(d_X, x_size); } - ggml_cl_pool_free(d_Y, y_size); - ggml_cl_pool_free(d_D, d_size); + if (src1->backend != GGML_BACKEND_GPU) { + ggml_cl_pool_free(d_Y, y_size); + } + if (dst->backend != GGML_BACKEND_GPU) { + ggml_cl_pool_free(d_D, d_size); + } } static void ggml_cl_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, void * wdata, size_t wsize) { @@ -1598,6 +1613,8 @@ static void ggml_cl_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * sr CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_X, 0, src0, i03, i02, NULL)); } + // FIXME: convert on device + for (int64_t i12 = i02 * r2, e12 = i12 + r2; i12 < e12; i12++) { // convert src1 to fp16 // TODO: use multiple threads @@ -1643,11 +1660,13 @@ static void ggml_cl_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * sr } // copy dst to host, then convert to float - CL_CHECK(clEnqueueReadBuffer(queue, d_D, true, 0, sizeof(ggml_fp16_t) * d_ne, tmp, 1, &ev_sgemm, NULL)); - - float * d = (float *) ((char *) dst->data + i12*nb2 + i13*nb3); - - ggml_fp16_to_fp32_row(tmp, d, d_ne); + if (dst->backend == GGML_BACKEND_CPU) { + CL_CHECK(clEnqueueReadBuffer(queue, d_D, true, 0, sizeof(ggml_fp16_t) * d_ne, tmp, 1, &ev_sgemm, NULL)); + float * d = (float *) ((char *) dst->data + i12*nb2 + i13*nb3); + ggml_fp16_to_fp32_row(tmp, d, d_ne); + } else { + // FIXME: convert dst to fp32 on device + } } } } @@ -1801,7 +1820,7 @@ static void ggml_cl_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor * } -bool ggml_cl_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst) { +bool ggml_cl_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, const struct ggml_tensor * dst) { const int64_t ne10 = src1->ne[0]; const int64_t ne0 = dst->ne[0]; @@ -1895,3 +1914,291 @@ void ggml_cl_transform_tensor(void * data, ggml_tensor * tensor) { tensor->extra = dst; GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU); } + +// ggml-backend + +// buffer + +struct ggml_backend_opencl_buffer_context { + ~ggml_backend_opencl_buffer_context() { + if (buffer) { + clReleaseMemObject(buffer); + } + for (auto * sub_buffer : sub_buffers) { + clReleaseMemObject(sub_buffer); + } + } + + cl_mem buffer; + std::vector sub_buffers; +}; + +static void * const cl_ptr_base = (void *)(uintptr_t) 0x1000; + +static const char * ggml_backend_opencl_buffer_get_name(ggml_backend_buffer_t buffer) { + return "OpenCL"; + + GGML_UNUSED(buffer); +} + +static void ggml_backend_opencl_buffer_free_buffer(ggml_backend_buffer_t buffer) { + ggml_backend_opencl_buffer_context * ctx = (ggml_backend_opencl_buffer_context *) buffer->context; + delete ctx; +} + +static void * ggml_backend_opencl_buffer_get_base(ggml_backend_buffer_t buffer) { + return cl_ptr_base; + + GGML_UNUSED(buffer); +} + +static void ggml_backend_opencl_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) { + if (tensor->view_src != NULL && tensor->view_offs == 0) { + tensor->extra = tensor->view_src->extra; + } else { + ggml_backend_opencl_buffer_context * ctx = (ggml_backend_opencl_buffer_context *) buffer->context; + cl_buffer_region region = {(size_t)((char *)tensor->data - (char *)cl_ptr_base), ggml_nbytes(tensor)}; + cl_int err; + cl_mem sub_buffer = clCreateSubBuffer(ctx->buffer, CL_MEM_READ_WRITE, CL_BUFFER_CREATE_TYPE_REGION, ®ion, &err); + CL_CHECK(err); + ctx->sub_buffers.push_back(sub_buffer); + tensor->extra = sub_buffer; + } + tensor->backend = GGML_BACKEND_GPU; +} + +static void ggml_backend_opencl_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) { + cl_mem tensor_buffer = (cl_mem) tensor->extra; + CL_CHECK(clEnqueueWriteBuffer(queue, tensor_buffer, true, offset, size, data, 0, NULL, NULL)); + CL_CHECK(clFinish(queue)); + + GGML_UNUSED(buffer); +} + +static void ggml_backend_opencl_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset, size_t size) { + cl_mem tensor_buffer = (cl_mem) tensor->extra; + CL_CHECK(clEnqueueReadBuffer(queue, tensor_buffer, true, offset, size, data, 0, NULL, NULL)); + CL_CHECK(clFinish(queue)); + + GGML_UNUSED(buffer); +} + +static void ggml_backend_opencl_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) { + ggml_backend_opencl_buffer_context * ctx = (ggml_backend_opencl_buffer_context *) buffer->context; + CL_CHECK(clEnqueueFillBuffer(queue, ctx->buffer, &value, sizeof(value), 0, buffer->size, 0, NULL, NULL)); + CL_CHECK(clFinish(queue)); +} + +static void ggml_backend_opencl_buffer_reset(ggml_backend_buffer_t buffer) { + ggml_backend_opencl_buffer_context * ctx = (ggml_backend_opencl_buffer_context *) buffer->context; + for (auto * sub_buffer : ctx->sub_buffers) { + clReleaseMemObject(sub_buffer); + } + ctx->sub_buffers.clear(); +} + +static ggml_backend_buffer_i ggml_backend_opencl_buffer_interface = { + /* .get_name = */ ggml_backend_opencl_buffer_get_name, + /* .free_buffer = */ ggml_backend_opencl_buffer_free_buffer, + /* .get_base = */ ggml_backend_opencl_buffer_get_base, + /* .init_tensor = */ ggml_backend_opencl_buffer_init_tensor, + /* .set_tensor = */ ggml_backend_opencl_buffer_set_tensor, + /* .get_tensor = */ ggml_backend_opencl_buffer_get_tensor, + /* .cpy_tensor = */ NULL, + /* .clear = */ ggml_backend_opencl_buffer_clear, + /* .reset = */ ggml_backend_opencl_buffer_reset, +}; + +// buffer type + +static const char * ggml_backend_opencl_buffer_type_name(ggml_backend_buffer_type_t buffer_type) { + return "OpenCL"; + + GGML_UNUSED(buffer_type); +} + +static ggml_backend_buffer_t ggml_backend_opencl_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buffer_type, size_t size) { + ggml_cl_init(); + + cl_int err; + cl_mem mem = clCreateBuffer(context, CL_MEM_READ_WRITE, size, NULL, &err); + if (err != CL_SUCCESS) { + fprintf(stderr, "%s: failed to allocate %.2f MiB\n", __func__, size / 1024.0 / 1024.0); + return nullptr; + } + + ggml_backend_opencl_buffer_context * ctx = new ggml_backend_opencl_buffer_context{mem, {}}; + + return ggml_backend_buffer_init(buffer_type, ggml_backend_opencl_buffer_interface, ctx, size); +} + +static size_t ggml_backend_opencl_buffer_type_get_alignment(ggml_backend_buffer_type_t buffer_type) { + // FIXME: not thread safe, device may not be initialized yet + static cl_uint alignment = -1; + if (alignment == (cl_uint)-1) { + ggml_cl_init(); + clGetDeviceInfo(device, CL_DEVICE_MEM_BASE_ADDR_ALIGN, sizeof(cl_uint), &alignment, NULL); + } + return alignment; + + GGML_UNUSED(buffer_type); +} + +static bool ggml_backend_opencl_buffer_type_supports_backend(ggml_backend_buffer_type_t buffer_type, ggml_backend_t backend) { + //return ggml_backend_is_opencl(backend); // opencl must be used through the cpu backend + return ggml_backend_is_cpu(backend); + + GGML_UNUSED(buffer_type); +} + +static ggml_backend_buffer_type_i ggml_backend_opencl_buffer_type_interface = { + /* .get_name = */ ggml_backend_opencl_buffer_type_name, + /* .alloc_buffer = */ ggml_backend_opencl_buffer_type_alloc_buffer, + /* .get_alignment = */ ggml_backend_opencl_buffer_type_get_alignment, + /* .get_alloc_size = */ NULL, + /* .supports_backend = */ ggml_backend_opencl_buffer_type_supports_backend, + /* .is_host = */ NULL, +}; + + +ggml_backend_buffer_type_t ggml_backend_opencl_buffer_type() { + static ggml_backend_buffer_type buffer_type = { + /* .iface = */ ggml_backend_opencl_buffer_type_interface, + /* .context = */ nullptr, + }; + + return &buffer_type; +} + +#if 0 +// host buffer type + +static const char * ggml_backend_opencl_host_buffer_type_name(ggml_backend_buffer_type_t buft) { + return "CL_Host"; + + GGML_UNUSED(buft); +} + +static const char * ggml_backend_opencl_host_buffer_name(ggml_backend_buffer_t buffer) { + return "CL_Host"; + + GGML_UNUSED(buffer); +} + +static void ggml_backend_opencl_host_buffer_free_buffer(ggml_backend_buffer_t buffer) { + ggml_cl_host_free(buffer->context); +} + +static ggml_backend_buffer_t ggml_backend_opencl_host_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) { + void * ptr = ggml_cl_host_malloc(size); + + if (ptr == nullptr) { + // fallback to cpu buffer + return ggml_backend_buft_alloc_buffer(ggml_backend_cpu_buffer_type(), size); + } + + ggml_backend_buffer_t buffer = ggml_backend_cpu_buffer_from_ptr(ptr, size); + buffer->buft = buft; + buffer->iface.get_name = ggml_backend_opencl_host_buffer_name; + buffer->iface.free_buffer = ggml_backend_opencl_host_buffer_free_buffer; + + return buffer; +} + +ggml_backend_buffer_type_t ggml_backend_opencl_host_buffer_type() { + static struct ggml_backend_buffer_type ggml_backend_opencl_buffer_type_host = { + /* .iface = */ { + /* .get_name = */ ggml_backend_opencl_host_buffer_type_name, + /* .alloc_buffer = */ ggml_backend_opencl_host_buffer_type_alloc_buffer, + /* .get_alignment = */ ggml_backend_cpu_buffer_type()->iface.get_alignment, + /* .get_alloc_size = */ ggml_backend_cpu_buffer_type()->iface.get_alloc_size, + /* .supports_backend = */ ggml_backend_cpu_buffer_type()->iface.supports_backend, + /* .is_host = */ ggml_backend_cpu_buffer_type()->iface.is_host, + }, + /* .context = */ nullptr, + }; + + return &ggml_backend_opencl_buffer_type_host; +} + +// backend + +static const char * ggml_backend_opencl_name(ggml_backend_t backend) { + return "OpenCL"; + + GGML_UNUSED(backend); +} + +static void ggml_backend_opencl_free(ggml_backend_t backend) { + GGML_UNUSED(backend); +} + +static ggml_backend_buffer_type_t ggml_backend_opencl_get_default_buffer_type(ggml_backend_t backend) { + return ggml_backend_opencl_buffer_type(); + + GGML_UNUSED(backend); +} + +static bool ggml_backend_opencl_graph_compute(ggml_backend_t backend, ggml_cgraph * graph) { + for (int i = 0; i < graph->n_nodes; ++i) { + ggml_tensor * node = graph->nodes[i]; + switch (node->op) { + case GGML_OP_MUL_MAT: + ggml_cl_mul_mat(node->src[0], node->src[1], node, nullptr, 0); + break; + case GGML_OP_MUL: + ggml_cl_mul(node->src[0], node->src[1], node); + break; + default: + GGML_ASSERT(false); + } + } + + return true; + + GGML_UNUSED(backend); +} + +static bool ggml_backend_opencl_supports_op(ggml_backend_t backend, const ggml_tensor * op) { + switch (op->op) { + case GGML_OP_MUL_MAT: + return ggml_cl_can_mul_mat(op->src[0], op->src[1], op); + case GGML_OP_MUL: + // return ggml_can_repeat_rows(op->src[1], op->src[0]); + return true; + default: + return false; + } + + GGML_UNUSED(backend); +} + +static ggml_backend_i opencl_backend_i = { + /* .get_name = */ ggml_backend_opencl_name, + /* .free = */ ggml_backend_opencl_free, + /* .get_default_buffer_type = */ ggml_backend_opencl_get_default_buffer_type, + /* .set_tensor_async = */ NULL, + /* .get_tensor_async = */ NULL, + /* .cpy_tensor_from_async = */ NULL, + /* .cpy_tensor_to_async = */ NULL, + /* .synchronize = */ NULL, + /* .graph_plan_create = */ NULL, + /* .graph_plan_free = */ NULL, + /* .graph_plan_compute = */ NULL, + /* .graph_compute = */ ggml_backend_opencl_graph_compute, + /* .supports_op = */ ggml_backend_opencl_supports_op, +}; + +ggml_backend_t ggml_backend_opencl_init() { + ggml_backend_t backend = new ggml_backend { + /* .interface = */ opencl_backend_i, + /* .context = */ nullptr + }; + + return backend; +} + +bool ggml_backend_is_opencl(ggml_backend_t backend) { + return backend && backend->iface.get_name == ggml_backend_opencl_name; +} +#endif diff --git a/ggml-opencl.h b/ggml-opencl.h index 44d05bd64..919b00d63 100644 --- a/ggml-opencl.h +++ b/ggml-opencl.h @@ -1,6 +1,7 @@ #pragma once #include "ggml.h" +#include "ggml-backend.h" #ifdef __cplusplus extern "C" { @@ -9,17 +10,26 @@ extern "C" { GGML_API void ggml_cl_init(void); GGML_API void ggml_cl_mul(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst); -GGML_API bool ggml_cl_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst); +GGML_API bool ggml_cl_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, const struct ggml_tensor * dst); GGML_API size_t ggml_cl_mul_mat_get_wsize(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst); GGML_API void ggml_cl_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst, void * wdata, size_t wsize); -GGML_API void * ggml_cl_host_malloc(size_t size); -GGML_API void ggml_cl_host_free(void * ptr); +// GGML_API void * ggml_cl_host_malloc(size_t size); +// GGML_API void ggml_cl_host_free(void * ptr); GGML_API void ggml_cl_free_data(const struct ggml_tensor* tensor); GGML_API void ggml_cl_transform_tensor(void * data, struct ggml_tensor * tensor); +// backend API + +// GGML_API ggml_backend_t ggml_backend_opencl_init(void); + +// GGML_API bool ggml_backend_is_opencl(ggml_backend_t backend); + +GGML_API ggml_backend_buffer_type_t ggml_backend_opencl_buffer_type(void); +// GGML_API ggml_backend_buffer_type_t ggml_backend_opencl_host_buffer_type(void); + #ifdef __cplusplus } #endif diff --git a/ggml-quants.c b/ggml-quants.c index d497e6de9..a24b4b244 100644 --- a/ggml-quants.c +++ b/ggml-quants.c @@ -2342,15 +2342,7 @@ size_t ggml_quantize_q6_K(const float * src, void * dst, int n, int k, int64_t * // ====================== "True" 2-bit (de)-quantization -void quantize_row_iq2_xxs_reference(const float * restrict x, block_iq2_xxs * restrict y, int k) { - (void)x; - (void)y; - (void)k; - assert(k % QK_K == 0); - //fprintf(stderr, "=========================== %s: not implemented\n", __func__); -} - -static const uint64_t iq2xxs_grid[256] = { +static const uint64_t iq2xxs_grid[256] = { 0x0808080808080808, 0x080808080808082b, 0x0808080808081919, 0x0808080808082b08, 0x0808080808082b2b, 0x0808080808190819, 0x0808080808191908, 0x08080808082b0808, 0x08080808082b082b, 0x08080808082b2b08, 0x08080808082b2b2b, 0x0808080819080819, @@ -2417,6 +2409,137 @@ static const uint64_t iq2xxs_grid[256] = { 0x2b2b082b08080808, 0x2b2b190808192b08, 0x2b2b2b0819190808, 0x2b2b2b1908081908, }; +static const uint64_t iq2xs_grid[512] = { + 0x0808080808080808, 0x080808080808082b, 0x0808080808081919, 0x0808080808082b08, + 0x0808080808082b2b, 0x0808080808190819, 0x0808080808191908, 0x080808080819192b, + 0x0808080808192b19, 0x08080808082b0808, 0x08080808082b082b, 0x08080808082b1919, + 0x08080808082b2b08, 0x0808080819080819, 0x0808080819081908, 0x080808081908192b, + 0x0808080819082b19, 0x0808080819190808, 0x080808081919082b, 0x0808080819191919, + 0x0808080819192b08, 0x08080808192b0819, 0x08080808192b1908, 0x080808082b080808, + 0x080808082b08082b, 0x080808082b081919, 0x080808082b082b08, 0x080808082b190819, + 0x080808082b191908, 0x080808082b192b19, 0x080808082b2b0808, 0x0808081908080819, + 0x0808081908081908, 0x080808190808192b, 0x0808081908082b19, 0x0808081908190808, + 0x080808190819082b, 0x0808081908191919, 0x0808081908192b08, 0x0808081908192b2b, + 0x08080819082b0819, 0x08080819082b1908, 0x0808081919080808, 0x080808191908082b, + 0x0808081919081919, 0x0808081919082b08, 0x0808081919190819, 0x0808081919191908, + 0x08080819192b0808, 0x08080819192b2b08, 0x080808192b080819, 0x080808192b081908, + 0x080808192b190808, 0x0808082b08080808, 0x0808082b0808082b, 0x0808082b08081919, + 0x0808082b08082b08, 0x0808082b08190819, 0x0808082b08191908, 0x0808082b082b0808, + 0x0808082b19080819, 0x0808082b19081908, 0x0808082b19190808, 0x0808082b19191919, + 0x0808082b2b080808, 0x0808082b2b082b2b, 0x0808190808080819, 0x0808190808081908, + 0x080819080808192b, 0x0808190808082b19, 0x0808190808190808, 0x080819080819082b, + 0x0808190808191919, 0x0808190808192b08, 0x08081908082b0819, 0x08081908082b1908, + 0x0808190819080808, 0x080819081908082b, 0x0808190819081919, 0x0808190819082b08, + 0x0808190819190819, 0x0808190819191908, 0x080819081919192b, 0x08081908192b0808, + 0x080819082b080819, 0x080819082b081908, 0x080819082b190808, 0x0808191908080808, + 0x080819190808082b, 0x0808191908081919, 0x0808191908082b08, 0x0808191908190819, + 0x0808191908191908, 0x08081919082b0808, 0x0808191919080819, 0x0808191919081908, + 0x0808191919190808, 0x08081919192b0819, 0x080819192b080808, 0x0808192b08080819, + 0x0808192b08081908, 0x0808192b08190808, 0x0808192b082b192b, 0x0808192b19080808, + 0x0808192b1908082b, 0x0808192b2b081908, 0x08082b0808080808, 0x08082b080808082b, + 0x08082b0808081919, 0x08082b0808082b08, 0x08082b0808082b2b, 0x08082b0808190819, + 0x08082b0808191908, 0x08082b08082b0808, 0x08082b08082b1919, 0x08082b0819080819, + 0x08082b0819081908, 0x08082b0819190808, 0x08082b0819192b08, 0x08082b082b080808, + 0x08082b082b2b0808, 0x08082b082b2b2b2b, 0x08082b1908080819, 0x08082b1908081908, + 0x08082b1908190808, 0x08082b1919080808, 0x08082b192b080819, 0x08082b192b082b19, + 0x08082b2b08080808, 0x08082b2b082b0808, 0x08082b2b082b2b08, 0x08082b2b2b19192b, + 0x08082b2b2b2b0808, 0x0819080808080819, 0x0819080808081908, 0x081908080808192b, + 0x0819080808082b19, 0x0819080808190808, 0x081908080819082b, 0x0819080808191919, + 0x0819080808192b08, 0x08190808082b0819, 0x08190808082b1908, 0x0819080819080808, + 0x081908081908082b, 0x0819080819081919, 0x0819080819082b08, 0x0819080819190819, + 0x0819080819191908, 0x08190808192b0808, 0x08190808192b2b2b, 0x081908082b080819, + 0x081908082b081908, 0x081908082b190808, 0x0819081908080808, 0x081908190808082b, + 0x0819081908081919, 0x0819081908082b08, 0x0819081908190819, 0x0819081908191908, + 0x08190819082b0808, 0x0819081919080819, 0x0819081919081908, 0x0819081919190808, + 0x081908192b080808, 0x081908192b191908, 0x081908192b19192b, 0x0819082b08080819, + 0x0819082b08081908, 0x0819082b0808192b, 0x0819082b08190808, 0x0819082b19080808, + 0x0819082b192b0808, 0x0819190808080808, 0x081919080808082b, 0x0819190808081919, + 0x0819190808082b08, 0x0819190808190819, 0x0819190808191908, 0x08191908082b0808, + 0x0819190819080819, 0x0819190819081908, 0x0819190819082b19, 0x0819190819190808, + 0x08191908192b1908, 0x081919082b080808, 0x0819191908080819, 0x0819191908081908, + 0x0819191908190808, 0x0819191919080808, 0x0819192b08080808, 0x0819192b08191908, + 0x0819192b19082b19, 0x08192b0808080819, 0x08192b0808081908, 0x08192b0808190808, + 0x08192b080819082b, 0x08192b0819080808, 0x08192b0819191908, 0x08192b082b08192b, + 0x08192b1908080808, 0x08192b1908081919, 0x08192b19192b192b, 0x08192b2b19190819, + 0x08192b2b2b2b2b19, 0x082b080808080808, 0x082b08080808082b, 0x082b080808081919, + 0x082b080808082b08, 0x082b080808082b2b, 0x082b080808190819, 0x082b080808191908, + 0x082b0808082b0808, 0x082b080819080819, 0x082b080819081908, 0x082b080819190808, + 0x082b08082b080808, 0x082b08082b2b0808, 0x082b081908080819, 0x082b081908081908, + 0x082b081908190808, 0x082b081919080808, 0x082b081919082b08, 0x082b0819192b1919, + 0x082b082b08080808, 0x082b082b082b082b, 0x082b082b2b080808, 0x082b082b2b2b2b08, + 0x082b190808080819, 0x082b190808081908, 0x082b190808190808, 0x082b1908082b2b19, + 0x082b190819080808, 0x082b191908080808, 0x082b191919080819, 0x082b19191919082b, + 0x082b19192b192b19, 0x082b192b08080819, 0x082b192b08192b2b, 0x082b192b2b2b192b, + 0x082b2b0808080808, 0x082b2b0808082b08, 0x082b2b0808082b2b, 0x082b2b08082b0808, + 0x082b2b0819191919, 0x082b2b082b082b08, 0x082b2b082b2b082b, 0x082b2b19192b2b08, + 0x082b2b192b190808, 0x082b2b2b08082b08, 0x082b2b2b082b0808, 0x082b2b2b2b08082b, + 0x082b2b2b2b082b08, 0x082b2b2b2b082b2b, 0x1908080808080819, 0x1908080808081908, + 0x190808080808192b, 0x1908080808082b19, 0x1908080808190808, 0x190808080819082b, + 0x1908080808191919, 0x1908080808192b08, 0x19080808082b0819, 0x19080808082b1908, + 0x1908080819080808, 0x190808081908082b, 0x1908080819081919, 0x1908080819082b08, + 0x1908080819082b2b, 0x1908080819190819, 0x1908080819191908, 0x19080808192b0808, + 0x19080808192b1919, 0x190808082b080819, 0x190808082b081908, 0x190808082b190808, + 0x1908081908080808, 0x190808190808082b, 0x1908081908081919, 0x1908081908082b08, + 0x1908081908190819, 0x1908081908191908, 0x19080819082b0808, 0x1908081919080819, + 0x1908081919081908, 0x1908081919190808, 0x190808192b080808, 0x190808192b081919, + 0x190808192b2b082b, 0x1908082b08080819, 0x1908082b08081908, 0x1908082b08190808, + 0x1908082b0819082b, 0x1908082b082b2b19, 0x1908082b19080808, 0x1908190808080808, + 0x190819080808082b, 0x1908190808081919, 0x1908190808082b08, 0x1908190808190819, + 0x1908190808191908, 0x1908190808192b19, 0x19081908082b0808, 0x1908190819080819, + 0x1908190819081908, 0x1908190819190808, 0x190819082b080808, 0x190819082b191908, + 0x1908191908080819, 0x1908191908081908, 0x1908191908190808, 0x19081919082b1908, + 0x1908191919080808, 0x190819192b192b2b, 0x1908192b08080808, 0x1908192b08082b2b, + 0x1908192b19081908, 0x1908192b19190808, 0x19082b0808080819, 0x19082b0808081908, + 0x19082b0808190808, 0x19082b0819080808, 0x19082b0819081919, 0x19082b0819191908, + 0x19082b08192b082b, 0x19082b1908080808, 0x19082b1908190819, 0x19082b1919081908, + 0x19082b1919190808, 0x19082b19192b2b19, 0x19082b2b08081908, 0x1919080808080808, + 0x191908080808082b, 0x1919080808081919, 0x1919080808082b08, 0x1919080808190819, + 0x1919080808191908, 0x19190808082b0808, 0x19190808082b2b08, 0x1919080819080819, + 0x1919080819081908, 0x1919080819190808, 0x191908082b080808, 0x1919081908080819, + 0x1919081908081908, 0x1919081908190808, 0x1919081908191919, 0x1919081919080808, + 0x191908191908082b, 0x1919082b08080808, 0x1919082b19081908, 0x1919082b2b2b2b2b, + 0x1919190808080819, 0x1919190808081908, 0x1919190808190808, 0x19191908082b0819, + 0x1919190819080808, 0x19191908192b0808, 0x191919082b080819, 0x191919082b2b0819, + 0x1919191908080808, 0x1919191908082b08, 0x191919192b080808, 0x191919192b082b08, + 0x1919192b082b0819, 0x1919192b192b2b08, 0x1919192b2b2b0819, 0x19192b0808080808, + 0x19192b0808191908, 0x19192b0819080819, 0x19192b0819190808, 0x19192b082b192b19, + 0x19192b1908192b2b, 0x19192b1919080808, 0x19192b191908082b, 0x19192b2b2b081919, + 0x192b080808080819, 0x192b080808081908, 0x192b080808190808, 0x192b080819080808, + 0x192b080819191908, 0x192b0808192b082b, 0x192b08082b08192b, 0x192b08082b2b2b19, + 0x192b081908080808, 0x192b082b082b1908, 0x192b082b19082b2b, 0x192b082b2b19082b, + 0x192b190808080808, 0x192b19080819192b, 0x192b191908190808, 0x192b191919080808, + 0x192b191919081919, 0x192b19192b2b1908, 0x192b2b0808080819, 0x192b2b08192b2b2b, + 0x192b2b19082b1919, 0x192b2b2b0808192b, 0x192b2b2b19191908, 0x192b2b2b192b082b, + 0x2b08080808080808, 0x2b0808080808082b, 0x2b08080808081919, 0x2b08080808082b08, + 0x2b08080808190819, 0x2b08080808191908, 0x2b080808082b0808, 0x2b080808082b2b2b, + 0x2b08080819080819, 0x2b08080819081908, 0x2b08080819190808, 0x2b0808082b080808, + 0x2b0808082b08082b, 0x2b0808082b2b2b08, 0x2b0808082b2b2b2b, 0x2b08081908080819, + 0x2b08081908081908, 0x2b0808190808192b, 0x2b08081908190808, 0x2b08081919080808, + 0x2b08081919190819, 0x2b08081919192b19, 0x2b08082b08080808, 0x2b08082b082b0808, + 0x2b08082b2b080808, 0x2b08082b2b08082b, 0x2b08082b2b2b0808, 0x2b08082b2b2b2b08, + 0x2b08190808080819, 0x2b08190808081908, 0x2b08190808190808, 0x2b0819080819082b, + 0x2b08190808191919, 0x2b08190819080808, 0x2b081908192b0808, 0x2b0819082b082b19, + 0x2b08191908080808, 0x2b08191919081908, 0x2b0819192b2b1919, 0x2b08192b08192b08, + 0x2b08192b192b2b2b, 0x2b082b0808080808, 0x2b082b0808082b08, 0x2b082b08082b1919, + 0x2b082b0819192b2b, 0x2b082b082b080808, 0x2b082b082b08082b, 0x2b082b082b2b2b08, + 0x2b082b190808192b, 0x2b082b2b082b082b, 0x2b082b2b2b080808, 0x2b082b2b2b082b08, + 0x2b082b2b2b19192b, 0x2b082b2b2b2b2b08, 0x2b19080808080819, 0x2b19080808081908, + 0x2b19080808190808, 0x2b19080819080808, 0x2b1908081919192b, 0x2b1908082b081908, + 0x2b19081908080808, 0x2b190819082b082b, 0x2b190819192b1908, 0x2b19082b1919192b, + 0x2b19082b2b082b19, 0x2b19190808080808, 0x2b19190808081919, 0x2b19190819081908, + 0x2b19190819190808, 0x2b19190819192b08, 0x2b191919082b2b19, 0x2b1919192b190808, + 0x2b1919192b19082b, 0x2b19192b19080819, 0x2b192b0819190819, 0x2b192b082b2b192b, + 0x2b192b1919082b19, 0x2b192b2b08191919, 0x2b192b2b192b0808, 0x2b2b080808080808, + 0x2b2b08080808082b, 0x2b2b080808082b08, 0x2b2b080808082b2b, 0x2b2b0808082b0808, + 0x2b2b0808082b2b2b, 0x2b2b08082b2b0808, 0x2b2b081919190819, 0x2b2b081919192b19, + 0x2b2b08192b2b192b, 0x2b2b082b08080808, 0x2b2b082b0808082b, 0x2b2b082b08082b08, + 0x2b2b082b082b2b2b, 0x2b2b082b2b080808, 0x2b2b082b2b2b0808, 0x2b2b190819080808, + 0x2b2b19082b191919, 0x2b2b192b192b1919, 0x2b2b192b2b192b08, 0x2b2b2b0808082b2b, + 0x2b2b2b08082b0808, 0x2b2b2b08082b082b, 0x2b2b2b08082b2b08, 0x2b2b2b082b2b0808, + 0x2b2b2b082b2b2b08, 0x2b2b2b1908081908, 0x2b2b2b192b081908, 0x2b2b2b192b08192b, + 0x2b2b2b2b082b2b08, 0x2b2b2b2b082b2b2b, 0x2b2b2b2b2b190819, 0x2b2b2b2b2b2b2b2b, +}; + static const uint8_t ksigns_iq2xs[128] = { 0, 129, 130, 3, 132, 5, 6, 135, 136, 9, 10, 139, 12, 141, 142, 15, 144, 17, 18, 147, 20, 149, 150, 23, 24, 153, 154, 27, 156, 29, 30, 159, @@ -2427,8 +2550,17 @@ static const uint8_t ksigns_iq2xs[128] = { 96, 225, 226, 99, 228, 101, 102, 231, 232, 105, 106, 235, 108, 237, 238, 111, 240, 113, 114, 243, 116, 245, 246, 119, 120, 249, 250, 123, 252, 125, 126, 255, }; + static const uint8_t kmask_iq2xs[8] = {1, 2, 4, 8, 16, 32, 64, 128}; +void quantize_row_iq2_xxs_reference(const float * restrict x, block_iq2_xxs * restrict y, int k) { + (void)x; + (void)y; + (void)k; + assert(k % QK_K == 0); + //fprintf(stderr, "=========================== %s: not implemented\n", __func__); +} + void dequantize_row_iq2_xxs(const block_iq2_xxs * restrict x, float * restrict y, int k) { assert(k % QK_K == 0); const int nb = k / QK_K; @@ -2472,6 +2604,58 @@ size_t ggml_quantize_iq2_xxs(const float * src, void * dst, int n, int k, int64_ return (n/QK_K*sizeof(block_iq2_xxs)); } +// ====================== 2.3125 bpw (de)-quantization + +void quantize_row_iq2_xs_reference(const float * restrict x, block_iq2_xs * restrict y, int k) { + (void)x; + (void)y; + (void)k; + assert(k % QK_K == 0); + //fprintf(stderr, "=========================== %s: not implemented\n", __func__); +} + +void dequantize_row_iq2_xs(const block_iq2_xs * restrict x, float * restrict y, int k) { + assert(k % QK_K == 0); + const int nb = k / QK_K; + + float db[2]; + + for (int i = 0; i < nb; i++) { + + const float d = GGML_FP16_TO_FP32(x[i].d); + + for (int ib32 = 0; ib32 < QK_K/32; ++ib32) { + db[0] = d * (0.5f + (x[i].scales[ib32] & 0xf)) * 0.25f; + db[1] = d * (0.5f + (x[i].scales[ib32] >> 4)) * 0.25f; + for (int l = 0; l < 4; ++l) { + const uint8_t * grid = (const uint8_t *)(iq2xs_grid + (x[i].qs[4*ib32 + l] & 511)); + const uint8_t signs = ksigns_iq2xs[x[i].qs[4*ib32 + l] >> 9]; + for (int j = 0; j < 8; ++j) { + y[j] = db[l/2] * grid[j] * (signs & kmask_iq2xs[j] ? -1.f : 1.f); + } + y += 8; + } + } + } +} + +void quantize_row_iq2_xs(const float * restrict x, void * restrict vy, int k) { + assert(k % QK_K == 0); + block_iq2_xs * restrict y = vy; + quantize_row_iq2_xs_reference(x, y, k); +} + +size_t ggml_quantize_iq2_xs(const float * src, void * dst, int n, int k, int64_t * hist) { + assert(k % QK_K == 0); + (void)hist; // TODO: collect histograms + + for (int j = 0; j < n; j += k) { + block_iq2_xs * restrict y = (block_iq2_xs *)dst + j/QK_K; + quantize_row_iq2_xs_reference(src + j, y, k); + } + return (n/QK_K*sizeof(block_iq2_xs)); +} + //===================================== Q8_K ============================================== void quantize_row_q8_K_reference(const float * restrict x, block_q8_K * restrict y, int k) { @@ -7357,3 +7541,161 @@ void ggml_vec_dot_iq2_xxs_q8_K(const int n, float * restrict s, const void * res *s = 0.125f * sumf; #endif } + +void ggml_vec_dot_iq2_xs_q8_K(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) { + assert(n % QK_K == 0); + + const block_iq2_xs * restrict x = vx; + const block_q8_K * restrict y = vy; + + const int nb = n / QK_K; + +#if defined(__ARM_NEON) + + const uint64_t * signs64 = (const uint64_t *)keven_signs_q2xs; + + int8x16x4_t q2u; + int8x16x4_t q2s; + int8x16x4_t q8b; + + int32x4x4_t scales32; + + float sumf = 0; + for (int i = 0; i < nb; ++i) { + const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; + const uint16_t * restrict q2 = x[i].qs; + const int8_t * restrict q8 = y[i].qs; + const uint8x8_t scales8 = vld1_u8(x[i].scales); + const uint8x8_t scales_l = vand_u8(scales8, vdup_n_u8(0xf)); + const uint8x8_t scales_h = vshr_n_u8(scales8, 4); + uint8x16_t scales = vcombine_u8(vzip1_u8(scales_l, scales_h), vzip2_u8(scales_l, scales_h)); + scales = vaddq_u8(vshlq_n_u8(scales, 1), vdupq_n_u8(1)); + const uint16x8_t scales1 = vmovl_u8(vget_low_u8(scales)); + const uint16x8_t scales2 = vmovl_u8(vget_high_u8(scales)); + scales32.val[0] = vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(scales1))); + scales32.val[1] = vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(scales1))); + scales32.val[2] = vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(scales2))); + scales32.val[3] = vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(scales2))); + int32x4_t sumi = vdupq_n_s32(0); + for (int ib64 = 0; ib64 < QK_K/64; ++ib64) { + q8b = vld1q_s8_x4(q8); q8 += 64; + q2u.val[0] = vcombine_s8(vld1_s8((const void *)(iq2xs_grid + (q2[0] & 511))), vld1_s8((const void *)(iq2xs_grid + (q2[1] & 511)))); + q2u.val[1] = vcombine_s8(vld1_s8((const void *)(iq2xs_grid + (q2[2] & 511))), vld1_s8((const void *)(iq2xs_grid + (q2[3] & 511)))); + q2u.val[2] = vcombine_s8(vld1_s8((const void *)(iq2xs_grid + (q2[4] & 511))), vld1_s8((const void *)(iq2xs_grid + (q2[5] & 511)))); + q2u.val[3] = vcombine_s8(vld1_s8((const void *)(iq2xs_grid + (q2[6] & 511))), vld1_s8((const void *)(iq2xs_grid + (q2[7] & 511)))); + q2s.val[0] = vcombine_s8(vld1_s8((const void *)(signs64 + (q2[0] >> 9))), vld1_s8((const void *)(signs64 + (q2[1] >> 9)))); + q2s.val[1] = vcombine_s8(vld1_s8((const void *)(signs64 + (q2[2] >> 9))), vld1_s8((const void *)(signs64 + (q2[3] >> 9)))); + q2s.val[2] = vcombine_s8(vld1_s8((const void *)(signs64 + (q2[4] >> 9))), vld1_s8((const void *)(signs64 + (q2[5] >> 9)))); + q2s.val[3] = vcombine_s8(vld1_s8((const void *)(signs64 + (q2[6] >> 9))), vld1_s8((const void *)(signs64 + (q2[7] >> 9)))); + q2u.val[0] = vmulq_s8(q2u.val[0], q2s.val[0]); + q2u.val[1] = vmulq_s8(q2u.val[1], q2s.val[1]); + q2u.val[2] = vmulq_s8(q2u.val[2], q2s.val[2]); + q2u.val[3] = vmulq_s8(q2u.val[3], q2s.val[3]); + const int32x4_t p1 = ggml_vdotq_s32(vdupq_n_s32(0), q2u.val[0], q8b.val[0]); + const int32x4_t p2 = ggml_vdotq_s32(vdupq_n_s32(0), q2u.val[1], q8b.val[1]); + const int32x4_t p3 = ggml_vdotq_s32(vdupq_n_s32(0), q2u.val[2], q8b.val[2]); + const int32x4_t p4 = ggml_vdotq_s32(vdupq_n_s32(0), q2u.val[3], q8b.val[3]); + const int32x4_t p = vpaddq_s32(vpaddq_s32(p1, p2), vpaddq_s32(p3, p4)); + sumi = vmlaq_s32(sumi, p, scales32.val[ib64]); + q2 += 8; + } + sumf += d*vaddvq_s32(sumi); + } + *s = 0.125f * sumf; + +#elif defined(__AVX2__) + + const __m128i m4 = _mm_set1_epi8(0xf); + const __m128i m1 = _mm_set1_epi8(1); + const __m128i m511 = _mm_set1_epi16(511); + const __m128i m127 = _mm_set1_epi16(127); + + const uint64_t * signs64 = (const uint64_t *)keven_signs_q2xs; + + uint64_t aux64; + + // somewhat hacky, but gives a significant boost in performance + __m128i aux_gindex, aux_sindex; + const uint16_t * gindex = (const uint16_t *)&aux_gindex; + const uint16_t * sindex = (const uint16_t *)&aux_sindex; + + __m256 accumf = _mm256_setzero_ps(); + for (int i = 0; i < nb; ++i) { + const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; + const uint16_t * restrict q2 = x[i].qs; + const int8_t * restrict q8 = y[i].qs; + + memcpy(&aux64, x[i].scales, 8); + __m128i stmp = _mm_set1_epi64x(aux64); + stmp = _mm_unpacklo_epi8(_mm_and_si128(stmp, m4), _mm_and_si128(_mm_srli_epi16(stmp, 4), m4)); + const __m128i scales = _mm_add_epi8(_mm_slli_epi16(stmp, 1), m1); + + __m256i sumi1 = _mm256_setzero_si256(); + __m256i sumi2 = _mm256_setzero_si256(); + for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) { + const __m256i q8_1 = _mm256_loadu_si256((const __m256i *)q8); q8 += 32; + const __m256i q8_2 = _mm256_loadu_si256((const __m256i *)q8); q8 += 32; + const __m128i q2_data = _mm_loadu_si128((const __m128i*)q2); q2 += 8; + aux_gindex = _mm_and_si128(q2_data, m511); + aux_sindex = _mm_and_si128(_mm_srli_epi16(q2_data, 9), m127); + const __m256i q2_1 = _mm256_set_epi64x(iq2xs_grid[gindex[3]], iq2xs_grid[gindex[2]], iq2xs_grid[gindex[1]], iq2xs_grid[gindex[0]]); + const __m256i q2_2 = _mm256_set_epi64x(iq2xs_grid[gindex[7]], iq2xs_grid[gindex[6]], iq2xs_grid[gindex[5]], iq2xs_grid[gindex[4]]); + const __m256i s2_1 = _mm256_set_epi64x(signs64[sindex[3]], signs64[sindex[2]], signs64[sindex[1]], signs64[sindex[0]]); + const __m256i s2_2 = _mm256_set_epi64x(signs64[sindex[7]], signs64[sindex[6]], signs64[sindex[5]], signs64[sindex[4]]); + const __m256i q8s_1 = _mm256_sign_epi8(q8_1, s2_1); + const __m256i q8s_2 = _mm256_sign_epi8(q8_2, s2_2); + const __m256i dot1 = _mm256_maddubs_epi16(q2_1, q8s_1); + const __m256i dot2 = _mm256_maddubs_epi16(q2_2, q8s_2); + + const __m256i sc1 = _mm256_cvtepi8_epi16(_mm_shuffle_epi8(scales, get_scale_shuffle(ib32+0))); + const __m256i sc2 = _mm256_cvtepi8_epi16(_mm_shuffle_epi8(scales, get_scale_shuffle(ib32+1))); + + sumi1 = _mm256_add_epi32(sumi1, _mm256_madd_epi16(dot1, sc1)); + sumi2 = _mm256_add_epi32(sumi2, _mm256_madd_epi16(dot2, sc2)); + } + + accumf = _mm256_fmadd_ps(_mm256_set1_ps(d), _mm256_cvtepi32_ps(_mm256_add_epi32(sumi1, sumi2)), accumf); + + } + + *s = 0.125f * hsum_float_8(accumf); + +#else + + float sumf = 0.f; + for (int i = 0; i < nb; ++i) { + const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; + const uint16_t * restrict q2 = x[i].qs; + const uint8_t * restrict sc = x[i].scales; + const int8_t * restrict q8 = y[i].qs; + int32_t bsum = 0; + for (int ib32 = 0; ib32 < QK_K/32; ++ib32) { + const uint16_t ls1 = 2*(sc[ib32] & 0xf) + 1; + const uint16_t ls2 = 2*(sc[ib32] >> 4) + 1; + int32_t sumi = 0; + for (int l = 0; l < 2; ++l) { + const uint8_t * grid = (const uint8_t *)(iq2xs_grid + (q2[l] & 511)); + const uint8_t signs = ksigns_iq2xs[q2[l] >> 9]; + for (int j = 0; j < 8; ++j) { + sumi += grid[j] * q8[j] * (signs & kmask_iq2xs[j] ? -1 : 1); + } + q8 += 8; + } + bsum += sumi * ls1; + sumi = 0; + for (int l = 2; l < 4; ++l) { + const uint8_t * grid = (const uint8_t *)(iq2xs_grid + (q2[l] & 511)); + const uint8_t signs = ksigns_iq2xs[q2[l] >> 9]; + for (int j = 0; j < 8; ++j) { + sumi += grid[j] * q8[j] * (signs & kmask_iq2xs[j] ? -1 : 1); + } + q8 += 8; + } + bsum += sumi * ls2; + q2 += 4; + } + sumf += d * bsum; + } + *s = 0.125f * sumf; +#endif +} diff --git a/ggml-quants.h b/ggml-quants.h index 8dd911d41..df5e7ae80 100644 --- a/ggml-quants.h +++ b/ggml-quants.h @@ -174,6 +174,14 @@ typedef struct { } block_iq2_xxs; static_assert(sizeof(block_iq2_xxs) == sizeof(ggml_fp16_t) + QK_K/8*sizeof(uint16_t), "wrong iq2_xxs block size/padding"); +// 2.3125 bpw quants +typedef struct { + ggml_fp16_t d; + uint16_t qs[QK_K/8]; + uint8_t scales[QK_K/32]; +} block_iq2_xs; +static_assert(sizeof(block_iq2_xs) == sizeof(ggml_fp16_t) + QK_K/8*sizeof(uint16_t) + QK_K/32, "wrong iq2_xs block size/padding"); + // Quantization void quantize_row_q4_0_reference(const float * restrict x, block_q4_0 * restrict y, int k); void quantize_row_q4_1_reference(const float * restrict x, block_q4_1 * restrict y, int k); @@ -189,6 +197,7 @@ void quantize_row_q5_K_reference(const float * restrict x, block_q5_K * restrict void quantize_row_q6_K_reference(const float * restrict x, block_q6_K * restrict y, int k); void quantize_row_q8_K_reference(const float * restrict x, block_q8_K * restrict y, int k); void quantize_row_iq2_xxs_reference(const float * restrict x, block_iq2_xxs * restrict y, int k); +void quantize_row_iq2_xs_reference (const float * restrict x, block_iq2_xs * restrict y, int k); void quantize_row_q4_0(const float * restrict x, void * restrict y, int k); void quantize_row_q4_1(const float * restrict x, void * restrict y, int k); @@ -204,6 +213,7 @@ void quantize_row_q5_K(const float * restrict x, void * restrict y, int k); void quantize_row_q6_K(const float * restrict x, void * restrict y, int k); void quantize_row_q8_K(const float * restrict x, void * restrict y, int k); void quantize_row_iq2_xxs(const float * restrict x, void * restrict y, int k); +void quantize_row_iq2_xs (const float * restrict x, void * restrict y, int k); // Dequantization void dequantize_row_q4_0(const block_q4_0 * restrict x, float * restrict y, int k); @@ -220,6 +230,7 @@ void dequantize_row_q5_K(const block_q5_K * restrict x, float * restrict y, int void dequantize_row_q6_K(const block_q6_K * restrict x, float * restrict y, int k); void dequantize_row_q8_K(const block_q8_K * restrict x, float * restrict y, int k); void dequantize_row_iq2_xxs(const block_iq2_xxs * restrict x, float * restrict y, int k); +void dequantize_row_iq2_xs (const block_iq2_xs * restrict x, float * restrict y, int k); // Dot product void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, const void * restrict vx, const void * restrict vy); @@ -234,3 +245,4 @@ void ggml_vec_dot_q4_K_q8_K(int n, float * restrict s, const void * restrict vx, void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, const void * restrict vx, const void * restrict vy); void ggml_vec_dot_q6_K_q8_K(int n, float * restrict s, const void * restrict vx, const void * restrict vy); void ggml_vec_dot_iq2_xxs_q8_K(int n, float * restrict s, const void * restrict vx, const void * restrict vy); +void ggml_vec_dot_iq2_xs_q8_K (int n, float * restrict s, const void * restrict vx, const void * restrict vy); diff --git a/ggml.c b/ggml.c index adb387100..6dbd7626c 100644 --- a/ggml.c +++ b/ggml.c @@ -132,7 +132,7 @@ void ggml_print_backtrace(void) { "-ex", "bt -frame-info source-and-location", "-ex", "detach", "-ex", "quit", - NULL); + (char *) NULL); } else { waitpid(pid, NULL, 0); } @@ -394,6 +394,12 @@ static const size_t CACHE_LINE_SIZE_F32 = CACHE_LINE_SIZE/sizeof(float); static void ggml_vec_dot_f32(const int n, float * restrict s, const float * restrict x, const float * restrict y); static void ggml_vec_dot_f16(const int n, float * restrict s, ggml_fp16_t * restrict x, ggml_fp16_t * restrict y); +ggml_collect_imatrix_t g_imatrix_collect = NULL; + +void ggml_set_imatrix_collection(ggml_collect_imatrix_t imatrix_collect) { + g_imatrix_collect = imatrix_collect; +} + static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = { [GGML_TYPE_I8] = { .type_name = "i8", @@ -584,6 +590,17 @@ static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = { .vec_dot = ggml_vec_dot_iq2_xxs_q8_K, .vec_dot_type = GGML_TYPE_Q8_K, }, + [GGML_TYPE_IQ2_XS] = { + .type_name = "iq2_xs", + .blck_size = QK_K, + .type_size = sizeof(block_iq2_xs), + .is_quantized = true, + .to_float = (ggml_to_float_t) dequantize_row_iq2_xs, + .from_float = quantize_row_iq2_xs, + .from_float_reference = (ggml_from_float_t) quantize_row_iq2_xs_reference, + .vec_dot = ggml_vec_dot_iq2_xs_q8_K, + .vec_dot_type = GGML_TYPE_Q8_K, + }, [GGML_TYPE_Q8_K] = { .type_name = "q8_K", .blck_size = QK_K, @@ -2123,6 +2140,7 @@ enum ggml_type ggml_ftype_to_ggml_type(enum ggml_ftype ftype) { case GGML_FTYPE_MOSTLY_Q5_K: wtype = GGML_TYPE_Q5_K; break; case GGML_FTYPE_MOSTLY_Q6_K: wtype = GGML_TYPE_Q6_K; break; case GGML_FTYPE_MOSTLY_IQ2_XXS: wtype = GGML_TYPE_IQ2_XXS; break; + case GGML_FTYPE_MOSTLY_IQ2_XS: wtype = GGML_TYPE_IQ2_XS; break; case GGML_FTYPE_UNKNOWN: wtype = GGML_TYPE_COUNT; break; case GGML_FTYPE_MOSTLY_Q4_1_SOME_F16: wtype = GGML_TYPE_COUNT; break; } @@ -2336,6 +2354,10 @@ struct ggml_context * ggml_init(struct ggml_init_params params) { } void ggml_free(struct ggml_context * ctx) { + if (ctx == NULL) { + return; + } + // make this function thread safe ggml_critical_section_start(); @@ -4311,13 +4333,13 @@ struct ggml_tensor * ggml_set_2d_inplace( static struct ggml_tensor * ggml_cpy_impl( struct ggml_context * ctx, struct ggml_tensor * a, - struct ggml_tensor * b, - bool inplace) { + struct ggml_tensor * b) { GGML_ASSERT(ggml_nelements(a) == ggml_nelements(b)); bool is_node = false; - if (!inplace && (a->grad || b->grad)) { + if (a->grad || b->grad) { + // inplace is false and either one have a grad is_node = true; } @@ -4341,29 +4363,38 @@ struct ggml_tensor * ggml_cpy( struct ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * b) { - return ggml_cpy_impl(ctx, a, b, false); + return ggml_cpy_impl(ctx, a, b); } -struct ggml_tensor * ggml_cpy_inplace( +struct ggml_tensor * ggml_cast( struct ggml_context * ctx, - struct ggml_tensor * a, - struct ggml_tensor * b) { - return ggml_cpy_impl(ctx, a, b, true); + struct ggml_tensor * a, + enum ggml_type type) { + bool is_node = false; + + struct ggml_tensor * result = ggml_new_tensor(ctx, type, GGML_MAX_DIMS, a->ne); + ggml_format_name(result, "%s (copy)", a->name); + + result->op = GGML_OP_CPY; + result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result->src[0] = a; + result->src[1] = result; + + return result; } // ggml_cont static struct ggml_tensor * ggml_cont_impl( struct ggml_context * ctx, - struct ggml_tensor * a, - bool inplace) { + struct ggml_tensor * a) { bool is_node = false; - if (!inplace && a->grad) { + if (a->grad) { is_node = true; } - struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); + struct ggml_tensor * result = ggml_dup_tensor(ctx, a); ggml_format_name(result, "%s (cont)", a->name); result->op = GGML_OP_CONT; @@ -4376,13 +4407,7 @@ static struct ggml_tensor * ggml_cont_impl( struct ggml_tensor * ggml_cont( struct ggml_context * ctx, struct ggml_tensor * a) { - return ggml_cont_impl(ctx, a, false); -} - -struct ggml_tensor * ggml_cont_inplace( - struct ggml_context * ctx, - struct ggml_tensor * a) { - return ggml_cont_impl(ctx, a, true); + return ggml_cont_impl(ctx, a); } // make contiguous, with new shape @@ -7449,6 +7474,7 @@ static void ggml_compute_forward_add( case GGML_TYPE_Q5_K: case GGML_TYPE_Q6_K: case GGML_TYPE_IQ2_XXS: + case GGML_TYPE_IQ2_XS: { ggml_compute_forward_add_q_f32(params, src0, src1, dst); } break; @@ -7714,6 +7740,7 @@ static void ggml_compute_forward_add1( case GGML_TYPE_Q5_K: case GGML_TYPE_Q6_K: case GGML_TYPE_IQ2_XXS: + case GGML_TYPE_IQ2_XS: { ggml_compute_forward_add1_q_f32(params, src0, src1, dst); } break; @@ -7829,6 +7856,7 @@ static void ggml_compute_forward_acc( case GGML_TYPE_Q5_K: case GGML_TYPE_Q6_K: case GGML_TYPE_IQ2_XXS: + case GGML_TYPE_IQ2_XS: default: { GGML_ASSERT(false); @@ -9762,6 +9790,10 @@ static void ggml_compute_forward_mul_mat( const int ith = params->ith; const int nth = params->nth; + if (ith == 1 && g_imatrix_collect) { + g_imatrix_collect(src0, src1); + } + const enum ggml_type type = src0->type; const bool src1_cont = ggml_is_contiguous(src1); @@ -10065,6 +10097,10 @@ static void ggml_compute_forward_mul_mat_id( const struct ggml_tensor * src0_cur = dst->src[cur_a + 2]; + if (ith == 1 && g_imatrix_collect) { + g_imatrix_collect(src0_cur, src1); + } + const void * wdata = (src1->type == vec_dot_type) ? src1->data : params->wdata; const size_t row_size = ggml_row_size(vec_dot_type, ne10); @@ -10471,6 +10507,7 @@ static void ggml_compute_forward_out_prod( case GGML_TYPE_Q5_K: case GGML_TYPE_Q6_K: case GGML_TYPE_IQ2_XXS: + case GGML_TYPE_IQ2_XS: { ggml_compute_forward_out_prod_q_f32(params, src0, src1, dst); } break; @@ -10646,6 +10683,7 @@ static void ggml_compute_forward_set( case GGML_TYPE_Q5_K: case GGML_TYPE_Q6_K: case GGML_TYPE_IQ2_XXS: + case GGML_TYPE_IQ2_XS: default: { GGML_ASSERT(false); @@ -10841,6 +10879,7 @@ static void ggml_compute_forward_get_rows( case GGML_TYPE_Q5_K: case GGML_TYPE_Q6_K: case GGML_TYPE_IQ2_XXS: + case GGML_TYPE_IQ2_XS: { ggml_compute_forward_get_rows_q(params, src0, src1, dst); } break; @@ -11478,6 +11517,7 @@ static void ggml_compute_forward_alibi( case GGML_TYPE_Q5_K: case GGML_TYPE_Q6_K: case GGML_TYPE_IQ2_XXS: + case GGML_TYPE_IQ2_XS: case GGML_TYPE_Q8_K: case GGML_TYPE_I8: case GGML_TYPE_I16: @@ -11553,6 +11593,7 @@ static void ggml_compute_forward_clamp( case GGML_TYPE_Q5_K: case GGML_TYPE_Q6_K: case GGML_TYPE_IQ2_XXS: + case GGML_TYPE_IQ2_XS: case GGML_TYPE_Q8_K: case GGML_TYPE_I8: case GGML_TYPE_I16: @@ -14851,7 +14892,7 @@ size_t ggml_hash_find_or_insert(struct ggml_hash_set hash_set, struct ggml_tenso return i; } -static struct ggml_hash_set ggml_hash_set_new(size_t size) { +struct ggml_hash_set ggml_hash_set_new(size_t size) { size = ggml_hash_size(size); struct ggml_hash_set result; result.size = size; @@ -16600,7 +16641,7 @@ static thread_ret_t ggml_graph_compute_thread(void * data) { return GGML_EXIT_SUCCESS; } -struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) { +struct ggml_cplan ggml_graph_plan(const struct ggml_cgraph * cgraph, int n_threads) { if (n_threads <= 0) { n_threads = GGML_DEFAULT_N_THREADS; } @@ -16662,14 +16703,15 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) { } break; case GGML_OP_MUL_MAT_ID: { + cur = 0; const struct ggml_tensor * src0 = node->src[2]; const struct ggml_tensor * src1 = node->src[1]; const enum ggml_type vec_dot_type = type_traits[src0->type].vec_dot_type; if (src1->type != vec_dot_type) { - cur = ggml_row_size(vec_dot_type, ggml_nelements(src1)); + cur += ggml_row_size(vec_dot_type, ggml_nelements(src1)); } const int n_as = ggml_get_op_params_i32(node, 1); - cur = GGML_PAD(cur, sizeof(int64_t)); // align + cur += GGML_PAD(cur, sizeof(int64_t)); // align cur += n_as * sizeof(int64_t); // matrix_row_counts cur += n_as * src1->ne[1] * sizeof(int64_t); // matrix_rows } break; @@ -18674,6 +18716,12 @@ size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, i block_iq2_xxs * block = (block_iq2_xxs*)dst + start / QK_K; result = ggml_quantize_iq2_xxs(src + start, block, n, n, hist); } break; + case GGML_TYPE_IQ2_XS: + { + GGML_ASSERT(start % QK_K == 0); + block_iq2_xs * block = (block_iq2_xs*)dst + start / QK_K; + result = ggml_quantize_iq2_xs(src + start, block, n, n, hist); + } break; case GGML_TYPE_F16: { int elemsize = sizeof(ggml_fp16_t); @@ -19029,8 +19077,8 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p (int64_t) info->ne[3]; if (ne % ggml_blck_size(info->type) != 0) { - fprintf(stderr, "%s: tensor '%s' number of elements (%" PRId64 ") is not a multiple of block size (%d)\n", - __func__, info->name.data, ne, ggml_blck_size(info->type)); + fprintf(stderr, "%s: tensor '%s' of type %d (%s) number of elements (%" PRId64 ") is not a multiple of block size (%d)\n", + __func__, info->name.data, (int)info->type, ggml_type_name(info->type), ne, ggml_blck_size(info->type)); fclose(file); gguf_free(ctx); return NULL; diff --git a/ggml.h b/ggml.h index c55e598b4..b18ba7812 100644 --- a/ggml.h +++ b/ggml.h @@ -218,7 +218,9 @@ #define GGML_MAX_PARAMS 2048 #define GGML_MAX_CONTEXTS 64 #define GGML_MAX_SRC 10 +#ifndef GGML_MAX_NAME #define GGML_MAX_NAME 64 +#endif #define GGML_MAX_OP_PARAMS 64 #define GGML_DEFAULT_N_THREADS 4 #define GGML_DEFAULT_GRAPH_SIZE 2048 @@ -340,6 +342,7 @@ extern "C" { GGML_TYPE_Q6_K = 14, GGML_TYPE_Q8_K = 15, GGML_TYPE_IQ2_XXS = 16, + GGML_TYPE_IQ2_XS = 17, GGML_TYPE_I8, GGML_TYPE_I16, GGML_TYPE_I32, @@ -375,6 +378,7 @@ extern "C" { GGML_FTYPE_MOSTLY_Q5_K = 13, // except 1d tensors GGML_FTYPE_MOSTLY_Q6_K = 14, // except 1d tensors GGML_FTYPE_MOSTLY_IQ2_XXS = 15, // except 1d tensors + GGML_FTYPE_MOSTLY_IQ2_XS = 16, // except 1d tensors }; // available tensor operations: @@ -1161,22 +1165,16 @@ extern "C" { struct ggml_tensor * a, struct ggml_tensor * b); - // a -> b, in-place, return view(b) - GGML_API struct ggml_tensor * ggml_cpy_inplace( + GGML_API struct ggml_tensor * ggml_cast( struct ggml_context * ctx, struct ggml_tensor * a, - struct ggml_tensor * b); + enum ggml_type type); // make contiguous GGML_API struct ggml_tensor * ggml_cont( struct ggml_context * ctx, struct ggml_tensor * a); - // make contiguous, in-place - GGML_API struct ggml_tensor * ggml_cont_inplace( - struct ggml_context * ctx, - struct ggml_tensor * a); - // make contiguous, with new shape GGML_API struct ggml_tensor * ggml_cont_1d( struct ggml_context * ctx, @@ -1849,8 +1847,8 @@ extern "C" { // ggml_graph_plan() has to be called before ggml_graph_compute() // when plan.work_size > 0, caller must allocate memory for plan.work_data - GGML_API struct ggml_cplan ggml_graph_plan (struct ggml_cgraph * cgraph, int n_threads /*= GGML_DEFAULT_N_THREADS*/); - GGML_API int ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cplan * cplan); + GGML_API struct ggml_cplan ggml_graph_plan (const struct ggml_cgraph * cgraph, int n_threads /*= GGML_DEFAULT_N_THREADS*/); + GGML_API int ggml_graph_compute( struct ggml_cgraph * cgraph, struct ggml_cplan * cplan); // same as ggml_graph_compute() but the work data is allocated as a part of the context // note: the drawback of this API is that you must have ensured that the context has enough memory for the work data @@ -2070,9 +2068,16 @@ extern "C" { GGML_API size_t ggml_quantize_q5_K(const float * src, void * dst, int n, int k, int64_t * hist); GGML_API size_t ggml_quantize_q6_K(const float * src, void * dst, int n, int k, int64_t * hist); GGML_API size_t ggml_quantize_iq2_xxs(const float * src, void * dst, int n, int k, int64_t * hist); + GGML_API size_t ggml_quantize_iq2_xs (const float * src, void * dst, int n, int k, int64_t * hist); GGML_API size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, int start, int n, int64_t * hist); + // + // Importance matrix + // + typedef void(*ggml_collect_imatrix_t)(const struct ggml_tensor * src0, const struct ggml_tensor * src1); + GGML_API void ggml_set_imatrix_collection(ggml_collect_imatrix_t imatrix_collect); + // // gguf // diff --git a/gguf-py/gguf/tensor_mapping.py b/gguf-py/gguf/tensor_mapping.py index 80c1d5449..24a089037 100644 --- a/gguf-py/gguf/tensor_mapping.py +++ b/gguf-py/gguf/tensor_mapping.py @@ -57,6 +57,7 @@ class TensorNameMap: "transformer.norm_f", # mpt "ln_f", # refact bloom qwen gpt2 "language_model.encoder.final_layernorm", # persimmon + "model.final_layernorm", # persimmon "lm_head.ln", # phi2 ), @@ -98,6 +99,7 @@ class TensorNameMap: "transformer.h.{bid}.self_attention.query_key_value", # falcon "h.{bid}.self_attention.query_key_value", # bloom "language_model.encoder.layers.{bid}.self_attention.query_key_value", # persimmon + "model.layers.{bid}.self_attn.query_key_value", # persimmon "h.{bid}.attn.c_attn", # gpt2 "transformer.h.{bid}.mixer.Wqkv", # phi2 ), @@ -141,6 +143,7 @@ class TensorNameMap: "encoder.layer.{bid}.attention.output.dense", # bert "transformer.h.{bid}.attn.out_proj", # gpt-j "language_model.encoder.layers.{bid}.self_attention.dense", # persimmon + "model.layers.{bid}.self_attn.dense", # persimmon "h.{bid}.attn.c_proj", # gpt2 "transformer.h.{bid}.mixer.out_proj", # phi2 "model.layers.layers.{bid}.self_attn.o_proj", # plamo @@ -184,6 +187,7 @@ class TensorNameMap: "encoder.layer.{bid}.intermediate.dense", # bert "transformer.h.{bid}.mlp.fc_in", # gpt-j "language_model.encoder.layers.{bid}.mlp.dense_h_to_4h", # persimmon + "model.layers.{bid}.mlp.dense_h_to_4h", # persimmon "transformer.h.{bid}.mlp.w1", # qwen "h.{bid}.mlp.c_fc", # gpt2 "transformer.h.{bid}.mlp.fc1", # phi2 @@ -225,6 +229,7 @@ class TensorNameMap: "encoder.layer.{bid}.output.dense", # bert "transformer.h.{bid}.mlp.fc_out", # gpt-j "language_model.encoder.layers.{bid}.mlp.dense_4h_to_h", # persimmon + "model.layers.{bid}.mlp.dense_4h_to_h", # persimmon "h.{bid}.mlp.c_proj", # gpt2 "transformer.h.{bid}.mlp.fc2", # phi2 "model.layers.layers.{bid}.mlp.down_proj", # plamo @@ -237,10 +242,12 @@ class TensorNameMap: MODEL_TENSOR.ATTN_Q_NORM: ( "language_model.encoder.layers.{bid}.self_attention.q_layernorm", + "model.layers.{bid}.self_attn.q_layernorm", # persimmon ), MODEL_TENSOR.ATTN_K_NORM: ( "language_model.encoder.layers.{bid}.self_attention.k_layernorm", + "model.layers.{bid}.self_attn.k_layernorm", # persimmon ), MODEL_TENSOR.ROPE_FREQS: ( diff --git a/llama.cpp b/llama.cpp index 618e47cf0..d4668a221 100644 --- a/llama.cpp +++ b/llama.cpp @@ -1,5 +1,4 @@ #define LLAMA_API_INTERNAL -//#define LLAMA_GGML_BACKEND_CUDA_TEST // for testing only - enables ggml-cuda through ggml-backend, disables partial offloading #include "llama.h" #include "unicode.h" @@ -154,10 +153,6 @@ static bool is_float_close(float a, float b, float abs_tol) { return std::fabs(b - a) <= abs_tol; } -#ifdef GGML_USE_CPU_HBM -#include -#endif - static void zeros(std::ofstream & file, size_t n) { char zero = 0; for (size_t i = 0; i < n; ++i) { @@ -1249,12 +1244,6 @@ struct llama_mlock { #endif }; -typedef void (*offload_func_t)(struct ggml_tensor * tensor); - -static void ggml_offload_nop(struct ggml_tensor * tensor) { - (void) tensor; -} - static std::string llama_token_to_piece(const struct llama_context * ctx, llama_token token) { std::vector result(8, 0); const int n_tokens = llama_token_to_piece(llama_get_model(ctx), token, result.data(), result.size()); @@ -1270,19 +1259,14 @@ static std::string llama_token_to_piece(const struct llama_context * ctx, llama_ return std::string(result.data(), result.size()); } -static ggml_backend_buffer_type_t llama_default_buffer_type(int n_gpu_layers) { +static ggml_backend_buffer_type_t llama_default_buffer_type_cpu(bool host_buffer) { ggml_backend_buffer_type_t buft = nullptr; -#ifdef GGML_USE_METAL - if (n_gpu_layers > 0) { - buft = ggml_backend_metal_buffer_type(); +#if defined(GGML_USE_CUBLAS) + // host buffers should only be used when data is expected to be copied to/from the GPU + if (host_buffer) { + buft = ggml_backend_cuda_host_buffer_type(); } -#elif defined(GGML_USE_CUBLAS) && defined(LLAMA_GGML_BACKEND_CUDA_TEST) - if (n_gpu_layers > 0) { - buft = ggml_backend_cuda_buffer_type(0); - } -#elif defined(GGML_USE_CUBLAS) - buft = ggml_backend_cuda_host_buffer_type(); #elif defined(GGML_USE_CPU_HBM) buft = ggml_backend_cpu_hbm_buffer_type(); #endif @@ -1290,10 +1274,45 @@ static ggml_backend_buffer_type_t llama_default_buffer_type(int n_gpu_layers) { if (buft == nullptr) { buft = ggml_backend_cpu_buffer_type(); } - return buft; - GGML_UNUSED(n_gpu_layers); + GGML_UNUSED(host_buffer); +} + +static ggml_backend_buffer_type_t llama_default_buffer_type_offload(int gpu) { + ggml_backend_buffer_type_t buft = nullptr; + +#ifdef GGML_USE_METAL + buft = ggml_backend_metal_buffer_type(); +#elif defined(GGML_USE_CUBLAS) + buft = ggml_backend_cuda_buffer_type(gpu); +#elif defined(GGML_USE_CLBLAST) + buft = ggml_backend_opencl_buffer_type(); +#endif + + if (buft == nullptr) { + buft = llama_default_buffer_type_cpu(true); + } + return buft; + + GGML_UNUSED(gpu); +} + +static ggml_backend_buffer_type_t llama_default_buffer_type_split(int fallback_gpu, const float * tensor_split) { + ggml_backend_buffer_type_t buft = nullptr; + +#ifdef GGML_USE_CUBLAS + if (ggml_backend_cuda_get_device_count() > 1) { + buft = ggml_backend_cuda_split_buffer_type(tensor_split); + } +#endif + + if (buft == nullptr) { + buft = llama_default_buffer_type_offload(fallback_gpu); + } + return buft; + + GGML_UNUSED(tensor_split); } // @@ -1504,24 +1523,24 @@ struct llama_kv_cache { std::vector k_l; // per layer std::vector v_l; - struct ggml_context * ctx = NULL; + std::vector ctxs; + std::vector bufs; - ggml_backend_buffer_t buf = NULL; + size_t total_size() const { + size_t size = 0; + for (ggml_backend_buffer_t buf : bufs) { + size += ggml_backend_buffer_get_size(buf); + } + return size; + } ~llama_kv_cache() { -#if defined(GGML_USE_CUBLAS) && !defined(LLAMA_GGML_BACKEND_CUDA_TEST) - if (ggml_cublas_loaded()) { - for (size_t i = 0; i < k_l.size(); ++i) { - ggml_cuda_free_data(k_l[i]); - ggml_cuda_free_data(v_l[i]); - } - } -#endif - if (ctx) { + for (struct ggml_context * ctx : ctxs) { ggml_free(ctx); } - - ggml_backend_buffer_free(buf); + for (ggml_backend_buffer_t buf : bufs) { + ggml_backend_buffer_free(buf); + } } }; @@ -1598,16 +1617,32 @@ struct llama_model { std::vector layers; + llama_split_mode split_mode; + int main_gpu; int n_gpu_layers; // gguf metadata std::unordered_map gguf_kv; - // context - struct ggml_context * ctx = NULL; + // layer -> buffer type mapping + struct layer_buft { + layer_buft() : buft_matrix(nullptr), buft(nullptr) {} + layer_buft(ggml_backend_buffer_type_t matrix) : buft_matrix(matrix), buft(matrix) {} + layer_buft(ggml_backend_buffer_type_t matrix, ggml_backend_buffer_type_t other) : buft_matrix(matrix), buft(other) {} - // the model memory buffer - ggml_backend_buffer_t buf = NULL; + ggml_backend_buffer_type_t buft_matrix; // matrices only - used by split buffers and backends that support only matrix multiplication + ggml_backend_buffer_type_t buft; // everything else + }; + + layer_buft buft_input; + layer_buft buft_output; + std::vector buft_layer; + + // contexts where the model tensors metadata is stored + std::vector ctxs; + + // the model memory buffers for the tensor data + std::vector bufs; // model memory mapped file std::unique_ptr mapping; @@ -1623,39 +1658,32 @@ struct llama_model { int64_t t_start_us = 0; ~llama_model() { -#if defined(GGML_USE_CUBLAS) && !defined(LLAMA_GGML_BACKEND_CUDA_TEST) - if (ggml_cublas_loaded()) { - for (size_t i = 0; i < tensors_by_name.size(); ++i) { - ggml_cuda_free_data(tensors_by_name[i].second); - } - ggml_cuda_free_scratch(); - } -#endif - -#if defined(GGML_USE_CLBLAST) - for (size_t i = 0; i < tensors_by_name.size(); ++i) { - ggml_cl_free_data(tensors_by_name[i].second); - } -#endif - if (ctx) { + for (struct ggml_context * ctx : ctxs) { ggml_free(ctx); } - - ggml_backend_buffer_free(buf); + for (ggml_backend_buffer_t buf : bufs) { + ggml_backend_buffer_free(buf); + } } }; struct llama_context { llama_context(const llama_model & model) : model(model), t_start_us(model.t_start_us), t_load_us(model.t_load_us) {} ~llama_context() { - ggml_allocr_free(alloc); - ggml_backend_buffer_free(buf_alloc); - ggml_backend_free(backend); + ggml_backend_sched_free(sched); + + for (ggml_backend_t backend : backends) { + ggml_backend_free(backend); + } } llama_cparams cparams; - ggml_backend_t backend = nullptr; + std::vector backends; +#ifdef GGML_USE_METAL + ggml_backend_t backend_metal = nullptr; +#endif + ggml_backend_t backend_cpu = nullptr; const llama_model & model; @@ -1689,8 +1717,9 @@ struct llama_context { // memory buffers used to evaluate the model std::vector buf_compute_meta; - ggml_backend_buffer_t buf_alloc = NULL; - ggml_allocr * alloc = NULL; + ggml_backend_sched_t sched = nullptr; + // allocator for the input tensors + ggml_tallocr * alloc = nullptr; // TODO(jared): remove this #if defined(GGML_USE_KOMPUTE) @@ -1710,16 +1739,17 @@ struct llama_context { // static bool llama_kv_cache_init( - const struct llama_hparams & hparams, struct llama_kv_cache & cache, + const llama_model & model, ggml_type ktype, ggml_type vtype, uint32_t n_ctx, - int n_gpu_layers, bool offload) { + const struct llama_hparams & hparams = model.hparams; + const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa(); const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(); - const uint32_t n_layer = hparams.n_layer; + const int64_t n_layer = hparams.n_layer; cache.has_shift = false; @@ -1730,62 +1760,65 @@ static bool llama_kv_cache_init( cache.cells.clear(); cache.cells.resize(n_ctx); - struct ggml_init_params params; - params.mem_size = 2u*n_layer*ggml_tensor_overhead(); - params.mem_buffer = NULL; - params.no_alloc = true; +#ifdef GGML_USE_CLBLAST + offload = false; +#endif - cache.ctx = ggml_init(params); + // count used buffer types + std::map buft_layer_count; + if (offload) { + for (int64_t i = 0; i < n_layer; ++i) { + buft_layer_count[model.buft_layer[i].buft]++; + } + } else { + buft_layer_count[llama_default_buffer_type_cpu(true)] = n_layer; + } - size_t vram_kv_cache = 0; - - if (!cache.ctx) { - LLAMA_LOG_ERROR("%s: failed to allocate memory for kv cache\n", __func__); - return false; + // create a context for each buffer type + std::map ctx_map; + for (auto & it : buft_layer_count) { + int n_layers = it.second; + struct ggml_init_params params = { + /*.mem_size =*/ 2u*n_layers*ggml_tensor_overhead(), + /*.mem_buffer =*/ NULL, + /*.no_alloc =*/ true, + }; + ggml_context * ctx = ggml_init(params); + if (!ctx) { + LLAMA_LOG_ERROR("%s: failed to allocate context for kv cache\n", __func__); + return false; + } + ctx_map[it.first] = ctx; + cache.ctxs.push_back(ctx); } cache.k_l.reserve(n_layer); cache.v_l.reserve(n_layer); - const int i_gpu_start = (int) n_layer - n_gpu_layers; - for (int i = 0; i < (int) n_layer; i++) { - ggml_tensor * k = ggml_new_tensor_1d(cache.ctx, ktype, n_embd_k_gqa*n_ctx); - ggml_tensor * v = ggml_new_tensor_1d(cache.ctx, vtype, n_embd_v_gqa*n_ctx); + struct ggml_context * ctx = offload ? ctx_map.at(model.buft_layer[i].buft) : cache.ctxs.front(); + ggml_tensor * k = ggml_new_tensor_1d(ctx, ktype, n_embd_k_gqa*n_ctx); + ggml_tensor * v = ggml_new_tensor_1d(ctx, vtype, n_embd_v_gqa*n_ctx); ggml_format_name(k, "cache_k_l%d", i); ggml_format_name(v, "cache_v_l%d", i); cache.k_l.push_back(k); cache.v_l.push_back(v); -#if defined(GGML_USE_CUBLAS) && !defined(LLAMA_GGML_BACKEND_CUDA_TEST) - if (i >= i_gpu_start) { - if (offload) { - ggml_cuda_assign_buffers_no_scratch(k); - ggml_cuda_assign_buffers_no_scratch(v); - vram_kv_cache += ggml_nbytes(k); - vram_kv_cache += ggml_nbytes(v); - // HACK: mark tensor as allocated - k->data = v->data = (void *)(uintptr_t)1; - } + } + + // allocate tensors and initialize the buffers to avoid NaNs in the padding + for (auto it : ctx_map) { + ggml_backend_buffer_type_t buft = it.first; + ggml_context * ctx = it.second; + ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, buft); + if (!buf) { + LLAMA_LOG_ERROR("%s: failed to allocate buffer for kv cache\n", __func__); + return false; } -#endif // GGML_USE_CUBLAS + ggml_backend_buffer_clear(buf, 0); + LLAMA_LOG_INFO("%s: %10s KV buffer size = %8.2f MiB\n", __func__, ggml_backend_buffer_name(buf), ggml_backend_buffer_get_size(buf)/1024.0/1024.0); + cache.bufs.push_back(buf); } - // allocate tensors - cache.buf = ggml_backend_alloc_ctx_tensors_from_buft(cache.ctx, llama_default_buffer_type(n_gpu_layers)); - - // buf may be NULL with full offload - if (cache.buf) { - // initialize the buffer to avoid NaNs in the padding - ggml_backend_buffer_clear(cache.buf, 0); - } - - if (vram_kv_cache > 0) { - LLAMA_LOG_INFO("%s: VRAM kv self = %.2f MB\n", __func__, vram_kv_cache / 1024.0 / 1024.0); - } - - GGML_UNUSED(i_gpu_start); - GGML_UNUSED(offload); - return true; } @@ -2287,6 +2320,7 @@ struct llama_model_loader { case GGML_TYPE_Q5_K: ftype = LLAMA_FTYPE_MOSTLY_Q5_K_M; break; case GGML_TYPE_Q6_K: ftype = LLAMA_FTYPE_MOSTLY_Q6_K; break; case GGML_TYPE_IQ2_XXS: ftype = LLAMA_FTYPE_MOSTLY_IQ2_XXS; break; + case GGML_TYPE_IQ2_XS: ftype = LLAMA_FTYPE_MOSTLY_IQ2_XS; break; default: { LLAMA_LOG_WARN("%s: unknown type %s\n", __func__, ggml_type_name(type_max)); @@ -2420,9 +2454,8 @@ struct llama_model_loader { return get_tensor_meta(get_tensor_name(i)); } - struct ggml_tensor * create_tensor_for(struct ggml_context * ctx, struct ggml_tensor * meta, ggml_backend_type backend) { + struct ggml_tensor * create_tensor_for(struct ggml_context * ctx, struct ggml_tensor * meta) { struct ggml_tensor * tensor = ggml_dup_tensor(ctx, meta); - tensor->backend = backend; // TODO: ggml_set_backend ggml_set_name(tensor, ggml_get_name(meta)); n_created++; @@ -2430,7 +2463,7 @@ struct llama_model_loader { return tensor; } - struct ggml_tensor * create_tensor(struct ggml_context * ctx, const std::string & name, const std::vector & ne, ggml_backend_type backend, bool required = true) { + struct ggml_tensor * create_tensor(struct ggml_context * ctx, const std::string & name, const std::vector & ne, bool required = true) { struct ggml_tensor * cur = ggml_get_tensor(ctx_meta, name.c_str()); if (cur == NULL) { @@ -2440,12 +2473,6 @@ struct llama_model_loader { throw std::runtime_error(format("%s: tensor '%s' not found", __func__, name.c_str())); } - if (backend == GGML_BACKEND_GPU_SPLIT) { - if (ne.size() == 1) { - throw std::runtime_error(format("%s: 1-dimensional tensor '%s' cannot be split on the GPU", __func__, name.c_str())); - } - } - { bool is_ok = true; for (size_t i = 0; i < ne.size(); ++i) { @@ -2463,7 +2490,7 @@ struct llama_model_loader { } } - return create_tensor_for(ctx, cur, backend); + return create_tensor_for(ctx, cur); } void done_getting_tensors() const { @@ -2482,26 +2509,36 @@ struct llama_model_loader { return gguf_get_data_offset(ctx_gguf) + gguf_get_tensor_offset(ctx_gguf, idx); } - void init_mapping(bool prefetch = true) { - /* - // prefetch only CPU tensors - if (use_mmap) { - size_t size_pref = 0; // prefetch - - for (int i = 0; i < gguf_get_n_tensors(ctx_gguf); i++) { - struct ggml_tensor * cur = ggml_get_tensor(ctx, gguf_get_tensor_name(ctx_gguf, i)); - if (cur->backend == GGML_BACKEND_CPU) { - size_t tensor_end = gguf_get_tensor_offset(ctx_gguf, i) + ggml_nbytes(cur); - size_pref = std::max(size_pref, tensor_end); - } - } - mapping.reset(new llama_mmap(&file, gguf_get_data_offset(ctx_gguf) + size_pref, ggml_is_numa())); - } - */ + void init_mapping(bool prefetch = true, llama_mlock * lmlock = nullptr) { // prefetch the whole file - all the data is needed anyway if (use_mmap) { mapping.reset(new llama_mmap(&file, prefetch ? -1 : 0, ggml_is_numa())); } + + // compute the total size of all tensors for progress reporting + for (int i = 0; i < gguf_get_n_tensors(ctx_gguf); i++) { + struct ggml_tensor * cur = ggml_get_tensor(ctx_meta, gguf_get_tensor_name(ctx_gguf, i)); + size_data += ggml_nbytes(cur); + } + + if (use_mmap && mapping) { + if (lmlock) { + lmlock->init(mapping->addr); + } + mmap_used_first = mapping->size; + } + } + + void get_mapping_range(size_t * first, size_t * last, ggml_context * ctx) const { + GGML_ASSERT(mapping); + + *first = mapping->size; + *last = 0; + for (ggml_tensor * tensor = ggml_get_first_tensor(ctx); tensor; tensor = ggml_get_next_tensor(ctx, tensor)) { + const size_t offs = file_offset(ggml_get_name(tensor)); + *first = std::min(*first, offs); + *last = std::max(*last, offs + ggml_nbytes(tensor)); + } } // for backwards compatibility, does not support ggml-backend @@ -2509,8 +2546,11 @@ struct llama_model_loader { const size_t offs = file_offset(ggml_get_name(cur)); if (use_mmap && mapping) { - GGML_ASSERT(cur->data == nullptr); - cur->data = (uint8_t *)mapping->addr + offs; + if (cur->data == nullptr) { + cur->data = (uint8_t *)mapping->addr + offs; + } else { + memcpy(cur->data, (uint8_t *)mapping->addr + offs, ggml_nbytes(cur)); + } } else { GGML_ASSERT(cur->data != nullptr); file.seek(offs, SEEK_SET); @@ -2518,37 +2558,23 @@ struct llama_model_loader { } } + size_t size_done = 0; + size_t size_data = 0; + size_t mmap_used_first = -1; + size_t mmap_used_last = 0; + // Returns false if cancelled by progress_callback - bool load_all_data(struct ggml_context * ctx, llama_progress_callback progress_callback, void * progress_callback_user_data, ggml_backend_buffer_t buf_mmap, llama_mlock * lmlock) const { - size_t size_data = 0; - - for (int i = 0; i < gguf_get_n_tensors(ctx_gguf); i++) { - struct ggml_tensor * cur = ggml_get_tensor(ctx, gguf_get_tensor_name(ctx_gguf, i)); - size_data += ggml_nbytes(cur); - } - - if (use_mmap && buf_mmap) { - if (lmlock) { - lmlock->init(mapping->addr); - } - } - -#if (defined(GGML_USE_CUBLAS) && !defined(LLAMA_GGML_BACKEND_CUDA_TEST)) || defined(GGML_USE_CLBLAST) - const bool legacy_offload = true; -#else - const bool legacy_offload = false; -#endif + bool load_all_data(struct ggml_context * ctx, llama_progress_callback progress_callback, void * progress_callback_user_data, ggml_backend_buffer_t buf_mmap, llama_mlock * lmlock) { + GGML_ASSERT(size_data != 0 && "call init_mapping() first"); std::vector> read_buf; - size_t size_done = 0; - - size_t mmap_first = -1; - size_t mmap_last = 0; - for (int i = 0; i < gguf_get_n_tensors(ctx_gguf); i++) { struct ggml_tensor * cur = ggml_get_tensor(ctx, gguf_get_tensor_name(ctx_gguf, i)); - GGML_ASSERT(cur); // unused tensors should have been caught by load_data already + if (!cur) { + // some tensors may be allocated in a different context + continue; + } if (progress_callback) { if (!progress_callback((float) size_done / size_data, progress_callback_user_data)) { @@ -2558,67 +2584,48 @@ struct llama_model_loader { const size_t offs = file_offset(ggml_get_name(cur)); - if (!legacy_offload || cur->backend == GGML_BACKEND_CPU) { - if (use_mmap && mapping) { - if (buf_mmap) { - ggml_backend_tensor_alloc(buf_mmap, cur, (uint8_t *) mapping->addr + offs); - if (lmlock) { - lmlock->grow_to(offs + ggml_nbytes(cur)); - } - mmap_first = std::min(mmap_first, offs); - mmap_last = std::max(mmap_last, offs + ggml_nbytes(cur)); - } else { - ggml_backend_tensor_set(cur, (uint8_t *) mapping->addr + offs, 0, ggml_nbytes(cur)); + if (use_mmap && mapping) { + if (buf_mmap && cur->data == nullptr) { + ggml_backend_tensor_alloc(buf_mmap, cur, (uint8_t *) mapping->addr + offs); + if (lmlock) { + lmlock->grow_to(offs + ggml_nbytes(cur)); } + mmap_used_first = std::min(mmap_used_first, offs); + mmap_used_last = std::max(mmap_used_last, offs + ggml_nbytes(cur)); } else { - if (ggml_backend_buffer_is_host(cur->buffer)) { - file.seek(offs, SEEK_SET); - file.read_raw(cur->data, ggml_nbytes(cur)); - } else { - read_buf.resize(ggml_nbytes(cur)); - file.seek(offs, SEEK_SET); - file.read_raw(read_buf.data(), ggml_nbytes(cur)); - ggml_backend_tensor_set(cur, read_buf.data(), 0, ggml_nbytes(cur)); - } + ggml_backend_tensor_set(cur, (uint8_t *) mapping->addr + offs, 0, ggml_nbytes(cur)); } } else { - // HACK: mark tensor as allocated - cur->data = (void *)(uintptr_t)1; - void * data; - if (use_mmap && mapping) { - data = (uint8_t *) mapping->addr + offs; + if (ggml_backend_buffer_is_host(cur->buffer)) { + file.seek(offs, SEEK_SET); + file.read_raw(cur->data, ggml_nbytes(cur)); } else { read_buf.resize(ggml_nbytes(cur)); file.seek(offs, SEEK_SET); file.read_raw(read_buf.data(), ggml_nbytes(cur)); - data = read_buf.data(); + ggml_backend_tensor_set(cur, read_buf.data(), 0, ggml_nbytes(cur)); } - -#if defined(GGML_USE_CUBLAS) && !defined(LLAMA_GGML_BACKEND_CUDA_TEST) - ggml_cuda_transform_tensor(data, cur); -#elif defined(GGML_USE_CLBLAST) - GGML_ASSERT(cur->backend == GGML_BACKEND_GPU); - ggml_cl_transform_tensor(data, cur); -#else - GGML_ASSERT(!"GPU tensor without a GPU backend"); - GGML_UNUSED(data); -#endif } size_done += ggml_nbytes(cur); } - // unmap offloaded tensors and metadata - if (use_mmap && mapping) { - mapping->unmap_fragment(0, mmap_first); - mapping->unmap_fragment(mmap_last, mapping->size); + // check if this is the last call and do final cleanup + if (size_done >= size_data) { + // unmap offloaded tensors and metadata + if (use_mmap && mapping) { + mapping->unmap_fragment(0, mmap_used_first); + if (mmap_used_last != 0) { + mapping->unmap_fragment(mmap_used_last, mapping->size); + } + } + if (progress_callback) { + // Even though the model is done loading, we still honor + // cancellation since we need to free allocations. + return progress_callback(1.0f, progress_callback_user_data); + } } - if (progress_callback) { - // Even though the model is done loading, we still honor - // cancellation since we need to free allocations. - return progress_callback(1.0f, progress_callback_user_data); - } return true; } }; @@ -2652,7 +2659,8 @@ static std::string llama_model_ftype_name(llama_ftype ftype) { case LLAMA_FTYPE_MOSTLY_Q8_0: return "Q8_0"; // K-quants - case LLAMA_FTYPE_MOSTLY_Q2_K: return "Q2_K"; + case LLAMA_FTYPE_MOSTLY_Q2_K: return "Q2_K - Medium"; + case LLAMA_FTYPE_MOSTLY_Q2_K_S: return "Q2_K - Small"; case LLAMA_FTYPE_MOSTLY_Q3_K_S: return "Q3_K - Small"; case LLAMA_FTYPE_MOSTLY_Q3_K_M: return "Q3_K - Medium"; case LLAMA_FTYPE_MOSTLY_Q3_K_L: return "Q3_K - Large"; @@ -2662,6 +2670,7 @@ static std::string llama_model_ftype_name(llama_ftype ftype) { case LLAMA_FTYPE_MOSTLY_Q5_K_M: return "Q5_K - Medium"; case LLAMA_FTYPE_MOSTLY_Q6_K: return "Q6_K"; case LLAMA_FTYPE_MOSTLY_IQ2_XXS:return "IQ2_XSS - 2.0625 bpw"; + case LLAMA_FTYPE_MOSTLY_IQ2_XS: return "IQ2_XS - 2.3125 bpw"; default: return "unknown, may not work"; } @@ -3245,6 +3254,7 @@ static bool llm_load_tensors( llama_model_loader & ml, llama_model & model, int n_gpu_layers, + enum llama_split_mode split_mode, int main_gpu, const float * tensor_split, bool use_mlock, @@ -3252,702 +3262,563 @@ static bool llm_load_tensors( void * progress_callback_user_data) { model.t_start_us = ggml_time_us(); - auto & ctx = model.ctx; auto & hparams = model.hparams; + model.split_mode = split_mode; + model.main_gpu = main_gpu; model.n_gpu_layers = n_gpu_layers; - size_t ctx_size = ggml_tensor_overhead() * ml.n_tensors; + const int64_t n_layer = hparams.n_layer; + const int64_t i_gpu_start = std::max((int64_t) hparams.n_layer - n_gpu_layers, (int64_t) 0); - LLAMA_LOG_INFO("%s: ggml ctx size = %7.2f MiB\n", __func__, ctx_size/1024.0/1024.0); + // there is very little benefit to offloading the input layer, so always keep it on the CPU + model.buft_input = llama_default_buffer_type_cpu(true); - // create the ggml context + model.buft_layer.resize(n_layer); + + // assign cpu layers + for (int64_t i = 0; i < i_gpu_start; ++i) { + model.buft_layer[i] = llama_default_buffer_type_cpu(true); + } + +#ifdef GGML_USE_CUBLAS + if (split_mode == LLAMA_SPLIT_LAYER) { + // calculate the split points + int device_count = ggml_backend_cuda_get_device_count(); + bool all_zero = tensor_split == nullptr || std::all_of(tensor_split, tensor_split + device_count, [](float x) { return x == 0.0f; }); + float splits[GGML_CUDA_MAX_DEVICES]; + if (all_zero) { + // default split, by free memory + for (int i = 0; i < device_count; ++i) { + size_t total; + size_t free; + ggml_backend_cuda_get_device_memory(i, &total, &free); + splits[i] = free; + } + } else { + std::copy(tensor_split, tensor_split + device_count, splits); + } + + // sum and normalize the splits to get the split points + float split_sum = 0.0f; + for (int i = 0; i < device_count; ++i) { + split_sum += splits[i]; + splits[i] = split_sum; + } + for (int i = 0; i < device_count; ++i) { + splits[i] /= split_sum; + } + + // assign the repeating layers to the devices according to the splits + int act_gpu_layers = std::min(n_gpu_layers, (int)n_layer + 1); + for (int64_t i = i_gpu_start; i < n_layer; ++i) { + int layer_gpu = std::upper_bound(splits, splits + device_count, float(i - i_gpu_start)/act_gpu_layers) - splits; + model.buft_layer[i] = llama_default_buffer_type_offload(layer_gpu); + } + // assign the output layer + if (n_gpu_layers > n_layer) { + int layer_gpu = std::upper_bound(splits, splits + device_count, float(act_gpu_layers - 1)/act_gpu_layers) - splits; + model.buft_output = llama_default_buffer_type_offload(layer_gpu); + } else { + model.buft_output = llama_default_buffer_type_cpu(true); + } + } else +#endif { + ggml_backend_buffer_type_t split_buft; + if (split_mode == LLAMA_SPLIT_ROW) { + split_buft = llama_default_buffer_type_split(main_gpu, tensor_split); + } else { + // LLAMA_SPLIT_NONE or LLAMA_SPLIT_LAYER in backends where it is not supported + split_buft = llama_default_buffer_type_offload(main_gpu); + } + // assign the repeating layers + for (int64_t i = i_gpu_start; i < n_layer; ++i) { + model.buft_layer[i] = { + split_buft, + llama_default_buffer_type_offload(main_gpu) + }; + } + // assign the output layer + if (n_gpu_layers > n_layer) { + model.buft_output = { + split_buft, + llama_default_buffer_type_offload(main_gpu) + }; + } else { + model.buft_output = llama_default_buffer_type_cpu(true); + } + } + + // count used buffer types + std::map buft_layer_count; + buft_layer_count[model.buft_input.buft]++; + buft_layer_count[model.buft_input.buft_matrix]++; + buft_layer_count[model.buft_output.buft]++; + buft_layer_count[model.buft_output.buft_matrix]++; + for (int64_t i = 0; i < n_layer; ++i) { + buft_layer_count[model.buft_layer[i].buft]++; + buft_layer_count[model.buft_layer[i].buft_matrix]++; + } + + // create one context per buffer type + size_t ctx_size = ggml_tensor_overhead()*ml.n_tensors; + std::map ctx_map; + for (auto & it : buft_layer_count) { struct ggml_init_params params = { /*.mem_size =*/ ctx_size, /*.mem_buffer =*/ NULL, /*.no_alloc =*/ true, }; - - model.ctx = ggml_init(params); - if (!model.ctx) { - throw std::runtime_error(format("ggml_init() failed")); + ggml_context * ctx = ggml_init(params); + if (!ctx) { + throw std::runtime_error(format("failed to create context")); } + ctx_map[it.first] = ctx; + model.ctxs.push_back(ctx); } - (void) main_gpu; - - enum ggml_backend_type llama_backend_offload = GGML_BACKEND_CPU; - enum ggml_backend_type llama_backend_offload_split = GGML_BACKEND_CPU; - -#if defined(GGML_USE_CUBLAS) && !defined(LLAMA_GGML_BACKEND_CUDA_TEST) - if (ggml_cublas_loaded()) { - LLAMA_LOG_INFO("%s: using " GGML_CUDA_NAME " for GPU acceleration\n", __func__); - ggml_cuda_set_main_device(main_gpu); - - llama_backend_offload = GGML_BACKEND_GPU; - llama_backend_offload_split = GGML_BACKEND_GPU_SPLIT; - } -#elif defined(GGML_USE_CLBLAST) - LLAMA_LOG_INFO("%s: using OpenCL for GPU acceleration\n", __func__); - llama_backend_offload = GGML_BACKEND_GPU; - llama_backend_offload_split = GGML_BACKEND_GPU; -#endif + LLAMA_LOG_INFO("%s: ggml ctx size = %7.2f MiB\n", __func__, model.ctxs.size()*ctx_size/1024.0/1024.0); // create tensors for the weights { const int64_t n_embd = hparams.n_embd; const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa(); const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa(); - const int64_t n_layer = hparams.n_layer; + const int64_t n_embd_gqa = n_embd_v_gqa; const int64_t n_vocab = hparams.n_vocab; + const int64_t n_ff = hparams.n_ff; + + GGML_ASSERT(n_embd_gqa == n_embd_k_gqa); + + ggml_context * ctx_input = ctx_map.at(model.buft_input.buft); + ggml_context * ctx_output = ctx_map.at(model.buft_output.buft); + ggml_context * ctx_output_split = ctx_map.at(model.buft_output.buft_matrix); + auto ctx_for_layer = [&](int i) { return ctx_map.at(model.buft_layer[i].buft); }; + auto ctx_for_layer_split = [&](int i) { return ctx_map.at(model.buft_layer[i].buft_matrix); }; + + model.layers.resize(n_layer); const auto tn = LLM_TN(model.arch); switch (model.arch) { case LLM_ARCH_LLAMA: case LLM_ARCH_REFACT: { - model.tok_embd = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU); + model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}); // output { - ggml_backend_type backend_norm; - ggml_backend_type backend_output; - - if (n_gpu_layers > int(n_layer)) { - backend_norm = llama_backend_offload; - backend_output = llama_backend_offload_split; - } else { - backend_norm = GGML_BACKEND_CPU; - backend_output = GGML_BACKEND_CPU; - } - - model.output_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, backend_norm); - model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output); + model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}); + model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}); } - const uint32_t n_ff = hparams.n_ff; - const int64_t n_embd_gqa = n_embd_v_gqa; - GGML_ASSERT(n_embd_gqa == n_embd / hparams.n_gqa()); - GGML_ASSERT(n_embd_gqa == n_embd_k_gqa); - - const int i_gpu_start = n_layer - n_gpu_layers; - - model.layers.resize(n_layer); - - for (uint32_t i = 0; i < n_layer; ++i) { - const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload; // NOLINT - const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload_split; // NOLINT + for (int i = 0; i < n_layer; ++i) { + ggml_context * ctx_layer = ctx_for_layer(i); + ggml_context * ctx_split = ctx_for_layer_split(i); auto & layer = model.layers[i]; - layer.attn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, backend); + layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}); - layer.wq = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, backend_split); - layer.wk = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, backend_split); - layer.wv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, backend_split); - layer.wo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, backend_split); + layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}); + layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}); + layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}); + layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}); // optional bias tensors - layer.bq = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, backend, false); - layer.bk = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, backend, false); - layer.bv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, backend, false); - layer.bo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, backend, false); + layer.bq = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, false); + layer.bk = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, false); + layer.bv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, false); + layer.bo = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, false); - layer.ffn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, backend); + layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}); - layer.ffn_gate_inp = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd}, backend, false); + layer.ffn_gate_inp = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd}, false); if (layer.ffn_gate_inp == nullptr) { GGML_ASSERT(hparams.n_expert == 0); GGML_ASSERT(hparams.n_expert_used == 0); - layer.ffn_gate = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, backend_split); - layer.ffn_down = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, backend_split); - layer.ffn_up = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split); + layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}); + layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}); + layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}); } else { GGML_ASSERT(hparams.n_expert > 0); GGML_ASSERT(hparams.n_expert_used > 0); // MoE branch for (uint32_t x = 0; x < hparams.n_expert; ++x) { - layer.ffn_gate_exp[x] = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_GATE_EXP, "weight", i, x), {n_embd, n_ff}, backend_split); - layer.ffn_down_exp[x] = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN_EXP, "weight", i, x), { n_ff, n_embd}, backend_split); - layer.ffn_up_exp[x] = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP_EXP, "weight", i, x), {n_embd, n_ff}, backend_split); + layer.ffn_gate_exp[x] = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE_EXP, "weight", i, x), {n_embd, n_ff}); + layer.ffn_down_exp[x] = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN_EXP, "weight", i, x), { n_ff, n_embd}); + layer.ffn_up_exp[x] = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP_EXP, "weight", i, x), {n_embd, n_ff}); } } } } break; case LLM_ARCH_BAICHUAN: { - model.tok_embd = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU); + model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}); { - ggml_backend_type backend_norm; - ggml_backend_type backend_output; - - if (n_gpu_layers > int(n_layer)) { - backend_norm = llama_backend_offload; - backend_output = llama_backend_offload_split; - } else { - backend_norm = GGML_BACKEND_CPU; - backend_output = GGML_BACKEND_CPU; - } - - model.output_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, backend_norm); - model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output); + model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}); + model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}); } - const uint32_t n_ff = hparams.n_ff; - const int64_t n_embd_gqa = n_embd_v_gqa; - GGML_ASSERT(n_embd_gqa == n_embd / hparams.n_gqa()); - GGML_ASSERT(n_embd_gqa == n_embd_k_gqa); - - const int i_gpu_start = n_layer - n_gpu_layers; - - model.layers.resize(n_layer); - - for (uint32_t i = 0; i < n_layer; ++i) { - const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload; // NOLINT - const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload_split; // NOLINT + for (int i = 0; i < n_layer; ++i) { + ggml_context * ctx_layer = ctx_for_layer(i); + ggml_context * ctx_split = ctx_for_layer_split(i); auto & layer = model.layers[i]; - layer.attn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, backend); + layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}); - layer.wq = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, backend_split); - layer.wk = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, backend_split); - layer.wv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, backend_split); - layer.wo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, backend_split); + layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}); + layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}); + layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}); + layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}); - layer.ffn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, backend); + layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}); - layer.ffn_gate = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, backend_split); - layer.ffn_down = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, backend_split); - layer.ffn_up = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split); + layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}); + layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}); + layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}); } } break; case LLM_ARCH_FALCON: { - model.tok_embd = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU); + model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}); // output { - ggml_backend_type backend_norm; - ggml_backend_type backend_output; - - if (n_gpu_layers > int(n_layer)) { - backend_norm = llama_backend_offload; - backend_output = llama_backend_offload_split; - } else { - backend_norm = GGML_BACKEND_CPU; - backend_output = GGML_BACKEND_CPU; - } - - model.output_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, backend_norm); - model.output_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, backend_norm); - model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output); + model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}); + model.output_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}); + model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}); } - const uint32_t n_ff = hparams.n_ff; - const int64_t n_embd_gqa = n_embd_v_gqa; - GGML_ASSERT(n_embd_gqa == n_embd / hparams.n_gqa()); - GGML_ASSERT(n_embd_gqa == n_embd_k_gqa); - - const int i_gpu_start = n_layer - n_gpu_layers; - - model.layers.resize(n_layer); - - for (uint32_t i = 0; i < n_layer; ++i) { - const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload; // NOLINT - const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload_split; // NOLINT + for (int i = 0; i < n_layer; ++i) { + ggml_context * ctx_layer = ctx_for_layer(i); + ggml_context * ctx_split = ctx_for_layer_split(i); auto & layer = model.layers[i]; - layer.attn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, backend); - layer.attn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, backend); + layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}); + layer.attn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}); if (gguf_find_tensor(ml.ctx_gguf, tn(LLM_TENSOR_ATTN_NORM_2, "weight", i).c_str()) >= 0) { - layer.attn_norm_2 = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM_2, "weight", i), {n_embd}, backend); - layer.attn_norm_2_b = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM_2, "bias", i), {n_embd}, backend); + layer.attn_norm_2 = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM_2, "weight", i), {n_embd}); + layer.attn_norm_2_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM_2, "bias", i), {n_embd}); } - layer.wqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, backend_split); - layer.wo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, backend_split); + layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}); + layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}); - layer.ffn_down = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, backend_split); - layer.ffn_up = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split); + layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}); + layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}); } } break; case LLM_ARCH_STARCODER: { - model.tok_embd = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU); - model.pos_embd = ml.create_tensor(ctx, tn(LLM_TENSOR_POS_EMBD, "weight"), {n_embd, hparams.n_ctx_train}, GGML_BACKEND_CPU); + model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}); + model.pos_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_POS_EMBD, "weight"), {n_embd, hparams.n_ctx_train}); // output { - ggml_backend_type backend_norm; - ggml_backend_type backend_output; - - if (n_gpu_layers > int(n_layer)) { - backend_norm = llama_backend_offload; - backend_output = llama_backend_offload_split; - } else { - backend_norm = GGML_BACKEND_CPU; - backend_output = GGML_BACKEND_CPU; - } - - model.output_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, backend_norm); - model.output_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, backend_norm); - model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output); + model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}); + model.output_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}); + model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}); } - const uint32_t n_ff = hparams.n_ff; - const int64_t n_embd_gqa = n_embd_v_gqa; - GGML_ASSERT(n_embd_gqa == n_embd / hparams.n_gqa()); - GGML_ASSERT(n_embd_gqa == n_embd_k_gqa); - - const int i_gpu_start = n_layer - n_gpu_layers; - - model.layers.resize(n_layer); - - for (uint32_t i = 0; i < n_layer; ++i) { - const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload; // NOLINT - const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload_split; // NOLINT + for (int i = 0; i < n_layer; ++i) { + ggml_context * ctx_layer = ctx_for_layer(i); + ggml_context * ctx_split = ctx_for_layer_split(i); auto & layer = model.layers[i]; - layer.attn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, backend); - layer.attn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, backend); + layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}); + layer.attn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}); - layer.wqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, backend_split); - layer.bqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, backend); + layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}); + layer.bqkv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}); - layer.wo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, backend_split); - layer.bo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, backend); + layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}); + layer.bo = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}); - layer.ffn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, backend); - layer.ffn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, backend); + layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}); + layer.ffn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}); - layer.ffn_down = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, backend_split); - layer.ffn_down_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, backend); + layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}); + layer.ffn_down_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}); - layer.ffn_up = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split); - layer.ffn_up_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, backend); + layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}); + layer.ffn_up_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}); } } break; case LLM_ARCH_PERSIMMON: { - model.tok_embd = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU); + model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}); { - ggml_backend_type backend_norm; - ggml_backend_type backend_output; - - if (n_gpu_layers > int(n_layer)) { - backend_norm = llama_backend_offload; - backend_output = llama_backend_offload_split; - } else { - backend_norm = GGML_BACKEND_CPU; - backend_output = GGML_BACKEND_CPU; - } - - model.output_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, backend_norm); - model.output_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, backend_norm); - model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output); + model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}); + model.output_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}); + model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}); } - const uint32_t n_ff = hparams.n_ff; - const int64_t n_embd_gqa = n_embd_v_gqa; - GGML_ASSERT(n_embd_gqa == n_embd / hparams.n_gqa()); - GGML_ASSERT(n_embd_gqa == n_embd_k_gqa); + for (int i = 0; i < n_layer; ++i) { + ggml_context * ctx_layer = ctx_for_layer(i); + ggml_context * ctx_split = ctx_for_layer_split(i); - const int i_gpu_start = n_layer - n_gpu_layers; - model.layers.resize(n_layer); - for (uint32_t i = 0; i < n_layer; ++i) { - const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload; - const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload_split; auto & layer = model.layers[i]; - layer.attn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, backend); - layer.attn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, backend); - layer.wqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, backend_split); - layer.bqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, backend); - layer.wo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, backend_split); - layer.bo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, backend); - layer.ffn_down = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, backend_split); - layer.ffn_down_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, backend); - layer.ffn_up = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split); - layer.ffn_up_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, backend); - layer.ffn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, backend); - layer.ffn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, backend); - layer.attn_q_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {64}, backend); - layer.attn_q_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_Q_NORM, "bias", i), {64}, backend); - layer.attn_k_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {64}, backend); - layer.attn_k_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_K_NORM, "bias", i), {64}, backend); + + layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}); + layer.attn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}); + + layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}); + layer.bqkv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}); + + layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}); + layer.bo = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}); + + layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}); + layer.ffn_down_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}); + + layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}); + layer.ffn_up_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}); + + layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}); + layer.ffn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}); + + layer.attn_q_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {64}); + layer.attn_q_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q_NORM, "bias", i), {64}); + + layer.attn_k_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {64}); + layer.attn_k_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K_NORM, "bias", i), {64}); } } break; case LLM_ARCH_BLOOM: { - model.tok_embd = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU); - model.tok_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {n_embd}, GGML_BACKEND_CPU); - model.tok_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD_NORM, "bias"), {n_embd}, GGML_BACKEND_CPU); + model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}); + model.tok_norm = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {n_embd}); + model.tok_norm_b = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD_NORM, "bias"), {n_embd}); // output { - ggml_backend_type backend_norm; - ggml_backend_type backend_output; - - if (n_gpu_layers > int(n_layer)) { - backend_norm = llama_backend_offload; - backend_output = llama_backend_offload_split; - } else { - backend_norm = GGML_BACKEND_CPU; - backend_output = GGML_BACKEND_CPU; - } - - model.output_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, backend_norm); - model.output_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, backend_norm); - model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output); + model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}); + model.output_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}); + model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}); } - const uint32_t n_ff = hparams.n_ff; - const int64_t n_embd_gqa = n_embd_v_gqa; - GGML_ASSERT(n_embd_gqa == n_embd / hparams.n_gqa()); - GGML_ASSERT(n_embd_gqa == n_embd_k_gqa); - - const int i_gpu_start = n_layer - n_gpu_layers; - - model.layers.resize(n_layer); - - for (uint32_t i = 0; i < n_layer; ++i) { - const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload; // NOLINT - const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload_split; // NOLINT + for (int i = 0; i < n_layer; ++i) { + ggml_context * ctx_layer = ctx_for_layer(i); + ggml_context * ctx_split = ctx_for_layer_split(i); auto & layer = model.layers[i]; - layer.attn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, backend); - layer.attn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, backend); + layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}); + layer.attn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}); - layer.wqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, backend_split); - layer.bqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, backend); + layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}); + layer.bqkv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}); - layer.wo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, backend_split); - layer.bo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, backend); + layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}); + layer.bo = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}); - layer.ffn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, backend); - layer.ffn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, backend); + layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}); + layer.ffn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}); - layer.ffn_down = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, backend_split); - layer.ffn_down_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, backend); + layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}); + layer.ffn_down_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}); - layer.ffn_up = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split); - layer.ffn_up_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, backend); + layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}); + layer.ffn_up_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}); } } break; case LLM_ARCH_MPT: { - model.tok_embd = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU); + model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}); + // output { - ggml_backend_type backend_norm; - ggml_backend_type backend_output; - - if (n_gpu_layers > int(n_layer)) { - backend_norm = llama_backend_offload; - backend_output = llama_backend_offload_split; - } else { - backend_norm = GGML_BACKEND_CPU; - backend_output = GGML_BACKEND_CPU; - } - - model.output_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, backend_norm); - model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output); + model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}); + model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}); } - const uint32_t n_ff = hparams.n_ff; - const int64_t n_embd_gqa = n_embd_v_gqa; - GGML_ASSERT(n_embd_gqa == n_embd / hparams.n_gqa()); - GGML_ASSERT(n_embd_gqa == n_embd_k_gqa); - - const int i_gpu_start = n_layer - n_gpu_layers; - - model.layers.resize(n_layer); - - for (uint32_t i = 0; i < n_layer; ++i) { - const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload; // NOLINT - const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload_split; // NOLINT + for (int i = 0; i < n_layer; ++i) { + ggml_context * ctx_layer = ctx_for_layer(i); + ggml_context * ctx_split = ctx_for_layer_split(i); auto & layer = model.layers[i]; - layer.attn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, backend); - layer.wqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, backend_split); - layer.wo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, backend_split); + layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}); - layer.ffn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, backend); + layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}); + layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}); - layer.ffn_down = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, backend_split); - layer.ffn_up = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split); + layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}); + layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}); + layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}); // AWQ ScaleActivation layer - layer.ffn_act = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_ACT, "scales", i), {n_ff}, backend, false); + layer.ffn_act = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_ACT, "scales", i), {n_ff}, false); } } break; case LLM_ARCH_STABLELM: { - model.tok_embd = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU); + model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}); // output { - ggml_backend_type backend_norm; - ggml_backend_type backend_output; - - if (n_gpu_layers > int(n_layer)) { - backend_norm = llama_backend_offload; - backend_output = llama_backend_offload_split; - } else { - backend_norm = GGML_BACKEND_CPU; - backend_output = GGML_BACKEND_CPU; - } - - model.output_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, backend_norm); - model.output_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, backend_norm); - model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output); + model.output_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}); + model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}); + model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}); } - const uint32_t n_ff = hparams.n_ff; - const int64_t n_embd_gqa = n_embd_v_gqa; - GGML_ASSERT(n_embd_gqa == n_embd / hparams.n_gqa()); - GGML_ASSERT(n_embd_gqa == n_embd_k_gqa); - - const int i_gpu_start = n_layer - n_gpu_layers; - - model.layers.resize(n_layer); - - for (uint32_t i = 0; i < n_layer; ++i) { - /* - llama_model_loader: - tensor 4: blk.0.attn_output.weight f16 [ 2560, 2560, 1, 1 ] - */ - const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload; // NOLINT - const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload_split; // NOLINT + for (int i = 0; i < n_layer; ++i) { + ggml_context * ctx_layer = ctx_for_layer(i); + ggml_context * ctx_split = ctx_for_layer_split(i); auto & layer = model.layers[i]; - layer.attn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, backend); - layer.attn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, backend); + layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}); + layer.attn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}); - layer.wq = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, backend_split); - layer.wk = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, backend_split); - layer.wv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, backend_split); - layer.wo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, backend_split); + layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}); + layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}); + layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}); + layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}); - layer.ffn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, backend); - layer.ffn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, backend); + layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}); + layer.ffn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}); - layer.ffn_gate = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, backend_split); - layer.ffn_down = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, backend_split); - layer.ffn_up = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split); + layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}); + layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}); + layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}); } } break; case LLM_ARCH_QWEN: { - model.tok_embd = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU); + model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}); + + // output { - ggml_backend_type backend_norm; - ggml_backend_type backend_output; + model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}); + model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}); + } - if (n_gpu_layers > int(n_layer)) { - backend_norm = llama_backend_offload; - backend_output = llama_backend_offload_split; - } else { - backend_norm = GGML_BACKEND_CPU; - backend_output = GGML_BACKEND_CPU; - } - - model.output_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, backend_norm); - model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output); - } - - const uint32_t n_ff = hparams.n_ff / 2; - - const int i_gpu_start = n_layer - n_gpu_layers; - - model.layers.resize(n_layer); - - for (uint32_t i = 0; i < n_layer; ++i) { - const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload; // NOLINT - const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload_split; // NOLINT + for (int i = 0; i < n_layer; ++i) { + ggml_context * ctx_layer = ctx_for_layer(i); + ggml_context * ctx_split = ctx_for_layer_split(i); auto & layer = model.layers[i]; - layer.attn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, backend); + layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}); - layer.wqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd * 3}, backend_split); - layer.bqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd * 3}, backend); - layer.wo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, backend_split); + layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd*3}); + layer.bqkv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd*3}); + layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}); - layer.ffn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, backend); + layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}); - layer.ffn_gate = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, backend_split); - layer.ffn_down = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, backend_split); - layer.ffn_up = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split); + layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff/2}); + layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff/2, n_embd}); + layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff/2}); } } break; case LLM_ARCH_PHI2: { - model.tok_embd = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU); + model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}); // output { - ggml_backend_type backend_norm; - ggml_backend_type backend_output; - - if (n_gpu_layers > int(n_layer)) { - backend_norm = llama_backend_offload; - backend_output = llama_backend_offload; - } else { - backend_norm = GGML_BACKEND_CPU; - backend_output = GGML_BACKEND_CPU; - } - - model.output_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, backend_norm); - model.output_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, backend_norm); - model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output); - model.output_b = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "bias"), {n_vocab}, backend_output); + model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}); + model.output_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}); + model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}); + model.output_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT, "bias"), {n_vocab}); } - const uint32_t n_ff = hparams.n_ff; - const int64_t n_embd_gqa = n_embd_v_gqa; - GGML_ASSERT(n_embd_gqa == n_embd / hparams.n_gqa()); - GGML_ASSERT(n_embd_gqa == n_embd_k_gqa); - - const int i_gpu_start = n_layer - n_gpu_layers; - - model.layers.resize(n_layer); - - for (uint32_t i = 0; i < n_layer; ++i) { - const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload; // NOLINT - const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload_split; // NOLINT + for (int i = 0; i < n_layer; ++i) { + ggml_context * ctx_layer = ctx_for_layer(i); + ggml_context * ctx_split = ctx_for_layer_split(i); auto & layer = model.layers[i]; - layer.attn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, backend); - layer.attn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, backend); + layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}); + layer.attn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}); - layer.wqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, backend_split); - layer.bqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, backend); + layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}); + layer.bqkv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}); - layer.wo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, backend_split); - layer.bo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, backend); + layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}); + layer.bo = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}); - layer.ffn_down = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, backend_split); - layer.ffn_down_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, backend); + layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}); + layer.ffn_down_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}); - layer.ffn_up = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split); - layer.ffn_up_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, backend); + layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}); + layer.ffn_up_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}); } } break; case LLM_ARCH_PLAMO: { - model.tok_embd = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU); + model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}); // output { - ggml_backend_type backend_norm; - ggml_backend_type backend_output; - - if (n_gpu_layers > int(n_layer)) { - backend_norm = llama_backend_offload; - backend_output = llama_backend_offload_split; - } else { - backend_norm = GGML_BACKEND_CPU; - backend_output = GGML_BACKEND_CPU; - } - - model.output_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, backend_norm); - model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output); + model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}); + model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}); } - const uint32_t n_ff = hparams.n_ff; - const int64_t n_embd_gqa = n_embd_v_gqa; - GGML_ASSERT(n_embd_gqa == n_embd / hparams.n_gqa()); - GGML_ASSERT(n_embd_gqa == n_embd_k_gqa); - - const int i_gpu_start = n_layer - n_gpu_layers; - - model.layers.resize(n_layer); - - for (uint32_t i = 0; i < n_layer; ++i) { - const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload; // NOLINT - const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload_split; // NOLINT + for (int i = 0; i < n_layer; ++i) { + ggml_context * ctx_layer = ctx_for_layer(i); + ggml_context * ctx_split = ctx_for_layer_split(i); auto & layer = model.layers[i]; - layer.attn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, backend); + layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}); - layer.wq = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, backend_split); - layer.wk = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, backend_split); - layer.wv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, backend_split); - layer.wo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, backend_split); + layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}); + layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}); + layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}); + layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}); - layer.ffn_gate = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, backend_split); - layer.ffn_down = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, backend_split); - layer.ffn_up = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split); + layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}); + layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}); + layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}); } } break; case LLM_ARCH_GPT2: { - model.tok_embd = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU); - model.pos_embd = ml.create_tensor(ctx, tn(LLM_TENSOR_POS_EMBD, "weight"), {n_embd, hparams.n_ctx_train}, GGML_BACKEND_CPU); + model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}); + model.pos_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_POS_EMBD, "weight"), {n_embd, hparams.n_ctx_train}); // output { - ggml_backend_type backend_norm; - ggml_backend_type backend_output; - - if (n_gpu_layers > int(n_layer)) { - backend_norm = llama_backend_offload; - backend_output = llama_backend_offload_split; - } else { - backend_norm = GGML_BACKEND_CPU; - backend_output = GGML_BACKEND_CPU; - } - - model.output_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, backend_norm); - model.output_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, backend_norm); - model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output); + model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}); + model.output_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}); + model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}); } - const uint32_t n_ff = hparams.n_ff; - const int64_t n_embd_gqa = n_embd_v_gqa; - GGML_ASSERT(n_embd_gqa == n_embd / hparams.n_gqa()); - GGML_ASSERT(n_embd_gqa == n_embd_k_gqa); - - const int i_gpu_start = n_layer - n_gpu_layers; - - model.layers.resize(n_layer); - - for (uint32_t i = 0; i < n_layer; ++i) { - const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload; // NOLINT - const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload_split; // NOLINT + for (int i = 0; i < n_layer; ++i) { + ggml_context * ctx_layer = ctx_for_layer(i); + ggml_context * ctx_split = ctx_for_layer_split(i); auto & layer = model.layers[i]; - layer.attn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, backend); - layer.attn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, backend); + layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}); + layer.attn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}); - layer.wqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, backend_split); - layer.bqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, backend); + layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}); + layer.bqkv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}); - layer.wo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, backend_split); - layer.bo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, backend); + layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}); + layer.bo = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}); - layer.ffn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, backend); - layer.ffn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, backend); + layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}); + layer.ffn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}); - layer.ffn_down = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, backend_split); - layer.ffn_down_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, backend); + layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}); + layer.ffn_down_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}); - layer.ffn_up = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split); - layer.ffn_up_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, backend); + layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}); + layer.ffn_up_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}); } } break; default: @@ -3957,78 +3828,51 @@ static bool llm_load_tensors( ml.done_getting_tensors(); - ml.init_mapping(); + ml.init_mapping(true, use_mlock ? &model.mlock_mmap : nullptr); - // allocate tensors - size_t vram_weights = 0; - size_t buf_size = 0; + // create the backend buffers + std::vector> ctx_bufs; - ggml_backend_buffer_type_t buft = llama_default_buffer_type(n_gpu_layers); + for (auto & it : ctx_map) { + ggml_backend_buffer_type_t buft = it.first; + ggml_context * ctx = it.second; + ggml_backend_buffer_t buf = nullptr; - for (struct ggml_tensor * t = ggml_get_first_tensor(ctx); t != nullptr; t = ggml_get_next_tensor(ctx, t)) { - // GGML_BACKEND_GPU tensors are for CUDA and OpenCL only, which are handled separately without ggml-backend - if (t->backend == GGML_BACKEND_CPU) { - buf_size += GGML_PAD(ggml_backend_buft_get_alloc_size(buft, t), ggml_backend_buft_get_alignment(buft)); - } else { - vram_weights += ggml_nbytes(t); + // only the mmap region containing the tensors in the model is mapped to the backend buffer + // this is important for metal with apple silicon: if the entire model could be mapped to a metal buffer, then we could just use metal for all layers + // this allows using partial offloading when the model size exceeds the metal buffer size, but not the RAM size + if (ml.use_mmap && buft == llama_default_buffer_type_cpu(true)) { + size_t first, last; + ml.get_mapping_range(&first, &last, ctx); + buf = ggml_backend_cpu_buffer_from_ptr((char *) ml.mapping->addr + first, last - first); } - } - - // create backend buffer - ggml_backend_buffer_t buf_mmap = nullptr; - #ifdef GGML_USE_METAL - if (n_gpu_layers > 0) { - if (ml.use_mmap) { + else if (ml.use_mmap && buft == ggml_backend_metal_buffer_type()) { const size_t max_size = ggml_get_max_tensor_size(ctx); - model.buf = ggml_backend_metal_buffer_from_ptr(ml.mapping->addr, ml.mapping->size, max_size); - buf_mmap = model.buf; - } else { - model.buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, ggml_backend_metal_buffer_type()); + size_t first, last; + ml.get_mapping_range(&first, &last, ctx); + buf = ggml_backend_metal_buffer_from_ptr((char *) ml.mapping->addr + first, last - first, max_size); } - } -#elif defined(GGML_USE_CUBLAS) && defined(LLAMA_GGML_BACKEND_CUDA_TEST) - // for testing only - if (n_gpu_layers > 0) { - model.buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, ggml_backend_cuda_buffer_type(0)); - } #endif - - if (model.buf == nullptr) { - // CPU backend, and indirectly CUDA and OpenCL - if (ml.use_mmap) { - model.buf = ggml_backend_cpu_buffer_from_ptr(ml.mapping->addr, ml.mapping->size); - buf_mmap = model.buf; - } else { - // allocate only CPU tensors - model.buf = ggml_backend_buft_alloc_buffer(buft, buf_size); - ggml_tallocr_t alloc = ggml_tallocr_new_from_buffer(model.buf); - for (struct ggml_tensor * t = ggml_get_first_tensor(ctx); t != nullptr; t = ggml_get_next_tensor(ctx, t)) { - if (t->backend == GGML_BACKEND_CPU) { - ggml_tallocr_alloc(alloc, t); - } + else { + buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, buft); + if (buf != nullptr && use_mlock && ggml_backend_buffer_is_host(buf)) { + model.mlock_buf.init (ggml_backend_buffer_get_base(buf)); + model.mlock_buf.grow_to(ggml_backend_buffer_get_size(buf)); } - ggml_tallocr_free(alloc); } - } - - if (use_mlock && ggml_backend_buffer_is_host(model.buf)) { - model.mlock_buf.init (ggml_backend_buffer_get_base(model.buf)); - model.mlock_buf.grow_to(ggml_backend_buffer_get_size(model.buf)); + if (buf == nullptr) { + throw std::runtime_error("failed to allocate buffer"); + } + // indicate that this buffer contains weights + // this is used by ggml_backend_sched to improve op scheduling -> ops that use a weight are preferably scheduled to the backend that contains the weight + ggml_backend_buffer_set_usage(buf, GGML_BACKEND_BUFFER_USAGE_WEIGHTS); + model.bufs.push_back(buf); + ctx_bufs.emplace_back(ctx, buf); } // print memory requirements { - size_t sys_mem_required = ctx_size + buf_size; - - if (sys_mem_required > 0) { - LLAMA_LOG_INFO("%s: system memory used = %7.2f MiB\n", __func__, sys_mem_required / 1024.0 / 1024.0); - } - if (vram_weights > 0) { - LLAMA_LOG_INFO("%s: VRAM used = %7.2f MiB\n", __func__, vram_weights / 1024.0 / 1024.0); - } - -#if (defined(GGML_USE_CUBLAS) && !defined(LLAMA_GGML_BACKEND_CUDA_TEST)) || defined(GGML_USE_CLBLAST) const int n_gpu = std::min(n_gpu_layers, int(hparams.n_layer)); LLAMA_LOG_INFO("%s: offloading %d repeating layers to GPU\n", __func__, n_gpu); @@ -4040,23 +3884,26 @@ static bool llm_load_tensors( const int max_offloadable_layers = hparams.n_layer + 1; LLAMA_LOG_INFO("%s: offloaded %d/%d layers to GPU\n", __func__, std::min(n_gpu_layers, max_offloadable_layers), max_backend_supported_layers); -#endif // defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST) - } -#if defined(GGML_USE_CUBLAS) && !defined(LLAMA_GGML_BACKEND_CUDA_TEST) - ggml_cuda_set_tensor_split(tensor_split); -#else - GGML_UNUSED(tensor_split); -#endif // GGML_USE_CUBLAS + for (ggml_backend_buffer_t buf : model.bufs) { + LLAMA_LOG_INFO("%s: %10s buffer size = %8.2f MiB\n", __func__, ggml_backend_buffer_name(buf), ggml_backend_buffer_get_size(buf) / 1024.0 / 1024.0); + } + } // populate tensors_by_name - for (int i = 0; i < ml.n_tensors; ++i) { - struct ggml_tensor * cur = ggml_get_tensor(ctx, ml.get_tensor_name(i)); - model.tensors_by_name.emplace_back(ggml_get_name(cur), cur); + for (ggml_context * ctx : model.ctxs) { + for (auto * cur = ggml_get_first_tensor(ctx); cur != NULL; cur = ggml_get_next_tensor(ctx, cur)) { + model.tensors_by_name.emplace_back(ggml_get_name(cur), cur); + } } - if (!ml.load_all_data(ctx, progress_callback, progress_callback_user_data, buf_mmap, use_mlock ? &model.mlock_mmap : NULL)) { - return false; + // load tensor data + for (auto & it : ctx_bufs) { + ggml_context * ctx = it.first; + ggml_backend_buffer_t buf = it.second; + if (!ml.load_all_data(ctx, progress_callback, progress_callback_user_data, buf, use_mlock ? &model.mlock_mmap : NULL)) { + return false; + } } model.mapping = std::move(ml.mapping); @@ -4105,13 +3952,13 @@ static int llama_model_load(const std::string & fname, llama_model & model, llam #endif if (!llm_load_tensors( - ml, model, params.n_gpu_layers, params.main_gpu, params.tensor_split, params.use_mlock, + ml, model, params.n_gpu_layers, params.split_mode, params.main_gpu, params.tensor_split, params.use_mlock, params.progress_callback, params.progress_callback_user_data )) { return -2; } } catch (const std::exception & err) { - LLAMA_LOG_ERROR("error loading model: %s\n", err.what()); + LLAMA_LOG_ERROR("%s: error loading model: %s\n", __func__, err.what()); return -1; } @@ -4183,7 +4030,6 @@ static void llm_build_k_shift( struct ggml_cgraph * graph, llm_rope_type type, int64_t n_ctx, - int n_rot, float freq_base, float freq_scale, const llm_build_cb & cb) { @@ -4191,14 +4037,13 @@ static void llm_build_k_shift( const int64_t n_head_kv = hparams.n_head_kv; const int64_t n_embd_head_k = hparams.n_embd_head_k; const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa(); + const int32_t n_rot = hparams.n_rot; const int32_t n_orig_ctx = cparams.n_yarn_orig_ctx; const float ext_factor = cparams.yarn_ext_factor; const float attn_factor = cparams.yarn_attn_factor; const float beta_fast = cparams.yarn_beta_fast; const float beta_slow = cparams.yarn_beta_slow; - GGML_ASSERT(n_embd_head_k % n_rot == 0); - struct ggml_tensor * K_shift = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, n_ctx); cb(K_shift, "K_shift", -1); @@ -4566,8 +4411,6 @@ struct llm_build_context { , ctx_kompute (lctx.ctx_kompute) #endif { - GGML_ASSERT(!!kv_self.ctx); - // all initializations should be done in init() } @@ -4611,7 +4454,7 @@ struct llm_build_context { // shift the entire K-cache if needed if (do_rope_shift) { - llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, LLM_ROPE, n_ctx, n_embd_head, freq_base, freq_scale, cb); + llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, LLM_ROPE, n_ctx, freq_base, freq_scale, cb); } for (int il = 0; il < n_layer; ++il) { @@ -4647,16 +4490,22 @@ struct llm_build_context { cb(Vcur, "Vcur", il); } + // these nodes are added to the graph together so that they are not reordered + // by doing so, the number of splits in the graph is reduced + ggml_build_forward_expand(gf, Qcur); + ggml_build_forward_expand(gf, Kcur); + ggml_build_forward_expand(gf, Vcur); + Qcur = ggml_rope_custom( ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, - n_embd_head, 0, 0, n_orig_ctx, freq_base, freq_scale, + hparams.n_rot, 0, 0, n_orig_ctx, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow ); cb(Qcur, "Qcur", il); Kcur = ggml_rope_custom( ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, - n_embd_head, 0, 0, n_orig_ctx, freq_base, freq_scale, + hparams.n_rot, 0, 0, n_orig_ctx, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow ); cb(Kcur, "Kcur", il); @@ -4779,6 +4628,7 @@ struct llm_build_context { const int64_t n_embd_head = hparams.n_embd_head_v; GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); + GGML_ASSERT(n_embd_head == hparams.n_rot); struct ggml_tensor * cur; struct ggml_tensor * inpL; @@ -4796,7 +4646,7 @@ struct llm_build_context { // shift the entire K-cache if needed if (do_rope_shift) { - llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, LLM_ROPE, n_ctx, n_embd_head, freq_base, freq_scale, cb); + llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, LLM_ROPE, n_ctx, freq_base, freq_scale, cb); } for (int il = 0; il < n_layer; ++il) { @@ -4822,12 +4672,12 @@ struct llm_build_context { case MODEL_7B: Qcur = ggml_rope_custom( ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, - n_embd_head, 0, 0, n_orig_ctx, freq_base, freq_scale, + hparams.n_rot, 0, 0, n_orig_ctx, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow ); Kcur = ggml_rope_custom( ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, - n_embd_head, 0, 0, n_orig_ctx, freq_base, freq_scale, + hparams.n_rot, 0, 0, n_orig_ctx, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow ); break; @@ -4900,6 +4750,7 @@ struct llm_build_context { const int64_t n_embd_head = hparams.n_embd_head_v; const int64_t n_embd_gqa = hparams.n_embd_v_gqa(); GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); + GGML_ASSERT(n_embd_head == hparams.n_rot); struct ggml_tensor * cur; struct ggml_tensor * inpL; @@ -4917,7 +4768,7 @@ struct llm_build_context { // shift the entire K-cache if needed if (do_rope_shift) { - llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, LLM_ROPE_NEOX, n_ctx, n_embd_head, freq_base, freq_scale, cb); + llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, LLM_ROPE_NEOX, n_ctx, freq_base, freq_scale, cb); } for (int il = 0; il < n_layer; ++il) { @@ -4958,13 +4809,13 @@ struct llm_build_context { // using mode = 2 for neox mode Qcur = ggml_rope_custom( - ctx0, Qcur, inp_pos, n_embd_head, 2, 0, n_orig_ctx, + ctx0, Qcur, inp_pos, hparams.n_rot, 2, 0, n_orig_ctx, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow ); cb(Qcur, "Qcur", il); Kcur = ggml_rope_custom( - ctx0, Kcur, inp_pos, n_embd_head, 2, 0, n_orig_ctx, + ctx0, Kcur, inp_pos, hparams.n_rot, 2, 0, n_orig_ctx, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow ); cb(Kcur, "Kcur", il); @@ -5121,15 +4972,14 @@ struct llm_build_context { struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false); const int64_t n_embd_head = hparams.n_embd_head_v; - GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); - - const int64_t n_rot = n_embd_head_k / 2; + GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); + GGML_ASSERT(n_embd_head/2 == hparams.n_rot); struct ggml_tensor * cur; struct ggml_tensor * inpL; inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, cb); - cb(inpL, "imp_embd", -1); + cb(inpL, "inp_embd", -1); // inp_pos - contains the positions struct ggml_tensor * inp_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens); @@ -5140,7 +4990,7 @@ struct llm_build_context { cb(KQ_mask, "KQ_mask", -1); if (do_rope_shift) { - llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, LLM_ROPE_NEOX, n_ctx, n_embd_head, freq_base, freq_scale, cb); + llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, LLM_ROPE_NEOX, n_ctx, freq_base, freq_scale, cb); } for (int il = 0; il < n_layer; ++il) { @@ -5200,7 +5050,7 @@ struct llm_build_context { // RoPE the first n_rot of q/k, pass the other half, and concat. struct ggml_tensor * qrot = ggml_view_3d( - ctx0, tmpq, n_rot, n_head, n_tokens, + ctx0, tmpq, hparams.n_rot, n_head, n_tokens, ggml_element_size(tmpq) * n_embd_head, ggml_element_size(tmpq) * n_embd_head * n_head, 0 @@ -5208,7 +5058,7 @@ struct llm_build_context { cb(qrot, "qrot", il); struct ggml_tensor * krot = ggml_view_3d( - ctx0, tmpk, n_rot, n_head, n_tokens, + ctx0, tmpk, hparams.n_rot, n_head, n_tokens, ggml_element_size(tmpk) * n_embd_head, ggml_element_size(tmpk) * n_embd_head * n_head, 0 @@ -5217,29 +5067,29 @@ struct llm_build_context { // get the second half of tmpq, e.g tmpq[n_rot:, :, :] struct ggml_tensor * qpass = ggml_view_3d( - ctx0, tmpq, n_rot, n_head, n_tokens, + ctx0, tmpq, hparams.n_rot, n_head, n_tokens, ggml_element_size(tmpq) * n_embd_head, ggml_element_size(tmpq) * n_embd_head * n_head, - ggml_element_size(tmpq) * n_rot + ggml_element_size(tmpq) * hparams.n_rot ); cb(qpass, "qpass", il); struct ggml_tensor * kpass = ggml_view_3d( - ctx0, tmpk, n_rot, n_head, n_tokens, + ctx0, tmpk, hparams.n_rot, n_head, n_tokens, ggml_element_size(tmpk) * n_embd_head, ggml_element_size(tmpk) * n_embd_head * n_head, - ggml_element_size(tmpk) * n_rot + ggml_element_size(tmpk) * hparams.n_rot ); cb(kpass, "kpass", il); struct ggml_tensor * qrotated = ggml_rope_custom( - ctx0, qrot, inp_pos, n_rot, 2, 0, n_orig_ctx, + ctx0, qrot, inp_pos, hparams.n_rot, 2, 0, n_orig_ctx, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow ); cb(qrotated, "qrotated", il); struct ggml_tensor * krotated = ggml_rope_custom( - ctx0, krot, inp_pos, n_rot, 2, 0, n_orig_ctx, + ctx0, krot, inp_pos, hparams.n_rot, 2, 0, n_orig_ctx, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow ); cb(krotated, "krotated", il); @@ -5636,7 +5486,7 @@ struct llm_build_context { // shift the entire K-cache if needed if (do_rope_shift) { - llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, LLM_ROPE_NEOX, n_ctx, hparams.n_rot, freq_base, freq_scale, cb); + llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, LLM_ROPE_NEOX, n_ctx, freq_base, freq_scale, cb); } for (int il = 0; il < n_layer; ++il) { @@ -5749,7 +5599,7 @@ struct llm_build_context { // shift the entire K-cache if needed if (do_rope_shift) { - llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, LLM_ROPE_NEOX, n_ctx, n_embd_head, freq_base, freq_scale, cb); + llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, LLM_ROPE_NEOX, n_ctx, freq_base, freq_scale, cb); } for (int il = 0; il < n_layer; ++il) { @@ -5781,13 +5631,13 @@ struct llm_build_context { // using mode = 2 for neox mode Qcur = ggml_rope_custom( - ctx0, Qcur, inp_pos, n_embd_head, 2, 0, n_orig_ctx, + ctx0, Qcur, inp_pos, hparams.n_rot, 2, 0, n_orig_ctx, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow ); cb(Qcur, "Qcur", il); Kcur = ggml_rope_custom( - ctx0, Kcur, inp_pos, n_embd_head, 2, 0, n_orig_ctx, + ctx0, Kcur, inp_pos, hparams.n_rot, 2, 0, n_orig_ctx, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow ); cb(Kcur, "Kcur", il); @@ -5866,7 +5716,7 @@ struct llm_build_context { // shift the entire K-cache if needed if (do_rope_shift) { - llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, LLM_ROPE_NEOX, n_ctx, n_embd_head, freq_base, freq_scale, cb); + llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, LLM_ROPE_NEOX, n_ctx, freq_base, freq_scale, cb); } for (int il = 0; il < n_layer; ++il) { @@ -5962,6 +5812,7 @@ struct llm_build_context { const int64_t n_embd_head = hparams.n_embd_head_v; GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); + GGML_ASSERT(n_embd_head == hparams.n_rot); struct ggml_tensor * cur; struct ggml_tensor * inpL; @@ -5979,7 +5830,7 @@ struct llm_build_context { // shift the entire K-cache if needed if (do_rope_shift) { - llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, LLM_ROPE, n_ctx, n_embd_head, freq_base, freq_scale, cb); + llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, LLM_ROPE, n_ctx, freq_base, freq_scale, cb); } for (int il = 0; il < n_layer; ++il) { @@ -6005,13 +5856,13 @@ struct llm_build_context { cb(Vcur, "Vcur", il); Qcur = ggml_rope_custom( - ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, + ctx0, ggml_reshape_3d(ctx0, Qcur, hparams.n_rot, n_head, n_tokens), inp_pos, n_embd_head, 2, 0, n_orig_ctx, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow); cb(Qcur, "Qcur", il); Kcur = ggml_rope_custom( - ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, + ctx0, ggml_reshape_3d(ctx0, Kcur, hparams.n_rot, n_head_kv, n_tokens), inp_pos, n_embd_head, 2, 0, n_orig_ctx, freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow); cb(Kcur, "Kcur", il); @@ -6165,199 +6016,13 @@ struct llm_build_context { } }; -// -// tensor offloading helpers -// -// TODO: will be removed with backend v2 - -enum llm_offload_func_e { - OFFLOAD_FUNC_NOP, - OFFLOAD_FUNC, - OFFLOAD_FUNC_FRC, // force offload - OFFLOAD_FUNC_KQV, - OFFLOAD_FUNC_NR, - OFFLOAD_FUNC_EMB, // embeddings - OFFLOAD_FUNC_OUT, -}; - -// TODO: will be removed with backend v2 -struct llm_offload_trie { - struct node { - ~node() { - for (int i = 0; i < 256; ++i) { - if (children[i]) { - delete children[i]; - } - } - } - - node * children[256] = { nullptr }; - llm_offload_func_e func = OFFLOAD_FUNC_NOP; - }; - - llm_offload_trie() { - root = new node; - } - - llm_offload_trie(const std::unordered_map & map) { - root = new node; - - for (const auto & kv : map) { - add(kv.first, kv.second); - } - } - - ~llm_offload_trie() { - delete root; - } - - void add(const char * name, llm_offload_func_e func) { - node * cur = root; - - for (int i = 0; ; ++i) { - const uint8_t c = name[i]; - - if (!c) { - break; - } - - if (!cur->children[c]) { - cur->children[c] = new node; - } - - cur = cur->children[c]; - } - - cur->func = func; - } - - llm_offload_func_e find(const char * name) const { - const node * cur = root; - - for (int i = 0; ; ++i) { - const uint8_t c = name[i]; - - if (!c) { - break; - } - - if (!cur->children[c]) { - return OFFLOAD_FUNC_NOP; - } - - cur = cur->children[c]; - } - - return cur->func; - } - - node * root = nullptr; -}; - -// TODO: will be removed with backend v2 -static const std::unordered_map k_offload_map = { - //{ "inp_tokens", OFFLOAD_FUNC_NR }, // TODO: missing K-quants get_rows kernel - //{ "inp_embd", OFFLOAD_FUNC_NR }, // TODO: missing K-quants get_rows kernel - { "pos_embd", OFFLOAD_FUNC_NR }, - - { "inp_pos", OFFLOAD_FUNC_FRC }, // this is often used for KQ ops (e.g. rope) - { "KQ_mask", OFFLOAD_FUNC_FRC }, - { "K_shift", OFFLOAD_FUNC_FRC }, - - { "K_shifted", OFFLOAD_FUNC }, - - { "inp_norm", OFFLOAD_FUNC_NR }, - { "inp_norm_w", OFFLOAD_FUNC_NR }, - { "inp_norm_wb", OFFLOAD_FUNC_NR }, - - { "norm", OFFLOAD_FUNC }, - { "norm_w", OFFLOAD_FUNC }, - { "norm_wb", OFFLOAD_FUNC }, - - { "attn_norm", OFFLOAD_FUNC }, - { "attn_norm_2", OFFLOAD_FUNC }, - - { "wqkv", OFFLOAD_FUNC_KQV }, - { "bqkv", OFFLOAD_FUNC_KQV }, - { "wqkv_clamped", OFFLOAD_FUNC_KQV }, - - { "tmpk", OFFLOAD_FUNC_KQV }, - { "tmpq", OFFLOAD_FUNC_KQV }, - { "tmpv", OFFLOAD_FUNC_KQV }, - { "Kcur", OFFLOAD_FUNC_KQV }, - { "Qcur", OFFLOAD_FUNC_KQV }, - { "Vcur", OFFLOAD_FUNC_KQV }, - - { "krot", OFFLOAD_FUNC_KQV }, - { "qrot", OFFLOAD_FUNC_KQV }, - { "kpass", OFFLOAD_FUNC_KQV }, - { "qpass", OFFLOAD_FUNC_KQV }, - { "krotated", OFFLOAD_FUNC_KQV }, - { "qrotated", OFFLOAD_FUNC_KQV }, - - { "q", OFFLOAD_FUNC_KQV }, - { "k", OFFLOAD_FUNC_KQV }, - { "kq", OFFLOAD_FUNC_KQV }, - { "kq_scaled", OFFLOAD_FUNC_KQV }, - { "kq_scaled_alibi", OFFLOAD_FUNC_KQV }, - { "kq_masked", OFFLOAD_FUNC_KQV }, - { "kq_soft_max", OFFLOAD_FUNC_KQV }, - { "kq_soft_max_ext", OFFLOAD_FUNC_KQV }, - { "v", OFFLOAD_FUNC_KQV }, - { "kqv", OFFLOAD_FUNC_KQV }, - { "kqv_merged", OFFLOAD_FUNC_KQV }, - { "kqv_merged_cont", OFFLOAD_FUNC_KQV }, - { "kqv_wo", OFFLOAD_FUNC_KQV }, - { "kqv_out", OFFLOAD_FUNC_KQV }, - - { "ffn_inp", OFFLOAD_FUNC }, - { "ffn_norm", OFFLOAD_FUNC }, - - { "ffn_up", OFFLOAD_FUNC }, - { "ffn_up_b", OFFLOAD_FUNC }, - { "ffn_gate", OFFLOAD_FUNC }, - { "ffn_gate_b", OFFLOAD_FUNC }, - { "ffn_gate_par", OFFLOAD_FUNC }, - { "ffn_act", OFFLOAD_FUNC }, - { "ffn_down", OFFLOAD_FUNC }, - { "ffn_down_b", OFFLOAD_FUNC }, - { "ffn_out", OFFLOAD_FUNC }, - - { "ffn_silu", OFFLOAD_FUNC }, - { "ffn_gelu", OFFLOAD_FUNC }, - { "ffn_relu", OFFLOAD_FUNC }, - { "ffn_sqr(relu)", OFFLOAD_FUNC }, - - { "ffn_moe_logits", OFFLOAD_FUNC }, - { "ffn_moe_probs", OFFLOAD_FUNC }, - { "ffn_moe_argsort", OFFLOAD_FUNC }, - { "ffn_moe_weights", OFFLOAD_FUNC }, - { "ffn_moe_weights_sum", OFFLOAD_FUNC }, - { "ffn_moe_weights_norm", OFFLOAD_FUNC }, - { "ffn_moe_weighted", OFFLOAD_FUNC }, - { "ffn_moe_up", OFFLOAD_FUNC }, - { "ffn_moe_gate", OFFLOAD_FUNC }, - { "ffn_moe_silu", OFFLOAD_FUNC }, - { "ffn_moe_gate_par", OFFLOAD_FUNC }, - { "ffn_moe_down", OFFLOAD_FUNC }, - { "ffn_moe_out", OFFLOAD_FUNC }, - - { "l_out", OFFLOAD_FUNC }, - - { "result_norm", OFFLOAD_FUNC_EMB }, - { "result_output_no_bias", OFFLOAD_FUNC_EMB }, - { "result_output", OFFLOAD_FUNC_OUT }, -}; - -static llm_offload_trie k_offload_func_trie(k_offload_map); - static struct ggml_cgraph * llama_build_graph( llama_context & lctx, const llama_batch & batch) { const auto & model = lctx.model; // check if we should build the worst-case graph (for memory measurement) - const bool worst_case = ggml_allocr_is_measure(lctx.alloc); + const bool worst_case = ggml_tallocr_is_measure(lctx.alloc); // keep track of the input that has already been allocated bool alloc_inp_tokens = false; @@ -6366,20 +6031,13 @@ static struct ggml_cgraph * llama_build_graph( bool alloc_inp_KQ_mask = false; bool alloc_inp_K_shift = false; -#if defined(GGML_USE_CUBLAS) && !defined(LLAMA_GGML_BACKEND_CUDA_TEST) - const bool do_offload = true; -#else - const bool do_offload = true; // TODO: set to false after finishing refactoring -#endif - + // TODO(jared): do we still need this? #ifdef GGML_USE_KOMPUTE const bool needs_h2d_all = lctx.ctx_kompute && !ggml_vk_has_h2d_all(lctx.ctx_kompute); #endif - int n_non_view = 0; // number of non-view tensors that have been processed by the callback - // this callback allows us to apply custom logic to each tensor (e.g. ggml-alloc, offloading, etc.) - // TODO: will be removed with backend v2 + // TODO: improve handling of input and output tensors, then replace this with ggml_set_name llm_build_cb cb = [&](struct ggml_tensor * cur, const char * name, int il) { if (il >= 0) { ggml_format_name(cur, "%s-%d", name, il); @@ -6390,12 +6048,11 @@ static struct ggml_cgraph * llama_build_graph( // // allocate input tensors and set input data // - // TODO: will be removed with backend v2 if (!alloc_inp_tokens && strcmp(name, "inp_tokens") == 0) { - ggml_allocr_alloc(lctx.alloc, cur); + ggml_tallocr_alloc(lctx.alloc, cur); - if (!ggml_allocr_is_measure(lctx.alloc) && batch.token) { + if (!ggml_tallocr_is_measure(lctx.alloc) && batch.token) { const int64_t n_tokens = cur->ne[0]; ggml_backend_tensor_set(cur, batch.token, 0, n_tokens*ggml_element_size(cur)); @@ -6404,10 +6061,10 @@ static struct ggml_cgraph * llama_build_graph( alloc_inp_tokens = true; } - if (!alloc_inp_embd && strcmp(name, "inp_embd") == 0) { - ggml_allocr_alloc(lctx.alloc, cur); + if (!alloc_inp_embd && strcmp(name, "inp_embd") == 0 && batch.embd) { + ggml_tallocr_alloc(lctx.alloc, cur); - if (!ggml_allocr_is_measure(lctx.alloc) && batch.embd) { + if (!ggml_tallocr_is_measure(lctx.alloc) && batch.embd) { const int64_t n_embd = cur->ne[0]; const int64_t n_tokens = cur->ne[1]; @@ -6418,9 +6075,9 @@ static struct ggml_cgraph * llama_build_graph( } if (!alloc_inp_pos && strcmp(name, "inp_pos") == 0) { - ggml_allocr_alloc(lctx.alloc, cur); + ggml_tallocr_alloc(lctx.alloc, cur); - if (!ggml_allocr_is_measure(lctx.alloc) && batch.pos) { + if (!ggml_tallocr_is_measure(lctx.alloc) && batch.pos) { const int64_t n_tokens = cur->ne[0]; static_assert(std::is_same::value, "llama_pos must be int32_t"); @@ -6431,9 +6088,9 @@ static struct ggml_cgraph * llama_build_graph( } if (!alloc_inp_KQ_mask && strcmp(name, "KQ_mask") == 0) { - ggml_allocr_alloc(lctx.alloc, cur); + ggml_tallocr_alloc(lctx.alloc, cur); - if (!ggml_allocr_is_measure(lctx.alloc)) { + if (!ggml_tallocr_is_measure(lctx.alloc)) { const int64_t n_kv = cur->ne[0]; const int64_t n_tokens = cur->ne[1]; @@ -6471,9 +6128,9 @@ static struct ggml_cgraph * llama_build_graph( } if (!alloc_inp_K_shift && strcmp(name, "K_shift") == 0) { - ggml_allocr_alloc(lctx.alloc, cur); + ggml_tallocr_alloc(lctx.alloc, cur); - if (!ggml_allocr_is_measure(lctx.alloc)) { + if (!ggml_tallocr_is_measure(lctx.alloc)) { const int64_t n_ctx = cur->ne[0]; int32_t * data; @@ -6496,15 +6153,7 @@ static struct ggml_cgraph * llama_build_graph( alloc_inp_K_shift = true; } - // view tensors are not processed further - if (cur->view_src != nullptr) { - return; - } - - if (cur->op != GGML_OP_NONE) { - n_non_view++; - } - + // TODO(jared): this shouldn't be needed anymore #ifdef GGML_USE_KOMPUTE if (lctx.ctx_kompute && !needs_h2d_all) { const char * offload_tensors[] = {"inp_tokens", "inp_pos", "KQ_mask", "K_shift"}; @@ -6519,127 +6168,6 @@ static struct ggml_cgraph * llama_build_graph( } } #endif - - // - // offload layers - // - // TODO: will be removed with backend v2 - -//#define LLAMA_OFFLOAD_DEBUG - - if (!do_offload) { - return; - } - - const int n_layer = model.hparams.n_layer; - - const int n_gpu_layers = model.n_gpu_layers; - const int i_gpu_start = n_layer - n_gpu_layers; - - // should we offload the final norm? yes if we are not computing embeddings - const bool offload_emb = lctx.embedding.empty(); - - static const std::unordered_map> k_offload_func_name = { - { OFFLOAD_FUNC_NOP, "CPU" }, - { OFFLOAD_FUNC_OUT, "CPU" }, -#if defined(GGML_USE_CUBLAS) && !defined(LLAMA_GGML_BACKEND_CUDA_TEST) - { OFFLOAD_FUNC, "GPU (CUDA)" }, - { OFFLOAD_FUNC_FRC, "GPU (CUDA) FRC" }, - { OFFLOAD_FUNC_KQV, "GPU (CUDA) KQV" }, - { OFFLOAD_FUNC_NR, "GPU (CUDA) NR" }, - { OFFLOAD_FUNC_EMB, "GPU (CUDA) EMB" }, -#else - { OFFLOAD_FUNC, "CPU" }, - { OFFLOAD_FUNC_FRC, "CPU" }, - { OFFLOAD_FUNC_KQV, "CPU" }, - { OFFLOAD_FUNC_NR, "CPU" }, - { OFFLOAD_FUNC_EMB, "CPU" }, -#endif // GGML_USE_CUBLAS - }; - - // check the global map for what offload function to use for this tensor - llm_offload_func_e func_e = k_offload_func_trie.find(name); - - if (func_e == OFFLOAD_FUNC_NOP) { -#ifdef LLAMA_OFFLOAD_DEBUG - // if a tensor hasn't been offloaded, we warn the user - if (worst_case) { - LLAMA_LOG_WARN("%s: %32s: not offloaded (ref: %s)\n", __func__, - cur->name, "https://github.com/ggerganov/llama.cpp/pull/3837"); - } -#endif - - return; - } - - // count the number of layers and respect the provided n_gpu_layers - switch (func_e) { - case OFFLOAD_FUNC_NOP: - case OFFLOAD_FUNC_OUT: - break; - case OFFLOAD_FUNC: - if (n_gpu_layers < n_layer) { - if (il < i_gpu_start) { - func_e = OFFLOAD_FUNC_NOP; - } - } - break; - case OFFLOAD_FUNC_FRC: - if (!lctx.cparams.offload_kqv) { - func_e = OFFLOAD_FUNC_NOP; - } break; - case OFFLOAD_FUNC_KQV: - if (!lctx.cparams.offload_kqv) { - func_e = OFFLOAD_FUNC_NOP; - } else { - if (n_gpu_layers < n_layer) { - if (il < i_gpu_start) { - func_e = OFFLOAD_FUNC_NOP; - } - } - } - break; - case OFFLOAD_FUNC_NR: - if (n_gpu_layers <= n_layer + 0) { - func_e = OFFLOAD_FUNC_NOP; - } - break; - case OFFLOAD_FUNC_EMB: - if (!offload_emb || n_gpu_layers < n_layer) { - func_e = OFFLOAD_FUNC_NOP; - } - break; - default: GGML_ASSERT(false); - } - - offload_func_t func = ggml_offload_nop; - - // this is needed for compatibility with Metal for example -#if defined(GGML_USE_CUBLAS) && !defined(LLAMA_GGML_BACKEND_CUDA_TEST) - static offload_func_t ggml_offload_gpu = ggml_cuda_assign_buffers_no_alloc; -#else - static offload_func_t ggml_offload_gpu = ggml_offload_nop; -#endif - - switch (func_e) { - case OFFLOAD_FUNC_NOP: - case OFFLOAD_FUNC_OUT: func = ggml_offload_nop; break; - case OFFLOAD_FUNC: - case OFFLOAD_FUNC_KQV: - case OFFLOAD_FUNC_FRC: - case OFFLOAD_FUNC_NR: - case OFFLOAD_FUNC_EMB: func = ggml_offload_gpu; break; - default: GGML_ASSERT(false); - } - - // apply offload function to the tensor - func(cur); - -#ifdef LLAMA_OFFLOAD_DEBUG - if (worst_case) { - LLAMA_LOG_INFO("%s: %32s: %s\n", __func__, cur->name, k_offload_func_name.at(func_e).c_str()); - } -#endif }; struct ggml_cgraph * result = NULL; @@ -6713,27 +6241,6 @@ static struct ggml_cgraph * llama_build_graph( llm.free(); - if (worst_case) { - int n_non_view_total = 0; - - for (int i = 0; i < result->n_nodes; ++i) { - if (result->nodes[i]->view_src == nullptr) { - n_non_view_total++; - } - } - - LLAMA_LOG_INFO("%s: non-view tensors processed: %d/%d\n", __func__, n_non_view, n_non_view_total); - - if (n_non_view != n_non_view_total) { - LLAMA_LOG_WARN("%s: ****************************************************************\n", __func__); - LLAMA_LOG_WARN("%s: not all non-view tensors have been processed with a callback\n", __func__); - LLAMA_LOG_WARN("%s: this can indicate an inefficiency in the graph implementation\n", __func__); - LLAMA_LOG_WARN("%s: build with LLAMA_OFFLOAD_DEBUG for more info\n", __func__); - LLAMA_LOG_WARN("%s: ref: https://github.com/ggerganov/llama.cpp/pull/3837\n", __func__); - LLAMA_LOG_WARN("%s: ****************************************************************\n", __func__); - } - } - return result; } @@ -6779,8 +6286,6 @@ static int llama_decode_internal( auto & kv_self = lctx.kv_self; - GGML_ASSERT(!!kv_self.ctx); - const int64_t n_embd = hparams.n_embd; const int64_t n_vocab = hparams.n_vocab; @@ -6834,12 +6339,10 @@ static int llama_decode_internal( //printf("kv_self.n = %5d, kv_self.used = %5d, kv_self.head = %5d\n", kv_self.n, kv_self.used, kv_self.head); - ggml_allocr_reset(lctx.alloc); + ggml_backend_sched_reset(lctx.sched); ggml_cgraph * gf = llama_build_graph(lctx, batch); - ggml_allocr_alloc_graph(lctx.alloc, gf); - // the output is always the last tensor in the graph struct ggml_tensor * res = gf->nodes[gf->n_nodes - 1]; GGML_ASSERT(strcmp(res->name, "result_output") == 0); @@ -6851,30 +6354,6 @@ static int llama_decode_internal( GGML_ASSERT(strcmp(embeddings->name, "result_norm") == 0); } -#if defined(GGML_USE_CUBLAS) && !defined(LLAMA_GGML_BACKEND_CUDA_TEST) - char * buf_alloc_base = (char *)ggml_backend_buffer_get_base(lctx.buf_alloc); - for (int i = 0; i < gf->n_leafs; i++) { - ggml_tensor * node = gf->leafs[i]; - if (node->backend == GGML_BACKEND_GPU && node->extra == NULL) { - ggml_cuda_assign_scratch_offset(node, (char *)node->data - buf_alloc_base); - ggml_cuda_copy_to_device(node); - } - } - - for (int i = 0; i < gf->n_nodes; i++) { - ggml_tensor * node = gf->nodes[i]; - if (node->backend == GGML_BACKEND_GPU && node->extra == NULL) { - ggml_cuda_assign_scratch_offset(node, (char *)node->data - buf_alloc_base); - } - } - - // HACK: ggml-alloc may change the tensor backend when reusing a parent, so force output to be on the CPU here if needed - if (!lctx.embedding.empty()) { - embeddings->backend = GGML_BACKEND_CPU; - } - res->backend = GGML_BACKEND_CPU; -#endif - // LLAMA_LOG_INFO("graph build time: %.3f ms (%d nodes, %d leafs)\n", (ggml_time_us() - t_start_us)/1000.0, gf->n_nodes, gf->n_leafs); // for big prompts, if BLAS is enabled, it is better to use only one thread @@ -6897,8 +6376,8 @@ static int llama_decode_internal( #endif #ifdef GGML_USE_METAL - if (ggml_backend_is_metal(lctx.backend)) { - ggml_backend_metal_set_n_cb(lctx.backend, n_threads); + if (ggml_backend_is_metal(lctx.backend_metal)) { + ggml_backend_metal_set_n_cb(lctx.backend_metal, n_threads); } #elif defined(GGML_USE_KOMPUTE) if (lctx.ctx_kompute && n_tokens == 1) { @@ -6921,10 +6400,12 @@ static int llama_decode_internal( } #endif - if (ggml_backend_is_cpu(lctx.backend)) { - ggml_backend_cpu_set_n_threads(lctx.backend, n_threads); + if (lctx.backend_cpu != nullptr) { + ggml_backend_cpu_set_n_threads(lctx.backend_cpu, n_threads); } - ggml_backend_graph_compute(lctx.backend, gf); + ggml_backend_sched_graph_compute(lctx.sched, gf); + + // fprintf(stderr, "splits: %d\n", ggml_backend_sched_get_n_splits(lctx.sched)); #ifdef GGML_USE_MPI ggml_mpi_graph_compute_post(lctx.ctx_mpi, gf, n_layer); @@ -6972,30 +6453,33 @@ static int llama_decode_internal( logits_out.clear(); #endif + ggml_backend_t res_backend = ggml_backend_sched_get_node_backend(lctx.sched, res); + GGML_ASSERT(res_backend != nullptr); if (batch.logits) { logits_out.resize(n_vocab * n_tokens); for (uint32_t i = 0; i < n_tokens; i++) { if (batch.logits[i] == 0) { continue; } - ggml_backend_tensor_get(res, logits_out.data() + (n_vocab*i), (n_vocab*i)*sizeof(float), n_vocab*sizeof(float)); + ggml_backend_tensor_get_async(res_backend, res, logits_out.data() + (n_vocab*i), (n_vocab*i)*sizeof(float), n_vocab*sizeof(float)); #ifndef NDEBUG logits_valid[i] = true; #endif } } else if (lctx.logits_all) { logits_out.resize(n_vocab * n_tokens); - ggml_backend_tensor_get(res, logits_out.data(), 0, n_vocab*n_tokens*sizeof(float)); + ggml_backend_tensor_get_async(res_backend, res, logits_out.data(), 0, n_vocab*n_tokens*sizeof(float)); #ifndef NDEBUG std::fill(logits_valid.begin(), logits_valid.end(), true); #endif } else { logits_out.resize(n_vocab); - ggml_backend_tensor_get(res, logits_out.data(), (n_vocab*(n_tokens - 1))*sizeof(float), n_vocab*sizeof(float)); + ggml_backend_tensor_get_async(res_backend, res, logits_out.data(), (n_vocab*(n_tokens - 1))*sizeof(float), n_vocab*sizeof(float)); #ifndef NDEBUG logits_valid[0] = true; #endif } + ggml_backend_synchronize(res_backend); } // extract embeddings @@ -7003,7 +6487,9 @@ static int llama_decode_internal( auto & embedding_out = lctx.embedding; embedding_out.resize(n_embd); - ggml_backend_tensor_get(embeddings, embedding_out.data(), (n_embd*(n_tokens - 1))*sizeof(float), n_embd*sizeof(float)); + ggml_backend_t embeddings_backend = ggml_backend_sched_get_node_backend(lctx.sched, embeddings); + ggml_backend_tensor_get_async(embeddings_backend, embeddings, embedding_out.data(), (n_embd*(n_tokens - 1))*sizeof(float), n_embd*sizeof(float)); + ggml_backend_synchronize(embeddings_backend); } // measure the performance only for the single-token evals @@ -9088,10 +8574,13 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty // TODO: explore better strategies new_type = GGML_TYPE_Q8_0; } - } else if (name.find("ffn_down.weight") != std::string::npos) { + } else if (name.find("ffn_down") != std::string::npos) { if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K; + else if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S) { + if (qs.i_feed_forward_w2 < qs.n_feed_forward_w2/8) new_type = GGML_TYPE_Q4_K; + } else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) { - new_type = qs.i_feed_forward_w2 < 2 ? GGML_TYPE_Q5_K + new_type = qs.i_feed_forward_w2 < qs.n_feed_forward_w2/16 ? GGML_TYPE_Q5_K : arch != LLM_ARCH_FALCON || use_more_bits(qs.i_feed_forward_w2, qs.n_feed_forward_w2) ? GGML_TYPE_Q4_K : GGML_TYPE_Q3_K; } @@ -9100,14 +8589,14 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty } else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) { if (arch == LLM_ARCH_FALCON) { - new_type = qs.i_feed_forward_w2 < 2 ? GGML_TYPE_Q6_K : + new_type = qs.i_feed_forward_w2 < qs.n_feed_forward_w2/16 ? GGML_TYPE_Q6_K : use_more_bits(qs.i_feed_forward_w2, qs.n_feed_forward_w2) ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K; } else { if (use_more_bits(qs.i_feed_forward_w2, qs.n_feed_forward_w2)) new_type = GGML_TYPE_Q6_K; } } else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M && use_more_bits(qs.i_feed_forward_w2, qs.n_feed_forward_w2)) new_type = GGML_TYPE_Q6_K; - else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && arch != LLM_ARCH_FALCON && qs.i_feed_forward_w2 < 4) { + else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && arch != LLM_ARCH_FALCON && qs.i_feed_forward_w2 < qs.n_feed_forward_w2/8) { new_type = GGML_TYPE_Q5_K; } ++qs.i_feed_forward_w2; @@ -9125,9 +8614,10 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) new_type = GGML_TYPE_Q5_K; else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) new_type = GGML_TYPE_Q6_K; } - else if (name.find("ffn_gate.weight") != std::string::npos || name.find("ffn_up.weight") != std::string::npos) { - if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K; - } + // IK: let's remove this, else Q2_K is almost the same as Q3_K_S + //else if (name.find("ffn_gate") != std::string::npos || name.find("ffn_up") != std::string::npos) { + // if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K; + //} // This can be used to reduce the size of the Q5_K_S model. // The associated PPL increase is fully in line with the size reduction //else { @@ -9176,6 +8666,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s // K-quants case LLAMA_FTYPE_MOSTLY_Q2_K: quantized_type = GGML_TYPE_Q2_K; break; + case LLAMA_FTYPE_MOSTLY_Q2_K_S: quantized_type = GGML_TYPE_Q2_K; break; case LLAMA_FTYPE_MOSTLY_Q3_K_S: case LLAMA_FTYPE_MOSTLY_Q3_K_M: case LLAMA_FTYPE_MOSTLY_Q3_K_L: quantized_type = GGML_TYPE_Q3_K; break; @@ -9185,6 +8676,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s case LLAMA_FTYPE_MOSTLY_Q5_K_M: quantized_type = GGML_TYPE_Q5_K; break; case LLAMA_FTYPE_MOSTLY_Q6_K: quantized_type = GGML_TYPE_Q6_K; break; case LLAMA_FTYPE_MOSTLY_IQ2_XXS:quantized_type = GGML_TYPE_IQ2_XXS; break; + case LLAMA_FTYPE_MOSTLY_IQ2_XS :quantized_type = GGML_TYPE_IQ2_XS; break; default: throw std::runtime_error(format("invalid output file type %d\n", ftype)); } @@ -9233,7 +8725,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s if (name.find("attn_v.weight") != std::string::npos || name.find("attn_qkv.weight") != std::string::npos) { ++qs.n_attention_wv; } - else if (name.find("ffn_down.weight") != std::string::npos) { + else if (name.find("ffn_down") != std::string::npos) { ++qs.n_feed_forward_w2; } } @@ -9473,48 +8965,23 @@ static int llama_apply_lora_from_file_internal( LLAMA_LOG_INFO("%s: r = %d, alpha = %d, scaling = %.2f\n", __func__, lora_r, lora_alpha, scaling); - // create a name -> tensor map of the model to accelerate lookups - // find the max tensor size to estimate the required temporary buffer size - size_t max_tensor_size = 0; - std::unordered_map model_tensors; - for (const auto & kv : model.tensors_by_name) { - model_tensors.insert(kv); - size_t f32_size = ggml_nelements(kv.second) * sizeof(float); - max_tensor_size = std::max(max_tensor_size, f32_size); - } - - // create a temporary ggml context to store the lora tensors - // TODO: use ggml-alloc - size_t lora_ctx_size = max_tensor_size * 3; - LLAMA_LOG_INFO("%s: allocating %.f MB for lora temporary buffer\n", __func__, lora_ctx_size / 1024.0 / 1024.0); - std::vector lora_buf(lora_ctx_size); - - struct ggml_init_params params; - params.mem_size = lora_buf.size(); - params.mem_buffer = lora_buf.data(); - params.no_alloc = false; - - using unique_context = std::unique_ptr; - - unique_context lora_ctx(nullptr, ggml_free); - lora_ctx.reset(ggml_init(params)); - std::unordered_map lora_tensors; - // load base model std::unique_ptr ml; - - if (path_base_model) { + if (path_base_model) { LLAMA_LOG_INFO("%s: loading base model from '%s'\n", __func__, path_base_model); ml.reset(new llama_model_loader(path_base_model, /*use_mmap*/ true, /*kv_overrides*/ nullptr)); - ml->init_mapping(false); // no prefetching + ml->init_mapping(/*prefetch*/ false); // no prefetching } - // read tensors and apply - bool warned = false; - int n_tensors = 0; - - std::vector work_buffer; + struct tensor_meta { + std::string name; + ggml_type type; + int32_t ne[2]; + size_t offset; + }; + std::map tensor_meta_map; + // load all tensor meta while (true) { if (fin.tell() == fin.size) { // eof @@ -9527,7 +8994,7 @@ static int llama_apply_lora_from_file_internal( fin.read_raw(&n_dims, sizeof(n_dims)); fin.read_raw(&name_len, sizeof(name_len)); - fin.read_raw(&ftype, sizeof(ftype)); + fin.read_raw(&ftype, sizeof(ftype)); if (n_dims != 1 && n_dims != 2) { LLAMA_LOG_ERROR("%s: unsupported tensor dimension %d\n", __func__, n_dims); @@ -9541,31 +9008,23 @@ static int llama_apply_lora_from_file_internal( std::string name; { - GGML_ASSERT(name_len <= 1024); - char buf[1024]; + GGML_ASSERT(name_len < GGML_MAX_NAME); + char buf[GGML_MAX_NAME]; fin.read_raw(buf, name_len); name = std::string(buf, name_len); } - // check for lora suffix and get the type of tensor - const std::string lora_suffix = ".lora"; - size_t pos = name.rfind(lora_suffix); - if (pos == std::string::npos) { + // check for lora suffix + std::string lora_suffix; + if (name.length() > 6) { + lora_suffix = name.substr(name.length() - 6); + } + if (lora_suffix != ".loraA" && lora_suffix != ".loraB") { LLAMA_LOG_ERROR("%s: error: '%s' is not a lora tensor\n", __func__, name.c_str()); return 1; } - std::string lora_type = name.substr(pos + lora_suffix.length()); - std::string base_name = name; - base_name.erase(pos); - // LLAMA_LOG_INFO("%s: %s => %s (lora type %s) \n", __func__, name.c_str(), base_name.c_str(), lora_type.c_str()); - - if (model_tensors.find(base_name) == model_tensors.end()) { - LLAMA_LOG_ERROR("%s: unknown tensor '%s' in lora adapter\n", __func__, name.data()); - return 1; - } - - // create ggml tensor + // tensor type ggml_type wtype; switch (ftype) { case 0: wtype = GGML_TYPE_F32; break; @@ -9577,122 +9036,177 @@ static int llama_apply_lora_from_file_internal( return false; } } - ggml_tensor * lora_tensor = ggml_new_tensor_2d(lora_ctx.get(), wtype, ne[0], ne[1]); - ggml_set_name(lora_tensor, name.c_str()); - // load tensor data + // data offset size_t offset = fin.tell(); - size_t tensor_data_size = ggml_nbytes(lora_tensor); offset = (offset + 31) & -32; - fin.seek(offset, SEEK_SET); - fin.read_raw(lora_tensor->data, tensor_data_size); - lora_tensors[name] = lora_tensor; + // skip tensor data + fin.seek(offset + ggml_row_size(wtype, ne[0]) * ne[1], SEEK_SET); - // check if we have both A and B tensors and apply - if (lora_tensors.find(base_name + ".loraA") != lora_tensors.end() && - lora_tensors.find(base_name + ".loraB") != lora_tensors.end()) { + tensor_meta_map.emplace(name, tensor_meta{ name, wtype, { ne[0], ne[1] }, offset }); + } - ggml_tensor * dest_t = model_tensors[base_name]; + bool warned = false; + int n_tensors = 0; - offload_func_t offload_func = ggml_offload_nop; - offload_func_t offload_func_force_inplace = ggml_offload_nop; + // apply + ggml_backend_t backend_cpu = ggml_backend_cpu_init(); + if (backend_cpu == nullptr) { + LLAMA_LOG_ERROR("%s: error: failed to initialize cpu backend\n", __func__); + return 1; + } + ggml_backend_cpu_set_n_threads(backend_cpu, n_threads); -#if defined(GGML_USE_CUBLAS) && !defined(LLAMA_GGML_BACKEND_CUDA_TEST) - if (dest_t->backend == GGML_BACKEND_GPU || dest_t->backend == GGML_BACKEND_GPU_SPLIT) { - if (dest_t->type != GGML_TYPE_F16) { - throw std::runtime_error(format( - "%s: error: the simultaneous use of LoRAs and GPU acceleration is only supported for f16 models. dest_t->type: %d", __func__, dest_t->type)); - } - offload_func = ggml_cuda_assign_buffers; - offload_func_force_inplace = ggml_cuda_assign_buffers_force_inplace; - } -#endif // GGML_USE_CUBLAS + std::vector> read_buf; + for (const auto & it : model.tensors_by_name) { + const std::string & base_name = it.first; + ggml_tensor * model_t = it.second; - ggml_tensor * base_t; - if (ml) { - struct gguf_context * ctx_gguf = ml->ctx_gguf; + if (tensor_meta_map.find(base_name + ".loraA") == tensor_meta_map.end() || + tensor_meta_map.find(base_name + ".loraB") == tensor_meta_map.end()) { + continue; + } - // load from base model - if (gguf_find_tensor(ctx_gguf, base_name.c_str()) < 0) { - LLAMA_LOG_ERROR("%s: error: tensor '%s' not found in base model\n", __func__, base_name.c_str()); - return 1; - } + tensor_meta & metaA = tensor_meta_map.at(base_name + ".loraA"); + tensor_meta & metaB = tensor_meta_map.at(base_name + ".loraB"); - base_t = ml->get_tensor_meta(base_name.c_str()); - ml->load_data_for(base_t); - } else { - base_t = dest_t; - } + ggml_init_params lora_init_params = { + /* .mem_size */ ggml_tensor_overhead()*128 + ggml_graph_overhead(), + /* .mem_buffer */ nullptr, + /* .no_alloc */ true, + }; + ggml_context * lora_ctx = ggml_init(lora_init_params); + if (lora_ctx == nullptr) { + LLAMA_LOG_ERROR("%s: error: failed to initialize lora context\n", __func__); + ggml_backend_free(backend_cpu); + return 1; + } - if (ggml_is_quantized(base_t->type)) { - if (!warned) { - LLAMA_LOG_WARN("%s: warning: using a lora adapter with a quantized model may result in poor quality, " - "use a f16 or f32 base model with --lora-base\n", __func__); - warned = true; - } - } + // create tensors + ggml_tensor * loraA = ggml_new_tensor_2d(lora_ctx, metaA.type, metaA.ne[0], metaA.ne[1]); + ggml_tensor * loraB = ggml_new_tensor_2d(lora_ctx, metaB.type, metaB.ne[0], metaB.ne[1]); + ggml_set_name(loraA, metaA.name.c_str()); + ggml_set_name(loraB, metaB.name.c_str()); - ggml_tensor * loraA = lora_tensors[base_name + ".loraA"]; - GGML_ASSERT(loraA->type == GGML_TYPE_F32); - ggml_set_name(loraA, "loraA"); - - ggml_tensor * loraB = lora_tensors[base_name + ".loraB"]; - GGML_ASSERT(loraB->type == GGML_TYPE_F32); - ggml_set_name(loraB, "loraB"); - - if (base_t->ne[0] != loraA->ne[1] || base_t->ne[1] != loraB->ne[1]) { - LLAMA_LOG_ERROR("%s: incompatible tensor dimensions (%" PRId64 " and %" PRId64 ");" - " are you sure that this adapter is for this model?\n", __func__, base_t->ne[0], loraA->ne[1]); + ggml_tensor * base_t; + if (ml) { + if (gguf_find_tensor(ml->ctx_gguf, base_name.c_str()) < 0) { + LLAMA_LOG_ERROR("%s: error: tensor '%s' not found in base model\n", __func__, base_name.c_str()); return 1; } + base_t = ggml_dup_tensor(lora_ctx, ml->get_tensor_meta(base_name.c_str())); + } else { + base_t = ggml_dup_tensor(lora_ctx, model_t); + } + ggml_set_name(base_t, base_name.c_str()); + // allocate in backend buffer + ggml_backend_buffer_t lora_buf = ggml_backend_alloc_ctx_tensors_from_buft(lora_ctx, ggml_backend_cpu_buffer_type()); + if (lora_buf == nullptr) { + LLAMA_LOG_ERROR("%s: error: failed to allocate lora tensors\n", __func__); + return 1; + } + + // load tensor data + auto load_tensor = [&read_buf, &fin](const tensor_meta & tensor_meta, ggml_tensor * tensor) { + read_buf.resize(ggml_nbytes(tensor)); + fin.seek(tensor_meta.offset, SEEK_SET); + fin.read_raw(read_buf.data(), ggml_nbytes(tensor)); + ggml_backend_tensor_set(tensor, read_buf.data(), 0, read_buf.size()); + }; + load_tensor(metaA, loraA); + load_tensor(metaB, loraB); + + // load base model tensor data + if (ml) { + ml->load_data_for(base_t); + } else { + ggml_backend_tensor_copy(model_t, base_t); + } + + if (ggml_is_quantized(base_t->type) && !warned) { + LLAMA_LOG_WARN("%s: warning: using a lora adapter with a quantized model may result in poor quality, " + "use a f16 or f32 base model with --lora-base\n", __func__); + warned = true; + } + + if (base_t->ne[0] != loraA->ne[1] || base_t->ne[1] != loraB->ne[1]) { + LLAMA_LOG_ERROR("%s: incompatible tensor dimensions (%" PRId64 " and %" PRId64 ");" + " are you sure that this adapter is for this model?\n", __func__, base_t->ne[0], loraA->ne[1]); + ggml_free(lora_ctx); + ggml_backend_buffer_free(lora_buf); + ggml_backend_free(backend_cpu); + return 1; + } + + auto build_lora_graph = [&]() { // w = w + BA*s - ggml_tensor * BA = ggml_mul_mat(lora_ctx.get(), loraA, loraB); - offload_func(BA); + ggml_tensor * BA = ggml_mul_mat(lora_ctx, loraA, loraB); ggml_set_name(BA, "BA"); if (scaling != 1.0f) { - BA = ggml_scale_inplace(lora_ctx.get(), BA, scaling); - offload_func(BA); + BA = ggml_scale(lora_ctx, BA, scaling); ggml_set_name(BA, "BA_scaled"); } ggml_tensor * r; - if (base_t == dest_t) { - r = ggml_add_inplace(lora_ctx.get(), dest_t, BA); - offload_func_force_inplace(r); - ggml_set_name(r, "r_add_inplace"); - } - else { - r = ggml_add(lora_ctx.get(), base_t, BA); - offload_func(r); - ggml_set_name(r, "r_add"); + r = ggml_add_inplace(lora_ctx, base_t, BA); + ggml_set_name(r, "r_add"); - r = ggml_cpy(lora_ctx.get(), r, dest_t); - offload_func(r); - ggml_set_name(r, "r_cpy"); + if (base_t->type != model_t->type) { + // convert the result to the model type + r = ggml_cast(lora_ctx, r, model_t->type); + ggml_set_name(r, "r_cast"); } - struct ggml_cgraph * gf = ggml_new_graph(lora_ctx.get()); - ggml_build_forward_expand(gf, r); + return r; + }; - ggml_graph_compute_helper(work_buffer, gf, n_threads); + ggml_cgraph * gf = ggml_new_graph(lora_ctx); + ggml_tensor * r = build_lora_graph(); + ggml_build_forward_expand(gf, r); - // the tensors in the adapter must be sorted such that loraA and loraB of the same tensor are next to each other - GGML_ASSERT(lora_tensors.size() == 2); + ggml_backend_buffer_t graph_buf = ggml_backend_alloc_ctx_tensors_from_buft(lora_ctx, ggml_backend_cpu_buffer_type()); + if (graph_buf == nullptr) { + LLAMA_LOG_ERROR("%s: error: failed to allocate graph tensors\n", __func__); + ggml_free(lora_ctx); + ggml_backend_buffer_free(lora_buf); + ggml_backend_free(backend_cpu); + return 1; + } - // we won't need these tensors again, reset the context to save memory - lora_ctx.reset(ggml_init(params)); - lora_tensors.clear(); + ggml_backend_graph_compute(backend_cpu, gf); - n_tensors++; - if (n_tensors % 4 == 0) { - LLAMA_LOG_INFO("."); - } + ggml_backend_tensor_set(model_t, r->data, 0, ggml_nbytes(r)); + +#if 0 + // TODO: use scheduler with fallback to CPU for less copies between CPU and GPU + //ggml_backend_sched_t sched = ggml_backend_sched_new(backends.data(), backends.size(), GGML_DEFAULT_GRAPH_SIZE); + + // sched compute + ggml_build_forward_expand(gf, build_graph()); + ggml_backend_sched_init_measure(sched, gf); + + // create the graph again, since the previous one was destroyed by the measure + ggml_graph_clear(gf); + ggml_build_forward_expand(gf, build_graph()); + ggml_backend_sched_graph_compute(sched, gf); + ggml_backend_sched_free(sched); +#endif + + ggml_backend_buffer_free(lora_buf); + ggml_backend_buffer_free(graph_buf); + ggml_free(lora_ctx); + + n_tensors++; + if (n_tensors % 4 == 0) { + LLAMA_LOG_INFO("."); } } + ggml_backend_free(backend_cpu); + const int64_t t_lora_us = ggml_time_us() - t_start_lora_us; LLAMA_LOG_INFO(" done (%.2f ms)\n", t_lora_us / 1000.0); @@ -9705,6 +9219,7 @@ static int llama_apply_lora_from_file_internal( struct llama_model_params llama_model_default_params() { struct llama_model_params result = { /*.n_gpu_layers =*/ 0, + /*.split_mode =*/ LLAMA_SPLIT_LAYER, /*.main_gpu =*/ 0, /*.tensor_split =*/ nullptr, /*.progress_callback =*/ nullptr, @@ -9716,7 +9231,8 @@ struct llama_model_params llama_model_default_params() { }; #ifdef GGML_USE_METAL - result.n_gpu_layers = 1; + // note: we usually have plenty of VRAM, so by default offload all layers to the GPU + result.n_gpu_layers = 999; #endif return result; @@ -9912,41 +9428,53 @@ struct llama_context * llama_new_context_with_model( GGML_ASSERT(hparams.n_embd_head_k % ggml_blck_size(type_k) == 0); GGML_ASSERT(hparams.n_embd_head_v % ggml_blck_size(type_v) == 0); - // reserve memory for context buffers if (!hparams.vocab_only) { - // initialize backend + // initialize backends #ifdef GGML_USE_METAL if (model->n_gpu_layers > 0) { - ctx->backend = ggml_backend_metal_init(); - if (ctx->backend == nullptr) { + ctx->backend_metal = ggml_backend_metal_init(); + if (ctx->backend_metal == nullptr) { LLAMA_LOG_ERROR("%s: failed to initialize Metal backend\n", __func__); + llama_free(ctx); + return nullptr; } + ctx->backends.push_back(ctx->backend_metal); } -#elif defined(GGML_USE_CUBLAS) && defined(LLAMA_GGML_BACKEND_CUDA_TEST) - // for testing only +#elif defined(GGML_USE_CUBLAS) if (model->n_gpu_layers > 0) { - ctx->backend = ggml_backend_cuda_init(0); - if (ctx->backend == nullptr) { - LLAMA_LOG_ERROR("%s: failed to initialize CUDA backend\n", __func__); + // with split_mode LLAMA_SPLIT_NONE or LLAMA_SPLIT_ROW, only the main GPU backend is used + if (model->split_mode == LLAMA_SPLIT_NONE || model->split_mode == LLAMA_SPLIT_ROW) { + ggml_backend_t backend = ggml_backend_cuda_init(model->main_gpu); + if (backend == nullptr) { + LLAMA_LOG_ERROR("%s: failed to initialize CUDA%d backend\n", __func__, model->main_gpu); + llama_free(ctx); + return nullptr; + } + ctx->backends.push_back(backend); + } else { + // LLAMA_SPLIT_LAYER requires a backend for each GPU + for (int device = 0; device < ggml_backend_cuda_get_device_count(); ++device) { + ggml_backend_t backend = ggml_backend_cuda_init(device); + if (backend == nullptr) { + LLAMA_LOG_ERROR("%s: failed to initialize CUDA%d backend\n", __func__, device); + llama_free(ctx); + return nullptr; + } + ctx->backends.push_back(backend); + } } } #endif - - if (ctx->backend == nullptr && ggml_backend_buffer_is_host(model->buf)) { - ctx->backend = ggml_backend_cpu_init(); - if (ctx->backend == nullptr) { - LLAMA_LOG_ERROR("%s: failed to initialize CPU backend\n", __func__); - } - } - - if (ctx->backend == nullptr) { - LLAMA_LOG_ERROR("%s: failed to initialize a backend\n", __func__); - delete ctx; + ctx->backend_cpu = ggml_backend_cpu_init(); + if (ctx->backend_cpu == nullptr) { + LLAMA_LOG_ERROR("%s: failed to initialize CPU backend\n", __func__); + llama_free(ctx); return nullptr; } + ctx->backends.push_back(ctx->backend_cpu); - if (!llama_kv_cache_init(ctx->model.hparams, ctx->kv_self, type_k, type_v, - cparams.n_ctx, model->n_gpu_layers, cparams.offload_kqv)) { + if (!llama_kv_cache_init(ctx->kv_self, ctx->model, type_k, type_v, + cparams.n_ctx, cparams.offload_kqv)) { LLAMA_LOG_ERROR("%s: llama_kv_cache_init() failed for self-attention cache\n", __func__); llama_free(ctx); return nullptr; @@ -9982,11 +9510,22 @@ struct llama_context * llama_new_context_with_model( } { - // the compute buffer is used to store the tensor and graph structs, while the allocator buffer is used for the tensor data + // buffer types used for the compute buffer of each backend + std::vector backend_buft; + for (auto * backend : ctx->backends) { + if (ggml_backend_is_cpu(backend)) { + // use host buffers for the CPU backend compute buffer + backend_buft.push_back(llama_default_buffer_type_cpu(true)); + } else { + backend_buft.push_back(ggml_backend_get_default_buffer_type(backend)); + } + } + + // buffer used to store the computation graph and the tensor meta data ctx->buf_compute_meta.resize(ggml_tensor_overhead()*LLAMA_MAX_NODES + ggml_graph_overhead()); - // create measure allocator - ctx->alloc = ggml_allocr_new_measure_from_backend(ctx->backend); + ctx->sched = ggml_backend_sched_new(ctx->backends.data(), backend_buft.data(), ctx->backends.size(), LLAMA_MAX_NODES); + ctx->alloc = ggml_backend_sched_get_tallocr(ctx->sched, ctx->backend_cpu); // build worst-case graph int n_tokens = (int)std::min(cparams.n_ctx, cparams.n_batch); @@ -9994,50 +9533,19 @@ struct llama_context * llama_new_context_with_model( llama_token token = llama_token_bos(&ctx->model); // not actually used by llama_build_graph, but required to choose between token and embedding inputs graph ggml_cgraph * gf = llama_build_graph(*ctx, llama_batch_get_one(&token, n_tokens, n_past, 0)); - // measure memory requirements for the graph - size_t alloc_size = ggml_allocr_alloc_graph(ctx->alloc, gf); + // initialize scheduler with the worst-case graph + ggml_backend_sched_init_measure(ctx->sched, gf); + // note: the number of splits during measure is higher than during inference due to the kv shift + int n_splits = ggml_backend_sched_get_n_splits(ctx->sched); + LLAMA_LOG_INFO("%s: graph splits (measure): %d\n", __func__, n_splits); + ctx->alloc = ggml_backend_sched_get_tallocr(ctx->sched, ctx->backend_cpu); - LLAMA_LOG_INFO("%s: compute buffer total size = %.2f MiB\n", __func__, (ctx->buf_compute_meta.size() + alloc_size) / 1024.0 / 1024.0); - - // create allocator again with exact memory requirements - ggml_allocr_free(ctx->alloc); - - ctx->buf_alloc = ggml_backend_alloc_buffer(ctx->backend, alloc_size); - ctx->alloc = ggml_allocr_new_from_buffer(ctx->buf_alloc); -#if defined(GGML_USE_CUBLAS) && !defined(LLAMA_GGML_BACKEND_CUDA_TEST) - if (model->n_gpu_layers > 0) { - // the CPU buffer adds this padding in case the malloc buffer is not aligned, so we need to do the same for the GPU buffer, since we use the same offsets - ggml_cuda_set_scratch_size(alloc_size + 64); - LLAMA_LOG_INFO("%s: VRAM scratch buffer: %.2f MiB\n", __func__, alloc_size / 1024.0 / 1024.0); - - // calculate total VRAM usage - auto add_tensor = [](const ggml_tensor * t, size_t & size) { - if (t->backend == GGML_BACKEND_GPU || t->backend == GGML_BACKEND_GPU_SPLIT) { - size += ggml_nbytes(t); - } - }; - size_t model_vram_size = 0; - for (const auto & kv : model->tensors_by_name) { - add_tensor(kv.second, model_vram_size); - } - - size_t kv_vram_size = 0; - for (auto & k : ctx->kv_self.k_l) { - add_tensor(k, kv_vram_size); - } - for (auto & v : ctx->kv_self.v_l) { - add_tensor(v, kv_vram_size); - } - - size_t ctx_vram_size = alloc_size + kv_vram_size; - size_t total_vram_size = model_vram_size + ctx_vram_size; - - LLAMA_LOG_INFO("%s: total VRAM used: %.2f MiB (model: %.2f MiB, context: %.2f MiB)\n", __func__, - total_vram_size / 1024.0 / 1024.0, - model_vram_size / 1024.0 / 1024.0, - ctx_vram_size / 1024.0 / 1024.0); + for (ggml_backend_t backend : ctx->backends) { + ggml_backend_buffer_t buf = ggml_backend_sched_get_buffer(ctx->sched, backend); + LLAMA_LOG_INFO("%s: %10s compute buffer size = %8.2f MiB\n", __func__, + ggml_backend_buffer_name(buf), + ggml_backend_buffer_get_size(buf) / 1024.0 / 1024.0); } -#endif } // TODO(jared): remove this @@ -10157,9 +9665,8 @@ int32_t llama_model_meta_val_str_by_index(const struct llama_model * model, int3 } int32_t llama_model_desc(const struct llama_model * model, char * buf, size_t buf_size) { - return snprintf(buf, buf_size, "%s %s%s %s", + return snprintf(buf, buf_size, "%s %s %s", llama_model_arch_name(model->arch).c_str(), - model->hparams.n_expert > 0 ? (std::to_string(model->hparams.n_expert) + "x").c_str() : "", llama_model_type_name(model->type), llama_model_ftype_name(model->ftype).c_str()); } @@ -10181,7 +9688,14 @@ uint64_t llama_model_n_params(const struct llama_model * model) { } struct ggml_tensor * llama_get_model_tensor(struct llama_model * model, const char * name) { - return ggml_get_tensor(model->ctx, name); + auto it = std::find_if(model->tensors_by_name.begin(), model->tensors_by_name.end(), + [name](const std::pair & it) { + return it.first == name; + }); + if (it == model->tensors_by_name.end()) { + return nullptr; + } + return it->second; } uint32_t llama_model_quantize( @@ -10366,7 +9880,7 @@ size_t llama_get_state_size(const struct llama_context * ctx) { const size_t s_embedding = ctx->embedding.size() * sizeof(float); const size_t s_kv_size = sizeof(size_t); const size_t s_kv_ntok = sizeof(int); - const size_t s_kv = ggml_backend_buffer_get_size(ctx->kv_self.buf); + const size_t s_kv = ctx->kv_self.total_size(); const size_t s_total = ( + s_rng_size @@ -10495,7 +10009,7 @@ static void llama_copy_state_data_internal(struct llama_context * ctx, llama_dat const auto n_embd_v_gqa = hparams.n_embd_v_gqa(); const auto n_ctx = cparams.n_ctx; - const size_t kv_buf_size = ggml_backend_buffer_get_size(kv_self.buf); + const size_t kv_buf_size = kv_self.total_size(); const uint32_t kv_head = kv_self.head; const uint32_t kv_size = kv_self.size; const uint32_t kv_used = kv_self.used; @@ -10508,46 +10022,19 @@ static void llama_copy_state_data_internal(struct llama_context * ctx, llama_dat if (kv_buf_size) { const size_t elt_size = ggml_element_size(kv_self.k_l[0]); - ggml_context * cpy_ctx = ggml_init({ 6*n_layer*ggml_tensor_overhead() + ggml_graph_overhead(), NULL, /* no_alloc */ true }); - ggml_cgraph * gf = ggml_new_graph(cpy_ctx); - - std::vector kout2d(n_layer); - std::vector vout2d(n_layer); - - for (int il = 0; il < (int) n_layer; ++il) { - kout2d[il] = ggml_new_tensor_2d(cpy_ctx, kv_self.k_l[il]->type, n_embd_k_gqa, kv_head); - vout2d[il] = ggml_new_tensor_2d(cpy_ctx, kv_self.v_l[il]->type, kv_head, n_embd_v_gqa); - - ggml_tensor * k2d = ggml_view_2d(cpy_ctx, kv_self.k_l[il], - n_embd_k_gqa, kv_head, - elt_size*n_embd_k_gqa, 0); - - ggml_tensor * v2d = ggml_view_2d(cpy_ctx, kv_self.v_l[il], - kv_head, n_embd_v_gqa, - elt_size*n_ctx, 0); - - ggml_build_forward_expand(gf, ggml_cpy(cpy_ctx, k2d, kout2d[il])); - ggml_build_forward_expand(gf, ggml_cpy(cpy_ctx, v2d, vout2d[il])); - } - - ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors(cpy_ctx, ctx->backend); - - ggml_backend_graph_compute(ctx->backend, gf); - std::vector tmp_buf; for (int il = 0; il < (int) n_layer; ++il) { - tmp_buf.resize(ggml_nbytes(kout2d[il])); - ggml_backend_tensor_get(kout2d[il], tmp_buf.data(), 0, tmp_buf.size()); + tmp_buf.resize(elt_size*n_embd_k_gqa*kv_head); + ggml_backend_tensor_get(kv_self.k_l[il], tmp_buf.data(), 0, tmp_buf.size()); data_ctx->write(tmp_buf.data(), tmp_buf.size()); - tmp_buf.resize(ggml_nbytes(vout2d[il])); - ggml_backend_tensor_get(vout2d[il], tmp_buf.data(), 0, tmp_buf.size()); - data_ctx->write(tmp_buf.data(), tmp_buf.size()); + // v is not contiguous, copy row by row + tmp_buf.resize(elt_size*kv_head); + for (int ir = 0; ir < (int) n_embd_v_gqa; ++ir) { + ggml_backend_tensor_get(kv_self.v_l[il], tmp_buf.data(), ir*elt_size*n_ctx, tmp_buf.size()); + data_ctx->write(tmp_buf.data(), tmp_buf.size()); + } } - - ggml_free(cpy_ctx); - - ggml_backend_buffer_free(buf); } for (uint32_t i = 0; i < kv_size; ++i) { @@ -10646,48 +10133,22 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) { memcpy(&kv_used, inp, sizeof(kv_used)); inp += sizeof(kv_used); if (kv_buf_size) { - GGML_ASSERT(ggml_backend_buffer_get_size(kv_self.buf) == kv_buf_size); + GGML_ASSERT(kv_self.total_size() == kv_buf_size); const size_t elt_size = ggml_element_size(kv_self.k_l[0]); - ggml_context * cpy_ctx = ggml_init({ 6*n_layer*ggml_tensor_overhead() + ggml_graph_overhead(), NULL, /* no_alloc */ true }); - ggml_cgraph * gf = ggml_new_graph(cpy_ctx); + for (int il = 0; il < (int) n_layer; ++il) { + size_t k_size = elt_size*n_embd_k_gqa*kv_head; + ggml_backend_tensor_set(kv_self.k_l[il], inp, 0, k_size); + inp += k_size; - std::vector kin2d(n_layer); - std::vector vin2d(n_layer); - - for (int il = 0; il < n_layer; ++il) { - kin2d[il] = ggml_new_tensor_2d(cpy_ctx, kv_self.k_l[il]->type, n_embd_k_gqa, kv_head); - vin2d[il] = ggml_new_tensor_2d(cpy_ctx, kv_self.v_l[il]->type, kv_head, n_embd_v_gqa); - - ggml_tensor * k2d = ggml_view_2d(cpy_ctx, kv_self.k_l[il], - n_embd_k_gqa, kv_head, - elt_size*n_embd_k_gqa, 0); - - ggml_tensor * v2d = ggml_view_2d(cpy_ctx, kv_self.v_l[il], - kv_head, n_embd_v_gqa, - elt_size*n_ctx, 0); - - ggml_build_forward_expand(gf, ggml_cpy(cpy_ctx, kin2d[il], k2d)); - ggml_build_forward_expand(gf, ggml_cpy(cpy_ctx, vin2d[il], v2d)); + // v is not contiguous, copy row by row + size_t v_row_size = elt_size*kv_head; + for (int ir = 0; ir < (int) n_embd_v_gqa; ++ir) { + ggml_backend_tensor_set(kv_self.v_l[il], inp, ir*elt_size*n_ctx, v_row_size); + inp += v_row_size; + } } - - ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors(cpy_ctx, ctx->backend); - - // load data into the tensors - for (int il = 0; il < n_layer; ++il) { - ggml_backend_tensor_set(kin2d[il], inp, 0, ggml_nbytes(kin2d[il])); - inp += ggml_nbytes(kin2d[il]); - - ggml_backend_tensor_set(vin2d[il], inp, 0, ggml_nbytes(vin2d[il])); - inp += ggml_nbytes(vin2d[il]); - } - - ggml_backend_graph_compute(ctx->backend, gf); - - ggml_free(cpy_ctx); - - ggml_backend_buffer_free(buf); } ctx->kv_self.head = kv_head; @@ -11085,7 +10546,7 @@ void llama_print_timings(struct llama_context * ctx) { __func__, timings.t_p_eval_ms, timings.n_p_eval, timings.t_p_eval_ms / timings.n_p_eval, 1e3 / timings.t_p_eval_ms * timings.n_p_eval); LLAMA_LOG_INFO("%s: eval time = %10.2f ms / %5d runs (%8.2f ms per token, %8.2f tokens per second)\n", __func__, timings.t_eval_ms, timings.n_eval, timings.t_eval_ms / timings.n_eval, 1e3 / timings.t_eval_ms * timings.n_eval); - LLAMA_LOG_INFO("%s: total time = %10.2f ms\n", __func__, (timings.t_end_ms - timings.t_start_ms)); + LLAMA_LOG_INFO("%s: total time = %10.2f ms / %5d tokens\n", __func__, (timings.t_end_ms - timings.t_start_ms), (timings.n_p_eval + timings.n_eval)); } void llama_reset_timings(struct llama_context * ctx) { diff --git a/llama.h b/llama.h index 9fdb94bc4..2c2684eb5 100644 --- a/llama.h +++ b/llama.h @@ -104,6 +104,8 @@ extern "C" { LLAMA_FTYPE_MOSTLY_Q5_K_M = 17, // except 1d tensors LLAMA_FTYPE_MOSTLY_Q6_K = 18, // except 1d tensors LLAMA_FTYPE_MOSTLY_IQ2_XXS = 19, // except 1d tensors + LLAMA_FTYPE_MOSTLY_IQ2_XS = 20, // except 1d tensors + LLAMA_FTYPE_MOSTLY_Q2_K_S = 21, // except 1d tensors LLAMA_FTYPE_GUESSED = 1024, // not specified in the model file }; @@ -116,6 +118,12 @@ extern "C" { LLAMA_ROPE_SCALING_MAX_VALUE = LLAMA_ROPE_SCALING_YARN, }; + enum llama_split_mode { + LLAMA_SPLIT_NONE = 0, // single GPU + LLAMA_SPLIT_LAYER = 1, // split layers and KV across GPUs + LLAMA_SPLIT_ROW = 2, // split rows across GPUs + }; + typedef struct llama_token_data { llama_token id; // token id float logit; // log-odds of the token @@ -178,8 +186,16 @@ extern "C" { struct llama_model_params { int32_t n_gpu_layers; // number of layers to store in VRAM - int32_t main_gpu; // the GPU that is used for scratch and small tensors - const float * tensor_split; // how to split layers across multiple GPUs (size: LLAMA_MAX_DEVICES) + enum llama_split_mode split_mode; // how to split the model across multiple GPUs + + // main_gpu interpretation depends on split_mode: + // LLAMA_SPLIT_NONE: the GPU that is used for the entire model + // LLAMA_SPLIT_ROW: the GPU that is used for small tensors and intermediate results + // LLAMA_SPLIT_LAYER: ignored + int32_t main_gpu; + + // proportion of the model (layers or rows) to offload to each GPU, size: LLAMA_MAX_DEVICES + const float * tensor_split; // Called with a progress value between 0.0 and 1.0. Pass NULL to disable. // If the provided progress_callback returns true, model loading continues. diff --git a/scripts/sync-ggml.last b/scripts/sync-ggml.last index fe7f3202f..3e2c579d5 100644 --- a/scripts/sync-ggml.last +++ b/scripts/sync-ggml.last @@ -1 +1 @@ -f96711108d55bdbbd277e6be07204dce6a94fb93 +979cc23b345006504cfc1f67c0fdf627805e3319 diff --git a/spm-headers/ggml.h b/spm-headers/ggml.h deleted file mode 120000 index 39215298f..000000000 --- a/spm-headers/ggml.h +++ /dev/null @@ -1 +0,0 @@ -../ggml.h \ No newline at end of file diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp index 7a60d7743..d9b8b106a 100644 --- a/tests/test-backend-ops.cpp +++ b/tests/test-backend-ops.cpp @@ -376,6 +376,11 @@ struct test_case { // allocate ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors(ctx, backend1); + if (buf == NULL) { + printf("failed to allocate tensors [%s] ", ggml_backend_name(backend1)); + ggml_free(ctx); + return false; + } // build graph ggml_build_forward_expand(gf, out); @@ -463,19 +468,23 @@ struct test_case { GGML_UNUSED(index); }; - ggml_backend_compare_graph_backend(backend1, backend2, gf, callback, &ud); + const bool cmp_ok = ggml_backend_compare_graph_backend(backend1, backend2, gf, callback, &ud); - if (ud.ok) { - printf("\033[1;32mOK\033[0m\n"); - } else { - printf("\033[1;31mFAIL\033[0m\n"); + if (!cmp_ok) { + printf("compare failed "); } ggml_backend_buffer_free(buf); ggml_free(ctx); - return ud.ok; + if (ud.ok && cmp_ok) { + printf("\033[1;32mOK\033[0m\n"); + return true; + } + + printf("\033[1;31mFAIL\033[0m\n"); + return false; } bool eval_perf(ggml_backend_t backend, const char * op_name) { @@ -519,6 +528,11 @@ struct test_case { // allocate ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors(ctx, backend); + if (buf == NULL) { + printf("failed to allocate tensors\n"); + ggml_free(ctx); + return false; + } // randomize tensors initialize_tensors(ctx); diff --git a/tests/test-quantize-fns.cpp b/tests/test-quantize-fns.cpp index cee712618..31a78c632 100644 --- a/tests/test-quantize-fns.cpp +++ b/tests/test-quantize-fns.cpp @@ -134,8 +134,9 @@ int main(int argc, char * argv[]) { continue; } - if ((ggml_type)i == GGML_TYPE_IQ2_XXS) { - printf("Skip %s due to missing quantization functionality\n", ggml_type_name((ggml_type) i)); + const ggml_type ei = (ggml_type)i; + if (ei == GGML_TYPE_IQ2_XXS || ei == GGML_TYPE_IQ2_XS) { + printf("Skip %s due to missing quantization functionality\n", ggml_type_name(ei)); continue; }