diff --git a/.devops/vulkan.Dockerfile b/.devops/vulkan.Dockerfile
index ad5dcd374..b5bd3b6d2 100644
--- a/.devops/vulkan.Dockerfile
+++ b/.devops/vulkan.Dockerfile
@@ -1,4 +1,4 @@
-ARG UBUNTU_VERSION=24.04
+ARG UBUNTU_VERSION=22.04
 
 FROM ubuntu:$UBUNTU_VERSION AS build
 
@@ -7,7 +7,7 @@ RUN apt update && apt install -y git build-essential cmake wget
 
 # Install Vulkan SDK and cURL
 RUN wget -qO - https://packages.lunarg.com/lunarg-signing-key-pub.asc | apt-key add - && \
-    wget -qO /etc/apt/sources.list.d/lunarg-vulkan-noble.list https://packages.lunarg.com/vulkan/lunarg-vulkan-noble.list && \
+    wget -qO /etc/apt/sources.list.d/lunarg-vulkan-jammy.list https://packages.lunarg.com/vulkan/lunarg-vulkan-jammy.list && \
    apt update -y && \
    apt-get install -y vulkan-sdk libcurl4-openssl-dev curl
 
diff --git a/.github/workflows/docker.yml b/.github/workflows/docker.yml
index d71f1eb38..6bf22eb66 100644
--- a/.github/workflows/docker.yml
+++ b/.github/workflows/docker.yml
@@ -32,10 +32,12 @@ jobs:
     env:
       COMMIT_SHA: ${{ github.sha }}
     strategy:
+      fail-fast: false
       matrix:
         config:
           # Multi-stage build
-          - { tag: "cpu", dockerfile: ".devops/cpu.Dockerfile", platforms: "linux/amd64,linux/arm64", full: true, light: true, server: true, freediskspace: false}
+          - { tag: "cpu", dockerfile: ".devops/cpu.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, freediskspace: false}
+          - { tag: "cpu", dockerfile: ".devops/cpu.Dockerfile", platforms: "linux/arm64", full: true, light: true, server: true, freediskspace: false}
           - { tag: "cuda", dockerfile: ".devops/cuda.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, freediskspace: false}
           - { tag: "musa", dockerfile: ".devops/musa.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, freediskspace: false}
           - { tag: "intel", dockerfile: ".devops/intel.Dockerfile", platforms: "linux/amd64", full: true, light: true, server: true, freediskspace: false}
diff --git a/ggml/src/ggml-metal/ggml-metal.m b/ggml/src/ggml-metal/ggml-metal.m
index c9474345d..76f8e4291 100644
--- a/ggml/src/ggml-metal/ggml-metal.m
+++ b/ggml/src/ggml-metal/ggml-metal.m
@@ -64,7 +64,9 @@ static id<MTLDevice> ggml_backend_metal_device_acq(struct ggml_backend_metal_dev
 
     if (ctx->mtl_device == nil) {
         ctx->mtl_device = MTLCreateSystemDefaultDevice();
+    }
 
+    if (ctx->mtl_device) {
         ctx->has_simdgroup_reduction  = [ctx->mtl_device supportsFamily:MTLGPUFamilyApple7];
         ctx->has_simdgroup_reduction |= [ctx->mtl_device supportsFamily:MTLGPUFamilyMetal3_GGML];
 
@@ -99,8 +101,10 @@ static void ggml_backend_metal_device_rel(struct ggml_backend_metal_device_conte
         ctx->mtl_device_ref_count--;
 
         if (ctx->mtl_device_ref_count == 0) {
-            [ctx->mtl_device release];
-            ctx->mtl_device = nil;
+            if (ctx->mtl_device) {
+                [ctx->mtl_device release];
+                ctx->mtl_device = nil;
+            }
         }
     }
 }
diff --git a/src/llama-context.cpp b/src/llama-context.cpp
index 4e6033ff1..37e43213a 100644
--- a/src/llama-context.cpp
+++ b/src/llama-context.cpp
@@ -7,6 +7,7 @@
 #include
 #include
 #include
+#include
 
 static int32_t llama_relative_position_bucket(llama_pos x, llama_pos y, uint64_t n_buckets, bool bidirectional) {
     // TODO move to hparams if a T5 variant appears that uses a different value
@@ -336,12 +337,55 @@ llama_context::llama_context(const llama_model & model, const llama_context_para
 }
 
 struct llama_batch_manager : public llama_batch_manager_i {
-    llama_batch_manager(llama_context & lctx, const llama_batch & batch, bool logits_all) : lctx(lctx), batch(batch), kv_slot_restorer(lctx.kv_self) {
+    llama_batch_manager(llama_context & lctx, const llama_batch & batch) : lctx(lctx), batch(batch), kv_slot_restorer(lctx.kv_self) {
+        const auto & model   = lctx.model;
+        const auto & cparams = lctx.cparams;
         const auto & hparams = lctx.model.hparams;
 
-        const auto & n_embd  = hparams.n_embd;
         const auto & kv_self = lctx.kv_self;
 
+        const int64_t n_tokens_all = batch.n_tokens;
+        const int64_t n_embd       = hparams.n_embd;
+
+        GGML_ASSERT((!batch.token && batch.embd) || (batch.token && !batch.embd)); // NOLINT
+
+        if (batch.token) {
+            for (int64_t i = 0; i < n_tokens_all; ++i) {
+                if (batch.token[i] < 0 || (uint32_t) batch.token[i] >= model.vocab.n_tokens()) {
+                    LLAMA_LOG_ERROR("%s: invalid token[%" PRId64 "] = %d\n", __func__, i, batch.token[i]);
+                    throw std::runtime_error("invalid token");
+                }
+            }
+        }
+
+        GGML_ASSERT(n_tokens_all <= cparams.n_batch);
+
+        GGML_ASSERT((cparams.causal_attn || cparams.n_ubatch >= n_tokens_all) && "non-causal attention requires n_ubatch >= n_tokens");
+
+        if (lctx.t_compute_start_us == 0) {
+            lctx.t_compute_start_us = ggml_time_us();
+        }
+        lctx.n_queued_tokens += n_tokens_all;
+
+        // this indicates we are doing pooled embedding, so we ignore batch.logits and output all tokens
+        const bool embd_pooled = cparams.embeddings && cparams.pooling_type != LLAMA_POOLING_TYPE_NONE;
+
+        lctx.embd_seq.clear();
+
+        // count outputs
+        if (batch.logits && !embd_pooled) {
+            for (uint32_t i = 0; i < n_tokens_all; ++i) {
+                n_outputs_all += batch.logits[i] != 0;
+            }
+        } else if (lctx.logits_all || embd_pooled) {
+            n_outputs_all = n_tokens_all;
+        } else {
+            // keep last output only
+            n_outputs_all = 1;
+        }
+
+        const bool logits_all = n_outputs_all == n_tokens_all;
+
         lctx.sbatch.from_batch(batch, n_embd,
                 /* simple_split */ !kv_self.recurrent,
                 /* logits_all */   logits_all);
@@ -379,9 +423,29 @@ struct llama_batch_manager : public llama_batch_manager_i {
     virtual bool prepare() override {
         const auto & cparams = lctx.cparams;
         const auto & hparams = lctx.model.hparams;
+        const auto & batch   = lctx.sbatch.batch;
+
+        const auto n_tokens_all = batch->n_tokens;
 
         auto & kv_self = lctx.kv_self;
 
+        // count the outputs in this u_batch
+        {
+            int32_t n_outputs_new = 0;
+
+            if (n_outputs_all == n_tokens_all) {
+                n_outputs_new = ubatch.n_tokens;
+            } else {
+                GGML_ASSERT(ubatch.output);
+                for (uint32_t i = 0; i < ubatch.n_tokens; i++) {
+                    n_outputs_new += (int32_t) (ubatch.output[i] != 0);
+                }
+            }
+
+            // needs to happen before the graph is built
+            lctx.n_outputs = n_outputs_new;
+        }
+
         // non-causal masks do not use the KV cache
         if (hparams.causal_attn) {
             lctx.kv_self_update();
@@ -459,8 +523,8 @@ struct llama_batch_manager : public llama_batch_manager_i {
     llama_kv_slot_restorer kv_slot_restorer;
 };
 
-std::unique_ptr<llama_batch_manager_i> llama_context::prepare_batch(const llama_batch & batch, bool logits_all) {
-    return std::make_unique<llama_batch_manager>(*this, batch, logits_all);
+std::unique_ptr<llama_batch_manager_i> llama_context::prepare_batch(const llama_batch & batch) {
+    return std::make_unique<llama_batch_manager>(*this, batch);
 }
 
 enum ggml_status llama_context::compute_graph(
diff --git a/src/llama-context.h b/src/llama-context.h
index d0356e3ed..1277645de 100644
--- a/src/llama-context.h
+++ b/src/llama-context.h
@@ -28,6 +28,9 @@ struct llama_batch_manager_i {
     virtual void restore() = 0;
     virtual void update() = 0;
     virtual void finalize() = 0;
+
+    // TODO: might be temporary
+    int64_t n_outputs_all = 0;
 };
 
 // TODO: make implementation details private
@@ -98,7 +101,7 @@ struct llama_context {
     void * abort_callback_data = nullptr;
 
     // TODO: do not pass logits_all explicitly
-    std::unique_ptr<llama_batch_manager_i> prepare_batch(const llama_batch & batch, bool logits_all);
+    std::unique_ptr<llama_batch_manager_i> prepare_batch(const llama_batch & batch);
 
     // returns the result of ggml_backend_sched_graph_compute_async execution
     enum ggml_status compute_graph(
diff --git a/src/llama.cpp b/src/llama.cpp
index 408bd9030..f410f7a2f 100644
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -23,6 +23,7 @@
 #include
 #include
 #include
+#include
 
 #if defined(_MSC_VER)
 #pragma warning(disable: 4244 4267) // possible loss of data
@@ -7751,7 +7752,7 @@ static struct ggml_cgraph * llama_build_graph(
 // (for non-recurrent models) or cleaned (for recurrent models)
 //
 //   - lctx:      llama context
-//   - batch:     batch to evaluate
+//   - inp_batch: batch to evaluate
 //
 // return 0 on success
 // return positive int on warning
@@ -7774,98 +7775,34 @@ static int llama_decode_impl(
 
     const llama_batch & batch = batch_allocr.batch;
 
-    const uint32_t n_tokens_all = batch.n_tokens;
-
     const auto & model   = lctx.model;
     const auto & vocab   = model.vocab;
-    const auto & hparams = model.hparams;
     const auto & cparams = lctx.cparams;
+    const auto & hparams = lctx.model.hparams;
 
-    GGML_ASSERT((!batch.token && batch.embd) || (batch.token && !batch.embd)); // NOLINT
-
-    if (batch.token) {
-        for (uint32_t i = 0; i < n_tokens_all; ++i) {
-            if (batch.token[i] < 0 || (uint32_t) batch.token[i] >= model.vocab.n_tokens()) {
-                LLAMA_LOG_ERROR("%s: invalid token[%d] = %d\n", __func__, i, batch.token[i]);
-                return -1;
-            }
-        }
-    }
-
-    GGML_ASSERT(n_tokens_all <= cparams.n_batch);
-
-    GGML_ASSERT((cparams.causal_attn || cparams.n_ubatch >= n_tokens_all) && "non-causal attention requires n_ubatch >= n_tokens");
-
-    if (lctx.t_compute_start_us == 0) {
-        lctx.t_compute_start_us = ggml_time_us();
-    }
-    lctx.n_queued_tokens += n_tokens_all;
-
+    const int32_t n_vocab = vocab.n_tokens();
     const int64_t n_embd  = hparams.n_embd;
-    const int64_t n_vocab = vocab.n_tokens();
 
-    uint32_t n_outputs = 0;
-    uint32_t n_outputs_prev = 0;
+    // TODO: try catch
+    auto bman = lctx.prepare_batch(batch);
 
-    // this indicates we are doing pooled embedding, so we ignore batch.logits and output all tokens
-    const bool embd_pooled = cparams.embeddings && cparams.pooling_type != LLAMA_POOLING_TYPE_NONE;
-
-    lctx.embd_seq.clear();
-
-    // count outputs
-    if (batch.logits && !embd_pooled) {
-        for (uint32_t i = 0; i < n_tokens_all; ++i) {
-            n_outputs += batch.logits[i] != 0;
-        }
-    } else if (lctx.logits_all || embd_pooled) {
-        n_outputs = n_tokens_all;
-    } else {
-        // keep last output only
-        n_outputs = 1;
-    }
+    const auto n_outputs_all = bman->n_outputs_all;
 
     // reserve output buffer
-    if (llama_output_reserve(lctx, n_outputs) < n_outputs) {
-        LLAMA_LOG_ERROR("%s: could not reserve space for batch with %u outputs\n", __func__, n_outputs);
+    // TODO: move to batch manager?
+    if (llama_output_reserve(lctx, bman->n_outputs_all) < (size_t) n_outputs_all) {
+        LLAMA_LOG_ERROR("%s: could not reserve space for batch with %" PRId64 " outputs\n", __func__, n_outputs_all);
         return -2;
     };
 
-    const bool logits_all = n_outputs == n_tokens_all;
-
-    //auto & kv_self = lctx.kv_self;
-    //llama_kv_slot_restorer kv_slot_restorer(kv_self);
-
-    //lctx.sbatch.from_batch(batch, n_embd,
-    //    /* simple_split */ !kv_self.recurrent,
-    //    /* logits_all */   logits_all);
-
-    auto batch_manager = lctx.prepare_batch(batch, logits_all);
+    int64_t n_outputs_prev = 0;
 
     while (lctx.sbatch.n_tokens > 0) {
-        llama_ubatch ubatch = batch_manager->next();
+        llama_ubatch ubatch = bman->next();
 
-        const uint32_t n_tokens = ubatch.n_tokens;
-
-        // count the outputs in this u_batch
-        {
-            int32_t n_outputs_new = 0;
-
-            if (n_outputs == n_tokens_all) {
-                n_outputs_new = n_tokens;
-            } else {
-                GGML_ASSERT(ubatch.output);
-                for (uint32_t i = 0; i < n_tokens; i++) {
-                    n_outputs_new += (int32_t) (ubatch.output[i] != 0);
-                }
-            }
-
-            // needs to happen before the graph is built
-            lctx.n_outputs = n_outputs_new;
-        }
-
-        if (!batch_manager->prepare()) {
+        if (!bman->prepare()) {
             LLAMA_LOG_ERROR("%s: failed to prepare ubatch\n", __func__);
-            batch_manager->restore();
+            bman->restore();
             return -3;
         }
 
@@ -7927,9 +7864,9 @@ static int llama_decode_impl(
             GGML_ASSERT(strcmp(res->name, "result_output") == 0 && "missing result_output tensor");
         }
 
-        const auto compute_status = lctx.compute_graph(gf, n_tokens > 1);
+        const auto compute_status = lctx.compute_graph(gf, ubatch.n_tokens > 1);
         if (compute_status != GGML_STATUS_SUCCESS) {
-            batch_manager->restore();
+            bman->restore();
             switch (compute_status) {
                 case GGML_STATUS_ABORTED:
                     return 2;
@@ -7941,7 +7878,7 @@ static int llama_decode_impl(
             }
         }
 
-        batch_manager->update();
+        bman->update();
 
         // plot the computation graph in dot format (for debugging purposes)
         //if (n_past%100 == 0) {
@@ -7958,7 +7895,7 @@ static int llama_decode_impl(
             const int32_t n_outputs_new = lctx.n_outputs;
 
             if (n_outputs_new) {
-                GGML_ASSERT( n_outputs_prev + n_outputs_new <= n_outputs);
+                GGML_ASSERT( n_outputs_prev + n_outputs_new <= n_outputs_all);
                 GGML_ASSERT((n_outputs_prev + n_outputs_new)*n_vocab <= (int64_t) lctx.logits_size);
                 ggml_backend_tensor_get_async(backend_res, res, logits_out, 0, n_outputs_new*n_vocab*sizeof(float));
             }
@@ -7978,7 +7915,7 @@ static int llama_decode_impl(
                         const int32_t n_outputs_new = lctx.n_outputs;
 
                         if (n_outputs_new) {
-                            GGML_ASSERT( n_outputs_prev + n_outputs_new <= n_outputs);
+                            GGML_ASSERT( n_outputs_prev + n_outputs_new <= n_outputs_all);
                             GGML_ASSERT((n_outputs_prev + n_outputs_new)*n_embd <= (int64_t) lctx.embd_size);
                             ggml_backend_tensor_get_async(backend_embd, embd, embd_out, 0, n_outputs_new*n_embd*sizeof(float));
                         }
@@ -8027,9 +7964,9 @@ static int llama_decode_impl(
     {
         bool sorted_output = true;
 
-        GGML_ASSERT(lctx.sbatch.out_ids.size() == n_outputs);
+        GGML_ASSERT(lctx.sbatch.out_ids.size() == (size_t) n_outputs_all);
 
-        for (size_t i = 0; i < n_outputs; ++i) {
+        for (size_t i = 0; i < (size_t) n_outputs_all; ++i) {
             size_t out_id = lctx.sbatch.out_ids[i];
             lctx.output_ids[out_id] = i;
             if (out_id != i) {
@@ -8043,12 +7980,12 @@ static int llama_decode_impl(
     }
 
     // set to total number of outputs in the batch, for use in llama_get_logits_ith
-    lctx.n_outputs = n_outputs;
+    lctx.n_outputs = n_outputs_all;
 
     // wait for the computation to finish (automatically done when obtaining the model output)
     //llama_synchronize(&lctx);
 
-    batch_manager->finalize();
+    bman->finalize();
 
     // Reset state for the next token before backend sync, to allow the CPU activities in the reset to
     // overlap with device computation.