diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index d3e4651c7..c1e36ee28 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -10,10 +10,10 @@ on: push: branches: - master - paths: ['.github/workflows/**', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu'] + paths: ['.github/workflows/**', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift'] pull_request: types: [opened, synchronize, reopened] - paths: ['**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu'] + paths: ['**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift'] env: BRANCH_NAME: ${{ github.head_ref || github.ref_name }} @@ -258,7 +258,7 @@ jobs: strategy: matrix: - destination: ['platform=macOS,name=Any Mac', 'platform=iOS,name=Any iOS Device', 'platform=tvOS,name=Any tvOS Device'] + destination: ['generic/platform=macOS', 'generic/platform=iOS', 'generic/platform=tvOS'] steps: - name: Clone diff --git a/common/common.cpp b/common/common.cpp index e63737de4..60b00b5fb 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -1022,10 +1022,11 @@ llama_token llama_sample_token( id = llama_sample_token_mirostat_v2(ctx, &cur_p, mirostat_tau, mirostat_eta, &mirostat_mu); } else { // Temperature sampling - llama_sample_top_k (ctx, &cur_p, top_k, 1); - llama_sample_tail_free (ctx, &cur_p, tfs_z, 1); - llama_sample_typical (ctx, &cur_p, typical_p, 1); - llama_sample_top_p (ctx, &cur_p, top_p, 1); + size_t min_keep = std::max(1, params.n_probs); + llama_sample_top_k (ctx, &cur_p, top_k, min_keep); + llama_sample_tail_free (ctx, &cur_p, tfs_z, min_keep); + llama_sample_typical (ctx, &cur_p, typical_p, min_keep); + llama_sample_top_p (ctx, &cur_p, top_p, min_keep); llama_sample_temp(ctx, &cur_p, temp); { diff --git a/examples/server/server.cpp b/examples/server/server.cpp index 5f9cdecd5..c53a64867 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -534,98 +534,20 @@ struct llama_server_context return result; } - // out of user input, sample next token - const float temp = params.temp; - const int32_t top_k = params.top_k <= 0 ? llama_n_vocab(model) : params.top_k; - const float top_p = params.top_p; - const float tfs_z = params.tfs_z; - const float typical_p = params.typical_p; - const int32_t repeat_last_n = params.repeat_last_n < 0 ? n_ctx : params.repeat_last_n; - const float repeat_penalty = params.repeat_penalty; - const float alpha_presence = params.presence_penalty; - const float alpha_frequency = params.frequency_penalty; - const int mirostat = params.mirostat; - const float mirostat_tau = params.mirostat_tau; - const float mirostat_eta = params.mirostat_eta; - const bool penalize_nl = params.penalize_nl; - const int32_t n_probs = params.n_probs; - { - auto *logits = llama_get_logits(ctx); - auto n_vocab = llama_n_vocab(model); - - // Apply params.logit_bias map - for (const auto &it : params.logit_bias) - { - logits[it.first] += it.second; - } - + // out of user input, sample next token std::vector candidates; - candidates.reserve(n_vocab); - for (llama_token token_id = 0; token_id < n_vocab; token_id++) + candidates.reserve(llama_n_vocab(model)); + + result.tok = llama_sample_token(ctx, NULL, grammar, params, last_n_tokens, candidates); + + llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false }; + + const int32_t n_probs = params.n_probs; + if (params.temp <= 0 && n_probs > 0) { - candidates.emplace_back(llama_token_data{token_id, logits[token_id], 0.0f}); - } - - llama_token_data_array candidates_p = {candidates.data(), candidates.size(), false}; - - // Apply penalties - float nl_logit = logits[llama_token_nl(ctx)]; - auto last_n_repeat = std::min(std::min((int)last_n_tokens.size(), repeat_last_n), n_ctx); - llama_sample_repetition_penalty(ctx, &candidates_p, - last_n_tokens.data() + last_n_tokens.size() - last_n_repeat, - last_n_repeat, repeat_penalty); - llama_sample_frequency_and_presence_penalties(ctx, &candidates_p, - last_n_tokens.data() + last_n_tokens.size() - last_n_repeat, - last_n_repeat, alpha_frequency, alpha_presence); - if (!penalize_nl) - { - logits[llama_token_nl(ctx)] = nl_logit; - } - - if (grammar != nullptr) { - llama_sample_grammar(ctx, &candidates_p, grammar); - } - - if (temp <= 0) - { - // Greedy sampling - result.tok = llama_sample_token_greedy(ctx, &candidates_p); - if (n_probs > 0) - { - llama_sample_softmax(ctx, &candidates_p); - } - } - else - { - if (mirostat == 1) - { - static float mirostat_mu = 2.0f * mirostat_tau; - const int mirostat_m = 100; - llama_sample_temp(ctx, &candidates_p, temp); - result.tok = llama_sample_token_mirostat(ctx, &candidates_p, mirostat_tau, mirostat_eta, mirostat_m, &mirostat_mu); - } - else if (mirostat == 2) - { - static float mirostat_mu = 2.0f * mirostat_tau; - llama_sample_temp(ctx, &candidates_p, temp); - result.tok = llama_sample_token_mirostat_v2(ctx, &candidates_p, mirostat_tau, mirostat_eta, &mirostat_mu); - } - else - { - // Temperature sampling - size_t min_keep = std::max(1, n_probs); - llama_sample_top_k(ctx, &candidates_p, top_k, min_keep); - llama_sample_tail_free(ctx, &candidates_p, tfs_z, min_keep); - llama_sample_typical(ctx, &candidates_p, typical_p, min_keep); - llama_sample_top_p(ctx, &candidates_p, top_p, min_keep); - llama_sample_temp(ctx, &candidates_p, temp); - result.tok = llama_sample_token(ctx, &candidates_p); - } - } - - if (grammar != nullptr) { - llama_grammar_accept_token(ctx, grammar, result.tok); + // For llama_sample_token_greedy we need to sort candidates + llama_sample_softmax(ctx, &candidates_p); } for (size_t i = 0; i < std::min(candidates_p.size, (size_t)n_probs); ++i) diff --git a/llama.cpp b/llama.cpp index 0b28f4e66..1a7d37b8d 100644 --- a/llama.cpp +++ b/llama.cpp @@ -125,6 +125,27 @@ static void replace_all(std::string & s, const std::string & search, const std:: } s = std::move(result); } + +static bool is_float_close(float a, float b, float abs_tol) { + // Check for non-negative tolerance + if (abs_tol < 0.0) { + throw std::invalid_argument("Tolerance must be non-negative"); + } + + // Exact equality check + if (a == b) { + return true; + } + + // Check for infinities + if (std::isinf(a) || std::isinf(b)) { + return false; + } + + // Regular comparison using the provided absolute tolerance + return std::fabs(b - a) <= abs_tol; +} + #ifdef GGML_USE_CPU_HBM #include #endif @@ -969,7 +990,24 @@ struct llama_hparams { float rope_freq_scale_train; bool operator!=(const llama_hparams & other) const { - return static_cast(memcmp(this, &other, sizeof(llama_hparams))); // NOLINT + if (this->vocab_only != other.vocab_only) return true; + if (this->n_vocab != other.n_vocab) return true; + if (this->n_ctx_train != other.n_ctx_train) return true; + if (this->n_embd != other.n_embd) return true; + if (this->n_head != other.n_head) return true; + if (this->n_head_kv != other.n_head_kv) return true; + if (this->n_layer != other.n_layer) return true; + if (this->n_rot != other.n_rot) return true; + if (this->n_ff != other.n_ff) return true; + + const float EPSILON = 1e-9; + + if (!is_float_close(this->f_norm_eps, other.f_norm_eps, EPSILON)) return true; + if (!is_float_close(this->f_norm_rms_eps, other.f_norm_rms_eps, EPSILON)) return true; + if (!is_float_close(this->rope_freq_base_train, other.rope_freq_base_train, EPSILON)) return true; + if (!is_float_close(this->rope_freq_scale_train, other.rope_freq_scale_train, EPSILON)) return true; + + return false; } uint32_t n_gqa() const {