diff --git a/llama.cpp b/llama.cpp index d63a2a888..17b493746 100644 --- a/llama.cpp +++ b/llama.cpp @@ -13545,7 +13545,7 @@ void llama_sample_repetition_penalties( } } -void llama_sample_grammar(struct llama_context * ctx, llama_token_data_array * candidates, struct llama_grammar * grammar) { +void llama_sample_grammar(struct llama_context * ctx, llama_token_data_array * candidates, const struct llama_grammar * grammar) { GGML_ASSERT(ctx); const int64_t t_start_sample_us = ggml_time_us(); @@ -13557,7 +13557,7 @@ void llama_sample_grammar(struct llama_context * ctx, llama_token_data_array * c } } - // Store decoded codepoints when they are not cached. + // Store decoded codepoints when they are not cached (happens when there's a partial utf8 string prefix). std::vector<std::pair<std::vector<uint32_t>, llama_partial_utf8>> candidates_decoded; if (grammar->partial_utf8.n_remain > 0) { candidates_decoded.reserve(candidates->size); diff --git a/llama.h b/llama.h index 9c849c055..8b1b15ed4 100644 --- a/llama.h +++ b/llama.h @@ -961,7 +961,7 @@ extern "C" { LLAMA_API void llama_sample_grammar( struct llama_context * ctx, llama_token_data_array * candidates, - struct llama_grammar * grammar); + const struct llama_grammar * grammar); /// @details Mirostat 1.0 algorithm described in the paper https://arxiv.org/abs/2007.14966. Uses tokens instead of words. /// @param candidates A vector of `llama_token_data` containing the candidate tokens, their probabilities (p), and log-odds (logit) for the current position in the generated text.