Merge branch 'master' into gg/flash-attn

ggml-ci
2024-04-24 14:00:32 +03:00 · 2024-04-24 14:00:32 +03:00 · 8937ec5307
commit 8937ec5307
parent 751591d520 3fe847b574
57 changed files with 1576 additions and 8245 deletions
--- a/llama.h
+++ b/llama.h
@ -784,6 +784,9 @@ extern "C" {

    LLAMA_API enum llama_token_type llama_token_get_type(const struct llama_model * model, llama_token token);

+    // Check if the token is supposed to end generation (end-of-generation, eg. EOS, EOT, etc.)
+    LLAMA_API bool llama_token_is_eog(const struct llama_model * model, llama_token token);
+
    // Special tokens
    LLAMA_API llama_token llama_token_bos(const struct llama_model * model); // beginning-of-sentence
    LLAMA_API llama_token llama_token_eos(const struct llama_model * model); // end-of-sentence
@ -797,7 +800,7 @@ extern "C" {
    // Returns -1 if unknown, 1 for true or 0 for false.
    LLAMA_API int32_t         llama_add_eos_token(const struct llama_model * model);

-    // codellama infill tokens
+    // Codellama infill tokens
    LLAMA_API llama_token llama_token_prefix(const struct llama_model * model); // Beginning of infill prefix
    LLAMA_API llama_token llama_token_middle(const struct llama_model * model); // Beginning of infill middle
    LLAMA_API llama_token llama_token_suffix(const struct llama_model * model); // Beginning of infill suffix
@ -826,11 +829,13 @@ extern "C" {
    // Uses the vocabulary in the provided context.
    // Does not write null terminator to the buffer.
    // User code is responsible to remove the leading whitespace of the first non-BOS token when decoding multiple tokens.
+    // @param special If true, special tokens are rendered in the output.
    LLAMA_API int32_t llama_token_to_piece(
              const struct llama_model * model,
                           llama_token   token,
                                  char * buf,
-                               int32_t   length);
+                               int32_t   length,
+                                  bool   special);

    /// Apply chat template. Inspired by hf apply_chat_template() on python.
    /// Both "model" and "custom_template" are optional, but at least one is required. "custom_template" has higher precedence than "model"
@ -983,7 +988,7 @@ extern "C" {
            struct llama_context * ctx,
          llama_token_data_array * candidates);

-    /// @details Randomly selects a token from the candidates based on their probabilities.
+    /// @details Randomly selects a token from the candidates based on their probabilities using the RNG of ctx.
    LLAMA_API llama_token llama_sample_token(
            struct llama_context * ctx,
          llama_token_data_array * candidates);
@ -1070,8 +1075,9 @@ extern "C" {
 // Internal API to be implemented by llama.cpp and used by tests/benchmarks only
 #ifdef LLAMA_API_INTERNAL

-#include <vector>
+#include <random>
 #include <string>
+#include <vector>

 struct ggml_tensor;

@ -1108,6 +1114,10 @@ std::pair<std::vector<uint32_t>, llama_partial_utf8> decode_utf8(
        const std::string & src,
        llama_partial_utf8   partial_start);

+// Randomly selects a token from the candidates based on their probabilities using given std::mt19937.
+// This is a temporary workaround in order to fix race conditions when sampling with multiple sequences.
+llama_token llama_sample_token_with_rng(struct llama_context * ctx, llama_token_data_array * candidates, std::mt19937 & rng);
+
 #endif // LLAMA_API_INTERNAL

 #endif // LLAMA_H