llama : move tokenizers into llama-vocab

ggml-ci
2024-07-19 15:44:30 +03:00 · 2024-07-19 15:44:30 +03:00 · 8fef5b1897
commit 8fef5b1897
parent e7dffa6bc7
6 changed files with 2067 additions and 1974 deletions
--- a/include/llama.h
+++ b/include/llama.h
@ -906,10 +906,10 @@ extern "C" {
    LLAMA_API llama_token llama_token_pad(const struct llama_model * model); // padding
    // Returns -1 if unknown, 1 for true or 0 for false.
-    LLAMA_API int32_t         llama_add_bos_token(const struct llama_model * model);
+    LLAMA_API int32_t llama_add_bos_token(const struct llama_model * model);
    // Returns -1 if unknown, 1 for true or 0 for false.
-    LLAMA_API int32_t         llama_add_eos_token(const struct llama_model * model);
+    LLAMA_API int32_t llama_add_eos_token(const struct llama_model * model);
    // Codellama infill tokens
    LLAMA_API llama_token llama_token_prefix(const struct llama_model * model); // Beginning of infill prefix
--- a/src/llama-vocab.cpp
+++ b/src/llama-vocab.cpp
--- a/src/llama-vocab.h
+++ b/src/llama-vocab.h
@ -62,3 +62,13 @@ struct llama_vocab {
 };
 const struct llama_vocab * llama_get_vocab(const struct llama_context * ctx);
 const struct llama_vocab * llama_get_vocab(const struct llama_model   * model);
 // TODO: This should probably be in llama.h
 // Declaration only; the definition is not part of this header.
 // Takes the vocabulary by const reference and the text by value, and returns a vector of llama_vocab::id.
 // parse_special may be omitted by callers and then defaults to false.
 // NOTE(review): presumably add_special controls insertion of special tokens (e.g. BOS/EOS) and parse_special
 //               controls whether special-token text inside raw_text is recognized - confirm against the definition.
 std::vector<llama_vocab::id> llama_tokenize_internal(
        const llama_vocab & vocab,
        std::string raw_text,
        bool add_special,
        bool parse_special = false);
 llama_token llama_byte_to_token(const llama_vocab & vocab, uint8_t ch);
--- a/src/llama.cpp
+++ b/src/llama.cpp
--- a/src/unicode.cpp
+++ b/src/unicode.cpp
@ -19,6 +19,12 @@
 #include <locale>
 #include <codecvt>
 // Returns the number of bytes in the UTF-8 sequence that the lead byte `src` announces.
 // Only the top four bits of the byte are significant:
 //   0x0-0xB -> 1 (ASCII; bytes 0x80-0xBF are also reported as 1)
 //   0xC-0xD -> 2
 //   0xE     -> 3
 //   0xF     -> 4
 size_t unicode_len_utf8(char src) {
    // go through uint8_t so that a signed char with the high bit set does not compare as negative
    const uint8_t lead = static_cast<uint8_t>(src);

    if (lead < 0xC0) {
        return 1;
    }
    if (lead < 0xE0) {
        return 2;
    }
    if (lead < 0xF0) {
        return 3;
    }
    return 4;
 }
 static std::string unicode_cpts_to_utf8(const std::vector<uint32_t> & cps) {
    std::string result;
    for (size_t i = 0; i < cps.size(); ++i) {
--- a/src/unicode.h
+++ b/src/unicode.h
@ -46,6 +46,7 @@ struct codepoint_flags {
    }
 };
 // Byte length of the UTF-8 sequence announced by lead byte `src`, decided by its top four bits:
 // 0x0-0xB -> 1, 0xC-0xD -> 2, 0xE -> 3, 0xF -> 4.
 size_t unicode_len_utf8(char src);
 std::string unicode_cpt_to_utf8(uint32_t cp);
 uint32_t unicode_cpt_from_utf8(const std::string & utf8, size_t & offset);