vocab : more pimpl (#11165)

ggml-ci
commit a857dc50af (parent 45aab64e93)
Author: Georgi Gerganov
Date:   2025-01-10 10:28:37 +02:00
2 changed files with 537 additions and 423 deletions
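
The change continues the pimpl ("pointer to implementation") refactoring of llama_vocab: per-vocab state moves out of the public header into a private impl struct defined in the .cpp, so the header exposes only the interface. A minimal sketch of the idiom, using a hypothetical widget class rather than the actual llama.cpp code:

// widget.h -- users of the header see only the interface
#include <memory>

class widget {
public:
    widget();
    ~widget(); // declared here, defined where impl is a complete type

    int value() const;

private:
    struct impl;              // forward declaration; the layout stays hidden
    std::unique_ptr<impl> pimpl;
};

// widget.cpp -- all state lives behind the pointer, so changing impl does
// not force recompilation of code that only includes widget.h
struct widget::impl {
    int value = 42;
};

widget::widget() : pimpl(new impl()) {}
widget::~widget() = default; // impl is complete here, so unique_ptr can delete it

int widget::value() const {
    return pimpl->value;
}

The out-of-line destructor is the one non-obvious requirement: std::unique_ptr<impl> needs the complete impl type at the point of deletion, which is why the destructor is only defaulted in the .cpp.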

src/llama-vocab.cpp: file diff suppressed because it is too large.

src/llama-vocab.h

@@ -78,17 +78,10 @@ struct llama_vocab {
     bool escape_whitespaces () const;
     bool treat_whitespace_as_suffix() const;
 
-    int max_token_text_len() const;
-
-    void print_info() const;
+    int max_token_len() const;
 
     int find_bpe_rank(const std::string & token_left, const std::string & token_right) const;
 
-    std::vector<llama_token> tokenize(
-            std::string raw_text,
-            bool add_special,
-            bool parse_special = false) const;
-
     int32_t tokenize(
             const char * text,
             int32_t text_len,
@@ -97,6 +90,11 @@ struct llama_vocab {
             bool add_special,
             bool parse_special) const;
 
+    std::vector<llama_token> tokenize(
+            const std::string & raw_text,
+            bool add_special,
+            bool parse_special = false) const;
+
     // does not write null-terminator to buf
     int32_t token_to_piece(
             llama_token token,
@@ -108,11 +106,6 @@ struct llama_vocab {
     // use cached data
     const std::string & token_to_piece(llama_token token) const;
 
-    // check if token0 is contained as a prefix in token1
-    bool token_is_prefix(
-            llama_token token0,
-            llama_token token1) const;
-
     int32_t detokenize(
             const llama_token * tokens,
             int32_t n_tokens,
@@ -125,48 +118,9 @@ struct llama_vocab {
             const std::vector<llama_token> & tokens,
             bool special) const;
 
+    void print_info() const;
+
 private:
     struct impl;
     std::unique_ptr<impl> pimpl;
-
-    std::string token_to_piece_for_cache(
-            llama_token token,
-            bool special) const;
-
-    enum llama_vocab_type type = LLAMA_VOCAB_TYPE_SPM;
-    enum llama_vocab_pre_type pre_type = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
-
-    int max_token_len = 0; // used for optimizing longest token search
-
-    // default LLaMA special tokens
-    // TODO: should we set all of these to LLAMA_TOKEN_NULL?
-    llama_token special_bos_id = 1;
-    llama_token special_eos_id = 2;
-    llama_token special_eot_id = LLAMA_TOKEN_NULL;
-    llama_token special_eom_id = LLAMA_TOKEN_NULL;
-    llama_token special_unk_id = 0;
-    llama_token special_sep_id = LLAMA_TOKEN_NULL;
-    llama_token special_pad_id = LLAMA_TOKEN_NULL;
-    llama_token special_cls_id = LLAMA_TOKEN_NULL; // TODO: revisit if this is really needed https://github.com/ggerganov/llama.cpp/pull/10930
-    llama_token special_mask_id = LLAMA_TOKEN_NULL;
-
-    llama_token linefeed_id = 13;
-
-    // fim tokens
-    llama_token special_fim_pre_id = LLAMA_TOKEN_NULL;
-    llama_token special_fim_suf_id = LLAMA_TOKEN_NULL;
-    llama_token special_fim_mid_id = LLAMA_TOKEN_NULL;
-    llama_token special_fim_pad_id = LLAMA_TOKEN_NULL;
-    llama_token special_fim_rep_id = LLAMA_TOKEN_NULL; // repo
-    llama_token special_fim_sep_id = LLAMA_TOKEN_NULL; // file separator
-
-    // tokenizer flags
-    bool tokenizer_add_space_prefix = false;
-    bool tokenizer_add_bos = false;
-    bool tokenizer_add_eos = false;
-    bool tokenizer_ignore_merges = false;
-    bool tokenizer_clean_spaces = false; // clean_up_tokenization_spaces
-    bool tokenizer_remove_extra_whitespaces = false;
-    bool tokenizer_escape_whitespaces = true;
-    bool tokenizer_treat_whitespace_as_suffix = false;
-
 };
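
For context, the std::vector overload relocated above is a convenience wrapper around the C-style buffer API. A sketch of one plausible shape for such a wrapper (assumed, not the actual llama.cpp definition: the middle buffer parameters of the C-style tokenize() and its negative return on a too-small buffer are assumptions here):

// would live in src/llama-vocab.cpp, next to the other member definitions
std::vector<llama_token> llama_vocab::tokenize(
        const std::string & raw_text,
        bool add_special,
        bool parse_special) const {
    // worst case, every byte becomes a token; leave room for BOS/EOS
    std::vector<llama_token> result(raw_text.size() + 2);

    const int32_t n_tokens = tokenize(
            raw_text.data(), (int32_t) raw_text.size(),
            result.data(),   (int32_t) result.size(),
            add_special, parse_special);

    // assumed convention: a negative return signals the buffer was too small
    result.resize(n_tokens < 0 ? 0 : n_tokens);
    return result;
}

Note that the default argument (parse_special = false) appears only on the declaration in the header; the out-of-line definition must not repeat it.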