vocab : more pimpl (#11165)

ggml-ci
commit a857dc50af (parent 45aab64e93)
Author: Georgi Gerganov
Date:   2025-01-10 10:28:37 +02:00
2 changed files with 537 additions and 423 deletions
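
The change continues the pimpl ("pointer to implementation") refactoring of llama_vocab: per-vocab state moves out of the public header into a private impl struct defined in the .cpp, so the header exposes only the interface. A minimal sketch of the idiom, using a hypothetical widget class rather than the actual llama.cpp code:

// widget.h -- users of the header see only the interface
#include <memory>

class widget {
public:
    widget();
    ~widget(); // declared here, defined where impl is a complete type

    int value() const;

private:
    struct impl;              // forward declaration; the layout stays hidden
    std::unique_ptr<impl> pimpl;
};

// widget.cpp -- all state lives behind the pointer, so changing impl does
// not force recompilation of code that only includes widget.h
struct widget::impl {
    int value = 42;
};

widget::widget() : pimpl(new impl()) {}
widget::~widget() = default; // impl is complete here, so unique_ptr can delete it

int widget::value() const {
    return pimpl->value;
}

The out-of-line destructor is the one non-obvious requirement: std::unique_ptr<impl> needs the complete impl type at the point of deletion, which is why the destructor is only defaulted in the .cpp.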

src/llama-vocab.cpp: file diff suppressed because it is too large.

src/llama-vocab.h

@@ -78,17 +78,10 @@ struct llama_vocab {
     bool escape_whitespaces () const;
     bool treat_whitespace_as_suffix() const;
 
-    int max_token_text_len() const;
-
-    void print_info() const;
+    int max_token_len() const;
 
     int find_bpe_rank(const std::string & token_left, const std::string & token_right) const;
 
-    std::vector<llama_token> tokenize(
-            std::string raw_text,
-            bool add_special,
-            bool parse_special = false) const;
-
     int32_t tokenize(
             const char * text,
             int32_t text_len,
@@ -97,6 +90,11 @@ struct llama_vocab {
             bool add_special,
             bool parse_special) const;
 
+    std::vector<llama_token> tokenize(
+            const std::string & raw_text,
+            bool add_special,
+            bool parse_special = false) const;
+
     // does not write null-terminator to buf
     int32_t token_to_piece(
             llama_token token,
@@ -108,11 +106,6 @@ struct llama_vocab {
     // use cached data
     const std::string & token_to_piece(llama_token token) const;
 
-    // check if token0 is contained as a prefix in token1
-    bool token_is_prefix(
-            llama_token token0,
-            llama_token token1) const;
-
     int32_t detokenize(
             const llama_token * tokens,
             int32_t n_tokens,
@@ -125,48 +118,9 @@ struct llama_vocab {
             const std::vector<llama_token> & tokens,
             bool special) const;
 
+    void print_info() const;
+
 private:
     struct impl;
     std::unique_ptr<impl> pimpl;
-
-    std::string token_to_piece_for_cache(
-            llama_token token,
-            bool special) const;
-
-    enum llama_vocab_type type = LLAMA_VOCAB_TYPE_SPM;
-    enum llama_vocab_pre_type pre_type = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
-
-    int max_token_len = 0; // used for optimizing longest token search
-
-    // default LLaMA special tokens
-    // TODO: should we set all of these to LLAMA_TOKEN_NULL?
-    llama_token special_bos_id = 1;
-    llama_token special_eos_id = 2;
-    llama_token special_eot_id = LLAMA_TOKEN_NULL;
-    llama_token special_eom_id = LLAMA_TOKEN_NULL;
-    llama_token special_unk_id = 0;
-    llama_token special_sep_id = LLAMA_TOKEN_NULL;
-    llama_token special_pad_id = LLAMA_TOKEN_NULL;
-    llama_token special_cls_id = LLAMA_TOKEN_NULL; // TODO: revisit if this is really needed https://github.com/ggerganov/llama.cpp/pull/10930
-    llama_token special_mask_id = LLAMA_TOKEN_NULL;
-
-    llama_token linefeed_id = 13;
-
-    // fim tokens
-    llama_token special_fim_pre_id = LLAMA_TOKEN_NULL;
-    llama_token special_fim_suf_id = LLAMA_TOKEN_NULL;
-    llama_token special_fim_mid_id = LLAMA_TOKEN_NULL;
-    llama_token special_fim_pad_id = LLAMA_TOKEN_NULL;
-    llama_token special_fim_rep_id = LLAMA_TOKEN_NULL; // repo
-    llama_token special_fim_sep_id = LLAMA_TOKEN_NULL; // file separator
-
-    // tokenizer flags
-    bool tokenizer_add_space_prefix = false;
-    bool tokenizer_add_bos = false;
-    bool tokenizer_add_eos = false;
-    bool tokenizer_ignore_merges = false;
-    bool tokenizer_clean_spaces = false; // clean_up_tokenization_spaces
-    bool tokenizer_remove_extra_whitespaces = false;
-    bool tokenizer_escape_whitespaces = true;
-    bool tokenizer_treat_whitespace_as_suffix = false;
-
 };
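
For context, the std::vector overload relocated above is a convenience wrapper around the C-style buffer API. A sketch of one plausible shape for such a wrapper (assumed, not the actual llama.cpp definition: the middle buffer parameters of the C-style tokenize() and its negative return on a too-small buffer are assumptions here):

// would live in src/llama-vocab.cpp, next to the other member definitions
std::vector<llama_token> llama_vocab::tokenize(
        const std::string & raw_text,
        bool add_special,
        bool parse_special) const {
    // worst case, every byte becomes a token; leave room for BOS/EOS
    std::vector<llama_token> result(raw_text.size() + 2);

    const int32_t n_tokens = tokenize(
            raw_text.data(), (int32_t) raw_text.size(),
            result.data(),   (int32_t) result.size(),
            add_special, parse_special);

    // assumed convention: a negative return signals the buffer was too small
    result.resize(n_tokens < 0 ? 0 : n_tokens);
    return result;
}

Note that the default argument (parse_special = false) appears only on the declaration in the header; the out-of-line definition must not repeat it.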