llama : move tokenizers into llama-vocab

ggml-ci
This commit is contained in:
Georgi Gerganov 2024-07-19 15:44:30 +03:00
parent e7dffa6bc7
commit 8fef5b1897
No known key found for this signature in database
GPG key ID: 449E073F9DC10735
6 changed files with 2067 additions and 1974 deletions

View file

@ -906,10 +906,10 @@ extern "C" {
LLAMA_API llama_token llama_token_pad(const struct llama_model * model); // padding
// Returns -1 if unknown, 1 for true or 0 for false.
// NOTE(review): presumably reports whether a BOS token should be prepended for this model - confirm against the definition.
// NOTE(review): the prototype below appears twice with identical tokens; a repeated declaration is legal but redundant
// (it looks like the old and new side of a whitespace-only change).
LLAMA_API int32_t llama_add_bos_token(const struct llama_model * model);
LLAMA_API int32_t llama_add_bos_token(const struct llama_model * model);
// Returns -1 if unknown, 1 for true or 0 for false.
// NOTE(review): presumably reports whether an EOS token should be appended for this model - confirm against the definition.
// NOTE(review): same duplication as above - the prototype is repeated verbatim.
LLAMA_API int32_t llama_add_eos_token(const struct llama_model * model);
LLAMA_API int32_t llama_add_eos_token(const struct llama_model * model);
// Codellama infill tokens
LLAMA_API llama_token llama_token_prefix(const struct llama_model * model); // Beginning of infill prefix

File diff suppressed because it is too large Load diff

View file

@ -62,3 +62,13 @@ struct llama_vocab {
};
// Vocabulary accessors: one overload takes a context, the other a model.
// Both return a pointer to const llama_vocab.
// NOTE(review): ownership/lifetime of the returned pointer is not visible here - presumably owned by the model; confirm.
const struct llama_vocab * llama_get_vocab(const struct llama_context * ctx);
const struct llama_vocab * llama_get_vocab(const struct llama_model * model);
// TODO: This should probably be in llama.h
// Tokenizes raw_text using vocab and returns the resulting sequence of token ids.
//   vocab         - vocabulary to tokenize against (read-only reference)
//   raw_text      - input text; taken by value, so the callee works on its own copy
//   add_special   - NOTE(review): presumably controls insertion of special tokens such as BOS/EOS - confirm
//   parse_special - NOTE(review): presumably controls whether special-token text in the input is recognized - confirm;
//                   defaults to false
std::vector<llama_vocab::id> llama_tokenize_internal(
const llama_vocab & vocab,
std::string raw_text,
bool add_special,
bool parse_special = false);
// Returns the token associated with the single byte ch in vocab.
// NOTE(review): behavior when the vocabulary has no entry for the byte is not visible here - check the definition.
llama_token llama_byte_to_token(const llama_vocab & vocab, uint8_t ch);

File diff suppressed because it is too large Load diff

View file

@ -19,6 +19,12 @@
#include <locale>
#include <codecvt>
// Returns the length in bytes (1-4) of the UTF-8 sequence introduced by the
// lead byte `src`, decided solely by the byte's top four bits:
//   0xF0..0xFF -> 4, 0xE0..0xEF -> 3, 0xC0..0xDF -> 2, everything else -> 1.
// Continuation bytes (0x80..0xBF) are not valid lead bytes; they are reported
// as length 1, so a caller stepping through malformed input still advances.
size_t unicode_len_utf8(char src) {
    // Go through uint8_t so bytes >= 0x80 compare as 128..255 even when
    // plain char is signed.
    const uint8_t lead = static_cast<uint8_t>(src);

    if (lead >= 0xF0) {
        return 4;
    }
    if (lead >= 0xE0) {
        return 3;
    }
    if (lead >= 0xC0) {
        return 2;
    }
    return 1;
}
static std::string unicode_cpts_to_utf8(const std::vector<uint32_t> & cps) {
std::string result;
for (size_t i = 0; i < cps.size(); ++i) {

View file

@ -46,6 +46,7 @@ struct codepoint_flags {
}
};
// Returns the byte length of the UTF-8 sequence that starts with lead byte src.
// NOTE(review): how bytes that are not valid lead bytes are handled is defined by the implementation - check there.
size_t unicode_len_utf8(char src);
// Converts a single code point cp to a std::string.
// NOTE(review): presumably the UTF-8 encoding of cp; handling of out-of-range values is not visible here - confirm.
std::string unicode_cpt_to_utf8(uint32_t cp);
// Reads one code point from utf8 at position offset.
// offset is a non-const reference - presumably advanced past the consumed bytes; confirm against the definition.
uint32_t unicode_cpt_from_utf8(const std::string & utf8, size_t & offset);