llama : move tokenizers into llama-vocab
ggml-ci
This commit is contained in:
parent
e7dffa6bc7
commit
8fef5b1897
6 changed files with 2067 additions and 1974 deletions
|
@ -906,10 +906,10 @@ extern "C" {
|
|||
LLAMA_API llama_token llama_token_pad(const struct llama_model * model); // padding
|
||||
|
||||
// Returns -1 if unknown, 1 for true or 0 for false.
|
||||
LLAMA_API int32_t llama_add_bos_token(const struct llama_model * model);
|
||||
LLAMA_API int32_t llama_add_bos_token(const struct llama_model * model);
|
||||
|
||||
// Returns -1 if unknown, 1 for true or 0 for false.
|
||||
LLAMA_API int32_t llama_add_eos_token(const struct llama_model * model);
|
||||
LLAMA_API int32_t llama_add_eos_token(const struct llama_model * model);
|
||||
|
||||
// Codellama infill tokens
|
||||
LLAMA_API llama_token llama_token_prefix(const struct llama_model * model); // Beginning of infill prefix
|
||||
|
|
2046
src/llama-vocab.cpp
2046
src/llama-vocab.cpp
File diff suppressed because it is too large
Load diff
|
@ -62,3 +62,13 @@ struct llama_vocab {
|
|||
};
|
||||
|
||||
const struct llama_vocab * llama_get_vocab(const struct llama_context * ctx);
|
||||
const struct llama_vocab * llama_get_vocab(const struct llama_model * model);
|
||||
|
||||
// TODO: This should probably be in llama.h
|
||||
std::vector<llama_vocab::id> llama_tokenize_internal(
|
||||
const llama_vocab & vocab,
|
||||
std::string raw_text,
|
||||
bool add_special,
|
||||
bool parse_special = false);
|
||||
|
||||
llama_token llama_byte_to_token(const llama_vocab & vocab, uint8_t ch);
|
||||
|
|
1974
src/llama.cpp
1974
src/llama.cpp
File diff suppressed because it is too large
Load diff
|
@ -19,6 +19,12 @@
|
|||
#include <locale>
|
||||
#include <codecvt>
|
||||
|
||||
// Returns the number of bytes in the UTF-8 sequence introduced by the lead byte `src`.
//
// The length is selected by the top four bits of `src`:
//   0x0..0xB -> 1,  0xC..0xD -> 2,  0xE -> 3,  0xF -> 4
// Bytes whose high nibble is 0x8..0xB (UTF-8 continuation bytes) also map to 1, so the
// result is always in the range 1..4 and a caller stepping through a buffer by this
// length always advances.
size_t unicode_len_utf8(char src) {
    // indexed by the high nibble of the lead byte
    static constexpr size_t lookup[] = { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 3, 4 };
    // go through uint8_t before shifting: plain `char` may be signed, and the index must be 0..15
    const uint8_t highbits = static_cast<uint8_t>(src) >> 4;
    return lookup[highbits];
}
|
||||
|
||||
static std::string unicode_cpts_to_utf8(const std::vector<uint32_t> & cps) {
|
||||
std::string result;
|
||||
for (size_t i = 0; i < cps.size(); ++i) {
|
||||
|
|
|
@ -46,6 +46,7 @@ struct codepoint_flags {
|
|||
}
|
||||
};
|
||||
|
||||
size_t unicode_len_utf8(char src);
|
||||
|
||||
std::string unicode_cpt_to_utf8(uint32_t cp);
|
||||
uint32_t unicode_cpt_from_utf8(const std::string & utf8, size_t & offset);
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue