llama : towards llama3 tokenization support (wip)

This commit is contained in:
Georgi Gerganov 2024-04-26 14:55:03 +03:00
parent ed42711b90
commit 4907e41aa7
No known key found for this signature in database
GPG key ID: 449E073F9DC10735
8 changed files with 298 additions and 121 deletions

View file

@@ -67,8 +67,6 @@ extern "C" {
LLAMA_VOCAB_TYPE_SPM = 1, // LLaMA tokenizer based on byte-level BPE with byte fallback
LLAMA_VOCAB_TYPE_BPE = 2, // GPT-2 tokenizer based on byte-level BPE
LLAMA_VOCAB_TYPE_WPM = 3, // BERT tokenizer based on WordPiece
LLAMA_VOCAB_TYPE_DEEPSEEKCODER = 4, // Deepseek Coder
LLAMA_VOCAB_TYPE_DEEPSEEKLLM = 5, // Deepseek LLM
};
// note: these values should be synchronized with ggml_rope