From 128c213ab597ca4ffa38b996e30ce941ec6e53a5 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Fri, 29 Dec 2023 19:32:30 +0200 Subject: [PATCH] llama : minor stuff --- llama.cpp | 31 +++++++++++++++---------------- llama.h | 8 ++++---- unicode.h | 5 ++--- 3 files changed, 21 insertions(+), 23 deletions(-) diff --git a/llama.cpp b/llama.cpp index e34de68a5..2ac9f2cf0 100644 --- a/llama.cpp +++ b/llama.cpp @@ -78,7 +78,6 @@ #include #include #include -#include #if defined(_MSC_VER) #pragma warning(disable: 4244 4267) // possible loss of data @@ -7006,20 +7005,20 @@ struct llm_tokenizer_bpe { void tokenize(const std::string & text, std::vector & output) { int final_prev_index = -1; + std::vector word_collection; - switch (vocab.type) - { - case LLAMA_VOCAB_TYPE_BPE: - word_collection = bpe_gpt2_preprocess(text); - break; - case LLAMA_VOCAB_TYPE_DEEPSEEKCODER: - word_collection = bpe_deepseek_coder_preprocess(text); - break; - case LLAMA_VOCAB_TYPE_DEEPSEEKLLM: - word_collection = bpe_deepseek_llm_preprocess(text); - break; - default: - break; + switch (vocab.type) { + case LLAMA_VOCAB_TYPE_BPE: + word_collection = bpe_gpt2_preprocess(text); + break; + case LLAMA_VOCAB_TYPE_DEEPSEEKCODER: + word_collection = bpe_deepseek_coder_preprocess(text); + break; + case LLAMA_VOCAB_TYPE_DEEPSEEKLLM: + word_collection = bpe_deepseek_llm_preprocess(text); + break; + default: + break; } symbols_final.clear(); @@ -7147,7 +7146,7 @@ private: work_queue.push(bigram); } - std::vector byte_encoding_process(const std::vector &bpe_words) { + std::vector byte_encoding_process(const std::vector & bpe_words) { std::vectorbpe_encoded_words; for (auto word : bpe_words) { std::string text_utf = ""; @@ -7164,7 +7163,7 @@ private: return bpe_encoded_words; } - std::vector regex_preprocess(const std::wstring & text, const std::vector & offsets, const std::wstring& regex_expr) { + std::vector regex_preprocess(const std::wstring & text, const std::vector & offsets, const std::wstring & regex_expr) { std::wregex expr(regex_expr); std::vector bpe_words; // stroe the offset of each word bpe_words.reserve(offsets.size()); // Reserve memory for the approximate size diff --git a/llama.h b/llama.h index 925b37358..897d6da3f 100644 --- a/llama.h +++ b/llama.h @@ -68,10 +68,10 @@ extern "C" { typedef int32_t llama_seq_id; enum llama_vocab_type { - LLAMA_VOCAB_TYPE_SPM = 0, // SentencePiece - LLAMA_VOCAB_TYPE_BPE = 1, // Byte Pair Encoding - LLAMA_VOCAB_TYPE_DEEPSEEKCODER = 2, // deepseek coder - LLAMA_VOCAB_TYPE_DEEPSEEKLLM = 3, // deepseek coder + LLAMA_VOCAB_TYPE_SPM = 0, // SentencePiece + LLAMA_VOCAB_TYPE_BPE = 1, // Byte Pair Encoding + LLAMA_VOCAB_TYPE_DEEPSEEKCODER = 2, // Deepseek Coder + LLAMA_VOCAB_TYPE_DEEPSEEKLLM = 3, // Deepseek LLM }; enum llama_token_type { diff --git a/unicode.h b/unicode.h index 44cc7439a..8cc0a12a9 100644 --- a/unicode.h +++ b/unicode.h @@ -7,7 +7,6 @@ #include #include #include -#include static const std::vector> digit_ranges = { {0x30, 0x39}, {0xB2, 0xB3}, {0xB9, 0xB9}, {0x660, 0x669}, {0x6F0, 0x6F9}, {0x7C0, 0x7C9}, {0x966, 0x96F}, {0x9E6, 0x9EF}, {0xA66, 0xA6F}, {0xAE6, 0xAEF}, {0xB66, 0xB6F}, {0xBE6, 0xBEF}, {0xC66, 0xC6F}, @@ -495,13 +494,13 @@ static const std::vector deepseek_llm_regex = { L"[\U00000030-\U00000039\U000000B2-\U000000B3\U000000B9-\U000000B9\U00000660-\U00000669\U000006F0-\U000006F9\U000007C0-\U000007C9\U00000966-\U0000096F\U000009E6-\U000009EF\U00000A66-\U00000A6F\U00000AE6-\U00000AEF\U00000B66-\U00000B6F\U00000BE6-\U00000BEF\U00000C66-\U00000C6F\U00000CE6-\U00000CEF\U00000D66-\U00000D6F\U00000DE6-\U00000DEF\U00000E50-\U00000E59\U00000ED0-\U00000ED9\U00000F20-\U00000F29\U00001040-\U00001049\U00001090-\U00001099\U00001369-\U00001371\U000017E0-\U000017E9\U00001810-\U00001819\U00001946-\U0000194F\U000019D0-\U000019DA\U00001A80-\U00001A89\U00001A90-\U00001A99\U00001B50-\U00001B59\U00001BB0-\U00001BB9\U00001C40-\U00001C49\U00001C50-\U00001C59\U00002070-\U00002070\U00002074-\U00002079\U00002080-\U00002089\U00002460-\U00002468\U00002474-\U0000247C\U00002488-\U00002490\U000024EA-\U000024EA\U000024F5-\U000024FD\U000024FF-\U000024FF\U00002776-\U0000277E\U00002780-\U00002788\U0000278A-\U00002792\U0000A620-\U0000A629\U0000A8D0-\U0000A8D9\U0000A900-\U0000A909\U0000A9D0-\U0000A9D9\U0000A9F0-\U0000A9F9\U0000AA50-\U0000AA59\U0000ABF0-\U0000ABF9\U0000FF10-\U0000FF19\U000104A0-\U000104A9\U00010A40-\U00010A43\U00010D30-\U00010D39\U00010E60-\U00010E68\U00011052-\U0001105A\U00011066-\U0001106F\U000110F0-\U000110F9\U00011136-\U0001113F\U000111D0-\U000111D9\U000112F0-\U000112F9\U00011450-\U00011459\U000114D0-\U000114D9\U00011650-\U00011659\U000116C0-\U000116C9\U00011730-\U00011739\U000118E0-\U000118E9\U00011950-\U00011959\U00011C50-\U00011C59\U00011D50-\U00011D59\U00011DA0-\U00011DA9\U00016A60-\U00016A69\U00016B50-\U00016B59\U0001D7CE-\U0001D7FF\U0001E140-\U0001E149\U0001E2F0-\U0001E2F9\U0001E950-\U0001E959\U0001F100-\U0001F10A\U0001FBF0-\U0001FBF9]" }; -inline std::wstring from_utf8(const std::string& s) +inline std::wstring from_utf8(const std::string & s) { std::wstring_convert> conv; return conv.from_bytes(s); } -inline std::string to_utf8(const std::wstring& ws) +inline std::string to_utf8(const std::wstring & ws) { // code to convert from utf32/utf16 to utf8 std::wstring_convert, wchar_t> converter;