llama : minor stuff

This commit is contained in:
Georgi Gerganov 2023-12-29 19:32:30 +02:00
parent d24da31d2f
commit 128c213ab5
No known key found for this signature in database
GPG key ID: 449E073F9DC10735
3 changed files with 21 additions and 23 deletions

View file

@ -78,7 +78,6 @@
#include <thread>
#include <type_traits>
#include <unordered_map>
#include <iostream>
#if defined(_MSC_VER)
#pragma warning(disable: 4244 4267) // possible loss of data
@ -7006,20 +7005,20 @@ struct llm_tokenizer_bpe {
void tokenize(const std::string & text, std::vector<llama_vocab::id> & output) {
int final_prev_index = -1;
std::vector<std::string> word_collection;
switch (vocab.type)
{
case LLAMA_VOCAB_TYPE_BPE:
word_collection = bpe_gpt2_preprocess(text);
break;
case LLAMA_VOCAB_TYPE_DEEPSEEKCODER:
word_collection = bpe_deepseek_coder_preprocess(text);
break;
case LLAMA_VOCAB_TYPE_DEEPSEEKLLM:
word_collection = bpe_deepseek_llm_preprocess(text);
break;
default:
break;
switch (vocab.type) {
case LLAMA_VOCAB_TYPE_BPE:
word_collection = bpe_gpt2_preprocess(text);
break;
case LLAMA_VOCAB_TYPE_DEEPSEEKCODER:
word_collection = bpe_deepseek_coder_preprocess(text);
break;
case LLAMA_VOCAB_TYPE_DEEPSEEKLLM:
word_collection = bpe_deepseek_llm_preprocess(text);
break;
default:
break;
}
symbols_final.clear();
@ -7147,7 +7146,7 @@ private:
work_queue.push(bigram);
}
std::vector<std::string> byte_encoding_process(const std::vector<std::string> &bpe_words) {
std::vector<std::string> byte_encoding_process(const std::vector<std::string> & bpe_words) {
std::vector<std::string>bpe_encoded_words;
for (auto word : bpe_words) {
std::string text_utf = "";
@ -7164,7 +7163,7 @@ private:
return bpe_encoded_words;
}
std::vector<size_t> regex_preprocess(const std::wstring & text, const std::vector<size_t> & offsets, const std::wstring& regex_expr) {
std::vector<size_t> regex_preprocess(const std::wstring & text, const std::vector<size_t> & offsets, const std::wstring & regex_expr) {
std::wregex expr(regex_expr);
std::vector<size_t> bpe_words; // store the offset of each word
bpe_words.reserve(offsets.size()); // Reserve memory for the approximate size

View file

@ -68,10 +68,10 @@ extern "C" {
typedef int32_t llama_seq_id;
enum llama_vocab_type {
LLAMA_VOCAB_TYPE_SPM = 0, // SentencePiece
LLAMA_VOCAB_TYPE_BPE = 1, // Byte Pair Encoding
LLAMA_VOCAB_TYPE_DEEPSEEKCODER = 2, // deepseek coder
LLAMA_VOCAB_TYPE_DEEPSEEKLLM = 3, // deepseek coder
LLAMA_VOCAB_TYPE_SPM = 0, // SentencePiece
LLAMA_VOCAB_TYPE_BPE = 1, // Byte Pair Encoding
LLAMA_VOCAB_TYPE_DEEPSEEKCODER = 2, // Deepseek Coder
LLAMA_VOCAB_TYPE_DEEPSEEKLLM = 3, // Deepseek LLM
};
enum llama_token_type {

View file

@ -7,7 +7,6 @@
#include <locale>
#include <codecvt>
#include <string>
#include <cstring>
static const std::vector<std::pair<uint32_t, uint32_t>> digit_ranges = {
{0x30, 0x39}, {0xB2, 0xB3}, {0xB9, 0xB9}, {0x660, 0x669}, {0x6F0, 0x6F9}, {0x7C0, 0x7C9}, {0x966, 0x96F}, {0x9E6, 0x9EF}, {0xA66, 0xA6F}, {0xAE6, 0xAEF}, {0xB66, 0xB6F}, {0xBE6, 0xBEF}, {0xC66, 0xC6F},
@ -495,13 +494,13 @@ static const std::vector<std::wstring> deepseek_llm_regex = {
L"[\U00000030-\U00000039\U000000B2-\U000000B3\U000000B9-\U000000B9\U00000660-\U00000669\U000006F0-\U000006F9\U000007C0-\U000007C9\U00000966-\U0000096F\U000009E6-\U000009EF\U00000A66-\U00000A6F\U00000AE6-\U00000AEF\U00000B66-\U00000B6F\U00000BE6-\U00000BEF\U00000C66-\U00000C6F\U00000CE6-\U00000CEF\U00000D66-\U00000D6F\U00000DE6-\U00000DEF\U00000E50-\U00000E59\U00000ED0-\U00000ED9\U00000F20-\U00000F29\U00001040-\U00001049\U00001090-\U00001099\U00001369-\U00001371\U000017E0-\U000017E9\U00001810-\U00001819\U00001946-\U0000194F\U000019D0-\U000019DA\U00001A80-\U00001A89\U00001A90-\U00001A99\U00001B50-\U00001B59\U00001BB0-\U00001BB9\U00001C40-\U00001C49\U00001C50-\U00001C59\U00002070-\U00002070\U00002074-\U00002079\U00002080-\U00002089\U00002460-\U00002468\U00002474-\U0000247C\U00002488-\U00002490\U000024EA-\U000024EA\U000024F5-\U000024FD\U000024FF-\U000024FF\U00002776-\U0000277E\U00002780-\U00002788\U0000278A-\U00002792\U0000A620-\U0000A629\U0000A8D0-\U0000A8D9\U0000A900-\U0000A909\U0000A9D0-\U0000A9D9\U0000A9F0-\U0000A9F9\U0000AA50-\U0000AA59\U0000ABF0-\U0000ABF9\U0000FF10-\U0000FF19\U000104A0-\U000104A9\U00010A40-\U00010A43\U00010D30-\U00010D39\U00010E60-\U00010E68\U00011052-\U0001105A\U00011066-\U0001106F\U000110F0-\U000110F9\U00011136-\U0001113F\U000111D0-\U000111D9\U000112F0-\U000112F9\U00011450-\U00011459\U000114D0-\U000114D9\U00011650-\U00011659\U000116C0-\U000116C9\U00011730-\U00011739\U000118E0-\U000118E9\U00011950-\U00011959\U00011C50-\U00011C59\U00011D50-\U00011D59\U00011DA0-\U00011DA9\U00016A60-\U00016A69\U00016B50-\U00016B59\U0001D7CE-\U0001D7FF\U0001E140-\U0001E149\U0001E2F0-\U0001E2F9\U0001E950-\U0001E959\U0001F100-\U0001F10A\U0001FBF0-\U0001FBF9]"
};
inline std::wstring from_utf8(const std::string& s)
// Decode a UTF-8 encoded byte string into a wide string.
// The conversion is delegated to std::wstring_convert::from_bytes using the
// std::codecvt_utf8<wchar_t> facet.
inline std::wstring from_utf8(const std::string & s)
{
    using utf8_facet = std::codecvt_utf8<wchar_t>;

    std::wstring_convert<utf8_facet> decoder;
    std::wstring wide = decoder.from_bytes(s);
    return wide;
}
inline std::string to_utf8(const std::wstring& ws)
inline std::string to_utf8(const std::wstring & ws)
{
// code to convert from utf32/utf16 to utf8
std::wstring_convert<std::codecvt_utf8_utf16<wchar_t>, wchar_t> converter;