llama : minor stuff

2023-12-29 19:32:30 +02:00 · 2023-12-29 19:32:30 +02:00 · 128c213ab5
commit 128c213ab5
parent d24da31d2f
3 changed files with 21 additions and 23 deletions
--- a/llama.cpp
+++ b/llama.cpp
@ -78,7 +78,6 @@
 #include <thread>
 #include <type_traits>
 #include <unordered_map>
-#include <iostream>

 #if defined(_MSC_VER)
 #pragma warning(disable: 4244 4267) // possible loss of data
@ -7006,20 +7005,20 @@ struct llm_tokenizer_bpe {

    void tokenize(const std::string & text, std::vector<llama_vocab::id> & output) {
        int final_prev_index = -1;
+
        std::vector<std::string> word_collection;
-        switch (vocab.type)
-        {
-        case LLAMA_VOCAB_TYPE_BPE:
-            word_collection = bpe_gpt2_preprocess(text);
-            break;
-        case LLAMA_VOCAB_TYPE_DEEPSEEKCODER:
-            word_collection = bpe_deepseek_coder_preprocess(text);
-            break;
-        case LLAMA_VOCAB_TYPE_DEEPSEEKLLM:
-            word_collection = bpe_deepseek_llm_preprocess(text);
-            break;
-        default:
-            break;
+        switch (vocab.type) {
+            case LLAMA_VOCAB_TYPE_BPE:
+                word_collection = bpe_gpt2_preprocess(text);
+                break;
+            case LLAMA_VOCAB_TYPE_DEEPSEEKCODER:
+                word_collection = bpe_deepseek_coder_preprocess(text);
+                break;
+            case LLAMA_VOCAB_TYPE_DEEPSEEKLLM:
+                word_collection = bpe_deepseek_llm_preprocess(text);
+                break;
+            default:
+                break;
        }

        symbols_final.clear();
@ -7147,7 +7146,7 @@ private:
        work_queue.push(bigram);
    }

-    std::vector<std::string> byte_encoding_process(const std::vector<std::string> &bpe_words) {
+    std::vector<std::string> byte_encoding_process(const std::vector<std::string> & bpe_words) {
        std::vector<std::string>bpe_encoded_words;
        for (auto word : bpe_words) {
            std::string text_utf = "";
@ -7164,7 +7163,7 @@ private:
        return bpe_encoded_words;
    }

-    std::vector<size_t> regex_preprocess(const std::wstring & text, const std::vector<size_t> & offsets, const std::wstring& regex_expr) {
+    std::vector<size_t> regex_preprocess(const std::wstring & text, const std::vector<size_t> & offsets, const std::wstring & regex_expr) {
        std::wregex expr(regex_expr);
        std::vector<size_t> bpe_words; // store the offset of each word
        bpe_words.reserve(offsets.size()); // Reserve memory for the approximate size
--- a/llama.h
+++ b/llama.h
@ -68,10 +68,10 @@ extern "C" {
    typedef int32_t llama_seq_id;

    enum llama_vocab_type {
-        LLAMA_VOCAB_TYPE_SPM = 0, // SentencePiece
-        LLAMA_VOCAB_TYPE_BPE = 1, // Byte Pair Encoding
-        LLAMA_VOCAB_TYPE_DEEPSEEKCODER = 2, // deepseek coder
-        LLAMA_VOCAB_TYPE_DEEPSEEKLLM = 3, // deepseek coder
+        LLAMA_VOCAB_TYPE_SPM           = 0, // SentencePiece
+        LLAMA_VOCAB_TYPE_BPE           = 1, // Byte Pair Encoding
+        LLAMA_VOCAB_TYPE_DEEPSEEKCODER = 2, // Deepseek Coder
+        LLAMA_VOCAB_TYPE_DEEPSEEKLLM   = 3, // Deepseek LLM
    };

    enum llama_token_type {
--- a/unicode.h
+++ b/unicode.h
@ -7,7 +7,6 @@
 #include <locale>
 #include <codecvt>
 #include <string>
-#include <cstring>

 static const std::vector<std::pair<uint32_t, uint32_t>> digit_ranges = {
 {0x30, 0x39}, {0xB2, 0xB3}, {0xB9, 0xB9}, {0x660, 0x669}, {0x6F0, 0x6F9}, {0x7C0, 0x7C9}, {0x966, 0x96F}, {0x9E6, 0x9EF}, {0xA66, 0xA6F}, {0xAE6, 0xAEF}, {0xB66, 0xB6F}, {0xBE6, 0xBEF}, {0xC66, 0xC6F},
@ -495,13 +494,13 @@ static const std::vector<std::wstring> deepseek_llm_regex = {
    L"[\U00000030-\U00000039\U000000B2-\U000000B3\U000000B9-\U000000B9\U00000660-\U00000669\U000006F0-\U000006F9\U000007C0-\U000007C9\U00000966-\U0000096F\U000009E6-\U000009EF\U00000A66-\U00000A6F\U00000AE6-\U00000AEF\U00000B66-\U00000B6F\U00000BE6-\U00000BEF\U00000C66-\U00000C6F\U00000CE6-\U00000CEF\U00000D66-\U00000D6F\U00000DE6-\U00000DEF\U00000E50-\U00000E59\U00000ED0-\U00000ED9\U00000F20-\U00000F29\U00001040-\U00001049\U00001090-\U00001099\U00001369-\U00001371\U000017E0-\U000017E9\U00001810-\U00001819\U00001946-\U0000194F\U000019D0-\U000019DA\U00001A80-\U00001A89\U00001A90-\U00001A99\U00001B50-\U00001B59\U00001BB0-\U00001BB9\U00001C40-\U00001C49\U00001C50-\U00001C59\U00002070-\U00002070\U00002074-\U00002079\U00002080-\U00002089\U00002460-\U00002468\U00002474-\U0000247C\U00002488-\U00002490\U000024EA-\U000024EA\U000024F5-\U000024FD\U000024FF-\U000024FF\U00002776-\U0000277E\U00002780-\U00002788\U0000278A-\U00002792\U0000A620-\U0000A629\U0000A8D0-\U0000A8D9\U0000A900-\U0000A909\U0000A9D0-\U0000A9D9\U0000A9F0-\U0000A9F9\U0000AA50-\U0000AA59\U0000ABF0-\U0000ABF9\U0000FF10-\U0000FF19\U000104A0-\U000104A9\U00010A40-\U00010A43\U00010D30-\U00010D39\U00010E60-\U00010E68\U00011052-\U0001105A\U00011066-\U0001106F\U000110F0-\U000110F9\U00011136-\U0001113F\U000111D0-\U000111D9\U000112F0-\U000112F9\U00011450-\U00011459\U000114D0-\U000114D9\U00011650-\U00011659\U000116C0-\U000116C9\U00011730-\U00011739\U000118E0-\U000118E9\U00011950-\U00011959\U00011C50-\U00011C59\U00011D50-\U00011D59\U00011DA0-\U00011DA9\U00016A60-\U00016A69\U00016B50-\U00016B59\U0001D7CE-\U0001D7FF\U0001E140-\U0001E149\U0001E2F0-\U0001E2F9\U0001E950-\U0001E959\U0001F100-\U0001F10A\U0001FBF0-\U0001FBF9]"
    };

-inline std::wstring from_utf8(const std::string& s)
+inline std::wstring from_utf8(const std::string & s)
 {
    std::wstring_convert<std::codecvt_utf8<wchar_t>> conv;
    return conv.from_bytes(s);
 }

-inline std::string to_utf8(const std::wstring& ws)
+inline std::string to_utf8(const std::wstring & ws)
 {
    // code to convert from utf32/utf16 to utf8
    std::wstring_convert<std::codecvt_utf8_utf16<wchar_t>, wchar_t> converter;