From 0485fa65a2fc3159ea9fb2ad7661a5837038b31d Mon Sep 17 00:00:00 2001
From: Concedo <39025047+LostRuins@users.noreply.github.com>
Date: Sat, 24 Jun 2023 11:43:42 +0800
Subject: [PATCH] wstring convert for mpt

---
 gpttype_adapter.cpp  |  3 ++-
 llama.cpp            |  4 ++--
 otherarch/mpt_v3.cpp | 14 ++++++++++++--
 otherarch/utils.cpp  | 10 ++++++++--
 otherarch/utils.h    |  6 ++++++
 5 files changed, 30 insertions(+), 7 deletions(-)

diff --git a/gpttype_adapter.cpp b/gpttype_adapter.cpp
index 8c716c84a..b166e2aac 100644
--- a/gpttype_adapter.cpp
+++ b/gpttype_adapter.cpp
@@ -313,6 +313,7 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in
     = gpt2_ctx_v1.hparams.n_ctx = gpt2_ctx_v2.hparams.n_ctx = gpt2_ctx_v3.hparams.n_ctx = mpt_ctx_v3.hparams.n_ctx = params.n_ctx;
 
+    //this is used for the mem_per_token eval, openblas needs more RAM
     bool use_scratch = ggml_cpu_has_gpublas();
 
     printf("System Info: %s\n", llama_print_system_info());
@@ -904,7 +905,7 @@ generation_outputs gpttype_generate(const generation_inputs inputs, generation_o
     concat_output = "";
 
     bool startedsampling = false;
-    bool use_scratch = true;
+    bool use_scratch = true; //for normal inference always use scratch
 
     timer_start();
     double time1 = 0, time2 = 0;
diff --git a/llama.cpp b/llama.cpp
index aa67038e0..5259fd52e 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -105,7 +105,7 @@ static const std::map<e_model, size_t> & MEM_REQ_KV_SELF()
         { MODEL_3B,    682ull * MB },
         { MODEL_7B,   1026ull * MB },
         { MODEL_13B,  1608ull * MB },
-        { MODEL_30B,  3124ull * MB },
+        { MODEL_30B,  3224ull * MB },
         { MODEL_65B,  5120ull * MB },
     };
     return k_sizes;
@@ -119,7 +119,7 @@ static const std::map<e_model, size_t> & MEM_REQ_EVAL()
         { MODEL_3B,   512ull * MB },
         { MODEL_7B,   800ull * MB },
         { MODEL_13B, 1024ull * MB },
-        { MODEL_30B, 1280ull * MB },
+        { MODEL_30B, 1380ull * MB },
         { MODEL_65B, 1536ull * MB },
     };
     return k_sizes;
diff --git a/otherarch/mpt_v3.cpp b/otherarch/mpt_v3.cpp
index a60172f51..b611b0703 100644
--- a/otherarch/mpt_v3.cpp
+++ b/otherarch/mpt_v3.cpp
@@ -86,6 +86,16 @@ bool mpt_model_load(const std::string & fname, mpt_model & model, gpt_vocab & vo
             fin.read((char *) buf.data(), len);
             word.assign(buf.data(), len);
 
+            // Convert token from utf-8
+            std::wstring word_multibytes = convert_to_wstring(word);
+            if(word_multibytes!=L"")
+            {
+                word.resize(word_multibytes.size());
+                for (int w = 0; w < word_multibytes.size(); w++) {
+                    word[w] = uint8_t(word_multibytes[w]);
+                }
+            }
+
             vocab.token_to_id[word] = i;
             vocab.id_to_token[i] = word;
         }
@@ -123,8 +133,8 @@ bool mpt_model_load(const std::string & fname, mpt_model & model, gpt_vocab & vo
         ctx_size += n_layer * (4 * n_embd * n_embd * ggml_type_sizef(wtype)); // mlp_mlp_up_weight
         ctx_size += n_layer * (n_embd * n_embd * 4 * ggml_type_sizef(wtype)); // mlp_mlp_down_weight
 
-        ctx_size += (n_ctx * n_layer * n_embd * ggml_type_sizef(GGML_TYPE_F16)); // memory_k
-        ctx_size += (n_ctx * n_layer * n_embd * ggml_type_sizef(GGML_TYPE_F16)); // memory_v
+        ctx_size += n_ctx * n_layer * n_embd * ggml_type_sizef(GGML_TYPE_F16); // memory_k
+        ctx_size += n_ctx * n_layer * n_embd * ggml_type_sizef(GGML_TYPE_F16); // memory_v
 
         ctx_size += (6 + 6 * n_layer) * 512; // object overhead
diff --git a/otherarch/utils.cpp b/otherarch/utils.cpp
index 57c362934..02637069a 100644
--- a/otherarch/utils.cpp
+++ b/otherarch/utils.cpp
@@ -122,8 +122,14 @@ std::string convert_to_utf8(const std::wstring & input) {
 }
 
 std::wstring convert_to_wstring(const std::string & input) {
-    std::wstring_convert<std::codecvt_utf8<wchar_t>> converter;
-    return converter.from_bytes(input);
+    try {
+        std::wstring_convert<std::codecvt_utf8<wchar_t>> converter;
+        return converter.from_bytes(input);
+    } catch (const std::range_error& e) {
+        return L"";
+    } catch (...) {
+        return L"";
+    }
 }
 
 void gpt_split_words(std::string str, std::vector<std::string>& words) {
diff --git a/otherarch/utils.h b/otherarch/utils.h
index bb57a8242..f9857823f 100644
--- a/otherarch/utils.h
+++ b/otherarch/utils.h
@@ -34,6 +34,12 @@ void utreplace(std::string & str, const std::string & needle, const std::string
 // poor-man's JSON parsing
 std::map<std::string, int32_t> json_parse(const std::string & fname);
 
+std::string convert_to_utf8(const std::wstring & input);
+
+std::wstring convert_to_wstring(const std::string & input);
+
+void gpt_split_words(std::string str, std::vector<std::string>& words);
+
 // split text into tokens
 //
 // ref: https://github.com/openai/gpt-2/blob/a74da5d99abaaba920de8131d64da2862a8f213b/src/encoder.py#L53
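
Note: a minimal sketch (illustrative only, not part of the applied diff) of the
failure mode the new try/catch guards against, assuming otherarch/utils.h is on
the include path. std::wstring_convert::from_bytes throws std::range_error when
it hits a malformed UTF-8 sequence, which a vocab token read as raw bytes can
contain; with this patch convert_to_wstring returns L"" instead, and the MPT
vocab loader falls back to keeping the token's original bytes.

    #include <iostream>
    #include <string>
    #include "otherarch/utils.h"

    int main() {
        std::string ok  = "h\xC3\xA9llo"; // well-formed UTF-8 ("héllo")
        std::string bad = "abc\xC3";      // truncated multi-byte sequence

        std::wstring w1 = convert_to_wstring(ok);  // decodes normally
        std::wstring w2 = convert_to_wstring(bad); // threw before this patch; now L""

        std::cout << w1.size() << '\n'; // 5 wide characters
        std::cout << (w2.empty() ? "malformed input handled" : "decoded") << '\n';
    }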