From 0485fa65a2fc3159ea9fb2ad7661a5837038b31d Mon Sep 17 00:00:00 2001
From: Concedo <39025047+LostRuins@users.noreply.github.com>
Date: Sat, 24 Jun 2023 11:43:42 +0800
Subject: [PATCH] wstring convert for mpt

---
 gpttype_adapter.cpp  |  3 ++-
 llama.cpp            |  4 ++--
 otherarch/mpt_v3.cpp | 14 ++++++++++++--
 otherarch/utils.cpp  | 10 ++++++++--
 otherarch/utils.h    |  6 ++++++
 5 files changed, 30 insertions(+), 7 deletions(-)

diff --git a/gpttype_adapter.cpp b/gpttype_adapter.cpp
index 8c716c84a..b166e2aac 100644
--- a/gpttype_adapter.cpp
+++ b/gpttype_adapter.cpp
@@ -313,6 +313,7 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in
     = gpt2_ctx_v1.hparams.n_ctx = gpt2_ctx_v2.hparams.n_ctx = gpt2_ctx_v3.hparams.n_ctx = mpt_ctx_v3.hparams.n_ctx = params.n_ctx;
 
+    //this is used for the mem_per_token eval, openblas needs more RAM
     bool use_scratch = ggml_cpu_has_gpublas();
 
     printf("System Info: %s\n", llama_print_system_info());
@@ -904,7 +905,7 @@ generation_outputs gpttype_generate(const generation_inputs inputs, generation_o
     concat_output = "";
 
     bool startedsampling = false;
-    bool use_scratch = true;
+    bool use_scratch = true; //for normal inference always use scratch
 
     timer_start();
     double time1 = 0, time2 = 0;
diff --git a/llama.cpp b/llama.cpp
index aa67038e0..5259fd52e 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -105,7 +105,7 @@ static const std::map<e_model, size_t> & MEM_REQ_KV_SELF()
         { MODEL_3B,    682ull * MB },
         { MODEL_7B,   1026ull * MB },
         { MODEL_13B,  1608ull * MB },
-        { MODEL_30B,  3124ull * MB },
+        { MODEL_30B,  3224ull * MB },
         { MODEL_65B,  5120ull * MB },
     };
     return k_sizes;
@@ -119,7 +119,7 @@ static const std::map<e_model, size_t> & MEM_REQ_EVAL()
         { MODEL_3B,   512ull * MB },
         { MODEL_7B,   800ull * MB },
         { MODEL_13B, 1024ull * MB },
-        { MODEL_30B, 1280ull * MB },
+        { MODEL_30B, 1380ull * MB },
         { MODEL_65B, 1536ull * MB },
     };
     return k_sizes;
diff --git a/otherarch/mpt_v3.cpp b/otherarch/mpt_v3.cpp
index a60172f51..b611b0703 100644
--- a/otherarch/mpt_v3.cpp
+++ b/otherarch/mpt_v3.cpp
@@ -86,6 +86,16 @@ bool mpt_model_load(const std::string & fname, mpt_model & model, gpt_vocab & vo
             fin.read((char *) buf.data(), len);
             word.assign(buf.data(), len);
 
+            // Convert token from utf-8
+            std::wstring word_multibytes = convert_to_wstring(word);
+            if(word_multibytes!=L"")
+            {
+                word.resize(word_multibytes.size());
+                for (int w = 0; w < word_multibytes.size(); w++) {
+                    word[w] = uint8_t(word_multibytes[w]);
+                }
+            }
+
             vocab.token_to_id[word] = i;
             vocab.id_to_token[i] = word;
         }
@@ -123,8 +133,8 @@ bool mpt_model_load(const std::string & fname, mpt_model & model, gpt_vocab & vo
         ctx_size += n_layer * (4 * n_embd * n_embd * ggml_type_sizef(wtype)); // mlp_mlp_up_weight
         ctx_size += n_layer * (n_embd * n_embd * 4 * ggml_type_sizef(wtype)); // mlp_mlp_down_weight
 
-        ctx_size += (n_ctx * n_layer * n_embd * ggml_type_sizef(GGML_TYPE_F16)); // memory_k
-        ctx_size += (n_ctx * n_layer * n_embd * ggml_type_sizef(GGML_TYPE_F16)); // memory_v
+        ctx_size += n_ctx * n_layer * n_embd * ggml_type_sizef(GGML_TYPE_F16); // memory_k
+        ctx_size += n_ctx * n_layer * n_embd * ggml_type_sizef(GGML_TYPE_F16); // memory_v
 
         ctx_size += (6 + 6 * n_layer) * 512; // object overhead
diff --git a/otherarch/utils.cpp b/otherarch/utils.cpp
index 57c362934..02637069a 100644
--- a/otherarch/utils.cpp
+++ b/otherarch/utils.cpp
@@ -122,8 +122,14 @@ std::string convert_to_utf8(const std::wstring & input) {
 }
 
 std::wstring convert_to_wstring(const std::string & input) {
-    std::wstring_convert<std::codecvt_utf8<wchar_t>> converter;
-    return converter.from_bytes(input);
+    try {
+        std::wstring_convert<std::codecvt_utf8<wchar_t>> converter;
+        return converter.from_bytes(input);
+    } catch (const std::range_error& e) {
+        return L"";
+    } catch (...) {
+        return L"";
+    }
 }
 
 void gpt_split_words(std::string str, std::vector<std::string>& words) {
diff --git a/otherarch/utils.h b/otherarch/utils.h
index bb57a8242..f9857823f 100644
--- a/otherarch/utils.h
+++ b/otherarch/utils.h
@@ -34,6 +34,12 @@ void utreplace(std::string & str, const std::string & needle, const std::string
 // poor-man's JSON parsing
 std::map<std::string, int32_t> json_parse(const std::string & fname);
 
+std::string convert_to_utf8(const std::wstring & input);
+
+std::wstring convert_to_wstring(const std::string & input);
+
+void gpt_split_words(std::string str, std::vector<std::string>& words);
+
 // split text into tokens
 //
 // ref: https://github.com/openai/gpt-2/blob/a74da5d99abaaba920de8131d64da2862a8f213b/src/encoder.py#L53
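
Note: a minimal sketch (illustrative only, not part of the applied diff) of the
failure mode the new try/catch guards against, assuming otherarch/utils.h is on
the include path. std::wstring_convert::from_bytes throws std::range_error when
it hits a malformed UTF-8 sequence, which a vocab token read as raw bytes can
contain; with this patch convert_to_wstring returns L"" instead, and the MPT
vocab loader falls back to keeping the token's original bytes.

    #include <iostream>
    #include <string>
    #include "otherarch/utils.h"

    int main() {
        std::string ok  = "h\xC3\xA9llo"; // well-formed UTF-8 ("héllo")
        std::string bad = "abc\xC3";      // truncated multi-byte sequence

        std::wstring w1 = convert_to_wstring(ok);  // decodes normally
        std::wstring w2 = convert_to_wstring(bad); // threw before this patch; now L""

        std::cout << w1.size() << '\n'; // 5 wide characters
        std::cout << (w2.empty() ? "malformed input handled" : "decoded") << '\n';
    }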