Windows: convert the prompt from the system locale to UTF-8. This allows using other languages without extra workarounds.

2023-03-30 04:22:45 +05:00 · 2023-03-30 04:22:45 +05:00 · dc5adf173a
commit dc5adf173a
parent 9cbc404ba6
1 changed files with 21 additions and 0 deletions
--- a/examples/main/main.cpp
+++ b/examples/main/main.cpp
@ -18,6 +18,10 @@
 #include <signal.h>
 #endif

+#if defined (_WIN32)
+#include <windows.h>
+#endif
+
 static console_state con_st;

 static bool is_interacting = false;
@ -36,6 +40,18 @@ void sigint_handler(int signo) {
 }
 #endif

+#if defined (_WIN32)
+// Convert a string from the active Windows ANSI code page (CP_ACP) to UTF-8.
+//
+// The conversion goes through UTF-16 using MultiByteToWideChar followed by
+// WideCharToMultiByte. The required buffer sizes are queried from the API
+// before converting, so the input length is not capped by a fixed-size buffer.
+//
+// Returns the UTF-8 encoded string. If the input is empty, is too long to be
+// described by an int length, or either conversion step fails, the input is
+// returned unchanged.
+std::string promptconvert(const std::string & str)
+{
+    // The Win32 conversion functions take lengths as int; make sure the size
+    // survives the round trip before using it
+    const int len = static_cast<int>(str.size());
+    if (len <= 0 || static_cast<size_t>(len) != str.size()) {
+        return str;
+    }
+
+    // Current code page -> UTF-16: a zero-sized output buffer asks for the required length
+    const int wlen = MultiByteToWideChar(CP_ACP, 0, str.data(), len, nullptr, 0);
+    if (wlen <= 0) {
+        return str;
+    }
+    std::wstring wstr(static_cast<size_t>(wlen), L'\0');
+    if (MultiByteToWideChar(CP_ACP, 0, str.data(), len, &wstr[0], wlen) != wlen) {
+        return str;
+    }
+
+    // UTF-16 -> UTF-8: same two-step pattern (query the size, then convert)
+    const int mblen = WideCharToMultiByte(CP_UTF8, 0, wstr.data(), wlen, nullptr, 0, nullptr, nullptr);
+    if (mblen <= 0) {
+        return str;
+    }
+    std::string mbstr(static_cast<size_t>(mblen), '\0');
+    if (WideCharToMultiByte(CP_UTF8, 0, wstr.data(), wlen, &mbstr[0], mblen, nullptr, nullptr) != mblen) {
+        return str;
+    }
+    return mbstr;
+}
+#endif
+
 int main(int argc, char ** argv) {
    gpt_params params;
    params.model = "models/llama-7B/ggml-model.bin";
@ -136,6 +152,11 @@ int main(int argc, char ** argv) {
    // Add a space in front of the first character to match OG llama tokenizer behavior
    params.prompt.insert(0, 1, ' ');

+#if defined (_WIN32)
+    // Convert from current locale to UTF-8
+    params.prompt = promptconvert(params.prompt);
+#endif
+
    // tokenize the prompt
    auto embd_inp = ::llama_tokenize(ctx, params.prompt, true);