# Patch summary (git unified diff touching five files):
#   tests/CMakeLists.txt
#       Adds one llama_test() entry, test-tokenizer-0-deepseek-llm, built from
#       test-tokenizer-0-deepseek-llm.cpp and given
#       ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-deepseek-llm.gguf as its argument.
#   tests/test-tokenizer-0-deepseek-coder.cpp
#   tests/test-tokenizer-0-deepseek-llm.cpp
#       Both replace the call llama_backend_init(false) with llama_backend_init().
#   unicode.cpp
#       Removes two #include lines and the bodies of from_utf8 / to_utf8.
#   unicode.h
#       Adds two #include lines and replaces the bare `inline` declarations of
#       from_utf8 / to_utf8 with inline definitions carrying the same bodies.
#
# NOTE(review): this copy of the patch appears to have been altered in transit: the original
# line breaks are collapsed and text that would sit inside angle brackets (the #include
# targets, the template arguments on std::vector and std::wstring_convert) is missing.
# It will probably not apply as-is; regenerate it from the commit before use. TODO confirm.
# NOTE(review): the unicode.h hunk still ends with "\ No newline at end of file" on both the
# removed and the added side, so the header keeps lacking a trailing newline after this patch.
diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 15706f122..5f1bb729f 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -46,6 +46,7 @@ llama_test(test-tokenizer-0-llama.cpp NAME test-tokenizer-0-llama llama_test(test-tokenizer-0-falcon.cpp NAME test-tokenizer-0-falcon ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-falcon.gguf) llama_test(test-tokenizer-0-deepseek-coder.cpp NAME test-tokenizer-0-deepseek-coder ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-deepseek-coder.gguf) +llama_test(test-tokenizer-0-deepseek-llm.cpp NAME test-tokenizer-0-deepseek-llm ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-deepseek-llm.gguf) llama_test(test-tokenizer-1-llama.cpp NAME test-tokenizer-1-llama ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-llama.gguf) llama_test(test-tokenizer-1-llama.cpp NAME test-tokenizer-1-baichuan ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-baichuan.gguf) diff --git a/tests/test-tokenizer-0-deepseek-coder.cpp b/tests/test-tokenizer-0-deepseek-coder.cpp index 16966e072..1be6b7ab7 100644 --- a/tests/test-tokenizer-0-deepseek-coder.cpp +++ b/tests/test-tokenizer-0-deepseek-coder.cpp @@ -63,7 +63,7 @@ int main(int argc, char **argv) { llama_model * model; llama_context * ctx; - llama_backend_init(false); + llama_backend_init(); // load the vocab { diff --git a/tests/test-tokenizer-0-deepseek-llm.cpp b/tests/test-tokenizer-0-deepseek-llm.cpp index 98d628615..8afc0a81f 100644 --- a/tests/test-tokenizer-0-deepseek-llm.cpp +++ b/tests/test-tokenizer-0-deepseek-llm.cpp @@ -63,7 +63,7 @@ int main(int argc, char **argv) { llama_model * model; llama_context * ctx; - llama_backend_init(false); + llama_backend_init(); // load the vocab { diff --git a/unicode.cpp b/unicode.cpp index 73db0e1ce..0a2863ea0 100644 --- a/unicode.cpp +++ b/unicode.cpp @@ -10,8 +10,6 @@ #include #include #include -#include -#include static std::string unicode_cpts_to_utf8(const std::vector & cps) { std::string result; @@ 
-320,17 +318,5 @@ std::vector get_deepseek_llm_regex() { return deepseek_llm_regex; } -inline std::wstring from_utf8(const std::string & s) -{ - std::wstring_convert> conv; - return conv.from_bytes(s); -} -inline std::string to_utf8(const std::wstring & ws) -{ - // code to convert from utf32/utf16 to utf8 - std::wstring_convert, wchar_t> converter; - std::string utf8 = converter.to_bytes(ws); - return utf8; -} diff --git a/unicode.h b/unicode.h index efe04ce99..cf2f8e976 100644 --- a/unicode.h +++ b/unicode.h @@ -3,6 +3,8 @@ #include #include #include +#include +#include #define CODEPOINT_TYPE_UNIDENTIFIED 0 #define CODEPOINT_TYPE_DIGIT 1 @@ -30,5 +32,16 @@ std::vector get_gpt2_regex(); std::vector get_deepseek_coder_regex(); std::vector get_deepseek_llm_regex(); -inline std::wstring from_utf8(const std::string & s); -inline std::string to_utf8(const std::wstring & ws); \ No newline at end of file +inline std::wstring from_utf8(const std::string & s) +{ + std::wstring_convert> conv; + return conv.from_bytes(s); +} + +inline std::string to_utf8(const std::wstring & ws) +{ + // code to convert from utf32/utf16 to utf8 + std::wstring_convert, wchar_t> converter; + std::string utf8 = converter.to_bytes(ws); + return utf8; +} \ No newline at end of file