Resolved issues
This commit is contained in:
parent
54f93eb50b
commit
1c924e4b35
5 changed files with 18 additions and 18 deletions
|
@ -46,6 +46,7 @@ llama_test(test-tokenizer-0-llama.cpp NAME test-tokenizer-0-llama
|
||||||
llama_test(test-tokenizer-0-falcon.cpp NAME test-tokenizer-0-falcon ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-falcon.gguf)
|
llama_test(test-tokenizer-0-falcon.cpp NAME test-tokenizer-0-falcon ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-falcon.gguf)
|
||||||
|
|
||||||
llama_test(test-tokenizer-0-deepseek-coder.cpp NAME test-tokenizer-0-deepseek-coder ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-deepseek-coder.gguf)
|
llama_test(test-tokenizer-0-deepseek-coder.cpp NAME test-tokenizer-0-deepseek-coder ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-deepseek-coder.gguf)
|
||||||
|
llama_test(test-tokenizer-0-deepseek-llm.cpp NAME test-tokenizer-0-deepseek-llm ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-deepseek-llm.gguf)
|
||||||
|
|
||||||
llama_test(test-tokenizer-1-llama.cpp NAME test-tokenizer-1-llama ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-llama.gguf)
|
llama_test(test-tokenizer-1-llama.cpp NAME test-tokenizer-1-llama ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-llama.gguf)
|
||||||
llama_test(test-tokenizer-1-llama.cpp NAME test-tokenizer-1-baichuan ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-baichuan.gguf)
|
llama_test(test-tokenizer-1-llama.cpp NAME test-tokenizer-1-baichuan ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-baichuan.gguf)
|
||||||
|
|
|
@ -63,7 +63,7 @@ int main(int argc, char **argv) {
|
||||||
llama_model * model;
|
llama_model * model;
|
||||||
llama_context * ctx;
|
llama_context * ctx;
|
||||||
|
|
||||||
llama_backend_init(false);
|
llama_backend_init();
|
||||||
|
|
||||||
// load the vocab
|
// load the vocab
|
||||||
{
|
{
|
||||||
|
|
|
@ -63,7 +63,7 @@ int main(int argc, char **argv) {
|
||||||
llama_model * model;
|
llama_model * model;
|
||||||
llama_context * ctx;
|
llama_context * ctx;
|
||||||
|
|
||||||
llama_backend_init(false);
|
llama_backend_init();
|
||||||
|
|
||||||
// load the vocab
|
// load the vocab
|
||||||
{
|
{
|
||||||
|
|
14
unicode.cpp
14
unicode.cpp
|
@ -10,8 +10,6 @@
|
||||||
#include <unordered_map>
|
#include <unordered_map>
|
||||||
#include <utility>
|
#include <utility>
|
||||||
#include <vector>
|
#include <vector>
|
||||||
#include <locale>
|
|
||||||
#include <codecvt>
|
|
||||||
|
|
||||||
static std::string unicode_cpts_to_utf8(const std::vector<uint32_t> & cps) {
|
static std::string unicode_cpts_to_utf8(const std::vector<uint32_t> & cps) {
|
||||||
std::string result;
|
std::string result;
|
||||||
|
@ -320,17 +318,5 @@ std::vector<std::wstring> get_deepseek_llm_regex() {
|
||||||
return deepseek_llm_regex;
|
return deepseek_llm_regex;
|
||||||
}
|
}
|
||||||
|
|
||||||
inline std::wstring from_utf8(const std::string & s)
|
|
||||||
{
|
|
||||||
std::wstring_convert<std::codecvt_utf8<wchar_t>> conv;
|
|
||||||
return conv.from_bytes(s);
|
|
||||||
}
|
|
||||||
|
|
||||||
inline std::string to_utf8(const std::wstring & ws)
|
|
||||||
{
|
|
||||||
// code to convert from utf32/utf16 to utf8
|
|
||||||
std::wstring_convert<std::codecvt_utf8_utf16<wchar_t>, wchar_t> converter;
|
|
||||||
std::string utf8 = converter.to_bytes(ws);
|
|
||||||
return utf8;
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
17
unicode.h
17
unicode.h
|
@ -3,6 +3,8 @@
|
||||||
#include <cstdint>
|
#include <cstdint>
|
||||||
#include <string>
|
#include <string>
|
||||||
#include <vector>
|
#include <vector>
|
||||||
|
#include <locale>
|
||||||
|
#include <codecvt>
|
||||||
|
|
||||||
#define CODEPOINT_TYPE_UNIDENTIFIED 0
|
#define CODEPOINT_TYPE_UNIDENTIFIED 0
|
||||||
#define CODEPOINT_TYPE_DIGIT 1
|
#define CODEPOINT_TYPE_DIGIT 1
|
||||||
|
@ -30,5 +32,16 @@ std::vector<std::wstring> get_gpt2_regex();
|
||||||
std::vector<std::wstring> get_deepseek_coder_regex();
|
std::vector<std::wstring> get_deepseek_coder_regex();
|
||||||
std::vector<std::wstring> get_deepseek_llm_regex();
|
std::vector<std::wstring> get_deepseek_llm_regex();
|
||||||
|
|
||||||
inline std::wstring from_utf8(const std::string & s);
|
inline std::wstring from_utf8(const std::string & s)
|
||||||
inline std::string to_utf8(const std::wstring & ws);
|
{
|
||||||
|
std::wstring_convert<std::codecvt_utf8<wchar_t>> conv;
|
||||||
|
return conv.from_bytes(s);
|
||||||
|
}
|
||||||
|
|
||||||
|
inline std::string to_utf8(const std::wstring & ws)
|
||||||
|
{
|
||||||
|
// code to convert from utf32/utf16 to utf8
|
||||||
|
std::wstring_convert<std::codecvt_utf8_utf16<wchar_t>, wchar_t> converter;
|
||||||
|
std::string utf8 = converter.to_bytes(ws);
|
||||||
|
return utf8;
|
||||||
|
}
|
Loading…
Add table
Add a link
Reference in a new issue