llama : refactor unicode stuff (#5992)
* llama : refactor unicode stuff ggml-ci * unicode : names * make : fix c++ compiler * unicode : names * unicode : straighten tables * zig : fix build * unicode : put nfd normalization behind API ggml-ci * swift : fix build * unicode : add BOM * unicode : add <cstdint> ggml-ci * unicode : pass as cpts as const ref
This commit is contained in:
parent
828defefb6
commit
83796e62bc
9 changed files with 1744 additions and 836 deletions
|
@ -85,7 +85,7 @@ int main(int argc, char **argv) {
|
|||
continue;
|
||||
}
|
||||
|
||||
std::string str = codepoint_to_utf8(cp);
|
||||
std::string str = unicode_cpt_to_utf8(cp);
|
||||
std::vector<llama_token> tokens = llama_tokenize(ctx, str, false);
|
||||
std::string check = llama_detokenize_spm(ctx, tokens);
|
||||
if (cp != 9601 && str != check) {
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue