Merge a808370c58 into 14f83526cd
commit 8a1998aff8
3 changed files with 84 additions and 0 deletions
examples/CMakeLists.txt
@@ -30,6 +30,7 @@ else()
 endif()
 add_subdirectory(main)
 add_subdirectory(tokenize)
+add_subdirectory(tokenizer-verifier)
 add_subdirectory(parallel)
 add_subdirectory(perplexity)
 add_subdirectory(quantize)
examples/tokenizer-verifier/CMakeLists.txt (new file, 5 additions)
@@ -0,0 +1,5 @@
+set(TARGET tokenizer-verifier)
+add_executable(${TARGET} tokenizer-verifier.cpp)
+install(TARGETS ${TARGET} RUNTIME)
+target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
+target_compile_features(${TARGET} PRIVATE cxx_std_11)
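The target mirrors the sibling example targets: it links the example against the common helper library and llama itself. The ${CMAKE_THREAD_LIBS_INIT} reference presumes the top-level build has already run find_package(Threads), as the llama.cpp root CMakeLists.txt does.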
examples/tokenizer-verifier/tokenizer-verifier.cpp (new file, 78 additions)
@@ -0,0 +1,78 @@
+#include "common.h"
+#include "llama.h"
+
+#include <cstdio>
+#include <string>
+#include <vector>
+
+static int unicode_to_utf8(int codepoint, char *dest) {
+    // https://stackoverflow.com/a/4609989 — who needs iconv?
+    if (codepoint < 0x80) {
+        *dest++ = codepoint;
+    } else if (codepoint < 0x800) {
+        *dest++ = 192 + codepoint / 64, *dest++ = 128 + codepoint % 64;
+    // we also support reserved utf-16 surrogates 0xd800 - 0xdfff for simplicity
+    } else if (codepoint < 0x10000) {
+        *dest++ = 224 + codepoint / 4096, *dest++ = 128 + codepoint / 64 % 64,
+        *dest++ = 128 + codepoint % 64;
+    } else if (codepoint < 0x110000) {
+        *dest++ = 240 + codepoint / 262144, *dest++ = 128 + codepoint / 4096 % 64,
+        *dest++ = 128 + codepoint / 64 % 64, *dest++ = 128 + codepoint % 64;
+    } else {
+        return 1;
+    }
+    return 0;
+}
+
+int main(int argc, char **argv) {
+    if (argc < 2) {
+        printf("usage: %s MODEL_PATH\n", argv[0]);
+        return 1;
+    }
+
+    const char *model_path = argv[1];
+
+    llama_backend_init();
+
+    // load only the vocabulary; the weights are not needed for tokenization
+    llama_model_params model_params = llama_model_default_params();
+    model_params.vocab_only = true;
+    llama_model *model = llama_load_model_from_file(model_path, model_params);
+    if (model == NULL) {
+        fprintf(stderr, "error: failed to load model from '%s'\n", model_path);
+        return 1;
+    }
+
+    std::vector<llama_token> tokens;
+
+    // first pass: every 7-bit ascii character as a single-char string
+    int failed_ascii = 0;
+    int ascii_max = 127;
+    for (int c = 0; c <= ascii_max; c++) {
+        const char prompt[] = {(char)c, '\0'};
+        try {
+            tokens = ::llama_tokenize(model, prompt, false, true);
+        } catch (...) {
+            printf("%#x -> Tokenization failed for char '%c'\n", c, (char)c);
+            failed_ascii += 1;
+            continue;
+        }
+    }
+    printf("%d/%d 7-bit ascii characters could not be tokenized\n", failed_ascii, ascii_max + 1);
+
+    // now let's do all potential codepoints
+    int failed_unicode = 0;
+    int utf8_max = 0x10FFFF;
+    for (int cp = 0; cp <= utf8_max; cp++) {
+        char buf[5] = {}; // zero-filled so the encoded sequence is NUL-terminated
+        if (unicode_to_utf8(cp, buf)) {
+            printf("Impossible to encode codepoint %#x\n", cp);
+            continue;
+        }
+        try {
+            tokens = ::llama_tokenize(model, buf, false, true);
+        } catch (...) {
+            // printf("%#x -> Tokenization failed for codepoint '%s'\n", cp, buf);
+            failed_unicode += 1;
+            continue;
+        }
+    }
+    printf("%d/%d potential unicode codepoints not tokenized\n", failed_unicode, utf8_max + 1);
+
+    llama_free_model(model);
+    llama_backend_free();
+
+    return (failed_ascii != 0 || failed_unicode != 0);
+}
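The divide-and-modulo arithmetic in unicode_to_utf8 is easy to sanity-check in isolation. Below is a minimal standalone sketch, not part of the commit, that assumes only the C++ standard library: it copies the same encoding logic and compares it against a few well-known UTF-8 byte sequences (one codepoint each of length 1 through 4).

// utf8-check.cpp — hypothetical standalone sanity check for the encoder above
#include <cstdio>
#include <cstring>

// same divide-and-modulo arithmetic as unicode_to_utf8 in the diff above
static int unicode_to_utf8(int codepoint, char *dest) {
    if (codepoint < 0x80) {
        *dest++ = codepoint;
    } else if (codepoint < 0x800) {
        *dest++ = 192 + codepoint / 64, *dest++ = 128 + codepoint % 64;
    } else if (codepoint < 0x10000) {
        *dest++ = 224 + codepoint / 4096, *dest++ = 128 + codepoint / 64 % 64,
        *dest++ = 128 + codepoint % 64;
    } else if (codepoint < 0x110000) {
        *dest++ = 240 + codepoint / 262144, *dest++ = 128 + codepoint / 4096 % 64,
        *dest++ = 128 + codepoint / 64 % 64, *dest++ = 128 + codepoint % 64;
    } else {
        return 1;
    }
    return 0;
}

int main() {
    // known-good UTF-8 byte sequences for 1-, 2-, 3- and 4-byte codepoints
    const struct { int cp; const char *expect; } cases[] = {
        { 0x41,    "A"                },
        { 0xE9,    "\xC3\xA9"         }, // é
        { 0x20AC,  "\xE2\x82\xAC"     }, // €
        { 0x1F600, "\xF0\x9F\x98\x80" }, // 😀
    };
    int failures = 0;
    for (const auto &t : cases) {
        char buf[5] = {}; // zero-filled, so the result is NUL-terminated
        if (unicode_to_utf8(t.cp, buf) != 0 || strcmp(buf, t.expect) != 0) {
            printf("mismatch for U+%04X\n", t.cp);
            failures++;
        }
    }
    printf("%d mismatches\n", failures);
    return failures != 0;
}

For U+20AC, for example, the arithmetic gives 224 + 8364/4096 = 226 (0xE2), 128 + (8364/64) % 64 = 130 (0x82), and 128 + 8364 % 64 = 172 (0xAC), matching the expected encoding of €.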