tests : multi-thread the tokenizer tests
ggml-ci
This commit is contained in:
parent
895407f31b
commit
21851c11d1
3 changed files with 82 additions and 72 deletions
|
@ -7834,6 +7834,7 @@ private:
|
||||||
|
|
||||||
if (p == rev_merge.end()) {
|
if (p == rev_merge.end()) {
|
||||||
// output any symbols that did not form tokens as bytes.
|
// output any symbols that did not form tokens as bytes.
|
||||||
|
output.reserve(output.size() + symbol.n);
|
||||||
for (int j = 0; j < (int)symbol.n; ++j) {
|
for (int j = 0; j < (int)symbol.n; ++j) {
|
||||||
llama_vocab::id token_id = llama_byte_to_token(vocab, symbol.text[j]);
|
llama_vocab::id token_id = llama_byte_to_token(vocab, symbol.text[j]);
|
||||||
output.push_back(token_id);
|
output.push_back(token_id);
|
||||||
|
@ -8397,6 +8398,7 @@ struct fragment_buffer_variant {
|
||||||
raw_text(_dummy),
|
raw_text(_dummy),
|
||||||
offset(0),
|
offset(0),
|
||||||
length(0) {}
|
length(0) {}
|
||||||
|
|
||||||
fragment_buffer_variant(const std::string & _raw_text, int64_t _offset, int64_t _length)
|
fragment_buffer_variant(const std::string & _raw_text, int64_t _offset, int64_t _length)
|
||||||
:
|
:
|
||||||
type(FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT),
|
type(FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT),
|
||||||
|
|
|
@ -4,13 +4,13 @@
|
||||||
#include "console.h"
|
#include "console.h"
|
||||||
|
|
||||||
#include <cassert>
|
#include <cassert>
|
||||||
|
#include <codecvt>
|
||||||
#include <cstdio>
|
#include <cstdio>
|
||||||
#include <cstring>
|
#include <cstring>
|
||||||
#include <string>
|
|
||||||
#include <codecvt>
|
|
||||||
#include <map>
|
|
||||||
#include <vector>
|
|
||||||
#include <locale>
|
#include <locale>
|
||||||
|
#include <string>
|
||||||
|
#include <thread>
|
||||||
|
#include <vector>
|
||||||
|
|
||||||
int main(int argc, char **argv) {
|
int main(int argc, char **argv) {
|
||||||
if (argc < 2) {
|
if (argc < 2) {
|
||||||
|
@ -74,45 +74,46 @@ int main(int argc, char **argv) {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
catch (const std::invalid_argument &) {
|
catch (const std::invalid_argument &) {
|
||||||
fprintf(stderr, "%s : info: utf8 conversion %d '%s'\n", __func__, i, str.c_str());
|
//fprintf(stderr, "%s : info: utf8 conversion %d '%s'\n", __func__, i, str.c_str());
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
for (uint32_t cp = 0x0000; cp < 0xffff; ++cp) {
|
// unicode
|
||||||
// NOTE: these exceptions seem to be necessary, because the GPT2 tokenizer doesn't want to interfere with some ASCII control characters
|
{
|
||||||
if ((cp < 0x03 || cp > 0x05) && cp != 0x0b && cp != 0x11 && (cp < 0x13 || cp > 0x17) && cp != 0x19 && (cp < 0x1c || cp > 0x1e) && (cp < 0xd800 || cp > 0xdfff)) {
|
const int nthread = std::thread::hardware_concurrency();
|
||||||
std::string str = " " + codepoint_to_utf8(cp);
|
|
||||||
std::vector<llama_token> tokens = llama_tokenize(ctx, str, false);
|
std::vector<std::thread> threads(nthread);
|
||||||
std::string check = llama_detokenize_bpe(ctx, tokens);
|
|
||||||
if (str != check) {
|
for (int i = 0; i < nthread; ++i) {
|
||||||
fprintf(stderr, "%s : error: codepoint %x detokenizes to '%s'(%zu) instead of '%s'(%zu)\n",
|
threads[i] = std::thread([i, nthread, ctx]() {
|
||||||
__func__, cp, check.c_str(), check.length(), str.c_str(), str.length());
|
for (uint32_t cp = i; cp < 0x0010ffff; cp += nthread) {
|
||||||
return 3;
|
if (!( // NOLINT
|
||||||
|
(cp < 0x03 || cp > 0x05) && cp != 0x0b && cp != 0x11 &&
|
||||||
|
(cp < 0x13 || cp > 0x17) && cp != 0x19 &&
|
||||||
|
(cp < 0x1c || cp > 0x1e) &&
|
||||||
|
(cp < 0xd800 || cp > 0xdfff) &&
|
||||||
|
(cp < 0x00040000 || cp >= 0x000e0000)
|
||||||
|
)) {
|
||||||
|
continue;
|
||||||
}
|
}
|
||||||
}
|
|
||||||
}
|
|
||||||
// Restrict to assigned unicode planes
|
|
||||||
// for (uint32_t cp = 0x10000; cp < 0x0010ffff; ++cp) {
|
|
||||||
for (uint32_t cp = 0x10000; cp < 0x00040000; ++cp) {
|
|
||||||
std::string str = codepoint_to_utf8(cp);
|
std::string str = codepoint_to_utf8(cp);
|
||||||
std::vector<llama_token> tokens = llama_tokenize(ctx, str, false);
|
std::vector<llama_token> tokens = llama_tokenize(ctx, str, false);
|
||||||
std::string check = llama_detokenize_bpe(ctx, tokens);
|
std::string check = llama_detokenize_bpe(ctx, tokens);
|
||||||
if (str != check) {
|
if (cp != 9601 && str != check) {
|
||||||
fprintf(stderr, "%s : error: codepoint %x detokenizes to '%s'(%zu) instead of '%s'(%zu)\n",
|
fprintf(stderr, "error: codepoint %x detokenizes to '%s'(%zu) instead of '%s'(%zu)\n",
|
||||||
__func__, cp, check.c_str(), check.length(), str.c_str(), str.length());
|
cp, check.c_str(), check.length(), str.c_str(), str.length());
|
||||||
return 4;
|
std::exit(3);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
for (uint32_t cp = 0x000e0000; cp < 0x0010ffff; ++cp) {
|
});
|
||||||
std::string str = codepoint_to_utf8(cp);
|
}
|
||||||
std::vector<llama_token> tokens = llama_tokenize(ctx, str, false);
|
|
||||||
std::string check = llama_detokenize_bpe(ctx, tokens);
|
for (auto & t : threads) {
|
||||||
if (str != check) {
|
t.join();
|
||||||
fprintf(stderr, "%s : error: codepoint %x detokenizes to '%s'(%zu) instead of '%s'(%zu)\n",
|
|
||||||
__func__, cp, check.c_str(), check.length(), str.c_str(), str.length());
|
|
||||||
return 4;
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
llama_free_model(model);
|
llama_free_model(model);
|
||||||
llama_free(ctx);
|
llama_free(ctx);
|
||||||
|
|
||||||
|
|
|
@ -4,13 +4,13 @@
|
||||||
#include "console.h"
|
#include "console.h"
|
||||||
|
|
||||||
#include <cassert>
|
#include <cassert>
|
||||||
|
#include <codecvt>
|
||||||
#include <cstdio>
|
#include <cstdio>
|
||||||
#include <cstring>
|
#include <cstring>
|
||||||
#include <string>
|
|
||||||
#include <codecvt>
|
|
||||||
#include <map>
|
|
||||||
#include <vector>
|
|
||||||
#include <locale>
|
#include <locale>
|
||||||
|
#include <string>
|
||||||
|
#include <thread>
|
||||||
|
#include <vector>
|
||||||
|
|
||||||
int main(int argc, char **argv) {
|
int main(int argc, char **argv) {
|
||||||
if (argc < 2) {
|
if (argc < 2) {
|
||||||
|
@ -72,26 +72,33 @@ int main(int argc, char **argv) {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
for (uint32_t cp = 0x0000; cp < 0xffff; ++cp) {
|
// unicode
|
||||||
if (cp < 0xd800 || cp > 0xdfff) {
|
{
|
||||||
|
const int nthread = std::thread::hardware_concurrency();
|
||||||
|
|
||||||
|
std::vector<std::thread> threads(nthread);
|
||||||
|
|
||||||
|
for (int i = 0; i < nthread; ++i) {
|
||||||
|
threads[i] = std::thread([i, nthread, ctx]() {
|
||||||
|
for (uint32_t cp = i; cp < 0x0010ffff; cp += nthread) {
|
||||||
|
if (cp >= 0xd800 && cp <= 0xdfff) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
std::string str = codepoint_to_utf8(cp);
|
std::string str = codepoint_to_utf8(cp);
|
||||||
std::vector<llama_token> tokens = llama_tokenize(ctx, str, false);
|
std::vector<llama_token> tokens = llama_tokenize(ctx, str, false);
|
||||||
std::string check = llama_detokenize_spm(ctx, tokens);
|
std::string check = llama_detokenize_spm(ctx, tokens);
|
||||||
if (cp != 9601 && str != check) {
|
if (cp != 9601 && str != check) {
|
||||||
fprintf(stderr, "%s : error: codepoint %d detokenizes to '%s'(%zu) instead of '%s'(%zu)\n",
|
fprintf(stderr, "error: codepoint %x detokenizes to '%s'(%zu) instead of '%s'(%zu)\n",
|
||||||
__func__, cp, check.c_str(), check.length(), str.c_str(), str.length());
|
cp, check.c_str(), check.length(), str.c_str(), str.length());
|
||||||
return 3;
|
std::exit(3);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
});
|
||||||
}
|
}
|
||||||
for (uint32_t cp = 0x10000; cp < 0x0010ffff; ++cp) {
|
|
||||||
std::string str = codepoint_to_utf8(cp);
|
for (auto & t : threads) {
|
||||||
std::vector<llama_token> tokens = llama_tokenize(ctx, str, false);
|
t.join();
|
||||||
std::string check = llama_detokenize_spm(ctx, tokens);
|
|
||||||
if (str != check) {
|
|
||||||
fprintf(stderr, "%s : error: codepoint %d detokenizes to '%s'(%zu) instead of '%s'(%zu)\n",
|
|
||||||
__func__, cp, check.c_str(), check.length(), str.c_str(), str.length());
|
|
||||||
return 4;
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue