tests : fix build + warnings (test-tokenizer-1 still fails)

Georgi Gerganov 2023-08-14 20:14:55 +03:00
parent 58fdf3a07a
commit aa0551a504
2 changed files with 25 additions and 24 deletions
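
The warning part of the fix is the loop-index type change that appears in both test files below: indexing with a signed int and comparing it against std::vector::size() trips -Wsign-compare under -Wall -Wextra, and switching the index to size_t silences it. A standalone illustration of the pattern (arbitrary values, not code from this repo):

#include <cstddef>
#include <cstdio>
#include <vector>

int main() {
    std::vector<int> tokens = { 1, 259, 29871 }; // arbitrary example token ids

    // for (int i = 0; i < tokens.size(); ++i)   // warns: comparison of integers of different signedness
    for (size_t i = 0; i < tokens.size(); ++i) { // clean under -Wall -Wextra
        printf("%zu: %d\n", i, tokens[i]);
    }

    return 0;
}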

tests/test-tokenizer-0.cpp

@@ -8,14 +8,13 @@
 static std::string unescape_whitespace(llama_context* ctx, const std::vector<llama_token>& tokens) {
     std::string result;
-    for (int i = 0; i < tokens.size(); ++i) {
+    for (size_t i = 0; i < tokens.size(); ++i) {
         result += llama_token_to_str(ctx, tokens[i]);
     }
     return result;
 }
 
-static const std::map<std::string, std::vector<llama_token>> & k_tests()
-{
+static const std::map<std::string, std::vector<llama_token>> & k_tests() {
     static std::map<std::string, std::vector<llama_token>> _k_tests = {
         { " ",  {1, 259, }, },
         { "\t", { 1, 29871, 12, }, },
@@ -40,6 +39,7 @@ static const std::map<std::string, std::vector<llama_token>> & k_tests()
             313, 20787, 953, 3848, 275, 16125, 630, 29897, 29871, 31681,
             313, 6194, 953, 29877, 2397, 393, 756, 967, 1914, 5993, 29897, }, },
     };
     return _k_tests;
 };
@@ -90,7 +90,7 @@ int main(int argc, char **argv) {
     }
 
     for (const auto & test_kv : k_tests()) {
-        std::vector<llama_token> res = llama_tokenize(ctx, test_kv.first.c_str(), true);
+        std::vector<llama_token> res = llama_tokenize(ctx, test_kv.first, true);
 
         fprintf(stderr, "%s : '%s' tokenized to '%s'\n",
             __func__, test_kv.first.c_str(), unescape_whitespace(ctx, res).c_str());
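
The dropped .c_str() calls appear to rely on a std::string-taking tokenize helper in the examples' common code rather than the raw C API in llama.h. As a hedged sketch only (assuming the ctx-based C entry point of this period, not the repo's exact helper), such a wrapper looks roughly like:

#include "llama.h"

#include <string>
#include <vector>

// Hedged sketch of a std::string convenience wrapper over the C tokenizer entry point,
// which fills a caller-provided buffer and returns the token count
// (negative when the buffer is too small).
static std::vector<llama_token> tokenize(llama_context * ctx, const std::string & text, bool add_bos) {
    std::vector<llama_token> tokens(text.size() + (add_bos ? 1 : 0));
    const int n = llama_tokenize(ctx, text.c_str(), tokens.data(), (int) tokens.size(), add_bos);
    tokens.resize(n < 0 ? 0 : n);
    return tokens;
}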

tests/test-tokenizer-1.cpp

@@ -8,6 +8,7 @@
 #include <codecvt>
 #include <map>
 #include <vector>
+#include <locale>
 
 static std::string vocab_type(llama_context * ctx) {
     return llama_n_vocab(ctx) == 32000 ? "spm": "bpe";
@@ -34,7 +35,7 @@ static std::string escape_whitespace(const std::string& text) {
 static std::string unescape_whitespace(llama_context * ctx, const std::vector<llama_token> & tokens) {
     std::string result;
-    for (int i = 0; i < tokens.size(); ++i) {
+    for (size_t i = 0; i < tokens.size(); ++i) {
         result += llama_token_to_str(ctx, tokens[i]);
     }
     return result;
@@ -106,7 +107,7 @@ int main(int argc, char **argv) {
     for (wchar_t ch = 0x0000; ch < 0xffff; ++ch) {
         std::wstring wstr(1, ch);
         std::string str = converter.to_bytes(wstr);
-        std::vector<llama_token> tokens = llama_tokenize(ctx, escape_whitespace(str).c_str(), false);
+        std::vector<llama_token> tokens = llama_tokenize(ctx, escape_whitespace(str), false);
         if (tokens.size() == 1) {
             fprintf(stderr, "%s : info: %s tokenized to %d \n",
                 __func__, str.c_str(), tokens[0]);
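
The added <locale> include backs the std::wstring_convert/std::codecvt_utf8 pair used in the loop above: std::wstring_convert is declared in <locale> and the codecvt_utf8 facet in <codecvt> (both deprecated since C++17 but still usable). A minimal standalone sketch of the same wide-char to UTF-8 conversion, shortened to a few code points:

#include <codecvt>
#include <cstdio>
#include <locale>
#include <string>

int main() {
    std::wstring_convert<std::codecvt_utf8<wchar_t>, wchar_t> converter;

    // The test walks 0x0000..0xffff; three accented letters are enough to show the pattern.
    for (wchar_t ch = 0x00e9; ch <= 0x00eb; ++ch) {
        const std::wstring wstr(1, ch);
        const std::string  str = converter.to_bytes(wstr);
        printf("U+%04X -> '%s' (%zu bytes)\n", (unsigned) ch, str.c_str(), str.size());
    }

    return 0;
}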