Work on the BPE tokenizer (#3252)

* Work on the BPE tokenizer

Tokenizer tests work for Falcon-7B

* Try to fix build problem

* Fix debug assertion failure

* Fix MSVC Unicode BOM problem

* Cleanup and an improvement

* Fix compiler warning

* Cleanup

* Test doesn't work over the full range of Unicodes

* Update .gitignore and Makefile

* Another Makefile rule

* Testing Aquila

* Moving byte decoding back to `token_to_piece` ...

... because everyone is using it.

* Guarding some unusable code pathes

* Streamlining code and adding some more assertions

Important change: I'm classifying added tokens as control tokens now for BPE.

* Adding a comment

* Adding another assertion

* Fixed vocabulary guarding assertions

* Fix PR for recent change

* Fix PR for recent change

* Fix for compiler warning

* Fix PR for recent change

* Fix PR for recent change

* Fix PR for recent change

* Fix for compiler warning

* Fixes for more compiler warnings

* Remove unused code

* Fix initialization of static maps

* Add scores and token types back, adapt gptneox

* Update llama.cpp

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* Update unicode.h

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* Update unicode.h

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>

* Ported Starcoder and added some assertions

* Fix coding style

* Apply @jploski 's fix for missing tokens

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
This commit is contained in:
goerch 2023-10-03 09:16:26 +02:00 committed by GitHub
parent 1c84003c08
commit ff5a3f0c09
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
15 changed files with 852 additions and 227 deletions

View file

@ -1,5 +1,6 @@
#include "llama.h"
#include "common.h"
#include "unicode.h"
#include "console.h"
#include <cassert>
@ -11,30 +12,6 @@
#include <vector>
#include <locale>
typedef int codepoint;
static std::string codepoint_to_utf8(codepoint cp) {
std::string result;
if (0x00 <= cp && cp <= 0x7f) {
result.push_back(cp);
} else if (0x80 <= cp && cp <= 0x7ff) {
result.push_back(0xc0 | ((cp >> 6) & 0x1f));
result.push_back(0x80 | (cp & 0x3f));
} else if (0x800 <= cp && cp <= 0xffff) {
result.push_back(0xe0 | ((cp >> 12) & 0x0f));
result.push_back(0x80 | ((cp >> 6) & 0x3f));
result.push_back(0x80 | (cp & 0x3f));
} else if (0x10000 <= cp && cp <= 0x10ffff) {
result.push_back(0xf0 | ((cp >> 18) & 0x07));
result.push_back(0x80 | ((cp >> 12) & 0x3f));
result.push_back(0x80 | ((cp >> 6) & 0x3f));
result.push_back(0x80 | (cp & 0x3f));
} else {
throw std::invalid_argument("invalid codepoint");
}
return result;
}
int main(int argc, char **argv) {
if (argc < 2) {
fprintf(stderr, "Usage: %s <vocab-file>\n", argv[0]);
@ -95,7 +72,7 @@ int main(int argc, char **argv) {
}
}
for (codepoint cp = 0x0000; cp < 0xffff; ++cp) {
for (uint32_t cp = 0x0000; cp < 0xffff; ++cp) {
if (cp < 0xd800 || cp > 0xdfff) {
std::string str = codepoint_to_utf8(cp);
std::vector<llama_token> tokens = llama_tokenize(ctx, str, false);
@ -107,7 +84,7 @@ int main(int argc, char **argv) {
}
}
}
for (codepoint cp = 0x10000; cp < 0x0010ffff; ++cp) {
for (uint32_t cp = 0x10000; cp < 0x0010ffff; ++cp) {
std::string str = codepoint_to_utf8(cp);
std::vector<llama_token> tokens = llama_tokenize(ctx, str, false);
std::string check = llama_detokenize_spm(ctx, tokens);