tests : use new tokenizer type API (#2692)
* Merge tokenizer fixes into the gguf branch.
* Add test vocabularies
* Adapt convert-new.py (and fix a clang-cl compiler error on Windows)
* Improved tokenizer test (but does it work on MacOS?)
* Improve token type support:
  - Added @klosax code to convert.py
  - Improved token type support in vocabulary
* Exclude platform-dependent tests
* More sentencepiece compatibility by eliminating magic numbers
* Restored accidentally removed comment
* Improve commentary
* Use token type API in test-tokenizer-1.cpp
parent 0b53b8b08d
commit 49c25cce19

2 changed files with 4 additions and 4 deletions
convert.py            | 4 changes (Executable file → Normal file)
test-tokenizer-1.cpp  | 4 changes
convert.py
@@ -741,6 +741,8 @@ class OutputFile:
         tokens = []
         scores = []
         toktypes = []
+        # NOTE: `all_tokens` returns the base vocabulary and added tokens
+        # TODO: add special tokens?
         for text, score, toktype in vocab.all_tokens():
             tokens.append(text)
             scores.append(score)
@@ -751,8 +753,6 @@ class OutputFile:
         self.gguf.add_token_scores(scores)
         self.gguf.add_token_types(toktypes)
 
-        # TODO: added / special tokens
-
     def add_tensor_info(self, name: str, tensor: LazyTensor) -> None:
         n_elements = 1
         for dim in tensor.shape:
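For context: the toktypes list built above holds the sentencepiece-style token type constants that add_token_types() serializes into the GGUF vocabulary, and which the C API exposes again via llama_token_get_type(). A minimal sketch of the enum, assuming the layout in llama.h on the gguf branch; only the UNKNOWN, CONTROL, and BYTE names are confirmed by this commit, the remaining names and all numeric values are assumptions taken from the GGUF vocabulary spec:

// Assumed layout of llama_token_type (llama.h, gguf branch);
// convert.py stores one of these values per token.
enum llama_token_type {
    LLAMA_TOKEN_TYPE_UNDEFINED    = 0, // no type recorded
    LLAMA_TOKEN_TYPE_NORMAL       = 1, // ordinary vocabulary entry
    LLAMA_TOKEN_TYPE_UNKNOWN      = 2, // e.g. <unk>
    LLAMA_TOKEN_TYPE_CONTROL      = 3, // e.g. <s>, </s>
    LLAMA_TOKEN_TYPE_USER_DEFINED = 4, // added tokens
    LLAMA_TOKEN_TYPE_UNUSED       = 5,
    LLAMA_TOKEN_TYPE_BYTE         = 6, // byte fallback, e.g. <0x0A>
};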
test-tokenizer-1.cpp
@@ -87,8 +87,8 @@ int main(int argc, char **argv) {
                 return 2;
             }
         } else {
-            // TODO: needs access to token types
-            if (0 <= i && i < 259) {
+            llama_token_type type = llama_token_get_type(ctx, i);
+            if (type == LLAMA_TOKEN_TYPE_UNKNOWN || type == LLAMA_TOKEN_TYPE_CONTROL || type == LLAMA_TOKEN_TYPE_BYTE) {
                 fprintf(stderr, "%s : info: token %d is string %s and bpe returns tokens %s\n",
                     __func__, i, llama_token_to_str(ctx, i).c_str(), unescape_whitespace(ctx, tokens).c_str());
             } else {
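The magic number this hunk removes encoded the stock LLaMA sentencepiece layout: token 0 is <unk>, tokens 1-2 are the <s>/</s> control tokens, and tokens 3-258 are the 256 byte-fallback tokens, hence the hard-coded bound of 259. Querying the vocabulary through the token type API lets the test work for any GGUF vocabulary, not just that layout. A minimal sketch of the new pattern, using llama_token_get_type(ctx, i) exactly as it appears in the hunk above (the helper name is hypothetical):

#include "llama.h"

// Tokens of these types have no stable textual form, so a
// detokenize -> tokenize round trip is not expected to reproduce
// them verbatim; the test logs them as info instead of failing.
static bool is_roundtrip_exempt(llama_context * ctx, llama_token i) {
    const llama_token_type type = llama_token_get_type(ctx, i);
    return type == LLAMA_TOKEN_TYPE_UNKNOWN ||
           type == LLAMA_TOKEN_TYPE_CONTROL ||
           type == LLAMA_TOKEN_TYPE_BYTE;
}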