Add test for MPT tokenization

commit 6a94ae6d49 (parent 22c69a2794)
5 changed files with 35 additions and 24 deletions
convert-mpt-hf-to-gguf.py

@@ -128,15 +128,21 @@ vocab_size = hparams["vocab_size"]
 # ref: https://github.com/cmp-nct/ggllm.cpp/blob/master/falcon_convert.py
 tokenizer = AutoTokenizer.from_pretrained(dir_model)
 
+added_vocab = tokenizer.get_added_vocab()
 reverse_vocab = {id: encoded_tok for encoded_tok, id in tokenizer.vocab.items()}
 
 for i in range(vocab_size):
-    tokens.append(reverse_vocab[i] if i in reverse_vocab else f"[PAD{i}]")
-    scores.append(0.0) # dummy
-    toktypes.append(gguf.TokenType.NORMAL)
+    if i in reverse_vocab:
+        tokens.append(reverse_vocab[i])
+        if reverse_vocab[i] not in added_vocab:
+            toktypes.append(gguf.TokenType.NORMAL)
+        else:
+            toktypes.append(gguf.TokenType.USER_DEFINED)
+    else:
+        tokens.append(f"[PAD{i}]")
+        toktypes.append(gguf.TokenType.USER_DEFINED)
 
 gguf_writer.add_token_list(tokens)
-gguf_writer.add_token_scores(scores)
 gguf_writer.add_token_types(toktypes)
 
 special_vocab = gguf.SpecialVocab(dir_model, load_merges = True)
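For context: MPT builds on the GPT-NeoX tokenizer, and its config.json vocab_size is padded beyond what the tokenizer actually defines. The gap is filled with [PAD{i}] placeholders, and anything registered through the tokenizer's added-tokens mechanism is now tagged USER_DEFINED instead of NORMAL. A minimal sketch for inspecting this, assuming only that transformers is installed (the checkpoint name is just an example):

    from transformers import AutoTokenizer

    dir_model = "mosaicml/mpt-7b-chat"  # example checkpoint; any MPT variant works
    tokenizer = AutoTokenizer.from_pretrained(dir_model)

    print(len(tokenizer.vocab))        # tokens the tokenizer actually defines
    print(tokenizer.get_added_vocab()) # added tokens, e.g. chat control markers
    # config.json's vocab_size may exceed both; the converter fills the
    # difference with [PAD{i}] entries typed as USER_DEFINED.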
llama.cpp  (41 changed lines)

@@ -975,20 +975,6 @@ static void llama_nop(struct ggml_tensor * tensor) { // don't offload by default
     (void) tensor;
 }
 
-static std::string llama_token_to_str(const struct llama_context * ctx, llama_token token) {
-    std::vector<char> result(8, 0);
-    const int n_tokens = llama_token_to_piece(llama_get_model(ctx), token, result.data(), result.size());
-    if (n_tokens < 0) {
-        result.resize(-n_tokens);
-        int check = llama_token_to_piece(llama_get_model(ctx), token, result.data(), result.size());
-        GGML_ASSERT(check == -n_tokens);
-    } else {
-        result.resize(n_tokens);
-    }
-
-    return std::string(result.data(), result.size());
-}
-
 //
 // globals
 //
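The helper removed here is not gone for good: it reappears further down in this diff as llama_token_to_piece, next to the grammar-sampling code that calls it.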
@@ -1202,10 +1188,10 @@ struct llama_vocab {
     id special_eot_id = 32010;
 
     int find_bpe_rank(std::string token_left, std::string token_right) const {
-        replace_all(token_left,  " ",  "\u0120");
-        replace_all(token_left,  "\n", "\u010A");
-        replace_all(token_right, " ",  "\u0120");
-        replace_all(token_right, "\n", "\u010A");
+        GGML_ASSERT(token_left.find(" ") == std::string::npos);
+        GGML_ASSERT(token_left.find("\n") == std::string::npos);
+        GGML_ASSERT(token_right.find(" ") == std::string::npos);
+        GGML_ASSERT(token_right.find("\n") == std::string::npos);
 
         auto it = bpe_ranks.find(std::make_pair(token_left, token_right));
         if (it == bpe_ranks.end()) {
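This turns silent normalization into an invariant check: with GPT-2-style byte-level BPE, merge-table entries never contain a raw space or newline, because those bytes are remapped before any merging happens: 0x20 becomes "Ġ" (U+0120) and 0x0A becomes "Ċ" (U+010A). A sketch of that standard byte-to-unicode table, mirroring OpenAI's encoder.py rather than code from this repo:

    def bytes_to_unicode():
        # Printable bytes map to themselves; the rest are shifted to U+0100 and up.
        bs = (list(range(ord('!'), ord('~') + 1))
              + list(range(ord('¡'), ord('¬') + 1))
              + list(range(ord('®'), ord('ÿ') + 1)))
        cs = bs[:]
        n = 0
        for b in range(256):
            if b not in bs:
                bs.append(b)
                cs.append(256 + n)
                n += 1
        return dict(zip(bs, map(chr, cs)))

    table = bytes_to_unicode()
    print(table[ord(' ')])   # 'Ġ' (U+0120)
    print(table[ord('\n')])  # 'Ċ' (U+010A)

So if a caller ever hands find_bpe_rank a token containing a literal space or newline, something upstream skipped the byte-level encoding, and asserting is more useful than papering over it.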
@@ -7461,6 +7447,21 @@ void llama_sample_repetition_penalties(
     }
 }
 
+static std::string llama_token_to_piece(const struct llama_context* ctx, llama_token token) {
+    std::vector<char> result(8, 0);
+    const int n_tokens = llama_token_to_piece(llama_get_model(ctx), token, result.data(), result.size());
+    if (n_tokens < 0) {
+        result.resize(-n_tokens);
+        int check = llama_token_to_piece(llama_get_model(ctx), token, result.data(), result.size());
+        GGML_ASSERT(check == -n_tokens);
+    }
+    else {
+        result.resize(n_tokens);
+    }
+
+    return std::string(result.data(), result.size());
+}
+
 void llama_sample_grammar(struct llama_context * ctx, llama_token_data_array * candidates, const struct llama_grammar * grammar) {
     GGML_ASSERT(ctx);
     const int64_t t_start_sample_us = ggml_time_us();
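The re-added helper (now named after the public API it wraps) relies on llama_token_to_piece's convention of returning the negative of the required buffer size when the caller's buffer (8 bytes here) is too small, so a single resize-and-retry always suffices.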
@@ -7480,7 +7481,7 @@ void llama_sample_grammar(struct llama_context * ctx, llama_token_data_array * candidates
 
     for (size_t i = 0; i < candidates->size; ++i) {
         const llama_token id = candidates->data[i].id;
-        const std::string piece = llama_token_to_str(ctx, id);
+        const std::string piece = llama_token_to_piece(ctx, id);
         if (id == eos) {
             if (!allow_eos) {
                 candidates->data[i].logit = -INFINITY;
@@ -7692,7 +7693,7 @@ void llama_grammar_accept_token(struct llama_context * ctx, struct llama_grammar
         GGML_ASSERT(false);
     }
 
-    const std::string piece = llama_token_to_str(ctx, token);
+    const std::string piece = llama_token_to_piece(ctx, token);
 
     // Note terminating 0 in decoded string
     const auto decoded = decode_utf8(piece.c_str(), grammar->partial_utf8);
models/ggml-vocab-mpt.gguf  (new file, BIN)
Binary file not shown.
tests/CMakeLists.txt

@@ -31,6 +31,7 @@ llama_test_executable (test-tokenizer-1-llama test-tokenizer-1-llama.cpp ${CMAKE
 llama_build_executable(test-tokenizer-1-bpe.cpp)
 llama_test_executable (test-tokenizer-1-falcon test-tokenizer-1-bpe.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-falcon.gguf)
 llama_test_executable(test-tokenizer-1-aquila test-tokenizer-1-bpe.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-aquila.gguf)
+llama_test_executable(test-tokenizer-1-mpt test-tokenizer-1-bpe.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-mpt.gguf)
 llama_build_and_test_executable(test-grammar-parser.cpp)
 llama_build_and_test_executable(test-llama-grammar.cpp)
 llama_build_and_test_executable(test-grad0.cpp) # SLOW
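With this registration the MPT round-trip test should be runnable on its own from a build directory, e.g. via ctest -R test-tokenizer-1-mpt (assuming the new models/ggml-vocab-mpt.gguf is in place).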
tests/test-tokenizer-1-bpe.cpp

@@ -62,6 +62,9 @@ int main(int argc, char **argv) {
     const int n_vocab = llama_n_vocab(model);
 
     for (int i = 0; i < n_vocab; ++i) {
+        if (llama_token_get_type(ctx, i) == LLAMA_TOKEN_TYPE_USER_DEFINED) {
+            continue;
+        }
         std::string str = llama_detokenize_bpe(ctx, std::vector<int>(1, i));
         try {
             auto cps = codepoints_from_utf8(str);
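The new skip ties back to the converter change above: [PAD{i}] fillers and tokenizer-added specials are exported as USER_DEFINED precisely so this detokenize/retokenize round trip can leave them out, since they are not ordinary byte-level BPE tokens and would not survive the trip.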