diff --git a/.gitignore b/.gitignore index 967dbb23b..b36d54a98 100644 --- a/.gitignore +++ b/.gitignore @@ -86,4 +86,4 @@ tests/test-sampling tests/test-tokenizer-0-llama tests/test-tokenizer-0-falcon tests/test-tokenizer-1-llama -tests/test-tokenizer-1-falcon +tests/test-tokenizer-1-bpe diff --git a/Makefile b/Makefile index 152964b4c..67575794b 100644 --- a/Makefile +++ b/Makefile @@ -2,7 +2,7 @@ BUILD_TARGETS = main quantize quantize-stats perplexity embedding vdot train-text-from-scratch convert-llama2c-to-ggml simple save-load-state server embd-input-test gguf llama-bench baby-llama beam-search speculative tests/test-c.o # Binaries only useful for tests -TEST_TARGETS = tests/test-llama-grammar tests/test-grammar-parser tests/test-double-float tests/test-grad0 tests/test-opt tests/test-quantize-fns tests/test-quantize-perf tests/test-sampling tests/test-tokenizer-0-llama tests/test-tokenizer-0-falcon tests/test-tokenizer-1-llama tests/test-tokenizer-1-falcon +TEST_TARGETS = tests/test-llama-grammar tests/test-grammar-parser tests/test-double-float tests/test-grad0 tests/test-opt tests/test-quantize-fns tests/test-quantize-perf tests/test-sampling tests/test-tokenizer-0-llama tests/test-tokenizer-0-falcon tests/test-tokenizer-1-llama tests/test-tokenizer-1-bpe # Code coverage output files COV_TARGETS = *.gcno tests/*.gcno *.gcda tests/*.gcda *.gcov tests/*.gcov lcov-report gcovr-report @@ -51,7 +51,7 @@ test: $(TEST_TARGETS) ./$$test_target $(CURDIR)/models/ggml-vocab-falcon.gguf; \ elif [ "$$test_target" = "tests/test-tokenizer-1-llama" ]; then \ continue; \ - elif [ "$$test_target" = "tests/test-tokenizer-1-falcon" ]; then \ + elif [ "$$test_target" = "tests/test-tokenizer-1-bpe" ]; then \ continue; \ else \ echo "Running test $$test_target..."; \ @@ -621,7 +621,7 @@ tests/test-tokenizer-0-falcon: tests/test-tokenizer-0-falcon.cpp build-info.h gg tests/test-tokenizer-0-llama: tests/test-tokenizer-0-llama.cpp build-info.h ggml.o llama.o common.o $(OBJS) $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) -tests/test-tokenizer-1-falcon: tests/test-tokenizer-1-falcon.cpp build-info.h ggml.o llama.o common.o $(OBJS) +tests/test-tokenizer-1-bpe: tests/test-tokenizer-1-bpe.cpp build-info.h ggml.o llama.o common.o $(OBJS) $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) tests/test-tokenizer-1-llama: tests/test-tokenizer-1-llama.cpp build-info.h ggml.o llama.o common.o $(OBJS) diff --git a/convert-falcon-hf-to-gguf.py b/convert-falcon-hf-to-gguf.py index cd8371844..8de3f3126 100755 --- a/convert-falcon-hf-to-gguf.py +++ b/convert-falcon-hf-to-gguf.py @@ -161,15 +161,12 @@ byte_encoder = bytes_to_unicode() byte_decoder = {v: k for k, v in byte_encoder.items()} for i in range(vocab_size): - if reverse_vocab[i] in byte_decoder: - text = reverse_vocab[i] - tokens.append(text) - scores.append(0.0) # dummy + text = reverse_vocab[i] + tokens.append(text) + scores.append(0.0) # dummy + if text in byte_decoder: toktypes.append(gguf.TokenType.BYTE) else: - text = reverse_vocab[i] - tokens.append(text) - scores.append(0.0) # dummy toktypes.append(gguf.TokenType.NORMAL) gguf_writer.add_token_list(tokens) diff --git a/convert.py b/convert.py index 4ac5030db..f55afdcd3 100755 --- a/convert.py +++ b/convert.py @@ -339,21 +339,15 @@ class BpeVocab: def bpe_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]: tokenizer = self.bpe_tokenizer from transformers.models.gpt2 import tokenization_gpt2 # type: ignore[import] + reverse_vocab = {id: encoded_tok for encoded_tok, id in tokenizer.items()} byte_encoder = tokenization_gpt2.bytes_to_unicode() byte_decoder = {v: k for k, v in byte_encoder.items()} + score = 0.0 - for i, item in enumerate(tokenizer): - text: bytes = item.encode("utf-8") - # FIXME: These shouldn't be hardcoded, but it's probably better than the current behavior? - if i <= 258 and text.startswith(b'<') and text.endswith(b'>'): - if i == 0 and text == b'': - toktype = gguf.TokenType.UNKNOWN - elif i == 1 or i == 2: - toktype = gguf.TokenType.CONTROL - elif i >= 3 and text.startswith(b'<0x'): - toktype = gguf.TokenType.BYTE - else: - toktype = gguf.TokenType.NORMAL + for i, _ in enumerate(tokenizer): + text = reverse_vocab[i] + if text in byte_decoder: + toktype = gguf.TokenType.BYTE else: toktype = gguf.TokenType.NORMAL yield text, score, toktype diff --git a/models/ggml-vocab-aquila.gguf b/models/ggml-vocab-aquila.gguf new file mode 100644 index 000000000..5ffc9c3cf Binary files /dev/null and b/models/ggml-vocab-aquila.gguf differ diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 227dd6659..dc989abab 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -28,9 +28,9 @@ llama_build_executable(test-tokenizer-0-falcon.cpp) llama_test_executable (test-tokenizer-0-falcon test-tokenizer-0-falcon.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-falcon.gguf) llama_build_executable(test-tokenizer-1-llama.cpp) llama_test_executable (test-tokenizer-1-llama test-tokenizer-1-llama.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-llama.gguf) -llama_build_executable(test-tokenizer-1-falcon.cpp) -llama_test_executable (test-tokenizer-1-falcon test-tokenizer-1-falcon.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-falcon.gguf) -#llama_test_executable(test-tokenizer-1.aquila test-tokenizer-1.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-aquila.gguf) +llama_build_executable(test-tokenizer-1-bpe.cpp) +llama_test_executable (test-tokenizer-1-falcon test-tokenizer-1-bpe.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-falcon.gguf) +llama_test_executable(test-tokenizer-1-aquila test-tokenizer-1-bpe.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-aquila.gguf) llama_build_and_test_executable(test-grammar-parser.cpp) llama_build_and_test_executable(test-llama-grammar.cpp) llama_build_and_test_executable(test-grad0.cpp) # SLOW diff --git a/tests/test-tokenizer-1-falcon.cpp b/tests/test-tokenizer-1-bpe.cpp similarity index 100% rename from tests/test-tokenizer-1-falcon.cpp rename to tests/test-tokenizer-1-bpe.cpp