Testing Aquila

2023-09-19 10:31:10 +02:00 · 2023-09-19 10:31:10 +02:00 · c0990bb739
commit c0990bb739
parent 048e659dae
7 changed files with 17 additions and 26 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -86,4 +86,4 @@ tests/test-sampling
 tests/test-tokenizer-0-llama
 tests/test-tokenizer-0-falcon
 tests/test-tokenizer-1-llama
-tests/test-tokenizer-1-falcon
+tests/test-tokenizer-1-bpe
--- a/Makefile
+++ b/Makefile
@@ -2,7 +2,7 @@
 BUILD_TARGETS = main quantize quantize-stats perplexity embedding vdot train-text-from-scratch convert-llama2c-to-ggml simple save-load-state server embd-input-test gguf llama-bench baby-llama beam-search speculative tests/test-c.o

 # Binaries only useful for tests
-TEST_TARGETS = tests/test-llama-grammar tests/test-grammar-parser tests/test-double-float tests/test-grad0 tests/test-opt tests/test-quantize-fns tests/test-quantize-perf tests/test-sampling tests/test-tokenizer-0-llama tests/test-tokenizer-0-falcon tests/test-tokenizer-1-llama tests/test-tokenizer-1-falcon
+TEST_TARGETS = tests/test-llama-grammar tests/test-grammar-parser tests/test-double-float tests/test-grad0 tests/test-opt tests/test-quantize-fns tests/test-quantize-perf tests/test-sampling tests/test-tokenizer-0-llama tests/test-tokenizer-0-falcon tests/test-tokenizer-1-llama tests/test-tokenizer-1-bpe

 # Code coverage output files
 COV_TARGETS = *.gcno tests/*.gcno *.gcda tests/*.gcda *.gcov tests/*.gcov lcov-report gcovr-report
@@ -51,7 +51,7 @@ test: $(TEST_TARGETS)
 			./$$test_target $(CURDIR)/models/ggml-vocab-falcon.gguf; \
 		elif [ "$$test_target" = "tests/test-tokenizer-1-llama" ]; then \
 			continue; \
-		elif [ "$$test_target" = "tests/test-tokenizer-1-falcon" ]; then \
+		elif [ "$$test_target" = "tests/test-tokenizer-1-bpe" ]; then \
 			continue; \
 		else \
 			echo "Running test $$test_target..."; \
@@ -621,7 +621,7 @@ tests/test-tokenizer-0-falcon: tests/test-tokenizer-0-falcon.cpp build-info.h gg
 tests/test-tokenizer-0-llama: tests/test-tokenizer-0-llama.cpp build-info.h ggml.o llama.o common.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)

-tests/test-tokenizer-1-falcon: tests/test-tokenizer-1-falcon.cpp build-info.h ggml.o llama.o common.o $(OBJS)
+tests/test-tokenizer-1-bpe: tests/test-tokenizer-1-bpe.cpp build-info.h ggml.o llama.o common.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)

 tests/test-tokenizer-1-llama: tests/test-tokenizer-1-llama.cpp build-info.h ggml.o llama.o common.o $(OBJS)
--- a/convert-falcon-hf-to-gguf.py
+++ b/convert-falcon-hf-to-gguf.py
@@ -161,15 +161,12 @@ byte_encoder = bytes_to_unicode()
 byte_decoder = {v: k for k, v in byte_encoder.items()}

 for i in range(vocab_size):
-    if reverse_vocab[i] in byte_decoder:
-        text = reverse_vocab[i]
-        tokens.append(text)
-        scores.append(0.0) # dummy
+    text = reverse_vocab[i]
+    tokens.append(text)
+    scores.append(0.0) # dummy
+    if text in byte_decoder:
        toktypes.append(gguf.TokenType.BYTE)
    else:
-        text = reverse_vocab[i]
-        tokens.append(text)
-        scores.append(0.0) # dummy
        toktypes.append(gguf.TokenType.NORMAL)

 gguf_writer.add_token_list(tokens)
--- a/convert.py
+++ b/convert.py
@@ -339,21 +339,15 @@ class BpeVocab:
    def bpe_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
        tokenizer = self.bpe_tokenizer
        from transformers.models.gpt2 import tokenization_gpt2  # type: ignore[import]
+        reverse_vocab = {id: encoded_tok for encoded_tok, id in tokenizer.items()}
        byte_encoder = tokenization_gpt2.bytes_to_unicode()
        byte_decoder = {v: k for k, v in byte_encoder.items()}
+
        score = 0.0
-        for i, item in enumerate(tokenizer):
-            text: bytes = item.encode("utf-8")
-            # FIXME: These shouldn't be hardcoded, but it's probably better than the current behavior?
-            if i <= 258 and text.startswith(b'<') and text.endswith(b'>'):
-                if i == 0 and text == b'<unk>':
-                    toktype = gguf.TokenType.UNKNOWN
-                elif i == 1 or i == 2:
-                    toktype = gguf.TokenType.CONTROL
-                elif i >= 3 and text.startswith(b'<0x'):
-                    toktype = gguf.TokenType.BYTE
-                else:
-                    toktype = gguf.TokenType.NORMAL
+        for i, _ in enumerate(tokenizer):
+            text = reverse_vocab[i]
+            if text in byte_decoder:
+                toktype = gguf.TokenType.BYTE
            else:
                toktype = gguf.TokenType.NORMAL
            yield text, score, toktype
--- a/models/ggml-vocab-aquila.gguf
+++ b/models/ggml-vocab-aquila.gguf
--- a/tests/CMakeLists.txt
+++ b/tests/CMakeLists.txt
@@ -28,9 +28,9 @@ llama_build_executable(test-tokenizer-0-falcon.cpp)
 llama_test_executable (test-tokenizer-0-falcon test-tokenizer-0-falcon.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-falcon.gguf)
 llama_build_executable(test-tokenizer-1-llama.cpp)
 llama_test_executable (test-tokenizer-1-llama test-tokenizer-1-llama.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-llama.gguf)
-llama_build_executable(test-tokenizer-1-falcon.cpp)
-llama_test_executable (test-tokenizer-1-falcon test-tokenizer-1-falcon.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-falcon.gguf)
-#llama_test_executable(test-tokenizer-1.aquila test-tokenizer-1.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-aquila.gguf)
+llama_build_executable(test-tokenizer-1-bpe.cpp)
+llama_test_executable (test-tokenizer-1-falcon test-tokenizer-1-bpe.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-falcon.gguf)
+llama_test_executable(test-tokenizer-1-aquila test-tokenizer-1-bpe.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-aquila.gguf)
 llama_build_and_test_executable(test-grammar-parser.cpp)
 llama_build_and_test_executable(test-llama-grammar.cpp)
 llama_build_and_test_executable(test-grad0.cpp) # SLOW
--- a/tests/test-tokenizer-1-falcon.cpp
+++ b/tests/test-tokenizer-1-bpe.cpp