Testing Aquila
This commit is contained in:
parent
048e659dae
commit
c0990bb739
7 changed files with 17 additions and 26 deletions
2
.gitignore
vendored
2
.gitignore
vendored
|
@ -86,4 +86,4 @@ tests/test-sampling
|
|||
tests/test-tokenizer-0-llama
|
||||
tests/test-tokenizer-0-falcon
|
||||
tests/test-tokenizer-1-llama
|
||||
tests/test-tokenizer-1-falcon
|
||||
tests/test-tokenizer-1-bpe
|
||||
|
|
6
Makefile
6
Makefile
|
@ -2,7 +2,7 @@
|
|||
BUILD_TARGETS = main quantize quantize-stats perplexity embedding vdot train-text-from-scratch convert-llama2c-to-ggml simple save-load-state server embd-input-test gguf llama-bench baby-llama beam-search speculative tests/test-c.o
|
||||
|
||||
# Binaries only useful for tests
|
||||
TEST_TARGETS = tests/test-llama-grammar tests/test-grammar-parser tests/test-double-float tests/test-grad0 tests/test-opt tests/test-quantize-fns tests/test-quantize-perf tests/test-sampling tests/test-tokenizer-0-llama tests/test-tokenizer-0-falcon tests/test-tokenizer-1-llama tests/test-tokenizer-1-falcon
|
||||
TEST_TARGETS = tests/test-llama-grammar tests/test-grammar-parser tests/test-double-float tests/test-grad0 tests/test-opt tests/test-quantize-fns tests/test-quantize-perf tests/test-sampling tests/test-tokenizer-0-llama tests/test-tokenizer-0-falcon tests/test-tokenizer-1-llama tests/test-tokenizer-1-bpe
|
||||
|
||||
# Code coverage output files
|
||||
COV_TARGETS = *.gcno tests/*.gcno *.gcda tests/*.gcda *.gcov tests/*.gcov lcov-report gcovr-report
|
||||
|
@ -51,7 +51,7 @@ test: $(TEST_TARGETS)
|
|||
./$$test_target $(CURDIR)/models/ggml-vocab-falcon.gguf; \
|
||||
elif [ "$$test_target" = "tests/test-tokenizer-1-llama" ]; then \
|
||||
continue; \
|
||||
elif [ "$$test_target" = "tests/test-tokenizer-1-falcon" ]; then \
|
||||
elif [ "$$test_target" = "tests/test-tokenizer-1-bpe" ]; then \
|
||||
continue; \
|
||||
else \
|
||||
echo "Running test $$test_target..."; \
|
||||
|
@ -621,7 +621,7 @@ tests/test-tokenizer-0-falcon: tests/test-tokenizer-0-falcon.cpp build-info.h gg
|
|||
tests/test-tokenizer-0-llama: tests/test-tokenizer-0-llama.cpp build-info.h ggml.o llama.o common.o $(OBJS)
|
||||
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
|
||||
|
||||
tests/test-tokenizer-1-falcon: tests/test-tokenizer-1-falcon.cpp build-info.h ggml.o llama.o common.o $(OBJS)
|
||||
tests/test-tokenizer-1-bpe: tests/test-tokenizer-1-bpe.cpp build-info.h ggml.o llama.o common.o $(OBJS)
|
||||
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
|
||||
|
||||
tests/test-tokenizer-1-llama: tests/test-tokenizer-1-llama.cpp build-info.h ggml.o llama.o common.o $(OBJS)
|
||||
|
|
|
@ -161,15 +161,12 @@ byte_encoder = bytes_to_unicode()
|
|||
byte_decoder = {v: k for k, v in byte_encoder.items()}
|
||||
|
||||
for i in range(vocab_size):
|
||||
if reverse_vocab[i] in byte_decoder:
|
||||
text = reverse_vocab[i]
|
||||
tokens.append(text)
|
||||
scores.append(0.0) # dummy
|
||||
text = reverse_vocab[i]
|
||||
tokens.append(text)
|
||||
scores.append(0.0) # dummy
|
||||
if text in byte_decoder:
|
||||
toktypes.append(gguf.TokenType.BYTE)
|
||||
else:
|
||||
text = reverse_vocab[i]
|
||||
tokens.append(text)
|
||||
scores.append(0.0) # dummy
|
||||
toktypes.append(gguf.TokenType.NORMAL)
|
||||
|
||||
gguf_writer.add_token_list(tokens)
|
||||
|
|
18
convert.py
18
convert.py
|
@ -339,21 +339,15 @@ class BpeVocab:
|
|||
def bpe_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
|
||||
tokenizer = self.bpe_tokenizer
|
||||
from transformers.models.gpt2 import tokenization_gpt2 # type: ignore[import]
|
||||
reverse_vocab = {id: encoded_tok for encoded_tok, id in tokenizer.items()}
|
||||
byte_encoder = tokenization_gpt2.bytes_to_unicode()
|
||||
byte_decoder = {v: k for k, v in byte_encoder.items()}
|
||||
|
||||
score = 0.0
|
||||
for i, item in enumerate(tokenizer):
|
||||
text: bytes = item.encode("utf-8")
|
||||
# FIXME: These shouldn't be hardcoded, but it's probably better than the current behavior?
|
||||
if i <= 258 and text.startswith(b'<') and text.endswith(b'>'):
|
||||
if i == 0 and text == b'<unk>':
|
||||
toktype = gguf.TokenType.UNKNOWN
|
||||
elif i == 1 or i == 2:
|
||||
toktype = gguf.TokenType.CONTROL
|
||||
elif i >= 3 and text.startswith(b'<0x'):
|
||||
toktype = gguf.TokenType.BYTE
|
||||
else:
|
||||
toktype = gguf.TokenType.NORMAL
|
||||
for i, _ in enumerate(tokenizer):
|
||||
text = reverse_vocab[i]
|
||||
if text in byte_decoder:
|
||||
toktype = gguf.TokenType.BYTE
|
||||
else:
|
||||
toktype = gguf.TokenType.NORMAL
|
||||
yield text, score, toktype
|
||||
|
|
BIN
models/ggml-vocab-aquila.gguf
Normal file
BIN
models/ggml-vocab-aquila.gguf
Normal file
Binary file not shown.
|
@ -28,9 +28,9 @@ llama_build_executable(test-tokenizer-0-falcon.cpp)
|
|||
llama_test_executable (test-tokenizer-0-falcon test-tokenizer-0-falcon.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-falcon.gguf)
|
||||
llama_build_executable(test-tokenizer-1-llama.cpp)
|
||||
llama_test_executable (test-tokenizer-1-llama test-tokenizer-1-llama.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-llama.gguf)
|
||||
llama_build_executable(test-tokenizer-1-falcon.cpp)
|
||||
llama_test_executable (test-tokenizer-1-falcon test-tokenizer-1-falcon.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-falcon.gguf)
|
||||
#llama_test_executable(test-tokenizer-1.aquila test-tokenizer-1.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-aquila.gguf)
|
||||
llama_build_executable(test-tokenizer-1-bpe.cpp)
|
||||
llama_test_executable (test-tokenizer-1-falcon test-tokenizer-1-bpe.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-falcon.gguf)
|
||||
llama_test_executable(test-tokenizer-1-aquila test-tokenizer-1-bpe.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-aquila.gguf)
|
||||
llama_build_and_test_executable(test-grammar-parser.cpp)
|
||||
llama_build_and_test_executable(test-llama-grammar.cpp)
|
||||
llama_build_and_test_executable(test-grad0.cpp) # SLOW
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue