Testing Aquila
This commit is contained in:
parent
048e659dae
commit
c0990bb739
7 changed files with 17 additions and 26 deletions
2
.gitignore
vendored
2
.gitignore
vendored
|
@ -86,4 +86,4 @@ tests/test-sampling
|
||||||
tests/test-tokenizer-0-llama
|
tests/test-tokenizer-0-llama
|
||||||
tests/test-tokenizer-0-falcon
|
tests/test-tokenizer-0-falcon
|
||||||
tests/test-tokenizer-1-llama
|
tests/test-tokenizer-1-llama
|
||||||
tests/test-tokenizer-1-falcon
|
tests/test-tokenizer-1-bpe
|
||||||
|
|
6
Makefile
6
Makefile
|
@ -2,7 +2,7 @@
|
||||||
BUILD_TARGETS = main quantize quantize-stats perplexity embedding vdot train-text-from-scratch convert-llama2c-to-ggml simple save-load-state server embd-input-test gguf llama-bench baby-llama beam-search speculative tests/test-c.o
|
BUILD_TARGETS = main quantize quantize-stats perplexity embedding vdot train-text-from-scratch convert-llama2c-to-ggml simple save-load-state server embd-input-test gguf llama-bench baby-llama beam-search speculative tests/test-c.o
|
||||||
|
|
||||||
# Binaries only useful for tests
|
# Binaries only useful for tests
|
||||||
TEST_TARGETS = tests/test-llama-grammar tests/test-grammar-parser tests/test-double-float tests/test-grad0 tests/test-opt tests/test-quantize-fns tests/test-quantize-perf tests/test-sampling tests/test-tokenizer-0-llama tests/test-tokenizer-0-falcon tests/test-tokenizer-1-llama tests/test-tokenizer-1-falcon
|
TEST_TARGETS = tests/test-llama-grammar tests/test-grammar-parser tests/test-double-float tests/test-grad0 tests/test-opt tests/test-quantize-fns tests/test-quantize-perf tests/test-sampling tests/test-tokenizer-0-llama tests/test-tokenizer-0-falcon tests/test-tokenizer-1-llama tests/test-tokenizer-1-bpe
|
||||||
|
|
||||||
# Code coverage output files
|
# Code coverage output files
|
||||||
COV_TARGETS = *.gcno tests/*.gcno *.gcda tests/*.gcda *.gcov tests/*.gcov lcov-report gcovr-report
|
COV_TARGETS = *.gcno tests/*.gcno *.gcda tests/*.gcda *.gcov tests/*.gcov lcov-report gcovr-report
|
||||||
|
@ -51,7 +51,7 @@ test: $(TEST_TARGETS)
|
||||||
./$$test_target $(CURDIR)/models/ggml-vocab-falcon.gguf; \
|
./$$test_target $(CURDIR)/models/ggml-vocab-falcon.gguf; \
|
||||||
elif [ "$$test_target" = "tests/test-tokenizer-1-llama" ]; then \
|
elif [ "$$test_target" = "tests/test-tokenizer-1-llama" ]; then \
|
||||||
continue; \
|
continue; \
|
||||||
elif [ "$$test_target" = "tests/test-tokenizer-1-falcon" ]; then \
|
elif [ "$$test_target" = "tests/test-tokenizer-1-bpe" ]; then \
|
||||||
continue; \
|
continue; \
|
||||||
else \
|
else \
|
||||||
echo "Running test $$test_target..."; \
|
echo "Running test $$test_target..."; \
|
||||||
|
@ -621,7 +621,7 @@ tests/test-tokenizer-0-falcon: tests/test-tokenizer-0-falcon.cpp build-info.h gg
|
||||||
tests/test-tokenizer-0-llama: tests/test-tokenizer-0-llama.cpp build-info.h ggml.o llama.o common.o $(OBJS)
|
tests/test-tokenizer-0-llama: tests/test-tokenizer-0-llama.cpp build-info.h ggml.o llama.o common.o $(OBJS)
|
||||||
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
|
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
|
||||||
|
|
||||||
tests/test-tokenizer-1-falcon: tests/test-tokenizer-1-falcon.cpp build-info.h ggml.o llama.o common.o $(OBJS)
|
tests/test-tokenizer-1-bpe: tests/test-tokenizer-1-bpe.cpp build-info.h ggml.o llama.o common.o $(OBJS)
|
||||||
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
|
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
|
||||||
|
|
||||||
tests/test-tokenizer-1-llama: tests/test-tokenizer-1-llama.cpp build-info.h ggml.o llama.o common.o $(OBJS)
|
tests/test-tokenizer-1-llama: tests/test-tokenizer-1-llama.cpp build-info.h ggml.o llama.o common.o $(OBJS)
|
||||||
|
|
|
@ -161,15 +161,12 @@ byte_encoder = bytes_to_unicode()
|
||||||
byte_decoder = {v: k for k, v in byte_encoder.items()}
|
byte_decoder = {v: k for k, v in byte_encoder.items()}
|
||||||
|
|
||||||
for i in range(vocab_size):
|
for i in range(vocab_size):
|
||||||
if reverse_vocab[i] in byte_decoder:
|
text = reverse_vocab[i]
|
||||||
text = reverse_vocab[i]
|
tokens.append(text)
|
||||||
tokens.append(text)
|
scores.append(0.0) # dummy
|
||||||
scores.append(0.0) # dummy
|
if text in byte_decoder:
|
||||||
toktypes.append(gguf.TokenType.BYTE)
|
toktypes.append(gguf.TokenType.BYTE)
|
||||||
else:
|
else:
|
||||||
text = reverse_vocab[i]
|
|
||||||
tokens.append(text)
|
|
||||||
scores.append(0.0) # dummy
|
|
||||||
toktypes.append(gguf.TokenType.NORMAL)
|
toktypes.append(gguf.TokenType.NORMAL)
|
||||||
|
|
||||||
gguf_writer.add_token_list(tokens)
|
gguf_writer.add_token_list(tokens)
|
||||||
|
|
18
convert.py
18
convert.py
|
@ -339,21 +339,15 @@ class BpeVocab:
|
||||||
def bpe_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
|
def bpe_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
|
||||||
tokenizer = self.bpe_tokenizer
|
tokenizer = self.bpe_tokenizer
|
||||||
from transformers.models.gpt2 import tokenization_gpt2 # type: ignore[import]
|
from transformers.models.gpt2 import tokenization_gpt2 # type: ignore[import]
|
||||||
|
reverse_vocab = {id: encoded_tok for encoded_tok, id in tokenizer.items()}
|
||||||
byte_encoder = tokenization_gpt2.bytes_to_unicode()
|
byte_encoder = tokenization_gpt2.bytes_to_unicode()
|
||||||
byte_decoder = {v: k for k, v in byte_encoder.items()}
|
byte_decoder = {v: k for k, v in byte_encoder.items()}
|
||||||
|
|
||||||
score = 0.0
|
score = 0.0
|
||||||
for i, item in enumerate(tokenizer):
|
for i, _ in enumerate(tokenizer):
|
||||||
text: bytes = item.encode("utf-8")
|
text = reverse_vocab[i]
|
||||||
# FIXME: These shouldn't be hardcoded, but it's probably better than the current behavior?
|
if text in byte_decoder:
|
||||||
if i <= 258 and text.startswith(b'<') and text.endswith(b'>'):
|
toktype = gguf.TokenType.BYTE
|
||||||
if i == 0 and text == b'<unk>':
|
|
||||||
toktype = gguf.TokenType.UNKNOWN
|
|
||||||
elif i == 1 or i == 2:
|
|
||||||
toktype = gguf.TokenType.CONTROL
|
|
||||||
elif i >= 3 and text.startswith(b'<0x'):
|
|
||||||
toktype = gguf.TokenType.BYTE
|
|
||||||
else:
|
|
||||||
toktype = gguf.TokenType.NORMAL
|
|
||||||
else:
|
else:
|
||||||
toktype = gguf.TokenType.NORMAL
|
toktype = gguf.TokenType.NORMAL
|
||||||
yield text, score, toktype
|
yield text, score, toktype
|
||||||
|
|
BIN
models/ggml-vocab-aquila.gguf
Normal file
BIN
models/ggml-vocab-aquila.gguf
Normal file
Binary file not shown.
|
@ -28,9 +28,9 @@ llama_build_executable(test-tokenizer-0-falcon.cpp)
|
||||||
llama_test_executable (test-tokenizer-0-falcon test-tokenizer-0-falcon.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-falcon.gguf)
|
llama_test_executable (test-tokenizer-0-falcon test-tokenizer-0-falcon.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-falcon.gguf)
|
||||||
llama_build_executable(test-tokenizer-1-llama.cpp)
|
llama_build_executable(test-tokenizer-1-llama.cpp)
|
||||||
llama_test_executable (test-tokenizer-1-llama test-tokenizer-1-llama.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-llama.gguf)
|
llama_test_executable (test-tokenizer-1-llama test-tokenizer-1-llama.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-llama.gguf)
|
||||||
llama_build_executable(test-tokenizer-1-falcon.cpp)
|
llama_build_executable(test-tokenizer-1-bpe.cpp)
|
||||||
llama_test_executable (test-tokenizer-1-falcon test-tokenizer-1-falcon.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-falcon.gguf)
|
llama_test_executable (test-tokenizer-1-falcon test-tokenizer-1-bpe.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-falcon.gguf)
|
||||||
#llama_test_executable(test-tokenizer-1.aquila test-tokenizer-1.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-aquila.gguf)
|
llama_test_executable(test-tokenizer-1-aquila test-tokenizer-1-bpe.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-aquila.gguf)
|
||||||
llama_build_and_test_executable(test-grammar-parser.cpp)
|
llama_build_and_test_executable(test-grammar-parser.cpp)
|
||||||
llama_build_and_test_executable(test-llama-grammar.cpp)
|
llama_build_and_test_executable(test-llama-grammar.cpp)
|
||||||
llama_build_and_test_executable(test-grad0.cpp) # SLOW
|
llama_build_and_test_executable(test-grad0.cpp) # SLOW
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue