clean-up : warnings, names

This commit is contained in:
Georgi Gerganov 2023-11-29 10:01:36 +02:00
parent 87fe183d4d
commit fecb61b193
No known key found for this signature in database
GPG key ID: 449E073F9DC10735
8 changed files with 30 additions and 21 deletions

2
.gitignore vendored
View file

@ -97,6 +97,6 @@ tests/test-quantize-perf
tests/test-sampling tests/test-sampling
tests/test-tokenizer-0-llama tests/test-tokenizer-0-llama
tests/test-tokenizer-0-falcon tests/test-tokenizer-0-falcon
tests/test-tokenizer-0-deepseek_coder tests/test-tokenizer-0-deepseek-coder
tests/test-tokenizer-1-llama tests/test-tokenizer-1-llama
tests/test-tokenizer-1-bpe tests/test-tokenizer-1-bpe

View file

@ -8,7 +8,7 @@ BUILD_TARGETS = \
TEST_TARGETS = \ TEST_TARGETS = \
tests/test-llama-grammar tests/test-grammar-parser tests/test-double-float tests/test-grad0 tests/test-opt \ tests/test-llama-grammar tests/test-grammar-parser tests/test-double-float tests/test-grad0 tests/test-opt \
tests/test-quantize-fns tests/test-quantize-perf tests/test-sampling tests/test-tokenizer-0-llama \ tests/test-quantize-fns tests/test-quantize-perf tests/test-sampling tests/test-tokenizer-0-llama \
tests/test-tokenizer-0-falcon tests/test-tokenizer-0-deepseek_coder \ tests/test-tokenizer-0-falcon tests/test-tokenizer-0-deepseek-coder \
tests/test-tokenizer-1-llama tests/test-tokenizer-1-bpe tests/test-tokenizer-1-llama tests/test-tokenizer-1-bpe
# Code coverage output files # Code coverage output files
@ -70,7 +70,7 @@ test: $(TEST_TARGETS)
./$$test_target $(CURDIR)/models/ggml-vocab-llama.gguf; \ ./$$test_target $(CURDIR)/models/ggml-vocab-llama.gguf; \
elif [ "$$test_target" = "tests/test-tokenizer-0-falcon" ]; then \ elif [ "$$test_target" = "tests/test-tokenizer-0-falcon" ]; then \
./$$test_target $(CURDIR)/models/ggml-vocab-falcon.gguf; \ ./$$test_target $(CURDIR)/models/ggml-vocab-falcon.gguf; \
elif [ "$$test_target" = "tests/test-tokenizer-0-deepseek_coder" ]; then \ elif [ "$$test_target" = "tests/test-tokenizer-0-deepseek-coder" ]; then \
./$$test_target $(CURDIR)/models/ggml-vocab-deepseek-coder.gguf; \ ./$$test_target $(CURDIR)/models/ggml-vocab-deepseek-coder.gguf; \
elif [ "$$test_target" = "tests/test-tokenizer-1-llama" ]; then \ elif [ "$$test_target" = "tests/test-tokenizer-1-llama" ]; then \
continue; \ continue; \
@ -731,7 +731,7 @@ tests/test-tokenizer-0-falcon: tests/test-tokenizer-0-falcon.cpp ggml.o llama.o
tests/test-tokenizer-0-llama: tests/test-tokenizer-0-llama.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS) tests/test-tokenizer-0-llama: tests/test-tokenizer-0-llama.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
tests/test-tokenizer-0-deepseek_coder: tests/test-tokenizer-0-deepseek_coder.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS) tests/test-tokenizer-0-deepseek-coder: tests/test-tokenizer-0-deepseek-coder.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS) $(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
tests/test-tokenizer-1-bpe: tests/test-tokenizer-1-bpe.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS) tests/test-tokenizer-1-bpe: tests/test-tokenizer-1-bpe.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)

View file

@ -6108,11 +6108,11 @@ private:
std::vector<size_t> bpe_words; // stroe the offset of each word std::vector<size_t> bpe_words; // stroe the offset of each word
bpe_words.reserve(offsets.size()); // Reserve memory for the approximate size bpe_words.reserve(offsets.size()); // Reserve memory for the approximate size
size_t start = 0; size_t start = 0;
for ( auto & offset : offsets) { for (auto offset : offsets) {
std::wcregex_iterator it(text.data() + start, text.data() + start + offset, expr); std::wcregex_iterator it(text.data() + start, text.data() + start + offset, expr);
std::wcregex_iterator end; std::wcregex_iterator end;
size_t start_idx = 0; int64_t start_idx = 0;
while (it != end) { while (it != end) {
std::wcmatch match = *it; std::wcmatch match = *it;
if (match.position() > start_idx) { if (match.position() > start_idx) {
@ -6122,7 +6122,7 @@ private:
start_idx = match.position() + match.length(); start_idx = match.position() + match.length();
++it; ++it;
} }
if (start_idx < offset) { if (start_idx < (int64_t) offset) {
bpe_words.emplace_back(offset - start_idx); bpe_words.emplace_back(offset - start_idx);
} }
start += offset; start += offset;

View file

@ -22,22 +22,30 @@ endfunction()
llama_build_and_test_executable(test-quantize-fns.cpp) llama_build_and_test_executable(test-quantize-fns.cpp)
llama_build_and_test_executable(test-quantize-perf.cpp) llama_build_and_test_executable(test-quantize-perf.cpp)
llama_build_and_test_executable(test-sampling.cpp) llama_build_and_test_executable(test-sampling.cpp)
llama_build_executable(test-tokenizer-0-llama.cpp) llama_build_executable(test-tokenizer-0-llama.cpp)
llama_test_executable (test-tokenizer-0-llama test-tokenizer-0-llama.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-llama.gguf) llama_test_executable (test-tokenizer-0-llama test-tokenizer-0-llama.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-llama.gguf)
llama_build_executable(test-tokenizer-0-falcon.cpp) llama_build_executable(test-tokenizer-0-falcon.cpp)
llama_test_executable (test-tokenizer-0-falcon test-tokenizer-0-falcon.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-falcon.gguf) llama_test_executable (test-tokenizer-0-falcon test-tokenizer-0-falcon.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-falcon.gguf)
llama_build_executable(test-tokenizer-0-deepseek-coder.cpp)
llama_test_executable (test-tokenizer-0-deepseek-coder test-tokenizer-0-deepseek-coder.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-deepseek-coder.gguf)
llama_build_executable(test-tokenizer-1-llama.cpp) llama_build_executable(test-tokenizer-1-llama.cpp)
llama_test_executable (test-tokenizer-1-llama test-tokenizer-1-llama.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-llama.gguf) llama_test_executable (test-tokenizer-1-llama test-tokenizer-1-llama.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-llama.gguf)
llama_test_executable(test-tokenizer-1-baichuan test-tokenizer-1-llama.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-baichuan.gguf) llama_test_executable (test-tokenizer-1-baichuan test-tokenizer-1-llama.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-baichuan.gguf)
llama_build_executable(test-tokenizer-1-bpe.cpp) llama_build_executable(test-tokenizer-1-bpe.cpp)
llama_test_executable (test-tokenizer-1-falcon test-tokenizer-1-bpe.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-falcon.gguf) llama_test_executable (test-tokenizer-1-falcon test-tokenizer-1-bpe.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-falcon.gguf)
llama_test_executable(test-tokenizer-1-aquila test-tokenizer-1-bpe.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-aquila.gguf) llama_test_executable (test-tokenizer-1-aquila test-tokenizer-1-bpe.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-aquila.gguf)
llama_test_executable(test-tokenizer-1-mpt test-tokenizer-1-bpe.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-mpt.gguf) llama_test_executable (test-tokenizer-1-mpt test-tokenizer-1-bpe.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-mpt.gguf)
llama_test_executable(test-tokenizer-1-stablelm-3b-4e1t test-tokenizer-1-bpe.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-stablelm-3b-4e1t.gguf) llama_test_executable (test-tokenizer-1-stablelm-3b-4e1t test-tokenizer-1-bpe.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-stablelm-3b-4e1t.gguf)
llama_test_executable(test-tokenizer-1-gpt-neox test-tokenizer-1-bpe.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-gpt-neox.gguf) llama_test_executable (test-tokenizer-1-gpt-neox test-tokenizer-1-bpe.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-gpt-neox.gguf)
llama_test_executable(test-tokenizer-1-refact test-tokenizer-1-bpe.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-refact.gguf) llama_test_executable (test-tokenizer-1-refact test-tokenizer-1-bpe.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-refact.gguf)
llama_test_executable(test-tokenizer-1-starcoder test-tokenizer-1-bpe.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-starcoder.gguf) llama_test_executable (test-tokenizer-1-starcoder test-tokenizer-1-bpe.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-starcoder.gguf)
# llama_test_executable(test-tokenizer-1-bloom test-tokenizer-1-bpe.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-bloom.gguf) # BIG #llama_test_executable (test-tokenizer-1-bloom test-tokenizer-1-bpe.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-bloom.gguf) # BIG
llama_build_and_test_executable(test-grammar-parser.cpp) llama_build_and_test_executable(test-grammar-parser.cpp)
llama_build_and_test_executable(test-llama-grammar.cpp) llama_build_and_test_executable(test-llama-grammar.cpp)
llama_build_and_test_executable(test-grad0.cpp) # SLOW llama_build_and_test_executable(test-grad0.cpp) # SLOW

View file

@ -1,4 +1,4 @@
#pragma once #pragma once
#include <cassert> #include <cassert>
#include <stdexcept> #include <stdexcept>
@ -499,3 +499,4 @@ inline std::string to_utf8(const std::wstring& ws)
std::string utf8 = converter.to_bytes(ws); std::string utf8 = converter.to_bytes(ws);
return utf8; return utf8;
} }