clean-up : warnings, names

This commit is contained in:
Georgi Gerganov 2023-11-29 10:01:36 +02:00
parent 87fe183d4d
commit fecb61b193
No known key found for this signature in database
GPG key ID: 449E073F9DC10735
8 changed files with 30 additions and 21 deletions

2
.gitignore vendored
View file

@ -97,6 +97,6 @@ tests/test-quantize-perf
tests/test-sampling
tests/test-tokenizer-0-llama
tests/test-tokenizer-0-falcon
tests/test-tokenizer-0-deepseek_coder
tests/test-tokenizer-0-deepseek-coder
tests/test-tokenizer-1-llama
tests/test-tokenizer-1-bpe

View file

@ -8,7 +8,7 @@ BUILD_TARGETS = \
TEST_TARGETS = \
tests/test-llama-grammar tests/test-grammar-parser tests/test-double-float tests/test-grad0 tests/test-opt \
tests/test-quantize-fns tests/test-quantize-perf tests/test-sampling tests/test-tokenizer-0-llama \
tests/test-tokenizer-0-falcon tests/test-tokenizer-0-deepseek_coder \
tests/test-tokenizer-0-falcon tests/test-tokenizer-0-deepseek-coder \
tests/test-tokenizer-1-llama tests/test-tokenizer-1-bpe
# Code coverage output files
@ -70,7 +70,7 @@ test: $(TEST_TARGETS)
./$$test_target $(CURDIR)/models/ggml-vocab-llama.gguf; \
elif [ "$$test_target" = "tests/test-tokenizer-0-falcon" ]; then \
./$$test_target $(CURDIR)/models/ggml-vocab-falcon.gguf; \
elif [ "$$test_target" = "tests/test-tokenizer-0-deepseek_coder" ]; then \
elif [ "$$test_target" = "tests/test-tokenizer-0-deepseek-coder" ]; then \
./$$test_target $(CURDIR)/models/ggml-vocab-deepseek-coder.gguf; \
elif [ "$$test_target" = "tests/test-tokenizer-1-llama" ]; then \
continue; \
@ -731,7 +731,7 @@ tests/test-tokenizer-0-falcon: tests/test-tokenizer-0-falcon.cpp ggml.o llama.o
tests/test-tokenizer-0-llama: tests/test-tokenizer-0-llama.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
tests/test-tokenizer-0-deepseek_coder: tests/test-tokenizer-0-deepseek_coder.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
tests/test-tokenizer-0-deepseek-coder: tests/test-tokenizer-0-deepseek-coder.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
tests/test-tokenizer-1-bpe: tests/test-tokenizer-1-bpe.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)

View file

@ -6108,11 +6108,11 @@ private:
std::vector<size_t> bpe_words; // store the offset of each word
bpe_words.reserve(offsets.size()); // Reserve memory for the approximate size
size_t start = 0;
for ( auto & offset : offsets) {
for (auto offset : offsets) {
std::wcregex_iterator it(text.data() + start, text.data() + start + offset, expr);
std::wcregex_iterator end;
size_t start_idx = 0;
int64_t start_idx = 0;
while (it != end) {
std::wcmatch match = *it;
if (match.position() > start_idx) {
@ -6122,7 +6122,7 @@ private:
start_idx = match.position() + match.length();
++it;
}
if (start_idx < offset) {
if (start_idx < (int64_t) offset) {
bpe_words.emplace_back(offset - start_idx);
}
start += offset;

View file

@ -22,13 +22,20 @@ endfunction()
llama_build_and_test_executable(test-quantize-fns.cpp)
llama_build_and_test_executable(test-quantize-perf.cpp)
llama_build_and_test_executable(test-sampling.cpp)
llama_build_executable(test-tokenizer-0-llama.cpp)
llama_test_executable (test-tokenizer-0-llama test-tokenizer-0-llama.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-llama.gguf)
llama_build_executable(test-tokenizer-0-falcon.cpp)
llama_test_executable (test-tokenizer-0-falcon test-tokenizer-0-falcon.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-falcon.gguf)
llama_build_executable(test-tokenizer-0-deepseek-coder.cpp)
llama_test_executable (test-tokenizer-0-deepseek-coder test-tokenizer-0-deepseek-coder.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-deepseek-coder.gguf)
llama_build_executable(test-tokenizer-1-llama.cpp)
llama_test_executable (test-tokenizer-1-llama test-tokenizer-1-llama.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-llama.gguf)
llama_test_executable (test-tokenizer-1-baichuan test-tokenizer-1-llama.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-baichuan.gguf)
llama_build_executable(test-tokenizer-1-bpe.cpp)
llama_test_executable (test-tokenizer-1-falcon test-tokenizer-1-bpe.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-falcon.gguf)
llama_test_executable (test-tokenizer-1-aquila test-tokenizer-1-bpe.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-aquila.gguf)
@ -38,6 +45,7 @@ llama_test_executable(test-tokenizer-1-gpt-neox test-tokenizer-1-bpe.cpp ${CMAKE
llama_test_executable (test-tokenizer-1-refact test-tokenizer-1-bpe.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-refact.gguf)
llama_test_executable (test-tokenizer-1-starcoder test-tokenizer-1-bpe.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-starcoder.gguf)
#llama_test_executable (test-tokenizer-1-bloom test-tokenizer-1-bpe.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-bloom.gguf) # BIG
llama_build_and_test_executable(test-grammar-parser.cpp)
llama_build_and_test_executable(test-llama-grammar.cpp)
llama_build_and_test_executable(test-grad0.cpp) # SLOW

View file

@ -1,4 +1,4 @@
#pragma once
#pragma once
#include <cassert>
#include <stdexcept>
@ -499,3 +499,4 @@ inline std::string to_utf8(const std::wstring& ws)
std::string utf8 = converter.to_bytes(ws);
return utf8;
}