clean-up : warnings, names
This commit is contained in:
parent
87fe183d4d
commit
fecb61b193
8 changed files with 30 additions and 21 deletions
2
.gitignore
vendored
2
.gitignore
vendored
|
@ -97,6 +97,6 @@ tests/test-quantize-perf
|
||||||
tests/test-sampling
|
tests/test-sampling
|
||||||
tests/test-tokenizer-0-llama
|
tests/test-tokenizer-0-llama
|
||||||
tests/test-tokenizer-0-falcon
|
tests/test-tokenizer-0-falcon
|
||||||
tests/test-tokenizer-0-deepseek_coder
|
tests/test-tokenizer-0-deepseek-coder
|
||||||
tests/test-tokenizer-1-llama
|
tests/test-tokenizer-1-llama
|
||||||
tests/test-tokenizer-1-bpe
|
tests/test-tokenizer-1-bpe
|
||||||
|
|
6
Makefile
6
Makefile
|
@ -8,7 +8,7 @@ BUILD_TARGETS = \
|
||||||
TEST_TARGETS = \
|
TEST_TARGETS = \
|
||||||
tests/test-llama-grammar tests/test-grammar-parser tests/test-double-float tests/test-grad0 tests/test-opt \
|
tests/test-llama-grammar tests/test-grammar-parser tests/test-double-float tests/test-grad0 tests/test-opt \
|
||||||
tests/test-quantize-fns tests/test-quantize-perf tests/test-sampling tests/test-tokenizer-0-llama \
|
tests/test-quantize-fns tests/test-quantize-perf tests/test-sampling tests/test-tokenizer-0-llama \
|
||||||
tests/test-tokenizer-0-falcon tests/test-tokenizer-0-deepseek_coder \
|
tests/test-tokenizer-0-falcon tests/test-tokenizer-0-deepseek-coder \
|
||||||
tests/test-tokenizer-1-llama tests/test-tokenizer-1-bpe
|
tests/test-tokenizer-1-llama tests/test-tokenizer-1-bpe
|
||||||
|
|
||||||
# Code coverage output files
|
# Code coverage output files
|
||||||
|
@ -70,7 +70,7 @@ test: $(TEST_TARGETS)
|
||||||
./$$test_target $(CURDIR)/models/ggml-vocab-llama.gguf; \
|
./$$test_target $(CURDIR)/models/ggml-vocab-llama.gguf; \
|
||||||
elif [ "$$test_target" = "tests/test-tokenizer-0-falcon" ]; then \
|
elif [ "$$test_target" = "tests/test-tokenizer-0-falcon" ]; then \
|
||||||
./$$test_target $(CURDIR)/models/ggml-vocab-falcon.gguf; \
|
./$$test_target $(CURDIR)/models/ggml-vocab-falcon.gguf; \
|
||||||
elif [ "$$test_target" = "tests/test-tokenizer-0-deepseek_coder" ]; then \
|
elif [ "$$test_target" = "tests/test-tokenizer-0-deepseek-coder" ]; then \
|
||||||
./$$test_target $(CURDIR)/models/ggml-vocab-deepseek-coder.gguf; \
|
./$$test_target $(CURDIR)/models/ggml-vocab-deepseek-coder.gguf; \
|
||||||
elif [ "$$test_target" = "tests/test-tokenizer-1-llama" ]; then \
|
elif [ "$$test_target" = "tests/test-tokenizer-1-llama" ]; then \
|
||||||
continue; \
|
continue; \
|
||||||
|
@ -731,7 +731,7 @@ tests/test-tokenizer-0-falcon: tests/test-tokenizer-0-falcon.cpp ggml.o llama.o
|
||||||
tests/test-tokenizer-0-llama: tests/test-tokenizer-0-llama.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
|
tests/test-tokenizer-0-llama: tests/test-tokenizer-0-llama.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
|
||||||
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
|
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
|
||||||
|
|
||||||
tests/test-tokenizer-0-deepseek_coder: tests/test-tokenizer-0-deepseek_coder.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
|
tests/test-tokenizer-0-deepseek-coder: tests/test-tokenizer-0-deepseek-coder.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
|
||||||
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
|
$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
|
||||||
|
|
||||||
tests/test-tokenizer-1-bpe: tests/test-tokenizer-1-bpe.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
|
tests/test-tokenizer-1-bpe: tests/test-tokenizer-1-bpe.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
|
||||||
|
|
|
@ -6108,11 +6108,11 @@ private:
|
||||||
std::vector<size_t> bpe_words; // stroe the offset of each word
|
std::vector<size_t> bpe_words; // stroe the offset of each word
|
||||||
bpe_words.reserve(offsets.size()); // Reserve memory for the approximate size
|
bpe_words.reserve(offsets.size()); // Reserve memory for the approximate size
|
||||||
size_t start = 0;
|
size_t start = 0;
|
||||||
for ( auto & offset : offsets) {
|
for (auto offset : offsets) {
|
||||||
std::wcregex_iterator it(text.data() + start, text.data() + start + offset, expr);
|
std::wcregex_iterator it(text.data() + start, text.data() + start + offset, expr);
|
||||||
std::wcregex_iterator end;
|
std::wcregex_iterator end;
|
||||||
|
|
||||||
size_t start_idx = 0;
|
int64_t start_idx = 0;
|
||||||
while (it != end) {
|
while (it != end) {
|
||||||
std::wcmatch match = *it;
|
std::wcmatch match = *it;
|
||||||
if (match.position() > start_idx) {
|
if (match.position() > start_idx) {
|
||||||
|
@ -6122,7 +6122,7 @@ private:
|
||||||
start_idx = match.position() + match.length();
|
start_idx = match.position() + match.length();
|
||||||
++it;
|
++it;
|
||||||
}
|
}
|
||||||
if (start_idx < offset) {
|
if (start_idx < (int64_t) offset) {
|
||||||
bpe_words.emplace_back(offset - start_idx);
|
bpe_words.emplace_back(offset - start_idx);
|
||||||
}
|
}
|
||||||
start += offset;
|
start += offset;
|
||||||
|
|
|
@ -22,22 +22,30 @@ endfunction()
|
||||||
llama_build_and_test_executable(test-quantize-fns.cpp)
|
llama_build_and_test_executable(test-quantize-fns.cpp)
|
||||||
llama_build_and_test_executable(test-quantize-perf.cpp)
|
llama_build_and_test_executable(test-quantize-perf.cpp)
|
||||||
llama_build_and_test_executable(test-sampling.cpp)
|
llama_build_and_test_executable(test-sampling.cpp)
|
||||||
|
|
||||||
llama_build_executable(test-tokenizer-0-llama.cpp)
|
llama_build_executable(test-tokenizer-0-llama.cpp)
|
||||||
llama_test_executable (test-tokenizer-0-llama test-tokenizer-0-llama.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-llama.gguf)
|
llama_test_executable (test-tokenizer-0-llama test-tokenizer-0-llama.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-llama.gguf)
|
||||||
|
|
||||||
llama_build_executable(test-tokenizer-0-falcon.cpp)
|
llama_build_executable(test-tokenizer-0-falcon.cpp)
|
||||||
llama_test_executable (test-tokenizer-0-falcon test-tokenizer-0-falcon.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-falcon.gguf)
|
llama_test_executable (test-tokenizer-0-falcon test-tokenizer-0-falcon.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-falcon.gguf)
|
||||||
|
|
||||||
|
llama_build_executable(test-tokenizer-0-deepseek-coder.cpp)
|
||||||
|
llama_test_executable (test-tokenizer-0-deepseek-coder test-tokenizer-0-deepseek-coder.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-deepseek-coder.gguf)
|
||||||
|
|
||||||
llama_build_executable(test-tokenizer-1-llama.cpp)
|
llama_build_executable(test-tokenizer-1-llama.cpp)
|
||||||
llama_test_executable (test-tokenizer-1-llama test-tokenizer-1-llama.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-llama.gguf)
|
llama_test_executable (test-tokenizer-1-llama test-tokenizer-1-llama.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-llama.gguf)
|
||||||
llama_test_executable(test-tokenizer-1-baichuan test-tokenizer-1-llama.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-baichuan.gguf)
|
llama_test_executable (test-tokenizer-1-baichuan test-tokenizer-1-llama.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-baichuan.gguf)
|
||||||
|
|
||||||
llama_build_executable(test-tokenizer-1-bpe.cpp)
|
llama_build_executable(test-tokenizer-1-bpe.cpp)
|
||||||
llama_test_executable (test-tokenizer-1-falcon test-tokenizer-1-bpe.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-falcon.gguf)
|
llama_test_executable (test-tokenizer-1-falcon test-tokenizer-1-bpe.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-falcon.gguf)
|
||||||
llama_test_executable(test-tokenizer-1-aquila test-tokenizer-1-bpe.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-aquila.gguf)
|
llama_test_executable (test-tokenizer-1-aquila test-tokenizer-1-bpe.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-aquila.gguf)
|
||||||
llama_test_executable(test-tokenizer-1-mpt test-tokenizer-1-bpe.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-mpt.gguf)
|
llama_test_executable (test-tokenizer-1-mpt test-tokenizer-1-bpe.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-mpt.gguf)
|
||||||
llama_test_executable(test-tokenizer-1-stablelm-3b-4e1t test-tokenizer-1-bpe.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-stablelm-3b-4e1t.gguf)
|
llama_test_executable (test-tokenizer-1-stablelm-3b-4e1t test-tokenizer-1-bpe.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-stablelm-3b-4e1t.gguf)
|
||||||
llama_test_executable(test-tokenizer-1-gpt-neox test-tokenizer-1-bpe.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-gpt-neox.gguf)
|
llama_test_executable (test-tokenizer-1-gpt-neox test-tokenizer-1-bpe.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-gpt-neox.gguf)
|
||||||
llama_test_executable(test-tokenizer-1-refact test-tokenizer-1-bpe.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-refact.gguf)
|
llama_test_executable (test-tokenizer-1-refact test-tokenizer-1-bpe.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-refact.gguf)
|
||||||
llama_test_executable(test-tokenizer-1-starcoder test-tokenizer-1-bpe.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-starcoder.gguf)
|
llama_test_executable (test-tokenizer-1-starcoder test-tokenizer-1-bpe.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-starcoder.gguf)
|
||||||
# llama_test_executable(test-tokenizer-1-bloom test-tokenizer-1-bpe.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-bloom.gguf) # BIG
|
#llama_test_executable (test-tokenizer-1-bloom test-tokenizer-1-bpe.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-bloom.gguf) # BIG
|
||||||
|
|
||||||
llama_build_and_test_executable(test-grammar-parser.cpp)
|
llama_build_and_test_executable(test-grammar-parser.cpp)
|
||||||
llama_build_and_test_executable(test-llama-grammar.cpp)
|
llama_build_and_test_executable(test-llama-grammar.cpp)
|
||||||
llama_build_and_test_executable(test-grad0.cpp) # SLOW
|
llama_build_and_test_executable(test-grad0.cpp) # SLOW
|
||||||
|
|
|
@ -1,4 +1,4 @@
|
||||||
#pragma once
|
#pragma once
|
||||||
|
|
||||||
#include <cassert>
|
#include <cassert>
|
||||||
#include <stdexcept>
|
#include <stdexcept>
|
||||||
|
@ -499,3 +499,4 @@ inline std::string to_utf8(const std::wstring& ws)
|
||||||
std::string utf8 = converter.to_bytes(ws);
|
std::string utf8 = converter.to_bytes(ws);
|
||||||
return utf8;
|
return utf8;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue