clean-up : warnings, names

2023-11-29 10:01:36 +02:00 · 2023-11-29 10:01:36 +02:00 · fecb61b193
commit fecb61b193
parent 87fe183d4d
8 changed files with 30 additions and 21 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -97,6 +97,6 @@ tests/test-quantize-perf
 tests/test-sampling
 tests/test-tokenizer-0-llama
 tests/test-tokenizer-0-falcon
-tests/test-tokenizer-0-deepseek_coder
+tests/test-tokenizer-0-deepseek-coder
 tests/test-tokenizer-1-llama
 tests/test-tokenizer-1-bpe
--- a/Makefile
+++ b/Makefile
@@ -8,7 +8,7 @@ BUILD_TARGETS = \
 TEST_TARGETS = \
 	tests/test-llama-grammar tests/test-grammar-parser tests/test-double-float tests/test-grad0 tests/test-opt \
 	tests/test-quantize-fns tests/test-quantize-perf tests/test-sampling tests/test-tokenizer-0-llama          \
-	tests/test-tokenizer-0-falcon tests/test-tokenizer-0-deepseek_coder 									   \
+	tests/test-tokenizer-0-falcon tests/test-tokenizer-0-deepseek-coder 									   \
 	tests/test-tokenizer-1-llama tests/test-tokenizer-1-bpe

 # Code coverage output files
@@ -70,7 +70,7 @@ test: $(TEST_TARGETS)
 			./$$test_target $(CURDIR)/models/ggml-vocab-llama.gguf; \
 		elif [ "$$test_target" = "tests/test-tokenizer-0-falcon" ]; then \
 			./$$test_target $(CURDIR)/models/ggml-vocab-falcon.gguf; \
-		elif [ "$$test_target" = "tests/test-tokenizer-0-deepseek_coder" ]; then \
+		elif [ "$$test_target" = "tests/test-tokenizer-0-deepseek-coder" ]; then \
 			./$$test_target $(CURDIR)/models/ggml-vocab-deepseek-coder.gguf; \
 		elif [ "$$test_target" = "tests/test-tokenizer-1-llama" ]; then \
 			continue; \
@@ -731,7 +731,7 @@ tests/test-tokenizer-0-falcon: tests/test-tokenizer-0-falcon.cpp ggml.o llama.o
 tests/test-tokenizer-0-llama: tests/test-tokenizer-0-llama.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)

-tests/test-tokenizer-0-deepseek_coder: tests/test-tokenizer-0-deepseek_coder.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
+tests/test-tokenizer-0-deepseek-coder: tests/test-tokenizer-0-deepseek-coder.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)

 tests/test-tokenizer-1-bpe: tests/test-tokenizer-1-bpe.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
--- a/llama.cpp
+++ b/llama.cpp
@@ -6108,11 +6108,11 @@ private:
        std::vector<size_t> bpe_words; // stroe the offset of each word
        bpe_words.reserve(offsets.size()); // Reserve memory for the approximate size
        size_t start = 0;
-        for ( auto & offset : offsets) {
+        for (auto offset : offsets) {
            std::wcregex_iterator it(text.data() + start, text.data() + start + offset, expr);
            std::wcregex_iterator end;

-            size_t start_idx = 0;
+            int64_t start_idx = 0;
            while (it != end) {
                std::wcmatch match = *it;
                if (match.position() > start_idx) {
@@ -6122,7 +6122,7 @@ private:
                start_idx = match.position() + match.length();
                ++it;
            }
-            if (start_idx < offset) {
+            if (start_idx < (int64_t) offset) {
                bpe_words.emplace_back(offset - start_idx);
            }
            start += offset;
--- a/tests/CMakeLists.txt
+++ b/tests/CMakeLists.txt
@@ -22,13 +22,20 @@ endfunction()
 llama_build_and_test_executable(test-quantize-fns.cpp)
 llama_build_and_test_executable(test-quantize-perf.cpp)
 llama_build_and_test_executable(test-sampling.cpp)
+
 llama_build_executable(test-tokenizer-0-llama.cpp)
 llama_test_executable (test-tokenizer-0-llama test-tokenizer-0-llama.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-llama.gguf)
+
 llama_build_executable(test-tokenizer-0-falcon.cpp)
 llama_test_executable (test-tokenizer-0-falcon test-tokenizer-0-falcon.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-falcon.gguf)
+
+llama_build_executable(test-tokenizer-0-deepseek-coder.cpp)
+llama_test_executable (test-tokenizer-0-deepseek-coder test-tokenizer-0-deepseek-coder.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-deepseek-coder.gguf)
+
 llama_build_executable(test-tokenizer-1-llama.cpp)
 llama_test_executable (test-tokenizer-1-llama    test-tokenizer-1-llama.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-llama.gguf)
 llama_test_executable (test-tokenizer-1-baichuan test-tokenizer-1-llama.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-baichuan.gguf)
+
 llama_build_executable(test-tokenizer-1-bpe.cpp)
 llama_test_executable (test-tokenizer-1-falcon           test-tokenizer-1-bpe.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-falcon.gguf)
 llama_test_executable (test-tokenizer-1-aquila           test-tokenizer-1-bpe.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-aquila.gguf)
@@ -38,6 +45,7 @@ llama_test_executable(test-tokenizer-1-gpt-neox test-tokenizer-1-bpe.cpp ${CMAKE
 llama_test_executable (test-tokenizer-1-refact           test-tokenizer-1-bpe.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-refact.gguf)
 llama_test_executable (test-tokenizer-1-starcoder        test-tokenizer-1-bpe.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-starcoder.gguf)
 #llama_test_executable (test-tokenizer-1-bloom            test-tokenizer-1-bpe.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-bloom.gguf) # BIG
+
 llama_build_and_test_executable(test-grammar-parser.cpp)
 llama_build_and_test_executable(test-llama-grammar.cpp)
 llama_build_and_test_executable(test-grad0.cpp) # SLOW
--- a/tests/test-tokenizer-0-deepseek-coder.cpp
+++ b/tests/test-tokenizer-0-deepseek-coder.cpp
--- a/tests/test-tokenizer-0-deepseek-coder.py
+++ b/tests/test-tokenizer-0-deepseek-coder.py
--- a/unicode.h
+++ b/unicode.h
@@ -1,4 +1,4 @@
-#pragma once
+#pragma once

 #include <cassert>
 #include <stdexcept>
@@ -499,3 +499,4 @@ inline std::string to_utf8(const std::wstring& ws)
    std::string utf8 = converter.to_bytes(ws);
    return utf8;
 }
+