From fecb61b1931a146b555d4269c3130d1e83b5c826 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Wed, 29 Nov 2023 10:01:36 +0200
Subject: [PATCH] clean-up : warnings, names

---
 .gitignore                                    |  2 +-
 Makefile                                      |  6 +++---
 llama.cpp                                     |  6 +++---
 tests/CMakeLists.txt                          | 28 ++++++++++++++++++----------
 ...pp => test-tokenizer-0-deepseek-coder.cpp} |  2 +-
 ....py => test-tokenizer-0-deepseek-coder.py} |  0
 tests/test-tokenizer-0-falcon.cpp             |  2 +-
 unicode.h                                     |  5 +++--
 8 files changed, 30 insertions(+), 21 deletions(-)
 rename tests/{test-tokenizer-0-deepseek_coder.cpp => test-tokenizer-0-deepseek-coder.cpp} (99%)
 rename tests/{test-tokenizer-0-deepseek_coder.py => test-tokenizer-0-deepseek-coder.py} (100%)

diff --git a/.gitignore b/.gitignore
index fbe8a8ade..a72dce0c4 100644
--- a/.gitignore
+++ b/.gitignore
@@ -97,6 +97,6 @@ tests/test-quantize-perf
 tests/test-sampling
 tests/test-tokenizer-0-llama
 tests/test-tokenizer-0-falcon
-tests/test-tokenizer-0-deepseek_coder
+tests/test-tokenizer-0-deepseek-coder
 tests/test-tokenizer-1-llama
 tests/test-tokenizer-1-bpe
diff --git a/Makefile b/Makefile
index 8fc824a1a..0ca968b30 100644
--- a/Makefile
+++ b/Makefile
@@ -8,7 +8,7 @@ BUILD_TARGETS = \
 TEST_TARGETS = \
 	tests/test-llama-grammar tests/test-grammar-parser tests/test-double-float tests/test-grad0 tests/test-opt \
 	tests/test-quantize-fns tests/test-quantize-perf tests/test-sampling tests/test-tokenizer-0-llama \
-	tests/test-tokenizer-0-falcon tests/test-tokenizer-0-deepseek_coder \
+	tests/test-tokenizer-0-falcon tests/test-tokenizer-0-deepseek-coder \
 	tests/test-tokenizer-1-llama tests/test-tokenizer-1-bpe
 
 # Code coverage output files
@@ -70,7 +70,7 @@ test: $(TEST_TARGETS)
 		./$$test_target $(CURDIR)/models/ggml-vocab-llama.gguf; \
 	elif [ "$$test_target" = "tests/test-tokenizer-0-falcon" ]; then \
 		./$$test_target $(CURDIR)/models/ggml-vocab-falcon.gguf; \
-	elif [ "$$test_target" = "tests/test-tokenizer-0-deepseek_coder" ]; then \
+	elif [ "$$test_target" = "tests/test-tokenizer-0-deepseek-coder" ]; then \
 		./$$test_target $(CURDIR)/models/ggml-vocab-deepseek-coder.gguf; \
 	elif [ "$$test_target" = "tests/test-tokenizer-1-llama" ]; then \
 		continue; \
@@ -731,7 +731,7 @@ tests/test-tokenizer-0-falcon: tests/test-tokenizer-0-falcon.cpp ggml.o llama.o
 tests/test-tokenizer-0-llama: tests/test-tokenizer-0-llama.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
 
-tests/test-tokenizer-0-deepseek_coder: tests/test-tokenizer-0-deepseek_coder.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
+tests/test-tokenizer-0-deepseek-coder: tests/test-tokenizer-0-deepseek-coder.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
 
 tests/test-tokenizer-1-bpe: tests/test-tokenizer-1-bpe.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
diff --git a/llama.cpp b/llama.cpp
index 208567776..71fb44a05 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -6108,11 +6108,11 @@ private:
         std::vector<size_t> bpe_words; // store the offset of each word
         bpe_words.reserve(offsets.size()); // Reserve memory for the approximate size
        size_t start = 0;
-        for ( auto & offset : offsets) {
+        for (auto offset : offsets) {
            std::wcregex_iterator it(text.data() + start, text.data() + start + offset, expr);
            std::wcregex_iterator end;
 
-            size_t start_idx = 0;
+            int64_t start_idx = 0;
            while (it != end) {
                std::wcmatch match = *it;
                if (match.position() > start_idx) {
@@ -6122,7 +6122,7 @@
                start_idx = match.position() + match.length();
                ++it;
            }
-            if (start_idx < offset) {
+            if (start_idx < (int64_t) offset) {
                bpe_words.emplace_back(offset - start_idx);
            }
            start += offset;
diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt
index c8b4bc254..1e247fbfc 100644
--- a/tests/CMakeLists.txt
+++ b/tests/CMakeLists.txt
@@ -22,22 +22,30 @@ endfunction()
 llama_build_and_test_executable(test-quantize-fns.cpp)
 llama_build_and_test_executable(test-quantize-perf.cpp)
 llama_build_and_test_executable(test-sampling.cpp)
+
 llama_build_executable(test-tokenizer-0-llama.cpp)
 llama_test_executable (test-tokenizer-0-llama test-tokenizer-0-llama.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-llama.gguf)
+
 llama_build_executable(test-tokenizer-0-falcon.cpp)
 llama_test_executable (test-tokenizer-0-falcon test-tokenizer-0-falcon.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-falcon.gguf)
+
+llama_build_executable(test-tokenizer-0-deepseek-coder.cpp)
+llama_test_executable (test-tokenizer-0-deepseek-coder test-tokenizer-0-deepseek-coder.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-deepseek-coder.gguf)
+
 llama_build_executable(test-tokenizer-1-llama.cpp)
-llama_test_executable (test-tokenizer-1-llama test-tokenizer-1-llama.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-llama.gguf)
-llama_test_executable(test-tokenizer-1-baichuan test-tokenizer-1-llama.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-baichuan.gguf)
+llama_test_executable (test-tokenizer-1-llama    test-tokenizer-1-llama.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-llama.gguf)
+llama_test_executable (test-tokenizer-1-baichuan test-tokenizer-1-llama.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-baichuan.gguf)
+
 llama_build_executable(test-tokenizer-1-bpe.cpp)
-llama_test_executable (test-tokenizer-1-falcon test-tokenizer-1-bpe.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-falcon.gguf)
-llama_test_executable(test-tokenizer-1-aquila test-tokenizer-1-bpe.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-aquila.gguf)
-llama_test_executable(test-tokenizer-1-mpt test-tokenizer-1-bpe.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-mpt.gguf)
-llama_test_executable(test-tokenizer-1-stablelm-3b-4e1t test-tokenizer-1-bpe.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-stablelm-3b-4e1t.gguf)
-llama_test_executable(test-tokenizer-1-gpt-neox test-tokenizer-1-bpe.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-gpt-neox.gguf)
-llama_test_executable(test-tokenizer-1-refact test-tokenizer-1-bpe.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-refact.gguf)
-llama_test_executable(test-tokenizer-1-starcoder test-tokenizer-1-bpe.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-starcoder.gguf)
-# llama_test_executable(test-tokenizer-1-bloom test-tokenizer-1-bpe.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-bloom.gguf) # BIG
+llama_test_executable (test-tokenizer-1-falcon           test-tokenizer-1-bpe.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-falcon.gguf)
+llama_test_executable (test-tokenizer-1-aquila           test-tokenizer-1-bpe.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-aquila.gguf)
+llama_test_executable (test-tokenizer-1-mpt              test-tokenizer-1-bpe.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-mpt.gguf)
+llama_test_executable (test-tokenizer-1-stablelm-3b-4e1t test-tokenizer-1-bpe.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-stablelm-3b-4e1t.gguf)
+llama_test_executable (test-tokenizer-1-gpt-neox         test-tokenizer-1-bpe.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-gpt-neox.gguf)
+llama_test_executable (test-tokenizer-1-refact           test-tokenizer-1-bpe.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-refact.gguf)
+llama_test_executable (test-tokenizer-1-starcoder        test-tokenizer-1-bpe.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-starcoder.gguf)
+#llama_test_executable (test-tokenizer-1-bloom           test-tokenizer-1-bpe.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-bloom.gguf) # BIG
+
 llama_build_and_test_executable(test-grammar-parser.cpp)
 llama_build_and_test_executable(test-llama-grammar.cpp)
 llama_build_and_test_executable(test-grad0.cpp) # SLOW
diff --git a/tests/test-tokenizer-0-deepseek_coder.cpp b/tests/test-tokenizer-0-deepseek-coder.cpp
similarity index 99%
rename from tests/test-tokenizer-0-deepseek_coder.cpp
rename to tests/test-tokenizer-0-deepseek-coder.cpp
index fb30c409d..16966e072 100644
--- a/tests/test-tokenizer-0-deepseek_coder.cpp
+++ b/tests/test-tokenizer-0-deepseek-coder.cpp
@@ -142,7 +142,7 @@ int main(int argc, char **argv) {
             success = false;
         }
     }
-    
+
     if (!fname_text.empty()) {
         fprintf(stderr, "%s : tokenizing: '%s'\n", __func__, fname_text.c_str());
 
diff --git a/tests/test-tokenizer-0-deepseek_coder.py b/tests/test-tokenizer-0-deepseek-coder.py
similarity index 100%
rename from tests/test-tokenizer-0-deepseek_coder.py
rename to tests/test-tokenizer-0-deepseek-coder.py
diff --git a/tests/test-tokenizer-0-falcon.cpp b/tests/test-tokenizer-0-falcon.cpp
index 467b9f45e..677608dec 100644
--- a/tests/test-tokenizer-0-falcon.cpp
+++ b/tests/test-tokenizer-0-falcon.cpp
@@ -141,7 +141,7 @@ int main(int argc, char **argv) {
             success = false;
         }
     }
-    
+
     if (!fname_text.empty()) {
         fprintf(stderr, "%s : tokenizing: '%s'\n", __func__, fname_text.c_str());
 
diff --git a/unicode.h b/unicode.h
index 0946d3c67..dc5188970 100644
--- a/unicode.h
+++ b/unicode.h
@@ -1,4 +1,4 @@
-#pragma once 
+#pragma once
 
 #include
 #include
@@ -498,4 +498,5 @@ inline std::string to_utf8(const std::wstring& ws)
     std::wstring_convert<std::codecvt_utf8<wchar_t>, wchar_t> converter;
     std::string utf8 = converter.to_bytes(ws);
     return utf8;
-}
\ No newline at end of file
+}
+
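
Editor's note: below is a minimal, self-contained sketch (not the actual llama.cpp code) of the pattern the llama.cpp hunks adjust. std::wcmatch::position() and length() return a signed difference_type, so the patch tracks the word-splitting cursor as int64_t and casts the unsigned chunk size at the final comparison, which silences -Wsign-compare warnings. The helper name split_words and all variable names here are illustrative assumptions, not names from the patch.

// sketch of the signed-cursor regex word-splitting pattern, assuming
// illustrative names; mirrors the loop shape in the llama.cpp hunks
#include <cstdint>
#include <cstdio>
#include <regex>
#include <string>
#include <vector>

// Split one chunk of wide text into word sizes: every regex match and every
// unmatched gap between matches becomes one "word", as in the patched loop.
static std::vector<size_t> split_words(const std::wstring & text, const std::wregex & expr) {
    std::vector<size_t> word_sizes;

    std::wcregex_iterator it(text.data(), text.data() + text.size(), expr);
    std::wcregex_iterator end;

    int64_t start_idx = 0; // signed, matching the type of match.position()
    while (it != end) {
        std::wcmatch match = *it;
        if (match.position() > start_idx) {
            // unmatched text before this match counts as one word
            word_sizes.emplace_back(match.position() - start_idx);
        }
        word_sizes.emplace_back(match.length());
        start_idx = match.position() + match.length();
        ++it;
    }

    // cast the unsigned size to the signed cursor type, as the patch does
    if (start_idx < (int64_t) text.size()) {
        word_sizes.emplace_back(text.size() - start_idx);
    }

    return word_sizes;
}

int main() {
    const std::wstring text = L"hello   world";
    const std::wregex  ws(LR"(\s+)"); // word separator: runs of whitespace

    for (size_t n : split_words(text, ws)) {
        printf("%zu\n", n); // prints 5, 3, 5
    }
    return 0;
}

The companion change from "for ( auto & offset : offsets)" to "for (auto offset : offsets)" copies each size_t by value, which is as cheap as a reference and makes clear the loop does not mutate the offsets.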