From fecb61b1931a146b555d4269c3130d1e83b5c826 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Wed, 29 Nov 2023 10:01:36 +0200
Subject: [PATCH] clean-up : warnings, names

---
 .gitignore                                    |  2 +-
 Makefile                                      |  6 +++---
 llama.cpp                                     |  6 +++---
 tests/CMakeLists.txt                          | 28 ++++++++++++++++++----------
 ...pp => test-tokenizer-0-deepseek-coder.cpp} |  2 +-
 ....py => test-tokenizer-0-deepseek-coder.py} |  0
 tests/test-tokenizer-0-falcon.cpp             |  2 +-
 unicode.h                                     |  5 +++--
 8 files changed, 30 insertions(+), 21 deletions(-)
 rename tests/{test-tokenizer-0-deepseek_coder.cpp => test-tokenizer-0-deepseek-coder.cpp} (99%)
 rename tests/{test-tokenizer-0-deepseek_coder.py => test-tokenizer-0-deepseek-coder.py} (100%)

diff --git a/.gitignore b/.gitignore
index fbe8a8ade..a72dce0c4 100644
--- a/.gitignore
+++ b/.gitignore
@@ -97,6 +97,6 @@ tests/test-quantize-perf
 tests/test-sampling
 tests/test-tokenizer-0-llama
 tests/test-tokenizer-0-falcon
-tests/test-tokenizer-0-deepseek_coder
+tests/test-tokenizer-0-deepseek-coder
 tests/test-tokenizer-1-llama
 tests/test-tokenizer-1-bpe
diff --git a/Makefile b/Makefile
index 8fc824a1a..0ca968b30 100644
--- a/Makefile
+++ b/Makefile
@@ -8,7 +8,7 @@ BUILD_TARGETS = \
 TEST_TARGETS = \
 	tests/test-llama-grammar tests/test-grammar-parser tests/test-double-float tests/test-grad0 tests/test-opt \
 	tests/test-quantize-fns tests/test-quantize-perf tests/test-sampling tests/test-tokenizer-0-llama \
-	tests/test-tokenizer-0-falcon tests/test-tokenizer-0-deepseek_coder \
+	tests/test-tokenizer-0-falcon tests/test-tokenizer-0-deepseek-coder \
 	tests/test-tokenizer-1-llama tests/test-tokenizer-1-bpe
 
 # Code coverage output files
@@ -70,7 +70,7 @@ test: $(TEST_TARGETS)
 		./$$test_target $(CURDIR)/models/ggml-vocab-llama.gguf; \
 	elif [ "$$test_target" = "tests/test-tokenizer-0-falcon" ]; then \
 		./$$test_target $(CURDIR)/models/ggml-vocab-falcon.gguf; \
-	elif [ "$$test_target" = "tests/test-tokenizer-0-deepseek_coder" ]; then \
+	elif [ "$$test_target" = "tests/test-tokenizer-0-deepseek-coder" ]; then \
 		./$$test_target $(CURDIR)/models/ggml-vocab-deepseek-coder.gguf; \
 	elif [ "$$test_target" = "tests/test-tokenizer-1-llama" ]; then \
 		continue; \
@@ -731,7 +731,7 @@ tests/test-tokenizer-0-falcon: tests/test-tokenizer-0-falcon.cpp ggml.o llama.o
 tests/test-tokenizer-0-llama: tests/test-tokenizer-0-llama.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
 
-tests/test-tokenizer-0-deepseek_coder: tests/test-tokenizer-0-deepseek_coder.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
+tests/test-tokenizer-0-deepseek-coder: tests/test-tokenizer-0-deepseek-coder.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
 
 tests/test-tokenizer-1-bpe: tests/test-tokenizer-1-bpe.cpp ggml.o llama.o $(COMMON_DEPS) $(OBJS)
diff --git a/llama.cpp b/llama.cpp
index 208567776..71fb44a05 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -6108,11 +6108,11 @@ private:
         std::vector<size_t> bpe_words; // store the offset of each word
         bpe_words.reserve(offsets.size()); // Reserve memory for the approximate size
        size_t start = 0;
-        for ( auto & offset : offsets) {
+        for (auto offset : offsets) {
            std::wcregex_iterator it(text.data() + start, text.data() + start + offset, expr);
            std::wcregex_iterator end;
 
-            size_t start_idx = 0;
+            int64_t start_idx = 0;
            while (it != end) {
                std::wcmatch match = *it;
                if (match.position() > start_idx) {
@@ -6122,7 +6122,7 @@
                start_idx = match.position() + match.length();
                ++it;
            }
-            if (start_idx < offset) {
+            if (start_idx < (int64_t) offset) {
                bpe_words.emplace_back(offset - start_idx);
            }
            start += offset;
diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt
index c8b4bc254..1e247fbfc 100644
--- a/tests/CMakeLists.txt
+++ b/tests/CMakeLists.txt
@@ -22,22 +22,30 @@ endfunction()
 llama_build_and_test_executable(test-quantize-fns.cpp)
 llama_build_and_test_executable(test-quantize-perf.cpp)
 llama_build_and_test_executable(test-sampling.cpp)
+
 llama_build_executable(test-tokenizer-0-llama.cpp)
 llama_test_executable (test-tokenizer-0-llama test-tokenizer-0-llama.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-llama.gguf)
+
 llama_build_executable(test-tokenizer-0-falcon.cpp)
 llama_test_executable (test-tokenizer-0-falcon test-tokenizer-0-falcon.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-falcon.gguf)
+
+llama_build_executable(test-tokenizer-0-deepseek-coder.cpp)
+llama_test_executable (test-tokenizer-0-deepseek-coder test-tokenizer-0-deepseek-coder.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-deepseek-coder.gguf)
+
 llama_build_executable(test-tokenizer-1-llama.cpp)
-llama_test_executable (test-tokenizer-1-llama test-tokenizer-1-llama.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-llama.gguf)
-llama_test_executable(test-tokenizer-1-baichuan test-tokenizer-1-llama.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-baichuan.gguf)
+llama_test_executable (test-tokenizer-1-llama    test-tokenizer-1-llama.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-llama.gguf)
+llama_test_executable (test-tokenizer-1-baichuan test-tokenizer-1-llama.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-baichuan.gguf)
+
 llama_build_executable(test-tokenizer-1-bpe.cpp)
-llama_test_executable (test-tokenizer-1-falcon test-tokenizer-1-bpe.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-falcon.gguf)
-llama_test_executable(test-tokenizer-1-aquila test-tokenizer-1-bpe.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-aquila.gguf)
-llama_test_executable(test-tokenizer-1-mpt test-tokenizer-1-bpe.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-mpt.gguf)
-llama_test_executable(test-tokenizer-1-stablelm-3b-4e1t test-tokenizer-1-bpe.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-stablelm-3b-4e1t.gguf)
-llama_test_executable(test-tokenizer-1-gpt-neox test-tokenizer-1-bpe.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-gpt-neox.gguf)
-llama_test_executable(test-tokenizer-1-refact test-tokenizer-1-bpe.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-refact.gguf)
-llama_test_executable(test-tokenizer-1-starcoder test-tokenizer-1-bpe.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-starcoder.gguf)
-# llama_test_executable(test-tokenizer-1-bloom test-tokenizer-1-bpe.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-bloom.gguf) # BIG
+llama_test_executable (test-tokenizer-1-falcon           test-tokenizer-1-bpe.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-falcon.gguf)
+llama_test_executable (test-tokenizer-1-aquila           test-tokenizer-1-bpe.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-aquila.gguf)
+llama_test_executable (test-tokenizer-1-mpt              test-tokenizer-1-bpe.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-mpt.gguf)
+llama_test_executable (test-tokenizer-1-stablelm-3b-4e1t test-tokenizer-1-bpe.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-stablelm-3b-4e1t.gguf)
+llama_test_executable (test-tokenizer-1-gpt-neox         test-tokenizer-1-bpe.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-gpt-neox.gguf)
+llama_test_executable (test-tokenizer-1-refact           test-tokenizer-1-bpe.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-refact.gguf)
+llama_test_executable (test-tokenizer-1-starcoder        test-tokenizer-1-bpe.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-starcoder.gguf)
+#llama_test_executable (test-tokenizer-1-bloom           test-tokenizer-1-bpe.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-bloom.gguf) # BIG
+
 llama_build_and_test_executable(test-grammar-parser.cpp)
 llama_build_and_test_executable(test-llama-grammar.cpp)
 llama_build_and_test_executable(test-grad0.cpp) # SLOW
diff --git a/tests/test-tokenizer-0-deepseek_coder.cpp b/tests/test-tokenizer-0-deepseek-coder.cpp
similarity index 99%
rename from tests/test-tokenizer-0-deepseek_coder.cpp
rename to tests/test-tokenizer-0-deepseek-coder.cpp
index fb30c409d..16966e072 100644
--- a/tests/test-tokenizer-0-deepseek_coder.cpp
+++ b/tests/test-tokenizer-0-deepseek-coder.cpp
@@ -142,7 +142,7 @@ int main(int argc, char **argv) {
             success = false;
         }
     }
-    
+
     if (!fname_text.empty()) {
         fprintf(stderr, "%s : tokenizing: '%s'\n", __func__, fname_text.c_str());
 
diff --git a/tests/test-tokenizer-0-deepseek_coder.py b/tests/test-tokenizer-0-deepseek-coder.py
similarity index 100%
rename from tests/test-tokenizer-0-deepseek_coder.py
rename to tests/test-tokenizer-0-deepseek-coder.py
diff --git a/tests/test-tokenizer-0-falcon.cpp b/tests/test-tokenizer-0-falcon.cpp
index 467b9f45e..677608dec 100644
--- a/tests/test-tokenizer-0-falcon.cpp
+++ b/tests/test-tokenizer-0-falcon.cpp
@@ -141,7 +141,7 @@ int main(int argc, char **argv) {
             success = false;
         }
     }
-    
+
     if (!fname_text.empty()) {
         fprintf(stderr, "%s : tokenizing: '%s'\n", __func__, fname_text.c_str());
 
diff --git a/unicode.h b/unicode.h
index 0946d3c67..dc5188970 100644
--- a/unicode.h
+++ b/unicode.h
@@ -1,4 +1,4 @@
-#pragma once 
+#pragma once
 
 #include
 #include
@@ -498,4 +498,5 @@ inline std::string to_utf8(const std::wstring& ws)
     std::wstring_convert<std::codecvt_utf8<wchar_t>, wchar_t> converter;
     std::string utf8 = converter.to_bytes(ws);
     return utf8;
-}
\ No newline at end of file
+}
+
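
Editor's note: below is a minimal, self-contained sketch (not the actual llama.cpp code) of the pattern the llama.cpp hunks adjust. std::wcmatch::position() and length() return a signed difference_type, so the patch tracks the word-splitting cursor as int64_t and casts the unsigned chunk size at the final comparison, which silences -Wsign-compare warnings. The helper name split_words and all variable names here are illustrative assumptions, not names from the patch.

// sketch of the signed-cursor regex word-splitting pattern, assuming
// illustrative names; mirrors the loop shape in the llama.cpp hunks
#include <cstdint>
#include <cstdio>
#include <regex>
#include <string>
#include <vector>

// Split one chunk of wide text into word sizes: every regex match and every
// unmatched gap between matches becomes one "word", as in the patched loop.
static std::vector<size_t> split_words(const std::wstring & text, const std::wregex & expr) {
    std::vector<size_t> word_sizes;

    std::wcregex_iterator it(text.data(), text.data() + text.size(), expr);
    std::wcregex_iterator end;

    int64_t start_idx = 0; // signed, matching the type of match.position()
    while (it != end) {
        std::wcmatch match = *it;
        if (match.position() > start_idx) {
            // unmatched text before this match counts as one word
            word_sizes.emplace_back(match.position() - start_idx);
        }
        word_sizes.emplace_back(match.length());
        start_idx = match.position() + match.length();
        ++it;
    }

    // cast the unsigned size to the signed cursor type, as the patch does
    if (start_idx < (int64_t) text.size()) {
        word_sizes.emplace_back(text.size() - start_idx);
    }

    return word_sizes;
}

int main() {
    const std::wstring text = L"hello   world";
    const std::wregex  ws(LR"(\s+)"); // word separator: runs of whitespace

    for (size_t n : split_words(text, ws)) {
        printf("%zu\n", n); // prints 5, 3, 5
    }
    return 0;
}

The companion change from "for ( auto & offset : offsets)" to "for (auto offset : offsets)" copies each size_t by value, which is as cheap as a reference and makes clear the loop does not mutate the offsets.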