cmake : refactor test targets

Georgi Gerganov 2024-04-29 09:53:14 +03:00
parent 7b1210f6a8
commit ef4cca9e87
3 changed files with 153 additions and 32 deletions


@@ -68,6 +68,7 @@ def download_file_with_auth(url, token, save_path):
    else:
        print(f"Failed to download file. Status code: {response.status_code}")

# download the tokenizer models
for model in models:
    name = model["name"]
    repo = model["repo"]
@@ -173,3 +174,84 @@ print("\n")
print("!!! Copy-paste the function above into convert-hf-to-gguf.py !!!")
print("\n")
# generate tests for each tokenizer model
tests = [
"",
" ",
" ",
" ",
"\t",
"\n",
"\n\n",
"\n\n\n",
"\t\n",
"Hello world",
" Hello world",
"Hello World",
" Hello World",
" Hello World!",
"Hello, world!",
" Hello, world!",
" this is 🦙.cpp",
"w048 7tuijk dsdfhu",
"нещо на Български",
"កាន់តែពិសេសអាចខលចេញ",
"🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ (only emoji that has its own token)",
"Hello",
" Hello",
" Hello",
" Hello",
" Hello",
" Hello\n Hello",
" (",
"\n =",
"' era",
"Hello, y'all! How are you 😁 ?我想在apple工作1314151天",
"3",
"33",
"333",
"3333",
"33333",
"333333",
"3333333",
"33333333",
"333333333",
]
# write the tests in ./models/test-vocab-inp.txt
# the format is:
#
# test0
# __ggml_vocab_test__
# test1
# __ggml_vocab_test__
# ...
#
with open(f"models/test-vocab-inp.txt", "w") as f:
for text in tests:
f.write(f"{text}")
f.write("\n__ggml_vocab_test__\n")
print("Tests written in ./models/test-vocab-inp.txt")
# with each model, encode all tests and write the results in ./models/test-vocab-out-{name}.txt
# for each test, write the resulting tokens on a separate line
for model in models:
    name = model["name"]
    tokt = model["tokt"]

    # create the tokenizer
    from transformers import AutoTokenizer
    tokenizer = AutoTokenizer.from_pretrained(f"models/tokenizers/{name}")

    with open(f"models/test-vocab-out-{name}.txt", "w") as f:
        for text in tests:
            res = tokenizer.encode(text)
            for r in res:
                f.write(f" {r}")
            f.write("\n")

    print(f"Test results for {name} written in ./models/test-vocab-out-{name}.txt")


@@ -1,10 +1,40 @@
function(llama_test target)
    include(CMakeParseArguments)
    set(options)
    set(oneValueArgs NAME LABEL WORKING_DIRECTORY)
    set(multiValueArgs ARGS)
    cmake_parse_arguments(LLAMA_TEST "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})

    if (NOT DEFINED LLAMA_TEST_LABEL)
        set(LLAMA_TEST_LABEL "main")
    endif()

    if (NOT DEFINED LLAMA_TEST_WORKING_DIRECTORY)
        set(LLAMA_TEST_WORKING_DIRECTORY .)
    endif()

    if (DEFINED LLAMA_TEST_NAME)
        set(TEST_NAME ${LLAMA_TEST_NAME})
    else()
        set(TEST_NAME ${target})
    endif()

    set(TEST_TARGET ${target})

    add_test(
        NAME ${TEST_NAME}
        WORKING_DIRECTORY ${LLAMA_TEST_WORKING_DIRECTORY}
        COMMAND $<TARGET_FILE:${TEST_TARGET}>
        ${LLAMA_TEST_ARGS})

    set_property(TEST ${TEST_NAME} PROPERTY LABELS ${LLAMA_TEST_LABEL})
endfunction()

# Builds and runs a test source file.
# Optional args:
# - NAME: name of the executable & test target (defaults to the source file name without extension)
# - LABEL: label for the test (defaults to main)
# - ARGS: arguments to pass to the test executable
# - WORKING_DIRECTORY
function(llama_target_and_test source)
    include(CMakeParseArguments)
    set(options)
    set(oneValueArgs NAME LABEL WORKING_DIRECTORY)
@@ -35,45 +65,54 @@ function(llama_test source)
    set_property(TEST ${TEST_TARGET} PROPERTY LABELS ${LLAMA_TEST_LABEL})
endfunction()

# llama_target_and_test(test-double-float.cpp) # SLOW
llama_target_and_test(test-quantize-fns.cpp)
llama_target_and_test(test-quantize-perf.cpp)
llama_target_and_test(test-sampling.cpp)
llama_target_and_test(test-chat-template.cpp)

llama_target_and_test(test-tokenizer-0-llama.cpp NAME test-tokenizer-0-llama ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-llama.gguf)
llama_target_and_test(test-tokenizer-0-llama-v3.cpp NAME test-tokenizer-0-llama-v3 ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-llama-v3.gguf)
llama_target_and_test(test-tokenizer-0-falcon.cpp NAME test-tokenizer-0-falcon ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-falcon.gguf)
llama_target_and_test(test-tokenizer-0-deepseek-coder.cpp NAME test-tokenizer-0-deepseek-coder ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-deepseek-coder.gguf)
llama_target_and_test(test-tokenizer-0-deepseek-llm.cpp NAME test-tokenizer-0-deepseek-llm ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-deepseek-llm.gguf)

llama_target_and_test(test-tokenizer-1-llama.cpp NAME test-tokenizer-1-llama ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-llama.gguf)
llama_target_and_test(test-tokenizer-1-llama.cpp NAME test-tokenizer-1-baichuan ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-baichuan.gguf)

# build test-tokenizer-1-bpe target once and add many tests
add_executable(test-tokenizer-1-bpe test-tokenizer-1-bpe.cpp get-model.cpp)
target_link_libraries(test-tokenizer-1-bpe PRIVATE common)
install(TARGETS test-tokenizer-1-bpe RUNTIME)

llama_test(test-tokenizer-1-bpe NAME test-tokenizer-1-falcon ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-falcon.gguf)
llama_test(test-tokenizer-1-bpe NAME test-tokenizer-1-aquila ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-aquila.gguf)
llama_test(test-tokenizer-1-bpe NAME test-tokenizer-1-mpt ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-mpt.gguf)
llama_test(test-tokenizer-1-bpe NAME test-tokenizer-1-stablelm ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-stablelm.gguf)
llama_test(test-tokenizer-1-bpe NAME test-tokenizer-1-gpt-neox ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-gpt-neox.gguf)
llama_test(test-tokenizer-1-bpe NAME test-tokenizer-1-refact ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-refact.gguf)
llama_test(test-tokenizer-1-bpe NAME test-tokenizer-1-starcoder ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-starcoder.gguf)
llama_test(test-tokenizer-1-bpe NAME test-tokenizer-1-gpt2 ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-gpt2.gguf)
#llama_test(test-tokenizer-1-bpe NAME test-tokenizer-1-bloom ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-bloom.gguf) # BIG

llama_target_and_test(test-grammar-parser.cpp)
llama_target_and_test(test-llama-grammar.cpp)
llama_target_and_test(test-grammar-integration.cpp)
llama_target_and_test(test-grad0.cpp)
# llama_target_and_test(test-opt.cpp) # SLOW
llama_target_and_test(test-backend-ops.cpp)

llama_target_and_test(test-rope.cpp)

llama_target_and_test(test-model-load-cancel.cpp LABEL "model")
llama_target_and_test(test-autorelease.cpp LABEL "model")

llama_target_and_test(test-json-schema-to-grammar.cpp WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/..)
target_include_directories(test-json-schema-to-grammar PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/../examples/server)

# dummy executable - not installed