cmake : refactor test targets

Georgi Gerganov 2024-04-29 09:53:14 +03:00
parent 7b1210f6a8
commit ef4cca9e87
3 changed files with 153 additions and 32 deletions


@@ -68,6 +68,7 @@ def download_file_with_auth(url, token, save_path):
    else:
        print(f"Failed to download file. Status code: {response.status_code}")

# download the tokenizer models
for model in models:
    name = model["name"]
    repo = model["repo"]
@@ -173,3 +174,84 @@ print("\n")
print("!!! Copy-paste the function above into convert-hf-to-gguf.py !!!")
print("\n")
# generate tests for each tokenizer model
tests = [
"",
" ",
" ",
" ",
"\t",
"\n",
"\n\n",
"\n\n\n",
"\t\n",
"Hello world",
" Hello world",
"Hello World",
" Hello World",
" Hello World!",
"Hello, world!",
" Hello, world!",
" this is 🦙.cpp",
"w048 7tuijk dsdfhu",
"нещо на Български",
"កាន់តែពិសេសអាចខលចេញ",
"🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ (only emoji that has its own token)",
"Hello",
" Hello",
" Hello",
" Hello",
" Hello",
" Hello\n Hello",
" (",
"\n =",
"' era",
"Hello, y'all! How are you 😁 ?我想在apple工作1314151天",
"3",
"33",
"333",
"3333",
"33333",
"333333",
"3333333",
"33333333",
"333333333",
]
# write the tests in ./models/test-vocab-inp.txt
# the format is:
#
# test0
# __ggml_vocab_test__
# test1
# __ggml_vocab_test__
# ...
#
with open(f"models/test-vocab-inp.txt", "w") as f:
for text in tests:
f.write(f"{text}")
f.write("\n__ggml_vocab_test__\n")
print("Tests written in ./models/test-vocab-inp.txt")
# with each model, encode all tests and write the results in ./models/test-vocab-out-{name}.txt
# for each test, write the resulting tokens on a separate line
for model in models:
    name = model["name"]
    tokt = model["tokt"]

    # create the tokenizer
    from transformers import AutoTokenizer
    tokenizer = AutoTokenizer.from_pretrained(f"models/tokenizers/{name}")

    with open(f"models/test-vocab-out-{name}.txt", "w") as f:
        for text in tests:
            res = tokenizer.encode(text)
            for r in res:
                f.write(f" {r}")
            f.write("\n")

    print(f"Test results for {name} written in ./models/test-vocab-out-{name}.txt")


@@ -1,10 +1,40 @@
function(llama_test target)
    include(CMakeParseArguments)
    set(options)
    set(oneValueArgs NAME LABEL WORKING_DIRECTORY)
    set(multiValueArgs ARGS)
    cmake_parse_arguments(LLAMA_TEST "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})

    if (NOT DEFINED LLAMA_TEST_LABEL)
        set(LLAMA_TEST_LABEL "main")
    endif()

    if (NOT DEFINED LLAMA_TEST_WORKING_DIRECTORY)
        set(LLAMA_TEST_WORKING_DIRECTORY .)
    endif()

    if (DEFINED LLAMA_TEST_NAME)
        set(TEST_NAME ${LLAMA_TEST_NAME})
    else()
        set(TEST_NAME ${target})
    endif()

    set(TEST_TARGET ${target})

    add_test(
        NAME ${TEST_NAME}
        WORKING_DIRECTORY ${LLAMA_TEST_WORKING_DIRECTORY}
        COMMAND $<TARGET_FILE:${TEST_TARGET}>
        ${LLAMA_TEST_ARGS})

    set_property(TEST ${TEST_NAME} PROPERTY LABELS ${LLAMA_TEST_LABEL})
endfunction()

# Builds and runs a test source file.
# Optional args:
# - NAME: name of the executable & test target (defaults to the source file name without extension)
# - LABEL: label for the test (defaults to main)
# - ARGS: arguments to pass to the test executable
# - WORKING_DIRECTORY
function(llama_target_and_test source)
    include(CMakeParseArguments)
    set(options)
    set(oneValueArgs NAME LABEL WORKING_DIRECTORY)
@@ -35,45 +65,54 @@ function(llama_test source)
    set_property(TEST ${TEST_TARGET} PROPERTY LABELS ${LLAMA_TEST_LABEL})
endfunction()

# llama_target_and_test(test-double-float.cpp) # SLOW
llama_target_and_test(test-quantize-fns.cpp)
llama_target_and_test(test-quantize-perf.cpp)
llama_target_and_test(test-sampling.cpp)
llama_target_and_test(test-chat-template.cpp)

llama_target_and_test(test-tokenizer-0-llama.cpp NAME test-tokenizer-0-llama ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-llama.gguf)
llama_target_and_test(test-tokenizer-0-llama-v3.cpp NAME test-tokenizer-0-llama-v3 ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-llama-v3.gguf)
llama_target_and_test(test-tokenizer-0-falcon.cpp NAME test-tokenizer-0-falcon ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-falcon.gguf)
llama_target_and_test(test-tokenizer-0-deepseek-coder.cpp NAME test-tokenizer-0-deepseek-coder ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-deepseek-coder.gguf)
llama_target_and_test(test-tokenizer-0-deepseek-llm.cpp NAME test-tokenizer-0-deepseek-llm ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-deepseek-llm.gguf)

llama_target_and_test(test-tokenizer-1-llama.cpp NAME test-tokenizer-1-llama ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-llama.gguf)
llama_target_and_test(test-tokenizer-1-llama.cpp NAME test-tokenizer-1-baichuan ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-baichuan.gguf)

# build test-tokenizer-1-bpe target once and add many tests
add_executable(test-tokenizer-1-bpe test-tokenizer-1-bpe.cpp get-model.cpp)
target_link_libraries(test-tokenizer-1-bpe PRIVATE common)
install(TARGETS test-tokenizer-1-bpe RUNTIME)

llama_test(test-tokenizer-1-bpe NAME test-tokenizer-1-falcon ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-falcon.gguf)
llama_test(test-tokenizer-1-bpe NAME test-tokenizer-1-aquila ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-aquila.gguf)
llama_test(test-tokenizer-1-bpe NAME test-tokenizer-1-mpt ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-mpt.gguf)
llama_test(test-tokenizer-1-bpe NAME test-tokenizer-1-stablelm ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-stablelm.gguf)
llama_test(test-tokenizer-1-bpe NAME test-tokenizer-1-gpt-neox ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-gpt-neox.gguf)
llama_test(test-tokenizer-1-bpe NAME test-tokenizer-1-refact ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-refact.gguf)
llama_test(test-tokenizer-1-bpe NAME test-tokenizer-1-starcoder ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-starcoder.gguf)
llama_test(test-tokenizer-1-bpe NAME test-tokenizer-1-gpt2 ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-gpt2.gguf)
#llama_test(test-tokenizer-1-bpe NAME test-tokenizer-1-bloom ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-bloom.gguf) # BIG

llama_target_and_test(test-grammar-parser.cpp)
llama_target_and_test(test-llama-grammar.cpp)
llama_target_and_test(test-grammar-integration.cpp)
llama_target_and_test(test-grad0.cpp)
# llama_target_and_test(test-opt.cpp) # SLOW
llama_target_and_test(test-backend-ops.cpp)

llama_target_and_test(test-rope.cpp)

llama_target_and_test(test-model-load-cancel.cpp LABEL "model")
llama_target_and_test(test-autorelease.cpp LABEL "model")

llama_target_and_test(test-json-schema-to-grammar.cpp WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/..)
target_include_directories(test-json-schema-to-grammar PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/../examples/server)

# dummy executable - not installed