cmake : refactor test targets
This commit is contained in:
parent
7b1210f6a8
commit
ef4cca9e87
3 changed files with 153 additions and 32 deletions
|
@ -68,6 +68,7 @@ def download_file_with_auth(url, token, save_path):
|
||||||
else:
|
else:
|
||||||
print(f"Failed to download file. Status code: {response.status_code}")
|
print(f"Failed to download file. Status code: {response.status_code}")
|
||||||
|
|
||||||
|
# download the tokenizer models
|
||||||
for model in models:
|
for model in models:
|
||||||
name = model["name"]
|
name = model["name"]
|
||||||
repo = model["repo"]
|
repo = model["repo"]
|
||||||
|
@ -173,3 +174,84 @@ print("\n")
|
||||||
print("!!! Copy-paste the function above into convert-hf-to-gguf.py !!!")
|
print("!!! Copy-paste the function above into convert-hf-to-gguf.py !!!")
|
||||||
print("\n")
|
print("\n")
|
||||||
|
|
||||||
|
# generate tests for each tokenizer model
|
||||||
|
|
||||||
|
tests = [
|
||||||
|
"",
|
||||||
|
" ",
|
||||||
|
" ",
|
||||||
|
" ",
|
||||||
|
"\t",
|
||||||
|
"\n",
|
||||||
|
"\n\n",
|
||||||
|
"\n\n\n",
|
||||||
|
"\t\n",
|
||||||
|
"Hello world",
|
||||||
|
" Hello world",
|
||||||
|
"Hello World",
|
||||||
|
" Hello World",
|
||||||
|
" Hello World!",
|
||||||
|
"Hello, world!",
|
||||||
|
" Hello, world!",
|
||||||
|
" this is 🦙.cpp",
|
||||||
|
"w048 7tuijk dsdfhu",
|
||||||
|
"нещо на Български",
|
||||||
|
"កាន់តែពិសេសអាចខលចេញ",
|
||||||
|
"🚀 (normal) 😶🌫️ (multiple emojis concatenated) ✅ (only emoji that has its own token)",
|
||||||
|
"Hello",
|
||||||
|
" Hello",
|
||||||
|
" Hello",
|
||||||
|
" Hello",
|
||||||
|
" Hello",
|
||||||
|
" Hello\n Hello",
|
||||||
|
" (",
|
||||||
|
"\n =",
|
||||||
|
"' era",
|
||||||
|
"Hello, y'all! How are you 😁 ?我想在apple工作1314151天~",
|
||||||
|
"3",
|
||||||
|
"33",
|
||||||
|
"333",
|
||||||
|
"3333",
|
||||||
|
"33333",
|
||||||
|
"333333",
|
||||||
|
"3333333",
|
||||||
|
"33333333",
|
||||||
|
"333333333",
|
||||||
|
]
|
||||||
|
|
||||||
|
# write the tests in ./models/test-vocab-inp.txt
|
||||||
|
# the format is:
|
||||||
|
#
|
||||||
|
# test0
|
||||||
|
# __ggml_vocab_test__
|
||||||
|
# test1
|
||||||
|
# __ggml_vocab_test__
|
||||||
|
# ...
|
||||||
|
#
|
||||||
|
|
||||||
|
with open(f"models/test-vocab-inp.txt", "w") as f:
|
||||||
|
for text in tests:
|
||||||
|
f.write(f"{text}")
|
||||||
|
f.write("\n__ggml_vocab_test__\n")
|
||||||
|
|
||||||
|
print("Tests written in ./models/test-vocab-inp.txt")
|
||||||
|
|
||||||
|
# with each model, encode all tests and write the results in ./models/test-vocab-out-{name}.txt
|
||||||
|
# for each test, write the resulting tokens on a separate line
|
||||||
|
|
||||||
|
for model in models:
|
||||||
|
name = model["name"]
|
||||||
|
tokt = model["tokt"]
|
||||||
|
|
||||||
|
# create the tokenizer
|
||||||
|
from transformers import AutoTokenizer
|
||||||
|
tokenizer = AutoTokenizer.from_pretrained(f"models/tokenizers/{name}")
|
||||||
|
|
||||||
|
with open(f"models/test-vocab-out-{name}.txt", "w") as f:
|
||||||
|
for text in tests:
|
||||||
|
res = tokenizer.encode(text)
|
||||||
|
for r in res:
|
||||||
|
f.write(f" {r}")
|
||||||
|
f.write("\n")
|
||||||
|
|
||||||
|
print(f"Test results for {name} written in ./models/test-vocab-out-{name}.txt")
|
||||||
|
|
|
@ -1,10 +1,40 @@
|
||||||
|
function(llama_test target)
|
||||||
|
include(CMakeParseArguments)
|
||||||
|
set(options)
|
||||||
|
set(oneValueArgs NAME LABEL WORKING_DIRECTORY)
|
||||||
|
set(multiValueArgs ARGS)
|
||||||
|
cmake_parse_arguments(LLAMA_TEST "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
|
||||||
|
|
||||||
|
if (NOT DEFINED LLAMA_TEST_LABEL)
|
||||||
|
set(LLAMA_TEST_LABEL "main")
|
||||||
|
endif()
|
||||||
|
if (NOT DEFINED LLAMA_TEST_WORKING_DIRECTORY)
|
||||||
|
set(LLAMA_TEST_WORKING_DIRECTORY .)
|
||||||
|
endif()
|
||||||
|
if (DEFINED LLAMA_TEST_NAME)
|
||||||
|
set(TEST_NAME ${LLAMA_TEST_NAME})
|
||||||
|
else()
|
||||||
|
set(TEST_NAME ${target})
|
||||||
|
endif()
|
||||||
|
|
||||||
|
set(TEST_TARGET ${target})
|
||||||
|
|
||||||
|
add_test(
|
||||||
|
NAME ${TEST_NAME}
|
||||||
|
WORKING_DIRECTORY ${LLAMA_TEST_WORKING_DIRECTORY}
|
||||||
|
COMMAND $<TARGET_FILE:${TEST_TARGET}>
|
||||||
|
${LLAMA_TEST_ARGS})
|
||||||
|
|
||||||
|
set_property(TEST ${TEST_NAME} PROPERTY LABELS ${LLAMA_TEST_LABEL})
|
||||||
|
endfunction()
|
||||||
|
|
||||||
# Builds and runs a test source file.
|
# Builds and runs a test source file.
|
||||||
# Optional args:
|
# Optional args:
|
||||||
# - NAME: name of the executable & test target (defaults to the source file name without extension)
|
# - NAME: name of the executable & test target (defaults to the source file name without extension)
|
||||||
# - LABEL: label for the test (defaults to main)
|
# - LABEL: label for the test (defaults to main)
|
||||||
# - ARGS: arguments to pass to the test executable
|
# - ARGS: arguments to pass to the test executable
|
||||||
# - WORKING_DIRECTORY
|
# - WORKING_DIRECTORY
|
||||||
function(llama_test source)
|
function(llama_target_and_test source)
|
||||||
include(CMakeParseArguments)
|
include(CMakeParseArguments)
|
||||||
set(options)
|
set(options)
|
||||||
set(oneValueArgs NAME LABEL WORKING_DIRECTORY)
|
set(oneValueArgs NAME LABEL WORKING_DIRECTORY)
|
||||||
|
@ -35,45 +65,54 @@ function(llama_test source)
|
||||||
set_property(TEST ${TEST_TARGET} PROPERTY LABELS ${LLAMA_TEST_LABEL})
|
set_property(TEST ${TEST_TARGET} PROPERTY LABELS ${LLAMA_TEST_LABEL})
|
||||||
endfunction()
|
endfunction()
|
||||||
|
|
||||||
# llama_test(test-double-float.cpp) # SLOW
|
# llama_target_and_test(test-double-float.cpp) # SLOW
|
||||||
llama_test(test-quantize-fns.cpp)
|
llama_target_and_test(test-quantize-fns.cpp)
|
||||||
llama_test(test-quantize-perf.cpp)
|
llama_target_and_test(test-quantize-perf.cpp)
|
||||||
llama_test(test-sampling.cpp)
|
llama_target_and_test(test-sampling.cpp)
|
||||||
llama_test(test-chat-template.cpp)
|
llama_target_and_test(test-chat-template.cpp)
|
||||||
|
|
||||||
llama_test(test-tokenizer-0-llama.cpp NAME test-tokenizer-0-llama ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-llama.gguf)
|
llama_target_and_test(test-tokenizer-0-llama.cpp NAME test-tokenizer-0-llama ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-llama.gguf)
|
||||||
llama_test(test-tokenizer-0-llama-v3.cpp NAME test-tokenizer-0-llama-v3 ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-llama-v3.gguf)
|
llama_target_and_test(test-tokenizer-0-llama-v3.cpp NAME test-tokenizer-0-llama-v3 ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-llama-v3.gguf)
|
||||||
llama_test(test-tokenizer-0-falcon.cpp NAME test-tokenizer-0-falcon ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-falcon.gguf)
|
llama_target_and_test(test-tokenizer-0-falcon.cpp NAME test-tokenizer-0-falcon ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-falcon.gguf)
|
||||||
|
|
||||||
llama_test(test-tokenizer-0-deepseek-coder.cpp NAME test-tokenizer-0-deepseek-coder ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-deepseek-coder.gguf)
|
llama_target_and_test(test-tokenizer-0-deepseek-coder.cpp NAME test-tokenizer-0-deepseek-coder ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-deepseek-coder.gguf)
|
||||||
llama_test(test-tokenizer-0-deepseek-llm.cpp NAME test-tokenizer-0-deepseek-llm ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-deepseek-llm.gguf)
|
llama_target_and_test(test-tokenizer-0-deepseek-llm.cpp NAME test-tokenizer-0-deepseek-llm ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-deepseek-llm.gguf)
|
||||||
|
|
||||||
llama_test(test-tokenizer-1-llama.cpp NAME test-tokenizer-1-llama ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-llama.gguf)
|
llama_target_and_test(test-tokenizer-1-llama.cpp NAME test-tokenizer-1-llama ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-llama.gguf)
|
||||||
llama_test(test-tokenizer-1-llama.cpp NAME test-tokenizer-1-baichuan ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-baichuan.gguf)
|
llama_target_and_test(test-tokenizer-1-llama.cpp NAME test-tokenizer-1-baichuan ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-baichuan.gguf)
|
||||||
|
|
||||||
llama_test(test-tokenizer-1-bpe.cpp NAME test-tokenizer-1-falcon ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-falcon.gguf)
|
# build test-tokenizer-1-bpe target once and add many tests
|
||||||
llama_test(test-tokenizer-1-bpe.cpp NAME test-tokenizer-1-aquila ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-aquila.gguf)
|
add_executable(test-tokenizer-1-bpe test-tokenizer-1-bpe.cpp get-model.cpp)
|
||||||
llama_test(test-tokenizer-1-bpe.cpp NAME test-tokenizer-1-mpt ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-mpt.gguf)
|
target_link_libraries(test-tokenizer-1-bpe PRIVATE common)
|
||||||
llama_test(test-tokenizer-1-bpe.cpp NAME test-tokenizer-1-stablelm-3b-4e1t ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-stablelm-3b-4e1t.gguf)
|
install(TARGETS test-tokenizer-1-bpe RUNTIME)
|
||||||
llama_test(test-tokenizer-1-bpe.cpp NAME test-tokenizer-1-gpt-neox ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-gpt-neox.gguf)
|
|
||||||
llama_test(test-tokenizer-1-bpe.cpp NAME test-tokenizer-1-refact ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-refact.gguf)
|
|
||||||
llama_test(test-tokenizer-1-bpe.cpp NAME test-tokenizer-1-starcoder ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-starcoder.gguf)
|
|
||||||
llama_test(test-tokenizer-1-bpe.cpp NAME test-tokenizer-1-gpt2 ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-gpt2.gguf)
|
|
||||||
#llama_test(test-tokenizer-1-bpe.cpp NAME test-tokenizer-1-bloom ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-bloom.gguf) # BIG
|
|
||||||
|
|
||||||
llama_test(test-grammar-parser.cpp)
|
llama_test(test-tokenizer-1-bpe NAME test-tokenizer-1-falcon ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-falcon.gguf)
|
||||||
llama_test(test-llama-grammar.cpp)
|
llama_test(test-tokenizer-1-bpe NAME test-tokenizer-1-aquila ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-aquila.gguf)
|
||||||
llama_test(test-grammar-integration.cpp)
|
llama_test(test-tokenizer-1-bpe NAME test-tokenizer-1-mpt ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-mpt.gguf)
|
||||||
llama_test(test-grad0.cpp)
|
llama_test(test-tokenizer-1-bpe NAME test-tokenizer-1-stablelm ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-stablelm.gguf)
|
||||||
# llama_test(test-opt.cpp) # SLOW
|
llama_test(test-tokenizer-1-bpe NAME test-tokenizer-1-gpt-neox ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-gpt-neox.gguf)
|
||||||
llama_test(test-backend-ops.cpp)
|
llama_test(test-tokenizer-1-bpe NAME test-tokenizer-1-refact ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-refact.gguf)
|
||||||
|
llama_test(test-tokenizer-1-bpe NAME test-tokenizer-1-starcoder ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-starcoder.gguf)
|
||||||
|
llama_test(test-tokenizer-1-bpe NAME test-tokenizer-1-gpt2 ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-gpt2.gguf)
|
||||||
|
#llama_test(test-tokenizer-1-bpe NAME test-tokenizer-1-bloom ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-bloom.gguf) # BIG
|
||||||
|
|
||||||
llama_test(test-rope.cpp)
|
|
||||||
|
|
||||||
llama_test(test-model-load-cancel.cpp LABEL "model")
|
|
||||||
llama_test(test-autorelease.cpp LABEL "model")
|
|
||||||
|
|
||||||
llama_test(test-json-schema-to-grammar.cpp WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/..)
|
|
||||||
|
|
||||||
|
llama_target_and_test(test-grammar-parser.cpp)
|
||||||
|
llama_target_and_test(test-llama-grammar.cpp)
|
||||||
|
llama_target_and_test(test-grammar-integration.cpp)
|
||||||
|
llama_target_and_test(test-grad0.cpp)
|
||||||
|
# llama_target_and_test(test-opt.cpp) # SLOW
|
||||||
|
llama_target_and_test(test-backend-ops.cpp)
|
||||||
|
|
||||||
|
llama_target_and_test(test-rope.cpp)
|
||||||
|
|
||||||
|
llama_target_and_test(test-model-load-cancel.cpp LABEL "model")
|
||||||
|
llama_target_and_test(test-autorelease.cpp LABEL "model")
|
||||||
|
|
||||||
|
llama_target_and_test(test-json-schema-to-grammar.cpp WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/..)
|
||||||
target_include_directories(test-json-schema-to-grammar PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/../examples/server)
|
target_include_directories(test-json-schema-to-grammar PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/../examples/server)
|
||||||
|
|
||||||
# dummy executable - not installed
|
# dummy executable - not installed
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue