Merge pull request #5 from zihaoccc/cleanup2

remove tests
This commit is contained in:
Zihao Chen 2024-07-26 16:39:41 -07:00 committed by GitHub
commit f7c0f9f576
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
28 changed files with 0 additions and 10585 deletions

View file

@ -63,7 +63,6 @@ option(LLAMA_SANITIZE_ADDRESS "llama: enable address sanitizer" OFF)
option(LLAMA_SANITIZE_UNDEFINED "llama: enable undefined sanitizer" OFF)
# extra artifacts
option(LLAMA_BUILD_TESTS "llama: build tests" ${LLAMA_STANDALONE})
option(LLAMA_BUILD_EXAMPLES "llama: build examples" ${LLAMA_STANDALONE})
option(LLAMA_BUILD_SERVER "llama: build server example" ${LLAMA_STANDALONE})
@ -189,11 +188,6 @@ install(FILES "${CMAKE_CURRENT_BINARY_DIR}/llama.pc"
add_subdirectory(common)
if (LLAMA_BUILD_TESTS AND NOT CMAKE_JS_VERSION)
include(CTest)
add_subdirectory(tests)
endif ()
if (LLAMA_BUILD_EXAMPLES)
add_subdirectory(examples)
add_subdirectory(pocs)

4
tests/.gitignore vendored
View file

@ -1,4 +0,0 @@
*
!*.*
*.o
ggml-common.h

View file

@ -1,137 +0,0 @@
# Registers a CTest test that runs the executable built by an existing target.
# Positional args:
# - target: executable target whose built file is used as the test command
# Optional args:
# - NAME: name of the test (defaults to the target name)
# - LABEL: label for the test (defaults to main)
# - ARGS: arguments to pass to the test executable
# - WORKING_DIRECTORY: directory the test runs in (defaults to .)
# Example:
#   llama_test(test-tokenizer-0 NAME test-tokenizer-0-falcon ARGS path/to/vocab.gguf)
function(llama_test target)
    include(CMakeParseArguments)

    set(options)
    set(oneValueArgs NAME LABEL WORKING_DIRECTORY)
    set(multiValueArgs ARGS)
    cmake_parse_arguments(LLAMA_TEST "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})

    # Anything that is not one of the keywords above (for example a misspelled
    # LABELS instead of LABEL) ends up here. Report it instead of silently
    # registering the test with default settings.
    if (DEFINED LLAMA_TEST_UNPARSED_ARGUMENTS)
        message(AUTHOR_WARNING
            "llama_test(${target}): ignoring unrecognized arguments: ${LLAMA_TEST_UNPARSED_ARGUMENTS}")
    endif()

    if (NOT DEFINED LLAMA_TEST_LABEL)
        set(LLAMA_TEST_LABEL "main")
    endif()

    if (NOT DEFINED LLAMA_TEST_WORKING_DIRECTORY)
        set(LLAMA_TEST_WORKING_DIRECTORY .)
    endif()

    # The test is named after the target unless NAME overrides it; this is what
    # lets one executable back several differently-named tests.
    if (DEFINED LLAMA_TEST_NAME)
        set(TEST_NAME ${LLAMA_TEST_NAME})
    else()
        set(TEST_NAME ${target})
    endif()

    add_test(
        NAME ${TEST_NAME}
        WORKING_DIRECTORY ${LLAMA_TEST_WORKING_DIRECTORY}
        COMMAND $<TARGET_FILE:${target}>
        ${LLAMA_TEST_ARGS})

    set_property(TEST ${TEST_NAME} PROPERTY LABELS ${LLAMA_TEST_LABEL})
endfunction()
# Builds a test executable from a source file and registers it as a CTest test.
# The executable is compiled together with get-model.cpp, linked against
# `common`, and added to the install rules.
# Positional args:
# - source: the test source file
# Optional args:
# - NAME: name of the executable & test target (defaults to the source file name without extension)
# - LABEL: label for the test (defaults to main)
# - ARGS: arguments to pass to the test executable
# - WORKING_DIRECTORY: directory the test runs in (defaults to .)
# Example:
#   llama_target_and_test(test-autorelease.cpp LABEL "model")
function(llama_target_and_test source)
    include(CMakeParseArguments)

    set(options)
    set(oneValueArgs NAME LABEL WORKING_DIRECTORY)
    set(multiValueArgs ARGS)
    cmake_parse_arguments(LLAMA_TEST "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})

    # Anything that is not one of the keywords above (for example a misspelled
    # LABELS instead of LABEL) ends up here. Report it instead of silently
    # registering the test with default settings.
    if (DEFINED LLAMA_TEST_UNPARSED_ARGUMENTS)
        message(AUTHOR_WARNING
            "llama_target_and_test(${source}): ignoring unrecognized arguments: ${LLAMA_TEST_UNPARSED_ARGUMENTS}")
    endif()

    if (NOT DEFINED LLAMA_TEST_LABEL)
        set(LLAMA_TEST_LABEL "main")
    endif()

    if (NOT DEFINED LLAMA_TEST_WORKING_DIRECTORY)
        set(LLAMA_TEST_WORKING_DIRECTORY .)
    endif()

    # The same identifier names both the executable target and the test.
    if (DEFINED LLAMA_TEST_NAME)
        set(TEST_TARGET ${LLAMA_TEST_NAME})
    else()
        get_filename_component(TEST_TARGET ${source} NAME_WE)
    endif()

    add_executable(${TEST_TARGET} ${source} get-model.cpp)
    install(TARGETS ${TEST_TARGET} RUNTIME)
    target_link_libraries(${TEST_TARGET} PRIVATE common)

    add_test(
        NAME ${TEST_TARGET}
        WORKING_DIRECTORY ${LLAMA_TEST_WORKING_DIRECTORY}
        COMMAND $<TARGET_FILE:${TEST_TARGET}>
        ${LLAMA_TEST_ARGS})

    set_property(TEST ${TEST_TARGET} PROPERTY LABELS ${LLAMA_TEST_LABEL})
endfunction()
# test-tokenizer-0 is compiled a single time; every vocabulary file listed
# below then gets its own test that runs this shared executable.
add_executable(test-tokenizer-0 test-tokenizer-0.cpp)
target_link_libraries(test-tokenizer-0 PRIVATE common)
install(TARGETS test-tokenizer-0 RUNTIME)

# Each entry <v> registers test "test-tokenizer-0-<v>" with
# ../models/ggml-vocab-<v>.gguf as its only argument.
foreach(llama_tokenizer0_vocab IN ITEMS
        bert-bge
        command-r
        deepseek-coder
        deepseek-llm
        falcon
        gpt-2
        llama-bpe
        llama-spm
        mpt
        phi-3
        qwen2
        refact
        starcoder)
    llama_test(test-tokenizer-0
        NAME test-tokenizer-0-${llama_tokenizer0_vocab}
        ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-${llama_tokenizer0_vocab}.gguf)
endforeach()
# test-tokenizer-1-bpe: one executable meant to back a test per BPE vocabulary
add_executable(test-tokenizer-1-bpe test-tokenizer-1-bpe.cpp)
install(TARGETS test-tokenizer-1-bpe RUNTIME)
target_link_libraries(test-tokenizer-1-bpe PRIVATE common)

# TODO: all test-tokenizer-1-bpe registrations are disabled due to slowness
#llama_test(test-tokenizer-1-bpe NAME test-tokenizer-1-aquila ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-aquila.gguf)
#llama_test(test-tokenizer-1-bpe NAME test-tokenizer-1-falcon ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-falcon.gguf)
#llama_test(test-tokenizer-1-bpe NAME test-tokenizer-1-gpt-2 ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-gpt-2.gguf)
#llama_test(test-tokenizer-1-bpe NAME test-tokenizer-1-gpt-neox ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-gpt-neox.gguf)
#llama_test(test-tokenizer-1-bpe NAME test-tokenizer-1-llama-bpe ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-llama-bpe.gguf --ignore-merges)
#llama_test(test-tokenizer-1-bpe NAME test-tokenizer-1-mpt ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-mpt.gguf)
#llama_test(test-tokenizer-1-bpe NAME test-tokenizer-1-refact ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-refact.gguf)
#llama_test(test-tokenizer-1-bpe NAME test-tokenizer-1-starcoder ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-starcoder.gguf)

# test-tokenizer-1-spm: same arrangement for the SPM vocabularies
add_executable(test-tokenizer-1-spm test-tokenizer-1-spm.cpp)
install(TARGETS test-tokenizer-1-spm RUNTIME)
target_link_libraries(test-tokenizer-1-spm PRIVATE common)

llama_test(test-tokenizer-1-spm
    NAME test-tokenizer-1-llama-spm
    ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-llama-spm.gguf)
#llama_test(test-tokenizer-1-spm NAME test-tokenizer-1-baichuan ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-baichuan.gguf)
# One executable + one test per source file. Entries marked SLOW are kept for
# reference but not built.
# llama_target_and_test(test-double-float.cpp) # SLOW
llama_target_and_test(test-quantize-fns.cpp)
llama_target_and_test(test-quantize-perf.cpp)
llama_target_and_test(test-sampling.cpp)
llama_target_and_test(test-chat-template.cpp)
llama_target_and_test(test-grammar-parser.cpp)
llama_target_and_test(test-llama-grammar.cpp)
llama_target_and_test(test-grammar-integration.cpp)
llama_target_and_test(test-grad0.cpp)
# llama_target_and_test(test-opt.cpp) # SLOW
llama_target_and_test(test-backend-ops.cpp)
llama_target_and_test(test-rope.cpp)

# Tests carrying the "model" label instead of the default "main"
llama_target_and_test(test-model-load-cancel.cpp LABEL "model")
llama_target_and_test(test-autorelease.cpp LABEL "model")

# TODO: disabled on loongarch64 because the ggml-ci node lacks Python 3.8
# The variable is named without ${} on purpose: if() then reads it directly, so
# an empty CMAKE_SYSTEM_PROCESSOR cannot leave the condition without a left-hand
# operand, and its value is never re-interpreted as another variable's name.
if (NOT CMAKE_SYSTEM_PROCESSOR MATCHES "loongarch64")
    llama_target_and_test(test-json-schema-to-grammar.cpp WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/..)
    target_include_directories(test-json-schema-to-grammar PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/../examples/server)
endif()

# dummy executable - not installed
get_filename_component(TEST_TARGET test-c.c NAME_WE)
add_executable(${TEST_TARGET} test-c.c)
target_link_libraries(${TEST_TARGET} PRIVATE llama)

View file

@ -1,21 +0,0 @@
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include "get-model.h"
// Chooses the model file a test should use.
//
// The first command-line argument wins when one is present. Otherwise the
// LLAMACPP_TEST_MODELFILE environment variable is consulted. If that is unset
// or empty, a warning is written to stderr and the process ends with
// EXIT_SUCCESS, so this function only returns when it has a path.
char * get_model_or_exit(int argc, char *argv[]) {
    if (argc > 1) {
        return argv[1];
    }

    char * env_path = getenv("LLAMACPP_TEST_MODELFILE");
    if (env_path && env_path[0] != '\0') {
        return env_path;
    }

    fprintf(stderr, "\033[33mWARNING: No model file provided. Skipping this test. Set LLAMACPP_TEST_MODELFILE=<gguf_model_path> to silence this warning and run this test.\n\033[0m");
    exit(EXIT_SUCCESS);
}

View file

@ -1,2 +0,0 @@
#pragma once
// Returns the model path for a test given main()'s (argc, argv): argv[1] when
// an argument was passed, otherwise the value of the LLAMACPP_TEST_MODELFILE
// environment variable. When neither supplies a non-empty path the function
// prints a warning and exits the process with EXIT_SUCCESS instead of returning.
char * get_model_or_exit(int, char*[]);

View file

@ -1,10 +0,0 @@
// Command-line helper: loads the JSON schema file named by the first CLI
// argument, passes it through SchemaConverter (resolveRefs, then visit), and
// prints the formatted grammar to stdout.
import { readFileSync } from "fs"
import { SchemaConverter } from "../examples/server/public/json-schema-to-grammar.mjs"

// process.argv[0] and [1] are the runtime and the script; [2] is the schema file.
const schemaPath = process.argv[2]
const rawSchema = JSON.parse(readFileSync(schemaPath, "utf8"))

const converter = new SchemaConverter({})
// The file:// URL of the schema is handed to resolveRefs alongside the schema.
const resolvedSchema = await converter.resolveRefs(rawSchema, `file://${schemaPath}`)
converter.visit(resolvedSchema, '')
console.log(converter.formatGrammar())

View file

@ -1,24 +0,0 @@
// ref: https://github.com/ggerganov/llama.cpp/issues/4952#issuecomment-1892864763
#include <cstdio>
#include <string>
#include <thread>
#include "llama.h"
#include "get-model.h"
// Initialises the backend, loads a model and creates a context entirely inside
// a worker thread, frees all of it in that same thread, and then lets the
// process exit normally once the thread has been joined.
int main(int argc, char ** argv) {
    auto * model_path = get_model_or_exit(argc, argv);

    std::thread worker([&model_path]() {
        llama_backend_init();

        auto * model = llama_load_model_from_file(model_path, llama_model_default_params());
        auto * ctx   = llama_new_context_with_model(model, llama_context_default_params());

        // release in reverse order of creation
        llama_free(ctx);
        llama_free_model(model);

        llama_backend_free();
    });
    worker.join();

    return 0;
}

File diff suppressed because it is too large Load diff

View file

@ -1,7 +0,0 @@
#include "llama.h"
#ifdef GGML_USE_KOMPUTE
#include "ggml-kompute.h"
#endif
// Intentionally empty: nothing is executed at run time.
// NOTE(review): presumably the point of this file is only that llama.h (and
// ggml-kompute.h when GGML_USE_KOMPUTE is defined) can be included from a C
// translation unit and linked - confirm against the build setup.
int main(void) {}

View file

@ -1,177 +0,0 @@
#include <string>
#include <vector>
#include <sstream>
#undef NDEBUG
#include <cassert>
#include "llama.h"
#include "common.h"
int main(void) {
llama_chat_message conversation[] = {
{"system", "You are a helpful assistant"},
{"user", "Hello"},
{"assistant", "Hi there"},
{"user", "Who are you"},
{"assistant", " I am an assistant "},
{"user", "Another question"},
};
size_t message_count = 6;
std::vector<std::string> templates = {
// teknium/OpenHermes-2.5-Mistral-7B
"{% for message in messages %}{{'<|im_start|>' + message['role'] + '\\n' + message['content'] + '<|im_end|>' + '\\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\\n' }}{% endif %}",
// mistralai/Mistral-7B-Instruct-v0.2
"{{ bos_token }}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] == 'user' %}{{ '[INST] ' + message['content'] + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ message['content'] + eos_token}}{% else %}{{ raise_exception('Only user and assistant roles are supported!') }}{% endif %}{% endfor %}",
// TheBloke/FusionNet_34Bx2_MoE-AWQ
"{%- for idx in range(0, messages|length) -%}\\n{%- if messages[idx]['role'] == 'user' -%}\\n{%- if idx > 1 -%}\\n{{- bos_token + '[INST] ' + messages[idx]['content'] + ' [/INST]' -}}\\n{%- else -%}\\n{{- messages[idx]['content'] + ' [/INST]' -}}\\n{%- endif -%}\\n{% elif messages[idx]['role'] == 'system' %}\\n{{- '[INST] <<SYS>>\\\\n' + messages[idx]['content'] + '\\\\n<</SYS>>\\\\n\\\\n' -}}\\n{%- elif messages[idx]['role'] == 'assistant' -%}\\n{{- ' ' + messages[idx]['content'] + ' ' + eos_token -}}\\n{% endif %}\\n{% endfor %}",
// bofenghuang/vigogne-2-70b-chat
"{{ bos_token }}{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% elif true == true and not '<<SYS>>' in messages[0]['content'] %}{% set loop_messages = messages %}{% set system_message = 'Vous êtes Vigogne, un assistant IA créé par Zaion Lab. Vous suivez extrêmement bien les instructions. Aidez autant que vous le pouvez.' %}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if loop.index0 == 0 and system_message != false %}{% set content = '<<SYS>>\\\\n' + system_message + '\\\\n<</SYS>>\\\\n\\\\n' + message['content'] %}{% else %}{% set content = message['content'] %}{% endif %}{% if message['role'] == 'user' %}{{ '[INST] ' + content.strip() + ' [/INST]' }}{% elif message['role'] == 'system' %}{{ '<<SYS>>\\\\n' + content.strip() + '\\\\n<</SYS>>\\\\n\\\\n' }}{% elif message['role'] == 'assistant' %}{{ ' ' + content.strip() + ' ' + eos_token }}{% endif %}{% endfor %}",
// mlabonne/AlphaMonarch-7B
"{% for message in messages %}{{bos_token + message['role'] + '\\n' + message['content'] + eos_token + '\\n'}}{% endfor %}{% if add_generation_prompt %}{{ bos_token + 'assistant\\n' }}{% endif %}",
// google/gemma-7b-it
"{% if messages[0]['role'] == 'system' %}{{ raise_exception('System role not supported') }}{% endif %}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if (message['role'] == 'assistant') %}{% set role = 'model' %}{% else %}{% set role = message['role'] %}{% endif %}{{ '<start_of_turn>' + role + '\\n' + message['content'] | trim + '<end_of_turn>\\n' }}{% endfor %}{% if add_generation_prompt %}{{'<start_of_turn>model\\n'}}{% endif %}",
// OrionStarAI/Orion-14B-Chat
"{% for message in messages %}{% if loop.first %}{{ bos_token }}{% endif %}{% if message['role'] == 'user' %}{{ 'Human: ' + message['content'] + '\\n\\nAssistant: ' + eos_token }}{% elif message['role'] == 'assistant' %}{{ message['content'] + eos_token }}{% endif %}{% endfor %}",
// openchat/openchat-3.5-0106
// The included chat_template differs from the author's suggestions here: https://huggingface.co/openchat/openchat_3.5/discussions/5#65448109b4a3f3a2f486fd9d
// So we match against the included template but implement the suggested version.
"{{ bos_token }}{% for message in messages %}{{ 'GPT4 Correct ' + message['role'].title() + ': ' + message['content'] + '<|end_of_turn|>'}}{% endfor %}{% if add_generation_prompt %}{{ 'GPT4 Correct Assistant:' }}{% endif %}",
// deepseek-ai/deepseek-coder-33b-instruct
"{% if not add_generation_prompt is defined %}\n{% set add_generation_prompt = false %}\n{% endif %}\n{%- set ns = namespace(found=false) -%}\n{%- for message in messages -%}\n {%- if message['role'] == 'system' -%}\n {%- set ns.found = true -%}\n {%- endif -%}\n{%- endfor -%}\n{{bos_token}}{%- if not ns.found -%}\n{{'You are an AI programming assistant, utilizing the Deepseek Coder model, developed by Deepseek Company, and you only answer questions related to computer science. For politically sensitive questions, security and privacy issues, and other non-computer science questions, you will refuse to answer\\n'}}\n{%- endif %}\n{%- for message in messages %}\n {%- if message['role'] == 'system' %}\n{{ message['content'] }}\n {%- else %}\n {%- if message['role'] == 'user' %}\n{{'### Instruction:\\n' + message['content'] + '\\n'}}\n {%- else %}\n{{'### Response:\\n' + message['content'] + '\\n<|EOT|>\\n'}}\n {%- endif %}\n {%- endif %}\n{%- endfor %}\n{% if add_generation_prompt %}\n{{'### Response:'}}\n{% endif %}",
// eachadea/vicuna-13b-1.1
// No template included in tokenizer_config.json, so this template likely needs to be manually set.
"{%- for message in messages %}{%- if message['role'] == 'system' -%}{{- '' + message['content'] + '\n\n' -}}{%- else -%}{%- if message['role'] == 'user' -%}{{-'USER: ' + message['content'] + '\n'-}}{%- else -%}{{-'ASSISTANT: ' + message['content'] + '</s>\n' -}}{%- endif -%}{%- endif -%}{%- endfor -%}{%- if add_generation_prompt -%}{{-'ASSISTANT:'-}}{%- endif -%}",
// Orca-Vicuna
// No template included in tokenizer_config.json, so this template likely needs to be manually set.
"{%- for message in messages %}{%- if message['role'] == 'system' -%}{{-'SYSTEM: ' + message['content'] + '\n' -}}{%- else -%}{%- if message['role'] == 'user' -%}{{-'USER: ' + message['content'] + '\n'-}}{%- else -%}{{-'ASSISTANT: ' + message['content'] + '</s>\n' -}}{%- endif -%}{%- endif -%}{%- endfor -%}{%- if add_generation_prompt -%}{{-'ASSISTANT:'-}}{%- endif -%}",
// CohereForAI/c4ai-command-r-plus
"{{ bos_token }}{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% elif false == true %}{% set loop_messages = messages %}{% set system_message = 'You are Command-R, a brilliant, sophisticated, AI-assistant trained to assist human users by providing thorough responses. You are trained by Cohere.' %}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{% if system_message != false %}{{ '<|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|>' + system_message + '<|END_OF_TURN_TOKEN|>' }}{% endif %}{% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% set content = message['content'] %}{% if message['role'] == 'user' %}{{ '<|START_OF_TURN_TOKEN|><|USER_TOKEN|>' + content.strip() + '<|END_OF_TURN_TOKEN|>' }}{% elif message['role'] == 'assistant' %}{{ '<|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>' + content.strip() + '<|END_OF_TURN_TOKEN|>' }}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ '<|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>' }}{% endif %}",
// Llama-3
"{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}",
//Phi-3-mini
"{{ bos_token }}{% for message in messages %}{% if (message['role'] == 'user') %}{{'<|user|>' + '\n' + message['content'] + '<|end|>' + '\n' + '<|assistant|>' + '\n'}}{% elif (message['role'] == 'assistant') %}{{message['content'] + '<|end|>' + '\n'}}{% endif %}{% endfor %}",
//Phi-3-small
"{{ bos_token }}{% for message in messages %}{{'<|' + message['role'] + '|>' + '\n' + message['content'] + '<|end|>\n' }}{% endfor %}{% if add_generation_prompt %}{{ '<|assistant|>\n' }}{% else %}{{ eos_token }}{% endif %}",
//Phi-3-medium
"{% for message in messages %}{% if (message['role'] == 'user') %}{{'<|user|>' + '\n' + message['content'] + '<|end|>' + '\n' + '<|assistant|>' + '\n'}}{% elif (message['role'] == 'assistant') %}{{message['content'] + '<|end|>' + '\n'}}{% endif %}{% endfor %}",
//Phi-3-vision
"{% for message in messages %}{{'<|' + message['role'] + '|>' + '\n' + message['content'] + '<|end|>\n' }}{% endfor %}{% if add_generation_prompt and messages[-1]['role'] != 'assistant' %}{{- '<|assistant|>\n' -}}{% endif %}",
// ChatGLM3
"{% for message in messages %}{% if loop.first %}[gMASK]sop<|{{ message['role'] }}|>\n {{ message['content'] }}{% else %}<|{{ message['role'] }}|>\n {{ message['content'] }}{% endif %}{% endfor %}{% if add_generation_prompt %}<|assistant|>{% endif %}",
// ChatGLM4
u8"[gMASK]<sop>{% for item in messages %}{% if item['tools'] is defined %}<|system|>\n你是一个名为 ChatGLM 的人工智能助手。你是基于智谱AI训练的语言模型 GLM-4 模型开发的,你的任务是针对用户的问题和要求提供适当的答复和支持。\n\n# 可用工具{% set tools = item['tools'] %}{% for tool in tools %}{% if tool['type'] == 'function' %}\n\n## {{ tool['function']['name'] }}\n\n{{ tool['function'] | tojson(indent=4) }}\n......{% endif %}{% endfor %}{% endif %}{% if item['content'] %}<|{{ item['role'] }}|>{{ item['metadata'] }}\n{{ item['content'] }}{% endif %}{% endfor %}{% if add_generation_prompt %}<|assistant|>{% endif %}",
// MiniCPM-3B-OpenHermes-2.5-v2-GGUF
u8"{% for message in messages %}{% if message['role'] == 'user' %}{{'<用户>' + message['content'].strip() + '<AI>'}}{% else %}{{message['content'].strip()}}{% endif %}{% endfor %}",
// DeepSeek-V2
"{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{{ bos_token }}{% for message in messages %}{% if message['role'] == 'user' %}{{ 'User: ' + message['content'] + '\n\n' }}{% elif message['role'] == 'assistant' %}{{ 'Assistant: ' + message['content'] + eos_token }}{% elif message['role'] == 'system' %}{{ message['content'] + '\n\n' }}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ 'Assistant:' }}{% endif %}",
};
std::vector<std::string> expected_output = {
// teknium/OpenHermes-2.5-Mistral-7B
"<|im_start|>system\nYou are a helpful assistant<|im_end|>\n<|im_start|>user\nHello<|im_end|>\n<|im_start|>assistant\nHi there<|im_end|>\n<|im_start|>user\nWho are you<|im_end|>\n<|im_start|>assistant\n I am an assistant <|im_end|>\n<|im_start|>user\nAnother question<|im_end|>\n<|im_start|>assistant\n",
// mistralai/Mistral-7B-Instruct-v0.2
"[INST] You are a helpful assistant\nHello [/INST]Hi there</s>[INST] Who are you [/INST] I am an assistant </s>[INST] Another question [/INST]",
// TheBloke/FusionNet_34Bx2_MoE-AWQ
"[INST] <<SYS>>\nYou are a helpful assistant\n<</SYS>>\n\nHello [/INST] Hi there </s><s>[INST] Who are you [/INST] I am an assistant </s><s>[INST] Another question [/INST]",
// bofenghuang/vigogne-2-70b-chat
"[INST] <<SYS>>\nYou are a helpful assistant\n<</SYS>>\n\nHello [/INST] Hi there </s>[INST] Who are you [/INST] I am an assistant </s>[INST] Another question [/INST]",
// mlabonne/AlphaMonarch-7B
"system\nYou are a helpful assistant</s>\n<s>user\nHello</s>\n<s>assistant\nHi there</s>\n<s>user\nWho are you</s>\n<s>assistant\n I am an assistant </s>\n<s>user\nAnother question</s>\n<s>assistant\n",
// google/gemma-7b-it
"<start_of_turn>user\nYou are a helpful assistant\n\nHello<end_of_turn>\n<start_of_turn>model\nHi there<end_of_turn>\n<start_of_turn>user\nWho are you<end_of_turn>\n<start_of_turn>model\nI am an assistant<end_of_turn>\n<start_of_turn>user\nAnother question<end_of_turn>\n<start_of_turn>model\n",
// OrionStarAI/Orion-14B-Chat
"Human: You are a helpful assistant\n\nHello\n\nAssistant: </s>Hi there</s>Human: Who are you\n\nAssistant: </s> I am an assistant </s>Human: Another question\n\nAssistant: </s>",
// openchat/openchat-3.5-0106
"You are a helpful assistant<|end_of_turn|>GPT4 Correct User: Hello<|end_of_turn|>GPT4 Correct Assistant: Hi there<|end_of_turn|>GPT4 Correct User: Who are you<|end_of_turn|>GPT4 Correct Assistant: I am an assistant <|end_of_turn|>GPT4 Correct User: Another question<|end_of_turn|>GPT4 Correct Assistant:",
// deepseek-ai/deepseek-coder-33b-instruct
"You are a helpful assistant### Instruction:\nHello\n### Response:\nHi there\n<|EOT|>\n### Instruction:\nWho are you\n### Response:\n I am an assistant \n<|EOT|>\n### Instruction:\nAnother question\n### Response:\n",
// eachadea/vicuna-13b-1.1
"You are a helpful assistant\n\nUSER: Hello\nASSISTANT: Hi there</s>\nUSER: Who are you\nASSISTANT: I am an assistant </s>\nUSER: Another question\nASSISTANT:",
// Orca-Vicuna
"SYSTEM: You are a helpful assistant\nUSER: Hello\nASSISTANT: Hi there</s>\nUSER: Who are you\nASSISTANT: I am an assistant </s>\nUSER: Another question\nASSISTANT:",
// CohereForAI/c4ai-command-r-plus
"<|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|>You are a helpful assistant<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|USER_TOKEN|>Hello<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>Hi there<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|USER_TOKEN|>Who are you<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>I am an assistant<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|USER_TOKEN|>Another question<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>",
// Llama 3
"<|start_header_id|>system<|end_header_id|>\n\nYou are a helpful assistant<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nHello<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\nHi there<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nWho are you<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\nI am an assistant<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nAnother question<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n",
//Phi-3-mini
"<|system|>\nYou are a helpful assistant<|end|>\n<|user|>\nHello<|end|>\n<|assistant|>\nHi there<|end|>\n<|user|>\nWho are you<|end|>\n<|assistant|>\n I am an assistant <|end|>\n<|user|>\nAnother question<|end|>\n<|assistant|>\n",
//Phi-3-small
"<|system|>\nYou are a helpful assistant<|end|>\n<|user|>\nHello<|end|>\n<|assistant|>\nHi there<|end|>\n<|user|>\nWho are you<|end|>\n<|assistant|>\n I am an assistant <|end|>\n<|user|>\nAnother question<|end|>\n<|assistant|>\n",
//Phi-3-medium
"<|system|>\nYou are a helpful assistant<|end|>\n<|user|>\nHello<|end|>\n<|assistant|>\nHi there<|end|>\n<|user|>\nWho are you<|end|>\n<|assistant|>\n I am an assistant <|end|>\n<|user|>\nAnother question<|end|>\n<|assistant|>\n",
//Phi-3-vision
"<|system|>\nYou are a helpful assistant<|end|>\n<|user|>\nHello<|end|>\n<|assistant|>\nHi there<|end|>\n<|user|>\nWho are you<|end|>\n<|assistant|>\n I am an assistant <|end|>\n<|user|>\nAnother question<|end|>\n<|assistant|>\n",
// ChatGLM3
"[gMASK]sop<|system|>\n You are a helpful assistant<|user|>\n Hello<|assistant|>\n Hi there<|user|>\n Who are you<|assistant|>\n I am an assistant <|user|>\n Another question<|assistant|>",
// ChatGLM4
"[gMASK]<sop><|system|>\nYou are a helpful assistant<|user|>\nHello<|assistant|>\nHi there<|user|>\nWho are you<|assistant|>\n I am an assistant <|user|>\nAnother question<|assistant|>",
// MiniCPM-3B-OpenHermes-2.5-v2-GGUF
u8"You are a helpful assistant<用户>Hello<AI>Hi there<用户>Who are you<AI>I am an assistant<用户>Another question<AI>",
// DeepSeek-V2
u8"You are a helpful assistant\n\nUser: Hello\n\nAssistant: Hi there<end▁of▁sentence>User: Who are you\n\nAssistant: I am an assistant <end▁of▁sentence>User: Another question\n\nAssistant:",
};
// Scratch buffer that receives the formatted conversation.
std::vector<char> formatted_chat(1024);
int32_t res;

// test invalid chat template
res = llama_chat_apply_template(nullptr, "INVALID TEMPLATE", conversation, message_count, true, formatted_chat.data(), formatted_chat.size());
assert(res < 0);

// expected_output[i] is indexed in lockstep with templates[i] below, so the
// two tables must have the same number of entries.
assert(templates.size() == expected_output.size());

for (size_t i = 0; i < templates.size(); i++) {
    std::string custom_template = templates[i];
    std::string expected = expected_output[i];
    formatted_chat.resize(1024);
    res = llama_chat_apply_template(
        nullptr,
        custom_template.c_str(),
        conversation,
        message_count,
        true,
        formatted_chat.data(),
        formatted_chat.size()
    );
    // A negative result means the template was rejected (see the invalid
    // template check above). Stop here with a clear assertion rather than
    // passing a negative value to resize().
    assert(res >= 0);
    if ((size_t) res > formatted_chat.size()) {
        // Per the llama.h description of llama_chat_apply_template, the return
        // value is the full length of the formatted prompt and may exceed the
        // buffer that was passed; in that case grow the buffer and re-apply.
        formatted_chat.resize(res);
        res = llama_chat_apply_template(
            nullptr,
            custom_template.c_str(),
            conversation,
            message_count,
            true,
            formatted_chat.data(),
            formatted_chat.size()
        );
        assert(res >= 0 && (size_t) res <= formatted_chat.size());
    }
    // Trim the buffer to the bytes that were actually produced.
    formatted_chat.resize(res);
    std::string output(formatted_chat.data(), formatted_chat.size());
    printf("%s\n", output.c_str());
    printf("-------------------------\n");
    assert(output == expected);
}
// test llama_chat_format_single for system message
printf("\n\n=== llama_chat_format_single (system message) ===\n\n");
std::vector<llama_chat_msg> chat2;
llama_chat_msg sys_msg{"system", "You are a helpful assistant"};
auto fmt_sys = [&](std::string tmpl) {
auto output = llama_chat_format_single(nullptr, tmpl, chat2, sys_msg, false);
printf("fmt_sys(%s) : %s\n", tmpl.c_str(), output.c_str());
printf("-------------------------\n");
return output;
};
assert(fmt_sys("chatml") == "<|im_start|>system\nYou are a helpful assistant<|im_end|>\n");
assert(fmt_sys("llama2") == "[INST] You are a helpful assistant\n");
assert(fmt_sys("gemma") == ""); // for gemma, system message is merged with user message
assert(fmt_sys("llama3") == "<|start_header_id|>system<|end_header_id|>\n\nYou are a helpful assistant<|eot_id|>");
// test llama_chat_format_single for user message
printf("\n\n=== llama_chat_format_single (user message) ===\n\n");
chat2.push_back({"system", "You are a helpful assistant"});
chat2.push_back({"user", "Hello"});
chat2.push_back({"assistant", "I am assistant"});
llama_chat_msg new_msg{"user", "How are you"};
auto fmt_single = [&](std::string tmpl) {
auto output = llama_chat_format_single(nullptr, tmpl, chat2, new_msg, true);
printf("fmt_single(%s) : %s\n", tmpl.c_str(), output.c_str());
printf("-------------------------\n");
return output;
};
assert(fmt_single("chatml") == "\n<|im_start|>user\nHow are you<|im_end|>\n<|im_start|>assistant\n");
assert(fmt_single("llama2") == "[INST] How are you [/INST]");
assert(fmt_single("gemma") == "\n<start_of_turn>user\nHow are you<end_of_turn>\n<start_of_turn>model\n");
assert(fmt_single("llama3") == "<|start_header_id|>user<|end_header_id|>\n\nHow are you<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n");
return 0;
}

View file

@ -1,57 +0,0 @@
// These tests may take a long time!
// They are to prove that conversion from double to float of various functions in ggml.c doesn't affect the result.
// This is done by checking all finite (non-NaN, non-infinite) floats.
#undef NDEBUG
#include <cassert>
#if !defined(__riscv) && !defined(__s390__) && !defined(__ARM_NEON)
#include <immintrin.h>
#endif
#include <cmath>
#include <cstdint>
#include <cstring>
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wdouble-promotion"
// ggml.c::quantize_row_q4_0_ref, variant that calls round():
// round the input, narrow it to int8_t, then shift it up by 8.
inline static uint8_t round_orig(float v0) {
    const int8_t nearest = (int8_t) (round(v0));
    return nearest + 8;
}
// ggml.c::ggml_silu_f32, variant written with the double literal 1.0 and exp():
// computes x / (1 + e^-x) and narrows the quotient to float on return.
inline static float silu_orig(float x) {
    return x/(exp(-x) + 1.0);
}
#pragma GCC diagnostic pop
// ggml.c::quantize_row_q4_0_ref, float-only variant that calls roundf():
// round the input, narrow it to int8_t, then shift it up by 8.
inline static uint8_t round_float(float v0) {
    const int8_t nearest = (int8_t)roundf(v0);
    return nearest + 8;
}
// ggml.c::ggml_silu_f32, float-only variant written with 1.0f and expf():
// computes x / (1 + e^-x).
inline static float silu_float(float x) {
    return x/(expf(-x) + 1.0f);
}
int main(void) {
    // Exhaustive sweep over every 32-bit pattern, from UINT32_MAX down to 0.
    // Each pattern is reinterpreted as a float; for the finite ones the two
    // rounding helpers must agree.
    uint32_t bits = UINT32_MAX;
    for (;;) {
        float f;
        memcpy(&f, &bits, sizeof(bits));
        assert(!std::isfinite(f) || (round_orig(f) == round_float(f)));
        if (bits == 0) {
            break;
        }
        bits--;
    }

#ifdef __F16C__
    // GELU and SILU implementations are used with a FP16 lookup table.
    // The original and float-only results are not equal for all inputs after converting to FP16.
    // GELU is an approximation anyway (tanh), not tested here.
    // For SILU, verify that the results are at least the closest floating point numbers, if the FP16 values don't match.
    for (uint32_t half = 0; half <= UINT16_MAX; half++) {
        float f = _cvtsh_ss(half);
        const float so = silu_orig(f);
        const float sf = silu_float(f);
        assert(   (_cvtss_sh(so, 0) == _cvtss_sh(sf, 0))
               || (nextafterf(so, sf) == sf)
               || (nextafterf(sf, so) == so));
    }
#endif
}

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

View file

@ -1,515 +0,0 @@
#ifdef NDEBUG
#undef NDEBUG
#endif
#include "llama.h"
#include "grammar-parser.h"
#include <cassert>
// Returns the enumerator name of a grammar element type as a string literal,
// or "?" for any value that is not listed below.
static const char * type_str(llama_gretype type) {
    const char * name = "?";
    switch (type) {
        case LLAMA_GRETYPE_CHAR:           name = "LLAMA_GRETYPE_CHAR";           break;
        case LLAMA_GRETYPE_CHAR_NOT:       name = "LLAMA_GRETYPE_CHAR_NOT";       break;
        case LLAMA_GRETYPE_CHAR_ALT:       name = "LLAMA_GRETYPE_CHAR_ALT";       break;
        case LLAMA_GRETYPE_CHAR_RNG_UPPER: name = "LLAMA_GRETYPE_CHAR_RNG_UPPER"; break;
        case LLAMA_GRETYPE_RULE_REF:       name = "LLAMA_GRETYPE_RULE_REF";       break;
        case LLAMA_GRETYPE_ALT:            name = "LLAMA_GRETYPE_ALT";            break;
        case LLAMA_GRETYPE_END:            name = "LLAMA_GRETYPE_END";            break;
        default:                           break;
    }
    return name;
}
static void verify_parsing(const char *grammar_bytes, const std::vector<std::pair<std::string, uint32_t>> expected, const std::vector<llama_grammar_element> &expected_rules) {
uint32_t index = 0;
grammar_parser::parse_state parsed_grammar = grammar_parser::parse(grammar_bytes);
std::map<uint32_t, std::string> symbol_names;
for (auto it = parsed_grammar.symbol_ids.begin(); it != parsed_grammar.symbol_ids.end(); ++it) {
symbol_names[it->second] = it->first;
}
auto print_all = [&]() {
fprintf(stderr, " verify_parsing(R\"\"\"(%s)\"\"\", {\n", grammar_bytes);
for (auto it = parsed_grammar.symbol_ids.begin(); it != parsed_grammar.symbol_ids.end(); ++it) {
fprintf(stderr, " {\"%s\", %u},\n", it->first.c_str(), it->second);
}
fprintf(stderr, " }, {\n");
for (size_t i_rule = 0; i_rule < parsed_grammar.rules.size(); i_rule++) {
fprintf(stderr, " // %s (index %zu)\n", symbol_names[i_rule].c_str(), i_rule);
auto & rule = parsed_grammar.rules[i_rule];
for (uint32_t i = 0; i < rule.size(); i++) {
std::string rule_str;
fprintf(stderr, " {%s, ", type_str(rule[i].type));
if (rule[i].type == LLAMA_GRETYPE_CHAR || rule[i].type == LLAMA_GRETYPE_CHAR_ALT ||
rule[i].type == LLAMA_GRETYPE_CHAR_NOT || rule[i].type == LLAMA_GRETYPE_CHAR_RNG_UPPER) {
char c = rule[i].value;
if (c == '\n') {
fprintf(stderr, "'\\n'");
} else if (c == '\t') {
fprintf(stderr, "'\\t'");
} else if (c == '\r') {
fprintf(stderr, "'\\r'");
} else if (c == '\0') {
fprintf(stderr, "'\\0'");
} else {
fprintf(stderr, "'%c'", c);
}
} else if (rule[i].type == LLAMA_GRETYPE_RULE_REF) {
fprintf(stderr, "/* %s */ %u", symbol_names[rule[i].value].c_str(), rule[i].value);
} else {
fprintf(stderr, "%u", rule[i].value);
}
fprintf(stderr, "},\n");
}
}
fprintf(stderr, " });\n");
};
if (getenv("TEST_GRAMMAR_PARSER_PRINT_ALL")) {
print_all();
fprintf(stderr, "\n");
return;
}
fprintf(stderr, "Testing grammar:%s\n", grammar_bytes);
if (parsed_grammar.symbol_ids.size() != expected.size()) {
fprintf(stderr, "Code to update expectation (set TEST_GRAMMAR_PARSER_PRINT_ALL=1 to print all):\n");
print_all();
assert(parsed_grammar.symbol_ids.size() == expected.size());
}
for (auto it = parsed_grammar.symbol_ids.begin(); it != parsed_grammar.symbol_ids.end(); ++it)
{
std::string key = it->first;
uint32_t value = it->second;
std::pair<std::string, uint32_t> expected_pair = expected[index];
// pretty print error message before asserting
if (expected_pair.first != key || expected_pair.second != value)
{
fprintf(stderr, "index: %u\n", index);
fprintf(stderr, "expected_pair: %s, %u\n", expected_pair.first.c_str(), expected_pair.second);
fprintf(stderr, "actual_pair: %s, %u\n", key.c_str(), value);
fprintf(stderr, "expected_pair != actual_pair\n");
fprintf(stderr, "Code to update expectation (set TEST_GRAMMAR_PARSER_PRINT_ALL=1 to print all):\n");
print_all();
}
assert(expected_pair.first == key && expected_pair.second == value);
index++;
}
index = 0;
for (auto rule : parsed_grammar.rules)
{
// compare rule to expected rule
for (uint32_t i = 0; i < rule.size(); i++)
{
llama_grammar_element element = rule[i];
llama_grammar_element expected_element = expected_rules[index];
// pretty print error message before asserting
if (expected_element.type != element.type || expected_element.value != element.value)
{
fprintf(stderr, "index: %u\n", index);
fprintf(stderr, "expected_element: %s, %u\n", type_str(expected_element.type), expected_element.value);
fprintf(stderr, "actual_element: %s, %u\n", type_str(element.type), element.value);
fprintf(stderr, "expected_element != actual_element\n");
fprintf(stderr, "all elements:\n");
fprintf(stderr, "Code to update expectation (set TEST_GRAMMAR_PARSER_PRINT_ALL=1 to print all):\n");
print_all();
}
assert(expected_element.type == element.type && expected_element.value == element.value);
index++;
}
}
}
static void verify_failure(const char *grammar_bytes) {
fprintf(stderr, "Testing expected failure:%s\n", grammar_bytes);
auto result = grammar_parser::parse(grammar_bytes);
assert(result.rules.empty() && "should have failed");
}
// Test driver for the grammar parser: feeds a series of grammars to
// verify_failure / verify_parsing. Each verify_parsing call lists the expected
// symbol table followed by the expected elements of every rule.
int main()
{
// Inputs the parser is expected to reject.
verify_failure(R"""(
root ::= "a"{,}"
)""");
verify_failure(R"""(
root ::= "a"{,10}"
)""");
// A single literal character.
verify_parsing(R"""(
root ::= "a"
)""", {
{"root", 0},
}, {
// root (index 0)
{LLAMA_GRETYPE_CHAR, 'a'},
{LLAMA_GRETYPE_END, 0},
});
// Alternation of a literal, a character class with a range, and a negated class.
verify_parsing(R"""(
root ::= "a" | [bdx-z] | [^1-3]
)""", {
{"root", 0},
}, {
// root (index 0)
{LLAMA_GRETYPE_CHAR, 'a'},
{LLAMA_GRETYPE_ALT, 0},
{LLAMA_GRETYPE_CHAR, 'b'},
{LLAMA_GRETYPE_CHAR_ALT, 'd'},
{LLAMA_GRETYPE_CHAR_ALT, 'x'},
{LLAMA_GRETYPE_CHAR_RNG_UPPER, 'z'},
{LLAMA_GRETYPE_ALT, 0},
{LLAMA_GRETYPE_CHAR_NOT, '1'},
{LLAMA_GRETYPE_CHAR_RNG_UPPER, '3'},
{LLAMA_GRETYPE_END, 0},
});
// `+` applied to a rule reference.
verify_parsing(R"""(
root ::= a+
a ::= "a"
)""", {
{"a", 1},
{"root", 0},
{"root_2", 2},
}, {
// root (index 0)
{LLAMA_GRETYPE_RULE_REF, /* a */ 1},
{LLAMA_GRETYPE_RULE_REF, /* root_2 */ 2},
{LLAMA_GRETYPE_END, 0},
// a (index 1)
{LLAMA_GRETYPE_CHAR, 'a'},
{LLAMA_GRETYPE_END, 0},
// root_2 (index 2)
{LLAMA_GRETYPE_RULE_REF, /* a */ 1},
{LLAMA_GRETYPE_RULE_REF, /* root_2 */ 2},
{LLAMA_GRETYPE_ALT, 0},
{LLAMA_GRETYPE_END, 0},
});
// `+` applied to a literal.
verify_parsing(R"""(
root ::= "a"+
)""", {
{"root", 0},
{"root_1", 1},
}, {
// root (index 0)
{LLAMA_GRETYPE_CHAR, 'a'},
{LLAMA_GRETYPE_RULE_REF, /* root_1 */ 1},
{LLAMA_GRETYPE_END, 0},
// root_1 (index 1)
{LLAMA_GRETYPE_CHAR, 'a'},
{LLAMA_GRETYPE_RULE_REF, /* root_1 */ 1},
{LLAMA_GRETYPE_ALT, 0},
{LLAMA_GRETYPE_END, 0},
});
// `?` applied to a rule reference.
verify_parsing(R"""(
root ::= a?
a ::= "a"
)""", {
{"a", 1},
{"root", 0},
{"root_2", 2},
}, {
// root (index 0)
{LLAMA_GRETYPE_RULE_REF, /* root_2 */ 2},
{LLAMA_GRETYPE_END, 0},
// a (index 1)
{LLAMA_GRETYPE_CHAR, 'a'},
{LLAMA_GRETYPE_END, 0},
// root_2 (index 2)
{LLAMA_GRETYPE_RULE_REF, /* a */ 1},
{LLAMA_GRETYPE_ALT, 0},
{LLAMA_GRETYPE_END, 0},
});
// `?` applied to a literal.
verify_parsing(R"""(
root ::= "a"?
)""", {
{"root", 0},
{"root_1", 1},
}, {
// root (index 0)
{LLAMA_GRETYPE_RULE_REF, /* root_1 */ 1},
{LLAMA_GRETYPE_END, 0},
// root_1 (index 1)
{LLAMA_GRETYPE_CHAR, 'a'},
{LLAMA_GRETYPE_ALT, 0},
{LLAMA_GRETYPE_END, 0},
});
// `*` applied to a rule reference.
verify_parsing(R"""(
root ::= a*
a ::= "a"
)""", {
{"a", 1},
{"root", 0},
{"root_2", 2},
}, {
// root (index 0)
{LLAMA_GRETYPE_RULE_REF, /* root_2 */ 2},
{LLAMA_GRETYPE_END, 0},
// a (index 1)
{LLAMA_GRETYPE_CHAR, 'a'},
{LLAMA_GRETYPE_END, 0},
// root_2 (index 2)
{LLAMA_GRETYPE_RULE_REF, /* a */ 1},
{LLAMA_GRETYPE_RULE_REF, /* root_2 */ 2},
{LLAMA_GRETYPE_ALT, 0},
{LLAMA_GRETYPE_END, 0},
});
// `*` applied to a literal.
verify_parsing(R"""(
root ::= "a"*
)""", {
{"root", 0},
{"root_1", 1},
}, {
// root (index 0)
{LLAMA_GRETYPE_RULE_REF, /* root_1 */ 1},
{LLAMA_GRETYPE_END, 0},
// root_1 (index 1)
{LLAMA_GRETYPE_CHAR, 'a'},
{LLAMA_GRETYPE_RULE_REF, /* root_1 */ 1},
{LLAMA_GRETYPE_ALT, 0},
{LLAMA_GRETYPE_END, 0},
});
// `{2}`: the literal is expected twice in the root rule.
verify_parsing(R"""(
root ::= "a"{2}
)""", {
{"root", 0},
}, {
// root (index 0)
{LLAMA_GRETYPE_CHAR, 'a'},
{LLAMA_GRETYPE_CHAR, 'a'},
{LLAMA_GRETYPE_END, 0},
});
// `{2,}`: two literals followed by a reference to a self-referencing helper rule.
verify_parsing(R"""(
root ::= "a"{2,}
)""", {
{"root", 0},
{"root_1", 1},
}, {
// root (index 0)
{LLAMA_GRETYPE_CHAR, 'a'},
{LLAMA_GRETYPE_CHAR, 'a'},
{LLAMA_GRETYPE_RULE_REF, /* root_1 */ 1},
{LLAMA_GRETYPE_END, 0},
// root_1 (index 1)
{LLAMA_GRETYPE_CHAR, 'a'},
{LLAMA_GRETYPE_RULE_REF, /* root_1 */ 1},
{LLAMA_GRETYPE_ALT, 0},
{LLAMA_GRETYPE_END, 0},
});
// `{ 4}`: a count with whitespace inside the braces; four literals expected.
verify_parsing(R"""(
root ::= "a"{ 4}
)""", {
{"root", 0},
}, {
// root (index 0)
{LLAMA_GRETYPE_CHAR, 'a'},
{LLAMA_GRETYPE_CHAR, 'a'},
{LLAMA_GRETYPE_CHAR, 'a'},
{LLAMA_GRETYPE_CHAR, 'a'},
{LLAMA_GRETYPE_END, 0},
});
// `{2,4}`: two literals followed by nested helper rules.
verify_parsing(R"""(
root ::= "a"{2,4}
)""", {
{"root", 0},
{"root_1", 1},
{"root_2", 2},
}, {
// root (index 0)
{LLAMA_GRETYPE_CHAR, 'a'},
{LLAMA_GRETYPE_CHAR, 'a'},
{LLAMA_GRETYPE_RULE_REF, /* root_2 */ 2},
{LLAMA_GRETYPE_END, 0},
// root_1 (index 1)
{LLAMA_GRETYPE_CHAR, 'a'},
{LLAMA_GRETYPE_ALT, 0},
{LLAMA_GRETYPE_END, 0},
// root_2 (index 2)
{LLAMA_GRETYPE_CHAR, 'a'},
{LLAMA_GRETYPE_RULE_REF, /* root_1 */ 1},
{LLAMA_GRETYPE_ALT, 0},
{LLAMA_GRETYPE_END, 0},
});
// Multi-rule arithmetic grammar with groups, character classes and repetition.
verify_parsing(R"""(
root ::= (expr "=" term "\n")+
expr ::= term ([-+*/] term)*
term ::= [0-9]+
)""", {
{"expr", 2},
{"expr_5", 5},
{"expr_6", 6},
{"root", 0},
{"root_1", 1},
{"root_4", 4},
{"term", 3},
{"term_7", 7},
}, {
// root (index 0)
{LLAMA_GRETYPE_RULE_REF, /* root_1 */ 1},
{LLAMA_GRETYPE_RULE_REF, /* root_4 */ 4},
{LLAMA_GRETYPE_END, 0},
// root_1 (index 1)
{LLAMA_GRETYPE_RULE_REF, /* expr */ 2},
{LLAMA_GRETYPE_CHAR, '='},
{LLAMA_GRETYPE_RULE_REF, /* term */ 3},
{LLAMA_GRETYPE_CHAR, '\n'},
{LLAMA_GRETYPE_END, 0},
// expr (index 2)
{LLAMA_GRETYPE_RULE_REF, /* term */ 3},
{LLAMA_GRETYPE_RULE_REF, /* expr_6 */ 6},
{LLAMA_GRETYPE_END, 0},
// term (index 3)
{LLAMA_GRETYPE_CHAR, '0'},
{LLAMA_GRETYPE_CHAR_RNG_UPPER, '9'},
{LLAMA_GRETYPE_RULE_REF, /* term_7 */ 7},
{LLAMA_GRETYPE_END, 0},
// root_4 (index 4)
{LLAMA_GRETYPE_RULE_REF, /* root_1 */ 1},
{LLAMA_GRETYPE_RULE_REF, /* root_4 */ 4},
{LLAMA_GRETYPE_ALT, 0},
{LLAMA_GRETYPE_END, 0},
// expr_5 (index 5)
{LLAMA_GRETYPE_CHAR, '-'},
{LLAMA_GRETYPE_CHAR_ALT, '+'},
{LLAMA_GRETYPE_CHAR_ALT, '*'},
{LLAMA_GRETYPE_CHAR_ALT, '/'},
{LLAMA_GRETYPE_RULE_REF, /* term */ 3},
{LLAMA_GRETYPE_END, 0},
// expr_6 (index 6)
{LLAMA_GRETYPE_RULE_REF, /* expr_5 */ 5},
{LLAMA_GRETYPE_RULE_REF, /* expr_6 */ 6},
{LLAMA_GRETYPE_ALT, 0},
{LLAMA_GRETYPE_END, 0},
// term_7 (index 7)
{LLAMA_GRETYPE_CHAR, '0'},
{LLAMA_GRETYPE_CHAR_RNG_UPPER, '9'},
{LLAMA_GRETYPE_RULE_REF, /* term_7 */ 7},
{LLAMA_GRETYPE_ALT, 0},
{LLAMA_GRETYPE_END, 0},
});
// Larger arithmetic grammar that adds identifiers, numbers, parentheses and whitespace.
verify_parsing(R"""(
root ::= (expr "=" ws term "\n")+
expr ::= term ([-+*/] term)*
term ::= ident | num | "(" ws expr ")" ws
ident ::= [a-z] [a-z0-9_]* ws
num ::= [0-9]+ ws
ws ::= [ \t\n]*
)""", {
{"expr", 2},
{"expr_6", 6},
{"expr_7", 7},
{"ident", 8},
{"ident_10", 10},
{"num", 9},
{"num_11", 11},
{"root", 0},
{"root_1", 1},
{"root_5", 5},
{"term", 4},
{"ws", 3},
{"ws_12", 12},
}, {
// root (index 0)
{LLAMA_GRETYPE_RULE_REF, /* root_1 */ 1},
{LLAMA_GRETYPE_RULE_REF, /* root_5 */ 5},
{LLAMA_GRETYPE_END, 0},
// root_1 (index 1)
{LLAMA_GRETYPE_RULE_REF, /* expr */ 2},
{LLAMA_GRETYPE_CHAR, '='},
{LLAMA_GRETYPE_RULE_REF, /* ws */ 3},
{LLAMA_GRETYPE_RULE_REF, /* term */ 4},
{LLAMA_GRETYPE_CHAR, '\n'},
{LLAMA_GRETYPE_END, 0},
// expr (index 2)
{LLAMA_GRETYPE_RULE_REF, /* term */ 4},
{LLAMA_GRETYPE_RULE_REF, /* expr_7 */ 7},
{LLAMA_GRETYPE_END, 0},
// ws (index 3)
{LLAMA_GRETYPE_RULE_REF, /* ws_12 */ 12},
{LLAMA_GRETYPE_END, 0},
// term (index 4)
{LLAMA_GRETYPE_RULE_REF, /* ident */ 8},
{LLAMA_GRETYPE_ALT, 0},
{LLAMA_GRETYPE_RULE_REF, /* num */ 9},
{LLAMA_GRETYPE_ALT, 0},
{LLAMA_GRETYPE_CHAR, '('},
{LLAMA_GRETYPE_RULE_REF, /* ws */ 3},
{LLAMA_GRETYPE_RULE_REF, /* expr */ 2},
{LLAMA_GRETYPE_CHAR, ')'},
{LLAMA_GRETYPE_RULE_REF, /* ws */ 3},
{LLAMA_GRETYPE_END, 0},
// root_5 (index 5)
{LLAMA_GRETYPE_RULE_REF, /* root_1 */ 1},
{LLAMA_GRETYPE_RULE_REF, /* root_5 */ 5},
{LLAMA_GRETYPE_ALT, 0},
{LLAMA_GRETYPE_END, 0},
// expr_6 (index 6)
{LLAMA_GRETYPE_CHAR, '-'},
{LLAMA_GRETYPE_CHAR_ALT, '+'},
{LLAMA_GRETYPE_CHAR_ALT, '*'},
{LLAMA_GRETYPE_CHAR_ALT, '/'},
{LLAMA_GRETYPE_RULE_REF, /* term */ 4},
{LLAMA_GRETYPE_END, 0},
// expr_7 (index 7)
{LLAMA_GRETYPE_RULE_REF, /* expr_6 */ 6},
{LLAMA_GRETYPE_RULE_REF, /* expr_7 */ 7},
{LLAMA_GRETYPE_ALT, 0},
{LLAMA_GRETYPE_END, 0},
// ident (index 8)
{LLAMA_GRETYPE_CHAR, 'a'},
{LLAMA_GRETYPE_CHAR_RNG_UPPER, 'z'},
{LLAMA_GRETYPE_RULE_REF, /* ident_10 */ 10},
{LLAMA_GRETYPE_RULE_REF, /* ws */ 3},
{LLAMA_GRETYPE_END, 0},
// num (index 9)
{LLAMA_GRETYPE_CHAR, '0'},
{LLAMA_GRETYPE_CHAR_RNG_UPPER, '9'},
{LLAMA_GRETYPE_RULE_REF, /* num_11 */ 11},
{LLAMA_GRETYPE_RULE_REF, /* ws */ 3},
{LLAMA_GRETYPE_END, 0},
// ident_10 (index 10)
{LLAMA_GRETYPE_CHAR, 'a'},
{LLAMA_GRETYPE_CHAR_RNG_UPPER, 'z'},
{LLAMA_GRETYPE_CHAR_ALT, '0'},
{LLAMA_GRETYPE_CHAR_RNG_UPPER, '9'},
{LLAMA_GRETYPE_CHAR_ALT, '_'},
{LLAMA_GRETYPE_RULE_REF, /* ident_10 */ 10},
{LLAMA_GRETYPE_ALT, 0},
{LLAMA_GRETYPE_END, 0},
// num_11 (index 11)
{LLAMA_GRETYPE_CHAR, '0'},
{LLAMA_GRETYPE_CHAR_RNG_UPPER, '9'},
{LLAMA_GRETYPE_RULE_REF, /* num_11 */ 11},
{LLAMA_GRETYPE_ALT, 0},
{LLAMA_GRETYPE_END, 0},
// ws_12 (index 12)
{LLAMA_GRETYPE_CHAR, ' '},
{LLAMA_GRETYPE_CHAR_ALT, '\t'},
{LLAMA_GRETYPE_CHAR_ALT, '\n'},
{LLAMA_GRETYPE_RULE_REF, /* ws_12 */ 12},
{LLAMA_GRETYPE_ALT, 0},
{LLAMA_GRETYPE_END, 0},
});
return 0;
}

File diff suppressed because it is too large Load diff

View file

@ -1,408 +0,0 @@
#ifdef NDEBUG
#undef NDEBUG
#endif
#define LLAMA_API_INTERNAL
#include "llama.h"
#include "grammar-parser.h"
#include <cassert>
#include <stdexcept>
// Test driver for llama_grammar: builds a parse state by hand, initializes a
// grammar from it, then checks (1) the stacks reported by
// llama_grammar_get_stacks and (2) the candidates rejected for each stack by
// llama_grammar_reject_candidates_for_stack. Returns 0 when every assert holds.
int main()
{
grammar_parser::parse_state parsed_grammar;
// Symbol table copied into parsed_grammar.symbol_ids: (symbol name, rule id).
std::vector<std::pair<std::string, uint32_t>> expected = {
{"expr", 2},
{"expr_6", 6},
{"expr_7", 7},
{"ident", 8},
{"ident_10", 10},
{"num", 9},
{"num_11", 11},
{"root", 0},
{"root_1", 1},
{"root_5", 5},
{"term", 4},
{"ws", 3},
{"ws_12", 12},
};
// Rule bodies copied into parsed_grammar.rules in this order, so entry i
// becomes rule i. CHAR-like values are numeric code points (e.g. 61 is '=').
std::vector<std::vector<llama_grammar_element>> expected_rules = {
{{LLAMA_GRETYPE_RULE_REF, 5}, {LLAMA_GRETYPE_END, 0}},
{
{LLAMA_GRETYPE_RULE_REF, 2},
{LLAMA_GRETYPE_CHAR, 61},
{LLAMA_GRETYPE_RULE_REF, 3},
{LLAMA_GRETYPE_RULE_REF, 4},
{LLAMA_GRETYPE_CHAR, 10},
{LLAMA_GRETYPE_END, 0},
},
{{LLAMA_GRETYPE_RULE_REF, 4}, {LLAMA_GRETYPE_RULE_REF, 7}, {LLAMA_GRETYPE_END, 0}},
{{LLAMA_GRETYPE_RULE_REF, 12}, {LLAMA_GRETYPE_END, 0}},
{
{LLAMA_GRETYPE_RULE_REF, 8},
{LLAMA_GRETYPE_ALT, 0},
{LLAMA_GRETYPE_RULE_REF, 9},
{LLAMA_GRETYPE_ALT, 0},
{LLAMA_GRETYPE_CHAR, 40},
{LLAMA_GRETYPE_RULE_REF, 3},
{LLAMA_GRETYPE_RULE_REF, 2},
{LLAMA_GRETYPE_CHAR, 41},
{LLAMA_GRETYPE_RULE_REF, 3},
{LLAMA_GRETYPE_END, 0},
},
{{LLAMA_GRETYPE_RULE_REF, 1}, {LLAMA_GRETYPE_RULE_REF, 5}, {LLAMA_GRETYPE_ALT, 0}, {LLAMA_GRETYPE_RULE_REF, 1}, {LLAMA_GRETYPE_END, 0}},
{
{LLAMA_GRETYPE_CHAR, 45},
{LLAMA_GRETYPE_CHAR_ALT, 43},
{LLAMA_GRETYPE_CHAR_ALT, 42},
{LLAMA_GRETYPE_CHAR_ALT, 47},
{LLAMA_GRETYPE_RULE_REF, 4},
{LLAMA_GRETYPE_END, 0},
},
{{LLAMA_GRETYPE_RULE_REF, 6}, {LLAMA_GRETYPE_RULE_REF, 7}, {LLAMA_GRETYPE_ALT, 0}, {LLAMA_GRETYPE_END, 0}},
{
{LLAMA_GRETYPE_CHAR, 97},
{LLAMA_GRETYPE_CHAR_RNG_UPPER, 122},
{LLAMA_GRETYPE_RULE_REF, 10},
{LLAMA_GRETYPE_RULE_REF, 3},
{LLAMA_GRETYPE_END, 0},
},
{{LLAMA_GRETYPE_RULE_REF, 11}, {LLAMA_GRETYPE_RULE_REF, 3}, {LLAMA_GRETYPE_END, 0}},
{
{LLAMA_GRETYPE_CHAR, 97},
{LLAMA_GRETYPE_CHAR_RNG_UPPER, 122},
{LLAMA_GRETYPE_CHAR_ALT, 48},
{LLAMA_GRETYPE_CHAR_RNG_UPPER, 57},
{LLAMA_GRETYPE_CHAR_ALT, 95},
{LLAMA_GRETYPE_RULE_REF, 10},
{LLAMA_GRETYPE_ALT, 0},
{LLAMA_GRETYPE_END, 0},
},
{
{LLAMA_GRETYPE_CHAR, 48},
{LLAMA_GRETYPE_CHAR_RNG_UPPER, 57},
{LLAMA_GRETYPE_RULE_REF, 11},
{LLAMA_GRETYPE_ALT, 0},
{LLAMA_GRETYPE_CHAR, 48},
{LLAMA_GRETYPE_CHAR_RNG_UPPER, 57},
{LLAMA_GRETYPE_END, 0},
},
{
{LLAMA_GRETYPE_CHAR, 32},
{LLAMA_GRETYPE_CHAR_ALT, 9},
{LLAMA_GRETYPE_CHAR_ALT, 10},
{LLAMA_GRETYPE_RULE_REF, 12},
{LLAMA_GRETYPE_ALT, 0},
{LLAMA_GRETYPE_END, 0},
},
};
// Populate the parse state by hand instead of running the parser.
for (auto pair : expected)
{
parsed_grammar.symbol_ids[pair.first] = pair.second;
}
for (auto rule : expected_rules)
{
parsed_grammar.rules.emplace_back();
for (auto element : rule)
{
parsed_grammar.rules.back().push_back(element);
}
}
// Build the grammar, starting from the rule id registered for "root".
llama_grammar * grammar = NULL;
std::vector<const llama_grammar_element *> grammar_rules(parsed_grammar.c_rules());
grammar = llama_grammar_init(grammar_rules.data(), grammar_rules.size(), parsed_grammar.symbol_ids.at("root"));
if (grammar == nullptr)
{
throw std::runtime_error("Failed to initialize llama_grammar");
}
// Expected contents of each stack right after initialization; entry k is
// compared element by element with stack k from llama_grammar_get_stacks.
std::vector<std::vector<llama_grammar_element>> expected_stacks = {
{
{LLAMA_GRETYPE_RULE_REF, 5},
{LLAMA_GRETYPE_CHAR, 61},
{LLAMA_GRETYPE_RULE_REF, 7},
{LLAMA_GRETYPE_CHAR, 97},
},
{
{LLAMA_GRETYPE_RULE_REF, 5},
{LLAMA_GRETYPE_CHAR, 61},
{LLAMA_GRETYPE_RULE_REF, 7},
{LLAMA_GRETYPE_RULE_REF, 3},
{LLAMA_GRETYPE_CHAR, 48},
},
{
{LLAMA_GRETYPE_RULE_REF, 5},
{LLAMA_GRETYPE_CHAR, 61},
{LLAMA_GRETYPE_RULE_REF, 7},
{LLAMA_GRETYPE_RULE_REF, 3},
{LLAMA_GRETYPE_CHAR, 48},
},
{
{LLAMA_GRETYPE_RULE_REF, 5},
{LLAMA_GRETYPE_CHAR, 61},
{LLAMA_GRETYPE_RULE_REF, 7},
{LLAMA_GRETYPE_CHAR, 40},
},
{
{LLAMA_GRETYPE_CHAR, 61},
{LLAMA_GRETYPE_RULE_REF, 7},
{LLAMA_GRETYPE_CHAR, 97},
},
{
{LLAMA_GRETYPE_CHAR, 61},
{LLAMA_GRETYPE_RULE_REF, 7},
{LLAMA_GRETYPE_RULE_REF, 3},
{LLAMA_GRETYPE_CHAR, 48},
},
{
{LLAMA_GRETYPE_CHAR, 61},
{LLAMA_GRETYPE_RULE_REF, 7},
{LLAMA_GRETYPE_RULE_REF, 3},
{LLAMA_GRETYPE_CHAR, 48},
},
{
{LLAMA_GRETYPE_CHAR, 61},
{LLAMA_GRETYPE_RULE_REF, 7},
{LLAMA_GRETYPE_CHAR, 40},
}};
// NOTE(review): expected_stacks[index][i] is indexed without a bounds check,
// so this relies on the grammar never reporting more stacks or longer stacks
// than the table above lists.
auto index = 0;
for (auto stack : llama_grammar_get_stacks(grammar))
{
// compare stack to expected_stack
for (uint32_t i = 0; i < stack.size(); i++)
{
auto element = stack[i];
auto expected_element = expected_stacks[index][i];
// pretty print error message before asserting
if (expected_element.type != element->type || expected_element.value != element->value)
{
fprintf(stderr, "index: %d\n", index);
fprintf(stderr, "expected_element: %d, %u\n", expected_element.type, expected_element.value);
fprintf(stderr, "actual_element: %d, %u\n", element->type, element->value);
fprintf(stderr, "expected_element != actual_element\n");
}
assert(expected_element.type == element->type && expected_element.value == element->value);
}
index++;
}
// Build 24 candidates; candidate i carries the single code point 37 + i
// followed by a 0 terminator. The buffers are released at the end of main.
std::vector<llama_grammar_candidate> next_candidates;
next_candidates.resize(24);
for (size_t i = 0; i < 24; ++i)
{
uint32_t *cp = new uint32_t[2]; // dynamically allocate memory for code_point
cp[0] = 37 + i;
cp[1] = 0;
next_candidates[i] = {i, cp, {}};
}
// expected_reject[k] lists, for stack k, the (candidate index, first code
// point) pairs that are expected to be rejected, in order.
std::vector<std::vector<std::pair<uint32_t, uint16_t>>> expected_reject = {
{
{0, 37},
{1, 38},
{2, 39},
{3, 40},
{4, 41},
{5, 42},
{6, 43},
{7, 44},
{8, 45},
{9, 46},
{10, 47},
{11, 48},
{12, 49},
{13, 50},
{14, 51},
{15, 52},
{16, 53},
{17, 54},
{18, 55},
{19, 56},
{20, 57},
{21, 58},
{22, 59},
{23, 60},
},
{
{0, 37},
{1, 38},
{2, 39},
{3, 40},
{4, 41},
{5, 42},
{6, 43},
{7, 44},
{8, 45},
{9, 46},
{10, 47},
{21, 58},
{22, 59},
{23, 60},
},
{
{0, 37},
{1, 38},
{2, 39},
{3, 40},
{4, 41},
{5, 42},
{6, 43},
{7, 44},
{8, 45},
{9, 46},
{10, 47},
{21, 58},
{22, 59},
{23, 60},
},
{
{0, 37},
{1, 38},
{2, 39},
{4, 41},
{5, 42},
{6, 43},
{7, 44},
{8, 45},
{9, 46},
{10, 47},
{11, 48},
{12, 49},
{13, 50},
{14, 51},
{15, 52},
{16, 53},
{17, 54},
{18, 55},
{19, 56},
{20, 57},
{21, 58},
{22, 59},
{23, 60},
},
{
{0, 37},
{1, 38},
{2, 39},
{3, 40},
{4, 41},
{5, 42},
{6, 43},
{7, 44},
{8, 45},
{9, 46},
{10, 47},
{11, 48},
{12, 49},
{13, 50},
{14, 51},
{15, 52},
{16, 53},
{17, 54},
{18, 55},
{19, 56},
{20, 57},
{21, 58},
{22, 59},
{23, 60},
},
{
{0, 37},
{1, 38},
{2, 39},
{3, 40},
{4, 41},
{5, 42},
{6, 43},
{7, 44},
{8, 45},
{9, 46},
{10, 47},
{21, 58},
{22, 59},
{23, 60},
},
{
{0, 37},
{1, 38},
{2, 39},
{3, 40},
{4, 41},
{5, 42},
{6, 43},
{7, 44},
{8, 45},
{9, 46},
{10, 47},
{21, 58},
{22, 59},
{23, 60},
},
{
{0, 37},
{1, 38},
{2, 39},
{4, 41},
{5, 42},
{6, 43},
{7, 44},
{8, 45},
{9, 46},
{10, 47},
{11, 48},
{12, 49},
{13, 50},
{14, 51},
{15, 52},
{16, 53},
{17, 54},
{18, 55},
{19, 56},
{20, 57},
{21, 58},
{22, 59},
{23, 60},
},
};
// The result for stack 0 computed here is overwritten by the first loop
// iteration below, which evaluates stack 0 again.
std::vector<llama_grammar_candidate> rejects = llama_grammar_reject_candidates_for_stack(llama_grammar_get_rules(grammar), llama_grammar_get_stacks(grammar)[0], next_candidates);
// Collect the rejected candidates for every stack, in stack order.
std::vector<std::vector<llama_grammar_candidate>> all_rejects;
for (std::size_t count = 0; count < llama_grammar_get_stacks(grammar).size(); ++count)
{
rejects = llama_grammar_reject_candidates_for_stack(llama_grammar_get_rules(grammar), llama_grammar_get_stacks(grammar)[count], next_candidates);
all_rejects.push_back(rejects);
}
// Compare each rejected candidate (index and first code point) with the table.
index = 0;
for (auto rej : all_rejects)
{
for (uint32_t i = 0; i < rej.size(); i++)
{
auto element = rej[i];
auto expected_element = expected_reject[index][i];
assert(element.index == expected_element.first && *element.code_points == expected_element.second);
}
index++;
}
// Release the code point buffers allocated with new[] above.
for (auto &candidate : next_candidates)
{
delete[] candidate.code_points;
candidate.code_points = nullptr;
}
llama_grammar_free(grammar);
return 0;
}

View file

@ -1,27 +0,0 @@
#include "llama.h"
#include "get-model.h"
#include <cstdlib>
// Loads the model named on the command line with a progress callback that
// returns true only once the reported progress exceeds 0.50, and exits with
// EXIT_SUCCESS only if llama_load_model_from_file returns nullptr.
// NOTE(review): presumably a false return from the callback aborts loading -
// confirm against the progress_callback contract in llama.h.
int main(int argc, char *argv[] ) {
    auto * model_path = get_model_or_exit(argc, argv);

    // Probe the path first so a missing file is reported as a failure.
    auto * probe = fopen(model_path, "r");
    if (!probe) {
        fprintf(stderr, "no model at '%s' found\n", model_path);
        return EXIT_FAILURE;
    }
    fprintf(stderr, "using '%s'\n", model_path);
    fclose(probe);

    llama_backend_init();

    auto load_params = llama_model_params{};
    load_params.use_mmap = false;
    load_params.progress_callback = [](float progress, void * /*ctx*/) {
        return progress > 0.50;
    };

    auto * loaded = llama_load_model_from_file(model_path, load_params);
    llama_backend_free();

    if (loaded == nullptr) {
        return EXIT_SUCCESS;
    }
    return EXIT_FAILURE;
}

View file

@ -1,181 +0,0 @@
#include "ggml.h"
#include <cmath>
#include <cstdio>
#include <cstdlib>
#include <cassert>
// NOTE(review): MAX_NARGS is not referenced in the visible part of this file.
#define MAX_NARGS 2
// Silence GCC's -Wdouble-promotion diagnostics for this file.
#if defined(__GNUC__)
#pragma GCC diagnostic ignored "-Wdouble-promotion"
#endif
//
// logging
//
// GGML_DEBUG selects the verbosity: each GGML_PRINT_DEBUG* macro forwards to
// printf when GGML_DEBUG reaches its threshold (1, 5, 10) and expands to
// nothing otherwise. With the value 0 set here all three are disabled.
#define GGML_DEBUG 0
#if (GGML_DEBUG >= 1)
#define GGML_PRINT_DEBUG(...) printf(__VA_ARGS__)
#else
#define GGML_PRINT_DEBUG(...)
#endif
#if (GGML_DEBUG >= 5)
#define GGML_PRINT_DEBUG_5(...) printf(__VA_ARGS__)
#else
#define GGML_PRINT_DEBUG_5(...)
#endif
#if (GGML_DEBUG >= 10)
#define GGML_PRINT_DEBUG_10(...) printf(__VA_ARGS__)
#else
#define GGML_PRINT_DEBUG_10(...)
#endif
// Unconditional print, independent of GGML_DEBUG.
#define GGML_PRINT(...) printf(__VA_ARGS__)
// Draws one value from rand() and scales it by RAND_MAX, giving a float in [0, 1].
static float frand(void) {
    const float sample = (float) rand();
    return sample / (float) RAND_MAX;
}
// Creates an F32 tensor with `ndims` dimensions of extents `ne` in `ctx0` and
// fills every element with frand()*(fmax - fmin) + fmin.
// Supports 1 to 4 dimensions; any other `ndims` trips an assert.
// Elements are written with ne[0] as the fastest-varying index.
static struct ggml_tensor * get_random_tensor(
        struct ggml_context * ctx0, int ndims, int64_t ne[], float fmin, float fmax
) {
    struct ggml_tensor * result = ggml_new_tensor(ctx0, GGML_TYPE_F32, ndims, ne);

    if (ndims == 1) {
        float * values = (float *) result->data;
        for (int c = 0; c < ne[0]; c++) {
            values[c] = frand()*(fmax - fmin) + fmin;
        }
    } else if (ndims == 2) {
        float * values = (float *) result->data;
        for (int r = 0; r < ne[1]; r++) {
            for (int c = 0; c < ne[0]; c++) {
                values[r*ne[0] + c] = frand()*(fmax - fmin) + fmin;
            }
        }
    } else if (ndims == 3) {
        float * values = (float *) result->data;
        for (int p = 0; p < ne[2]; p++) {
            for (int r = 0; r < ne[1]; r++) {
                for (int c = 0; c < ne[0]; c++) {
                    values[p*ne[1]*ne[0] + r*ne[0] + c] = frand()*(fmax - fmin) + fmin;
                }
            }
        }
    } else if (ndims == 4) {
        float * values = (float *) result->data;
        for (int b = 0; b < ne[3]; b++) {
            for (int p = 0; p < ne[2]; p++) {
                for (int r = 0; r < ne[1]; r++) {
                    for (int c = 0; c < ne[0]; c++) {
                        values[b*ne[2]*ne[1]*ne[0] + p*ne[1]*ne[0] + r*ne[0] + c] = frand()*(fmax - fmin) + fmin;
                    }
                }
            }
        }
    } else {
        assert(false);
    }

    return result;
}
// Optimizer smoke test: evaluates e = sum(sqr(c - mul_mat(a, b))) for random
// tensors, runs ggml_opt with the default ADAM parameters, evaluates e again
// and requires that the value did not increase.
// Returns 0 on success and -1 otherwise.
int main(void) {
struct ggml_init_params params = {
/* .mem_size = */ 1024*1024*1024,
/* .mem_buffer = */ NULL,
/* .no_alloc = */ false,
};
struct ggml_context * ctx = ggml_init(params);
int64_t ne1[4] = {4, 128, 1, 1};
int64_t ne2[4] = {4, 256, 1, 1};
int64_t ne3[4] = {128, 256, 1, 1};
// a and b are registered with ggml_set_param; c is not.
struct ggml_tensor * a = get_random_tensor(ctx, 2, ne1, -1, +1);
struct ggml_tensor * b = get_random_tensor(ctx, 2, ne2, -1, +1);
ggml_set_param(ctx, a);
ggml_set_param(ctx, b);
struct ggml_tensor * c = get_random_tensor(ctx, 2, ne3, -1, +1);
// Objective: sum of squared differences between c and ggml_mul_mat(a, b).
struct ggml_tensor * ab = ggml_mul_mat(ctx, a, b);
struct ggml_tensor * d = ggml_sub(ctx, c, ab);
struct ggml_tensor * e = ggml_sum(ctx, ggml_sqr(ctx, d));
struct ggml_cgraph * ge = ggml_new_graph_custom(ctx, GGML_DEFAULT_GRAPH_SIZE, true);
ggml_build_forward_expand(ge, e);
ggml_graph_reset(ge);
// Objective value before optimization.
ggml_graph_compute_with_ctx(ctx, ge, /*n_threads*/ 1);
const float fe = ggml_get_f32_1d(e, 0);
printf("%s: e = %.4f\n", __func__, fe);
struct ggml_opt_params opt_params = ggml_opt_default_params(GGML_OPT_TYPE_ADAM);
ggml_opt(ctx, opt_params, e);
ggml_graph_reset(ge);
// Objective value after optimization.
ggml_graph_compute_with_ctx(ctx, ge, /*n_threads*/ 1);
const float fe_opt = ggml_get_f32_1d(e, 0);
printf("%s: original e = %.4f\n", __func__, fe);
printf("%s: optimized e = %.4f\n", __func__, fe_opt);
// The test passes when the optimized value is not larger than the original.
const bool success = (fe_opt <= fe);
assert(success);
ggml_free(ctx);
return success ? 0 : -1;
}
// Sample outputs of this test for various tensor shapes:
// int64_t ne1[4] = {4, 128, 1, 1};
// int64_t ne2[4] = {4, 256, 1, 1};
// int64_t ne3[4] = {128, 256, 1, 1};
// main: original e = 25890.9375
// main: optimized e = 10094.7031
// int64_t ne1[4] = {8, 128, 1, 1};
// int64_t ne2[4] = {8, 256, 1, 1};
// int64_t ne3[4] = {128, 256, 1, 1};
// main: original e = 39429.5078
// main: optimized e = 9275.8936
// int64_t ne1[4] = {16, 128, 1, 1};
// int64_t ne2[4] = {16, 256, 1, 1};
// int64_t ne3[4] = {128, 256, 1, 1};
// main: original e = 68371.1328
// main: optimized e = 7854.4502
// int64_t ne1[4] = {32, 128, 1, 1};
// int64_t ne2[4] = {32, 256, 1, 1};
// int64_t ne3[4] = {128, 256, 1, 1};
// main: original e = 126061.1953
// main: optimized e = 5451.0166
// int64_t ne1[4] = {4, 1024, 1, 1};
// int64_t ne2[4] = {4, 2048, 1, 1};
// int64_t ne3[4] = {1024, 2048, 1, 1};
// main: original e = 1620817.8750
// main: optimized e = 698387.6875
// another run on M1
// int64_t ne1[4] = {4, 1024, 1, 1};
// int64_t ne2[4] = {4, 2048, 1, 1};
// int64_t ne3[4] = {1024, 2048, 1, 1};
// main: original e = 1629595.6250
// main: optimized e = 698169.1250
// int64_t ne1[4] = {32, 1024, 1, 1};
// int64_t ne2[4] = {32, 2048, 1, 1};
// int64_t ne3[4] = {1024, 2048, 1, 1};
// main: original e = 8146770.5000
// main: optimized e = 651119.1250

View file

@ -1,185 +0,0 @@
// Unit tests for quantization specific functions - quantize, dequantize and dot product
#include "ggml.h"
#undef NDEBUG
#include <assert.h>
#include <math.h>
#include <stdio.h>
#include <string>
#include <vector>
#if defined(_MSC_VER)
#pragma warning(disable: 4244 4267) // possible loss of data
#endif
// Pass/fail thresholds used by main(); a measured error must be strictly
// below its threshold for the check to pass.
// Bound on the difference between from_float and from_float_ref round trips.
constexpr float MAX_QUANTIZATION_REFERENCE_ERROR = 0.0001f;
// Default bound on the quantize/dequantize round-trip error.
constexpr float MAX_QUANTIZATION_TOTAL_ERROR = 0.002f;
// Round-trip bound used for GGML_TYPE_Q2_K and GGML_TYPE_IQ2_S.
constexpr float MAX_QUANTIZATION_TOTAL_ERROR_2BITS = 0.0075f;
// Round-trip bound used for GGML_TYPE_Q3_K and GGML_TYPE_IQ3_S.
constexpr float MAX_QUANTIZATION_TOTAL_ERROR_3BITS = 0.0040f;
// Round-trip bound used for GGML_TYPE_IQ3_XXS.
constexpr float MAX_QUANTIZATION_TOTAL_ERROR_3BITS_XXS = 0.0050f;
// Default bound on the dot product error.
constexpr float MAX_DOT_PRODUCT_ERROR = 0.02f;
// Dot product bound for the low-bit types listed in main().
constexpr float MAX_DOT_PRODUCT_ERROR_LOWBIT = 0.04f;
// Indexed with a bool: RESULT_STR[false] is "ok", RESULT_STR[true] is "FAILED".
static const char* RESULT_STR[] = {"ok", "FAILED"};
// Fills dst[0..n) with a deterministic synthetic signal:
// dst[i] = 0.1 + 2*cos(i + offset).
static void generate_data(float offset, size_t n, float * dst) {
    size_t i = 0;
    while (i < n) {
        const float phase = i + offset;
        dst[i] = 0.1 + 2*cosf(phase);
        ++i;
    }
}
// Error measure between two float arrays of length n.
// Note: this returns sqrt(sum of squared differences) / n, i.e. the square
// root is taken before dividing by n; the thresholds in this file are used
// with this value.
static float array_rmse(const float * a1, const float * a2, size_t n) {
    double squared_total = 0;
    for (size_t k = 0; k < n; k++) {
        const double delta = a1[k] - a2[k];
        squared_total += delta * delta;
    }
    const float root = sqrtf(squared_total);
    return root / n;
}
// Quantizes test_data with qfns.from_float, converts it back with
// qfns.to_float and returns array_rmse between the input and the round trip.
static float total_quantization_error(ggml_type_traits_t & qfns, size_t test_size, const float * test_data) {
    // Scratch buffer for the quantized form (2 bytes per input value).
    std::vector<uint8_t> quantized(2*test_size);
    qfns.from_float(test_data, quantized.data(), test_size);

    std::vector<float> restored(test_size);
    qfns.to_float(quantized.data(), restored.data(), test_size);

    return array_rmse(test_data, restored.data(), test_size);
}
// Difference between the regular and the reference quantizer: the data is
// round-tripped once through from_float and once through from_float_ref
// (both decoded with to_float), and array_rmse of the two results is returned.
static float reference_quantization_error(ggml_type_traits_t & qfns, size_t test_size, const float * test_data) {
    // One scratch buffer is reused for both quantization passes.
    std::vector<uint8_t> scratch(2*test_size);
    std::vector<float> via_regular(test_size);
    std::vector<float> via_reference(test_size);

    // Pass 1: regular quantizer.
    qfns.from_float(test_data, scratch.data(), test_size);
    qfns.to_float(scratch.data(), via_regular.data(), test_size);

    // Pass 2: reference quantizer.
    qfns.from_float_ref(test_data, scratch.data(), test_size);
    qfns.to_float(scratch.data(), via_reference.data(), test_size);

    return array_rmse(via_regular.data(), via_reference.data(), test_size);
}
// Plain float dot product of a1 and a2 over test_size elements, accumulated
// in double and returned as float.
static float dot_product(const float * a1, const float * a2, size_t test_size) {
    double acc = 0;
    size_t k = 0;
    while (k < test_size) {
        const float term = a1[k] * a2[k];
        acc += term;
        ++k;
    }
    return acc;
}
// Error of the quantized dot product: test_data1 is quantized with qfns,
// test_data2 with the traits of qfns.vec_dot_type, qfns.vec_dot is evaluated
// on both, and the absolute difference to the float dot product divided by
// test_size is returned.
static float dot_product_error(
        ggml_type_traits_t & qfns, size_t test_size, const float * test_data1, const float *test_data2
) {
    std::vector<uint8_t> lhs_q(2*test_size);
    std::vector<uint8_t> rhs_q(2*test_size);

    auto rhs_traits = ggml_internal_get_type_traits(qfns.vec_dot_type);

    qfns.from_float(test_data1, lhs_q.data(), test_size);
    rhs_traits.from_float(test_data2, rhs_q.data(), test_size);

    // Starts at INFINITY, so an untouched output would show up as a huge error.
    float quantized_dot = INFINITY;
    qfns.vec_dot(test_size, &quantized_dot, 0, lhs_q.data(), 0, rhs_q.data(), 0, 1);

    const float float_dot = dot_product(test_data1, test_data2, test_size);
    const float abs_diff = fabsf(quantized_dot - float_dot);
    return abs_diff / test_size;
}
// Runs the quantization checks for every ggml type that provides both
// from_float and to_float: round-trip error, agreement with the reference
// quantizer, and dot product error. Pass -v to print every result instead of
// only failures. Returns 1 if any check failed or an unknown argument was
// given, 0 otherwise.
int main(int argc, char * argv[]) {
bool verbose = false;
const size_t test_size = 32 * 128;
std::string arg;
for (int i = 1; i < argc; i++) {
arg = argv[i];
if (arg == "-v") {
verbose = true;
} else {
fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
return 1;
}
}
// Two synthetic inputs that differ only in phase offset.
std::vector<float> test_data(test_size);
std::vector<float> test_data2(test_size);
generate_data(0.0, test_data.size(), test_data.data());
generate_data(1.0, test_data2.size(), test_data2.data());
// Initialize GGML, ensures float conversion tables are initialized
struct ggml_init_params ggml_params = {
/* .mem_size = */ 1*1024,
/* .mem_buffer = */ NULL,
/* .no_alloc = */ true,
};
struct ggml_context * ctx = ggml_init(ggml_params);
// num_failed counts individual failed checks, not failed types.
int num_failed = 0;
bool failed = false;
for (int i = 0; i < GGML_TYPE_COUNT; i++) {
ggml_type type = (ggml_type) i;
ggml_type_traits_t qfns = ggml_internal_get_type_traits(type);
// deprecated - skip
if (qfns.blck_size == 0) {
continue;
}
const ggml_type ei = (ggml_type)i;
printf("Testing %s\n", ggml_type_name((ggml_type) i));
ggml_quantize_init(ei);
if (qfns.from_float && qfns.to_float) {
// Check 1: quantize/dequantize round-trip error against a per-type bound.
const float total_error = total_quantization_error(qfns, test_size, test_data.data());
const float max_quantization_error =
type == GGML_TYPE_Q2_K ? MAX_QUANTIZATION_TOTAL_ERROR_2BITS :
type == GGML_TYPE_IQ2_S ? MAX_QUANTIZATION_TOTAL_ERROR_2BITS :
type == GGML_TYPE_Q3_K ? MAX_QUANTIZATION_TOTAL_ERROR_3BITS :
type == GGML_TYPE_IQ3_S ? MAX_QUANTIZATION_TOTAL_ERROR_3BITS :
type == GGML_TYPE_IQ3_XXS ? MAX_QUANTIZATION_TOTAL_ERROR_3BITS_XXS : MAX_QUANTIZATION_TOTAL_ERROR;
// Written as !(x < bound) so that a NaN error also counts as a failure.
failed = !(total_error < max_quantization_error);
num_failed += failed;
if (failed || verbose) {
printf("%5s absolute quantization error: %s (%f)\n", ggml_type_name(type), RESULT_STR[failed], total_error);
}
// Check 2: regular quantizer versus reference quantizer.
const float reference_error = reference_quantization_error(qfns, test_size, test_data.data());
failed = !(reference_error < MAX_QUANTIZATION_REFERENCE_ERROR);
num_failed += failed;
if (failed || verbose) {
printf("%5s reference implementation error: %s (%f)\n", ggml_type_name(type), RESULT_STR[failed], reference_error);
}
// Check 3: quantized dot product, with a looser bound for the listed low-bit types.
const float vec_dot_error = dot_product_error(qfns, test_size, test_data.data(), test_data2.data());
const float max_allowed_error = type == GGML_TYPE_Q2_K || type == GGML_TYPE_IQ2_XS || type == GGML_TYPE_IQ2_XXS ||
type == GGML_TYPE_IQ3_XXS || type == GGML_TYPE_IQ3_S || type == GGML_TYPE_IQ2_S
? MAX_DOT_PRODUCT_ERROR_LOWBIT
: MAX_DOT_PRODUCT_ERROR;
failed = !(vec_dot_error < max_allowed_error);
num_failed += failed;
if (failed || verbose) {
printf("%5s dot product error: %s (%f)\n", ggml_type_name(type), RESULT_STR[failed], vec_dot_error);
}
}
}
if (num_failed || verbose) {
printf("%d tests failed\n", num_failed);
}
ggml_free(ctx);
return num_failed > 0;
}

View file

@ -1,363 +0,0 @@
// Benchmark quantization specific functions on synthetic data
#include "ggml.h"
#undef NDEBUG
#include <algorithm>
#include <assert.h>
#include <functional>
#include <inttypes.h>
#include <math.h>
#include <memory>
#include <stdio.h>
#include <string>
#include <vector>
#if defined(_MSC_VER)
#pragma warning(disable: 4244 4267) // possible loss of data
#endif
// Alignment in bytes requested by align_with_offset.
#define MAX_ALIGNMENT 64
// Number of values the per-block cycle counts are normalized to when printed.
#define QK 32
// Untimed calls made before measuring in benchmark_function.
#define WARMUP 5
// Default number of timed iterations.
#define ITERATIONS 10
// NOTE(review): presumably the upper limit accepted for --iterations; it is
// not referenced in the visible part of this file.
#define MAX_ITERATIONS 100000000
// Test sizes printed by usage() as the L1, L2, L3 and MEM presets.
// NOTE(review): these expand to unparenthesized products, so they are only
// safe where the surrounding expression cannot regroup them.
#define L1_SIZE 32*128
#define L2_SIZE 32*2048
#define L3_SIZE 32*20480
#define MEM_SIZE 32*2048000
// Benchmark configuration. The fields are presumably filled from the command
// line options described in usage() - the parsing code is outside this view.
struct quantize_perf_params {
// presumably the type names given with --type - TODO confirm
std::vector<std::string> include_types;
// presumably the sizes selected with --size / -3 / -4 - TODO confirm
std::vector<size_t> test_sizes;
// defaults to 0; presumably set by --alignment-offset - TODO confirm
size_t alignment_offset = 0;
// One flag per operation name listed under --op in usage(); all default to false.
bool op_quantize_row_q_reference = false;
bool op_quantize_row_q = false;
bool op_dequantize_row_q = false;
bool op_quantize_row_q_dot = false;
bool op_vec_dot_q = false;
// Number of timed iterations; defaults to ITERATIONS.
int64_t iterations = ITERATIONS;
};
// cpu_cycles(): cycle counter sampled around each benchmarked call.
// On x86 builds it reads the time-stamp counter via the __rdtscp / __rdtsc
// intrinsics from <x86intrin.h>; on every other target it is a macro that
// evaluates to 0, so the printed cycle statistics are 0 there.
#if defined(__x86_64__) || defined(__i386__)
#include <x86intrin.h>
inline int64_t cpu_cycles() {
// Rough way to detect new-ish CPUs
#ifdef __POPCNT__
// __rdtscp also writes an auxiliary value through its argument; it is ignored here.
unsigned int dummy;
return __rdtscp(&dummy);
#else
return __rdtsc();
#endif
}
#else
#define cpu_cycles() 0
#endif
// Writes n synthetic samples to dst: element k receives 0.1 + 2*cos(k + offset).
static void generate_data(float offset, size_t n, float * dst) {
    for (size_t k = 0; k != n; ++k) {
        const float cosine = cosf(k + offset);
        dst[k] = 0.1 + 2*cosine;
    }
}
// Converts `bytes` processed in `usecs` microseconds into a throughput in
// units of 1024*1024*1024 bytes per second.
static float gigabytes_per_second(size_t bytes, int64_t usecs) {
    const float bytes_per_usec = bytes / (float) usecs;
    const float bytes_per_sec  = bytes_per_usec * 1000000;
    return bytes_per_sec / (1024*1024*1024);
}
// Rounds `ptr` up to a MAX_ALIGNMENT boundary with std::align and then adds
// `offset` bytes, so callers get a pointer at a chosen misalignment.
// The space passed to std::align is a dummy value (4 * MAX_ALIGNMENT); the
// caller must make sure the underlying buffer is large enough.
static void * align_with_offset(void * ptr, int offset) {
    size_t available = MAX_ALIGNMENT * 4;
    void * aligned = std::align(MAX_ALIGNMENT, MAX_ALIGNMENT, ptr, available);
    return static_cast<char *>(aligned) + offset;
}
// Times `func` and prints cycle counts and throughput for one test size.
//   size       - number of float values handled by one call to func
//   q_size     - size in bytes of the quantized data handled by one call
//   iterations - number of timed calls, made after WARMUP untimed ones
//   func       - operation under test; its return value is ignored
static void benchmark_function(size_t size, size_t q_size, int64_t iterations, const std::function<float(void)> & func) {
    if (iterations <= 0) {
        // Without a timed call the minima below would stay at INT64_MAX and
        // QK * min_time_cycles would overflow, so there is nothing to report.
        printf(" no timed iterations requested, skipping\n");
        return;
    }

    int64_t min_time_us       = INT64_MAX;
    int64_t total_time_us     = 0;
    int64_t min_time_cycles   = INT64_MAX;
    int64_t total_time_cycles = 0;

    for (int i = 0; i < WARMUP; i++) {
        func();
    }

    // counter has the same width as `iterations`
    for (int64_t i = 0; i < iterations; i++) {
        const int64_t start_time   = ggml_time_us();
        const int64_t start_cycles = cpu_cycles();

        func();

        const int64_t end_cycles = cpu_cycles();
        const int64_t end_time   = ggml_time_us();

        total_time_cycles += end_cycles - start_cycles;
        min_time_cycles    = std::min(min_time_cycles, end_cycles - start_cycles);
        total_time_us     += end_time - start_time;
        min_time_us        = std::min(min_time_us, end_time - start_time);
    }

    printf(" min cycles/%d vals : %9.2f\n", QK, QK * min_time_cycles / (float) size);
    printf(" avg cycles/%d vals : %9.2f\n", QK, QK * total_time_cycles / (float) (size * iterations));
    printf(" float32 throughput : %9.2f GB/s\n", gigabytes_per_second(4 * size * iterations, total_time_us));
    printf(" quantized throughput : %9.2f GB/s\n", gigabytes_per_second(q_size * iterations, total_time_us));
}
static void usage(char * argv[]) {
printf("Benchmark quantization specific functions on synthetic data\n");
printf("\n");
printf("usage: %s [options]\n", argv[0]);
printf("\n");
printf("options: (default)\n");
printf(" -h, --help show this help message and exit\n");
printf(" --size SIZE set test size, divisible by 32 (L1_SIZE:%d)\n", L1_SIZE);
printf(" -3 use size as L1, L2, L3 sizes (L1:%d L2:%d L3:%d)\n", L1_SIZE, L2_SIZE, L3_SIZE);
printf(" -4 use size as L1, L2, L3, MEM sizes (L1:%d L2:%d L3:%d MEM:%d)\n", L1_SIZE, L2_SIZE, L3_SIZE, MEM_SIZE);
printf(" --op OP set test operation as quantize_row_q_reference, quantize_row_q, dequantize_row_q,\n");
printf(" quantize_row_q_dot, vec_dot_q (all)\n");
printf(" --type TYPE set test type as");
for (int i = 0; i < GGML_TYPE_COUNT; i++) {
ggml_type type = (ggml_type) i;
ggml_type_traits_t qfns = ggml_internal_get_type_traits(type);
if (ggml_type_name(type) != NULL) {
if (qfns.from_float && qfns.to_float) {
printf(" %s", ggml_type_name(type));
}
}
}
printf(" (all)\n");
printf(" --alignment-offset OFFSET\n");
printf(" set alignment offset as OFFSET (0)\n");
printf(" -i NUM, --iterations NUM\n");
printf(" set test iteration number (%d)\n", ITERATIONS);
}
// Entry point of the quantization benchmark: parses the command line, prepares
// aligned synthetic input buffers and runs the selected operations for every
// ggml type that provides both from_float and to_float.
// Returns 0 on success and 1 on a command-line error (or after printing help).
int main(int argc, char * argv[]) {
    quantize_perf_params params {};

    // Parses a complete decimal integer. Returns false (instead of letting
    // std::stoi throw) when the text is not a number, has trailing characters
    // or does not fit in an int.
    const auto parse_int = [](const char * text, int & value) -> bool {
        try {
            const std::string str(text);
            size_t n_parsed = 0;
            value = std::stoi(str, &n_parsed);
            return n_parsed == str.size();
        } catch (const std::exception &) {
            return false;
        }
    };

    // read command line
    bool invalid_param = false;
    std::string arg;
    for (int i = 1; i < argc; i++) {
        arg = argv[i];
        if (arg == "--size") {
            if (++i >= argc) {
                invalid_param = true;
                break;
            }
            int size = 0;
            if (!parse_int(argv[i], size)) {
                invalid_param = true;
                break;
            }
            // a negative value would otherwise wrap around when stored as size_t
            if (size <= 0) {
                fprintf(stderr, "error: size %d must be positive\n", size);
                invalid_param = true;
                break;
            }
            if (size % 32 != 0) {
                fprintf(stderr, "error: size %d not divisible by 32\n", size);
                invalid_param = true;
                break;
            }
            params.test_sizes.push_back(size);
        } else if (arg == "-3") {
            // quick select sizes that probably fit in CPU caches
            params.test_sizes.push_back(L1_SIZE);
            params.test_sizes.push_back(L2_SIZE);
            params.test_sizes.push_back(L3_SIZE);
        } else if (arg == "-4") {
            // quick select cache sizes + memory
            params.test_sizes.push_back(L1_SIZE);
            params.test_sizes.push_back(L2_SIZE);
            params.test_sizes.push_back(L3_SIZE);
            params.test_sizes.push_back(MEM_SIZE);
        } else if (arg == "--op") {
            if (++i >= argc) {
                invalid_param = true;
                break;
            }
            std::string op {argv[i]};
            if (op == "quantize_row_q_reference") {
                params.op_quantize_row_q_reference = true;
            } else if (op == "quantize_row_q") {
                params.op_quantize_row_q = true;
            } else if (op == "dequantize_row_q") {
                params.op_dequantize_row_q = true;
            } else if (op == "quantize_row_q_dot") {
                params.op_quantize_row_q_dot = true;
            } else if (op == "vec_dot_q") {
                params.op_vec_dot_q = true;
            } else {
                invalid_param = true;
                break;
            }
        } else if (arg == "--type") {
            if (++i >= argc) {
                invalid_param = true;
                break;
            }
            params.include_types.push_back(argv[i]);
        } else if (arg == "--alignment-offset") {
            if (++i >= argc) {
                invalid_param = true;
                break;
            }
            int alignment = 0;
            if (!parse_int(argv[i], alignment)) {
                invalid_param = true;
                break;
            }
            if (alignment < 0 || alignment > MAX_ALIGNMENT) {
                // the accepted range is inclusive on both ends
                fprintf(stderr, "error: alignment-offset must be between 0 and %d\n", MAX_ALIGNMENT);
                invalid_param = true;
                break;
            }
            params.alignment_offset = alignment;
        } else if ((arg == "-i") || (arg == "--iterations")) {
            if (++i >= argc) {
                invalid_param = true;
                break;
            }
            int number = 0;
            if (!parse_int(argv[i], number)) {
                invalid_param = true;
                break;
            }
            // at least one timed call is needed to report a minimum or an average
            if (number < 1 || number > MAX_ITERATIONS) {
                fprintf(stderr, "error: iterations must be between 1 and %d\n", MAX_ITERATIONS);
                invalid_param = true;
                break;
            }
            params.iterations = number;
        } else if ((arg == "-h") || (arg == "--help")) {
            usage(argv);
            return 1;
        } else {
            fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
            return 1;
        }
    }
    if (invalid_param) {
        fprintf(stderr, "error: invalid parameter for argument: %s\n", arg.c_str());
        return 1;
    }

    // defaults: L1-sized test, all operations
    if (params.test_sizes.empty()) {
        params.test_sizes.push_back(L1_SIZE);
    }
    if (!(params.op_quantize_row_q_reference || params.op_quantize_row_q || params.op_dequantize_row_q || params.op_quantize_row_q_dot || params.op_vec_dot_q)) {
        params.op_quantize_row_q_reference = params.op_quantize_row_q = params.op_dequantize_row_q = params.op_quantize_row_q_dot = params.op_vec_dot_q = true;
    }

    std::sort(params.test_sizes.begin(), params.test_sizes.end());
    size_t largest = params.test_sizes.back();

    // every buffer holds `largest` floats plus slack for alignment and the requested offset
    std::vector<uint8_t> test_data1_v(largest*4 + MAX_ALIGNMENT*2);
    std::vector<uint8_t> test_data2_v(largest*4 + MAX_ALIGNMENT*2);
    std::vector<uint8_t> test_q1_v   (largest*4 + MAX_ALIGNMENT*2);
    std::vector<uint8_t> test_q2_v   (largest*4 + MAX_ALIGNMENT*2);
    std::vector<uint8_t> test_out_v  (largest*4 + MAX_ALIGNMENT*2);

    float * test_data1 = (float *) align_with_offset(test_data1_v.data(), params.alignment_offset);
    float * test_data2 = (float *) align_with_offset(test_data2_v.data(), params.alignment_offset);
    float * test_q1    = (float *) align_with_offset(test_q1_v.data(),    params.alignment_offset);
    float * test_q2    = (float *) align_with_offset(test_q2_v.data(),    params.alignment_offset);
    float * test_out   = (float *) align_with_offset(test_out_v.data(),   params.alignment_offset);

    generate_data(0, largest, test_data1);
    generate_data(1, largest, test_data2);

    int64_t iterations = params.iterations;

    // Initialize GGML, ensures float conversion tables are initialized
    struct ggml_init_params ggml_params = {
        /* .mem_size   = */ 1*1024,
        /* .mem_buffer = */ NULL,
        /* .no_alloc   = */ true,
    };
    struct ggml_context * ctx = ggml_init(ggml_params);

    for (int i = 0; i < GGML_TYPE_COUNT; i++) {
        ggml_type type = (ggml_type) i;
        ggml_type_traits_t qfns = ggml_internal_get_type_traits(type);
        // skip types the user did not ask for
        if (!params.include_types.empty() && ggml_type_name(type) && std::find(params.include_types.begin(), params.include_types.end(), ggml_type_name(type)) == params.include_types.end()) {
            continue;
        }

        if (qfns.from_float && qfns.to_float) {
            printf("%s\n", ggml_type_name(type));

            ggml_quantize_init(type);

            if (params.op_quantize_row_q_reference) {
                printf(" quantize_row_q_reference\n");
                for (size_t size : params.test_sizes) {
                    printf(" %zu values (%.2f MB)\n", size, 4*size/(float)(1024*1024));
                    auto quantize_fn = [&](void) -> float {
                        qfns.from_float_ref(test_data1, test_q1, size);
                        return test_q1[0];
                    };
                    size_t quantized_size = ggml_row_size(type, size);
                    benchmark_function(size, quantized_size, iterations, quantize_fn);
                }
                printf("\n");
            }

            if (params.op_quantize_row_q) {
                printf(" quantize_row_q\n");
                for (size_t size : params.test_sizes) {
                    printf(" %zu values (%.2f MB)\n", size, 4*size/(float)(1024*1024));
                    auto quantize_fn = [&](void) -> float {
                        qfns.from_float(test_data1, test_q1, size);
                        return test_q1[0];
                    };
                    size_t quantized_size = ggml_row_size(type, size);
                    benchmark_function(size, quantized_size, iterations, quantize_fn);
                }
                printf("\n");
            }

            if (params.op_dequantize_row_q) {
                printf(" dequantize_row_q\n");
                // prepare quantized input once; only the dequantization is timed
                qfns.from_float(test_data1, test_q1, largest);
                for (size_t size : params.test_sizes) {
                    printf(" %zu values (%.2f MB)\n", size, 4*size/(float)(1024*1024));
                    auto quantize_fn = [&](void) -> float {
                        qfns.to_float(test_q1, test_out, size);
                        return test_out[0];
                    };
                    size_t quantized_size = ggml_row_size(type, size);
                    benchmark_function(size, quantized_size, iterations, quantize_fn);
                }
                printf("\n");
            }

            if (params.op_quantize_row_q_dot) {
                printf(" quantize_row_q_dot\n");
                // looked up once so that only the conversion itself is timed
                const ggml_type_traits_t vdot = ggml_internal_get_type_traits(qfns.vec_dot_type);
                for (size_t size : params.test_sizes) {
                    printf(" %zu values (%.2f MB)\n", size, 4*size/(float)(1024*1024));
                    auto quantize_fn = [&](void) -> float {
                        vdot.from_float(test_data1, test_q1, size);
                        return test_q1[0];
                    };
                    // the data written here is in vec_dot_type format, so that is the row size to report
                    size_t quantized_size = ggml_row_size(qfns.vec_dot_type, size);
                    benchmark_function(size, quantized_size, iterations, quantize_fn);
                }
                printf("\n");
            }

            if (params.op_vec_dot_q) {
                printf(" vec_dot_q\n");
                // prepare both quantized operands once; only the dot product is timed
                qfns.from_float(test_data1, test_q1, largest);
                qfns.from_float(test_data2, test_q2, largest);
                for (size_t size : params.test_sizes) {
                    printf(" %zu values (%.2f MB)\n", size, 4*size/(float)(1024*1024));
                    auto quantize_fn = [&](void) -> float {
                        float result;
                        qfns.vec_dot(size, &result, 0, test_q1, 0, test_q2, 0, 1);
                        return result;
                    };
                    size_t quantized_size = ggml_row_size(type, size);
                    benchmark_function(size, quantized_size, iterations, quantize_fn);
                }
                printf("\n");
            }
        }
    }

    ggml_free(ctx);

    return 0;
}

View file

@ -1,220 +0,0 @@
#include "ggml.h"
#include <cmath>
#include <cstdio>
#include <cstdlib>
#include <cassert>
#include <vector>
#if defined(_MSC_VER)
#pragma warning(disable: 4244 4267) // possible loss of data
#endif
#if defined(__GNUC__)
#pragma GCC diagnostic ignored "-Wdouble-promotion"
#endif
// NOTE(review): MAX_NARGS, MIN/MAX and GGML_SILU_FP16 are not referenced by the
// code visible in this test — confirm whether they are still needed.
#define MAX_NARGS 3
// MIN/MAX are undefined first so that an earlier definition is replaced by these.
#undef MIN
#undef MAX
#define MIN(a, b) ((a) < (b) ? (a) : (b))
#define MAX(a, b) ((a) > (b) ? (a) : (b))
#define GGML_SILU_FP16
//
// logging
//
// GGML_PRINT_DEBUG, GGML_PRINT_DEBUG_5 and GGML_PRINT_DEBUG_10 forward to printf
// when GGML_DEBUG is at least 1, 5 and 10 respectively; otherwise they expand to nothing.
#if (GGML_DEBUG >= 1)
#define GGML_PRINT_DEBUG(...) printf(__VA_ARGS__)
#else
#define GGML_PRINT_DEBUG(...)
#endif
#if (GGML_DEBUG >= 5)
#define GGML_PRINT_DEBUG_5(...) printf(__VA_ARGS__)
#else
#define GGML_PRINT_DEBUG_5(...)
#endif
#if (GGML_DEBUG >= 10)
#define GGML_PRINT_DEBUG_10(...) printf(__VA_ARGS__)
#else
#define GGML_PRINT_DEBUG_10(...)
#endif
// GGML_PRINT always forwards to printf.
#define GGML_PRINT(...) printf(__VA_ARGS__)
// Returns a pseudo-random float in [0, 1], drawn from rand().
static float frand(void) {
    const float sample = (float)rand();
    return sample/(float)RAND_MAX;
}
// Returns rand() % n, or 0 when n is 0 (avoids the division by zero).
static int irand(int n) {
    return n == 0 ? 0 : rand()%n;
}
// Sets all four entries of `dims` to 1, then overwrites the first `ndims`
// entries with a random extent in 1..4.
static void get_random_dims(int64_t * dims, int ndims) {
    for (int d = 0; d < 4; d++) {
        dims[d] = 1;
    }
    for (int d = 0; d < ndims; d++) {
        dims[d] = 1 + irand(4);
    }
}
// Creates an F32 tensor with `ndims` dimensions of extents `ne` in `ctx0` and
// fills it, in linear order, with values frand()*(fmax - fmin) + fmin.
// Only 1 to 4 dimensions are supported; anything else trips assert(false).
static struct ggml_tensor * get_random_tensor_f32(
        struct ggml_context * ctx0,
        int ndims,
        const int64_t ne[],
        float fmin,
        float fmax) {
    struct ggml_tensor * result = ggml_new_tensor(ctx0, GGML_TYPE_F32, ndims, ne);

    if (ndims < 1 || ndims > 4) {
        assert(false);
        return result;
    }

    // Total element count; a non-positive extent means nothing is written.
    int64_t n_values = 1;
    for (int d = 0; d < ndims; d++) {
        n_values *= ne[d] > 0 ? ne[d] : 0;
    }

    // The data is addressed as i3*ne2*ne1*ne0 + i2*ne1*ne0 + i1*ne0 + i0, so
    // walking the dimensions innermost-first is a single pass over 0..n_values.
    float * values = (float *)result->data;
    for (int64_t idx = 0; idx < n_values; idx++) {
        values[idx] = frand()*(fmax - fmin) + fmin;
    }

    return result;
}
// Plans `graph` for `n_threads` threads and computes it, using `buf` as the
// work buffer when the plan asks for one.
static void ggml_graph_compute_helper(std::vector<uint8_t> & buf, ggml_cgraph * graph, int n_threads) {
    struct ggml_cplan plan = ggml_graph_plan(graph, n_threads);

    const bool needs_work_buffer = plan.work_size > 0;
    if (needs_work_buffer) {
        buf.resize(plan.work_size);
        plan.work_data = buf.data();
    }

    ggml_graph_compute(graph, &plan);
}
// Checks that applying rope at positions n_past_0 + i and then rotating by
// (n_past_2 - n_past_0) gives the same result as applying rope directly at
// positions n_past_2 + i, for rope modes 0, 2 and 4.
int main(int /*argc*/, const char ** /*argv*/) {
    struct ggml_init_params params = {
        /* .mem_size   = */ 128*1024*1024,
        /* .mem_buffer = */ NULL,
        /* .no_alloc   = */ false,
    };

    std::vector<uint8_t> work_buffer;

    struct ggml_context * ctx0 = ggml_init(params);

    // test mode 0, 2, 4 (standard, GPT-NeoX, GLM)
    const int modes[3] = { 0, 2, 4 };

    // rope f32
    for (int m = 0; m < 3; ++m) {
        const int     ndims = 4;
        const int64_t n_rot = 128;
        const int64_t ne[4] = { 2*n_rot, 32, 73, 1 };

        const int n_past_0 = 100;
        const int n_past_2 = 33;

        // position tensors, one entry per index along ne[2]
        struct ggml_tensor * p0 = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, ne[2]);
        struct ggml_tensor * p1 = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, ne[2]);
        struct ggml_tensor * p2 = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, ne[2]);

        int32_t * p0_data = (int32_t *) p0->data;
        int32_t * p1_data = (int32_t *) p1->data;
        int32_t * p2_data = (int32_t *) p2->data;
        for (int i = 0; i < ne[2]; ++i) {
            p0_data[i] = n_past_0 + i;        // 100, 101, 102, ..., 172
            p1_data[i] = n_past_2 - n_past_0; // -67, -67, -67, ..., -67
            p2_data[i] = n_past_2 + i;        // 33, 34, 35, ..., 105
        }

        const int mode = modes[m];

        struct ggml_tensor * x = get_random_tensor_f32(ctx0, ndims, ne, -1.0f, 1.0f);

        struct ggml_tensor * r0 = ggml_rope(ctx0, x, p0, n_rot, mode);
        // "context swap", i.e. forget n_past_0 - n_past_2 tokens
        struct ggml_tensor * r1 = ggml_rope(ctx0, r0, p1, n_rot, mode);
        struct ggml_tensor * r2 = ggml_rope(ctx0, x, p2, n_rot, mode);

        ggml_cgraph * gf = ggml_new_graph(ctx0);
        ggml_build_forward_expand(gf, r0);
        ggml_build_forward_expand(gf, r1);
        ggml_build_forward_expand(gf, r2);
        ggml_graph_compute_helper(work_buffer, gf, 4);

        // check that r1 and r2 are the same
        {
            const float * r1_data = (float *) r1->data;
            const float * r2_data = (float *) r2->data;
            const int n_elements = ggml_nelements(r1);

            double sum0 = 0.0f;
            double sum1 = 0.0f;
            double diff = 0.0f;
            for (int i = 0; i < n_elements; ++i) {
                sum0 += fabs(r1_data[i]);
                sum1 += fabs(r2_data[i]);
                diff += fabs(r1_data[i] - r2_data[i]);
            }

            printf("mode: %d\n", mode);
            printf("sum0: %f\n", sum0);
            printf("sum1: %f\n", sum1);
            printf("diff: %f\n", diff);
            printf("rel err: %f\n", diff / sum0);
            printf("rel err: %f\n", diff / sum1);

            // accumulated absolute difference relative to either result
            GGML_ASSERT(diff / sum0 < 0.0001f);
            GGML_ASSERT(diff / sum1 < 0.0001f);
        }
    }

    ggml_free(ctx0);

    return 0;
}

View file

@ -1,301 +0,0 @@
#include "ggml.h"
#include "llama.h"
#ifdef NDEBUG
#undef NDEBUG
#endif
#include <algorithm>
#include <cmath>
#include <string>
#include <vector>
static void dump(const llama_token_data_array * candidates) {
for (size_t i = 0; i < candidates->size; i++) {
printf("%d: %f (%f)\n", candidates->data[i].id, candidates->data[i].p, candidates->data[i].logit);
}
}
// Prints the call site (file, line, function), the candidates via dump(), and a "-" separator line.
#define DUMP(candidates_ptr)                                  \
    do {                                                      \
        printf("%s:%d (%s)\n", __FILE__, __LINE__, __func__); \
        dump((candidates_ptr));                               \
        printf("-\n");                                        \
    } while(0)
static void test_top_k(const std::vector<float> & probs, const std::vector<float> & expected_probs, int k) {
const size_t n_vocab = probs.size();
std::vector<llama_token_data> candidates;
candidates.reserve(n_vocab);
for (llama_token token_id = 0; token_id < (llama_token)n_vocab; token_id++) {
const float logit = logf(probs[token_id]);
candidates.emplace_back(llama_token_data{token_id, logit, 0.0f});
}
llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };
llama_sample_softmax(nullptr, &candidates_p);
DUMP(&candidates_p);
llama_sample_top_k(nullptr, &candidates_p, k, 1);
DUMP(&candidates_p);
GGML_ASSERT(candidates_p.size == expected_probs.size());
for (size_t i = 0; i < candidates_p.size; i++) {
GGML_ASSERT(fabs(candidates_p.data[i].p - expected_probs[i]) < 1e-5);
}
}
static void test_top_p(const std::vector<float> & probs, const std::vector<float> & expected_probs, float p) {
const size_t n_vocab = probs.size();
std::vector<llama_token_data> candidates;
candidates.reserve(n_vocab);
for (llama_token token_id = 0; token_id < (llama_token)n_vocab; token_id++) {
const float logit = logf(probs[token_id]);
candidates.emplace_back(llama_token_data{token_id, logit, 0.0f});
}
llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };
llama_sample_softmax(nullptr, &candidates_p);
DUMP(&candidates_p);
llama_sample_top_p(nullptr, &candidates_p, p, 1);
DUMP(&candidates_p);
GGML_ASSERT(candidates_p.size == expected_probs.size());
for (size_t i = 0; i < candidates_p.size; i++) {
GGML_ASSERT(fabs(candidates_p.data[i].p - expected_probs[i]) < 1e-3);
}
}
static void test_tfs(const std::vector<float> & probs, const std::vector<float> & expected_probs, float z) {
const size_t n_vocab = probs.size();
std::vector<llama_token_data> candidates;
candidates.reserve(n_vocab);
for (llama_token token_id = 0; token_id < (llama_token)n_vocab; token_id++) {
const float logit = logf(probs[token_id]);
candidates.emplace_back(llama_token_data{token_id, logit, 0.0f});
}
llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };
DUMP(&candidates_p);
llama_sample_tail_free(nullptr, &candidates_p, z, 1);
DUMP(&candidates_p);
GGML_ASSERT(candidates_p.size == expected_probs.size());
for (size_t i = 0; i < candidates_p.size; i++) {
GGML_ASSERT(fabs(candidates_p.data[i].p - expected_probs[i]) < 1e-3);
}
}
static void test_min_p(const std::vector<float> & probs, const std::vector<float> & expected_probs, float p) {
const size_t n_vocab = probs.size();
std::vector<llama_token_data> candidates;
candidates.reserve(n_vocab);
for (llama_token token_id = 0; token_id < (llama_token)n_vocab; token_id++) {
const float logit = logf(probs[token_id]);
candidates.emplace_back(llama_token_data{token_id, logit, 0.0f});
}
llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };
DUMP(&candidates_p);
llama_sample_min_p(nullptr, &candidates_p, p, 1);
DUMP(&candidates_p);
llama_sample_softmax(nullptr, &candidates_p);
GGML_ASSERT(candidates_p.size == expected_probs.size());
for (size_t i = 0; i < candidates_p.size; i++) {
GGML_ASSERT(fabs(candidates_p.data[i].p - expected_probs[i]) < 1e-3);
}
}
static void test_typical(const std::vector<float> & probs, const std::vector<float> & expected_probs, float p) {
const size_t n_vocab = probs.size();
std::vector<llama_token_data> candidates;
candidates.reserve(n_vocab);
for (llama_token token_id = 0; token_id < (llama_token)n_vocab; token_id++) {
const float logit = logf(probs[token_id]);
candidates.emplace_back(llama_token_data{token_id, logit, 0.0f});
}
llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };
DUMP(&candidates_p);
llama_sample_typical(nullptr, &candidates_p, p, 1);
DUMP(&candidates_p);
GGML_ASSERT(candidates_p.size == expected_probs.size());
for (size_t i = 0; i < candidates_p.size; i++) {
GGML_ASSERT(fabs(candidates_p.data[i].p - expected_probs[i]) < 1e-3);
}
}
static void test_repetition_penalties(
const std::vector<float> & probs, const std::vector<llama_token> & last_tokens,
const std::vector<float> & expected_probs, float repeat_penalty, float alpha_frequency, float alpha_presence
) {
GGML_ASSERT(probs.size() == expected_probs.size());
const size_t n_vocab = probs.size();
std::vector<llama_token_data> candidates;
candidates.reserve(n_vocab);
for (llama_token token_id = 0; token_id < (llama_token)n_vocab; token_id++) {
const float logit = logf(probs[token_id]);
candidates.emplace_back(llama_token_data{token_id, logit, 0.0f});
}
llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };
llama_sample_softmax(nullptr, &candidates_p);
DUMP(&candidates_p);
llama_sample_repetition_penalties(nullptr, &candidates_p, (const llama_token *) last_tokens.data(), last_tokens.size(), repeat_penalty, alpha_frequency, alpha_presence);
llama_sample_softmax(nullptr, &candidates_p);
DUMP(&candidates_p);
GGML_ASSERT(candidates_p.size == expected_probs.size());
for (size_t i = 0; i < candidates_p.size; i++) {
GGML_ASSERT(fabs(candidates_p.data[i].p - expected_probs[i]) < 1e-3);
}
}
// Applies a sequence of samplers to a synthetic vocabulary and checks the size
// and the first/last token of the candidate list after every step.
//   n_vocab           - number of tokens; token i gets logit log(i)
//   samplers_sequence - one character per sampler: 'k' top-k, 'p' top-p, 'm' min-p
//                       ('f', 'y' and 't' are recognised but not implemented)
//   top_k, top_p, min_p - parameters passed to the respective samplers
static void test_sampler_queue(
    const size_t n_vocab, const std::string samplers_sequence, const int top_k, const float top_p, const float min_p
) {
    std::vector<llama_token_data> candidates;
    candidates.reserve(n_vocab);
    for (llama_token token_id = 0; token_id < (llama_token)n_vocab; token_id++) {
        const float logit = logf(token_id);
        candidates.emplace_back(llama_token_data{token_id, logit, 0.0f});
    }

    llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };

    // smallest and largest token id expected to survive the samplers applied so far
    llama_token min_token_id = 0;
    const llama_token max_token_id = n_vocab-1;

    for (auto s : samplers_sequence) {
        switch (s){
            case 'k': llama_sample_top_k (nullptr, &candidates_p, top_k, 1); break;
            case 'f': GGML_ASSERT(false && "tail_free test not implemented"); break;
            case 'y': GGML_ASSERT(false && "typical test not implemented"); break;
            case 'p': llama_sample_top_p (nullptr, &candidates_p, top_p, 1); break;
            case 'm': llama_sample_min_p (nullptr, &candidates_p, min_p, 1); break;
            case 't': GGML_ASSERT(false && "temperature test not implemented"); break;
            default : GGML_ASSERT(false && "Unknown sampler"); break;
        }

        llama_sample_softmax(nullptr, &candidates_p); // make sure tokens are sorted for tests

        const int size = candidates_p.size;

        if (s == 'k') {
            const int expected_size = std::min(size, top_k);
            min_token_id = std::max(min_token_id, (llama_token)(n_vocab - top_k));

            GGML_ASSERT(size == expected_size);
            GGML_ASSERT(candidates_p.data[0].id == max_token_id);
            GGML_ASSERT(candidates_p.data[expected_size-1].id == min_token_id);
        } else if (s == 'p') {
            // probabilities are proportional to the token id, so cumulative sums are sums of ids
            const int softmax_divisor = n_vocab * (n_vocab-1) / 2 - min_token_id * (min_token_id-1) / 2;
            const int softmax_numerator_target = ceilf(top_p * softmax_divisor);

            min_token_id = n_vocab;
            int expected_size = 0;
            int cumsum = 0;
            do { // do-while because always at least one token is sampled
                min_token_id--;
                expected_size++;

                cumsum += min_token_id;
            } while (cumsum < softmax_numerator_target);

            // token 0 has p == 0, need special consideration for cumsum because top_p immediately returns
            if (min_token_id == 1) {
                min_token_id--;
                expected_size += 1;
            }

            GGML_ASSERT(size == expected_size);
            GGML_ASSERT(candidates_p.data[0].id == max_token_id);
            GGML_ASSERT(candidates_p.data[expected_size-1].id == min_token_id);
        } else if (s == 'm') {
            int expected_size = ceilf((1.0f-min_p) * n_vocab);
            expected_size = std::max(expected_size, 1);
            expected_size = std::min(expected_size, size);

            min_token_id = floorf(min_p * n_vocab);
            min_token_id = std::max(min_token_id, 1);
            min_token_id = std::max(min_token_id, (llama_token)(n_vocab - size));
            min_token_id = std::min(min_token_id, (llama_token)(n_vocab - 1));

            GGML_ASSERT(size == expected_size);
            GGML_ASSERT(candidates_p.data[0].id == max_token_id);
            GGML_ASSERT(candidates_p.data[expected_size-1].id == min_token_id);
        } else {
            GGML_ASSERT(false);
        }
    }

    // n_vocab is a size_t, so it is printed with %zu (%ld does not match on every platform)
    printf("Sampler queue %3s OK with n_vocab=%05zu top_k=%05d top_p=%f min_p=%f\n",
           samplers_sequence.c_str(), n_vocab, top_k, top_p, min_p);
}
// Runs every sampler test with fixed inputs and prints "OK" once all of them
// have passed; a failing check stops inside the GGML_ASSERT of the test helper.
int main(void) {
ggml_time_init();
// top-k: arguments are (probs, expected probabilities after sampling, k)
test_top_k({0.1f, 0.2f, 0.3f, 0.4f}, {0.4f}, 1);
test_top_k({0.1f, 0.2f, 0.3f, 0.4f}, {0.4f, 0.3f, 0.2f}, 3);
test_top_k({0.1f, 0.2f, 0.3f, 0.4f}, {0.4f, 0.3f, 0.2f, 0.1f}, 4);
test_top_k({0.1f, 0.2f, 0.3f, 0.4f}, {0.4f, 0.3f, 0.2f, 0.1f}, 0);
// top-p: arguments are (probs, expected probabilities after sampling, p)
test_top_p({0.1f, 0.2f, 0.3f, 0.4f}, {0.4f}, 0);
test_top_p({0.1f, 0.2f, 0.3f, 0.4f}, {0.4f, 0.3f}, 0.7f);
test_top_p({0.1f, 0.2f, 0.3f, 0.4f}, {0.4f, 0.3f, 0.2f}, 0.8f);
test_top_p({0.1f, 0.2f, 0.3f, 0.4f}, {0.4f, 0.3f, 0.2f, 0.1f}, 1);
// min-p: expected values are written as prob / (sum of the kept probs)
test_min_p({0.1f, 0.2f, 0.3f, 0.4f}, {0.4f/1.0f, 0.3f/1.0f, 0.2f/1.0f, 0.1f/1.0f}, 0.00f);
test_min_p({0.1f, 0.2f, 0.3f, 0.4f}, {0.4f/1.0f, 0.3f/1.0f, 0.2f/1.0f, 0.1f/1.0f}, 0.24f);
test_min_p({0.1f, 0.2f, 0.3f, 0.4f}, {0.4f/0.9f, 0.3f/0.9f, 0.2f/0.9f}, 0.26f);
test_min_p({0.1f, 0.2f, 0.3f, 0.4f}, {0.4f/0.9f, 0.3f/0.9f, 0.2f/0.9f}, 0.49f);
test_min_p({0.1f, 0.2f, 0.3f, 0.4f}, {0.4f/0.7f, 0.3f/0.7f}, 0.51f);
test_min_p({0.1f, 0.2f, 0.3f, 0.4f}, {0.4f/0.7f, 0.3f/0.7f}, 0.74f);
test_min_p({0.1f, 0.2f, 0.3f, 0.4f}, {0.4f/0.4f}, 0.76f);
test_min_p({0.1f, 0.2f, 0.3f, 0.4f}, {0.4f/0.4f}, 1.00f);
// tail-free sampling: arguments are (probs, expected probabilities, z)
test_tfs({0.1f, 0.15f, 0.2f, 0.25f, 0.3f}, {0.3f}, 0.25f);
test_tfs({0.1f, 0.15f, 0.2f, 0.25f, 0.3f}, {0.3f, 0.25f}, 0.75f);
test_tfs({0.1f, 0.15f, 0.2f, 0.25f, 0.3f}, {0.3f, 0.25f}, 0.99f);
// typical sampling: arguments are (probs, expected probabilities, p)
test_typical({0.97f, 0.01f, 0.01f, 0.01f}, {0.97f}, 0.5f);
test_typical({0.4f, 0.2f, 0.2f, 0.2f}, {0.2f, 0.2f, 0.2f}, 0.5f);
// repetition penalties: arguments are (probs, last tokens, expected probabilities,
// repeat_penalty, alpha_frequency, alpha_presence)
test_repetition_penalties({0.2f, 0.2f, 0.2f, 0.2f, 0.2f}, {0}, {0.25f, 0.25f, 0.25f, 0.25f, 0}, 50.0f, 0.0f, 0.0f);
test_repetition_penalties({0.2f, 0.2f, 0.2f, 0.2f, 0.2f}, {0, 1, 2}, {0.5f, 0.5f, 0, 0, 0}, 50.0f, 0.0f, 0.0f);
test_repetition_penalties({0.2f, 0.2f, 0.2f, 0.2f, 0.2f}, {0, 1, 2, 0, 0}, {0.5f, 0.5f, 0, 0, 0}, 50.0f, 0.0f, 0.0f);
test_repetition_penalties({0.2f, 0.2f, 0.2f, 0.2f, 0.2f}, {0}, {0.249997f, 0.249997f, 0.249997f, 0.249997f, 0.000011f}, 1.0f, 5.0f, 5.0f);
test_repetition_penalties({0.2f, 0.2f, 0.2f, 0.2f, 0.2f}, {0, 1, 2}, {0.499966f, 0.499966f, 0.000023f, 0.000023f, 0.000023f}, 1.0f, 5.0f, 5.0f);
test_repetition_penalties({0.2f, 0.2f, 0.2f, 0.2f, 0.2f}, {0, 1, 2, 0, 0}, {0.499977f, 0.499977f, 0.000023f, 0.000023f, 0.000000f}, 1.0f, 5.0f, 5.0f);
// sampler queues: arguments are (n_vocab, sampler sequence, top_k, top_p, min_p);
// single samplers first, then every ordering of two and three samplers
test_sampler_queue(10000, "k", 10000, 1.0f, 1.0f);
test_sampler_queue(10000, "k", 1, 1.0f, 1.0f);
test_sampler_queue(10000, "p", 10000, 1.0f, 1.0f);
test_sampler_queue(10000, "p", 10000, 0.0f, 1.0f);
test_sampler_queue(10000, "m", 10000, 1.0f, 1.0f);
test_sampler_queue(10000, "m", 10000, 1.0f, 1e-12);
test_sampler_queue(10000, "k", 100, 1.0000f, 1.0f);
test_sampler_queue(10000, "p", 10000, 0.0002f, 1.0f);
test_sampler_queue(10000, "p", 10000, 0.8000f, 1.0f);
test_sampler_queue(10000, "m", 10000, 1.0000f, 9997.9f/9999.0f);
test_sampler_queue(10000, "m", 10000, 1.0000f, 0.1f);
test_sampler_queue(10000, "kp", 100, 0.8f, 0.1f);
test_sampler_queue(10000, "km", 100, 0.8f, 0.1f);
test_sampler_queue(10000, "pk", 100, 0.8f, 0.1f);
test_sampler_queue(10000, "pm", 100, 0.8f, 0.1f);
test_sampler_queue(10000, "mk", 100, 0.8f, 0.1f);
test_sampler_queue(10000, "mp", 100, 0.8f, 9997.9f/9999.0f);
test_sampler_queue(10000, "mp", 100, 0.8f, 0.1f);
test_sampler_queue(10000, "kpm", 100, 0.8f, 0.1f);
test_sampler_queue(10000, "kmp", 100, 0.8f, 0.1f);
test_sampler_queue(10000, "pkm", 100, 0.8f, 0.1f);
test_sampler_queue(10000, "pmk", 100, 0.8f, 0.1f);
test_sampler_queue(10000, "mkp", 100, 0.8f, 0.1f);
test_sampler_queue(10000, "mpk", 100, 0.8f, 0.1f);
printf("OK\n");
return 0;
}

View file

@ -1,292 +0,0 @@
#include "llama.h"
#include "common.h"
#include "console.h"
#include <cstdio>
#include <string>
#include <map>
#include <vector>
#include <fstream>
//static const std::map<std::string, std::vector<llama_token>> & k_tests() {
// static std::map<std::string, std::vector<llama_token>> _k_tests = {
// { "" , { }, },
// { " " , { 220, }, },
// { " " , { 256, }, },
// { " " , { 262, }, },
// { "\t" , { 197, }, },
// { "\n" , { 198, }, },
// { "\n\n" , { 271, }, },
// { "\n\n\n" , { 1432, }, },
// { "\t\n" , { 1602, }, },
// { "Hello world" , { 9906, 1917, }, },
// { " Hello world" , { 22691, 1917, }, },
// { "Hello World" , { 9906, 4435, }, },
// { " Hello World" , { 22691, 4435, }, },
// { " Hello World!" , { 22691, 4435, 0, }, },
// { "Hello, world!" , { 9906, 11, 1917, 0, }, },
// { " Hello, world!" , { 22691, 11, 1917, 0, }, },
// { " this is 🦙.cpp" , { 420, 374, 11410, 99, 247, 13, 11055, }, },
// { "w048 7tuijk dsdfhu" , { 86, 23904, 220, 22, 83, 2005, 42908, 11729, 3013, 17156, }, },
// { "нещо на Български" , { 79862, 102118, 13373, 64571, 34694, 3114, 112203, 80112, }, },
// { "កាន់តែពិសេសអាចខលចេញ" , { 21549, 222, 98629, 241, 45358, 233, 21549, 237, 45358, 224, 21549, 244, 21549, 115, 21549, 253, 45358, 223, 21549, 253, 21549, 95, 98629, 227, 21549, 223, 21549, 249, 21549, 227, 45358, 223, 21549, 231, }, },
// { "🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ (only emoji that has its own token)", { 9468, 248, 222, 320, 8416, 8, 27623, 114, 102470, 9468, 234, 104, 31643, 320, 36773, 100166, 98634, 8, 26602, 227, 320, 3323, 43465, 430, 706, 1202, 1866, 4037, 8, }, },
// { "Hello" , { 9906, }, },
// { " Hello" , { 22691, }, },
// { " Hello" , { 220, 22691, }, },
// { " Hello" , { 256, 22691, }, },
// { " Hello" , { 262, 22691, }, },
// { " Hello\n Hello" , { 262, 22691, 198, 262, 22691, }, },
// { " (" , { 320, }, },
// { "\n =" , { 198, 284, }, },
// { "' era" , { 6, 11639, }, },
// { "Hello, y'all! How are you 😁 ?我想在apple工作1314151天", { 9906, 11, 379, 65948, 0, 2650, 527, 499, 27623, 223, 949, 37046, 101067, 19000, 23182, 102301, 9263, 18136, 16, 36827, 21909, }, },
// { "3" , { 18, }, },
// { "33" , { 1644, }, },
// { "333" , { 8765, }, },
// { "3333" , { 8765, 18, }, },
// { "33333" , { 8765, 1644, }, },
// { "333333" , { 8765, 8765, }, },
// { "3333333" , { 8765, 8765, 18, }, },
// { "33333333" , { 8765, 8765, 1644, }, },
// { "333333333" , { 8765, 8765, 8765, }, },
// };
//
// return _k_tests;
//}
// Test table: maps an input text to the token ids expected for it.
using llama_tests = std::map<std::string, std::vector<llama_token>>;
// Loads tokenizer test cases from a pair of files.
//   fname_inp - test inputs, separated by the line "__ggml_vocab_test__"
//   fname_out - one line per test with the expected token ids separated by spaces
// Returns the input -> expected tokens table. An empty table is returned (after
// printing an error to stderr) when a file cannot be opened, when the two files
// hold a different number of tests, or when an expected token id is not a
// valid integer.
static llama_tests read_tests(const std::string & fname_inp, const std::string & fname_out) {
    llama_tests tests;

    std::ifstream ifs_inp(fname_inp);
    if (!ifs_inp) {
        fprintf(stderr, "%s : error: could not open file '%s'\n", __func__, fname_inp.c_str());
        return tests;
    }

    // whole input file as one string
    std::string sraw((std::istreambuf_iterator<char>(ifs_inp)), std::istreambuf_iterator<char>());

    std::ifstream ifs_out(fname_out);
    if (!ifs_out) {
        fprintf(stderr, "%s : error: could not open file '%s'\n", __func__, fname_out.c_str());
        return tests;
    }

    std::vector<std::string> sout;
    for (std::string line; std::getline(ifs_out, line);) {
        sout.push_back(line);
    }

    const std::string sep = "\n__ggml_vocab_test__\n";

    // split the raw input on the separator
    std::vector<std::string> sinp;

    size_t pos = 0;
    while (pos < sraw.size()) {
        const size_t next = sraw.find(sep, pos);
        if (next == std::string::npos) {
            sinp.push_back(sraw.substr(pos));
            break;
        }
        sinp.push_back(sraw.substr(pos, next - pos));
        pos = next + sep.size();
    }

    if (sinp.size() != sout.size()) {
        fprintf(stderr, "%s : error: input and output files have different number of tests\n", __func__);
        return tests;
    }

    for (size_t i = 0; i < sinp.size(); ++i) {
        const std::string & s = sinp[i];
        const std::string & o = string_strip(sout[i]);

        std::vector<llama_token> toks;

        size_t tok_pos = 0;
        while (tok_pos < o.size()) {
            size_t tok_end = o.find(' ', tok_pos);
            if (tok_end == std::string::npos) {
                tok_end = o.size();
            }
            // consecutive spaces produce an empty field; skip it instead of
            // handing an empty string to std::stoi
            if (tok_end > tok_pos) {
                const std::string stok = o.substr(tok_pos, tok_end - tok_pos);
                try {
                    toks.push_back(std::stoi(stok));
                } catch (const std::exception &) {
                    fprintf(stderr, "%s : error: invalid token id '%s' in test %zu of '%s'\n",
                            __func__, stok.c_str(), i, fname_out.c_str());
                    return llama_tests();
                }
            }
            tok_pos = tok_end + 1;
        }

        tests[s] = toks;
    }

    return tests;
}
int main(int argc, char **argv) {
if (argc < 2) {
fprintf(stderr, "Usage: %s vocab-file [text-file]\n", argv[0]);
return 1;
}
const std::string fname = argv[1];
const std::string fname_inp = fname + ".inp";
const std::string fname_out = fname + ".out";
std::string fname_text;
if (argc > 2) {
fname_text = argv[2];
}
fprintf(stderr, "%s : reading vocab from: '%s'\n", __func__, fname.c_str());
llama_model * model;
llama_context * ctx;
llama_backend_init();
// load the vocab
{
auto mparams = llama_model_default_params();
mparams.vocab_only = true;
model = llama_load_model_from_file(fname.c_str(), mparams);
if (model == NULL) {
fprintf(stderr, "%s: error: failed to load vocab '%s'\n", __func__, fname.c_str());
return 1;
}
auto cparams = llama_context_default_params();
ctx = llama_new_context_with_model(model, cparams);
if (ctx == NULL) {
fprintf(stderr, "%s: error: failed to load vocab '%s'\n", __func__, fname.c_str());
llama_free_model(model);
return 1;
}
}
#ifdef _WIN32
// We need this for unicode console support
console::init(false, false);
atexit([]() { console::cleanup(); });
#endif
bool success = true;
const auto k_tests = [&]() -> llama_tests {
if (!fname_text.empty()) {
return {};
}
const auto res = read_tests(fname_inp, fname_out);
if (res.empty()) {
fprintf(stderr, "%s : error: no tests found\n", __func__);
exit(1);
}
return res;
}();
const bool add_special = false;
for (const auto & test_kv : k_tests) {
const std::vector<llama_token> res = llama_tokenize(ctx, test_kv.first, add_special, false);
printf("\n");
printf("src: '%s'\n", test_kv.first.c_str());
printf("res: '%s'\n", llama_detokenize(ctx, res).c_str());
printf("tok: ");
for (const auto & tok : res) {
printf("%d ", tok);
}
printf("\n");
bool correct = res.size() == test_kv.second.size();
for (int i = 0; i < (int) res.size() && correct; ++i) {
if (test_kv.second[i] != res[i]) {
correct = false;
}
}
if (!correct) {
fprintf(stderr, "%s : failed test: '%s'\n", __func__, test_kv.first.c_str());
fprintf(stderr, "%s : detokenized to: '%s' instead of '%s'\n", __func__,
llama_detokenize(ctx, res).c_str(),
llama_detokenize(ctx, test_kv.second).c_str());
fprintf(stderr, "%s : expected tokens: ", __func__);
for (const auto & t : test_kv.second) {
fprintf(stderr, "%6d '%s', ", t, llama_token_to_piece(ctx, t).c_str());
}
fprintf(stderr, "\n");
fprintf(stderr, "%s : got tokens: ", __func__);
for (const auto & t : res) {
fprintf(stderr, "%6d '%s', ", t, llama_token_to_piece(ctx, t).c_str());
}
fprintf(stderr, "\n");
success = false;
}
}
if (!fname_text.empty()) {
fprintf(stderr, "%s : tokenizing: '%s'\n", __func__, fname_text.c_str());
std::string text;
{
std::ifstream ifs(fname_text);
if (!ifs) {
fprintf(stderr, "%s : error: could not open file '%s'\n", __func__, fname_text.c_str());
return 1;
}
text = std::string(std::istreambuf_iterator<char>(ifs), std::istreambuf_iterator<char>());
}
fprintf(stderr, "%s : text size: %zu\n", __func__, text.size());
std::vector<llama_token> res;
{
const auto t_start = ggml_time_us();
res = llama_tokenize(ctx, text, add_special, false);
const auto t_end = ggml_time_us();
fprintf(stderr, "%s : tokenized in %.3f ms (cpp)\n", __func__, (t_end - t_start) / 1000.0);
}
fprintf(stderr, "%s : tokens: %zu\n", __func__, res.size());
{
const std::string fname_out = fname_text + ".tokcpp";
std::ofstream ofs(fname_out);
if (!ofs) {
fprintf(stderr, "%s : error: could not open file '%s'\n", __func__, fname_out.c_str());
return 1;
}
for (const auto & tok : res) {
//ofs << tok << " '" << string_strip(llama_detokenize(ctx, std::vector<int>{tok})) << "'" << std::endl;
ofs << tok << "\n";
}
}
fprintf(stderr, "%s : tokens written to '%s'\n", __func__, (fname_text + ".tokcpp").c_str());
}
llama_free_model(model);
llama_free(ctx);
llama_backend_free();
printf("\n");
printf("Tests %s\n", success ? "passed" : "failed");
return success ? 0 : 3;
}

View file

@ -1,46 +0,0 @@
import time
import argparse
from transformers import AutoTokenizer
# Command line: the tokenizer directory (positional) and the text file to tokenize.
parser = argparse.ArgumentParser()
parser.add_argument("dir_tokenizer", help="directory containing 'tokenizer.model' file")
parser.add_argument("--fname-tok", help="path to a text file to tokenize", required=True)
args = parser.parse_args()

dir_tokenizer, fname_tok = args.dir_tokenizer, args.fname_tok

tokenizer = AutoTokenizer.from_pretrained(dir_tokenizer)

print('tokenizing file: ', fname_tok) # noqa: NP100

# The token ids are written next to the input file, with a '.tok' suffix.
fname_out = fname_tok + '.tok'

with open(fname_tok, 'r', encoding='utf-8') as src:
    lines = src.readlines()
s = ''.join(lines)

# Only the encode call itself is timed.
t_start = time.time()
res = tokenizer.encode(s, add_special_tokens=False)
t_end = time.time()
elapsed_ms = 1000.0 * (t_end - t_start)
print('\nmain : tokenized in', f"{elapsed_ms:.3f}", 'ms (py)') # noqa: NP100

# One token id per line. Decoded token text is deliberately not written: an
# earlier variant did so and noted that LLaMA v3 strips the leading space when
# decoding some tokens (e.g. ids 662, 1174, 2564, 758, 949, 5354).
with open(fname_out, 'w', encoding='utf-8') as dst:
    dst.writelines(str(x) + '\n' for x in res)

print('len(res): ', len(res)) # noqa: NP100
print('len(lines): ', len(lines)) # noqa: NP100
print('results written to: ', fname_out) # noqa: NP100

View file

@ -1,41 +0,0 @@
#!/bin/bash
#
# Usage:
#
#   test-tokenizer-0.sh <name> <input>
#
# Tokenizes <input> twice - with the Python AutoTokenizer found under
# ./models/tokenizers/<name> and with the llama.cpp test binary using
# ./models/ggml-vocab-<name>.gguf - then diffs the two token dumps
# (<input>.tok vs <input>.tokcpp) and prints the timing line of each run.
#
# All expansions are quoted so that names and paths containing spaces or glob
# characters are passed through unchanged.
#

if [ $# -ne 2 ]; then
    # $0 is passed as an argument, not spliced into the format string, so a '%'
    # or backslash in the script path cannot corrupt the message.
    printf "Usage: %s <name> <input>\n" "$0"
    exit 1
fi

name=$1
input=$2

# Build step runs before 'set -e': a failing build does not abort the script.
make -j tests/test-tokenizer-0

printf "Testing %s on %s ...\n" "$name" "$input"

set -e

printf "Tokenizing using (py) Python AutoTokenizer ...\n"
python3 ./tests/test-tokenizer-0.py "./models/tokenizers/$name" --fname-tok "$input" > "/tmp/test-tokenizer-0-$name-py.log" 2>&1

printf "Tokenizing using (cpp) llama.cpp ...\n"
./tests/test-tokenizer-0 "./models/ggml-vocab-$name.gguf" "$input" > "/tmp/test-tokenizer-0-$name-cpp.log" 2>&1

cat "/tmp/test-tokenizer-0-$name-py.log" | grep "tokenized in"
cat "/tmp/test-tokenizer-0-$name-cpp.log" | grep "tokenized in"

# From here on a non-zero status (diff reporting differences) is expected.
set +e

diff "$input.tok" "$input.tokcpp" > /dev/null 2>&1

if [ $? -eq 0 ]; then
    printf "Tokenization is correct!\n"
else
    # Show only the beginning of the diff to keep the output readable.
    diff "$input.tok" "$input.tokcpp" | head -n 32

    printf "Tokenization differs!\n"
fi

View file

@ -1,152 +0,0 @@
#include "llama.h"
#include "common.h"
#include "unicode.h"
#include "console.h"
#include <cassert>
#include <codecvt>
#include <cstdio>
#include <cstring>
#include <locale>
#include <string>
#include <thread>
#include <vector>
#include <atomic>
// Round-trip test for BPE vocabularies.
//
// Usage: <prog> <vocab-file> [--ignore-merges]
//
// 1. Every token id is detokenized, re-tokenized and detokenized again; the two
//    texts must match. With --ignore-merges, each single-token text must also
//    tokenize back to exactly one token.
// 2. Every Unicode codepoint outside the skipped ranges is encoded as UTF-8,
//    tokenized and detokenized; the text must survive unchanged. The codepoints
//    are distributed round-robin over the worker threads.
//
// Returns 0 on success, 1 on usage/load errors, 99 if the vocab is not BPE,
// 2 on a token round-trip failure and 3 on a codepoint round-trip failure.
int main(int argc, char **argv) {
    if (argc < 2 || argc > 3) {
        fprintf(stderr, "Usage: %s <vocab-file> [--ignore-merges]\n", argv[0]);
        return 1;
    }

    const std::string fname = argv[1];
    bool ignore_merges = false;
    if (argc == 3) {
        if (std::strcmp(argv[2], "--ignore-merges") != 0) {
            fprintf(stderr, "Usage: %s <vocab-file> [--ignore-merges]\n", argv[0]);
            return 1;
        }
        ignore_merges = true;
    }

    fprintf(stderr, "%s : reading vocab from: '%s'\n", __func__, fname.c_str());

    if (ignore_merges) {
        fprintf(stderr, "%s : ignoring merges for tokens inside vocab\n", __func__);
    }

    llama_model * model;
    llama_context * ctx;

    llama_backend_init();

    // load the vocab
    {
        auto mparams = llama_model_default_params();

        mparams.vocab_only = true;

        model = llama_load_model_from_file(fname.c_str(), mparams);

        if (model == NULL) {
            fprintf(stderr, "%s: error: failed to load vocab '%s'\n", __func__, fname.c_str());
            llama_backend_free();
            return 1;
        }

        auto cparams = llama_context_default_params();

        ctx = llama_new_context_with_model(model, cparams);

        if (ctx == NULL) {
            fprintf(stderr, "%s: error: failed to load vocab '%s'\n", __func__, fname.c_str());
            llama_free_model(model);
            llama_backend_free();
            return 1;
        }
    }

    // Releases everything acquired above; the context goes before its model.
    // Every exit path below calls this so nothing is leaked on failure.
    const auto release = [&]() {
        llama_free(ctx);
        llama_free_model(model);
        llama_backend_free();
    };

    //GGML_ASSERT(llama_vocab_type(model) == LLAMA_VOCAB_TYPE_BPE);
    if (llama_vocab_type(model) != LLAMA_VOCAB_TYPE_BPE) {
        release();
        return 99;
    }

#ifdef _WIN32
    // We need this for unicode console support
    console::init(false, false);
    atexit([]() { console::cleanup(); });
#endif

    const int n_vocab = llama_n_vocab(model);

    for (int i = 0; i < n_vocab; ++i) {
        std::string str = llama_detokenize(ctx, std::vector<int>(1, i));
        try {
            // Called for its side effect only: it throws std::invalid_argument
            // on text that is not valid UTF-8, which skips this token.
            auto cps = unicode_cpts_from_utf8(str);
            std::vector<llama_token> tokens = llama_tokenize(ctx, str, false, true);
            if (ignore_merges && tokens.size() > 1) {
                fprintf(stderr,
                        "%s : error: token %d detokenizes to '%s'(%zu) but "
                        "tokenization of this to multiple tokens: [",
                        __func__, i, str.c_str(), str.length());
                fprintf(stderr, "%d", tokens[0]);
                for (size_t j = 1; j < tokens.size(); j++) {
                    fprintf(stderr, ", %d", tokens[j]);
                }
                fprintf(stderr, "]\n");
                release();
                return 2;
            }
            std::string check = llama_detokenize(ctx, tokens);
            if (check != str) {
                fprintf(stderr, "%s : error: token %d detokenizes to '%s'(%zu) but tokenization of this detokenizes to '%s'(%zu)\n",
                    __func__, i, str.c_str(), str.length(), check.c_str(), check.length());
                release();
                return 2;
            }
        }
        catch (const std::invalid_argument &) {
            //fprintf(stderr, "%s : info: utf8 conversion %d '%s'\n", __func__, i, str.c_str());
        }
    }

    // unicode
    {
        // hardware_concurrency() may return 0 when the value cannot be
        // determined; fall back to one worker so the sweep is never skipped.
        const unsigned int n_hw = std::thread::hardware_concurrency();
        const int nthread = n_hw > 0 ? (int) n_hw : 1;

        std::vector<std::thread> threads(nthread);

        // First failure code; also tells the other workers to stop early.
        std::atomic_int errcode = {};

        for (int i = 0; i < nthread; ++i) {
            threads[i] = std::thread([i, nthread, ctx, &errcode]() {
                // Worker i handles codepoints i, i + nthread, i + 2*nthread, ...
                for (uint32_t cp = i; !errcode && cp < 0x00110000; cp += nthread) {
                    if ((0x0000D800 <= cp && cp <= 0x0000DFFF) || // surrogates \p{Cs}
                        (0x00040000 <= cp && cp <= 0x000E0000)) { // undefined \p{Cn}
                        continue;
                    }

                    std::string str = unicode_cpt_to_utf8(cp);
                    std::vector<llama_token> tokens = llama_tokenize(ctx, str, false);
                    std::string check = llama_detokenize(ctx, tokens);
                    // 9601 == U+2581 is exempt from the comparison.
                    if (cp != 9601 && str != check) {
                        fprintf(stderr, "error: codepoint 0x%x detokenizes to '%s'(%zu) instead of '%s'(%zu)\n",
                                cp, check.c_str(), check.length(), str.c_str(), str.length());
                        errcode = 3;
                    }
                }
            });
        }

        for (auto & t : threads) {
            t.join();
        }

        if (errcode) {
            const int rc = errcode;
            release();
            return rc;
        }
    }

    release();

    return 0;
}

View file

@ -1,122 +0,0 @@
#include "llama.h"
#include "common.h"
#include "unicode.h"
#include "console.h"
#include <cassert>
#include <codecvt>
#include <cstdio>
#include <cstring>
#include <locale>
#include <string>
#include <thread>
#include <vector>
#include <atomic>
// Round-trip test for SPM vocabularies.
//
// Usage: <prog> <vocab-file>
//
// 1. Every token id is detokenized, re-tokenized and detokenized again; the two
//    texts must match.
// 2. Every Unicode codepoint outside the skipped ranges is encoded as UTF-8,
//    tokenized and detokenized; the text must survive unchanged. The codepoints
//    are distributed round-robin over the worker threads.
//
// Returns 0 on success, 1 on usage/load errors, 99 if the vocab is not SPM,
// 2 on a token round-trip failure and 3 on a codepoint round-trip failure.
int main(int argc, char ** argv) {
    if (argc < 2) {
        fprintf(stderr, "Usage: %s <vocab-file>\n", argv[0]);
        return 1;
    }

    const std::string fname = argv[1];

    fprintf(stderr, "%s : reading vocab from: '%s'\n", __func__, fname.c_str());

    llama_model * model;
    llama_context * ctx;

    llama_backend_init();

    // load the vocab
    {
        auto mparams = llama_model_default_params();

        mparams.vocab_only = true;

        model = llama_load_model_from_file(fname.c_str(), mparams);

        if (model == NULL) {
            fprintf(stderr, "%s: error: failed to load vocab '%s'\n", __func__, fname.c_str());
            llama_backend_free();
            return 1;
        }

        auto cparams = llama_context_default_params();

        ctx = llama_new_context_with_model(model, cparams);

        if (ctx == NULL) {
            fprintf(stderr, "%s: error: failed to load vocab '%s'\n", __func__, fname.c_str());
            llama_free_model(model);
            llama_backend_free();
            return 1;
        }
    }

    // Releases everything acquired above; the context goes before its model.
    // Every exit path below calls this so nothing is leaked on failure.
    const auto release = [&]() {
        llama_free(ctx);
        llama_free_model(model);
        llama_backend_free();
    };

    //GGML_ASSERT(llama_vocab_type(model) == LLAMA_VOCAB_TYPE_SPM);
    if (llama_vocab_type(model) != LLAMA_VOCAB_TYPE_SPM) {
        release();
        return 99;
    }

#ifdef _WIN32
    // We need this for unicode console support
    console::init(false, false);
    atexit([]() { console::cleanup(); });
#endif

    const int n_vocab = llama_n_vocab(model);

    for (int i = 0; i < n_vocab; ++i) {
        std::string str = llama_detokenize(ctx, std::vector<int>(1, i), true);
        std::vector<llama_token> tokens = llama_tokenize(ctx, str, false, true);
        std::string check = llama_detokenize(ctx, tokens);
        if (check != str) {
            fprintf(stderr, "%s : error: token %d detokenizes to '%s'(%zu) but tokenization of this detokenizes to '%s'(%zu)\n",
                __func__, i, str.c_str(), str.length(), check.c_str(), check.length());
            release();
            return 2;
        }
    }

    // unicode
    {
        // hardware_concurrency() may return 0 when the value cannot be
        // determined; fall back to one worker so the sweep is never skipped.
        const unsigned int n_hw = std::thread::hardware_concurrency();
        const int nthread = n_hw > 0 ? (int) n_hw : 1;

        std::vector<std::thread> threads(nthread);

        // First failure code; also tells the other workers to stop early.
        std::atomic_int errcode = {};

        for (int i = 0; i < nthread; ++i) {
            threads[i] = std::thread([i, nthread, ctx, &errcode]() {
                // Worker i handles codepoints i, i + nthread, i + 2*nthread, ...
                for (uint32_t cp = i; !errcode && cp < 0x00110000; cp += nthread) {
                    if ((0x0000D800 <= cp && cp <= 0x0000DFFF) || // surrogates \p{Cs}
                        (0x00040000 <= cp && cp <= 0x000E0000)) { // undefined \p{Cn}
                        continue;
                    }

                    std::string str = unicode_cpt_to_utf8(cp);
                    std::vector<llama_token> tokens = llama_tokenize(ctx, str, false, true);
                    std::string check = llama_detokenize(ctx, tokens);
                    // 9601 == U+2581 is exempt from the comparison.
                    if (cp != 9601 && str != check) {
                        fprintf(stderr, "error: codepoint 0x%x detokenizes to '%s'(%zu) instead of '%s'(%zu)\n",
                                cp, check.c_str(), check.length(), str.c_str(), str.length());
                        errcode = 3;
                    }
                }
            });
        }

        for (auto & t : threads) {
            t.join();
        }

        if (errcode) {
            const int rc = errcode;
            release();
            return rc;
        }
    }

    release();

    return 0;
}

View file

@ -1,566 +0,0 @@
# Test libllama tokenizer == AutoTokenizer.
# Brute force random words/text generation.
#
# Sample usage:
#
# python3 tests/test-tokenizer-random.py ./models/ggml-vocab-llama-bpe.gguf ./models/tokenizers/llama-bpe
#
from __future__ import annotations
import time
import logging
import argparse
import subprocess
import random
import unicodedata
from pathlib import Path
from typing import Any, Iterator, cast
from typing_extensions import Buffer
import cffi
from transformers import AutoTokenizer, PreTrainedTokenizer
logger = logging.getLogger("test-tokenizer-random")
class LibLlama:
    """cffi binding of the llama shared library.

    The C declarations are obtained by running ``gcc -E`` over ``llama.h``; the
    preprocessed text is handed to cffi and the shared library is opened with
    ``dlopen``. ``llama_backend_init()`` is called once the library is loaded.
    """

    DEFAULT_PATH_LLAMA_H = "./include/llama.h"
    DEFAULT_PATH_INCLUDES = ["./ggml/include/", "./include/"]
    DEFAULT_PATH_LIBLLAMA = "./build/src/libllama.so"  # CMakeLists.txt: BUILD_SHARED_LIBS ON

    def __init__(self, path_llama_h: str | None = None, path_includes: list[str] = [], path_libllama: str | None = None):
        """Load the library; any falsy argument falls back to the class default."""
        header = path_llama_h if path_llama_h else self.DEFAULT_PATH_LLAMA_H
        include_dirs = path_includes if path_includes else self.DEFAULT_PATH_INCLUDES
        shared_lib = path_libllama if path_libllama else self.DEFAULT_PATH_LIBLLAMA
        self.ffi, self.lib = self._load_libllama_cffi(header, include_dirs, shared_lib)
        self.lib.llama_backend_init()

    def _load_libllama_cffi(self, path_llama_h: str, path_includes: list[str], path_libllama: str) -> tuple[cffi.FFI, Any]:
        """Preprocess the header with gcc, feed it to cffi and dlopen the library.

        Returns the ``(ffi, lib)`` pair.
        """
        preprocess = ["gcc", "-O0", "-E", "-P", "-D__restrict=", "-D__attribute__(x)=", "-D__asm__(x)="]
        preprocess.extend("-I" + include_dir for include_dir in path_includes)
        preprocess.append(path_llama_h)
        completed = subprocess.run(preprocess, stdout=subprocess.PIPE)
        assert (completed.returncode == 0)
        source = completed.stdout.decode()
        ffi = cffi.FFI()
        # workarounds for pycparser: declare the va_list builtin and replace the
        # sizeof expressions below with their numeric value
        source = "typedef struct { } __builtin_va_list;" + "\n" + source
        sizeof_rewrites = (
            ("sizeof (int)", "int"),
            ("sizeof (void *)", "void*"),
            ("sizeof (size_t)", "size_t"),
            ("sizeof(int32_t)", "int32_t"),
        )
        for c_expression, c_type in sizeof_rewrites:
            source = source.replace(c_expression, str(ffi.sizeof(c_type)))
        ffi.cdef(source, override=True)
        lib = ffi.dlopen(path_libllama)
        return (ffi, lib)

    def model_default_params(self, **kwargs):
        """Return the library's default model params with ``kwargs`` applied as fields."""
        params = self.lib.llama_model_default_params()
        for field, value in kwargs.items():
            setattr(params, field, value)
        return params

    def context_default_params(self, **kwargs):
        """Return the library's default context params with ``kwargs`` applied as fields."""
        params = self.lib.llama_context_default_params()
        for field, value in kwargs.items():
            setattr(params, field, value)
        return params
class LibLlamaModel:
    """A model plus context loaded through a ``LibLlama`` binding.

    Keeps two reusable cffi buffers (token ids and detokenized bytes) that are
    re-allocated when a call reports they are too small.
    """

    def __init__(self, libllama: LibLlama, path_model: str, mparams={}, cparams={}):
        """Load ``path_model`` and create a context.

        ``mparams`` / ``cparams`` given as dicts are turned into the library's
        default parameter structs with those fields overridden; any other value
        is passed to the library as is. Raises RuntimeError if either the model
        or the context cannot be created.
        """
        self.lib: Any = libllama.lib
        self.ffi = libllama.ffi
        if isinstance(mparams, dict):
            mparams = libllama.model_default_params(**mparams)
        self.model = self.lib.llama_load_model_from_file(path_model.encode(), mparams)
        if not self.model:
            raise RuntimeError("error: failed to load model '%s'" % path_model)
        if isinstance(cparams, dict):
            cparams = libllama.context_default_params(**cparams)
        self.ctx = self.lib.llama_new_context_with_model(self.model, cparams)
        if not self.ctx:
            raise RuntimeError("error: failed to create context for model '%s'" % path_model)
        capacity = self.lib.llama_n_ctx(self.ctx)
        self.token_ids = self.ffi.new("llama_token[]", capacity)
        self.text_buff = self.ffi.new("uint8_t[]", 1024)

    def free(self):
        """Free the context, then the model, and drop all references to the library."""
        if self.ctx:
            self.lib.llama_free(self.ctx)
        if self.model:
            self.lib.llama_free_model(self.model)
        self.ctx = self.model = self.lib = None

    def tokenize(self, text: str, add_special: bool = False, parse_special: bool = False) -> list[int]:
        """Tokenize ``text`` (encoded as UTF-8) and return the token ids."""
        raw: bytes = text.encode("utf-8")

        def attempt() -> int:
            return self.lib.llama_tokenize(self.model, raw, len(raw), self.token_ids, len(self.token_ids), add_special, parse_special)

        count = attempt()
        # A negative result makes the buffer grow to twice its magnitude before retrying.
        while count < 0 and len(self.token_ids) < (16 << 20):
            self.token_ids = self.ffi.new("llama_token[]", -2 * count)
            count = attempt()
        return list(self.token_ids[:count])

    def detokenize(self, ids: list[int], remove_special: bool = False, unparse_special: bool = False) -> str:
        """Convert token ids back to text; undecodable bytes become U+FFFD."""
        n_ids = len(ids)
        if len(self.token_ids) < n_ids:
            self.token_ids = self.ffi.new("llama_token[]", 2 * n_ids)
        for slot, token in enumerate(ids):
            self.token_ids[slot] = token

        def attempt() -> int:
            return self.lib.llama_detokenize(self.model, self.token_ids, n_ids, self.text_buff, len(self.text_buff), remove_special, unparse_special)

        size = attempt()
        # A negative result makes the buffer grow to twice its magnitude before retrying.
        while size < 0 and len(self.text_buff) < (16 << 20):
            self.text_buff = self.ffi.new("uint8_t[]", -2 * size)
            size = attempt()
        raw_text = cast(Buffer, self.ffi.buffer(self.text_buff, size))
        return str(raw_text, encoding="utf-8", errors="replace")  # replace errors with '\uFFFD'
class Tokenizer:
    """Encode/decode interface implemented by the tokenizer wrappers in this file."""

    def encode(self, text: str) -> list[int]:
        """Convert text to token ids. Not implemented in the base class."""
        raise NotImplementedError

    def decode(self, ids: list[int]) -> str:
        """Convert token ids to text. Not implemented in the base class."""
        raise NotImplementedError
class TokenizerGroundtruth(Tokenizer):
    """Reference tokenizer backed by ``AutoTokenizer.from_pretrained``."""

    def __init__(self, dir_tokenizer: str):
        """Load the tokenizer from ``dir_tokenizer`` and cache vocab and special-token data."""
        self.model: PreTrainedTokenizer = AutoTokenizer.from_pretrained(dir_tokenizer)

        # guess BOS and EOS: encode a single character and look at what surrounds it
        probe = self.encode("a")
        assert 1 <= len(probe) <= 3
        has_extra = len(probe) > 1
        guessed_bos = has_extra and self.model.bos_token_id == probe[0]
        guessed_eos = has_extra and self.model.eos_token_id == probe[-1]
        # attributes on the loaded tokenizer take precedence over the guess
        self.add_bos_token = getattr(self.model, "add_bos_token", guessed_bos)
        self.add_eos_token = getattr(self.model, "add_eos_token", guessed_eos)

        # build vocab: decode every id on its own, then sort the texts
        token_ids = list(self.model.get_vocab().values())
        decoded = self.model.batch_decode(token_ids, skip_special_tokens=True)
        self.vocab = sorted(decoded)

        # tokens and lists
        self.special_tokens = list(self.model.all_special_tokens)
        added_ids = self.model.added_tokens_encoder.values()
        self.added_tokens = self.model.batch_decode(added_ids, skip_special_tokens=False)
        self.bos_token = self.model.bos_token
        self.eos_token = self.model.eos_token

    def encode(self, text: str) -> list[int]:
        """Encode ``text`` with special tokens enabled."""
        return self.model.encode(text, add_special_tokens=True)

    def decode(self, ids: list[int]) -> str:
        """Decode ``ids`` keeping special tokens in the text."""
        return self.model.decode(ids, skip_special_tokens=False)
class TokenizerLlamaCpp(Tokenizer):
    """Tokenizer backed by the llama shared library (vocab-only model)."""

    # Binding shared by all instances; created by the first constructor call.
    libllama: LibLlama | None = None

    def __init__(self, vocab_file: str):
        """Load ``vocab_file`` as a vocab-only model with a 4096-token context."""
        # Assign on the class, not on ``self``: an instance attribute would leave
        # the class-level cache empty, so every new tokenizer would build a new
        # LibLlama (preprocess the header, dlopen, backend init) again.
        if not TokenizerLlamaCpp.libllama:
            TokenizerLlamaCpp.libllama = LibLlama()
        self.model = LibLlamaModel(TokenizerLlamaCpp.libllama, vocab_file, mparams=dict(vocab_only=True), cparams=dict(n_ctx=4096))

    def encode(self, text: str) -> list[int]:
        """Tokenize ``text`` with special tokens added and parsed."""
        return self.model.tokenize(text, add_special=True, parse_special=True)

    def decode(self, ids: list[int]) -> str:
        """Detokenize ``ids``, keeping special tokens and rendering them as text."""
        return self.model.detokenize(ids, remove_special=False, unparse_special=True)
def generator_custom_text() -> Iterator[str]:
    """General tests"""
    # Fixed hand-written samples: whitespace, greetings with different leading
    # spaces, non-Latin scripts, emoji, punctuation and digit runs.
    samples = (
        "",
        " ",
        " ",
        " ",
        "\t",
        "\n",
        "\n\n",
        "\n\n\n",
        "\t\n",
        "Hello world",
        " Hello world",
        "Hello World",
        " Hello World",
        " Hello World!",
        "Hello, world!",
        " Hello, world!",
        " this is 🦙.cpp",
        "w048 7tuijk dsdfhu",
        "нещо на Български",
        "កាន់តែពិសេសអាចខលចេញ",
        "🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ (only emoji that has its own token)",
        "Hello",
        " Hello",
        " Hello",
        " Hello",
        " Hello",
        " Hello\n Hello",
        " (",
        "\n =",
        "' era",
        "Hello, y'all! How are you 😁 ?我想在apple工作1314151天",
        "3",
        "33",
        "333",
        "3333",
        "33333",
        "333333",
        "3333333",
        "33333333",
        "333333333",
    )
    for sample in samples:
        yield sample
# Yields a fixed list of short strings, in the order written below.
# The trailing comment on each entry records what the string was associated
# with when it was added (a unicode range table entry or a model name);
# entries without a comment carry no such note.
# NOTE(review): several literals contain unusual or possibly non-printing
# characters - edit them with an editor that shows raw code points.
def generator_custom_text_edge_cases() -> Iterator[str]:
"""Edge cases found while debugging"""
yield from [
'\x1f-a', # unicode_ranges_control, {0x00001C, 0x00001F}
'¼-a', # unicode_ranges_digit, 0x00BC
'½-a', # unicode_ranges_digit, 0x00BD
'¾-a', # unicode_ranges_digit, 0x00BE
'a b', # unicode_ranges_digit, 0x3007
'Ⅵ-a', # unicode_ranges_digit, {0x00002150, 0x0000218F} // Number Forms
'\uFEFF//', # unicode_ranges_control, 0xFEFF (BOM)
'Cửa Việt', # llama-3, ignore_merges = true
'<s>a', # Phi-3 fail
'<unk><|endoftext|><s>', # Phi-3 fail
'a\na', # bert fail
'"`', # falcon
' \u2e4e', # falcon
'\n\x0b ', # falcon
'a\xa0\xa0\x00b', # jina-v2-es
'one <mask>', # jina-v2-es <mask> lstrip=true
'a </s> b', # rstrip phi-3
'a <mask> b', # lstrip jina-v2
'\xa0aC', # deepseek
'\u2029 \uA3E4', # deepseek-llm
"a ?",
'', # mpt
'\U000ac517', # utf-8 encode error, falcon
'\U000522f4', # utf-8 encode error, starcoder
"<s><s><unk><s>a<s>b<s>c<unk>d<unk></s>",
"<s> <s> <unk><s>a<s>b<s>c<unk>d<unk></s>",
]
def generator_vocab_words(tokenizer: TokenizerGroundtruth) -> Iterator[str]:
    """Brute force check all vocab words"""
    # Every entry of the tokenizer's vocab list, in list order.
    for word in tokenizer.vocab:
        yield word
def generator_ascii_lr_strip() -> Iterator[str]:
    """Pairs of ASCII characters combined with left/right whitespace padding.

    For every pair drawn from chr(1)..chr(0x7f) plus the empty string, and every
    combination of left and right padding, three arrangements are produced.
    """
    paddings = ["", " ", " "]
    symbols = [chr(code) for code in range(1, 0x80)]
    symbols.append("")
    for first in symbols:
        for second in symbols:
            for left in paddings:
                for right in paddings:
                    yield left + first + second + right
                    yield left + first + right + second
                    yield first + left + second + right
def generator_apostrophe() -> Iterator[str]:
    """Apostrophes placed between ASCII characters with optional padding.

    For every pair drawn from chr(1)..chr(0x7f) plus the empty string, and every
    combination of padding before and after the apostrophe, three arrangements
    are produced.
    """
    paddings = ["", " ", " "]
    symbols = [chr(code) for code in range(1, 0x80)]
    symbols.append("")
    for first in symbols:
        for second in symbols:
            for before in paddings:
                for after in paddings:
                    quoted = before + "'" + after
                    yield first + quoted + second
                    yield first + second + quoted + "z"
                    yield "a" + quoted + first + second
def generator_added_lr_strip(tokenizer: TokenizerGroundtruth) -> Iterator[str]:
    """Special/added tokens wrapped in whitespace, with and without neighbours.

    Each distinct token (sorted) is combined with every left/right whitespace
    pair, and emitted bare, prefixed with "a", suffixed with "z", and both.
    """
    paddings = ["", " ", " ", "\n", "\r\n", "\n\n", "\t", "\t\t"]
    unique_tokens = sorted(set(tokenizer.special_tokens + tokenizer.added_tokens))
    for token in unique_tokens:
        for left in paddings:
            for right in paddings:
                core = left + token + right
                yield core
                yield "a" + core
                yield core + "z"
                yield "a" + core + "z"
def generator_random_added_tokens(tokenizer: TokenizerGroundtruth, iterations=100) -> Iterator[str]:
    """Random concatenations of special/added tokens and a few separators.

    Each iteration seeds the generator with the iteration index and joins 500
    randomly chosen pieces. Runs of BOS at the start and EOS at the end are
    collapsed to a single token, and that token is dropped as well when the
    tokenizer adds it on its own.
    """
    separations = [" ", "\n", "\t", "-", "!", "one", "1", "<s>", "</s>"]
    all_tokens = list(sorted(set(tokenizer.special_tokens + tokenizer.added_tokens + separations)))
    rand = random.Random()
    for m in range(iterations):
        rand.seed(m)
        words = rand.choices(all_tokens, k=500)
        if words and words[0] == tokenizer.bos_token:  # skip spam warning of double BOS
            while len(words) > 1 and words[1] == tokenizer.bos_token:  # leave one starting BOS
                words.pop(0)
            if tokenizer.add_bos_token:  # drop all starting BOS
                words.pop(0)
        if words and words[-1] == tokenizer.eos_token:  # skip spam warning of double EOS
            while len(words) > 1 and words[-2] == tokenizer.eos_token:  # leave one trailing EOS
                words.pop(-1)
            # The trailing EOS is governed by the EOS setting; testing the BOS
            # flag here dropped or kept the token for the wrong tokenizers.
            if tokenizer.add_eos_token:  # drop all trailing EOS
                words.pop(-1)
        yield "".join(words)
def generator_random_chars(iterations=100) -> Iterator[str]:
    """Brute force random text with simple characters"""
    word_count = 400
    # Separator pool; repetition weights the choice towards plain spaces.
    separators = list(" " * 20 + "\n" * 5 + "\r\n" * 5 + "\t" * 5)
    # Distinct characters of the block below, sorted (the line breaks of the
    # literal are part of the set).
    alphabet = sorted(set("""
ABCDEFGHIJKLMNOPQRSTUVWXYZ
abcdefghijklmnopqrstuvwxyz
ÁÉÍÓÚÀÈÌÒÙÂÊÎÔÛÄËÏÖÜ
áéíóúàèìòùâêîôûäëïöü
.-,*/-+ª!"·$%&/()=?¿[]{}<>\\|@#~½¬~;:_
"""))

    rng = random.Random()
    for seed in range(iterations):
        # Seeding with the iteration index makes every text reproducible.
        rng.seed(seed)
        chunks = []
        for _ in range(word_count):
            length = rng.randint(1, 7)
            letters = rng.choices(alphabet, k=length)
            separator = rng.choice(separators)
            chunks.append("".join(letters) + separator)
        yield "".join(chunks)
def generator_unicodes() -> Iterator[str]:
    """Iterate unicode characters"""
    limit = 0x30000  # 0x110000
    # General categories that are left out: unassigned, surrogates, private use.
    excluded = ("Cn", "Cs", "Co")

    kept = []
    for code in range(0, limit):
        if code >= 0x30000:  # upper planes are not covered
            continue
        symbol = chr(code)
        if unicodedata.category(symbol) in excluded:
            continue
        kept.append(symbol)

    # The complete list is built first, then handed out one character at a time.
    for symbol in kept:
        yield symbol
def generator_random_unicodes(iterations=100) -> Iterator[str]:
    """Brute force random text with unicode characters"""
    word_count = 200
    # Separator pool; repetition weights the choice towards plain spaces.
    separators = list(" " * 20 + "\n" * 5 + "\r\n" * 5 + "\t" * 5)
    pool = list(generator_unicodes())

    rng = random.Random()
    for seed in range(iterations):
        # Seeding with the iteration index makes every text reproducible.
        rng.seed(seed)
        chunks = []
        for _ in range(word_count):
            length = rng.randint(1, 7)
            letters = rng.choices(pool, k=length)
            separator = rng.choice(separators)
            chunks.append("".join(letters) + separator)
        yield "".join(chunks)
def generator_random_vocab_chars(tokenizer: TokenizerGroundtruth, iterations=100) -> Iterator[str]:
    """Brute force random text with vocab characters"""
    # Sorted list of every distinct character that occurs in a vocab entry.
    alphabet = sorted({symbol for word in tokenizer.vocab for symbol in word})

    rng = random.Random()
    for seed in range(iterations):
        # Seeding with the iteration index makes every text reproducible.
        rng.seed(seed)
        yield "".join(rng.choices(alphabet, k=1024))
def generator_random_vocab_words(tokenizer: TokenizerGroundtruth, iterations=100) -> Iterator[str]:
    """Brute force random text from vocab words"""
    # Vocab entries with surrounding whitespace removed; each one is emitted on
    # its own before the random texts.
    stripped = [entry.strip() for entry in tokenizer.vocab]
    for entry in stripped:
        yield entry

    rng = random.Random()
    for seed in range(iterations):
        # Seeding with the iteration index makes every text reproducible.
        rng.seed(seed)
        chunks = []
        group_count = rng.randint(300, 400)
        for _ in range(group_count):
            size = rng.randint(1, 3)
            group = rng.choices(stripped, k=size)
            separator = rng.choice(" \n\r\t")
            chunks.append("".join(group) + separator)
        yield "".join(chunks)
# Runs every text from `generator` through both tokenizers and logs mismatches.
#
# For each text:
#   - ids1 / ids2 are the encodings produced by tokenizer1 / tokenizer2;
#   - text1 / text2 are the decodings of ids1 by tokenizer1 / tokenizer2
#     (both decoders receive ids1, so a decode check does not depend on
#     tokenizer2's encoding being right).
# At most MAX_ERRORS encode mismatches and MAX_ERRORS decode mismatches are
# logged; the loop stops once both limits have been reached. Accumulated
# timings are logged at the end. Nothing is returned.
def compare_tokenizers(tokenizer1: TokenizerGroundtruth, tokenizer2: TokenizerLlamaCpp, generator: Iterator[str]):
# Index of the first position where the two sequences differ; -1 if they are
# equal; the length of the shorter one if it is a prefix of the longer one.
def find_first_mismatch(ids1: list[int] | str, ids2: list[int] | str):
for i, (a, b) in enumerate(zip(ids1, ids2)):
if a != b:
return i
if len(ids1) == len(ids2):
return -1
return min(len(ids1), len(ids2))
# True if text2 equals the reference decoding text1, or - after removing the
# BOS prefix / EOS suffix text that tokenizer1 adds - equals the source text.
def check_detokenizer(text: str, text1: str, text2: str) -> bool:
if text1 == text2: # equal to TokenizerGroundtruth?
return True
# equal to source text?
if tokenizer1.add_bos_token: # remove BOS
if text2.startswith(tokenizer1.bos_token):
text2 = text2[len(tokenizer1.bos_token):]
if tokenizer1.add_eos_token: # remove EOS
if text2.endswith(tokenizer1.eos_token):
text2 = text2[:-len(tokenizer1.eos_token)]
return text == text2
# Accumulated wall-clock time per operation, in seconds.
t_encode1 = 0
t_encode2 = 0
t_decode1 = 0
t_decode2 = 0
t_start = time.perf_counter()
encode_errors = 0
decode_errors = 0
MAX_ERRORS = 10
logger.info("%s: %s" % (generator.__qualname__, "ini"))
for text in generator:
# print(repr(text), text.encode())
# print(repr(text), hex(ord(text[0])), text.encode())
# The four calls are timed back to back; the order of these statements is
# what attributes each interval to the right operation.
t0 = time.perf_counter()
ids1 = tokenizer1.encode(text)
t1 = time.perf_counter()
ids2 = tokenizer2.encode(text)
t2 = time.perf_counter()
text1 = tokenizer1.decode(ids1)
t3 = time.perf_counter()
text2 = tokenizer2.decode(ids1)
t4 = time.perf_counter()
t_encode1 += t1 - t0
t_encode2 += t2 - t1
t_decode1 += t3 - t2
t_decode2 += t4 - t3
if encode_errors < MAX_ERRORS and ids1 != ids2:
i = find_first_mismatch(ids1, ids2)
# Log only a window around the first mismatch: 2 items before, 5 after.
ids1 = list(ids1)[max(0, i - 2) : i + 5 + 1]
ids2 = list(ids2)[max(0, i - 2) : i + 5 + 1]
logger.error(" Expected: " + str(ids1))
logger.error(" Result: " + str(ids2))
encode_errors += 1
logger.error(f" {encode_errors=}")
if decode_errors < MAX_ERRORS and not check_detokenizer(text, text1, text2):
i = find_first_mismatch(text1, text2)
# Same window as above, logged as hexadecimal code points.
text1 = list(text1[max(0, i - 2) : i + 5 + 1])
text2 = list(text2[max(0, i - 2) : i + 5 + 1])
logger.error(" Expected: " + " ".join(hex(ord(x)) for x in text1))
logger.error(" Result: " + " ".join(hex(ord(x)) for x in text2))
decode_errors += 1
logger.error(f" {decode_errors=}")
# Stop early only when BOTH error limits have been reached.
if encode_errors >= MAX_ERRORS and decode_errors >= MAX_ERRORS:
logger.error(f" EXIT: {encode_errors=} {decode_errors=}")
# raise Exception()
break
t_total = time.perf_counter() - t_start
logger.info(f"{generator.__qualname__}: end, {t_encode1=:.3f} {t_encode2=:.3f} {t_decode1=:.3f} {t_decode2=:.3f} {t_total=:.3f}")
def main(argv: list[str] | None = None):
    """Compare the reference tokenizer with the llama.cpp one for a single vocab.

    ``argv`` is handed to argparse (``None`` means the process arguments):
    ``vocab_file dir_tokenizer [--verbose]``.
    """
    cli = argparse.ArgumentParser()
    cli.add_argument("vocab_file", type=str, help="path to vocab 'gguf' file")
    cli.add_argument("dir_tokenizer", type=str, help="directory containing 'tokenizer.model' file")
    cli.add_argument("--verbose", action="store_true", help="increase output verbosity")
    options = cli.parse_args(argv)

    log_level = logging.DEBUG if options.verbose else logging.INFO
    logging.basicConfig(level=log_level)
    logger.info(f"VOCABFILE: '{options.vocab_file}'")

    reference = TokenizerGroundtruth(options.dir_tokenizer)
    candidate = TokenizerLlamaCpp(options.vocab_file)

    # Generators that are run, in this order. Currently disabled:
    #   generator_custom_text()
    #   generator_custom_text_edge_cases()
    #   generator_random_added_tokens(reference, 10_000)
    #   generator_random_chars(10_000)
    #   generator_random_unicodes(10_000)
    #   generator_random_vocab_chars(reference, 10_000)
    #   generator_random_vocab_words(reference, 5_000)
    enabled = (
        generator_ascii_lr_strip(),
        generator_apostrophe(),
        generator_unicodes(),
        generator_vocab_words(reference),
        generator_added_lr_strip(reference),
    )
    for generator in enabled:
        compare_tokenizers(reference, candidate, generator)

    candidate.model.free()
if __name__ == "__main__":
    # Batch mode: run main() for every tokenizer listed below, logging both to
    # "<logger name>.log" (appended, timestamped) and to the console.

    logging.basicConfig(
        level    = logging.DEBUG,
        format   = "%(asctime)s.%(msecs)03d %(name)s %(levelname)s %(message)s",
        datefmt  = "%Y-%m-%d %H:%M:%S",
        filename = logger.name + ".log",
        filemode = "a"
    )
    # basicConfig() does nothing once the root logger has a handler, so a second
    # call cannot set up console output; attach the console handler explicitly.
    console = logging.StreamHandler()
    console.setLevel(logging.DEBUG)
    console.setFormatter(logging.Formatter("%(levelname)s %(message)s"))
    logging.getLogger().addHandler(console)

    path_tokenizers   = Path("./models/tokenizers/")
    path_vocab_format = "./models/ggml-vocab-%s.gguf"

    tokenizers = [
        "llama-spm",       # SPM
        "phi-3",           # SPM
        "gemma",           # SPM
        "gemma-2",         # SPM
        "baichuan",        # SPM
        "bert-bge",        # WPM
        "jina-v2-en",      # WPM
        "llama-bpe",       # BPE
        "phi-2",           # BPE
        "deepseek-llm",    # BPE
        "deepseek-coder",  # BPE
        "falcon",          # BPE
        "mpt",             # BPE
        "starcoder",       # BPE
        "gpt-2",           # BPE
        "stablelm2",       # BPE
        "refact",          # BPE
        "qwen2",           # BPE
        "olmo",            # BPE
        "jina-v2-es",      # BPE
        "jina-v2-de",      # BPE
        "smaug-bpe",       # BPE
        "poro-chat",       # BPE
        "jina-v2-code",    # BPE
        "viking",          # BPE
        "jais",            # BPE
    ]

    logger.info("=" * 50)
    for tokenizer in tokenizers:
        logger.info("-" * 50)
        logger.info(f"TOKENIZER: '{tokenizer}'")
        # Vocab file and tokenizer directory are both derived from the name.
        vocab_file = Path(path_vocab_format % tokenizer)
        dir_tokenizer = path_tokenizers / tokenizer
        main([str(vocab_file), str(dir_tokenizer), "--verbose"])