Merge branch 'master' into hkvc_chaton_v3
Merged the master branch as of 20240510IST12XY into the chaton_v3 branch. As part of this, the flow in examples/main/main.cpp had to be updated to reconcile the conversation-mode related commit from the master branch with the chaton related commits in this branch.
@@ -103,6 +103,8 @@ set(LLAMA_CUDA_KQUANTS_ITER "2" CACHE STRING "llama: iters./thread per block for
 set(LLAMA_CUDA_PEER_MAX_BATCH_SIZE "128" CACHE STRING
 "llama: max. batch size for using peer access")
 option(LLAMA_CUDA_NO_PEER_COPY "llama: do not use peer to peer copies" OFF)
+option(LLAMA_CUDA_NO_VMM "llama: do not try to use CUDA VMM" OFF)

 option(LLAMA_CURL "llama: use libcurl to download model from an URL" OFF)
 option(LLAMA_HIPBLAS "llama: use hipBLAS" OFF)
 option(LLAMA_HIP_UMA "llama: use HIP unified memory architecture" OFF)
@@ -403,12 +405,16 @@ if (LLAMA_CUDA)
 list(APPEND GGML_SOURCES_CUDA "ggml-cuda.cu")

 add_compile_definitions(GGML_USE_CUDA)
+add_compile_definitions(GGML_CUDA_USE_GRAPHS)
 if (LLAMA_CUDA_FORCE_DMMV)
 add_compile_definitions(GGML_CUDA_FORCE_DMMV)
 endif()
 if (LLAMA_CUDA_FORCE_MMQ)
 add_compile_definitions(GGML_CUDA_FORCE_MMQ)
 endif()
+if (LLAMA_CUDA_NO_VMM)
+add_compile_definitions(GGML_CUDA_NO_VMM)
+endif()
 add_compile_definitions(GGML_CUDA_DMMV_X=${LLAMA_CUDA_DMMV_X})
 add_compile_definitions(GGML_CUDA_MMV_Y=${LLAMA_CUDA_MMV_Y})
 if (DEFINED LLAMA_CUDA_DMMV_Y)
@@ -425,7 +431,7 @@ if (LLAMA_CUDA)

 if (LLAMA_STATIC)
 if (WIN32)
-# As of 12.3.1 CUDA Tookit for Windows does not offer a static cublas library
+# As of 12.3.1 CUDA Toolkit for Windows does not offer a static cublas library
 set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} CUDA::cudart_static CUDA::cublas CUDA::cublasLt)
 else ()
 set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} CUDA::cudart_static CUDA::cublas_static CUDA::cublasLt_static)
@@ -434,7 +440,11 @@ if (LLAMA_CUDA)
 set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} CUDA::cudart CUDA::cublas CUDA::cublasLt)
 endif()

-set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} CUDA::cuda_driver)
+if (LLAMA_CUDA_NO_VMM)
+# No VMM requested, no need to link directly with the cuda driver lib (libcuda.so)
+else()
+set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} CUDA::cuda_driver) # required by cuDeviceGetAttribute(), cuMemGetAllocationGranularity(...), ...
+endif()

 if (NOT DEFINED CMAKE_CUDA_ARCHITECTURES)
 # 52 == lowest CUDA 12 standard
Makefile (2 changes)
@@ -433,7 +433,7 @@ ifdef LLAMA_CUDA
 else
 CUDA_PATH ?= /usr/local/cuda
 endif
-MK_CPPFLAGS += -DGGML_USE_CUDA -I$(CUDA_PATH)/include -I$(CUDA_PATH)/targets/$(UNAME_M)-linux/include
+MK_CPPFLAGS += -DGGML_USE_CUDA -I$(CUDA_PATH)/include -I$(CUDA_PATH)/targets/$(UNAME_M)-linux/include -DGGML_CUDA_USE_GRAPHS
 MK_LDFLAGS += -lcuda -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L$(CUDA_PATH)/lib64 -L/usr/lib64 -L$(CUDA_PATH)/targets/$(UNAME_M)-linux/lib -L/usr/lib/wsl/lib
 OBJS += ggml-cuda.o
 OBJS += $(patsubst %.cu,%.o,$(wildcard ggml-cuda/*.cu))
README.md (53 changes)
@@ -2,7 +2,7 @@

 

-[](https://opensource.org/licenses/MIT)
+[](https://opensource.org/licenses/MIT) [](https://github.com/ggerganov/llama.cpp/actions/workflows/server.yml)

 [Roadmap](https://github.com/users/ggerganov/projects/7) / [Project status](https://github.com/ggerganov/llama.cpp/discussions/3471) / [Manifesto](https://github.com/ggerganov/llama.cpp/discussions/205) / [ggml](https://github.com/ggerganov/ggml)

@@ -20,7 +20,8 @@ Inference of Meta's [LLaMA](https://arxiv.org/abs/2302.13971) model (and others)

 ### Hot topics

-- **BPE pre-tokenization support has been added: https://github.com/ggerganov/llama.cpp/pull/6920**
+- **Initial Flash-Attention support: https://github.com/ggerganov/llama.cpp/pull/5021**
+- BPE pre-tokenization support has been added: https://github.com/ggerganov/llama.cpp/pull/6920
 - MoE memory layout has been updated - reconvert models for `mmap` support and regenerate `imatrix` https://github.com/ggerganov/llama.cpp/pull/6387
 - Model sharding instructions using `gguf-split` https://github.com/ggerganov/llama.cpp/discussions/6404
 - Fix major bug in Metal batched inference https://github.com/ggerganov/llama.cpp/pull/6225
@@ -175,6 +176,7 @@ Unless otherwise noted these projects are open-source with permissive licensing:
 - [nat/openplayground](https://github.com/nat/openplayground)
 - [Faraday](https://faraday.dev/) (proprietary)
 - [LMStudio](https://lmstudio.ai/) (proprietary)
+- [Layla](https://play.google.com/store/apps/details?id=com.laylalite) (proprietary)
 - [LocalAI](https://github.com/mudler/LocalAI) (MIT)
 - [LostRuins/koboldcpp](https://github.com/LostRuins/koboldcpp) (AGPL)
 - [Mozilla-Ocho/llamafile](https://github.com/Mozilla-Ocho/llamafile)
@@ -935,17 +937,25 @@ If your issue is with model generation quality, then please at least scan the fo

 ### Android

+#### Build on Android using Termux
+[Termux](https://github.com/termux/termux-app#installation) is a method to execute `llama.cpp` on an Android device (no root required).
+```
+apt update && apt upgrade -y
+apt install git make cmake
+```

+It's recommended to move your model inside the `~/` directory for best performance:
+```
+cd storage/downloads
+mv model.gguf ~/
+```

+[Get the code](https://github.com/ggerganov/llama.cpp#get-the-code) & [follow the Linux build instructions](https://github.com/ggerganov/llama.cpp#build) to build `llama.cpp`.

 #### Building the Project using Android NDK
-You can easily run `llama.cpp` on Android device with [termux](https://termux.dev/).
+Obtain the [Android NDK](https://developer.android.com/ndk) and then build with CMake.

-First, install the essential packages for termux:
-```
-pkg install clang wget git cmake
-```
-Second, obtain the [Android NDK](https://developer.android.com/ndk) and then build with CMake:

-You can execute the following commands on your computer to avoid downloading the NDK to your mobile. Of course, you can also do this in Termux.

+Execute the following commands on your computer to avoid downloading the NDK to your mobile. Alternatively, you can also do this in Termux:
 ```
 $ mkdir build-android
 $ cd build-android
@@ -953,7 +963,9 @@ $ export NDK=<your_ndk_directory>
 $ cmake -DCMAKE_TOOLCHAIN_FILE=$NDK/build/cmake/android.toolchain.cmake -DANDROID_ABI=arm64-v8a -DANDROID_PLATFORM=android-23 -DCMAKE_C_FLAGS=-march=armv8.4a+dotprod ..
 $ make
 ```
-Install [termux](https://termux.dev/) on your device and run `termux-setup-storage` to get access to your SD card.
+Install [termux](https://github.com/termux/termux-app#installation) on your device and run `termux-setup-storage` to get access to your SD card (if Android 11+ then run the command twice).

 Finally, copy these built `llama` binaries and the model file to your device storage. Because the file permissions in the Android sdcard cannot be changed, you can copy the executable files to the `/data/data/com.termux/files/home/bin` path, and then execute the following commands in Termux to add executable permission:

 (Assumed that you have pushed the built executable files to the /sdcard/llama.cpp/bin path using `adb push`)
@@ -975,25 +987,10 @@ $cd /data/data/com.termux/files/home/bin
 $./main -m ../model/llama-2-7b-chat.Q4_K_M.gguf -n 128 -cml
 ```

-Here is a demo of an interactive session running on Pixel 5 phone:
+Here's a demo of an interactive session running on Pixel 5 phone:

 https://user-images.githubusercontent.com/271616/225014776-1d567049-ad71-4ef2-b050-55b0b3b9274c.mp4

-#### Build on Android using Termux
-[Termux](https://github.com/termux/termux-app#installation) is an alternative to execute `llama.cpp` on an Android device (no root required).
-```
-apt update && apt upgrade -y
-apt install git
-```

-It's recommended to move your model inside the `~/` directory for best performance:
-```
-cd storage/downloads
-mv model.gguf ~/
-```

-[Follow the Linux build instructions](https://github.com/ggerganov/llama.cpp#build) to build `llama.cpp`.

 ### Docker

 #### Prerequisites
@@ -160,9 +160,8 @@ function gg_run_test_scripts_debug {

 set -e

-# TODO: too slow, run on dedicated node
-#(cd ./examples/gguf-split && time bash tests.sh "$SRC/build-ci-debug/bin" "$MNT/models") 2>&1 | tee -a $OUT/${ci}-scripts.log
-#(cd ./examples/quantize && time bash tests.sh "$SRC/build-ci-debug/bin" "$MNT/models") 2>&1 | tee -a $OUT/${ci}-scripts.log
+(cd ./examples/gguf-split && time bash tests.sh "$SRC/build-ci-debug/bin" "$MNT/models") 2>&1 | tee -a $OUT/${ci}-scripts.log
+(cd ./examples/quantize && time bash tests.sh "$SRC/build-ci-debug/bin" "$MNT/models") 2>&1 | tee -a $OUT/${ci}-scripts.log

 set +e
 }
@@ -695,8 +694,10 @@ test $ret -eq 0 && gg_run ctest_release
 if [ -z ${GG_BUILD_LOW_PERF} ]; then
 test $ret -eq 0 && gg_run embd_bge_small

+if [ -z ${GG_BUILD_CLOUD} ] || [ ${GG_BUILD_EXTRA_TESTS_0} ]; then
 test $ret -eq 0 && gg_run test_scripts_debug
 test $ret -eq 0 && gg_run test_scripts_release
+fi

 if [ -z ${GG_BUILD_VRAM_GB} ] || [ ${GG_BUILD_VRAM_GB} -ge 8 ]; then
 if [ -z ${GG_BUILD_CUDA} ]; then
@@ -1,4 +1,6 @@
 #include "common.h"
+// Change JSON_ASSERT from assert() to GGML_ASSERT:
+#define JSON_ASSERT GGML_ASSERT
 #include "json.hpp"
 #include "json-schema-to-grammar.h"
 #include "llama.h"
@@ -911,6 +913,10 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
 params.instruct = true;
 return true;
 }
+if (arg == "-cnv" || arg == "--conversation") {
+params.conversation = true;
+return true;
+}
 if (arg == "-cml" || arg == "--chatml") {
 params.chatml = true;
 return true;
@@ -1439,6 +1445,7 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
 printf(" --version show version and build info\n");
 printf(" -i, --interactive run in interactive mode\n");
 printf(" --interactive-first run in interactive mode and wait for input right away\n");
+printf(" -cnv, --conversation run in conversation mode (does not print special tokens and suffix/prefix)\n");
 printf(" -ins, --instruct run in instruction mode (use with Alpaca models)\n");
 printf(" -cml, --chatml run in chatml mode (use with ChatML-compatible models)\n");
 printf(" --chaton-meta-json JsonFile\n");
@@ -1990,18 +1997,18 @@ static bool llama_download_file(const std::string & url, const std::string & pat
 try {
 metadata_in >> metadata;
 fprintf(stderr, "%s: previous metadata file found %s: %s\n", __func__, metadata_path.c_str(), metadata.dump().c_str());
-if (metadata.contains("url") && metadata["url"].is_string()) {
-auto previous_url = metadata["url"].get<std::string>();
+if (metadata.contains("url") && metadata.at("url").is_string()) {
+auto previous_url = metadata.at("url").get<std::string>();
 if (previous_url != url) {
 fprintf(stderr, "%s: Model URL mismatch: %s != %s\n", __func__, url.c_str(), previous_url.c_str());
 return false;
 }
 }
-if (metadata.contains("etag") && metadata["etag"].is_string()) {
-etag = metadata["etag"];
+if (metadata.contains("etag") && metadata.at("etag").is_string()) {
+etag = metadata.at("etag");
 }
-if (metadata.contains("lastModified") && metadata["lastModified"].is_string()) {
-last_modified = metadata["lastModified"];
+if (metadata.contains("lastModified") && metadata.at("lastModified").is_string()) {
+last_modified = metadata.at("lastModified");
 }
 } catch (const nlohmann::json::exception & e) {
 fprintf(stderr, "%s: error reading metadata file %s: %s\n", __func__, metadata_path.c_str(), e.what());
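The `llama_download_file()` hunk above (and the server.cpp hunks further down) replace nlohmann::json's `operator[]` with `.at()` for reads. As an illustrative aside, not part of the patch: a minimal standalone sketch of the difference, assuming only that the vendored `json.hpp` is on the include path. The key names are taken from the hunk.

```
#include <iostream>
#include "json.hpp" // nlohmann::json single header, as vendored by llama.cpp

int main() {
    nlohmann::json metadata = {{"url", "https://example.com/model.gguf"}};

    // .at() never mutates the object and throws json::out_of_range on a missing key,
    // so absent fields and typos surface immediately (it also works on const objects).
    try {
        std::string etag = metadata.at("etag").get<std::string>();
    } catch (const nlohmann::json::out_of_range & e) {
        std::cerr << "missing key: " << e.what() << "\n";
    }

    // operator[] on a non-const json silently inserts a null value for a missing key.
    metadata["lastModified"];             // metadata now holds "lastModified": null
    std::cout << metadata.dump() << "\n";
    return 0;
}
```

In the patch this is mostly defensive, since each access is already guarded by `contains()`, but `.at()` keeps accidental inserts out of `metadata` and is usable on const references.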
@@ -140,6 +140,7 @@ struct gpt_params {
 bool random_prompt = false; // do not randomize prompt if none provided
 bool use_color = false; // use color to distinguish generations and inputs
 bool interactive = false; // interactive mode
+bool conversation = false; // conversation mode (does not print special tokens and suffix/prefix)
 bool chatml = false; // chatml mode (used for models trained on chatml syntax)
 bool chaton = false; // whether chaton is enabled or disabled
 std::string chaton_meta_json = ""; // name of the json file containing the chaton templates
@@ -1,4 +1,8 @@
 #pragma once

+#include "ggml.h"
+// Change JSON_ASSERT from assert() to GGML_ASSERT:
+#define JSON_ASSERT GGML_ASSERT
 #include "json.hpp"

 std::string json_schema_to_grammar(const nlohmann::ordered_json& schema);
@@ -35,6 +35,8 @@ struct llama_sampling_context * llama_sampling_init(const struct llama_sampling_

 result->prev.resize(params.n_prev);

+result->n_considered = 0;

 llama_sampling_set_rng_seed(result, params.seed);

 return result;
@@ -64,6 +66,7 @@ void llama_sampling_reset(llama_sampling_context * ctx) {

 std::fill(ctx->prev.begin(), ctx->prev.end(), 0);
 ctx->cur.clear();
+ctx->n_considered = 0;
 }

 void llama_sampling_set_rng_seed(struct llama_sampling_context * ctx, uint32_t seed) {
@@ -253,6 +256,8 @@ static llama_token llama_sampling_sample_impl(
 }
 }

+ctx_sampling->n_considered = cur_p.size;

 return id;
 }
@@ -81,6 +81,7 @@ struct llama_sampling_context {
 // TODO: replace with ring-buffer
 std::vector<llama_token> prev;
 std::vector<llama_token_data> cur;
+size_t n_considered;

 std::mt19937 rng;
 };
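The two sampling hunks above add an `n_considered` counter to `llama_sampling_context`: it is reset alongside `prev`/`cur` and set to `cur_p.size` right before the sampled token id is returned, i.e. it records how many candidate tokens were still in play when the token was drawn. A toy, self-contained illustration of that quantity (not llama.cpp code; the probabilities and the top-p cutoff are made up):

```
#include <algorithm>
#include <cstdio>
#include <vector>

// After samplers such as top-p prune the candidate list, the number of surviving
// candidates is what the new n_considered field records.
int main() {
    std::vector<float> p = {0.40f, 0.25f, 0.15f, 0.10f, 0.06f, 0.04f};
    std::sort(p.begin(), p.end(), [](float a, float b) { return a > b; });

    const float top_p = 0.90f;
    float cum = 0.0f;
    size_t n_considered = 0;
    for (float x : p) {
        cum += x;
        n_considered++;
        if (cum >= top_p) break;
    }
    printf("%zu of %zu candidates considered\n", n_considered, p.size()); // 4 of 6
    return 0;
}
```

In the patch itself the value is simply `cur_p.size` at the moment `llama_sampling_sample_impl()` picks a token.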
@@ -49,6 +49,10 @@ chktxt = '\n \n\n \n\n\n \t \t\t \t\n \n \n \n \n🚀 (normal) 😶

 if len(sys.argv) == 2:
 token = sys.argv[1]
+if not token.startswith("hf_"):
+logger.info("Huggingface token seems invalid")
+logger.info("Usage: python convert-hf-to-gguf-update.py <huggingface_token>")
+sys.exit(1)
 else:
 logger.info("Usage: python convert-hf-to-gguf-update.py <huggingface_token>")
 sys.exit(1)
@@ -67,6 +71,9 @@ models = [
 {"name": "gpt-2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/openai-community/gpt2", },
 {"name": "refact", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/smallcloudai/Refact-1_6-base", },
 {"name": "command-r", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/CohereForAI/c4ai-command-r-v01", },
+{"name": "qwen2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/Qwen/Qwen1.5-7B", },
+{"name": "olmo", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/allenai/OLMo-1.7-7B-hf", },
+{"name": "dbrx", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/databricks/dbrx-base", },
 ]

 # make directory "models/tokenizers" if it doesn't exist
@@ -150,6 +157,8 @@ for model in models:
 # print the "pre_tokenizer" content from the tokenizer.json
 with open(f"models/tokenizers/{name}/tokenizer.json", "r", encoding="utf-8") as f:
 cfg = json.load(f)
+normalizer = cfg["normalizer"]
+logger.info("normalizer: " + json.dumps(normalizer, indent=4))
 pre_tokenizer = cfg["pre_tokenizer"]
 logger.info("pre_tokenizer: " + json.dumps(pre_tokenizer, indent=4))

@@ -252,6 +261,7 @@ tests = [
 "3333333",
 "33333333",
 "333333333",
+# "Cửa Việt", # llama-bpe fails on this
 chktxt,
 ]
convert.py (33 changes)
@@ -284,6 +284,7 @@ class Params:
 n_experts = None
 n_experts_used = None
 f_rope_freq_base = None
+n_ff = None

 # hack to determine LLaMA v1 vs v2 vs CodeLlama
 if config.get("moe"):
@@ -308,6 +309,8 @@ class Params:
 n_experts_used = config["moe"]["num_experts_per_tok"]
 f_rope_freq_base = 1e6

+assert n_ff is not None

 return Params(
 n_vocab = model["tok_embeddings.weight"].shape[0],
 n_embd = config["dim"],
@@ -462,7 +465,8 @@ class SentencePieceVocab(Vocab):
 # not found in alternate location either
 raise FileNotFoundError('Cannot find tokenizer.model')

-self.sentencepiece_tokenizer = SentencePieceProcessor(str(fname_tokenizer))
+self.sentencepiece_tokenizer = SentencePieceProcessor()
+self.sentencepiece_tokenizer.LoadFromFile(str(fname_tokenizer))
 vocab_size = self.sentencepiece_tokenizer.vocab_size()

 new_tokens = {id: piece for piece, id in added_tokens.items() if id >= vocab_size}
@@ -482,23 +486,23 @@ class SentencePieceVocab(Vocab):
 def sentencepiece_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
 tokenizer = self.sentencepiece_tokenizer
 for i in range(tokenizer.vocab_size()):
-piece = tokenizer.id_to_piece(i)
+piece = tokenizer.IdToPiece(i)
 text = piece.encode("utf-8")
-score: float = tokenizer.get_score(i)
+score: float = tokenizer.GetScore(i)

 toktype = gguf.TokenType.NORMAL
-if tokenizer.is_unknown(i):
+if tokenizer.IsUnknown(i):
 toktype = gguf.TokenType.UNKNOWN
-if tokenizer.is_control(i):
+if tokenizer.IsControl(i):
 toktype = gguf.TokenType.CONTROL

 # NOTE: I think added_tokens are user defined.
 # ref: https://github.com/google/sentencepiece/blob/master/src/sentencepiece_model.proto
 # if tokenizer.is_user_defined(i): toktype = gguf.TokenType.USER_DEFINED

-if tokenizer.is_unused(i):
+if tokenizer.IsUnused(i):
 toktype = gguf.TokenType.UNUSED
-if tokenizer.is_byte(i):
+if tokenizer.IsByte(i):
 toktype = gguf.TokenType.BYTE

 yield text, score, toktype
@@ -906,7 +910,7 @@ class LazyUnpickler(pickle.Unpickler):
 def rebuild_from_type_v2(func, new_type, args, state):
 return func(*args)

-CLASSES = {
+CLASSES: dict[tuple[str, str], type[LazyTensor] | LazyStorageKind] = {
 # getattr used here as a workaround for mypy not being smart enough to determine
 # the staticmethods have a __func__ attribute.
 ('torch._tensor', '_rebuild_from_type_v2'): getattr(rebuild_from_type_v2, '__func__'),
@@ -1508,6 +1512,8 @@ def main(args_in: list[str] | None = None) -> None:
 if args.big_endian:
 endianess = gguf.GGUFEndian.BIG

+params = None
+if args.pad_vocab or not args.vocab_only:
 params = Params.load(model_plus)
 if params.n_ctx == -1:
 if args.ctx is None:
@@ -1539,6 +1545,17 @@ def main(args_in: list[str] | None = None) -> None:
 if not args.outfile:
 raise ValueError("need --outfile if using --vocab-only")
 outfile = args.outfile
+if params is None:
+params = Params(
+n_vocab = vocab.vocab_size,
+n_embd = 1,
+n_layer = 1,
+n_ctx = 1,
+n_ff = 1,
+n_head = 1,
+n_head_kv = 1,
+f_norm_eps = 1e-5,
+)
 OutputFile.write_vocab_only(outfile, params, vocab, special_vocab,
 endianess=endianess, pad_vocab=args.pad_vocab)
 logger.info(f"Wrote {outfile}")
@@ -23,7 +23,7 @@ Install BLIS:
 sudo make install
 ```

-We recommend using openmp since it's easier to modify the cores been used.
+We recommend using openmp since it's easier to modify the cores being used.

 ### llama.cpp compilation
@@ -96,9 +96,9 @@ NOTE: The dimensions in `ggml` are typically in the reverse order of the `pytorc

 This is the funniest part, you have to provide the inference graph implementation of the new model architecture in `llama_build_graph`.

-Have a look to existing implementation like `build_llama`, `build_dbrx` or `build_bert`.
+Have a look at existing implementation like `build_llama`, `build_dbrx` or `build_bert`.

-When implementing a new graph, please note that the underlying `ggml` backends might not support them all, support of missing backend operations can be added in another PR.
+When implementing a new graph, please note that the underlying `ggml` backends might not support them all, support for missing backend operations can be added in another PR.

 Note: to debug the inference graph: you can use [eval-callback](../examples/eval-callback).
@@ -2,7 +2,7 @@

 This example reads weights from project [llama2.c](https://github.com/karpathy/llama2.c) and saves them in ggml compatible format. The vocab that is available in `models/ggml-vocab.bin` is used by default.

-To convert the model first download the models from the [llma2.c](https://github.com/karpathy/llama2.c) repository:
+To convert the model first download the models from the [llama2.c](https://github.com/karpathy/llama2.c) repository:

 `$ make -j`
@@ -52,15 +52,15 @@ static void ggml_print_tensor(uint8_t * data, ggml_type type, const int64_t * ne
 size_t i = i3 * nb[3] + i2 * nb[2] + i1 * nb[1] + i0 * nb[0];
 float v;
 if (type == GGML_TYPE_F16) {
-v = ggml_fp16_to_fp32(*(ggml_fp16_t *) data + i);
+v = ggml_fp16_to_fp32(*(ggml_fp16_t *) &data[i]);
 } else if (type == GGML_TYPE_F32) {
-v = *(float *) data + i;
+v = *(float *) &data[i];
 } else if (type == GGML_TYPE_I32) {
-v = (float) *(int32_t *) data + i;
+v = (float) *(int32_t *) &data[i];
 } else if (type == GGML_TYPE_I16) {
-v = (float) *(int16_t *) data + i;
+v = (float) *(int16_t *) &data[i];
 } else if (type == GGML_TYPE_I8) {
-v = (float) *(int8_t *) data + i;
+v = (float) *(int8_t *) &data[i];
 } else {
 GGML_ASSERT(false);
 }
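The `ggml_print_tensor()` hunk above fixes an operator-precedence bug: in `*(float *) data + i`, the cast and dereference bind before `+ i`, so the old code read element 0 and then added the byte offset `i` to the value itself. A small standalone demo of the difference (illustrative only, not project code):

```
#include <cstddef>
#include <cstdio>
#include <cstdint>

int main() {
    uint8_t data[4 * sizeof(float)];
    float * f = (float *) data;
    f[0] = 1.0f; f[1] = 2.0f; f[2] = 3.0f; f[3] = 4.0f;

    size_t i = 2 * sizeof(float);          // byte offset of the third element
    float wrong = *(float *) data + i;     // reads f[0] and adds 8: 9.0f
    float right = *(float *) &data[i];     // reads the element at byte offset i: 3.0f
    printf("wrong=%g right=%g\n", wrong, right);
    return 0;
}
```

With the fix, every branch reads the element at byte offset `i`, which matches how `i` is computed from the `nb` strides.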
@@ -575,7 +575,7 @@ static struct ggml_tensor * llama_build_lora_finetune_graphs(
 GGML_ASSERT(tokens_input->type == GGML_TYPE_I32);

 auto add_to_f32 = [] (struct ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * b) {
-if (ggml_is_quantized(a->type) || a->type == GGML_TYPE_F16) {
+if (ggml_is_quantized(a->type) || a->type == GGML_TYPE_F16 || a->type == GGML_TYPE_BF16) {
 return ggml_add_cast(ctx, a, b, GGML_TYPE_F32);
 } else if (a->type == GGML_TYPE_F32) {
 return ggml_add(ctx, a, b);
@@ -19,6 +19,7 @@

 struct Stats {
 std::vector<float> values;
+std::vector<int> counts;
 int ncall = 0;
 };

@@ -121,12 +122,10 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void *
 auto & e = m_stats[wname];

 ++e.ncall;
-// NOTE: since we select top-k experts, the number of calls for the expert tensors will be k times larger
-// using the following line, we can correct for that if needed by replacing the line above with:
-//if (idx == t->src[0]->ne[0] - 1) ++e.ncall;

 if (e.values.empty()) {
 e.values.resize(src1->ne[0]*n_as, 0);
+e.counts.resize(src1->ne[0]*n_as, 0);
 }
 else if (e.values.size() != (size_t)src1->ne[0]*n_as) {
 fprintf(stderr, "Oops: inconsistent size for %s (%d vs %d)\n", wname.c_str(), (int)e.values.size(), (int)src1->ne[0]*n_as);
@@ -153,6 +152,7 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void *

 for (int j = 0; j < (int)src1->ne[0]; ++j) {
 e.values[e_start + j] += x[j]*x[j];
+e.counts[e_start + j]++;
 }
 }
 }
@@ -170,6 +170,7 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void *
 auto& e = m_stats[wname];
 if (e.values.empty()) {
 e.values.resize(src1->ne[0], 0);
+e.counts.resize(src1->ne[0], 0);
 }
 else if (e.values.size() != (size_t)src1->ne[0]) {
 fprintf(stderr, "Oops: inconsistent size for %s (%d vs %d)\n", wname.c_str(), (int)e.values.size(), (int)src1->ne[0]);
@@ -183,6 +184,7 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void *
 const float * x = data + row * src1->ne[0];
 for (int j = 0; j < (int)src1->ne[0]; ++j) {
 e.values[j] += x[j]*x[j];
+e.counts[j]++;
 }
 }
 if (e.ncall > m_last_call) {
@@ -222,7 +224,13 @@ void IMatrixCollector::save_imatrix(const char * fname, const char * dataset) co
 out.write((const char *) &p.second.ncall, sizeof(p.second.ncall));
 int nval = p.second.values.size();
 out.write((const char *) &nval, sizeof(nval));
-if (nval > 0) out.write((const char *) p.second.values.data(), nval * sizeof(float));
+if (nval > 0) {
+std::vector<float> tmp(nval);
+for (int i = 0; i < nval; i++) {
+tmp[i] = (p.second.values[i] / static_cast<float>(p.second.counts[i])) * static_cast<float>(p.second.ncall);
+}
+out.write((const char*)tmp.data(), nval*sizeof(float));
+}
 }

 // Write the number of call the matrix was computed with
@@ -270,14 +278,28 @@ bool IMatrixCollector::load_imatrix(const char * imatrix_file, std::unordered_ma
 imatrix_data = {};
 return false;
 }
-e.values.resize(nval);
-in.read((char*)e.values.data(), nval*sizeof(float));
+// When re-called from load_imatrix() with add set, this will already be created.
+if (e.values.empty()) {
+e.values.resize(nval, 0);
+e.counts.resize(nval, 0);
+}

+std::vector<float> tmp(nval);
+in.read((char*)tmp.data(), nval*sizeof(float));
 if (in.fail()) {
 printf("%s: failed reading data for entry %d\n",__func__,i);
 imatrix_data = {};
 return false;
 }
-e.ncall = ncall;
+// Recreate the state as expected by save_imatrix(), and corerct for weighted sum.
+for (int i = 0; i < nval; i++) {
+e.values[i] += tmp[i];
+e.counts[i] += ncall;
+}
+e.ncall += ncall;

 }
 return true;
 }
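The imatrix hunks above add a per-element `counts` vector: `save_imatrix()` now writes `values[i]/counts[i]*ncall` (the mean of the accumulated `x*x`, scaled by the call count), and `load_imatrix()` adds the stored values back into `values` and bumps every `counts[i]` by `ncall`, so a later save re-averages correctly. A toy round-trip of that arithmetic, with made-up numbers:

```
#include <cstdio>
#include <vector>

// Save writes mean*ncall, load adds it back into values[] and bumps counts[] by ncall,
// so a subsequent save recovers the same per-element weighted mean.
int main() {
    int ncall = 10;
    std::vector<float> values = {50.0f, 20.0f};  // sums of x*x
    std::vector<int>   counts = {25, 10};        // how many x*x went into each sum

    // save_imatrix(): store the per-element mean scaled by ncall
    std::vector<float> stored(values.size());
    for (size_t i = 0; i < values.size(); i++) {
        stored[i] = values[i] / counts[i] * ncall; // {20, 20}
    }

    // load_imatrix() into an empty entry: accumulate and remember the weight
    std::vector<float> v2(values.size(), 0.0f);
    std::vector<int>   c2(values.size(), 0);
    for (size_t i = 0; i < stored.size(); i++) {
        v2[i] += stored[i];
        c2[i] += ncall;
    }

    // a later save_imatrix() sees the same means again
    for (size_t i = 0; i < v2.size(); i++) {
        printf("mean[%zu] = %g\n", i, v2[i] / c2[i]); // 2 and 2
    }
    return 0;
}
```

Presumably this is what makes combining several imatrix files well defined (the "re-called from load_imatrix() with add set" case mentioned in the new comment): each file contributes its values weighted by its own `ncall`.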
@@ -56,7 +56,7 @@ python ./examples/llava/convert-image-encoder-to-gguf.py -m ../clip-vit-large-pa
 python ./convert.py ../llava-v1.5-7b --skip-unknown
 ```

-Now both the LLaMA part and the image encoder is in the `llava-v1.5-7b` directory.
+Now both the LLaMA part and the image encoder are in the `llava-v1.5-7b` directory.

 ## LLaVA 1.6 gguf conversion
 1) First clone a LLaVA 1.6 model:
@@ -573,13 +573,13 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
 struct ggml_tensor * embeddings = inp;
 if (ctx->has_class_embedding) {
 embeddings = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, hidden_size, num_positions, batch_size);
+ggml_set_name(embeddings, "embeddings");
+ggml_set_input(embeddings);
 embeddings = ggml_acc(ctx0, embeddings, model.class_embedding,
 embeddings->nb[1], embeddings->nb[2], embeddings->nb[3], 0);
 embeddings = ggml_acc(ctx0, embeddings, inp,
 embeddings->nb[1], embeddings->nb[2], embeddings->nb[3], model.class_embedding->nb[1]);
 }
-ggml_set_name(embeddings, "embeddings");
-ggml_set_input(embeddings);

 struct ggml_tensor * positions = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, num_positions);
@@ -1846,7 +1846,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
 const int image_size = hparams.image_size;
 const int patch_size = hparams.patch_size;
 const int num_patches = ((image_size / patch_size) * (image_size / patch_size));
-const int num_positions = num_patches + 1;
+const int num_positions = num_patches + (ctx->has_class_embedding ? 1 : 0);

 {
 struct ggml_tensor * inp_raw = ggml_graph_get_tensor(gf, "inp_raw");
@@ -1874,6 +1874,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
 }

 {
+if (ctx->has_class_embedding) {
 struct ggml_tensor * embeddings = ggml_graph_get_tensor(gf, "embeddings");

 void* zero_mem = malloc(ggml_nbytes(embeddings));
@@ -1881,6 +1882,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
 ggml_backend_tensor_set(embeddings, zero_mem, 0, ggml_nbytes(embeddings));
 free(zero_mem);
 }
+}

 {
 struct ggml_tensor * positions = ggml_graph_get_tensor(gf, "positions");
@@ -143,7 +143,7 @@ The `--ctx-size` option allows you to set the size of the prompt context used by

 ### Extended Context Size

-Some fine-tuned models have extended the context length by scaling RoPE. For example, if the original pre-trained model have a context length (max sequence length) of 4096 (4k) and the fine-tuned model have 32k. That is a scaling factor of 8, and should work by setting the above `--ctx-size` to 32768 (32k) and `--rope-scale` to 8.
+Some fine-tuned models have extended the context length by scaling RoPE. For example, if the original pre-trained model has a context length (max sequence length) of 4096 (4k) and the fine-tuned model has 32k. That is a scaling factor of 8, and should work by setting the above `--ctx-size` to 32768 (32k) and `--rope-scale` to 8.

 - `--rope-scale N`: Where N is the linear scaling factor used by the fine-tuned model.

@@ -286,7 +286,7 @@ These options help improve the performance and memory usage of the LLaMA models.

 - `--numa distribute`: Pin an equal proportion of the threads to the cores on each NUMA node. This will spread the load amongst all cores on the system, utilitizing all memory channels at the expense of potentially requiring memory to travel over the slow links between nodes.
 - `--numa isolate`: Pin all threads to the NUMA node that the program starts on. This limits the number of cores and amount of memory that can be used, but guarantees all memory access remains local to the NUMA node.
-- `--numa numactl`: Pin threads to the CPUMAP that is passed to the program by starting it with the numactl utility. This is the most flexible mode, and allow arbitraty core usage patterns, for example a map that uses all the cores on one NUMA nodes, and just enough cores on a second node to saturate the inter-node memory bus.
+- `--numa numactl`: Pin threads to the CPUMAP that is passed to the program by starting it with the numactl utility. This is the most flexible mode, and allow arbitrary core usage patterns, for example a map that uses all the cores on one NUMA nodes, and just enough cores on a second node to saturate the inter-node memory bus.

 These flags attempt optimizations that help on some systems with non-uniform memory access. This currently consists of one of the above strategies, and disabling prefetch and readahead for mmap. The latter causes mapped pages to be faulted in on first access instead of all at once, and in combination with pinning threads to NUMA nodes, more of the pages end up on the NUMA node where they are used. Note that if the model is already in the system page cache, for example because of a previous run without this option, this will have little effect unless you drop the page cache first. This can be done by rebooting the system or on Linux by writing '3' to '/proc/sys/vm/drop_caches' as root.
@@ -373,6 +373,9 @@ int main(int argc, char ** argv) {
 params.interactive_first = true;
 params.antiprompt.emplace_back("<|im_start|>user\n");
 }
+else if (params.conversation) {
+params.interactive_first = true;
+}

 // chaton mode
 const auto chaton_assitant_prefix = ::llama_tokenize(ctx, chaton_tmpl_role_kv(params.chaton_template_id, K_ASSISTANT, {K_BEGIN, K_PREFIX}), false, true);
@@ -754,7 +757,7 @@ int main(int argc, char ** argv) {
 // display text
 if (input_echo && display) {
 for (auto id : embd) {
-const std::string token_str = llama_token_to_piece(ctx, id);
+const std::string token_str = llama_token_to_piece(ctx, id, !params.conversation);
 printf("%s", token_str.c_str());

 if (embd.size() > 1) {
@@ -817,7 +820,7 @@ int main(int argc, char ** argv) {

 // deal with end of generation tokens in interactive mode
 if (llama_token_is_eog(model, llama_sampling_last(ctx_sampling))) {
-LOG("found EOS token\n");
+LOG("found an EOG token\n");

 if (params.interactive) {
 if (!params.antiprompt.empty()) {
@@ -837,7 +840,7 @@ int main(int argc, char ** argv) {
 if (n_past > 0 && is_interacting) {
 LOG("waiting for user input\n");

-if (params.instruct || params.chatml || params.chaton) {
+if (params.conversation || params.instruct || params.chatml || params.chaton) {
 printf("\n> ");
 }

@@ -847,7 +850,7 @@ int main(int argc, char ** argv) {
 }

 std::string buffer;
-if (!params.input_prefix.empty()) {
+if (!params.input_prefix.empty() && !params.conversation) {
 LOG("appending input prefix: '%s'\n", params.input_prefix.c_str());
 printf("%s", params.input_prefix.c_str());
 }
@@ -871,7 +874,7 @@ int main(int argc, char ** argv) {
 // Entering a empty line lets the user pass control back
 if (buffer.length() > 1) {
 // append input suffix if any
-if (!params.input_suffix.empty()) {
+if (!params.input_suffix.empty() && !params.conversation) {
 LOG("appending input suffix: '%s'\n", params.input_suffix.c_str());
 printf("%s", params.input_suffix.c_str());
 }
@@ -46,7 +46,8 @@ static const std::vector<struct quant_option> QUANT_OPTIONS = {
 { "Q5_K_M", LLAMA_FTYPE_MOSTLY_Q5_K_M, " 4.45G, +0.0122 ppl @ LLaMA-v1-7B", },
 { "Q6_K", LLAMA_FTYPE_MOSTLY_Q6_K, " 5.15G, +0.0008 ppl @ LLaMA-v1-7B", },
 { "Q8_0", LLAMA_FTYPE_MOSTLY_Q8_0, " 6.70G, +0.0004 ppl @ LLaMA-v1-7B", },
-{ "F16", LLAMA_FTYPE_MOSTLY_F16, "13.00G @ 7B", },
+{ "F16", LLAMA_FTYPE_MOSTLY_F16, "14.00G, -0.0020 ppl @ Mistral-7B", },
+{ "BF16", LLAMA_FTYPE_MOSTLY_BF16, "14.00G, -0.0050 ppl @ Mistral-7B", },
 { "F32", LLAMA_FTYPE_ALL_F32, "26.00G @ 7B", },
 // Note: Ensure COPY comes after F32 to avoid ftype 0 from matching.
 { "COPY", LLAMA_FTYPE_ALL_F32, "only copy tensors, no quantizing", },
@ -62,6 +62,18 @@ page cache before using this. See https://github.com/ggerganov/llama.cpp/issues/
|
||||||
- `--chat-template JINJA_TEMPLATE`: Set custom jinja chat template. This parameter accepts a string, not a file name. Default: template taken from model's metadata. We only support [some pre-defined templates](https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template)
|
- `--chat-template JINJA_TEMPLATE`: Set custom jinja chat template. This parameter accepts a string, not a file name. Default: template taken from model's metadata. We only support [some pre-defined templates](https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template)
|
||||||
- `--log-disable`: Output logs to stdout only, not to `llama.log`. Default: enabled
|
- `--log-disable`: Output logs to stdout only, not to `llama.log`. Default: enabled
|
||||||
- `--log-format FORMAT`: Define the log output to FORMAT: json or text Default: `json`
|
- `--log-format FORMAT`: Define the log output to FORMAT: json or text Default: `json`
|
||||||
|
- `--rope-scaling` : RoPE scaling method. Defaults to linear unless otherwise specified by the model. Options are `none`, `linear`, `yarn`
|
||||||
|
- `--rope-freq-base N` : RoPE frequency base (default: loaded from model)
|
||||||
|
- `--rope-freq-scale N`: RoPE frequency scaling factor, expands context by a factor of 1/N (e.g. 0.25)
|
||||||
|
- `--yarn-ext-factor N` : YaRN: extrapolation mix factor (Default: 1.0, 0.0 = full interpolation)
|
||||||
|
- `--yarn-attn-factor N` : YaRN: scale sqrt(t) or attention magnitude (default: 1.0)
|
||||||
|
- `--yarn-beta-slow N`: YaRN: High correction dim or alpha (default: 1.0)
|
||||||
|
- `--yarn-beta-fast N`: YaRN: low correction dim or beta (default: 32.0)
|
||||||
|
- `--pooling` : Pooling type for embeddings, use model default if unspecified. Options are `none`, `mean`, `cls`
|
||||||
|
- `-dt N`, `--defrag-thold N`: KV cache defragmentation threshold (default: -1.0, < 0 = disabled)
|
||||||
|
- `-fa`, `--flash-attn` : enable flash attention (default: disabled).
|
||||||
|
- `-ctk TYPE`, `--cache-type-k TYPE` : KV cache data type for K (default: `f16`, options `f32`, `f16`, `q8_0`, `q4_0`, `q4_1`, `iq4_nl`, `q5_0`, or `q5_1`)
|
||||||
|
- `-ctv TYPE`, `--cache-type-v TYPE` : KV cache type for V (default `f16`, see `-ctk` for options)
|
||||||
|
|
||||||
**If compiled with `LLAMA_SERVER_SSL=ON`**
|
**If compiled with `LLAMA_SERVER_SSL=ON`**
|
||||||
- `--ssl-key-file FNAME`: path to file a PEM-encoded SSL private key
|
- `--ssl-key-file FNAME`: path to file a PEM-encoded SSL private key
|
||||||
|
@ -260,7 +272,7 @@ node index.js
`logit_bias`: Modify the likelihood of a token appearing in the generated text completion. For example, use `"logit_bias": [[15043,1.0]]` to increase the likelihood of the token 'Hello', or `"logit_bias": [[15043,-1.0]]` to decrease its likelihood. Setting the value to false, `"logit_bias": [[15043,false]]` ensures that the token `Hello` is never produced. The tokens can also be represented as strings, e.g. `[["Hello, World!",-0.5]]` will reduce the likelihood of all the individual tokens that represent the string `Hello, World!`, just like the `presence_penalty` does. Default: `[]`

`n_probs`: If greater than 0, the response also contains the probabilities of top N tokens for each generated token given the sampling settings. Note that for temperature < 0 the tokens are sampled greedily but token probabilities are still being calculated via a simple softmax of the logits without considering any other sampler settings. Default: `0`

`min_keep`: If greater than 0, force samplers to return N possible tokens at minimum. Default: `0`
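A minimal client-side sketch of how these fields could be sent to the server, using the same `httplib.h` and `json.hpp` headers this example already depends on. The host, port, prompt, and the exact response handling are assumptions for illustration, not taken from this diff.

```cpp
// Sketch only: POST /completion with logit_bias and n_probs.
#include "httplib.h"
#include "json.hpp"
#include <cstdio>

using json = nlohmann::json;

int main() {
    httplib::Client cli("localhost", 8080); // assumed host/port

    json req = {
        {"prompt",     "Hello"},
        {"n_predict",  16},
        {"n_probs",    5},                  // ask for top-5 probabilities per generated token
        {"logit_bias", {{15043, -1.0}}},    // [[token_id, bias]] as documented above
    };

    auto res = cli.Post("/completion", req.dump(), "application/json");
    if (res && res->status == 200) {
        const json out = json::parse(res->body);
        std::printf("%s\n", out.value("content", "").c_str());
    }
    return 0;
}
```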
@ -319,7 +331,7 @@ Notice that each `probs` is an array of length `n_probs`.
|
||||||
|
|
||||||
`content`: Set the text to tokenize.
|
`content`: Set the text to tokenize.
|
||||||
|
|
||||||
Note that a special `BOS` token is never inserted.
|
`add_special`: Boolean indicating if special tokens, i.e. `BOS`, should be inserted. Default: `false`
|
||||||
|
|
||||||
- **POST** `/detokenize`: Convert tokens to text.
|
- **POST** `/detokenize`: Convert tokens to text.
|
||||||
|
|
||||||
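A small round-trip sketch for the two endpoints documented above, again leaning on `httplib.h` and `json.hpp`; the host/port and prompt are assumptions, while the `tokens` and `content` response fields follow the test steps later in this diff.

```cpp
// Sketch only: tokenize with add_special, then detokenize the result.
#include "httplib.h"
#include "json.hpp"
#include <cstdio>

using json = nlohmann::json;

int main() {
    httplib::Client cli("localhost", 8080); // assumed host/port

    json tok_req = {
        {"content",     "What is the capital of France ?"},
        {"add_special", true},              // ask the server to insert BOS
    };
    auto tok_res = cli.Post("/tokenize", tok_req.dump(), "application/json");
    if (!tok_res || tok_res->status != 200) {
        return 1;
    }
    const json tokens = json::parse(tok_res->body)["tokens"];

    const json detok_req = {{"tokens", tokens}};
    auto detok_res = cli.Post("/detokenize", detok_req.dump(), "application/json");
    if (detok_res && detok_res->status == 200) {
        std::printf("%s\n", json::parse(detok_res->body).value("content", "").c_str());
    }
    return 0;
}
```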
BIN examples/server/public/favicon.ico (new file, 4 KiB)
@ -12,6 +12,8 @@
// increase max payload length to allow use of larger context size
#define CPPHTTPLIB_FORM_URL_ENCODED_PAYLOAD_MAX_LENGTH 1048576
#include "httplib.h"
// Change JSON_ASSERT from assert() to GGML_ASSERT:
#define JSON_ASSERT GGML_ASSERT
#include "json.hpp"

// auto generated files (update with ./deps.sh)
@ -859,7 +861,7 @@ struct server_context {
|
||||||
slot.sparams.min_keep = json_value(data, "min_keep", default_sparams.min_keep);
|
slot.sparams.min_keep = json_value(data, "min_keep", default_sparams.min_keep);
|
||||||
|
|
||||||
// process "json_schema" and "grammar"
|
// process "json_schema" and "grammar"
|
||||||
if (data.contains("json_schema") && !data["json_schema"].is_null() && data.contains("grammar") && !data["grammar"].is_null()) {
|
if (data.contains("json_schema") && !data.at("json_schema").is_null() && data.contains("grammar") && !data.at("grammar").is_null()) {
|
||||||
send_error(task, "Either \"json_schema\" or \"grammar\" can be specified, but not both", ERROR_TYPE_INVALID_REQUEST);
|
send_error(task, "Either \"json_schema\" or \"grammar\" can be specified, but not both", ERROR_TYPE_INVALID_REQUEST);
|
||||||
return false;
|
return false;
|
||||||
} else if (data.contains("json_schema") && !data.contains("grammar")) {
|
} else if (data.contains("json_schema") && !data.contains("grammar")) {
|
||||||
|
@ -1512,7 +1514,7 @@ struct server_context {
|
||||||
// add subtasks
|
// add subtasks
|
||||||
for (int i = 0; i < prompt_count; i++) {
|
for (int i = 0; i < prompt_count; i++) {
|
||||||
json subtask_data = multiprompt_task.data;
|
json subtask_data = multiprompt_task.data;
|
||||||
subtask_data["prompt"] = subtask_data["prompt"][i];
|
subtask_data["prompt"] = subtask_data.at("prompt")[i];
|
||||||
|
|
||||||
// subtasks inherit everything else (infill mode, embedding mode, etc.)
|
// subtasks inherit everything else (infill mode, embedding mode, etc.)
|
||||||
request_completion(subtask_ids[i], id_multi, subtask_data, multiprompt_task.infill, multiprompt_task.embedding);
|
request_completion(subtask_ids[i], id_multi, subtask_data, multiprompt_task.infill, multiprompt_task.embedding);
|
||||||
|
@ -1532,7 +1534,7 @@ struct server_context {
|
||||||
}
|
}
|
||||||
|
|
||||||
if (task.data.contains("system_prompt")) {
|
if (task.data.contains("system_prompt")) {
|
||||||
system_prompt_set(task.data["system_prompt"]);
|
system_prompt_set(task.data.at("system_prompt"));
|
||||||
|
|
||||||
for (server_slot & slot : slots) {
|
for (server_slot & slot : slots) {
|
||||||
slot.n_past = 0;
|
slot.n_past = 0;
|
||||||
|
@ -1644,7 +1646,7 @@ struct server_context {
|
||||||
} break;
|
} break;
|
||||||
case SERVER_TASK_TYPE_SLOT_SAVE:
|
case SERVER_TASK_TYPE_SLOT_SAVE:
|
||||||
{
|
{
|
||||||
int id_slot = task.data["id_slot"];
|
int id_slot = task.data.at("id_slot");
|
||||||
server_slot * slot = get_slot(id_slot);
|
server_slot * slot = get_slot(id_slot);
|
||||||
if (slot == nullptr) {
|
if (slot == nullptr) {
|
||||||
send_error(task, "Invalid slot ID", ERROR_TYPE_INVALID_REQUEST);
|
send_error(task, "Invalid slot ID", ERROR_TYPE_INVALID_REQUEST);
|
||||||
|
@ -1654,8 +1656,8 @@ struct server_context {
|
||||||
const size_t token_count = slot->cache_tokens.size();
|
const size_t token_count = slot->cache_tokens.size();
|
||||||
const int64_t t_start = ggml_time_us();
|
const int64_t t_start = ggml_time_us();
|
||||||
|
|
||||||
std::string filename = task.data["filename"];
|
std::string filename = task.data.at("filename");
|
||||||
std::string filepath = task.data["filepath"];
|
std::string filepath = task.data.at("filepath");
|
||||||
|
|
||||||
const size_t nwrite = llama_state_seq_save_file(ctx, filepath.c_str(), slot->id + 1, slot->cache_tokens.data(), token_count);
|
const size_t nwrite = llama_state_seq_save_file(ctx, filepath.c_str(), slot->id + 1, slot->cache_tokens.data(), token_count);
|
||||||
|
|
||||||
|
@ -1679,7 +1681,7 @@ struct server_context {
|
||||||
} break;
|
} break;
|
||||||
case SERVER_TASK_TYPE_SLOT_RESTORE:
|
case SERVER_TASK_TYPE_SLOT_RESTORE:
|
||||||
{
|
{
|
||||||
int id_slot = task.data["id_slot"];
|
int id_slot = task.data.at("id_slot");
|
||||||
server_slot * slot = get_slot(id_slot);
|
server_slot * slot = get_slot(id_slot);
|
||||||
if (slot == nullptr) {
|
if (slot == nullptr) {
|
||||||
send_error(task, "Invalid slot ID", ERROR_TYPE_INVALID_REQUEST);
|
send_error(task, "Invalid slot ID", ERROR_TYPE_INVALID_REQUEST);
|
||||||
|
@ -1688,8 +1690,8 @@ struct server_context {
|
||||||
|
|
||||||
const int64_t t_start = ggml_time_us();
|
const int64_t t_start = ggml_time_us();
|
||||||
|
|
||||||
std::string filename = task.data["filename"];
|
std::string filename = task.data.at("filename");
|
||||||
std::string filepath = task.data["filepath"];
|
std::string filepath = task.data.at("filepath");
|
||||||
|
|
||||||
slot->cache_tokens.resize(slot->n_ctx);
|
slot->cache_tokens.resize(slot->n_ctx);
|
||||||
size_t token_count = 0;
|
size_t token_count = 0;
|
||||||
|
@ -1721,7 +1723,7 @@ struct server_context {
|
||||||
} break;
|
} break;
|
||||||
case SERVER_TASK_TYPE_SLOT_ERASE:
|
case SERVER_TASK_TYPE_SLOT_ERASE:
|
||||||
{
|
{
|
||||||
int id_slot = task.data["id_slot"];
|
int id_slot = task.data.at("id_slot");
|
||||||
server_slot * slot = get_slot(id_slot);
|
server_slot * slot = get_slot(id_slot);
|
||||||
if (slot == nullptr) {
|
if (slot == nullptr) {
|
||||||
send_error(task, "Invalid slot ID", ERROR_TYPE_INVALID_REQUEST);
|
send_error(task, "Invalid slot ID", ERROR_TYPE_INVALID_REQUEST);
|
||||||
|
@ -2266,18 +2268,32 @@ struct server_context {
        llama_token_data_array cur_p = { slot.ctx_sampling->cur.data(), slot.ctx_sampling->cur.size(), false };
        result.tok = id;

        const size_t n_probs = std::min(cur_p.size, (size_t) slot.sparams.n_probs);
        if (n_probs > 0) {
            const size_t n_considered = slot.ctx_sampling->n_considered;

            // Make sure at least n_probs top tokens are at the front of the vector:
            if (slot.sparams.temp == 0.0f && n_probs > n_considered) {
                llama_sample_top_k(ctx, &cur_p, n_probs, 0);
            }

            if (slot.sparams.temp == 0.0f) {
                // With greedy sampling the probabilities have possibly not been calculated.
                for (size_t i = 0; i < n_probs; ++i) {
                    result.probs.push_back({
                        cur_p.data[i].id,
                        i == 0 ? 1.0f : 0.0f
                    });
                }
            } else {
                for (size_t i = 0; i < n_probs; ++i) {
                    result.probs.push_back({
                        cur_p.data[i].id,
                        i >= n_considered ? 0.0f : cur_p.data[i].p // Tokens filtered out due to e.g. top_k have 0 probability.
                    });
                }
            }
        }

        if (!process_token(result, slot)) {
            slot.release();
|
@ -3122,8 +3138,8 @@ int main(int argc, char ** argv) {
|
||||||
server_task_result result = ctx_server.queue_results.recv(task.id);
|
server_task_result result = ctx_server.queue_results.recv(task.id);
|
||||||
ctx_server.queue_results.remove_waiting_task_id(task.id);
|
ctx_server.queue_results.remove_waiting_task_id(task.id);
|
||||||
|
|
||||||
const int n_idle_slots = result.data["idle"];
|
const int n_idle_slots = result.data.at("idle");
|
||||||
const int n_processing_slots = result.data["processing"];
|
const int n_processing_slots = result.data.at("processing");
|
||||||
|
|
||||||
json health = {
|
json health = {
|
||||||
{"status", "ok"},
|
{"status", "ok"},
|
||||||
|
@ -3133,7 +3149,7 @@ int main(int argc, char ** argv) {
|
||||||
|
|
||||||
res.status = 200; // HTTP OK
|
res.status = 200; // HTTP OK
|
||||||
if (sparams.slots_endpoint && req.has_param("include_slots")) {
|
if (sparams.slots_endpoint && req.has_param("include_slots")) {
|
||||||
health["slots"] = result.data["slots"];
|
health["slots"] = result.data.at("slots");
|
||||||
}
|
}
|
||||||
|
|
||||||
if (n_idle_slots == 0) {
|
if (n_idle_slots == 0) {
|
||||||
|
@ -3177,7 +3193,7 @@ int main(int argc, char ** argv) {
|
||||||
server_task_result result = ctx_server.queue_results.recv(task.id);
|
server_task_result result = ctx_server.queue_results.recv(task.id);
|
||||||
ctx_server.queue_results.remove_waiting_task_id(task.id);
|
ctx_server.queue_results.remove_waiting_task_id(task.id);
|
||||||
|
|
||||||
res.set_content(result.data["slots"].dump(), "application/json");
|
res.set_content(result.data.at("slots").dump(), "application/json");
|
||||||
res.status = 200; // HTTP OK
|
res.status = 200; // HTTP OK
|
||||||
};
|
};
|
||||||
|
|
||||||
|
@ -3204,32 +3220,32 @@ int main(int argc, char ** argv) {
|
||||||
|
|
||||||
json data = result.data;
|
json data = result.data;
|
||||||
|
|
||||||
const uint64_t n_prompt_tokens_processed = data["n_prompt_tokens_processed"];
|
const uint64_t n_prompt_tokens_processed = data.at("n_prompt_tokens_processed");
|
||||||
const uint64_t t_prompt_processing = data["t_prompt_processing"];
|
const uint64_t t_prompt_processing = data.at("t_prompt_processing");
|
||||||
|
|
||||||
const uint64_t n_tokens_predicted = data["n_tokens_predicted"];
|
const uint64_t n_tokens_predicted = data.at("n_tokens_predicted");
|
||||||
const uint64_t t_tokens_generation = data["t_tokens_generation"];
|
const uint64_t t_tokens_generation = data.at("t_tokens_generation");
|
||||||
|
|
||||||
const int32_t kv_cache_used_cells = data["kv_cache_used_cells"];
|
const int32_t kv_cache_used_cells = data.at("kv_cache_used_cells");
|
||||||
|
|
||||||
// metrics definition: https://prometheus.io/docs/practices/naming/#metric-names
|
// metrics definition: https://prometheus.io/docs/practices/naming/#metric-names
|
||||||
json all_metrics_def = json {
|
json all_metrics_def = json {
|
||||||
{"counter", {{
|
{"counter", {{
|
||||||
{"name", "prompt_tokens_total"},
|
{"name", "prompt_tokens_total"},
|
||||||
{"help", "Number of prompt tokens processed."},
|
{"help", "Number of prompt tokens processed."},
|
||||||
{"value", (uint64_t) data["n_prompt_tokens_processed_total"]}
|
{"value", (uint64_t) data.at("n_prompt_tokens_processed_total")}
|
||||||
}, {
|
}, {
|
||||||
{"name", "prompt_seconds_total"},
|
{"name", "prompt_seconds_total"},
|
||||||
{"help", "Prompt process time"},
|
{"help", "Prompt process time"},
|
||||||
{"value", (uint64_t) data["t_prompt_processing_total"] / 1.e3}
|
{"value", (uint64_t) data.at("t_prompt_processing_total") / 1.e3}
|
||||||
}, {
|
}, {
|
||||||
{"name", "tokens_predicted_total"},
|
{"name", "tokens_predicted_total"},
|
||||||
{"help", "Number of generation tokens processed."},
|
{"help", "Number of generation tokens processed."},
|
||||||
{"value", (uint64_t) data["n_tokens_predicted_total"]}
|
{"value", (uint64_t) data.at("n_tokens_predicted_total")}
|
||||||
}, {
|
}, {
|
||||||
{"name", "tokens_predicted_seconds_total"},
|
{"name", "tokens_predicted_seconds_total"},
|
||||||
{"help", "Predict process time"},
|
{"help", "Predict process time"},
|
||||||
{"value", (uint64_t) data["t_tokens_generation_total"] / 1.e3}
|
{"value", (uint64_t) data.at("t_tokens_generation_total") / 1.e3}
|
||||||
}}},
|
}}},
|
||||||
{"gauge", {{
|
{"gauge", {{
|
||||||
{"name", "prompt_tokens_seconds"},
|
{"name", "prompt_tokens_seconds"},
|
||||||
|
@ -3246,15 +3262,15 @@ int main(int argc, char ** argv) {
|
||||||
},{
|
},{
|
||||||
{"name", "kv_cache_tokens"},
|
{"name", "kv_cache_tokens"},
|
||||||
{"help", "KV-cache tokens."},
|
{"help", "KV-cache tokens."},
|
||||||
{"value", (uint64_t) data["kv_cache_tokens_count"]}
|
{"value", (uint64_t) data.at("kv_cache_tokens_count")}
|
||||||
},{
|
},{
|
||||||
{"name", "requests_processing"},
|
{"name", "requests_processing"},
|
||||||
{"help", "Number of request processing."},
|
{"help", "Number of request processing."},
|
||||||
{"value", (uint64_t) data["processing"]}
|
{"value", (uint64_t) data.at("processing")}
|
||||||
},{
|
},{
|
||||||
{"name", "requests_deferred"},
|
{"name", "requests_deferred"},
|
||||||
{"help", "Number of request deferred."},
|
{"help", "Number of request deferred."},
|
||||||
{"value", (uint64_t) data["deferred"]}
|
{"value", (uint64_t) data.at("deferred")}
|
||||||
}}}
|
}}}
|
||||||
};
|
};
|
||||||
|
|
||||||
|
@ -3265,8 +3281,8 @@ int main(int argc, char ** argv) {
|
||||||
const auto & metrics_def = el.value();
|
const auto & metrics_def = el.value();
|
||||||
|
|
||||||
for (const auto & metric_def : metrics_def) {
|
for (const auto & metric_def : metrics_def) {
|
||||||
const std::string name = metric_def["name"];
|
const std::string name = metric_def.at("name");
|
||||||
const std::string help = metric_def["help"];
|
const std::string help = metric_def.at("help");
|
||||||
|
|
||||||
auto value = json_value(metric_def, "value", 0.);
|
auto value = json_value(metric_def, "value", 0.);
|
||||||
prometheus << "# HELP llamacpp:" << name << " " << help << "\n"
|
prometheus << "# HELP llamacpp:" << name << " " << help << "\n"
|
||||||
|
@ -3275,7 +3291,7 @@ int main(int argc, char ** argv) {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
const int64_t t_start = data["t_start"];
|
const int64_t t_start = data.at("t_start");
|
||||||
res.set_header("Process-Start-Time-Unix", std::to_string(t_start));
|
res.set_header("Process-Start-Time-Unix", std::to_string(t_start));
|
||||||
|
|
||||||
res.set_content(prometheus.str(), "text/plain; version=0.0.4");
|
res.set_content(prometheus.str(), "text/plain; version=0.0.4");
|
||||||
|
@ -3284,7 +3300,7 @@ int main(int argc, char ** argv) {
|
||||||
|
|
||||||
const auto handle_slots_save = [&ctx_server, &res_error, &sparams](const httplib::Request & req, httplib::Response & res, int id_slot) {
|
const auto handle_slots_save = [&ctx_server, &res_error, &sparams](const httplib::Request & req, httplib::Response & res, int id_slot) {
|
||||||
json request_data = json::parse(req.body);
|
json request_data = json::parse(req.body);
|
||||||
std::string filename = request_data["filename"];
|
std::string filename = request_data.at("filename");
|
||||||
if (!validate_file_name(filename)) {
|
if (!validate_file_name(filename)) {
|
||||||
res_error(res, format_error_response("Invalid filename", ERROR_TYPE_INVALID_REQUEST));
|
res_error(res, format_error_response("Invalid filename", ERROR_TYPE_INVALID_REQUEST));
|
||||||
return;
|
return;
|
||||||
|
@ -3314,7 +3330,7 @@ int main(int argc, char ** argv) {
|
||||||
|
|
||||||
const auto handle_slots_restore = [&ctx_server, &res_error, &sparams](const httplib::Request & req, httplib::Response & res, int id_slot) {
|
const auto handle_slots_restore = [&ctx_server, &res_error, &sparams](const httplib::Request & req, httplib::Response & res, int id_slot) {
|
||||||
json request_data = json::parse(req.body);
|
json request_data = json::parse(req.body);
|
||||||
std::string filename = request_data["filename"];
|
std::string filename = request_data.at("filename");
|
||||||
if (!validate_file_name(filename)) {
|
if (!validate_file_name(filename)) {
|
||||||
res_error(res, format_error_response("Invalid filename", ERROR_TYPE_INVALID_REQUEST));
|
res_error(res, format_error_response("Invalid filename", ERROR_TYPE_INVALID_REQUEST));
|
||||||
return;
|
return;
|
||||||
|
@ -3633,7 +3649,8 @@ int main(int argc, char ** argv) {
|
||||||
|
|
||||||
std::vector<llama_token> tokens;
|
std::vector<llama_token> tokens;
|
||||||
if (body.count("content") != 0) {
|
if (body.count("content") != 0) {
|
||||||
tokens = ctx_server.tokenize(body["content"], false);
|
const bool add_special = json_value(body, "add_special", false);
|
||||||
|
tokens = ctx_server.tokenize(body.at("content"), add_special);
|
||||||
}
|
}
|
||||||
const json data = format_tokenizer_response(tokens);
|
const json data = format_tokenizer_response(tokens);
|
||||||
return res.set_content(data.dump(), "application/json; charset=utf-8");
|
return res.set_content(data.dump(), "application/json; charset=utf-8");
|
||||||
|
@ -3645,7 +3662,7 @@ int main(int argc, char ** argv) {
|
||||||
|
|
||||||
std::string content;
|
std::string content;
|
||||||
if (body.count("tokens") != 0) {
|
if (body.count("tokens") != 0) {
|
||||||
const std::vector<llama_token> tokens = body["tokens"];
|
const std::vector<llama_token> tokens = body.at("tokens");
|
||||||
content = tokens_to_str(ctx_server.ctx, tokens.cbegin(), tokens.cend());
|
content = tokens_to_str(ctx_server.ctx, tokens.cbegin(), tokens.cend());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -3668,10 +3685,10 @@ int main(int argc, char ** argv) {
|
||||||
json prompt;
|
json prompt;
|
||||||
if (body.count("input") != 0) {
|
if (body.count("input") != 0) {
|
||||||
is_openai = true;
|
is_openai = true;
|
||||||
prompt = body["input"];
|
prompt = body.at("input");
|
||||||
} else if (body.count("content") != 0) {
|
} else if (body.count("content") != 0) {
|
||||||
// with "content", we only support single prompt
|
// with "content", we only support single prompt
|
||||||
prompt = std::vector<std::string>{body["content"]};
|
prompt = std::vector<std::string>{body.at("content")};
|
||||||
} else {
|
} else {
|
||||||
res_error(res, format_error_response("\"input\" or \"content\" must be provided", ERROR_TYPE_INVALID_REQUEST));
|
res_error(res, format_error_response("\"input\" or \"content\" must be provided", ERROR_TYPE_INVALID_REQUEST));
|
||||||
return;
|
return;
|
||||||
|
@ -3690,7 +3707,7 @@ int main(int argc, char ** argv) {
|
||||||
if (!result.error) {
|
if (!result.error) {
|
||||||
if (result.data.count("results")) {
|
if (result.data.count("results")) {
|
||||||
// result for multi-task
|
// result for multi-task
|
||||||
responses = result.data["results"];
|
responses = result.data.at("results");
|
||||||
} else {
|
} else {
|
||||||
// result for single task
|
// result for single task
|
||||||
responses = std::vector<json>{result.data};
|
responses = std::vector<json>{result.data};
|
||||||
|
|
|
@ -7,6 +7,7 @@ Feature: llama.cpp server
|
||||||
    And a model file tinyllamas/stories260K.gguf from HF repo ggml-org/models
    And a model file test-model.gguf
    And a model alias tinyllama-2
    And BOS token is 1
    And 42 as server seed
    # KV Cache corresponds to the total amount of tokens
    # that can be stored across all independent sequences: #4130

@ -91,7 +92,18 @@ Feature: llama.cpp server
    """
    What is the capital of France ?
    """
    Then tokens can be detokenized
    And tokens do not begin with BOS

  Scenario: Tokenize w/ BOS
    Given adding special tokens
    When tokenizing:
    """
    What is the capital of Germany?
    """
    Then tokens begin with BOS
    Given first token is removed
    Then tokens can be detokenized

  Scenario: Models available
    Given available models
|
|
|
@ -376,6 +376,11 @@ def step_seed(context, seed):
    context.seed.append(seed)


@step('BOS token is {bos:d}')
def step_bos_token(context, bos):
    context.bos = bos


@step('a prefix prompt')
def step_prompt_prefix(context):
    context.prompt_prefix = context_text(context)
|
@ -656,21 +661,29 @@ async def all_embeddings_are_generated(context):
    assert_embeddings(context.tasks_result.pop().pop())


@step('adding special tokens')
def step_tokenize_set_add_special(context):
    context.tokenize_add_special = True


@step('tokenizing')
@async_run_until_complete
async def step_tokenize(context):
    context.tokenized_text = context_text(context)
    async with aiohttp.ClientSession() as session:
        tokenize_args = {
            "content": context.tokenized_text,
        }
        if getattr(context, 'tokenize_add_special', None) is not None:
            tokenize_args['add_special'] = context.tokenize_add_special
        async with session.post(f'{context.base_url}/tokenize',
                                json=tokenize_args) as response:
            assert response.status == 200
            tokenize_json = await response.json()
            context.tokens = tokenize_json['tokens']


@step('tokens can be detokenized')
@async_run_until_complete
async def step_detokenize(context):
    assert len(context.tokens) > 0
|
@ -685,6 +698,21 @@ async def step_detokenize(context):
    assert context.tokenized_text == detokenize_json['content'].strip()


@step('tokens begin with BOS')
def step_strings_for_tokenization(context):
    assert context.tokens[0] == context.bos


@step('tokens do not begin with BOS')
def step_strings_for_tokenization(context):
    assert context.tokens[0] != context.bos


@step('first token is removed')
def step_strings_for_tokenization(context):
    context.tokens = context.tokens[1:]


@step('an OPTIONS request is sent from {origin}')
@async_run_until_complete
async def step_options_request(context, origin):
|
@ -911,7 +939,7 @@ async def oai_chat_completions(user_prompt,
        while event_received:
            event_received = False
            async for line_in_bytes in response.content:
                line = line_in_bytes.decode('utf-8')
                line = line.rstrip('\n').rstrip('\r')
                if line == '':
                    continue
5 examples/server/themes/README.md (new file)
@ -0,0 +1,5 @@
# LLaMA.cpp Server Wild Theme

Simple themes directory of sample "public" directories. To try any of these, add `--path` to your run, like `server --path=wild`.



7 examples/server/themes/buttons-top/README.md (new file)
@ -0,0 +1,7 @@
# LLaMA.cpp Server Buttons Top Theme

Simple tweaks to the UI. Chat buttons at the top of the page instead of the bottom, so you can hit Stop instead of chasing it down the page.

To use, simply run the server with `--path=themes/buttons_top`



BIN examples/server/themes/buttons-top/buttons_top.png (new file, 117 KiB)
BIN examples/server/themes/buttons-top/favicon.ico (new file, 4 KiB)
1057 examples/server/themes/buttons-top/index.html (new file)

5 examples/server/themes/wild/README.md (new file)
@ -0,0 +1,5 @@
# LLaMA.cpp Server Wild Theme

Simple tweaks to the UI. To use, simply run the server with `--path=themes/wild`



BIN examples/server/themes/wild/favicon.ico (new file, 4 KiB)
1061 examples/server/themes/wild/index.html (new file)
BIN examples/server/themes/wild/llama_cpp.png (new file, 75 KiB)
BIN examples/server/themes/wild/llamapattern.png (new file, 254 KiB)
BIN examples/server/themes/wild/wild.png (new file, 485 KiB)
|
@ -3,6 +3,8 @@
#include "llama.h"
#include "common.h"

// Change JSON_ASSERT from assert() to GGML_ASSERT:
#define JSON_ASSERT GGML_ASSERT
#include "json.hpp"

#include <string>

@ -49,18 +51,18 @@ extern bool server_log_json;
#define LOG_WARNING(MSG, ...) server_log("WARN", __func__, __LINE__, MSG, __VA_ARGS__)
#define LOG_INFO( MSG, ...) server_log("INFO", __func__, __LINE__, MSG, __VA_ARGS__)

static inline void server_log(const char * level, const char * function, int line, const char * message, const json & extra);

template <typename T>
static T json_value(const json & body, const std::string & key, const T & default_value) {
    // Fallback null to default value
    if (body.contains(key) && !body.at(key).is_null()) {
        try {
            return body.at(key);
        } catch (NLOHMANN_JSON_NAMESPACE::detail::type_error const &) {
            std::stringstream ss;
            ss << "Wrong type supplied for parameter '" << key << "'. Expected '" << json(default_value).type_name() << "', using default value.";
            LOG_WARNING(ss.str().c_str(), body);
            return default_value;
        }
    } else {

@ -68,16 +70,16 @@ static T json_value(const json &body, const std::string &key, const T &default_v
    }
}
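Purely as an illustration of the `json_value` helper above (the input object, keys, and values here are made up): keys that are present and non-null are converted to the requested type, while null or missing keys fall back to the supplied default.

```cpp
// Illustrative sketch, not part of the diff: exercises json_value from this header.
#include "utils.hpp"
#include <cstdio>

bool server_log_json = true; // normally provided by server.cpp; defined here so the sketch links

int main() {
    const json body = json::parse(R"({"n_probs": 5, "temperature": null})");

    const int   n_probs = json_value(body, "n_probs",     0);     // present  -> 5
    const float temp    = json_value(body, "temperature", 0.80f); // null     -> default 0.80
    const int   top_k   = json_value(body, "top_k",       40);    // missing  -> default 40

    std::printf("n_probs=%d temp=%.2f top_k=%d\n", n_probs, temp, top_k);
    return 0;
}
```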
|
|
||||||
static inline void server_log(const char *level, const char *function, int line, const char *message, const nlohmann::ordered_json &extra) {
|
static inline void server_log(const char * level, const char * function, int line, const char * message, const json & extra) {
|
||||||
std::stringstream ss_tid;
|
std::stringstream ss_tid;
|
||||||
ss_tid << std::this_thread::get_id();
|
ss_tid << std::this_thread::get_id();
|
||||||
json log = nlohmann::ordered_json{
|
json log = json{
|
||||||
{"tid", ss_tid.str()},
|
{"tid", ss_tid.str()},
|
||||||
{"timestamp", time(nullptr)},
|
{"timestamp", time(nullptr)},
|
||||||
};
|
};
|
||||||
|
|
||||||
if (server_log_json) {
|
if (server_log_json) {
|
||||||
log.merge_patch( {
|
log.merge_patch({
|
||||||
{"level", level},
|
{"level", level},
|
||||||
{"function", function},
|
{"function", function},
|
||||||
{"line", line},
|
{"line", line},
|
||||||
|
@ -98,7 +100,7 @@ static inline void server_log(const char *level, const char *function, int line,
|
||||||
}
|
}
|
||||||
std::stringstream ss;
|
std::stringstream ss;
|
||||||
ss << buf << " |";
|
ss << buf << " |";
|
||||||
for (const auto& el : log.items())
|
for (const auto & el : log.items())
|
||||||
{
|
{
|
||||||
const std::string value = el.value().dump(-1, ' ', false, json::error_handler_t::replace);
|
const std::string value = el.value().dump(-1, ' ', false, json::error_handler_t::replace);
|
||||||
ss << " " << el.key() << "=" << value;
|
ss << " " << el.key() << "=" << value;
|
||||||
|
@ -373,11 +375,11 @@ static json oaicompat_completion_params_parse(
|
||||||
llama_params["top_p"] = json_value(body, "top_p", 1.0);
|
llama_params["top_p"] = json_value(body, "top_p", 1.0);
|
||||||
|
|
||||||
// Apply chat template to the list of messages
|
// Apply chat template to the list of messages
|
||||||
llama_params["prompt"] = format_chat(model, chat_template, body["messages"]);
|
llama_params["prompt"] = format_chat(model, chat_template, body.at("messages"));
|
||||||
|
|
||||||
// Handle "stop" field
|
// Handle "stop" field
|
||||||
if (body.contains("stop") && body["stop"].is_string()) {
|
if (body.contains("stop") && body.at("stop").is_string()) {
|
||||||
llama_params["stop"] = json::array({body["stop"].get<std::string>()});
|
llama_params["stop"] = json::array({body.at("stop").get<std::string>()});
|
||||||
} else {
|
} else {
|
||||||
llama_params["stop"] = json_value(body, "stop", json::array());
|
llama_params["stop"] = json_value(body, "stop", json::array());
|
||||||
}
|
}
|
||||||
|
|
|
@ -1,6 +1,6 @@
# llama.cpp/example/sycl

This example program provides the tools for llama.cpp for SYCL on Intel GPU.

## Tool

30 flake.lock (generated)
|
@ -5,11 +5,11 @@
|
||||||
"nixpkgs-lib": "nixpkgs-lib"
|
"nixpkgs-lib": "nixpkgs-lib"
|
||||||
},
|
},
|
||||||
"locked": {
|
"locked": {
|
||||||
"lastModified": 1712014858,
|
"lastModified": 1714641030,
|
||||||
"narHash": "sha256-sB4SWl2lX95bExY2gMFG5HIzvva5AVMJd4Igm+GpZNw=",
|
"narHash": "sha256-yzcRNDoyVP7+SCNX0wmuDju1NUCt8Dz9+lyUXEI0dbI=",
|
||||||
"owner": "hercules-ci",
|
"owner": "hercules-ci",
|
||||||
"repo": "flake-parts",
|
"repo": "flake-parts",
|
||||||
"rev": "9126214d0a59633752a136528f5f3b9aa8565b7d",
|
"rev": "e5d10a24b66c3ea8f150e47dfdb0416ab7c3390e",
|
||||||
"type": "github"
|
"type": "github"
|
||||||
},
|
},
|
||||||
"original": {
|
"original": {
|
||||||
|
@ -20,11 +20,11 @@
|
||||||
},
|
},
|
||||||
"nixpkgs": {
|
"nixpkgs": {
|
||||||
"locked": {
|
"locked": {
|
||||||
"lastModified": 1714076141,
|
"lastModified": 1714635257,
|
||||||
"narHash": "sha256-Drmja/f5MRHZCskS6mvzFqxEaZMeciScCTFxWVLqWEY=",
|
"narHash": "sha256-4cPymbty65RvF1DWQfc+Bc8B233A1BWxJnNULJKQ1EY=",
|
||||||
"owner": "NixOS",
|
"owner": "NixOS",
|
||||||
"repo": "nixpkgs",
|
"repo": "nixpkgs",
|
||||||
"rev": "7bb2ccd8cdc44c91edba16c48d2c8f331fb3d856",
|
"rev": "63c3a29ca82437c87573e4c6919b09a24ea61b0f",
|
||||||
"type": "github"
|
"type": "github"
|
||||||
},
|
},
|
||||||
"original": {
|
"original": {
|
||||||
|
@ -36,20 +36,14 @@
|
||||||
},
|
},
|
||||||
"nixpkgs-lib": {
|
"nixpkgs-lib": {
|
||||||
"locked": {
|
"locked": {
|
||||||
"dir": "lib",
|
"lastModified": 1714640452,
|
||||||
"lastModified": 1711703276,
|
"narHash": "sha256-QBx10+k6JWz6u7VsohfSw8g8hjdBZEf8CFzXH1/1Z94=",
|
||||||
"narHash": "sha256-iMUFArF0WCatKK6RzfUJknjem0H9m4KgorO/p3Dopkk=",
|
"type": "tarball",
|
||||||
"owner": "NixOS",
|
"url": "https://github.com/NixOS/nixpkgs/archive/50eb7ecf4cd0a5756d7275c8ba36790e5bd53e33.tar.gz"
|
||||||
"repo": "nixpkgs",
|
|
||||||
"rev": "d8fe5e6c92d0d190646fb9f1056741a229980089",
|
|
||||||
"type": "github"
|
|
||||||
},
|
},
|
||||||
"original": {
|
"original": {
|
||||||
"dir": "lib",
|
"type": "tarball",
|
||||||
"owner": "NixOS",
|
"url": "https://github.com/NixOS/nixpkgs/archive/50eb7ecf4cd0a5756d7275c8ba36790e5bd53e33.tar.gz"
|
||||||
"ref": "nixos-unstable",
|
|
||||||
"repo": "nixpkgs",
|
|
||||||
"type": "github"
|
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
"root": {
|
"root": {
|
||||||
|
|
282
ggml-cuda.cu
|
@ -113,7 +113,7 @@ static ggml_cuda_device_info ggml_cuda_init() {
|
||||||
for (int id = 0; id < info.device_count; ++id) {
|
for (int id = 0; id < info.device_count; ++id) {
|
||||||
int device_vmm = 0;
|
int device_vmm = 0;
|
||||||
|
|
||||||
#if !defined(GGML_USE_HIPBLAS)
|
#if !defined(GGML_USE_HIPBLAS) && !defined(GGML_CUDA_NO_VMM)
|
||||||
CUdevice device;
|
CUdevice device;
|
||||||
CU_CHECK(cuDeviceGet(&device, id));
|
CU_CHECK(cuDeviceGet(&device, id));
|
||||||
CU_CHECK(cuDeviceGetAttribute(&device_vmm, CU_DEVICE_ATTRIBUTE_VIRTUAL_MEMORY_MANAGEMENT_SUPPORTED, device));
|
CU_CHECK(cuDeviceGetAttribute(&device_vmm, CU_DEVICE_ATTRIBUTE_VIRTUAL_MEMORY_MANAGEMENT_SUPPORTED, device));
|
||||||
|
@ -259,7 +259,7 @@ struct ggml_cuda_pool_leg : public ggml_cuda_pool {
|
||||||
};
|
};
|
||||||
|
|
||||||
// pool with virtual memory
|
// pool with virtual memory
|
||||||
#if !defined(GGML_USE_HIPBLAS)
|
#if !defined(GGML_USE_HIPBLAS) && !defined(GGML_CUDA_NO_VMM)
|
||||||
struct ggml_cuda_pool_vmm : public ggml_cuda_pool {
|
struct ggml_cuda_pool_vmm : public ggml_cuda_pool {
|
||||||
static const size_t CUDA_POOL_VMM_MAX_SIZE = 1ull << 35; // 32 GB
|
static const size_t CUDA_POOL_VMM_MAX_SIZE = 1ull << 35; // 32 GB
|
||||||
|
|
||||||
|
@ -356,7 +356,7 @@ struct ggml_cuda_pool_vmm : public ggml_cuda_pool {
|
||||||
#endif // !defined(GGML_USE_HIPBLAS)
|
#endif // !defined(GGML_USE_HIPBLAS)
|
||||||
|
|
||||||
std::unique_ptr<ggml_cuda_pool> ggml_backend_cuda_context::new_pool_for_device(int device) {
|
std::unique_ptr<ggml_cuda_pool> ggml_backend_cuda_context::new_pool_for_device(int device) {
|
||||||
#if !defined(GGML_USE_HIPBLAS)
|
#if !defined(GGML_USE_HIPBLAS) && !defined(GGML_CUDA_NO_VMM)
|
||||||
if (ggml_cuda_info().devices[device].vmm) {
|
if (ggml_cuda_info().devices[device].vmm) {
|
||||||
return std::unique_ptr<ggml_cuda_pool>(new ggml_cuda_pool_vmm(device));
|
return std::unique_ptr<ggml_cuda_pool>(new ggml_cuda_pool_vmm(device));
|
||||||
}
|
}
|
||||||
|
@ -1647,7 +1647,7 @@ static void ggml_cuda_op_mul_mat(
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
static void ggml_cuda_mul_mat_vec_p021(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst){
|
static void ggml_cuda_mul_mat_vec_p021(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
||||||
GGML_ASSERT(ggml_is_permuted(src0) && ggml_is_permuted(src1));
|
GGML_ASSERT(ggml_is_permuted(src0) && ggml_is_permuted(src1));
|
||||||
GGML_ASSERT(ggml_backend_buffer_is_cuda(src0->buffer));
|
GGML_ASSERT(ggml_backend_buffer_is_cuda(src0->buffer));
|
||||||
GGML_ASSERT(src0->nb[0] <= src0->nb[1] && src0->nb[2] <= src0->nb[3]); // 0213 permutation
|
GGML_ASSERT(src0->nb[0] <= src0->nb[1] && src0->nb[2] <= src0->nb[3]); // 0213 permutation
|
||||||
|
@ -1670,7 +1670,7 @@ static void ggml_cuda_mul_mat_vec_p021(ggml_backend_cuda_context & ctx, const gg
|
||||||
ggml_mul_mat_p021_f16_f32_cuda(src0_ddq, src1_ddf, dst_ddf, ne00, ne01, ne02, ne12, main_stream);
|
ggml_mul_mat_p021_f16_f32_cuda(src0_ddq, src1_ddf, dst_ddf, ne00, ne01, ne02, ne12, main_stream);
|
||||||
}
|
}
|
||||||
|
|
||||||
static void ggml_cuda_mul_mat_vec_nc(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst){
|
static void ggml_cuda_mul_mat_vec_nc(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
||||||
GGML_ASSERT(!ggml_is_transposed(src0));
|
GGML_ASSERT(!ggml_is_transposed(src0));
|
||||||
GGML_ASSERT(!ggml_is_transposed(src1));
|
GGML_ASSERT(!ggml_is_transposed(src1));
|
||||||
GGML_ASSERT(!ggml_is_permuted(src0));
|
GGML_ASSERT(!ggml_is_permuted(src0));
|
||||||
|
@ -2410,11 +2410,184 @@ GGML_CALL static void ggml_backend_cuda_synchronize(ggml_backend_t backend) {
|
||||||
GGML_UNUSED(backend);
|
GGML_UNUSED(backend);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static void set_ggml_graph_node_properties(ggml_tensor * node, ggml_graph_node_properties * graph_node_properties) {
|
||||||
|
graph_node_properties->node_address = node->data;
|
||||||
|
graph_node_properties->node_op = node->op;
|
||||||
|
for (int i = 0; i < GGML_MAX_DIMS; i++) {
|
||||||
|
graph_node_properties->ne[i] = node->ne[i];
|
||||||
|
graph_node_properties->nb[i] = node->nb[i];
|
||||||
|
}
|
||||||
|
for (int i = 0; i < GGML_MAX_SRC; i++) {
|
||||||
|
graph_node_properties->src_address[i] = node->src[i] ? node->src[i]->data : nullptr;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
static bool ggml_graph_node_has_matching_properties(ggml_tensor * node, ggml_graph_node_properties * graph_node_properties) {
|
||||||
|
if (node->data != graph_node_properties->node_address &&
|
||||||
|
node->op != GGML_OP_CPY &&
|
||||||
|
node->op != GGML_OP_VIEW) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (node->op != graph_node_properties->node_op) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
for (int i = 0; i < GGML_MAX_DIMS; i++) {
|
||||||
|
if (node->ne[i] != graph_node_properties->ne[i]) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
if (node->nb[i] != graph_node_properties->nb[i]) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
for (int i = 0; i < GGML_MAX_SRC; i++) {
|
||||||
|
if (node->src[i] &&
|
||||||
|
node->src[i]->data != graph_node_properties->src_address[i] &&
|
||||||
|
node->op != GGML_OP_CPY &&
|
||||||
|
node->op != GGML_OP_VIEW
|
||||||
|
) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
GGML_CALL static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) {
|
GGML_CALL static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) {
|
||||||
ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
|
ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
|
||||||
|
|
||||||
ggml_cuda_set_device(cuda_ctx->device);
|
ggml_cuda_set_device(cuda_ctx->device);
|
||||||
|
|
||||||
|
#ifdef USE_CUDA_GRAPH
|
||||||
|
static const bool disable_cuda_graphs_due_to_env = (getenv("GGML_CUDA_DISABLE_GRAPHS") != nullptr);
|
||||||
|
|
||||||
|
// Objects required for CUDA Graph
|
||||||
|
if (cuda_ctx->cuda_graph == nullptr) {
|
||||||
|
cuda_ctx->cuda_graph.reset(new ggml_cuda_graph());
|
||||||
|
}
|
||||||
|
|
||||||
|
bool use_cuda_graph = true;
|
||||||
|
bool cuda_graph_update_required = false;
|
||||||
|
// pointer to CUDA cpy kernel, which is required to identify
|
||||||
|
// kernel parameters which need updated in the graph for each token
|
||||||
|
void * ggml_cuda_cpy_fn_ptr = nullptr;
|
||||||
|
|
||||||
|
if (cuda_ctx->cuda_graph->graph == nullptr) {
|
||||||
|
if (ggml_cuda_info().devices[cuda_ctx->device].cc < CC_AMPERE) {
|
||||||
|
cuda_ctx->cuda_graph->disable_due_to_gpu_arch = true;
|
||||||
|
#ifndef NDEBUG
|
||||||
|
fprintf(stderr, "%s: disabling CUDA graphs due to GPU architecture\n", __func__);
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Disable CUDA graphs in presence of env var, old GPU, use-case which is changing too rapidly,
|
||||||
|
// or previous graph capture failure.
|
||||||
|
// Also disable for multi-gpu for now. TO DO investigate
|
||||||
|
if (disable_cuda_graphs_due_to_env
|
||||||
|
|| cuda_ctx->cuda_graph->disable_due_to_gpu_arch
|
||||||
|
|| cuda_ctx->cuda_graph->disable_due_to_too_many_updates
|
||||||
|
|| cuda_ctx->cuda_graph->disable_due_to_failed_graph_capture) {
|
||||||
|
use_cuda_graph = false;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (use_cuda_graph) {
|
||||||
|
if (cuda_ctx->cuda_graph->instance == nullptr) {
|
||||||
|
cuda_graph_update_required = true;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Check if the graph size has changed
|
||||||
|
if (cuda_ctx->cuda_graph->ggml_graph_properties.size() != (size_t)cgraph->n_nodes) {
|
||||||
|
cuda_graph_update_required = true;
|
||||||
|
cuda_ctx->cuda_graph->ggml_graph_properties.resize(cgraph->n_nodes);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Loop over nodes in GGML graph to determine if CUDA graph update is required
|
||||||
|
// and store properties to allow this comparison for the next token
|
||||||
|
for (int i = 0; i < cgraph->n_nodes; i++) {
|
||||||
|
bool has_matching_properties = true;
|
||||||
|
if (!cuda_graph_update_required) {
|
||||||
|
has_matching_properties = ggml_graph_node_has_matching_properties(cgraph->nodes[i], &cuda_ctx->cuda_graph->ggml_graph_properties[i]);
|
||||||
|
}
|
||||||
|
if (!has_matching_properties) {
|
||||||
|
cuda_graph_update_required = true;
|
||||||
|
}
|
||||||
|
set_ggml_graph_node_properties(cgraph->nodes[i], &cuda_ctx->cuda_graph->ggml_graph_properties[i]);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Loop over nodes in GGML graph to obtain info needed for CUDA graph
|
||||||
|
cuda_ctx->cuda_graph->updated_kernel_arg.clear();
|
||||||
|
for (int i = 0; i < cgraph->n_nodes; i++) {
|
||||||
|
ggml_tensor * node = cgraph->nodes[i];
|
||||||
|
|
||||||
|
if (node->src[0] && ggml_backend_buffer_is_cuda_split(node->src[0]->buffer)) {
|
||||||
|
use_cuda_graph = false; // Split buffers are not supported by CUDA graph capture
|
||||||
|
#ifndef NDEBUG
|
||||||
|
fprintf(stderr, "%s: disabling CUDA graphs due to split buffer\n", __func__);
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
|
||||||
|
if (node->op == GGML_OP_MUL_MAT_ID) {
|
||||||
|
use_cuda_graph = false; // This node type is not supported by CUDA graph capture
|
||||||
|
#ifndef NDEBUG
|
||||||
|
fprintf(stderr, "%s: disabling CUDA graphs due to mul_mat_id\n", __func__);
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
|
||||||
|
if (node->op == GGML_OP_ADD && node->src[1] && node->src[1]->ne[1] > 1) {
|
||||||
|
// disable CUDA graphs for batch size > 1 for now.
|
||||||
|
// Changes in batch size or context size can cause changes to the grid size of some kernels.
|
||||||
|
use_cuda_graph = false;
|
||||||
|
#ifndef NDEBUG
|
||||||
|
fprintf(stderr, "%s: disabling CUDA graphs due to batch size > 1 [%s] [%ld %ld %ld %ld]\n", __func__, node->name, node->ne[0], node->ne[1], node->ne[2], node->ne[3]);
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
|
||||||
|
if (node->op == GGML_OP_CPY) {
|
||||||
|
// store the copy op parameter which changes with each token.
|
||||||
|
cuda_ctx->cuda_graph->updated_kernel_arg.push_back((char **) &(node->src[1]->data));
|
||||||
|
if (ggml_cuda_cpy_fn_ptr == nullptr) {
|
||||||
|
// store a pointer to the copy op CUDA kernel to identify it later
|
||||||
|
ggml_cuda_cpy_fn_ptr = ggml_cuda_cpy_fn(node->src[0], node->src[1]);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (!use_cuda_graph) {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Disable CUDA graphs (from the next token) if the use-case is demanding too many consecutive graph updates.
|
||||||
|
if (cuda_graph_update_required) {
|
||||||
|
cuda_ctx->cuda_graph->number_consecutive_updates++;
|
||||||
|
} else {
|
||||||
|
cuda_ctx->cuda_graph->number_consecutive_updates = 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (cuda_ctx->cuda_graph->number_consecutive_updates >= 4) {
|
||||||
|
cuda_ctx->cuda_graph->disable_due_to_too_many_updates = true;
|
||||||
|
#ifndef NDEBUG
|
||||||
|
fprintf(stderr, "%s: disabling CUDA graphs due to too many consecutive updates\n", __func__);
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (use_cuda_graph && cuda_graph_update_required) { // Start CUDA graph capture
|
||||||
|
CUDA_CHECK(cudaStreamBeginCapture(cuda_ctx->stream(), cudaStreamCaptureModeRelaxed));
|
||||||
|
}
|
||||||
|
|
||||||
|
#else
|
||||||
|
bool use_cuda_graph = false;
|
||||||
|
bool cuda_graph_update_required = false;
|
||||||
|
#endif // USE_CUDA_GRAPH
|
||||||
|
|
||||||
|
bool graph_evaluated_or_captured = false;
|
||||||
|
|
||||||
|
while (!graph_evaluated_or_captured) {
|
||||||
|
// Only perform the graph execution if CUDA graphs are not enabled, or we are capturing the graph.
|
||||||
|
// With the use of CUDA graphs, the execution will be performed by the graph launch.
|
||||||
|
if (!use_cuda_graph || cuda_graph_update_required) {
|
||||||
for (int i = 0; i < cgraph->n_nodes; i++) {
|
for (int i = 0; i < cgraph->n_nodes; i++) {
|
||||||
ggml_tensor * node = cgraph->nodes[i];
|
ggml_tensor * node = cgraph->nodes[i];
|
||||||
|
|
||||||
|
@ -2437,6 +2610,105 @@ GGML_CALL static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t
|
||||||
}
|
}
|
||||||
GGML_ASSERT(ok);
|
GGML_ASSERT(ok);
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#ifdef USE_CUDA_GRAPH
|
||||||
|
if (use_cuda_graph && cuda_graph_update_required) { // End CUDA graph capture
|
||||||
|
if (cuda_ctx->cuda_graph->graph != nullptr) {
|
||||||
|
CUDA_CHECK(cudaGraphDestroy(cuda_ctx->cuda_graph->graph));
|
||||||
|
cuda_ctx->cuda_graph->graph = nullptr;
|
||||||
|
}
|
||||||
|
CUDA_CHECK(cudaStreamEndCapture(cuda_ctx->stream(), &cuda_ctx->cuda_graph->graph));
|
||||||
|
|
||||||
|
#if 0
|
||||||
|
if (disable_cuda_graphs_due_to_failed_capture) {
|
||||||
|
use_cuda_graph = false;
|
||||||
|
cuda_ctx->cuda_graph->disable_due_to_failed_graph_capture = true;
|
||||||
|
#ifndef NDEBUG
|
||||||
|
fprintf(stderr, "%s: disabling CUDA graphs due to failed graph capture\n", __func__);
|
||||||
|
#endif
|
||||||
|
} else {
|
||||||
|
graph_evaluated_or_captured = true; // CUDA graph has been captured
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
graph_evaluated_or_captured = true; // CUDA graph has been captured
|
||||||
|
} else {
|
||||||
|
graph_evaluated_or_captured = true; // ggml graph has been directly evaluated
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (use_cuda_graph) {
|
||||||
|
if (cuda_ctx->cuda_graph->instance == nullptr) { // Create executable graph from captured graph.
|
||||||
|
CUDA_CHECK(cudaGraphInstantiate(&cuda_ctx->cuda_graph->instance, cuda_ctx->cuda_graph->graph, NULL, NULL, 0));
|
||||||
|
}
|
||||||
|
|
||||||
|
// Perform update to graph (if required for this token), and change copy parameter (required for every token)
|
||||||
|
|
||||||
|
if (cuda_graph_update_required) {
|
||||||
|
// Extract nodes from graph
|
||||||
|
if (cuda_ctx->cuda_graph->num_nodes == 0) {
|
||||||
|
// First call with null argument gets number of nodes in graph
|
||||||
|
CUDA_CHECK(cudaGraphGetNodes(cuda_ctx->cuda_graph->graph, nullptr, &cuda_ctx->cuda_graph->num_nodes));
|
||||||
|
}
|
||||||
|
// Subsequent call with non-null argument gets nodes
|
||||||
|
cuda_ctx->cuda_graph->nodes.resize(cuda_ctx->cuda_graph->num_nodes);
|
||||||
|
cuda_ctx->cuda_graph->params.resize(cuda_ctx->cuda_graph->num_nodes);
|
||||||
|
if (cuda_ctx->cuda_graph->num_nodes > 0) {
|
||||||
|
CUDA_CHECK(cudaGraphGetNodes(cuda_ctx->cuda_graph->graph, cuda_ctx->cuda_graph->nodes.data(), &cuda_ctx->cuda_graph->num_nodes));
|
||||||
|
|
||||||
|
// Loop over nodes, and extract kernel parameters from each node
|
||||||
|
for (size_t i = 0; i < cuda_ctx->cuda_graph->num_nodes; i++) {
|
||||||
|
cudaGraphNodeType node_type;
|
||||||
|
CUDA_CHECK(cudaGraphNodeGetType(cuda_ctx->cuda_graph->nodes[i], &node_type));
|
||||||
|
if (node_type == cudaGraphNodeTypeKernel) {
|
||||||
|
cudaError_t stat = cudaGraphKernelNodeGetParams(cuda_ctx->cuda_graph->nodes[i], &cuda_ctx->cuda_graph->params[i]); // Get params using runtime
|
||||||
|
if (stat == cudaErrorInvalidDeviceFunction) {
|
||||||
|
// Fails due to incorrect handling by CUDA runtime of CUDA BLAS node.
|
||||||
|
// We don't need to update blas nodes, so clear error and move on.
|
||||||
|
cudaGetLastError();
|
||||||
|
} else {
|
||||||
|
GGML_ASSERT(stat == cudaSuccess);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// One of the arguments to the copy kernel is updated for each token, hence we need to
|
||||||
|
// replace that argument with the updated value in the CUDA graph
|
||||||
|
if (!cuda_graph_update_required) { // on update steps, the live parameters will already be captured
|
||||||
|
int k = 0;
|
||||||
|
for (size_t i = 0; i < cuda_ctx->cuda_graph->num_nodes; i++) {
|
||||||
|
if (cuda_ctx->cuda_graph->params[i].func == ggml_cuda_cpy_fn_ptr) {
|
||||||
|
char ** updated_kernel_arg_ptr = cuda_ctx->cuda_graph->updated_kernel_arg.at(k++);
|
||||||
|
cuda_ctx->cuda_graph->params[i].kernelParams[1] = updated_kernel_arg_ptr;
|
||||||
|
CUDA_CHECK(cudaGraphKernelNodeSetParams(cuda_ctx->cuda_graph->nodes[i], &cuda_ctx->cuda_graph->params[i]));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Update graph executable
|
||||||
|
cudaGraphExecUpdateResultInfo result_info;
|
||||||
|
cudaError_t stat = cudaGraphExecUpdate(cuda_ctx->cuda_graph->instance, cuda_ctx->cuda_graph->graph, &result_info);
|
||||||
|
if (stat == cudaErrorGraphExecUpdateFailure) {
|
||||||
|
#ifndef NDEBUG
|
||||||
|
fprintf(stderr, "%s: CUDA graph update failed\n", __func__);
|
||||||
|
#endif
|
||||||
|
// The pre-existing graph exec cannot be updated due to violated constraints
|
||||||
|
// so instead clear error and re-instantiate
|
||||||
|
cudaGetLastError();
|
||||||
|
CUDA_CHECK(cudaGraphExecDestroy(cuda_ctx->cuda_graph->instance));
|
||||||
|
cuda_ctx->cuda_graph->instance = nullptr;
|
||||||
|
CUDA_CHECK(cudaGraphInstantiate(&cuda_ctx->cuda_graph->instance, cuda_ctx->cuda_graph->graph, NULL, NULL, 0));
|
||||||
|
} else {
|
||||||
|
GGML_ASSERT(stat == cudaSuccess);
|
||||||
|
}
|
||||||
|
// Launch graph
|
||||||
|
CUDA_CHECK(cudaGraphLaunch(cuda_ctx->cuda_graph->instance, cuda_ctx->stream()));
|
||||||
|
#else
|
||||||
|
graph_evaluated_or_captured = true;
|
||||||
|
#endif // USE_CUDA_GRAPH
|
||||||
|
}
|
||||||
|
|
||||||
return GGML_STATUS_SUCCESS;
|
return GGML_STATUS_SUCCESS;
|
||||||
}
|
}
|
||||||
|
|
|
@ -31,5 +31,4 @@ void ggml_cuda_op_clamp(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
|
||||||
memcpy(&max, (float *) dst->op_params + 1, sizeof(float));
|
memcpy(&max, (float *) dst->op_params + 1, sizeof(float));
|
||||||
|
|
||||||
clamp_f32_cuda(src0_d, dst_d, min, max, ggml_nelements(src0), stream);
|
clamp_f32_cuda(src0_d, dst_d, min, max, ggml_nelements(src0), stream);
|
||||||
CUDA_CHECK(cudaGetLastError());
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -19,6 +19,7 @@
|
||||||
#include <cassert>
|
#include <cassert>
|
||||||
#include <cfloat>
|
#include <cfloat>
|
||||||
#include <string>
|
#include <string>
|
||||||
|
#include <vector>
|
||||||
|
|
||||||
#if defined(GGML_USE_HIPBLAS)
|
#if defined(GGML_USE_HIPBLAS)
|
||||||
#include <hip/hip_runtime.h>
|
#include <hip/hip_runtime.h>
|
||||||
|
@ -233,122 +234,6 @@ typedef float dfloat; // dequantize float
|
||||||
typedef float2 dfloat2;
|
typedef float2 dfloat2;
|
||||||
#endif //GGML_CUDA_F16
|
#endif //GGML_CUDA_F16
|
||||||
|
|
||||||
[[noreturn]]
|
|
||||||
static __device__ void no_device_code(
|
|
||||||
const char * file_name, const int line, const char * function_name, const int arch, const char * arch_list) {
|
|
||||||
|
|
||||||
#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
|
|
-    printf("%s:%d: ERROR: HIP kernel %s has no device code compatible with HIP arch %d.\n",
-           file_name, line, function_name, arch);
-    GGML_UNUSED(arch_list);
-#else
-    printf("%s:%d: ERROR: CUDA kernel %s has no device code compatible with CUDA arch %d. ggml-cuda.cu was compiled for: %s\n",
-           file_name, line, function_name, arch, arch_list);
-#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
-    __trap();
-
-    GGML_UNUSED(no_device_code); // suppress unused function warning
-}
-
-#ifdef __CUDA_ARCH__
-#define NO_DEVICE_CODE no_device_code(__FILE__, __LINE__, __FUNCTION__, __CUDA_ARCH__, STRINGIZE(__CUDA_ARCH_LIST__))
-#else
-#define NO_DEVICE_CODE //GGML_ASSERT(false && "NO_DEVICE_CODE not valid in host code.")
-#endif // __CUDA_ARCH__
-
-static __device__ __forceinline__ float warp_reduce_sum(float x) {
-#pragma unroll
-    for (int mask = 16; mask > 0; mask >>= 1) {
-        x += __shfl_xor_sync(0xffffffff, x, mask, 32);
-    }
-    return x;
-}
-
-static __device__ __forceinline__ float2 warp_reduce_sum(float2 a) {
-#pragma unroll
-    for (int mask = 16; mask > 0; mask >>= 1) {
-        a.x += __shfl_xor_sync(0xffffffff, a.x, mask, 32);
-        a.y += __shfl_xor_sync(0xffffffff, a.y, mask, 32);
-    }
-    return a;
-}
-
-static __device__ __forceinline__ half2 warp_reduce_sum(half2 a) {
-#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL
-#pragma unroll
-    for (int mask = 16; mask > 0; mask >>= 1) {
-        a = __hadd2(a, __shfl_xor_sync(0xffffffff, a, mask, 32));
-    }
-    return a;
-#else
-    GGML_UNUSED(a);
-    NO_DEVICE_CODE;
-#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL
-}
-
-static __device__ __forceinline__ float warp_reduce_max(float x) {
-#pragma unroll
-    for (int mask = 16; mask > 0; mask >>= 1) {
-        x = fmaxf(x, __shfl_xor_sync(0xffffffff, x, mask, 32));
-    }
-    return x;
-}
-
-static __device__ __forceinline__ half ggml_cuda_hmax(const half a, const half b) {
-#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))
-
-#if CUDART_VERSION >= CUDART_HMAX
-    return __hmax(a, b);
-#else
-    return __half2float(a) > __half2float(b) ? a : b;
-#endif // CUDART_VERSION >= CUDART_HMAX
-
-#else
-    GGML_UNUSED(a);
-    GGML_UNUSED(b);
-    NO_DEVICE_CODE;
-#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && CUDART_VERSION < CUDART_HMAX
-}
-
-static __device__ __forceinline__ half2 ggml_cuda_hmax2(const half2 a, const half2 b) {
-#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))
-
-#if CUDART_VERSION >= CUDART_HMAX
-    return __hmax2(a, b);
-#else
-    half2 ret;
-    reinterpret_cast<half&>(ret.x) = __low2float(a) > __low2float(b) ? __low2half(a) : __low2half(b);
-    reinterpret_cast<half&>(ret.y) = __high2float(a) > __high2float(b) ? __high2half(a) : __high2half(b);
-    return ret;
-#endif // CUDART_VERSION >= CUDART_HMAX
-
-#else
-    GGML_UNUSED(a);
-    GGML_UNUSED(b);
-    NO_DEVICE_CODE;
-#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && CUDART_VERSION < CUDART_HMAX
-}
-
-static __device__ __forceinline__ half2 warp_reduce_max(half2 x) {
-#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL
-#pragma unroll
-    for (int mask = 16; mask > 0; mask >>= 1) {
-        x = ggml_cuda_hmax2(x, __shfl_xor_sync(0xffffffff, x, mask, 32));
-    }
-    return x;
-#else
-    GGML_UNUSED(x);
-    NO_DEVICE_CODE;
-#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL
-}
-
-#if CUDART_VERSION < CUDART_HMASK
-static __device__ __forceinline__ uint32_t __hgt2_mask(const half2 a, const half2 b) {
-    const uint32_t mask_low  = 0x0000FFFF * (float( __low2half(a)) > float( __low2half(b)));
-    const uint32_t mask_high = 0xFFFF0000 * (float(__high2half(a)) > float(__high2half(b)));
-    return mask_low | mask_high;
-}
-#endif // CUDART_VERSION < 12000
-
 #if defined(GGML_USE_HIPBLAS)
 #define __CUDA_ARCH__ 1300

@@ -432,11 +317,143 @@ static __device__ __forceinline__ int __dp4a(const int a, const int b, int c) {
 }
 #endif // defined(GGML_USE_HIPBLAS)

-#define FP16_AVAILABLE defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) ? \
-    defined(RDNA1) || defined(RDNA2) || defined(RDNA3) : __CUDA_ARCH__ >= CC_PASCAL
+#define FP16_AVAILABLE (defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) || __CUDA_ARCH__ >= CC_PASCAL

 #define FP16_MMA_AVAILABLE !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_VOLTA

+static bool fp16_mma_available(const int cc) {
+    return cc < CC_OFFSET_AMD && cc >= CC_VOLTA;
+}
+
+[[noreturn]]
+static __device__ void no_device_code(
+    const char * file_name, const int line, const char * function_name, const int arch, const char * arch_list) {
+
+#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+    printf("%s:%d: ERROR: HIP kernel %s has no device code compatible with HIP arch %d.\n",
+           file_name, line, function_name, arch);
+    GGML_UNUSED(arch_list);
+#else
+    printf("%s:%d: ERROR: CUDA kernel %s has no device code compatible with CUDA arch %d. ggml-cuda.cu was compiled for: %s\n",
+           file_name, line, function_name, arch, arch_list);
+#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+    __trap();
+
+    GGML_UNUSED(no_device_code); // suppress unused function warning
+}
+
+#ifdef __CUDA_ARCH__
+#define NO_DEVICE_CODE no_device_code(__FILE__, __LINE__, __FUNCTION__, __CUDA_ARCH__, STRINGIZE(__CUDA_ARCH_LIST__))
+#else
+#define NO_DEVICE_CODE //GGML_ASSERT(false && "NO_DEVICE_CODE not valid in host code.")
+#endif // __CUDA_ARCH__
+
+static __device__ __forceinline__ float warp_reduce_sum(float x) {
+#pragma unroll
+    for (int mask = 16; mask > 0; mask >>= 1) {
+        x += __shfl_xor_sync(0xffffffff, x, mask, 32);
+    }
+    return x;
+}
+
+static __device__ __forceinline__ float2 warp_reduce_sum(float2 a) {
+#pragma unroll
+    for (int mask = 16; mask > 0; mask >>= 1) {
+        a.x += __shfl_xor_sync(0xffffffff, a.x, mask, 32);
+        a.y += __shfl_xor_sync(0xffffffff, a.y, mask, 32);
+    }
+    return a;
+}
+
+static __device__ __forceinline__ half2 warp_reduce_sum(half2 a) {
+#if FP16_AVAILABLE
+
+#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+#pragma unroll
+    for (int mask = 16; mask > 0; mask >>= 1) {
+        const half2 a_other = __shfl_xor_sync(0xffffffff, a, mask, 32);
+        reinterpret_cast<half&>(a.x) += __low2half(a_other);
+        reinterpret_cast<half&>(a.y) += __high2half(a_other);
+    }
+    return a;
+#else
+#pragma unroll
+    for (int mask = 16; mask > 0; mask >>= 1) {
+        a = __hadd2(a, __shfl_xor_sync(0xffffffff, a, mask, 32));
+    }
+    return a;
+#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+
+#else
+    NO_DEVICE_CODE;
+    return a;
+#endif // FP16_AVAILABLE
+}
+
+static __device__ __forceinline__ float warp_reduce_max(float x) {
+#pragma unroll
+    for (int mask = 16; mask > 0; mask >>= 1) {
+        x = fmaxf(x, __shfl_xor_sync(0xffffffff, x, mask, 32));
+    }
+    return x;
+}
+
+static __device__ __forceinline__ half ggml_cuda_hmax(const half a, const half b) {
+#if FP16_AVAILABLE
+
+#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && CUDART_VERSION < CUDART_HMAX
+    return __float2half(fmaxf(__half2float(a), __half2float(b)));
+#else
+    return __hmax(a, b);
+#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && CUDART_VERSION < CUDART_HMAX
+
+#else
+    NO_DEVICE_CODE;
+    GGML_UNUSED(b);
+    return a;
+#endif // FP16_AVAILABLE
+}
+
+static __device__ __forceinline__ half2 ggml_cuda_hmax2(const half2 a, const half2 b) {
+#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))
+
+#if CUDART_VERSION >= CUDART_HMAX
+    return __hmax2(a, b);
+#else
+    half2 ret;
+    reinterpret_cast<half&>(ret.x) = __float2half(fmaxf( __low2float(a),  __low2float(b)));
+    reinterpret_cast<half&>(ret.y) = __float2half(fmaxf(__high2float(a), __high2float(b)));
+    return ret;
+#endif // CUDART_VERSION >= CUDART_HMAX
+
+#else
+    GGML_UNUSED(a);
+    GGML_UNUSED(b);
+    NO_DEVICE_CODE;
+#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))
+}
+
+static __device__ __forceinline__ half2 warp_reduce_max(half2 x) {
+#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL
+#pragma unroll
+    for (int mask = 16; mask > 0; mask >>= 1) {
+        x = ggml_cuda_hmax2(x, __shfl_xor_sync(0xffffffff, x, mask, 32));
+    }
+    return x;
+#else
+    GGML_UNUSED(x);
+    NO_DEVICE_CODE;
+#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL
+}
+
+#if CUDART_VERSION < CUDART_HMASK
+static __device__ __forceinline__ uint32_t __hgt2_mask(const half2 a, const half2 b) {
+    const uint32_t mask_low  = 0x0000FFFF * (float( __low2half(a)) > float( __low2half(b)));
+    const uint32_t mask_high = 0xFFFF0000 * (float(__high2half(a)) > float(__high2half(b)));
+    return mask_low | mask_high;
+}
+#endif // CUDART_VERSION < 12000
+
 // TODO: move to ggml-common.h
 static const __device__ int8_t kvalues_iq4nl[16] = {-127, -104, -83, -65, -49, -35, -22, -10, 1, 13, 25, 38, 53, 69, 89, 113};

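The warp_reduce_* helpers above are XOR-shuffle butterfly reductions: after log2(32) = 5 rounds every lane of the warp holds the full sum (or maximum). A minimal standalone sketch of the same pattern follows; it is not llama.cpp code, and the kernel and buffer names are made up for illustration.

// Standalone sketch of the butterfly warp reduction used by warp_reduce_sum.
// Each of the 32 lanes contributes one value; after 5 XOR-shuffle rounds every
// lane holds the total, so lane 0 can simply write it out.
#include <cuda_runtime.h>
#include <cstdio>

__global__ void warp_sum_kernel(const float * in, float * out) {
    float x = in[threadIdx.x];
#pragma unroll
    for (int mask = 16; mask > 0; mask >>= 1) {
        x += __shfl_xor_sync(0xffffffff, x, mask, 32);
    }
    if (threadIdx.x == 0) {
        *out = x;
    }
}

int main() {
    float h_in[32], h_out = 0.0f;
    for (int i = 0; i < 32; ++i) {
        h_in[i] = 1.0f; // expected warp sum: 32
    }

    float * d_in = nullptr;
    float * d_out = nullptr;
    cudaMalloc(&d_in,  32*sizeof(float));
    cudaMalloc(&d_out,     sizeof(float));
    cudaMemcpy(d_in, h_in, 32*sizeof(float), cudaMemcpyHostToDevice);

    warp_sum_kernel<<<1, 32>>>(d_in, d_out);
    cudaMemcpy(&h_out, d_out, sizeof(float), cudaMemcpyDeviceToHost);
    printf("warp sum = %f\n", h_out); // prints 32.000000

    cudaFree(d_in);
    cudaFree(d_out);
    return 0;
}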
@@ -526,6 +543,43 @@ struct ggml_tensor_extra_gpu {
     cudaEvent_t events[GGML_CUDA_MAX_DEVICES][GGML_CUDA_MAX_STREAMS]; // events for synchronizing multiple GPUs
 };

+
+#if (CUDART_VERSION >= 12000) && defined(GGML_CUDA_USE_GRAPHS)
+#define USE_CUDA_GRAPH
+#endif
+
+struct ggml_graph_node_properties {
+    void * node_address;
+    ggml_op node_op;
+    int64_t ne[GGML_MAX_DIMS];
+    size_t nb[GGML_MAX_DIMS];
+    void * src_address[GGML_MAX_SRC];
+};
+
+struct ggml_cuda_graph {
+#ifdef USE_CUDA_GRAPH
+    ~ggml_cuda_graph() {
+        if (instance != nullptr) {
+            CUDA_CHECK(cudaGraphExecDestroy(instance));
+        }
+        if (graph != nullptr) {
+            CUDA_CHECK(cudaGraphDestroy(graph));
+        }
+    }
+    cudaGraph_t graph = nullptr;
+    cudaGraphExec_t instance = nullptr;
+    size_t num_nodes = 0;
+    std::vector<cudaGraphNode_t> nodes;
+    std::vector<cudaKernelNodeParams> params;
+    bool disable_due_to_gpu_arch = false;
+    bool disable_due_to_too_many_updates = false;
+    bool disable_due_to_failed_graph_capture = false;
+    int number_consecutive_updates = 0;
+    std::vector<ggml_graph_node_properties> ggml_graph_properties;
+    std::vector<char **> updated_kernel_arg;
+#endif
+};
+
 struct ggml_backend_cuda_context {
     int device;
     std::string name;
@@ -534,6 +588,8 @@ struct ggml_backend_cuda_context {
     cudaStream_t streams[GGML_CUDA_MAX_DEVICES][GGML_CUDA_MAX_STREAMS] = { { nullptr } };
     cublasHandle_t cublas_handles[GGML_CUDA_MAX_DEVICES] = {nullptr};

+    std::unique_ptr<ggml_cuda_graph> cuda_graph;
+
     explicit ggml_backend_cuda_context(int device) :
         device(device),
         name(GGML_CUDA_NAME + std::to_string(device)) {
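The new ggml_cuda_graph struct caches a captured cudaGraph_t and its executable instance so that a whole compute graph can be replayed without re-issuing every kernel launch. A minimal sketch of that capture/instantiate/replay cycle follows, using only the plain CUDA runtime API (CUDA 12 signature for cudaGraphInstantiate); it is not llama.cpp's own scheduling code and the kernel is made up for illustration.

// Minimal sketch of the CUDA graph lifecycle the struct above is built around.
#include <cuda_runtime.h>
#include <cstdio>

__global__ void scale_kernel(float * x, float s, int n) {
    const int i = blockIdx.x*blockDim.x + threadIdx.x;
    if (i < n) {
        x[i] *= s;
    }
}

int main() {
    const int n = 1024;
    float * d_x = nullptr;
    cudaMalloc(&d_x, n*sizeof(float));

    cudaStream_t stream;
    cudaStreamCreate(&stream);

    // 1. capture the launches issued on the stream into a graph
    cudaGraph_t graph = nullptr;
    cudaStreamBeginCapture(stream, cudaStreamCaptureModeGlobal);
    scale_kernel<<<(n + 255)/256, 256, 0, stream>>>(d_x, 2.0f, n);
    cudaStreamEndCapture(stream, &graph);

    // 2. instantiate once ...
    cudaGraphExec_t instance = nullptr;
    cudaGraphInstantiate(&instance, graph, 0);

    // 3. ... then replay cheaply as often as needed
    for (int iter = 0; iter < 10; ++iter) {
        cudaGraphLaunch(instance, stream);
    }
    cudaStreamSynchronize(stream);

    cudaGraphExecDestroy(instance);
    cudaGraphDestroy(graph);
    cudaFree(d_x);
    return 0;
}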
@@ -727,7 +727,6 @@ static void convert_unary_cuda(const void * __restrict__ vx, dst_t * __restrict_
 }

 to_fp16_cuda_t ggml_get_to_fp16_cuda(ggml_type type) {
-    int id;
     switch (type) {
         case GGML_TYPE_Q4_0:
             return dequantize_row_q4_0_cuda;
@@ -738,8 +737,7 @@ to_fp16_cuda_t ggml_get_to_fp16_cuda(ggml_type type) {
         case GGML_TYPE_Q5_1:
             return dequantize_block_cuda<QK5_1, QR5_1, dequantize_q5_1>;
         case GGML_TYPE_Q8_0:
-            CUDA_CHECK(cudaGetDevice(&id));
-            if (ggml_cuda_info().devices[id].cc >= CC_PASCAL) {
+            if (ggml_cuda_info().devices[ggml_cuda_get_device()].cc >= CC_PASCAL) {
                 return dequantize_block_q8_0_f16_cuda;
             }
             return dequantize_block_cuda<QK8_0, QR8_0, dequantize_q8_0>;
@@ -459,3 +459,32 @@ void ggml_cuda_dup(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
     const ggml_tensor * src0 = dst->src[0];
     ggml_cuda_cpy(ctx, src0, dst);
 }

+void* ggml_cuda_cpy_fn(const ggml_tensor * src0, ggml_tensor * src1) {
+    if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32) {
+        return (void*) cpy_f32_f16<cpy_1_f32_f32>;
+    } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F16) {
+        return (void*) cpy_f32_f16<cpy_1_f32_f16>;
+    } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q8_0) {
+        return (void*) cpy_f32_q<cpy_blck_f32_q8_0, QK8_0>;
+    } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q4_0) {
+        return (void*) cpy_f32_q<cpy_blck_f32_q4_0, QK4_0>;
+    } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q4_1) {
+        return (void*) cpy_f32_q<cpy_blck_f32_q4_1, QK4_1>;
+    } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q5_0) {
+        return (void*) cpy_f32_q<cpy_blck_f32_q5_0, QK5_0>;
+    } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_IQ4_NL) {
+        return (void*) cpy_f32_q<cpy_blck_f32_iq4_nl, QK4_NL>;
+    } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q5_1) {
+        return (void*) cpy_f32_q<cpy_blck_f32_q5_1, QK5_1>;
+    } else if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F16) {
+        return (void*) cpy_f32_f16<cpy_1_f32_f16>;
+    } else if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F32) {
+        return (void*) cpy_f32_f16<cpy_1_f16_f32>;
+    } else {
+        fprintf(stderr, "%s: unsupported type combination (%s to %s)\n", __func__,
+                ggml_type_name(src0->type), ggml_type_name(src1->type));
+        GGML_ASSERT(false);
+    }
+}

@@ -5,3 +5,5 @@
 void ggml_cuda_cpy(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, ggml_tensor * src1);

 void ggml_cuda_dup(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
+
+void* ggml_cuda_cpy_fn(const ggml_tensor * src0, ggml_tensor * src1);
@@ -11,8 +11,10 @@
 #define HALF_MAX_HALF __float2half(65504.0f/2) // Use neg. of this instead of -INFINITY to initialize KQ max vals to avoid NaN upon subtraction.
 #define SOFTMAX_FTZ_THRESHOLD -20.0f // Softmax exp. of values smaller than this are flushed to zero to avoid NaNs.

-template<int D, int parallel_blocks> // D == head size
-__launch_bounds__(((D + WARP_SIZE - 1) / WARP_SIZE)*WARP_SIZE, 1)
+template<int D, int ncols, int parallel_blocks> // D == head size
+#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))
+__launch_bounds__(D, 1)
+#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))
 static __global__ void flash_attn_vec_ext_f16(
         const char * __restrict__ Q,
         const char * __restrict__ K,
@@ -44,55 +46,77 @@ static __global__ void flash_attn_vec_ext_f16(
 #if FP16_AVAILABLE
     //In this kernel Q, K, V are matrices while i, j, k are matrix indices.

-    const int ic = blockIdx.x / parallel_blocks; // Index of the Q/QKV column to work on.
+    const int ic0 = (blockIdx.x / parallel_blocks) * ncols; // Index of the Q/QKV column to work on.
     const int ip = blockIdx.x % parallel_blocks; // Index in group of blocks running for the same column in parallel.

     const int gqa_ratio = ne02 / ne12; // With grouped query attention there are > 1 Q matrices per K, V matrix.
-    const float2 * Q_f2 = (const float2 *) (Q + nb02* blockIdx.y + nb01*ic);
+    const float2 * Q_f2 = (const float2 *) (Q + nb02* blockIdx.y + nb01*ic0);
     const half2 * K_h2 = (const half2 *) (K + nb12*(blockIdx.y / gqa_ratio));
     const half * V_h = (const half *) (V + nb12*(blockIdx.y / gqa_ratio)); // K and V have same shape
-    const half * maskh = (const half *) mask + ne11*ic;
+    const half * maskh = (const half *) mask + ne11*ic0;

     const int stride_KV = nb11 / sizeof(half);
     const int stride_KV2 = nb11 / sizeof(half2);

-    constexpr int nwarps = (D + WARP_SIZE - 1) / WARP_SIZE;
+    static_assert(D % (2*WARP_SIZE) == 0, "D not divisible by 2*WARP_SIZE == 64.");
+    constexpr int nwarps = D / WARP_SIZE;
     const int tid = WARP_SIZE*threadIdx.y + threadIdx.x;
-    __builtin_assume(tid < nwarps*WARP_SIZE);
+    __builtin_assume(tid < D);

-    __shared__ half KQ[nwarps*WARP_SIZE];
-    KQ[tid] = -INFINITY;
+    __shared__ half KQ[ncols*D];
+#pragma unroll
+    for (int j = 0; j < ncols; ++j) {
+        KQ[j*D + tid] = -HALF_MAX_HALF;
+    }
     half2 * KQ2 = (half2 *) KQ;

-    half kqmax = -HALF_MAX_HALF;
-    half kqsum = 0.0f;
+    half kqmax[ncols];
+#pragma unroll
+    for (int j = 0; j < ncols; ++j) {
+        kqmax[j] = -HALF_MAX_HALF;
+    }
+    half kqsum[ncols] = {0.0f};

-    __shared__ half kqmax_shared[WARP_SIZE];
-    __shared__ half kqsum_shared[WARP_SIZE];
-    if (threadIdx.y == 0) {
-        kqmax_shared[threadIdx.x] = -HALF_MAX_HALF;
-        kqsum_shared[threadIdx.x] = 0.0f;
+    __shared__ half kqmax_shared[ncols][WARP_SIZE];
+    __shared__ half kqsum_shared[ncols][WARP_SIZE];
+#pragma unroll
+    for (int j = 0; j < ncols; ++j) {
+        if (threadIdx.y == 0) {
+            kqmax_shared[j][threadIdx.x] = -HALF_MAX_HALF;
+            kqsum_shared[j][threadIdx.x] = 0.0f;
+        }
     }
     __syncthreads();

     // Convert Q to half2 and store in registers:
-    half2 Q_h2[(D/2 + WARP_SIZE - 1) / WARP_SIZE];
+    half2 Q_h2[ncols][D/(2*WARP_SIZE)];
+#pragma unroll
+    for (int j = 0; j < ncols; ++j) {
 #pragma unroll
-    for (int i0 = 0; i0 < D/2; i0 += WARP_SIZE) {
-        const int i = i0 + threadIdx.x;
-        if (i0 + WARP_SIZE > D/2 && i >= D/2) {
-            break;
-        }
-
-        Q_h2[i0/WARP_SIZE] = make_half2(scale, scale) * make_half2(Q_f2[i].x, Q_f2[i].y);
+        for (int i0 = 0; i0 < D/2; i0 += WARP_SIZE) {
+            const int i = i0 + threadIdx.x;
+
+            const float2 tmp = Q_f2[j*(nb01/sizeof(float2)) + i];
+            Q_h2[j][i0/WARP_SIZE] = make_half2(scale, scale) * make_half2(tmp.x, tmp.y);
+        }
     }

-    half2 VKQ = make_half2(0.0f, 0.0f); // Each thread calculates a single VKQ value.
+    half2 VKQ[ncols] = {{0.0f, 0.0f}};

     const int k_start = parallel_blocks == 1 ? 0 : ip*D;
     for (int k_VKQ_0 = k_start; k_VKQ_0 < ne11; k_VKQ_0 += parallel_blocks*D) {
         // Calculate KQ tile and keep track of new maximum KQ values:
-        half kqmax_new = kqmax;
+
+        // For unknown reasons using a half array of size 1 for kqmax_new causes a performance regression,
+        // see https://github.com/ggerganov/llama.cpp/pull/7061 .
+        // Therefore this variable is defined twice but only used once (so that the compiler can optimize out the unused variable).
+        half kqmax_new = kqmax[0];
+        half kqmax_new_arr[ncols];
+#pragma unroll
+        for (int j = 0; j < ncols; ++j) {
+            kqmax_new_arr[j] = kqmax[j];
+        }

 #pragma unroll
         for (int i_KQ_0 = 0; i_KQ_0 < D; i_KQ_0 += nwarps) {
             const int i_KQ = i_KQ_0 + threadIdx.y;
@@ -101,47 +125,65 @@ static __global__ void flash_attn_vec_ext_f16(
                 break;
             }

-            half2 sum2 = make_half2(0.0f, 0.0f);
+            half2 sum2[ncols] = {{0.0f, 0.0f}};
 #pragma unroll
             for (int k_KQ_0 = 0; k_KQ_0 < D/2; k_KQ_0 += WARP_SIZE) {
                 const int k_KQ = k_KQ_0 + threadIdx.x;
-                if (k_KQ_0 + WARP_SIZE > D/2 && k_KQ >= D/2) {
-                    break;
-                }
-
                 const half2 K_ik = K_h2[(k_VKQ_0 + i_KQ)*stride_KV2 + k_KQ];
-                sum2 += K_ik * Q_h2[k_KQ_0/WARP_SIZE];
+#pragma unroll
+                for (int j = 0; j < ncols; ++j) {
+                    sum2[j] += K_ik * Q_h2[j][k_KQ_0/WARP_SIZE];
+                }
             }

-            sum2 = warp_reduce_sum(sum2);
-            half sum = __low2half(sum2) + __high2half(sum2);
-            sum += mask ? maskh[k_VKQ_0 + i_KQ] : __float2half(0.0f);
-            kqmax_new = ggml_cuda_hmax(kqmax_new, sum);
-            if (threadIdx.x == 0) {
-                KQ[i_KQ] = sum;
+#pragma unroll
+            for (int j = 0; j < ncols; ++j) {
+                sum2[j] = warp_reduce_sum(sum2[j]);
+                half sum = __low2half(sum2[j]) + __high2half(sum2[j]);
+                sum += mask ? maskh[j*ne11 + k_VKQ_0 + i_KQ] : __float2half(0.0f);
+
+                if (ncols == 1) {
+                    kqmax_new = ggml_cuda_hmax(kqmax_new, sum);
+                } else {
+                    kqmax_new_arr[j] = ggml_cuda_hmax(kqmax_new_arr[j], sum);
+                }
+
+                if (threadIdx.x == 0) {
+                    KQ[j*D + i_KQ] = sum;
+                }
             }
         }

-        kqmax_new = warp_reduce_max(kqmax_new);
-        if (threadIdx.x == 0) {
-            kqmax_shared[threadIdx.y] = kqmax_new;
+#pragma unroll
+        for (int j = 0; j < ncols; ++j) {
+            half kqmax_new_j = ncols == 1 ? kqmax_new : kqmax_new_arr[j];
+
+            kqmax_new_j = warp_reduce_max(kqmax_new_j);
+            if (threadIdx.x == 0) {
+                kqmax_shared[j][threadIdx.y] = kqmax_new_j;
+            }
         }
+
         __syncthreads();
-        kqmax_new = kqmax_shared[threadIdx.x];
-        kqmax_new = warp_reduce_max(kqmax_new);
-
-        const half KQ_max_scale = hexp(kqmax - kqmax_new);
-        kqmax = kqmax_new;
-
-        const half val = hexp(KQ[tid] - kqmax);
-        kqsum = kqsum*KQ_max_scale + val;
-        KQ[tid] = val;
-
-        VKQ *= __half2half2(KQ_max_scale);
+
+#pragma unroll
+        for (int j = 0; j < ncols; ++j) {
+            half kqmax_new_j = kqmax_shared[j][threadIdx.x];
+            kqmax_new_j = warp_reduce_max(kqmax_new_j);
+
+            const half KQ_max_scale = hexp(kqmax[j] - kqmax_new_j);
+            kqmax[j] = kqmax_new_j;
+
+            const half val = hexp(KQ[j*D + tid] - kqmax[j]);
+            kqsum[j] = kqsum[j]*KQ_max_scale + val;
+            KQ[j*D + tid] = val;
+
+            VKQ[j] *= __half2half2(KQ_max_scale);
+        }

         __syncthreads();

-        if (tid < D) {
 #pragma unroll
         for (int k0 = 0; k0 < D; k0 += 2) {
             if (FATTN_KQ_STRIDE % D != 0 && k_VKQ_0 + k0 >= ne11) {
@@ -151,39 +193,44 @@ static __global__ void flash_attn_vec_ext_f16(
             half2 V_k;
             reinterpret_cast<half&>(V_k.x) = V_h[(k_VKQ_0 + k0 + 0)*stride_KV + tid];
             reinterpret_cast<half&>(V_k.y) = V_h[(k_VKQ_0 + k0 + 1)*stride_KV + tid];
-            VKQ += V_k*KQ2[k0/2];
+#pragma unroll
+            for (int j = 0; j < ncols; ++j) {
+                VKQ[j] += V_k*KQ2[j*(D/2) + k0/2];
+            }
         }

         __syncthreads();
     }

-    if (tid >= D) {
-        kqsum = 0.0f;
-    }
-
-    kqsum = warp_reduce_sum(kqsum);
-    if (threadIdx.x == 0) {
-        kqsum_shared[threadIdx.y] = kqsum;
+#pragma unroll
+    for (int j = 0; j < ncols; ++j) {
+        kqsum[j] = warp_reduce_sum(kqsum[j]);
+        if (threadIdx.x == 0) {
+            kqsum_shared[j][threadIdx.y] = kqsum[j];
+        }
     }

     __syncthreads();
-    kqsum = kqsum_shared[threadIdx.x];
-    kqsum = warp_reduce_sum(kqsum);

-    if (tid >= D) {
-        return;
-    }
-
-    half dst_val = (__low2half(VKQ) + __high2half(VKQ));
-    if (parallel_blocks == 1) {
-        dst_val /= kqsum;
+#pragma unroll
+    for (int j_VKQ = 0; j_VKQ < ncols; ++j_VKQ) {
+        kqsum[j_VKQ] = kqsum_shared[j_VKQ][threadIdx.x];
+        kqsum[j_VKQ] = warp_reduce_sum(kqsum[j_VKQ]);
+
+        half dst_val = (__low2half(VKQ[j_VKQ]) + __high2half(VKQ[j_VKQ]));
+        if (parallel_blocks == 1) {
+            dst_val /= kqsum[j_VKQ];
+        }
+        const int j_dst = (ic0 + j_VKQ)*parallel_blocks + ip;
+        dst[j_dst*D*gridDim.y + D*blockIdx.y + tid] = dst_val;
     }
-    dst[D*gridDim.y*blockIdx.x + D*blockIdx.y + tid] = dst_val;

-    if (parallel_blocks == 1 || tid != 0) {
-        return;
+    if (parallel_blocks != 1 && tid != 0) {
+#pragma unroll
+        for (int j = 0; j < ncols; ++j) {
+            dst_meta[(ic0 + j)*gridDim.y*parallel_blocks + blockIdx.y*parallel_blocks + ip] = make_float2(kqmax[j], kqsum[j]);
+        }
     }
-    dst_meta[ic*gridDim.y*parallel_blocks + blockIdx.y*parallel_blocks + ip] = make_float2(kqmax, kqsum);
 #else
     NO_DEVICE_CODE;
 #endif // FP16_AVAILABLE
@ -191,7 +238,9 @@ static __global__ void flash_attn_vec_ext_f16(
|
||||||
|
|
||||||
// D == head size, VKQ_stride == num VKQ rows calculated in parallel:
|
// D == head size, VKQ_stride == num VKQ rows calculated in parallel:
|
||||||
template<int D, int ncols, int nwarps, int VKQ_stride, int parallel_blocks, typename KQ_acc_t>
|
template<int D, int ncols, int nwarps, int VKQ_stride, int parallel_blocks, typename KQ_acc_t>
|
||||||
|
#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))
|
||||||
__launch_bounds__(nwarps*WARP_SIZE, 1)
|
__launch_bounds__(nwarps*WARP_SIZE, 1)
|
||||||
|
#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))
|
||||||
static __global__ void flash_attn_ext_f16(
|
static __global__ void flash_attn_ext_f16(
|
||||||
const char * __restrict__ Q,
|
const char * __restrict__ Q,
|
||||||
const char * __restrict__ K,
|
const char * __restrict__ K,
|
||||||
|
@ -573,7 +622,9 @@ static __global__ void flash_attn_ext_f16(
|
||||||
}
|
}
|
||||||
|
|
||||||
template<int D, int parallel_blocks> // D == head size
|
template<int D, int parallel_blocks> // D == head size
|
||||||
|
#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))
|
||||||
__launch_bounds__(D, 1)
|
__launch_bounds__(D, 1)
|
||||||
|
#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))
|
||||||
static __global__ void flash_attn_combine_results(
|
static __global__ void flash_attn_combine_results(
|
||||||
const float * __restrict__ VKQ_parts,
|
const float * __restrict__ VKQ_parts,
|
||||||
const float2 * __restrict__ VKQ_meta,
|
const float2 * __restrict__ VKQ_meta,
|
||||||
|
@ -642,7 +693,7 @@ static_assert(get_VKQ_stride( 80, 1, 16) == 16, "Test failed.");
|
||||||
static_assert(get_VKQ_stride( 80, 2, 16) == 16, "Test failed.");
|
static_assert(get_VKQ_stride( 80, 2, 16) == 16, "Test failed.");
|
||||||
static_assert(get_VKQ_stride( 80, 4, 16) == 16, "Test failed.");
|
static_assert(get_VKQ_stride( 80, 4, 16) == 16, "Test failed.");
|
||||||
|
|
||||||
template <int D, int parallel_blocks> void launch_fattn_vec_f16(
|
template <int D, int cols_per_block, int parallel_blocks> void launch_fattn_vec_f16(
|
||||||
const ggml_tensor * Q, const ggml_tensor * K, const ggml_tensor * V, ggml_tensor * KQV, const ggml_tensor * mask,
|
const ggml_tensor * Q, const ggml_tensor * K, const ggml_tensor * V, ggml_tensor * KQV, const ggml_tensor * mask,
|
||||||
ggml_cuda_pool & pool, cudaStream_t main_stream
|
ggml_cuda_pool & pool, cudaStream_t main_stream
|
||||||
) {
|
) {
|
||||||
|
@ -656,13 +707,13 @@ template <int D, int parallel_blocks> void launch_fattn_vec_f16(
|
||||||
|
|
||||||
constexpr int nwarps = (D + WARP_SIZE - 1) / WARP_SIZE;
|
constexpr int nwarps = (D + WARP_SIZE - 1) / WARP_SIZE;
|
||||||
const dim3 block_dim(WARP_SIZE, nwarps, 1);
|
const dim3 block_dim(WARP_SIZE, nwarps, 1);
|
||||||
const dim3 blocks_num(parallel_blocks*Q->ne[1], Q->ne[2], Q->ne[3]);
|
const dim3 blocks_num(parallel_blocks*((Q->ne[1] + cols_per_block - 1) / cols_per_block), Q->ne[2], Q->ne[3]);
|
||||||
const int shmem = 0;
|
const int shmem = 0;
|
||||||
|
|
||||||
float scale;
|
float scale;
|
||||||
memcpy(&scale, KQV->op_params, sizeof(float));
|
memcpy(&scale, KQV->op_params, sizeof(float));
|
||||||
|
|
||||||
flash_attn_vec_ext_f16<D, parallel_blocks>
|
flash_attn_vec_ext_f16<D, cols_per_block, parallel_blocks>
|
||||||
<<<blocks_num, block_dim, shmem, main_stream>>> (
|
<<<blocks_num, block_dim, shmem, main_stream>>> (
|
||||||
(const char *) Q->data,
|
(const char *) Q->data,
|
||||||
(const char *) K->data,
|
(const char *) K->data,
|
||||||
|
@ -783,10 +834,99 @@ void ggml_cuda_flash_attn_ext(ggml_backend_cuda_context & ctx, ggml_tensor * dst
|
||||||
|
|
||||||
ggml_cuda_set_device(ctx.device);
|
ggml_cuda_set_device(ctx.device);
|
||||||
|
|
||||||
|
const int cc = ggml_cuda_info().devices[ggml_cuda_get_device()].cc;
|
||||||
const int nsm = ggml_cuda_info().devices[ggml_cuda_get_device()].nsm;
|
const int nsm = ggml_cuda_info().devices[ggml_cuda_get_device()].nsm;
|
||||||
|
|
||||||
const int32_t precision = KQV->op_params[1];
|
const int32_t precision = KQV->op_params[1];
|
||||||
|
|
||||||
|
if (!fp16_mma_available(cc)) {
|
||||||
|
GGML_ASSERT(precision == GGML_PREC_DEFAULT);
|
||||||
|
GGML_ASSERT(Q->ne[0] == 64 || Q->ne[0] == 128 && "FlashAttention without tensor cores only supports head sizes 64 and 128.");
|
||||||
|
|
||||||
|
if (Q->ne[1] == 1) {
|
||||||
|
constexpr int cols_per_block = 1;
|
||||||
|
constexpr int parallel_blocks = 4;
|
||||||
|
switch (Q->ne[0]) {
|
||||||
|
case 64:
|
||||||
|
launch_fattn_vec_f16< 64, cols_per_block, parallel_blocks>(Q, K, V, KQV, mask, ctx.pool(), ctx.stream());
|
||||||
|
break;
|
||||||
|
case 128:
|
||||||
|
launch_fattn_vec_f16<128, cols_per_block, parallel_blocks>(Q, K, V, KQV, mask, ctx.pool(), ctx.stream());
|
||||||
|
break;
|
||||||
|
default:
|
||||||
|
GGML_ASSERT(false);
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (Q->ne[1] == 2) {
|
||||||
|
constexpr int cols_per_block = 2;
|
||||||
|
constexpr int parallel_blocks = 4;
|
||||||
|
switch (Q->ne[0]) {
|
||||||
|
case 64:
|
||||||
|
launch_fattn_vec_f16< 64, cols_per_block, parallel_blocks>(Q, K, V, KQV, mask, ctx.pool(), ctx.stream());
|
||||||
|
break;
|
||||||
|
case 128:
|
||||||
|
launch_fattn_vec_f16<128, cols_per_block, parallel_blocks>(Q, K, V, KQV, mask, ctx.pool(), ctx.stream());
|
||||||
|
break;
|
||||||
|
default:
|
||||||
|
GGML_ASSERT(false);
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (Q->ne[1] <= 4) {
|
||||||
|
constexpr int cols_per_block = 4;
|
||||||
|
constexpr int parallel_blocks = 4;
|
||||||
|
switch (Q->ne[0]) {
|
||||||
|
case 64:
|
||||||
|
launch_fattn_vec_f16< 64, cols_per_block, parallel_blocks>(Q, K, V, KQV, mask, ctx.pool(), ctx.stream());
|
||||||
|
break;
|
||||||
|
case 128:
|
||||||
|
launch_fattn_vec_f16<128, cols_per_block, parallel_blocks>(Q, K, V, KQV, mask, ctx.pool(), ctx.stream());
|
||||||
|
break;
|
||||||
|
default:
|
||||||
|
GGML_ASSERT(false);
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (Q->ne[1] <= 8) {
|
||||||
|
constexpr int cols_per_block = 8;
|
||||||
|
constexpr int parallel_blocks = 4;
|
||||||
|
switch (Q->ne[0]) {
|
||||||
|
case 64:
|
||||||
|
launch_fattn_vec_f16< 64, cols_per_block, parallel_blocks>(Q, K, V, KQV, mask, ctx.pool(), ctx.stream());
|
||||||
|
break;
|
||||||
|
case 128:
|
||||||
|
launch_fattn_vec_f16<128, cols_per_block, parallel_blocks>(Q, K, V, KQV, mask, ctx.pool(), ctx.stream());
|
||||||
|
break;
|
||||||
|
default:
|
||||||
|
GGML_ASSERT(false);
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
constexpr int cols_per_block = 8;
|
||||||
|
constexpr int parallel_blocks = 1;
|
||||||
|
switch (Q->ne[0]) {
|
||||||
|
case 64:
|
||||||
|
launch_fattn_vec_f16< 64, cols_per_block, parallel_blocks>(Q, K, V, KQV, mask, ctx.pool(), ctx.stream());
|
||||||
|
break;
|
||||||
|
case 128:
|
||||||
|
launch_fattn_vec_f16<128, cols_per_block, parallel_blocks>(Q, K, V, KQV, mask, ctx.pool(), ctx.stream());
|
||||||
|
break;
|
||||||
|
default:
|
||||||
|
GGML_ASSERT(false);
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
if (precision != GGML_PREC_DEFAULT) {
|
if (precision != GGML_PREC_DEFAULT) {
|
||||||
if (Q->ne[1] <= 32 || Q->ne[0] > 128) {
|
if (Q->ne[1] <= 32 || Q->ne[0] > 128) {
|
||||||
constexpr int cols_per_block = 16;
|
constexpr int cols_per_block = 16;
|
||||||
|
@ -845,16 +985,17 @@ void ggml_cuda_flash_attn_ext(ggml_backend_cuda_context & ctx, ggml_tensor * dst
|
||||||
}
|
}
|
||||||
|
|
||||||
if (Q->ne[1] == 1 && Q->ne[0] % (2*WARP_SIZE) == 0) {
|
if (Q->ne[1] == 1 && Q->ne[0] % (2*WARP_SIZE) == 0) {
|
||||||
|
constexpr int cols_per_block = 1;
|
||||||
constexpr int parallel_blocks = 4;
|
constexpr int parallel_blocks = 4;
|
||||||
switch (Q->ne[0]) {
|
switch (Q->ne[0]) {
|
||||||
case 64:
|
case 64:
|
||||||
launch_fattn_vec_f16< 64, parallel_blocks>(Q, K, V, KQV, mask, ctx.pool(), ctx.stream());
|
launch_fattn_vec_f16< 64, cols_per_block, parallel_blocks>(Q, K, V, KQV, mask, ctx.pool(), ctx.stream());
|
||||||
break;
|
break;
|
||||||
case 128:
|
case 128:
|
||||||
launch_fattn_vec_f16<128, parallel_blocks>(Q, K, V, KQV, mask, ctx.pool(), ctx.stream());
|
launch_fattn_vec_f16<128, cols_per_block, parallel_blocks>(Q, K, V, KQV, mask, ctx.pool(), ctx.stream());
|
||||||
break;
|
break;
|
||||||
case 256:
|
case 256:
|
||||||
launch_fattn_vec_f16<256, parallel_blocks>(Q, K, V, KQV, mask, ctx.pool(), ctx.stream());
|
launch_fattn_vec_f16<256, cols_per_block, parallel_blocks>(Q, K, V, KQV, mask, ctx.pool(), ctx.stream());
|
||||||
break;
|
break;
|
||||||
default:
|
default:
|
||||||
GGML_ASSERT(false);
|
GGML_ASSERT(false);
|
||||||
|
|
|
@ -1735,8 +1735,7 @@ static void ggml_mul_mat_q4_0_q8_1_cuda(
|
||||||
const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x,
|
const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x,
|
||||||
const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) {
|
const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) {
|
||||||
|
|
||||||
int id;
|
int id = ggml_cuda_get_device();
|
||||||
CUDA_CHECK(cudaGetDevice(&id));
|
|
||||||
const int compute_capability = ggml_cuda_info().devices[id].cc;
|
const int compute_capability = ggml_cuda_info().devices[id].cc;
|
||||||
|
|
||||||
int mmq_x, mmq_y, nwarps;
|
int mmq_x, mmq_y, nwarps;
|
||||||
|
@ -1780,8 +1779,7 @@ static void ggml_mul_mat_q4_1_q8_1_cuda(
|
||||||
const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x,
|
const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x,
|
||||||
const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) {
|
const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) {
|
||||||
|
|
||||||
int id;
|
int id = ggml_cuda_get_device();
|
||||||
CUDA_CHECK(cudaGetDevice(&id));
|
|
||||||
const int compute_capability = ggml_cuda_info().devices[id].cc;
|
const int compute_capability = ggml_cuda_info().devices[id].cc;
|
||||||
|
|
||||||
int mmq_x, mmq_y, nwarps;
|
int mmq_x, mmq_y, nwarps;
|
||||||
|
@ -1825,8 +1823,7 @@ static void ggml_mul_mat_q5_0_q8_1_cuda(
|
||||||
const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x,
|
const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x,
|
||||||
const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) {
|
const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) {
|
||||||
|
|
||||||
int id;
|
int id = ggml_cuda_get_device();
|
||||||
CUDA_CHECK(cudaGetDevice(&id));
|
|
||||||
const int compute_capability = ggml_cuda_info().devices[id].cc;
|
const int compute_capability = ggml_cuda_info().devices[id].cc;
|
||||||
|
|
||||||
int mmq_x, mmq_y, nwarps;
|
int mmq_x, mmq_y, nwarps;
|
||||||
|
@ -1870,8 +1867,7 @@ static void ggml_mul_mat_q5_1_q8_1_cuda(
|
||||||
const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x,
|
const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x,
|
||||||
const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) {
|
const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) {
|
||||||
|
|
||||||
int id;
|
int id = ggml_cuda_get_device();
|
||||||
CUDA_CHECK(cudaGetDevice(&id));
|
|
||||||
const int compute_capability = ggml_cuda_info().devices[id].cc;
|
const int compute_capability = ggml_cuda_info().devices[id].cc;
|
||||||
|
|
||||||
int mmq_x, mmq_y, nwarps;
|
int mmq_x, mmq_y, nwarps;
|
||||||
|
@ -1915,8 +1911,7 @@ static void ggml_mul_mat_q8_0_q8_1_cuda(
|
||||||
const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x,
|
const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x,
|
||||||
const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) {
|
const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) {
|
||||||
|
|
||||||
int id;
|
int id = ggml_cuda_get_device();
|
||||||
CUDA_CHECK(cudaGetDevice(&id));
|
|
||||||
const int compute_capability = ggml_cuda_info().devices[id].cc;
|
const int compute_capability = ggml_cuda_info().devices[id].cc;
|
||||||
|
|
||||||
int mmq_x, mmq_y, nwarps;
|
int mmq_x, mmq_y, nwarps;
|
||||||
|
@ -1960,8 +1955,7 @@ static void ggml_mul_mat_q2_K_q8_1_cuda(
|
||||||
const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x,
|
const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x,
|
||||||
const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) {
|
const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) {
|
||||||
|
|
||||||
int id;
|
int id = ggml_cuda_get_device();
|
||||||
CUDA_CHECK(cudaGetDevice(&id));
|
|
||||||
const int compute_capability = ggml_cuda_info().devices[id].cc;
|
const int compute_capability = ggml_cuda_info().devices[id].cc;
|
||||||
|
|
||||||
int mmq_x, mmq_y, nwarps;
|
int mmq_x, mmq_y, nwarps;
|
||||||
|
@ -2007,8 +2001,7 @@ static void ggml_mul_mat_q3_K_q8_1_cuda(
|
||||||
|
|
||||||
#if QK_K == 256
|
#if QK_K == 256
|
||||||
|
|
||||||
int id;
|
int id = ggml_cuda_get_device();
|
||||||
CUDA_CHECK(cudaGetDevice(&id));
|
|
||||||
const int compute_capability = ggml_cuda_info().devices[id].cc;
|
const int compute_capability = ggml_cuda_info().devices[id].cc;
|
||||||
|
|
||||||
int mmq_x, mmq_y, nwarps;
|
int mmq_x, mmq_y, nwarps;
|
||||||
|
@ -2053,8 +2046,7 @@ static void ggml_mul_mat_q4_K_q8_1_cuda(
|
||||||
const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x,
|
const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x,
|
||||||
const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) {
|
const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) {
|
||||||
|
|
||||||
int id;
|
int id = ggml_cuda_get_device();
|
||||||
CUDA_CHECK(cudaGetDevice(&id));
|
|
||||||
const int compute_capability = ggml_cuda_info().devices[id].cc;
|
const int compute_capability = ggml_cuda_info().devices[id].cc;
|
||||||
|
|
||||||
int mmq_x, mmq_y, nwarps;
|
int mmq_x, mmq_y, nwarps;
|
||||||
|
@ -2098,8 +2090,7 @@ static void ggml_mul_mat_q5_K_q8_1_cuda(
|
||||||
const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x,
|
const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x,
|
||||||
const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) {
|
const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) {
|
||||||
|
|
||||||
int id;
|
int id = ggml_cuda_get_device();
|
||||||
CUDA_CHECK(cudaGetDevice(&id));
|
|
||||||
const int compute_capability = ggml_cuda_info().devices[id].cc;
|
const int compute_capability = ggml_cuda_info().devices[id].cc;
|
||||||
|
|
||||||
int mmq_x, mmq_y, nwarps;
|
int mmq_x, mmq_y, nwarps;
|
||||||
|
@ -2143,8 +2134,7 @@ static void ggml_mul_mat_q6_K_q8_1_cuda(
|
||||||
const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x,
|
const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x,
|
||||||
const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) {
|
const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) {
|
||||||
|
|
||||||
int id;
|
int id = ggml_cuda_get_device();
|
||||||
CUDA_CHECK(cudaGetDevice(&id));
|
|
||||||
const int compute_capability = ggml_cuda_info().devices[id].cc;
|
const int compute_capability = ggml_cuda_info().devices[id].cc;
|
||||||
|
|
||||||
int mmq_x, mmq_y, nwarps;
|
int mmq_x, mmq_y, nwarps;
|
||||||
|
|
|
@ -89,8 +89,7 @@ static void mul_mat_vec_q_cuda(
|
||||||
GGML_ASSERT(ncols_x % qk == 0);
|
GGML_ASSERT(ncols_x % qk == 0);
|
||||||
GGML_ASSERT(ncols_y <= MMVQ_MAX_BATCH_SIZE);
|
GGML_ASSERT(ncols_y <= MMVQ_MAX_BATCH_SIZE);
|
||||||
|
|
||||||
int id;
|
int id = ggml_cuda_get_device();
|
||||||
CUDA_CHECK(cudaGetDevice(&id));
|
|
||||||
|
|
||||||
int64_t nwarps = 1;
|
int64_t nwarps = 1;
|
||||||
int64_t rows_per_cuda_block = 1;
|
int64_t rows_per_cuda_block = 1;
|
||||||
|
@ -328,8 +327,7 @@ void ggml_cuda_op_mul_mat_vec_q(
|
||||||
|
|
||||||
const int64_t ne0 = dst->ne[0];
|
const int64_t ne0 = dst->ne[0];
|
||||||
|
|
||||||
int id;
|
int id = ggml_cuda_get_device();
|
||||||
CUDA_CHECK(cudaGetDevice(&id));
|
|
||||||
|
|
||||||
// the main device has a larger memory buffer to hold the results from all GPUs
|
// the main device has a larger memory buffer to hold the results from all GPUs
|
||||||
// nrows_dst == nrows of the matrix that the kernel writes into
|
// nrows_dst == nrows of the matrix that the kernel writes into
|
||||||
|
|
|
@ -28,5 +28,4 @@ void ggml_cuda_op_scale(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
|
||||||
memcpy(&scale, dst->op_params, sizeof(float));
|
memcpy(&scale, dst->op_params, sizeof(float));
|
||||||
|
|
||||||
scale_f32_cuda(src0_d, dst_d, scale, ggml_nelements(src0), stream);
|
scale_f32_cuda(src0_d, dst_d, scale, ggml_nelements(src0), stream);
|
||||||
CUDA_CHECK(cudaGetLastError());
|
|
||||||
}
|
}
|
||||||
|
|
ggml-impl.h | 77
@@ -17,6 +17,83 @@
 #define MIN(a, b) ((a) < (b) ? (a) : (b))
 #define MAX(a, b) ((a) > (b) ? (a) : (b))

+/**
+ * Converts brain16 to float32.
+ *
+ * The bfloat16 floating point format has the following structure:
+ *
+ *       ┌sign
+ *       │
+ *       │   ┌exponent
+ *       │   │
+ *       │   │      ┌mantissa
+ *       │   │      │
+ *       │┌──┴───┐┌─┴───┐
+ *     0b0000000000000000 brain16
+ *
+ * Since bf16 has the same number of exponent bits as a 32bit float,
+ * encoding and decoding numbers becomes relatively straightforward.
+ *
+ *       ┌sign
+ *       │
+ *       │   ┌exponent
+ *       │   │
+ *       │   │      ┌mantissa
+ *       │   │      │
+ *       │┌──┴───┐┌─┴───────────────────┐
+ *     0b00000000000000000000000000000000 IEEE binary32
+ *
+ * For comparison, the standard fp16 format has fewer exponent bits.
+ *
+ *       ┌sign
+ *       │
+ *       │  ┌exponent
+ *       │  │
+ *       │  │      ┌mantissa
+ *       │  │      │
+ *       │┌─┴─┐┌─┴──────┐
+ *     0b0000000000000000 IEEE binary16
+ *
+ * @see IEEE 754-2008
+ */
+static inline float ggml_compute_bf16_to_fp32(ggml_bf16_t h) {
+    union {
+        float f;
+        uint32_t i;
+    } u;
+    u.i = (uint32_t)h.bits << 16;
+    return u.f;
+}
+
+/**
+ * Converts float32 to brain16.
+ *
+ * This function is binary identical to AMD Zen4 VCVTNEPS2BF16.
+ * Subnormals shall be flushed to zero, and NANs will be quiet.
+ * This code should vectorize nicely if using modern compilers.
+ */
+static inline ggml_bf16_t ggml_compute_fp32_to_bf16(float s) {
+    ggml_bf16_t h;
+    union {
+        float f;
+        uint32_t i;
+    } u;
+    u.f = s;
+    if ((u.i & 0x7fffffff) > 0x7f800000) { /* nan */
+        h.bits = (u.i >> 16) | 64; /* force to quiet */
+        return h;
+    }
+    if (!(u.i & 0x7f800000)) { /* subnormal */
+        h.bits = (u.i & 0x80000000) >> 16; /* flush to zero */
+        return h;
+    }
+    h.bits = (u.i + (0x7fff + ((u.i >> 16) & 1))) >> 16;
+    return h;
+}
+
+#define GGML_FP32_TO_BF16(x) ggml_compute_fp32_to_bf16(x)
+#define GGML_BF16_TO_FP32(x) ggml_compute_bf16_to_fp32(x)
+
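The rounding step `h.bits = (u.i + (0x7fff + ((u.i >> 16) & 1))) >> 16;` above is round-to-nearest-even on the top 16 bits of the float. A small standalone check that mirrors the patch logic follows; it is not part of llama.cpp and the helper name is made up, but the two hand-computed cases are exact.

// Standalone check of the round-to-nearest-even fp32 -> bf16 conversion above.
#include <stdint.h>
#include <stdio.h>
#include <string.h>

static uint16_t fp32_to_bf16_bits(float s) {
    uint32_t i;
    memcpy(&i, &s, sizeof(i));
    if ((i & 0x7fffffff) > 0x7f800000) {     // NaN: keep it quiet
        return (uint16_t)((i >> 16) | 64);
    }
    if (!(i & 0x7f800000)) {                 // subnormal: flush to zero
        return (uint16_t)((i & 0x80000000) >> 16);
    }
    return (uint16_t)((i + (0x7fff + ((i >> 16) & 1))) >> 16);
}

int main(void) {
    // 1.00390625f = 0x3F808000: exactly halfway, even low bit -> rounds down to 0x3F80 (1.0)
    // 1.01171875f = 0x3F818000: exactly halfway, odd  low bit -> rounds up   to 0x3F82 (1.015625)
    printf("%04x\n", fp32_to_bf16_bits(1.00390625f)); // prints 3f80
    printf("%04x\n", fp32_to_bf16_bits(1.01171875f)); // prints 3f82
    return 0;
}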
 #ifdef __cplusplus
 extern "C" {
 #endif

ggml-metal.m | 19
@@ -265,11 +265,20 @@ static void ggml_metal_log(enum ggml_log_level level, const char * format, ...){

 static void * ggml_metal_host_malloc(size_t n) {
     void * data = NULL;
+
+#if TARGET_OS_OSX
+    kern_return_t err = vm_allocate((vm_map_t) mach_task_self(), (void *) &data, n, VM_FLAGS_ANYWHERE);
+    if (err != KERN_SUCCESS) {
+        GGML_METAL_LOG_ERROR("%s: error: vm_allocate failed\n", __func__);
+        return NULL;
+    }
+#else
     const int result = posix_memalign((void **) &data, sysconf(_SC_PAGESIZE), n);
     if (result != 0) {
         GGML_METAL_LOG_ERROR("%s: error: posix_memalign failed\n", __func__);
         return NULL;
     }
+#endif

     return data;
 }
@@ -803,7 +812,7 @@ static bool ggml_metal_supports_op(const struct ggml_metal_context * ctx, const
         case GGML_OP_DIAG_MASK_INF:
         case GGML_OP_GET_ROWS:
             {
-                return op->ne[3] == 1;
+                return op->src[0]->type != GGML_TYPE_BF16 && op->ne[3] == 1;
             }
         default:
             return false;
@@ -2840,7 +2849,11 @@ GGML_CALL static void ggml_backend_metal_buffer_free_buffer(ggml_backend_buffer_
     ggml_backend_metal_free_device();

     if (ctx->owned) {
+#if TARGET_OS_OSX
+        vm_deallocate((vm_map_t)mach_task_self(), (vm_address_t)ctx->all_data, ctx->all_size);
+#else
         free(ctx->all_data);
+#endif
     }

     free(ctx);
@@ -2944,14 +2957,16 @@ GGML_CALL static ggml_backend_buffer_t ggml_backend_metal_buffer_type_alloc_buff
     ctx->owned = true;
     ctx->n_buffers = 1;

+    if (ctx->all_data != NULL) {
         ctx->buffers[0].data = ctx->all_data;
         ctx->buffers[0].size = size;
         ctx->buffers[0].metal = [device newBufferWithBytesNoCopy:ctx->all_data
                                                           length:size_aligned
                                                          options:MTLResourceStorageModeShared
                                                      deallocator:nil];
+    }

-    if (ctx->buffers[0].metal == nil) {
+    if (ctx->all_data == NULL || ctx->buffers[0].metal == nil) {
         GGML_METAL_LOG_ERROR("%s: error: failed to allocate buffer, size = %8.2f MiB\n", __func__, size_aligned / 1024.0 / 1024.0);
         free(ctx);
         ggml_backend_metal_free_device();
@@ -2175,7 +2175,7 @@ kernel void kernel_flash_attn_ext_f16(

     const short D4 = D/4;
     const short D8 = D/8;
-    const short Q8 = Q/8;
+  //const short Q8 = Q/8;
     const short NW = N_SIMDWIDTH;
     const short SH = (C + Q); // shared memory per simdgroup in (half)

@@ -2119,6 +2119,7 @@ static size_t ggml_backend_opencl_buffer_type_get_alignment(ggml_backend_buffer_
     if (alignment == (cl_uint)-1) {
         ggml_cl_init();
         clGetDeviceInfo(device, CL_DEVICE_MEM_BASE_ADDR_ALIGN, sizeof(cl_uint), &alignment, NULL);
+        alignment /= 8; // bits to bytes
     }
     return alignment;

@@ -12450,6 +12450,24 @@ bool ggml_validate_row_data(enum ggml_type type, const void * data, size_t nbyte
     const size_t nb = nbytes/ggml_type_size(type);

     switch (type) {
+        case GGML_TYPE_BF16:
+            {
+                int nans = 0;
+                int infs = 0;
+                const unsigned short * f = (const unsigned short *) data;
+                for (size_t i = 0; i < nb; ++i) {
+                    nans += (f[i] & 0x7fff) >  0x7f80;
+                    infs += (f[i] & 0x7fff) == 0x7f80;
+                }
+                if (nans) {
+                    fprintf(stderr, "%s: found %d NaNs in row of %zu BF16 values\n", __func__, nans, nb);
+                    return false;
+                }
+                if (infs) {
+                    fprintf(stderr, "%s: found %d infinities in row of %zu BF16 values\n", __func__, infs, nb);
+                    return false;
+                }
+            } break;
         case GGML_TYPE_F16:
             {
                 const ggml_fp16_t * f = (const ggml_fp16_t *) data;
@@ -8330,22 +8330,24 @@ static void mul_mat_vec_q(const void * __restrict__ vx, const void * __restrict_
     const int blocks_per_row = ncols / qk;
     const int blocks_per_warp = vdr * WARP_SIZE / qi;

-    // partial sum for each thread
+    const int qi_vdr = (qi / vdr); // N_threads processing 1 qk block
+
+    // partial sum for each thread
     float tmp = 0.0f;

     const block_q_t * x = (const block_q_t *) vx;
     const block_q8_1 * y = (const block_q8_1 *) vy;

-    for (int i = item_ct1.get_local_id(2) / (qi / vdr); i < blocks_per_row;
+    for (int i = item_ct1.get_local_id(2) / qi_vdr; i < blocks_per_row;
          i += blocks_per_warp) {
-        const int ibx = row*blocks_per_row + i; // x block index
+        const int ibx = row * blocks_per_row + i; // x block index

-        const int iby = i * (qk/QK8_1); // y block index that aligns with ibx
+        const int iby = i * (qk / QK8_1); // y block index that aligns with ibx

         const int iqs =
             vdr *
-            (item_ct1.get_local_id(2) %
-             (qi / vdr)); // x block quant index when casting the quants to int
+            (item_ct1.get_local_id(2) -
+             i * qi_vdr); // x block quant index when casting the quants to int

         tmp += vec_dot_q_sycl(&x[ibx], &y[iby], iqs);
     }

ggml-vulkan-shaders.hpp | 76726
ggml-vulkan.cpp | 1159
ggml.h | 20
@@ -326,14 +326,20 @@ extern "C" {
     // get ggml_status name string
     GGML_API GGML_CALL const char * ggml_status_to_string(enum ggml_status status);

+    // ieee 754-2008 half-precision float16
+    // todo: make this not an integral type
     typedef uint16_t ggml_fp16_t;
+    GGML_API float ggml_fp16_to_fp32(ggml_fp16_t);
+    GGML_API ggml_fp16_t ggml_fp32_to_fp16(float);
+    GGML_API void ggml_fp16_to_fp32_row(const ggml_fp16_t *, float *, int64_t);
+    GGML_API void ggml_fp32_to_fp16_row(const float *, ggml_fp16_t *, int64_t);

-    // convert FP16 <-> FP32
-    GGML_API float       ggml_fp16_to_fp32(ggml_fp16_t x);
-    GGML_API ggml_fp16_t ggml_fp32_to_fp16(float x);
-
-    GGML_API void ggml_fp16_to_fp32_row(const ggml_fp16_t * x, float * y, int64_t n);
-    GGML_API void ggml_fp32_to_fp16_row(const float * x, ggml_fp16_t * y, int64_t n);
+    // google brain half-precision bfloat16
+    typedef struct { uint16_t bits; } ggml_bf16_t;
+    GGML_API ggml_bf16_t ggml_fp32_to_bf16(float);
+    GGML_API float ggml_bf16_to_fp32(ggml_bf16_t);  // consider just doing << 16
+    GGML_API void ggml_bf16_to_fp32_row(const ggml_bf16_t *, float *, int64_t);
+    GGML_API void ggml_fp32_to_bf16_row(const float *, ggml_bf16_t *, int64_t);

     struct ggml_object;
     struct ggml_context;
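A hedged usage sketch of the new bf16 helpers declared above follows. It assumes only the declarations from this hunk and a build linked against ggml; the precision loss is expected, since bf16 keeps just 7 explicit mantissa bits.

// Round-trip a small buffer through the new bf16 row helpers.
#include <stdio.h>
#include "ggml.h"

int main(void) {
    const float src[4] = {0.1f, 1.0f, -2.5f, 1234.5678f};
    ggml_bf16_t packed[4];
    float back[4];

    ggml_fp32_to_bf16_row(src, packed, 4);  // narrow to bf16 (round-to-nearest-even)
    ggml_bf16_to_fp32_row(packed, back, 4); // widen again (a 16-bit shift)

    for (int i = 0; i < 4; ++i) {
        printf("%f -> 0x%04x -> %f\n", src[i], packed[i].bits, back[i]);
    }
    return 0;
}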
@ -370,6 +376,7 @@ extern "C" {
|
||||||
GGML_TYPE_I64 = 27,
|
GGML_TYPE_I64 = 27,
|
||||||
GGML_TYPE_F64 = 28,
|
GGML_TYPE_F64 = 28,
|
||||||
GGML_TYPE_IQ1_M = 29,
|
GGML_TYPE_IQ1_M = 29,
|
||||||
|
GGML_TYPE_BF16 = 30,
|
||||||
GGML_TYPE_COUNT,
|
GGML_TYPE_COUNT,
|
||||||
};
|
};
|
||||||
|
|
||||||
|
@ -410,6 +417,7 @@ extern "C" {
|
||||||
GGML_FTYPE_MOSTLY_IQ2_S = 21, // except 1d tensors
|
GGML_FTYPE_MOSTLY_IQ2_S = 21, // except 1d tensors
|
||||||
GGML_FTYPE_MOSTLY_IQ4_XS = 22, // except 1d tensors
|
GGML_FTYPE_MOSTLY_IQ4_XS = 22, // except 1d tensors
|
||||||
GGML_FTYPE_MOSTLY_IQ1_M = 23, // except 1d tensors
|
GGML_FTYPE_MOSTLY_IQ1_M = 23, // except 1d tensors
|
||||||
|
GGML_FTYPE_MOSTLY_BF16 = 24, // except 1d tensors
|
||||||
};
|
};
|
||||||
|
|
||||||
// available tensor operations:
|
// available tensor operations:
|
||||||
|
|
|
@ -817,6 +817,7 @@ class GGMLQuantizationType(IntEnum):
|
||||||
I64 = 27
|
I64 = 27
|
||||||
F64 = 28
|
F64 = 28
|
||||||
IQ1_M = 29
|
IQ1_M = 29
|
||||||
|
BF16 = 30
|
||||||
|
|
||||||
|
|
||||||
class GGUFEndian(IntEnum):
|
class GGUFEndian(IntEnum):
|
||||||
|
@ -859,7 +860,7 @@ class GGUFValueType(IntEnum):
|
||||||
# Note: Does not support GGML_QKK_64
|
# Note: Does not support GGML_QKK_64
|
||||||
QK_K = 256
|
QK_K = 256
|
||||||
# Items here are (block size, type size)
|
# Items here are (block size, type size)
|
||||||
GGML_QUANT_SIZES = {
|
GGML_QUANT_SIZES: dict[GGMLQuantizationType, tuple[int, int]] = {
|
||||||
GGMLQuantizationType.F32: (1, 4),
|
GGMLQuantizationType.F32: (1, 4),
|
||||||
GGMLQuantizationType.F16: (1, 2),
|
GGMLQuantizationType.F16: (1, 2),
|
||||||
GGMLQuantizationType.Q4_0: (32, 2 + 16),
|
GGMLQuantizationType.Q4_0: (32, 2 + 16),
|
||||||
|
@ -888,6 +889,7 @@ GGML_QUANT_SIZES = {
|
||||||
GGMLQuantizationType.I64: (1, 8),
|
GGMLQuantizationType.I64: (1, 8),
|
||||||
GGMLQuantizationType.F64: (1, 8),
|
GGMLQuantizationType.F64: (1, 8),
|
||||||
GGMLQuantizationType.IQ1_M: (256, QK_K // 8 + QK_K // 16 + QK_K // 32),
|
GGMLQuantizationType.IQ1_M: (256, QK_K // 8 + QK_K // 16 + QK_K // 32),
|
||||||
|
GGMLQuantizationType.BF16: (1, 2),
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -65,7 +65,7 @@ class ReaderTensor(NamedTuple):
|
||||||
|
|
||||||
class GGUFReader:
|
class GGUFReader:
|
||||||
# I - same as host, S - swapped
|
# I - same as host, S - swapped
|
||||||
byte_order: Literal['I' | 'S'] = 'I'
|
byte_order: Literal['I'] | Literal['S'] = 'I'
|
||||||
alignment: int = GGUF_DEFAULT_ALIGNMENT
|
alignment: int = GGUF_DEFAULT_ALIGNMENT
|
||||||
|
|
||||||
# Note: Internal helper, API may change.
|
# Note: Internal helper, API may change.
|
||||||
|
@ -83,7 +83,7 @@ class GGUFReader:
|
||||||
GGUFValueType.BOOL: np.bool_,
|
GGUFValueType.BOOL: np.bool_,
|
||||||
}
|
}
|
||||||
|
|
||||||
def __init__(self, path: os.PathLike[str] | str, mode: Literal['r' | 'r+' | 'c'] = 'r'):
|
def __init__(self, path: os.PathLike[str] | str, mode: Literal['r'] | Literal['r+'] | Literal['c'] = 'r'):
|
||||||
self.data = np.memmap(path, mode = mode)
|
self.data = np.memmap(path, mode = mode)
|
||||||
offs = 0
|
offs = 0
|
||||||
if self._get(offs, np.uint32, override_order = '<')[0] != GGUF_MAGIC:
|
if self._get(offs, np.uint32, override_order = '<')[0] != GGUF_MAGIC:
|
||||||
|
@ -128,7 +128,7 @@ class GGUFReader:
|
||||||
return self.tensors[idx]
|
return self.tensors[idx]
|
||||||
|
|
||||||
def _get(
|
def _get(
|
||||||
self, offset: int, dtype: npt.DTypeLike, count: int = 1, override_order: None | Literal['I' | 'S' | '<'] = None,
|
self, offset: int, dtype: npt.DTypeLike, count: int = 1, override_order: None | Literal['I'] | Literal['S'] | Literal['<'] = None,
|
||||||
) -> npt.NDArray[Any]:
|
) -> npt.NDArray[Any]:
|
||||||
count = int(count)
|
count = int(count)
|
||||||
itemsize = int(np.empty([], dtype = dtype).itemsize)
|
itemsize = int(np.empty([], dtype = dtype).itemsize)
|
||||||
|
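The `Literal` changes in the hunks above are more than style: subscripting `Literal` with `'I' | 'S'` applies the `|` operator to two `str` objects, which type checkers such as pyright reject and which raises `TypeError` if the annotation is ever evaluated. A small self-contained sketch of the distinction (the `ByteOrder` alias is only for illustration):

```python
from typing import Literal

# Fine: one Literal with several allowed values (equivalent to the
# union-of-Literals spelling used in the diff above).
ByteOrder = Literal['I', 'S']

def set_order(order: ByteOrder) -> None:
    print(order)

# Broken: 'I' | 'S' tries to "or" two str objects, so the subscript
# blows up the moment the expression is evaluated.
try:
    Bad = Literal['I' | 'S']
except TypeError as exc:
    print(f"TypeError: {exc}")
```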
@ -250,7 +250,7 @@ class GGUFReader:
|
||||||
raise ValueError(f'Found duplicated tensor with name {tensor_name}')
|
raise ValueError(f'Found duplicated tensor with name {tensor_name}')
|
||||||
tensor_names.add(tensor_name)
|
tensor_names.add(tensor_name)
|
||||||
ggml_type = GGMLQuantizationType(raw_dtype[0])
|
ggml_type = GGMLQuantizationType(raw_dtype[0])
|
||||||
n_elems = np.prod(dims)
|
n_elems = int(np.prod(dims))
|
||||||
block_size, type_size = GGML_QUANT_SIZES[ggml_type]
|
block_size, type_size = GGML_QUANT_SIZES[ggml_type]
|
||||||
n_bytes = n_elems * type_size // block_size
|
n_bytes = n_elems * type_size // block_size
|
||||||
data_offs = int(start_offs + offset_tensor[0])
|
data_offs = int(start_offs + offset_tensor[0])
|
||||||
|
|
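The size bookkeeping above is driven entirely by the `(block size, type size)` pairs in `GGML_QUANT_SIZES`: a tensor's byte size is `n_elems * type_size // block_size`. A quick worked example using pairs that appear in the table (the tensor shape is made up for illustration):

```python
# Worked example of n_bytes = n_elems * type_size // block_size,
# using (block_size, type_size) pairs from GGML_QUANT_SIZES above.
quant_sizes = {
    "F32":  (1, 4),
    "F16":  (1, 2),
    "BF16": (1, 2),
    "Q4_0": (32, 2 + 16),
}

n_elems = 4096 * 4096            # hypothetical 2-D tensor
for name, (block_size, type_size) in quant_sizes.items():
    n_bytes = n_elems * type_size // block_size
    print(f"{name:>4}: {n_bytes} bytes")
# Q4_0: 4096*4096*18//32 = 9437184 bytes, i.e. 4.5 bits per element on average
```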
|
@ -7,7 +7,7 @@ import struct
|
||||||
import tempfile
|
import tempfile
|
||||||
from enum import Enum, auto
|
from enum import Enum, auto
|
||||||
from io import BufferedWriter
|
from io import BufferedWriter
|
||||||
from typing import IO, Any, Sequence, Mapping
|
from typing import IO, Any, Callable, Sequence, Mapping
|
||||||
from string import ascii_letters, digits
|
from string import ascii_letters, digits
|
||||||
|
|
||||||
import numpy as np
|
import numpy as np
|
||||||
|
@ -28,6 +28,47 @@ from .constants import (
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
class LazyTensor:
|
||||||
|
data: Callable[[], np.ndarray[Any, Any]]
|
||||||
|
# to avoid too deep recursion
|
||||||
|
functions: list[Callable[[np.ndarray[Any, Any]], np.ndarray[Any, Any]]]
|
||||||
|
dtype: np.dtype[Any]
|
||||||
|
shape: tuple[int, ...]
|
||||||
|
|
||||||
|
def __init__(self, data: Callable[[], np.ndarray[Any, Any]], *, dtype: type, shape: tuple[int, ...]):
|
||||||
|
self.data = data
|
||||||
|
self.functions = []
|
||||||
|
self.dtype = np.dtype(dtype)
|
||||||
|
self.shape = shape
|
||||||
|
|
||||||
|
def astype(self, dtype: type, **kwargs) -> LazyTensor:
|
||||||
|
self.functions.append(lambda n: n.astype(dtype, **kwargs))
|
||||||
|
self.dtype = np.dtype(dtype)
|
||||||
|
return self
|
||||||
|
|
||||||
|
@property
|
||||||
|
def nbytes(self) -> int:
|
||||||
|
size = 1
|
||||||
|
for n in self.shape:
|
||||||
|
size *= n
|
||||||
|
return size * self.dtype.itemsize
|
||||||
|
|
||||||
|
def tofile(self, *args, **kwargs) -> None:
|
||||||
|
data = self.data()
|
||||||
|
for f in self.functions:
|
||||||
|
data = f(data)
|
||||||
|
assert data.shape == self.shape
|
||||||
|
assert data.dtype == self.dtype
|
||||||
|
assert data.nbytes == self.nbytes
|
||||||
|
self.functions = []
|
||||||
|
self.data = lambda: data
|
||||||
|
data.tofile(*args, **kwargs)
|
||||||
|
|
||||||
|
def byteswap(self, *args, **kwargs) -> LazyTensor:
|
||||||
|
self.functions.append(lambda n: n.byteswap(*args, **kwargs))
|
||||||
|
return self
|
||||||
|
|
||||||
|
|
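`LazyTensor` defers both loading and any dtype conversion until the data is actually written, so a conversion script can register many tensors without materializing them all at once. A hypothetical usage sketch (the array contents and output file name are made up; the import assumes the gguf-py package layout shown in this diff):

```python
import numpy as np
from gguf.gguf_writer import LazyTensor  # class added in the diff above

# Nothing is loaded or converted until tofile() runs.
lazy = LazyTensor(lambda: np.arange(32, dtype=np.float32).reshape(4, 8),
                  dtype=np.float32, shape=(4, 8))
lazy = lazy.astype(np.float16)      # queued as a pending function, not applied yet
print(lazy.nbytes)                  # 4 * 8 * 2 = 64, computed from metadata only

with open("tensor.bin", "wb") as f:
    lazy.tofile(f)                  # load -> astype -> write happens here
```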
||||||
class WriterState(Enum):
|
class WriterState(Enum):
|
||||||
EMPTY = auto()
|
EMPTY = auto()
|
||||||
HEADER = auto()
|
HEADER = auto()
|
||||||
|
@ -38,7 +79,7 @@ class WriterState(Enum):
|
||||||
class GGUFWriter:
|
class GGUFWriter:
|
||||||
fout: BufferedWriter
|
fout: BufferedWriter
|
||||||
temp_file: tempfile.SpooledTemporaryFile[bytes] | None
|
temp_file: tempfile.SpooledTemporaryFile[bytes] | None
|
||||||
tensors: list[np.ndarray[Any, Any]]
|
tensors: list[np.ndarray[Any, Any] | LazyTensor]
|
||||||
_simple_value_packing = {
|
_simple_value_packing = {
|
||||||
GGUFValueType.UINT8: "B",
|
GGUFValueType.UINT8: "B",
|
||||||
GGUFValueType.INT8: "b",
|
GGUFValueType.INT8: "b",
|
||||||
|
@ -176,7 +217,7 @@ class GGUFWriter:
|
||||||
if pack_fmt is not None:
|
if pack_fmt is not None:
|
||||||
self.kv_data += self._pack(pack_fmt, val, skip_pack_prefix = vtype == GGUFValueType.BOOL)
|
self.kv_data += self._pack(pack_fmt, val, skip_pack_prefix = vtype == GGUFValueType.BOOL)
|
||||||
elif vtype == GGUFValueType.STRING:
|
elif vtype == GGUFValueType.STRING:
|
||||||
encoded_val = val.encode("utf8") if isinstance(val, str) else val
|
encoded_val = val.encode("utf-8") if isinstance(val, str) else val
|
||||||
self.kv_data += self._pack("Q", len(encoded_val))
|
self.kv_data += self._pack("Q", len(encoded_val))
|
||||||
self.kv_data += encoded_val
|
self.kv_data += encoded_val
|
||||||
elif vtype == GGUFValueType.ARRAY and isinstance(val, Sequence) and val:
|
elif vtype == GGUFValueType.ARRAY and isinstance(val, Sequence) and val:
|
||||||
|
@ -205,7 +246,7 @@ class GGUFWriter:
|
||||||
raise ValueError(f'Duplicated tensor name {name}')
|
raise ValueError(f'Duplicated tensor name {name}')
|
||||||
self.ti_names.add(name)
|
self.ti_names.add(name)
|
||||||
|
|
||||||
encoded_name = name.encode("utf8")
|
encoded_name = name.encode("utf-8")
|
||||||
self.ti_data += self._pack("Q", len(encoded_name))
|
self.ti_data += self._pack("Q", len(encoded_name))
|
||||||
self.ti_data += encoded_name
|
self.ti_data += encoded_name
|
||||||
n_dims = len(tensor_shape)
|
n_dims = len(tensor_shape)
|
||||||
|
@ -237,7 +278,7 @@ class GGUFWriter:
|
||||||
self.ti_data_count += 1
|
self.ti_data_count += 1
|
||||||
|
|
||||||
def add_tensor(
|
def add_tensor(
|
||||||
self, name: str, tensor: np.ndarray[Any, Any], raw_shape: Sequence[int] | None = None,
|
self, name: str, tensor: np.ndarray[Any, Any] | LazyTensor, raw_shape: Sequence[int] | None = None,
|
||||||
raw_dtype: GGMLQuantizationType | None = None,
|
raw_dtype: GGMLQuantizationType | None = None,
|
||||||
) -> None:
|
) -> None:
|
||||||
if self.endianess == GGUFEndian.BIG:
|
if self.endianess == GGUFEndian.BIG:
|
||||||
|
@ -262,7 +303,7 @@ class GGUFWriter:
|
||||||
if pad != 0:
|
if pad != 0:
|
||||||
fp.write(bytes([0] * pad))
|
fp.write(bytes([0] * pad))
|
||||||
|
|
||||||
def write_tensor_data(self, tensor: np.ndarray[Any, Any]) -> None:
|
def write_tensor_data(self, tensor: np.ndarray[Any, Any] | LazyTensor) -> None:
|
||||||
if self.state is not WriterState.TI_DATA:
|
if self.state is not WriterState.TI_DATA:
|
||||||
raise ValueError(f'Expected output file to contain tensor info, got {self.state}')
|
raise ValueError(f'Expected output file to contain tensor info, got {self.state}')
|
||||||
|
|
||||||
|
@ -272,15 +313,33 @@ class GGUFWriter:
|
||||||
tensor.tofile(self.fout)
|
tensor.tofile(self.fout)
|
||||||
self.write_padding(self.fout, tensor.nbytes)
|
self.write_padding(self.fout, tensor.nbytes)
|
||||||
|
|
||||||
def write_tensors_to_file(self) -> None:
|
def write_tensors_to_file(self, *, progress: bool = False) -> None:
|
||||||
self.write_ti_data_to_file()
|
self.write_ti_data_to_file()
|
||||||
|
|
||||||
self.write_padding(self.fout, self.fout.tell())
|
self.write_padding(self.fout, self.fout.tell())
|
||||||
|
|
||||||
if self.temp_file is None:
|
if self.temp_file is None:
|
||||||
|
self.tensors.reverse() # to pop from the "beginning" in constant time
|
||||||
|
|
||||||
|
if progress:
|
||||||
|
from tqdm import tqdm
|
||||||
|
|
||||||
|
total_bytes = sum(t.nbytes for t in self.tensors)
|
||||||
|
|
||||||
|
bar = tqdm(desc="Writing", total=total_bytes, unit="byte", unit_scale=True)
|
||||||
|
|
||||||
while True:
|
while True:
|
||||||
try:
|
try:
|
||||||
tensor = self.tensors.pop(0)
|
tensor = self.tensors.pop()
|
||||||
|
except IndexError:
|
||||||
|
break
|
||||||
|
tensor.tofile(self.fout)
|
||||||
|
bar.update(tensor.nbytes)
|
||||||
|
self.write_padding(self.fout, tensor.nbytes)
|
||||||
|
return
|
||||||
|
while True:
|
||||||
|
try:
|
||||||
|
tensor = self.tensors.pop()
|
||||||
except IndexError:
|
except IndexError:
|
||||||
break
|
break
|
||||||
tensor.tofile(self.fout)
|
tensor.tofile(self.fout)
|
||||||
|
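The rewritten `write_tensors_to_file` reverses `self.tensors` once and then pops from the end instead of popping index 0. The comment in the diff gives the reason: `list.pop()` from the tail is O(1), while `pop(0)` shifts every remaining element. A tiny standalone sketch of the pattern with hypothetical tensor names:

```python
# Pop-from-the-end pattern used above: reverse once, then each pop() is O(1),
# while pop(0) would shift the whole remaining list on every iteration.
tensors = ["tok_embd", "blk.0.attn_q", "blk.0.attn_k"]   # hypothetical names
tensors.reverse()
while True:
    try:
        tensor = tensors.pop()       # yields the tensors in their original order
    except IndexError:
        break
    print(tensor)
```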
@ -479,7 +538,7 @@ class GGUFWriter:
|
||||||
self.add_bool(Keys.Tokenizer.ADD_PREFIX, value)
|
self.add_bool(Keys.Tokenizer.ADD_PREFIX, value)
|
||||||
|
|
||||||
def add_chat_template(self, value: str | Sequence[Mapping[str, str]]) -> None:
|
def add_chat_template(self, value: str | Sequence[Mapping[str, str]]) -> None:
|
||||||
if isinstance(value, list):
|
if not isinstance(value, str):
|
||||||
template_default = None
|
template_default = None
|
||||||
template_names = set()
|
template_names = set()
|
||||||
|
|
||||||
|
|
|
@ -4,7 +4,7 @@ import logging
|
||||||
import json
|
import json
|
||||||
import os
|
import os
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import Any, Callable
|
from typing import Any, Callable, Sequence, Mapping, Iterable
|
||||||
|
|
||||||
from .gguf_writer import GGUFWriter
|
from .gguf_writer import GGUFWriter
|
||||||
|
|
||||||
|
@ -15,11 +15,11 @@ class SpecialVocab:
|
||||||
merges: list[str]
|
merges: list[str]
|
||||||
add_special_token: dict[str, bool]
|
add_special_token: dict[str, bool]
|
||||||
special_token_ids: dict[str, int]
|
special_token_ids: dict[str, int]
|
||||||
chat_template: str | None
|
chat_template: str | Sequence[Mapping[str, str]] | None
|
||||||
|
|
||||||
def __init__(
|
def __init__(
|
||||||
self, path: str | os.PathLike[str], load_merges: bool = False,
|
self, path: str | os.PathLike[str], load_merges: bool = False,
|
||||||
special_token_types: tuple[str, ...] | None = None,
|
special_token_types: Iterable[str] | None = None,
|
||||||
n_vocab: int | None = None,
|
n_vocab: int | None = None,
|
||||||
):
|
):
|
||||||
self.special_token_ids = {}
|
self.special_token_ids = {}
|
||||||
|
|
|
@ -21,6 +21,7 @@ classifiers = [
|
||||||
[tool.poetry.dependencies]
|
[tool.poetry.dependencies]
|
||||||
python = ">=3.8"
|
python = ">=3.8"
|
||||||
numpy = ">=1.17"
|
numpy = ">=1.17"
|
||||||
|
tqdm = ">=4.27"
|
||||||
|
|
||||||
[tool.poetry.dev-dependencies]
|
[tool.poetry.dev-dependencies]
|
||||||
pytest = "^5.2"
|
pytest = "^5.2"
|
||||||
|
|
|
@ -47,7 +47,7 @@ def dump_metadata(reader: GGUFReader, args: argparse.Namespace) -> None:
|
||||||
if len(field.types) == 1:
|
if len(field.types) == 1:
|
||||||
curr_type = field.types[0]
|
curr_type = field.types[0]
|
||||||
if curr_type == GGUFValueType.STRING:
|
if curr_type == GGUFValueType.STRING:
|
||||||
log_message += ' = {0}'.format(repr(str(bytes(field.parts[-1]), encoding='utf8')[:60]))
|
log_message += ' = {0}'.format(repr(str(bytes(field.parts[-1]), encoding='utf-8')[:60]))
|
||||||
elif field.types[0] in reader.gguf_scalar_to_np:
|
elif field.types[0] in reader.gguf_scalar_to_np:
|
||||||
log_message += ' = {0}'.format(field.parts[-1][0])
|
log_message += ' = {0}'.format(field.parts[-1][0])
|
||||||
print(log_message) # noqa: NP100
|
print(log_message) # noqa: NP100
|
||||||
|
|
gguf-py/scripts/gguf-new-metadata.py: 100 changed lines (mode changed: normal file → executable file)
|
@ -7,7 +7,8 @@ import json
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
|
||||||
import numpy as np
|
import numpy as np
|
||||||
from typing import Any, Mapping, Sequence
|
from tqdm import tqdm
|
||||||
|
from typing import Any, Sequence, NamedTuple
|
||||||
|
|
||||||
# Necessary to load the local gguf package
|
# Necessary to load the local gguf package
|
||||||
if "NO_LOCAL_GGUF" not in os.environ and (Path(__file__).parent.parent.parent / 'gguf-py').exists():
|
if "NO_LOCAL_GGUF" not in os.environ and (Path(__file__).parent.parent.parent / 'gguf-py').exists():
|
||||||
|
@ -18,6 +19,12 @@ import gguf
|
||||||
logger = logging.getLogger("gguf-new-metadata")
|
logger = logging.getLogger("gguf-new-metadata")
|
||||||
|
|
||||||
|
|
||||||
|
class MetadataDetails(NamedTuple):
|
||||||
|
type: gguf.GGUFValueType
|
||||||
|
value: Any
|
||||||
|
description: str = ''
|
||||||
|
|
||||||
|
|
||||||
def get_byteorder(reader: gguf.GGUFReader) -> gguf.GGUFEndian:
|
def get_byteorder(reader: gguf.GGUFReader) -> gguf.GGUFEndian:
|
||||||
if np.uint32(1) == np.uint32(1).newbyteorder("<"):
|
if np.uint32(1) == np.uint32(1).newbyteorder("<"):
|
||||||
# Host is little endian
|
# Host is little endian
|
||||||
|
@ -34,7 +41,7 @@ def get_byteorder(reader: gguf.GGUFReader) -> gguf.GGUFEndian:
|
||||||
return host_endian
|
return host_endian
|
||||||
|
|
||||||
|
|
||||||
def decode_field(field: gguf.ReaderField) -> Any:
|
def decode_field(field: gguf.ReaderField | None) -> Any:
|
||||||
if field and field.types:
|
if field and field.types:
|
||||||
main_type = field.types[0]
|
main_type = field.types[0]
|
||||||
|
|
||||||
|
@ -42,11 +49,11 @@ def decode_field(field: gguf.ReaderField) -> Any:
|
||||||
sub_type = field.types[-1]
|
sub_type = field.types[-1]
|
||||||
|
|
||||||
if sub_type == gguf.GGUFValueType.STRING:
|
if sub_type == gguf.GGUFValueType.STRING:
|
||||||
return [str(bytes(field.parts[idx]), encoding='utf8') for idx in field.data]
|
return [str(bytes(field.parts[idx]), encoding='utf-8') for idx in field.data]
|
||||||
else:
|
else:
|
||||||
return [pv for idx in field.data for pv in field.parts[idx].tolist()]
|
return [pv for idx in field.data for pv in field.parts[idx].tolist()]
|
||||||
if main_type == gguf.GGUFValueType.STRING:
|
if main_type == gguf.GGUFValueType.STRING:
|
||||||
return str(bytes(field.parts[-1]), encoding='utf8')
|
return str(bytes(field.parts[-1]), encoding='utf-8')
|
||||||
else:
|
else:
|
||||||
return field.parts[-1][0]
|
return field.parts[-1][0]
|
||||||
|
|
||||||
|
@ -59,7 +66,16 @@ def get_field_data(reader: gguf.GGUFReader, key: str) -> Any:
|
||||||
return decode_field(field)
|
return decode_field(field)
|
||||||
|
|
||||||
|
|
||||||
def copy_with_new_metadata(reader: gguf.GGUFReader, writer: gguf.GGUFWriter, new_metadata: Mapping[str, str], remove_metadata: Sequence[str]) -> None:
|
def find_token(token_list: Sequence[int], token: str) -> Sequence[int]:
|
||||||
|
token_ids = [index for index, value in enumerate(token_list) if value == token]
|
||||||
|
|
||||||
|
if len(token_ids) == 0:
|
||||||
|
raise LookupError(f'Unable to find "{token}" in token list!')
|
||||||
|
|
||||||
|
return token_ids
|
||||||
|
|
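`find_token` returns every index at which the literal token string occurs, so the caller can warn when a token value is ambiguous and raise if it is missing. What it computes for a hypothetical token list:

```python
# Same comprehension as find_token above, on a made-up token list.
token_list = ['<unk>', '<s>', '</s>', '<s>']
token_ids = [index for index, value in enumerate(token_list) if value == '<s>']
print(token_ids)   # [1, 3]; an empty result makes find_token raise LookupError
```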
||||||
|
|
||||||
|
def copy_with_new_metadata(reader: gguf.GGUFReader, writer: gguf.GGUFWriter, new_metadata: dict[str, MetadataDetails], remove_metadata: Sequence[str]) -> None:
|
||||||
for field in reader.fields.values():
|
for field in reader.fields.values():
|
||||||
# Suppress virtual fields and fields written by GGUFWriter
|
# Suppress virtual fields and fields written by GGUFWriter
|
||||||
if field.name == gguf.Keys.General.ARCHITECTURE or field.name.startswith('GGUF.'):
|
if field.name == gguf.Keys.General.ARCHITECTURE or field.name.startswith('GGUF.'):
|
||||||
|
@ -75,54 +91,64 @@ def copy_with_new_metadata(reader: gguf.GGUFReader, writer: gguf.GGUFWriter, new
|
||||||
logger.debug(f'Removing {field.name}')
|
logger.debug(f'Removing {field.name}')
|
||||||
continue
|
continue
|
||||||
|
|
||||||
old_val = decode_field(field)
|
old_val = MetadataDetails(field.types[0], decode_field(field))
|
||||||
val = new_metadata.get(field.name, old_val)
|
val = new_metadata.get(field.name, old_val)
|
||||||
|
|
||||||
if field.name in new_metadata:
|
if field.name in new_metadata:
|
||||||
logger.debug(f'Modifying {field.name}: "{old_val}" -> "{val}"')
|
logger.debug(f'Modifying {field.name}: "{old_val.value}" -> "{val.value}" {val.description}')
|
||||||
del new_metadata[field.name]
|
del new_metadata[field.name]
|
||||||
elif val is not None:
|
elif val.value is not None:
|
||||||
logger.debug(f'Copying {field.name}')
|
logger.debug(f'Copying {field.name}')
|
||||||
|
|
||||||
if val is not None:
|
if val.value is not None:
|
||||||
writer.add_key(field.name)
|
writer.add_key(field.name)
|
||||||
writer.add_val(val, field.types[0])
|
writer.add_val(val.value, val.type)
|
||||||
|
|
||||||
if gguf.Keys.Tokenizer.CHAT_TEMPLATE in new_metadata:
|
if gguf.Keys.Tokenizer.CHAT_TEMPLATE in new_metadata:
|
||||||
logger.debug('Adding chat template(s)')
|
logger.debug('Adding chat template(s)')
|
||||||
writer.add_chat_template(new_metadata[gguf.Keys.Tokenizer.CHAT_TEMPLATE])
|
writer.add_chat_template(new_metadata[gguf.Keys.Tokenizer.CHAT_TEMPLATE].value)
|
||||||
del new_metadata[gguf.Keys.Tokenizer.CHAT_TEMPLATE]
|
del new_metadata[gguf.Keys.Tokenizer.CHAT_TEMPLATE]
|
||||||
|
|
||||||
# TODO: Support other types than string?
|
|
||||||
for key, val in new_metadata.items():
|
for key, val in new_metadata.items():
|
||||||
logger.debug(f'Adding {key}: {val}')
|
logger.debug(f'Adding {key}: "{val.value}" {val.description}')
|
||||||
writer.add_key(key)
|
writer.add_key(key)
|
||||||
writer.add_val(val, gguf.GGUFValueType.STRING)
|
writer.add_val(val.value, val.type)
|
||||||
|
|
||||||
|
total_bytes = 0
|
||||||
|
|
||||||
for tensor in reader.tensors:
|
for tensor in reader.tensors:
|
||||||
|
total_bytes += tensor.n_bytes
|
||||||
# Dimensions are written in reverse order, so flip them first
|
# Dimensions are written in reverse order, so flip them first
|
||||||
shape = np.flipud(tensor.shape)
|
shape = np.flipud(tensor.shape).tolist()
|
||||||
writer.add_tensor_info(tensor.name, shape, tensor.data.dtype, tensor.data.nbytes, tensor.tensor_type)
|
writer.add_tensor_info(tensor.name, shape, tensor.data.dtype, tensor.data.nbytes, tensor.tensor_type)
|
||||||
|
|
||||||
|
bar = tqdm(desc="Writing", total=total_bytes, unit="byte", unit_scale=True)
|
||||||
|
|
||||||
writer.write_header_to_file()
|
writer.write_header_to_file()
|
||||||
writer.write_kv_data_to_file()
|
writer.write_kv_data_to_file()
|
||||||
writer.write_ti_data_to_file()
|
writer.write_ti_data_to_file()
|
||||||
|
|
||||||
for tensor in reader.tensors:
|
for tensor in reader.tensors:
|
||||||
writer.write_tensor_data(tensor.data)
|
writer.write_tensor_data(tensor.data)
|
||||||
|
bar.update(tensor.n_bytes)
|
||||||
|
|
||||||
writer.close()
|
writer.close()
|
||||||
|
|
||||||
|
|
||||||
def main() -> None:
|
def main() -> None:
|
||||||
|
tokenizer_metadata = (getattr(gguf.Keys.Tokenizer, n) for n in gguf.Keys.Tokenizer.__dict__.keys() if not n.startswith('_'))
|
||||||
|
token_names = dict((n.split('.')[-1][:-len('_token_id')], n) for n in tokenizer_metadata if n.endswith('_token_id'))
|
||||||
|
|
||||||
parser = argparse.ArgumentParser(description="Make a copy of a GGUF file with new metadata")
|
parser = argparse.ArgumentParser(description="Make a copy of a GGUF file with new metadata")
|
||||||
parser.add_argument("input", type=Path, help="GGUF format model input filename")
|
parser.add_argument("input", type=Path, help="GGUF format model input filename")
|
||||||
parser.add_argument("output", type=Path, help="GGUF format model output filename")
|
parser.add_argument("output", type=Path, help="GGUF format model output filename")
|
||||||
parser.add_argument("--general-name", type=str, help="The models general.name")
|
parser.add_argument("--general-name", type=str, help="The models general.name", metavar='"name"')
|
||||||
parser.add_argument("--general-description", type=str, help="The models general.description")
|
parser.add_argument("--general-description", type=str, help="The models general.description", metavar='"Description ..."')
|
||||||
parser.add_argument("--chat-template", type=str, help="Chat template string (or JSON string containing templates)")
|
parser.add_argument("--chat-template", type=str, help="Chat template string (or JSON string containing templates)", metavar='"{% ... %} ..."')
|
||||||
parser.add_argument("--chat-template-config", type=Path, help="Config file (tokenizer_config.json) containing chat template(s)")
|
parser.add_argument("--chat-template-config", type=Path, help="Config file containing chat template(s)", metavar='tokenizer_config.json')
|
||||||
parser.add_argument("--remove-metadata", action="append", type=str, help="Remove metadata (by key name) from output model")
|
parser.add_argument("--remove-metadata", action="append", type=str, help="Remove metadata (by key name) from output model", metavar='general.url')
|
||||||
|
parser.add_argument("--special-token", action="append", type=str, help="Special token by value", nargs=2, metavar=(' | '.join(token_names.keys()), '"<token>"'))
|
||||||
|
parser.add_argument("--special-token-by-id", action="append", type=str, help="Special token by id", nargs=2, metavar=(' | '.join(token_names.keys()), '0'))
|
||||||
parser.add_argument("--force", action="store_true", help="Bypass warnings without confirmation")
|
parser.add_argument("--force", action="store_true", help="Bypass warnings without confirmation")
|
||||||
parser.add_argument("--verbose", action="store_true", help="Increase output verbosity")
|
parser.add_argument("--verbose", action="store_true", help="Increase output verbosity")
|
||||||
args = parser.parse_args(None if len(sys.argv) > 2 else ["--help"])
|
args = parser.parse_args(None if len(sys.argv) > 2 else ["--help"])
|
||||||
|
@ -133,20 +159,20 @@ def main() -> None:
|
||||||
remove_metadata = args.remove_metadata or []
|
remove_metadata = args.remove_metadata or []
|
||||||
|
|
||||||
if args.general_name:
|
if args.general_name:
|
||||||
new_metadata[gguf.Keys.General.NAME] = args.general_name
|
new_metadata[gguf.Keys.General.NAME] = MetadataDetails(gguf.GGUFValueType.STRING, args.general_name)
|
||||||
|
|
||||||
if args.general_description:
|
if args.general_description:
|
||||||
new_metadata[gguf.Keys.General.DESCRIPTION] = args.general_description
|
new_metadata[gguf.Keys.General.DESCRIPTION] = MetadataDetails(gguf.GGUFValueType.STRING, args.general_description)
|
||||||
|
|
||||||
if args.chat_template:
|
if args.chat_template:
|
||||||
new_metadata[gguf.Keys.Tokenizer.CHAT_TEMPLATE] = json.loads(args.chat_template) if args.chat_template.startswith('[') else args.chat_template
|
new_metadata[gguf.Keys.Tokenizer.CHAT_TEMPLATE] = MetadataDetails(gguf.GGUFValueType.STRING, json.loads(args.chat_template) if args.chat_template.startswith('[') else args.chat_template)
|
||||||
|
|
||||||
if args.chat_template_config:
|
if args.chat_template_config:
|
||||||
with open(args.chat_template_config, 'r') as fp:
|
with open(args.chat_template_config, 'r') as fp:
|
||||||
config = json.load(fp)
|
config = json.load(fp)
|
||||||
template = config.get('chat_template')
|
template = config.get('chat_template')
|
||||||
if template:
|
if template:
|
||||||
new_metadata[gguf.Keys.Tokenizer.CHAT_TEMPLATE] = template
|
new_metadata[gguf.Keys.Tokenizer.CHAT_TEMPLATE] = MetadataDetails(gguf.GGUFValueType.STRING, template)
|
||||||
|
|
||||||
if remove_metadata:
|
if remove_metadata:
|
||||||
logger.warning('*** Warning *** Warning *** Warning **')
|
logger.warning('*** Warning *** Warning *** Warning **')
|
||||||
|
@ -166,6 +192,32 @@ def main() -> None:
|
||||||
arch = get_field_data(reader, gguf.Keys.General.ARCHITECTURE)
|
arch = get_field_data(reader, gguf.Keys.General.ARCHITECTURE)
|
||||||
endianess = get_byteorder(reader)
|
endianess = get_byteorder(reader)
|
||||||
|
|
||||||
|
token_list = get_field_data(reader, gguf.Keys.Tokenizer.LIST) or []
|
||||||
|
|
||||||
|
for name, token in args.special_token or []:
|
||||||
|
if name not in token_names:
|
||||||
|
logger.warning(f'Unknown special token "{name}", ignoring...')
|
||||||
|
else:
|
||||||
|
ids = find_token(token_list, token)
|
||||||
|
new_metadata[token_names[name]] = MetadataDetails(gguf.GGUFValueType.UINT32, ids[0], f'= {token}')
|
||||||
|
|
||||||
|
if len(ids) > 1:
|
||||||
|
logger.warning(f'Multiple "{token}" tokens found, choosing ID {ids[0]}, use --special-token-by-id if you want another:')
|
||||||
|
logger.warning(', '.join(str(i) for i in ids))
|
||||||
|
|
||||||
|
for name, id_string in args.special_token_by_id or []:
|
||||||
|
if name not in token_names:
|
||||||
|
logger.warning(f'Unknown special token "{name}", ignoring...')
|
||||||
|
elif not id_string.isdecimal():
|
||||||
|
raise LookupError(f'Token ID "{id_string}" is not a valid ID!')
|
||||||
|
else:
|
||||||
|
id_int = int(id_string)
|
||||||
|
|
||||||
|
if id_int >= 0 and id_int < len(token_list):
|
||||||
|
new_metadata[token_names[name]] = MetadataDetails(gguf.GGUFValueType.UINT32, id_int, f'= {token_list[id_int]}')
|
||||||
|
else:
|
||||||
|
raise LookupError(f'Token ID {id_int} is not within token list!')
|
||||||
|
|
||||||
if os.path.isfile(args.output) and not args.force:
|
if os.path.isfile(args.output) and not args.force:
|
||||||
logger.warning('*** Warning *** Warning *** Warning **')
|
logger.warning('*** Warning *** Warning *** Warning **')
|
||||||
logger.warning(f'* The "{args.output}" GGUF file already exists, it will be overwritten!')
|
logger.warning(f'* The "{args.output}" GGUF file already exists, it will be overwritten!')
|
||||||
|
|
|
@ -51,7 +51,7 @@ single-line ::= [^\n]+ "\n"`
|
||||||
|
|
||||||
## Sequences and Alternatives
|
## Sequences and Alternatives
|
||||||
|
|
||||||
The order of symbols in a sequence matter. For example, in `"1. " move " " move "\n"`, the `"1. "` must come before the first `move`, etc.
|
The order of symbols in a sequence matters. For example, in `"1. " move " " move "\n"`, the `"1. "` must come before the first `move`, etc.
|
||||||
|
|
||||||
Alternatives, denoted by `|`, give different sequences that are acceptable. For example, in `move ::= pawn | nonpawn | castle`, `move` can be a `pawn` move, a `nonpawn` move, or a `castle`.
|
Alternatives, denoted by `|`, give different sequences that are acceptable. For example, in `move ::= pawn | nonpawn | castle`, `move` can be a `pawn` move, a `nonpawn` move, or a `castle`.
|
||||||
|
|
||||||
|
|
llama.cpp: 49 changed lines
|
@ -3175,6 +3175,7 @@ struct llama_model_loader {
|
||||||
switch (type_max) {
|
switch (type_max) {
|
||||||
case GGML_TYPE_F32: ftype = LLAMA_FTYPE_ALL_F32; break;
|
case GGML_TYPE_F32: ftype = LLAMA_FTYPE_ALL_F32; break;
|
||||||
case GGML_TYPE_F16: ftype = LLAMA_FTYPE_MOSTLY_F16; break;
|
case GGML_TYPE_F16: ftype = LLAMA_FTYPE_MOSTLY_F16; break;
|
||||||
|
case GGML_TYPE_BF16: ftype = LLAMA_FTYPE_MOSTLY_BF16; break;
|
||||||
case GGML_TYPE_Q4_0: ftype = LLAMA_FTYPE_MOSTLY_Q4_0; break;
|
case GGML_TYPE_Q4_0: ftype = LLAMA_FTYPE_MOSTLY_Q4_0; break;
|
||||||
case GGML_TYPE_Q4_1: ftype = LLAMA_FTYPE_MOSTLY_Q4_1; break;
|
case GGML_TYPE_Q4_1: ftype = LLAMA_FTYPE_MOSTLY_Q4_1; break;
|
||||||
case GGML_TYPE_Q5_0: ftype = LLAMA_FTYPE_MOSTLY_Q5_0; break;
|
case GGML_TYPE_Q5_0: ftype = LLAMA_FTYPE_MOSTLY_Q5_0; break;
|
||||||
|
@ -3666,6 +3667,7 @@ static std::string llama_model_ftype_name(llama_ftype ftype) {
|
||||||
switch (ftype) {
|
switch (ftype) {
|
||||||
case LLAMA_FTYPE_ALL_F32: return "all F32";
|
case LLAMA_FTYPE_ALL_F32: return "all F32";
|
||||||
case LLAMA_FTYPE_MOSTLY_F16: return "F16";
|
case LLAMA_FTYPE_MOSTLY_F16: return "F16";
|
||||||
|
case LLAMA_FTYPE_MOSTLY_BF16: return "BF16";
|
||||||
case LLAMA_FTYPE_MOSTLY_Q4_0: return "Q4_0";
|
case LLAMA_FTYPE_MOSTLY_Q4_0: return "Q4_0";
|
||||||
case LLAMA_FTYPE_MOSTLY_Q4_1: return "Q4_1";
|
case LLAMA_FTYPE_MOSTLY_Q4_1: return "Q4_1";
|
||||||
case LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16:
|
case LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16:
|
||||||
|
@ -4389,6 +4391,15 @@ static void llm_load_vocab(
|
||||||
} else if (
|
} else if (
|
||||||
tokenizer_pre == "command-r") {
|
tokenizer_pre == "command-r") {
|
||||||
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_COMMAND_R;
|
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_COMMAND_R;
|
||||||
|
} else if (
|
||||||
|
tokenizer_pre == "qwen2") {
|
||||||
|
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_QWEN2;
|
||||||
|
} else if (
|
||||||
|
tokenizer_pre == "olmo") {
|
||||||
|
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_OLMO;
|
||||||
|
} else if (
|
||||||
|
tokenizer_pre == "dbrx") {
|
||||||
|
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DBRX;
|
||||||
} else {
|
} else {
|
||||||
throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str()));
|
throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str()));
|
||||||
}
|
}
|
||||||
|
@ -6126,6 +6137,7 @@ static int llama_model_load(const std::string & fname, llama_model & model, llam
|
||||||
|| !(
|
|| !(
|
||||||
model.ftype == LLAMA_FTYPE_ALL_F32 ||
|
model.ftype == LLAMA_FTYPE_ALL_F32 ||
|
||||||
model.ftype == LLAMA_FTYPE_MOSTLY_F16 ||
|
model.ftype == LLAMA_FTYPE_MOSTLY_F16 ||
|
||||||
|
model.ftype == LLAMA_FTYPE_MOSTLY_BF16 ||
|
||||||
model.ftype == LLAMA_FTYPE_MOSTLY_Q4_0 ||
|
model.ftype == LLAMA_FTYPE_MOSTLY_Q4_0 ||
|
||||||
model.ftype == LLAMA_FTYPE_MOSTLY_Q4_1
|
model.ftype == LLAMA_FTYPE_MOSTLY_Q4_1
|
||||||
)
|
)
|
||||||
|
@ -12194,6 +12206,7 @@ struct llm_tokenizer_bpe {
|
||||||
case LLAMA_VOCAB_TYPE_BPE:
|
case LLAMA_VOCAB_TYPE_BPE:
|
||||||
switch (vocab.type_pre) {
|
switch (vocab.type_pre) {
|
||||||
case LLAMA_VOCAB_PRE_TYPE_LLAMA3:
|
case LLAMA_VOCAB_PRE_TYPE_LLAMA3:
|
||||||
|
case LLAMA_VOCAB_PRE_TYPE_DBRX:
|
||||||
word_collection = unicode_regex_split(text, {
|
word_collection = unicode_regex_split(text, {
|
||||||
// original regex from tokenizer.json
|
// original regex from tokenizer.json
|
||||||
//"(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
|
//"(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
|
||||||
|
@ -12248,10 +12261,18 @@ struct llm_tokenizer_bpe {
|
||||||
});
|
});
|
||||||
break;
|
break;
|
||||||
case LLAMA_VOCAB_PRE_TYPE_GPT2:
|
case LLAMA_VOCAB_PRE_TYPE_GPT2:
|
||||||
|
case LLAMA_VOCAB_PRE_TYPE_OLMO:
|
||||||
word_collection = unicode_regex_split(text, {
|
word_collection = unicode_regex_split(text, {
|
||||||
"'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
|
"'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
|
||||||
});
|
});
|
||||||
break;
|
break;
|
||||||
|
case LLAMA_VOCAB_PRE_TYPE_QWEN2:
|
||||||
|
word_collection = unicode_regex_split(text, {
|
||||||
|
// original regex from tokenizer.json
|
||||||
|
// "(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+"
|
||||||
|
"(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
|
||||||
|
});
|
||||||
|
break;
|
||||||
default:
|
default:
|
||||||
// default regex for BPE tokenization pre-processing
|
// default regex for BPE tokenization pre-processing
|
||||||
word_collection = unicode_regex_split(text, {
|
word_collection = unicode_regex_split(text, {
|
||||||
|
@ -12467,7 +12488,7 @@ struct llm_tokenizer_wpm {
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
code = unicode_tolower(code);
|
code = unicode_tolower(code);
|
||||||
if (type == CODEPOINT_TYPE_WHITESPACE) {
|
if (type == CODEPOINT_TYPE_SEPARATOR) {
|
||||||
code = ' ';
|
code = ' ';
|
||||||
}
|
}
|
||||||
std::string s = unicode_cpt_to_utf8(code);
|
std::string s = unicode_cpt_to_utf8(code);
|
||||||
|
@ -14154,13 +14175,16 @@ static void llama_tensor_dequantize_internal(
|
||||||
if (qtype.to_float == NULL) {
|
if (qtype.to_float == NULL) {
|
||||||
throw std::runtime_error(format("type %s unsupported for integer quantization: no dequantization available", ggml_type_name(tensor->type)));
|
throw std::runtime_error(format("type %s unsupported for integer quantization: no dequantization available", ggml_type_name(tensor->type)));
|
||||||
}
|
}
|
||||||
} else if (tensor->type != GGML_TYPE_F16) {
|
} else if (tensor->type != GGML_TYPE_F16 &&
|
||||||
|
tensor->type != GGML_TYPE_BF16) {
|
||||||
throw std::runtime_error(format("cannot dequantize/convert tensor type %s", ggml_type_name(tensor->type)));
|
throw std::runtime_error(format("cannot dequantize/convert tensor type %s", ggml_type_name(tensor->type)));
|
||||||
}
|
}
|
||||||
|
|
||||||
if (nthread < 2) {
|
if (nthread < 2) {
|
||||||
if (tensor->type == GGML_TYPE_F16) {
|
if (tensor->type == GGML_TYPE_F16) {
|
||||||
ggml_fp16_to_fp32_row((ggml_fp16_t *)tensor->data, f32_output, nelements);
|
ggml_fp16_to_fp32_row((ggml_fp16_t *)tensor->data, f32_output, nelements);
|
||||||
|
} else if (tensor->type == GGML_TYPE_BF16) {
|
||||||
|
ggml_bf16_to_fp32_row((ggml_bf16_t *)tensor->data, f32_output, nelements);
|
||||||
} else if (ggml_is_quantized(tensor->type)) {
|
} else if (ggml_is_quantized(tensor->type)) {
|
||||||
qtype.to_float(tensor->data, f32_output, nelements);
|
qtype.to_float(tensor->data, f32_output, nelements);
|
||||||
} else {
|
} else {
|
||||||
|
@ -14169,7 +14193,14 @@ static void llama_tensor_dequantize_internal(
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
size_t block_size = tensor->type == GGML_TYPE_F16 ? 1 : (size_t)ggml_blck_size(tensor->type);
|
size_t block_size;
|
||||||
|
if (tensor->type == GGML_TYPE_F16 ||
|
||||||
|
tensor->type == GGML_TYPE_BF16) {
|
||||||
|
block_size = 1;
|
||||||
|
} else {
|
||||||
|
block_size = (size_t)ggml_blck_size(tensor->type);
|
||||||
|
}
|
||||||
|
|
||||||
size_t block_size_bytes = ggml_type_size(tensor->type);
|
size_t block_size_bytes = ggml_type_size(tensor->type);
|
||||||
|
|
||||||
GGML_ASSERT(nelements % block_size == 0);
|
GGML_ASSERT(nelements % block_size == 0);
|
||||||
|
@ -14188,6 +14219,8 @@ static void llama_tensor_dequantize_internal(
|
||||||
auto compute = [qtype] (ggml_type typ, uint8_t * inbuf, float * outbuf, int nels) {
|
auto compute = [qtype] (ggml_type typ, uint8_t * inbuf, float * outbuf, int nels) {
|
||||||
if (typ == GGML_TYPE_F16) {
|
if (typ == GGML_TYPE_F16) {
|
||||||
ggml_fp16_to_fp32_row((ggml_fp16_t *)inbuf, outbuf, nels);
|
ggml_fp16_to_fp32_row((ggml_fp16_t *)inbuf, outbuf, nels);
|
||||||
|
} else if (typ == GGML_TYPE_BF16) {
|
||||||
|
ggml_bf16_to_fp32_row((ggml_bf16_t *)inbuf, outbuf, nels);
|
||||||
} else {
|
} else {
|
||||||
qtype.to_float(inbuf, outbuf, nels);
|
qtype.to_float(inbuf, outbuf, nels);
|
||||||
}
|
}
|
||||||
|
@ -14548,6 +14581,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
|
||||||
case LLAMA_FTYPE_MOSTLY_Q5_1: default_type = GGML_TYPE_Q5_1; break;
|
case LLAMA_FTYPE_MOSTLY_Q5_1: default_type = GGML_TYPE_Q5_1; break;
|
||||||
case LLAMA_FTYPE_MOSTLY_Q8_0: default_type = GGML_TYPE_Q8_0; break;
|
case LLAMA_FTYPE_MOSTLY_Q8_0: default_type = GGML_TYPE_Q8_0; break;
|
||||||
case LLAMA_FTYPE_MOSTLY_F16: default_type = GGML_TYPE_F16; break;
|
case LLAMA_FTYPE_MOSTLY_F16: default_type = GGML_TYPE_F16; break;
|
||||||
|
case LLAMA_FTYPE_MOSTLY_BF16: default_type = GGML_TYPE_BF16; break;
|
||||||
case LLAMA_FTYPE_ALL_F32: default_type = GGML_TYPE_F32; break;
|
case LLAMA_FTYPE_ALL_F32: default_type = GGML_TYPE_F32; break;
|
||||||
|
|
||||||
// K-quants
|
// K-quants
|
||||||
|
@ -15485,13 +15519,6 @@ struct llama_context * llama_new_context_with_model(
|
||||||
cparams.flash_attn = false;
|
cparams.flash_attn = false;
|
||||||
}
|
}
|
||||||
|
|
||||||
#ifdef GGML_USE_HIPBLAS
|
|
||||||
if (cparams.flash_attn) {
|
|
||||||
LLAMA_LOG_WARN("%s: flash_attn is not yet compatible with HIPBLAS builds - forcing off\n", __func__);
|
|
||||||
cparams.flash_attn = false;
|
|
||||||
}
|
|
||||||
#endif
|
|
||||||
|
|
||||||
if (params.seed == LLAMA_DEFAULT_SEED) {
|
if (params.seed == LLAMA_DEFAULT_SEED) {
|
||||||
params.seed = time(NULL);
|
params.seed = time(NULL);
|
||||||
}
|
}
|
||||||
|
@ -17845,7 +17872,7 @@ struct llama_timings llama_get_timings(struct llama_context * ctx) {
|
||||||
/*.t_eval_ms =*/ 1e-3 * ctx->t_eval_us,
|
/*.t_eval_ms =*/ 1e-3 * ctx->t_eval_us,
|
||||||
|
|
||||||
/*.n_sample =*/ std::max(1, ctx->n_sample),
|
/*.n_sample =*/ std::max(1, ctx->n_sample),
|
||||||
/*.n_p_eval =*/ std::max(1, ctx->n_p_eval),
|
/*.n_p_eval =*/ std::max(0, ctx->n_p_eval),
|
||||||
/*.n_eval =*/ std::max(1, ctx->n_eval),
|
/*.n_eval =*/ std::max(1, ctx->n_eval),
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|
llama.h: 4 changed lines
|
@ -81,6 +81,9 @@ extern "C" {
|
||||||
LLAMA_VOCAB_PRE_TYPE_GPT2 = 7,
|
LLAMA_VOCAB_PRE_TYPE_GPT2 = 7,
|
||||||
LLAMA_VOCAB_PRE_TYPE_REFACT = 8,
|
LLAMA_VOCAB_PRE_TYPE_REFACT = 8,
|
||||||
LLAMA_VOCAB_PRE_TYPE_COMMAND_R = 9,
|
LLAMA_VOCAB_PRE_TYPE_COMMAND_R = 9,
|
||||||
|
LLAMA_VOCAB_PRE_TYPE_QWEN2 = 10,
|
||||||
|
LLAMA_VOCAB_PRE_TYPE_OLMO = 11,
|
||||||
|
LLAMA_VOCAB_PRE_TYPE_DBRX = 12,
|
||||||
};
|
};
|
||||||
|
|
||||||
// note: these values should be synchronized with ggml_rope
|
// note: these values should be synchronized with ggml_rope
|
||||||
|
@ -136,6 +139,7 @@ extern "C" {
|
||||||
LLAMA_FTYPE_MOSTLY_IQ2_M = 29, // except 1d tensors
|
LLAMA_FTYPE_MOSTLY_IQ2_M = 29, // except 1d tensors
|
||||||
LLAMA_FTYPE_MOSTLY_IQ4_XS = 30, // except 1d tensors
|
LLAMA_FTYPE_MOSTLY_IQ4_XS = 30, // except 1d tensors
|
||||||
LLAMA_FTYPE_MOSTLY_IQ1_M = 31, // except 1d tensors
|
LLAMA_FTYPE_MOSTLY_IQ1_M = 31, // except 1d tensors
|
||||||
|
LLAMA_FTYPE_MOSTLY_BF16 = 32, // except 1d tensors
|
||||||
|
|
||||||
LLAMA_FTYPE_GUESSED = 1024, // not specified in the model file
|
LLAMA_FTYPE_GUESSED = 1024, // not specified in the model file
|
||||||
};
|
};
|
||||||
|
|
models/ggml-vocab-qwen2.gguf: new binary file
models/ggml-vocab-qwen2.gguf.inp: new file, 106 lines
|
@ -0,0 +1,106 @@
|
||||||
|
ied 4 ½ months
|
||||||
|
__ggml_vocab_test__
|
||||||
|
Führer
|
||||||
|
__ggml_vocab_test__
|
||||||
|
|
||||||
|
__ggml_vocab_test__
|
||||||
|
|
||||||
|
__ggml_vocab_test__
|
||||||
|
|
||||||
|
__ggml_vocab_test__
|
||||||
|
|
||||||
|
__ggml_vocab_test__
|
||||||
|
|
||||||
|
__ggml_vocab_test__
|
||||||
|
|
||||||
|
|
||||||
|
__ggml_vocab_test__
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
__ggml_vocab_test__
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
__ggml_vocab_test__
|
||||||
|
|
||||||
|
|
||||||
|
__ggml_vocab_test__
|
||||||
|
Hello world
|
||||||
|
__ggml_vocab_test__
|
||||||
|
Hello world
|
||||||
|
__ggml_vocab_test__
|
||||||
|
Hello World
|
||||||
|
__ggml_vocab_test__
|
||||||
|
Hello World
|
||||||
|
__ggml_vocab_test__
|
||||||
|
Hello World!
|
||||||
|
__ggml_vocab_test__
|
||||||
|
Hello, world!
|
||||||
|
__ggml_vocab_test__
|
||||||
|
Hello, world!
|
||||||
|
__ggml_vocab_test__
|
||||||
|
this is 🦙.cpp
|
||||||
|
__ggml_vocab_test__
|
||||||
|
w048 7tuijk dsdfhu
|
||||||
|
__ggml_vocab_test__
|
||||||
|
нещо на Български
|
||||||
|
__ggml_vocab_test__
|
||||||
|
កាន់តែពិសេសអាចខលចេញ
|
||||||
|
__ggml_vocab_test__
|
||||||
|
🚀 (normal) 😶🌫️ (multiple emojis concatenated) ✅ (only emoji that has its own token)
|
||||||
|
__ggml_vocab_test__
|
||||||
|
Hello
|
||||||
|
__ggml_vocab_test__
|
||||||
|
Hello
|
||||||
|
__ggml_vocab_test__
|
||||||
|
Hello
|
||||||
|
__ggml_vocab_test__
|
||||||
|
Hello
|
||||||
|
__ggml_vocab_test__
|
||||||
|
Hello
|
||||||
|
__ggml_vocab_test__
|
||||||
|
Hello
|
||||||
|
Hello
|
||||||
|
__ggml_vocab_test__
|
||||||
|
(
|
||||||
|
__ggml_vocab_test__
|
||||||
|
|
||||||
|
=
|
||||||
|
__ggml_vocab_test__
|
||||||
|
' era
|
||||||
|
__ggml_vocab_test__
|
||||||
|
Hello, y'all! How are you 😁 ?我想在apple工作1314151天~
|
||||||
|
__ggml_vocab_test__
|
||||||
|
3
|
||||||
|
__ggml_vocab_test__
|
||||||
|
33
|
||||||
|
__ggml_vocab_test__
|
||||||
|
333
|
||||||
|
__ggml_vocab_test__
|
||||||
|
3333
|
||||||
|
__ggml_vocab_test__
|
||||||
|
33333
|
||||||
|
__ggml_vocab_test__
|
||||||
|
333333
|
||||||
|
__ggml_vocab_test__
|
||||||
|
3333333
|
||||||
|
__ggml_vocab_test__
|
||||||
|
33333333
|
||||||
|
__ggml_vocab_test__
|
||||||
|
333333333
|
||||||
|
__ggml_vocab_test__
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
🚀 (normal) 😶🌫️ (multiple emojis concatenated) ✅ 🦙🦙 3 33 333 3333 33333 333333 3333333 33333333 3.3 3..3 3...3 កាន់តែពិសេសអាច😁 ?我想在apple工作1314151天~ ------======= нещо на Български ''''''```````""""......!!!!!!?????? I've been 'told he's there, 'RE you sure? 'M not sure I'll make it, 'D you like some tea? We'Ve a'lL
|
||||||
|
__ggml_vocab_test__
|
models/ggml-vocab-qwen2.gguf.out: new file, 43 lines
|
@ -0,0 +1,43 @@
|
||||||
|
1122 220 19 220 26062 3951
|
||||||
|
37 50753 261
|
||||||
|
|
||||||
|
220
|
||||||
|
256
|
||||||
|
262
|
||||||
|
197
|
||||||
|
198
|
||||||
|
271
|
||||||
|
1406
|
||||||
|
1572
|
||||||
|
9707 1879
|
||||||
|
21927 1879
|
||||||
|
9707 4337
|
||||||
|
21927 4337
|
||||||
|
21927 4337 0
|
||||||
|
9707 11 1879 0
|
||||||
|
21927 11 1879 0
|
||||||
|
419 374 11162 99 247 13 10821
|
||||||
|
86 15 19 23 220 22 83 1963 41808 11472 2940 16739
|
||||||
|
78762 14144 1456 13073 63471 33594 3038 133178 79012
|
||||||
|
146394 97529 241 44258 233 146568 44258 224 147603 20879 115 146280 44258 223 146280 147272 97529 227 147805 148301 147270 44258 223 146848
|
||||||
|
145836 320 8252 8 26525 114 378 235 149921 30543 320 35673 99066 97534 8 25521 227 320 3243 42365 429 702 1181 1828 3950 8
|
||||||
|
9707
|
||||||
|
21927
|
||||||
|
220 21927
|
||||||
|
256 21927
|
||||||
|
262 21927
|
||||||
|
262 21927 198 262 21927
|
||||||
|
320
|
||||||
|
198 284
|
||||||
|
6 11385
|
||||||
|
9707 11 379 64848 0 2585 525 498 26525 223 937 104100 18493 22377 99257 16 18 16 19 16 20 16 35727 21216
|
||||||
|
18
|
||||||
|
18 18
|
||||||
|
18 18 18
|
||||||
|
18 18 18 18
|
||||||
|
18 18 18 18 18
|
||||||
|
18 18 18 18 18 18
|
||||||
|
18 18 18 18 18 18 18
|
||||||
|
18 18 18 18 18 18 18 18
|
||||||
|
18 18 18 18 18 18 18 18 18
|
||||||
|
198 4710 14731 65497 7847 1572 2303 78672 10947 145836 320 8252 8 26525 114 378 235 149921 30543 320 35673 99066 97534 8 25521 227 11162 99 247 149955 220 18 220 18 18 220 18 18 18 220 18 18 18 18 220 18 18 18 18 18 220 18 18 18 18 18 18 220 18 18 18 18 18 18 18 220 18 18 18 18 18 18 18 18 220 18 13 18 220 18 496 18 220 18 1112 18 220 146394 97529 241 44258 233 146568 44258 224 147603 20879 115 146280 44258 223 146280 147272 97529 227 144534 937 104100 18493 22377 99257 16 18 16 19 16 20 16 35727 21216 55460 53237 18658 14144 1456 13073 63471 33594 3038 133178 79012 3355 4605 4605 13874 13874 73594 3014 3014 28149 17085 2928 26610 7646 358 3003 1012 364 83 813 566 594 1052 11 364 787 498 2704 30 364 44 537 2704 358 3278 1281 432 11 364 35 498 1075 1045 15243 30 1205 6 42612 264 63866 43
|
pyrightconfig.json: new file, 3 lines
|
@ -0,0 +1,3 @@
|
||||||
|
{
|
||||||
|
"extraPaths": ["gguf-py"],
|
||||||
|
}
|
|
@ -1,3 +1,2 @@
|
||||||
-r ./requirements-convert.txt
|
-r ./requirements-convert.txt
|
||||||
torch~=2.1.1
|
torch~=2.1.1
|
||||||
einops~=0.7.0
|
|
||||||
|
|
|
@ -1,3 +1,2 @@
|
||||||
-r ./requirements-convert.txt
|
-r ./requirements-convert.txt
|
||||||
torch~=2.1.1
|
torch~=2.1.1
|
||||||
einops~=0.7.0
|
|
||||||
|
|
|
@ -1,5 +1,5 @@
|
||||||
numpy~=1.24.4
|
numpy~=1.24.4
|
||||||
sentencepiece~=0.1.98
|
sentencepiece~=0.2.0
|
||||||
transformers>=4.40.1,<5.0.0
|
transformers>=4.40.1,<5.0.0
|
||||||
gguf>=0.1.0
|
gguf>=0.1.0
|
||||||
protobuf>=4.21.0,<5.0.0
|
protobuf>=4.21.0,<5.0.0
|
||||||
|
|
|
@ -93,11 +93,14 @@ help_s = (
|
||||||
"specified values are averaged WITHOUT weighing by the --repetitions parameter of llama-bench."
|
"specified values are averaged WITHOUT weighing by the --repetitions parameter of llama-bench."
|
||||||
)
|
)
|
||||||
parser.add_argument("-s", "--show", help=help_s)
|
parser.add_argument("-s", "--show", help=help_s)
|
||||||
|
parser.add_argument("--verbose", action="store_true", help="increase output verbosity")
|
||||||
|
|
||||||
known_args, unknown_args = parser.parse_known_args()
|
known_args, unknown_args = parser.parse_known_args()
|
||||||
|
|
||||||
|
logging.basicConfig(level=logging.DEBUG if known_args.verbose else logging.INFO)
|
||||||
|
|
||||||
if unknown_args:
|
if unknown_args:
|
||||||
logger.error(f"Received unknown args: {unknown_args}.")
|
logger.error(f"Received unknown args: {unknown_args}.\n")
|
||||||
parser.print_help()
|
parser.print_help()
|
||||||
sys.exit(1)
|
sys.exit(1)
|
||||||
|
|
||||||
|
@ -110,7 +113,7 @@ if input_file is None:
|
||||||
input_file = sqlite_files[0]
|
input_file = sqlite_files[0]
|
||||||
|
|
||||||
if input_file is None:
|
if input_file is None:
|
||||||
logger.error("Cannot find a suitable input file, please provide one.")
|
logger.error("Cannot find a suitable input file, please provide one.\n")
|
||||||
parser.print_help()
|
parser.print_help()
|
||||||
sys.exit(1)
|
sys.exit(1)
|
||||||
|
|
||||||
|
@ -202,12 +205,12 @@ elif repo is not None:
|
||||||
hexsha8_baseline = find_parent_in_data(repo.heads.master.commit)
|
hexsha8_baseline = find_parent_in_data(repo.heads.master.commit)
|
||||||
|
|
||||||
if hexsha8_baseline is None:
|
if hexsha8_baseline is None:
|
||||||
logger.error("No baseline was provided and did not find data for any master branch commits.")
|
logger.error("No baseline was provided and did not find data for any master branch commits.\n")
|
||||||
parser.print_help()
|
parser.print_help()
|
||||||
sys.exit(1)
|
sys.exit(1)
|
||||||
else:
|
else:
|
||||||
logger.error("No baseline was provided and the current working directory "
|
logger.error("No baseline was provided and the current working directory "
|
||||||
"is not part of a git repository from which a baseline could be inferred.")
|
"is not part of a git repository from which a baseline could be inferred.\n")
|
||||||
parser.print_help()
|
parser.print_help()
|
||||||
sys.exit(1)
|
sys.exit(1)
|
||||||
|
|
||||||
|
@ -238,7 +241,7 @@ elif repo is not None:
|
||||||
break
|
break
|
||||||
|
|
||||||
if hexsha8_compare is None:
|
if hexsha8_compare is None:
|
||||||
logger.error("No compare target was provided and did not find data for any non-master commits.")
|
logger.error("No compare target was provided and did not find data for any non-master commits.\n")
|
||||||
parser.print_help()
|
parser.print_help()
|
||||||
sys.exit(1)
|
sys.exit(1)
|
||||||
else:
|
else:
|
||||||
|
@ -361,7 +364,7 @@ if "gpu_info" in show:
|
||||||
headers = [PRETTY_NAMES[p] for p in show]
|
headers = [PRETTY_NAMES[p] for p in show]
|
||||||
headers += ["Test", f"t/s {name_baseline}", f"t/s {name_compare}", "Speedup"]
|
headers += ["Test", f"t/s {name_baseline}", f"t/s {name_compare}", "Speedup"]
|
||||||
|
|
||||||
logger.info(tabulate(
|
print(tabulate( # noqa: NP100
|
||||||
table,
|
table,
|
||||||
headers=headers,
|
headers=headers,
|
||||||
floatfmt=".2f",
|
floatfmt=".2f",
|
||||||
|
|
|
@ -1,31 +1,14 @@
|
||||||
import regex
|
import regex
|
||||||
|
|
||||||
|
|
||||||
def cpt_to_utf8_str(cpt):
|
|
||||||
if cpt <= 0xFF:
|
|
||||||
return bytes([cpt, 0, 0, 0])
|
|
||||||
elif cpt <= 0xFFFF:
|
|
||||||
return bytes([cpt & 0xFF, cpt >> 8, 0, 0])
|
|
||||||
elif cpt <= 0xFFFFFF:
|
|
||||||
return bytes([cpt & 0xFF, (cpt >> 8) & 0xFF, (cpt >> 16) & 0xFF, 0])
|
|
||||||
else:
|
|
||||||
return bytes([cpt & 0xFF, (cpt >> 8) & 0xFF, (cpt >> 16) & 0xFF, cpt >> 24])
|
|
||||||
|
|
||||||
|
|
||||||
def is_match(codepoint, regex_expr):
|
|
||||||
try:
|
|
||||||
res = regex.match(regex_expr, cpt_to_utf8_str(codepoint).decode('utf-32'))
|
|
||||||
return res is not None
|
|
||||||
except Exception:
|
|
||||||
return False
|
|
||||||
|
|
||||||
|
|
||||||
def get_matches(regex_expr):
|
def get_matches(regex_expr):
|
||||||
|
regex_expr_compiled = regex.compile(regex_expr)
|
||||||
unicode_ranges = []
|
unicode_ranges = []
|
||||||
current_range = None
|
current_range = None
|
||||||
|
|
||||||
for codepoint in range(0x110000):
|
for codepoint in range(0x110000):
|
||||||
if is_match(codepoint, regex_expr):
|
char = chr(codepoint)
|
||||||
|
if regex_expr_compiled.match(char):
|
||||||
if current_range is None:
|
if current_range is None:
|
||||||
current_range = [codepoint, codepoint]
|
current_range = [codepoint, codepoint]
|
||||||
else:
|
else:
|
||||||
|
@ -40,27 +23,42 @@ def get_matches(regex_expr):
|
||||||
return unicode_ranges
|
return unicode_ranges
|
||||||
|
|
||||||
|
|
||||||
def print_cat(cat, ranges):
|
def print_cat(mode, cat, ranges):
|
||||||
|
if mode == "range":
|
||||||
print("const std::vector<std::pair<uint32_t, uint32_t>> unicode_ranges_{} = {{".format(cat)) # noqa: NP100
|
print("const std::vector<std::pair<uint32_t, uint32_t>> unicode_ranges_{} = {{".format(cat)) # noqa: NP100
|
||||||
cnt = 0
|
if mode == "map":
|
||||||
for start, end in ranges:
|
print("const std::map<uint32_t, uint32_t> unicode_map_{} = {{".format(cat)) # noqa: NP100
|
||||||
if cnt % 4 != 0:
|
for i, values in enumerate(ranges):
|
||||||
print(" ", end="") # noqa: NP100
|
end = ",\n" if (i % 4 == 3 or i + 1 == len(ranges)) else ", "
|
||||||
print("{{0x{:08X}, 0x{:08X}}},".format(start, end), end="") # noqa: NP100
|
values = ["0x%08X" % value for value in values]
|
||||||
if cnt % 4 == 3:
|
print("{" + ", ".join(values) + "}", end=end) # noqa: NP100
|
||||||
print("") # noqa: NP100
|
|
||||||
cnt += 1
|
|
||||||
|
|
||||||
if cnt % 4 != 0:
|
|
||||||
print("") # noqa: NP100
|
|
||||||
print("};") # noqa: NP100
|
print("};") # noqa: NP100
|
||||||
print("") # noqa: NP100
|
print("") # noqa: NP100
|
||||||
|
|
||||||
|
|
||||||
print_cat("number", get_matches(r'\p{N}'))
|
print_cat("range", "number", get_matches(r'\p{N}'))
|
||||||
print_cat("letter", get_matches(r'\p{L}'))
|
print_cat("range", "letter", get_matches(r'\p{L}'))
|
||||||
print_cat("whitespace", get_matches(r'\p{Z}'))
|
print_cat("range", "separator", get_matches(r'\p{Z}'))
|
||||||
print_cat("accent_mark", get_matches(r'\p{M}'))
|
print_cat("range", "accent_mark", get_matches(r'\p{M}'))
|
||||||
print_cat("punctuation", get_matches(r'\p{P}'))
|
print_cat("range", "punctuation", get_matches(r'\p{P}'))
|
||||||
print_cat("symbol", get_matches(r'\p{S}'))
|
print_cat("range", "symbol", get_matches(r'\p{S}'))
|
||||||
print_cat("control", get_matches(r'\p{C}'))
|
print_cat("range", "control", get_matches(r'\p{C}'))
|
||||||
|
|
||||||
|
print_cat("range", "whitespace", get_matches(r'\s'))
|
||||||
|
|
||||||
|
|
||||||
|
map_lowercase = []
|
||||||
|
map_uppercase = []
|
||||||
|
for codepoint in range(0x110000):
|
||||||
|
char = chr(codepoint)
|
||||||
|
lower = ord(char.lower()[0])
|
||||||
|
upper = ord(char.upper()[0])
|
||||||
|
if codepoint != lower:
|
||||||
|
map_lowercase.append((codepoint, lower))
|
||||||
|
if codepoint != upper:
|
||||||
|
map_uppercase.append((codepoint, upper))
|
||||||
|
print_cat("map", "lowercase", map_lowercase)
|
||||||
|
print_cat("map", "uppercase", map_uppercase)
|
||||||
|
|
||||||
|
|
||||||
|
# TODO: generate unicode_map_nfd
|
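The new `map_lowercase` / `map_uppercase` tables pair each codepoint with its simple case mapping, skipping codepoints that map to themselves. A small self-contained illustration of the kind of entries they end up holding (sample characters chosen arbitrarily):

```python
# A few entries of the kind map_lowercase / map_uppercase will contain:
# codepoints that change under .lower() / .upper(), paired with the result.
samples = ['A', 'Å', 'Σ', 'а']          # last one is CYRILLIC SMALL LETTER A
for ch in samples:
    cp, lower, upper = ord(ch), ord(ch.lower()[0]), ord(ch.upper()[0])
    if cp != lower:
        print(f"lowercase: (0x{cp:08X}, 0x{lower:08X})")
    if cp != upper:
        print(f"uppercase: (0x{cp:08X}, 0x{upper:08X})")
```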
||||||
|
|
sgemm.cpp: 67 changed lines
|
@ -1,6 +1,3 @@
|
||||||
// -*- mode:c++;indent-tabs-mode:nil;c-basic-offset:4;coding:utf-8 -*-
|
|
||||||
// vi: set et ft=c++ ts=4 sts=4 sw=4 fenc=utf-8 :vi
|
|
||||||
//
|
|
||||||
// Copyright 2024 Mozilla Foundation
|
// Copyright 2024 Mozilla Foundation
|
||||||
//
|
//
|
||||||
// Permission is hereby granted, free of charge, to any person obtaining
|
// Permission is hereby granted, free of charge, to any person obtaining
|
||||||
|
@ -585,11 +582,11 @@ class tinyBLAS_Q0_ARM {
|
||||||
};
|
};
|
||||||
#endif // __ARM_FEATURE_DOTPROD
|
#endif // __ARM_FEATURE_DOTPROD
|
||||||
|
|
||||||
#if defined(__AVX2__) || defined(__AVX512F__)
|
#if defined(__AVX2__) || defined(__AVX512F__) || defined(__AVX__)
|
||||||
template <typename TA, typename TB, typename TC>
|
template <typename TA, typename TB, typename TC>
|
||||||
class tinyBLAS_Q0_AVX2 {
|
class tinyBLAS_Q0_AVX {
|
||||||
public:
|
public:
|
||||||
tinyBLAS_Q0_AVX2(int64_t k,
|
tinyBLAS_Q0_AVX(int64_t k,
|
||||||
const TA *A, int64_t lda,
|
const TA *A, int64_t lda,
|
||||||
const TB *B, int64_t ldb,
|
const TB *B, int64_t ldb,
|
||||||
TC *C, int64_t ldc,
|
TC *C, int64_t ldc,
|
||||||
|
@ -728,14 +725,34 @@ class tinyBLAS_Q0_AVX2 {
|
||||||
__m256 Cv[RN][RM] = {};
|
__m256 Cv[RN][RM] = {};
|
||||||
for (int64_t l = 0; l < k; ++l)
|
for (int64_t l = 0; l < k; ++l)
|
||||||
for (int64_t j = 0; j < RN; ++j)
|
for (int64_t j = 0; j < RN; ++j)
|
||||||
for (int64_t i = 0; i < RM; ++i)
|
for (int64_t i = 0; i < RM; ++i) {
|
||||||
Cv[j][i] = madd(_mm256_set1_ps(unhalf(A[lda * (ii + i) + l].d) *
|
#if defined(__AVX2__)
|
||||||
unhalf(B[ldb * (jj + j) + l].d)),
|
__m256 udTmp = updot(_mm256_sign_epi8(load(A + lda * (ii + i) + l),
|
||||||
updot(_mm256_sign_epi8(load(A + lda * (ii + i) + l),
|
|
||||||
load(A + lda * (ii + i) + l)),
|
load(A + lda * (ii + i) + l)),
|
||||||
_mm256_sign_epi8(load(B + ldb * (jj + j) + l),
|
_mm256_sign_epi8(load(B + ldb * (jj + j) + l),
|
||||||
load(A + lda * (ii + i) + l))),
|
load(A + lda * (ii + i) + l)));
|
||||||
|
#else
|
||||||
|
__m128i ali0 = load0(A + lda * (ii + i) + l);
|
||||||
|
__m128i ali1 = load1(A + lda * (ii + i) + l);
|
||||||
|
__m128i blj0 = load0(B + ldb * (jj + j) + l);
|
||||||
|
__m128i blj1 = load1(B + ldb * (jj + j) + l);
|
||||||
|
|
||||||
|
__m128i sepAA0 = _mm_sign_epi8(ali0, ali0);
|
||||||
|
__m128i sepAA1 = _mm_sign_epi8(ali1, ali1);
|
||||||
|
__m128i sepBA0 = _mm_sign_epi8(blj0, ali0);
|
||||||
|
__m128i sepBA1 = _mm_sign_epi8(blj1, ali1);
|
||||||
|
|
||||||
|
// updot
|
||||||
|
const __m128i oneFill = _mm_set1_epi16(1);
|
||||||
|
__m128i mad0 = _mm_maddubs_epi16(sepAA0, sepBA0);
|
||||||
|
__m128i mad1 = _mm_maddubs_epi16(sepAA1, sepBA1);
|
||||||
|
__m256 udTmp = _mm256_cvtepi32_ps(MM256_SET_M128I(_mm_madd_epi16(oneFill, mad1), _mm_madd_epi16(oneFill, mad0)));
|
||||||
|
#endif
|
||||||
|
Cv[j][i] = madd(_mm256_set1_ps(unhalf(A[lda * (ii + i) + l].d) *
|
||||||
|
unhalf(B[ldb * (jj + j) + l].d)),
|
||||||
|
udTmp,
|
||||||
Cv[j][i]);
|
Cv[j][i]);
|
||||||
|
}
|
||||||
for (int64_t j = 0; j < RN; ++j)
|
for (int64_t j = 0; j < RN; ++j)
|
||||||
for (int64_t i = 0; i < RM; ++i)
|
for (int64_t i = 0; i < RM; ++i)
|
||||||
C[ldc * (jj + j) + (ii + i)] = hsum(Cv[j][i]);
|
C[ldc * (jj + j) + (ii + i)] = hsum(Cv[j][i]);
|
||||||
|
@ -746,10 +763,28 @@ class tinyBLAS_Q0_AVX2 {
|
||||||
return _mm256_loadu_si256((const __m256i *)b->qs);
|
return _mm256_loadu_si256((const __m256i *)b->qs);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
inline __m128i load0(const block_q8_0 *b) {
|
||||||
|
return _mm_loadu_si128((const __m128i *)b->qs);
|
||||||
|
}
|
||||||
|
|
||||||
|
inline __m128i load1(const block_q8_0 *b) {
|
||||||
|
return _mm_loadu_si128(((const __m128i *)b->qs) + 1);
|
||||||
|
}
|
||||||
|
|
||||||
inline __m256i load(const block_q4_0 *b) {
|
inline __m256i load(const block_q4_0 *b) {
|
||||||
return _mm256_sub_epi8(denibble(b->qs), _mm256_set1_epi8(8));
|
return _mm256_sub_epi8(denibble(b->qs), _mm256_set1_epi8(8));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
inline __m128i load0(const block_q4_0 *b) {
|
||||||
|
const __m128i x = _mm_loadu_si128((const __m128i *)(b->qs));
|
||||||
|
return _mm_sub_epi8(_mm_and_si128(_mm_set1_epi8(15), x), _mm_set1_epi8(8));
|
||||||
|
}
|
||||||
|
|
||||||
|
inline __m128i load1(const block_q4_0 *b) {
|
||||||
|
const __m128i x = _mm_loadu_si128((const __m128i *)(b->qs));
|
||||||
|
return _mm_sub_epi8(_mm_and_si128(_mm_set1_epi8(15), _mm_srli_epi16(x, 4)), _mm_set1_epi8(8));
|
||||||
|
}
|
||||||
|
|
||||||
inline __m256 updot(__m256i u, __m256i s) {
|
inline __m256 updot(__m256i u, __m256i s) {
|
||||||
__m256i res;
|
__m256i res;
|
||||||
#if defined(__AVXVNNI__) || (defined(__AVX512VNNI__) && defined(__AVX512VL__))
|
#if defined(__AVXVNNI__) || (defined(__AVX512VNNI__) && defined(__AVX512VL__))
|
||||||
|
@ -777,7 +812,7 @@ class tinyBLAS_Q0_AVX2 {
|
||||||
const int ith;
|
const int ith;
|
||||||
const int nth;
|
const int nth;
|
||||||
};
|
};
|
||||||
#endif // __AVX2__
|
#endif // __AVX__
|
||||||
|
|
||||||
} // namespace
|
} // namespace
|
||||||
|
|
||||||
|
@ -928,8 +963,8 @@ bool llamafile_sgemm(int64_t m, int64_t n, int64_t k, const void *A, int64_t lda
|
||||||
case GGML_TYPE_Q8_0: {
|
case GGML_TYPE_Q8_0: {
|
||||||
if (Btype != GGML_TYPE_Q8_0)
|
if (Btype != GGML_TYPE_Q8_0)
|
||||||
return false;
|
return false;
|
||||||
#if defined(__AVX2__) || defined(__AVX512F__)
|
#if defined(__AVX2__) || defined(__AVX512F__) || defined(__AVX__)
|
||||||
tinyBLAS_Q0_AVX2<block_q8_0, block_q8_0, float> tb{
|
tinyBLAS_Q0_AVX<block_q8_0, block_q8_0, float> tb{
|
||||||
k, (const block_q8_0 *)A, lda,
|
k, (const block_q8_0 *)A, lda,
|
||||||
(const block_q8_0 *)B, ldb,
|
(const block_q8_0 *)B, ldb,
|
||||||
(float *)C, ldc,
|
(float *)C, ldc,
|
||||||
|
@ -952,8 +987,8 @@ bool llamafile_sgemm(int64_t m, int64_t n, int64_t k, const void *A, int64_t lda
|
||||||
case GGML_TYPE_Q4_0: {
|
case GGML_TYPE_Q4_0: {
|
||||||
if (Btype != GGML_TYPE_Q8_0)
|
if (Btype != GGML_TYPE_Q8_0)
|
||||||
return false;
|
return false;
|
||||||
#if defined(__AVX2__) || defined(__AVX512F__)
|
#if defined(__AVX2__) || defined(__AVX512F__) || defined(__AVX__)
|
||||||
tinyBLAS_Q0_AVX2<block_q4_0, block_q8_0, float> tb{
|
tinyBLAS_Q0_AVX<block_q4_0, block_q8_0, float> tb{
|
||||||
k, (const block_q4_0 *)A, lda,
|
k, (const block_q4_0 *)A, lda,
|
||||||
(const block_q8_0 *)B, ldb,
|
(const block_q8_0 *)B, ldb,
|
||||||
(float *)C, ldc,
|
(float *)C, ldc,
|
||||||
|
|
|
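Editor's note: for readers following the AVX-only fallback above, both the __AVX2__ path and the new 128-bit path accumulate the same per-block quantity: an int8 dot product summed into 32-bit lanes and scaled by the two fp16 block deltas. A hedged scalar sketch of that quantity follows; the names are hypothetical and it is not part of sgemm.cpp.

// Hedged scalar reference of the per-block value the SIMD code accumulates.
// The sign/maddubs trick works because sign(a,a) = |a| and sign(b,a) carries
// a's sign onto b, so |a| * sign(b,a) == a * b element-wise.
#include <cstdint>

static float q8_block_dot_ref(const int8_t * a, const int8_t * b, float d_a, float d_b, int n = 32) {
    int32_t acc = 0;
    for (int i = 0; i < n; ++i) {
        acc += (int32_t) a[i] * (int32_t) b[i];
    }
    return d_a * d_b * (float) acc; // this fused value is what madd() adds into Cv[j][i]
}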
@@ -84,6 +84,7 @@ llama_test(test-tokenizer-0 NAME test-tokenizer-0-starcoder ARGS ${CMAKE
 llama_test(test-tokenizer-0 NAME test-tokenizer-0-gpt-2     ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-gpt-2.gguf)
 llama_test(test-tokenizer-0 NAME test-tokenizer-0-refact    ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-refact.gguf)
 llama_test(test-tokenizer-0 NAME test-tokenizer-0-command-r ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-command-r.gguf)
+llama_test(test-tokenizer-0 NAME test-tokenizer-0-qwen2     ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-qwen2.gguf)

 # build test-tokenizer-1-bpe target once and add many tests
 add_executable(test-tokenizer-1-bpe test-tokenizer-1-bpe.cpp)
@@ -50,7 +50,7 @@ static void init_tensor_uniform(ggml_tensor * tensor, float min = -1.0f, float m

     if (tensor->type == GGML_TYPE_F32 || tensor->type == GGML_TYPE_I32) {
         ggml_backend_tensor_set(tensor, data.data(), 0, size * sizeof(float));
-    } else if (ggml_is_quantized(tensor->type) || tensor->type == GGML_TYPE_F16) {
+    } else if (ggml_is_quantized(tensor->type) || tensor->type == GGML_TYPE_F16 || tensor->type == GGML_TYPE_BF16) {
         GGML_ASSERT(size % ggml_blck_size(tensor->type) == 0);
         std::vector<uint8_t> dataq(ggml_row_size(tensor->type, size));
         std::vector<float> imatrix(tensor->ne[0], 1.0f); // dummy importance matrix

@@ -92,6 +92,8 @@ static std::vector<float> tensor_to_float(const ggml_tensor * t) {
                     size_t i = i3*t->nb[3] + i2*t->nb[2] + i1*t->nb[1] + i0/bs*t->nb[0];
                     if (t->type == GGML_TYPE_F16) {
                         tv.push_back(ggml_fp16_to_fp32(*(ggml_fp16_t*)&buf[i]));
+                    } else if (t->type == GGML_TYPE_BF16) {
+                        tv.push_back(ggml_bf16_to_fp32(*(ggml_bf16_t*)&buf[i]));
                     } else if (t->type == GGML_TYPE_F32) {
                         tv.push_back(*(float *) &buf[i]);
                     } else if (t->type == GGML_TYPE_I32) {

@@ -1898,7 +1900,7 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op
     std::default_random_engine rng(0);

     const ggml_type all_types[] = {
-        GGML_TYPE_F32, GGML_TYPE_F16,
+        GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_BF16,
         GGML_TYPE_Q4_0, GGML_TYPE_Q4_1,
         GGML_TYPE_Q5_0, GGML_TYPE_Q5_1,
         GGML_TYPE_Q8_0,

@@ -2173,7 +2175,11 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op
     test_cases.emplace_back(new test_timestep_embedding());
     test_cases.emplace_back(new test_leaky_relu());

+#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+    for (int hs : { 64, 128, }) { // other head sizes not implemented
+#else
     for (int hs : { 64, 80, 128, 256, }) {
+#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
         for (int nh : { 32, }) {
             for (int kv : { 512, 1024, }) {
                 for (int nb : { 1, 2, 4, 8, }) {
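Editor's note: the BF16 branch added above relies on the fact that bfloat16 is the upper 16 bits of an IEEE-754 binary32. A minimal widening sketch follows; it assumes only the general format and is not the ggml_bf16_to_fp32 implementation itself.

// Sketch only: widen bfloat16 to float by restoring the truncated low
// mantissa bits as zeros. ggml has its own ggml_bf16_t / ggml_bf16_to_fp32.
#include <cstdint>
#include <cstring>

static float bf16_to_f32_ref(uint16_t h) {
    uint32_t bits = (uint32_t) h << 16;
    float f;
    std::memcpy(&f, &bits, sizeof(f));
    return f;
}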
@@ -2,6 +2,7 @@
 #undef NDEBUG
 #endif

+#include <cassert>
 #include <fstream>
 #include <sstream>
 #include <regex>
tests/test-tokenizer-random.py (new file, 295 lines)

@@ -0,0 +1,295 @@
# Test libllama tokenizer == AutoTokenizer.
# Brute force random tokens/text generation.
#
# Sample usage:
#
#   python3 tests/test-tokenizer-random.py ./models/ggml-vocab-llama-bpe.gguf ./models/tokenizers/llama-bpe
#

import time
import logging
import argparse
import subprocess
import random

from typing import Iterator

import cffi
from transformers import AutoTokenizer, PreTrainedTokenizerBase

logger = logging.getLogger("test-tokenizer-random-bpe")


class LibLlama:

    DEFAULT_PATH_LLAMA_H = "./llama.h"
    DEFAULT_PATH_LIBLLAMA = "./build/libllama.so"  # CMakeLists.txt: BUILD_SHARED_LIBS ON

    def __init__(self, path_llama_h: str = None, path_libllama: str = None):
        path_llama_h = path_llama_h or self.DEFAULT_PATH_LLAMA_H
        path_libllama = path_libllama or self.DEFAULT_PATH_LIBLLAMA
        (self.ffi, self.lib) = self._load_libllama_cffi(path_llama_h, path_libllama)
        self.lib.llama_backend_init()

    def _load_libllama_cffi(self, path_llama_h: str, path_libllama: str):
        cmd = ["gcc", "-E", "-P", "-D__restrict=", "-D__attribute__(x)=", "-D__asm__(x)=", path_llama_h]
        res = subprocess.run(cmd, stdout=subprocess.PIPE)
        assert (res.returncode == 0)
        source = res.stdout.decode()
        ffi = cffi.FFI()
        if True:  # workarounds for pycparser
            source = "typedef struct { } __builtin_va_list;" + "\n" + source
            source = source.replace("sizeof (int)", str(ffi.sizeof("int")))
            source = source.replace("sizeof (void *)", str(ffi.sizeof("void*")))
            source = source.replace("sizeof (size_t)", str(ffi.sizeof("size_t")))
            source = source.replace("sizeof(int32_t)", str(ffi.sizeof("int32_t")))
        ffi.cdef(source, override=True)
        lib = ffi.dlopen(path_libllama)
        return (ffi, lib)

    def model_default_params(self, **kwargs):
        mparams = self.lib.llama_model_default_params()
        for k, v in kwargs.items():
            setattr(mparams, k, v)
        return mparams

    def context_default_params(self, **kwargs):
        cparams = self.lib.llama_context_default_params()
        for k, v in kwargs.items():
            setattr(cparams, k, v)
        return cparams


class LibLlamaModel:

    def __init__(self, libllama: LibLlama, path_model: str, mparams={}, cparams={}):
        self.lib = libllama.lib
        self.ffi = libllama.ffi
        if isinstance(mparams, dict):
            mparams = libllama.model_default_params(**mparams)
        self.model = self.lib.llama_load_model_from_file(path_model.encode(), mparams)
        if not self.model:
            raise RuntimeError("error: failed to load model '%s'" % path_model)
        if isinstance(cparams, dict):
            cparams = libllama.context_default_params(**cparams)
        self.ctx = self.lib.llama_new_context_with_model(self.model, cparams)
        if not self.ctx:
            raise RuntimeError("error: failed to create context for model '%s'" % path_model)
        n_tokens_max = self.lib.llama_n_ctx(self.ctx)
        self.token_ids = self.ffi.new("llama_token[]", n_tokens_max)

    def free(self):
        if self.ctx:
            self.lib.llama_free(self.ctx)
        if self.model:
            self.lib.llama_free_model(self.model)
        self.ctx = None
        self.model = None
        self.lib = None

    def tokenize(self, text: str, n_tokens_max: int = 0, add_special: bool = False, parse_special: bool = False) -> list[int]:
        n_tokens_max = n_tokens_max if n_tokens_max > 0 else len(self.token_ids)
        text = text.encode("utf-8")
        num = self.lib.llama_tokenize(self.model, text, len(text), self.token_ids, n_tokens_max, add_special, parse_special)
        if num < 0:
            return []
        return list(self.token_ids[0:num])


def generator_custom_text() -> Iterator[str]:
    """General tests"""
    yield from [
        "",
        " ",
        "  ",
        "   ",
        "\t",
        "\n",
        "\n\n",
        "\n\n\n",
        "\t\n",
        "Hello world",
        " Hello world",
        "Hello World",
        " Hello World",
        " Hello World!",
        "Hello, world!",
        " Hello, world!",
        " this is 🦙.cpp",
        "w048 7tuijk dsdfhu",
        "нещо на Български",
        "កាន់តែពិសេសអាចខលចេញ",
        "🚀 (normal) 😶🌫️ (multiple emojis concatenated) ✅ (only emoji that has its own token)",
        "Hello",
        " Hello",
        "  Hello",
        "   Hello",
        "    Hello",
        "    Hello\n    Hello",
        " (",
        "\n =",
        "' era",
        "Hello, y'all! How are you 😁 ?我想在apple工作1314151天~",
        "3",
        "33",
        "333",
        "3333",
        "33333",
        "333333",
        "3333333",
        "33333333",
        "333333333",
    ]


def generator_custom_text_edge_cases() -> Iterator[str]:
    """Edge cases found while debugging"""
    yield from [
        '\x1f-a',    # unicode_ranges_control, {0x00001C, 0x00001F}
        '¼-a',       # unicode_ranges_digit, 0x00BC
        '½-a',       # unicode_ranges_digit, 0x00BD
        '¾-a',       # unicode_ranges_digit, 0x00BE
        'a 〇b',     # unicode_ranges_digit, 0x3007
        'Ⅵ-a',      # unicode_ranges_digit, {0x00002150, 0x0000218F} // Number Forms
        '\uFEFF//',  # unicode_ranges_control, 0xFEFF (BOM)
        '<s>a'       # TODO: Phi-3 fail
    ]


def generator_random_chars(iterations = 100) -> Iterator[str]:
    """Brute force random text with simple characters"""

    WHITESPACES = list(" " * 20 + "\n" * 5 + "\r\n" * 5 + "\t" * 5)
    CHARS = list(set("""
        ABCDEFGHIJKLMNOPQRSTUVWXYZ
        abcdefghijklmnopqrstuvwxyz
        ÁÉÍÓÚÀÈÌÒÙÂÊÎÔÛÄËÏÖÜ
        áéíóúàèìòùâêîôûäëïöü
        .-,*/-+ª!"·$%&/()=?¿[]{}<>\\|@#~½¬~;:_
    """))

    rand = random.Random()
    for m in range(iterations):
        rand.seed(m)
        text = []
        num_words = rand.randint(300, 400)
        for i in range(num_words):
            k = rand.randint(1, 7)
            word = rand.choices(CHARS, k=k)
            space = rand.choice(WHITESPACES)
            text.append("".join(word) + space)
        yield "".join(text)


def generator_random_vocab_chars(tokenizer: PreTrainedTokenizerBase, iterations = 100) -> Iterator[str]:
    """Brute force random text with vocab characters"""

    vocab_ids = list(tokenizer.vocab.values())
    vocab_text = tokenizer.decode(vocab_ids, skip_special_tokens=True)
    vocab_chars = list(set(vocab_text))
    del vocab_ids, vocab_text

    rand = random.Random()
    for m in range(iterations):
        rand.seed(m)
        text = rand.choices(vocab_chars, k=1024)
        yield "".join(text)


def generator_random_vocab_tokens(tokenizer: PreTrainedTokenizerBase, iterations = 100) -> Iterator[str]:
    """Brute force random text from vocab tokens"""

    space_id = tokenizer.encode(" ", add_special_tokens=False)[0]
    vocab_ids = list(tokenizer.vocab.values())
    vocab_ids = list(sorted(vocab_ids + vocab_ids))
    for i in range(1, len(vocab_ids), 2):
        vocab_ids[i] = space_id
    vocab_tokens = tokenizer.decode(vocab_ids, skip_special_tokens=True)
    vocab_tokens = vocab_tokens.split(" ")
    del vocab_ids

    yield from vocab_tokens

    rand = random.Random()
    for m in range(iterations):
        rand.seed(m)
        text = []
        num_words = rand.randint(300, 400)
        for i in range(num_words):
            k = rand.randint(1, 3)
            tokens = rand.choices(vocab_tokens, k=k)
            tokens = [t.strip(" \n\r\t") for t in tokens]
            sep = rand.choice(" \n\r\t")
            text.append("".join(tokens) + sep)
        yield "".join(text)


def generator_random_bytes(iterations = 100) -> Iterator[str]:
    """Brute force random bytes"""

    WHITESPACES = list(" " * 20 + "\n" * 5 + "\r\n" * 5 + "\t" * 5)

    rand = random.Random()
    for m in range(iterations):
        rand.seed(m)
        text = []
        num_words = rand.randint(300, 400)
        for i in range(num_words):
            k = rand.randint(1, 8)
            word = [chr(r) for r in rand.randbytes(k) if r]
            word.append(rand.choice(WHITESPACES))
            text.append("".join(word))
        yield "".join(text)


def test_compare_tokenizer(model: LibLlamaModel, tokenizer: PreTrainedTokenizerBase, generator: Iterator[str]):

    def find_first_mismatch(ids1: list[int], ids2: list[int]):
        for i, (a, b) in enumerate(zip(ids1, ids2)):
            if a != b:
                return i
        if len(ids1) == len(ids2):
            return -1
        return min(len(ids1), len(ids2))

    t0 = time.perf_counter()
    logger.info("%s: %s" % (generator.__name__, "ini"))
    for text in generator:
        ids1 = model.tokenize(text, add_special=False, parse_special=False)
        ids2 = tokenizer.encode(text, add_special_tokens=False)
        if ids1 != ids2:
            i = find_first_mismatch(ids1, ids2)
            ids1 = list(ids1)[max(0, i - 2) : i + 2 + 1]
            ids2 = list(ids2)[max(0, i - 2) : i + 2 + 1]
            text2 = tokenizer.decode(ids2, skip_special_tokens=True)
            assert (text2 in text)
            logger.info(" Text:     " + repr(text2))
            logger.info(" TokenIDs: " + str(ids1))
            logger.info(" Expected: " + str(ids2))
            raise Exception()
    t1 = time.perf_counter()
    logger.info("%s: end, time: %.3f secs" % (generator.__name__, t1 - t0))


if __name__ == "__main__":

    parser = argparse.ArgumentParser()
    parser.add_argument("vocab_file", help="path to vocab 'gguf' file")
    parser.add_argument("dir_tokenizer", help="directory containing 'tokenizer.model' file")
    parser.add_argument("--verbose", action="store_true", help="increase output verbosity")
    args = parser.parse_args()

    logging.basicConfig(level=logging.DEBUG if args.verbose else logging.INFO)

    model = LibLlamaModel(LibLlama(), args.vocab_file, mparams=dict(vocab_only=True), cparams=dict(n_ctx=2048))

    tokenizer = AutoTokenizer.from_pretrained(args.dir_tokenizer)

    test_compare_tokenizer(model, tokenizer, generator_custom_text())
    test_compare_tokenizer(model, tokenizer, generator_custom_text_edge_cases())
    test_compare_tokenizer(model, tokenizer, generator_random_chars(10_000))
    test_compare_tokenizer(model, tokenizer, generator_random_vocab_chars(tokenizer, 10_000))
    test_compare_tokenizer(model, tokenizer, generator_random_vocab_tokens(tokenizer, 10_000))
    # test_compare_tokenizer(model, tokenizer, generator_random_bytes(10_000))  # FAIL

    model.free()
unicode-data.cpp (1262 changed lines)

@@ -7,6 +7,7 @@
 extern const std::vector<std::pair<uint32_t, uint32_t>> unicode_ranges_number;
 extern const std::vector<std::pair<uint32_t, uint32_t>> unicode_ranges_letter;
+extern const std::vector<std::pair<uint32_t, uint32_t>> unicode_ranges_separator;
 extern const std::vector<std::pair<uint32_t, uint32_t>> unicode_ranges_whitespace;
 extern const std::vector<std::pair<uint32_t, uint32_t>> unicode_ranges_accent_mark;
 extern const std::vector<std::pair<uint32_t, uint32_t>> unicode_ranges_punctuation;
unicode.cpp (368 changed lines)

@@ -9,6 +9,7 @@
 #include <stdexcept>
 #include <string>
 #include <unordered_map>
+#include <unordered_set>
 #include <utility>
 #include <vector>
 #include <locale>

@@ -111,27 +112,27 @@ static uint32_t unicode_cpt_from_utf8(const std::string & utf8, size_t & offset)
 static std::unordered_map<uint32_t, int> unicode_cpt_type_map() {
     std::unordered_map<uint32_t, int> cpt_types;
     for (auto p : unicode_ranges_number) {
-        for (auto i = p.first; i <= p.second; ++ i) {
+        for (auto i = p.first; i <= p.second; ++i) {
             cpt_types[i] = CODEPOINT_TYPE_NUMBER;
         }
     }
     for (auto p : unicode_ranges_letter) {
-        for (auto i = p.first; i <= p.second; ++ i) {
+        for (auto i = p.first; i <= p.second; ++i) {
             cpt_types[i] = CODEPOINT_TYPE_LETTER;
         }
     }
-    for (auto p : unicode_ranges_whitespace) {
-        for (auto i = p.first; i <= p.second; ++ i) {
-            cpt_types[i] = CODEPOINT_TYPE_WHITESPACE;
+    for (auto p : unicode_ranges_separator) {
+        for (auto i = p.first; i <= p.second; ++i) {
+            cpt_types[i] = CODEPOINT_TYPE_SEPARATOR;
         }
     }
     for (auto p : unicode_ranges_accent_mark) {
-        for (auto i = p.first; i <= p.second; ++ i) {
+        for (auto i = p.first; i <= p.second; ++i) {
             cpt_types[i] = CODEPOINT_TYPE_ACCENT_MARK;
         }
     }
     for (auto p : unicode_ranges_punctuation) {
-        for (auto i = p.first; i <= p.second; ++ i) {
+        for (auto i = p.first; i <= p.second; ++i) {
             cpt_types[i] = CODEPOINT_TYPE_PUNCTUATION;
         }
     }

@@ -141,7 +142,7 @@ static std::unordered_map<uint32_t, int> unicode_cpt_type_map() {
         }
     }
     for (auto p : unicode_ranges_control) {
-        for (auto i = p.first; i <= p.second; ++ i) {
+        for (auto i = p.first; i <= p.second; ++i) {
             cpt_types[i] = CODEPOINT_TYPE_CONTROL;
         }
     }

@@ -224,138 +225,256 @@ static std::vector<size_t> unicode_regex_split_custom_gpt2(const std::string & t
     std::vector<size_t> bpe_offsets; // store the offset of each word
     bpe_offsets.reserve(offsets.size()); // Reserve memory for the approximate size

-    size_t start = 0;
-
-    const auto cpts = unicode_cpts_from_utf8(text);
-
-    for (auto offset : offsets) {
-        std::string token;
-
-        bool collecting_numeric = false;
-        bool collecting_letter = false;
-        bool collecting_special = false;
-        bool collecting_whitespace_lookahead = false;
-        bool collecting = false;
-
-        std::vector<std::string> text_utf;
-        text_utf.reserve(offset);
-
-        for (size_t i = start; i < start + offset; ++i) {
-            text_utf.emplace_back(unicode_cpt_to_utf8(cpts[i]));
-        }
-
-        for (int i = 0; i < (int)text_utf.size(); i++) {
-            const std::string & utf_char = text_utf[i];
-            bool split_condition = false;
-            int bytes_remain = text_utf.size() - i;
-
-            // forward backward lookups
-            const std::string & utf_char_next = (i + 1 < (int)text_utf.size()) ? text_utf[i + 1] : "";
-            const std::string & utf_char_next_next = (i + 2 < (int)text_utf.size()) ? text_utf[i + 2] : "";
-
-            // handling contractions
-            if (!split_condition && bytes_remain >= 2) {
-                // 's|'t|'m|'d
-                if (utf_char == "\'" && (utf_char_next == "s" || utf_char_next == "t" || utf_char_next == "m" || utf_char_next == "d")) {
-                    split_condition = true;
-                }
-                if (split_condition) {
-                    if (token.size()) {
-                        bpe_offsets.emplace_back(unicode_cpts_from_utf8(token).size());
-                    }
-                    token = utf_char + utf_char_next;
-                    bpe_offsets.emplace_back(unicode_cpts_from_utf8(token).size());
-                    token = "";
-                    i++;
-                    continue;
-                }
-            }
-            if (!split_condition && bytes_remain >= 3) {
-                // 're|'ve|'ll
-                if (utf_char == "\'" && (
-                    (utf_char_next == "r" && utf_char_next_next == "e") ||
-                    (utf_char_next == "v" && utf_char_next_next == "e") ||
-                    (utf_char_next == "l" && utf_char_next_next == "l"))
-                    ) {
-                    split_condition = true;
-                }
-                if (split_condition) {
-                    // current token + next token can be defined
-                    if (token.size()) {
-                        bpe_offsets.emplace_back(unicode_cpts_from_utf8(token).size());
-                    }
-                    token = utf_char;
-                    token += utf_char_next;
-                    token += utf_char_next_next;
-
-                    bpe_offsets.emplace_back(unicode_cpts_from_utf8(token).size());
-                    token = "";
-                    i += 2;
-                    continue;
-                }
-            }
-
-            if (!split_condition && !collecting) {
-                if (unicode_cpt_type(utf_char) == CODEPOINT_TYPE_LETTER || (token.empty() && utf_char == " " && unicode_cpt_type(utf_char_next) == CODEPOINT_TYPE_LETTER)) {
-                    collecting_letter = true;
-                    collecting = true;
-                }
-                else if (unicode_cpt_type(utf_char) == CODEPOINT_TYPE_NUMBER || (token.empty() && utf_char == " " && unicode_cpt_type(utf_char_next) == CODEPOINT_TYPE_NUMBER)) {
-                    collecting_numeric = true;
-                    collecting = true;
-                }
-                else if (
-                    ((unicode_cpt_type(utf_char) != CODEPOINT_TYPE_LETTER && unicode_cpt_type(utf_char) != CODEPOINT_TYPE_NUMBER) && (unicode_cpt_type(utf_char) != CODEPOINT_TYPE_WHITESPACE)) ||
-                    (token.empty() && utf_char == " " && unicode_cpt_type(utf_char_next) != CODEPOINT_TYPE_LETTER && unicode_cpt_type(utf_char_next) != CODEPOINT_TYPE_NUMBER && unicode_cpt_type(utf_char_next) != CODEPOINT_TYPE_WHITESPACE)
-                    ) {
-                    collecting_special = true;
-                    collecting = true;
-                }
-                else if (unicode_cpt_type(utf_char) == CODEPOINT_TYPE_WHITESPACE && unicode_cpt_type(utf_char_next) == CODEPOINT_TYPE_WHITESPACE) {
-                    collecting_whitespace_lookahead = true;
-                    collecting = true;
-                }
-                else if (unicode_cpt_type(utf_char) == CODEPOINT_TYPE_WHITESPACE) {
-                    split_condition = true;
-                }
-            }
-            else if (!split_condition && collecting) {
-                if (collecting_letter && unicode_cpt_type(utf_char) != CODEPOINT_TYPE_LETTER) {
-                    split_condition = true;
-                }
-                else if (collecting_numeric && unicode_cpt_type(utf_char) != CODEPOINT_TYPE_NUMBER) {
-                    split_condition = true;
-                }
-                else if (collecting_special && (unicode_cpt_type(utf_char) == CODEPOINT_TYPE_LETTER || unicode_cpt_type(utf_char) == CODEPOINT_TYPE_NUMBER || unicode_cpt_type(utf_char) == CODEPOINT_TYPE_WHITESPACE)) {
-                    split_condition = true;
-                }
-                else if (collecting_whitespace_lookahead && (unicode_cpt_type(utf_char_next) == CODEPOINT_TYPE_LETTER || unicode_cpt_type(utf_char_next) == CODEPOINT_TYPE_NUMBER)) {
-                    split_condition = true;
-                }
-            }
-
-            if (utf_char_next == "") {
-                split_condition = true; // final
-                token += utf_char;
-            }
-
-            if (split_condition) {
-                if (token.size()) {
-                    bpe_offsets.emplace_back(unicode_cpts_from_utf8(token).size());
-                }
-                token = utf_char;
-                collecting = false;
-                collecting_letter = false;
-                collecting_numeric = false;
-                collecting_special = false;
-                collecting_whitespace_lookahead = false;
-            }
-            else {
-                token += utf_char;
-            }
-        }
-
-        start += offset;
-    }
+    const auto cpts = unicode_cpts_from_utf8(text);
+
+    size_t start = 0;
+    for (auto offset : offsets) {
+        const size_t offset_ini = start;
+        const size_t offset_end = start + offset;
+        assert(offset_end <= cpts.size());
+        start = offset_end;
+
+        auto _get_cpt = [&] (const size_t pos) -> char32_t {
+            return (offset_ini <= pos && pos < offset_end) ? cpts[pos] : 0;
+        };
+
+        auto _get_cpt_type = [&] (const size_t pos) -> int {
+            return (offset_ini <= pos && pos < offset_end) ? unicode_cpt_type(cpts[pos]) : CODEPOINT_TYPE_UNIDENTIFIED;
+        };
+
+        size_t _prev_end = offset_ini;
+        auto _add_token = [&] (const size_t end) -> size_t {
+            assert(_prev_end <= end && end <= offset_end);
+            size_t len = end - _prev_end;
+            if (len > 0) {
+                bpe_offsets.push_back(len);
+            }
+            _prev_end = end;
+            //if (len > 0) {
+            //    std::string s = "";
+            //    for(size_t p = end-len; p < end; p++)
+            //        s += unicode_cpt_to_utf8(cpts[p]);
+            //    printf(">>> '%s'\n", s.c_str());
+            //}
+            return len;
+        };
+
+        for (size_t pos = offset_ini; pos < offset_end; /*pos++*/ ) {
+            const char32_t cpt = _get_cpt(pos);
+            const int cpt_type = _get_cpt_type(pos);
+
+            // regex: 's|'t|'re|'ve|'m|'ll|'d
+            if (cpt == '\'' && pos+1 < offset_end) {
+                char32_t cpt_next = _get_cpt(pos+1);
+                if (cpt_next == 's' || cpt_next == 't' || cpt_next == 'm' || cpt_next == 'd') {
+                    pos += _add_token(pos+2);
+                    continue;
+                }
+                if (pos+2 < offset_end) {
+                    char32_t cpt_next_next = _get_cpt(pos+2);
+                    if ((cpt_next == 'r' && cpt_next_next == 'e') ||
+                        (cpt_next == 'v' && cpt_next_next == 'e') ||
+                        (cpt_next == 'l' && cpt_next_next == 'l')) {
+                        pos += _add_token(pos+3);
+                        continue;
+                    }
+                }
+            }
+
+            char32_t cpt2 = (cpt == ' ' ? _get_cpt(pos+1) : cpt);
+            int cpt2_type = (cpt == ' ' ? _get_cpt_type(pos+1) : cpt_type);
+            // regex: <space>?\p{L}+
+            if (cpt2_type == CODEPOINT_TYPE_LETTER) {
+                pos += (cpt == ' ');
+                while (cpt2_type == CODEPOINT_TYPE_LETTER) {
+                    cpt2_type = _get_cpt_type(++pos);
+                }
+                _add_token(pos);
+                continue;
+            }
+            // regex: <space>?\p{N}+
+            if (cpt2_type == CODEPOINT_TYPE_NUMBER) {
+                pos += (cpt == ' ');
+                while (cpt2_type == CODEPOINT_TYPE_NUMBER) {
+                    cpt2_type = _get_cpt_type(++pos);
+                }
+                _add_token(pos);
+                continue;
+            }
+            // regex: <space>?[^\s\p{L}\p{N}]+
+            if (!unicode_cpt_is_whitespace(cpt2) && cpt2_type != CODEPOINT_TYPE_LETTER && cpt2_type != CODEPOINT_TYPE_NUMBER && cpt2_type != CODEPOINT_TYPE_UNIDENTIFIED) {
+                pos += (cpt == ' ');
+                while (!unicode_cpt_is_whitespace(cpt2) && cpt2_type != CODEPOINT_TYPE_LETTER && cpt2_type != CODEPOINT_TYPE_NUMBER && cpt2_type != CODEPOINT_TYPE_UNIDENTIFIED) {
+                    cpt2_type = _get_cpt_type(++pos);
+                    cpt2 = _get_cpt(pos);
+                }
+                _add_token(pos);
+                continue;
+            }
+
+            size_t num_whitespaces = 0;
+            while (unicode_cpt_is_whitespace(_get_cpt(pos+num_whitespaces))) {
+                num_whitespaces++;
+            }
+
+            // regex: \s+(?!\S)
+            if (num_whitespaces > 1 && _get_cpt(pos+num_whitespaces) != 0) {
+                pos += num_whitespaces - 1;
+                _add_token(pos);
+                continue;
+            }
+
+            // regex: \s+
+            if (num_whitespaces > 0) {
+                pos += num_whitespaces;
+                _add_token(pos);
+                continue;
+            }
+
+            // no matches
+            _add_token(++pos);
+        }
+    }
+
+    return bpe_offsets;
+}
+
+// LLAMA3 system regex: "(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+"
+static std::vector<size_t> unicode_regex_split_custom_llama3(const std::string & text, const std::vector<size_t> & offsets) {
+    std::vector<size_t> bpe_offsets; // store the offset of each word
+    bpe_offsets.reserve(offsets.size()); // Reserve memory for the approximate size
+
+    const auto cpts = unicode_cpts_from_utf8(text);
+
+    size_t start = 0;
+    for (auto offset : offsets) {
+        const size_t offset_ini = start;
+        const size_t offset_end = start + offset;
+        assert(offset_end <= cpts.size());
+        start = offset_end;
+
+        auto _get_cpt = [&] (const size_t pos) -> char32_t {
+            return (offset_ini <= pos && pos < offset_end) ? cpts[pos] : 0;
+        };
+
+        auto _get_cpt_type = [&] (const size_t pos) -> int {
+            return (offset_ini <= pos && pos < offset_end) ? unicode_cpt_type(cpts[pos]) : CODEPOINT_TYPE_UNIDENTIFIED;
+        };
+
+        size_t _prev_end = offset_ini;
+        auto _add_token = [&] (const size_t end) -> size_t {
+            assert(_prev_end <= end && end <= offset_end);
+            size_t len = end - _prev_end;
+            if (len > 0) {
+                bpe_offsets.push_back(len);
+            }
+            _prev_end = end;
+            //if (len > 0) {
+            //    std::string s = "";
+            //    for(size_t p = end-len; p < end; p++)
+            //        s += unicode_cpt_to_utf8(cpts[p]);
+            //    printf(">>> '%s'\n", s.c_str());
+            //}
+            return len;
+        };
+
+        for (size_t pos = offset_ini; pos < offset_end; /*pos++*/ ) {
+            const char32_t cpt = _get_cpt(pos);
+            const int cpt_type = _get_cpt_type(pos);
+
+            // regex: (?i:'s|'t|'re|'ve|'m|'ll|'d) // case insensitive
+            if (cpt == '\'' && pos+1 < offset_end) {
+                char32_t cpt_next = unicode_tolower(_get_cpt(pos+1));
+                if (cpt_next == 's' || cpt_next == 't' || cpt_next == 'm' || cpt_next == 'd') {
+                    pos += _add_token(pos+2);
+                    continue;
+                }
+                if (pos+2 < offset_end) {
+                    char32_t cpt_next_next = unicode_tolower(_get_cpt(pos+2));
+                    if ((cpt_next == 'r' && cpt_next_next == 'e') ||
+                        (cpt_next == 'v' && cpt_next_next == 'e') ||
+                        (cpt_next == 'l' && cpt_next_next == 'l')) {
+                        pos += _add_token(pos+3);
+                        continue;
+                    }
+                }
+            }
+
+            // regex: [^\r\n\p{L}\p{N}]?\p{L}+ //####FIXME: the first \p{L} is correct?
+            if (cpt != '\r' && cpt != '\n' && /*cpt_type != CODEPOINT_TYPE_LETTER &&*/ cpt_type != CODEPOINT_TYPE_NUMBER) {
+                if (cpt_type == CODEPOINT_TYPE_LETTER || _get_cpt_type(pos+1) == CODEPOINT_TYPE_LETTER) {  // one or more letters
+                    pos++;
+                    while (_get_cpt_type(pos) == CODEPOINT_TYPE_LETTER) {
+                        pos++;
+                    }
+                    _add_token(pos);
+                    continue;
+                }
+            }
+
+            // regex: \p{N}{1,3}
+            if (cpt_type == CODEPOINT_TYPE_NUMBER) {
+                size_t ini = pos;
+                while (_get_cpt_type(pos) == CODEPOINT_TYPE_NUMBER) {
+                    if (++pos - ini >= 3 ) {
+                        _add_token(pos);
+                        ini = pos;
+                    }
+                }
+                _add_token(pos);
+                continue;
+            }
+
+            // regex: <space>?[^\s\p{L}\p{N}]+[\r\n]*
+            char32_t cpt2 = (cpt == ' ' ? _get_cpt(pos+1) : cpt);
+            int cpt2_type = (cpt == ' ' ? _get_cpt_type(pos+1) : cpt_type);
+            if (!unicode_cpt_is_whitespace(cpt2) && cpt2_type != CODEPOINT_TYPE_LETTER && cpt2_type != CODEPOINT_TYPE_NUMBER && cpt2_type != CODEPOINT_TYPE_UNIDENTIFIED) {
+                pos += (cpt == ' ');
+                while (!unicode_cpt_is_whitespace(cpt2) && cpt2_type != CODEPOINT_TYPE_LETTER && cpt2_type != CODEPOINT_TYPE_NUMBER && cpt2_type != CODEPOINT_TYPE_UNIDENTIFIED) {
+                    cpt2_type = _get_cpt_type(++pos);
+                    cpt2 = _get_cpt(pos);
+                }
+                while (cpt2 == '\r' || cpt2 == '\n') {
+                    cpt2 = _get_cpt(++pos);
+                }
+                _add_token(pos);
+                continue;
+            }
+
+            size_t num_whitespaces = 0;
+            size_t last_end_r_or_n = 0;
+            while (unicode_cpt_is_whitespace(_get_cpt(pos+num_whitespaces))) {
+                char32_t cpt2 = _get_cpt(pos+num_whitespaces);
+                if (cpt2 == '\r' || cpt2 == '\n') {
+                    last_end_r_or_n = pos + num_whitespaces + 1;
+                }
+                num_whitespaces++;
+            }
+
+            // regex: \s*[\r\n]+
+            if (last_end_r_or_n > 0) {
+                pos = last_end_r_or_n;
+                _add_token(pos);
+                continue;
+            }
+
+            // regex: \s+(?!\S)
+            if (num_whitespaces > 1 && _get_cpt(pos+num_whitespaces) != 0) {
+                pos += num_whitespaces - 1;
+                _add_token(pos);
+                continue;
+            }
+
+            // regex: \s+
+            if (num_whitespaces > 0) {
+                pos += num_whitespaces;
+                _add_token(pos);
+                continue;
+            }
+
+            // no matches
+            _add_token(++pos);
+        }
+    }

     return bpe_offsets;
 }

@@ -424,14 +543,14 @@ static std::vector<size_t> unicode_regex_split_stl(const std::string & text, con
 static std::vector<size_t> unicode_regex_split_custom(const std::string & text, const std::string & regex_expr, const std::vector<size_t> & offsets) {
     std::vector<size_t> bpe_offsets;

-    (void)(text);
-    (void)(regex_expr);
-    (void)(offsets);
-    // TODO: this implementation is actually wrong, uncomment and run:
-    //       make -j && ./bin/test-tokenizer-0 ../models/ggml-vocab-gpt-2.gguf
-    //if (regex_expr == "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)") {
-    //    bpe_offsets = unicode_regex_split_custom_gpt2(text, offsets);
-    //}
+    if (regex_expr == "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)") {
+        bpe_offsets = unicode_regex_split_custom_gpt2(text, offsets);
+    } else if (
+            regex_expr == "(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+" ||
+            regex_expr == "(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+") {
+        bpe_offsets = unicode_regex_split_custom_llama3(text, offsets);
+    }

     return bpe_offsets;
 }

@@ -506,6 +625,19 @@ int unicode_cpt_type(const std::string & utf8) {
     return unicode_cpt_type(unicode_cpt_from_utf8(utf8, offset));
 }

+bool unicode_cpt_is_whitespace(uint32_t cp) {
+    static const std::unordered_set<uint32_t> is_whitespace = [] {
+        std::unordered_set<uint32_t> is_whitespace;
+        for (auto p : unicode_ranges_whitespace) {
+            for (auto i = p.first; i <= p.second; ++i) {
+                is_whitespace.insert(i);
+            }
+        }
+        return is_whitespace;
+    }();
+    return (bool) is_whitespace.count(cp);
+}
+
 std::string unicode_byte_to_utf8(uint8_t byte) {
     static std::unordered_map<uint8_t, std::string> map = unicode_byte_to_utf8_map();
     return map.at(byte);
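Editor's note: one behavioural detail of the llama3 splitter above that is easy to miss is the \p{N}{1,3} rule: digit runs are emitted in chunks of at most three codepoints, which is why the tokenizer tests include strings like "3333333". The standalone sketch below reproduces just that chunking, operating on plain ASCII instead of the codepoint buffer; it is illustrative only.

// Illustrative sketch of the \p{N}{1,3} chunking used by the llama3 splitter.
#include <cctype>
#include <string>
#include <vector>

static std::vector<std::string> split_digit_run(const std::string & s) {
    std::vector<std::string> out;
    size_t pos = 0;
    while (pos < s.size() && std::isdigit((unsigned char) s[pos])) {
        size_t ini = pos;
        while (pos < s.size() && std::isdigit((unsigned char) s[pos]) && pos - ini < 3) {
            ++pos;
        }
        out.push_back(s.substr(ini, pos - ini)); // "3333333" -> "333", "333", "3"
    }
    return out;
}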
@@ -7,7 +7,7 @@
 #define CODEPOINT_TYPE_UNIDENTIFIED 0
 #define CODEPOINT_TYPE_NUMBER       1
 #define CODEPOINT_TYPE_LETTER       2
-#define CODEPOINT_TYPE_WHITESPACE   3
+#define CODEPOINT_TYPE_SEPARATOR    3
 #define CODEPOINT_TYPE_ACCENT_MARK  4
 #define CODEPOINT_TYPE_PUNCTUATION  5
 #define CODEPOINT_TYPE_SYMBOL       6

@@ -21,6 +21,8 @@ std::vector<uint32_t> unicode_cpts_normalize_nfd(const std::vector<uint32_t> & c
 int unicode_cpt_type(uint32_t cp);
 int unicode_cpt_type(const std::string & utf8);

+bool unicode_cpt_is_whitespace(uint32_t cp);
+
 std::string unicode_byte_to_utf8(uint8_t byte);
 uint8_t unicode_utf8_to_byte(const std::string & utf8);
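Editor's note: a short, hedged usage sketch of the revised header API. Whitespace membership is now a separate query (unicode_cpt_is_whitespace), while unicode_cpt_type reports the category, with CODEPOINT_TYPE_SEPARATOR replacing the old CODEPOINT_TYPE_WHITESPACE. The concrete codepoints below are assumptions based on standard Unicode categories, not values taken from the diff.

// Assumed behaviour, for illustration only.
#include "unicode.h"
#include <cassert>

static void example_whitespace_queries() {
    assert(unicode_cpt_is_whitespace(0x20));                    // U+0020 SPACE matches \s
    assert(unicode_cpt_type(0x20) == CODEPOINT_TYPE_SEPARATOR); // and is a separator (Zs)
    assert(unicode_cpt_is_whitespace(0x09));                    // U+0009 TAB matches \s, but is not a separator
}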