Merge branch 'ggerganov:master' into snowflake-arctic-clean

This commit is contained in:
fairydreaming 2024-05-09 18:00:09 +02:00 committed by GitHub
commit f3d1227ca4
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
136 changed files with 8527 additions and 3357 deletions

16
.flake8
View file

@ -1,3 +1,17 @@
[flake8]
max-line-length = 125
ignore = W503
ignore = E203,E211,E221,E225,E231,E241,E251,E261,E266,E501,E701,E704,W503
exclude =
# Do not traverse examples
examples,
# Do not include package initializers
__init__.py,
# No need to traverse our git directory
.git,
# There's no value in checking cache directories
__pycache__,
# No need to include the build path
build,
# This contains builds that we don't want to check
dist # This is generated with `python build .` for package releases
# max-complexity = 10

View file

@ -52,7 +52,19 @@ jobs:
ftype: q4_0
pr_comment_enabled: "true"
if: ${{ github.event.inputs.gpu-series == 'Standard_NC4as_T4_v3' || github.event.schedule || github.event.pull_request || github.head_ref == 'master' || github.ref_name == 'master' || github.event.push.ref == 'refs/heads/master' }}
if: |
inputs.gpu-series == 'Standard_NC4as_T4_v3'
|| (
github.event_name == 'schedule'
&& github.ref_name == 'master'
&& github.repository_owner == 'ggerganov'
)
|| github.event_name == 'pull_request_target'
|| (
github.event_name == 'push'
&& github.event.ref == 'refs/heads/master'
&& github.repository_owner == 'ggerganov'
)
steps:
- name: Clone
id: checkout

View file

@ -20,5 +20,4 @@ jobs:
- name: flake8 Lint
uses: py-actions/flake8@v2
with:
ignore: "E203,E211,E221,E225,E231,E241,E251,E261,E266,E501,E701,E704,W503"
exclude: "examples/*,examples/*/**,*/**/__init__.py,convert-hf-to-gguf-update.py"
plugins: "flake8-no-print"

View file

@ -3,13 +3,14 @@
exclude: prompts/.*.txt
repos:
- repo: https://github.com/pre-commit/pre-commit-hooks
rev: v3.2.0
rev: v4.6.0
hooks:
- id: trailing-whitespace
- id: end-of-file-fixer
- id: check-yaml
- id: check-added-large-files
- repo: https://github.com/PyCQA/flake8
rev: 6.0.0
rev: 7.0.0
hooks:
- id: flake8
additional_dependencies: [flake8-no-print]

View file

@ -103,6 +103,8 @@ set(LLAMA_CUDA_KQUANTS_ITER "2" CACHE STRING "llama: iters./thread per block for
set(LLAMA_CUDA_PEER_MAX_BATCH_SIZE "128" CACHE STRING
"llama: max. batch size for using peer access")
option(LLAMA_CUDA_NO_PEER_COPY "llama: do not use peer to peer copies" OFF)
option(LLAMA_CUDA_NO_VMM "llama: do not try to use CUDA VMM" OFF)
option(LLAMA_CURL "llama: use libcurl to download model from an URL" OFF)
option(LLAMA_HIPBLAS "llama: use hipBLAS" OFF)
option(LLAMA_HIP_UMA "llama: use HIP unified memory architecture" OFF)
@ -403,12 +405,16 @@ if (LLAMA_CUDA)
list(APPEND GGML_SOURCES_CUDA "ggml-cuda.cu")
add_compile_definitions(GGML_USE_CUDA)
add_compile_definitions(GGML_CUDA_USE_GRAPHS)
if (LLAMA_CUDA_FORCE_DMMV)
add_compile_definitions(GGML_CUDA_FORCE_DMMV)
endif()
if (LLAMA_CUDA_FORCE_MMQ)
add_compile_definitions(GGML_CUDA_FORCE_MMQ)
endif()
if (LLAMA_CUDA_NO_VMM)
add_compile_definitions(GGML_CUDA_NO_VMM)
endif()
add_compile_definitions(GGML_CUDA_DMMV_X=${LLAMA_CUDA_DMMV_X})
add_compile_definitions(GGML_CUDA_MMV_Y=${LLAMA_CUDA_MMV_Y})
if (DEFINED LLAMA_CUDA_DMMV_Y)
@ -425,7 +431,7 @@ if (LLAMA_CUDA)
if (LLAMA_STATIC)
if (WIN32)
# As of 12.3.1 CUDA Tookit for Windows does not offer a static cublas library
# As of 12.3.1 CUDA Toolkit for Windows does not offer a static cublas library
set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} CUDA::cudart_static CUDA::cublas CUDA::cublasLt)
else ()
set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} CUDA::cudart_static CUDA::cublas_static CUDA::cublasLt_static)
@ -434,7 +440,11 @@ if (LLAMA_CUDA)
set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} CUDA::cudart CUDA::cublas CUDA::cublasLt)
endif()
set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} CUDA::cuda_driver)
if (LLAMA_CUDA_NO_VMM)
# No VMM requested, no need to link directly with the cuda driver lib (libcuda.so)
else()
set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} CUDA::cuda_driver) # required by cuDeviceGetAttribute(), cuMemGetAllocationGranularity(...), ...
endif()
if (NOT DEFINED CMAKE_CUDA_ARCHITECTURES)
# 52 == lowest CUDA 12 standard

View file

@ -77,11 +77,10 @@ test: $(TEST_TARGETS)
./$$test_target $(CURDIR)/models/ggml-vocab-llama-bpe.gguf; \
./$$test_target $(CURDIR)/models/ggml-vocab-phi-3.gguf; \
./$$test_target $(CURDIR)/models/ggml-vocab-falcon.gguf; \
./$$test_target $(CURDIR)/models/ggml-vocab-deepseek-coder.gguf; \
./$$test_target $(CURDIR)/models/ggml-vocab-deepseek-llm.gguf; \
./$$test_target $(CURDIR)/models/ggml-vocab-bert-bge.gguf; \
./$$test_target $(CURDIR)/models/ggml-vocab-starcoder.gguf; \
./$$test_target $(CURDIR)/models/ggml-vocab-gpt-2.gguf; \
./$$test_target $(CURDIR)/models/ggml-vocab-refact.gguf; \
elif [ "$$test_target" = "tests/test-tokenizer-1-spm" ]; then \
continue; \
elif [ "$$test_target" = "tests/test-tokenizer-1-bpe" ]; then \
@ -434,7 +433,7 @@ ifdef LLAMA_CUDA
else
CUDA_PATH ?= /usr/local/cuda
endif
MK_CPPFLAGS += -DGGML_USE_CUDA -I$(CUDA_PATH)/include -I$(CUDA_PATH)/targets/$(UNAME_M)-linux/include
MK_CPPFLAGS += -DGGML_USE_CUDA -I$(CUDA_PATH)/include -I$(CUDA_PATH)/targets/$(UNAME_M)-linux/include -DGGML_CUDA_USE_GRAPHS
MK_LDFLAGS += -lcuda -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L$(CUDA_PATH)/lib64 -L/usr/lib64 -L$(CUDA_PATH)/targets/$(UNAME_M)-linux/lib -L/usr/lib/wsl/lib
OBJS += ggml-cuda.o
OBJS += $(patsubst %.cu,%.o,$(wildcard ggml-cuda/*.cu))

View file

@ -2,7 +2,7 @@
![llama](https://user-images.githubusercontent.com/1991296/230134379-7181e485-c521-4d23-a0d6-f7b3b61ba524.png)
[![License: MIT](https://img.shields.io/badge/license-MIT-blue.svg)](https://opensource.org/licenses/MIT)
[![License: MIT](https://img.shields.io/badge/license-MIT-blue.svg)](https://opensource.org/licenses/MIT) [![Server](https://github.com/ggerganov/llama.cpp/actions/workflows/server.yml/badge.svg?branch=master&event=schedule)](https://github.com/ggerganov/llama.cpp/actions/workflows/server.yml)
[Roadmap](https://github.com/users/ggerganov/projects/7) / [Project status](https://github.com/ggerganov/llama.cpp/discussions/3471) / [Manifesto](https://github.com/ggerganov/llama.cpp/discussions/205) / [ggml](https://github.com/ggerganov/ggml)
@ -20,7 +20,8 @@ Inference of Meta's [LLaMA](https://arxiv.org/abs/2302.13971) model (and others)
### Hot topics
- **BPE pre-tokenization support has been added: https://github.com/ggerganov/llama.cpp/pull/6920**
- **Initial Flash-Attention support: https://github.com/ggerganov/llama.cpp/pull/5021**
- BPE pre-tokenization support has been added: https://github.com/ggerganov/llama.cpp/pull/6920
- MoE memory layout has been updated - reconvert models for `mmap` support and regenerate `imatrix` https://github.com/ggerganov/llama.cpp/pull/6387
- Model sharding instructions using `gguf-split` https://github.com/ggerganov/llama.cpp/discussions/6404
- Fix major bug in Metal batched inference https://github.com/ggerganov/llama.cpp/pull/6225
@ -139,7 +140,6 @@ Typically finetunes of the base models below are supported as well.
- [x] [MobileVLM 1.7B/3B models](https://huggingface.co/models?search=mobileVLM)
- [x] [Yi-VL](https://huggingface.co/models?search=Yi-VL)
- [x] [Mini CPM](https://huggingface.co/models?search=MiniCPM)
- [x] [Moondream](https://huggingface.co/vikhyatk/moondream2)
**HTTP server**
@ -175,6 +175,7 @@ Unless otherwise noted these projects are open-source with permissive licensing:
- [nat/openplayground](https://github.com/nat/openplayground)
- [Faraday](https://faraday.dev/) (proprietary)
- [LMStudio](https://lmstudio.ai/) (proprietary)
- [Layla](https://play.google.com/store/apps/details?id=com.laylalite) (proprietary)
- [LocalAI](https://github.com/mudler/LocalAI) (MIT)
- [LostRuins/koboldcpp](https://github.com/LostRuins/koboldcpp) (AGPL)
- [Mozilla-Ocho/llamafile](https://github.com/Mozilla-Ocho/llamafile)
@ -712,6 +713,8 @@ Building the program with BLAS support may lead to some performance improvements
To obtain the official LLaMA 2 weights please see the <a href="#obtaining-and-using-the-facebook-llama-2-model">Obtaining and using the Facebook LLaMA 2 model</a> section. There is also a large selection of pre-quantized `gguf` models available on Hugging Face.
Note: `convert.py` does not support LLaMA 3, you can use `convert-hf-to-gguf.py` with LLaMA 3 downloaded from Hugging Face.
```bash
# obtain the official LLaMA model weights and place them in ./models
ls ./models
@ -933,17 +936,25 @@ If your issue is with model generation quality, then please at least scan the fo
### Android
#### Build on Android using Termux
[Termux](https://github.com/termux/termux-app#installation) is a method to execute `llama.cpp` on an Android device (no root required).
```
apt update && apt upgrade -y
apt install git make cmake
```
It's recommended to move your model inside the `~/` directory for best performance:
```
cd storage/downloads
mv model.gguf ~/
```
[Get the code](https://github.com/ggerganov/llama.cpp#get-the-code) & [follow the Linux build instructions](https://github.com/ggerganov/llama.cpp#build) to build `llama.cpp`.
#### Building the Project using Android NDK
You can easily run `llama.cpp` on Android device with [termux](https://termux.dev/).
First, install the essential packages for termux:
```
pkg install clang wget git cmake
```
Second, obtain the [Android NDK](https://developer.android.com/ndk) and then build with CMake:
You can execute the following commands on your computer to avoid downloading the NDK to your mobile. Of course, you can also do this in Termux.
Obtain the [Android NDK](https://developer.android.com/ndk) and then build with CMake.
Execute the following commands on your computer to avoid downloading the NDK to your mobile. Alternatively, you can also do this in Termux:
```
$ mkdir build-android
$ cd build-android
@ -951,7 +962,9 @@ $ export NDK=<your_ndk_directory>
$ cmake -DCMAKE_TOOLCHAIN_FILE=$NDK/build/cmake/android.toolchain.cmake -DANDROID_ABI=arm64-v8a -DANDROID_PLATFORM=android-23 -DCMAKE_C_FLAGS=-march=armv8.4a+dotprod ..
$ make
```
Install [termux](https://termux.dev/) on your device and run `termux-setup-storage` to get access to your SD card.
Install [termux](https://github.com/termux/termux-app#installation) on your device and run `termux-setup-storage` to get access to your SD card (if Android 11+ then run the command twice).
Finally, copy these built `llama` binaries and the model file to your device storage. Because the file permissions in the Android sdcard cannot be changed, you can copy the executable files to the `/data/data/com.termux/files/home/bin` path, and then execute the following commands in Termux to add executable permission:
(Assumed that you have pushed the built executable files to the /sdcard/llama.cpp/bin path using `adb push`)
@ -973,53 +986,10 @@ $cd /data/data/com.termux/files/home/bin
$./main -m ../model/llama-2-7b-chat.Q4_K_M.gguf -n 128 -cml
```
Here is a demo of an interactive session running on Pixel 5 phone:
Here's a demo of an interactive session running on Pixel 5 phone:
https://user-images.githubusercontent.com/271616/225014776-1d567049-ad71-4ef2-b050-55b0b3b9274c.mp4
#### Building the Project using Termux (F-Droid)
Termux from F-Droid offers an alternative route to execute the project on an Android device. This method empowers you to construct the project right from within the terminal, negating the requirement for a rooted device or SD Card.
Outlined below are the directives for installing the project using OpenBLAS and CLBlast. This combination is specifically designed to deliver peak performance on recent devices that feature a GPU.
If you opt to utilize OpenBLAS, you'll need to install the corresponding package.
```
apt install libopenblas
```
Subsequently, if you decide to incorporate CLBlast, you'll first need to install the requisite OpenCL packages:
```
apt install ocl-icd opencl-headers opencl-clhpp clinfo
```
In order to compile CLBlast, you'll need to first clone the respective Git repository, which can be found at this URL: https://github.com/CNugteren/CLBlast. Alongside this, clone this repository into your home directory. Once this is done, navigate to the CLBlast folder and execute the commands detailed below:
```
cmake .
make
cp libclblast.so* $PREFIX/lib
cp ./include/clblast.h ../llama.cpp
```
Following the previous steps, navigate to the LlamaCpp directory. To compile it with OpenBLAS and CLBlast, execute the command provided below:
```
cp /data/data/com.termux/files/usr/include/openblas/cblas.h .
cp /data/data/com.termux/files/usr/include/openblas/openblas_config.h .
make LLAMA_CLBLAST=1 //(sometimes you need to run this command twice)
```
Upon completion of the aforementioned steps, you will have successfully compiled the project. To run it using CLBlast, a slight adjustment is required: a command must be issued to direct the operations towards your device's physical GPU, rather than the virtual one. The necessary command is detailed below:
```
GGML_OPENCL_PLATFORM=0
GGML_OPENCL_DEVICE=0
export LD_LIBRARY_PATH=/vendor/lib64:$LD_LIBRARY_PATH
```
(Note: some Android devices, like the Zenfone 8, need the following command instead - "export LD_LIBRARY_PATH=/system/vendor/lib64:$LD_LIBRARY_PATH". Source: https://www.reddit.com/r/termux/comments/kc3ynp/opencl_working_in_termux_more_in_comments/ )
For easy and swift re-execution, consider documenting this final part in a .sh script file. This will enable you to rerun the process with minimal hassle.
Place your desired model into the `~/llama.cpp/models/` directory and execute the `./main (...)` script.
### Docker
#### Prerequisites

View file

@ -160,9 +160,8 @@ function gg_run_test_scripts_debug {
set -e
# TODO: too slow, run on dedicated node
#(cd ./examples/gguf-split && time bash tests.sh "$SRC/build-ci-debug/bin" "$MNT/models") 2>&1 | tee -a $OUT/${ci}-scripts.log
#(cd ./examples/quantize && time bash tests.sh "$SRC/build-ci-debug/bin" "$MNT/models") 2>&1 | tee -a $OUT/${ci}-scripts.log
(cd ./examples/gguf-split && time bash tests.sh "$SRC/build-ci-debug/bin" "$MNT/models") 2>&1 | tee -a $OUT/${ci}-scripts.log
(cd ./examples/quantize && time bash tests.sh "$SRC/build-ci-debug/bin" "$MNT/models") 2>&1 | tee -a $OUT/${ci}-scripts.log
set +e
}
@ -695,8 +694,10 @@ test $ret -eq 0 && gg_run ctest_release
if [ -z ${GG_BUILD_LOW_PERF} ]; then
test $ret -eq 0 && gg_run embd_bge_small
if [ -z ${GG_BUILD_CLOUD} ] || [ ${GG_BUILD_EXTRA_TESTS_0} ]; then
test $ret -eq 0 && gg_run test_scripts_debug
test $ret -eq 0 && gg_run test_scripts_release
fi
if [ -z ${GG_BUILD_VRAM_GB} ] || [ ${GG_BUILD_VRAM_GB} -ge 8 ]; then
if [ -z ${GG_BUILD_CUDA} ]; then

View file

@ -1,4 +1,6 @@
#include "common.h"
// Change JSON_ASSERT from assert() to GGML_ASSERT:
#define JSON_ASSERT GGML_ASSERT
#include "json.hpp"
#include "json-schema-to-grammar.h"
#include "llama.h"
@ -76,7 +78,7 @@ int32_t get_num_physical_cores() {
// enumerate the set of thread siblings, num entries is num cores
std::unordered_set<std::string> siblings;
for (uint32_t cpu=0; cpu < UINT32_MAX; ++cpu) {
std::ifstream thread_siblings("/sys/devices/system/cpu"
std::ifstream thread_siblings("/sys/devices/system/cpu/cpu"
+ std::to_string(cpu) + "/topology/thread_siblings");
if (!thread_siblings.is_open()) {
break; // no more cpus
@ -911,6 +913,10 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
params.instruct = true;
return true;
}
if (arg == "-cnv" || arg == "--conversation") {
params.conversation = true;
return true;
}
if (arg == "-cml" || arg == "--chatml") {
params.chatml = true;
return true;
@ -1417,6 +1423,7 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
printf(" --version show version and build info\n");
printf(" -i, --interactive run in interactive mode\n");
printf(" --interactive-first run in interactive mode and wait for input right away\n");
printf(" -cnv, --conversation run in conversation mode (does not print special tokens and suffix/prefix)\n");
printf(" -ins, --instruct run in instruction mode (use with Alpaca models)\n");
printf(" -cml, --chatml run in chatml mode (use with ChatML-compatible models)\n");
printf(" --multiline-input allows you to write or paste multiple lines without ending each in '\\'\n");
@ -1964,18 +1971,18 @@ static bool llama_download_file(const std::string & url, const std::string & pat
try {
metadata_in >> metadata;
fprintf(stderr, "%s: previous metadata file found %s: %s\n", __func__, metadata_path.c_str(), metadata.dump().c_str());
if (metadata.contains("url") && metadata["url"].is_string()) {
auto previous_url = metadata["url"].get<std::string>();
if (metadata.contains("url") && metadata.at("url").is_string()) {
auto previous_url = metadata.at("url").get<std::string>();
if (previous_url != url) {
fprintf(stderr, "%s: Model URL mismatch: %s != %s\n", __func__, url.c_str(), previous_url.c_str());
return false;
}
}
if (metadata.contains("etag") && metadata["etag"].is_string()) {
etag = metadata["etag"];
if (metadata.contains("etag") && metadata.at("etag").is_string()) {
etag = metadata.at("etag");
}
if (metadata.contains("lastModified") && metadata["lastModified"].is_string()) {
last_modified = metadata["lastModified"];
if (metadata.contains("lastModified") && metadata.at("lastModified").is_string()) {
last_modified = metadata.at("lastModified");
}
} catch (const nlohmann::json::exception & e) {
fprintf(stderr, "%s: error reading metadata file %s: %s\n", __func__, metadata_path.c_str(), e.what());

View file

@ -140,6 +140,7 @@ struct gpt_params {
bool random_prompt = false; // do not randomize prompt if none provided
bool use_color = false; // use color to distinguish generations and inputs
bool interactive = false; // interactive mode
bool conversation = false; // conversation mode (does not print special tokens and suffix/prefix)
bool chatml = false; // chatml mode (used for models trained on chatml syntax)
bool prompt_cache_all = false; // save user input and generations to prompt cache
bool prompt_cache_ro = false; // open the prompt cache read-only and do not update it

View file

@ -1,4 +1,8 @@
#pragma once
#include "ggml.h"
// Change JSON_ASSERT from assert() to GGML_ASSERT:
#define JSON_ASSERT GGML_ASSERT
#include "json.hpp"
std::string json_schema_to_grammar(const nlohmann::ordered_json& schema);

View file

@ -234,7 +234,7 @@ inline std::string log_filename_generator_impl(LogTriState multilog, const std::
// INTERNAL, DO NOT USE
// USE LOG() INSTEAD
//
#if !defined(_MSC_VER) || defined(__INTEL_LLVM_COMPILER)
#if !defined(_MSC_VER) || defined(__INTEL_LLVM_COMPILER) || defined(__clang__)
#define LOG_IMPL(str, ...) \
do { \
if (LOG_TARGET != nullptr) \
@ -257,7 +257,7 @@ inline std::string log_filename_generator_impl(LogTriState multilog, const std::
// INTERNAL, DO NOT USE
// USE LOG_TEE() INSTEAD
//
#if !defined(_MSC_VER) || defined(__INTEL_LLVM_COMPILER)
#if !defined(_MSC_VER) || defined(__INTEL_LLVM_COMPILER) || defined(__clang__)
#define LOG_TEE_IMPL(str, ...) \
do { \
if (LOG_TARGET != nullptr) \

View file

@ -35,6 +35,8 @@ struct llama_sampling_context * llama_sampling_init(const struct llama_sampling_
result->prev.resize(params.n_prev);
result->n_considered = 0;
llama_sampling_set_rng_seed(result, params.seed);
return result;
@ -64,6 +66,7 @@ void llama_sampling_reset(llama_sampling_context * ctx) {
std::fill(ctx->prev.begin(), ctx->prev.end(), 0);
ctx->cur.clear();
ctx->n_considered = 0;
}
void llama_sampling_set_rng_seed(struct llama_sampling_context * ctx, uint32_t seed) {
@ -253,6 +256,8 @@ static llama_token llama_sampling_sample_impl(
}
}
ctx_sampling->n_considered = cur_p.size;
return id;
}

View file

@ -81,6 +81,7 @@ struct llama_sampling_context {
// TODO: replace with ring-buffer
std::vector<llama_token> prev;
std::vector<llama_token_data> cur;
size_t n_considered;
std::mt19937 rng;
};

178
convert-hf-to-gguf-update.py Normal file → Executable file
View file

@ -1,3 +1,5 @@
#!/usr/bin/env python3
# This script downloads the tokenizer models of the specified models from Huggingface and
# generates the get_vocab_base_pre() function for convert-hf-to-gguf.py
#
@ -21,6 +23,7 @@
# TODO: automate the update of convert-hf-to-gguf.py
#
import logging
import os
import requests
import sys
@ -28,49 +31,66 @@ import json
from hashlib import sha256
from enum import IntEnum, auto
from transformers import AutoTokenizer
logging.basicConfig(level=logging.DEBUG)
logger = logging.getLogger("convert-hf-to-gguf-update")
class TOKENIZER_TYPE(IntEnum):
SPM = auto()
BPE = auto()
WPM = auto()
# TODO: this string has to exercise as much pre-tokenizer functionality as possible
# will be updated with time - contributions welcome
chktxt = '\n \n\n \n\n\n \t \t\t \t\n \n \n \n \n🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ 🦙🦙 3 33 333 3333 33333 333333 3333333 33333333 3.3 3..3 3...3 កាន់តែពិសេសអាច😁 ?我想在apple工作1314151天 ------======= нещо на Български \'\'\'\'\'\'```````\"\"\"\"......!!!!!!?????? I\'ve been \'told he\'s there, \'RE you sure? \'M not sure I\'ll make it, \'D you like some tea? We\'Ve a\'lL'
if len(sys.argv) == 2:
token = sys.argv[1]
if not token.startswith("hf_"):
logger.info("Huggingface token seems invalid")
logger.info("Usage: python convert-hf-to-gguf-update.py <huggingface_token>")
sys.exit(1)
else:
print("Usage: python convert-hf-to-gguf-update.py <huggingface_token>")
logger.info("Usage: python convert-hf-to-gguf-update.py <huggingface_token>")
sys.exit(1)
# TODO: add models here, base models preferred
models = [
{ "name": "llama-spm", "tokt": TOKENIZER_TYPE.SPM, "repo": "https://huggingface.co/meta-llama/Llama-2-7b-hf", },
{ "name": "llama-bpe", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/meta-llama/Meta-Llama-3-8B", },
{ "name": "phi-3", "tokt": TOKENIZER_TYPE.SPM, "repo": "https://huggingface.co/microsoft/Phi-3-mini-4k-instruct", },
{ "name": "deepseek-llm", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/deepseek-ai/deepseek-llm-7b-base", },
{ "name": "deepseek-coder", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/deepseek-ai/deepseek-coder-6.7b-base", },
{ "name": "falcon", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/tiiuae/falcon-7b", },
{ "name": "bert-bge", "tokt": TOKENIZER_TYPE.WPM, "repo": "https://huggingface.co/BAAI/bge-small-en-v1.5", },
{ "name": "mpt", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/mosaicml/mpt-7b", },
{ "name": "starcoder", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/bigcode/starcoder2-3b", },
{ "name": "gpt-2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/openai-community/gpt2", },
]
{"name": "llama-spm", "tokt": TOKENIZER_TYPE.SPM, "repo": "https://huggingface.co/meta-llama/Llama-2-7b-hf", },
{"name": "llama-bpe", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/meta-llama/Meta-Llama-3-8B", },
{"name": "phi-3", "tokt": TOKENIZER_TYPE.SPM, "repo": "https://huggingface.co/microsoft/Phi-3-mini-4k-instruct", },
{"name": "deepseek-llm", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/deepseek-ai/deepseek-llm-7b-base", },
{"name": "deepseek-coder", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/deepseek-ai/deepseek-coder-6.7b-base", },
{"name": "falcon", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/tiiuae/falcon-7b", },
{"name": "bert-bge", "tokt": TOKENIZER_TYPE.WPM, "repo": "https://huggingface.co/BAAI/bge-small-en-v1.5", },
{"name": "mpt", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/mosaicml/mpt-7b", },
{"name": "starcoder", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/bigcode/starcoder2-3b", },
{"name": "gpt-2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/openai-community/gpt2", },
{"name": "refact", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/smallcloudai/Refact-1_6-base", },
{"name": "command-r", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/CohereForAI/c4ai-command-r-v01", },
{"name": "qwen2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/Qwen/Qwen1.5-7B", },
{"name": "olmo", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/allenai/OLMo-1.7-7B-hf", },
{"name": "dbrx", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/databricks/dbrx-base", },
]
# make directory "models/tokenizers" if it doesn't exist
if not os.path.exists("models/tokenizers"):
os.makedirs("models/tokenizers")
def download_file_with_auth(url, token, save_path):
headers = {"Authorization": f"Bearer {token}"}
response = requests.get(url, headers=headers)
if response.status_code == 200:
with open(save_path, 'wb') as f:
f.write(response.content)
print(f"File {save_path} downloaded successfully")
logger.info(f"File {save_path} downloaded successfully")
else:
print(f"Failed to download file. Status code: {response.status_code}")
logger.info(f"Failed to download file. Status code: {response.status_code}")
# download the tokenizer models
for model in models:
@ -81,10 +101,10 @@ for model in models:
if not os.path.exists(f"models/tokenizers/{name}"):
os.makedirs(f"models/tokenizers/{name}")
else:
print(f"Directory models/tokenizers/{name} already exists - skipping")
logger.info(f"Directory models/tokenizers/{name} already exists - skipping")
continue
print(f"Downloading {name} to models/tokenizers/{name}")
logger.info(f"Downloading {name} to models/tokenizers/{name}")
url = f"{repo}/raw/main/config.json"
save_path = f"models/tokenizers/{name}/config.json"
@ -94,6 +114,14 @@ for model in models:
save_path = f"models/tokenizers/{name}/tokenizer.json"
download_file_with_auth(url, token, save_path)
# if downloaded file is less than 1KB, we likely need to download an LFS instead
if os.path.getsize(save_path) < 1024:
# remove the file
os.remove(save_path)
url = f"{repo}/resolve/main/tokenizer.json"
save_path = f"models/tokenizers/{name}/tokenizer.json"
download_file_with_auth(url, token, save_path)
if tokt == TOKENIZER_TYPE.SPM:
url = f"{repo}/resolve/main/tokenizer.model"
save_path = f"models/tokenizers/{name}/tokenizer.model"
@ -115,80 +143,84 @@ for model in models:
continue
# create the tokenizer
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained(f"models/tokenizers/{name}")
chktok = tokenizer.encode(chktxt)
chkhsh = sha256(str(chktok).encode()).hexdigest()
print(f"model: {name}")
print(f"tokt: {tokt}")
print(f"repo: {model['repo']}")
print(f"chktok: {chktok}")
print(f"chkhsh: {chkhsh}")
logger.info(f"model: {name}")
logger.info(f"tokt: {tokt}")
logger.info(f"repo: {model['repo']}")
logger.info(f"chktok: {chktok}")
logger.info(f"chkhsh: {chkhsh}")
# print the "pre_tokenizer" content from the tokenizer.json
with open(f"models/tokenizers/{name}/tokenizer.json", "r", encoding="utf-8") as f:
cfg = json.load(f)
normalizer = cfg["normalizer"]
logger.info("normalizer: " + json.dumps(normalizer, indent=4))
pre_tokenizer = cfg["pre_tokenizer"]
print("pre_tokenizer: " + json.dumps(pre_tokenizer, indent=4))
logger.info("pre_tokenizer: " + json.dumps(pre_tokenizer, indent=4))
print(f"\n")
logger.info("")
src_ifs += f" if chkhsh == \"{chkhsh}\":\n"
src_ifs += f" # ref: {model['repo']}\n"
src_ifs += f" res = \"{name}\"\n"
src_func = ""
src_func += " def get_vocab_base_pre(self, tokenizer) -> str:\n"
src_func += " # encoding this string and hashing the resulting tokens would (hopefully) give us a unique identifier that\n"
src_func += " # is specific for the BPE pre-tokenizer used by the model\n"
src_func += " # we will use this unique identifier to write a \"tokenizer.ggml.pre\" entry in the GGUF file which we can\n"
src_func += " # use in llama.cpp to implement the same pre-tokenizer\n"
src_func += "\n"
src_func += f" chktxt = {repr(chktxt)}\n"
src_func += "\n"
src_func += " chktok = tokenizer.encode(chktxt)\n"
src_func += " chkhsh = sha256(str(chktok).encode()).hexdigest()\n"
src_func += "\n"
src_func += " print(f\"chktok: {chktok}\")\n"
src_func += " print(f\"chkhsh: {chkhsh}\")\n"
src_func += "\n"
src_func += " res = None\n"
src_func += "\n"
src_func += " # NOTE: if you get an error here, you need to update the convert-hf-to-gguf-update.py script\n"
src_func += " # or pull the latest version of the model from Huggingface\n"
src_func += " # don't edit the hashes manually!\n"
src_func += f"{src_ifs}\n"
src_func += " if res is None:\n"
src_func += " print(\"\\n\")\n"
src_func += " print(\"**************************************************************************************\")\n"
src_func += " print(\"** WARNING: The BPE pre-tokenizer was not recognized!\")\n"
src_func += " print(\"** There are 2 possible reasons for this:\")\n"
src_func += " print(\"** - the model has not been added to convert-hf-to-gguf-update.py yet\")\n"
src_func += " print(\"** - the pre-tokenization config has changed upstream\")\n"
src_func += " print(\"** Check your model files and convert-hf-to-gguf-update.py and update them accordingly.\")\n"
src_func += " print(\"** ref: https://github.com/ggerganov/llama.cpp/pull/6920\")\n"
src_func += " print(\"**\")\n"
src_func += " print(f\"** chkhsh: {chkhsh}\")\n"
src_func += " print(\"**************************************************************************************\")\n"
src_func += " print(\"\\n\")\n"
src_func += " raise NotImplementedError(\"BPE pre-tokenizer was not recognized - update get_vocab_base_pre()\")\n"
src_func += "\n"
src_func += " print(f\"tokenizer.ggml.pre: {res}\")\n"
src_func += " print(f\"chkhsh: {chkhsh}\")\n"
src_func += "\n"
src_func += " return res\n"
src_func = f"""
def get_vocab_base_pre(self, tokenizer) -> str:
# encoding this string and hashing the resulting tokens would (hopefully) give us a unique identifier that
# is specific for the BPE pre-tokenizer used by the model
# we will use this unique identifier to write a "tokenizer.ggml.pre" entry in the GGUF file which we can
# use in llama.cpp to implement the same pre-tokenizer
print(src_func)
chktxt = {repr(chktxt)}
print("\n")
print("!!! Copy-paste the function above into convert-hf-to-gguf.py !!!")
print("\n")
chktok = tokenizer.encode(chktxt)
chkhsh = sha256(str(chktok).encode()).hexdigest()
logger.debug(f"chktok: {{chktok}}")
logger.debug(f"chkhsh: {{chkhsh}}")
res = None
# NOTE: if you get an error here, you need to update the convert-hf-to-gguf-update.py script
# or pull the latest version of the model from Huggingface
# don't edit the hashes manually!
{src_ifs}
if res is None:
logger.warning("\\n")
logger.warning("**************************************************************************************")
logger.warning("** WARNING: The BPE pre-tokenizer was not recognized!")
logger.warning("** There are 2 possible reasons for this:")
logger.warning("** - the model has not been added to convert-hf-to-gguf-update.py yet")
logger.warning("** - the pre-tokenization config has changed upstream")
logger.warning("** Check your model files and convert-hf-to-gguf-update.py and update them accordingly.")
logger.warning("** ref: https://github.com/ggerganov/llama.cpp/pull/6920")
logger.warning("**")
logger.warning(f"** chkhsh: {{chkhsh}}")
logger.warning("**************************************************************************************")
logger.warning("\\n")
raise NotImplementedError("BPE pre-tokenizer was not recognized - update get_vocab_base_pre()")
logger.debug(f"tokenizer.ggml.pre: {{repr(res)}}")
logger.debug(f"chkhsh: {{chkhsh}}")
return res
"""
print(src_func) # noqa: NP100
logger.info("\n")
logger.info("!!! Copy-paste the function above into convert-hf-to-gguf.py !!!")
logger.info("\n")
# generate tests for each tokenizer model
tests = [
"ied 4 ½ months",
"Führer",
"",
" ",
" ",
@ -229,6 +261,7 @@ tests = [
"3333333",
"33333333",
"333333333",
# "Cửa Việt", # llama-bpe fails on this
chktxt,
]
@ -250,7 +283,6 @@ for model in models:
tokt = model["tokt"]
# create the tokenizer
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained(f"models/tokenizers/{name}")
with open(f"models/ggml-vocab-{name}.gguf.inp", "w", encoding="utf-8") as f:
@ -265,15 +297,15 @@ for model in models:
f.write(f" {r}")
f.write("\n")
print(f"Tests for {name} written in ./models/ggml-vocab-{name}.gguf.*")
logger.info(f"Tests for {name} written in ./models/ggml-vocab-{name}.gguf.*")
# generate commands for creating vocab files
print("\nRun the following commands to generate the vocab files for testing:\n")
logger.info("\nRun the following commands to generate the vocab files for testing:\n")
for model in models:
name = model["name"]
print(f"python3 convert-hf-to-gguf.py models/tokenizers/{name}/ --outfile models/ggml-vocab-{name}.gguf --vocab-only")
print(f"python3 convert-hf-to-gguf.py models/tokenizers/{name}/ --outfile models/ggml-vocab-{name}.gguf --vocab-only") # noqa: NP100
print("\n")
logger.info("\n")

File diff suppressed because it is too large Load diff

View file

@ -1,6 +1,7 @@
#!/usr/bin/env python3
from __future__ import annotations
import logging
import argparse
import os
import struct
@ -14,6 +15,8 @@ if 'NO_LOCAL_GGUF' not in os.environ:
sys.path.insert(1, str(Path(__file__).parent / 'gguf-py'))
import gguf
logger = logging.getLogger("ggml-to-gguf")
class GGMLFormat(IntEnum):
GGML = 0
@ -125,7 +128,6 @@ class Tensor:
self.start_offset = offset
self.len_bytes = n_bytes
offset += n_bytes
# print(n_dims, name_len, dtype, self.dims, self.name, pad)
return offset - orig_offset
@ -175,7 +177,7 @@ class GGMLModel:
offset += self.validate_header(data, offset)
hp = Hyperparameters()
offset += hp.load(data, offset)
print(f'* File format: {self.file_format.name}v{self.format_version} with ftype {hp.ftype.name}')
logger.info(f'* File format: {self.file_format.name}v{self.format_version} with ftype {hp.ftype.name}')
self.validate_conversion(hp.ftype)
vocab = Vocab(load_scores = self.file_format > GGMLFormat.GGML)
offset += vocab.load(data, offset, hp.n_vocab)
@ -215,12 +217,12 @@ class GGMLToGGUF:
if float(hp.n_head) / float(x) == gqa:
n_kv_head = x
assert n_kv_head is not None, "Couldn't determine n_kv_head from GQA param"
print(f'- Guessed n_kv_head = {n_kv_head} based on GQA {cfg.gqa}')
logger.info(f'- Guessed n_kv_head = {n_kv_head} based on GQA {cfg.gqa}')
self.n_kv_head = n_kv_head
self.name_map = gguf.get_tensor_name_map(gguf.MODEL_ARCH.LLAMA, ggml_model.hyperparameters.n_layer)
def save(self):
print('* Preparing to save GGUF file')
logger.info('* Preparing to save GGUF file')
gguf_writer = gguf.GGUFWriter(
self.cfg.output,
gguf.MODEL_ARCH_NAMES[gguf.MODEL_ARCH.LLAMA],
@ -230,11 +232,11 @@ class GGMLToGGUF:
if self.special_vocab is not None:
self.special_vocab.add_to_gguf(gguf_writer)
self.add_tensors(gguf_writer)
print(" gguf: write header")
logger.info(" gguf: write header")
gguf_writer.write_header_to_file()
print(" gguf: write metadata")
logger.info(" gguf: write metadata")
gguf_writer.write_kv_data_to_file()
print(" gguf: write tensors")
logger.info(" gguf: write tensors")
gguf_writer.write_tensors_to_file()
gguf_writer.close()
@ -250,7 +252,7 @@ class GGMLToGGUF:
name = cfg.name if cfg.name is not None else cfg.input.name
except UnicodeDecodeError:
name = None
print('* Adding model parameters and KV items')
logger.info('* Adding model parameters and KV items')
if name is not None:
gguf_writer.add_name(name)
gguf_writer.add_description(desc)
@ -287,7 +289,7 @@ class GGMLToGGUF:
toktypes = []
if self.vocab_override is not None:
vo = self.vocab_override
print('* Adding vocab item(s)')
logger.info('* Adding vocab item(s)')
for (idx, (vbytes, score, ttype)) in enumerate(vo.all_tokens()):
tokens.append(vbytes)
scores.append(score)
@ -299,7 +301,7 @@ class GGMLToGGUF:
if len(toktypes) > 0:
gguf_writer.add_token_types(toktypes)
return
print(f'* Adding {hp.n_vocab} vocab item(s)')
logger.info(f'* Adding {hp.n_vocab} vocab item(s)')
assert len(self.model.vocab.items) >= 3, 'Cannot handle unexpectedly short model vocab'
for (tokid, (vbytes, vscore)) in enumerate(self.model.vocab.items):
tt = 1 # Normal
@ -334,7 +336,7 @@ class GGMLToGGUF:
def add_tensors(self, gguf_writer):
tensor_map = self.name_map
data = self.data
print(f'* Adding {len(self.model.tensors)} tensor(s)')
logger.info(f'* Adding {len(self.model.tensors)} tensor(s)')
for tensor in self.model.tensors:
name = str(tensor.name, 'UTF-8')
mapped_name = tensor_map.get_name(name, try_suffixes = (".weight", ".bias"))
@ -344,7 +346,6 @@ class GGMLToGGUF:
temp = tempdims[1]
tempdims[1] = tempdims[0]
tempdims[0] = temp
# print(f'+ {tensor.name} | {mapped_name} {tensor.dims} :: {tempdims}')
gguf_writer.add_tensor(
mapped_name,
data[tensor.start_offset:tensor.start_offset + tensor.len_bytes],
@ -401,33 +402,35 @@ def handle_args():
help="directory containing tokenizer.model, if separate from model file - only meaningful with --model-metadata-dir")
parser.add_argument("--vocabtype", default="spm,hfft",
help="vocab format - only meaningful with --model-metadata-dir and/or --vocab-dir (default: spm,hfft)")
parser.add_argument("--verbose", action="store_true", help="increase output verbosity")
return parser.parse_args()
def main():
cfg = handle_args()
print(f'* Using config: {cfg}')
print('\n=== WARNING === Be aware that this conversion script is best-effort. Use a native GGUF model if possible. === WARNING ===\n')
logging.basicConfig(level=logging.DEBUG if cfg.verbose else logging.INFO)
logger.info(f'* Using config: {cfg}')
logger.warning('=== WARNING === Be aware that this conversion script is best-effort. Use a native GGUF model if possible. === WARNING ===')
if cfg.model_metadata_dir is None and (cfg.gqa == 1 or cfg.eps == '5.0e-06'):
print('- Note: If converting LLaMA2, specifying "--eps 1e-5" is required. 70B models also need "--gqa 8".')
logger.info('- Note: If converting LLaMA2, specifying "--eps 1e-5" is required. 70B models also need "--gqa 8".')
data = np.memmap(cfg.input, mode = 'r')
model = GGMLModel()
print('* Scanning GGML input file')
logger.info('* Scanning GGML input file')
offset = model.load(data, 0) # noqa
print(f'* GGML model hyperparameters: {model.hyperparameters}')
logger.info(f'* GGML model hyperparameters: {model.hyperparameters}')
vocab_override = None
params_override = None
special_vocab = None
if cfg.model_metadata_dir is not None:
(params_override, vocab_override, special_vocab) = handle_metadata(cfg, model.hyperparameters)
print('!! Note: When overriding params the --gqa, --eps and --context-length options are ignored.')
print(f'* Overriding params: {params_override}')
print(f'* Overriding vocab: {vocab_override}')
print(f'* Special vocab: {special_vocab}')
logger.info('!! Note: When overriding params the --gqa, --eps and --context-length options are ignored.')
logger.info(f'* Overriding params: {params_override}')
logger.info(f'* Overriding vocab: {vocab_override}')
logger.info(f'* Special vocab: {special_vocab}')
else:
print('\n=== WARNING === Special tokens may not be converted correctly. Use --model-metadata-dir if possible === WARNING ===\n')
logger.warning('\n=== WARNING === Special tokens may not be converted correctly. Use --model-metadata-dir if possible === WARNING ===\n')
if model.file_format == GGMLFormat.GGML:
print('! This is a very old GGML file that does not contain vocab scores. Strongly recommend using model metadata!')
logger.info('! This is a very old GGML file that does not contain vocab scores. Strongly recommend using model metadata!')
converter = GGMLToGGUF(
model, data, cfg,
params_override = params_override,
@ -435,7 +438,7 @@ def main():
special_vocab = special_vocab
)
converter.save()
print(f'* Successful completion. Output saved to: {cfg.output}')
logger.info(f'* Successful completion. Output saved to: {cfg.output}')
if __name__ == '__main__':

View file

@ -1,6 +1,7 @@
#!/usr/bin/env python3
from __future__ import annotations
import logging
import json
import os
import struct
@ -15,6 +16,9 @@ if 'NO_LOCAL_GGUF' not in os.environ:
sys.path.insert(1, str(Path(__file__).parent / 'gguf-py' / 'gguf'))
import gguf
logging.basicConfig(level=logging.DEBUG)
logger = logging.getLogger("lora-to-gguf")
NUMPY_TYPE_TO_FTYPE: dict[str, int] = {"float32": 0, "float16": 1}
@ -48,11 +52,9 @@ def write_tensor_header(fout: BinaryIO, name: str, shape: Sequence[int], data_ty
if __name__ == '__main__':
if len(sys.argv) < 2:
print(f"Usage: python {sys.argv[0]} <path> [arch]")
print(
"Path must contain HuggingFace PEFT LoRA files 'adapter_config.json' and 'adapter_model.bin'"
)
print(f"Arch must be one of {list(gguf.MODEL_ARCH_NAMES.values())} (default: llama)")
logger.info(f"Usage: python {sys.argv[0]} <path> [arch]")
logger.info("Path must contain HuggingFace PEFT LoRA files 'adapter_config.json' and 'adapter_model.bin'")
logger.info(f"Arch must be one of {list(gguf.MODEL_ARCH_NAMES.values())} (default: llama)")
sys.exit(1)
input_json = os.path.join(sys.argv[1], "adapter_config.json")
@ -70,7 +72,7 @@ if __name__ == '__main__':
arch_name = sys.argv[2] if len(sys.argv) == 3 else "llama"
if arch_name not in gguf.MODEL_ARCH_NAMES.values():
print(f"Error: unsupported architecture {arch_name}")
logger.error(f"Error: unsupported architecture {arch_name}")
sys.exit(1)
arch = list(gguf.MODEL_ARCH_NAMES.keys())[list(gguf.MODEL_ARCH_NAMES.values()).index(arch_name)]
@ -80,21 +82,21 @@ if __name__ == '__main__':
params = json.load(f)
if params["peft_type"] != "LORA":
print(f"Error: unsupported adapter type {params['peft_type']}, expected LORA")
logger.error(f"Error: unsupported adapter type {params['peft_type']}, expected LORA")
sys.exit(1)
if params["fan_in_fan_out"] is True:
print("Error: param fan_in_fan_out is not supported")
logger.error("Error: param fan_in_fan_out is not supported")
sys.exit(1)
if params["bias"] is not None and params["bias"] != "none":
print("Error: param bias is not supported")
logger.error("Error: param bias is not supported")
sys.exit(1)
# TODO: these seem to be layers that have been trained but without lora.
# doesn't seem widely used but eventually should be supported
if params["modules_to_save"] is not None and len(params["modules_to_save"]) > 0:
print("Error: param modules_to_save is not supported")
logger.error("Error: param modules_to_save is not supported")
sys.exit(1)
with open(output_path, "wb") as fout:
@ -125,13 +127,13 @@ if __name__ == '__main__':
suffix = k[-len(lora_suffixes[0]):]
k = k[: -len(lora_suffixes[0])]
else:
print(f"Error: unrecognized tensor name {orig_k}")
logger.error(f"Error: unrecognized tensor name {orig_k}")
sys.exit(1)
tname = name_map.get_name(k)
if tname is None:
print(f"Error: could not map tensor name {orig_k}")
print(" Note: the arch parameter must be specified if the model is not llama")
logger.error(f"Error: could not map tensor name {orig_k}")
logger.error(" Note: the arch parameter must be specified if the model is not llama")
sys.exit(1)
if suffix == ".lora_A.weight":
@ -141,8 +143,8 @@ if __name__ == '__main__':
else:
assert False
print(f"{k} => {tname} {t.shape} {t.dtype} {t.nbytes/1024/1024:.2f}MB")
logger.info(f"{k} => {tname} {t.shape} {t.dtype} {t.nbytes/1024/1024:.2f}MB")
write_tensor_header(fout, tname, t.shape, t.dtype)
t.tofile(fout)
print(f"Converted {input_json} and {input_model} to {output_path}")
logger.info(f"Converted {input_json} and {input_model} to {output_path}")

View file

@ -1,6 +1,7 @@
#!/usr/bin/env python3
from __future__ import annotations
import logging
import argparse
import os
import sys
@ -14,6 +15,8 @@ if 'NO_LOCAL_GGUF' not in os.environ:
sys.path.insert(1, str(Path(__file__).parent / 'gguf-py'))
import gguf
logger = logging.getLogger("persimmon-to-gguf")
def _flatten_dict(dct, tensors, prefix=None):
assert isinstance(dct, dict)
@ -30,9 +33,9 @@ def _flatten_dict(dct, tensors, prefix=None):
def _get_sentencepiece_tokenizer_info(dir_model: Path):
tokenizer_path = dir_model / 'adept_vocab.model'
print('gguf: getting sentencepiece tokenizer from', tokenizer_path)
logger.info('getting sentencepiece tokenizer from', tokenizer_path)
tokenizer = SentencePieceProcessor(str(tokenizer_path))
print('gguf: adding tokens')
logger.info('adding tokens')
tokens: list[bytes] = []
scores: list[float] = []
toktypes: list[int] = []
@ -68,7 +71,9 @@ def main():
parser.add_argument("--ckpt-path", type=Path, help="path to persimmon checkpoint .pt file")
parser.add_argument("--model-dir", type=Path, help="directory containing model e.g. 8b_chat_model_release")
parser.add_argument("--adept-inference-dir", type=str, help="path to adept-inference code directory")
parser.add_argument("--verbose", action="store_true", help="increase output verbosity")
args = parser.parse_args()
logging.basicConfig(level=logging.DEBUG if args.verbose else logging.INFO)
sys.path.append(str(args.adept_inference_dir))
persimmon_model = torch.load(args.ckpt_path)
hparams = persimmon_model['args']
@ -107,7 +112,7 @@ def main():
gguf_writer.add_eos_token_id(71013)
tensor_map = gguf.get_tensor_name_map(arch, block_count)
print(tensor_map)
logger.info(tensor_map)
for name in tensors.keys():
data_torch = tensors[name]
if name.endswith(".self_attention.rotary_emb.inv_freq"):
@ -117,22 +122,21 @@ def main():
data = data_torch.to(torch.float32).squeeze().numpy()
new_name = tensor_map.get_name(name, try_suffixes = (".weight", ".bias"))
if new_name is None:
print("Can not map tensor '" + name + "'")
sys.exit()
raise ValueError(f"Can not map tensor '{name}'")
n_dims = len(data.shape)
print(new_name + ", n_dims = " + str(n_dims) + ", " + str(old_dtype) + " --> " + str(data.dtype))
logger.debug(f"{new_name}, n_dims = {str(n_dims)}, {str(old_dtype)} --> {str(data.dtype)}")
gguf_writer.add_tensor(new_name, data)
print("gguf: write header")
logger.info("gguf: write header")
gguf_writer.write_header_to_file()
print("gguf: write metadata")
logger.info("gguf: write metadata")
gguf_writer.write_kv_data_to_file()
print("gguf: write tensors")
logger.info("gguf: write tensors")
gguf_writer.write_tensors_to_file()
gguf_writer.close()
print(f"gguf: model successfully exported to '{args.outfile}'")
print("")
logger.info(f"gguf: model successfully exported to '{args.outfile}'")
if __name__ == '__main__':

View file

@ -1,6 +1,7 @@
#!/usr/bin/env python3
from __future__ import annotations
import logging
import argparse
import concurrent.futures
import enum
@ -35,6 +36,8 @@ import gguf
if TYPE_CHECKING:
from typing_extensions import Self, TypeAlias
logger = logging.getLogger("convert")
if hasattr(faulthandler, 'register') and hasattr(signal, 'SIGUSR1'):
faulthandler.register(signal.SIGUSR1)
@ -281,6 +284,7 @@ class Params:
n_experts = None
n_experts_used = None
f_rope_freq_base = None
n_ff = None
# hack to determine LLaMA v1 vs v2 vs CodeLlama
if config.get("moe"):
@ -305,6 +309,8 @@ class Params:
n_experts_used = config["moe"]["num_experts_per_tok"]
f_rope_freq_base = 1e6
assert n_ff is not None
return Params(
n_vocab = model["tok_embeddings.weight"].shape[0],
n_embd = config["dim"],
@ -459,7 +465,8 @@ class SentencePieceVocab(Vocab):
# not found in alternate location either
raise FileNotFoundError('Cannot find tokenizer.model')
self.sentencepiece_tokenizer = SentencePieceProcessor(str(fname_tokenizer))
self.sentencepiece_tokenizer = SentencePieceProcessor()
self.sentencepiece_tokenizer.LoadFromFile(str(fname_tokenizer))
vocab_size = self.sentencepiece_tokenizer.vocab_size()
new_tokens = {id: piece for piece, id in added_tokens.items() if id >= vocab_size}
@ -479,23 +486,23 @@ class SentencePieceVocab(Vocab):
def sentencepiece_tokens(self) -> Iterable[tuple[bytes, float, gguf.TokenType]]:
tokenizer = self.sentencepiece_tokenizer
for i in range(tokenizer.vocab_size()):
piece = tokenizer.id_to_piece(i)
piece = tokenizer.IdToPiece(i)
text = piece.encode("utf-8")
score: float = tokenizer.get_score(i)
score: float = tokenizer.GetScore(i)
toktype = gguf.TokenType.NORMAL
if tokenizer.is_unknown(i):
if tokenizer.IsUnknown(i):
toktype = gguf.TokenType.UNKNOWN
if tokenizer.is_control(i):
if tokenizer.IsControl(i):
toktype = gguf.TokenType.CONTROL
# NOTE: I think added_tokens are user defined.
# ref: https://github.com/google/sentencepiece/blob/master/src/sentencepiece_model.proto
# if tokenizer.is_user_defined(i): toktype = gguf.TokenType.USER_DEFINED
if tokenizer.is_unused(i):
if tokenizer.IsUnused(i):
toktype = gguf.TokenType.UNUSED
if tokenizer.is_byte(i):
if tokenizer.IsByte(i):
toktype = gguf.TokenType.BYTE
yield text, score, toktype
@ -643,7 +650,6 @@ class LlamaHfVocab(Vocab):
def permute(weights: NDArray, n_head: int, n_head_kv: int) -> NDArray:
# print( "permute debug " + str(weights.shape[0]) + " x " + str(weights.shape[1]) + " nhead " + str(n_head) + " nheadkv " + str(n_kv_head) )
if n_head_kv is not None and n_head != n_head_kv:
n_head = n_head_kv
return (weights.reshape(n_head, 2, weights.shape[0] // n_head // 2, *weights.shape[1:])
@ -904,7 +910,7 @@ class LazyUnpickler(pickle.Unpickler):
def rebuild_from_type_v2(func, new_type, args, state):
return func(*args)
CLASSES = {
CLASSES: dict[tuple[str, str], type[LazyTensor] | LazyStorageKind] = {
# getattr used here as a workaround for mypy not being smart enough to determine
# the staticmethods have a __func__ attribute.
('torch._tensor', '_rebuild_from_type_v2'): getattr(rebuild_from_type_v2, '__func__'),
@ -1033,12 +1039,12 @@ def check_vocab_size(params: Params, vocab: BaseVocab, pad_vocab: bool = False)
# Check for a vocab size mismatch
if params.n_vocab == vocab.vocab_size:
print("Ignoring added_tokens.json since model matches vocab size without it.")
logger.warning("Ignoring added_tokens.json since model matches vocab size without it.")
return
if pad_vocab and params.n_vocab > vocab.vocab_size:
pad_count = params.n_vocab - vocab.vocab_size
print(
logger.debug(
f"Padding vocab with {pad_count} token(s) - <dummy00001> through <dummy{pad_count:05}>"
)
for i in range(1, pad_count + 1):
@ -1166,7 +1172,7 @@ class OutputFile:
elapsed = time.time() - start
size = ' x '.join(f"{dim:6d}" for dim in lazy_tensor.shape)
padi = len(str(len(model)))
print(
logger.info(
f"[{i + 1:{padi}d}/{len(model)}] Writing tensor {name:38s} | size {size:16} | type {lazy_tensor.data_type.name:4} | T+{int(elapsed):4}"
)
self.gguf.write_tensor_data(ndarray)
@ -1281,12 +1287,12 @@ def convert_model_names(model: LazyModel, params: Params, skip_unknown: bool) ->
# HF models permut or pack some of the tensors, so we need to undo that
for i in itertools.count():
if f"model.layers.{i}.self_attn.q_proj.weight" in model:
print(f"Permuting layer {i}")
logger.debug(f"Permuting layer {i}")
tmp[f"model.layers.{i}.self_attn.q_proj.weight"] = permute_lazy(model[f"model.layers.{i}.self_attn.q_proj.weight"], params.n_head, params.n_head)
tmp[f"model.layers.{i}.self_attn.k_proj.weight"] = permute_lazy(model[f"model.layers.{i}.self_attn.k_proj.weight"], params.n_head, params.n_head_kv)
# tmp[f"model.layers.{i}.self_attn.v_proj.weight"] = model[f"model.layers.{i}.self_attn.v_proj.weight"]
elif f"model.layers.{i}.self_attn.W_pack.weight" in model:
print(f"Unpacking and permuting layer {i}")
logger.debug(f"Unpacking and permuting layer {i}")
tmp[f"model.layers.{i}.self_attn.q_proj.weight"] = permute_part_lazy(model[f"model.layers.{i}.self_attn.W_pack.weight"], 0, params.n_head, params.n_head)
tmp[f"model.layers.{i}.self_attn.k_proj.weight"] = permute_part_lazy(model[f"model.layers.{i}.self_attn.W_pack.weight"], 1, params.n_head, params.n_head_kv)
tmp[f"model.layers.{i}.self_attn.v_proj.weight"] = part_lazy (model[f"model.layers.{i}.self_attn.W_pack.weight"], 2)
@ -1299,15 +1305,15 @@ def convert_model_names(model: LazyModel, params: Params, skip_unknown: bool) ->
tensor_type, name_new = tmap.get_type_and_name(name, try_suffixes = (".weight", ".bias")) or (None, None)
if name_new is None:
if skip_unknown:
print(f"Unexpected tensor name: {name} - skipping")
logger.warning(f"Unexpected tensor name: {name} - skipping")
continue
raise ValueError(f"Unexpected tensor name: {name}. Use --skip-unknown to ignore it (e.g. LLaVA)")
if tensor_type in should_skip:
print(f"skipping tensor {name_new}")
logger.debug(f"skipping tensor {name_new}")
continue
print(f"{name:48s} -> {name_new:40s} | {lazy_tensor.data_type.name:6s} | {lazy_tensor.shape}")
logger.debug(f"{name:48s} -> {name_new:40s} | {lazy_tensor.data_type.name:6s} | {lazy_tensor.shape}")
out[name_new] = lazy_tensor
return out
@ -1372,7 +1378,7 @@ def load_some_model(path: Path) -> ModelPlus:
paths = find_multifile_paths(path)
models_plus: list[ModelPlus] = []
for path in paths:
print(f"Loading model file {path}")
logger.info(f"Loading model file {path}")
models_plus.append(lazy_load_file(path))
model_plus = merge_multifile_models(models_plus)
@ -1413,7 +1419,7 @@ class VocabFactory:
else:
raise FileNotFoundError(f"Could not find a tokenizer matching any of {vocab_types}")
print(f"Loaded vocab file {vocab.fname_tokenizer!r}, type {vocab.name!r}")
logger.info(f"Loaded vocab file {vocab.fname_tokenizer!r}, type {vocab.name!r}")
return vocab
def load_vocab(self, vocab_types: list[str] | None, model_parent_path: Path) -> tuple[BaseVocab, gguf.SpecialVocab]:
@ -1438,19 +1444,19 @@ def default_outfile(model_paths: list[Path], file_type: GGMLFileType) -> Path:
}[file_type]
ret = model_paths[0].parent / f"ggml-model-{namestr}.gguf"
if ret in model_paths:
sys.stderr.write(
logger.error(
f"Error: Default output path ({ret}) would overwrite the input. "
"Please explicitly specify a path using --outfile.\n")
"Please explicitly specify a path using --outfile.")
sys.exit(1)
return ret
def do_dump_model(model_plus: ModelPlus) -> None:
print(f"model_plus.paths = {model_plus.paths!r}")
print(f"model_plus.format = {model_plus.format!r}")
print(f"model_plus.vocab = {model_plus.vocab!r}")
print(f"model_plus.paths = {model_plus.paths!r}") # noqa: NP100
print(f"model_plus.format = {model_plus.format!r}") # noqa: NP100
print(f"model_plus.vocab = {model_plus.vocab!r}") # noqa: NP100
for name, lazy_tensor in model_plus.model.items():
print(f"{name}: shape={lazy_tensor.shape} type={lazy_tensor.data_type}; {lazy_tensor.description}")
print(f"{name}: shape={lazy_tensor.shape} type={lazy_tensor.data_type}; {lazy_tensor.description}") # noqa: NP100
def main(args_in: list[str] | None = None) -> None:
@ -1473,8 +1479,18 @@ def main(args_in: list[str] | None = None) -> None:
parser.add_argument("--big-endian", action="store_true", help="model is executed on big endian machine")
parser.add_argument("--pad-vocab", action="store_true", help="add pad tokens when model vocab expects more than tokenizer metadata provides")
parser.add_argument("--skip-unknown", action="store_true", help="skip unknown tensor names instead of failing")
parser.add_argument("--verbose", action="store_true", help="increase output verbosity")
args = parser.parse_args(args_in)
if args.verbose:
logging.basicConfig(level=logging.DEBUG)
elif args.dump_single or args.dump:
# Avoid printing anything besides the dump output
logging.basicConfig(level=logging.WARNING)
else:
logging.basicConfig(level=logging.INFO)
if args.no_vocab and args.vocab_only:
raise ValueError("--vocab-only does not make sense with --no-vocab")
@ -1491,10 +1507,13 @@ def main(args_in: list[str] | None = None) -> None:
if args.dump:
do_dump_model(model_plus)
return
endianess = gguf.GGUFEndian.LITTLE
if args.big_endian:
endianess = gguf.GGUFEndian.BIG
params = None
if args.pad_vocab or not args.vocab_only:
params = Params.load(model_plus)
if params.n_ctx == -1:
if args.ctx is None:
@ -1513,7 +1532,7 @@ def main(args_in: list[str] | None = None) -> None:
"q8_0": GGMLFileType.MostlyQ8_0,
}[args.outtype]
print(f"params = {params}")
logger.info(f"params = {params}")
model_parent_path = model_plus.paths[0].parent
vocab_path = Path(args.vocab_dir or args.model or model_parent_path)
@ -1526,17 +1545,27 @@ def main(args_in: list[str] | None = None) -> None:
if not args.outfile:
raise ValueError("need --outfile if using --vocab-only")
outfile = args.outfile
if params is None:
params = Params(
n_vocab = vocab.vocab_size,
n_embd = 1,
n_layer = 1,
n_ctx = 1,
n_ff = 1,
n_head = 1,
n_head_kv = 1,
f_norm_eps = 1e-5,
)
OutputFile.write_vocab_only(outfile, params, vocab, special_vocab,
endianess=endianess, pad_vocab=args.pad_vocab)
print(f"Wrote {outfile}")
logger.info(f"Wrote {outfile}")
return
if model_plus.vocab is not None and args.vocab_dir is None and not args.no_vocab:
vocab = model_plus.vocab
print(f"Vocab info: {vocab}")
print(f"Special vocab info: {special_vocab}")
logger.info(f"Vocab info: {vocab}")
logger.info(f"Special vocab info: {special_vocab}")
model = model_plus.model
model = convert_model_names(model, params, args.skip_unknown)
ftype = pick_output_type(model, args.outtype)
@ -1544,11 +1573,11 @@ def main(args_in: list[str] | None = None) -> None:
outfile = args.outfile or default_outfile(model_plus.paths, ftype)
params.ftype = ftype
print(f"Writing {outfile}, format {ftype}")
logger.info(f"Writing {outfile}, format {ftype}")
OutputFile.write_all(outfile, ftype, params, model, vocab, special_vocab,
concurrency=args.concurrency, endianess=endianess, pad_vocab=args.pad_vocab)
print(f"Wrote {outfile}")
logger.info(f"Wrote {outfile}")
if __name__ == '__main__':

View file

@ -23,7 +23,7 @@ Install BLIS:
sudo make install
```
We recommend using openmp since it's easier to modify the cores been used.
We recommend using openmp since it's easier to modify the cores being used.
### llama.cpp compilation

View file

@ -96,9 +96,9 @@ NOTE: The dimensions in `ggml` are typically in the reverse order of the `pytorc
This is the funniest part, you have to provide the inference graph implementation of the new model architecture in `llama_build_graph`.
Have a look to existing implementation like `build_llama`, `build_dbrx` or `build_bert`.
Have a look at existing implementation like `build_llama`, `build_dbrx` or `build_bert`.
When implementing a new graph, please note that the underlying `ggml` backends might not support them all, support of missing backend operations can be added in another PR.
When implementing a new graph, please note that the underlying `ggml` backends might not support them all, support for missing backend operations can be added in another PR.
Note: to debug the inference graph: you can use [eval-callback](../examples/eval-callback).

View file

@ -2,7 +2,7 @@
This example reads weights from project [llama2.c](https://github.com/karpathy/llama2.c) and saves them in ggml compatible format. The vocab that is available in `models/ggml-vocab.bin` is used by default.
To convert the model first download the models from the [llma2.c](https://github.com/karpathy/llama2.c) repository:
To convert the model first download the models from the [llama2.c](https://github.com/karpathy/llama2.c) repository:
`$ make -j`

View file

@ -575,7 +575,7 @@ static struct ggml_tensor * llama_build_lora_finetune_graphs(
GGML_ASSERT(tokens_input->type == GGML_TYPE_I32);
auto add_to_f32 = [] (struct ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * b) {
if (ggml_is_quantized(a->type) || a->type == GGML_TYPE_F16) {
if (ggml_is_quantized(a->type) || a->type == GGML_TYPE_F16 || a->type == GGML_TYPE_BF16) {
return ggml_add_cast(ctx, a, b, GGML_TYPE_F32);
} else if (a->type == GGML_TYPE_F32) {
return ggml_add(ctx, a, b);

View file

@ -32,6 +32,7 @@ struct split_params {
int n_split_tensors = 128;
std::string input;
std::string output;
bool no_tensor_first_split = false;
bool dry_run = false;
};
@ -49,6 +50,7 @@ static void split_print_usage(const char * executable) {
printf(" --merge merge multiple GGUF to a single GGUF\n");
printf(" --split-max-tensors max tensors in each split (default: %d)\n", default_params.n_split_tensors);
printf(" --split-max-size N(M|G) max size per split\n");
printf(" --no-tensor-first-split do not add tensors to the first split (disabled by default)\n");
printf(" --dry-run only print out a split plan and exit, without writing any new files\n");
printf("\n");
}
@ -100,6 +102,10 @@ static void split_params_parse_ex(int argc, const char ** argv, split_params & p
arg_found = true;
params.dry_run = true;
}
if (arg == "--no-tensor-first-split") {
arg_found = true;
params.no_tensor_first_split = true;
}
if (is_op_set) {
throw std::invalid_argument("error: either --split or --merge can be specified, but not both");
@ -200,10 +206,10 @@ struct split_strategy {
// because we need to know list of tensors for each file in advance, we will build all the ctx_out for all output splits
int i_split = -1;
struct gguf_context * ctx_out = NULL;
auto new_ctx_out = [&]() {
auto new_ctx_out = [&](bool allow_no_tensors) {
i_split++;
if (ctx_out != NULL) {
if (gguf_get_n_tensors(ctx_out) == 0) {
if (gguf_get_n_tensors(ctx_out) == 0 && !allow_no_tensors) {
fprintf(stderr, "error: one of splits have 0 tensors. Maybe size or tensors limit is too small\n");
exit(EXIT_FAILURE);
}
@ -220,7 +226,12 @@ struct split_strategy {
};
// initialize ctx_out for the first split
new_ctx_out();
new_ctx_out(false);
// skip first split if no_tensor_first_split is set
if (params.no_tensor_first_split) {
new_ctx_out(true);
}
// process tensors one by one
size_t curr_tensors_size = 0; // current size by counting only tensors size (without metadata)
@ -230,7 +241,7 @@ struct split_strategy {
size_t n_bytes = GGML_PAD(ggml_nbytes(t), GGUF_DEFAULT_ALIGNMENT);
size_t next_tensors_size = curr_tensors_size + n_bytes;
if (should_split(i, next_tensors_size)) {
new_ctx_out();
new_ctx_out(false);
curr_tensors_size = n_bytes;
} else {
curr_tensors_size = next_tensors_size;

View file

@ -55,15 +55,15 @@ $MAIN --model $WORK_PATH/ggml-model-merge.gguf --random-prompt --n-predict 32
echo PASS
echo
# 4. Split with no tensor in metadata
#$SPLIT --split-max-tensors 32 --no-tensor-in-metadata $WORK_PATH/ggml-model-merge.gguf $WORK_PATH/ggml-model-split-32-tensors
#echo PASS
#echo
# 4. Split with no tensors in the first split
$SPLIT --split-max-tensors 32 --no-tensor-first-split $WORK_PATH/ggml-model-merge.gguf $WORK_PATH/ggml-model-split-32-tensors
echo PASS
echo
# 4b. Test the sharded model is loading properly
#$MAIN --model $WORK_PATH/ggml-model-split-32-tensors-00001-of-00006.gguf --random-prompt --n-predict 32
#echo PASS
#echo
$MAIN --model $WORK_PATH/ggml-model-split-32-tensors-00001-of-00007.gguf --random-prompt --n-predict 32
echo PASS
echo
# 5. Merge
#$SPLIT --merge $WORK_PATH/ggml-model-split-32-tensors-00001-of-00006.gguf $WORK_PATH/ggml-model-merge-2.gguf

View file

@ -19,6 +19,7 @@
struct Stats {
std::vector<float> values;
std::vector<int> counts;
int ncall = 0;
};
@ -121,12 +122,10 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void *
auto & e = m_stats[wname];
++e.ncall;
// NOTE: since we select top-k experts, the number of calls for the expert tensors will be k times larger
// using the following line, we can correct for that if needed by replacing the line above with:
//if (idx == t->src[0]->ne[0] - 1) ++e.ncall;
if (e.values.empty()) {
e.values.resize(src1->ne[0]*n_as, 0);
e.counts.resize(src1->ne[0]*n_as, 0);
}
else if (e.values.size() != (size_t)src1->ne[0]*n_as) {
fprintf(stderr, "Oops: inconsistent size for %s (%d vs %d)\n", wname.c_str(), (int)e.values.size(), (int)src1->ne[0]*n_as);
@ -153,6 +152,7 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void *
for (int j = 0; j < (int)src1->ne[0]; ++j) {
e.values[e_start + j] += x[j]*x[j];
e.counts[e_start + j]++;
}
}
}
@ -170,6 +170,7 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void *
auto& e = m_stats[wname];
if (e.values.empty()) {
e.values.resize(src1->ne[0], 0);
e.counts.resize(src1->ne[0], 0);
}
else if (e.values.size() != (size_t)src1->ne[0]) {
fprintf(stderr, "Oops: inconsistent size for %s (%d vs %d)\n", wname.c_str(), (int)e.values.size(), (int)src1->ne[0]);
@ -183,6 +184,7 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void *
const float * x = data + row * src1->ne[0];
for (int j = 0; j < (int)src1->ne[0]; ++j) {
e.values[j] += x[j]*x[j];
e.counts[j]++;
}
}
if (e.ncall > m_last_call) {
@ -222,7 +224,13 @@ void IMatrixCollector::save_imatrix(const char * fname, const char * dataset) co
out.write((const char *) &p.second.ncall, sizeof(p.second.ncall));
int nval = p.second.values.size();
out.write((const char *) &nval, sizeof(nval));
if (nval > 0) out.write((const char *) p.second.values.data(), nval * sizeof(float));
if (nval > 0) {
std::vector<float> tmp(nval);
for (int i = 0; i < nval; i++) {
tmp[i] = (p.second.values[i] / static_cast<float>(p.second.counts[i])) * static_cast<float>(p.second.ncall);
}
out.write((const char*)tmp.data(), nval*sizeof(float));
}
}
// Write the number of call the matrix was computed with
@ -270,14 +278,28 @@ bool IMatrixCollector::load_imatrix(const char * imatrix_file, std::unordered_ma
imatrix_data = {};
return false;
}
e.values.resize(nval);
in.read((char*)e.values.data(), nval*sizeof(float));
// When re-called from load_imatrix() with add set, this will already be created.
if (e.values.empty()) {
e.values.resize(nval, 0);
e.counts.resize(nval, 0);
}
std::vector<float> tmp(nval);
in.read((char*)tmp.data(), nval*sizeof(float));
if (in.fail()) {
printf("%s: failed reading data for entry %d\n",__func__,i);
imatrix_data = {};
return false;
}
e.ncall = ncall;
// Recreate the state as expected by save_imatrix(), and corerct for weighted sum.
for (int i = 0; i < nval; i++) {
e.values[i] += tmp[i];
e.counts[i] += ncall;
}
e.ncall += ncall;
}
return true;
}

View file

@ -178,6 +178,7 @@ struct cmd_params {
std::vector<std::vector<float>> tensor_split;
std::vector<bool> use_mmap;
std::vector<bool> embeddings;
ggml_numa_strategy numa;
int reps;
bool verbose;
output_formats output_format;
@ -200,6 +201,7 @@ static const cmd_params cmd_params_defaults = {
/* tensor_split */ {std::vector<float>(llama_max_devices(), 0.0f)},
/* use_mmap */ {true},
/* embeddings */ {false},
/* numa */ GGML_NUMA_STRATEGY_DISABLED,
/* reps */ 5,
/* verbose */ false,
/* output_format */ MARKDOWN
@ -224,6 +226,7 @@ static void print_usage(int /* argc */, char ** argv) {
printf(" -nkvo, --no-kv-offload <0|1> (default: %s)\n", join(cmd_params_defaults.no_kv_offload, ",").c_str());
printf(" -fa, --flash-attn <0|1> (default: %s)\n", join(cmd_params_defaults.flash_attn, ",").c_str());
printf(" -mmp, --mmap <0|1> (default: %s)\n", join(cmd_params_defaults.use_mmap, ",").c_str());
printf(" --numa <distribute|isolate|numactl> (default: disabled)\n");
printf(" -embd, --embeddings <0|1> (default: %s)\n", join(cmd_params_defaults.embeddings, ",").c_str());
printf(" -ts, --tensor-split <ts0/ts1/..> (default: 0)\n");
printf(" -r, --repetitions <n> (default: %d)\n", cmd_params_defaults.reps);
@ -396,6 +399,17 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
}
auto p = split<bool>(argv[i], split_delim);
params.no_kv_offload.insert(params.no_kv_offload.end(), p.begin(), p.end());
} else if (arg == "--numa") {
if (++i >= argc) {
invalid_param = true;
break;
} else {
std::string value(argv[i]);
/**/ if (value == "distribute" || value == "" ) { params.numa = GGML_NUMA_STRATEGY_DISTRIBUTE; }
else if (value == "isolate") { params.numa = GGML_NUMA_STRATEGY_ISOLATE; }
else if (value == "numactl") { params.numa = GGML_NUMA_STRATEGY_NUMACTL; }
else { invalid_param = true; break; }
}
} else if (arg == "-fa" || arg == "--flash-attn") {
if (++i >= argc) {
invalid_param = true;
@ -1215,6 +1229,7 @@ int main(int argc, char ** argv) {
llama_log_set(llama_null_log_callback, NULL);
}
llama_backend_init();
llama_numa_init(params.numa);
// initialize printer
std::unique_ptr<printer> p;

View file

@ -56,7 +56,7 @@ python ./examples/llava/convert-image-encoder-to-gguf.py -m ../clip-vit-large-pa
python ./convert.py ../llava-v1.5-7b --skip-unknown
```
Now both the LLaMA part and the image encoder is in the `llava-v1.5-7b` directory.
Now both the LLaMA part and the image encoder are in the `llava-v1.5-7b` directory.
## LLaVA 1.6 gguf conversion
1) First clone a LLaVA 1.6 model:

View file

@ -104,7 +104,6 @@ static std::string format(const char * fmt, ...) {
#define TN_POS_EMBD "%s.position_embd.weight"
#define TN_CLASS_EMBD "v.class_embd"
#define TN_PATCH_EMBD "v.patch_embd.weight"
#define TN_PATCH_BIAS "v.patch_embd.bias"
#define TN_ATTN_K "%s.blk.%d.attn_k.%s"
#define TN_ATTN_Q "%s.blk.%d.attn_q.%s"
#define TN_ATTN_V "%s.blk.%d.attn_v.%s"
@ -426,7 +425,6 @@ struct clip_vision_model {
// embeddings
struct ggml_tensor * class_embedding;
struct ggml_tensor * patch_embeddings;
struct ggml_tensor * patch_bias;
struct ggml_tensor * position_embeddings;
struct ggml_tensor * pre_ln_w;
@ -503,11 +501,6 @@ struct clip_ctx {
bool use_gelu = false;
int32_t ftype = 1;
bool has_class_embedding = true;
bool has_pre_norm = true;
bool has_post_norm = false;
bool has_patch_bias = false;
struct gguf_context * ctx_gguf;
struct ggml_context * ctx_data;
@ -533,7 +526,7 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
const int patch_size = hparams.patch_size;
const int num_patches = ((image_size / patch_size) * (image_size / patch_size));
const int num_patches_per_side = image_size / patch_size; GGML_UNUSED(num_patches_per_side);
const int num_positions = num_patches + (ctx->has_class_embedding ? 1 : 0);
const int num_positions = num_patches + 1;
const int hidden_size = hparams.hidden_size;
const int n_head = hparams.n_head;
const int d_head = hidden_size / n_head;
@ -564,23 +557,16 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
inp = ggml_reshape_3d(ctx0, inp, num_patches, hidden_size, batch_size);
inp = ggml_cont(ctx0, ggml_permute(ctx0, inp, 1, 0, 2, 3));
if (ctx->has_patch_bias) {
// inp = ggml_add(ctx0, inp, ggml_repeat(ctx0, model.patch_bias, inp));
inp = ggml_add(ctx0, inp, model.patch_bias);
}
// concat class_embeddings and patch_embeddings
struct ggml_tensor * embeddings = inp;
if (ctx->has_class_embedding) {
embeddings = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, hidden_size, num_positions, batch_size);
embeddings = ggml_acc(ctx0, embeddings, model.class_embedding,
embeddings->nb[1], embeddings->nb[2], embeddings->nb[3], 0);
embeddings = ggml_acc(ctx0, embeddings, inp,
embeddings->nb[1], embeddings->nb[2], embeddings->nb[3], model.class_embedding->nb[1]);
}
struct ggml_tensor * embeddings = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, hidden_size, num_positions, batch_size);
ggml_set_name(embeddings, "embeddings");
ggml_set_input(embeddings);
embeddings = ggml_acc(ctx0, embeddings, model.class_embedding,
embeddings->nb[1], embeddings->nb[2], embeddings->nb[3], 0);
embeddings = ggml_acc(ctx0, embeddings, inp,
embeddings->nb[1], embeddings->nb[2], embeddings->nb[3], model.class_embedding->nb[1]);
struct ggml_tensor * positions = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, num_positions);
ggml_set_name(positions, "positions");
@ -590,7 +576,7 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
ggml_add(ctx0, embeddings, ggml_get_rows(ctx0, model.position_embeddings, positions));
// pre-layernorm
if (ctx->has_pre_norm) {
{
embeddings = ggml_norm(ctx0, embeddings, eps);
ggml_set_name(embeddings, "pre_ln");
@ -678,14 +664,6 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
embeddings = cur;
}
// post-layernorm
if (ctx->has_post_norm) {
embeddings = ggml_norm(ctx0, embeddings, eps);
ggml_set_name(embeddings, "post_ln");
embeddings = ggml_add(ctx0, ggml_mul(ctx0, embeddings, model.post_ln_w), model.post_ln_b);
}
// llava projector
{
embeddings = ggml_reshape_2d(ctx0, embeddings, embeddings->ne[0], embeddings->ne[1]);
@ -1171,38 +1149,11 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
}
try {
vision_model.patch_embeddings = get_tensor(new_clip->ctx_data, TN_PATCH_EMBD);
vision_model.class_embedding = get_tensor(new_clip->ctx_data, TN_CLASS_EMBD);
new_clip->has_class_embedding = true;
} catch (const std::exception& e) {
new_clip->has_class_embedding = false;
}
try {
vision_model.position_embeddings = get_tensor(new_clip->ctx_data, format(TN_POS_EMBD, "v"));
vision_model.pre_ln_w = get_tensor(new_clip->ctx_data, format(TN_LN_PRE, "v", "weight"));
vision_model.pre_ln_b = get_tensor(new_clip->ctx_data, format(TN_LN_PRE, "v", "bias"));
new_clip->has_pre_norm = true;
} catch (std::exception & e) {
new_clip->has_pre_norm = false;
}
try {
vision_model.post_ln_w = get_tensor(new_clip->ctx_data, format(TN_LN_POST, "v", "weight"));
vision_model.post_ln_b = get_tensor(new_clip->ctx_data, format(TN_LN_POST, "v", "bias"));
new_clip->has_post_norm = true;
} catch (std::exception & e) {
new_clip->has_post_norm = false;
}
try {
vision_model.patch_bias = get_tensor(new_clip->ctx_data, TN_PATCH_BIAS);
new_clip->has_patch_bias = true;
} catch (std::exception & e) {
new_clip->has_patch_bias = false;
}
try {
vision_model.patch_embeddings = get_tensor(new_clip->ctx_data, TN_PATCH_EMBD);
vision_model.position_embeddings = get_tensor(new_clip->ctx_data, format(TN_POS_EMBD, "v"));
} catch(const std::exception& e) {
LOG_TEE("%s: failed to load vision model tensors\n", __func__);
}

View file

@ -143,7 +143,7 @@ The `--ctx-size` option allows you to set the size of the prompt context used by
### Extended Context Size
Some fine-tuned models have extended the context length by scaling RoPE. For example, if the original pre-trained model have a context length (max sequence length) of 4096 (4k) and the fine-tuned model have 32k. That is a scaling factor of 8, and should work by setting the above `--ctx-size` to 32768 (32k) and `--rope-scale` to 8.
Some fine-tuned models have extended the context length by scaling RoPE. For example, if the original pre-trained model has a context length (max sequence length) of 4096 (4k) and the fine-tuned model has 32k. That is a scaling factor of 8, and should work by setting the above `--ctx-size` to 32768 (32k) and `--rope-scale` to 8.
- `--rope-scale N`: Where N is the linear scaling factor used by the fine-tuned model.
@ -286,7 +286,7 @@ These options help improve the performance and memory usage of the LLaMA models.
- `--numa distribute`: Pin an equal proportion of the threads to the cores on each NUMA node. This will spread the load amongst all cores on the system, utilitizing all memory channels at the expense of potentially requiring memory to travel over the slow links between nodes.
- `--numa isolate`: Pin all threads to the NUMA node that the program starts on. This limits the number of cores and amount of memory that can be used, but guarantees all memory access remains local to the NUMA node.
- `--numa numactl`: Pin threads to the CPUMAP that is passed to the program by starting it with the numactl utility. This is the most flexible mode, and allow arbitraty core usage patterns, for example a map that uses all the cores on one NUMA nodes, and just enough cores on a second node to saturate the inter-node memory bus.
- `--numa numactl`: Pin threads to the CPUMAP that is passed to the program by starting it with the numactl utility. This is the most flexible mode, and allow arbitrary core usage patterns, for example a map that uses all the cores on one NUMA nodes, and just enough cores on a second node to saturate the inter-node memory bus.
These flags attempt optimizations that help on some systems with non-uniform memory access. This currently consists of one of the above strategies, and disabling prefetch and readahead for mmap. The latter causes mapped pages to be faulted in on first access instead of all at once, and in combination with pinning threads to NUMA nodes, more of the pages end up on the NUMA node where they are used. Note that if the model is already in the system page cache, for example because of a previous run without this option, this will have little effect unless you drop the page cache first. This can be done by rebooting the system or on Linux by writing '3' to '/proc/sys/vm/drop_caches' as root.

View file

@ -362,6 +362,9 @@ int main(int argc, char ** argv) {
params.interactive_first = true;
params.antiprompt.emplace_back("<|im_start|>user\n");
}
else if (params.conversation) {
params.interactive_first = true;
}
// enable interactive mode if interactive start is specified
if (params.interactive_first) {
@ -544,7 +547,7 @@ int main(int argc, char ** argv) {
// if we run out of context:
// - take the n_keep first tokens from the original prompt (via n_past)
// - take half of the last (n_ctx - n_keep) tokens and recompute the logits in batches
if (n_past + (int) embd.size() + std::max<int>(0, guidance_offset) > n_ctx) {
if (n_past + (int) embd.size() + std::max<int>(0, guidance_offset) >= n_ctx) {
if (params.n_predict == -2) {
LOG_TEE("\n\n%s: context full and n_predict == -%d => stopping\n", __func__, params.n_predict);
break;
@ -733,7 +736,7 @@ int main(int argc, char ** argv) {
// display text
if (input_echo && display) {
for (auto id : embd) {
const std::string token_str = llama_token_to_piece(ctx, id);
const std::string token_str = llama_token_to_piece(ctx, id, !params.conversation);
printf("%s", token_str.c_str());
if (embd.size() > 1) {
@ -796,7 +799,7 @@ int main(int argc, char ** argv) {
// deal with end of generation tokens in interactive mode
if (llama_token_is_eog(model, llama_sampling_last(ctx_sampling))) {
LOG("found EOS token\n");
LOG("found an EOG token\n");
if (params.interactive) {
if (!params.antiprompt.empty()) {
@ -816,7 +819,7 @@ int main(int argc, char ** argv) {
if (n_past > 0 && is_interacting) {
LOG("waiting for user input\n");
if (params.instruct || params.chatml) {
if (params.conversation || params.instruct || params.chatml) {
printf("\n> ");
}
@ -826,7 +829,7 @@ int main(int argc, char ** argv) {
}
std::string buffer;
if (!params.input_prefix.empty()) {
if (!params.input_prefix.empty() && !params.conversation) {
LOG("appending input prefix: '%s'\n", params.input_prefix.c_str());
printf("%s", params.input_prefix.c_str());
}
@ -850,7 +853,7 @@ int main(int argc, char ** argv) {
// Entering a empty line lets the user pass control back
if (buffer.length() > 1) {
// append input suffix if any
if (!params.input_suffix.empty()) {
if (!params.input_suffix.empty() && !params.conversation) {
LOG("appending input suffix: '%s'\n", params.input_suffix.c_str());
printf("%s", params.input_suffix.c_str());
}

View file

@ -46,7 +46,8 @@ static const std::vector<struct quant_option> QUANT_OPTIONS = {
{ "Q5_K_M", LLAMA_FTYPE_MOSTLY_Q5_K_M, " 4.45G, +0.0122 ppl @ LLaMA-v1-7B", },
{ "Q6_K", LLAMA_FTYPE_MOSTLY_Q6_K, " 5.15G, +0.0008 ppl @ LLaMA-v1-7B", },
{ "Q8_0", LLAMA_FTYPE_MOSTLY_Q8_0, " 6.70G, +0.0004 ppl @ LLaMA-v1-7B", },
{ "F16", LLAMA_FTYPE_MOSTLY_F16, "13.00G @ 7B", },
{ "F16", LLAMA_FTYPE_MOSTLY_F16, "14.00G, -0.0020 ppl @ Mistral-7B", },
{ "BF16", LLAMA_FTYPE_MOSTLY_BF16, "14.00G, -0.0050 ppl @ Mistral-7B", },
{ "F32", LLAMA_FTYPE_ALL_F32, "26.00G @ 7B", },
// Note: Ensure COPY comes after F32 to avoid ftype 0 from matching.
{ "COPY", LLAMA_FTYPE_ALL_F32, "only copy tensors, no quantizing", },

View file

@ -62,6 +62,18 @@ page cache before using this. See https://github.com/ggerganov/llama.cpp/issues/
- `--chat-template JINJA_TEMPLATE`: Set custom jinja chat template. This parameter accepts a string, not a file name. Default: template taken from model's metadata. We only support [some pre-defined templates](https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template)
- `--log-disable`: Output logs to stdout only, not to `llama.log`. Default: enabled
- `--log-format FORMAT`: Define the log output to FORMAT: json or text Default: `json`
- `--rope-scaling` : RoPE scaling method. Defaults to linear unless otherwise specified by the model. Options are `none`, `linear`, `yarn`
- `--rope-freq-base N` : RoPE frequency base (default: loaded from model)
- `--rope-freq-scale N`: RoPE frequency scaling factor, expands context by a factor of 1/N (e.g. 0.25)
- `--yarn-ext-factor N` : YaRN: extrapolation mix factor (Default: 1.0, 0.0 = full interpolation)
- `--yarn-attn-factor N` : YaRN: scale sqrt(t) or attention magnitude (default: 1.0)
- `--yarn-beta-slow N`: YaRN: High correction dim or alpha (default: 1.0)
- `--yarn-beta-fast N`: YaRN: low correction dim or beta (default: 32.0)
- `--pooling` : Pooling type for embeddings, use model default if unspecified. Options are `none`, `mean`, `cls`
- `-dt N`, `--defrag-thold N`: KV cache defragmentation threshold (default: -1.0, < 0 = disabled)
- `-fa`, `--flash-attn` : enable flash attention (default: disabled).
- `-ctk TYPE`, `--cache-type-k TYPE` : KV cache data type for K (default: `f16`, options `f32`, `f16`, `q8_0`, `q4_0`, `q4_1`, `iq4_nl`, `q5_0`, or `q5_1`)
- `-ctv TYPE`, `--cache-type-v TYPE` : KV cache type for V (default `f16`, see `-ctk` for options)
**If compiled with `LLAMA_SERVER_SSL=ON`**
- `--ssl-key-file FNAME`: path to file a PEM-encoded SSL private key
@ -260,7 +272,7 @@ node index.js
`logit_bias`: Modify the likelihood of a token appearing in the generated text completion. For example, use `"logit_bias": [[15043,1.0]]` to increase the likelihood of the token 'Hello', or `"logit_bias": [[15043,-1.0]]` to decrease its likelihood. Setting the value to false, `"logit_bias": [[15043,false]]` ensures that the token `Hello` is never produced. The tokens can also be represented as strings, e.g. `[["Hello, World!",-0.5]]` will reduce the likelihood of all the individual tokens that represent the string `Hello, World!`, just like the `presence_penalty` does. Default: `[]`
`n_probs`: If greater than 0, the response also contains the probabilities of top N tokens for each generated token. Default: `0`
`n_probs`: If greater than 0, the response also contains the probabilities of top N tokens for each generated token given the sampling settings. Note that for temperature < 0 the tokens are sampled greedily but token probabilities are still being calculated via a simple softmax of the logits without considering any other sampler settings. Default: `0`
`min_keep`: If greater than 0, force samplers to return N possible tokens at minimum. Default: `0`
@ -319,7 +331,7 @@ Notice that each `probs` is an array of length `n_probs`.
`content`: Set the text to tokenize.
Note that a special `BOS` token is never inserted.
`add_special`: Boolean indicating if special tokens, i.e. `BOS`, should be inserted. Default: `false`
- **POST** `/detokenize`: Convert tokens to text.

Binary file not shown.

After

Width:  |  Height:  |  Size: 4 KiB

View file

@ -12,6 +12,8 @@
// increase max payload length to allow use of larger context size
#define CPPHTTPLIB_FORM_URL_ENCODED_PAYLOAD_MAX_LENGTH 1048576
#include "httplib.h"
// Change JSON_ASSERT from assert() to GGML_ASSERT:
#define JSON_ASSERT GGML_ASSERT
#include "json.hpp"
// auto generated files (update with ./deps.sh)
@ -859,7 +861,7 @@ struct server_context {
slot.sparams.min_keep = json_value(data, "min_keep", default_sparams.min_keep);
// process "json_schema" and "grammar"
if (data.contains("json_schema") && !data["json_schema"].is_null() && data.contains("grammar") && !data["grammar"].is_null()) {
if (data.contains("json_schema") && !data.at("json_schema").is_null() && data.contains("grammar") && !data.at("grammar").is_null()) {
send_error(task, "Either \"json_schema\" or \"grammar\" can be specified, but not both", ERROR_TYPE_INVALID_REQUEST);
return false;
} else if (data.contains("json_schema") && !data.contains("grammar")) {
@ -1383,9 +1385,10 @@ struct server_context {
if (!slot.params.stream && slot.stopped_word) {
const std::vector<llama_token> stop_word_toks = llama_tokenize(ctx, slot.stopping_word, false);
size_t safe_offset = std::min(slot.generated_token_probs.size(), stop_word_toks.size());
probs = std::vector<completion_token_output>(
slot.generated_token_probs.begin(),
slot.generated_token_probs.end() - stop_word_toks.size());
slot.generated_token_probs.end() - safe_offset);
} else {
probs = std::vector<completion_token_output>(
slot.generated_token_probs.begin(),
@ -1511,7 +1514,7 @@ struct server_context {
// add subtasks
for (int i = 0; i < prompt_count; i++) {
json subtask_data = multiprompt_task.data;
subtask_data["prompt"] = subtask_data["prompt"][i];
subtask_data["prompt"] = subtask_data.at("prompt")[i];
// subtasks inherit everything else (infill mode, embedding mode, etc.)
request_completion(subtask_ids[i], id_multi, subtask_data, multiprompt_task.infill, multiprompt_task.embedding);
@ -1531,7 +1534,7 @@ struct server_context {
}
if (task.data.contains("system_prompt")) {
system_prompt_set(task.data["system_prompt"]);
system_prompt_set(task.data.at("system_prompt"));
for (server_slot & slot : slots) {
slot.n_past = 0;
@ -1643,7 +1646,7 @@ struct server_context {
} break;
case SERVER_TASK_TYPE_SLOT_SAVE:
{
int id_slot = task.data["id_slot"];
int id_slot = task.data.at("id_slot");
server_slot * slot = get_slot(id_slot);
if (slot == nullptr) {
send_error(task, "Invalid slot ID", ERROR_TYPE_INVALID_REQUEST);
@ -1653,8 +1656,8 @@ struct server_context {
const size_t token_count = slot->cache_tokens.size();
const int64_t t_start = ggml_time_us();
std::string filename = task.data["filename"];
std::string filepath = task.data["filepath"];
std::string filename = task.data.at("filename");
std::string filepath = task.data.at("filepath");
const size_t nwrite = llama_state_seq_save_file(ctx, filepath.c_str(), slot->id + 1, slot->cache_tokens.data(), token_count);
@ -1678,7 +1681,7 @@ struct server_context {
} break;
case SERVER_TASK_TYPE_SLOT_RESTORE:
{
int id_slot = task.data["id_slot"];
int id_slot = task.data.at("id_slot");
server_slot * slot = get_slot(id_slot);
if (slot == nullptr) {
send_error(task, "Invalid slot ID", ERROR_TYPE_INVALID_REQUEST);
@ -1687,8 +1690,8 @@ struct server_context {
const int64_t t_start = ggml_time_us();
std::string filename = task.data["filename"];
std::string filepath = task.data["filepath"];
std::string filename = task.data.at("filename");
std::string filepath = task.data.at("filepath");
slot->cache_tokens.resize(slot->n_ctx);
size_t token_count = 0;
@ -1720,7 +1723,7 @@ struct server_context {
} break;
case SERVER_TASK_TYPE_SLOT_ERASE:
{
int id_slot = task.data["id_slot"];
int id_slot = task.data.at("id_slot");
server_slot * slot = get_slot(id_slot);
if (slot == nullptr) {
send_error(task, "Invalid slot ID", ERROR_TYPE_INVALID_REQUEST);
@ -2265,18 +2268,32 @@ struct server_context {
llama_token_data_array cur_p = { slot.ctx_sampling->cur.data(), slot.ctx_sampling->cur.size(), false };
result.tok = id;
const int32_t n_probs = slot.sparams.n_probs;
if (slot.sparams.temp <= 0 && n_probs > 0) {
// for llama_sample_token_greedy we need to sort candidates
llama_sample_softmax(ctx, &cur_p);
const size_t n_probs = std::min(cur_p.size, (size_t) slot.sparams.n_probs);
if (n_probs > 0) {
const size_t n_considered = slot.ctx_sampling->n_considered;
// Make sure at least n_probs top tokens are at the front of the vector:
if (slot.sparams.temp == 0.0f && n_probs > n_considered) {
llama_sample_top_k(ctx, &cur_p, n_probs, 0);
}
for (size_t i = 0; i < std::min(cur_p.size, (size_t) n_probs); ++i) {
if (slot.sparams.temp == 0.0f) {
// With greedy sampling the probabilities have possibly not been calculated.
for (size_t i = 0; i < n_probs; ++i) {
result.probs.push_back({
cur_p.data[i].id,
cur_p.data[i].p
i == 0 ? 1.0f : 0.0f
});
}
} else {
for (size_t i = 0; i < n_probs; ++i) {
result.probs.push_back({
cur_p.data[i].id,
i >= n_considered ? 0.0f : cur_p.data[i].p // Tokens filtered out due to e.g. top_k have 0 probability.
});
}
}
}
if (!process_token(result, slot)) {
slot.release();
@ -3121,8 +3138,8 @@ int main(int argc, char ** argv) {
server_task_result result = ctx_server.queue_results.recv(task.id);
ctx_server.queue_results.remove_waiting_task_id(task.id);
const int n_idle_slots = result.data["idle"];
const int n_processing_slots = result.data["processing"];
const int n_idle_slots = result.data.at("idle");
const int n_processing_slots = result.data.at("processing");
json health = {
{"status", "ok"},
@ -3132,7 +3149,7 @@ int main(int argc, char ** argv) {
res.status = 200; // HTTP OK
if (sparams.slots_endpoint && req.has_param("include_slots")) {
health["slots"] = result.data["slots"];
health["slots"] = result.data.at("slots");
}
if (n_idle_slots == 0) {
@ -3176,7 +3193,7 @@ int main(int argc, char ** argv) {
server_task_result result = ctx_server.queue_results.recv(task.id);
ctx_server.queue_results.remove_waiting_task_id(task.id);
res.set_content(result.data["slots"].dump(), "application/json");
res.set_content(result.data.at("slots").dump(), "application/json");
res.status = 200; // HTTP OK
};
@ -3203,32 +3220,32 @@ int main(int argc, char ** argv) {
json data = result.data;
const uint64_t n_prompt_tokens_processed = data["n_prompt_tokens_processed"];
const uint64_t t_prompt_processing = data["t_prompt_processing"];
const uint64_t n_prompt_tokens_processed = data.at("n_prompt_tokens_processed");
const uint64_t t_prompt_processing = data.at("t_prompt_processing");
const uint64_t n_tokens_predicted = data["n_tokens_predicted"];
const uint64_t t_tokens_generation = data["t_tokens_generation"];
const uint64_t n_tokens_predicted = data.at("n_tokens_predicted");
const uint64_t t_tokens_generation = data.at("t_tokens_generation");
const int32_t kv_cache_used_cells = data["kv_cache_used_cells"];
const int32_t kv_cache_used_cells = data.at("kv_cache_used_cells");
// metrics definition: https://prometheus.io/docs/practices/naming/#metric-names
json all_metrics_def = json {
{"counter", {{
{"name", "prompt_tokens_total"},
{"help", "Number of prompt tokens processed."},
{"value", (uint64_t) data["n_prompt_tokens_processed_total"]}
{"value", (uint64_t) data.at("n_prompt_tokens_processed_total")}
}, {
{"name", "prompt_seconds_total"},
{"help", "Prompt process time"},
{"value", (uint64_t) data["t_prompt_processing_total"] / 1.e3}
{"value", (uint64_t) data.at("t_prompt_processing_total") / 1.e3}
}, {
{"name", "tokens_predicted_total"},
{"help", "Number of generation tokens processed."},
{"value", (uint64_t) data["n_tokens_predicted_total"]}
{"value", (uint64_t) data.at("n_tokens_predicted_total")}
}, {
{"name", "tokens_predicted_seconds_total"},
{"help", "Predict process time"},
{"value", (uint64_t) data["t_tokens_generation_total"] / 1.e3}
{"value", (uint64_t) data.at("t_tokens_generation_total") / 1.e3}
}}},
{"gauge", {{
{"name", "prompt_tokens_seconds"},
@ -3245,15 +3262,15 @@ int main(int argc, char ** argv) {
},{
{"name", "kv_cache_tokens"},
{"help", "KV-cache tokens."},
{"value", (uint64_t) data["kv_cache_tokens_count"]}
{"value", (uint64_t) data.at("kv_cache_tokens_count")}
},{
{"name", "requests_processing"},
{"help", "Number of request processing."},
{"value", (uint64_t) data["processing"]}
{"value", (uint64_t) data.at("processing")}
},{
{"name", "requests_deferred"},
{"help", "Number of request deferred."},
{"value", (uint64_t) data["deferred"]}
{"value", (uint64_t) data.at("deferred")}
}}}
};
@ -3264,8 +3281,8 @@ int main(int argc, char ** argv) {
const auto & metrics_def = el.value();
for (const auto & metric_def : metrics_def) {
const std::string name = metric_def["name"];
const std::string help = metric_def["help"];
const std::string name = metric_def.at("name");
const std::string help = metric_def.at("help");
auto value = json_value(metric_def, "value", 0.);
prometheus << "# HELP llamacpp:" << name << " " << help << "\n"
@ -3274,7 +3291,7 @@ int main(int argc, char ** argv) {
}
}
const int64_t t_start = data["t_start"];
const int64_t t_start = data.at("t_start");
res.set_header("Process-Start-Time-Unix", std::to_string(t_start));
res.set_content(prometheus.str(), "text/plain; version=0.0.4");
@ -3283,7 +3300,7 @@ int main(int argc, char ** argv) {
const auto handle_slots_save = [&ctx_server, &res_error, &sparams](const httplib::Request & req, httplib::Response & res, int id_slot) {
json request_data = json::parse(req.body);
std::string filename = request_data["filename"];
std::string filename = request_data.at("filename");
if (!validate_file_name(filename)) {
res_error(res, format_error_response("Invalid filename", ERROR_TYPE_INVALID_REQUEST));
return;
@ -3313,7 +3330,7 @@ int main(int argc, char ** argv) {
const auto handle_slots_restore = [&ctx_server, &res_error, &sparams](const httplib::Request & req, httplib::Response & res, int id_slot) {
json request_data = json::parse(req.body);
std::string filename = request_data["filename"];
std::string filename = request_data.at("filename");
if (!validate_file_name(filename)) {
res_error(res, format_error_response("Invalid filename", ERROR_TYPE_INVALID_REQUEST));
return;
@ -3632,7 +3649,8 @@ int main(int argc, char ** argv) {
std::vector<llama_token> tokens;
if (body.count("content") != 0) {
tokens = ctx_server.tokenize(body["content"], false);
const bool add_special = json_value(body, "add_special", false);
tokens = ctx_server.tokenize(body.at("content"), add_special);
}
const json data = format_tokenizer_response(tokens);
return res.set_content(data.dump(), "application/json; charset=utf-8");
@ -3644,7 +3662,7 @@ int main(int argc, char ** argv) {
std::string content;
if (body.count("tokens") != 0) {
const std::vector<llama_token> tokens = body["tokens"];
const std::vector<llama_token> tokens = body.at("tokens");
content = tokens_to_str(ctx_server.ctx, tokens.cbegin(), tokens.cend());
}
@ -3667,10 +3685,10 @@ int main(int argc, char ** argv) {
json prompt;
if (body.count("input") != 0) {
is_openai = true;
prompt = body["input"];
prompt = body.at("input");
} else if (body.count("content") != 0) {
// with "content", we only support single prompt
prompt = std::vector<std::string>{body["content"]};
prompt = std::vector<std::string>{body.at("content")};
} else {
res_error(res, format_error_response("\"input\" or \"content\" must be provided", ERROR_TYPE_INVALID_REQUEST));
return;
@ -3689,7 +3707,7 @@ int main(int argc, char ** argv) {
if (!result.error) {
if (result.data.count("results")) {
// result for multi-task
responses = result.data["results"];
responses = result.data.at("results");
} else {
// result for single task
responses = std::vector<json>{result.data};

View file

@ -7,44 +7,16 @@ Feature: Results
And a model file tinyllamas/split/stories15M-00001-of-00003.gguf from HF repo ggml-org/models
And a model file test-model-00001-of-00003.gguf
And 128 as batch size
And 256 KV cache size
And 1024 KV cache size
And 128 max tokens to predict
Scenario Outline: Multi users completion
Given <n_slots> slots
And continuous batching
Scenario Outline: consistent results with same seed
Given <n_slots> slots
Then the server is starting
Then the server is healthy
Given 42 as seed
And a prompt:
"""
Write a very long story about AI.
"""
Given 42 as seed
And a prompt:
"""
Write a very long story about AI.
"""
Given 42 as seed
And a prompt:
"""
Write a very long story about AI.
"""
Given 42 as seed
And a prompt:
"""
Write a very long story about AI.
"""
Given 42 as seed
And a prompt:
"""
Write a very long story about AI.
"""
Given 4 prompts "Title: Little Red Riding Hood But In Space\n\nSummary:" with seed 42
Given concurrent completion requests
Then the server is busy
@ -55,3 +27,55 @@ Feature: Results
| n_slots |
| 1 |
| 2 |
Scenario Outline: different results with different seed
Given <n_slots> slots
Then the server is starting
Then the server is healthy
Given 1 prompts "Title: Little Red Riding Hood But In Space\n\nSummary:" with seed 42
Given 1 prompts "Title: Little Red Riding Hood But In Space\n\nSummary:" with seed 43
Given 1 prompts "Title: Little Red Riding Hood But In Space\n\nSummary:" with seed 44
Given 1 prompts "Title: Little Red Riding Hood But In Space\n\nSummary:" with seed 45
Given concurrent completion requests
Then the server is busy
Then the server is idle
And all slots are idle
Then all predictions are different
Examples:
| n_slots |
| 1 |
| 2 |
Scenario Outline: consistent results with same seed and varying batch size
Given 4 slots
And <temp> temperature
# And 0 as draft
Then the server is starting
Then the server is healthy
Given 1 prompts "Write a very long story about AI." with seed 42
And concurrent completion requests
# Then the server is busy # Not all slots will be utilized.
Then the server is idle
And all slots are idle
Given <n_parallel> prompts "Write a very long story about AI." with seed 42
And concurrent completion requests
# Then the server is busy # Not all slots will be utilized.
Then the server is idle
And all slots are idle
Then all predictions are equal
Examples:
| n_parallel | temp |
| 1 | 0.0 |
| 2 | 0.0 |
| 4 | 0.0 |
| 1 | 1.0 |
# FIXME: These tests fail on master. The problem seems to be the unified KV cache.
# See https://github.com/ggerganov/whisper.cpp/issues/1941#issuecomment-1986923227
# and https://github.com/ggerganov/llama.cpp/pull/6122#discussion_r1531405574 .
# | 2 | 1.0 |
# | 4 | 1.0 |

View file

@ -7,6 +7,7 @@ Feature: llama.cpp server
And a model file tinyllamas/stories260K.gguf from HF repo ggml-org/models
And a model file test-model.gguf
And a model alias tinyllama-2
And BOS token is 1
And 42 as server seed
# KV Cache corresponds to the total amount of tokens
# that can be stored across all independent sequences: #4130
@ -91,7 +92,18 @@ Feature: llama.cpp server
"""
What is the capital of France ?
"""
Then tokens can be detokenize
Then tokens can be detokenized
And tokens do not begin with BOS
Scenario: Tokenize w/ BOS
Given adding special tokens
When tokenizing:
"""
What is the capital of Germany?
"""
Then tokens begin with BOS
Given first token is removed
Then tokens can be detokenized
Scenario: Models available
Given available models

View file

@ -65,6 +65,7 @@ def step_server_config(context, server_fqdn, server_port):
context.server_seed = None
context.user_api_key = None
context.response_format = None
context.temperature = None
context.tasks_result = []
context.concurrent_tasks = []
@ -232,15 +233,17 @@ async def step_all_slots_status(context, expected_slot_status_string):
@async_run_until_complete
async def step_request_completion(context, api_error):
expect_api_error = api_error == 'raised'
seeds = await completions_seed(context, num_seeds=1)
completion = await request_completion(context.prompts.pop(),
seeds[0] if seeds is not None else seeds,
context.base_url,
debug=context.debug,
n_predict=context.n_predict,
cache_prompt=context.cache_prompt,
id_slot=context.id_slot,
seed=await completions_seed(context),
expect_api_error=expect_api_error,
user_api_key=context.user_api_key)
user_api_key=context.user_api_key,
temperature=context.temperature)
context.tasks_result.append(completion)
if context.debug:
print(f"Completion response: {completion}")
@ -269,6 +272,15 @@ async def step_predictions_equal(context):
context.tasks_result = []
@step('all predictions are different')
@async_run_until_complete
async def step_predictions_equal(context):
n_completions = await gather_tasks_results(context)
assert n_completions >= 2, "need at least 2 completions"
assert_all_predictions_different(context.tasks_result)
context.tasks_result = []
@step('the completion is truncated')
def step_assert_completion_truncated(context):
step_assert_completion_truncated(context, '')
@ -311,6 +323,11 @@ def step_response_format(context, response_format):
context.response_format = json.loads(response_format)
@step('{temperature:f} temperature')
def step_temperature(context, temperature):
context.temperature = temperature
@step('streaming is {enable_streaming}')
def step_streaming(context, enable_streaming):
context.enable_streaming = enable_streaming == 'enabled'
@ -353,7 +370,15 @@ def step_n_ubatch(context, n_ubatch):
@step('{seed:d} as seed')
def step_seed(context, seed):
context.seed = seed
if context.seed is None:
context.seed = [seed]
else:
context.seed.append(seed)
@step('BOS token is {bos:d}')
def step_bos_token(context, bos):
context.bos = bos
@step('a prefix prompt')
@ -413,7 +438,9 @@ async def step_oai_chat_completions(context, api_error):
if context.debug:
print(f"Submitting OAI compatible completions request...")
expect_api_error = api_error == 'raised'
seeds = await completions_seed(context, num_seeds=1),
completion = await oai_chat_completions(context.prompts.pop(),
seeds[0] if seeds is not None else seeds,
context.system_prompt,
context.base_url,
'/v1/chat',
@ -429,8 +456,6 @@ async def step_oai_chat_completions(context, api_error):
response_format=context.response_format
if hasattr(context, 'response_format') else None,
seed=await completions_seed(context),
user_api_key=context.user_api_key
if hasattr(context, 'user_api_key') else None,
@ -457,10 +482,21 @@ def step_a_prompt_prompt(context, prompt):
context.n_prompts = len(context.prompts)
@step('{num_prompts:d} prompts {prompt} with seed {seed:d}')
def step_many_prompts(context, num_prompts, prompt, seed):
if context.seed is None:
context.seed = []
for _ in range(num_prompts):
context.seed.append(seed)
context.prompts.append(prompt)
context.n_prompts = len(context.prompts)
@step('concurrent completion requests')
@async_run_until_complete()
async def step_concurrent_completion_requests(context):
await concurrent_requests(context,
await concurrent_requests(
context,
request_completion,
# prompt is inserted automatically
context.base_url,
@ -468,9 +504,9 @@ async def step_concurrent_completion_requests(context):
prompt_prefix=context.prompt_prefix,
prompt_suffix=context.prompt_suffix,
n_predict=context.n_predict if hasattr(context, 'n_predict') else None,
seed=await completions_seed(context),
user_api_key=context.user_api_key if hasattr(context,
'user_api_key') else None)
user_api_key=context.user_api_key if hasattr(context, 'user_api_key') else None,
temperature=context.temperature,
)
@step('concurrent OAI completions requests')
@ -490,7 +526,6 @@ async def step_oai_chat_completions(context):
if hasattr(context, 'enable_streaming') else None,
response_format=context.response_format
if hasattr(context, 'response_format') else None,
seed=await completions_seed(context),
user_api_key=context.user_api_key
if hasattr(context, 'user_api_key') else None)
@ -512,10 +547,6 @@ async def step_oai_chat_completions(context):
if hasattr(context, 'enable_streaming') else None,
response_format=context.response_format
if hasattr(context, 'response_format') else None,
seed=context.seed
if hasattr(context, 'seed') else
context.server_seed
if hasattr(context, 'server_seed') else None,
user_api_key=context.user_api_key
if hasattr(context, 'user_api_key') else None)
@ -544,7 +575,7 @@ async def all_prompts_are_predicted(context, expected_predicted_n=None):
@async_run_until_complete
async def step_compute_embedding(context):
context.n_prompts = 1
context.embeddings = await request_embedding(context_text(context), base_url=context.base_url)
context.embeddings = await request_embedding(context_text(context), None, base_url=context.base_url)
@step('all embeddings are the same')
@ -585,7 +616,7 @@ def step_assert_embeddings(context):
@async_run_until_complete
async def step_oai_compute_embeddings(context):
context.n_prompts = 1
context.embeddings = await request_oai_embeddings(context_text(context),
context.embeddings = await request_oai_embeddings(context_text(context), None,
base_url=context.base_url,
user_api_key=context.user_api_key,
model=context.model)
@ -594,7 +625,7 @@ async def step_oai_compute_embeddings(context):
@step('an OAI compatible embeddings computation request for multiple inputs')
@async_run_until_complete
async def step_oai_compute_embeddings_multiple_inputs(context):
context.embeddings = await request_oai_embeddings(context.prompts,
context.embeddings = await request_oai_embeddings(context.prompts, None,
base_url=context.base_url,
user_api_key=context.user_api_key,
model=context.model)
@ -630,21 +661,29 @@ async def all_embeddings_are_generated(context):
assert_embeddings(context.tasks_result.pop().pop())
@step('adding special tokens')
def step_tokenize_set_add_special(context):
context.tokenize_add_special = True
@step('tokenizing')
@async_run_until_complete
async def step_tokenize(context):
context.tokenized_text = context_text(context)
async with aiohttp.ClientSession() as session:
async with session.post(f'{context.base_url}/tokenize',
json={
tokenize_args = {
"content": context.tokenized_text,
}) as response:
}
if getattr(context, 'tokenize_add_special', None) is not None:
tokenize_args['add_special'] = context.tokenize_add_special
async with session.post(f'{context.base_url}/tokenize',
json=tokenize_args) as response:
assert response.status == 200
tokenize_json = await response.json()
context.tokens = tokenize_json['tokens']
@step('tokens can be detokenize')
@step('tokens can be detokenized')
@async_run_until_complete
async def step_detokenize(context):
assert len(context.tokens) > 0
@ -659,6 +698,21 @@ async def step_detokenize(context):
assert context.tokenized_text == detokenize_json['content'].strip()
@step('tokens begin with BOS')
def step_strings_for_tokenization(context):
assert context.tokens[0] == context.bos
@step('tokens do not begin with BOS')
def step_strings_for_tokenization(context):
assert context.tokens[0] != context.bos
@step('first token is removed')
def step_strings_for_tokenization(context):
context.tokens = context.tokens[1:]
@step('an OPTIONS request is sent from {origin}')
@async_run_until_complete
async def step_options_request(context, origin):
@ -740,8 +794,9 @@ async def concurrent_requests(context, f_completion, *args, **kwargs):
if context.debug:
print(f"starting {context.n_prompts} concurrent completion requests...")
assert context.n_prompts > 0
seeds = await completions_seed(context)
for prompt_no in range(context.n_prompts):
shifted_args = [context.prompts.pop(), *args]
shifted_args = [context.prompts.pop(), seeds[prompt_no], *args]
context.concurrent_tasks.append(asyncio.create_task(f_completion(*shifted_args, **kwargs)))
await asyncio.sleep(0.1)
@ -781,6 +836,7 @@ def step_server_responds_with_status_code(context, status_code):
async def request_completion(prompt,
seed,
base_url,
debug=False,
prompt_prefix=None,
@ -788,9 +844,9 @@ async def request_completion(prompt,
n_predict=None,
cache_prompt=False,
id_slot=None,
seed=None,
expect_api_error=None,
user_api_key=None):
user_api_key=None,
temperature=None):
if debug:
print(f"Sending completion request: {prompt}")
origin = "my.super.domain"
@ -811,7 +867,8 @@ async def request_completion(prompt,
"n_predict": n_predict if n_predict is not None else -1,
"cache_prompt": cache_prompt,
"id_slot": id_slot,
"seed": seed if seed is not None else 42
"seed": seed if seed is not None else 42,
"temperature": temperature if temperature is not None else "0.8f",
},
headers=headers,
timeout=3600) as response:
@ -824,6 +881,7 @@ async def request_completion(prompt,
async def oai_chat_completions(user_prompt,
seed,
system_prompt,
base_url,
base_path,
@ -833,7 +891,6 @@ async def oai_chat_completions(user_prompt,
n_predict=None,
enable_streaming=None,
response_format=None,
seed=None,
user_api_key=None,
expect_api_error=None):
if debug:
@ -882,7 +939,7 @@ async def oai_chat_completions(user_prompt,
while event_received:
event_received = False
async for line_in_bytes in response.content:
line = line_in_bytes.decode('utf8')
line = line_in_bytes.decode('utf-8')
line = line.rstrip('\n').rstrip('\r')
if line == '':
continue
@ -952,7 +1009,7 @@ async def oai_chat_completions(user_prompt,
return completion_response
async def request_embedding(content, base_url=None):
async def request_embedding(content, seed, base_url=None):
async with aiohttp.ClientSession() as session:
async with session.post(f'{base_url}/embedding',
json={
@ -963,7 +1020,7 @@ async def request_embedding(content, base_url=None):
return [response_json['embedding']]
async def request_oai_embeddings(input,
async def request_oai_embeddings(input, seed,
base_url=None, user_api_key=None,
model=None, async_client=False):
# openai client always expects an api_key
@ -1036,21 +1093,31 @@ def assert_n_tokens_predicted(completion_response, expected_predicted_n=None, re
f' {n_predicted} <> {expected_predicted_n}')
def assert_all_predictions_equal(completion_responses):
content_0 = completion_responses[0]['content']
if 'DEBUG' in os.environ and os.environ['DEBUG'] == 'ON':
print(f"content 0: {content_0}")
for i, response_i in enumerate(completion_responses):
content_i = response_i['content']
print(f"content {i}: {content_i}")
for i, response_i in enumerate(completion_responses):
content_i = response_i['content']
for j, response_j in enumerate(completion_responses):
if i == j:
continue
content_j = response_j['content']
assert content_i == content_j, "contents not equal"
i = 1
for response in completion_responses[1:]:
content = response['content']
def assert_all_predictions_different(completion_responses):
if 'DEBUG' in os.environ and os.environ['DEBUG'] == 'ON':
print(f"content {i}: {content}")
assert content == content_0, "contents not equal"
i += 1
for i, response_i in enumerate(completion_responses):
content_i = response_i['content']
print(f"content {i}: {content_i}")
for i, response_i in enumerate(completion_responses):
content_i = response_i['content']
for j, response_j in enumerate(completion_responses):
if i == j:
continue
content_j = response_j['content']
assert content_i != content_j, "contents not different"
async def gather_tasks_results(context):
@ -1145,9 +1212,22 @@ def assert_slots_status(slots, expected_slots):
f" = {expected[key]} != {slot[key]}")
async def completions_seed(context):
return context.seed if hasattr(context, 'seed') and context.seed is not None \
else context.server_seed if hasattr(context, 'server_seed') else None
async def completions_seed(context, num_seeds=None):
if hasattr(context, "seed") and context.seed is not None:
assert len(context.seed) == context.n_prompts
if num_seeds is None:
num_seeds = context.n_prompts
assert num_seeds <= context.n_prompts
seeds = context.seed[:num_seeds]
context.seed = context.seed[num_seeds:] if num_seeds < context.n_prompts else None
return seeds
if hasattr(context, "server_seed") and context.server_seed is not None:
if num_seeds is None:
return [context.server_seed] * context.n_prompts
else:
return [context.server_seed] * num_seeds
return None
def context_text(context):

View file

@ -0,0 +1,5 @@
# LLaMA.cpp Server Wild Theme
Simple themes directory of sample "public" directories. To try any of these add --path to your run like `server --path=wild`.
![image](wild/wild.png)

View file

@ -0,0 +1,7 @@
# LLaMA.cpp Server Buttons Top Theme
Simple tweaks to the UI. Chat buttons at the top of the page instead of bottom so you can hit Stop instead of chasing it down the page.
To use simply run server with `--path=themes/buttons_top`
![image](buttons_top.png)

Binary file not shown.

After

Width:  |  Height:  |  Size: 117 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 4 KiB

File diff suppressed because it is too large Load diff

View file

@ -0,0 +1,5 @@
# LLaMA.cpp Server Wild Theme
Simple tweaks to the UI. To use simply run server with `--path=themes/wild`
![image](wild.png)

Binary file not shown.

After

Width:  |  Height:  |  Size: 4 KiB

File diff suppressed because it is too large Load diff

Binary file not shown.

After

Width:  |  Height:  |  Size: 75 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 254 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 485 KiB

View file

@ -3,6 +3,8 @@
#include "llama.h"
#include "common.h"
// Change JSON_ASSERT from assert() to GGML_ASSERT:
#define JSON_ASSERT GGML_ASSERT
#include "json.hpp"
#include <string>
@ -49,18 +51,18 @@ extern bool server_log_json;
#define LOG_WARNING(MSG, ...) server_log("WARN", __func__, __LINE__, MSG, __VA_ARGS__)
#define LOG_INFO( MSG, ...) server_log("INFO", __func__, __LINE__, MSG, __VA_ARGS__)
static inline void server_log(const char *level, const char *function, int line, const char *message, const nlohmann::ordered_json &extra);
static inline void server_log(const char * level, const char * function, int line, const char * message, const json & extra);
template <typename T>
static T json_value(const json &body, const std::string &key, const T &default_value) {
static T json_value(const json & body, const std::string & key, const T & default_value) {
// Fallback null to default value
if (body.contains(key) && !body.at(key).is_null()){
if (body.contains(key) && !body.at(key).is_null()) {
try {
return body.value(key, default_value);
}
catch (nlohmann::json_abi_v3_11_3::detail::type_error const&){
std::string message = "Wrong type supplied for parameter '" + key + "'. Expected '" + typeid(default_value).name() + "', using default value.";
server_log("WARN", __func__, __LINE__, message.c_str(), body);
return body.at(key);
} catch (NLOHMANN_JSON_NAMESPACE::detail::type_error const &) {
std::stringstream ss;
ss << "Wrong type supplied for parameter '" << key << "'. Expected '" << json(default_value).type_name() << "', using default value.";
LOG_WARNING(ss.str().c_str(), body);
return default_value;
}
} else {
@ -68,16 +70,16 @@ static T json_value(const json &body, const std::string &key, const T &default_v
}
}
static inline void server_log(const char *level, const char *function, int line, const char *message, const nlohmann::ordered_json &extra) {
static inline void server_log(const char * level, const char * function, int line, const char * message, const json & extra) {
std::stringstream ss_tid;
ss_tid << std::this_thread::get_id();
json log = nlohmann::ordered_json{
json log = json{
{"tid", ss_tid.str()},
{"timestamp", time(nullptr)},
};
if (server_log_json) {
log.merge_patch( {
log.merge_patch({
{"level", level},
{"function", function},
{"line", line},
@ -98,7 +100,7 @@ static inline void server_log(const char *level, const char *function, int line,
}
std::stringstream ss;
ss << buf << " |";
for (const auto& el : log.items())
for (const auto & el : log.items())
{
const std::string value = el.value().dump(-1, ' ', false, json::error_handler_t::replace);
ss << " " << el.key() << "=" << value;
@ -373,11 +375,11 @@ static json oaicompat_completion_params_parse(
llama_params["top_p"] = json_value(body, "top_p", 1.0);
// Apply chat template to the list of messages
llama_params["prompt"] = format_chat(model, chat_template, body["messages"]);
llama_params["prompt"] = format_chat(model, chat_template, body.at("messages"));
// Handle "stop" field
if (body.contains("stop") && body["stop"].is_string()) {
llama_params["stop"] = json::array({body["stop"].get<std::string>()});
if (body.contains("stop") && body.at("stop").is_string()) {
llama_params["stop"] = json::array({body.at("stop").get<std::string>()});
} else {
llama_params["stop"] = json_value(body, "stop", json::array());
}

View file

@ -1,6 +1,6 @@
# llama.cpp/example/sycl
This example program provide the tools for llama.cpp for SYCL on Intel GPU.
This example program provides the tools for llama.cpp for SYCL on Intel GPU.
## Tool

30
flake.lock generated
View file

@ -5,11 +5,11 @@
"nixpkgs-lib": "nixpkgs-lib"
},
"locked": {
"lastModified": 1712014858,
"narHash": "sha256-sB4SWl2lX95bExY2gMFG5HIzvva5AVMJd4Igm+GpZNw=",
"lastModified": 1714641030,
"narHash": "sha256-yzcRNDoyVP7+SCNX0wmuDju1NUCt8Dz9+lyUXEI0dbI=",
"owner": "hercules-ci",
"repo": "flake-parts",
"rev": "9126214d0a59633752a136528f5f3b9aa8565b7d",
"rev": "e5d10a24b66c3ea8f150e47dfdb0416ab7c3390e",
"type": "github"
},
"original": {
@ -20,11 +20,11 @@
},
"nixpkgs": {
"locked": {
"lastModified": 1714076141,
"narHash": "sha256-Drmja/f5MRHZCskS6mvzFqxEaZMeciScCTFxWVLqWEY=",
"lastModified": 1714635257,
"narHash": "sha256-4cPymbty65RvF1DWQfc+Bc8B233A1BWxJnNULJKQ1EY=",
"owner": "NixOS",
"repo": "nixpkgs",
"rev": "7bb2ccd8cdc44c91edba16c48d2c8f331fb3d856",
"rev": "63c3a29ca82437c87573e4c6919b09a24ea61b0f",
"type": "github"
},
"original": {
@ -36,20 +36,14 @@
},
"nixpkgs-lib": {
"locked": {
"dir": "lib",
"lastModified": 1711703276,
"narHash": "sha256-iMUFArF0WCatKK6RzfUJknjem0H9m4KgorO/p3Dopkk=",
"owner": "NixOS",
"repo": "nixpkgs",
"rev": "d8fe5e6c92d0d190646fb9f1056741a229980089",
"type": "github"
"lastModified": 1714640452,
"narHash": "sha256-QBx10+k6JWz6u7VsohfSw8g8hjdBZEf8CFzXH1/1Z94=",
"type": "tarball",
"url": "https://github.com/NixOS/nixpkgs/archive/50eb7ecf4cd0a5756d7275c8ba36790e5bd53e33.tar.gz"
},
"original": {
"dir": "lib",
"owner": "NixOS",
"ref": "nixos-unstable",
"repo": "nixpkgs",
"type": "github"
"type": "tarball",
"url": "https://github.com/NixOS/nixpkgs/archive/50eb7ecf4cd0a5756d7275c8ba36790e5bd53e33.tar.gz"
}
},
"root": {

View file

@ -113,7 +113,7 @@ static ggml_cuda_device_info ggml_cuda_init() {
for (int id = 0; id < info.device_count; ++id) {
int device_vmm = 0;
#if !defined(GGML_USE_HIPBLAS)
#if !defined(GGML_USE_HIPBLAS) && !defined(GGML_CUDA_NO_VMM)
CUdevice device;
CU_CHECK(cuDeviceGet(&device, id));
CU_CHECK(cuDeviceGetAttribute(&device_vmm, CU_DEVICE_ATTRIBUTE_VIRTUAL_MEMORY_MANAGEMENT_SUPPORTED, device));
@ -259,7 +259,7 @@ struct ggml_cuda_pool_leg : public ggml_cuda_pool {
};
// pool with virtual memory
#if !defined(GGML_USE_HIPBLAS)
#if !defined(GGML_USE_HIPBLAS) && !defined(GGML_CUDA_NO_VMM)
struct ggml_cuda_pool_vmm : public ggml_cuda_pool {
static const size_t CUDA_POOL_VMM_MAX_SIZE = 1ull << 35; // 32 GB
@ -356,7 +356,7 @@ struct ggml_cuda_pool_vmm : public ggml_cuda_pool {
#endif // !defined(GGML_USE_HIPBLAS)
std::unique_ptr<ggml_cuda_pool> ggml_backend_cuda_context::new_pool_for_device(int device) {
#if !defined(GGML_USE_HIPBLAS)
#if !defined(GGML_USE_HIPBLAS) && !defined(GGML_CUDA_NO_VMM)
if (ggml_cuda_info().devices[device].vmm) {
return std::unique_ptr<ggml_cuda_pool>(new ggml_cuda_pool_vmm(device));
}
@ -1647,7 +1647,7 @@ static void ggml_cuda_op_mul_mat(
}
}
static void ggml_cuda_mul_mat_vec_p021(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst){
static void ggml_cuda_mul_mat_vec_p021(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
GGML_ASSERT(ggml_is_permuted(src0) && ggml_is_permuted(src1));
GGML_ASSERT(ggml_backend_buffer_is_cuda(src0->buffer));
GGML_ASSERT(src0->nb[0] <= src0->nb[1] && src0->nb[2] <= src0->nb[3]); // 0213 permutation
@ -1670,7 +1670,7 @@ static void ggml_cuda_mul_mat_vec_p021(ggml_backend_cuda_context & ctx, const gg
ggml_mul_mat_p021_f16_f32_cuda(src0_ddq, src1_ddf, dst_ddf, ne00, ne01, ne02, ne12, main_stream);
}
static void ggml_cuda_mul_mat_vec_nc(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst){
static void ggml_cuda_mul_mat_vec_nc(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
GGML_ASSERT(!ggml_is_transposed(src0));
GGML_ASSERT(!ggml_is_transposed(src1));
GGML_ASSERT(!ggml_is_permuted(src0));
@ -2410,11 +2410,184 @@ GGML_CALL static void ggml_backend_cuda_synchronize(ggml_backend_t backend) {
GGML_UNUSED(backend);
}
static void set_ggml_graph_node_properties(ggml_tensor * node, ggml_graph_node_properties * graph_node_properties) {
graph_node_properties->node_address = node->data;
graph_node_properties->node_op = node->op;
for (int i = 0; i < GGML_MAX_DIMS; i++) {
graph_node_properties->ne[i] = node->ne[i];
graph_node_properties->nb[i] = node->nb[i];
}
for (int i = 0; i < GGML_MAX_SRC; i++) {
graph_node_properties->src_address[i] = node->src[i] ? node->src[i]->data : nullptr;
}
}
static bool ggml_graph_node_has_matching_properties(ggml_tensor * node, ggml_graph_node_properties * graph_node_properties) {
if (node->data != graph_node_properties->node_address &&
node->op != GGML_OP_CPY &&
node->op != GGML_OP_VIEW) {
return false;
}
if (node->op != graph_node_properties->node_op) {
return false;
}
for (int i = 0; i < GGML_MAX_DIMS; i++) {
if (node->ne[i] != graph_node_properties->ne[i]) {
return false;
}
if (node->nb[i] != graph_node_properties->nb[i]) {
return false;
}
}
for (int i = 0; i < GGML_MAX_SRC; i++) {
if (node->src[i] &&
node->src[i]->data != graph_node_properties->src_address[i] &&
node->op != GGML_OP_CPY &&
node->op != GGML_OP_VIEW
) {
return false;
}
}
return true;
}
GGML_CALL static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) {
ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
ggml_cuda_set_device(cuda_ctx->device);
#ifdef USE_CUDA_GRAPH
static const bool disable_cuda_graphs_due_to_env = (getenv("GGML_CUDA_DISABLE_GRAPHS") != nullptr);
// Objects required for CUDA Graph
if (cuda_ctx->cuda_graph == nullptr) {
cuda_ctx->cuda_graph.reset(new ggml_cuda_graph());
}
bool use_cuda_graph = true;
bool cuda_graph_update_required = false;
// pointer to CUDA cpy kernel, which is required to identify
// kernel parameters which need updated in the graph for each token
void * ggml_cuda_cpy_fn_ptr = nullptr;
if (cuda_ctx->cuda_graph->graph == nullptr) {
if (ggml_cuda_info().devices[cuda_ctx->device].cc < CC_AMPERE) {
cuda_ctx->cuda_graph->disable_due_to_gpu_arch = true;
#ifndef NDEBUG
fprintf(stderr, "%s: disabling CUDA graphs due to GPU architecture\n", __func__);
#endif
}
}
// Disable CUDA graphs in presence of env var, old GPU, use-case which is changing too rapidly,
// or previous graph capture failure.
// Also disable for multi-gpu for now. TO DO investigate
if (disable_cuda_graphs_due_to_env
|| cuda_ctx->cuda_graph->disable_due_to_gpu_arch
|| cuda_ctx->cuda_graph->disable_due_to_too_many_updates
|| cuda_ctx->cuda_graph->disable_due_to_failed_graph_capture) {
use_cuda_graph = false;
}
if (use_cuda_graph) {
if (cuda_ctx->cuda_graph->instance == nullptr) {
cuda_graph_update_required = true;
}
// Check if the graph size has changed
if (cuda_ctx->cuda_graph->ggml_graph_properties.size() != (size_t)cgraph->n_nodes) {
cuda_graph_update_required = true;
cuda_ctx->cuda_graph->ggml_graph_properties.resize(cgraph->n_nodes);
}
// Loop over nodes in GGML graph to determine if CUDA graph update is required
// and store properties to allow this comparison for the next token
for (int i = 0; i < cgraph->n_nodes; i++) {
bool has_matching_properties = true;
if (!cuda_graph_update_required) {
has_matching_properties = ggml_graph_node_has_matching_properties(cgraph->nodes[i], &cuda_ctx->cuda_graph->ggml_graph_properties[i]);
}
if (!has_matching_properties) {
cuda_graph_update_required = true;
}
set_ggml_graph_node_properties(cgraph->nodes[i], &cuda_ctx->cuda_graph->ggml_graph_properties[i]);
}
// Loop over nodes in GGML graph to obtain info needed for CUDA graph
cuda_ctx->cuda_graph->updated_kernel_arg.clear();
for (int i = 0; i < cgraph->n_nodes; i++) {
ggml_tensor * node = cgraph->nodes[i];
if (node->src[0] && ggml_backend_buffer_is_cuda_split(node->src[0]->buffer)) {
use_cuda_graph = false; // Split buffers are not supported by CUDA graph capture
#ifndef NDEBUG
fprintf(stderr, "%s: disabling CUDA graphs due to split buffer\n", __func__);
#endif
}
if (node->op == GGML_OP_MUL_MAT_ID) {
use_cuda_graph = false; // This node type is not supported by CUDA graph capture
#ifndef NDEBUG
fprintf(stderr, "%s: disabling CUDA graphs due to mul_mat_id\n", __func__);
#endif
}
if (node->op == GGML_OP_ADD && node->src[1] && node->src[1]->ne[1] > 1) {
// disable CUDA graphs for batch size > 1 for now.
// Changes in batch size or context size can cause changes to the grid size of some kernels.
use_cuda_graph = false;
#ifndef NDEBUG
fprintf(stderr, "%s: disabling CUDA graphs due to batch size > 1 [%s] [%ld %ld %ld %ld]\n", __func__, node->name, node->ne[0], node->ne[1], node->ne[2], node->ne[3]);
#endif
}
if (node->op == GGML_OP_CPY) {
// store the copy op parameter which changes with each token.
cuda_ctx->cuda_graph->updated_kernel_arg.push_back((char **) &(node->src[1]->data));
if (ggml_cuda_cpy_fn_ptr == nullptr) {
// store a pointer to the copy op CUDA kernel to identify it later
ggml_cuda_cpy_fn_ptr = ggml_cuda_cpy_fn(node->src[0], node->src[1]);
}
}
if (!use_cuda_graph) {
break;
}
}
// Disable CUDA graphs (from the next token) if the use-case is demanding too many consecutive graph updates.
if (cuda_graph_update_required) {
cuda_ctx->cuda_graph->number_consecutive_updates++;
} else {
cuda_ctx->cuda_graph->number_consecutive_updates = 0;
}
if (cuda_ctx->cuda_graph->number_consecutive_updates >= 4) {
cuda_ctx->cuda_graph->disable_due_to_too_many_updates = true;
#ifndef NDEBUG
fprintf(stderr, "%s: disabling CUDA graphs due to too many consecutive updates\n", __func__);
#endif
}
}
if (use_cuda_graph && cuda_graph_update_required) { // Start CUDA graph capture
CUDA_CHECK(cudaStreamBeginCapture(cuda_ctx->stream(), cudaStreamCaptureModeRelaxed));
}
#else
bool use_cuda_graph = false;
bool cuda_graph_update_required = false;
#endif // USE_CUDA_GRAPH
bool graph_evaluated_or_captured = false;
while (!graph_evaluated_or_captured) {
// Only perform the graph execution if CUDA graphs are not enabled, or we are capturing the graph.
// With the use of CUDA graphs, the execution will be performed by the graph launch.
if (!use_cuda_graph || cuda_graph_update_required) {
for (int i = 0; i < cgraph->n_nodes; i++) {
ggml_tensor * node = cgraph->nodes[i];
@ -2437,6 +2610,105 @@ GGML_CALL static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t
}
GGML_ASSERT(ok);
}
}
#ifdef USE_CUDA_GRAPH
if (use_cuda_graph && cuda_graph_update_required) { // End CUDA graph capture
if (cuda_ctx->cuda_graph->graph != nullptr) {
CUDA_CHECK(cudaGraphDestroy(cuda_ctx->cuda_graph->graph));
cuda_ctx->cuda_graph->graph = nullptr;
}
CUDA_CHECK(cudaStreamEndCapture(cuda_ctx->stream(), &cuda_ctx->cuda_graph->graph));
#if 0
if (disable_cuda_graphs_due_to_failed_capture) {
use_cuda_graph = false;
cuda_ctx->cuda_graph->disable_due_to_failed_graph_capture = true;
#ifndef NDEBUG
fprintf(stderr, "%s: disabling CUDA graphs due to failed graph capture\n", __func__);
#endif
} else {
graph_evaluated_or_captured = true; // CUDA graph has been captured
}
#endif
graph_evaluated_or_captured = true; // CUDA graph has been captured
} else {
graph_evaluated_or_captured = true; // ggml graph has been directly evaluated
}
}
if (use_cuda_graph) {
if (cuda_ctx->cuda_graph->instance == nullptr) { // Create executable graph from captured graph.
CUDA_CHECK(cudaGraphInstantiate(&cuda_ctx->cuda_graph->instance, cuda_ctx->cuda_graph->graph, NULL, NULL, 0));
}
// Perform update to graph (if required for this token), and change copy parameter (required for every token)
if (cuda_graph_update_required) {
// Extract nodes from graph
if (cuda_ctx->cuda_graph->num_nodes == 0) {
// First call with null argument gets number of nodes in graph
CUDA_CHECK(cudaGraphGetNodes(cuda_ctx->cuda_graph->graph, nullptr, &cuda_ctx->cuda_graph->num_nodes));
}
// Subsequent call with non-null argument gets nodes
cuda_ctx->cuda_graph->nodes.resize(cuda_ctx->cuda_graph->num_nodes);
cuda_ctx->cuda_graph->params.resize(cuda_ctx->cuda_graph->num_nodes);
if (cuda_ctx->cuda_graph->num_nodes > 0) {
CUDA_CHECK(cudaGraphGetNodes(cuda_ctx->cuda_graph->graph, cuda_ctx->cuda_graph->nodes.data(), &cuda_ctx->cuda_graph->num_nodes));
// Loop over nodes, and extract kernel parameters from each node
for (size_t i = 0; i < cuda_ctx->cuda_graph->num_nodes; i++) {
cudaGraphNodeType node_type;
CUDA_CHECK(cudaGraphNodeGetType(cuda_ctx->cuda_graph->nodes[i], &node_type));
if (node_type == cudaGraphNodeTypeKernel) {
cudaError_t stat = cudaGraphKernelNodeGetParams(cuda_ctx->cuda_graph->nodes[i], &cuda_ctx->cuda_graph->params[i]); // Get params using runtime
if (stat == cudaErrorInvalidDeviceFunction) {
// Fails due to incorrect handling by CUDA runtime of CUDA BLAS node.
// We don't need to update blas nodes, so clear error and move on.
cudaGetLastError();
} else {
GGML_ASSERT(stat == cudaSuccess);
}
}
}
}
}
// One of the arguments to the copy kernel is updated for each token, hence we need to
// replace that argument with the updated value in the CUDA graph
if (!cuda_graph_update_required) { // on update steps, the live parameters will already be captured
int k = 0;
for (size_t i = 0; i < cuda_ctx->cuda_graph->num_nodes; i++) {
if (cuda_ctx->cuda_graph->params[i].func == ggml_cuda_cpy_fn_ptr) {
char ** updated_kernel_arg_ptr = cuda_ctx->cuda_graph->updated_kernel_arg.at(k++);
cuda_ctx->cuda_graph->params[i].kernelParams[1] = updated_kernel_arg_ptr;
CUDA_CHECK(cudaGraphKernelNodeSetParams(cuda_ctx->cuda_graph->nodes[i], &cuda_ctx->cuda_graph->params[i]));
}
}
}
// Update graph executable
cudaGraphExecUpdateResultInfo result_info;
cudaError_t stat = cudaGraphExecUpdate(cuda_ctx->cuda_graph->instance, cuda_ctx->cuda_graph->graph, &result_info);
if (stat == cudaErrorGraphExecUpdateFailure) {
#ifndef NDEBUG
fprintf(stderr, "%s: CUDA graph update failed\n", __func__);
#endif
// The pre-existing graph exec cannot be updated due to violated constraints
// so instead clear error and re-instantiate
cudaGetLastError();
CUDA_CHECK(cudaGraphExecDestroy(cuda_ctx->cuda_graph->instance));
cuda_ctx->cuda_graph->instance = nullptr;
CUDA_CHECK(cudaGraphInstantiate(&cuda_ctx->cuda_graph->instance, cuda_ctx->cuda_graph->graph, NULL, NULL, 0));
} else {
GGML_ASSERT(stat == cudaSuccess);
}
// Launch graph
CUDA_CHECK(cudaGraphLaunch(cuda_ctx->cuda_graph->instance, cuda_ctx->stream()));
#else
graph_evaluated_or_captured = true;
#endif // USE_CUDA_GRAPH
}
return GGML_STATUS_SUCCESS;
}

View file

@ -31,5 +31,4 @@ void ggml_cuda_op_clamp(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
memcpy(&max, (float *) dst->op_params + 1, sizeof(float));
clamp_f32_cuda(src0_d, dst_d, min, max, ggml_nelements(src0), stream);
CUDA_CHECK(cudaGetLastError());
}

View file

@ -19,6 +19,7 @@
#include <cassert>
#include <cfloat>
#include <string>
#include <vector>
#if defined(GGML_USE_HIPBLAS)
#include <hip/hip_runtime.h>
@ -138,6 +139,7 @@
#define WARP_SIZE 32
#define CUDART_HMAX 11070 // CUDA 11.7, min. ver. for which __hmax and __hmax2 are known to work (may be higher than needed)
#define CUDART_HMASK 12000 // CUDA 12.0, min. ver. for half2 -> uint mask comparisons
#define CC_PASCAL 600
#define MIN_CC_DP4A 610 // minimum compute capability for __dp4a, an intrinsic for byte-wise dot products
@ -232,88 +234,6 @@ typedef float dfloat; // dequantize float
typedef float2 dfloat2;
#endif //GGML_CUDA_F16
[[noreturn]]
static __device__ void no_device_code(
const char * file_name, const int line, const char * function_name, const int arch, const char * arch_list) {
#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
printf("%s:%d: ERROR: HIP kernel %s has no device code compatible with HIP arch %d.\n",
file_name, line, function_name, arch);
GGML_UNUSED(arch_list);
#else
printf("%s:%d: ERROR: CUDA kernel %s has no device code compatible with CUDA arch %d. ggml-cuda.cu was compiled for: %s\n",
file_name, line, function_name, arch, arch_list);
#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
__trap();
GGML_UNUSED(no_device_code); // suppress unused function warning
}
#ifdef __CUDA_ARCH__
#define NO_DEVICE_CODE no_device_code(__FILE__, __LINE__, __FUNCTION__, __CUDA_ARCH__, STRINGIZE(__CUDA_ARCH_LIST__))
#else
#define NO_DEVICE_CODE //GGML_ASSERT(false && "NO_DEVICE_CODE not valid in host code.")
#endif // __CUDA_ARCH__
static __device__ __forceinline__ float warp_reduce_sum(float x) {
#pragma unroll
for (int mask = 16; mask > 0; mask >>= 1) {
x += __shfl_xor_sync(0xffffffff, x, mask, 32);
}
return x;
}
static __device__ __forceinline__ float2 warp_reduce_sum(float2 a) {
#pragma unroll
for (int mask = 16; mask > 0; mask >>= 1) {
a.x += __shfl_xor_sync(0xffffffff, a.x, mask, 32);
a.y += __shfl_xor_sync(0xffffffff, a.y, mask, 32);
}
return a;
}
static __device__ __forceinline__ half2 warp_reduce_sum(half2 a) {
#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL
#pragma unroll
for (int mask = 16; mask > 0; mask >>= 1) {
a = __hadd2(a, __shfl_xor_sync(0xffffffff, a, mask, 32));
}
return a;
#else
GGML_UNUSED(a);
NO_DEVICE_CODE;
#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL
}
static __device__ __forceinline__ float warp_reduce_max(float x) {
#pragma unroll
for (int mask = 16; mask > 0; mask >>= 1) {
x = fmaxf(x, __shfl_xor_sync(0xffffffff, x, mask, 32));
}
return x;
}
static __device__ __forceinline__ half2 warp_reduce_max(half2 x) {
#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL && CUDART_VERSION >= CUDART_HMAX
#pragma unroll
for (int mask = 16; mask > 0; mask >>= 1) {
x = __hmax2(x, __shfl_xor_sync(0xffffffff, x, mask, 32));
}
return x;
#else
GGML_UNUSED(x);
NO_DEVICE_CODE;
#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL && CUDART_VERSION >= CUDART_HMAX
}
#if CUDART_VERSION < 12000
static __device__ __forceinline__ uint32_t __hgt2_mask(const half2 a, const half2 b) {
const uint32_t mask_low = 0x0000FFFF * (float( __low2half(a)) > float( __low2half(b)));
const uint32_t mask_high = 0xFFFF0000 * (float(__high2half(a)) > float(__high2half(b)));
return mask_low | mask_high;
}
#endif // CUDART_VERSION < 12000
#if defined(GGML_USE_HIPBLAS)
#define __CUDA_ARCH__ 1300
@ -397,11 +317,143 @@ static __device__ __forceinline__ int __dp4a(const int a, const int b, int c) {
}
#endif // defined(GGML_USE_HIPBLAS)
#define FP16_AVAILABLE defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) ? \
defined(RDNA1) || defined(RDNA2) || defined(RDNA3) : __CUDA_ARCH__ >= CC_PASCAL
#define FP16_AVAILABLE (defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) || __CUDA_ARCH__ >= CC_PASCAL
#define FP16_MMA_AVAILABLE !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_VOLTA
static bool fp16_mma_available(const int cc) {
return cc < CC_OFFSET_AMD && cc >= CC_VOLTA;
}
[[noreturn]]
static __device__ void no_device_code(
const char * file_name, const int line, const char * function_name, const int arch, const char * arch_list) {
#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
printf("%s:%d: ERROR: HIP kernel %s has no device code compatible with HIP arch %d.\n",
file_name, line, function_name, arch);
GGML_UNUSED(arch_list);
#else
printf("%s:%d: ERROR: CUDA kernel %s has no device code compatible with CUDA arch %d. ggml-cuda.cu was compiled for: %s\n",
file_name, line, function_name, arch, arch_list);
#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
__trap();
GGML_UNUSED(no_device_code); // suppress unused function warning
}
#ifdef __CUDA_ARCH__
#define NO_DEVICE_CODE no_device_code(__FILE__, __LINE__, __FUNCTION__, __CUDA_ARCH__, STRINGIZE(__CUDA_ARCH_LIST__))
#else
#define NO_DEVICE_CODE //GGML_ASSERT(false && "NO_DEVICE_CODE not valid in host code.")
#endif // __CUDA_ARCH__
static __device__ __forceinline__ float warp_reduce_sum(float x) {
#pragma unroll
for (int mask = 16; mask > 0; mask >>= 1) {
x += __shfl_xor_sync(0xffffffff, x, mask, 32);
}
return x;
}
static __device__ __forceinline__ float2 warp_reduce_sum(float2 a) {
#pragma unroll
for (int mask = 16; mask > 0; mask >>= 1) {
a.x += __shfl_xor_sync(0xffffffff, a.x, mask, 32);
a.y += __shfl_xor_sync(0xffffffff, a.y, mask, 32);
}
return a;
}
static __device__ __forceinline__ half2 warp_reduce_sum(half2 a) {
#if FP16_AVAILABLE
#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
#pragma unroll
for (int mask = 16; mask > 0; mask >>= 1) {
const half2 a_other = __shfl_xor_sync(0xffffffff, a, mask, 32);
reinterpret_cast<half&>(a.x) += __low2half(a_other);
reinterpret_cast<half&>(a.y) += __high2half(a_other);
}
return a;
#else
#pragma unroll
for (int mask = 16; mask > 0; mask >>= 1) {
a = __hadd2(a, __shfl_xor_sync(0xffffffff, a, mask, 32));
}
return a;
#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
#else
NO_DEVICE_CODE;
return a;
#endif // FP16_AVAILABLE
}
static __device__ __forceinline__ float warp_reduce_max(float x) {
#pragma unroll
for (int mask = 16; mask > 0; mask >>= 1) {
x = fmaxf(x, __shfl_xor_sync(0xffffffff, x, mask, 32));
}
return x;
}
static __device__ __forceinline__ half ggml_cuda_hmax(const half a, const half b) {
#if FP16_AVAILABLE
#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && CUDART_VERSION < CUDART_HMAX
return __float2half(fmaxf(__half2float(a), __half2float(b)));
#else
return __hmax(a, b);
#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && CUDART_VERSION < CUDART_HMAX
#else
NO_DEVICE_CODE;
GGML_UNUSED(b);
return a;
#endif // FP16_AVAILABLE
}
static __device__ __forceinline__ half2 ggml_cuda_hmax2(const half2 a, const half2 b) {
#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))
#if CUDART_VERSION >= CUDART_HMAX
return __hmax2(a, b);
#else
half2 ret;
reinterpret_cast<half&>(ret.x) = __float2half(fmaxf( __low2float(a), __low2float(b)));
reinterpret_cast<half&>(ret.y) = __float2half(fmaxf(__high2float(a), __high2float(b)));
return ret;
#endif // CUDART_VERSION >= CUDART_HMAX
#else
GGML_UNUSED(a);
GGML_UNUSED(b);
NO_DEVICE_CODE;
#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))
}
static __device__ __forceinline__ half2 warp_reduce_max(half2 x) {
#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL
#pragma unroll
for (int mask = 16; mask > 0; mask >>= 1) {
x = ggml_cuda_hmax2(x, __shfl_xor_sync(0xffffffff, x, mask, 32));
}
return x;
#else
GGML_UNUSED(x);
NO_DEVICE_CODE;
#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL
}
#if CUDART_VERSION < CUDART_HMASK
static __device__ __forceinline__ uint32_t __hgt2_mask(const half2 a, const half2 b) {
const uint32_t mask_low = 0x0000FFFF * (float( __low2half(a)) > float( __low2half(b)));
const uint32_t mask_high = 0xFFFF0000 * (float(__high2half(a)) > float(__high2half(b)));
return mask_low | mask_high;
}
#endif // CUDART_VERSION < 12000
// TODO: move to ggml-common.h
static const __device__ int8_t kvalues_iq4nl[16] = {-127, -104, -83, -65, -49, -35, -22, -10, 1, 13, 25, 38, 53, 69, 89, 113};
@ -491,6 +543,43 @@ struct ggml_tensor_extra_gpu {
cudaEvent_t events[GGML_CUDA_MAX_DEVICES][GGML_CUDA_MAX_STREAMS]; // events for synchronizing multiple GPUs
};
#if (CUDART_VERSION >= 12000) && defined(GGML_CUDA_USE_GRAPHS)
#define USE_CUDA_GRAPH
#endif
struct ggml_graph_node_properties {
void * node_address;
ggml_op node_op;
int64_t ne[GGML_MAX_DIMS];
size_t nb[GGML_MAX_DIMS];
void * src_address[GGML_MAX_SRC];
};
struct ggml_cuda_graph {
#ifdef USE_CUDA_GRAPH
~ggml_cuda_graph() {
if (instance != nullptr) {
CUDA_CHECK(cudaGraphExecDestroy(instance));
}
if (graph != nullptr) {
CUDA_CHECK(cudaGraphDestroy(graph));
}
}
cudaGraph_t graph = nullptr;
cudaGraphExec_t instance = nullptr;
size_t num_nodes = 0;
std::vector<cudaGraphNode_t> nodes;
std::vector<cudaKernelNodeParams> params;
bool disable_due_to_gpu_arch = false;
bool disable_due_to_too_many_updates = false;
bool disable_due_to_failed_graph_capture = false;
int number_consecutive_updates = 0;
std::vector<ggml_graph_node_properties> ggml_graph_properties;
std::vector<char **> updated_kernel_arg;
#endif
};
struct ggml_backend_cuda_context {
int device;
std::string name;
@ -499,6 +588,8 @@ struct ggml_backend_cuda_context {
cudaStream_t streams[GGML_CUDA_MAX_DEVICES][GGML_CUDA_MAX_STREAMS] = { { nullptr } };
cublasHandle_t cublas_handles[GGML_CUDA_MAX_DEVICES] = {nullptr};
std::unique_ptr<ggml_cuda_graph> cuda_graph;
explicit ggml_backend_cuda_context(int device) :
device(device),
name(GGML_CUDA_NAME + std::to_string(device)) {

View file

@ -727,7 +727,6 @@ static void convert_unary_cuda(const void * __restrict__ vx, dst_t * __restrict_
}
to_fp16_cuda_t ggml_get_to_fp16_cuda(ggml_type type) {
int id;
switch (type) {
case GGML_TYPE_Q4_0:
return dequantize_row_q4_0_cuda;
@ -738,8 +737,7 @@ to_fp16_cuda_t ggml_get_to_fp16_cuda(ggml_type type) {
case GGML_TYPE_Q5_1:
return dequantize_block_cuda<QK5_1, QR5_1, dequantize_q5_1>;
case GGML_TYPE_Q8_0:
CUDA_CHECK(cudaGetDevice(&id));
if (ggml_cuda_info().devices[id].cc >= CC_PASCAL) {
if (ggml_cuda_info().devices[ggml_cuda_get_device()].cc >= CC_PASCAL) {
return dequantize_block_q8_0_f16_cuda;
}
return dequantize_block_cuda<QK8_0, QR8_0, dequantize_q8_0>;

View file

@ -459,3 +459,32 @@ void ggml_cuda_dup(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
const ggml_tensor * src0 = dst->src[0];
ggml_cuda_cpy(ctx, src0, dst);
}
void* ggml_cuda_cpy_fn(const ggml_tensor * src0, ggml_tensor * src1) {
if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32) {
return (void*) cpy_f32_f16<cpy_1_f32_f32>;
} else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F16) {
return (void*) cpy_f32_f16<cpy_1_f32_f16>;
} else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q8_0) {
return (void*) cpy_f32_q<cpy_blck_f32_q8_0, QK8_0>;
} else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q4_0) {
return (void*) cpy_f32_q<cpy_blck_f32_q4_0, QK4_0>;
} else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q4_1) {
return (void*) cpy_f32_q<cpy_blck_f32_q4_1, QK4_1>;
} else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q5_0) {
return (void*) cpy_f32_q<cpy_blck_f32_q5_0, QK5_0>;
} else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_IQ4_NL) {
return (void*) cpy_f32_q<cpy_blck_f32_iq4_nl, QK4_NL>;
} else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q5_1) {
return (void*) cpy_f32_q<cpy_blck_f32_q5_1, QK5_1>;
} else if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F16) {
return (void*) cpy_f32_f16<cpy_1_f32_f16>;
} else if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F32) {
return (void*) cpy_f32_f16<cpy_1_f16_f32>;
} else {
fprintf(stderr, "%s: unsupported type combination (%s to %s)\n", __func__,
ggml_type_name(src0->type), ggml_type_name(src1->type));
GGML_ASSERT(false);
}
}

View file

@ -5,3 +5,5 @@
void ggml_cuda_cpy(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, ggml_tensor * src1);
void ggml_cuda_dup(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
void* ggml_cuda_cpy_fn(const ggml_tensor * src0, ggml_tensor * src1);

View file

@ -11,8 +11,10 @@
#define HALF_MAX_HALF __float2half(65504.0f/2) // Use neg. of this instead of -INFINITY to initialize KQ max vals to avoid NaN upon subtraction.
#define SOFTMAX_FTZ_THRESHOLD -20.0f // Softmax exp. of values smaller than this are flushed to zero to avoid NaNs.
template<int D, int parallel_blocks> // D == head size
__launch_bounds__(((D + WARP_SIZE - 1) / WARP_SIZE)*WARP_SIZE, 1)
template<int D, int ncols, int parallel_blocks> // D == head size
#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))
__launch_bounds__(D, 1)
#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))
static __global__ void flash_attn_vec_ext_f16(
const char * __restrict__ Q,
const char * __restrict__ K,
@ -44,55 +46,77 @@ static __global__ void flash_attn_vec_ext_f16(
#if FP16_AVAILABLE
//In this kernel Q, K, V are matrices while i, j, k are matrix indices.
const int ic = blockIdx.x / parallel_blocks; // Index of the Q/QKV column to work on.
const int ic0 = (blockIdx.x / parallel_blocks) * ncols; // Index of the Q/QKV column to work on.
const int ip = blockIdx.x % parallel_blocks; // Index in group of blocks running for the same column in parallel.
const int gqa_ratio = ne02 / ne12; // With grouped query attention there are > 1 Q matrices per K, V matrix.
const float2 * Q_f2 = (const float2 *) (Q + nb02* blockIdx.y + nb01*ic);
const float2 * Q_f2 = (const float2 *) (Q + nb02* blockIdx.y + nb01*ic0);
const half2 * K_h2 = (const half2 *) (K + nb12*(blockIdx.y / gqa_ratio));
const half * V_h = (const half *) (V + nb12*(blockIdx.y / gqa_ratio)); // K and V have same shape
const half * maskh = (const half *) mask + ne11*ic;
const half * maskh = (const half *) mask + ne11*ic0;
const int stride_KV = nb11 / sizeof(half);
const int stride_KV2 = nb11 / sizeof(half2);
constexpr int nwarps = (D + WARP_SIZE - 1) / WARP_SIZE;
static_assert(D % (2*WARP_SIZE) == 0, "D not divisible by 2*WARP_SIZE == 64.");
constexpr int nwarps = D / WARP_SIZE;
const int tid = WARP_SIZE*threadIdx.y + threadIdx.x;
__builtin_assume(tid < nwarps*WARP_SIZE);
__builtin_assume(tid < D);
__shared__ half KQ[nwarps*WARP_SIZE];
KQ[tid] = -INFINITY;
__shared__ half KQ[ncols*D];
#pragma unroll
for (int j = 0; j < ncols; ++j) {
KQ[j*D + tid] = -HALF_MAX_HALF;
}
half2 * KQ2 = (half2 *) KQ;
half kqmax = -HALF_MAX_HALF;
half kqsum = 0.0f;
half kqmax[ncols];
#pragma unroll
for (int j = 0; j < ncols; ++j) {
kqmax[j] = -HALF_MAX_HALF;
}
half kqsum[ncols] = {0.0f};
__shared__ half kqmax_shared[WARP_SIZE];
__shared__ half kqsum_shared[WARP_SIZE];
__shared__ half kqmax_shared[ncols][WARP_SIZE];
__shared__ half kqsum_shared[ncols][WARP_SIZE];
#pragma unroll
for (int j = 0; j < ncols; ++j) {
if (threadIdx.y == 0) {
kqmax_shared[threadIdx.x] = -HALF_MAX_HALF;
kqsum_shared[threadIdx.x] = 0.0f;
kqmax_shared[j][threadIdx.x] = -HALF_MAX_HALF;
kqsum_shared[j][threadIdx.x] = 0.0f;
}
}
__syncthreads();
// Convert Q to half2 and store in registers:
half2 Q_h2[(D/2 + WARP_SIZE - 1) / WARP_SIZE];
half2 Q_h2[ncols][D/(2*WARP_SIZE)];
#pragma unroll
for (int j = 0; j < ncols; ++j) {
#pragma unroll
for (int i0 = 0; i0 < D/2; i0 += WARP_SIZE) {
const int i = i0 + threadIdx.x;
if (i0 + WARP_SIZE > D/2 && i >= D/2) {
break;
const float2 tmp = Q_f2[j*(nb01/sizeof(float2)) + i];
Q_h2[j][i0/WARP_SIZE] = make_half2(scale, scale) * make_half2(tmp.x, tmp.y);
}
}
Q_h2[i0/WARP_SIZE] = make_half2(scale, scale) * make_half2(Q_f2[i].x, Q_f2[i].y);
}
half2 VKQ = make_half2(0.0f, 0.0f); // Each thread calculates a single VKQ value.
half2 VKQ[ncols] = {{0.0f, 0.0f}};
const int k_start = parallel_blocks == 1 ? 0 : ip*D;
for (int k_VKQ_0 = k_start; k_VKQ_0 < ne11; k_VKQ_0 += parallel_blocks*D) {
// Calculate KQ tile and keep track of new maximum KQ values:
half kqmax_new = kqmax;
// For unknown reasons using a half array of size 1 for kqmax_new causes a performance regression,
// see https://github.com/ggerganov/llama.cpp/pull/7061 .
// Therefore this variable is defined twice but only used once (so that the compiler can optimize out the unused variable).
half kqmax_new = kqmax[0];
half kqmax_new_arr[ncols];
#pragma unroll
for (int j = 0; j < ncols; ++j) {
kqmax_new_arr[j] = kqmax[j];
}
#pragma unroll
for (int i_KQ_0 = 0; i_KQ_0 < D; i_KQ_0 += nwarps) {
const int i_KQ = i_KQ_0 + threadIdx.y;
@ -101,47 +125,65 @@ static __global__ void flash_attn_vec_ext_f16(
break;
}
half2 sum2 = make_half2(0.0f, 0.0f);
half2 sum2[ncols] = {{0.0f, 0.0f}};
#pragma unroll
for (int k_KQ_0 = 0; k_KQ_0 < D/2; k_KQ_0 += WARP_SIZE) {
const int k_KQ = k_KQ_0 + threadIdx.x;
if (k_KQ_0 + WARP_SIZE > D/2 && k_KQ >= D/2) {
break;
}
const half2 K_ik = K_h2[(k_VKQ_0 + i_KQ)*stride_KV2 + k_KQ];
sum2 += K_ik * Q_h2[k_KQ_0/WARP_SIZE];
#pragma unroll
for (int j = 0; j < ncols; ++j) {
sum2[j] += K_ik * Q_h2[j][k_KQ_0/WARP_SIZE];
}
}
#pragma unroll
for (int j = 0; j < ncols; ++j) {
sum2[j] = warp_reduce_sum(sum2[j]);
half sum = __low2half(sum2[j]) + __high2half(sum2[j]);
sum += mask ? maskh[j*ne11 + k_VKQ_0 + i_KQ] : __float2half(0.0f);
if (ncols == 1) {
kqmax_new = ggml_cuda_hmax(kqmax_new, sum);
} else {
kqmax_new_arr[j] = ggml_cuda_hmax(kqmax_new_arr[j], sum);
}
sum2 = warp_reduce_sum(sum2);
half sum = __low2half(sum2) + __high2half(sum2);
sum += mask ? maskh[k_VKQ_0 + i_KQ] : __float2half(0.0f);
kqmax_new = __hmax(kqmax_new, sum);
if (threadIdx.x == 0) {
KQ[i_KQ] = sum;
KQ[j*D + i_KQ] = sum;
}
}
}
kqmax_new = warp_reduce_max(kqmax_new);
#pragma unroll
for (int j = 0; j < ncols; ++j) {
half kqmax_new_j = ncols == 1 ? kqmax_new : kqmax_new_arr[j];
kqmax_new_j = warp_reduce_max(kqmax_new_j);
if (threadIdx.x == 0) {
kqmax_shared[threadIdx.y] = kqmax_new;
kqmax_shared[j][threadIdx.y] = kqmax_new_j;
}
}
__syncthreads();
#pragma unroll
for (int j = 0; j < ncols; ++j) {
half kqmax_new_j = kqmax_shared[j][threadIdx.x];
kqmax_new_j = warp_reduce_max(kqmax_new_j);
const half KQ_max_scale = hexp(kqmax[j] - kqmax_new_j);
kqmax[j] = kqmax_new_j;
const half val = hexp(KQ[j*D + tid] - kqmax[j]);
kqsum[j] = kqsum[j]*KQ_max_scale + val;
KQ[j*D + tid] = val;
VKQ[j] *= __half2half2(KQ_max_scale);
}
__syncthreads();
kqmax_new = kqmax_shared[threadIdx.x];
kqmax_new = warp_reduce_max(kqmax_new);
const half KQ_max_scale = hexp(kqmax - kqmax_new);
kqmax = kqmax_new;
const half val = hexp(KQ[tid] - kqmax);
kqsum = kqsum*KQ_max_scale + val;
KQ[tid] = val;
VKQ *= __half2half2(KQ_max_scale);
__syncthreads();
if (tid < D) {
#pragma unroll
for (int k0 = 0; k0 < D; k0 += 2) {
if (FATTN_KQ_STRIDE % D != 0 && k_VKQ_0 + k0 >= ne11) {
@ -151,39 +193,44 @@ static __global__ void flash_attn_vec_ext_f16(
half2 V_k;
reinterpret_cast<half&>(V_k.x) = V_h[(k_VKQ_0 + k0 + 0)*stride_KV + tid];
reinterpret_cast<half&>(V_k.y) = V_h[(k_VKQ_0 + k0 + 1)*stride_KV + tid];
VKQ += V_k*KQ2[k0/2];
#pragma unroll
for (int j = 0; j < ncols; ++j) {
VKQ[j] += V_k*KQ2[j*(D/2) + k0/2];
}
}
__syncthreads();
}
if (tid >= D) {
kqsum = 0.0f;
}
kqsum = warp_reduce_sum(kqsum);
#pragma unroll
for (int j = 0; j < ncols; ++j) {
kqsum[j] = warp_reduce_sum(kqsum[j]);
if (threadIdx.x == 0) {
kqsum_shared[threadIdx.y] = kqsum;
kqsum_shared[j][threadIdx.y] = kqsum[j];
}
}
__syncthreads();
kqsum = kqsum_shared[threadIdx.x];
kqsum = warp_reduce_sum(kqsum);
if (tid >= D) {
return;
}
#pragma unroll
for (int j_VKQ = 0; j_VKQ < ncols; ++j_VKQ) {
kqsum[j_VKQ] = kqsum_shared[j_VKQ][threadIdx.x];
kqsum[j_VKQ] = warp_reduce_sum(kqsum[j_VKQ]);
half dst_val = (__low2half(VKQ) + __high2half(VKQ));
half dst_val = (__low2half(VKQ[j_VKQ]) + __high2half(VKQ[j_VKQ]));
if (parallel_blocks == 1) {
dst_val /= kqsum;
dst_val /= kqsum[j_VKQ];
}
const int j_dst = (ic0 + j_VKQ)*parallel_blocks + ip;
dst[j_dst*D*gridDim.y + D*blockIdx.y + tid] = dst_val;
}
dst[D*gridDim.y*blockIdx.x + D*blockIdx.y + tid] = dst_val;
if (parallel_blocks == 1 || tid != 0) {
return;
if (parallel_blocks != 1 && tid != 0) {
#pragma unroll
for (int j = 0; j < ncols; ++j) {
dst_meta[(ic0 + j)*gridDim.y*parallel_blocks + blockIdx.y*parallel_blocks + ip] = make_float2(kqmax[j], kqsum[j]);
}
}
dst_meta[ic*gridDim.y*parallel_blocks + blockIdx.y*parallel_blocks + ip] = make_float2(kqmax, kqsum);
#else
NO_DEVICE_CODE;
#endif // FP16_AVAILABLE
@ -191,7 +238,9 @@ static __global__ void flash_attn_vec_ext_f16(
// D == head size, VKQ_stride == num VKQ rows calculated in parallel:
template<int D, int ncols, int nwarps, int VKQ_stride, int parallel_blocks, typename KQ_acc_t>
#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))
__launch_bounds__(nwarps*WARP_SIZE, 1)
#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))
static __global__ void flash_attn_ext_f16(
const char * __restrict__ Q,
const char * __restrict__ K,
@ -416,9 +465,9 @@ static __global__ void flash_attn_ext_f16(
const int k = k0 + threadIdx.x;
KQ2_tmp[k0/WARP_SIZE] += mask ? mask2[(j*ne11 + k_VKQ_0)/2 + k] : make_half2(0.0f, 0.0f);
KQ_max_new = __hmax2(KQ_max_new, KQ2_tmp[k0/WARP_SIZE]);
KQ_max_new = ggml_cuda_hmax2(KQ_max_new, KQ2_tmp[k0/WARP_SIZE]);
}
KQ_max_new = __half2half2(warp_reduce_max(__hmax(__low2half(KQ_max_new), __high2half(KQ_max_new))));
KQ_max_new = __half2half2(warp_reduce_max(ggml_cuda_hmax(__low2half(KQ_max_new), __high2half(KQ_max_new))));
const half2 diff = KQ_max_h2[j0/nwarps] - KQ_max_new;
KQ_max_scale_h2[j0/nwarps] = h2exp(diff);
const uint32_t ftz_mask = __hgt2_mask(diff, make_half2(SOFTMAX_FTZ_THRESHOLD, SOFTMAX_FTZ_THRESHOLD));
@ -573,7 +622,9 @@ static __global__ void flash_attn_ext_f16(
}
template<int D, int parallel_blocks> // D == head size
#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))
__launch_bounds__(D, 1)
#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))
static __global__ void flash_attn_combine_results(
const float * __restrict__ VKQ_parts,
const float2 * __restrict__ VKQ_meta,
@ -642,7 +693,7 @@ static_assert(get_VKQ_stride( 80, 1, 16) == 16, "Test failed.");
static_assert(get_VKQ_stride( 80, 2, 16) == 16, "Test failed.");
static_assert(get_VKQ_stride( 80, 4, 16) == 16, "Test failed.");
template <int D, int parallel_blocks> void launch_fattn_vec_f16(
template <int D, int cols_per_block, int parallel_blocks> void launch_fattn_vec_f16(
const ggml_tensor * Q, const ggml_tensor * K, const ggml_tensor * V, ggml_tensor * KQV, const ggml_tensor * mask,
ggml_cuda_pool & pool, cudaStream_t main_stream
) {
@ -656,13 +707,13 @@ template <int D, int parallel_blocks> void launch_fattn_vec_f16(
constexpr int nwarps = (D + WARP_SIZE - 1) / WARP_SIZE;
const dim3 block_dim(WARP_SIZE, nwarps, 1);
const dim3 blocks_num(parallel_blocks*Q->ne[1], Q->ne[2], Q->ne[3]);
const dim3 blocks_num(parallel_blocks*((Q->ne[1] + cols_per_block - 1) / cols_per_block), Q->ne[2], Q->ne[3]);
const int shmem = 0;
float scale;
memcpy(&scale, KQV->op_params, sizeof(float));
flash_attn_vec_ext_f16<D, parallel_blocks>
flash_attn_vec_ext_f16<D, cols_per_block, parallel_blocks>
<<<blocks_num, block_dim, shmem, main_stream>>> (
(const char *) Q->data,
(const char *) K->data,
@ -783,10 +834,99 @@ void ggml_cuda_flash_attn_ext(ggml_backend_cuda_context & ctx, ggml_tensor * dst
ggml_cuda_set_device(ctx.device);
const int cc = ggml_cuda_info().devices[ggml_cuda_get_device()].cc;
const int nsm = ggml_cuda_info().devices[ggml_cuda_get_device()].nsm;
const int32_t precision = KQV->op_params[1];
if (!fp16_mma_available(cc)) {
GGML_ASSERT(precision == GGML_PREC_DEFAULT);
GGML_ASSERT(Q->ne[0] == 64 || Q->ne[0] == 128 && "FlashAttention without tensor cores only supports head sizes 64 and 128.");
if (Q->ne[1] == 1) {
constexpr int cols_per_block = 1;
constexpr int parallel_blocks = 4;
switch (Q->ne[0]) {
case 64:
launch_fattn_vec_f16< 64, cols_per_block, parallel_blocks>(Q, K, V, KQV, mask, ctx.pool(), ctx.stream());
break;
case 128:
launch_fattn_vec_f16<128, cols_per_block, parallel_blocks>(Q, K, V, KQV, mask, ctx.pool(), ctx.stream());
break;
default:
GGML_ASSERT(false);
break;
}
return;
}
if (Q->ne[1] == 2) {
constexpr int cols_per_block = 2;
constexpr int parallel_blocks = 4;
switch (Q->ne[0]) {
case 64:
launch_fattn_vec_f16< 64, cols_per_block, parallel_blocks>(Q, K, V, KQV, mask, ctx.pool(), ctx.stream());
break;
case 128:
launch_fattn_vec_f16<128, cols_per_block, parallel_blocks>(Q, K, V, KQV, mask, ctx.pool(), ctx.stream());
break;
default:
GGML_ASSERT(false);
break;
}
return;
}
if (Q->ne[1] <= 4) {
constexpr int cols_per_block = 4;
constexpr int parallel_blocks = 4;
switch (Q->ne[0]) {
case 64:
launch_fattn_vec_f16< 64, cols_per_block, parallel_blocks>(Q, K, V, KQV, mask, ctx.pool(), ctx.stream());
break;
case 128:
launch_fattn_vec_f16<128, cols_per_block, parallel_blocks>(Q, K, V, KQV, mask, ctx.pool(), ctx.stream());
break;
default:
GGML_ASSERT(false);
break;
}
return;
}
if (Q->ne[1] <= 8) {
constexpr int cols_per_block = 8;
constexpr int parallel_blocks = 4;
switch (Q->ne[0]) {
case 64:
launch_fattn_vec_f16< 64, cols_per_block, parallel_blocks>(Q, K, V, KQV, mask, ctx.pool(), ctx.stream());
break;
case 128:
launch_fattn_vec_f16<128, cols_per_block, parallel_blocks>(Q, K, V, KQV, mask, ctx.pool(), ctx.stream());
break;
default:
GGML_ASSERT(false);
break;
}
return;
}
constexpr int cols_per_block = 8;
constexpr int parallel_blocks = 1;
switch (Q->ne[0]) {
case 64:
launch_fattn_vec_f16< 64, cols_per_block, parallel_blocks>(Q, K, V, KQV, mask, ctx.pool(), ctx.stream());
break;
case 128:
launch_fattn_vec_f16<128, cols_per_block, parallel_blocks>(Q, K, V, KQV, mask, ctx.pool(), ctx.stream());
break;
default:
GGML_ASSERT(false);
break;
}
return;
}
if (precision != GGML_PREC_DEFAULT) {
if (Q->ne[1] <= 32 || Q->ne[0] > 128) {
constexpr int cols_per_block = 16;
@ -845,16 +985,17 @@ void ggml_cuda_flash_attn_ext(ggml_backend_cuda_context & ctx, ggml_tensor * dst
}
if (Q->ne[1] == 1 && Q->ne[0] % (2*WARP_SIZE) == 0) {
constexpr int cols_per_block = 1;
constexpr int parallel_blocks = 4;
switch (Q->ne[0]) {
case 64:
launch_fattn_vec_f16< 64, parallel_blocks>(Q, K, V, KQV, mask, ctx.pool(), ctx.stream());
launch_fattn_vec_f16< 64, cols_per_block, parallel_blocks>(Q, K, V, KQV, mask, ctx.pool(), ctx.stream());
break;
case 128:
launch_fattn_vec_f16<128, parallel_blocks>(Q, K, V, KQV, mask, ctx.pool(), ctx.stream());
launch_fattn_vec_f16<128, cols_per_block, parallel_blocks>(Q, K, V, KQV, mask, ctx.pool(), ctx.stream());
break;
case 256:
launch_fattn_vec_f16<256, parallel_blocks>(Q, K, V, KQV, mask, ctx.pool(), ctx.stream());
launch_fattn_vec_f16<256, cols_per_block, parallel_blocks>(Q, K, V, KQV, mask, ctx.pool(), ctx.stream());
break;
default:
GGML_ASSERT(false);

View file

@ -1735,8 +1735,7 @@ static void ggml_mul_mat_q4_0_q8_1_cuda(
const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x,
const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) {
int id;
CUDA_CHECK(cudaGetDevice(&id));
int id = ggml_cuda_get_device();
const int compute_capability = ggml_cuda_info().devices[id].cc;
int mmq_x, mmq_y, nwarps;
@ -1780,8 +1779,7 @@ static void ggml_mul_mat_q4_1_q8_1_cuda(
const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x,
const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) {
int id;
CUDA_CHECK(cudaGetDevice(&id));
int id = ggml_cuda_get_device();
const int compute_capability = ggml_cuda_info().devices[id].cc;
int mmq_x, mmq_y, nwarps;
@ -1825,8 +1823,7 @@ static void ggml_mul_mat_q5_0_q8_1_cuda(
const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x,
const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) {
int id;
CUDA_CHECK(cudaGetDevice(&id));
int id = ggml_cuda_get_device();
const int compute_capability = ggml_cuda_info().devices[id].cc;
int mmq_x, mmq_y, nwarps;
@ -1870,8 +1867,7 @@ static void ggml_mul_mat_q5_1_q8_1_cuda(
const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x,
const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) {
int id;
CUDA_CHECK(cudaGetDevice(&id));
int id = ggml_cuda_get_device();
const int compute_capability = ggml_cuda_info().devices[id].cc;
int mmq_x, mmq_y, nwarps;
@ -1915,8 +1911,7 @@ static void ggml_mul_mat_q8_0_q8_1_cuda(
const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x,
const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) {
int id;
CUDA_CHECK(cudaGetDevice(&id));
int id = ggml_cuda_get_device();
const int compute_capability = ggml_cuda_info().devices[id].cc;
int mmq_x, mmq_y, nwarps;
@ -1960,8 +1955,7 @@ static void ggml_mul_mat_q2_K_q8_1_cuda(
const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x,
const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) {
int id;
CUDA_CHECK(cudaGetDevice(&id));
int id = ggml_cuda_get_device();
const int compute_capability = ggml_cuda_info().devices[id].cc;
int mmq_x, mmq_y, nwarps;
@ -2007,8 +2001,7 @@ static void ggml_mul_mat_q3_K_q8_1_cuda(
#if QK_K == 256
int id;
CUDA_CHECK(cudaGetDevice(&id));
int id = ggml_cuda_get_device();
const int compute_capability = ggml_cuda_info().devices[id].cc;
int mmq_x, mmq_y, nwarps;
@ -2053,8 +2046,7 @@ static void ggml_mul_mat_q4_K_q8_1_cuda(
const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x,
const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) {
int id;
CUDA_CHECK(cudaGetDevice(&id));
int id = ggml_cuda_get_device();
const int compute_capability = ggml_cuda_info().devices[id].cc;
int mmq_x, mmq_y, nwarps;
@ -2098,8 +2090,7 @@ static void ggml_mul_mat_q5_K_q8_1_cuda(
const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x,
const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) {
int id;
CUDA_CHECK(cudaGetDevice(&id));
int id = ggml_cuda_get_device();
const int compute_capability = ggml_cuda_info().devices[id].cc;
int mmq_x, mmq_y, nwarps;
@ -2143,8 +2134,7 @@ static void ggml_mul_mat_q6_K_q8_1_cuda(
const void * vx, const void * vy, float * dst, const int ncols_x, const int nrows_x,
const int ncols_y, const int nrows_y, const int nrows_dst, cudaStream_t stream) {
int id;
CUDA_CHECK(cudaGetDevice(&id));
int id = ggml_cuda_get_device();
const int compute_capability = ggml_cuda_info().devices[id].cc;
int mmq_x, mmq_y, nwarps;

View file

@ -89,8 +89,7 @@ static void mul_mat_vec_q_cuda(
GGML_ASSERT(ncols_x % qk == 0);
GGML_ASSERT(ncols_y <= MMVQ_MAX_BATCH_SIZE);
int id;
CUDA_CHECK(cudaGetDevice(&id));
int id = ggml_cuda_get_device();
int64_t nwarps = 1;
int64_t rows_per_cuda_block = 1;
@ -328,8 +327,7 @@ void ggml_cuda_op_mul_mat_vec_q(
const int64_t ne0 = dst->ne[0];
int id;
CUDA_CHECK(cudaGetDevice(&id));
int id = ggml_cuda_get_device();
// the main device has a larger memory buffer to hold the results from all GPUs
// nrows_dst == nrows of the matrix that the kernel writes into

View file

@ -28,5 +28,4 @@ void ggml_cuda_op_scale(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
memcpy(&scale, dst->op_params, sizeof(float));
scale_f32_cuda(src0_d, dst_d, scale, ggml_nelements(src0), stream);
CUDA_CHECK(cudaGetLastError());
}

View file

@ -17,6 +17,83 @@
#define MIN(a, b) ((a) < (b) ? (a) : (b))
#define MAX(a, b) ((a) > (b) ? (a) : (b))
/**
* Converts brain16 to float32.
*
* The bfloat16 floating point format has the following structure:
*
* sign
*
* exponent
*
* mantissa
*
*
* 0b0000000000000000 brain16
*
* Since bf16 has the same number of exponent bits as a 32bit float,
* encoding and decoding numbers becomes relatively straightforward.
*
* sign
*
* exponent
*
* mantissa
*
*
* 0b00000000000000000000000000000000 IEEE binary32
*
* For comparison, the standard fp16 format has fewer exponent bits.
*
* sign
*
* exponent
*
* mantissa
*
*
* 0b0000000000000000 IEEE binary16
*
* @see IEEE 754-2008
*/
static inline float ggml_compute_bf16_to_fp32(ggml_bf16_t h) {
union {
float f;
uint32_t i;
} u;
u.i = (uint32_t)h.bits << 16;
return u.f;
}
/**
* Converts float32 to brain16.
*
* This function is binary identical to AMD Zen4 VCVTNEPS2BF16.
* Subnormals shall be flushed to zero, and NANs will be quiet.
* This code should vectorize nicely if using modern compilers.
*/
static inline ggml_bf16_t ggml_compute_fp32_to_bf16(float s) {
ggml_bf16_t h;
union {
float f;
uint32_t i;
} u;
u.f = s;
if ((u.i & 0x7fffffff) > 0x7f800000) { /* nan */
h.bits = (u.i >> 16) | 64; /* force to quiet */
return h;
}
if (!(u.i & 0x7f800000)) { /* subnormal */
h.bits = (u.i & 0x80000000) >> 16; /* flush to zero */
return h;
}
h.bits = (u.i + (0x7fff + ((u.i >> 16) & 1))) >> 16;
return h;
}
#define GGML_FP32_TO_BF16(x) ggml_compute_fp32_to_bf16(x)
#define GGML_BF16_TO_FP32(x) ggml_compute_bf16_to_fp32(x)
#ifdef __cplusplus
extern "C" {
#endif

View file

@ -265,11 +265,20 @@ static void ggml_metal_log(enum ggml_log_level level, const char * format, ...){
static void * ggml_metal_host_malloc(size_t n) {
void * data = NULL;
#if TARGET_OS_OSX
kern_return_t err = vm_allocate((vm_map_t) mach_task_self(), (void *) &data, n, VM_FLAGS_ANYWHERE);
if (err != KERN_SUCCESS) {
GGML_METAL_LOG_ERROR("%s: error: vm_allocate failed\n", __func__);
return NULL;
}
#else
const int result = posix_memalign((void **) &data, sysconf(_SC_PAGESIZE), n);
if (result != 0) {
GGML_METAL_LOG_ERROR("%s: error: posix_memalign failed\n", __func__);
return NULL;
}
#endif
return data;
}
@ -803,7 +812,7 @@ static bool ggml_metal_supports_op(const struct ggml_metal_context * ctx, const
case GGML_OP_DIAG_MASK_INF:
case GGML_OP_GET_ROWS:
{
return op->ne[3] == 1;
return op->src[0]->type != GGML_TYPE_BF16 && op->ne[3] == 1;
}
default:
return false;
@ -2840,7 +2849,11 @@ GGML_CALL static void ggml_backend_metal_buffer_free_buffer(ggml_backend_buffer_
ggml_backend_metal_free_device();
if (ctx->owned) {
#if TARGET_OS_OSX
vm_deallocate((vm_map_t)mach_task_self(), (vm_address_t)ctx->all_data, ctx->all_size);
#else
free(ctx->all_data);
#endif
}
free(ctx);
@ -2944,14 +2957,16 @@ GGML_CALL static ggml_backend_buffer_t ggml_backend_metal_buffer_type_alloc_buff
ctx->owned = true;
ctx->n_buffers = 1;
if (ctx->all_data != NULL) {
ctx->buffers[0].data = ctx->all_data;
ctx->buffers[0].size = size;
ctx->buffers[0].metal = [device newBufferWithBytesNoCopy:ctx->all_data
length:size_aligned
options:MTLResourceStorageModeShared
deallocator:nil];
}
if (ctx->buffers[0].metal == nil) {
if (ctx->all_data == NULL || ctx->buffers[0].metal == nil) {
GGML_METAL_LOG_ERROR("%s: error: failed to allocate buffer, size = %8.2f MiB\n", __func__, size_aligned / 1024.0 / 1024.0);
free(ctx);
ggml_backend_metal_free_device();

View file

@ -2175,7 +2175,7 @@ kernel void kernel_flash_attn_ext_f16(
const short D4 = D/4;
const short D8 = D/8;
const short Q8 = Q/8;
//const short Q8 = Q/8;
const short NW = N_SIMDWIDTH;
const short SH = (C + Q); // shared memory per simdgroup in (half)

View file

@ -2119,6 +2119,7 @@ static size_t ggml_backend_opencl_buffer_type_get_alignment(ggml_backend_buffer_
if (alignment == (cl_uint)-1) {
ggml_cl_init();
clGetDeviceInfo(device, CL_DEVICE_MEM_BASE_ADDR_ALIGN, sizeof(cl_uint), &alignment, NULL);
alignment /= 8; // bits to bytes
}
return alignment;

View file

@ -12450,6 +12450,24 @@ bool ggml_validate_row_data(enum ggml_type type, const void * data, size_t nbyte
const size_t nb = nbytes/ggml_type_size(type);
switch (type) {
case GGML_TYPE_BF16:
{
int nans = 0;
int infs = 0;
const unsigned short * f = (const unsigned short *) data;
for (size_t i = 0; i < nb; ++i) {
nans += (f[i] & 0x7fff) > 0x7f80;
infs += (f[i] & 0x7fff) == 0x7f80;
}
if (nans) {
fprintf(stderr, "%s: found %d NaNs in row of %zu BF16 values\n", __func__, nans, nb);
return false;
}
if (infs) {
fprintf(stderr, "%s: found %d infinities in row of %zu BF16 values\n", __func__, infs, nb);
return false;
}
} break;
case GGML_TYPE_F16:
{
const ggml_fp16_t * f = (const ggml_fp16_t *) data;

1031
ggml.c

File diff suppressed because it is too large Load diff

20
ggml.h
View file

@ -326,14 +326,20 @@ extern "C" {
// get ggml_status name string
GGML_API GGML_CALL const char * ggml_status_to_string(enum ggml_status status);
// ieee 754-2008 half-precision float16
// todo: make this not an integral type
typedef uint16_t ggml_fp16_t;
GGML_API float ggml_fp16_to_fp32(ggml_fp16_t);
GGML_API ggml_fp16_t ggml_fp32_to_fp16(float);
GGML_API void ggml_fp16_to_fp32_row(const ggml_fp16_t *, float *, int64_t);
GGML_API void ggml_fp32_to_fp16_row(const float *, ggml_fp16_t *, int64_t);
// convert FP16 <-> FP32
GGML_API float ggml_fp16_to_fp32(ggml_fp16_t x);
GGML_API ggml_fp16_t ggml_fp32_to_fp16(float x);
GGML_API void ggml_fp16_to_fp32_row(const ggml_fp16_t * x, float * y, int64_t n);
GGML_API void ggml_fp32_to_fp16_row(const float * x, ggml_fp16_t * y, int64_t n);
// google brain half-precision bfloat16
typedef struct { uint16_t bits; } ggml_bf16_t;
GGML_API ggml_bf16_t ggml_fp32_to_bf16(float);
GGML_API float ggml_bf16_to_fp32(ggml_bf16_t); // consider just doing << 16
GGML_API void ggml_bf16_to_fp32_row(const ggml_bf16_t *, float *, int64_t);
GGML_API void ggml_fp32_to_bf16_row(const float *, ggml_bf16_t *, int64_t);
struct ggml_object;
struct ggml_context;
@ -370,6 +376,7 @@ extern "C" {
GGML_TYPE_I64 = 27,
GGML_TYPE_F64 = 28,
GGML_TYPE_IQ1_M = 29,
GGML_TYPE_BF16 = 30,
GGML_TYPE_COUNT,
};
@ -410,6 +417,7 @@ extern "C" {
GGML_FTYPE_MOSTLY_IQ2_S = 21, // except 1d tensors
GGML_FTYPE_MOSTLY_IQ4_XS = 22, // except 1d tensors
GGML_FTYPE_MOSTLY_IQ1_M = 23, // except 1d tensors
GGML_FTYPE_MOSTLY_BF16 = 24, // except 1d tensors
};
// available tensor operations:

View file

@ -1,11 +1,14 @@
#!/usr/bin/env python
import logging
import argparse
import asyncio
import os
import sys
from tempfile import gettempdir, NamedTemporaryFile
logger = logging.getLogger("ggml-vk-generate-shaders")
shader_f32 = """
#define FLOAT_TYPE float
"""
@ -2498,7 +2501,7 @@ async def string_to_spv(name, code, defines, fp16=True):
stdout, stderr = await proc.communicate()
print(" ".join(cmd))
logger.info(" ".join(cmd))
if proc.returncode:
raise RuntimeError(f"{name=} {f.name=} {stdout=} {stderr=}")
@ -2507,7 +2510,7 @@ async def string_to_spv(name, code, defines, fp16=True):
cmd.extend([f"-D{key}={value}" for key, value in defines.items()])
code_with_lines = "\n".join([f"{i + 1}: {line}" for i, line in enumerate(preprocessed_code.splitlines())])
print(f"ERROR compiling {name}\n\n{code_with_lines}\n\n{error}")
logger.error(f"cannot compile {name}\n\n{code_with_lines}\n\n{error}")
f.close()
os.remove(f.name)
sys.exit(proc.returncode)
@ -2520,7 +2523,7 @@ async def string_to_spv(name, code, defines, fp16=True):
async def main():
print("ggml_vulkan: Generating and compiling shaders to SPIR-V")
logger.info("ggml_vulkan: Generating and compiling shaders to SPIR-V")
tasks = []
@ -2768,9 +2771,12 @@ if __name__ == "__main__":
parser = argparse.ArgumentParser(description="GGML Vulkan Shader Generator")
parser.add_argument("--glslc", help="Path to glslc")
parser.add_argument("--verbose", action="store_true", help="increase output verbosity")
args = parser.parse_args()
logging.basicConfig(level=logging.DEBUG if args.verbose else logging.INFO)
if args.glslc:
GLSLC = args.glslc

View file

@ -1,8 +1,10 @@
#!/usr/bin/env python3
import logging
import sys
from pathlib import Path
from gguf.gguf_reader import GGUFReader
logger = logging.getLogger("reader")
sys.path.insert(0, str(Path(__file__).parent.parent))
@ -18,28 +20,28 @@ def read_gguf_file(gguf_file_path):
reader = GGUFReader(gguf_file_path)
# List all key-value pairs in a columnized format
print("Key-Value Pairs:")
print("Key-Value Pairs:") # noqa: NP100
max_key_length = max(len(key) for key in reader.fields.keys())
for key, field in reader.fields.items():
value = field.parts[field.data[0]]
print(f"{key:{max_key_length}} : {value}")
print("----")
print(f"{key:{max_key_length}} : {value}") # noqa: NP100
print("----") # noqa: NP100
# List all tensors
print("Tensors:")
print("Tensors:") # noqa: NP100
tensor_info_format = "{:<30} | Shape: {:<15} | Size: {:<12} | Quantization: {}"
print(tensor_info_format.format("Tensor Name", "Shape", "Size", "Quantization"))
print("-" * 80)
print(tensor_info_format.format("Tensor Name", "Shape", "Size", "Quantization")) # noqa: NP100
print("-" * 80) # noqa: NP100
for tensor in reader.tensors:
shape_str = "x".join(map(str, tensor.shape))
size_str = str(tensor.n_elements)
quantization_str = tensor.tensor_type.name
print(tensor_info_format.format(tensor.name, shape_str, size_str, quantization_str))
print(tensor_info_format.format(tensor.name, shape_str, size_str, quantization_str)) # noqa: NP100
if __name__ == '__main__':
if len(sys.argv) < 2:
print("Usage: reader.py <path_to_gguf_file>")
logger.info("Usage: reader.py <path_to_gguf_file>")
sys.exit(1)
gguf_file_path = sys.argv[1]
read_gguf_file(gguf_file_path)

View file

@ -1,6 +1,5 @@
from __future__ import annotations
import sys
from enum import Enum, IntEnum, auto
from typing import Any
@ -843,6 +842,7 @@ class GGMLQuantizationType(IntEnum):
I64 = 27
F64 = 28
IQ1_M = 29
BF16 = 30
class GGUFEndian(IntEnum):
@ -879,14 +879,13 @@ class GGUFValueType(IntEnum):
return GGUFValueType.INT32
# TODO: need help with 64-bit types in Python
else:
print("Unknown type:", type(val))
sys.exit()
raise ValueError(f"Unknown type: {type(val)}")
# Note: Does not support GGML_QKK_64
QK_K = 256
# Items here are (block size, type size)
GGML_QUANT_SIZES = {
GGML_QUANT_SIZES: dict[GGMLQuantizationType, tuple[int, int]] = {
GGMLQuantizationType.F32: (1, 4),
GGMLQuantizationType.F16: (1, 2),
GGMLQuantizationType.Q4_0: (32, 2 + 16),
@ -915,6 +914,7 @@ GGML_QUANT_SIZES = {
GGMLQuantizationType.I64: (1, 8),
GGMLQuantizationType.F64: (1, 8),
GGMLQuantizationType.IQ1_M: (256, QK_K // 8 + QK_K // 16 + QK_K // 32),
GGMLQuantizationType.BF16: (1, 2),
}

View file

@ -4,6 +4,7 @@
#
from __future__ import annotations
import logging
import os
from collections import OrderedDict
from typing import Any, Literal, NamedTuple, TypeVar, Union
@ -27,6 +28,7 @@ from gguf.constants import (
GGUFValueType,
)
logger = logging.getLogger(__name__)
READER_SUPPORTED_VERSIONS = [2, GGUF_VERSION]
@ -63,7 +65,7 @@ class ReaderTensor(NamedTuple):
class GGUFReader:
# I - same as host, S - swapped
byte_order: Literal['I' | 'S'] = 'I'
byte_order: Literal['I'] | Literal['S'] = 'I'
alignment: int = GGUF_DEFAULT_ALIGNMENT
# Note: Internal helper, API may change.
@ -81,7 +83,7 @@ class GGUFReader:
GGUFValueType.BOOL: np.bool_,
}
def __init__(self, path: os.PathLike[str] | str, mode: Literal['r' | 'r+' | 'c'] = 'r'):
def __init__(self, path: os.PathLike[str] | str, mode: Literal['r'] | Literal['r+'] | Literal['c'] = 'r'):
self.data = np.memmap(path, mode = mode)
offs = 0
if self._get(offs, np.uint32, override_order = '<')[0] != GGUF_MAGIC:
@ -126,7 +128,7 @@ class GGUFReader:
return self.tensors[idx]
def _get(
self, offset: int, dtype: npt.DTypeLike, count: int = 1, override_order: None | Literal['I' | 'S' | '<'] = None,
self, offset: int, dtype: npt.DTypeLike, count: int = 1, override_order: None | Literal['I'] | Literal['S'] | Literal['<'] = None,
) -> npt.NDArray[Any]:
count = int(count)
itemsize = int(np.empty([], dtype = dtype).itemsize)
@ -142,7 +144,7 @@ class GGUFReader:
# TODO: add option to generate error on duplicate keys
# raise KeyError(f'Duplicate {field.name} already in list at offset {field.offset}')
print(f'Warning: Duplicate key {field.name} at offset {field.offset}')
logger.warning(f'Duplicate key {field.name} at offset {field.offset}')
self.fields[field.name + '_{}'.format(field.offset)] = field
else:
self.fields[field.name] = field
@ -248,7 +250,7 @@ class GGUFReader:
raise ValueError(f'Found duplicated tensor with name {tensor_name}')
tensor_names.add(tensor_name)
ggml_type = GGMLQuantizationType(raw_dtype[0])
n_elems = np.prod(dims)
n_elems = int(np.prod(dims))
block_size, type_size = GGML_QUANT_SIZES[ggml_type]
n_bytes = n_elems * type_size // block_size
data_offs = int(start_offs + offset_tensor[0])

View file

@ -1,12 +1,13 @@
from __future__ import annotations
import logging
import os
import shutil
import struct
import tempfile
from enum import Enum, auto
from io import BufferedWriter
from typing import IO, Any, Sequence, Mapping
from typing import IO, Any, Callable, Sequence, Mapping
from string import ascii_letters, digits
import numpy as np
@ -24,6 +25,49 @@ from .constants import (
TokenType,
)
logger = logging.getLogger(__name__)
class LazyTensor:
data: Callable[[], np.ndarray[Any, Any]]
# to avoid too deep recursion
functions: list[Callable[[np.ndarray[Any, Any]], np.ndarray[Any, Any]]]
dtype: np.dtype[Any]
shape: tuple[int, ...]
def __init__(self, data: Callable[[], np.ndarray[Any, Any]], *, dtype: type, shape: tuple[int, ...]):
self.data = data
self.functions = []
self.dtype = np.dtype(dtype)
self.shape = shape
def astype(self, dtype: type, **kwargs) -> LazyTensor:
self.functions.append(lambda n: n.astype(dtype, **kwargs))
self.dtype = np.dtype(dtype)
return self
@property
def nbytes(self) -> int:
size = 1
for n in self.shape:
size *= n
return size * self.dtype.itemsize
def tofile(self, *args, **kwargs) -> None:
data = self.data()
for f in self.functions:
data = f(data)
assert data.shape == self.shape
assert data.dtype == self.dtype
assert data.nbytes == self.nbytes
self.functions = []
self.data = lambda: data
data.tofile(*args, **kwargs)
def byteswap(self, *args, **kwargs) -> LazyTensor:
self.functions.append(lambda n: n.byteswap(*args, **kwargs))
return self
class WriterState(Enum):
EMPTY = auto()
@ -35,7 +79,7 @@ class WriterState(Enum):
class GGUFWriter:
fout: BufferedWriter
temp_file: tempfile.SpooledTemporaryFile[bytes] | None
tensors: list[np.ndarray[Any, Any]]
tensors: list[np.ndarray[Any, Any] | LazyTensor]
_simple_value_packing = {
GGUFValueType.UINT8: "B",
GGUFValueType.INT8: "b",
@ -67,7 +111,7 @@ class GGUFWriter:
self.use_temp_file = use_temp_file
self.temp_file = None
self.tensors = []
print("gguf: This GGUF file is for {0} Endian only".format(
logger.info("gguf: This GGUF file is for {0} Endian only".format(
"Big" if self.endianess == GGUFEndian.BIG else "Little",
))
self.state = WriterState.EMPTY
@ -173,7 +217,7 @@ class GGUFWriter:
if pack_fmt is not None:
self.kv_data += self._pack(pack_fmt, val, skip_pack_prefix = vtype == GGUFValueType.BOOL)
elif vtype == GGUFValueType.STRING:
encoded_val = val.encode("utf8") if isinstance(val, str) else val
encoded_val = val.encode("utf-8") if isinstance(val, str) else val
self.kv_data += self._pack("Q", len(encoded_val))
self.kv_data += encoded_val
elif vtype == GGUFValueType.ARRAY and isinstance(val, Sequence) and val:
@ -202,7 +246,7 @@ class GGUFWriter:
raise ValueError(f'Duplicated tensor name {name}')
self.ti_names.add(name)
encoded_name = name.encode("utf8")
encoded_name = name.encode("utf-8")
self.ti_data += self._pack("Q", len(encoded_name))
self.ti_data += encoded_name
n_dims = len(tensor_shape)
@ -234,7 +278,7 @@ class GGUFWriter:
self.ti_data_count += 1
def add_tensor(
self, name: str, tensor: np.ndarray[Any, Any], raw_shape: Sequence[int] | None = None,
self, name: str, tensor: np.ndarray[Any, Any] | LazyTensor, raw_shape: Sequence[int] | None = None,
raw_dtype: GGMLQuantizationType | None = None,
) -> None:
if self.endianess == GGUFEndian.BIG:
@ -259,7 +303,7 @@ class GGUFWriter:
if pad != 0:
fp.write(bytes([0] * pad))
def write_tensor_data(self, tensor: np.ndarray[Any, Any]) -> None:
def write_tensor_data(self, tensor: np.ndarray[Any, Any] | LazyTensor) -> None:
if self.state is not WriterState.TI_DATA:
raise ValueError(f'Expected output file to contain tensor info, got {self.state}')
@ -269,15 +313,33 @@ class GGUFWriter:
tensor.tofile(self.fout)
self.write_padding(self.fout, tensor.nbytes)
def write_tensors_to_file(self) -> None:
def write_tensors_to_file(self, *, progress: bool = False) -> None:
self.write_ti_data_to_file()
self.write_padding(self.fout, self.fout.tell())
if self.temp_file is None:
self.tensors.reverse() # to pop from the "beginning" in constant time
if progress:
from tqdm import tqdm
total_bytes = sum(t.nbytes for t in self.tensors)
bar = tqdm(desc="Writing", total=total_bytes, unit="byte", unit_scale=True)
while True:
try:
tensor = self.tensors.pop(0)
tensor = self.tensors.pop()
except IndexError:
break
tensor.tofile(self.fout)
bar.update(tensor.nbytes)
self.write_padding(self.fout, tensor.nbytes)
return
while True:
try:
tensor = self.tensors.pop()
except IndexError:
break
tensor.tofile(self.fout)
@ -476,7 +538,7 @@ class GGUFWriter:
self.add_bool(Keys.Tokenizer.ADD_PREFIX, value)
def add_chat_template(self, value: str | Sequence[Mapping[str, str]]) -> None:
if isinstance(value, list):
if not isinstance(value, str):
template_default = None
template_names = set()

View file

@ -1,23 +1,25 @@
from __future__ import annotations
import logging
import json
import os
import sys
from pathlib import Path
from typing import Any, Callable
from typing import Any, Callable, Sequence, Mapping, Iterable
from .gguf_writer import GGUFWriter
logger = logging.getLogger(__name__)
class SpecialVocab:
merges: list[str]
add_special_token: dict[str, bool]
special_token_ids: dict[str, int]
chat_template: str | None
chat_template: str | Sequence[Mapping[str, str]] | None
def __init__(
self, path: str | os.PathLike[str], load_merges: bool = False,
special_token_types: tuple[str, ...] | None = None,
special_token_types: Iterable[str] | None = None,
n_vocab: int | None = None,
):
self.special_token_ids = {}
@ -40,38 +42,29 @@ class SpecialVocab:
def add_to_gguf(self, gw: GGUFWriter, quiet: bool = False) -> None:
if self.merges:
if not quiet:
print(f'gguf: Adding {len(self.merges)} merge(s).')
logger.info(f'Adding {len(self.merges)} merge(s).')
gw.add_token_merges(self.merges)
elif self.load_merges:
print(
'gguf: WARNING: Adding merges requested but no merges found, output may be non-functional.',
file = sys.stderr,
)
logger.warning('Adding merges requested but no merges found, output may be non-functional.')
for typ, tokid in self.special_token_ids.items():
id_handler: Callable[[int], None] | None = getattr(gw, f'add_{typ}_token_id', None)
if id_handler is None:
print(
f'gguf: WARNING: No handler for special token type {typ} with id {tokid} - skipping',
file = sys.stderr,
)
logger.warning(f'No handler for special token type {typ} with id {tokid} - skipping')
continue
if not quiet:
print(f'gguf: Setting special token type {typ} to {tokid}')
logger.info(f'Setting special token type {typ} to {tokid}')
id_handler(tokid)
for typ, value in self.add_special_token.items():
add_handler: Callable[[bool], None] | None = getattr(gw, f'add_add_{typ}_token', None)
if add_handler is None:
print(
f'gguf: WARNING: No handler for add_{typ}_token with value {value} - skipping',
file = sys.stderr,
)
logger.warning(f'No handler for add_{typ}_token with value {value} - skipping')
continue
if not quiet:
print(f'gguf: Setting add_{typ}_token to {value}')
logger.info(f'Setting add_{typ}_token to {value}')
add_handler(value)
if self.chat_template is not None:
if not quiet:
print(f'gguf: Setting chat_template to {self.chat_template}')
logger.info(f'Setting chat_template to {self.chat_template}')
gw.add_chat_template(self.chat_template)
def _load(self, path: Path) -> None:
@ -99,10 +92,7 @@ class SpecialVocab:
continue
parts = line.split(None, 3)
if len(parts) != 2:
print(
f'gguf: WARNING: {merges_file.name}: Line {line_num}: Entry malformed, ignoring',
file = sys.stderr,
)
logger.warning(f'{merges_file.name}: Line {line_num}: Entry malformed, ignoring')
continue
merges.append(f'{parts[0]} {parts[1]}')
self.merges = merges
@ -118,10 +108,7 @@ class SpecialVocab:
return
self.special_token_ids[typ] = tid
return
print(
f'gguf: WARNING: Special token type {typ}, id {tid} out of range, must be under {self.n_vocab} - skipping',
file = sys.stderr,
)
logger.warning(f'Special token type {typ}, id {tid} out of range, must be under {self.n_vocab} - skipping')
def _try_load_from_tokenizer_json(self, path: Path) -> bool:
tokenizer_file = path / 'tokenizer.json'
@ -144,10 +131,7 @@ class SpecialVocab:
if chat_template is None or isinstance(chat_template, (str, list)):
self.chat_template = chat_template
else:
print(
f'gguf: WARNING: Bad type for chat_template field in {tokenizer_config_file!r} - ignoring',
file = sys.stderr
)
logger.warning(f'Bad type for chat_template field in {tokenizer_config_file!r} - ignoring')
for typ in self.special_token_types:
add_entry = tokenizer_config.get(f'add_{typ}_token')
if isinstance(add_entry, bool):

View file

@ -21,6 +21,7 @@ classifiers = [
[tool.poetry.dependencies]
python = ">=3.8"
numpy = ">=1.17"
tqdm = ">=4.27"
[tool.poetry.dev-dependencies]
pytest = "^5.2"

View file

@ -1,9 +1,11 @@
#!/usr/bin/env python3
from __future__ import annotations
import logging
import argparse
import os
import sys
from tqdm import tqdm
from pathlib import Path
import numpy as np
@ -14,6 +16,8 @@ if "NO_LOCAL_GGUF" not in os.environ and (Path(__file__).parent.parent.parent /
import gguf
logger = logging.getLogger("gguf-convert-endian")
def convert_byteorder(reader: gguf.GGUFReader, args: argparse.Namespace) -> None:
if np.uint32(1) == np.uint32(1).newbyteorder("<"):
@ -29,11 +33,11 @@ def convert_byteorder(reader: gguf.GGUFReader, args: argparse.Namespace) -> None
else:
file_endian = host_endian
order = host_endian if args.order == "native" else args.order
print(f"* Host is {host_endian.upper()} endian, GGUF file seems to be {file_endian.upper()} endian")
logger.info(f"* Host is {host_endian.upper()} endian, GGUF file seems to be {file_endian.upper()} endian")
if file_endian == order:
print(f"* File is already {order.upper()} endian. Nothing to do.")
logger.info(f"* File is already {order.upper()} endian. Nothing to do.")
sys.exit(0)
print("* Checking tensors for conversion compatibility")
logger.info("* Checking tensors for conversion compatibility")
for tensor in reader.tensors:
if tensor.tensor_type not in (
gguf.GGMLQuantizationType.F32,
@ -41,51 +45,64 @@ def convert_byteorder(reader: gguf.GGUFReader, args: argparse.Namespace) -> None
gguf.GGMLQuantizationType.Q8_0,
):
raise ValueError(f"Cannot handle type {tensor.tensor_type.name} for tensor {repr(tensor.name)}")
print(f"* Preparing to convert from {file_endian.upper()} to {order.upper()}")
logger.info(f"* Preparing to convert from {file_endian.upper()} to {order.upper()}")
if args.dry_run:
return
print("\n*** Warning *** Warning *** Warning **")
print("* This conversion process may damage the file. Ensure you have a backup.")
logger.warning("*** Warning *** Warning *** Warning **")
logger.warning("* This conversion process may damage the file. Ensure you have a backup.")
if order != host_endian:
print("* Requested endian differs from host, you will not be able to load the model on this machine.")
print("* The file will be modified immediately, so if conversion fails or is interrupted")
print("* the file will be corrupted. Enter exactly YES if you are positive you want to proceed:")
logger.warning("* Requested endian differs from host, you will not be able to load the model on this machine.")
logger.warning("* The file will be modified immediately, so if conversion fails or is interrupted")
logger.warning("* the file will be corrupted. Enter exactly YES if you are positive you want to proceed:")
response = input("YES, I am sure> ")
if response != "YES":
print("You didn't enter YES. Okay then, see ya!")
logger.warning("You didn't enter YES. Okay then, see ya!")
sys.exit(0)
print(f"\n* Converting fields ({len(reader.fields)})")
logger.info(f"* Converting fields ({len(reader.fields)})")
for idx, field in enumerate(reader.fields.values()):
print(f"- {idx:4}: Converting field {repr(field.name)}, part count: {len(field.parts)}")
logger.info(f"- {idx:4}: Converting field {repr(field.name)}, part count: {len(field.parts)}")
for part in field.parts:
part.byteswap(inplace=True)
print(f"\n* Converting tensors ({len(reader.tensors)})")
for idx, tensor in enumerate(reader.tensors):
print(
f" - {idx:4}: Converting tensor {repr(tensor.name)}, type={tensor.tensor_type.name}, "
f"elements={tensor.n_elements}... ",
end="",
logger.info(f"* Converting tensors ({len(reader.tensors)})")
for idx, tensor in enumerate(pbar := tqdm(reader.tensors, desc="Converting tensor")):
log_message = (
f"Converting tensor {repr(tensor.name)}, "
f"type={tensor.tensor_type.name}, "
f"elements={tensor.n_elements} "
)
tensor_type = tensor.tensor_type
# Byte-swap each part of the tensor's field
for part in tensor.field.parts:
part.byteswap(inplace=True)
if tensor_type != gguf.GGMLQuantizationType.Q8_0:
tensor.data.byteswap(inplace=True)
print()
continue
# A Q8_0 block consists of a f16 delta followed by 32 int8 quants, so 34 bytes
block_size = 34
# Byte-swap tensor data if necessary
if tensor.tensor_type == gguf.GGMLQuantizationType.Q8_0:
# Handle Q8_0 tensor blocks (block_q8_0)
# Specific handling of block_q8_0 is required.
# Each block_q8_0 consists of an f16 delta (scaling factor) followed by 32 int8 quantizations.
block_size = 34 # 34 bytes = <f16 delta scaling factor> + 32 * <int8 quant>
n_blocks = len(tensor.data) // block_size
for block_num in range(n_blocks):
for block_num in (inner_pbar := tqdm(range(n_blocks), desc="Byte-swapping Blocks", leave=False)):
block_offs = block_num * block_size
# I know I said f16, but it doesn't matter here - any simple 16 bit type works.
# Byte-Swap f16 sized delta field
delta = tensor.data[block_offs:block_offs + 2].view(dtype=np.uint16)
delta.byteswap(inplace=True)
# Byte-Swap Q8 weights
if block_num % 100000 == 0:
print(f"[{(n_blocks - block_num) // 1000}K]", end="")
sys.stdout.flush()
print()
print("* Completion")
inner_pbar.set_description(f"Byte-swapping Blocks [{(n_blocks - block_num) // n_blocks}]")
else:
# Handle other tensor types
tensor.data.byteswap(inplace=True)
pbar.set_description(log_message)
logger.info("* Completion")
def main() -> None:
@ -102,8 +119,13 @@ def main() -> None:
"--dry-run", action="store_true",
help="Don't actually change anything",
)
parser.add_argument("--verbose", action="store_true", help="increase output verbosity")
args = parser.parse_args(None if len(sys.argv) > 1 else ["--help"])
print(f'* Loading: {args.model}')
logging.basicConfig(level=logging.DEBUG if args.verbose else logging.INFO)
logger.info(f'* Loading: {args.model}')
reader = gguf.GGUFReader(args.model, 'r' if args.dry_run else 'r+')
convert_byteorder(reader, args)

View file

@ -1,6 +1,7 @@
#!/usr/bin/env python3
from __future__ import annotations
import logging
import argparse
import os
import sys
@ -15,6 +16,8 @@ if "NO_LOCAL_GGUF" not in os.environ and (Path(__file__).parent.parent.parent /
from gguf import GGUFReader, GGUFValueType # noqa: E402
logger = logging.getLogger("gguf-dump")
def get_file_host_endian(reader: GGUFReader) -> tuple[str, str]:
host_endian = 'LITTLE' if np.uint32(1) == np.uint32(1).newbyteorder("<") else 'BIG'
@ -29,8 +32,8 @@ def get_file_host_endian(reader: GGUFReader) -> tuple[str, str]:
# please see the comments in the modify_gguf.py example.
def dump_metadata(reader: GGUFReader, args: argparse.Namespace) -> None:
host_endian, file_endian = get_file_host_endian(reader)
print(f'* File is {file_endian} endian, script is running on a {host_endian} endian host.')
print(f'\n* Dumping {len(reader.fields)} key/value pair(s)')
print(f'* File is {file_endian} endian, script is running on a {host_endian} endian host.') # noqa: NP100
print(f'* Dumping {len(reader.fields)} key/value pair(s)') # noqa: NP100
for n, field in enumerate(reader.fields.values(), 1):
if not field.types:
pretty_type = 'N/A'
@ -39,20 +42,21 @@ def dump_metadata(reader: GGUFReader, args: argparse.Namespace) -> None:
pretty_type = '[' * nest_count + str(field.types[-1].name) + ']' * nest_count
else:
pretty_type = str(field.types[-1].name)
print(f' {n:5}: {pretty_type:10} | {len(field.data):8} | {field.name}', end = '')
log_message = f' {n:5}: {pretty_type:10} | {len(field.data):8} | {field.name}'
if len(field.types) == 1:
curr_type = field.types[0]
if curr_type == GGUFValueType.STRING:
print(' = {0}'.format(repr(str(bytes(field.parts[-1]), encoding='utf8')[:60])), end = '')
log_message += ' = {0}'.format(repr(str(bytes(field.parts[-1]), encoding='utf-8')[:60]))
elif field.types[0] in reader.gguf_scalar_to_np:
print(' = {0}'.format(field.parts[-1][0]), end = '')
print()
log_message += ' = {0}'.format(field.parts[-1][0])
print(log_message) # noqa: NP100
if args.no_tensors:
return
print(f'\n* Dumping {len(reader.tensors)} tensor(s)')
print(f'* Dumping {len(reader.tensors)} tensor(s)') # noqa: NP100
for n, tensor in enumerate(reader.tensors, 1):
prettydims = ', '.join('{0:5}'.format(d) for d in list(tensor.shape) + [1] * (4 - len(tensor.shape)))
print(f' {n:5}: {tensor.n_elements:10} | {prettydims} | {tensor.tensor_type.name:7} | {tensor.name}')
print(f' {n:5}: {tensor.n_elements:10} | {prettydims} | {tensor.tensor_type.name:7} | {tensor.name}') # noqa: NP100
def dump_metadata_json(reader: GGUFReader, args: argparse.Namespace) -> None:
@ -103,10 +107,17 @@ def main() -> None:
parser.add_argument("--no-tensors", action="store_true", help="Don't dump tensor metadata")
parser.add_argument("--json", action="store_true", help="Produce JSON output")
parser.add_argument("--json-array", action="store_true", help="Include full array values in JSON output (long)")
parser.add_argument("--verbose", action="store_true", help="increase output verbosity")
args = parser.parse_args(None if len(sys.argv) > 1 else ["--help"])
logging.basicConfig(level=logging.DEBUG if args.verbose else logging.INFO)
if not args.json:
print(f'* Loading: {args.model}')
logger.info(f'* Loading: {args.model}')
reader = GGUFReader(args.model, 'r')
if args.json:
dump_metadata_json(reader, args)
else:

100
gguf-py/scripts/gguf-new-metadata.py Normal file → Executable file
View file

@ -7,7 +7,8 @@ import json
from pathlib import Path
import numpy as np
from typing import Any, Mapping, Sequence
from tqdm import tqdm
from typing import Any, Sequence, NamedTuple
# Necessary to load the local gguf package
if "NO_LOCAL_GGUF" not in os.environ and (Path(__file__).parent.parent.parent / 'gguf-py').exists():
@ -18,6 +19,12 @@ import gguf
logger = logging.getLogger("gguf-new-metadata")
class MetadataDetails(NamedTuple):
type: gguf.GGUFValueType
value: Any
description: str = ''
def get_byteorder(reader: gguf.GGUFReader) -> gguf.GGUFEndian:
if np.uint32(1) == np.uint32(1).newbyteorder("<"):
# Host is little endian
@ -34,7 +41,7 @@ def get_byteorder(reader: gguf.GGUFReader) -> gguf.GGUFEndian:
return host_endian
def decode_field(field: gguf.ReaderField) -> Any:
def decode_field(field: gguf.ReaderField | None) -> Any:
if field and field.types:
main_type = field.types[0]
@ -42,11 +49,11 @@ def decode_field(field: gguf.ReaderField) -> Any:
sub_type = field.types[-1]
if sub_type == gguf.GGUFValueType.STRING:
return [str(bytes(field.parts[idx]), encoding='utf8') for idx in field.data]
return [str(bytes(field.parts[idx]), encoding='utf-8') for idx in field.data]
else:
return [pv for idx in field.data for pv in field.parts[idx].tolist()]
if main_type == gguf.GGUFValueType.STRING:
return str(bytes(field.parts[-1]), encoding='utf8')
return str(bytes(field.parts[-1]), encoding='utf-8')
else:
return field.parts[-1][0]
@ -59,7 +66,16 @@ def get_field_data(reader: gguf.GGUFReader, key: str) -> Any:
return decode_field(field)
def copy_with_new_metadata(reader: gguf.GGUFReader, writer: gguf.GGUFWriter, new_metadata: Mapping[str, str], remove_metadata: Sequence[str]) -> None:
def find_token(token_list: Sequence[int], token: str) -> Sequence[int]:
token_ids = [index for index, value in enumerate(token_list) if value == token]
if len(token_ids) == 0:
raise LookupError(f'Unable to find "{token}" in token list!')
return token_ids
def copy_with_new_metadata(reader: gguf.GGUFReader, writer: gguf.GGUFWriter, new_metadata: dict[str, MetadataDetails], remove_metadata: Sequence[str]) -> None:
for field in reader.fields.values():
# Suppress virtual fields and fields written by GGUFWriter
if field.name == gguf.Keys.General.ARCHITECTURE or field.name.startswith('GGUF.'):
@ -75,54 +91,64 @@ def copy_with_new_metadata(reader: gguf.GGUFReader, writer: gguf.GGUFWriter, new
logger.debug(f'Removing {field.name}')
continue
old_val = decode_field(field)
old_val = MetadataDetails(field.types[0], decode_field(field))
val = new_metadata.get(field.name, old_val)
if field.name in new_metadata:
logger.debug(f'Modifying {field.name}: "{old_val}" -> "{val}"')
logger.debug(f'Modifying {field.name}: "{old_val.value}" -> "{val.value}" {val.description}')
del new_metadata[field.name]
elif val is not None:
elif val.value is not None:
logger.debug(f'Copying {field.name}')
if val is not None:
if val.value is not None:
writer.add_key(field.name)
writer.add_val(val, field.types[0])
writer.add_val(val.value, val.type)
if gguf.Keys.Tokenizer.CHAT_TEMPLATE in new_metadata:
logger.debug('Adding chat template(s)')
writer.add_chat_template(new_metadata[gguf.Keys.Tokenizer.CHAT_TEMPLATE])
writer.add_chat_template(new_metadata[gguf.Keys.Tokenizer.CHAT_TEMPLATE].value)
del new_metadata[gguf.Keys.Tokenizer.CHAT_TEMPLATE]
# TODO: Support other types than string?
for key, val in new_metadata.items():
logger.debug(f'Adding {key}: {val}')
logger.debug(f'Adding {key}: "{val.value}" {val.description}')
writer.add_key(key)
writer.add_val(val, gguf.GGUFValueType.STRING)
writer.add_val(val.value, val.type)
total_bytes = 0
for tensor in reader.tensors:
total_bytes += tensor.n_bytes
# Dimensions are written in reverse order, so flip them first
shape = np.flipud(tensor.shape)
shape = np.flipud(tensor.shape).tolist()
writer.add_tensor_info(tensor.name, shape, tensor.data.dtype, tensor.data.nbytes, tensor.tensor_type)
bar = tqdm(desc="Writing", total=total_bytes, unit="byte", unit_scale=True)
writer.write_header_to_file()
writer.write_kv_data_to_file()
writer.write_ti_data_to_file()
for tensor in reader.tensors:
writer.write_tensor_data(tensor.data)
bar.update(tensor.n_bytes)
writer.close()
def main() -> None:
tokenizer_metadata = (getattr(gguf.Keys.Tokenizer, n) for n in gguf.Keys.Tokenizer.__dict__.keys() if not n.startswith('_'))
token_names = dict((n.split('.')[-1][:-len('_token_id')], n) for n in tokenizer_metadata if n.endswith('_token_id'))
parser = argparse.ArgumentParser(description="Make a copy of a GGUF file with new metadata")
parser.add_argument("input", type=Path, help="GGUF format model input filename")
parser.add_argument("output", type=Path, help="GGUF format model output filename")
parser.add_argument("--general-name", type=str, help="The models general.name")
parser.add_argument("--general-description", type=str, help="The models general.description")
parser.add_argument("--chat-template", type=str, help="Chat template string (or JSON string containing templates)")
parser.add_argument("--chat-template-config", type=Path, help="Config file (tokenizer_config.json) containing chat template(s)")
parser.add_argument("--remove-metadata", action="append", type=str, help="Remove metadata (by key name) from output model")
parser.add_argument("--general-name", type=str, help="The models general.name", metavar='"name"')
parser.add_argument("--general-description", type=str, help="The models general.description", metavar='"Description ..."')
parser.add_argument("--chat-template", type=str, help="Chat template string (or JSON string containing templates)", metavar='"{% ... %} ..."')
parser.add_argument("--chat-template-config", type=Path, help="Config file containing chat template(s)", metavar='tokenizer_config.json')
parser.add_argument("--remove-metadata", action="append", type=str, help="Remove metadata (by key name) from output model", metavar='general.url')
parser.add_argument("--special-token", action="append", type=str, help="Special token by value", nargs=2, metavar=(' | '.join(token_names.keys()), '"<token>"'))
parser.add_argument("--special-token-by-id", action="append", type=str, help="Special token by id", nargs=2, metavar=(' | '.join(token_names.keys()), '0'))
parser.add_argument("--force", action="store_true", help="Bypass warnings without confirmation")
parser.add_argument("--verbose", action="store_true", help="Increase output verbosity")
args = parser.parse_args(None if len(sys.argv) > 2 else ["--help"])
@ -133,20 +159,20 @@ def main() -> None:
remove_metadata = args.remove_metadata or []
if args.general_name:
new_metadata[gguf.Keys.General.NAME] = args.general_name
new_metadata[gguf.Keys.General.NAME] = MetadataDetails(gguf.GGUFValueType.STRING, args.general_name)
if args.general_description:
new_metadata[gguf.Keys.General.DESCRIPTION] = args.general_description
new_metadata[gguf.Keys.General.DESCRIPTION] = MetadataDetails(gguf.GGUFValueType.STRING, args.general_description)
if args.chat_template:
new_metadata[gguf.Keys.Tokenizer.CHAT_TEMPLATE] = json.loads(args.chat_template) if args.chat_template.startswith('[') else args.chat_template
new_metadata[gguf.Keys.Tokenizer.CHAT_TEMPLATE] = MetadataDetails(gguf.GGUFValueType.STRING, json.loads(args.chat_template) if args.chat_template.startswith('[') else args.chat_template)
if args.chat_template_config:
with open(args.chat_template_config, 'r') as fp:
config = json.load(fp)
template = config.get('chat_template')
if template:
new_metadata[gguf.Keys.Tokenizer.CHAT_TEMPLATE] = template
new_metadata[gguf.Keys.Tokenizer.CHAT_TEMPLATE] = MetadataDetails(gguf.GGUFValueType.STRING, template)
if remove_metadata:
logger.warning('*** Warning *** Warning *** Warning **')
@ -166,6 +192,32 @@ def main() -> None:
arch = get_field_data(reader, gguf.Keys.General.ARCHITECTURE)
endianess = get_byteorder(reader)
token_list = get_field_data(reader, gguf.Keys.Tokenizer.LIST) or []
for name, token in args.special_token or []:
if name not in token_names:
logger.warning(f'Unknown special token "{name}", ignoring...')
else:
ids = find_token(token_list, token)
new_metadata[token_names[name]] = MetadataDetails(gguf.GGUFValueType.UINT32, ids[0], f'= {token}')
if len(ids) > 1:
logger.warning(f'Multiple "{token}" tokens found, choosing ID {ids[0]}, use --special-token-by-id if you want another:')
logger.warning(', '.join(str(i) for i in ids))
for name, id_string in args.special_token_by_id or []:
if name not in token_names:
logger.warning(f'Unknown special token "{name}", ignoring...')
elif not id_string.isdecimal():
raise LookupError(f'Token ID "{id_string}" is not a valid ID!')
else:
id_int = int(id_string)
if id_int >= 0 and id_int < len(token_list):
new_metadata[token_names[name]] = MetadataDetails(gguf.GGUFValueType.UINT32, id_int, f'= {token_list[id_int]}')
else:
raise LookupError(f'Token ID {id_int} is not within token list!')
if os.path.isfile(args.output) and not args.force:
logger.warning('*** Warning *** Warning *** Warning **')
logger.warning(f'* The "{args.output}" GGUF file already exists, it will be overwritten!')

View file

@ -1,4 +1,5 @@
#!/usr/bin/env python3
import logging
import argparse
import os
import sys
@ -10,6 +11,8 @@ if "NO_LOCAL_GGUF" not in os.environ and (Path(__file__).parent.parent.parent /
from gguf import GGUFReader # noqa: E402
logger = logging.getLogger("gguf-set-metadata")
def minimal_example(filename: str) -> None:
reader = GGUFReader(filename, 'r+')
@ -41,36 +44,33 @@ def minimal_example(filename: str) -> None:
def set_metadata(reader: GGUFReader, args: argparse.Namespace) -> None:
field = reader.get_field(args.key)
if field is None:
print(f'! Field {repr(args.key)} not found', file = sys.stderr)
logger.error(f'! Field {repr(args.key)} not found')
sys.exit(1)
# Note that field.types is a list of types. This is because the GGUF
# format supports arrays. For example, an array of UINT32 would
# look like [GGUFValueType.ARRAY, GGUFValueType.UINT32]
handler = reader.gguf_scalar_to_np.get(field.types[0]) if field.types else None
if handler is None:
print(
f'! This tool only supports changing simple values, {repr(args.key)} has unsupported type {field.types}',
file = sys.stderr,
)
logger.error(f'! This tool only supports changing simple values, {repr(args.key)} has unsupported type {field.types}')
sys.exit(1)
current_value = field.parts[field.data[0]][0]
new_value = handler(args.value)
print(f'* Preparing to change field {repr(args.key)} from {current_value} to {new_value}')
logger.info(f'* Preparing to change field {repr(args.key)} from {current_value} to {new_value}')
if current_value == new_value:
print(f'- Key {repr(args.key)} already set to requested value {current_value}')
logger.info(f'- Key {repr(args.key)} already set to requested value {current_value}')
sys.exit(0)
if args.dry_run:
sys.exit(0)
if not args.force:
print('*** Warning *** Warning *** Warning **')
print('* Changing fields in a GGUF file can make it unusable. Proceed at your own risk.')
print('* Enter exactly YES if you are positive you want to proceed:')
logger.warning('*** Warning *** Warning *** Warning **')
logger.warning('* Changing fields in a GGUF file can make it unusable. Proceed at your own risk.')
logger.warning('* Enter exactly YES if you are positive you want to proceed:')
response = input('YES, I am sure> ')
if response != 'YES':
print("You didn't enter YES. Okay then, see ya!")
logger.info("You didn't enter YES. Okay then, see ya!")
sys.exit(0)
field.parts[field.data[0]][0] = new_value
print('* Field changed. Successful completion.')
logger.info('* Field changed. Successful completion.')
def main() -> None:
@ -80,8 +80,13 @@ def main() -> None:
parser.add_argument("value", type=str, help="Metadata value to set")
parser.add_argument("--dry-run", action="store_true", help="Don't actually change anything")
parser.add_argument("--force", action="store_true", help="Change the field without confirmation")
parser.add_argument("--verbose", action="store_true", help="increase output verbosity")
args = parser.parse_args(None if len(sys.argv) > 1 else ["--help"])
print(f'* Loading: {args.model}')
logging.basicConfig(level=logging.DEBUG if args.verbose else logging.INFO)
logger.info(f'* Loading: {args.model}')
reader = GGUFReader(args.model, 'r' if args.dry_run else 'r+')
set_metadata(reader, args)

View file

@ -51,7 +51,7 @@ single-line ::= [^\n]+ "\n"`
## Sequences and Alternatives
The order of symbols in a sequence matter. For example, in `"1. " move " " move "\n"`, the `"1. "` must come before the first `move`, etc.
The order of symbols in a sequence matters. For example, in `"1. " move " " move "\n"`, the `"1. "` must come before the first `move`, etc.
Alternatives, denoted by `|`, give different sequences that are acceptable. For example, in `move ::= pawn | nonpawn | castle`, `move` can be a `pawn` move, a `nonpawn` move, or a `castle`.

View file

@ -2386,7 +2386,7 @@ static bool llama_kv_cache_init(
cache.recurrent = model.arch == LLM_ARCH_MAMBA;
cache.v_trans = !cparams.flash_attn;
// TODO: support mixed reccurent Transformer architectues
// TODO: support mixed recurrent Transformer architectures
// NOTE: (!a || b) is a logical implication (a -> b)
GGML_ASSERT(!cache.recurrent || n_embd_k_gqa == hparams.n_embd_k_s());
GGML_ASSERT(!cache.recurrent || n_embd_v_gqa == hparams.n_embd_v_s());
@ -3202,6 +3202,7 @@ struct llama_model_loader {
switch (type_max) {
case GGML_TYPE_F32: ftype = LLAMA_FTYPE_ALL_F32; break;
case GGML_TYPE_F16: ftype = LLAMA_FTYPE_MOSTLY_F16; break;
case GGML_TYPE_BF16: ftype = LLAMA_FTYPE_MOSTLY_BF16; break;
case GGML_TYPE_Q4_0: ftype = LLAMA_FTYPE_MOSTLY_Q4_0; break;
case GGML_TYPE_Q4_1: ftype = LLAMA_FTYPE_MOSTLY_Q4_1; break;
case GGML_TYPE_Q5_0: ftype = LLAMA_FTYPE_MOSTLY_Q5_0; break;
@ -3693,6 +3694,7 @@ static std::string llama_model_ftype_name(llama_ftype ftype) {
switch (ftype) {
case LLAMA_FTYPE_ALL_F32: return "all F32";
case LLAMA_FTYPE_MOSTLY_F16: return "F16";
case LLAMA_FTYPE_MOSTLY_BF16: return "BF16";
case LLAMA_FTYPE_MOSTLY_Q4_0: return "Q4_0";
case LLAMA_FTYPE_MOSTLY_Q4_1: return "Q4_1";
case LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16:
@ -4425,6 +4427,21 @@ static void llm_load_vocab(
} else if (
tokenizer_pre == "gpt-2") {
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_GPT2;
} else if (
tokenizer_pre == "refact") {
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_REFACT;
} else if (
tokenizer_pre == "command-r") {
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_COMMAND_R;
} else if (
tokenizer_pre == "qwen2") {
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_QWEN2;
} else if (
tokenizer_pre == "olmo") {
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_OLMO;
} else if (
tokenizer_pre == "dbrx") {
vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DBRX;
} else {
throw std::runtime_error(format("unknown pre-tokenizer type: '%s'", tokenizer_pre.c_str()));
}
@ -6211,6 +6228,7 @@ static int llama_model_load(const std::string & fname, llama_model & model, llam
|| !(
model.ftype == LLAMA_FTYPE_ALL_F32 ||
model.ftype == LLAMA_FTYPE_MOSTLY_F16 ||
model.ftype == LLAMA_FTYPE_MOSTLY_BF16 ||
model.ftype == LLAMA_FTYPE_MOSTLY_Q4_0 ||
model.ftype == LLAMA_FTYPE_MOSTLY_Q4_1
)
@ -12195,7 +12213,7 @@ static bool llama_is_user_defined_token(const llama_vocab& vocab, llama_token id
static uint8_t llama_token_to_byte(const llama_vocab& vocab, llama_token id) {
GGML_ASSERT(llama_vocab_get_type(vocab) != LLAMA_VOCAB_TYPE_NONE);
GGML_ASSERT(llama_is_byte_token(vocab, id));
const auto& token_data = vocab.id_to_token.at(id);
const auto & token_data = vocab.id_to_token.at(id);
switch (llama_vocab_get_type(vocab)) {
case LLAMA_VOCAB_TYPE_SPM: {
auto buf = token_data.text.substr(3, 2);
@ -12431,6 +12449,7 @@ struct llm_tokenizer_bpe {
case LLAMA_VOCAB_TYPE_BPE:
switch (vocab.type_pre) {
case LLAMA_VOCAB_PRE_TYPE_LLAMA3:
case LLAMA_VOCAB_PRE_TYPE_DBRX:
word_collection = unicode_regex_split(text, {
// original regex from tokenizer.json
//"(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
@ -12455,14 +12474,13 @@ struct llm_tokenizer_bpe {
"\\s?\\p{L}+",
"\\s?\\p{P}+",
"[一-龥ࠀ-一가-퟿]+",
"\\p{N}+",
"\\p{N}",
});
break;
case LLAMA_VOCAB_PRE_TYPE_FALCON:
word_collection = unicode_regex_split(text, {
"[\\p{P}\\$\\+<=>\\^~\\|]+",
"'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
"\\p{N}+",
"[0-9][0-9][0-9]",
});
break;
@ -12478,11 +12496,26 @@ struct llm_tokenizer_bpe {
});
break;
case LLAMA_VOCAB_PRE_TYPE_STARCODER:
case LLAMA_VOCAB_PRE_TYPE_REFACT:
case LLAMA_VOCAB_PRE_TYPE_COMMAND_R:
word_collection = unicode_regex_split(text, {
"\\p{N}",
"'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
});
break;
case LLAMA_VOCAB_PRE_TYPE_GPT2:
case LLAMA_VOCAB_PRE_TYPE_OLMO:
word_collection = unicode_regex_split(text, {
"'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
});
break;
case LLAMA_VOCAB_PRE_TYPE_QWEN2:
word_collection = unicode_regex_split(text, {
// original regex from tokenizer.json
// "(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+"
"(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
});
break;
default:
// default regex for BPE tokenization pre-processing
word_collection = unicode_regex_split(text, {
@ -12698,7 +12731,7 @@ struct llm_tokenizer_wpm {
continue;
}
code = unicode_tolower(code);
if (type == CODEPOINT_TYPE_WHITESPACE) {
if (type == CODEPOINT_TYPE_SEPARATOR) {
code = ' ';
}
std::string s = unicode_cpt_to_utf8(code);
@ -14385,13 +14418,16 @@ static void llama_tensor_dequantize_internal(
if (qtype.to_float == NULL) {
throw std::runtime_error(format("type %s unsupported for integer quantization: no dequantization available", ggml_type_name(tensor->type)));
}
} else if (tensor->type != GGML_TYPE_F16) {
} else if (tensor->type != GGML_TYPE_F16 &&
tensor->type != GGML_TYPE_BF16) {
throw std::runtime_error(format("cannot dequantize/convert tensor type %s", ggml_type_name(tensor->type)));
}
if (nthread < 2) {
if (tensor->type == GGML_TYPE_F16) {
ggml_fp16_to_fp32_row((ggml_fp16_t *)tensor->data, f32_output, nelements);
} else if (tensor->type == GGML_TYPE_BF16) {
ggml_bf16_to_fp32_row((ggml_bf16_t *)tensor->data, f32_output, nelements);
} else if (ggml_is_quantized(tensor->type)) {
qtype.to_float(tensor->data, f32_output, nelements);
} else {
@ -14400,7 +14436,14 @@ static void llama_tensor_dequantize_internal(
return;
}
size_t block_size = tensor->type == GGML_TYPE_F16 ? 1 : (size_t)ggml_blck_size(tensor->type);
size_t block_size;
if (tensor->type == GGML_TYPE_F16 ||
tensor->type == GGML_TYPE_BF16) {
block_size = 1;
} else {
block_size = (size_t)ggml_blck_size(tensor->type);
}
size_t block_size_bytes = ggml_type_size(tensor->type);
GGML_ASSERT(nelements % block_size == 0);
@ -14419,6 +14462,8 @@ static void llama_tensor_dequantize_internal(
auto compute = [qtype] (ggml_type typ, uint8_t * inbuf, float * outbuf, int nels) {
if (typ == GGML_TYPE_F16) {
ggml_fp16_to_fp32_row((ggml_fp16_t *)inbuf, outbuf, nels);
} else if (typ == GGML_TYPE_BF16) {
ggml_bf16_to_fp32_row((ggml_bf16_t *)inbuf, outbuf, nels);
} else {
qtype.to_float(inbuf, outbuf, nels);
}
@ -14779,6 +14824,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
case LLAMA_FTYPE_MOSTLY_Q5_1: default_type = GGML_TYPE_Q5_1; break;
case LLAMA_FTYPE_MOSTLY_Q8_0: default_type = GGML_TYPE_Q8_0; break;
case LLAMA_FTYPE_MOSTLY_F16: default_type = GGML_TYPE_F16; break;
case LLAMA_FTYPE_MOSTLY_BF16: default_type = GGML_TYPE_BF16; break;
case LLAMA_FTYPE_ALL_F32: default_type = GGML_TYPE_F32; break;
// K-quants
@ -15716,13 +15762,6 @@ struct llama_context * llama_new_context_with_model(
cparams.flash_attn = false;
}
#ifdef GGML_USE_HIPBLAS
if (cparams.flash_attn) {
LLAMA_LOG_WARN("%s: flash_attn is not yet compatible with HIPBLAS builds - forcing off\n", __func__);
cparams.flash_attn = false;
}
#endif
if (params.seed == LLAMA_DEFAULT_SEED) {
params.seed = time(NULL);
}
@ -17710,9 +17749,10 @@ int32_t llama_tokenize(
static std::string llama_decode_text(const std::string & text) {
std::string decoded_text;
auto unicode_sequences = unicode_cpts_from_utf8(text);
for (auto & unicode_sequence : unicode_sequences) {
decoded_text += unicode_utf8_to_byte(unicode_cpt_to_utf8(unicode_sequence));
const auto cpts = unicode_cpts_from_utf8(text);
for (const auto cpt : cpts) {
decoded_text += unicode_utf8_to_byte(unicode_cpt_to_utf8(cpt));
}
return decoded_text;
@ -18076,7 +18116,7 @@ struct llama_timings llama_get_timings(struct llama_context * ctx) {
/*.t_eval_ms =*/ 1e-3 * ctx->t_eval_us,
/*.n_sample =*/ std::max(1, ctx->n_sample),
/*.n_p_eval =*/ std::max(1, ctx->n_p_eval),
/*.n_p_eval =*/ std::max(0, ctx->n_p_eval),
/*.n_eval =*/ std::max(1, ctx->n_eval),
};

View file

@ -79,6 +79,11 @@ extern "C" {
LLAMA_VOCAB_PRE_TYPE_MPT = 5,
LLAMA_VOCAB_PRE_TYPE_STARCODER = 6,
LLAMA_VOCAB_PRE_TYPE_GPT2 = 7,
LLAMA_VOCAB_PRE_TYPE_REFACT = 8,
LLAMA_VOCAB_PRE_TYPE_COMMAND_R = 9,
LLAMA_VOCAB_PRE_TYPE_QWEN2 = 10,
LLAMA_VOCAB_PRE_TYPE_OLMO = 11,
LLAMA_VOCAB_PRE_TYPE_DBRX = 12,
};
// note: these values should be synchronized with ggml_rope
@ -134,6 +139,7 @@ extern "C" {
LLAMA_FTYPE_MOSTLY_IQ2_M = 29, // except 1d tensors
LLAMA_FTYPE_MOSTLY_IQ4_XS = 30, // except 1d tensors
LLAMA_FTYPE_MOSTLY_IQ1_M = 31, // except 1d tensors
LLAMA_FTYPE_MOSTLY_BF16 = 32, // except 1d tensors
LLAMA_FTYPE_GUESSED = 1024, // not specified in the model file
};
@ -171,7 +177,7 @@ extern "C" {
bool sorted;
} llama_token_data_array;
typedef bool (*llama_progress_callback)(float progress, void *ctx);
typedef bool (*llama_progress_callback)(float progress, void * user_data);
// Input data for llama_decode
// A llama_batch object can contain input about one or many sequences

View file

@ -1,3 +1,7 @@
ied 4 ½ months
__ggml_vocab_test__
Führer
__ggml_vocab_test__
__ggml_vocab_test__

View file

@ -1,3 +1,5 @@
29464 2094 1018 1092 2706
11865 17875

Binary file not shown.

View file

@ -0,0 +1,106 @@
ied 4 ½ months
__ggml_vocab_test__
Führer
__ggml_vocab_test__
__ggml_vocab_test__
__ggml_vocab_test__
__ggml_vocab_test__
__ggml_vocab_test__
__ggml_vocab_test__
__ggml_vocab_test__
__ggml_vocab_test__
__ggml_vocab_test__
__ggml_vocab_test__
Hello world
__ggml_vocab_test__
Hello world
__ggml_vocab_test__
Hello World
__ggml_vocab_test__
Hello World
__ggml_vocab_test__
Hello World!
__ggml_vocab_test__
Hello, world!
__ggml_vocab_test__
Hello, world!
__ggml_vocab_test__
this is 🦙.cpp
__ggml_vocab_test__
w048 7tuijk dsdfhu
__ggml_vocab_test__
нещо на Български
__ggml_vocab_test__
កាន់តែពិសេសអាចខលចេញ
__ggml_vocab_test__
🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ (only emoji that has its own token)
__ggml_vocab_test__
Hello
__ggml_vocab_test__
Hello
__ggml_vocab_test__
Hello
__ggml_vocab_test__
Hello
__ggml_vocab_test__
Hello
__ggml_vocab_test__
Hello
Hello
__ggml_vocab_test__
(
__ggml_vocab_test__
=
__ggml_vocab_test__
' era
__ggml_vocab_test__
Hello, y'all! How are you 😁 ?我想在apple工作1314151天
__ggml_vocab_test__
3
__ggml_vocab_test__
33
__ggml_vocab_test__
333
__ggml_vocab_test__
3333
__ggml_vocab_test__
33333
__ggml_vocab_test__
333333
__ggml_vocab_test__
3333333
__ggml_vocab_test__
33333333
__ggml_vocab_test__
333333333
__ggml_vocab_test__
🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ 🦙🦙 3 33 333 3333 33333 333333 3333333 33333333 3.3 3..3 3...3 កាន់តែពិសេសអាច😁 ?我想在apple工作1314151天 ------======= нещо на Български ''''''```````""""......!!!!!!?????? I've been 'told he's there, 'RE you sure? 'M not sure I'll make it, 'D you like some tea? We'Ve a'lL
__ggml_vocab_test__

View file

@ -0,0 +1,43 @@
2536 228 27 228 22957 6983
45 193433
228
1667
1742
205
206
2126
11516
34777
28339 3845
46609 3845
28339 3930
46609 3930
46609 3930 8
28339 19 3845 8
46609 19 3845 8
2075 1801 11254 107 255 21 19317
94 23 27 31 228 30 21213 20752 39267 6405 9980
4929 40071 2196 3236 8750 1764 37097 41168
38111 230 174833 38111 249 86325 241 38111 245 86325 232 38111 252 38111 123 38111 261 165 24629 38111 261 38111 103 174833 38111 235 38111 231 38111 257 38111 235 165 24629 38111 239
2226 256 230 1737 18258 16 80503 122 35927 2226 242 112 57462 1737 54457 223165 106230 2096 16 48389 1737 10203 109160 1875 2222 2517 3342 12523 16
28339
46609
228 46609
1667 46609
1742 46609
1742 46609 1856 46609
1737
206 1857
14 4515
28339 19 1770 14 1954 8 4070 1955 1933 80503 231 5691 12081 13336 2648 29325 14315 24 26 24 27 24 28 24 5123 18372
26
26 26
26 26 26
26 26 26 26
26 26 26 26 26
26 26 26 26 26 26
26 26 26 26 26 26 26
26 26 26 26 26 26 26 26
26 26 26 26 26 26 26 26 26
127731 51628 205 57788 18494 97469 126134 206 2226 256 230 1737 18258 16 80503 122 35927 2226 242 112 57462 1737 54457 223165 106230 2096 16 48389 11254 107 255 2226 107 255 228 26 228 26 26 228 26 26 26 228 26 26 26 26 228 26 26 26 26 26 228 26 26 26 26 26 26 228 26 26 26 26 26 26 26 228 26 26 26 26 26 26 26 26 228 26 21 26 228 26 2271 26 228 26 3834 26 182018 230 174833 38111 249 86325 241 38111 245 86325 232 38111 252 38111 123 38111 261 165 24629 38111 261 38111 103 174833 38111 235 188568 231 5691 12081 13336 2648 29325 14315 24 26 24 27 24 28 24 5123 18372 8391 158343 3512 40071 2196 3236 8750 1764 37097 41168 29721 32797 25646 3802 4975 4975 116167 57178 10251 154048 27292 1767 5125 2632 2155 91 2378 1919 1914 2782 19 2155 3354 1933 5470 38 2155 52 2068 5470 1767 4961 3059 1894 19 2155 43 1933 3026 2725 23186 38 2930 14 20676 1671 14 83 51

View file

@ -1,3 +1,7 @@
ied 4 ½ months
__ggml_vocab_test__
Führer
__ggml_vocab_test__
__ggml_vocab_test__

View file

@ -1,3 +1,5 @@
1050 207 19 207 19192 4217
37 32009 71 6247
207
243

View file

@ -1,3 +1,7 @@
ied 4 ½ months
__ggml_vocab_test__
Führer
__ggml_vocab_test__
__ggml_vocab_test__

View file

@ -1,3 +1,5 @@
1052 207 19 207 19109 4223
37 100014 71 6245
207
243

View file

@ -1,3 +1,7 @@
ied 4 ½ months
__ggml_vocab_test__
Führer
__ggml_vocab_test__
__ggml_vocab_test__

View file

@ -1,3 +1,5 @@
878 204 31 3068 133 2137
28611 132 30042
204
258

View file

@ -1,3 +1,7 @@
ied 4 ½ months
__ggml_vocab_test__
Führer
__ggml_vocab_test__
__ggml_vocab_test__

View file

@ -1,3 +1,5 @@
798 604 25208 1933
37 9116 71 11751
220
220 220

View file

@ -1,3 +1,7 @@
ied 4 ½ months
__ggml_vocab_test__
Führer
__ggml_vocab_test__
__ggml_vocab_test__

View file

@ -1,3 +1,5 @@
1142 220 19 220 27154 4038
37 51853 261
220
256

View file

@ -1,3 +1,7 @@
ied 4 ½ months
__ggml_vocab_test__
Führer
__ggml_vocab_test__
__ggml_vocab_test__

Some files were not shown because too many files have changed in this diff Show more