Merge branch 'master' of https://github.com/JoanFM/llama.cpp into feat-jina-embeddings-v2-zh
Commit d0a99aa424
35 changed files with 1854 additions and 963 deletions

.github/workflows/build.yml: 15 changes (vendored)
@@ -898,9 +898,9 @@ jobs:
shell: bash
env:
WINDOWS_BASEKIT_URL: https://registrationcenter-download.intel.com/akdlm/IRC_NAS/62641e01-1e8d-4ace-91d6-ae03f7f8a71f/w_BaseKit_p_2024.0.0.49563_offline.exe
WINDOWS_BASEKIT_URL: https://registrationcenter-download.intel.com/akdlm/IRC_NAS/7dff44ba-e3af-4448-841c-0d616c8da6e7/w_BaseKit_p_2024.1.0.595_offline.exe
WINDOWS_DPCPP_MKL: intel.oneapi.win.cpp-dpcpp-common:intel.oneapi.win.mkl.devel
ONEAPI_ROOT: "C:/Program Files (x86)/Intel/oneAPI"
steps:
- name: Clone
id: checkout

@@ -932,6 +932,17 @@ jobs:
id: pack_artifacts
if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
run: |
echo "cp oneAPI running time dll files in ${{ env.ONEAPI_ROOT }} to ./build/bin"
cp "${{ env.ONEAPI_ROOT }}/mkl/latest/bin/mkl_sycl_blas.4.dll" ./build/bin
cp "${{ env.ONEAPI_ROOT }}/mkl/latest/bin/mkl_core.2.dll" ./build/bin
cp "${{ env.ONEAPI_ROOT }}/mkl/latest/bin/mkl_tbb_thread.2.dll" ./build/bin
cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/pi_win_proxy_loader.dll" ./build/bin
cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/pi_level_zero.dll" ./build/bin
cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/sycl7.dll" ./build/bin
cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/svml_dispmd.dll" ./build/bin
cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/libmmd.dll" ./build/bin
echo "cp oneAPI running time dll files to ./build/bin done"
7z a llama-${{ steps.tag.outputs.name }}-bin-win-sycl-x64.zip ./build/bin/*
- name: Upload artifacts
@@ -296,7 +296,7 @@ if (LLAMA_BLAS)
if (LLAMA_STATIC)
set(BLA_STATIC ON)
endif()
if ($(CMAKE_VERSION) VERSION_GREATER_EQUAL 3.22)
if (CMAKE_VERSION VERSION_GREATER_EQUAL 3.22)
set(BLA_SIZEOF_INTEGER 8)
endif()

@@ -1281,17 +1281,6 @@ install(
WORLD_READ
WORLD_EXECUTE
DESTINATION ${CMAKE_INSTALL_BINDIR})
install(
FILES convert-lora-to-ggml.py
PERMISSIONS
OWNER_READ
OWNER_WRITE
OWNER_EXECUTE
GROUP_READ
GROUP_EXECUTE
WORLD_READ
WORLD_EXECUTE
DESTINATION ${CMAKE_INSTALL_BINDIR})
if (LLAMA_METAL)
install(
FILES ggml-metal.metal
ci/run.sh: 95 changes

@@ -365,47 +365,6 @@ function gg_run_open_llama_3b_v2 {
cat $OUT/${ci}-imatrix.log | grep "Final" >> $OUT/${ci}-imatrix-sum.log

# lora
function compare_ppl {
qnt="$1"
ppl1=$(echo "$2" | grep -oE "[0-9]+\.[0-9]+" | tail -n 1)
ppl2=$(echo "$3" | grep -oE "[0-9]+\.[0-9]+" | tail -n 1)
if [ $(echo "$ppl1 < $ppl2" | bc) -eq 1 ]; then
printf ' - %s @ %s (FAIL: %s > %s)\n' "$qnt" "$ppl" "$ppl1" "$ppl2"
return 20
fi
printf ' - %s @ %s %s OK\n' "$qnt" "$ppl1" "$ppl2"
return 0
}

path_lora="../models-mnt/open-llama/3B-v2/lora"
path_shakespeare="../models-mnt/shakespeare"
shakespeare="${path_shakespeare}/shakespeare.txt"
lora_shakespeare="${path_lora}/ggml-adapter-model.bin"
gg_wget ${path_lora} https://huggingface.co/slaren/open_llama_3b_v2_shakespeare_lora/resolve/main/adapter_config.json
gg_wget ${path_lora} https://huggingface.co/slaren/open_llama_3b_v2_shakespeare_lora/resolve/main/adapter_model.bin
gg_wget ${path_shakespeare} https://huggingface.co/slaren/open_llama_3b_v2_shakespeare_lora/resolve/main/shakespeare.txt
python3 ../convert-lora-to-ggml.py ${path_lora}

# f16
(time ./bin/perplexity --model ${model_f16} -f ${shakespeare} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-ppl-shakespeare-f16.log
(time ./bin/perplexity --model ${model_f16} -f ${shakespeare} --lora ${lora_shakespeare} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-ppl-shakespeare-lora-f16.log
compare_ppl "f16 shakespeare" "$(cat $OUT/${ci}-ppl-shakespeare-f16.log | grep "^\[1\]")" "$(cat $OUT/${ci}-ppl-shakespeare-lora-f16.log | grep "^\[1\]")" | tee -a $OUT/${ci}-lora-ppl.log

# q8_0
(time ./bin/perplexity --model ${model_q8_0} -f ${shakespeare} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-ppl-shakespeare-q8_0.log
(time ./bin/perplexity --model ${model_q8_0} -f ${shakespeare} --lora ${lora_shakespeare} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-ppl-shakespeare-lora-q8_0.log
compare_ppl "q8_0 shakespeare" "$(cat $OUT/${ci}-ppl-shakespeare-q8_0.log | grep "^\[1\]")" "$(cat $OUT/${ci}-ppl-shakespeare-lora-q8_0.log | grep "^\[1\]")" | tee -a $OUT/${ci}-lora-ppl.log

# q8_0 + f16 lora-base
(time ./bin/perplexity --model ${model_q8_0} -f ${shakespeare} --lora ${lora_shakespeare} --lora-base ${model_f16} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-ppl-shakespeare-lora-q8_0-f16.log
compare_ppl "q8_0 / f16 base shakespeare" "$(cat $OUT/${ci}-ppl-shakespeare-q8_0.log | grep "^\[1\]")" "$(cat $OUT/${ci}-ppl-shakespeare-lora-q8_0-f16.log | grep "^\[1\]")" | tee -a $OUT/${ci}-lora-ppl.log

set +e
}

@@ -416,7 +375,6 @@ function gg_sum_open_llama_3b_v2 {
gg_printf '- status: %s\n' "$(cat $OUT/${ci}.exit)"
gg_printf '- perplexity:\n%s\n' "$(cat $OUT/${ci}-ppl.log)"
gg_printf '- imatrix:\n```\n%s\n```\n' "$(cat $OUT/${ci}-imatrix-sum.log)"
gg_printf '- lora:\n%s\n' "$(cat $OUT/${ci}-lora-ppl.log)"
gg_printf '- f16: \n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-f16.log)"
gg_printf '- q8_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q8_0.log)"
gg_printf '- q4_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q4_0.log)"

@@ -429,11 +387,6 @@ function gg_sum_open_llama_3b_v2 {
gg_printf '- q5_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q5_k.log)"
gg_printf '- q6_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q6_k.log)"
gg_printf '- save-load-state: \n```\n%s\n```\n' "$(cat $OUT/${ci}-save-load-state.log)"
gg_printf '- shakespeare (f16):\n```\n%s\n```\n' "$(cat $OUT/${ci}-ppl-shakespeare-f16.log)"
gg_printf '- shakespeare (f16 lora):\n```\n%s\n```\n' "$(cat $OUT/${ci}-ppl-shakespeare-lora-f16.log)"
gg_printf '- shakespeare (q8_0):\n```\n%s\n```\n' "$(cat $OUT/${ci}-ppl-shakespeare-q8_0.log)"
gg_printf '- shakespeare (q8_0 lora):\n```\n%s\n```\n' "$(cat $OUT/${ci}-ppl-shakespeare-lora-q8_0.log)"
gg_printf '- shakespeare (q8_0 / f16 base lora):\n```\n%s\n```\n' "$(cat $OUT/${ci}-ppl-shakespeare-lora-q8_0-f16.log)"
}
# open_llama_7b_v2

@@ -549,48 +502,6 @@ function gg_run_open_llama_7b_v2 {
cat $OUT/${ci}-imatrix.log | grep "Final" >> $OUT/${ci}-imatrix-sum.log

# lora
function compare_ppl {
qnt="$1"
ppl1=$(echo "$2" | grep -oE "[0-9]+\.[0-9]+" | tail -n 1)
ppl2=$(echo "$3" | grep -oE "[0-9]+\.[0-9]+" | tail -n 1)
if [ $(echo "$ppl1 < $ppl2" | bc) -eq 1 ]; then
printf ' - %s @ %s (FAIL: %s > %s)\n' "$qnt" "$ppl" "$ppl1" "$ppl2"
return 20
fi
printf ' - %s @ %s %s OK\n' "$qnt" "$ppl1" "$ppl2"
return 0
}

path_lora="../models-mnt/open-llama/7B-v2/lora"
path_shakespeare="../models-mnt/shakespeare"
shakespeare="${path_shakespeare}/shakespeare.txt"
lora_shakespeare="${path_lora}/ggml-adapter-model.bin"
gg_wget ${path_lora} https://huggingface.co/slaren/open_llama_7b_v2_shakespeare_lora/resolve/main/adapter_config.json
gg_wget ${path_lora} https://huggingface.co/slaren/open_llama_7b_v2_shakespeare_lora/resolve/main/adapter_model.bin
gg_wget ${path_shakespeare} https://huggingface.co/slaren/open_llama_7b_v2_shakespeare_lora/resolve/main/shakespeare.txt
python3 ../convert-lora-to-ggml.py ${path_lora}

# f16
(time ./bin/perplexity --model ${model_f16} -f ${shakespeare} -t 1 -ngl 999 -c 2048 -b 512 --chunks 3 ) 2>&1 | tee -a $OUT/${ci}-ppl-shakespeare-f16.log
(time ./bin/perplexity --model ${model_f16} -f ${shakespeare} --lora ${lora_shakespeare} -t 1 -ngl 999 -c 2048 -b 512 --chunks 3 ) 2>&1 | tee -a $OUT/${ci}-ppl-shakespeare-lora-f16.log
compare_ppl "f16 shakespeare" "$(cat $OUT/${ci}-ppl-shakespeare-f16.log | grep "^\[1\]")" "$(cat $OUT/${ci}-ppl-shakespeare-lora-f16.log | grep "^\[1\]")" | tee -a $OUT/${ci}-lora-ppl.log

# currently not supported by the CUDA backend
# q8_0
#(time ./bin/perplexity --model ${model_q8_0} -f ${shakespeare} -t 1 -ngl 999 -c 2048 -b 512 --chunks 3 ) 2>&1 | tee -a $OUT/${ci}-ppl-shakespeare-q8_0.log
#(time ./bin/perplexity --model ${model_q8_0} -f ${shakespeare} --lora ${lora_shakespeare} -t 1 -ngl 999 -c 2048 -b 512 --chunks 3 ) 2>&1 | tee -a $OUT/${ci}-ppl-shakespeare-lora-q8_0.log
#compare_ppl "q8_0 shakespeare" "$(cat $OUT/${ci}-ppl-shakespeare-q8_0.log | grep "^\[1\]")" "$(cat $OUT/${ci}-ppl-shakespeare-lora-q8_0.log | grep "^\[1\]")" | tee -a $OUT/${ci}-lora-ppl.log

# q8_0 + f16 lora-base
#(time ./bin/perplexity --model ${model_q8_0} -f ${shakespeare} --lora ${lora_shakespeare} --lora-base ${model_f16} -t 1 -ngl 999 -c 2048 -b 512 --chunks 3 ) 2>&1 | tee -a $OUT/${ci}-ppl-shakespeare-lora-q8_0-f16.log
#compare_ppl "q8_0 / f16 shakespeare" "$(cat $OUT/${ci}-ppl-shakespeare-q8_0.log | grep "^\[1\]")" "$(cat $OUT/${ci}-ppl-shakespeare-lora-q8_0-f16.log | grep "^\[1\]")" | tee -a $OUT/${ci}-lora-ppl.log

set +e
}

@@ -601,7 +512,6 @@ function gg_sum_open_llama_7b_v2 {
gg_printf '- status: %s\n' "$(cat $OUT/${ci}.exit)"
gg_printf '- perplexity:\n%s\n' "$(cat $OUT/${ci}-ppl.log)"
gg_printf '- imatrix:\n```\n%s\n```\n' "$(cat $OUT/${ci}-imatrix-sum.log)"
gg_printf '- lora:\n%s\n' "$(cat $OUT/${ci}-lora-ppl.log)"
gg_printf '- f16: \n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-f16.log)"
gg_printf '- q8_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q8_0.log)"
gg_printf '- q4_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q4_0.log)"

@@ -614,11 +524,6 @@ function gg_sum_open_llama_7b_v2 {
gg_printf '- q5_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q5_k.log)"
gg_printf '- q6_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q6_k.log)"
gg_printf '- save-load-state: \n```\n%s\n```\n' "$(cat $OUT/${ci}-save-load-state.log)"
gg_printf '- shakespeare (f16):\n```\n%s\n```\n' "$(cat $OUT/${ci}-ppl-shakespeare-f16.log)"
gg_printf '- shakespeare (f16 lora):\n```\n%s\n```\n' "$(cat $OUT/${ci}-ppl-shakespeare-lora-f16.log)"
#gg_printf '- shakespeare (q8_0):\n```\n%s\n```\n' "$(cat $OUT/${ci}-ppl-shakespeare-q8_0.log)"
#gg_printf '- shakespeare (q8_0 lora):\n```\n%s\n```\n' "$(cat $OUT/${ci}-ppl-shakespeare-lora-q8_0.log)"
#gg_printf '- shakespeare (q8_0 / f16 base lora):\n```\n%s\n```\n' "$(cat $OUT/${ci}-ppl-shakespeare-lora-q8_0-f16.log)"
}

# bge-small
@@ -12,7 +12,7 @@ import sys
from enum import IntEnum
from pathlib import Path
from hashlib import sha256
from typing import TYPE_CHECKING, Any, Callable, ContextManager, Iterable, Iterator, Sequence, TypeVar, cast, overload
from typing import TYPE_CHECKING, Any, Callable, ContextManager, Iterable, Iterator, Sequence, TypeVar, cast

import numpy as np
import torch

@@ -48,7 +48,6 @@ class Model:
dir_model: Path
ftype: int
fname_out: Path
is_big_endian: bool
endianess: gguf.GGUFEndian
use_temp_file: bool

@@ -56,20 +55,20 @@ class Model:
part_names: list[str]
is_safetensors: bool
hparams: dict[str, Any]
gguf_writer: gguf.GGUFWriter
block_count: int
tensor_map: gguf.TensorNameMap
tensor_names: set[str] | None
fname_out: Path
gguf_writer: gguf.GGUFWriter

# subclasses should define this!
model_arch: gguf.MODEL_ARCH

def __init__(self, dir_model: Path, ftype: int, fname_out: Path, is_big_endian: bool, use_temp_file: bool, eager: bool):
if self.__class__ == Model:
raise TypeError(f"{self.__class__.__name__!r} should not be directly instantiated")
def __init__(self, dir_model: Path, ftype: gguf.LlamaFileType, fname_out: Path, is_big_endian: bool, use_temp_file: bool, eager: bool):
if type(self) is Model:
raise TypeError(f"{type(self).__name__!r} should not be directly instantiated")
self.dir_model = dir_model
self.ftype = ftype
self.fname_out = fname_out
self.is_big_endian = is_big_endian
self.endianess = gguf.GGUFEndian.BIG if is_big_endian else gguf.GGUFEndian.LITTLE
self.use_temp_file = use_temp_file

@@ -79,10 +78,23 @@ class Model:
if not self.is_safetensors:
self.part_names = Model.get_model_part_names(self.dir_model, ".bin")
self.hparams = Model.load_hparams(self.dir_model)
self.gguf_writer = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[self.model_arch], endianess=self.endianess, use_temp_file=self.use_temp_file)
self.block_count = self.find_hparam(["n_layers", "num_hidden_layers", "n_layer"])
self.tensor_map = gguf.get_tensor_name_map(self.model_arch, self.block_count)
self.tensor_names = None
if self.ftype == gguf.LlamaFileType.GUESSED:
# NOTE: can't use field "torch_dtype" in config.json, because some finetunes lie.
_, first_tensor = next(self.get_tensors())
if first_tensor.dtype == torch.float16:
logger.info(f"choosing --outtype f16 from first tensor type ({first_tensor.dtype})")
self.ftype = gguf.LlamaFileType.MOSTLY_F16
else:
logger.info(f"choosing --outtype bf16 from first tensor type ({first_tensor.dtype})")
self.ftype = gguf.LlamaFileType.MOSTLY_BF16
ftype_up: str = self.ftype.name.partition("_")[2].upper()
ftype_lw: str = ftype_up.lower()
# allow templating the file name with the output ftype, useful with the "auto" ftype
self.fname_out = fname_out.parent / fname_out.name.format(ftype_lw, outtype=ftype_lw, ftype=ftype_lw, OUTTYPE=ftype_up, FTYPE=ftype_up)
self.gguf_writer = gguf.GGUFWriter(self.fname_out, gguf.MODEL_ARCH_NAMES[self.model_arch], endianess=self.endianess, use_temp_file=self.use_temp_file)

@classmethod
def __init_subclass__(cls):

@@ -142,14 +154,27 @@ class Model:
raise ValueError(f"Mismatch between weight map and model parts for tensor names: {sym_diff}")

def format_tensor_name(self, key: gguf.MODEL_TENSOR, bid: int | None = None, suffix: str = ".weight") -> str:
name: str = gguf.TENSOR_NAMES[key]
if key not in gguf.MODEL_TENSORS[self.model_arch]:
raise ValueError(f"Missing {key!r} for MODEL_TENSORS of {self.model_arch!r}")
name: str = gguf.TENSOR_NAMES[key]
if "{bid}" in name:
assert bid is not None
name = name.format(bid=bid)
return name + suffix

def match_model_tensor_name(self, name: str, key: gguf.MODEL_TENSOR, bid: int | None, suffix: str = ".weight") -> bool:
if key not in gguf.MODEL_TENSORS[self.model_arch]:
return False
key_name: str = gguf.TENSOR_NAMES[key]
if "{bid}" in key_name:
if bid is None:
return False
key_name = key_name.format(bid=bid)
else:
if bid is not None:
return False
return name == (key_name + suffix)

def map_tensor_name(self, name: str, try_suffixes: Sequence[str] = (".weight", ".bias")) -> str:
new_name = self.tensor_map.get_name(key=name, try_suffixes=try_suffixes)
if new_name is None:

@@ -215,6 +240,23 @@ class Model:
return False

def write_tensors(self):
# same as ggml_compute_fp32_to_bf16 in ggml-impl.h
def np_fp32_to_bf16(n: np.ndarray):
# force nan to quiet
n = np.where((n & 0x7fffffff) > 0x7f800000, (n & 0xffff0000) | (64 << 16), n)
# flush subnormals to zero
n = np.where((n & 0x7f800000) == 0, n & 0x80000000, n)
# round to nearest even
n = (n + (0x7fff + ((n >> 16) & 1))) >> 16
return n.astype(np.int16)

# Doing this row-wise is much, much faster than element-wise, hence the signature
v_fp32_to_bf16 = np.vectorize(np_fp32_to_bf16, otypes=[np.int16], signature="(n)->(n)")
if self.lazy:
# TODO: find a way to implicitly wrap np.vectorize functions
# NOTE: the type is changed to reflect otypes passed to np.vectorize above
v_fp32_to_bf16 = gguf.LazyNumpyTensor._wrap_fn(v_fp32_to_bf16, meta_noop=np.int16)

max_name_len = max(len(s) for _, s in self.tensor_map.mapping.values()) + len(".weight,")

for name, data_torch in self.get_tensors():
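The bf16 helper above packs float32 values into bfloat16 bit patterns with round-to-nearest-even, quiet-NaN forcing, and subnormal flushing. A minimal standalone sketch of the same bit manipulation (NumPy only; the function name is illustrative, not part of the converter, and unsigned views are used so the masks stay in range on recent NumPy versions):

```python
import numpy as np

def fp32_to_bf16_bits(x: np.ndarray) -> np.ndarray:
    # Round float32 values to bfloat16 bit patterns (round-to-nearest-even).
    n = x.astype(np.float32).view(np.uint32)
    # force NaNs to a quiet-NaN bit pattern
    n = np.where((n & 0x7fffffff) > 0x7f800000, (n & 0xffff0000) | (64 << 16), n)
    # flush subnormals to signed zero
    n = np.where((n & 0x7f800000) == 0, n & 0x80000000, n)
    # round to nearest even, then keep the high 16 bits
    n = (n + (0x7fff + ((n >> 16) & 1))) >> 16
    return n.astype(np.uint16)

print(hex(fp32_to_bf16_bits(np.array([1.0], dtype=np.float32))[0]))  # 0x3f80
```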
@@ -239,35 +281,60 @@ class Model:
data: np.ndarray = data # type hint
n_dims = len(data.shape)
data_dtype = data.dtype

# if f32 desired, convert any float16 to float32
if self.ftype == 0 and data_dtype == np.float16:
data = data.astype(np.float32)
data_qtype: gguf.GGMLQuantizationType | None = None

# when both are True, f32 should win
extra_f32 = self.extra_f32_tensors(name, new_name, bid, n_dims)
extra_f16 = self.extra_f16_tensors(name, new_name, bid, n_dims)

# Most of the codebase that takes in 1D tensors or norms only handles F32 tensors
extra_f32 = extra_f32 or n_dims == 1 or new_name.endswith("_norm.weight")
# Conditions should closely match those in llama_model_quantize_internal in llama.cpp
extra_f32 = any(cond for cond in (
extra_f32,
n_dims == 1,
new_name.endswith("_norm.weight"),
))

# Some tensor types are always in float32
extra_f32 = extra_f32 or any(self.match_model_tensor_name(new_name, key, bid) for key in (
gguf.MODEL_TENSOR.FFN_GATE_INP,
gguf.MODEL_TENSOR.POS_EMBD,
gguf.MODEL_TENSOR.TOKEN_TYPES,
))

# if f16 desired, convert any float32 2-dim weight tensors to float16
extra_f16 = extra_f16 or (name.endswith(".weight") and n_dims >= 2)
extra_f16 = any(cond for cond in (
extra_f16,
(name.endswith(".weight") and n_dims >= 2),
))

# when both extra_f32 and extra_f16 are False, convert to float32 by default
if self.ftype == 1 and data_dtype == np.float16 and (extra_f32 or not extra_f16):
data = data.astype(np.float32)
if self.ftype != gguf.LlamaFileType.ALL_F32 and extra_f16 and not extra_f32:
if self.ftype == gguf.LlamaFileType.MOSTLY_F16:
if data_dtype != np.float16:
data = data.astype(np.float16)
data_qtype = gguf.GGMLQuantizationType.F16

if self.ftype == 1 and data_dtype == np.float32 and extra_f16 and not extra_f32:
data = data.astype(np.float16)
elif self.ftype == gguf.LlamaFileType.MOSTLY_BF16:
if data_dtype != np.float32:
data = data.astype(np.float32)
data = v_fp32_to_bf16(data.view(np.int32))
assert data.dtype == np.int16
data_qtype = gguf.GGMLQuantizationType.BF16

else: # by default, convert to float32
if data_dtype != np.float32:
data = data.astype(np.float32)
data_qtype = gguf.GGMLQuantizationType.F32

assert data_qtype is not None

# reverse shape to make it similar to the internal ggml dimension order
shape_str = f"{{{', '.join(str(n) for n in reversed(data.shape))}}}"

# n_dims is implicit in the shape
logger.info(f"{f'%-{max_name_len}s' % f'{new_name},'} {old_dtype} --> {data.dtype}, shape = {shape_str}")
logger.info(f"{f'%-{max_name_len}s' % f'{new_name},'} {old_dtype} --> {data_qtype.name}, shape = {shape_str}")

self.gguf_writer.add_tensor(new_name, data)
self.gguf_writer.add_tensor(new_name, data, raw_dtype=data_qtype)

def write(self):
self.write_tensors()

@@ -2047,12 +2114,6 @@ class BertModel(Model):

return [(self.map_tensor_name(name), data_torch)]

def extra_f32_tensors(self, name: str, new_name: str, bid: int | None, n_dims: int) -> bool:
del new_name, bid, n_dims # unused

# not used with get_rows, must be F32
return name == "embeddings.token_type_embeddings.weight"

@Model.register("NomicBertModel")
class NomicBertModel(BertModel):
@@ -2342,92 +2403,40 @@ class JinaBertV2Model(BertModel):

# tree of lazy tensors
class LazyTorchTensor:
_meta: Tensor
_data: Tensor | None
_args: tuple
_func: Callable[[tuple], Tensor] | None

def __init__(self, *, meta: Tensor, data: Tensor | None = None, args: tuple = (), func: Callable[[tuple], Tensor] | None = None):
self._meta = meta
self._data = data
self._args = args
self._func = func

@staticmethod
def _recurse_apply(o: Any, fn: Callable[[Any], Any]) -> Any:
# TODO: dict and set
if isinstance(o, (list, tuple)):
L = []
for item in o:
L.append(LazyTorchTensor._recurse_apply(item, fn))
if isinstance(o, tuple):
L = tuple(L)
return L
elif isinstance(o, LazyTorchTensor):
return fn(o)
else:
return o

def _wrap_fn(self, fn: Callable, use_self: bool = False) -> Callable[[Any], LazyTorchTensor]:
def wrapped_fn(*args, **kwargs):
if kwargs is None:
kwargs = {}
args = ((self,) if use_self else ()) + args

meta_args = LazyTorchTensor._recurse_apply(args, lambda t: t._meta)

return LazyTorchTensor(meta=fn(*meta_args, **kwargs), args=args, func=lambda a: fn(*a, **kwargs))
return wrapped_fn

def __getattr__(self, __name: str) -> Any:
meta_attr = getattr(self._meta, __name)
if callable(meta_attr):
return self._wrap_fn(getattr(torch.Tensor, __name), use_self=True)
elif isinstance(meta_attr, torch.Tensor):
# for things like self.T
return self._wrap_fn(lambda s: getattr(s, __name))(self)
else:
return meta_attr
class LazyTorchTensor(gguf.LazyBase):
_tensor_type = torch.Tensor
# to keep the type-checker happy
dtype: torch.dtype
shape: torch.Size

# only used when converting a torch.Tensor to a np.ndarray
_dtype_map: dict[torch.dtype, type] = {
torch.float16: np.float16,
torch.float32: np.float32,
}

def numpy(self) -> gguf.LazyTensor:
def numpy(self) -> gguf.LazyNumpyTensor:
dtype = self._dtype_map[self.dtype]
return gguf.LazyTensor(lambda: LazyTorchTensor.to_eager(self).numpy(), dtype=dtype, shape=self.shape)
return gguf.LazyNumpyTensor(
meta=np.lib.stride_tricks.as_strided(np.zeros(1, dtype), self.shape, (0 for _ in self.shape)),
lazy=self._lazy,
args=(self,),
func=(lambda s: s[0].numpy())
)

@overload
@staticmethod
def to_eager(t: Tensor | LazyTorchTensor) -> Tensor: ...

@overload
@staticmethod
def to_eager(t: tuple) -> tuple: ...

@staticmethod
def to_eager(t: Any) -> Any:
def simple_to_eager(_t: LazyTorchTensor) -> Tensor:
# wake up the lazy tensor
if _t._data is None and _t._func is not None:
# recurse into its arguments
_t._args = LazyTorchTensor.to_eager(_t._args)
_t._data = _t._func(_t._args)
if _t._data is not None:
return _t._data
else:
raise ValueError(f"Could not compute lazy tensor {_t!r} with args {_t._args!r}")

# recurse into lists and/or tuples, keeping their structure
return LazyTorchTensor._recurse_apply(t, simple_to_eager)

@staticmethod
def from_eager(t: Tensor) -> Tensor:
if (t.__class__ == LazyTorchTensor):
@classmethod
def eager_to_meta(cls, t: Tensor) -> Tensor:
if t.is_meta:
return t
return LazyTorchTensor(meta=t.detach().to("meta"), data=t) # type: ignore
return t.detach().to("meta")

@classmethod
def meta_with_dtype(cls, m: Tensor, dtype: torch.dtype) -> Tensor:
m = m.detach()
if not m.is_meta:
m = m.to("meta")
m.dtype = dtype
return m

@classmethod
def __torch_function__(cls, func, types, args=(), kwargs=None):

@@ -2438,28 +2447,8 @@ class LazyTorchTensor:

if func is torch.Tensor.numpy:
return args[0].numpy()
if func is torch.equal:
eager_args = LazyTorchTensor.to_eager(args)
return func(*eager_args, **kwargs)

return LazyTorchTensor._wrap_fn(args[0], func)(*args, **kwargs)

# special methods bypass __getattr__, so they need to be added manually
# ref: https://docs.python.org/3/reference/datamodel.html#special-lookup
# NOTE: LazyTorchTensor can't be a subclass of Tensor (and then be used
# as self._meta is currently used), because then the following
# operations would by default not be wrapped, and so not propagated
# when the tensor is made eager.
# It's better to get non-silent errors for not-yet-supported operators.
# TODO: add more when needed to avoid clutter, or find a more concise way
def __neg__(self, *args): # mamba
return self._wrap_fn(torch.Tensor.__neg__)(self, *args)

def __add__(self, *args): # gemma
return self._wrap_fn(torch.Tensor.__add__)(self, *args)

def __getitem__(self, *args): # bloom falcon refact internlm2
return self._wrap_fn(torch.Tensor.__getitem__)(self, *args)
return LazyTorchTensor._wrap_fn(func)(*args, **kwargs)
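The rewritten lazy tensor builds its pending-operation graph on PyTorch meta tensors, so shapes and dtypes propagate without allocating or reading any weight data. A rough illustration of that underlying mechanism, independent of the gguf.LazyBase machinery (assumes only PyTorch is installed):

```python
import torch

# A meta tensor carries shape/dtype but no storage; operations on it
# only propagate metadata, nothing is computed or loaded.
m = torch.empty(4096, 4096, dtype=torch.float16, device="meta")
out = m.float() @ m.float().T  # still on the meta device
print(out.shape, out.dtype, out.device)  # torch.Size([4096, 4096]) torch.float32 meta
```

When the real values are finally needed, the lazy wrapper replays the recorded operations on the eagerly loaded tensors.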
def parse_args() -> argparse.Namespace:

@@ -2475,11 +2464,11 @@ def parse_args() -> argparse.Namespace:
)
parser.add_argument(
"--outfile", type=Path,
help="path to write to; default: based on input",
help="path to write to; default: based on input. {ftype} will be replaced by the outtype.",
)
parser.add_argument(
"--outtype", type=str, choices=["f32", "f16"], default="f16",
help="output format - use f32 for float32, f16 for float16",
"--outtype", type=str, choices=["f32", "f16", "bf16", "auto"], default="f16",
help="output format - use f32 for float32, f16 for float16, bf16 for bfloat16, auto for the highest-fidelity 16-bit float type depending on the first loaded tensor type",
)
parser.add_argument(
"--bigendian", action="store_true",

@@ -2533,16 +2522,18 @@ def main() -> None:
logger.error(f'Error: {args.model} is not a directory')
sys.exit(1)

ftype_map = {
"f32": gguf.GGMLQuantizationType.F32,
"f16": gguf.GGMLQuantizationType.F16,
ftype_map: dict[str, gguf.LlamaFileType] = {
"f32": gguf.LlamaFileType.ALL_F32,
"f16": gguf.LlamaFileType.MOSTLY_F16,
"bf16": gguf.LlamaFileType.MOSTLY_BF16,
"auto": gguf.LlamaFileType.GUESSED,
}

if args.outfile is not None:
fname_out = args.outfile
else:
# output in the same directory as the model by default
fname_out = dir_model / f'ggml-model-{args.outtype}.gguf'
fname_out = dir_model / 'ggml-model-{ftype}.gguf'

logger.info(f"Loading model: {dir_model.name}")

@@ -2558,14 +2549,16 @@ def main() -> None:
logger.info("Set model tokenizer")
model_instance.set_vocab()

model_instance.gguf_writer.add_quantization_version(gguf.GGML_QUANT_VERSION)

if args.vocab_only:
logger.info(f"Exporting model vocab to '{fname_out}'")
logger.info(f"Exporting model vocab to '{model_instance.fname_out}'")
model_instance.write_vocab()
else:
logger.info(f"Exporting model to '{fname_out}'")
logger.info(f"Exporting model to '{model_instance.fname_out}'")
model_instance.write()

logger.info(f"Model successfully exported to '{fname_out}'")
logger.info(f"Model successfully exported to '{model_instance.fname_out}'")

if __name__ == '__main__':
@@ -1,150 +0,0 @@
#!/usr/bin/env python3
from __future__ import annotations

import logging
import json
import os
import struct
import sys
from pathlib import Path
from typing import Any, BinaryIO, Sequence

import numpy as np
import torch

if 'NO_LOCAL_GGUF' not in os.environ:
sys.path.insert(1, str(Path(__file__).parent / 'gguf-py' / 'gguf'))
import gguf

logging.basicConfig(level=logging.DEBUG)
logger = logging.getLogger("lora-to-gguf")

NUMPY_TYPE_TO_FTYPE: dict[str, int] = {"float32": 0, "float16": 1}

def write_file_header(fout: BinaryIO, params: dict[str, Any]) -> None:
fout.write(b"ggla"[::-1]) # magic (ggml lora)
fout.write(struct.pack("i", 1)) # file version
fout.write(struct.pack("i", params["r"]))
# https://opendelta.readthedocs.io/en/latest/modules/deltas.html says that `lora_alpha` is an int
# but some models ship a float value instead
# let's convert to int, but fail if lossless conversion is not possible
assert (
int(params["lora_alpha"]) == params["lora_alpha"]
), "cannot convert float to int losslessly"
fout.write(struct.pack("i", int(params["lora_alpha"])))

def write_tensor_header(fout: BinaryIO, name: str, shape: Sequence[int], data_type: np.dtype[Any]) -> None:
sname = name.encode("utf-8")
fout.write(
struct.pack(
"iii",
len(shape),
len(sname),
NUMPY_TYPE_TO_FTYPE[data_type.name],
)
)
fout.write(struct.pack("i" * len(shape), *shape[::-1]))
fout.write(sname)
fout.seek((fout.tell() + 31) & -32)

if __name__ == '__main__':
if len(sys.argv) < 2:
logger.info(f"Usage: python {sys.argv[0]} <path> [arch]")
logger.info("Path must contain HuggingFace PEFT LoRA files 'adapter_config.json' and 'adapter_model.bin'")
logger.info(f"Arch must be one of {list(gguf.MODEL_ARCH_NAMES.values())} (default: llama)")
sys.exit(1)

input_json = os.path.join(sys.argv[1], "adapter_config.json")
input_model = os.path.join(sys.argv[1], "adapter_model.bin")
output_path = os.path.join(sys.argv[1], "ggml-adapter-model.bin")

if os.path.exists(input_model):
model = torch.load(input_model, map_location="cpu")
else:
input_model = os.path.join(sys.argv[1], "adapter_model.safetensors")
# lazy import load_file only if lora is in safetensors format.
from safetensors.torch import load_file
model = load_file(input_model, device="cpu")

arch_name = sys.argv[2] if len(sys.argv) == 3 else "llama"

if arch_name not in gguf.MODEL_ARCH_NAMES.values():
logger.error(f"Error: unsupported architecture {arch_name}")
sys.exit(1)

arch = list(gguf.MODEL_ARCH_NAMES.keys())[list(gguf.MODEL_ARCH_NAMES.values()).index(arch_name)]
name_map = gguf.TensorNameMap(arch, 200) # 200 layers ought to be enough for anyone

with open(input_json, "r") as f:
params = json.load(f)

if params["peft_type"] != "LORA":
logger.error(f"Error: unsupported adapter type {params['peft_type']}, expected LORA")
sys.exit(1)

if params["fan_in_fan_out"] is True:
logger.error("Error: param fan_in_fan_out is not supported")
sys.exit(1)

if params["bias"] is not None and params["bias"] != "none":
logger.error("Error: param bias is not supported")
sys.exit(1)

# TODO: these seem to be layers that have been trained but without lora.
# doesn't seem widely used but eventually should be supported
if params["modules_to_save"] is not None and len(params["modules_to_save"]) > 0:
logger.error("Error: param modules_to_save is not supported")
sys.exit(1)

with open(output_path, "wb") as fout:
fout.truncate()

write_file_header(fout, params)
for k, v in model.items():
orig_k = k
if k.endswith(".default.weight"):
k = k.replace(".default.weight", ".weight")
if k in ["llama_proj.weight", "llama_proj.bias"]:
continue
if k.endswith("lora_A.weight"):
if v.dtype != torch.float16 and v.dtype != torch.float32:
v = v.float()
v = v.T
else:
v = v.float()

t = v.detach().numpy()

prefix = "base_model.model."
if k.startswith(prefix):
k = k[len(prefix) :]

lora_suffixes = (".lora_A.weight", ".lora_B.weight")
if k.endswith(lora_suffixes):
suffix = k[-len(lora_suffixes[0]):]
k = k[: -len(lora_suffixes[0])]
else:
logger.error(f"Error: unrecognized tensor name {orig_k}")
sys.exit(1)

tname = name_map.get_name(k)
if tname is None:
logger.error(f"Error: could not map tensor name {orig_k}")
logger.error(" Note: the arch parameter must be specified if the model is not llama")
sys.exit(1)

if suffix == ".lora_A.weight":
tname += ".weight.loraA"
elif suffix == ".lora_B.weight":
tname += ".weight.loraB"
else:
assert False

logger.info(f"{k} => {tname} {t.shape} {t.dtype} {t.nbytes/1024/1024:.2f}MB")
write_tensor_header(fout, tname, t.shape, t.dtype)
t.tofile(fout)

logger.info(f"Converted {input_json} and {input_model} to {output_path}")
convert.py: 178 changes

@@ -24,7 +24,7 @@ from abc import ABC, abstractmethod
from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor
from dataclasses import dataclass
from pathlib import Path
from typing import TYPE_CHECKING, Any, Callable, ClassVar, IO, Iterable, Literal, Protocol, TypeVar, runtime_checkable
from typing import TYPE_CHECKING, Any, Callable, ClassVar, IO, Iterable, Literal, Protocol, TypeVar, runtime_checkable, Optional

import numpy as np
from sentencepiece import SentencePieceProcessor

@@ -344,10 +344,47 @@ class Params:
return params

@dataclass
class Metadata:
name: Optional[str] = None
author: Optional[str] = None
version: Optional[str] = None
url: Optional[str] = None
description: Optional[str] = None
licence: Optional[str] = None
source_url: Optional[str] = None
source_hf_repo: Optional[str] = None

@staticmethod
def load(metadata_path: Path) -> Metadata:
if metadata_path is None or not metadata_path.exists():
return Metadata()

with open(metadata_path, 'r') as file:
data = json.load(file)

# Create a new Metadata instance
metadata = Metadata()

# Assigning values to Metadata attributes if they exist in the JSON file
# This is based on LLM_KV_NAMES mapping in llama.cpp
metadata.name = data.get("general.name")
metadata.author = data.get("general.author")
metadata.version = data.get("general.version")
metadata.url = data.get("general.url")
metadata.description = data.get("general.description")
metadata.license = data.get("general.license")
metadata.source_url = data.get("general.source.url")
metadata.source_hf_repo = data.get("general.source.huggingface.repository")

return metadata
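For reference, the --metadata file consumed by Metadata.load() is a flat JSON object keyed by the general.* names read above. A hypothetical example, written from Python (the keys are the ones the loader looks up; the values are invented):

```python
import json
from pathlib import Path

# Hypothetical metadata file for `convert.py <model dir> --metadata metadata.json`
Path("metadata.json").write_text(json.dumps({
    "general.name": "MyModel",
    "general.author": "Example Author",
    "general.version": "v1.0",
    "general.url": "https://example.com/mymodel",
    "general.description": "An example fine-tune",
    "general.license": "apache-2.0",
    "general.source.url": "https://example.com/source",
    "general.source.huggingface.repository": "example/mymodel",
}, indent=4))
```

Note that the loader assigns metadata.license while the dataclass (and add_meta_model below) use the licence field, so a general.license value read this way does not reach add_licence.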
#
# vocab
#

@runtime_checkable
class BaseVocab(Protocol):
tokenizer_model: ClassVar[str]

@@ -1066,21 +1103,42 @@ class OutputFile:
def __init__(self, fname_out: Path, endianess:gguf.GGUFEndian = gguf.GGUFEndian.LITTLE):
self.gguf = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[ARCH], endianess=endianess)

def add_meta_arch(self, params: Params) -> None:
def add_meta_model(self, params: Params, metadata: Metadata) -> None:
# Metadata About The Model And Its Provenence
name = "LLaMA"

# TODO: better logic to determine model name
if params.n_ctx == 4096:
name = "LLaMA v2"
if metadata is not None and metadata.name is not None:
name = metadata.name
elif params.path_model is not None:
name = str(params.path_model.parent).split('/')[-1]
name = str(params.path_model.parent).split("/")[-1]
elif params.n_ctx == 4096:
# Heuristic detection of LLaMA v2 model
name = "LLaMA v2"

self.gguf.add_name (name)
self.gguf.add_vocab_size (params.n_vocab)
self.gguf.add_context_length (params.n_ctx)
self.gguf.add_embedding_length (params.n_embd)
self.gguf.add_block_count (params.n_layer)
self.gguf.add_feed_forward_length (params.n_ff)
self.gguf.add_name(name)

if metadata is not None:
if metadata.author is not None:
self.gguf.add_author(metadata.author)
if metadata.version is not None:
self.gguf.add_version(metadata.version)
if metadata.url is not None:
self.gguf.add_url(metadata.url)
if metadata.description is not None:
self.gguf.add_description(metadata.description)
if metadata.licence is not None:
self.gguf.add_licence(metadata.licence)
if metadata.source_url is not None:
self.gguf.add_source_url(metadata.source_url)
if metadata.source_hf_repo is not None:
self.gguf.add_source_hf_repo(metadata.source_hf_repo)

def add_meta_arch(self, params: Params) -> None:
# Metadata About The Neural Architecture Itself
self.gguf.add_vocab_size(params.n_vocab)
self.gguf.add_context_length(params.n_ctx)
self.gguf.add_embedding_length(params.n_embd)
self.gguf.add_block_count(params.n_layer)
self.gguf.add_feed_forward_length(params.n_ff)
self.gguf.add_rope_dimension_count(params.n_embd // params.n_head)
self.gguf.add_head_count (params.n_head)
self.gguf.add_head_count_kv (params.n_head_kv)

@@ -1183,13 +1241,14 @@ class OutputFile:
@staticmethod
def write_vocab_only(
fname_out: Path, params: Params, vocab: Vocab, svocab: gguf.SpecialVocab,
endianess: gguf.GGUFEndian = gguf.GGUFEndian.LITTLE, pad_vocab: bool = False,
endianess: gguf.GGUFEndian = gguf.GGUFEndian.LITTLE, pad_vocab: bool = False, metadata: Metadata = None,
) -> None:
check_vocab_size(params, vocab, pad_vocab=pad_vocab)

of = OutputFile(fname_out, endianess=endianess)

# meta data
of.add_meta_model(params, metadata)
of.add_meta_arch(params)
of.add_meta_vocab(vocab)
of.add_meta_special_vocab(svocab)

@@ -1216,12 +1275,14 @@ class OutputFile:
fname_out: Path, ftype: GGMLFileType, params: Params, model: LazyModel, vocab: BaseVocab, svocab: gguf.SpecialVocab,
concurrency: int = DEFAULT_CONCURRENCY, endianess: gguf.GGUFEndian = gguf.GGUFEndian.LITTLE,
pad_vocab: bool = False,
metadata: Metadata = None,
) -> None:
check_vocab_size(params, vocab, pad_vocab=pad_vocab)

of = OutputFile(fname_out, endianess=endianess)

# meta data
of.add_meta_model(params, metadata)
of.add_meta_arch(params)
if isinstance(vocab, Vocab):
of.add_meta_vocab(vocab)

@@ -1257,6 +1318,37 @@ def pick_output_type(model: LazyModel, output_type_str: str | None) -> GGMLFileT
raise ValueError(f"Unexpected combination of types: {name_to_type}")

def model_parameter_count(model: LazyModel) -> int:
total_model_parameters = 0
for i, (name, lazy_tensor) in enumerate(model.items()):
sum_weights_in_tensor = 1
for dim in lazy_tensor.shape:
sum_weights_in_tensor *= dim
total_model_parameters += sum_weights_in_tensor
return total_model_parameters

def model_parameter_count_rounded_notation(model_params_count: int) -> str:
if model_params_count > 1e12 :
# Trillions Of Parameters
scaled_model_params = model_params_count * 1e-12
scale_suffix = "T"
elif model_params_count > 1e9 :
# Billions Of Parameters
scaled_model_params = model_params_count * 1e-9
scale_suffix = "B"
elif model_params_count > 1e6 :
# Millions Of Parameters
scaled_model_params = model_params_count * 1e-6
scale_suffix = "M"
else:
# Thousands Of Parameters
scaled_model_params = model_params_count * 1e-3
scale_suffix = "K"

return f"{round(scaled_model_params)}{scale_suffix}"

def convert_to_output_type(model: LazyModel, output_type: GGMLFileType) -> LazyModel:
return {name: tensor.astype(output_type.type_for_tensor(name, tensor))
for (name, tensor) in model.items()}

@@ -1436,13 +1528,35 @@ class VocabFactory:
return vocab, special_vocab

def default_outfile(model_paths: list[Path], file_type: GGMLFileType) -> Path:
namestr = {
GGMLFileType.AllF32: "f32",
GGMLFileType.MostlyF16: "f16",
GGMLFileType.MostlyQ8_0:"q8_0",
def default_convention_outfile(file_type: GGMLFileType, params: Params, model_params_count: int, metadata: Metadata) -> str:
quantization = {
GGMLFileType.AllF32: "F32",
GGMLFileType.MostlyF16: "F16",
GGMLFileType.MostlyQ8_0: "Q8_0",
}[file_type]
ret = model_paths[0].parent / f"ggml-model-{namestr}.gguf"

parameters = model_parameter_count_rounded_notation(model_params_count)

expert_count = ""
if params.n_experts is not None:
expert_count = f"{params.n_experts}x"

version = ""
if metadata is not None and metadata.version is not None:
version = f"-{metadata.version}"

name = "ggml-model"
if metadata is not None and metadata.name is not None:
name = metadata.name
elif params.path_model is not None:
name = params.path_model.name

return f"{name}{version}-{expert_count}{parameters}-{quantization}"

def default_outfile(model_paths: list[Path], file_type: GGMLFileType, params: Params, model_params_count: int, metadata: Metadata) -> Path:
default_filename = default_convention_outfile(file_type, params, model_params_count, metadata)
ret = model_paths[0].parent / f"{default_filename}.gguf"
if ret in model_paths:
logger.error(
f"Error: Default output path ({ret}) would overwrite the input. "
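Taken together, the helpers above make the default output name follow the pattern name{-version}-{experts}x{params}-{quantization}.gguf. A small sketch of the parameter-count notation and the kind of name it would produce (values illustrative, not from the source):

```python
def params_notation(count: int) -> str:
    # mirrors model_parameter_count_rounded_notation() above
    for scale, suffix in ((1e12, "T"), (1e9, "B"), (1e6, "M")):
        if count > scale:
            return f"{round(count / scale)}{suffix}"
    return f"{round(count * 1e-3)}K"

# e.g. a ~6.7e9-parameter F16 model with metadata.name "MyModel" and
# version "v1.0" (and no experts) would be written as "MyModel-v1.0-7B-F16.gguf"
print(params_notation(6_738_415_616))  # 7B
```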
@@ -1480,17 +1594,30 @@ def main(args_in: list[str] | None = None) -> None:
parser.add_argument("--pad-vocab", action="store_true", help="add pad tokens when model vocab expects more than tokenizer metadata provides")
parser.add_argument("--skip-unknown", action="store_true", help="skip unknown tensor names instead of failing")
parser.add_argument("--verbose", action="store_true", help="increase output verbosity")
parser.add_argument("--metadata", type=Path, help="Specify the path for a metadata file")
parser.add_argument("--get-outfile", action="store_true", help="get calculated default outfile name")

args = parser.parse_args(args_in)

if args.verbose:
logging.basicConfig(level=logging.DEBUG)
elif args.dump_single or args.dump:
elif args.dump_single or args.dump or args.get_outfile:
# Avoid printing anything besides the dump output
logging.basicConfig(level=logging.WARNING)
else:
logging.basicConfig(level=logging.INFO)

metadata = Metadata.load(args.metadata)

if args.get_outfile:
model_plus = load_some_model(args.model)
params = Params.load(model_plus)
model = convert_model_names(model_plus.model, params, args.skip_unknown)
model_params_count = model_parameter_count(model_plus.model)
ftype = pick_output_type(model, args.outtype)
print(f"{default_convention_outfile(ftype, params, model_params_count, metadata)}") # noqa: NP100
return

if args.no_vocab and args.vocab_only:
raise ValueError("--vocab-only does not make sense with --no-vocab")

@@ -1504,6 +1631,9 @@ def main(args_in: list[str] | None = None) -> None:
else:
model_plus = ModelPlus(model = {}, paths = [args.model / 'dummy'], format = 'none', vocab = None)

model_params_count = model_parameter_count(model_plus.model)
logger.info(f"model parameters count : {model_params_count} ({model_parameter_count_rounded_notation(model_params_count)})")

if args.dump:
do_dump_model(model_plus)
return

@@ -1557,7 +1687,7 @@ def main(args_in: list[str] | None = None) -> None:
f_norm_eps = 1e-5,
)
OutputFile.write_vocab_only(outfile, params, vocab, special_vocab,
endianess=endianess, pad_vocab=args.pad_vocab)
endianess=endianess, pad_vocab=args.pad_vocab, metadata=metadata)
logger.info(f"Wrote {outfile}")
return

@@ -1570,13 +1700,13 @@ def main(args_in: list[str] | None = None) -> None:
model = convert_model_names(model, params, args.skip_unknown)
ftype = pick_output_type(model, args.outtype)
model = convert_to_output_type(model, ftype)
outfile = args.outfile or default_outfile(model_plus.paths, ftype)
outfile = args.outfile or default_outfile(model_plus.paths, ftype, params, model_params_count, metadata)

params.ftype = ftype
logger.info(f"Writing {outfile}, format {ftype}")

OutputFile.write_all(outfile, ftype, params, model, vocab, special_vocab,
concurrency=args.concurrency, endianess=endianess, pad_vocab=args.pad_vocab)
concurrency=args.concurrency, endianess=endianess, pad_vocab=args.pad_vocab, metadata=metadata)
logger.info(f"Wrote {outfile}")
docs/debugging-tests.md: 88 additions (new file)

@@ -0,0 +1,88 @@
# Debugging Tests Tips

## How to run & debug a specific test without anything else to keep the feedback loop short?

There is a script called debug-test.sh in the scripts folder which takes a test regex and an optional test number as parameters.

For example, running the following command will output an interactive list from which you can select a test. It takes this form:

`debug-test.sh [OPTION]... <test_regex> <test_number>`

It will then build the test and run it in the debugger for you.

```bash
./scripts/debug-test.sh test-tokenizer

# Once in the debugger, i.e. at the chevrons prompt, setting a breakpoint could be as follows:
>>> b main
```

For further reference use `debug-test.sh -h` to print help.

### How does the script work?
If you want to use the concepts contained in the script separately, the important ones are briefly outlined below.

#### Step 1: Reset and Setup folder context

From the base of this repository, let's create `build-ci-debug` as our build context.

```bash
rm -rf build-ci-debug && mkdir build-ci-debug && cd build-ci-debug
```

#### Step 2: Setup Build Environment and Compile Test Binaries

Set up and trigger a build in debug mode. You may adapt the arguments as needed, but in this case these are sane defaults.

```bash
cmake -DCMAKE_BUILD_TYPE=Debug -DLLAMA_CUDA=1 -DLLAMA_FATAL_WARNINGS=ON ..
make -j
```

#### Step 3.1: Identify Test Command for Debugging

The output of this command will give you the command & arguments needed to run GDB.

* `-R test-tokenizer` : looks for all the test files named `test-tokenizer*` (R=Regex)
* `-N` : "show-only" disables test execution & shows test commands that you can feed to GDB.
* `-V` : Verbose Mode

```bash
ctest -R "test-tokenizer" -V -N
```

This may return output similar to below (focusing on key lines to pay attention to):

```bash
...
1: Test command: ~/llama.cpp/build-ci-debug/bin/test-tokenizer-0 "~/llama.cpp/tests/../models/ggml-vocab-llama-spm.gguf"
1: Working Directory: .
Labels: main
Test #1: test-tokenizer-0-llama-spm
...
4: Test command: ~/llama.cpp/build-ci-debug/bin/test-tokenizer-0 "~/llama.cpp/tests/../models/ggml-vocab-falcon.gguf"
4: Working Directory: .
Labels: main
Test #4: test-tokenizer-0-falcon
...
```

So for test #1 we can tell these two pieces of relevant information:
* Test Binary: `~/llama.cpp/build-ci-debug/bin/test-tokenizer-0`
* Test GGUF Model: `~/llama.cpp/tests/../models/ggml-vocab-llama-spm.gguf`

#### Step 3.2: Run GDB on test command

Based on the ctest 'test command' report above, we can then run a gdb session via this command:

```bash
gdb --args ${Test Binary} ${Test GGUF Model}
```

Example:

```bash
gdb --args ~/llama.cpp/build-ci-debug/bin/test-tokenizer-0 "~/llama.cpp/tests/../models/ggml-vocab-llama-spm.gguf"
```
@@ -651,9 +651,6 @@ struct server_context {
std::string system_prompt;
std::vector<llama_token> system_tokens;

std::string name_user; // this should be the antiprompt
std::string name_assistant;

// slots / clients
std::vector<server_slot> slots;
json default_generation_settings_for_props;

@@ -1100,15 +1097,11 @@ struct server_context {
system_need_update = false;
}

void system_prompt_set(const json & sys_props) {
system_prompt = sys_props.value("prompt", "");
name_user = sys_props.value("anti_prompt", "");
name_assistant = sys_props.value("assistant_name", "");
bool system_prompt_set(const std::string & sys_prompt) {
system_prompt = sys_prompt;

LOG_VERBOSE("system prompt process", {
{"system_prompt", system_prompt},
{"name_user", name_user},
{"name_assistant", name_assistant},
});

// release all slots

@@ -1117,6 +1110,7 @@ struct server_context {
}

system_need_update = true;
return true;
}

bool process_token(completion_token_output & result, server_slot & slot) {

@@ -1536,7 +1530,8 @@ struct server_context {
}

if (task.data.contains("system_prompt")) {
system_prompt_set(task.data.at("system_prompt"));
std::string sys_prompt = json_value(task.data, "system_prompt", std::string());
system_prompt_set(sys_prompt);

for (server_slot & slot : slots) {
slot.n_past = 0;

@@ -2920,7 +2915,7 @@ int main(int argc, char ** argv) {
server_params_parse(argc, argv, sparams, params);

if (!sparams.system_prompt.empty()) {
ctx_server.system_prompt_set(json::parse(sparams.system_prompt));
ctx_server.system_prompt_set(sparams.system_prompt);
}

if (params.model_alias == "unknown") {

@@ -3409,8 +3404,7 @@ int main(int argc, char ** argv) {
const auto handle_props = [&ctx_server](const httplib::Request & req, httplib::Response & res) {
res.set_header("Access-Control-Allow-Origin", req.get_header_value("Origin"));
json data = {
{ "user_name", ctx_server.name_user.c_str() },
{ "assistant_name", ctx_server.name_assistant.c_str() },
{ "system_prompt", ctx_server.system_prompt.c_str() },
{ "default_generation_settings", ctx_server.default_generation_settings_for_props },
{ "total_slots", ctx_server.params.n_parallel }
};
@@ -887,6 +887,7 @@ async def oai_chat_completions(user_prompt,
base_path,
async_client,
debug=False,
temperature=None,
model=None,
n_predict=None,
enable_streaming=None,

@@ -913,7 +914,8 @@ async def oai_chat_completions(user_prompt,
"model": model,
"max_tokens": n_predict,
"stream": enable_streaming,
"seed": seed
"temperature": temperature if temperature is not None else 0.0,
"seed": seed,
}
if response_format is not None:
payload['response_format'] = response_format

@@ -978,7 +980,8 @@ async def oai_chat_completions(user_prompt,
max_tokens=n_predict,
stream=enable_streaming,
response_format=payload.get('response_format'),
seed=seed
seed=seed,
temperature=payload['temperature']
)
except openai.error.AuthenticationError as e:
if expect_api_error is not None and expect_api_error:
@@ -371,7 +371,7 @@ static json oaicompat_completion_params_parse(
llama_params["presence_penalty"] = json_value(body, "presence_penalty", 0.0);
llama_params["seed"] = json_value(body, "seed", LLAMA_DEFAULT_SEED);
llama_params["stream"] = json_value(body, "stream", false);
llama_params["temperature"] = json_value(body, "temperature", 0.0);
llama_params["temperature"] = json_value(body, "temperature", 1.0);
llama_params["top_p"] = json_value(body, "top_p", 1.0);

// Apply chat template to the list of messages
@@ -1182,9 +1182,9 @@ static int ggml_backend_sched_backend_id_from_cur(ggml_backend_sched_t sched, st
static char * fmt_size(size_t size) {
static char buffer[128];
if (size >= 1024*1024) {
sprintf(buffer, "%zuM", size/1024/1024);
snprintf(buffer, sizeof(buffer), "%zuM", size/1024/1024);
} else {
sprintf(buffer, "%zuK", size/1024);
snprintf(buffer, sizeof(buffer), "%zuK", size/1024);
}
return buffer;
}
15
ggml-cuda.cu
15
ggml-cuda.cu
|
@ -2204,6 +2204,9 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
|
|||
case GGML_UNARY_OP_RELU:
|
||||
ggml_cuda_op_relu(ctx, dst);
|
||||
break;
|
||||
case GGML_UNARY_OP_SIGMOID:
|
||||
ggml_cuda_op_sigmoid(ctx, dst);
|
||||
break;
|
||||
case GGML_UNARY_OP_HARDSIGMOID:
|
||||
ggml_cuda_op_hardsigmoid(ctx, dst);
|
||||
break;
|
||||
|
@ -2710,12 +2713,14 @@ GGML_CALL static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t
|
|||
}
|
||||
|
||||
GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, const ggml_tensor * op) {
|
||||
ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *) backend->context;
|
||||
switch (op->op) {
|
||||
case GGML_OP_UNARY:
|
||||
switch (ggml_get_unary_op(op)) {
|
||||
case GGML_UNARY_OP_GELU:
|
||||
case GGML_UNARY_OP_SILU:
|
||||
case GGML_UNARY_OP_RELU:
|
||||
case GGML_UNARY_OP_SIGMOID:
|
||||
case GGML_UNARY_OP_HARDSIGMOID:
|
||||
case GGML_UNARY_OP_HARDSWISH:
|
||||
case GGML_UNARY_OP_GELU_QUICK:
|
||||
|
@ -2836,8 +2841,16 @@ GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, cons
|
|||
case GGML_OP_ARANGE:
|
||||
case GGML_OP_TIMESTEP_EMBEDDING:
|
||||
case GGML_OP_LEAKY_RELU:
|
||||
case GGML_OP_FLASH_ATTN_EXT:
|
||||
return true;
|
||||
case GGML_OP_FLASH_ATTN_EXT:
|
||||
#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
|
||||
return op->src[0]->ne[0] == 64 || op->src[0]->ne[0] == 128;
|
||||
#else
|
||||
if (op->src[0]->ne[0] == 64 || op->src[0]->ne[0] == 128) {
|
||||
return true;
|
||||
}
|
||||
return ggml_cuda_info().devices[cuda_ctx->device].cc >= CC_VOLTA;
|
||||
#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
|
||||
default:
|
||||
return false;
|
||||
}
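This hunk turns FLASH_ATTN_EXT from an unconditional `return true` into a device-dependent check: HIP/AMD builds only accept head sizes 64 and 128, while CUDA builds accept those head sizes everywhere and otherwise require Volta-class tensor cores. A standalone C++ sketch of that decision, with an assumed CC_VOLTA value of 700 and a hypothetical function name (illustration only):

// Illustration only (not part of this commit): the FLASH_ATTN_EXT support
// rule from the hunk above as a standalone predicate. CC_VOLTA = 700 is an
// assumed value; the function name is hypothetical.
#include <cstdio>

static bool supports_flash_attn_ext(int head_size, int compute_capability, bool is_hip_amd) {
    const int CC_VOLTA = 700;                            // assumed, not taken from the diff
    if (is_hip_amd) {
        return head_size == 64 || head_size == 128;      // only the no-MMA vector path on HIP
    }
    if (head_size == 64 || head_size == 128) {
        return true;                                     // vector kernel covers these head sizes
    }
    return compute_capability >= CC_VOLTA;               // otherwise require tensor cores
}

int main() {
    printf("%d\n", supports_flash_attn_ext(256, 610, false)); // 0: head size 256 needs Volta+
    printf("%d\n", supports_flash_attn_ext(128, 610, false)); // 1
    return 0;
}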
@ -321,6 +321,10 @@ static __device__ __forceinline__ int __dp4a(const int a, const int b, int c) {
|
|||
|
||||
#define FP16_MMA_AVAILABLE !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_VOLTA
|
||||
|
||||
static bool fast_fp16_available(const int cc) {
|
||||
return cc >= CC_PASCAL && cc != 610;
|
||||
}
|
||||
|
||||
static bool fp16_mma_available(const int cc) {
|
||||
return cc < CC_OFFSET_AMD && cc >= CC_VOLTA;
|
||||
}
47 ggml-cuda/fattn-common.cuh Normal file
|
@ -0,0 +1,47 @@
|
|||
#define FATTN_KQ_STRIDE 256
|
||||
#define HALF_MAX_HALF __float2half(65504.0f/2) // Use neg. of this instead of -INFINITY to initialize KQ max vals to avoid NaN upon subtraction.
|
||||
#define SOFTMAX_FTZ_THRESHOLD -20.0f // Softmax exp. of values smaller than this are flushed to zero to avoid NaNs.
|
||||
|
||||
template<int D, int parallel_blocks> // D == head size
|
||||
#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))
|
||||
__launch_bounds__(D, 1)
|
||||
#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))
|
||||
static __global__ void flash_attn_combine_results(
|
||||
const float * __restrict__ VKQ_parts,
|
||||
const float2 * __restrict__ VKQ_meta,
|
||||
float * __restrict__ dst) {
|
||||
VKQ_parts += parallel_blocks*D * gridDim.y*blockIdx.x;
|
||||
VKQ_meta += parallel_blocks * gridDim.y*blockIdx.x;
|
||||
dst += D * gridDim.y*blockIdx.x;
|
||||
|
||||
const int tid = threadIdx.x;
|
||||
__builtin_assume(tid < D);
|
||||
|
||||
__shared__ float2 meta[parallel_blocks];
|
||||
if (tid < 2*parallel_blocks) {
|
||||
((float *) meta)[threadIdx.x] = ((const float *)VKQ_meta) [blockIdx.y*(2*parallel_blocks) + tid];
|
||||
}
|
||||
|
||||
__syncthreads();
|
||||
|
||||
float kqmax = meta[0].x;
|
||||
#pragma unroll
|
||||
for (int l = 1; l < parallel_blocks; ++l) {
|
||||
kqmax = max(kqmax, meta[l].x);
|
||||
}
|
||||
|
||||
float VKQ_numerator = 0.0f;
|
||||
float VKQ_denominator = 0.0f;
|
||||
#pragma unroll
|
||||
for (int l = 0; l < parallel_blocks; ++l) {
|
||||
const float diff = meta[l].x - kqmax;
|
||||
const float KQ_max_scale = expf(diff);
|
||||
const uint32_t ftz_mask = 0xFFFFFFFF * (diff > SOFTMAX_FTZ_THRESHOLD);
|
||||
*((uint32_t *) &KQ_max_scale) &= ftz_mask;
|
||||
|
||||
VKQ_numerator += KQ_max_scale * VKQ_parts[l*gridDim.y*D + blockIdx.y*D + tid];
|
||||
VKQ_denominator += KQ_max_scale * meta[l].y;
|
||||
}
|
||||
|
||||
dst[blockIdx.y*D + tid] = VKQ_numerator / VKQ_denominator;
|
||||
}
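flash_attn_combine_results merges the per-block partial results with the usual running-max rescaling, so no exponential is ever taken of a large positive number. A minimal host-side C++ sketch of the same arithmetic for a single output element (illustration only, not part of this commit; the flush-to-zero masking of tiny scales is omitted):

// Illustration only (not part of this commit): merge partial softmax results
// (block-local max, numerator, denominator) the same way the combine kernel
// does, but on the host for a single output value.
#include <cmath>
#include <cstdio>
#include <vector>

struct Partial {
    float kqmax;       // block-local maximum of the KQ scores
    float numerator;   // sum of exp(score - kqmax) * V for that block
    float denominator; // sum of exp(score - kqmax) for that block
};

static float combine(const std::vector<Partial> & parts) {
    float kqmax = parts[0].kqmax;
    for (const Partial & p : parts) {
        kqmax = std::fmax(kqmax, p.kqmax);
    }
    float num = 0.0f;
    float den = 0.0f;
    for (const Partial & p : parts) {
        const float scale = std::exp(p.kqmax - kqmax); // <= 1, never overflows
        num += scale * p.numerator;
        den += scale * p.denominator;
    }
    return num / den;
}

int main() {
    std::vector<Partial> parts = {{-1.0f, 0.5f, 1.2f}, {2.0f, 3.0f, 1.8f}};
    printf("combined = %f\n", combine(parts));
    return 0;
}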
|
430 ggml-cuda/fattn-vec-f16.cu Normal file
|
@ -0,0 +1,430 @@
|
|||
#include "common.cuh"
|
||||
#include "fattn-common.cuh"
|
||||
#include "fattn-vec-f16.cuh"
|
||||
|
||||
template<int D, int ncols, int parallel_blocks> // D == head size
|
||||
#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))
|
||||
__launch_bounds__(D, 1)
|
||||
#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))
|
||||
static __global__ void flash_attn_vec_ext_f16(
|
||||
const char * __restrict__ Q,
|
||||
const char * __restrict__ K,
|
||||
const char * __restrict__ V,
|
||||
const char * __restrict__ mask,
|
||||
float * __restrict__ dst,
|
||||
float2 * __restrict__ dst_meta,
|
||||
const float scale,
|
||||
const float max_bias,
|
||||
const float m0,
|
||||
const float m1,
|
||||
const uint32_t n_head_log2,
|
||||
const int ne00,
|
||||
const int ne01,
|
||||
const int ne02,
|
||||
const int ne03,
|
||||
const int ne10,
|
||||
const int ne11,
|
||||
const int ne12,
|
||||
const int ne13,
|
||||
const int ne31,
|
||||
const int nb31,
|
||||
const int nb01,
|
||||
const int nb02,
|
||||
const int nb03,
|
||||
const int nb11,
|
||||
const int nb12,
|
||||
const int nb13,
|
||||
const int ne0,
|
||||
const int ne1,
|
||||
const int ne2,
|
||||
const int ne3) {
|
||||
#if FP16_AVAILABLE
|
||||
//In this kernel Q, K, V are matrices while i, j, k are matrix indices.
|
||||
|
||||
const int ic0 = (blockIdx.x / parallel_blocks) * ncols; // Index of the Q/QKV column to work on.
|
||||
const int ip = blockIdx.x % parallel_blocks; // Index in group of blocks running for the same column in parallel.
|
||||
|
||||
const int gqa_ratio = ne02 / ne12; // With grouped query attention there are > 1 Q matrices per K, V matrix.
|
||||
const float2 * Q_f2 = (const float2 *) (Q + nb02* blockIdx.y + nb01*ic0);
|
||||
const half2 * K_h2 = (const half2 *) (K + nb12*(blockIdx.y / gqa_ratio));
|
||||
const half * V_h = (const half *) (V + nb12*(blockIdx.y / gqa_ratio)); // K and V have same shape
|
||||
const half * maskh = (const half *) mask + ne11*ic0;
|
||||
|
||||
const int stride_KV = nb11 / sizeof(half);
|
||||
const int stride_KV2 = nb11 / sizeof(half2);
|
||||
|
||||
half slopeh = __float2half(1.0f);
|
||||
|
||||
// ALiBi
|
||||
if (max_bias > 0.0f) {
|
||||
const int h = blockIdx.y;
|
||||
|
||||
const float base = h < n_head_log2 ? m0 : m1;
|
||||
const int exph = h < n_head_log2 ? h + 1 : 2*(h - n_head_log2) + 1;
|
||||
|
||||
slopeh = __float2half(powf(base, exph));
|
||||
}
|
||||
|
||||
static_assert(D % (2*WARP_SIZE) == 0, "D not divisible by 2*WARP_SIZE == 64.");
|
||||
constexpr int nwarps = D / WARP_SIZE;
|
||||
const int tid = WARP_SIZE*threadIdx.y + threadIdx.x;
|
||||
__builtin_assume(tid < D);
|
||||
|
||||
__shared__ half KQ[ncols*D];
|
||||
#pragma unroll
|
||||
for (int j = 0; j < ncols; ++j) {
|
||||
KQ[j*D + tid] = -HALF_MAX_HALF;
|
||||
}
|
||||
half2 * KQ2 = (half2 *) KQ;
|
||||
|
||||
half kqmax[ncols];
|
||||
#pragma unroll
|
||||
for (int j = 0; j < ncols; ++j) {
|
||||
kqmax[j] = -HALF_MAX_HALF;
|
||||
}
|
||||
half kqsum[ncols] = {0.0f};
|
||||
|
||||
__shared__ half kqmax_shared[ncols][WARP_SIZE];
|
||||
__shared__ half kqsum_shared[ncols][WARP_SIZE];
|
||||
#pragma unroll
|
||||
for (int j = 0; j < ncols; ++j) {
|
||||
if (threadIdx.y == 0) {
|
||||
kqmax_shared[j][threadIdx.x] = -HALF_MAX_HALF;
|
||||
kqsum_shared[j][threadIdx.x] = 0.0f;
|
||||
}
|
||||
}
|
||||
__syncthreads();
|
||||
|
||||
// Convert Q to half2 and store in registers:
|
||||
half2 Q_h2[ncols][D/(2*WARP_SIZE)];
|
||||
#pragma unroll
|
||||
for (int j = 0; j < ncols; ++j) {
|
||||
#pragma unroll
|
||||
for (int i0 = 0; i0 < D/2; i0 += WARP_SIZE) {
|
||||
const int i = i0 + threadIdx.x;
|
||||
|
||||
const float2 tmp = Q_f2[j*(nb01/sizeof(float2)) + i];
|
||||
Q_h2[j][i0/WARP_SIZE] = make_half2(scale, scale) * make_half2(tmp.x, tmp.y);
|
||||
}
|
||||
}
|
||||
|
||||
half2 VKQ[ncols] = {{0.0f, 0.0f}};
|
||||
|
||||
const int k_start = parallel_blocks == 1 ? 0 : ip*D;
|
||||
for (int k_VKQ_0 = k_start; k_VKQ_0 < ne11; k_VKQ_0 += parallel_blocks*D) {
|
||||
// Calculate KQ tile and keep track of new maximum KQ values:
|
||||
|
||||
// For unknown reasons using a half array of size 1 for kqmax_new causes a performance regression,
|
||||
// see https://github.com/ggerganov/llama.cpp/pull/7061 .
|
||||
// Therefore this variable is defined twice but only used once (so that the compiler can optimize out the unused variable).
|
||||
half kqmax_new = kqmax[0];
|
||||
half kqmax_new_arr[ncols];
|
||||
#pragma unroll
|
||||
for (int j = 0; j < ncols; ++j) {
|
||||
kqmax_new_arr[j] = kqmax[j];
|
||||
}
|
||||
|
||||
#pragma unroll
|
||||
for (int i_KQ_0 = 0; i_KQ_0 < D; i_KQ_0 += nwarps) {
|
||||
const int i_KQ = i_KQ_0 + threadIdx.y;
|
||||
|
||||
if ((i_KQ_0 + nwarps > D && i_KQ >= D) || (FATTN_KQ_STRIDE % D != 0 && k_VKQ_0 + i_KQ >= ne11)) {
|
||||
break;
|
||||
}
|
||||
|
||||
half2 sum2[ncols] = {{0.0f, 0.0f}};
|
||||
#pragma unroll
|
||||
for (int k_KQ_0 = 0; k_KQ_0 < D/2; k_KQ_0 += WARP_SIZE) {
|
||||
const int k_KQ = k_KQ_0 + threadIdx.x;
|
||||
|
||||
const half2 K_ik = K_h2[(k_VKQ_0 + i_KQ)*stride_KV2 + k_KQ];
|
||||
#pragma unroll
|
||||
for (int j = 0; j < ncols; ++j) {
|
||||
sum2[j] += K_ik * Q_h2[j][k_KQ_0/WARP_SIZE];
|
||||
}
|
||||
}
|
||||
|
||||
#pragma unroll
|
||||
for (int j = 0; j < ncols; ++j) {
|
||||
sum2[j] = warp_reduce_sum(sum2[j]);
|
||||
half sum = __low2half(sum2[j]) + __high2half(sum2[j]);
|
||||
sum += mask ? slopeh*maskh[j*ne11 + k_VKQ_0 + i_KQ] : __float2half(0.0f);
|
||||
|
||||
if (ncols == 1) {
|
||||
kqmax_new = ggml_cuda_hmax(kqmax_new, sum);
|
||||
} else {
|
||||
kqmax_new_arr[j] = ggml_cuda_hmax(kqmax_new_arr[j], sum);
|
||||
}
|
||||
|
||||
if (threadIdx.x == 0) {
|
||||
KQ[j*D + i_KQ] = sum;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#pragma unroll
|
||||
for (int j = 0; j < ncols; ++j) {
|
||||
half kqmax_new_j = ncols == 1 ? kqmax_new : kqmax_new_arr[j];
|
||||
|
||||
kqmax_new_j = warp_reduce_max(kqmax_new_j);
|
||||
if (threadIdx.x == 0) {
|
||||
kqmax_shared[j][threadIdx.y] = kqmax_new_j;
|
||||
}
|
||||
}
|
||||
|
||||
__syncthreads();
|
||||
|
||||
#pragma unroll
|
||||
for (int j = 0; j < ncols; ++j) {
|
||||
half kqmax_new_j = kqmax_shared[j][threadIdx.x];
|
||||
kqmax_new_j = warp_reduce_max(kqmax_new_j);
|
||||
|
||||
const half KQ_max_scale = hexp(kqmax[j] - kqmax_new_j);
|
||||
kqmax[j] = kqmax_new_j;
|
||||
|
||||
const half val = hexp(KQ[j*D + tid] - kqmax[j]);
|
||||
kqsum[j] = kqsum[j]*KQ_max_scale + val;
|
||||
KQ[j*D + tid] = val;
|
||||
|
||||
VKQ[j] *= __half2half2(KQ_max_scale);
|
||||
}
|
||||
|
||||
__syncthreads();
|
||||
|
||||
#pragma unroll
|
||||
for (int k0 = 0; k0 < D; k0 += 2) {
|
||||
if (FATTN_KQ_STRIDE % D != 0 && k_VKQ_0 + k0 >= ne11) {
|
||||
break;
|
||||
}
|
||||
|
||||
half2 V_k;
|
||||
reinterpret_cast<half&>(V_k.x) = V_h[(k_VKQ_0 + k0 + 0)*stride_KV + tid];
|
||||
reinterpret_cast<half&>(V_k.y) = V_h[(k_VKQ_0 + k0 + 1)*stride_KV + tid];
|
||||
#pragma unroll
|
||||
for (int j = 0; j < ncols; ++j) {
|
||||
VKQ[j] += V_k*KQ2[j*(D/2) + k0/2];
|
||||
}
|
||||
}
|
||||
|
||||
__syncthreads();
|
||||
}
|
||||
|
||||
#pragma unroll
|
||||
for (int j = 0; j < ncols; ++j) {
|
||||
kqsum[j] = warp_reduce_sum(kqsum[j]);
|
||||
if (threadIdx.x == 0) {
|
||||
kqsum_shared[j][threadIdx.y] = kqsum[j];
|
||||
}
|
||||
}
|
||||
|
||||
__syncthreads();
|
||||
|
||||
#pragma unroll
|
||||
for (int j_VKQ = 0; j_VKQ < ncols; ++j_VKQ) {
|
||||
kqsum[j_VKQ] = kqsum_shared[j_VKQ][threadIdx.x];
|
||||
kqsum[j_VKQ] = warp_reduce_sum(kqsum[j_VKQ]);
|
||||
|
||||
half dst_val = (__low2half(VKQ[j_VKQ]) + __high2half(VKQ[j_VKQ]));
|
||||
if (parallel_blocks == 1) {
|
||||
dst_val /= kqsum[j_VKQ];
|
||||
}
|
||||
const int j_dst = (ic0 + j_VKQ)*parallel_blocks + ip;
|
||||
dst[j_dst*D*gridDim.y + D*blockIdx.y + tid] = dst_val;
|
||||
}
|
||||
|
||||
if (parallel_blocks != 1 && tid != 0) {
|
||||
#pragma unroll
|
||||
for (int j = 0; j < ncols; ++j) {
|
||||
dst_meta[(ic0 + j)*gridDim.y*parallel_blocks + blockIdx.y*parallel_blocks + ip] = make_float2(kqmax[j], kqsum[j]);
|
||||
}
|
||||
}
|
||||
#else
|
||||
NO_DEVICE_CODE;
|
||||
#endif // FP16_AVAILABLE
|
||||
}
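The kernel above is a streaming ("online") softmax: for every KQ tile it updates a running maximum, rescales the running denominator kqsum and the running accumulator VKQ by exp(old_max - new_max), and only then adds the new tile's contributions. A scalar C++ sketch of that recurrence for one query row (illustration only, not part of this commit):

// Illustration only (not part of this commit): the online-softmax update used
// by the vector kernel, written for one query row and one V component on the CPU.
#include <cmath>
#include <cstddef>
#include <cstdio>
#include <vector>

int main() {
    const std::vector<float> scores = {0.3f, 2.1f, -0.7f, 1.4f}; // K·Q, already scaled
    const std::vector<float> values = {1.0f, 2.0f,  3.0f, 4.0f}; // one component of V

    float kqmax = -INFINITY; // running maximum of the scores
    float kqsum = 0.0f;      // running softmax denominator
    float vkq   = 0.0f;      // running numerator (V weighted by exp(score - kqmax))

    for (std::size_t i = 0; i < scores.size(); ++i) {
        const float kqmax_new = std::fmax(kqmax, scores[i]);
        const float scale     = std::exp(kqmax - kqmax_new);  // rescale the old partial sums
        const float p         = std::exp(scores[i] - kqmax_new);

        kqsum = kqsum*scale + p;
        vkq   = vkq  *scale + p*values[i];
        kqmax = kqmax_new;
    }
    printf("attention output component = %f\n", vkq / kqsum);
    return 0;
}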
|
||||
|
||||
template <int D, int cols_per_block, int parallel_blocks> void launch_fattn_vec_f16(
|
||||
const ggml_tensor * Q, const ggml_tensor * K, const ggml_tensor * V, ggml_tensor * KQV, const ggml_tensor * mask,
|
||||
ggml_cuda_pool & pool, cudaStream_t main_stream
|
||||
) {
|
||||
ggml_cuda_pool_alloc<float> dst_tmp(pool);
|
||||
ggml_cuda_pool_alloc<float2> dst_tmp_meta(pool);
|
||||
|
||||
if (parallel_blocks > 1) {
|
||||
dst_tmp.alloc(parallel_blocks*ggml_nelements(KQV));
|
||||
dst_tmp_meta.alloc(parallel_blocks*ggml_nrows(KQV));
|
||||
}
|
||||
|
||||
constexpr int nwarps = (D + WARP_SIZE - 1) / WARP_SIZE;
|
||||
const dim3 block_dim(WARP_SIZE, nwarps, 1);
|
||||
const dim3 blocks_num(parallel_blocks*((Q->ne[1] + cols_per_block - 1) / cols_per_block), Q->ne[2], Q->ne[3]);
|
||||
const int shmem = 0;
|
||||
|
||||
float scale = 1.0f;
|
||||
float max_bias = 0.0f;
|
||||
|
||||
memcpy(&scale, (float *) KQV->op_params + 0, sizeof(float));
|
||||
memcpy(&max_bias, (float *) KQV->op_params + 1, sizeof(float));
|
||||
|
||||
const uint32_t n_head = Q->ne[2];
|
||||
const uint32_t n_head_log2 = 1u << (uint32_t) floorf(log2f((float) n_head));
|
||||
|
||||
const float m0 = powf(2.0f, -(max_bias ) / n_head_log2);
|
||||
const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_head_log2);
|
||||
|
||||
flash_attn_vec_ext_f16<D, cols_per_block, parallel_blocks>
|
||||
<<<blocks_num, block_dim, shmem, main_stream>>> (
|
||||
(const char *) Q->data,
|
||||
(const char *) K->data,
|
||||
(const char *) V->data,
|
||||
mask ? ((const char *) mask->data) : nullptr,
|
||||
parallel_blocks == 1 ? (float *) KQV->data : dst_tmp.ptr, dst_tmp_meta.ptr,
|
||||
scale, max_bias, m0, m1, n_head_log2,
|
||||
Q->ne[0], Q->ne[1], Q->ne[2], Q->ne[3],
|
||||
K->ne[0], K->ne[1], K->ne[2], K->ne[3],
|
||||
mask ? mask->ne[1] : 0, mask ? mask->nb[1] : 0,
|
||||
Q->nb[1], Q->nb[2], Q->nb[3],
|
||||
K->nb[1], K->nb[2], K->nb[3],
|
||||
KQV->ne[0], KQV->ne[1], KQV->ne[2], KQV->ne[3]
|
||||
);
|
||||
CUDA_CHECK(cudaGetLastError());
|
||||
|
||||
if (parallel_blocks == 1) {
|
||||
return;
|
||||
}
|
||||
|
||||
const dim3 block_dim_combine(D, 1, 1);
|
||||
const dim3 blocks_num_combine(Q->ne[1], blocks_num.y, blocks_num.z);
|
||||
const int shmem_combine = 0;
|
||||
|
||||
flash_attn_combine_results<D, parallel_blocks>
|
||||
<<<blocks_num_combine, block_dim_combine, shmem_combine, main_stream>>>
|
||||
(dst_tmp.ptr, dst_tmp_meta.ptr, (float *) KQV->data);
|
||||
CUDA_CHECK(cudaGetLastError());
|
||||
}
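launch_fattn_vec_f16 derives the grid from cols_per_block and parallel_blocks and precomputes the ALiBi parameters m0, m1 and n_head_log2 on the host; inside the kernel each head then gets the slope base^exph. A host-side C++ sketch of that arithmetic with example sizes (illustration only, not part of this commit; the shape values are made up):

// Illustration only (not part of this commit): how the launch grid and the
// ALiBi slope parameters are derived on the host. All sizes are examples.
#include <cmath>
#include <cstdint>
#include <cstdio>

int main() {
    // hypothetical tensor shape: head size 128, 7 query columns, 32 heads, batch 1
    const int ne0 = 128, ne1 = 7, ne2 = 32, ne3 = 1;
    const int cols_per_block  = 4;
    const int parallel_blocks = 4;
    const int WARP_SIZE       = 32;

    const int nwarps   = (ne0 + WARP_SIZE - 1) / WARP_SIZE;  // block.y, so block has D threads
    const int blocks_x = parallel_blocks*((ne1 + cols_per_block - 1) / cols_per_block);
    printf("block = (%d, %d), grid = (%d, %d, %d)\n", WARP_SIZE, nwarps, blocks_x, ne2, ne3);

    // ALiBi: heads below n_head_log2 use base m0, the rest use m1
    const float    max_bias    = 8.0f;
    const uint32_t n_head_log2 = 1u << (uint32_t) std::floor(std::log2((float) ne2));
    const float m0 = std::pow(2.0f, -(max_bias       ) / n_head_log2);
    const float m1 = std::pow(2.0f, -(max_bias / 2.0f) / n_head_log2);
    for (int h = 0; h < 3; ++h) {
        const float base = h < (int) n_head_log2 ? m0 : m1;
        const int   exph = h < (int) n_head_log2 ? h + 1 : 2*(h - (int) n_head_log2) + 1;
        printf("head %d: slope = %f\n", h, std::pow(base, (float) exph));
    }
    return 0;
}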
|
||||
|
||||
void ggml_cuda_flash_attn_ext_vec_f16(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
|
||||
const ggml_tensor * Q = dst->src[0];
|
||||
const ggml_tensor * K = dst->src[1];
|
||||
const ggml_tensor * V = dst->src[2];
|
||||
|
||||
const ggml_tensor * mask = dst->src[3];
|
||||
|
||||
ggml_tensor * KQV = dst;
|
||||
|
||||
const int32_t precision = KQV->op_params[2];
|
||||
GGML_ASSERT(precision == GGML_PREC_DEFAULT);
|
||||
|
||||
constexpr int cols_per_block = 1;
|
||||
constexpr int parallel_blocks = 4;
|
||||
switch (Q->ne[0]) {
|
||||
case 64:
|
||||
launch_fattn_vec_f16< 64, cols_per_block, parallel_blocks>(Q, K, V, KQV, mask, ctx.pool(), ctx.stream());
|
||||
break;
|
||||
case 128:
|
||||
launch_fattn_vec_f16<128, cols_per_block, parallel_blocks>(Q, K, V, KQV, mask, ctx.pool(), ctx.stream());
|
||||
break;
|
||||
case 256:
|
||||
launch_fattn_vec_f16<256, cols_per_block, parallel_blocks>(Q, K, V, KQV, mask, ctx.pool(), ctx.stream());
|
||||
break;
|
||||
default:
|
||||
GGML_ASSERT(false);
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
void ggml_cuda_flash_attn_ext_vec_f16_no_mma(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
|
||||
const ggml_tensor * Q = dst->src[0];
|
||||
const ggml_tensor * K = dst->src[1];
|
||||
const ggml_tensor * V = dst->src[2];
|
||||
|
||||
const ggml_tensor * mask = dst->src[3];
|
||||
|
||||
ggml_tensor * KQV = dst;
|
||||
|
||||
const int32_t precision = KQV->op_params[2];
|
||||
GGML_ASSERT(precision == GGML_PREC_DEFAULT);
|
||||
GGML_ASSERT(Q->ne[0] == 64 || Q->ne[0] == 128 && "FlashAttention without tensor cores only supports head sizes 64 and 128.");
|
||||
|
||||
if (Q->ne[1] == 1) {
|
||||
constexpr int cols_per_block = 1;
|
||||
constexpr int parallel_blocks = 4;
|
||||
switch (Q->ne[0]) {
|
||||
case 64:
|
||||
launch_fattn_vec_f16< 64, cols_per_block, parallel_blocks>(Q, K, V, KQV, mask, ctx.pool(), ctx.stream());
|
||||
break;
|
||||
case 128:
|
||||
launch_fattn_vec_f16<128, cols_per_block, parallel_blocks>(Q, K, V, KQV, mask, ctx.pool(), ctx.stream());
|
||||
break;
|
||||
default:
|
||||
GGML_ASSERT(false);
|
||||
break;
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
if (Q->ne[1] == 2) {
|
||||
constexpr int cols_per_block = 2;
|
||||
constexpr int parallel_blocks = 4;
|
||||
switch (Q->ne[0]) {
|
||||
case 64:
|
||||
launch_fattn_vec_f16< 64, cols_per_block, parallel_blocks>(Q, K, V, KQV, mask, ctx.pool(), ctx.stream());
|
||||
break;
|
||||
case 128:
|
||||
launch_fattn_vec_f16<128, cols_per_block, parallel_blocks>(Q, K, V, KQV, mask, ctx.pool(), ctx.stream());
|
||||
break;
|
||||
default:
|
||||
GGML_ASSERT(false);
|
||||
break;
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
if (Q->ne[1] <= 4) {
|
||||
constexpr int cols_per_block = 4;
|
||||
constexpr int parallel_blocks = 4;
|
||||
switch (Q->ne[0]) {
|
||||
case 64:
|
||||
launch_fattn_vec_f16< 64, cols_per_block, parallel_blocks>(Q, K, V, KQV, mask, ctx.pool(), ctx.stream());
|
||||
break;
|
||||
case 128:
|
||||
launch_fattn_vec_f16<128, cols_per_block, parallel_blocks>(Q, K, V, KQV, mask, ctx.pool(), ctx.stream());
|
||||
break;
|
||||
default:
|
||||
GGML_ASSERT(false);
|
||||
break;
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
if (Q->ne[1] <= 8) {
|
||||
constexpr int cols_per_block = 8;
|
||||
constexpr int parallel_blocks = 4;
|
||||
switch (Q->ne[0]) {
|
||||
case 64:
|
||||
launch_fattn_vec_f16< 64, cols_per_block, parallel_blocks>(Q, K, V, KQV, mask, ctx.pool(), ctx.stream());
|
||||
break;
|
||||
case 128:
|
||||
launch_fattn_vec_f16<128, cols_per_block, parallel_blocks>(Q, K, V, KQV, mask, ctx.pool(), ctx.stream());
|
||||
break;
|
||||
default:
|
||||
GGML_ASSERT(false);
|
||||
break;
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
constexpr int cols_per_block = 8;
|
||||
constexpr int parallel_blocks = 1;
|
||||
switch (Q->ne[0]) {
|
||||
case 64:
|
||||
launch_fattn_vec_f16< 64, cols_per_block, parallel_blocks>(Q, K, V, KQV, mask, ctx.pool(), ctx.stream());
|
||||
break;
|
||||
case 128:
|
||||
launch_fattn_vec_f16<128, cols_per_block, parallel_blocks>(Q, K, V, KQV, mask, ctx.pool(), ctx.stream());
|
||||
break;
|
||||
default:
|
||||
GGML_ASSERT(false);
|
||||
break;
|
||||
}
|
||||
}
|
5 ggml-cuda/fattn-vec-f16.cuh Normal file
|
@ -0,0 +1,5 @@
|
|||
#include "common.cuh"
|
||||
|
||||
void ggml_cuda_flash_attn_ext_vec_f16(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
|
||||
|
||||
void ggml_cuda_flash_attn_ext_vec_f16_no_mma(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
|
384 ggml-cuda/fattn-vec-f32.cu Normal file
|
@ -0,0 +1,384 @@
|
|||
#include "common.cuh"
|
||||
#include "fattn-common.cuh"
|
||||
#include "fattn-vec-f32.cuh"
|
||||
|
||||
template<int D, int ncols, int parallel_blocks> // D == head size
|
||||
#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))
|
||||
__launch_bounds__(D, 1)
|
||||
#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))
|
||||
static __global__ void flash_attn_vec_ext_f32(
|
||||
const char * __restrict__ Q,
|
||||
const char * __restrict__ K,
|
||||
const char * __restrict__ V,
|
||||
const char * __restrict__ mask,
|
||||
float * __restrict__ dst,
|
||||
float2 * __restrict__ dst_meta,
|
||||
const float scale,
|
||||
const float max_bias,
|
||||
const float m0,
|
||||
const float m1,
|
||||
const uint32_t n_head_log2,
|
||||
const int ne00,
|
||||
const int ne01,
|
||||
const int ne02,
|
||||
const int ne03,
|
||||
const int ne10,
|
||||
const int ne11,
|
||||
const int ne12,
|
||||
const int ne13,
|
||||
const int ne31,
|
||||
const int nb31,
|
||||
const int nb01,
|
||||
const int nb02,
|
||||
const int nb03,
|
||||
const int nb11,
|
||||
const int nb12,
|
||||
const int nb13,
|
||||
const int ne0,
|
||||
const int ne1,
|
||||
const int ne2,
|
||||
const int ne3) {
|
||||
//In this kernel Q, K, V are matrices while i, j, k are matrix indices.
|
||||
|
||||
const int ic0 = (blockIdx.x / parallel_blocks) * ncols; // Index of the Q/QKV column to work on.
|
||||
const int ip = blockIdx.x % parallel_blocks; // Index in group of blocks running for the same column in parallel.
|
||||
|
||||
const int gqa_ratio = ne02 / ne12; // With grouped query attention there are > 1 Q matrices per K, V matrix.
|
||||
const float2 * Q_f2 = (const float2 *) (Q + nb02* blockIdx.y + nb01*ic0);
|
||||
const half2 * K_h2 = (const half2 *) (K + nb12*(blockIdx.y / gqa_ratio));
|
||||
const half * V_h = (const half *) (V + nb12*(blockIdx.y / gqa_ratio)); // K and V have same shape
|
||||
const half * maskh = (const half *) mask + ne11*ic0;
|
||||
|
||||
const int stride_KV = nb11 / sizeof(half);
|
||||
const int stride_KV2 = nb11 / sizeof(half2);
|
||||
|
||||
float slope = 1.0f;
|
||||
|
||||
// ALiBi
|
||||
if (max_bias > 0.0f) {
|
||||
const int h = blockIdx.y;
|
||||
|
||||
const float base = h < n_head_log2 ? m0 : m1;
|
||||
const int exph = h < n_head_log2 ? h + 1 : 2*(h - n_head_log2) + 1;
|
||||
|
||||
slope = powf(base, exph);
|
||||
}
|
||||
|
||||
static_assert(D % (2*WARP_SIZE) == 0, "D not divisible by 2*WARP_SIZE == 64.");
|
||||
constexpr int nwarps = D / WARP_SIZE;
|
||||
const int tid = WARP_SIZE*threadIdx.y + threadIdx.x;
|
||||
__builtin_assume(tid < D);
|
||||
|
||||
__shared__ float KQ[ncols*D];
|
||||
#pragma unroll
|
||||
for (int j = 0; j < ncols; ++j) {
|
||||
KQ[j*D + tid] = -FLT_MAX/2.0f;
|
||||
}
|
||||
|
||||
float kqmax[ncols];
|
||||
#pragma unroll
|
||||
for (int j = 0; j < ncols; ++j) {
|
||||
kqmax[j] = -FLT_MAX/2.0f;
|
||||
}
|
||||
float kqsum[ncols] = {0.0f};
|
||||
|
||||
__shared__ float kqmax_shared[ncols][WARP_SIZE];
|
||||
__shared__ float kqsum_shared[ncols][WARP_SIZE];
|
||||
#pragma unroll
|
||||
for (int j = 0; j < ncols; ++j) {
|
||||
if (threadIdx.y == 0) {
|
||||
kqmax_shared[j][threadIdx.x] = -FLT_MAX/2.0f;
|
||||
kqsum_shared[j][threadIdx.x] = 0.0f;
|
||||
}
|
||||
}
|
||||
__syncthreads();
|
||||
|
||||
// Load Q, apply the scale and store it in registers as float2:
|
||||
float2 Q_h2[ncols][D/(2*WARP_SIZE)];
|
||||
#pragma unroll
|
||||
for (int j = 0; j < ncols; ++j) {
|
||||
#pragma unroll
|
||||
for (int i0 = 0; i0 < D/2; i0 += WARP_SIZE) {
|
||||
const int i = i0 + threadIdx.x;
|
||||
|
||||
Q_h2[j][i0/WARP_SIZE] = Q_f2[j*(nb01/sizeof(float2)) + i];
|
||||
Q_h2[j][i0/WARP_SIZE].x *= scale;
|
||||
Q_h2[j][i0/WARP_SIZE].y *= scale;
|
||||
}
|
||||
}
|
||||
|
||||
float VKQ[ncols] = {0.0f};
|
||||
|
||||
const int k_start = parallel_blocks == 1 ? 0 : ip*D;
|
||||
for (int k_VKQ_0 = k_start; k_VKQ_0 < ne11; k_VKQ_0 += parallel_blocks*D) {
|
||||
// Calculate KQ tile and keep track of new maximum KQ values:
|
||||
|
||||
float kqmax_new_arr[ncols];
|
||||
#pragma unroll
|
||||
for (int j = 0; j < ncols; ++j) {
|
||||
kqmax_new_arr[j] = kqmax[j];
|
||||
}
|
||||
|
||||
#pragma unroll
|
||||
for (int i_KQ_0 = 0; i_KQ_0 < D; i_KQ_0 += nwarps) {
|
||||
const int i_KQ = i_KQ_0 + threadIdx.y;
|
||||
|
||||
if ((i_KQ_0 + nwarps > D && i_KQ >= D) || (FATTN_KQ_STRIDE % D != 0 && k_VKQ_0 + i_KQ >= ne11)) {
|
||||
break;
|
||||
}
|
||||
|
||||
float sum[ncols] = {0.0f};
|
||||
#pragma unroll
|
||||
for (int k_KQ_0 = 0; k_KQ_0 < D/2; k_KQ_0 += WARP_SIZE) {
|
||||
const int k_KQ = k_KQ_0 + threadIdx.x;
|
||||
|
||||
const half2 K_ik = K_h2[(k_VKQ_0 + i_KQ)*stride_KV2 + k_KQ];
|
||||
#pragma unroll
|
||||
for (int j = 0; j < ncols; ++j) {
|
||||
sum[j] += __low2float(K_ik) * Q_h2[j][k_KQ_0/WARP_SIZE].x;
|
||||
sum[j] += __high2float(K_ik) * Q_h2[j][k_KQ_0/WARP_SIZE].y;
|
||||
}
|
||||
}
|
||||
|
||||
#pragma unroll
|
||||
for (int j = 0; j < ncols; ++j) {
|
||||
sum[j] = warp_reduce_sum(sum[j]);
|
||||
sum[j] += mask ? slope*__half2float(maskh[j*ne11 + k_VKQ_0 + i_KQ]) : 0.0f;
|
||||
|
||||
kqmax_new_arr[j] = fmaxf(kqmax_new_arr[j], sum[j]);
|
||||
|
||||
if (threadIdx.x == 0) {
|
||||
KQ[j*D + i_KQ] = sum[j];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#pragma unroll
|
||||
for (int j = 0; j < ncols; ++j) {
|
||||
float kqmax_new_j = kqmax_new_arr[j];
|
||||
|
||||
kqmax_new_j = warp_reduce_max(kqmax_new_j);
|
||||
if (threadIdx.x == 0) {
|
||||
kqmax_shared[j][threadIdx.y] = kqmax_new_j;
|
||||
}
|
||||
}
|
||||
|
||||
__syncthreads();
|
||||
|
||||
#pragma unroll
|
||||
for (int j = 0; j < ncols; ++j) {
|
||||
float kqmax_new_j = kqmax_shared[j][threadIdx.x];
|
||||
kqmax_new_j = warp_reduce_max(kqmax_new_j);
|
||||
|
||||
const float KQ_max_scale = expf(kqmax[j] - kqmax_new_j);
|
||||
kqmax[j] = kqmax_new_j;
|
||||
|
||||
const float val = expf(KQ[j*D + tid] - kqmax[j]);
|
||||
kqsum[j] = kqsum[j]*KQ_max_scale + val;
|
||||
KQ[j*D + tid] = val;
|
||||
|
||||
VKQ[j] *= KQ_max_scale;
|
||||
}
|
||||
|
||||
__syncthreads();
|
||||
|
||||
#pragma unroll
|
||||
for (int k = 0; k < D; ++k) {
|
||||
if (FATTN_KQ_STRIDE % D != 0 && k_VKQ_0 + k >= ne11) {
|
||||
break;
|
||||
}
|
||||
|
||||
const float V_ki = __half2float(V_h[(k_VKQ_0 + k)*stride_KV + tid]);
|
||||
#pragma unroll
|
||||
for (int j = 0; j < ncols; ++j) {
|
||||
VKQ[j] += V_ki*KQ[j*D + k];
|
||||
}
|
||||
}
|
||||
|
||||
__syncthreads();
|
||||
}
|
||||
|
||||
#pragma unroll
|
||||
for (int j = 0; j < ncols; ++j) {
|
||||
kqsum[j] = warp_reduce_sum(kqsum[j]);
|
||||
if (threadIdx.x == 0) {
|
||||
kqsum_shared[j][threadIdx.y] = kqsum[j];
|
||||
}
|
||||
}
|
||||
|
||||
__syncthreads();
|
||||
|
||||
#pragma unroll
|
||||
for (int j_VKQ = 0; j_VKQ < ncols; ++j_VKQ) {
|
||||
kqsum[j_VKQ] = kqsum_shared[j_VKQ][threadIdx.x];
|
||||
kqsum[j_VKQ] = warp_reduce_sum(kqsum[j_VKQ]);
|
||||
|
||||
float dst_val = VKQ[j_VKQ];
|
||||
if (parallel_blocks == 1) {
|
||||
dst_val /= kqsum[j_VKQ];
|
||||
}
|
||||
const int j_dst = (ic0 + j_VKQ)*parallel_blocks + ip;
|
||||
dst[j_dst*D*gridDim.y + D*blockIdx.y + tid] = dst_val;
|
||||
}
|
||||
|
||||
if (parallel_blocks != 1 && tid != 0) {
|
||||
#pragma unroll
|
||||
for (int j = 0; j < ncols; ++j) {
|
||||
dst_meta[(ic0 + j)*gridDim.y*parallel_blocks + blockIdx.y*parallel_blocks + ip] = make_float2(kqmax[j], kqsum[j]);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template <int D, int cols_per_block, int parallel_blocks> void launch_fattn_vec_f32(
|
||||
const ggml_tensor * Q, const ggml_tensor * K, const ggml_tensor * V, ggml_tensor * KQV, const ggml_tensor * mask,
|
||||
ggml_cuda_pool & pool, cudaStream_t main_stream
|
||||
) {
|
||||
ggml_cuda_pool_alloc<float> dst_tmp(pool);
|
||||
ggml_cuda_pool_alloc<float2> dst_tmp_meta(pool);
|
||||
|
||||
if (parallel_blocks > 1) {
|
||||
dst_tmp.alloc(parallel_blocks*ggml_nelements(KQV));
|
||||
dst_tmp_meta.alloc(parallel_blocks*ggml_nrows(KQV));
|
||||
}
|
||||
|
||||
constexpr int nwarps = (D + WARP_SIZE - 1) / WARP_SIZE;
|
||||
const dim3 block_dim(WARP_SIZE, nwarps, 1);
|
||||
const dim3 blocks_num(parallel_blocks*((Q->ne[1] + cols_per_block - 1) / cols_per_block), Q->ne[2], Q->ne[3]);
|
||||
const int shmem = 0;
|
||||
|
||||
float scale = 1.0f;
|
||||
float max_bias = 0.0f;
|
||||
|
||||
memcpy(&scale, (float *) KQV->op_params + 0, sizeof(float));
|
||||
memcpy(&max_bias, (float *) KQV->op_params + 1, sizeof(float));
|
||||
|
||||
const uint32_t n_head = Q->ne[2];
|
||||
const uint32_t n_head_log2 = 1u << (uint32_t) floorf(log2f((float) n_head));
|
||||
|
||||
const float m0 = powf(2.0f, -(max_bias ) / n_head_log2);
|
||||
const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_head_log2);
|
||||
|
||||
flash_attn_vec_ext_f32<D, cols_per_block, parallel_blocks>
|
||||
<<<blocks_num, block_dim, shmem, main_stream>>> (
|
||||
(const char *) Q->data,
|
||||
(const char *) K->data,
|
||||
(const char *) V->data,
|
||||
mask ? ((const char *) mask->data) : nullptr,
|
||||
parallel_blocks == 1 ? (float *) KQV->data : dst_tmp.ptr, dst_tmp_meta.ptr,
|
||||
scale, max_bias, m0, m1, n_head_log2,
|
||||
Q->ne[0], Q->ne[1], Q->ne[2], Q->ne[3],
|
||||
K->ne[0], K->ne[1], K->ne[2], K->ne[3],
|
||||
mask ? mask->ne[1] : 0, mask ? mask->nb[1] : 0,
|
||||
Q->nb[1], Q->nb[2], Q->nb[3],
|
||||
K->nb[1], K->nb[2], K->nb[3],
|
||||
KQV->ne[0], KQV->ne[1], KQV->ne[2], KQV->ne[3]
|
||||
);
|
||||
CUDA_CHECK(cudaGetLastError());
|
||||
|
||||
if (parallel_blocks == 1) {
|
||||
return;
|
||||
}
|
||||
|
||||
const dim3 block_dim_combine(D, 1, 1);
|
||||
const dim3 blocks_num_combine(Q->ne[1], blocks_num.y, blocks_num.z);
|
||||
const int shmem_combine = 0;
|
||||
|
||||
flash_attn_combine_results<D, parallel_blocks>
|
||||
<<<blocks_num_combine, block_dim_combine, shmem_combine, main_stream>>>
|
||||
(dst_tmp.ptr, dst_tmp_meta.ptr, (float *) KQV->data);
|
||||
CUDA_CHECK(cudaGetLastError());
|
||||
}
|
||||
|
||||
void ggml_cuda_flash_attn_ext_vec_f32(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
|
||||
const ggml_tensor * Q = dst->src[0];
|
||||
const ggml_tensor * K = dst->src[1];
|
||||
const ggml_tensor * V = dst->src[2];
|
||||
|
||||
const ggml_tensor * mask = dst->src[3];
|
||||
|
||||
ggml_tensor * KQV = dst;
|
||||
|
||||
GGML_ASSERT(Q->ne[0] == 64 || Q->ne[0] == 128 && "FlashAttention without tensor cores only supports head sizes 64 and 128.");
|
||||
|
||||
if (Q->ne[1] == 1) {
|
||||
constexpr int cols_per_block = 1;
|
||||
constexpr int parallel_blocks = 4;
|
||||
switch (Q->ne[0]) {
|
||||
case 64:
|
||||
launch_fattn_vec_f32< 64, cols_per_block, parallel_blocks>(Q, K, V, KQV, mask, ctx.pool(), ctx.stream());
|
||||
break;
|
||||
case 128:
|
||||
launch_fattn_vec_f32<128, cols_per_block, parallel_blocks>(Q, K, V, KQV, mask, ctx.pool(), ctx.stream());
|
||||
break;
|
||||
default:
|
||||
GGML_ASSERT(false);
|
||||
break;
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
if (Q->ne[1] == 2) {
|
||||
constexpr int cols_per_block = 2;
|
||||
constexpr int parallel_blocks = 4;
|
||||
switch (Q->ne[0]) {
|
||||
case 64:
|
||||
launch_fattn_vec_f32< 64, cols_per_block, parallel_blocks>(Q, K, V, KQV, mask, ctx.pool(), ctx.stream());
|
||||
break;
|
||||
case 128:
|
||||
launch_fattn_vec_f32<128, cols_per_block, parallel_blocks>(Q, K, V, KQV, mask, ctx.pool(), ctx.stream());
|
||||
break;
|
||||
default:
|
||||
GGML_ASSERT(false);
|
||||
break;
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
if (Q->ne[1] <= 4) {
|
||||
constexpr int cols_per_block = 4;
|
||||
constexpr int parallel_blocks = 4;
|
||||
switch (Q->ne[0]) {
|
||||
case 64:
|
||||
launch_fattn_vec_f32< 64, cols_per_block, parallel_blocks>(Q, K, V, KQV, mask, ctx.pool(), ctx.stream());
|
||||
break;
|
||||
case 128:
|
||||
launch_fattn_vec_f32<128, cols_per_block, parallel_blocks>(Q, K, V, KQV, mask, ctx.pool(), ctx.stream());
|
||||
break;
|
||||
default:
|
||||
GGML_ASSERT(false);
|
||||
break;
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
if (Q->ne[1] <= 8) {
|
||||
constexpr int cols_per_block = 8;
|
||||
constexpr int parallel_blocks = 4;
|
||||
switch (Q->ne[0]) {
|
||||
case 64:
|
||||
launch_fattn_vec_f32< 64, cols_per_block, parallel_blocks>(Q, K, V, KQV, mask, ctx.pool(), ctx.stream());
|
||||
break;
|
||||
case 128:
|
||||
launch_fattn_vec_f32<128, cols_per_block, parallel_blocks>(Q, K, V, KQV, mask, ctx.pool(), ctx.stream());
|
||||
break;
|
||||
default:
|
||||
GGML_ASSERT(false);
|
||||
break;
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
constexpr int cols_per_block = 8;
|
||||
constexpr int parallel_blocks = 1;
|
||||
switch (Q->ne[0]) {
|
||||
case 64:
|
||||
launch_fattn_vec_f32< 64, cols_per_block, parallel_blocks>(Q, K, V, KQV, mask, ctx.pool(), ctx.stream());
|
||||
break;
|
||||
case 128:
|
||||
launch_fattn_vec_f32<128, cols_per_block, parallel_blocks>(Q, K, V, KQV, mask, ctx.pool(), ctx.stream());
|
||||
break;
|
||||
default:
|
||||
GGML_ASSERT(false);
|
||||
break;
|
||||
}
|
||||
}
|
3 ggml-cuda/fattn-vec-f32.cuh Normal file
|
@ -0,0 +1,3 @@
|
|||
#include "common.cuh"
|
||||
|
||||
void ggml_cuda_flash_attn_ext_vec_f32(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
|
|
@ -1,4 +1,7 @@
|
|||
#include "common.cuh"
|
||||
#include "fattn-common.cuh"
|
||||
#include "fattn-vec-f16.cuh"
|
||||
#include "fattn-vec-f32.cuh"
|
||||
#include "fattn.cuh"
|
||||
|
||||
#include <cstdint>
|
||||
|
@ -7,251 +10,6 @@
|
|||
#include <mma.h>
|
||||
#endif
|
||||
|
||||
#define FATTN_KQ_STRIDE 256
|
||||
#define HALF_MAX_HALF __float2half(65504.0f/2) // Use neg. of this instead of -INFINITY to initialize KQ max vals to avoid NaN upon subtraction.
|
||||
#define SOFTMAX_FTZ_THRESHOLD -20.0f // Softmax exp. of values smaller than this are flushed to zero to avoid NaNs.
|
||||
|
||||
template<int D, int ncols, int parallel_blocks> // D == head size
|
||||
#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))
|
||||
__launch_bounds__(D, 1)
|
||||
#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))
|
||||
static __global__ void flash_attn_vec_ext_f16(
|
||||
const char * __restrict__ Q,
|
||||
const char * __restrict__ K,
|
||||
const char * __restrict__ V,
|
||||
const char * __restrict__ mask,
|
||||
float * __restrict__ dst,
|
||||
float2 * __restrict__ dst_meta,
|
||||
const float scale,
|
||||
const float max_bias,
|
||||
const float m0,
|
||||
const float m1,
|
||||
const uint32_t n_head_log2,
|
||||
const int ne00,
|
||||
const int ne01,
|
||||
const int ne02,
|
||||
const int ne03,
|
||||
const int ne10,
|
||||
const int ne11,
|
||||
const int ne12,
|
||||
const int ne13,
|
||||
const int ne31,
|
||||
const int nb31,
|
||||
const int nb01,
|
||||
const int nb02,
|
||||
const int nb03,
|
||||
const int nb11,
|
||||
const int nb12,
|
||||
const int nb13,
|
||||
const int ne0,
|
||||
const int ne1,
|
||||
const int ne2,
|
||||
const int ne3) {
|
||||
#if FP16_AVAILABLE
|
||||
//In this kernel Q, K, V are matrices while i, j, k are matrix indices.
|
||||
|
||||
const int ic0 = (blockIdx.x / parallel_blocks) * ncols; // Index of the Q/QKV column to work on.
|
||||
const int ip = blockIdx.x % parallel_blocks; // Index in group of blocks running for the same column in parallel.
|
||||
|
||||
const int gqa_ratio = ne02 / ne12; // With grouped query attention there are > 1 Q matrices per K, V matrix.
|
||||
const float2 * Q_f2 = (const float2 *) (Q + nb02* blockIdx.y + nb01*ic0);
|
||||
const half2 * K_h2 = (const half2 *) (K + nb12*(blockIdx.y / gqa_ratio));
|
||||
const half * V_h = (const half *) (V + nb12*(blockIdx.y / gqa_ratio)); // K and V have same shape
|
||||
const half * maskh = (const half *) mask + ne11*ic0;
|
||||
|
||||
const int stride_KV = nb11 / sizeof(half);
|
||||
const int stride_KV2 = nb11 / sizeof(half2);
|
||||
|
||||
half slopeh = __float2half(1.0f);
|
||||
|
||||
// ALiBi
|
||||
if (max_bias > 0.0f) {
|
||||
const int h = blockIdx.y;
|
||||
|
||||
const float base = h < n_head_log2 ? m0 : m1;
|
||||
const int exph = h < n_head_log2 ? h + 1 : 2*(h - n_head_log2) + 1;
|
||||
|
||||
slopeh = __float2half(powf(base, exph));
|
||||
}
|
||||
|
||||
static_assert(D % (2*WARP_SIZE) == 0, "D not divisible by 2*WARP_SIZE == 64.");
|
||||
constexpr int nwarps = D / WARP_SIZE;
|
||||
const int tid = WARP_SIZE*threadIdx.y + threadIdx.x;
|
||||
__builtin_assume(tid < D);
|
||||
|
||||
__shared__ half KQ[ncols*D];
|
||||
#pragma unroll
|
||||
for (int j = 0; j < ncols; ++j) {
|
||||
KQ[j*D + tid] = -HALF_MAX_HALF;
|
||||
}
|
||||
half2 * KQ2 = (half2 *) KQ;
|
||||
|
||||
half kqmax[ncols];
|
||||
#pragma unroll
|
||||
for (int j = 0; j < ncols; ++j) {
|
||||
kqmax[j] = -HALF_MAX_HALF;
|
||||
}
|
||||
half kqsum[ncols] = {0.0f};
|
||||
|
||||
__shared__ half kqmax_shared[ncols][WARP_SIZE];
|
||||
__shared__ half kqsum_shared[ncols][WARP_SIZE];
|
||||
#pragma unroll
|
||||
for (int j = 0; j < ncols; ++j) {
|
||||
if (threadIdx.y == 0) {
|
||||
kqmax_shared[j][threadIdx.x] = -HALF_MAX_HALF;
|
||||
kqsum_shared[j][threadIdx.x] = 0.0f;
|
||||
}
|
||||
}
|
||||
__syncthreads();
|
||||
|
||||
// Convert Q to half2 and store in registers:
|
||||
half2 Q_h2[ncols][D/(2*WARP_SIZE)];
|
||||
#pragma unroll
|
||||
for (int j = 0; j < ncols; ++j) {
|
||||
#pragma unroll
|
||||
for (int i0 = 0; i0 < D/2; i0 += WARP_SIZE) {
|
||||
const int i = i0 + threadIdx.x;
|
||||
|
||||
const float2 tmp = Q_f2[j*(nb01/sizeof(float2)) + i];
|
||||
Q_h2[j][i0/WARP_SIZE] = make_half2(scale, scale) * make_half2(tmp.x, tmp.y);
|
||||
}
|
||||
}
|
||||
|
||||
half2 VKQ[ncols] = {{0.0f, 0.0f}};
|
||||
|
||||
const int k_start = parallel_blocks == 1 ? 0 : ip*D;
|
||||
for (int k_VKQ_0 = k_start; k_VKQ_0 < ne11; k_VKQ_0 += parallel_blocks*D) {
|
||||
// Calculate KQ tile and keep track of new maximum KQ values:
|
||||
|
||||
// For unknown reasons using a half array of size 1 for kqmax_new causes a performance regression,
|
||||
// see https://github.com/ggerganov/llama.cpp/pull/7061 .
|
||||
// Therefore this variable is defined twice but only used once (so that the compiler can optimize out the unused variable).
|
||||
half kqmax_new = kqmax[0];
|
||||
half kqmax_new_arr[ncols];
|
||||
#pragma unroll
|
||||
for (int j = 0; j < ncols; ++j) {
|
||||
kqmax_new_arr[j] = kqmax[j];
|
||||
}
|
||||
|
||||
#pragma unroll
|
||||
for (int i_KQ_0 = 0; i_KQ_0 < D; i_KQ_0 += nwarps) {
|
||||
const int i_KQ = i_KQ_0 + threadIdx.y;
|
||||
|
||||
if ((i_KQ_0 + nwarps > D && i_KQ >= D) || (FATTN_KQ_STRIDE % D != 0 && k_VKQ_0 + i_KQ >= ne11)) {
|
||||
break;
|
||||
}
|
||||
|
||||
half2 sum2[ncols] = {{0.0f, 0.0f}};
|
||||
#pragma unroll
|
||||
for (int k_KQ_0 = 0; k_KQ_0 < D/2; k_KQ_0 += WARP_SIZE) {
|
||||
const int k_KQ = k_KQ_0 + threadIdx.x;
|
||||
|
||||
const half2 K_ik = K_h2[(k_VKQ_0 + i_KQ)*stride_KV2 + k_KQ];
|
||||
#pragma unroll
|
||||
for (int j = 0; j < ncols; ++j) {
|
||||
sum2[j] += K_ik * Q_h2[j][k_KQ_0/WARP_SIZE];
|
||||
}
|
||||
}
|
||||
|
||||
#pragma unroll
|
||||
for (int j = 0; j < ncols; ++j) {
|
||||
sum2[j] = warp_reduce_sum(sum2[j]);
|
||||
half sum = __low2half(sum2[j]) + __high2half(sum2[j]);
|
||||
sum += mask ? slopeh*maskh[j*ne11 + k_VKQ_0 + i_KQ] : __float2half(0.0f);
|
||||
|
||||
if (ncols == 1) {
|
||||
kqmax_new = ggml_cuda_hmax(kqmax_new, sum);
|
||||
} else {
|
||||
kqmax_new_arr[j] = ggml_cuda_hmax(kqmax_new_arr[j], sum);
|
||||
}
|
||||
|
||||
if (threadIdx.x == 0) {
|
||||
KQ[j*D + i_KQ] = sum;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#pragma unroll
|
||||
for (int j = 0; j < ncols; ++j) {
|
||||
half kqmax_new_j = ncols == 1 ? kqmax_new : kqmax_new_arr[j];
|
||||
|
||||
kqmax_new_j = warp_reduce_max(kqmax_new_j);
|
||||
if (threadIdx.x == 0) {
|
||||
kqmax_shared[j][threadIdx.y] = kqmax_new_j;
|
||||
}
|
||||
}
|
||||
|
||||
__syncthreads();
|
||||
|
||||
#pragma unroll
|
||||
for (int j = 0; j < ncols; ++j) {
|
||||
half kqmax_new_j = kqmax_shared[j][threadIdx.x];
|
||||
kqmax_new_j = warp_reduce_max(kqmax_new_j);
|
||||
|
||||
const half KQ_max_scale = hexp(kqmax[j] - kqmax_new_j);
|
||||
kqmax[j] = kqmax_new_j;
|
||||
|
||||
const half val = hexp(KQ[j*D + tid] - kqmax[j]);
|
||||
kqsum[j] = kqsum[j]*KQ_max_scale + val;
|
||||
KQ[j*D + tid] = val;
|
||||
|
||||
VKQ[j] *= __half2half2(KQ_max_scale);
|
||||
}
|
||||
|
||||
__syncthreads();
|
||||
|
||||
#pragma unroll
|
||||
for (int k0 = 0; k0 < D; k0 += 2) {
|
||||
if (FATTN_KQ_STRIDE % D != 0 && k_VKQ_0 + k0 >= ne11) {
|
||||
break;
|
||||
}
|
||||
|
||||
half2 V_k;
|
||||
reinterpret_cast<half&>(V_k.x) = V_h[(k_VKQ_0 + k0 + 0)*stride_KV + tid];
|
||||
reinterpret_cast<half&>(V_k.y) = V_h[(k_VKQ_0 + k0 + 1)*stride_KV + tid];
|
||||
#pragma unroll
|
||||
for (int j = 0; j < ncols; ++j) {
|
||||
VKQ[j] += V_k*KQ2[j*(D/2) + k0/2];
|
||||
}
|
||||
}
|
||||
|
||||
__syncthreads();
|
||||
}
|
||||
|
||||
#pragma unroll
|
||||
for (int j = 0; j < ncols; ++j) {
|
||||
kqsum[j] = warp_reduce_sum(kqsum[j]);
|
||||
if (threadIdx.x == 0) {
|
||||
kqsum_shared[j][threadIdx.y] = kqsum[j];
|
||||
}
|
||||
}
|
||||
|
||||
__syncthreads();
|
||||
|
||||
#pragma unroll
|
||||
for (int j_VKQ = 0; j_VKQ < ncols; ++j_VKQ) {
|
||||
kqsum[j_VKQ] = kqsum_shared[j_VKQ][threadIdx.x];
|
||||
kqsum[j_VKQ] = warp_reduce_sum(kqsum[j_VKQ]);
|
||||
|
||||
half dst_val = (__low2half(VKQ[j_VKQ]) + __high2half(VKQ[j_VKQ]));
|
||||
if (parallel_blocks == 1) {
|
||||
dst_val /= kqsum[j_VKQ];
|
||||
}
|
||||
const int j_dst = (ic0 + j_VKQ)*parallel_blocks + ip;
|
||||
dst[j_dst*D*gridDim.y + D*blockIdx.y + tid] = dst_val;
|
||||
}
|
||||
|
||||
if (parallel_blocks != 1 && tid != 0) {
|
||||
#pragma unroll
|
||||
for (int j = 0; j < ncols; ++j) {
|
||||
dst_meta[(ic0 + j)*gridDim.y*parallel_blocks + blockIdx.y*parallel_blocks + ip] = make_float2(kqmax[j], kqsum[j]);
|
||||
}
|
||||
}
|
||||
#else
|
||||
NO_DEVICE_CODE;
|
||||
#endif // FP16_AVAILABLE
|
||||
}
|
||||
|
||||
// D == head size, VKQ_stride == num VKQ rows calculated in parallel:
|
||||
template<int D, int ncols, int nwarps, int VKQ_stride, int parallel_blocks, typename KQ_acc_t>
|
||||
#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))
|
||||
|
@ -655,54 +413,6 @@ static __global__ void flash_attn_ext_f16(
|
|||
#endif // FP16_MMA_AVAILABLE
|
||||
}
|
||||
|
||||
template<int D, int parallel_blocks> // D == head size
|
||||
#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))
|
||||
__launch_bounds__(D, 1)
|
||||
#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))
|
||||
static __global__ void flash_attn_combine_results(
|
||||
const float * __restrict__ VKQ_parts,
|
||||
const float2 * __restrict__ VKQ_meta,
|
||||
float * __restrict__ dst) {
|
||||
#if FP16_AVAILABLE
|
||||
VKQ_parts += parallel_blocks*D * gridDim.y*blockIdx.x;
|
||||
VKQ_meta += parallel_blocks * gridDim.y*blockIdx.x;
|
||||
dst += D * gridDim.y*blockIdx.x;
|
||||
|
||||
const int tid = threadIdx.x;
|
||||
__builtin_assume(tid < D);
|
||||
|
||||
__shared__ float2 meta[parallel_blocks];
|
||||
if (tid < 2*parallel_blocks) {
|
||||
((float *) meta)[threadIdx.x] = ((const float *)VKQ_meta) [blockIdx.y*(2*parallel_blocks) + tid];
|
||||
}
|
||||
|
||||
__syncthreads();
|
||||
|
||||
float kqmax = meta[0].x;
|
||||
#pragma unroll
|
||||
for (int l = 1; l < parallel_blocks; ++l) {
|
||||
kqmax = max(kqmax, meta[l].x);
|
||||
}
|
||||
|
||||
float VKQ_numerator = 0.0f;
|
||||
float VKQ_denominator = 0.0f;
|
||||
#pragma unroll
|
||||
for (int l = 0; l < parallel_blocks; ++l) {
|
||||
const float diff = meta[l].x - kqmax;
|
||||
const float KQ_max_scale = expf(diff);
|
||||
const uint32_t ftz_mask = 0xFFFFFFFF * (diff > SOFTMAX_FTZ_THRESHOLD);
|
||||
*((uint32_t *) &KQ_max_scale) &= ftz_mask;
|
||||
|
||||
VKQ_numerator += KQ_max_scale * VKQ_parts[l*gridDim.y*D + blockIdx.y*D + tid];
|
||||
VKQ_denominator += KQ_max_scale * meta[l].y;
|
||||
}
|
||||
|
||||
dst[blockIdx.y*D + tid] = VKQ_numerator / VKQ_denominator;
|
||||
#else
|
||||
NO_DEVICE_CODE;
|
||||
#endif // FP16_AVAILABLE
|
||||
}
|
||||
|
||||
constexpr int get_max_power_of_2(int x) {
|
||||
return x % 2 == 0 ? 2*get_max_power_of_2(x/2) : 1;
|
||||
}
|
||||
|
@ -727,66 +437,6 @@ static_assert(get_VKQ_stride( 80, 1, 16) == 16, "Test failed.");
|
|||
static_assert(get_VKQ_stride( 80, 2, 16) == 16, "Test failed.");
|
||||
static_assert(get_VKQ_stride( 80, 4, 16) == 16, "Test failed.");
|
||||
|
||||
template <int D, int cols_per_block, int parallel_blocks> void launch_fattn_vec_f16(
|
||||
const ggml_tensor * Q, const ggml_tensor * K, const ggml_tensor * V, ggml_tensor * KQV, const ggml_tensor * mask,
|
||||
ggml_cuda_pool & pool, cudaStream_t main_stream
|
||||
) {
|
||||
ggml_cuda_pool_alloc<float> dst_tmp(pool);
|
||||
ggml_cuda_pool_alloc<float2> dst_tmp_meta(pool);
|
||||
|
||||
if (parallel_blocks > 1) {
|
||||
dst_tmp.alloc(parallel_blocks*ggml_nelements(KQV));
|
||||
dst_tmp_meta.alloc(parallel_blocks*ggml_nrows(KQV));
|
||||
}
|
||||
|
||||
constexpr int nwarps = (D + WARP_SIZE - 1) / WARP_SIZE;
|
||||
const dim3 block_dim(WARP_SIZE, nwarps, 1);
|
||||
const dim3 blocks_num(parallel_blocks*((Q->ne[1] + cols_per_block - 1) / cols_per_block), Q->ne[2], Q->ne[3]);
|
||||
const int shmem = 0;
|
||||
|
||||
float scale = 1.0f;
|
||||
float max_bias = 0.0f;
|
||||
|
||||
memcpy(&scale, (float *) KQV->op_params + 0, sizeof(float));
|
||||
memcpy(&max_bias, (float *) KQV->op_params + 1, sizeof(float));
|
||||
|
||||
const uint32_t n_head = Q->ne[2];
|
||||
const uint32_t n_head_log2 = 1u << (uint32_t) floorf(log2f((float) n_head));
|
||||
|
||||
const float m0 = powf(2.0f, -(max_bias ) / n_head_log2);
|
||||
const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_head_log2);
|
||||
|
||||
flash_attn_vec_ext_f16<D, cols_per_block, parallel_blocks>
|
||||
<<<blocks_num, block_dim, shmem, main_stream>>> (
|
||||
(const char *) Q->data,
|
||||
(const char *) K->data,
|
||||
(const char *) V->data,
|
||||
mask ? ((const char *) mask->data) : nullptr,
|
||||
parallel_blocks == 1 ? (float *) KQV->data : dst_tmp.ptr, dst_tmp_meta.ptr,
|
||||
scale, max_bias, m0, m1, n_head_log2,
|
||||
Q->ne[0], Q->ne[1], Q->ne[2], Q->ne[3],
|
||||
K->ne[0], K->ne[1], K->ne[2], K->ne[3],
|
||||
mask ? mask->ne[1] : 0, mask ? mask->nb[1] : 0,
|
||||
Q->nb[1], Q->nb[2], Q->nb[3],
|
||||
K->nb[1], K->nb[2], K->nb[3],
|
||||
KQV->ne[0], KQV->ne[1], KQV->ne[2], KQV->ne[3]
|
||||
);
|
||||
CUDA_CHECK(cudaGetLastError());
|
||||
|
||||
if (parallel_blocks == 1) {
|
||||
return;
|
||||
}
|
||||
|
||||
const dim3 block_dim_combine(D, 1, 1);
|
||||
const dim3 blocks_num_combine(Q->ne[1], blocks_num.y, blocks_num.z);
|
||||
const int shmem_combine = 0;
|
||||
|
||||
flash_attn_combine_results<D, parallel_blocks>
|
||||
<<<blocks_num_combine, block_dim_combine, shmem_combine, main_stream>>>
|
||||
(dst_tmp.ptr, dst_tmp_meta.ptr, (float *) KQV->data);
|
||||
CUDA_CHECK(cudaGetLastError());
|
||||
}
|
||||
|
||||
template <int D, int cols_per_block, int nwarps, int parallel_blocks, typename KQ_acc_t> void launch_fattn_f16_impl(
|
||||
const ggml_tensor * Q, const ggml_tensor * K, const ggml_tensor * V, ggml_tensor * KQV, const ggml_tensor * mask,
|
||||
ggml_cuda_pool & pool, cudaStream_t main_stream
|
||||
|
@ -891,95 +541,22 @@ void ggml_cuda_flash_attn_ext(ggml_backend_cuda_context & ctx, ggml_tensor * dst
|
|||
|
||||
const int32_t precision = KQV->op_params[2];
|
||||
|
||||
if (!fast_fp16_available(cc)) {
|
||||
ggml_cuda_flash_attn_ext_vec_f32(ctx, dst);
|
||||
return;
|
||||
}
|
||||
|
||||
if (!fp16_mma_available(cc)) {
|
||||
GGML_ASSERT(precision == GGML_PREC_DEFAULT);
|
||||
GGML_ASSERT(Q->ne[0] == 64 || Q->ne[0] == 128 && "FlashAttention without tensor cores only supports head sizes 64 and 128.");
|
||||
|
||||
if (Q->ne[1] == 1) {
|
||||
constexpr int cols_per_block = 1;
|
||||
constexpr int parallel_blocks = 4;
|
||||
switch (Q->ne[0]) {
|
||||
case 64:
|
||||
launch_fattn_vec_f16< 64, cols_per_block, parallel_blocks>(Q, K, V, KQV, mask, ctx.pool(), ctx.stream());
|
||||
break;
|
||||
case 128:
|
||||
launch_fattn_vec_f16<128, cols_per_block, parallel_blocks>(Q, K, V, KQV, mask, ctx.pool(), ctx.stream());
|
||||
break;
|
||||
default:
|
||||
GGML_ASSERT(false);
|
||||
break;
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
if (Q->ne[1] == 2) {
|
||||
constexpr int cols_per_block = 2;
|
||||
constexpr int parallel_blocks = 4;
|
||||
switch (Q->ne[0]) {
|
||||
case 64:
|
||||
launch_fattn_vec_f16< 64, cols_per_block, parallel_blocks>(Q, K, V, KQV, mask, ctx.pool(), ctx.stream());
|
||||
break;
|
||||
case 128:
|
||||
launch_fattn_vec_f16<128, cols_per_block, parallel_blocks>(Q, K, V, KQV, mask, ctx.pool(), ctx.stream());
|
||||
break;
|
||||
default:
|
||||
GGML_ASSERT(false);
|
||||
break;
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
if (Q->ne[1] <= 4) {
|
||||
constexpr int cols_per_block = 4;
|
||||
constexpr int parallel_blocks = 4;
|
||||
switch (Q->ne[0]) {
|
||||
case 64:
|
||||
launch_fattn_vec_f16< 64, cols_per_block, parallel_blocks>(Q, K, V, KQV, mask, ctx.pool(), ctx.stream());
|
||||
break;
|
||||
case 128:
|
||||
launch_fattn_vec_f16<128, cols_per_block, parallel_blocks>(Q, K, V, KQV, mask, ctx.pool(), ctx.stream());
|
||||
break;
|
||||
default:
|
||||
GGML_ASSERT(false);
|
||||
break;
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
if (Q->ne[1] <= 8) {
|
||||
constexpr int cols_per_block = 8;
|
||||
constexpr int parallel_blocks = 4;
|
||||
switch (Q->ne[0]) {
|
||||
case 64:
|
||||
launch_fattn_vec_f16< 64, cols_per_block, parallel_blocks>(Q, K, V, KQV, mask, ctx.pool(), ctx.stream());
|
||||
break;
|
||||
case 128:
|
||||
launch_fattn_vec_f16<128, cols_per_block, parallel_blocks>(Q, K, V, KQV, mask, ctx.pool(), ctx.stream());
|
||||
break;
|
||||
default:
|
||||
GGML_ASSERT(false);
|
||||
break;
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
constexpr int cols_per_block = 8;
|
||||
constexpr int parallel_blocks = 1;
|
||||
switch (Q->ne[0]) {
|
||||
case 64:
|
||||
launch_fattn_vec_f16< 64, cols_per_block, parallel_blocks>(Q, K, V, KQV, mask, ctx.pool(), ctx.stream());
|
||||
break;
|
||||
case 128:
|
||||
launch_fattn_vec_f16<128, cols_per_block, parallel_blocks>(Q, K, V, KQV, mask, ctx.pool(), ctx.stream());
|
||||
break;
|
||||
default:
|
||||
GGML_ASSERT(false);
|
||||
break;
|
||||
}
|
||||
ggml_cuda_flash_attn_ext_vec_f16_no_mma(ctx, dst);
|
||||
return;
|
||||
}
|
||||
|
||||
if (precision != GGML_PREC_DEFAULT) {
|
||||
if (Q->ne[1] == 1 && (Q->ne[0] == 64 || Q->ne[0] == 128)) {
|
||||
ggml_cuda_flash_attn_ext_vec_f32(ctx, dst);
|
||||
return;
|
||||
}
|
||||
|
||||
if (Q->ne[1] <= 32 || Q->ne[0] > 128) {
|
||||
constexpr int cols_per_block = 16;
|
||||
constexpr int nwarps = 4;
|
||||
|
@ -1037,22 +614,7 @@ void ggml_cuda_flash_attn_ext(ggml_backend_cuda_context & ctx, ggml_tensor * dst
|
|||
}
|
||||
|
||||
if (Q->ne[1] == 1 && Q->ne[0] % (2*WARP_SIZE) == 0) {
|
||||
constexpr int cols_per_block = 1;
|
||||
constexpr int parallel_blocks = 4;
|
||||
switch (Q->ne[0]) {
|
||||
case 64:
|
||||
launch_fattn_vec_f16< 64, cols_per_block, parallel_blocks>(Q, K, V, KQV, mask, ctx.pool(), ctx.stream());
|
||||
break;
|
||||
case 128:
|
||||
launch_fattn_vec_f16<128, cols_per_block, parallel_blocks>(Q, K, V, KQV, mask, ctx.pool(), ctx.stream());
|
||||
break;
|
||||
case 256:
|
||||
launch_fattn_vec_f16<256, cols_per_block, parallel_blocks>(Q, K, V, KQV, mask, ctx.pool(), ctx.stream());
|
||||
break;
|
||||
default:
|
||||
GGML_ASSERT(false);
|
||||
break;
|
||||
}
|
||||
ggml_cuda_flash_attn_ext_vec_f16(ctx, dst);
|
||||
return;
|
||||
}
|
||||
|
||||
|
|
|
@ -48,6 +48,15 @@ static __global__ void relu_f32(const float * x, float * dst, const int k) {
|
|||
dst[i] = fmaxf(x[i], 0);
|
||||
}
|
||||
|
||||
static __global__ void sigmoid_f32(const float * x, float * dst, const int k) {
|
||||
const int i = blockDim.x*blockIdx.x + threadIdx.x;
|
||||
|
||||
if (i >= k) {
|
||||
return;
|
||||
}
|
||||
dst[i] = 1.0f / (1.0f + expf(-x[i]));
|
||||
}
|
||||
|
||||
static __global__ void hardsigmoid_f32(const float * x, float * dst, const int k) {
|
||||
const int i = blockDim.x*blockIdx.x + threadIdx.x;
|
||||
|
||||
|
@ -108,6 +117,11 @@ static void relu_f32_cuda(const float * x, float * dst, const int k, cudaStream_
|
|||
relu_f32<<<num_blocks, CUDA_RELU_BLOCK_SIZE, 0, stream>>>(x, dst, k);
|
||||
}
|
||||
|
||||
static void sigmoid_f32_cuda(const float * x, float * dst, const int k, cudaStream_t stream) {
|
||||
const int num_blocks = (k + CUDA_SIGMOID_BLOCK_SIZE - 1) / CUDA_SIGMOID_BLOCK_SIZE;
|
||||
sigmoid_f32<<<num_blocks, CUDA_SIGMOID_BLOCK_SIZE, 0, stream>>>(x, dst, k);
|
||||
}
|
||||
|
||||
static void hardsigmoid_f32_cuda(const float * x, float * dst, const int k, cudaStream_t stream) {
|
||||
const int num_blocks = (k + CUDA_HARDSIGMOID_BLOCK_SIZE - 1) / CUDA_HARDSIGMOID_BLOCK_SIZE;
|
||||
hardsigmoid_f32<<<num_blocks, CUDA_HARDSIGMOID_BLOCK_SIZE, 0, stream>>>(x, dst, k);
|
||||
|
@ -188,6 +202,18 @@ void ggml_cuda_op_relu(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
|
|||
relu_f32_cuda(src0_d, dst_d, ggml_nelements(src0), stream);
|
||||
}
|
||||
|
||||
void ggml_cuda_op_sigmoid(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
|
||||
const ggml_tensor * src0 = dst->src[0];
|
||||
const float * src0_d = (const float *)src0->data;
|
||||
float * dst_d = (float *)dst->data;
|
||||
cudaStream_t stream = ctx.stream();
|
||||
|
||||
GGML_ASSERT(src0->type == GGML_TYPE_F32);
|
||||
GGML_ASSERT( dst->type == GGML_TYPE_F32);
|
||||
|
||||
sigmoid_f32_cuda(src0_d, dst_d, ggml_nelements(src0), stream);
|
||||
}
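The elementwise launchers compute their grid as a ceiling division of the element count by the block size, so the final, partially filled block is still launched and the in-kernel `if (i >= k) return;` guard discards the excess threads. A tiny C++ sketch of that arithmetic (illustration only, not part of this commit):

// Illustration only (not part of this commit): the ceiling-division pattern
// used by the elementwise launchers such as sigmoid_f32_cuda.
#include <cstdio>

static int ceil_div(int n, int block) {
    return (n + block - 1) / block;
}

int main() {
    const int CUDA_SIGMOID_BLOCK_SIZE = 256;
    const int k = 1000000;                                  // number of tensor elements
    printf("%d blocks of %d threads cover %d elements\n",
           ceil_div(k, CUDA_SIGMOID_BLOCK_SIZE), CUDA_SIGMOID_BLOCK_SIZE, k); // 3907 blocks
    return 0;
}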
|
||||
|
||||
void ggml_cuda_op_hardsigmoid(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
|
||||
const ggml_tensor * src0 = dst->src[0];
|
||||
const float * src0_d = (const float *)src0->data;
|
||||
|
|
|
@ -4,6 +4,7 @@
|
|||
#define CUDA_SILU_BLOCK_SIZE 256
|
||||
#define CUDA_TANH_BLOCK_SIZE 256
|
||||
#define CUDA_RELU_BLOCK_SIZE 256
|
||||
#define CUDA_SIGMOID_BLOCK_SIZE 256
|
||||
#define CUDA_HARDSIGMOID_BLOCK_SIZE 256
|
||||
#define CUDA_HARDSWISH_BLOCK_SIZE 256
|
||||
#define CUDA_SQR_BLOCK_SIZE 256
|
||||
|
@ -18,6 +19,8 @@ void ggml_cuda_op_tanh(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
|
|||
|
||||
void ggml_cuda_op_relu(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
|
||||
|
||||
void ggml_cuda_op_sigmoid(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
|
||||
|
||||
void ggml_cuda_op_hardsigmoid(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
|
||||
|
||||
void ggml_cuda_op_hardswish(ggml_backend_cuda_context & ctx, ggml_tensor * dst);
43 ggml-metal.m
|
@ -40,6 +40,7 @@ enum ggml_metal_kernel_type {
|
|||
GGML_METAL_KERNEL_TYPE_CLAMP,
|
||||
GGML_METAL_KERNEL_TYPE_TANH,
|
||||
GGML_METAL_KERNEL_TYPE_RELU,
|
||||
GGML_METAL_KERNEL_TYPE_SIGMOID,
|
||||
GGML_METAL_KERNEL_TYPE_GELU,
|
||||
GGML_METAL_KERNEL_TYPE_GELU_4,
|
||||
GGML_METAL_KERNEL_TYPE_GELU_QUICK,
|
||||
|
@ -493,6 +494,7 @@ static struct ggml_metal_context * ggml_metal_init(int n_cb) {
|
|||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_CLAMP, clamp, true);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_TANH, tanh, true);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_RELU, relu, true);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_SIGMOID, sigmoid, true);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GELU, gelu, true);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GELU_4, gelu_4, true);
|
||||
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_GELU_QUICK, gelu_quick, true);
|
||||
|
@ -730,6 +732,7 @@ static bool ggml_metal_supports_op(const struct ggml_metal_context * ctx, const
|
|||
switch (ggml_get_unary_op(op)) {
|
||||
case GGML_UNARY_OP_TANH:
|
||||
case GGML_UNARY_OP_RELU:
|
||||
case GGML_UNARY_OP_SIGMOID:
|
||||
case GGML_UNARY_OP_GELU:
|
||||
case GGML_UNARY_OP_GELU_QUICK:
|
||||
case GGML_UNARY_OP_SILU:
|
||||
|
@@ -1192,24 +1195,24 @@ static enum ggml_status ggml_metal_graph_compute(
                        [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
                    } break;
                case GGML_OP_CLAMP:
                    {
                        id<MTLComputePipelineState> pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_CLAMP].pipeline;

                        float min;
                        float max;
                        memcpy(&min, ((int32_t *) dst->op_params) + 0, sizeof(float));
                        memcpy(&max, ((int32_t *) dst->op_params) + 1, sizeof(float));

                        [encoder setComputePipelineState:pipeline];
                        [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
                        [encoder setBuffer:id_dst offset:offs_dst atIndex:1];
                        [encoder setBytes:&min length:sizeof(min) atIndex:2];
                        [encoder setBytes:&max length:sizeof(max) atIndex:3];

                        const int64_t n = ggml_nelements(dst);

                        [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
                    } break;
                case GGML_OP_UNARY:
                    switch (ggml_get_unary_op(gf->nodes[i])) {
                        // we are not taking into account the strides, so for now require contiguous tensors

@@ -1237,6 +1240,18 @@ static enum ggml_status ggml_metal_graph_compute(
                                const int64_t n = ggml_nelements(dst);

                                [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
                            } break;
                        case GGML_UNARY_OP_SIGMOID:
                            {
                                id<MTLComputePipelineState> pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_SIGMOID].pipeline;

                                [encoder setComputePipelineState:pipeline];
                                [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
                                [encoder setBuffer:id_dst offset:offs_dst atIndex:1];

                                const int64_t n = ggml_nelements(dst);

                                [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
                            } break;
                        case GGML_UNARY_OP_GELU:
@@ -229,6 +229,13 @@ kernel void kernel_relu(
    dst[tpig] = max(0.0f, src0[tpig]);
}

kernel void kernel_sigmoid(
        device const float * src0,
        device float * dst,
        uint tpig[[thread_position_in_grid]]) {
    dst[tpig] = 1.0f / (1.0f + exp(-src0[tpig]));
}

kernel void kernel_tanh(
        device const float * src0,
        device float * dst,

@@ -2210,7 +2217,7 @@ kernel void kernel_flash_attn_ext_f16(
        // ALiBi
        if (max_bias > 0.0f) {
            const short h = iq2;
            const uint32_t h = iq2;

            const float base = h < n_head_log2 ? m0 : m1;
            const int exph = h < n_head_log2 ? h + 1 : 2*(h - n_head_log2) + 1;

@@ -2466,7 +2473,7 @@ kernel void kernel_flash_attn_ext_vec_f16(
        // ALiBi
        if (max_bias > 0.0f) {
            const short h = iq2;
            const uint32_t h = iq2;

            const float base = h < n_head_log2 ? m0 : m1;
            const int exp = h < n_head_log2 ? h + 1 : 2*(h - n_head_log2) + 1;
@@ -14,6 +14,12 @@
#include <stdlib.h> // for qsort
#include <stdio.h> // for GGML_ASSERT

#if defined(_MSC_VER)
// disable "possible loss of data" to avoid warnings for hundreds of casts
// we should just be careful :)
#pragma warning(disable: 4244 4267)
#endif

#define UNUSED GGML_UNUSED

// some compilers don't provide _mm256_set_m128i, e.g. gcc 7
78 ggml.c

@@ -4,7 +4,6 @@
#include "ggml-impl.h"
#include "ggml-quants.h"
#include "ggml.h"
#include "sgemm.h"

#if defined(_MSC_VER) || defined(__MINGW32__)
#include <malloc.h> // using malloc.h with MSC/MINGW

@@ -37,6 +36,10 @@
#undef GGML_USE_LLAMAFILE
#endif

#ifdef GGML_USE_LLAMAFILE
#include "sgemm.h"
#endif

#if defined(_MSC_VER)
// disable "possible loss of data" to avoid hundreds of casts
// we should just be careful :)

@@ -1949,6 +1952,7 @@ inline static void ggml_vec_tanh_f32 (const int n, float * y, const float * x) {
inline static void ggml_vec_elu_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? x[i] : expf(x[i])-1; }
inline static void ggml_vec_relu_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? x[i] : 0.f; }
inline static void ggml_vec_leaky_relu_f32 (const int n, float * y, const float * x, const float ns) { for (int i = 0; i < n; ++i) y[i] = ((x[i] > 0.f) ? x[i] : 0.f) + ns * ((x[i] < 0.0f) ? x[i] : 0.f); }
inline static void ggml_vec_sigmoid_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = 1.f / (1.f + expf(-x[i])); }
// TODO: optimize performance
inline static void ggml_vec_hardswish_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = x[i] * fminf(1.0f, fmaxf(0.0f, (x[i] + 3.0f) / 6.0f)); }
inline static void ggml_vec_hardsigmoid_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = fminf(1.0f, fmaxf(0.0f, (x[i] + 3.0f) / 6.0f)); }

@@ -2329,6 +2333,7 @@ static const char * GGML_UNARY_OP_NAME[GGML_UNARY_OP_COUNT] = {
    "TANH",
    "ELU",
    "RELU",
    "SIGMOID",
    "GELU",
    "GELU_QUICK",
    "SILU",

@@ -2336,7 +2341,7 @@ static const char * GGML_UNARY_OP_NAME[GGML_UNARY_OP_COUNT] = {
    "HARDSIGMOID",
};

static_assert(GGML_UNARY_OP_COUNT == 12, "GGML_UNARY_OP_COUNT != 12");
static_assert(GGML_UNARY_OP_COUNT == 13, "GGML_UNARY_OP_COUNT != 13");

static_assert(sizeof(struct ggml_object)%GGML_MEM_ALIGN == 0, "ggml_object size must be a multiple of GGML_MEM_ALIGN");

@@ -4561,6 +4566,20 @@ struct ggml_tensor * ggml_leaky_relu(
    return result;
}

// ggml_sigmoid

struct ggml_tensor * ggml_sigmoid(
        struct ggml_context * ctx,
        struct ggml_tensor * a) {
    return ggml_unary(ctx, a, GGML_UNARY_OP_SIGMOID);
}

struct ggml_tensor * ggml_sigmoid_inplace(
        struct ggml_context * ctx,
        struct ggml_tensor * a) {
    return ggml_unary_inplace(ctx, a, GGML_UNARY_OP_SIGMOID);
}

// ggml_gelu

struct ggml_tensor * ggml_gelu(

@@ -10852,6 +10871,52 @@ static void ggml_compute_forward_relu(
    }
}

// ggml_compute_forward_sigmoid

static void ggml_compute_forward_sigmoid_f32(
        const struct ggml_compute_params * params,
        struct ggml_tensor * dst) {

    const struct ggml_tensor * src0 = dst->src[0];

    assert(params->ith == 0);
    assert(ggml_are_same_shape(src0, dst));

    if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
        return;
    }

    const int n = ggml_nrows(src0);
    const int nc = src0->ne[0];

    assert(dst->nb[0] == sizeof(float));
    assert(src0->nb[0] == sizeof(float));

    for (int i = 0; i < n; i++) {
        ggml_vec_sigmoid_f32(nc,
                (float *) ((char *) dst->data + i*( dst->nb[1])),
                (float *) ((char *) src0->data + i*(src0->nb[1])));
    }
}

static void ggml_compute_forward_sigmoid(
        const struct ggml_compute_params * params,
        struct ggml_tensor * dst) {

    const struct ggml_tensor * src0 = dst->src[0];

    switch (src0->type) {
        case GGML_TYPE_F32:
            {
                ggml_compute_forward_sigmoid_f32(params, dst);
            } break;
        default:
            {
                GGML_ASSERT(false);
            } break;
    }
}

// ggml_compute_forward_gelu

static void ggml_compute_forward_gelu_f32(

@@ -16617,6 +16682,10 @@ static void ggml_compute_forward_unary(
            {
                ggml_compute_forward_relu(params, dst);
            } break;
        case GGML_UNARY_OP_SIGMOID:
            {
                ggml_compute_forward_sigmoid(params, dst);
            } break;
        case GGML_UNARY_OP_GELU:
            {
                ggml_compute_forward_gelu(params, dst);

@@ -18601,6 +18670,10 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
                        zero_table);
            }
        } break;
        case GGML_UNARY_OP_SIGMOID:
            {
                GGML_ASSERT(false); // TODO: not implemented
            } break;
        case GGML_UNARY_OP_GELU:
            {
                GGML_ASSERT(false); // TODO: not implemented

@@ -19130,6 +19203,7 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads, int n_cur_
        case GGML_UNARY_OP_TANH:
        case GGML_UNARY_OP_ELU:
        case GGML_UNARY_OP_RELU:
        case GGML_UNARY_OP_SIGMOID:
        case GGML_UNARY_OP_HARDSWISH: // to opt for multiple threads
        case GGML_UNARY_OP_HARDSIGMOID: // to opt for multiple threads
            {
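The new CPU path is just the elementwise logistic function. As a quick illustration (not part of the commit), a NumPy reference that mirrors ggml_vec_sigmoid_f32 and can be used to sanity-check the op's output; the helper name sigmoid_ref is hypothetical:

import numpy as np

def sigmoid_ref(x: np.ndarray) -> np.ndarray:
    # same formula as ggml_vec_sigmoid_f32: y = 1 / (1 + exp(-x))
    return 1.0 / (1.0 + np.exp(-x.astype(np.float32)))

print(sigmoid_ref(np.array([-2.0, 0.0, 2.0])))  # approx. [0.1192, 0.5, 0.8808]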
9 ggml.h

@@ -519,6 +519,7 @@ extern "C" {
        GGML_UNARY_OP_TANH,
        GGML_UNARY_OP_ELU,
        GGML_UNARY_OP_RELU,
        GGML_UNARY_OP_SIGMOID,
        GGML_UNARY_OP_GELU,
        GGML_UNARY_OP_GELU_QUICK,
        GGML_UNARY_OP_SILU,

@@ -1073,6 +1074,14 @@ extern "C" {
            struct ggml_context * ctx,
            struct ggml_tensor * a);

    GGML_API struct ggml_tensor * ggml_sigmoid(
            struct ggml_context * ctx,
            struct ggml_tensor * a);

    GGML_API struct ggml_tensor * ggml_sigmoid_inplace(
            struct ggml_context * ctx,
            struct ggml_tensor * a);

    GGML_API struct ggml_tensor * ggml_gelu(
            struct ggml_context * ctx,
            struct ggml_tensor * a);
@@ -1,4 +1,5 @@
from .constants import *
from .lazy import *
from .gguf_reader import *
from .gguf_writer import *
from .tensor_mapping import *
@@ -10,6 +10,7 @@ from typing import Any
GGUF_MAGIC = 0x46554747  # "GGUF"
GGUF_VERSION = 3
GGUF_DEFAULT_ALIGNMENT = 32
GGML_QUANT_VERSION = 2  # GGML_QNT_VERSION from ggml.h

#
# metadata keys

@@ -838,6 +839,49 @@ class GGMLQuantizationType(IntEnum):
    BF16 = 30


# TODO: add GGMLFileType from ggml_ftype in ggml.h


# from llama_ftype in llama.h
# ALL VALUES SHOULD BE THE SAME HERE AS THEY ARE OVER THERE.
class LlamaFileType(IntEnum):
    ALL_F32 = 0
    MOSTLY_F16 = 1            # except 1d tensors
    MOSTLY_Q4_0 = 2           # except 1d tensors
    MOSTLY_Q4_1 = 3           # except 1d tensors
    MOSTLY_Q4_1_SOME_F16 = 4  # tok_embeddings.weight and output.weight are F16
    # MOSTLY_Q4_2 = 5         # support has been removed
    # MOSTLY_Q4_3 = 6         # support has been removed
    MOSTLY_Q8_0 = 7           # except 1d tensors
    MOSTLY_Q5_0 = 8           # except 1d tensors
    MOSTLY_Q5_1 = 9           # except 1d tensors
    MOSTLY_Q2_K = 10          # except 1d tensors
    MOSTLY_Q3_K_S = 11        # except 1d tensors
    MOSTLY_Q3_K_M = 12        # except 1d tensors
    MOSTLY_Q3_K_L = 13        # except 1d tensors
    MOSTLY_Q4_K_S = 14        # except 1d tensors
    MOSTLY_Q4_K_M = 15        # except 1d tensors
    MOSTLY_Q5_K_S = 16        # except 1d tensors
    MOSTLY_Q5_K_M = 17        # except 1d tensors
    MOSTLY_Q6_K = 18          # except 1d tensors
    MOSTLY_IQ2_XXS = 19       # except 1d tensors
    MOSTLY_IQ2_XS = 20        # except 1d tensors
    MOSTLY_Q2_K_S = 21        # except 1d tensors
    MOSTLY_IQ3_XS = 22        # except 1d tensors
    MOSTLY_IQ3_XXS = 23       # except 1d tensors
    MOSTLY_IQ1_S = 24         # except 1d tensors
    MOSTLY_IQ4_NL = 25        # except 1d tensors
    MOSTLY_IQ3_S = 26         # except 1d tensors
    MOSTLY_IQ3_M = 27         # except 1d tensors
    MOSTLY_IQ2_S = 28         # except 1d tensors
    MOSTLY_IQ2_M = 29         # except 1d tensors
    MOSTLY_IQ4_XS = 30        # except 1d tensors
    MOSTLY_IQ1_M = 31         # except 1d tensors
    MOSTLY_BF16 = 32          # except 1d tensors

    GUESSED = 1024            # not specified in the model file


class GGUFEndian(IntEnum):
    LITTLE = 0
    BIG = 1
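LlamaFileType mirrors llama_ftype from llama.h, so Python-side tooling can refer to quantization targets by name instead of raw integers. A minimal sketch (not part of the commit), assuming the gguf-py package is importable, e.g. installed or on PYTHONPATH:

from gguf.constants import LlamaFileType

ftype = LlamaFileType.MOSTLY_Q8_0
print(int(ftype))        # 7; the integer convert scripts can store in the file-type metadata
print(LlamaFileType(7))  # LlamaFileType.MOSTLY_Q8_0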
@@ -7,7 +7,7 @@ import struct
import tempfile
from enum import Enum, auto
from io import BufferedWriter
from typing import IO, Any, Callable, Sequence, Mapping
from typing import IO, Any, Sequence, Mapping
from string import ascii_letters, digits

import numpy as np

@@ -28,47 +28,6 @@ from .constants import (
logger = logging.getLogger(__name__)


class LazyTensor:
    data: Callable[[], np.ndarray[Any, Any]]
    # to avoid too deep recursion
    functions: list[Callable[[np.ndarray[Any, Any]], np.ndarray[Any, Any]]]
    dtype: np.dtype[Any]
    shape: tuple[int, ...]

    def __init__(self, data: Callable[[], np.ndarray[Any, Any]], *, dtype: type, shape: tuple[int, ...]):
        self.data = data
        self.functions = []
        self.dtype = np.dtype(dtype)
        self.shape = shape

    def astype(self, dtype: type, **kwargs) -> LazyTensor:
        self.functions.append(lambda n: n.astype(dtype, **kwargs))
        self.dtype = np.dtype(dtype)
        return self

    @property
    def nbytes(self) -> int:
        size = 1
        for n in self.shape:
            size *= n
        return size * self.dtype.itemsize

    def tofile(self, *args, **kwargs) -> None:
        data = self.data()
        for f in self.functions:
            data = f(data)
        assert data.shape == self.shape
        assert data.dtype == self.dtype
        assert data.nbytes == self.nbytes
        self.functions = []
        self.data = lambda: data
        data.tofile(*args, **kwargs)

    def byteswap(self, *args, **kwargs) -> LazyTensor:
        self.functions.append(lambda n: n.byteswap(*args, **kwargs))
        return self


class WriterState(Enum):
    EMPTY = auto()
    HEADER = auto()

@@ -79,7 +38,7 @@ class WriterState(Enum):
class GGUFWriter:
    fout: BufferedWriter
    temp_file: tempfile.SpooledTemporaryFile[bytes] | None
    tensors: list[np.ndarray[Any, Any] | LazyTensor]
    tensors: list[np.ndarray[Any, Any]]
    _simple_value_packing = {
        GGUFValueType.UINT8: "B",
        GGUFValueType.INT8: "b",

@@ -278,7 +237,7 @@ class GGUFWriter:
        self.ti_data_count += 1

    def add_tensor(
        self, name: str, tensor: np.ndarray[Any, Any] | LazyTensor, raw_shape: Sequence[int] | None = None,
        self, name: str, tensor: np.ndarray[Any, Any], raw_shape: Sequence[int] | None = None,
        raw_dtype: GGMLQuantizationType | None = None,
    ) -> None:
        if self.endianess == GGUFEndian.BIG:

@@ -303,7 +262,7 @@ class GGUFWriter:
        if pad != 0:
            fp.write(bytes([0] * pad))

    def write_tensor_data(self, tensor: np.ndarray[Any, Any] | LazyTensor) -> None:
    def write_tensor_data(self, tensor: np.ndarray[Any, Any]) -> None:
        if self.state is not WriterState.TI_DATA:
            raise ValueError(f'Expected output file to contain tensor info, got {self.state}')

@@ -391,7 +350,7 @@ class GGUFWriter:
    def add_name(self, name: str) -> None:
        self.add_string(Keys.General.NAME, name)

    def add_quantization_version(self, quantization_version: GGMLQuantizationType) -> None:
    def add_quantization_version(self, quantization_version: int) -> None:
        self.add_uint32(
            Keys.General.QUANTIZATION_VERSION, quantization_version)
|
225
gguf-py/gguf/lazy.py
Normal file
225
gguf-py/gguf/lazy.py
Normal file
|
@ -0,0 +1,225 @@
|
|||
from __future__ import annotations
|
||||
from abc import ABC, ABCMeta, abstractmethod
|
||||
|
||||
import logging
|
||||
from typing import Any, Callable
|
||||
from collections import deque
|
||||
|
||||
import numpy as np
|
||||
from numpy.typing import DTypeLike
|
||||
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class LazyMeta(ABCMeta):
|
||||
|
||||
def __new__(cls, name: str, bases: tuple[type, ...], namespace: dict[str, Any], **kwargs):
|
||||
def __getattr__(self, __name: str) -> Any:
|
||||
meta_attr = getattr(self._meta, __name)
|
||||
if callable(meta_attr):
|
||||
return type(self)._wrap_fn(
|
||||
(lambda s, *args, **kwargs: getattr(s, __name)(*args, **kwargs)),
|
||||
use_self=self,
|
||||
)
|
||||
elif isinstance(meta_attr, self._tensor_type):
|
||||
# e.g. self.T with torch.Tensor should still be wrapped
|
||||
return type(self)._wrap_fn(lambda s: getattr(s, __name))(self)
|
||||
else:
|
||||
# no need to wrap non-tensor properties,
|
||||
# and they likely don't depend on the actual contents of the tensor
|
||||
return meta_attr
|
||||
|
||||
namespace["__getattr__"] = __getattr__
|
||||
|
||||
# need to make a builder for the wrapped wrapper to copy the name,
|
||||
# or else it fails with very cryptic error messages,
|
||||
# because somehow the same string would end up in every closures
|
||||
def mk_wrap(op_name: str, *, meta_noop: bool = False):
|
||||
# need to wrap the wrapper to get self
|
||||
def wrapped_special_op(self, *args, **kwargs):
|
||||
return type(self)._wrap_fn(
|
||||
getattr(type(self)._tensor_type, op_name),
|
||||
meta_noop=meta_noop,
|
||||
)(self, *args, **kwargs)
|
||||
return wrapped_special_op
|
||||
|
||||
# special methods bypass __getattr__, so they need to be added manually
|
||||
# ref: https://docs.python.org/3/reference/datamodel.html#special-lookup
|
||||
# NOTE: doing this from a metaclass is very convenient
|
||||
# TODO: make this even more comprehensive
|
||||
for binary_op in (
|
||||
"lt", "le", "eq", "ne", "ge", "gt", "not"
|
||||
"abs", "add", "and", "floordiv", "invert", "lshift", "mod", "mul", "matmul",
|
||||
"neg", "or", "pos", "pow", "rshift", "sub", "truediv", "xor",
|
||||
"iadd", "iand", "ifloordiv", "ilshift", "imod", "imul", "ior", "irshift", "isub", "ixor",
|
||||
"radd", "rand", "rfloordiv", "rmul", "ror", "rpow", "rsub", "rtruediv", "rxor",
|
||||
):
|
||||
attr_name = f"__{binary_op}__"
|
||||
# the result of these operators usually has the same shape and dtype as the input,
|
||||
# so evaluation on the meta tensor can be skipped.
|
||||
namespace[attr_name] = mk_wrap(attr_name, meta_noop=True)
|
||||
|
||||
for special_op in (
|
||||
"getitem", "setitem", "len",
|
||||
):
|
||||
attr_name = f"__{special_op}__"
|
||||
namespace[attr_name] = mk_wrap(attr_name, meta_noop=False)
|
||||
|
||||
return super().__new__(cls, name, bases, namespace, **kwargs)
|
||||
|
||||
|
||||
# Tree of lazy tensors
|
||||
class LazyBase(ABC, metaclass=LazyMeta):
|
||||
_tensor_type: type
|
||||
_meta: Any
|
||||
_data: Any | None
|
||||
_lazy: deque[LazyBase] # shared within a graph, to avoid deep recursion when making eager
|
||||
_args: tuple
|
||||
_func: Callable[[tuple], Any] | None
|
||||
|
||||
def __init__(self, *, meta: Any, data: Any | None = None, lazy: deque[LazyBase] | None = None, args: tuple = (), func: Callable[[tuple], Any] | None = None):
|
||||
super().__init__()
|
||||
self._meta = meta
|
||||
self._data = data
|
||||
self._lazy = lazy if lazy is not None else deque()
|
||||
self._args = args
|
||||
self._func = func
|
||||
assert self._func is not None or self._data is not None
|
||||
if self._data is None:
|
||||
self._lazy.append(self)
|
||||
|
||||
def __init_subclass__(cls) -> None:
|
||||
if "_tensor_type" not in cls.__dict__:
|
||||
raise TypeError(f"property '_tensor_type' must be defined for {cls!r}")
|
||||
return super().__init_subclass__()
|
||||
|
||||
@staticmethod
|
||||
def _recurse_apply(o: Any, fn: Callable[[Any], Any]) -> Any:
|
||||
# TODO: dict and set
|
||||
if isinstance(o, (list, tuple)):
|
||||
L = []
|
||||
for item in o:
|
||||
L.append(LazyBase._recurse_apply(item, fn))
|
||||
if isinstance(o, tuple):
|
||||
L = tuple(L)
|
||||
return L
|
||||
elif isinstance(o, LazyBase):
|
||||
return fn(o)
|
||||
else:
|
||||
return o
|
||||
|
||||
@classmethod
|
||||
def _wrap_fn(cls, fn: Callable, *, use_self: LazyBase | None = None, meta_noop: bool | DTypeLike = False) -> Callable[[Any], Any]:
|
||||
def wrapped_fn(*args, **kwargs):
|
||||
if kwargs is None:
|
||||
kwargs = {}
|
||||
args = ((use_self,) if use_self is not None else ()) + args
|
||||
|
||||
meta_args = LazyBase._recurse_apply(args, lambda t: t._meta)
|
||||
|
||||
if isinstance(meta_noop, bool) and not meta_noop:
|
||||
try:
|
||||
res = fn(*meta_args, **kwargs)
|
||||
except NotImplementedError:
|
||||
# running some operations on PyTorch's Meta tensors can cause this exception
|
||||
res = None
|
||||
else:
|
||||
# some operators don't need to actually run on the meta tensors
|
||||
assert len(args) > 0
|
||||
res = args[0]
|
||||
assert isinstance(res, cls)
|
||||
res = res._meta
|
||||
# allow operations to override the dtype
|
||||
if meta_noop is not True:
|
||||
res = cls.meta_with_dtype(res, meta_noop)
|
||||
|
||||
if isinstance(res, cls._tensor_type):
|
||||
def collect_replace(t: LazyBase):
|
||||
if collect_replace.shared_lazy is None:
|
||||
collect_replace.shared_lazy = t._lazy
|
||||
else:
|
||||
collect_replace.shared_lazy.extend(t._lazy)
|
||||
t._lazy = collect_replace.shared_lazy
|
||||
|
||||
# emulating a static variable
|
||||
collect_replace.shared_lazy = None
|
||||
|
||||
LazyBase._recurse_apply(args, collect_replace)
|
||||
|
||||
shared_lazy = collect_replace.shared_lazy
|
||||
|
||||
return cls(meta=cls.eager_to_meta(res), lazy=shared_lazy, args=args, func=lambda a: fn(*a, **kwargs))
|
||||
else:
|
||||
del res # not needed
|
||||
# non-tensor return likely relies on the contents of the args
|
||||
# (e.g. the result of torch.equal)
|
||||
eager_args = cls.to_eager(args)
|
||||
return fn(*eager_args, **kwargs)
|
||||
return wrapped_fn
|
||||
|
||||
@classmethod
|
||||
def to_eager(cls, t: Any) -> Any:
|
||||
def simple_to_eager(_t: LazyBase) -> Any:
|
||||
def already_eager_to_eager(_t: LazyBase) -> Any:
|
||||
assert _t._data is not None
|
||||
return _t._data
|
||||
|
||||
while _t._data is None:
|
||||
lt = _t._lazy.popleft()
|
||||
if lt._data is not None:
|
||||
raise ValueError(f"{lt} did not belong in the lazy queue")
|
||||
assert lt._func is not None
|
||||
lt._args = cls._recurse_apply(lt._args, already_eager_to_eager)
|
||||
lt._data = lt._func(lt._args)
|
||||
# sanity check
|
||||
assert lt._data.dtype == lt._meta.dtype
|
||||
assert lt._data.shape == lt._meta.shape
|
||||
|
||||
return _t._data
|
||||
|
||||
# recurse into lists and/or tuples, keeping their structure
|
||||
return cls._recurse_apply(t, simple_to_eager)
|
||||
|
||||
@classmethod
|
||||
def eager_to_meta(cls, t: Any) -> Any:
|
||||
return cls.meta_with_dtype(t, t.dtype)
|
||||
|
||||
# must be overridden, meta tensor init is backend-specific
|
||||
@classmethod
|
||||
@abstractmethod
|
||||
def meta_with_dtype(cls, m: Any, dtype: Any) -> Any: pass
|
||||
|
||||
@classmethod
|
||||
def from_eager(cls, t: Any) -> Any:
|
||||
if type(t) is cls:
|
||||
# already eager
|
||||
return t
|
||||
elif isinstance(t, cls._tensor_type):
|
||||
return cls(meta=cls.eager_to_meta(t), data=t)
|
||||
else:
|
||||
return TypeError(f"{type(t)!r} is not compatible with {cls._tensor_type!r}")
|
||||
|
||||
|
||||
class LazyNumpyTensor(LazyBase):
|
||||
_tensor_type = np.ndarray
|
||||
|
||||
@classmethod
|
||||
def meta_with_dtype(cls, m: np.ndarray[Any, Any], dtype: DTypeLike) -> np.ndarray[Any, Any]:
|
||||
# The initial idea was to use np.nan as the fill value,
|
||||
# but non-float types like np.int16 can't use that.
|
||||
# So zero it is.
|
||||
cheat = np.zeros(1, dtype)
|
||||
return np.lib.stride_tricks.as_strided(cheat, m.shape, (0 for _ in m.shape))
|
||||
|
||||
def astype(self, dtype, *args, **kwargs):
|
||||
meta = type(self).meta_with_dtype(self._meta, dtype)
|
||||
full_args = (self, dtype,) + args
|
||||
# very important to pass the shared _lazy deque, or else there's an infinite loop somewhere.
|
||||
return type(self)(meta=meta, args=full_args, lazy=self._lazy, func=(lambda a: a[0].astype(*a[1:], **kwargs)))
|
||||
|
||||
def tofile(self, *args, **kwargs):
|
||||
eager = LazyNumpyTensor.to_eager(self)
|
||||
return eager.tofile(*args, **kwargs)
|
||||
|
||||
# TODO: __array_function__
|
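The lazy module lets converter code queue up NumPy transformations without materializing the data until it is written. A minimal usage sketch (not part of the commit), assuming gguf-py is on PYTHONPATH; the variable names are illustrative only:

import numpy as np
from gguf.lazy import LazyNumpyTensor

lazy = LazyNumpyTensor.from_eager(np.arange(8, dtype=np.float32))
out = (lazy * 2).astype(np.float16)   # still lazy: only the zero-stride meta array is touched
print(out.dtype, out.shape)           # shape/dtype are known without evaluating anything
data = LazyNumpyTensor.to_eager(out)  # the queued operations run here
print(data)                           # [ 0.  2.  4.  6.  8. 10. 12. 14.] as float16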
@@ -9,5 +9,4 @@
-r ./requirements/requirements-convert-hf-to-gguf.txt
-r ./requirements/requirements-convert-hf-to-gguf-update.txt
-r ./requirements/requirements-convert-llama-ggml-to-gguf.txt
-r ./requirements/requirements-convert-lora-to-ggml.txt
-r ./requirements/requirements-convert-persimmon-to-gguf.txt
@@ -1,2 +0,0 @@
-r ./requirements-convert.txt
torch~=2.1.1
117 scripts/debug-test.sh (new executable file)

@@ -0,0 +1,117 @@
#!/bin/bash
test_suite=${1:-}
test_number=${2:-}

PROG=${0##*/}
build_dir="build-ci-debug"

if [ x"$1" = x"-h" ] || [ x"$1" = x"--help" ]; then
    echo "Usage: $PROG [OPTION]... <test_regex> (test_number)"
    echo "Debug specific ctest program."
    echo
    echo "Options:"
    echo "  -h, --help  Display this help and exit"
    echo
    echo "Arguments:"
    echo "  <test_regex>  (Mandatory) Supply one regex to the script to filter tests"
    echo "  (test_number) (Optional) Test number to run a specific test"
    echo
    echo "Example:"
    echo "  $PROG test-tokenizer"
    echo "  $PROG test-tokenizer 3"
    echo
    exit 0
fi

# Function to select and debug a test
function select_test() {
    test_suite=${1:-test}
    test_number=${2:-}

    # Sanity Check If Tests Is Detected
    printf "\n\nGathering tests that fit REGEX: ${test_suite} ...\n"
    tests=($(ctest -R ${test_suite} -V -N | grep -E " +Test +#[0-9]+*" | cut -d':' -f2 | awk '{$1=$1};1'))
    if [ ${#tests[@]} -eq 0 ]
    then
        echo "No tests avaliable... check your compliation process..."
        echo "Exiting."
        exit 1
    fi

    if [ -z $test_number ]
    then
        # List out avaliable tests
        printf "Which test would you like to debug?\n"
        id=0
        for s in "${tests[@]}"
        do
            echo "Test# ${id}"
            echo "  $s"
            ((id++))
        done

        # Prompt user which test they wanted to run
        printf "\nRun test#? "
        read test_number
    else
        printf "\nUser Already Requested #${test_number}"
    fi

    # Start GDB with the requested test binary and arguments
    printf "Debugging(GDB) test: ${tests[test_number]}\n"
    # Change IFS (Internal Field Separator)
    sIFS=$IFS
    IFS=$'\n'

    # Get test args
    gdb_args=($(ctest -R ${test_suite} -V -N | grep "Test command" | cut -d':' -f3 | awk '{$1=$1};1' ))
    IFS=$sIFS
    printf "Debug arguments: ${gdb_args[test_number]}\n\n"

    # Expand paths if needed
    args=()
    for x in $(echo ${gdb_args[test_number]} | sed -e 's/"\/\<//' -e 's/\>"//')
    do
        args+=($(echo $x | sed -e 's/.*\/..\//..\//'))
    done

    # Execute debugger
    echo "gdb args: ${args[@]}"
    gdb --args ${args[@]}
}

# Step 0: Check the args
if [ -z "$test_suite" ]
then
    echo "Usage: $PROG [OPTION]... <test_regex> (test_number)"
    echo "Supply one regex to the script to filter tests,"
    echo "and optionally a test number to run a specific test."
    echo "Use --help flag for full instructions"
    exit 1
fi

# Step 1: Reset and Setup folder context
## Sanity check that we are actually in a git repo
repo_root=$(git rev-parse --show-toplevel)
if [ ! -d "$repo_root" ]; then
    echo "Error: Not in a Git repository."
    exit 1
fi

## Reset folder to root context of git repo
pushd "$repo_root" || exit 1

## Create and enter build directory
rm -rf "$build_dir" && mkdir "$build_dir" || exit 1

# Step 2: Setup Build Environment and Compile Test Binaries
cmake -B "./$build_dir" -DCMAKE_BUILD_TYPE=Debug -DLLAMA_CUDA=1 -DLLAMA_FATAL_WARNINGS=ON || exit 1
pushd "$build_dir" && make -j || exit 1

# Step 3: Debug the Test
select_test "$test_suite" "$test_number"

# Step 4: Return to the directory from which the user ran the command.
popd || exit 1
popd || exit 1
popd || exit 1
@@ -1 +1 @@
98875cdb7e9ceeb726d1c196d2fecb3cbb59b93a
30f54cbb3ada3e4c5bc6924de3e5918e5be4ff11
@@ -2,6 +2,7 @@
#include <ggml-alloc.h>
#include <ggml-backend.h>
#include <ggml-backend-impl.h>

#include <algorithm>
#include <array>
#include <cfloat>

@@ -2173,11 +2174,7 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op
    test_cases.emplace_back(new test_timestep_embedding());
    test_cases.emplace_back(new test_leaky_relu());

#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
    for (int hs : { 64, 128, }) { // other head sizes not implemented
#else
    for (int hs : { 64, 80, 128, 256, }) {
#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
    for (float max_bias : {0.0f, 8.0f}) {
        for (int nh : { 32, }) {
            for (int kv : { 512, 1024, }) {