Merge remote-tracking branch 'origin/gg/refactor-alibi-2' into HEAD

This commit is contained in:
Georgi Gerganov 2024-05-10 15:27:21 +03:00
commit d9adb8832b
No known key found for this signature in database
GPG key ID: 449E073F9DC10735
29 changed files with 45973 additions and 36626 deletions

View file

@ -2,7 +2,7 @@
![llama](https://user-images.githubusercontent.com/1991296/230134379-7181e485-c521-4d23-a0d6-f7b3b61ba524.png)
[![License: MIT](https://img.shields.io/badge/license-MIT-blue.svg)](https://opensource.org/licenses/MIT)
[![License: MIT](https://img.shields.io/badge/license-MIT-blue.svg)](https://opensource.org/licenses/MIT) [![Server](https://github.com/ggerganov/llama.cpp/actions/workflows/server.yml/badge.svg?branch=master&event=schedule)](https://github.com/ggerganov/llama.cpp/actions/workflows/server.yml)
[Roadmap](https://github.com/users/ggerganov/projects/7) / [Project status](https://github.com/ggerganov/llama.cpp/discussions/3471) / [Manifesto](https://github.com/ggerganov/llama.cpp/discussions/205) / [ggml](https://github.com/ggerganov/ggml)
@ -140,6 +140,7 @@ Typically finetunes of the base models below are supported as well.
- [x] [MobileVLM 1.7B/3B models](https://huggingface.co/models?search=mobileVLM)
- [x] [Yi-VL](https://huggingface.co/models?search=Yi-VL)
- [x] [Mini CPM](https://huggingface.co/models?search=MiniCPM)
- [x] [Moondream](https://huggingface.co/vikhyatk/moondream2)
**HTTP server**
@ -175,6 +176,7 @@ Unless otherwise noted these projects are open-source with permissive licensing:
- [nat/openplayground](https://github.com/nat/openplayground)
- [Faraday](https://faraday.dev/) (proprietary)
- [LMStudio](https://lmstudio.ai/) (proprietary)
- [Layla](https://play.google.com/store/apps/details?id=com.laylalite) (proprietary)
- [LocalAI](https://github.com/mudler/LocalAI) (MIT)
- [LostRuins/koboldcpp](https://github.com/LostRuins/koboldcpp) (AGPL)
- [Mozilla-Ocho/llamafile](https://github.com/Mozilla-Ocho/llamafile)

View file

@ -49,6 +49,10 @@ chktxt = '\n \n\n \n\n\n \t \t\t \t\n \n \n \n \n🚀 (normal) 😶‍
if len(sys.argv) == 2:
token = sys.argv[1]
if not token.startswith("hf_"):
logger.info("Huggingface token seems invalid")
logger.info("Usage: python convert-hf-to-gguf-update.py <huggingface_token>")
sys.exit(1)
else:
logger.info("Usage: python convert-hf-to-gguf-update.py <huggingface_token>")
sys.exit(1)
@ -260,6 +264,7 @@ tests = [
"3333333",
"33333333",
"333333333",
# "Cửa Việt", # llama-bpe fails on this
chktxt,
]

View file

@ -52,15 +52,15 @@ static void ggml_print_tensor(uint8_t * data, ggml_type type, const int64_t * ne
size_t i = i3 * nb[3] + i2 * nb[2] + i1 * nb[1] + i0 * nb[0];
float v;
if (type == GGML_TYPE_F16) {
v = ggml_fp16_to_fp32(*(ggml_fp16_t *) data + i);
v = ggml_fp16_to_fp32(*(ggml_fp16_t *) &data[i]);
} else if (type == GGML_TYPE_F32) {
v = *(float *) data + i;
v = *(float *) &data[i];
} else if (type == GGML_TYPE_I32) {
v = (float) *(int32_t *) data + i;
v = (float) *(int32_t *) &data[i];
} else if (type == GGML_TYPE_I16) {
v = (float) *(int16_t *) data + i;
v = (float) *(int16_t *) &data[i];
} else if (type == GGML_TYPE_I8) {
v = (float) *(int8_t *) data + i;
v = (float) *(int8_t *) &data[i];
} else {
GGML_ASSERT(false);
}

View file

@ -104,6 +104,7 @@ static std::string format(const char * fmt, ...) {
#define TN_POS_EMBD "%s.position_embd.weight"
#define TN_CLASS_EMBD "v.class_embd"
#define TN_PATCH_EMBD "v.patch_embd.weight"
#define TN_PATCH_BIAS "v.patch_embd.bias"
#define TN_ATTN_K "%s.blk.%d.attn_k.%s"
#define TN_ATTN_Q "%s.blk.%d.attn_q.%s"
#define TN_ATTN_V "%s.blk.%d.attn_v.%s"
@ -425,6 +426,7 @@ struct clip_vision_model {
// embeddings
struct ggml_tensor * class_embedding;
struct ggml_tensor * patch_embeddings;
struct ggml_tensor * patch_bias;
struct ggml_tensor * position_embeddings;
struct ggml_tensor * pre_ln_w;
@ -501,6 +503,11 @@ struct clip_ctx {
bool use_gelu = false;
int32_t ftype = 1;
bool has_class_embedding = true;
bool has_pre_norm = true;
bool has_post_norm = false;
bool has_patch_bias = false;
struct gguf_context * ctx_gguf;
struct ggml_context * ctx_data;
@ -526,7 +533,7 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
const int patch_size = hparams.patch_size;
const int num_patches = ((image_size / patch_size) * (image_size / patch_size));
const int num_patches_per_side = image_size / patch_size; GGML_UNUSED(num_patches_per_side);
const int num_positions = num_patches + 1;
const int num_positions = num_patches + (ctx->has_class_embedding ? 1 : 0);
const int hidden_size = hparams.hidden_size;
const int n_head = hparams.n_head;
const int d_head = hidden_size / n_head;
@ -557,16 +564,23 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
inp = ggml_reshape_3d(ctx0, inp, num_patches, hidden_size, batch_size);
inp = ggml_cont(ctx0, ggml_permute(ctx0, inp, 1, 0, 2, 3));
if (ctx->has_patch_bias) {
// inp = ggml_add(ctx0, inp, ggml_repeat(ctx0, model.patch_bias, inp));
inp = ggml_add(ctx0, inp, model.patch_bias);
}
// concat class_embeddings and patch_embeddings
struct ggml_tensor * embeddings = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, hidden_size, num_positions, batch_size);
ggml_set_name(embeddings, "embeddings");
ggml_set_input(embeddings);
struct ggml_tensor * embeddings = inp;
if (ctx->has_class_embedding) {
embeddings = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, hidden_size, num_positions, batch_size);
ggml_set_name(embeddings, "embeddings");
ggml_set_input(embeddings);
embeddings = ggml_acc(ctx0, embeddings, model.class_embedding,
embeddings->nb[1], embeddings->nb[2], embeddings->nb[3], 0);
embeddings = ggml_acc(ctx0, embeddings, inp,
embeddings->nb[1], embeddings->nb[2], embeddings->nb[3], model.class_embedding->nb[1]);
}
embeddings = ggml_acc(ctx0, embeddings, model.class_embedding,
embeddings->nb[1], embeddings->nb[2], embeddings->nb[3], 0);
embeddings = ggml_acc(ctx0, embeddings, inp,
embeddings->nb[1], embeddings->nb[2], embeddings->nb[3], model.class_embedding->nb[1]);
struct ggml_tensor * positions = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, num_positions);
ggml_set_name(positions, "positions");
@ -576,7 +590,7 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
ggml_add(ctx0, embeddings, ggml_get_rows(ctx0, model.position_embeddings, positions));
// pre-layernorm
{
if (ctx->has_pre_norm) {
embeddings = ggml_norm(ctx0, embeddings, eps);
ggml_set_name(embeddings, "pre_ln");
@ -664,6 +678,14 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
embeddings = cur;
}
// post-layernorm
if (ctx->has_post_norm) {
embeddings = ggml_norm(ctx0, embeddings, eps);
ggml_set_name(embeddings, "post_ln");
embeddings = ggml_add(ctx0, ggml_mul(ctx0, embeddings, model.post_ln_w), model.post_ln_b);
}
// llava projector
{
embeddings = ggml_reshape_2d(ctx0, embeddings, embeddings->ne[0], embeddings->ne[1]);
@ -1148,12 +1170,39 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
}
try {
vision_model.class_embedding = get_tensor(new_clip->ctx_data, TN_CLASS_EMBD);
new_clip->has_class_embedding = true;
} catch (const std::exception& e) {
new_clip->has_class_embedding = false;
}
try {
vision_model.pre_ln_w = get_tensor(new_clip->ctx_data, format(TN_LN_PRE, "v", "weight"));
vision_model.pre_ln_b = get_tensor(new_clip->ctx_data, format(TN_LN_PRE, "v", "bias"));
new_clip->has_pre_norm = true;
} catch (std::exception & e) {
new_clip->has_pre_norm = false;
}
try {
vision_model.post_ln_w = get_tensor(new_clip->ctx_data, format(TN_LN_POST, "v", "weight"));
vision_model.post_ln_b = get_tensor(new_clip->ctx_data, format(TN_LN_POST, "v", "bias"));
new_clip->has_post_norm = true;
} catch (std::exception & e) {
new_clip->has_post_norm = false;
}
try {
vision_model.patch_bias = get_tensor(new_clip->ctx_data, TN_PATCH_BIAS);
new_clip->has_patch_bias = true;
} catch (std::exception & e) {
new_clip->has_patch_bias = false;
}
try {
vision_model.patch_embeddings = get_tensor(new_clip->ctx_data, TN_PATCH_EMBD);
vision_model.class_embedding = get_tensor(new_clip->ctx_data, TN_CLASS_EMBD);
vision_model.position_embeddings = get_tensor(new_clip->ctx_data, format(TN_POS_EMBD, "v"));
vision_model.pre_ln_w = get_tensor(new_clip->ctx_data, format(TN_LN_PRE, "v", "weight"));
vision_model.pre_ln_b = get_tensor(new_clip->ctx_data, format(TN_LN_PRE, "v", "bias"));
} catch(const std::exception& e) {
LOG_TEE("%s: failed to load vision model tensors\n", __func__);
}
@ -1797,7 +1846,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
const int image_size = hparams.image_size;
const int patch_size = hparams.patch_size;
const int num_patches = ((image_size / patch_size) * (image_size / patch_size));
const int num_positions = num_patches + 1;
const int num_positions = num_patches + (ctx->has_class_embedding ? 1 : 0);
{
struct ggml_tensor * inp_raw = ggml_graph_get_tensor(gf, "inp_raw");
@ -1825,12 +1874,14 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
}
{
struct ggml_tensor * embeddings = ggml_graph_get_tensor(gf, "embeddings");
if (ctx->has_class_embedding) {
struct ggml_tensor * embeddings = ggml_graph_get_tensor(gf, "embeddings");
void* zero_mem = malloc(ggml_nbytes(embeddings));
memset(zero_mem, 0, ggml_nbytes(embeddings));
ggml_backend_tensor_set(embeddings, zero_mem, 0, ggml_nbytes(embeddings));
free(zero_mem);
void* zero_mem = malloc(ggml_nbytes(embeddings));
memset(zero_mem, 0, ggml_nbytes(embeddings));
ggml_backend_tensor_set(embeddings, zero_mem, 0, ggml_nbytes(embeddings));
free(zero_mem);
}
}
{

View file

@ -4,7 +4,6 @@
#include "ggml-cuda/common.cuh"
#include "ggml-cuda/acc.cuh"
#include "ggml-cuda/alibi.cuh"
#include "ggml-cuda/arange.cuh"
#include "ggml-cuda/argsort.cuh"
#include "ggml-cuda/binbcast.cuh"
@ -2277,9 +2276,6 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
case GGML_OP_ROPE:
ggml_cuda_op_rope(ctx, dst);
break;
case GGML_OP_ALIBI:
ggml_cuda_op_alibi(ctx, dst);
break;
case GGML_OP_IM2COL:
ggml_cuda_op_im2col(ctx, dst);
break;
@ -2829,7 +2825,6 @@ GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, cons
case GGML_OP_DIAG_MASK_INF:
case GGML_OP_SOFT_MAX:
case GGML_OP_ROPE:
case GGML_OP_ALIBI:
case GGML_OP_IM2COL:
case GGML_OP_POOL_2D:
case GGML_OP_SUM_ROWS:

View file

@ -1,63 +0,0 @@
#include "alibi.cuh"
static __global__ void alibi_f32(const float * x, float * dst, const int ncols, const int k_rows,
const int n_heads_log2_floor, const float m0, const float m1) {
const int col = blockDim.x*blockIdx.x + threadIdx.x;
if (col >= ncols) {
return;
}
const int row = blockDim.y*blockIdx.y + threadIdx.y;
const int i = row*ncols + col;
const int k = row/k_rows;
float m_k;
if (k < n_heads_log2_floor) {
m_k = powf(m0, k + 1);
} else {
m_k = powf(m1, 2 * (k - n_heads_log2_floor) + 1);
}
dst[i] = col * m_k + x[i];
}
static void alibi_f32_cuda(const float * x, float * dst, const int ncols, const int nrows,
const int k_rows, const int n_heads_log2_floor, const float m0,
const float m1, cudaStream_t stream) {
const dim3 block_dims(CUDA_ALIBI_BLOCK_SIZE, 1, 1);
const int num_blocks_x = (ncols + CUDA_ALIBI_BLOCK_SIZE - 1) / (CUDA_ALIBI_BLOCK_SIZE);
const dim3 block_nums(num_blocks_x, nrows, 1);
alibi_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols, k_rows, n_heads_log2_floor, m0, m1);
}
void ggml_cuda_op_alibi(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
const ggml_tensor * src0 = dst->src[0];
const float * src0_d = (const float *)src0->data;
float * dst_d = (float *)dst->data;
cudaStream_t stream = ctx.stream();
GGML_ASSERT(src0->type == GGML_TYPE_F32);
GGML_ASSERT( dst->type == GGML_TYPE_F32);
const int64_t ne00 = src0->ne[0];
const int64_t ne01 = src0->ne[1];
const int64_t ne02 = src0->ne[2];
const int64_t nrows = ggml_nrows(src0);
//const int n_past = ((int32_t *) dst->op_params)[0];
const int n_head = ((int32_t *) dst->op_params)[1];
float max_bias;
memcpy(&max_bias, (int32_t *) dst->op_params + 2, sizeof(float));
//GGML_ASSERT(ne01 + n_past == ne00);
GGML_ASSERT(n_head == ne02);
const int n_heads_log2_floor = 1 << (int) floor(log2(n_head));
const float m0 = powf(2.0f, -(max_bias) / n_heads_log2_floor);
const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_heads_log2_floor);
alibi_f32_cuda(src0_d, dst_d, ne00, nrows, ne01, n_heads_log2_floor, m0, m1, stream);
}

View file

@ -1,5 +0,0 @@
#include "common.cuh"
#define CUDA_ALIBI_BLOCK_SIZE 32
void ggml_cuda_op_alibi(ggml_backend_cuda_context & ctx, ggml_tensor * dst);

View file

@ -234,122 +234,6 @@ typedef float dfloat; // dequantize float
typedef float2 dfloat2;
#endif //GGML_CUDA_F16
[[noreturn]]
static __device__ void no_device_code(
const char * file_name, const int line, const char * function_name, const int arch, const char * arch_list) {
#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
printf("%s:%d: ERROR: HIP kernel %s has no device code compatible with HIP arch %d.\n",
file_name, line, function_name, arch);
GGML_UNUSED(arch_list);
#else
printf("%s:%d: ERROR: CUDA kernel %s has no device code compatible with CUDA arch %d. ggml-cuda.cu was compiled for: %s\n",
file_name, line, function_name, arch, arch_list);
#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
__trap();
GGML_UNUSED(no_device_code); // suppress unused function warning
}
#ifdef __CUDA_ARCH__
#define NO_DEVICE_CODE no_device_code(__FILE__, __LINE__, __FUNCTION__, __CUDA_ARCH__, STRINGIZE(__CUDA_ARCH_LIST__))
#else
#define NO_DEVICE_CODE //GGML_ASSERT(false && "NO_DEVICE_CODE not valid in host code.")
#endif // __CUDA_ARCH__
static __device__ __forceinline__ float warp_reduce_sum(float x) {
#pragma unroll
for (int mask = 16; mask > 0; mask >>= 1) {
x += __shfl_xor_sync(0xffffffff, x, mask, 32);
}
return x;
}
static __device__ __forceinline__ float2 warp_reduce_sum(float2 a) {
#pragma unroll
for (int mask = 16; mask > 0; mask >>= 1) {
a.x += __shfl_xor_sync(0xffffffff, a.x, mask, 32);
a.y += __shfl_xor_sync(0xffffffff, a.y, mask, 32);
}
return a;
}
static __device__ __forceinline__ half2 warp_reduce_sum(half2 a) {
#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL
#pragma unroll
for (int mask = 16; mask > 0; mask >>= 1) {
a = __hadd2(a, __shfl_xor_sync(0xffffffff, a, mask, 32));
}
return a;
#else
GGML_UNUSED(a);
NO_DEVICE_CODE;
#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL
}
static __device__ __forceinline__ float warp_reduce_max(float x) {
#pragma unroll
for (int mask = 16; mask > 0; mask >>= 1) {
x = fmaxf(x, __shfl_xor_sync(0xffffffff, x, mask, 32));
}
return x;
}
static __device__ __forceinline__ half ggml_cuda_hmax(const half a, const half b) {
#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))
#if CUDART_VERSION >= CUDART_HMAX
return __hmax(a, b);
#else
return __half2float(a) > __half2float(b) ? a : b;
#endif // CUDART_VERSION >= CUDART_HMAX
#else
GGML_UNUSED(a);
GGML_UNUSED(b);
NO_DEVICE_CODE;
#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && CUDART_VERSION < CUDART_HMAX
}
static __device__ __forceinline__ half2 ggml_cuda_hmax2(const half2 a, const half2 b) {
#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))
#if CUDART_VERSION >= CUDART_HMAX
return __hmax2(a, b);
#else
half2 ret;
reinterpret_cast<half&>(ret.x) = __low2float(a) > __low2float(b) ? __low2half(a) : __low2half(b);
reinterpret_cast<half&>(ret.y) = __high2float(a) > __high2float(b) ? __high2half(a) : __high2half(b);
return ret;
#endif // CUDART_VERSION >= CUDART_HMAX
#else
GGML_UNUSED(a);
GGML_UNUSED(b);
NO_DEVICE_CODE;
#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && CUDART_VERSION < CUDART_HMAX
}
static __device__ __forceinline__ half2 warp_reduce_max(half2 x) {
#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL
#pragma unroll
for (int mask = 16; mask > 0; mask >>= 1) {
x = ggml_cuda_hmax2(x, __shfl_xor_sync(0xffffffff, x, mask, 32));
}
return x;
#else
GGML_UNUSED(x);
NO_DEVICE_CODE;
#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL
}
#if CUDART_VERSION < CUDART_HMASK
static __device__ __forceinline__ uint32_t __hgt2_mask(const half2 a, const half2 b) {
const uint32_t mask_low = 0x0000FFFF * (float( __low2half(a)) > float( __low2half(b)));
const uint32_t mask_high = 0xFFFF0000 * (float(__high2half(a)) > float(__high2half(b)));
return mask_low | mask_high;
}
#endif // CUDART_VERSION < 12000
#if defined(GGML_USE_HIPBLAS)
#define __CUDA_ARCH__ 1300
@ -433,11 +317,143 @@ static __device__ __forceinline__ int __dp4a(const int a, const int b, int c) {
}
#endif // defined(GGML_USE_HIPBLAS)
#define FP16_AVAILABLE defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) ? \
defined(RDNA1) || defined(RDNA2) || defined(RDNA3) : __CUDA_ARCH__ >= CC_PASCAL
#define FP16_AVAILABLE (defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) || __CUDA_ARCH__ >= CC_PASCAL
#define FP16_MMA_AVAILABLE !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_VOLTA
static bool fp16_mma_available(const int cc) {
return cc < CC_OFFSET_AMD && cc >= CC_VOLTA;
}
[[noreturn]]
static __device__ void no_device_code(
const char * file_name, const int line, const char * function_name, const int arch, const char * arch_list) {
#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
printf("%s:%d: ERROR: HIP kernel %s has no device code compatible with HIP arch %d.\n",
file_name, line, function_name, arch);
GGML_UNUSED(arch_list);
#else
printf("%s:%d: ERROR: CUDA kernel %s has no device code compatible with CUDA arch %d. ggml-cuda.cu was compiled for: %s\n",
file_name, line, function_name, arch, arch_list);
#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
__trap();
GGML_UNUSED(no_device_code); // suppress unused function warning
}
#ifdef __CUDA_ARCH__
#define NO_DEVICE_CODE no_device_code(__FILE__, __LINE__, __FUNCTION__, __CUDA_ARCH__, STRINGIZE(__CUDA_ARCH_LIST__))
#else
#define NO_DEVICE_CODE //GGML_ASSERT(false && "NO_DEVICE_CODE not valid in host code.")
#endif // __CUDA_ARCH__
static __device__ __forceinline__ float warp_reduce_sum(float x) {
#pragma unroll
for (int mask = 16; mask > 0; mask >>= 1) {
x += __shfl_xor_sync(0xffffffff, x, mask, 32);
}
return x;
}
static __device__ __forceinline__ float2 warp_reduce_sum(float2 a) {
#pragma unroll
for (int mask = 16; mask > 0; mask >>= 1) {
a.x += __shfl_xor_sync(0xffffffff, a.x, mask, 32);
a.y += __shfl_xor_sync(0xffffffff, a.y, mask, 32);
}
return a;
}
static __device__ __forceinline__ half2 warp_reduce_sum(half2 a) {
#if FP16_AVAILABLE
#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
#pragma unroll
for (int mask = 16; mask > 0; mask >>= 1) {
const half2 a_other = __shfl_xor_sync(0xffffffff, a, mask, 32);
reinterpret_cast<half&>(a.x) += __low2half(a_other);
reinterpret_cast<half&>(a.y) += __high2half(a_other);
}
return a;
#else
#pragma unroll
for (int mask = 16; mask > 0; mask >>= 1) {
a = __hadd2(a, __shfl_xor_sync(0xffffffff, a, mask, 32));
}
return a;
#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
#else
NO_DEVICE_CODE;
return a;
#endif // FP16_AVAILABLE
}
static __device__ __forceinline__ float warp_reduce_max(float x) {
#pragma unroll
for (int mask = 16; mask > 0; mask >>= 1) {
x = fmaxf(x, __shfl_xor_sync(0xffffffff, x, mask, 32));
}
return x;
}
static __device__ __forceinline__ half ggml_cuda_hmax(const half a, const half b) {
#if FP16_AVAILABLE
#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && CUDART_VERSION < CUDART_HMAX
return __float2half(fmaxf(__half2float(a), __half2float(b)));
#else
return __hmax(a, b);
#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && CUDART_VERSION < CUDART_HMAX
#else
NO_DEVICE_CODE;
GGML_UNUSED(b);
return a;
#endif // FP16_AVAILABLE
}
static __device__ __forceinline__ half2 ggml_cuda_hmax2(const half2 a, const half2 b) {
#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))
#if CUDART_VERSION >= CUDART_HMAX
return __hmax2(a, b);
#else
half2 ret;
reinterpret_cast<half&>(ret.x) = __float2half(fmaxf( __low2float(a), __low2float(b)));
reinterpret_cast<half&>(ret.y) = __float2half(fmaxf(__high2float(a), __high2float(b)));
return ret;
#endif // CUDART_VERSION >= CUDART_HMAX
#else
GGML_UNUSED(a);
GGML_UNUSED(b);
NO_DEVICE_CODE;
#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))
}
static __device__ __forceinline__ half2 warp_reduce_max(half2 x) {
#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL
#pragma unroll
for (int mask = 16; mask > 0; mask >>= 1) {
x = ggml_cuda_hmax2(x, __shfl_xor_sync(0xffffffff, x, mask, 32));
}
return x;
#else
GGML_UNUSED(x);
NO_DEVICE_CODE;
#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL
}
#if CUDART_VERSION < CUDART_HMASK
static __device__ __forceinline__ uint32_t __hgt2_mask(const half2 a, const half2 b) {
const uint32_t mask_low = 0x0000FFFF * (float( __low2half(a)) > float( __low2half(b)));
const uint32_t mask_high = 0xFFFF0000 * (float(__high2half(a)) > float(__high2half(b)));
return mask_low | mask_high;
}
#endif // CUDART_VERSION < 12000
// TODO: move to ggml-common.h
static const __device__ int8_t kvalues_iq4nl[16] = {-127, -104, -83, -65, -49, -35, -22, -10, 1, 13, 25, 38, 53, 69, 89, 113};

View file

@ -11,8 +11,10 @@
#define HALF_MAX_HALF __float2half(65504.0f/2) // Use neg. of this instead of -INFINITY to initialize KQ max vals to avoid NaN upon subtraction.
#define SOFTMAX_FTZ_THRESHOLD -20.0f // Softmax exp. of values smaller than this are flushed to zero to avoid NaNs.
template<int D, int parallel_blocks> // D == head size
__launch_bounds__(((D + WARP_SIZE - 1) / WARP_SIZE)*WARP_SIZE, 1)
template<int D, int ncols, int parallel_blocks> // D == head size
#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))
__launch_bounds__(D, 1)
#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))
static __global__ void flash_attn_vec_ext_f16(
const char * __restrict__ Q,
const char * __restrict__ K,
@ -21,6 +23,10 @@ static __global__ void flash_attn_vec_ext_f16(
float * __restrict__ dst,
float2 * __restrict__ dst_meta,
const float scale,
const float max_bias,
const float m0,
const float m1,
const uint32_t n_head_log2,
const int ne00,
const int ne01,
const int ne02,
@ -44,55 +50,89 @@ static __global__ void flash_attn_vec_ext_f16(
#if FP16_AVAILABLE
//In this kernel Q, K, V are matrices while i, j, k are matrix indices.
const int ic = blockIdx.x / parallel_blocks; // Index of the Q/QKV column to work on.
const int ip = blockIdx.x % parallel_blocks; // Index in group of blocks running for the same column in parallel.
const int ic0 = (blockIdx.x / parallel_blocks) * ncols; // Index of the Q/QKV column to work on.
const int ip = blockIdx.x % parallel_blocks; // Index in group of blocks running for the same column in parallel.
const int gqa_ratio = ne02 / ne12; // With grouped query attention there are > 1 Q matrices per K, V matrix.
const float2 * Q_f2 = (const float2 *) (Q + nb02* blockIdx.y + nb01*ic);
const float2 * Q_f2 = (const float2 *) (Q + nb02* blockIdx.y + nb01*ic0);
const half2 * K_h2 = (const half2 *) (K + nb12*(blockIdx.y / gqa_ratio));
const half * V_h = (const half *) (V + nb12*(blockIdx.y / gqa_ratio)); // K and V have same shape
const half * maskh = (const half *) mask + ne11*ic;
const half * maskh = (const half *) mask + ne11*ic0;
const int stride_KV = nb11 / sizeof(half);
const int stride_KV2 = nb11 / sizeof(half2);
constexpr int nwarps = (D + WARP_SIZE - 1) / WARP_SIZE;
const int tid = WARP_SIZE*threadIdx.y + threadIdx.x;
__builtin_assume(tid < nwarps*WARP_SIZE);
half slopeh = __float2half(1.0f);
__shared__ half KQ[nwarps*WARP_SIZE];
KQ[tid] = -INFINITY;
// ALiBi
if (max_bias > 0.0f) {
const int h = blockIdx.y;
const float base = h < n_head_log2 ? m0 : m1;
const int exph = h < n_head_log2 ? h + 1 : 2*(h - n_head_log2) + 1;
slopeh = __float2half(powf(base, exph));
}
static_assert(D % (2*WARP_SIZE) == 0, "D not divisible by 2*WARP_SIZE == 64.");
constexpr int nwarps = D / WARP_SIZE;
const int tid = WARP_SIZE*threadIdx.y + threadIdx.x;
__builtin_assume(tid < D);
__shared__ half KQ[ncols*D];
#pragma unroll
for (int j = 0; j < ncols; ++j) {
KQ[j*D + tid] = -HALF_MAX_HALF;
}
half2 * KQ2 = (half2 *) KQ;
half kqmax = -HALF_MAX_HALF;
half kqsum = 0.0f;
half kqmax[ncols];
#pragma unroll
for (int j = 0; j < ncols; ++j) {
kqmax[j] = -HALF_MAX_HALF;
}
half kqsum[ncols] = {0.0f};
__shared__ half kqmax_shared[WARP_SIZE];
__shared__ half kqsum_shared[WARP_SIZE];
if (threadIdx.y == 0) {
kqmax_shared[threadIdx.x] = -HALF_MAX_HALF;
kqsum_shared[threadIdx.x] = 0.0f;
__shared__ half kqmax_shared[ncols][WARP_SIZE];
__shared__ half kqsum_shared[ncols][WARP_SIZE];
#pragma unroll
for (int j = 0; j < ncols; ++j) {
if (threadIdx.y == 0) {
kqmax_shared[j][threadIdx.x] = -HALF_MAX_HALF;
kqsum_shared[j][threadIdx.x] = 0.0f;
}
}
__syncthreads();
// Convert Q to half2 and store in registers:
half2 Q_h2[(D/2 + WARP_SIZE - 1) / WARP_SIZE];
half2 Q_h2[ncols][D/(2*WARP_SIZE)];
#pragma unroll
for (int i0 = 0; i0 < D/2; i0 += WARP_SIZE) {
const int i = i0 + threadIdx.x;
if (i0 + WARP_SIZE > D/2 && i >= D/2) {
break;
}
for (int j = 0; j < ncols; ++j) {
#pragma unroll
for (int i0 = 0; i0 < D/2; i0 += WARP_SIZE) {
const int i = i0 + threadIdx.x;
Q_h2[i0/WARP_SIZE] = make_half2(scale, scale) * make_half2(Q_f2[i].x, Q_f2[i].y);
const float2 tmp = Q_f2[j*(nb01/sizeof(float2)) + i];
Q_h2[j][i0/WARP_SIZE] = make_half2(scale, scale) * make_half2(tmp.x, tmp.y);
}
}
half2 VKQ = make_half2(0.0f, 0.0f); // Each thread calculates a single VKQ value.
half2 VKQ[ncols] = {{0.0f, 0.0f}};
const int k_start = parallel_blocks == 1 ? 0 : ip*D;
const int k_start = parallel_blocks == 1 ? 0 : ip*D;
for (int k_VKQ_0 = k_start; k_VKQ_0 < ne11; k_VKQ_0 += parallel_blocks*D) {
// Calculate KQ tile and keep track of new maximum KQ values:
half kqmax_new = kqmax;
// For unknown reasons using a half array of size 1 for kqmax_new causes a performance regression,
// see https://github.com/ggerganov/llama.cpp/pull/7061 .
// Therefore this variable is defined twice but only used once (so that the compiler can optimize out the unused variable).
half kqmax_new = kqmax[0];
half kqmax_new_arr[ncols];
#pragma unroll
for (int j = 0; j < ncols; ++j) {
kqmax_new_arr[j] = kqmax[j];
}
#pragma unroll
for (int i_KQ_0 = 0; i_KQ_0 < D; i_KQ_0 += nwarps) {
const int i_KQ = i_KQ_0 + threadIdx.y;
@ -101,89 +141,112 @@ static __global__ void flash_attn_vec_ext_f16(
break;
}
half2 sum2 = make_half2(0.0f, 0.0f);
half2 sum2[ncols] = {{0.0f, 0.0f}};
#pragma unroll
for (int k_KQ_0 = 0; k_KQ_0 < D/2; k_KQ_0 += WARP_SIZE) {
const int k_KQ = k_KQ_0 + threadIdx.x;
if (k_KQ_0 + WARP_SIZE > D/2 && k_KQ >= D/2) {
break;
}
const half2 K_ik = K_h2[(k_VKQ_0 + i_KQ)*stride_KV2 + k_KQ];
sum2 += K_ik * Q_h2[k_KQ_0/WARP_SIZE];
}
sum2 = warp_reduce_sum(sum2);
half sum = __low2half(sum2) + __high2half(sum2);
sum += mask ? maskh[k_VKQ_0 + i_KQ] : __float2half(0.0f);
kqmax_new = ggml_cuda_hmax(kqmax_new, sum);
if (threadIdx.x == 0) {
KQ[i_KQ] = sum;
}
}
kqmax_new = warp_reduce_max(kqmax_new);
if (threadIdx.x == 0) {
kqmax_shared[threadIdx.y] = kqmax_new;
}
__syncthreads();
kqmax_new = kqmax_shared[threadIdx.x];
kqmax_new = warp_reduce_max(kqmax_new);
const half KQ_max_scale = hexp(kqmax - kqmax_new);
kqmax = kqmax_new;
const half val = hexp(KQ[tid] - kqmax);
kqsum = kqsum*KQ_max_scale + val;
KQ[tid] = val;
VKQ *= __half2half2(KQ_max_scale);
__syncthreads();
if (tid < D) {
#pragma unroll
for (int k0 = 0; k0 < D; k0 += 2) {
if (FATTN_KQ_STRIDE % D != 0 && k_VKQ_0 + k0 >= ne11) {
break;
for (int j = 0; j < ncols; ++j) {
sum2[j] += K_ik * Q_h2[j][k_KQ_0/WARP_SIZE];
}
}
#pragma unroll
for (int j = 0; j < ncols; ++j) {
sum2[j] = warp_reduce_sum(sum2[j]);
half sum = __low2half(sum2[j]) + __high2half(sum2[j]);
sum += mask ? slopeh*maskh[j*ne11 + k_VKQ_0 + i_KQ] : __float2half(0.0f);
if (ncols == 1) {
kqmax_new = ggml_cuda_hmax(kqmax_new, sum);
} else {
kqmax_new_arr[j] = ggml_cuda_hmax(kqmax_new_arr[j], sum);
}
half2 V_k;
reinterpret_cast<half&>(V_k.x) = V_h[(k_VKQ_0 + k0 + 0)*stride_KV + tid];
reinterpret_cast<half&>(V_k.y) = V_h[(k_VKQ_0 + k0 + 1)*stride_KV + tid];
VKQ += V_k*KQ2[k0/2];
if (threadIdx.x == 0) {
KQ[j*D + i_KQ] = sum;
}
}
}
#pragma unroll
for (int j = 0; j < ncols; ++j) {
half kqmax_new_j = ncols == 1 ? kqmax_new : kqmax_new_arr[j];
kqmax_new_j = warp_reduce_max(kqmax_new_j);
if (threadIdx.x == 0) {
kqmax_shared[j][threadIdx.y] = kqmax_new_j;
}
}
__syncthreads();
#pragma unroll
for (int j = 0; j < ncols; ++j) {
half kqmax_new_j = kqmax_shared[j][threadIdx.x];
kqmax_new_j = warp_reduce_max(kqmax_new_j);
const half KQ_max_scale = hexp(kqmax[j] - kqmax_new_j);
kqmax[j] = kqmax_new_j;
const half val = hexp(KQ[j*D + tid] - kqmax[j]);
kqsum[j] = kqsum[j]*KQ_max_scale + val;
KQ[j*D + tid] = val;
VKQ[j] *= __half2half2(KQ_max_scale);
}
__syncthreads();
#pragma unroll
for (int k0 = 0; k0 < D; k0 += 2) {
if (FATTN_KQ_STRIDE % D != 0 && k_VKQ_0 + k0 >= ne11) {
break;
}
half2 V_k;
reinterpret_cast<half&>(V_k.x) = V_h[(k_VKQ_0 + k0 + 0)*stride_KV + tid];
reinterpret_cast<half&>(V_k.y) = V_h[(k_VKQ_0 + k0 + 1)*stride_KV + tid];
#pragma unroll
for (int j = 0; j < ncols; ++j) {
VKQ[j] += V_k*KQ2[j*(D/2) + k0/2];
}
}
__syncthreads();
}
if (tid >= D) {
kqsum = 0.0f;
#pragma unroll
for (int j = 0; j < ncols; ++j) {
kqsum[j] = warp_reduce_sum(kqsum[j]);
if (threadIdx.x == 0) {
kqsum_shared[j][threadIdx.y] = kqsum[j];
}
}
kqsum = warp_reduce_sum(kqsum);
if (threadIdx.x == 0) {
kqsum_shared[threadIdx.y] = kqsum;
}
__syncthreads();
kqsum = kqsum_shared[threadIdx.x];
kqsum = warp_reduce_sum(kqsum);
if (tid >= D) {
return;
#pragma unroll
for (int j_VKQ = 0; j_VKQ < ncols; ++j_VKQ) {
kqsum[j_VKQ] = kqsum_shared[j_VKQ][threadIdx.x];
kqsum[j_VKQ] = warp_reduce_sum(kqsum[j_VKQ]);
half dst_val = (__low2half(VKQ[j_VKQ]) + __high2half(VKQ[j_VKQ]));
if (parallel_blocks == 1) {
dst_val /= kqsum[j_VKQ];
}
const int j_dst = (ic0 + j_VKQ)*parallel_blocks + ip;
dst[j_dst*D*gridDim.y + D*blockIdx.y + tid] = dst_val;
}
half dst_val = (__low2half(VKQ) + __high2half(VKQ));
if (parallel_blocks == 1) {
dst_val /= kqsum;
if (parallel_blocks != 1 && tid != 0) {
#pragma unroll
for (int j = 0; j < ncols; ++j) {
dst_meta[(ic0 + j)*gridDim.y*parallel_blocks + blockIdx.y*parallel_blocks + ip] = make_float2(kqmax[j], kqsum[j]);
}
}
dst[D*gridDim.y*blockIdx.x + D*blockIdx.y + tid] = dst_val;
if (parallel_blocks == 1 || tid != 0) {
return;
}
dst_meta[ic*gridDim.y*parallel_blocks + blockIdx.y*parallel_blocks + ip] = make_float2(kqmax, kqsum);
#else
NO_DEVICE_CODE;
#endif // FP16_AVAILABLE
@ -191,7 +254,9 @@ static __global__ void flash_attn_vec_ext_f16(
// D == head size, VKQ_stride == num VKQ rows calculated in parallel:
template<int D, int ncols, int nwarps, int VKQ_stride, int parallel_blocks, typename KQ_acc_t>
#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))
__launch_bounds__(nwarps*WARP_SIZE, 1)
#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))
static __global__ void flash_attn_ext_f16(
const char * __restrict__ Q,
const char * __restrict__ K,
@ -200,6 +265,10 @@ static __global__ void flash_attn_ext_f16(
float * __restrict__ dst,
float2 * __restrict__ dst_meta,
const float scale,
const float max_bias,
const float m0,
const float m1,
const uint32_t n_head_log2,
const int ne00,
const int ne01,
const int ne02,
@ -256,6 +325,20 @@ static __global__ void flash_attn_ext_f16(
const int stride_Q = nb01 / sizeof(float);
const int stride_KV = nb11 / sizeof(half);
half slopeh = __float2half(1.0f);
half2 slope2 = make_half2(1.0f, 1.0f);
// ALiBi
if (max_bias > 0.0f) {
const int h = blockIdx.y;
const float base = h < n_head_log2 ? m0 : m1;
const int exph = h < n_head_log2 ? h + 1 : 2*(h - n_head_log2) + 1;
slopeh = __float2half(powf(base, exph));
slope2 = make_half2(slopeh, slopeh);
}
frag_b Q_b[D/16][ncols/frag_n];
// A single buffer for temporarily holding tiles of KQ and VKQ parts:
@ -372,7 +455,7 @@ static __global__ void flash_attn_ext_f16(
for (int k0 = 0; k0 < FATTN_KQ_STRIDE; k0 += WARP_SIZE) {
const int k = k0 + threadIdx.x;
KQ_f_tmp[k0/WARP_SIZE] += mask ? __half2float(maskh[j*(nb31/sizeof(half)) + k_VKQ_0 + k]) : 0.0f;
KQ_f_tmp[k0/WARP_SIZE] += mask ? __half2float(slopeh*maskh[j*(nb31/sizeof(half)) + k_VKQ_0 + k]) : 0.0f;
KQ_max_new = max(KQ_max_new, KQ_f_tmp[k0/WARP_SIZE]);
}
KQ_max_new = warp_reduce_max(KQ_max_new);
@ -415,7 +498,7 @@ static __global__ void flash_attn_ext_f16(
for (int k0 = 0; k0 < FATTN_KQ_STRIDE/2; k0 += WARP_SIZE) {
const int k = k0 + threadIdx.x;
KQ2_tmp[k0/WARP_SIZE] += mask ? mask2[(j*ne11 + k_VKQ_0)/2 + k] : make_half2(0.0f, 0.0f);
KQ2_tmp[k0/WARP_SIZE] += mask ? slope2*mask2[(j*ne11 + k_VKQ_0)/2 + k] : make_half2(0.0f, 0.0f);
KQ_max_new = ggml_cuda_hmax2(KQ_max_new, KQ2_tmp[k0/WARP_SIZE]);
}
KQ_max_new = __half2half2(warp_reduce_max(ggml_cuda_hmax(__low2half(KQ_max_new), __high2half(KQ_max_new))));
@ -573,7 +656,9 @@ static __global__ void flash_attn_ext_f16(
}
template<int D, int parallel_blocks> // D == head size
#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))
__launch_bounds__(D, 1)
#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))
static __global__ void flash_attn_combine_results(
const float * __restrict__ VKQ_parts,
const float2 * __restrict__ VKQ_meta,
@ -642,7 +727,7 @@ static_assert(get_VKQ_stride( 80, 1, 16) == 16, "Test failed.");
static_assert(get_VKQ_stride( 80, 2, 16) == 16, "Test failed.");
static_assert(get_VKQ_stride( 80, 4, 16) == 16, "Test failed.");
template <int D, int parallel_blocks> void launch_fattn_vec_f16(
template <int D, int cols_per_block, int parallel_blocks> void launch_fattn_vec_f16(
const ggml_tensor * Q, const ggml_tensor * K, const ggml_tensor * V, ggml_tensor * KQV, const ggml_tensor * mask,
ggml_cuda_pool & pool, cudaStream_t main_stream
) {
@ -656,20 +741,29 @@ template <int D, int parallel_blocks> void launch_fattn_vec_f16(
constexpr int nwarps = (D + WARP_SIZE - 1) / WARP_SIZE;
const dim3 block_dim(WARP_SIZE, nwarps, 1);
const dim3 blocks_num(parallel_blocks*Q->ne[1], Q->ne[2], Q->ne[3]);
const dim3 blocks_num(parallel_blocks*((Q->ne[1] + cols_per_block - 1) / cols_per_block), Q->ne[2], Q->ne[3]);
const int shmem = 0;
float scale;
memcpy(&scale, KQV->op_params, sizeof(float));
float scale = 1.0f;
float max_bias = 0.0f;
flash_attn_vec_ext_f16<D, parallel_blocks>
memcpy(&scale, (float *) KQV->op_params + 0, sizeof(float));
memcpy(&max_bias, (float *) KQV->op_params + 1, sizeof(float));
const uint32_t n_head = Q->ne[2];
const uint32_t n_head_log2 = 1u << (uint32_t) floorf(log2f((float) n_head));
const float m0 = powf(2.0f, -(max_bias ) / n_head_log2);
const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_head_log2);
flash_attn_vec_ext_f16<D, cols_per_block, parallel_blocks>
<<<blocks_num, block_dim, shmem, main_stream>>> (
(const char *) Q->data,
(const char *) K->data,
(const char *) V->data,
mask ? ((const char *) mask->data) : nullptr,
parallel_blocks == 1 ? (float *) KQV->data : dst_tmp.ptr, dst_tmp_meta.ptr,
scale,
scale, max_bias, m0, m1, n_head_log2,
Q->ne[0], Q->ne[1], Q->ne[2], Q->ne[3],
K->ne[0], K->ne[1], K->ne[2], K->ne[3],
mask ? mask->ne[1] : 0, mask ? mask->nb[1] : 0,
@ -710,8 +804,17 @@ template <int D, int cols_per_block, int nwarps, int parallel_blocks, typename K
const dim3 blocks_num(parallel_blocks*(Q->ne[1] + cols_per_block - 1) / cols_per_block, Q->ne[2], Q->ne[3]);
const int shmem = 0;
float scale;
memcpy(&scale, KQV->op_params, sizeof(float));
float scale = 1.0f;
float max_bias = 0.0f;
memcpy(&scale, (float *) KQV->op_params + 0, sizeof(float));
memcpy(&max_bias, (float *) KQV->op_params + 1, sizeof(float));
const uint32_t n_head = Q->ne[2];
const uint32_t n_head_log2 = 1u << (uint32_t) floorf(log2f((float) n_head));
const float m0 = powf(2.0f, -(max_bias ) / n_head_log2);
const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_head_log2);
flash_attn_ext_f16<D, cols_per_block, nwarps, get_VKQ_stride(D, nwarps, frag_m), parallel_blocks, KQ_acc_t>
<<<blocks_num, block_dim, shmem, main_stream>>> (
@ -720,7 +823,7 @@ template <int D, int cols_per_block, int nwarps, int parallel_blocks, typename K
(const char *) V->data,
mask ? ((const char *) mask->data) : nullptr,
(parallel_blocks) == 1 ? (float *) KQV->data : dst_tmp.ptr, dst_tmp_meta.ptr,
scale,
scale, max_bias, m0, m1, n_head_log2,
Q->ne[0], Q->ne[1], Q->ne[2], Q->ne[3],
K->ne[0], K->ne[1], K->ne[2], K->ne[3],
mask ? mask->ne[1] : 0, mask ? mask->nb[1] : 0,
@ -783,9 +886,98 @@ void ggml_cuda_flash_attn_ext(ggml_backend_cuda_context & ctx, ggml_tensor * dst
ggml_cuda_set_device(ctx.device);
const int cc = ggml_cuda_info().devices[ggml_cuda_get_device()].cc;
const int nsm = ggml_cuda_info().devices[ggml_cuda_get_device()].nsm;
const int32_t precision = KQV->op_params[1];
const int32_t precision = KQV->op_params[2];
if (!fp16_mma_available(cc)) {
GGML_ASSERT(precision == GGML_PREC_DEFAULT);
GGML_ASSERT(Q->ne[0] == 64 || Q->ne[0] == 128 && "FlashAttention without tensor cores only supports head sizes 64 and 128.");
if (Q->ne[1] == 1) {
constexpr int cols_per_block = 1;
constexpr int parallel_blocks = 4;
switch (Q->ne[0]) {
case 64:
launch_fattn_vec_f16< 64, cols_per_block, parallel_blocks>(Q, K, V, KQV, mask, ctx.pool(), ctx.stream());
break;
case 128:
launch_fattn_vec_f16<128, cols_per_block, parallel_blocks>(Q, K, V, KQV, mask, ctx.pool(), ctx.stream());
break;
default:
GGML_ASSERT(false);
break;
}
return;
}
if (Q->ne[1] == 2) {
constexpr int cols_per_block = 2;
constexpr int parallel_blocks = 4;
switch (Q->ne[0]) {
case 64:
launch_fattn_vec_f16< 64, cols_per_block, parallel_blocks>(Q, K, V, KQV, mask, ctx.pool(), ctx.stream());
break;
case 128:
launch_fattn_vec_f16<128, cols_per_block, parallel_blocks>(Q, K, V, KQV, mask, ctx.pool(), ctx.stream());
break;
default:
GGML_ASSERT(false);
break;
}
return;
}
if (Q->ne[1] <= 4) {
constexpr int cols_per_block = 4;
constexpr int parallel_blocks = 4;
switch (Q->ne[0]) {
case 64:
launch_fattn_vec_f16< 64, cols_per_block, parallel_blocks>(Q, K, V, KQV, mask, ctx.pool(), ctx.stream());
break;
case 128:
launch_fattn_vec_f16<128, cols_per_block, parallel_blocks>(Q, K, V, KQV, mask, ctx.pool(), ctx.stream());
break;
default:
GGML_ASSERT(false);
break;
}
return;
}
if (Q->ne[1] <= 8) {
constexpr int cols_per_block = 8;
constexpr int parallel_blocks = 4;
switch (Q->ne[0]) {
case 64:
launch_fattn_vec_f16< 64, cols_per_block, parallel_blocks>(Q, K, V, KQV, mask, ctx.pool(), ctx.stream());
break;
case 128:
launch_fattn_vec_f16<128, cols_per_block, parallel_blocks>(Q, K, V, KQV, mask, ctx.pool(), ctx.stream());
break;
default:
GGML_ASSERT(false);
break;
}
return;
}
constexpr int cols_per_block = 8;
constexpr int parallel_blocks = 1;
switch (Q->ne[0]) {
case 64:
launch_fattn_vec_f16< 64, cols_per_block, parallel_blocks>(Q, K, V, KQV, mask, ctx.pool(), ctx.stream());
break;
case 128:
launch_fattn_vec_f16<128, cols_per_block, parallel_blocks>(Q, K, V, KQV, mask, ctx.pool(), ctx.stream());
break;
default:
GGML_ASSERT(false);
break;
}
return;
}
if (precision != GGML_PREC_DEFAULT) {
if (Q->ne[1] <= 32 || Q->ne[0] > 128) {
@ -845,16 +1037,17 @@ void ggml_cuda_flash_attn_ext(ggml_backend_cuda_context & ctx, ggml_tensor * dst
}
if (Q->ne[1] == 1 && Q->ne[0] % (2*WARP_SIZE) == 0) {
constexpr int cols_per_block = 1;
constexpr int parallel_blocks = 4;
switch (Q->ne[0]) {
case 64:
launch_fattn_vec_f16< 64, parallel_blocks>(Q, K, V, KQV, mask, ctx.pool(), ctx.stream());
launch_fattn_vec_f16< 64, cols_per_block, parallel_blocks>(Q, K, V, KQV, mask, ctx.pool(), ctx.stream());
break;
case 128:
launch_fattn_vec_f16<128, parallel_blocks>(Q, K, V, KQV, mask, ctx.pool(), ctx.stream());
launch_fattn_vec_f16<128, cols_per_block, parallel_blocks>(Q, K, V, KQV, mask, ctx.pool(), ctx.stream());
break;
case 256:
launch_fattn_vec_f16<256, parallel_blocks>(Q, K, V, KQV, mask, ctx.pool(), ctx.stream());
launch_fattn_vec_f16<256, cols_per_block, parallel_blocks>(Q, K, V, KQV, mask, ctx.pool(), ctx.stream());
break;
default:
GGML_ASSERT(false);

View file

@ -11,7 +11,7 @@ __device__ float __forceinline__ t2f32<half>(half val) {
}
template <bool vals_smem, int ncols_template, int block_size_template, typename T>
static __global__ void soft_max_f32(const float * x, const T * mask, const T * pos, float * dst, const int ncols_par, const int nrows_y, const float scale, const float max_bias, const float m0, const float m1, uint32_t n_head_log2) {
static __global__ void soft_max_f32(const float * x, const T * mask, float * dst, const int ncols_par, const int nrows_y, const float scale, const float max_bias, const float m0, const float m1, uint32_t n_head_log2) {
const int ncols = ncols_template == 0 ? ncols_par : ncols_template;
const int tid = threadIdx.x;
@ -23,16 +23,16 @@ static __global__ void soft_max_f32(const float * x, const T * mask, const T * p
const int warp_id = threadIdx.x / WARP_SIZE;
const int lane_id = threadIdx.x % WARP_SIZE;
float slope = 0.0f;
float slope = 1.0f;
// ALiBi
if (max_bias > 0.0f) {
const int h = rowx/nrows_y; // head index
const float base = h < n_head_log2 ? m0 : m1;
const int exp = h < n_head_log2 ? h + 1 : 2*(h - n_head_log2) + 1;
const int exph = h < n_head_log2 ? h + 1 : 2*(h - n_head_log2) + 1;
slope = powf(base, exp);
slope = powf(base, exph);
}
extern __shared__ float data_soft_max_f32[];
@ -53,7 +53,7 @@ static __global__ void soft_max_f32(const float * x, const T * mask, const T * p
const int64_t ix = (int64_t)rowx*ncols + col;
const int64_t iy = (int64_t)rowy*ncols + col;
const float val = x[ix]*scale + (mask ? t2f32(mask[iy]) : 0.0f) + (pos ? slope*t2f32(pos[col]) : 0.0f);
const float val = x[ix]*scale + (mask ? slope*t2f32(mask[iy]) : 0.0f);
vals[col] = val;
max_val = max(max_val, val);
@ -125,7 +125,7 @@ static __global__ void soft_max_f32(const float * x, const T * mask, const T * p
}
template<typename T>
static void soft_max_f32_cuda(const float * x, const T * mask, const T * pos, float * dst, const int ncols_x, const int nrows_x, const int nrows_y, const float scale, const float max_bias, cudaStream_t stream) {
static void soft_max_f32_cuda(const float * x, const T * mask, float * dst, const int ncols_x, const int nrows_x, const int nrows_y, const float scale, const float max_bias, cudaStream_t stream) {
int nth = WARP_SIZE;
while (nth < ncols_x && nth < CUDA_SOFT_MAX_BLOCK_SIZE) nth *= 2;
const dim3 block_dims(nth, 1, 1);
@ -133,8 +133,8 @@ static void soft_max_f32_cuda(const float * x, const T * mask, const T * pos, fl
const size_t shmem = (GGML_PAD(ncols_x, WARP_SIZE) + WARP_SIZE)*sizeof(float);
static_assert(CUDA_SOFT_MAX_BLOCK_SIZE == 1024, "These values need to be adjusted.");
const uint32_t n_head_kv = nrows_x/nrows_y;
const uint32_t n_head_log2 = 1u << (uint32_t) floorf(log2f((float) n_head_kv));
const uint32_t n_head = nrows_x/nrows_y;
const uint32_t n_head_log2 = 1u << (uint32_t) floorf(log2f((float) n_head));
const float m0 = powf(2.0f, -(max_bias ) / n_head_log2);
const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_head_log2);
@ -142,43 +142,42 @@ static void soft_max_f32_cuda(const float * x, const T * mask, const T * pos, fl
if (shmem < ggml_cuda_info().devices[ggml_cuda_get_device()].smpb) {
switch (ncols_x) {
case 32:
soft_max_f32<true, 32, 32><<<block_nums, block_dims, shmem, stream>>>(x, mask, pos, dst, ncols_x, nrows_y, scale, max_bias, m0, m1, n_head_log2);
soft_max_f32<true, 32, 32><<<block_nums, block_dims, shmem, stream>>>(x, mask, dst, ncols_x, nrows_y, scale, max_bias, m0, m1, n_head_log2);
break;
case 64:
soft_max_f32<true, 64, 64><<<block_nums, block_dims, shmem, stream>>>(x, mask, pos, dst, ncols_x, nrows_y, scale, max_bias, m0, m1, n_head_log2);
soft_max_f32<true, 64, 64><<<block_nums, block_dims, shmem, stream>>>(x, mask, dst, ncols_x, nrows_y, scale, max_bias, m0, m1, n_head_log2);
break;
case 128:
soft_max_f32<true, 128, 128><<<block_nums, block_dims, shmem, stream>>>(x, mask, pos, dst, ncols_x, nrows_y, scale, max_bias, m0, m1, n_head_log2);
soft_max_f32<true, 128, 128><<<block_nums, block_dims, shmem, stream>>>(x, mask, dst, ncols_x, nrows_y, scale, max_bias, m0, m1, n_head_log2);
break;
case 256:
soft_max_f32<true, 256, 256><<<block_nums, block_dims, shmem, stream>>>(x, mask, pos, dst, ncols_x, nrows_y, scale, max_bias, m0, m1, n_head_log2);
soft_max_f32<true, 256, 256><<<block_nums, block_dims, shmem, stream>>>(x, mask, dst, ncols_x, nrows_y, scale, max_bias, m0, m1, n_head_log2);
break;
case 512:
soft_max_f32<true, 512, 512><<<block_nums, block_dims, shmem, stream>>>(x, mask, pos, dst, ncols_x, nrows_y, scale, max_bias, m0, m1, n_head_log2);
soft_max_f32<true, 512, 512><<<block_nums, block_dims, shmem, stream>>>(x, mask, dst, ncols_x, nrows_y, scale, max_bias, m0, m1, n_head_log2);
break;
case 1024:
soft_max_f32<true, 1024, 1024><<<block_nums, block_dims, shmem, stream>>>(x, mask, pos, dst, ncols_x, nrows_y, scale, max_bias, m0, m1, n_head_log2);
soft_max_f32<true, 1024, 1024><<<block_nums, block_dims, shmem, stream>>>(x, mask, dst, ncols_x, nrows_y, scale, max_bias, m0, m1, n_head_log2);
break;
case 2048:
soft_max_f32<true, 2048, 1024><<<block_nums, block_dims, shmem, stream>>>(x, mask, pos, dst, ncols_x, nrows_y, scale, max_bias, m0, m1, n_head_log2);
soft_max_f32<true, 2048, 1024><<<block_nums, block_dims, shmem, stream>>>(x, mask, dst, ncols_x, nrows_y, scale, max_bias, m0, m1, n_head_log2);
break;
case 4096:
soft_max_f32<true, 4096, 1024><<<block_nums, block_dims, shmem, stream>>>(x, mask, pos, dst, ncols_x, nrows_y, scale, max_bias, m0, m1, n_head_log2);
soft_max_f32<true, 4096, 1024><<<block_nums, block_dims, shmem, stream>>>(x, mask, dst, ncols_x, nrows_y, scale, max_bias, m0, m1, n_head_log2);
break;
default:
soft_max_f32<true, 0, 0><<<block_nums, block_dims, shmem, stream>>>(x, mask, pos, dst, ncols_x, nrows_y, scale, max_bias, m0, m1, n_head_log2);
soft_max_f32<true, 0, 0><<<block_nums, block_dims, shmem, stream>>>(x, mask, dst, ncols_x, nrows_y, scale, max_bias, m0, m1, n_head_log2);
break;
}
} else {
const size_t shmem_low = WARP_SIZE*sizeof(float);
soft_max_f32<false, 0, 0><<<block_nums, block_dims, shmem_low, stream>>>(x, mask, pos, dst, ncols_x, nrows_y, scale, max_bias, m0, m1, n_head_log2);
soft_max_f32<false, 0, 0><<<block_nums, block_dims, shmem_low, stream>>>(x, mask, dst, ncols_x, nrows_y, scale, max_bias, m0, m1, n_head_log2);
}
}
void ggml_cuda_op_soft_max(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
const ggml_tensor * src0 = dst->src[0];
const ggml_tensor * src1 = dst->src[1];
const ggml_tensor * src2 = dst->src[2];
const float * src0_d = (const float *)src0->data;
const void * src1_d = src1 ? (const void *)src1->data : nullptr;
@ -190,7 +189,6 @@ void ggml_cuda_op_soft_max(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
GGML_ASSERT( dst->type == GGML_TYPE_F32);
GGML_ASSERT(!src1 || src1->type == GGML_TYPE_F16 || src1->type == GGML_TYPE_F32); // src1 contains mask and it is optional
GGML_ASSERT(!src2 || src2->type == GGML_TYPE_F16 || src2->type == GGML_TYPE_F32); // src2 contains positions and it is optional
const int64_t ne00 = src0->ne[0];
const int64_t nrows_x = ggml_nrows(src0);
@ -202,26 +200,15 @@ void ggml_cuda_op_soft_max(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
memcpy(&scale, (float *) dst->op_params + 0, sizeof(float));
memcpy(&max_bias, (float *) dst->op_params + 1, sizeof(float));
// positions tensor
void * src2_d = nullptr;
const bool use_src2 = src2 != nullptr;
if (use_src2) {
src2_d = (void *)src2->data;
}
const bool use_f16 = (src1 && src1->type == GGML_TYPE_F16) || (src2 && src2->type == GGML_TYPE_F16);
const bool use_f16 = (src1 && src1->type == GGML_TYPE_F16);
if (use_f16) {
const half * src1_dd = (const half *)src1_d;
const half * src2_dd = (const half *)src2_d;
soft_max_f32_cuda(src0_d, src1_dd, src2_dd, dst_d, ne00, nrows_x, nrows_y, scale, max_bias, stream);
soft_max_f32_cuda(src0_d, src1_dd, dst_d, ne00, nrows_x, nrows_y, scale, max_bias, stream);
} else {
const float * src1_dd = (const float *)src1_d;
const float * src2_dd = (const float *)src2_d;
soft_max_f32_cuda(src0_d, src1_dd, src2_dd, dst_d, ne00, nrows_x, nrows_y, scale, max_bias, stream);
soft_max_f32_cuda(src0_d, src1_dd, dst_d, ne00, nrows_x, nrows_y, scale, max_bias, stream);
}
}

View file

@ -1561,10 +1561,9 @@ static void ggml_vk_graph_compute(struct ggml_kompute_context * ctx, struct ggml
float scale;
memcpy(&scale, dst->op_params, sizeof(float));
#pragma message("TODO: add ggml_vk_soft_max() F16/F32 src1 and src2 support")
#pragma message("TODO: add ggml_vk_soft_max() F16 src1 support")
#pragma message("ref: https://github.com/ggerganov/llama.cpp/pull/5021")
GGML_ASSERT(!src1 || src1t == GGML_TYPE_F32);
GGML_ASSERT(src2 == nullptr);
ggml_vk_soft_max(seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst, ne00, ne01, ne02, ne03, scale);
} break;

View file

@ -169,7 +169,6 @@ enum ggml_metal_kernel_type {
GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ4_XS_F32,
GGML_METAL_KERNEL_TYPE_ROPE_F32,
GGML_METAL_KERNEL_TYPE_ROPE_F16,
GGML_METAL_KERNEL_TYPE_ALIBI_F32,
GGML_METAL_KERNEL_TYPE_IM2COL_F16,
GGML_METAL_KERNEL_TYPE_IM2COL_F32,
GGML_METAL_KERNEL_TYPE_UPSCALE_F32,
@ -623,7 +622,6 @@ static struct ggml_metal_context * ggml_metal_init(int n_cb) {
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ4_XS_F32, mul_mm_id_iq4_xs_f32, ctx->support_simdgroup_mm);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ROPE_F32, rope_f32, true);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ROPE_F16, rope_f16, true);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ALIBI_F32, alibi_f32, true);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_IM2COL_F16, im2col_f16, true);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_IM2COL_F32, im2col_f32, true);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_UPSCALE_F32, upscale_f32, true);
@ -759,7 +757,6 @@ static bool ggml_metal_supports_op(const struct ggml_metal_context * ctx, const
case GGML_OP_GROUP_NORM:
return ctx->support_simdgroup_reduction;
case GGML_OP_NORM:
case GGML_OP_ALIBI:
case GGML_OP_ROPE:
case GGML_OP_IM2COL:
return true;
@ -1357,13 +1354,12 @@ static enum ggml_status ggml_metal_graph_compute(
case GGML_OP_SOFT_MAX:
{
GGML_ASSERT(!src1 || src1->type == GGML_TYPE_F16 || src1->type == GGML_TYPE_F32);
GGML_ASSERT(!src2 || src2->type == GGML_TYPE_F16 || src2->type == GGML_TYPE_F32);
int nth = 32; // SIMD width
id<MTLComputePipelineState> pipeline = nil;
const bool use_f16 = (src1 && src1->type == GGML_TYPE_F16) || (src2 && src2->type == GGML_TYPE_F16);
const bool use_f16 = (src1 && src1->type == GGML_TYPE_F16);
if (ne00%4 == 0) {
while (nth < ne00/4 && nth < 256) {
@ -1394,8 +1390,8 @@ static enum ggml_status ggml_metal_graph_compute(
const int64_t nrows_x = ggml_nrows(src0);
const int64_t nrows_y = src0->ne[1];
const uint32_t n_head_kv = nrows_x/nrows_y;
const uint32_t n_head_log2 = 1u << (uint32_t) floorf(log2f((float) n_head_kv));
const uint32_t n_head = nrows_x/nrows_y;
const uint32_t n_head_log2 = 1u << (uint32_t) floorf(log2f((float) n_head));
const float m0 = powf(2.0f, -(max_bias ) / n_head_log2);
const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_head_log2);
@ -1407,20 +1403,15 @@ static enum ggml_status ggml_metal_graph_compute(
} else {
[encoder setBuffer:id_src0 offset:offs_src0 atIndex:1];
}
if (id_src2) {
[encoder setBuffer:id_src2 offset:offs_src2 atIndex:2];
} else {
[encoder setBuffer:id_src0 offset:offs_src0 atIndex:2];
}
[encoder setBuffer:id_dst offset:offs_dst atIndex:3];
[encoder setBytes:&ne00 length:sizeof(ne00) atIndex:4];
[encoder setBytes:&ne01 length:sizeof(ne01) atIndex:5];
[encoder setBytes:&ne02 length:sizeof(ne02) atIndex:6];
[encoder setBytes:&scale length:sizeof(scale) atIndex:7];
[encoder setBytes:&max_bias length:sizeof(max_bias) atIndex:8];
[encoder setBytes:&m0 length:sizeof(m0) atIndex:9];
[encoder setBytes:&m1 length:sizeof(m1) atIndex:10];
[encoder setBytes:&n_head_log2 length:sizeof(n_head_log2) atIndex:11];
[encoder setBuffer:id_dst offset:offs_dst atIndex:2];
[encoder setBytes:&ne00 length:sizeof(ne00) atIndex:3];
[encoder setBytes:&ne01 length:sizeof(ne01) atIndex:4];
[encoder setBytes:&ne02 length:sizeof(ne02) atIndex:5];
[encoder setBytes:&scale length:sizeof(scale) atIndex:6];
[encoder setBytes:&max_bias length:sizeof(max_bias) atIndex:7];
[encoder setBytes:&m0 length:sizeof(m0) atIndex:8];
[encoder setBytes:&m1 length:sizeof(m1) atIndex:9];
[encoder setBytes:&n_head_log2 length:sizeof(n_head_log2) atIndex:10];
[encoder setThreadgroupMemoryLength:32*sizeof(float) atIndex:0];
[encoder dispatchThreadgroups:MTLSizeMake(ne01*ne02*ne03, 1, 1) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
@ -2225,49 +2216,6 @@ static enum ggml_status ggml_metal_graph_compute(
[encoder dispatchThreadgroups:MTLSizeMake(nrows, 1, 1) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
} break;
case GGML_OP_ALIBI:
{
GGML_ASSERT((src0t == GGML_TYPE_F32));
const int nth = MIN(1024, ne00);
//const int n_past = ((int32_t *) dst->op_params)[0];
const int n_head = ((int32_t *) dst->op_params)[1];
float max_bias;
memcpy(&max_bias, (int32_t *) dst->op_params + 2, sizeof(float));
const int n_heads_log2_floor = 1 << (int) floor(log2(n_head));
const float m0 = powf(2.0f, -(max_bias) / n_heads_log2_floor);
const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_heads_log2_floor);
id<MTLComputePipelineState> pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ALIBI_F32].pipeline;
[encoder setComputePipelineState:pipeline];
[encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
[encoder setBuffer:id_dst offset:offs_dst atIndex:1];
[encoder setBytes:&ne00 length:sizeof( int64_t) atIndex:2];
[encoder setBytes:&ne01 length:sizeof( int64_t) atIndex:3];
[encoder setBytes:&ne02 length:sizeof( int64_t) atIndex:4];
[encoder setBytes:&ne03 length:sizeof( int64_t) atIndex:5];
[encoder setBytes:&nb00 length:sizeof(uint64_t) atIndex:6];
[encoder setBytes:&nb01 length:sizeof(uint64_t) atIndex:7];
[encoder setBytes:&nb02 length:sizeof(uint64_t) atIndex:8];
[encoder setBytes:&nb03 length:sizeof(uint64_t) atIndex:9];
[encoder setBytes:&ne0 length:sizeof( int64_t) atIndex:10];
[encoder setBytes:&ne1 length:sizeof( int64_t) atIndex:11];
[encoder setBytes:&ne2 length:sizeof( int64_t) atIndex:12];
[encoder setBytes:&ne3 length:sizeof( int64_t) atIndex:13];
[encoder setBytes:&nb0 length:sizeof(uint64_t) atIndex:14];
[encoder setBytes:&nb1 length:sizeof(uint64_t) atIndex:15];
[encoder setBytes:&nb2 length:sizeof(uint64_t) atIndex:16];
[encoder setBytes:&nb3 length:sizeof(uint64_t) atIndex:17];
[encoder setBytes:&m0 length:sizeof( float) atIndex:18];
[encoder setBytes:&m1 length:sizeof( float) atIndex:19];
[encoder setBytes:&n_heads_log2_floor length:sizeof(int) atIndex:20];
[encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
} break;
case GGML_OP_ROPE:
{
GGML_ASSERT(ne10 == ne02);
@ -2565,7 +2513,7 @@ static enum ggml_status ggml_metal_graph_compute(
"the Flash-Attention Metal kernel requires the mask to be padded to 8 and at least n_queries big");
const int64_t ne30 = src3 ? src3->ne[0] : 0; GGML_UNUSED(ne30);
const int64_t ne31 = src3 ? src3->ne[1] : 0;
//const int64_t ne31 = src3 ? src3->ne[1] : 0;
const int64_t ne32 = src3 ? src3->ne[2] : 0; GGML_UNUSED(ne32);
const int64_t ne33 = src3 ? src3->ne[3] : 0; GGML_UNUSED(ne33);
@ -2577,7 +2525,16 @@ static enum ggml_status ggml_metal_graph_compute(
const enum ggml_type src2t = src2 ? src2->type : GGML_TYPE_COUNT; GGML_UNUSED(src2t);
float scale;
memcpy(&scale, dst->op_params, sizeof(float));
float max_bias;
memcpy(&scale, ((int32_t *) dst->op_params) + 0, sizeof(scale));
memcpy(&max_bias, ((int32_t *) dst->op_params) + 1, sizeof(max_bias));
const uint32_t n_head = src0->ne[2];
const uint32_t n_head_log2 = 1u << (uint32_t) floorf(log2f((float) n_head));
const float m0 = powf(2.0f, -(max_bias ) / n_head_log2);
const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_head_log2);
id<MTLComputePipelineState> pipeline = nil;
@ -2614,34 +2571,37 @@ static enum ggml_status ggml_metal_graph_compute(
}
[encoder setComputePipelineState:pipeline];
[encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
[encoder setBuffer:id_src1 offset:offs_src1 atIndex:1];
[encoder setBuffer:id_src2 offset:offs_src2 atIndex:2];
[encoder setBuffer:id_src3 offset:offs_src3 atIndex:3];
[encoder setBuffer:id_dst offset:offs_dst atIndex:4];
[encoder setBytes:&ne00 length:sizeof( int64_t) atIndex:5];
[encoder setBytes:&ne01 length:sizeof( int64_t) atIndex:6];
[encoder setBytes:&ne02 length:sizeof( int64_t) atIndex:7];
[encoder setBytes:&ne03 length:sizeof( int64_t) atIndex:8];
[encoder setBytes:&nb00 length:sizeof(uint64_t) atIndex:9];
[encoder setBytes:&nb01 length:sizeof(uint64_t) atIndex:10];
[encoder setBytes:&nb02 length:sizeof(uint64_t) atIndex:11];
[encoder setBytes:&nb03 length:sizeof(uint64_t) atIndex:12];
[encoder setBytes:&ne10 length:sizeof( int64_t) atIndex:13];
[encoder setBytes:&ne11 length:sizeof( int64_t) atIndex:14];
[encoder setBytes:&ne12 length:sizeof( int64_t) atIndex:15];
[encoder setBytes:&ne13 length:sizeof( int64_t) atIndex:16];
[encoder setBytes:&nb10 length:sizeof(uint64_t) atIndex:17];
[encoder setBytes:&nb11 length:sizeof(uint64_t) atIndex:18];
[encoder setBytes:&nb12 length:sizeof(uint64_t) atIndex:19];
[encoder setBytes:&nb13 length:sizeof(uint64_t) atIndex:20];
[encoder setBytes:&ne31 length:sizeof( int64_t) atIndex:21];
[encoder setBytes:&nb31 length:sizeof(uint64_t) atIndex:22];
[encoder setBytes:&ne0 length:sizeof( int64_t) atIndex:23];
[encoder setBytes:&ne1 length:sizeof( int64_t) atIndex:24];
[encoder setBytes:&ne2 length:sizeof( int64_t) atIndex:25];
[encoder setBytes:&ne3 length:sizeof( int64_t) atIndex:26];
[encoder setBytes:&scale length:sizeof( float) atIndex:27];
[encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
[encoder setBuffer:id_src1 offset:offs_src1 atIndex:1];
[encoder setBuffer:id_src2 offset:offs_src2 atIndex:2];
[encoder setBuffer:id_src3 offset:offs_src3 atIndex:3];
[encoder setBuffer:id_dst offset:offs_dst atIndex:4];
[encoder setBytes:&ne00 length:sizeof( int64_t) atIndex:5];
[encoder setBytes:&ne01 length:sizeof( int64_t) atIndex:6];
[encoder setBytes:&ne02 length:sizeof( int64_t) atIndex:7];
[encoder setBytes:&ne03 length:sizeof( int64_t) atIndex:8];
[encoder setBytes:&nb00 length:sizeof(uint64_t) atIndex:9];
[encoder setBytes:&nb01 length:sizeof(uint64_t) atIndex:10];
[encoder setBytes:&nb02 length:sizeof(uint64_t) atIndex:11];
[encoder setBytes:&nb03 length:sizeof(uint64_t) atIndex:12];
[encoder setBytes:&ne10 length:sizeof( int64_t) atIndex:13];
[encoder setBytes:&ne11 length:sizeof( int64_t) atIndex:14];
[encoder setBytes:&ne12 length:sizeof( int64_t) atIndex:15];
[encoder setBytes:&ne13 length:sizeof( int64_t) atIndex:16];
[encoder setBytes:&nb10 length:sizeof(uint64_t) atIndex:17];
[encoder setBytes:&nb11 length:sizeof(uint64_t) atIndex:18];
[encoder setBytes:&nb12 length:sizeof(uint64_t) atIndex:19];
[encoder setBytes:&nb13 length:sizeof(uint64_t) atIndex:20];
[encoder setBytes:&nb31 length:sizeof(uint64_t) atIndex:21];
[encoder setBytes:&ne0 length:sizeof( int64_t) atIndex:22];
[encoder setBytes:&ne1 length:sizeof( int64_t) atIndex:23];
[encoder setBytes:&ne2 length:sizeof( int64_t) atIndex:24];
[encoder setBytes:&ne3 length:sizeof( int64_t) atIndex:25];
[encoder setBytes:&scale length:sizeof( float) atIndex:26];
[encoder setBytes:&max_bias length:sizeof( float) atIndex:27];
[encoder setBytes:&m0 length:sizeof(m0) atIndex:28];
[encoder setBytes:&m1 length:sizeof(m1) atIndex:29];
[encoder setBytes:&n_head_log2 length:sizeof(n_head_log2) atIndex:30];
if (!use_vec_kernel) {
// half8x8 kernel

View file

@ -356,7 +356,6 @@ template<typename T>
kernel void kernel_soft_max(
device const char * src0,
device const char * src1,
device const char * src2,
device char * dst,
constant int64_t & ne00,
constant int64_t & ne01,
@ -378,10 +377,9 @@ kernel void kernel_soft_max(
device const float * psrc0 = (device const float *) src0 + (i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00);
device const T * pmask = src1 != src0 ? (device const T *) src1 + i01*ne00 : nullptr;
device const T * ppos = src2 != src0 ? (device const T *) src2 : nullptr;
device float * pdst = (device float *) dst + (i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00);
float slope = 0.0f;
float slope = 1.0f;
// ALiBi
if (max_bias > 0.0f) {
@ -397,7 +395,7 @@ kernel void kernel_soft_max(
float lmax = -INFINITY;
for (int i00 = tpitg; i00 < ne00; i00 += ntg) {
lmax = MAX(lmax, psrc0[i00]*scale + (pmask ? pmask[i00] : 0.0f) + (ppos ? slope*ppos[i00] : 0.0f));
lmax = MAX(lmax, psrc0[i00]*scale + (pmask ? slope*pmask[i00] : 0.0f));
}
// find the max value in the block
@ -422,7 +420,7 @@ kernel void kernel_soft_max(
// parallel sum
float lsum = 0.0f;
for (int i00 = tpitg; i00 < ne00; i00 += ntg) {
const float exp_psrc0 = exp((psrc0[i00]*scale + (pmask ? pmask[i00] : 0.0f) + (ppos ? slope*ppos[i00] : 0.0f)) - max_val);
const float exp_psrc0 = exp((psrc0[i00]*scale + (pmask ? slope*pmask[i00] : 0.0f)) - max_val);
lsum += exp_psrc0;
pdst[i00] = exp_psrc0;
}
@ -461,7 +459,6 @@ template<typename T>
kernel void kernel_soft_max_4(
device const char * src0,
device const char * src1,
device const char * src2,
device char * dst,
constant int64_t & ne00,
constant int64_t & ne01,
@ -483,10 +480,9 @@ kernel void kernel_soft_max_4(
device const float4 * psrc4 = (device const float4 *) src0 + (i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00)/4;
device const T * pmask = src1 != src0 ? (device const T *) src1 + i01*ne00/4 : nullptr;
device const T * ppos = src2 != src0 ? (device const T *) src2 : nullptr;
device float4 * pdst4 = (device float4 *) dst + (i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00)/4;
float slope = 0.0f;
float slope = 1.0f;
if (max_bias > 0.0f) {
const int64_t h = i02;
@ -501,7 +497,7 @@ kernel void kernel_soft_max_4(
float4 lmax4 = -INFINITY;
for (int i00 = tpitg; i00 < ne00/4; i00 += ntg) {
lmax4 = fmax(lmax4, psrc4[i00]*scale + (float4)((pmask ? pmask[i00] : 0.0f) + (ppos ? slope*ppos[i00] : 0.0f)));
lmax4 = fmax(lmax4, psrc4[i00]*scale + (float4)((pmask ? slope*pmask[i00] : 0.0f)));
}
const float lmax = MAX(MAX(lmax4[0], lmax4[1]), MAX(lmax4[2], lmax4[3]));
@ -527,7 +523,7 @@ kernel void kernel_soft_max_4(
// parallel sum
float4 lsum4 = 0.0f;
for (int i00 = tpitg; i00 < ne00/4; i00 += ntg) {
const float4 exp_psrc4 = exp((psrc4[i00]*scale + (float4)((pmask ? pmask[i00] : 0.0f) + (ppos ? slope*ppos[i00] : 0.0f))) - max_val);
const float4 exp_psrc4 = exp((psrc4[i00]*scale + (float4)((pmask ? slope*pmask[i00] : 0.0f))) - max_val);
lsum4 += exp_psrc4;
pdst4[i00] = exp_psrc4;
}
@ -1595,60 +1591,6 @@ kernel void kernel_mul_mv_f16_f32_l4(
}
}
kernel void kernel_alibi_f32(
device const float * src0,
device float * dst,
constant int64_t & ne00,
constant int64_t & ne01,
constant int64_t & ne02,
constant int64_t & ne03,
constant uint64_t & nb00,
constant uint64_t & nb01,
constant uint64_t & nb02,
constant uint64_t & nb03,
constant int64_t & ne0,
constant int64_t & ne1,
constant int64_t & ne2,
constant int64_t & ne3,
constant uint64_t & nb0,
constant uint64_t & nb1,
constant uint64_t & nb2,
constant uint64_t & nb3,
constant float & m0,
constant float & m1,
constant int & n_heads_log2_floor,
uint3 tgpig[[threadgroup_position_in_grid]],
uint3 tpitg[[thread_position_in_threadgroup]],
uint3 ntg[[threads_per_threadgroup]]) {
const int64_t i03 = tgpig[2];
const int64_t i02 = tgpig[1];
const int64_t i01 = tgpig[0];
const int64_t n = i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00;
const int64_t i3 = n / (ne2*ne1*ne0);
const int64_t i2 = (n - i3*ne2*ne1*ne0) / (ne1*ne0);
const int64_t i1 = (n - i3*ne2*ne1*ne0 - i2*ne1*ne0) / ne0;
//const int64_t i0 = (n - i3*ne2*ne1*ne0 - i2*ne1*ne0 - i1*ne0);
const int64_t k = i3*ne3 + i2;
float m_k;
if (k < n_heads_log2_floor) {
m_k = pow(m0, k + 1);
} else {
m_k = pow(m1, 2 * (k - n_heads_log2_floor) + 1);
}
device char * dst_row = (device char *) dst + i3*nb3 + i2*nb2 + i1*nb1;
device const char * src_row = (device char *) src0 + i03*nb03 + i02*nb02 + i01*nb01;
for (int64_t i00 = tpitg.x; i00 < ne00; i00 += ntg.x) {
const float src_v = *(device float *)(src_row + i00*nb00);
device float * dst_v = (device float *)(dst_row + i00*nb0);
*dst_v = i00 * m_k + src_v;
}
}
static float rope_yarn_ramp(const float low, const float high, const int i0) {
const float y = (i0 / 2 - low) / max(0.001f, high - low);
return 1.0f - min(1.0f, max(0.0f, y));
@ -2116,13 +2058,16 @@ typedef void (flash_attn_ext_f16_t)(
constant uint64_t & nb11,
constant uint64_t & nb12,
constant uint64_t & nb13,
constant int64_t & ne31,
constant uint64_t & nb31,
constant int64_t & ne0,
constant int64_t & ne1,
constant int64_t & ne2,
constant int64_t & ne3,
constant float & scale,
constant float & max_bias,
constant float & m0,
constant float & m1,
constant uint32_t & n_head_log2,
threadgroup half * shared,
uint3 tgpig[[threadgroup_position_in_grid]],
uint3 tpitg[[thread_position_in_threadgroup]],
@ -2154,13 +2099,16 @@ kernel void kernel_flash_attn_ext_f16(
constant uint64_t & nb11,
constant uint64_t & nb12,
constant uint64_t & nb13,
constant int64_t & ne31,
constant uint64_t & nb31,
constant int64_t & ne0,
constant int64_t & ne1,
constant int64_t & ne2,
constant int64_t & ne3,
constant float & scale,
constant float & max_bias,
constant float & m0,
constant float & m1,
constant uint32_t & n_head_log2,
threadgroup half * shared [[threadgroup(0)]],
uint3 tgpig[[threadgroup_position_in_grid]],
uint3 tpitg[[thread_position_in_threadgroup]],
@ -2257,6 +2205,19 @@ kernel void kernel_flash_attn_ext_f16(
// prepare diagonal scale matrix
simdgroup_float8x8 mscale(scale);
// prepare diagonal slope matrix
simdgroup_float8x8 mslope(1.0f);
// ALiBi
if (max_bias > 0.0f) {
const short h = iq2;
const float base = h < n_head_log2 ? m0 : m1;
const int exph = h < n_head_log2 ? h + 1 : 2*(h - n_head_log2) + 1;
mslope = simdgroup_float8x8(pow(base, exph));
}
// loop over the KV cache
// each simdgroup handles blocks of Q rows and C columns
for (int ic0 = 0; ic0 < ne11; ic0 += C*nsg) {
@ -2279,9 +2240,10 @@ kernel void kernel_flash_attn_ext_f16(
simdgroup_multiply_accumulate(mqk, mq[i], mk, mqk);
}
// mqk = mqk*scale + mask
// mqk = mqk*scale + mask*slope
simdgroup_half8x8 mm;
simdgroup_load(mm, mp + ic + 8*cc, nb31/sizeof(half), 0, false);
simdgroup_multiply(mm, mslope, mm);
simdgroup_multiply_accumulate(mqk, mqk, mscale, mm);
simdgroup_store(mqk, ss + 8*cc, TF, 0, false);
@ -2472,13 +2434,16 @@ kernel void kernel_flash_attn_ext_vec_f16(
constant uint64_t & nb11,
constant uint64_t & nb12,
constant uint64_t & nb13,
constant int64_t & ne31,
constant uint64_t & nb31,
constant int64_t & ne0,
constant int64_t & ne1,
constant int64_t & ne2,
constant int64_t & ne3,
constant float & scale,
constant float & max_bias,
constant float & m0,
constant float & m1,
constant uint32_t & n_head_log2,
threadgroup half * shared [[threadgroup(0)]],
uint3 tgpig[[threadgroup_position_in_grid]],
uint3 tpitg[[thread_position_in_threadgroup]],
@ -2497,6 +2462,18 @@ kernel void kernel_flash_attn_ext_vec_f16(
const short T = D + 2*nsg*SH; // shared memory size per query in (half)
float slope = 1.0f;
// ALiBi
if (max_bias > 0.0f) {
const short h = iq2;
const float base = h < n_head_log2 ? m0 : m1;
const int exp = h < n_head_log2 ? h + 1 : 2*(h - n_head_log2) + 1;
slope = pow(base, exp);
}
//threadgroup half * sq = (threadgroup half *) (shared + 0*D); // holds the query data
threadgroup half4 * sq4 = (threadgroup half4 *) (shared + 0*D); // same as above but in half4
threadgroup float * ss = (threadgroup float *) (shared + 2*sgitg*SH + 1*D); // scratch buffer for attention and diagonal matrix
@ -2603,10 +2580,10 @@ kernel void kernel_flash_attn_ext_vec_f16(
mqk += simd_shuffle_down(mqk, 2);
mqk += simd_shuffle_down(mqk, 1);
// mqk = mqk*scale + mask
// mqk = mqk*scale + mask*slope
if (tiisg == 0) {
float4 mm = (float4) mp4[ic/4 + cc];
mqk = mqk*scale + mm;
mqk = mqk*scale + mm*slope;
ss4[cc] = mqk;
}
@ -2840,7 +2817,8 @@ kernel void kernel_cpy_f32_f16(
for (int64_t i00 = tpitg.x; i00 < ne00; i00 += ntg.x) {
device const float * src = (device float *)((device char *) src0 + i03*nb03 + i02*nb02 + i01*nb01 + i00*nb00);
dst_data[i00] = src[0];
// TODO: is there a better way to handle -INFINITY?
dst_data[i00] = src[0] == -INFINITY ? -MAXHALF : src[0];
}
}

View file

@ -2119,6 +2119,7 @@ static size_t ggml_backend_opencl_buffer_type_get_alignment(ggml_backend_buffer_
if (alignment == (cl_uint)-1) {
ggml_cl_init();
clGetDeviceInfo(device, CL_DEVICE_MEM_BASE_ADDR_ALIGN, sizeof(cl_uint), &alignment, NULL);
alignment /= 8; // bits to bytes
}
return alignment;

View file

@ -3154,7 +3154,6 @@ typedef float (*vec_dot_q_mul_mat_sycl_t)(
#define SYCL_SCALE_BLOCK_SIZE 256
#define SYCL_CLAMP_BLOCK_SIZE 256
#define SYCL_ROPE_BLOCK_SIZE 256
#define SYCL_ALIBI_BLOCK_SIZE 32
#define SYCL_DIAG_MASK_INF_BLOCK_SIZE 32
#define SYCL_QUANTIZE_BLOCK_SIZE 256
#define SYCL_DEQUANTIZE_BLOCK_SIZE 256
@ -8330,24 +8329,26 @@ static void mul_mat_vec_q(const void * __restrict__ vx, const void * __restrict_
const int blocks_per_row = ncols / qk;
const int blocks_per_warp = vdr * WARP_SIZE / qi;
// partial sum for each thread
const int qi_vdr = (qi / vdr); // N_threads processing 1 qk block
// partial sum for each thread
float tmp = 0.0f;
const block_q_t * x = (const block_q_t *) vx;
const block_q8_1 * y = (const block_q8_1 *) vy;
for (int i = item_ct1.get_local_id(2) / (qi / vdr); i < blocks_per_row;
for (int i = item_ct1.get_local_id(2) / qi_vdr; i < blocks_per_row;
i += blocks_per_warp) {
const int ibx = row*blocks_per_row + i; // x block index
const int ibx = row * blocks_per_row + i; // x block index
const int iby = i * (qk/QK8_1); // y block index that aligns with ibx
const int iby = i * (qk / QK8_1); // y block index that aligns with ibx
const int iqs =
vdr *
(item_ct1.get_local_id(2) %
(qi / vdr)); // x block quant index when casting the quants to int
const int iqs =
vdr *
(item_ct1.get_local_id(2) -
i * qi_vdr); // x block quant index when casting the quants to int
tmp += vec_dot_q_sycl(&x[ibx], &y[iby], iqs);
tmp += vec_dot_q_sycl(&x[ibx], &y[iby], iqs);
}
// sum up partial sums and write back result
@ -9314,32 +9315,6 @@ static void rope_glm_f32(
dst[i + half_n_dims * 3] = x2*sin_block_theta + x3*cos_block_theta;
}
static void alibi_f32(const float * x, float * dst, const int ncols, const int k_rows,
const int n_heads_log2_floor, const float m0, const float m1,
const sycl::nd_item<3> &item_ct1) {
const int col = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
item_ct1.get_local_id(2);
if (col >= ncols) {
return;
}
const int row = item_ct1.get_local_range(1) * item_ct1.get_group(1) +
item_ct1.get_local_id(1);
const int i = row*ncols + col;
const int k = row/k_rows;
float m_k;
if (k < n_heads_log2_floor) {
m_k = dpct::pow(m0, k + 1);
} else {
m_k = dpct::pow(m1, 2 * (k - n_heads_log2_floor) + 1);
}
dst[i] = col * m_k + x[i];
}
static void k_sum_rows_f32(const float * x, float * dst, const int ncols,
const sycl::nd_item<3> &item_ct1) {
const int row = item_ct1.get_group(1);
@ -9441,7 +9416,7 @@ static void diag_mask_inf_f32(const float * x, float * dst, const int ncols, con
template <bool vals_smem, int ncols_template, int block_size_template>
static void soft_max_f32(const float * x, const float * mask, const float *pos, float * dst, const int ncols_par,
static void soft_max_f32(const float * x, const float * mask, float * dst, const int ncols_par,
const int nrows_y, const float scale, const float max_bias, const float m0,
const float m1, uint32_t n_head_log2, const sycl::nd_item<3> &item_ct1, float *buf) {
const int ncols = ncols_template == 0 ? ncols_par : ncols_template;
@ -9455,7 +9430,7 @@ static void soft_max_f32(const float * x, const float * mask, const float *pos,
const int warp_id = item_ct1.get_local_id(2) / WARP_SIZE;
const int lane_id = item_ct1.get_local_id(2) % WARP_SIZE;
float slope = 0.0f;
float slope = 1.0f;
// ALiBi
if (max_bias > 0.0f) {
@ -9480,7 +9455,7 @@ static void soft_max_f32(const float * x, const float * mask, const float *pos,
const int ix = rowx*ncols + col;
const int iy = rowy*ncols + col;
const float val = x[ix]*scale + (mask ? mask[iy] : 0.0f) + (pos ? slope*pos[col] : 0.0f);
const float val = x[ix]*scale + (mask ? slope*mask[iy] : 0.0f);
vals[col] = val;
max_val = sycl::max(max_val, val);
@ -12962,20 +12937,6 @@ static void rope_glm_f32_sycl(const float *x, float *dst, int ncols, int nrows,
});
}
static void alibi_f32_sycl(const float *x, float *dst, const int ncols,
const int nrows, const int k_rows,
const int n_heads_log2_floor, const float m0,
const float m1, dpct::queue_ptr stream) {
const sycl::range<3> block_dims(1, 1, SYCL_ALIBI_BLOCK_SIZE);
const int num_blocks_x = (ncols + SYCL_ALIBI_BLOCK_SIZE - 1) / (SYCL_ALIBI_BLOCK_SIZE);
const sycl::range<3> block_nums(1, nrows, num_blocks_x);
stream->parallel_for(sycl::nd_range<3>(block_nums * block_dims, block_dims),
[=](sycl::nd_item<3> item_ct1) {
alibi_f32(x, dst, ncols, k_rows,
n_heads_log2_floor, m0, m1, item_ct1);
});
}
static void sum_rows_f32_sycl(const float *x, float *dst, const int ncols,
const int nrows, dpct::queue_ptr stream) {
const sycl::range<3> block_dims(1, 1, WARP_SIZE);
@ -13056,7 +13017,7 @@ static void diag_mask_inf_f32_sycl(const float *x, float *dst,
}
template <bool vals_smem, int ncols_template, int block_size_template>
static void soft_max_f32_submitter(const float * x, const float * mask, const float *pos, float * dst, const int ncols_par,
static void soft_max_f32_submitter(const float * x, const float * mask, float * dst, const int ncols_par,
const int nrows_y, const float scale, const float max_bias, const float m0,
const float m1, uint32_t n_head_log2, sycl::range<3> block_nums, sycl::range<3> block_dims,
const size_t n_local_scratch, dpct::queue_ptr stream) {
@ -13066,7 +13027,7 @@ static void soft_max_f32_submitter(const float * x, const float * mask, const fl
cgh.parallel_for(
sycl::nd_range<3>(block_nums * block_dims, block_dims),
[=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
soft_max_f32<vals_smem, ncols_template, block_size_template>(x, mask, pos, dst, ncols_par,
soft_max_f32<vals_smem, ncols_template, block_size_template>(x, mask, dst, ncols_par,
nrows_y, scale, max_bias, m0,
m1, n_head_log2, item_ct1,
local_buf_acc.get_pointer());
@ -13074,7 +13035,7 @@ static void soft_max_f32_submitter(const float * x, const float * mask, const fl
});
}
static void soft_max_f32_sycl(const float * x, const float * mask, const float * pos,
static void soft_max_f32_sycl(const float * x, const float * mask,
float * dst, const int ncols_x, const int nrows_x,
const int nrows_y, const float scale, const float max_bias,
dpct::queue_ptr stream) {
@ -13096,60 +13057,60 @@ static void soft_max_f32_sycl(const float * x, const float * mask, const float *
const size_t local_mem_size = stream->get_device().get_info<sycl::info::device::local_mem_size>();
if (n_local_scratch*sizeof(float) < local_mem_size) {
if (ncols_x > max_block_size) {
soft_max_f32_submitter<true, 0, 0>(x, mask, pos, dst, ncols_x, nrows_y, scale,
soft_max_f32_submitter<true, 0, 0>(x, mask, dst, ncols_x, nrows_y, scale,
max_bias, m0, m1, n_head_log2, block_nums,
block_dims, n_local_scratch, stream);
return;
}
switch (ncols_x) {
case 32:
soft_max_f32_submitter<true, 32, 32>(x, mask, pos, dst, ncols_x, nrows_y, scale,
soft_max_f32_submitter<true, 32, 32>(x, mask, dst, ncols_x, nrows_y, scale,
max_bias, m0, m1, n_head_log2, block_nums,
block_dims, n_local_scratch, stream);
break;
case 64:
soft_max_f32_submitter<true, 64, 64>(x, mask, pos, dst, ncols_x, nrows_y, scale,
soft_max_f32_submitter<true, 64, 64>(x, mask, dst, ncols_x, nrows_y, scale,
max_bias, m0, m1, n_head_log2, block_nums,
block_dims, n_local_scratch, stream);
break;
case 128:
soft_max_f32_submitter<true, 128, 128>(x, mask, pos, dst, ncols_x, nrows_y, scale,
soft_max_f32_submitter<true, 128, 128>(x, mask, dst, ncols_x, nrows_y, scale,
max_bias, m0, m1, n_head_log2, block_nums,
block_dims, n_local_scratch, stream);
break;
case 256:
soft_max_f32_submitter<true, 256, 256>(x, mask, pos, dst, ncols_x, nrows_y, scale,
soft_max_f32_submitter<true, 256, 256>(x, mask, dst, ncols_x, nrows_y, scale,
max_bias, m0, m1, n_head_log2, block_nums,
block_dims, n_local_scratch, stream);
break;
case 512:
soft_max_f32_submitter<true, 512, 512>(x, mask, pos, dst, ncols_x, nrows_y, scale,
soft_max_f32_submitter<true, 512, 512>(x, mask, dst, ncols_x, nrows_y, scale,
max_bias, m0, m1, n_head_log2, block_nums,
block_dims, n_local_scratch, stream);
break;
case 1024:
soft_max_f32_submitter<true, 1024, 1024>(x, mask, pos, dst, ncols_x, nrows_y, scale,
soft_max_f32_submitter<true, 1024, 1024>(x, mask, dst, ncols_x, nrows_y, scale,
max_bias, m0, m1, n_head_log2, block_nums,
block_dims, n_local_scratch, stream);
break;
case 2048:
soft_max_f32_submitter<true, 2048, 1024>(x, mask, pos, dst, ncols_x, nrows_y, scale,
soft_max_f32_submitter<true, 2048, 1024>(x, mask, dst, ncols_x, nrows_y, scale,
max_bias, m0, m1, n_head_log2, block_nums,
block_dims, n_local_scratch, stream);
break;
case 4096:
soft_max_f32_submitter<true, 4096, 1024>(x, mask, pos, dst, ncols_x, nrows_y, scale,
soft_max_f32_submitter<true, 4096, 1024>(x, mask, dst, ncols_x, nrows_y, scale,
max_bias, m0, m1, n_head_log2, block_nums,
block_dims, n_local_scratch, stream);
break;
default:
soft_max_f32_submitter<true, 0, 0>(x, mask, pos, dst, ncols_x, nrows_y, scale,
soft_max_f32_submitter<true, 0, 0>(x, mask, dst, ncols_x, nrows_y, scale,
max_bias, m0, m1, n_head_log2, block_nums,
block_dims, n_local_scratch, stream);
break;
}
} else {
soft_max_f32_submitter<false, 0, 0>(x, mask, pos, dst, ncols_x, nrows_y, scale,
soft_max_f32_submitter<false, 0, 0>(x, mask, dst, ncols_x, nrows_y, scale,
max_bias, m0, m1, n_head_log2, block_nums,
block_dims, WARP_SIZE, stream);
}
@ -14560,36 +14521,6 @@ inline void ggml_sycl_op_rope(const ggml_tensor *src0, const ggml_tensor *src1,
(void) src1_dd;
}
inline void ggml_sycl_op_alibi(const ggml_tensor *src0, const ggml_tensor *src1,
ggml_tensor *dst, const float *src0_dd,
const float *src1_dd, float *dst_dd,
const dpct::queue_ptr &main_stream) {
GGML_ASSERT(src0->type == GGML_TYPE_F32);
GGML_ASSERT( dst->type == GGML_TYPE_F32);
GGML_TENSOR_LOCALS_3(int64_t, ne0, src0, ne);
const int64_t nrows = ggml_nrows(src0);
//const int n_past = ((int32_t *) dst->op_params)[0];
const int n_head = ((int32_t *) dst->op_params)[1];
float max_bias;
memcpy(&max_bias, (int32_t *) dst->op_params + 2, sizeof(float));
//GGML_ASSERT(ne01 + n_past == ne00);
GGML_ASSERT(n_head == ne02);
const int n_heads_log2_floor = 1 << (int) floor(log2(n_head));
const float m0 = powf(2.0f, -(max_bias) / n_heads_log2_floor);
const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_heads_log2_floor);
alibi_f32_sycl(src0_dd, dst_dd, ne00, nrows, ne01, n_heads_log2_floor, m0, m1, main_stream);
(void) src1;
(void) src1_dd;
}
static void ggml_sycl_op_pool2d(const ggml_tensor *src0,
const ggml_tensor *src1, ggml_tensor *dst,
const float *src0_dd, const float *src1_dd,
@ -14744,12 +14675,9 @@ inline void ggml_sycl_op_soft_max(const ggml_tensor *src0,
GGML_ASSERT(src0->type == GGML_TYPE_F32);
GGML_ASSERT( dst->type == GGML_TYPE_F32);
const ggml_tensor * src2 = dst->src[2];
#pragma message("TODO: add ggml_sycl_op_soft_max() F16 src1 and src2 support")
#pragma message("TODO: add ggml_sycl_op_soft_max() F16 src1 support")
#pragma message("ref: https://github.com/ggerganov/llama.cpp/pull/5021")
GGML_ASSERT(!src1 || src1->type == GGML_TYPE_F32); // src1 contains mask and it is optional
GGML_ASSERT(!src2 || src2->type == GGML_TYPE_F32); // src2 contains positions and it is optional
const int64_t ne00 = src0->ne[0];
const int64_t nrows_x = ggml_nrows(src0);
@ -14761,25 +14689,7 @@ inline void ggml_sycl_op_soft_max(const ggml_tensor *src0,
memcpy(&scale, dst->op_params + 0, sizeof(float));
memcpy(&max_bias, dst->op_params + 1, sizeof(float));
// positions tensor
float * src2_dd = nullptr;
sycl_pool_alloc<float> src2_f;
const bool use_src2 = src2 != nullptr;
if (use_src2) {
const bool src2_on_device = src2->backend == GGML_BACKEND_TYPE_GPU;
if (src2_on_device) {
ggml_tensor_extra_gpu * src2_extra = (ggml_tensor_extra_gpu *) src2->extra;
src2_dd = (float *) src2_extra->data_device[g_main_device];
} else {
src2_dd = src2_f.alloc(ggml_nelements(src2));
SYCL_CHECK(ggml_sycl_cpy_tensor_2d(src2_dd, src2, 0, 0, 0, 1, main_stream));
}
}
soft_max_f32_sycl(src0_dd, src1 ? src1_dd : nullptr, src2_dd, dst_dd, ne00,
soft_max_f32_sycl(src0_dd, src1 ? src1_dd : nullptr, dst_dd, ne00,
nrows_x, nrows_y, scale, max_bias, main_stream);
}
@ -16230,10 +16140,6 @@ static void ggml_sycl_rope(const ggml_tensor * src0, const ggml_tensor * src1, g
ggml_sycl_op_flatten(src0, src1, dst, ggml_sycl_op_rope);
}
static void ggml_sycl_alibi(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
ggml_sycl_op_flatten(src0, src1, dst, ggml_sycl_op_alibi);
}
static void ggml_sycl_pool2d(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
ggml_sycl_op_flatten(src0, src1, dst, ggml_sycl_op_pool2d);
}
@ -16610,9 +16516,6 @@ bool ggml_sycl_compute_forward(struct ggml_compute_params * params, struct ggml_
case GGML_OP_ROPE:
func = ggml_sycl_rope;
break;
case GGML_OP_ALIBI:
func = ggml_sycl_alibi;
break;
case GGML_OP_IM2COL:
func = ggml_sycl_im2col;
break;
@ -17742,7 +17645,6 @@ GGML_CALL static bool ggml_backend_sycl_supports_op(ggml_backend_t backend, cons
case GGML_OP_DIAG_MASK_INF:
case GGML_OP_SOFT_MAX:
case GGML_OP_ROPE:
case GGML_OP_ALIBI:
case GGML_OP_IM2COL:
case GGML_OP_POOL_2D:
case GGML_OP_SUM_ROWS:

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

307
ggml.c
View file

@ -2185,7 +2185,6 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
"SOFT_MAX_BACK",
"ROPE",
"ROPE_BACK",
"ALIBI",
"CLAMP",
"CONV_TRANSPOSE_1D",
"IM2COL",
@ -2227,7 +2226,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
"CROSS_ENTROPY_LOSS_BACK",
};
static_assert(GGML_OP_COUNT == 77, "GGML_OP_COUNT != 77");
static_assert(GGML_OP_COUNT == 76, "GGML_OP_COUNT != 77");
static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
"none",
@ -2276,7 +2275,6 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
"soft_max_back(x)",
"rope(x)",
"rope_back(x)",
"alibi(x)",
"clamp(x)",
"conv_transpose_1d(x)",
"im2col(x)",
@ -2318,7 +2316,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
"cross_entropy_loss_back(x,y)",
};
static_assert(GGML_OP_COUNT == 77, "GGML_OP_COUNT != 77");
static_assert(GGML_OP_COUNT == 76, "GGML_OP_COUNT != 77");
static_assert(GGML_OP_POOL_COUNT == 2, "GGML_OP_POOL_COUNT != 2");
@ -5646,7 +5644,6 @@ static struct ggml_tensor * ggml_soft_max_impl(
struct ggml_context * ctx,
struct ggml_tensor * a,
struct ggml_tensor * mask,
struct ggml_tensor * pos,
float scale,
float max_bias,
bool inplace) {
@ -5660,20 +5657,6 @@ static struct ggml_tensor * ggml_soft_max_impl(
GGML_ASSERT(mask->ne[1] >= a->ne[1]);
}
if (pos) {
GGML_ASSERT(ggml_is_vector(pos) || ggml_is_matrix(pos));
GGML_ASSERT(pos->type == GGML_TYPE_F16 || pos->type == GGML_TYPE_F32);
GGML_ASSERT(pos->ne[0] == a->ne[0]);
}
if (pos && mask) {
GGML_ASSERT(pos->type == mask->type);
}
if (max_bias > 0.0f) {
GGML_ASSERT(pos);
}
bool is_node = false;
if (a->grad) {
@ -5689,7 +5672,6 @@ static struct ggml_tensor * ggml_soft_max_impl(
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
result->src[0] = a;
result->src[1] = mask;
result->src[2] = pos;
return result;
}
@ -5697,23 +5679,22 @@ static struct ggml_tensor * ggml_soft_max_impl(
struct ggml_tensor * ggml_soft_max(
struct ggml_context * ctx,
struct ggml_tensor * a) {
return ggml_soft_max_impl(ctx, a, NULL, NULL, 1.0f, 0.0f, false);
return ggml_soft_max_impl(ctx, a, NULL, 1.0f, 0.0f, false);
}
struct ggml_tensor * ggml_soft_max_inplace(
struct ggml_context * ctx,
struct ggml_tensor * a) {
return ggml_soft_max_impl(ctx, a, NULL, NULL, 1.0f, 0.0f, true);
return ggml_soft_max_impl(ctx, a, NULL, 1.0f, 0.0f, true);
}
struct ggml_tensor * ggml_soft_max_ext(
struct ggml_context * ctx,
struct ggml_tensor * a,
struct ggml_tensor * mask,
struct ggml_tensor * pos,
float scale,
float max_bias) {
return ggml_soft_max_impl(ctx, a, mask, pos, scale, max_bias, false);
return ggml_soft_max_impl(ctx, a, mask, scale, max_bias, false);
}
// ggml_soft_max_back
@ -5928,37 +5909,6 @@ struct ggml_tensor * ggml_rope_back(
return result;
}
// ggml_alibi
struct ggml_tensor * ggml_alibi(
struct ggml_context * ctx,
struct ggml_tensor * a,
int n_past,
int n_head,
float bias_max) {
GGML_ASSERT(n_past >= 0);
bool is_node = false;
if (a->grad) {
GGML_ASSERT(false); // TODO: implement backward
is_node = true;
}
// TODO: when implement backward, fix this:
//struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
struct ggml_tensor * result = ggml_view_tensor(ctx, a);
int32_t op_params[3] = { n_past, n_head };
memcpy(op_params + 2, &bias_max, sizeof(float));
ggml_set_op_params(result, op_params, sizeof(op_params));
result->op = GGML_OP_ALIBI;
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
result->src[0] = a;
return result;
}
// ggml_clamp
struct ggml_tensor * ggml_clamp(
@ -6486,7 +6436,8 @@ struct ggml_tensor * ggml_flash_attn_ext(
struct ggml_tensor * k,
struct ggml_tensor * v,
struct ggml_tensor * mask,
float scale) {
float scale,
float max_bias) {
GGML_ASSERT(ggml_can_mul_mat(k, q));
// TODO: check if vT can be multiplied by (k*qT)
if (mask) {
@ -6508,7 +6459,7 @@ struct ggml_tensor * ggml_flash_attn_ext(
int64_t ne[4] = { q->ne[0], q->ne[2], q->ne[1], q->ne[3] };
struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne);
float params[] = { scale };
float params[] = { scale, max_bias };
ggml_set_op_params(result, params, sizeof(params));
result->op = GGML_OP_FLASH_ATTN_EXT;
@ -6528,7 +6479,7 @@ void ggml_flash_attn_ext_set_prec(
const int32_t prec_i32 = (int32_t) prec;
ggml_set_op_params_i32(a, 1, prec_i32); // scale is on first pos
ggml_set_op_params_i32(a, 2, prec_i32); // scale is on first pos, max_bias on second
}
// ggml_flash_ff
@ -13333,7 +13284,6 @@ static void ggml_compute_forward_soft_max_f32(
const struct ggml_tensor * src0 = dst->src[0];
const struct ggml_tensor * src1 = dst->src[1];
const struct ggml_tensor * src2 = dst->src[2];
assert(ggml_is_contiguous(dst));
assert(ggml_are_same_shape(src0, dst));
@ -13359,8 +13309,8 @@ static void ggml_compute_forward_soft_max_f32(
// TODO: is this supposed to be ceil instead of floor?
// https://huggingface.co/mosaicml/mpt-7b/blob/main/attention.py#L370
const uint32_t n_head_kv = ne02;
const uint32_t n_head_log2 = 1u << (uint32_t) floor(log2(n_head_kv));
const uint32_t n_head = ne02;
const uint32_t n_head_log2 = 1u << (uint32_t) floor(log2(n_head));
const float m0 = powf(2.0f, -(max_bias ) / n_head_log2);
const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_head_log2);
@ -13377,14 +13327,13 @@ static void ggml_compute_forward_soft_max_f32(
float * wp = (float *) params->wdata + (nc + CACHE_LINE_SIZE_F32) * ith;
// when max_bias <= 0.0f, src2 is not used and we default it to src0 to avoid branching
const bool is_pos_matrix = src2 ? ggml_is_matrix(src2): false;
ggml_fp16_t * pos_f16 = src2 ? (ggml_fp16_t *) src2->data : src0->data;
float * pos_f32 = src2 ? (float *) src2->data : src0->data;
const bool use_f16 = (src1 && src1->type == GGML_TYPE_F16) || (src2 && src2->type == GGML_TYPE_F16);
const bool use_f16 = (src1 && src1->type == GGML_TYPE_F16);
for (int i1 = ir0; i1 < ir1; i1++) {
// ALiBi
const uint32_t h = (i1/ne01)%ne02; // head
const float slope = (max_bias > 0.0f) ? h < n_head_log2 ? powf(m0, h + 1) : powf(m1, 2*(h - n_head_log2) + 1) : 1.0f;
float * sp = (float *)((char *) src0->data + i1*src0->nb[1]);
float * dp = (float *)((char *) dst->data + i1*dst->nb[1]);
@ -13397,27 +13346,11 @@ static void ggml_compute_forward_soft_max_f32(
if (mp_f32) {
if (use_f16) {
for (int i = 0; i < nc; ++i) {
wp[i] += GGML_FP16_TO_FP32(mp_f16[i]);
wp[i] += slope*GGML_FP16_TO_FP32(mp_f16[i]);
}
} else {
for (int i = 0; i < nc; ++i) {
wp[i] += mp_f32[i];
}
}
}
// ALiBi bias
if (max_bias > 0.0f) {
const uint32_t h = (i1/ne01)%ne02; // head
const float slope = h < n_head_log2 ? powf(m0, h + 1) : powf(m1, 2*(h - n_head_log2) + 1);
if (use_f16) {
for (int i = 0; i < nc; ++i) {
wp[i] += slope*GGML_FP16_TO_FP32(pos_f16[is_pos_matrix ? i1%nc * nc + i: i]);
}
} else {
for (int i = 0; i < nc; ++i) {
wp[i] += slope*pos_f32[is_pos_matrix ? i1%nc * nc + i: i];
wp[i] += slope*mp_f32[i];
}
}
}
@ -13579,178 +13512,6 @@ static void ggml_compute_forward_soft_max_back(
}
}
// ggml_compute_forward_alibi
static void ggml_compute_forward_alibi_f32(
const struct ggml_compute_params * params,
struct ggml_tensor * dst) {
const struct ggml_tensor * src0 = dst->src[0];
assert(params->ith == 0);
if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
return;
}
//const int n_past = ((int32_t *) dst->op_params)[0];
const int n_head = ((int32_t *) dst->op_params)[1];
float max_bias;
memcpy(&max_bias, (int32_t *) dst->op_params + 2, sizeof(float));
const int64_t ne0 = src0->ne[0]; // all_seq_len = n_past + ne1
const int64_t ne1 = src0->ne[1]; // seq_len_without_past
const int64_t ne2 = src0->ne[2]; // n_head -> this is k
//const int64_t ne3 = src0->ne[3]; // 1 -> bsz
const int64_t n = ggml_nrows(src0);
const int64_t ne2_ne3 = n/ne1; // ne2*ne3
const size_t nb0 = src0->nb[0];
const size_t nb1 = src0->nb[1];
const size_t nb2 = src0->nb[2];
//const int nb3 = src0->nb[3];
GGML_ASSERT(nb0 == sizeof(float));
GGML_ASSERT(n_head == ne2);
// add alibi to src0 (KQ_scaled)
const int n_heads_log2_floor = 1 << (int) floor(log2(n_head));
const float m0 = powf(2.0f, -(max_bias) / n_heads_log2_floor);
const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_heads_log2_floor);
for (int64_t k = 0; k < ne2_ne3; k++) {
// TODO: k*nb2 or k*nb3
float m_k;
if (k < n_heads_log2_floor) {
m_k = powf(m0, k + 1);
} else {
m_k = powf(m1, 2 * (k - n_heads_log2_floor) + 1);
}
for (int64_t i = 0; i < ne0; i++) {
for (int64_t j = 0; j < ne1; j++) {
float * const src = (float *)((char *) src0->data + i*nb0 + j*nb1 + k*nb2);
float * pdst = (float *)((char *) dst->data + i*nb0 + j*nb1 + k*nb2);
pdst[0] = i * m_k + src[0];
}
}
}
}
static void ggml_compute_forward_alibi_f16(
const struct ggml_compute_params * params,
struct ggml_tensor * dst) {
const struct ggml_tensor * src0 = dst->src[0];
assert(params->ith == 0);
if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
return;
}
//const int n_past = ((int32_t *) dst->op_params)[0];
const int n_head = ((int32_t *) dst->op_params)[1];
float max_bias;
memcpy(&max_bias, (int32_t *) dst->op_params + 2, sizeof(float));
const int ne0 = src0->ne[0]; // all_seq_len = n_past + ne1
const int ne1 = src0->ne[1]; // seq_len_without_past
const int ne2 = src0->ne[2]; // n_head -> this is k
//const int ne3 = src0->ne[3]; // 1 -> bsz
const int n = ggml_nrows(src0);
const int ne2_ne3 = n/ne1; // ne2*ne3
const int nb0 = src0->nb[0];
const int nb1 = src0->nb[1];
const int nb2 = src0->nb[2];
//const int nb3 = src0->nb[3];
GGML_ASSERT(nb0 == sizeof(ggml_fp16_t));
//GGML_ASSERT(ne1 + n_past == ne0); (void) n_past;
GGML_ASSERT(n_head == ne2);
// add alibi to src0 (KQ_scaled)
const int n_heads_log2_floor = 1 << (int) floor(log2(n_head));
const float m0 = powf(2.0f, -(max_bias) / n_heads_log2_floor);
const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_heads_log2_floor);
for (int k = 0; k < ne2_ne3; k++) {
// TODO: k*nb2 or k*nb3
float m_k;
if (k < n_heads_log2_floor) {
m_k = powf(m0, k + 1);
} else {
m_k = powf(m1, 2 * (k - n_heads_log2_floor) + 1);
}
for (int i = 0; i < ne0; i++) {
for (int j = 0; j < ne1; j++) {
ggml_fp16_t * const src = (ggml_fp16_t *)((char *) src0->data + i*nb0 + j*nb1 + k*nb2);
float * pdst = (float *)((char *) dst->data + i*nb0 + j*nb1 + k*nb2);
// we return F32
pdst[0] = i * m_k + GGML_FP16_TO_FP32(src[0]);
}
}
}
}
static void ggml_compute_forward_alibi(
const struct ggml_compute_params * params,
struct ggml_tensor * dst) {
const struct ggml_tensor * src0 = dst->src[0];
switch (src0->type) {
case GGML_TYPE_F16:
{
ggml_compute_forward_alibi_f16(params, dst);
} break;
case GGML_TYPE_F32:
{
ggml_compute_forward_alibi_f32(params, dst);
} break;
case GGML_TYPE_BF16:
case GGML_TYPE_Q4_0:
case GGML_TYPE_Q4_1:
case GGML_TYPE_Q5_0:
case GGML_TYPE_Q5_1:
case GGML_TYPE_Q8_0:
case GGML_TYPE_Q8_1:
case GGML_TYPE_Q2_K:
case GGML_TYPE_Q3_K:
case GGML_TYPE_Q4_K:
case GGML_TYPE_Q5_K:
case GGML_TYPE_Q6_K:
case GGML_TYPE_IQ2_XXS:
case GGML_TYPE_IQ2_XS:
case GGML_TYPE_IQ3_XXS:
case GGML_TYPE_IQ1_S:
case GGML_TYPE_IQ1_M:
case GGML_TYPE_IQ4_NL:
case GGML_TYPE_IQ4_XS:
case GGML_TYPE_IQ3_S:
case GGML_TYPE_IQ2_S:
case GGML_TYPE_Q8_K:
case GGML_TYPE_I8:
case GGML_TYPE_I16:
case GGML_TYPE_I32:
case GGML_TYPE_I64:
case GGML_TYPE_F64:
case GGML_TYPE_COUNT:
{
GGML_ASSERT(false);
} break;
}
}
// ggml_compute_forward_clamp
static void ggml_compute_forward_clamp_f32(
@ -15764,8 +15525,17 @@ static void ggml_compute_forward_flash_attn_ext_f16(
const int ir0 = dr*ith;
const int ir1 = MIN(ir0 + dr, nr);
float scale = 1.0f;
memcpy(&scale, (float *) dst->op_params + 0, sizeof(float));
float scale = 1.0f;
float max_bias = 0.0f;
memcpy(&scale, (float *) dst->op_params + 0, sizeof(float));
memcpy(&max_bias, (float *) dst->op_params + 1, sizeof(float));
const uint32_t n_head = neq2;
const uint32_t n_head_log2 = 1u << (uint32_t) floor(log2(n_head));
const float m0 = powf(2.0f, -(max_bias ) / n_head_log2);
const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_head_log2);
// loop over n_batch and n_head
for (int ir = ir0; ir < ir1; ++ir) {
@ -15774,6 +15544,9 @@ static void ggml_compute_forward_flash_attn_ext_f16(
const int iq2 = (ir - iq3*neq2*neq1)/neq1;
const int iq1 = (ir - iq3*neq2*neq1 - iq2*neq1);
const uint32_t h = iq2; // head
const float slope = (max_bias > 0.0f) ? h < n_head_log2 ? powf(m0, h + 1) : powf(m1, 2*(h - n_head_log2) + 1) : 1.0f;
float S = 0.0f;
float M = -INFINITY;
@ -15797,7 +15570,7 @@ static void ggml_compute_forward_flash_attn_ext_f16(
// loop over n_kv and n_head_kv
// ref: https://arxiv.org/pdf/2112.05682.pdf
for (int64_t ic = 0; ic < nek1; ++ic) {
const float mv = mp ? GGML_FP16_TO_FP32(mp[ic]) : 0.0f;
const float mv = mp ? slope*GGML_FP16_TO_FP32(mp[ic]) : 0.0f;
if (mv == -INFINITY) {
continue;
}
@ -15868,7 +15641,7 @@ static void ggml_compute_forward_flash_attn_ext(
const struct ggml_tensor * v,
const struct ggml_tensor * mask,
struct ggml_tensor * dst) {
switch (dst->op_params[1]) {
switch (dst->op_params[2]) {
case GGML_PREC_DEFAULT:
case GGML_PREC_F32:
{
@ -17631,10 +17404,6 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
{
ggml_compute_forward_rope_back(params, tensor);
} break;
case GGML_OP_ALIBI:
{
ggml_compute_forward_alibi(params, tensor);
} break;
case GGML_OP_CLAMP:
{
ggml_compute_forward_clamp(params, tensor);
@ -18653,10 +18422,6 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
zero_table);
}
} break;
case GGML_OP_ALIBI:
{
GGML_ASSERT(false); // TODO: not implemented
} break;
case GGML_OP_CLAMP:
{
GGML_ASSERT(false); // TODO: not implemented
@ -19429,10 +19194,6 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads, int n_cur_
{
n_tasks = n_threads;
} break;
case GGML_OP_ALIBI:
{
n_tasks = 1; //TODO
} break;
case GGML_OP_CLAMP:
{
n_tasks = 1; //TODO

18
ggml.h
View file

@ -468,7 +468,6 @@ extern "C" {
GGML_OP_SOFT_MAX_BACK,
GGML_OP_ROPE,
GGML_OP_ROPE_BACK,
GGML_OP_ALIBI,
GGML_OP_CLAMP,
GGML_OP_CONV_TRANSPOSE_1D,
GGML_OP_IM2COL,
@ -1428,15 +1427,13 @@ extern "C" {
struct ggml_context * ctx,
struct ggml_tensor * a);
// fused soft_max(a*scale + mask + pos[i]*(ALiBi slope))
// fused soft_max(a*scale + mask*(ALiBi slope))
// mask is optional
// pos is required when max_bias > 0.0f
// max_bias = 0.0f for no ALiBi
GGML_API struct ggml_tensor * ggml_soft_max_ext(
struct ggml_context * ctx,
struct ggml_tensor * a,
struct ggml_tensor * mask,
struct ggml_tensor * pos,
float scale,
float max_bias);
@ -1538,16 +1535,6 @@ extern "C" {
float xpos_base,
bool xpos_down);
// alibi position embedding
// in-place, returns view(a)
GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_alibi(
struct ggml_context * ctx,
struct ggml_tensor * a,
int n_past,
int n_head,
float bias_max),
"use ggml_soft_max_ext instead (will be removed in Mar 2024)");
// clamp
// in-place, returns view(a)
GGML_API struct ggml_tensor * ggml_clamp(
@ -1744,7 +1731,8 @@ extern "C" {
struct ggml_tensor * k,
struct ggml_tensor * v,
struct ggml_tensor * mask,
float scale);
float scale,
float max_bias);
GGML_API void ggml_flash_attn_ext_set_prec(
struct ggml_tensor * a,

File diff suppressed because it is too large Load diff

92
gguf-py/scripts/gguf-new-metadata.py Normal file → Executable file
View file

@ -7,7 +7,8 @@ import json
from pathlib import Path
import numpy as np
from typing import Any, Sequence
from tqdm import tqdm
from typing import Any, Sequence, NamedTuple
# Necessary to load the local gguf package
if "NO_LOCAL_GGUF" not in os.environ and (Path(__file__).parent.parent.parent / 'gguf-py').exists():
@ -18,6 +19,12 @@ import gguf
logger = logging.getLogger("gguf-new-metadata")
class MetadataDetails(NamedTuple):
type: gguf.GGUFValueType
value: Any
description: str = ''
def get_byteorder(reader: gguf.GGUFReader) -> gguf.GGUFEndian:
if np.uint32(1) == np.uint32(1).newbyteorder("<"):
# Host is little endian
@ -59,7 +66,16 @@ def get_field_data(reader: gguf.GGUFReader, key: str) -> Any:
return decode_field(field)
def copy_with_new_metadata(reader: gguf.GGUFReader, writer: gguf.GGUFWriter, new_metadata: dict[str, str], remove_metadata: Sequence[str]) -> None:
def find_token(token_list: Sequence[int], token: str) -> Sequence[int]:
token_ids = [index for index, value in enumerate(token_list) if value == token]
if len(token_ids) == 0:
raise LookupError(f'Unable to find "{token}" in token list!')
return token_ids
def copy_with_new_metadata(reader: gguf.GGUFReader, writer: gguf.GGUFWriter, new_metadata: dict[str, MetadataDetails], remove_metadata: Sequence[str]) -> None:
for field in reader.fields.values():
# Suppress virtual fields and fields written by GGUFWriter
if field.name == gguf.Keys.General.ARCHITECTURE or field.name.startswith('GGUF.'):
@ -75,54 +91,64 @@ def copy_with_new_metadata(reader: gguf.GGUFReader, writer: gguf.GGUFWriter, new
logger.debug(f'Removing {field.name}')
continue
old_val = decode_field(field)
old_val = MetadataDetails(field.types[0], decode_field(field))
val = new_metadata.get(field.name, old_val)
if field.name in new_metadata:
logger.debug(f'Modifying {field.name}: "{old_val}" -> "{val}"')
logger.debug(f'Modifying {field.name}: "{old_val.value}" -> "{val.value}" {val.description}')
del new_metadata[field.name]
elif val is not None:
elif val.value is not None:
logger.debug(f'Copying {field.name}')
if val is not None:
if val.value is not None:
writer.add_key(field.name)
writer.add_val(val, field.types[0])
writer.add_val(val.value, val.type)
if gguf.Keys.Tokenizer.CHAT_TEMPLATE in new_metadata:
logger.debug('Adding chat template(s)')
writer.add_chat_template(new_metadata[gguf.Keys.Tokenizer.CHAT_TEMPLATE])
writer.add_chat_template(new_metadata[gguf.Keys.Tokenizer.CHAT_TEMPLATE].value)
del new_metadata[gguf.Keys.Tokenizer.CHAT_TEMPLATE]
# TODO: Support other types than string?
for key, val in new_metadata.items():
logger.debug(f'Adding {key}: {val}')
logger.debug(f'Adding {key}: "{val.value}" {val.description}')
writer.add_key(key)
writer.add_val(val, gguf.GGUFValueType.STRING)
writer.add_val(val.value, val.type)
total_bytes = 0
for tensor in reader.tensors:
total_bytes += tensor.n_bytes
# Dimensions are written in reverse order, so flip them first
shape = np.flipud(tensor.shape).tolist()
writer.add_tensor_info(tensor.name, shape, tensor.data.dtype, tensor.data.nbytes, tensor.tensor_type)
bar = tqdm(desc="Writing", total=total_bytes, unit="byte", unit_scale=True)
writer.write_header_to_file()
writer.write_kv_data_to_file()
writer.write_ti_data_to_file()
for tensor in reader.tensors:
writer.write_tensor_data(tensor.data)
bar.update(tensor.n_bytes)
writer.close()
def main() -> None:
tokenizer_metadata = (getattr(gguf.Keys.Tokenizer, n) for n in gguf.Keys.Tokenizer.__dict__.keys() if not n.startswith('_'))
token_names = dict((n.split('.')[-1][:-len('_token_id')], n) for n in tokenizer_metadata if n.endswith('_token_id'))
parser = argparse.ArgumentParser(description="Make a copy of a GGUF file with new metadata")
parser.add_argument("input", type=Path, help="GGUF format model input filename")
parser.add_argument("output", type=Path, help="GGUF format model output filename")
parser.add_argument("--general-name", type=str, help="The models general.name")
parser.add_argument("--general-description", type=str, help="The models general.description")
parser.add_argument("--chat-template", type=str, help="Chat template string (or JSON string containing templates)")
parser.add_argument("--chat-template-config", type=Path, help="Config file (tokenizer_config.json) containing chat template(s)")
parser.add_argument("--remove-metadata", action="append", type=str, help="Remove metadata (by key name) from output model")
parser.add_argument("--general-name", type=str, help="The models general.name", metavar='"name"')
parser.add_argument("--general-description", type=str, help="The models general.description", metavar='"Description ..."')
parser.add_argument("--chat-template", type=str, help="Chat template string (or JSON string containing templates)", metavar='"{% ... %} ..."')
parser.add_argument("--chat-template-config", type=Path, help="Config file containing chat template(s)", metavar='tokenizer_config.json')
parser.add_argument("--remove-metadata", action="append", type=str, help="Remove metadata (by key name) from output model", metavar='general.url')
parser.add_argument("--special-token", action="append", type=str, help="Special token by value", nargs=2, metavar=(' | '.join(token_names.keys()), '"<token>"'))
parser.add_argument("--special-token-by-id", action="append", type=str, help="Special token by id", nargs=2, metavar=(' | '.join(token_names.keys()), '0'))
parser.add_argument("--force", action="store_true", help="Bypass warnings without confirmation")
parser.add_argument("--verbose", action="store_true", help="Increase output verbosity")
args = parser.parse_args(None if len(sys.argv) > 2 else ["--help"])
@ -133,20 +159,20 @@ def main() -> None:
remove_metadata = args.remove_metadata or []
if args.general_name:
new_metadata[gguf.Keys.General.NAME] = args.general_name
new_metadata[gguf.Keys.General.NAME] = MetadataDetails(gguf.GGUFValueType.STRING, args.general_name)
if args.general_description:
new_metadata[gguf.Keys.General.DESCRIPTION] = args.general_description
new_metadata[gguf.Keys.General.DESCRIPTION] = MetadataDetails(gguf.GGUFValueType.STRING, args.general_description)
if args.chat_template:
new_metadata[gguf.Keys.Tokenizer.CHAT_TEMPLATE] = json.loads(args.chat_template) if args.chat_template.startswith('[') else args.chat_template
new_metadata[gguf.Keys.Tokenizer.CHAT_TEMPLATE] = MetadataDetails(gguf.GGUFValueType.STRING, json.loads(args.chat_template) if args.chat_template.startswith('[') else args.chat_template)
if args.chat_template_config:
with open(args.chat_template_config, 'r') as fp:
config = json.load(fp)
template = config.get('chat_template')
if template:
new_metadata[gguf.Keys.Tokenizer.CHAT_TEMPLATE] = template
new_metadata[gguf.Keys.Tokenizer.CHAT_TEMPLATE] = MetadataDetails(gguf.GGUFValueType.STRING, template)
if remove_metadata:
logger.warning('*** Warning *** Warning *** Warning **')
@ -166,6 +192,32 @@ def main() -> None:
arch = get_field_data(reader, gguf.Keys.General.ARCHITECTURE)
endianess = get_byteorder(reader)
token_list = get_field_data(reader, gguf.Keys.Tokenizer.LIST) or []
for name, token in args.special_token or []:
if name not in token_names:
logger.warning(f'Unknown special token "{name}", ignoring...')
else:
ids = find_token(token_list, token)
new_metadata[token_names[name]] = MetadataDetails(gguf.GGUFValueType.UINT32, ids[0], f'= {token}')
if len(ids) > 1:
logger.warning(f'Multiple "{token}" tokens found, choosing ID {ids[0]}, use --special-token-by-id if you want another:')
logger.warning(', '.join(str(i) for i in ids))
for name, id_string in args.special_token_by_id or []:
if name not in token_names:
logger.warning(f'Unknown special token "{name}", ignoring...')
elif not id_string.isdecimal():
raise LookupError(f'Token ID "{id_string}" is not a valid ID!')
else:
id_int = int(id_string)
if id_int >= 0 and id_int < len(token_list):
new_metadata[token_names[name]] = MetadataDetails(gguf.GGUFValueType.UINT32, id_int, f'= {token_list[id_int]}')
else:
raise LookupError(f'Token ID {id_int} is not within token list!')
if os.path.isfile(args.output) and not args.force:
logger.warning('*** Warning *** Warning *** Warning **')
logger.warning(f'* The "{args.output}" GGUF file already exists, it will be overwritten!')

205
llama.cpp
View file

@ -1866,7 +1866,7 @@ struct llama_hparams {
float f_logit_scale = 0.0f;
bool causal_attn = true;
bool use_alibi = false; // currently, we need KQ_pos data for ALiBi-based models
bool use_alibi = false;
enum llama_pooling_type pooling_type = LLAMA_POOLING_TYPE_NONE;
enum llama_rope_type rope_type = LLAMA_ROPE_TYPE_NONE;
@ -2338,7 +2338,6 @@ struct llama_context {
struct ggml_tensor * inp_pos; // I32 [n_batch]
struct ggml_tensor * inp_out_ids; // I32 [n_outputs]
struct ggml_tensor * inp_KQ_mask; // F32 [kv_size, n_batch]
struct ggml_tensor * inp_KQ_pos; // F32 [n_kv]
struct ggml_tensor * inp_K_shift; // I32 [kv_size]
struct ggml_tensor * inp_mean; // F32 [n_batch, n_batch]
struct ggml_tensor * inp_cls; // I32 [n_batch]
@ -6586,7 +6585,6 @@ static struct ggml_tensor * llm_build_kqv(
struct ggml_tensor * wo_b,
struct ggml_tensor * q_cur,
struct ggml_tensor * kq_mask,
struct ggml_tensor * kq_pos,
int32_t n_tokens,
int32_t n_kv,
float kq_scale,
@ -6616,10 +6614,6 @@ static struct ggml_tensor * llm_build_kqv(
GGML_UNUSED(model);
GGML_UNUSED(n_ctx);
// note: if this assert triggers, then some check has failed earlier
// the idea is to detect during context creation that ALiBi would be used and disable Flash Attention
GGML_ASSERT(kq_pos == nullptr && "ALiBi is not yet supported with Flash Attention");
// split cached v into n_head heads (not transposed)
struct ggml_tensor * v =
ggml_view_3d(ctx, kv.v_l[il],
@ -6629,7 +6623,7 @@ static struct ggml_tensor * llm_build_kqv(
0);
cb(v, "v", il);
cur = ggml_flash_attn_ext(ctx, q, k, v, kq_mask, kq_scale);
cur = ggml_flash_attn_ext(ctx, q, k, v, kq_mask, kq_scale, hparams.f_max_alibi_bias);
if (model.arch == LLM_ARCH_PHI2 || model.arch == LLM_ARCH_PHI3) {
ggml_flash_attn_ext_set_prec(cur, GGML_PREC_F32);
@ -6660,28 +6654,8 @@ static struct ggml_tensor * llm_build_kqv(
kq = ggml_scale(ctx, kq, 30);
}
#if defined(GGML_USE_KOMPUTE)
#pragma message("TODO: ALiBi support in ggml_soft_max_ext is not implemented for Kompute")
#pragma message(" Falling back to ggml_alibi(). Will become an error in Mar 2024")
#pragma message("ref: https://github.com/ggerganov/llama.cpp/pull/5488")
if (hparams.use_alibi) {
kq = ggml_scale(ctx, kq, kq_scale);
cb(kq, "kq_scaled", il);
kq = ggml_alibi(ctx, kq, /*n_past*/ 0, n_head, hparams.f_max_alibi_bias);
cb(kq, "kq_scaled_alibi", il);
kq = ggml_add(ctx, kq, kq_mask);
cb(kq, "kq_masked", il);
kq = ggml_soft_max(ctx, kq);
cb(kq, "kq_soft_max", il);
} else
#endif
{
kq = ggml_soft_max_ext(ctx, kq, kq_mask, kq_pos, kq_scale, hparams.f_max_alibi_bias);
cb(kq, "kq_soft_max_ext", il);
}
kq = ggml_soft_max_ext(ctx, kq, kq_mask, kq_scale, hparams.f_max_alibi_bias);
cb(kq, "kq_soft_max_ext", il);
GGML_ASSERT(kv.size == n_ctx);
@ -6731,7 +6705,6 @@ static struct ggml_tensor * llm_build_kv(
struct ggml_tensor * v_cur,
struct ggml_tensor * q_cur,
struct ggml_tensor * kq_mask,
struct ggml_tensor * kq_pos,
int32_t n_tokens,
int32_t kv_head,
int32_t n_kv,
@ -6750,7 +6723,7 @@ static struct ggml_tensor * llm_build_kv(
struct ggml_tensor * cur;
cur = llm_build_kqv(ctx, model, hparams, cparams, kv, graph, wo, wo_b,
q_cur, kq_mask, kq_pos, n_tokens, n_kv, kq_scale, cb, il);
q_cur, kq_mask, n_tokens, n_kv, kq_scale, cb, il);
cb(cur, "kqv_out", il);
return cur;
@ -6857,18 +6830,17 @@ struct llm_build_context {
ctx0 = ggml_init(params);
lctx.inp_tokens = nullptr;
lctx.inp_embd = nullptr;
lctx.inp_pos = nullptr;
lctx.inp_tokens = nullptr;
lctx.inp_embd = nullptr;
lctx.inp_pos = nullptr;
lctx.inp_out_ids = nullptr;
lctx.inp_KQ_mask = nullptr;
lctx.inp_KQ_pos = nullptr;
lctx.inp_K_shift = nullptr;
lctx.inp_mean = nullptr;
lctx.inp_cls = nullptr;
lctx.inp_s_copy = nullptr;
lctx.inp_s_mask = nullptr;
lctx.inp_s_seq = nullptr;
lctx.inp_mean = nullptr;
lctx.inp_cls = nullptr;
lctx.inp_s_copy = nullptr;
lctx.inp_s_mask = nullptr;
lctx.inp_s_seq = nullptr;
}
void free() {
@ -7018,19 +6990,6 @@ struct llm_build_context {
return flash_attn ? ggml_cast(ctx0, lctx.inp_KQ_mask, GGML_TYPE_F16) : lctx.inp_KQ_mask;
}
struct ggml_tensor * build_inp_KQ_pos(bool causal = true) {
if (causal) {
lctx.inp_KQ_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, n_kv);
} else {
// TODO: this will be needed for ALiBi-based BERT models
// https://github.com/ggerganov/llama.cpp/pull/6826
lctx.inp_KQ_pos = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_tokens, n_tokens);
}
cb(lctx.inp_KQ_pos, "KQ_pos", -1);
ggml_set_input(lctx.inp_KQ_pos);
return flash_attn ? ggml_cast(ctx0, lctx.inp_KQ_pos, GGML_TYPE_F16) : lctx.inp_KQ_pos;
}
struct ggml_tensor * build_inp_mean() {
lctx.inp_mean = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_tokens, n_tokens);
cb(lctx.inp_mean, "inp_mean", -1);
@ -7136,7 +7095,7 @@ struct llm_build_context {
cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
model.layers[il].wo, model.layers[il].bo,
Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
}
if (il == n_layer - 1) {
@ -7229,9 +7188,6 @@ struct llm_build_context {
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
// positions of the tokens in the KV cache
struct ggml_tensor * KQ_pos = build_inp_KQ_pos();
for (int il = 0; il < n_layer; ++il) {
struct ggml_tensor * inpSA = inpL;
@ -7276,7 +7232,7 @@ struct llm_build_context {
cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
model.layers[il].wo, NULL,
Kcur, Vcur, Qcur, KQ_mask, KQ_pos, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
}
if (il == n_layer - 1) {
@ -7346,9 +7302,6 @@ struct llm_build_context {
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
// positions of the tokens in the KV cache
struct ggml_tensor * KQ_pos = build_inp_KQ_pos();
for (int il = 0; il < n_layer; ++il) {
struct ggml_tensor * inpSA = inpL;
@ -7383,7 +7336,7 @@ struct llm_build_context {
cb(Kcur, "Kcur", il);
cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
model.layers[il].wo, NULL,
Kcur, Vcur, Qcur, KQ_mask, KQ_pos, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
}
if (il == n_layer - 1) {
@ -7503,7 +7456,7 @@ struct llm_build_context {
cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
model.layers[il].wo, NULL,
Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
}
if (il == n_layer - 1) {
@ -7628,7 +7581,7 @@ struct llm_build_context {
cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
model.layers[il].wo, model.layers[il].bo,
Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f, cb, il);
Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f, cb, il);
}
if (il == n_layer - 1) {
@ -7780,7 +7733,7 @@ struct llm_build_context {
cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
model.layers[il].wo, NULL,
Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
}
if (il == n_layer - 1) {
@ -7892,7 +7845,7 @@ struct llm_build_context {
cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
model.layers[il].wo, model.layers[il].bo,
Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
}
if (il == n_layer - 1) {
@ -8096,7 +8049,7 @@ struct llm_build_context {
cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
model.layers[il].wo, model.layers[il].bo,
Kcur, Vcur, Q, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
Kcur, Vcur, Q, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
}
if (il == n_layer - 1) {
@ -8162,9 +8115,6 @@ struct llm_build_context {
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
// positions of the tokens in the KV cache
struct ggml_tensor * KQ_pos = build_inp_KQ_pos();
for (int il = 0; il < n_layer; ++il) {
struct ggml_tensor * inpSA = inpL;
@ -8192,7 +8142,7 @@ struct llm_build_context {
cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
model.layers[il].wo, NULL,
Kcur, Vcur, Qcur, KQ_mask, KQ_pos, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
}
if (il == n_layer - 1) {
@ -8280,9 +8230,6 @@ struct llm_build_context {
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
struct ggml_tensor * KQ_mask = build_inp_KQ_mask(false);
// positions of the tokens in the KV cache
struct ggml_tensor * KQ_pos = build_inp_KQ_pos(false);
// iterate layers
for (int il = 0; il < n_layer; ++il) {
struct ggml_tensor * cur = inpL;
@ -8351,7 +8298,7 @@ struct llm_build_context {
struct ggml_tensor * kq = ggml_mul_mat(ctx0, k, q);
cb(kq, "kq", il);
kq = ggml_soft_max_ext(ctx0, kq, KQ_mask, KQ_pos, 1.0f/sqrtf(float(n_embd_head)), hparams.f_max_alibi_bias);
kq = ggml_soft_max_ext(ctx0, kq, KQ_mask, 1.0f/sqrtf(float(n_embd_head)), hparams.f_max_alibi_bias);
cb(kq, "kq_soft_max_ext", il);
struct ggml_tensor * v = ggml_cont(ctx0, ggml_transpose(ctx0, ggml_reshape_2d(ctx0, Vcur, n_embd_gqa, n_tokens)));
@ -8476,9 +8423,6 @@ struct llm_build_context {
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
// positions of the tokens in the KV cache
struct ggml_tensor * KQ_pos = build_inp_KQ_pos();
inpL = llm_build_norm(ctx0, inpL, hparams,
model.tok_norm,
model.tok_norm_b,
@ -8512,7 +8456,7 @@ struct llm_build_context {
cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
model.layers[il].wo, model.layers[il].bo,
Kcur, Vcur, Qcur, KQ_mask, KQ_pos, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
}
if (il == n_layer - 1) {
@ -8577,9 +8521,6 @@ struct llm_build_context {
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
// positions of the tokens in the KV cache
struct ggml_tensor * KQ_pos = build_inp_KQ_pos();
if (model.pos_embd) {
// inp_pos - contains the positions
struct ggml_tensor * inp_pos = build_inp_pos();
@ -8643,13 +8584,13 @@ struct llm_build_context {
cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
model.layers[il].wo, model.layers[il].bo,
Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
} else {
Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
model.layers[il].wo, model.layers[il].bo,
Kcur, Vcur, Qcur, KQ_mask, KQ_pos, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
}
}
@ -8793,7 +8734,7 @@ struct llm_build_context {
cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
model.layers[il].wo, NULL,
Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
}
if (il == n_layer - 1) {
@ -8911,7 +8852,7 @@ struct llm_build_context {
cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
model.layers[il].wo, NULL,
Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
}
if (il == n_layer - 1) {
@ -9024,7 +8965,7 @@ struct llm_build_context {
cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
model.layers[il].wo, model.layers[il].bo,
Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
}
if (il == n_layer - 1) {
@ -9138,7 +9079,7 @@ struct llm_build_context {
cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
model.layers[il].wo, model.layers[il].bo,
Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
}
if (il == n_layer - 1) {
@ -9293,7 +9234,7 @@ struct llm_build_context {
cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
model.layers[il].wo, model.layers[il].bo,
Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f, cb, il);
Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f, cb, il);
}
if (il == n_layer - 1) {
@ -9410,7 +9351,7 @@ struct llm_build_context {
cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
model.layers[il].wo, model.layers[il].bo,
Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f, cb, il);
Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f, cb, il);
}
if (il == n_layer - 1) {
@ -9523,7 +9464,7 @@ struct llm_build_context {
cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
model.layers[il].wo, NULL,
Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
}
struct ggml_tensor * sa_out = cur;
@ -9626,7 +9567,7 @@ struct llm_build_context {
cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
model.layers[il].wo, model.layers[il].bo,
Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
}
if (il == n_layer - 1) {
@ -9733,7 +9674,7 @@ struct llm_build_context {
cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
model.layers[il].wo, model.layers[il].bo,
Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
}
if (il == n_layer - 1) {
@ -9849,7 +9790,7 @@ struct llm_build_context {
cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
model.layers[il].wo, NULL,
Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
}
if (il == n_layer - 1) {
@ -9966,7 +9907,7 @@ struct llm_build_context {
cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
model.layers[il].wo, model.layers[il].bo,
Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
}
if (il == n_layer - 1) {
@ -10096,7 +10037,7 @@ struct llm_build_context {
cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
model.layers[il].wo, model.layers[il].bo,
Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
}
if (il == n_layer - 1) {
@ -10217,7 +10158,7 @@ struct llm_build_context {
cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
model.layers[il].wo, NULL,
Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f, cb, il);
Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f, cb, il);
}
if (il == n_layer - 1) {
@ -10336,7 +10277,7 @@ struct llm_build_context {
cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
model.layers[il].wo, model.layers[il].bo,
Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
}
if (il == n_layer - 1) {
@ -10626,7 +10567,7 @@ struct llm_build_context {
cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
model.layers[il].wo, model.layers[il].bo,
Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
}
if (il == n_layer - 1) {
@ -10757,7 +10698,7 @@ struct llm_build_context {
cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
model.layers[il].wo, nullptr,
Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
}
if (il == n_layer - 1) {
@ -11146,11 +11087,21 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) {
if (!lctx.kv_self.cells[i].has_seq_id(seq_id) || lctx.kv_self.cells[i].pos > pos) {
f = -INFINITY;
} else {
f = 0.0f;
if (hparams.use_alibi) {
f = -fabs(lctx.kv_self.cells[i].pos - pos);
} else {
f = 0.0f;
}
}
data[h*(n_kv*n_tokens) + j*n_kv + i] = f;
}
}
for (int i = n_tokens; i < GGML_PAD(n_tokens, GGML_KQ_MASK_PAD); ++i) {
for (int j = 0; j < n_kv; ++j) {
data[h*(n_kv*n_tokens) + i*n_kv + j] = -INFINITY;
}
}
}
} else {
// when using kv cache, the mask needs to match the kv cache size
@ -11169,7 +11120,11 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) {
float f = -INFINITY;
for (int s = 0; s < batch.n_seq_id[i]; ++s) {
if (batch.seq_id[i][s] == seq_id) {
f = 0.0f;
if (hparams.use_alibi) {
f = -fabs(batch.pos[i] - batch.pos[j]);
} else {
f = 0.0f;
}
break;
}
}
@ -11185,32 +11140,6 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) {
}
}
// ALiBi requires the KQ_pos tensor to provide the sequence position of each token in the batch
// this allows to process multiple sequences in parallel with ALiBi-based models
if (hparams.use_alibi) {
const int64_t n_kv = kv_self.n;
GGML_ASSERT(lctx.inp_KQ_pos);
GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_KQ_pos->buffer));
GGML_ASSERT(ggml_is_vector(lctx.inp_KQ_pos) || ggml_is_matrix(lctx.inp_KQ_pos));
if (ggml_is_vector(lctx.inp_KQ_pos)) {
float * data = (float *) lctx.inp_KQ_pos->data;
for (int i = 0; i < n_kv; ++i) {
data[i] = float(lctx.kv_self.cells[i].pos);
}
} else if(ggml_is_matrix(lctx.inp_KQ_pos)) {
const int64_t n_tokens = batch.n_tokens;
float * data = (float *) lctx.inp_KQ_pos->data;
for (int i = 0; i < n_tokens; ++i) {
for (int j = 0; j < n_tokens; ++j) {
data[i * n_tokens + j] = -1.0 * abs(i - j);
}
}
}
}
if (cparams.pooling_type == LLAMA_POOLING_TYPE_MEAN) {
const int64_t n_tokens = batch.n_tokens;
@ -11563,7 +11492,7 @@ static int llama_decode_internal(
}
// non-causal masks do not use the KV cache
if (hparams.causal_attn || model.arch == LLM_ARCH_JINA_BERT_V2) {
if (hparams.causal_attn) {
llama_kv_cache_update(&lctx);
// if we have enough unused cells before the current head ->
@ -12613,7 +12542,7 @@ struct llm_tokenizer_wpm {
continue;
}
code = unicode_tolower(code);
if (type == CODEPOINT_TYPE_WHITESPACE) {
if (type == CODEPOINT_TYPE_SEPARATOR) {
code = ' ';
}
std::string s = unicode_cpt_to_utf8(code);
@ -15639,23 +15568,11 @@ struct llama_context * llama_new_context_with_model(
}
}
if (cparams.flash_attn && hparams.use_alibi) {
LLAMA_LOG_WARN("%s: flash_attn is not yet compatible with ALiBi - forcing off\n", __func__);
cparams.flash_attn = false;
}
if (cparams.flash_attn && model->arch == LLM_ARCH_GROK) {
LLAMA_LOG_WARN("%s: flash_attn is not compatible with Grok - forcing off\n", __func__);
cparams.flash_attn = false;
}
#ifdef GGML_USE_HIPBLAS
if (cparams.flash_attn) {
LLAMA_LOG_WARN("%s: flash_attn is not yet compatible with HIPBLAS builds - forcing off\n", __func__);
cparams.flash_attn = false;
}
#endif
if (params.seed == LLAMA_DEFAULT_SEED) {
params.seed = time(NULL);
}
@ -18010,7 +17927,7 @@ struct llama_timings llama_get_timings(struct llama_context * ctx) {
/*.t_eval_ms =*/ 1e-3 * ctx->t_eval_us,
/*.n_sample =*/ std::max(1, ctx->n_sample),
/*.n_p_eval =*/ std::max(1, ctx->n_p_eval),
/*.n_p_eval =*/ std::max(0, ctx->n_p_eval),
/*.n_eval =*/ std::max(1, ctx->n_eval),
};

View file

@ -1,31 +1,14 @@
import regex
def cpt_to_utf8_str(cpt):
if cpt <= 0xFF:
return bytes([cpt, 0, 0, 0])
elif cpt <= 0xFFFF:
return bytes([cpt & 0xFF, cpt >> 8, 0, 0])
elif cpt <= 0xFFFFFF:
return bytes([cpt & 0xFF, (cpt >> 8) & 0xFF, (cpt >> 16) & 0xFF, 0])
else:
return bytes([cpt & 0xFF, (cpt >> 8) & 0xFF, (cpt >> 16) & 0xFF, cpt >> 24])
def is_match(codepoint, regex_expr):
try:
res = regex.match(regex_expr, cpt_to_utf8_str(codepoint).decode('utf-32'))
return res is not None
except Exception:
return False
def get_matches(regex_expr):
regex_expr_compiled = regex.compile(regex_expr)
unicode_ranges = []
current_range = None
for codepoint in range(0x110000):
if is_match(codepoint, regex_expr):
char = chr(codepoint)
if regex_expr_compiled.match(char):
if current_range is None:
current_range = [codepoint, codepoint]
else:
@ -40,27 +23,42 @@ def get_matches(regex_expr):
return unicode_ranges
def print_cat(cat, ranges):
print("const std::vector<std::pair<uint32_t, uint32_t>> unicode_ranges_{} = {{".format(cat)) # noqa: NP100
cnt = 0
for start, end in ranges:
if cnt % 4 != 0:
print(" ", end="") # noqa: NP100
print("{{0x{:08X}, 0x{:08X}}},".format(start, end), end="") # noqa: NP100
if cnt % 4 == 3:
print("") # noqa: NP100
cnt += 1
if cnt % 4 != 0:
print("") # noqa: NP100
def print_cat(mode, cat, ranges):
if mode == "range":
print("const std::vector<std::pair<uint32_t, uint32_t>> unicode_ranges_{} = {{".format(cat)) # noqa: NP100
if mode == "map":
print("const std::map<uint32_t, uint32_t> unicode_map_{} = {{".format(cat)) # noqa: NP100
for i, values in enumerate(ranges):
end = ",\n" if (i % 4 == 3 or i + 1 == len(ranges)) else ", "
values = ["0x%08X" % value for value in values]
print("{" + ", ".join(values) + "}", end=end) # noqa: NP100
print("};") # noqa: NP100
print("") # noqa: NP100
print_cat("number", get_matches(r'\p{N}'))
print_cat("letter", get_matches(r'\p{L}'))
print_cat("whitespace", get_matches(r'\p{Z}'))
print_cat("accent_mark", get_matches(r'\p{M}'))
print_cat("punctuation", get_matches(r'\p{P}'))
print_cat("symbol", get_matches(r'\p{S}'))
print_cat("control", get_matches(r'\p{C}'))
print_cat("range", "number", get_matches(r'\p{N}'))
print_cat("range", "letter", get_matches(r'\p{L}'))
print_cat("range", "separator", get_matches(r'\p{Z}'))
print_cat("range", "accent_mark", get_matches(r'\p{M}'))
print_cat("range", "punctuation", get_matches(r'\p{P}'))
print_cat("range", "symbol", get_matches(r'\p{S}'))
print_cat("range", "control", get_matches(r'\p{C}'))
print_cat("range", "whitespace", get_matches(r'\s'))
map_lowercase = []
map_uppercase = []
for codepoint in range(0x110000):
char = chr(codepoint)
lower = ord(char.lower()[0])
upper = ord(char.upper()[0])
if codepoint != lower:
map_lowercase.append((codepoint, lower))
if codepoint != upper:
map_uppercase.append((codepoint, upper))
print_cat("map", "lowercase", map_lowercase)
print_cat("map", "uppercase", map_uppercase)
# TODO: generate unicode_map_nfd

View file

@ -1111,11 +1111,7 @@ struct test_soft_max : public test_case {
if (this->mask) {
mask = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, ne[0], ne[1]);
}
ggml_tensor * pos = nullptr;
if (max_bias > 0.0f) {
pos = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, ne[0]);
}
ggml_tensor * out = ggml_soft_max_ext(ctx, a, mask, pos, scale, max_bias);
ggml_tensor * out = ggml_soft_max_ext(ctx, a, mask, scale, max_bias);
return out;
}
};
@ -1490,23 +1486,25 @@ struct test_flash_attn_ext : public test_case {
const int64_t kv; // kv size
const int64_t nb; // batch size
const float max_bias; // ALiBi
std::string vars() override {
return VARS_TO_STR4(hs, nh, kv, nb);
return VARS_TO_STR5(hs, nh, kv, nb, max_bias);
}
double max_nmse_err() override {
return 5e-4;
}
test_flash_attn_ext(int64_t hs = 128, int64_t nh = 32, int64_t kv = 96, int64_t nb = 8)
: hs(hs), nh(nh), kv(kv), nb(nb) {}
test_flash_attn_ext(int64_t hs = 128, int64_t nh = 32, int64_t kv = 96, int64_t nb = 8, float max_bias = 0.0f)
: hs(hs), nh(nh), kv(kv), nb(nb), max_bias(max_bias) {}
ggml_tensor * build_graph(ggml_context * ctx) override {
ggml_tensor * q = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, hs, nb, nh, 1);
ggml_tensor * k = ggml_new_tensor_4d(ctx, GGML_TYPE_F16, hs, kv, nh, 1);
ggml_tensor * v = ggml_new_tensor_4d(ctx, GGML_TYPE_F16, hs, kv, nh, 1);
ggml_tensor * mask = ggml_new_tensor_4d(ctx, GGML_TYPE_F16, kv, GGML_PAD(nb, GGML_KQ_MASK_PAD), 1, 1);
ggml_tensor * out = ggml_flash_attn_ext(ctx, q, k, v, mask, 1.0f/sqrtf(hs));
ggml_tensor * out = ggml_flash_attn_ext(ctx, q, k, v, mask, 1.0f/sqrtf(hs), max_bias);
return out;
}
};
@ -1611,7 +1609,7 @@ public:
struct ggml_tensor * kq = ggml_mul_mat(ctx, k, q);
kq = ggml_soft_max_ext(ctx, kq, kq_mask, nullptr, kq_scale, 0.0f);
kq = ggml_soft_max_ext(ctx, kq, kq_mask, kq_scale, 0.0f);
// split cached v into n_head heads
struct ggml_tensor * v =
@ -2175,11 +2173,17 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op
test_cases.emplace_back(new test_timestep_embedding());
test_cases.emplace_back(new test_leaky_relu());
#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
for (int hs : { 64, 128, }) { // other head sizes not implemented
#else
for (int hs : { 64, 80, 128, 256, }) {
for (int nh : { 32, }) {
for (int kv : { 512, 1024, }) {
for (int nb : { 1, 2, 4, 8, }) {
test_cases.emplace_back(new test_flash_attn_ext(hs, nh, kv, nb));
#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
for (float max_bias : {0.0f, 8.0f}) {
for (int nh : { 32, }) {
for (int kv : { 512, 1024, }) {
for (int nb : { 1, 2, 4, 8, }) {
test_cases.emplace_back(new test_flash_attn_ext(hs, nh, kv, nb, max_bias));
}
}
}
}

View file

@ -0,0 +1,295 @@
# Test libllama tokenizer == AutoTokenizer.
# Brute force random tokens/text generation.
#
# Sample usage:
#
# python3 tests/test-tokenizer-random.py ./models/ggml-vocab-llama-bpe.gguf ./models/tokenizers/llama-bpe
#
import time
import logging
import argparse
import subprocess
import random
from typing import Iterator
import cffi
from transformers import AutoTokenizer, PreTrainedTokenizerBase
logger = logging.getLogger("test-tokenizer-random-bpe")
class LibLlama:
DEFAULT_PATH_LLAMA_H = "./llama.h"
DEFAULT_PATH_LIBLLAMA = "./build/libllama.so" # CMakeLists.txt: BUILD_SHARED_LIBS ON
def __init__(self, path_llama_h: str = None, path_libllama: str = None):
path_llama_h = path_llama_h or self.DEFAULT_PATH_LLAMA_H
path_libllama = path_libllama or self.DEFAULT_PATH_LIBLLAMA
(self.ffi, self.lib) = self._load_libllama_cffi(path_llama_h, path_libllama)
self.lib.llama_backend_init()
def _load_libllama_cffi(self, path_llama_h: str, path_libllama: str):
cmd = ["gcc", "-E", "-P", "-D__restrict=", "-D__attribute__(x)=", "-D__asm__(x)=", path_llama_h]
res = subprocess.run(cmd, stdout=subprocess.PIPE)
assert (res.returncode == 0)
source = res.stdout.decode()
ffi = cffi.FFI()
if True: # workarounds for pycparser
source = "typedef struct { } __builtin_va_list;" + "\n" + source
source = source.replace("sizeof (int)", str(ffi.sizeof("int")))
source = source.replace("sizeof (void *)", str(ffi.sizeof("void*")))
source = source.replace("sizeof (size_t)", str(ffi.sizeof("size_t")))
source = source.replace("sizeof(int32_t)", str(ffi.sizeof("int32_t")))
ffi.cdef(source, override=True)
lib = ffi.dlopen(path_libllama)
return (ffi, lib)
def model_default_params(self, **kwargs):
mparams = self.lib.llama_model_default_params()
for k, v in kwargs.items():
setattr(mparams, k, v)
return mparams
def context_default_params(self, **kwargs):
cparams = self.lib.llama_context_default_params()
for k, v in kwargs.items():
setattr(cparams, k, v)
return cparams
class LibLlamaModel:
def __init__(self, libllama: LibLlama, path_model: str, mparams={}, cparams={}):
self.lib = libllama.lib
self.ffi = libllama.ffi
if isinstance(mparams, dict):
mparams = libllama.model_default_params(**mparams)
self.model = self.lib.llama_load_model_from_file(path_model.encode(), mparams)
if not self.model:
raise RuntimeError("error: failed to load model '%s'" % path_model)
if isinstance(cparams, dict):
cparams = libllama.context_default_params(**cparams)
self.ctx = self.lib.llama_new_context_with_model(self.model, cparams)
if not self.ctx:
raise RuntimeError("error: failed to create context for model '%s'" % path_model)
n_tokens_max = self.lib.llama_n_ctx(self.ctx)
self.token_ids = self.ffi.new("llama_token[]", n_tokens_max)
def free(self):
if self.ctx:
self.lib.llama_free(self.ctx)
if self.model:
self.lib.llama_free_model(self.model)
self.ctx = None
self.model = None
self.lib = None
def tokenize(self, text: str, n_tokens_max: int = 0, add_special: bool = False, parse_special: bool = False) -> list[int]:
n_tokens_max = n_tokens_max if n_tokens_max > 0 else len(self.token_ids)
text = text.encode("utf-8")
num = self.lib.llama_tokenize(self.model, text, len(text), self.token_ids, n_tokens_max, add_special, parse_special)
if num < 0:
return []
return list(self.token_ids[0:num])
def generator_custom_text() -> Iterator[str]:
"""General tests"""
yield from [
"",
" ",
" ",
" ",
"\t",
"\n",
"\n\n",
"\n\n\n",
"\t\n",
"Hello world",
" Hello world",
"Hello World",
" Hello World",
" Hello World!",
"Hello, world!",
" Hello, world!",
" this is 🦙.cpp",
"w048 7tuijk dsdfhu",
"нещо на Български",
"កាន់តែពិសេសអាចខលចេញ",
"🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ (only emoji that has its own token)",
"Hello",
" Hello",
" Hello",
" Hello",
" Hello",
" Hello\n Hello",
" (",
"\n =",
"' era",
"Hello, y'all! How are you 😁 ?我想在apple工作1314151天",
"3",
"33",
"333",
"3333",
"33333",
"333333",
"3333333",
"33333333",
"333333333",
]
def generator_custom_text_edge_cases() -> Iterator[str]:
"""Edge cases found while debugging"""
yield from [
'\x1f-a', # unicode_ranges_control, {0x00001C, 0x00001F}
'¼-a', # unicode_ranges_digit, 0x00BC
'½-a', # unicode_ranges_digit, 0x00BD
'¾-a', # unicode_ranges_digit, 0x00BE
'a b', # unicode_ranges_digit, 0x3007
'Ⅵ-a', # unicode_ranges_digit, {0x00002150, 0x0000218F} // Number Forms
'\uFEFF//', # unicode_ranges_control, 0xFEFF (BOM)
'<s>a' # TODO: Phi-3 fail
]
def generator_random_chars(iterations = 100) -> Iterator[str]:
"""Brute force random text with simple characters"""
WHITESPACES = list(" " * 20 + "\n" * 5 + "\r\n" * 5 + "\t" * 5)
CHARS = list(set("""
ABCDEFGHIJKLMNOPQRSTUVWXYZ
abcdefghijklmnopqrstuvwxyz
ÁÉÍÓÚÀÈÌÒÙÂÊÎÔÛÄËÏÖÜ
áéíóúàèìòùâêîôûäëïöü
.-,*/-+ª!"·$%&/()=?¿[]{}<>\\|@#~½¬~;:_
"""))
rand = random.Random()
for m in range(iterations):
rand.seed(m)
text = []
num_words = rand.randint(300, 400)
for i in range(num_words):
k = rand.randint(1, 7)
word = rand.choices(CHARS, k=k)
space = rand.choice(WHITESPACES)
text.append("".join(word) + space)
yield "".join(text)
def generator_random_vocab_chars(tokenizer: PreTrainedTokenizerBase, iterations = 100) -> Iterator[str]:
"""Brute force random text with vocab characters"""
vocab_ids = list(tokenizer.vocab.values())
vocab_text = tokenizer.decode(vocab_ids, skip_special_tokens=True)
vocab_chars = list(set(vocab_text))
del vocab_ids, vocab_text
rand = random.Random()
for m in range(iterations):
rand.seed(m)
text = rand.choices(vocab_chars, k=1024)
yield "".join(text)
def generator_random_vocab_tokens(tokenizer: PreTrainedTokenizerBase, iterations = 100) -> Iterator[str]:
"""Brute force random text from vocab tokens"""
space_id = tokenizer.encode(" ", add_special_tokens=False)[0]
vocab_ids = list(tokenizer.vocab.values())
vocab_ids = list(sorted(vocab_ids + vocab_ids))
for i in range(1, len(vocab_ids), 2):
vocab_ids[i] = space_id
vocab_tokens = tokenizer.decode(vocab_ids, skip_special_tokens=True)
vocab_tokens = vocab_tokens.split(" ")
del vocab_ids
yield from vocab_tokens
rand = random.Random()
for m in range(iterations):
rand.seed(m)
text = []
num_words = rand.randint(300, 400)
for i in range(num_words):
k = rand.randint(1, 3)
tokens = rand.choices(vocab_tokens, k=k)
tokens = [t.strip(" \n\r\t") for t in tokens]
sep = rand.choice(" \n\r\t")
text.append("".join(tokens) + sep)
yield "".join(text)
def generator_random_bytes(iterations = 100) -> Iterator[str]:
"""Brute force random bytes"""
WHITESPACES = list(" " * 20 + "\n" * 5 + "\r\n" * 5 + "\t" * 5)
rand = random.Random()
for m in range(iterations):
rand.seed(m)
text = []
num_words = rand.randint(300, 400)
for i in range(num_words):
k = rand.randint(1, 8)
word = [chr(r) for r in rand.randbytes(k) if r]
word.append(rand.choice(WHITESPACES))
text.append("".join(word))
yield "".join(text)
def test_compare_tokenizer(model: LibLlamaModel, tokenizer: PreTrainedTokenizerBase, generator: Iterator[str]):
def find_first_mismatch(ids1: list[int], ids2: list[int]):
for i, (a,b) in enumerate(zip(ids1, ids2)):
if a != b:
return i
if len(ids1) == len(ids2):
return -1
return min(len(ids1), len(ids2))
t0 = time.perf_counter()
logger.info("%s: %s" % (generator.__name__, "ini"))
for text in generator:
ids1 = model.tokenize(text, add_special=False, parse_special=False)
ids2 = tokenizer.encode(text, add_special_tokens=False)
if ids1 != ids2:
i = find_first_mismatch(ids1, ids2)
ids1 = list(ids1)[max(0, i - 2) : i + 2 + 1]
ids2 = list(ids2)[max(0, i - 2) : i + 2 + 1]
text2 = tokenizer.decode(ids2, skip_special_tokens=True)
assert (text2 in text)
logger.info(" Text: " + repr(text2))
logger.info(" TokenIDs: " + str(ids1))
logger.info(" Expected: " + str(ids2))
raise Exception()
t1 = time.perf_counter()
logger.info("%s: end, time: %.3f secs" % (generator.__name__, t1 - t0))
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("vocab_file", help="path to vocab 'gguf' file")
parser.add_argument("dir_tokenizer", help="directory containing 'tokenizer.model' file")
parser.add_argument("--verbose", action="store_true", help="increase output verbosity")
args = parser.parse_args()
logging.basicConfig(level=logging.DEBUG if args.verbose else logging.INFO)
model = LibLlamaModel(LibLlama(), args.vocab_file, mparams=dict(vocab_only=True), cparams=dict(n_ctx=2048))
tokenizer = AutoTokenizer.from_pretrained(args.dir_tokenizer)
test_compare_tokenizer(model, tokenizer, generator_custom_text())
test_compare_tokenizer(model, tokenizer, generator_custom_text_edge_cases())
test_compare_tokenizer(model, tokenizer, generator_random_chars(10_000))
test_compare_tokenizer(model, tokenizer, generator_random_vocab_chars(tokenizer, 10_000))
test_compare_tokenizer(model, tokenizer, generator_random_vocab_tokens(tokenizer, 10_000))
# test_compare_tokenizer(model, tokenizer, generator_random_bytes(10_000)) # FAIL
model.free()

File diff suppressed because it is too large Load diff

View file

@ -7,6 +7,7 @@
extern const std::vector<std::pair<uint32_t, uint32_t>> unicode_ranges_number;
extern const std::vector<std::pair<uint32_t, uint32_t>> unicode_ranges_letter;
extern const std::vector<std::pair<uint32_t, uint32_t>> unicode_ranges_separator;
extern const std::vector<std::pair<uint32_t, uint32_t>> unicode_ranges_whitespace;
extern const std::vector<std::pair<uint32_t, uint32_t>> unicode_ranges_accent_mark;
extern const std::vector<std::pair<uint32_t, uint32_t>> unicode_ranges_punctuation;

View file

@ -9,6 +9,7 @@
#include <stdexcept>
#include <string>
#include <unordered_map>
#include <unordered_set>
#include <utility>
#include <vector>
#include <locale>
@ -111,27 +112,27 @@ static uint32_t unicode_cpt_from_utf8(const std::string & utf8, size_t & offset)
static std::unordered_map<uint32_t, int> unicode_cpt_type_map() {
std::unordered_map<uint32_t, int> cpt_types;
for (auto p : unicode_ranges_number) {
for (auto i = p.first; i <= p.second; ++ i) {
for (auto i = p.first; i <= p.second; ++i) {
cpt_types[i] = CODEPOINT_TYPE_NUMBER;
}
}
for (auto p : unicode_ranges_letter) {
for (auto i = p.first; i <= p.second; ++ i) {
for (auto i = p.first; i <= p.second; ++i) {
cpt_types[i] = CODEPOINT_TYPE_LETTER;
}
}
for (auto p : unicode_ranges_whitespace) {
for (auto i = p.first; i <= p.second; ++ i) {
cpt_types[i] = CODEPOINT_TYPE_WHITESPACE;
for (auto p : unicode_ranges_separator) {
for (auto i = p.first; i <= p.second; ++i) {
cpt_types[i] = CODEPOINT_TYPE_SEPARATOR;
}
}
for (auto p : unicode_ranges_accent_mark) {
for (auto i = p.first; i <= p.second; ++ i) {
for (auto i = p.first; i <= p.second; ++i) {
cpt_types[i] = CODEPOINT_TYPE_ACCENT_MARK;
}
}
for (auto p : unicode_ranges_punctuation) {
for (auto i = p.first; i <= p.second; ++ i) {
for (auto i = p.first; i <= p.second; ++i) {
cpt_types[i] = CODEPOINT_TYPE_PUNCTUATION;
}
}
@ -141,7 +142,7 @@ static std::unordered_map<uint32_t, int> unicode_cpt_type_map() {
}
}
for (auto p : unicode_ranges_control) {
for (auto i = p.first; i <= p.second; ++ i) {
for (auto i = p.first; i <= p.second; ++i) {
cpt_types[i] = CODEPOINT_TYPE_CONTROL;
}
}
@ -224,138 +225,256 @@ static std::vector<size_t> unicode_regex_split_custom_gpt2(const std::string & t
std::vector<size_t> bpe_offsets; // store the offset of each word
bpe_offsets.reserve(offsets.size()); // Reserve memory for the approximate size
const auto cpts = unicode_cpts_from_utf8(text);
size_t start = 0;
for (auto offset : offsets) {
const size_t offset_ini = start;
const size_t offset_end = start + offset;
assert(offset_end <= cpts.size());
start = offset_end;
auto _get_cpt = [&] (const size_t pos) -> char32_t {
return (offset_ini <= pos && pos < offset_end) ? cpts[pos] : 0;
};
auto _get_cpt_type = [&] (const size_t pos) -> int {
return (offset_ini <= pos && pos < offset_end) ? unicode_cpt_type(cpts[pos]) : CODEPOINT_TYPE_UNIDENTIFIED;
};
size_t _prev_end = offset_ini;
auto _add_token = [&] (const size_t end) -> size_t {
assert(_prev_end <= end && end <= offset_end);
size_t len = end - _prev_end;
if (len > 0) {
bpe_offsets.push_back(len);
}
_prev_end = end;
//if (len > 0) {
// std::string s = "";
// for(size_t p = end-len; p < end; p++)
// s += unicode_cpt_to_utf8(cpts[p]);
// printf(">>> '%s'\n", s.c_str());
//}
return len;
};
for (size_t pos = offset_ini; pos < offset_end; /*pos++*/ ) {
const char32_t cpt = _get_cpt(pos);
const int cpt_type = _get_cpt_type(pos);
// regex: 's|'t|'re|'ve|'m|'ll|'d
if (cpt == '\'' && pos+1 < offset_end) {
char32_t cpt_next = _get_cpt(pos+1);
if (cpt_next == 's' || cpt_next == 't' || cpt_next == 'm' || cpt_next == 'd') {
pos += _add_token(pos+2);
continue;
}
if (pos+2 < offset_end) {
char32_t cpt_next_next = _get_cpt(pos+2);
if ((cpt_next == 'r' && cpt_next_next == 'e') ||
(cpt_next == 'v' && cpt_next_next == 'e') ||
(cpt_next == 'l' && cpt_next_next == 'l')) {
pos += _add_token(pos+3);
continue;
}
}
}
char32_t cpt2 = (cpt == ' ' ? _get_cpt(pos+1) : cpt);
int cpt2_type = (cpt == ' ' ? _get_cpt_type(pos+1) : cpt_type);
// regex: <space>?\p{L}+
if (cpt2_type == CODEPOINT_TYPE_LETTER) {
pos += (cpt == ' ');
while (cpt2_type == CODEPOINT_TYPE_LETTER) {
cpt2_type = _get_cpt_type(++pos);
}
_add_token(pos);
continue;
}
// regex: <space>?\p{N}+
if (cpt2_type == CODEPOINT_TYPE_NUMBER) {
pos += (cpt == ' ');
while (cpt2_type == CODEPOINT_TYPE_NUMBER) {
cpt2_type = _get_cpt_type(++pos);
}
_add_token(pos);
continue;
}
// regex: <space>?[^\s\p{L}\p{N}]+
if (!unicode_cpt_is_whitespace(cpt2) && cpt2_type != CODEPOINT_TYPE_LETTER && cpt2_type != CODEPOINT_TYPE_NUMBER && cpt2_type != CODEPOINT_TYPE_UNIDENTIFIED) {
pos += (cpt == ' ');
while (!unicode_cpt_is_whitespace(cpt2) && cpt2_type != CODEPOINT_TYPE_LETTER && cpt2_type != CODEPOINT_TYPE_NUMBER && cpt2_type != CODEPOINT_TYPE_UNIDENTIFIED) {
cpt2_type = _get_cpt_type(++pos);
cpt2 = _get_cpt(pos);
}
_add_token(pos);
continue;
}
size_t num_whitespaces = 0;
while (unicode_cpt_is_whitespace(_get_cpt(pos+num_whitespaces))) {
num_whitespaces++;
}
// regex: \s+(?!\S)
if (num_whitespaces > 1 && _get_cpt(pos+num_whitespaces) != 0) {
pos += num_whitespaces - 1;
_add_token(pos);
continue;
}
// regex: \s+
if (num_whitespaces > 0) {
pos += num_whitespaces;
_add_token(pos);
continue;
}
// no matches
_add_token(++pos);
}
}
return bpe_offsets;
}
// LLAMA3 system regex: "(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+"
static std::vector<size_t> unicode_regex_split_custom_llama3(const std::string & text, const std::vector<size_t> & offsets) {
std::vector<size_t> bpe_offsets; // store the offset of each word
bpe_offsets.reserve(offsets.size()); // Reserve memory for the approximate size
const auto cpts = unicode_cpts_from_utf8(text);
size_t start = 0;
for (auto offset : offsets) {
std::string token;
const size_t offset_ini = start;
const size_t offset_end = start + offset;
assert(offset_end <= cpts.size());
start = offset_end;
bool collecting_numeric = false;
bool collecting_letter = false;
bool collecting_special = false;
bool collecting_whitespace_lookahead = false;
bool collecting = false;
auto _get_cpt = [&] (const size_t pos) -> char32_t {
return (offset_ini <= pos && pos < offset_end) ? cpts[pos] : 0;
};
std::vector<std::string> text_utf;
text_utf.reserve(offset);
auto _get_cpt_type = [&] (const size_t pos) -> int {
return (offset_ini <= pos && pos < offset_end) ? unicode_cpt_type(cpts[pos]) : CODEPOINT_TYPE_UNIDENTIFIED;
};
for (size_t i = start; i < start + offset; ++i) {
text_utf.emplace_back(unicode_cpt_to_utf8(cpts[i]));
}
size_t _prev_end = offset_ini;
auto _add_token = [&] (const size_t end) -> size_t {
assert(_prev_end <= end && end <= offset_end);
size_t len = end - _prev_end;
if (len > 0) {
bpe_offsets.push_back(len);
}
_prev_end = end;
//if (len > 0) {
// std::string s = "";
// for(size_t p = end-len; p < end; p++)
// s += unicode_cpt_to_utf8(cpts[p]);
// printf(">>> '%s'\n", s.c_str());
//}
return len;
};
for (int i = 0; i < (int)text_utf.size(); i++) {
const std::string & utf_char = text_utf[i];
bool split_condition = false;
int bytes_remain = text_utf.size() - i;
for (size_t pos = offset_ini; pos < offset_end; /*pos++*/ ) {
const char32_t cpt = _get_cpt(pos);
const int cpt_type = _get_cpt_type(pos);
// forward backward lookups
const std::string & utf_char_next = (i + 1 < (int)text_utf.size()) ? text_utf[i + 1] : "";
const std::string & utf_char_next_next = (i + 2 < (int)text_utf.size()) ? text_utf[i + 2] : "";
// handling contractions
if (!split_condition && bytes_remain >= 2) {
// 's|'t|'m|'d
if (utf_char == "\'" && (utf_char_next == "s" || utf_char_next == "t" || utf_char_next == "m" || utf_char_next == "d")) {
split_condition = true;
}
if (split_condition) {
if (token.size()) {
bpe_offsets.emplace_back(unicode_cpts_from_utf8(token).size());
}
token = utf_char + utf_char_next;
bpe_offsets.emplace_back(unicode_cpts_from_utf8(token).size());
token = "";
i++;
// regex: (?i:'s|'t|'re|'ve|'m|'ll|'d) // case insensitive
if (cpt == '\'' && pos+1 < offset_end) {
char32_t cpt_next = unicode_tolower(_get_cpt(pos+1));
if (cpt_next == 's' || cpt_next == 't' || cpt_next == 'm' || cpt_next == 'd') {
pos += _add_token(pos+2);
continue;
}
}
if (!split_condition && bytes_remain >= 3) {
// 're|'ve|'ll
if (utf_char == "\'" && (
(utf_char_next == "r" && utf_char_next_next == "e") ||
(utf_char_next == "v" && utf_char_next_next == "e") ||
(utf_char_next == "l" && utf_char_next_next == "l"))
) {
split_condition = true;
}
if (split_condition) {
// current token + next token can be defined
if (token.size()) {
bpe_offsets.emplace_back(unicode_cpts_from_utf8(token).size());
if (pos+2 < offset_end) {
char32_t cpt_next_next = unicode_tolower(_get_cpt(pos+2));
if ((cpt_next == 'r' && cpt_next_next == 'e') ||
(cpt_next == 'v' && cpt_next_next == 'e') ||
(cpt_next == 'l' && cpt_next_next == 'l')) {
pos += _add_token(pos+3);
continue;
}
token = utf_char;
token += utf_char_next;
token += utf_char_next_next;
}
}
bpe_offsets.emplace_back(unicode_cpts_from_utf8(token).size());
token = "";
i += 2;
// regex: [^\r\n\p{L}\p{N}]?\p{L}+ //####FIXME: the first \p{L} is correct?
if (cpt != '\r' && cpt != '\n' && /*cpt_type != CODEPOINT_TYPE_LETTER &&*/ cpt_type != CODEPOINT_TYPE_NUMBER) {
if (cpt_type == CODEPOINT_TYPE_LETTER || _get_cpt_type(pos+1) == CODEPOINT_TYPE_LETTER) { // one or more letters
pos++;
while (_get_cpt_type(pos) == CODEPOINT_TYPE_LETTER) {
pos++;
}
_add_token(pos);
continue;
}
}
if (!split_condition && !collecting) {
if (unicode_cpt_type(utf_char) == CODEPOINT_TYPE_LETTER || (token.empty() && utf_char == " " && unicode_cpt_type(utf_char_next) == CODEPOINT_TYPE_LETTER)) {
collecting_letter = true;
collecting = true;
}
else if (unicode_cpt_type(utf_char) == CODEPOINT_TYPE_NUMBER || (token.empty() && utf_char == " " && unicode_cpt_type(utf_char_next) == CODEPOINT_TYPE_NUMBER)) {
collecting_numeric = true;
collecting = true;
}
else if (
((unicode_cpt_type(utf_char) != CODEPOINT_TYPE_LETTER && unicode_cpt_type(utf_char) != CODEPOINT_TYPE_NUMBER) && (unicode_cpt_type(utf_char) != CODEPOINT_TYPE_WHITESPACE)) ||
(token.empty() && utf_char == " " && unicode_cpt_type(utf_char_next) != CODEPOINT_TYPE_LETTER && unicode_cpt_type(utf_char_next) != CODEPOINT_TYPE_NUMBER && unicode_cpt_type(utf_char_next) != CODEPOINT_TYPE_WHITESPACE)
) {
collecting_special = true;
collecting = true;
}
else if (unicode_cpt_type(utf_char) == CODEPOINT_TYPE_WHITESPACE && unicode_cpt_type(utf_char_next) == CODEPOINT_TYPE_WHITESPACE) {
collecting_whitespace_lookahead = true;
collecting = true;
}
else if (unicode_cpt_type(utf_char) == CODEPOINT_TYPE_WHITESPACE) {
split_condition = true;
}
}
else if (!split_condition && collecting) {
if (collecting_letter && unicode_cpt_type(utf_char) != CODEPOINT_TYPE_LETTER) {
split_condition = true;
}
else if (collecting_numeric && unicode_cpt_type(utf_char) != CODEPOINT_TYPE_NUMBER) {
split_condition = true;
}
else if (collecting_special && (unicode_cpt_type(utf_char) == CODEPOINT_TYPE_LETTER || unicode_cpt_type(utf_char) == CODEPOINT_TYPE_NUMBER || unicode_cpt_type(utf_char) == CODEPOINT_TYPE_WHITESPACE)) {
split_condition = true;
}
else if (collecting_whitespace_lookahead && (unicode_cpt_type(utf_char_next) == CODEPOINT_TYPE_LETTER || unicode_cpt_type(utf_char_next) == CODEPOINT_TYPE_NUMBER)) {
split_condition = true;
// regex: \p{N}{1,3}
if (cpt_type == CODEPOINT_TYPE_NUMBER) {
size_t ini = pos;
while (_get_cpt_type(pos) == CODEPOINT_TYPE_NUMBER) {
if (++pos - ini >= 3 ) {
_add_token(pos);
ini = pos;
}
}
_add_token(pos);
continue;
}
if (utf_char_next == "") {
split_condition = true; // final
token += utf_char;
// regex: <space>?[^\s\p{L}\p{N}]+[\r\n]*
char32_t cpt2 = (cpt == ' ' ? _get_cpt(pos+1) : cpt);
int cpt2_type = (cpt == ' ' ? _get_cpt_type(pos+1) : cpt_type);
if (!unicode_cpt_is_whitespace(cpt2) && cpt2_type != CODEPOINT_TYPE_LETTER && cpt2_type != CODEPOINT_TYPE_NUMBER && cpt2_type != CODEPOINT_TYPE_UNIDENTIFIED) {
pos += (cpt == ' ');
while (!unicode_cpt_is_whitespace(cpt2) && cpt2_type != CODEPOINT_TYPE_LETTER && cpt2_type != CODEPOINT_TYPE_NUMBER && cpt2_type != CODEPOINT_TYPE_UNIDENTIFIED) {
cpt2_type = _get_cpt_type(++pos);
cpt2 = _get_cpt(pos);
}
while (cpt2 == '\r' || cpt2 == '\n') {
cpt2 = _get_cpt(++pos);
}
_add_token(pos);
continue;
}
if (split_condition) {
if (token.size()) {
bpe_offsets.emplace_back(unicode_cpts_from_utf8(token).size());
size_t num_whitespaces = 0;
size_t last_end_r_or_n = 0;
while (unicode_cpt_is_whitespace(_get_cpt(pos+num_whitespaces))) {
char32_t cpt2 = _get_cpt(pos+num_whitespaces);
if (cpt2 == '\r' || cpt2 == '\n') {
last_end_r_or_n = pos + num_whitespaces + 1;
}
token = utf_char;
collecting = false;
collecting_letter = false;
collecting_numeric = false;
collecting_special = false;
collecting_whitespace_lookahead = false;
num_whitespaces++;
}
else {
token += utf_char;
// regex: \s*[\r\n]+
if (last_end_r_or_n > 0) {
pos = last_end_r_or_n;
_add_token(pos);
continue;
}
// regex: \s+(?!\S)
if (num_whitespaces > 1 && _get_cpt(pos+num_whitespaces) != 0) {
pos += num_whitespaces - 1;
_add_token(pos);
continue;
}
// regex: \s+
if (num_whitespaces > 0) {
pos += num_whitespaces;
_add_token(pos);
continue;
}
// no matches
_add_token(++pos);
}
start += offset;
}
return bpe_offsets;
@ -424,14 +543,14 @@ static std::vector<size_t> unicode_regex_split_stl(const std::string & text, con
static std::vector<size_t> unicode_regex_split_custom(const std::string & text, const std::string & regex_expr, const std::vector<size_t> & offsets) {
std::vector<size_t> bpe_offsets;
(void)(text);
(void)(regex_expr);
(void)(offsets);
// TODO: this implementation is actually wrong, uncomment and run:
// make -j && ./bin/test-tokenizer-0 ../models/ggml-vocab-gpt-2.gguf
//if (regex_expr == "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)") {
// bpe_offsets = unicode_regex_split_custom_gpt2(text, offsets);
//}
if (regex_expr == "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)") {
bpe_offsets = unicode_regex_split_custom_gpt2(text, offsets);
} else if (
regex_expr == "(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+" ||
regex_expr == "(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+") {
bpe_offsets = unicode_regex_split_custom_llama3(text, offsets);
}
return bpe_offsets;
}
@ -506,6 +625,19 @@ int unicode_cpt_type(const std::string & utf8) {
return unicode_cpt_type(unicode_cpt_from_utf8(utf8, offset));
}
bool unicode_cpt_is_whitespace(uint32_t cp) {
static const std::unordered_set<uint32_t> is_whitespace = [] {
std::unordered_set<uint32_t> is_whitespace;
for (auto p : unicode_ranges_whitespace) {
for (auto i = p.first; i <= p.second; ++i) {
is_whitespace.insert(i);
}
}
return is_whitespace;
}();
return (bool)is_whitespace.count(cp);
}
std::string unicode_byte_to_utf8(uint8_t byte) {
static std::unordered_map<uint8_t, std::string> map = unicode_byte_to_utf8_map();
return map.at(byte);

View file

@ -7,7 +7,7 @@
#define CODEPOINT_TYPE_UNIDENTIFIED 0
#define CODEPOINT_TYPE_NUMBER 1
#define CODEPOINT_TYPE_LETTER 2
#define CODEPOINT_TYPE_WHITESPACE 3
#define CODEPOINT_TYPE_SEPARATOR 3
#define CODEPOINT_TYPE_ACCENT_MARK 4
#define CODEPOINT_TYPE_PUNCTUATION 5
#define CODEPOINT_TYPE_SYMBOL 6
@ -21,6 +21,8 @@ std::vector<uint32_t> unicode_cpts_normalize_nfd(const std::vector<uint32_t> & c
int unicode_cpt_type(uint32_t cp);
int unicode_cpt_type(const std::string & utf8);
bool unicode_cpt_is_whitespace(uint32_t cp);
std::string unicode_byte_to_utf8(uint8_t byte);
uint8_t unicode_utf8_to_byte(const std::string & utf8);