commit d9adb8832b
Author: Georgi Gerganov
Date:   2024-05-10 15:27:21 +03:00
GPG key ID: 449E073F9DC10735 (no known key found for this signature in database)

    Merge remote-tracking branch 'origin/gg/refactor-alibi-2' into HEAD

29 changed files with 45973 additions and 36626 deletions

@@ -2,7 +2,7 @@
 ![llama](https://user-images.githubusercontent.com/1991296/230134379-7181e485-c521-4d23-a0d6-f7b3b61ba524.png)

-[![License: MIT](https://img.shields.io/badge/license-MIT-blue.svg)](https://opensource.org/licenses/MIT)
+[![License: MIT](https://img.shields.io/badge/license-MIT-blue.svg)](https://opensource.org/licenses/MIT) [![Server](https://github.com/ggerganov/llama.cpp/actions/workflows/server.yml/badge.svg?branch=master&event=schedule)](https://github.com/ggerganov/llama.cpp/actions/workflows/server.yml)

 [Roadmap](https://github.com/users/ggerganov/projects/7) / [Project status](https://github.com/ggerganov/llama.cpp/discussions/3471) / [Manifesto](https://github.com/ggerganov/llama.cpp/discussions/205) / [ggml](https://github.com/ggerganov/ggml)
@@ -140,6 +140,7 @@ Typically finetunes of the base models below are supported as well.
 - [x] [MobileVLM 1.7B/3B models](https://huggingface.co/models?search=mobileVLM)
 - [x] [Yi-VL](https://huggingface.co/models?search=Yi-VL)
 - [x] [Mini CPM](https://huggingface.co/models?search=MiniCPM)
+- [x] [Moondream](https://huggingface.co/vikhyatk/moondream2)

 **HTTP server**
@@ -175,6 +176,7 @@ Unless otherwise noted these projects are open-source with permissive licensing:
 - [nat/openplayground](https://github.com/nat/openplayground)
 - [Faraday](https://faraday.dev/) (proprietary)
 - [LMStudio](https://lmstudio.ai/) (proprietary)
+- [Layla](https://play.google.com/store/apps/details?id=com.laylalite) (proprietary)
 - [LocalAI](https://github.com/mudler/LocalAI) (MIT)
 - [LostRuins/koboldcpp](https://github.com/LostRuins/koboldcpp) (AGPL)
 - [Mozilla-Ocho/llamafile](https://github.com/Mozilla-Ocho/llamafile)

@@ -49,6 +49,10 @@ chktxt = '\n \n\n \n\n\n \t \t\t \t\n \n \n \n \n🚀 (normal) 😶‍
 if len(sys.argv) == 2:
     token = sys.argv[1]
+    if not token.startswith("hf_"):
+        logger.info("Huggingface token seems invalid")
+        logger.info("Usage: python convert-hf-to-gguf-update.py <huggingface_token>")
+        sys.exit(1)
 else:
     logger.info("Usage: python convert-hf-to-gguf-update.py <huggingface_token>")
     sys.exit(1)
@@ -260,6 +264,7 @@ tests = [
     "3333333",
     "33333333",
     "333333333",
+    # "Cửa Việt", # llama-bpe fails on this
     chktxt,
 ]

@@ -52,15 +52,15 @@ static void ggml_print_tensor(uint8_t * data, ggml_type type, const int64_t * ne
                     size_t i = i3 * nb[3] + i2 * nb[2] + i1 * nb[1] + i0 * nb[0];
                     float v;
                     if (type == GGML_TYPE_F16) {
-                        v = ggml_fp16_to_fp32(*(ggml_fp16_t *) data + i);
+                        v = ggml_fp16_to_fp32(*(ggml_fp16_t *) &data[i]);
                     } else if (type == GGML_TYPE_F32) {
-                        v = *(float *) data + i;
+                        v = *(float *) &data[i];
                     } else if (type == GGML_TYPE_I32) {
-                        v = (float) *(int32_t *) data + i;
+                        v = (float) *(int32_t *) &data[i];
                     } else if (type == GGML_TYPE_I16) {
-                        v = (float) *(int16_t *) data + i;
+                        v = (float) *(int16_t *) &data[i];
                     } else if (type == GGML_TYPE_I8) {
-                        v = (float) *(int8_t *) data + i;
+                        v = (float) *(int8_t *) &data[i];
                     } else {
                         GGML_ASSERT(false);
                     }
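Note on the fix above: the cast and dereference bind more tightly than the trailing `+ i`, so the old expressions read element 0 and then added the byte offset to the resulting *value*; `&data[i]` applies the offset to the pointer before dereferencing. A minimal standalone sketch of the two readings (the `values`/`buf`/`offset` names are illustrative, not from the diff):

```cpp
#include <cstdint>
#include <cstdio>

int main() {
    float values[4] = {1.0f, 2.0f, 3.0f, 4.0f};
    const uint8_t * buf = reinterpret_cast<const uint8_t *>(values);
    const size_t offset = 2 * sizeof(float); // byte offset of values[2]

    // Old pattern: dereference first, then add the offset to the value.
    float wrong = *(const float *) buf + offset;   // 1.0f + 8 == 9.0f

    // Fixed pattern: apply the byte offset to the pointer, then dereference.
    float right = *(const float *) &buf[offset];   // values[2] == 3.0f

    printf("wrong = %g, right = %g\n", wrong, right);
    return 0;
}
```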

@@ -104,6 +104,7 @@ static std::string format(const char * fmt, ...) {
 #define TN_POS_EMBD        "%s.position_embd.weight"
 #define TN_CLASS_EMBD      "v.class_embd"
 #define TN_PATCH_EMBD      "v.patch_embd.weight"
+#define TN_PATCH_BIAS      "v.patch_embd.bias"
 #define TN_ATTN_K          "%s.blk.%d.attn_k.%s"
 #define TN_ATTN_Q          "%s.blk.%d.attn_q.%s"
 #define TN_ATTN_V          "%s.blk.%d.attn_v.%s"
@@ -425,6 +426,7 @@ struct clip_vision_model {
     // embeddings
     struct ggml_tensor * class_embedding;
     struct ggml_tensor * patch_embeddings;
+    struct ggml_tensor * patch_bias;
     struct ggml_tensor * position_embeddings;

     struct ggml_tensor * pre_ln_w;
@@ -501,6 +503,11 @@ struct clip_ctx {
     bool use_gelu = false;
     int32_t ftype = 1;

+    bool has_class_embedding = true;
+    bool has_pre_norm = true;
+    bool has_post_norm = false;
+    bool has_patch_bias = false;
+
     struct gguf_context * ctx_gguf;
     struct ggml_context * ctx_data;
@@ -526,7 +533,7 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
     const int patch_size           = hparams.patch_size;
     const int num_patches          = ((image_size / patch_size) * (image_size / patch_size));
     const int num_patches_per_side = image_size / patch_size; GGML_UNUSED(num_patches_per_side);
-    const int num_positions        = num_patches + 1;
+    const int num_positions        = num_patches + (ctx->has_class_embedding ? 1 : 0);
     const int hidden_size          = hparams.hidden_size;
     const int n_head               = hparams.n_head;
     const int d_head               = hidden_size / n_head;
@@ -557,16 +564,23 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
     inp = ggml_reshape_3d(ctx0, inp, num_patches, hidden_size, batch_size);
     inp = ggml_cont(ctx0, ggml_permute(ctx0, inp, 1, 0, 2, 3));

+    if (ctx->has_patch_bias) {
+        // inp = ggml_add(ctx0, inp, ggml_repeat(ctx0, model.patch_bias, inp));
+        inp = ggml_add(ctx0, inp, model.patch_bias);
+    }
+
     // concat class_embeddings and patch_embeddings
-    struct ggml_tensor * embeddings = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, hidden_size, num_positions, batch_size);
+    struct ggml_tensor * embeddings = inp;
+    if (ctx->has_class_embedding) {
+        embeddings = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, hidden_size, num_positions, batch_size);
         ggml_set_name(embeddings, "embeddings");
         ggml_set_input(embeddings);
         embeddings = ggml_acc(ctx0, embeddings, model.class_embedding,
                 embeddings->nb[1], embeddings->nb[2], embeddings->nb[3], 0);
         embeddings = ggml_acc(ctx0, embeddings, inp,
                 embeddings->nb[1], embeddings->nb[2], embeddings->nb[3], model.class_embedding->nb[1]);
+    }

     struct ggml_tensor * positions = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, num_positions);
     ggml_set_name(positions, "positions");
@@ -576,7 +590,7 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
         ggml_add(ctx0, embeddings, ggml_get_rows(ctx0, model.position_embeddings, positions));

     // pre-layernorm
-    {
+    if (ctx->has_pre_norm) {
         embeddings = ggml_norm(ctx0, embeddings, eps);
         ggml_set_name(embeddings, "pre_ln");
@@ -664,6 +678,14 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
         embeddings = cur;
     }

+    // post-layernorm
+    if (ctx->has_post_norm) {
+        embeddings = ggml_norm(ctx0, embeddings, eps);
+        ggml_set_name(embeddings, "post_ln");
+
+        embeddings = ggml_add(ctx0, ggml_mul(ctx0, embeddings, model.post_ln_w), model.post_ln_b);
+    }
+
     // llava projector
     {
         embeddings = ggml_reshape_2d(ctx0, embeddings, embeddings->ne[0], embeddings->ne[1]);
@@ -1149,11 +1171,38 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
     }

     try {
-        vision_model.patch_embeddings    = get_tensor(new_clip->ctx_data, TN_PATCH_EMBD);
         vision_model.class_embedding     = get_tensor(new_clip->ctx_data, TN_CLASS_EMBD);
-        vision_model.position_embeddings = get_tensor(new_clip->ctx_data, format(TN_POS_EMBD, "v"));
+        new_clip->has_class_embedding = true;
+    } catch (const std::exception& e) {
+        new_clip->has_class_embedding = false;
+    }
+
+    try {
         vision_model.pre_ln_w = get_tensor(new_clip->ctx_data, format(TN_LN_PRE, "v", "weight"));
         vision_model.pre_ln_b = get_tensor(new_clip->ctx_data, format(TN_LN_PRE, "v", "bias"));
+        new_clip->has_pre_norm = true;
+    } catch (std::exception & e) {
+        new_clip->has_pre_norm = false;
+    }
+
+    try {
+        vision_model.post_ln_w = get_tensor(new_clip->ctx_data, format(TN_LN_POST, "v", "weight"));
+        vision_model.post_ln_b = get_tensor(new_clip->ctx_data, format(TN_LN_POST, "v", "bias"));
+        new_clip->has_post_norm = true;
+    } catch (std::exception & e) {
+        new_clip->has_post_norm = false;
+    }
+
+    try {
+        vision_model.patch_bias = get_tensor(new_clip->ctx_data, TN_PATCH_BIAS);
+        new_clip->has_patch_bias = true;
+    } catch (std::exception & e) {
+        new_clip->has_patch_bias = false;
+    }
+
+    try {
+        vision_model.patch_embeddings    = get_tensor(new_clip->ctx_data, TN_PATCH_EMBD);
+        vision_model.position_embeddings = get_tensor(new_clip->ctx_data, format(TN_POS_EMBD, "v"));
     } catch(const std::exception& e) {
         LOG_TEE("%s: failed to load vision model tensors\n", __func__);
     }
@@ -1797,7 +1846,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
     const int image_size = hparams.image_size;
     const int patch_size = hparams.patch_size;
     const int num_patches = ((image_size / patch_size) * (image_size / patch_size));
-    const int num_positions = num_patches + 1;
+    const int num_positions = num_patches + (ctx->has_class_embedding ? 1 : 0);

     {
         struct ggml_tensor * inp_raw = ggml_graph_get_tensor(gf, "inp_raw");
@@ -1825,6 +1874,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
     }

     {
+        if (ctx->has_class_embedding) {
             struct ggml_tensor * embeddings = ggml_graph_get_tensor(gf, "embeddings");

             void* zero_mem = malloc(ggml_nbytes(embeddings));
@@ -1832,6 +1882,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
             ggml_backend_tensor_set(embeddings, zero_mem, 0, ggml_nbytes(embeddings));
             free(zero_mem);
         }
+    }

     {
         struct ggml_tensor * positions = ggml_graph_get_tensor(gf, "positions");
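The loader changes above probe each optional tensor in its own try/catch and record the outcome in a `has_*` flag that the graph builder then consults. A minimal sketch of that pattern in isolation, with stand-in `Context`/`Tensor`/`get_tensor` types since the real gguf context and clip.cpp API are not reproduced here:

```cpp
#include <cstdio>
#include <map>
#include <stdexcept>
#include <string>

// Illustrative stand-ins, not the actual ggml/gguf types.
struct Tensor {};
struct Context { std::map<std::string, Tensor> tensors; };

static Tensor * get_tensor(Context & ctx, const std::string & name) {
    auto it = ctx.tensors.find(name);
    if (it == ctx.tensors.end()) {
        throw std::runtime_error("tensor not found: " + name);
    }
    return &it->second;
}

struct Model {
    Tensor * class_embedding     = nullptr;
    bool     has_class_embedding = false;
};

static void load_optional(Context & ctx, Model & model) {
    // Same idea as the clip.cpp loader: a missing tensor is not an error,
    // it just disables the corresponding part of the compute graph.
    try {
        model.class_embedding     = get_tensor(ctx, "v.class_embd");
        model.has_class_embedding = true;
    } catch (const std::exception &) {
        model.has_class_embedding = false;
    }
}

int main() {
    Context ctx;   // no "v.class_embd" present in this example
    Model model;
    load_optional(ctx, model);
    std::printf("has_class_embedding = %d\n", model.has_class_embedding ? 1 : 0);
    return 0;
}
```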

@@ -4,7 +4,6 @@
 #include "ggml-cuda/common.cuh"
 #include "ggml-cuda/acc.cuh"
-#include "ggml-cuda/alibi.cuh"
 #include "ggml-cuda/arange.cuh"
 #include "ggml-cuda/argsort.cuh"
 #include "ggml-cuda/binbcast.cuh"
@@ -2277,9 +2276,6 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg
         case GGML_OP_ROPE:
             ggml_cuda_op_rope(ctx, dst);
             break;
-        case GGML_OP_ALIBI:
-            ggml_cuda_op_alibi(ctx, dst);
-            break;
         case GGML_OP_IM2COL:
             ggml_cuda_op_im2col(ctx, dst);
             break;
@@ -2829,7 +2825,6 @@ GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, cons
         case GGML_OP_DIAG_MASK_INF:
         case GGML_OP_SOFT_MAX:
         case GGML_OP_ROPE:
-        case GGML_OP_ALIBI:
         case GGML_OP_IM2COL:
         case GGML_OP_POOL_2D:
         case GGML_OP_SUM_ROWS:

@@ -1,63 +0,0 @@
-#include "alibi.cuh"
-
-static __global__ void alibi_f32(const float * x, float * dst, const int ncols, const int k_rows,
-                                 const int n_heads_log2_floor, const float m0, const float m1) {
-    const int col = blockDim.x*blockIdx.x + threadIdx.x;
-
-    if (col >= ncols) {
-        return;
-    }
-
-    const int row = blockDim.y*blockIdx.y + threadIdx.y;
-    const int i   = row*ncols + col;
-
-    const int k = row/k_rows;
-
-    float m_k;
-    if (k < n_heads_log2_floor) {
-        m_k = powf(m0, k + 1);
-    } else {
-        m_k = powf(m1, 2 * (k - n_heads_log2_floor) + 1);
-    }
-
-    dst[i] = col * m_k + x[i];
-}
-
-static void alibi_f32_cuda(const float * x, float * dst, const int ncols, const int nrows,
-                           const int k_rows, const int n_heads_log2_floor, const float m0,
-                           const float m1, cudaStream_t stream) {
-    const dim3 block_dims(CUDA_ALIBI_BLOCK_SIZE, 1, 1);
-    const int num_blocks_x = (ncols + CUDA_ALIBI_BLOCK_SIZE - 1) / (CUDA_ALIBI_BLOCK_SIZE);
-    const dim3 block_nums(num_blocks_x, nrows, 1);
-    alibi_f32<<<block_nums, block_dims, 0, stream>>>(x, dst, ncols, k_rows, n_heads_log2_floor, m0, m1);
-}
-
-void ggml_cuda_op_alibi(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
-    const ggml_tensor * src0 = dst->src[0];
-    const float * src0_d = (const float *)src0->data;
-    float * dst_d = (float *)dst->data;
-    cudaStream_t stream = ctx.stream();
-
-    GGML_ASSERT(src0->type == GGML_TYPE_F32);
-    GGML_ASSERT( dst->type == GGML_TYPE_F32);
-
-    const int64_t ne00 = src0->ne[0];
-    const int64_t ne01 = src0->ne[1];
-    const int64_t ne02 = src0->ne[2];
-    const int64_t nrows = ggml_nrows(src0);
-
-    //const int n_past = ((int32_t *) dst->op_params)[0];
-    const int n_head = ((int32_t *) dst->op_params)[1];
-    float max_bias;
-    memcpy(&max_bias, (int32_t *) dst->op_params + 2, sizeof(float));
-
-    //GGML_ASSERT(ne01 + n_past == ne00);
-
-    const int n_heads_log2_floor = 1 << (int) floor(log2(n_head));
-
-    const float m0 = powf(2.0f, -(max_bias) / n_heads_log2_floor);
-    const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_heads_log2_floor);
-
-    alibi_f32_cuda(src0_d, dst_d, ne00, nrows, ne01, n_heads_log2_floor, m0, m1, stream);
-}
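For reference, the bias that the removed kernel computed can be written in closed form. With n = n_heads_log2_floor, head index k = floor(r / k_rows) for row r, and column c, the removed code corresponds to:

```latex
m_0 = 2^{-\text{max\_bias}/n}, \qquad
m_1 = 2^{-\text{max\_bias}/(2n)}, \qquad
m_k =
\begin{cases}
m_0^{\,k+1}        & \text{if } k < n \\
m_1^{\,2(k-n)+1}   & \text{otherwise}
\end{cases}, \qquad
\text{dst}[r, c] = x[r, c] + c \cdot m_k
```

After this refactor the same per-head slope is applied inside the soft-max and flash-attention kernels through the mask term, instead of by a standalone GGML_OP_ALIBI operator.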

@@ -1,5 +0,0 @@
-#include "common.cuh"
-
-#define CUDA_ALIBI_BLOCK_SIZE 32
-
-void ggml_cuda_op_alibi(ggml_backend_cuda_context & ctx, ggml_tensor * dst);

@@ -234,122 +234,6 @@ typedef float dfloat; // dequantize float
 typedef float2 dfloat2;
 #endif //GGML_CUDA_F16

-[[noreturn]]
-static __device__ void no_device_code(
-    const char * file_name, const int line, const char * function_name, const int arch, const char * arch_list) {
-
-#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
-    printf("%s:%d: ERROR: HIP kernel %s has no device code compatible with HIP arch %d.\n",
-           file_name, line, function_name, arch);
-    GGML_UNUSED(arch_list);
-#else
-    printf("%s:%d: ERROR: CUDA kernel %s has no device code compatible with CUDA arch %d. ggml-cuda.cu was compiled for: %s\n",
-           file_name, line, function_name, arch, arch_list);
-#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
-    __trap();
-
-    GGML_UNUSED(no_device_code); // suppress unused function warning
-}
-
-#ifdef __CUDA_ARCH__
-#define NO_DEVICE_CODE no_device_code(__FILE__, __LINE__, __FUNCTION__, __CUDA_ARCH__, STRINGIZE(__CUDA_ARCH_LIST__))
-#else
-#define NO_DEVICE_CODE //GGML_ASSERT(false && "NO_DEVICE_CODE not valid in host code.")
-#endif // __CUDA_ARCH__
-
-static __device__ __forceinline__ float warp_reduce_sum(float x) {
-#pragma unroll
-    for (int mask = 16; mask > 0; mask >>= 1) {
-        x += __shfl_xor_sync(0xffffffff, x, mask, 32);
-    }
-    return x;
-}
-
-static __device__ __forceinline__ float2 warp_reduce_sum(float2 a) {
-#pragma unroll
-    for (int mask = 16; mask > 0; mask >>= 1) {
-        a.x += __shfl_xor_sync(0xffffffff, a.x, mask, 32);
-        a.y += __shfl_xor_sync(0xffffffff, a.y, mask, 32);
-    }
-    return a;
-}
-
-static __device__ __forceinline__ half2 warp_reduce_sum(half2 a) {
-#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL
-#pragma unroll
-    for (int mask = 16; mask > 0; mask >>= 1) {
-        a = __hadd2(a, __shfl_xor_sync(0xffffffff, a, mask, 32));
-    }
-    return a;
-#else
-    GGML_UNUSED(a);
-    NO_DEVICE_CODE;
-#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL
-}
-
-static __device__ __forceinline__ float warp_reduce_max(float x) {
-#pragma unroll
-    for (int mask = 16; mask > 0; mask >>= 1) {
-        x = fmaxf(x, __shfl_xor_sync(0xffffffff, x, mask, 32));
-    }
-    return x;
-}
-
-static __device__ __forceinline__ half ggml_cuda_hmax(const half a, const half b) {
-#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))
-
-#if CUDART_VERSION >= CUDART_HMAX
-    return __hmax(a, b);
-#else
-    return __half2float(a) > __half2float(b) ? a : b;
-#endif // CUDART_VERSION >= CUDART_HMAX
-
-#else
-    GGML_UNUSED(a);
-    GGML_UNUSED(b);
-    NO_DEVICE_CODE;
-#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && CUDART_VERSION < CUDART_HMAX
-}
-
-static __device__ __forceinline__ half2 ggml_cuda_hmax2(const half2 a, const half2 b) {
-#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))
-
-#if CUDART_VERSION >= CUDART_HMAX
-    return __hmax2(a, b);
-#else
-    half2 ret;
-    reinterpret_cast<half&>(ret.x) =  __low2float(a) >  __low2float(b) ?  __low2half(a) :  __low2half(b);
-    reinterpret_cast<half&>(ret.y) = __high2float(a) > __high2float(b) ? __high2half(a) : __high2half(b);
-    return ret;
-#endif // CUDART_VERSION >= CUDART_HMAX
-
-#else
-    GGML_UNUSED(a);
-    GGML_UNUSED(b);
-    NO_DEVICE_CODE;
-#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && CUDART_VERSION < CUDART_HMAX
-}
-
-static __device__ __forceinline__ half2 warp_reduce_max(half2 x) {
-#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL
-#pragma unroll
-    for (int mask = 16; mask > 0; mask >>= 1) {
-        x = ggml_cuda_hmax2(x, __shfl_xor_sync(0xffffffff, x, mask, 32));
-    }
-    return x;
-#else
-    GGML_UNUSED(x);
-    NO_DEVICE_CODE;
-#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL
-}
-
-#if CUDART_VERSION < CUDART_HMASK
-static __device__ __forceinline__ uint32_t __hgt2_mask(const half2 a, const half2 b) {
-    const uint32_t mask_low  = 0x0000FFFF * (float( __low2half(a)) > float( __low2half(b)));
-    const uint32_t mask_high = 0xFFFF0000 * (float(__high2half(a)) > float(__high2half(b)));
-    return mask_low | mask_high;
-}
-#endif // CUDART_VERSION < 12000
-
 #if defined(GGML_USE_HIPBLAS)
 #define __CUDA_ARCH__ 1300
@@ -433,11 +317,143 @@ static __device__ __forceinline__ int __dp4a(const int a, const int b, int c) {
 }
 #endif // defined(GGML_USE_HIPBLAS)

-#define FP16_AVAILABLE     defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) ? \
-    defined(RDNA1) || defined(RDNA2) || defined(RDNA3) : __CUDA_ARCH__ >= CC_PASCAL
+#define FP16_AVAILABLE     (defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) || __CUDA_ARCH__ >= CC_PASCAL
+
 #define FP16_MMA_AVAILABLE !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_VOLTA

+static bool fp16_mma_available(const int cc) {
+    return cc < CC_OFFSET_AMD && cc >= CC_VOLTA;
+}
+
+[[noreturn]]
+static __device__ void no_device_code(
+    const char * file_name, const int line, const char * function_name, const int arch, const char * arch_list) {
+
+#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+    printf("%s:%d: ERROR: HIP kernel %s has no device code compatible with HIP arch %d.\n",
+           file_name, line, function_name, arch);
+    GGML_UNUSED(arch_list);
+#else
+    printf("%s:%d: ERROR: CUDA kernel %s has no device code compatible with CUDA arch %d. ggml-cuda.cu was compiled for: %s\n",
+           file_name, line, function_name, arch, arch_list);
+#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+    __trap();
+
+    GGML_UNUSED(no_device_code); // suppress unused function warning
+}
+
+#ifdef __CUDA_ARCH__
+#define NO_DEVICE_CODE no_device_code(__FILE__, __LINE__, __FUNCTION__, __CUDA_ARCH__, STRINGIZE(__CUDA_ARCH_LIST__))
+#else
+#define NO_DEVICE_CODE //GGML_ASSERT(false && "NO_DEVICE_CODE not valid in host code.")
+#endif // __CUDA_ARCH__
+
+static __device__ __forceinline__ float warp_reduce_sum(float x) {
+#pragma unroll
+    for (int mask = 16; mask > 0; mask >>= 1) {
+        x += __shfl_xor_sync(0xffffffff, x, mask, 32);
+    }
+    return x;
+}
+
+static __device__ __forceinline__ float2 warp_reduce_sum(float2 a) {
+#pragma unroll
+    for (int mask = 16; mask > 0; mask >>= 1) {
+        a.x += __shfl_xor_sync(0xffffffff, a.x, mask, 32);
+        a.y += __shfl_xor_sync(0xffffffff, a.y, mask, 32);
+    }
+    return a;
+}
+
+static __device__ __forceinline__ half2 warp_reduce_sum(half2 a) {
+#if FP16_AVAILABLE
+
+#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+#pragma unroll
+    for (int mask = 16; mask > 0; mask >>= 1) {
+        const half2 a_other = __shfl_xor_sync(0xffffffff, a, mask, 32);
+        reinterpret_cast<half&>(a.x) +=  __low2half(a_other);
+        reinterpret_cast<half&>(a.y) += __high2half(a_other);
+    }
+    return a;
+#else
+#pragma unroll
+    for (int mask = 16; mask > 0; mask >>= 1) {
+        a = __hadd2(a, __shfl_xor_sync(0xffffffff, a, mask, 32));
+    }
+    return a;
+#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
+
+#else
+    NO_DEVICE_CODE;
+    return a;
+#endif // FP16_AVAILABLE
+}
+
+static __device__ __forceinline__ float warp_reduce_max(float x) {
+#pragma unroll
+    for (int mask = 16; mask > 0; mask >>= 1) {
+        x = fmaxf(x, __shfl_xor_sync(0xffffffff, x, mask, 32));
+    }
+    return x;
+}
+
+static __device__ __forceinline__ half ggml_cuda_hmax(const half a, const half b) {
+#if FP16_AVAILABLE
+
+#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && CUDART_VERSION < CUDART_HMAX
+    return __float2half(fmaxf(__half2float(a), __half2float(b)));
+#else
+    return __hmax(a, b);
+#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && CUDART_VERSION < CUDART_HMAX
+
+#else
+    NO_DEVICE_CODE;
+    GGML_UNUSED(b);
+    return a;
+#endif // FP16_AVAILABLE
+}
+
+static __device__ __forceinline__ half2 ggml_cuda_hmax2(const half2 a, const half2 b) {
+#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))
+
+#if CUDART_VERSION >= CUDART_HMAX
+    return __hmax2(a, b);
+#else
+    half2 ret;
+    reinterpret_cast<half&>(ret.x) = __float2half(fmaxf( __low2float(a),  __low2float(b)));
+    reinterpret_cast<half&>(ret.y) = __float2half(fmaxf(__high2float(a), __high2float(b)));
+    return ret;
+#endif // CUDART_VERSION >= CUDART_HMAX
+
+#else
+    GGML_UNUSED(a);
+    GGML_UNUSED(b);
+    NO_DEVICE_CODE;
+#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))
+}
+
+static __device__ __forceinline__ half2 warp_reduce_max(half2 x) {
+#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL
+#pragma unroll
+    for (int mask = 16; mask > 0; mask >>= 1) {
+        x = ggml_cuda_hmax2(x, __shfl_xor_sync(0xffffffff, x, mask, 32));
+    }
+    return x;
+#else
+    GGML_UNUSED(x);
+    NO_DEVICE_CODE;
+#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= CC_PASCAL
+}
+
+#if CUDART_VERSION < CUDART_HMASK
+static __device__ __forceinline__ uint32_t __hgt2_mask(const half2 a, const half2 b) {
+    const uint32_t mask_low  = 0x0000FFFF * (float( __low2half(a)) > float( __low2half(b)));
+    const uint32_t mask_high = 0xFFFF0000 * (float(__high2half(a)) > float(__high2half(b)));
+    return mask_low | mask_high;
+}
+#endif // CUDART_VERSION < 12000
+
 // TODO: move to ggml-common.h
 static const __device__ int8_t kvalues_iq4nl[16] = {-127, -104, -83, -65, -49, -35, -22, -10, 1, 13, 25, 38, 53, 69, 89, 113};

@@ -11,8 +11,10 @@
 #define HALF_MAX_HALF         __float2half(65504.0f/2) // Use neg. of this instead of -INFINITY to initialize KQ max vals to avoid NaN upon subtraction.
 #define SOFTMAX_FTZ_THRESHOLD -20.0f                   // Softmax exp. of values smaller than this are flushed to zero to avoid NaNs.

-template<int D, int parallel_blocks> // D == head size
-__launch_bounds__(((D + WARP_SIZE - 1) / WARP_SIZE)*WARP_SIZE, 1)
+template<int D, int ncols, int parallel_blocks> // D == head size
+#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))
+__launch_bounds__(D, 1)
+#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))
 static __global__ void flash_attn_vec_ext_f16(
         const char * __restrict__ Q,
         const char * __restrict__ K,
@@ -21,6 +23,10 @@ static __global__ void flash_attn_vec_ext_f16(
         float      * __restrict__ dst,
         float2     * __restrict__ dst_meta,
         const float scale,
+        const float max_bias,
+        const float m0,
+        const float m1,
+        const uint32_t n_head_log2,
         const int ne00,
         const int ne01,
         const int ne02,
@@ -44,55 +50,89 @@ static __global__ void flash_attn_vec_ext_f16(
 #if FP16_AVAILABLE
     //In this kernel Q, K, V are matrices while i, j, k are matrix indices.

-    const int ic = blockIdx.x / parallel_blocks; // Index of the Q/QKV column to work on.
+    const int ic0 = (blockIdx.x / parallel_blocks) * ncols; // Index of the Q/QKV column to work on.
     const int ip = blockIdx.x % parallel_blocks; // Index in group of blocks running for the same column in parallel.

     const int gqa_ratio = ne02 / ne12; // With grouped query attention there are > 1 Q matrices per K, V matrix.
-    const float2 * Q_f2  = (const float2 *) (Q + nb02* blockIdx.y              + nb01*ic);
+    const float2 * Q_f2  = (const float2 *) (Q + nb02* blockIdx.y              + nb01*ic0);
     const half2  * K_h2  = (const half2  *) (K + nb12*(blockIdx.y / gqa_ratio));
     const half   * V_h   = (const half   *) (V + nb12*(blockIdx.y / gqa_ratio)); // K and V have same shape
-    const half   * maskh = (const half   *)  mask + ne11*ic;
+    const half   * maskh = (const half   *)  mask + ne11*ic0;

     const int stride_KV  = nb11 / sizeof(half);
     const int stride_KV2 = nb11 / sizeof(half2);

-    constexpr int nwarps = (D + WARP_SIZE - 1) / WARP_SIZE;
-    const int tid = WARP_SIZE*threadIdx.y + threadIdx.x;
-    __builtin_assume(tid < nwarps*WARP_SIZE);
-
-    __shared__ half KQ[nwarps*WARP_SIZE];
-    KQ[tid] = -INFINITY;
-    half2 * KQ2 = (half2 *) KQ;
-
-    half kqmax = -HALF_MAX_HALF;
-    half kqsum = 0.0f;
-
-    __shared__ half kqmax_shared[WARP_SIZE];
-    __shared__ half kqsum_shared[WARP_SIZE];
-    if (threadIdx.y == 0) {
-        kqmax_shared[threadIdx.x] = -HALF_MAX_HALF;
-        kqsum_shared[threadIdx.x] = 0.0f;
-    }
+    half slopeh = __float2half(1.0f);
+
+    // ALiBi
+    if (max_bias > 0.0f) {
+        const int h = blockIdx.y;
+
+        const float base = h < n_head_log2 ? m0 : m1;
+        const int   exph = h < n_head_log2 ? h + 1 : 2*(h - n_head_log2) + 1;
+
+        slopeh = __float2half(powf(base, exph));
+    }
+
+    static_assert(D % (2*WARP_SIZE) == 0, "D not divisible by 2*WARP_SIZE == 64.");
+    constexpr int nwarps = D / WARP_SIZE;
+    const int tid = WARP_SIZE*threadIdx.y + threadIdx.x;
+    __builtin_assume(tid < D);
+
+    __shared__ half KQ[ncols*D];
+#pragma unroll
+    for (int j = 0; j < ncols; ++j) {
+        KQ[j*D + tid] = -HALF_MAX_HALF;
+    }
+    half2 * KQ2 = (half2 *) KQ;
+
+    half kqmax[ncols];
+#pragma unroll
+    for (int j = 0; j < ncols; ++j) {
+        kqmax[j] = -HALF_MAX_HALF;
+    }
+    half kqsum[ncols] = {0.0f};
+
+    __shared__ half kqmax_shared[ncols][WARP_SIZE];
+    __shared__ half kqsum_shared[ncols][WARP_SIZE];
+#pragma unroll
+    for (int j = 0; j < ncols; ++j) {
+        if (threadIdx.y == 0) {
+            kqmax_shared[j][threadIdx.x] = -HALF_MAX_HALF;
+            kqsum_shared[j][threadIdx.x] = 0.0f;
+        }
+    }
     __syncthreads();

     // Convert Q to half2 and store in registers:
-    half2 Q_h2[(D/2 + WARP_SIZE - 1) / WARP_SIZE];
-#pragma unroll
-    for (int i0 = 0; i0 < D/2; i0 += WARP_SIZE) {
-        const int i = i0 + threadIdx.x;
-        if (i0 + WARP_SIZE > D/2 && i >= D/2) {
-            break;
-        }
-
-        Q_h2[i0/WARP_SIZE] = make_half2(scale, scale) * make_half2(Q_f2[i].x, Q_f2[i].y);
-    }
-
-    half2 VKQ = make_half2(0.0f, 0.0f); // Each thread calculates a single VKQ value.
+    half2 Q_h2[ncols][D/(2*WARP_SIZE)];
+#pragma unroll
+    for (int j = 0; j < ncols; ++j) {
+#pragma unroll
+        for (int i0 = 0; i0 < D/2; i0 += WARP_SIZE) {
+            const int i = i0 + threadIdx.x;
+
+            const float2 tmp = Q_f2[j*(nb01/sizeof(float2)) + i];
+            Q_h2[j][i0/WARP_SIZE] = make_half2(scale, scale) * make_half2(tmp.x, tmp.y);
+        }
+    }
+
+    half2 VKQ[ncols] = {{0.0f, 0.0f}}; // Each thread calculates a single VKQ value.

     const int k_start  = parallel_blocks == 1 ? 0 : ip*D;
     for (int k_VKQ_0 = k_start; k_VKQ_0 < ne11; k_VKQ_0 += parallel_blocks*D) {
         // Calculate KQ tile and keep track of new maximum KQ values:
-        half kqmax_new = kqmax;
+
+        // For unknown reasons using a half array of size 1 for kqmax_new causes a performance regression,
+        // see https://github.com/ggerganov/llama.cpp/pull/7061 .
+        // Therefore this variable is defined twice but only used once (so that the compiler can optimize out the unused variable).
+        half kqmax_new = kqmax[0];
+        half kqmax_new_arr[ncols];
+#pragma unroll
+        for (int j = 0; j < ncols; ++j) {
+            kqmax_new_arr[j] = kqmax[j];
+        }
+
 #pragma unroll
         for (int i_KQ_0 = 0; i_KQ_0 < D; i_KQ_0 += nwarps) {
             const int i_KQ = i_KQ_0 + threadIdx.y;
@@ -101,47 +141,65 @@ static __global__ void flash_attn_vec_ext_f16(
                 break;
             }

-            half2 sum2 = make_half2(0.0f, 0.0f);
+            half2 sum2[ncols] = {{0.0f, 0.0f}};
 #pragma unroll
             for (int k_KQ_0 = 0; k_KQ_0 < D/2; k_KQ_0 += WARP_SIZE) {
                 const int k_KQ = k_KQ_0 + threadIdx.x;
-                if (k_KQ_0 + WARP_SIZE > D/2 && k_KQ >= D/2) {
-                    break;
-                }

                 const half2 K_ik = K_h2[(k_VKQ_0 + i_KQ)*stride_KV2 + k_KQ];
-                sum2 += K_ik * Q_h2[k_KQ_0/WARP_SIZE];
+#pragma unroll
+                for (int j = 0; j < ncols; ++j) {
+                    sum2[j] += K_ik * Q_h2[j][k_KQ_0/WARP_SIZE];
+                }
             }

-            sum2 = warp_reduce_sum(sum2);
-            half sum = __low2half(sum2) + __high2half(sum2);
-            sum += mask ? maskh[k_VKQ_0 + i_KQ] : __float2half(0.0f);
-            kqmax_new = ggml_cuda_hmax(kqmax_new, sum);
-            if (threadIdx.x == 0) {
-                KQ[i_KQ] = sum;
+#pragma unroll
+            for (int j = 0; j < ncols; ++j) {
+                sum2[j] = warp_reduce_sum(sum2[j]);
+                half sum = __low2half(sum2[j]) + __high2half(sum2[j]);
+                sum += mask ? slopeh*maskh[j*ne11 + k_VKQ_0 + i_KQ] : __float2half(0.0f);
+
+                if (ncols == 1) {
+                    kqmax_new        = ggml_cuda_hmax(kqmax_new, sum);
+                } else {
+                    kqmax_new_arr[j] = ggml_cuda_hmax(kqmax_new_arr[j], sum);
+                }
+
+                if (threadIdx.x == 0) {
+                    KQ[j*D + i_KQ] = sum;
+                }
             }
         }

-        kqmax_new = warp_reduce_max(kqmax_new);
-        if (threadIdx.x == 0) {
-            kqmax_shared[threadIdx.y] = kqmax_new;
-        }
-        __syncthreads();
-        kqmax_new = kqmax_shared[threadIdx.x];
-        kqmax_new = warp_reduce_max(kqmax_new);
-
-        const half KQ_max_scale = hexp(kqmax - kqmax_new);
-        kqmax = kqmax_new;
-
-        const half val = hexp(KQ[tid] - kqmax);
-        kqsum = kqsum*KQ_max_scale + val;
-        KQ[tid] = val;
-
-        VKQ *= __half2half2(KQ_max_scale);
+#pragma unroll
+        for (int j = 0; j < ncols; ++j) {
+            half kqmax_new_j = ncols == 1 ? kqmax_new : kqmax_new_arr[j];
+
+            kqmax_new_j = warp_reduce_max(kqmax_new_j);
+            if (threadIdx.x == 0) {
+                kqmax_shared[j][threadIdx.y] = kqmax_new_j;
+            }
+        }
+
+        __syncthreads();
+
+#pragma unroll
+        for (int j = 0; j < ncols; ++j) {
+            half kqmax_new_j = kqmax_shared[j][threadIdx.x];
+            kqmax_new_j = warp_reduce_max(kqmax_new_j);
+
+            const half KQ_max_scale = hexp(kqmax[j] - kqmax_new_j);
+            kqmax[j] = kqmax_new_j;
+
+            const half val = hexp(KQ[j*D + tid] - kqmax[j]);
+            kqsum[j] = kqsum[j]*KQ_max_scale + val;
+            KQ[j*D + tid] = val;
+
+            VKQ[j] *= __half2half2(KQ_max_scale);
+        }

         __syncthreads();

-        if (tid < D) {
 #pragma unroll
         for (int k0 = 0; k0 < D; k0 += 2) {
             if (FATTN_KQ_STRIDE % D != 0 && k_VKQ_0 + k0 >= ne11) {
@@ -151,39 +209,44 @@ static __global__ void flash_attn_vec_ext_f16(
             half2 V_k;
             reinterpret_cast<half&>(V_k.x) = V_h[(k_VKQ_0 + k0 + 0)*stride_KV + tid];
             reinterpret_cast<half&>(V_k.y) = V_h[(k_VKQ_0 + k0 + 1)*stride_KV + tid];
-            VKQ += V_k*KQ2[k0/2];
+#pragma unroll
+            for (int j = 0; j < ncols; ++j) {
+                VKQ[j] += V_k*KQ2[j*(D/2) + k0/2];
+            }
         }

         __syncthreads();
     }

-    if (tid >= D) {
-        kqsum = 0.0f;
-    }
-
-    kqsum = warp_reduce_sum(kqsum);
-    if (threadIdx.x == 0) {
-        kqsum_shared[threadIdx.y] = kqsum;
-    }
+#pragma unroll
+    for (int j = 0; j < ncols; ++j) {
+        kqsum[j] = warp_reduce_sum(kqsum[j]);
+        if (threadIdx.x == 0) {
+            kqsum_shared[j][threadIdx.y] = kqsum[j];
+        }
+    }
+
     __syncthreads();
-    kqsum = kqsum_shared[threadIdx.x];
-    kqsum = warp_reduce_sum(kqsum);

-    if (tid >= D) {
-        return;
-    }
-
-    half dst_val = (__low2half(VKQ) + __high2half(VKQ));
-    if (parallel_blocks == 1) {
-        dst_val /= kqsum;
-    }
-    dst[D*gridDim.y*blockIdx.x + D*blockIdx.y + tid] = dst_val;
+#pragma unroll
+    for (int j_VKQ = 0; j_VKQ < ncols; ++j_VKQ) {
+        kqsum[j_VKQ] = kqsum_shared[j_VKQ][threadIdx.x];
+        kqsum[j_VKQ] = warp_reduce_sum(kqsum[j_VKQ]);
+
+        half dst_val = (__low2half(VKQ[j_VKQ]) + __high2half(VKQ[j_VKQ]));
+        if (parallel_blocks == 1) {
+            dst_val /= kqsum[j_VKQ];
+        }
+        const int j_dst = (ic0 + j_VKQ)*parallel_blocks + ip;
+        dst[j_dst*D*gridDim.y + D*blockIdx.y + tid] = dst_val;
+    }

-    if (parallel_blocks == 1 || tid != 0) {
-        return;
+    if (parallel_blocks != 1 && tid != 0) {
+#pragma unroll
+        for (int j = 0; j < ncols; ++j) {
+            dst_meta[(ic0 + j)*gridDim.y*parallel_blocks + blockIdx.y*parallel_blocks + ip] = make_float2(kqmax[j], kqsum[j]);
+        }
     }
-    dst_meta[ic*gridDim.y*parallel_blocks + blockIdx.y*parallel_blocks + ip] = make_float2(kqmax, kqsum);
 #else
    NO_DEVICE_CODE;
 #endif // FP16_AVAILABLE
@@ -191,7 +254,9 @@ static __global__ void flash_attn_vec_ext_f16(

 // D == head size, VKQ_stride == num VKQ rows calculated in parallel:
 template<int D, int ncols, int nwarps, int VKQ_stride, int parallel_blocks, typename KQ_acc_t>
+#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))
 __launch_bounds__(nwarps*WARP_SIZE, 1)
+#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))
 static __global__ void flash_attn_ext_f16(
         const char * __restrict__ Q,
         const char * __restrict__ K,
@@ -200,6 +265,10 @@ static __global__ void flash_attn_ext_f16(
         float      * __restrict__ dst,
         float2     * __restrict__ dst_meta,
         const float scale,
+        const float max_bias,
+        const float m0,
+        const float m1,
+        const uint32_t n_head_log2,
         const int ne00,
         const int ne01,
         const int ne02,
@@ -256,6 +325,20 @@ static __global__ void flash_attn_ext_f16(
     const int stride_Q  = nb01 / sizeof(float);
     const int stride_KV = nb11 / sizeof(half);

+    half  slopeh = __float2half(1.0f);
+    half2 slope2 = make_half2(1.0f, 1.0f);
+
+    // ALiBi
+    if (max_bias > 0.0f) {
+        const int h = blockIdx.y;
+
+        const float base = h < n_head_log2 ? m0 : m1;
+        const int   exph = h < n_head_log2 ? h + 1 : 2*(h - n_head_log2) + 1;
+
+        slopeh = __float2half(powf(base, exph));
+        slope2 = make_half2(slopeh, slopeh);
+    }
+
     frag_b Q_b[D/16][ncols/frag_n];

     // A single buffer for temporarily holding tiles of KQ and VKQ parts:
@@ -372,7 +455,7 @@ static __global__ void flash_attn_ext_f16(
             for (int k0 = 0; k0 < FATTN_KQ_STRIDE; k0 += WARP_SIZE) {
                 const int k = k0 + threadIdx.x;

-                KQ_f_tmp[k0/WARP_SIZE] += mask ? __half2float(maskh[j*(nb31/sizeof(half)) + k_VKQ_0 + k]) : 0.0f;
+                KQ_f_tmp[k0/WARP_SIZE] += mask ? __half2float(slopeh*maskh[j*(nb31/sizeof(half)) + k_VKQ_0 + k]) : 0.0f;
                 KQ_max_new = max(KQ_max_new, KQ_f_tmp[k0/WARP_SIZE]);
             }
             KQ_max_new = warp_reduce_max(KQ_max_new);
@@ -415,7 +498,7 @@ static __global__ void flash_attn_ext_f16(
             for (int k0 = 0; k0 < FATTN_KQ_STRIDE/2; k0 += WARP_SIZE) {
                 const int k = k0 + threadIdx.x;

-                KQ2_tmp[k0/WARP_SIZE] += mask ? mask2[(j*ne11 + k_VKQ_0)/2 + k] : make_half2(0.0f, 0.0f);
+                KQ2_tmp[k0/WARP_SIZE] += mask ? slope2*mask2[(j*ne11 + k_VKQ_0)/2 + k] : make_half2(0.0f, 0.0f);
                 KQ_max_new = ggml_cuda_hmax2(KQ_max_new, KQ2_tmp[k0/WARP_SIZE]);
             }
             KQ_max_new = __half2half2(warp_reduce_max(ggml_cuda_hmax(__low2half(KQ_max_new), __high2half(KQ_max_new))));
@@ -573,7 +656,9 @@ static __global__ void flash_attn_ext_f16(
 }

 template<int D, int parallel_blocks> // D == head size
+#if !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))
 __launch_bounds__(D, 1)
+#endif // !(defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__))
 static __global__ void flash_attn_combine_results(
         const float  * __restrict__ VKQ_parts,
         const float2 * __restrict__ VKQ_meta,
@@ -642,7 +727,7 @@ static_assert(get_VKQ_stride( 80, 1, 16) == 16, "Test failed.");
 static_assert(get_VKQ_stride( 80, 2, 16) == 16, "Test failed.");
 static_assert(get_VKQ_stride( 80, 4, 16) == 16, "Test failed.");

-template <int D, int parallel_blocks> void launch_fattn_vec_f16(
+template <int D, int cols_per_block, int parallel_blocks> void launch_fattn_vec_f16(
         const ggml_tensor * Q, const ggml_tensor * K, const ggml_tensor * V, ggml_tensor * KQV, const ggml_tensor * mask,
         ggml_cuda_pool & pool, cudaStream_t main_stream
 ) {
@@ -656,20 +741,29 @@ template <int D, int parallel_blocks> void launch_fattn_vec_f16(
     constexpr int nwarps = (D + WARP_SIZE - 1) / WARP_SIZE;
     const dim3 block_dim(WARP_SIZE, nwarps, 1);
-    const dim3 blocks_num(parallel_blocks*Q->ne[1], Q->ne[2], Q->ne[3]);
+    const dim3 blocks_num(parallel_blocks*((Q->ne[1] + cols_per_block - 1) / cols_per_block), Q->ne[2], Q->ne[3]);
     const int  shmem = 0;

-    float scale;
-    memcpy(&scale, KQV->op_params, sizeof(float));
+    float scale    = 1.0f;
+    float max_bias = 0.0f;
+
+    memcpy(&scale,    (float *) KQV->op_params + 0, sizeof(float));
+    memcpy(&max_bias, (float *) KQV->op_params + 1, sizeof(float));
+
+    const uint32_t n_head      = Q->ne[2];
+    const uint32_t n_head_log2 = 1u << (uint32_t) floorf(log2f((float) n_head));
+
+    const float m0 = powf(2.0f, -(max_bias       ) / n_head_log2);
+    const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_head_log2);

-    flash_attn_vec_ext_f16<D, parallel_blocks>
+    flash_attn_vec_ext_f16<D, cols_per_block, parallel_blocks>
         <<<blocks_num, block_dim, shmem, main_stream>>> (
             (const char *) Q->data,
             (const char *) K->data,
             (const char *) V->data,
             mask ? ((const char *) mask->data) : nullptr,
             parallel_blocks == 1 ? (float *) KQV->data : dst_tmp.ptr, dst_tmp_meta.ptr,
-            scale,
+            scale, max_bias, m0, m1, n_head_log2,
             Q->ne[0], Q->ne[1], Q->ne[2], Q->ne[3],
             K->ne[0], K->ne[1], K->ne[2], K->ne[3],
             mask ? mask->ne[1] : 0, mask ? mask->nb[1] : 0,
@@ -710,8 +804,17 @@ template <int D, int cols_per_block, int nwarps, int parallel_blocks, typename K
     const dim3 blocks_num(parallel_blocks*(Q->ne[1] + cols_per_block - 1) / cols_per_block, Q->ne[2], Q->ne[3]);
     const int  shmem = 0;

-    float scale;
-    memcpy(&scale, KQV->op_params, sizeof(float));
+    float scale    = 1.0f;
+    float max_bias = 0.0f;
+
+    memcpy(&scale,    (float *) KQV->op_params + 0, sizeof(float));
+    memcpy(&max_bias, (float *) KQV->op_params + 1, sizeof(float));
+
+    const uint32_t n_head      = Q->ne[2];
+    const uint32_t n_head_log2 = 1u << (uint32_t) floorf(log2f((float) n_head));
+
+    const float m0 = powf(2.0f, -(max_bias       ) / n_head_log2);
+    const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_head_log2);

     flash_attn_ext_f16<D, cols_per_block, nwarps, get_VKQ_stride(D, nwarps, frag_m), parallel_blocks, KQ_acc_t>
         <<<blocks_num, block_dim, shmem, main_stream>>> (
@@ -720,7 +823,7 @@ template <int D, int cols_per_block, int nwarps, int parallel_blocks, typename K
             (const char *) V->data,
             mask ? ((const char *) mask->data) : nullptr,
             (parallel_blocks) == 1 ? (float *) KQV->data : dst_tmp.ptr, dst_tmp_meta.ptr,
-            scale,
+            scale, max_bias, m0, m1, n_head_log2,
             Q->ne[0], Q->ne[1], Q->ne[2], Q->ne[3],
             K->ne[0], K->ne[1], K->ne[2], K->ne[3],
             mask ? mask->ne[1] : 0, mask ? mask->nb[1] : 0,
@@ -783,9 +886,98 @@ void ggml_cuda_flash_attn_ext(ggml_backend_cuda_context & ctx, ggml_tensor * dst
     ggml_cuda_set_device(ctx.device);

+    const int cc  = ggml_cuda_info().devices[ggml_cuda_get_device()].cc;
     const int nsm = ggml_cuda_info().devices[ggml_cuda_get_device()].nsm;

-    const int32_t precision = KQV->op_params[1];
+    const int32_t precision = KQV->op_params[2];
+
+    if (!fp16_mma_available(cc)) {
+        GGML_ASSERT(precision == GGML_PREC_DEFAULT);
+        GGML_ASSERT(Q->ne[0] == 64 || Q->ne[0] == 128 && "FlashAttention without tensor cores only supports head sizes 64 and 128.");
+
+        if (Q->ne[1] == 1) {
+            constexpr int cols_per_block  = 1;
+            constexpr int parallel_blocks = 4;
+            switch (Q->ne[0]) {
+                case 64:
+                    launch_fattn_vec_f16< 64, cols_per_block, parallel_blocks>(Q, K, V, KQV, mask, ctx.pool(), ctx.stream());
+                    break;
+                case 128:
+                    launch_fattn_vec_f16<128, cols_per_block, parallel_blocks>(Q, K, V, KQV, mask, ctx.pool(), ctx.stream());
+                    break;
+                default:
+                    GGML_ASSERT(false);
+                    break;
+            }
+            return;
+        }
+
+        if (Q->ne[1] == 2) {
+            constexpr int cols_per_block  = 2;
+            constexpr int parallel_blocks = 4;
+            switch (Q->ne[0]) {
+                case 64:
+                    launch_fattn_vec_f16< 64, cols_per_block, parallel_blocks>(Q, K, V, KQV, mask, ctx.pool(), ctx.stream());
+                    break;
+                case 128:
+                    launch_fattn_vec_f16<128, cols_per_block, parallel_blocks>(Q, K, V, KQV, mask, ctx.pool(), ctx.stream());
+                    break;
+                default:
+                    GGML_ASSERT(false);
+                    break;
+            }
+            return;
+        }
+
+        if (Q->ne[1] <= 4) {
+            constexpr int cols_per_block  = 4;
+            constexpr int parallel_blocks = 4;
+            switch (Q->ne[0]) {
+                case 64:
+                    launch_fattn_vec_f16< 64, cols_per_block, parallel_blocks>(Q, K, V, KQV, mask, ctx.pool(), ctx.stream());
+                    break;
+                case 128:
+                    launch_fattn_vec_f16<128, cols_per_block, parallel_blocks>(Q, K, V, KQV, mask, ctx.pool(), ctx.stream());
+                    break;
+                default:
+                    GGML_ASSERT(false);
+                    break;
+            }
+            return;
+        }
+
+        if (Q->ne[1] <= 8) {
+            constexpr int cols_per_block  = 8;
+            constexpr int parallel_blocks = 4;
+            switch (Q->ne[0]) {
+                case 64:
+                    launch_fattn_vec_f16< 64, cols_per_block, parallel_blocks>(Q, K, V, KQV, mask, ctx.pool(), ctx.stream());
+                    break;
+                case 128:
+                    launch_fattn_vec_f16<128, cols_per_block, parallel_blocks>(Q, K, V, KQV, mask, ctx.pool(), ctx.stream());
+                    break;
+                default:
+                    GGML_ASSERT(false);
+                    break;
+            }
+            return;
+        }
+
+        constexpr int cols_per_block  = 8;
+        constexpr int parallel_blocks = 1;
+        switch (Q->ne[0]) {
+            case 64:
+                launch_fattn_vec_f16< 64, cols_per_block, parallel_blocks>(Q, K, V, KQV, mask, ctx.pool(), ctx.stream());
+                break;
+            case 128:
+                launch_fattn_vec_f16<128, cols_per_block, parallel_blocks>(Q, K, V, KQV, mask, ctx.pool(), ctx.stream());
+                break;
+            default:
+                GGML_ASSERT(false);
+                break;
+        }
+        return;
+    }

     if (precision != GGML_PREC_DEFAULT) {
         if (Q->ne[1] <= 32 || Q->ne[0] > 128) {
@@ -845,16 +1037,17 @@ void ggml_cuda_flash_attn_ext(ggml_backend_cuda_context & ctx, ggml_tensor * dst
     }

     if (Q->ne[1] == 1 && Q->ne[0] % (2*WARP_SIZE) == 0) {
+        constexpr int cols_per_block  = 1;
         constexpr int parallel_blocks = 4;
         switch (Q->ne[0]) {
             case 64:
-                launch_fattn_vec_f16< 64, parallel_blocks>(Q, K, V, KQV, mask, ctx.pool(), ctx.stream());
+                launch_fattn_vec_f16< 64, cols_per_block, parallel_blocks>(Q, K, V, KQV, mask, ctx.pool(), ctx.stream());
                 break;
             case 128:
-                launch_fattn_vec_f16<128, parallel_blocks>(Q, K, V, KQV, mask, ctx.pool(), ctx.stream());
+                launch_fattn_vec_f16<128, cols_per_block, parallel_blocks>(Q, K, V, KQV, mask, ctx.pool(), ctx.stream());
                 break;
             case 256:
-                launch_fattn_vec_f16<256, parallel_blocks>(Q, K, V, KQV, mask, ctx.pool(), ctx.stream());
+                launch_fattn_vec_f16<256, cols_per_block, parallel_blocks>(Q, K, V, KQV, mask, ctx.pool(), ctx.stream());
                 break;
             default:
                 GGML_ASSERT(false);
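Both launchers above now derive the same ALiBi parameters on the host and pass them to the kernels, which reconstruct a per-head slope from max_bias, m0, m1 and n_head_log2. As a standalone illustration in plain host C++ (the values of n_head and max_bias are examples, not taken from the diff):

```cpp
#include <cmath>
#include <cstdint>
#include <cstdio>

// Mirrors the host-side setup in the launchers and the per-head slope
// computation inside the kernels.
int main() {
    const uint32_t n_head   = 12;   // example head count
    const float    max_bias = 8.0f; // example ALiBi max bias

    const uint32_t n_head_log2 = 1u << (uint32_t) floorf(log2f((float) n_head)); // largest power of two <= n_head
    const float m0 = powf(2.0f, -(max_bias       ) / n_head_log2);
    const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_head_log2);

    for (uint32_t h = 0; h < n_head; ++h) {
        const float base  = h < n_head_log2 ? m0 : m1;
        const int   exph  = h < n_head_log2 ? h + 1 : 2*(h - n_head_log2) + 1;
        const float slope = powf(base, exph);
        printf("head %2u: slope = %g\n", h, slope);
    }
    return 0;
}
```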

@ -11,7 +11,7 @@ __device__ float __forceinline__ t2f32<half>(half val) {
} }
template <bool vals_smem, int ncols_template, int block_size_template, typename T> template <bool vals_smem, int ncols_template, int block_size_template, typename T>
static __global__ void soft_max_f32(const float * x, const T * mask, const T * pos, float * dst, const int ncols_par, const int nrows_y, const float scale, const float max_bias, const float m0, const float m1, uint32_t n_head_log2) { static __global__ void soft_max_f32(const float * x, const T * mask, float * dst, const int ncols_par, const int nrows_y, const float scale, const float max_bias, const float m0, const float m1, uint32_t n_head_log2) {
const int ncols = ncols_template == 0 ? ncols_par : ncols_template; const int ncols = ncols_template == 0 ? ncols_par : ncols_template;
const int tid = threadIdx.x; const int tid = threadIdx.x;
@ -23,16 +23,16 @@ static __global__ void soft_max_f32(const float * x, const T * mask, const T * p
const int warp_id = threadIdx.x / WARP_SIZE; const int warp_id = threadIdx.x / WARP_SIZE;
const int lane_id = threadIdx.x % WARP_SIZE; const int lane_id = threadIdx.x % WARP_SIZE;
float slope = 0.0f; float slope = 1.0f;
// ALiBi // ALiBi
if (max_bias > 0.0f) { if (max_bias > 0.0f) {
const int h = rowx/nrows_y; // head index const int h = rowx/nrows_y; // head index
const float base = h < n_head_log2 ? m0 : m1; const float base = h < n_head_log2 ? m0 : m1;
const int exp = h < n_head_log2 ? h + 1 : 2*(h - n_head_log2) + 1; const int exph = h < n_head_log2 ? h + 1 : 2*(h - n_head_log2) + 1;
slope = powf(base, exp); slope = powf(base, exph);
} }
extern __shared__ float data_soft_max_f32[]; extern __shared__ float data_soft_max_f32[];
@ -53,7 +53,7 @@ static __global__ void soft_max_f32(const float * x, const T * mask, const T * p
const int64_t ix = (int64_t)rowx*ncols + col; const int64_t ix = (int64_t)rowx*ncols + col;
const int64_t iy = (int64_t)rowy*ncols + col; const int64_t iy = (int64_t)rowy*ncols + col;
const float val = x[ix]*scale + (mask ? t2f32(mask[iy]) : 0.0f) + (pos ? slope*t2f32(pos[col]) : 0.0f); const float val = x[ix]*scale + (mask ? slope*t2f32(mask[iy]) : 0.0f);
vals[col] = val; vals[col] = val;
max_val = max(max_val, val); max_val = max(max_val, val);
@ -125,7 +125,7 @@ static __global__ void soft_max_f32(const float * x, const T * mask, const T * p
} }
template<typename T> template<typename T>
static void soft_max_f32_cuda(const float * x, const T * mask, const T * pos, float * dst, const int ncols_x, const int nrows_x, const int nrows_y, const float scale, const float max_bias, cudaStream_t stream) { static void soft_max_f32_cuda(const float * x, const T * mask, float * dst, const int ncols_x, const int nrows_x, const int nrows_y, const float scale, const float max_bias, cudaStream_t stream) {
int nth = WARP_SIZE; int nth = WARP_SIZE;
while (nth < ncols_x && nth < CUDA_SOFT_MAX_BLOCK_SIZE) nth *= 2; while (nth < ncols_x && nth < CUDA_SOFT_MAX_BLOCK_SIZE) nth *= 2;
const dim3 block_dims(nth, 1, 1); const dim3 block_dims(nth, 1, 1);
@ -133,8 +133,8 @@ static void soft_max_f32_cuda(const float * x, const T * mask, const T * pos, fl
const size_t shmem = (GGML_PAD(ncols_x, WARP_SIZE) + WARP_SIZE)*sizeof(float); const size_t shmem = (GGML_PAD(ncols_x, WARP_SIZE) + WARP_SIZE)*sizeof(float);
static_assert(CUDA_SOFT_MAX_BLOCK_SIZE == 1024, "These values need to be adjusted."); static_assert(CUDA_SOFT_MAX_BLOCK_SIZE == 1024, "These values need to be adjusted.");
const uint32_t n_head_kv = nrows_x/nrows_y; const uint32_t n_head = nrows_x/nrows_y;
const uint32_t n_head_log2 = 1u << (uint32_t) floorf(log2f((float) n_head_kv)); const uint32_t n_head_log2 = 1u << (uint32_t) floorf(log2f((float) n_head));
const float m0 = powf(2.0f, -(max_bias ) / n_head_log2); const float m0 = powf(2.0f, -(max_bias ) / n_head_log2);
const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_head_log2); const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_head_log2);
@ -142,43 +142,42 @@ static void soft_max_f32_cuda(const float * x, const T * mask, const T * pos, fl
if (shmem < ggml_cuda_info().devices[ggml_cuda_get_device()].smpb) { if (shmem < ggml_cuda_info().devices[ggml_cuda_get_device()].smpb) {
switch (ncols_x) { switch (ncols_x) {
case 32: case 32:
soft_max_f32<true, 32, 32><<<block_nums, block_dims, shmem, stream>>>(x, mask, pos, dst, ncols_x, nrows_y, scale, max_bias, m0, m1, n_head_log2); soft_max_f32<true, 32, 32><<<block_nums, block_dims, shmem, stream>>>(x, mask, dst, ncols_x, nrows_y, scale, max_bias, m0, m1, n_head_log2);
break; break;
case 64: case 64:
soft_max_f32<true, 64, 64><<<block_nums, block_dims, shmem, stream>>>(x, mask, pos, dst, ncols_x, nrows_y, scale, max_bias, m0, m1, n_head_log2); soft_max_f32<true, 64, 64><<<block_nums, block_dims, shmem, stream>>>(x, mask, dst, ncols_x, nrows_y, scale, max_bias, m0, m1, n_head_log2);
break; break;
case 128: case 128:
soft_max_f32<true, 128, 128><<<block_nums, block_dims, shmem, stream>>>(x, mask, pos, dst, ncols_x, nrows_y, scale, max_bias, m0, m1, n_head_log2); soft_max_f32<true, 128, 128><<<block_nums, block_dims, shmem, stream>>>(x, mask, dst, ncols_x, nrows_y, scale, max_bias, m0, m1, n_head_log2);
break; break;
case 256: case 256:
soft_max_f32<true, 256, 256><<<block_nums, block_dims, shmem, stream>>>(x, mask, pos, dst, ncols_x, nrows_y, scale, max_bias, m0, m1, n_head_log2); soft_max_f32<true, 256, 256><<<block_nums, block_dims, shmem, stream>>>(x, mask, dst, ncols_x, nrows_y, scale, max_bias, m0, m1, n_head_log2);
break; break;
case 512: case 512:
soft_max_f32<true, 512, 512><<<block_nums, block_dims, shmem, stream>>>(x, mask, pos, dst, ncols_x, nrows_y, scale, max_bias, m0, m1, n_head_log2); soft_max_f32<true, 512, 512><<<block_nums, block_dims, shmem, stream>>>(x, mask, dst, ncols_x, nrows_y, scale, max_bias, m0, m1, n_head_log2);
break; break;
case 1024: case 1024:
soft_max_f32<true, 1024, 1024><<<block_nums, block_dims, shmem, stream>>>(x, mask, pos, dst, ncols_x, nrows_y, scale, max_bias, m0, m1, n_head_log2); soft_max_f32<true, 1024, 1024><<<block_nums, block_dims, shmem, stream>>>(x, mask, dst, ncols_x, nrows_y, scale, max_bias, m0, m1, n_head_log2);
break; break;
case 2048: case 2048:
soft_max_f32<true, 2048, 1024><<<block_nums, block_dims, shmem, stream>>>(x, mask, pos, dst, ncols_x, nrows_y, scale, max_bias, m0, m1, n_head_log2); soft_max_f32<true, 2048, 1024><<<block_nums, block_dims, shmem, stream>>>(x, mask, dst, ncols_x, nrows_y, scale, max_bias, m0, m1, n_head_log2);
break; break;
case 4096: case 4096:
soft_max_f32<true, 4096, 1024><<<block_nums, block_dims, shmem, stream>>>(x, mask, pos, dst, ncols_x, nrows_y, scale, max_bias, m0, m1, n_head_log2); soft_max_f32<true, 4096, 1024><<<block_nums, block_dims, shmem, stream>>>(x, mask, dst, ncols_x, nrows_y, scale, max_bias, m0, m1, n_head_log2);
break; break;
default: default:
soft_max_f32<true, 0, 0><<<block_nums, block_dims, shmem, stream>>>(x, mask, pos, dst, ncols_x, nrows_y, scale, max_bias, m0, m1, n_head_log2); soft_max_f32<true, 0, 0><<<block_nums, block_dims, shmem, stream>>>(x, mask, dst, ncols_x, nrows_y, scale, max_bias, m0, m1, n_head_log2);
break; break;
} }
} else { } else {
const size_t shmem_low = WARP_SIZE*sizeof(float); const size_t shmem_low = WARP_SIZE*sizeof(float);
soft_max_f32<false, 0, 0><<<block_nums, block_dims, shmem_low, stream>>>(x, mask, pos, dst, ncols_x, nrows_y, scale, max_bias, m0, m1, n_head_log2); soft_max_f32<false, 0, 0><<<block_nums, block_dims, shmem_low, stream>>>(x, mask, dst, ncols_x, nrows_y, scale, max_bias, m0, m1, n_head_log2);
} }
} }
void ggml_cuda_op_soft_max(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { void ggml_cuda_op_soft_max(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
const ggml_tensor * src0 = dst->src[0]; const ggml_tensor * src0 = dst->src[0];
const ggml_tensor * src1 = dst->src[1]; const ggml_tensor * src1 = dst->src[1];
const ggml_tensor * src2 = dst->src[2];
const float * src0_d = (const float *)src0->data; const float * src0_d = (const float *)src0->data;
const void * src1_d = src1 ? (const void *)src1->data : nullptr; const void * src1_d = src1 ? (const void *)src1->data : nullptr;
@ -190,7 +189,6 @@ void ggml_cuda_op_soft_max(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
GGML_ASSERT( dst->type == GGML_TYPE_F32); GGML_ASSERT( dst->type == GGML_TYPE_F32);
GGML_ASSERT(!src1 || src1->type == GGML_TYPE_F16 || src1->type == GGML_TYPE_F32); // src1 contains mask and it is optional GGML_ASSERT(!src1 || src1->type == GGML_TYPE_F16 || src1->type == GGML_TYPE_F32); // src1 contains mask and it is optional
GGML_ASSERT(!src2 || src2->type == GGML_TYPE_F16 || src2->type == GGML_TYPE_F32); // src2 contains positions and it is optional
const int64_t ne00 = src0->ne[0]; const int64_t ne00 = src0->ne[0];
const int64_t nrows_x = ggml_nrows(src0); const int64_t nrows_x = ggml_nrows(src0);
@ -202,26 +200,15 @@ void ggml_cuda_op_soft_max(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
memcpy(&scale, (float *) dst->op_params + 0, sizeof(float)); memcpy(&scale, (float *) dst->op_params + 0, sizeof(float));
memcpy(&max_bias, (float *) dst->op_params + 1, sizeof(float)); memcpy(&max_bias, (float *) dst->op_params + 1, sizeof(float));
// positions tensor const bool use_f16 = (src1 && src1->type == GGML_TYPE_F16);
void * src2_d = nullptr;
const bool use_src2 = src2 != nullptr;
if (use_src2) {
src2_d = (void *)src2->data;
}
const bool use_f16 = (src1 && src1->type == GGML_TYPE_F16) || (src2 && src2->type == GGML_TYPE_F16);
if (use_f16) { if (use_f16) {
const half * src1_dd = (const half *)src1_d; const half * src1_dd = (const half *)src1_d;
const half * src2_dd = (const half *)src2_d;
soft_max_f32_cuda(src0_d, src1_dd, src2_dd, dst_d, ne00, nrows_x, nrows_y, scale, max_bias, stream); soft_max_f32_cuda(src0_d, src1_dd, dst_d, ne00, nrows_x, nrows_y, scale, max_bias, stream);
} else { } else {
const float * src1_dd = (const float *)src1_d; const float * src1_dd = (const float *)src1_d;
const float * src2_dd = (const float *)src2_d;
soft_max_f32_cuda(src0_d, src1_dd, src2_dd, dst_d, ne00, nrows_x, nrows_y, scale, max_bias, stream); soft_max_f32_cuda(src0_d, src1_dd, dst_d, ne00, nrows_x, nrows_y, scale, max_bias, stream);
} }
} }
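Note on the fused semantics: with the separate `pos` tensor gone, the kernels above compute softmax(x*scale + slope*mask) per row, with the per-head ALiBi slope folded into the mask term. A minimal C sketch of the per-row reference behaviour (illustrative only, not the CUDA kernel; names are hypothetical):

#include <math.h>

// dst = softmax(x*scale + slope*mask) for a single row of ncols logits.
// mask may be NULL; slope is the per-head ALiBi slope (1.0f when max_bias == 0).
static void soft_max_row_ref(const float * x, const float * mask, float * dst,
                             int ncols, float scale, float slope) {
    float max_val = -INFINITY;
    for (int i = 0; i < ncols; ++i) {
        dst[i] = x[i]*scale + (mask ? slope*mask[i] : 0.0f);
        if (dst[i] > max_val) {
            max_val = dst[i];
        }
    }
    float sum = 0.0f;
    for (int i = 0; i < ncols; ++i) {
        dst[i] = expf(dst[i] - max_val);
        sum += dst[i];
    }
    for (int i = 0; i < ncols; ++i) {
        dst[i] /= sum;
    }
}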

View file

@ -1561,10 +1561,9 @@ static void ggml_vk_graph_compute(struct ggml_kompute_context * ctx, struct ggml
float scale; float scale;
memcpy(&scale, dst->op_params, sizeof(float)); memcpy(&scale, dst->op_params, sizeof(float));
#pragma message("TODO: add ggml_vk_soft_max() F16/F32 src1 and src2 support") #pragma message("TODO: add ggml_vk_soft_max() F16 src1 support")
#pragma message("ref: https://github.com/ggerganov/llama.cpp/pull/5021") #pragma message("ref: https://github.com/ggerganov/llama.cpp/pull/5021")
GGML_ASSERT(!src1 || src1t == GGML_TYPE_F32); GGML_ASSERT(!src1 || src1t == GGML_TYPE_F32);
GGML_ASSERT(src2 == nullptr);
ggml_vk_soft_max(seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst, ne00, ne01, ne02, ne03, scale); ggml_vk_soft_max(seq, id_src0, id_src1, id_dst, off_src0, off_src1, off_dst, ne00, ne01, ne02, ne03, scale);
} break; } break;

View file

@ -169,7 +169,6 @@ enum ggml_metal_kernel_type {
GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ4_XS_F32, GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ4_XS_F32,
GGML_METAL_KERNEL_TYPE_ROPE_F32, GGML_METAL_KERNEL_TYPE_ROPE_F32,
GGML_METAL_KERNEL_TYPE_ROPE_F16, GGML_METAL_KERNEL_TYPE_ROPE_F16,
GGML_METAL_KERNEL_TYPE_ALIBI_F32,
GGML_METAL_KERNEL_TYPE_IM2COL_F16, GGML_METAL_KERNEL_TYPE_IM2COL_F16,
GGML_METAL_KERNEL_TYPE_IM2COL_F32, GGML_METAL_KERNEL_TYPE_IM2COL_F32,
GGML_METAL_KERNEL_TYPE_UPSCALE_F32, GGML_METAL_KERNEL_TYPE_UPSCALE_F32,
@ -623,7 +622,6 @@ static struct ggml_metal_context * ggml_metal_init(int n_cb) {
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ4_XS_F32, mul_mm_id_iq4_xs_f32, ctx->support_simdgroup_mm); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_MUL_MM_ID_IQ4_XS_F32, mul_mm_id_iq4_xs_f32, ctx->support_simdgroup_mm);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ROPE_F32, rope_f32, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ROPE_F32, rope_f32, true);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ROPE_F16, rope_f16, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ROPE_F16, rope_f16, true);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_ALIBI_F32, alibi_f32, true);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_IM2COL_F16, im2col_f16, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_IM2COL_F16, im2col_f16, true);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_IM2COL_F32, im2col_f32, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_IM2COL_F32, im2col_f32, true);
GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_UPSCALE_F32, upscale_f32, true); GGML_METAL_ADD_KERNEL(GGML_METAL_KERNEL_TYPE_UPSCALE_F32, upscale_f32, true);
@ -759,7 +757,6 @@ static bool ggml_metal_supports_op(const struct ggml_metal_context * ctx, const
case GGML_OP_GROUP_NORM: case GGML_OP_GROUP_NORM:
return ctx->support_simdgroup_reduction; return ctx->support_simdgroup_reduction;
case GGML_OP_NORM: case GGML_OP_NORM:
case GGML_OP_ALIBI:
case GGML_OP_ROPE: case GGML_OP_ROPE:
case GGML_OP_IM2COL: case GGML_OP_IM2COL:
return true; return true;
@ -1357,13 +1354,12 @@ static enum ggml_status ggml_metal_graph_compute(
case GGML_OP_SOFT_MAX: case GGML_OP_SOFT_MAX:
{ {
GGML_ASSERT(!src1 || src1->type == GGML_TYPE_F16 || src1->type == GGML_TYPE_F32); GGML_ASSERT(!src1 || src1->type == GGML_TYPE_F16 || src1->type == GGML_TYPE_F32);
GGML_ASSERT(!src2 || src2->type == GGML_TYPE_F16 || src2->type == GGML_TYPE_F32);
int nth = 32; // SIMD width int nth = 32; // SIMD width
id<MTLComputePipelineState> pipeline = nil; id<MTLComputePipelineState> pipeline = nil;
const bool use_f16 = (src1 && src1->type == GGML_TYPE_F16) || (src2 && src2->type == GGML_TYPE_F16); const bool use_f16 = (src1 && src1->type == GGML_TYPE_F16);
if (ne00%4 == 0) { if (ne00%4 == 0) {
while (nth < ne00/4 && nth < 256) { while (nth < ne00/4 && nth < 256) {
@ -1394,8 +1390,8 @@ static enum ggml_status ggml_metal_graph_compute(
const int64_t nrows_x = ggml_nrows(src0); const int64_t nrows_x = ggml_nrows(src0);
const int64_t nrows_y = src0->ne[1]; const int64_t nrows_y = src0->ne[1];
const uint32_t n_head_kv = nrows_x/nrows_y; const uint32_t n_head = nrows_x/nrows_y;
const uint32_t n_head_log2 = 1u << (uint32_t) floorf(log2f((float) n_head_kv)); const uint32_t n_head_log2 = 1u << (uint32_t) floorf(log2f((float) n_head));
const float m0 = powf(2.0f, -(max_bias ) / n_head_log2); const float m0 = powf(2.0f, -(max_bias ) / n_head_log2);
const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_head_log2); const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_head_log2);
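The m0/m1 constants above determine the per-head ALiBi slope that replaces the old GGML_OP_ALIBI pass. A host-side helper equivalent to what each backend now computes inline (illustrative sketch, not a ggml API):

#include <math.h>
#include <stdint.h>

// slope(h) = m0^(h+1)                      for h <  n_head_log2
//          = m1^(2*(h - n_head_log2) + 1)  otherwise
// With max_bias == 0 both bases are 1, so the slope degenerates to 1.0f.
static float alibi_slope(uint32_t h, uint32_t n_head, float max_bias) {
    const uint32_t n_head_log2 = 1u << (uint32_t) floorf(log2f((float) n_head));
    const float m0 = powf(2.0f, -(max_bias       ) / n_head_log2);
    const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_head_log2);
    return h < n_head_log2 ? powf(m0, h + 1) : powf(m1, 2*(h - n_head_log2) + 1);
}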
@ -1407,20 +1403,15 @@ static enum ggml_status ggml_metal_graph_compute(
} else { } else {
[encoder setBuffer:id_src0 offset:offs_src0 atIndex:1]; [encoder setBuffer:id_src0 offset:offs_src0 atIndex:1];
} }
if (id_src2) { [encoder setBuffer:id_dst offset:offs_dst atIndex:2];
[encoder setBuffer:id_src2 offset:offs_src2 atIndex:2]; [encoder setBytes:&ne00 length:sizeof(ne00) atIndex:3];
} else { [encoder setBytes:&ne01 length:sizeof(ne01) atIndex:4];
[encoder setBuffer:id_src0 offset:offs_src0 atIndex:2]; [encoder setBytes:&ne02 length:sizeof(ne02) atIndex:5];
} [encoder setBytes:&scale length:sizeof(scale) atIndex:6];
[encoder setBuffer:id_dst offset:offs_dst atIndex:3]; [encoder setBytes:&max_bias length:sizeof(max_bias) atIndex:7];
[encoder setBytes:&ne00 length:sizeof(ne00) atIndex:4]; [encoder setBytes:&m0 length:sizeof(m0) atIndex:8];
[encoder setBytes:&ne01 length:sizeof(ne01) atIndex:5]; [encoder setBytes:&m1 length:sizeof(m1) atIndex:9];
[encoder setBytes:&ne02 length:sizeof(ne02) atIndex:6]; [encoder setBytes:&n_head_log2 length:sizeof(n_head_log2) atIndex:10];
[encoder setBytes:&scale length:sizeof(scale) atIndex:7];
[encoder setBytes:&max_bias length:sizeof(max_bias) atIndex:8];
[encoder setBytes:&m0 length:sizeof(m0) atIndex:9];
[encoder setBytes:&m1 length:sizeof(m1) atIndex:10];
[encoder setBytes:&n_head_log2 length:sizeof(n_head_log2) atIndex:11];
[encoder setThreadgroupMemoryLength:32*sizeof(float) atIndex:0]; [encoder setThreadgroupMemoryLength:32*sizeof(float) atIndex:0];
[encoder dispatchThreadgroups:MTLSizeMake(ne01*ne02*ne03, 1, 1) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)]; [encoder dispatchThreadgroups:MTLSizeMake(ne01*ne02*ne03, 1, 1) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
@ -2225,49 +2216,6 @@ static enum ggml_status ggml_metal_graph_compute(
[encoder dispatchThreadgroups:MTLSizeMake(nrows, 1, 1) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)]; [encoder dispatchThreadgroups:MTLSizeMake(nrows, 1, 1) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
} break; } break;
case GGML_OP_ALIBI:
{
GGML_ASSERT((src0t == GGML_TYPE_F32));
const int nth = MIN(1024, ne00);
//const int n_past = ((int32_t *) dst->op_params)[0];
const int n_head = ((int32_t *) dst->op_params)[1];
float max_bias;
memcpy(&max_bias, (int32_t *) dst->op_params + 2, sizeof(float));
const int n_heads_log2_floor = 1 << (int) floor(log2(n_head));
const float m0 = powf(2.0f, -(max_bias) / n_heads_log2_floor);
const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_heads_log2_floor);
id<MTLComputePipelineState> pipeline = ctx->kernels[GGML_METAL_KERNEL_TYPE_ALIBI_F32].pipeline;
[encoder setComputePipelineState:pipeline];
[encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
[encoder setBuffer:id_dst offset:offs_dst atIndex:1];
[encoder setBytes:&ne00 length:sizeof( int64_t) atIndex:2];
[encoder setBytes:&ne01 length:sizeof( int64_t) atIndex:3];
[encoder setBytes:&ne02 length:sizeof( int64_t) atIndex:4];
[encoder setBytes:&ne03 length:sizeof( int64_t) atIndex:5];
[encoder setBytes:&nb00 length:sizeof(uint64_t) atIndex:6];
[encoder setBytes:&nb01 length:sizeof(uint64_t) atIndex:7];
[encoder setBytes:&nb02 length:sizeof(uint64_t) atIndex:8];
[encoder setBytes:&nb03 length:sizeof(uint64_t) atIndex:9];
[encoder setBytes:&ne0 length:sizeof( int64_t) atIndex:10];
[encoder setBytes:&ne1 length:sizeof( int64_t) atIndex:11];
[encoder setBytes:&ne2 length:sizeof( int64_t) atIndex:12];
[encoder setBytes:&ne3 length:sizeof( int64_t) atIndex:13];
[encoder setBytes:&nb0 length:sizeof(uint64_t) atIndex:14];
[encoder setBytes:&nb1 length:sizeof(uint64_t) atIndex:15];
[encoder setBytes:&nb2 length:sizeof(uint64_t) atIndex:16];
[encoder setBytes:&nb3 length:sizeof(uint64_t) atIndex:17];
[encoder setBytes:&m0 length:sizeof( float) atIndex:18];
[encoder setBytes:&m1 length:sizeof( float) atIndex:19];
[encoder setBytes:&n_heads_log2_floor length:sizeof(int) atIndex:20];
[encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
} break;
case GGML_OP_ROPE: case GGML_OP_ROPE:
{ {
GGML_ASSERT(ne10 == ne02); GGML_ASSERT(ne10 == ne02);
@ -2565,7 +2513,7 @@ static enum ggml_status ggml_metal_graph_compute(
"the Flash-Attention Metal kernel requires the mask to be padded to 8 and at least n_queries big"); "the Flash-Attention Metal kernel requires the mask to be padded to 8 and at least n_queries big");
const int64_t ne30 = src3 ? src3->ne[0] : 0; GGML_UNUSED(ne30); const int64_t ne30 = src3 ? src3->ne[0] : 0; GGML_UNUSED(ne30);
const int64_t ne31 = src3 ? src3->ne[1] : 0; //const int64_t ne31 = src3 ? src3->ne[1] : 0;
const int64_t ne32 = src3 ? src3->ne[2] : 0; GGML_UNUSED(ne32); const int64_t ne32 = src3 ? src3->ne[2] : 0; GGML_UNUSED(ne32);
const int64_t ne33 = src3 ? src3->ne[3] : 0; GGML_UNUSED(ne33); const int64_t ne33 = src3 ? src3->ne[3] : 0; GGML_UNUSED(ne33);
@ -2577,7 +2525,16 @@ static enum ggml_status ggml_metal_graph_compute(
const enum ggml_type src2t = src2 ? src2->type : GGML_TYPE_COUNT; GGML_UNUSED(src2t); const enum ggml_type src2t = src2 ? src2->type : GGML_TYPE_COUNT; GGML_UNUSED(src2t);
float scale; float scale;
memcpy(&scale, dst->op_params, sizeof(float)); float max_bias;
memcpy(&scale, ((int32_t *) dst->op_params) + 0, sizeof(scale));
memcpy(&max_bias, ((int32_t *) dst->op_params) + 1, sizeof(max_bias));
const uint32_t n_head = src0->ne[2];
const uint32_t n_head_log2 = 1u << (uint32_t) floorf(log2f((float) n_head));
const float m0 = powf(2.0f, -(max_bias ) / n_head_log2);
const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_head_log2);
id<MTLComputePipelineState> pipeline = nil; id<MTLComputePipelineState> pipeline = nil;
@ -2635,13 +2592,16 @@ static enum ggml_status ggml_metal_graph_compute(
[encoder setBytes:&nb11 length:sizeof(uint64_t) atIndex:18]; [encoder setBytes:&nb11 length:sizeof(uint64_t) atIndex:18];
[encoder setBytes:&nb12 length:sizeof(uint64_t) atIndex:19]; [encoder setBytes:&nb12 length:sizeof(uint64_t) atIndex:19];
[encoder setBytes:&nb13 length:sizeof(uint64_t) atIndex:20]; [encoder setBytes:&nb13 length:sizeof(uint64_t) atIndex:20];
[encoder setBytes:&ne31 length:sizeof( int64_t) atIndex:21]; [encoder setBytes:&nb31 length:sizeof(uint64_t) atIndex:21];
[encoder setBytes:&nb31 length:sizeof(uint64_t) atIndex:22]; [encoder setBytes:&ne0 length:sizeof( int64_t) atIndex:22];
[encoder setBytes:&ne0 length:sizeof( int64_t) atIndex:23]; [encoder setBytes:&ne1 length:sizeof( int64_t) atIndex:23];
[encoder setBytes:&ne1 length:sizeof( int64_t) atIndex:24]; [encoder setBytes:&ne2 length:sizeof( int64_t) atIndex:24];
[encoder setBytes:&ne2 length:sizeof( int64_t) atIndex:25]; [encoder setBytes:&ne3 length:sizeof( int64_t) atIndex:25];
[encoder setBytes:&ne3 length:sizeof( int64_t) atIndex:26]; [encoder setBytes:&scale length:sizeof( float) atIndex:26];
[encoder setBytes:&scale length:sizeof( float) atIndex:27]; [encoder setBytes:&max_bias length:sizeof( float) atIndex:27];
[encoder setBytes:&m0 length:sizeof(m0) atIndex:28];
[encoder setBytes:&m1 length:sizeof(m1) atIndex:29];
[encoder setBytes:&n_head_log2 length:sizeof(n_head_log2) atIndex:30];
if (!use_vec_kernel) { if (!use_vec_kernel) {
// half8x8 kernel // half8x8 kernel

View file

@ -356,7 +356,6 @@ template<typename T>
kernel void kernel_soft_max( kernel void kernel_soft_max(
device const char * src0, device const char * src0,
device const char * src1, device const char * src1,
device const char * src2,
device char * dst, device char * dst,
constant int64_t & ne00, constant int64_t & ne00,
constant int64_t & ne01, constant int64_t & ne01,
@ -378,10 +377,9 @@ kernel void kernel_soft_max(
device const float * psrc0 = (device const float *) src0 + (i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00); device const float * psrc0 = (device const float *) src0 + (i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00);
device const T * pmask = src1 != src0 ? (device const T *) src1 + i01*ne00 : nullptr; device const T * pmask = src1 != src0 ? (device const T *) src1 + i01*ne00 : nullptr;
device const T * ppos = src2 != src0 ? (device const T *) src2 : nullptr;
device float * pdst = (device float *) dst + (i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00); device float * pdst = (device float *) dst + (i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00);
float slope = 0.0f; float slope = 1.0f;
// ALiBi // ALiBi
if (max_bias > 0.0f) { if (max_bias > 0.0f) {
@ -397,7 +395,7 @@ kernel void kernel_soft_max(
float lmax = -INFINITY; float lmax = -INFINITY;
for (int i00 = tpitg; i00 < ne00; i00 += ntg) { for (int i00 = tpitg; i00 < ne00; i00 += ntg) {
lmax = MAX(lmax, psrc0[i00]*scale + (pmask ? pmask[i00] : 0.0f) + (ppos ? slope*ppos[i00] : 0.0f)); lmax = MAX(lmax, psrc0[i00]*scale + (pmask ? slope*pmask[i00] : 0.0f));
} }
// find the max value in the block // find the max value in the block
@ -422,7 +420,7 @@ kernel void kernel_soft_max(
// parallel sum // parallel sum
float lsum = 0.0f; float lsum = 0.0f;
for (int i00 = tpitg; i00 < ne00; i00 += ntg) { for (int i00 = tpitg; i00 < ne00; i00 += ntg) {
const float exp_psrc0 = exp((psrc0[i00]*scale + (pmask ? pmask[i00] : 0.0f) + (ppos ? slope*ppos[i00] : 0.0f)) - max_val); const float exp_psrc0 = exp((psrc0[i00]*scale + (pmask ? slope*pmask[i00] : 0.0f)) - max_val);
lsum += exp_psrc0; lsum += exp_psrc0;
pdst[i00] = exp_psrc0; pdst[i00] = exp_psrc0;
} }
@ -461,7 +459,6 @@ template<typename T>
kernel void kernel_soft_max_4( kernel void kernel_soft_max_4(
device const char * src0, device const char * src0,
device const char * src1, device const char * src1,
device const char * src2,
device char * dst, device char * dst,
constant int64_t & ne00, constant int64_t & ne00,
constant int64_t & ne01, constant int64_t & ne01,
@ -483,10 +480,9 @@ kernel void kernel_soft_max_4(
device const float4 * psrc4 = (device const float4 *) src0 + (i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00)/4; device const float4 * psrc4 = (device const float4 *) src0 + (i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00)/4;
device const T * pmask = src1 != src0 ? (device const T *) src1 + i01*ne00/4 : nullptr; device const T * pmask = src1 != src0 ? (device const T *) src1 + i01*ne00/4 : nullptr;
device const T * ppos = src2 != src0 ? (device const T *) src2 : nullptr;
device float4 * pdst4 = (device float4 *) dst + (i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00)/4; device float4 * pdst4 = (device float4 *) dst + (i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00)/4;
float slope = 0.0f; float slope = 1.0f;
if (max_bias > 0.0f) { if (max_bias > 0.0f) {
const int64_t h = i02; const int64_t h = i02;
@ -501,7 +497,7 @@ kernel void kernel_soft_max_4(
float4 lmax4 = -INFINITY; float4 lmax4 = -INFINITY;
for (int i00 = tpitg; i00 < ne00/4; i00 += ntg) { for (int i00 = tpitg; i00 < ne00/4; i00 += ntg) {
lmax4 = fmax(lmax4, psrc4[i00]*scale + (float4)((pmask ? pmask[i00] : 0.0f) + (ppos ? slope*ppos[i00] : 0.0f))); lmax4 = fmax(lmax4, psrc4[i00]*scale + (float4)((pmask ? slope*pmask[i00] : 0.0f)));
} }
const float lmax = MAX(MAX(lmax4[0], lmax4[1]), MAX(lmax4[2], lmax4[3])); const float lmax = MAX(MAX(lmax4[0], lmax4[1]), MAX(lmax4[2], lmax4[3]));
@ -527,7 +523,7 @@ kernel void kernel_soft_max_4(
// parallel sum // parallel sum
float4 lsum4 = 0.0f; float4 lsum4 = 0.0f;
for (int i00 = tpitg; i00 < ne00/4; i00 += ntg) { for (int i00 = tpitg; i00 < ne00/4; i00 += ntg) {
const float4 exp_psrc4 = exp((psrc4[i00]*scale + (float4)((pmask ? pmask[i00] : 0.0f) + (ppos ? slope*ppos[i00] : 0.0f))) - max_val); const float4 exp_psrc4 = exp((psrc4[i00]*scale + (float4)((pmask ? slope*pmask[i00] : 0.0f))) - max_val);
lsum4 += exp_psrc4; lsum4 += exp_psrc4;
pdst4[i00] = exp_psrc4; pdst4[i00] = exp_psrc4;
} }
@ -1595,60 +1591,6 @@ kernel void kernel_mul_mv_f16_f32_l4(
} }
} }
kernel void kernel_alibi_f32(
device const float * src0,
device float * dst,
constant int64_t & ne00,
constant int64_t & ne01,
constant int64_t & ne02,
constant int64_t & ne03,
constant uint64_t & nb00,
constant uint64_t & nb01,
constant uint64_t & nb02,
constant uint64_t & nb03,
constant int64_t & ne0,
constant int64_t & ne1,
constant int64_t & ne2,
constant int64_t & ne3,
constant uint64_t & nb0,
constant uint64_t & nb1,
constant uint64_t & nb2,
constant uint64_t & nb3,
constant float & m0,
constant float & m1,
constant int & n_heads_log2_floor,
uint3 tgpig[[threadgroup_position_in_grid]],
uint3 tpitg[[thread_position_in_threadgroup]],
uint3 ntg[[threads_per_threadgroup]]) {
const int64_t i03 = tgpig[2];
const int64_t i02 = tgpig[1];
const int64_t i01 = tgpig[0];
const int64_t n = i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00;
const int64_t i3 = n / (ne2*ne1*ne0);
const int64_t i2 = (n - i3*ne2*ne1*ne0) / (ne1*ne0);
const int64_t i1 = (n - i3*ne2*ne1*ne0 - i2*ne1*ne0) / ne0;
//const int64_t i0 = (n - i3*ne2*ne1*ne0 - i2*ne1*ne0 - i1*ne0);
const int64_t k = i3*ne3 + i2;
float m_k;
if (k < n_heads_log2_floor) {
m_k = pow(m0, k + 1);
} else {
m_k = pow(m1, 2 * (k - n_heads_log2_floor) + 1);
}
device char * dst_row = (device char *) dst + i3*nb3 + i2*nb2 + i1*nb1;
device const char * src_row = (device char *) src0 + i03*nb03 + i02*nb02 + i01*nb01;
for (int64_t i00 = tpitg.x; i00 < ne00; i00 += ntg.x) {
const float src_v = *(device float *)(src_row + i00*nb00);
device float * dst_v = (device float *)(dst_row + i00*nb0);
*dst_v = i00 * m_k + src_v;
}
}
static float rope_yarn_ramp(const float low, const float high, const int i0) { static float rope_yarn_ramp(const float low, const float high, const int i0) {
const float y = (i0 / 2 - low) / max(0.001f, high - low); const float y = (i0 / 2 - low) / max(0.001f, high - low);
return 1.0f - min(1.0f, max(0.0f, y)); return 1.0f - min(1.0f, max(0.0f, y));
@ -2116,13 +2058,16 @@ typedef void (flash_attn_ext_f16_t)(
constant uint64_t & nb11, constant uint64_t & nb11,
constant uint64_t & nb12, constant uint64_t & nb12,
constant uint64_t & nb13, constant uint64_t & nb13,
constant int64_t & ne31,
constant uint64_t & nb31, constant uint64_t & nb31,
constant int64_t & ne0, constant int64_t & ne0,
constant int64_t & ne1, constant int64_t & ne1,
constant int64_t & ne2, constant int64_t & ne2,
constant int64_t & ne3, constant int64_t & ne3,
constant float & scale, constant float & scale,
constant float & max_bias,
constant float & m0,
constant float & m1,
constant uint32_t & n_head_log2,
threadgroup half * shared, threadgroup half * shared,
uint3 tgpig[[threadgroup_position_in_grid]], uint3 tgpig[[threadgroup_position_in_grid]],
uint3 tpitg[[thread_position_in_threadgroup]], uint3 tpitg[[thread_position_in_threadgroup]],
@ -2154,13 +2099,16 @@ kernel void kernel_flash_attn_ext_f16(
constant uint64_t & nb11, constant uint64_t & nb11,
constant uint64_t & nb12, constant uint64_t & nb12,
constant uint64_t & nb13, constant uint64_t & nb13,
constant int64_t & ne31,
constant uint64_t & nb31, constant uint64_t & nb31,
constant int64_t & ne0, constant int64_t & ne0,
constant int64_t & ne1, constant int64_t & ne1,
constant int64_t & ne2, constant int64_t & ne2,
constant int64_t & ne3, constant int64_t & ne3,
constant float & scale, constant float & scale,
constant float & max_bias,
constant float & m0,
constant float & m1,
constant uint32_t & n_head_log2,
threadgroup half * shared [[threadgroup(0)]], threadgroup half * shared [[threadgroup(0)]],
uint3 tgpig[[threadgroup_position_in_grid]], uint3 tgpig[[threadgroup_position_in_grid]],
uint3 tpitg[[thread_position_in_threadgroup]], uint3 tpitg[[thread_position_in_threadgroup]],
@ -2257,6 +2205,19 @@ kernel void kernel_flash_attn_ext_f16(
// prepare diagonal scale matrix // prepare diagonal scale matrix
simdgroup_float8x8 mscale(scale); simdgroup_float8x8 mscale(scale);
// prepare diagonal slope matrix
simdgroup_float8x8 mslope(1.0f);
// ALiBi
if (max_bias > 0.0f) {
const short h = iq2;
const float base = h < n_head_log2 ? m0 : m1;
const int exph = h < n_head_log2 ? h + 1 : 2*(h - n_head_log2) + 1;
mslope = simdgroup_float8x8(pow(base, exph));
}
// loop over the KV cache // loop over the KV cache
// each simdgroup handles blocks of Q rows and C columns // each simdgroup handles blocks of Q rows and C columns
for (int ic0 = 0; ic0 < ne11; ic0 += C*nsg) { for (int ic0 = 0; ic0 < ne11; ic0 += C*nsg) {
@ -2279,9 +2240,10 @@ kernel void kernel_flash_attn_ext_f16(
simdgroup_multiply_accumulate(mqk, mq[i], mk, mqk); simdgroup_multiply_accumulate(mqk, mq[i], mk, mqk);
} }
// mqk = mqk*scale + mask // mqk = mqk*scale + mask*slope
simdgroup_half8x8 mm; simdgroup_half8x8 mm;
simdgroup_load(mm, mp + ic + 8*cc, nb31/sizeof(half), 0, false); simdgroup_load(mm, mp + ic + 8*cc, nb31/sizeof(half), 0, false);
simdgroup_multiply(mm, mslope, mm);
simdgroup_multiply_accumulate(mqk, mqk, mscale, mm); simdgroup_multiply_accumulate(mqk, mqk, mscale, mm);
simdgroup_store(mqk, ss + 8*cc, TF, 0, false); simdgroup_store(mqk, ss + 8*cc, TF, 0, false);
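In equation form, the simdgroup update above now evaluates, per 8x8 tile,

    mqk = scale*(Q*K^T) + slope(h)*M

where M is the mask tile and slope(h) is the per-head ALiBi factor (1.0 when max_bias == 0); the diagonal mslope matrix is what carries the slope into the existing multiply-accumulate.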
@ -2472,13 +2434,16 @@ kernel void kernel_flash_attn_ext_vec_f16(
constant uint64_t & nb11, constant uint64_t & nb11,
constant uint64_t & nb12, constant uint64_t & nb12,
constant uint64_t & nb13, constant uint64_t & nb13,
constant int64_t & ne31,
constant uint64_t & nb31, constant uint64_t & nb31,
constant int64_t & ne0, constant int64_t & ne0,
constant int64_t & ne1, constant int64_t & ne1,
constant int64_t & ne2, constant int64_t & ne2,
constant int64_t & ne3, constant int64_t & ne3,
constant float & scale, constant float & scale,
constant float & max_bias,
constant float & m0,
constant float & m1,
constant uint32_t & n_head_log2,
threadgroup half * shared [[threadgroup(0)]], threadgroup half * shared [[threadgroup(0)]],
uint3 tgpig[[threadgroup_position_in_grid]], uint3 tgpig[[threadgroup_position_in_grid]],
uint3 tpitg[[thread_position_in_threadgroup]], uint3 tpitg[[thread_position_in_threadgroup]],
@ -2497,6 +2462,18 @@ kernel void kernel_flash_attn_ext_vec_f16(
const short T = D + 2*nsg*SH; // shared memory size per query in (half) const short T = D + 2*nsg*SH; // shared memory size per query in (half)
float slope = 1.0f;
// ALiBi
if (max_bias > 0.0f) {
const short h = iq2;
const float base = h < n_head_log2 ? m0 : m1;
const int exp = h < n_head_log2 ? h + 1 : 2*(h - n_head_log2) + 1;
slope = pow(base, exp);
}
//threadgroup half * sq = (threadgroup half *) (shared + 0*D); // holds the query data //threadgroup half * sq = (threadgroup half *) (shared + 0*D); // holds the query data
threadgroup half4 * sq4 = (threadgroup half4 *) (shared + 0*D); // same as above but in half4 threadgroup half4 * sq4 = (threadgroup half4 *) (shared + 0*D); // same as above but in half4
threadgroup float * ss = (threadgroup float *) (shared + 2*sgitg*SH + 1*D); // scratch buffer for attention and diagonal matrix threadgroup float * ss = (threadgroup float *) (shared + 2*sgitg*SH + 1*D); // scratch buffer for attention and diagonal matrix
@ -2603,10 +2580,10 @@ kernel void kernel_flash_attn_ext_vec_f16(
mqk += simd_shuffle_down(mqk, 2); mqk += simd_shuffle_down(mqk, 2);
mqk += simd_shuffle_down(mqk, 1); mqk += simd_shuffle_down(mqk, 1);
// mqk = mqk*scale + mask // mqk = mqk*scale + mask*slope
if (tiisg == 0) { if (tiisg == 0) {
float4 mm = (float4) mp4[ic/4 + cc]; float4 mm = (float4) mp4[ic/4 + cc];
mqk = mqk*scale + mm; mqk = mqk*scale + mm*slope;
ss4[cc] = mqk; ss4[cc] = mqk;
} }
@ -2840,7 +2817,8 @@ kernel void kernel_cpy_f32_f16(
for (int64_t i00 = tpitg.x; i00 < ne00; i00 += ntg.x) { for (int64_t i00 = tpitg.x; i00 < ne00; i00 += ntg.x) {
device const float * src = (device float *)((device char *) src0 + i03*nb03 + i02*nb02 + i01*nb01 + i00*nb00); device const float * src = (device float *)((device char *) src0 + i03*nb03 + i02*nb02 + i01*nb01 + i00*nb00);
dst_data[i00] = src[0]; // TODO: is there a better way to handle -INFINITY?
dst_data[i00] = src[0] == -INFINITY ? -MAXHALF : src[0];
} }
} }
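The F32->F16 copy above clamps -INFINITY mask entries to -MAXHALF (the largest finite half magnitude, 65504); a plausible reading of the TODO is that keeping -INFINITY would risk NaN once the mask participates in the diagonal slope matrix multiply (0 * -INFINITY). A standalone illustration of the clamp itself (hypothetical values):

#include <math.h>
#include <stdio.h>

int main(void) {
    const float src[3] = { 0.0f, -12.5f, -INFINITY };
    for (int i = 0; i < 3; ++i) {
        // mirror the copy kernel: map -INFINITY to the most negative finite half
        const float v = src[i] == -INFINITY ? -65504.0f : src[i];
        printf("%g -> %g\n", src[i], v);
    }
    return 0;
}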

View file

@ -2119,6 +2119,7 @@ static size_t ggml_backend_opencl_buffer_type_get_alignment(ggml_backend_buffer_
if (alignment == (cl_uint)-1) { if (alignment == (cl_uint)-1) {
ggml_cl_init(); ggml_cl_init();
clGetDeviceInfo(device, CL_DEVICE_MEM_BASE_ADDR_ALIGN, sizeof(cl_uint), &alignment, NULL); clGetDeviceInfo(device, CL_DEVICE_MEM_BASE_ADDR_ALIGN, sizeof(cl_uint), &alignment, NULL);
alignment /= 8; // bits to bytes
} }
return alignment; return alignment;
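CL_DEVICE_MEM_BASE_ADDR_ALIGN is specified in bits, so the added division converts the queried value to the byte alignment the buffer type is expected to report. A tiny standalone illustration (the device value is hypothetical):

#include <stdio.h>

int main(void) {
    unsigned int alignment = 1024; // e.g. value reported for CL_DEVICE_MEM_BASE_ADDR_ALIGN, in bits
    alignment /= 8;                // bits to bytes, as in the patch above
    printf("alignment = %u bytes\n", alignment); // -> 128
    return 0;
}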

View file

@ -3154,7 +3154,6 @@ typedef float (*vec_dot_q_mul_mat_sycl_t)(
#define SYCL_SCALE_BLOCK_SIZE 256 #define SYCL_SCALE_BLOCK_SIZE 256
#define SYCL_CLAMP_BLOCK_SIZE 256 #define SYCL_CLAMP_BLOCK_SIZE 256
#define SYCL_ROPE_BLOCK_SIZE 256 #define SYCL_ROPE_BLOCK_SIZE 256
#define SYCL_ALIBI_BLOCK_SIZE 32
#define SYCL_DIAG_MASK_INF_BLOCK_SIZE 32 #define SYCL_DIAG_MASK_INF_BLOCK_SIZE 32
#define SYCL_QUANTIZE_BLOCK_SIZE 256 #define SYCL_QUANTIZE_BLOCK_SIZE 256
#define SYCL_DEQUANTIZE_BLOCK_SIZE 256 #define SYCL_DEQUANTIZE_BLOCK_SIZE 256
@ -8330,13 +8329,15 @@ static void mul_mat_vec_q(const void * __restrict__ vx, const void * __restrict_
const int blocks_per_row = ncols / qk; const int blocks_per_row = ncols / qk;
const int blocks_per_warp = vdr * WARP_SIZE / qi; const int blocks_per_warp = vdr * WARP_SIZE / qi;
const int qi_vdr = (qi / vdr); // N_threads processing 1 qk block
// partial sum for each thread // partial sum for each thread
float tmp = 0.0f; float tmp = 0.0f;
const block_q_t * x = (const block_q_t *) vx; const block_q_t * x = (const block_q_t *) vx;
const block_q8_1 * y = (const block_q8_1 *) vy; const block_q8_1 * y = (const block_q8_1 *) vy;
for (int i = item_ct1.get_local_id(2) / (qi / vdr); i < blocks_per_row; for (int i = item_ct1.get_local_id(2) / qi_vdr; i < blocks_per_row;
i += blocks_per_warp) { i += blocks_per_warp) {
const int ibx = row * blocks_per_row + i; // x block index const int ibx = row * blocks_per_row + i; // x block index
@ -8344,8 +8345,8 @@ static void mul_mat_vec_q(const void * __restrict__ vx, const void * __restrict_
const int iqs = const int iqs =
vdr * vdr *
(item_ct1.get_local_id(2) % (item_ct1.get_local_id(2) -
(qi / vdr)); // x block quant index when casting the quants to int i * qi_vdr); // x block quant index when casting the quants to int
tmp += vec_dot_q_sycl(&x[ibx], &y[iby], iqs); tmp += vec_dot_q_sycl(&x[ibx], &y[iby], iqs);
} }
@ -9314,32 +9315,6 @@ static void rope_glm_f32(
dst[i + half_n_dims * 3] = x2*sin_block_theta + x3*cos_block_theta; dst[i + half_n_dims * 3] = x2*sin_block_theta + x3*cos_block_theta;
} }
static void alibi_f32(const float * x, float * dst, const int ncols, const int k_rows,
const int n_heads_log2_floor, const float m0, const float m1,
const sycl::nd_item<3> &item_ct1) {
const int col = item_ct1.get_local_range(2) * item_ct1.get_group(2) +
item_ct1.get_local_id(2);
if (col >= ncols) {
return;
}
const int row = item_ct1.get_local_range(1) * item_ct1.get_group(1) +
item_ct1.get_local_id(1);
const int i = row*ncols + col;
const int k = row/k_rows;
float m_k;
if (k < n_heads_log2_floor) {
m_k = dpct::pow(m0, k + 1);
} else {
m_k = dpct::pow(m1, 2 * (k - n_heads_log2_floor) + 1);
}
dst[i] = col * m_k + x[i];
}
static void k_sum_rows_f32(const float * x, float * dst, const int ncols, static void k_sum_rows_f32(const float * x, float * dst, const int ncols,
const sycl::nd_item<3> &item_ct1) { const sycl::nd_item<3> &item_ct1) {
const int row = item_ct1.get_group(1); const int row = item_ct1.get_group(1);
@ -9441,7 +9416,7 @@ static void diag_mask_inf_f32(const float * x, float * dst, const int ncols, con
template <bool vals_smem, int ncols_template, int block_size_template> template <bool vals_smem, int ncols_template, int block_size_template>
static void soft_max_f32(const float * x, const float * mask, const float *pos, float * dst, const int ncols_par, static void soft_max_f32(const float * x, const float * mask, float * dst, const int ncols_par,
const int nrows_y, const float scale, const float max_bias, const float m0, const int nrows_y, const float scale, const float max_bias, const float m0,
const float m1, uint32_t n_head_log2, const sycl::nd_item<3> &item_ct1, float *buf) { const float m1, uint32_t n_head_log2, const sycl::nd_item<3> &item_ct1, float *buf) {
const int ncols = ncols_template == 0 ? ncols_par : ncols_template; const int ncols = ncols_template == 0 ? ncols_par : ncols_template;
@ -9455,7 +9430,7 @@ static void soft_max_f32(const float * x, const float * mask, const float *pos,
const int warp_id = item_ct1.get_local_id(2) / WARP_SIZE; const int warp_id = item_ct1.get_local_id(2) / WARP_SIZE;
const int lane_id = item_ct1.get_local_id(2) % WARP_SIZE; const int lane_id = item_ct1.get_local_id(2) % WARP_SIZE;
float slope = 0.0f; float slope = 1.0f;
// ALiBi // ALiBi
if (max_bias > 0.0f) { if (max_bias > 0.0f) {
@ -9480,7 +9455,7 @@ static void soft_max_f32(const float * x, const float * mask, const float *pos,
const int ix = rowx*ncols + col; const int ix = rowx*ncols + col;
const int iy = rowy*ncols + col; const int iy = rowy*ncols + col;
const float val = x[ix]*scale + (mask ? mask[iy] : 0.0f) + (pos ? slope*pos[col] : 0.0f); const float val = x[ix]*scale + (mask ? slope*mask[iy] : 0.0f);
vals[col] = val; vals[col] = val;
max_val = sycl::max(max_val, val); max_val = sycl::max(max_val, val);
@ -12962,20 +12937,6 @@ static void rope_glm_f32_sycl(const float *x, float *dst, int ncols, int nrows,
}); });
} }
static void alibi_f32_sycl(const float *x, float *dst, const int ncols,
const int nrows, const int k_rows,
const int n_heads_log2_floor, const float m0,
const float m1, dpct::queue_ptr stream) {
const sycl::range<3> block_dims(1, 1, SYCL_ALIBI_BLOCK_SIZE);
const int num_blocks_x = (ncols + SYCL_ALIBI_BLOCK_SIZE - 1) / (SYCL_ALIBI_BLOCK_SIZE);
const sycl::range<3> block_nums(1, nrows, num_blocks_x);
stream->parallel_for(sycl::nd_range<3>(block_nums * block_dims, block_dims),
[=](sycl::nd_item<3> item_ct1) {
alibi_f32(x, dst, ncols, k_rows,
n_heads_log2_floor, m0, m1, item_ct1);
});
}
static void sum_rows_f32_sycl(const float *x, float *dst, const int ncols, static void sum_rows_f32_sycl(const float *x, float *dst, const int ncols,
const int nrows, dpct::queue_ptr stream) { const int nrows, dpct::queue_ptr stream) {
const sycl::range<3> block_dims(1, 1, WARP_SIZE); const sycl::range<3> block_dims(1, 1, WARP_SIZE);
@ -13056,7 +13017,7 @@ static void diag_mask_inf_f32_sycl(const float *x, float *dst,
} }
template <bool vals_smem, int ncols_template, int block_size_template> template <bool vals_smem, int ncols_template, int block_size_template>
static void soft_max_f32_submitter(const float * x, const float * mask, const float *pos, float * dst, const int ncols_par, static void soft_max_f32_submitter(const float * x, const float * mask, float * dst, const int ncols_par,
const int nrows_y, const float scale, const float max_bias, const float m0, const int nrows_y, const float scale, const float max_bias, const float m0,
const float m1, uint32_t n_head_log2, sycl::range<3> block_nums, sycl::range<3> block_dims, const float m1, uint32_t n_head_log2, sycl::range<3> block_nums, sycl::range<3> block_dims,
const size_t n_local_scratch, dpct::queue_ptr stream) { const size_t n_local_scratch, dpct::queue_ptr stream) {
@ -13066,7 +13027,7 @@ static void soft_max_f32_submitter(const float * x, const float * mask, const fl
cgh.parallel_for( cgh.parallel_for(
sycl::nd_range<3>(block_nums * block_dims, block_dims), sycl::nd_range<3>(block_nums * block_dims, block_dims),
[=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] { [=](sycl::nd_item<3> item_ct1) [[intel::reqd_sub_group_size(32)]] {
soft_max_f32<vals_smem, ncols_template, block_size_template>(x, mask, pos, dst, ncols_par, soft_max_f32<vals_smem, ncols_template, block_size_template>(x, mask, dst, ncols_par,
nrows_y, scale, max_bias, m0, nrows_y, scale, max_bias, m0,
m1, n_head_log2, item_ct1, m1, n_head_log2, item_ct1,
local_buf_acc.get_pointer()); local_buf_acc.get_pointer());
@ -13074,7 +13035,7 @@ static void soft_max_f32_submitter(const float * x, const float * mask, const fl
}); });
} }
static void soft_max_f32_sycl(const float * x, const float * mask, const float * pos, static void soft_max_f32_sycl(const float * x, const float * mask,
float * dst, const int ncols_x, const int nrows_x, float * dst, const int ncols_x, const int nrows_x,
const int nrows_y, const float scale, const float max_bias, const int nrows_y, const float scale, const float max_bias,
dpct::queue_ptr stream) { dpct::queue_ptr stream) {
@ -13096,60 +13057,60 @@ static void soft_max_f32_sycl(const float * x, const float * mask, const float *
const size_t local_mem_size = stream->get_device().get_info<sycl::info::device::local_mem_size>(); const size_t local_mem_size = stream->get_device().get_info<sycl::info::device::local_mem_size>();
if (n_local_scratch*sizeof(float) < local_mem_size) { if (n_local_scratch*sizeof(float) < local_mem_size) {
if (ncols_x > max_block_size) { if (ncols_x > max_block_size) {
soft_max_f32_submitter<true, 0, 0>(x, mask, pos, dst, ncols_x, nrows_y, scale, soft_max_f32_submitter<true, 0, 0>(x, mask, dst, ncols_x, nrows_y, scale,
max_bias, m0, m1, n_head_log2, block_nums, max_bias, m0, m1, n_head_log2, block_nums,
block_dims, n_local_scratch, stream); block_dims, n_local_scratch, stream);
return; return;
} }
switch (ncols_x) { switch (ncols_x) {
case 32: case 32:
soft_max_f32_submitter<true, 32, 32>(x, mask, pos, dst, ncols_x, nrows_y, scale, soft_max_f32_submitter<true, 32, 32>(x, mask, dst, ncols_x, nrows_y, scale,
max_bias, m0, m1, n_head_log2, block_nums, max_bias, m0, m1, n_head_log2, block_nums,
block_dims, n_local_scratch, stream); block_dims, n_local_scratch, stream);
break; break;
case 64: case 64:
soft_max_f32_submitter<true, 64, 64>(x, mask, pos, dst, ncols_x, nrows_y, scale, soft_max_f32_submitter<true, 64, 64>(x, mask, dst, ncols_x, nrows_y, scale,
max_bias, m0, m1, n_head_log2, block_nums, max_bias, m0, m1, n_head_log2, block_nums,
block_dims, n_local_scratch, stream); block_dims, n_local_scratch, stream);
break; break;
case 128: case 128:
soft_max_f32_submitter<true, 128, 128>(x, mask, pos, dst, ncols_x, nrows_y, scale, soft_max_f32_submitter<true, 128, 128>(x, mask, dst, ncols_x, nrows_y, scale,
max_bias, m0, m1, n_head_log2, block_nums, max_bias, m0, m1, n_head_log2, block_nums,
block_dims, n_local_scratch, stream); block_dims, n_local_scratch, stream);
break; break;
case 256: case 256:
soft_max_f32_submitter<true, 256, 256>(x, mask, pos, dst, ncols_x, nrows_y, scale, soft_max_f32_submitter<true, 256, 256>(x, mask, dst, ncols_x, nrows_y, scale,
max_bias, m0, m1, n_head_log2, block_nums, max_bias, m0, m1, n_head_log2, block_nums,
block_dims, n_local_scratch, stream); block_dims, n_local_scratch, stream);
break; break;
case 512: case 512:
soft_max_f32_submitter<true, 512, 512>(x, mask, pos, dst, ncols_x, nrows_y, scale, soft_max_f32_submitter<true, 512, 512>(x, mask, dst, ncols_x, nrows_y, scale,
max_bias, m0, m1, n_head_log2, block_nums, max_bias, m0, m1, n_head_log2, block_nums,
block_dims, n_local_scratch, stream); block_dims, n_local_scratch, stream);
break; break;
case 1024: case 1024:
soft_max_f32_submitter<true, 1024, 1024>(x, mask, pos, dst, ncols_x, nrows_y, scale, soft_max_f32_submitter<true, 1024, 1024>(x, mask, dst, ncols_x, nrows_y, scale,
max_bias, m0, m1, n_head_log2, block_nums, max_bias, m0, m1, n_head_log2, block_nums,
block_dims, n_local_scratch, stream); block_dims, n_local_scratch, stream);
break; break;
case 2048: case 2048:
soft_max_f32_submitter<true, 2048, 1024>(x, mask, pos, dst, ncols_x, nrows_y, scale, soft_max_f32_submitter<true, 2048, 1024>(x, mask, dst, ncols_x, nrows_y, scale,
max_bias, m0, m1, n_head_log2, block_nums, max_bias, m0, m1, n_head_log2, block_nums,
block_dims, n_local_scratch, stream); block_dims, n_local_scratch, stream);
break; break;
case 4096: case 4096:
soft_max_f32_submitter<true, 4096, 1024>(x, mask, pos, dst, ncols_x, nrows_y, scale, soft_max_f32_submitter<true, 4096, 1024>(x, mask, dst, ncols_x, nrows_y, scale,
max_bias, m0, m1, n_head_log2, block_nums, max_bias, m0, m1, n_head_log2, block_nums,
block_dims, n_local_scratch, stream); block_dims, n_local_scratch, stream);
break; break;
default: default:
soft_max_f32_submitter<true, 0, 0>(x, mask, pos, dst, ncols_x, nrows_y, scale, soft_max_f32_submitter<true, 0, 0>(x, mask, dst, ncols_x, nrows_y, scale,
max_bias, m0, m1, n_head_log2, block_nums, max_bias, m0, m1, n_head_log2, block_nums,
block_dims, n_local_scratch, stream); block_dims, n_local_scratch, stream);
break; break;
} }
} else { } else {
soft_max_f32_submitter<false, 0, 0>(x, mask, pos, dst, ncols_x, nrows_y, scale, soft_max_f32_submitter<false, 0, 0>(x, mask, dst, ncols_x, nrows_y, scale,
max_bias, m0, m1, n_head_log2, block_nums, max_bias, m0, m1, n_head_log2, block_nums,
block_dims, WARP_SIZE, stream); block_dims, WARP_SIZE, stream);
} }
@ -14560,36 +14521,6 @@ inline void ggml_sycl_op_rope(const ggml_tensor *src0, const ggml_tensor *src1,
(void) src1_dd; (void) src1_dd;
} }
inline void ggml_sycl_op_alibi(const ggml_tensor *src0, const ggml_tensor *src1,
ggml_tensor *dst, const float *src0_dd,
const float *src1_dd, float *dst_dd,
const dpct::queue_ptr &main_stream) {
GGML_ASSERT(src0->type == GGML_TYPE_F32);
GGML_ASSERT( dst->type == GGML_TYPE_F32);
GGML_TENSOR_LOCALS_3(int64_t, ne0, src0, ne);
const int64_t nrows = ggml_nrows(src0);
//const int n_past = ((int32_t *) dst->op_params)[0];
const int n_head = ((int32_t *) dst->op_params)[1];
float max_bias;
memcpy(&max_bias, (int32_t *) dst->op_params + 2, sizeof(float));
//GGML_ASSERT(ne01 + n_past == ne00);
GGML_ASSERT(n_head == ne02);
const int n_heads_log2_floor = 1 << (int) floor(log2(n_head));
const float m0 = powf(2.0f, -(max_bias) / n_heads_log2_floor);
const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_heads_log2_floor);
alibi_f32_sycl(src0_dd, dst_dd, ne00, nrows, ne01, n_heads_log2_floor, m0, m1, main_stream);
(void) src1;
(void) src1_dd;
}
static void ggml_sycl_op_pool2d(const ggml_tensor *src0, static void ggml_sycl_op_pool2d(const ggml_tensor *src0,
const ggml_tensor *src1, ggml_tensor *dst, const ggml_tensor *src1, ggml_tensor *dst,
const float *src0_dd, const float *src1_dd, const float *src0_dd, const float *src1_dd,
@ -14744,12 +14675,9 @@ inline void ggml_sycl_op_soft_max(const ggml_tensor *src0,
GGML_ASSERT(src0->type == GGML_TYPE_F32); GGML_ASSERT(src0->type == GGML_TYPE_F32);
GGML_ASSERT( dst->type == GGML_TYPE_F32); GGML_ASSERT( dst->type == GGML_TYPE_F32);
const ggml_tensor * src2 = dst->src[2]; #pragma message("TODO: add ggml_sycl_op_soft_max() F16 src1 support")
#pragma message("TODO: add ggml_sycl_op_soft_max() F16 src1 and src2 support")
#pragma message("ref: https://github.com/ggerganov/llama.cpp/pull/5021") #pragma message("ref: https://github.com/ggerganov/llama.cpp/pull/5021")
GGML_ASSERT(!src1 || src1->type == GGML_TYPE_F32); // src1 contains mask and it is optional GGML_ASSERT(!src1 || src1->type == GGML_TYPE_F32); // src1 contains mask and it is optional
GGML_ASSERT(!src2 || src2->type == GGML_TYPE_F32); // src2 contains positions and it is optional
const int64_t ne00 = src0->ne[0]; const int64_t ne00 = src0->ne[0];
const int64_t nrows_x = ggml_nrows(src0); const int64_t nrows_x = ggml_nrows(src0);
@ -14761,25 +14689,7 @@ inline void ggml_sycl_op_soft_max(const ggml_tensor *src0,
memcpy(&scale, dst->op_params + 0, sizeof(float)); memcpy(&scale, dst->op_params + 0, sizeof(float));
memcpy(&max_bias, dst->op_params + 1, sizeof(float)); memcpy(&max_bias, dst->op_params + 1, sizeof(float));
// positions tensor soft_max_f32_sycl(src0_dd, src1 ? src1_dd : nullptr, dst_dd, ne00,
float * src2_dd = nullptr;
sycl_pool_alloc<float> src2_f;
const bool use_src2 = src2 != nullptr;
if (use_src2) {
const bool src2_on_device = src2->backend == GGML_BACKEND_TYPE_GPU;
if (src2_on_device) {
ggml_tensor_extra_gpu * src2_extra = (ggml_tensor_extra_gpu *) src2->extra;
src2_dd = (float *) src2_extra->data_device[g_main_device];
} else {
src2_dd = src2_f.alloc(ggml_nelements(src2));
SYCL_CHECK(ggml_sycl_cpy_tensor_2d(src2_dd, src2, 0, 0, 0, 1, main_stream));
}
}
soft_max_f32_sycl(src0_dd, src1 ? src1_dd : nullptr, src2_dd, dst_dd, ne00,
nrows_x, nrows_y, scale, max_bias, main_stream); nrows_x, nrows_y, scale, max_bias, main_stream);
} }
@ -16230,10 +16140,6 @@ static void ggml_sycl_rope(const ggml_tensor * src0, const ggml_tensor * src1, g
ggml_sycl_op_flatten(src0, src1, dst, ggml_sycl_op_rope); ggml_sycl_op_flatten(src0, src1, dst, ggml_sycl_op_rope);
} }
static void ggml_sycl_alibi(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
ggml_sycl_op_flatten(src0, src1, dst, ggml_sycl_op_alibi);
}
static void ggml_sycl_pool2d(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { static void ggml_sycl_pool2d(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
ggml_sycl_op_flatten(src0, src1, dst, ggml_sycl_op_pool2d); ggml_sycl_op_flatten(src0, src1, dst, ggml_sycl_op_pool2d);
} }
@ -16610,9 +16516,6 @@ bool ggml_sycl_compute_forward(struct ggml_compute_params * params, struct ggml_
case GGML_OP_ROPE: case GGML_OP_ROPE:
func = ggml_sycl_rope; func = ggml_sycl_rope;
break; break;
case GGML_OP_ALIBI:
func = ggml_sycl_alibi;
break;
case GGML_OP_IM2COL: case GGML_OP_IM2COL:
func = ggml_sycl_im2col; func = ggml_sycl_im2col;
break; break;
@ -17742,7 +17645,6 @@ GGML_CALL static bool ggml_backend_sycl_supports_op(ggml_backend_t backend, cons
case GGML_OP_DIAG_MASK_INF: case GGML_OP_DIAG_MASK_INF:
case GGML_OP_SOFT_MAX: case GGML_OP_SOFT_MAX:
case GGML_OP_ROPE: case GGML_OP_ROPE:
case GGML_OP_ALIBI:
case GGML_OP_IM2COL: case GGML_OP_IM2COL:
case GGML_OP_POOL_2D: case GGML_OP_POOL_2D:
case GGML_OP_SUM_ROWS: case GGML_OP_SUM_ROWS:

File diff suppressed because it is too large

File diff suppressed because it is too large

303
ggml.c
View file

@ -2185,7 +2185,6 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
"SOFT_MAX_BACK", "SOFT_MAX_BACK",
"ROPE", "ROPE",
"ROPE_BACK", "ROPE_BACK",
"ALIBI",
"CLAMP", "CLAMP",
"CONV_TRANSPOSE_1D", "CONV_TRANSPOSE_1D",
"IM2COL", "IM2COL",
@ -2227,7 +2226,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
"CROSS_ENTROPY_LOSS_BACK", "CROSS_ENTROPY_LOSS_BACK",
}; };
static_assert(GGML_OP_COUNT == 77, "GGML_OP_COUNT != 77"); static_assert(GGML_OP_COUNT == 76, "GGML_OP_COUNT != 76");
static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = { static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
"none", "none",
@ -2276,7 +2275,6 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
"soft_max_back(x)", "soft_max_back(x)",
"rope(x)", "rope(x)",
"rope_back(x)", "rope_back(x)",
"alibi(x)",
"clamp(x)", "clamp(x)",
"conv_transpose_1d(x)", "conv_transpose_1d(x)",
"im2col(x)", "im2col(x)",
@ -2318,7 +2316,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
"cross_entropy_loss_back(x,y)", "cross_entropy_loss_back(x,y)",
}; };
static_assert(GGML_OP_COUNT == 77, "GGML_OP_COUNT != 77"); static_assert(GGML_OP_COUNT == 76, "GGML_OP_COUNT != 76");
static_assert(GGML_OP_POOL_COUNT == 2, "GGML_OP_POOL_COUNT != 2"); static_assert(GGML_OP_POOL_COUNT == 2, "GGML_OP_POOL_COUNT != 2");
@ -5646,7 +5644,6 @@ static struct ggml_tensor * ggml_soft_max_impl(
struct ggml_context * ctx, struct ggml_context * ctx,
struct ggml_tensor * a, struct ggml_tensor * a,
struct ggml_tensor * mask, struct ggml_tensor * mask,
struct ggml_tensor * pos,
float scale, float scale,
float max_bias, float max_bias,
bool inplace) { bool inplace) {
@ -5660,20 +5657,6 @@ static struct ggml_tensor * ggml_soft_max_impl(
GGML_ASSERT(mask->ne[1] >= a->ne[1]); GGML_ASSERT(mask->ne[1] >= a->ne[1]);
} }
if (pos) {
GGML_ASSERT(ggml_is_vector(pos) || ggml_is_matrix(pos));
GGML_ASSERT(pos->type == GGML_TYPE_F16 || pos->type == GGML_TYPE_F32);
GGML_ASSERT(pos->ne[0] == a->ne[0]);
}
if (pos && mask) {
GGML_ASSERT(pos->type == mask->type);
}
if (max_bias > 0.0f) {
GGML_ASSERT(pos);
}
bool is_node = false; bool is_node = false;
if (a->grad) { if (a->grad) {
@ -5689,7 +5672,6 @@ static struct ggml_tensor * ggml_soft_max_impl(
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
result->src[0] = a; result->src[0] = a;
result->src[1] = mask; result->src[1] = mask;
result->src[2] = pos;
return result; return result;
} }
@ -5697,23 +5679,22 @@ static struct ggml_tensor * ggml_soft_max_impl(
struct ggml_tensor * ggml_soft_max( struct ggml_tensor * ggml_soft_max(
struct ggml_context * ctx, struct ggml_context * ctx,
struct ggml_tensor * a) { struct ggml_tensor * a) {
return ggml_soft_max_impl(ctx, a, NULL, NULL, 1.0f, 0.0f, false); return ggml_soft_max_impl(ctx, a, NULL, 1.0f, 0.0f, false);
} }
struct ggml_tensor * ggml_soft_max_inplace( struct ggml_tensor * ggml_soft_max_inplace(
struct ggml_context * ctx, struct ggml_context * ctx,
struct ggml_tensor * a) { struct ggml_tensor * a) {
return ggml_soft_max_impl(ctx, a, NULL, NULL, 1.0f, 0.0f, true); return ggml_soft_max_impl(ctx, a, NULL, 1.0f, 0.0f, true);
} }
struct ggml_tensor * ggml_soft_max_ext( struct ggml_tensor * ggml_soft_max_ext(
struct ggml_context * ctx, struct ggml_context * ctx,
struct ggml_tensor * a, struct ggml_tensor * a,
struct ggml_tensor * mask, struct ggml_tensor * mask,
struct ggml_tensor * pos,
float scale, float scale,
float max_bias) { float max_bias) {
return ggml_soft_max_impl(ctx, a, mask, pos, scale, max_bias, false); return ggml_soft_max_impl(ctx, a, mask, scale, max_bias, false);
} }
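With the pos argument removed, a caller that wants ALiBi passes only max_bias and the slopes are derived inside the op. A minimal usage sketch (hypothetical helper; assumes a valid ggml context and score/mask tensors):

#include "ggml.h"
#include <math.h>

// kq:      raw attention scores [n_kv, n_tokens, n_head]
// kq_mask: optional additive mask, may be NULL
static struct ggml_tensor * attn_probs(struct ggml_context * ctx,
                                       struct ggml_tensor  * kq,
                                       struct ggml_tensor  * kq_mask,
                                       int n_embd_head, float max_bias) {
    const float kq_scale = 1.0f/sqrtf((float) n_embd_head);
    // max_bias == 0.0f disables ALiBi; > 0.0f applies slope*mask per head
    return ggml_soft_max_ext(ctx, kq, kq_mask, kq_scale, max_bias);
}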
// ggml_soft_max_back // ggml_soft_max_back
@ -5928,37 +5909,6 @@ struct ggml_tensor * ggml_rope_back(
return result; return result;
} }
// ggml_alibi
struct ggml_tensor * ggml_alibi(
struct ggml_context * ctx,
struct ggml_tensor * a,
int n_past,
int n_head,
float bias_max) {
GGML_ASSERT(n_past >= 0);
bool is_node = false;
if (a->grad) {
GGML_ASSERT(false); // TODO: implement backward
is_node = true;
}
// TODO: when implement backward, fix this:
//struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
struct ggml_tensor * result = ggml_view_tensor(ctx, a);
int32_t op_params[3] = { n_past, n_head };
memcpy(op_params + 2, &bias_max, sizeof(float));
ggml_set_op_params(result, op_params, sizeof(op_params));
result->op = GGML_OP_ALIBI;
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
result->src[0] = a;
return result;
}
// ggml_clamp // ggml_clamp
struct ggml_tensor * ggml_clamp( struct ggml_tensor * ggml_clamp(
@ -6486,7 +6436,8 @@ struct ggml_tensor * ggml_flash_attn_ext(
struct ggml_tensor * k, struct ggml_tensor * k,
struct ggml_tensor * v, struct ggml_tensor * v,
struct ggml_tensor * mask, struct ggml_tensor * mask,
float scale) { float scale,
float max_bias) {
GGML_ASSERT(ggml_can_mul_mat(k, q)); GGML_ASSERT(ggml_can_mul_mat(k, q));
// TODO: check if vT can be multiplied by (k*qT) // TODO: check if vT can be multiplied by (k*qT)
if (mask) { if (mask) {
@ -6508,7 +6459,7 @@ struct ggml_tensor * ggml_flash_attn_ext(
int64_t ne[4] = { q->ne[0], q->ne[2], q->ne[1], q->ne[3] }; int64_t ne[4] = { q->ne[0], q->ne[2], q->ne[1], q->ne[3] };
struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne); struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne);
float params[] = { scale }; float params[] = { scale, max_bias };
ggml_set_op_params(result, params, sizeof(params)); ggml_set_op_params(result, params, sizeof(params));
result->op = GGML_OP_FLASH_ATTN_EXT; result->op = GGML_OP_FLASH_ATTN_EXT;
@ -6528,7 +6479,7 @@ void ggml_flash_attn_ext_set_prec(
const int32_t prec_i32 = (int32_t) prec; const int32_t prec_i32 = (int32_t) prec;
ggml_set_op_params_i32(a, 1, prec_i32); // scale is on first pos ggml_set_op_params_i32(a, 2, prec_i32); // scale is on first pos, max_bias on second
} }
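After this change the FLASH_ATTN_EXT op parameters are laid out as scale, max_bias, prec, which is why set_prec now writes slot 2. A packing sketch mirroring the two functions above (hypothetical helper, not part of ggml):

#include <string.h>
#include <stdint.h>

// op_params slots for GGML_OP_FLASH_ATTN_EXT:
//   [0] scale (float), [1] max_bias (float), [2] prec (int32, ggml_flash_attn_ext_set_prec)
static void pack_flash_attn_ext_params(int32_t op_params[3], float scale, float max_bias, int32_t prec) {
    memcpy(&op_params[0], &scale,    sizeof(float)); // store the float bit patterns in the int32 slots
    memcpy(&op_params[1], &max_bias, sizeof(float));
    op_params[2] = prec;
}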
// ggml_flash_ff // ggml_flash_ff
@ -13333,7 +13284,6 @@ static void ggml_compute_forward_soft_max_f32(
const struct ggml_tensor * src0 = dst->src[0]; const struct ggml_tensor * src0 = dst->src[0];
const struct ggml_tensor * src1 = dst->src[1]; const struct ggml_tensor * src1 = dst->src[1];
const struct ggml_tensor * src2 = dst->src[2];
assert(ggml_is_contiguous(dst)); assert(ggml_is_contiguous(dst));
assert(ggml_are_same_shape(src0, dst)); assert(ggml_are_same_shape(src0, dst));
@ -13359,8 +13309,8 @@ static void ggml_compute_forward_soft_max_f32(
// TODO: is this supposed to be ceil instead of floor? // TODO: is this supposed to be ceil instead of floor?
// https://huggingface.co/mosaicml/mpt-7b/blob/main/attention.py#L370 // https://huggingface.co/mosaicml/mpt-7b/blob/main/attention.py#L370
const uint32_t n_head_kv = ne02; const uint32_t n_head = ne02;
const uint32_t n_head_log2 = 1u << (uint32_t) floor(log2(n_head_kv)); const uint32_t n_head_log2 = 1u << (uint32_t) floor(log2(n_head));
const float m0 = powf(2.0f, -(max_bias ) / n_head_log2); const float m0 = powf(2.0f, -(max_bias ) / n_head_log2);
const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_head_log2); const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_head_log2);
@ -13377,14 +13327,13 @@ static void ggml_compute_forward_soft_max_f32(
float * wp = (float *) params->wdata + (nc + CACHE_LINE_SIZE_F32) * ith; float * wp = (float *) params->wdata + (nc + CACHE_LINE_SIZE_F32) * ith;
// when max_bias <= 0.0f, src2 is not used and we default it to src0 to avoid branching const bool use_f16 = (src1 && src1->type == GGML_TYPE_F16);
const bool is_pos_matrix = src2 ? ggml_is_matrix(src2): false;
ggml_fp16_t * pos_f16 = src2 ? (ggml_fp16_t *) src2->data : src0->data;
float * pos_f32 = src2 ? (float *) src2->data : src0->data;
const bool use_f16 = (src1 && src1->type == GGML_TYPE_F16) || (src2 && src2->type == GGML_TYPE_F16);
for (int i1 = ir0; i1 < ir1; i1++) { for (int i1 = ir0; i1 < ir1; i1++) {
// ALiBi
const uint32_t h = (i1/ne01)%ne02; // head
const float slope = (max_bias > 0.0f) ? h < n_head_log2 ? powf(m0, h + 1) : powf(m1, 2*(h - n_head_log2) + 1) : 1.0f;
float * sp = (float *)((char *) src0->data + i1*src0->nb[1]); float * sp = (float *)((char *) src0->data + i1*src0->nb[1]);
float * dp = (float *)((char *) dst->data + i1*dst->nb[1]); float * dp = (float *)((char *) dst->data + i1*dst->nb[1]);
@ -13397,27 +13346,11 @@ static void ggml_compute_forward_soft_max_f32(
if (mp_f32) { if (mp_f32) {
if (use_f16) { if (use_f16) {
for (int i = 0; i < nc; ++i) { for (int i = 0; i < nc; ++i) {
wp[i] += GGML_FP16_TO_FP32(mp_f16[i]); wp[i] += slope*GGML_FP16_TO_FP32(mp_f16[i]);
} }
} else { } else {
for (int i = 0; i < nc; ++i) { for (int i = 0; i < nc; ++i) {
wp[i] += mp_f32[i]; wp[i] += slope*mp_f32[i];
}
}
}
// ALiBi bias
if (max_bias > 0.0f) {
const uint32_t h = (i1/ne01)%ne02; // head
const float slope = h < n_head_log2 ? powf(m0, h + 1) : powf(m1, 2*(h - n_head_log2) + 1);
if (use_f16) {
for (int i = 0; i < nc; ++i) {
wp[i] += slope*GGML_FP16_TO_FP32(pos_f16[is_pos_matrix ? i1%nc * nc + i: i]);
}
} else {
for (int i = 0; i < nc; ++i) {
wp[i] += slope*pos_f32[is_pos_matrix ? i1%nc * nc + i: i];
} }
} }
} }
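A quick worked example of the inlined slope above: with max_bias = 8.0f and n_head = 32, n_head_log2 = 32 and m0 = 2^(-8/32) = 2^-0.25 ≈ 0.8409, so every head takes the first branch: head 0 gets m0^1 ≈ 0.841, head 7 gets m0^8 = 2^-2 = 0.25, and head 31 gets m0^32 = 2^-8 ≈ 0.0039. With max_bias = 0.0f the expression short-circuits to 1.0f and the masked soft-max is unchanged.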
@ -13579,178 +13512,6 @@ static void ggml_compute_forward_soft_max_back(
} }
} }
// ggml_compute_forward_alibi
static void ggml_compute_forward_alibi_f32(
const struct ggml_compute_params * params,
struct ggml_tensor * dst) {
const struct ggml_tensor * src0 = dst->src[0];
assert(params->ith == 0);
if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
return;
}
//const int n_past = ((int32_t *) dst->op_params)[0];
const int n_head = ((int32_t *) dst->op_params)[1];
float max_bias;
memcpy(&max_bias, (int32_t *) dst->op_params + 2, sizeof(float));
const int64_t ne0 = src0->ne[0]; // all_seq_len = n_past + ne1
const int64_t ne1 = src0->ne[1]; // seq_len_without_past
const int64_t ne2 = src0->ne[2]; // n_head -> this is k
//const int64_t ne3 = src0->ne[3]; // 1 -> bsz
const int64_t n = ggml_nrows(src0);
const int64_t ne2_ne3 = n/ne1; // ne2*ne3
const size_t nb0 = src0->nb[0];
const size_t nb1 = src0->nb[1];
const size_t nb2 = src0->nb[2];
//const int nb3 = src0->nb[3];
GGML_ASSERT(nb0 == sizeof(float));
GGML_ASSERT(n_head == ne2);
// add alibi to src0 (KQ_scaled)
const int n_heads_log2_floor = 1 << (int) floor(log2(n_head));
const float m0 = powf(2.0f, -(max_bias) / n_heads_log2_floor);
const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_heads_log2_floor);
for (int64_t k = 0; k < ne2_ne3; k++) {
// TODO: k*nb2 or k*nb3
float m_k;
if (k < n_heads_log2_floor) {
m_k = powf(m0, k + 1);
} else {
m_k = powf(m1, 2 * (k - n_heads_log2_floor) + 1);
}
for (int64_t i = 0; i < ne0; i++) {
for (int64_t j = 0; j < ne1; j++) {
float * const src = (float *)((char *) src0->data + i*nb0 + j*nb1 + k*nb2);
float * pdst = (float *)((char *) dst->data + i*nb0 + j*nb1 + k*nb2);
pdst[0] = i * m_k + src[0];
}
}
}
}
static void ggml_compute_forward_alibi_f16(
const struct ggml_compute_params * params,
struct ggml_tensor * dst) {
const struct ggml_tensor * src0 = dst->src[0];
assert(params->ith == 0);
if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) {
return;
}
//const int n_past = ((int32_t *) dst->op_params)[0];
const int n_head = ((int32_t *) dst->op_params)[1];
float max_bias;
memcpy(&max_bias, (int32_t *) dst->op_params + 2, sizeof(float));
const int ne0 = src0->ne[0]; // all_seq_len = n_past + ne1
const int ne1 = src0->ne[1]; // seq_len_without_past
const int ne2 = src0->ne[2]; // n_head -> this is k
//const int ne3 = src0->ne[3]; // 1 -> bsz
const int n = ggml_nrows(src0);
const int ne2_ne3 = n/ne1; // ne2*ne3
const int nb0 = src0->nb[0];
const int nb1 = src0->nb[1];
const int nb2 = src0->nb[2];
//const int nb3 = src0->nb[3];
GGML_ASSERT(nb0 == sizeof(ggml_fp16_t));
//GGML_ASSERT(ne1 + n_past == ne0); (void) n_past;
GGML_ASSERT(n_head == ne2);
// add alibi to src0 (KQ_scaled)
const int n_heads_log2_floor = 1 << (int) floor(log2(n_head));
const float m0 = powf(2.0f, -(max_bias) / n_heads_log2_floor);
const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_heads_log2_floor);
for (int k = 0; k < ne2_ne3; k++) {
// TODO: k*nb2 or k*nb3
float m_k;
if (k < n_heads_log2_floor) {
m_k = powf(m0, k + 1);
} else {
m_k = powf(m1, 2 * (k - n_heads_log2_floor) + 1);
}
for (int i = 0; i < ne0; i++) {
for (int j = 0; j < ne1; j++) {
ggml_fp16_t * const src = (ggml_fp16_t *)((char *) src0->data + i*nb0 + j*nb1 + k*nb2);
float * pdst = (float *)((char *) dst->data + i*nb0 + j*nb1 + k*nb2);
// we return F32
pdst[0] = i * m_k + GGML_FP16_TO_FP32(src[0]);
}
}
}
}
static void ggml_compute_forward_alibi(
const struct ggml_compute_params * params,
struct ggml_tensor * dst) {
const struct ggml_tensor * src0 = dst->src[0];
switch (src0->type) {
case GGML_TYPE_F16:
{
ggml_compute_forward_alibi_f16(params, dst);
} break;
case GGML_TYPE_F32:
{
ggml_compute_forward_alibi_f32(params, dst);
} break;
case GGML_TYPE_BF16:
case GGML_TYPE_Q4_0:
case GGML_TYPE_Q4_1:
case GGML_TYPE_Q5_0:
case GGML_TYPE_Q5_1:
case GGML_TYPE_Q8_0:
case GGML_TYPE_Q8_1:
case GGML_TYPE_Q2_K:
case GGML_TYPE_Q3_K:
case GGML_TYPE_Q4_K:
case GGML_TYPE_Q5_K:
case GGML_TYPE_Q6_K:
case GGML_TYPE_IQ2_XXS:
case GGML_TYPE_IQ2_XS:
case GGML_TYPE_IQ3_XXS:
case GGML_TYPE_IQ1_S:
case GGML_TYPE_IQ1_M:
case GGML_TYPE_IQ4_NL:
case GGML_TYPE_IQ4_XS:
case GGML_TYPE_IQ3_S:
case GGML_TYPE_IQ2_S:
case GGML_TYPE_Q8_K:
case GGML_TYPE_I8:
case GGML_TYPE_I16:
case GGML_TYPE_I32:
case GGML_TYPE_I64:
case GGML_TYPE_F64:
case GGML_TYPE_COUNT:
{
GGML_ASSERT(false);
} break;
}
}
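The deleted op above added a per-head bias of i*m_k directly to the scaled KQ matrix. A small sketch of the bias it produced, using the same m_k schedule (illustrative only; the column index i is the KV position):

    import math

    def alibi_bias_rows(ne0: int, n_head: int, max_bias: float) -> list[list[float]]:
        # per-head bias over column index i; the removed kernel computed pdst[0] = i*m_k + src[0]
        n_heads_log2_floor = 1 << int(math.floor(math.log2(n_head)))
        m0 = 2.0 ** (-max_bias / n_heads_log2_floor)
        m1 = 2.0 ** (-(max_bias / 2.0) / n_heads_log2_floor)
        rows = []
        for k in range(n_head):
            m_k = m0 ** (k + 1) if k < n_heads_log2_floor else m1 ** (2 * (k - n_heads_log2_floor) + 1)
            rows.append([i * m_k for i in range(ne0)])
        return rows

The same effect is now obtained by passing max_bias to ggml_soft_max_ext and letting the mask carry the position differences.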
// ggml_compute_forward_clamp // ggml_compute_forward_clamp
static void ggml_compute_forward_clamp_f32( static void ggml_compute_forward_clamp_f32(
@ -15765,7 +15526,16 @@ static void ggml_compute_forward_flash_attn_ext_f16(
const int ir1 = MIN(ir0 + dr, nr); const int ir1 = MIN(ir0 + dr, nr);
float scale = 1.0f; float scale = 1.0f;
float max_bias = 0.0f;
memcpy(&scale, (float *) dst->op_params + 0, sizeof(float)); memcpy(&scale, (float *) dst->op_params + 0, sizeof(float));
memcpy(&max_bias, (float *) dst->op_params + 1, sizeof(float));
const uint32_t n_head = neq2;
const uint32_t n_head_log2 = 1u << (uint32_t) floor(log2(n_head));
const float m0 = powf(2.0f, -(max_bias ) / n_head_log2);
const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_head_log2);
// loop over n_batch and n_head // loop over n_batch and n_head
for (int ir = ir0; ir < ir1; ++ir) { for (int ir = ir0; ir < ir1; ++ir) {
@ -15774,6 +15544,9 @@ static void ggml_compute_forward_flash_attn_ext_f16(
const int iq2 = (ir - iq3*neq2*neq1)/neq1; const int iq2 = (ir - iq3*neq2*neq1)/neq1;
const int iq1 = (ir - iq3*neq2*neq1 - iq2*neq1); const int iq1 = (ir - iq3*neq2*neq1 - iq2*neq1);
const uint32_t h = iq2; // head
const float slope = (max_bias > 0.0f) ? h < n_head_log2 ? powf(m0, h + 1) : powf(m1, 2*(h - n_head_log2) + 1) : 1.0f;
float S = 0.0f; float S = 0.0f;
float M = -INFINITY; float M = -INFINITY;
@ -15797,7 +15570,7 @@ static void ggml_compute_forward_flash_attn_ext_f16(
// loop over n_kv and n_head_kv // loop over n_kv and n_head_kv
// ref: https://arxiv.org/pdf/2112.05682.pdf // ref: https://arxiv.org/pdf/2112.05682.pdf
for (int64_t ic = 0; ic < nek1; ++ic) { for (int64_t ic = 0; ic < nek1; ++ic) {
const float mv = mp ? GGML_FP16_TO_FP32(mp[ic]) : 0.0f; const float mv = mp ? slope*GGML_FP16_TO_FP32(mp[ic]) : 0.0f;
if (mv == -INFINITY) { if (mv == -INFINITY) {
continue; continue;
} }
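The loop above is the streaming ("online") softmax accumulation from the referenced paper, now with the mask value scaled by the per-head ALiBi slope. A scalar Python sketch of the same idea, assuming at least one unmasked KV entry:

    import math

    def attend_row(q, K, V, mask, scale, slope):
        S, M = 0.0, float("-inf")
        acc = [0.0] * len(V[0])
        for k_vec, v_vec, m in zip(K, V, mask):
            mv = slope * m
            if mv == float("-inf"):
                continue                      # fully masked entry, skip
            s = scale * sum(a * b for a, b in zip(q, k_vec)) + mv
            M_new = max(M, s)
            ms = math.exp(M - M_new)          # rescale the previous accumulator
            vs = math.exp(s - M_new)
            S = S * ms + vs
            acc = [a * ms + v * vs for a, v in zip(acc, v_vec)]
            M = M_new
        return [a / S for a in acc]           # softmax(scale*qK^T + slope*mask) @ V for one query row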
@ -15868,7 +15641,7 @@ static void ggml_compute_forward_flash_attn_ext(
const struct ggml_tensor * v, const struct ggml_tensor * v,
const struct ggml_tensor * mask, const struct ggml_tensor * mask,
struct ggml_tensor * dst) { struct ggml_tensor * dst) {
switch (dst->op_params[1]) { switch (dst->op_params[2]) {
case GGML_PREC_DEFAULT: case GGML_PREC_DEFAULT:
case GGML_PREC_F32: case GGML_PREC_F32:
{ {
@ -17631,10 +17404,6 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
{ {
ggml_compute_forward_rope_back(params, tensor); ggml_compute_forward_rope_back(params, tensor);
} break; } break;
case GGML_OP_ALIBI:
{
ggml_compute_forward_alibi(params, tensor);
} break;
case GGML_OP_CLAMP: case GGML_OP_CLAMP:
{ {
ggml_compute_forward_clamp(params, tensor); ggml_compute_forward_clamp(params, tensor);
@ -18653,10 +18422,6 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
zero_table); zero_table);
} }
} break; } break;
case GGML_OP_ALIBI:
{
GGML_ASSERT(false); // TODO: not implemented
} break;
case GGML_OP_CLAMP: case GGML_OP_CLAMP:
{ {
GGML_ASSERT(false); // TODO: not implemented GGML_ASSERT(false); // TODO: not implemented
@ -19429,10 +19194,6 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads, int n_cur_
{ {
n_tasks = n_threads; n_tasks = n_threads;
} break; } break;
case GGML_OP_ALIBI:
{
n_tasks = 1; //TODO
} break;
case GGML_OP_CLAMP: case GGML_OP_CLAMP:
{ {
n_tasks = 1; //TODO n_tasks = 1; //TODO

ggml.h (18 changed lines)
@ -468,7 +468,6 @@ extern "C" {
GGML_OP_SOFT_MAX_BACK, GGML_OP_SOFT_MAX_BACK,
GGML_OP_ROPE, GGML_OP_ROPE,
GGML_OP_ROPE_BACK, GGML_OP_ROPE_BACK,
GGML_OP_ALIBI,
GGML_OP_CLAMP, GGML_OP_CLAMP,
GGML_OP_CONV_TRANSPOSE_1D, GGML_OP_CONV_TRANSPOSE_1D,
GGML_OP_IM2COL, GGML_OP_IM2COL,
@ -1428,15 +1427,13 @@ extern "C" {
struct ggml_context * ctx, struct ggml_context * ctx,
struct ggml_tensor * a); struct ggml_tensor * a);
// fused soft_max(a*scale + mask + pos[i]*(ALiBi slope)) // fused soft_max(a*scale + mask*(ALiBi slope))
// mask is optional // mask is optional
// pos is required when max_bias > 0.0f
// max_bias = 0.0f for no ALiBi // max_bias = 0.0f for no ALiBi
GGML_API struct ggml_tensor * ggml_soft_max_ext( GGML_API struct ggml_tensor * ggml_soft_max_ext(
struct ggml_context * ctx, struct ggml_context * ctx,
struct ggml_tensor * a, struct ggml_tensor * a,
struct ggml_tensor * mask, struct ggml_tensor * mask,
struct ggml_tensor * pos,
float scale, float scale,
float max_bias); float max_bias);
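With the pos tensor gone, the fused operator now computes softmax(a*scale + mask*slope) per row, where slope is derived from max_bias as shown earlier. A reference sketch of the row-wise semantics (not the ggml implementation), assuming at least one finite entry per row:

    import math

    def soft_max_ext_row(a, scale, mask=None, slope=1.0):
        if mask is None:
            mask = [0.0] * len(a)
        z = [scale * x + slope * m for x, m in zip(a, mask)]
        mx = max(z)
        e = [math.exp(v - mx) for v in z]
        s = sum(e)
        return [v / s for v in e]

max_bias = 0.0f makes every slope equal to 1, which is why non-ALiBi models are unaffected by the signature change.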
@ -1538,16 +1535,6 @@ extern "C" {
float xpos_base, float xpos_base,
bool xpos_down); bool xpos_down);
// alibi position embedding
// in-place, returns view(a)
GGML_DEPRECATED(GGML_API struct ggml_tensor * ggml_alibi(
struct ggml_context * ctx,
struct ggml_tensor * a,
int n_past,
int n_head,
float bias_max),
"use ggml_soft_max_ext instead (will be removed in Mar 2024)");
// clamp // clamp
// in-place, returns view(a) // in-place, returns view(a)
GGML_API struct ggml_tensor * ggml_clamp( GGML_API struct ggml_tensor * ggml_clamp(
@ -1744,7 +1731,8 @@ extern "C" {
struct ggml_tensor * k, struct ggml_tensor * k,
struct ggml_tensor * v, struct ggml_tensor * v,
struct ggml_tensor * mask, struct ggml_tensor * mask,
float scale); float scale,
float max_bias);
GGML_API void ggml_flash_attn_ext_set_prec( GGML_API void ggml_flash_attn_ext_set_prec(
struct ggml_tensor * a, struct ggml_tensor * a,

File diff suppressed because it is too large.

gguf-py/scripts/gguf-new-metadata.py (92 changed lines; mode changed from normal file to executable)
@ -7,7 +7,8 @@ import json
from pathlib import Path from pathlib import Path
import numpy as np import numpy as np
from typing import Any, Sequence from tqdm import tqdm
from typing import Any, Sequence, NamedTuple
# Necessary to load the local gguf package # Necessary to load the local gguf package
if "NO_LOCAL_GGUF" not in os.environ and (Path(__file__).parent.parent.parent / 'gguf-py').exists(): if "NO_LOCAL_GGUF" not in os.environ and (Path(__file__).parent.parent.parent / 'gguf-py').exists():
@ -18,6 +19,12 @@ import gguf
logger = logging.getLogger("gguf-new-metadata") logger = logging.getLogger("gguf-new-metadata")
class MetadataDetails(NamedTuple):
type: gguf.GGUFValueType
value: Any
description: str = ''
def get_byteorder(reader: gguf.GGUFReader) -> gguf.GGUFEndian: def get_byteorder(reader: gguf.GGUFReader) -> gguf.GGUFEndian:
if np.uint32(1) == np.uint32(1).newbyteorder("<"): if np.uint32(1) == np.uint32(1).newbyteorder("<"):
# Host is little endian # Host is little endian
@ -59,7 +66,16 @@ def get_field_data(reader: gguf.GGUFReader, key: str) -> Any:
return decode_field(field) return decode_field(field)
def copy_with_new_metadata(reader: gguf.GGUFReader, writer: gguf.GGUFWriter, new_metadata: dict[str, str], remove_metadata: Sequence[str]) -> None: def find_token(token_list: Sequence[int], token: str) -> Sequence[int]:
token_ids = [index for index, value in enumerate(token_list) if value == token]
if len(token_ids) == 0:
raise LookupError(f'Unable to find "{token}" in token list!')
return token_ids
def copy_with_new_metadata(reader: gguf.GGUFReader, writer: gguf.GGUFWriter, new_metadata: dict[str, MetadataDetails], remove_metadata: Sequence[str]) -> None:
for field in reader.fields.values(): for field in reader.fields.values():
# Suppress virtual fields and fields written by GGUFWriter # Suppress virtual fields and fields written by GGUFWriter
if field.name == gguf.Keys.General.ARCHITECTURE or field.name.startswith('GGUF.'): if field.name == gguf.Keys.General.ARCHITECTURE or field.name.startswith('GGUF.'):
@ -75,54 +91,64 @@ def copy_with_new_metadata(reader: gguf.GGUFReader, writer: gguf.GGUFWriter, new
logger.debug(f'Removing {field.name}') logger.debug(f'Removing {field.name}')
continue continue
old_val = decode_field(field) old_val = MetadataDetails(field.types[0], decode_field(field))
val = new_metadata.get(field.name, old_val) val = new_metadata.get(field.name, old_val)
if field.name in new_metadata: if field.name in new_metadata:
logger.debug(f'Modifying {field.name}: "{old_val}" -> "{val}"') logger.debug(f'Modifying {field.name}: "{old_val.value}" -> "{val.value}" {val.description}')
del new_metadata[field.name] del new_metadata[field.name]
elif val is not None: elif val.value is not None:
logger.debug(f'Copying {field.name}') logger.debug(f'Copying {field.name}')
if val is not None: if val.value is not None:
writer.add_key(field.name) writer.add_key(field.name)
writer.add_val(val, field.types[0]) writer.add_val(val.value, val.type)
if gguf.Keys.Tokenizer.CHAT_TEMPLATE in new_metadata: if gguf.Keys.Tokenizer.CHAT_TEMPLATE in new_metadata:
logger.debug('Adding chat template(s)') logger.debug('Adding chat template(s)')
writer.add_chat_template(new_metadata[gguf.Keys.Tokenizer.CHAT_TEMPLATE]) writer.add_chat_template(new_metadata[gguf.Keys.Tokenizer.CHAT_TEMPLATE].value)
del new_metadata[gguf.Keys.Tokenizer.CHAT_TEMPLATE] del new_metadata[gguf.Keys.Tokenizer.CHAT_TEMPLATE]
# TODO: Support other types than string?
for key, val in new_metadata.items(): for key, val in new_metadata.items():
logger.debug(f'Adding {key}: {val}') logger.debug(f'Adding {key}: "{val.value}" {val.description}')
writer.add_key(key) writer.add_key(key)
writer.add_val(val, gguf.GGUFValueType.STRING) writer.add_val(val.value, val.type)
total_bytes = 0
for tensor in reader.tensors: for tensor in reader.tensors:
total_bytes += tensor.n_bytes
# Dimensions are written in reverse order, so flip them first # Dimensions are written in reverse order, so flip them first
shape = np.flipud(tensor.shape).tolist() shape = np.flipud(tensor.shape).tolist()
writer.add_tensor_info(tensor.name, shape, tensor.data.dtype, tensor.data.nbytes, tensor.tensor_type) writer.add_tensor_info(tensor.name, shape, tensor.data.dtype, tensor.data.nbytes, tensor.tensor_type)
bar = tqdm(desc="Writing", total=total_bytes, unit="byte", unit_scale=True)
writer.write_header_to_file() writer.write_header_to_file()
writer.write_kv_data_to_file() writer.write_kv_data_to_file()
writer.write_ti_data_to_file() writer.write_ti_data_to_file()
for tensor in reader.tensors: for tensor in reader.tensors:
writer.write_tensor_data(tensor.data) writer.write_tensor_data(tensor.data)
bar.update(tensor.n_bytes)
writer.close() writer.close()
def main() -> None: def main() -> None:
tokenizer_metadata = (getattr(gguf.Keys.Tokenizer, n) for n in gguf.Keys.Tokenizer.__dict__.keys() if not n.startswith('_'))
token_names = dict((n.split('.')[-1][:-len('_token_id')], n) for n in tokenizer_metadata if n.endswith('_token_id'))
parser = argparse.ArgumentParser(description="Make a copy of a GGUF file with new metadata") parser = argparse.ArgumentParser(description="Make a copy of a GGUF file with new metadata")
parser.add_argument("input", type=Path, help="GGUF format model input filename") parser.add_argument("input", type=Path, help="GGUF format model input filename")
parser.add_argument("output", type=Path, help="GGUF format model output filename") parser.add_argument("output", type=Path, help="GGUF format model output filename")
parser.add_argument("--general-name", type=str, help="The models general.name") parser.add_argument("--general-name", type=str, help="The models general.name", metavar='"name"')
parser.add_argument("--general-description", type=str, help="The models general.description") parser.add_argument("--general-description", type=str, help="The models general.description", metavar='"Description ..."')
parser.add_argument("--chat-template", type=str, help="Chat template string (or JSON string containing templates)") parser.add_argument("--chat-template", type=str, help="Chat template string (or JSON string containing templates)", metavar='"{% ... %} ..."')
parser.add_argument("--chat-template-config", type=Path, help="Config file (tokenizer_config.json) containing chat template(s)") parser.add_argument("--chat-template-config", type=Path, help="Config file containing chat template(s)", metavar='tokenizer_config.json')
parser.add_argument("--remove-metadata", action="append", type=str, help="Remove metadata (by key name) from output model") parser.add_argument("--remove-metadata", action="append", type=str, help="Remove metadata (by key name) from output model", metavar='general.url')
parser.add_argument("--special-token", action="append", type=str, help="Special token by value", nargs=2, metavar=(' | '.join(token_names.keys()), '"<token>"'))
parser.add_argument("--special-token-by-id", action="append", type=str, help="Special token by id", nargs=2, metavar=(' | '.join(token_names.keys()), '0'))
parser.add_argument("--force", action="store_true", help="Bypass warnings without confirmation") parser.add_argument("--force", action="store_true", help="Bypass warnings without confirmation")
parser.add_argument("--verbose", action="store_true", help="Increase output verbosity") parser.add_argument("--verbose", action="store_true", help="Increase output verbosity")
args = parser.parse_args(None if len(sys.argv) > 2 else ["--help"]) args = parser.parse_args(None if len(sys.argv) > 2 else ["--help"])
@ -133,20 +159,20 @@ def main() -> None:
remove_metadata = args.remove_metadata or [] remove_metadata = args.remove_metadata or []
if args.general_name: if args.general_name:
new_metadata[gguf.Keys.General.NAME] = args.general_name new_metadata[gguf.Keys.General.NAME] = MetadataDetails(gguf.GGUFValueType.STRING, args.general_name)
if args.general_description: if args.general_description:
new_metadata[gguf.Keys.General.DESCRIPTION] = args.general_description new_metadata[gguf.Keys.General.DESCRIPTION] = MetadataDetails(gguf.GGUFValueType.STRING, args.general_description)
if args.chat_template: if args.chat_template:
new_metadata[gguf.Keys.Tokenizer.CHAT_TEMPLATE] = json.loads(args.chat_template) if args.chat_template.startswith('[') else args.chat_template new_metadata[gguf.Keys.Tokenizer.CHAT_TEMPLATE] = MetadataDetails(gguf.GGUFValueType.STRING, json.loads(args.chat_template) if args.chat_template.startswith('[') else args.chat_template)
if args.chat_template_config: if args.chat_template_config:
with open(args.chat_template_config, 'r') as fp: with open(args.chat_template_config, 'r') as fp:
config = json.load(fp) config = json.load(fp)
template = config.get('chat_template') template = config.get('chat_template')
if template: if template:
new_metadata[gguf.Keys.Tokenizer.CHAT_TEMPLATE] = template new_metadata[gguf.Keys.Tokenizer.CHAT_TEMPLATE] = MetadataDetails(gguf.GGUFValueType.STRING, template)
if remove_metadata: if remove_metadata:
logger.warning('*** Warning *** Warning *** Warning **') logger.warning('*** Warning *** Warning *** Warning **')
@ -166,6 +192,32 @@ def main() -> None:
arch = get_field_data(reader, gguf.Keys.General.ARCHITECTURE) arch = get_field_data(reader, gguf.Keys.General.ARCHITECTURE)
endianess = get_byteorder(reader) endianess = get_byteorder(reader)
token_list = get_field_data(reader, gguf.Keys.Tokenizer.LIST) or []
for name, token in args.special_token or []:
if name not in token_names:
logger.warning(f'Unknown special token "{name}", ignoring...')
else:
ids = find_token(token_list, token)
new_metadata[token_names[name]] = MetadataDetails(gguf.GGUFValueType.UINT32, ids[0], f'= {token}')
if len(ids) > 1:
logger.warning(f'Multiple "{token}" tokens found, choosing ID {ids[0]}, use --special-token-by-id if you want another:')
logger.warning(', '.join(str(i) for i in ids))
for name, id_string in args.special_token_by_id or []:
if name not in token_names:
logger.warning(f'Unknown special token "{name}", ignoring...')
elif not id_string.isdecimal():
raise LookupError(f'Token ID "{id_string}" is not a valid ID!')
else:
id_int = int(id_string)
if id_int >= 0 and id_int < len(token_list):
new_metadata[token_names[name]] = MetadataDetails(gguf.GGUFValueType.UINT32, id_int, f'= {token_list[id_int]}')
else:
raise LookupError(f'Token ID {id_int} is not within token list!')
if os.path.isfile(args.output) and not args.force: if os.path.isfile(args.output) and not args.force:
logger.warning('*** Warning *** Warning *** Warning **') logger.warning('*** Warning *** Warning *** Warning **')
logger.warning(f'* The "{args.output}" GGUF file already exists, it will be overwritten!') logger.warning(f'* The "{args.output}" GGUF file already exists, it will be overwritten!')
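A toy illustration of how the new --special-token flag resolves a token string to an ID through the GGUF token list, using the same find_token logic as the script (the list and values here are made up; the real one is read from tokenizer.ggml.tokens):

    token_list = ["<unk>", "<s>", "</s>", "hello", "</s>"]   # hypothetical contents

    def find_token(token_list, token):
        ids = [i for i, t in enumerate(token_list) if t == token]
        if not ids:
            raise LookupError(f'Unable to find "{token}" in token list!')
        return ids

    print(find_token(token_list, "</s>"))   # [2, 4] -> the script stores 2 and warns about the duplicate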

llama.cpp (183 changed lines)
@ -1866,7 +1866,7 @@ struct llama_hparams {
float f_logit_scale = 0.0f; float f_logit_scale = 0.0f;
bool causal_attn = true; bool causal_attn = true;
bool use_alibi = false; // currently, we need KQ_pos data for ALiBi-based models bool use_alibi = false;
enum llama_pooling_type pooling_type = LLAMA_POOLING_TYPE_NONE; enum llama_pooling_type pooling_type = LLAMA_POOLING_TYPE_NONE;
enum llama_rope_type rope_type = LLAMA_ROPE_TYPE_NONE; enum llama_rope_type rope_type = LLAMA_ROPE_TYPE_NONE;
@ -2338,7 +2338,6 @@ struct llama_context {
struct ggml_tensor * inp_pos; // I32 [n_batch] struct ggml_tensor * inp_pos; // I32 [n_batch]
struct ggml_tensor * inp_out_ids; // I32 [n_outputs] struct ggml_tensor * inp_out_ids; // I32 [n_outputs]
struct ggml_tensor * inp_KQ_mask; // F32 [kv_size, n_batch] struct ggml_tensor * inp_KQ_mask; // F32 [kv_size, n_batch]
struct ggml_tensor * inp_KQ_pos; // F32 [n_kv]
struct ggml_tensor * inp_K_shift; // I32 [kv_size] struct ggml_tensor * inp_K_shift; // I32 [kv_size]
struct ggml_tensor * inp_mean; // F32 [n_batch, n_batch] struct ggml_tensor * inp_mean; // F32 [n_batch, n_batch]
struct ggml_tensor * inp_cls; // I32 [n_batch] struct ggml_tensor * inp_cls; // I32 [n_batch]
@ -6586,7 +6585,6 @@ static struct ggml_tensor * llm_build_kqv(
struct ggml_tensor * wo_b, struct ggml_tensor * wo_b,
struct ggml_tensor * q_cur, struct ggml_tensor * q_cur,
struct ggml_tensor * kq_mask, struct ggml_tensor * kq_mask,
struct ggml_tensor * kq_pos,
int32_t n_tokens, int32_t n_tokens,
int32_t n_kv, int32_t n_kv,
float kq_scale, float kq_scale,
@ -6616,10 +6614,6 @@ static struct ggml_tensor * llm_build_kqv(
GGML_UNUSED(model); GGML_UNUSED(model);
GGML_UNUSED(n_ctx); GGML_UNUSED(n_ctx);
// note: if this assert triggers, then some check has failed earlier
// the idea is to detect during context creation that ALiBi would be used and disable Flash Attention
GGML_ASSERT(kq_pos == nullptr && "ALiBi is not yet supported with Flash Attention");
// split cached v into n_head heads (not transposed) // split cached v into n_head heads (not transposed)
struct ggml_tensor * v = struct ggml_tensor * v =
ggml_view_3d(ctx, kv.v_l[il], ggml_view_3d(ctx, kv.v_l[il],
@ -6629,7 +6623,7 @@ static struct ggml_tensor * llm_build_kqv(
0); 0);
cb(v, "v", il); cb(v, "v", il);
cur = ggml_flash_attn_ext(ctx, q, k, v, kq_mask, kq_scale); cur = ggml_flash_attn_ext(ctx, q, k, v, kq_mask, kq_scale, hparams.f_max_alibi_bias);
if (model.arch == LLM_ARCH_PHI2 || model.arch == LLM_ARCH_PHI3) { if (model.arch == LLM_ARCH_PHI2 || model.arch == LLM_ARCH_PHI3) {
ggml_flash_attn_ext_set_prec(cur, GGML_PREC_F32); ggml_flash_attn_ext_set_prec(cur, GGML_PREC_F32);
@ -6660,28 +6654,8 @@ static struct ggml_tensor * llm_build_kqv(
kq = ggml_scale(ctx, kq, 30); kq = ggml_scale(ctx, kq, 30);
} }
-#if defined(GGML_USE_KOMPUTE)
-#pragma message("TODO: ALiBi support in ggml_soft_max_ext is not implemented for Kompute")
-#pragma message(" Falling back to ggml_alibi(). Will become an error in Mar 2024")
-#pragma message("ref: https://github.com/ggerganov/llama.cpp/pull/5488")
-        if (hparams.use_alibi) {
-            kq = ggml_scale(ctx, kq, kq_scale);
-            cb(kq, "kq_scaled", il);
-            kq = ggml_alibi(ctx, kq, /*n_past*/ 0, n_head, hparams.f_max_alibi_bias);
-            cb(kq, "kq_scaled_alibi", il);
-            kq = ggml_add(ctx, kq, kq_mask);
-            cb(kq, "kq_masked", il);
-            kq = ggml_soft_max(ctx, kq);
-            cb(kq, "kq_soft_max", il);
-        } else
-#endif
-        {
-            kq = ggml_soft_max_ext(ctx, kq, kq_mask, kq_pos, kq_scale, hparams.f_max_alibi_bias);
+        kq = ggml_soft_max_ext(ctx, kq, kq_mask, kq_scale, hparams.f_max_alibi_bias);
         cb(kq, "kq_soft_max_ext", il);
-        }
GGML_ASSERT(kv.size == n_ctx); GGML_ASSERT(kv.size == n_ctx);
@ -6731,7 +6705,6 @@ static struct ggml_tensor * llm_build_kv(
struct ggml_tensor * v_cur, struct ggml_tensor * v_cur,
struct ggml_tensor * q_cur, struct ggml_tensor * q_cur,
struct ggml_tensor * kq_mask, struct ggml_tensor * kq_mask,
struct ggml_tensor * kq_pos,
int32_t n_tokens, int32_t n_tokens,
int32_t kv_head, int32_t kv_head,
int32_t n_kv, int32_t n_kv,
@ -6750,7 +6723,7 @@ static struct ggml_tensor * llm_build_kv(
struct ggml_tensor * cur; struct ggml_tensor * cur;
cur = llm_build_kqv(ctx, model, hparams, cparams, kv, graph, wo, wo_b, cur = llm_build_kqv(ctx, model, hparams, cparams, kv, graph, wo, wo_b,
q_cur, kq_mask, kq_pos, n_tokens, n_kv, kq_scale, cb, il); q_cur, kq_mask, n_tokens, n_kv, kq_scale, cb, il);
cb(cur, "kqv_out", il); cb(cur, "kqv_out", il);
return cur; return cur;
@ -6862,7 +6835,6 @@ struct llm_build_context {
lctx.inp_pos = nullptr; lctx.inp_pos = nullptr;
lctx.inp_out_ids = nullptr; lctx.inp_out_ids = nullptr;
lctx.inp_KQ_mask = nullptr; lctx.inp_KQ_mask = nullptr;
lctx.inp_KQ_pos = nullptr;
lctx.inp_K_shift = nullptr; lctx.inp_K_shift = nullptr;
lctx.inp_mean = nullptr; lctx.inp_mean = nullptr;
lctx.inp_cls = nullptr; lctx.inp_cls = nullptr;
@ -7018,19 +6990,6 @@ struct llm_build_context {
return flash_attn ? ggml_cast(ctx0, lctx.inp_KQ_mask, GGML_TYPE_F16) : lctx.inp_KQ_mask; return flash_attn ? ggml_cast(ctx0, lctx.inp_KQ_mask, GGML_TYPE_F16) : lctx.inp_KQ_mask;
} }
struct ggml_tensor * build_inp_KQ_pos(bool causal = true) {
if (causal) {
lctx.inp_KQ_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, n_kv);
} else {
// TODO: this will be needed for ALiBi-based BERT models
// https://github.com/ggerganov/llama.cpp/pull/6826
lctx.inp_KQ_pos = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_tokens, n_tokens);
}
cb(lctx.inp_KQ_pos, "KQ_pos", -1);
ggml_set_input(lctx.inp_KQ_pos);
return flash_attn ? ggml_cast(ctx0, lctx.inp_KQ_pos, GGML_TYPE_F16) : lctx.inp_KQ_pos;
}
struct ggml_tensor * build_inp_mean() { struct ggml_tensor * build_inp_mean() {
lctx.inp_mean = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_tokens, n_tokens); lctx.inp_mean = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_tokens, n_tokens);
cb(lctx.inp_mean, "inp_mean", -1); cb(lctx.inp_mean, "inp_mean", -1);
@ -7136,7 +7095,7 @@ struct llm_build_context {
cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf, cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
model.layers[il].wo, model.layers[il].bo, model.layers[il].wo, model.layers[il].bo,
Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
} }
if (il == n_layer - 1) { if (il == n_layer - 1) {
@ -7229,9 +7188,6 @@ struct llm_build_context {
// KQ_mask (mask for 1 head, it will be broadcasted to all heads) // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
struct ggml_tensor * KQ_mask = build_inp_KQ_mask(); struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
// positions of the tokens in the KV cache
struct ggml_tensor * KQ_pos = build_inp_KQ_pos();
for (int il = 0; il < n_layer; ++il) { for (int il = 0; il < n_layer; ++il) {
struct ggml_tensor * inpSA = inpL; struct ggml_tensor * inpSA = inpL;
@ -7276,7 +7232,7 @@ struct llm_build_context {
cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf, cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
model.layers[il].wo, NULL, model.layers[il].wo, NULL,
Kcur, Vcur, Qcur, KQ_mask, KQ_pos, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
} }
if (il == n_layer - 1) { if (il == n_layer - 1) {
@ -7346,9 +7302,6 @@ struct llm_build_context {
// KQ_mask (mask for 1 head, it will be broadcasted to all heads) // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
struct ggml_tensor * KQ_mask = build_inp_KQ_mask(); struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
// positions of the tokens in the KV cache
struct ggml_tensor * KQ_pos = build_inp_KQ_pos();
for (int il = 0; il < n_layer; ++il) { for (int il = 0; il < n_layer; ++il) {
struct ggml_tensor * inpSA = inpL; struct ggml_tensor * inpSA = inpL;
@ -7383,7 +7336,7 @@ struct llm_build_context {
cb(Kcur, "Kcur", il); cb(Kcur, "Kcur", il);
cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf, cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
model.layers[il].wo, NULL, model.layers[il].wo, NULL,
Kcur, Vcur, Qcur, KQ_mask, KQ_pos, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
} }
if (il == n_layer - 1) { if (il == n_layer - 1) {
@ -7503,7 +7456,7 @@ struct llm_build_context {
cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf, cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
model.layers[il].wo, NULL, model.layers[il].wo, NULL,
Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
} }
if (il == n_layer - 1) { if (il == n_layer - 1) {
@ -7628,7 +7581,7 @@ struct llm_build_context {
cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf, cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
model.layers[il].wo, model.layers[il].bo, model.layers[il].wo, model.layers[il].bo,
Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f, cb, il); Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f, cb, il);
} }
if (il == n_layer - 1) { if (il == n_layer - 1) {
@ -7780,7 +7733,7 @@ struct llm_build_context {
cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf, cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
model.layers[il].wo, NULL, model.layers[il].wo, NULL,
Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
} }
if (il == n_layer - 1) { if (il == n_layer - 1) {
@ -7892,7 +7845,7 @@ struct llm_build_context {
cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf, cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
model.layers[il].wo, model.layers[il].bo, model.layers[il].wo, model.layers[il].bo,
Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
} }
if (il == n_layer - 1) { if (il == n_layer - 1) {
@ -8096,7 +8049,7 @@ struct llm_build_context {
cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf, cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
model.layers[il].wo, model.layers[il].bo, model.layers[il].wo, model.layers[il].bo,
Kcur, Vcur, Q, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); Kcur, Vcur, Q, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
} }
if (il == n_layer - 1) { if (il == n_layer - 1) {
@ -8162,9 +8115,6 @@ struct llm_build_context {
// KQ_mask (mask for 1 head, it will be broadcasted to all heads) // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
struct ggml_tensor * KQ_mask = build_inp_KQ_mask(); struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
// positions of the tokens in the KV cache
struct ggml_tensor * KQ_pos = build_inp_KQ_pos();
for (int il = 0; il < n_layer; ++il) { for (int il = 0; il < n_layer; ++il) {
struct ggml_tensor * inpSA = inpL; struct ggml_tensor * inpSA = inpL;
@ -8192,7 +8142,7 @@ struct llm_build_context {
cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf, cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
model.layers[il].wo, NULL, model.layers[il].wo, NULL,
Kcur, Vcur, Qcur, KQ_mask, KQ_pos, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
} }
if (il == n_layer - 1) { if (il == n_layer - 1) {
@ -8280,9 +8230,6 @@ struct llm_build_context {
// KQ_mask (mask for 1 head, it will be broadcasted to all heads) // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
struct ggml_tensor * KQ_mask = build_inp_KQ_mask(false); struct ggml_tensor * KQ_mask = build_inp_KQ_mask(false);
// positions of the tokens in the KV cache
struct ggml_tensor * KQ_pos = build_inp_KQ_pos(false);
// iterate layers // iterate layers
for (int il = 0; il < n_layer; ++il) { for (int il = 0; il < n_layer; ++il) {
struct ggml_tensor * cur = inpL; struct ggml_tensor * cur = inpL;
@ -8351,7 +8298,7 @@ struct llm_build_context {
struct ggml_tensor * kq = ggml_mul_mat(ctx0, k, q); struct ggml_tensor * kq = ggml_mul_mat(ctx0, k, q);
cb(kq, "kq", il); cb(kq, "kq", il);
kq = ggml_soft_max_ext(ctx0, kq, KQ_mask, KQ_pos, 1.0f/sqrtf(float(n_embd_head)), hparams.f_max_alibi_bias); kq = ggml_soft_max_ext(ctx0, kq, KQ_mask, 1.0f/sqrtf(float(n_embd_head)), hparams.f_max_alibi_bias);
cb(kq, "kq_soft_max_ext", il); cb(kq, "kq_soft_max_ext", il);
struct ggml_tensor * v = ggml_cont(ctx0, ggml_transpose(ctx0, ggml_reshape_2d(ctx0, Vcur, n_embd_gqa, n_tokens))); struct ggml_tensor * v = ggml_cont(ctx0, ggml_transpose(ctx0, ggml_reshape_2d(ctx0, Vcur, n_embd_gqa, n_tokens)));
@ -8476,9 +8423,6 @@ struct llm_build_context {
// KQ_mask (mask for 1 head, it will be broadcasted to all heads) // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
struct ggml_tensor * KQ_mask = build_inp_KQ_mask(); struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
// positions of the tokens in the KV cache
struct ggml_tensor * KQ_pos = build_inp_KQ_pos();
inpL = llm_build_norm(ctx0, inpL, hparams, inpL = llm_build_norm(ctx0, inpL, hparams,
model.tok_norm, model.tok_norm,
model.tok_norm_b, model.tok_norm_b,
@ -8512,7 +8456,7 @@ struct llm_build_context {
cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf, cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
model.layers[il].wo, model.layers[il].bo, model.layers[il].wo, model.layers[il].bo,
Kcur, Vcur, Qcur, KQ_mask, KQ_pos, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
} }
if (il == n_layer - 1) { if (il == n_layer - 1) {
@ -8577,9 +8521,6 @@ struct llm_build_context {
// KQ_mask (mask for 1 head, it will be broadcasted to all heads) // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
struct ggml_tensor * KQ_mask = build_inp_KQ_mask(); struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
// positions of the tokens in the KV cache
struct ggml_tensor * KQ_pos = build_inp_KQ_pos();
if (model.pos_embd) { if (model.pos_embd) {
// inp_pos - contains the positions // inp_pos - contains the positions
struct ggml_tensor * inp_pos = build_inp_pos(); struct ggml_tensor * inp_pos = build_inp_pos();
@ -8643,13 +8584,13 @@ struct llm_build_context {
cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf, cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
model.layers[il].wo, model.layers[il].bo, model.layers[il].wo, model.layers[il].bo,
Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
} else { } else {
Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf, cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
model.layers[il].wo, model.layers[il].bo, model.layers[il].wo, model.layers[il].bo,
Kcur, Vcur, Qcur, KQ_mask, KQ_pos, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
} }
} }
@ -8793,7 +8734,7 @@ struct llm_build_context {
cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf, cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
model.layers[il].wo, NULL, model.layers[il].wo, NULL,
Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
} }
if (il == n_layer - 1) { if (il == n_layer - 1) {
@ -8911,7 +8852,7 @@ struct llm_build_context {
cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf, cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
model.layers[il].wo, NULL, model.layers[il].wo, NULL,
Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
} }
if (il == n_layer - 1) { if (il == n_layer - 1) {
@ -9024,7 +8965,7 @@ struct llm_build_context {
cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf, cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
model.layers[il].wo, model.layers[il].bo, model.layers[il].wo, model.layers[il].bo,
Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
} }
if (il == n_layer - 1) { if (il == n_layer - 1) {
@ -9138,7 +9079,7 @@ struct llm_build_context {
cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf, cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
model.layers[il].wo, model.layers[il].bo, model.layers[il].wo, model.layers[il].bo,
Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
} }
if (il == n_layer - 1) { if (il == n_layer - 1) {
@ -9293,7 +9234,7 @@ struct llm_build_context {
cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf, cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
model.layers[il].wo, model.layers[il].bo, model.layers[il].wo, model.layers[il].bo,
Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f, cb, il); Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f, cb, il);
} }
if (il == n_layer - 1) { if (il == n_layer - 1) {
@ -9410,7 +9351,7 @@ struct llm_build_context {
cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf, cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
model.layers[il].wo, model.layers[il].bo, model.layers[il].wo, model.layers[il].bo,
Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f, cb, il); Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f, cb, il);
} }
if (il == n_layer - 1) { if (il == n_layer - 1) {
@ -9523,7 +9464,7 @@ struct llm_build_context {
cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf, cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
model.layers[il].wo, NULL, model.layers[il].wo, NULL,
Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
} }
struct ggml_tensor * sa_out = cur; struct ggml_tensor * sa_out = cur;
@ -9626,7 +9567,7 @@ struct llm_build_context {
cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf, cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
model.layers[il].wo, model.layers[il].bo, model.layers[il].wo, model.layers[il].bo,
Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
} }
if (il == n_layer - 1) { if (il == n_layer - 1) {
@ -9733,7 +9674,7 @@ struct llm_build_context {
cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf, cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
model.layers[il].wo, model.layers[il].bo, model.layers[il].wo, model.layers[il].bo,
Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
} }
if (il == n_layer - 1) { if (il == n_layer - 1) {
@ -9849,7 +9790,7 @@ struct llm_build_context {
cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf, cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
model.layers[il].wo, NULL, model.layers[il].wo, NULL,
Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
} }
if (il == n_layer - 1) { if (il == n_layer - 1) {
@ -9966,7 +9907,7 @@ struct llm_build_context {
cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf, cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
model.layers[il].wo, model.layers[il].bo, model.layers[il].wo, model.layers[il].bo,
Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
} }
if (il == n_layer - 1) { if (il == n_layer - 1) {
@ -10096,7 +10037,7 @@ struct llm_build_context {
cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf, cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
model.layers[il].wo, model.layers[il].bo, model.layers[il].wo, model.layers[il].bo,
Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
} }
if (il == n_layer - 1) { if (il == n_layer - 1) {
@ -10217,7 +10158,7 @@ struct llm_build_context {
cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf, cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
model.layers[il].wo, NULL, model.layers[il].wo, NULL,
Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f, cb, il); Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f, cb, il);
} }
if (il == n_layer - 1) { if (il == n_layer - 1) {
@ -10336,7 +10277,7 @@ struct llm_build_context {
cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf, cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
model.layers[il].wo, model.layers[il].bo, model.layers[il].wo, model.layers[il].bo,
Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
} }
if (il == n_layer - 1) { if (il == n_layer - 1) {
@ -10626,7 +10567,7 @@ struct llm_build_context {
cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf, cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
model.layers[il].wo, model.layers[il].bo, model.layers[il].wo, model.layers[il].bo,
Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
} }
if (il == n_layer - 1) { if (il == n_layer - 1) {
@ -10757,7 +10698,7 @@ struct llm_build_context {
cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf, cur = llm_build_kv(ctx0, model, hparams, cparams, kv_self, gf,
model.layers[il].wo, nullptr, model.layers[il].wo, nullptr,
Kcur, Vcur, Qcur, KQ_mask, nullptr, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); Kcur, Vcur, Qcur, KQ_mask, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);
} }
if (il == n_layer - 1) { if (il == n_layer - 1) {
@ -11145,12 +11086,22 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) {
                     float f;
                     if (!lctx.kv_self.cells[i].has_seq_id(seq_id) || lctx.kv_self.cells[i].pos > pos) {
                         f = -INFINITY;
                     } else {
+                        if (hparams.use_alibi) {
+                            f = -fabs(lctx.kv_self.cells[i].pos - pos);
+                        } else {
                             f = 0.0f;
+                        }
                     }
                     data[h*(n_kv*n_tokens) + j*n_kv + i] = f;
                 }
             }
+            for (int i = n_tokens; i < GGML_PAD(n_tokens, GGML_KQ_MASK_PAD); ++i) {
+                for (int j = 0; j < n_kv; ++j) {
+                    data[h*(n_kv*n_tokens) + i*n_kv + j] = -INFINITY;
+                }
+            }
} }
} else { } else {
// when using kv cache, the mask needs to match the kv cache size // when using kv cache, the mask needs to match the kv cache size
@ -11169,7 +11120,11 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) {
float f = -INFINITY; float f = -INFINITY;
for (int s = 0; s < batch.n_seq_id[i]; ++s) { for (int s = 0; s < batch.n_seq_id[i]; ++s) {
if (batch.seq_id[i][s] == seq_id) { if (batch.seq_id[i][s] == seq_id) {
if (hparams.use_alibi) {
f = -fabs(batch.pos[i] - batch.pos[j]);
} else {
f = 0.0f; f = 0.0f;
}
break; break;
} }
} }
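With inp_KQ_pos removed, ALiBi now travels entirely through KQ_mask: an allowed cell holds -|pos_i - pos_j| instead of 0, and the soft-max / flash-attention kernels multiply that value by the per-head slope. A simplified sketch of the mask logic above (single sequence, no padding or seq-id checks):

    def build_kq_mask(kv_pos, q_pos, use_alibi, causal=True):
        mask = []
        for p in q_pos:                       # one row per query token
            row = []
            for kp in kv_pos:                 # one column per KV cell
                if causal and kp > p:
                    row.append(float("-inf"))
                elif use_alibi:
                    row.append(float(-abs(kp - p)))
                else:
                    row.append(0.0)
            mask.append(row)
        return mask

    # build_kq_mask([0, 1, 2], [2], use_alibi=True) -> [[-2.0, -1.0, 0.0]]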
@ -11185,32 +11140,6 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) {
} }
} }
// ALiBi requires the KQ_pos tensor to provide the sequence position of each token in the batch
// this allows to process multiple sequences in parallel with ALiBi-based models
if (hparams.use_alibi) {
const int64_t n_kv = kv_self.n;
GGML_ASSERT(lctx.inp_KQ_pos);
GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_KQ_pos->buffer));
GGML_ASSERT(ggml_is_vector(lctx.inp_KQ_pos) || ggml_is_matrix(lctx.inp_KQ_pos));
if (ggml_is_vector(lctx.inp_KQ_pos)) {
float * data = (float *) lctx.inp_KQ_pos->data;
for (int i = 0; i < n_kv; ++i) {
data[i] = float(lctx.kv_self.cells[i].pos);
}
} else if(ggml_is_matrix(lctx.inp_KQ_pos)) {
const int64_t n_tokens = batch.n_tokens;
float * data = (float *) lctx.inp_KQ_pos->data;
for (int i = 0; i < n_tokens; ++i) {
for (int j = 0; j < n_tokens; ++j) {
data[i * n_tokens + j] = -1.0 * abs(i - j);
}
}
}
}
if (cparams.pooling_type == LLAMA_POOLING_TYPE_MEAN) { if (cparams.pooling_type == LLAMA_POOLING_TYPE_MEAN) {
const int64_t n_tokens = batch.n_tokens; const int64_t n_tokens = batch.n_tokens;
@ -11563,7 +11492,7 @@ static int llama_decode_internal(
} }
// non-causal masks do not use the KV cache // non-causal masks do not use the KV cache
if (hparams.causal_attn || model.arch == LLM_ARCH_JINA_BERT_V2) { if (hparams.causal_attn) {
llama_kv_cache_update(&lctx); llama_kv_cache_update(&lctx);
// if we have enough unused cells before the current head -> // if we have enough unused cells before the current head ->
@ -12613,7 +12542,7 @@ struct llm_tokenizer_wpm {
continue; continue;
} }
code = unicode_tolower(code); code = unicode_tolower(code);
if (type == CODEPOINT_TYPE_WHITESPACE) { if (type == CODEPOINT_TYPE_SEPARATOR) {
code = ' '; code = ' ';
} }
std::string s = unicode_cpt_to_utf8(code); std::string s = unicode_cpt_to_utf8(code);
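The WPM pre-tokenizer now keys on the separator category (\p{Z}) rather than the old whitespace table, matching the renamed tables in the generator script further below. A quick way to see where \p{Z} and \s disagree for a given codepoint (illustrative):

    import regex

    for ch in [' ', '\t', '\n', '\u00a0', '\u3000']:
        is_sep = regex.match(r'\p{Z}', ch) is not None
        is_ws  = regex.match(r'\s', ch) is not None
        print(f"U+{ord(ch):04X} separator={is_sep} whitespace={is_ws}")
    # tabs and newlines are whitespace but not separators; U+00A0 and U+3000 are both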
@ -15639,23 +15568,11 @@ struct llama_context * llama_new_context_with_model(
} }
} }
if (cparams.flash_attn && hparams.use_alibi) {
LLAMA_LOG_WARN("%s: flash_attn is not yet compatible with ALiBi - forcing off\n", __func__);
cparams.flash_attn = false;
}
if (cparams.flash_attn && model->arch == LLM_ARCH_GROK) { if (cparams.flash_attn && model->arch == LLM_ARCH_GROK) {
LLAMA_LOG_WARN("%s: flash_attn is not compatible with Grok - forcing off\n", __func__); LLAMA_LOG_WARN("%s: flash_attn is not compatible with Grok - forcing off\n", __func__);
cparams.flash_attn = false; cparams.flash_attn = false;
} }
#ifdef GGML_USE_HIPBLAS
if (cparams.flash_attn) {
LLAMA_LOG_WARN("%s: flash_attn is not yet compatible with HIPBLAS builds - forcing off\n", __func__);
cparams.flash_attn = false;
}
#endif
if (params.seed == LLAMA_DEFAULT_SEED) { if (params.seed == LLAMA_DEFAULT_SEED) {
params.seed = time(NULL); params.seed = time(NULL);
} }
@ -18010,7 +17927,7 @@ struct llama_timings llama_get_timings(struct llama_context * ctx) {
/*.t_eval_ms =*/ 1e-3 * ctx->t_eval_us, /*.t_eval_ms =*/ 1e-3 * ctx->t_eval_us,
/*.n_sample =*/ std::max(1, ctx->n_sample), /*.n_sample =*/ std::max(1, ctx->n_sample),
/*.n_p_eval =*/ std::max(1, ctx->n_p_eval), /*.n_p_eval =*/ std::max(0, ctx->n_p_eval),
/*.n_eval =*/ std::max(1, ctx->n_eval), /*.n_eval =*/ std::max(1, ctx->n_eval),
}; };

@ -1,31 +1,14 @@
import regex import regex
def cpt_to_utf8_str(cpt):
if cpt <= 0xFF:
return bytes([cpt, 0, 0, 0])
elif cpt <= 0xFFFF:
return bytes([cpt & 0xFF, cpt >> 8, 0, 0])
elif cpt <= 0xFFFFFF:
return bytes([cpt & 0xFF, (cpt >> 8) & 0xFF, (cpt >> 16) & 0xFF, 0])
else:
return bytes([cpt & 0xFF, (cpt >> 8) & 0xFF, (cpt >> 16) & 0xFF, cpt >> 24])
def is_match(codepoint, regex_expr):
try:
res = regex.match(regex_expr, cpt_to_utf8_str(codepoint).decode('utf-32'))
return res is not None
except Exception:
return False
def get_matches(regex_expr): def get_matches(regex_expr):
regex_expr_compiled = regex.compile(regex_expr)
unicode_ranges = [] unicode_ranges = []
current_range = None current_range = None
for codepoint in range(0x110000): for codepoint in range(0x110000):
if is_match(codepoint, regex_expr): char = chr(codepoint)
if regex_expr_compiled.match(char):
if current_range is None: if current_range is None:
current_range = [codepoint, codepoint] current_range = [codepoint, codepoint]
else: else:
@ -40,27 +23,42 @@ def get_matches(regex_expr):
return unicode_ranges return unicode_ranges
-def print_cat(cat, ranges):
+def print_cat(mode, cat, ranges):
+    if mode == "range":
         print("const std::vector<std::pair<uint32_t, uint32_t>> unicode_ranges_{} = {{".format(cat))  # noqa: NP100
-    cnt = 0
-    for start, end in ranges:
-        if cnt % 4 != 0:
-            print(" ", end="")  # noqa: NP100
-        print("{{0x{:08X}, 0x{:08X}}},".format(start, end), end="")  # noqa: NP100
-        if cnt % 4 == 3:
-            print("")  # noqa: NP100
-        cnt += 1
-    if cnt % 4 != 0:
-        print("")  # noqa: NP100
+    if mode == "map":
+        print("const std::map<uint32_t, uint32_t> unicode_map_{} = {{".format(cat))  # noqa: NP100
+    for i, values in enumerate(ranges):
+        end = ",\n" if (i % 4 == 3 or i + 1 == len(ranges)) else ", "
+        values = ["0x%08X" % value for value in values]
+        print("{" + ", ".join(values) + "}", end=end)  # noqa: NP100
     print("};")  # noqa: NP100
     print("")  # noqa: NP100
print_cat("number", get_matches(r'\p{N}')) print_cat("range", "number", get_matches(r'\p{N}'))
print_cat("letter", get_matches(r'\p{L}')) print_cat("range", "letter", get_matches(r'\p{L}'))
print_cat("whitespace", get_matches(r'\p{Z}')) print_cat("range", "separator", get_matches(r'\p{Z}'))
print_cat("accent_mark", get_matches(r'\p{M}')) print_cat("range", "accent_mark", get_matches(r'\p{M}'))
print_cat("punctuation", get_matches(r'\p{P}')) print_cat("range", "punctuation", get_matches(r'\p{P}'))
print_cat("symbol", get_matches(r'\p{S}')) print_cat("range", "symbol", get_matches(r'\p{S}'))
print_cat("control", get_matches(r'\p{C}')) print_cat("range", "control", get_matches(r'\p{C}'))
print_cat("range", "whitespace", get_matches(r'\s'))
map_lowercase = []
map_uppercase = []
for codepoint in range(0x110000):
char = chr(codepoint)
lower = ord(char.lower()[0])
upper = ord(char.upper()[0])
if codepoint != lower:
map_lowercase.append((codepoint, lower))
if codepoint != upper:
map_uppercase.append((codepoint, upper))
print_cat("map", "lowercase", map_lowercase)
print_cat("map", "uppercase", map_uppercase)
# TODO: generate unicode_map_nfd
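For reference, the two print_cat modes emit either range tables or codepoint maps as C++ initializers; with tiny made-up inputs the output looks roughly like this (assuming the print_cat defined above):

    print_cat("range", "digit_sample", [(0x0030, 0x0039), (0x0660, 0x0669)])
    # const std::vector<std::pair<uint32_t, uint32_t>> unicode_ranges_digit_sample = {
    # {0x00000030, 0x00000039}, {0x00000660, 0x00000669},
    # };

    print_cat("map", "lowercase_sample", [(0x0041, 0x0061), (0x0042, 0x0062)])
    # const std::map<uint32_t, uint32_t> unicode_map_lowercase_sample = {
    # {0x00000041, 0x00000061}, {0x00000042, 0x00000062},
    # };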

@ -1111,11 +1111,7 @@ struct test_soft_max : public test_case {
if (this->mask) { if (this->mask) {
mask = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, ne[0], ne[1]); mask = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, ne[0], ne[1]);
} }
ggml_tensor * pos = nullptr; ggml_tensor * out = ggml_soft_max_ext(ctx, a, mask, scale, max_bias);
if (max_bias > 0.0f) {
pos = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, ne[0]);
}
ggml_tensor * out = ggml_soft_max_ext(ctx, a, mask, pos, scale, max_bias);
return out; return out;
} }
}; };
@ -1490,23 +1486,25 @@ struct test_flash_attn_ext : public test_case {
const int64_t kv; // kv size const int64_t kv; // kv size
const int64_t nb; // batch size const int64_t nb; // batch size
const float max_bias; // ALiBi
std::string vars() override { std::string vars() override {
return VARS_TO_STR4(hs, nh, kv, nb); return VARS_TO_STR5(hs, nh, kv, nb, max_bias);
} }
double max_nmse_err() override { double max_nmse_err() override {
return 5e-4; return 5e-4;
} }
test_flash_attn_ext(int64_t hs = 128, int64_t nh = 32, int64_t kv = 96, int64_t nb = 8) test_flash_attn_ext(int64_t hs = 128, int64_t nh = 32, int64_t kv = 96, int64_t nb = 8, float max_bias = 0.0f)
: hs(hs), nh(nh), kv(kv), nb(nb) {} : hs(hs), nh(nh), kv(kv), nb(nb), max_bias(max_bias) {}
ggml_tensor * build_graph(ggml_context * ctx) override { ggml_tensor * build_graph(ggml_context * ctx) override {
ggml_tensor * q = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, hs, nb, nh, 1); ggml_tensor * q = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, hs, nb, nh, 1);
ggml_tensor * k = ggml_new_tensor_4d(ctx, GGML_TYPE_F16, hs, kv, nh, 1); ggml_tensor * k = ggml_new_tensor_4d(ctx, GGML_TYPE_F16, hs, kv, nh, 1);
ggml_tensor * v = ggml_new_tensor_4d(ctx, GGML_TYPE_F16, hs, kv, nh, 1); ggml_tensor * v = ggml_new_tensor_4d(ctx, GGML_TYPE_F16, hs, kv, nh, 1);
ggml_tensor * mask = ggml_new_tensor_4d(ctx, GGML_TYPE_F16, kv, GGML_PAD(nb, GGML_KQ_MASK_PAD), 1, 1); ggml_tensor * mask = ggml_new_tensor_4d(ctx, GGML_TYPE_F16, kv, GGML_PAD(nb, GGML_KQ_MASK_PAD), 1, 1);
ggml_tensor * out = ggml_flash_attn_ext(ctx, q, k, v, mask, 1.0f/sqrtf(hs)); ggml_tensor * out = ggml_flash_attn_ext(ctx, q, k, v, mask, 1.0f/sqrtf(hs), max_bias);
return out; return out;
} }
}; };
@ -1611,7 +1609,7 @@ public:
struct ggml_tensor * kq = ggml_mul_mat(ctx, k, q); struct ggml_tensor * kq = ggml_mul_mat(ctx, k, q);
kq = ggml_soft_max_ext(ctx, kq, kq_mask, nullptr, kq_scale, 0.0f); kq = ggml_soft_max_ext(ctx, kq, kq_mask, kq_scale, 0.0f);
// split cached v into n_head heads // split cached v into n_head heads
struct ggml_tensor * v = struct ggml_tensor * v =
@ -2175,11 +2173,17 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op
test_cases.emplace_back(new test_timestep_embedding()); test_cases.emplace_back(new test_timestep_embedding());
test_cases.emplace_back(new test_leaky_relu()); test_cases.emplace_back(new test_leaky_relu());
#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
for (int hs : { 64, 128, }) { // other head sizes not implemented
#else
for (int hs : { 64, 80, 128, 256, }) { for (int hs : { 64, 80, 128, 256, }) {
#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
for (float max_bias : {0.0f, 8.0f}) {
for (int nh : { 32, }) { for (int nh : { 32, }) {
for (int kv : { 512, 1024, }) { for (int kv : { 512, 1024, }) {
for (int nb : { 1, 2, 4, 8, }) { for (int nb : { 1, 2, 4, 8, }) {
test_cases.emplace_back(new test_flash_attn_ext(hs, nh, kv, nb)); test_cases.emplace_back(new test_flash_attn_ext(hs, nh, kv, nb, max_bias));
}
} }
} }
} }
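The flash-attention tests now cross max_bias with the existing dimensions; a quick count of what gets registered on the non-HIP branch (sketch):

    from itertools import product

    hs_list = [64, 80, 128, 256]          # [64, 128] on HIPBLAS/AMD builds
    cases = list(product(hs_list, [0.0, 8.0], [32], [512, 1024], [1, 2, 4, 8]))
    print(len(cases))                     # 64 test_flash_attn_ext cases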

@ -0,0 +1,295 @@
# Test libllama tokenizer == AutoTokenizer.
# Brute force random tokens/text generation.
#
# Sample usage:
#
# python3 tests/test-tokenizer-random.py ./models/ggml-vocab-llama-bpe.gguf ./models/tokenizers/llama-bpe
#
import time
import logging
import argparse
import subprocess
import random
from typing import Iterator
import cffi
from transformers import AutoTokenizer, PreTrainedTokenizerBase
logger = logging.getLogger("test-tokenizer-random-bpe")
class LibLlama:
DEFAULT_PATH_LLAMA_H = "./llama.h"
DEFAULT_PATH_LIBLLAMA = "./build/libllama.so" # CMakeLists.txt: BUILD_SHARED_LIBS ON
def __init__(self, path_llama_h: str = None, path_libllama: str = None):
path_llama_h = path_llama_h or self.DEFAULT_PATH_LLAMA_H
path_libllama = path_libllama or self.DEFAULT_PATH_LIBLLAMA
(self.ffi, self.lib) = self._load_libllama_cffi(path_llama_h, path_libllama)
self.lib.llama_backend_init()
def _load_libllama_cffi(self, path_llama_h: str, path_libllama: str):
cmd = ["gcc", "-E", "-P", "-D__restrict=", "-D__attribute__(x)=", "-D__asm__(x)=", path_llama_h]
res = subprocess.run(cmd, stdout=subprocess.PIPE)
assert (res.returncode == 0)
source = res.stdout.decode()
ffi = cffi.FFI()
if True: # workarounds for pycparser
source = "typedef struct { } __builtin_va_list;" + "\n" + source
source = source.replace("sizeof (int)", str(ffi.sizeof("int")))
source = source.replace("sizeof (void *)", str(ffi.sizeof("void*")))
source = source.replace("sizeof (size_t)", str(ffi.sizeof("size_t")))
source = source.replace("sizeof(int32_t)", str(ffi.sizeof("int32_t")))
ffi.cdef(source, override=True)
lib = ffi.dlopen(path_libllama)
return (ffi, lib)
def model_default_params(self, **kwargs):
mparams = self.lib.llama_model_default_params()
for k, v in kwargs.items():
setattr(mparams, k, v)
return mparams
def context_default_params(self, **kwargs):
cparams = self.lib.llama_context_default_params()
for k, v in kwargs.items():
setattr(cparams, k, v)
return cparams
class LibLlamaModel:
def __init__(self, libllama: LibLlama, path_model: str, mparams={}, cparams={}):
self.lib = libllama.lib
self.ffi = libllama.ffi
if isinstance(mparams, dict):
mparams = libllama.model_default_params(**mparams)
self.model = self.lib.llama_load_model_from_file(path_model.encode(), mparams)
if not self.model:
raise RuntimeError("error: failed to load model '%s'" % path_model)
if isinstance(cparams, dict):
cparams = libllama.context_default_params(**cparams)
self.ctx = self.lib.llama_new_context_with_model(self.model, cparams)
if not self.ctx:
raise RuntimeError("error: failed to create context for model '%s'" % path_model)
n_tokens_max = self.lib.llama_n_ctx(self.ctx)
self.token_ids = self.ffi.new("llama_token[]", n_tokens_max)
def free(self):
if self.ctx:
self.lib.llama_free(self.ctx)
if self.model:
self.lib.llama_free_model(self.model)
self.ctx = None
self.model = None
self.lib = None
def tokenize(self, text: str, n_tokens_max: int = 0, add_special: bool = False, parse_special: bool = False) -> list[int]:
n_tokens_max = n_tokens_max if n_tokens_max > 0 else len(self.token_ids)
text = text.encode("utf-8")
num = self.lib.llama_tokenize(self.model, text, len(text), self.token_ids, n_tokens_max, add_special, parse_special)
if num < 0:
return []
return list(self.token_ids[0:num])
def generator_custom_text() -> Iterator[str]:
"""General tests"""
yield from [
"",
" ",
" ",
" ",
"\t",
"\n",
"\n\n",
"\n\n\n",
"\t\n",
"Hello world",
" Hello world",
"Hello World",
" Hello World",
" Hello World!",
"Hello, world!",
" Hello, world!",
" this is 🦙.cpp",
"w048 7tuijk dsdfhu",
"нещо на Български",
"កាន់តែពិសេសអាចខលចេញ",
"🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ (only emoji that has its own token)",
"Hello",
" Hello",
" Hello",
" Hello",
" Hello",
" Hello\n Hello",
" (",
"\n =",
"' era",
"Hello, y'all! How are you 😁 ?我想在apple工作1314151天",
"3",
"33",
"333",
"3333",
"33333",
"333333",
"3333333",
"33333333",
"333333333",
]
def generator_custom_text_edge_cases() -> Iterator[str]:
"""Edge cases found while debugging"""
yield from [
'\x1f-a', # unicode_ranges_control, {0x00001C, 0x00001F}
'¼-a', # unicode_ranges_digit, 0x00BC
'½-a', # unicode_ranges_digit, 0x00BD
'¾-a', # unicode_ranges_digit, 0x00BE
'a 〇b', # unicode_ranges_digit, 0x3007
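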
'Ⅵ-a', # unicode_ranges_digit, {0x00002150, 0x0000218F} // Number Forms
'\uFEFF//', # unicode_ranges_control, 0xFEFF (BOM)
'<s>a' # TODO: Phi-3 fail
]
def generator_random_chars(iterations = 100) -> Iterator[str]:
"""Brute force random text with simple characters"""
WHITESPACES = list(" " * 20 + "\n" * 5 + "\r\n" * 5 + "\t" * 5)
CHARS = list(set("""
ABCDEFGHIJKLMNOPQRSTUVWXYZ
abcdefghijklmnopqrstuvwxyz
ÁÉÍÓÚÀÈÌÒÙÂÊÎÔÛÄËÏÖÜ
áéíóúàèìòùâêîôûäëïöü
.-,*/-+ª!"·$%&/()=?¿[]{}<>\\|@#~½¬~;:_
"""))
rand = random.Random()
for m in range(iterations):
rand.seed(m)
text = []
num_words = rand.randint(300, 400)
for i in range(num_words):
k = rand.randint(1, 7)
word = rand.choices(CHARS, k=k)
space = rand.choice(WHITESPACES)
text.append("".join(word) + space)
yield "".join(text)
def generator_random_vocab_chars(tokenizer: PreTrainedTokenizerBase, iterations = 100) -> Iterator[str]:
"""Brute force random text with vocab characters"""
vocab_ids = list(tokenizer.vocab.values())
vocab_text = tokenizer.decode(vocab_ids, skip_special_tokens=True)
vocab_chars = list(set(vocab_text))
del vocab_ids, vocab_text
rand = random.Random()
for m in range(iterations):
rand.seed(m)
text = rand.choices(vocab_chars, k=1024)
yield "".join(text)
def generator_random_vocab_tokens(tokenizer: PreTrainedTokenizerBase, iterations = 100) -> Iterator[str]:
"""Brute force random text from vocab tokens"""
space_id = tokenizer.encode(" ", add_special_tokens=False)[0]
vocab_ids = list(tokenizer.vocab.values())
vocab_ids = list(sorted(vocab_ids + vocab_ids))
for i in range(1, len(vocab_ids), 2):
vocab_ids[i] = space_id
vocab_tokens = tokenizer.decode(vocab_ids, skip_special_tokens=True)
vocab_tokens = vocab_tokens.split(" ")
del vocab_ids
yield from vocab_tokens
rand = random.Random()
for m in range(iterations):
rand.seed(m)
text = []
num_words = rand.randint(300, 400)
for i in range(num_words):
k = rand.randint(1, 3)
tokens = rand.choices(vocab_tokens, k=k)
tokens = [t.strip(" \n\r\t") for t in tokens]
sep = rand.choice(" \n\r\t")
text.append("".join(tokens) + sep)
yield "".join(text)
def generator_random_bytes(iterations = 100) -> Iterator[str]:
"""Brute force random bytes"""
WHITESPACES = list(" " * 20 + "\n" * 5 + "\r\n" * 5 + "\t" * 5)
rand = random.Random()
for m in range(iterations):
rand.seed(m)
text = []
num_words = rand.randint(300, 400)
for i in range(num_words):
k = rand.randint(1, 8)
word = [chr(r) for r in rand.randbytes(k) if r]
word.append(rand.choice(WHITESPACES))
text.append("".join(word))
yield "".join(text)
def test_compare_tokenizer(model: LibLlamaModel, tokenizer: PreTrainedTokenizerBase, generator: Iterator[str]):
def find_first_mismatch(ids1: list[int], ids2: list[int]):
for i, (a,b) in enumerate(zip(ids1, ids2)):
if a != b:
return i
if len(ids1) == len(ids2):
return -1
return min(len(ids1), len(ids2))
t0 = time.perf_counter()
logger.info("%s: %s" % (generator.__name__, "ini"))
for text in generator:
ids1 = model.tokenize(text, add_special=False, parse_special=False)
ids2 = tokenizer.encode(text, add_special_tokens=False)
if ids1 != ids2:
i = find_first_mismatch(ids1, ids2)
ids1 = list(ids1)[max(0, i - 2) : i + 2 + 1]
ids2 = list(ids2)[max(0, i - 2) : i + 2 + 1]
text2 = tokenizer.decode(ids2, skip_special_tokens=True)
assert (text2 in text)
logger.info(" Text: " + repr(text2))
logger.info(" TokenIDs: " + str(ids1))
logger.info(" Expected: " + str(ids2))
raise Exception()
t1 = time.perf_counter()
logger.info("%s: end, time: %.3f secs" % (generator.__name__, t1 - t0))
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("vocab_file", help="path to vocab 'gguf' file")
parser.add_argument("dir_tokenizer", help="directory containing 'tokenizer.model' file")
parser.add_argument("--verbose", action="store_true", help="increase output verbosity")
args = parser.parse_args()
logging.basicConfig(level=logging.DEBUG if args.verbose else logging.INFO)
model = LibLlamaModel(LibLlama(), args.vocab_file, mparams=dict(vocab_only=True), cparams=dict(n_ctx=2048))
tokenizer = AutoTokenizer.from_pretrained(args.dir_tokenizer)
test_compare_tokenizer(model, tokenizer, generator_custom_text())
test_compare_tokenizer(model, tokenizer, generator_custom_text_edge_cases())
test_compare_tokenizer(model, tokenizer, generator_random_chars(10_000))
test_compare_tokenizer(model, tokenizer, generator_random_vocab_chars(tokenizer, 10_000))
test_compare_tokenizer(model, tokenizer, generator_random_vocab_tokens(tokenizer, 10_000))
# test_compare_tokenizer(model, tokenizer, generator_random_bytes(10_000)) # FAIL
model.free()
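# Invocation sketch (paths are examples; the vocab GGUF and the HF tokenizer directory
# must describe the same tokenizer):
#   python3 <this-script>.py ./models/ggml-vocab-llama-bpe.gguf ./models/tokenizers/llama-bpe --verbose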

File diff suppressed because it is too large

@@ -7,6 +7,7 @@
extern const std::vector<std::pair<uint32_t, uint32_t>> unicode_ranges_number;
extern const std::vector<std::pair<uint32_t, uint32_t>> unicode_ranges_letter;
extern const std::vector<std::pair<uint32_t, uint32_t>> unicode_ranges_separator;
extern const std::vector<std::pair<uint32_t, uint32_t>> unicode_ranges_whitespace;
extern const std::vector<std::pair<uint32_t, uint32_t>> unicode_ranges_accent_mark;
extern const std::vector<std::pair<uint32_t, uint32_t>> unicode_ranges_punctuation;


@@ -9,6 +9,7 @@
#include <stdexcept>
#include <string>
#include <unordered_map>
#include <unordered_set>
#include <utility>
#include <vector>
#include <locale>
@@ -120,9 +121,9 @@ static std::unordered_map<uint32_t, int> unicode_cpt_type_map() {
            cpt_types[i] = CODEPOINT_TYPE_LETTER;
        }
    }
    for (auto p : unicode_ranges_separator) {
        for (auto i = p.first; i <= p.second; ++i) {
            cpt_types[i] = CODEPOINT_TYPE_SEPARATOR;
        }
    }
    for (auto p : unicode_ranges_accent_mark) {
@@ -224,138 +225,256 @@ static std::vector<size_t> unicode_regex_split_custom_gpt2(const std::string & t
    std::vector<size_t> bpe_offsets; // store the offset of each word
    bpe_offsets.reserve(offsets.size()); // Reserve memory for the approximate size

    const auto cpts = unicode_cpts_from_utf8(text);

    size_t start = 0;
for (auto offset : offsets) {
const size_t offset_ini = start;
const size_t offset_end = start + offset;
assert(offset_end <= cpts.size());
start = offset_end;
auto _get_cpt = [&] (const size_t pos) -> char32_t {
return (offset_ini <= pos && pos < offset_end) ? cpts[pos] : 0;
};
auto _get_cpt_type = [&] (const size_t pos) -> int {
return (offset_ini <= pos && pos < offset_end) ? unicode_cpt_type(cpts[pos]) : CODEPOINT_TYPE_UNIDENTIFIED;
};
size_t _prev_end = offset_ini;
auto _add_token = [&] (const size_t end) -> size_t {
assert(_prev_end <= end && end <= offset_end);
size_t len = end - _prev_end;
if (len > 0) {
bpe_offsets.push_back(len);
}
_prev_end = end;
//if (len > 0) {
// std::string s = "";
// for(size_t p = end-len; p < end; p++)
// s += unicode_cpt_to_utf8(cpts[p]);
// printf(">>> '%s'\n", s.c_str());
//}
return len;
};
for (size_t pos = offset_ini; pos < offset_end; /*pos++*/ ) {
const char32_t cpt = _get_cpt(pos);
const int cpt_type = _get_cpt_type(pos);
// regex: 's|'t|'re|'ve|'m|'ll|'d
if (cpt == '\'' && pos+1 < offset_end) {
char32_t cpt_next = _get_cpt(pos+1);
if (cpt_next == 's' || cpt_next == 't' || cpt_next == 'm' || cpt_next == 'd') {
pos += _add_token(pos+2);
continue;
}
if (pos+2 < offset_end) {
char32_t cpt_next_next = _get_cpt(pos+2);
if ((cpt_next == 'r' && cpt_next_next == 'e') ||
(cpt_next == 'v' && cpt_next_next == 'e') ||
(cpt_next == 'l' && cpt_next_next == 'l')) {
pos += _add_token(pos+3);
continue;
}
}
}
char32_t cpt2 = (cpt == ' ' ? _get_cpt(pos+1) : cpt);
int cpt2_type = (cpt == ' ' ? _get_cpt_type(pos+1) : cpt_type);
// regex: <space>?\p{L}+
if (cpt2_type == CODEPOINT_TYPE_LETTER) {
pos += (cpt == ' ');
while (cpt2_type == CODEPOINT_TYPE_LETTER) {
cpt2_type = _get_cpt_type(++pos);
}
_add_token(pos);
continue;
}
// regex: <space>?\p{N}+
if (cpt2_type == CODEPOINT_TYPE_NUMBER) {
pos += (cpt == ' ');
while (cpt2_type == CODEPOINT_TYPE_NUMBER) {
cpt2_type = _get_cpt_type(++pos);
}
_add_token(pos);
continue;
}
// regex: <space>?[^\s\p{L}\p{N}]+
if (!unicode_cpt_is_whitespace(cpt2) && cpt2_type != CODEPOINT_TYPE_LETTER && cpt2_type != CODEPOINT_TYPE_NUMBER && cpt2_type != CODEPOINT_TYPE_UNIDENTIFIED) {
pos += (cpt == ' ');
while (!unicode_cpt_is_whitespace(cpt2) && cpt2_type != CODEPOINT_TYPE_LETTER && cpt2_type != CODEPOINT_TYPE_NUMBER && cpt2_type != CODEPOINT_TYPE_UNIDENTIFIED) {
cpt2_type = _get_cpt_type(++pos);
cpt2 = _get_cpt(pos);
}
_add_token(pos);
continue;
}
size_t num_whitespaces = 0;
while (unicode_cpt_is_whitespace(_get_cpt(pos+num_whitespaces))) {
num_whitespaces++;
}
// regex: \s+(?!\S)
if (num_whitespaces > 1 && _get_cpt(pos+num_whitespaces) != 0) {
pos += num_whitespaces - 1;
_add_token(pos);
continue;
}
// regex: \s+
if (num_whitespaces > 0) {
pos += num_whitespaces;
_add_token(pos);
continue;
}
// no matches
_add_token(++pos);
}
}
return bpe_offsets;
}
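// Illustrative example (added for clarity, not part of the file): with the GPT-2 regex above,
// the text "Hello world 123" is expected to split into the chunks "Hello", " world", " 123",
// i.e. bpe_offsets would receive the codepoint lengths {5, 6, 4}.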
// LLAMA3 system regex: "(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+"
static std::vector<size_t> unicode_regex_split_custom_llama3(const std::string & text, const std::vector<size_t> & offsets) {
std::vector<size_t> bpe_offsets; // store the offset of each word
bpe_offsets.reserve(offsets.size()); // Reserve memory for the approximate size

    const auto cpts = unicode_cpts_from_utf8(text);

    size_t start = 0;
    for (auto offset : offsets) {
        const size_t offset_ini = start;
        const size_t offset_end = start + offset;
        assert(offset_end <= cpts.size());
        start = offset_end;

        auto _get_cpt = [&] (const size_t pos) -> char32_t {
            return (offset_ini <= pos && pos < offset_end) ? cpts[pos] : 0;
        };

        auto _get_cpt_type = [&] (const size_t pos) -> int {
            return (offset_ini <= pos && pos < offset_end) ? unicode_cpt_type(cpts[pos]) : CODEPOINT_TYPE_UNIDENTIFIED;
        };

        size_t _prev_end = offset_ini;
        auto _add_token = [&] (const size_t end) -> size_t {
            assert(_prev_end <= end && end <= offset_end);
            size_t len = end - _prev_end;
            if (len > 0) {
                bpe_offsets.push_back(len);
            }
            _prev_end = end;
            //if (len > 0) {
            //    std::string s = "";
            //    for(size_t p = end-len; p < end; p++)
            //        s += unicode_cpt_to_utf8(cpts[p]);
            //    printf(">>> '%s'\n", s.c_str());
            //}
            return len;
        };

        for (size_t pos = offset_ini; pos < offset_end; /*pos++*/ ) {
            const char32_t cpt = _get_cpt(pos);
            const int cpt_type = _get_cpt_type(pos);

            // regex: (?i:'s|'t|'re|'ve|'m|'ll|'d) // case insensitive
            if (cpt == '\'' && pos+1 < offset_end) {
                char32_t cpt_next = unicode_tolower(_get_cpt(pos+1));
                if (cpt_next == 's' || cpt_next == 't' || cpt_next == 'm' || cpt_next == 'd') {
                    pos += _add_token(pos+2);
                    continue;
                }
                if (pos+2 < offset_end) {
                    char32_t cpt_next_next = unicode_tolower(_get_cpt(pos+2));
                    if ((cpt_next == 'r' && cpt_next_next == 'e') ||
                        (cpt_next == 'v' && cpt_next_next == 'e') ||
                        (cpt_next == 'l' && cpt_next_next == 'l')) {
                        pos += _add_token(pos+3);
                        continue;
                    }
                }
            }

            // regex: [^\r\n\p{L}\p{N}]?\p{L}+  //####FIXME: the first \p{L} is correct?
            if (cpt != '\r' && cpt != '\n' && /*cpt_type != CODEPOINT_TYPE_LETTER &&*/ cpt_type != CODEPOINT_TYPE_NUMBER) {
                if (cpt_type == CODEPOINT_TYPE_LETTER || _get_cpt_type(pos+1) == CODEPOINT_TYPE_LETTER) {  // one or more letters
                    pos++;
                    while (_get_cpt_type(pos) == CODEPOINT_TYPE_LETTER) {
                        pos++;
                    }
                    _add_token(pos);
                    continue;
                }
            }

            // regex: \p{N}{1,3}
            if (cpt_type == CODEPOINT_TYPE_NUMBER) {
                size_t ini = pos;
                while (_get_cpt_type(pos) == CODEPOINT_TYPE_NUMBER) {
                    if (++pos - ini >= 3 ) {
                        _add_token(pos);
                        ini = pos;
                    }
                }
                _add_token(pos);
                continue;
            }

            // regex: <space>?[^\s\p{L}\p{N}]+[\r\n]*
            char32_t cpt2 = (cpt == ' ' ? _get_cpt(pos+1) : cpt);
            int cpt2_type = (cpt == ' ' ? _get_cpt_type(pos+1) : cpt_type);
            if (!unicode_cpt_is_whitespace(cpt2) && cpt2_type != CODEPOINT_TYPE_LETTER && cpt2_type != CODEPOINT_TYPE_NUMBER && cpt2_type != CODEPOINT_TYPE_UNIDENTIFIED) {
                pos += (cpt == ' ');
                while (!unicode_cpt_is_whitespace(cpt2) && cpt2_type != CODEPOINT_TYPE_LETTER && cpt2_type != CODEPOINT_TYPE_NUMBER && cpt2_type != CODEPOINT_TYPE_UNIDENTIFIED) {
                    cpt2_type = _get_cpt_type(++pos);
                    cpt2 = _get_cpt(pos);
                }
                while (cpt2 == '\r' || cpt2 == '\n') {
                    cpt2 = _get_cpt(++pos);
                }
                _add_token(pos);
                continue;
            }

            size_t num_whitespaces = 0;
            size_t last_end_r_or_n = 0;
            while (unicode_cpt_is_whitespace(_get_cpt(pos+num_whitespaces))) {
                char32_t cpt2 = _get_cpt(pos+num_whitespaces);
                if (cpt2 == '\r' || cpt2 == '\n') {
                    last_end_r_or_n = pos + num_whitespaces + 1;
                }
                num_whitespaces++;
            }

            // regex: \s*[\r\n]+
            if (last_end_r_or_n > 0) {
                pos = last_end_r_or_n;
                _add_token(pos);
                continue;
            }

            // regex: \s+(?!\S)
            if (num_whitespaces > 1 && _get_cpt(pos+num_whitespaces) != 0) {
                pos += num_whitespaces - 1;
                _add_token(pos);
                continue;
            }

            // regex: \s+
            if (num_whitespaces > 0) {
                pos += num_whitespaces;
                _add_token(pos);
                continue;
            }

            // no matches
            _add_token(++pos);
        }
    }

    return bpe_offsets;
}
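// Illustrative examples (added for clarity, not part of the file): with the LLAMA3 regex above,
// a digit run such as "12345" is expected to split into "123" and "45" (\p{N}{1,3}), and an
// upper-case contraction such as "WE'LL" keeps "'LL" as one chunk because the contraction
// match goes through unicode_tolower and is therefore case-insensitive.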
@@ -424,14 +543,14 @@ static std::vector<size_t> unicode_regex_split_stl(const std::string & text, con
static std::vector<size_t> unicode_regex_split_custom(const std::string & text, const std::string & regex_expr, const std::vector<size_t> & offsets) {
    std::vector<size_t> bpe_offsets;

    if (regex_expr == "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)") {
        bpe_offsets = unicode_regex_split_custom_gpt2(text, offsets);
    } else if (
            regex_expr == "(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+" ||
            regex_expr == "(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+") {
        bpe_offsets = unicode_regex_split_custom_llama3(text, offsets);
    }

    return bpe_offsets;
}
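// Note (added for clarity): the dispatch above is an exact string match on the regex text, so the
// custom splitters are used only for these two known pretokenizer regexes; for any other regex
// bpe_offsets is returned empty and the caller handles the split by other means.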
@@ -506,6 +625,19 @@ int unicode_cpt_type(const std::string & utf8) {
    return unicode_cpt_type(unicode_cpt_from_utf8(utf8, offset));
}
bool unicode_cpt_is_whitespace(uint32_t cp) {
static const std::unordered_set<uint32_t> is_whitespace = [] {
std::unordered_set<uint32_t> is_whitespace;
for (auto p : unicode_ranges_whitespace) {
for (auto i = p.first; i <= p.second; ++i) {
is_whitespace.insert(i);
}
}
return is_whitespace;
}();
return (bool)is_whitespace.count(cp);
}
std::string unicode_byte_to_utf8(uint8_t byte) {
    static std::unordered_map<uint8_t, std::string> map = unicode_byte_to_utf8_map();
    return map.at(byte);
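// Usage sketch for the new helper above (expected behaviour, assuming the standard whitespace
// ranges): unicode_cpt_is_whitespace(0x20) and unicode_cpt_is_whitespace(0x0A) return true,
// while unicode_cpt_is_whitespace('a') returns false.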


@@ -7,7 +7,7 @@
#define CODEPOINT_TYPE_UNIDENTIFIED 0
#define CODEPOINT_TYPE_NUMBER       1
#define CODEPOINT_TYPE_LETTER       2
#define CODEPOINT_TYPE_SEPARATOR    3
#define CODEPOINT_TYPE_ACCENT_MARK  4
#define CODEPOINT_TYPE_PUNCTUATION  5
#define CODEPOINT_TYPE_SYMBOL       6
@@ -21,6 +21,8 @@ std::vector<uint32_t> unicode_cpts_normalize_nfd(const std::vector<uint32_t> & c
int unicode_cpt_type(uint32_t cp);
int unicode_cpt_type(const std::string & utf8);

bool unicode_cpt_is_whitespace(uint32_t cp);

std::string unicode_byte_to_utf8(uint8_t byte);
uint8_t unicode_utf8_to_byte(const std::string & utf8);