llm : add Falcon support (#2717)
* llama : refactor GGUF constants into static maps * llama : check if model architecture is known * llama : refactor llama_model_load_internal() * gguf : add KV constant maps * llm : read arch-specific KVs * convert : add dummy scores + types * falcon : load tensor data (CPU only) * llama : fix loading progress bar * llama : add arch member to llama_model * falcon : CPU inference working * falcon : support non-40B models * falcon : minor * llama : minor updates ggml-ci * convert-falcon-hf-to-gguf.py : fix special token mapping * llama.cpp : llama default UNK token = id 0 * llama.cpp : fix bpe tokenizer * llama.cpp : fix the fix of bpe tokenizer * ggml : pass eps to ggml_norm * metal : implement RoPE (mode = 2) + avoid ggml_repeat * ggml : ggml_repeat always creates new tensor * falcon : copy-paste self-attention from LLaMA * metal : print extra compute pipeline info * falcon : minor changes (still chasing the Metal problem) * llama.cpp : fix linefeed token * metal : fix GELU kernel numerical stability by using precise::tanh * metal : temporary workaround for the concurrency optimization bug * falcon : add CUDA offloading (#2739) * llama : better model naming and size reporting * llama : prep new tokenizer support * llama : advanced BPE tokenizer based on ggllm.cpp implementation * llama : remove obsolete comment ggml-ci * common : remove obsolete BPE API + disable test-tokenizer-1 * llama : revert BPE special-case in llama_byte_to_token() * cuda : add TODOs for RoPE NeoX implementation * llama : default special tokens based on vocab type * perplexity : add log for start of tokenization --------- Co-authored-by: klosax <131523366+klosax@users.noreply.github.com> Co-authored-by: slaren <slarengh@gmail.com>
This commit is contained in:
parent
a192860cfe
commit
cf658adc83
18 changed files with 1596 additions and 668 deletions
30
ggml.c
30
ggml.c
|
@ -3554,9 +3554,9 @@ inline static void ggml_vec_tanh_f32 (const int n, float * y, const float * x) {
|
|||
inline static void ggml_vec_elu_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? x[i] : expf(x[i])-1; }
|
||||
// Element-wise ReLU over a float vector:
//   y[k] = x[k]  when x[k] > 0
//   y[k] = 0     otherwise
// Reads n elements from x and writes n elements to y; does nothing when n <= 0.
inline static void ggml_vec_relu_f32 (const int n, float * y, const float * x) {
    for (int k = 0; k < n; ++k) {
        // Load the input before storing, so the result matches the original
        // even if the caller passes the same buffer for x and y.
        const float v = x[k];
        if (v > 0.f) {
            y[k] = v;
        } else {
            y[k] = 0.f;
        }
    }
}
|
||||
|
||||
static const float GELU_COEF_A = 0.044715f;
|
||||
static const float GELU_QUICK_COEF = -1.702f;
|
||||
static const float SQRT_2_OVER_PI = 0.79788456080286535587989211986876f;
|
||||
static const float GELU_COEF_A = 0.044715f;
|
||||
static const float GELU_QUICK_COEF = -1.702f;
|
||||
static const float SQRT_2_OVER_PI = 0.79788456080286535587989211986876f;
|
||||
|
||||
inline static float ggml_gelu_f32(float x) {
|
||||
return 0.5f*x*(1.0f + tanhf(SQRT_2_OVER_PI*x*(1.0f + GELU_COEF_A*x*x)));
|
||||
|
@ -5555,10 +5555,6 @@ struct ggml_tensor * ggml_repeat(
|
|||
is_node = true;
|
||||
}
|
||||
|
||||
if (ggml_are_same_shape(a, b) && !is_node) {
|
||||
return a;
|
||||
}
|
||||
|
||||
struct ggml_tensor * result = ggml_new_tensor(ctx, a->type, b->n_dims, b->ne);
|
||||
|
||||
result->op = GGML_OP_REPEAT;
|
||||
|
@ -5789,6 +5785,7 @@ struct ggml_tensor * ggml_silu_back(
|
|||
static struct ggml_tensor * ggml_norm_impl(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * a,
|
||||
float eps,
|
||||
bool inplace) {
|
||||
bool is_node = false;
|
||||
|
||||
|
@ -5799,7 +5796,7 @@ static struct ggml_tensor * ggml_norm_impl(
|
|||
|
||||
struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
|
||||
|
||||
// TODO: maybe store epsilon here?
|
||||
ggml_set_op_params(result, &eps, sizeof(eps));
|
||||
|
||||
result->op = GGML_OP_NORM;
|
||||
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
|
||||
|
@ -5810,14 +5807,16 @@ static struct ggml_tensor * ggml_norm_impl(
|
|||
|
||||
struct ggml_tensor * ggml_norm(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * a) {
|
||||
return ggml_norm_impl(ctx, a, false);
|
||||
struct ggml_tensor * a,
|
||||
float eps) {
|
||||
return ggml_norm_impl(ctx, a, eps, false);
|
||||
}
|
||||
|
||||
struct ggml_tensor * ggml_norm_inplace(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * a) {
|
||||
return ggml_norm_impl(ctx, a, true);
|
||||
struct ggml_tensor * a,
|
||||
float eps) {
|
||||
return ggml_norm_impl(ctx, a, eps, true);
|
||||
}
|
||||
|
||||
// ggml_rms_norm
|
||||
|
@ -10619,7 +10618,8 @@ static void ggml_compute_forward_norm_f32(
|
|||
|
||||
GGML_TENSOR_UNARY_OP_LOCALS;
|
||||
|
||||
const float eps = 1e-5f; // TODO: make this a parameter
|
||||
float eps;
|
||||
memcpy(&eps, dst->op_params, sizeof(float));
|
||||
|
||||
// TODO: optimize
|
||||
for (int64_t i03 = 0; i03 < ne03; i03++) {
|
||||
|
@ -12537,7 +12537,7 @@ static void ggml_compute_forward_rope_f32(
|
|||
dst_data[1] = x0*sin_theta*zeta + x1*cos_theta*zeta;
|
||||
}
|
||||
} else {
|
||||
// TODO: this is probably wrong, but I can't figure it out ..
|
||||
// TODO: this might be wrong for ne0 != n_dims - need double check
|
||||
// ref: https://github.com/huggingface/transformers/blob/main/src/transformers/models/gpt_neox/modeling_gpt_neox.py#LL251C1-L294C28
|
||||
for (int64_t ib = 0; ib < ne0/n_dims; ++ib) {
|
||||
for (int64_t ic = 0; ic < n_dims; ic += 2) {
|
||||
|
@ -12666,7 +12666,7 @@ static void ggml_compute_forward_rope_f16(
|
|||
dst_data[1] = GGML_FP32_TO_FP16(x0*sin_theta + x1*cos_theta);
|
||||
}
|
||||
} else {
|
||||
// TODO: this is probably wrong, but I can't figure it out ..
|
||||
// TODO: this might be wrong for ne0 != n_dims - need double check
|
||||
// ref: https://github.com/huggingface/transformers/blob/main/src/transformers/models/gpt_neox/modeling_gpt_neox.py#LL251C1-L294C28
|
||||
for (int64_t ib = 0; ib < ne0/n_dims; ++ib) {
|
||||
for (int64_t ic = 0; ic < n_dims; ic += 2) {
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue