Code refactor and optimize using reserve

German Semenov 2023-06-17 02:33:57 +03:00
parent 5ea4339273
commit 5f2c9ce21e
3 changed files with 106 additions and 93 deletions
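
The recurring optimization in this commit is calling std::vector::reserve() before a loop of push_back/emplace_back, so the container allocates its storage once up front instead of reallocating and moving its elements as it grows. A minimal sketch of the pattern, with illustrative names not taken from the diff:

#include <cstddef>
#include <string>
#include <vector>

std::vector<std::string> make_names(std::size_t n) {
    std::vector<std::string> names;
    names.reserve(n);  // single allocation up front
    for (std::size_t i = 0; i < n; ++i) {
        // without the reserve() above, push_back may reallocate and move
        // the existing strings several times while the vector grows
        names.push_back("name_" + std::to_string(i));
    }
    return names;
}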

ggml.c (138 changes)

@ -3802,7 +3802,7 @@ static inline int ggml_up(int n, int m) {
////////////////////////////////////////////////////////////////////////////////
struct ggml_context * ggml_init(struct ggml_init_params params) {
struct ggml_context * ggml_init(const struct ggml_init_params params) {
// make this function thread safe
ggml_critical_section_start();
@ -3936,7 +3936,7 @@ size_t ggml_used_mem(const struct ggml_context * ctx) {
return ctx->objects_end == NULL ? 0 : ctx->objects_end->offs + ctx->objects_end->size;
}
size_t ggml_set_scratch(struct ggml_context * ctx, struct ggml_scratch scratch) {
size_t ggml_set_scratch(struct ggml_context * ctx, const struct ggml_scratch scratch) {
const size_t result = ctx->scratch.data ? ctx->scratch.offs : 0;
ctx->scratch = scratch;
@ -6458,7 +6458,7 @@ void ggml_set_param(
static void ggml_compute_forward_dup_same_cont(
const struct ggml_compute_params * params,
const struct ggml_tensor * src0,
struct ggml_tensor * dst) {
const struct ggml_tensor * dst) {
GGML_ASSERT(ggml_nelements(dst) == ggml_nelements(src0));
GGML_ASSERT(ggml_is_contiguous(dst) && ggml_is_contiguous(src0));
GGML_ASSERT(src0->type == dst->type);
@ -7064,7 +7064,7 @@ static void ggml_compute_forward_dup_f32(
static void ggml_compute_forward_dup(
const struct ggml_compute_params * params,
const struct ggml_tensor * src0,
struct ggml_tensor * dst) {
const struct ggml_tensor * dst) {
if (ggml_is_contiguous(src0) && ggml_is_contiguous(dst) && src0->type == dst->type) {
ggml_compute_forward_dup_same_cont(params, src0, dst);
return;
@ -7710,7 +7710,7 @@ static void ggml_compute_forward_add1(
const struct ggml_compute_params * params,
const struct ggml_tensor * src0,
const struct ggml_tensor * src1,
struct ggml_tensor * dst) {
const struct ggml_tensor * dst) {
switch (src0->type) {
case GGML_TYPE_F32:
{
@ -7842,7 +7842,7 @@ static void ggml_compute_forward_acc(
const struct ggml_tensor * src0,
const struct ggml_tensor * src1,
const struct ggml_tensor * opt0,
struct ggml_tensor * dst) {
const struct ggml_tensor * dst) {
switch (src0->type) {
case GGML_TYPE_F32:
@ -7946,7 +7946,7 @@ static void ggml_compute_forward_sub(
const struct ggml_compute_params * params,
const struct ggml_tensor * src0,
const struct ggml_tensor * src1,
struct ggml_tensor * dst) {
const struct ggml_tensor * dst) {
switch (src0->type) {
case GGML_TYPE_F32:
{
@ -8045,7 +8045,7 @@ static void ggml_compute_forward_mul(
const struct ggml_compute_params * params,
const struct ggml_tensor * src0,
const struct ggml_tensor * src1,
struct ggml_tensor * dst) {
const struct ggml_tensor * dst) {
switch (src0->type) {
case GGML_TYPE_F32:
{
@ -8141,7 +8141,7 @@ static void ggml_compute_forward_div(
const struct ggml_compute_params * params,
const struct ggml_tensor * src0,
const struct ggml_tensor * src1,
struct ggml_tensor * dst) {
const struct ggml_tensor * dst) {
switch (src0->type) {
case GGML_TYPE_F32:
{
@ -8183,7 +8183,7 @@ static void ggml_compute_forward_sqr_f32(
static void ggml_compute_forward_sqr(
const struct ggml_compute_params * params,
const struct ggml_tensor * src0,
struct ggml_tensor * dst) {
const struct ggml_tensor * dst) {
switch (src0->type) {
case GGML_TYPE_F32:
{
@ -8225,7 +8225,7 @@ static void ggml_compute_forward_sqrt_f32(
static void ggml_compute_forward_sqrt(
const struct ggml_compute_params * params,
const struct ggml_tensor * src0,
struct ggml_tensor * dst) {
const struct ggml_tensor * dst) {
switch (src0->type) {
case GGML_TYPE_F32:
{
@ -8268,7 +8268,7 @@ static void ggml_compute_forward_log_f32(
static void ggml_compute_forward_log(
const struct ggml_compute_params * params,
const struct ggml_tensor * src0,
struct ggml_tensor * dst) {
const struct ggml_tensor * dst) {
switch (src0->type) {
case GGML_TYPE_F32:
{
@ -8286,7 +8286,7 @@ static void ggml_compute_forward_log(
static void ggml_compute_forward_sum_f32(
const struct ggml_compute_params * params,
const struct ggml_tensor * src0,
struct ggml_tensor * dst) {
const struct ggml_tensor * dst) {
assert(params->ith == 0);
assert(ggml_is_scalar(dst));
@ -8325,7 +8325,7 @@ static void ggml_compute_forward_sum_f32(
static void ggml_compute_forward_sum(
const struct ggml_compute_params * params,
const struct ggml_tensor * src0,
struct ggml_tensor * dst) {
const struct ggml_tensor * dst) {
switch (src0->type) {
case GGML_TYPE_F32:
{
@ -8343,7 +8343,7 @@ static void ggml_compute_forward_sum(
static void ggml_compute_forward_sum_rows_f32(
const struct ggml_compute_params * params,
const struct ggml_tensor * src0,
struct ggml_tensor * dst) {
const struct ggml_tensor * dst) {
GGML_ASSERT(params->ith == 0);
if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
@ -8392,7 +8392,7 @@ static void ggml_compute_forward_sum_rows_f32(
static void ggml_compute_forward_sum_rows(
const struct ggml_compute_params * params,
const struct ggml_tensor * src0,
struct ggml_tensor * dst) {
const struct ggml_tensor * dst) {
switch (src0->type) {
case GGML_TYPE_F32:
{
@ -8463,7 +8463,7 @@ static void ggml_compute_forward_mean_f32(
static void ggml_compute_forward_mean(
const struct ggml_compute_params * params,
const struct ggml_tensor * src0,
struct ggml_tensor * dst) {
const struct ggml_tensor * dst) {
switch (src0->type) {
case GGML_TYPE_F32:
{
@ -8481,7 +8481,7 @@ static void ggml_compute_forward_mean(
static void ggml_compute_forward_repeat_f32(
const struct ggml_compute_params * params,
const struct ggml_tensor * src0,
struct ggml_tensor * dst) {
const struct ggml_tensor * dst) {
GGML_ASSERT(params->ith == 0);
GGML_ASSERT(ggml_can_repeat(src0, dst));
@ -8542,7 +8542,7 @@ static void ggml_compute_forward_repeat_f32(
static void ggml_compute_forward_repeat(
const struct ggml_compute_params * params,
const struct ggml_tensor * src0,
struct ggml_tensor * dst) {
const struct ggml_tensor * dst) {
switch (src0->type) {
case GGML_TYPE_F32:
{
@ -8560,7 +8560,7 @@ static void ggml_compute_forward_repeat(
static void ggml_compute_forward_abs_f32(
const struct ggml_compute_params * params,
const struct ggml_tensor * src0,
struct ggml_tensor * dst) {
const struct ggml_tensor * dst) {
assert(params->ith == 0);
assert(ggml_are_same_shape(src0, dst));
@ -8584,7 +8584,7 @@ static void ggml_compute_forward_abs_f32(
static void ggml_compute_forward_abs(
const struct ggml_compute_params * params,
const struct ggml_tensor * src0,
struct ggml_tensor * dst) {
const struct ggml_tensor * dst) {
switch (src0->type) {
case GGML_TYPE_F32:
{
@ -8626,7 +8626,7 @@ static void ggml_compute_forward_sgn_f32(
static void ggml_compute_forward_sgn(
const struct ggml_compute_params * params,
const struct ggml_tensor * src0,
struct ggml_tensor * dst) {
const struct ggml_tensor * dst) {
switch (src0->type) {
case GGML_TYPE_F32:
{
@ -8668,7 +8668,7 @@ static void ggml_compute_forward_neg_f32(
static void ggml_compute_forward_neg(
const struct ggml_compute_params * params,
const struct ggml_tensor * src0,
struct ggml_tensor * dst) {
const struct ggml_tensor * dst) {
switch (src0->type) {
case GGML_TYPE_F32:
{
@ -8710,7 +8710,7 @@ static void ggml_compute_forward_step_f32(
static void ggml_compute_forward_step(
const struct ggml_compute_params * params,
const struct ggml_tensor * src0,
struct ggml_tensor * dst) {
const struct ggml_tensor * dst) {
switch (src0->type) {
case GGML_TYPE_F32:
{
@ -8752,7 +8752,7 @@ static void ggml_compute_forward_relu_f32(
static void ggml_compute_forward_relu(
const struct ggml_compute_params * params,
const struct ggml_tensor * src0,
struct ggml_tensor * dst) {
const struct ggml_tensor * dst) {
switch (src0->type) {
case GGML_TYPE_F32:
{
@ -8811,7 +8811,7 @@ static void ggml_compute_forward_gelu_f32(
static void ggml_compute_forward_gelu(
const struct ggml_compute_params * params,
const struct ggml_tensor * src0,
struct ggml_tensor * dst) {
const struct ggml_tensor * dst) {
switch (src0->type) {
case GGML_TYPE_F32:
{
@ -8872,7 +8872,7 @@ static void ggml_compute_forward_silu_f32(
static void ggml_compute_forward_silu(
const struct ggml_compute_params * params,
const struct ggml_tensor * src0,
struct ggml_tensor * dst) {
const struct ggml_tensor * dst) {
switch (src0->type) {
case GGML_TYPE_F32:
{
@ -8937,7 +8937,7 @@ static void ggml_compute_forward_silu_back(
const struct ggml_compute_params * params,
const struct ggml_tensor * src0,
const struct ggml_tensor * grad,
struct ggml_tensor * dst) {
const struct ggml_tensor * dst) {
switch (src0->type) {
case GGML_TYPE_F32:
{
@ -9016,7 +9016,7 @@ static void ggml_compute_forward_norm_f32(
static void ggml_compute_forward_norm(
const struct ggml_compute_params * params,
const struct ggml_tensor * src0,
struct ggml_tensor * dst) {
const struct ggml_tensor * dst) {
switch (src0->type) {
case GGML_TYPE_F32:
{
@ -9090,7 +9090,7 @@ static void ggml_compute_forward_rms_norm_f32(
static void ggml_compute_forward_rms_norm(
const struct ggml_compute_params * params,
const struct ggml_tensor * src0,
struct ggml_tensor * dst) {
const struct ggml_tensor * dst) {
switch (src0->type) {
case GGML_TYPE_F32:
{
@ -9279,7 +9279,7 @@ static void ggml_compute_forward_rms_norm_back(
const struct ggml_compute_params * params,
const struct ggml_tensor * src0,
const struct ggml_tensor * src1,
struct ggml_tensor * dst) {
const struct ggml_tensor * dst) {
switch (src0->type) {
case GGML_TYPE_F32:
{
@ -9937,7 +9937,7 @@ static void ggml_compute_forward_mul_mat(
const struct ggml_compute_params * params,
const struct ggml_tensor * src0,
const struct ggml_tensor * src1,
struct ggml_tensor * dst) {
const struct ggml_tensor * dst) {
switch (src0->type) {
case GGML_TYPE_Q4_0:
case GGML_TYPE_Q4_1:
@ -10013,7 +10013,7 @@ static void ggml_compute_forward_scale(
const struct ggml_compute_params * params,
const struct ggml_tensor * src0,
const struct ggml_tensor * src1,
struct ggml_tensor * dst) {
const struct ggml_tensor * dst) {
switch (src0->type) {
case GGML_TYPE_F32:
{
@ -10114,7 +10114,7 @@ static void ggml_compute_forward_set(
const struct ggml_tensor * src0,
const struct ggml_tensor * src1,
const struct ggml_tensor * opt0,
struct ggml_tensor * dst) {
const struct ggml_tensor * dst) {
switch (src0->type) {
case GGML_TYPE_F32:
@ -10140,7 +10140,7 @@ static void ggml_compute_forward_set(
static void ggml_compute_forward_cpy(
const struct ggml_compute_params * params,
const struct ggml_tensor * src0,
struct ggml_tensor * dst) {
const struct ggml_tensor * dst) {
ggml_compute_forward_dup(params, src0, dst);
}
@ -10149,7 +10149,7 @@ static void ggml_compute_forward_cpy(
static void ggml_compute_forward_cont(
const struct ggml_compute_params * params,
const struct ggml_tensor * src0,
struct ggml_tensor * dst) {
const struct ggml_tensor * dst) {
ggml_compute_forward_dup(params, src0, dst);
}
@ -10158,7 +10158,7 @@ static void ggml_compute_forward_cont(
static void ggml_compute_forward_reshape(
const struct ggml_compute_params * params,
const struct ggml_tensor * src0,
struct ggml_tensor * dst) {
const struct ggml_tensor * dst) {
// NOP
UNUSED(params);
UNUSED(src0);
@ -10285,7 +10285,7 @@ static void ggml_compute_forward_get_rows(
const struct ggml_compute_params * params,
const struct ggml_tensor * src0,
const struct ggml_tensor * src1,
struct ggml_tensor * dst) {
const struct ggml_tensor * dst) {
switch (src0->type) {
case GGML_TYPE_Q4_0:
case GGML_TYPE_Q4_1:
@ -10403,7 +10403,7 @@ static void ggml_compute_forward_get_rows_back(
const struct ggml_tensor * src0,
const struct ggml_tensor * src1,
const struct ggml_tensor * opt0,
struct ggml_tensor * dst) {
const struct ggml_tensor * dst) {
switch (src0->type) {
case GGML_TYPE_F16:
{
@ -10498,7 +10498,7 @@ static void ggml_compute_forward_diag_f32(
static void ggml_compute_forward_diag(
const struct ggml_compute_params * params,
const struct ggml_tensor * src0,
struct ggml_tensor * dst) {
const struct ggml_tensor * dst) {
switch (src0->type) {
case GGML_TYPE_F32:
{
@ -10517,7 +10517,7 @@ static void ggml_compute_forward_diag_mask_f32(
const struct ggml_compute_params * params,
const struct ggml_tensor * src0,
const struct ggml_tensor * src1,
struct ggml_tensor * dst,
const struct ggml_tensor * dst,
const float value) {
assert(src1->type == GGML_TYPE_I32);
assert(ggml_nelements(src1) == 2);
@ -10569,7 +10569,7 @@ static void ggml_compute_forward_diag_mask_inf(
const struct ggml_compute_params * params,
const struct ggml_tensor * src0,
const struct ggml_tensor * src1,
struct ggml_tensor * dst) {
const struct ggml_tensor * dst) {
switch (src0->type) {
case GGML_TYPE_F32:
{
@ -10586,7 +10586,7 @@ static void ggml_compute_forward_diag_mask_zero(
const struct ggml_compute_params * params,
const struct ggml_tensor * src0,
const struct ggml_tensor * src1,
struct ggml_tensor * dst) {
const struct ggml_tensor * dst) {
switch (src0->type) {
case GGML_TYPE_F32:
{
@ -10675,7 +10675,7 @@ static void ggml_compute_forward_soft_max_f32(
static void ggml_compute_forward_soft_max(
const struct ggml_compute_params * params,
const struct ggml_tensor * src0,
struct ggml_tensor * dst) {
const struct ggml_tensor * dst) {
switch (src0->type) {
case GGML_TYPE_F32:
{
@ -10694,7 +10694,7 @@ static void ggml_compute_forward_alibi_f32(
const struct ggml_compute_params * params,
const struct ggml_tensor * src0,
const struct ggml_tensor * src1,
struct ggml_tensor * dst) {
const struct ggml_tensor * dst) {
assert(params->ith == 0);
assert(src1->type == GGML_TYPE_I32);
assert(ggml_nelements(src1) == 2);
@ -10757,7 +10757,7 @@ static void ggml_compute_forward_alibi_f16(
const struct ggml_compute_params * params,
const struct ggml_tensor * src0,
const struct ggml_tensor * src1,
struct ggml_tensor * dst) {
const struct ggml_tensor * dst) {
assert(params->ith == 0);
assert(src1->type == GGML_TYPE_I32);
assert(ggml_nelements(src1) == 2);
@ -10820,7 +10820,7 @@ static void ggml_compute_forward_alibi(
const struct ggml_compute_params * params,
const struct ggml_tensor * src0,
const struct ggml_tensor * src1,
struct ggml_tensor * dst) {
const struct ggml_tensor * dst) {
switch (src0->type) {
case GGML_TYPE_F16:
{
@ -10852,7 +10852,7 @@ static void ggml_compute_forward_rope_f32(
const struct ggml_compute_params * params,
const struct ggml_tensor * src0,
const struct ggml_tensor * src1,
struct ggml_tensor * dst) {
const struct ggml_tensor * dst) {
GGML_ASSERT(src1->type == GGML_TYPE_I32);
GGML_ASSERT(ggml_nelements(src1) == 3);
@ -10965,7 +10965,7 @@ static void ggml_compute_forward_rope_f16(
const struct ggml_compute_params * params,
const struct ggml_tensor * src0,
const struct ggml_tensor * src1,
struct ggml_tensor * dst) {
const struct ggml_tensor * dst) {
GGML_ASSERT(src1->type == GGML_TYPE_I32);
GGML_ASSERT(ggml_nelements(src1) == 3);
@ -11078,7 +11078,7 @@ static void ggml_compute_forward_rope(
const struct ggml_compute_params * params,
const struct ggml_tensor * src0,
const struct ggml_tensor * src1,
struct ggml_tensor * dst) {
const struct ggml_tensor * dst) {
switch (src0->type) {
case GGML_TYPE_F16:
{
@ -11327,7 +11327,7 @@ static void ggml_compute_forward_rope_back(
const struct ggml_compute_params * params,
const struct ggml_tensor * src0,
const struct ggml_tensor * src1,
struct ggml_tensor * dst) {
const struct ggml_tensor * dst) {
switch (src0->type) {
case GGML_TYPE_F16:
{
@ -11350,7 +11350,7 @@ static void ggml_compute_forward_conv_1d_1s_f16_f32(
const struct ggml_compute_params * params,
const struct ggml_tensor * src0,
const struct ggml_tensor * src1,
struct ggml_tensor * dst) {
const struct ggml_tensor * dst) {
GGML_ASSERT(src0->type == GGML_TYPE_F16);
GGML_ASSERT(src1->type == GGML_TYPE_F32);
GGML_ASSERT( dst->type == GGML_TYPE_F32);
@ -11470,7 +11470,7 @@ static void ggml_compute_forward_conv_1d_1s_f32(
const struct ggml_compute_params * params,
const struct ggml_tensor * src0,
const struct ggml_tensor * src1,
struct ggml_tensor * dst) {
const struct ggml_tensor * dst) {
GGML_ASSERT(src0->type == GGML_TYPE_F32);
GGML_ASSERT(src1->type == GGML_TYPE_F32);
GGML_ASSERT( dst->type == GGML_TYPE_F32);
@ -11590,7 +11590,7 @@ static void ggml_compute_forward_conv_1d_1s(
const struct ggml_compute_params * params,
const struct ggml_tensor * src0,
const struct ggml_tensor * src1,
struct ggml_tensor * dst) {
const struct ggml_tensor * dst) {
switch (src0->type) {
case GGML_TYPE_F16:
{
@ -11613,7 +11613,7 @@ static void ggml_compute_forward_conv_1d_2s_f16_f32(
const struct ggml_compute_params * params,
const struct ggml_tensor * src0,
const struct ggml_tensor * src1,
struct ggml_tensor * dst) {
const struct ggml_tensor * dst) {
GGML_ASSERT(src0->type == GGML_TYPE_F16);
GGML_ASSERT(src1->type == GGML_TYPE_F32);
GGML_ASSERT( dst->type == GGML_TYPE_F32);
@ -11733,7 +11733,7 @@ static void ggml_compute_forward_conv_1d_2s_f32(
const struct ggml_compute_params * params,
const struct ggml_tensor * src0,
const struct ggml_tensor * src1,
struct ggml_tensor * dst) {
const struct ggml_tensor * dst) {
GGML_ASSERT(src0->type == GGML_TYPE_F32);
GGML_ASSERT(src1->type == GGML_TYPE_F32);
GGML_ASSERT( dst->type == GGML_TYPE_F32);
@ -11853,7 +11853,7 @@ static void ggml_compute_forward_conv_1d_2s(
const struct ggml_compute_params * params,
const struct ggml_tensor * src0,
const struct ggml_tensor * src1,
struct ggml_tensor * dst) {
const struct ggml_tensor * dst) {
switch (src0->type) {
case GGML_TYPE_F16:
{
@ -11878,7 +11878,7 @@ static void ggml_compute_forward_flash_attn_f32(
const struct ggml_tensor * k,
const struct ggml_tensor * v,
const bool masked,
struct ggml_tensor * dst) {
const struct ggml_tensor * dst) {
int64_t t0 = ggml_perf_time_us();
UNUSED(t0);
@ -12087,7 +12087,7 @@ static void ggml_compute_forward_flash_attn_f16(
const struct ggml_tensor * k,
const struct ggml_tensor * v,
const bool masked,
struct ggml_tensor * dst) {
const struct ggml_tensor * dst) {
int64_t t0 = ggml_perf_time_us();
UNUSED(t0);
@ -12333,7 +12333,7 @@ static void ggml_compute_forward_flash_attn(
const struct ggml_tensor * k,
const struct ggml_tensor * v,
const bool masked,
struct ggml_tensor * dst) {
const struct ggml_tensor * dst) {
switch (q->type) {
case GGML_TYPE_F16:
{
@ -12539,7 +12539,7 @@ static void ggml_compute_forward_flash_ff(
const struct ggml_tensor * b1,
const struct ggml_tensor * c0,
const struct ggml_tensor * c1,
struct ggml_tensor * dst) {
const struct ggml_tensor * dst) {
switch (b0->type) {
case GGML_TYPE_F16:
{
@ -12586,7 +12586,7 @@ static void ggml_compute_forward_map_unary_f32(
static void ggml_compute_forward_map_unary(
const struct ggml_compute_params * params,
const struct ggml_tensor * src0,
struct ggml_tensor * dst,
const struct ggml_tensor * dst,
const ggml_unary_op_f32_t fun) {
switch (src0->type) {
case GGML_TYPE_F32:
@ -12606,7 +12606,7 @@ static void ggml_compute_forward_map_binary_f32(
const struct ggml_compute_params * params,
const struct ggml_tensor * src0,
const struct ggml_tensor * src1,
struct ggml_tensor * dst,
const struct ggml_tensor * dst,
const ggml_binary_op_f32_t fun) {
assert(params->ith == 0);
assert(ggml_are_same_shape(src0, src1) && ggml_are_same_shape(src0, dst));
@ -12635,7 +12635,7 @@ static void ggml_compute_forward_map_binary(
const struct ggml_compute_params * params,
const struct ggml_tensor * src0,
const struct ggml_tensor * src1,
struct ggml_tensor * dst,
const struct ggml_tensor * dst,
const ggml_binary_op_f32_t fun) {
switch (src0->type) {
case GGML_TYPE_F32:
@ -12651,7 +12651,7 @@ static void ggml_compute_forward_map_binary(
/////////////////////////////////
static void ggml_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor) {
static void ggml_compute_forward(const struct ggml_compute_params * params, const struct ggml_tensor * tensor) {
GGML_ASSERT(params);
switch (tensor->op) {
@ -14405,7 +14405,7 @@ void ggml_graph_dump_dot(const struct ggml_cgraph * gb, const struct ggml_cgraph
"label=\"",
(void *) node, color);
if (strlen(node->name) > 0) {
if (node->name[0] != '\0') {
fprintf(fp, "%s |", node->name);
}
@ -14430,7 +14430,7 @@ void ggml_graph_dump_dot(const struct ggml_cgraph * gb, const struct ggml_cgraph
"label=\"<x>",
(void *) node, color);
if (strlen(node->name) > 0) {
if (node->name[0] != '\0') {
fprintf(fp, "%s | ", node->name);
}
if (ggml_nelements(node) == 1) {
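
The two hunks above swap strlen(node->name) > 0 for node->name[0] != '\0': checking the first byte answers "is the string empty?" in constant time, whereas strlen() walks the whole string only to compare its length against zero. A rough equivalent, using a hypothetical node type rather than the real ggml struct:

#include <cstring>

struct node_t { char name[64]; };

// scans every byte of name before the comparison
static bool has_name_slow(const node_t * node) {
    return std::strlen(node->name) > 0;
}

// looking at the first byte is enough to decide emptiness
static bool has_name_fast(const node_t * node) {
    return node->name[0] != '\0';
}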
@ -14543,7 +14543,7 @@ static void ggml_opt_get_grad(int np, struct ggml_tensor * const ps[], float * g
static enum ggml_opt_result ggml_opt_adam(
struct ggml_context * ctx,
struct ggml_opt_params params,
const struct ggml_opt_params params,
struct ggml_tensor * f,
struct ggml_cgraph * gf,
struct ggml_cgraph * gb) {
@ -15120,7 +15120,7 @@ struct ggml_opt_params ggml_opt_default_params(enum ggml_opt_type type) {
enum ggml_opt_result ggml_opt(
struct ggml_context * ctx,
struct ggml_opt_params params,
const struct ggml_opt_params params,
struct ggml_tensor * f) {
bool free_ctx = false;
if (ctx == NULL) {
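
Most of the ggml.c hunks in this section take tensor arguments as const struct ggml_tensor * and by-value structs (ggml_init_params, ggml_scratch, ggml_opt_params) as const. Note that a pointer-to-const tensor only protects the struct's fields; the compute functions can still write through the non-const data pointer stored inside it, which is how dst keeps working as an output. A hedged sketch with placeholder names:

#include <cstddef>

struct tensor { std::size_t n; float * data; };

// `const tensor *` forbids modifying the struct's members, but writing
// through the non-const `data` pointer it holds is still allowed.
static void fill_ones(const tensor * dst) {
    for (std::size_t i = 0; i < dst->n; ++i) {
        dst->data[i] = 1.0f;   // fine: data points to non-const floats
    }
    // dst->n = 0;             // would not compile: dst points to const
}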

ggml.h (4 changes)

@ -442,7 +442,7 @@ extern "C" {
GGML_API size_t ggml_used_mem(const struct ggml_context * ctx);
GGML_API size_t ggml_set_scratch(struct ggml_context * ctx, struct ggml_scratch scratch);
GGML_API size_t ggml_set_scratch(struct ggml_context * ctx, const struct ggml_scratch scratch);
GGML_API struct ggml_tensor * ggml_new_tensor(
struct ggml_context * ctx,
@ -1061,7 +1061,7 @@ extern "C" {
// optimize the function defined by the tensor f
GGML_API enum ggml_opt_result ggml_opt(
struct ggml_context * ctx,
struct ggml_opt_params params,
const struct ggml_opt_params params,
struct ggml_tensor * f);
//

llama.cpp

@ -592,6 +592,9 @@ struct llama_model_loader {
auto * first_file = new llama_file_loader(fname_base.c_str(), 0, tensors_map);
file_loaders.emplace_back(first_file);
uint32_t n_parts = vocab_only ? 1 : guess_n_parts();
if (n_parts != 1) {
file_loaders.reserve(n_parts - 1);
}
for (uint32_t i = 1; i < n_parts; i++) {
std::string fname = fname_base + "." + std::to_string(i);
auto * ith_file = new llama_file_loader(fname.c_str(), i, tensors_map);
@ -891,10 +894,11 @@ static void llama_model_load_internal(
std::unique_ptr<llama_model_loader> ml(new llama_model_loader(fname, use_mmap, vocab_only));
lctx.vocab = std::move(ml->file_loaders.at(0)->vocab);
const auto & loader = ml->file_loaders.at(0);
lctx.vocab = std::move(loader->vocab);
auto & model = lctx.model;
model.hparams = ml->file_loaders.at(0)->hparams;
llama_file_version file_version = ml->file_loaders.at(0)->file_version;
model.hparams = loader->hparams;
llama_file_version file_version = loader->file_version;
auto & hparams = model.hparams;
uint32_t n_ff = ((2*(4*hparams.n_embd)/3 + hparams.n_mult - 1)/hparams.n_mult)*hparams.n_mult;
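
The hunk above also replaces three separate ml->file_loaders.at(0) lookups with a single const auto & loader bound once and reused. A small sketch of the same pattern, with made-up types:

#include <memory>
#include <vector>

struct file_info { int version; int n_params; };

static int summarize(const std::vector<std::unique_ptr<file_info>> & files) {
    // one bounds-checked at(0) instead of repeating the lookup per field
    const auto & first = files.at(0);
    return first->version + first->n_params;
}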
@ -1019,7 +1023,8 @@ static void llama_model_load_internal(
ml->done_getting_tensors();
// populate `tensors_by_name`
for (llama_load_tensor & lt : ml->tensors_map.tensors) {
model.tensors_by_name.reserve(ml->tensors_map.tensors.size());
for (const auto & lt : ml->tensors_map.tensors) {
model.tensors_by_name.emplace_back(lt.name, lt.ggml_tensor);
}
@ -1143,6 +1148,8 @@ static bool llama_eval_internal(
struct ggml_tensor * inpL = ggml_get_rows(ctx0, model.tok_embeddings, embd);
for (int il = 0; il < n_layer; ++il) {
const auto & layer = model.layers[il];
struct ggml_tensor * inpSA = inpL;
struct ggml_tensor * cur;
@ -1155,22 +1162,22 @@ static bool llama_eval_internal(
// cur = attention_norm*cur
cur = ggml_mul(ctx0,
ggml_repeat(ctx0, model.layers[il].attention_norm, cur),
ggml_repeat(ctx0, layer.attention_norm, cur),
cur);
}
// self-attention
{
// compute Q and K and RoPE them
struct ggml_tensor * Qcur = ggml_rope_inplace(ctx0, ggml_reshape_3d(ctx0, ggml_mul_mat(ctx0, model.layers[il].wq, cur), n_embd/n_head, n_head, N), n_past, n_rot, 0);
struct ggml_tensor * Kcur = ggml_rope_inplace(ctx0, ggml_reshape_3d(ctx0, ggml_mul_mat(ctx0, model.layers[il].wk, cur), n_embd/n_head, n_head, N), n_past, n_rot, 0);
struct ggml_tensor * Qcur = ggml_rope_inplace(ctx0, ggml_reshape_3d(ctx0, ggml_mul_mat(ctx0, layer.wq, cur), n_embd/n_head, n_head, N), n_past, n_rot, 0);
struct ggml_tensor * Kcur = ggml_rope_inplace(ctx0, ggml_reshape_3d(ctx0, ggml_mul_mat(ctx0, layer.wk, cur), n_embd/n_head, n_head, N), n_past, n_rot, 0);
ggml_set_name(Qcur, "Qcur");
ggml_set_name(Kcur, "Kcur");
// store key and value to memory
{
// compute the transposed [N, n_embd] V matrix
struct ggml_tensor * Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, ggml_mul_mat(ctx0, model.layers[il].wv, cur), n_embd, N));
struct ggml_tensor * Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, ggml_mul_mat(ctx0, layer.wv, cur), n_embd, N));
struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, N*n_embd, (ggml_element_size(kv_self.k)*n_embd)*(il*n_ctx + n_past));
struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, N, n_embd,
@ -1249,7 +1256,7 @@ static bool llama_eval_internal(
// projection (no bias)
cur = ggml_mul_mat(ctx0,
model.layers[il].wo,
layer.wo,
cur);
}
@ -1265,16 +1272,16 @@ static bool llama_eval_internal(
// cur = ffn_norm*cur
cur = ggml_mul(ctx0,
ggml_repeat(ctx0, model.layers[il].ffn_norm, cur),
ggml_repeat(ctx0, layer.ffn_norm, cur),
cur);
}
struct ggml_tensor * tmp = ggml_mul_mat(ctx0,
model.layers[il].w3,
layer.w3,
cur);
cur = ggml_mul_mat(ctx0,
model.layers[il].w1,
layer.w1,
cur);
// SILU activation
@ -1283,7 +1290,7 @@ static bool llama_eval_internal(
cur = ggml_mul(ctx0, cur, tmp);
cur = ggml_mul_mat(ctx0,
model.layers[il].w2,
layer.w2,
cur);
}
@ -1450,7 +1457,7 @@ struct llama_tokenizer {
// keep substituting the highest frequency pairs for as long as we can.
while (!work_queue_.empty()) {
auto bigram = work_queue_.top();
const auto& bigram = work_queue_.top();
work_queue_.pop();
auto & left_sym = symbols_[bigram.left];
@ -1485,6 +1492,7 @@ struct llama_tokenizer {
if (token == vocab_.token_to_id.end()) {
// output any symbols that did not form tokens as bytes.
output.reserve(symbol.n);
for (int j = 0; j < (int) symbol.n; ++j) {
llama_vocab::id token_id = static_cast<uint8_t>(symbol.text[j]) + 3;
output.push_back(token_id);
@ -1703,8 +1711,9 @@ void llama_sample_typical(struct llama_context * ctx, llama_token_data_array * c
// Compute the absolute difference between negative log probability and entropy for each candidate
std::vector<float> shifted_scores;
shifted_scores.reserve(candidates->size);
for (size_t i = 0; i < candidates->size; ++i) {
float shifted_score = fabsf(-logf(candidates->data[i].p) - entropy);
const float shifted_score = fabsf(-logf(candidates->data[i].p) - entropy);
shifted_scores.push_back(shifted_score);
}
@ -1733,6 +1742,7 @@ void llama_sample_typical(struct llama_context * ctx, llama_token_data_array * c
// Resize the output vector to keep only the locally typical tokens
std::vector<llama_token_data> new_candidates;
new_candidates.reserve(last_idx);
for (size_t i = 0; i < last_idx; ++i) {
size_t idx = indices[i];
new_candidates.push_back(candidates->data[idx]);
@ -2258,7 +2268,8 @@ int llama_apply_lora_from_file_internal(struct llama_context * ctx, const char *
// create a name -> tensor map of the model to accelerate lookups
std::unordered_map<std::string, struct ggml_tensor*> model_tensors;
for (auto & kv: model.tensors_by_name) {
model_tensors.reserve(model.tensors_by_name.size());
for (const auto & kv: model.tensors_by_name) {
model_tensors.insert(kv);
}
@ -2374,12 +2385,13 @@ int llama_apply_lora_from_file_internal(struct llama_context * ctx, const char *
ggml_tensor * base_t;
if (model_loader) {
// load from base model
if (model_loader->tensors_map.name_to_idx.find(base_name) == model_loader->tensors_map.name_to_idx.end()) {
auto & tmap = model_loader->tensors_map;
if (tmap.name_to_idx.find(base_name) == tmap.name_to_idx.end()) {
fprintf(stderr, "%s: error: tensor '%s' not found in base model\n", __func__, base_name.c_str());
return 1;
}
size_t idx = model_loader->tensors_map.name_to_idx[base_name];
llama_load_tensor & lt = model_loader->tensors_map.tensors[idx];
size_t idx = tmap.name_to_idx[base_name];
llama_load_tensor & lt = tmap.tensors[idx];
base_t = model_loader->get_tensor(base_name, { (uint32_t)dest_t->ne[0], (uint32_t)dest_t->ne[1] });
lt.data = (uint8_t *) lt.ggml_tensor->data;
model_loader->load_data_for(lt);
@ -2513,11 +2525,12 @@ size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dst) {
std::stringstream rng_ss;
rng_ss << ctx->rng;
const size_t rng_size = rng_ss.str().size();
const auto & rng = rng_ss.str();
const size_t rng_size = rng.size();
char rng_buf[LLAMA_MAX_RNG_STATE];
memset(&rng_buf[0], 0, LLAMA_MAX_RNG_STATE);
memcpy(&rng_buf[0], rng_ss.str().data(), rng_ss.str().size());
memcpy(&rng_buf[0], rng.data(), rng.size());
memcpy(out, &rng_size, sizeof(rng_size)); out += sizeof(rng_size);
memcpy(out, &rng_buf[0], LLAMA_MAX_RNG_STATE); out += LLAMA_MAX_RNG_STATE;
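
In the hunk above, std::stringstream::str() returns a fresh std::string by value on every call, so the old code materialized the RNG state string three times; binding the result once to a const reference (whose lifetime is extended to cover the later uses) avoids the repeated copies. A minimal sketch, with illustrative names:

#include <cstddef>
#include <cstring>
#include <sstream>

static std::size_t copy_stream_once(const std::stringstream & ss, char * out, std::size_t cap) {
    const auto & s = ss.str();   // str() builds a new string each call; call it once
    const std::size_t n = s.size() < cap ? s.size() : cap;
    std::memcpy(out, s.data(), n);
    return n;
}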
@ -2901,7 +2914,7 @@ void llama_reset_timings(struct llama_context * ctx) {
const char * llama_print_system_info(void) {
static std::string s;
s = "";
s.clear();
s += "AVX = " + std::to_string(ggml_cpu_has_avx()) + " | ";
s += "AVX2 = " + std::to_string(ggml_cpu_has_avx2()) + " | ";
s += "AVX512 = " + std::to_string(ggml_cpu_has_avx512()) + " | ";