Mirror of https://github.com/jart/cosmopolitan.git, synced 2025-04-15 12:18:52 +00:00
Sync llama.cpp to 6986c7835adc13ba3f9d933b95671bb1f3984dc6
parent 8f522cb702
commit 1904a3cae8
3 changed files with 3666 additions and 357 deletions
third_party/ggml/ggml.c (vendored): 3737 changes
(file diff suppressed because it is too large)
third_party/ggml/ggml.h (vendored): 209 changes
@@ -226,6 +226,11 @@ COSMOPOLITAN_C_START_
     GGML_TYPE_COUNT,
   };
 
+  enum ggml_backend {
+    GGML_BACKEND_CPU = 0,
+    GGML_BACKEND_CUDA = 1,
+  };
+
   // model file types
   enum ggml_ftype {
     GGML_FTYPE_UNKNOWN = -1,
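
The new ggml_backend enum also surfaces as a field on every tensor (see the struct ggml_tensor hunk further down). A minimal sketch of what the tag enables, assuming a valid ggml_context * ctx; the shape is illustrative, not from this commit:

    // hypothetical check: CPU-backed tensors expose host-addressable data
    struct ggml_tensor * t = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 64);
    if (t->backend == GGML_BACKEND_CPU) {
        ((float *) t->data)[0] = 1.0f;  // safe to touch directly
    }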
@@ -246,12 +251,16 @@ COSMOPOLITAN_C_START_
 
     GGML_OP_DUP,
     GGML_OP_ADD,
+    GGML_OP_ADD1,
+    GGML_OP_ACC,
     GGML_OP_SUB,
     GGML_OP_MUL,
     GGML_OP_DIV,
     GGML_OP_SQR,
     GGML_OP_SQRT,
+    GGML_OP_LOG,
     GGML_OP_SUM,
+    GGML_OP_SUM_ROWS,
     GGML_OP_MEAN,
     GGML_OP_REPEAT,
     GGML_OP_ABS,
@@ -261,12 +270,15 @@ COSMOPOLITAN_C_START_
     GGML_OP_RELU,
     GGML_OP_GELU,
     GGML_OP_SILU,
+    GGML_OP_SILU_BACK,
     GGML_OP_NORM, // normalize
     GGML_OP_RMS_NORM,
+    GGML_OP_RMS_NORM_BACK,
 
     GGML_OP_MUL_MAT,
 
     GGML_OP_SCALE,
+    GGML_OP_SET,
     GGML_OP_CPY,
     GGML_OP_CONT,
     GGML_OP_RESHAPE,
@@ -274,9 +286,13 @@ COSMOPOLITAN_C_START_
     GGML_OP_PERMUTE,
     GGML_OP_TRANSPOSE,
     GGML_OP_GET_ROWS,
+    GGML_OP_GET_ROWS_BACK,
+    GGML_OP_DIAG,
     GGML_OP_DIAG_MASK_INF,
+    GGML_OP_DIAG_MASK_ZERO,
     GGML_OP_SOFT_MAX,
     GGML_OP_ROPE,
+    GGML_OP_ROPE_BACK,
     GGML_OP_ALIBI,
     GGML_OP_CONV_1D_1S,
     GGML_OP_CONV_1D_2S,
@@ -305,7 +321,8 @@ COSMOPOLITAN_C_START_
 
   // n-dimensional tensor
   struct ggml_tensor {
     enum ggml_type type;
+    enum ggml_backend backend;
 
     int n_dims;
     int64_t ne[GGML_MAX_DIMS]; // number of elements
@@ -336,7 +353,7 @@ COSMOPOLITAN_C_START_
 
     char name[32];
 
-    char padding[8]; // TODO: remove and add padding to name?
+    char padding[16]; // TODO: remove and add padding to name?
   };
 
   // computation graph
@@ -487,6 +504,29 @@ COSMOPOLITAN_C_START_
       struct ggml_tensor * a,
       struct ggml_tensor * b);
 
+  GGML_API struct ggml_tensor * ggml_add1(
+      struct ggml_context * ctx,
+      struct ggml_tensor * a,
+      struct ggml_tensor * b);
+
+  GGML_API struct ggml_tensor * ggml_acc(
+      struct ggml_context * ctx,
+      struct ggml_tensor * a,
+      struct ggml_tensor * b,
+      size_t nb1,
+      size_t nb2,
+      size_t nb3,
+      size_t offset);
+
+  GGML_API struct ggml_tensor * ggml_acc_inplace(
+      struct ggml_context * ctx,
+      struct ggml_tensor * a,
+      struct ggml_tensor * b,
+      size_t nb1,
+      size_t nb2,
+      size_t nb3,
+      size_t offset);
+
   GGML_API struct ggml_tensor * ggml_sub(
       struct ggml_context * ctx,
       struct ggml_tensor * a,
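
ggml_add1 and ggml_acc are new in this sync. A rough usage sketch, assuming a valid ctx; the broadcast-scalar reading of ggml_add1 and the stride arguments of ggml_acc are inferred from the signatures, so treat this as illustrative:

    struct ggml_tensor * a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 4, 4);
    struct ggml_tensor * s = ggml_new_f32(ctx, 0.5f);               // 1-element tensor
    struct ggml_tensor * y = ggml_add1(ctx, a, s);                  // a + 0.5, elementwise
    struct ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 4);
    // accumulate b into a view of a; strides borrowed from a, byte offset 0
    struct ggml_tensor * z = ggml_acc(ctx, a, b, a->nb[1], a->nb[2], a->nb[3], 0);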
@@ -510,12 +550,24 @@ COSMOPOLITAN_C_START_
       struct ggml_context * ctx,
       struct ggml_tensor * a);
 
+  GGML_API struct ggml_tensor * ggml_log(
+      struct ggml_context * ctx,
+      struct ggml_tensor * a);
+
+  GGML_API struct ggml_tensor * ggml_log_inplace(
+      struct ggml_context * ctx,
+      struct ggml_tensor * a);
+
   // return scalar
-  // TODO: compute sum along rows
   GGML_API struct ggml_tensor * ggml_sum(
       struct ggml_context * ctx,
       struct ggml_tensor * a);
 
+  // sums along rows, with input shape [a,b,c,d] return shape [1,b,c,d]
+  GGML_API struct ggml_tensor * ggml_sum_rows(
+      struct ggml_context * ctx,
+      struct ggml_tensor * a);
+
   // mean along rows
   GGML_API struct ggml_tensor * ggml_mean(
       struct ggml_context * ctx,
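
The shape comment on ggml_sum_rows is the key detail: the reduction runs along the first dimension only, resolving the old TODO on ggml_sum. A small sketch under the same assumptions as above:

    struct ggml_tensor * x = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 8, 3); // ne = [8, 3]
    struct ggml_tensor * r = ggml_sum_rows(ctx, x);                        // ne = [1, 3]
    struct ggml_tensor * t = ggml_sum(ctx, x);                             // scalar total
    struct ggml_tensor * l = ggml_log(ctx, x);                             // elementwise ln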
@@ -557,6 +609,13 @@ COSMOPOLITAN_C_START_
       struct ggml_context * ctx,
       struct ggml_tensor * a);
 
+  // a - x
+  // b - dy
+  GGML_API struct ggml_tensor * ggml_silu_back(
+      struct ggml_context * ctx,
+      struct ggml_tensor * a,
+      struct ggml_tensor * b);
+
   // normalize along rows
   // TODO: eps is hardcoded to 1e-5 for now
   GGML_API struct ggml_tensor * ggml_norm(
@@ -567,6 +626,13 @@ COSMOPOLITAN_C_START_
       struct ggml_context * ctx,
       struct ggml_tensor * a);
 
+  // a - x
+  // b - dy
+  GGML_API struct ggml_tensor * ggml_rms_norm_back(
+      struct ggml_context * ctx,
+      struct ggml_tensor * a,
+      struct ggml_tensor * b);
+
   // A: m rows, n columns
   // B: p rows, n columns (i.e. we transpose it internally)
   // result is m columns, p rows
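
The *_back entries follow one convention, per the comments: a is the forward input x, b is the upstream gradient dy, and the result is dx. A hedged sketch (in practice this plumbing is driven by ggml's autodiff, not written by hand):

    struct ggml_tensor * x  = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 16);
    struct ggml_tensor * dy = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 16);
    struct ggml_tensor * dx = ggml_silu_back(ctx, x, dy);   // d(silu)/dx * dy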
@@ -579,12 +645,66 @@ COSMOPOLITAN_C_START_
   // operations on tensors without backpropagation
   //
 
-  // in-place, returns view(a)
   GGML_API struct ggml_tensor * ggml_scale(
       struct ggml_context * ctx,
       struct ggml_tensor * a,
       struct ggml_tensor * b);
 
+  // in-place, returns view(a)
+  GGML_API struct ggml_tensor * ggml_scale_inplace(
+      struct ggml_context * ctx,
+      struct ggml_tensor * a,
+      struct ggml_tensor * b);
+
+  // b -> view(a,offset,nb1,nb2,3), return modified a
+  GGML_API struct ggml_tensor * ggml_set(
+      struct ggml_context * ctx,
+      struct ggml_tensor * a,
+      struct ggml_tensor * b,
+      size_t nb1,
+      size_t nb2,
+      size_t nb3,
+      size_t offset);
+
+  // b -> view(a,offset,nb1,nb2,3), return view(a)
+  GGML_API struct ggml_tensor * ggml_set_inplace(
+      struct ggml_context * ctx,
+      struct ggml_tensor * a,
+      struct ggml_tensor * b,
+      size_t nb1,
+      size_t nb2,
+      size_t nb3,
+      size_t offset);
+
+  GGML_API struct ggml_tensor * ggml_set_1d(
+      struct ggml_context * ctx,
+      struct ggml_tensor * a,
+      struct ggml_tensor * b,
+      size_t offset);
+
+  GGML_API struct ggml_tensor * ggml_set_1d_inplace(
+      struct ggml_context * ctx,
+      struct ggml_tensor * a,
+      struct ggml_tensor * b,
+      size_t offset);
+
+  // b -> view(a,offset,nb1,nb2,3), return modified a
+  GGML_API struct ggml_tensor * ggml_set_2d(
+      struct ggml_context * ctx,
+      struct ggml_tensor * a,
+      struct ggml_tensor * b,
+      size_t nb1,
+      size_t offset);
+
+  // b -> view(a,offset,nb1,nb2,3), return view(a)
+  GGML_API struct ggml_tensor * ggml_set_2d_inplace(
+      struct ggml_context * ctx,
+      struct ggml_tensor * a,
+      struct ggml_tensor * b,
+      size_t nb1,
+      size_t offset);
+
 
   // a -> b, return view(b)
   GGML_API struct ggml_tensor * ggml_cpy(
       struct ggml_context * ctx,
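
The ggml_set family writes b over a region of a described by byte strides and a byte offset; the plain form returns the modified a, the _inplace form a view. An illustrative one-dimensional case, assuming a valid ctx:

    struct ggml_tensor * a = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 10);
    struct ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 4);
    // overwrite elements 2..5 of a with b (offset is in bytes)
    struct ggml_tensor * r = ggml_set_1d(ctx, a, b, 2 * ggml_element_size(a));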
@@ -603,6 +723,13 @@ COSMOPOLITAN_C_START_
       struct ggml_tensor * a,
       struct ggml_tensor * b);
 
+  // return view(a)
+  // TODO: when we start computing gradient, make a copy instead of view
+  GGML_API struct ggml_tensor * ggml_reshape_1d(
+      struct ggml_context * ctx,
+      struct ggml_tensor * a,
+      int64_t ne0);
+
   // return view(a)
   // TODO: when we start computing gradient, make a copy instead of view
   GGML_API struct ggml_tensor * ggml_reshape_2d(
@@ -620,6 +747,14 @@ COSMOPOLITAN_C_START_
       int64_t ne1,
       int64_t ne2);
 
+  GGML_API struct ggml_tensor * ggml_reshape_4d(
+      struct ggml_context * ctx,
+      struct ggml_tensor * a,
+      int64_t ne0,
+      int64_t ne1,
+      int64_t ne2,
+      int64_t ne3);
+
   // offset in bytes
   GGML_API struct ggml_tensor * ggml_view_1d(
       struct ggml_context * ctx,
@@ -645,6 +780,18 @@ COSMOPOLITAN_C_START_
       size_t nb2, // slice stride in bytes
       size_t offset);
 
+  GGML_API struct ggml_tensor * ggml_view_4d(
+      struct ggml_context * ctx,
+      struct ggml_tensor * a,
+      int64_t ne0,
+      int64_t ne1,
+      int64_t ne2,
+      int64_t ne3,
+      size_t nb1, // row stride in bytes
+      size_t nb2, // slice stride in bytes
+      size_t nb3,
+      size_t offset);
+
   GGML_API struct ggml_tensor * ggml_permute(
       struct ggml_context * ctx,
       struct ggml_tensor * a,
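
ggml_view_4d completes the view family; the nb* arguments are byte strides, which for a contiguous tensor follow directly from ne. A sketch with made-up shapes, assuming a valid ctx:

    const int64_t ne0 = 4, ne1 = 3, ne2 = 2, ne3 = 2;
    struct ggml_tensor * t = ggml_new_tensor_4d(ctx, GGML_TYPE_F32, ne0, ne1, ne2, ne3);
    const size_t nb1 = ne0 * sizeof(float); // row stride in bytes
    const size_t nb2 = nb1 * ne1;           // slice stride in bytes
    const size_t nb3 = nb2 * ne2;
    struct ggml_tensor * v = ggml_view_4d(ctx, t, ne0, ne1, ne2, ne3, nb1, nb2, nb3, 0);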
@@ -663,18 +810,49 @@ COSMOPOLITAN_C_START_
       struct ggml_tensor * a,
       struct ggml_tensor * b);
 
+  GGML_API struct ggml_tensor * ggml_get_rows_back(
+      struct ggml_context * ctx,
+      struct ggml_tensor * a,
+      struct ggml_tensor * b,
+      struct ggml_tensor * c);
+
+  GGML_API struct ggml_tensor * ggml_diag(
+      struct ggml_context * ctx,
+      struct ggml_tensor * a);
+
   // set elements above the diagonal to -INF
-  // in-place, returns view(a)
   GGML_API struct ggml_tensor * ggml_diag_mask_inf(
       struct ggml_context * ctx,
       struct ggml_tensor * a,
       int n_past);
 
   // in-place, returns view(a)
+  GGML_API struct ggml_tensor * ggml_diag_mask_inf_inplace(
+      struct ggml_context * ctx,
+      struct ggml_tensor * a,
+      int n_past);
+
+  // set elements above the diagonal to 0
+  GGML_API struct ggml_tensor * ggml_diag_mask_zero(
+      struct ggml_context * ctx,
+      struct ggml_tensor * a,
+      int n_past);
+
+  // in-place, returns view(a)
+  GGML_API struct ggml_tensor * ggml_diag_mask_zero_inplace(
+      struct ggml_context * ctx,
+      struct ggml_tensor * a,
+      int n_past);
+
   GGML_API struct ggml_tensor * ggml_soft_max(
       struct ggml_context * ctx,
       struct ggml_tensor * a);
 
+  // in-place, returns view(a)
+  GGML_API struct ggml_tensor * ggml_soft_max_inplace(
+      struct ggml_context * ctx,
+      struct ggml_tensor * a);
+
   // rotary position embedding
   // in-place, returns view(a)
   // if mode & 1 == 1, skip n_past elements
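
Note the comment moves: ggml_diag_mask_inf is no longer documented as in-place; the view-returning behaviour now lives in the _inplace variants. A sketch of the distinction, assuming scores is an existing square attention matrix:

    // scores: existing [n, n] attention matrix (assumed)
    struct ggml_tensor * m1 = ggml_diag_mask_inf(ctx, scores, 0);         // fresh result
    struct ggml_tensor * m2 = ggml_diag_mask_inf_inplace(ctx, scores, 0); // view(scores)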
@@ -687,6 +865,23 @@ COSMOPOLITAN_C_START_
       int n_dims,
       int mode);
 
+  // in-place, returns view(a)
+  GGML_API struct ggml_tensor * ggml_rope_inplace(
+      struct ggml_context * ctx,
+      struct ggml_tensor * a,
+      int n_past,
+      int n_dims,
+      int mode);
+
+  // rotary position embedding backward, i.e. compute dx from dy
+  // a - dy
+  GGML_API struct ggml_tensor * ggml_rope_back(
+      struct ggml_context * ctx,
+      struct ggml_tensor * a,
+      int n_past,
+      int n_dims,
+      int mode);
+
   // alibi position embedding
   // in-place, returns view(a)
   struct ggml_tensor * ggml_alibi(
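
ggml_rope_inplace is what llama.cc switches to below for Qcur/Kcur. An illustrative call, assuming qcur is an existing 3-D activation tensor and the parameter values are placeholders:

    const int n_past = 0, n_rot = 64, mode = 0;
    struct ggml_tensor * q = ggml_rope_inplace(ctx, qcur, n_past, n_rot, mode); // view(qcur)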
@@ -731,13 +926,13 @@ COSMOPOLITAN_C_START_
   GGML_API struct ggml_tensor * ggml_map_unary_f32(
       struct ggml_context * ctx,
       struct ggml_tensor * a,
-      const ggml_unary_op_f32_t fun);
+      ggml_unary_op_f32_t fun);
 
   GGML_API struct ggml_tensor * ggml_map_binary_f32(
       struct ggml_context * ctx,
       struct ggml_tensor * a,
       struct ggml_tensor * b,
-      const ggml_binary_op_f32_t fun);
+      ggml_binary_op_f32_t fun);
 
   //
   // automatic differentiation
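
Dropping const from the fun parameters is a pure API cleanup: top-level const on a by-value parameter changes nothing for callers. Assuming the ggml_unary_op_f32_t typedef earlier in ggml.h (void (*)(const int n, float * dst, const float * src)), a custom op looks roughly like:

    static void my_square(const int n, float * dst, const float * src) {
        for (int i = 0; i < n; ++i) dst[i] = src[i] * src[i];
    }
    // ...
    struct ggml_tensor * y = ggml_map_unary_f32(ctx, x, my_square);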
third_party/ggml/llama.cc (vendored): 77 changes
@@ -105,26 +105,26 @@ static const std::map<e_model, size_t> & MEM_REQ_SCRATCH1()
 // 2*n_embd*n_ctx*n_layer*sizeof(float16)
 static const std::map<e_model, size_t> & MEM_REQ_KV_SELF()
 {
-    static std::map<e_model, size_t> _MEM_REQ_KV_SELF = {
+    static std::map<e_model, size_t> k_sizes = {
         { MODEL_7B, 1026ull * MB },
         { MODEL_13B, 1608ull * MB },
         { MODEL_30B, 3124ull * MB },
         { MODEL_65B, 5120ull * MB },
     };
-    return _MEM_REQ_KV_SELF;
+    return k_sizes;
 }
 
 // this is mostly needed for temporary mul_mat buffers to dequantize the data
 // not actually needed if BLAS is disabled
 static const std::map<e_model, size_t> & MEM_REQ_EVAL()
 {
-    static std::map<e_model, size_t> _MEM_REQ_EVAL = {
+    static std::map<e_model, size_t> k_sizes = {
         { MODEL_7B, 768ull * MB },
         { MODEL_13B, 1024ull * MB },
         { MODEL_30B, 1280ull * MB },
         { MODEL_65B, 1536ull * MB },
     };
-    return _MEM_REQ_EVAL;
+    return k_sizes;
 }
 
 // default hparams (LLaMA 7B)
@@ -681,7 +681,7 @@ struct llama_model_loader {
         }
     }
 
-    struct ggml_tensor * get_tensor(const std::string & name, std::vector<uint32_t> ne) {
+    struct ggml_tensor * get_tensor(const std::string & name, const std::vector<uint32_t> & ne) {
        auto it = tensors_map.name_to_idx.find(name);
        if (it == tensors_map.name_to_idx.end()) {
            Die("llama.cpp: tensor '%s' is missing from model", name.c_str());
@@ -1131,7 +1131,7 @@ static bool llama_eval_internal(
     const auto & model = lctx.model;
     const auto & hparams = model.hparams;
 
-    auto & kv_self = model.kv_self;
+    const auto & kv_self = model.kv_self;
 
     LLAMA_ASSERT(!!kv_self.ctx);
 
@@ -1184,8 +1184,8 @@ static bool llama_eval_internal(
         // self-attention
         {
             // compute Q and K and RoPE them
-            struct ggml_tensor * Qcur = ggml_rope(ctx0, ggml_reshape_3d(ctx0, ggml_mul_mat(ctx0, model.layers[il].wq, cur), n_embd/n_head, n_head, N), n_past, n_rot, 0);
-            struct ggml_tensor * Kcur = ggml_rope(ctx0, ggml_reshape_3d(ctx0, ggml_mul_mat(ctx0, model.layers[il].wk, cur), n_embd/n_head, n_head, N), n_past, n_rot, 0);
+            struct ggml_tensor * Qcur = ggml_rope_inplace(ctx0, ggml_reshape_3d(ctx0, ggml_mul_mat(ctx0, model.layers[il].wq, cur), n_embd/n_head, n_head, N), n_past, n_rot, 0);
+            struct ggml_tensor * Kcur = ggml_rope_inplace(ctx0, ggml_reshape_3d(ctx0, ggml_mul_mat(ctx0, model.layers[il].wk, cur), n_embd/n_head, n_head, N), n_past, n_rot, 0);
             ggml_set_name(Qcur, "Qcur");
             ggml_set_name(Kcur, "Kcur");
 
@@ -1226,17 +1226,19 @@ static bool llama_eval_internal(
             struct ggml_tensor * KQ_scale = ggml_new_f32(ctx0, 1.0f/sqrtf(float(n_embd)/n_head));
             ggml_set_name(KQ_scale, "1/sqrt(n_embd/n_head)");
 
-            struct ggml_tensor * KQ_scaled = ggml_scale(ctx0, KQ, KQ_scale);
+            // KQ_scaled shape [n_past + N, N, n_head, 1]
+            struct ggml_tensor * KQ_scaled = ggml_scale_inplace(ctx0, KQ, KQ_scale);
             ggml_set_name(KQ_scaled, "KQ_scaled");
 
             // KQ_masked = mask_past(KQ_scaled)
-            struct ggml_tensor * KQ_masked = ggml_diag_mask_inf(ctx0, KQ_scaled, n_past);
+            struct ggml_tensor * KQ_masked = ggml_diag_mask_inf_inplace(ctx0, KQ_scaled, n_past);
             ggml_set_name(KQ_masked, "KQ_masked");
 
             // KQ = soft_max(KQ_masked)
-            struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ_masked);
+            struct ggml_tensor * KQ_soft_max = ggml_soft_max_inplace(ctx0, KQ_masked);
             ggml_set_name(KQ_soft_max, "KQ_soft_max");
 
+
             // split cached V into n_head heads
             struct ggml_tensor * V =
                 ggml_view_3d(ctx0, kv_self.v,
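
The point of the _inplace switches in this block: each non-inplace op would allocate another (n_past + N) x N x n_head scratch tensor in ctx0, while the in-place chain keeps rewriting KQ's buffer. Conceptually (an illustrative trim of the real code above):

    struct ggml_tensor * kq = ggml_mul_mat(ctx0, K, Q);
    kq = ggml_scale_inplace(ctx0, kq, KQ_scale);        // view of kq
    kq = ggml_diag_mask_inf_inplace(ctx0, kq, n_past);  // view of kq
    kq = ggml_soft_max_inplace(ctx0, kq);               // view of kq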
@@ -1337,7 +1339,7 @@ static bool llama_eval_internal(
     lctx.use_buf(ctx0, -1);
 
     // logits -> probs
-    //inpL = ggml_soft_max(ctx0, inpL);
+    //inpL = ggml_soft_max_inplace(ctx0, inpL);
 
     // run the computation
     ggml_build_forward_expand(&gf, inpL);
@@ -1375,7 +1377,7 @@ static bool llama_eval_internal(
     }
 
     // extract embeddings
-    if (lctx.embedding.size()) {
+    if (!lctx.embedding.empty()) {
         auto & embedding_out = lctx.embedding;
 
         embedding_out.resize(n_embd);
@@ -1426,6 +1428,8 @@ struct llama_sp_symbol {
     size_t n;
 };
 
+static_assert(std::is_trivially_copyable<llama_sp_symbol>::value, "llama_sp_symbol is not trivially copyable");
+
 struct llama_sp_bigram {
     struct comparator {
         bool operator()(llama_sp_bigram & l, llama_sp_bigram & r) {
@@ -1458,7 +1462,7 @@ struct llama_tokenizer {
             sym.prev = index - 1;
             sym.next = offs == text.size() ? -1 : index + 1;
             index++;
-            symbols_.emplace_back(std::move(sym));
+            symbols_.emplace_back(sym);
         }
 
         // seed the work queue with all possible 2-character tokens.
@@ -1549,7 +1553,7 @@ static std::vector<llama_vocab::id> llama_tokenize(const llama_vocab & vocab, co
     llama_tokenizer tokenizer(vocab);
     std::vector<llama_vocab::id> output;
 
-    if (text.size() == 0) {
+    if (text.empty()) {
         return output;
     }
 
@@ -1785,7 +1789,7 @@ void llama_sample_repetition_penalty(struct llama_context * ctx, llama_token_dat
     const int64_t t_start_sample_us = ggml_time_us();
 
     for (size_t i = 0; i < candidates->size; ++i) {
-        auto token_iter = std::find(last_tokens, last_tokens + last_tokens_size, candidates->data[i].id);
+        const auto * token_iter = std::find(last_tokens, last_tokens + last_tokens_size, candidates->data[i].id);
         if (token_iter == last_tokens + last_tokens_size) {
             continue;
         }
@@ -1929,7 +1933,7 @@ llama_token llama_sample_token_greedy(struct llama_context * ctx, llama_token_da
     const int64_t t_start_sample_us = ggml_time_us();
 
     // Find max element
-    auto max_iter = std::max_element(candidates->data, candidates->data + candidates->size, [](const llama_token_data & a, const llama_token_data & b) {
+    auto * max_iter = std::max_element(candidates->data, candidates->data + candidates->size, [](const llama_token_data & a, const llama_token_data & b) {
         return a.logit < b.logit;
     });
 
@@ -2286,7 +2290,8 @@ int llama_apply_lora_from_file_internal(struct llama_context * ctx, const char *
         fprintf(stderr, "%s: loading base model from '%s'\n", __func__, path_base_model);
         model_loader.reset(new llama_model_loader(path_base_model, /*use_mmap*/ true, /*vocab_only*/ false));
 
-        size_t ctx_size, mmapped_size;
+        size_t ctx_size;
+        size_t mmapped_size;
         model_loader->calc_sizes(&ctx_size, &mmapped_size);
         base_buf.resize(ctx_size);
 
@@ -2325,8 +2330,12 @@ int llama_apply_lora_from_file_internal(struct llama_context * ctx, const char *
             fin.read(reinterpret_cast<char *>(&ne[i]), sizeof(ne[i]));
         }
 
-        std::string name(length, 0);
-        fin.read(&name[0], length);
+        std::string name;
+        {
+            char buf[1024];
+            fin.read(buf, length);
+            name = std::string(buf, length);
+        }
 
         // check for lora suffix and get the type of tensor
         const std::string lora_suffix = ".lora";
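
The replacement bounds the name read with a 1024-byte stack buffer but does not check length against it; a guard along these lines would be prudent (hypothetical, not part of the commit):

    char buf[1024];
    if (length > sizeof(buf)) {
        fprintf(stderr, "%s: tensor name too long\n", __func__);
        return 1;
    }
    fin.read(buf, length);
    name = std::string(buf, length);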
@@ -2341,7 +2350,7 @@ int llama_apply_lora_from_file_internal(struct llama_context * ctx, const char *
         base_name.erase(pos);
         // fprintf(stderr, "%s: %s => %s (lora type %s) ", __func__, name.c_str(),base_name.c_str(), lora_type.c_str());
 
-        if (model_tensors.find(base_name.data()) == model_tensors.end()) {
+        if (model_tensors.find(base_name) == model_tensors.end()) {
             fprintf(stderr, "%s: unknown tensor '%s' in lora adapter\n", __func__, name.data());
             return 1;
         }
@@ -2421,7 +2430,7 @@ int llama_apply_lora_from_file_internal(struct llama_context * ctx, const char *
 
             if (scaling != 1.0f) {
                 ggml_tensor * scale_tensor = ggml_new_f32(lora_ctx, scaling);
-                BA = ggml_scale(lora_ctx, BA, scale_tensor);
+                BA = ggml_scale_inplace(lora_ctx, BA, scale_tensor);
             }
 
             ggml_tensor * r;
@@ -2443,8 +2452,9 @@ int llama_apply_lora_from_file_internal(struct llama_context * ctx, const char *
             lora_tensors.clear();
 
             n_tensors++;
-            if (n_tensors % 4 == 0)
+            if (n_tensors % 4 == 0) {
                 fprintf(stderr, ".");
+            }
         }
     }
 
@@ -2462,7 +2472,7 @@ int llama_apply_lora_from_file_internal(struct llama_context * ctx, const char *
 
 int llama_apply_lora_from_file(struct llama_context * ctx, const char * path_lora, const char * path_base_model, int n_threads) {
     // try {
         return llama_apply_lora_from_file_internal(ctx, path_lora, path_base_model, n_threads);
     // } catch (const std::string & err) {
     //     fprintf(stderr, "%s: failed to apply lora adapter: %s\n", __func__, err.c_str());
     //     return 1;
@@ -2473,7 +2483,7 @@ int llama_get_kv_cache_token_count(const struct llama_context * ctx) {
     return ctx->model.kv_self.n;
 }
 
-#define LLAMA_MAX_RNG_STATE 64*1024
+#define LLAMA_MAX_RNG_STATE (64*1024)
 
 void llama_set_rng_seed(struct llama_context * ctx, int seed) {
     if (seed < 0) {
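
The added parentheses matter anywhere the macro meets an operator of higher precedence than *. For example, with the old definition a modulo check would silently misparse:

    #define BAD_MAX  64*1024     // old
    #define GOOD_MAX (64*1024)   // new
    size_t sz = 70000;
    size_t a = sz % BAD_MAX;     // parses as (sz % 64) * 1024 == 49152
    size_t b = sz % GOOD_MAX;    // sz % 65536 == 4464, as intended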
@@ -2482,7 +2492,7 @@ void llama_set_rng_seed(struct llama_context * ctx, int seed) {
     ctx->rng.seed(seed);
 }
 
-// Returns the size of the state
+// Returns the *maximum* size of the state
 size_t llama_get_state_size(const struct llama_context * ctx) {
     // we don't know size of rng until we actually serialize it. so reserve more than enough memory for its serialized state.
     // for reference, std::mt19937(1337) serializes to 6701 bytes.
@@ -2514,8 +2524,8 @@ size_t llama_get_state_size(const struct llama_context * ctx) {
 }
 
 // Copies the state to the specified destination address
-size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dest) {
-    uint8_t * out = dest;
+size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dst) {
+    uint8_t * out = dst;
 
     // copy rng
     {
@@ -2575,9 +2585,10 @@ size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dst) {
 
         if (kv_size) {
             const size_t elt_size = ggml_element_size(kv_self.k);
-            llama_buffer buffer;
-            buffer.resize(4096);
-            ggml_context * cpy_ctx = ggml_init({ buffer.size, buffer.addr, /* no_alloc */ true });
+
+            char buffer[4096];
+
+            ggml_context * cpy_ctx = ggml_init({ sizeof(buffer), buffer, /* no_alloc */ true });
             ggml_cgraph gf{};
             gf.n_threads = 1;
 
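
The 4 KiB buffer is enough because /* no_alloc */ true tells ggml_init to carve only tensor metadata out of the supplied buffer; the tensors built in cpy_ctx point at existing k/v memory. The pattern in isolation (a sketch; the matching ggml_free lands in the next hunk):

    char buffer[4096];
    struct ggml_init_params params = { sizeof(buffer), buffer, /* no_alloc */ true };
    ggml_context * cpy_ctx = ggml_init(params);
    // ... create views, ggml_cpy into the destination, ggml_graph_compute ...
    ggml_free(cpy_ctx);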
@@ -2600,10 +2611,12 @@ size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dst) {
             ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, k3d, kout3d));
             ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, v3d, vout3d));
             ggml_graph_compute(cpy_ctx, &gf);
+
+            ggml_free(cpy_ctx);
         }
     }
 
-    const size_t written = out - dest;
+    const size_t written = out - dst;
     const size_t max_size = llama_get_state_size(ctx);
 
     LLAMA_ASSERT(written <= max_size);