do not use anonymous namespaces
This commit is contained in:
parent
2e2273f4fb
commit
45d0c8089a
15 changed files with 337 additions and 380 deletions
|
@ -14,9 +14,7 @@ constexpr float rms_norm_eps = LLAMA_DEFAULT_RMS_EPS;
|
||||||
constexpr float rms_norm_eps = 5e-6f;
|
constexpr float rms_norm_eps = 5e-6f;
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
namespace {
|
static float frand() {
|
||||||
|
|
||||||
float frand() {
|
|
||||||
return (float)rand()/(float)RAND_MAX;
|
return (float)rand()/(float)RAND_MAX;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -27,19 +25,21 @@ struct random_normal_distribution {
|
||||||
float max;
|
float max;
|
||||||
};
|
};
|
||||||
|
|
||||||
void init_random_normal_distribution(struct random_normal_distribution * rnd, int seed, float mean, float std, float min, float max) {
|
static void init_random_normal_distribution(
|
||||||
|
struct random_normal_distribution * rnd, int seed, float mean, float std, float min, float max
|
||||||
|
) {
|
||||||
rnd->gen = std::mt19937(seed);
|
rnd->gen = std::mt19937(seed);
|
||||||
rnd->nd = std::normal_distribution<float>{mean, std};
|
rnd->nd = std::normal_distribution<float>{mean, std};
|
||||||
rnd->min = min;
|
rnd->min = min;
|
||||||
rnd->max = max;
|
rnd->max = max;
|
||||||
}
|
}
|
||||||
|
|
||||||
float frand_normal(struct random_normal_distribution * rnd) {
|
static float frand_normal(struct random_normal_distribution * rnd) {
|
||||||
const float r = rnd->nd(rnd->gen);
|
const float r = rnd->nd(rnd->gen);
|
||||||
return ((r < rnd->min) ? (rnd->min) : (r > rnd->max) ? (rnd->max) : r);
|
return ((r < rnd->min) ? (rnd->min) : (r > rnd->max) ? (rnd->max) : r);
|
||||||
}
|
}
|
||||||
|
|
||||||
void ggml_graph_compute_helper(std::vector<uint8_t> & buf, ggml_cgraph * graph, int n_threads) {
|
static void ggml_graph_compute_helper(std::vector<uint8_t> & buf, ggml_cgraph * graph, int n_threads) {
|
||||||
struct ggml_cplan plan = ggml_graph_plan(graph, n_threads);
|
struct ggml_cplan plan = ggml_graph_plan(graph, n_threads);
|
||||||
|
|
||||||
if (plan.work_size > 0) {
|
if (plan.work_size > 0) {
|
||||||
|
@ -50,13 +50,9 @@ void ggml_graph_compute_helper(std::vector<uint8_t> & buf, ggml_cgraph * graph,
|
||||||
ggml_graph_compute(graph, &plan);
|
ggml_graph_compute(graph, &plan);
|
||||||
}
|
}
|
||||||
|
|
||||||
struct ggml_tensor * randomize_tensor(
|
static struct ggml_tensor * randomize_tensor(
|
||||||
struct ggml_tensor * tensor,
|
struct ggml_tensor * tensor, int ndims, const int64_t ne[], float fmin, float fmax
|
||||||
int ndims,
|
) {
|
||||||
const int64_t ne[],
|
|
||||||
float fmin,
|
|
||||||
float fmax) {
|
|
||||||
|
|
||||||
switch (ndims) {
|
switch (ndims) {
|
||||||
case 1:
|
case 1:
|
||||||
for (int i0 = 0; i0 < ne[0]; i0++) {
|
for (int i0 = 0; i0 < ne[0]; i0++) {
|
||||||
|
@ -97,11 +93,9 @@ struct ggml_tensor * randomize_tensor(
|
||||||
return tensor;
|
return tensor;
|
||||||
}
|
}
|
||||||
|
|
||||||
struct ggml_tensor * randomize_tensor_normal(
|
static struct ggml_tensor * randomize_tensor_normal(
|
||||||
struct ggml_tensor * tensor,
|
struct ggml_tensor * tensor, int ndims, const int64_t ne[], struct random_normal_distribution * rnd
|
||||||
int ndims,
|
) {
|
||||||
const int64_t ne[],
|
|
||||||
struct random_normal_distribution * rnd) {
|
|
||||||
float scale = 1.0; // xavier
|
float scale = 1.0; // xavier
|
||||||
switch (ndims) {
|
switch (ndims) {
|
||||||
case 1:
|
case 1:
|
||||||
|
@ -161,7 +155,7 @@ struct llama_hparams {
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
uint32_t get_n_ff(const struct llama_hparams* hparams) {
|
static uint32_t get_n_ff(const struct llama_hparams* hparams) {
|
||||||
const uint32_t n_ff = ((2*(4*hparams->n_embd)/3 + hparams->n_mult - 1)/hparams->n_mult)*hparams->n_mult;
|
const uint32_t n_ff = ((2*(4*hparams->n_embd)/3 + hparams->n_mult - 1)/hparams->n_mult)*hparams->n_mult;
|
||||||
return n_ff;
|
return n_ff;
|
||||||
}
|
}
|
||||||
|
@ -262,7 +256,7 @@ struct llama_model_lora {
|
||||||
std::vector<llama_layer_lora> layers;
|
std::vector<llama_layer_lora> layers;
|
||||||
};
|
};
|
||||||
|
|
||||||
void init_model(struct llama_model * model) {
|
static void init_model(struct llama_model * model) {
|
||||||
const auto & hparams = model->hparams;
|
const auto & hparams = model->hparams;
|
||||||
|
|
||||||
const uint32_t n_embd = hparams.n_embd;
|
const uint32_t n_embd = hparams.n_embd;
|
||||||
|
@ -299,7 +293,7 @@ void init_model(struct llama_model * model) {
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
void init_model_lora(struct llama_model_lora * model) {
|
static void init_model_lora(struct llama_model_lora * model) {
|
||||||
const auto & hparams = model->hparams;
|
const auto & hparams = model->hparams;
|
||||||
|
|
||||||
const uint32_t n_embd = hparams.n_embd;
|
const uint32_t n_embd = hparams.n_embd;
|
||||||
|
@ -342,7 +336,7 @@ void init_model_lora(struct llama_model_lora * model) {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
void set_param_model(struct llama_model * model) {
|
static void set_param_model(struct llama_model * model) {
|
||||||
const auto& hparams = model->hparams;
|
const auto& hparams = model->hparams;
|
||||||
|
|
||||||
const uint32_t n_layer = hparams.n_layer;
|
const uint32_t n_layer = hparams.n_layer;
|
||||||
|
@ -368,7 +362,7 @@ void set_param_model(struct llama_model * model) {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
void set_param_model_lora(struct llama_model_lora * model) {
|
static void set_param_model_lora(struct llama_model_lora * model) {
|
||||||
const auto& hparams = model->hparams;
|
const auto& hparams = model->hparams;
|
||||||
|
|
||||||
const uint32_t n_layer = hparams.n_layer;
|
const uint32_t n_layer = hparams.n_layer;
|
||||||
|
@ -399,7 +393,7 @@ void set_param_model_lora(struct llama_model_lora * model) {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
void randomize_model(struct llama_model * model, int seed, float mean, float std, float min, float max) {
|
static void randomize_model(struct llama_model * model, int seed, float mean, float std, float min, float max) {
|
||||||
const auto & hparams = model->hparams;
|
const auto & hparams = model->hparams;
|
||||||
|
|
||||||
const uint32_t n_layer = hparams.n_layer;
|
const uint32_t n_layer = hparams.n_layer;
|
||||||
|
@ -428,7 +422,9 @@ void randomize_model(struct llama_model * model, int seed, float mean, float std
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
void randomize_model_lora(struct llama_model_lora * model, int seed, float mean, float std, float min, float max) {
|
static void randomize_model_lora(
|
||||||
|
struct llama_model_lora * model, int seed, float mean, float std, float min, float max
|
||||||
|
) {
|
||||||
const auto & hparams = model->hparams;
|
const auto & hparams = model->hparams;
|
||||||
|
|
||||||
const uint32_t n_layer = hparams.n_layer;
|
const uint32_t n_layer = hparams.n_layer;
|
||||||
|
@ -461,7 +457,7 @@ void randomize_model_lora(struct llama_model_lora * model, int seed, float mean,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
bool init_kv_cache(struct llama_kv_cache* cache, struct llama_model * model, int n_batch) {
|
static bool init_kv_cache(struct llama_kv_cache* cache, struct llama_model * model, int n_batch) {
|
||||||
const auto & hparams = model->hparams;
|
const auto & hparams = model->hparams;
|
||||||
|
|
||||||
const uint32_t n_ctx = hparams.n_ctx;
|
const uint32_t n_ctx = hparams.n_ctx;
|
||||||
|
@ -497,7 +493,7 @@ bool init_kv_cache(struct llama_kv_cache* cache, struct llama_model * model, int
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
bool init_kv_cache_lora(struct llama_kv_cache* cache, struct llama_model_lora * model, int n_batch) {
|
static bool init_kv_cache_lora(struct llama_kv_cache* cache, struct llama_model_lora * model, int n_batch) {
|
||||||
const auto & hparams = model->hparams;
|
const auto & hparams = model->hparams;
|
||||||
|
|
||||||
const uint32_t n_ctx = hparams.n_ctx;
|
const uint32_t n_ctx = hparams.n_ctx;
|
||||||
|
@ -533,15 +529,15 @@ bool init_kv_cache_lora(struct llama_kv_cache* cache, struct llama_model_lora *
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
struct ggml_tensor * forward(
|
static struct ggml_tensor * forward(
|
||||||
struct llama_model * model,
|
struct llama_model * model,
|
||||||
struct llama_kv_cache * cache,
|
struct llama_kv_cache * cache,
|
||||||
struct ggml_context * ctx0,
|
struct ggml_context * ctx0,
|
||||||
struct ggml_cgraph * gf,
|
struct ggml_cgraph * gf,
|
||||||
struct ggml_tensor * tokens_input,
|
struct ggml_tensor * tokens_input,
|
||||||
const int n_tokens,
|
const int n_tokens,
|
||||||
const int n_past) {
|
const int n_past
|
||||||
|
) {
|
||||||
const int N = n_tokens;
|
const int N = n_tokens;
|
||||||
|
|
||||||
struct llama_kv_cache& kv_self = *cache;
|
struct llama_kv_cache& kv_self = *cache;
|
||||||
|
@ -758,25 +754,25 @@ struct ggml_tensor * forward(
|
||||||
return inpL;
|
return inpL;
|
||||||
}
|
}
|
||||||
|
|
||||||
void assert_shape_1d(struct ggml_tensor * tensor, int64_t ne0) {
|
static void assert_shape_1d(struct ggml_tensor * tensor, int64_t ne0) {
|
||||||
GGML_ASSERT(tensor->n_dims == 1);
|
GGML_ASSERT(tensor->n_dims == 1);
|
||||||
GGML_ASSERT(tensor->ne[0] == ne0);
|
GGML_ASSERT(tensor->ne[0] == ne0);
|
||||||
}
|
}
|
||||||
|
|
||||||
void assert_shape_2d(struct ggml_tensor * tensor, int64_t ne0, int64_t ne1) {
|
static void assert_shape_2d(struct ggml_tensor * tensor, int64_t ne0, int64_t ne1) {
|
||||||
GGML_ASSERT(tensor->n_dims == 2);
|
GGML_ASSERT(tensor->n_dims == 2);
|
||||||
GGML_ASSERT(tensor->ne[0] == ne0);
|
GGML_ASSERT(tensor->ne[0] == ne0);
|
||||||
GGML_ASSERT(tensor->ne[1] == ne1);
|
GGML_ASSERT(tensor->ne[1] == ne1);
|
||||||
}
|
}
|
||||||
|
|
||||||
void assert_shape_3d(struct ggml_tensor * tensor, int64_t ne0, int64_t ne1, int64_t ne2) {
|
static void assert_shape_3d(struct ggml_tensor * tensor, int64_t ne0, int64_t ne1, int64_t ne2) {
|
||||||
GGML_ASSERT(tensor->n_dims == 3);
|
GGML_ASSERT(tensor->n_dims == 3);
|
||||||
GGML_ASSERT(tensor->ne[0] == ne0);
|
GGML_ASSERT(tensor->ne[0] == ne0);
|
||||||
GGML_ASSERT(tensor->ne[1] == ne1);
|
GGML_ASSERT(tensor->ne[1] == ne1);
|
||||||
GGML_ASSERT(tensor->ne[2] == ne2);
|
GGML_ASSERT(tensor->ne[2] == ne2);
|
||||||
}
|
}
|
||||||
|
|
||||||
void assert_shape_4d(struct ggml_tensor * tensor, int64_t ne0, int64_t ne1, int64_t ne2, int64_t ne3) {
|
static void assert_shape_4d(struct ggml_tensor * tensor, int64_t ne0, int64_t ne1, int64_t ne2, int64_t ne3) {
|
||||||
GGML_ASSERT(tensor->n_dims == 4);
|
GGML_ASSERT(tensor->n_dims == 4);
|
||||||
GGML_ASSERT(tensor->ne[0] == ne0);
|
GGML_ASSERT(tensor->ne[0] == ne0);
|
||||||
GGML_ASSERT(tensor->ne[1] == ne1);
|
GGML_ASSERT(tensor->ne[1] == ne1);
|
||||||
|
@ -784,16 +780,16 @@ void assert_shape_4d(struct ggml_tensor * tensor, int64_t ne0, int64_t ne1, int6
|
||||||
GGML_ASSERT(tensor->ne[3] == ne3);
|
GGML_ASSERT(tensor->ne[3] == ne3);
|
||||||
}
|
}
|
||||||
|
|
||||||
struct ggml_tensor * forward_batch(
|
static struct ggml_tensor * forward_batch(
|
||||||
struct llama_model * model,
|
struct llama_model * model,
|
||||||
struct llama_kv_cache * cache,
|
struct llama_kv_cache * cache,
|
||||||
struct ggml_context * ctx0,
|
struct ggml_context * ctx0,
|
||||||
struct ggml_cgraph * gf,
|
struct ggml_cgraph * gf,
|
||||||
struct ggml_tensor * tokens_input,
|
struct ggml_tensor * tokens_input,
|
||||||
const int n_tokens,
|
const int n_tokens,
|
||||||
const int n_past,
|
const int n_past,
|
||||||
const int n_batch) {
|
const int n_batch
|
||||||
|
) {
|
||||||
const int N = n_tokens;
|
const int N = n_tokens;
|
||||||
|
|
||||||
struct llama_kv_cache& kv_self = *cache;
|
struct llama_kv_cache& kv_self = *cache;
|
||||||
|
@ -1075,16 +1071,15 @@ struct ggml_tensor * forward_batch(
|
||||||
return inpL;
|
return inpL;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static struct ggml_tensor * forward_lora(
|
||||||
struct ggml_tensor * forward_lora(
|
struct llama_model_lora * model,
|
||||||
struct llama_model_lora * model,
|
struct llama_kv_cache * cache,
|
||||||
struct llama_kv_cache * cache,
|
struct ggml_context * ctx0,
|
||||||
struct ggml_context * ctx0,
|
struct ggml_cgraph * gf,
|
||||||
struct ggml_cgraph * gf,
|
struct ggml_tensor * tokens_input,
|
||||||
struct ggml_tensor * tokens_input,
|
const int n_tokens,
|
||||||
const int n_tokens,
|
const int n_past
|
||||||
const int n_past) {
|
) {
|
||||||
|
|
||||||
const int N = n_tokens;
|
const int N = n_tokens;
|
||||||
|
|
||||||
struct llama_kv_cache& kv_self = *cache;
|
struct llama_kv_cache& kv_self = *cache;
|
||||||
|
@ -1330,7 +1325,7 @@ struct ggml_tensor * forward_lora(
|
||||||
return inpL;
|
return inpL;
|
||||||
}
|
}
|
||||||
|
|
||||||
void sample_softmax(struct ggml_tensor * logits, struct ggml_tensor * probs, struct ggml_tensor * best_samples) {
|
static void sample_softmax(struct ggml_tensor * logits, struct ggml_tensor * probs, struct ggml_tensor * best_samples) {
|
||||||
assert(logits->n_dims == 2);
|
assert(logits->n_dims == 2);
|
||||||
assert(probs->n_dims == 2);
|
assert(probs->n_dims == 2);
|
||||||
assert(best_samples->n_dims == 1);
|
assert(best_samples->n_dims == 1);
|
||||||
|
@ -1361,7 +1356,10 @@ void sample_softmax(struct ggml_tensor * logits, struct ggml_tensor * probs, str
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
void sample_softmax_batch(struct ggml_context * ctx, struct ggml_tensor * logits, struct ggml_tensor * probs, struct ggml_tensor * best_samples) {
|
static void sample_softmax_batch(
|
||||||
|
struct ggml_context * ctx, struct ggml_tensor * logits, struct ggml_tensor * probs,
|
||||||
|
struct ggml_tensor * best_samples
|
||||||
|
) {
|
||||||
GGML_ASSERT(best_samples->n_dims == 2);
|
GGML_ASSERT(best_samples->n_dims == 2);
|
||||||
GGML_ASSERT(logits->n_dims == 3);
|
GGML_ASSERT(logits->n_dims == 3);
|
||||||
GGML_ASSERT(probs->n_dims == 3);
|
GGML_ASSERT(probs->n_dims == 3);
|
||||||
|
@ -1395,7 +1393,7 @@ void sample_softmax_batch(struct ggml_context * ctx, struct ggml_tensor * logits
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
void print_row(struct ggml_tensor * probs, int i) {
|
static void print_row(struct ggml_tensor * probs, int i) {
|
||||||
for (int k = 0; k < probs->ne[0]; ++k) {
|
for (int k = 0; k < probs->ne[0]; ++k) {
|
||||||
float p = ggml_get_f32_1d(probs, i*probs->ne[0] + k);
|
float p = ggml_get_f32_1d(probs, i*probs->ne[0] + k);
|
||||||
printf(" %.2f", p);
|
printf(" %.2f", p);
|
||||||
|
@ -1403,7 +1401,7 @@ void print_row(struct ggml_tensor * probs, int i) {
|
||||||
printf("\n");
|
printf("\n");
|
||||||
}
|
}
|
||||||
|
|
||||||
void print_matrix(struct ggml_tensor * probs) {
|
static void print_matrix(struct ggml_tensor * probs) {
|
||||||
assert(probs->n_dims == 2);
|
assert(probs->n_dims == 2);
|
||||||
for (int i = 0; i < probs->ne[1]; ++i) {
|
for (int i = 0; i < probs->ne[1]; ++i) {
|
||||||
for (int k = 0; k < probs->ne[0]; ++k) {
|
for (int k = 0; k < probs->ne[0]; ++k) {
|
||||||
|
@ -1414,7 +1412,7 @@ void print_matrix(struct ggml_tensor * probs) {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
void print_token(int token, int n_vocab) {
|
static void print_token(int token, int n_vocab) {
|
||||||
for (int k = 0; k < token; ++k) {
|
for (int k = 0; k < token; ++k) {
|
||||||
printf(" ");
|
printf(" ");
|
||||||
}
|
}
|
||||||
|
@ -1425,14 +1423,14 @@ void print_token(int token, int n_vocab) {
|
||||||
printf("\n");
|
printf("\n");
|
||||||
}
|
}
|
||||||
|
|
||||||
void print_tokens(struct ggml_tensor * tokens, int n_vocab) {
|
static void print_tokens(struct ggml_tensor * tokens, int n_vocab) {
|
||||||
for (int i=0; i<tokens->ne[0]; ++i) {
|
for (int i=0; i<tokens->ne[0]; ++i) {
|
||||||
int token = ggml_get_i32_1d(tokens, i);
|
int token = ggml_get_i32_1d(tokens, i);
|
||||||
print_token(token, n_vocab);
|
print_token(token, n_vocab);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
void get_example_targets(int example_id, struct ggml_tensor * tokens_input, struct ggml_tensor * targets) {
|
static void get_example_targets(int example_id, struct ggml_tensor * tokens_input, struct ggml_tensor * targets) {
|
||||||
int n_tokens = tokens_input->ne[0];
|
int n_tokens = tokens_input->ne[0];
|
||||||
int n_vocab = targets->ne[0];
|
int n_vocab = targets->ne[0];
|
||||||
float randomness = 0.0f;
|
float randomness = 0.0f;
|
||||||
|
@ -1453,7 +1451,9 @@ void get_example_targets(int example_id, struct ggml_tensor * tokens_input, stru
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
void get_example_targets_batch(struct ggml_context * ctx, int example_id, struct ggml_tensor * tokens_input, struct ggml_tensor * targets) {
|
static void get_example_targets_batch(
|
||||||
|
struct ggml_context * ctx, int example_id, struct ggml_tensor * tokens_input, struct ggml_tensor * targets
|
||||||
|
) {
|
||||||
GGML_ASSERT(tokens_input->n_dims == 2);
|
GGML_ASSERT(tokens_input->n_dims == 2);
|
||||||
GGML_ASSERT( targets->n_dims == 3);
|
GGML_ASSERT( targets->n_dims == 3);
|
||||||
int n_tokens = tokens_input->ne[0];
|
int n_tokens = tokens_input->ne[0];
|
||||||
|
@ -1476,7 +1476,7 @@ void get_example_targets_batch(struct ggml_context * ctx, int example_id, struct
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
void lshift_examples(struct ggml_tensor * tokens_input, struct ggml_tensor * targets, int n_shift) {
|
static void lshift_examples(struct ggml_tensor * tokens_input, struct ggml_tensor * targets, int n_shift) {
|
||||||
int n_tokens = tokens_input->ne[0];
|
int n_tokens = tokens_input->ne[0];
|
||||||
int n_vocab = targets->ne[0];
|
int n_vocab = targets->ne[0];
|
||||||
for (int i=0; i<n_tokens-n_shift; ++i) {
|
for (int i=0; i<n_tokens-n_shift; ++i) {
|
||||||
|
@ -1487,12 +1487,16 @@ void lshift_examples(struct ggml_tensor * tokens_input, struct ggml_tensor * tar
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
struct ggml_tensor * square_error_loss(struct ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * b) {
|
static struct ggml_tensor * square_error_loss(
|
||||||
|
struct ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * b
|
||||||
|
) {
|
||||||
// todo: instead of a-b: a[1:]-b[:-1]
|
// todo: instead of a-b: a[1:]-b[:-1]
|
||||||
return ggml_sum(ctx, ggml_sqr(ctx, ggml_sub(ctx, a, b)));
|
return ggml_sum(ctx, ggml_sqr(ctx, ggml_sub(ctx, a, b)));
|
||||||
}
|
}
|
||||||
|
|
||||||
struct ggml_tensor * cross_entropy_loss(struct ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * b) {
|
static struct ggml_tensor * cross_entropy_loss(
|
||||||
|
struct ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * b
|
||||||
|
) {
|
||||||
const float eps = 1e-3f;
|
const float eps = 1e-3f;
|
||||||
return
|
return
|
||||||
ggml_sum(ctx,
|
ggml_sum(ctx,
|
||||||
|
@ -1506,8 +1510,6 @@ struct ggml_tensor * cross_entropy_loss(struct ggml_context * ctx, struct ggml_t
|
||||||
ggml_new_f32(ctx, eps)))))));
|
ggml_new_f32(ctx, eps)))))));
|
||||||
}
|
}
|
||||||
|
|
||||||
} // namespace
|
|
||||||
|
|
||||||
int main(int argc, char ** argv) {
|
int main(int argc, char ** argv) {
|
||||||
if (argc < 1) {
|
if (argc < 1) {
|
||||||
fprintf(stderr, "usage: %s\n", argv[0]);
|
fprintf(stderr, "usage: %s\n", argv[0]);
|
||||||
|
|
|
@ -25,14 +25,13 @@
|
||||||
#include <signal.h>
|
#include <signal.h>
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
namespace {
|
|
||||||
|
|
||||||
// Used for debugging to print out beam tokens.
|
// Used for debugging to print out beam tokens.
|
||||||
struct ostream_beam_view {
|
struct ostream_beam_view {
|
||||||
llama_context * ctx;
|
llama_context * ctx;
|
||||||
llama_beam_view beam_view;
|
llama_beam_view beam_view;
|
||||||
};
|
};
|
||||||
std::ostream& operator<<(std::ostream& os, const ostream_beam_view & obv) {
|
|
||||||
|
static std::ostream & operator<<(std::ostream & os, const ostream_beam_view & obv) {
|
||||||
os << "p(" << obv.beam_view.p << ") eob(" << std::boolalpha << obv.beam_view.eob << ") tokens(";
|
os << "p(" << obv.beam_view.p << ") eob(" << std::boolalpha << obv.beam_view.eob << ") tokens(";
|
||||||
for (size_t i = 0 ; i < obv.beam_view.n_tokens ; ++i) {
|
for (size_t i = 0 ; i < obv.beam_view.n_tokens ; ++i) {
|
||||||
os << llama_token_to_piece(obv.ctx, obv.beam_view.tokens[i]);
|
os << llama_token_to_piece(obv.ctx, obv.beam_view.tokens[i]);
|
||||||
|
@ -48,7 +47,7 @@ struct beam_search_callback_data {
|
||||||
|
|
||||||
// In this case, end-of-beam (eob) is equivalent to end-of-sentence (eos) but this need not always be the same.
|
// In this case, end-of-beam (eob) is equivalent to end-of-sentence (eos) but this need not always be the same.
|
||||||
// For example, eob can be flagged due to maximum token length, stop words, etc.
|
// For example, eob can be flagged due to maximum token length, stop words, etc.
|
||||||
bool is_at_eob(const beam_search_callback_data & callback_data, const llama_token * tokens, const size_t n_tokens) {
|
static bool is_at_eob(const beam_search_callback_data & callback_data, const llama_token * tokens, size_t n_tokens) {
|
||||||
return n_tokens && tokens[n_tokens-1] == llama_token_eos(callback_data.ctx);
|
return n_tokens && tokens[n_tokens-1] == llama_token_eos(callback_data.ctx);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -58,7 +57,7 @@ bool is_at_eob(const beam_search_callback_data & callback_data, const llama_toke
|
||||||
// * When all beams converge to a common prefix, they are made available in beams_state.beams[0].
|
// * When all beams converge to a common prefix, they are made available in beams_state.beams[0].
|
||||||
// This is also called when the stop condition is met.
|
// This is also called when the stop condition is met.
|
||||||
// Collect tokens into std::vector<llama_token> response which is pointed to by callback_data.
|
// Collect tokens into std::vector<llama_token> response which is pointed to by callback_data.
|
||||||
void beam_search_callback(void * callback_data_ptr, llama_beams_state beams_state) {
|
static void beam_search_callback(void * callback_data_ptr, llama_beams_state beams_state) {
|
||||||
auto& callback_data = *static_cast<beam_search_callback_data*>(callback_data_ptr);
|
auto& callback_data = *static_cast<beam_search_callback_data*>(callback_data_ptr);
|
||||||
// Mark beams as EOS as needed.
|
// Mark beams as EOS as needed.
|
||||||
for (size_t i = 0 ; i < beams_state.n_beams ; ++i) {
|
for (size_t i = 0 ; i < beams_state.n_beams ; ++i) {
|
||||||
|
@ -84,8 +83,6 @@ void beam_search_callback(void * callback_data_ptr, llama_beams_state beams_stat
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
} // namespace
|
|
||||||
|
|
||||||
int main(int argc, char ** argv)
|
int main(int argc, char ** argv)
|
||||||
{
|
{
|
||||||
gpt_params params;
|
gpt_params params;
|
||||||
|
|
|
@ -115,9 +115,7 @@ struct TransformerWeights {
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
namespace {
|
static void malloc_weights(TransformerWeights* w, Config* p, bool shared_weights) {
|
||||||
|
|
||||||
void malloc_weights(TransformerWeights* w, Config* p, bool shared_weights) {
|
|
||||||
// we calloc instead of malloc to keep valgrind happy
|
// we calloc instead of malloc to keep valgrind happy
|
||||||
w->token_embedding_table = new float[p->vocab_size * p->dim]();
|
w->token_embedding_table = new float[p->vocab_size * p->dim]();
|
||||||
printf("[%s:AK] Allocating [%d] x [%d] = [%d] float space for w->token_embedding_table\n",__func__,p->vocab_size , p->dim, p->vocab_size * p->dim);
|
printf("[%s:AK] Allocating [%d] x [%d] = [%d] float space for w->token_embedding_table\n",__func__,p->vocab_size , p->dim, p->vocab_size * p->dim);
|
||||||
|
@ -160,7 +158,7 @@ void malloc_weights(TransformerWeights* w, Config* p, bool shared_weights) {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
int checkpoint_init_weights(TransformerWeights *w, Config* p, FILE* f, bool shared_weights) {
|
static int checkpoint_init_weights(TransformerWeights *w, Config* p, FILE* f, bool shared_weights) {
|
||||||
if (fread(w->token_embedding_table, sizeof(float), p->vocab_size * p->dim, f) != static_cast<size_t>(p->vocab_size * p->dim)) return 1;
|
if (fread(w->token_embedding_table, sizeof(float), p->vocab_size * p->dim, f) != static_cast<size_t>(p->vocab_size * p->dim)) return 1;
|
||||||
if (fread(w->rms_att_weight, sizeof(float), p->n_layers * p->dim, f) != static_cast<size_t>(p->n_layers * p->dim)) return 1;
|
if (fread(w->rms_att_weight, sizeof(float), p->n_layers * p->dim, f) != static_cast<size_t>(p->n_layers * p->dim)) return 1;
|
||||||
if (fread(w->wq, sizeof(float), p->n_layers * p->dim * p->dim, f) != static_cast<size_t>(p->n_layers * p->dim * p->dim)) return 1;
|
if (fread(w->wq, sizeof(float), p->n_layers * p->dim * p->dim, f) != static_cast<size_t>(p->n_layers * p->dim * p->dim)) return 1;
|
||||||
|
@ -191,7 +189,7 @@ int checkpoint_init_weights(TransformerWeights *w, Config* p, FILE* f, bool shar
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
void print_sample_weights(TransformerWeights *w){
|
static void print_sample_weights(TransformerWeights *w){
|
||||||
printf("----- Quick print of first of the weight vales of all the variables\n");
|
printf("----- Quick print of first of the weight vales of all the variables\n");
|
||||||
printf("%f\n", w->token_embedding_table[0]);
|
printf("%f\n", w->token_embedding_table[0]);
|
||||||
printf("%f\n", w->rms_att_weight[0]);
|
printf("%f\n", w->rms_att_weight[0]);
|
||||||
|
@ -326,7 +324,7 @@ struct train_params {
|
||||||
int mem_compute1_gb;
|
int mem_compute1_gb;
|
||||||
};
|
};
|
||||||
|
|
||||||
void print_params(struct my_llama_hparams * params) {
|
static void print_params(struct my_llama_hparams * params) {
|
||||||
printf("%s: n_vocab: %d\n", __func__, params->n_vocab);
|
printf("%s: n_vocab: %d\n", __func__, params->n_vocab);
|
||||||
printf("%s: n_ctx: %d\n", __func__, params->n_ctx);
|
printf("%s: n_ctx: %d\n", __func__, params->n_ctx);
|
||||||
printf("%s: n_embd: %d\n", __func__, params->n_embd);
|
printf("%s: n_embd: %d\n", __func__, params->n_embd);
|
||||||
|
@ -337,7 +335,7 @@ void print_params(struct my_llama_hparams * params) {
|
||||||
printf("%s: n_rot: %d\n", __func__, params->n_rot);
|
printf("%s: n_rot: %d\n", __func__, params->n_rot);
|
||||||
}
|
}
|
||||||
|
|
||||||
void init_model(struct my_llama_model * model) {
|
static void init_model(struct my_llama_model * model) {
|
||||||
const auto & hparams = model->hparams;
|
const auto & hparams = model->hparams;
|
||||||
|
|
||||||
const uint32_t n_embd = hparams.n_embd;
|
const uint32_t n_embd = hparams.n_embd;
|
||||||
|
@ -410,17 +408,17 @@ void init_model(struct my_llama_model * model) {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
float get_f32_2d(struct ggml_tensor * tensor, int64_t i0, int64_t i1) {
|
static float get_f32_2d(struct ggml_tensor * tensor, int64_t i0, int64_t i1) {
|
||||||
float * ptr = (float *) ((char *) tensor->data + i0*tensor->nb[0] + i1*tensor->nb[1]);
|
float * ptr = (float *) ((char *) tensor->data + i0*tensor->nb[0] + i1*tensor->nb[1]);
|
||||||
return *ptr;
|
return *ptr;
|
||||||
}
|
}
|
||||||
|
|
||||||
int32_t get_i32_2d(struct ggml_tensor * tensor, int64_t i0, int64_t i1) {
|
static int32_t get_i32_2d(struct ggml_tensor * tensor, int64_t i0, int64_t i1) {
|
||||||
int32_t * ptr = (int32_t *) ((char *) tensor->data + i0*tensor->nb[0] + i1*tensor->nb[1]);
|
int32_t * ptr = (int32_t *) ((char *) tensor->data + i0*tensor->nb[0] + i1*tensor->nb[1]);
|
||||||
return *ptr;
|
return *ptr;
|
||||||
}
|
}
|
||||||
|
|
||||||
void print_row(struct ggml_tensor * probs, int i) {
|
static void print_row(struct ggml_tensor * probs, int i) {
|
||||||
for (int k = 0; k < probs->ne[0]; ++k) {
|
for (int k = 0; k < probs->ne[0]; ++k) {
|
||||||
float p = get_f32_2d(probs, k, i);
|
float p = get_f32_2d(probs, k, i);
|
||||||
printf(" %f", p);
|
printf(" %f", p);
|
||||||
|
@ -428,7 +426,7 @@ void print_row(struct ggml_tensor * probs, int i) {
|
||||||
printf("\n");
|
printf("\n");
|
||||||
}
|
}
|
||||||
|
|
||||||
void print_matrix(struct ggml_tensor * probs) {
|
static void print_matrix(struct ggml_tensor * probs) {
|
||||||
assert(probs->n_dims == 2);
|
assert(probs->n_dims == 2);
|
||||||
for (int i = 0; i < probs->ne[1]; ++i) {
|
for (int i = 0; i < probs->ne[1]; ++i) {
|
||||||
for (int k = 0; k < probs->ne[0]; ++k) {
|
for (int k = 0; k < probs->ne[0]; ++k) {
|
||||||
|
@ -551,7 +549,7 @@ std::string llama_escape_whitespaces(const std::string& text) {
|
||||||
return out.str();
|
return out.str();
|
||||||
}
|
}
|
||||||
|
|
||||||
void load_vocab(const char *filename, Config *config, struct llama_vocab *vocab) {
|
static void load_vocab(const char *filename, Config *config, struct llama_vocab *vocab) {
|
||||||
if (is_ggml_file(filename)) {
|
if (is_ggml_file(filename)) {
|
||||||
struct ggml_context * ctx_data = NULL;
|
struct ggml_context * ctx_data = NULL;
|
||||||
|
|
||||||
|
@ -639,7 +637,7 @@ void load_vocab(const char *filename, Config *config, struct llama_vocab *vocab)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
void convert_weights_ak_to_gg(struct ggml_tensor * gg_weights, const float * karpathy_weights) {
|
static void convert_weights_ak_to_gg(struct ggml_tensor * gg_weights, const float * karpathy_weights) {
|
||||||
int ct;
|
int ct;
|
||||||
switch (gg_weights->n_dims){
|
switch (gg_weights->n_dims){
|
||||||
case 1:
|
case 1:
|
||||||
|
@ -675,7 +673,9 @@ void convert_weights_ak_to_gg(struct ggml_tensor * gg_weights, const float * kar
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
void save_as_llama_model(struct llama_vocab * vocab, struct my_llama_model * model, TransformerWeights* w, const char * filename) {
|
static void save_as_llama_model(
|
||||||
|
struct llama_vocab * vocab, struct my_llama_model * model, TransformerWeights* w, const char * filename
|
||||||
|
) {
|
||||||
// convert AK weights into GG weights one by one.
|
// convert AK weights into GG weights one by one.
|
||||||
// w->token_embedding_table -> model->tok_embeddings
|
// w->token_embedding_table -> model->tok_embeddings
|
||||||
// float* -> struct ggml_tensor
|
// float* -> struct ggml_tensor
|
||||||
|
@ -787,7 +787,7 @@ void save_as_llama_model(struct llama_vocab * vocab, struct my_llama_model * mod
|
||||||
gguf_free(ctx);
|
gguf_free(ctx);
|
||||||
}
|
}
|
||||||
|
|
||||||
struct train_params get_default_train_params() {
|
static struct train_params get_default_train_params() {
|
||||||
struct train_params params;
|
struct train_params params;
|
||||||
params.fn_vocab_model = "models/7B/ggml-model-f16.gguf";
|
params.fn_vocab_model = "models/7B/ggml-model-f16.gguf";
|
||||||
params.fn_llama2c_output_model = "ak_llama_model.bin";
|
params.fn_llama2c_output_model = "ak_llama_model.bin";
|
||||||
|
@ -837,7 +837,7 @@ struct train_params get_default_train_params() {
|
||||||
return params;
|
return params;
|
||||||
}
|
}
|
||||||
|
|
||||||
void print_usage(int /*argc*/, char ** argv, const struct train_params * params) {
|
static void print_usage(int /*argc*/, char ** argv, const struct train_params * params) {
|
||||||
fprintf(stderr, "usage: %s [options]\n", argv[0]);
|
fprintf(stderr, "usage: %s [options]\n", argv[0]);
|
||||||
fprintf(stderr, "\n");
|
fprintf(stderr, "\n");
|
||||||
fprintf(stderr, "options:\n");
|
fprintf(stderr, "options:\n");
|
||||||
|
@ -848,7 +848,7 @@ void print_usage(int /*argc*/, char ** argv, const struct train_params * params)
|
||||||
fprintf(stderr, "\n");
|
fprintf(stderr, "\n");
|
||||||
}
|
}
|
||||||
|
|
||||||
bool params_parse(int argc, char ** argv, struct train_params * params) {
|
static bool params_parse(int argc, char ** argv, struct train_params * params) {
|
||||||
bool invalid_param = false;
|
bool invalid_param = false;
|
||||||
bool reqd_param_found = false;
|
bool reqd_param_found = false;
|
||||||
std::string arg;
|
std::string arg;
|
||||||
|
@ -903,7 +903,7 @@ bool params_parse(int argc, char ** argv, struct train_params * params) {
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
std::string basename(const std::string &path) {
|
static std::string basename(const std::string &path) {
|
||||||
size_t pos = path.find_last_of("/\\");
|
size_t pos = path.find_last_of("/\\");
|
||||||
if (pos == std::string::npos) {
|
if (pos == std::string::npos) {
|
||||||
return path;
|
return path;
|
||||||
|
@ -911,8 +911,6 @@ std::string basename(const std::string &path) {
|
||||||
return path.substr(pos + 1);
|
return path.substr(pos + 1);
|
||||||
}
|
}
|
||||||
|
|
||||||
} // namespace
|
|
||||||
|
|
||||||
int main(int argc, char ** argv) {
|
int main(int argc, char ** argv) {
|
||||||
struct train_params params = get_default_train_params();
|
struct train_params params = get_default_train_params();
|
||||||
if (!params_parse(argc, argv, &params)) {
|
if (!params_parse(argc, argv, &params)) {
|
||||||
|
|
|
@ -13,16 +13,14 @@
|
||||||
#define MIN(a, b) ((a) < (b) ? (a) : (b))
|
#define MIN(a, b) ((a) < (b) ? (a) : (b))
|
||||||
#define MAX(a, b) ((a) > (b) ? (a) : (b))
|
#define MAX(a, b) ((a) > (b) ? (a) : (b))
|
||||||
|
|
||||||
namespace {
|
template <typename T>
|
||||||
|
static std::string to_string(const T & val) {
|
||||||
template<typename T>
|
|
||||||
std::string to_string(const T & val) {
|
|
||||||
std::stringstream ss;
|
std::stringstream ss;
|
||||||
ss << val;
|
ss << val;
|
||||||
return ss.str();
|
return ss.str();
|
||||||
}
|
}
|
||||||
|
|
||||||
bool gguf_ex_write(const std::string & fname) {
|
static bool gguf_ex_write(const std::string & fname) {
|
||||||
struct gguf_context * ctx = gguf_init_empty();
|
struct gguf_context * ctx = gguf_init_empty();
|
||||||
|
|
||||||
gguf_set_val_u8 (ctx, "some.parameter.uint8", 0x12);
|
gguf_set_val_u8 (ctx, "some.parameter.uint8", 0x12);
|
||||||
|
@ -87,7 +85,7 @@ bool gguf_ex_write(const std::string & fname) {
|
||||||
}
|
}
|
||||||
|
|
||||||
// just read tensor info
|
// just read tensor info
|
||||||
bool gguf_ex_read_0(const std::string & fname) {
|
static bool gguf_ex_read_0(const std::string & fname) {
|
||||||
struct gguf_init_params params = {
|
struct gguf_init_params params = {
|
||||||
/*.no_alloc = */ false,
|
/*.no_alloc = */ false,
|
||||||
/*.ctx = */ NULL,
|
/*.ctx = */ NULL,
|
||||||
|
@ -145,7 +143,7 @@ bool gguf_ex_read_0(const std::string & fname) {
|
||||||
}
|
}
|
||||||
|
|
||||||
// read and create ggml_context containing the tensors and their data
|
// read and create ggml_context containing the tensors and their data
|
||||||
bool gguf_ex_read_1(const std::string & fname) {
|
static bool gguf_ex_read_1(const std::string & fname) {
|
||||||
struct ggml_context * ctx_data = NULL;
|
struct ggml_context * ctx_data = NULL;
|
||||||
|
|
||||||
struct gguf_init_params params = {
|
struct gguf_init_params params = {
|
||||||
|
@ -229,8 +227,6 @@ bool gguf_ex_read_1(const std::string & fname) {
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
} // namespace
|
|
||||||
|
|
||||||
int main(int argc, char ** argv) {
|
int main(int argc, char ** argv) {
|
||||||
if (argc < 3) {
|
if (argc < 3) {
|
||||||
printf("usage: %s data.gguf r|w\n", argv[0]);
|
printf("usage: %s data.gguf r|w\n", argv[0]);
|
||||||
|
|
|
@ -33,17 +33,16 @@
|
||||||
#pragma warning(disable: 4244 4267) // possible loss of data
|
#pragma warning(disable: 4244 4267) // possible loss of data
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
namespace {
|
static llama_context ** g_ctx;
|
||||||
|
static llama_model ** g_model;
|
||||||
|
static gpt_params * g_params;
|
||||||
|
static std::vector<llama_token> * g_input_tokens;
|
||||||
|
static std::ostringstream * g_output_ss;
|
||||||
|
static std::vector<llama_token> * g_output_tokens;
|
||||||
|
static bool is_interacting = false;
|
||||||
|
|
||||||
llama_context ** g_ctx;
|
|
||||||
llama_model ** g_model;
|
|
||||||
gpt_params * g_params;
|
|
||||||
std::vector<llama_token> * g_input_tokens;
|
|
||||||
std::ostringstream * g_output_ss;
|
|
||||||
std::vector<llama_token> * g_output_tokens;
|
|
||||||
bool is_interacting = false;
|
|
||||||
|
|
||||||
void write_logfile(
|
static void write_logfile(
|
||||||
const llama_context * ctx, const gpt_params & params, const llama_model * model,
|
const llama_context * ctx, const gpt_params & params, const llama_model * model,
|
||||||
const std::vector<llama_token> & input_tokens, const std::string & output,
|
const std::vector<llama_token> & input_tokens, const std::string & output,
|
||||||
const std::vector<llama_token> & output_tokens
|
const std::vector<llama_token> & output_tokens
|
||||||
|
@ -88,7 +87,7 @@ void write_logfile(
|
||||||
}
|
}
|
||||||
|
|
||||||
#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) || defined (_WIN32)
|
#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) || defined (_WIN32)
|
||||||
void sigint_handler(int signo) {
|
static void sigint_handler(int signo) {
|
||||||
if (signo == SIGINT) {
|
if (signo == SIGINT) {
|
||||||
if (!is_interacting) {
|
if (!is_interacting) {
|
||||||
is_interacting = true;
|
is_interacting = true;
|
||||||
|
@ -103,8 +102,6 @@ void sigint_handler(int signo) {
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
} // namespace
|
|
||||||
|
|
||||||
int main(int argc, char ** argv) {
|
int main(int argc, char ** argv) {
|
||||||
gpt_params params;
|
gpt_params params;
|
||||||
g_params = &params;
|
g_params = &params;
|
||||||
|
|
|
@ -28,11 +28,10 @@ struct results_log_softmax {
|
||||||
float prob;
|
float prob;
|
||||||
};
|
};
|
||||||
|
|
||||||
namespace {
|
static void write_logfile(
|
||||||
|
const llama_context * ctx, const gpt_params & params, const llama_model * model,
|
||||||
void write_logfile(const llama_context * ctx, const gpt_params & params,
|
const struct results_perplexity & results
|
||||||
const llama_model * model, const struct results_perplexity & results) {
|
) {
|
||||||
|
|
||||||
if (params.logdir.empty()) {
|
if (params.logdir.empty()) {
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
@ -78,7 +77,7 @@ void write_logfile(const llama_context * ctx, const gpt_params & params,
|
||||||
fclose(logfile);
|
fclose(logfile);
|
||||||
}
|
}
|
||||||
|
|
||||||
std::vector<float> softmax(const std::vector<float>& logits) {
|
static std::vector<float> softmax(const std::vector<float>& logits) {
|
||||||
std::vector<float> probs(logits.size());
|
std::vector<float> probs(logits.size());
|
||||||
float max_logit = logits[0];
|
float max_logit = logits[0];
|
||||||
for (float v : logits) max_logit = std::max(max_logit, v);
|
for (float v : logits) max_logit = std::max(max_logit, v);
|
||||||
|
@ -94,7 +93,7 @@ std::vector<float> softmax(const std::vector<float>& logits) {
|
||||||
return probs;
|
return probs;
|
||||||
}
|
}
|
||||||
|
|
||||||
results_log_softmax log_softmax(int n_vocab, const float * logits, int tok) {
|
static results_log_softmax log_softmax(int n_vocab, const float * logits, int tok) {
|
||||||
float max_logit = logits[0];
|
float max_logit = logits[0];
|
||||||
for (int i = 1; i < n_vocab; ++i) max_logit = std::max(max_logit, logits[i]);
|
for (int i = 1; i < n_vocab; ++i) max_logit = std::max(max_logit, logits[i]);
|
||||||
double sum_exp = 0.0;
|
double sum_exp = 0.0;
|
||||||
|
@ -102,9 +101,10 @@ results_log_softmax log_softmax(int n_vocab, const float * logits, int tok) {
|
||||||
return {logits[tok] - max_logit - log(sum_exp), logits[tok], expf(logits[tok] - max_logit) / (float) sum_exp};
|
return {logits[tok] - max_logit - log(sum_exp), logits[tok], expf(logits[tok] - max_logit) / (float) sum_exp};
|
||||||
}
|
}
|
||||||
|
|
||||||
void process_logits(int n_vocab, const float * logits, const int * tokens, int n_token, std::vector<std::thread> & workers,
|
static void process_logits(
|
||||||
double & nll, double & nll2, float * logit_history, float * prob_history) {
|
int n_vocab, const float * logits, const int * tokens, int n_token, std::vector<std::thread> & workers,
|
||||||
|
double & nll, double & nll2, float * logit_history, float * prob_history
|
||||||
|
) {
|
||||||
std::mutex mutex;
|
std::mutex mutex;
|
||||||
int counter = 0;
|
int counter = 0;
|
||||||
auto compute = [&mutex, &counter, &nll, &nll2, logit_history, prob_history, n_vocab, logits, tokens, n_token] () {
|
auto compute = [&mutex, &counter, &nll, &nll2, logit_history, prob_history, n_vocab, logits, tokens, n_token] () {
|
||||||
|
@ -402,8 +402,9 @@ results_perplexity perplexity(llama_context * ctx, const gpt_params & params) {
|
||||||
return {tokens, ppl, logit_history, prob_history};
|
return {tokens, ppl, logit_history, prob_history};
|
||||||
}
|
}
|
||||||
|
|
||||||
std::vector<float> hellaswag_evaluate_tokens(llama_context * ctx, const std::vector<int>& tokens, int n_past, int n_batch,
|
static std::vector<float> hellaswag_evaluate_tokens(
|
||||||
int n_vocab, int n_thread) {
|
llama_context * ctx, const std::vector<int>& tokens, int n_past, int n_batch, int n_vocab, int n_thread
|
||||||
|
) {
|
||||||
std::vector<float> result;
|
std::vector<float> result;
|
||||||
result.reserve(tokens.size() * n_vocab);
|
result.reserve(tokens.size() * n_vocab);
|
||||||
size_t n_chunk = (tokens.size() + n_batch - 1)/n_batch;
|
size_t n_chunk = (tokens.size() + n_batch - 1)/n_batch;
|
||||||
|
@ -423,7 +424,7 @@ std::vector<float> hellaswag_evaluate_tokens(llama_context * ctx, const std::vec
|
||||||
return result;
|
return result;
|
||||||
}
|
}
|
||||||
|
|
||||||
void hellaswag_score(llama_context * ctx, const gpt_params & params) {
|
static void hellaswag_score(llama_context * ctx, const gpt_params & params) {
|
||||||
// Calculates hellaswag score (acc_norm) from prompt
|
// Calculates hellaswag score (acc_norm) from prompt
|
||||||
//
|
//
|
||||||
// Data extracted from the HellaSwag validation dataset (MIT license) https://github.com/rowanz/hellaswag/blob/master/data/hellaswag_val.jsonl
|
// Data extracted from the HellaSwag validation dataset (MIT license) https://github.com/rowanz/hellaswag/blob/master/data/hellaswag_val.jsonl
|
||||||
|
@ -653,8 +654,6 @@ void hellaswag_score(llama_context * ctx, const gpt_params & params) {
|
||||||
printf("\n");
|
printf("\n");
|
||||||
}
|
}
|
||||||
|
|
||||||
} // namespace
|
|
||||||
|
|
||||||
int main(int argc, char ** argv) {
|
int main(int argc, char ** argv) {
|
||||||
gpt_params params;
|
gpt_params params;
|
||||||
|
|
||||||
|
|
|
@ -44,9 +44,7 @@ struct error_stats {
|
||||||
uint64_t error_histogram[HISTOGRAM_BUCKETS];
|
uint64_t error_histogram[HISTOGRAM_BUCKETS];
|
||||||
};
|
};
|
||||||
|
|
||||||
namespace {
|
static void quantize_stats_print_usage(int /*argc*/, char ** argv) {
|
||||||
|
|
||||||
void quantize_stats_print_usage(int /*argc*/, char ** argv) {
|
|
||||||
quantize_stats_params params;
|
quantize_stats_params params;
|
||||||
fprintf(stderr, "usage: %s [options]\n", argv[0]);
|
fprintf(stderr, "usage: %s [options]\n", argv[0]);
|
||||||
fprintf(stderr, "\n");
|
fprintf(stderr, "\n");
|
||||||
|
@ -72,7 +70,7 @@ void quantize_stats_print_usage(int /*argc*/, char ** argv) {
|
||||||
}
|
}
|
||||||
|
|
||||||
// Check if a layer is included/excluded by command line
|
// Check if a layer is included/excluded by command line
|
||||||
bool layer_included(const quantize_stats_params & params, const std::string & layer) {
|
static bool layer_included(const quantize_stats_params & params, const std::string & layer) {
|
||||||
for (const auto& excluded : params.exclude_layers) {
|
for (const auto& excluded : params.exclude_layers) {
|
||||||
if (std::regex_search(layer, std::regex(excluded))) {
|
if (std::regex_search(layer, std::regex(excluded))) {
|
||||||
return false;
|
return false;
|
||||||
|
@ -87,7 +85,7 @@ bool layer_included(const quantize_stats_params & params, const std::string & la
|
||||||
}
|
}
|
||||||
|
|
||||||
// Update error statistics given vectors with the before/after result of quantization
|
// Update error statistics given vectors with the before/after result of quantization
|
||||||
void update_error_stats(int64_t nelements, const float * input, const float * output, error_stats & stats) {
|
static void update_error_stats(int64_t nelements, const float * input, const float * output, error_stats & stats) {
|
||||||
for (int64_t i = 0; i < nelements; i++) {
|
for (int64_t i = 0; i < nelements; i++) {
|
||||||
double diff = input[i] - output[i];
|
double diff = input[i] - output[i];
|
||||||
stats.total_error += diff * diff;
|
stats.total_error += diff * diff;
|
||||||
|
@ -97,7 +95,7 @@ void update_error_stats(int64_t nelements, const float * input, const float * ou
|
||||||
stats.num_samples += nelements;
|
stats.num_samples += nelements;
|
||||||
}
|
}
|
||||||
|
|
||||||
void combine_error_stats(error_stats & into, const error_stats & from) {
|
static void combine_error_stats(error_stats & into, const error_stats & from) {
|
||||||
into.num_samples += from.num_samples;
|
into.num_samples += from.num_samples;
|
||||||
into.total_error += from.total_error;
|
into.total_error += from.total_error;
|
||||||
if (from.max_error > into.max_error) into.max_error = from.max_error;
|
if (from.max_error > into.max_error) into.max_error = from.max_error;
|
||||||
|
@ -117,7 +115,7 @@ double find_quantile(const error_stats & stats, double quantile) {
|
||||||
return INFINITY;
|
return INFINITY;
|
||||||
}
|
}
|
||||||
|
|
||||||
void print_error_stats(const std::string & name, const error_stats & stats, bool print_histogram) {
|
static void print_error_stats(const std::string & name, const error_stats & stats, bool print_histogram) {
|
||||||
double rmse = sqrt(stats.total_error / (double) stats.num_samples);
|
double rmse = sqrt(stats.total_error / (double) stats.num_samples);
|
||||||
double median = find_quantile(stats, .5);
|
double median = find_quantile(stats, .5);
|
||||||
double pct95 = find_quantile(stats, .95);
|
double pct95 = find_quantile(stats, .95);
|
||||||
|
@ -134,7 +132,7 @@ void print_error_stats(const std::string & name, const error_stats & stats, bool
|
||||||
}
|
}
|
||||||
|
|
||||||
// copied from ggml.h - verify that we can access this as a flat array
|
// copied from ggml.h - verify that we can access this as a flat array
|
||||||
bool tensor_is_contiguous(const struct ggml_tensor * tensor) {
|
static bool tensor_is_contiguous(const struct ggml_tensor * tensor) {
|
||||||
static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
|
static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
|
||||||
|
|
||||||
return
|
return
|
||||||
|
@ -144,17 +142,10 @@ bool tensor_is_contiguous(const struct ggml_tensor * tensor) {
|
||||||
tensor->nb[3] == tensor->nb[2]*tensor->ne[2];
|
tensor->nb[3] == tensor->nb[2]*tensor->ne[2];
|
||||||
}
|
}
|
||||||
|
|
||||||
void test_roundtrip_on_chunk(
|
static void test_roundtrip_on_chunk(
|
||||||
const ggml_tensor * layer,
|
const ggml_tensor * layer, int64_t offset, int64_t chunk_size, const ggml_type_traits_t & qfns, bool use_reference,
|
||||||
int64_t offset,
|
float * input_scratch, char * quantized_scratch, float * output_scratch, error_stats & stats
|
||||||
int64_t chunk_size,
|
) {
|
||||||
const ggml_type_traits_t & qfns,
|
|
||||||
bool use_reference,
|
|
||||||
float * input_scratch,
|
|
||||||
char * quantized_scratch,
|
|
||||||
float * output_scratch,
|
|
||||||
error_stats & stats) {
|
|
||||||
|
|
||||||
if (layer->type == GGML_TYPE_F16) {
|
if (layer->type == GGML_TYPE_F16) {
|
||||||
for (int i = 0; i < chunk_size; i++) {
|
for (int i = 0; i < chunk_size; i++) {
|
||||||
input_scratch[i] = ggml_get_f32_1d(layer, i + offset);
|
input_scratch[i] = ggml_get_f32_1d(layer, i + offset);
|
||||||
|
@ -175,18 +166,11 @@ void test_roundtrip_on_chunk(
|
||||||
|
|
||||||
|
|
||||||
// Run quantization function for a single layer and update error stats
|
// Run quantization function for a single layer and update error stats
|
||||||
void test_roundtrip_on_layer(
|
static void test_roundtrip_on_layer(
|
||||||
std::string & name,
|
std::string & name, bool print_layer_stats, const ggml_type_traits_t & qfns, bool use_reference,
|
||||||
bool print_layer_stats,
|
const ggml_tensor * layer, std::vector<float> & input_scratch, std::vector<char> & quantized_scratch,
|
||||||
const ggml_type_traits_t & qfns,
|
std::vector<float> & output_scratch, error_stats & total_error, int max_thread = 0
|
||||||
bool use_reference,
|
) {
|
||||||
const ggml_tensor * layer,
|
|
||||||
std::vector<float> & input_scratch,
|
|
||||||
std::vector<char> & quantized_scratch,
|
|
||||||
std::vector<float> & output_scratch,
|
|
||||||
error_stats & total_error,
|
|
||||||
int max_thread = 0) {
|
|
||||||
|
|
||||||
assert(tensor_is_contiguous(layer));
|
assert(tensor_is_contiguous(layer));
|
||||||
error_stats layer_error {};
|
error_stats layer_error {};
|
||||||
uint64_t nelements = ggml_nelements(layer);
|
uint64_t nelements = ggml_nelements(layer);
|
||||||
|
@ -239,8 +223,6 @@ void test_roundtrip_on_layer(
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
} // namespace
|
|
||||||
|
|
||||||
int main(int argc, char ** argv) {
|
int main(int argc, char ** argv) {
|
||||||
ggml_time_init();
|
ggml_time_init();
|
||||||
|
|
||||||
|
|
|
@ -7,15 +7,13 @@
|
||||||
#include <vector>
|
#include <vector>
|
||||||
#include <string>
|
#include <string>
|
||||||
|
|
||||||
namespace {
|
|
||||||
|
|
||||||
struct quant_option {
|
struct quant_option {
|
||||||
std::string name;
|
std::string name;
|
||||||
llama_ftype ftype;
|
llama_ftype ftype;
|
||||||
std::string desc;
|
std::string desc;
|
||||||
};
|
};
|
||||||
|
|
||||||
const std::vector<struct quant_option> QUANT_OPTIONS = {
|
static const std::vector<struct quant_option> QUANT_OPTIONS = {
|
||||||
{ "Q4_0", LLAMA_FTYPE_MOSTLY_Q4_0, " 3.56G, +0.2166 ppl @ LLaMA-v1-7B", },
|
{ "Q4_0", LLAMA_FTYPE_MOSTLY_Q4_0, " 3.56G, +0.2166 ppl @ LLaMA-v1-7B", },
|
||||||
{ "Q4_1", LLAMA_FTYPE_MOSTLY_Q4_1, " 3.90G, +0.1585 ppl @ LLaMA-v1-7B", },
|
{ "Q4_1", LLAMA_FTYPE_MOSTLY_Q4_1, " 3.90G, +0.1585 ppl @ LLaMA-v1-7B", },
|
||||||
{ "Q5_0", LLAMA_FTYPE_MOSTLY_Q5_0, " 4.33G, +0.0683 ppl @ LLaMA-v1-7B", },
|
{ "Q5_0", LLAMA_FTYPE_MOSTLY_Q5_0, " 4.33G, +0.0683 ppl @ LLaMA-v1-7B", },
|
||||||
|
@ -42,7 +40,7 @@ const std::vector<struct quant_option> QUANT_OPTIONS = {
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|
||||||
bool try_parse_ftype(const std::string & ftype_str_in, llama_ftype & ftype, std::string & ftype_str_out) {
|
static bool try_parse_ftype(const std::string & ftype_str_in, llama_ftype & ftype, std::string & ftype_str_out) {
|
||||||
std::string ftype_str;
|
std::string ftype_str;
|
||||||
|
|
||||||
for (auto ch : ftype_str_in) {
|
for (auto ch : ftype_str_in) {
|
||||||
|
@ -74,7 +72,7 @@ bool try_parse_ftype(const std::string & ftype_str_in, llama_ftype & ftype, std:
|
||||||
// usage:
|
// usage:
|
||||||
// ./quantize [--allow-requantize] [--leave-output-tensor] models/llama/ggml-model.gguf [models/llama/ggml-model-quant.gguf] type [nthreads]
|
// ./quantize [--allow-requantize] [--leave-output-tensor] models/llama/ggml-model.gguf [models/llama/ggml-model-quant.gguf] type [nthreads]
|
||||||
//
|
//
|
||||||
void usage(const char * executable) {
|
static void usage(const char * executable) {
|
||||||
printf("usage: %s [--help] [--allow-requantize] [--leave-output-tensor] model-f32.gguf [model-quant.gguf] type [nthreads]\n\n", executable);
|
printf("usage: %s [--help] [--allow-requantize] [--leave-output-tensor] model-f32.gguf [model-quant.gguf] type [nthreads]\n\n", executable);
|
||||||
printf(" --allow-requantize: Allows requantizing tensors that have already been quantized. Warning: This can severely reduce quality compared to quantizing from 16bit or 32bit\n");
|
printf(" --allow-requantize: Allows requantizing tensors that have already been quantized. Warning: This can severely reduce quality compared to quantizing from 16bit or 32bit\n");
|
||||||
printf(" --leave-output-tensor: Will leave output.weight un(re)quantized. Increases model size but may also increase quality, especially when requantizing\n");
|
printf(" --leave-output-tensor: Will leave output.weight un(re)quantized. Increases model size but may also increase quality, especially when requantizing\n");
|
||||||
|
@ -90,8 +88,6 @@ void usage(const char * executable) {
|
||||||
exit(1);
|
exit(1);
|
||||||
}
|
}
|
||||||
|
|
||||||
} // namespace
|
|
||||||
|
|
||||||
int main(int argc, char ** argv) {
|
int main(int argc, char ** argv) {
|
||||||
if (argc < 3) {
|
if (argc < 3) {
|
||||||
usage(argv[0]);
|
usage(argv[0]);
|
||||||
|
|
|
@ -26,8 +26,6 @@
|
||||||
using namespace httplib;
|
using namespace httplib;
|
||||||
using json = nlohmann::json;
|
using json = nlohmann::json;
|
||||||
|
|
||||||
namespace {
|
|
||||||
|
|
||||||
struct server_params
|
struct server_params
|
||||||
{
|
{
|
||||||
std::string hostname = "127.0.0.1";
|
std::string hostname = "127.0.0.1";
|
||||||
|
@ -65,7 +63,7 @@ enum stop_type
|
||||||
STOP_PARTIAL,
|
STOP_PARTIAL,
|
||||||
};
|
};
|
||||||
|
|
||||||
bool ends_with(const std::string & str, const std::string & suffix)
|
static bool ends_with(const std::string & str, const std::string & suffix)
|
||||||
{
|
{
|
||||||
return str.size() >= suffix.size() &&
|
return str.size() >= suffix.size() &&
|
||||||
0 == str.compare(str.size() - suffix.size(), suffix.size(), suffix);
|
0 == str.compare(str.size() - suffix.size(), suffix.size(), suffix);
|
||||||
|
@ -102,7 +100,7 @@ std::string tokens_to_str(llama_context *ctx, Iter begin, Iter end)
|
||||||
return ret;
|
return ret;
|
||||||
}
|
}
|
||||||
|
|
||||||
void server_log(
|
static void server_log(
|
||||||
const char * level, const char * function, int line, const char * message, const nlohmann::ordered_json & extra
|
const char * level, const char * function, int line, const char * message, const nlohmann::ordered_json & extra
|
||||||
) {
|
) {
|
||||||
nlohmann::ordered_json log{
|
nlohmann::ordered_json log{
|
||||||
|
@ -163,7 +161,7 @@ json probs_vector_to_json(const llama_context * ctx, const std::vector<completio
|
||||||
return out;
|
return out;
|
||||||
}
|
}
|
||||||
|
|
||||||
bool server_verbose = false;
|
static bool server_verbose = false;
|
||||||
|
|
||||||
#if SERVER_VERBOSE != 1
|
#if SERVER_VERBOSE != 1
|
||||||
#define LOG_VERBOSE(MSG, ...)
|
#define LOG_VERBOSE(MSG, ...)
|
||||||
|
@ -692,7 +690,7 @@ struct llama_server_context
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
void server_print_usage(const char * argv0, const gpt_params & params, const server_params & sparams)
|
static void server_print_usage(const char * argv0, const gpt_params & params, const server_params & sparams)
|
||||||
{
|
{
|
||||||
printf("usage: %s [options]\n", argv0);
|
printf("usage: %s [options]\n", argv0);
|
||||||
printf("\n");
|
printf("\n");
|
||||||
|
@ -740,7 +738,7 @@ void server_print_usage(const char * argv0, const gpt_params & params, const ser
|
||||||
printf("\n");
|
printf("\n");
|
||||||
}
|
}
|
||||||
|
|
||||||
void server_params_parse(int argc, char ** argv, server_params & sparams, gpt_params & params)
|
static void server_params_parse(int argc, char ** argv, server_params & sparams, gpt_params & params)
|
||||||
{
|
{
|
||||||
gpt_params default_params;
|
gpt_params default_params;
|
||||||
server_params default_sparams;
|
server_params default_sparams;
|
||||||
|
@ -1120,7 +1118,7 @@ T json_value(const json & body, const std::string & key, const T & default_value
|
||||||
: default_value;
|
: default_value;
|
||||||
}
|
}
|
||||||
|
|
||||||
void parse_options_completion(const json & body, llama_server_context & llama)
|
static void parse_options_completion(const json & body, llama_server_context & llama)
|
||||||
{
|
{
|
||||||
gpt_params default_params;
|
gpt_params default_params;
|
||||||
|
|
||||||
|
@ -1199,7 +1197,7 @@ void parse_options_completion(const json & body, llama_server_context & llama)
|
||||||
LOG_VERBOSE("completion parameters parsed", format_generation_settings(llama));
|
LOG_VERBOSE("completion parameters parsed", format_generation_settings(llama));
|
||||||
}
|
}
|
||||||
|
|
||||||
void log_server_request(const Request & req, const Response & res)
|
static void log_server_request(const Request & req, const Response & res)
|
||||||
{
|
{
|
||||||
LOG_INFO("request", {
|
LOG_INFO("request", {
|
||||||
{"remote_addr", req.remote_addr},
|
{"remote_addr", req.remote_addr},
|
||||||
|
@ -1216,7 +1214,7 @@ void log_server_request(const Request & req, const Response & res)
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
bool is_at_eob(llama_server_context & server_context, const llama_token * tokens, const size_t n_tokens) {
|
static bool is_at_eob(llama_server_context & server_context, const llama_token * tokens, const size_t n_tokens) {
|
||||||
return n_tokens && tokens[n_tokens-1] == llama_token_eos(server_context.ctx);
|
return n_tokens && tokens[n_tokens-1] == llama_token_eos(server_context.ctx);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -1226,7 +1224,7 @@ bool is_at_eob(llama_server_context & server_context, const llama_token * tokens
|
||||||
// * When all beams converge to a common prefix, they are made available in beams_state.beams[0].
|
// * When all beams converge to a common prefix, they are made available in beams_state.beams[0].
|
||||||
// This is also called when the stop condition is met.
|
// This is also called when the stop condition is met.
|
||||||
// Collect tokens into std::vector<llama_token> response which is pointed to by callback_data.
|
// Collect tokens into std::vector<llama_token> response which is pointed to by callback_data.
|
||||||
void beam_search_callback(void * callback_data, llama_beams_state beams_state) {
|
static void beam_search_callback(void * callback_data, llama_beams_state beams_state) {
|
||||||
auto & llama = *static_cast<llama_server_context*>(callback_data);
|
auto & llama = *static_cast<llama_server_context*>(callback_data);
|
||||||
// Mark beams as EOS as needed.
|
// Mark beams as EOS as needed.
|
||||||
for (size_t i = 0 ; i < beams_state.n_beams ; ++i) {
|
for (size_t i = 0 ; i < beams_state.n_beams ; ++i) {
|
||||||
|
@ -1259,7 +1257,7 @@ struct token_translator {
|
||||||
std::string operator()(const completion_token_output & cto) const { return (*this)(cto.tok); }
|
std::string operator()(const completion_token_output & cto) const { return (*this)(cto.tok); }
|
||||||
};
|
};
|
||||||
|
|
||||||
void append_to_generated_text_from_generated_token_probs(llama_server_context & llama) {
|
static void append_to_generated_text_from_generated_token_probs(llama_server_context & llama) {
|
||||||
auto & gtps = llama.generated_token_probs;
|
auto & gtps = llama.generated_token_probs;
|
||||||
auto translator = token_translator{llama.ctx};
|
auto translator = token_translator{llama.ctx};
|
||||||
auto add_strlen = [=](size_t sum, const completion_token_output & cto) { return sum + translator(cto).size(); };
|
auto add_strlen = [=](size_t sum, const completion_token_output & cto) { return sum + translator(cto).size(); };
|
||||||
|
@ -1272,10 +1270,7 @@ void append_to_generated_text_from_generated_token_probs(llama_server_context &
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
} // namespace
|
int main(int argc, char **argv) {
|
||||||
|
|
||||||
int main(int argc, char **argv)
|
|
||||||
{
|
|
||||||
// own arguments required by this example
|
// own arguments required by this example
|
||||||
gpt_params params;
|
gpt_params params;
|
||||||
server_params sparams;
|
server_params sparams;
|
||||||
|
|
|
@ -18,8 +18,6 @@
|
||||||
#pragma warning(disable: 4244 4267) // possible loss of data
|
#pragma warning(disable: 4244 4267) // possible loss of data
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
namespace {
|
|
||||||
|
|
||||||
struct random_normal_distribution {
|
struct random_normal_distribution {
|
||||||
std::mt19937 gen;
|
std::mt19937 gen;
|
||||||
std::normal_distribution<float> rd;
|
std::normal_distribution<float> rd;
|
||||||
|
@ -32,35 +30,37 @@ struct random_uniform_distribution {
|
||||||
std::uniform_real_distribution<float> rd;
|
std::uniform_real_distribution<float> rd;
|
||||||
};
|
};
|
||||||
|
|
||||||
void init_random_normal_distribution(struct random_normal_distribution * rnd, int seed, float mean, float std, float min, float max) {
|
static void init_random_normal_distribution(
|
||||||
|
struct random_normal_distribution * rnd, int seed, float mean, float std, float min, float max
|
||||||
|
) {
|
||||||
rnd->gen = std::mt19937(seed);
|
rnd->gen = std::mt19937(seed);
|
||||||
rnd->rd = std::normal_distribution<float>{mean, std};
|
rnd->rd = std::normal_distribution<float>{mean, std};
|
||||||
rnd->min = min;
|
rnd->min = min;
|
||||||
rnd->max = max;
|
rnd->max = max;
|
||||||
}
|
}
|
||||||
|
|
||||||
void init_random_uniform_distribution(struct random_uniform_distribution * rnd, int seed, float min, float max) {
|
static void init_random_uniform_distribution(struct random_uniform_distribution * rnd, int seed, float min, float max) {
|
||||||
rnd->gen = std::mt19937(seed);
|
rnd->gen = std::mt19937(seed);
|
||||||
rnd->rd = std::uniform_real_distribution<float>{min, max};
|
rnd->rd = std::uniform_real_distribution<float>{min, max};
|
||||||
}
|
}
|
||||||
|
|
||||||
int clamp(const int v, const int min, const int max) {
|
static int clamp(const int v, const int min, const int max) {
|
||||||
return ((v < min) ? (min) : (v > max) ? (max) : v);
|
return ((v < min) ? (min) : (v > max) ? (max) : v);
|
||||||
}
|
}
|
||||||
|
|
||||||
float fclamp(const float v, const float min, const float max) {
|
static float fclamp(const float v, const float min, const float max) {
|
||||||
return ((v < min) ? (min) : (v > max) ? (max) : v);
|
return ((v < min) ? (min) : (v > max) ? (max) : v);
|
||||||
}
|
}
|
||||||
|
|
||||||
float frand() {
|
static float frand() {
|
||||||
return (float)rand()/(float)RAND_MAX;
|
return (float)rand()/(float)RAND_MAX;
|
||||||
}
|
}
|
||||||
|
|
||||||
float frand_normal(struct random_normal_distribution * rnd) {
|
static float frand_normal(struct random_normal_distribution * rnd) {
|
||||||
return fclamp(rnd->rd(rnd->gen), rnd->min, rnd->max);
|
return fclamp(rnd->rd(rnd->gen), rnd->min, rnd->max);
|
||||||
}
|
}
|
||||||
|
|
||||||
float frand_uniform(struct random_uniform_distribution * rnd) {
|
static float frand_uniform(struct random_uniform_distribution * rnd) {
|
||||||
return rnd->rd(rnd->gen);
|
return rnd->rd(rnd->gen);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -210,85 +210,85 @@ struct my_llama_model {
|
||||||
};
|
};
|
||||||
|
|
||||||
// gguf constants
|
// gguf constants
|
||||||
const char * LLM_KV_OPTIMIZER_TYPE = "optimizer.type";
|
static const char * LLM_KV_OPTIMIZER_TYPE = "optimizer.type";
|
||||||
const char * LLM_KV_OPTIMIZER_TYPE_ADAM = "adam";
|
static const char * LLM_KV_OPTIMIZER_TYPE_ADAM = "adam";
|
||||||
const char * LLM_KV_OPTIMIZER_TYPE_LBFGS = "lbfgs";
|
static const char * LLM_KV_OPTIMIZER_TYPE_LBFGS = "lbfgs";
|
||||||
const char * LLM_KV_OPTIMIZER_FILE_VERSION = "optimizer.file_version";
|
static const char * LLM_KV_OPTIMIZER_FILE_VERSION = "optimizer.file_version";
|
||||||
const char * LLM_KV_OPTIMIZER_CONVERGENCE_PAST_COUNT = "optimizer.convergence_past_count";
|
static const char * LLM_KV_OPTIMIZER_CONVERGENCE_PAST_COUNT = "optimizer.convergence_past_count";
|
||||||
const char * LLM_KV_OPTIMIZER_PARAMETER_COUNT = "optimizer.parameter_count";
|
static const char * LLM_KV_OPTIMIZER_PARAMETER_COUNT = "optimizer.parameter_count";
|
||||||
const char * LLM_KV_OPTIMIZER_ITERATION_COUNT = "optimizer.iteration_count";
|
static const char * LLM_KV_OPTIMIZER_ITERATION_COUNT = "optimizer.iteration_count";
|
||||||
const char * LLM_KV_OPTIMIZER_JUST_INITIALIZED = "optimizer.just_initialized";
|
static const char * LLM_KV_OPTIMIZER_JUST_INITIALIZED = "optimizer.just_initialized";
|
||||||
const char * LLM_KV_OPTIMIZER_ADAM_BEST_LOSS = "optimizer.adam.best_loss";
|
static const char * LLM_KV_OPTIMIZER_ADAM_BEST_LOSS = "optimizer.adam.best_loss";
|
||||||
const char * LLM_KV_OPTIMIZER_ADAM_PREVIOUS_LOSS = "optimizer.adam.previous_loss";
|
static const char * LLM_KV_OPTIMIZER_ADAM_PREVIOUS_LOSS = "optimizer.adam.previous_loss";
|
||||||
const char * LLM_KV_OPTIMIZER_ADAM_NO_IMPROVEMENT_COUNT = "optimizer.adam.no_improvement_count";
|
static const char * LLM_KV_OPTIMIZER_ADAM_NO_IMPROVEMENT_COUNT = "optimizer.adam.no_improvement_count";
|
||||||
const char * LLM_KV_OPTIMIZER_LBFGS_APPROX_HESSIAN_COUNT = "optimizer.lbfgs.approx_hessian_count";
|
static const char * LLM_KV_OPTIMIZER_LBFGS_APPROX_HESSIAN_COUNT = "optimizer.lbfgs.approx_hessian_count";
|
||||||
const char * LLM_KV_OPTIMIZER_LBFGS_BEST_LOSS = "optimizer.lbfgs.best_loss";
|
static const char * LLM_KV_OPTIMIZER_LBFGS_BEST_LOSS = "optimizer.lbfgs.best_loss";
|
||||||
const char * LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_STEP = "optimizer.lbfgs.line_search_step";
|
static const char * LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_STEP = "optimizer.lbfgs.line_search_step";
|
||||||
const char * LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_J = "optimizer.lbfgs.line_search_j";
|
static const char * LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_J = "optimizer.lbfgs.line_search_j";
|
||||||
const char * LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_K = "optimizer.lbfgs.line_search_k";
|
static const char * LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_K = "optimizer.lbfgs.line_search_k";
|
||||||
const char * LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_END = "optimizer.lbfgs.line_search_end";
|
static const char * LLM_KV_OPTIMIZER_LBFGS_LINE_SEARCH_END = "optimizer.lbfgs.line_search_end";
|
||||||
const char * LLM_KV_OPTIMIZER_LBFGS_NO_IMPROVEMENT_COUNT = "optimizer.lbfgs.no_improvement_count";
|
static const char * LLM_KV_OPTIMIZER_LBFGS_NO_IMPROVEMENT_COUNT = "optimizer.lbfgs.no_improvement_count";
|
||||||
|
|
||||||
const char * LLM_TENSOR_OPTIMIZER_ADAM_FIRST_MOMENTS = "optimizer.adam.first_moments";
|
static const char * LLM_TENSOR_OPTIMIZER_ADAM_FIRST_MOMENTS = "optimizer.adam.first_moments";
|
||||||
const char * LLM_TENSOR_OPTIMIZER_ADAM_SECOND_MOMENTS = "optimizer.adam.second_moments";
|
static const char * LLM_TENSOR_OPTIMIZER_ADAM_SECOND_MOMENTS = "optimizer.adam.second_moments";
|
||||||
const char * LLM_TENSOR_OPTIMIZER_ADAM_PAST_LOSS_VALUES = "optimizer.adam.past_loss_values";
|
static const char * LLM_TENSOR_OPTIMIZER_ADAM_PAST_LOSS_VALUES = "optimizer.adam.past_loss_values";
|
||||||
|
|
||||||
const char * LLM_TENSOR_OPTIMIZER_LBFGS_CURRENT_PARAMETERS = "optimizer.lbfgs.current_parameters";
|
static const char * LLM_TENSOR_OPTIMIZER_LBFGS_CURRENT_PARAMETERS = "optimizer.lbfgs.current_parameters";
|
||||||
const char * LLM_TENSOR_OPTIMIZER_LBFGS_PREVIOUS_PARAMETERS = "optimizer.lbfgs.previous_parameters";
|
static const char * LLM_TENSOR_OPTIMIZER_LBFGS_PREVIOUS_PARAMETERS = "optimizer.lbfgs.previous_parameters";
|
||||||
const char * LLM_TENSOR_OPTIMIZER_LBFGS_CURRENT_GRADIENTS = "optimizer.lbfgs.current_gradients";
|
static const char * LLM_TENSOR_OPTIMIZER_LBFGS_CURRENT_GRADIENTS = "optimizer.lbfgs.current_gradients";
|
||||||
const char * LLM_TENSOR_OPTIMIZER_LBFGS_PREVIOUS_GRADIENTS = "optimizer.lbfgs.previous_gradients";
|
static const char * LLM_TENSOR_OPTIMIZER_LBFGS_PREVIOUS_GRADIENTS = "optimizer.lbfgs.previous_gradients";
|
||||||
const char * LLM_TENSOR_OPTIMIZER_LBFGS_SEARCH_DIRECTION = "optimizer.lbfgs.search_direction";
|
static const char * LLM_TENSOR_OPTIMIZER_LBFGS_SEARCH_DIRECTION = "optimizer.lbfgs.search_direction";
|
||||||
const char * LLM_TENSOR_OPTIMIZER_LBFGS_PAST_LOSS_VALUES = "optimizer.lbfgs.past_loss_values";
|
static const char * LLM_TENSOR_OPTIMIZER_LBFGS_PAST_LOSS_VALUES = "optimizer.lbfgs.past_loss_values";
|
||||||
const char * LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_ALPHA = "optimizer.lbfgs.memory_alpha";
|
static const char * LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_ALPHA = "optimizer.lbfgs.memory_alpha";
|
||||||
const char * LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_YS = "optimizer.lbfgs.memory_ys";
|
static const char * LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_YS = "optimizer.lbfgs.memory_ys";
|
||||||
const char * LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_S = "optimizer.lbfgs.memory_s";
|
static const char * LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_S = "optimizer.lbfgs.memory_s";
|
||||||
const char * LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_Y = "optimizer.lbfgs.memory_y";
|
static const char * LLM_TENSOR_OPTIMIZER_LBFGS_MEMORY_Y = "optimizer.lbfgs.memory_y";
|
||||||
|
|
||||||
const char * LLM_KV_TRAINING_FILE_VERSION = "training.file_version";
|
static const char * LLM_KV_TRAINING_FILE_VERSION = "training.file_version";
|
||||||
const char * LLM_KV_TRAINING_ITERATION_COUNT = "training.iteration_count";
|
static const char * LLM_KV_TRAINING_ITERATION_COUNT = "training.iteration_count";
|
||||||
const char * LLM_KV_TRAINING_SAMPLE_COUNT = "training.sample_count";
|
static const char * LLM_KV_TRAINING_SAMPLE_COUNT = "training.sample_count";
|
||||||
const char * LLM_KV_TRAINING_TOKEN_COUNT = "training.token_count";
|
static const char * LLM_KV_TRAINING_TOKEN_COUNT = "training.token_count";
|
||||||
|
|
||||||
// gguf constants (sync with gguf.py)
|
// gguf constants (sync with gguf.py)
|
||||||
|
|
||||||
const char * LLM_KV_GENERAL_ARCHITECTURE = "general.architecture";
|
static const char * LLM_KV_GENERAL_ARCHITECTURE = "general.architecture";
|
||||||
const char * LLM_KV_GENERAL_FILE_TYPE = "general.file_type";
|
static const char * LLM_KV_GENERAL_FILE_TYPE = "general.file_type";
|
||||||
|
|
||||||
const char * LLM_KV_CONTEXT_LENGTH = "%s.context_length";
|
static const char * LLM_KV_CONTEXT_LENGTH = "%s.context_length";
|
||||||
const char * LLM_KV_EMBEDDING_LENGTH = "%s.embedding_length";
|
static const char * LLM_KV_EMBEDDING_LENGTH = "%s.embedding_length";
|
||||||
const char * LLM_KV_BLOCK_COUNT = "%s.block_count";
|
static const char * LLM_KV_BLOCK_COUNT = "%s.block_count";
|
||||||
const char * LLM_KV_FEED_FORWARD_LENGTH = "%s.feed_forward_length";
|
static const char * LLM_KV_FEED_FORWARD_LENGTH = "%s.feed_forward_length";
|
||||||
const char * LLM_KV_ATTENTION_HEAD_COUNT = "%s.attention.head_count";
|
static const char * LLM_KV_ATTENTION_HEAD_COUNT = "%s.attention.head_count";
|
||||||
const char * LLM_KV_ATTENTION_LAYERNORM_RMS_EPS = "%s.attention.layer_norm_rms_epsilon";
|
static const char * LLM_KV_ATTENTION_LAYERNORM_RMS_EPS = "%s.attention.layer_norm_rms_epsilon";
|
||||||
const char * LLM_KV_ROPE_DIMENSION_COUNT = "%s.rope.dimension_count";
|
static const char * LLM_KV_ROPE_DIMENSION_COUNT = "%s.rope.dimension_count";
|
||||||
const char * LLM_KV_ROPE_FREQ_BASE = "%s.rope.freq_base"; // TODO load in llama.cpp
|
static const char * LLM_KV_ROPE_FREQ_BASE = "%s.rope.freq_base"; // TODO load in llama.cpp
|
||||||
const char * LLM_KV_ROPE_SCALE_LINEAR = "%s.rope.scale_linear";
|
static const char * LLM_KV_ROPE_SCALE_LINEAR = "%s.rope.scale_linear";
|
||||||
|
|
||||||
const char * LLM_KV_TOKENIZER_MODEL = "tokenizer.ggml.model";
|
static const char * LLM_KV_TOKENIZER_MODEL = "tokenizer.ggml.model";
|
||||||
const char * LLM_KV_TOKENIZER_LIST = "tokenizer.ggml.tokens";
|
static const char * LLM_KV_TOKENIZER_LIST = "tokenizer.ggml.tokens";
|
||||||
const char * LLM_KV_TOKENIZER_TOKEN_TYPE = "tokenizer.ggml.token_type";
|
static const char * LLM_KV_TOKENIZER_TOKEN_TYPE = "tokenizer.ggml.token_type";
|
||||||
const char * LLM_KV_TOKENIZER_SCORES = "tokenizer.ggml.scores";
|
static const char * LLM_KV_TOKENIZER_SCORES = "tokenizer.ggml.scores";
|
||||||
const char * LLM_KV_TOKENIZER_MERGES = "tokenizer.ggml.merges";
|
static const char * LLM_KV_TOKENIZER_MERGES = "tokenizer.ggml.merges";
|
||||||
const char * LLM_KV_TOKENIZER_BOS_ID = "tokenizer.ggml.bos_token_id";
|
static const char * LLM_KV_TOKENIZER_BOS_ID = "tokenizer.ggml.bos_token_id";
|
||||||
const char * LLM_KV_TOKENIZER_EOS_ID = "tokenizer.ggml.eos_token_id";
|
static const char * LLM_KV_TOKENIZER_EOS_ID = "tokenizer.ggml.eos_token_id";
|
||||||
const char * LLM_KV_TOKENIZER_UNK_ID = "tokenizer.ggml.unknown_token_id";
|
static const char * LLM_KV_TOKENIZER_UNK_ID = "tokenizer.ggml.unknown_token_id";
|
||||||
const char * LLM_KV_TOKENIZER_SEP_ID = "tokenizer.ggml.seperator_token_id";
|
static const char * LLM_KV_TOKENIZER_SEP_ID = "tokenizer.ggml.seperator_token_id";
|
||||||
const char * LLM_KV_TOKENIZER_PAD_ID = "tokenizer.ggml.padding_token_id";
|
static const char * LLM_KV_TOKENIZER_PAD_ID = "tokenizer.ggml.padding_token_id";
|
||||||
|
|
||||||
const char * LLM_TENSOR_TOKEN_EMBD = "token_embd";
|
static const char * LLM_TENSOR_TOKEN_EMBD = "token_embd";
|
||||||
const char * LLM_TENSOR_OUTPUT_NORM = "output_norm";
|
static const char * LLM_TENSOR_OUTPUT_NORM = "output_norm";
|
||||||
const char * LLM_TENSOR_OUTPUT = "output";
|
static const char * LLM_TENSOR_OUTPUT = "output";
|
||||||
const char * LLM_TENSOR_ATTN_NORM = "blk.%d.attn_norm";
|
static const char * LLM_TENSOR_ATTN_NORM = "blk.%d.attn_norm";
|
||||||
const char * LLM_TENSOR_ATTN_Q = "blk.%d.attn_q";
|
static const char * LLM_TENSOR_ATTN_Q = "blk.%d.attn_q";
|
||||||
const char * LLM_TENSOR_ATTN_K = "blk.%d.attn_k";
|
static const char * LLM_TENSOR_ATTN_K = "blk.%d.attn_k";
|
||||||
const char * LLM_TENSOR_ATTN_V = "blk.%d.attn_v";
|
static const char * LLM_TENSOR_ATTN_V = "blk.%d.attn_v";
|
||||||
const char * LLM_TENSOR_ATTN_OUT = "blk.%d.attn_output";
|
static const char * LLM_TENSOR_ATTN_OUT = "blk.%d.attn_output";
|
||||||
const char * LLM_TENSOR_FFN_NORM = "blk.%d.ffn_norm";
|
static const char * LLM_TENSOR_FFN_NORM = "blk.%d.ffn_norm";
|
||||||
const char * LLM_TENSOR_FFN_GATE = "blk.%d.ffn_gate";
|
static const char * LLM_TENSOR_FFN_GATE = "blk.%d.ffn_gate";
|
||||||
const char * LLM_TENSOR_FFN_DOWN = "blk.%d.ffn_down";
|
static const char * LLM_TENSOR_FFN_DOWN = "blk.%d.ffn_down";
|
||||||
const char * LLM_TENSOR_FFN_UP = "blk.%d.ffn_up";
|
static const char * LLM_TENSOR_FFN_UP = "blk.%d.ffn_up";
|
||||||
|
|
||||||
void print_params(struct my_llama_hparams * params) {
|
static void print_params(struct my_llama_hparams * params) {
|
||||||
printf("%s: n_vocab: %d\n", __func__, params->n_vocab);
|
printf("%s: n_vocab: %d\n", __func__, params->n_vocab);
|
||||||
printf("%s: n_ctx: %d\n", __func__, params->n_ctx);
|
printf("%s: n_ctx: %d\n", __func__, params->n_ctx);
|
||||||
printf("%s: n_embd: %d\n", __func__, params->n_embd);
|
printf("%s: n_embd: %d\n", __func__, params->n_embd);
|
||||||
|
@ -298,7 +298,7 @@ void print_params(struct my_llama_hparams * params) {
|
||||||
printf("%s: n_rot: %d\n", __func__, params->n_rot);
|
printf("%s: n_rot: %d\n", __func__, params->n_rot);
|
||||||
}
|
}
|
||||||
|
|
||||||
void init_model(struct my_llama_model * model) {
|
static void init_model(struct my_llama_model * model) {
|
||||||
const auto & hparams = model->hparams;
|
const auto & hparams = model->hparams;
|
||||||
|
|
||||||
const uint32_t n_embd = hparams.n_embd;
|
const uint32_t n_embd = hparams.n_embd;
|
||||||
|
@ -365,7 +365,7 @@ void init_model(struct my_llama_model * model) {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
void set_param_model(struct my_llama_model * model) {
|
static void set_param_model(struct my_llama_model * model) {
|
||||||
const auto& hparams = model->hparams;
|
const auto& hparams = model->hparams;
|
||||||
|
|
||||||
const uint32_t n_layer = hparams.n_layer;
|
const uint32_t n_layer = hparams.n_layer;
|
||||||
|
@ -391,7 +391,7 @@ void set_param_model(struct my_llama_model * model) {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
void randomize_model(struct my_llama_model * model, int seed, float mean, float std, float min, float max) {
|
static void randomize_model(struct my_llama_model * model, int seed, float mean, float std, float min, float max) {
|
||||||
const auto & hparams = model->hparams;
|
const auto & hparams = model->hparams;
|
||||||
|
|
||||||
const uint32_t n_layer = hparams.n_layer;
|
const uint32_t n_layer = hparams.n_layer;
|
||||||
|
@ -420,25 +420,25 @@ void randomize_model(struct my_llama_model * model, int seed, float mean, float
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
void assert_shape_1d(struct ggml_tensor * tensor, int64_t ne0) {
|
static void assert_shape_1d(struct ggml_tensor * tensor, int64_t ne0) {
|
||||||
GGML_ASSERT(tensor->n_dims == 1);
|
GGML_ASSERT(tensor->n_dims == 1);
|
||||||
GGML_ASSERT(tensor->ne[0] == ne0);
|
GGML_ASSERT(tensor->ne[0] == ne0);
|
||||||
}
|
}
|
||||||
|
|
||||||
void assert_shape_2d(struct ggml_tensor * tensor, int64_t ne0, int64_t ne1) {
|
static void assert_shape_2d(struct ggml_tensor * tensor, int64_t ne0, int64_t ne1) {
|
||||||
GGML_ASSERT(tensor->n_dims == 2);
|
GGML_ASSERT(tensor->n_dims == 2);
|
||||||
GGML_ASSERT(tensor->ne[0] == ne0);
|
GGML_ASSERT(tensor->ne[0] == ne0);
|
||||||
GGML_ASSERT(tensor->ne[1] == ne1);
|
GGML_ASSERT(tensor->ne[1] == ne1);
|
||||||
}
|
}
|
||||||
|
|
||||||
void assert_shape_3d(struct ggml_tensor * tensor, int64_t ne0, int64_t ne1, int64_t ne2) {
|
static void assert_shape_3d(struct ggml_tensor * tensor, int64_t ne0, int64_t ne1, int64_t ne2) {
|
||||||
GGML_ASSERT(tensor->n_dims == 3);
|
GGML_ASSERT(tensor->n_dims == 3);
|
||||||
GGML_ASSERT(tensor->ne[0] == ne0);
|
GGML_ASSERT(tensor->ne[0] == ne0);
|
||||||
GGML_ASSERT(tensor->ne[1] == ne1);
|
GGML_ASSERT(tensor->ne[1] == ne1);
|
||||||
GGML_ASSERT(tensor->ne[2] == ne2);
|
GGML_ASSERT(tensor->ne[2] == ne2);
|
||||||
}
|
}
|
||||||
|
|
||||||
void assert_shape_4d(struct ggml_tensor * tensor, int64_t ne0, int64_t ne1, int64_t ne2, int64_t ne3) {
|
static void assert_shape_4d(struct ggml_tensor * tensor, int64_t ne0, int64_t ne1, int64_t ne2, int64_t ne3) {
|
||||||
GGML_ASSERT(tensor->n_dims == 4);
|
GGML_ASSERT(tensor->n_dims == 4);
|
||||||
GGML_ASSERT(tensor->ne[0] == ne0);
|
GGML_ASSERT(tensor->ne[0] == ne0);
|
||||||
GGML_ASSERT(tensor->ne[1] == ne1);
|
GGML_ASSERT(tensor->ne[1] == ne1);
|
||||||
|
@ -465,7 +465,7 @@ size_t hash_find(void * hash_table[], void * p) {
|
||||||
return i;
|
return i;
|
||||||
}
|
}
|
||||||
|
|
||||||
bool hash_insert(void * hash_table[], void * p) {
|
static bool hash_insert(void * hash_table[], void * p) {
|
||||||
//size_t h = hash(p);
|
//size_t h = hash(p);
|
||||||
size_t i = hash_find(hash_table, p);
|
size_t i = hash_find(hash_table, p);
|
||||||
|
|
||||||
|
@ -481,7 +481,7 @@ bool hash_insert(void * hash_table[], void * p) {
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
bool hash_contains(void * hash_table[], void * p) {
|
static bool hash_contains(void * hash_table[], void * p) {
|
||||||
size_t i = hash_find(hash_table, p);
|
size_t i = hash_find(hash_table, p);
|
||||||
return (i < GGML_GRAPH_HASHTABLE_SIZE) && (hash_table[i] == p);
|
return (i < GGML_GRAPH_HASHTABLE_SIZE) && (hash_table[i] == p);
|
||||||
}
|
}
|
||||||
|
@ -500,11 +500,11 @@ struct hash_map * new_hash_map() {
|
||||||
return result;
|
return result;
|
||||||
};
|
};
|
||||||
|
|
||||||
void free_hash_map(struct hash_map * map) {
|
static void free_hash_map(struct hash_map * map) {
|
||||||
delete map;
|
delete map;
|
||||||
}
|
}
|
||||||
|
|
||||||
bool ggml_is_view(struct ggml_tensor * t) {
|
static bool ggml_is_view(struct ggml_tensor * t) {
|
||||||
return t->op == GGML_OP_RESHAPE || t->op == GGML_OP_VIEW || t->op == GGML_OP_TRANSPOSE ||
|
return t->op == GGML_OP_RESHAPE || t->op == GGML_OP_VIEW || t->op == GGML_OP_TRANSPOSE ||
|
||||||
t->op == GGML_OP_PERMUTE || t->op == GGML_OP_CPY;
|
t->op == GGML_OP_PERMUTE || t->op == GGML_OP_CPY;
|
||||||
}
|
}
|
||||||
|
@ -597,13 +597,14 @@ struct ggml_tensor * ggml_recompute_graph_node(
|
||||||
return clone;
|
return clone;
|
||||||
};
|
};
|
||||||
|
|
||||||
void ggml_build_backward_gradient_checkpointing(
|
static void ggml_build_backward_gradient_checkpointing(
|
||||||
struct ggml_context * ctx,
|
struct ggml_context * ctx,
|
||||||
struct ggml_cgraph * gf,
|
struct ggml_cgraph * gf,
|
||||||
struct ggml_cgraph * gb,
|
struct ggml_cgraph * gb,
|
||||||
struct ggml_cgraph * gb_tmp,
|
struct ggml_cgraph * gb_tmp,
|
||||||
struct ggml_tensor * * checkpoints,
|
struct ggml_tensor ** checkpoints,
|
||||||
int n_checkpoints) {
|
int n_checkpoints
|
||||||
|
) {
|
||||||
*gb_tmp = *gf;
|
*gb_tmp = *gf;
|
||||||
ggml_build_backward_expand(ctx, gf, gb_tmp, true);
|
ggml_build_backward_expand(ctx, gf, gb_tmp, true);
|
||||||
|
|
||||||
|
@ -827,22 +828,22 @@ struct ggml_tensor * llama_build_train_graphs(
|
||||||
return t36;
|
return t36;
|
||||||
}
|
}
|
||||||
|
|
||||||
void set_f32_3d(struct ggml_tensor * tensor, int64_t i0, int64_t i1, int64_t i2, float value) {
|
static void set_f32_3d(struct ggml_tensor * tensor, int64_t i0, int64_t i1, int64_t i2, float value) {
|
||||||
float * ptr = (float *) ((char *) tensor->data + i0*tensor->nb[0] + i1*tensor->nb[1] + i2*tensor->nb[2]);
|
float * ptr = (float *) ((char *) tensor->data + i0*tensor->nb[0] + i1*tensor->nb[1] + i2*tensor->nb[2]);
|
||||||
*ptr = value;
|
*ptr = value;
|
||||||
}
|
}
|
||||||
|
|
||||||
void set_f32_2d(struct ggml_tensor * tensor, int64_t i0, int64_t i1, float value) {
|
static void set_f32_2d(struct ggml_tensor * tensor, int64_t i0, int64_t i1, float value) {
|
||||||
float * ptr = (float *) ((char *) tensor->data + i0*tensor->nb[0] + i1*tensor->nb[1]);
|
float * ptr = (float *) ((char *) tensor->data + i0*tensor->nb[0] + i1*tensor->nb[1]);
|
||||||
*ptr = value;
|
*ptr = value;
|
||||||
}
|
}
|
||||||
|
|
||||||
void set_i32_2d(struct ggml_tensor * tensor, int64_t i0, int64_t i1, int32_t value) {
|
static void set_i32_2d(struct ggml_tensor * tensor, int64_t i0, int64_t i1, int32_t value) {
|
||||||
int32_t * ptr = (int32_t *) ((char *) tensor->data + i0*tensor->nb[0] + i1*tensor->nb[1]);
|
int32_t * ptr = (int32_t *) ((char *) tensor->data + i0*tensor->nb[0] + i1*tensor->nb[1]);
|
||||||
*ptr = value;
|
*ptr = value;
|
||||||
}
|
}
|
||||||
|
|
||||||
float get_f32_2d(struct ggml_tensor * tensor, int64_t i0, int64_t i1) {
|
static float get_f32_2d(struct ggml_tensor * tensor, int64_t i0, int64_t i1) {
|
||||||
float * ptr = (float *) ((char *) tensor->data + i0*tensor->nb[0] + i1*tensor->nb[1]);
|
float * ptr = (float *) ((char *) tensor->data + i0*tensor->nb[0] + i1*tensor->nb[1]);
|
||||||
return *ptr;
|
return *ptr;
|
||||||
}
|
}
|
||||||
|
@ -852,7 +853,7 @@ int32_t get_i32_2d(struct ggml_tensor * tensor, int64_t i0, int64_t i1) {
|
||||||
return *ptr;
|
return *ptr;
|
||||||
}
|
}
|
||||||
|
|
||||||
void print_row(struct ggml_tensor * probs, int i) {
|
static void print_row(struct ggml_tensor * probs, int i) {
|
||||||
for (int k = 0; k < probs->ne[0]; ++k) {
|
for (int k = 0; k < probs->ne[0]; ++k) {
|
||||||
float p = get_f32_2d(probs, k, i);
|
float p = get_f32_2d(probs, k, i);
|
||||||
printf(" %.2f", p);
|
printf(" %.2f", p);
|
||||||
|
@ -860,7 +861,7 @@ void print_row(struct ggml_tensor * probs, int i) {
|
||||||
printf("\n");
|
printf("\n");
|
||||||
}
|
}
|
||||||
|
|
||||||
void print_matrix(struct ggml_tensor * probs) {
|
static void print_matrix(struct ggml_tensor * probs) {
|
||||||
assert(probs->n_dims == 2);
|
assert(probs->n_dims == 2);
|
||||||
for (int i = 0; i < probs->ne[1]; ++i) {
|
for (int i = 0; i < probs->ne[1]; ++i) {
|
||||||
for (int k = 0; k < probs->ne[0]; ++k) {
|
for (int k = 0; k < probs->ne[0]; ++k) {
|
||||||
|
@ -871,7 +872,11 @@ void print_matrix(struct ggml_tensor * probs) {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
void get_example_targets(struct llama_context * lctx, const int * train_samples, size_t n_train_samples, const llama_token * train_data, size_t n_train_data, int example_id, struct ggml_tensor * tokens_input, struct ggml_tensor * target_logits, struct ggml_tensor * target_probs) {
|
static void get_example_targets(
|
||||||
|
struct llama_context * lctx, const int * train_samples, size_t n_train_samples, const llama_token * train_data,
|
||||||
|
size_t n_train_data, int example_id, struct ggml_tensor * tokens_input, struct ggml_tensor * target_logits,
|
||||||
|
struct ggml_tensor * target_probs
|
||||||
|
) {
|
||||||
int n_tokens = tokens_input->ne[0];
|
int n_tokens = tokens_input->ne[0];
|
||||||
int n_vocab = target_logits->ne[0];
|
int n_vocab = target_logits->ne[0];
|
||||||
|
|
||||||
|
@ -891,7 +896,11 @@ void get_example_targets(struct llama_context * lctx, const int * train_samples,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
void get_example_targets_batch(struct llama_context * lctx, const int * train_samples, size_t n_train_samples, const llama_token * train_data, size_t n_train_data, int example_id, struct ggml_tensor * tokens_input, struct ggml_tensor * target_logits, struct ggml_tensor * target_probs) {
|
static void get_example_targets_batch(
|
||||||
|
struct llama_context * lctx, const int * train_samples, size_t n_train_samples, const llama_token * train_data,
|
||||||
|
size_t n_train_data, int example_id, struct ggml_tensor * tokens_input, struct ggml_tensor * target_logits,
|
||||||
|
struct ggml_tensor * target_probs
|
||||||
|
) {
|
||||||
GGML_ASSERT(tokens_input->n_dims == 2);
|
GGML_ASSERT(tokens_input->n_dims == 2);
|
||||||
GGML_ASSERT(target_logits->n_dims == 3);
|
GGML_ASSERT(target_logits->n_dims == 3);
|
||||||
GGML_ASSERT(target_probs->n_dims == 3);
|
GGML_ASSERT(target_probs->n_dims == 3);
|
||||||
|
@ -926,7 +935,7 @@ void get_example_targets_batch(struct llama_context * lctx, const int * train_sa
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
int tokenize_file(struct llama_context * lctx, const char * filename, std::vector<llama_token>& out) {
|
static int tokenize_file(struct llama_context * lctx, const char * filename, std::vector<llama_token> & out) {
|
||||||
FILE * fp = std::fopen(filename, "rb");
|
FILE * fp = std::fopen(filename, "rb");
|
||||||
if (fp == NULL) {
|
if (fp == NULL) {
|
||||||
return 0;
|
return 0;
|
||||||
|
@ -997,7 +1006,7 @@ int tokenize_file(struct llama_context * lctx, const char * filename, std::vecto
|
||||||
return n_tokens;
|
return n_tokens;
|
||||||
}
|
}
|
||||||
|
|
||||||
void shuffle_ints(int * begin, int * end) {
|
static void shuffle_ints(int * begin, int * end) {
|
||||||
if (end <= begin) return;
|
if (end <= begin) return;
|
||||||
int max=begin[0];
|
int max=begin[0];
|
||||||
for (int i=1; i<end-begin; ++i) {
|
for (int i=1; i<end-begin; ++i) {
|
||||||
|
@ -1031,7 +1040,7 @@ void shuffle_ints(int * begin, int * end) {
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
bool are_same_layout(struct ggml_tensor * a, struct ggml_tensor * b) {
|
static bool are_same_layout(struct ggml_tensor * a, struct ggml_tensor * b) {
|
||||||
GGML_ASSERT(a != NULL);
|
GGML_ASSERT(a != NULL);
|
||||||
GGML_ASSERT(b != NULL);
|
GGML_ASSERT(b != NULL);
|
||||||
GGML_ASSERT(a->type == b->type);
|
GGML_ASSERT(a->type == b->type);
|
||||||
|
@ -1041,7 +1050,7 @@ bool are_same_layout(struct ggml_tensor * a, struct ggml_tensor * b) {
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
void read_tensor_by_name(struct ggml_tensor * dst, struct ggml_context * ctx, const char * name) {
|
static void read_tensor_by_name(struct ggml_tensor * dst, struct ggml_context * ctx, const char * name) {
|
||||||
if (dst == NULL) {
|
if (dst == NULL) {
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
@ -1054,7 +1063,9 @@ void read_tensor_by_name(struct ggml_tensor * dst, struct ggml_context * ctx, co
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
void load_opt_context_gguf(struct gguf_context * fctx, struct ggml_context * f_ggml_ctx, struct ggml_opt_context * opt) {
|
static void load_opt_context_gguf(
|
||||||
|
struct gguf_context * fctx, struct ggml_context * f_ggml_ctx, struct ggml_opt_context * opt
|
||||||
|
) {
|
||||||
// NOTE: gguf_context must be initialized with f_ggml_ctx and no_alloc=false, otherwise tensor data can not be read
|
// NOTE: gguf_context must be initialized with f_ggml_ctx and no_alloc=false, otherwise tensor data can not be read
|
||||||
|
|
||||||
uint32_t file_version;
|
uint32_t file_version;
|
||||||
|
@ -1115,7 +1126,7 @@ void load_opt_context_gguf(struct gguf_context * fctx, struct ggml_context * f_g
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
void save_opt_context_gguf(struct gguf_context * fctx, struct ggml_opt_context * opt) {
|
static void save_opt_context_gguf(struct gguf_context * fctx, struct ggml_opt_context * opt) {
|
||||||
gguf_set_val_u32(fctx, LLM_KV_OPTIMIZER_FILE_VERSION, 0);
|
gguf_set_val_u32(fctx, LLM_KV_OPTIMIZER_FILE_VERSION, 0);
|
||||||
gguf_set_val_u32(fctx, LLM_KV_OPTIMIZER_CONVERGENCE_PAST_COUNT, opt->params.past);
|
gguf_set_val_u32(fctx, LLM_KV_OPTIMIZER_CONVERGENCE_PAST_COUNT, opt->params.past);
|
||||||
gguf_set_val_u64(fctx, LLM_KV_OPTIMIZER_PARAMETER_COUNT, (uint64_t) opt->nx);
|
gguf_set_val_u64(fctx, LLM_KV_OPTIMIZER_PARAMETER_COUNT, (uint64_t) opt->nx);
|
||||||
|
@ -1182,7 +1193,9 @@ void save_opt_context_gguf(struct gguf_context * fctx, struct ggml_opt_context *
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
void load_llama_model_gguf(struct gguf_context * fctx, struct ggml_context * f_ggml_ctx, struct my_llama_model * model) {
|
static void load_llama_model_gguf(
|
||||||
|
struct gguf_context * fctx, struct ggml_context * f_ggml_ctx, struct my_llama_model * model
|
||||||
|
) {
|
||||||
// NOTE: gguf_context must be initialized with f_ggml_ctx and no_alloc=false, otherwise tensor data can not be read
|
// NOTE: gguf_context must be initialized with f_ggml_ctx and no_alloc=false, otherwise tensor data can not be read
|
||||||
std::string arch;
|
std::string arch;
|
||||||
|
|
||||||
|
@ -1253,7 +1266,9 @@ void load_llama_model_gguf(struct gguf_context * fctx, struct ggml_context * f_g
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
void save_llama_model_gguf(struct gguf_context * fctx, const char * fn_vocab_model, struct my_llama_model * model) {
|
static void save_llama_model_gguf(
|
||||||
|
struct gguf_context * fctx, const char * fn_vocab_model, struct my_llama_model * model
|
||||||
|
) {
|
||||||
const char * arch = "llama";
|
const char * arch = "llama";
|
||||||
enum llama_ftype ftype = LLAMA_FTYPE_ALL_F32;
|
enum llama_ftype ftype = LLAMA_FTYPE_ALL_F32;
|
||||||
|
|
||||||
|
@ -1396,7 +1411,7 @@ void save_llama_model_gguf(struct gguf_context * fctx, const char * fn_vocab_mod
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
void save_llama_model_file(const char * filename, const char * fn_vocab_model, struct my_llama_model * model) {
|
static void save_llama_model_file(const char * filename, const char * fn_vocab_model, struct my_llama_model * model) {
|
||||||
struct gguf_context * fctx = gguf_init_empty();
|
struct gguf_context * fctx = gguf_init_empty();
|
||||||
|
|
||||||
save_llama_model_gguf(fctx, fn_vocab_model, model);
|
save_llama_model_gguf(fctx, fn_vocab_model, model);
|
||||||
|
@ -1407,7 +1422,10 @@ void save_llama_model_file(const char * filename, const char * fn_vocab_model, s
|
||||||
gguf_free(fctx);
|
gguf_free(fctx);
|
||||||
}
|
}
|
||||||
|
|
||||||
void load_checkpoint_gguf(struct gguf_context * fctx, struct ggml_context * f_ggml_ctx, struct my_llama_model * model, struct ggml_opt_context * opt) {
|
static void load_checkpoint_gguf(
|
||||||
|
struct gguf_context * fctx, struct ggml_context * f_ggml_ctx, struct my_llama_model * model,
|
||||||
|
struct ggml_opt_context * opt
|
||||||
|
) {
|
||||||
load_llama_model_gguf(fctx, f_ggml_ctx, model);
|
load_llama_model_gguf(fctx, f_ggml_ctx, model);
|
||||||
|
|
||||||
uint32_t file_version;
|
uint32_t file_version;
|
||||||
|
@ -1421,7 +1439,10 @@ void load_checkpoint_gguf(struct gguf_context * fctx, struct ggml_context * f_gg
|
||||||
load_opt_context_gguf(fctx, f_ggml_ctx, opt);
|
load_opt_context_gguf(fctx, f_ggml_ctx, opt);
|
||||||
}
|
}
|
||||||
|
|
||||||
void save_checkpoint_gguf(struct gguf_context * fctx, const char * fn_vocab_model, struct my_llama_model * model, struct ggml_opt_context * opt) {
|
static void save_checkpoint_gguf(
|
||||||
|
struct gguf_context * fctx, const char * fn_vocab_model, struct my_llama_model * model,
|
||||||
|
struct ggml_opt_context * opt
|
||||||
|
) {
|
||||||
save_llama_model_gguf(fctx, fn_vocab_model, model);
|
save_llama_model_gguf(fctx, fn_vocab_model, model);
|
||||||
|
|
||||||
gguf_set_val_u32(fctx, LLM_KV_TRAINING_FILE_VERSION, 0);
|
gguf_set_val_u32(fctx, LLM_KV_TRAINING_FILE_VERSION, 0);
|
||||||
|
@ -1432,7 +1453,7 @@ void save_checkpoint_gguf(struct gguf_context * fctx, const char * fn_vocab_mode
|
||||||
save_opt_context_gguf(fctx, opt);
|
save_opt_context_gguf(fctx, opt);
|
||||||
}
|
}
|
||||||
|
|
||||||
bool load_checkpoint_file(const char * filename, struct my_llama_model * model, struct ggml_opt_context * opt) {
|
static bool load_checkpoint_file(const char * filename, struct my_llama_model * model, struct ggml_opt_context * opt) {
|
||||||
struct ggml_context * f_ggml_ctx;
|
struct ggml_context * f_ggml_ctx;
|
||||||
struct gguf_init_params params;
|
struct gguf_init_params params;
|
||||||
params.no_alloc = false;
|
params.no_alloc = false;
|
||||||
|
@ -1447,7 +1468,9 @@ bool load_checkpoint_file(const char * filename, struct my_llama_model * model,
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
void save_checkpoint_file(const char * filename, const char * fn_vocab_model, struct my_llama_model * model, struct ggml_opt_context * opt) {
|
static void save_checkpoint_file(
|
||||||
|
const char * filename, const char * fn_vocab_model, struct my_llama_model * model, struct ggml_opt_context * opt
|
||||||
|
) {
|
||||||
struct gguf_context * fctx = gguf_init_empty();
|
struct gguf_context * fctx = gguf_init_empty();
|
||||||
|
|
||||||
save_checkpoint_gguf(fctx, fn_vocab_model, model, opt);
|
save_checkpoint_gguf(fctx, fn_vocab_model, model, opt);
|
||||||
|
@ -1458,7 +1481,7 @@ void save_checkpoint_file(const char * filename, const char * fn_vocab_model, st
|
||||||
gguf_free(fctx);
|
gguf_free(fctx);
|
||||||
}
|
}
|
||||||
|
|
||||||
float cosine_decay(const int decay_steps, const float minimum, int step) {
|
static float cosine_decay(const int decay_steps, const float minimum, int step) {
|
||||||
if (step > decay_steps) {
|
if (step > decay_steps) {
|
||||||
step = decay_steps;
|
step = decay_steps;
|
||||||
}
|
}
|
||||||
|
@ -1467,7 +1490,9 @@ float cosine_decay(const int decay_steps, const float minimum, int step) {
|
||||||
return decay;
|
return decay;
|
||||||
}
|
}
|
||||||
|
|
||||||
float cosine_decay_restart(int decay_steps, const float minimum, int step, float restart_step_mult, bool enable_restart) {
|
static float cosine_decay_restart(
|
||||||
|
int decay_steps, const float minimum, int step, float restart_step_mult, bool enable_restart
|
||||||
|
) {
|
||||||
if (enable_restart) {
|
if (enable_restart) {
|
||||||
while (step > decay_steps) {
|
while (step > decay_steps) {
|
||||||
step -= decay_steps;
|
step -= decay_steps;
|
||||||
|
@ -1595,7 +1620,7 @@ struct train_params get_default_train_params() {
|
||||||
return params;
|
return params;
|
||||||
}
|
}
|
||||||
|
|
||||||
void train_print_usage(int /*argc*/, char ** argv, const struct train_params * params) {
|
static void train_print_usage(int /*argc*/, char ** argv, const struct train_params * params) {
|
||||||
fprintf(stderr, "usage: %s [options]\n", argv[0]);
|
fprintf(stderr, "usage: %s [options]\n", argv[0]);
|
||||||
fprintf(stderr, "\n");
|
fprintf(stderr, "\n");
|
||||||
fprintf(stderr, "options:\n");
|
fprintf(stderr, "options:\n");
|
||||||
|
@ -1652,7 +1677,7 @@ void train_print_usage(int /*argc*/, char ** argv, const struct train_params * p
|
||||||
fprintf(stderr, "\n");
|
fprintf(stderr, "\n");
|
||||||
}
|
}
|
||||||
|
|
||||||
bool train_params_parse(int argc, char ** argv, struct train_params * params) {
|
static bool train_params_parse(int argc, char ** argv, struct train_params * params) {
|
||||||
bool invalid_param = false;
|
bool invalid_param = false;
|
||||||
std::string arg;
|
std::string arg;
|
||||||
struct train_params default_params = get_default_train_params();
|
struct train_params default_params = get_default_train_params();
|
||||||
|
@ -1946,7 +1971,7 @@ struct opt_callback_data {
|
||||||
struct ggml_tensor * target_probs;
|
struct ggml_tensor * target_probs;
|
||||||
};
|
};
|
||||||
|
|
||||||
void opt_callback(void * vdata, float * sched) {
|
static void opt_callback(void * vdata, float * sched) {
|
||||||
struct opt_callback_data * data = (struct opt_callback_data *) vdata;
|
struct opt_callback_data * data = (struct opt_callback_data *) vdata;
|
||||||
struct train_params * params = data->params;
|
struct train_params * params = data->params;
|
||||||
struct ggml_opt_context * opt = data->opt;
|
struct ggml_opt_context * opt = data->opt;
|
||||||
|
@ -1989,8 +2014,6 @@ void opt_callback(void * vdata, float * sched) {
|
||||||
data->shuffle_countdown -= n_batch;
|
data->shuffle_countdown -= n_batch;
|
||||||
}
|
}
|
||||||
|
|
||||||
} // namespace
|
|
||||||
|
|
||||||
int main(int argc, char ** argv) {
|
int main(int argc, char ** argv) {
|
||||||
struct train_params params = get_default_train_params();
|
struct train_params params = get_default_train_params();
|
||||||
|
|
||||||
|
|
|
@ -16,9 +16,7 @@
|
||||||
|
|
||||||
constexpr int kVecSize = 1 << 18;
|
constexpr int kVecSize = 1 << 18;
|
||||||
|
|
||||||
namespace {
|
static float drawFromGaussianPdf(std::mt19937& rndm) {
|
||||||
|
|
||||||
float drawFromGaussianPdf(std::mt19937& rndm) {
|
|
||||||
constexpr double kScale = 1./(1. + std::mt19937::max());
|
constexpr double kScale = 1./(1. + std::mt19937::max());
|
||||||
constexpr double kTwoPiTimesScale = 6.28318530717958647692*kScale;
|
constexpr double kTwoPiTimesScale = 6.28318530717958647692*kScale;
|
||||||
static float lastX;
|
static float lastX;
|
||||||
|
@ -30,7 +28,8 @@ float drawFromGaussianPdf(std::mt19937& rndm) {
|
||||||
haveX = true;
|
haveX = true;
|
||||||
return r*cos(phi);
|
return r*cos(phi);
|
||||||
}
|
}
|
||||||
void fillRandomGaussianFloats(std::vector<float>& values, std::mt19937& rndm, float mean = 0) {
|
|
||||||
|
static void fillRandomGaussianFloats(std::vector<float>& values, std::mt19937& rndm, float mean = 0) {
|
||||||
for (auto& v : values) v = mean + drawFromGaussianPdf(rndm);
|
for (auto& v : values) v = mean + drawFromGaussianPdf(rndm);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -220,8 +219,6 @@ static void dot_q4_q8(const int n, float* s, const void* vx, const void* vy) {
|
||||||
*s = sumf;
|
*s = sumf;
|
||||||
}
|
}
|
||||||
|
|
||||||
} // namespace
|
|
||||||
|
|
||||||
int main(int argc, char** argv) {
|
int main(int argc, char** argv) {
|
||||||
|
|
||||||
int nloop = argc > 1 ? atoi(argv[1]) : 10;
|
int nloop = argc > 1 ? atoi(argv[1]) : 10;
|
||||||
|
|
|
@ -36,17 +36,15 @@
|
||||||
#define GGML_PRINT(...) printf(__VA_ARGS__)
|
#define GGML_PRINT(...) printf(__VA_ARGS__)
|
||||||
|
|
||||||
|
|
||||||
namespace {
|
static float frand(void) {
|
||||||
|
|
||||||
float frand(void) {
|
|
||||||
return (float)rand()/(float)RAND_MAX;
|
return (float)rand()/(float)RAND_MAX;
|
||||||
}
|
}
|
||||||
|
|
||||||
int irand(int n) {
|
static int irand(int n) {
|
||||||
return rand()%n;
|
return rand()%n;
|
||||||
}
|
}
|
||||||
|
|
||||||
void get_random_dims(int64_t * dims, int ndims) {
|
static void get_random_dims(int64_t * dims, int ndims) {
|
||||||
dims[0] = dims[1] = dims[2] = dims[3] = 1;
|
dims[0] = dims[1] = dims[2] = dims[3] = 1;
|
||||||
|
|
||||||
for (int i = 0; i < ndims; i++) {
|
for (int i = 0; i < ndims; i++) {
|
||||||
|
@ -54,7 +52,7 @@ void get_random_dims(int64_t * dims, int ndims) {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
void get_random_dims_minmax(int64_t * dims, int ndims, int min, int max) {
|
static void get_random_dims_minmax(int64_t * dims, int ndims, int min, int max) {
|
||||||
dims[0] = dims[1] = dims[2] = dims[3] = 1;
|
dims[0] = dims[1] = dims[2] = dims[3] = 1;
|
||||||
|
|
||||||
for (int i = 0; i < ndims; i++) {
|
for (int i = 0; i < ndims; i++) {
|
||||||
|
@ -111,16 +109,14 @@ struct ggml_tensor * get_random_tensor(
|
||||||
return result;
|
return result;
|
||||||
}
|
}
|
||||||
|
|
||||||
float get_element(const struct ggml_tensor * t, int idx) {
|
static float get_element(const struct ggml_tensor * t, int idx) {
|
||||||
return ((float *)t->data)[idx];
|
return ((float *)t->data)[idx];
|
||||||
}
|
}
|
||||||
|
|
||||||
void set_element(struct ggml_tensor * t, int idx, float value) {
|
static void set_element(struct ggml_tensor * t, int idx, float value) {
|
||||||
((float *)t->data)[idx] = value;
|
((float *)t->data)[idx] = value;
|
||||||
}
|
}
|
||||||
|
|
||||||
} // namespace
|
|
||||||
|
|
||||||
int main(void) {
|
int main(void) {
|
||||||
struct ggml_init_params params = {
|
struct ggml_init_params params = {
|
||||||
/* .mem_size = */ 1024*1024*1024,
|
/* .mem_size = */ 1024*1024*1024,
|
||||||
|
|
|
@ -19,20 +19,18 @@ constexpr float MAX_QUANTIZATION_TOTAL_ERROR_2BITS = 0.0075f;
|
||||||
constexpr float MAX_QUANTIZATION_TOTAL_ERROR_3BITS = 0.0040f;
|
constexpr float MAX_QUANTIZATION_TOTAL_ERROR_3BITS = 0.0040f;
|
||||||
constexpr float MAX_DOT_PRODUCT_ERROR = 0.02f;
|
constexpr float MAX_DOT_PRODUCT_ERROR = 0.02f;
|
||||||
|
|
||||||
|
static const char* RESULT_STR[] = {"ok", "FAILED"};
|
||||||
|
|
||||||
namespace {
|
|
||||||
|
|
||||||
const char* RESULT_STR[] = {"ok", "FAILED"};
|
|
||||||
|
|
||||||
// Generate synthetic data
|
// Generate synthetic data
|
||||||
void generate_data(float offset, size_t n, float * dst) {
|
static void generate_data(float offset, size_t n, float * dst) {
|
||||||
for (size_t i = 0; i < n; i++) {
|
for (size_t i = 0; i < n; i++) {
|
||||||
dst[i] = 0.1 + 2*cosf(i + offset);
|
dst[i] = 0.1 + 2*cosf(i + offset);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// Calculate RMSE between two float arrays
|
// Calculate RMSE between two float arrays
|
||||||
float array_rmse(const float * a1, const float * a2, size_t n) {
|
static float array_rmse(const float * a1, const float * a2, size_t n) {
|
||||||
double sum = 0;
|
double sum = 0;
|
||||||
for (size_t i = 0; i < n; i++) {
|
for (size_t i = 0; i < n; i++) {
|
||||||
double diff = a1[i] - a2[i];
|
double diff = a1[i] - a2[i];
|
||||||
|
@ -42,7 +40,7 @@ float array_rmse(const float * a1, const float * a2, size_t n) {
|
||||||
}
|
}
|
||||||
|
|
||||||
// Total quantization error on test data
|
// Total quantization error on test data
|
||||||
float total_quantization_error(ggml_type_traits_t & qfns, size_t test_size, const float * test_data) {
|
static float total_quantization_error(ggml_type_traits_t & qfns, size_t test_size, const float * test_data) {
|
||||||
std::vector<uint8_t> tmp_q(2*test_size);
|
std::vector<uint8_t> tmp_q(2*test_size);
|
||||||
std::vector<float> tmp_out(test_size);
|
std::vector<float> tmp_out(test_size);
|
||||||
|
|
||||||
|
@ -52,7 +50,7 @@ float total_quantization_error(ggml_type_traits_t & qfns, size_t test_size, cons
|
||||||
}
|
}
|
||||||
|
|
||||||
// Total quantization error on test data
|
// Total quantization error on test data
|
||||||
float reference_quantization_error(ggml_type_traits_t & qfns, size_t test_size, const float * test_data) {
|
static float reference_quantization_error(ggml_type_traits_t & qfns, size_t test_size, const float * test_data) {
|
||||||
std::vector<uint8_t> tmp_q(2*test_size);
|
std::vector<uint8_t> tmp_q(2*test_size);
|
||||||
std::vector<float> tmp_out(test_size);
|
std::vector<float> tmp_out(test_size);
|
||||||
std::vector<float> tmp_out_ref(test_size);
|
std::vector<float> tmp_out_ref(test_size);
|
||||||
|
@ -66,7 +64,7 @@ float reference_quantization_error(ggml_type_traits_t & qfns, size_t test_size,
|
||||||
return array_rmse(tmp_out.data(), tmp_out_ref.data(), test_size);
|
return array_rmse(tmp_out.data(), tmp_out_ref.data(), test_size);
|
||||||
}
|
}
|
||||||
|
|
||||||
float dot_product(const float * a1, const float * a2, size_t test_size) {
|
static float dot_product(const float * a1, const float * a2, size_t test_size) {
|
||||||
double sum = 0;
|
double sum = 0;
|
||||||
for (size_t i = 0; i < test_size; i++) {
|
for (size_t i = 0; i < test_size; i++) {
|
||||||
sum += a1[i] * a2[i];
|
sum += a1[i] * a2[i];
|
||||||
|
@ -75,7 +73,9 @@ float dot_product(const float * a1, const float * a2, size_t test_size) {
|
||||||
}
|
}
|
||||||
|
|
||||||
// Total dot product error
|
// Total dot product error
|
||||||
float dot_product_error(ggml_type_traits_t & qfns, size_t test_size, const float * test_data1, const float *test_data2) {
|
static float dot_product_error(
|
||||||
|
ggml_type_traits_t & qfns, size_t test_size, const float * test_data1, const float *test_data2
|
||||||
|
) {
|
||||||
std::vector<uint8_t> tmp_q1(2*test_size);
|
std::vector<uint8_t> tmp_q1(2*test_size);
|
||||||
std::vector<uint8_t> tmp_q2(2*test_size);
|
std::vector<uint8_t> tmp_q2(2*test_size);
|
||||||
|
|
||||||
|
@ -92,8 +92,6 @@ float dot_product_error(ggml_type_traits_t & qfns, size_t test_size, const float
|
||||||
return fabsf(result - dot_ref) / test_size;
|
return fabsf(result - dot_ref) / test_size;
|
||||||
}
|
}
|
||||||
|
|
||||||
} // namespace
|
|
||||||
|
|
||||||
int main(int argc, char * argv[]) {
|
int main(int argc, char * argv[]) {
|
||||||
bool verbose = false;
|
bool verbose = false;
|
||||||
const size_t test_size = 32 * 128;
|
const size_t test_size = 32 * 128;
|
||||||
|
|
|
@ -60,25 +60,23 @@ inline int64_t cpu_cycles() {
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
|
||||||
namespace {
|
|
||||||
|
|
||||||
// Generate synthetic data
|
// Generate synthetic data
|
||||||
void generate_data(float offset, size_t n, float * dst) {
|
static void generate_data(float offset, size_t n, float * dst) {
|
||||||
for (size_t i = 0; i < n; i++) {
|
for (size_t i = 0; i < n; i++) {
|
||||||
dst[i] = 0.1 + 2*cosf(i + offset);
|
dst[i] = 0.1 + 2*cosf(i + offset);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
float gigabytes_per_second(size_t bytes, int64_t usecs) {
|
static float gigabytes_per_second(size_t bytes, int64_t usecs) {
|
||||||
return bytes / (float) usecs * 1000000 / (1024*1024*1024);
|
return bytes / (float) usecs * 1000000 / (1024*1024*1024);
|
||||||
}
|
}
|
||||||
|
|
||||||
void * align_with_offset(void * ptr, int offset) {
|
static void * align_with_offset(void * ptr, int offset) {
|
||||||
size_t dummy_size = MAX_ALIGNMENT * 4;
|
size_t dummy_size = MAX_ALIGNMENT * 4;
|
||||||
return (char *) std::align(MAX_ALIGNMENT, MAX_ALIGNMENT, ptr, dummy_size) + offset;
|
return (char *) std::align(MAX_ALIGNMENT, MAX_ALIGNMENT, ptr, dummy_size) + offset;
|
||||||
}
|
}
|
||||||
|
|
||||||
void benchmark_function(size_t size, size_t q_size, int64_t iterations, const std::function<size_t(void)> & function) {
|
static void benchmark_function(size_t size, size_t q_size, int64_t iterations, const std::function<size_t(void)> & function) {
|
||||||
int64_t min_time_us = INT64_MAX;
|
int64_t min_time_us = INT64_MAX;
|
||||||
int64_t total_time_us = 0;
|
int64_t total_time_us = 0;
|
||||||
int64_t min_time_cycles = INT64_MAX;
|
int64_t min_time_cycles = INT64_MAX;
|
||||||
|
@ -110,7 +108,7 @@ void benchmark_function(size_t size, size_t q_size, int64_t iterations, const st
|
||||||
printf(" quantized throughput : %9.2f GB/s\n", gigabytes_per_second(q_size * iterations, total_time_us));
|
printf(" quantized throughput : %9.2f GB/s\n", gigabytes_per_second(q_size * iterations, total_time_us));
|
||||||
}
|
}
|
||||||
|
|
||||||
void usage(char * argv[]) {
|
static void usage(char * argv[]) {
|
||||||
printf("Benchmark quantization specific functions on synthetic data\n");
|
printf("Benchmark quantization specific functions on synthetic data\n");
|
||||||
printf("\n");
|
printf("\n");
|
||||||
printf("usage: %s [options]\n", argv[0]);
|
printf("usage: %s [options]\n", argv[0]);
|
||||||
|
@ -139,8 +137,6 @@ void usage(char * argv[]) {
|
||||||
printf(" set test iteration number (%d)\n", ITERATIONS);
|
printf(" set test iteration number (%d)\n", ITERATIONS);
|
||||||
}
|
}
|
||||||
|
|
||||||
} // namespace
|
|
||||||
|
|
||||||
int main(int argc, char * argv[]) {
|
int main(int argc, char * argv[]) {
|
||||||
quantize_perf_params params {};
|
quantize_perf_params params {};
|
||||||
|
|
||||||
|
|
|
@ -13,9 +13,7 @@
|
||||||
#include <algorithm>
|
#include <algorithm>
|
||||||
|
|
||||||
|
|
||||||
namespace {
|
static void dump(const llama_token_data_array * candidates) {
|
||||||
|
|
||||||
void dump(const llama_token_data_array * candidates) {
|
|
||||||
for (size_t i = 0; i < candidates->size; i++) {
|
for (size_t i = 0; i < candidates->size; i++) {
|
||||||
printf("%d: %f (%f)\n", candidates->data[i].id, candidates->data[i].p, candidates->data[i].logit);
|
printf("%d: %f (%f)\n", candidates->data[i].id, candidates->data[i].p, candidates->data[i].logit);
|
||||||
}
|
}
|
||||||
|
@ -24,9 +22,7 @@ void dump(const llama_token_data_array * candidates) {
|
||||||
#define DUMP(__candidates) do { printf("%s:%d (%s)\n", __FILE__, __LINE__, __func__); dump((__candidates)); printf("-\n"); } while(0)
|
#define DUMP(__candidates) do { printf("%s:%d (%s)\n", __FILE__, __LINE__, __func__); dump((__candidates)); printf("-\n"); } while(0)
|
||||||
|
|
||||||
|
|
||||||
void test_top_k(const std::vector<float> & probs,
|
static void test_top_k(const std::vector<float> & probs, const std::vector<float> & expected_probs, int k) {
|
||||||
const std::vector<float> & expected_probs,
|
|
||||||
int k) {
|
|
||||||
size_t n_vocab = probs.size();
|
size_t n_vocab = probs.size();
|
||||||
std::vector<llama_token_data> candidates;
|
std::vector<llama_token_data> candidates;
|
||||||
candidates.reserve(n_vocab);
|
candidates.reserve(n_vocab);
|
||||||
|
@ -48,10 +44,7 @@ void test_top_k(const std::vector<float> & probs,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
void test_top_p(const std::vector<float> & probs,
|
static void test_top_p(const std::vector<float> & probs, const std::vector<float> & expected_probs, float p) {
|
||||||
const std::vector<float> & expected_probs,
|
|
||||||
float p) {
|
|
||||||
|
|
||||||
size_t n_vocab = probs.size();
|
size_t n_vocab = probs.size();
|
||||||
std::vector<llama_token_data> candidates;
|
std::vector<llama_token_data> candidates;
|
||||||
candidates.reserve(n_vocab);
|
candidates.reserve(n_vocab);
|
||||||
|
@ -73,9 +66,7 @@ void test_top_p(const std::vector<float> & probs,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
void test_tfs(const std::vector<float> & probs,
|
static void test_tfs(const std::vector<float> & probs, const std::vector<float> & expected_probs, float z) {
|
||||||
const std::vector<float> & expected_probs,
|
|
||||||
float z) {
|
|
||||||
size_t n_vocab = probs.size();
|
size_t n_vocab = probs.size();
|
||||||
std::vector<llama_token_data> candidates;
|
std::vector<llama_token_data> candidates;
|
||||||
candidates.reserve(n_vocab);
|
candidates.reserve(n_vocab);
|
||||||
|
@ -96,9 +87,7 @@ void test_tfs(const std::vector<float> & probs,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
void test_typical(const std::vector<float> & probs,
|
static void test_typical(const std::vector<float> & probs, const std::vector<float> & expected_probs, float p) {
|
||||||
const std::vector<float> & expected_probs,
|
|
||||||
float p) {
|
|
||||||
size_t n_vocab = probs.size();
|
size_t n_vocab = probs.size();
|
||||||
std::vector<llama_token_data> candidates;
|
std::vector<llama_token_data> candidates;
|
||||||
candidates.reserve(n_vocab);
|
candidates.reserve(n_vocab);
|
||||||
|
@ -119,11 +108,10 @@ void test_typical(const std::vector<float> & probs,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
void test_repetition_penalty(
|
static void test_repetition_penalty(
|
||||||
const std::vector<float> & probs,
|
const std::vector<float> & probs, const std::vector<llama_token> & last_tokens,
|
||||||
const std::vector<llama_token> & last_tokens,
|
const std::vector<float> & expected_probs, float penalty
|
||||||
const std::vector<float> & expected_probs,
|
) {
|
||||||
float penalty) {
|
|
||||||
assert(probs.size() == expected_probs.size());
|
assert(probs.size() == expected_probs.size());
|
||||||
|
|
||||||
size_t n_vocab = probs.size();
|
size_t n_vocab = probs.size();
|
||||||
|
@ -148,11 +136,10 @@ void test_repetition_penalty(
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
void test_frequency_presence_penalty(
|
static void test_frequency_presence_penalty(
|
||||||
const std::vector<float> & probs,
|
const std::vector<float> & probs, const std::vector<llama_token> & last_tokens,
|
||||||
const std::vector<llama_token> & last_tokens,
|
const std::vector<float> & expected_probs, float alpha_frequency, float alpha_presence
|
||||||
const std::vector<float> & expected_probs,
|
) {
|
||||||
float alpha_frequency, float alpha_presence) {
|
|
||||||
assert(probs.size() == expected_probs.size());
|
assert(probs.size() == expected_probs.size());
|
||||||
|
|
||||||
size_t n_vocab = probs.size();
|
size_t n_vocab = probs.size();
|
||||||
|
@ -176,8 +163,6 @@ void test_frequency_presence_penalty(
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
} // namespace
|
|
||||||
|
|
||||||
int main(void) {
|
int main(void) {
|
||||||
ggml_time_init();
|
ggml_time_init();
|
||||||
|
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue