fix get_samples call, add model tensor names, increase model size, start training samples after newline
This commit is contained in:
parent
e063135d0b
commit
d328472f16
1 changed files with 72 additions and 45 deletions
|
@ -8,6 +8,7 @@
|
||||||
#include <stdexcept>
|
#include <stdexcept>
|
||||||
#include <cstdarg>
|
#include <cstdarg>
|
||||||
#include <algorithm>
|
#include <algorithm>
|
||||||
|
#include <string>
|
||||||
|
|
||||||
|
|
||||||
struct random_normal_distribution {
|
struct random_normal_distribution {
|
||||||
|
@ -162,22 +163,6 @@ struct my_llama_hparams {
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
uint32_t get_n_ff(const struct my_llama_hparams* hparams) {
|
|
||||||
const uint32_t n_ff = ((2*(4*hparams->n_embd)/3 + hparams->n_mult - 1)/hparams->n_mult)*hparams->n_mult;
|
|
||||||
return n_ff;
|
|
||||||
}
|
|
||||||
|
|
||||||
void print_params(struct my_llama_hparams * params) {
|
|
||||||
printf("%s: n_vocab: %d\n", __func__, params->n_vocab);
|
|
||||||
printf("%s: n_ctx: %d\n", __func__, params->n_ctx);
|
|
||||||
printf("%s: n_embd: %d\n", __func__, params->n_embd);
|
|
||||||
printf("%s: n_mult: %d\n", __func__, params->n_mult);
|
|
||||||
printf("%s: n_head: %d\n", __func__, params->n_head);
|
|
||||||
printf("%s: n_ff: %d\n", __func__, get_n_ff(params));
|
|
||||||
printf("%s: n_layer: %d\n", __func__, params->n_layer);
|
|
||||||
printf("%s: n_rot: %d\n", __func__, params->n_rot);
|
|
||||||
}
|
|
||||||
|
|
||||||
struct my_llama_layer {
|
struct my_llama_layer {
|
||||||
// normalization
|
// normalization
|
||||||
struct ggml_tensor * attention_norm;
|
struct ggml_tensor * attention_norm;
|
||||||
|
@ -221,6 +206,22 @@ struct my_llama_model {
|
||||||
std::vector<my_llama_layer> layers;
|
std::vector<my_llama_layer> layers;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
uint32_t get_n_ff(const struct my_llama_hparams* hparams) {
|
||||||
|
const uint32_t n_ff = ((2*(4*hparams->n_embd)/3 + hparams->n_mult - 1)/hparams->n_mult)*hparams->n_mult;
|
||||||
|
return n_ff;
|
||||||
|
}
|
||||||
|
|
||||||
|
void print_params(struct my_llama_hparams * params) {
|
||||||
|
printf("%s: n_vocab: %d\n", __func__, params->n_vocab);
|
||||||
|
printf("%s: n_ctx: %d\n", __func__, params->n_ctx);
|
||||||
|
printf("%s: n_embd: %d\n", __func__, params->n_embd);
|
||||||
|
printf("%s: n_mult: %d\n", __func__, params->n_mult);
|
||||||
|
printf("%s: n_head: %d\n", __func__, params->n_head);
|
||||||
|
printf("%s: n_ff: %d\n", __func__, get_n_ff(params));
|
||||||
|
printf("%s: n_layer: %d\n", __func__, params->n_layer);
|
||||||
|
printf("%s: n_rot: %d\n", __func__, params->n_rot);
|
||||||
|
}
|
||||||
|
|
||||||
void init_model(struct my_llama_model * model) {
|
void init_model(struct my_llama_model * model) {
|
||||||
const auto & hparams = model->hparams;
|
const auto & hparams = model->hparams;
|
||||||
|
|
||||||
|
@ -232,32 +233,48 @@ void init_model(struct my_llama_model * model) {
|
||||||
|
|
||||||
struct ggml_context * ctx = model->ctx;
|
struct ggml_context * ctx = model->ctx;
|
||||||
|
|
||||||
model->tok_embeddings = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_vocab); // ("tok_embeddings.weight", {n_embd, n_vocab});
|
model->tok_embeddings = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_vocab);
|
||||||
model->norm = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); // ("norm.weight", {n_embd});
|
model->norm = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
|
||||||
model->output = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_vocab); // ("output.weight", {n_embd, n_vocab});
|
model->output = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_vocab);
|
||||||
|
|
||||||
|
ggml_set_name(model->tok_embeddings, "tok_embeddings.weight");
|
||||||
|
ggml_set_name(model->norm, "norm.weight");
|
||||||
|
ggml_set_name(model->output, "output.weight");
|
||||||
|
|
||||||
model->layers.resize(n_layer);
|
model->layers.resize(n_layer);
|
||||||
for (uint32_t i = 0; i < n_layer; ++i) {
|
for (uint32_t i = 0; i < n_layer; ++i) {
|
||||||
auto & layer = model->layers[i];
|
auto & layer = model->layers[i];
|
||||||
|
|
||||||
// std::string layers_i = "layers." + std::to_string(i);
|
std::string layers_i = "layers." + std::to_string(i);
|
||||||
|
|
||||||
layer.attention_norm = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); // (layers_i + ".attention_norm.weight", {n_embd});
|
layer.attention_norm = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
|
||||||
|
|
||||||
layer.wq = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_embd); // (layers_i + ".attention.wq.weight", {n_embd, n_embd});
|
layer.wq = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_embd);
|
||||||
layer.wk = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_embd); // (layers_i + ".attention.wk.weight", {n_embd, n_embd});
|
layer.wk = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_embd);
|
||||||
layer.wv = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_embd); // (layers_i + ".attention.wv.weight", {n_embd, n_embd});
|
layer.wv = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_embd);
|
||||||
layer.wo = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_embd); // (layers_i + ".attention.wo.weight", {n_embd, n_embd});
|
layer.wo = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_embd);
|
||||||
|
|
||||||
layer.ffn_norm = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); // (layers_i + ".ffn_norm.weight", {n_embd});
|
layer.ffn_norm = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
|
||||||
|
|
||||||
layer.w1 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_ff); // (layers_i + ".feed_forward.w1.weight", {n_embd, n_ff});
|
layer.w1 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_ff);
|
||||||
layer.w2 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_ff, n_embd); // (layers_i + ".feed_forward.w2.weight", { n_ff, n_embd});
|
layer.w2 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_ff, n_embd);
|
||||||
layer.w3 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_ff); // (layers_i + ".feed_forward.w3.weight", {n_embd, n_ff});
|
layer.w3 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_ff);
|
||||||
|
|
||||||
|
ggml_set_name(layer.attention_norm, (layers_i + ".attention_norm.weight").c_str());
|
||||||
|
|
||||||
|
ggml_set_name(layer.wq, (layers_i + ".attention.wq.weight").c_str());
|
||||||
|
ggml_set_name(layer.wk, (layers_i + ".attention.wk.weight").c_str());
|
||||||
|
ggml_set_name(layer.wv, (layers_i + ".attention.wv.weight").c_str());
|
||||||
|
ggml_set_name(layer.wo, (layers_i + ".attention.wo.weight").c_str());
|
||||||
|
|
||||||
|
ggml_set_name(layer.ffn_norm, (layers_i + ".ffn_norm.weight").c_str());
|
||||||
|
|
||||||
|
ggml_set_name(layer.w1, (layers_i + ".feed_forward.w1.weight").c_str());
|
||||||
|
ggml_set_name(layer.w2, (layers_i + ".feed_forward.w2.weight").c_str());
|
||||||
|
ggml_set_name(layer.w3, (layers_i + ".feed_forward.w3.weight").c_str());
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
void set_param_model(struct my_llama_model * model) {
|
void set_param_model(struct my_llama_model * model) {
|
||||||
const auto& hparams = model->hparams;
|
const auto& hparams = model->hparams;
|
||||||
|
|
||||||
|
@ -676,7 +693,6 @@ struct ggml_tensor * forward_batch(
|
||||||
cur),
|
cur),
|
||||||
n_embd, N, n_batch),
|
n_embd, N, n_batch),
|
||||||
1, 0, 2, 3));
|
1, 0, 2, 3));
|
||||||
|
|
||||||
assert_shape_3d(Vcur, N, n_embd, n_batch);
|
assert_shape_3d(Vcur, N, n_embd, n_batch);
|
||||||
|
|
||||||
// kv_self.k shape [n_embd * n_ctx * n_batch * n_layer]
|
// kv_self.k shape [n_embd * n_ctx * n_batch * n_layer]
|
||||||
|
@ -1366,12 +1382,12 @@ int main(int argc, char ** argv) {
|
||||||
|
|
||||||
struct my_llama_model model;
|
struct my_llama_model model;
|
||||||
model.hparams.n_vocab = llama_n_vocab(lctx);
|
model.hparams.n_vocab = llama_n_vocab(lctx);
|
||||||
model.hparams.n_ctx = 16;
|
model.hparams.n_ctx = 32;
|
||||||
model.hparams.n_embd = 64;
|
model.hparams.n_embd = 128;
|
||||||
model.hparams.n_mult = 8;
|
model.hparams.n_mult = 64;
|
||||||
model.hparams.n_head = 8;
|
model.hparams.n_head = 16;
|
||||||
model.hparams.n_layer = 1;
|
model.hparams.n_layer = 4;
|
||||||
model.hparams.n_rot = std::min(16u, model.hparams.n_embd / model.hparams.n_head);
|
model.hparams.n_rot = std::min(64u, model.hparams.n_embd / model.hparams.n_head);
|
||||||
|
|
||||||
print_params(&model.hparams);
|
print_params(&model.hparams);
|
||||||
|
|
||||||
|
@ -1383,13 +1399,18 @@ int main(int argc, char ** argv) {
|
||||||
token_occurs[train_tokens[i]] = true;
|
token_occurs[train_tokens[i]] = true;
|
||||||
token_notavail[train_tokens[i]] = false;
|
token_notavail[train_tokens[i]] = false;
|
||||||
}
|
}
|
||||||
|
int n_unique_tokens = 0;
|
||||||
|
for (int i=0; i<token_occurs.size(); ++i) {
|
||||||
|
n_unique_tokens += token_occurs[i] ? 1 : 0;
|
||||||
|
}
|
||||||
|
printf("%s: number of unique tokens: %d\n", __func__, n_unique_tokens);
|
||||||
|
|
||||||
struct my_llama_kv_cache kv_self;
|
struct my_llama_kv_cache kv_self;
|
||||||
|
|
||||||
int n_batch = 8;
|
int n_batch = 32;
|
||||||
|
|
||||||
struct ggml_init_params lcparams;
|
struct ggml_init_params lcparams;
|
||||||
lcparams.mem_size = 1024ll*1024ll*1024ll;
|
lcparams.mem_size = 1024ll*1024ll*1024ll*8ll;
|
||||||
lcparams.mem_buffer = NULL;
|
lcparams.mem_buffer = NULL;
|
||||||
lcparams.no_alloc = false;
|
lcparams.no_alloc = false;
|
||||||
|
|
||||||
|
@ -1414,8 +1435,11 @@ int main(int argc, char ** argv) {
|
||||||
int n_vocab = model.hparams.n_vocab;
|
int n_vocab = model.hparams.n_vocab;
|
||||||
|
|
||||||
std::vector<int> train_samples;
|
std::vector<int> train_samples;
|
||||||
for (int i=0; i<train_tokens.size()-n_tokens; ++i) {
|
train_samples.push_back(0);
|
||||||
train_samples.push_back(i);
|
for (int i=1; i<train_tokens.size()-n_tokens; ++i) {
|
||||||
|
if (train_tokens[i-1] == llama_token_nl()) {
|
||||||
|
train_samples.push_back(i);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
shuffle_ints(train_samples.data(), train_samples.data() + train_samples.size());
|
shuffle_ints(train_samples.data(), train_samples.data() + train_samples.size());
|
||||||
for (int i=0; i<train_samples.size(); ++i) {
|
for (int i=0; i<train_samples.size(); ++i) {
|
||||||
|
@ -1447,13 +1471,16 @@ int main(int argc, char ** argv) {
|
||||||
int n_past = 0;
|
int n_past = 0;
|
||||||
|
|
||||||
ggml_cgraph gf = {};
|
ggml_cgraph gf = {};
|
||||||
gf.n_threads = 4;
|
gf.n_threads = 6;
|
||||||
|
|
||||||
get_example_targets_batch(ctx0, train_tokens.data(), train_tokens.size(), ex, tokens_input, targets);
|
get_example_targets_batch(ctx0, train_samples.data(), train_samples.size(), train_tokens.data(), train_tokens.size(), ex, tokens_input, targets);
|
||||||
|
|
||||||
struct ggml_tensor * logits = forward_batch(&model, &kv_self, ctx0, &gf, tokens_input, n_tokens, n_past, n_batch);
|
struct ggml_tensor * logits = forward_batch(&model, &kv_self, ctx0, &gf, tokens_input, n_tokens, n_past, n_batch);
|
||||||
struct ggml_tensor * e = square_error_loss(ctx0, targets, logits);
|
struct ggml_tensor * se = square_error_loss(ctx0, targets, logits);
|
||||||
// struct ggml_tensor * e = cross_entropy_loss(ctx0, targets, logits);
|
// struct ggml_tensor * ce = cross_entropy_loss(ctx0, targets, logits);
|
||||||
|
// struct ggml_tensor * e = ggml_add(ctx0, se, ce);
|
||||||
|
// struct ggml_tensor * e = ce;
|
||||||
|
struct ggml_tensor * e = se;
|
||||||
|
|
||||||
ggml_build_forward_expand(&gf, e);
|
ggml_build_forward_expand(&gf, e);
|
||||||
ggml_graph_compute(ctx0, &gf);
|
ggml_graph_compute(ctx0, &gf);
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue