Merge branch 'master' into concedo_experimental

# Conflicts:
#	tests/test-grad0.c
Commit f5247be0d7 by Concedo, 2023-06-14 11:35:43 +08:00
10 changed files with 5442 additions and 265 deletions


@@ -37,6 +37,7 @@ else()
add_subdirectory(save-load-state)
add_subdirectory(benchmark)
add_subdirectory(baby-llama)
add_subdirectory(train-text-from-scratch)
if (LLAMA_METAL)
add_subdirectory(metal)
endif()


@@ -79,34 +79,39 @@ struct ggml_tensor * randomize_tensor_normal(
         int ndims,
         const int64_t ne[],
         struct random_normal_distribution * rnd) {
+    float scale = 1.0; // xavier
     switch (ndims) {
         case 1:
+            scale /= sqrtf(ne[0]);
             for (int i0 = 0; i0 < ne[0]; i0++) {
-                ((float *)tensor->data)[i0] = frand_normal(rnd);
+                ((float *)tensor->data)[i0] = scale * frand_normal(rnd);
             }
             break;
         case 2:
+            scale /= sqrtf(ne[0]+ne[1]);
             for (int i1 = 0; i1 < ne[1]; i1++) {
                 for (int i0 = 0; i0 < ne[0]; i0++) {
-                    ((float *)tensor->data)[i1*ne[0] + i0] = frand_normal(rnd);
+                    ((float *)tensor->data)[i1*ne[0] + i0] = scale * frand_normal(rnd);
                 }
             }
             break;
         case 3:
+            scale /= sqrtf(ne[0]+ne[1]);
             for (int i2 = 0; i2 < ne[2]; i2++) {
                 for (int i1 = 0; i1 < ne[1]; i1++) {
                     for (int i0 = 0; i0 < ne[0]; i0++) {
-                        ((float *)tensor->data)[i2*ne[1]*ne[0] + i1*ne[0] + i0] = frand_normal(rnd);
+                        ((float *)tensor->data)[i2*ne[1]*ne[0] + i1*ne[0] + i0] = scale * frand_normal(rnd);
                     }
                 }
             }
             break;
         case 4:
+            scale /= sqrtf(ne[0]+ne[1]);
             for (int i3 = 0; i3 < ne[3]; i3++) {
                 for (int i2 = 0; i2 < ne[2]; i2++) {
                     for (int i1 = 0; i1 < ne[1]; i1++) {
                         for (int i0 = 0; i0 < ne[0]; i0++) {
-                            ((float *)tensor->data)[i3*ne[2]*ne[1]*ne[0] + i2*ne[1]*ne[0] + i1*ne[0] + i0] = frand_normal(rnd);
+                            ((float *)tensor->data)[i3*ne[2]*ne[1]*ne[0] + i2*ne[1]*ne[0] + i1*ne[0] + i0] = scale * frand_normal(rnd);
                         }
                     }
                 }
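
The hunk above switches baby-llama's random init to Xavier/Glorot-style scaling: weights are still drawn from a unit normal, but multiplied by 1/sqrtf of the fan dimensions (ne[0], or ne[0]+ne[1] for higher-rank tensors). A minimal self-contained sketch of the same idea, using the C++ standard library instead of the example's frand_normal helper (the function and variable names below are illustrative, not part of the commit):

```cpp
#include <cmath>
#include <random>
#include <vector>

// Fill a fan_in x fan_out weight matrix with Xavier-scaled normal values.
// Mirrors the `scale /= sqrtf(ne[0]+ne[1])` logic added in the hunk above.
std::vector<float> xavier_normal(int fan_in, int fan_out, unsigned seed = 42) {
    std::mt19937 rng(seed);
    std::normal_distribution<float> dist(0.0f, 1.0f);

    const float scale = 1.0f / std::sqrt((float)(fan_in + fan_out));

    std::vector<float> w((size_t)fan_in * fan_out);
    for (float & x : w) {
        x = scale * dist(rng); // same role as scale * frand_normal(rnd) in the diff
    }
    return w;
}
```
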
@@ -148,8 +153,8 @@ struct llama_hparams_lora {
     uint32_t n_rot = 64;
     uint32_t n_lora = 64;
 
-    bool operator!=(const llama_hparams & other) const {
-        return memcmp(this, &other, sizeof(llama_hparams));
+    bool operator!=(const llama_hparams_lora & other) const {
+        return memcmp(this, &other, sizeof(llama_hparams_lora)) != 0;
     }
 };


@@ -331,6 +331,13 @@ int main(int argc, char ** argv) {
std::vector<llama_token> embd;
// do one empty run to warm up the model
{
const std::vector<llama_token> tmp = { llama_token_bos(), };
llama_eval(ctx, tmp.data(), tmp.size(), 0, params.n_threads);
llama_reset_timings(ctx);
}
while ((n_remain != 0 && !is_antiprompt) || params.interactive) {
// predict
if (embd.size() > 0) {


@@ -0,0 +1,4 @@
set(TARGET train-text-from-scratch)
add_executable(${TARGET} train-text-from-scratch.cpp)
target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_11)


@@ -0,0 +1,22 @@
# train-text-from-scratch
Basic usage instructions:
```bash
# get training data
wget https://github.com/brunoklein99/deep-learning-notes/blob/master/shakespeare.txt
# train
./bin/train-text-from-scratch \
--vocab-model ../models/ggml-vocab.bin \
--ctx 64 --embd 256 --head 8 --layer 16 \
--checkpoint-in chk-shakespeare-256x16.bin \
--checkpoint-out chk-shakespeare-256x16.bin \
--model-out ggml-shakespeare-256x16-f32.bin \
--train-data "shakespeare.txt" \
-t 6 -b 16 -n 32 --seed 1 --adam-iter 16 \
--print-details-interval 0 --predict 16 --use-flash
# predict
./bin/main -m ggml-shakespeare-256x16-f32.bin
```

File diff suppressed because it is too large.

ggml.c (2097 changed lines)
File diff suppressed because it is too large.

ggml.h (127 changed lines)

@@ -296,6 +296,7 @@ extern "C" {
GGML_OP_SUM_ROWS,
GGML_OP_MEAN,
GGML_OP_REPEAT,
GGML_OP_REPEAT_BACK,
GGML_OP_ABS,
GGML_OP_SGN,
GGML_OP_NEG,
@@ -309,6 +310,7 @@ extern "C" {
GGML_OP_RMS_NORM_BACK,
GGML_OP_MUL_MAT,
GGML_OP_OUT_PROD,
GGML_OP_SCALE,
GGML_OP_SET,
@@ -324,6 +326,7 @@ extern "C" {
GGML_OP_DIAG_MASK_INF,
GGML_OP_DIAG_MASK_ZERO,
GGML_OP_SOFT_MAX,
GGML_OP_SOFT_MAX_BACK,
GGML_OP_ROPE,
GGML_OP_ROPE_BACK,
GGML_OP_ALIBI,
@@ -333,10 +336,14 @@ extern "C" {
GGML_OP_FLASH_ATTN,
GGML_OP_FLASH_FF,
GGML_OP_FLASH_ATTN_BACK,
GGML_OP_MAP_UNARY,
GGML_OP_MAP_BINARY,
GGML_OP_CROSS_ENTROPY_LOSS,
GGML_OP_CROSS_ENTROPY_LOSS_BACK,
GGML_OP_COUNT,
};
@@ -574,6 +581,11 @@ extern "C" {
struct ggml_tensor * a,
struct ggml_tensor * b);
GGML_API struct ggml_tensor * ggml_add1_inplace(
struct ggml_context * ctx,
struct ggml_tensor * a,
struct ggml_tensor * b);
GGML_API struct ggml_tensor * ggml_acc(
struct ggml_context * ctx,
struct ggml_tensor * a,
@@ -645,6 +657,11 @@ extern "C" {
struct ggml_tensor * a,
struct ggml_tensor * b);
GGML_API struct ggml_tensor * ggml_repeat_back(
struct ggml_context * ctx,
struct ggml_tensor * a,
struct ggml_tensor * b);
GGML_API struct ggml_tensor * ggml_abs(
struct ggml_context * ctx,
struct ggml_tensor * a);
@@ -698,14 +715,22 @@ extern "C" {
             struct ggml_tensor * a,
             struct ggml_tensor * b);
 
-    // A: m rows, n columns
-    // B: p rows, n columns (i.e. we transpose it internally)
+    // A: n columns, m rows
+    // B: n columns, p rows (i.e. we transpose it internally)
     // result is m columns, p rows
     GGML_API struct ggml_tensor * ggml_mul_mat(
             struct ggml_context * ctx,
             struct ggml_tensor * a,
             struct ggml_tensor * b);
 
+    // A: m columns, n rows,
+    // B: p columns, n rows,
+    // result is m columns, p rows
+    GGML_API struct ggml_tensor * ggml_out_prod(
+            struct ggml_context * ctx,
+            struct ggml_tensor * a,
+            struct ggml_tensor * b);
+
     //
     // operations on tensors without backpropagation
     //
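
The corrected comments above encode ggml's dimension convention: ne[0] is the number of columns (the contiguous dimension), ne[1] the number of rows, and ggml_mul_mat contracts its two inputs over ne[0]. A rough sketch of what that means in practice, assuming a small throwaway ggml context; the sizes n, m, p are illustrative:

```cpp
#include <stdio.h>
#include "ggml.h"

int main() {
    const int n = 4, m = 3, p = 2; // illustrative sizes

    struct ggml_init_params ip = { /*mem_size*/ 16u*1024*1024, /*mem_buffer*/ NULL, /*no_alloc*/ false };
    struct ggml_context * ctx = ggml_init(ip);

    // A: n columns, m rows;  B: n columns, p rows (B is "transposed internally")
    struct ggml_tensor * a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n, m);
    struct ggml_tensor * b = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n, p);

    struct ggml_tensor * c = ggml_mul_mat(ctx, a, b);
    // result is m columns, p rows, as the header comment says
    printf("mul_mat result: %lld columns x %lld rows\n", (long long) c->ne[0], (long long) c->ne[1]);

    ggml_free(ctx);
    return 0;
}
```

ggml_out_prod follows the second comment block: A with m columns and n rows, B with p columns and n rows, again yielding an m-columns, p-rows result.
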
@@ -916,6 +941,17 @@ extern "C" {
struct ggml_context * ctx,
struct ggml_tensor * a);
GGML_API struct ggml_tensor * ggml_soft_max_back(
struct ggml_context * ctx,
struct ggml_tensor * a,
struct ggml_tensor * b);
// in-place, returns view(a)
GGML_API struct ggml_tensor * ggml_soft_max_back_inplace(
struct ggml_context * ctx,
struct ggml_tensor * a,
struct ggml_tensor * b);
// rotary position embedding
// if mode & 1 == 1, skip n_past elements
// if mode & 2 == 1, GPT-NeoX style
@@ -982,6 +1018,14 @@ extern "C" {
struct ggml_tensor * v,
bool masked);
GGML_API struct ggml_tensor * ggml_flash_attn_back(
struct ggml_context * ctx,
struct ggml_tensor * q,
struct ggml_tensor * k,
struct ggml_tensor * v,
struct ggml_tensor * d,
bool masked);
GGML_API struct ggml_tensor * ggml_flash_ff(
struct ggml_context * ctx,
struct ggml_tensor * a,
@@ -1005,6 +1049,19 @@ extern "C" {
struct ggml_tensor * b,
ggml_binary_op_f32_t fun);
// loss function
GGML_API struct ggml_tensor * ggml_cross_entropy_loss(
struct ggml_context * ctx,
struct ggml_tensor * a,
struct ggml_tensor * b);
GGML_API struct ggml_tensor * ggml_cross_entropy_loss_back(
struct ggml_context * ctx,
struct ggml_tensor * a,
struct ggml_tensor * b,
struct ggml_tensor * c);
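
The new loss ops give ggml a scalar training objective, which is what the train-text-from-scratch example minimizes. A rough forward-only usage sketch, assuming the graph API of this era (ggml_build_forward followed by ggml_graph_compute); the vocabulary size and fill values are illustrative:

```cpp
#include <stdio.h>
#include <string.h>
#include "ggml.h"

int main() {
    struct ggml_init_params ip = { /*mem_size*/ 16u*1024*1024, /*mem_buffer*/ NULL, /*no_alloc*/ false };
    struct ggml_context * ctx = ggml_init(ip);

    const int n_vocab = 4; // illustrative
    struct ggml_tensor * logits  = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_vocab);
    struct ggml_tensor * targets = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_vocab);

    // toy data: arbitrary logits, one-hot target on index 2
    const float l[4] = { 0.1f, 0.2f, 1.5f, 0.3f };
    const float t[4] = { 0.0f, 0.0f, 1.0f, 0.0f };
    memcpy(logits->data,  l, sizeof(l));
    memcpy(targets->data, t, sizeof(t));

    struct ggml_tensor * loss = ggml_cross_entropy_loss(ctx, logits, targets);

    struct ggml_cgraph gf = ggml_build_forward(loss);
    ggml_graph_compute(ctx, &gf);

    printf("loss = %f\n", ggml_get_f32_1d(loss, 0));
    ggml_free(ctx);
    return 0;
}
```

ggml_cross_entropy_loss_back is the matching backward op; it is wired into the autodiff pass rather than called directly in user code.
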
//
// automatic differentiation
//
@@ -1099,6 +1156,8 @@ extern "C" {
struct {
int n_iter;
float sched; // schedule multiplier (fixed, decay or warmup)
float decay; // weight decay for AdamW, use 0.0f to disable
float alpha; // learning rate
float beta1;
float beta2;
@@ -1123,6 +1182,49 @@ extern "C" {
} lbfgs;
};
struct ggml_opt_context {
struct ggml_context * ctx;
struct ggml_opt_params params;
int iter;
int64_t nx; // number of parameter elements
bool just_initialized;
struct {
struct ggml_tensor * x; // view of the parameters
struct ggml_tensor * g1; // gradient
struct ggml_tensor * g2; // gradient squared
struct ggml_tensor * m; // first moment
struct ggml_tensor * v; // second moment
struct ggml_tensor * mh; // first moment hat
struct ggml_tensor * vh; // second moment hat
struct ggml_tensor * pf; // past function values
float fx_best;
float fx_prev;
int n_no_improvement;
} adam;
struct {
struct ggml_tensor * x; // current parameters
struct ggml_tensor * xp; // previous parameters
struct ggml_tensor * g; // current gradient
struct ggml_tensor * gp; // previous gradient
struct ggml_tensor * d; // search direction
struct ggml_tensor * pf; // past function values
struct ggml_tensor * lmal; // the L-BFGS memory alpha
struct ggml_tensor * lmys; // the L-BFGS memory ys
struct ggml_tensor * lms; // the L-BFGS memory s
struct ggml_tensor * lmy; // the L-BFGS memory y
float fx_best;
float step;
int j;
int k;
int end;
int n_no_improvement;
} lbfgs;
};
GGML_API struct ggml_opt_params ggml_opt_default_params(enum ggml_opt_type type);
// optimize the function defined by the tensor f
@@ -1131,6 +1233,27 @@ extern "C" {
struct ggml_opt_params params,
struct ggml_tensor * f);
// initialize optimizer context
GGML_API void ggml_opt_init(
struct ggml_context * ctx,
struct ggml_opt_context * opt,
struct ggml_opt_params params,
int64_t nx);
// continue optimizing the function defined by the tensor f
GGML_API enum ggml_opt_result ggml_opt_resume(
struct ggml_context * ctx,
struct ggml_opt_context * opt,
struct ggml_tensor * f);
// continue optimizing the function defined by the tensor f
GGML_API enum ggml_opt_result ggml_opt_resume_g(
struct ggml_context * ctx,
struct ggml_opt_context * opt,
struct ggml_tensor * f,
struct ggml_cgraph * gf,
struct ggml_cgraph * gb);
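
Together with ggml_opt_init, the resume entry points let a caller keep optimizer state (Adam moments, past loss values, and so on, per the struct above) alive across calls instead of restarting from scratch each time; the training example's checkpointing builds on this. A hedged, self-contained sketch of the call sequence on a toy objective; the exact numbers are illustrative:

```cpp
#include <stdio.h>
#include "ggml.h"

int main() {
    struct ggml_init_params ip = { /*mem_size*/ 16u*1024*1024, /*mem_buffer*/ NULL, /*no_alloc*/ false };
    struct ggml_context * ctx = ggml_init(ip);

    // toy objective: f = sum_i x_i^2, minimized at x = 0
    const int64_t nx = 8;
    struct ggml_tensor * x = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, nx);
    ggml_set_f32(x, 1.0f);
    ggml_set_param(ctx, x);

    struct ggml_tensor * f = ggml_sum(ctx, ggml_sqr(ctx, x));

    struct ggml_opt_params params = ggml_opt_default_params(GGML_OPT_ADAM);
    params.adam.n_iter = 16; // iterations per resume call

    struct ggml_opt_context opt;
    ggml_opt_init(ctx, &opt, params, nx);

    // each call continues from the state kept in `opt`
    ggml_opt_resume(ctx, &opt, f);
    ggml_opt_resume(ctx, &opt, f);

    printf("f after two resumed runs: %f\n", ggml_get_f32_1d(f, 0));
    ggml_free(ctx);
    return 0;
}
```
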
//
// quantization
//


@@ -1036,6 +1036,12 @@ static void llama_model_load_internal(
case 40: model.type = e_model::MODEL_13B; break;
case 60: model.type = e_model::MODEL_30B; break;
case 80: model.type = e_model::MODEL_65B; break;
default:
{
if (hparams.n_layer < 32) {
model.type = e_model::MODEL_7B;
}
} break;
}
hparams.n_ctx = n_ctx;
@@ -1200,6 +1206,7 @@ static void llama_model_load_internal(
mem_required / 1024.0 / 1024.0, mem_required_state / 1024.0 / 1024.0);
(void) vram_scratch;
(void) n_batch;
#ifdef GGML_USE_CUBLAS
vram_scratch = n_batch * MB;
ggml_cuda_set_scratch_size(vram_scratch);
@@ -1227,6 +1234,7 @@ static void llama_model_load_internal(
model.tensors_by_name.emplace_back(lt.name, lt.ggml_tensor);
}
(void) tensor_split;
#if defined(GGML_USE_CUBLAS)
{
ggml_cuda_set_tensor_split(tensor_split);
@@ -2161,6 +2169,10 @@ llama_token llama_sample_token_mirostat_v2(struct llama_context * ctx, llama_tok
return -log2f(candidate.p) > *mu;
}));
if (candidates->size == 0) {
candidates->size = 1;
}
// Normalize the probabilities of the remaining words
llama_sample_softmax(ctx, candidates);
@@ -3287,6 +3299,19 @@ int llama_n_embd(const struct llama_context * ctx) {
return ctx->model.hparams.n_embd;
}
int llama_get_vocab(
const struct llama_context * ctx,
const char * * strings,
float * scores,
int capacity) {
int n = std::min(capacity, (int) ctx->vocab.id_to_token.size());
for (int i = 0; i<n; ++i) {
strings[i] = ctx->vocab.id_to_token[i].tok.c_str();
scores[i] = ctx->vocab.id_to_token[i].score;
}
return n;
}
float * llama_get_logits(struct llama_context * ctx) {
return ctx->logits.data();
}


@@ -220,6 +220,14 @@ extern "C" {
LLAMA_API int llama_n_ctx (const struct llama_context * ctx);
LLAMA_API int llama_n_embd (const struct llama_context * ctx);
// Get the vocabulary as output parameters.
// Returns number of results.
LLAMA_API int llama_get_vocab(
const struct llama_context * ctx,
const char * * strings,
float * scores,
int capacity);
// Token logits obtained from the last call to llama_eval()
// The logits for the last token are stored in the last row
// Can be mutated in order to change the probabilities of the next token
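
A short usage sketch for the new accessor, assuming an already-initialized llama_context (e.g. from llama_init_from_file): the caller supplies output arrays sized to the vocabulary and the function reports how many entries it filled.

```cpp
#include <cstdio>
#include <vector>
#include "llama.h"

// Dump the whole vocabulary with scores; `ctx` must be a valid llama_context.
void dump_vocab(struct llama_context * ctx) {
    const int n_vocab = llama_n_vocab(ctx);

    std::vector<const char *> strings(n_vocab);
    std::vector<float>        scores(n_vocab);

    const int n = llama_get_vocab(ctx, strings.data(), scores.data(), n_vocab);
    for (int i = 0; i < n; ++i) {
        printf("%6d  %-24s  %8.3f\n", i, strings[i], scores[i]);
    }
}
```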