diff --git a/examples/baby-llama/baby-llama-text.cpp b/examples/baby-llama/baby-llama-text.cpp
index b5177ed5b..ed7dc9666 100644
--- a/examples/baby-llama/baby-llama-text.cpp
+++ b/examples/baby-llama/baby-llama-text.cpp
@@ -7,6 +7,7 @@
 #include <cstring>
 #include <cstdio>
 #include <cstdlib>
+#include <algorithm>
 
 
 struct random_normal_distribution {
@@ -42,6 +43,10 @@ float fclamp(const float v, const float min, const float max) {
     return ((v < min) ? (min) : (v > max) ? (max) : v);
 }
 
+float frand() {
+    return (float)rand()/(float)RAND_MAX;
+}
+
 float frand_normal(struct random_normal_distribution * rnd) {
     return fclamp(rnd->rd(rnd->gen), rnd->min, rnd->max);
 }
@@ -162,6 +167,17 @@ uint32_t get_n_ff(const struct my_llama_hparams* hparams) {
     return n_ff;
 }
 
+void print_params(struct my_llama_hparams * params) {
+    printf("%s: n_vocab: %d\n", __func__, params->n_vocab);
+    printf("%s: n_ctx:   %d\n", __func__, params->n_ctx);
+    printf("%s: n_embd:  %d\n", __func__, params->n_embd);
+    printf("%s: n_mult:  %d\n", __func__, params->n_mult);
+    printf("%s: n_head:  %d\n", __func__, params->n_head);
+    printf("%s: n_ff:    %d\n", __func__, get_n_ff(params));
+    printf("%s: n_layer: %d\n", __func__, params->n_layer);
+    printf("%s: n_rot:   %d\n", __func__, params->n_rot);
+}
+
 struct my_llama_layer {
     // normalization
     struct ggml_tensor * attention_norm;
@@ -989,18 +1005,17 @@ void print_tokens_batch(struct llama_context* ctx, struct ggml_tensor * tokens)
     }
 }
 
-void get_example_targets(const llama_token * train_data, size_t n_train_data, int example_id, struct ggml_tensor * tokens_input, struct ggml_tensor * targets) {
+void get_example_targets(const int * train_samples, size_t n_train_samples, const llama_token * train_data, size_t n_train_data, int example_id, struct ggml_tensor * tokens_input, struct ggml_tensor * targets) {
     int n_tokens = tokens_input->ne[0];
     int n_vocab  = targets->ne[0];
-    int n_examples = (n_train_data / (size_t) n_tokens);
-    int begin = (example_id % n_examples) * n_tokens;
-    GGML_ASSERT(begin+n_tokens-1 < n_train_data);
+    int sample = train_samples[example_id % n_train_samples];
+    GGML_ASSERT(sample+n_tokens-1 < n_train_data);
 
-    ggml_set_f32(targets, -1.0f);
+    ggml_set_f32(targets, -1.0f/n_vocab);
     ggml_set_i32_1d(tokens_input, 0, llama_token_bos());
     for (int i=1; i<n_tokens; ++i) {
-        int token = clamp(train_data[begin+i-1], 0, n_vocab-1);
+        int token = clamp(train_data[sample+i-1], 0, n_vocab-1);
         ggml_set_f32_1d(targets, (i-1)*n_vocab + token, +1.0f);
         ggml_set_i32_1d(tokens_input, i, token);
     }
 }
@@ -1008,4 +1023,4 @@ void get_example_targets(const llama_token * train_data, size_t n_train_data, in
-void get_example_targets_batch(struct ggml_context * ctx, const llama_token * train_data, size_t n_train_data, int example_id, struct ggml_tensor * tokens_input, struct ggml_tensor * targets) {
+void get_example_targets_batch(struct ggml_context * ctx, const int * train_samples, size_t n_train_samples, const llama_token * train_data, size_t n_train_data, int example_id, struct ggml_tensor * tokens_input, struct ggml_tensor * targets) {
     GGML_ASSERT(tokens_input->n_dims == 2);
     GGML_ASSERT(  targets->n_dims == 3);
     int n_tokens = tokens_input->ne[0];
@@ -1028,7 +1043,7 @@ void get_example_targets_batch(struct ggml_context * ctx, const llama_token * tr
                                                 targets->nb[1],
                                                 k*targets->nb[2]);
 
-        get_example_targets(train_data, n_train_data,
+        get_example_targets(train_samples, n_train_samples, train_data, n_train_data,
             example_id*n_batch + k, tokens_input_k, targets_k);
     }
 }
@@ -1171,10 +1186,11 @@ int tokenize_file(struct llama_context * lctx, const char * filename, std::vecto
     struct llama_file f(filename, "rb");
 
     std::vector<char> buf;
-    buf.resize(f.size);
+    buf.resize(f.size+1);
     f.read_raw(buf.data(), f.size);
-
+    buf[f.size] = '\0';
+
     out.resize(buf.size());
 
     int n_tokens = llama_tokenize(lctx, buf.data(), out.data(), buf.size(), false);
@@ -1186,6 +1202,143 @@ int tokenize_file(struct llama_context * lctx, const char * filename, std::vecto
     return n_tokens;
 }
 
+void shuffle_ints(int * begin, int * end) {
+    if (end <= begin) return;
+    int max=begin[0];
+    for (int i=1; i<end-begin; ++i) {
+        if (begin[i] > max) {
+            max = begin[i];
+        }
+    }
+    std::vector<float> vals;
+    vals.resize(max+1);
+    for (int i=0; i<max+1; ++i) {
+        vals[i] = frand();
+    }
+    std::sort(begin, end, [&vals](int a, int b) {
+        return vals.at(a) < vals.at(b);
+    });
+}
+
+struct my_llama_sampler_params {
+    float temp            = 0.0f;  // <= 0.0 disabled
+    int   top_k           = 20;    // <= 0 to use vocab size
+    float top_p           = 0.95f; // 1.0 = disabled
+    float tfs_z           = 1.00f; // 1.0 = disabled
+    float typical_p       = 1.00f; // 1.0 = disabled
+    int   repeat_last_n   = 64;    // last n tokens to penalize (0 = disable penalty, -1 = context size)
+    float repeat_penalty  = 1.0f;  // 1.0 = disabled
+    float alpha_presence  = 0.0f;  // 0.0 = disabled
+    float alpha_frequency = 0.0f;  // 0.0 = disabled
+    int   mirostat        = 0;     // 0 = disabled, 1 = mirostat, 2 = mirostat 2.0
+    float mirostat_tau    = 5.00f; // target entropy
+    float mirostat_eta    = 0.10f; // learning rate
+    bool  penalize_nl     = true;  // consider newlines as repeatable
+};
+
+struct my_llama_sampler {
+    struct llama_context * ctx = NULL;
+    struct my_llama_sampler_params params;
+
+    int n_vocab = 0;
+    int n_ctx = 0;
+
+    float mirostat_mu;
+
+    std::vector<llama_token_data> candidates;
+    llama_token_data_array candidates_p;
+
+};
+
+void init_sampler(struct my_llama_sampler * sampler, struct llama_context * ctx) {
+    sampler->ctx = ctx;
+    sampler->n_vocab = llama_n_vocab(sampler->ctx);
+    sampler->n_ctx   = llama_n_ctx(sampler->ctx);
+    sampler->mirostat_mu = 2.0f * sampler->params.mirostat_tau;
+}
+
+llama_token sample(struct my_llama_sampler * sampler, float * logits, const llama_token * last_tokens, int n_last_tokens) {
+    GGML_ASSERT(sampler->ctx != NULL);
+
+    struct llama_context * ctx = sampler->ctx;
+
+    sampler->candidates.resize(sampler->n_vocab);
+    for (llama_token token_id = 0; token_id < sampler->n_vocab; ++token_id) {
+        sampler->candidates[token_id].id = token_id;
+        sampler->candidates[token_id].logit = logits[token_id];
+        sampler->candidates[token_id].p = 0.0;
+    }
+
+    llama_token_data_array * candidates_p = & sampler->candidates_p;
+
+    candidates_p->data   = sampler->candidates.data();
+    candidates_p->size   = sampler->candidates.size();
+    candidates_p->sorted = false;
+
+    const auto params = sampler->params;
+
+    // Apply penalties
+    const float nl_logit = logits[llama_token_nl()];
+
+    const int n_last = std::min(std::min(n_last_tokens, params.repeat_last_n), sampler->n_ctx);
+
+    llama_sample_repetition_penalty(
+        ctx,
+        candidates_p,
+        last_tokens + n_last_tokens - n_last,
+        n_last,
+        params.repeat_penalty);
+    llama_sample_frequency_and_presence_penalties(
+        ctx,
+        candidates_p,
+        last_tokens + n_last_tokens - n_last,
+        n_last,
+        params.alpha_frequency,
+        params.alpha_presence);
+
+    if (!params.penalize_nl) {
+        logits[llama_token_nl()] = nl_logit;
+    }
+
+    llama_token token = 0;
+    if (params.temp <= 0) {
+        // Greedy sampling
+        token = llama_sample_token_greedy(ctx, candidates_p);
+    } else {
+        if (params.mirostat == 1) {
+            int mirostat_m = 100;
+            llama_sample_temperature(ctx, candidates_p, params.temp);
+            token = llama_sample_token_mirostat(ctx, candidates_p, params.mirostat_tau, params.mirostat_eta, mirostat_m, &sampler->mirostat_mu);
+        } else if (params.mirostat == 2) {
+            llama_sample_temperature(ctx, candidates_p, params.temp);
+            token = llama_sample_token_mirostat_v2(ctx, candidates_p, params.mirostat_tau, params.mirostat_eta, &sampler->mirostat_mu);
+        } else {
+            // Temperature sampling
+            llama_sample_top_k        (ctx, candidates_p, params.top_k, 1);
+            llama_sample_tail_free    (ctx, candidates_p, params.tfs_z, 1);
+            llama_sample_typical      (ctx, candidates_p, params.typical_p, 1);
+
+            llama_sample_top_p        (ctx, candidates_p, params.top_p, 1);
+            llama_sample_temperature  (ctx, candidates_p, params.temp);
+            token = llama_sample_token(ctx, candidates_p);
+        }
+    }
+    return token;
+}
+
+void set_logits_masked(struct ggml_tensor * logits, std::vector<bool>& mask, float value) {
+    GGML_ASSERT(logits->ne[0] == mask.size());
+    for (int i2 = 0; i2 < logits->ne[2]; ++i2) {
+        for (int i1 = 0; i1 < logits->ne[1]; ++i1) {
+            for (int i0 = 0; i0 < logits->ne[0]; ++i0) {
+                if (!mask[i0]) continue;
+                float * ptr = (float *) ((char *) logits->data + i2*logits->nb[2] + i1*logits->nb[1] + i0*logits->nb[0]);
+                *ptr = value;
+            }
+        }
+    }
+}
+
 int main(int argc, char ** argv) {
     const char * default_model = "ggml-vic7b-uncensored-q4_0.bin";
     const char * default_train = "shakespeare.txt";
@@ -1220,6 +1373,17 @@ int main(int argc, char ** argv) {
     model.hparams.n_layer = 1;
     model.hparams.n_rot   = std::min(16u, model.hparams.n_embd / model.hparams.n_head);
 
+    print_params(&model.hparams);
+
+    std::vector<bool> token_occurs;
+    std::vector<bool> token_notavail;
+    token_occurs.resize(model.hparams.n_vocab, false);
+    token_notavail.resize(model.hparams.n_vocab, true);
+    for (int i=0; i<train_tokens.size(); ++i) {
+        token_occurs[train_tokens[i]] = true;
+        token_notavail[train_tokens[i]] = false;
+    }
+
     struct ggml_init_params lcparams;
     lcparams.mem_size   = 1024ll*1024ll*1024ll*2ll;
     lcparams.mem_buffer = NULL;
@@ -1246,5 +1410,12 @@ int main(int argc, char ** argv) {
     int n_examples = 256;
     int n_tokens = model.hparams.n_ctx;
     int n_vocab  = model.hparams.n_vocab;
 
+    std::vector<int> train_samples;
+    for (int i=0; i<train_tokens.size() - n_tokens; ++i) {
+        train_samples.push_back(i);
+    }
+
+    struct my_llama_sampler sampler;
+
     for (int ex=0; ex<n_examples; ++ex) {
@@ -1280,5 +1451,12 @@ int main(int argc, char ** argv) {
         struct ggml_tensor * tokens_input = ggml_new_tensor_2d(ctx0, GGML_TYPE_I32, n_tokens, n_batch);
         struct ggml_tensor * targets      = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_vocab, n_tokens, n_batch);
 
-        get_example_targets_batch(ctx0, train_tokens.data(), train_tokens.size(), ex, tokens_input, targets);
+        if (ex*n_batch >= train_samples.size()) {
+            shuffle_ints(train_samples.data(), train_samples.data() + train_samples.size());
+            for (int i=0; i<train_samples.size(); ++i) {
+                GGML_ASSERT(train_samples[i]+n_tokens-1 < (int) train_tokens.size());
+            }
+        }
+
+        get_example_targets_batch(ctx0, train_samples.data(), train_samples.size(), train_tokens.data(), train_tokens.size(), ex, tokens_input, targets);
 
@@ -1309,10 +1495,21 @@ int main(int argc, char ** argv) {
-            sample_softmax_batch(ctx0, logits, after_opt_probs, after_opt_best_samples);
+            for (int i=0; i<n_batch; ++i) {
+                for (int k=0; k<n_tokens; ++k) {
+                    int token = sample(&sampler,
+                        (float *) ((char *) logits->data + i*logits->nb[2] + k*logits->nb[1]),
+                        (llama_token *) ((char *) tokens_input->data + i*tokens_input->nb[1]),
+                        k);
+                    * ((int32_t *) ((char *) after_opt_best_samples->data + i*after_opt_best_samples->nb[1] + k*after_opt_best_samples->nb[0])) = token;
+                }
+            }
+
+            // sample_softmax_batch(ctx0, logits, after_opt_probs, after_opt_best_samples);
             // printf("probabilities after optimization:\n");
             // print_matrix(after_opt_probs);
             printf("Example:\n---\n");
             print_tokens_batch(lctx, tokens_input);
             printf("\n---\n");
-            printf("best samples after optimization:\n---\n");
+            // printf("best samples after optimization:\n---\n");
+            printf("samples after optimization:\n---\n");
             print_tokens_batch(lctx, after_opt_best_samples);
             printf("\n---\n");
         }
@@ -1320,13 +1517,15 @@ int main(int argc, char ** argv) {
     {
         int n_gen = 128;
         int sample_ctx = n_tokens - n_tokens/8;
+
+        init_sampler(&sampler, lctx);
 
         printf("Generating %d tokens.\n", n_gen);
 
         struct ggml_tensor * tokens_input = ggml_new_tensor_1d(model.ctx, GGML_TYPE_I32, n_tokens);
        struct ggml_tensor * targets      = ggml_new_tensor_2d(model.ctx, GGML_TYPE_F32, n_vocab, n_tokens);
 
-        get_example_targets(train_tokens.data(), train_tokens.size(), 137, tokens_input, targets);
+        get_example_targets(train_samples.data(), train_samples.size(), train_tokens.data(), train_tokens.size(), 137, tokens_input, targets);
         for (int i=sample_ctx; i<n_tokens; ++i) {
             ggml_set_i32_1d(tokens_input, i, n_vocab/2);
         }
@@ -1343,6 +1542,10 @@ int main(int argc, char ** argv) {
 
-            sample_softmax(logits, probs, best_samples);
-            int token = ggml_get_i32_1d(best_samples, sample_ctx-1);
+            int token = sample(&sampler,
+                (float *) ((char *) logits->data + (sample_ctx-1)*logits->nb[1]),
+                (llama_token *) tokens_input->data,
+                sample_ctx-1);
+            // sample_softmax(logits, probs, best_samples);
+            //int token = ggml_get_i32_1d(best_samples, sample_ctx-1);
 
             // print_row(probs, sample_at);
             print_token(lctx, token);
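
Note: a minimal usage sketch of the sampling API added by this patch (hypothetical driver code, not part of the diff; it assumes `lctx` is an initialized llama_context and `logits` points at n_vocab floats for the position being sampled):

    #include <vector>
    // Draw one token with the patch's sampler; the defaults in
    // my_llama_sampler_params leave temp at 0, i.e. greedy sampling.
    llama_token draw_one_token(struct llama_context * lctx, float * logits,
                               std::vector<llama_token> & last_tokens) {
        struct my_llama_sampler sampler;
        init_sampler(&sampler, lctx);    // caches n_vocab/n_ctx, seeds mirostat_mu
        llama_token tok = sample(&sampler, logits,
                                 last_tokens.data(), (int) last_tokens.size());
        last_tokens.push_back(tok);      // grow the repetition-penalty window
        return tok;
    }

In practice the sampler would be initialized once and reused across positions, as the generation loop at the end of the diff does.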
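Design note on shuffle_ints: it assigns one random key per value (vals[v] = frand()) and sorts the range by key. That is only a uniform shuffle when all values are distinct; duplicate values receive identical keys and end up adjacent. The train_samples indices built in main are distinct, so the behavior is fine there. For reference, a sketch of the conventional alternative using only the standard library (the generator and its seed are arbitrary choices, not from the patch):

    #include <algorithm>
    #include <random>
    #include <vector>
    // Unbiased Fisher-Yates shuffle of the same range.
    void shuffle_ints_alt(std::vector<int> & v) {
        static std::mt19937 rng(42);   // hypothetical fixed seed for reproducibility
        std::shuffle(v.begin(), v.end(), rng);
    }

Note also that frand() is driven by rand(), so the shuffle order is deterministic unless srand() is called with a varying seed.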