diff --git a/examples/train-text-from-scratch/train-text-from-scratch.cpp b/examples/train-text-from-scratch/train-text-from-scratch.cpp index aafdbff74..c9bba95c7 100644 --- a/examples/train-text-from-scratch/train-text-from-scratch.cpp +++ b/examples/train-text-from-scratch/train-text-from-scratch.cpp @@ -62,17 +62,6 @@ float frand_uniform(struct random_uniform_distribution * rnd) { return rnd->rd(rnd->gen); } -void ggml_graph_compute_helper(std::vector & buf, ggml_cgraph * graph, int n_threads) { - struct ggml_cplan plan = ggml_graph_plan(graph, n_threads); - - if (plan.work_size > 0) { - buf.resize(plan.work_size); - plan.work_data = buf.data(); - } - - ggml_graph_compute(graph, &plan); -} - struct ggml_tensor * randomize_tensor_normal(struct ggml_tensor * tensor, struct random_normal_distribution * rnd) { float scale = 1.0f; // xavier switch (tensor->n_dims) { @@ -205,17 +194,6 @@ struct my_llama_layer { struct ggml_tensor * w3; }; -struct my_llama_kv_cache { - struct ggml_context * ctx = NULL; - - struct ggml_tensor * k; - struct ggml_tensor * v; - - // llama_ctx_buffer buf; - - int n; // number of tokens currently in the cache -}; - struct my_llama_model { struct ggml_context * ctx = NULL; @@ -446,268 +424,6 @@ void randomize_model(struct my_llama_model * model, int seed, float mean, float } } -bool init_kv_cache(struct my_llama_kv_cache* cache, struct my_llama_model * model, int n_batch) { - const auto & hparams = model->hparams; - - const uint32_t n_ctx = hparams.n_ctx; - const uint32_t n_embd = hparams.n_embd; - const uint32_t n_layer = hparams.n_layer; - - const int64_t n_mem = n_layer*n_ctx*n_batch; - const int64_t n_elements = n_embd*n_mem; - - // cache.buf.resize(2u*n_elements*ggml_type_size(wtype) + 2u*MB); - - // struct ggml_init_params params; - // params.mem_size = cache.buf.size; - // params.mem_buffer = cache.buf.addr; - // params.no_alloc = false; - if (!cache->ctx) { - struct ggml_init_params params; - params.mem_size = 2u*n_elements*ggml_type_size(GGML_TYPE_F32) + 2u*1024*1024; - params.mem_buffer = NULL; - params.no_alloc = false; - - cache->ctx = ggml_init(params); - - if (!cache->ctx) { - fprintf(stderr, "%s: failed to allocate memory for kv cache\n", __func__); - return false; - } - } - - cache->k = ggml_new_tensor_1d(cache->ctx, GGML_TYPE_F32, n_elements); - cache->v = ggml_new_tensor_1d(cache->ctx, GGML_TYPE_F32, n_elements); - - return true; -} - -struct ggml_tensor * forward( - struct my_llama_model * model, - struct my_llama_kv_cache * cache, - struct ggml_context * ctx0, - struct ggml_cgraph * gf, - struct ggml_tensor * tokens_input, - const int n_tokens, - const int n_past) { - - const int N = n_tokens; - - struct my_llama_kv_cache& kv_self = *cache; - const auto & hparams = model->hparams; - const int n_ctx = hparams.n_ctx; - const int n_embd = hparams.n_embd; - const int n_layer = hparams.n_layer; - const int n_head = hparams.n_head; - const int n_rot = hparams.n_rot; - const float rms_norm_eps = hparams.f_norm_rms_eps; - - struct ggml_tensor * tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N); - memcpy(tokens->data, tokens_input->data, N*ggml_element_size(tokens)); - - struct ggml_tensor * kc = kv_self.k; - struct ggml_tensor * vc = kv_self.v; - - // inpL shape [n_embd,N,1,1] - struct ggml_tensor * inpL = ggml_get_rows(ctx0, model->tok_embeddings, tokens); - for (int il = 0; il < n_layer; ++il) { - struct ggml_tensor * inpSA = inpL; - - struct ggml_tensor * cur; - - // lctx.use_buf(ctx0, 0); - - // norm - { - // cur shape [n_embd,N,1,1] - cur = ggml_rms_norm(ctx0, inpL, rms_norm_eps); - - // cur = attention_norm*cur - cur = ggml_mul(ctx0, - ggml_repeat(ctx0, model->layers[il].attention_norm, cur), - cur); - } - - // self-attention - { - // compute Q and K and RoPE them - // wq shape [n_embd, n_embd, 1, 1] - // wk shape [n_embd, n_embd, 1, 1] - // Qcur shape [n_embd/n_head, n_head, N, 1] - // Kcur shape [n_embd/n_head, n_head, N, 1] - struct ggml_tensor * Qcur = ggml_rope_inplace(ctx0, ggml_reshape_3d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wq, cur), n_embd/n_head, n_head, N), n_past, n_rot, 0, n_ctx); - struct ggml_tensor * Kcur = ggml_rope_inplace(ctx0, ggml_reshape_3d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wk, cur), n_embd/n_head, n_head, N), n_past, n_rot, 0, n_ctx); - - // store key and value to memory - { - // compute the transposed [N, n_embd] V matrix - // wv shape [n_embd, n_embd, 1, 1] - // Vcur shape [n_embd, N, 1, 1] - struct ggml_tensor * Vcur = ggml_cont(ctx0, ggml_transpose(ctx0, ggml_reshape_2d(ctx0, ggml_mul_mat(ctx0, model->layers[il].wv, cur), n_embd, N))); - - // kv_self.k shape [n_embd * n_ctx * n_layer, 1] - // kv_self.v shape [n_embd * n_ctx * n_layer, 1] - // k shape [n_embd * N, 1] == kv_self.k[:,n_past:n_past+N,il,0] - // v shape [N, n_embd, 1, 1] == kv_self.v[:,n_past:n_past+N,il,0] - - /* { - struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, N*n_embd, (ggml_element_size(kv_self.k)*n_embd)*(il*n_ctx + n_past)); - struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, N, n_embd, - ( n_ctx)*ggml_element_size(kv_self.v), - (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd + n_past*ggml_element_size(kv_self.v)); - - // important: storing RoPE-ed version of K in the KV cache! - ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k)); - ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, v)); - } //*/ - - kc = ggml_set_1d_inplace(ctx0, kc, ggml_reshape_1d(ctx0, Kcur, n_embd*N), (ggml_element_size(kv_self.k)*n_embd)*(il*n_ctx + n_past)); - vc = ggml_set_2d_inplace(ctx0, vc, Vcur, ( n_ctx)*ggml_element_size(kv_self.v), - (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd + n_past*ggml_element_size(kv_self.v)); - } - - // Qcur shape [n_embd/n_head, n_head, N, 1] - // Q shape [n_embd/n_head, N, n_head, 1] - struct ggml_tensor * Q = - ggml_permute(ctx0, - Qcur, - 0, 2, 1, 3); - - // kv_self.k shape [n_embd * n_ctx * n_layer, 1] - // K shape [n_embd/n_head, n_past + N, n_head, 1] - struct ggml_tensor * K = - ggml_permute(ctx0, - ggml_reshape_3d(ctx0, - ggml_view_1d(ctx0, kc, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(kc)*n_embd), - n_embd/n_head, n_head, n_past + N), - 0, 2, 1, 3); - - // K * Q - // KQ shape [n_past + N, N, n_head, 1] - struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q); - - // KQ_scaled = KQ / sqrt(n_embd/n_head) - // KQ_scaled shape [n_past + N, N, n_head, 1] - struct ggml_tensor * KQ_scaled = - ggml_scale(ctx0, - KQ, - ggml_new_f32(ctx0, 1.0f/sqrtf(float(n_embd)/n_head))); - - // KQ_masked = mask_past(KQ_scaled) - // KQ_masked shape [n_past + N, N, n_head, 1] - struct ggml_tensor * KQ_masked = ggml_diag_mask_inf(ctx0, KQ_scaled, n_past); - - // KQ = soft_max(KQ_masked) - // KQ_soft_max shape [n_past + N, N, n_head, 1] - struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ_masked); - - // split cached V into n_head heads - //// V shape [n_past + N, n_embd/n_head, n_head, 1] - // V shape [n_past + N, n_embd/n_head, n_head, 1] == kv_self.v[:,:(n_past+N),il,1] - struct ggml_tensor * V = - ggml_view_3d(ctx0, vc, - n_past + N, n_embd/n_head, n_head, - n_ctx*ggml_element_size(vc), - n_ctx*ggml_element_size(vc)*n_embd/n_head, - il*n_ctx*ggml_element_size(vc)*n_embd); - - // KQV shape [n_embd/n_head, N, n_head, 1] - struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max); - - // KQV_merged = KQV.permute(0, 2, 1, 3) - // KQV_merged shape [n_embd/n_head, n_head, N, 1] - struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3); - // KQV_merged shape - - // cur = KQV_merged.contiguous().view(n_embd, N) - // cur shape [n_embd,N,1,1] - cur = ggml_reshape_2d(ctx0, ggml_cont(ctx0, KQV_merged), n_embd, N); - // cur = ggml_cpy(ctx0, - // KQV_merged, - // ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N)); - - // projection (no bias) - // cur shape [n_embd,N,1,1] - cur = ggml_mul_mat(ctx0, - model->layers[il].wo, - cur); - } - - // lctx.use_buf(ctx0, 1); - - // inpFF shape [n_embd,N,1,1] - struct ggml_tensor * inpFF = ggml_add(ctx0, cur, inpSA); - - // feed-forward network - { - // norm - { - // cur shape [n_embd,N,1,1] - cur = ggml_rms_norm(ctx0, inpFF, rms_norm_eps); - - // cur = ffn_norm*cur - // cur shape [n_embd,N,1,1] - cur = ggml_mul(ctx0, - ggml_repeat(ctx0, model->layers[il].ffn_norm, cur), - cur); - } - - // tmp shape [n_ff,N,1,1] - struct ggml_tensor * tmp = ggml_mul_mat(ctx0, - model->layers[il].w3, - cur); - - // cur shape [n_ff,N,1,1] - cur = ggml_mul_mat(ctx0, - model->layers[il].w1, - cur); - - // SILU activation - // cur shape [n_ff,N,1,1] - cur = ggml_silu(ctx0, cur); - - // cur shape [n_ff,N,1,1] - cur = ggml_mul(ctx0, cur, tmp); - - // cur shape [n_embd,N,1,1] - cur = ggml_mul_mat(ctx0, - model->layers[il].w2, - cur); - } - - // cur shape [n_embd,N,1,1] - cur = ggml_add(ctx0, cur, inpFF); - - // input for next layer - // inpL shape [n_embd,N,1,1] - inpL = cur; - } - - // norm - { - - // inpL shape [n_embd,N,1,1] - inpL = ggml_rms_norm(ctx0, inpL, rms_norm_eps); - - // inpL = norm*inpL - // inpL shape [n_embd,N,1,1] - inpL = ggml_mul(ctx0, - ggml_repeat(ctx0, model->norm, inpL), - inpL); - - //embeddings = inpL; - } - - // lm_head - // inpL shape [n_vocab,N,1,1] - inpL = ggml_mul_mat(ctx0, model->output, inpL); - - // run the computation - ggml_build_forward_expand(gf, inpL); - - return inpL; -} - void assert_shape_1d(struct ggml_tensor * tensor, int64_t ne0) { GGML_ASSERT(tensor->n_dims == 1); GGML_ASSERT(tensor->ne[0] == ne0); @@ -1157,42 +873,6 @@ void print_matrix(struct ggml_tensor * probs) { } } - -void print_token(struct llama_context * ctx, llama_token token) { - printf("%s", llama_token_to_piece(ctx, token).c_str()); -} - -void print_tokens(struct llama_context* ctx, struct ggml_tensor * tokens) { - for (int i=0; ine[0]; ++i) { - int token = ggml_get_i32_1d(tokens, i); - print_token(ctx, token); - } -} - -void print_tokens_batch(struct llama_context* ctx, struct ggml_tensor * tokens) { - for (int i1=0; i1ne[1]; ++i1) { - //int num_newline = 0; - for (int i0=0; i0ne[0]; ++i0) { - int token = get_i32_2d(tokens, i0, i1); - print_token(ctx, token); - // bool isnl = (token == llama_token_nl()); - // if (isnl) { - // ++num_newline; - // } - // if (isnl) { - // if (num_newline < 2) { - // print_token(ctx, token); - // } else { - // printf("\\n"); - // } - // } else { - // print_token(ctx, token); - // } - } - printf("\n--\n"); - } -} - void get_example_targets(struct llama_context * lctx, const int * train_samples, size_t n_train_samples, const llama_token * train_data, size_t n_train_data, int example_id, struct ggml_tensor * tokens_input, struct ggml_tensor * target_logits, struct ggml_tensor * target_probs) { int n_tokens = tokens_input->ne[0]; int n_vocab = target_logits->ne[0]; @@ -1249,26 +929,6 @@ void get_example_targets_batch(struct llama_context * lctx, const int * train_sa } -void lshift_examples(struct ggml_tensor * tokens_input, struct ggml_tensor * target_logits, struct ggml_tensor * target_probs, int n_shift) { - int n_tokens = tokens_input->ne[0]; - int n_vocab = target_logits->ne[0]; - for (int i=0; i candidates; - llama_token_data_array candidates_p; - -}; - -void init_sampler(struct my_llama_sampler * sampler, struct llama_context * ctx) { - sampler->ctx = ctx; - sampler->n_vocab = llama_n_vocab(sampler->ctx); - sampler->n_ctx = llama_n_ctx(sampler->ctx); - sampler->mirostat_mu = 2.0f * sampler->params.mirostat_tau; -} - -llama_token sample(struct my_llama_sampler * sampler, float * logits, const llama_token * last_tokens, int n_last_tokens) { - GGML_ASSERT(sampler->ctx != NULL); - - struct llama_context * ctx = sampler->ctx; - - sampler->candidates.resize(sampler->n_vocab); - for (llama_token token_id = 0; token_id < sampler->n_vocab; ++token_id) { - sampler->candidates[token_id].id = token_id; - sampler->candidates[token_id].logit = logits[token_id]; - sampler->candidates[token_id].p = 0.0; - } - - llama_token_data_array * candidates_p = & sampler->candidates_p; - - candidates_p->data = sampler->candidates.data(); - candidates_p->size = sampler->candidates.size(); - candidates_p->sorted = false; - - const auto params = sampler->params; - - // Apply penalties - const float nl_logit = logits[llama_token_nl(ctx)]; - - const int n_last = std::min(std::min(n_last_tokens, params.repeat_last_n), sampler->n_ctx); - - llama_sample_repetition_penalty( - ctx, - candidates_p, - last_tokens + n_last_tokens - n_last, - n_last, - params.repeat_penalty); - llama_sample_frequency_and_presence_penalties( - ctx, - candidates_p, - last_tokens + n_last_tokens - n_last, - n_last, - params.frequency_penalty, - params.presence_penalty); - - if (!params.penalize_nl) { - logits[llama_token_nl(ctx)] = nl_logit; - } - - llama_token token = 0; - if (params.temp <= 0) { - // Greedy sampling - token = llama_sample_token_greedy(ctx, candidates_p); - } else { - if (params.mirostat == 1) { - int mirostat_m = 100; - llama_sample_temperature(ctx, candidates_p, params.temp); - token = llama_sample_token_mirostat(ctx, candidates_p, params.mirostat_tau, params.mirostat_eta, mirostat_m, &sampler->mirostat_mu); - } else if (params.mirostat == 2) { - llama_sample_temperature(ctx, candidates_p, params.temp); - token = llama_sample_token_mirostat_v2(ctx, candidates_p, params.mirostat_tau, params.mirostat_eta, &sampler->mirostat_mu); - } else { - // Temperature sampling - llama_sample_top_k (ctx, candidates_p, params.top_k, 1); - llama_sample_tail_free (ctx, candidates_p, params.tfs_z, 1); - llama_sample_typical (ctx, candidates_p, params.typical_p, 1); - - llama_sample_top_p (ctx, candidates_p, params.top_p, 1); - llama_sample_temperature (ctx, candidates_p, params.temp); - token = llama_sample_token(ctx, candidates_p); - } - } - return token; -} - -void set_logits_masked(struct ggml_tensor * logits, std::vector& mask, float value) { - GGML_ASSERT(logits->ne[0] == (int64_t) mask.size()); - for (int i2 = 0; i2 < logits->ne[2]; ++i2) { - for (int i1 = 0; i1 < logits->ne[1]; ++i1) { - for (int i0 = 0; i0 < logits->ne[0]; ++i0) { - if (!mask[i0]) continue; - float * ptr = (float *) ((char *) logits->data + i2*logits->nb[2] + i1*logits->nb[1] + i0*logits->nb[0]); - *ptr = value; - } - } - } -} - #define GGUF_GET_KEY(ctx, dst, func, type, req, key) \ { \ const std::string skey(key); \ @@ -1976,14 +1517,12 @@ struct train_params { int n_threads; int n_batch; int n_examples; - int n_predict; float f_norm_rms_eps; float rope_freq_base; float rope_freq_scale; int print_info_interval; - int print_details_interval; bool samples_start_after_nl; bool use_adam; @@ -2037,14 +1576,12 @@ struct train_params get_default_train_params() { params.n_threads = 6; params.n_batch = 8; params.n_examples = 1; - params.n_predict = 1024; params.f_norm_rms_eps = 1e-5; params.rope_freq_base = 10000.0f; params.rope_freq_scale = 1.0f; params.print_info_interval = 1; - params.print_details_interval = 2; params.samples_start_after_nl = false; params.use_adam = true; @@ -2102,9 +1639,7 @@ void train_print_usage(int /*argc*/, char ** argv, const struct train_params * p fprintf(stderr, " -t N, --threads N Number of threads (default %d)\n", params->n_threads); fprintf(stderr, " -b N, --batch N Parallel batch size (default %d)\n", params->n_batch); fprintf(stderr, " -n N, --examples N Number of examples to train (default %d)\n", params->n_examples); - fprintf(stderr, " --predict N Number of tokens to generate after training (default %d)\n", params->n_predict); fprintf(stderr, " --print-info-interval N Print infos during training each N examples (default %d)\n", params->print_info_interval); - fprintf(stderr, " --print-details-interval N Print details during training each N examples (default %d)\n", params->print_details_interval); fprintf(stderr, " --samples-after-nl Training samples start after newlines. (default %s)\n", params->samples_start_after_nl ? "on" : "off"); fprintf(stderr, " --use-lbfgs Use LBFGS optimizer instead of default Adam\n"); fprintf(stderr, " --use-adam Use Adam optimizer (default)\n"); @@ -2253,24 +1788,12 @@ bool train_params_parse(int argc, char ** argv, struct train_params * params) { break; } params->n_examples = std::stoi(argv[i]); - } else if (arg == "--predict") { - if (++i >= argc) { - invalid_param = true; - break; - } - params->n_predict = std::stoi(argv[i]); } else if (arg == "--print-info-interval") { if (++i >= argc) { invalid_param = true; break; } params->print_info_interval = std::stoi(argv[i]); - } else if (arg == "--print-details-interval") { - if (++i >= argc) { - invalid_param = true; - break; - } - params->print_details_interval = std::stoi(argv[i]); } else if (arg == "--samples-after-nl") { params->samples_start_after_nl = true; } else if (arg == "--use-lbfgs") { @@ -2547,19 +2070,12 @@ int main(int argc, char ** argv) { } printf("%s: number of unique tokens: %d\n", __func__, n_unique_tokens); - struct my_llama_kv_cache kv_self; - - struct ggml_init_params lcparams; lcparams.mem_size = 1024ll*1024ll*1024ll*((size_t) params.mem_model_gb); lcparams.mem_buffer = NULL; lcparams.no_alloc = false; model.ctx = ggml_init(lcparams); - kv_self.ctx = model.ctx; - - my_llama_sampler sampler; - int n_tokens = model.hparams.n_ctx; int n_vocab = model.hparams.n_vocab; @@ -2614,11 +2130,7 @@ int main(int argc, char ** argv) { randomize_model(&model, params.seed, 0.0f, 1.0f, -1.0f, +1.0f); } - init_kv_cache(&kv_self, &model, 1); - // init_kv_cache(&kv_self, &model, n_batch); - init_sampler(&sampler, lctx); - - printf("used_mem model+cache: %zu bytes\n", ggml_used_mem(model.ctx)); + printf("used_mem model: %zu bytes\n", ggml_used_mem(model.ctx)); // ggml_print_tensor_objects(model.ctx); // TODO: use std::vector intead of "new" @@ -2647,8 +2159,6 @@ int main(int argc, char ** argv) { GGML_ASSERT(train_samples[i]+n_tokens-1 < (int) train_tokens.size()); } - std::vector work_buffer; - printf("%s: begin training\n", __func__); struct opt_callback_data opt_cb_data; @@ -2755,31 +2265,6 @@ int main(int argc, char ** argv) { printf("used_mem_after_opt: %zu bytes\n", used_mem_after_opt); } - if (params.print_details_interval > 0 && ex % params.print_details_interval == 0) { - // set_logits_masked(logits, token_notavail, -1e9); - for (int i=0; idata + i*logits->nb[2] + k*logits->nb[1]), - (llama_token *) ((char *) tokens_input->data + i*tokens_input->nb[1]), - k); - * ((int32_t *) ((char *) after_opt_best_samples->data + i*after_opt_best_samples->nb[1] + k*after_opt_best_samples->nb[0])) = token; - } - } - - // printf("probabilities after optimization:\n"); - // print_matrix(after_opt_probs); - printf("Example:\n---\n"); - print_tokens_batch(lctx, tokens_input); - printf("\n---\n"); - - // printf("best samples after optimization:\n---\n"); - printf("samples after optimization:\n---\n"); - print_tokens_batch(lctx, after_opt_best_samples); - printf("\n---\n"); - } - ggml_free(ctx0); } @@ -2796,78 +2281,6 @@ int main(int argc, char ** argv) { save_llama_model_file(params.fn_model_out, params.fn_vocab_model, &model); } - { - int n_gen = params.n_predict; - int sample_ctx = n_tokens - n_tokens/8; - - // use defaults from common.h - sampler.params.top_k = 40; - sampler.params.top_p = 0.95f; - sampler.params.tfs_z = 1.00f; - sampler.params.typical_p = 1.00f; - sampler.params.temp = 0.8f; - sampler.params.repeat_penalty = 1.1f; - sampler.params.repeat_last_n = 64; - sampler.params.frequency_penalty = 0.0f; - sampler.params.presence_penalty = 0.0f; - sampler.params.mirostat = 0; - sampler.params.mirostat_tau = 5.00f; - sampler.params.mirostat_eta = 0.10f; - init_sampler(&sampler, lctx); - - printf("[Prediction context]\n"); - - struct ggml_tensor * tokens_input = ggml_new_tensor_1d(model.ctx, GGML_TYPE_I32, n_tokens); - struct ggml_tensor * target_logits = ggml_new_tensor_2d(model.ctx, GGML_TYPE_F32, n_vocab, n_tokens); - struct ggml_tensor * target_probs = ggml_new_tensor_2d(model.ctx, GGML_TYPE_F32, n_vocab, n_tokens); - - get_example_targets(lctx, train_samples.data(), train_samples.size(), train_tokens.data(), train_tokens.size(), rand()%train_samples.size(), tokens_input, target_logits, target_probs); - for (int i=sample_ctx; idata + (sample_ctx-1)*logits->nb[1]), - (llama_token *) tokens_input->data, - sample_ctx-1); - //int token = ggml_get_i32_1d(best_samples, sample_ctx-1); - - // print_row(probs, sample_at); - print_token(lctx, token); - - lshift_examples(tokens_input, target_logits, target_probs, 1); - ggml_set_i32_1d(tokens_input, 0, 0); - ggml_set_i32_1d(tokens_input, sample_ctx-1, token); - - ggml_free(ctx0); - } - } - if (alloc) { ggml_allocr_free(alloc); }