llama : remove all_pos_0, all_pos_1, all_seq_id from llama_batch (#9745)

* refactor llama_batch_get_one
* adapt all examples
* fix simple.cpp
* fix llama_bench
* fix
* fix context shifting
* free batch before return
* use common_batch_add, reuse llama_batch in loop
* null terminated seq_id list
* fix save-load-state example
* fix perplexity
* correct token pos in llama_batch_allocr
2024-10-18 23:18:01 +02:00 · 2024-10-18 23:18:01 +02:00 · cda0e4b648
commit cda0e4b648
parent afd9909a64
22 changed files with 205 additions and 118 deletions
--- a/examples/perplexity/perplexity.cpp
+++ b/examples/perplexity/perplexity.cpp
@@ -408,14 +408,21 @@ static results_perplexity perplexity_v2(llama_context * ctx, const common_params
        // clear the KV cache
        llama_kv_cache_clear(ctx);

+        llama_batch batch = llama_batch_init(n_batch, 0, 1);
+
        for (int j = 0; j < num_batches; ++j) {
            const int batch_start = start + j * n_batch;
            const int batch_size  = std::min(end - batch_start, n_batch);

+            common_batch_clear(batch);
+            for (int i = 0; i < batch_size; i++) {
+                common_batch_add(batch, tokens[batch_start + i], j*n_batch + i, {0}, true);
+            }
+
            //LOG_DBG("    Batch %d: starts at %d, size is %d, n_past is %d\n",j,batch_start,batch_size,j * n_batch);
-            // TODO: use llama_batch.logits instead of relying on logits_all == true
-            if (llama_decode(ctx, llama_batch_get_one(tokens.data() + batch_start, batch_size, j * n_batch, 0))) {
+            if (llama_decode(ctx, batch)) {
                //LOG_ERR("%s : failed to eval\n", __func__);
+                llama_batch_free(batch);
                return {tokens, -1, logit_history, prob_history};
            }

@@ -435,6 +442,8 @@ static results_perplexity perplexity_v2(llama_context * ctx, const common_params
            }
        }

+        llama_batch_free(batch);
+
        const auto t_end = std::chrono::high_resolution_clock::now();

        if (i == 0) {
@@ -704,7 +713,6 @@ static bool decode_helper(llama_context * ctx, llama_batch & batch, std::vector<
            batch.n_seq_id + i,
            batch.seq_id   + i,
            batch.logits   + i,
-            0, 0, 0, // unused
        };

        const int ret = llama_decode(ctx, batch_view);
@@ -1791,6 +1799,8 @@ static void kl_divergence(llama_context * ctx, const common_params & params) {
        // clear the KV cache
        llama_kv_cache_clear(ctx);

+        llama_batch batch = llama_batch_init(n_batch, 0, 1);
+
        for (int j = 0; j < num_batches; ++j) {
            const int batch_start = start + j * n_batch;
            const int batch_size  = std::min(end - batch_start, n_batch);
@@ -1803,9 +1813,14 @@ static void kl_divergence(llama_context * ctx, const common_params & params) {
                tokens[batch_start] = llama_token_bos(llama_get_model(ctx));
            }

-            // TODO: use llama_batch.logits instead of relying on logits_all == true
-            if (llama_decode(ctx, llama_batch_get_one(tokens.data() + batch_start, batch_size, j * n_batch, 0))) {
+            common_batch_clear(batch);
+            for (int i = 0; i < batch_size; i++) {
+                common_batch_add(batch, tokens[batch_start + i], j*n_batch + i, {0}, true);
+            }
+
+            if (llama_decode(ctx, batch)) {
                LOG_ERR("%s : failed to eval\n", __func__);
+                llama_batch_free(batch);
                return;
            }

@@ -1818,6 +1833,8 @@ static void kl_divergence(llama_context * ctx, const common_params & params) {
            }
        }

+        llama_batch_free(batch);
+
        const auto t_end = std::chrono::high_resolution_clock::now();

        if (i == 0) {