duo: more cleanup
parent f3965704fd
commit 2849247c4f
1 changed file with 10 additions and 29 deletions
@@ -46,45 +46,28 @@ int main(int argc, char ** argv) {
     params.cb_split_done = split_done_cb;
     std::tie(model, ctx) = llama_init_from_gpt_params(params);
 
-    const int n_len = 128;
-    std::vector<llama_token> tokens_list;
-    tokens_list = ::llama_tokenize(ctx, params.prompt, true);
-
-    const int n_ctx = llama_n_ctx(ctx);
-    const int n_kv_req = tokens_list.size() + (n_len - tokens_list.size());
-
-    LOG_TEE("\n%s: n_len = %d, n_ctx = %d, n_kv_req = %d\n", __func__, n_len, n_ctx, n_kv_req);
-
-    // make sure the KV cache is big enough to hold all the prompt and generated tokens
-    if (n_kv_req > n_ctx) {
-        LOG_TEE("%s: error: n_kv_req > n_ctx, the required KV cache size is not big enough\n", __func__);
-        LOG_TEE("%s: either reduce n_len or increase n_ctx\n", __func__);
-        return 1;
-    }
+    llama_tokens input = llama_tokenize(ctx, params.prompt, true);
+    const size_t n_input = input.size();
 
     // print the prompt token-by-token
-    for (auto id : tokens_list) {
-        fprintf(stderr, "%s", llama_token_to_piece(ctx, id).c_str());
+    for (auto id : input) {
+        fprintf(stdout, "%s", llama_token_to_piece(ctx, id).c_str());
     }
-    fflush(stderr);
+    fflush(stdout);
 
     llama_batch batch = llama_batch_init(512, 0, 1);
 
     // evaluate the initial prompt
-    for (size_t i = 0; i < tokens_list.size(); i++) {
-        llama_batch_add(batch, tokens_list[i], i, { 0 }, false);
+    for (size_t i = 0; i < input.size(); i++) {
+        llama_batch_add(batch, input[i], i, { 0 }, false);
     }
 
-    // llama_decode will output logits only for the last token of the prompt
     batch.logits[batch.n_tokens - 1] = true;
 
     if (llama_decode(ctx, batch) != 0) {
         LOG_TEE("%s: llama_decode() failed\n", __func__);
         return 1;
     }
 
-    // main loop
-
     int n_cur = batch.n_tokens;
     int n_decode = 0;
@@ -94,7 +77,7 @@ int main(int argc, char ** argv) {
     // we'll use logits from this position to determine next token
     int logit_idx = batch.n_tokens - 1;
 
-    while (n_cur <= n_len) {
+    while (n_decode <= params.n_predict) {
         // sample the next token
         {
             auto n_vocab = llama_n_vocab(model);
@@ -113,13 +96,11 @@ int main(int argc, char ** argv) {
             const llama_token new_token_id = llama_sample_token_greedy(ctx, &candidates_p);
 
             // is it an end of generation?
-            if (llama_token_is_eog(model, new_token_id) || n_cur == n_len) {
-                LOG_TEE("\n");
-
+            if (llama_token_is_eog(model, new_token_id) || n_decode >= params.n_predict) {
                 break;
             }
 
-            LOG_TEE("%s", llama_token_to_piece(ctx, new_token_id).c_str());
+            fprintf(stdout, "%s", llama_token_to_piece(ctx, new_token_id).c_str());
             fflush(stdout);
 
             // prepare the next batch