llama : minor sampling refactor (2) (#9386)

slaren 2024-09-09 17:10:46 +02:00 committed by GitHub
parent 38ca6f644b
commit 5fb5e24811
12 changed files with 115 additions and 113 deletions
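
Every hunk below makes the same change: the explicit llama_sampler_accept() call after llama_sampler_sample() is dropped, the sampler now recording the sampled token as part of sampling itself. A minimal caller-side sketch of the resulting pattern (the helper name sample_next is hypothetical and not part of this commit):

#include "llama.h"

// Hypothetical helper, for illustration only: sample the next token at batch
// position idx. Before this commit the caller also had to call
// llama_sampler_accept(smpl, id) afterwards; after it, sampling alone is enough.
static llama_token sample_next(llama_sampler * smpl, llama_context * ctx, int32_t idx) {
    const llama_token id = llama_sampler_sample(smpl, ctx, idx);
    // no explicit llama_sampler_accept(smpl, id) needed anymore
    return id;
}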

@@ -140,8 +140,6 @@ while n_cur <= n_len {
     let new_token_id = llama_sampler_sample(smpl, context, i_batch[i])
 
-    llama_sampler_accept(smpl, new_token_id)
-
     // is it an end of stream? -> mark the stream as finished
     if llama_token_is_eog(model, new_token_id) || n_cur == n_len {
         i_batch[i] = -1

@@ -172,8 +172,6 @@ int main(int argc, char ** argv) {
     const llama_token new_token_id = llama_sampler_sample(smpl, ctx, i_batch[i]);
 
-    llama_sampler_accept(smpl, new_token_id);
-
     // is it an end of generation? -> mark the stream as finished
     if (llama_token_is_eog(model, new_token_id) || n_cur == n_predict) {
         i_batch[i] = -1;

@@ -121,7 +121,6 @@ static std::string generate(llama_context * ctx, llama_sampler * smpl, const std
         llama_decode(ctx, bat);
 
         llama_token token = llama_sampler_sample(smpl, ctx, bat.n_tokens - 1);
-        llama_sampler_accept(smpl, token);
 
         if (token == eos_token) {
             break;

@@ -414,8 +414,6 @@ Java_android_llama_cpp_LLamaAndroid_completion_1loop(
     // sample the most likely token
     const auto new_token_id = llama_sampler_sample(sampler, context, -1);
 
-    llama_sampler_accept(sampler, new_token_id);
-
     const auto n_cur = env->CallIntMethod(intvar_ncur, la_int_var_value);
     if (llama_token_is_eog(model, new_token_id) || n_cur == n_len) {
         return nullptr;

@@ -152,8 +152,6 @@ actor LlamaContext {
         new_token_id = llama_sampler_sample(sampling, context, batch.n_tokens - 1)
 
-        llama_sampler_accept(sampling, new_token_id)
-
         if llama_token_is_eog(model, new_token_id) || n_cur == n_len {
             print("\n")
             is_done = true

@@ -220,8 +220,6 @@ int main(int argc, char ** argv) {
         {
             const llama_token new_token_id = llama_sampler_sample(smpl, ctx, batch.n_tokens - 1);
 
-            llama_sampler_accept(smpl, new_token_id);
-
             // is it an end of generation?
             if (llama_token_is_eog(model, new_token_id) || n_cur == n_len) {
                 LOG_TEE("\n");

@@ -74,8 +74,6 @@ int main(int argc, char ** argv) {
         auto next_token = llama_sampler_sample(smpl, ctx, -1);
         auto next_token_str = llama_token_to_piece(ctx, next_token);
 
-        llama_sampler_accept(smpl, next_token);
-
         printf("%s", next_token_str.c_str());
         result0 += next_token_str;
@@ -132,8 +130,6 @@ int main(int argc, char ** argv) {
         auto next_token = llama_sampler_sample(smpl2, ctx2, -1);
         auto next_token_str = llama_token_to_piece(ctx2, next_token);
 
-        llama_sampler_accept(smpl2, next_token);
-
         printf("%s", next_token_str.c_str());
         result1 += next_token_str;
@@ -222,8 +218,6 @@ int main(int argc, char ** argv) {
         auto next_token = llama_sampler_sample(smpl3, ctx3, -1);
         auto next_token_str = llama_token_to_piece(ctx3, next_token);
 
-        llama_sampler_accept(smpl3, next_token);
-
         printf("%s", next_token_str.c_str());
         result2 += next_token_str;

@@ -613,7 +613,7 @@ struct server_context {
     gpt_params params;
 
-    llama_batch batch;
+    llama_batch batch = {};
 
     bool clean_kv_cache = true;
     bool add_bos_token = true;
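
The server change above swaps a default-initialized llama_batch member for a value-initialized one, so its pointer and count fields start out zeroed instead of indeterminate. A standalone sketch of the difference, using a stand-in struct rather than the real llama_batch:

#include <cstdint>
#include <cstdio>

// Stand-in aggregate for illustration only; the real llama_batch is declared in llama.h.
struct demo_batch {
    int32_t   n_tokens;
    int32_t * token;
};

int main() {
    demo_batch a;        // default-initialized: members hold indeterminate values
    demo_batch b = {};   // value-initialized: n_tokens == 0, token == nullptr
    std::printf("n_tokens = %d, token = %p\n", b.n_tokens, (void *) b.token);
    (void) a;            // reading a's members before assignment would be undefined behavior
    return 0;
}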

@@ -118,8 +118,6 @@ int main(int argc, char ** argv) {
         {
             const llama_token new_token_id = llama_sampler_sample(smpl, ctx, batch.n_tokens - 1);
 
-            llama_sampler_accept(smpl, new_token_id);
-
             // is it an end of generation?
             if (llama_token_is_eog(model, new_token_id) || n_cur == n_predict) {
                 LOG_TEE("\n");