Automate Context resetting and minor fixes
Fixed top_k still not being set. Removed an unnecessary loop.
parent 66ed19d01f
commit 36c86d794d

1 changed file with 27 additions and 29 deletions
@@ -29,36 +29,17 @@ struct llama_server_context
     std::vector<std::vector<llama_token>> no_show_words;
     std::vector<llama_token> tokens_predicted;
 
+    std::vector<llama_token> last_prompt_tokens;
+
     llama_context *ctx;
     gpt_params params;
 
-    bool reload_ctx = false;
-
     void rewind() {
         as_loop = false;
         params.antiprompt.clear();
         no_show_words.clear();
         num_tokens_predicted = 0;
         generated_text = "";
-
-        if(reload_ctx)
-        {
-            if(!processed_tokens.empty())
-            {
-                processed_tokens.erase(processed_tokens.begin() + 1, processed_tokens.end());
-            }
-
-            if(!embd_inp.empty())
-            {
-                embd_inp.erase(embd_inp.begin() + 1, embd_inp.end());
-            }
-
-            n_remain = 0;
-            n_past = 0;
-            n_consumed = 0;
-
-            reload_ctx = false;
-        }
     }
 
     bool loadModel(gpt_params params_)
@@ -82,6 +63,28 @@ struct llama_server_context
         std::vector<llama_token> prompt_tokens = ::llama_tokenize(ctx, params.prompt, true);
         // compare the evaluated prompt with the new prompt
         int new_prompt_len = 0;
+        if (last_prompt_tokens == prompt_tokens)
+        {
+            //fprintf(stdout, "Context matched.\n");
+            processed_tokens = last_prompt_tokens;
+            embd_inp = last_prompt_tokens;
+            n_past = processed_tokens.size();
+            n_consumed = last_prompt_tokens.size() - 2;
+            last_prompt_tokens = prompt_tokens;
+            has_next_token = true;
+            return true;
+        }
+        else
+        {
+            if (!processed_tokens.empty() && !embd_inp.empty())
+            {
+                //fprintf(stdout, "Resetting context.\n");
+                processed_tokens.erase(processed_tokens.begin() + 1, processed_tokens.end());
+                embd_inp.erase(embd_inp.begin() + 1, embd_inp.end());
+                n_consumed = 0;
+                n_past = 0;
+            }
+        }
         for (size_t i = 0; i < prompt_tokens.size(); i++) {
             if (i < processed_tokens.size() &&
                 processed_tokens[i] == prompt_tokens[i])
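The block added above is what makes the reset automatic: instead of acting on a client-supplied reload_ctx flag, the server keeps the previously tokenized prompt in last_prompt_tokens and compares it against the incoming one. Below is a simplified, standalone sketch of that decision using the member names visible in this diff; the wrapper struct, method name, and types are illustrative rather than the actual server code.

// Simplified sketch of the automatic reuse-or-reset decision (names/types illustrative).
#include <vector>

using llama_token = int; // stand-in for the llama.cpp token type

struct ctx_reuse_sketch {
    std::vector<llama_token> last_prompt_tokens; // prompt evaluated for the previous request
    std::vector<llama_token> processed_tokens;   // tokens already fed through the model
    std::vector<llama_token> embd_inp;           // pending input tokens
    size_t n_past = 0;
    size_t n_consumed = 0;

    // Returns true when the previously evaluated context can be reused verbatim.
    bool prepare(const std::vector<llama_token> & prompt_tokens) {
        if (prompt_tokens == last_prompt_tokens) {
            // Same prompt as last time: restore the bookkeeping and skip re-evaluation.
            processed_tokens = last_prompt_tokens;
            embd_inp         = last_prompt_tokens;
            n_past           = processed_tokens.size();
            n_consumed       = last_prompt_tokens.size() - 2;
            last_prompt_tokens = prompt_tokens; // kept from the patch; a no-op since both sides already compare equal
            return true;
        }
        // Different prompt: keep only the first token (the BOS added by llama_tokenize(..., true))
        // and start over; the shared-prefix loop that follows in the diff context then reconciles
        // the remaining tokens.
        if (!processed_tokens.empty() && !embd_inp.empty()) {
            processed_tokens.erase(processed_tokens.begin() + 1, processed_tokens.end());
            embd_inp.erase(embd_inp.begin() + 1, embd_inp.end());
            n_consumed = 0;
            n_past     = 0;
        }
        return false;
    }
};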
@@ -159,6 +162,7 @@ struct llama_server_context
         const float temp = params.temp;
         // const int32_t top_k = params.top_k <= 0 ? llama_n_vocab(ctx) : params.top_k;
         const float top_p = params.top_p;
+        const float top_k = params.top_k;
         const float tfs_z = params.tfs_z;
         const float typical_p = params.typical_p;
         const int32_t repeat_last_n = params.repeat_last_n < 0 ? params.n_ctx : params.repeat_last_n;
@@ -229,6 +233,7 @@ struct llama_server_context
                     llama_sample_tail_free(ctx, &candidates_p, tfs_z, 1);
                     llama_sample_typical(ctx, &candidates_p, typical_p, 1);
                     llama_sample_top_p(ctx, &candidates_p, top_p, 1);
+                    llama_sample_top_k(ctx, &candidates_p, top_k, 1);
                     llama_sample_temperature(ctx, &candidates_p, temp);
                     id = llama_sample_token(ctx, &candidates_p);
                 }
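The two hunks above are the top_k fix from the commit message: params.top_k could be configured but had no effect, because the sampling chain contained no llama_sample_top_k call. The sketch below condenses that chain into a standalone helper, assuming the llama.cpp C API of this period; the candidate-array construction is not part of the diff, so treat it as an illustration. (The patch stores top_k in a float, while llama_sample_top_k takes an integer k, so the value is converted at the call.)

// Sketch of the sampler chain after this commit (assumes the llama.cpp/common API of this era).
#include <vector>
#include "llama.h"
#include "common.h" // gpt_params

static llama_token sample_next_token_sketch(llama_context * ctx, const gpt_params & params) {
    // Build the candidate array from the raw logits of the last evaluated token.
    const int n_vocab = llama_n_vocab(ctx);
    float * logits    = llama_get_logits(ctx);

    std::vector<llama_token_data> candidates;
    candidates.reserve(n_vocab);
    for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
        candidates.push_back(llama_token_data{token_id, logits[token_id], 0.0f});
    }
    llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };

    // Same order as the patched code; the llama_sample_top_k line is what this commit adds.
    llama_sample_tail_free  (ctx, &candidates_p, params.tfs_z,     1);
    llama_sample_typical    (ctx, &candidates_p, params.typical_p, 1);
    llama_sample_top_p      (ctx, &candidates_p, params.top_p,     1);
    llama_sample_top_k      (ctx, &candidates_p, params.top_k,     1);
    llama_sample_temperature(ctx, &candidates_p, params.temp);

    return llama_sample_token(ctx, &candidates_p);
}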
@@ -253,10 +258,7 @@ struct llama_server_context
 
             // add it to the context
             embd.push_back(id);
-            for (auto id : embd)
-            {
-                result = id;
-            }
+            result = id;
             // decrement remaining sampling budget
             --n_remain;
         }
@@ -619,10 +621,6 @@ bool parse_options_completion(json body, llama_server_context& llama, Response &
     {
         llama.params.interactive = body["interactive"].get<bool>();
     }
-    if (!body["reload_ctx"].is_null())
-    {
-        llama.reload_ctx = body["reload_ctx"].get<int>();
-    }
     if (!body["prompt"].is_null())
     {
         llama.params.prompt = body["prompt"].get<std::string>();
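The final hunk drops the reload_ctx field from request parsing, so clients no longer need to ask for a reset: the server decides on its own, as shown earlier. A minimal set of completion options built with the json type the handler already receives might look like this; the field values are illustrative, and only "prompt" and "interactive" are confirmed by this diff.

// Illustrative completion options; "reload_ctx" is no longer read by parse_options_completion.
json body = {
    {"prompt",      "Hello, my name is"},
    {"interactive", true},
    // {"reload_ctx", 1} // previously forced a context reset; now ignored
};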