llama.cpp : fix MEM_REQ_SCRATCH0 reusing the value of n_ctx of the first call
commit 89a70f78e7
parent 5765f90f58
2 changed files with 12 additions and 11 deletions
examples/llama-bench/llama-bench.cpp
@@ -630,17 +630,18 @@ struct markdown_printer : public printer {
     virtual void print_header(const cmd_params & params) {
         fields = { "model", "backend" };
-        if (backend_params::get_backend() != "CPU") {
+        bool is_cpu_backend = backend_params::get_backend() == "CPU" || backend_params::get_backend() == "BLAS";
+        if (!is_cpu_backend) {
             fields.push_back("n_gpu_layers");
         }
+        if (params.n_threads.size() > 1 || is_cpu_backend) {
+            fields.push_back("n_threads");
+        }
         if (params.n_batch.size() > 1) {
             fields.push_back("n_batch");
         }
-        if (params.n_threads.size() > 1 || backend_params::get_backend() == "CPU") {
-            fields.push_back("n_threads");
-        }
         if (params.f32_kv.size() > 1) {
-            fields.push_back("f32_kv");
+            fields.push_back("f16_kv");
         }
         if (params.main_gpu.size() > 1) {
             fields.push_back("main_gpu");
@@ -723,9 +724,9 @@ void test_prompt(llama_context * ctx, int n_prompt, int n_past, int n_batch, int
     std::vector<llama_token> tokens(n_batch, llama_token_bos());
     int n_processed = 0;
     while (n_processed < n_prompt) {
-        int n = std::min(n_prompt - n_processed, n_batch);
-        llama_eval(ctx, tokens.data(), n, n_past + n_processed, n_threads);
-        n_processed += n;
+        int n_tokens = std::min(n_prompt - n_processed, n_batch);
+        llama_eval(ctx, tokens.data(), n_tokens, n_past + n_processed, n_threads);
+        n_processed += n_tokens;
     }
 }
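For context, the fields selected in print_header() above become the columns of the markdown table that llama-bench prints. The sketch below is a generic, hedged illustration of that idea in standalone C++; it is not llama-bench's actual printer code.

// Hedged illustration (not llama-bench code): how a conditional field list
// like the one built in print_header() turns into a markdown table header.
#include <cstdio>
#include <string>
#include <vector>

int main() {
    bool is_cpu_backend = false;      // e.g. a GPU backend is in use
    bool many_thread_counts = false;  // only one -t value was given

    std::vector<std::string> fields = { "model", "backend" };
    if (!is_cpu_backend) {
        fields.push_back("n_gpu_layers");
    }
    if (many_thread_counts || is_cpu_backend) {
        fields.push_back("n_threads");
    }

    // Prints "| model | backend | n_gpu_layers |" followed by the separator row.
    for (const auto & f : fields) { printf("| %s ", f.c_str()); }
    printf("|\n");
    for (size_t i = 0; i < fields.size(); i++) { printf("| --- "); }
    printf("|\n");
    return 0;
}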
llama.cpp
@@ -115,9 +115,9 @@ static void ggml_graph_compute_helper(std::vector<uint8_t> & buf, ggml_cgraph *
 // memory sizes (calculated for n_batch == 512)
 //
-static const std::map<e_model, size_t> & MEM_REQ_SCRATCH0(int n_ctx)
+static std::map<e_model, size_t> MEM_REQ_SCRATCH0(int n_ctx)
 {
-    static std::map<e_model, size_t> k_sizes = {
+    std::map<e_model, size_t> k_sizes = {
         { MODEL_3B,  ((size_t) n_ctx / 16ull + 92ull) * MB },
         { MODEL_7B,  ((size_t) n_ctx / 16ull + 100ull) * MB },
         { MODEL_13B, ((size_t) n_ctx / 12ull + 120ull) * MB },
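The llama.cpp hunk above is the fix named in the commit title: because MEM_REQ_SCRATCH0 kept its map in a function-local static, the initializer that depends on n_ctx ran only on the first call, and every later call with a different n_ctx got the scratch sizes computed for the first one. Below is a minimal standalone sketch of that pitfall and of the shape of the fix, assuming a hypothetical scratch_size() helper in place of the real MEM_REQ_SCRATCH0; it is not the repository code.

// Minimal standalone sketch of the pitfall fixed above (hypothetical
// scratch_size(), not the repository code).
#include <cstdio>
#include <cstddef>

// Buggy shape: the static local is initialized only on the FIRST call, so
// every later call silently returns the size computed for that first n_ctx.
static const size_t & scratch_size_buggy(int n_ctx) {
    static size_t size = (size_t) n_ctx / 16 + 100;  // computed once, then frozen
    return size;
}

// Fixed shape: no static local, the size is recomputed on every call and
// returned by value, so it always reflects the n_ctx that was passed in.
static size_t scratch_size_fixed(int n_ctx) {
    return (size_t) n_ctx / 16 + 100;
}

int main() {
    printf("buggy: n_ctx=512  -> %zu\n", scratch_size_buggy(512));   // 132
    printf("buggy: n_ctx=4096 -> %zu\n", scratch_size_buggy(4096));  // still 132 (stale)
    printf("fixed: n_ctx=512  -> %zu\n", scratch_size_fixed(512));   // 132
    printf("fixed: n_ctx=4096 -> %zu\n", scratch_size_fixed(4096));  // 356
    return 0;
}

In the actual fix the map is simply rebuilt and returned by value on each call; it has only a handful of entries and this is setup-time code, so the extra copy is negligible.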