diff --git a/ggml.c b/ggml.c
index d579937a7..cfdf427df 100644
--- a/ggml.c
+++ b/ggml.c
@@ -5858,11 +5858,11 @@ static bool ggml_compute_forward_mul_mat_use_blas(
     if (ggml_is_contiguous(src0) &&
         ggml_is_contiguous(src1) && ((ne0 >= 32 && ne1 >= 32 && ne10 >= 32))) {
 
-        // disable BLAS for Q4_0 and Q4_1
-        // looks like there is no benefit and we only waste a lot of memory
-        if (src0->type == GGML_TYPE_Q4_0 || src0->type == GGML_TYPE_Q4_1) {
-            return false;
-        }
+        //// disable BLAS for Q4_0 and Q4_1
+        //// looks like there is no benefit and we only waste a lot of memory
+        //if (src0->type == GGML_TYPE_Q4_0 || src0->type == GGML_TYPE_Q4_1) {
+        //    return false;
+        //}
 
         //printf("BLAS: %d %d %d %d %d\n", ne0, ne1, ne10, ne00, ne01);
         return true;
diff --git a/llama.cpp b/llama.cpp
index b5684d6fa..4caf607b7 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -44,7 +44,7 @@ enum e_model {
 static const size_t MB = 1024*1024;
 
 // computed for n_ctx == 2048
-// TODO: dynamically determine thess sizes
+// TODO: dynamically determine these sizes
 //       needs modifications in ggml
 static const std::map<e_model, size_t> MEM_REQ_SCRATCH0 = {
@@ -69,11 +69,13 @@ static const std::map<e_model, size_t> MEM_REQ_KV_SELF = {
     { MODEL_65B, 5120ull*MB },
 };
 
+// this is mostly needed for temporary mul_mat buffers to dequantize the data
+// not actually needed if BLAS is disabled
 static const std::map<e_model, size_t> MEM_REQ_EVAL = {
-    { MODEL_7B,   128ull*MB },
-    { MODEL_13B,  128ull*MB },
-    { MODEL_30B,  128ull*MB },
-    { MODEL_65B,  128ull*MB },
+    { MODEL_7B,   768ull*MB },
+    { MODEL_13B, 1024ull*MB },
+    { MODEL_30B, 1280ull*MB },
+    { MODEL_65B, 1536ull*MB },
 };
 
 // default hparams (LLaMA 7B)
@@ -1034,7 +1036,7 @@ static bool llama_eval_internal(
     }
 
 #if 0
-    printf("\n%s: used_mem = %.3f MB, scratch -- %.3f MB, %.3f MB %.3f MB %.3f %.3f %.3f MB\n", __func__,
+    printf("\n%s: used_mem = %.3f MB, scratch -- %.3f MB %.3f MB\n", __func__,
            ggml_used_mem(ctx0)/1024.0/1024.0,
            lctx.get_buf_max_mem(0)/1024.0/1024.0,
            lctx.get_buf_max_mem(1)/1024.0/1024.0);
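
A quick sanity check on the new MEM_REQ_EVAL numbers. With BLAS re-enabled for Q4_0/Q4_1, ggml's BLAS mul_mat path dequantizes the full src0 weight matrix to F32 into the work buffer, and that work buffer is created inside the eval context when not preallocated, so the eval buffer now has to cover at least one n_embd x n_vocab output-projection matrix in F32. The sketch below is illustrative only and not part of the patch; the dimensions are the standard LLaMA ones (n_vocab = 32000; n_embd = 4096/5120/6656/8192 for 7B/13B/30B/65B):

/*
 * Back-of-the-envelope check for the new MEM_REQ_EVAL sizes.
 * Assumption: on the BLAS path, ggml sizes the quantized mul_mat work
 * buffer to hold src0->ne[0]*src0->ne[1] floats, i.e. the whole weight
 * matrix dequantized to F32. The largest weight matrix in LLaMA is the
 * output projection (n_embd x n_vocab).
 */
#include <stdio.h>
#include <stddef.h>

int main(void) {
    const size_t MB      = 1024*1024;
    const size_t n_vocab = 32000; // standard LLaMA vocab size

    const struct { const char * name; size_t n_embd; size_t mem_req_eval; } models[] = {
        { "7B",  4096,  768*MB },
        { "13B", 5120, 1024*MB },
        { "30B", 6656, 1280*MB },
        { "65B", 8192, 1536*MB },
    };

    for (int i = 0; i < 4; i++) {
        // F32 buffer needed to dequantize the n_embd x n_vocab output matrix
        const size_t dequant = models[i].n_embd*n_vocab*sizeof(float);
        printf("%-3s: dequantized output matrix = %6.1f MB, eval budget = %4zu MB\n",
               models[i].name, dequant/(double)MB, models[i].mem_req_eval/MB);
    }

    return 0;
}

This prints roughly 500/625/812.5/1000 MB for the four models, which explains both why the old flat 128 MB stopped being enough once BLAS handles Q4_0/Q4_1, and why the new 768/1024/1280/1536 MB budgets leave only moderate headroom for the graph's other temporaries.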