ggml : adjust mul_mat_f16 work memory (#1226)

* llama : minor - remove explicit int64_t cast

* ggml : reduce memory buffer for F16 mul_mat when not using cuBLAS

* ggml : add asserts to guard for incorrect wsize
Georgi Gerganov 2023-04-29 18:43:28 +03:00 committed by GitHub
parent 305eb5afd5
commit 214b6a3570
3 changed files with 23 additions and 9 deletions
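
For context on the second bullet: when ggml plans a graph it reserves a per-node work buffer (wsize) for F16 x F32 matrix multiplication. With cuBLAS the F16 operand is converted up to F32, so an F32-sized buffer is needed; on the CPU path it is src1 that gets converted down to F16, which needs less room. A minimal sketch of that sizing logic follows — the exact expressions are an assumption based on ggml.c of this era, not the literal patch:

    // Sketch (assumed shape of the ggml.c planning code, not the literal diff):
    // size the scratch buffer for a GGML_TYPE_F16 x GGML_TYPE_F32 mul_mat node.
    size_t cur = 0;
    if (node->src0->type == GGML_TYPE_F16 && node->src1->type == GGML_TYPE_F32) {
    #if defined(GGML_USE_CUBLAS)
        // cuBLAS path: src0 is converted up to F32, so reserve F32-sized storage
        cur = GGML_TYPE_SIZE[GGML_TYPE_F32]*(node->src0->ne[0]*node->src0->ne[1]);
    #else
        // CPU path: src1 is converted down to F16 instead -- only
        // ggml_nelements(src1) half-precision slots are required
        cur = GGML_TYPE_SIZE[GGML_TYPE_F16]*ggml_nelements(node->src1);
    #endif
    }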

llama.cpp

@@ -780,7 +780,7 @@ static bool kv_cache_init(
     const int n_embd  = hparams.n_embd;
     const int n_layer = hparams.n_layer;
 
-    const int64_t n_mem      = (int64_t)n_layer*n_ctx;
+    const int64_t n_mem      = n_layer*n_ctx;
     const int64_t n_elements = n_embd*n_mem;
 
     cache.buf.resize(2u*n_elements*ggml_type_size(wtype) + 2u*MB);
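
The removed cast was redundant: n_mem is declared int64_t, and for the layer counts and context sizes llama.cpp supports, the n_layer*n_ctx product fits comfortably in a 32-bit int before widening, so dropping (int64_t) changes nothing in practice.

The commit message also mentions asserts guarding against an incorrect wsize. A minimal sketch of what such a guard looks like in the F16 mul_mat init pass — the placement and loop shape here are an assumption for illustration, not the verbatim lines of the commit (ne10/ne11 and nb10/nb11 are src1's dims and strides per ggml convention):

    // Sketch (assumed placement): during GGML_TASK_INIT the F32 rows of src1
    // are converted to F16 into the work buffer; assert the conversion never
    // writes past the wsize that was reserved during graph planning.
    ggml_fp16_t * const wdata = params->wdata;

    size_t id = 0;
    for (int64_t i1 = 0; i1 < ne11; ++i1) {
        for (int64_t i0 = 0; i0 < ne10; ++i0) {
            wdata[id++] = GGML_FP32_TO_FP16(
                *(const float *)((const char *)src1->data + i1*nb11 + i0*nb10));
        }
    }

    GGML_ASSERT(id*sizeof(ggml_fp16_t) <= params->wsize);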