llama : grouped-query attention + LLaMAv2 70B support (#2276)

* CUDA: GQA implementation

* llama : support for GQA and LLaMAv2 70B

ggml-ci

* py : fix hparams parsing (if-else blocks)

ggml-ci

* py : oh boy ..

ggml-ci

* help : fix gqa value for 70B

ggml-ci

---------

Co-authored-by: JohannesGaessler <johannesg@5d6.de>
This commit is contained in:
Georgi Gerganov 2023-07-23 15:09:47 +03:00 committed by GitHub
parent 1d0824b247
commit e76d630df1
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
7 changed files with 215 additions and 108 deletions

View file

@@ -93,8 +93,8 @@ int main(int argc, char ** argv) {
}
if (params.n_ctx > 2048) {
-fprintf(stderr, "%s: warning: base model only supports context sizes no greater than 2048 tokens (%d specified);"
-" you are on your own\n", __func__, params.n_ctx);
+// TODO: determine the actual max context of the model (e.g. 4096 for LLaMA v2) and use that instead of 2048
+fprintf(stderr, "%s: warning: base model only supports context sizes no greater than 2048 tokens (%d specified)\n", __func__, params.n_ctx);
} else if (params.n_ctx < 8) {
fprintf(stderr, "%s: warning: minimum context size is 8, using minimum size.\n", __func__);
params.n_ctx = 8;