ggml-cuda : use graph allocator (#2684)

use a separate function for the no_alloc case to avoid breaking backwards compatibility; fixes LoRA
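
For context, a minimal sketch of the pattern this enables, assuming the ggml-alloc API of this period (ggml_allocr_new, ggml_allocr_alloc_graph) and the new ggml_cuda_assign_buffers_no_alloc entry point in ggml-cuda.h; this is an illustration of the call pattern, not the literal llama.cpp integration:

// Sketch: build tensors in a no_alloc context, mark the result for GPU
// offload without eagerly allocating VRAM, and let the graph allocator
// place the data. Requires a GGML_USE_CUBLAS build; API names as of this
// commit era and may differ in later versions.
#include "ggml.h"
#include "ggml-alloc.h"
#include "ggml-cuda.h"
#include <cstdint>
#include <vector>

int main() {
    struct ggml_init_params ip = {
        /*.mem_size   =*/ 16u*1024*1024,
        /*.mem_buffer =*/ NULL,
        /*.no_alloc   =*/ true,   // tensor metadata only, no data buffers yet
    };
    struct ggml_context * ctx = ggml_init(ip);

    struct ggml_tensor * a = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 2048);
    struct ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 2048);
    struct ggml_tensor * c = ggml_add(ctx, a, b);

    // New entry point: mark c for offload but do NOT allocate VRAM here.
    // The pre-existing ggml_cuda_assign_buffers() keeps its eager behavior,
    // so existing callers (e.g. the LoRA path) are unaffected.
    ggml_cuda_assign_buffers_no_alloc(c);

    struct ggml_cgraph gf = ggml_build_forward(c);

    // The graph allocator assigns offsets for every unallocated tensor.
    std::vector<uint8_t> buf(8u*1024*1024);
    struct ggml_allocr * alloc = ggml_allocr_new(buf.data(), buf.size(), 32);
    ggml_allocr_alloc_graph(alloc, &gf);

    ggml_allocr_free(alloc);
    ggml_free(ctx);
    return 0;
}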

remove 512 n_batch limit

fix 2048 batch size

cleanup

Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
slaren 2023-08-22 15:25:19 +02:00 committed by GitHub
parent ef3f333d37
commit 1123f7fbdf
4 changed files with 92 additions and 228 deletions


@@ -289,7 +289,6 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
                 break;
             }
             params.n_batch = std::stoi(argv[i]);
-            params.n_batch = std::min(512, params.n_batch);
         } else if (arg == "--keep") {
             if (++i >= argc) {
                 invalid_param = true;
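
With the clamp removed, the value passed via -b / --batch-size is used as given, so batch sizes above 512 (e.g. the 2048 case named in the commit message) now reach the CUDA backend intact.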