diff --git a/ggml-alloc.c b/ggml-alloc.c index bd367c42d..ff40e3345 100644 --- a/ggml-alloc.c +++ b/ggml-alloc.c @@ -988,19 +988,19 @@ ggml_backend_buffer_t ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_conte this_size = GGML_PAD(ggml_backend_buft_get_alloc_size(buft, t), alignment); } - if (this_size > max_size) { - fprintf(stderr, "%s: tensor %s is too large to fit in a %s buffer (tensor size: %zu, max buffer size: %zu)\n", - __func__, t->name, - ggml_backend_buft_name(buft), - this_size, max_size); - for (size_t i = 0; i < n_buffers; i++) { - ggml_backend_buffer_free(buffers[i]); - } - free(buffers); - return NULL; - } + //if (this_size > max_size) { + // fprintf(stderr, "%s: tensor %s is too large to fit in a %s buffer (tensor size: %zu, max buffer size: %zu)\n", + // __func__, t->name, + // ggml_backend_buft_name(buft), + // this_size, max_size); + // for (size_t i = 0; i < n_buffers; i++) { + // ggml_backend_buffer_free(buffers[i]); + // } + // free(buffers); + // return NULL; + //} - if ((cur_buf_size + this_size) > max_size) { + if (cur_buf_size != 0 && (cur_buf_size + this_size) > max_size) { // allocate tensors in the current buffer if (!alloc_tensor_range(ctx, first, t, buft, cur_buf_size, &buffers, &n_buffers)) { return NULL; diff --git a/ggml-cuda.cu b/ggml-cuda.cu index 0acfda91d..781f6eaad 100644 --- a/ggml-cuda.cu +++ b/ggml-cuda.cu @@ -576,6 +576,11 @@ GGML_CALL static size_t ggml_backend_cuda_buffer_type_get_alignment(ggml_backend GGML_UNUSED(buft); } +GGML_CALL static size_t ggml_backend_cuda_buffer_type_get_max_size(ggml_backend_buffer_type_t buft) { + ggml_backend_cuda_buffer_type_context * buft_ctx = (ggml_backend_cuda_buffer_type_context *)buft->context; + return ggml_cuda_info().devices[buft_ctx->device].vmm_granularity; +} + GGML_CALL static size_t ggml_backend_cuda_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, const ggml_tensor * tensor) { size_t size = ggml_nbytes(tensor); int64_t ne0 = tensor->ne[0]; @@ -595,7 +600,7 @@ static ggml_backend_buffer_type_i ggml_backend_cuda_buffer_type_interface = { /* .get_name = */ ggml_backend_cuda_buffer_type_name, /* .alloc_buffer = */ ggml_backend_cuda_buffer_type_alloc_buffer, /* .get_alignment = */ ggml_backend_cuda_buffer_type_get_alignment, - /* .get_max_size = */ NULL, // defaults to SIZE_MAX + /* .get_max_size = */ ggml_backend_cuda_buffer_type_get_max_size, /* .get_alloc_size = */ ggml_backend_cuda_buffer_type_get_alloc_size, /* .is_host = */ NULL, };