diff --git a/ggml-alloc.c b/ggml-alloc.c
index bd367c42d..ff40e3345 100644
--- a/ggml-alloc.c
+++ b/ggml-alloc.c
@@ -988,19 +988,19 @@ ggml_backend_buffer_t ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_conte
             this_size = GGML_PAD(ggml_backend_buft_get_alloc_size(buft, t), alignment);
         }
 
-        if (this_size > max_size) {
-            fprintf(stderr, "%s: tensor %s is too large to fit in a %s buffer (tensor size: %zu, max buffer size: %zu)\n",
-                    __func__, t->name,
-                    ggml_backend_buft_name(buft),
-                    this_size, max_size);
-            for (size_t i = 0; i < n_buffers; i++) {
-                ggml_backend_buffer_free(buffers[i]);
-            }
-            free(buffers);
-            return NULL;
-        }
+        //if (this_size > max_size) {
+        //    fprintf(stderr, "%s: tensor %s is too large to fit in a %s buffer (tensor size: %zu, max buffer size: %zu)\n",
+        //            __func__, t->name,
+        //            ggml_backend_buft_name(buft),
+        //            this_size, max_size);
+        //    for (size_t i = 0; i < n_buffers; i++) {
+        //        ggml_backend_buffer_free(buffers[i]);
+        //    }
+        //    free(buffers);
+        //    return NULL;
+        //}
 
-        if ((cur_buf_size + this_size) > max_size) {
+        if (cur_buf_size != 0 && (cur_buf_size + this_size) > max_size) {
             // allocate tensors in the current buffer
             if (!alloc_tensor_range(ctx, first, t, buft, cur_buf_size, &buffers, &n_buffers)) {
                 return NULL;
diff --git a/ggml-cuda.cu b/ggml-cuda.cu
index 0acfda91d..781f6eaad 100644
--- a/ggml-cuda.cu
+++ b/ggml-cuda.cu
@@ -576,6 +576,11 @@ GGML_CALL static size_t ggml_backend_cuda_buffer_type_get_alignment(ggml_backend
     GGML_UNUSED(buft);
 }
 
+GGML_CALL static size_t ggml_backend_cuda_buffer_type_get_max_size(ggml_backend_buffer_type_t buft) { // get_max_size callback: reports the largest buffer size for this CUDA buffer type
+    ggml_backend_cuda_buffer_type_context * buft_ctx = (ggml_backend_cuda_buffer_type_context *)buft->context; // buffer-type context; supplies the device index used below
+    return ggml_cuda_info().devices[buft_ctx->device].vmm_granularity; // NOTE(review): caps buffers at the device's vmm_granularity - presumably a test aid to force allocation across several buffers; confirm before merging
+}
+
 GGML_CALL static size_t ggml_backend_cuda_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, const ggml_tensor * tensor) {
     size_t size = ggml_nbytes(tensor);
     int64_t ne0 = tensor->ne[0];
@@ -595,7 +600,7 @@ static ggml_backend_buffer_type_i ggml_backend_cuda_buffer_type_interface = {
     /* .get_name         = */ ggml_backend_cuda_buffer_type_name,
     /* .alloc_buffer     = */ ggml_backend_cuda_buffer_type_alloc_buffer,
     /* .get_alignment    = */ ggml_backend_cuda_buffer_type_get_alignment,
-    /* .get_max_size     = */ NULL, // defaults to SIZE_MAX
+    /* .get_max_size     = */ ggml_backend_cuda_buffer_type_get_max_size,
     /* .get_alloc_size   = */ ggml_backend_cuda_buffer_type_get_alloc_size,
     /* .is_host          = */ NULL,
 };