diff --git a/ggml-cuda.cu b/ggml-cuda.cu
index dac4ab201..b19e7ad62 100644
--- a/ggml-cuda.cu
+++ b/ggml-cuda.cu
@@ -6741,7 +6741,7 @@ static void ggml_cuda_pool_free_vmm(int device, void * ptr, size_t size) {
 }
 
 static void * ggml_cuda_pool_malloc(int device, size_t size, size_t * actual_size) {
-    if (device == g_main_device && g_device_caps[device].vmm) {
+    if (g_device_caps[device].vmm) {
         return ggml_cuda_pool_malloc_vmm(device, size, actual_size);
     } else {
         return ggml_cuda_pool_malloc_leg(device, size, actual_size);
@@ -6749,7 +6749,7 @@ static void * ggml_cuda_pool_malloc(int device, size_t size, size_t * actual_siz
 }
 
 static void ggml_cuda_pool_free(int device, void * ptr, size_t size) {
-    if (device == g_main_device && g_device_caps[device].vmm) {
+    if (g_device_caps[device].vmm) {
         ggml_cuda_pool_free_vmm(device, ptr, size);
     } else {
         ggml_cuda_pool_free_leg(device, ptr, size);
diff --git a/llama.cpp b/llama.cpp
index 0b99f1e03..4aa59c4c0 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -9519,7 +9519,8 @@ struct llama_context * llama_new_context_with_model(
             ctx->alloc = ggml_allocr_new_from_buffer(ctx->buf_alloc);
 #if defined(GGML_USE_CUBLAS) && !defined(LLAMA_GGML_BACKEND_CUDA_TEST)
             if (model->n_gpu_layers > 0) {
-                ggml_cuda_set_scratch_size(alloc_size);
+                // the CPU buffer adds this padding in case the malloc buffer is not aligned, so we need to do the same for the GPU buffer, since we use the same offsets
+                ggml_cuda_set_scratch_size(alloc_size + 64);
                 LLAMA_LOG_INFO("%s: VRAM scratch buffer: %.2f MiB\n", __func__, alloc_size / 1024.0 / 1024.0);
 
                 // calculate total VRAM usage