diff --git a/llama.cpp b/llama.cpp
index 8087e9dad..af5da35a6 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -9199,10 +9199,12 @@ struct llama_context * llama_new_context_with_model(
 #elif defined(GGML_USE_CUBLAS)
         if (model->n_gpu_layers > 0) {
             // with split_mode LLAMA_SPLIT_NONE or LLAMA_SPLIT_ROW, only the main GPU backend is used
-            if (model->split_mode == LLAMA_SPLIT_ROW || model->split_mode == LLAMA_SPLIT_NONE) {
+            if (model->split_mode == LLAMA_SPLIT_NONE || model->split_mode == LLAMA_SPLIT_ROW) {
                 ggml_backend_t backend = ggml_backend_cuda_init(model->main_gpu);
                 if (backend == nullptr) {
                     LLAMA_LOG_ERROR("%s: failed to initialize CUDA%d backend\n", __func__, model->main_gpu);
+                    llama_free(ctx);
+                    return nullptr;
                 }
                 ctx->backends.push_back(backend);
             } else {
@@ -9211,6 +9213,8 @@ struct llama_context * llama_new_context_with_model(
                 ggml_backend_t backend = ggml_backend_cuda_init(device);
                 if (backend == nullptr) {
                     LLAMA_LOG_ERROR("%s: failed to initialize CUDA%d backend\n", __func__, device);
+                    llama_free(ctx);
+                    return nullptr;
                 }
                 ctx->backends.push_back(backend);
             }