ggml : move more prints to the ggml log system (#9839)
* ggml : move more prints to the ggml log system
* show BLAS OpenMP warnings in all builds using debug print
parent 7eee341bee
commit 96776405a1
4 changed files with 48 additions and 48 deletions
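The net effect of the WARN → DEBUG changes below is that these messages no longer appear unless the application opts into debug-level output. As a rough sketch of how a consumer could route or filter them, assuming the ggml_log_set / ggml_log_callback API and the ggml_log_level enum declared in ggml.h (names and exact signatures may differ between ggml versions):

    #include <stdio.h>
    #include "ggml.h"

    // Illustrative callback: suppress GGML_LOG_LEVEL_DEBUG messages, forward everything else to stderr.
    static void example_log_callback(enum ggml_log_level level, const char * text, void * user_data) {
        (void) user_data;
        if (level == GGML_LOG_LEVEL_DEBUG) {
            return; // e.g. the "buffer pool full" and "disabling CUDA graphs" prints moved in this commit
        }
        fputs(text, stderr);
    }

    int main(void) {
        // install the callback before initializing backends so early prints are also routed through it
        ggml_log_set(example_log_callback, /*user_data=*/NULL);
        // ... build and compute graphs as usual; WARN/ERROR messages still reach stderr
        return 0;
    }

Note that several of the graph-related prints below are additionally wrapped in #ifndef NDEBUG, so they are compiled out of release builds regardless of the log level.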
@@ -291,7 +291,7 @@ struct ggml_cuda_pool_leg : public ggml_cuda_pool {
                 return;
             }
         }
-        GGML_LOG_WARN(GGML_CUDA_NAME " buffer pool full, increase MAX_CUDA_BUFFERS\n");
+        GGML_LOG_DEBUG(GGML_CUDA_NAME " buffer pool full, increase MAX_CUDA_BUFFERS\n");
         ggml_cuda_set_device(device);
         CUDA_CHECK(cudaFree(ptr));
         pool_size -= size;
@@ -980,7 +980,7 @@ static void * ggml_cuda_host_malloc(size_t size) {
     if (err != cudaSuccess) {
         // clear the error
        cudaGetLastError();
-        GGML_LOG_WARN("%s: failed to allocate %.2f MiB of pinned memory: %s\n", __func__,
+        GGML_LOG_DEBUG("%s: failed to allocate %.2f MiB of pinned memory: %s\n", __func__,
                       size / 1024.0 / 1024.0, cudaGetErrorString(err));
         return nullptr;
     }
@@ -2406,7 +2406,7 @@ static bool ggml_backend_cuda_cpy_tensor_async(ggml_backend_t backend_src, ggml_

     if (cuda_ctx_src->device != buf_ctx_src->device || cuda_ctx_dst->device != buf_ctx_dst->device) {
 #ifndef NDEBUG
-        GGML_LOG_WARN("%s: backend and buffer devices do not match\n", __func__);
+        GGML_LOG_DEBUG("%s: backend and buffer devices do not match\n", __func__);
 #endif
         return false;
     }
@@ -2524,7 +2524,7 @@ static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend,
         if (ggml_cuda_info().devices[cuda_ctx->device].cc < CC_AMPERE) {
             cuda_ctx->cuda_graph->disable_due_to_gpu_arch = true;
 #ifndef NDEBUG
-            GGML_LOG_WARN("%s: disabling CUDA graphs due to GPU architecture\n", __func__);
+            GGML_LOG_DEBUG("%s: disabling CUDA graphs due to GPU architecture\n", __func__);
 #endif
         }
     }
@@ -2575,14 +2575,14 @@ static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend,
         if (node->src[0] && node->src[0]->buffer && ggml_backend_buffer_is_cuda_split(node->src[0]->buffer)) {
             use_cuda_graph = false; // Split buffers are not supported by CUDA graph capture
 #ifndef NDEBUG
-            GGML_LOG_WARN("%s: disabling CUDA graphs due to split buffer\n", __func__);
+            GGML_LOG_DEBUG("%s: disabling CUDA graphs due to split buffer\n", __func__);
 #endif
         }

         if (node->op == GGML_OP_MUL_MAT_ID) {
             use_cuda_graph = false; // This node type is not supported by CUDA graph capture
 #ifndef NDEBUG
-            GGML_LOG_WARN("%s: disabling CUDA graphs due to mul_mat_id\n", __func__);
+            GGML_LOG_DEBUG("%s: disabling CUDA graphs due to mul_mat_id\n", __func__);
 #endif
         }

@@ -2591,7 +2591,7 @@ static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend,
             // Changes in batch size or context size can cause changes to the grid size of some kernels.
             use_cuda_graph = false;
 #ifndef NDEBUG
-            GGML_LOG_WARN("%s: disabling CUDA graphs due to batch size > 1 [%s] [%ld %ld %ld %ld]\n", __func__, node->name, node->ne[0], node->ne[1], node->ne[2], node->ne[3]);
+            GGML_LOG_DEBUG("%s: disabling CUDA graphs due to batch size > 1 [%s] [%ld %ld %ld %ld]\n", __func__, node->name, node->ne[0], node->ne[1], node->ne[2], node->ne[3]);
 #endif
         }

@@ -2603,7 +2603,7 @@ static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend,
             if (!ptr) {
                 use_cuda_graph = false;
 #ifndef NDEBUG
-                GGML_LOG_WARN("%s: disabling CUDA graphs due to unsupported copy op\n", __func__);
+                GGML_LOG_DEBUG("%s: disabling CUDA graphs due to unsupported copy op\n", __func__);
 #endif
             } else {
                 if (std::find(ggml_cuda_cpy_fn_ptrs.begin(), ggml_cuda_cpy_fn_ptrs.end(), ptr) == ggml_cuda_cpy_fn_ptrs.end()) {
@@ -2627,7 +2627,7 @@ static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend,
         if (cuda_ctx->cuda_graph->number_consecutive_updates >= 4) {
             cuda_ctx->cuda_graph->disable_due_to_too_many_updates = true;
 #ifndef NDEBUG
-            GGML_LOG_WARN("%s: disabling CUDA graphs due to too many consecutive updates\n", __func__);
+            GGML_LOG_DEBUG("%s: disabling CUDA graphs due to too many consecutive updates\n", __func__);
 #endif
         }
     }
@@ -2685,7 +2685,7 @@ static enum ggml_status ggml_backend_cuda_graph_compute(ggml_backend_t backend,
             use_cuda_graph = false;
             cuda_ctx->cuda_graph->disable_due_to_failed_graph_capture = true;
 #ifndef NDEBUG
-            GGML_LOG_WARN("%s: disabling CUDA graphs due to failed graph capture\n", __func__);
+            GGML_LOG_DEBUG("%s: disabling CUDA graphs due to failed graph capture\n", __func__);
 #endif
         } else {
             graph_evaluated_or_captured = true; // CUDA graph has been captured
@@ -2854,7 +2854,7 @@ bool ggml_backend_cuda_register_host_buffer(void * buffer, size_t size) {
         // clear the error
         cudaGetLastError();

-        GGML_LOG_WARN("%s: failed to register %.2f MiB of pinned memory: %s\n", __func__,
+        GGML_LOG_DEBUG("%s: failed to register %.2f MiB of pinned memory: %s\n", __func__,
                       size / 1024.0 / 1024.0, cudaGetErrorString(err));
         return false;
     }