diff --git a/ggml-backend.c b/ggml-backend.c
index 25d9c98d5..535426b9a 100644
--- a/ggml-backend.c
+++ b/ggml-backend.c
@@ -1006,7 +1006,7 @@ static void sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * g
 
     // pass 2: assign backends to ops from current assignments
     // start from the end and assign the same backend to previous ops
-    // expand gpu backends (ie non last prio) up and down, ignoring cpu
+    // expand gpu backends (i.e. non last prio) up and down, ignoring cpu
     // thus, cpu will never be used unless weights are on cpu, or there are no gpu ops between cpu ops
 
     // pass 2.1 expand gpu up
diff --git a/llama.h b/llama.h
index 5dc4c9357..d387ad77b 100644
--- a/llama.h
+++ b/llama.h
@@ -185,9 +185,11 @@ extern "C" {
     struct llama_model_params {
         int32_t n_gpu_layers; // number of layers to store in VRAM
         enum llama_split_mode split_mode; // how to split the model across multiple GPUs
-        // the GPU that is used for the model (LLAMA_SPLIT_NONE),
-        // for small tensors and intermediate results (LLAMA_SPLIT_ROW)
-        // ignored for LLAMA_SPLIT_LAYER
+
+        // main_gpu interpretation depends on split_mode:
+        // LLAMA_SPLIT_NONE: the GPU that is used for the entire model
+        // LLAMA_SPLIT_ROW: the GPU that is used for small tensors and intermediate results
+        // LLAMA_SPLIT_LAYER: ignored
         int32_t main_gpu;
         // proportion of the model (layers or rows) to offload to each GPU, size: LLAMA_MAX_DEVICES
         const float * tensor_split;
diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp
index e27f9f284..d9b8b106a 100644
--- a/tests/test-backend-ops.cpp
+++ b/tests/test-backend-ops.cpp
@@ -468,7 +468,7 @@ struct test_case {
             GGML_UNUSED(index);
         };
 
-        bool cmp_ok = ggml_backend_compare_graph_backend(backend1, backend2, gf, callback, &ud);
+        const bool cmp_ok = ggml_backend_compare_graph_backend(backend1, backend2, gf, callback, &ud);
 
         if (!cmp_ok) {
            printf("compare failed ");