Apply suggestions from code review
Co-authored-by: Johannes Gäßler <johannesg@5d6.de>
This commit is contained in:
parent
3cd0cbb1b5
commit
74066f8c41
3 changed files with 7 additions and 5 deletions
|
@@ -1006,7 +1006,7 @@ static void sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * g
|
|||
// pass 2: assign backends to ops from current assignments
|
||||
// start from the end and assign the same backend to previous ops
|
||||
|
||||
// expand gpu backends (ie non last prio) up and down, ignoring cpu
|
||||
// expand gpu backends (i.e. non last prio) up and down, ignoring cpu
|
||||
// thus, cpu will never be used unless weights are on cpu, or there are no gpu ops between cpu ops
|
||||
|
||||
// pass 2.1 expand gpu up
|
||||
|
|
8
llama.h
8
llama.h
|
@@ -185,9 +185,11 @@ extern "C" {
|
|||
struct llama_model_params {
|
||||
int32_t n_gpu_layers; // number of layers to store in VRAM
|
||||
enum llama_split_mode split_mode; // how to split the model across multiple GPUs
|
||||
// the GPU that is used for the model (LLAMA_SPLIT_NONE),
|
||||
// for small tensors and intermediate results (LLAMA_SPLIT_ROW)
|
||||
// ignored for LLAMA_SPLIT_LAYER
|
||||
|
||||
// main_gpu interpretation depends on split_mode:
|
||||
// LLAMA_SPLIT_NONE: the GPU that is used for the entire model
|
||||
// LLAMA_SPLIT_ROW: the GPU that is used for small tensors and intermediate results
|
||||
// LLAMA_SPLIT_LAYER: ignored
|
||||
int32_t main_gpu;
|
||||
// proportion of the model (layers or rows) to offload to each GPU, size: LLAMA_MAX_DEVICES
|
||||
const float * tensor_split;
|
||||
|
|
|
@@ -468,7 +468,7 @@ struct test_case {
|
|||
GGML_UNUSED(index);
|
||||
};
|
||||
|
||||
bool cmp_ok = ggml_backend_compare_graph_backend(backend1, backend2, gf, callback, &ud);
|
||||
const bool cmp_ok = ggml_backend_compare_graph_backend(backend1, backend2, gf, callback, &ud);
|
||||
|
||||
if (!cmp_ok) {
|
||||
printf("compare failed ");
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue