Reduce code duplication in tensor split layer assignment
This commit is contained in:
parent
a1f9c008db
commit
c71316f825
2 changed files with 37 additions and 49 deletions
84
llama.cpp
84
llama.cpp
|
@ -3402,61 +3402,17 @@ static bool llm_load_tensors(
|
||||||
model.buft_layer[i] = llama_default_buffer_type_cpu(true);
|
model.buft_layer[i] = llama_default_buffer_type_cpu(true);
|
||||||
}
|
}
|
||||||
|
|
||||||
#ifdef GGML_USE_CUBLAS
|
#if defined(GGML_USE_CUBLAS) || defined(GGML_USE_VULKAN)
|
||||||
if (split_mode == LLAMA_SPLIT_LAYER) {
|
if (split_mode == LLAMA_SPLIT_LAYER) {
|
||||||
// calculate the split points
|
// calculate the split points
|
||||||
int device_count = ggml_backend_cuda_get_device_count();
|
int device_count = llama_get_device_count();
|
||||||
bool all_zero = tensor_split == nullptr || std::all_of(tensor_split, tensor_split + device_count, [](float x) { return x == 0.0f; });
|
bool all_zero = tensor_split == nullptr || std::all_of(tensor_split, tensor_split + device_count, [](float x) { return x == 0.0f; });
|
||||||
float splits[GGML_CUDA_MAX_DEVICES];
|
std::vector<float> splits_vec(device_count);
|
||||||
|
float * splits = splits_vec.data();
|
||||||
if (all_zero) {
|
if (all_zero) {
|
||||||
// default split, by free memory
|
// default split, by free memory
|
||||||
for (int i = 0; i < device_count; ++i) {
|
for (int i = 0; i < device_count; ++i) {
|
||||||
size_t total;
|
splits[i] = llama_get_default_device_split(i);
|
||||||
size_t free;
|
|
||||||
ggml_backend_cuda_get_device_memory(i, &total, &free);
|
|
||||||
splits[i] = free;
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
std::copy(tensor_split, tensor_split + device_count, splits);
|
|
||||||
}
|
|
||||||
|
|
||||||
// sum and normalize the splits to get the split points
|
|
||||||
float split_sum = 0.0f;
|
|
||||||
for (int i = 0; i < device_count; ++i) {
|
|
||||||
split_sum += splits[i];
|
|
||||||
splits[i] = split_sum;
|
|
||||||
}
|
|
||||||
for (int i = 0; i < device_count; ++i) {
|
|
||||||
splits[i] /= split_sum;
|
|
||||||
}
|
|
||||||
|
|
||||||
// assign the repeating layers to the devices according to the splits
|
|
||||||
int act_gpu_layers = std::min(n_gpu_layers, (int)n_layer + 1);
|
|
||||||
for (int64_t i = i_gpu_start; i < n_layer; ++i) {
|
|
||||||
int layer_gpu = std::upper_bound(splits, splits + device_count, float(i - i_gpu_start)/act_gpu_layers) - splits;
|
|
||||||
model.buft_layer[i] = llama_default_buffer_type_offload(layer_gpu);
|
|
||||||
}
|
|
||||||
// assign the output layer
|
|
||||||
if (n_gpu_layers > n_layer) {
|
|
||||||
int layer_gpu = std::upper_bound(splits, splits + device_count, float(act_gpu_layers - 1)/act_gpu_layers) - splits;
|
|
||||||
model.buft_output = llama_default_buffer_type_offload(layer_gpu);
|
|
||||||
} else {
|
|
||||||
model.buft_output = llama_default_buffer_type_cpu(true);
|
|
||||||
}
|
|
||||||
} else
|
|
||||||
#elif defined(GGML_USE_VULKAN)
|
|
||||||
if (split_mode == LLAMA_SPLIT_LAYER) {
|
|
||||||
// calculate the split points
|
|
||||||
int device_count = ggml_backend_vk_get_device_count();
|
|
||||||
bool all_zero = tensor_split == nullptr || std::all_of(tensor_split, tensor_split + device_count, [](float x) { return x == 0.0f; });
|
|
||||||
float splits[GGML_VK_MAX_DEVICES];
|
|
||||||
if (all_zero) {
|
|
||||||
// default split, by free memory
|
|
||||||
for (int i = 0; i < device_count; ++i) {
|
|
||||||
size_t total;
|
|
||||||
size_t free;
|
|
||||||
ggml_backend_vk_get_device_memory(i, &total, &free);
|
|
||||||
splits[i] = free;
|
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
std::copy(tensor_split, tensor_split + device_count, splits);
|
std::copy(tensor_split, tensor_split + device_count, splits);
|
||||||
|
@ -10344,6 +10300,36 @@ size_t llama_max_devices(void) {
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Number of devices exposed by the backend this build was compiled for.
// Metal and SYCL builds report a single device, CUDA and Vulkan builds ask
// their backend for the count, and builds with none of these report 0.
// The branch order decides which backend wins when several are enabled.
size_t llama_get_device_count(void) {
    size_t n_devices = 0;
#if defined(GGML_USE_METAL)
    n_devices = 1;
#elif defined(GGML_USE_CUBLAS)
    n_devices = ggml_backend_cuda_get_device_count();
#elif defined(GGML_USE_SYCL)
    n_devices = 1;
#elif defined(GGML_USE_VULKAN)
    n_devices = ggml_backend_vk_get_device_count();
#endif
    return n_devices;
}
|
||||||
|
|
||||||
|
// Default split weight of `device`, used when no explicit tensor split is given
// (llm_load_tensors fills its per-device split table with this value).
//
// CUDA and Vulkan builds return the free memory the backend reports for the
// device; every other build returns 1.
//
// The export attribute comes from the declaration in llama.h, so it is not
// repeated on the definition (same as llama_get_device_count).
size_t llama_get_default_device_split(int device) {
#if defined(GGML_USE_CUBLAS)
    // zero-initialized so the result is 0, not garbage, if the backend leaves the outputs untouched
    size_t total = 0;
    size_t free  = 0;
    ggml_backend_cuda_get_device_memory(device, &total, &free);
    return free;
#elif defined(GGML_USE_VULKAN)
    // zero-initialized so the result is 0, not garbage, if the backend leaves the outputs untouched
    size_t total = 0;
    size_t free  = 0;
    ggml_backend_vk_get_device_memory(device, &total, &free);
    return free;
#else
    (void) device; // not needed without a multi-device backend; silences -Wunused-parameter
    return 1;
#endif
}
|
||||||
|
|
||||||
// Returns the value of llama_mmap::SUPPORTED.
// NOTE(review): presumably this tells callers whether memory-mapped file loading is available in this build - confirm against the llama_mmap definition.
bool llama_supports_mmap(void) {
|
bool llama_supports_mmap(void) {
|
||||||
return llama_mmap::SUPPORTED;
|
return llama_mmap::SUPPORTED;
|
||||||
}
|
}
|
||||||
|
|
2
llama.h
2
llama.h
|
@ -325,6 +325,8 @@ extern "C" {
|
||||||
LLAMA_API int64_t llama_time_us(void);
|
LLAMA_API int64_t llama_time_us(void);
|
||||||
|
|
||||||
LLAMA_API size_t llama_max_devices(void);
|
LLAMA_API size_t llama_max_devices(void);
|
||||||
|
// Number of devices available to the compiled backend: 1 for Metal and SYCL builds, the backend-reported count for CUDA and Vulkan builds, 0 otherwise.
LLAMA_API size_t llama_get_device_count(void);
|
||||||
|
// Default split weight for `device`: the free memory reported for that device on CUDA and Vulkan builds, 1 otherwise.
LLAMA_API size_t llama_get_default_device_split(int device);
|
||||||
|
|
||||||
LLAMA_API bool llama_supports_mmap (void);
|
LLAMA_API bool llama_supports_mmap (void);
|
||||||
LLAMA_API bool llama_supports_mlock (void);
|
LLAMA_API bool llama_supports_mlock (void);
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue