Reduce code duplication in tensor split layer assignment

2024-02-04 21:57:13 +01:00 · 2024-02-04 21:57:13 +01:00 · c71316f825
commit c71316f825
parent a1f9c008db
2 changed files with 37 additions and 49 deletions
--- a/llama.cpp
+++ b/llama.cpp
@ -3402,61 +3402,17 @@ static bool llm_load_tensors(
        model.buft_layer[i] = llama_default_buffer_type_cpu(true);
    }
-#ifdef GGML_USE_CUBLAS
+#if defined(GGML_USE_CUBLAS) || defined(GGML_USE_VULKAN)
    if (split_mode == LLAMA_SPLIT_LAYER) {
        // calculate the split points
-        int device_count = ggml_backend_cuda_get_device_count();
+        int device_count = llama_get_device_count();
        bool all_zero = tensor_split == nullptr || std::all_of(tensor_split, tensor_split + device_count, [](float x) { return x == 0.0f; });
-        float splits[GGML_CUDA_MAX_DEVICES];
+        std::vector<float> splits_vec(device_count);
        float * splits = splits_vec.data();
        if (all_zero) {
            // default split, by free memory
            for (int i = 0; i < device_count; ++i) {
-                size_t total;
+                splits[i] = llama_get_default_device_split(i);
                size_t free;
                ggml_backend_cuda_get_device_memory(i, &total, &free);
                splits[i] = free;
            }
        } else {
            std::copy(tensor_split, tensor_split + device_count, splits);
        }
        // sum and normalize the splits to get the split points
        float split_sum = 0.0f;
        for (int i = 0; i < device_count; ++i) {
            split_sum += splits[i];
            splits[i] = split_sum;
        }
        for (int i = 0; i < device_count; ++i) {
            splits[i] /= split_sum;
        }
        // assign the repeating layers to the devices according to the splits
        int act_gpu_layers = std::min(n_gpu_layers, (int)n_layer + 1);
        for (int64_t i = i_gpu_start; i < n_layer; ++i) {
            int layer_gpu = std::upper_bound(splits, splits + device_count, float(i - i_gpu_start)/act_gpu_layers) - splits;
            model.buft_layer[i] = llama_default_buffer_type_offload(layer_gpu);
        }
        // assign the output layer
        if (n_gpu_layers > n_layer) {
            int layer_gpu = std::upper_bound(splits, splits + device_count, float(act_gpu_layers - 1)/act_gpu_layers) - splits;
            model.buft_output = llama_default_buffer_type_offload(layer_gpu);
        } else {
            model.buft_output = llama_default_buffer_type_cpu(true);
        }
    } else
 #elif defined(GGML_USE_VULKAN)
    if (split_mode == LLAMA_SPLIT_LAYER) {
        // calculate the split points
        int device_count = ggml_backend_vk_get_device_count();
        bool all_zero = tensor_split == nullptr || std::all_of(tensor_split, tensor_split + device_count, [](float x) { return x == 0.0f; });
        float splits[GGML_VK_MAX_DEVICES];
        if (all_zero) {
            // default split, by free memory
            for (int i = 0; i < device_count; ++i) {
                size_t total;
                size_t free;
                ggml_backend_vk_get_device_memory(i, &total, &free);
                splits[i] = free;
            }
        } else {
            std::copy(tensor_split, tensor_split + device_count, splits);
@ -10344,6 +10300,36 @@ size_t llama_max_devices(void) {
 #endif
 }
 // Number of devices reported for the backend chosen at compile time.
 // Metal and SYCL builds report a single device, CUDA and Vulkan builds ask
 // their backend, and a build with none of these macros defined reports zero.
 // The branch order matters when more than one macro is defined.
 size_t llama_get_device_count(void) {
    size_t n_devices = 0;
 #if defined(GGML_USE_METAL)
    n_devices = 1;
 #elif defined(GGML_USE_CUBLAS)
    n_devices = ggml_backend_cuda_get_device_count();
 #elif defined(GGML_USE_SYCL)
    n_devices = 1;
 #elif defined(GGML_USE_VULKAN)
    n_devices = ggml_backend_vk_get_device_count();
 #endif
    return n_devices;
 }
 // Default split weight for `device`.
 // CUDA and Vulkan builds return the free-memory figure the backend reports
 // for that device; every other build returns 1.
 LLAMA_API size_t llama_get_default_device_split(int device) {
    size_t weight = 1; // value used when neither CUDA nor Vulkan is compiled in
 #if defined(GGML_USE_CUBLAS)
    size_t mem_total; // required out-parameter, not used here
    size_t mem_free;
    ggml_backend_cuda_get_device_memory(device, &mem_total, &mem_free);
    weight = mem_free;
 #elif defined(GGML_USE_VULKAN)
    size_t mem_total; // required out-parameter, not used here
    size_t mem_free;
    ggml_backend_vk_get_device_memory(device, &mem_total, &mem_free);
    weight = mem_free;
 #endif
    return weight;
 }
 // Returns the value of llama_mmap::SUPPORTED.
 bool llama_supports_mmap(void) {
    const bool supported = llama_mmap::SUPPORTED;
    return supported;
 }
--- a/llama.h
+++ b/llama.h
@ -325,6 +325,8 @@ extern "C" {
    LLAMA_API int64_t llama_time_us(void);
    LLAMA_API size_t llama_max_devices(void);
    LLAMA_API size_t llama_get_device_count(void);
    LLAMA_API size_t llama_get_default_device_split(int device);
    LLAMA_API bool llama_supports_mmap       (void);
    LLAMA_API bool llama_supports_mlock      (void);