feat: Add host buffer type for Ascend NPU(CANN backend)

2024-09-10 10:24:43 +08:00 · 2024-09-10 10:24:43 +08:00 · 490da45f54
commit 490da45f54
parent 436787f170
3 changed files with 78 additions and 0 deletions
--- a/ggml/include/ggml-cann.h
+++ b/ggml/include/ggml-cann.h
@ -80,6 +80,13 @@ ggml_backend_cann_buffer_type(int32_t device);
 */
 GGML_API GGML_CALL int32_t ggml_backend_cann_get_device_count(void);
 /**
 * @brief Retrieves the pinned host buffer type of the CANN backend.
 *
 * The buffer type is intended for use with the CPU backend, so that copies
 * between CPU and NPU are faster.
 *
 * @return A pointer to the host buffer type interface.
 */
 GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_cann_host_buffer_type(void);
 /**
 * @brief Retrieves the description of a specific CANN device.
 *
--- a/ggml/src/ggml-cann.cpp
+++ b/ggml/src/ggml-cann.cpp
@ -1220,6 +1220,73 @@ ggml_backend_cann_buffer_type(int32_t device) {
    return &ggml_backend_cann_buffer_types[device];
 }
 // host buffer type
 /**
 * @brief Returns the name of the CANN host buffer type.
 *
 * @param buft Buffer type handle (unused).
 * @return The constant string "CANN_Host".
 */
 GGML_CALL static const char * ggml_backend_cann_host_buffer_type_name(ggml_backend_buffer_type_t buft) {
    GGML_UNUSED(buft);  // the name is the same for every buffer type handle
    return "CANN_Host";
 }
 /**
 * @brief Returns the name of a CANN host buffer.
 *
 * @param buffer Buffer handle (unused).
 * @return The constant string "CANN_Host".
 */
 GGML_CALL static const char * ggml_backend_cann_host_buffer_name(ggml_backend_buffer_t buffer) {
    GGML_UNUSED(buffer);  // the name is the same for every buffer
    return "CANN_Host";
 }
 /**
 * @brief Frees the host memory attached to a CANN host buffer.
 *
 * Passes buffer->context to aclrtFreeHost and hands the result to ACL_CHECK.
 *
 * @param buffer Buffer whose context pointer is released.
 */
 GGML_CALL static void ggml_backend_cann_host_buffer_free_buffer(ggml_backend_buffer_t buffer) {
    // NOTE(review): presumably buffer->context is the pointer that was handed to
    // ggml_backend_cpu_buffer_from_ptr when the buffer was created - confirm.
    ACL_CHECK(aclrtFreeHost(buffer->context)); 
 }
 static void * ggml_cann_host_malloc(size_t size) {
    if (getenv("GGML_CANN_NO_PINNED") != nullptr) {
        return nullptr;
    }
    void * ptr = nullptr;
    aclError err = aclrtMallocHost((void **) &ptr, size); 
    if (err != ACL_SUCCESS) { 
        GGML_CANN_LOG_WARN("%s: failed to allocate %.2f MiB of pinned memory: %s\n", __func__,
                           size / 1024.0 / 1024.0, aclGetRecentErrMsg());
        return nullptr;
    }
    return ptr;
 }
 /**
 * @brief Allocates a buffer of the CANN host buffer type.
 *
 * Host memory is requested from ggml_cann_host_malloc. On success the memory
 * is wrapped with ggml_backend_cpu_buffer_from_ptr and the resulting buffer's
 * buffer type, name callback and free callback are replaced with the CANN
 * host variants. When no host memory is returned, a buffer of the CPU buffer
 * type is allocated instead.
 *
 * @param buft Buffer type recorded on the returned buffer (pinned path only).
 * @param size Number of bytes to allocate.
 * @return The newly created buffer.
 */
 GGML_CALL static ggml_backend_buffer_t ggml_backend_cann_host_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
    void * host_ptr = ggml_cann_host_malloc(size);
    if (host_ptr != nullptr) {
        ggml_backend_buffer_t host_buffer = ggml_backend_cpu_buffer_from_ptr(host_ptr, size);
        host_buffer->buft              = buft;
        host_buffer->iface.get_name    = ggml_backend_cann_host_buffer_name;
        host_buffer->iface.free_buffer = ggml_backend_cann_host_buffer_free_buffer;
        return host_buffer;
    }
    // no host memory was obtained: fall back to a cpu buffer
    return ggml_backend_buft_alloc_buffer(ggml_backend_cpu_buffer_type(), size);
 }
 /**
 * @brief Returns the CANN host buffer type.
 *
 * The buffer type is a function-local static object. Its name and allocation
 * callbacks are the CANN host functions; alignment, allocation size and
 * is_host callbacks are taken from the CPU buffer type.
 *
 * @return A pointer to the static host buffer type.
 */
 GGML_CALL ggml_backend_buffer_type_t ggml_backend_cann_host_buffer_type() {
    static struct ggml_backend_buffer_type cann_host_buft = {
        /* .iface = */ {
            /* .get_name       = */ ggml_backend_cann_host_buffer_type_name,
            /* .alloc_buffer   = */ ggml_backend_cann_host_buffer_type_alloc_buffer,
            /* .get_alignment  = */ ggml_backend_cpu_buffer_type()->iface.get_alignment,
            /* .get_max_size   = */ nullptr,  // defaults to SIZE_MAX
            /* .get_alloc_size = */ ggml_backend_cpu_buffer_type()->iface.get_alloc_size,
            /* .is_host        = */ ggml_backend_cpu_buffer_type()->iface.is_host,
        },
        /* .context = */ nullptr,
    };
    ggml_backend_buffer_type_t host_buft = &cann_host_buft;
    return host_buft;
 }
 /**
 * @brief Computes the forward operation for a given tensor using CANN
 * operations.
--- a/src/llama.cpp
+++ b/src/llama.cpp
@ -2088,6 +2088,10 @@ static ggml_backend_buffer_type_t llama_default_buffer_type_cpu(bool host_buffer
    if (host_buffer) {
        buft = ggml_backend_sycl_host_buffer_type();
    }
 #elif defined(GGML_USE_CANN)
    if (host_buffer) {
        buft = ggml_backend_cann_host_buffer_type();
    }
 #elif defined(GGML_USE_CPU_HBM)
    buft = ggml_backend_cpu_hbm_buffer_type();
 #elif defined(GGML_USE_VULKAN)