support powerinfer without GPU

2023-12-12 15:40:07 +08:00 · 2023-12-12 15:40:07 +08:00 · 182316ecfd
commit 182316ecfd
parent e4b798a735
1 changed files with 15 additions and 2 deletions
--- a/llama.cpp
+++ b/llama.cpp
@ -2754,8 +2754,11 @@ struct llama_augmentation_model_loader {
        // 1. gpu_idx;
        // 2. gpu_bucket;
        // 3. transformed ffn_down;
-        const int64_t ggml_aux_tensor_size = 4 * (100 * 100 + 5120*40*4 * ggml_tensor_overhead() + (int64_t)13824*5120*40*4);
-        printf("%ld\n", ggml_aux_tensor_size);
+        // const int64_t ggml_aux_tensor_size = 4 * (100 * 100 + 5120*40*4 * ggml_tensor_overhead() + (int64_t)13824*5120*40*4);
+        int model_layer = model->layers.size();
+        int ffn_dim = model->layers[0].ffn_up->ne[1];
+        const int64_t ggml_aux_tensor_size = 4 * (100 * 100 + model_layer*ffn_dim*sizeof(float) * ggml_tensor_overhead() );
+        printf("augmentation buffer: %ld\n", ggml_aux_tensor_size);
        struct ggml_init_params params = {
            /*.mem_size   =*/ ggml_aux_tensor_size,
            /*.mem_buffer =*/ nullptr,
@ -3966,12 +3969,14 @@ static struct ggml_tensor * llm_build_ffn_sparse(
    third = cur;
    struct ggml_tensor * tmp = ggml_mul_mat_idx(ctx, up, cur, idx, gpu_index);
    cb(tmp, "ffn_up_sparse", il);
+#ifdef GGML_USE_CUBLAS
    struct ggml_tensor * tmp2 = ggml_mul_mat_special(ctx, up_gpu, cur, idx, gpu_bucket, up);
    if (tmp2 != NULL) {
        ggml_cuda_assign_buffers_no_alloc(tmp2);
        cb(tmp2, "ffn_up_sparse_gpu", il);
    }
    tmp = ggml_add(ctx, tmp, tmp2);
+#endif


    if (up_b) {
@ -3985,12 +3990,14 @@ static struct ggml_tensor * llm_build_ffn_sparse(
        third = cur;
        cur = ggml_mul_mat_idx(ctx, gate, cur, idx, gpu_index);
        cb(cur, "ffn_gate", il);
+#ifdef GGML_USE_CUBLAS
        tmp2 = ggml_mul_mat_special(ctx, gate_gpu, third, idx, gpu_bucket, gate);
        if (tmp2 != NULL) {
            ggml_cuda_assign_buffers_no_alloc(tmp2);
            cb(tmp2, "ffn_up_sparse_gpu", il);
        }
        cur = ggml_add(ctx, cur, tmp2);
+#endif

        if (gate_b) {
            cur = ggml_add(ctx, cur, gate_b);
@ -4017,14 +4024,20 @@ static struct ggml_tensor * llm_build_ffn_sparse(
    }

    third = cur;
+#ifdef GGML_USE_CUBLAS
    cur = ggml_axpy(ctx, down_gpu, cur, idx, gpu_bucket);
    if (cur != NULL) {
        ggml_cuda_assign_buffers_no_alloc(cur);
        cb(cur, "ffn_down", il);
    }
+#endif
    tmp = ggml_axpy(ctx, down_t, third, idx, gpu_index);
    cb(tmp, "ffn_down_gpu", il);
+#ifdef GGML_USE_CUBLAS
    cur = ggml_add(ctx, cur, tmp);
+#else
+    cur = tmp;
+#endif

    if (down_b) {
        cur = ggml_add(ctx, cur, down_b);