From 2c6ac0693669c8adca18caad2ba675b76a2e7c39 Mon Sep 17 00:00:00 2001
From: Concedo <39025047+LostRuins@users.noreply.github.com>
Date: Wed, 17 May 2023 17:13:01 +0800
Subject: [PATCH] gpu offload not working for other arch. debug in future.

---
 gpttype_adapter.cpp   |  4 ++--
 otherarch/gpt2_v2.cpp | 51 +++++++++++++++++++++++++++++++++++++++++--
 otherarch/gptj_v2.cpp | 44 ++++++++++++++++++++++++++++++++++++-
 3 files changed, 94 insertions(+), 5 deletions(-)

diff --git a/gpttype_adapter.cpp b/gpttype_adapter.cpp
index 96f50697d..d9423ecc4 100644
--- a/gpttype_adapter.cpp
+++ b/gpttype_adapter.cpp
@@ -334,7 +334,7 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in
         //newer format has bit unshuffling
         SetQuantsUnshuffled(file_format == FileFormat::GPT2_3);
 
-        ModelLoadResult res = gpt2_model_load(params.model, gpt2_ctx_v2, vocab, file_format);
+        ModelLoadResult res = gpt2_model_load(params.model, gpt2_ctx_v2, vocab, file_format, inputs.gpulayers);
         if(res==ModelLoadResult::FAIL)
         {
             fprintf(stderr, "%s: failed to load model from '%s'\n", __func__, params.model.c_str());
@@ -421,7 +421,7 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in
         //newer format has bit unshuffling
         SetQuantsUnshuffled(file_format == FileFormat::GPTJ_4);
 
-        ModelLoadResult loadresult = gptj_model_load(params.model, gptj_ctx_v2, vocab);
+        ModelLoadResult loadresult = gptj_model_load(params.model, gptj_ctx_v2, vocab, inputs.gpulayers);
         if (loadresult == ModelLoadResult::FAIL)
         {
             fprintf(stderr, "%s: failed to load model from '%s'\n", __func__, params.model.c_str());
diff --git a/otherarch/gpt2_v2.cpp b/otherarch/gpt2_v2.cpp
index 945722b48..7e957e9e2 100644
--- a/otherarch/gpt2_v2.cpp
+++ b/otherarch/gpt2_v2.cpp
@@ -15,10 +15,12 @@
 
 #include "model_adapter.h"
 
-
+#if defined(GGML_USE_CLBLAST)
+#include "ggml-opencl.h"
+#endif
 
 // load the model's weights from a file
-ModelLoadResult gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab & vocab, FileFormat file_format) {
+ModelLoadResult gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab & vocab, FileFormat file_format, int gpulayers) {
     printf("%s: loading model from '%s'\n", __func__, fname.c_str());
 
     auto fin = std::ifstream(fname, std::ios::binary);
@@ -322,6 +324,51 @@ ModelLoadResult gpt2_model_load(const std::string & fname, gpt2_model & model, g
 
     fin.close();
 
+
+//     //gpu offload for gpt2
+// #if defined(GGML_USE_CLBLAST)
+//     if(gpulayers>0)
+//     {
+//         const auto & hparams = model.hparams;
+//         const int n_gpu = std::min(gpulayers, int(hparams.n_layer));
+//         if(GetQuantsUnshuffled())
+//         {
+//             SetGPULayers(n_gpu);
+
+//             fprintf(stderr, "%s: [opencl] offloading %d layers to GPU\n", __func__, n_gpu);
+
+//             size_t vram_total = 0;
+
+//             for (int i = 0; i < n_gpu; ++i) {
+//                 const auto & layer = model.layers[i];
+
+//                 ggml_cl_transform_tensor(layer.ln_1_g); vram_total += ggml_nbytes(layer.ln_1_g);
+//                 ggml_cl_transform_tensor(layer.ln_1_b); vram_total += ggml_nbytes(layer.ln_1_b);
+//                 ggml_cl_transform_tensor(layer.ln_2_g); vram_total += ggml_nbytes(layer.ln_2_g);
+//                 ggml_cl_transform_tensor(layer.ln_2_b); vram_total += ggml_nbytes(layer.ln_2_b);
+//                 ggml_cl_transform_tensor(layer.c_attn_attn_w); vram_total += ggml_nbytes(layer.c_attn_attn_w);
+//                 ggml_cl_transform_tensor(layer.c_attn_attn_b); vram_total += ggml_nbytes(layer.c_attn_attn_b);
+//                 ggml_cl_transform_tensor(layer.c_attn_proj_w); vram_total += ggml_nbytes(layer.c_attn_proj_w);
+//                 ggml_cl_transform_tensor(layer.c_attn_proj_b); vram_total += ggml_nbytes(layer.c_attn_proj_b);
+//                 ggml_cl_transform_tensor(layer.c_mlp_fc_w); vram_total += ggml_nbytes(layer.c_mlp_fc_w);
+//                 ggml_cl_transform_tensor(layer.c_mlp_fc_b); vram_total += ggml_nbytes(layer.c_mlp_fc_b);
+//                 ggml_cl_transform_tensor(layer.c_mlp_proj_w); vram_total += ggml_nbytes(layer.c_mlp_proj_w);
+//                 ggml_cl_transform_tensor(layer.c_mlp_proj_b); vram_total += ggml_nbytes(layer.c_mlp_proj_b);
+//             }
+
+//             fprintf(stderr, "%s: [opencl] total VRAM used: %zu MB\n", __func__, vram_total / 1024 / 1024);
+//         }
+//         else
+//         {
+//             if(n_gpu>0)
+//             {
+//                 printf("\n[WARNING: Old format does not support GPU offloading! It will be deactivated!]\n");
+//             }
+//         }
+//     }
+// #endif
+
+
     return ModelLoadResult::SUCCESS;
 }
 
diff --git a/otherarch/gptj_v2.cpp b/otherarch/gptj_v2.cpp
index 5e1c22567..4b207e0bd 100644
--- a/otherarch/gptj_v2.cpp
+++ b/otherarch/gptj_v2.cpp
@@ -18,7 +18,7 @@
 
 
 // load the model's weights from a file
-ModelLoadResult gptj_model_load(const std::string & fname, gptj_model & model, gpt_vocab & vocab) {
+ModelLoadResult gptj_model_load(const std::string & fname, gptj_model & model, gpt_vocab & vocab, int gpulayers) {
    printf("%s: loading model from '%s' - please wait ...\n", __func__, fname.c_str());
 
    auto fin = std::ifstream(fname, std::ios::binary);
@@ -328,6 +328,48 @@ ModelLoadResult gptj_model_load(const std::string & fname, gptj_model & model, g
 
    fin.close();
 
+//     //gpu offload for gptj
+// #if defined(GGML_USE_CLBLAST)
+//     if(gpulayers>0)
+//     {
+//         const auto & hparams = model.hparams;
+//         const int n_gpu = std::min(gpulayers, int(hparams.n_layer));
+//         if(GetQuantsUnshuffled())
+//         {
+//             SetGPULayers(n_gpu);
+
+//             fprintf(stderr, "%s: [opencl] offloading %d layers to GPU\n", __func__, n_gpu);
+
+//             size_t vram_total = 0;
+
+//             for (int i = 0; i < n_gpu; ++i) {
+//                 const auto & layer = model.layers[i];
+
+//                 ggml_cl_transform_tensor(layer.ln_1_g); vram_total += ggml_nbytes(layer.ln_1_g);
+//                 ggml_cl_transform_tensor(layer.ln_1_b); vram_total += ggml_nbytes(layer.ln_1_b);
+//                 ggml_cl_transform_tensor(layer.c_attn_q_proj_w); vram_total += ggml_nbytes(layer.c_attn_q_proj_w);
+//                 ggml_cl_transform_tensor(layer.c_attn_k_proj_w); vram_total += ggml_nbytes(layer.c_attn_k_proj_w);
+//                 ggml_cl_transform_tensor(layer.c_attn_v_proj_w); vram_total += ggml_nbytes(layer.c_attn_v_proj_w);
+//                 ggml_cl_transform_tensor(layer.c_attn_proj_w); vram_total += ggml_nbytes(layer.c_attn_proj_w);
+//                 ggml_cl_transform_tensor(layer.c_mlp_fc_w); vram_total += ggml_nbytes(layer.c_mlp_fc_w);
+//                 ggml_cl_transform_tensor(layer.c_mlp_fc_b); vram_total += ggml_nbytes(layer.c_mlp_fc_b);
+//                 ggml_cl_transform_tensor(layer.c_mlp_proj_w); vram_total += ggml_nbytes(layer.c_mlp_proj_w);
+//                 ggml_cl_transform_tensor(layer.c_mlp_proj_b); vram_total += ggml_nbytes(layer.c_mlp_proj_b);
+//             }
+
+//             fprintf(stderr, "%s: [opencl] total VRAM used: %zu MB\n", __func__, vram_total / 1024 / 1024);
+//         }
+//         else
+//         {
+//             if(n_gpu>0)
+//             {
+//                 printf("\n[WARNING: Old format does not support GPU offloading! It will be deactivated!]\n");
+//             }
+//         }
+//     }
+// #endif
+
+
    return ModelLoadResult::SUCCESS;
 }
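
Note: the offload path above is committed commented-out because, per the subject line, it does not yet work for these architectures. For readability, here is a minimal sketch of what the disabled GPT-2 block would do once re-enabled, condensed to the main weight matrices. It uses only calls already referenced in this patch (ggml_cl_transform_tensor, ggml_nbytes, SetGPULayers, GetQuantsUnshuffled) and assumes the includes already present in otherarch/gpt2_v2.cpp; the helper name and its factoring into a separate function are illustrative assumptions, not part of the commit.

    // Sketch only, not part of this commit: CLBlast offload pattern for the
    // first n_gpu GPT-2 layers, mirroring the commented-out block above.
    // Assumes GGML_USE_CLBLAST builds and gpt2_v2.cpp's existing headers.
    #if defined(GGML_USE_CLBLAST)
    static size_t gpt2_offload_sketch(gpt2_model & model, int gpulayers) {
        const int n_gpu = std::min(gpulayers, int(model.hparams.n_layer));
        if (n_gpu <= 0 || !GetQuantsUnshuffled()) {
            return 0; // old shuffled-quant formats cannot be offloaded
        }
        SetGPULayers(n_gpu);
        size_t vram_total = 0;
        for (int i = 0; i < n_gpu; ++i) {
            auto & layer = model.layers[i];
            // move each per-layer weight to the OpenCL device and tally its size
            ggml_cl_transform_tensor(layer.c_attn_attn_w); vram_total += ggml_nbytes(layer.c_attn_attn_w);
            ggml_cl_transform_tensor(layer.c_attn_proj_w); vram_total += ggml_nbytes(layer.c_attn_proj_w);
            ggml_cl_transform_tensor(layer.c_mlp_fc_w);    vram_total += ggml_nbytes(layer.c_mlp_fc_w);
            ggml_cl_transform_tensor(layer.c_mlp_proj_w);  vram_total += ggml_nbytes(layer.c_mlp_proj_w);
        }
        fprintf(stderr, "%s: [opencl] offloaded %d layers, ~%zu MB VRAM\n",
                __func__, n_gpu, vram_total / 1024 / 1024);
        return vram_total;
    }
    #endif

In the commit itself the equivalent logic sits inline at the end of gpt2_model_load and gptj_model_load and also covers the layer-norm and bias tensors; it stays disabled until offload works for these architectures.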