gpu offload not working for other arch. debug in future.

Concedo 2023-05-17 17:13:01 +08:00
parent 57230b5196
commit 2c6ac06936
3 changed files with 94 additions and 5 deletions

View file

@@ -334,7 +334,7 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in
 //newer format has bit unshuffling
 SetQuantsUnshuffled(file_format == FileFormat::GPT2_3);
-ModelLoadResult res = gpt2_model_load(params.model, gpt2_ctx_v2, vocab, file_format);
+ModelLoadResult res = gpt2_model_load(params.model, gpt2_ctx_v2, vocab, file_format, inputs.gpulayers);
 if(res==ModelLoadResult::FAIL)
 {
 fprintf(stderr, "%s: failed to load model from '%s'\n", __func__, params.model.c_str());
@@ -421,7 +421,7 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in
 //newer format has bit unshuffling
 SetQuantsUnshuffled(file_format == FileFormat::GPTJ_4);
-ModelLoadResult loadresult = gptj_model_load(params.model, gptj_ctx_v2, vocab);
+ModelLoadResult loadresult = gptj_model_load(params.model, gptj_ctx_v2, vocab, inputs.gpulayers);
 if (loadresult == ModelLoadResult::FAIL)
 {
 fprintf(stderr, "%s: failed to load model from '%s'\n", __func__, params.model.c_str());
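
Note on the two hunks above: both loaders consume the new inputs.gpulayers value the same way, clamping it to the model's depth before offloading. A minimal standalone sketch of that clamp (the helper name and main() harness are mine; the std::min expression matches the hunks further down):

#include <algorithm>
#include <cstdio>

// The requested GPU layer count is capped at the model's layer count,
// so over-asking simply offloads the whole model.
static int clamp_gpu_layers(int gpulayers, int n_layer) {
    return std::min(gpulayers, n_layer);
}

int main() {
    printf("%d\n", clamp_gpu_layers(100, 32)); // 32: capped at model depth
    printf("%d\n", clamp_gpu_layers(10, 32));  // 10: partial offload
    return 0;
}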

View file

@@ -15,10 +15,12 @@
 #include "model_adapter.h"
+#if defined(GGML_USE_CLBLAST)
+#include "ggml-opencl.h"
+#endif
 // load the model's weights from a file
-ModelLoadResult gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab & vocab, FileFormat file_format) {
+ModelLoadResult gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab & vocab, FileFormat file_format, int gpulayers) {
 printf("%s: loading model from '%s'\n", __func__, fname.c_str());
 auto fin = std::ifstream(fname, std::ios::binary);
@@ -322,6 +324,51 @@ ModelLoadResult gpt2_model_load(const std::string & fname, gpt2_model & model, g
 fin.close();
+// //gpu offload for gpt2
+// #if defined(GGML_USE_CLBLAST)
+// if(gpulayers>0)
+// {
+// const auto & hparams = model.hparams;
+// const int n_gpu = std::min(gpulayers, int(hparams.n_layer));
+// if(GetQuantsUnshuffled())
+// {
+// SetGPULayers(n_gpu);
+// fprintf(stderr, "%s: [opencl] offloading %d layers to GPU\n", __func__, n_gpu);
+// size_t vram_total = 0;
+// for (int i = 0; i < n_gpu; ++i) {
+// const auto & layer = model.layers[i];
+// ggml_cl_transform_tensor(layer.ln_1_g); vram_total += ggml_nbytes(layer.ln_1_g);
+// ggml_cl_transform_tensor(layer.ln_1_b); vram_total += ggml_nbytes(layer.ln_1_b);
+// ggml_cl_transform_tensor(layer.ln_2_g); vram_total += ggml_nbytes(layer.ln_2_g);
+// ggml_cl_transform_tensor(layer.ln_2_b); vram_total += ggml_nbytes(layer.ln_2_b);
+// ggml_cl_transform_tensor(layer.c_attn_attn_w); vram_total += ggml_nbytes(layer.c_attn_attn_w);
+// ggml_cl_transform_tensor(layer.c_attn_attn_b); vram_total += ggml_nbytes(layer.c_attn_attn_b);
+// ggml_cl_transform_tensor(layer.c_attn_proj_w); vram_total += ggml_nbytes(layer.c_attn_proj_w);
+// ggml_cl_transform_tensor(layer.c_attn_proj_b); vram_total += ggml_nbytes(layer.c_attn_proj_b);
+// ggml_cl_transform_tensor(layer.c_mlp_fc_w); vram_total += ggml_nbytes(layer.c_mlp_fc_w);
+// ggml_cl_transform_tensor(layer.c_mlp_fc_b); vram_total += ggml_nbytes(layer.c_mlp_fc_b);
+// ggml_cl_transform_tensor(layer.c_mlp_proj_w); vram_total += ggml_nbytes(layer.c_mlp_proj_w);
+// ggml_cl_transform_tensor(layer.c_mlp_proj_b); vram_total += ggml_nbytes(layer.c_mlp_proj_b);
+// }
+// fprintf(stderr, "%s: [opencl] total VRAM used: %zu MB\n", __func__, vram_total / 1024 / 1024);
+// }
+// else
+// {
+// if(n_gpu>0)
+// {
+// printf("\n[WARNING: Old format does not support GPU offloading! It will be deactivated!]\n");
+// }
+// }
+// }
+// #endif
 return ModelLoadResult::SUCCESS;
 }
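
When the block above is re-enabled, the control flow it implements is: offload only under CLBlast builds, only for the newer unshuffled quant format, otherwise warn and fall back to CPU. A compilable sketch of that gate (GetQuantsUnshuffled and SetGPULayers are the names used in the diff; the stubs and sample values are mine):

#include <algorithm>
#include <cstdio>

static bool GetQuantsUnshuffled() { return true; }  // stub: newer format detected
static void SetGPULayers(int n) { printf("gpu layers set to %d\n", n); }  // stub

int main() {
    int gpulayers = 8;   // stand-in for inputs.gpulayers
    int n_layer   = 32;  // stand-in for model.hparams.n_layer
    if (gpulayers > 0) {
        const int n_gpu = std::min(gpulayers, n_layer);
        if (GetQuantsUnshuffled()) {
            // then each layer's weights would be moved via ggml_cl_transform_tensor
            SetGPULayers(n_gpu);
        } else {
            printf("\n[WARNING: Old format does not support GPU offloading! It will be deactivated!]\n");
        }
    }
    return 0;
}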

View file

@@ -18,7 +18,7 @@
 // load the model's weights from a file
-ModelLoadResult gptj_model_load(const std::string & fname, gptj_model & model, gpt_vocab & vocab) {
+ModelLoadResult gptj_model_load(const std::string & fname, gptj_model & model, gpt_vocab & vocab, int gpulayers) {
 printf("%s: loading model from '%s' - please wait ...\n", __func__, fname.c_str());
 auto fin = std::ifstream(fname, std::ios::binary);
@@ -328,6 +328,48 @@ ModelLoadResult gptj_model_load(const std::string & fname, gptj_model & model, g
 fin.close();
+// //gpu offload for gptj
+// #if defined(GGML_USE_CLBLAST)
+// if(gpulayers>0)
+// {
+// const auto & hparams = model.hparams;
+// const int n_gpu = std::min(gpulayers, int(hparams.n_layer));
+// if(GetQuantsUnshuffled())
+// {
+// SetGPULayers(n_gpu);
+// fprintf(stderr, "%s: [opencl] offloading %d layers to GPU\n", __func__, n_gpu);
+// size_t vram_total = 0;
+// for (int i = 0; i < n_gpu; ++i) {
+// const auto & layer = model.layers[i];
+// ggml_cl_transform_tensor(layer.ln_1_g); vram_total += ggml_nbytes(layer.ln_1_g);
+// ggml_cl_transform_tensor(layer.ln_1_b); vram_total += ggml_nbytes(layer.ln_1_b);
+// ggml_cl_transform_tensor(layer.c_attn_q_proj_w); vram_total += ggml_nbytes(layer.c_attn_q_proj_w);
+// ggml_cl_transform_tensor(layer.c_attn_k_proj_w); vram_total += ggml_nbytes(layer.c_attn_k_proj_w);
+// ggml_cl_transform_tensor(layer.c_attn_v_proj_w); vram_total += ggml_nbytes(layer.c_attn_v_proj_w);
+// ggml_cl_transform_tensor(layer.c_attn_proj_w); vram_total += ggml_nbytes(layer.c_attn_proj_w);
+// ggml_cl_transform_tensor(layer.c_mlp_fc_w); vram_total += ggml_nbytes(layer.c_mlp_fc_w);
+// ggml_cl_transform_tensor(layer.c_mlp_fc_b); vram_total += ggml_nbytes(layer.c_mlp_fc_b);
+// ggml_cl_transform_tensor(layer.c_mlp_proj_w); vram_total += ggml_nbytes(layer.c_mlp_proj_w);
+// ggml_cl_transform_tensor(layer.c_mlp_proj_b); vram_total += ggml_nbytes(layer.c_mlp_proj_b);
+// }
+// fprintf(stderr, "%s: [opencl] total VRAM used: %zu MB\n", __func__, vram_total / 1024 / 1024);
+// }
+// else
+// {
+// if(n_gpu>0)
+// {
+// printf("\n[WARNING: Old format does not support GPU offloading! It will be deactivated!]\n");
+// }
+// }
+// }
+// #endif
 return ModelLoadResult::SUCCESS;
 }
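
One observation for the future debugging pass: the gpt2 and gptj blocks are the same transform-and-tally loop over different weight lists. A hypothetical helper both could share (ggml_cl_transform_tensor and ggml_nbytes are the calls used above; offload_tensors and the initializer-list idiom are mine, and this links against this tree's ggml):

#include <cstddef>
#include <initializer_list>

struct ggml_tensor;                                                 // from ggml.h
extern "C" void   ggml_cl_transform_tensor(ggml_tensor * tensor);   // from ggml-opencl.h
extern "C" size_t ggml_nbytes(const ggml_tensor * tensor);          // from ggml.h

// Move each tensor to the OpenCL device and return the VRAM consumed,
// replacing the repeated transform/tally pairs in both loaders.
static size_t offload_tensors(std::initializer_list<ggml_tensor *> tensors) {
    size_t vram = 0;
    for (ggml_tensor * t : tensors) {
        ggml_cl_transform_tensor(t);
        vram += ggml_nbytes(t);
    }
    return vram;
}

// e.g. inside the gpt2 layer loop:
//   vram_total += offload_tensors({layer.ln_1_g, layer.ln_1_b,
//                                  layer.c_attn_attn_w, layer.c_attn_attn_b,
//                                  layer.c_attn_proj_w, layer.c_attn_proj_b});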