Implemented basic GPU offloading for MPT, GPT-2, GPT-J and GPT-NeoX

parent b1f00fa9cc
commit 1b71752a9f
6 changed files with 99 additions and 8 deletions
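
Each of the four loaders gains the same post-load step: after fin.close(), the first gpulayers transformer layers have their large weight matrices tagged as GPU tensors and transformed into device memory via the CLBlast backend, with the consumed VRAM totalled for a log line. A condensed sketch of that shared pattern follows; the helper name offload_layers_to_gpu is illustrative only (in the diff the block is inlined into each loader), and gpt2_model, ggml_tensor and the ggml helpers are the repo's own types as used in the hunks below.

    #include <algorithm>   // std::min
    #include <cstdio>      // fprintf
    #include "ggml.h"      // ggml_tensor, ggml_nbytes, GGML_BACKEND_GPU
    #if defined(GGML_USE_CLBLAST)
    #include "ggml-opencl.h"   // ggml_cl_transform_tensor

    // Offload the weight matrices of the first `gpulayers` layers to the GPU.
    // GPT-2 member names shown; the other loaders use their own layer fields.
    static void offload_layers_to_gpu(gpt2_model & model, int gpulayers) {
        size_t vram_total = 0;
        // Never offload more layers than the model actually has.
        const int n_gpu = std::min(gpulayers, int(model.hparams.n_layer));
        fprintf(stderr, "[opencl] offloading %d layers to GPU\n", n_gpu);
        for (int i = 0; i < n_gpu; ++i) {
            const auto & layer = model.layers[i];
            // Only the big matmul weights move; biases and norms stay on the CPU.
            ggml_tensor * weights[] = { layer.c_attn_attn_w, layer.c_attn_proj_w,
                                        layer.c_mlp_fc_w,    layer.c_mlp_proj_w };
            for (ggml_tensor * w : weights) {
                w->backend = GGML_BACKEND_GPU;         // mark tensor as GPU-resident
                ggml_cl_transform_tensor(w->data, w);  // copy/convert into device memory
                vram_total += ggml_nbytes(w);          // track VRAM for the log line
            }
        }
        fprintf(stderr, "[opencl] total VRAM used: %zu MB\n", vram_total / 1024 / 1024);
    }
    #endif // GGML_USE_CLBLAST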
@@ -671,7 +671,7 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in
     {
         if(file_format==FileFormat::NEOX_6|| file_format==FileFormat::NEOX_7)
         {
-            ModelLoadResult res = gpt_neox_model_load(params.model, neox_ctx_v3, vocab, file_format);
+            ModelLoadResult res = gpt_neox_model_load(params.model, neox_ctx_v3, vocab, file_format, inputs.gpulayers);
             if(res==ModelLoadResult::FAIL)
             {
                 fprintf(stderr, "%s: failed to load model from '%s'\n", __func__, params.model.c_str());
@@ -733,7 +733,7 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in
     }
     else if(file_format==FileFormat::MPT_1)
    {
-        bool res = mpt_model_load(params.model, mpt_ctx_v3, vocab);
+        bool res = mpt_model_load(params.model, mpt_ctx_v3, vocab, inputs.gpulayers);
         if(res==false)
         {
             fprintf(stderr, "%s: failed to load model from '%s'\n", __func__, params.model.c_str());

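Caller-side sketch (not part of this diff): the requested layer count travels in load_model_inputs.gpulayers and gpttype_load_model forwards it to each loader, as the two hunks above show; the field values here are purely illustrative.

    load_model_inputs inputs = {};   // remaining fields (model path, threads, ...) elided
    inputs.gpulayers = 20;           // request 20 transformer layers on the GPU
    ModelLoadResult res = gpttype_load_model(inputs, FileFormat::MPT_1);
    if (res == ModelLoadResult::FAIL) {
        fprintf(stderr, "model load failed\n");
    }
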
@@ -345,6 +345,28 @@ ModelLoadResult gpt2_model_load(const std::string & fname, gpt2_model & model, g

     fin.close();

+    //gpu offload
+    #if defined(GGML_USE_CLBLAST)
+    {
+        const auto & hparams = model.hparams;
+        size_t vram_total = 0;
+        const int n_gpu = std::min(gpulayers, int(hparams.n_layer));
+        fprintf(stderr, "%s: [opencl] offloading %d layers to GPU\n", __func__, n_gpu);
+        for (int i = 0; i < n_gpu; ++i) {
+            const auto & layer = model.layers[i];
+            layer.c_attn_attn_w->backend = GGML_BACKEND_GPU;
+            layer.c_attn_proj_w->backend = GGML_BACKEND_GPU;
+            layer.c_mlp_fc_w->backend = GGML_BACKEND_GPU;
+            layer.c_mlp_proj_w->backend = GGML_BACKEND_GPU;
+            ggml_cl_transform_tensor(layer.c_attn_attn_w->data,layer.c_attn_attn_w); vram_total += ggml_nbytes(layer.c_attn_attn_w);
+            ggml_cl_transform_tensor(layer.c_attn_proj_w->data,layer.c_attn_proj_w); vram_total += ggml_nbytes(layer.c_attn_proj_w);
+            ggml_cl_transform_tensor(layer.c_mlp_fc_w->data,layer.c_mlp_fc_w); vram_total += ggml_nbytes(layer.c_mlp_fc_w);
+            ggml_cl_transform_tensor(layer.c_mlp_proj_w->data,layer.c_mlp_proj_w); vram_total += ggml_nbytes(layer.c_mlp_proj_w);
+        }
+        fprintf(stderr, "%s: [opencl] total VRAM used: %zu MB\n", __func__, vram_total / 1024 / 1024);
+    }
+    #endif

     return ModelLoadResult::SUCCESS;
 }

@@ -15,7 +15,9 @@

 #include "model_adapter.h"

+#if defined(GGML_USE_CLBLAST)
+#include "ggml-opencl.h"
+#endif

 // load the model's weights from a file
 ModelLoadResult gptj_model_load(const std::string & fname, gptj_model & model, gpt_vocab & vocab, int gpulayers) {
@@ -331,7 +333,31 @@ ModelLoadResult gptj_model_load(const std::string & fname, gptj_model & model, g

     fin.close();

+    //gpu offload
+    #if defined(GGML_USE_CLBLAST)
+    {
+        const auto & hparams = model.hparams;
+        size_t vram_total = 0;
+        const int n_gpu = std::min(gpulayers, int(hparams.n_layer));
+        fprintf(stderr, "%s: [opencl] offloading %d layers to GPU\n", __func__, n_gpu);
+        for (int i = 0; i < n_gpu; ++i) {
+            const auto & layer = model.layers[i];
+            layer.c_attn_q_proj_w->backend = GGML_BACKEND_GPU;
+            layer.c_attn_k_proj_w->backend = GGML_BACKEND_GPU;
+            layer.c_attn_v_proj_w->backend = GGML_BACKEND_GPU;
+            layer.c_attn_proj_w->backend = GGML_BACKEND_GPU;
+            layer.c_mlp_fc_w->backend = GGML_BACKEND_GPU;
+            layer.c_mlp_proj_w->backend = GGML_BACKEND_GPU;
+            ggml_cl_transform_tensor(layer.c_attn_q_proj_w->data,layer.c_attn_q_proj_w); vram_total += ggml_nbytes(layer.c_attn_q_proj_w);
+            ggml_cl_transform_tensor(layer.c_attn_k_proj_w->data,layer.c_attn_k_proj_w); vram_total += ggml_nbytes(layer.c_attn_k_proj_w);
+            ggml_cl_transform_tensor(layer.c_attn_v_proj_w->data,layer.c_attn_v_proj_w); vram_total += ggml_nbytes(layer.c_attn_v_proj_w);
+            ggml_cl_transform_tensor(layer.c_attn_proj_w->data,layer.c_attn_proj_w); vram_total += ggml_nbytes(layer.c_attn_proj_w);
+            ggml_cl_transform_tensor(layer.c_mlp_fc_w->data,layer.c_mlp_fc_w); vram_total += ggml_nbytes(layer.c_mlp_fc_w);
+            ggml_cl_transform_tensor(layer.c_mlp_proj_w->data,layer.c_mlp_proj_w); vram_total += ggml_nbytes(layer.c_mlp_proj_w);
+        }
+        fprintf(stderr, "%s: [opencl] total VRAM used: %zu MB\n", __func__, vram_total / 1024 / 1024);
+    }
+    #endif

     return ModelLoadResult::SUCCESS;
 }

@@ -18,7 +18,7 @@


 // load the model's weights from a file
-bool mpt_model_load(const std::string & fname, mpt_model & model, gpt_vocab & vocab) {
+bool mpt_model_load(const std::string & fname, mpt_model & model, gpt_vocab & vocab, int gpulayers) {
     printf("%s: loading model from '%s' - please wait ...\n", __func__, fname.c_str());

     auto fin = std::ifstream(fname, std::ios::binary);
@@ -278,6 +278,28 @@ bool mpt_model_load(const std::string & fname, mpt_model & model, gpt_vocab & vo

     fin.close();

+    //gpu offload
+    #if defined(GGML_USE_CLBLAST)
+    {
+        const auto & hparams = model.hparams;
+        size_t vram_total = 0;
+        const int n_gpu = std::min(gpulayers, int(hparams.n_layers));
+        fprintf(stderr, "%s: [opencl] offloading %d layers to GPU\n", __func__, n_gpu);
+        for (int i = 0; i < n_gpu; ++i) {
+            const auto & layer = model.layers[i];
+            layer.ffn_up_proj->backend = GGML_BACKEND_GPU;
+            layer.ffn_down_proj->backend = GGML_BACKEND_GPU;
+            layer.c_attn_wqkv_weight->backend = GGML_BACKEND_GPU;
+            layer.c_attn_out_proj_weight->backend = GGML_BACKEND_GPU;
+            ggml_cl_transform_tensor(layer.ffn_up_proj->data,layer.ffn_up_proj); vram_total += ggml_nbytes(layer.ffn_up_proj);
+            ggml_cl_transform_tensor(layer.ffn_down_proj->data,layer.ffn_down_proj); vram_total += ggml_nbytes(layer.ffn_down_proj);
+            ggml_cl_transform_tensor(layer.c_attn_wqkv_weight->data,layer.c_attn_wqkv_weight); vram_total += ggml_nbytes(layer.c_attn_wqkv_weight);
+            ggml_cl_transform_tensor(layer.c_attn_out_proj_weight->data,layer.c_attn_out_proj_weight); vram_total += ggml_nbytes(layer.c_attn_out_proj_weight);
+        }
+        fprintf(stderr, "%s: [opencl] total VRAM used: %zu MB\n", __func__, vram_total / 1024 / 1024);
+    }
+    #endif

     return true;
 }

@@ -16,7 +16,7 @@


 // load the model's weights from a file
-ModelLoadResult gpt_neox_model_load(const std::string & fname, gpt_neox_model & model, gpt_vocab & vocab, FileFormat file_format) {
+ModelLoadResult gpt_neox_model_load(const std::string & fname, gpt_neox_model & model, gpt_vocab & vocab, FileFormat file_format, int gpulayers) {
     printf("%s: loading model from '%s' - please wait ...\n", __func__, fname.c_str());

     auto fin = std::ifstream(fname, std::ios::binary);
@@ -318,6 +318,28 @@ ModelLoadResult gpt_neox_model_load(const std::string & fname, gpt_neox_model &

     fin.close();

+    //gpu offload
+    #if defined(GGML_USE_CLBLAST)
+    {
+        const auto & hparams = model.hparams;
+        size_t vram_total = 0;
+        const int n_gpu = std::min(gpulayers, int(hparams.n_layer));
+        fprintf(stderr, "%s: [opencl] offloading %d layers to GPU\n", __func__, n_gpu);
+        for (int i = 0; i < n_gpu; ++i) {
+            const auto & layer = model.layers[i];
+            layer.c_attn_attn_w->backend = GGML_BACKEND_GPU;
+            layer.c_attn_proj_w->backend = GGML_BACKEND_GPU;
+            layer.c_mlp_fc_w->backend = GGML_BACKEND_GPU;
+            layer.c_mlp_proj_w->backend = GGML_BACKEND_GPU;
+            ggml_cl_transform_tensor(layer.c_attn_attn_w->data,layer.c_attn_attn_w); vram_total += ggml_nbytes(layer.c_attn_attn_w);
+            ggml_cl_transform_tensor(layer.c_attn_proj_w->data,layer.c_attn_proj_w); vram_total += ggml_nbytes(layer.c_attn_proj_w);
+            ggml_cl_transform_tensor(layer.c_mlp_fc_w->data,layer.c_mlp_fc_w); vram_total += ggml_nbytes(layer.c_mlp_fc_w);
+            ggml_cl_transform_tensor(layer.c_mlp_proj_w->data,layer.c_mlp_proj_w); vram_total += ggml_nbytes(layer.c_mlp_proj_w);
+        }
+        fprintf(stderr, "%s: [opencl] total VRAM used: %zu MB\n", __func__, vram_total / 1024 / 1024);
+    }
+    #endif

     return ModelLoadResult::SUCCESS;
 }

@@ -43,7 +43,6 @@ struct gptj_layer {
     struct ggml_tensor * c_mlp_fc_b;

     struct ggml_tensor * c_mlp_proj_w;
-    struct ggml_tensor * c_mlp_proj_w_trans; //for backwards compatibility
     struct ggml_tensor * c_mlp_proj_b;
 };
 struct gptj_layer_v2 {