ggml : GPU-accelerated token generation (#1412)
* CUDA kernel for q4_0 dequant. + mat. vec. mult.
* Added q4_1 via template
* Added missing __syncthreads();
* --gpu_layers -> --gpu-layers
* Shorter dequantize_mul_mat_vec line
* q5_0 dequantize_mul_mat kernel
* More readable dequantize_mul_mat_vec logic
* dequantize_mul_mat_vec kernels for q5_1, q8_0, f16
* llama : offload "output" tensor to GPU too + coding style fixes

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
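The bullets above compress the kernel design into one line each. The core idea behind the dequantize_mul_mat_vec kernels is to fuse dequantization with the matrix-vector product, so the quantized weights are expanded to floats only in registers, never materialized in global memory. Below is a minimal sketch of that idea for q4_0, assuming the ggml block layout of this era (one fp32 scale plus 32 four-bit quants packed two per byte, centered at 8). It is illustrative only, not the merged kernel: the exact signature, the 256-slot shared-memory reduction, and the launch shape are assumptions.

    // dmmv_sketch.cu -- illustrative sketch of the fused
    // "dequantize + mat-vec-mult" idea; NOT the kernel merged in this commit.
    #include <cuda_runtime.h>

    #define QK4_0 32

    // Mirrors ggml's q4_0 block layout of this era: fp32 scale plus
    // 32 four-bit quants packed two per byte, centered at 8.
    typedef struct {
        float d;
        unsigned char qs[QK4_0 / 2];
    } block_q4_0;

    // One thread block per output row; each thread walks a strided subset of
    // the row's quant blocks, dequantizing and multiplying by x on the fly,
    // then the per-thread partial sums are reduced in shared memory.
    __global__ void dequantize_mul_mat_vec_q4_0(
            const block_q4_0 * w, const float * x, float * y, const int ncols) {
        const int row     = blockIdx.x;
        const int tid     = threadIdx.x;
        const int nblocks = ncols / QK4_0;

        float partial = 0.0f;
        for (int ib = tid; ib < nblocks; ib += blockDim.x) {
            const block_q4_0 & b = w[row * nblocks + ib];
            for (int j = 0; j < QK4_0 / 2; ++j) {
                const int q0 = (b.qs[j] & 0x0F) - 8; // low nibble  -> even element
                const int q1 = (b.qs[j] >> 4)   - 8; // high nibble -> odd element
                partial += b.d * q0 * x[ib * QK4_0 + 2 * j + 0];
                partial += b.d * q1 * x[ib * QK4_0 + 2 * j + 1];
            }
        }

        // Shared-memory tree reduction; the missing __syncthreads() that the
        // commit message mentions would bite exactly in a loop like this one.
        __shared__ float sum[256]; // assumes blockDim.x <= 256 and a power of two
        sum[tid] = partial;
        __syncthreads();
        for (int s = blockDim.x / 2; s > 0; s >>= 1) {
            if (tid < s) {
                sum[tid] += sum[tid + s];
            }
            __syncthreads();
        }
        if (tid == 0) {
            y[row] = sum[0];
        }
    }

    // Launch shape, e.g.: dequantize_mul_mat_vec_q4_0<<<nrows, 128>>>(w, x, y, ncols);

Fusing the two steps keeps the kernel memory-bound on the small quantized weights rather than on a full f32 copy, which is what makes the per-token matrix-vector products worth offloading.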
parent f954edda93
commit 905d87b70a
8 changed files with 336 additions and 42 deletions
llama.cpp | 37
--- a/llama.cpp
+++ b/llama.cpp
@@ -9,6 +9,9 @@
 #include "llama.h"
 
 #include "ggml.h"
+#ifdef GGML_USE_CUBLAS
+#include "ggml-cuda.h"
+#endif
 
 #include <array>
 #include <ctime>
@@ -810,6 +813,7 @@ struct llama_context_params llama_context_default_params() {
     struct llama_context_params result = {
         /*.n_ctx      =*/ 512,
         /*.n_parts    =*/ -1,
+        /*.gpu_layers =*/ 0,
         /*.seed       =*/ -1,
         /*.f16_kv     =*/ false,
         /*.logits_all =*/ false,
@@ -876,6 +880,7 @@ static void llama_model_load_internal(
         const std::string & fname,
         llama_context & lctx,
         int n_ctx,
+        int n_gpu_layers,
         ggml_type memory_type,
         bool use_mmap,
         bool use_mlock,
@@ -1022,6 +1027,33 @@ static void llama_model_load_internal(
     ml->load_all_data(progress_callback, progress_callback_user_data, use_mlock ? &lctx.model.mlock_mmap : NULL);
 
     model.mapping = std::move(ml->mapping);
+#ifdef GGML_USE_CUBLAS
+    {
+        const int n_gpu = std::min(n_gpu_layers, int(hparams.n_layer));
+
+        fprintf(stderr, "%s: [cublas] offloading %d layers to GPU\n", __func__, n_gpu);
+
+        size_t vram_total = 0;
+
+        for (int i = 0; i < n_gpu; ++i) {
+            const auto & layer = model.layers[i];
+
+            ggml_cuda_transform_tensor(layer.wq); vram_total += ggml_nbytes(layer.wq);
+            ggml_cuda_transform_tensor(layer.wk); vram_total += ggml_nbytes(layer.wk);
+            ggml_cuda_transform_tensor(layer.wv); vram_total += ggml_nbytes(layer.wv);
+            ggml_cuda_transform_tensor(layer.wo); vram_total += ggml_nbytes(layer.wo);
+            ggml_cuda_transform_tensor(layer.w1); vram_total += ggml_nbytes(layer.w1);
+            ggml_cuda_transform_tensor(layer.w2); vram_total += ggml_nbytes(layer.w2);
+            ggml_cuda_transform_tensor(layer.w3); vram_total += ggml_nbytes(layer.w3);
+        }
+        if (n_gpu_layers > (int) hparams.n_layer) {
+            fprintf(stderr, "%s: [cublas] offloading output layer to GPU\n", __func__);
+            ggml_cuda_transform_tensor(model.output); vram_total += ggml_nbytes(model.output);
+        }
+
+        fprintf(stderr, "%s: [cublas] total VRAM used: %zu MB\n", __func__, vram_total / 1024 / 1024);
+    }
+#endif
 
     // loading time will be recalculate after the first eval, so
     // we take page faults deferred by mmap() into consideration
@@ -1032,6 +1064,7 @@ static bool llama_model_load(
         const std::string & fname,
         llama_context & lctx,
         int n_ctx,
+        int n_gpu_layers,
         ggml_type memory_type,
         bool use_mmap,
         bool use_mlock,
@@ -1039,7 +1072,7 @@
         llama_progress_callback progress_callback,
         void *progress_callback_user_data) {
     try {
-        llama_model_load_internal(fname, lctx, n_ctx, memory_type, use_mmap, use_mlock,
+        llama_model_load_internal(fname, lctx, n_ctx, n_gpu_layers, memory_type, use_mmap, use_mlock,
                                   vocab_only, progress_callback, progress_callback_user_data);
         return true;
     } catch (const std::string & err) {
@@ -2111,7 +2144,7 @@ struct llama_context * llama_init_from_file(
 
     ggml_type memory_type = params.f16_kv ? GGML_TYPE_F16 : GGML_TYPE_F32;
 
-    if (!llama_model_load(path_model, *ctx, params.n_ctx, memory_type,
+    if (!llama_model_load(path_model, *ctx, params.n_ctx, params.n_gpu_layers, memory_type,
                           params.use_mmap, params.use_mlock, params.vocab_only,
                           params.progress_callback, params.progress_callback_user_data)) {
         fprintf(stderr, "%s: failed to load model\n", __func__);
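For orientation, the new n_gpu_layers value threads from llama_context_params through llama_model_load into llama_model_load_internal, where the loop above copies each offloaded layer's weight tensors to VRAM. Below is a minimal caller-side sketch against this revision of the API; the path "model.bin" and the value 20 are placeholders.

    #include <cstdio>
    #include "llama.h"

    int main() {
        llama_context_params params = llama_context_default_params();
        // layers to keep in VRAM; a value > n_layer also offloads the output tensor
        params.n_gpu_layers = 20;

        // "model.bin" is a placeholder path
        llama_context * ctx = llama_init_from_file("model.bin", params);
        if (ctx == NULL) {
            fprintf(stderr, "failed to load model\n");
            return 1;
        }

        // ... tokenize and evaluate as usual; offloaded layers run their
        // matrix-vector products through the CUDA kernels added by this PR ...

        llama_free(ctx);
        return 0;
    }

On the command line the same knob surfaces as --gpu-layers (renamed from --gpu_layers in one of the commits above); per the guard in the loop, requesting more layers than hparams.n_layer additionally offloads the "output" tensor.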