ggml : GPU-accelerated token generation (#1412)

* CUDA kernel for q4_0 dequant. + mat. vec. mult.

* Added q4_1 via template

* Added missing __syncthreads();

* --gpu_layers -> --gpu-layers

* Shorter dequantize_mul_mat_vec line

* q5_0 dequantize_mul_mat kernel

* More readable dequantize_mul_mat_vec logic

* dequantize_mul_mat_vec kernels for q5_1, q8_0, f16

* llama : offload "output" tensor to GPU too + coding style fixes

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
This commit is contained in:
Johannes Gäßler 2023-05-13 15:38:36 +02:00 committed by GitHub
parent f954edda93
commit 905d87b70a
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
8 changed files with 336 additions and 42 deletions

View file

@ -9,6 +9,9 @@
#include "llama.h"
#include "ggml.h"
#ifdef GGML_USE_CUBLAS
#include "ggml-cuda.h"
#endif
#include <array>
#include <ctime>
@ -810,6 +813,7 @@ struct llama_context_params llama_context_default_params() {
struct llama_context_params result = {
/*.n_ctx =*/ 512,
/*.n_parts =*/ -1,
/*.gpu_layers =*/ 0,
/*.seed =*/ -1,
/*.f16_kv =*/ false,
/*.logits_all =*/ false,
@ -876,6 +880,7 @@ static void llama_model_load_internal(
const std::string & fname,
llama_context & lctx,
int n_ctx,
int n_gpu_layers,
ggml_type memory_type,
bool use_mmap,
bool use_mlock,
@ -1022,6 +1027,33 @@ static void llama_model_load_internal(
ml->load_all_data(progress_callback, progress_callback_user_data, use_mlock ? &lctx.model.mlock_mmap : NULL);
model.mapping = std::move(ml->mapping);
#ifdef GGML_USE_CUBLAS
{
const int n_gpu = std::min(n_gpu_layers, int(hparams.n_layer));
fprintf(stderr, "%s: [cublas] offloading %d layers to GPU\n", __func__, n_gpu);
size_t vram_total = 0;
for (int i = 0; i < n_gpu; ++i) {
const auto & layer = model.layers[i];
ggml_cuda_transform_tensor(layer.wq); vram_total += ggml_nbytes(layer.wq);
ggml_cuda_transform_tensor(layer.wk); vram_total += ggml_nbytes(layer.wk);
ggml_cuda_transform_tensor(layer.wv); vram_total += ggml_nbytes(layer.wv);
ggml_cuda_transform_tensor(layer.wo); vram_total += ggml_nbytes(layer.wo);
ggml_cuda_transform_tensor(layer.w1); vram_total += ggml_nbytes(layer.w1);
ggml_cuda_transform_tensor(layer.w2); vram_total += ggml_nbytes(layer.w2);
ggml_cuda_transform_tensor(layer.w3); vram_total += ggml_nbytes(layer.w3);
}
if (n_gpu_layers > (int) hparams.n_layer) {
fprintf(stderr, "%s: [cublas] offloading output layer to GPU\n", __func__);
ggml_cuda_transform_tensor(model.output); vram_total += ggml_nbytes(model.output);
}
fprintf(stderr, "%s: [cublas] total VRAM used: %zu MB\n", __func__, vram_total / 1024 / 1024);
}
#endif
// loading time will be recalculate after the first eval, so
// we take page faults deferred by mmap() into consideration
@ -1032,6 +1064,7 @@ static bool llama_model_load(
const std::string & fname,
llama_context & lctx,
int n_ctx,
int n_gpu_layers,
ggml_type memory_type,
bool use_mmap,
bool use_mlock,
@ -1039,7 +1072,7 @@ static bool llama_model_load(
llama_progress_callback progress_callback,
void *progress_callback_user_data) {
try {
llama_model_load_internal(fname, lctx, n_ctx, memory_type, use_mmap, use_mlock,
llama_model_load_internal(fname, lctx, n_ctx, n_gpu_layers, memory_type, use_mmap, use_mlock,
vocab_only, progress_callback, progress_callback_user_data);
return true;
} catch (const std::string & err) {
@ -2111,7 +2144,7 @@ struct llama_context * llama_init_from_file(
ggml_type memory_type = params.f16_kv ? GGML_TYPE_F16 : GGML_TYPE_F32;
if (!llama_model_load(path_model, *ctx, params.n_ctx, memory_type,
if (!llama_model_load(path_model, *ctx, params.n_ctx, params.n_gpu_layers, memory_type,
params.use_mmap, params.use_mlock, params.vocab_only,
params.progress_callback, params.progress_callback_user_data)) {
fprintf(stderr, "%s: failed to load model\n", __func__);