ggml : GPU-accelerated token generation (#1412)
* CUDA kernel for q4_0 dequant. + mat. vec. mult.
* Added q4_1 via template
* Added missing __syncthreads();
* --gpu_layers -> --gpu-layers
* Shorter dequantize_mul_mat_vec line
* q5_0 dequantize_mul_mat kernel
* More readable dequantize_mul_mat_vec logic
* dequantize_mul_mat_vec kernels for q5_1, q8_0, f16
* llama : offload "output" tensor to GPU too + coding style fixes

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
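The core of the change is a family of fused dequantize_mul_mat_vec kernels: each one dequantizes a quantized weight row and multiplies it against the input vector in a single pass, so the full-precision weights never have to be materialized in GPU memory. The sketch below illustrates the idea for a q4_0-style block layout; the block struct, nibble ordering, and launch configuration are simplified assumptions, not the exact ggml-cuda code from this commit.

    #include <stdint.h>
    #include <cuda_runtime.h>

    #define QK 32  // values per quantization block (as in q4_0)

    // Illustrative q4_0-style block: one scale plus 32 4-bit weights (two per byte).
    // The real ggml block layout differs in details; this is only a sketch.
    typedef struct {
        float   d;           // scale
        uint8_t qs[QK / 2];  // packed 4-bit quants
    } block_q4_sketch;

    // One thread block per output row. Each thread dequantizes and accumulates a
    // strided subset of the row's blocks, then a shared-memory reduction produces
    // the dot product. Assumes blockDim.x is a power of two and <= 256.
    __global__ void dequantize_mul_mat_vec_sketch(
            const block_q4_sketch * w, const float * x, float * y, int ncols) {
        const int row     = blockIdx.x;
        const int tid     = threadIdx.x;
        const int nblocks = ncols / QK;

        float sum = 0.0f;
        for (int ib = tid; ib < nblocks; ib += blockDim.x) {
            const block_q4_sketch b = w[row * nblocks + ib];
            for (int j = 0; j < QK / 2; ++j) {
                const int v0 = (b.qs[j] & 0x0F) - 8;  // low nibble, re-centred
                const int v1 = (b.qs[j] >>   4) - 8;  // high nibble, re-centred
                sum += b.d * v0 * x[ib * QK + j];
                sum += b.d * v1 * x[ib * QK + j + QK / 2];
            }
        }

        __shared__ float partial[256];
        partial[tid] = sum;
        __syncthreads();  // all partial sums must be visible before the reduction below

        for (int s = blockDim.x / 2; s > 0; s >>= 1) {
            if (tid < s) partial[tid] += partial[tid + s];
            __syncthreads();
        }
        if (tid == 0) {
            y[row] = partial[0];
        }
    }

A launch of one block per row (e.g. dequantize_mul_mat_vec_sketch<<<nrows, 256>>>(w, x, y, ncols)) then computes the whole matrix-vector product; per the commit message, the --gpu-layers flag controls how many layers' weights take this GPU path.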
Parent: f954edda93
Commit: 905d87b70a
8 changed files with 336 additions and 42 deletions
ggml.h (8 changes)

@@ -243,6 +243,11 @@ extern "C" {
         GGML_TYPE_COUNT,
     };
 
+    enum ggml_backend {
+        GGML_BACKEND_CPU = 0,
+        GGML_BACKEND_CUDA = 1,
+    };
+
     // model file types
     enum ggml_ftype {
         GGML_FTYPE_UNKNOWN = -1,
@@ -333,6 +338,7 @@ extern "C" {
     // n-dimensional tensor
     struct ggml_tensor {
         enum ggml_type type;
+        enum ggml_backend backend;
 
         int n_dims;
         int64_t ne[GGML_MAX_DIMS]; // number of elements
@@ -363,7 +369,7 @@ extern "C" {
 
         char name[32];
 
-        char padding[8]; // TODO: remove and add padding to name?
+        char padding[9]; // TODO: remove and add padding to name?
     };
 
     // computation graph