ggml : GPU-accelerated token generation (#1412)
* CUDA kernel for q4_0 dequant. + mat. vec. mult. * Added q4_1 via template * Added missing __syncthreads(); * --gpu_layers -> --gpu-layers * Shorter dequantize_mul_mat_vec line * q5_0 dequantize_mul_mat kernel * More readable dequantize_mul_mat_vec logic * dequantize_mul_mat_vec kernels for q5_1, q8_0, f16 * llama : offload "output" tensor to GPU too + coding style fixes --------- Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
This commit is contained in:
parent
f954edda93
commit
905d87b70a
8 changed files with 336 additions and 42 deletions
|
@ -21,13 +21,14 @@
|
|||
int32_t get_num_physical_cores();
|
||||
|
||||
struct gpt_params {
|
||||
int32_t seed = -1; // RNG seed
|
||||
int32_t seed = -1; // RNG seed
|
||||
int32_t n_threads = get_num_physical_cores();
|
||||
int32_t n_predict = -1; // new tokens to predict
|
||||
int32_t n_parts = -1; // amount of model parts (-1 = determine from model dimensions)
|
||||
int32_t n_ctx = 512; // context size
|
||||
int32_t n_batch = 512; // batch size for prompt processing (must be >=32 to use BLAS)
|
||||
int32_t n_keep = 0; // number of tokens to keep from initial prompt
|
||||
int32_t n_parts = -1; // amount of model parts (-1 = determine from model dimensions)
|
||||
int32_t n_ctx = 512; // context size
|
||||
int32_t n_batch = 512; // batch size for prompt processing (must be >=32 to use BLAS)
|
||||
int32_t n_keep = 0; // number of tokens to keep from initial prompt
|
||||
int32_t n_gpu_layers = 0; // number of layers to store in VRAM
|
||||
|
||||
// sampling parameters
|
||||
std::unordered_map<llama_token, float> logit_bias; // logit bias for specific tokens
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue