Merge branch 'master' into llava-lib
This commit is contained in:
commit
71ea278ad8
4 changed files with 38 additions and 12 deletions
|
@ -90,6 +90,19 @@ void process_escapes(std::string& input) {
|
||||||
case '\'': input[output_idx++] = '\''; break;
|
case '\'': input[output_idx++] = '\''; break;
|
||||||
case '\"': input[output_idx++] = '\"'; break;
|
case '\"': input[output_idx++] = '\"'; break;
|
||||||
case '\\': input[output_idx++] = '\\'; break;
|
case '\\': input[output_idx++] = '\\'; break;
|
||||||
|
case 'x':
|
||||||
|
// Handle \x12, etc
|
||||||
|
if (input_idx + 2 < input_len) {
|
||||||
|
const char x[3] = { input[input_idx + 1], input[input_idx + 2], 0 };
|
||||||
|
char *err_p = nullptr;
|
||||||
|
const long val = std::strtol(x, &err_p, 16);
|
||||||
|
if (err_p == x + 2) {
|
||||||
|
input_idx += 2;
|
||||||
|
input[output_idx++] = char(val);
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
// fall through
|
||||||
default: input[output_idx++] = '\\';
|
default: input[output_idx++] = '\\';
|
||||||
input[output_idx++] = input[input_idx]; break;
|
input[output_idx++] = input[input_idx]; break;
|
||||||
}
|
}
|
||||||
|
|
|
@ -7,7 +7,7 @@ Command line options:
|
||||||
- `--threads N`, `-t N`: Set the number of threads to use during generation.
|
- `--threads N`, `-t N`: Set the number of threads to use during generation.
|
||||||
- `-tb N, --threads-batch N`: Set the number of threads to use during batch and prompt processing. If not specified, the number of threads will be set to the number of threads used for generation.
|
- `-tb N, --threads-batch N`: Set the number of threads to use during batch and prompt processing. If not specified, the number of threads will be set to the number of threads used for generation.
|
||||||
- `-m FNAME`, `--model FNAME`: Specify the path to the LLaMA model file (e.g., `models/7B/ggml-model.gguf`).
|
- `-m FNAME`, `--model FNAME`: Specify the path to the LLaMA model file (e.g., `models/7B/ggml-model.gguf`).
|
||||||
- `-m ALIAS`, `--alias ALIAS`: Set an alias for the model. The alias will be returned in API responses.
|
- `-a ALIAS`, `--alias ALIAS`: Set an alias for the model. The alias will be returned in API responses.
|
||||||
- `-c N`, `--ctx-size N`: Set the size of the prompt context. The default is 512, but LLaMA models were built with a context of 2048, which will provide better results for longer input/inference. The size may differ in other models, for example, baichuan models were build with a context of 4096.
|
- `-c N`, `--ctx-size N`: Set the size of the prompt context. The default is 512, but LLaMA models were built with a context of 2048, which will provide better results for longer input/inference. The size may differ in other models, for example, baichuan models were build with a context of 4096.
|
||||||
- `-ngl N`, `--n-gpu-layers N`: When compiled with appropriate support (currently CLBlast or cuBLAS), this option allows offloading some layers to the GPU for computation. Generally results in increased performance.
|
- `-ngl N`, `--n-gpu-layers N`: When compiled with appropriate support (currently CLBlast or cuBLAS), this option allows offloading some layers to the GPU for computation. Generally results in increased performance.
|
||||||
- `-mg i, --main-gpu i`: When using multiple GPUs this option controls which GPU is used for small tensors for which the overhead of splitting the computation across all GPUs is not worthwhile. The GPU in question will use slightly more VRAM to store a scratch buffer for temporary results. By default GPU 0 is used. Requires cuBLAS.
|
- `-mg i, --main-gpu i`: When using multiple GPUs this option controls which GPU is used for small tensors for which the overhead of splitting the computation across all GPUs is not worthwhile. The GPU in question will use slightly more VRAM to store a scratch buffer for temporary results. By default GPU 0 is used. Requires cuBLAS.
|
||||||
|
|
24
ggml-cuda.cu
24
ggml-cuda.cu
|
@ -6893,6 +6893,8 @@ static void ggml_cuda_op_mul_mat(
|
||||||
int64_t row_low[GGML_CUDA_MAX_DEVICES];
|
int64_t row_low[GGML_CUDA_MAX_DEVICES];
|
||||||
int64_t row_high[GGML_CUDA_MAX_DEVICES];
|
int64_t row_high[GGML_CUDA_MAX_DEVICES];
|
||||||
|
|
||||||
|
int used_devices = 0;
|
||||||
|
|
||||||
for (int64_t id = 0; id < g_device_count; ++id) {
|
for (int64_t id = 0; id < g_device_count; ++id) {
|
||||||
// by default, use all rows
|
// by default, use all rows
|
||||||
row_low[id] = 0;
|
row_low[id] = 0;
|
||||||
|
@ -6920,6 +6922,8 @@ static void ggml_cuda_op_mul_mat(
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
used_devices++;
|
||||||
|
|
||||||
const bool src1_on_device = src1->backend == GGML_BACKEND_GPU && id == g_main_device;
|
const bool src1_on_device = src1->backend == GGML_BACKEND_GPU && id == g_main_device;
|
||||||
const bool dst_on_device = dst->backend == GGML_BACKEND_GPU && id == g_main_device;
|
const bool dst_on_device = dst->backend == GGML_BACKEND_GPU && id == g_main_device;
|
||||||
|
|
||||||
|
@ -6958,12 +6962,12 @@ static void ggml_cuda_op_mul_mat(
|
||||||
|
|
||||||
// if multiple devices are used they need to wait for the main device
|
// if multiple devices are used they need to wait for the main device
|
||||||
// here an event is recorded that signals that the main device has finished calculating the input data
|
// here an event is recorded that signals that the main device has finished calculating the input data
|
||||||
if (split && g_device_count > 1) {
|
if (split && used_devices > 1) {
|
||||||
CUDA_CHECK(ggml_cuda_set_device(g_main_device));
|
CUDA_CHECK(ggml_cuda_set_device(g_main_device));
|
||||||
CUDA_CHECK(cudaEventRecord(src0_extra->events[g_main_device][0], g_cudaStreams[g_main_device][0]));
|
CUDA_CHECK(cudaEventRecord(src0_extra->events[g_main_device][0], g_cudaStreams[g_main_device][0]));
|
||||||
}
|
}
|
||||||
|
|
||||||
const int64_t src1_col_stride = split && g_device_count > 1 ? MUL_MAT_SRC1_COL_STRIDE : ne11;
|
const int64_t src1_col_stride = split && used_devices > 1 ? MUL_MAT_SRC1_COL_STRIDE : ne11;
|
||||||
for (int64_t src1_col_0 = 0; src1_col_0 < ne11; src1_col_0 += src1_col_stride) {
|
for (int64_t src1_col_0 = 0; src1_col_0 < ne11; src1_col_0 += src1_col_stride) {
|
||||||
const int64_t is = split ? (src1_col_0/src1_col_stride) % MAX_STREAMS : 0;
|
const int64_t is = split ? (src1_col_0/src1_col_stride) % MAX_STREAMS : 0;
|
||||||
const int64_t src1_ncols = src1_col_0 + src1_col_stride > ne11 ? ne11 - src1_col_0 : src1_col_stride;
|
const int64_t src1_ncols = src1_col_0 + src1_col_stride > ne11 ? ne11 - src1_col_0 : src1_col_stride;
|
||||||
|
@ -7079,6 +7083,9 @@ static void ggml_cuda_op_mul_mat(
|
||||||
}
|
}
|
||||||
|
|
||||||
for (int64_t id = 0; id < g_device_count; ++id) {
|
for (int64_t id = 0; id < g_device_count; ++id) {
|
||||||
|
if ((!split && id != g_main_device) || row_low[id] == row_high[id]) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
CUDA_CHECK(ggml_cuda_set_device(id));
|
CUDA_CHECK(ggml_cuda_set_device(id));
|
||||||
|
|
||||||
// free buffers again when done
|
// free buffers again when done
|
||||||
|
@ -7103,6 +7110,9 @@ static void ggml_cuda_op_mul_mat(
|
||||||
|
|
||||||
CUDA_CHECK(ggml_cuda_set_device(g_main_device));
|
CUDA_CHECK(ggml_cuda_set_device(g_main_device));
|
||||||
for (int64_t id = 0; id < g_device_count; ++id) {
|
for (int64_t id = 0; id < g_device_count; ++id) {
|
||||||
|
if (row_low[id] == row_high[id]) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
for (int64_t is = 0; is < is_max; ++is) {
|
for (int64_t is = 0; is < is_max; ++is) {
|
||||||
CUDA_CHECK(cudaStreamWaitEvent(g_cudaStreams[g_main_device][0], src0_extra->events[id][is], 0));
|
CUDA_CHECK(cudaStreamWaitEvent(g_cudaStreams[g_main_device][0], src0_extra->events[id][is], 0));
|
||||||
}
|
}
|
||||||
|
@ -7400,10 +7410,12 @@ static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const
|
||||||
|
|
||||||
static void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
static void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
||||||
const bool all_on_device =
|
const bool all_on_device =
|
||||||
(src0->backend == GGML_BACKEND_GPU) &&
|
(src0->backend == GGML_BACKEND_GPU || src0->backend == GGML_BACKEND_GPU_SPLIT) &&
|
||||||
(src1->backend == GGML_BACKEND_GPU) &&
|
(src1->backend == GGML_BACKEND_GPU) &&
|
||||||
( dst->backend == GGML_BACKEND_GPU);
|
( dst->backend == GGML_BACKEND_GPU);
|
||||||
|
|
||||||
|
const bool split = src0->backend == GGML_BACKEND_GPU_SPLIT;
|
||||||
|
|
||||||
int64_t min_compute_capability = INT_MAX;
|
int64_t min_compute_capability = INT_MAX;
|
||||||
for (int64_t id = 0; id < g_device_count; ++id) {
|
for (int64_t id = 0; id < g_device_count; ++id) {
|
||||||
if (min_compute_capability > g_compute_capabilities[id] && g_tensor_split[id] < (id + 1 < g_device_count ? g_tensor_split[id + 1] : 1.0f)) {
|
if (min_compute_capability > g_compute_capabilities[id] && g_tensor_split[id] < (id + 1 < g_device_count ? g_tensor_split[id + 1] : 1.0f)) {
|
||||||
|
@ -7425,13 +7437,13 @@ static void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1
|
||||||
//printf("src0 is contiguous %d, transposed %d, type = %s, name = %s\n", ggml_is_contiguous(src0), ggml_is_transposed(src0), ggml_type_name(src0->type), src0->name);
|
//printf("src0 is contiguous %d, transposed %d, type = %s, name = %s\n", ggml_is_contiguous(src0), ggml_is_transposed(src0), ggml_type_name(src0->type), src0->name);
|
||||||
//printf("src1 is contiguous %d, transposed %d, type = %s, name = %s\n", ggml_is_contiguous(src1), ggml_is_transposed(src1), ggml_type_name(src1->type), src1->name);
|
//printf("src1 is contiguous %d, transposed %d, type = %s, name = %s\n", ggml_is_contiguous(src1), ggml_is_transposed(src1), ggml_type_name(src1->type), src1->name);
|
||||||
|
|
||||||
if (all_on_device && !use_tensor_cores && src0->type == GGML_TYPE_F16 && ggml_is_permuted(src0) && ggml_is_permuted(src1) && src1->ne[1] == 1) {
|
if (!split && all_on_device && !use_tensor_cores && src0->type == GGML_TYPE_F16 && ggml_is_permuted(src0) && ggml_is_permuted(src1) && src1->ne[1] == 1) {
|
||||||
// KQ single-batch
|
// KQ single-batch
|
||||||
ggml_cuda_mul_mat_vec_p021(src0, src1, dst);
|
ggml_cuda_mul_mat_vec_p021(src0, src1, dst);
|
||||||
} else if (all_on_device && !use_tensor_cores && src0->type == GGML_TYPE_F16 && !ggml_is_contiguous(src0) && !ggml_is_transposed(src1) && src1->ne[1] == 1) {
|
} else if (!split && all_on_device && !use_tensor_cores && src0->type == GGML_TYPE_F16 && !ggml_is_contiguous(src0) && !ggml_is_transposed(src1) && src1->ne[1] == 1) {
|
||||||
// KQV single-batch
|
// KQV single-batch
|
||||||
ggml_cuda_mul_mat_vec_nc(src0, src1, dst);
|
ggml_cuda_mul_mat_vec_nc(src0, src1, dst);
|
||||||
} else if (all_on_device && use_tensor_cores && src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F32 && !ggml_is_transposed(src0) && !ggml_is_transposed(src1)) {
|
} else if (!split && all_on_device && use_tensor_cores && src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F32 && !ggml_is_transposed(src0) && !ggml_is_transposed(src1)) {
|
||||||
// KQ + KQV multi-batch
|
// KQ + KQV multi-batch
|
||||||
ggml_cuda_mul_mat_mat_batched_cublas(src0, src1, dst);
|
ggml_cuda_mul_mat_mat_batched_cublas(src0, src1, dst);
|
||||||
} else if (src0->type == GGML_TYPE_F32) {
|
} else if (src0->type == GGML_TYPE_F32) {
|
||||||
|
|
11
llama.cpp
11
llama.cpp
|
@ -5164,11 +5164,12 @@ static int llama_decode_internal(
|
||||||
|
|
||||||
// If all tensors can be run on the GPU then using more than 1 thread is detrimental.
|
// If all tensors can be run on the GPU then using more than 1 thread is detrimental.
|
||||||
const bool full_offload_supported =
|
const bool full_offload_supported =
|
||||||
model.arch == LLM_ARCH_LLAMA ||
|
model.arch == LLM_ARCH_LLAMA ||
|
||||||
model.arch == LLM_ARCH_BAICHUAN ||
|
model.arch == LLM_ARCH_BAICHUAN ||
|
||||||
model.arch == LLM_ARCH_FALCON ||
|
model.arch == LLM_ARCH_FALCON ||
|
||||||
model.arch == LLM_ARCH_REFACT ||
|
model.arch == LLM_ARCH_REFACT ||
|
||||||
model.arch == LLM_ARCH_MPT;
|
model.arch == LLM_ARCH_MPT ||
|
||||||
|
model.arch == LLM_ARCH_STARCODER;
|
||||||
|
|
||||||
const bool fully_offloaded = model.n_gpu_layers >= (int) hparams.n_layer + 3;
|
const bool fully_offloaded = model.n_gpu_layers >= (int) hparams.n_layer + 3;
|
||||||
if (ggml_cpu_has_cublas() && full_offload_supported && fully_offloaded) {
|
if (ggml_cpu_has_cublas() && full_offload_supported && fully_offloaded) {
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue