add more QOL

parent 94ef3e81cf
commit 417711be46

6 changed files with 17 additions and 2 deletions
@@ -164,6 +164,7 @@ __kernel void dequantize_row_q8_0(__global struct block_q8_0* blocks, __global f
         cl_int err_ = (err); \
         if (err_ != CL_SUCCESS) { \
             fprintf(stderr, "OpenCL %s error %d at %s:%d\n", name, err_, __FILE__, __LINE__); \
+            fprintf(stderr, "You may be out of VRAM. Please check if you have enough.\n"); \
             exit(1); \
         } \
     } while (0)

@@ -313,6 +313,7 @@ static std::string generate_kernels() {
         cl_int err_ = (err); \
         if (err_ != CL_SUCCESS) { \
             fprintf(stderr, "OpenCL %s error %d at %s:%d\n", name, err_, __FILE__, __LINE__); \
+            fprintf(stderr, "You may be out of VRAM. Please check if you have enough.\n"); \
             exit(1); \
         } \
     } while (0)

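Both hunks above add the same hint to an error-checking macro in the OpenCL backend. For context, a minimal sketch of how a do/while(0) macro of this shape is typically used around an OpenCL allocation; the macro name CL_CHECK, the helper function, and the clCreateBuffer call are illustrative assumptions, not taken from this commit.

#include <CL/cl.h>
#include <stdio.h>
#include <stdlib.h>

/* Assumed name: the real macro in the backend may differ. */
#define CL_CHECK(err, name)                                                                   \
    do {                                                                                      \
        cl_int err_ = (err);                                                                  \
        if (err_ != CL_SUCCESS) {                                                             \
            fprintf(stderr, "OpenCL %s error %d at %s:%d\n", name, err_, __FILE__, __LINE__); \
            fprintf(stderr, "You may be out of VRAM. Please check if you have enough.\n");    \
            exit(1);                                                                          \
        }                                                                                     \
    } while (0)

/* Hypothetical helper: an allocation that fails (e.g. CL_MEM_OBJECT_ALLOCATION_FAILURE
   when device memory is exhausted) now also prints the out-of-VRAM hint before exiting. */
static cl_mem alloc_device_buffer(cl_context ctx, size_t size) {
    cl_int err;
    cl_mem buf = clCreateBuffer(ctx, CL_MEM_READ_WRITE, size, NULL, &err);
    CL_CHECK(err, "clCreateBuffer");
    return buf;
}
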
@@ -840,6 +841,7 @@ static void ggml_cl_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor *
                                                    &queue, &ev_sgemm);

         if (status != clblast::StatusCode::kSuccess) {
+            printf("\nYou may be out of VRAM. Please check if you have enough.\n");
             GGML_ASSERT(false);
         }
     }

@@ -872,7 +874,7 @@ bool ggml_cl_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tens
     if ((src0->type == GGML_TYPE_F32 || src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type)) &&
         src1->type == GGML_TYPE_F32 &&
         dst->type == GGML_TYPE_F32 &&
-        ((GetQuantsUnshuffled() && ne0 >= 32 && ne1 >= 32 && ne10 >= 32) || src0->backend == GGML_BACKEND_CL)) {
+        ((GetQuantsUnshuffled() && GetGPULayers()>0 && ne0 >= 32 && ne1 >= 32 && ne10 >= 32) || src0->backend == GGML_BACKEND_CL)) {
         return true;
     }

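The only functional change in this hunk is the extra GetGPULayers()>0 term: the size-based CLBlast path is now taken only when at least one layer has actually been offloaded to the GPU. A simplified restatement of the gate, with the tensor-type checks folded into a parameter; the helper name and parameters are mine, only the accessors and the size thresholds come from the diff.

#include <stdbool.h>

extern bool GetQuantsUnshuffled(void);   /* accessors as declared in ggml.h */
extern bool GetGPULayers(void);

/* Hypothetical restatement of the updated check in ggml_cl_can_mul_mat. */
static bool can_use_clblast(bool types_ok, bool src0_on_cl, int ne0, int ne1, int ne10) {
    if (!types_ok) {
        return false;
    }
    /* New in this commit: also require GetGPULayers() > 0, i.e. layers were offloaded. */
    return (GetQuantsUnshuffled() && GetGPULayers() > 0 &&
            ne0 >= 32 && ne1 >= 32 && ne10 >= 32) || src0_on_cl;
}
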
ggml.c (1 change)

@@ -1499,6 +1499,7 @@ quantize_fns_t ggml_internal_get_quantize_fn(size_t i) {
     return quantize_fns[i];
 }

+int true_gpu_layers = 0;
 bool quants_unshuffled = false; //new GGJT_2 is unshuffled, all old ones are shuffled
 static const quantize_fns_t quantize_fns_v2[GGML_TYPE_COUNT]; //forward decl
 static inline quantize_fns_t get_quantize_fn(size_t i)

ggml.h (2 changes)

@@ -1094,6 +1094,8 @@ extern "C" {

     void SetQuantsUnshuffled(bool unshuffled);
     bool GetQuantsUnshuffled();
+    void SetGPULayers(bool layers);
+    bool GetGPULayers();

     GGML_API int ggml_cpu_has_avx        (void);
     GGML_API int ggml_cpu_has_avx2       (void);

@@ -1579,6 +1579,14 @@ inline bool GetQuantsUnshuffled()
 {
     return quants_unshuffled;
 }
+inline void SetGPULayers(bool layers)
+{
+    true_gpu_layers = layers;
+}
+inline bool GetGPULayers()
+{
+    return true_gpu_layers;
+}

 //TODO: integrate backwards compat
 static const quantize_fns_t quantize_fns_v2[GGML_TYPE_COUNT] = {

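The setter and getter above are declared with bool while the underlying true_gpu_layers counter is an int. A standalone sketch of the resulting behavior; the main() harness and static linkage are mine, and only the accessor bodies mirror the hunk.

#include <stdbool.h>
#include <stdio.h>

static int true_gpu_layers = 0;   /* counter added earlier in this commit */

static void SetGPULayers(bool layers) { true_gpu_layers = layers; }
static bool GetGPULayers(void)        { return true_gpu_layers; }

int main(void) {
    SetGPULayers(32);                       /* bool parameter collapses 32 to true, stored as 1 */
    printf("%d\n", (int) GetGPULayers());   /* prints 1, which still satisfies GetGPULayers() > 0 */
    return 0;
}
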
@@ -1074,6 +1074,7 @@ static void llama_model_load_internal(
     const int n_gpu = std::min(n_gpu_layers, int(hparams.n_layer));
     if(GetQuantsUnshuffled())
     {
+        SetGPULayers(n_gpu);

         fprintf(stderr, "%s: [opencl] offloading %d layers to GPU\n", __func__, n_gpu);
