Use pinned memory for f16 preprocessing
This commit is contained in:
parent e4903957ec
commit 105fd199be
3 changed files with 10 additions and 6 deletions
|
@ -1447,7 +1447,7 @@ static void ggml_vk_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * sr
|
||||||
ggml_vk_pool_free(d_D);
|
ggml_vk_pool_free(d_D);
|
||||||
}
|
}
|
||||||
|
|
||||||
static void ggml_vk_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, void * wdata) {
|
static void ggml_vk_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
|
||||||
#ifdef VK_DEBUG
|
#ifdef VK_DEBUG
|
||||||
std::cerr << "ggml_vk_mul_mat_f16((type=" << src0->type << ", ne0=" << src0->ne[0] << ", ne1=" << src0->ne[1] << ", ne2=" << src0->ne[2] << ", ne3=" << src0->ne[3];
|
std::cerr << "ggml_vk_mul_mat_f16((type=" << src0->type << ", ne0=" << src0->ne[0] << ", ne1=" << src0->ne[1] << ", ne2=" << src0->ne[2] << ", ne3=" << src0->ne[3];
|
||||||
std::cerr << "), (type=" << src1->type << ", ne0=" << src1->ne[0] << ", ne1=" << src1->ne[1] << ", ne2=" << src1->ne[2] << ", ne3=" << src1->ne[3];
|
std::cerr << "), (type=" << src1->type << ", ne0=" << src1->ne[0] << ", ne1=" << src1->ne[1] << ", ne2=" << src1->ne[2] << ", ne3=" << src1->ne[3];
|
||||||
|
@ -1503,6 +1503,8 @@ static void ggml_vk_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * sr
|
||||||
|
|
||||||
const bool load_x = src1->backend != GGML_BACKEND_GPU;
|
const bool load_x = src1->backend != GGML_BACKEND_GPU;
|
||||||
|
|
||||||
|
ggml_fp16_t * fp16_staging = (ggml_fp16_t *) ggml_vk_host_malloc(sizeof(ggml_fp16_t) * (ne11 * ne10) * (ne02 * ne03));
|
||||||
|
|
||||||
for (int64_t i03 = 0; i03 < ne03; i03++) {
|
for (int64_t i03 = 0; i03 < ne03; i03++) {
|
||||||
for (int64_t i02 = 0; i02 < ne02; i02++) {
|
for (int64_t i02 = 0; i02 < ne02; i02++) {
|
||||||
const bool first = i03 == 0 && i02 == 0;
|
const bool first = i03 == 0 && i02 == 0;
|
||||||
|
@ -1529,7 +1531,7 @@ static void ggml_vk_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * sr
|
||||||
// convert src1 to fp16
|
// convert src1 to fp16
|
||||||
// TODO: use multiple threads
|
// TODO: use multiple threads
|
||||||
// TODO: This memory isn't pinned
|
// TODO: This memory isn't pinned
|
||||||
ggml_fp16_t * const tmp = (ggml_fp16_t *) wdata + (ne11 * ne10) * (i03 * ne02 + i02);
|
ggml_fp16_t * const tmp = fp16_staging + (ne11 * ne10) * (i03 * ne02 + i02);
|
||||||
char * src1i = (char *) src1->data + i03*nb13 + i02*nb12;
|
char * src1i = (char *) src1->data + i03*nb13 + i02*nb12;
|
||||||
if (src1_cont_rows) {
|
if (src1_cont_rows) {
|
||||||
if (src1_cont_cols) {
|
if (src1_cont_cols) {
|
||||||
|
@ -1580,6 +1582,8 @@ static void ggml_vk_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * sr
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
ggml_vk_host_free(fp16_staging);
|
||||||
|
|
||||||
ggml_vk_submit(vk_transfer_queues[0], transfer_0_seqs, VK_NULL_HANDLE);
|
ggml_vk_submit(vk_transfer_queues[0], transfer_0_seqs, VK_NULL_HANDLE);
|
||||||
|
|
||||||
// cleanup waits for the queue to be done
|
// cleanup waits for the queue to be done
|
||||||
|
@ -1795,7 +1799,7 @@ bool ggml_vk_mul_mat_use_f16(const struct ggml_tensor * src0, const struct ggml_
|
||||||
return mul_mat_f16_transfer < mul_mat_q_transfer;
|
return mul_mat_f16_transfer < mul_mat_q_transfer;
|
||||||
}
|
}
|
||||||
|
|
||||||
void ggml_vk_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst, void * wdata, size_t wsize) {
|
void ggml_vk_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst) {
|
||||||
GGML_ASSERT(ggml_vk_can_mul_mat(src0, src1, dst));
|
GGML_ASSERT(ggml_vk_can_mul_mat(src0, src1, dst));
|
||||||
|
|
||||||
if (src0->type == GGML_TYPE_F32) {
|
if (src0->type == GGML_TYPE_F32) {
|
||||||
|
@ -1803,7 +1807,7 @@ void ggml_vk_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor *
|
||||||
}
|
}
|
||||||
else if (src0->type == GGML_TYPE_F16) {
|
else if (src0->type == GGML_TYPE_F16) {
|
||||||
if (ggml_vk_mul_mat_use_f16(src0, src1, dst)) {
|
if (ggml_vk_mul_mat_use_f16(src0, src1, dst)) {
|
||||||
ggml_vk_mul_mat_f16(src0, src1, dst, wdata);
|
ggml_vk_mul_mat_f16(src0, src1, dst);
|
||||||
}
|
}
|
||||||
else {
|
else {
|
||||||
ggml_vk_mul_mat_q_f32(src0, src1, dst);
|
ggml_vk_mul_mat_q_f32(src0, src1, dst);
|
||||||
|
|
|
@ -11,7 +11,7 @@ void ggml_vk_init(void);
|
||||||
void ggml_vk_mul(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst);
|
void ggml_vk_mul(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst);
|
||||||
bool ggml_vk_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst);
|
bool ggml_vk_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst);
|
||||||
size_t ggml_vk_mul_mat_get_wsize(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst);
|
size_t ggml_vk_mul_mat_get_wsize(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst);
|
||||||
void ggml_vk_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst, void * wdata, size_t wsize);
|
void ggml_vk_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst);
|
||||||
|
|
||||||
void * ggml_vk_host_malloc(size_t size);
|
void * ggml_vk_host_malloc(size_t size);
|
||||||
void ggml_vk_host_free(void * ptr);
|
void ggml_vk_host_free(void * ptr);
|
||||||
|
|
2
ggml.c
2
ggml.c
|
@ -10745,7 +10745,7 @@ static void ggml_compute_forward_mul_mat(
|
||||||
#elif defined(GGML_USE_VULKAN)
|
#elif defined(GGML_USE_VULKAN)
|
||||||
if (ggml_vk_can_mul_mat(src0, src1, dst)) {
|
if (ggml_vk_can_mul_mat(src0, src1, dst)) {
|
||||||
if (params->ith == 0 && params->type == GGML_TASK_COMPUTE) {
|
if (params->ith == 0 && params->type == GGML_TASK_COMPUTE) {
|
||||||
ggml_vk_mul_mat(src0, src1, dst, params->wdata, params->wsize);
|
ggml_vk_mul_mat(src0, src1, dst);
|
||||||
}
|
}
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue