Another faster f16 x f32 matrix multiply kernel
This commit is contained in:
parent
c9e38057f0
commit
9f353f0536
1 changed file with 1 addition and 0 deletions
|
@@ -682,6 +682,7 @@ kernel void kernel_mul_mat_f16_f32_l4(
|
||||||
device const half4 * x4 = (device const half4 *) (src0 + r0*nb01 + im/(ne12/ne02)*nb02);
|
device const half4 * x4 = (device const half4 *) (src0 + r0*nb01 + im/(ne12/ne02)*nb02);
|
||||||
|
|
||||||
for (int r1 = 0; r1 < nrows; ++r1) {
|
for (int r1 = 0; r1 < nrows; ++r1) {
|
||||||
|
|
||||||
device const float4 * y4 = (device const float4 *) (src1 + r1*nb11 + im*nb12);
|
device const float4 * y4 = (device const float4 *) (src1 + r1*nb11 + im*nb12);
|
||||||
|
|
||||||
float sumf = 0;
|
float sumf = 0;
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue