vulkan: Implement "fast divide" (mul+shift) for unary ops like copy (#10642)

This commit is contained in:
Jeff Bolz 2024-12-04 01:28:59 -06:00 committed by GitHub
parent 40c6d79fb5
commit 2759916d86
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
3 changed files with 66 additions and 8 deletions

View file

@ -8,6 +8,13 @@ layout (push_constant) uniform parameter
uint ne10; uint ne11; uint ne12; uint ne13; uint nb10; uint nb11; uint nb12; uint nb13;
uint d_offset;
float param1; float param2;
uint ne0_012mp; uint ne0_012L;
uint ne0_01mp; uint ne0_01L;
uint ne0_0mp; uint ne0_0L;
uint ne1_012mp; uint ne1_012L;
uint ne1_01mp; uint ne1_01L;
uint ne1_0mp; uint ne1_0L;
} p;
layout (binding = 0) readonly buffer A {A_TYPE data_a[];};
@ -17,22 +24,30 @@ uint get_idx() {
// Flatten the 3D global invocation ID into a single linear index:
// x + 512*y + 262144*z (262144 == 512*512).
return gl_GlobalInvocationID.z * 262144 + gl_GlobalInvocationID.y * 512 + gl_GlobalInvocationID.x;
}
// Multiply-and-shift replacement for an unsigned division by a fixed divisor.
// The pair (mp, L) is precomputed for that divisor elsewhere; see
// init_fastdiv_values in ggml-vulkan.cpp.
//
//   n  - dividend
//   mp - precomputed multiplier
//   L  - precomputed right-shift amount
//
// Returns (mulhi(n, mp) + n) >> L, where mulhi is the upper 32 bits of the
// 64-bit product n * mp.
// NOTE(review): the 32-bit sum can wrap for large n — confirm callers keep n
// in the range for which the (mp, L) pair is exact.
uint fastdiv(uint n, uint mp, uint L) {
    uint hi;
    uint lo;
    // hi receives the high word of n * mp; lo (the low word) is not used.
    umulExtended(n, mp, hi, lo);
    const uint biased = hi + n;
    return biased >> L;
}
// Translate a flat element index into an offset into src0.
//
// idx is split into coordinates (i03, i02, i01, i00) using the extents
// p.ne00, p.ne01, p.ne02, and the coordinates are then combined with the
// strides p.nb00..p.nb03 (units are whatever the host stores in nb0x).
//
// Each quotient is taken with fastdiv() and a precomputed (mp, L) pair from
// the push constants instead of an integer division; the division each call
// replaces is noted next to it. This assumes the host fills every pair for
// exactly that divisor (see init_fastdiv_values in ggml-vulkan.cpp).
uint src0_idx(uint idx) {
    // i03 = idx / (ne02*ne01*ne00)
    const uint i03 = fastdiv(idx, p.ne0_012mp, p.ne0_012L);
    const uint i03_offset = i03 * p.ne02*p.ne01*p.ne00;

    // i02 = (idx - i03_offset) / (ne01*ne00)
    const uint i02 = fastdiv(idx - i03_offset, p.ne0_01mp, p.ne0_01L);
    const uint i02_offset = i02*p.ne01*p.ne00;

    // i01 = (idx - i03_offset - i02_offset) / ne00
    const uint i01 = fastdiv(idx - i03_offset - i02_offset, p.ne0_0mp, p.ne0_0L);

    // Whatever is left after removing the three outer coordinates.
    const uint i00 = idx - i03_offset - i02_offset - i01*p.ne00;

    return i03*p.nb03 + i02*p.nb02 + i01*p.nb01 + i00*p.nb00;
}
// Translate a flat element index into an offset into the destination.
//
// idx is split into coordinates (i13, i12, i11, i10) using the extents
// p.ne10, p.ne11, p.ne12, and the coordinates are then combined with the
// strides p.nb10..p.nb13 (units are whatever the host stores in nb1x).
//
// Each quotient is taken with fastdiv() and a precomputed (mp, L) pair from
// the push constants instead of an integer division; the division each call
// replaces is noted next to it. This assumes the host fills every pair for
// exactly that divisor (see init_fastdiv_values in ggml-vulkan.cpp).
uint dst_idx(uint idx) {
    // i13 = idx / (ne12*ne11*ne10)
    const uint i13 = fastdiv(idx, p.ne1_012mp, p.ne1_012L);
    const uint i13_offset = i13 * p.ne12*p.ne11*p.ne10;

    // i12 = (idx - i13_offset) / (ne11*ne10)
    const uint i12 = fastdiv(idx - i13_offset, p.ne1_01mp, p.ne1_01L);
    const uint i12_offset = i12*p.ne11*p.ne10;

    // i11 = (idx - i13_offset - i12_offset) / ne10
    const uint i11 = fastdiv(idx - i13_offset - i12_offset, p.ne1_0mp, p.ne1_0L);

    // Whatever is left after removing the three outer coordinates.
    const uint i10 = idx - i13_offset - i12_offset - i11*p.ne10;

    return i13*p.nb13 + i12*p.nb12 + i11*p.nb11 + i10*p.nb10;
}