Trying IQ3_S without a lookup table

This commit is contained in:
Iwan Kawrakow 2024-03-01 11:52:17 +02:00
parent cb49e0f8c9
commit 9c752ff0d3
2 changed files with 384 additions and 192 deletions

View file

@ -2008,72 +2008,72 @@ static const __device__ uint32_t iq3xxs_grid[256] = {
0x3e1c1c1c, 0x3e1c3404, 0x3e24140c, 0x3e24240c, 0x3e2c0404, 0x3e2c0414, 0x3e2c1424, 0x3e341c04,
};
static const __device__ uint32_t iq3xs_grid[512] = {
0x04040404, 0x0404040c, 0x04040414, 0x0404042c, 0x0404043e, 0x04040c04, 0x04040c0c, 0x04040c14,
0x04040c24, 0x04040c34, 0x04041404, 0x0404140c, 0x0404142c, 0x04041c1c, 0x04042404, 0x04042414,
0x0404242c, 0x0404243e, 0x04042c0c, 0x04042c1c, 0x04043404, 0x04043414, 0x04043e0c, 0x04043e24,
0x04043e3e, 0x040c0404, 0x040c040c, 0x040c0414, 0x040c0424, 0x040c0c04, 0x040c0c0c, 0x040c0c2c,
0x040c1404, 0x040c141c, 0x040c143e, 0x040c1c0c, 0x040c1c2c, 0x040c2424, 0x040c340c, 0x040c342c,
0x040c3e14, 0x04140404, 0x0414040c, 0x0414042c, 0x0414043e, 0x04140c04, 0x04140c1c, 0x04140c34,
0x0414140c, 0x0414142c, 0x04141c04, 0x04141c24, 0x04142414, 0x0414242c, 0x0414243e, 0x04142c0c,
0x04142c1c, 0x04143e04, 0x04143e1c, 0x041c041c, 0x041c0c0c, 0x041c0c2c, 0x041c1404, 0x041c1414,
0x041c1c0c, 0x041c1c1c, 0x041c1c34, 0x041c2424, 0x041c2c04, 0x041c2c14, 0x041c343e, 0x041c3e0c,
0x041c3e2c, 0x04240404, 0x04240c1c, 0x04240c3e, 0x0424140c, 0x04241424, 0x04241c14, 0x04242404,
0x0424241c, 0x04242c0c, 0x04243e04, 0x042c0414, 0x042c0424, 0x042c1404, 0x042c1414, 0x042c1434,
0x042c1c1c, 0x042c240c, 0x042c242c, 0x042c243e, 0x042c3434, 0x042c3e1c, 0x04340434, 0x04340c0c,
0x04340c1c, 0x04341c0c, 0x04342c14, 0x04343e0c, 0x043e0404, 0x043e0414, 0x043e0424, 0x043e1404,
0x043e1414, 0x043e1434, 0x043e1c1c, 0x043e2c04, 0x043e2c24, 0x0c040404, 0x0c04040c, 0x0c040414,
0x0c040424, 0x0c040c04, 0x0c040c0c, 0x0c040c1c, 0x0c040c2c, 0x0c040c3e, 0x0c041404, 0x0c041414,
0x0c041c0c, 0x0c041c24, 0x0c041c34, 0x0c042c24, 0x0c042c34, 0x0c04340c, 0x0c043e14, 0x0c0c0404,
0x0c0c040c, 0x0c0c041c, 0x0c0c0434, 0x0c0c0c04, 0x0c0c0c24, 0x0c0c140c, 0x0c0c1c04, 0x0c0c1c1c,
0x0c0c240c, 0x0c0c2c04, 0x0c0c2c14, 0x0c0c3e04, 0x0c0c3e34, 0x0c140404, 0x0c140c14, 0x0c140c2c,
0x0c140c3e, 0x0c141404, 0x0c141424, 0x0c141c14, 0x0c142404, 0x0c14241c, 0x0c142c2c, 0x0c143404,
0x0c143e14, 0x0c1c040c, 0x0c1c0424, 0x0c1c043e, 0x0c1c0c04, 0x0c1c0c1c, 0x0c1c140c, 0x0c1c143e,
0x0c1c1c04, 0x0c1c1c24, 0x0c1c240c, 0x0c1c3414, 0x0c1c3e04, 0x0c24041c, 0x0c24042c, 0x0c240c14,
0x0c240c24, 0x0c241c0c, 0x0c241c1c, 0x0c242414, 0x0c242434, 0x0c242c04, 0x0c242c24, 0x0c2c040c,
0x0c2c0c04, 0x0c2c0c1c, 0x0c2c140c, 0x0c2c1c04, 0x0c2c1c14, 0x0c2c2c0c, 0x0c341404, 0x0c341424,
0x0c34143e, 0x0c342424, 0x0c342434, 0x0c3e040c, 0x0c3e041c, 0x0c3e0c04, 0x0c3e0c14, 0x0c3e140c,
0x0c3e1c2c, 0x0c3e240c, 0x0c3e3414, 0x0c3e3e04, 0x14040404, 0x1404040c, 0x1404041c, 0x1404042c,
0x1404043e, 0x14040c04, 0x14040c14, 0x14040c24, 0x14040c34, 0x1404140c, 0x1404141c, 0x1404143e,
0x14041c04, 0x14041c14, 0x1404240c, 0x1404241c, 0x1404242c, 0x14042c04, 0x14042c14, 0x1404343e,
0x14043e04, 0x14043e1c, 0x14043e2c, 0x140c0404, 0x140c0414, 0x140c0c04, 0x140c0c1c, 0x140c0c3e,
0x140c1414, 0x140c142c, 0x140c1c0c, 0x140c1c24, 0x140c2414, 0x140c2c0c, 0x1414040c, 0x14140424,
0x1414043e, 0x1414140c, 0x1414141c, 0x14141c04, 0x14141c3e, 0x1414240c, 0x14142c1c, 0x14142c3e,
0x14143e0c, 0x14143e24, 0x141c0404, 0x141c0414, 0x141c042c, 0x141c0c0c, 0x141c1414, 0x141c1424,
0x141c1c0c, 0x141c1c1c, 0x141c2414, 0x141c2c04, 0x141c3434, 0x1424040c, 0x1424043e, 0x14241404,
0x1424141c, 0x14241c14, 0x14241c2c, 0x1424240c, 0x14243e14, 0x14243e2c, 0x142c0424, 0x142c0c0c,
0x142c1414, 0x142c1c3e, 0x142c2404, 0x142c2c1c, 0x142c3e04, 0x14340404, 0x14340414, 0x1434043e,
0x1434140c, 0x14342c2c, 0x1434340c, 0x143e042c, 0x143e0c0c, 0x143e1434, 0x143e1c04, 0x143e241c,
0x143e2c04, 0x1c040414, 0x1c040c0c, 0x1c040c1c, 0x1c040c2c, 0x1c040c3e, 0x1c041414, 0x1c041c0c,
0x1c041c1c, 0x1c041c2c, 0x1c042414, 0x1c042424, 0x1c04243e, 0x1c042c0c, 0x1c04341c, 0x1c043e0c,
0x1c0c040c, 0x1c0c041c, 0x1c0c042c, 0x1c0c0c24, 0x1c0c140c, 0x1c0c141c, 0x1c0c2404, 0x1c0c3404,
0x1c0c3e14, 0x1c0c3e34, 0x1c140404, 0x1c140c14, 0x1c141404, 0x1c141c14, 0x1c141c24, 0x1c142c04,
0x1c1c040c, 0x1c1c0c04, 0x1c1c0c24, 0x1c1c140c, 0x1c1c141c, 0x1c1c143e, 0x1c1c1c04, 0x1c1c240c,
0x1c1c241c, 0x1c1c243e, 0x1c1c2c2c, 0x1c1c3e1c, 0x1c24041c, 0x1c240c0c, 0x1c240c34, 0x1c241414,
0x1c241c0c, 0x1c242c14, 0x1c243404, 0x1c243424, 0x1c2c040c, 0x1c2c0c04, 0x1c2c0c14, 0x1c2c142c,
0x1c2c1c14, 0x1c2c2424, 0x1c2c2c34, 0x1c2c3e1c, 0x1c340c34, 0x1c34240c, 0x1c3e040c, 0x1c3e041c,
0x1c3e1404, 0x1c3e1414, 0x1c3e1c2c, 0x24040404, 0x24040424, 0x24040c14, 0x24041404, 0x24041424,
0x2404143e, 0x24041c14, 0x2404240c, 0x24042c04, 0x24043e04, 0x240c0414, 0x240c043e, 0x240c0c0c,
0x240c0c1c, 0x240c1414, 0x240c1c04, 0x240c1c2c, 0x240c241c, 0x240c2c0c, 0x240c2c2c, 0x2414040c,
0x2414041c, 0x24140c04, 0x24140c2c, 0x2414140c, 0x24141c1c, 0x24142404, 0x24142c3e, 0x24143414,
0x24143e04, 0x241c0424, 0x241c0c0c, 0x241c0c1c, 0x241c1404, 0x241c1414, 0x241c1c0c, 0x241c1c2c,
0x24240404, 0x24240414, 0x24241424, 0x24241c3e, 0x24242404, 0x24243e0c, 0x242c042c, 0x242c043e,
0x242c140c, 0x242c3414, 0x24340c1c, 0x24341c24, 0x24343404, 0x243e0c04, 0x243e0c2c, 0x243e1c04,
0x243e241c, 0x243e2c0c, 0x2c040414, 0x2c040c04, 0x2c040c24, 0x2c041414, 0x2c042404, 0x2c042424,
0x2c04243e, 0x2c042c14, 0x2c043434, 0x2c043e24, 0x2c0c040c, 0x2c0c041c, 0x2c0c042c, 0x2c0c0c14,
0x2c0c140c, 0x2c0c1c14, 0x2c0c3e14, 0x2c140404, 0x2c140c0c, 0x2c14141c, 0x2c141c04, 0x2c141c34,
0x2c142c1c, 0x2c1c0414, 0x2c1c043e, 0x2c1c0c04, 0x2c1c143e, 0x2c1c2424, 0x2c1c2c0c, 0x2c1c342c,
0x2c1c3e1c, 0x2c24040c, 0x2c240424, 0x2c241404, 0x2c241c14, 0x2c242434, 0x2c2c0c14, 0x2c2c1434,
0x2c2c2c0c, 0x2c2c2c1c, 0x2c342414, 0x2c3e0414, 0x2c3e0424, 0x2c3e1414, 0x34040c0c, 0x34040c1c,
0x34040c2c, 0x34041c0c, 0x34041c1c, 0x34043404, 0x340c0404, 0x340c1404, 0x340c143e, 0x340c3424,
0x34140c14, 0x34141c24, 0x34142414, 0x34142c2c, 0x34143414, 0x34143e04, 0x341c0404, 0x341c0c24,
0x341c140c, 0x341c2404, 0x3424142c, 0x3424241c, 0x34243414, 0x342c0404, 0x342c041c, 0x342c1c24,
0x342c3404, 0x3434042c, 0x34342404, 0x343e0c0c, 0x343e0c1c, 0x3e040404, 0x3e040424, 0x3e04043e,
0x3e041404, 0x3e041414, 0x3e041c34, 0x3e042404, 0x3e042c24, 0x3e043414, 0x3e0c0414, 0x3e0c0c0c,
0x3e0c1424, 0x3e0c241c, 0x3e0c242c, 0x3e14040c, 0x3e140424, 0x3e140c04, 0x3e140c34, 0x3e14140c,
0x3e141c04, 0x3e142c0c, 0x3e1c0414, 0x3e1c1c14, 0x3e1c1c2c, 0x3e1c2c1c, 0x3e24040c, 0x3e24042c,
0x3e240c1c, 0x3e241404, 0x3e242c04, 0x3e2c1414, 0x3e2c2414, 0x3e340414, 0x3e341c0c, 0x3e3e0404,
};
//static const __device__ uint32_t iq3xs_grid[512] = {
//0x04040404, 0x04142c14, 0x042c2424, 0x0404143c, 0x04140c0c, 0x042c0424, 0x04043434, 0x041c240c,
//0x04341c1c, 0x040c0c34, 0x041c0404, 0x0434341c, 0x040c2c2c, 0x04241c04, 0x043c1414, 0x0414042c,
//0x04243c04, 0x04042c14, 0x04142424, 0x042c143c, 0x04040c0c, 0x0c1c0424, 0x0c2c3434, 0x0c04240c,
//0x0c1c141c, 0x0c340c34, 0x0c0c0404, 0x0c24341c, 0x0c34242c, 0x0c0c1c04, 0x0c240c14, 0x0c3c042c,
//0x0c143404, 0x0c2c2c14, 0x14041c24, 0x1414143c, 0x142c040c, 0x14043c24, 0x141c2c34, 0x1434240c,
//0x140c141c, 0x141c0c34, 0x14340404, 0x140c341c, 0x1424242c, 0x143c1c04, 0x14140c14, 0x1424042c,
//0x1c043404, 0x1c142414, 0x1c2c1c24, 0x1c040c3c, 0x1c1c040c, 0x1c2c3424, 0x1c042c34, 0x1c1c1c0c,
//0x1c34141c, 0x1c0c0434, 0x1c243c04, 0x1c342c1c, 0x1c0c242c, 0x1c241404, 0x243c0c14, 0x2414042c,
//0x242c3404, 0x24042414, 0x24141c24, 0x242c0c3c, 0x2404040c, 0x241c3424, 0x24342434, 0x24041c0c,
//0x241c0c1c, 0x24340434, 0x240c3404, 0x2c242c1c, 0x2c3c1c2c, 0x2c141404, 0x2c240414, 0x2c043c2c,
//0x2c142c04, 0x2c2c2414, 0x2c041424, 0x2c1c0c3c, 0x2c2c040c, 0x2c043424, 0x2c1c2434, 0x2c341c0c,
//0x2c0c0c1c, 0x34240434, 0x34343404, 0x340c2c1c, 0x34241c2c, 0x343c1404, 0x34140414, 0x342c342c,
//0x34042c04, 0x34141c14, 0x342c1424, 0x3404043c, 0x341c3c0c, 0x34342c24, 0x3c042434, 0x3c1c140c,
//0x3c340c1c, 0x3c0c0434, 0x3c243404, 0x3c3c241c, 0x3c0c1c2c, 0x04240c04, 0x04040414, 0x0414342c,
//0x042c2c04, 0x04041c14, 0x041c1424, 0x042c043c, 0x04043c0c, 0x041c2c24, 0x04341c34, 0x040c140c,
//0x0424041c, 0x04343c34, 0x040c2c04, 0x0424241c, 0x043c142c, 0x04140c04, 0x042c0414, 0x0404342c,
//0x04142404, 0x042c1c14, 0x0c040c24, 0x0c1c043c, 0x0c34340c, 0x0c042c24, 0x0c1c1c34, 0x0c34140c,
//0x0c0c041c, 0x0c243c34, 0x0c3c2c04, 0x0c0c241c, 0x0c24142c, 0x0c040404, 0x0c143c14, 0x142c2c2c,
//0x14042404, 0x14141414, 0x142c0c24, 0x1404043c, 0x141c340c, 0x14342424, 0x140c1c34, 0x14240c0c,
//0x1434041c, 0x140c3434, 0x14242c04, 0x143c1c1c, 0x1414142c, 0x1c2c0404, 0x1c043c14, 0x1c142c2c,
//0x1c2c2404, 0x1c041414, 0x1c1c0c24, 0x1c343c3c, 0x1c042c0c, 0x1c1c2424, 0x1c341434, 0x1c0c0c0c,
//0x1c24041c, 0x1c3c3434, 0x240c2404, 0x24241c1c, 0x24040c2c, 0x24140404, 0x242c3414, 0x24042c2c,
//0x24141c04, 0x242c1414, 0x24040424, 0x241c3c3c, 0x24342c0c, 0x240c2424, 0x241c1434, 0x24340c0c,
//0x2c0c041c, 0x2c243434, 0x2c3c2404, 0x2c14141c, 0x2c2c0c2c, 0x2c040404, 0x2c143414, 0x2c2c242c,
//0x2c041c04, 0x2c1c0c14, 0x2c340424, 0x2c04343c, 0x2c1c2c0c, 0x2c341c24, 0x340c1434, 0x3424040c,
//0x343c3c1c, 0x340c2c34, 0x34242404, 0x3404141c, 0x34140c2c, 0x342c0404, 0x34043414, 0x3414242c,
//0x342c1c04, 0x34040c14, 0x341c0424, 0x3c34343c, 0x3c0c240c, 0x3c1c1c24, 0x3c340c34, 0x3c0c040c,
//0x3c24341c, 0x3c3c2c34, 0x04141c04, 0x0424141c, 0x0404042c, 0x04143c04, 0x042c2c14, 0x0404242c,
//0x041c1404, 0x04340c14, 0x04040424, 0x041c343c, 0x0434240c, 0x040c1c24, 0x04240c34, 0x043c040c,
//0x040c341c, 0x04242434, 0x04041c04, 0x04140c1c, 0x042c042c, 0x04043404, 0x0c142c14, 0x0c2c1c2c,
//0x0c041404, 0x0c1c0414, 0x0c343c24, 0x0c0c2c3c, 0x0c1c240c, 0x0c341424, 0x0c0c0c34, 0x0c24040c,
//0x0c3c341c, 0x0c142434, 0x0c241c04, 0x0c040c1c, 0x1414042c, 0x142c3404, 0x14042c14, 0x141c1c2c,
//0x142c1404, 0x14040414, 0x141c3424, 0x14342c3c, 0x140c1c0c, 0x14241424, 0x143c0434, 0x140c3c0c,
//0x14242c1c, 0x1c042434, 0x1c141404, 0x1c2c0c1c, 0x1c04042c, 0x1c143404, 0x1c2c2414, 0x1c041c2c,
//0x1c1c0c04, 0x1c340414, 0x1c0c3424, 0x1c1c2c3c, 0x1c341c0c, 0x1c0c1424, 0x1c240434, 0x243c3c0c,
//0x24142c1c, 0x24241c34, 0x24041404, 0x2414041c, 0x242c3c2c, 0x24042c04, 0x241c2414, 0x242c142c,
//0x24040c04, 0x241c0414, 0x24343424, 0x240c243c, 0x24241c0c, 0x2c340c24, 0x2c0c0434, 0x2c24340c,
//0x2c3c2c1c, 0x2c141c34, 0x2c2c1404, 0x2c04041c, 0x2c143c2c, 0x2c2c2c04, 0x2c042414, 0x2c1c142c,
//0x2c340404, 0x2c0c3c14, 0x341c2c24, 0x3434243c, 0x340c140c, 0x34240c24, 0x343c0434, 0x3414340c,
//0x3424241c, 0x34041c34, 0x34140c04, 0x342c041c, 0x3404342c, 0x341c2c04, 0x342c1c14, 0x3404142c,
//0x3c1c0404, 0x3c343c14, 0x3c0c2c24, 0x3c24243c, 0x3c34140c, 0x3c0c0c24, 0x3c243c34, 0x043c2c0c,
//0x0414241c, 0x042c1434, 0x04040c04, 0x0414041c, 0x042c342c, 0x04042404, 0x041c1c14, 0x04340c2c,
//0x040c0404, 0x041c3414, 0x04342c24, 0x040c1c3c, 0x0424140c, 0x043c0424, 0x04143c34, 0x04242c0c,
//0x0404241c, 0x04141434, 0x042c0c04, 0x0c04041c, 0x0c1c342c, 0x0c2c2404, 0x0c041414, 0x0c1c0c2c,
//0x0c340404, 0x0c0c3414, 0x0c242424, 0x0c341c3c, 0x0c0c0c0c, 0x0c240424, 0x0c3c3434, 0x0c142c0c,
//0x0c2c1c1c, 0x14041434, 0x14140404, 0x142c3c1c, 0x14042c2c, 0x141c2404, 0x14341414, 0x14040c2c,
//0x141c0404, 0x14343414, 0x140c2424, 0x14241c3c, 0x143c0c0c, 0x14140424, 0x1c243434, 0x1c04240c,
//0x1c141c1c, 0x1c2c0c34, 0x1c040404, 0x1c1c341c, 0x1c2c2c2c, 0x1c041c04, 0x1c1c1414, 0x1c34042c,
//0x1c0c3c04, 0x1c242c14, 0x1c342424, 0x1c0c143c, 0x24240c0c, 0x243c0424, 0x24143434, 0x242c240c,
//0x24041c1c, 0x24140c34, 0x242c0404, 0x2404341c, 0x241c242c, 0x24341c04, 0x24040c14, 0x241c042c,
//0x24343404, 0x2c0c2c14, 0x2c241c24, 0x2c3c143c, 0x2c0c040c, 0x2c243c24, 0x2c042c34, 0x2c14240c,
//0x2c2c141c, 0x2c040c34, 0x2c1c0404, 0x2c2c341c, 0x2c04242c, 0x2c1c1c04, 0x2c340c14, 0x340c042c,
//0x34243404, 0x34342c14, 0x340c1c24, 0x34240c3c, 0x343c040c, 0x34143424, 0x342c2c34, 0x34041c0c,
//0x3414141c, 0x342c0434, 0x34043c04, 0x341c2c1c, 0x3434242c, 0x3c041404, 0x3c1c0c14, 0x3c34042c,
//0x3c0c3404, 0x3c242414, 0x3c3c1c24, 0x040c0c3c, 0x0424040c, 0x04043424, 0x04142c34, 0x042c1c0c,
//0x0404141c, 0x04140434, 0x042c3c04, 0x04042c1c, 0x041c1c2c, 0x04341404, 0x040c0414, 0x041c3c2c,
//0x04342c04, 0x040c2414, 0x04241424, 0x043c0c3c, 0x0414040c, 0x042c3424, 0x04042434, 0x04141c0c,
//0x0c2c0c1c, 0x0c040434, 0x0c1c3404, 0x0c342c1c, 0x0c041c2c, 0x0c1c1404, 0x0c340414, 0x0c0c3c2c,
//0x0c242c04, 0x0c3c2414, 0x0c0c1424, 0x0c24043c, 0x0c043c0c, 0x14142c24, 0x142c2434, 0x1404140c,
//0x14140c1c, 0x142c0434, 0x14043404, 0x141c241c, 0x14341c2c, 0x140c0c04, 0x141c0414, 0x1434342c,
//0x140c2c04, 0x14241c14, 0x143c1424, 0x1c14043c, 0x1c243c0c, 0x1c042c24, 0x1c142434, 0x1c2c140c,
//0x1c040c1c, 0x1c1c3c34, 0x1c342c04, 0x1c04241c, 0x1c1c142c, 0x1c340c04, 0x1c0c0414, 0x1c24342c,
//0x1c3c2404, 0x240c1c14, 0x24240c24, 0x2404043c, 0x2414340c, 0x242c2c24, 0x24041c34, 0x2414140c,
//0x242c041c, 0x24043c34, 0x241c2c04, 0x2434241c, 0x240c142c, 0x241c0c04, 0x2c340414, 0x2c0c342c,
//};
static const __device__ uint64_t iq1s_grid[512] = {
@ -2370,6 +2370,12 @@ static __global__ void dequantize_block_iq3_xxs(const void * __restrict__ vx, ds
}
//#define IQ3S_MULTIPLIER 2469109
//#define IQ3S_MULTIPLIER 746226
//#define IQ3S_MULTIPLIER 717154
#define IQ3S_MULTIPLIER 677595
//static const __device__ uint8_t iq3s_values[16] = {1, 1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15};
template<typename dst_t>
static __global__ void dequantize_block_iq3_s(const void * __restrict__ vx, dst_t * __restrict__ yy) {
@ -2382,14 +2388,25 @@ static __global__ void dequantize_block_iq3_s(const void * __restrict__ vx, dst_
const int ib = tid%8; // 0...7
dst_t * y = yy + i*QK_K + 32*ib + 8*il;
const uint8_t * qs = x[i].qs + 8*ib;
const uint8_t * grid1 = (const uint8_t *)(iq3xs_grid + (qs[2*il+0] | ((x[i].qh[ib] << (8-2*il)) & 256)));
const uint8_t * grid2 = (const uint8_t *)(iq3xs_grid + (qs[2*il+1] | ((x[i].qh[ib] << (7-2*il)) & 256)));
const float d = (float)x[i].d * (0.5f + ((x[i].scales[ib/2] >> 4*(ib%2)) & 0xf)) * 0.5f;
int32_t aux32[2];
const uint8_t * grid = (const uint8_t *)aux32;
const float d = (float)x[i].d * (1 + 2*((x[i].scales[ib/2] >> 4*(ib%2)) & 0xf));
const uint8_t signs = x[i].signs[4*ib + il];
for (int j = 0; j < 4; ++j) {
y[j+0] = d * grid1[j] * (signs & kmask_iq2xs[j+0] ? -1.f : 1.f);
y[j+4] = d * grid2[j] * (signs & kmask_iq2xs[j+4] ? -1.f : 1.f);
aux32[0] = ((qs[2*il+0] | ((x[i].qh[ib] << (8-2*il)) & 256)) * IQ3S_MULTIPLIER) & 0x0f0f0f0f;
aux32[1] = ((qs[2*il+1] | ((x[i].qh[ib] << (7-2*il)) & 256)) * IQ3S_MULTIPLIER) & 0x0f0f0f0f;
for (int j = 0; j < 8; ++j) {
//y[j] = d * (2*((grid[j]-1)/2) + 1) * (signs & kmask_iq2xs[j] ? -1.f : 1.f);
//y[j] = d * iq3s_values[grid[j]] * (signs & kmask_iq2xs[j] ? -1.f : 1.f);
y[j] = d * (2*(((grid[j]+1)/2) & 7) + 1) * (signs & kmask_iq2xs[j] ? -1.f : 1.f);
}
// const uint8_t * grid1 = (const uint8_t *)(iq3xs_grid + (qs[2*il+0] | ((x[i].qh[ib] << (8-2*il)) & 256)));
// const uint8_t * grid2 = (const uint8_t *)(iq3xs_grid + (qs[2*il+1] | ((x[i].qh[ib] << (7-2*il)) & 256)));
// const float d = (float)x[i].d * (0.5f + ((x[i].scales[ib/2] >> 4*(ib%2)) & 0xf)) * 0.5f;
// const uint8_t signs = x[i].signs[4*ib + il];
// for (int j = 0; j < 4; ++j) {
// y[j+0] = d * grid1[j] * (signs & kmask_iq2xs[j+0] ? -1.f : 1.f);
// y[j+4] = d * grid2[j] * (signs & kmask_iq2xs[j+4] ? -1.f : 1.f);
// }
#else
assert(false);
#endif
@ -5196,22 +5213,36 @@ static __device__ __forceinline__ float vec_dot_iq3_s_q8_1(
#if QK_K == 256
const block_iq3_s * bq2 = (const block_iq3_s *) vbq;
uint32_t aux32[2];
const uint8_t * grid = (const uint8_t *)aux32;
const int ib32 = iqs;
const uint8_t * qs = bq2->qs + 8*ib32;
const int8_t * q8 = bq8_1[ib32].qs;
int sumi = 0;
for (int l = 0; l < 4; ++l) {
const uint32_t * grid1 = iq3xs_grid + (qs[2*l+0] | ((bq2->qh[ib32] << (8 - 2*l)) & 256));
const uint32_t * grid2 = iq3xs_grid + (qs[2*l+1] | ((bq2->qh[ib32] << (7 - 2*l)) & 256));
aux32[0] = ((qs[2*l+0] | ((bq2->qh[ib32] << (8 - 2*l)) & 256)) * IQ3S_MULTIPLIER) & 0x0f0f0f0f;
aux32[1] = ((qs[2*l+1] | ((bq2->qh[ib32] << (7 - 2*l)) & 256)) * IQ3S_MULTIPLIER) & 0x0f0f0f0f;
aux32[0] = __vadd4(((__vadd4(aux32[0], 0x01010101) >> 1) & 0x07070707) << 1, 0x01010101);
aux32[1] = __vadd4(((__vadd4(aux32[1], 0x01010101) >> 1) & 0x07070707) << 1, 0x01010101);
uint32_t signs0 = __vcmpeq4(((bq2->signs[4*ib32+l] & 0xf) * 0x01010101) & 0x08040201, 0x08040201);
uint32_t signs1 = __vcmpeq4(((bq2->signs[4*ib32+l] >> 4) * 0x01010101) & 0x08040201, 0x08040201);
const int grid_l = __vsub4(grid1[0] ^ signs0, signs0);
const int grid_h = __vsub4(grid2[0] ^ signs1, signs1);
const int grid_l = __vsub4(aux32[0] ^ signs0, signs0);
const int grid_h = __vsub4(aux32[1] ^ signs1, signs1);
sumi = __dp4a(grid_l, *((int *)q8+0), sumi);
sumi = __dp4a(grid_h, *((int *)q8+1), sumi);
//const uint32_t * grid1 = iq3xs_grid + (qs[2*l+0] | ((bq2->qh[ib32] << (8 - 2*l)) & 256));
//const uint32_t * grid2 = iq3xs_grid + (qs[2*l+1] | ((bq2->qh[ib32] << (7 - 2*l)) & 256));
//uint32_t signs0 = __vcmpeq4(((bq2->signs[4*ib32+l] & 0xf) * 0x01010101) & 0x08040201, 0x08040201);
//uint32_t signs1 = __vcmpeq4(((bq2->signs[4*ib32+l] >> 4) * 0x01010101) & 0x08040201, 0x08040201);
//const int grid_l = __vsub4(grid1[0] ^ signs0, signs0);
//const int grid_h = __vsub4(grid2[0] ^ signs1, signs1);
//sumi = __dp4a(grid_l, *((int *)q8+0), sumi);
//sumi = __dp4a(grid_h, *((int *)q8+1), sumi);
q8 += 8;
}
const float d = (float)bq2->d * (0.5f + ((bq2->scales[ib32/2] >> 4*(ib32%2)) & 0xf)) * __low2float(bq8_1[ib32].ds) * 0.5f;
//const float d = (float)bq2->d * (0.5f + ((bq2->scales[ib32/2] >> 4*(ib32%2)) & 0xf)) * __low2float(bq8_1[ib32].ds) * 0.5f;
const float d = (float)bq2->d * (1 + 2*((bq2->scales[ib32/2] >> 4*(ib32%2)) & 0xf)) * __low2float(bq8_1[ib32].ds);
return d * sumi;
#else
assert(false);