diff --git a/ggml-vulkan-shaders.hpp b/ggml-vulkan-shaders.hpp index 7a768e1ed..612c8669d 100644 --- a/ggml-vulkan-shaders.hpp +++ b/ggml-vulkan-shaders.hpp @@ -1793,7 +1793,7 @@ const uint64_t dequant_q3_K_len = 4804; unsigned char dequant_q4_0_data[] = { 0x03,0x02,0x23,0x07,0x00,0x05,0x01,0x00,0x0b,0x00,0x0d,0x00, -0xf7,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x11,0x00,0x02,0x00, +0x83,0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x11,0x00,0x02,0x00, 0x01,0x00,0x00,0x00,0x11,0x00,0x02,0x00,0x27,0x00,0x00,0x00, 0x11,0x00,0x02,0x00,0x51,0x11,0x00,0x00,0x11,0x00,0x02,0x00, 0x60,0x11,0x00,0x00,0x0b,0x00,0x06,0x00,0x01,0x00,0x00,0x00, @@ -1801,43 +1801,43 @@ unsigned char dequant_q4_0_data[] = { 0x00,0x00,0x00,0x00,0x0e,0x00,0x03,0x00,0x00,0x00,0x00,0x00, 0x01,0x00,0x00,0x00,0x0f,0x00,0x0a,0x00,0x05,0x00,0x00,0x00, 0x04,0x00,0x00,0x00,0x6d,0x61,0x69,0x6e,0x00,0x00,0x00,0x00, -0x0b,0x00,0x00,0x00,0x11,0x00,0x00,0x00,0x23,0x00,0x00,0x00, -0x45,0x00,0x00,0x00,0x5e,0x00,0x00,0x00,0x10,0x00,0x06,0x00, -0x04,0x00,0x00,0x00,0x11,0x00,0x00,0x00,0x20,0x00,0x00,0x00, +0x0b,0x00,0x00,0x00,0x12,0x00,0x00,0x00,0x2b,0x00,0x00,0x00, +0x4d,0x00,0x00,0x00,0x66,0x00,0x00,0x00,0x10,0x00,0x06,0x00, +0x04,0x00,0x00,0x00,0x11,0x00,0x00,0x00,0x00,0x01,0x00,0x00, 0x01,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x47,0x00,0x04,0x00, 0x0b,0x00,0x00,0x00,0x0b,0x00,0x00,0x00,0x1a,0x00,0x00,0x00, -0x47,0x00,0x04,0x00,0x11,0x00,0x00,0x00,0x0b,0x00,0x00,0x00, -0x1b,0x00,0x00,0x00,0x48,0x00,0x05,0x00,0x21,0x00,0x00,0x00, +0x47,0x00,0x04,0x00,0x12,0x00,0x00,0x00,0x0b,0x00,0x00,0x00, +0x1b,0x00,0x00,0x00,0x48,0x00,0x05,0x00,0x29,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x00,0x00,0x00,0x00, -0x48,0x00,0x05,0x00,0x21,0x00,0x00,0x00,0x01,0x00,0x00,0x00, +0x48,0x00,0x05,0x00,0x29,0x00,0x00,0x00,0x01,0x00,0x00,0x00, 0x23,0x00,0x00,0x00,0x04,0x00,0x00,0x00,0x48,0x00,0x05,0x00, -0x21,0x00,0x00,0x00,0x02,0x00,0x00,0x00,0x23,0x00,0x00,0x00, -0x08,0x00,0x00,0x00,0x48,0x00,0x05,0x00,0x21,0x00,0x00,0x00, +0x29,0x00,0x00,0x00,0x02,0x00,0x00,0x00,0x23,0x00,0x00,0x00, +0x08,0x00,0x00,0x00,0x48,0x00,0x05,0x00,0x29,0x00,0x00,0x00, 0x03,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x0c,0x00,0x00,0x00, -0x48,0x00,0x05,0x00,0x21,0x00,0x00,0x00,0x04,0x00,0x00,0x00, +0x48,0x00,0x05,0x00,0x29,0x00,0x00,0x00,0x04,0x00,0x00,0x00, 0x23,0x00,0x00,0x00,0x10,0x00,0x00,0x00,0x47,0x00,0x03,0x00, -0x21,0x00,0x00,0x00,0x02,0x00,0x00,0x00,0x47,0x00,0x04,0x00, -0x40,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x01,0x00,0x00,0x00, -0x48,0x00,0x05,0x00,0x41,0x00,0x00,0x00,0x00,0x00,0x00,0x00, +0x29,0x00,0x00,0x00,0x02,0x00,0x00,0x00,0x47,0x00,0x04,0x00, +0x48,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x01,0x00,0x00,0x00, +0x48,0x00,0x05,0x00,0x49,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x23,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x48,0x00,0x05,0x00, -0x41,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x23,0x00,0x00,0x00, -0x02,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x42,0x00,0x00,0x00, +0x49,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x23,0x00,0x00,0x00, +0x02,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x4a,0x00,0x00,0x00, 0x06,0x00,0x00,0x00,0x12,0x00,0x00,0x00,0x48,0x00,0x04,0x00, -0x43,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x18,0x00,0x00,0x00, -0x48,0x00,0x05,0x00,0x43,0x00,0x00,0x00,0x00,0x00,0x00,0x00, +0x4b,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x18,0x00,0x00,0x00, +0x48,0x00,0x05,0x00,0x4b,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x23,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x47,0x00,0x03,0x00, -0x43,0x00,0x00,0x00,0x02,0x00,0x00,0x00,0x47,0x00,0x04,0x00, -0x45,0x00,0x00,0x00,0x22,0x00,0x00,0x00,0x00,0x00,0x00,0x00, -0x47,0x00,0x04,0x00,0x45,0x00,0x00,0x00,0x21,0x00,0x00,0x00, -0x00,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x5b,0x00,0x00,0x00, +0x4b,0x00,0x00,0x00,0x02,0x00,0x00,0x00,0x47,0x00,0x04,0x00, +0x4d,0x00,0x00,0x00,0x22,0x00,0x00,0x00,0x00,0x00,0x00,0x00, +0x47,0x00,0x04,0x00,0x4d,0x00,0x00,0x00,0x21,0x00,0x00,0x00, +0x00,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x63,0x00,0x00,0x00, 0x06,0x00,0x00,0x00,0x02,0x00,0x00,0x00,0x48,0x00,0x04,0x00, -0x5c,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x19,0x00,0x00,0x00, -0x48,0x00,0x05,0x00,0x5c,0x00,0x00,0x00,0x00,0x00,0x00,0x00, +0x64,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x19,0x00,0x00,0x00, +0x48,0x00,0x05,0x00,0x64,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x23,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x47,0x00,0x03,0x00, -0x5c,0x00,0x00,0x00,0x02,0x00,0x00,0x00,0x47,0x00,0x04,0x00, -0x5e,0x00,0x00,0x00,0x22,0x00,0x00,0x00,0x00,0x00,0x00,0x00, -0x47,0x00,0x04,0x00,0x5e,0x00,0x00,0x00,0x21,0x00,0x00,0x00, -0x01,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x8b,0x00,0x00,0x00, +0x64,0x00,0x00,0x00,0x02,0x00,0x00,0x00,0x47,0x00,0x04,0x00, +0x66,0x00,0x00,0x00,0x22,0x00,0x00,0x00,0x00,0x00,0x00,0x00, +0x47,0x00,0x04,0x00,0x66,0x00,0x00,0x00,0x21,0x00,0x00,0x00, +0x01,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x94,0x00,0x00,0x00, 0x0b,0x00,0x00,0x00,0x19,0x00,0x00,0x00,0x13,0x00,0x02,0x00, 0x02,0x00,0x00,0x00,0x21,0x00,0x03,0x00,0x03,0x00,0x00,0x00, 0x02,0x00,0x00,0x00,0x15,0x00,0x04,0x00,0x06,0x00,0x00,0x00, @@ -1848,232 +1848,385 @@ unsigned char dequant_q4_0_data[] = { 0x0b,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x2b,0x00,0x04,0x00, 0x06,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x20,0x00,0x04,0x00,0x0d,0x00,0x00,0x00,0x01,0x00,0x00,0x00, -0x06,0x00,0x00,0x00,0x3b,0x00,0x04,0x00,0x0a,0x00,0x00,0x00, -0x11,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x2b,0x00,0x04,0x00, -0x06,0x00,0x00,0x00,0x16,0x00,0x00,0x00,0x08,0x00,0x00,0x00, -0x1e,0x00,0x07,0x00,0x21,0x00,0x00,0x00,0x06,0x00,0x00,0x00, +0x06,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00, +0x10,0x00,0x00,0x00,0x04,0x00,0x00,0x00,0x3b,0x00,0x04,0x00, +0x0a,0x00,0x00,0x00,0x12,0x00,0x00,0x00,0x01,0x00,0x00,0x00, +0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x15,0x00,0x00,0x00, +0x40,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00, +0x1e,0x00,0x00,0x00,0x20,0x00,0x00,0x00,0x1e,0x00,0x07,0x00, +0x29,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x06,0x00,0x00,0x00, 0x06,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x06,0x00,0x00,0x00, -0x06,0x00,0x00,0x00,0x20,0x00,0x04,0x00,0x22,0x00,0x00,0x00, -0x09,0x00,0x00,0x00,0x21,0x00,0x00,0x00,0x3b,0x00,0x04,0x00, -0x22,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x09,0x00,0x00,0x00, -0x15,0x00,0x04,0x00,0x24,0x00,0x00,0x00,0x20,0x00,0x00,0x00, -0x01,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x24,0x00,0x00,0x00, -0x25,0x00,0x00,0x00,0x04,0x00,0x00,0x00,0x20,0x00,0x04,0x00, -0x26,0x00,0x00,0x00,0x09,0x00,0x00,0x00,0x06,0x00,0x00,0x00, -0x14,0x00,0x02,0x00,0x29,0x00,0x00,0x00,0x2b,0x00,0x04,0x00, -0x06,0x00,0x00,0x00,0x2f,0x00,0x00,0x00,0x00,0x01,0x00,0x00, -0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x32,0x00,0x00,0x00, -0x20,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00, -0x36,0x00,0x00,0x00,0x04,0x00,0x00,0x00,0x16,0x00,0x03,0x00, -0x3a,0x00,0x00,0x00,0x20,0x00,0x00,0x00,0x16,0x00,0x03,0x00, -0x3d,0x00,0x00,0x00,0x10,0x00,0x00,0x00,0x15,0x00,0x04,0x00, -0x3e,0x00,0x00,0x00,0x08,0x00,0x00,0x00,0x00,0x00,0x00,0x00, -0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x3f,0x00,0x00,0x00, -0x10,0x00,0x00,0x00,0x1c,0x00,0x04,0x00,0x40,0x00,0x00,0x00, -0x3e,0x00,0x00,0x00,0x3f,0x00,0x00,0x00,0x1e,0x00,0x04,0x00, -0x41,0x00,0x00,0x00,0x3d,0x00,0x00,0x00,0x40,0x00,0x00,0x00, -0x1d,0x00,0x03,0x00,0x42,0x00,0x00,0x00,0x41,0x00,0x00,0x00, -0x1e,0x00,0x03,0x00,0x43,0x00,0x00,0x00,0x42,0x00,0x00,0x00, -0x20,0x00,0x04,0x00,0x44,0x00,0x00,0x00,0x0c,0x00,0x00,0x00, -0x43,0x00,0x00,0x00,0x3b,0x00,0x04,0x00,0x44,0x00,0x00,0x00, -0x45,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,0x2b,0x00,0x04,0x00, -0x24,0x00,0x00,0x00,0x46,0x00,0x00,0x00,0x00,0x00,0x00,0x00, -0x20,0x00,0x04,0x00,0x48,0x00,0x00,0x00,0x0c,0x00,0x00,0x00, -0x3d,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x3a,0x00,0x00,0x00, -0x4d,0x00,0x00,0x00,0x00,0x00,0x00,0xc1,0x1d,0x00,0x03,0x00, -0x5b,0x00,0x00,0x00,0x3d,0x00,0x00,0x00,0x1e,0x00,0x03,0x00, -0x5c,0x00,0x00,0x00,0x5b,0x00,0x00,0x00,0x20,0x00,0x04,0x00, -0x5d,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,0x5c,0x00,0x00,0x00, -0x3b,0x00,0x04,0x00,0x5d,0x00,0x00,0x00,0x5e,0x00,0x00,0x00, -0x0c,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x24,0x00,0x00,0x00, -0x65,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x20,0x00,0x04,0x00, -0x69,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,0x3e,0x00,0x00,0x00, -0x2b,0x00,0x04,0x00,0x24,0x00,0x00,0x00,0x6e,0x00,0x00,0x00, -0x0f,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00, -0x8a,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x2c,0x00,0x06,0x00, -0x09,0x00,0x00,0x00,0x8b,0x00,0x00,0x00,0x32,0x00,0x00,0x00, -0x8a,0x00,0x00,0x00,0x8a,0x00,0x00,0x00,0x2b,0x00,0x04,0x00, -0x06,0x00,0x00,0x00,0xf2,0x00,0x00,0x00,0x11,0x00,0x00,0x00, -0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0xf3,0x00,0x00,0x00, -0x02,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00, -0xf4,0x00,0x00,0x00,0x12,0x00,0x00,0x00,0x2b,0x00,0x04,0x00, -0x06,0x00,0x00,0x00,0xf5,0x00,0x00,0x00,0x03,0x00,0x00,0x00, -0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0xf6,0x00,0x00,0x00, -0x13,0x00,0x00,0x00,0x36,0x00,0x05,0x00,0x02,0x00,0x00,0x00, +0x20,0x00,0x04,0x00,0x2a,0x00,0x00,0x00,0x09,0x00,0x00,0x00, +0x29,0x00,0x00,0x00,0x3b,0x00,0x04,0x00,0x2a,0x00,0x00,0x00, +0x2b,0x00,0x00,0x00,0x09,0x00,0x00,0x00,0x15,0x00,0x04,0x00, +0x2c,0x00,0x00,0x00,0x20,0x00,0x00,0x00,0x01,0x00,0x00,0x00, +0x2b,0x00,0x04,0x00,0x2c,0x00,0x00,0x00,0x2d,0x00,0x00,0x00, +0x04,0x00,0x00,0x00,0x20,0x00,0x04,0x00,0x2e,0x00,0x00,0x00, +0x09,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x14,0x00,0x02,0x00, +0x32,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00, +0x38,0x00,0x00,0x00,0x00,0x04,0x00,0x00,0x2b,0x00,0x04,0x00, +0x06,0x00,0x00,0x00,0x3e,0x00,0x00,0x00,0x08,0x00,0x00,0x00, +0x16,0x00,0x03,0x00,0x42,0x00,0x00,0x00,0x20,0x00,0x00,0x00, +0x16,0x00,0x03,0x00,0x45,0x00,0x00,0x00,0x10,0x00,0x00,0x00, +0x15,0x00,0x04,0x00,0x46,0x00,0x00,0x00,0x08,0x00,0x00,0x00, +0x00,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00, +0x47,0x00,0x00,0x00,0x10,0x00,0x00,0x00,0x1c,0x00,0x04,0x00, +0x48,0x00,0x00,0x00,0x46,0x00,0x00,0x00,0x47,0x00,0x00,0x00, +0x1e,0x00,0x04,0x00,0x49,0x00,0x00,0x00,0x45,0x00,0x00,0x00, +0x48,0x00,0x00,0x00,0x1d,0x00,0x03,0x00,0x4a,0x00,0x00,0x00, +0x49,0x00,0x00,0x00,0x1e,0x00,0x03,0x00,0x4b,0x00,0x00,0x00, +0x4a,0x00,0x00,0x00,0x20,0x00,0x04,0x00,0x4c,0x00,0x00,0x00, +0x0c,0x00,0x00,0x00,0x4b,0x00,0x00,0x00,0x3b,0x00,0x04,0x00, +0x4c,0x00,0x00,0x00,0x4d,0x00,0x00,0x00,0x0c,0x00,0x00,0x00, +0x2b,0x00,0x04,0x00,0x2c,0x00,0x00,0x00,0x4e,0x00,0x00,0x00, +0x00,0x00,0x00,0x00,0x20,0x00,0x04,0x00,0x50,0x00,0x00,0x00, +0x0c,0x00,0x00,0x00,0x45,0x00,0x00,0x00,0x2b,0x00,0x04,0x00, +0x42,0x00,0x00,0x00,0x55,0x00,0x00,0x00,0x00,0x00,0x00,0xc1, +0x1d,0x00,0x03,0x00,0x63,0x00,0x00,0x00,0x45,0x00,0x00,0x00, +0x1e,0x00,0x03,0x00,0x64,0x00,0x00,0x00,0x63,0x00,0x00,0x00, +0x20,0x00,0x04,0x00,0x65,0x00,0x00,0x00,0x0c,0x00,0x00,0x00, +0x64,0x00,0x00,0x00,0x3b,0x00,0x04,0x00,0x65,0x00,0x00,0x00, +0x66,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,0x2b,0x00,0x04,0x00, +0x2c,0x00,0x00,0x00,0x6d,0x00,0x00,0x00,0x01,0x00,0x00,0x00, +0x20,0x00,0x04,0x00,0x71,0x00,0x00,0x00,0x0c,0x00,0x00,0x00, +0x46,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x2c,0x00,0x00,0x00, +0x76,0x00,0x00,0x00,0x0f,0x00,0x00,0x00,0x2b,0x00,0x04,0x00, +0x06,0x00,0x00,0x00,0x92,0x00,0x00,0x00,0x00,0x01,0x00,0x00, +0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x93,0x00,0x00,0x00, +0x01,0x00,0x00,0x00,0x2c,0x00,0x06,0x00,0x09,0x00,0x00,0x00, +0x94,0x00,0x00,0x00,0x92,0x00,0x00,0x00,0x93,0x00,0x00,0x00, +0x93,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00, +0x77,0x01,0x00,0x00,0x11,0x00,0x00,0x00,0x2b,0x00,0x04,0x00, +0x06,0x00,0x00,0x00,0x78,0x01,0x00,0x00,0x02,0x00,0x00,0x00, +0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x79,0x01,0x00,0x00, +0x12,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00, +0x7a,0x01,0x00,0x00,0x03,0x00,0x00,0x00,0x2b,0x00,0x04,0x00, +0x06,0x00,0x00,0x00,0x7b,0x01,0x00,0x00,0x13,0x00,0x00,0x00, +0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x7c,0x01,0x00,0x00, +0x14,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00, +0x7d,0x01,0x00,0x00,0x05,0x00,0x00,0x00,0x2b,0x00,0x04,0x00, +0x06,0x00,0x00,0x00,0x7e,0x01,0x00,0x00,0x15,0x00,0x00,0x00, +0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x7f,0x01,0x00,0x00, +0x06,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00, +0x80,0x01,0x00,0x00,0x16,0x00,0x00,0x00,0x2b,0x00,0x04,0x00, +0x06,0x00,0x00,0x00,0x81,0x01,0x00,0x00,0x07,0x00,0x00,0x00, +0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x82,0x01,0x00,0x00, +0x17,0x00,0x00,0x00,0x36,0x00,0x05,0x00,0x02,0x00,0x00,0x00, 0x04,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x03,0x00,0x00,0x00, 0xf8,0x00,0x02,0x00,0x05,0x00,0x00,0x00,0xf7,0x00,0x03,0x00, -0x8c,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0xfb,0x00,0x03,0x00, -0x0c,0x00,0x00,0x00,0x8d,0x00,0x00,0x00,0xf8,0x00,0x02,0x00, -0x8d,0x00,0x00,0x00,0x41,0x00,0x05,0x00,0x0d,0x00,0x00,0x00, +0x95,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0xfb,0x00,0x03,0x00, +0x0c,0x00,0x00,0x00,0x96,0x00,0x00,0x00,0xf8,0x00,0x02,0x00, +0x96,0x00,0x00,0x00,0x41,0x00,0x05,0x00,0x0d,0x00,0x00,0x00, 0x0e,0x00,0x00,0x00,0x0b,0x00,0x00,0x00,0x0c,0x00,0x00,0x00, 0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x0f,0x00,0x00,0x00, -0x0e,0x00,0x00,0x00,0x41,0x00,0x05,0x00,0x0d,0x00,0x00,0x00, -0x12,0x00,0x00,0x00,0x11,0x00,0x00,0x00,0x0c,0x00,0x00,0x00, -0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x13,0x00,0x00,0x00, -0x12,0x00,0x00,0x00,0x86,0x00,0x05,0x00,0x06,0x00,0x00,0x00, -0x17,0x00,0x00,0x00,0x13,0x00,0x00,0x00,0x16,0x00,0x00,0x00, -0x89,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x1a,0x00,0x00,0x00, -0x13,0x00,0x00,0x00,0x16,0x00,0x00,0x00,0x84,0x00,0x05,0x00, -0x06,0x00,0x00,0x00,0x1d,0x00,0x00,0x00,0x16,0x00,0x00,0x00, -0x0f,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00, -0x1f,0x00,0x00,0x00,0x1d,0x00,0x00,0x00,0x1a,0x00,0x00,0x00, -0x41,0x00,0x05,0x00,0x26,0x00,0x00,0x00,0x27,0x00,0x00,0x00, -0x23,0x00,0x00,0x00,0x25,0x00,0x00,0x00,0x3d,0x00,0x04,0x00, -0x06,0x00,0x00,0x00,0x28,0x00,0x00,0x00,0x27,0x00,0x00,0x00, -0xae,0x00,0x05,0x00,0x29,0x00,0x00,0x00,0x2a,0x00,0x00,0x00, -0x1f,0x00,0x00,0x00,0x28,0x00,0x00,0x00,0xf7,0x00,0x03,0x00, -0x2c,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0xfa,0x00,0x04,0x00, -0x2a,0x00,0x00,0x00,0x2b,0x00,0x00,0x00,0x2c,0x00,0x00,0x00, -0xf8,0x00,0x02,0x00,0x2b,0x00,0x00,0x00,0xf9,0x00,0x02,0x00, -0x8c,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,0x2c,0x00,0x00,0x00, -0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x31,0x00,0x00,0x00, -0x2f,0x00,0x00,0x00,0x0f,0x00,0x00,0x00,0x84,0x00,0x05,0x00, -0x06,0x00,0x00,0x00,0x34,0x00,0x00,0x00,0x32,0x00,0x00,0x00, -0x1a,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00, -0x35,0x00,0x00,0x00,0x31,0x00,0x00,0x00,0x34,0x00,0x00,0x00, -0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x38,0x00,0x00,0x00, -0x36,0x00,0x00,0x00,0x17,0x00,0x00,0x00,0x80,0x00,0x05,0x00, -0x06,0x00,0x00,0x00,0x39,0x00,0x00,0x00,0x35,0x00,0x00,0x00, -0x38,0x00,0x00,0x00,0x41,0x00,0x07,0x00,0x48,0x00,0x00,0x00, -0x49,0x00,0x00,0x00,0x45,0x00,0x00,0x00,0x46,0x00,0x00,0x00, -0x1f,0x00,0x00,0x00,0x46,0x00,0x00,0x00,0x3d,0x00,0x04,0x00, -0x3d,0x00,0x00,0x00,0x4a,0x00,0x00,0x00,0x49,0x00,0x00,0x00, -0x73,0x00,0x04,0x00,0x3a,0x00,0x00,0x00,0x4b,0x00,0x00,0x00, -0x4a,0x00,0x00,0x00,0x85,0x00,0x05,0x00,0x3a,0x00,0x00,0x00, -0x4f,0x00,0x00,0x00,0x4d,0x00,0x00,0x00,0x4b,0x00,0x00,0x00, -0x41,0x00,0x08,0x00,0x69,0x00,0x00,0x00,0x6a,0x00,0x00,0x00, -0x45,0x00,0x00,0x00,0x46,0x00,0x00,0x00,0x1f,0x00,0x00,0x00, -0x65,0x00,0x00,0x00,0x38,0x00,0x00,0x00,0x3d,0x00,0x04,0x00, -0x3e,0x00,0x00,0x00,0x6b,0x00,0x00,0x00,0x6a,0x00,0x00,0x00, -0x71,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x6c,0x00,0x00,0x00, -0x6b,0x00,0x00,0x00,0x7c,0x00,0x04,0x00,0x24,0x00,0x00,0x00, -0x6d,0x00,0x00,0x00,0x6c,0x00,0x00,0x00,0xc7,0x00,0x05,0x00, -0x24,0x00,0x00,0x00,0x6f,0x00,0x00,0x00,0x6d,0x00,0x00,0x00, -0x6e,0x00,0x00,0x00,0x6f,0x00,0x04,0x00,0x3a,0x00,0x00,0x00, -0x70,0x00,0x00,0x00,0x6f,0x00,0x00,0x00,0x0c,0x00,0x08,0x00, -0x3a,0x00,0x00,0x00,0x73,0x00,0x00,0x00,0x01,0x00,0x00,0x00, -0x32,0x00,0x00,0x00,0x4b,0x00,0x00,0x00,0x70,0x00,0x00,0x00, -0x4f,0x00,0x00,0x00,0x73,0x00,0x04,0x00,0x3d,0x00,0x00,0x00, -0x74,0x00,0x00,0x00,0x73,0x00,0x00,0x00,0x41,0x00,0x06,0x00, -0x48,0x00,0x00,0x00,0x75,0x00,0x00,0x00,0x5e,0x00,0x00,0x00, -0x46,0x00,0x00,0x00,0x39,0x00,0x00,0x00,0x3e,0x00,0x03,0x00, -0x75,0x00,0x00,0x00,0x74,0x00,0x00,0x00,0x80,0x00,0x05,0x00, -0x06,0x00,0x00,0x00,0x79,0x00,0x00,0x00,0x39,0x00,0x00,0x00, -0x3f,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x3e,0x00,0x00,0x00, -0x80,0x00,0x00,0x00,0x6a,0x00,0x00,0x00,0xc2,0x00,0x05,0x00, -0x3e,0x00,0x00,0x00,0x81,0x00,0x00,0x00,0x80,0x00,0x00,0x00, -0x25,0x00,0x00,0x00,0x70,0x00,0x04,0x00,0x3a,0x00,0x00,0x00, -0x82,0x00,0x00,0x00,0x81,0x00,0x00,0x00,0x0c,0x00,0x08,0x00, -0x3a,0x00,0x00,0x00,0x85,0x00,0x00,0x00,0x01,0x00,0x00,0x00, -0x32,0x00,0x00,0x00,0x4b,0x00,0x00,0x00,0x82,0x00,0x00,0x00, -0x4f,0x00,0x00,0x00,0x73,0x00,0x04,0x00,0x3d,0x00,0x00,0x00, -0x86,0x00,0x00,0x00,0x85,0x00,0x00,0x00,0x41,0x00,0x06,0x00, -0x48,0x00,0x00,0x00,0x87,0x00,0x00,0x00,0x5e,0x00,0x00,0x00, -0x46,0x00,0x00,0x00,0x79,0x00,0x00,0x00,0x3e,0x00,0x03,0x00, -0x87,0x00,0x00,0x00,0x86,0x00,0x00,0x00,0x80,0x00,0x05,0x00, -0x06,0x00,0x00,0x00,0x99,0x00,0x00,0x00,0x39,0x00,0x00,0x00, -0x8a,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00, -0x9b,0x00,0x00,0x00,0x38,0x00,0x00,0x00,0x8a,0x00,0x00,0x00, -0x41,0x00,0x08,0x00,0x69,0x00,0x00,0x00,0x9c,0x00,0x00,0x00, -0x45,0x00,0x00,0x00,0x46,0x00,0x00,0x00,0x1f,0x00,0x00,0x00, -0x65,0x00,0x00,0x00,0x9b,0x00,0x00,0x00,0x3d,0x00,0x04,0x00, -0x3e,0x00,0x00,0x00,0x9d,0x00,0x00,0x00,0x9c,0x00,0x00,0x00, -0x71,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x9e,0x00,0x00,0x00, -0x9d,0x00,0x00,0x00,0x7c,0x00,0x04,0x00,0x24,0x00,0x00,0x00, -0x9f,0x00,0x00,0x00,0x9e,0x00,0x00,0x00,0xc7,0x00,0x05,0x00, -0x24,0x00,0x00,0x00,0xa0,0x00,0x00,0x00,0x9f,0x00,0x00,0x00, -0x6e,0x00,0x00,0x00,0x6f,0x00,0x04,0x00,0x3a,0x00,0x00,0x00, -0xa1,0x00,0x00,0x00,0xa0,0x00,0x00,0x00,0x0c,0x00,0x08,0x00, -0x3a,0x00,0x00,0x00,0xa3,0x00,0x00,0x00,0x01,0x00,0x00,0x00, -0x32,0x00,0x00,0x00,0x4b,0x00,0x00,0x00,0xa1,0x00,0x00,0x00, -0x4f,0x00,0x00,0x00,0x73,0x00,0x04,0x00,0x3d,0x00,0x00,0x00, -0xa4,0x00,0x00,0x00,0xa3,0x00,0x00,0x00,0x41,0x00,0x06,0x00, -0x48,0x00,0x00,0x00,0xa5,0x00,0x00,0x00,0x5e,0x00,0x00,0x00, -0x46,0x00,0x00,0x00,0x99,0x00,0x00,0x00,0x3e,0x00,0x03,0x00, -0xa5,0x00,0x00,0x00,0xa4,0x00,0x00,0x00,0x80,0x00,0x05,0x00, -0x06,0x00,0x00,0x00,0xa7,0x00,0x00,0x00,0x39,0x00,0x00,0x00, -0xf2,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x3e,0x00,0x00,0x00, -0xaa,0x00,0x00,0x00,0x9c,0x00,0x00,0x00,0xc2,0x00,0x05,0x00, -0x3e,0x00,0x00,0x00,0xab,0x00,0x00,0x00,0xaa,0x00,0x00,0x00, -0x25,0x00,0x00,0x00,0x70,0x00,0x04,0x00,0x3a,0x00,0x00,0x00, -0xac,0x00,0x00,0x00,0xab,0x00,0x00,0x00,0x0c,0x00,0x08,0x00, -0x3a,0x00,0x00,0x00,0xae,0x00,0x00,0x00,0x01,0x00,0x00,0x00, -0x32,0x00,0x00,0x00,0x4b,0x00,0x00,0x00,0xac,0x00,0x00,0x00, -0x4f,0x00,0x00,0x00,0x73,0x00,0x04,0x00,0x3d,0x00,0x00,0x00, -0xaf,0x00,0x00,0x00,0xae,0x00,0x00,0x00,0x41,0x00,0x06,0x00, -0x48,0x00,0x00,0x00,0xb0,0x00,0x00,0x00,0x5e,0x00,0x00,0x00, -0x46,0x00,0x00,0x00,0xa7,0x00,0x00,0x00,0x3e,0x00,0x03,0x00, -0xb0,0x00,0x00,0x00,0xaf,0x00,0x00,0x00,0x80,0x00,0x05,0x00, -0x06,0x00,0x00,0x00,0xb8,0x00,0x00,0x00,0x39,0x00,0x00,0x00, -0xf3,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00, -0xba,0x00,0x00,0x00,0x38,0x00,0x00,0x00,0xf3,0x00,0x00,0x00, -0x41,0x00,0x08,0x00,0x69,0x00,0x00,0x00,0xbb,0x00,0x00,0x00, -0x45,0x00,0x00,0x00,0x46,0x00,0x00,0x00,0x1f,0x00,0x00,0x00, -0x65,0x00,0x00,0x00,0xba,0x00,0x00,0x00,0x3d,0x00,0x04,0x00, -0x3e,0x00,0x00,0x00,0xbc,0x00,0x00,0x00,0xbb,0x00,0x00,0x00, -0x71,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0xbd,0x00,0x00,0x00, -0xbc,0x00,0x00,0x00,0x7c,0x00,0x04,0x00,0x24,0x00,0x00,0x00, -0xbe,0x00,0x00,0x00,0xbd,0x00,0x00,0x00,0xc7,0x00,0x05,0x00, -0x24,0x00,0x00,0x00,0xbf,0x00,0x00,0x00,0xbe,0x00,0x00,0x00, -0x6e,0x00,0x00,0x00,0x6f,0x00,0x04,0x00,0x3a,0x00,0x00,0x00, -0xc0,0x00,0x00,0x00,0xbf,0x00,0x00,0x00,0x0c,0x00,0x08,0x00, -0x3a,0x00,0x00,0x00,0xc2,0x00,0x00,0x00,0x01,0x00,0x00,0x00, -0x32,0x00,0x00,0x00,0x4b,0x00,0x00,0x00,0xc0,0x00,0x00,0x00, -0x4f,0x00,0x00,0x00,0x73,0x00,0x04,0x00,0x3d,0x00,0x00,0x00, -0xc3,0x00,0x00,0x00,0xc2,0x00,0x00,0x00,0x41,0x00,0x06,0x00, -0x48,0x00,0x00,0x00,0xc4,0x00,0x00,0x00,0x5e,0x00,0x00,0x00, -0x46,0x00,0x00,0x00,0xb8,0x00,0x00,0x00,0x3e,0x00,0x03,0x00, -0xc4,0x00,0x00,0x00,0xc3,0x00,0x00,0x00,0x80,0x00,0x05,0x00, -0x06,0x00,0x00,0x00,0xc6,0x00,0x00,0x00,0x39,0x00,0x00,0x00, -0xf4,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x3e,0x00,0x00,0x00, -0xc9,0x00,0x00,0x00,0xbb,0x00,0x00,0x00,0xc2,0x00,0x05,0x00, -0x3e,0x00,0x00,0x00,0xca,0x00,0x00,0x00,0xc9,0x00,0x00,0x00, -0x25,0x00,0x00,0x00,0x70,0x00,0x04,0x00,0x3a,0x00,0x00,0x00, -0xcb,0x00,0x00,0x00,0xca,0x00,0x00,0x00,0x0c,0x00,0x08,0x00, -0x3a,0x00,0x00,0x00,0xcd,0x00,0x00,0x00,0x01,0x00,0x00,0x00, -0x32,0x00,0x00,0x00,0x4b,0x00,0x00,0x00,0xcb,0x00,0x00,0x00, -0x4f,0x00,0x00,0x00,0x73,0x00,0x04,0x00,0x3d,0x00,0x00,0x00, -0xce,0x00,0x00,0x00,0xcd,0x00,0x00,0x00,0x41,0x00,0x06,0x00, -0x48,0x00,0x00,0x00,0xcf,0x00,0x00,0x00,0x5e,0x00,0x00,0x00, -0x46,0x00,0x00,0x00,0xc6,0x00,0x00,0x00,0x3e,0x00,0x03,0x00, -0xcf,0x00,0x00,0x00,0xce,0x00,0x00,0x00,0x80,0x00,0x05,0x00, -0x06,0x00,0x00,0x00,0xd7,0x00,0x00,0x00,0x39,0x00,0x00,0x00, -0xf5,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00, -0xd9,0x00,0x00,0x00,0x38,0x00,0x00,0x00,0xf5,0x00,0x00,0x00, -0x41,0x00,0x08,0x00,0x69,0x00,0x00,0x00,0xda,0x00,0x00,0x00, -0x45,0x00,0x00,0x00,0x46,0x00,0x00,0x00,0x1f,0x00,0x00,0x00, -0x65,0x00,0x00,0x00,0xd9,0x00,0x00,0x00,0x3d,0x00,0x04,0x00, -0x3e,0x00,0x00,0x00,0xdb,0x00,0x00,0x00,0xda,0x00,0x00,0x00, -0x71,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0xdc,0x00,0x00,0x00, -0xdb,0x00,0x00,0x00,0x7c,0x00,0x04,0x00,0x24,0x00,0x00,0x00, -0xdd,0x00,0x00,0x00,0xdc,0x00,0x00,0x00,0xc7,0x00,0x05,0x00, -0x24,0x00,0x00,0x00,0xde,0x00,0x00,0x00,0xdd,0x00,0x00,0x00, -0x6e,0x00,0x00,0x00,0x6f,0x00,0x04,0x00,0x3a,0x00,0x00,0x00, -0xdf,0x00,0x00,0x00,0xde,0x00,0x00,0x00,0x0c,0x00,0x08,0x00, -0x3a,0x00,0x00,0x00,0xe1,0x00,0x00,0x00,0x01,0x00,0x00,0x00, -0x32,0x00,0x00,0x00,0x4b,0x00,0x00,0x00,0xdf,0x00,0x00,0x00, -0x4f,0x00,0x00,0x00,0x73,0x00,0x04,0x00,0x3d,0x00,0x00,0x00, -0xe2,0x00,0x00,0x00,0xe1,0x00,0x00,0x00,0x41,0x00,0x06,0x00, -0x48,0x00,0x00,0x00,0xe3,0x00,0x00,0x00,0x5e,0x00,0x00,0x00, -0x46,0x00,0x00,0x00,0xd7,0x00,0x00,0x00,0x3e,0x00,0x03,0x00, -0xe3,0x00,0x00,0x00,0xe2,0x00,0x00,0x00,0x80,0x00,0x05,0x00, -0x06,0x00,0x00,0x00,0xe5,0x00,0x00,0x00,0x39,0x00,0x00,0x00, -0xf6,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x3e,0x00,0x00,0x00, -0xe8,0x00,0x00,0x00,0xda,0x00,0x00,0x00,0xc2,0x00,0x05,0x00, -0x3e,0x00,0x00,0x00,0xe9,0x00,0x00,0x00,0xe8,0x00,0x00,0x00, -0x25,0x00,0x00,0x00,0x70,0x00,0x04,0x00,0x3a,0x00,0x00,0x00, -0xea,0x00,0x00,0x00,0xe9,0x00,0x00,0x00,0x0c,0x00,0x08,0x00, -0x3a,0x00,0x00,0x00,0xec,0x00,0x00,0x00,0x01,0x00,0x00,0x00, -0x32,0x00,0x00,0x00,0x4b,0x00,0x00,0x00,0xea,0x00,0x00,0x00, -0x4f,0x00,0x00,0x00,0x73,0x00,0x04,0x00,0x3d,0x00,0x00,0x00, -0xed,0x00,0x00,0x00,0xec,0x00,0x00,0x00,0x41,0x00,0x06,0x00, -0x48,0x00,0x00,0x00,0xee,0x00,0x00,0x00,0x5e,0x00,0x00,0x00, -0x46,0x00,0x00,0x00,0xe5,0x00,0x00,0x00,0x3e,0x00,0x03,0x00, -0xee,0x00,0x00,0x00,0xed,0x00,0x00,0x00,0xf9,0x00,0x02,0x00, -0x8c,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,0x8c,0x00,0x00,0x00, -0xfd,0x00,0x01,0x00,0x38,0x00,0x01,0x00, +0x0e,0x00,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00, +0x11,0x00,0x00,0x00,0x0f,0x00,0x00,0x00,0x10,0x00,0x00,0x00, +0x41,0x00,0x05,0x00,0x0d,0x00,0x00,0x00,0x13,0x00,0x00,0x00, +0x12,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,0x3d,0x00,0x04,0x00, +0x06,0x00,0x00,0x00,0x14,0x00,0x00,0x00,0x13,0x00,0x00,0x00, +0x86,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x16,0x00,0x00,0x00, +0x14,0x00,0x00,0x00,0x15,0x00,0x00,0x00,0x80,0x00,0x05,0x00, +0x06,0x00,0x00,0x00,0x17,0x00,0x00,0x00,0x11,0x00,0x00,0x00, +0x16,0x00,0x00,0x00,0x89,0x00,0x05,0x00,0x06,0x00,0x00,0x00, +0x1b,0x00,0x00,0x00,0x14,0x00,0x00,0x00,0x15,0x00,0x00,0x00, +0x86,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x1f,0x00,0x00,0x00, +0x1b,0x00,0x00,0x00,0x1e,0x00,0x00,0x00,0x89,0x00,0x05,0x00, +0x06,0x00,0x00,0x00,0x22,0x00,0x00,0x00,0x1b,0x00,0x00,0x00, +0x1e,0x00,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00, +0x25,0x00,0x00,0x00,0x1e,0x00,0x00,0x00,0x17,0x00,0x00,0x00, +0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x27,0x00,0x00,0x00, +0x25,0x00,0x00,0x00,0x22,0x00,0x00,0x00,0x41,0x00,0x05,0x00, +0x2e,0x00,0x00,0x00,0x2f,0x00,0x00,0x00,0x2b,0x00,0x00,0x00, +0x2d,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00, +0x30,0x00,0x00,0x00,0x2f,0x00,0x00,0x00,0x86,0x00,0x05,0x00, +0x06,0x00,0x00,0x00,0x31,0x00,0x00,0x00,0x30,0x00,0x00,0x00, +0x1e,0x00,0x00,0x00,0xae,0x00,0x05,0x00,0x32,0x00,0x00,0x00, +0x33,0x00,0x00,0x00,0x27,0x00,0x00,0x00,0x31,0x00,0x00,0x00, +0xf7,0x00,0x03,0x00,0x35,0x00,0x00,0x00,0x00,0x00,0x00,0x00, +0xfa,0x00,0x04,0x00,0x33,0x00,0x00,0x00,0x34,0x00,0x00,0x00, +0x35,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,0x34,0x00,0x00,0x00, +0xf9,0x00,0x02,0x00,0x95,0x00,0x00,0x00,0xf8,0x00,0x02,0x00, +0x35,0x00,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00, +0x3a,0x00,0x00,0x00,0x38,0x00,0x00,0x00,0x17,0x00,0x00,0x00, +0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x3c,0x00,0x00,0x00, +0x1e,0x00,0x00,0x00,0x22,0x00,0x00,0x00,0x80,0x00,0x05,0x00, +0x06,0x00,0x00,0x00,0x3d,0x00,0x00,0x00,0x3a,0x00,0x00,0x00, +0x3c,0x00,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00, +0x40,0x00,0x00,0x00,0x3e,0x00,0x00,0x00,0x1f,0x00,0x00,0x00, +0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x41,0x00,0x00,0x00, +0x3d,0x00,0x00,0x00,0x40,0x00,0x00,0x00,0x41,0x00,0x07,0x00, +0x50,0x00,0x00,0x00,0x51,0x00,0x00,0x00,0x4d,0x00,0x00,0x00, +0x4e,0x00,0x00,0x00,0x27,0x00,0x00,0x00,0x4e,0x00,0x00,0x00, +0x3d,0x00,0x04,0x00,0x45,0x00,0x00,0x00,0x52,0x00,0x00,0x00, +0x51,0x00,0x00,0x00,0x73,0x00,0x04,0x00,0x42,0x00,0x00,0x00, +0x53,0x00,0x00,0x00,0x52,0x00,0x00,0x00,0x85,0x00,0x05,0x00, +0x42,0x00,0x00,0x00,0x57,0x00,0x00,0x00,0x55,0x00,0x00,0x00, +0x53,0x00,0x00,0x00,0x41,0x00,0x08,0x00,0x71,0x00,0x00,0x00, +0x72,0x00,0x00,0x00,0x4d,0x00,0x00,0x00,0x4e,0x00,0x00,0x00, +0x27,0x00,0x00,0x00,0x6d,0x00,0x00,0x00,0x40,0x00,0x00,0x00, +0x3d,0x00,0x04,0x00,0x46,0x00,0x00,0x00,0x73,0x00,0x00,0x00, +0x72,0x00,0x00,0x00,0x71,0x00,0x04,0x00,0x06,0x00,0x00,0x00, +0x74,0x00,0x00,0x00,0x73,0x00,0x00,0x00,0x7c,0x00,0x04,0x00, +0x2c,0x00,0x00,0x00,0x75,0x00,0x00,0x00,0x74,0x00,0x00,0x00, +0xc7,0x00,0x05,0x00,0x2c,0x00,0x00,0x00,0x77,0x00,0x00,0x00, +0x75,0x00,0x00,0x00,0x76,0x00,0x00,0x00,0x6f,0x00,0x04,0x00, +0x42,0x00,0x00,0x00,0x78,0x00,0x00,0x00,0x77,0x00,0x00,0x00, +0x0c,0x00,0x08,0x00,0x42,0x00,0x00,0x00,0x7b,0x00,0x00,0x00, +0x01,0x00,0x00,0x00,0x32,0x00,0x00,0x00,0x53,0x00,0x00,0x00, +0x78,0x00,0x00,0x00,0x57,0x00,0x00,0x00,0x73,0x00,0x04,0x00, +0x45,0x00,0x00,0x00,0x7c,0x00,0x00,0x00,0x7b,0x00,0x00,0x00, +0x41,0x00,0x06,0x00,0x50,0x00,0x00,0x00,0x7d,0x00,0x00,0x00, +0x66,0x00,0x00,0x00,0x4e,0x00,0x00,0x00,0x41,0x00,0x00,0x00, +0x3e,0x00,0x03,0x00,0x7d,0x00,0x00,0x00,0x7c,0x00,0x00,0x00, +0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x81,0x00,0x00,0x00, +0x41,0x00,0x00,0x00,0x47,0x00,0x00,0x00,0x3d,0x00,0x04,0x00, +0x46,0x00,0x00,0x00,0x88,0x00,0x00,0x00,0x72,0x00,0x00,0x00, +0xc2,0x00,0x05,0x00,0x46,0x00,0x00,0x00,0x89,0x00,0x00,0x00, +0x88,0x00,0x00,0x00,0x2d,0x00,0x00,0x00,0x70,0x00,0x04,0x00, +0x42,0x00,0x00,0x00,0x8a,0x00,0x00,0x00,0x89,0x00,0x00,0x00, +0x0c,0x00,0x08,0x00,0x42,0x00,0x00,0x00,0x8d,0x00,0x00,0x00, +0x01,0x00,0x00,0x00,0x32,0x00,0x00,0x00,0x53,0x00,0x00,0x00, +0x8a,0x00,0x00,0x00,0x57,0x00,0x00,0x00,0x73,0x00,0x04,0x00, +0x45,0x00,0x00,0x00,0x8e,0x00,0x00,0x00,0x8d,0x00,0x00,0x00, +0x41,0x00,0x06,0x00,0x50,0x00,0x00,0x00,0x8f,0x00,0x00,0x00, +0x66,0x00,0x00,0x00,0x4e,0x00,0x00,0x00,0x81,0x00,0x00,0x00, +0x3e,0x00,0x03,0x00,0x8f,0x00,0x00,0x00,0x8e,0x00,0x00,0x00, +0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xa2,0x00,0x00,0x00, +0x41,0x00,0x00,0x00,0x93,0x00,0x00,0x00,0x80,0x00,0x05,0x00, +0x06,0x00,0x00,0x00,0xa4,0x00,0x00,0x00,0x40,0x00,0x00,0x00, +0x93,0x00,0x00,0x00,0x41,0x00,0x08,0x00,0x71,0x00,0x00,0x00, +0xa5,0x00,0x00,0x00,0x4d,0x00,0x00,0x00,0x4e,0x00,0x00,0x00, +0x27,0x00,0x00,0x00,0x6d,0x00,0x00,0x00,0xa4,0x00,0x00,0x00, +0x3d,0x00,0x04,0x00,0x46,0x00,0x00,0x00,0xa6,0x00,0x00,0x00, +0xa5,0x00,0x00,0x00,0x71,0x00,0x04,0x00,0x06,0x00,0x00,0x00, +0xa7,0x00,0x00,0x00,0xa6,0x00,0x00,0x00,0x7c,0x00,0x04,0x00, +0x2c,0x00,0x00,0x00,0xa8,0x00,0x00,0x00,0xa7,0x00,0x00,0x00, +0xc7,0x00,0x05,0x00,0x2c,0x00,0x00,0x00,0xa9,0x00,0x00,0x00, +0xa8,0x00,0x00,0x00,0x76,0x00,0x00,0x00,0x6f,0x00,0x04,0x00, +0x42,0x00,0x00,0x00,0xaa,0x00,0x00,0x00,0xa9,0x00,0x00,0x00, +0x0c,0x00,0x08,0x00,0x42,0x00,0x00,0x00,0xac,0x00,0x00,0x00, +0x01,0x00,0x00,0x00,0x32,0x00,0x00,0x00,0x53,0x00,0x00,0x00, +0xaa,0x00,0x00,0x00,0x57,0x00,0x00,0x00,0x73,0x00,0x04,0x00, +0x45,0x00,0x00,0x00,0xad,0x00,0x00,0x00,0xac,0x00,0x00,0x00, +0x41,0x00,0x06,0x00,0x50,0x00,0x00,0x00,0xae,0x00,0x00,0x00, +0x66,0x00,0x00,0x00,0x4e,0x00,0x00,0x00,0xa2,0x00,0x00,0x00, +0x3e,0x00,0x03,0x00,0xae,0x00,0x00,0x00,0xad,0x00,0x00,0x00, +0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xb0,0x00,0x00,0x00, +0x41,0x00,0x00,0x00,0x77,0x01,0x00,0x00,0x3d,0x00,0x04,0x00, +0x46,0x00,0x00,0x00,0xb3,0x00,0x00,0x00,0xa5,0x00,0x00,0x00, +0xc2,0x00,0x05,0x00,0x46,0x00,0x00,0x00,0xb4,0x00,0x00,0x00, +0xb3,0x00,0x00,0x00,0x2d,0x00,0x00,0x00,0x70,0x00,0x04,0x00, +0x42,0x00,0x00,0x00,0xb5,0x00,0x00,0x00,0xb4,0x00,0x00,0x00, +0x0c,0x00,0x08,0x00,0x42,0x00,0x00,0x00,0xb7,0x00,0x00,0x00, +0x01,0x00,0x00,0x00,0x32,0x00,0x00,0x00,0x53,0x00,0x00,0x00, +0xb5,0x00,0x00,0x00,0x57,0x00,0x00,0x00,0x73,0x00,0x04,0x00, +0x45,0x00,0x00,0x00,0xb8,0x00,0x00,0x00,0xb7,0x00,0x00,0x00, +0x41,0x00,0x06,0x00,0x50,0x00,0x00,0x00,0xb9,0x00,0x00,0x00, +0x66,0x00,0x00,0x00,0x4e,0x00,0x00,0x00,0xb0,0x00,0x00,0x00, +0x3e,0x00,0x03,0x00,0xb9,0x00,0x00,0x00,0xb8,0x00,0x00,0x00, +0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xc1,0x00,0x00,0x00, +0x41,0x00,0x00,0x00,0x78,0x01,0x00,0x00,0x80,0x00,0x05,0x00, +0x06,0x00,0x00,0x00,0xc3,0x00,0x00,0x00,0x40,0x00,0x00,0x00, +0x78,0x01,0x00,0x00,0x41,0x00,0x08,0x00,0x71,0x00,0x00,0x00, +0xc4,0x00,0x00,0x00,0x4d,0x00,0x00,0x00,0x4e,0x00,0x00,0x00, +0x27,0x00,0x00,0x00,0x6d,0x00,0x00,0x00,0xc3,0x00,0x00,0x00, +0x3d,0x00,0x04,0x00,0x46,0x00,0x00,0x00,0xc5,0x00,0x00,0x00, +0xc4,0x00,0x00,0x00,0x71,0x00,0x04,0x00,0x06,0x00,0x00,0x00, +0xc6,0x00,0x00,0x00,0xc5,0x00,0x00,0x00,0x7c,0x00,0x04,0x00, +0x2c,0x00,0x00,0x00,0xc7,0x00,0x00,0x00,0xc6,0x00,0x00,0x00, +0xc7,0x00,0x05,0x00,0x2c,0x00,0x00,0x00,0xc8,0x00,0x00,0x00, +0xc7,0x00,0x00,0x00,0x76,0x00,0x00,0x00,0x6f,0x00,0x04,0x00, +0x42,0x00,0x00,0x00,0xc9,0x00,0x00,0x00,0xc8,0x00,0x00,0x00, +0x0c,0x00,0x08,0x00,0x42,0x00,0x00,0x00,0xcb,0x00,0x00,0x00, +0x01,0x00,0x00,0x00,0x32,0x00,0x00,0x00,0x53,0x00,0x00,0x00, +0xc9,0x00,0x00,0x00,0x57,0x00,0x00,0x00,0x73,0x00,0x04,0x00, +0x45,0x00,0x00,0x00,0xcc,0x00,0x00,0x00,0xcb,0x00,0x00,0x00, +0x41,0x00,0x06,0x00,0x50,0x00,0x00,0x00,0xcd,0x00,0x00,0x00, +0x66,0x00,0x00,0x00,0x4e,0x00,0x00,0x00,0xc1,0x00,0x00,0x00, +0x3e,0x00,0x03,0x00,0xcd,0x00,0x00,0x00,0xcc,0x00,0x00,0x00, +0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xcf,0x00,0x00,0x00, +0x41,0x00,0x00,0x00,0x79,0x01,0x00,0x00,0x3d,0x00,0x04,0x00, +0x46,0x00,0x00,0x00,0xd2,0x00,0x00,0x00,0xc4,0x00,0x00,0x00, +0xc2,0x00,0x05,0x00,0x46,0x00,0x00,0x00,0xd3,0x00,0x00,0x00, +0xd2,0x00,0x00,0x00,0x2d,0x00,0x00,0x00,0x70,0x00,0x04,0x00, +0x42,0x00,0x00,0x00,0xd4,0x00,0x00,0x00,0xd3,0x00,0x00,0x00, +0x0c,0x00,0x08,0x00,0x42,0x00,0x00,0x00,0xd6,0x00,0x00,0x00, +0x01,0x00,0x00,0x00,0x32,0x00,0x00,0x00,0x53,0x00,0x00,0x00, +0xd4,0x00,0x00,0x00,0x57,0x00,0x00,0x00,0x73,0x00,0x04,0x00, +0x45,0x00,0x00,0x00,0xd7,0x00,0x00,0x00,0xd6,0x00,0x00,0x00, +0x41,0x00,0x06,0x00,0x50,0x00,0x00,0x00,0xd8,0x00,0x00,0x00, +0x66,0x00,0x00,0x00,0x4e,0x00,0x00,0x00,0xcf,0x00,0x00,0x00, +0x3e,0x00,0x03,0x00,0xd8,0x00,0x00,0x00,0xd7,0x00,0x00,0x00, +0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xe0,0x00,0x00,0x00, +0x41,0x00,0x00,0x00,0x7a,0x01,0x00,0x00,0x80,0x00,0x05,0x00, +0x06,0x00,0x00,0x00,0xe2,0x00,0x00,0x00,0x40,0x00,0x00,0x00, +0x7a,0x01,0x00,0x00,0x41,0x00,0x08,0x00,0x71,0x00,0x00,0x00, +0xe3,0x00,0x00,0x00,0x4d,0x00,0x00,0x00,0x4e,0x00,0x00,0x00, +0x27,0x00,0x00,0x00,0x6d,0x00,0x00,0x00,0xe2,0x00,0x00,0x00, +0x3d,0x00,0x04,0x00,0x46,0x00,0x00,0x00,0xe4,0x00,0x00,0x00, +0xe3,0x00,0x00,0x00,0x71,0x00,0x04,0x00,0x06,0x00,0x00,0x00, +0xe5,0x00,0x00,0x00,0xe4,0x00,0x00,0x00,0x7c,0x00,0x04,0x00, +0x2c,0x00,0x00,0x00,0xe6,0x00,0x00,0x00,0xe5,0x00,0x00,0x00, +0xc7,0x00,0x05,0x00,0x2c,0x00,0x00,0x00,0xe7,0x00,0x00,0x00, +0xe6,0x00,0x00,0x00,0x76,0x00,0x00,0x00,0x6f,0x00,0x04,0x00, +0x42,0x00,0x00,0x00,0xe8,0x00,0x00,0x00,0xe7,0x00,0x00,0x00, +0x0c,0x00,0x08,0x00,0x42,0x00,0x00,0x00,0xea,0x00,0x00,0x00, +0x01,0x00,0x00,0x00,0x32,0x00,0x00,0x00,0x53,0x00,0x00,0x00, +0xe8,0x00,0x00,0x00,0x57,0x00,0x00,0x00,0x73,0x00,0x04,0x00, +0x45,0x00,0x00,0x00,0xeb,0x00,0x00,0x00,0xea,0x00,0x00,0x00, +0x41,0x00,0x06,0x00,0x50,0x00,0x00,0x00,0xec,0x00,0x00,0x00, +0x66,0x00,0x00,0x00,0x4e,0x00,0x00,0x00,0xe0,0x00,0x00,0x00, +0x3e,0x00,0x03,0x00,0xec,0x00,0x00,0x00,0xeb,0x00,0x00,0x00, +0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xee,0x00,0x00,0x00, +0x41,0x00,0x00,0x00,0x7b,0x01,0x00,0x00,0x3d,0x00,0x04,0x00, +0x46,0x00,0x00,0x00,0xf1,0x00,0x00,0x00,0xe3,0x00,0x00,0x00, +0xc2,0x00,0x05,0x00,0x46,0x00,0x00,0x00,0xf2,0x00,0x00,0x00, +0xf1,0x00,0x00,0x00,0x2d,0x00,0x00,0x00,0x70,0x00,0x04,0x00, +0x42,0x00,0x00,0x00,0xf3,0x00,0x00,0x00,0xf2,0x00,0x00,0x00, +0x0c,0x00,0x08,0x00,0x42,0x00,0x00,0x00,0xf5,0x00,0x00,0x00, +0x01,0x00,0x00,0x00,0x32,0x00,0x00,0x00,0x53,0x00,0x00,0x00, +0xf3,0x00,0x00,0x00,0x57,0x00,0x00,0x00,0x73,0x00,0x04,0x00, +0x45,0x00,0x00,0x00,0xf6,0x00,0x00,0x00,0xf5,0x00,0x00,0x00, +0x41,0x00,0x06,0x00,0x50,0x00,0x00,0x00,0xf7,0x00,0x00,0x00, +0x66,0x00,0x00,0x00,0x4e,0x00,0x00,0x00,0xee,0x00,0x00,0x00, +0x3e,0x00,0x03,0x00,0xf7,0x00,0x00,0x00,0xf6,0x00,0x00,0x00, +0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xff,0x00,0x00,0x00, +0x41,0x00,0x00,0x00,0x10,0x00,0x00,0x00,0x80,0x00,0x05,0x00, +0x06,0x00,0x00,0x00,0x01,0x01,0x00,0x00,0x40,0x00,0x00,0x00, +0x10,0x00,0x00,0x00,0x41,0x00,0x08,0x00,0x71,0x00,0x00,0x00, +0x02,0x01,0x00,0x00,0x4d,0x00,0x00,0x00,0x4e,0x00,0x00,0x00, +0x27,0x00,0x00,0x00,0x6d,0x00,0x00,0x00,0x01,0x01,0x00,0x00, +0x3d,0x00,0x04,0x00,0x46,0x00,0x00,0x00,0x03,0x01,0x00,0x00, +0x02,0x01,0x00,0x00,0x71,0x00,0x04,0x00,0x06,0x00,0x00,0x00, +0x04,0x01,0x00,0x00,0x03,0x01,0x00,0x00,0x7c,0x00,0x04,0x00, +0x2c,0x00,0x00,0x00,0x05,0x01,0x00,0x00,0x04,0x01,0x00,0x00, +0xc7,0x00,0x05,0x00,0x2c,0x00,0x00,0x00,0x06,0x01,0x00,0x00, +0x05,0x01,0x00,0x00,0x76,0x00,0x00,0x00,0x6f,0x00,0x04,0x00, +0x42,0x00,0x00,0x00,0x07,0x01,0x00,0x00,0x06,0x01,0x00,0x00, +0x0c,0x00,0x08,0x00,0x42,0x00,0x00,0x00,0x09,0x01,0x00,0x00, +0x01,0x00,0x00,0x00,0x32,0x00,0x00,0x00,0x53,0x00,0x00,0x00, +0x07,0x01,0x00,0x00,0x57,0x00,0x00,0x00,0x73,0x00,0x04,0x00, +0x45,0x00,0x00,0x00,0x0a,0x01,0x00,0x00,0x09,0x01,0x00,0x00, +0x41,0x00,0x06,0x00,0x50,0x00,0x00,0x00,0x0b,0x01,0x00,0x00, +0x66,0x00,0x00,0x00,0x4e,0x00,0x00,0x00,0xff,0x00,0x00,0x00, +0x3e,0x00,0x03,0x00,0x0b,0x01,0x00,0x00,0x0a,0x01,0x00,0x00, +0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x0d,0x01,0x00,0x00, +0x41,0x00,0x00,0x00,0x7c,0x01,0x00,0x00,0x3d,0x00,0x04,0x00, +0x46,0x00,0x00,0x00,0x10,0x01,0x00,0x00,0x02,0x01,0x00,0x00, +0xc2,0x00,0x05,0x00,0x46,0x00,0x00,0x00,0x11,0x01,0x00,0x00, +0x10,0x01,0x00,0x00,0x2d,0x00,0x00,0x00,0x70,0x00,0x04,0x00, +0x42,0x00,0x00,0x00,0x12,0x01,0x00,0x00,0x11,0x01,0x00,0x00, +0x0c,0x00,0x08,0x00,0x42,0x00,0x00,0x00,0x14,0x01,0x00,0x00, +0x01,0x00,0x00,0x00,0x32,0x00,0x00,0x00,0x53,0x00,0x00,0x00, +0x12,0x01,0x00,0x00,0x57,0x00,0x00,0x00,0x73,0x00,0x04,0x00, +0x45,0x00,0x00,0x00,0x15,0x01,0x00,0x00,0x14,0x01,0x00,0x00, +0x41,0x00,0x06,0x00,0x50,0x00,0x00,0x00,0x16,0x01,0x00,0x00, +0x66,0x00,0x00,0x00,0x4e,0x00,0x00,0x00,0x0d,0x01,0x00,0x00, +0x3e,0x00,0x03,0x00,0x16,0x01,0x00,0x00,0x15,0x01,0x00,0x00, +0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x1e,0x01,0x00,0x00, +0x41,0x00,0x00,0x00,0x7d,0x01,0x00,0x00,0x80,0x00,0x05,0x00, +0x06,0x00,0x00,0x00,0x20,0x01,0x00,0x00,0x40,0x00,0x00,0x00, +0x7d,0x01,0x00,0x00,0x41,0x00,0x08,0x00,0x71,0x00,0x00,0x00, +0x21,0x01,0x00,0x00,0x4d,0x00,0x00,0x00,0x4e,0x00,0x00,0x00, +0x27,0x00,0x00,0x00,0x6d,0x00,0x00,0x00,0x20,0x01,0x00,0x00, +0x3d,0x00,0x04,0x00,0x46,0x00,0x00,0x00,0x22,0x01,0x00,0x00, +0x21,0x01,0x00,0x00,0x71,0x00,0x04,0x00,0x06,0x00,0x00,0x00, +0x23,0x01,0x00,0x00,0x22,0x01,0x00,0x00,0x7c,0x00,0x04,0x00, +0x2c,0x00,0x00,0x00,0x24,0x01,0x00,0x00,0x23,0x01,0x00,0x00, +0xc7,0x00,0x05,0x00,0x2c,0x00,0x00,0x00,0x25,0x01,0x00,0x00, +0x24,0x01,0x00,0x00,0x76,0x00,0x00,0x00,0x6f,0x00,0x04,0x00, +0x42,0x00,0x00,0x00,0x26,0x01,0x00,0x00,0x25,0x01,0x00,0x00, +0x0c,0x00,0x08,0x00,0x42,0x00,0x00,0x00,0x28,0x01,0x00,0x00, +0x01,0x00,0x00,0x00,0x32,0x00,0x00,0x00,0x53,0x00,0x00,0x00, +0x26,0x01,0x00,0x00,0x57,0x00,0x00,0x00,0x73,0x00,0x04,0x00, +0x45,0x00,0x00,0x00,0x29,0x01,0x00,0x00,0x28,0x01,0x00,0x00, +0x41,0x00,0x06,0x00,0x50,0x00,0x00,0x00,0x2a,0x01,0x00,0x00, +0x66,0x00,0x00,0x00,0x4e,0x00,0x00,0x00,0x1e,0x01,0x00,0x00, +0x3e,0x00,0x03,0x00,0x2a,0x01,0x00,0x00,0x29,0x01,0x00,0x00, +0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x2c,0x01,0x00,0x00, +0x41,0x00,0x00,0x00,0x7e,0x01,0x00,0x00,0x3d,0x00,0x04,0x00, +0x46,0x00,0x00,0x00,0x2f,0x01,0x00,0x00,0x21,0x01,0x00,0x00, +0xc2,0x00,0x05,0x00,0x46,0x00,0x00,0x00,0x30,0x01,0x00,0x00, +0x2f,0x01,0x00,0x00,0x2d,0x00,0x00,0x00,0x70,0x00,0x04,0x00, +0x42,0x00,0x00,0x00,0x31,0x01,0x00,0x00,0x30,0x01,0x00,0x00, +0x0c,0x00,0x08,0x00,0x42,0x00,0x00,0x00,0x33,0x01,0x00,0x00, +0x01,0x00,0x00,0x00,0x32,0x00,0x00,0x00,0x53,0x00,0x00,0x00, +0x31,0x01,0x00,0x00,0x57,0x00,0x00,0x00,0x73,0x00,0x04,0x00, +0x45,0x00,0x00,0x00,0x34,0x01,0x00,0x00,0x33,0x01,0x00,0x00, +0x41,0x00,0x06,0x00,0x50,0x00,0x00,0x00,0x35,0x01,0x00,0x00, +0x66,0x00,0x00,0x00,0x4e,0x00,0x00,0x00,0x2c,0x01,0x00,0x00, +0x3e,0x00,0x03,0x00,0x35,0x01,0x00,0x00,0x34,0x01,0x00,0x00, +0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x3d,0x01,0x00,0x00, +0x41,0x00,0x00,0x00,0x7f,0x01,0x00,0x00,0x80,0x00,0x05,0x00, +0x06,0x00,0x00,0x00,0x3f,0x01,0x00,0x00,0x40,0x00,0x00,0x00, +0x7f,0x01,0x00,0x00,0x41,0x00,0x08,0x00,0x71,0x00,0x00,0x00, +0x40,0x01,0x00,0x00,0x4d,0x00,0x00,0x00,0x4e,0x00,0x00,0x00, +0x27,0x00,0x00,0x00,0x6d,0x00,0x00,0x00,0x3f,0x01,0x00,0x00, +0x3d,0x00,0x04,0x00,0x46,0x00,0x00,0x00,0x41,0x01,0x00,0x00, +0x40,0x01,0x00,0x00,0x71,0x00,0x04,0x00,0x06,0x00,0x00,0x00, +0x42,0x01,0x00,0x00,0x41,0x01,0x00,0x00,0x7c,0x00,0x04,0x00, +0x2c,0x00,0x00,0x00,0x43,0x01,0x00,0x00,0x42,0x01,0x00,0x00, +0xc7,0x00,0x05,0x00,0x2c,0x00,0x00,0x00,0x44,0x01,0x00,0x00, +0x43,0x01,0x00,0x00,0x76,0x00,0x00,0x00,0x6f,0x00,0x04,0x00, +0x42,0x00,0x00,0x00,0x45,0x01,0x00,0x00,0x44,0x01,0x00,0x00, +0x0c,0x00,0x08,0x00,0x42,0x00,0x00,0x00,0x47,0x01,0x00,0x00, +0x01,0x00,0x00,0x00,0x32,0x00,0x00,0x00,0x53,0x00,0x00,0x00, +0x45,0x01,0x00,0x00,0x57,0x00,0x00,0x00,0x73,0x00,0x04,0x00, +0x45,0x00,0x00,0x00,0x48,0x01,0x00,0x00,0x47,0x01,0x00,0x00, +0x41,0x00,0x06,0x00,0x50,0x00,0x00,0x00,0x49,0x01,0x00,0x00, +0x66,0x00,0x00,0x00,0x4e,0x00,0x00,0x00,0x3d,0x01,0x00,0x00, +0x3e,0x00,0x03,0x00,0x49,0x01,0x00,0x00,0x48,0x01,0x00,0x00, +0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x4b,0x01,0x00,0x00, +0x41,0x00,0x00,0x00,0x80,0x01,0x00,0x00,0x3d,0x00,0x04,0x00, +0x46,0x00,0x00,0x00,0x4e,0x01,0x00,0x00,0x40,0x01,0x00,0x00, +0xc2,0x00,0x05,0x00,0x46,0x00,0x00,0x00,0x4f,0x01,0x00,0x00, +0x4e,0x01,0x00,0x00,0x2d,0x00,0x00,0x00,0x70,0x00,0x04,0x00, +0x42,0x00,0x00,0x00,0x50,0x01,0x00,0x00,0x4f,0x01,0x00,0x00, +0x0c,0x00,0x08,0x00,0x42,0x00,0x00,0x00,0x52,0x01,0x00,0x00, +0x01,0x00,0x00,0x00,0x32,0x00,0x00,0x00,0x53,0x00,0x00,0x00, +0x50,0x01,0x00,0x00,0x57,0x00,0x00,0x00,0x73,0x00,0x04,0x00, +0x45,0x00,0x00,0x00,0x53,0x01,0x00,0x00,0x52,0x01,0x00,0x00, +0x41,0x00,0x06,0x00,0x50,0x00,0x00,0x00,0x54,0x01,0x00,0x00, +0x66,0x00,0x00,0x00,0x4e,0x00,0x00,0x00,0x4b,0x01,0x00,0x00, +0x3e,0x00,0x03,0x00,0x54,0x01,0x00,0x00,0x53,0x01,0x00,0x00, +0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x5c,0x01,0x00,0x00, +0x41,0x00,0x00,0x00,0x81,0x01,0x00,0x00,0x80,0x00,0x05,0x00, +0x06,0x00,0x00,0x00,0x5e,0x01,0x00,0x00,0x40,0x00,0x00,0x00, +0x81,0x01,0x00,0x00,0x41,0x00,0x08,0x00,0x71,0x00,0x00,0x00, +0x5f,0x01,0x00,0x00,0x4d,0x00,0x00,0x00,0x4e,0x00,0x00,0x00, +0x27,0x00,0x00,0x00,0x6d,0x00,0x00,0x00,0x5e,0x01,0x00,0x00, +0x3d,0x00,0x04,0x00,0x46,0x00,0x00,0x00,0x60,0x01,0x00,0x00, +0x5f,0x01,0x00,0x00,0x71,0x00,0x04,0x00,0x06,0x00,0x00,0x00, +0x61,0x01,0x00,0x00,0x60,0x01,0x00,0x00,0x7c,0x00,0x04,0x00, +0x2c,0x00,0x00,0x00,0x62,0x01,0x00,0x00,0x61,0x01,0x00,0x00, +0xc7,0x00,0x05,0x00,0x2c,0x00,0x00,0x00,0x63,0x01,0x00,0x00, +0x62,0x01,0x00,0x00,0x76,0x00,0x00,0x00,0x6f,0x00,0x04,0x00, +0x42,0x00,0x00,0x00,0x64,0x01,0x00,0x00,0x63,0x01,0x00,0x00, +0x0c,0x00,0x08,0x00,0x42,0x00,0x00,0x00,0x66,0x01,0x00,0x00, +0x01,0x00,0x00,0x00,0x32,0x00,0x00,0x00,0x53,0x00,0x00,0x00, +0x64,0x01,0x00,0x00,0x57,0x00,0x00,0x00,0x73,0x00,0x04,0x00, +0x45,0x00,0x00,0x00,0x67,0x01,0x00,0x00,0x66,0x01,0x00,0x00, +0x41,0x00,0x06,0x00,0x50,0x00,0x00,0x00,0x68,0x01,0x00,0x00, +0x66,0x00,0x00,0x00,0x4e,0x00,0x00,0x00,0x5c,0x01,0x00,0x00, +0x3e,0x00,0x03,0x00,0x68,0x01,0x00,0x00,0x67,0x01,0x00,0x00, +0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x6a,0x01,0x00,0x00, +0x41,0x00,0x00,0x00,0x82,0x01,0x00,0x00,0x3d,0x00,0x04,0x00, +0x46,0x00,0x00,0x00,0x6d,0x01,0x00,0x00,0x5f,0x01,0x00,0x00, +0xc2,0x00,0x05,0x00,0x46,0x00,0x00,0x00,0x6e,0x01,0x00,0x00, +0x6d,0x01,0x00,0x00,0x2d,0x00,0x00,0x00,0x70,0x00,0x04,0x00, +0x42,0x00,0x00,0x00,0x6f,0x01,0x00,0x00,0x6e,0x01,0x00,0x00, +0x0c,0x00,0x08,0x00,0x42,0x00,0x00,0x00,0x71,0x01,0x00,0x00, +0x01,0x00,0x00,0x00,0x32,0x00,0x00,0x00,0x53,0x00,0x00,0x00, +0x6f,0x01,0x00,0x00,0x57,0x00,0x00,0x00,0x73,0x00,0x04,0x00, +0x45,0x00,0x00,0x00,0x72,0x01,0x00,0x00,0x71,0x01,0x00,0x00, +0x41,0x00,0x06,0x00,0x50,0x00,0x00,0x00,0x73,0x01,0x00,0x00, +0x66,0x00,0x00,0x00,0x4e,0x00,0x00,0x00,0x6a,0x01,0x00,0x00, +0x3e,0x00,0x03,0x00,0x73,0x01,0x00,0x00,0x72,0x01,0x00,0x00, +0xf9,0x00,0x02,0x00,0x95,0x00,0x00,0x00,0xf8,0x00,0x02,0x00, +0x95,0x00,0x00,0x00,0xfd,0x00,0x01,0x00,0x38,0x00,0x01,0x00, + }; -const uint64_t dequant_q4_0_len = 3356; +const uint64_t dequant_q4_0_len = 5184; unsigned char dequant_q4_1_data[] = { 0x03,0x02,0x23,0x07,0x00,0x05,0x01,0x00,0x0b,0x00,0x0d,0x00, diff --git a/ggml-vulkan.cpp b/ggml-vulkan.cpp index fda66804c..105c86840 100644 --- a/ggml-vulkan.cpp +++ b/ggml-vulkan.cpp @@ -68,6 +68,26 @@ struct vk_queue { vk::PipelineStageFlags stage_flags; }; +struct vk_pipeline_struct { + std::string name; + vk::ShaderModule shader_module; + vk::DescriptorSetLayout dsl; + std::vector descriptor_pools; + std::vector descriptor_sets; + uint32_t descriptor_set_idx; + vk::PipelineLayout layout; + vk::Pipeline pipeline; + uint32_t push_constant_size; + uint32_t parameter_count; + std::array wg_denoms; + uint32_t align; +}; + +typedef std::shared_ptr vk_pipeline; +typedef std::weak_ptr vk_pipeline_ref; + +static void ggml_vk_destroy_pipeline(vk::Device& device, vk_pipeline& pipeline); + struct vk_device { vk::PhysicalDevice physical_device; vk::PhysicalDeviceProperties properties; @@ -83,10 +103,61 @@ struct vk_device { uint32_t subgroup_size; bool uma; + bool initialized; + size_t idx; + + vk_pipeline pipeline_matmul_f32_l, pipeline_matmul_f32_m, pipeline_matmul_f32_s; + vk_pipeline pipeline_matmul_f32_aligned_l, pipeline_matmul_f32_aligned_m, pipeline_matmul_f32_aligned_s; + vk_pipeline pipeline_matmul_f16_l, pipeline_matmul_f16_m, pipeline_matmul_f16_s; + vk_pipeline pipeline_matmul_f16_aligned_l, pipeline_matmul_f16_aligned_m, pipeline_matmul_f16_aligned_s; + vk_pipeline pipeline_matmul_f16_f32_l, pipeline_matmul_f16_f32_m, pipeline_matmul_f16_f32_s; + vk_pipeline pipeline_matmul_f16_f32_aligned_l, pipeline_matmul_f16_f32_aligned_m, pipeline_matmul_f16_f32_aligned_s; + vk_pipeline pipeline_matmul_split_k_reduce; + + vk_pipeline pipeline_dequant[VK_NUM_TYPES]; + vk_pipeline pipeline_dequant_mul_mat_vec_f32[VK_NUM_TYPES]; + + vk_pipeline pipeline_mul_mat_vec_p021_f16_f32; + vk_pipeline pipeline_mul_mat_vec_nc_f16_f32; + vk_pipeline pipeline_get_rows[VK_NUM_TYPES]; + vk_pipeline pipeline_get_rows_f32[VK_NUM_TYPES]; + vk_pipeline pipeline_mul_f32; + vk_pipeline pipeline_add_f32; + vk_pipeline pipeline_scale_f32; + vk_pipeline pipeline_sqr_f32; + vk_pipeline pipeline_clamp_f32; + vk_pipeline pipeline_cpy_f32_f32, pipeline_cpy_f32_f16, pipeline_cpy_f16_f16; + vk_pipeline pipeline_norm_f32; + vk_pipeline pipeline_rms_norm_f32; + vk_pipeline pipeline_gelu_f32; + vk_pipeline pipeline_silu_f32; + vk_pipeline pipeline_relu_f32; + vk_pipeline pipeline_diag_mask_inf_f32; + vk_pipeline pipeline_soft_max_f32; + vk_pipeline pipeline_rope_f32, pipeline_rope_f16; + vk_pipeline pipeline_rope_neox_f32, pipeline_rope_neox_f16; + + std::vector pipelines; + ~vk_device() { #ifdef GGML_VULKAN_DEBUG std::cerr << "destroy device " << name << std::endl; #endif + device.destroyCommandPool(compute_queue.pool); + if (!single_queue) { + device.destroyCommandPool(transfer_queue.pool); + } + + for (auto& pipeline : pipelines) { + if (pipeline.expired()) { + continue; + } + + vk_pipeline pl = pipeline.lock(); + ggml_vk_destroy_pipeline(device, pl); + } + pipelines.clear(); + device.destroy(); } }; @@ -124,21 +195,6 @@ struct vk_subbuffer { uint64_t size; }; -struct vk_pipeline { - std::string name; - vk::ShaderModule shader_module; - vk::DescriptorSetLayout dsl; - std::vector descriptor_pools; - std::vector descriptor_sets; - uint32_t descriptor_set_idx; - vk::PipelineLayout layout; - vk::Pipeline pipeline; - uint32_t push_constant_size; - uint32_t parameter_count; - std::array wg_denoms; - uint32_t align; -}; - struct vk_semaphore { vk::Semaphore s; uint64_t value; @@ -235,7 +291,6 @@ struct ggml_tensor_extra_gpu { }; struct ggml_vk_garbage_collector { - std::vector pipelines; std::vector tl_semaphores; std::vector semaphores; std::vector events; @@ -246,37 +301,7 @@ struct ggml_vk_garbage_collector { struct ggml_backend_vk_context { std::string name; - std::weak_ptr device; - vk_pipeline pipeline_matmul_f32_l, pipeline_matmul_f32_m, pipeline_matmul_f32_s; - vk_pipeline pipeline_matmul_f32_aligned_l, pipeline_matmul_f32_aligned_m, pipeline_matmul_f32_aligned_s; - vk_pipeline pipeline_matmul_f16_l, pipeline_matmul_f16_m, pipeline_matmul_f16_s; - vk_pipeline pipeline_matmul_f16_aligned_l, pipeline_matmul_f16_aligned_m, pipeline_matmul_f16_aligned_s; - vk_pipeline pipeline_matmul_f16_f32_l, pipeline_matmul_f16_f32_m, pipeline_matmul_f16_f32_s; - vk_pipeline pipeline_matmul_f16_f32_aligned_l, pipeline_matmul_f16_f32_aligned_m, pipeline_matmul_f16_f32_aligned_s; - vk_pipeline pipeline_matmul_split_k_reduce; - - vk_pipeline pipeline_dequant[VK_NUM_TYPES]; - vk_pipeline pipeline_dequant_mul_mat_vec_f32[VK_NUM_TYPES]; - - vk_pipeline pipeline_mul_mat_vec_p021_f16_f32; - vk_pipeline pipeline_mul_mat_vec_nc_f16_f32; - vk_pipeline pipeline_get_rows[VK_NUM_TYPES]; - vk_pipeline pipeline_get_rows_f32[VK_NUM_TYPES]; - vk_pipeline pipeline_mul_f32; - vk_pipeline pipeline_add_f32; - vk_pipeline pipeline_scale_f32; - vk_pipeline pipeline_sqr_f32; - vk_pipeline pipeline_clamp_f32; - vk_pipeline pipeline_cpy_f32_f32, pipeline_cpy_f32_f16, pipeline_cpy_f16_f16; - vk_pipeline pipeline_norm_f32; - vk_pipeline pipeline_rms_norm_f32; - vk_pipeline pipeline_gelu_f32; - vk_pipeline pipeline_silu_f32; - vk_pipeline pipeline_relu_f32; - vk_pipeline pipeline_diag_mask_inf_f32; - vk_pipeline pipeline_soft_max_f32; - vk_pipeline pipeline_rope_f32, pipeline_rope_f16; - vk_pipeline pipeline_rope_neox_f32, pipeline_rope_neox_f16; + std::shared_ptr device; size_t semaphore_idx, event_idx; ggml_vk_garbage_collector gc; @@ -305,13 +330,31 @@ struct vk_instance { std::vector device_indices; - std::shared_ptr devices[GGML_VK_MAX_DEVICES]; ggml_backend_t backends[GGML_VK_MAX_DEVICES]; ggml_backend_vk_context contexts[GGML_VK_MAX_DEVICES]; ggml_backend_buffer_type buffer_types[GGML_VK_MAX_DEVICES]; bool initialized[GGML_VK_MAX_DEVICES]; }; +static std::shared_ptr ggml_vk_get_device(size_t idx) { +#ifdef GGML_VULKAN_DEBUG + std::cerr << "ggml_vk_get_device(" << idx << ")" << std::endl; +#endif + static std::weak_ptr devices[GGML_VK_MAX_DEVICES]; + + if (devices[idx].expired()) { +#ifdef GGML_VULKAN_DEBUG + std::cerr << "Initializing new vk_device" << std::endl; +#endif + std::shared_ptr device = std::make_shared(); + device->initialized = false; + devices[idx] = device; + return device; + } + + return devices[idx].lock(); +} + #ifdef GGML_VULKAN_CHECK_RESULTS static size_t vk_skip_checks; static size_t vk_output_tensor; @@ -335,14 +378,15 @@ static void ggml_vk_create_pipeline(ggml_backend_vk_context * ctx, vk_pipeline& GGML_ASSERT(parameter_count > 0); GGML_ASSERT(wg_denoms[0] > 0 && wg_denoms[1] > 0 && wg_denoms[2] > 0); // NOLINT - pipeline.name = name; - pipeline.parameter_count = parameter_count; - pipeline.push_constant_size = push_constant_size; - pipeline.wg_denoms = wg_denoms; - pipeline.align = align; + pipeline = std::make_shared(); + pipeline->name = name; + pipeline->parameter_count = parameter_count; + pipeline->push_constant_size = push_constant_size; + pipeline->wg_denoms = wg_denoms; + pipeline->align = align; vk::ShaderModuleCreateInfo shader_module_create_info({}, spv_size, reinterpret_cast(spv_data)); - pipeline.shader_module = ctx->device.lock()->device.createShaderModule(shader_module_create_info); + pipeline->shader_module = ctx->device->device.createShaderModule(shader_module_create_info); std::vector dsl_binding; std::vector dsl_binding_flags; @@ -356,49 +400,49 @@ static void ggml_vk_create_pipeline(ggml_backend_vk_context * ctx, vk_pipeline& vk::PushConstantRange pcr( vk::ShaderStageFlagBits::eCompute, 0, - pipeline.push_constant_size + pipeline->push_constant_size ); vk::DescriptorSetLayoutCreateInfo descriptor_set_layout_create_info( {}, dsl_binding); descriptor_set_layout_create_info.setPNext(&dslbfci); - pipeline.dsl = ctx->device.lock()->device.createDescriptorSetLayout(descriptor_set_layout_create_info); + pipeline->dsl = ctx->device->device.createDescriptorSetLayout(descriptor_set_layout_create_info); // Check if device supports multiple descriptors per pool - if (ctx->device.lock()->descriptor_set_mode == VK_DEVICE_DESCRIPTOR_POOL_MODE_UNKNOWN) { + if (ctx->device->descriptor_set_mode == VK_DEVICE_DESCRIPTOR_POOL_MODE_UNKNOWN) { const uint32_t alloc_count = 2; // Try allocating multiple sets from one pool // This fails on AMD for some reason, so add a fall back to allocating one pool per set - vk::DescriptorPoolSize descriptor_pool_size(vk::DescriptorType::eStorageBuffer, pipeline.parameter_count); + vk::DescriptorPoolSize descriptor_pool_size(vk::DescriptorType::eStorageBuffer, pipeline->parameter_count); vk::DescriptorPoolCreateInfo descriptor_pool_create_info({}, alloc_count, descriptor_pool_size); - vk::DescriptorPool pool = ctx->device.lock()->device.createDescriptorPool(descriptor_pool_create_info); + vk::DescriptorPool pool = ctx->device->device.createDescriptorPool(descriptor_pool_create_info); std::vector layouts(alloc_count); for (uint32_t i = 0; i < alloc_count; i++) { - layouts[i] = pipeline.dsl; + layouts[i] = pipeline->dsl; } try { vk::DescriptorSetAllocateInfo descriptor_set_alloc_info(pool, alloc_count, layouts.data()); - std::vector sets = ctx->device.lock()->device.allocateDescriptorSets(descriptor_set_alloc_info); + std::vector sets = ctx->device->device.allocateDescriptorSets(descriptor_set_alloc_info); } catch(vk::OutOfPoolMemoryError const&) { - ctx->device.lock()->descriptor_set_mode = VK_DEVICE_DESCRIPTOR_POOL_MODE_SINGLE; + ctx->device->descriptor_set_mode = VK_DEVICE_DESCRIPTOR_POOL_MODE_SINGLE; } - ctx->device.lock()->device.destroyDescriptorPool(pool); + ctx->device->device.destroyDescriptorPool(pool); } - if (ctx->device.lock()->descriptor_set_mode == VK_DEVICE_DESCRIPTOR_POOL_MODE_MULTI) { - vk::DescriptorPoolSize descriptor_pool_size(vk::DescriptorType::eStorageBuffer, pipeline.parameter_count); + if (ctx->device->descriptor_set_mode == VK_DEVICE_DESCRIPTOR_POOL_MODE_MULTI) { + vk::DescriptorPoolSize descriptor_pool_size(vk::DescriptorType::eStorageBuffer, pipeline->parameter_count); vk::DescriptorPoolCreateInfo descriptor_pool_create_info({}, 128, descriptor_pool_size); - pipeline.descriptor_pools.push_back(ctx->device.lock()->device.createDescriptorPool(descriptor_pool_create_info)); + pipeline->descriptor_pools.push_back(ctx->device->device.createDescriptorPool(descriptor_pool_create_info)); } - pipeline.descriptor_set_idx = 0; + pipeline->descriptor_set_idx = 0; - vk::PipelineLayoutCreateInfo pipeline_layout_create_info(vk::PipelineLayoutCreateFlags(), pipeline.dsl, pcr); - pipeline.layout = ctx->device.lock()->device.createPipelineLayout(pipeline_layout_create_info); + vk::PipelineLayoutCreateInfo pipeline_layout_create_info(vk::PipelineLayoutCreateFlags(), pipeline->dsl, pcr); + pipeline->layout = ctx->device->device.createPipelineLayout(pipeline_layout_create_info); std::vector specialization_entries(specialization_constants.size()); @@ -418,72 +462,75 @@ static void ggml_vk_create_pipeline(ggml_backend_vk_context * ctx, vk_pipeline& vk::PipelineShaderStageCreateInfo pipeline_shader_create_info( vk::PipelineShaderStageCreateFlags(), vk::ShaderStageFlagBits::eCompute, - pipeline.shader_module, + pipeline->shader_module, entrypoint.c_str(), &specialization_info); vk::ComputePipelineCreateInfo compute_pipeline_create_info( vk::PipelineCreateFlags(), pipeline_shader_create_info, - pipeline.layout); - pipeline.pipeline = ctx->device.lock()->device.createComputePipeline(VK_NULL_HANDLE, compute_pipeline_create_info).value; + pipeline->layout); + pipeline->pipeline = ctx->device->device.createComputePipeline(VK_NULL_HANDLE, compute_pipeline_create_info).value; - ctx->gc.pipelines.push_back(&pipeline); + ctx->device->pipelines.push_back(pipeline); } -static void ggml_vk_destroy_pipeline(ggml_backend_vk_context * ctx, vk_pipeline * pipeline) { +static void ggml_vk_destroy_pipeline(vk::Device& device, vk_pipeline& pipeline) { +#ifdef GGML_VULKAN_DEBUG + std::cerr << "ggml_pipeline_destroy_pipeline(" << pipeline->name << ")" << std::endl; +#endif for (auto& pool : pipeline->descriptor_pools) { - ctx->device.lock()->device.destroyDescriptorPool(pool); + device.destroyDescriptorPool(pool); } pipeline->descriptor_pools.clear(); pipeline->descriptor_sets.clear(); pipeline->descriptor_set_idx = 0; - ctx->device.lock()->device.destroyDescriptorSetLayout(pipeline->dsl); + device.destroyDescriptorSetLayout(pipeline->dsl); - ctx->device.lock()->device.destroyPipelineLayout(pipeline->layout); + device.destroyPipelineLayout(pipeline->layout); - ctx->device.lock()->device.destroyShaderModule(pipeline->shader_module); + device.destroyShaderModule(pipeline->shader_module); - ctx->device.lock()->device.destroyPipeline(pipeline->pipeline); + device.destroyPipeline(pipeline->pipeline); } static void ggml_pipeline_allocate_descriptor_sets(ggml_backend_vk_context * ctx, vk_pipeline& pipeline, uint32_t n) { #ifdef GGML_VULKAN_DEBUG - std::cerr << "ggml_pipeline_allocate_descriptor_sets(" << pipeline.name << ", " << n << ")" << std::endl; + std::cerr << "ggml_pipeline_allocate_descriptor_sets(" << pipeline->name << ", " << n << ")" << std::endl; #endif - if (pipeline.descriptor_sets.size() >= pipeline.descriptor_set_idx + n) { + if (pipeline->descriptor_sets.size() >= pipeline->descriptor_set_idx + n) { // Enough descriptors are available return; } - if (ctx->device.lock()->descriptor_set_mode == VK_DEVICE_DESCRIPTOR_POOL_MODE_MULTI) { - const uint32_t alloc_count = pipeline.descriptor_set_idx + n - pipeline.descriptor_sets.size(); + if (ctx->device->descriptor_set_mode == VK_DEVICE_DESCRIPTOR_POOL_MODE_MULTI) { + const uint32_t alloc_count = pipeline->descriptor_set_idx + n - pipeline->descriptor_sets.size(); std::vector layouts(alloc_count); for (uint32_t i = 0; i < alloc_count; i++) { - layouts[i] = pipeline.dsl; + layouts[i] = pipeline->dsl; } - vk::DescriptorSetAllocateInfo descriptor_set_alloc_info(pipeline.descriptor_pools[0], alloc_count, layouts.data()); - std::vector sets = ctx->device.lock()->device.allocateDescriptorSets(descriptor_set_alloc_info); - pipeline.descriptor_sets.insert(pipeline.descriptor_sets.end(), sets.begin(), sets.end()); + vk::DescriptorSetAllocateInfo descriptor_set_alloc_info(pipeline->descriptor_pools[0], alloc_count, layouts.data()); + std::vector sets = ctx->device->device.allocateDescriptorSets(descriptor_set_alloc_info); + pipeline->descriptor_sets.insert(pipeline->descriptor_sets.end(), sets.begin(), sets.end()); } else { - for (uint32_t i = pipeline.descriptor_sets.size(); i < pipeline.descriptor_set_idx + n; i++) { - vk::DescriptorPoolSize descriptor_pool_size(vk::DescriptorType::eStorageBuffer, pipeline.parameter_count); + for (uint32_t i = pipeline->descriptor_sets.size(); i < pipeline->descriptor_set_idx + n; i++) { + vk::DescriptorPoolSize descriptor_pool_size(vk::DescriptorType::eStorageBuffer, pipeline->parameter_count); vk::DescriptorPoolCreateInfo descriptor_pool_create_info({}, 1, descriptor_pool_size); - pipeline.descriptor_pools.push_back(ctx->device.lock()->device.createDescriptorPool(descriptor_pool_create_info)); + pipeline->descriptor_pools.push_back(ctx->device->device.createDescriptorPool(descriptor_pool_create_info)); - vk::DescriptorSetAllocateInfo descriptor_set_alloc_info(pipeline.descriptor_pools[i], 1, &pipeline.dsl); - std::vector sets = ctx->device.lock()->device.allocateDescriptorSets(descriptor_set_alloc_info); - pipeline.descriptor_sets.push_back(sets[0]); + vk::DescriptorSetAllocateInfo descriptor_set_alloc_info(pipeline->descriptor_pools[i], 1, &pipeline->dsl); + std::vector sets = ctx->device->device.allocateDescriptorSets(descriptor_set_alloc_info); + pipeline->descriptor_sets.push_back(sets[0]); } } } static void ggml_pipeline_cleanup(vk_pipeline& pipeline) { #ifdef GGML_VULKAN_DEBUG - std::cerr << "ggml_pipeline_cleanup(" << pipeline.name << ")" << std::endl; + std::cerr << "ggml_pipeline_cleanup(" << pipeline->name << ")" << std::endl; #endif - pipeline.descriptor_set_idx = 0; + pipeline->descriptor_set_idx = 0; } static vk::CommandBuffer ggml_vk_create_cmd_buffer(ggml_backend_vk_context * ctx, vk_queue& q) { @@ -499,7 +546,7 @@ static vk::CommandBuffer ggml_vk_create_cmd_buffer(ggml_backend_vk_context * ctx q.pool, vk::CommandBufferLevel::ePrimary, 1); - const std::vector cmd_buffers = ctx->device.lock()->device.allocateCommandBuffers(command_buffer_alloc_info); + const std::vector cmd_buffers = ctx->device->device.allocateCommandBuffers(command_buffer_alloc_info); auto buf = cmd_buffers.front(); q.cmd_buffers.push_back(buf); @@ -644,11 +691,11 @@ static void ggml_vk_create_queue(ggml_backend_vk_context * ctx, vk_queue& q, uin q.queue_family_index = queue_family_index; vk::CommandPoolCreateInfo command_pool_create_info_compute(vk::CommandPoolCreateFlags(VK_COMMAND_POOL_CREATE_TRANSIENT_BIT), queue_family_index); - q.pool = ctx->device.lock()->device.createCommandPool(command_pool_create_info_compute); + q.pool = ctx->device->device.createCommandPool(command_pool_create_info_compute); q.cmd_buffer_idx = 0; - q.queue = ctx->device.lock()->device.getQueue(queue_family_index, queue_index); + q.queue = ctx->device->device.getQueue(queue_family_index, queue_index); q.stage_flags = stage_flags; } @@ -672,7 +719,7 @@ static vk_semaphore * ggml_vk_create_binary_semaphore(ggml_backend_vk_context * vk::SemaphoreTypeCreateInfo tci{ vk::SemaphoreType::eBinary, 0 }; vk::SemaphoreCreateInfo ci{}; ci.setPNext(&tci); - vk::Semaphore semaphore = ctx->device.lock()->device.createSemaphore(ci); + vk::Semaphore semaphore = ctx->device->device.createSemaphore(ci); ctx->gc.semaphores.push_back({ semaphore, 0 }); return &ctx->gc.semaphores[ctx->gc.semaphores.size() - 1]; } @@ -685,7 +732,7 @@ static vk_semaphore * ggml_vk_create_timeline_semaphore(ggml_backend_vk_context vk::SemaphoreTypeCreateInfo tci{ vk::SemaphoreType::eTimeline, 0 }; vk::SemaphoreCreateInfo ci{}; ci.setPNext(&tci); - vk::Semaphore semaphore = ctx->device.lock()->device.createSemaphore(ci); + vk::Semaphore semaphore = ctx->device->device.createSemaphore(ci); ctx->gc.tl_semaphores.push_back({ semaphore, 0 }); } return &ctx->gc.tl_semaphores[ctx->semaphore_idx++]; @@ -693,7 +740,7 @@ static vk_semaphore * ggml_vk_create_timeline_semaphore(ggml_backend_vk_context static vk::Event ggml_vk_create_event(ggml_backend_vk_context * ctx) { if (ctx->event_idx >= ctx->gc.events.size()) { - ctx->gc.events.push_back(ctx->device.lock()->device.createEvent({})); + ctx->gc.events.push_back(ctx->device->device.createEvent({})); } return ctx->gc.events[ctx->event_idx++]; } @@ -704,7 +751,7 @@ static void ggml_vk_queue_cleanup(ggml_backend_vk_context * ctx, vk_queue& q) { #endif // Requires command buffers to be done - ctx->device.lock()->device.resetCommandPool(q.pool); + ctx->device->device.resetCommandPool(q.pool); q.cmd_buffer_idx = 0; } @@ -729,11 +776,11 @@ static vk_buffer ggml_vk_create_buffer(ggml_backend_vk_context * ctx, size_t siz nullptr, }; - buf->buffer = ctx->device.lock()->device.createBuffer(buffer_create_info); + buf->buffer = ctx->device->device.createBuffer(buffer_create_info); - vk::MemoryRequirements mem_req = ctx->device.lock()->device.getBufferMemoryRequirements(buf->buffer); + vk::MemoryRequirements mem_req = ctx->device->device.getBufferMemoryRequirements(buf->buffer); - vk::PhysicalDeviceMemoryProperties mem_props = ctx->device.lock()->physical_device.getMemoryProperties(); + vk::PhysicalDeviceMemoryProperties mem_props = ctx->device->physical_device.getMemoryProperties(); uint32_t memory_type_index = UINT32_MAX; @@ -746,16 +793,16 @@ static vk_buffer ggml_vk_create_buffer(ggml_backend_vk_context * ctx, size_t siz } if (memory_type_index >= mem_props.memoryTypeCount) { - ctx->device.lock()->device.destroyBuffer(buf->buffer); + ctx->device->device.destroyBuffer(buf->buffer); buf->size = 0; throw vk::OutOfDeviceMemoryError("No suitable memory type found"); } try { - buf->device_memory = ctx->device.lock()->device.allocateMemory({ mem_req.size, memory_type_index }); + buf->device_memory = ctx->device->device.allocateMemory({ mem_req.size, memory_type_index }); } catch (const vk::SystemError& e) { // Out of Host/Device memory, clean up buffer - ctx->device.lock()->device.destroyBuffer(buf->buffer); + ctx->device->device.destroyBuffer(buf->buffer); buf->size = 0; throw e; } @@ -763,14 +810,14 @@ static vk_buffer ggml_vk_create_buffer(ggml_backend_vk_context * ctx, size_t siz buf->ptr = nullptr; if (req_flags & vk::MemoryPropertyFlagBits::eHostVisible) { - buf->ptr = ctx->device.lock()->device.mapMemory(buf->device_memory, 0, VK_WHOLE_SIZE); + buf->ptr = ctx->device->device.mapMemory(buf->device_memory, 0, VK_WHOLE_SIZE); } - ctx->device.lock()->device.bindBufferMemory(buf->buffer, buf->device_memory, 0); + ctx->device->device.bindBufferMemory(buf->buffer, buf->device_memory, 0); buf->ctx = ctx; - buf->device = ctx->device.lock(); + buf->device = ctx->device; #ifdef GGML_VULKAN_DEBUG std::cerr << "Created buffer " << buf->buffer << std::endl; @@ -794,7 +841,7 @@ static vk_buffer ggml_vk_create_buffer_device(ggml_backend_vk_context * ctx, siz try { buf = ggml_vk_create_buffer(ctx, size, vk::MemoryPropertyFlagBits::eDeviceLocal); } catch (const vk::SystemError& e) { - if (ctx->device.lock()->uma) { + if (ctx->device->uma) { // Fall back to host memory type buf = ggml_vk_create_buffer_check(ctx, size, vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent); } else { @@ -873,7 +920,7 @@ static void ggml_vk_load_shaders(ggml_backend_vk_context * ctx) { std::cerr << "ggml_vk_load_shaders(" << ctx->name << ")" << std::endl; #endif - const std::shared_ptr device = ctx->device.lock(); + const std::shared_ptr device = ctx->device; // mulmat std::initializer_list warptile_l = { 128, 128, 128, 16, device->subgroup_size * 2, 64, 2, 4, 4, device->subgroup_size }; @@ -889,125 +936,125 @@ static void ggml_vk_load_shaders(ggml_backend_vk_context * ctx) { uint32_t s_align = 32; if (device->fp16) { - ggml_vk_create_pipeline(ctx, ctx->pipeline_matmul_f32_l, "matmul_f32_l", matmul_f32_len, matmul_f32_data, "main", 3, 14 * sizeof(uint32_t), l_wg_denoms, warptile_l, 1); - ggml_vk_create_pipeline(ctx, ctx->pipeline_matmul_f32_m, "matmul_f32_m", matmul_f32_len, matmul_f32_data, "main", 3, 14 * sizeof(uint32_t), m_wg_denoms, warptile_m, 1); - ggml_vk_create_pipeline(ctx, ctx->pipeline_matmul_f32_s, "matmul_f32_s", matmul_f32_len, matmul_f32_data, "main", 3, 14 * sizeof(uint32_t), s_wg_denoms, warptile_s, 1); - ggml_vk_create_pipeline(ctx, ctx->pipeline_matmul_f32_aligned_l, "matmul_f32_aligned_l", matmul_f32_aligned_len, matmul_f32_aligned_data, "main", 3, 14 * sizeof(uint32_t), l_wg_denoms, warptile_l, l_align); - ggml_vk_create_pipeline(ctx, ctx->pipeline_matmul_f32_aligned_m, "matmul_f32_aligned_m", matmul_f32_aligned_len, matmul_f32_aligned_data, "main", 3, 14 * sizeof(uint32_t), m_wg_denoms, warptile_m, m_align); - ggml_vk_create_pipeline(ctx, ctx->pipeline_matmul_f32_aligned_s, "matmul_f32_aligned_s", matmul_f32_aligned_len, matmul_f32_aligned_data, "main", 3, 14 * sizeof(uint32_t), s_wg_denoms, warptile_s, s_align); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32_l, "matmul_f32_l", matmul_f32_len, matmul_f32_data, "main", 3, 14 * sizeof(uint32_t), l_wg_denoms, warptile_l, 1); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32_m, "matmul_f32_m", matmul_f32_len, matmul_f32_data, "main", 3, 14 * sizeof(uint32_t), m_wg_denoms, warptile_m, 1); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32_s, "matmul_f32_s", matmul_f32_len, matmul_f32_data, "main", 3, 14 * sizeof(uint32_t), s_wg_denoms, warptile_s, 1); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32_aligned_l, "matmul_f32_aligned_l", matmul_f32_aligned_len, matmul_f32_aligned_data, "main", 3, 14 * sizeof(uint32_t), l_wg_denoms, warptile_l, l_align); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32_aligned_m, "matmul_f32_aligned_m", matmul_f32_aligned_len, matmul_f32_aligned_data, "main", 3, 14 * sizeof(uint32_t), m_wg_denoms, warptile_m, m_align); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32_aligned_s, "matmul_f32_aligned_s", matmul_f32_aligned_len, matmul_f32_aligned_data, "main", 3, 14 * sizeof(uint32_t), s_wg_denoms, warptile_s, s_align); - ggml_vk_create_pipeline(ctx, ctx->pipeline_matmul_f16_l, "matmul_f16_l", matmul_f16_len, matmul_f16_data, "main", 3, 14 * sizeof(uint32_t), l_wg_denoms, warptile_l, 1); - ggml_vk_create_pipeline(ctx, ctx->pipeline_matmul_f16_m, "matmul_f16_m", matmul_f16_len, matmul_f16_data, "main", 3, 14 * sizeof(uint32_t), m_wg_denoms, warptile_m, 1); - ggml_vk_create_pipeline(ctx, ctx->pipeline_matmul_f16_s, "matmul_f16_s", matmul_f16_len, matmul_f16_data, "main", 3, 14 * sizeof(uint32_t), s_wg_denoms, warptile_s, 1); - ggml_vk_create_pipeline(ctx, ctx->pipeline_matmul_f16_aligned_l, "matmul_f16_aligned_l", matmul_f16_aligned_len, matmul_f16_aligned_data, "main", 3, 14 * sizeof(uint32_t), l_wg_denoms, warptile_l, l_align); - ggml_vk_create_pipeline(ctx, ctx->pipeline_matmul_f16_aligned_m, "matmul_f16_aligned_m", matmul_f16_aligned_len, matmul_f16_aligned_data, "main", 3, 14 * sizeof(uint32_t), m_wg_denoms, warptile_m, m_align); - ggml_vk_create_pipeline(ctx, ctx->pipeline_matmul_f16_aligned_s, "matmul_f16_aligned_s", matmul_f16_aligned_len, matmul_f16_aligned_data, "main", 3, 14 * sizeof(uint32_t), s_wg_denoms, warptile_s, s_align); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f16_l, "matmul_f16_l", matmul_f16_len, matmul_f16_data, "main", 3, 14 * sizeof(uint32_t), l_wg_denoms, warptile_l, 1); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f16_m, "matmul_f16_m", matmul_f16_len, matmul_f16_data, "main", 3, 14 * sizeof(uint32_t), m_wg_denoms, warptile_m, 1); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f16_s, "matmul_f16_s", matmul_f16_len, matmul_f16_data, "main", 3, 14 * sizeof(uint32_t), s_wg_denoms, warptile_s, 1); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f16_aligned_l, "matmul_f16_aligned_l", matmul_f16_aligned_len, matmul_f16_aligned_data, "main", 3, 14 * sizeof(uint32_t), l_wg_denoms, warptile_l, l_align); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f16_aligned_m, "matmul_f16_aligned_m", matmul_f16_aligned_len, matmul_f16_aligned_data, "main", 3, 14 * sizeof(uint32_t), m_wg_denoms, warptile_m, m_align); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f16_aligned_s, "matmul_f16_aligned_s", matmul_f16_aligned_len, matmul_f16_aligned_data, "main", 3, 14 * sizeof(uint32_t), s_wg_denoms, warptile_s, s_align); - ggml_vk_create_pipeline(ctx, ctx->pipeline_matmul_f16_f32_l, "matmul_f16_f32_l", matmul_f16_f32_len, matmul_f16_f32_data, "main", 3, 14 * sizeof(uint32_t), l_wg_denoms, warptile_l, 1); - ggml_vk_create_pipeline(ctx, ctx->pipeline_matmul_f16_f32_m, "matmul_f16_f32_m", matmul_f16_f32_len, matmul_f16_f32_data, "main", 3, 14 * sizeof(uint32_t), m_wg_denoms, warptile_m, 1); - ggml_vk_create_pipeline(ctx, ctx->pipeline_matmul_f16_f32_s, "matmul_f16_f32_s", matmul_f16_f32_len, matmul_f16_f32_data, "main", 3, 14 * sizeof(uint32_t), s_wg_denoms, warptile_s, 1); - ggml_vk_create_pipeline(ctx, ctx->pipeline_matmul_f16_f32_aligned_l, "matmul_f16_f32_aligned_l", matmul_f16_f32_aligned_len, matmul_f16_f32_aligned_data, "main", 3, 14 * sizeof(uint32_t), l_wg_denoms, warptile_l, l_align); - ggml_vk_create_pipeline(ctx, ctx->pipeline_matmul_f16_f32_aligned_m, "matmul_f16_f32_aligned_m", matmul_f16_f32_aligned_len, matmul_f16_f32_aligned_data, "main", 3, 14 * sizeof(uint32_t), m_wg_denoms, warptile_m, m_align); - ggml_vk_create_pipeline(ctx, ctx->pipeline_matmul_f16_f32_aligned_s, "matmul_f16_f32_aligned_s", matmul_f16_f32_aligned_len, matmul_f16_f32_aligned_data, "main", 3, 14 * sizeof(uint32_t), s_wg_denoms, warptile_s, s_align); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f16_f32_l, "matmul_f16_f32_l", matmul_f16_f32_len, matmul_f16_f32_data, "main", 3, 14 * sizeof(uint32_t), l_wg_denoms, warptile_l, 1); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f16_f32_m, "matmul_f16_f32_m", matmul_f16_f32_len, matmul_f16_f32_data, "main", 3, 14 * sizeof(uint32_t), m_wg_denoms, warptile_m, 1); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f16_f32_s, "matmul_f16_f32_s", matmul_f16_f32_len, matmul_f16_f32_data, "main", 3, 14 * sizeof(uint32_t), s_wg_denoms, warptile_s, 1); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f16_f32_aligned_l, "matmul_f16_f32_aligned_l", matmul_f16_f32_aligned_len, matmul_f16_f32_aligned_data, "main", 3, 14 * sizeof(uint32_t), l_wg_denoms, warptile_l, l_align); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f16_f32_aligned_m, "matmul_f16_f32_aligned_m", matmul_f16_f32_aligned_len, matmul_f16_f32_aligned_data, "main", 3, 14 * sizeof(uint32_t), m_wg_denoms, warptile_m, m_align); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f16_f32_aligned_s, "matmul_f16_f32_aligned_s", matmul_f16_f32_aligned_len, matmul_f16_f32_aligned_data, "main", 3, 14 * sizeof(uint32_t), s_wg_denoms, warptile_s, s_align); } else { - ggml_vk_create_pipeline(ctx, ctx->pipeline_matmul_f32_l, "matmul_f32_l", matmul_f32_fp32_len, matmul_f32_fp32_data, "main", 3, 14 * sizeof(uint32_t), l_wg_denoms, warptile_l, 1); - ggml_vk_create_pipeline(ctx, ctx->pipeline_matmul_f32_m, "matmul_f32_m", matmul_f32_fp32_len, matmul_f32_fp32_data, "main", 3, 14 * sizeof(uint32_t), m_wg_denoms, warptile_m, 1); - ggml_vk_create_pipeline(ctx, ctx->pipeline_matmul_f32_s, "matmul_f32_s", matmul_f32_fp32_len, matmul_f32_fp32_data, "main", 3, 14 * sizeof(uint32_t), s_wg_denoms, warptile_s, 1); - ggml_vk_create_pipeline(ctx, ctx->pipeline_matmul_f32_aligned_l, "matmul_f32_aligned_l", matmul_f32_aligned_fp32_len, matmul_f32_aligned_fp32_data, "main", 3, 14 * sizeof(uint32_t), l_wg_denoms, warptile_l, l_align); - ggml_vk_create_pipeline(ctx, ctx->pipeline_matmul_f32_aligned_m, "matmul_f32_aligned_m", matmul_f32_aligned_fp32_len, matmul_f32_aligned_fp32_data, "main", 3, 14 * sizeof(uint32_t), m_wg_denoms, warptile_m, m_align); - ggml_vk_create_pipeline(ctx, ctx->pipeline_matmul_f32_aligned_s, "matmul_f32_aligned_s", matmul_f32_aligned_fp32_len, matmul_f32_aligned_fp32_data, "main", 3, 14 * sizeof(uint32_t), s_wg_denoms, warptile_s, s_align); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32_l, "matmul_f32_l", matmul_f32_fp32_len, matmul_f32_fp32_data, "main", 3, 14 * sizeof(uint32_t), l_wg_denoms, warptile_l, 1); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32_m, "matmul_f32_m", matmul_f32_fp32_len, matmul_f32_fp32_data, "main", 3, 14 * sizeof(uint32_t), m_wg_denoms, warptile_m, 1); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32_s, "matmul_f32_s", matmul_f32_fp32_len, matmul_f32_fp32_data, "main", 3, 14 * sizeof(uint32_t), s_wg_denoms, warptile_s, 1); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32_aligned_l, "matmul_f32_aligned_l", matmul_f32_aligned_fp32_len, matmul_f32_aligned_fp32_data, "main", 3, 14 * sizeof(uint32_t), l_wg_denoms, warptile_l, l_align); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32_aligned_m, "matmul_f32_aligned_m", matmul_f32_aligned_fp32_len, matmul_f32_aligned_fp32_data, "main", 3, 14 * sizeof(uint32_t), m_wg_denoms, warptile_m, m_align); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32_aligned_s, "matmul_f32_aligned_s", matmul_f32_aligned_fp32_len, matmul_f32_aligned_fp32_data, "main", 3, 14 * sizeof(uint32_t), s_wg_denoms, warptile_s, s_align); - ggml_vk_create_pipeline(ctx, ctx->pipeline_matmul_f16_l, "matmul_f16_l", matmul_f16_fp32_len, matmul_f16_fp32_data, "main", 3, 14 * sizeof(uint32_t), l_wg_denoms, warptile_l, 1); - ggml_vk_create_pipeline(ctx, ctx->pipeline_matmul_f16_m, "matmul_f16_m", matmul_f16_fp32_len, matmul_f16_fp32_data, "main", 3, 14 * sizeof(uint32_t), m_wg_denoms, warptile_m, 1); - ggml_vk_create_pipeline(ctx, ctx->pipeline_matmul_f16_s, "matmul_f16_s", matmul_f16_fp32_len, matmul_f16_fp32_data, "main", 3, 14 * sizeof(uint32_t), s_wg_denoms, warptile_s, 1); - ggml_vk_create_pipeline(ctx, ctx->pipeline_matmul_f16_aligned_l, "matmul_f16_aligned_l", matmul_f16_aligned_fp32_len, matmul_f16_aligned_fp32_data, "main", 3, 14 * sizeof(uint32_t), l_wg_denoms, warptile_l, l_align); - ggml_vk_create_pipeline(ctx, ctx->pipeline_matmul_f16_aligned_m, "matmul_f16_aligned_m", matmul_f16_aligned_fp32_len, matmul_f16_aligned_fp32_data, "main", 3, 14 * sizeof(uint32_t), m_wg_denoms, warptile_m, m_align); - ggml_vk_create_pipeline(ctx, ctx->pipeline_matmul_f16_aligned_s, "matmul_f16_aligned_s", matmul_f16_aligned_fp32_len, matmul_f16_aligned_fp32_data, "main", 3, 14 * sizeof(uint32_t), s_wg_denoms, warptile_s, s_align); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f16_l, "matmul_f16_l", matmul_f16_fp32_len, matmul_f16_fp32_data, "main", 3, 14 * sizeof(uint32_t), l_wg_denoms, warptile_l, 1); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f16_m, "matmul_f16_m", matmul_f16_fp32_len, matmul_f16_fp32_data, "main", 3, 14 * sizeof(uint32_t), m_wg_denoms, warptile_m, 1); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f16_s, "matmul_f16_s", matmul_f16_fp32_len, matmul_f16_fp32_data, "main", 3, 14 * sizeof(uint32_t), s_wg_denoms, warptile_s, 1); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f16_aligned_l, "matmul_f16_aligned_l", matmul_f16_aligned_fp32_len, matmul_f16_aligned_fp32_data, "main", 3, 14 * sizeof(uint32_t), l_wg_denoms, warptile_l, l_align); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f16_aligned_m, "matmul_f16_aligned_m", matmul_f16_aligned_fp32_len, matmul_f16_aligned_fp32_data, "main", 3, 14 * sizeof(uint32_t), m_wg_denoms, warptile_m, m_align); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f16_aligned_s, "matmul_f16_aligned_s", matmul_f16_aligned_fp32_len, matmul_f16_aligned_fp32_data, "main", 3, 14 * sizeof(uint32_t), s_wg_denoms, warptile_s, s_align); - ggml_vk_create_pipeline(ctx, ctx->pipeline_matmul_f16_f32_l, "matmul_f16_f32_l", matmul_f16_f32_fp32_len, matmul_f16_f32_fp32_data, "main", 3, 14 * sizeof(uint32_t), l_wg_denoms, warptile_l, 1); - ggml_vk_create_pipeline(ctx, ctx->pipeline_matmul_f16_f32_m, "matmul_f16_f32_m", matmul_f16_f32_fp32_len, matmul_f16_f32_fp32_data, "main", 3, 14 * sizeof(uint32_t), m_wg_denoms, warptile_m, 1); - ggml_vk_create_pipeline(ctx, ctx->pipeline_matmul_f16_f32_s, "matmul_f16_f32_s", matmul_f16_f32_fp32_len, matmul_f16_f32_fp32_data, "main", 3, 14 * sizeof(uint32_t), s_wg_denoms, warptile_s, 1); - ggml_vk_create_pipeline(ctx, ctx->pipeline_matmul_f16_f32_aligned_l, "matmul_f16_f32_aligned_l", matmul_f16_f32_aligned_fp32_len, matmul_f16_f32_aligned_fp32_data, "main", 3, 14 * sizeof(uint32_t), l_wg_denoms, warptile_l, l_align); - ggml_vk_create_pipeline(ctx, ctx->pipeline_matmul_f16_f32_aligned_m, "matmul_f16_f32_aligned_m", matmul_f16_f32_aligned_fp32_len, matmul_f16_f32_aligned_fp32_data, "main", 3, 14 * sizeof(uint32_t), m_wg_denoms, warptile_m, m_align); - ggml_vk_create_pipeline(ctx, ctx->pipeline_matmul_f16_f32_aligned_s, "matmul_f16_f32_aligned_s", matmul_f16_f32_aligned_fp32_len, matmul_f16_f32_aligned_fp32_data, "main", 3, 14 * sizeof(uint32_t), s_wg_denoms, warptile_s, s_align); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f16_f32_l, "matmul_f16_f32_l", matmul_f16_f32_fp32_len, matmul_f16_f32_fp32_data, "main", 3, 14 * sizeof(uint32_t), l_wg_denoms, warptile_l, 1); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f16_f32_m, "matmul_f16_f32_m", matmul_f16_f32_fp32_len, matmul_f16_f32_fp32_data, "main", 3, 14 * sizeof(uint32_t), m_wg_denoms, warptile_m, 1); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f16_f32_s, "matmul_f16_f32_s", matmul_f16_f32_fp32_len, matmul_f16_f32_fp32_data, "main", 3, 14 * sizeof(uint32_t), s_wg_denoms, warptile_s, 1); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f16_f32_aligned_l, "matmul_f16_f32_aligned_l", matmul_f16_f32_aligned_fp32_len, matmul_f16_f32_aligned_fp32_data, "main", 3, 14 * sizeof(uint32_t), l_wg_denoms, warptile_l, l_align); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f16_f32_aligned_m, "matmul_f16_f32_aligned_m", matmul_f16_f32_aligned_fp32_len, matmul_f16_f32_aligned_fp32_data, "main", 3, 14 * sizeof(uint32_t), m_wg_denoms, warptile_m, m_align); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f16_f32_aligned_s, "matmul_f16_f32_aligned_s", matmul_f16_f32_aligned_fp32_len, matmul_f16_f32_aligned_fp32_data, "main", 3, 14 * sizeof(uint32_t), s_wg_denoms, warptile_s, s_align); } - ggml_vk_create_pipeline(ctx, ctx->pipeline_dequant_mul_mat_vec_f32[GGML_TYPE_F16 ], "mul_mat_vec_f16_f32", mul_mat_vec_f16_f32_len, mul_mat_vec_f16_f32_data, "main", 3, 3 * sizeof(uint32_t), {1, 1, 1}, { device->subgroup_size }, 1); - ggml_vk_create_pipeline(ctx, ctx->pipeline_dequant_mul_mat_vec_f32[GGML_TYPE_Q4_0], "mul_mat_vec_q4_0_f32", mul_mat_vec_q4_0_f32_len, mul_mat_vec_q4_0_f32_data, "main", 3, 3 * sizeof(uint32_t), {1, 1, 1}, { device->subgroup_size }, 1); - ggml_vk_create_pipeline(ctx, ctx->pipeline_dequant_mul_mat_vec_f32[GGML_TYPE_Q4_1], "mul_mat_vec_q4_1_f32", mul_mat_vec_q4_1_f32_len, mul_mat_vec_q4_1_f32_data, "main", 3, 3 * sizeof(uint32_t), {1, 1, 1}, { device->subgroup_size }, 1); - ggml_vk_create_pipeline(ctx, ctx->pipeline_dequant_mul_mat_vec_f32[GGML_TYPE_Q5_0], "mul_mat_vec_q5_0_f32", mul_mat_vec_q5_0_f32_len, mul_mat_vec_q5_0_f32_data, "main", 3, 3 * sizeof(uint32_t), {1, 1, 1}, { device->subgroup_size }, 1); - ggml_vk_create_pipeline(ctx, ctx->pipeline_dequant_mul_mat_vec_f32[GGML_TYPE_Q5_1], "mul_mat_vec_q5_1_f32", mul_mat_vec_q5_1_f32_len, mul_mat_vec_q5_1_f32_data, "main", 3, 3 * sizeof(uint32_t), {1, 1, 1}, { device->subgroup_size }, 1); - ggml_vk_create_pipeline(ctx, ctx->pipeline_dequant_mul_mat_vec_f32[GGML_TYPE_Q8_0], "mul_mat_vec_q8_0_f32", mul_mat_vec_q8_0_f32_len, mul_mat_vec_q8_0_f32_data, "main", 3, 3 * sizeof(uint32_t), {1, 1, 1}, { device->subgroup_size }, 1); - ggml_vk_create_pipeline(ctx, ctx->pipeline_dequant_mul_mat_vec_f32[GGML_TYPE_Q2_K], "mul_mat_vec_q2_K_f32", mul_mat_vec_q2_K_f32_len, mul_mat_vec_q2_K_f32_data, "main", 3, 3 * sizeof(uint32_t), {1, 1, 1}, { device->subgroup_size }, 1); - ggml_vk_create_pipeline(ctx, ctx->pipeline_dequant_mul_mat_vec_f32[GGML_TYPE_Q3_K], "mul_mat_vec_q3_K_f32", mul_mat_vec_q3_K_f32_len, mul_mat_vec_q3_K_f32_data, "main", 3, 3 * sizeof(uint32_t), {1, 1, 1}, { device->subgroup_size }, 1); - ggml_vk_create_pipeline(ctx, ctx->pipeline_dequant_mul_mat_vec_f32[GGML_TYPE_Q4_K], "mul_mat_vec_q4_K_f32", mul_mat_vec_q4_K_f32_len, mul_mat_vec_q4_K_f32_data, "main", 3, 3 * sizeof(uint32_t), {1, 1, 1}, { device->subgroup_size }, 1); - ggml_vk_create_pipeline(ctx, ctx->pipeline_dequant_mul_mat_vec_f32[GGML_TYPE_Q5_K], "mul_mat_vec_q5_K_f32", mul_mat_vec_q5_K_f32_len, mul_mat_vec_q5_K_f32_data, "main", 3, 3 * sizeof(uint32_t), {1, 1, 1}, { device->subgroup_size }, 1); - ggml_vk_create_pipeline(ctx, ctx->pipeline_dequant_mul_mat_vec_f32[GGML_TYPE_Q6_K], "mul_mat_vec_q6_K_f32", mul_mat_vec_q6_K_f32_len, mul_mat_vec_q6_K_f32_data, "main", 3, 3 * sizeof(uint32_t), {1, 1, 1}, { device->subgroup_size }, 1); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_f32[GGML_TYPE_F16 ], "mul_mat_vec_f16_f32", mul_mat_vec_f16_f32_len, mul_mat_vec_f16_f32_data, "main", 3, 3 * sizeof(uint32_t), {1, 1, 1}, { device->subgroup_size }, 1); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_f32[GGML_TYPE_Q4_0], "mul_mat_vec_q4_0_f32", mul_mat_vec_q4_0_f32_len, mul_mat_vec_q4_0_f32_data, "main", 3, 3 * sizeof(uint32_t), {1, 1, 1}, { device->subgroup_size }, 1); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_f32[GGML_TYPE_Q4_1], "mul_mat_vec_q4_1_f32", mul_mat_vec_q4_1_f32_len, mul_mat_vec_q4_1_f32_data, "main", 3, 3 * sizeof(uint32_t), {1, 1, 1}, { device->subgroup_size }, 1); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_f32[GGML_TYPE_Q5_0], "mul_mat_vec_q5_0_f32", mul_mat_vec_q5_0_f32_len, mul_mat_vec_q5_0_f32_data, "main", 3, 3 * sizeof(uint32_t), {1, 1, 1}, { device->subgroup_size }, 1); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_f32[GGML_TYPE_Q5_1], "mul_mat_vec_q5_1_f32", mul_mat_vec_q5_1_f32_len, mul_mat_vec_q5_1_f32_data, "main", 3, 3 * sizeof(uint32_t), {1, 1, 1}, { device->subgroup_size }, 1); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_f32[GGML_TYPE_Q8_0], "mul_mat_vec_q8_0_f32", mul_mat_vec_q8_0_f32_len, mul_mat_vec_q8_0_f32_data, "main", 3, 3 * sizeof(uint32_t), {1, 1, 1}, { device->subgroup_size }, 1); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_f32[GGML_TYPE_Q2_K], "mul_mat_vec_q2_K_f32", mul_mat_vec_q2_K_f32_len, mul_mat_vec_q2_K_f32_data, "main", 3, 3 * sizeof(uint32_t), {1, 1, 1}, { device->subgroup_size }, 1); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_f32[GGML_TYPE_Q3_K], "mul_mat_vec_q3_K_f32", mul_mat_vec_q3_K_f32_len, mul_mat_vec_q3_K_f32_data, "main", 3, 3 * sizeof(uint32_t), {1, 1, 1}, { device->subgroup_size }, 1); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_f32[GGML_TYPE_Q4_K], "mul_mat_vec_q4_K_f32", mul_mat_vec_q4_K_f32_len, mul_mat_vec_q4_K_f32_data, "main", 3, 3 * sizeof(uint32_t), {1, 1, 1}, { device->subgroup_size }, 1); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_f32[GGML_TYPE_Q5_K], "mul_mat_vec_q5_K_f32", mul_mat_vec_q5_K_f32_len, mul_mat_vec_q5_K_f32_data, "main", 3, 3 * sizeof(uint32_t), {1, 1, 1}, { device->subgroup_size }, 1); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_f32[GGML_TYPE_Q6_K], "mul_mat_vec_q6_K_f32", mul_mat_vec_q6_K_f32_len, mul_mat_vec_q6_K_f32_data, "main", 3, 3 * sizeof(uint32_t), {1, 1, 1}, { device->subgroup_size }, 1); // dequant shaders - ggml_vk_create_pipeline(ctx, ctx->pipeline_dequant[GGML_TYPE_F32 ], "f32_to_f16", f32_to_f16_len, f32_to_f16_data, "main", 2, 5 * sizeof(uint32_t), { 64, 1, 1}, {}, 1); - ggml_vk_create_pipeline(ctx, ctx->pipeline_dequant[GGML_TYPE_F16 ], "dequant_f16", dequant_f16_len, dequant_f16_data, "main", 2, 5 * sizeof(uint32_t), {256 * 32, 1, 1}, {}, 1); - ggml_vk_create_pipeline(ctx, ctx->pipeline_dequant[GGML_TYPE_Q4_0], "dequant_q4_0", dequant_q4_0_len, dequant_q4_0_data, "main", 2, 5 * sizeof(uint32_t), { 8 * 32, 1, 1}, {}, 1); - ggml_vk_create_pipeline(ctx, ctx->pipeline_dequant[GGML_TYPE_Q4_1], "dequant_q4_1", dequant_q4_1_len, dequant_q4_1_data, "main", 2, 5 * sizeof(uint32_t), {256 * 32, 1, 1}, {}, 1); - ggml_vk_create_pipeline(ctx, ctx->pipeline_dequant[GGML_TYPE_Q5_0], "dequant_q5_0", dequant_q5_0_len, dequant_q5_0_data, "main", 2, 5 * sizeof(uint32_t), {256 * 32, 1, 1}, {}, 1); - ggml_vk_create_pipeline(ctx, ctx->pipeline_dequant[GGML_TYPE_Q5_1], "dequant_q5_1", dequant_q5_1_len, dequant_q5_1_data, "main", 2, 5 * sizeof(uint32_t), {256 * 32, 1, 1}, {}, 1); - ggml_vk_create_pipeline(ctx, ctx->pipeline_dequant[GGML_TYPE_Q8_0], "dequant_q8_0", dequant_q8_0_len, dequant_q8_0_data, "main", 2, 5 * sizeof(uint32_t), {256 * 32, 1, 1}, {}, 1); - ggml_vk_create_pipeline(ctx, ctx->pipeline_dequant[GGML_TYPE_Q2_K], "dequant_q2_K", dequant_q2_K_len, dequant_q2_K_data, "main", 2, 5 * sizeof(uint32_t), {256 * 64, 1, 1}, {}, 1); - ggml_vk_create_pipeline(ctx, ctx->pipeline_dequant[GGML_TYPE_Q3_K], "dequant_q3_K", dequant_q3_K_len, dequant_q3_K_data, "main", 2, 5 * sizeof(uint32_t), {256 * 64, 1, 1}, {}, 1); - ggml_vk_create_pipeline(ctx, ctx->pipeline_dequant[GGML_TYPE_Q4_K], "dequant_q4_K", dequant_q4_K_len, dequant_q4_K_data, "main", 2, 5 * sizeof(uint32_t), {256 * 32, 1, 1}, {}, 1); - ggml_vk_create_pipeline(ctx, ctx->pipeline_dequant[GGML_TYPE_Q5_K], "dequant_q5_K", dequant_q5_K_len, dequant_q5_K_data, "main", 2, 5 * sizeof(uint32_t), {256 * 64, 1, 1}, {}, 1); - ggml_vk_create_pipeline(ctx, ctx->pipeline_dequant[GGML_TYPE_Q6_K], "dequant_q6_K", dequant_q6_K_len, dequant_q6_K_data, "main", 2, 5 * sizeof(uint32_t), {256 * 64, 1, 1}, {}, 1); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant[GGML_TYPE_F32 ], "f32_to_f16", f32_to_f16_len, f32_to_f16_data, "main", 2, 5 * sizeof(uint32_t), { 64, 1, 1}, {}, 1); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant[GGML_TYPE_F16 ], "dequant_f16", dequant_f16_len, dequant_f16_data, "main", 2, 5 * sizeof(uint32_t), {256 * 32, 1, 1}, {}, 1); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant[GGML_TYPE_Q4_0], "dequant_q4_0", dequant_q4_0_len, dequant_q4_0_data, "main", 2, 5 * sizeof(uint32_t), {256 * 16, 1, 1}, {}, 1); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant[GGML_TYPE_Q4_1], "dequant_q4_1", dequant_q4_1_len, dequant_q4_1_data, "main", 2, 5 * sizeof(uint32_t), {256 * 32, 1, 1}, {}, 1); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant[GGML_TYPE_Q5_0], "dequant_q5_0", dequant_q5_0_len, dequant_q5_0_data, "main", 2, 5 * sizeof(uint32_t), {256 * 32, 1, 1}, {}, 1); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant[GGML_TYPE_Q5_1], "dequant_q5_1", dequant_q5_1_len, dequant_q5_1_data, "main", 2, 5 * sizeof(uint32_t), {256 * 32, 1, 1}, {}, 1); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant[GGML_TYPE_Q8_0], "dequant_q8_0", dequant_q8_0_len, dequant_q8_0_data, "main", 2, 5 * sizeof(uint32_t), {256 * 32, 1, 1}, {}, 1); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant[GGML_TYPE_Q2_K], "dequant_q2_K", dequant_q2_K_len, dequant_q2_K_data, "main", 2, 5 * sizeof(uint32_t), {256 * 64, 1, 1}, {}, 1); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant[GGML_TYPE_Q3_K], "dequant_q3_K", dequant_q3_K_len, dequant_q3_K_data, "main", 2, 5 * sizeof(uint32_t), {256 * 64, 1, 1}, {}, 1); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant[GGML_TYPE_Q4_K], "dequant_q4_K", dequant_q4_K_len, dequant_q4_K_data, "main", 2, 5 * sizeof(uint32_t), {256 * 32, 1, 1}, {}, 1); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant[GGML_TYPE_Q5_K], "dequant_q5_K", dequant_q5_K_len, dequant_q5_K_data, "main", 2, 5 * sizeof(uint32_t), {256 * 64, 1, 1}, {}, 1); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant[GGML_TYPE_Q6_K], "dequant_q6_K", dequant_q6_K_len, dequant_q6_K_data, "main", 2, 5 * sizeof(uint32_t), {256 * 64, 1, 1}, {}, 1); // get_rows - ggml_vk_create_pipeline(ctx, ctx->pipeline_get_rows[GGML_TYPE_F16 ], "get_rows_f16", get_rows_f16_len, get_rows_f16_data, "main", 3, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1); - ggml_vk_create_pipeline(ctx, ctx->pipeline_get_rows[GGML_TYPE_Q4_0], "get_rows_q4_0", get_rows_q4_0_len, get_rows_q4_0_data, "main", 3, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1); - ggml_vk_create_pipeline(ctx, ctx->pipeline_get_rows[GGML_TYPE_Q4_1], "get_rows_q4_1", get_rows_q4_1_len, get_rows_q4_1_data, "main", 3, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1); - ggml_vk_create_pipeline(ctx, ctx->pipeline_get_rows[GGML_TYPE_Q5_0], "get_rows_q5_0", get_rows_q5_0_len, get_rows_q5_0_data, "main", 3, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1); - ggml_vk_create_pipeline(ctx, ctx->pipeline_get_rows[GGML_TYPE_Q5_1], "get_rows_q5_1", get_rows_q5_1_len, get_rows_q5_1_data, "main", 3, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1); - ggml_vk_create_pipeline(ctx, ctx->pipeline_get_rows[GGML_TYPE_Q8_0], "get_rows_q8_0", get_rows_q8_0_len, get_rows_q8_0_data, "main", 3, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_get_rows[GGML_TYPE_F16 ], "get_rows_f16", get_rows_f16_len, get_rows_f16_data, "main", 3, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_get_rows[GGML_TYPE_Q4_0], "get_rows_q4_0", get_rows_q4_0_len, get_rows_q4_0_data, "main", 3, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_get_rows[GGML_TYPE_Q4_1], "get_rows_q4_1", get_rows_q4_1_len, get_rows_q4_1_data, "main", 3, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_get_rows[GGML_TYPE_Q5_0], "get_rows_q5_0", get_rows_q5_0_len, get_rows_q5_0_data, "main", 3, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_get_rows[GGML_TYPE_Q5_1], "get_rows_q5_1", get_rows_q5_1_len, get_rows_q5_1_data, "main", 3, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_get_rows[GGML_TYPE_Q8_0], "get_rows_q8_0", get_rows_q8_0_len, get_rows_q8_0_data, "main", 3, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1); - ggml_vk_create_pipeline(ctx, ctx->pipeline_get_rows_f32[GGML_TYPE_F32 ], "get_rows_f16_f32", get_rows_f16_f32_len, get_rows_f16_f32_data, "main", 3, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1); - ggml_vk_create_pipeline(ctx, ctx->pipeline_get_rows_f32[GGML_TYPE_Q4_0], "get_rows_q4_0_f32", get_rows_q4_0_f32_len, get_rows_q4_0_f32_data, "main", 3, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1); - ggml_vk_create_pipeline(ctx, ctx->pipeline_get_rows_f32[GGML_TYPE_Q4_1], "get_rows_q4_1_f32", get_rows_q4_1_f32_len, get_rows_q4_1_f32_data, "main", 3, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1); - ggml_vk_create_pipeline(ctx, ctx->pipeline_get_rows_f32[GGML_TYPE_Q5_0], "get_rows_q5_0_f32", get_rows_q5_0_f32_len, get_rows_q5_0_f32_data, "main", 3, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1); - ggml_vk_create_pipeline(ctx, ctx->pipeline_get_rows_f32[GGML_TYPE_Q5_1], "get_rows_q5_1_f32", get_rows_q5_1_f32_len, get_rows_q5_1_f32_data, "main", 3, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1); - ggml_vk_create_pipeline(ctx, ctx->pipeline_get_rows_f32[GGML_TYPE_Q8_0], "get_rows_q8_0_f32", get_rows_q8_0_f32_len, get_rows_q8_0_f32_data, "main", 3, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_get_rows_f32[GGML_TYPE_F32 ], "get_rows_f16_f32", get_rows_f16_f32_len, get_rows_f16_f32_data, "main", 3, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_get_rows_f32[GGML_TYPE_Q4_0], "get_rows_q4_0_f32", get_rows_q4_0_f32_len, get_rows_q4_0_f32_data, "main", 3, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_get_rows_f32[GGML_TYPE_Q4_1], "get_rows_q4_1_f32", get_rows_q4_1_f32_len, get_rows_q4_1_f32_data, "main", 3, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_get_rows_f32[GGML_TYPE_Q5_0], "get_rows_q5_0_f32", get_rows_q5_0_f32_len, get_rows_q5_0_f32_data, "main", 3, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_get_rows_f32[GGML_TYPE_Q5_1], "get_rows_q5_1_f32", get_rows_q5_1_f32_len, get_rows_q5_1_f32_data, "main", 3, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_get_rows_f32[GGML_TYPE_Q8_0], "get_rows_q8_0_f32", get_rows_q8_0_f32_len, get_rows_q8_0_f32_data, "main", 3, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1); - ggml_vk_create_pipeline(ctx, ctx->pipeline_matmul_split_k_reduce, "split_k_reduce", split_k_reduce_len, split_k_reduce_data, "main", 2, 2 * sizeof(uint32_t), {256, 1, 1}, {}, 1); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_split_k_reduce, "split_k_reduce", split_k_reduce_len, split_k_reduce_data, "main", 2, 2 * sizeof(uint32_t), {256, 1, 1}, {}, 1); - ggml_vk_create_pipeline(ctx, ctx->pipeline_mul_mat_vec_p021_f16_f32, "mul_mat_vec_p021_f16_f32", mul_mat_vec_p021_f16_f32_len, mul_mat_vec_p021_f16_f32_data, "main", 3, 6 * sizeof(uint32_t), {1, 1, 1}, {}, 1); - ggml_vk_create_pipeline(ctx, ctx->pipeline_mul_mat_vec_nc_f16_f32, "mul_mat_vec_nc_f16_f32", mul_mat_vec_nc_f16_f32_len, mul_mat_vec_nc_f16_f32_data, "main", 3, 7 * sizeof(uint32_t), {1, 1, 1}, {}, 1); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_mul_mat_vec_p021_f16_f32, "mul_mat_vec_p021_f16_f32", mul_mat_vec_p021_f16_f32_len, mul_mat_vec_p021_f16_f32_data, "main", 3, 6 * sizeof(uint32_t), {1, 1, 1}, {}, 1); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_mul_mat_vec_nc_f16_f32, "mul_mat_vec_nc_f16_f32", mul_mat_vec_nc_f16_f32_len, mul_mat_vec_nc_f16_f32_data, "main", 3, 7 * sizeof(uint32_t), {1, 1, 1}, {}, 1); - ggml_vk_create_pipeline(ctx, ctx->pipeline_norm_f32, "norm_f32", norm_f32_len, norm_f32_data, "main", 2, sizeof(vk_op_push_constants), {1, 1, 1}, {}, 1); - ggml_vk_create_pipeline(ctx, ctx->pipeline_rms_norm_f32, "rms_norm_f32", rms_norm_f32_len, rms_norm_f32_data, "main", 2, sizeof(vk_op_push_constants), {1, 1, 1}, {}, 1); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_norm_f32, "norm_f32", norm_f32_len, norm_f32_data, "main", 2, sizeof(vk_op_push_constants), {1, 1, 1}, {}, 1); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_rms_norm_f32, "rms_norm_f32", rms_norm_f32_len, rms_norm_f32_data, "main", 2, sizeof(vk_op_push_constants), {1, 1, 1}, {}, 1); - ggml_vk_create_pipeline(ctx, ctx->pipeline_cpy_f32_f32, "cpy_f32_f32", cpy_f32_f32_len, cpy_f32_f32_data, "main", 2, sizeof(vk_op_cpy_push_constants), {512, 1, 1}, {}, 1); - ggml_vk_create_pipeline(ctx, ctx->pipeline_cpy_f32_f16, "cpy_f32_f16", cpy_f32_f16_len, cpy_f32_f16_data, "main", 2, sizeof(vk_op_cpy_push_constants), {512, 1, 1}, {}, 1); - ggml_vk_create_pipeline(ctx, ctx->pipeline_cpy_f16_f16, "cpy_f16_f16", cpy_f16_f16_len, cpy_f16_f16_data, "main", 2, sizeof(vk_op_cpy_push_constants), {512, 1, 1}, {}, 1); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_cpy_f32_f32, "cpy_f32_f32", cpy_f32_f32_len, cpy_f32_f32_data, "main", 2, sizeof(vk_op_cpy_push_constants), {512, 1, 1}, {}, 1); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_cpy_f32_f16, "cpy_f32_f16", cpy_f32_f16_len, cpy_f32_f16_data, "main", 2, sizeof(vk_op_cpy_push_constants), {512, 1, 1}, {}, 1); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_cpy_f16_f16, "cpy_f16_f16", cpy_f16_f16_len, cpy_f16_f16_data, "main", 2, sizeof(vk_op_cpy_push_constants), {512, 1, 1}, {}, 1); - ggml_vk_create_pipeline(ctx, ctx->pipeline_add_f32, "add_f32", add_f32_len, add_f32_data, "main", 3, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_add_f32, "add_f32", add_f32_len, add_f32_data, "main", 3, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1); - ggml_vk_create_pipeline(ctx, ctx->pipeline_mul_f32, "mul_f32", mul_f32_len, mul_f32_data, "main", 3, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_mul_f32, "mul_f32", mul_f32_len, mul_f32_data, "main", 3, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1); - ggml_vk_create_pipeline(ctx, ctx->pipeline_scale_f32, "scale_f32", scale_f32_len, scale_f32_data, "main", 2, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_scale_f32, "scale_f32", scale_f32_len, scale_f32_data, "main", 2, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1); - ggml_vk_create_pipeline(ctx, ctx->pipeline_sqr_f32, "sqr_f32", sqr_f32_len, sqr_f32_data, "main", 2, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_sqr_f32, "sqr_f32", sqr_f32_len, sqr_f32_data, "main", 2, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1); - ggml_vk_create_pipeline(ctx, ctx->pipeline_clamp_f32, "clamp_f32", clamp_f32_len, clamp_f32_data, "main", 2, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_clamp_f32, "clamp_f32", clamp_f32_len, clamp_f32_data, "main", 2, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1); - ggml_vk_create_pipeline(ctx, ctx->pipeline_gelu_f32, "gelu_f32", gelu_f32_len, gelu_f32_data, "main", 2, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1); - ggml_vk_create_pipeline(ctx, ctx->pipeline_silu_f32, "silu_f32", silu_f32_len, silu_f32_data, "main", 2, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1); - ggml_vk_create_pipeline(ctx, ctx->pipeline_relu_f32, "relu_f32", relu_f32_len, relu_f32_data, "main", 2, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_gelu_f32, "gelu_f32", gelu_f32_len, gelu_f32_data, "main", 2, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_silu_f32, "silu_f32", silu_f32_len, silu_f32_data, "main", 2, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_relu_f32, "relu_f32", relu_f32_len, relu_f32_data, "main", 2, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1); - ggml_vk_create_pipeline(ctx, ctx->pipeline_diag_mask_inf_f32, "diag_mask_inf_f32", diag_mask_inf_f32_len, diag_mask_inf_f32_data, "main", 2, sizeof(vk_op_diag_mask_push_constants), {512, 1, 1}, {}, 1); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_diag_mask_inf_f32, "diag_mask_inf_f32", diag_mask_inf_f32_len, diag_mask_inf_f32_data, "main", 2, sizeof(vk_op_diag_mask_push_constants), {512, 1, 1}, {}, 1); - ggml_vk_create_pipeline(ctx, ctx->pipeline_soft_max_f32, "soft_max_f32", soft_max_f32_len, soft_max_f32_data, "main", 3, sizeof(vk_op_push_constants), {1, 1, 1}, {}, 1); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_soft_max_f32, "soft_max_f32", soft_max_f32_len, soft_max_f32_data, "main", 3, sizeof(vk_op_push_constants), {1, 1, 1}, {}, 1); - ggml_vk_create_pipeline(ctx, ctx->pipeline_rope_f32, "rope_f32", rope_f32_len, rope_f32_data, "main", 3, sizeof(vk_op_rope_push_constants), {1, 512, 1}, {}, 1); - ggml_vk_create_pipeline(ctx, ctx->pipeline_rope_f16, "rope_f16", rope_f16_len, rope_f16_data, "main", 3, sizeof(vk_op_rope_push_constants), {1, 512, 1}, {}, 1); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_rope_f32, "rope_f32", rope_f32_len, rope_f32_data, "main", 3, sizeof(vk_op_rope_push_constants), {1, 512, 1}, {}, 1); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_rope_f16, "rope_f16", rope_f16_len, rope_f16_data, "main", 3, sizeof(vk_op_rope_push_constants), {1, 512, 1}, {}, 1); - ggml_vk_create_pipeline(ctx, ctx->pipeline_rope_neox_f32, "rope_neox_f32", rope_neox_f32_len, rope_neox_f32_data, "main", 3, sizeof(vk_op_rope_neox_push_constants), {1, 512, 1}, {}, 1); - ggml_vk_create_pipeline(ctx, ctx->pipeline_rope_neox_f16, "rope_neox_f16", rope_neox_f16_len, rope_neox_f16_data, "main", 3, sizeof(vk_op_rope_neox_push_constants), {1, 512, 1}, {}, 1); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_rope_neox_f32, "rope_neox_f32", rope_neox_f32_len, rope_neox_f32_data, "main", 3, sizeof(vk_op_rope_neox_push_constants), {1, 512, 1}, {}, 1); + ggml_vk_create_pipeline(ctx, ctx->device->pipeline_rope_neox_f16, "rope_neox_f16", rope_neox_f16_len, rope_neox_f16_data, "main", 3, sizeof(vk_op_rope_neox_push_constants), {1, 512, 1}, {}, 1); } static void ggml_vk_print_gpu_info(size_t idx) { @@ -1157,140 +1204,148 @@ void ggml_vk_init(ggml_backend_vk_context * ctx, size_t idx) { throw std::runtime_error("Device not found"); } - vk_instance.devices[idx] = std::make_shared(); - ctx->device = vk_instance.devices[idx]; - ctx->device.lock()->physical_device = devices[dev_num]; - std::vector ext_props = ctx->device.lock()->physical_device.enumerateDeviceExtensionProperties(); + ctx->device = ggml_vk_get_device(idx); + if (!ctx->device->initialized) { + ctx->device->physical_device = devices[dev_num]; + std::vector ext_props = ctx->device->physical_device.enumerateDeviceExtensionProperties(); - bool maintenance4_support = false; + bool maintenance4_support = false; - // Check if maintenance4 is supported - for (auto properties : ext_props) { - if (strcmp("VK_KHR_maintenance4", properties.extensionName) == 0) { - maintenance4_support = true; + // Check if maintenance4 is supported + for (auto properties : ext_props) { + if (strcmp("VK_KHR_maintenance4", properties.extensionName) == 0) { + maintenance4_support = true; + } } - } - vk::PhysicalDeviceProperties2 props2; - vk::PhysicalDeviceMaintenance3Properties props3; - vk::PhysicalDeviceMaintenance4Properties props4; - vk::PhysicalDeviceSubgroupProperties subgroup_props; - props2.pNext = &props3; - props3.pNext = &subgroup_props; - if (maintenance4_support) { - subgroup_props.pNext = &props4; - } - ctx->device.lock()->physical_device.getProperties2(&props2); - ctx->device.lock()->properties = props2.properties; - - if (maintenance4_support) { - ctx->device.lock()->max_memory_allocation_size = std::min(props3.maxMemoryAllocationSize, props4.maxBufferSize); - } else { - ctx->device.lock()->max_memory_allocation_size = props3.maxMemoryAllocationSize; - } - - ctx->device.lock()->vendor_id = ctx->device.lock()->properties.vendorID; - ctx->device.lock()->subgroup_size = subgroup_props.subgroupSize; - ctx->device.lock()->uma = ctx->device.lock()->properties.deviceType == vk::PhysicalDeviceType::eIntegratedGpu; - - bool fp16_storage = false; - bool fp16_compute = false; - - for (auto properties : ext_props) { - if (strcmp("VK_KHR_16bit_storage", properties.extensionName) == 0) { - fp16_storage = true; - } else if (strcmp("VK_KHR_shader_float16_int8", properties.extensionName) == 0) { - fp16_compute = true; + vk::PhysicalDeviceProperties2 props2; + vk::PhysicalDeviceMaintenance3Properties props3; + vk::PhysicalDeviceMaintenance4Properties props4; + vk::PhysicalDeviceSubgroupProperties subgroup_props; + props2.pNext = &props3; + props3.pNext = &subgroup_props; + if (maintenance4_support) { + subgroup_props.pNext = &props4; } - } + ctx->device->physical_device.getProperties2(&props2); + ctx->device->properties = props2.properties; - const char* GGML_VULKAN_DISABLE_F16 = getenv("GGML_VULKAN_DISABLE_F16"); - bool force_disable_f16 = GGML_VULKAN_DISABLE_F16 != nullptr; + if (maintenance4_support) { + ctx->device->max_memory_allocation_size = std::min(props3.maxMemoryAllocationSize, props4.maxBufferSize); + } else { + ctx->device->max_memory_allocation_size = props3.maxMemoryAllocationSize; + } - ctx->device.lock()->fp16 = !force_disable_f16 && fp16_storage && fp16_compute; + ctx->device->vendor_id = ctx->device->properties.vendorID; + ctx->device->subgroup_size = subgroup_props.subgroupSize; + ctx->device->uma = ctx->device->properties.deviceType == vk::PhysicalDeviceType::eIntegratedGpu; - std::vector queue_family_props = ctx->device.lock()->physical_device.getQueueFamilyProperties(); + bool fp16_storage = false; + bool fp16_compute = false; - // Try to find a non-graphics compute queue and transfer-focused queues - const uint32_t compute_queue_family_index = ggml_vk_find_queue_family_index(queue_family_props, vk::QueueFlagBits::eCompute, vk::QueueFlagBits::eGraphics, -1, 1); - const uint32_t transfer_queue_family_index = ggml_vk_find_queue_family_index(queue_family_props, vk::QueueFlagBits::eTransfer, vk::QueueFlagBits::eCompute | vk::QueueFlagBits::eGraphics, compute_queue_family_index, 1); + for (auto properties : ext_props) { + if (strcmp("VK_KHR_16bit_storage", properties.extensionName) == 0) { + fp16_storage = true; + } else if (strcmp("VK_KHR_shader_float16_int8", properties.extensionName) == 0) { + fp16_compute = true; + } + } - const float priorities[] = { 1.0f, 1.0f }; - ctx->device.lock()->single_queue = compute_queue_family_index == transfer_queue_family_index && queue_family_props[compute_queue_family_index].queueCount == 1; + const char* GGML_VULKAN_DISABLE_F16 = getenv("GGML_VULKAN_DISABLE_F16"); + bool force_disable_f16 = GGML_VULKAN_DISABLE_F16 != nullptr; - std::vector device_queue_create_infos; - if (compute_queue_family_index != transfer_queue_family_index) { - device_queue_create_infos.push_back({vk::DeviceQueueCreateFlags(), compute_queue_family_index, 1, priorities}); - device_queue_create_infos.push_back({vk::DeviceQueueCreateFlags(), transfer_queue_family_index, 1, priorities + 1}); - } else if(!ctx->device.lock()->single_queue) { - device_queue_create_infos.push_back({vk::DeviceQueueCreateFlags(), compute_queue_family_index, 2, priorities}); - } else { - device_queue_create_infos.push_back({vk::DeviceQueueCreateFlags(), compute_queue_family_index, 1, priorities}); - } - vk::DeviceCreateInfo device_create_info; - std::vector device_extensions; - vk::PhysicalDeviceFeatures device_features = ctx->device.lock()->physical_device.getFeatures(); + ctx->device->fp16 = !force_disable_f16 && fp16_storage && fp16_compute; - VkPhysicalDeviceFeatures2 device_features2; - device_features2.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_FEATURES_2; - device_features2.pNext = nullptr; - device_features2.features = (VkPhysicalDeviceFeatures)device_features; + std::vector queue_family_props = ctx->device->physical_device.getQueueFamilyProperties(); - VkPhysicalDeviceVulkan11Features vk11_features; - vk11_features.pNext = nullptr; - vk11_features.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_VULKAN_1_1_FEATURES; - device_features2.pNext = &vk11_features; + // Try to find a non-graphics compute queue and transfer-focused queues + const uint32_t compute_queue_family_index = ggml_vk_find_queue_family_index(queue_family_props, vk::QueueFlagBits::eCompute, vk::QueueFlagBits::eGraphics, -1, 1); + const uint32_t transfer_queue_family_index = ggml_vk_find_queue_family_index(queue_family_props, vk::QueueFlagBits::eTransfer, vk::QueueFlagBits::eCompute | vk::QueueFlagBits::eGraphics, compute_queue_family_index, 1); - VkPhysicalDeviceVulkan12Features vk12_features; - vk12_features.pNext = nullptr; - vk12_features.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_VULKAN_1_2_FEATURES; - vk11_features.pNext = &vk12_features; + const float priorities[] = { 1.0f, 1.0f }; + ctx->device->single_queue = compute_queue_family_index == transfer_queue_family_index && queue_family_props[compute_queue_family_index].queueCount == 1; - vkGetPhysicalDeviceFeatures2(ctx->device.lock()->physical_device, &device_features2); + std::vector device_queue_create_infos; + if (compute_queue_family_index != transfer_queue_family_index) { + device_queue_create_infos.push_back({vk::DeviceQueueCreateFlags(), compute_queue_family_index, 1, priorities}); + device_queue_create_infos.push_back({vk::DeviceQueueCreateFlags(), transfer_queue_family_index, 1, priorities + 1}); + } else if(!ctx->device->single_queue) { + device_queue_create_infos.push_back({vk::DeviceQueueCreateFlags(), compute_queue_family_index, 2, priorities}); + } else { + device_queue_create_infos.push_back({vk::DeviceQueueCreateFlags(), compute_queue_family_index, 1, priorities}); + } + vk::DeviceCreateInfo device_create_info; + std::vector device_extensions; + vk::PhysicalDeviceFeatures device_features = ctx->device->physical_device.getFeatures(); - ctx->device.lock()->fp16 = ctx->device.lock()->fp16 && vk12_features.shaderFloat16; + VkPhysicalDeviceFeatures2 device_features2; + device_features2.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_FEATURES_2; + device_features2.pNext = nullptr; + device_features2.features = (VkPhysicalDeviceFeatures)device_features; - if (!vk11_features.storageBuffer16BitAccess) { - std::cerr << "ggml_vulkan: device " << GGML_VK_NAME << idx << " does not support 16-bit storage." << std::endl; - throw std::runtime_error("Unsupported device"); - } + VkPhysicalDeviceVulkan11Features vk11_features; + vk11_features.pNext = nullptr; + vk11_features.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_VULKAN_1_1_FEATURES; + device_features2.pNext = &vk11_features; - device_extensions.push_back("VK_KHR_16bit_storage"); + VkPhysicalDeviceVulkan12Features vk12_features; + vk12_features.pNext = nullptr; + vk12_features.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_VULKAN_1_2_FEATURES; + vk11_features.pNext = &vk12_features; + + vkGetPhysicalDeviceFeatures2(ctx->device->physical_device, &device_features2); + + ctx->device->fp16 = ctx->device->fp16 && vk12_features.shaderFloat16; + + if (!vk11_features.storageBuffer16BitAccess) { + std::cerr << "ggml_vulkan: device " << GGML_VK_NAME << idx << " does not support 16-bit storage." << std::endl; + throw std::runtime_error("Unsupported device"); + } + + device_extensions.push_back("VK_KHR_16bit_storage"); #ifdef GGML_VULKAN_VALIDATE - device_extensions.push_back("VK_KHR_shader_non_semantic_info"); + device_extensions.push_back("VK_KHR_shader_non_semantic_info"); #endif - if (ctx->device.lock()->fp16) { - device_extensions.push_back("VK_KHR_shader_float16_int8"); - } - ctx->device.lock()->name = ctx->device.lock()->properties.deviceName.data(); + if (ctx->device->fp16) { + device_extensions.push_back("VK_KHR_shader_float16_int8"); + } + ctx->device->name = ctx->device->properties.deviceName.data(); - device_create_info = { - vk::DeviceCreateFlags(), - device_queue_create_infos, - {}, - device_extensions - }; - device_create_info.setPNext(&device_features2); - ctx->device.lock()->device = ctx->device.lock()->physical_device.createDevice(device_create_info); + device_create_info = { + vk::DeviceCreateFlags(), + device_queue_create_infos, + {}, + device_extensions + }; + device_create_info.setPNext(&device_features2); + ctx->device->device = ctx->device->physical_device.createDevice(device_create_info); - ctx->device.lock()->descriptor_set_mode = VK_DEVICE_DESCRIPTOR_POOL_MODE_UNKNOWN; + ctx->device->descriptor_set_mode = VK_DEVICE_DESCRIPTOR_POOL_MODE_UNKNOWN; - // Shaders - ggml_vk_load_shaders(ctx); + // Queues + ggml_vk_create_queue(ctx, ctx->device->compute_queue, compute_queue_family_index, 0, { vk::PipelineStageFlagBits::eComputeShader | vk::PipelineStageFlagBits::eTransfer }); - // Queues - ggml_vk_create_queue(ctx, ctx->device.lock()->compute_queue, compute_queue_family_index, 0, { vk::PipelineStageFlagBits::eComputeShader | vk::PipelineStageFlagBits::eTransfer }); - if (!ctx->device.lock()->single_queue) { - const uint32_t transfer_queue_index = compute_queue_family_index == transfer_queue_family_index ? 1 : 0; - ggml_vk_create_queue(ctx, ctx->device.lock()->transfer_queue, transfer_queue_family_index, transfer_queue_index, { vk::PipelineStageFlagBits::eTransfer }); - } else { - // TODO: Use pointer or reference to avoid copy - ctx->device.lock()->transfer_queue = ctx->device.lock()->compute_queue; + // Shaders + ggml_vk_load_shaders(ctx); + + if (!ctx->device->single_queue) { + const uint32_t transfer_queue_index = compute_queue_family_index == transfer_queue_family_index ? 1 : 0; + ggml_vk_create_queue(ctx, ctx->device->transfer_queue, transfer_queue_family_index, transfer_queue_index, { vk::PipelineStageFlagBits::eTransfer }); + } else { + // TODO: Use pointer or reference to avoid copy + ctx->device->transfer_queue = ctx->device->compute_queue; + } + + ctx->device->idx = dev_num; + ctx->device->initialized = true; + } else if (ctx->device->idx != dev_num) { + std::cerr << "ggml_vulkan: Device " << ctx->device->name << " already initialized with index " << ctx->device->idx << ", but trying to reinitialize with index " << dev_num << std::endl; + throw std::runtime_error("Device already initialized"); } - ctx->fence = ctx->device.lock()->device.createFence({}); + ctx->fence = ctx->device->device.createFence({}); ctx->compute_ctx = nullptr; ctx->transfer_ctx = nullptr; @@ -1308,7 +1363,7 @@ void ggml_vk_init(ggml_backend_vk_context * ctx, size_t idx) { #endif } -static vk_pipeline* ggml_vk_get_to_fp16(ggml_backend_vk_context * ctx, ggml_type type) { +static vk_pipeline ggml_vk_get_to_fp16(ggml_backend_vk_context * ctx, ggml_type type) { #ifdef GGML_VULKAN_DEBUG std::cerr << "ggml_vk_get_to_fp16()" << std::endl; #endif @@ -1329,10 +1384,10 @@ static vk_pipeline* ggml_vk_get_to_fp16(ggml_backend_vk_context * ctx, ggml_type return nullptr; } - return &ctx->pipeline_dequant[type]; + return ctx->device->pipeline_dequant[type]; } -static vk_pipeline* ggml_vk_get_dequantize_mul_mat_vec(ggml_backend_vk_context * ctx, ggml_type type) { +static vk_pipeline ggml_vk_get_dequantize_mul_mat_vec(ggml_backend_vk_context * ctx, ggml_type type) { #ifdef GGML_VULKAN_DEBUG std::cerr << "ggml_vk_get_dequantize_mul_mat_vec()" << std::endl; #endif @@ -1353,7 +1408,7 @@ static vk_pipeline* ggml_vk_get_dequantize_mul_mat_vec(ggml_backend_vk_context * return nullptr; } - return &ctx->pipeline_dequant_mul_mat_vec_f32[type]; + return ctx->device->pipeline_dequant_mul_mat_vec_f32[type]; } static vk_buffer ggml_vk_pool_malloc(ggml_backend_vk_context * ctx, size_t size) { @@ -1430,8 +1485,8 @@ static void * ggml_vk_host_malloc(ggml_backend_vk_context * ctx, size_t size) { if(!(buf->memory_property_flags & vk::MemoryPropertyFlagBits::eHostVisible)) { fprintf(stderr, "WARNING: failed to allocate %.2f MB of pinned memory\n", size/1024.0/1024.0); - ctx->device.lock()->device.freeMemory(buf->device_memory); - ctx->device.lock()->device.destroyBuffer(buf->buffer); + ctx->device->device.freeMemory(buf->device_memory); + ctx->device->device.destroyBuffer(buf->buffer); return nullptr; } @@ -1495,30 +1550,30 @@ static vk_submission ggml_vk_begin_submission(ggml_backend_vk_context * ctx, vk_ } static void ggml_vk_dispatch_pipeline(ggml_backend_vk_context * ctx, vk_context * subctx, vk_pipeline& pipeline, std::vector&& buffers, size_t push_constant_size, const void* push_constants, std::array elements) { - const uint32_t wg0 = CEIL_DIV(elements[0], pipeline.wg_denoms[0]); - const uint32_t wg1 = CEIL_DIV(elements[1], pipeline.wg_denoms[1]); - const uint32_t wg2 = CEIL_DIV(elements[2], pipeline.wg_denoms[2]); + const uint32_t wg0 = CEIL_DIV(elements[0], pipeline->wg_denoms[0]); + const uint32_t wg1 = CEIL_DIV(elements[1], pipeline->wg_denoms[1]); + const uint32_t wg2 = CEIL_DIV(elements[2], pipeline->wg_denoms[2]); #ifdef GGML_VULKAN_DEBUG - std::cerr << "ggml_vk_dispatch_pipeline(" << pipeline.name << ", (" << wg0 << "," << wg1 << "," << wg2 << "))" << std::endl; + std::cerr << "ggml_vk_dispatch_pipeline(" << pipeline->name << ", (" << wg0 << "," << wg1 << "," << wg2 << "))" << std::endl; #endif std::vector descriptor_buffer_infos; std::vector write_descriptor_sets; - GGML_ASSERT(pipeline.descriptor_set_idx < pipeline.descriptor_sets.size()); - GGML_ASSERT(buffers.size() == pipeline.parameter_count); - vk::DescriptorSet& descriptor_set = pipeline.descriptor_sets[pipeline.descriptor_set_idx++]; - for (uint32_t i = 0; i < pipeline.parameter_count; i++) { + GGML_ASSERT(pipeline->descriptor_set_idx < pipeline->descriptor_sets.size()); + GGML_ASSERT(buffers.size() == pipeline->parameter_count); + vk::DescriptorSet& descriptor_set = pipeline->descriptor_sets[pipeline->descriptor_set_idx++]; + for (uint32_t i = 0; i < pipeline->parameter_count; i++) { descriptor_buffer_infos.push_back({buffers[i].buffer->buffer, buffers[i].offset, buffers[i].size}); } - for (uint32_t i = 0; i < pipeline.parameter_count; i++) { + for (uint32_t i = 0; i < pipeline->parameter_count; i++) { write_descriptor_sets.push_back({descriptor_set, i, 0, 1, vk::DescriptorType::eStorageBuffer, nullptr, &descriptor_buffer_infos[i]}); } - ctx->device.lock()->device.updateDescriptorSets(write_descriptor_sets, {}); + ctx->device->device.updateDescriptorSets(write_descriptor_sets, {}); - subctx->s->buffer.pushConstants(pipeline.layout, vk::ShaderStageFlagBits::eCompute, 0, push_constant_size, push_constants); - subctx->s->buffer.bindPipeline(vk::PipelineBindPoint::eCompute, pipeline.pipeline); + subctx->s->buffer.pushConstants(pipeline->layout, vk::ShaderStageFlagBits::eCompute, 0, push_constant_size, push_constants); + subctx->s->buffer.bindPipeline(vk::PipelineBindPoint::eCompute, pipeline->pipeline); subctx->s->buffer.bindDescriptorSets(vk::PipelineBindPoint::eCompute, - pipeline.layout, + pipeline->layout, 0, { descriptor_set }, {}); @@ -1775,7 +1830,7 @@ static void ggml_vk_buffer_write_2d(ggml_backend_vk_context * ctx, vk_buffer& ds memcpy((uint8_t *)dst->ptr + offset + i * width, (const uint8_t *) src + i * spitch, width); } } else { - vk_context * subctx = ggml_vk_create_context(ctx, ctx->device.lock()->transfer_queue); + vk_context * subctx = ggml_vk_create_context(ctx, ctx->device->transfer_queue); ggml_vk_ctx_begin(ctx, subctx); ggml_vk_buffer_write_2d_async(ctx, subctx, dst, offset, src, spitch, width, height, true); ggml_vk_ctx_end(subctx); @@ -1785,8 +1840,8 @@ static void ggml_vk_buffer_write_2d(ggml_backend_vk_context * ctx, vk_buffer& ds } ggml_vk_submit(subctx, ctx->fence); - VK_CHECK(ctx->device.lock()->device.waitForFences({ ctx->fence }, true, UINT64_MAX), "vk_buffer_write_2d waitForFences"); - ctx->device.lock()->device.resetFences({ ctx->fence }); + VK_CHECK(ctx->device->device.waitForFences({ ctx->fence }, true, UINT64_MAX), "vk_buffer_write_2d waitForFences"); + ctx->device->device.resetFences({ ctx->fence }); } } @@ -1871,14 +1926,14 @@ static void ggml_vk_buffer_read(ggml_backend_vk_context * ctx, vk_buffer& src, s memcpy(dst, (uint8_t *) src->ptr + offset, size); } else { - vk_context * subctx = ggml_vk_create_context(ctx, ctx->device.lock()->transfer_queue); + vk_context * subctx = ggml_vk_create_context(ctx, ctx->device->transfer_queue); ggml_vk_ctx_begin(ctx, subctx); ggml_vk_buffer_read_async(ctx, subctx, src, offset, dst, size, true); ggml_vk_ctx_end(subctx); ggml_vk_submit(subctx, ctx->fence); - VK_CHECK(ctx->device.lock()->device.waitForFences({ ctx->fence }, true, UINT64_MAX), "vk_buffer_read waitForFences"); - ctx->device.lock()->device.resetFences({ ctx->fence }); + VK_CHECK(ctx->device->device.waitForFences({ ctx->fence }, true, UINT64_MAX), "vk_buffer_read waitForFences"); + ctx->device->device.resetFences({ ctx->fence }); for (auto& cpy : subctx->out_memcpys) { memcpy(cpy.dst, cpy.src, cpy.n); @@ -1908,13 +1963,13 @@ static void ggml_vk_buffer_copy(vk_buffer& dst, size_t dst_offset, vk_buffer& sr VkBufferCopy bc{ src_offset, dst_offset, size }; - vk_context * subctx = ggml_vk_create_context(ctx, ctx->device.lock()->transfer_queue); + vk_context * subctx = ggml_vk_create_context(ctx, ctx->device->transfer_queue); ggml_vk_ctx_begin(ctx, subctx); ggml_vk_buffer_copy_async(subctx, dst, dst_offset, src, src_offset, size); ggml_vk_ctx_end(subctx); ggml_vk_submit(subctx, ctx->fence); - VK_CHECK(ctx->device.lock()->device.waitForFences({ ctx->fence }, true, UINT64_MAX), "vk_buffer_copy waitForFences"); - ctx->device.lock()->device.resetFences({ ctx->fence }); + VK_CHECK(ctx->device->device.waitForFences({ ctx->fence }, true, UINT64_MAX), "vk_buffer_copy waitForFences"); + ctx->device->device.resetFences({ ctx->fence }); } else { #ifdef GGML_VULKAN_DEBUG std::cerr << "ggml_vk_buffer_copy(MULTI_DEVICE, " << size << ")" << std::endl; @@ -1942,14 +1997,14 @@ static void ggml_vk_buffer_memset(ggml_backend_vk_context * ctx, vk_buffer& dst, // Make sure ctx owns the buffer GGML_ASSERT(dst->ctx == ctx); - vk_context * subctx = ggml_vk_create_context(ctx, ctx->device.lock()->transfer_queue); + vk_context * subctx = ggml_vk_create_context(ctx, ctx->device->transfer_queue); ggml_vk_ctx_begin(ctx, subctx); subctx->s->buffer.fillBuffer(dst->buffer, offset, size, c); ggml_vk_ctx_end(subctx); ggml_vk_submit(subctx, ctx->fence); - VK_CHECK(ctx->device.lock()->device.waitForFences({ ctx->fence }, true, UINT64_MAX), "vk_memset waitForFences"); - ctx->device.lock()->device.resetFences({ ctx->fence }); + VK_CHECK(ctx->device->device.waitForFences({ ctx->fence }, true, UINT64_MAX), "vk_memset waitForFences"); + ctx->device->device.resetFences({ ctx->fence }); } static void ggml_vk_h2d_tensor_2d(ggml_backend_vk_context * ctx, vk_context * subctx, vk_buffer& dst, size_t offset, const ggml_tensor * src, uint64_t i3, uint64_t i2, uint64_t i1) { @@ -2030,74 +2085,74 @@ static uint32_t ggml_vk_guess_matmul_pipeline_align(ggml_backend_vk_context * ct std::cerr << "ggml_vk_guess_matmul_pipeline_align(" << m << ", " << n << ")" << std::endl; #endif if (m <= 32 || n <= 32) { - return ctx->pipeline_matmul_f32_aligned_s.align; + return ctx->device->pipeline_matmul_f32_aligned_s->align; } - if (ctx->device.lock()->subgroup_size == 64 || m <= 64 || n <= 64) { - return ctx->pipeline_matmul_f32_aligned_m.align; + if (ctx->device->subgroup_size == 64 || m <= 64 || n <= 64) { + return ctx->device->pipeline_matmul_f32_aligned_m->align; } - return ctx->pipeline_matmul_f32_aligned_l.align; + return ctx->device->pipeline_matmul_f32_aligned_l->align; } -static vk_pipeline* ggml_vk_guess_matmul_pipeline(ggml_backend_vk_context * ctx, bool bit16_x, bool bit16_y, int m, int n, bool aligned) { +static vk_pipeline ggml_vk_guess_matmul_pipeline(ggml_backend_vk_context * ctx, bool bit16_x, bool bit16_y, int m, int n, bool aligned) { #ifdef GGML_VULKAN_DEBUG std::cerr << "ggml_vk_guess_matmul_pipeline(" << bit16_x << ", " << bit16_y << ", " << m << ", " << n << ", " << aligned << ")"; #endif if (bit16_x && bit16_y) { - if (ctx->device.lock()->vendor_id == VK_VENDOR_ID_INTEL || m <= 32 || n <= 32) { + if (ctx->device->vendor_id == VK_VENDOR_ID_INTEL || m <= 32 || n <= 32) { #ifdef GGML_VULKAN_DEBUG std::cerr << " S" << std::endl; #endif - return aligned ? &ctx->pipeline_matmul_f16_aligned_s : &ctx->pipeline_matmul_f16_s; + return aligned ? ctx->device->pipeline_matmul_f16_aligned_s : ctx->device->pipeline_matmul_f16_s; } - if (ctx->device.lock()->subgroup_size == 64 || m <= 64 || n <= 64) { + if (ctx->device->subgroup_size == 64 || m <= 64 || n <= 64) { #ifdef GGML_VULKAN_DEBUG std::cerr << " M" << std::endl; #endif - return aligned ? &ctx->pipeline_matmul_f16_aligned_m : &ctx->pipeline_matmul_f16_m; + return aligned ? ctx->device->pipeline_matmul_f16_aligned_m : ctx->device->pipeline_matmul_f16_m; } #ifdef GGML_VULKAN_DEBUG std::cerr << " L" << std::endl; #endif - return aligned ? &ctx->pipeline_matmul_f16_aligned_l : &ctx->pipeline_matmul_f16_l; + return aligned ? ctx->device->pipeline_matmul_f16_aligned_l : ctx->device->pipeline_matmul_f16_l; } if (bit16_x && !bit16_y) { - if (ctx->device.lock()->vendor_id == VK_VENDOR_ID_INTEL || m <= 32 || n <= 32) { + if (ctx->device->vendor_id == VK_VENDOR_ID_INTEL || m <= 32 || n <= 32) { #ifdef GGML_VULKAN_DEBUG std::cerr << " S" << std::endl; #endif - return aligned ? &ctx->pipeline_matmul_f16_f32_aligned_s : &ctx->pipeline_matmul_f16_f32_s; + return aligned ? ctx->device->pipeline_matmul_f16_f32_aligned_s : ctx->device->pipeline_matmul_f16_f32_s; } - if (ctx->device.lock()->subgroup_size == 64 || m <= 64 || n <= 64) { + if (ctx->device->subgroup_size == 64 || m <= 64 || n <= 64) { #ifdef GGML_VULKAN_DEBUG std::cerr << " M" << std::endl; #endif - return aligned ? &ctx->pipeline_matmul_f16_f32_aligned_m : &ctx->pipeline_matmul_f16_f32_m; + return aligned ? ctx->device->pipeline_matmul_f16_f32_aligned_m : ctx->device->pipeline_matmul_f16_f32_m; } #ifdef GGML_VULKAN_DEBUG std::cerr << " L" << std::endl; #endif - return aligned ? &ctx->pipeline_matmul_f16_f32_aligned_l : &ctx->pipeline_matmul_f16_f32_l; + return aligned ? ctx->device->pipeline_matmul_f16_f32_aligned_l : ctx->device->pipeline_matmul_f16_f32_l; } if (!bit16_x && bit16_y) { GGML_ASSERT(false); } - if (ctx->device.lock()->vendor_id == VK_VENDOR_ID_INTEL || m <= 32 || n <= 32) { + if (ctx->device->vendor_id == VK_VENDOR_ID_INTEL || m <= 32 || n <= 32) { #ifdef GGML_VULKAN_DEBUG std::cerr << " S" << std::endl; #endif - return aligned ? &ctx->pipeline_matmul_f32_aligned_s : &ctx->pipeline_matmul_f32_s; + return aligned ? ctx->device->pipeline_matmul_f32_aligned_s : ctx->device->pipeline_matmul_f32_s; } - if (ctx->device.lock()->subgroup_size == 64 || m <= 64 || n <= 64) { + if (ctx->device->subgroup_size == 64 || m <= 64 || n <= 64) { #ifdef GGML_VULKAN_DEBUG std::cerr << " M" << std::endl; #endif - return aligned ? &ctx->pipeline_matmul_f32_aligned_m : &ctx->pipeline_matmul_f32_m; + return aligned ? ctx->device->pipeline_matmul_f32_aligned_m : ctx->device->pipeline_matmul_f32_m; } #ifdef GGML_VULKAN_DEBUG std::cerr << " L" << std::endl; #endif - return aligned ? &ctx->pipeline_matmul_f32_aligned_l : &ctx->pipeline_matmul_f32_l; + return aligned ? ctx->device->pipeline_matmul_f32_aligned_l : ctx->device->pipeline_matmul_f32_l; } static void ggml_vk_matmul(ggml_backend_vk_context * ctx, vk_context * subctx, vk_pipeline& pipeline, vk_subbuffer&& a, vk_subbuffer&& b, vk_subbuffer&& d, vk_subbuffer&& split_k_buffer, uint32_t m, uint32_t n, uint32_t k, uint32_t stride_a, uint32_t stride_b, uint32_t stride_d, uint32_t split_k, uint32_t batch, uint32_t ne02, uint32_t ne12, uint32_t broadcast2, uint32_t broadcast3, uint32_t batch_stride_a, uint32_t batch_stride_b, uint32_t batch_stride_d) { @@ -2115,10 +2170,10 @@ static void ggml_vk_matmul(ggml_backend_vk_context * ctx, vk_context * subctx, v const std::array pc1 = { m, n, k, stride_a, stride_b, stride_d, CEIL_DIV(k, split_k), ne02, ne12, broadcast2, broadcast3, batch_stride_a, batch_stride_b, batch_stride_d }; // Make sure enough workgroups get assigned for split k to work - ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { a, b, split_k_buffer }, pc1.size() * sizeof(uint32_t), pc1.data(), { (CEIL_DIV(m, pipeline.wg_denoms[0]) * pipeline.wg_denoms[0]) * split_k, n, batch }); + ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { a, b, split_k_buffer }, pc1.size() * sizeof(uint32_t), pc1.data(), { (CEIL_DIV(m, pipeline->wg_denoms[0]) * pipeline->wg_denoms[0]) * split_k, n, batch }); ggml_vk_sync_buffers(subctx); const std::array pc2 = { (uint32_t)(m * n * batch), split_k }; - ggml_vk_dispatch_pipeline(ctx, subctx, ctx->pipeline_matmul_split_k_reduce, { split_k_buffer, d }, pc2.size() * sizeof(uint32_t), pc2.data(), { m * n * batch, 1, 1 }); + ggml_vk_dispatch_pipeline(ctx, subctx, ctx->device->pipeline_matmul_split_k_reduce, { split_k_buffer, d }, pc2.size() * sizeof(uint32_t), pc2.data(), { m * n * batch, 1, 1 }); } static bool ggml_vk_dim01_contiguous(const ggml_tensor * tensor) { @@ -2128,22 +2183,22 @@ static bool ggml_vk_dim01_contiguous(const ggml_tensor * tensor) { tensor->nb[3] == tensor->nb[2]*tensor->ne[2]; } -static vk_pipeline * ggml_vk_get_cpy_pipeline(ggml_backend_vk_context * ctx, ggml_type from, ggml_type to) { +static vk_pipeline ggml_vk_get_cpy_pipeline(ggml_backend_vk_context * ctx, ggml_type from, ggml_type to) { if (from == GGML_TYPE_F32 && to == GGML_TYPE_F32) { - return &ctx->pipeline_cpy_f32_f32; + return ctx->device->pipeline_cpy_f32_f32; } if (from == GGML_TYPE_F32 && to == GGML_TYPE_F16) { - return &ctx->pipeline_cpy_f32_f16; + return ctx->device->pipeline_cpy_f32_f16; } if (from == GGML_TYPE_F16 && to == GGML_TYPE_F16) { - return &ctx->pipeline_cpy_f16_f16; + return ctx->device->pipeline_cpy_f16_f16; } std::cerr << "Missing CPY op for types: " << ggml_type_name(from) << " " << ggml_type_name(to) << std::endl; GGML_ASSERT(false); } -static void ggml_vk_cpy_to_contiguous(ggml_backend_vk_context * ctx, vk_context * subctx, vk_pipeline * pipeline, const ggml_tensor * tensor, vk_subbuffer&& in, vk_subbuffer&& out, ggml_type buffer_type, bool aligned=true) { +static void ggml_vk_cpy_to_contiguous(ggml_backend_vk_context * ctx, vk_context * subctx, vk_pipeline pipeline, const ggml_tensor * tensor, vk_subbuffer&& in, vk_subbuffer&& out, ggml_type buffer_type, bool aligned=true) { #ifdef GGML_VULKAN_DEBUG std::cerr << "ggml_vk_cpy_to_contiguous((" << tensor << ", type=" << tensor->type << ", backend=" << tensor->backend << ", ne0=" << tensor->ne[0] << ", ne1=" << tensor->ne[1] << ", ne2=" << tensor->ne[2] << ", ne3=" << tensor->ne[3] << ", nb0=" << tensor->nb[0] << ", nb1=" << tensor->nb[1] << ", nb2=" << tensor->nb[2] << ", nb3=" << tensor->nb[3] << "), "; std::cerr << "buffer in size=" << in.buffer->size << ", buffer out size=" << out.buffer->size << ")" << std::endl; @@ -2153,7 +2208,7 @@ static void ggml_vk_cpy_to_contiguous(ggml_backend_vk_context * ctx, vk_context const uint32_t ne = tensor->ne[0] * tensor->ne[1] * tensor->ne[2]; - const uint32_t nb2 = aligned ? ggml_vk_align_size(dst_type_size * tensor->ne[0] * tensor->ne[1], ctx->device.lock()->properties.limits.minStorageBufferOffsetAlignment) / dst_type_size : tensor->ne[0] * tensor->ne[1]; + const uint32_t nb2 = aligned ? ggml_vk_align_size(dst_type_size * tensor->ne[0] * tensor->ne[1], ctx->device->properties.limits.minStorageBufferOffsetAlignment) / dst_type_size : tensor->ne[0] * tensor->ne[1]; const vk_op_cpy_push_constants pc = { (uint32_t)ne, @@ -2162,7 +2217,7 @@ static void ggml_vk_cpy_to_contiguous(ggml_backend_vk_context * ctx, vk_context 0, }; ggml_vk_sync_buffers(subctx); - ggml_vk_dispatch_pipeline(ctx, subctx, *pipeline, { in, out }, sizeof(vk_op_cpy_push_constants), &pc, { ne, 1, 1 }); + ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { in, out }, sizeof(vk_op_cpy_push_constants), &pc, { ne, 1, 1 }); } static void ggml_vk_mul_mat_q_f16(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { @@ -2202,7 +2257,7 @@ static void ggml_vk_mul_mat_q_f16(ggml_backend_vk_context * ctx, vk_context * su bool src0_uma = false; bool src1_uma = false; - if (ctx->device.lock()->uma) { + if (ctx->device->uma) { ggml_vk_host_get(ctx, src0->data, d_Qx, qx_buf_offset); ggml_vk_host_get(ctx, src1->data, d_Qy, qy_buf_offset); src0_uma = d_Qx != nullptr; @@ -2232,7 +2287,7 @@ static void ggml_vk_mul_mat_q_f16(ggml_backend_vk_context * ctx, vk_context * su const uint32_t split_k = ggml_vk_guess_split_k(ne01, ne11, ne10); - vk_pipeline * pipeline = ggml_vk_guess_matmul_pipeline(ctx, true, !f16_f32_kernel, ne01, ne11, aligned); + vk_pipeline pipeline = ggml_vk_guess_matmul_pipeline(ctx, true, !f16_f32_kernel, ne01, ne11, aligned); const uint64_t qx_sz = ggml_type_size(src0->type) * x_ne / ggml_blck_size(src0->type); const uint64_t qy_sz = ggml_type_size(src1->type) * y_ne / ggml_blck_size(src1->type); @@ -2279,8 +2334,8 @@ static void ggml_vk_mul_mat_q_f16(ggml_backend_vk_context * ctx, vk_context * su GGML_ASSERT(qy_sz == y_sz); } - vk_pipeline * to_fp16_vk_0 = nullptr; - vk_pipeline * to_fp16_vk_1 = nullptr; + vk_pipeline to_fp16_vk_0 = nullptr; + vk_pipeline to_fp16_vk_1 = nullptr; if (x_non_contig) { to_fp16_vk_0 = ggml_vk_get_cpy_pipeline(ctx, src0->type, GGML_TYPE_F16); @@ -2296,15 +2351,15 @@ static void ggml_vk_mul_mat_q_f16(ggml_backend_vk_context * ctx, vk_context * su GGML_ASSERT(!qy_needs_dequant || to_fp16_vk_1 != nullptr); // NOLINT // Allocate descriptor sets - ggml_pipeline_allocate_descriptor_sets(ctx, *pipeline, 1); + ggml_pipeline_allocate_descriptor_sets(ctx, pipeline, 1); if (qx_needs_dequant) { - ggml_pipeline_allocate_descriptor_sets(ctx, *to_fp16_vk_0, 1); + ggml_pipeline_allocate_descriptor_sets(ctx, to_fp16_vk_0, 1); } if (qy_needs_dequant) { - ggml_pipeline_allocate_descriptor_sets(ctx, *to_fp16_vk_1, 1); + ggml_pipeline_allocate_descriptor_sets(ctx, to_fp16_vk_1, 1); } if (split_k > 1) { - ggml_pipeline_allocate_descriptor_sets(ctx, ctx->pipeline_matmul_split_k_reduce, 1); + ggml_pipeline_allocate_descriptor_sets(ctx, ctx->device->pipeline_matmul_split_k_reduce, 1); } if (x_non_contig) { @@ -2317,9 +2372,9 @@ static void ggml_vk_mul_mat_q_f16(ggml_backend_vk_context * ctx, vk_context * su } if (qx_needs_dequant) { - const std::vector pc = { (uint32_t)ne01, (uint32_t)ne10, (uint32_t)ne10, (uint32_t)ne10, (uint32_t)(ggml_nelements(src0) / 32) }; // TODO: replace with subgroup size + const std::vector pc = { (uint32_t)ne01, (uint32_t)ne10, (uint32_t)ne10, (uint32_t)ne10, (uint32_t)(ggml_nelements(src0)) }; ggml_vk_sync_buffers(subctx); - ggml_vk_dispatch_pipeline(ctx, subctx, *to_fp16_vk_0, { { d_Qx, qx_buf_offset, qx_sz * ne02 * ne03 }, { d_X, 0, x_sz * ne02 * ne03 } }, pc.size() * sizeof(uint32_t), pc.data(), { (uint32_t)(x_ne * ne02 * ne03), 1, 1}); + ggml_vk_dispatch_pipeline(ctx, subctx, to_fp16_vk_0, { { d_Qx, qx_buf_offset, qx_sz * ne02 * ne03 }, { d_X, 0, x_sz * ne02 * ne03 } }, pc.size() * sizeof(uint32_t), pc.data(), { (uint32_t)(x_ne * ne02 * ne03), 1, 1}); } } if (y_non_contig) { @@ -2340,7 +2395,7 @@ static void ggml_vk_mul_mat_q_f16(ggml_backend_vk_context * ctx, vk_context * su } // compute - ggml_vk_matmul(ctx, subctx, *pipeline, { d_X, x_buf_offset, x_sz * ne02 * ne03 }, { d_Y, y_buf_offset, y_sz * ne12 * ne13 }, { d_D, d_buf_offset, d_sz * ne12 * ne13 }, { ctx->prealloc_split_k, 0, d_sz * ne12 * ne13 * split_k }, ne01, ne11, ne10, ne10, ne10, ne01, split_k, ne12*ne13, ne02, ne12, r2, r3, stride_batch_x, stride_batch_y, ne20*ne21); // NOLINT + ggml_vk_matmul(ctx, subctx, pipeline, { d_X, x_buf_offset, x_sz * ne02 * ne03 }, { d_Y, y_buf_offset, y_sz * ne12 * ne13 }, { d_D, d_buf_offset, d_sz * ne12 * ne13 }, { ctx->prealloc_split_k, 0, d_sz * ne12 * ne13 * split_k }, ne01, ne11, ne10, ne10, ne10, ne01, split_k, ne12*ne13, ne02, ne12, r2, r3, stride_batch_x, stride_batch_y, ne20*ne21); // NOLINT if (dst->backend == GGML_BACKEND_CPU) { // copy dst to host @@ -2388,7 +2443,7 @@ static void ggml_vk_mul_mat_vec_q_f16(ggml_backend_vk_context * ctx, vk_context bool src0_uma = false; bool src1_uma = false; - if (ctx->device.lock()->uma) { + if (ctx->device->uma) { ggml_vk_host_get(ctx, src0->data, d_Qx, qx_buf_offset); ggml_vk_host_get(ctx, src1->data, d_Qy, qy_buf_offset); src0_uma = d_Qx != nullptr; @@ -2410,9 +2465,9 @@ static void ggml_vk_mul_mat_vec_q_f16(ggml_backend_vk_context * ctx, vk_context const uint64_t y_ne = ne11 * ne10; const uint64_t d_ne = ne11 * ne01; - const uint64_t qx_sz = ggml_vk_align_size(ggml_type_size(src0->type) * x_ne / ggml_blck_size(src0->type), ctx->device.lock()->properties.limits.minStorageBufferOffsetAlignment); + const uint64_t qx_sz = ggml_vk_align_size(ggml_type_size(src0->type) * x_ne / ggml_blck_size(src0->type), ctx->device->properties.limits.minStorageBufferOffsetAlignment); const uint64_t qy_sz = ggml_type_size(src1->type) * y_ne / ggml_blck_size(src1->type); - const uint64_t x_sz = x_non_contig ? ggml_vk_align_size(ggml_type_size(src0->type) * x_ne, ctx->device.lock()->properties.limits.minStorageBufferOffsetAlignment) : qx_sz; + const uint64_t x_sz = x_non_contig ? ggml_vk_align_size(ggml_type_size(src0->type) * x_ne, ctx->device->properties.limits.minStorageBufferOffsetAlignment) : qx_sz; const uint64_t y_sz = f16_f32_kernel ? sizeof(float) * y_ne : sizeof(ggml_fp16_t) * y_ne; const uint64_t d_sz = sizeof(float) * d_ne; @@ -2452,8 +2507,8 @@ static void ggml_vk_mul_mat_vec_q_f16(ggml_backend_vk_context * ctx, vk_context GGML_ASSERT(qy_sz == y_sz); } - vk_pipeline * to_fp16_vk_0 = nullptr; - vk_pipeline* to_fp16_vk_1 = nullptr; + vk_pipeline to_fp16_vk_0 = nullptr; + vk_pipeline to_fp16_vk_1 = nullptr; if (x_non_contig) { to_fp16_vk_0 = ggml_vk_get_cpy_pipeline(ctx, src0->type, src0->type); } @@ -2462,22 +2517,22 @@ static void ggml_vk_mul_mat_vec_q_f16(ggml_backend_vk_context * ctx, vk_context } else { to_fp16_vk_1 = ggml_vk_get_to_fp16(ctx, src1->type); } - vk_pipeline* dmmv = ggml_vk_get_dequantize_mul_mat_vec(ctx, src0->type); + vk_pipeline dmmv = ggml_vk_get_dequantize_mul_mat_vec(ctx, src0->type); GGML_ASSERT(!qx_needs_dequant || to_fp16_vk_0 != nullptr); // NOLINT GGML_ASSERT(!qy_needs_dequant || to_fp16_vk_1 != nullptr); // NOLINT GGML_ASSERT(dmmv != nullptr); // Allocate descriptor sets if (qx_needs_dequant) { - ggml_pipeline_allocate_descriptor_sets(ctx, *to_fp16_vk_0, 1); + ggml_pipeline_allocate_descriptor_sets(ctx, to_fp16_vk_0, 1); } if (qy_needs_dequant) { - ggml_pipeline_allocate_descriptor_sets(ctx, *to_fp16_vk_1, y_non_contig ? 1 : ne12 * ne13); + ggml_pipeline_allocate_descriptor_sets(ctx, to_fp16_vk_1, y_non_contig ? 1 : ne12 * ne13); } - ggml_pipeline_allocate_descriptor_sets(ctx, *dmmv, ne12 * ne13); + ggml_pipeline_allocate_descriptor_sets(ctx, dmmv, ne12 * ne13); if (x_non_contig) { - GGML_ASSERT(x_sz == ggml_vk_align_size(ggml_type_size(src0->type) * x_ne, ctx->device.lock()->properties.limits.minStorageBufferOffsetAlignment)); + GGML_ASSERT(x_sz == ggml_vk_align_size(ggml_type_size(src0->type) * x_ne, ctx->device->properties.limits.minStorageBufferOffsetAlignment)); ggml_vk_cpy_to_contiguous(ctx, subctx, to_fp16_vk_0, src0, { d_Qx, qx_buf_offset, VK_WHOLE_SIZE }, { d_X, 0, VK_WHOLE_SIZE }, src0->type); } else if (load_x) { // copy data to device @@ -2502,22 +2557,22 @@ static void ggml_vk_mul_mat_vec_q_f16(ggml_backend_vk_context * ctx, vk_context const uint64_t y_offset = y_buf_offset + y_sz * it_idx1; const uint64_t d_offset = d_buf_offset + d_sz * it_idx1; - const uint64_t y_buffer_offset = (y_offset / ctx->device.lock()->properties.limits.minStorageBufferOffsetAlignment) * ctx->device.lock()->properties.limits.minStorageBufferOffsetAlignment; + const uint64_t y_buffer_offset = (y_offset / ctx->device->properties.limits.minStorageBufferOffsetAlignment) * ctx->device->properties.limits.minStorageBufferOffsetAlignment; const uint64_t y_shader_offset = y_offset - y_buffer_offset; - const uint64_t d_buffer_offset = (d_offset / ctx->device.lock()->properties.limits.minStorageBufferOffsetAlignment) * ctx->device.lock()->properties.limits.minStorageBufferOffsetAlignment; + const uint64_t d_buffer_offset = (d_offset / ctx->device->properties.limits.minStorageBufferOffsetAlignment) * ctx->device->properties.limits.minStorageBufferOffsetAlignment; const uint64_t d_shader_offset = d_offset - d_buffer_offset; if (!y_non_contig && qy_needs_dequant) { const std::vector pc = { (uint32_t)ne11, (uint32_t)ne10, (uint32_t)ne10, (uint32_t)ne10, (uint32_t)(y_ne / 32) }; ggml_vk_sync_buffers(subctx); - ggml_vk_dispatch_pipeline(ctx, subctx, *to_fp16_vk_1, { { d_Qy, qy_offset, qy_sz }, { d_Y, y_offset, y_sz } }, pc.size() * sizeof(uint32_t), pc.data(), { (uint32_t)y_ne, 1, 1}); + ggml_vk_dispatch_pipeline(ctx, subctx, to_fp16_vk_1, { { d_Qy, qy_offset, qy_sz }, { d_Y, y_offset, y_sz } }, pc.size() * sizeof(uint32_t), pc.data(), { (uint32_t)y_ne, 1, 1}); } // compute const std::array pc = { (uint32_t)ne00, (uint32_t)(y_shader_offset / ggml_type_size(src1->type)), (uint32_t)(d_shader_offset / ggml_type_size(dst->type))}; ggml_vk_sync_buffers(subctx); - ggml_vk_dispatch_pipeline(ctx, subctx, *dmmv, { { d_X, x_offset, x_sz }, { d_Y, y_buffer_offset, y_sz + y_shader_offset }, { d_D, d_buffer_offset, d_sz + d_shader_offset } }, 3 * sizeof(int), &pc, { (uint32_t)ne01, 1, 1}); + ggml_vk_dispatch_pipeline(ctx, subctx, dmmv, { { d_X, x_offset, x_sz }, { d_Y, y_buffer_offset, y_sz + y_shader_offset }, { d_D, d_buffer_offset, d_sz + d_shader_offset } }, 3 * sizeof(int), &pc, { (uint32_t)ne01, 1, 1}); if (dst->backend == GGML_BACKEND_CPU) { // copy dst to host @@ -2563,7 +2618,7 @@ static void ggml_vk_mul_mat_vec_p021_f16_f32(ggml_backend_vk_context * ctx, vk_c bool src1_uma = false; - if (ctx->device.lock()->uma) { + if (ctx->device->uma) { ggml_vk_host_get(ctx, src1->data, d_Qy, qy_buf_offset); src1_uma = d_Qy != nullptr; } @@ -2574,7 +2629,7 @@ static void ggml_vk_mul_mat_vec_p021_f16_f32(ggml_backend_vk_context * ctx, vk_c const uint64_t y_ne = ne10 * ne11 * ne12; const uint64_t d_ne = ne01 * ne11 * ne12; - const uint64_t qx_sz = ggml_vk_align_size(ggml_type_size(src0->type) * x_ne / ggml_blck_size(src0->type), ctx->device.lock()->properties.limits.minStorageBufferOffsetAlignment); + const uint64_t qx_sz = ggml_vk_align_size(ggml_type_size(src0->type) * x_ne / ggml_blck_size(src0->type), ctx->device->properties.limits.minStorageBufferOffsetAlignment); const uint64_t qy_sz = ggml_type_size(src1->type) * y_ne / ggml_blck_size(src1->type); const uint64_t d_sz = sizeof(float) * d_ne; @@ -2593,12 +2648,12 @@ static void ggml_vk_mul_mat_vec_p021_f16_f32(ggml_backend_vk_context * ctx, vk_c } // Allocate descriptor sets - ggml_pipeline_allocate_descriptor_sets(ctx, ctx->pipeline_mul_mat_vec_p021_f16_f32, 1); + ggml_pipeline_allocate_descriptor_sets(ctx, ctx->device->pipeline_mul_mat_vec_p021_f16_f32, 1); - const uint64_t qy_buffer_offset = (qy_buf_offset / ctx->device.lock()->properties.limits.minStorageBufferOffsetAlignment) * ctx->device.lock()->properties.limits.minStorageBufferOffsetAlignment; + const uint64_t qy_buffer_offset = (qy_buf_offset / ctx->device->properties.limits.minStorageBufferOffsetAlignment) * ctx->device->properties.limits.minStorageBufferOffsetAlignment; const uint64_t qy_shader_offset = qy_buf_offset - qy_buffer_offset; - const uint64_t d_buffer_offset = (d_buf_offset / ctx->device.lock()->properties.limits.minStorageBufferOffsetAlignment) * ctx->device.lock()->properties.limits.minStorageBufferOffsetAlignment; + const uint64_t d_buffer_offset = (d_buf_offset / ctx->device->properties.limits.minStorageBufferOffsetAlignment) * ctx->device->properties.limits.minStorageBufferOffsetAlignment; const uint64_t d_shader_offset = d_buf_offset - d_buffer_offset; if (load_y) { @@ -2608,7 +2663,7 @@ static void ggml_vk_mul_mat_vec_p021_f16_f32(ggml_backend_vk_context * ctx, vk_c // compute const std::array pc = { (uint32_t)ne00, (uint32_t)ne01, (uint32_t)ne02, (uint32_t)ne12, (uint32_t)(qy_shader_offset / ggml_type_size(src1->type)), (uint32_t)(d_shader_offset / ggml_type_size(dst->type)) }; ggml_vk_sync_buffers(subctx); - ggml_vk_dispatch_pipeline(ctx, subctx, ctx->pipeline_mul_mat_vec_p021_f16_f32, { { d_Qx, qx_buf_offset, qx_sz }, { d_Qy, qy_buffer_offset, qy_sz + qy_shader_offset }, { d_D, d_buffer_offset, d_sz + d_shader_offset } }, 6 * sizeof(uint32_t), &pc, { 1, (uint32_t)ne01, (uint32_t)ne12 }); + ggml_vk_dispatch_pipeline(ctx, subctx, ctx->device->pipeline_mul_mat_vec_p021_f16_f32, { { d_Qx, qx_buf_offset, qx_sz }, { d_Qy, qy_buffer_offset, qy_sz + qy_shader_offset }, { d_D, d_buffer_offset, d_sz + d_shader_offset } }, 6 * sizeof(uint32_t), &pc, { 1, (uint32_t)ne01, (uint32_t)ne12 }); if (dst->backend == GGML_BACKEND_CPU) { // copy dst to host @@ -2655,7 +2710,7 @@ static void ggml_vk_mul_mat_vec_nc_f16_f32(ggml_backend_vk_context * ctx, vk_con bool src1_uma = false; - if (ctx->device.lock()->uma) { + if (ctx->device->uma) { ggml_vk_host_get(ctx, src1->data, d_Qy, qy_buf_offset); src1_uma = d_Qy != nullptr; } @@ -2686,12 +2741,12 @@ static void ggml_vk_mul_mat_vec_nc_f16_f32(ggml_backend_vk_context * ctx, vk_con } // Allocate descriptor sets - ggml_pipeline_allocate_descriptor_sets(ctx, ctx->pipeline_mul_mat_vec_nc_f16_f32, 1); + ggml_pipeline_allocate_descriptor_sets(ctx, ctx->device->pipeline_mul_mat_vec_nc_f16_f32, 1); - const uint64_t qy_buffer_offset = (qy_buf_offset / ctx->device.lock()->properties.limits.minStorageBufferOffsetAlignment) * ctx->device.lock()->properties.limits.minStorageBufferOffsetAlignment; + const uint64_t qy_buffer_offset = (qy_buf_offset / ctx->device->properties.limits.minStorageBufferOffsetAlignment) * ctx->device->properties.limits.minStorageBufferOffsetAlignment; const uint64_t qy_shader_offset = qy_buf_offset - qy_buffer_offset; - const uint64_t d_buffer_offset = (d_buf_offset / ctx->device.lock()->properties.limits.minStorageBufferOffsetAlignment) * ctx->device.lock()->properties.limits.minStorageBufferOffsetAlignment; + const uint64_t d_buffer_offset = (d_buf_offset / ctx->device->properties.limits.minStorageBufferOffsetAlignment) * ctx->device->properties.limits.minStorageBufferOffsetAlignment; const uint64_t d_shader_offset = d_buf_offset - d_buffer_offset; if (load_y) { @@ -2701,7 +2756,7 @@ static void ggml_vk_mul_mat_vec_nc_f16_f32(ggml_backend_vk_context * ctx, vk_con // compute const std::array pc = { (uint32_t)ne00, (uint32_t)ne01, row_stride_x, channel_stride_x, (uint32_t)(ne12 / ne02), (uint32_t)(qy_shader_offset / ggml_type_size(src1->type)), (uint32_t)(d_shader_offset / ggml_type_size(dst->type)) }; ggml_vk_sync_buffers(subctx); - ggml_vk_dispatch_pipeline(ctx, subctx, ctx->pipeline_mul_mat_vec_nc_f16_f32, { { d_Qx, qx_buf_offset, qx_sz }, { d_Qy, qy_buffer_offset, qy_sz + qy_shader_offset }, { d_D, d_buffer_offset, d_sz + d_shader_offset } }, 7 * sizeof(uint32_t), &pc, { 1, (uint32_t)ne01, (uint32_t)ne12 }); + ggml_vk_dispatch_pipeline(ctx, subctx, ctx->device->pipeline_mul_mat_vec_nc_f16_f32, { { d_Qx, qx_buf_offset, qx_sz }, { d_Qy, qy_buffer_offset, qy_sz + qy_shader_offset }, { d_D, d_buffer_offset, d_sz + d_shader_offset } }, 7 * sizeof(uint32_t), &pc, { 1, (uint32_t)ne01, (uint32_t)ne12 }); if (dst->backend == GGML_BACKEND_CPU) { // copy dst to host @@ -2810,40 +2865,40 @@ static void ggml_vk_op_repeat(ggml_backend_vk_context * ctx, vk_context * subctx } -static vk_pipeline* ggml_vk_op_get_pipeline(ggml_backend_vk_context * ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, ggml_op op) { +static vk_pipeline ggml_vk_op_get_pipeline(ggml_backend_vk_context * ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, ggml_op op) { switch (op) { case GGML_OP_ADD: if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) { - return &ctx->pipeline_add_f32; + return ctx->device->pipeline_add_f32; } return nullptr; case GGML_OP_GET_ROWS: GGML_ASSERT(src1->type == GGML_TYPE_I32); if (dst->type == GGML_TYPE_F16) { - return &ctx->pipeline_get_rows[src0->type]; + return ctx->device->pipeline_get_rows[src0->type]; } if (dst->type == GGML_TYPE_F32) { - return &ctx->pipeline_get_rows_f32[src0->type]; + return ctx->device->pipeline_get_rows_f32[src0->type]; } return nullptr; case GGML_OP_MUL: if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) { - return &ctx->pipeline_mul_f32; + return ctx->device->pipeline_mul_f32; } return nullptr; case GGML_OP_SCALE: if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) { - return &ctx->pipeline_scale_f32; + return ctx->device->pipeline_scale_f32; } return nullptr; case GGML_OP_SQR: if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) { - return &ctx->pipeline_sqr_f32; + return ctx->device->pipeline_sqr_f32; } return nullptr; case GGML_OP_CLAMP: if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) { - return &ctx->pipeline_clamp_f32; + return ctx->device->pipeline_clamp_f32; } return nullptr; case GGML_OP_CPY: @@ -2852,29 +2907,29 @@ static vk_pipeline* ggml_vk_op_get_pipeline(ggml_backend_vk_context * ctx, const return ggml_vk_get_cpy_pipeline(ctx, src0->type, dst->type); case GGML_OP_NORM: if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) { - return &ctx->pipeline_norm_f32; + return ctx->device->pipeline_norm_f32; } return nullptr; case GGML_OP_RMS_NORM: if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) { - return &ctx->pipeline_rms_norm_f32; + return ctx->device->pipeline_rms_norm_f32; } return nullptr; case GGML_OP_UNARY: switch (ggml_get_unary_op(dst)) { case GGML_UNARY_OP_SILU: if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) { - return &ctx->pipeline_silu_f32; + return ctx->device->pipeline_silu_f32; } break; case GGML_UNARY_OP_GELU: if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) { - return &ctx->pipeline_gelu_f32; + return ctx->device->pipeline_gelu_f32; } break; case GGML_UNARY_OP_RELU: if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) { - return &ctx->pipeline_relu_f32; + return ctx->device->pipeline_relu_f32; } break; default: @@ -2883,12 +2938,12 @@ static vk_pipeline* ggml_vk_op_get_pipeline(ggml_backend_vk_context * ctx, const return nullptr; case GGML_OP_DIAG_MASK_INF: if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) { - return &ctx->pipeline_diag_mask_inf_f32; + return ctx->device->pipeline_diag_mask_inf_f32; } return nullptr; case GGML_OP_SOFT_MAX: if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) { - return &ctx->pipeline_soft_max_f32; + return ctx->device->pipeline_soft_max_f32; } return nullptr; case GGML_OP_ROPE: @@ -2903,17 +2958,17 @@ static vk_pipeline* ggml_vk_op_get_pipeline(ggml_backend_vk_context * ctx, const if (is_neox) { if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) { - return &ctx->pipeline_rope_neox_f32; + return ctx->device->pipeline_rope_neox_f32; } if (src0->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F16) { - return &ctx->pipeline_rope_neox_f16; + return ctx->device->pipeline_rope_neox_f16; } } else { if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) { - return &ctx->pipeline_rope_f32; + return ctx->device->pipeline_rope_f32; } if (src0->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F16) { - return &ctx->pipeline_rope_f16; + return ctx->device->pipeline_rope_f16; } } return nullptr; @@ -2960,7 +3015,7 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context * subctx, c const uint64_t nb2 = dst->nb[2]; const uint64_t nb3 = dst->nb[3]; - vk_pipeline * pipeline = ggml_vk_op_get_pipeline(ctx, src0, src1, dst, op); + vk_pipeline pipeline = ggml_vk_op_get_pipeline(ctx, src0, src1, dst, op); ggml_vk_func_t op_func; if (pipeline == nullptr) { @@ -2990,7 +3045,7 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context * subctx, c bool src0_uma = false; bool src1_uma = false; - if (ctx->device.lock()->uma) { + if (ctx->device->uma) { ggml_vk_host_get(ctx, src0->data, d_X, x_buf_offset); src0_uma = d_X != nullptr; if (use_src1) { @@ -3002,8 +3057,8 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context * subctx, c const bool transfer_src0 = src0->backend != GGML_BACKEND_GPU && !src0_uma; const bool transfer_src1 = use_src1 && src1->backend != GGML_BACKEND_GPU && !src1_uma; - uint64_t x_sz = ggml_vk_align_size(ggml_type_size(src0->type) * ne0, ctx->device.lock()->properties.limits.minStorageBufferOffsetAlignment); - uint64_t y_sz = use_src1 ? ggml_vk_align_size(ggml_type_size(src1->type) * ne1, ctx->device.lock()->properties.limits.minStorageBufferOffsetAlignment) : 0; + uint64_t x_sz = ggml_vk_align_size(ggml_type_size(src0->type) * ne0, ctx->device->properties.limits.minStorageBufferOffsetAlignment); + uint64_t y_sz = use_src1 ? ggml_vk_align_size(ggml_type_size(src1->type) * ne1, ctx->device->properties.limits.minStorageBufferOffsetAlignment) : 0; uint64_t d_sz = ggml_type_size(dst->type) * ne0; vk_buffer d_D = extra->buffer_gpu.lock(); @@ -3014,7 +3069,7 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context * subctx, c } GGML_ASSERT(d_D != nullptr); - uint64_t d_buf_offset = (extra->offset / ctx->device.lock()->properties.limits.minStorageBufferOffsetAlignment) * ctx->device.lock()->properties.limits.minStorageBufferOffsetAlignment; + uint64_t d_buf_offset = (extra->offset / ctx->device->properties.limits.minStorageBufferOffsetAlignment) * ctx->device->properties.limits.minStorageBufferOffsetAlignment; GGML_ASSERT(d_buf_offset == extra->offset || op == GGML_OP_CPY); // NOLINT if (transfer_src0) { d_X = ctx->prealloc_qx; @@ -3058,7 +3113,7 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context * subctx, c // Single call if dimension 2 is contiguous if (op == GGML_OP_CPY || (ggml_is_contiguous(src0) && (src1 == nullptr || ggml_is_contiguous(src1)))) { - ggml_pipeline_allocate_descriptor_sets(ctx, *pipeline, 1); + ggml_pipeline_allocate_descriptor_sets(ctx, pipeline, 1); switch (dst->op) { case GGML_OP_NORM: @@ -3090,13 +3145,13 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context * subctx, c if (!use_src1 && op == GGML_OP_SOFT_MAX) { // Empty src1 is possible on soft_max, but the shader needs a buffer ggml_vk_sync_buffers(subctx); - ggml_vk_dispatch_pipeline(ctx, subctx, *pipeline, { { d_X, x_buf_offset, x_sz }, { ctx->prealloc_y, 0, ctx->prealloc_y->size }, { d_D, d_buf_offset, d_sz } }, sizeof(PC), &pc, elements); + ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { { d_X, x_buf_offset, x_sz }, { ctx->prealloc_y, 0, ctx->prealloc_y->size }, { d_D, d_buf_offset, d_sz } }, sizeof(PC), &pc, elements); } else if (use_src1) { ggml_vk_sync_buffers(subctx); - ggml_vk_dispatch_pipeline(ctx, subctx, *pipeline, { { d_X, x_buf_offset, x_sz }, { d_Y, y_buf_offset, y_sz }, { d_D, d_buf_offset, d_sz } }, sizeof(PC), &pc, elements); + ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { { d_X, x_buf_offset, x_sz }, { d_Y, y_buf_offset, y_sz }, { d_D, d_buf_offset, d_sz } }, sizeof(PC), &pc, elements); } else { ggml_vk_sync_buffers(subctx); - ggml_vk_dispatch_pipeline(ctx, subctx, *pipeline, { { d_X, x_buf_offset, x_sz }, { d_D, d_buf_offset, d_sz } }, sizeof(PC), &pc, elements); + ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { { d_X, x_buf_offset, x_sz }, { d_D, d_buf_offset, d_sz } }, sizeof(PC), &pc, elements); } if (dst->backend == GGML_BACKEND_CPU && op == GGML_OP_CPY) { ggml_vk_d2h_tensor_2d(ctx, subctx, d_D, 0, dst); @@ -3106,7 +3161,7 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context * subctx, c ggml_vk_buffer_read_async(ctx, subctx, d_D, 0, d, d_sz); } } else { - ggml_pipeline_allocate_descriptor_sets(ctx, *pipeline, ne02 * ne03); + ggml_pipeline_allocate_descriptor_sets(ctx, pipeline, ne02 * ne03); switch (dst->op) { case GGML_OP_NORM: @@ -3134,13 +3189,13 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context * subctx, c if (!use_src1 && op == GGML_OP_SOFT_MAX) { // Empty src1 is possible on soft_max, but the shader needs a buffer ggml_vk_sync_buffers(subctx); - ggml_vk_dispatch_pipeline(ctx, subctx, *pipeline, { { d_X, x_buf_offset, x_sz }, { ctx->prealloc_y, 0, ctx->prealloc_y->size }, { d_D, d_buf_offset, d_sz } }, sizeof(PC), &pc, elements); + ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { { d_X, x_buf_offset, x_sz }, { ctx->prealloc_y, 0, ctx->prealloc_y->size }, { d_D, d_buf_offset, d_sz } }, sizeof(PC), &pc, elements); } else if (use_src1) { ggml_vk_sync_buffers(subctx); - ggml_vk_dispatch_pipeline(ctx, subctx, *pipeline, { { d_X, x_buf_offset + x_offset, x_sz }, { d_Y, y_buf_offset + y_offset, y_sz }, { d_D, d_buf_offset + d_offset, d_sz } }, sizeof(PC), &pc, elements); + ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { { d_X, x_buf_offset + x_offset, x_sz }, { d_Y, y_buf_offset + y_offset, y_sz }, { d_D, d_buf_offset + d_offset, d_sz } }, sizeof(PC), &pc, elements); } else { ggml_vk_sync_buffers(subctx); - ggml_vk_dispatch_pipeline(ctx, subctx, *pipeline, { { d_X, x_buf_offset + x_offset, x_sz }, { d_D, d_buf_offset + d_offset, d_sz } }, sizeof(PC), &pc, elements); + ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { { d_X, x_buf_offset + x_offset, x_sz }, { d_D, d_buf_offset + d_offset, d_sz } }, sizeof(PC), &pc, elements); } if (dst->backend == GGML_BACKEND_CPU) { // copy dst to host @@ -3185,7 +3240,7 @@ static void ggml_vk_cpy(ggml_backend_vk_context * ctx, vk_context * subctx, cons ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) dst->extra; const int src0_type_size = ggml_type_size(src0->type); const int dst_type_size = ggml_type_size(dst->type); - const uint32_t d_offset = (extra->offset % ctx->device.lock()->properties.limits.minStorageBufferOffsetAlignment) / dst_type_size; + const uint32_t d_offset = (extra->offset % ctx->device->properties.limits.minStorageBufferOffsetAlignment) / dst_type_size; ggml_vk_op_f32(ctx, subctx, src0, nullptr, dst, GGML_OP_CPY, { (uint32_t)ggml_nelements(src0), (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], (uint32_t)src0->nb[0] / src0_type_size, (uint32_t)src0->nb[1] / src0_type_size, (uint32_t)src0->nb[2] / src0_type_size, @@ -3297,43 +3352,43 @@ static void ggml_vk_test_matmul(ggml_backend_vk_context * ctx, size_t m, size_t const size_t y_ne = k * n * batch; const size_t d_ne = m * n * batch; - vk_pipeline * p; + vk_pipeline p; std::string shname; if (shader_size == 0) { if (std::is_same() && std::is_same()) { - p = &ctx->pipeline_matmul_f32_aligned_s; + p = ctx->device->pipeline_matmul_f32_aligned_s; shname = "F32_ALIGNED_S"; } else if (std::is_same() && std::is_same()) { - p = &ctx->pipeline_matmul_f16_f32_aligned_s; + p = ctx->device->pipeline_matmul_f16_f32_aligned_s; shname = "F16_F32_ALIGNED_S"; } else if (std::is_same() && std::is_same()) { - p = &ctx->pipeline_matmul_f16_aligned_s; + p = ctx->device->pipeline_matmul_f16_aligned_s; shname = "F16_ALIGNED_S"; } else { GGML_ASSERT(false); } } else if (shader_size == 1) { if (std::is_same() && std::is_same()) { - p = &ctx->pipeline_matmul_f32_aligned_m; + p = ctx->device->pipeline_matmul_f32_aligned_m; shname = "F32_ALIGNED_M"; } else if (std::is_same() && std::is_same()) { - p = &ctx->pipeline_matmul_f16_f32_aligned_m; + p = ctx->device->pipeline_matmul_f16_f32_aligned_m; shname = "F16_F32_ALIGNED_M"; } else if (std::is_same() && std::is_same()) { - p = &ctx->pipeline_matmul_f16_aligned_m; + p = ctx->device->pipeline_matmul_f16_aligned_m; shname = "F16_ALIGNED_M"; } else { GGML_ASSERT(false); } } else if (shader_size == 2) { if (std::is_same() && std::is_same()) { - p = &ctx->pipeline_matmul_f32_aligned_l; + p = ctx->device->pipeline_matmul_f32_aligned_l; shname = "F32_ALIGNED_L"; } else if (std::is_same() && std::is_same()) { - p = &ctx->pipeline_matmul_f16_f32_aligned_l; + p = ctx->device->pipeline_matmul_f16_f32_aligned_l; shname = "F16_F32_ALIGNED_L"; } else if (std::is_same() && std::is_same()) { - p = &ctx->pipeline_matmul_f16_aligned_l; + p = ctx->device->pipeline_matmul_f16_aligned_l; shname = "F16_ALIGNED_L"; } else { GGML_ASSERT(false); @@ -3347,35 +3402,35 @@ static void ggml_vk_test_matmul(ggml_backend_vk_context * ctx, size_t m, size_t if (k != kpad) { if (shader_size == 0) { if (std::is_same() && std::is_same()) { - p = &ctx->pipeline_matmul_f32_s; + p = ctx->device->pipeline_matmul_f32_s; shname = "F32_S"; } else if (std::is_same() && std::is_same()) { - p = &ctx->pipeline_matmul_f16_f32_s; + p = ctx->device->pipeline_matmul_f16_f32_s; shname = "F16_F32_S"; } else if (std::is_same() && std::is_same()) { - p = &ctx->pipeline_matmul_f16_s; + p = ctx->device->pipeline_matmul_f16_s; shname = "F16_S"; } } else if (shader_size == 1) { if (std::is_same() && std::is_same()) { - p = &ctx->pipeline_matmul_f32_m; + p = ctx->device->pipeline_matmul_f32_m; shname = "F32_M"; } else if (std::is_same() && std::is_same()) { - p = &ctx->pipeline_matmul_f16_f32_m; + p = ctx->device->pipeline_matmul_f16_f32_m; shname = "F16_F32_M"; } else if (std::is_same() && std::is_same()) { - p = &ctx->pipeline_matmul_f16_m; + p = ctx->device->pipeline_matmul_f16_m; shname = "F16_M"; } } else if (shader_size == 2) { if (std::is_same() && std::is_same()) { - p = &ctx->pipeline_matmul_f32_l; + p = ctx->device->pipeline_matmul_f32_l; shname = "F32_L"; } else if (std::is_same() && std::is_same()) { - p = &ctx->pipeline_matmul_f16_f32_l; + p = ctx->device->pipeline_matmul_f16_f32_l; shname = "F16_F32_L"; } else if (std::is_same() && std::is_same()) { - p = &ctx->pipeline_matmul_f16_l; + p = ctx->device->pipeline_matmul_f16_l; shname = "F16_L"; } } @@ -3383,7 +3438,7 @@ static void ggml_vk_test_matmul(ggml_backend_vk_context * ctx, size_t m, size_t ggml_pipeline_allocate_descriptor_sets(ctx, *p, num_it); if (split_k > 1) { - ggml_pipeline_allocate_descriptor_sets(ctx, ctx->pipeline_matmul_split_k_reduce, num_it); + ggml_pipeline_allocate_descriptor_sets(ctx, ctx->device->pipeline_matmul_split_k_reduce, num_it); if (ctx->prealloc_split_k == nullptr || ctx->prealloc_split_k->size < sizeof(float) * d_ne * split_k) { // Resize buffer @@ -3424,7 +3479,7 @@ static void ggml_vk_test_matmul(ggml_backend_vk_context * ctx, size_t m, size_t ggml_vk_buffer_write(ctx, d_X, 0, x, sizeof(X_TYPE) * k * m * batch); ggml_vk_buffer_write(ctx, d_Y, 0, y, sizeof(Y_TYPE) * k * n * batch); - vk_context * subctx = ggml_vk_create_context(ctx, ctx->device.lock()->compute_queue); + vk_context * subctx = ggml_vk_create_context(ctx, ctx->device->compute_queue); for (size_t i = 0; i < num_it; i++) { ggml_vk_ctx_begin(ctx, subctx); ggml_vk_matmul(ctx, subctx, *p, ggml_vk_subbuffer(d_X), ggml_vk_subbuffer(d_Y), ggml_vk_subbuffer(d_D), ggml_vk_subbuffer(ctx->prealloc_split_k), m, n, k, k, k, m, split_k, batch, batch, batch, 1, 1, k*m, k*n, m*n); @@ -3433,8 +3488,8 @@ static void ggml_vk_test_matmul(ggml_backend_vk_context * ctx, size_t m, size_t auto begin = std::chrono::high_resolution_clock::now(); ggml_vk_submit(subctx, ctx->fence); - VK_CHECK(ctx->device.lock()->device.waitForFences({ ctx->fence }, true, UINT64_MAX), "ggml_vk_test_matmul waitForFences"); - ctx->device.lock()->device.resetFences({ ctx->fence }); + VK_CHECK(ctx->device->device.waitForFences({ ctx->fence }, true, UINT64_MAX), "ggml_vk_test_matmul waitForFences"); + ctx->device->device.resetFences({ ctx->fence }); auto end = std::chrono::high_resolution_clock::now(); double time = std::chrono::duration_cast(end-begin).count() / 1000.0; @@ -3538,15 +3593,15 @@ static void ggml_vk_test_matmul(ggml_backend_vk_context * ctx, size_t m, size_t free(d_chk); - ggml_vk_queue_cleanup(ctx, ctx->device.lock()->transfer_queue); - ggml_vk_queue_cleanup(ctx, ctx->device.lock()->compute_queue); + ggml_vk_queue_cleanup(ctx, ctx->device->transfer_queue); + ggml_vk_queue_cleanup(ctx, ctx->device->compute_queue); ggml_vk_destroy_buffer(d_X); ggml_vk_destroy_buffer(d_Y); ggml_vk_destroy_buffer(d_D); ggml_pipeline_cleanup(*p); - ggml_pipeline_cleanup(ctx->pipeline_matmul_split_k_reduce); + ggml_pipeline_cleanup(ctx->device->pipeline_matmul_split_k_reduce); free(x); free(y); @@ -3619,7 +3674,7 @@ static void ggml_vk_test_h2d_nc(ggml_backend_vk_context * ctx, size_t ne0, size_ data[i] = (rand() / (float)RAND_MAX) * 2.0f - 1.0f; } - vk_context * subctx = ggml_vk_create_context(ctx, ctx->device.lock()->compute_queue); + vk_context * subctx = ggml_vk_create_context(ctx, ctx->device->compute_queue); ggml_vk_ctx_begin(ctx, subctx); vk_buffer buffer = ggml_vk_create_buffer_check(ctx, ggml_nbytes(tensor), vk::MemoryPropertyFlagBits::eDeviceLocal); @@ -3628,8 +3683,8 @@ static void ggml_vk_test_h2d_nc(ggml_backend_vk_context * ctx, size_t ne0, size_ ggml_vk_ctx_end(subctx); ggml_vk_submit(subctx, ctx->fence); - VK_CHECK(ctx->device.lock()->device.waitForFences({ ctx->fence }, true, UINT64_MAX), "ggml_vk_test_h2d_nc waitForFences"); - ctx->device.lock()->device.resetFences({ ctx->fence }); + VK_CHECK(ctx->device->device.waitForFences({ ctx->fence }, true, UINT64_MAX), "ggml_vk_test_h2d_nc waitForFences"); + ctx->device->device.resetFences({ ctx->fence }); ggml_vk_buffer_read(ctx, buffer, 0, result_data, ggml_nbytes(tensor)); @@ -3701,7 +3756,7 @@ static void ggml_vk_test_transfer(ggml_backend_vk_context * ctx, size_t ne, bool x[i] = rand() / (float)RAND_MAX; } - vk_context * subctx = ggml_vk_create_context(ctx, ctx->device.lock()->compute_queue); + vk_context * subctx = ggml_vk_create_context(ctx, ctx->device->compute_queue); ggml_vk_ctx_begin(ctx, subctx); auto begin = std::chrono::high_resolution_clock::now(); @@ -3715,8 +3770,8 @@ static void ggml_vk_test_transfer(ggml_backend_vk_context * ctx, size_t ne, bool ggml_vk_ctx_end(subctx); ggml_vk_submit(subctx, ctx->fence); - VK_CHECK(ctx->device.lock()->device.waitForFences({ ctx->fence }, true, UINT64_MAX), "ggml_vk_test_transfer waitForFences"); - ctx->device.lock()->device.resetFences({ ctx->fence }); + VK_CHECK(ctx->device->device.waitForFences({ ctx->fence }, true, UINT64_MAX), "ggml_vk_test_transfer waitForFences"); + ctx->device->device.resetFences({ ctx->fence }); auto end = std::chrono::high_resolution_clock::now(); @@ -3730,8 +3785,8 @@ static void ggml_vk_test_transfer(ggml_backend_vk_context * ctx, size_t ne, bool ggml_vk_ctx_end(subctx); ggml_vk_submit(subctx, ctx->fence); - VK_CHECK(ctx->device.lock()->device.waitForFences({ ctx->fence }, true, UINT64_MAX), "ggml_vk_test_transfer waitForFences"); - ctx->device.lock()->device.resetFences({ ctx->fence }); + VK_CHECK(ctx->device->device.waitForFences({ ctx->fence }, true, UINT64_MAX), "ggml_vk_test_transfer waitForFences"); + ctx->device->device.resetFences({ ctx->fence }); for (auto& cpy : subctx->out_memcpys) { memcpy(cpy.dst, cpy.src, cpy.n); @@ -3781,7 +3836,7 @@ static void ggml_vk_test_dequant(ggml_backend_vk_context * ctx, size_t ne, ggml_ std::vector hist_cur(1 << 4, 0); - vk_pipeline& p = ctx->pipeline_dequant[quant]; + vk_pipeline& p = ctx->device->pipeline_dequant[quant]; switch(quant) { case GGML_TYPE_Q4_0: @@ -3822,17 +3877,17 @@ static void ggml_vk_test_dequant(ggml_backend_vk_context * ctx, size_t ne, ggml_ ggml_vk_buffer_write(ctx, qx_buf, 0, qx, qx_sz); - vk_context * subctx = ggml_vk_create_context(ctx, ctx->device.lock()->compute_queue); + vk_context * subctx = ggml_vk_create_context(ctx, ctx->device->compute_queue); ggml_vk_ctx_begin(ctx, subctx); - const std::vector pc = { 1, (uint32_t)ne, (uint32_t)ne, (uint32_t)ne, (uint32_t)(ne / 32) }; + const std::vector pc = { 1, (uint32_t)ne, (uint32_t)ne, (uint32_t)ne, (uint32_t)ne }; ggml_vk_dispatch_pipeline(ctx, subctx, p, { { qx_buf, 0, qx_sz }, { x_buf, 0, x_sz_f16 } }, pc.size() * sizeof(int), pc.data(), { (uint32_t)ne, 1, 1}); ggml_vk_ctx_end(subctx); auto begin = std::chrono::high_resolution_clock::now(); ggml_vk_submit(subctx, ctx->fence); - VK_CHECK(ctx->device.lock()->device.waitForFences({ ctx->fence }, true, UINT64_MAX), "ggml_vk_test_dequant waitForFences"); - ctx->device.lock()->device.resetFences({ ctx->fence }); + VK_CHECK(ctx->device->device.waitForFences({ ctx->fence }, true, UINT64_MAX), "ggml_vk_test_dequant waitForFences"); + ctx->device->device.resetFences({ ctx->fence }); auto end = std::chrono::high_resolution_clock::now(); @@ -3927,11 +3982,11 @@ static void ggml_vk_preallocate_buffers_graph(ggml_backend_vk_context * ctx, ggm const uint32_t y_ne = ne10 * ne11; const uint32_t d_ne = ne20 * ne21; - const uint64_t qx_sz = use_src0 ? ggml_vk_align_size(ggml_type_size(src0->type) * x_ne / ggml_blck_size(src0->type), ctx->device.lock()->properties.limits.minStorageBufferOffsetAlignment) * ne02 * ne03 : 0; - const uint64_t qy_sz = use_src1 ? ggml_vk_align_size(ggml_type_size(src1->type) * y_ne / ggml_blck_size(src1->type), ctx->device.lock()->properties.limits.minStorageBufferOffsetAlignment) * ne12 * ne13 : 0; - const uint64_t x_sz = use_src0 ? ggml_vk_align_size(sizeof(ggml_fp16_t) * x_ne, ctx->device.lock()->properties.limits.minStorageBufferOffsetAlignment) * ne02 * ne03 : 0; - const uint64_t y_sz = use_src1 ? ggml_vk_align_size(f16_f32_kernel ? sizeof(float) * y_ne : sizeof(ggml_fp16_t) * y_ne, ctx->device.lock()->properties.limits.minStorageBufferOffsetAlignment) * ne12 * ne13 : 0; - uint64_t d_sz = ggml_vk_align_size(ggml_type_size(node->type) * d_ne, ctx->device.lock()->properties.limits.minStorageBufferOffsetAlignment) * ne22 * ne23; + const uint64_t qx_sz = use_src0 ? ggml_vk_align_size(ggml_type_size(src0->type) * x_ne / ggml_blck_size(src0->type), ctx->device->properties.limits.minStorageBufferOffsetAlignment) * ne02 * ne03 : 0; + const uint64_t qy_sz = use_src1 ? ggml_vk_align_size(ggml_type_size(src1->type) * y_ne / ggml_blck_size(src1->type), ctx->device->properties.limits.minStorageBufferOffsetAlignment) * ne12 * ne13 : 0; + const uint64_t x_sz = use_src0 ? ggml_vk_align_size(sizeof(ggml_fp16_t) * x_ne, ctx->device->properties.limits.minStorageBufferOffsetAlignment) * ne02 * ne03 : 0; + const uint64_t y_sz = use_src1 ? ggml_vk_align_size(f16_f32_kernel ? sizeof(float) * y_ne : sizeof(ggml_fp16_t) * y_ne, ctx->device->properties.limits.minStorageBufferOffsetAlignment) * ne12 * ne13 : 0; + uint64_t d_sz = ggml_vk_align_size(ggml_type_size(node->type) * d_ne, ctx->device->properties.limits.minStorageBufferOffsetAlignment) * ne22 * ne23; const uint64_t split_k_size = split_k > 1 ? d_sz * 4 : 0; if (extra->buffer_gpu.expired()) { @@ -4161,7 +4216,7 @@ static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod } if (ctx->compute_ctx == nullptr) { - ctx->compute_ctx = ggml_vk_create_context(ctx, ctx->device.lock()->compute_queue); + ctx->compute_ctx = ggml_vk_create_context(ctx, ctx->device->compute_queue); ggml_vk_ctx_begin(ctx, ctx->compute_ctx); } @@ -4354,8 +4409,8 @@ static bool ggml_vk_compute_forward(ggml_backend_vk_context * ctx, ggml_compute_ } if (tensor == subctx.exit_tensor) { - VK_CHECK(ctx->device.lock()->device.waitForFences({ ctx->fence }, true, UINT64_MAX), "ggml_vk_compute_forward waitForFences"); - ctx->device.lock()->device.resetFences({ ctx->fence }); + VK_CHECK(ctx->device->device.waitForFences({ ctx->fence }, true, UINT64_MAX), "ggml_vk_compute_forward waitForFences"); + ctx->device->device.resetFences({ ctx->fence }); // Do staging buffer copies for (auto& cpy : subctx.out_memcpys) { @@ -4383,20 +4438,25 @@ static void ggml_vk_graph_cleanup(ggml_backend_vk_context * ctx) { } ctx->gc.temp_buffers.clear(); - for (auto * pipeline : ctx->gc.pipelines) { - ggml_pipeline_cleanup(*pipeline); + for (auto& pipeline : ctx->device->pipelines) { + if (pipeline.expired()) { + continue; + } + + vk_pipeline pl = pipeline.lock(); + ggml_pipeline_cleanup(pl); } - ggml_vk_queue_cleanup(ctx, ctx->device.lock()->compute_queue); - ggml_vk_queue_cleanup(ctx, ctx->device.lock()->transfer_queue); + ggml_vk_queue_cleanup(ctx, ctx->device->compute_queue); + ggml_vk_queue_cleanup(ctx, ctx->device->transfer_queue); for (size_t i = 0; i < ctx->gc.semaphores.size(); i++) { - ctx->device.lock()->device.destroySemaphore({ ctx->gc.semaphores[i].s }); + ctx->device->device.destroySemaphore({ ctx->gc.semaphores[i].s }); } ctx->gc.semaphores.clear(); for (size_t i = 0; i < ctx->gc.tl_semaphores.size(); i++) { - ctx->device.lock()->device.destroySemaphore({ ctx->gc.tl_semaphores[i].s }); + ctx->device->device.destroySemaphore({ ctx->gc.tl_semaphores[i].s }); } ctx->gc.tl_semaphores.clear(); ctx->semaphore_idx = 0; @@ -4404,7 +4464,7 @@ static void ggml_vk_graph_cleanup(ggml_backend_vk_context * ctx) { ctx->event_idx = 0; for (auto& event : ctx->gc.events) { - ctx->device.lock()->device.resetEvent(event); + ctx->device->device.resetEvent(event); } ctx->staging_offset = 0; @@ -4441,21 +4501,11 @@ static void ggml_vk_cleanup(ggml_backend_vk_context * ctx) { ctx->staging_size = 0; for (auto& event : ctx->gc.events) { - ctx->device.lock()->device.destroyEvent(event); + ctx->device->device.destroyEvent(event); } ctx->gc.events.clear(); - for (auto* pipeline : ctx->gc.pipelines) { - ggml_vk_destroy_pipeline(ctx, pipeline); - } - ctx->gc.pipelines.clear(); - - ctx->device.lock()->device.destroyFence(ctx->fence); - - ctx->device.lock()->device.destroyCommandPool(ctx->device.lock()->compute_queue.pool); - if (!ctx->device.lock()->single_queue) { - ctx->device.lock()->device.destroyCommandPool(ctx->device.lock()->transfer_queue.pool); - } + ctx->device->device.destroyFence(ctx->fence); } GGML_CALL int ggml_vk_get_device_count() { @@ -4724,12 +4774,12 @@ GGML_CALL static ggml_backend_buffer_t ggml_backend_vk_buffer_type_alloc_buffer( GGML_CALL static size_t ggml_backend_vk_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) { ggml_backend_vk_buffer_type_context * ctx = (ggml_backend_vk_buffer_type_context *) buft->context; - return ctx->ctx->device.lock()->properties.limits.minStorageBufferOffsetAlignment; + return ctx->ctx->device->properties.limits.minStorageBufferOffsetAlignment; } GGML_CALL static size_t ggml_backend_vk_buffer_type_get_max_size(ggml_backend_buffer_type_t buft) { ggml_backend_vk_buffer_type_context * ctx = (ggml_backend_vk_buffer_type_context *) buft->context; - return ctx->ctx->device.lock()->max_memory_allocation_size; + return ctx->ctx->device->max_memory_allocation_size; } GGML_CALL static size_t ggml_backend_vk_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, const ggml_tensor * tensor) { @@ -4815,7 +4865,7 @@ GGML_CALL static ggml_backend_buffer_t ggml_backend_vk_host_buffer_type_alloc_bu } GGML_CALL static size_t ggml_backend_vk_host_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) { - return vk_instance.contexts[0].device.lock()->properties.limits.minMemoryMapAlignment; + return vk_instance.contexts[0].device->properties.limits.minMemoryMapAlignment; UNUSED(buft); } @@ -4860,8 +4910,7 @@ GGML_CALL static void ggml_backend_vk_free(ggml_backend_t backend) { ggml_vk_cleanup(ctx); - // Release device - vk_instance.devices[ctx->idx].reset(); + ctx->device.reset(); ctx->initialized = false; vk_instance.initialized[idx] = false; @@ -4890,7 +4939,7 @@ GGML_CALL static void ggml_backend_vk_set_tensor_async(ggml_backend_t backend, g if (ctx->transfer_ctx == nullptr) { // Initialize new transfer context - ctx->transfer_ctx = ggml_vk_create_context(ctx, ctx->device.lock()->transfer_queue); + ctx->transfer_ctx = ggml_vk_create_context(ctx, ctx->device->transfer_queue); ggml_vk_ctx_begin(ctx, ctx->transfer_ctx); } @@ -4911,7 +4960,7 @@ GGML_CALL static void ggml_backend_vk_get_tensor_async(ggml_backend_t backend, c if (ctx->transfer_ctx == nullptr) { // Initialize new transfer context - ctx->transfer_ctx = ggml_vk_create_context(ctx, ctx->device.lock()->transfer_queue); + ctx->transfer_ctx = ggml_vk_create_context(ctx, ctx->device->transfer_queue); ggml_vk_ctx_begin(ctx, ctx->transfer_ctx); } @@ -4931,7 +4980,7 @@ GGML_CALL static bool ggml_backend_vk_cpy_tensor_async(ggml_backend_t backend, c if (ctx->transfer_ctx == nullptr) { // Initialize new transfer context - ctx->transfer_ctx = ggml_vk_create_context(ctx, ctx->device.lock()->transfer_queue); + ctx->transfer_ctx = ggml_vk_create_context(ctx, ctx->device->transfer_queue); ggml_vk_ctx_begin(ctx, ctx->transfer_ctx); } @@ -4961,8 +5010,8 @@ GGML_CALL static void ggml_backend_vk_synchronize(ggml_backend_t backend) { } ggml_vk_submit(ctx->transfer_ctx, ctx->fence); - VK_CHECK(ctx->device.lock()->device.waitForFences({ ctx->fence }, true, UINT64_MAX), "ggml_backend_vk_synchronize waitForFences"); - ctx->device.lock()->device.resetFences({ ctx->fence }); + VK_CHECK(ctx->device->device.waitForFences({ ctx->fence }, true, UINT64_MAX), "ggml_backend_vk_synchronize waitForFences"); + ctx->device->device.resetFences({ ctx->fence }); for (auto& cpy : ctx->transfer_ctx->out_memcpys) { memcpy(cpy.dst, cpy.src, cpy.n); diff --git a/ggml_vk_generate_shaders.py b/ggml_vk_generate_shaders.py index 59205908c..6713f14d1 100644 --- a/ggml_vk_generate_shaders.py +++ b/ggml_vk_generate_shaders.py @@ -445,7 +445,7 @@ layout (push_constant) uniform parameter uint K; uint stride_a; uint stride_b; - uint num_groups; + uint nel; } p; """ @@ -483,31 +483,30 @@ void main() { """ dequant_q4_0_body = """ -layout(local_size_x = 32, local_size_y = 1, local_size_z = 1) in; +layout(local_size_x = 256, local_size_y = 1, local_size_z = 1) in; layout (binding = 0) readonly buffer A {block_q4_0 data_a[];}; layout (binding = 1) writeonly buffer D {D_TYPE data_b[];}; void main() { - const uint i = gl_WorkGroupID.x; + const uint i = gl_WorkGroupID.x * 4 + gl_LocalInvocationID.x / 64; - // assume 32 threads - const uint tid = gl_LocalInvocationID.x; - const uint il = tid/8; - const uint ir = tid%8; - const uint ib = 8*i + ir; - if (ib >= p.num_groups) { + const uint tid = gl_LocalInvocationID.x % 64; + const uint il = tid/32; + const uint ir = tid%32; + const uint ib = 32*i + ir; + if (ib >= p.nel / 32) { return; } - const uint b_idx = 256*i + 32*ir + 4*il; + const uint b_idx = 1024*i + 32*ir + 8*il; const float d = float(data_a[ib].d); const float dm = -8.0f * d; - const uint q_idx = 4*il; + const uint q_idx = 8*il; - [[unroll]] for (uint l = 0; l < 4; ++l) { + [[unroll]] for (uint l = 0; l < 8; ++l) { data_b[b_idx + l + 0] = D_TYPE(d * (data_a[ib].qs[q_idx + l] & 0xF) + dm); data_b[b_idx + l + 16] = D_TYPE(d * (data_a[ib].qs[q_idx + l] >> 4) + dm); }