diff --git a/ggml-vulkan-shaders.hpp b/ggml-vulkan-shaders.hpp
index 7a768e1ed..612c8669d 100644
--- a/ggml-vulkan-shaders.hpp
+++ b/ggml-vulkan-shaders.hpp
@@ -1793,7 +1793,7 @@ const uint64_t dequant_q3_K_len = 4804;
 
 unsigned char dequant_q4_0_data[] = {
 0x03,0x02,0x23,0x07,0x00,0x05,0x01,0x00,0x0b,0x00,0x0d,0x00,
-0xf7,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x11,0x00,0x02,0x00,
+0x83,0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x11,0x00,0x02,0x00,
 0x01,0x00,0x00,0x00,0x11,0x00,0x02,0x00,0x27,0x00,0x00,0x00,
 0x11,0x00,0x02,0x00,0x51,0x11,0x00,0x00,0x11,0x00,0x02,0x00,
 0x60,0x11,0x00,0x00,0x0b,0x00,0x06,0x00,0x01,0x00,0x00,0x00,
@@ -1801,43 +1801,43 @@ unsigned char dequant_q4_0_data[] = {
 0x00,0x00,0x00,0x00,0x0e,0x00,0x03,0x00,0x00,0x00,0x00,0x00,
 0x01,0x00,0x00,0x00,0x0f,0x00,0x0a,0x00,0x05,0x00,0x00,0x00,
 0x04,0x00,0x00,0x00,0x6d,0x61,0x69,0x6e,0x00,0x00,0x00,0x00,
-0x0b,0x00,0x00,0x00,0x11,0x00,0x00,0x00,0x23,0x00,0x00,0x00,
-0x45,0x00,0x00,0x00,0x5e,0x00,0x00,0x00,0x10,0x00,0x06,0x00,
-0x04,0x00,0x00,0x00,0x11,0x00,0x00,0x00,0x20,0x00,0x00,0x00,
+0x0b,0x00,0x00,0x00,0x12,0x00,0x00,0x00,0x2b,0x00,0x00,0x00,
+0x4d,0x00,0x00,0x00,0x66,0x00,0x00,0x00,0x10,0x00,0x06,0x00,
+0x04,0x00,0x00,0x00,0x11,0x00,0x00,0x00,0x00,0x01,0x00,0x00,
 0x01,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
 0x0b,0x00,0x00,0x00,0x0b,0x00,0x00,0x00,0x1a,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0x11,0x00,0x00,0x00,0x0b,0x00,0x00,0x00,
-0x1b,0x00,0x00,0x00,0x48,0x00,0x05,0x00,0x21,0x00,0x00,0x00,
+0x47,0x00,0x04,0x00,0x12,0x00,0x00,0x00,0x0b,0x00,0x00,0x00,
+0x1b,0x00,0x00,0x00,0x48,0x00,0x05,0x00,0x29,0x00,0x00,0x00,
 0x00,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x48,0x00,0x05,0x00,0x21,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
+0x48,0x00,0x05,0x00,0x29,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
 0x23,0x00,0x00,0x00,0x04,0x00,0x00,0x00,0x48,0x00,0x05,0x00,
-0x21,0x00,0x00,0x00,0x02,0x00,0x00,0x00,0x23,0x00,0x00,0x00,
-0x08,0x00,0x00,0x00,0x48,0x00,0x05,0x00,0x21,0x00,0x00,0x00,
+0x29,0x00,0x00,0x00,0x02,0x00,0x00,0x00,0x23,0x00,0x00,0x00,
+0x08,0x00,0x00,0x00,0x48,0x00,0x05,0x00,0x29,0x00,0x00,0x00,
 0x03,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,
-0x48,0x00,0x05,0x00,0x21,0x00,0x00,0x00,0x04,0x00,0x00,0x00,
+0x48,0x00,0x05,0x00,0x29,0x00,0x00,0x00,0x04,0x00,0x00,0x00,
 0x23,0x00,0x00,0x00,0x10,0x00,0x00,0x00,0x47,0x00,0x03,0x00,
-0x21,0x00,0x00,0x00,0x02,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0x40,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x48,0x00,0x05,0x00,0x41,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
+0x29,0x00,0x00,0x00,0x02,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
+0x48,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
+0x48,0x00,0x05,0x00,0x49,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
 0x23,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x48,0x00,0x05,0x00,
-0x41,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x23,0x00,0x00,0x00,
-0x02,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x42,0x00,0x00,0x00,
+0x49,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x23,0x00,0x00,0x00,
+0x02,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x4a,0x00,0x00,0x00,
 0x06,0x00,0x00,0x00,0x12,0x00,0x00,0x00,0x48,0x00,0x04,0x00,
-0x43,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x18,0x00,0x00,0x00,
-0x48,0x00,0x05,0x00,0x43,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
+0x4b,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x18,0x00,0x00,0x00,
+0x48,0x00,0x05,0x00,0x4b,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
 0x23,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x47,0x00,0x03,0x00,
-0x43,0x00,0x00,0x00,0x02,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0x45,0x00,0x00,0x00,0x22,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0x45,0x00,0x00,0x00,0x21,0x00,0x00,0x00,
-0x00,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x5b,0x00,0x00,0x00,
+0x4b,0x00,0x00,0x00,0x02,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
+0x4d,0x00,0x00,0x00,0x22,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
+0x47,0x00,0x04,0x00,0x4d,0x00,0x00,0x00,0x21,0x00,0x00,0x00,
+0x00,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x63,0x00,0x00,0x00,
 0x06,0x00,0x00,0x00,0x02,0x00,0x00,0x00,0x48,0x00,0x04,0x00,
-0x5c,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x19,0x00,0x00,0x00,
-0x48,0x00,0x05,0x00,0x5c,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
+0x64,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x19,0x00,0x00,0x00,
+0x48,0x00,0x05,0x00,0x64,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
 0x23,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x47,0x00,0x03,0x00,
-0x5c,0x00,0x00,0x00,0x02,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
-0x5e,0x00,0x00,0x00,0x22,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x47,0x00,0x04,0x00,0x5e,0x00,0x00,0x00,0x21,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x8b,0x00,0x00,0x00,
+0x64,0x00,0x00,0x00,0x02,0x00,0x00,0x00,0x47,0x00,0x04,0x00,
+0x66,0x00,0x00,0x00,0x22,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
+0x47,0x00,0x04,0x00,0x66,0x00,0x00,0x00,0x21,0x00,0x00,0x00,
+0x01,0x00,0x00,0x00,0x47,0x00,0x04,0x00,0x94,0x00,0x00,0x00,
 0x0b,0x00,0x00,0x00,0x19,0x00,0x00,0x00,0x13,0x00,0x02,0x00,
 0x02,0x00,0x00,0x00,0x21,0x00,0x03,0x00,0x03,0x00,0x00,0x00,
 0x02,0x00,0x00,0x00,0x15,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
@@ -1848,232 +1848,385 @@ unsigned char dequant_q4_0_data[] = {
 0x0b,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
 0x06,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
 0x20,0x00,0x04,0x00,0x0d,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x3b,0x00,0x04,0x00,0x0a,0x00,0x00,0x00,
-0x11,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x16,0x00,0x00,0x00,0x08,0x00,0x00,0x00,
-0x1e,0x00,0x07,0x00,0x21,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
+0x06,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
+0x10,0x00,0x00,0x00,0x04,0x00,0x00,0x00,0x3b,0x00,0x04,0x00,
+0x0a,0x00,0x00,0x00,0x12,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
+0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x15,0x00,0x00,0x00,
+0x40,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
+0x1e,0x00,0x00,0x00,0x20,0x00,0x00,0x00,0x1e,0x00,0x07,0x00,
+0x29,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
 0x06,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x06,0x00,0x00,0x00,0x20,0x00,0x04,0x00,0x22,0x00,0x00,0x00,
-0x09,0x00,0x00,0x00,0x21,0x00,0x00,0x00,0x3b,0x00,0x04,0x00,
-0x22,0x00,0x00,0x00,0x23,0x00,0x00,0x00,0x09,0x00,0x00,0x00,
-0x15,0x00,0x04,0x00,0x24,0x00,0x00,0x00,0x20,0x00,0x00,0x00,
-0x01,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x24,0x00,0x00,0x00,
-0x25,0x00,0x00,0x00,0x04,0x00,0x00,0x00,0x20,0x00,0x04,0x00,
-0x26,0x00,0x00,0x00,0x09,0x00,0x00,0x00,0x06,0x00,0x00,0x00,
-0x14,0x00,0x02,0x00,0x29,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x2f,0x00,0x00,0x00,0x00,0x01,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x32,0x00,0x00,0x00,
-0x20,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x36,0x00,0x00,0x00,0x04,0x00,0x00,0x00,0x16,0x00,0x03,0x00,
-0x3a,0x00,0x00,0x00,0x20,0x00,0x00,0x00,0x16,0x00,0x03,0x00,
-0x3d,0x00,0x00,0x00,0x10,0x00,0x00,0x00,0x15,0x00,0x04,0x00,
-0x3e,0x00,0x00,0x00,0x08,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x3f,0x00,0x00,0x00,
-0x10,0x00,0x00,0x00,0x1c,0x00,0x04,0x00,0x40,0x00,0x00,0x00,
-0x3e,0x00,0x00,0x00,0x3f,0x00,0x00,0x00,0x1e,0x00,0x04,0x00,
-0x41,0x00,0x00,0x00,0x3d,0x00,0x00,0x00,0x40,0x00,0x00,0x00,
-0x1d,0x00,0x03,0x00,0x42,0x00,0x00,0x00,0x41,0x00,0x00,0x00,
-0x1e,0x00,0x03,0x00,0x43,0x00,0x00,0x00,0x42,0x00,0x00,0x00,
-0x20,0x00,0x04,0x00,0x44,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,
-0x43,0x00,0x00,0x00,0x3b,0x00,0x04,0x00,0x44,0x00,0x00,0x00,
-0x45,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x24,0x00,0x00,0x00,0x46,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
-0x20,0x00,0x04,0x00,0x48,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,
-0x3d,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x3a,0x00,0x00,0x00,
-0x4d,0x00,0x00,0x00,0x00,0x00,0x00,0xc1,0x1d,0x00,0x03,0x00,
-0x5b,0x00,0x00,0x00,0x3d,0x00,0x00,0x00,0x1e,0x00,0x03,0x00,
-0x5c,0x00,0x00,0x00,0x5b,0x00,0x00,0x00,0x20,0x00,0x04,0x00,
-0x5d,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,0x5c,0x00,0x00,0x00,
-0x3b,0x00,0x04,0x00,0x5d,0x00,0x00,0x00,0x5e,0x00,0x00,0x00,
-0x0c,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x24,0x00,0x00,0x00,
-0x65,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x20,0x00,0x04,0x00,
-0x69,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,0x3e,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x24,0x00,0x00,0x00,0x6e,0x00,0x00,0x00,
-0x0f,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0x8a,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x2c,0x00,0x06,0x00,
-0x09,0x00,0x00,0x00,0x8b,0x00,0x00,0x00,0x32,0x00,0x00,0x00,
-0x8a,0x00,0x00,0x00,0x8a,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0xf2,0x00,0x00,0x00,0x11,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0xf3,0x00,0x00,0x00,
-0x02,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
-0xf4,0x00,0x00,0x00,0x12,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0xf5,0x00,0x00,0x00,0x03,0x00,0x00,0x00,
-0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0xf6,0x00,0x00,0x00,
-0x13,0x00,0x00,0x00,0x36,0x00,0x05,0x00,0x02,0x00,0x00,0x00,
+0x20,0x00,0x04,0x00,0x2a,0x00,0x00,0x00,0x09,0x00,0x00,0x00,
+0x29,0x00,0x00,0x00,0x3b,0x00,0x04,0x00,0x2a,0x00,0x00,0x00,
+0x2b,0x00,0x00,0x00,0x09,0x00,0x00,0x00,0x15,0x00,0x04,0x00,
+0x2c,0x00,0x00,0x00,0x20,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
+0x2b,0x00,0x04,0x00,0x2c,0x00,0x00,0x00,0x2d,0x00,0x00,0x00,
+0x04,0x00,0x00,0x00,0x20,0x00,0x04,0x00,0x2e,0x00,0x00,0x00,
+0x09,0x00,0x00,0x00,0x06,0x00,0x00,0x00,0x14,0x00,0x02,0x00,
+0x32,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
+0x38,0x00,0x00,0x00,0x00,0x04,0x00,0x00,0x2b,0x00,0x04,0x00,
+0x06,0x00,0x00,0x00,0x3e,0x00,0x00,0x00,0x08,0x00,0x00,0x00,
+0x16,0x00,0x03,0x00,0x42,0x00,0x00,0x00,0x20,0x00,0x00,0x00,
+0x16,0x00,0x03,0x00,0x45,0x00,0x00,0x00,0x10,0x00,0x00,0x00,
+0x15,0x00,0x04,0x00,0x46,0x00,0x00,0x00,0x08,0x00,0x00,0x00,
+0x00,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
+0x47,0x00,0x00,0x00,0x10,0x00,0x00,0x00,0x1c,0x00,0x04,0x00,
+0x48,0x00,0x00,0x00,0x46,0x00,0x00,0x00,0x47,0x00,0x00,0x00,
+0x1e,0x00,0x04,0x00,0x49,0x00,0x00,0x00,0x45,0x00,0x00,0x00,
+0x48,0x00,0x00,0x00,0x1d,0x00,0x03,0x00,0x4a,0x00,0x00,0x00,
+0x49,0x00,0x00,0x00,0x1e,0x00,0x03,0x00,0x4b,0x00,0x00,0x00,
+0x4a,0x00,0x00,0x00,0x20,0x00,0x04,0x00,0x4c,0x00,0x00,0x00,
+0x0c,0x00,0x00,0x00,0x4b,0x00,0x00,0x00,0x3b,0x00,0x04,0x00,
+0x4c,0x00,0x00,0x00,0x4d,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,
+0x2b,0x00,0x04,0x00,0x2c,0x00,0x00,0x00,0x4e,0x00,0x00,0x00,
+0x00,0x00,0x00,0x00,0x20,0x00,0x04,0x00,0x50,0x00,0x00,0x00,
+0x0c,0x00,0x00,0x00,0x45,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
+0x42,0x00,0x00,0x00,0x55,0x00,0x00,0x00,0x00,0x00,0x00,0xc1,
+0x1d,0x00,0x03,0x00,0x63,0x00,0x00,0x00,0x45,0x00,0x00,0x00,
+0x1e,0x00,0x03,0x00,0x64,0x00,0x00,0x00,0x63,0x00,0x00,0x00,
+0x20,0x00,0x04,0x00,0x65,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,
+0x64,0x00,0x00,0x00,0x3b,0x00,0x04,0x00,0x65,0x00,0x00,0x00,
+0x66,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
+0x2c,0x00,0x00,0x00,0x6d,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
+0x20,0x00,0x04,0x00,0x71,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,
+0x46,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x2c,0x00,0x00,0x00,
+0x76,0x00,0x00,0x00,0x0f,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
+0x06,0x00,0x00,0x00,0x92,0x00,0x00,0x00,0x00,0x01,0x00,0x00,
+0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x93,0x00,0x00,0x00,
+0x01,0x00,0x00,0x00,0x2c,0x00,0x06,0x00,0x09,0x00,0x00,0x00,
+0x94,0x00,0x00,0x00,0x92,0x00,0x00,0x00,0x93,0x00,0x00,0x00,
+0x93,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
+0x77,0x01,0x00,0x00,0x11,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
+0x06,0x00,0x00,0x00,0x78,0x01,0x00,0x00,0x02,0x00,0x00,0x00,
+0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x79,0x01,0x00,0x00,
+0x12,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
+0x7a,0x01,0x00,0x00,0x03,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
+0x06,0x00,0x00,0x00,0x7b,0x01,0x00,0x00,0x13,0x00,0x00,0x00,
+0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x7c,0x01,0x00,0x00,
+0x14,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
+0x7d,0x01,0x00,0x00,0x05,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
+0x06,0x00,0x00,0x00,0x7e,0x01,0x00,0x00,0x15,0x00,0x00,0x00,
+0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x7f,0x01,0x00,0x00,
+0x06,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
+0x80,0x01,0x00,0x00,0x16,0x00,0x00,0x00,0x2b,0x00,0x04,0x00,
+0x06,0x00,0x00,0x00,0x81,0x01,0x00,0x00,0x07,0x00,0x00,0x00,
+0x2b,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x82,0x01,0x00,0x00,
+0x17,0x00,0x00,0x00,0x36,0x00,0x05,0x00,0x02,0x00,0x00,0x00,
 0x04,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x03,0x00,0x00,0x00,
 0xf8,0x00,0x02,0x00,0x05,0x00,0x00,0x00,0xf7,0x00,0x03,0x00,
-0x8c,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0xfb,0x00,0x03,0x00,
-0x0c,0x00,0x00,0x00,0x8d,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,
-0x8d,0x00,0x00,0x00,0x41,0x00,0x05,0x00,0x0d,0x00,0x00,0x00,
+0x95,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0xfb,0x00,0x03,0x00,
+0x0c,0x00,0x00,0x00,0x96,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,
+0x96,0x00,0x00,0x00,0x41,0x00,0x05,0x00,0x0d,0x00,0x00,0x00,
 0x0e,0x00,0x00,0x00,0x0b,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,
 0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x0f,0x00,0x00,0x00,
-0x0e,0x00,0x00,0x00,0x41,0x00,0x05,0x00,0x0d,0x00,0x00,0x00,
-0x12,0x00,0x00,0x00,0x11,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,
-0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x13,0x00,0x00,0x00,
-0x12,0x00,0x00,0x00,0x86,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x17,0x00,0x00,0x00,0x13,0x00,0x00,0x00,0x16,0x00,0x00,0x00,
-0x89,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x1a,0x00,0x00,0x00,
-0x13,0x00,0x00,0x00,0x16,0x00,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x1d,0x00,0x00,0x00,0x16,0x00,0x00,0x00,
-0x0f,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x1f,0x00,0x00,0x00,0x1d,0x00,0x00,0x00,0x1a,0x00,0x00,0x00,
-0x41,0x00,0x05,0x00,0x26,0x00,0x00,0x00,0x27,0x00,0x00,0x00,
-0x23,0x00,0x00,0x00,0x25,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x06,0x00,0x00,0x00,0x28,0x00,0x00,0x00,0x27,0x00,0x00,0x00,
-0xae,0x00,0x05,0x00,0x29,0x00,0x00,0x00,0x2a,0x00,0x00,0x00,
-0x1f,0x00,0x00,0x00,0x28,0x00,0x00,0x00,0xf7,0x00,0x03,0x00,
-0x2c,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0xfa,0x00,0x04,0x00,
-0x2a,0x00,0x00,0x00,0x2b,0x00,0x00,0x00,0x2c,0x00,0x00,0x00,
-0xf8,0x00,0x02,0x00,0x2b,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x8c,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,0x2c,0x00,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x31,0x00,0x00,0x00,
-0x2f,0x00,0x00,0x00,0x0f,0x00,0x00,0x00,0x84,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x34,0x00,0x00,0x00,0x32,0x00,0x00,0x00,
-0x1a,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x35,0x00,0x00,0x00,0x31,0x00,0x00,0x00,0x34,0x00,0x00,0x00,
-0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x38,0x00,0x00,0x00,
-0x36,0x00,0x00,0x00,0x17,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x39,0x00,0x00,0x00,0x35,0x00,0x00,0x00,
-0x38,0x00,0x00,0x00,0x41,0x00,0x07,0x00,0x48,0x00,0x00,0x00,
-0x49,0x00,0x00,0x00,0x45,0x00,0x00,0x00,0x46,0x00,0x00,0x00,
-0x1f,0x00,0x00,0x00,0x46,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x3d,0x00,0x00,0x00,0x4a,0x00,0x00,0x00,0x49,0x00,0x00,0x00,
-0x73,0x00,0x04,0x00,0x3a,0x00,0x00,0x00,0x4b,0x00,0x00,0x00,
-0x4a,0x00,0x00,0x00,0x85,0x00,0x05,0x00,0x3a,0x00,0x00,0x00,
-0x4f,0x00,0x00,0x00,0x4d,0x00,0x00,0x00,0x4b,0x00,0x00,0x00,
-0x41,0x00,0x08,0x00,0x69,0x00,0x00,0x00,0x6a,0x00,0x00,0x00,
-0x45,0x00,0x00,0x00,0x46,0x00,0x00,0x00,0x1f,0x00,0x00,0x00,
-0x65,0x00,0x00,0x00,0x38,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x3e,0x00,0x00,0x00,0x6b,0x00,0x00,0x00,0x6a,0x00,0x00,0x00,
-0x71,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x6c,0x00,0x00,0x00,
-0x6b,0x00,0x00,0x00,0x7c,0x00,0x04,0x00,0x24,0x00,0x00,0x00,
-0x6d,0x00,0x00,0x00,0x6c,0x00,0x00,0x00,0xc7,0x00,0x05,0x00,
-0x24,0x00,0x00,0x00,0x6f,0x00,0x00,0x00,0x6d,0x00,0x00,0x00,
-0x6e,0x00,0x00,0x00,0x6f,0x00,0x04,0x00,0x3a,0x00,0x00,0x00,
-0x70,0x00,0x00,0x00,0x6f,0x00,0x00,0x00,0x0c,0x00,0x08,0x00,
-0x3a,0x00,0x00,0x00,0x73,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x32,0x00,0x00,0x00,0x4b,0x00,0x00,0x00,0x70,0x00,0x00,0x00,
-0x4f,0x00,0x00,0x00,0x73,0x00,0x04,0x00,0x3d,0x00,0x00,0x00,
-0x74,0x00,0x00,0x00,0x73,0x00,0x00,0x00,0x41,0x00,0x06,0x00,
-0x48,0x00,0x00,0x00,0x75,0x00,0x00,0x00,0x5e,0x00,0x00,0x00,
-0x46,0x00,0x00,0x00,0x39,0x00,0x00,0x00,0x3e,0x00,0x03,0x00,
-0x75,0x00,0x00,0x00,0x74,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x79,0x00,0x00,0x00,0x39,0x00,0x00,0x00,
-0x3f,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x3e,0x00,0x00,0x00,
-0x80,0x00,0x00,0x00,0x6a,0x00,0x00,0x00,0xc2,0x00,0x05,0x00,
-0x3e,0x00,0x00,0x00,0x81,0x00,0x00,0x00,0x80,0x00,0x00,0x00,
-0x25,0x00,0x00,0x00,0x70,0x00,0x04,0x00,0x3a,0x00,0x00,0x00,
-0x82,0x00,0x00,0x00,0x81,0x00,0x00,0x00,0x0c,0x00,0x08,0x00,
-0x3a,0x00,0x00,0x00,0x85,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x32,0x00,0x00,0x00,0x4b,0x00,0x00,0x00,0x82,0x00,0x00,0x00,
-0x4f,0x00,0x00,0x00,0x73,0x00,0x04,0x00,0x3d,0x00,0x00,0x00,
-0x86,0x00,0x00,0x00,0x85,0x00,0x00,0x00,0x41,0x00,0x06,0x00,
-0x48,0x00,0x00,0x00,0x87,0x00,0x00,0x00,0x5e,0x00,0x00,0x00,
-0x46,0x00,0x00,0x00,0x79,0x00,0x00,0x00,0x3e,0x00,0x03,0x00,
-0x87,0x00,0x00,0x00,0x86,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0x99,0x00,0x00,0x00,0x39,0x00,0x00,0x00,
-0x8a,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0x9b,0x00,0x00,0x00,0x38,0x00,0x00,0x00,0x8a,0x00,0x00,0x00,
-0x41,0x00,0x08,0x00,0x69,0x00,0x00,0x00,0x9c,0x00,0x00,0x00,
-0x45,0x00,0x00,0x00,0x46,0x00,0x00,0x00,0x1f,0x00,0x00,0x00,
-0x65,0x00,0x00,0x00,0x9b,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x3e,0x00,0x00,0x00,0x9d,0x00,0x00,0x00,0x9c,0x00,0x00,0x00,
-0x71,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0x9e,0x00,0x00,0x00,
-0x9d,0x00,0x00,0x00,0x7c,0x00,0x04,0x00,0x24,0x00,0x00,0x00,
-0x9f,0x00,0x00,0x00,0x9e,0x00,0x00,0x00,0xc7,0x00,0x05,0x00,
-0x24,0x00,0x00,0x00,0xa0,0x00,0x00,0x00,0x9f,0x00,0x00,0x00,
-0x6e,0x00,0x00,0x00,0x6f,0x00,0x04,0x00,0x3a,0x00,0x00,0x00,
-0xa1,0x00,0x00,0x00,0xa0,0x00,0x00,0x00,0x0c,0x00,0x08,0x00,
-0x3a,0x00,0x00,0x00,0xa3,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x32,0x00,0x00,0x00,0x4b,0x00,0x00,0x00,0xa1,0x00,0x00,0x00,
-0x4f,0x00,0x00,0x00,0x73,0x00,0x04,0x00,0x3d,0x00,0x00,0x00,
-0xa4,0x00,0x00,0x00,0xa3,0x00,0x00,0x00,0x41,0x00,0x06,0x00,
-0x48,0x00,0x00,0x00,0xa5,0x00,0x00,0x00,0x5e,0x00,0x00,0x00,
-0x46,0x00,0x00,0x00,0x99,0x00,0x00,0x00,0x3e,0x00,0x03,0x00,
-0xa5,0x00,0x00,0x00,0xa4,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xa7,0x00,0x00,0x00,0x39,0x00,0x00,0x00,
-0xf2,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x3e,0x00,0x00,0x00,
-0xaa,0x00,0x00,0x00,0x9c,0x00,0x00,0x00,0xc2,0x00,0x05,0x00,
-0x3e,0x00,0x00,0x00,0xab,0x00,0x00,0x00,0xaa,0x00,0x00,0x00,
-0x25,0x00,0x00,0x00,0x70,0x00,0x04,0x00,0x3a,0x00,0x00,0x00,
-0xac,0x00,0x00,0x00,0xab,0x00,0x00,0x00,0x0c,0x00,0x08,0x00,
-0x3a,0x00,0x00,0x00,0xae,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x32,0x00,0x00,0x00,0x4b,0x00,0x00,0x00,0xac,0x00,0x00,0x00,
-0x4f,0x00,0x00,0x00,0x73,0x00,0x04,0x00,0x3d,0x00,0x00,0x00,
-0xaf,0x00,0x00,0x00,0xae,0x00,0x00,0x00,0x41,0x00,0x06,0x00,
-0x48,0x00,0x00,0x00,0xb0,0x00,0x00,0x00,0x5e,0x00,0x00,0x00,
-0x46,0x00,0x00,0x00,0xa7,0x00,0x00,0x00,0x3e,0x00,0x03,0x00,
-0xb0,0x00,0x00,0x00,0xaf,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xb8,0x00,0x00,0x00,0x39,0x00,0x00,0x00,
-0xf3,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xba,0x00,0x00,0x00,0x38,0x00,0x00,0x00,0xf3,0x00,0x00,0x00,
-0x41,0x00,0x08,0x00,0x69,0x00,0x00,0x00,0xbb,0x00,0x00,0x00,
-0x45,0x00,0x00,0x00,0x46,0x00,0x00,0x00,0x1f,0x00,0x00,0x00,
-0x65,0x00,0x00,0x00,0xba,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x3e,0x00,0x00,0x00,0xbc,0x00,0x00,0x00,0xbb,0x00,0x00,0x00,
-0x71,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0xbd,0x00,0x00,0x00,
-0xbc,0x00,0x00,0x00,0x7c,0x00,0x04,0x00,0x24,0x00,0x00,0x00,
-0xbe,0x00,0x00,0x00,0xbd,0x00,0x00,0x00,0xc7,0x00,0x05,0x00,
-0x24,0x00,0x00,0x00,0xbf,0x00,0x00,0x00,0xbe,0x00,0x00,0x00,
-0x6e,0x00,0x00,0x00,0x6f,0x00,0x04,0x00,0x3a,0x00,0x00,0x00,
-0xc0,0x00,0x00,0x00,0xbf,0x00,0x00,0x00,0x0c,0x00,0x08,0x00,
-0x3a,0x00,0x00,0x00,0xc2,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x32,0x00,0x00,0x00,0x4b,0x00,0x00,0x00,0xc0,0x00,0x00,0x00,
-0x4f,0x00,0x00,0x00,0x73,0x00,0x04,0x00,0x3d,0x00,0x00,0x00,
-0xc3,0x00,0x00,0x00,0xc2,0x00,0x00,0x00,0x41,0x00,0x06,0x00,
-0x48,0x00,0x00,0x00,0xc4,0x00,0x00,0x00,0x5e,0x00,0x00,0x00,
-0x46,0x00,0x00,0x00,0xb8,0x00,0x00,0x00,0x3e,0x00,0x03,0x00,
-0xc4,0x00,0x00,0x00,0xc3,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xc6,0x00,0x00,0x00,0x39,0x00,0x00,0x00,
-0xf4,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x3e,0x00,0x00,0x00,
-0xc9,0x00,0x00,0x00,0xbb,0x00,0x00,0x00,0xc2,0x00,0x05,0x00,
-0x3e,0x00,0x00,0x00,0xca,0x00,0x00,0x00,0xc9,0x00,0x00,0x00,
-0x25,0x00,0x00,0x00,0x70,0x00,0x04,0x00,0x3a,0x00,0x00,0x00,
-0xcb,0x00,0x00,0x00,0xca,0x00,0x00,0x00,0x0c,0x00,0x08,0x00,
-0x3a,0x00,0x00,0x00,0xcd,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x32,0x00,0x00,0x00,0x4b,0x00,0x00,0x00,0xcb,0x00,0x00,0x00,
-0x4f,0x00,0x00,0x00,0x73,0x00,0x04,0x00,0x3d,0x00,0x00,0x00,
-0xce,0x00,0x00,0x00,0xcd,0x00,0x00,0x00,0x41,0x00,0x06,0x00,
-0x48,0x00,0x00,0x00,0xcf,0x00,0x00,0x00,0x5e,0x00,0x00,0x00,
-0x46,0x00,0x00,0x00,0xc6,0x00,0x00,0x00,0x3e,0x00,0x03,0x00,
-0xcf,0x00,0x00,0x00,0xce,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xd7,0x00,0x00,0x00,0x39,0x00,0x00,0x00,
-0xf5,0x00,0x00,0x00,0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
-0xd9,0x00,0x00,0x00,0x38,0x00,0x00,0x00,0xf5,0x00,0x00,0x00,
-0x41,0x00,0x08,0x00,0x69,0x00,0x00,0x00,0xda,0x00,0x00,0x00,
-0x45,0x00,0x00,0x00,0x46,0x00,0x00,0x00,0x1f,0x00,0x00,0x00,
-0x65,0x00,0x00,0x00,0xd9,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
-0x3e,0x00,0x00,0x00,0xdb,0x00,0x00,0x00,0xda,0x00,0x00,0x00,
-0x71,0x00,0x04,0x00,0x06,0x00,0x00,0x00,0xdc,0x00,0x00,0x00,
-0xdb,0x00,0x00,0x00,0x7c,0x00,0x04,0x00,0x24,0x00,0x00,0x00,
-0xdd,0x00,0x00,0x00,0xdc,0x00,0x00,0x00,0xc7,0x00,0x05,0x00,
-0x24,0x00,0x00,0x00,0xde,0x00,0x00,0x00,0xdd,0x00,0x00,0x00,
-0x6e,0x00,0x00,0x00,0x6f,0x00,0x04,0x00,0x3a,0x00,0x00,0x00,
-0xdf,0x00,0x00,0x00,0xde,0x00,0x00,0x00,0x0c,0x00,0x08,0x00,
-0x3a,0x00,0x00,0x00,0xe1,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x32,0x00,0x00,0x00,0x4b,0x00,0x00,0x00,0xdf,0x00,0x00,0x00,
-0x4f,0x00,0x00,0x00,0x73,0x00,0x04,0x00,0x3d,0x00,0x00,0x00,
-0xe2,0x00,0x00,0x00,0xe1,0x00,0x00,0x00,0x41,0x00,0x06,0x00,
-0x48,0x00,0x00,0x00,0xe3,0x00,0x00,0x00,0x5e,0x00,0x00,0x00,
-0x46,0x00,0x00,0x00,0xd7,0x00,0x00,0x00,0x3e,0x00,0x03,0x00,
-0xe3,0x00,0x00,0x00,0xe2,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
-0x06,0x00,0x00,0x00,0xe5,0x00,0x00,0x00,0x39,0x00,0x00,0x00,
-0xf6,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x3e,0x00,0x00,0x00,
-0xe8,0x00,0x00,0x00,0xda,0x00,0x00,0x00,0xc2,0x00,0x05,0x00,
-0x3e,0x00,0x00,0x00,0xe9,0x00,0x00,0x00,0xe8,0x00,0x00,0x00,
-0x25,0x00,0x00,0x00,0x70,0x00,0x04,0x00,0x3a,0x00,0x00,0x00,
-0xea,0x00,0x00,0x00,0xe9,0x00,0x00,0x00,0x0c,0x00,0x08,0x00,
-0x3a,0x00,0x00,0x00,0xec,0x00,0x00,0x00,0x01,0x00,0x00,0x00,
-0x32,0x00,0x00,0x00,0x4b,0x00,0x00,0x00,0xea,0x00,0x00,0x00,
-0x4f,0x00,0x00,0x00,0x73,0x00,0x04,0x00,0x3d,0x00,0x00,0x00,
-0xed,0x00,0x00,0x00,0xec,0x00,0x00,0x00,0x41,0x00,0x06,0x00,
-0x48,0x00,0x00,0x00,0xee,0x00,0x00,0x00,0x5e,0x00,0x00,0x00,
-0x46,0x00,0x00,0x00,0xe5,0x00,0x00,0x00,0x3e,0x00,0x03,0x00,
-0xee,0x00,0x00,0x00,0xed,0x00,0x00,0x00,0xf9,0x00,0x02,0x00,
-0x8c,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,0x8c,0x00,0x00,0x00,
-0xfd,0x00,0x01,0x00,0x38,0x00,0x01,0x00,
+0x0e,0x00,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
+0x11,0x00,0x00,0x00,0x0f,0x00,0x00,0x00,0x10,0x00,0x00,0x00,
+0x41,0x00,0x05,0x00,0x0d,0x00,0x00,0x00,0x13,0x00,0x00,0x00,
+0x12,0x00,0x00,0x00,0x0c,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
+0x06,0x00,0x00,0x00,0x14,0x00,0x00,0x00,0x13,0x00,0x00,0x00,
+0x86,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x16,0x00,0x00,0x00,
+0x14,0x00,0x00,0x00,0x15,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
+0x06,0x00,0x00,0x00,0x17,0x00,0x00,0x00,0x11,0x00,0x00,0x00,
+0x16,0x00,0x00,0x00,0x89,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
+0x1b,0x00,0x00,0x00,0x14,0x00,0x00,0x00,0x15,0x00,0x00,0x00,
+0x86,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x1f,0x00,0x00,0x00,
+0x1b,0x00,0x00,0x00,0x1e,0x00,0x00,0x00,0x89,0x00,0x05,0x00,
+0x06,0x00,0x00,0x00,0x22,0x00,0x00,0x00,0x1b,0x00,0x00,0x00,
+0x1e,0x00,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
+0x25,0x00,0x00,0x00,0x1e,0x00,0x00,0x00,0x17,0x00,0x00,0x00,
+0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x27,0x00,0x00,0x00,
+0x25,0x00,0x00,0x00,0x22,0x00,0x00,0x00,0x41,0x00,0x05,0x00,
+0x2e,0x00,0x00,0x00,0x2f,0x00,0x00,0x00,0x2b,0x00,0x00,0x00,
+0x2d,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
+0x30,0x00,0x00,0x00,0x2f,0x00,0x00,0x00,0x86,0x00,0x05,0x00,
+0x06,0x00,0x00,0x00,0x31,0x00,0x00,0x00,0x30,0x00,0x00,0x00,
+0x1e,0x00,0x00,0x00,0xae,0x00,0x05,0x00,0x32,0x00,0x00,0x00,
+0x33,0x00,0x00,0x00,0x27,0x00,0x00,0x00,0x31,0x00,0x00,0x00,
+0xf7,0x00,0x03,0x00,0x35,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
+0xfa,0x00,0x04,0x00,0x33,0x00,0x00,0x00,0x34,0x00,0x00,0x00,
+0x35,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,0x34,0x00,0x00,0x00,
+0xf9,0x00,0x02,0x00,0x95,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,
+0x35,0x00,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
+0x3a,0x00,0x00,0x00,0x38,0x00,0x00,0x00,0x17,0x00,0x00,0x00,
+0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x3c,0x00,0x00,0x00,
+0x1e,0x00,0x00,0x00,0x22,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
+0x06,0x00,0x00,0x00,0x3d,0x00,0x00,0x00,0x3a,0x00,0x00,0x00,
+0x3c,0x00,0x00,0x00,0x84,0x00,0x05,0x00,0x06,0x00,0x00,0x00,
+0x40,0x00,0x00,0x00,0x3e,0x00,0x00,0x00,0x1f,0x00,0x00,0x00,
+0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x41,0x00,0x00,0x00,
+0x3d,0x00,0x00,0x00,0x40,0x00,0x00,0x00,0x41,0x00,0x07,0x00,
+0x50,0x00,0x00,0x00,0x51,0x00,0x00,0x00,0x4d,0x00,0x00,0x00,
+0x4e,0x00,0x00,0x00,0x27,0x00,0x00,0x00,0x4e,0x00,0x00,0x00,
+0x3d,0x00,0x04,0x00,0x45,0x00,0x00,0x00,0x52,0x00,0x00,0x00,
+0x51,0x00,0x00,0x00,0x73,0x00,0x04,0x00,0x42,0x00,0x00,0x00,
+0x53,0x00,0x00,0x00,0x52,0x00,0x00,0x00,0x85,0x00,0x05,0x00,
+0x42,0x00,0x00,0x00,0x57,0x00,0x00,0x00,0x55,0x00,0x00,0x00,
+0x53,0x00,0x00,0x00,0x41,0x00,0x08,0x00,0x71,0x00,0x00,0x00,
+0x72,0x00,0x00,0x00,0x4d,0x00,0x00,0x00,0x4e,0x00,0x00,0x00,
+0x27,0x00,0x00,0x00,0x6d,0x00,0x00,0x00,0x40,0x00,0x00,0x00,
+0x3d,0x00,0x04,0x00,0x46,0x00,0x00,0x00,0x73,0x00,0x00,0x00,
+0x72,0x00,0x00,0x00,0x71,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
+0x74,0x00,0x00,0x00,0x73,0x00,0x00,0x00,0x7c,0x00,0x04,0x00,
+0x2c,0x00,0x00,0x00,0x75,0x00,0x00,0x00,0x74,0x00,0x00,0x00,
+0xc7,0x00,0x05,0x00,0x2c,0x00,0x00,0x00,0x77,0x00,0x00,0x00,
+0x75,0x00,0x00,0x00,0x76,0x00,0x00,0x00,0x6f,0x00,0x04,0x00,
+0x42,0x00,0x00,0x00,0x78,0x00,0x00,0x00,0x77,0x00,0x00,0x00,
+0x0c,0x00,0x08,0x00,0x42,0x00,0x00,0x00,0x7b,0x00,0x00,0x00,
+0x01,0x00,0x00,0x00,0x32,0x00,0x00,0x00,0x53,0x00,0x00,0x00,
+0x78,0x00,0x00,0x00,0x57,0x00,0x00,0x00,0x73,0x00,0x04,0x00,
+0x45,0x00,0x00,0x00,0x7c,0x00,0x00,0x00,0x7b,0x00,0x00,0x00,
+0x41,0x00,0x06,0x00,0x50,0x00,0x00,0x00,0x7d,0x00,0x00,0x00,
+0x66,0x00,0x00,0x00,0x4e,0x00,0x00,0x00,0x41,0x00,0x00,0x00,
+0x3e,0x00,0x03,0x00,0x7d,0x00,0x00,0x00,0x7c,0x00,0x00,0x00,
+0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x81,0x00,0x00,0x00,
+0x41,0x00,0x00,0x00,0x47,0x00,0x00,0x00,0x3d,0x00,0x04,0x00,
+0x46,0x00,0x00,0x00,0x88,0x00,0x00,0x00,0x72,0x00,0x00,0x00,
+0xc2,0x00,0x05,0x00,0x46,0x00,0x00,0x00,0x89,0x00,0x00,0x00,
+0x88,0x00,0x00,0x00,0x2d,0x00,0x00,0x00,0x70,0x00,0x04,0x00,
+0x42,0x00,0x00,0x00,0x8a,0x00,0x00,0x00,0x89,0x00,0x00,0x00,
+0x0c,0x00,0x08,0x00,0x42,0x00,0x00,0x00,0x8d,0x00,0x00,0x00,
+0x01,0x00,0x00,0x00,0x32,0x00,0x00,0x00,0x53,0x00,0x00,0x00,
+0x8a,0x00,0x00,0x00,0x57,0x00,0x00,0x00,0x73,0x00,0x04,0x00,
+0x45,0x00,0x00,0x00,0x8e,0x00,0x00,0x00,0x8d,0x00,0x00,0x00,
+0x41,0x00,0x06,0x00,0x50,0x00,0x00,0x00,0x8f,0x00,0x00,0x00,
+0x66,0x00,0x00,0x00,0x4e,0x00,0x00,0x00,0x81,0x00,0x00,0x00,
+0x3e,0x00,0x03,0x00,0x8f,0x00,0x00,0x00,0x8e,0x00,0x00,0x00,
+0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xa2,0x00,0x00,0x00,
+0x41,0x00,0x00,0x00,0x93,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
+0x06,0x00,0x00,0x00,0xa4,0x00,0x00,0x00,0x40,0x00,0x00,0x00,
+0x93,0x00,0x00,0x00,0x41,0x00,0x08,0x00,0x71,0x00,0x00,0x00,
+0xa5,0x00,0x00,0x00,0x4d,0x00,0x00,0x00,0x4e,0x00,0x00,0x00,
+0x27,0x00,0x00,0x00,0x6d,0x00,0x00,0x00,0xa4,0x00,0x00,0x00,
+0x3d,0x00,0x04,0x00,0x46,0x00,0x00,0x00,0xa6,0x00,0x00,0x00,
+0xa5,0x00,0x00,0x00,0x71,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
+0xa7,0x00,0x00,0x00,0xa6,0x00,0x00,0x00,0x7c,0x00,0x04,0x00,
+0x2c,0x00,0x00,0x00,0xa8,0x00,0x00,0x00,0xa7,0x00,0x00,0x00,
+0xc7,0x00,0x05,0x00,0x2c,0x00,0x00,0x00,0xa9,0x00,0x00,0x00,
+0xa8,0x00,0x00,0x00,0x76,0x00,0x00,0x00,0x6f,0x00,0x04,0x00,
+0x42,0x00,0x00,0x00,0xaa,0x00,0x00,0x00,0xa9,0x00,0x00,0x00,
+0x0c,0x00,0x08,0x00,0x42,0x00,0x00,0x00,0xac,0x00,0x00,0x00,
+0x01,0x00,0x00,0x00,0x32,0x00,0x00,0x00,0x53,0x00,0x00,0x00,
+0xaa,0x00,0x00,0x00,0x57,0x00,0x00,0x00,0x73,0x00,0x04,0x00,
+0x45,0x00,0x00,0x00,0xad,0x00,0x00,0x00,0xac,0x00,0x00,0x00,
+0x41,0x00,0x06,0x00,0x50,0x00,0x00,0x00,0xae,0x00,0x00,0x00,
+0x66,0x00,0x00,0x00,0x4e,0x00,0x00,0x00,0xa2,0x00,0x00,0x00,
+0x3e,0x00,0x03,0x00,0xae,0x00,0x00,0x00,0xad,0x00,0x00,0x00,
+0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xb0,0x00,0x00,0x00,
+0x41,0x00,0x00,0x00,0x77,0x01,0x00,0x00,0x3d,0x00,0x04,0x00,
+0x46,0x00,0x00,0x00,0xb3,0x00,0x00,0x00,0xa5,0x00,0x00,0x00,
+0xc2,0x00,0x05,0x00,0x46,0x00,0x00,0x00,0xb4,0x00,0x00,0x00,
+0xb3,0x00,0x00,0x00,0x2d,0x00,0x00,0x00,0x70,0x00,0x04,0x00,
+0x42,0x00,0x00,0x00,0xb5,0x00,0x00,0x00,0xb4,0x00,0x00,0x00,
+0x0c,0x00,0x08,0x00,0x42,0x00,0x00,0x00,0xb7,0x00,0x00,0x00,
+0x01,0x00,0x00,0x00,0x32,0x00,0x00,0x00,0x53,0x00,0x00,0x00,
+0xb5,0x00,0x00,0x00,0x57,0x00,0x00,0x00,0x73,0x00,0x04,0x00,
+0x45,0x00,0x00,0x00,0xb8,0x00,0x00,0x00,0xb7,0x00,0x00,0x00,
+0x41,0x00,0x06,0x00,0x50,0x00,0x00,0x00,0xb9,0x00,0x00,0x00,
+0x66,0x00,0x00,0x00,0x4e,0x00,0x00,0x00,0xb0,0x00,0x00,0x00,
+0x3e,0x00,0x03,0x00,0xb9,0x00,0x00,0x00,0xb8,0x00,0x00,0x00,
+0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xc1,0x00,0x00,0x00,
+0x41,0x00,0x00,0x00,0x78,0x01,0x00,0x00,0x80,0x00,0x05,0x00,
+0x06,0x00,0x00,0x00,0xc3,0x00,0x00,0x00,0x40,0x00,0x00,0x00,
+0x78,0x01,0x00,0x00,0x41,0x00,0x08,0x00,0x71,0x00,0x00,0x00,
+0xc4,0x00,0x00,0x00,0x4d,0x00,0x00,0x00,0x4e,0x00,0x00,0x00,
+0x27,0x00,0x00,0x00,0x6d,0x00,0x00,0x00,0xc3,0x00,0x00,0x00,
+0x3d,0x00,0x04,0x00,0x46,0x00,0x00,0x00,0xc5,0x00,0x00,0x00,
+0xc4,0x00,0x00,0x00,0x71,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
+0xc6,0x00,0x00,0x00,0xc5,0x00,0x00,0x00,0x7c,0x00,0x04,0x00,
+0x2c,0x00,0x00,0x00,0xc7,0x00,0x00,0x00,0xc6,0x00,0x00,0x00,
+0xc7,0x00,0x05,0x00,0x2c,0x00,0x00,0x00,0xc8,0x00,0x00,0x00,
+0xc7,0x00,0x00,0x00,0x76,0x00,0x00,0x00,0x6f,0x00,0x04,0x00,
+0x42,0x00,0x00,0x00,0xc9,0x00,0x00,0x00,0xc8,0x00,0x00,0x00,
+0x0c,0x00,0x08,0x00,0x42,0x00,0x00,0x00,0xcb,0x00,0x00,0x00,
+0x01,0x00,0x00,0x00,0x32,0x00,0x00,0x00,0x53,0x00,0x00,0x00,
+0xc9,0x00,0x00,0x00,0x57,0x00,0x00,0x00,0x73,0x00,0x04,0x00,
+0x45,0x00,0x00,0x00,0xcc,0x00,0x00,0x00,0xcb,0x00,0x00,0x00,
+0x41,0x00,0x06,0x00,0x50,0x00,0x00,0x00,0xcd,0x00,0x00,0x00,
+0x66,0x00,0x00,0x00,0x4e,0x00,0x00,0x00,0xc1,0x00,0x00,0x00,
+0x3e,0x00,0x03,0x00,0xcd,0x00,0x00,0x00,0xcc,0x00,0x00,0x00,
+0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xcf,0x00,0x00,0x00,
+0x41,0x00,0x00,0x00,0x79,0x01,0x00,0x00,0x3d,0x00,0x04,0x00,
+0x46,0x00,0x00,0x00,0xd2,0x00,0x00,0x00,0xc4,0x00,0x00,0x00,
+0xc2,0x00,0x05,0x00,0x46,0x00,0x00,0x00,0xd3,0x00,0x00,0x00,
+0xd2,0x00,0x00,0x00,0x2d,0x00,0x00,0x00,0x70,0x00,0x04,0x00,
+0x42,0x00,0x00,0x00,0xd4,0x00,0x00,0x00,0xd3,0x00,0x00,0x00,
+0x0c,0x00,0x08,0x00,0x42,0x00,0x00,0x00,0xd6,0x00,0x00,0x00,
+0x01,0x00,0x00,0x00,0x32,0x00,0x00,0x00,0x53,0x00,0x00,0x00,
+0xd4,0x00,0x00,0x00,0x57,0x00,0x00,0x00,0x73,0x00,0x04,0x00,
+0x45,0x00,0x00,0x00,0xd7,0x00,0x00,0x00,0xd6,0x00,0x00,0x00,
+0x41,0x00,0x06,0x00,0x50,0x00,0x00,0x00,0xd8,0x00,0x00,0x00,
+0x66,0x00,0x00,0x00,0x4e,0x00,0x00,0x00,0xcf,0x00,0x00,0x00,
+0x3e,0x00,0x03,0x00,0xd8,0x00,0x00,0x00,0xd7,0x00,0x00,0x00,
+0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xe0,0x00,0x00,0x00,
+0x41,0x00,0x00,0x00,0x7a,0x01,0x00,0x00,0x80,0x00,0x05,0x00,
+0x06,0x00,0x00,0x00,0xe2,0x00,0x00,0x00,0x40,0x00,0x00,0x00,
+0x7a,0x01,0x00,0x00,0x41,0x00,0x08,0x00,0x71,0x00,0x00,0x00,
+0xe3,0x00,0x00,0x00,0x4d,0x00,0x00,0x00,0x4e,0x00,0x00,0x00,
+0x27,0x00,0x00,0x00,0x6d,0x00,0x00,0x00,0xe2,0x00,0x00,0x00,
+0x3d,0x00,0x04,0x00,0x46,0x00,0x00,0x00,0xe4,0x00,0x00,0x00,
+0xe3,0x00,0x00,0x00,0x71,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
+0xe5,0x00,0x00,0x00,0xe4,0x00,0x00,0x00,0x7c,0x00,0x04,0x00,
+0x2c,0x00,0x00,0x00,0xe6,0x00,0x00,0x00,0xe5,0x00,0x00,0x00,
+0xc7,0x00,0x05,0x00,0x2c,0x00,0x00,0x00,0xe7,0x00,0x00,0x00,
+0xe6,0x00,0x00,0x00,0x76,0x00,0x00,0x00,0x6f,0x00,0x04,0x00,
+0x42,0x00,0x00,0x00,0xe8,0x00,0x00,0x00,0xe7,0x00,0x00,0x00,
+0x0c,0x00,0x08,0x00,0x42,0x00,0x00,0x00,0xea,0x00,0x00,0x00,
+0x01,0x00,0x00,0x00,0x32,0x00,0x00,0x00,0x53,0x00,0x00,0x00,
+0xe8,0x00,0x00,0x00,0x57,0x00,0x00,0x00,0x73,0x00,0x04,0x00,
+0x45,0x00,0x00,0x00,0xeb,0x00,0x00,0x00,0xea,0x00,0x00,0x00,
+0x41,0x00,0x06,0x00,0x50,0x00,0x00,0x00,0xec,0x00,0x00,0x00,
+0x66,0x00,0x00,0x00,0x4e,0x00,0x00,0x00,0xe0,0x00,0x00,0x00,
+0x3e,0x00,0x03,0x00,0xec,0x00,0x00,0x00,0xeb,0x00,0x00,0x00,
+0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xee,0x00,0x00,0x00,
+0x41,0x00,0x00,0x00,0x7b,0x01,0x00,0x00,0x3d,0x00,0x04,0x00,
+0x46,0x00,0x00,0x00,0xf1,0x00,0x00,0x00,0xe3,0x00,0x00,0x00,
+0xc2,0x00,0x05,0x00,0x46,0x00,0x00,0x00,0xf2,0x00,0x00,0x00,
+0xf1,0x00,0x00,0x00,0x2d,0x00,0x00,0x00,0x70,0x00,0x04,0x00,
+0x42,0x00,0x00,0x00,0xf3,0x00,0x00,0x00,0xf2,0x00,0x00,0x00,
+0x0c,0x00,0x08,0x00,0x42,0x00,0x00,0x00,0xf5,0x00,0x00,0x00,
+0x01,0x00,0x00,0x00,0x32,0x00,0x00,0x00,0x53,0x00,0x00,0x00,
+0xf3,0x00,0x00,0x00,0x57,0x00,0x00,0x00,0x73,0x00,0x04,0x00,
+0x45,0x00,0x00,0x00,0xf6,0x00,0x00,0x00,0xf5,0x00,0x00,0x00,
+0x41,0x00,0x06,0x00,0x50,0x00,0x00,0x00,0xf7,0x00,0x00,0x00,
+0x66,0x00,0x00,0x00,0x4e,0x00,0x00,0x00,0xee,0x00,0x00,0x00,
+0x3e,0x00,0x03,0x00,0xf7,0x00,0x00,0x00,0xf6,0x00,0x00,0x00,
+0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0xff,0x00,0x00,0x00,
+0x41,0x00,0x00,0x00,0x10,0x00,0x00,0x00,0x80,0x00,0x05,0x00,
+0x06,0x00,0x00,0x00,0x01,0x01,0x00,0x00,0x40,0x00,0x00,0x00,
+0x10,0x00,0x00,0x00,0x41,0x00,0x08,0x00,0x71,0x00,0x00,0x00,
+0x02,0x01,0x00,0x00,0x4d,0x00,0x00,0x00,0x4e,0x00,0x00,0x00,
+0x27,0x00,0x00,0x00,0x6d,0x00,0x00,0x00,0x01,0x01,0x00,0x00,
+0x3d,0x00,0x04,0x00,0x46,0x00,0x00,0x00,0x03,0x01,0x00,0x00,
+0x02,0x01,0x00,0x00,0x71,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
+0x04,0x01,0x00,0x00,0x03,0x01,0x00,0x00,0x7c,0x00,0x04,0x00,
+0x2c,0x00,0x00,0x00,0x05,0x01,0x00,0x00,0x04,0x01,0x00,0x00,
+0xc7,0x00,0x05,0x00,0x2c,0x00,0x00,0x00,0x06,0x01,0x00,0x00,
+0x05,0x01,0x00,0x00,0x76,0x00,0x00,0x00,0x6f,0x00,0x04,0x00,
+0x42,0x00,0x00,0x00,0x07,0x01,0x00,0x00,0x06,0x01,0x00,0x00,
+0x0c,0x00,0x08,0x00,0x42,0x00,0x00,0x00,0x09,0x01,0x00,0x00,
+0x01,0x00,0x00,0x00,0x32,0x00,0x00,0x00,0x53,0x00,0x00,0x00,
+0x07,0x01,0x00,0x00,0x57,0x00,0x00,0x00,0x73,0x00,0x04,0x00,
+0x45,0x00,0x00,0x00,0x0a,0x01,0x00,0x00,0x09,0x01,0x00,0x00,
+0x41,0x00,0x06,0x00,0x50,0x00,0x00,0x00,0x0b,0x01,0x00,0x00,
+0x66,0x00,0x00,0x00,0x4e,0x00,0x00,0x00,0xff,0x00,0x00,0x00,
+0x3e,0x00,0x03,0x00,0x0b,0x01,0x00,0x00,0x0a,0x01,0x00,0x00,
+0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x0d,0x01,0x00,0x00,
+0x41,0x00,0x00,0x00,0x7c,0x01,0x00,0x00,0x3d,0x00,0x04,0x00,
+0x46,0x00,0x00,0x00,0x10,0x01,0x00,0x00,0x02,0x01,0x00,0x00,
+0xc2,0x00,0x05,0x00,0x46,0x00,0x00,0x00,0x11,0x01,0x00,0x00,
+0x10,0x01,0x00,0x00,0x2d,0x00,0x00,0x00,0x70,0x00,0x04,0x00,
+0x42,0x00,0x00,0x00,0x12,0x01,0x00,0x00,0x11,0x01,0x00,0x00,
+0x0c,0x00,0x08,0x00,0x42,0x00,0x00,0x00,0x14,0x01,0x00,0x00,
+0x01,0x00,0x00,0x00,0x32,0x00,0x00,0x00,0x53,0x00,0x00,0x00,
+0x12,0x01,0x00,0x00,0x57,0x00,0x00,0x00,0x73,0x00,0x04,0x00,
+0x45,0x00,0x00,0x00,0x15,0x01,0x00,0x00,0x14,0x01,0x00,0x00,
+0x41,0x00,0x06,0x00,0x50,0x00,0x00,0x00,0x16,0x01,0x00,0x00,
+0x66,0x00,0x00,0x00,0x4e,0x00,0x00,0x00,0x0d,0x01,0x00,0x00,
+0x3e,0x00,0x03,0x00,0x16,0x01,0x00,0x00,0x15,0x01,0x00,0x00,
+0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x1e,0x01,0x00,0x00,
+0x41,0x00,0x00,0x00,0x7d,0x01,0x00,0x00,0x80,0x00,0x05,0x00,
+0x06,0x00,0x00,0x00,0x20,0x01,0x00,0x00,0x40,0x00,0x00,0x00,
+0x7d,0x01,0x00,0x00,0x41,0x00,0x08,0x00,0x71,0x00,0x00,0x00,
+0x21,0x01,0x00,0x00,0x4d,0x00,0x00,0x00,0x4e,0x00,0x00,0x00,
+0x27,0x00,0x00,0x00,0x6d,0x00,0x00,0x00,0x20,0x01,0x00,0x00,
+0x3d,0x00,0x04,0x00,0x46,0x00,0x00,0x00,0x22,0x01,0x00,0x00,
+0x21,0x01,0x00,0x00,0x71,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
+0x23,0x01,0x00,0x00,0x22,0x01,0x00,0x00,0x7c,0x00,0x04,0x00,
+0x2c,0x00,0x00,0x00,0x24,0x01,0x00,0x00,0x23,0x01,0x00,0x00,
+0xc7,0x00,0x05,0x00,0x2c,0x00,0x00,0x00,0x25,0x01,0x00,0x00,
+0x24,0x01,0x00,0x00,0x76,0x00,0x00,0x00,0x6f,0x00,0x04,0x00,
+0x42,0x00,0x00,0x00,0x26,0x01,0x00,0x00,0x25,0x01,0x00,0x00,
+0x0c,0x00,0x08,0x00,0x42,0x00,0x00,0x00,0x28,0x01,0x00,0x00,
+0x01,0x00,0x00,0x00,0x32,0x00,0x00,0x00,0x53,0x00,0x00,0x00,
+0x26,0x01,0x00,0x00,0x57,0x00,0x00,0x00,0x73,0x00,0x04,0x00,
+0x45,0x00,0x00,0x00,0x29,0x01,0x00,0x00,0x28,0x01,0x00,0x00,
+0x41,0x00,0x06,0x00,0x50,0x00,0x00,0x00,0x2a,0x01,0x00,0x00,
+0x66,0x00,0x00,0x00,0x4e,0x00,0x00,0x00,0x1e,0x01,0x00,0x00,
+0x3e,0x00,0x03,0x00,0x2a,0x01,0x00,0x00,0x29,0x01,0x00,0x00,
+0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x2c,0x01,0x00,0x00,
+0x41,0x00,0x00,0x00,0x7e,0x01,0x00,0x00,0x3d,0x00,0x04,0x00,
+0x46,0x00,0x00,0x00,0x2f,0x01,0x00,0x00,0x21,0x01,0x00,0x00,
+0xc2,0x00,0x05,0x00,0x46,0x00,0x00,0x00,0x30,0x01,0x00,0x00,
+0x2f,0x01,0x00,0x00,0x2d,0x00,0x00,0x00,0x70,0x00,0x04,0x00,
+0x42,0x00,0x00,0x00,0x31,0x01,0x00,0x00,0x30,0x01,0x00,0x00,
+0x0c,0x00,0x08,0x00,0x42,0x00,0x00,0x00,0x33,0x01,0x00,0x00,
+0x01,0x00,0x00,0x00,0x32,0x00,0x00,0x00,0x53,0x00,0x00,0x00,
+0x31,0x01,0x00,0x00,0x57,0x00,0x00,0x00,0x73,0x00,0x04,0x00,
+0x45,0x00,0x00,0x00,0x34,0x01,0x00,0x00,0x33,0x01,0x00,0x00,
+0x41,0x00,0x06,0x00,0x50,0x00,0x00,0x00,0x35,0x01,0x00,0x00,
+0x66,0x00,0x00,0x00,0x4e,0x00,0x00,0x00,0x2c,0x01,0x00,0x00,
+0x3e,0x00,0x03,0x00,0x35,0x01,0x00,0x00,0x34,0x01,0x00,0x00,
+0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x3d,0x01,0x00,0x00,
+0x41,0x00,0x00,0x00,0x7f,0x01,0x00,0x00,0x80,0x00,0x05,0x00,
+0x06,0x00,0x00,0x00,0x3f,0x01,0x00,0x00,0x40,0x00,0x00,0x00,
+0x7f,0x01,0x00,0x00,0x41,0x00,0x08,0x00,0x71,0x00,0x00,0x00,
+0x40,0x01,0x00,0x00,0x4d,0x00,0x00,0x00,0x4e,0x00,0x00,0x00,
+0x27,0x00,0x00,0x00,0x6d,0x00,0x00,0x00,0x3f,0x01,0x00,0x00,
+0x3d,0x00,0x04,0x00,0x46,0x00,0x00,0x00,0x41,0x01,0x00,0x00,
+0x40,0x01,0x00,0x00,0x71,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
+0x42,0x01,0x00,0x00,0x41,0x01,0x00,0x00,0x7c,0x00,0x04,0x00,
+0x2c,0x00,0x00,0x00,0x43,0x01,0x00,0x00,0x42,0x01,0x00,0x00,
+0xc7,0x00,0x05,0x00,0x2c,0x00,0x00,0x00,0x44,0x01,0x00,0x00,
+0x43,0x01,0x00,0x00,0x76,0x00,0x00,0x00,0x6f,0x00,0x04,0x00,
+0x42,0x00,0x00,0x00,0x45,0x01,0x00,0x00,0x44,0x01,0x00,0x00,
+0x0c,0x00,0x08,0x00,0x42,0x00,0x00,0x00,0x47,0x01,0x00,0x00,
+0x01,0x00,0x00,0x00,0x32,0x00,0x00,0x00,0x53,0x00,0x00,0x00,
+0x45,0x01,0x00,0x00,0x57,0x00,0x00,0x00,0x73,0x00,0x04,0x00,
+0x45,0x00,0x00,0x00,0x48,0x01,0x00,0x00,0x47,0x01,0x00,0x00,
+0x41,0x00,0x06,0x00,0x50,0x00,0x00,0x00,0x49,0x01,0x00,0x00,
+0x66,0x00,0x00,0x00,0x4e,0x00,0x00,0x00,0x3d,0x01,0x00,0x00,
+0x3e,0x00,0x03,0x00,0x49,0x01,0x00,0x00,0x48,0x01,0x00,0x00,
+0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x4b,0x01,0x00,0x00,
+0x41,0x00,0x00,0x00,0x80,0x01,0x00,0x00,0x3d,0x00,0x04,0x00,
+0x46,0x00,0x00,0x00,0x4e,0x01,0x00,0x00,0x40,0x01,0x00,0x00,
+0xc2,0x00,0x05,0x00,0x46,0x00,0x00,0x00,0x4f,0x01,0x00,0x00,
+0x4e,0x01,0x00,0x00,0x2d,0x00,0x00,0x00,0x70,0x00,0x04,0x00,
+0x42,0x00,0x00,0x00,0x50,0x01,0x00,0x00,0x4f,0x01,0x00,0x00,
+0x0c,0x00,0x08,0x00,0x42,0x00,0x00,0x00,0x52,0x01,0x00,0x00,
+0x01,0x00,0x00,0x00,0x32,0x00,0x00,0x00,0x53,0x00,0x00,0x00,
+0x50,0x01,0x00,0x00,0x57,0x00,0x00,0x00,0x73,0x00,0x04,0x00,
+0x45,0x00,0x00,0x00,0x53,0x01,0x00,0x00,0x52,0x01,0x00,0x00,
+0x41,0x00,0x06,0x00,0x50,0x00,0x00,0x00,0x54,0x01,0x00,0x00,
+0x66,0x00,0x00,0x00,0x4e,0x00,0x00,0x00,0x4b,0x01,0x00,0x00,
+0x3e,0x00,0x03,0x00,0x54,0x01,0x00,0x00,0x53,0x01,0x00,0x00,
+0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x5c,0x01,0x00,0x00,
+0x41,0x00,0x00,0x00,0x81,0x01,0x00,0x00,0x80,0x00,0x05,0x00,
+0x06,0x00,0x00,0x00,0x5e,0x01,0x00,0x00,0x40,0x00,0x00,0x00,
+0x81,0x01,0x00,0x00,0x41,0x00,0x08,0x00,0x71,0x00,0x00,0x00,
+0x5f,0x01,0x00,0x00,0x4d,0x00,0x00,0x00,0x4e,0x00,0x00,0x00,
+0x27,0x00,0x00,0x00,0x6d,0x00,0x00,0x00,0x5e,0x01,0x00,0x00,
+0x3d,0x00,0x04,0x00,0x46,0x00,0x00,0x00,0x60,0x01,0x00,0x00,
+0x5f,0x01,0x00,0x00,0x71,0x00,0x04,0x00,0x06,0x00,0x00,0x00,
+0x61,0x01,0x00,0x00,0x60,0x01,0x00,0x00,0x7c,0x00,0x04,0x00,
+0x2c,0x00,0x00,0x00,0x62,0x01,0x00,0x00,0x61,0x01,0x00,0x00,
+0xc7,0x00,0x05,0x00,0x2c,0x00,0x00,0x00,0x63,0x01,0x00,0x00,
+0x62,0x01,0x00,0x00,0x76,0x00,0x00,0x00,0x6f,0x00,0x04,0x00,
+0x42,0x00,0x00,0x00,0x64,0x01,0x00,0x00,0x63,0x01,0x00,0x00,
+0x0c,0x00,0x08,0x00,0x42,0x00,0x00,0x00,0x66,0x01,0x00,0x00,
+0x01,0x00,0x00,0x00,0x32,0x00,0x00,0x00,0x53,0x00,0x00,0x00,
+0x64,0x01,0x00,0x00,0x57,0x00,0x00,0x00,0x73,0x00,0x04,0x00,
+0x45,0x00,0x00,0x00,0x67,0x01,0x00,0x00,0x66,0x01,0x00,0x00,
+0x41,0x00,0x06,0x00,0x50,0x00,0x00,0x00,0x68,0x01,0x00,0x00,
+0x66,0x00,0x00,0x00,0x4e,0x00,0x00,0x00,0x5c,0x01,0x00,0x00,
+0x3e,0x00,0x03,0x00,0x68,0x01,0x00,0x00,0x67,0x01,0x00,0x00,
+0x80,0x00,0x05,0x00,0x06,0x00,0x00,0x00,0x6a,0x01,0x00,0x00,
+0x41,0x00,0x00,0x00,0x82,0x01,0x00,0x00,0x3d,0x00,0x04,0x00,
+0x46,0x00,0x00,0x00,0x6d,0x01,0x00,0x00,0x5f,0x01,0x00,0x00,
+0xc2,0x00,0x05,0x00,0x46,0x00,0x00,0x00,0x6e,0x01,0x00,0x00,
+0x6d,0x01,0x00,0x00,0x2d,0x00,0x00,0x00,0x70,0x00,0x04,0x00,
+0x42,0x00,0x00,0x00,0x6f,0x01,0x00,0x00,0x6e,0x01,0x00,0x00,
+0x0c,0x00,0x08,0x00,0x42,0x00,0x00,0x00,0x71,0x01,0x00,0x00,
+0x01,0x00,0x00,0x00,0x32,0x00,0x00,0x00,0x53,0x00,0x00,0x00,
+0x6f,0x01,0x00,0x00,0x57,0x00,0x00,0x00,0x73,0x00,0x04,0x00,
+0x45,0x00,0x00,0x00,0x72,0x01,0x00,0x00,0x71,0x01,0x00,0x00,
+0x41,0x00,0x06,0x00,0x50,0x00,0x00,0x00,0x73,0x01,0x00,0x00,
+0x66,0x00,0x00,0x00,0x4e,0x00,0x00,0x00,0x6a,0x01,0x00,0x00,
+0x3e,0x00,0x03,0x00,0x73,0x01,0x00,0x00,0x72,0x01,0x00,0x00,
+0xf9,0x00,0x02,0x00,0x95,0x00,0x00,0x00,0xf8,0x00,0x02,0x00,
+0x95,0x00,0x00,0x00,0xfd,0x00,0x01,0x00,0x38,0x00,0x01,0x00,
+
 };
-const uint64_t dequant_q4_0_len = 3356;
+const uint64_t dequant_q4_0_len = 5184;
 
 unsigned char dequant_q4_1_data[] = {
 0x03,0x02,0x23,0x07,0x00,0x05,0x01,0x00,0x0b,0x00,0x0d,0x00,
diff --git a/ggml-vulkan.cpp b/ggml-vulkan.cpp
index fda66804c..105c86840 100644
--- a/ggml-vulkan.cpp
+++ b/ggml-vulkan.cpp
@@ -68,6 +68,26 @@ struct vk_queue {
     vk::PipelineStageFlags stage_flags;
 };
 
+struct vk_pipeline_struct { // all state that belongs to one compute pipeline
+    std::string name; // label, printed in the debug traces
+    vk::ShaderModule shader_module; // built from the SPIR-V blob in ggml_vk_create_pipeline
+    vk::DescriptorSetLayout dsl; // layout passed to every descriptor set allocation
+    std::vector<vk::DescriptorPool> descriptor_pools; // pools the descriptor sets are allocated from
+    std::vector<vk::DescriptorSet> descriptor_sets; // sets allocated so far
+    uint32_t descriptor_set_idx; // compared with descriptor_sets.size() when allocating; ggml_pipeline_cleanup resets it to 0
+    vk::PipelineLayout layout;
+    vk::Pipeline pipeline;
+    uint32_t push_constant_size; // size of the compute-stage push constant range
+    uint32_t parameter_count; // storage-buffer descriptor count used when sizing descriptor pools
+    std::array<uint32_t, 3> wg_denoms; // asserted > 0 per component at creation; presumably workgroup divisors -- TODO confirm
+    uint32_t align; // NOTE(review): meaning is not visible in this part of the file -- confirm at the call sites
+};
+
+typedef std::shared_ptr<vk_pipeline_struct> vk_pipeline; // owning handle
+typedef std::weak_ptr<vk_pipeline_struct> vk_pipeline_ref; // non-owning handle, the element type of vk_device::pipelines
+
+static void ggml_vk_destroy_pipeline(vk::Device& device, vk_pipeline& pipeline); // forward declaration: ~vk_device calls it ahead of the definition
+
 struct vk_device {
     vk::PhysicalDevice physical_device;
     vk::PhysicalDeviceProperties properties;
@@ -83,10 +103,61 @@ struct vk_device {
     uint32_t subgroup_size;
     bool uma;
 
+    bool initialized;
+    size_t idx;
+
+    vk_pipeline pipeline_matmul_f32_l, pipeline_matmul_f32_m, pipeline_matmul_f32_s;
+    vk_pipeline pipeline_matmul_f32_aligned_l, pipeline_matmul_f32_aligned_m, pipeline_matmul_f32_aligned_s;
+    vk_pipeline pipeline_matmul_f16_l, pipeline_matmul_f16_m, pipeline_matmul_f16_s;
+    vk_pipeline pipeline_matmul_f16_aligned_l, pipeline_matmul_f16_aligned_m, pipeline_matmul_f16_aligned_s;
+    vk_pipeline pipeline_matmul_f16_f32_l, pipeline_matmul_f16_f32_m, pipeline_matmul_f16_f32_s;
+    vk_pipeline pipeline_matmul_f16_f32_aligned_l, pipeline_matmul_f16_f32_aligned_m, pipeline_matmul_f16_f32_aligned_s;
+    vk_pipeline pipeline_matmul_split_k_reduce;
+
+    vk_pipeline pipeline_dequant[VK_NUM_TYPES];
+    vk_pipeline pipeline_dequant_mul_mat_vec_f32[VK_NUM_TYPES];
+
+    vk_pipeline pipeline_mul_mat_vec_p021_f16_f32;
+    vk_pipeline pipeline_mul_mat_vec_nc_f16_f32;
+    vk_pipeline pipeline_get_rows[VK_NUM_TYPES];
+    vk_pipeline pipeline_get_rows_f32[VK_NUM_TYPES];
+    vk_pipeline pipeline_mul_f32;
+    vk_pipeline pipeline_add_f32;
+    vk_pipeline pipeline_scale_f32;
+    vk_pipeline pipeline_sqr_f32;
+    vk_pipeline pipeline_clamp_f32;
+    vk_pipeline pipeline_cpy_f32_f32, pipeline_cpy_f32_f16, pipeline_cpy_f16_f16;
+    vk_pipeline pipeline_norm_f32;
+    vk_pipeline pipeline_rms_norm_f32;
+    vk_pipeline pipeline_gelu_f32;
+    vk_pipeline pipeline_silu_f32;
+    vk_pipeline pipeline_relu_f32;
+    vk_pipeline pipeline_diag_mask_inf_f32;
+    vk_pipeline pipeline_soft_max_f32;
+    vk_pipeline pipeline_rope_f32, pipeline_rope_f16;
+    vk_pipeline pipeline_rope_neox_f32, pipeline_rope_neox_f16;
+
+    std::vector<vk_pipeline_ref> pipelines;
+
     ~vk_device() {
 #ifdef GGML_VULKAN_DEBUG
     std::cerr << "destroy device " << name << std::endl;
 #endif
+        device.destroyCommandPool(compute_queue.pool); // command pools and pipelines are released before device.destroy() below
+        if (!single_queue) { // the transfer queue's pool is destroyed separately only in this case
+            device.destroyCommandPool(transfer_queue.pool);
+        }
+
+        for (auto& pipeline : pipelines) { // entries are weak references registered by ggml_vk_create_pipeline
+            if (pipeline.expired()) { // the pipeline object no longer exists, nothing to destroy
+                continue;
+            }
+
+            vk_pipeline pl = pipeline.lock();
+            ggml_vk_destroy_pipeline(device, pl); // destroys the pipeline's Vulkan objects on this device
+        }
+        pipelines.clear();
+
         device.destroy();
     }
 };
@@ -124,21 +195,6 @@ struct vk_subbuffer {
     uint64_t size;
 };
 
-struct vk_pipeline {
-    std::string name;
-    vk::ShaderModule shader_module;
-    vk::DescriptorSetLayout dsl;
-    std::vector<vk::DescriptorPool> descriptor_pools;
-    std::vector<vk::DescriptorSet> descriptor_sets;
-    uint32_t descriptor_set_idx;
-    vk::PipelineLayout layout;
-    vk::Pipeline pipeline;
-    uint32_t push_constant_size;
-    uint32_t parameter_count;
-    std::array<uint32_t, 3> wg_denoms;
-    uint32_t align;
-};
-
 struct vk_semaphore {
     vk::Semaphore s;
     uint64_t value;
@@ -235,7 +291,6 @@ struct ggml_tensor_extra_gpu {
 };
 
 struct ggml_vk_garbage_collector {
-    std::vector<vk_pipeline *> pipelines;
     std::vector<vk_semaphore> tl_semaphores;
     std::vector<vk_semaphore> semaphores;
     std::vector<vk::Event> events;
@@ -246,37 +301,7 @@ struct ggml_vk_garbage_collector {
 struct ggml_backend_vk_context {
     std::string name;
 
-    std::weak_ptr<vk_device> device;
-    vk_pipeline pipeline_matmul_f32_l, pipeline_matmul_f32_m, pipeline_matmul_f32_s;
-    vk_pipeline pipeline_matmul_f32_aligned_l, pipeline_matmul_f32_aligned_m, pipeline_matmul_f32_aligned_s;
-    vk_pipeline pipeline_matmul_f16_l, pipeline_matmul_f16_m, pipeline_matmul_f16_s;
-    vk_pipeline pipeline_matmul_f16_aligned_l, pipeline_matmul_f16_aligned_m, pipeline_matmul_f16_aligned_s;
-    vk_pipeline pipeline_matmul_f16_f32_l, pipeline_matmul_f16_f32_m, pipeline_matmul_f16_f32_s;
-    vk_pipeline pipeline_matmul_f16_f32_aligned_l, pipeline_matmul_f16_f32_aligned_m, pipeline_matmul_f16_f32_aligned_s;
-    vk_pipeline pipeline_matmul_split_k_reduce;
-
-    vk_pipeline pipeline_dequant[VK_NUM_TYPES];
-    vk_pipeline pipeline_dequant_mul_mat_vec_f32[VK_NUM_TYPES];
-
-    vk_pipeline pipeline_mul_mat_vec_p021_f16_f32;
-    vk_pipeline pipeline_mul_mat_vec_nc_f16_f32;
-    vk_pipeline pipeline_get_rows[VK_NUM_TYPES];
-    vk_pipeline pipeline_get_rows_f32[VK_NUM_TYPES];
-    vk_pipeline pipeline_mul_f32;
-    vk_pipeline pipeline_add_f32;
-    vk_pipeline pipeline_scale_f32;
-    vk_pipeline pipeline_sqr_f32;
-    vk_pipeline pipeline_clamp_f32;
-    vk_pipeline pipeline_cpy_f32_f32, pipeline_cpy_f32_f16, pipeline_cpy_f16_f16;
-    vk_pipeline pipeline_norm_f32;
-    vk_pipeline pipeline_rms_norm_f32;
-    vk_pipeline pipeline_gelu_f32;
-    vk_pipeline pipeline_silu_f32;
-    vk_pipeline pipeline_relu_f32;
-    vk_pipeline pipeline_diag_mask_inf_f32;
-    vk_pipeline pipeline_soft_max_f32;
-    vk_pipeline pipeline_rope_f32, pipeline_rope_f16;
-    vk_pipeline pipeline_rope_neox_f32, pipeline_rope_neox_f16;
+    std::shared_ptr<vk_device> device;
 
     size_t semaphore_idx, event_idx;
     ggml_vk_garbage_collector gc;
@@ -305,13 +330,31 @@ struct vk_instance {
 
     std::vector<size_t> device_indices;
 
-    std::shared_ptr<vk_device> devices[GGML_VK_MAX_DEVICES];
     ggml_backend_t backends[GGML_VK_MAX_DEVICES];
     ggml_backend_vk_context contexts[GGML_VK_MAX_DEVICES];
     ggml_backend_buffer_type buffer_types[GGML_VK_MAX_DEVICES];
     bool initialized[GGML_VK_MAX_DEVICES];
 };
 
+static std::shared_ptr<vk_device> ggml_vk_get_device(size_t idx) { // returns the shared device object for slot idx, creating it on demand
+#ifdef GGML_VULKAN_DEBUG
+    std::cerr << "ggml_vk_get_device(" << idx << ")" << std::endl;
+#endif
+    GGML_ASSERT(idx < GGML_VK_MAX_DEVICES); // idx indexes the fixed-size array below
+    static std::weak_ptr<vk_device> devices[GGML_VK_MAX_DEVICES]; // non-owning: a slot empties once all shared_ptr holders are gone
+
+    std::shared_ptr<vk_device> device = devices[idx].lock(); // single lock() instead of expired() followed by lock()
+    if (!device) {
+#ifdef GGML_VULKAN_DEBUG
+    std::cerr << "Initializing new vk_device" << std::endl;
+#endif
+        device = std::make_shared<vk_device>();
+        device->initialized = false; // only this flag is set here; no other member is assigned
+        devices[idx] = device;
+    }
+    return device;
+}
+
 #ifdef GGML_VULKAN_CHECK_RESULTS
 static size_t vk_skip_checks;
 static size_t vk_output_tensor;
@@ -335,14 +378,15 @@ static void ggml_vk_create_pipeline(ggml_backend_vk_context * ctx, vk_pipeline&
     GGML_ASSERT(parameter_count > 0);
     GGML_ASSERT(wg_denoms[0] > 0 && wg_denoms[1] > 0 && wg_denoms[2] > 0); // NOLINT
 
-    pipeline.name = name;
-    pipeline.parameter_count = parameter_count;
-    pipeline.push_constant_size = push_constant_size;
-    pipeline.wg_denoms = wg_denoms;
-    pipeline.align = align;
+    pipeline = std::make_shared<vk_pipeline_struct>();
+    pipeline->name = name;
+    pipeline->parameter_count = parameter_count;
+    pipeline->push_constant_size = push_constant_size;
+    pipeline->wg_denoms = wg_denoms;
+    pipeline->align = align;
 
     vk::ShaderModuleCreateInfo shader_module_create_info({}, spv_size, reinterpret_cast<const uint32_t *>(spv_data));
-    pipeline.shader_module = ctx->device.lock()->device.createShaderModule(shader_module_create_info);
+    pipeline->shader_module = ctx->device->device.createShaderModule(shader_module_create_info);
 
     std::vector<vk::DescriptorSetLayoutBinding> dsl_binding;
     std::vector<vk::DescriptorBindingFlags> dsl_binding_flags;
@@ -356,49 +400,49 @@ static void ggml_vk_create_pipeline(ggml_backend_vk_context * ctx, vk_pipeline&
     vk::PushConstantRange pcr(
         vk::ShaderStageFlagBits::eCompute,
         0,
-        pipeline.push_constant_size
+        pipeline->push_constant_size
     );
 
     vk::DescriptorSetLayoutCreateInfo descriptor_set_layout_create_info(
         {},
         dsl_binding);
     descriptor_set_layout_create_info.setPNext(&dslbfci);
-    pipeline.dsl = ctx->device.lock()->device.createDescriptorSetLayout(descriptor_set_layout_create_info);
+    pipeline->dsl = ctx->device->device.createDescriptorSetLayout(descriptor_set_layout_create_info);
 
     // Check if device supports multiple descriptors per pool
-    if (ctx->device.lock()->descriptor_set_mode == VK_DEVICE_DESCRIPTOR_POOL_MODE_UNKNOWN) {
+    if (ctx->device->descriptor_set_mode == VK_DEVICE_DESCRIPTOR_POOL_MODE_UNKNOWN) {
         const uint32_t alloc_count = 2;
 
         // Try allocating multiple sets from one pool
         // This fails on AMD for some reason, so add a fall back to allocating one pool per set
-        vk::DescriptorPoolSize descriptor_pool_size(vk::DescriptorType::eStorageBuffer, pipeline.parameter_count);
+        vk::DescriptorPoolSize descriptor_pool_size(vk::DescriptorType::eStorageBuffer, pipeline->parameter_count);
         vk::DescriptorPoolCreateInfo descriptor_pool_create_info({}, alloc_count, descriptor_pool_size);
-        vk::DescriptorPool pool = ctx->device.lock()->device.createDescriptorPool(descriptor_pool_create_info);
+        vk::DescriptorPool pool = ctx->device->device.createDescriptorPool(descriptor_pool_create_info);
 
         std::vector<vk::DescriptorSetLayout> layouts(alloc_count);
         for (uint32_t i = 0; i < alloc_count; i++) {
-            layouts[i] = pipeline.dsl;
+            layouts[i] = pipeline->dsl;
         }
         try {
             vk::DescriptorSetAllocateInfo descriptor_set_alloc_info(pool, alloc_count, layouts.data());
-            std::vector<vk::DescriptorSet> sets = ctx->device.lock()->device.allocateDescriptorSets(descriptor_set_alloc_info);
+            std::vector<vk::DescriptorSet> sets = ctx->device->device.allocateDescriptorSets(descriptor_set_alloc_info);
         } catch(vk::OutOfPoolMemoryError const&) {
-            ctx->device.lock()->descriptor_set_mode = VK_DEVICE_DESCRIPTOR_POOL_MODE_SINGLE;
+            ctx->device->descriptor_set_mode = VK_DEVICE_DESCRIPTOR_POOL_MODE_SINGLE;
         }
 
-        ctx->device.lock()->device.destroyDescriptorPool(pool);
+        ctx->device->device.destroyDescriptorPool(pool);
     }
 
-    if (ctx->device.lock()->descriptor_set_mode == VK_DEVICE_DESCRIPTOR_POOL_MODE_MULTI) {
-        vk::DescriptorPoolSize descriptor_pool_size(vk::DescriptorType::eStorageBuffer, pipeline.parameter_count);
+    if (ctx->device->descriptor_set_mode == VK_DEVICE_DESCRIPTOR_POOL_MODE_MULTI) {
+        vk::DescriptorPoolSize descriptor_pool_size(vk::DescriptorType::eStorageBuffer, pipeline->parameter_count);
         vk::DescriptorPoolCreateInfo descriptor_pool_create_info({}, 128, descriptor_pool_size);
-        pipeline.descriptor_pools.push_back(ctx->device.lock()->device.createDescriptorPool(descriptor_pool_create_info));
+        pipeline->descriptor_pools.push_back(ctx->device->device.createDescriptorPool(descriptor_pool_create_info));
     }
 
-    pipeline.descriptor_set_idx = 0;
+    pipeline->descriptor_set_idx = 0;
 
-    vk::PipelineLayoutCreateInfo pipeline_layout_create_info(vk::PipelineLayoutCreateFlags(), pipeline.dsl, pcr);
-    pipeline.layout = ctx->device.lock()->device.createPipelineLayout(pipeline_layout_create_info);
+    vk::PipelineLayoutCreateInfo pipeline_layout_create_info(vk::PipelineLayoutCreateFlags(), pipeline->dsl, pcr);
+    pipeline->layout = ctx->device->device.createPipelineLayout(pipeline_layout_create_info);
 
     std::vector<vk::SpecializationMapEntry> specialization_entries(specialization_constants.size());
 
@@ -418,72 +462,75 @@ static void ggml_vk_create_pipeline(ggml_backend_vk_context * ctx, vk_pipeline&
     vk::PipelineShaderStageCreateInfo pipeline_shader_create_info(
             vk::PipelineShaderStageCreateFlags(),
             vk::ShaderStageFlagBits::eCompute,
-            pipeline.shader_module,
+            pipeline->shader_module,
             entrypoint.c_str(),
             &specialization_info);
     vk::ComputePipelineCreateInfo compute_pipeline_create_info(
         vk::PipelineCreateFlags(),
         pipeline_shader_create_info,
-        pipeline.layout);
-    pipeline.pipeline = ctx->device.lock()->device.createComputePipeline(VK_NULL_HANDLE, compute_pipeline_create_info).value;
+        pipeline->layout);
+    pipeline->pipeline = ctx->device->device.createComputePipeline(VK_NULL_HANDLE, compute_pipeline_create_info).value;
 
-    ctx->gc.pipelines.push_back(&pipeline);
+    ctx->device->pipelines.push_back(pipeline);
 }
 
-static void ggml_vk_destroy_pipeline(ggml_backend_vk_context * ctx, vk_pipeline * pipeline) {
+static void ggml_vk_destroy_pipeline(vk::Device& device, vk_pipeline& pipeline) { // destroys every Vulkan object held by pipeline, using the given device
+#ifdef GGML_VULKAN_DEBUG
+    std::cerr << "ggml_vk_destroy_pipeline(" << pipeline->name << ")" << std::endl;
+#endif
     for (auto& pool : pipeline->descriptor_pools) {
-        ctx->device.lock()->device.destroyDescriptorPool(pool);
+        device.destroyDescriptorPool(pool); // the descriptor set list is cleared below, not freed set by set
     }
     pipeline->descriptor_pools.clear();
     pipeline->descriptor_sets.clear();
     pipeline->descriptor_set_idx = 0;
 
-    ctx->device.lock()->device.destroyDescriptorSetLayout(pipeline->dsl);
+    device.destroyDescriptorSetLayout(pipeline->dsl);
 
-    ctx->device.lock()->device.destroyPipelineLayout(pipeline->layout);
+    device.destroyPipelineLayout(pipeline->layout);
 
-    ctx->device.lock()->device.destroyShaderModule(pipeline->shader_module);
+    device.destroyShaderModule(pipeline->shader_module);
 
-    ctx->device.lock()->device.destroyPipeline(pipeline->pipeline);
+    device.destroyPipeline(pipeline->pipeline); // the handle members themselves are not reset after destruction
 }
 
 static void ggml_pipeline_allocate_descriptor_sets(ggml_backend_vk_context * ctx, vk_pipeline& pipeline, uint32_t n) {
 #ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_pipeline_allocate_descriptor_sets(" << pipeline.name << ", " << n << ")" << std::endl;
+    std::cerr << "ggml_pipeline_allocate_descriptor_sets(" << pipeline->name << ", " << n << ")" << std::endl;
 #endif
-    if (pipeline.descriptor_sets.size() >= pipeline.descriptor_set_idx + n) {
+    if (pipeline->descriptor_sets.size() >= pipeline->descriptor_set_idx + n) { // grows descriptor_sets until descriptor_set_idx + n exist; no-op if they already do
         // Enough descriptors are available
         return;
     }
 
-    if (ctx->device.lock()->descriptor_set_mode == VK_DEVICE_DESCRIPTOR_POOL_MODE_MULTI) {
-        const uint32_t alloc_count = pipeline.descriptor_set_idx + n - pipeline.descriptor_sets.size();
+    if (ctx->device->descriptor_set_mode == VK_DEVICE_DESCRIPTOR_POOL_MODE_MULTI) {
+        const uint32_t alloc_count = pipeline->descriptor_set_idx + n - pipeline->descriptor_sets.size(); // number of sets still missing
 
         std::vector<vk::DescriptorSetLayout> layouts(alloc_count);
         for (uint32_t i = 0; i < alloc_count; i++) {
-            layouts[i] = pipeline.dsl;
+            layouts[i] = pipeline->dsl;
         }
-        vk::DescriptorSetAllocateInfo descriptor_set_alloc_info(pipeline.descriptor_pools[0], alloc_count, layouts.data());
-        std::vector<vk::DescriptorSet> sets = ctx->device.lock()->device.allocateDescriptorSets(descriptor_set_alloc_info);
-        pipeline.descriptor_sets.insert(pipeline.descriptor_sets.end(), sets.begin(), sets.end());
+        vk::DescriptorSetAllocateInfo descriptor_set_alloc_info(pipeline->descriptor_pools[0], alloc_count, layouts.data()); // NOTE(review): always pool 0, which ggml_vk_create_pipeline creates for 128 sets -- confirm that limit cannot be exceeded
+        std::vector<vk::DescriptorSet> sets = ctx->device->device.allocateDescriptorSets(descriptor_set_alloc_info);
+        pipeline->descriptor_sets.insert(pipeline->descriptor_sets.end(), sets.begin(), sets.end());
     } else {
-        for (uint32_t i = pipeline.descriptor_sets.size(); i < pipeline.descriptor_set_idx + n; i++) {
-            vk::DescriptorPoolSize descriptor_pool_size(vk::DescriptorType::eStorageBuffer, pipeline.parameter_count);
+        for (uint32_t i = pipeline->descriptor_sets.size(); i < pipeline->descriptor_set_idx + n; i++) { // other modes: one new pool per missing set
+            vk::DescriptorPoolSize descriptor_pool_size(vk::DescriptorType::eStorageBuffer, pipeline->parameter_count);
             vk::DescriptorPoolCreateInfo descriptor_pool_create_info({}, 1, descriptor_pool_size);
-            pipeline.descriptor_pools.push_back(ctx->device.lock()->device.createDescriptorPool(descriptor_pool_create_info));
+            pipeline->descriptor_pools.push_back(ctx->device->device.createDescriptorPool(descriptor_pool_create_info));
 
-            vk::DescriptorSetAllocateInfo descriptor_set_alloc_info(pipeline.descriptor_pools[i], 1, &pipeline.dsl);
-            std::vector<vk::DescriptorSet> sets = ctx->device.lock()->device.allocateDescriptorSets(descriptor_set_alloc_info);
-            pipeline.descriptor_sets.push_back(sets[0]);
+            vk::DescriptorSetAllocateInfo descriptor_set_alloc_info(pipeline->descriptor_pools[i], 1, &pipeline->dsl); // assumes descriptor_pools and descriptor_sets have equal length on entry -- TODO confirm
+            std::vector<vk::DescriptorSet> sets = ctx->device->device.allocateDescriptorSets(descriptor_set_alloc_info);
+            pipeline->descriptor_sets.push_back(sets[0]);
         }
     }
 }
 
 static void ggml_pipeline_cleanup(vk_pipeline& pipeline) {
 #ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_pipeline_cleanup(" << pipeline.name << ")" << std::endl;
+    std::cerr << "ggml_pipeline_cleanup(" << pipeline->name << ")" << std::endl;
 #endif
-    pipeline.descriptor_set_idx = 0;
+    pipeline->descriptor_set_idx = 0; // only the index is rewound; descriptor_sets and descriptor_pools are left untouched
 }
 
 static vk::CommandBuffer ggml_vk_create_cmd_buffer(ggml_backend_vk_context * ctx, vk_queue& q) {
@@ -499,7 +546,7 @@ static vk::CommandBuffer ggml_vk_create_cmd_buffer(ggml_backend_vk_context * ctx
         q.pool,
         vk::CommandBufferLevel::ePrimary,
         1);
-    const std::vector<vk::CommandBuffer> cmd_buffers = ctx->device.lock()->device.allocateCommandBuffers(command_buffer_alloc_info);
+    const std::vector<vk::CommandBuffer> cmd_buffers = ctx->device->device.allocateCommandBuffers(command_buffer_alloc_info);
     auto buf = cmd_buffers.front();
 
     q.cmd_buffers.push_back(buf);
@@ -644,11 +691,11 @@ static void ggml_vk_create_queue(ggml_backend_vk_context * ctx, vk_queue& q, uin
     q.queue_family_index = queue_family_index;
 
     vk::CommandPoolCreateInfo command_pool_create_info_compute(vk::CommandPoolCreateFlags(VK_COMMAND_POOL_CREATE_TRANSIENT_BIT), queue_family_index);
-    q.pool = ctx->device.lock()->device.createCommandPool(command_pool_create_info_compute);
+    q.pool = ctx->device->device.createCommandPool(command_pool_create_info_compute);
 
     q.cmd_buffer_idx = 0;
 
-    q.queue = ctx->device.lock()->device.getQueue(queue_family_index, queue_index);
+    q.queue = ctx->device->device.getQueue(queue_family_index, queue_index);
 
     q.stage_flags = stage_flags;
 }
@@ -672,7 +719,7 @@ static vk_semaphore * ggml_vk_create_binary_semaphore(ggml_backend_vk_context *
     vk::SemaphoreTypeCreateInfo tci{ vk::SemaphoreType::eBinary, 0 };
     vk::SemaphoreCreateInfo ci{};
     ci.setPNext(&tci);
-    vk::Semaphore semaphore = ctx->device.lock()->device.createSemaphore(ci);
+    vk::Semaphore semaphore = ctx->device->device.createSemaphore(ci);
     ctx->gc.semaphores.push_back({ semaphore, 0 });
     return &ctx->gc.semaphores[ctx->gc.semaphores.size() - 1];
 }
@@ -685,7 +732,7 @@ static vk_semaphore * ggml_vk_create_timeline_semaphore(ggml_backend_vk_context
         vk::SemaphoreTypeCreateInfo tci{ vk::SemaphoreType::eTimeline, 0 };
         vk::SemaphoreCreateInfo ci{};
         ci.setPNext(&tci);
-        vk::Semaphore semaphore = ctx->device.lock()->device.createSemaphore(ci);
+        vk::Semaphore semaphore = ctx->device->device.createSemaphore(ci);
         ctx->gc.tl_semaphores.push_back({ semaphore, 0 });
     }
     return &ctx->gc.tl_semaphores[ctx->semaphore_idx++];
@@ -693,7 +740,7 @@ static vk_semaphore * ggml_vk_create_timeline_semaphore(ggml_backend_vk_context
 
 static vk::Event ggml_vk_create_event(ggml_backend_vk_context * ctx) {
     if (ctx->event_idx >= ctx->gc.events.size()) {
-        ctx->gc.events.push_back(ctx->device.lock()->device.createEvent({}));
+        ctx->gc.events.push_back(ctx->device->device.createEvent({}));
     }
     return ctx->gc.events[ctx->event_idx++];
 }
@@ -704,7 +751,7 @@ static void ggml_vk_queue_cleanup(ggml_backend_vk_context * ctx, vk_queue& q) {
 #endif
     // Requires command buffers to be done
 
-    ctx->device.lock()->device.resetCommandPool(q.pool);
+    ctx->device->device.resetCommandPool(q.pool);
     q.cmd_buffer_idx = 0;
 }
 
@@ -729,11 +776,11 @@ static vk_buffer ggml_vk_create_buffer(ggml_backend_vk_context * ctx, size_t siz
         nullptr,
     };
 
-    buf->buffer = ctx->device.lock()->device.createBuffer(buffer_create_info);
+    buf->buffer = ctx->device->device.createBuffer(buffer_create_info);
 
-    vk::MemoryRequirements mem_req = ctx->device.lock()->device.getBufferMemoryRequirements(buf->buffer);
+    vk::MemoryRequirements mem_req = ctx->device->device.getBufferMemoryRequirements(buf->buffer);
 
-    vk::PhysicalDeviceMemoryProperties mem_props = ctx->device.lock()->physical_device.getMemoryProperties();
+    vk::PhysicalDeviceMemoryProperties mem_props = ctx->device->physical_device.getMemoryProperties();
 
     uint32_t memory_type_index = UINT32_MAX;
 
@@ -746,16 +793,16 @@ static vk_buffer ggml_vk_create_buffer(ggml_backend_vk_context * ctx, size_t siz
     }
 
     if (memory_type_index >= mem_props.memoryTypeCount) {
-        ctx->device.lock()->device.destroyBuffer(buf->buffer);
+        ctx->device->device.destroyBuffer(buf->buffer);
         buf->size = 0;
         throw vk::OutOfDeviceMemoryError("No suitable memory type found");
     }
 
     try {
-        buf->device_memory = ctx->device.lock()->device.allocateMemory({ mem_req.size, memory_type_index });
+        buf->device_memory = ctx->device->device.allocateMemory({ mem_req.size, memory_type_index });
     } catch (const vk::SystemError& e) {
         // Out of Host/Device memory, clean up buffer
-        ctx->device.lock()->device.destroyBuffer(buf->buffer);
+        ctx->device->device.destroyBuffer(buf->buffer);
         buf->size = 0;
         throw e;
     }
@@ -763,14 +810,14 @@ static vk_buffer ggml_vk_create_buffer(ggml_backend_vk_context * ctx, size_t siz
     buf->ptr = nullptr;
 
     if (req_flags & vk::MemoryPropertyFlagBits::eHostVisible) {
-        buf->ptr = ctx->device.lock()->device.mapMemory(buf->device_memory, 0, VK_WHOLE_SIZE);
+        buf->ptr = ctx->device->device.mapMemory(buf->device_memory, 0, VK_WHOLE_SIZE);
     }
 
-    ctx->device.lock()->device.bindBufferMemory(buf->buffer, buf->device_memory, 0);
+    ctx->device->device.bindBufferMemory(buf->buffer, buf->device_memory, 0);
 
     buf->ctx = ctx;
 
-    buf->device = ctx->device.lock();
+    buf->device = ctx->device;
 
 #ifdef GGML_VULKAN_DEBUG
     std::cerr << "Created buffer " << buf->buffer << std::endl;
@@ -794,7 +841,7 @@ static vk_buffer ggml_vk_create_buffer_device(ggml_backend_vk_context * ctx, siz
     try {
         buf = ggml_vk_create_buffer(ctx, size, vk::MemoryPropertyFlagBits::eDeviceLocal);
     } catch (const vk::SystemError& e) {
-        if (ctx->device.lock()->uma) {
+        if (ctx->device->uma) {
             // Fall back to host memory type
             buf = ggml_vk_create_buffer_check(ctx, size, vk::MemoryPropertyFlagBits::eHostVisible | vk::MemoryPropertyFlagBits::eHostCoherent);
         } else {
@@ -873,7 +920,7 @@ static void ggml_vk_load_shaders(ggml_backend_vk_context * ctx) {
     std::cerr << "ggml_vk_load_shaders(" << ctx->name << ")" << std::endl;
 #endif
 
-    const std::shared_ptr<vk_device> device = ctx->device.lock();
+    const std::shared_ptr<vk_device> device = ctx->device;
 
     // mulmat
     std::initializer_list<uint32_t> warptile_l = { 128, 128, 128, 16, device->subgroup_size * 2, 64, 2, 4, 4, device->subgroup_size };
@@ -889,125 +936,125 @@ static void ggml_vk_load_shaders(ggml_backend_vk_context * ctx) {
     uint32_t s_align =  32;
 
     if (device->fp16) {
-        ggml_vk_create_pipeline(ctx, ctx->pipeline_matmul_f32_l, "matmul_f32_l", matmul_f32_len, matmul_f32_data, "main", 3, 14 * sizeof(uint32_t), l_wg_denoms, warptile_l, 1);
-        ggml_vk_create_pipeline(ctx, ctx->pipeline_matmul_f32_m, "matmul_f32_m", matmul_f32_len, matmul_f32_data, "main", 3, 14 * sizeof(uint32_t), m_wg_denoms, warptile_m, 1);
-        ggml_vk_create_pipeline(ctx, ctx->pipeline_matmul_f32_s, "matmul_f32_s", matmul_f32_len, matmul_f32_data, "main", 3, 14 * sizeof(uint32_t), s_wg_denoms, warptile_s, 1);
-        ggml_vk_create_pipeline(ctx, ctx->pipeline_matmul_f32_aligned_l, "matmul_f32_aligned_l", matmul_f32_aligned_len, matmul_f32_aligned_data, "main", 3, 14 * sizeof(uint32_t), l_wg_denoms, warptile_l, l_align);
-        ggml_vk_create_pipeline(ctx, ctx->pipeline_matmul_f32_aligned_m, "matmul_f32_aligned_m", matmul_f32_aligned_len, matmul_f32_aligned_data, "main", 3, 14 * sizeof(uint32_t), m_wg_denoms, warptile_m, m_align);
-        ggml_vk_create_pipeline(ctx, ctx->pipeline_matmul_f32_aligned_s, "matmul_f32_aligned_s", matmul_f32_aligned_len, matmul_f32_aligned_data, "main", 3, 14 * sizeof(uint32_t), s_wg_denoms, warptile_s, s_align);
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32_l, "matmul_f32_l", matmul_f32_len, matmul_f32_data, "main", 3, 14 * sizeof(uint32_t), l_wg_denoms, warptile_l, 1);
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32_m, "matmul_f32_m", matmul_f32_len, matmul_f32_data, "main", 3, 14 * sizeof(uint32_t), m_wg_denoms, warptile_m, 1);
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32_s, "matmul_f32_s", matmul_f32_len, matmul_f32_data, "main", 3, 14 * sizeof(uint32_t), s_wg_denoms, warptile_s, 1);
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32_aligned_l, "matmul_f32_aligned_l", matmul_f32_aligned_len, matmul_f32_aligned_data, "main", 3, 14 * sizeof(uint32_t), l_wg_denoms, warptile_l, l_align);
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32_aligned_m, "matmul_f32_aligned_m", matmul_f32_aligned_len, matmul_f32_aligned_data, "main", 3, 14 * sizeof(uint32_t), m_wg_denoms, warptile_m, m_align);
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32_aligned_s, "matmul_f32_aligned_s", matmul_f32_aligned_len, matmul_f32_aligned_data, "main", 3, 14 * sizeof(uint32_t), s_wg_denoms, warptile_s, s_align);
 
-        ggml_vk_create_pipeline(ctx, ctx->pipeline_matmul_f16_l, "matmul_f16_l", matmul_f16_len, matmul_f16_data, "main", 3, 14 * sizeof(uint32_t), l_wg_denoms, warptile_l, 1);
-        ggml_vk_create_pipeline(ctx, ctx->pipeline_matmul_f16_m, "matmul_f16_m", matmul_f16_len, matmul_f16_data, "main", 3, 14 * sizeof(uint32_t), m_wg_denoms, warptile_m, 1);
-        ggml_vk_create_pipeline(ctx, ctx->pipeline_matmul_f16_s, "matmul_f16_s", matmul_f16_len, matmul_f16_data, "main", 3, 14 * sizeof(uint32_t), s_wg_denoms, warptile_s, 1);
-        ggml_vk_create_pipeline(ctx, ctx->pipeline_matmul_f16_aligned_l, "matmul_f16_aligned_l", matmul_f16_aligned_len, matmul_f16_aligned_data, "main", 3, 14 * sizeof(uint32_t), l_wg_denoms, warptile_l, l_align);
-        ggml_vk_create_pipeline(ctx, ctx->pipeline_matmul_f16_aligned_m, "matmul_f16_aligned_m", matmul_f16_aligned_len, matmul_f16_aligned_data, "main", 3, 14 * sizeof(uint32_t), m_wg_denoms, warptile_m, m_align);
-        ggml_vk_create_pipeline(ctx, ctx->pipeline_matmul_f16_aligned_s, "matmul_f16_aligned_s", matmul_f16_aligned_len, matmul_f16_aligned_data, "main", 3, 14 * sizeof(uint32_t), s_wg_denoms, warptile_s, s_align);
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f16_l, "matmul_f16_l", matmul_f16_len, matmul_f16_data, "main", 3, 14 * sizeof(uint32_t), l_wg_denoms, warptile_l, 1);
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f16_m, "matmul_f16_m", matmul_f16_len, matmul_f16_data, "main", 3, 14 * sizeof(uint32_t), m_wg_denoms, warptile_m, 1);
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f16_s, "matmul_f16_s", matmul_f16_len, matmul_f16_data, "main", 3, 14 * sizeof(uint32_t), s_wg_denoms, warptile_s, 1);
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f16_aligned_l, "matmul_f16_aligned_l", matmul_f16_aligned_len, matmul_f16_aligned_data, "main", 3, 14 * sizeof(uint32_t), l_wg_denoms, warptile_l, l_align);
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f16_aligned_m, "matmul_f16_aligned_m", matmul_f16_aligned_len, matmul_f16_aligned_data, "main", 3, 14 * sizeof(uint32_t), m_wg_denoms, warptile_m, m_align);
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f16_aligned_s, "matmul_f16_aligned_s", matmul_f16_aligned_len, matmul_f16_aligned_data, "main", 3, 14 * sizeof(uint32_t), s_wg_denoms, warptile_s, s_align);
 
-        ggml_vk_create_pipeline(ctx, ctx->pipeline_matmul_f16_f32_l, "matmul_f16_f32_l", matmul_f16_f32_len, matmul_f16_f32_data, "main", 3, 14 * sizeof(uint32_t), l_wg_denoms, warptile_l, 1);
-        ggml_vk_create_pipeline(ctx, ctx->pipeline_matmul_f16_f32_m, "matmul_f16_f32_m", matmul_f16_f32_len, matmul_f16_f32_data, "main", 3, 14 * sizeof(uint32_t), m_wg_denoms, warptile_m, 1);
-        ggml_vk_create_pipeline(ctx, ctx->pipeline_matmul_f16_f32_s, "matmul_f16_f32_s", matmul_f16_f32_len, matmul_f16_f32_data, "main", 3, 14 * sizeof(uint32_t), s_wg_denoms, warptile_s, 1);
-        ggml_vk_create_pipeline(ctx, ctx->pipeline_matmul_f16_f32_aligned_l, "matmul_f16_f32_aligned_l", matmul_f16_f32_aligned_len, matmul_f16_f32_aligned_data, "main", 3, 14 * sizeof(uint32_t), l_wg_denoms, warptile_l, l_align);
-        ggml_vk_create_pipeline(ctx, ctx->pipeline_matmul_f16_f32_aligned_m, "matmul_f16_f32_aligned_m", matmul_f16_f32_aligned_len, matmul_f16_f32_aligned_data, "main", 3, 14 * sizeof(uint32_t), m_wg_denoms, warptile_m, m_align);
-        ggml_vk_create_pipeline(ctx, ctx->pipeline_matmul_f16_f32_aligned_s, "matmul_f16_f32_aligned_s", matmul_f16_f32_aligned_len, matmul_f16_f32_aligned_data, "main", 3, 14 * sizeof(uint32_t), s_wg_denoms, warptile_s, s_align);
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f16_f32_l, "matmul_f16_f32_l", matmul_f16_f32_len, matmul_f16_f32_data, "main", 3, 14 * sizeof(uint32_t), l_wg_denoms, warptile_l, 1);
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f16_f32_m, "matmul_f16_f32_m", matmul_f16_f32_len, matmul_f16_f32_data, "main", 3, 14 * sizeof(uint32_t), m_wg_denoms, warptile_m, 1);
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f16_f32_s, "matmul_f16_f32_s", matmul_f16_f32_len, matmul_f16_f32_data, "main", 3, 14 * sizeof(uint32_t), s_wg_denoms, warptile_s, 1);
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f16_f32_aligned_l, "matmul_f16_f32_aligned_l", matmul_f16_f32_aligned_len, matmul_f16_f32_aligned_data, "main", 3, 14 * sizeof(uint32_t), l_wg_denoms, warptile_l, l_align);
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f16_f32_aligned_m, "matmul_f16_f32_aligned_m", matmul_f16_f32_aligned_len, matmul_f16_f32_aligned_data, "main", 3, 14 * sizeof(uint32_t), m_wg_denoms, warptile_m, m_align);
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f16_f32_aligned_s, "matmul_f16_f32_aligned_s", matmul_f16_f32_aligned_len, matmul_f16_f32_aligned_data, "main", 3, 14 * sizeof(uint32_t), s_wg_denoms, warptile_s, s_align);
     } else {
-        ggml_vk_create_pipeline(ctx, ctx->pipeline_matmul_f32_l, "matmul_f32_l", matmul_f32_fp32_len, matmul_f32_fp32_data, "main", 3, 14 * sizeof(uint32_t), l_wg_denoms, warptile_l, 1);
-        ggml_vk_create_pipeline(ctx, ctx->pipeline_matmul_f32_m, "matmul_f32_m", matmul_f32_fp32_len, matmul_f32_fp32_data, "main", 3, 14 * sizeof(uint32_t), m_wg_denoms, warptile_m, 1);
-        ggml_vk_create_pipeline(ctx, ctx->pipeline_matmul_f32_s, "matmul_f32_s", matmul_f32_fp32_len, matmul_f32_fp32_data, "main", 3, 14 * sizeof(uint32_t), s_wg_denoms, warptile_s, 1);
-        ggml_vk_create_pipeline(ctx, ctx->pipeline_matmul_f32_aligned_l, "matmul_f32_aligned_l", matmul_f32_aligned_fp32_len, matmul_f32_aligned_fp32_data, "main", 3, 14 * sizeof(uint32_t), l_wg_denoms, warptile_l, l_align);
-        ggml_vk_create_pipeline(ctx, ctx->pipeline_matmul_f32_aligned_m, "matmul_f32_aligned_m", matmul_f32_aligned_fp32_len, matmul_f32_aligned_fp32_data, "main", 3, 14 * sizeof(uint32_t), m_wg_denoms, warptile_m, m_align);
-        ggml_vk_create_pipeline(ctx, ctx->pipeline_matmul_f32_aligned_s, "matmul_f32_aligned_s", matmul_f32_aligned_fp32_len, matmul_f32_aligned_fp32_data, "main", 3, 14 * sizeof(uint32_t), s_wg_denoms, warptile_s, s_align);
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32_l, "matmul_f32_l", matmul_f32_fp32_len, matmul_f32_fp32_data, "main", 3, 14 * sizeof(uint32_t), l_wg_denoms, warptile_l, 1);
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32_m, "matmul_f32_m", matmul_f32_fp32_len, matmul_f32_fp32_data, "main", 3, 14 * sizeof(uint32_t), m_wg_denoms, warptile_m, 1);
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32_s, "matmul_f32_s", matmul_f32_fp32_len, matmul_f32_fp32_data, "main", 3, 14 * sizeof(uint32_t), s_wg_denoms, warptile_s, 1);
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32_aligned_l, "matmul_f32_aligned_l", matmul_f32_aligned_fp32_len, matmul_f32_aligned_fp32_data, "main", 3, 14 * sizeof(uint32_t), l_wg_denoms, warptile_l, l_align);
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32_aligned_m, "matmul_f32_aligned_m", matmul_f32_aligned_fp32_len, matmul_f32_aligned_fp32_data, "main", 3, 14 * sizeof(uint32_t), m_wg_denoms, warptile_m, m_align);
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f32_aligned_s, "matmul_f32_aligned_s", matmul_f32_aligned_fp32_len, matmul_f32_aligned_fp32_data, "main", 3, 14 * sizeof(uint32_t), s_wg_denoms, warptile_s, s_align);
 
-        ggml_vk_create_pipeline(ctx, ctx->pipeline_matmul_f16_l, "matmul_f16_l", matmul_f16_fp32_len, matmul_f16_fp32_data, "main", 3, 14 * sizeof(uint32_t), l_wg_denoms, warptile_l, 1);
-        ggml_vk_create_pipeline(ctx, ctx->pipeline_matmul_f16_m, "matmul_f16_m", matmul_f16_fp32_len, matmul_f16_fp32_data, "main", 3, 14 * sizeof(uint32_t), m_wg_denoms, warptile_m, 1);
-        ggml_vk_create_pipeline(ctx, ctx->pipeline_matmul_f16_s, "matmul_f16_s", matmul_f16_fp32_len, matmul_f16_fp32_data, "main", 3, 14 * sizeof(uint32_t), s_wg_denoms, warptile_s, 1);
-        ggml_vk_create_pipeline(ctx, ctx->pipeline_matmul_f16_aligned_l, "matmul_f16_aligned_l", matmul_f16_aligned_fp32_len, matmul_f16_aligned_fp32_data, "main", 3, 14 * sizeof(uint32_t), l_wg_denoms, warptile_l, l_align);
-        ggml_vk_create_pipeline(ctx, ctx->pipeline_matmul_f16_aligned_m, "matmul_f16_aligned_m", matmul_f16_aligned_fp32_len, matmul_f16_aligned_fp32_data, "main", 3, 14 * sizeof(uint32_t), m_wg_denoms, warptile_m, m_align);
-        ggml_vk_create_pipeline(ctx, ctx->pipeline_matmul_f16_aligned_s, "matmul_f16_aligned_s", matmul_f16_aligned_fp32_len, matmul_f16_aligned_fp32_data, "main", 3, 14 * sizeof(uint32_t), s_wg_denoms, warptile_s, s_align);
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f16_l, "matmul_f16_l", matmul_f16_fp32_len, matmul_f16_fp32_data, "main", 3, 14 * sizeof(uint32_t), l_wg_denoms, warptile_l, 1);
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f16_m, "matmul_f16_m", matmul_f16_fp32_len, matmul_f16_fp32_data, "main", 3, 14 * sizeof(uint32_t), m_wg_denoms, warptile_m, 1);
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f16_s, "matmul_f16_s", matmul_f16_fp32_len, matmul_f16_fp32_data, "main", 3, 14 * sizeof(uint32_t), s_wg_denoms, warptile_s, 1);
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f16_aligned_l, "matmul_f16_aligned_l", matmul_f16_aligned_fp32_len, matmul_f16_aligned_fp32_data, "main", 3, 14 * sizeof(uint32_t), l_wg_denoms, warptile_l, l_align);
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f16_aligned_m, "matmul_f16_aligned_m", matmul_f16_aligned_fp32_len, matmul_f16_aligned_fp32_data, "main", 3, 14 * sizeof(uint32_t), m_wg_denoms, warptile_m, m_align);
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f16_aligned_s, "matmul_f16_aligned_s", matmul_f16_aligned_fp32_len, matmul_f16_aligned_fp32_data, "main", 3, 14 * sizeof(uint32_t), s_wg_denoms, warptile_s, s_align);
 
-        ggml_vk_create_pipeline(ctx, ctx->pipeline_matmul_f16_f32_l, "matmul_f16_f32_l", matmul_f16_f32_fp32_len, matmul_f16_f32_fp32_data, "main", 3, 14 * sizeof(uint32_t), l_wg_denoms, warptile_l, 1);
-        ggml_vk_create_pipeline(ctx, ctx->pipeline_matmul_f16_f32_m, "matmul_f16_f32_m", matmul_f16_f32_fp32_len, matmul_f16_f32_fp32_data, "main", 3, 14 * sizeof(uint32_t), m_wg_denoms, warptile_m, 1);
-        ggml_vk_create_pipeline(ctx, ctx->pipeline_matmul_f16_f32_s, "matmul_f16_f32_s", matmul_f16_f32_fp32_len, matmul_f16_f32_fp32_data, "main", 3, 14 * sizeof(uint32_t), s_wg_denoms, warptile_s, 1);
-        ggml_vk_create_pipeline(ctx, ctx->pipeline_matmul_f16_f32_aligned_l, "matmul_f16_f32_aligned_l", matmul_f16_f32_aligned_fp32_len, matmul_f16_f32_aligned_fp32_data, "main", 3, 14 * sizeof(uint32_t), l_wg_denoms, warptile_l, l_align);
-        ggml_vk_create_pipeline(ctx, ctx->pipeline_matmul_f16_f32_aligned_m, "matmul_f16_f32_aligned_m", matmul_f16_f32_aligned_fp32_len, matmul_f16_f32_aligned_fp32_data, "main", 3, 14 * sizeof(uint32_t), m_wg_denoms, warptile_m, m_align);
-        ggml_vk_create_pipeline(ctx, ctx->pipeline_matmul_f16_f32_aligned_s, "matmul_f16_f32_aligned_s", matmul_f16_f32_aligned_fp32_len, matmul_f16_f32_aligned_fp32_data, "main", 3, 14 * sizeof(uint32_t), s_wg_denoms, warptile_s, s_align);
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f16_f32_l, "matmul_f16_f32_l", matmul_f16_f32_fp32_len, matmul_f16_f32_fp32_data, "main", 3, 14 * sizeof(uint32_t), l_wg_denoms, warptile_l, 1);
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f16_f32_m, "matmul_f16_f32_m", matmul_f16_f32_fp32_len, matmul_f16_f32_fp32_data, "main", 3, 14 * sizeof(uint32_t), m_wg_denoms, warptile_m, 1);
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f16_f32_s, "matmul_f16_f32_s", matmul_f16_f32_fp32_len, matmul_f16_f32_fp32_data, "main", 3, 14 * sizeof(uint32_t), s_wg_denoms, warptile_s, 1);
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f16_f32_aligned_l, "matmul_f16_f32_aligned_l", matmul_f16_f32_aligned_fp32_len, matmul_f16_f32_aligned_fp32_data, "main", 3, 14 * sizeof(uint32_t), l_wg_denoms, warptile_l, l_align);
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f16_f32_aligned_m, "matmul_f16_f32_aligned_m", matmul_f16_f32_aligned_fp32_len, matmul_f16_f32_aligned_fp32_data, "main", 3, 14 * sizeof(uint32_t), m_wg_denoms, warptile_m, m_align);
+        ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_f16_f32_aligned_s, "matmul_f16_f32_aligned_s", matmul_f16_f32_aligned_fp32_len, matmul_f16_f32_aligned_fp32_data, "main", 3, 14 * sizeof(uint32_t), s_wg_denoms, warptile_s, s_align);
     }
 
-    ggml_vk_create_pipeline(ctx, ctx->pipeline_dequant_mul_mat_vec_f32[GGML_TYPE_F16 ], "mul_mat_vec_f16_f32",  mul_mat_vec_f16_f32_len,  mul_mat_vec_f16_f32_data,  "main", 3, 3 * sizeof(uint32_t), {1, 1, 1}, { device->subgroup_size }, 1);
-    ggml_vk_create_pipeline(ctx, ctx->pipeline_dequant_mul_mat_vec_f32[GGML_TYPE_Q4_0], "mul_mat_vec_q4_0_f32", mul_mat_vec_q4_0_f32_len, mul_mat_vec_q4_0_f32_data, "main", 3, 3 * sizeof(uint32_t), {1, 1, 1}, { device->subgroup_size }, 1);
-    ggml_vk_create_pipeline(ctx, ctx->pipeline_dequant_mul_mat_vec_f32[GGML_TYPE_Q4_1], "mul_mat_vec_q4_1_f32", mul_mat_vec_q4_1_f32_len, mul_mat_vec_q4_1_f32_data, "main", 3, 3 * sizeof(uint32_t), {1, 1, 1}, { device->subgroup_size }, 1);
-    ggml_vk_create_pipeline(ctx, ctx->pipeline_dequant_mul_mat_vec_f32[GGML_TYPE_Q5_0], "mul_mat_vec_q5_0_f32", mul_mat_vec_q5_0_f32_len, mul_mat_vec_q5_0_f32_data, "main", 3, 3 * sizeof(uint32_t), {1, 1, 1}, { device->subgroup_size }, 1);
-    ggml_vk_create_pipeline(ctx, ctx->pipeline_dequant_mul_mat_vec_f32[GGML_TYPE_Q5_1], "mul_mat_vec_q5_1_f32", mul_mat_vec_q5_1_f32_len, mul_mat_vec_q5_1_f32_data, "main", 3, 3 * sizeof(uint32_t), {1, 1, 1}, { device->subgroup_size }, 1);
-    ggml_vk_create_pipeline(ctx, ctx->pipeline_dequant_mul_mat_vec_f32[GGML_TYPE_Q8_0], "mul_mat_vec_q8_0_f32", mul_mat_vec_q8_0_f32_len, mul_mat_vec_q8_0_f32_data, "main", 3, 3 * sizeof(uint32_t), {1, 1, 1}, { device->subgroup_size }, 1);
-    ggml_vk_create_pipeline(ctx, ctx->pipeline_dequant_mul_mat_vec_f32[GGML_TYPE_Q2_K], "mul_mat_vec_q2_K_f32", mul_mat_vec_q2_K_f32_len, mul_mat_vec_q2_K_f32_data, "main", 3, 3 * sizeof(uint32_t), {1, 1, 1}, { device->subgroup_size }, 1);
-    ggml_vk_create_pipeline(ctx, ctx->pipeline_dequant_mul_mat_vec_f32[GGML_TYPE_Q3_K], "mul_mat_vec_q3_K_f32", mul_mat_vec_q3_K_f32_len, mul_mat_vec_q3_K_f32_data, "main", 3, 3 * sizeof(uint32_t), {1, 1, 1}, { device->subgroup_size }, 1);
-    ggml_vk_create_pipeline(ctx, ctx->pipeline_dequant_mul_mat_vec_f32[GGML_TYPE_Q4_K], "mul_mat_vec_q4_K_f32", mul_mat_vec_q4_K_f32_len, mul_mat_vec_q4_K_f32_data, "main", 3, 3 * sizeof(uint32_t), {1, 1, 1}, { device->subgroup_size }, 1);
-    ggml_vk_create_pipeline(ctx, ctx->pipeline_dequant_mul_mat_vec_f32[GGML_TYPE_Q5_K], "mul_mat_vec_q5_K_f32", mul_mat_vec_q5_K_f32_len, mul_mat_vec_q5_K_f32_data, "main", 3, 3 * sizeof(uint32_t), {1, 1, 1}, { device->subgroup_size }, 1);
-    ggml_vk_create_pipeline(ctx, ctx->pipeline_dequant_mul_mat_vec_f32[GGML_TYPE_Q6_K], "mul_mat_vec_q6_K_f32", mul_mat_vec_q6_K_f32_len, mul_mat_vec_q6_K_f32_data, "main", 3, 3 * sizeof(uint32_t), {1, 1, 1}, { device->subgroup_size }, 1);
+    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_f32[GGML_TYPE_F16 ], "mul_mat_vec_f16_f32",  mul_mat_vec_f16_f32_len,  mul_mat_vec_f16_f32_data,  "main", 3, 3 * sizeof(uint32_t), {1, 1, 1}, { device->subgroup_size }, 1);
+    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_f32[GGML_TYPE_Q4_0], "mul_mat_vec_q4_0_f32", mul_mat_vec_q4_0_f32_len, mul_mat_vec_q4_0_f32_data, "main", 3, 3 * sizeof(uint32_t), {1, 1, 1}, { device->subgroup_size }, 1);
+    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_f32[GGML_TYPE_Q4_1], "mul_mat_vec_q4_1_f32", mul_mat_vec_q4_1_f32_len, mul_mat_vec_q4_1_f32_data, "main", 3, 3 * sizeof(uint32_t), {1, 1, 1}, { device->subgroup_size }, 1);
+    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_f32[GGML_TYPE_Q5_0], "mul_mat_vec_q5_0_f32", mul_mat_vec_q5_0_f32_len, mul_mat_vec_q5_0_f32_data, "main", 3, 3 * sizeof(uint32_t), {1, 1, 1}, { device->subgroup_size }, 1);
+    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_f32[GGML_TYPE_Q5_1], "mul_mat_vec_q5_1_f32", mul_mat_vec_q5_1_f32_len, mul_mat_vec_q5_1_f32_data, "main", 3, 3 * sizeof(uint32_t), {1, 1, 1}, { device->subgroup_size }, 1);
+    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_f32[GGML_TYPE_Q8_0], "mul_mat_vec_q8_0_f32", mul_mat_vec_q8_0_f32_len, mul_mat_vec_q8_0_f32_data, "main", 3, 3 * sizeof(uint32_t), {1, 1, 1}, { device->subgroup_size }, 1);
+    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_f32[GGML_TYPE_Q2_K], "mul_mat_vec_q2_K_f32", mul_mat_vec_q2_K_f32_len, mul_mat_vec_q2_K_f32_data, "main", 3, 3 * sizeof(uint32_t), {1, 1, 1}, { device->subgroup_size }, 1);
+    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_f32[GGML_TYPE_Q3_K], "mul_mat_vec_q3_K_f32", mul_mat_vec_q3_K_f32_len, mul_mat_vec_q3_K_f32_data, "main", 3, 3 * sizeof(uint32_t), {1, 1, 1}, { device->subgroup_size }, 1);
+    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_f32[GGML_TYPE_Q4_K], "mul_mat_vec_q4_K_f32", mul_mat_vec_q4_K_f32_len, mul_mat_vec_q4_K_f32_data, "main", 3, 3 * sizeof(uint32_t), {1, 1, 1}, { device->subgroup_size }, 1);
+    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_f32[GGML_TYPE_Q5_K], "mul_mat_vec_q5_K_f32", mul_mat_vec_q5_K_f32_len, mul_mat_vec_q5_K_f32_data, "main", 3, 3 * sizeof(uint32_t), {1, 1, 1}, { device->subgroup_size }, 1);
+    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant_mul_mat_vec_f32[GGML_TYPE_Q6_K], "mul_mat_vec_q6_K_f32", mul_mat_vec_q6_K_f32_len, mul_mat_vec_q6_K_f32_data, "main", 3, 3 * sizeof(uint32_t), {1, 1, 1}, { device->subgroup_size }, 1);
 
     // dequant shaders
-    ggml_vk_create_pipeline(ctx, ctx->pipeline_dequant[GGML_TYPE_F32 ], "f32_to_f16",   f32_to_f16_len,   f32_to_f16_data,   "main", 2, 5 * sizeof(uint32_t), {      64, 1, 1}, {}, 1);
-    ggml_vk_create_pipeline(ctx, ctx->pipeline_dequant[GGML_TYPE_F16 ], "dequant_f16",  dequant_f16_len,  dequant_f16_data,  "main", 2, 5 * sizeof(uint32_t), {256 * 32, 1, 1}, {}, 1);
-    ggml_vk_create_pipeline(ctx, ctx->pipeline_dequant[GGML_TYPE_Q4_0], "dequant_q4_0", dequant_q4_0_len, dequant_q4_0_data, "main", 2, 5 * sizeof(uint32_t), {  8 * 32, 1, 1}, {}, 1);
-    ggml_vk_create_pipeline(ctx, ctx->pipeline_dequant[GGML_TYPE_Q4_1], "dequant_q4_1", dequant_q4_1_len, dequant_q4_1_data, "main", 2, 5 * sizeof(uint32_t), {256 * 32, 1, 1}, {}, 1);
-    ggml_vk_create_pipeline(ctx, ctx->pipeline_dequant[GGML_TYPE_Q5_0], "dequant_q5_0", dequant_q5_0_len, dequant_q5_0_data, "main", 2, 5 * sizeof(uint32_t), {256 * 32, 1, 1}, {}, 1);
-    ggml_vk_create_pipeline(ctx, ctx->pipeline_dequant[GGML_TYPE_Q5_1], "dequant_q5_1", dequant_q5_1_len, dequant_q5_1_data, "main", 2, 5 * sizeof(uint32_t), {256 * 32, 1, 1}, {}, 1);
-    ggml_vk_create_pipeline(ctx, ctx->pipeline_dequant[GGML_TYPE_Q8_0], "dequant_q8_0", dequant_q8_0_len, dequant_q8_0_data, "main", 2, 5 * sizeof(uint32_t), {256 * 32, 1, 1}, {}, 1);
-    ggml_vk_create_pipeline(ctx, ctx->pipeline_dequant[GGML_TYPE_Q2_K], "dequant_q2_K", dequant_q2_K_len, dequant_q2_K_data, "main", 2, 5 * sizeof(uint32_t), {256 * 64, 1, 1}, {}, 1);
-    ggml_vk_create_pipeline(ctx, ctx->pipeline_dequant[GGML_TYPE_Q3_K], "dequant_q3_K", dequant_q3_K_len, dequant_q3_K_data, "main", 2, 5 * sizeof(uint32_t), {256 * 64, 1, 1}, {}, 1);
-    ggml_vk_create_pipeline(ctx, ctx->pipeline_dequant[GGML_TYPE_Q4_K], "dequant_q4_K", dequant_q4_K_len, dequant_q4_K_data, "main", 2, 5 * sizeof(uint32_t), {256 * 32, 1, 1}, {}, 1);
-    ggml_vk_create_pipeline(ctx, ctx->pipeline_dequant[GGML_TYPE_Q5_K], "dequant_q5_K", dequant_q5_K_len, dequant_q5_K_data, "main", 2, 5 * sizeof(uint32_t), {256 * 64, 1, 1}, {}, 1);
-    ggml_vk_create_pipeline(ctx, ctx->pipeline_dequant[GGML_TYPE_Q6_K], "dequant_q6_K", dequant_q6_K_len, dequant_q6_K_data, "main", 2, 5 * sizeof(uint32_t), {256 * 64, 1, 1}, {}, 1);
+    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant[GGML_TYPE_F32 ], "f32_to_f16",   f32_to_f16_len,   f32_to_f16_data,   "main", 2, 5 * sizeof(uint32_t), {      64, 1, 1}, {}, 1);
+    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant[GGML_TYPE_F16 ], "dequant_f16",  dequant_f16_len,  dequant_f16_data,  "main", 2, 5 * sizeof(uint32_t), {256 * 32, 1, 1}, {}, 1);
+    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant[GGML_TYPE_Q4_0], "dequant_q4_0", dequant_q4_0_len, dequant_q4_0_data, "main", 2, 5 * sizeof(uint32_t), {256 * 16, 1, 1}, {}, 1);
+    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant[GGML_TYPE_Q4_1], "dequant_q4_1", dequant_q4_1_len, dequant_q4_1_data, "main", 2, 5 * sizeof(uint32_t), {256 * 32, 1, 1}, {}, 1);
+    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant[GGML_TYPE_Q5_0], "dequant_q5_0", dequant_q5_0_len, dequant_q5_0_data, "main", 2, 5 * sizeof(uint32_t), {256 * 32, 1, 1}, {}, 1);
+    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant[GGML_TYPE_Q5_1], "dequant_q5_1", dequant_q5_1_len, dequant_q5_1_data, "main", 2, 5 * sizeof(uint32_t), {256 * 32, 1, 1}, {}, 1);
+    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant[GGML_TYPE_Q8_0], "dequant_q8_0", dequant_q8_0_len, dequant_q8_0_data, "main", 2, 5 * sizeof(uint32_t), {256 * 32, 1, 1}, {}, 1);
+    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant[GGML_TYPE_Q2_K], "dequant_q2_K", dequant_q2_K_len, dequant_q2_K_data, "main", 2, 5 * sizeof(uint32_t), {256 * 64, 1, 1}, {}, 1);
+    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant[GGML_TYPE_Q3_K], "dequant_q3_K", dequant_q3_K_len, dequant_q3_K_data, "main", 2, 5 * sizeof(uint32_t), {256 * 64, 1, 1}, {}, 1);
+    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant[GGML_TYPE_Q4_K], "dequant_q4_K", dequant_q4_K_len, dequant_q4_K_data, "main", 2, 5 * sizeof(uint32_t), {256 * 32, 1, 1}, {}, 1);
+    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant[GGML_TYPE_Q5_K], "dequant_q5_K", dequant_q5_K_len, dequant_q5_K_data, "main", 2, 5 * sizeof(uint32_t), {256 * 64, 1, 1}, {}, 1);
+    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_dequant[GGML_TYPE_Q6_K], "dequant_q6_K", dequant_q6_K_len, dequant_q6_K_data, "main", 2, 5 * sizeof(uint32_t), {256 * 64, 1, 1}, {}, 1);
 
     // get_rows
-    ggml_vk_create_pipeline(ctx, ctx->pipeline_get_rows[GGML_TYPE_F16 ], "get_rows_f16",  get_rows_f16_len,  get_rows_f16_data,  "main", 3, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1);
-    ggml_vk_create_pipeline(ctx, ctx->pipeline_get_rows[GGML_TYPE_Q4_0], "get_rows_q4_0", get_rows_q4_0_len, get_rows_q4_0_data, "main", 3, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1);
-    ggml_vk_create_pipeline(ctx, ctx->pipeline_get_rows[GGML_TYPE_Q4_1], "get_rows_q4_1", get_rows_q4_1_len, get_rows_q4_1_data, "main", 3, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1);
-    ggml_vk_create_pipeline(ctx, ctx->pipeline_get_rows[GGML_TYPE_Q5_0], "get_rows_q5_0", get_rows_q5_0_len, get_rows_q5_0_data, "main", 3, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1);
-    ggml_vk_create_pipeline(ctx, ctx->pipeline_get_rows[GGML_TYPE_Q5_1], "get_rows_q5_1", get_rows_q5_1_len, get_rows_q5_1_data, "main", 3, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1);
-    ggml_vk_create_pipeline(ctx, ctx->pipeline_get_rows[GGML_TYPE_Q8_0], "get_rows_q8_0", get_rows_q8_0_len, get_rows_q8_0_data, "main", 3, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1);
+    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_get_rows[GGML_TYPE_F16 ], "get_rows_f16",  get_rows_f16_len,  get_rows_f16_data,  "main", 3, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1);
+    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_get_rows[GGML_TYPE_Q4_0], "get_rows_q4_0", get_rows_q4_0_len, get_rows_q4_0_data, "main", 3, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1);
+    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_get_rows[GGML_TYPE_Q4_1], "get_rows_q4_1", get_rows_q4_1_len, get_rows_q4_1_data, "main", 3, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1);
+    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_get_rows[GGML_TYPE_Q5_0], "get_rows_q5_0", get_rows_q5_0_len, get_rows_q5_0_data, "main", 3, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1);
+    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_get_rows[GGML_TYPE_Q5_1], "get_rows_q5_1", get_rows_q5_1_len, get_rows_q5_1_data, "main", 3, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1);
+    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_get_rows[GGML_TYPE_Q8_0], "get_rows_q8_0", get_rows_q8_0_len, get_rows_q8_0_data, "main", 3, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1);
 
-    ggml_vk_create_pipeline(ctx, ctx->pipeline_get_rows_f32[GGML_TYPE_F32 ], "get_rows_f16_f32",  get_rows_f16_f32_len,  get_rows_f16_f32_data,  "main", 3, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1);
-    ggml_vk_create_pipeline(ctx, ctx->pipeline_get_rows_f32[GGML_TYPE_Q4_0], "get_rows_q4_0_f32", get_rows_q4_0_f32_len, get_rows_q4_0_f32_data, "main", 3, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1);
-    ggml_vk_create_pipeline(ctx, ctx->pipeline_get_rows_f32[GGML_TYPE_Q4_1], "get_rows_q4_1_f32", get_rows_q4_1_f32_len, get_rows_q4_1_f32_data, "main", 3, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1);
-    ggml_vk_create_pipeline(ctx, ctx->pipeline_get_rows_f32[GGML_TYPE_Q5_0], "get_rows_q5_0_f32", get_rows_q5_0_f32_len, get_rows_q5_0_f32_data, "main", 3, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1);
-    ggml_vk_create_pipeline(ctx, ctx->pipeline_get_rows_f32[GGML_TYPE_Q5_1], "get_rows_q5_1_f32", get_rows_q5_1_f32_len, get_rows_q5_1_f32_data, "main", 3, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1);
-    ggml_vk_create_pipeline(ctx, ctx->pipeline_get_rows_f32[GGML_TYPE_Q8_0], "get_rows_q8_0_f32", get_rows_q8_0_f32_len, get_rows_q8_0_f32_data, "main", 3, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1);
+    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_get_rows_f32[GGML_TYPE_F32 ], "get_rows_f16_f32",  get_rows_f16_f32_len,  get_rows_f16_f32_data,  "main", 3, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1);
+    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_get_rows_f32[GGML_TYPE_Q4_0], "get_rows_q4_0_f32", get_rows_q4_0_f32_len, get_rows_q4_0_f32_data, "main", 3, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1);
+    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_get_rows_f32[GGML_TYPE_Q4_1], "get_rows_q4_1_f32", get_rows_q4_1_f32_len, get_rows_q4_1_f32_data, "main", 3, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1);
+    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_get_rows_f32[GGML_TYPE_Q5_0], "get_rows_q5_0_f32", get_rows_q5_0_f32_len, get_rows_q5_0_f32_data, "main", 3, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1);
+    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_get_rows_f32[GGML_TYPE_Q5_1], "get_rows_q5_1_f32", get_rows_q5_1_f32_len, get_rows_q5_1_f32_data, "main", 3, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1);
+    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_get_rows_f32[GGML_TYPE_Q8_0], "get_rows_q8_0_f32", get_rows_q8_0_f32_len, get_rows_q8_0_f32_data, "main", 3, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1);
 
-    ggml_vk_create_pipeline(ctx, ctx->pipeline_matmul_split_k_reduce, "split_k_reduce", split_k_reduce_len, split_k_reduce_data, "main", 2, 2 * sizeof(uint32_t), {256, 1, 1}, {}, 1);
+    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_matmul_split_k_reduce, "split_k_reduce", split_k_reduce_len, split_k_reduce_data, "main", 2, 2 * sizeof(uint32_t), {256, 1, 1}, {}, 1);
 
-    ggml_vk_create_pipeline(ctx, ctx->pipeline_mul_mat_vec_p021_f16_f32, "mul_mat_vec_p021_f16_f32", mul_mat_vec_p021_f16_f32_len, mul_mat_vec_p021_f16_f32_data, "main", 3, 6 * sizeof(uint32_t), {1, 1, 1}, {}, 1);
-    ggml_vk_create_pipeline(ctx, ctx->pipeline_mul_mat_vec_nc_f16_f32, "mul_mat_vec_nc_f16_f32", mul_mat_vec_nc_f16_f32_len, mul_mat_vec_nc_f16_f32_data, "main", 3, 7 * sizeof(uint32_t), {1, 1, 1}, {}, 1);
+    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_mul_mat_vec_p021_f16_f32, "mul_mat_vec_p021_f16_f32", mul_mat_vec_p021_f16_f32_len, mul_mat_vec_p021_f16_f32_data, "main", 3, 6 * sizeof(uint32_t), {1, 1, 1}, {}, 1);
+    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_mul_mat_vec_nc_f16_f32, "mul_mat_vec_nc_f16_f32", mul_mat_vec_nc_f16_f32_len, mul_mat_vec_nc_f16_f32_data, "main", 3, 7 * sizeof(uint32_t), {1, 1, 1}, {}, 1);
 
-    ggml_vk_create_pipeline(ctx, ctx->pipeline_norm_f32, "norm_f32", norm_f32_len, norm_f32_data, "main", 2, sizeof(vk_op_push_constants), {1, 1, 1}, {}, 1);
-    ggml_vk_create_pipeline(ctx, ctx->pipeline_rms_norm_f32, "rms_norm_f32", rms_norm_f32_len, rms_norm_f32_data, "main", 2, sizeof(vk_op_push_constants), {1, 1, 1}, {}, 1);
+    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_norm_f32, "norm_f32", norm_f32_len, norm_f32_data, "main", 2, sizeof(vk_op_push_constants), {1, 1, 1}, {}, 1);
+    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_rms_norm_f32, "rms_norm_f32", rms_norm_f32_len, rms_norm_f32_data, "main", 2, sizeof(vk_op_push_constants), {1, 1, 1}, {}, 1);
 
-    ggml_vk_create_pipeline(ctx, ctx->pipeline_cpy_f32_f32, "cpy_f32_f32", cpy_f32_f32_len, cpy_f32_f32_data, "main", 2, sizeof(vk_op_cpy_push_constants), {512, 1, 1}, {}, 1);
-    ggml_vk_create_pipeline(ctx, ctx->pipeline_cpy_f32_f16, "cpy_f32_f16", cpy_f32_f16_len, cpy_f32_f16_data, "main", 2, sizeof(vk_op_cpy_push_constants), {512, 1, 1}, {}, 1);
-    ggml_vk_create_pipeline(ctx, ctx->pipeline_cpy_f16_f16, "cpy_f16_f16", cpy_f16_f16_len, cpy_f16_f16_data, "main", 2, sizeof(vk_op_cpy_push_constants), {512, 1, 1}, {}, 1);
+    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_cpy_f32_f32, "cpy_f32_f32", cpy_f32_f32_len, cpy_f32_f32_data, "main", 2, sizeof(vk_op_cpy_push_constants), {512, 1, 1}, {}, 1);
+    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_cpy_f32_f16, "cpy_f32_f16", cpy_f32_f16_len, cpy_f32_f16_data, "main", 2, sizeof(vk_op_cpy_push_constants), {512, 1, 1}, {}, 1);
+    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_cpy_f16_f16, "cpy_f16_f16", cpy_f16_f16_len, cpy_f16_f16_data, "main", 2, sizeof(vk_op_cpy_push_constants), {512, 1, 1}, {}, 1);
 
-    ggml_vk_create_pipeline(ctx, ctx->pipeline_add_f32, "add_f32", add_f32_len, add_f32_data, "main", 3, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1);
+    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_add_f32, "add_f32", add_f32_len, add_f32_data, "main", 3, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1);
 
-    ggml_vk_create_pipeline(ctx, ctx->pipeline_mul_f32, "mul_f32", mul_f32_len, mul_f32_data, "main", 3, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1);
+    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_mul_f32, "mul_f32", mul_f32_len, mul_f32_data, "main", 3, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1);
 
-    ggml_vk_create_pipeline(ctx, ctx->pipeline_scale_f32, "scale_f32", scale_f32_len, scale_f32_data, "main", 2, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1);
+    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_scale_f32, "scale_f32", scale_f32_len, scale_f32_data, "main", 2, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1);
 
-    ggml_vk_create_pipeline(ctx, ctx->pipeline_sqr_f32, "sqr_f32", sqr_f32_len, sqr_f32_data, "main", 2, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1);
+    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_sqr_f32, "sqr_f32", sqr_f32_len, sqr_f32_data, "main", 2, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1);
 
-    ggml_vk_create_pipeline(ctx, ctx->pipeline_clamp_f32, "clamp_f32", clamp_f32_len, clamp_f32_data, "main", 2, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1);
+    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_clamp_f32, "clamp_f32", clamp_f32_len, clamp_f32_data, "main", 2, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1);
 
-    ggml_vk_create_pipeline(ctx, ctx->pipeline_gelu_f32, "gelu_f32", gelu_f32_len, gelu_f32_data, "main", 2, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1);
-    ggml_vk_create_pipeline(ctx, ctx->pipeline_silu_f32, "silu_f32", silu_f32_len, silu_f32_data, "main", 2, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1);
-    ggml_vk_create_pipeline(ctx, ctx->pipeline_relu_f32, "relu_f32", relu_f32_len, relu_f32_data, "main", 2, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1);
+    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_gelu_f32, "gelu_f32", gelu_f32_len, gelu_f32_data, "main", 2, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1);
+    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_silu_f32, "silu_f32", silu_f32_len, silu_f32_data, "main", 2, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1);
+    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_relu_f32, "relu_f32", relu_f32_len, relu_f32_data, "main", 2, sizeof(vk_op_push_constants), {512, 1, 1}, {}, 1);
 
-    ggml_vk_create_pipeline(ctx, ctx->pipeline_diag_mask_inf_f32, "diag_mask_inf_f32", diag_mask_inf_f32_len, diag_mask_inf_f32_data, "main", 2, sizeof(vk_op_diag_mask_push_constants), {512, 1, 1}, {}, 1);
+    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_diag_mask_inf_f32, "diag_mask_inf_f32", diag_mask_inf_f32_len, diag_mask_inf_f32_data, "main", 2, sizeof(vk_op_diag_mask_push_constants), {512, 1, 1}, {}, 1);
 
-    ggml_vk_create_pipeline(ctx, ctx->pipeline_soft_max_f32, "soft_max_f32", soft_max_f32_len, soft_max_f32_data, "main", 3, sizeof(vk_op_push_constants), {1, 1, 1}, {}, 1);
+    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_soft_max_f32, "soft_max_f32", soft_max_f32_len, soft_max_f32_data, "main", 3, sizeof(vk_op_push_constants), {1, 1, 1}, {}, 1);
 
-    ggml_vk_create_pipeline(ctx, ctx->pipeline_rope_f32, "rope_f32", rope_f32_len, rope_f32_data, "main", 3, sizeof(vk_op_rope_push_constants), {1, 512, 1}, {}, 1);
-    ggml_vk_create_pipeline(ctx, ctx->pipeline_rope_f16, "rope_f16", rope_f16_len, rope_f16_data, "main", 3, sizeof(vk_op_rope_push_constants), {1, 512, 1}, {}, 1);
+    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_rope_f32, "rope_f32", rope_f32_len, rope_f32_data, "main", 3, sizeof(vk_op_rope_push_constants), {1, 512, 1}, {}, 1);
+    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_rope_f16, "rope_f16", rope_f16_len, rope_f16_data, "main", 3, sizeof(vk_op_rope_push_constants), {1, 512, 1}, {}, 1);
 
-    ggml_vk_create_pipeline(ctx, ctx->pipeline_rope_neox_f32, "rope_neox_f32", rope_neox_f32_len, rope_neox_f32_data, "main", 3, sizeof(vk_op_rope_neox_push_constants), {1, 512, 1}, {}, 1);
-    ggml_vk_create_pipeline(ctx, ctx->pipeline_rope_neox_f16, "rope_neox_f16", rope_neox_f16_len, rope_neox_f16_data, "main", 3, sizeof(vk_op_rope_neox_push_constants), {1, 512, 1}, {}, 1);
+    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_rope_neox_f32, "rope_neox_f32", rope_neox_f32_len, rope_neox_f32_data, "main", 3, sizeof(vk_op_rope_neox_push_constants), {1, 512, 1}, {}, 1);
+    ggml_vk_create_pipeline(ctx, ctx->device->pipeline_rope_neox_f16, "rope_neox_f16", rope_neox_f16_len, rope_neox_f16_data, "main", 3, sizeof(vk_op_rope_neox_push_constants), {1, 512, 1}, {}, 1);
 }
 
 static void ggml_vk_print_gpu_info(size_t idx) {
@@ -1157,140 +1204,148 @@ void ggml_vk_init(ggml_backend_vk_context * ctx, size_t idx) {
         throw std::runtime_error("Device not found");
     }
 
-    vk_instance.devices[idx] = std::make_shared<vk_device>();
-    ctx->device = vk_instance.devices[idx];
-    ctx->device.lock()->physical_device = devices[dev_num];
-    std::vector<vk::ExtensionProperties> ext_props = ctx->device.lock()->physical_device.enumerateDeviceExtensionProperties();
+    ctx->device = ggml_vk_get_device(idx);
+    if (!ctx->device->initialized) {
+        ctx->device->physical_device = devices[dev_num];
+        std::vector<vk::ExtensionProperties> ext_props = ctx->device->physical_device.enumerateDeviceExtensionProperties();
 
-    bool maintenance4_support = false;
+        bool maintenance4_support = false;
 
-    // Check if maintenance4 is supported
-    for (auto properties : ext_props) {
-        if (strcmp("VK_KHR_maintenance4", properties.extensionName) == 0) {
-            maintenance4_support = true;
+        // Check if maintenance4 is supported
+        for (auto properties : ext_props) {
+            if (strcmp("VK_KHR_maintenance4", properties.extensionName) == 0) {
+                maintenance4_support = true;
+            }
         }
-    }
 
-    vk::PhysicalDeviceProperties2 props2;
-    vk::PhysicalDeviceMaintenance3Properties props3;
-    vk::PhysicalDeviceMaintenance4Properties props4;
-    vk::PhysicalDeviceSubgroupProperties subgroup_props;
-    props2.pNext = &props3;
-    props3.pNext = &subgroup_props;
-    if (maintenance4_support) {
-        subgroup_props.pNext = &props4;
-    }
-    ctx->device.lock()->physical_device.getProperties2(&props2);
-    ctx->device.lock()->properties = props2.properties;
-
-    if (maintenance4_support) {
-        ctx->device.lock()->max_memory_allocation_size = std::min(props3.maxMemoryAllocationSize, props4.maxBufferSize);
-    } else {
-        ctx->device.lock()->max_memory_allocation_size = props3.maxMemoryAllocationSize;
-    }
-
-    ctx->device.lock()->vendor_id = ctx->device.lock()->properties.vendorID;
-    ctx->device.lock()->subgroup_size = subgroup_props.subgroupSize;
-    ctx->device.lock()->uma = ctx->device.lock()->properties.deviceType == vk::PhysicalDeviceType::eIntegratedGpu;
-
-    bool fp16_storage = false;
-    bool fp16_compute = false;
-
-    for (auto properties : ext_props) {
-        if (strcmp("VK_KHR_16bit_storage", properties.extensionName) == 0) {
-            fp16_storage = true;
-        } else if (strcmp("VK_KHR_shader_float16_int8", properties.extensionName) == 0) {
-            fp16_compute = true;
+        vk::PhysicalDeviceProperties2 props2;
+        vk::PhysicalDeviceMaintenance3Properties props3;
+        vk::PhysicalDeviceMaintenance4Properties props4;
+        vk::PhysicalDeviceSubgroupProperties subgroup_props;
+        props2.pNext = &props3;
+        props3.pNext = &subgroup_props;
+        if (maintenance4_support) {
+            subgroup_props.pNext = &props4;
         }
-    }
+        ctx->device->physical_device.getProperties2(&props2);
+        ctx->device->properties = props2.properties;
 
-    const char* GGML_VULKAN_DISABLE_F16 = getenv("GGML_VULKAN_DISABLE_F16");
-    bool force_disable_f16 = GGML_VULKAN_DISABLE_F16 != nullptr;
+        if (maintenance4_support) {
+            ctx->device->max_memory_allocation_size = std::min(props3.maxMemoryAllocationSize, props4.maxBufferSize);
+        } else {
+            ctx->device->max_memory_allocation_size = props3.maxMemoryAllocationSize;
+        }
 
-    ctx->device.lock()->fp16 = !force_disable_f16 && fp16_storage && fp16_compute;
+        ctx->device->vendor_id = ctx->device->properties.vendorID;
+        ctx->device->subgroup_size = subgroup_props.subgroupSize;
+        ctx->device->uma = ctx->device->properties.deviceType == vk::PhysicalDeviceType::eIntegratedGpu;
 
-    std::vector<vk::QueueFamilyProperties> queue_family_props = ctx->device.lock()->physical_device.getQueueFamilyProperties();
+        bool fp16_storage = false;
+        bool fp16_compute = false;
 
-    // Try to find a non-graphics compute queue and transfer-focused queues
-    const uint32_t compute_queue_family_index = ggml_vk_find_queue_family_index(queue_family_props, vk::QueueFlagBits::eCompute, vk::QueueFlagBits::eGraphics, -1, 1);
-    const uint32_t transfer_queue_family_index = ggml_vk_find_queue_family_index(queue_family_props, vk::QueueFlagBits::eTransfer, vk::QueueFlagBits::eCompute | vk::QueueFlagBits::eGraphics, compute_queue_family_index, 1);
+        for (auto properties : ext_props) {
+            if (strcmp("VK_KHR_16bit_storage", properties.extensionName) == 0) {
+                fp16_storage = true;
+            } else if (strcmp("VK_KHR_shader_float16_int8", properties.extensionName) == 0) {
+                fp16_compute = true;
+            }
+        }
 
-    const float priorities[] = { 1.0f, 1.0f };
-    ctx->device.lock()->single_queue = compute_queue_family_index == transfer_queue_family_index && queue_family_props[compute_queue_family_index].queueCount == 1;
+        const char* GGML_VULKAN_DISABLE_F16 = getenv("GGML_VULKAN_DISABLE_F16");
+        bool force_disable_f16 = GGML_VULKAN_DISABLE_F16 != nullptr;
 
-    std::vector<vk::DeviceQueueCreateInfo> device_queue_create_infos;
-    if (compute_queue_family_index != transfer_queue_family_index) {
-        device_queue_create_infos.push_back({vk::DeviceQueueCreateFlags(), compute_queue_family_index, 1, priorities});
-        device_queue_create_infos.push_back({vk::DeviceQueueCreateFlags(), transfer_queue_family_index, 1, priorities + 1});
-    } else if(!ctx->device.lock()->single_queue) {
-        device_queue_create_infos.push_back({vk::DeviceQueueCreateFlags(), compute_queue_family_index, 2, priorities});
-    } else {
-        device_queue_create_infos.push_back({vk::DeviceQueueCreateFlags(), compute_queue_family_index, 1, priorities});
-    }
-    vk::DeviceCreateInfo device_create_info;
-    std::vector<const char *> device_extensions;
-    vk::PhysicalDeviceFeatures device_features = ctx->device.lock()->physical_device.getFeatures();
+        ctx->device->fp16 = !force_disable_f16 && fp16_storage && fp16_compute;
 
-    VkPhysicalDeviceFeatures2 device_features2;
-    device_features2.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_FEATURES_2;
-    device_features2.pNext = nullptr;
-    device_features2.features = (VkPhysicalDeviceFeatures)device_features;
+        std::vector<vk::QueueFamilyProperties> queue_family_props = ctx->device->physical_device.getQueueFamilyProperties();
 
-    VkPhysicalDeviceVulkan11Features vk11_features;
-    vk11_features.pNext = nullptr;
-    vk11_features.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_VULKAN_1_1_FEATURES;
-    device_features2.pNext = &vk11_features;
+        // Try to find a non-graphics compute queue and transfer-focused queues
+        const uint32_t compute_queue_family_index = ggml_vk_find_queue_family_index(queue_family_props, vk::QueueFlagBits::eCompute, vk::QueueFlagBits::eGraphics, -1, 1);
+        const uint32_t transfer_queue_family_index = ggml_vk_find_queue_family_index(queue_family_props, vk::QueueFlagBits::eTransfer, vk::QueueFlagBits::eCompute | vk::QueueFlagBits::eGraphics, compute_queue_family_index, 1);
 
-    VkPhysicalDeviceVulkan12Features vk12_features;
-    vk12_features.pNext = nullptr;
-    vk12_features.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_VULKAN_1_2_FEATURES;
-    vk11_features.pNext = &vk12_features;
+        const float priorities[] = { 1.0f, 1.0f };
+        ctx->device->single_queue = compute_queue_family_index == transfer_queue_family_index && queue_family_props[compute_queue_family_index].queueCount == 1;
 
-    vkGetPhysicalDeviceFeatures2(ctx->device.lock()->physical_device, &device_features2);
+        std::vector<vk::DeviceQueueCreateInfo> device_queue_create_infos;
+        if (compute_queue_family_index != transfer_queue_family_index) {
+            device_queue_create_infos.push_back({vk::DeviceQueueCreateFlags(), compute_queue_family_index, 1, priorities});
+            device_queue_create_infos.push_back({vk::DeviceQueueCreateFlags(), transfer_queue_family_index, 1, priorities + 1});
+        } else if(!ctx->device->single_queue) {
+            device_queue_create_infos.push_back({vk::DeviceQueueCreateFlags(), compute_queue_family_index, 2, priorities});
+        } else {
+            device_queue_create_infos.push_back({vk::DeviceQueueCreateFlags(), compute_queue_family_index, 1, priorities});
+        }
+        vk::DeviceCreateInfo device_create_info;
+        std::vector<const char *> device_extensions;
+        vk::PhysicalDeviceFeatures device_features = ctx->device->physical_device.getFeatures();
 
-    ctx->device.lock()->fp16 = ctx->device.lock()->fp16 && vk12_features.shaderFloat16;
+        VkPhysicalDeviceFeatures2 device_features2;
+        device_features2.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_FEATURES_2;
+        device_features2.pNext = nullptr;
+        device_features2.features = (VkPhysicalDeviceFeatures)device_features;
 
-    if (!vk11_features.storageBuffer16BitAccess) {
-        std::cerr << "ggml_vulkan: device " << GGML_VK_NAME << idx << " does not support 16-bit storage." << std::endl;
-        throw std::runtime_error("Unsupported device");
-    }
+        VkPhysicalDeviceVulkan11Features vk11_features;
+        vk11_features.pNext = nullptr;
+        vk11_features.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_VULKAN_1_1_FEATURES;
+        device_features2.pNext = &vk11_features;
 
-    device_extensions.push_back("VK_KHR_16bit_storage");
+        VkPhysicalDeviceVulkan12Features vk12_features;
+        vk12_features.pNext = nullptr;
+        vk12_features.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_VULKAN_1_2_FEATURES;
+        vk11_features.pNext = &vk12_features;
+
+        vkGetPhysicalDeviceFeatures2(ctx->device->physical_device, &device_features2);
+
+        ctx->device->fp16 = ctx->device->fp16 && vk12_features.shaderFloat16;
+
+        if (!vk11_features.storageBuffer16BitAccess) {
+            std::cerr << "ggml_vulkan: device " << GGML_VK_NAME << idx << " does not support 16-bit storage." << std::endl;
+            throw std::runtime_error("Unsupported device");
+        }
+
+        device_extensions.push_back("VK_KHR_16bit_storage");
 
 #ifdef GGML_VULKAN_VALIDATE
-    device_extensions.push_back("VK_KHR_shader_non_semantic_info");
+        device_extensions.push_back("VK_KHR_shader_non_semantic_info");
 #endif
 
-    if (ctx->device.lock()->fp16) {
-        device_extensions.push_back("VK_KHR_shader_float16_int8");
-    }
-    ctx->device.lock()->name = ctx->device.lock()->properties.deviceName.data();
+        if (ctx->device->fp16) {
+            device_extensions.push_back("VK_KHR_shader_float16_int8");
+        }
+        ctx->device->name = ctx->device->properties.deviceName.data();
 
-    device_create_info = {
-        vk::DeviceCreateFlags(),
-        device_queue_create_infos,
-        {},
-        device_extensions
-    };
-    device_create_info.setPNext(&device_features2);
-    ctx->device.lock()->device = ctx->device.lock()->physical_device.createDevice(device_create_info);
+        device_create_info = {
+            vk::DeviceCreateFlags(),
+            device_queue_create_infos,
+            {},
+            device_extensions
+        };
+        device_create_info.setPNext(&device_features2);
+        ctx->device->device = ctx->device->physical_device.createDevice(device_create_info);
 
-    ctx->device.lock()->descriptor_set_mode = VK_DEVICE_DESCRIPTOR_POOL_MODE_UNKNOWN;
+        ctx->device->descriptor_set_mode = VK_DEVICE_DESCRIPTOR_POOL_MODE_UNKNOWN;
 
-    // Shaders
-    ggml_vk_load_shaders(ctx);
+        // Compute queue
+        ggml_vk_create_queue(ctx, ctx->device->compute_queue, compute_queue_family_index, 0, { vk::PipelineStageFlagBits::eComputeShader | vk::PipelineStageFlagBits::eTransfer });
 
-    // Queues
-    ggml_vk_create_queue(ctx, ctx->device.lock()->compute_queue, compute_queue_family_index, 0, { vk::PipelineStageFlagBits::eComputeShader | vk::PipelineStageFlagBits::eTransfer });
-    if (!ctx->device.lock()->single_queue) {
-        const uint32_t transfer_queue_index = compute_queue_family_index == transfer_queue_family_index ? 1 : 0;
-        ggml_vk_create_queue(ctx, ctx->device.lock()->transfer_queue, transfer_queue_family_index, transfer_queue_index, { vk::PipelineStageFlagBits::eTransfer });
-    } else {
-        // TODO: Use pointer or reference to avoid copy
-        ctx->device.lock()->transfer_queue = ctx->device.lock()->compute_queue;
+        // Shaders
+        ggml_vk_load_shaders(ctx);
+
+        if (!ctx->device->single_queue) {
+            const uint32_t transfer_queue_index = compute_queue_family_index == transfer_queue_family_index ? 1 : 0;
+            ggml_vk_create_queue(ctx, ctx->device->transfer_queue, transfer_queue_family_index, transfer_queue_index, { vk::PipelineStageFlagBits::eTransfer });
+        } else {
+            // TODO: Use pointer or reference to avoid copy
+            ctx->device->transfer_queue = ctx->device->compute_queue;
+        }
+
+        ctx->device->idx = dev_num;
+        ctx->device->initialized = true;
+    } else if (ctx->device->idx != dev_num) {
+        std::cerr << "ggml_vulkan: Device " << ctx->device->name << " already initialized with index " << ctx->device->idx << ", but trying to reinitialize with index " << dev_num << std::endl;
+        throw std::runtime_error("Device already initialized");
     }
 
-    ctx->fence = ctx->device.lock()->device.createFence({});
+    ctx->fence = ctx->device->device.createFence({});
 
     ctx->compute_ctx = nullptr;
     ctx->transfer_ctx = nullptr;
@@ -1308,7 +1363,7 @@ void ggml_vk_init(ggml_backend_vk_context * ctx, size_t idx) {
 #endif
 }
 
-static vk_pipeline* ggml_vk_get_to_fp16(ggml_backend_vk_context * ctx, ggml_type type) {
+static vk_pipeline ggml_vk_get_to_fp16(ggml_backend_vk_context * ctx, ggml_type type) {
 #ifdef GGML_VULKAN_DEBUG
     std::cerr << "ggml_vk_get_to_fp16()" << std::endl;
 #endif
@@ -1329,10 +1384,10 @@ static vk_pipeline* ggml_vk_get_to_fp16(ggml_backend_vk_context * ctx, ggml_type
             return nullptr;
     }
 
-    return &ctx->pipeline_dequant[type];
+    return ctx->device->pipeline_dequant[type];
 }
 
-static vk_pipeline* ggml_vk_get_dequantize_mul_mat_vec(ggml_backend_vk_context * ctx, ggml_type type) {
+static vk_pipeline ggml_vk_get_dequantize_mul_mat_vec(ggml_backend_vk_context * ctx, ggml_type type) {
 #ifdef GGML_VULKAN_DEBUG
     std::cerr << "ggml_vk_get_dequantize_mul_mat_vec()" << std::endl;
 #endif
@@ -1353,7 +1408,7 @@ static vk_pipeline* ggml_vk_get_dequantize_mul_mat_vec(ggml_backend_vk_context *
             return nullptr;
     }
 
-    return &ctx->pipeline_dequant_mul_mat_vec_f32[type];
+    return ctx->device->pipeline_dequant_mul_mat_vec_f32[type];
 }
 
 static vk_buffer ggml_vk_pool_malloc(ggml_backend_vk_context * ctx, size_t size) {
@@ -1430,8 +1485,8 @@ static void * ggml_vk_host_malloc(ggml_backend_vk_context * ctx, size_t size) {
     if(!(buf->memory_property_flags & vk::MemoryPropertyFlagBits::eHostVisible)) {
         fprintf(stderr, "WARNING: failed to allocate %.2f MB of pinned memory\n",
             size/1024.0/1024.0);
-        ctx->device.lock()->device.freeMemory(buf->device_memory);
-        ctx->device.lock()->device.destroyBuffer(buf->buffer);
+        ctx->device->device.freeMemory(buf->device_memory);
+        ctx->device->device.destroyBuffer(buf->buffer);
         return nullptr;
     }
 
@@ -1495,30 +1550,30 @@ static vk_submission ggml_vk_begin_submission(ggml_backend_vk_context * ctx, vk_
 }
 
 static void ggml_vk_dispatch_pipeline(ggml_backend_vk_context * ctx, vk_context * subctx, vk_pipeline& pipeline, std::vector<vk_subbuffer>&& buffers, size_t push_constant_size, const void* push_constants, std::array<uint32_t, 3> elements) {
-    const uint32_t wg0 = CEIL_DIV(elements[0], pipeline.wg_denoms[0]);
-    const uint32_t wg1 = CEIL_DIV(elements[1], pipeline.wg_denoms[1]);
-    const uint32_t wg2 = CEIL_DIV(elements[2], pipeline.wg_denoms[2]);
+    const uint32_t wg0 = CEIL_DIV(elements[0], pipeline->wg_denoms[0]);
+    const uint32_t wg1 = CEIL_DIV(elements[1], pipeline->wg_denoms[1]);
+    const uint32_t wg2 = CEIL_DIV(elements[2], pipeline->wg_denoms[2]);
 #ifdef GGML_VULKAN_DEBUG
-    std::cerr << "ggml_vk_dispatch_pipeline(" << pipeline.name << ", (" << wg0 << "," << wg1 << "," << wg2 << "))" << std::endl;
+    std::cerr << "ggml_vk_dispatch_pipeline(" << pipeline->name << ", (" << wg0 << "," << wg1 << "," << wg2 << "))" << std::endl;
 #endif
     std::vector<vk::DescriptorBufferInfo> descriptor_buffer_infos;
     std::vector<vk::WriteDescriptorSet> write_descriptor_sets;
-    GGML_ASSERT(pipeline.descriptor_set_idx < pipeline.descriptor_sets.size());
-    GGML_ASSERT(buffers.size() == pipeline.parameter_count);
-    vk::DescriptorSet& descriptor_set = pipeline.descriptor_sets[pipeline.descriptor_set_idx++];
-    for (uint32_t i = 0; i < pipeline.parameter_count; i++) {
+    GGML_ASSERT(pipeline->descriptor_set_idx < pipeline->descriptor_sets.size());
+    GGML_ASSERT(buffers.size() == pipeline->parameter_count);
+    vk::DescriptorSet& descriptor_set = pipeline->descriptor_sets[pipeline->descriptor_set_idx++];
+    for (uint32_t i = 0; i < pipeline->parameter_count; i++) {
         descriptor_buffer_infos.push_back({buffers[i].buffer->buffer, buffers[i].offset, buffers[i].size});
     }
-    for (uint32_t i = 0; i < pipeline.parameter_count; i++) {
+    for (uint32_t i = 0; i < pipeline->parameter_count; i++) {
         write_descriptor_sets.push_back({descriptor_set, i, 0, 1, vk::DescriptorType::eStorageBuffer, nullptr, &descriptor_buffer_infos[i]});
     }
 
-    ctx->device.lock()->device.updateDescriptorSets(write_descriptor_sets, {});
+    ctx->device->device.updateDescriptorSets(write_descriptor_sets, {});
 
-    subctx->s->buffer.pushConstants(pipeline.layout, vk::ShaderStageFlagBits::eCompute, 0, push_constant_size, push_constants);
-    subctx->s->buffer.bindPipeline(vk::PipelineBindPoint::eCompute, pipeline.pipeline);
+    subctx->s->buffer.pushConstants(pipeline->layout, vk::ShaderStageFlagBits::eCompute, 0, push_constant_size, push_constants);
+    subctx->s->buffer.bindPipeline(vk::PipelineBindPoint::eCompute, pipeline->pipeline);
     subctx->s->buffer.bindDescriptorSets(vk::PipelineBindPoint::eCompute,
-                                pipeline.layout,
+                                pipeline->layout,
                                 0,
                                 { descriptor_set },
                                 {});
@@ -1775,7 +1830,7 @@ static void ggml_vk_buffer_write_2d(ggml_backend_vk_context * ctx, vk_buffer& ds
             memcpy((uint8_t *)dst->ptr + offset + i * width, (const uint8_t *) src + i * spitch, width);
         }
     } else {
-        vk_context * subctx = ggml_vk_create_context(ctx, ctx->device.lock()->transfer_queue);
+        vk_context * subctx = ggml_vk_create_context(ctx, ctx->device->transfer_queue);
         ggml_vk_ctx_begin(ctx, subctx);
         ggml_vk_buffer_write_2d_async(ctx, subctx, dst, offset, src, spitch, width, height, true);
         ggml_vk_ctx_end(subctx);
@@ -1785,8 +1840,8 @@ static void ggml_vk_buffer_write_2d(ggml_backend_vk_context * ctx, vk_buffer& ds
         }
 
         ggml_vk_submit(subctx, ctx->fence);
-        VK_CHECK(ctx->device.lock()->device.waitForFences({ ctx->fence }, true, UINT64_MAX), "vk_buffer_write_2d waitForFences");
-        ctx->device.lock()->device.resetFences({ ctx->fence });
+        VK_CHECK(ctx->device->device.waitForFences({ ctx->fence }, true, UINT64_MAX), "vk_buffer_write_2d waitForFences");
+        ctx->device->device.resetFences({ ctx->fence });
     }
 }
 
@@ -1871,14 +1926,14 @@ static void ggml_vk_buffer_read(ggml_backend_vk_context * ctx, vk_buffer& src, s
 
         memcpy(dst, (uint8_t *) src->ptr + offset, size);
     } else {
-        vk_context * subctx = ggml_vk_create_context(ctx, ctx->device.lock()->transfer_queue);
+        vk_context * subctx = ggml_vk_create_context(ctx, ctx->device->transfer_queue);
         ggml_vk_ctx_begin(ctx, subctx);
         ggml_vk_buffer_read_async(ctx, subctx, src, offset, dst, size, true);
         ggml_vk_ctx_end(subctx);
 
         ggml_vk_submit(subctx, ctx->fence);
-        VK_CHECK(ctx->device.lock()->device.waitForFences({ ctx->fence }, true, UINT64_MAX), "vk_buffer_read waitForFences");
-        ctx->device.lock()->device.resetFences({ ctx->fence });
+        VK_CHECK(ctx->device->device.waitForFences({ ctx->fence }, true, UINT64_MAX), "vk_buffer_read waitForFences");
+        ctx->device->device.resetFences({ ctx->fence });
 
         for (auto& cpy : subctx->out_memcpys) {
             memcpy(cpy.dst, cpy.src, cpy.n);
@@ -1908,13 +1963,13 @@ static void ggml_vk_buffer_copy(vk_buffer& dst, size_t dst_offset, vk_buffer& sr
 
         VkBufferCopy bc{ src_offset, dst_offset, size };
 
-        vk_context * subctx = ggml_vk_create_context(ctx, ctx->device.lock()->transfer_queue);
+        vk_context * subctx = ggml_vk_create_context(ctx, ctx->device->transfer_queue);
         ggml_vk_ctx_begin(ctx, subctx);
         ggml_vk_buffer_copy_async(subctx, dst, dst_offset, src, src_offset, size);
         ggml_vk_ctx_end(subctx);
         ggml_vk_submit(subctx, ctx->fence);
-        VK_CHECK(ctx->device.lock()->device.waitForFences({ ctx->fence }, true, UINT64_MAX), "vk_buffer_copy waitForFences");
-        ctx->device.lock()->device.resetFences({ ctx->fence });
+        VK_CHECK(ctx->device->device.waitForFences({ ctx->fence }, true, UINT64_MAX), "vk_buffer_copy waitForFences");
+        ctx->device->device.resetFences({ ctx->fence });
     } else {
 #ifdef GGML_VULKAN_DEBUG
     std::cerr << "ggml_vk_buffer_copy(MULTI_DEVICE, " << size << ")" << std::endl;
@@ -1942,14 +1997,14 @@ static void ggml_vk_buffer_memset(ggml_backend_vk_context * ctx, vk_buffer& dst,
     // Make sure ctx owns the buffer
     GGML_ASSERT(dst->ctx == ctx);
 
-    vk_context * subctx = ggml_vk_create_context(ctx, ctx->device.lock()->transfer_queue);
+    vk_context * subctx = ggml_vk_create_context(ctx, ctx->device->transfer_queue);
     ggml_vk_ctx_begin(ctx, subctx);
     subctx->s->buffer.fillBuffer(dst->buffer, offset, size, c);
     ggml_vk_ctx_end(subctx);
 
     ggml_vk_submit(subctx, ctx->fence);
-    VK_CHECK(ctx->device.lock()->device.waitForFences({ ctx->fence }, true, UINT64_MAX), "vk_memset waitForFences");
-    ctx->device.lock()->device.resetFences({ ctx->fence });
+    VK_CHECK(ctx->device->device.waitForFences({ ctx->fence }, true, UINT64_MAX), "vk_memset waitForFences");
+    ctx->device->device.resetFences({ ctx->fence });
 }
 
 static void ggml_vk_h2d_tensor_2d(ggml_backend_vk_context * ctx, vk_context * subctx, vk_buffer& dst, size_t offset, const ggml_tensor * src, uint64_t i3, uint64_t i2, uint64_t i1) {
@@ -2030,74 +2085,74 @@ static uint32_t ggml_vk_guess_matmul_pipeline_align(ggml_backend_vk_context * ct
     std::cerr << "ggml_vk_guess_matmul_pipeline_align(" << m << ", " << n << ")" << std::endl;
 #endif
     if (m <= 32 || n <= 32) {
-        return ctx->pipeline_matmul_f32_aligned_s.align;
+        return ctx->device->pipeline_matmul_f32_aligned_s->align;
     }
-    if (ctx->device.lock()->subgroup_size == 64 || m <= 64 || n <= 64) {
-        return ctx->pipeline_matmul_f32_aligned_m.align;
+    if (ctx->device->subgroup_size == 64 || m <= 64 || n <= 64) {
+        return ctx->device->pipeline_matmul_f32_aligned_m->align;
     }
-    return ctx->pipeline_matmul_f32_aligned_l.align;
+    return ctx->device->pipeline_matmul_f32_aligned_l->align;
 }
 
-static vk_pipeline* ggml_vk_guess_matmul_pipeline(ggml_backend_vk_context * ctx, bool bit16_x, bool bit16_y, int m, int n, bool aligned) {
+static vk_pipeline ggml_vk_guess_matmul_pipeline(ggml_backend_vk_context * ctx, bool bit16_x, bool bit16_y, int m, int n, bool aligned) {
 #ifdef GGML_VULKAN_DEBUG
     std::cerr << "ggml_vk_guess_matmul_pipeline(" << bit16_x << ", " << bit16_y << ", " << m << ", " << n << ", " << aligned << ")";
 #endif
     if (bit16_x && bit16_y) {
-        if (ctx->device.lock()->vendor_id == VK_VENDOR_ID_INTEL || m <= 32 || n <= 32) {
+        if (ctx->device->vendor_id == VK_VENDOR_ID_INTEL || m <= 32 || n <= 32) {
 #ifdef GGML_VULKAN_DEBUG
     std::cerr << " S" << std::endl;
 #endif
-            return aligned ? &ctx->pipeline_matmul_f16_aligned_s : &ctx->pipeline_matmul_f16_s;
+            return aligned ? ctx->device->pipeline_matmul_f16_aligned_s : ctx->device->pipeline_matmul_f16_s;
         }
-        if (ctx->device.lock()->subgroup_size == 64 || m <= 64 || n <= 64) {
+        if (ctx->device->subgroup_size == 64 || m <= 64 || n <= 64) {
 #ifdef GGML_VULKAN_DEBUG
     std::cerr << " M" << std::endl;
 #endif
-            return aligned ? &ctx->pipeline_matmul_f16_aligned_m : &ctx->pipeline_matmul_f16_m;
+            return aligned ? ctx->device->pipeline_matmul_f16_aligned_m : ctx->device->pipeline_matmul_f16_m;
         }
 #ifdef GGML_VULKAN_DEBUG
     std::cerr << " L" << std::endl;
 #endif
-        return aligned ? &ctx->pipeline_matmul_f16_aligned_l : &ctx->pipeline_matmul_f16_l;
+        return aligned ? ctx->device->pipeline_matmul_f16_aligned_l : ctx->device->pipeline_matmul_f16_l;
     }
     if (bit16_x && !bit16_y) {
-        if (ctx->device.lock()->vendor_id == VK_VENDOR_ID_INTEL || m <= 32 || n <= 32) {
+        if (ctx->device->vendor_id == VK_VENDOR_ID_INTEL || m <= 32 || n <= 32) {
 #ifdef GGML_VULKAN_DEBUG
     std::cerr << " S" << std::endl;
 #endif
-            return aligned ? &ctx->pipeline_matmul_f16_f32_aligned_s : &ctx->pipeline_matmul_f16_f32_s;
+            return aligned ? ctx->device->pipeline_matmul_f16_f32_aligned_s : ctx->device->pipeline_matmul_f16_f32_s;
         }
-        if (ctx->device.lock()->subgroup_size == 64 || m <= 64 || n <= 64) {
+        if (ctx->device->subgroup_size == 64 || m <= 64 || n <= 64) {
 #ifdef GGML_VULKAN_DEBUG
     std::cerr << " M" << std::endl;
 #endif
-            return aligned ? &ctx->pipeline_matmul_f16_f32_aligned_m : &ctx->pipeline_matmul_f16_f32_m;
+            return aligned ? ctx->device->pipeline_matmul_f16_f32_aligned_m : ctx->device->pipeline_matmul_f16_f32_m;
         }
 #ifdef GGML_VULKAN_DEBUG
     std::cerr << " L" << std::endl;
 #endif
-        return aligned ? &ctx->pipeline_matmul_f16_f32_aligned_l : &ctx->pipeline_matmul_f16_f32_l;
+        return aligned ? ctx->device->pipeline_matmul_f16_f32_aligned_l : ctx->device->pipeline_matmul_f16_f32_l;
     }
     if (!bit16_x && bit16_y) {
         GGML_ASSERT(false);
     }
 
-    if (ctx->device.lock()->vendor_id == VK_VENDOR_ID_INTEL || m <= 32 || n <= 32) {
+    if (ctx->device->vendor_id == VK_VENDOR_ID_INTEL || m <= 32 || n <= 32) {
 #ifdef GGML_VULKAN_DEBUG
     std::cerr << " S" << std::endl;
 #endif
-        return aligned ? &ctx->pipeline_matmul_f32_aligned_s : &ctx->pipeline_matmul_f32_s;
+        return aligned ? ctx->device->pipeline_matmul_f32_aligned_s : ctx->device->pipeline_matmul_f32_s;
     }
-    if (ctx->device.lock()->subgroup_size == 64 || m <= 64 || n <= 64) {
+    if (ctx->device->subgroup_size == 64 || m <= 64 || n <= 64) {
 #ifdef GGML_VULKAN_DEBUG
     std::cerr << " M" << std::endl;
 #endif
-        return aligned ? &ctx->pipeline_matmul_f32_aligned_m : &ctx->pipeline_matmul_f32_m;
+        return aligned ? ctx->device->pipeline_matmul_f32_aligned_m : ctx->device->pipeline_matmul_f32_m;
     }
 #ifdef GGML_VULKAN_DEBUG
     std::cerr << " L" << std::endl;
 #endif
-    return aligned ? &ctx->pipeline_matmul_f32_aligned_l : &ctx->pipeline_matmul_f32_l;
+    return aligned ? ctx->device->pipeline_matmul_f32_aligned_l : ctx->device->pipeline_matmul_f32_l;
 }
 
 static void ggml_vk_matmul(ggml_backend_vk_context * ctx, vk_context * subctx, vk_pipeline& pipeline, vk_subbuffer&& a, vk_subbuffer&& b, vk_subbuffer&& d, vk_subbuffer&& split_k_buffer, uint32_t m, uint32_t n, uint32_t k, uint32_t stride_a, uint32_t stride_b, uint32_t stride_d, uint32_t split_k, uint32_t batch, uint32_t ne02, uint32_t ne12, uint32_t broadcast2, uint32_t broadcast3, uint32_t batch_stride_a, uint32_t batch_stride_b, uint32_t batch_stride_d) {
@@ -2115,10 +2170,10 @@ static void ggml_vk_matmul(ggml_backend_vk_context * ctx, vk_context * subctx, v
 
     const std::array<uint32_t, 14> pc1 = { m, n, k, stride_a, stride_b, stride_d, CEIL_DIV(k, split_k), ne02, ne12, broadcast2, broadcast3, batch_stride_a, batch_stride_b, batch_stride_d };
     // Make sure enough workgroups get assigned for split k to work
-    ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { a, b, split_k_buffer }, pc1.size() * sizeof(uint32_t), pc1.data(), { (CEIL_DIV(m, pipeline.wg_denoms[0]) * pipeline.wg_denoms[0]) * split_k, n, batch });
+    ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { a, b, split_k_buffer }, pc1.size() * sizeof(uint32_t), pc1.data(), { (CEIL_DIV(m, pipeline->wg_denoms[0]) * pipeline->wg_denoms[0]) * split_k, n, batch });
     ggml_vk_sync_buffers(subctx);
     const std::array<uint32_t, 2> pc2 = { (uint32_t)(m * n * batch), split_k };
-    ggml_vk_dispatch_pipeline(ctx, subctx, ctx->pipeline_matmul_split_k_reduce, { split_k_buffer, d }, pc2.size() * sizeof(uint32_t), pc2.data(), { m * n * batch, 1, 1 });
+    ggml_vk_dispatch_pipeline(ctx, subctx, ctx->device->pipeline_matmul_split_k_reduce, { split_k_buffer, d }, pc2.size() * sizeof(uint32_t), pc2.data(), { m * n * batch, 1, 1 });
 }
 
 static bool ggml_vk_dim01_contiguous(const ggml_tensor * tensor) {
@@ -2128,22 +2183,22 @@ static bool ggml_vk_dim01_contiguous(const ggml_tensor * tensor) {
         tensor->nb[3] == tensor->nb[2]*tensor->ne[2];
 }
 
-static vk_pipeline * ggml_vk_get_cpy_pipeline(ggml_backend_vk_context * ctx, ggml_type from, ggml_type to) {
+static vk_pipeline ggml_vk_get_cpy_pipeline(ggml_backend_vk_context * ctx, ggml_type from, ggml_type to) {
     if (from == GGML_TYPE_F32 && to == GGML_TYPE_F32) {
-        return &ctx->pipeline_cpy_f32_f32;
+        return ctx->device->pipeline_cpy_f32_f32;
     }
     if (from == GGML_TYPE_F32 && to == GGML_TYPE_F16) {
-        return &ctx->pipeline_cpy_f32_f16;
+        return ctx->device->pipeline_cpy_f32_f16;
     }
     if (from == GGML_TYPE_F16 && to == GGML_TYPE_F16) {
-        return &ctx->pipeline_cpy_f16_f16;
+        return ctx->device->pipeline_cpy_f16_f16;
     }
 
     std::cerr << "Missing CPY op for types: " << ggml_type_name(from) << " " << ggml_type_name(to) << std::endl;
     GGML_ASSERT(false);
 }
 
-static void ggml_vk_cpy_to_contiguous(ggml_backend_vk_context * ctx, vk_context * subctx, vk_pipeline * pipeline, const ggml_tensor * tensor, vk_subbuffer&& in, vk_subbuffer&& out, ggml_type buffer_type, bool aligned=true) {
+static void ggml_vk_cpy_to_contiguous(ggml_backend_vk_context * ctx, vk_context * subctx, vk_pipeline pipeline, const ggml_tensor * tensor, vk_subbuffer&& in, vk_subbuffer&& out, ggml_type buffer_type, bool aligned=true) {
 #ifdef GGML_VULKAN_DEBUG
     std::cerr << "ggml_vk_cpy_to_contiguous((" << tensor << ", type=" << tensor->type << ", backend=" << tensor->backend << ", ne0=" << tensor->ne[0] << ", ne1=" << tensor->ne[1] << ", ne2=" << tensor->ne[2] << ", ne3=" << tensor->ne[3] << ", nb0=" << tensor->nb[0] << ", nb1=" << tensor->nb[1] << ", nb2=" << tensor->nb[2] << ", nb3=" << tensor->nb[3] << "), ";
     std::cerr << "buffer in size=" << in.buffer->size << ", buffer out size=" << out.buffer->size << ")" << std::endl;
@@ -2153,7 +2208,7 @@ static void ggml_vk_cpy_to_contiguous(ggml_backend_vk_context * ctx, vk_context
 
     const uint32_t ne = tensor->ne[0] * tensor->ne[1] * tensor->ne[2];
 
-    const uint32_t nb2 = aligned ? ggml_vk_align_size(dst_type_size * tensor->ne[0] * tensor->ne[1], ctx->device.lock()->properties.limits.minStorageBufferOffsetAlignment) / dst_type_size : tensor->ne[0] * tensor->ne[1];
+    const uint32_t nb2 = aligned ? ggml_vk_align_size(dst_type_size * tensor->ne[0] * tensor->ne[1], ctx->device->properties.limits.minStorageBufferOffsetAlignment) / dst_type_size : tensor->ne[0] * tensor->ne[1];
 
     const vk_op_cpy_push_constants pc = {
         (uint32_t)ne,
@@ -2162,7 +2217,7 @@ static void ggml_vk_cpy_to_contiguous(ggml_backend_vk_context * ctx, vk_context
         0,
     };
     ggml_vk_sync_buffers(subctx);
-    ggml_vk_dispatch_pipeline(ctx, subctx, *pipeline, { in, out }, sizeof(vk_op_cpy_push_constants), &pc, { ne, 1, 1 });
+    ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { in, out }, sizeof(vk_op_cpy_push_constants), &pc, { ne, 1, 1 });
 }
 
 static void ggml_vk_mul_mat_q_f16(ggml_backend_vk_context * ctx, vk_context * subctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
@@ -2202,7 +2257,7 @@ static void ggml_vk_mul_mat_q_f16(ggml_backend_vk_context * ctx, vk_context * su
     bool src0_uma = false;
     bool src1_uma = false;
 
-    if (ctx->device.lock()->uma) {
+    if (ctx->device->uma) {
         ggml_vk_host_get(ctx, src0->data, d_Qx, qx_buf_offset);
         ggml_vk_host_get(ctx, src1->data, d_Qy, qy_buf_offset);
         src0_uma = d_Qx != nullptr;
@@ -2232,7 +2287,7 @@ static void ggml_vk_mul_mat_q_f16(ggml_backend_vk_context * ctx, vk_context * su
 
     const uint32_t split_k = ggml_vk_guess_split_k(ne01, ne11, ne10);
 
-    vk_pipeline * pipeline = ggml_vk_guess_matmul_pipeline(ctx, true, !f16_f32_kernel, ne01, ne11, aligned);
+    vk_pipeline pipeline = ggml_vk_guess_matmul_pipeline(ctx, true, !f16_f32_kernel, ne01, ne11, aligned);
 
     const uint64_t qx_sz = ggml_type_size(src0->type) * x_ne / ggml_blck_size(src0->type);
     const uint64_t qy_sz = ggml_type_size(src1->type) * y_ne / ggml_blck_size(src1->type);
@@ -2279,8 +2334,8 @@ static void ggml_vk_mul_mat_q_f16(ggml_backend_vk_context * ctx, vk_context * su
         GGML_ASSERT(qy_sz == y_sz);
     }
 
-    vk_pipeline * to_fp16_vk_0 = nullptr;
-    vk_pipeline * to_fp16_vk_1 = nullptr;
+    vk_pipeline to_fp16_vk_0 = nullptr;
+    vk_pipeline to_fp16_vk_1 = nullptr;
 
     if (x_non_contig) {
         to_fp16_vk_0 = ggml_vk_get_cpy_pipeline(ctx, src0->type, GGML_TYPE_F16);
@@ -2296,15 +2351,15 @@ static void ggml_vk_mul_mat_q_f16(ggml_backend_vk_context * ctx, vk_context * su
     GGML_ASSERT(!qy_needs_dequant || to_fp16_vk_1 != nullptr);  // NOLINT
 
     // Allocate descriptor sets
-    ggml_pipeline_allocate_descriptor_sets(ctx, *pipeline, 1);
+    ggml_pipeline_allocate_descriptor_sets(ctx, pipeline, 1);
     if (qx_needs_dequant) {
-        ggml_pipeline_allocate_descriptor_sets(ctx, *to_fp16_vk_0, 1);
+        ggml_pipeline_allocate_descriptor_sets(ctx, to_fp16_vk_0, 1);
     }
     if (qy_needs_dequant) {
-        ggml_pipeline_allocate_descriptor_sets(ctx, *to_fp16_vk_1, 1);
+        ggml_pipeline_allocate_descriptor_sets(ctx, to_fp16_vk_1, 1);
     }
     if (split_k > 1) {
-        ggml_pipeline_allocate_descriptor_sets(ctx, ctx->pipeline_matmul_split_k_reduce, 1);
+        ggml_pipeline_allocate_descriptor_sets(ctx, ctx->device->pipeline_matmul_split_k_reduce, 1);
     }
 
     if (x_non_contig) {
@@ -2317,9 +2372,9 @@ static void ggml_vk_mul_mat_q_f16(ggml_backend_vk_context * ctx, vk_context * su
         }
 
         if (qx_needs_dequant) {
-            const std::vector<uint32_t> pc = { (uint32_t)ne01, (uint32_t)ne10, (uint32_t)ne10, (uint32_t)ne10, (uint32_t)(ggml_nelements(src0) / 32) };  // TODO: replace with subgroup size
+            const std::vector<uint32_t> pc = { (uint32_t)ne01, (uint32_t)ne10, (uint32_t)ne10, (uint32_t)ne10, (uint32_t)(ggml_nelements(src0)) };
             ggml_vk_sync_buffers(subctx);
-            ggml_vk_dispatch_pipeline(ctx, subctx, *to_fp16_vk_0, { { d_Qx, qx_buf_offset, qx_sz * ne02 * ne03 }, { d_X, 0, x_sz * ne02 * ne03 } }, pc.size() * sizeof(uint32_t), pc.data(), { (uint32_t)(x_ne * ne02 * ne03), 1, 1});
+            ggml_vk_dispatch_pipeline(ctx, subctx, to_fp16_vk_0, { { d_Qx, qx_buf_offset, qx_sz * ne02 * ne03 }, { d_X, 0, x_sz * ne02 * ne03 } }, pc.size() * sizeof(uint32_t), pc.data(), { (uint32_t)(x_ne * ne02 * ne03), 1, 1});
         }
     }
     if (y_non_contig) {
@@ -2340,7 +2395,7 @@ static void ggml_vk_mul_mat_q_f16(ggml_backend_vk_context * ctx, vk_context * su
     }
 
     // compute
-    ggml_vk_matmul(ctx, subctx, *pipeline, { d_X, x_buf_offset, x_sz * ne02 * ne03 }, { d_Y, y_buf_offset, y_sz * ne12 * ne13 }, { d_D, d_buf_offset, d_sz * ne12 * ne13 }, { ctx->prealloc_split_k, 0, d_sz * ne12 * ne13 * split_k }, ne01, ne11, ne10, ne10, ne10, ne01, split_k, ne12*ne13, ne02, ne12, r2, r3, stride_batch_x, stride_batch_y, ne20*ne21);  // NOLINT
+    ggml_vk_matmul(ctx, subctx, pipeline, { d_X, x_buf_offset, x_sz * ne02 * ne03 }, { d_Y, y_buf_offset, y_sz * ne12 * ne13 }, { d_D, d_buf_offset, d_sz * ne12 * ne13 }, { ctx->prealloc_split_k, 0, d_sz * ne12 * ne13 * split_k }, ne01, ne11, ne10, ne10, ne10, ne01, split_k, ne12*ne13, ne02, ne12, r2, r3, stride_batch_x, stride_batch_y, ne20*ne21);  // NOLINT
 
     if (dst->backend == GGML_BACKEND_CPU) {
         // copy dst to host
@@ -2388,7 +2443,7 @@ static void ggml_vk_mul_mat_vec_q_f16(ggml_backend_vk_context * ctx, vk_context
     bool src0_uma = false;
     bool src1_uma = false;
 
-    if (ctx->device.lock()->uma) {
+    if (ctx->device->uma) {
         ggml_vk_host_get(ctx, src0->data, d_Qx, qx_buf_offset);
         ggml_vk_host_get(ctx, src1->data, d_Qy, qy_buf_offset);
         src0_uma = d_Qx != nullptr;
@@ -2410,9 +2465,9 @@ static void ggml_vk_mul_mat_vec_q_f16(ggml_backend_vk_context * ctx, vk_context
     const uint64_t y_ne = ne11 * ne10;
     const uint64_t d_ne = ne11 * ne01;
 
-    const uint64_t qx_sz = ggml_vk_align_size(ggml_type_size(src0->type) * x_ne / ggml_blck_size(src0->type), ctx->device.lock()->properties.limits.minStorageBufferOffsetAlignment);
+    const uint64_t qx_sz = ggml_vk_align_size(ggml_type_size(src0->type) * x_ne / ggml_blck_size(src0->type), ctx->device->properties.limits.minStorageBufferOffsetAlignment);
     const uint64_t qy_sz = ggml_type_size(src1->type) * y_ne / ggml_blck_size(src1->type);
-    const uint64_t x_sz = x_non_contig ? ggml_vk_align_size(ggml_type_size(src0->type) * x_ne, ctx->device.lock()->properties.limits.minStorageBufferOffsetAlignment) : qx_sz;
+    const uint64_t x_sz = x_non_contig ? ggml_vk_align_size(ggml_type_size(src0->type) * x_ne, ctx->device->properties.limits.minStorageBufferOffsetAlignment) : qx_sz;
     const uint64_t y_sz = f16_f32_kernel ? sizeof(float) * y_ne : sizeof(ggml_fp16_t) * y_ne;
     const uint64_t d_sz = sizeof(float) * d_ne;
 
@@ -2452,8 +2507,8 @@ static void ggml_vk_mul_mat_vec_q_f16(ggml_backend_vk_context * ctx, vk_context
         GGML_ASSERT(qy_sz == y_sz);
     }
 
-    vk_pipeline * to_fp16_vk_0 = nullptr;
-    vk_pipeline* to_fp16_vk_1 = nullptr;
+    vk_pipeline to_fp16_vk_0 = nullptr;
+    vk_pipeline to_fp16_vk_1 = nullptr;
     if (x_non_contig) {
         to_fp16_vk_0 = ggml_vk_get_cpy_pipeline(ctx, src0->type, src0->type);
     }
@@ -2462,22 +2517,22 @@ static void ggml_vk_mul_mat_vec_q_f16(ggml_backend_vk_context * ctx, vk_context
     } else {
         to_fp16_vk_1 = ggml_vk_get_to_fp16(ctx, src1->type);
     }
-    vk_pipeline* dmmv = ggml_vk_get_dequantize_mul_mat_vec(ctx, src0->type);
+    vk_pipeline dmmv = ggml_vk_get_dequantize_mul_mat_vec(ctx, src0->type);
     GGML_ASSERT(!qx_needs_dequant || to_fp16_vk_0 != nullptr);  // NOLINT
     GGML_ASSERT(!qy_needs_dequant || to_fp16_vk_1 != nullptr);  // NOLINT
     GGML_ASSERT(dmmv != nullptr);
 
     // Allocate descriptor sets
     if (qx_needs_dequant) {
-        ggml_pipeline_allocate_descriptor_sets(ctx, *to_fp16_vk_0, 1);
+        ggml_pipeline_allocate_descriptor_sets(ctx, to_fp16_vk_0, 1);
     }
     if (qy_needs_dequant) {
-        ggml_pipeline_allocate_descriptor_sets(ctx, *to_fp16_vk_1, y_non_contig ? 1 : ne12 * ne13);
+        ggml_pipeline_allocate_descriptor_sets(ctx, to_fp16_vk_1, y_non_contig ? 1 : ne12 * ne13);
     }
-    ggml_pipeline_allocate_descriptor_sets(ctx, *dmmv, ne12 * ne13);
+    ggml_pipeline_allocate_descriptor_sets(ctx, dmmv, ne12 * ne13);
 
     if (x_non_contig) {
-        GGML_ASSERT(x_sz == ggml_vk_align_size(ggml_type_size(src0->type) * x_ne, ctx->device.lock()->properties.limits.minStorageBufferOffsetAlignment));
+        GGML_ASSERT(x_sz == ggml_vk_align_size(ggml_type_size(src0->type) * x_ne, ctx->device->properties.limits.minStorageBufferOffsetAlignment));
         ggml_vk_cpy_to_contiguous(ctx, subctx, to_fp16_vk_0, src0, { d_Qx, qx_buf_offset, VK_WHOLE_SIZE }, { d_X, 0, VK_WHOLE_SIZE }, src0->type);
     } else if (load_x) {
         // copy data to device
@@ -2502,22 +2557,22 @@ static void ggml_vk_mul_mat_vec_q_f16(ggml_backend_vk_context * ctx, vk_context
             const uint64_t y_offset = y_buf_offset + y_sz * it_idx1;
             const uint64_t d_offset = d_buf_offset + d_sz * it_idx1;
 
-            const uint64_t y_buffer_offset = (y_offset / ctx->device.lock()->properties.limits.minStorageBufferOffsetAlignment) * ctx->device.lock()->properties.limits.minStorageBufferOffsetAlignment;
+            const uint64_t y_buffer_offset = (y_offset / ctx->device->properties.limits.minStorageBufferOffsetAlignment) * ctx->device->properties.limits.minStorageBufferOffsetAlignment;
             const uint64_t y_shader_offset = y_offset - y_buffer_offset;
 
-            const uint64_t d_buffer_offset = (d_offset / ctx->device.lock()->properties.limits.minStorageBufferOffsetAlignment) * ctx->device.lock()->properties.limits.minStorageBufferOffsetAlignment;
+            const uint64_t d_buffer_offset = (d_offset / ctx->device->properties.limits.minStorageBufferOffsetAlignment) * ctx->device->properties.limits.minStorageBufferOffsetAlignment;
             const uint64_t d_shader_offset = d_offset - d_buffer_offset;
 
             if (!y_non_contig && qy_needs_dequant) {
                 const std::vector<uint32_t> pc = { (uint32_t)ne11, (uint32_t)ne10, (uint32_t)ne10, (uint32_t)ne10, (uint32_t)(y_ne / 32) };
                 ggml_vk_sync_buffers(subctx);
-                ggml_vk_dispatch_pipeline(ctx, subctx, *to_fp16_vk_1, { { d_Qy, qy_offset, qy_sz }, { d_Y, y_offset, y_sz } }, pc.size() * sizeof(uint32_t), pc.data(), { (uint32_t)y_ne, 1, 1});
+                ggml_vk_dispatch_pipeline(ctx, subctx, to_fp16_vk_1, { { d_Qy, qy_offset, qy_sz }, { d_Y, y_offset, y_sz } }, pc.size() * sizeof(uint32_t), pc.data(), { (uint32_t)y_ne, 1, 1});
             }
 
             // compute
             const std::array<uint32_t, 3> pc = { (uint32_t)ne00, (uint32_t)(y_shader_offset / ggml_type_size(src1->type)), (uint32_t)(d_shader_offset / ggml_type_size(dst->type))};
             ggml_vk_sync_buffers(subctx);
-            ggml_vk_dispatch_pipeline(ctx, subctx, *dmmv, { { d_X, x_offset, x_sz }, { d_Y, y_buffer_offset, y_sz + y_shader_offset }, { d_D, d_buffer_offset, d_sz + d_shader_offset } }, 3 * sizeof(int), &pc, { (uint32_t)ne01, 1, 1});
+            ggml_vk_dispatch_pipeline(ctx, subctx, dmmv, { { d_X, x_offset, x_sz }, { d_Y, y_buffer_offset, y_sz + y_shader_offset }, { d_D, d_buffer_offset, d_sz + d_shader_offset } }, 3 * sizeof(int), &pc, { (uint32_t)ne01, 1, 1});
 
             if (dst->backend == GGML_BACKEND_CPU) {
                 // copy dst to host
@@ -2563,7 +2618,7 @@ static void ggml_vk_mul_mat_vec_p021_f16_f32(ggml_backend_vk_context * ctx, vk_c
 
     bool src1_uma = false;
 
-    if (ctx->device.lock()->uma) {
+    if (ctx->device->uma) {
         ggml_vk_host_get(ctx, src1->data, d_Qy, qy_buf_offset);
         src1_uma = d_Qy != nullptr;
     }
@@ -2574,7 +2629,7 @@ static void ggml_vk_mul_mat_vec_p021_f16_f32(ggml_backend_vk_context * ctx, vk_c
     const uint64_t y_ne = ne10 * ne11 * ne12;
     const uint64_t d_ne = ne01 * ne11 * ne12;
 
-    const uint64_t qx_sz = ggml_vk_align_size(ggml_type_size(src0->type) * x_ne / ggml_blck_size(src0->type), ctx->device.lock()->properties.limits.minStorageBufferOffsetAlignment);
+    const uint64_t qx_sz = ggml_vk_align_size(ggml_type_size(src0->type) * x_ne / ggml_blck_size(src0->type), ctx->device->properties.limits.minStorageBufferOffsetAlignment);
     const uint64_t qy_sz = ggml_type_size(src1->type) * y_ne / ggml_blck_size(src1->type);
     const uint64_t d_sz = sizeof(float) * d_ne;
 
@@ -2593,12 +2648,12 @@ static void ggml_vk_mul_mat_vec_p021_f16_f32(ggml_backend_vk_context * ctx, vk_c
     }
 
     // Allocate descriptor sets
-    ggml_pipeline_allocate_descriptor_sets(ctx, ctx->pipeline_mul_mat_vec_p021_f16_f32, 1);
+    ggml_pipeline_allocate_descriptor_sets(ctx, ctx->device->pipeline_mul_mat_vec_p021_f16_f32, 1);
 
-    const uint64_t qy_buffer_offset = (qy_buf_offset / ctx->device.lock()->properties.limits.minStorageBufferOffsetAlignment) * ctx->device.lock()->properties.limits.minStorageBufferOffsetAlignment;
+    const uint64_t qy_buffer_offset = (qy_buf_offset / ctx->device->properties.limits.minStorageBufferOffsetAlignment) * ctx->device->properties.limits.minStorageBufferOffsetAlignment;
     const uint64_t qy_shader_offset = qy_buf_offset - qy_buffer_offset;
 
-    const uint64_t d_buffer_offset = (d_buf_offset / ctx->device.lock()->properties.limits.minStorageBufferOffsetAlignment) * ctx->device.lock()->properties.limits.minStorageBufferOffsetAlignment;
+    const uint64_t d_buffer_offset = (d_buf_offset / ctx->device->properties.limits.minStorageBufferOffsetAlignment) * ctx->device->properties.limits.minStorageBufferOffsetAlignment;
     const uint64_t d_shader_offset = d_buf_offset - d_buffer_offset;
 
     if (load_y) {
@@ -2608,7 +2663,7 @@ static void ggml_vk_mul_mat_vec_p021_f16_f32(ggml_backend_vk_context * ctx, vk_c
     // compute
     const std::array<uint32_t, 6> pc = { (uint32_t)ne00, (uint32_t)ne01, (uint32_t)ne02, (uint32_t)ne12, (uint32_t)(qy_shader_offset / ggml_type_size(src1->type)), (uint32_t)(d_shader_offset / ggml_type_size(dst->type)) };
     ggml_vk_sync_buffers(subctx);
-    ggml_vk_dispatch_pipeline(ctx, subctx, ctx->pipeline_mul_mat_vec_p021_f16_f32, { { d_Qx, qx_buf_offset, qx_sz }, { d_Qy, qy_buffer_offset, qy_sz + qy_shader_offset }, { d_D, d_buffer_offset, d_sz + d_shader_offset } }, 6 * sizeof(uint32_t), &pc, { 1, (uint32_t)ne01, (uint32_t)ne12 });
+    ggml_vk_dispatch_pipeline(ctx, subctx, ctx->device->pipeline_mul_mat_vec_p021_f16_f32, { { d_Qx, qx_buf_offset, qx_sz }, { d_Qy, qy_buffer_offset, qy_sz + qy_shader_offset }, { d_D, d_buffer_offset, d_sz + d_shader_offset } }, 6 * sizeof(uint32_t), &pc, { 1, (uint32_t)ne01, (uint32_t)ne12 });
 
     if (dst->backend == GGML_BACKEND_CPU) {
         // copy dst to host
@@ -2655,7 +2710,7 @@ static void ggml_vk_mul_mat_vec_nc_f16_f32(ggml_backend_vk_context * ctx, vk_con
 
     bool src1_uma = false;
 
-    if (ctx->device.lock()->uma) {
+    if (ctx->device->uma) {
         ggml_vk_host_get(ctx, src1->data, d_Qy, qy_buf_offset);
         src1_uma = d_Qy != nullptr;
     }
@@ -2686,12 +2741,12 @@ static void ggml_vk_mul_mat_vec_nc_f16_f32(ggml_backend_vk_context * ctx, vk_con
     }
 
     // Allocate descriptor sets
-    ggml_pipeline_allocate_descriptor_sets(ctx, ctx->pipeline_mul_mat_vec_nc_f16_f32, 1);
+    ggml_pipeline_allocate_descriptor_sets(ctx, ctx->device->pipeline_mul_mat_vec_nc_f16_f32, 1);
 
-    const uint64_t qy_buffer_offset = (qy_buf_offset / ctx->device.lock()->properties.limits.minStorageBufferOffsetAlignment) * ctx->device.lock()->properties.limits.minStorageBufferOffsetAlignment;
+    const uint64_t qy_buffer_offset = (qy_buf_offset / ctx->device->properties.limits.minStorageBufferOffsetAlignment) * ctx->device->properties.limits.minStorageBufferOffsetAlignment;
     const uint64_t qy_shader_offset = qy_buf_offset - qy_buffer_offset;
 
-    const uint64_t d_buffer_offset = (d_buf_offset / ctx->device.lock()->properties.limits.minStorageBufferOffsetAlignment) * ctx->device.lock()->properties.limits.minStorageBufferOffsetAlignment;
+    const uint64_t d_buffer_offset = (d_buf_offset / ctx->device->properties.limits.minStorageBufferOffsetAlignment) * ctx->device->properties.limits.minStorageBufferOffsetAlignment;
     const uint64_t d_shader_offset = d_buf_offset - d_buffer_offset;
 
     if (load_y) {
@@ -2701,7 +2756,7 @@ static void ggml_vk_mul_mat_vec_nc_f16_f32(ggml_backend_vk_context * ctx, vk_con
     // compute
     const std::array<uint32_t, 7> pc = { (uint32_t)ne00, (uint32_t)ne01, row_stride_x, channel_stride_x, (uint32_t)(ne12 / ne02), (uint32_t)(qy_shader_offset / ggml_type_size(src1->type)), (uint32_t)(d_shader_offset / ggml_type_size(dst->type)) };
     ggml_vk_sync_buffers(subctx);
-    ggml_vk_dispatch_pipeline(ctx, subctx, ctx->pipeline_mul_mat_vec_nc_f16_f32, { { d_Qx, qx_buf_offset, qx_sz }, { d_Qy, qy_buffer_offset, qy_sz + qy_shader_offset }, { d_D, d_buffer_offset, d_sz + d_shader_offset } }, 7 * sizeof(uint32_t), &pc, { 1, (uint32_t)ne01, (uint32_t)ne12 });
+    ggml_vk_dispatch_pipeline(ctx, subctx, ctx->device->pipeline_mul_mat_vec_nc_f16_f32, { { d_Qx, qx_buf_offset, qx_sz }, { d_Qy, qy_buffer_offset, qy_sz + qy_shader_offset }, { d_D, d_buffer_offset, d_sz + d_shader_offset } }, 7 * sizeof(uint32_t), &pc, { 1, (uint32_t)ne01, (uint32_t)ne12 });
 
     if (dst->backend == GGML_BACKEND_CPU) {
         // copy dst to host
@@ -2810,40 +2865,40 @@ static void ggml_vk_op_repeat(ggml_backend_vk_context * ctx, vk_context * subctx
 }
 
 
-static vk_pipeline* ggml_vk_op_get_pipeline(ggml_backend_vk_context * ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, ggml_op op) {
+static vk_pipeline ggml_vk_op_get_pipeline(ggml_backend_vk_context * ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, ggml_op op) {
     switch (op) {
     case GGML_OP_ADD:
         if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
-            return &ctx->pipeline_add_f32;
+            return ctx->device->pipeline_add_f32;
         }
         return nullptr;
     case GGML_OP_GET_ROWS:
         GGML_ASSERT(src1->type == GGML_TYPE_I32);
         if (dst->type == GGML_TYPE_F16) {
-            return &ctx->pipeline_get_rows[src0->type];
+            return ctx->device->pipeline_get_rows[src0->type];
         }
         if (dst->type == GGML_TYPE_F32) {
-            return &ctx->pipeline_get_rows_f32[src0->type];
+            return ctx->device->pipeline_get_rows_f32[src0->type];
         }
         return nullptr;
     case GGML_OP_MUL:
         if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
-            return &ctx->pipeline_mul_f32;
+            return ctx->device->pipeline_mul_f32;
         }
         return nullptr;
     case GGML_OP_SCALE:
         if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
-            return &ctx->pipeline_scale_f32;
+            return ctx->device->pipeline_scale_f32;
         }
         return nullptr;
     case GGML_OP_SQR:
         if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
-            return &ctx->pipeline_sqr_f32;
+            return ctx->device->pipeline_sqr_f32;
         }
         return nullptr;
     case GGML_OP_CLAMP:
         if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
-            return &ctx->pipeline_clamp_f32;
+            return ctx->device->pipeline_clamp_f32;
         }
         return nullptr;
     case GGML_OP_CPY:
@@ -2852,29 +2907,29 @@ static vk_pipeline* ggml_vk_op_get_pipeline(ggml_backend_vk_context * ctx, const
         return ggml_vk_get_cpy_pipeline(ctx, src0->type, dst->type);
     case GGML_OP_NORM:
         if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
-            return &ctx->pipeline_norm_f32;
+            return ctx->device->pipeline_norm_f32;
         }
         return nullptr;
     case GGML_OP_RMS_NORM:
         if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
-            return &ctx->pipeline_rms_norm_f32;
+            return ctx->device->pipeline_rms_norm_f32;
         }
         return nullptr;
     case GGML_OP_UNARY:
         switch (ggml_get_unary_op(dst)) {
             case GGML_UNARY_OP_SILU:
                 if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
-                    return &ctx->pipeline_silu_f32;
+                    return ctx->device->pipeline_silu_f32;
                 }
                 break;
             case GGML_UNARY_OP_GELU:
                 if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
-                    return &ctx->pipeline_gelu_f32;
+                    return ctx->device->pipeline_gelu_f32;
                 }
                 break;
             case GGML_UNARY_OP_RELU:
                 if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
-                    return &ctx->pipeline_relu_f32;
+                    return ctx->device->pipeline_relu_f32;
                 }
                 break;
             default:
@@ -2883,12 +2938,12 @@ static vk_pipeline* ggml_vk_op_get_pipeline(ggml_backend_vk_context * ctx, const
         return nullptr;
     case GGML_OP_DIAG_MASK_INF:
         if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
-            return &ctx->pipeline_diag_mask_inf_f32;
+            return ctx->device->pipeline_diag_mask_inf_f32;
         }
         return nullptr;
     case GGML_OP_SOFT_MAX:
         if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
-            return &ctx->pipeline_soft_max_f32;
+            return ctx->device->pipeline_soft_max_f32;
         }
         return nullptr;
     case GGML_OP_ROPE:
@@ -2903,17 +2958,17 @@ static vk_pipeline* ggml_vk_op_get_pipeline(ggml_backend_vk_context * ctx, const
 
             if (is_neox) {
                 if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
-                    return &ctx->pipeline_rope_neox_f32;
+                    return ctx->device->pipeline_rope_neox_f32;
                 }
                 if (src0->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F16) {
-                    return &ctx->pipeline_rope_neox_f16;
+                    return ctx->device->pipeline_rope_neox_f16;
                 }
             } else {
                 if (src0->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32) {
-                    return &ctx->pipeline_rope_f32;
+                    return ctx->device->pipeline_rope_f32;
                 }
                 if (src0->type == GGML_TYPE_F16 && dst->type == GGML_TYPE_F16) {
-                    return &ctx->pipeline_rope_f16;
+                    return ctx->device->pipeline_rope_f16;
                 }
             }
             return nullptr;
@@ -2960,7 +3015,7 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context * subctx, c
     const uint64_t nb2  = dst->nb[2];
     const uint64_t nb3  = dst->nb[3];
 
-    vk_pipeline * pipeline = ggml_vk_op_get_pipeline(ctx, src0, src1, dst, op);
+    vk_pipeline pipeline = ggml_vk_op_get_pipeline(ctx, src0, src1, dst, op);
     ggml_vk_func_t op_func;
 
     if (pipeline == nullptr) {
@@ -2990,7 +3045,7 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context * subctx, c
     bool src0_uma = false;
     bool src1_uma = false;
 
-    if (ctx->device.lock()->uma) {
+    if (ctx->device->uma) {
         ggml_vk_host_get(ctx, src0->data, d_X, x_buf_offset);
         src0_uma = d_X != nullptr;
         if (use_src1) {
@@ -3002,8 +3057,8 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context * subctx, c
     const bool transfer_src0 = src0->backend != GGML_BACKEND_GPU && !src0_uma;
     const bool transfer_src1 = use_src1 && src1->backend != GGML_BACKEND_GPU && !src1_uma;
 
-    uint64_t x_sz = ggml_vk_align_size(ggml_type_size(src0->type) * ne0, ctx->device.lock()->properties.limits.minStorageBufferOffsetAlignment);
-    uint64_t y_sz = use_src1 ? ggml_vk_align_size(ggml_type_size(src1->type) * ne1, ctx->device.lock()->properties.limits.minStorageBufferOffsetAlignment) : 0;
+    uint64_t x_sz = ggml_vk_align_size(ggml_type_size(src0->type) * ne0, ctx->device->properties.limits.minStorageBufferOffsetAlignment);
+    uint64_t y_sz = use_src1 ? ggml_vk_align_size(ggml_type_size(src1->type) * ne1, ctx->device->properties.limits.minStorageBufferOffsetAlignment) : 0;
     uint64_t d_sz = ggml_type_size(dst->type) * ne0;
 
     vk_buffer d_D = extra->buffer_gpu.lock();
@@ -3014,7 +3069,7 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context * subctx, c
     }
 
     GGML_ASSERT(d_D != nullptr);
-    uint64_t d_buf_offset = (extra->offset / ctx->device.lock()->properties.limits.minStorageBufferOffsetAlignment) * ctx->device.lock()->properties.limits.minStorageBufferOffsetAlignment;
+    uint64_t d_buf_offset = (extra->offset / ctx->device->properties.limits.minStorageBufferOffsetAlignment) * ctx->device->properties.limits.minStorageBufferOffsetAlignment;
     GGML_ASSERT(d_buf_offset == extra->offset || op == GGML_OP_CPY);  // NOLINT
     if (transfer_src0) {
         d_X = ctx->prealloc_qx;
@@ -3058,7 +3113,7 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context * subctx, c
 
     // Single call if dimension 2 is contiguous
     if (op == GGML_OP_CPY || (ggml_is_contiguous(src0) && (src1 == nullptr || ggml_is_contiguous(src1)))) {
-        ggml_pipeline_allocate_descriptor_sets(ctx, *pipeline, 1);
+        ggml_pipeline_allocate_descriptor_sets(ctx, pipeline, 1);
 
         switch (dst->op) {
         case GGML_OP_NORM:
@@ -3090,13 +3145,13 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context * subctx, c
         if (!use_src1 && op == GGML_OP_SOFT_MAX) {
             // Empty src1 is possible on soft_max, but the shader needs a buffer
             ggml_vk_sync_buffers(subctx);
-            ggml_vk_dispatch_pipeline(ctx, subctx, *pipeline, { { d_X, x_buf_offset, x_sz }, { ctx->prealloc_y, 0, ctx->prealloc_y->size }, { d_D, d_buf_offset, d_sz } }, sizeof(PC), &pc, elements);
+            ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { { d_X, x_buf_offset, x_sz }, { ctx->prealloc_y, 0, ctx->prealloc_y->size }, { d_D, d_buf_offset, d_sz } }, sizeof(PC), &pc, elements);
         } else if (use_src1) {
             ggml_vk_sync_buffers(subctx);
-            ggml_vk_dispatch_pipeline(ctx, subctx, *pipeline, { { d_X, x_buf_offset, x_sz }, { d_Y, y_buf_offset, y_sz }, { d_D, d_buf_offset, d_sz } }, sizeof(PC), &pc, elements);
+            ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { { d_X, x_buf_offset, x_sz }, { d_Y, y_buf_offset, y_sz }, { d_D, d_buf_offset, d_sz } }, sizeof(PC), &pc, elements);
         } else {
             ggml_vk_sync_buffers(subctx);
-            ggml_vk_dispatch_pipeline(ctx, subctx, *pipeline, { { d_X, x_buf_offset, x_sz }, { d_D, d_buf_offset, d_sz } }, sizeof(PC), &pc, elements);
+            ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { { d_X, x_buf_offset, x_sz }, { d_D, d_buf_offset, d_sz } }, sizeof(PC), &pc, elements);
         }
         if (dst->backend == GGML_BACKEND_CPU && op == GGML_OP_CPY) {
             ggml_vk_d2h_tensor_2d(ctx, subctx, d_D, 0, dst);
@@ -3106,7 +3161,7 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context * subctx, c
             ggml_vk_buffer_read_async(ctx, subctx, d_D, 0, d, d_sz);
         }
     } else {
-        ggml_pipeline_allocate_descriptor_sets(ctx, *pipeline, ne02 * ne03);
+        ggml_pipeline_allocate_descriptor_sets(ctx, pipeline, ne02 * ne03);
 
         switch (dst->op) {
         case GGML_OP_NORM:
@@ -3134,13 +3189,13 @@ static void ggml_vk_op_f32(ggml_backend_vk_context * ctx, vk_context * subctx, c
                 if (!use_src1 && op == GGML_OP_SOFT_MAX) {
                     // Empty src1 is possible on soft_max, but the shader needs a buffer
                     ggml_vk_sync_buffers(subctx);
-                    ggml_vk_dispatch_pipeline(ctx, subctx, *pipeline, { { d_X, x_buf_offset, x_sz }, { ctx->prealloc_y, 0, ctx->prealloc_y->size }, { d_D, d_buf_offset, d_sz } }, sizeof(PC), &pc, elements);
+                    ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { { d_X, x_buf_offset, x_sz }, { ctx->prealloc_y, 0, ctx->prealloc_y->size }, { d_D, d_buf_offset, d_sz } }, sizeof(PC), &pc, elements);
                 } else if (use_src1) {
                     ggml_vk_sync_buffers(subctx);
-                    ggml_vk_dispatch_pipeline(ctx, subctx, *pipeline, { { d_X, x_buf_offset + x_offset, x_sz }, { d_Y, y_buf_offset + y_offset, y_sz }, { d_D, d_buf_offset + d_offset, d_sz } }, sizeof(PC), &pc, elements);
+                    ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { { d_X, x_buf_offset + x_offset, x_sz }, { d_Y, y_buf_offset + y_offset, y_sz }, { d_D, d_buf_offset + d_offset, d_sz } }, sizeof(PC), &pc, elements);
                 } else {
                     ggml_vk_sync_buffers(subctx);
-                    ggml_vk_dispatch_pipeline(ctx, subctx, *pipeline, { { d_X, x_buf_offset + x_offset, x_sz }, { d_D, d_buf_offset + d_offset, d_sz } }, sizeof(PC), &pc, elements);
+                    ggml_vk_dispatch_pipeline(ctx, subctx, pipeline, { { d_X, x_buf_offset + x_offset, x_sz }, { d_D, d_buf_offset + d_offset, d_sz } }, sizeof(PC), &pc, elements);
                 }
                 if (dst->backend == GGML_BACKEND_CPU) {
                     // copy dst to host
@@ -3185,7 +3240,7 @@ static void ggml_vk_cpy(ggml_backend_vk_context * ctx, vk_context * subctx, cons
     ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) dst->extra;
     const int src0_type_size = ggml_type_size(src0->type);
     const int dst_type_size = ggml_type_size(dst->type);
-    const uint32_t d_offset = (extra->offset % ctx->device.lock()->properties.limits.minStorageBufferOffsetAlignment) / dst_type_size;
+    const uint32_t d_offset = (extra->offset % ctx->device->properties.limits.minStorageBufferOffsetAlignment) / dst_type_size;
     ggml_vk_op_f32<vk_op_cpy_push_constants>(ctx, subctx, src0, nullptr, dst, GGML_OP_CPY, {
         (uint32_t)ggml_nelements(src0),
         (uint32_t)src0->ne[0], (uint32_t)src0->ne[1], (uint32_t)src0->nb[0] / src0_type_size, (uint32_t)src0->nb[1] / src0_type_size, (uint32_t)src0->nb[2] / src0_type_size,
@@ -3297,43 +3352,43 @@ static void ggml_vk_test_matmul(ggml_backend_vk_context * ctx, size_t m, size_t
     const size_t y_ne = k * n * batch;
     const size_t d_ne = m * n * batch;
 
-    vk_pipeline * p;
+    vk_pipeline p;
     std::string shname;
     if (shader_size == 0) {
         if (std::is_same<float, X_TYPE>() && std::is_same<float, Y_TYPE>()) {
-            p = &ctx->pipeline_matmul_f32_aligned_s;
+            p = ctx->device->pipeline_matmul_f32_aligned_s;
             shname = "F32_ALIGNED_S";
         } else if (std::is_same<ggml_fp16_t, X_TYPE>() && std::is_same<float, Y_TYPE>()) {
-            p = &ctx->pipeline_matmul_f16_f32_aligned_s;
+            p = ctx->device->pipeline_matmul_f16_f32_aligned_s;
             shname = "F16_F32_ALIGNED_S";
         } else if (std::is_same<ggml_fp16_t, X_TYPE>() && std::is_same<ggml_fp16_t, Y_TYPE>()) {
-            p = &ctx->pipeline_matmul_f16_aligned_s;
+            p = ctx->device->pipeline_matmul_f16_aligned_s;
             shname = "F16_ALIGNED_S";
         } else {
             GGML_ASSERT(false);
         }
     } else if (shader_size == 1) {
         if (std::is_same<float, X_TYPE>() && std::is_same<float, Y_TYPE>()) {
-            p = &ctx->pipeline_matmul_f32_aligned_m;
+            p = ctx->device->pipeline_matmul_f32_aligned_m;
             shname = "F32_ALIGNED_M";
         } else if (std::is_same<ggml_fp16_t, X_TYPE>() && std::is_same<float, Y_TYPE>()) {
-            p = &ctx->pipeline_matmul_f16_f32_aligned_m;
+            p = ctx->device->pipeline_matmul_f16_f32_aligned_m;
             shname = "F16_F32_ALIGNED_M";
         } else if (std::is_same<ggml_fp16_t, X_TYPE>() && std::is_same<ggml_fp16_t, Y_TYPE>()) {
-            p = &ctx->pipeline_matmul_f16_aligned_m;
+            p = ctx->device->pipeline_matmul_f16_aligned_m;
             shname = "F16_ALIGNED_M";
         } else {
             GGML_ASSERT(false);
         }
     } else if (shader_size == 2) {
         if (std::is_same<float, X_TYPE>() && std::is_same<float, Y_TYPE>()) {
-            p = &ctx->pipeline_matmul_f32_aligned_l;
+            p = ctx->device->pipeline_matmul_f32_aligned_l;
             shname = "F32_ALIGNED_L";
         } else if (std::is_same<ggml_fp16_t, X_TYPE>() && std::is_same<float, Y_TYPE>()) {
-            p = &ctx->pipeline_matmul_f16_f32_aligned_l;
+            p = ctx->device->pipeline_matmul_f16_f32_aligned_l;
             shname = "F16_F32_ALIGNED_L";
         } else if (std::is_same<ggml_fp16_t, X_TYPE>() && std::is_same<ggml_fp16_t, Y_TYPE>()) {
-            p = &ctx->pipeline_matmul_f16_aligned_l;
+            p = ctx->device->pipeline_matmul_f16_aligned_l;
             shname = "F16_ALIGNED_L";
         } else {
             GGML_ASSERT(false);
@@ -3347,35 +3402,35 @@ static void ggml_vk_test_matmul(ggml_backend_vk_context * ctx, size_t m, size_t
     if (k != kpad) {
         if (shader_size == 0) {
             if (std::is_same<float, X_TYPE>() && std::is_same<float, Y_TYPE>()) {
-                p = &ctx->pipeline_matmul_f32_s;
+                p = ctx->device->pipeline_matmul_f32_s;
                 shname = "F32_S";
             } else if (std::is_same<ggml_fp16_t, X_TYPE>() && std::is_same<float, Y_TYPE>()) {
-                p = &ctx->pipeline_matmul_f16_f32_s;
+                p = ctx->device->pipeline_matmul_f16_f32_s;
                 shname = "F16_F32_S";
             } else if (std::is_same<ggml_fp16_t, X_TYPE>() && std::is_same<ggml_fp16_t, Y_TYPE>()) {
-                p = &ctx->pipeline_matmul_f16_s;
+                p = ctx->device->pipeline_matmul_f16_s;
                 shname = "F16_S";
             }
         } else if (shader_size == 1) {
             if (std::is_same<float, X_TYPE>() && std::is_same<float, Y_TYPE>()) {
-                p = &ctx->pipeline_matmul_f32_m;
+                p = ctx->device->pipeline_matmul_f32_m;
                 shname = "F32_M";
             } else if (std::is_same<ggml_fp16_t, X_TYPE>() && std::is_same<float, Y_TYPE>()) {
-                p = &ctx->pipeline_matmul_f16_f32_m;
+                p = ctx->device->pipeline_matmul_f16_f32_m;
                 shname = "F16_F32_M";
             } else if (std::is_same<ggml_fp16_t, X_TYPE>() && std::is_same<ggml_fp16_t, Y_TYPE>()) {
-                p = &ctx->pipeline_matmul_f16_m;
+                p = ctx->device->pipeline_matmul_f16_m;
                 shname = "F16_M";
             }
         } else if (shader_size == 2) {
             if (std::is_same<float, X_TYPE>() && std::is_same<float, Y_TYPE>()) {
-                p = &ctx->pipeline_matmul_f32_l;
+                p = ctx->device->pipeline_matmul_f32_l;
                 shname = "F32_L";
             } else if (std::is_same<ggml_fp16_t, X_TYPE>() && std::is_same<float, Y_TYPE>()) {
-                p = &ctx->pipeline_matmul_f16_f32_l;
+                p = ctx->device->pipeline_matmul_f16_f32_l;
                 shname = "F16_F32_L";
             } else if (std::is_same<ggml_fp16_t, X_TYPE>() && std::is_same<ggml_fp16_t, Y_TYPE>()) {
-                p = &ctx->pipeline_matmul_f16_l;
+                p = ctx->device->pipeline_matmul_f16_l;
                 shname = "F16_L";
             }
         }
@@ -3383,7 +3438,7 @@ static void ggml_vk_test_matmul(ggml_backend_vk_context * ctx, size_t m, size_t
 
-    ggml_pipeline_allocate_descriptor_sets(ctx, *p, num_it);
+    ggml_pipeline_allocate_descriptor_sets(ctx, p, num_it);
     if (split_k > 1) {
-        ggml_pipeline_allocate_descriptor_sets(ctx, ctx->pipeline_matmul_split_k_reduce, num_it);
+        ggml_pipeline_allocate_descriptor_sets(ctx, ctx->device->pipeline_matmul_split_k_reduce, num_it);
 
         if (ctx->prealloc_split_k == nullptr || ctx->prealloc_split_k->size < sizeof(float) * d_ne * split_k) {
             // Resize buffer
@@ -3424,7 +3479,7 @@ static void ggml_vk_test_matmul(ggml_backend_vk_context * ctx, size_t m, size_t
     ggml_vk_buffer_write(ctx, d_X, 0, x, sizeof(X_TYPE) * k * m * batch);
     ggml_vk_buffer_write(ctx, d_Y, 0, y, sizeof(Y_TYPE) * k * n * batch);
 
-    vk_context * subctx = ggml_vk_create_context(ctx, ctx->device.lock()->compute_queue);
+    vk_context * subctx = ggml_vk_create_context(ctx, ctx->device->compute_queue);
     for (size_t i = 0; i < num_it; i++) {
         ggml_vk_ctx_begin(ctx, subctx);
         ggml_vk_matmul(ctx, subctx, *p, ggml_vk_subbuffer(d_X), ggml_vk_subbuffer(d_Y), ggml_vk_subbuffer(d_D), ggml_vk_subbuffer(ctx->prealloc_split_k), m, n, k, k, k, m, split_k, batch, batch, batch, 1, 1, k*m, k*n, m*n);
@@ -3433,8 +3488,8 @@ static void ggml_vk_test_matmul(ggml_backend_vk_context * ctx, size_t m, size_t
 
     auto begin = std::chrono::high_resolution_clock::now();
     ggml_vk_submit(subctx, ctx->fence);
-    VK_CHECK(ctx->device.lock()->device.waitForFences({ ctx->fence }, true, UINT64_MAX), "ggml_vk_test_matmul waitForFences");
-    ctx->device.lock()->device.resetFences({ ctx->fence });
+    VK_CHECK(ctx->device->device.waitForFences({ ctx->fence }, true, UINT64_MAX), "ggml_vk_test_matmul waitForFences");
+    ctx->device->device.resetFences({ ctx->fence });
 
     auto end = std::chrono::high_resolution_clock::now();
     double time = std::chrono::duration_cast<std::chrono::microseconds>(end-begin).count() / 1000.0;
@@ -3538,15 +3593,15 @@ static void ggml_vk_test_matmul(ggml_backend_vk_context * ctx, size_t m, size_t
 
     free(d_chk);
 
-    ggml_vk_queue_cleanup(ctx, ctx->device.lock()->transfer_queue);
-    ggml_vk_queue_cleanup(ctx, ctx->device.lock()->compute_queue);
+    ggml_vk_queue_cleanup(ctx, ctx->device->transfer_queue);
+    ggml_vk_queue_cleanup(ctx, ctx->device->compute_queue);
 
     ggml_vk_destroy_buffer(d_X);
     ggml_vk_destroy_buffer(d_Y);
     ggml_vk_destroy_buffer(d_D);
 
-    ggml_pipeline_cleanup(*p);
-    ggml_pipeline_cleanup(ctx->pipeline_matmul_split_k_reduce);
+    ggml_pipeline_cleanup(p);
+    ggml_pipeline_cleanup(ctx->device->pipeline_matmul_split_k_reduce);
 
     free(x);
     free(y);
@@ -3619,7 +3674,7 @@ static void ggml_vk_test_h2d_nc(ggml_backend_vk_context * ctx, size_t ne0, size_
         data[i] = (rand() / (float)RAND_MAX) * 2.0f - 1.0f;
     }
 
-    vk_context * subctx = ggml_vk_create_context(ctx, ctx->device.lock()->compute_queue);
+    vk_context * subctx = ggml_vk_create_context(ctx, ctx->device->compute_queue);
     ggml_vk_ctx_begin(ctx, subctx);
 
     vk_buffer buffer = ggml_vk_create_buffer_check(ctx, ggml_nbytes(tensor), vk::MemoryPropertyFlagBits::eDeviceLocal);
@@ -3628,8 +3683,8 @@ static void ggml_vk_test_h2d_nc(ggml_backend_vk_context * ctx, size_t ne0, size_
 
     ggml_vk_ctx_end(subctx);
     ggml_vk_submit(subctx, ctx->fence);
-    VK_CHECK(ctx->device.lock()->device.waitForFences({ ctx->fence }, true, UINT64_MAX), "ggml_vk_test_h2d_nc waitForFences");
-    ctx->device.lock()->device.resetFences({ ctx->fence });
+    VK_CHECK(ctx->device->device.waitForFences({ ctx->fence }, true, UINT64_MAX), "ggml_vk_test_h2d_nc waitForFences");
+    ctx->device->device.resetFences({ ctx->fence });
 
     ggml_vk_buffer_read(ctx, buffer, 0, result_data, ggml_nbytes(tensor));
 
@@ -3701,7 +3756,7 @@ static void ggml_vk_test_transfer(ggml_backend_vk_context * ctx, size_t ne, bool
         x[i] = rand() / (float)RAND_MAX;
     }
 
-    vk_context * subctx = ggml_vk_create_context(ctx, ctx->device.lock()->compute_queue);
+    vk_context * subctx = ggml_vk_create_context(ctx, ctx->device->compute_queue);
     ggml_vk_ctx_begin(ctx, subctx);
 
     auto begin = std::chrono::high_resolution_clock::now();
@@ -3715,8 +3770,8 @@ static void ggml_vk_test_transfer(ggml_backend_vk_context * ctx, size_t ne, bool
 
     ggml_vk_ctx_end(subctx);
     ggml_vk_submit(subctx, ctx->fence);
-    VK_CHECK(ctx->device.lock()->device.waitForFences({ ctx->fence }, true, UINT64_MAX), "ggml_vk_test_transfer waitForFences");
-    ctx->device.lock()->device.resetFences({ ctx->fence });
+    VK_CHECK(ctx->device->device.waitForFences({ ctx->fence }, true, UINT64_MAX), "ggml_vk_test_transfer waitForFences");
+    ctx->device->device.resetFences({ ctx->fence });
 
     auto end = std::chrono::high_resolution_clock::now();
 
@@ -3730,8 +3785,8 @@ static void ggml_vk_test_transfer(ggml_backend_vk_context * ctx, size_t ne, bool
 
     ggml_vk_ctx_end(subctx);
     ggml_vk_submit(subctx, ctx->fence);
-    VK_CHECK(ctx->device.lock()->device.waitForFences({ ctx->fence }, true, UINT64_MAX), "ggml_vk_test_transfer waitForFences");
-    ctx->device.lock()->device.resetFences({ ctx->fence });
+    VK_CHECK(ctx->device->device.waitForFences({ ctx->fence }, true, UINT64_MAX), "ggml_vk_test_transfer waitForFences");
+    ctx->device->device.resetFences({ ctx->fence });
 
     for (auto& cpy : subctx->out_memcpys) {
         memcpy(cpy.dst, cpy.src, cpy.n);
@@ -3781,7 +3836,7 @@ static void ggml_vk_test_dequant(ggml_backend_vk_context * ctx, size_t ne, ggml_
 
     std::vector<int64_t> hist_cur(1 << 4, 0);
 
-    vk_pipeline& p = ctx->pipeline_dequant[quant];
+    vk_pipeline& p = ctx->device->pipeline_dequant[quant];
 
     switch(quant) {
     case GGML_TYPE_Q4_0:
@@ -3822,17 +3877,17 @@ static void ggml_vk_test_dequant(ggml_backend_vk_context * ctx, size_t ne, ggml_
 
     ggml_vk_buffer_write(ctx, qx_buf, 0, qx, qx_sz);
 
-    vk_context * subctx = ggml_vk_create_context(ctx, ctx->device.lock()->compute_queue);
+    vk_context * subctx = ggml_vk_create_context(ctx, ctx->device->compute_queue);
     ggml_vk_ctx_begin(ctx, subctx);
-    const std::vector<uint32_t> pc = { 1, (uint32_t)ne, (uint32_t)ne, (uint32_t)ne, (uint32_t)(ne / 32) };
+    const std::vector<uint32_t> pc = { 1, (uint32_t)ne, (uint32_t)ne, (uint32_t)ne, (uint32_t)ne };
     ggml_vk_dispatch_pipeline(ctx, subctx, p, { { qx_buf, 0, qx_sz }, { x_buf, 0, x_sz_f16 } }, pc.size() * sizeof(int), pc.data(), { (uint32_t)ne, 1, 1});
     ggml_vk_ctx_end(subctx);
 
     auto begin = std::chrono::high_resolution_clock::now();
 
     ggml_vk_submit(subctx, ctx->fence);
-    VK_CHECK(ctx->device.lock()->device.waitForFences({ ctx->fence }, true, UINT64_MAX), "ggml_vk_test_dequant waitForFences");
-    ctx->device.lock()->device.resetFences({ ctx->fence });
+    VK_CHECK(ctx->device->device.waitForFences({ ctx->fence }, true, UINT64_MAX), "ggml_vk_test_dequant waitForFences");
+    ctx->device->device.resetFences({ ctx->fence });
 
     auto end = std::chrono::high_resolution_clock::now();
 
@@ -3927,11 +3982,11 @@ static void ggml_vk_preallocate_buffers_graph(ggml_backend_vk_context * ctx, ggm
     const uint32_t y_ne = ne10 * ne11;
     const uint32_t d_ne = ne20 * ne21;
 
-    const uint64_t qx_sz = use_src0 ? ggml_vk_align_size(ggml_type_size(src0->type) * x_ne / ggml_blck_size(src0->type), ctx->device.lock()->properties.limits.minStorageBufferOffsetAlignment) * ne02 * ne03 : 0;
-    const uint64_t qy_sz = use_src1 ? ggml_vk_align_size(ggml_type_size(src1->type) * y_ne / ggml_blck_size(src1->type), ctx->device.lock()->properties.limits.minStorageBufferOffsetAlignment) * ne12 * ne13 : 0;
-    const uint64_t x_sz = use_src0 ? ggml_vk_align_size(sizeof(ggml_fp16_t) * x_ne, ctx->device.lock()->properties.limits.minStorageBufferOffsetAlignment) * ne02 * ne03 : 0;
-    const uint64_t y_sz = use_src1 ? ggml_vk_align_size(f16_f32_kernel ? sizeof(float) * y_ne : sizeof(ggml_fp16_t) * y_ne, ctx->device.lock()->properties.limits.minStorageBufferOffsetAlignment) * ne12 * ne13 : 0;
-    uint64_t d_sz = ggml_vk_align_size(ggml_type_size(node->type) * d_ne, ctx->device.lock()->properties.limits.minStorageBufferOffsetAlignment) * ne22 * ne23;
+    const uint64_t qx_sz = use_src0 ? ggml_vk_align_size(ggml_type_size(src0->type) * x_ne / ggml_blck_size(src0->type), ctx->device->properties.limits.minStorageBufferOffsetAlignment) * ne02 * ne03 : 0;
+    const uint64_t qy_sz = use_src1 ? ggml_vk_align_size(ggml_type_size(src1->type) * y_ne / ggml_blck_size(src1->type), ctx->device->properties.limits.minStorageBufferOffsetAlignment) * ne12 * ne13 : 0;
+    const uint64_t x_sz = use_src0 ? ggml_vk_align_size(sizeof(ggml_fp16_t) * x_ne, ctx->device->properties.limits.minStorageBufferOffsetAlignment) * ne02 * ne03 : 0;
+    const uint64_t y_sz = use_src1 ? ggml_vk_align_size(f16_f32_kernel ? sizeof(float) * y_ne : sizeof(ggml_fp16_t) * y_ne, ctx->device->properties.limits.minStorageBufferOffsetAlignment) * ne12 * ne13 : 0;
+    uint64_t d_sz = ggml_vk_align_size(ggml_type_size(node->type) * d_ne, ctx->device->properties.limits.minStorageBufferOffsetAlignment) * ne22 * ne23;
     const uint64_t split_k_size = split_k > 1 ? d_sz * 4 : 0;
 
     if (extra->buffer_gpu.expired()) {
@@ -4161,7 +4216,7 @@ static void ggml_vk_build_graph(ggml_backend_vk_context * ctx, ggml_tensor * nod
     }
 
     if (ctx->compute_ctx == nullptr) {
-        ctx->compute_ctx = ggml_vk_create_context(ctx, ctx->device.lock()->compute_queue);
+        ctx->compute_ctx = ggml_vk_create_context(ctx, ctx->device->compute_queue);
         ggml_vk_ctx_begin(ctx, ctx->compute_ctx);
     }
 
@@ -4354,8 +4409,8 @@ static bool ggml_vk_compute_forward(ggml_backend_vk_context * ctx, ggml_compute_
     }
 
     if (tensor == subctx.exit_tensor) {
-        VK_CHECK(ctx->device.lock()->device.waitForFences({ ctx->fence }, true, UINT64_MAX), "ggml_vk_compute_forward waitForFences");
-        ctx->device.lock()->device.resetFences({ ctx->fence });
+        VK_CHECK(ctx->device->device.waitForFences({ ctx->fence }, true, UINT64_MAX), "ggml_vk_compute_forward waitForFences");
+        ctx->device->device.resetFences({ ctx->fence });
 
         // Do staging buffer copies
         for (auto& cpy : subctx.out_memcpys) {
@@ -4383,20 +4438,25 @@ static void ggml_vk_graph_cleanup(ggml_backend_vk_context * ctx) {
     }
     ctx->gc.temp_buffers.clear();
 
-    for (auto * pipeline : ctx->gc.pipelines) {
-        ggml_pipeline_cleanup(*pipeline);
+    for (auto& pipeline : ctx->device->pipelines) {
+        if (pipeline.expired()) {
+            continue;
+        }
+
+        vk_pipeline pl = pipeline.lock();
+        ggml_pipeline_cleanup(pl);
     }
 
-    ggml_vk_queue_cleanup(ctx, ctx->device.lock()->compute_queue);
-    ggml_vk_queue_cleanup(ctx, ctx->device.lock()->transfer_queue);
+    ggml_vk_queue_cleanup(ctx, ctx->device->compute_queue);
+    ggml_vk_queue_cleanup(ctx, ctx->device->transfer_queue);
 
     for (size_t i = 0; i < ctx->gc.semaphores.size(); i++) {
-        ctx->device.lock()->device.destroySemaphore({ ctx->gc.semaphores[i].s });
+        ctx->device->device.destroySemaphore({ ctx->gc.semaphores[i].s });
     }
     ctx->gc.semaphores.clear();
 
     for (size_t i = 0; i < ctx->gc.tl_semaphores.size(); i++) {
-        ctx->device.lock()->device.destroySemaphore({ ctx->gc.tl_semaphores[i].s });
+        ctx->device->device.destroySemaphore({ ctx->gc.tl_semaphores[i].s });
     }
     ctx->gc.tl_semaphores.clear();
     ctx->semaphore_idx = 0;
@@ -4404,7 +4464,7 @@ static void ggml_vk_graph_cleanup(ggml_backend_vk_context * ctx) {
     ctx->event_idx = 0;
 
     for (auto& event : ctx->gc.events) {
-        ctx->device.lock()->device.resetEvent(event);
+        ctx->device->device.resetEvent(event);
     }
 
     ctx->staging_offset = 0;
@@ -4441,21 +4501,11 @@ static void ggml_vk_cleanup(ggml_backend_vk_context * ctx) {
     ctx->staging_size = 0;
 
     for (auto& event : ctx->gc.events) {
-        ctx->device.lock()->device.destroyEvent(event);
+        ctx->device->device.destroyEvent(event);
     }
     ctx->gc.events.clear();
 
-    for (auto* pipeline : ctx->gc.pipelines) {
-        ggml_vk_destroy_pipeline(ctx, pipeline);
-    }
-    ctx->gc.pipelines.clear();
-
-    ctx->device.lock()->device.destroyFence(ctx->fence);
-
-    ctx->device.lock()->device.destroyCommandPool(ctx->device.lock()->compute_queue.pool);
-    if (!ctx->device.lock()->single_queue) {
-        ctx->device.lock()->device.destroyCommandPool(ctx->device.lock()->transfer_queue.pool);
-    }
+    ctx->device->device.destroyFence(ctx->fence);
 }
 
 GGML_CALL int ggml_vk_get_device_count() {
@@ -4724,12 +4774,12 @@ GGML_CALL static ggml_backend_buffer_t ggml_backend_vk_buffer_type_alloc_buffer(
 
 GGML_CALL static size_t ggml_backend_vk_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
     ggml_backend_vk_buffer_type_context * ctx = (ggml_backend_vk_buffer_type_context *) buft->context;
-    return ctx->ctx->device.lock()->properties.limits.minStorageBufferOffsetAlignment;
+    return ctx->ctx->device->properties.limits.minStorageBufferOffsetAlignment;
 }
 
 GGML_CALL static size_t ggml_backend_vk_buffer_type_get_max_size(ggml_backend_buffer_type_t buft) {
     ggml_backend_vk_buffer_type_context * ctx = (ggml_backend_vk_buffer_type_context *) buft->context;
-    return ctx->ctx->device.lock()->max_memory_allocation_size;
+    return ctx->ctx->device->max_memory_allocation_size;
 }
 
 GGML_CALL static size_t ggml_backend_vk_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, const ggml_tensor * tensor) {
@@ -4815,7 +4865,7 @@ GGML_CALL static ggml_backend_buffer_t ggml_backend_vk_host_buffer_type_alloc_bu
 }
 
 GGML_CALL static size_t ggml_backend_vk_host_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
-    return vk_instance.contexts[0].device.lock()->properties.limits.minMemoryMapAlignment;
+    return vk_instance.contexts[0].device->properties.limits.minMemoryMapAlignment;
 
     UNUSED(buft);
 }
@@ -4860,8 +4910,7 @@ GGML_CALL static void ggml_backend_vk_free(ggml_backend_t backend) {
 
     ggml_vk_cleanup(ctx);
 
-    // Release device
-    vk_instance.devices[ctx->idx].reset();
+    ctx->device.reset();
     ctx->initialized = false;
 
     vk_instance.initialized[idx] = false;
@@ -4890,7 +4939,7 @@ GGML_CALL static void ggml_backend_vk_set_tensor_async(ggml_backend_t backend, g
 
     if (ctx->transfer_ctx == nullptr) {
         // Initialize new transfer context
-        ctx->transfer_ctx = ggml_vk_create_context(ctx, ctx->device.lock()->transfer_queue);
+        ctx->transfer_ctx = ggml_vk_create_context(ctx, ctx->device->transfer_queue);
         ggml_vk_ctx_begin(ctx, ctx->transfer_ctx);
     }
 
@@ -4911,7 +4960,7 @@ GGML_CALL static void ggml_backend_vk_get_tensor_async(ggml_backend_t backend, c
 
     if (ctx->transfer_ctx == nullptr) {
         // Initialize new transfer context
-        ctx->transfer_ctx = ggml_vk_create_context(ctx, ctx->device.lock()->transfer_queue);
+        ctx->transfer_ctx = ggml_vk_create_context(ctx, ctx->device->transfer_queue);
         ggml_vk_ctx_begin(ctx, ctx->transfer_ctx);
     }
 
@@ -4931,7 +4980,7 @@ GGML_CALL static bool ggml_backend_vk_cpy_tensor_async(ggml_backend_t backend, c
 
         if (ctx->transfer_ctx == nullptr) {
             // Initialize new transfer context
-            ctx->transfer_ctx = ggml_vk_create_context(ctx, ctx->device.lock()->transfer_queue);
+            ctx->transfer_ctx = ggml_vk_create_context(ctx, ctx->device->transfer_queue);
             ggml_vk_ctx_begin(ctx, ctx->transfer_ctx);
         }
 
@@ -4961,8 +5010,8 @@ GGML_CALL static void ggml_backend_vk_synchronize(ggml_backend_t backend) {
     }
 
     ggml_vk_submit(ctx->transfer_ctx, ctx->fence);
-    VK_CHECK(ctx->device.lock()->device.waitForFences({ ctx->fence }, true, UINT64_MAX), "ggml_backend_vk_synchronize waitForFences");
-    ctx->device.lock()->device.resetFences({ ctx->fence });
+    VK_CHECK(ctx->device->device.waitForFences({ ctx->fence }, true, UINT64_MAX), "ggml_backend_vk_synchronize waitForFences");
+    ctx->device->device.resetFences({ ctx->fence });
 
     for (auto& cpy : ctx->transfer_ctx->out_memcpys) {
         memcpy(cpy.dst, cpy.src, cpy.n);
diff --git a/ggml_vk_generate_shaders.py b/ggml_vk_generate_shaders.py
index 59205908c..6713f14d1 100644
--- a/ggml_vk_generate_shaders.py
+++ b/ggml_vk_generate_shaders.py
@@ -445,7 +445,7 @@ layout (push_constant) uniform parameter
     uint K;
     uint stride_a;
     uint stride_b;
-    uint num_groups;
+    uint nel;
 } p;
 """
 
@@ -483,31 +483,30 @@ void main() {
 """
 
 dequant_q4_0_body = """
-layout(local_size_x = 32, local_size_y = 1, local_size_z = 1) in;
+layout(local_size_x = 256, local_size_y = 1, local_size_z = 1) in;
 
 layout (binding = 0) readonly buffer A {block_q4_0 data_a[];};
 layout (binding = 1) writeonly buffer D {D_TYPE data_b[];};
 
 void main() {
-    const uint i = gl_WorkGroupID.x;
+    const uint i = gl_WorkGroupID.x * 4 + gl_LocalInvocationID.x / 64;
 
-    // assume 32 threads
-    const uint tid = gl_LocalInvocationID.x;
-    const uint il  = tid/8;
-    const uint ir  = tid%8;
-    const uint ib = 8*i + ir;
-    if (ib >= p.num_groups) {
+    const uint tid = gl_LocalInvocationID.x % 64;
+    const uint il  = tid/32;
+    const uint ir  = tid%32;
+    const uint ib = 32*i + ir;
+    if (ib >= p.nel / 32) {
         return;
     }
 
-    const uint b_idx = 256*i + 32*ir + 4*il;
+    const uint b_idx = 1024*i + 32*ir + 8*il;
 
     const float d = float(data_a[ib].d);
     const float dm = -8.0f * d;
 
-    const uint q_idx = 4*il;
+    const uint q_idx = 8*il;
 
-    [[unroll]] for (uint l = 0; l < 4; ++l) {
+    [[unroll]] for (uint l = 0; l < 8; ++l) {
         data_b[b_idx + l +  0] = D_TYPE(d * (data_a[ib].qs[q_idx + l] & 0xF) + dm);
         data_b[b_idx + l + 16] = D_TYPE(d * (data_a[ib].qs[q_idx + l] >>  4) + dm);
     }