diff --git a/SHA256SUMS b/SHA256SUMS
index c3f935a85..593c8efaa 100644
--- a/SHA256SUMS
+++ b/SHA256SUMS
@@ -1,17 +1,17 @@
 700df0d3013b703a806d2ae7f1bfb8e59814e3d06ae78be0c66368a50059f33d  models/7B/consolidated.00.pth
 666a4bb533b303bdaf89e1b6a3b6f93535d868de31d903afdc20983dc526c847  models/7B/ggml-model-f16.bin
-b734d7201dc7869855fe2861247178719607d96372f0fb1bf6a1c5810898a48f  models/7B/ggml-model-q4_0.bin
-1ea1d3e94d0012ee5c23ee5ee2c8909eb124a1e8e43c11108feb17879d8b9379  models/7B/ggml-model-q4_1.bin
-3232f282b40e3330093acb96e7d4983ce15b80a7e38b49d035e83b9aab753671  models/7B/ggml-model-q5_0.bin
-75b1e0ef9a7ba27d760e4239422e29a6ced0ff9c4f2537f1cc4754821bdb8d3e  models/7B/ggml-model-q5_1.bin
+ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff  models/7B/ggml-model-q4_0.bin
+ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff  models/7B/ggml-model-q4_1.bin
+ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff  models/7B/ggml-model-q5_0.bin
+ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff  models/7B/ggml-model-q5_1.bin
 7e89e242ddc0dd6f060b43ca219ce8b3e8f08959a72cb3c0855df8bb04d46265  models/7B/params.json
 745bf4e29a4dd6f411e72976d92b452da1b49168a4f41c951cfcc8051823cf08  models/13B/consolidated.00.pth
 d5ccbcc465c71c0de439a5aeffebe8344c68a519bce70bc7f9f92654ee567085  models/13B/consolidated.01.pth
 2b206e9b21fb1076f11cafc624e2af97c9e48ea09312a0962153acc20d45f808  models/13B/ggml-model-f16.bin
-a8dd1a853a3227abda5b2046dcc23b1f06ee8b837bc97b34f6b182229eca21ff  models/13B/ggml-model-q4_0.bin
-3a58a576f0e188ad77bc5104407f1c7cf129928d1af2f920099fa206ca6af34a  models/13B/ggml-model-q4_1.bin
-814f9e369ca0daf4517b6a66bdf8d616c5d4ae8b4353fe091d15080e66965c34  models/13B/ggml-model-q5_0.bin
-74ab4eacb6ef14e08c7f06a2dd0b2630c3f920149324acf6651222ed397c430f  models/13B/ggml-model-q5_1.bin
+ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff  models/13B/ggml-model-q4_0.bin
+ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff  models/13B/ggml-model-q4_1.bin
+ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff  models/13B/ggml-model-q5_0.bin
+ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff  models/13B/ggml-model-q5_1.bin
 4ab77bec4d4405ccb66a97b282574c89a94417e3c32e5f68f37e2876fc21322f  models/13B/params.json
 e23294a58552d8cdec5b7e8abb87993b97ea6eced4178ff2697c02472539d067  models/30B/consolidated.00.pth
 4e077b7136c7ae2302e954860cf64930458d3076fcde9443f4d0e939e95903ff  models/30B/consolidated.01.pth
diff --git a/ggml-cuda.cu b/ggml-cuda.cu
index 08d1566bd..8a3beb0e5 100644
--- a/ggml-cuda.cu
+++ b/ggml-cuda.cu
@@ -86,8 +86,8 @@ static __global__ void dequantize_block_q4_0(const void * vx, float * y) {
         const int x0 = (x[i].qs[j] & 0xf) - 8;
         const int x1 = (x[i].qs[j] >>  4) - 8;
 
-        y[i*qk + 2*j + 0] = x0*d;
-        y[i*qk + 2*j + 1] = x1*d;
+        y[i*qk + j + 0   ] = x0*d;
+        y[i*qk + j + qk/2] = x1*d;
     }
 }
 
@@ -105,8 +105,8 @@ static __global__ void dequantize_block_q4_1(const void * vx, float * y) {
         const int x0 = (x[i].qs[j] & 0xf);
         const int x1 = (x[i].qs[j] >>  4);
 
-        y[i*qk + 2*j + 0] = x0*d + m;
-        y[i*qk + 2*j + 1] = x1*d + m;
+        y[i*qk + j + 0   ] = x0*d + m;
+        y[i*qk + j + qk/2] = x1*d + m;
     }
 }
 
@@ -129,8 +129,8 @@ static __global__ void dequantize_block_q5_0(const void * vx, float * y) {
         const int32_t x0 = ((x[i].qs[j] & 0xf) | xh_0) - 16;
         const int32_t x1 = ((x[i].qs[j] >>  4) | xh_1) - 16;
 
-        y[i*qk + 2*j + 0] = x0*d;
-        y[i*qk + 2*j + 1] = x1*d;
+        y[i*qk + j + 0   ] = x0*d;
+        y[i*qk + j + qk/2] = x1*d;
     }
 }
 
@@ -154,8 +154,8 @@ static __global__ void dequantize_block_q5_1(const void * vx, float * y) {
         const int x0 = (x[i].qs[j] & 0xf) | xh_0;
        const int x1 = (x[i].qs[j] >>  4) | xh_1;
 
-        y[i*qk + 2*j + 0] = x0*d + m;
-        y[i*qk + 2*j + 1] = x1*d + m;
+        y[i*qk + j + 0   ] = x0*d + m;
+        y[i*qk + j + qk/2] = x1*d + m;
     }
 }
 
@@ -168,9 +168,8 @@ static __global__ void dequantize_block_q8_0(const void * vx, float * y) {
 
     const float d = x[i].d;
 
-    for (int j = 0; j < qk/2; ++j) {
-        y[i*qk + 2*j + 0] = x[i].qs[j + 0   ]*d;
-        y[i*qk + 2*j + 1] = x[i].qs[j + qk/2]*d;
+    for (int j = 0; j < qk; ++j) {
+        y[i*qk + j] = x[i].qs[j]*d;
     }
 }
 
diff --git a/ggml-opencl.c b/ggml-opencl.c
index 230c84f2f..0e6e6770f 100644
--- a/ggml-opencl.c
+++ b/ggml-opencl.c
@@ -114,7 +114,6 @@ __kernel void dequantize_row_q8_0(__global struct block_q8_0* blocks, __global f
     const uint i = get_global_id(0) / 32;
     const uint l = get_local_id(0);
 
-    // TODO: this is broken
     result[i*32 + l] = blocks[i].qs[l] * blocks[i].d;
 }
 
diff --git a/ggml.c b/ggml.c
index a9c10a295..096ccacfb 100644
--- a/ggml.c
+++ b/ggml.c
@@ -525,30 +525,14 @@ static inline __m256i bytes_from_bits_32(const uint8_t * x) {
     return _mm256_cmpeq_epi8(bytes, _mm256_set1_epi64x(-1));
 }
 
-static inline __m256i bytes_from_nibbles_32_deinterleave(const uint8_t * rsi) {
-    const __m128i tmp = _mm_loadu_si128((const __m128i *)rsi);
-    const __m256i bytes = _mm256_set_m128i(_mm_srli_epi16(tmp, 4), tmp);
-    const __m256i lowMask = _mm256_set1_epi8( 0xF );
-    return _mm256_and_si256(lowMask, bytes);
-}
-
 // Unpack 32 4-bit fields into 32 bytes
 // The output vector contains 32 bytes, each one in [ 0 .. 15 ] interval
 static inline __m256i bytes_from_nibbles_32(const uint8_t * rsi) {
-    // Load 16 bytes from memory
-    __m128i tmp = _mm_loadu_si128( ( const __m128i* )rsi );
-
-    // Expand bytes into uint16_t values
-    __m256i bytes = _mm256_cvtepu8_epi16( tmp );
-
-    // Unpack values into individual bytes
+    const __m128i tmp = _mm_loadu_si128((const __m128i *)rsi);
+    const __m256i bytes = _mm256_set_m128i(_mm_srli_epi16(tmp, 4), tmp);
     const __m256i lowMask = _mm256_set1_epi8( 0xF );
-    __m256i high = _mm256_andnot_si256( lowMask, bytes );
-    __m256i low = _mm256_and_si256( lowMask, bytes );
-    high = _mm256_slli_epi16( high, 4 );
-    bytes = _mm256_or_si256( low, high );
-    return bytes;
+    return _mm256_and_si256(lowMask, bytes);
 }
 
 // add int16_t pairwise and return as float vector
@@ -766,8 +750,8 @@ static void quantize_row_q4_0_reference(const float * restrict x, block_q4_0 * r
         y[i].d = d;
 
         for (int j = 0; j < qk/2; ++j) {
-            const float x0 = x[i*qk + 2*j + 0]*id;
-            const float x1 = x[i*qk + 2*j + 1]*id;
+            const float x0 = x[i*qk + 0    + j]*id;
+            const float x1 = x[i*qk + qk/2 + j]*id;
 
             const uint8_t xi0 = MIN(15, (int8_t)(x0 + 8.5f));
             const uint8_t xi1 = MIN(15, (int8_t)(x1 + 8.5f));
 
@@ -807,8 +791,8 @@ static void quantize_row_q4_1_reference(const float * restrict x, block_q4_1 * r
         y[i].m = min;
 
         for (int j = 0; j < qk/2; ++j) {
-            const float x0 = (x[i*qk + 2*j + 0] - min)*id;
-            const float x1 = (x[i*qk + 2*j + 1] - min)*id;
+            const float x0 = (x[i*qk + 0    + j] - min)*id;
+            const float x1 = (x[i*qk + qk/2 + j] - min)*id;
 
             const uint8_t xi0 = MIN(15, (int8_t)(x0 + 0.5f));
             const uint8_t xi1 = MIN(15, (int8_t)(x1 + 0.5f));
 
@@ -850,8 +834,8 @@ static void quantize_row_q5_0_reference(const float * restrict x, block_q5_0 * r
         uint32_t qh = 0;
 
         for (int j = 0; j < qk/2; ++j) {
-            const float x0 = x[i*qk + 2*j + 0]*id;
-            const float x1 = x[i*qk + 2*j + 1]*id;
+            const float x0 = x[i*qk + 0    + j]*id;
+            const float x1 = x[i*qk + qk/2 + j]*id;
 
             const uint8_t xi0 = MIN(31, (int8_t)(x0 + 16.5f));
             const uint8_t xi1 = MIN(31, (int8_t)(x1 + 16.5f));
 
@@ -898,8 +882,8 @@ static void quantize_row_q5_1_reference(const float * restrict x, block_q5_1 * r
         uint32_t qh = 0;
 
         for (int j = 0; j < qk/2; ++j) {
-            const float x0 = (x[i*qk + 2*j + 0] - min)*id;
-            const float x1 = (x[i*qk + 2*j + 1] - min)*id;
+            const float x0 = (x[i*qk + 0    + j] - min)*id;
+            const float x1 = (x[i*qk + qk/2 + j] - min)*id;
 
             const uint8_t xi0 = (uint8_t)(x0 + 0.5f);
             const uint8_t xi1 = (uint8_t)(x1 + 0.5f);
 
@@ -937,12 +921,10 @@ static void quantize_row_q8_0_reference(const float * restrict x, block_q8_0 * r
 
         y[i].d = d;
 
-        for (int j = 0; j < QK8_0/2; ++j) {
-            const float v0 = x[i*QK8_0 + 2*j + 0]*id;
-            const float v1 = x[i*QK8_0 + 2*j + 1]*id;
+        for (int j = 0; j < QK8_0; ++j) {
+            const float x0 = x[i*QK8_0 + j]*id;
 
-            y[i].qs[          j] = roundf(v0);
-            y[i].qs[QK8_0/2 + j] = roundf(v1);
+            y[i].qs[j] = roundf(x0);
         }
     }
 }
@@ -978,13 +960,13 @@ static void quantize_row_q8_0(const float * restrict x, void * restrict vy, int
             const float32x4_t v  = vmulq_n_f32(srcv[j], id);
             const int32x4_t   vi = vcvtnq_s32_f32(v);
 
-            y[i].qs[     2*j + 0] = vgetq_lane_s32(vi, 0);
-            y[i].qs[16 + 2*j + 0] = vgetq_lane_s32(vi, 1);
-            y[i].qs[     2*j + 1] = vgetq_lane_s32(vi, 2);
-            y[i].qs[16 + 2*j + 1] = vgetq_lane_s32(vi, 3);
+            y[i].qs[4*j + 0] = vgetq_lane_s32(vi, 0);
+            y[i].qs[4*j + 1] = vgetq_lane_s32(vi, 1);
+            y[i].qs[4*j + 2] = vgetq_lane_s32(vi, 2);
+            y[i].qs[4*j + 3] = vgetq_lane_s32(vi, 3);
         }
     }
-#elif defined(__AVX2__)
+#elif defined(__AVX2__) || defined(__AVX__)
     for (int i = 0; i < nb; i++) {
         // Load elements into 4 AVX vectors
         __m256 v0 = _mm256_loadu_ps( x );
@@ -1029,7 +1011,7 @@ static void quantize_row_q8_0(const float * restrict x, void * restrict vy, int
         __m256i i2 = _mm256_cvtps_epi32( v2 );
         __m256i i3 = _mm256_cvtps_epi32( v3 );
 
-#if defined(__AVX2__) // || defined(__AVX__) TODO
+#if defined(__AVX2__)
         // Convert int32 to int16
         i0 = _mm256_packs_epi32( i0, i1 );  // 0, 1, 2, 3,  8, 9, 10, 11,  4, 5, 6, 7, 12, 13, 14, 15
         i2 = _mm256_packs_epi32( i2, i3 );  // 16, 17, 18, 19,  24, 25, 26, 27,  20, 21, 22, 23, 28, 29, 30, 31
@@ -1037,11 +1019,10 @@ static void quantize_row_q8_0(const float * restrict x, void * restrict vy, int
         i0 = _mm256_packs_epi16( i0, i2 );  // 0, 1, 2, 3,  8, 9, 10, 11,  16, 17, 18, 19,  24, 25, 26, 27,  4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31
 
         // We got our precious signed bytes, but the order is now wrong
-        // TODO: find a smarter way to do this
-        i2 = _mm256_permute2f128_si256(i0, i0, 0x01);
-        i1 = _mm256_shuffle_epi8(i0, _mm256_setr_epi8( 0, 2,-1,-1, 4, 6,-1,-1, 8,10,-1,-1,12,14,-1,-1,-1,-1, 1, 3,-1,-1, 5, 7,-1,-1, 9,11,-1,-1,13,15));
-        i2 = _mm256_shuffle_epi8(i2, _mm256_setr_epi8(-1,-1, 0, 2,-1,-1, 4, 6,-1,-1, 8,10,-1,-1,12,14, 1, 3,-1,-1, 5, 7,-1,-1, 9,11,-1,-1,13,15,-1,-1));
-        i0 = _mm256_or_si256(i1, i2);
+        // These AVX2 pack instructions process 16-byte pieces independently
+        // The following instruction is fixing the order
+        const __m256i perm = _mm256_setr_epi32( 0, 4, 1, 5, 2, 6, 3, 7 );
+        i0 = _mm256_permutevar8x32_epi32( i0, perm );
 
         _mm256_storeu_si256((__m256i *)y[i].qs, i0);
 #else
@@ -1097,8 +1078,8 @@ static void quantize_row_q8_1_reference(const float * restrict x, block_q8_1 * r
         int sum = 0;
 
         for (int j = 0; j < QK8_1/2; ++j) {
-            const float v0 = x[i*QK8_1 + 2*j + 0]*id;
-            const float v1 = x[i*QK8_1 + 2*j + 1]*id;
+            const float v0 = x[i*QK8_1           + j]*id;
+            const float v1 = x[i*QK8_1 + QK8_1/2 + j]*id;
 
             y[i].qs[          j] = roundf(v0);
             y[i].qs[QK8_1/2 + j] = roundf(v1);
@@ -1143,17 +1124,17 @@ static void quantize_row_q8_1(const float * restrict x, void * restrict vy, int
             const float32x4_t v  = vmulq_n_f32(srcv[j], id);
             const int32x4_t   vi = vcvtnq_s32_f32(v);
 
-            y[i].qs[     2*j + 0] = vgetq_lane_s32(vi, 0);
-            y[i].qs[16 + 2*j + 0] = vgetq_lane_s32(vi, 1);
-            y[i].qs[     2*j + 1] = vgetq_lane_s32(vi, 2);
-            y[i].qs[16 + 2*j + 1] = vgetq_lane_s32(vi, 3);
+            y[i].qs[4*j + 0] = vgetq_lane_s32(vi, 0);
+            y[i].qs[4*j + 1] = vgetq_lane_s32(vi, 1);
+            y[i].qs[4*j + 2] = vgetq_lane_s32(vi, 2);
+            y[i].qs[4*j + 3] = vgetq_lane_s32(vi, 3);
 
             accv = vaddq_s32(accv, vi);
         }
 
         y[i].s = d * vaddvq_s32(accv);
     }
-#elif defined(__AVX2__)
+#elif defined(__AVX2__) || defined(__AVX__)
     for (int i = 0; i < nb; i++) {
         // Load elements into 4 AVX vectors
         __m256 v0 = _mm256_loadu_ps( x );
@@ -1198,7 +1179,7 @@ static void quantize_row_q8_1(const float * restrict x, void * restrict vy, int
         __m256i i2 = _mm256_cvtps_epi32( v2 );
         __m256i i3 = _mm256_cvtps_epi32( v3 );
 
-#if defined(__AVX2__) // || defined(__AVX__) TODO
+#if defined(__AVX2__)
         // Compute the sum of the quants and set y[i].s
         y[i].s = d * hsum_i32_8(_mm256_add_epi32(_mm256_add_epi32(i0, i1), _mm256_add_epi32(i2, i3)));
 
@@ -1209,11 +1190,10 @@ static void quantize_row_q8_1(const float * restrict x, void * restrict vy, int
         i0 = _mm256_packs_epi16( i0, i2 );  // 0, 1, 2, 3,  8, 9, 10, 11,  16, 17, 18, 19,  24, 25, 26, 27,  4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31
 
         // We got our precious signed bytes, but the order is now wrong
-        // TODO: find a smarter way to do this
-        i2 = _mm256_permute2f128_si256(i0, i0, 0x01);
-        i1 = _mm256_shuffle_epi8(i0, _mm256_setr_epi8( 0, 2,-1,-1, 4, 6,-1,-1, 8,10,-1,-1,12,14,-1,-1,-1,-1, 1, 3,-1,-1, 5, 7,-1,-1, 9,11,-1,-1,13,15));
-        i2 = _mm256_shuffle_epi8(i2, _mm256_setr_epi8(-1,-1, 0, 2,-1,-1, 4, 6,-1,-1, 8,10,-1,-1,12,14, 1, 3,-1,-1, 5, 7,-1,-1, 9,11,-1,-1,13,15,-1,-1));
-        i0 = _mm256_or_si256(i1, i2);
+        // These AVX2 pack instructions process 16-byte pieces independently
+        // The following instruction is fixing the order
+        const __m256i perm = _mm256_setr_epi32( 0, 4, 1, 5, 2, 6, 3, 7 );
+        i0 = _mm256_permutevar8x32_epi32( i0, perm );
 
         _mm256_storeu_si256((__m256i *)y[i].qs, i0);
 #else
@@ -1266,8 +1246,8 @@ static void dequantize_row_q4_0(const block_q4_0 * restrict x, float * restrict
             const int x0 = (x[i].qs[j] & 0x0F) - 8;
             const int x1 = (x[i].qs[j] >>   4) - 8;
 
-            y[i*qk + 2*j + 0] = x0*d;
-            y[i*qk + 2*j + 1] = x1*d;
+            y[i*qk + j + 0   ] = x0*d;
+            y[i*qk + j + qk/2] = x1*d;
         }
     }
 }
@@ -1287,8 +1267,8 @@ static void dequantize_row_q4_1(const block_q4_1 * restrict x, float * restrict
             const int x0 = (x[i].qs[j] & 0x0F);
             const int x1 = (x[i].qs[j] >>   4);
 
-            y[i*qk + 2*j + 0] = x0*d + m;
-            y[i*qk + 2*j + 1] = x1*d + m;
+            y[i*qk + j + 0   ] = x0*d + m;
+            y[i*qk + j + qk/2] = x1*d + m;
         }
     }
 }
@@ -1313,8 +1293,8 @@ static void dequantize_row_q5_0(const block_q5_0 * restrict x, float * restrict
             const int32_t x0 = ((x[i].qs[j] & 0x0F) | xh_0) - 16;
             const int32_t x1 = ((x[i].qs[j] >>   4) | xh_1) - 16;
 
-            y[i*qk + 2*j + 0] = x0*d;
-            y[i*qk + 2*j + 1] = x1*d;
+            y[i*qk + j + 0   ] = x0*d;
+            y[i*qk + j + qk/2] = x1*d;
         }
     }
 }
@@ -1340,8 +1320,8 @@ static void dequantize_row_q5_1(const block_q5_1 * restrict x, float * restrict
             const int x0 = (x[i].qs[j] & 0x0F) | xh_0;
             const int x1 = (x[i].qs[j] >>   4) | xh_1;
 
-            y[i*qk + 2*j + 0] = x0*d + m;
-            y[i*qk + 2*j + 1] = x1*d + m;
+            y[i*qk + j + 0   ] = x0*d + m;
+            y[i*qk + j + qk/2] = x1*d + m;
         }
     }
 }
@@ -1358,9 +1338,8 @@ static void dequantize_row_q8_0(const void * restrict vx, float * restrict y, in
     for (int i = 0; i < nb; i++) {
         const float d = x[i].d;
 
-        for (int j = 0; j < qk/2; ++j) {
-            y[i*qk + 2*j + 0] = x[i].qs[j + 0   ]*d;
-            y[i*qk + 2*j + 1] = x[i].qs[j + qk/2]*d;
+        for (int j = 0; j < qk; ++j) {
+            y[i*qk + j] = x[i].qs[j]*d;
         }
     }
 }
@@ -2103,7 +2082,7 @@ static void ggml_vec_dot_q4_0_q8_0(const int n, float * restrict s, const void *
         /* Compute combined scale for the block */
         const __m256 d = _mm256_mul_ps( _mm256_broadcast_ss( &x[i].d ), _mm256_broadcast_ss( &y[i].d ) );
 
-        __m256i bx = bytes_from_nibbles_32_deinterleave(x[i].qs);
+        __m256i bx = bytes_from_nibbles_32(x[i].qs);
 
         // Now we have a vector with bytes in [ 0 .. 15 ] interval. Offset them into [ -8 .. +7 ] interval.
         const __m256i off = _mm256_set1_epi8( 8 );
@@ -2262,7 +2241,7 @@ static void ggml_vec_dot_q4_1_q8_1(const int n, float * restrict s, const void *
         const __m256 d0d1 = _mm256_mul_ps( d0v, d1v );
 
         // Load 16 bytes, and unpack 4 bit fields into bytes, making 32 bytes
-        const __m256i bx = bytes_from_nibbles_32_deinterleave(x[i].qs);
+        const __m256i bx = bytes_from_nibbles_32(x[i].qs);
         const __m256i by = _mm256_loadu_si256( (const __m256i *)y[i].qs );
 
         const __m256 xy = mul_sum_i8_pairs_float(bx, by);
@@ -2466,7 +2445,7 @@ static void ggml_vec_dot_q5_0_q8_0(const int n, float * restrict s, const void *
         /* Compute combined scale for the block */
         const __m256 d = _mm256_mul_ps(_mm256_set1_ps(GGML_FP16_TO_FP32(x[i].d)), _mm256_broadcast_ss(&y[i].d));
 
-        __m256i bx = bytes_from_nibbles_32_deinterleave(x[i].qs);
+        __m256i bx = bytes_from_nibbles_32(x[i].qs);
         __m256i bxhi = bytes_from_bits_32(x[i].qh);
         bxhi = _mm256_andnot_si256(bxhi, _mm256_set1_epi8((char)0xF0));
         bx = _mm256_or_si256(bx, bxhi);
@@ -2694,7 +2673,7 @@ static void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void *
 
         summs += GGML_FP16_TO_FP32(x[i].m) * y[i].s;
 
-        __m256i bx = bytes_from_nibbles_32_deinterleave(x[i].qs);
+        __m256i bx = bytes_from_nibbles_32(x[i].qs);
         __m256i bxhi = bytes_from_bits_32(x[i].qh);
         bxhi = _mm256_and_si256(bxhi, _mm256_set1_epi8(0x10));
         bx = _mm256_or_si256(bx, bxhi);
@@ -2719,8 +2698,8 @@ static void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void *
         int sumi = 0;
 
         for (int j = 0; j < qk/2; ++j) {
-            const uint8_t xh_0 = ((qh & (1u << (j + 0 ))) >> (j + 0 )) << 4;
-            const uint8_t xh_1 = ((qh & (1u << (j + 16))) >> (j + 12));
+            const uint8_t xh_0 = ((qh >> (j +  0)) << 4) & 0x10;
+            const uint8_t xh_1 = ((qh >> (j + 12))     ) & 0x10;
 
             const int32_t x0 = (x[i].qs[j] & 0xF) | xh_0;
             const int32_t x1 = (x[i].qs[j] >>  4) | xh_1;
diff --git a/llama.cpp b/llama.cpp
index b2dbc6c3b..b27eb91e4 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -919,7 +919,9 @@ static void llama_model_load_internal(
     }
 
     if (file_version != LLAMA_FILE_VERSION_GGJT_V2) {
-        if (hparams.ftype == LLAMA_FTYPE_MOSTLY_Q8_0) {
+        if (hparams.ftype != LLAMA_FTYPE_ALL_F32 &&
+            hparams.ftype != LLAMA_FTYPE_MOSTLY_F16 &&
+            hparams.ftype != LLAMA_FTYPE_MOSTLY_Q8_0) {
            throw format("this format is no longer supported (see https://github.com/ggerganov/llama.cpp/pull/1305)");
         }
     }
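Background on the layout change (illustrative sketches, not part of the patch):

The patch drops the interleaved quant layout, where the low and high nibbles of `qs[j]` produced neighbouring outputs `2*j` and `2*j + 1`, in favour of a flat layout where the low nibbles fill the first half of a block and the high nibbles the second half. The sketch below contrasts the two orderings for a q4_0-style block; `block_q4_0_demo`, its plain `float` scale, and the test pattern are invented for the demo and are not the actual ggml structs (the real block stores an fp16 scale).

```c
// Contrast of the old interleaved layout and the new flat layout for a
// q4_0-style block. Hypothetical demo types; not the actual ggml structs.
#include <stdint.h>
#include <stdio.h>

#define QK 32

struct block_q4_0_demo {
    float   d;          // scale (the real block_q4_0 stores this as fp16)
    uint8_t qs[QK / 2]; // 32 x 4-bit quants packed into 16 bytes
};

// Old layout: the two nibbles of qs[j] land next to each other, at 2*j and 2*j + 1.
static void dequant_interleaved(const struct block_q4_0_demo * x, float * y) {
    for (int j = 0; j < QK / 2; ++j) {
        y[2*j + 0] = ((x->qs[j] & 0x0F) - 8) * x->d;
        y[2*j + 1] = ((x->qs[j] >>   4) - 8) * x->d;
    }
}

// New layout: low nibbles fill y[0 .. QK/2), high nibbles fill y[QK/2 .. QK),
// so a SIMD path can mask/shift 16 bytes once and store both halves contiguously.
static void dequant_flat(const struct block_q4_0_demo * x, float * y) {
    for (int j = 0; j < QK / 2; ++j) {
        y[j + 0     ] = ((x->qs[j] & 0x0F) - 8) * x->d;
        y[j + QK / 2] = ((x->qs[j] >>   4) - 8) * x->d;
    }
}

int main(void) {
    struct block_q4_0_demo b = { .d = 1.0f };
    for (int j = 0; j < QK / 2; ++j) {
        b.qs[j] = (uint8_t)(j | ((15 - j) << 4)); // arbitrary test pattern
    }

    float y_old[QK], y_new[QK];
    dequant_interleaved(&b, y_old);
    dequant_flat(&b, y_new);

    // Same values, different element order: the flat format keeps the
    // low-nibble half and the high-nibble half back to back.
    printf("interleaved: y[0]=%g y[1]=%g   flat: y[0]=%g y[%d]=%g\n",
           y_old[0], y_old[1], y_new[0], QK / 2, y_new[QK / 2]);
    return 0;
}
```

With the interleaving gone, quantize_row_q8_0 and quantize_row_q8_1 only need sequential byte order after the two AVX2 pack steps, so the old permute2f128 plus two byte shuffles collapse into a single cross-lane `_mm256_permutevar8x32_epi32`. The standalone check below, which assumes an AVX2-capable CPU and compilation with `-mavx2`, verifies that the dword permutation `(0, 4, 1, 5, 2, 6, 3, 7)` restores the order 0..31:

```c
// Verify that a single cross-lane dword permute undoes the lane-local
// scrambling introduced by _mm256_packs_epi32 / _mm256_packs_epi16.
// Standalone check, not part of the patch; requires AVX2 (-mavx2).
#include <immintrin.h>
#include <stdint.h>
#include <stdio.h>

int main(void) {
    // 32 int32 values 0..31, as produced by _mm256_cvtps_epi32 in quantize_row_q8_0.
    __m256i i0 = _mm256_setr_epi32( 0,  1,  2,  3,  4,  5,  6,  7);
    __m256i i1 = _mm256_setr_epi32( 8,  9, 10, 11, 12, 13, 14, 15);
    __m256i i2 = _mm256_setr_epi32(16, 17, 18, 19, 20, 21, 22, 23);
    __m256i i3 = _mm256_setr_epi32(24, 25, 26, 27, 28, 29, 30, 31);

    // Pack int32 -> int16 -> int8. Each pack operates on the two 128-bit
    // lanes independently, which is what scrambles the byte order.
    i0 = _mm256_packs_epi32(i0, i1);
    i2 = _mm256_packs_epi32(i2, i3);
    i0 = _mm256_packs_epi16(i0, i2);

    // One cross-lane permute of 32-bit groups restores sequential order.
    const __m256i perm = _mm256_setr_epi32(0, 4, 1, 5, 2, 6, 3, 7);
    i0 = _mm256_permutevar8x32_epi32(i0, perm);

    int8_t out[32];
    _mm256_storeu_si256((__m256i *)out, i0);
    for (int j = 0; j < 32; ++j) {
        printf("%d ", out[j]); // prints 0 1 2 ... 31
    }
    printf("\n");
    return 0;
}
```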