diff --git a/ggml-common.h b/ggml-common.h
index 2bbf5c0f9..aa065f482 100644
--- a/ggml-common.h
+++ b/ggml-common.h
@@ -379,12 +379,11 @@ static_assert(sizeof(block_iq1_s) == sizeof(ggml_half) + QK_K/8 + QK_K/16, "wron
 // 1.8125 bpw
 typedef struct {
-    ggml_half d;
     uint8_t  qs[QK_K/8];      // grid index, low 8 bits
     uint8_t  qh[QK_K/16];     // grid index, high 3 bits + grid shift bit (for two groups of 8)
     uint8_t  scales[QK_K/32]; // 4-bit block scales
 } block_iq1_m;
-static_assert(sizeof(block_iq1_m) == sizeof(ggml_half) + QK_K/8 + QK_K/16 + QK_K/32, "wrong iq1_m block size/padding");
+static_assert(sizeof(block_iq1_m) == QK_K/8 + QK_K/16 + QK_K/32, "wrong iq1_m block size/padding");
 
 // Non-linear quants
 #define QK4_NL 32
diff --git a/ggml-quants.c b/ggml-quants.c
index fdc8fdbe4..343859ebb 100644
--- a/ggml-quants.c
+++ b/ggml-quants.c
@@ -3474,6 +3474,11 @@ void dequantize_row_iq1_s(const block_iq1_s * restrict x, float * restrict y, in
     }
 }
 
+typedef union {
+    ggml_fp16_t fp16;
+    uint16_t u16;
+} iq1m_scale_t;
+
 void dequantize_row_iq1_m(const block_iq1_m * restrict x, float * restrict y, int k) {
     assert(k % QK_K == 0);
     const int nb = k / QK_K;
@@ -3481,16 +3486,19 @@ void dequantize_row_iq1_m(const block_iq1_m * restrict x, float * restrict y, in
 
     float delta[4];
     uint16_t idx[4];
 
+    iq1m_scale_t scale;
+
     for (int i = 0; i < nb; i++) {
-        const float d1 = GGML_FP16_TO_FP32(x[i].d);
-        const float d2 = d1 / 16;
+        const uint16_t * sc = (const uint16_t *)x[i].scales;
+        scale.u16 = (sc[0] >> 12) | ((sc[1] >> 8) & 0x00f0) | ((sc[2] >> 4) & 0x0f00) | (sc[3] & 0xf000);
+        const float d = GGML_FP16_TO_FP32(scale.fp16);
         const uint8_t * qs = x[i].qs;
         const uint8_t * qh = x[i].qh;
 
         for (int ib = 0; ib < QK_K/32; ++ib) {
-            const float dl1 = d1 * (2*(x[i].scales[ib] & 0x0f) + 1);
-            const float dl2 = d2 * (2*(x[i].scales[ib] & 0xf0) + 16);
+            const float dl1 = d * (2*((sc[ib/2] >> (6*(ib%2)+0)) & 0x7) + 1);
+            const float dl2 = d * (2*((sc[ib/2] >> (6*(ib%2)+3)) & 0x7) + 1);
             idx[0] = qs[0] | ((qh[0] << 8) & 0x700);
             idx[1] = qs[1] | ((qh[0] << 4) & 0x700);
             idx[2] = qs[2] | ((qh[1] << 8) & 0x700);
@@ -11700,7 +11708,7 @@ static int iq1_sort_helper(const void * left, const void * right) {
 
 #define IQ1S_BLOCK_SIZE 32
 #define IQ1M_BLOCK_SIZE 16
-static void quantize_row_iq1_impl(enum ggml_type type, const float * restrict x, void * restrict vy, int n, const float * restrict quant_weights,
+static void quantize_row_iq1_s_impl(const float * restrict x, void * restrict vy, int n, const float * restrict quant_weights,
         float * scales,
         float * weight,
         float * sumx,
@@ -11722,9 +11730,11 @@ static void quantize_row_iq1_impl(enum ggml_type type, const float * restrict x,
     GGML_ASSERT(kneighbors_q2xs && "forgot to call ggml_quantize_init()?");
     GGML_ASSERT(n%QK_K == 0);
 
+    block_iq1_s * y = vy;
+
     const int nbl = n/QK_K;
 
-    const int block_size = type == GGML_TYPE_IQ1_S ? IQ1S_BLOCK_SIZE : IQ1M_BLOCK_SIZE;
+    const int block_size = IQ1S_BLOCK_SIZE;
 
     const float x_p[3] = {-1 + IQ1S_DELTA, IQ1S_DELTA, 1 + IQ1S_DELTA};
     const float x_m[3] = {-1 - IQ1S_DELTA, -IQ1S_DELTA, 1 - IQ1S_DELTA};
@@ -11734,18 +11744,9 @@ static void quantize_row_iq1_impl(enum ggml_type type, const float * restrict x,
 
     for (int ibl = 0; ibl < nbl; ++ibl) {
 
-        if (type == GGML_TYPE_IQ1_S) {
-            block_iq1_s * y = vy;
-            y[ibl].d = GGML_FP32_TO_FP16(0.f);
-            memset(y[ibl].qs, 0, QK_K/8);
-            memset(y[ibl].qh, 0, QK_K/16);
-        } else {
-            block_iq1_m * y = vy;
-            y[ibl].d = GGML_FP32_TO_FP16(0.f);
-            memset(y[ibl].qs, 0, QK_K/8);
-            memset(y[ibl].qh, 0, QK_K/16);
-            memset(y[ibl].scales, 0, QK_K/32);
-        }
+        y[ibl].d = GGML_FP32_TO_FP16(0.f);
+        memset(y[ibl].qs, 0, QK_K/8);
+        memset(y[ibl].qh, 0, QK_K/16);
 
         float max_scale = 0;
 
@@ -11785,7 +11786,6 @@ static void quantize_row_iq1_impl(enum ggml_type type, const float * restrict x,
             }
         }
         float best_score = 0, scale = max;
-        // TODO: we need two shifts per block for IQ1_M.
         int besti1 = -1, besti2 = -1, best_shift = 0;
         for (int i1 = 0; i1 <= block_size; ++i1) {
             for (int i2 = i1; i2 <= block_size; ++i2) {
@@ -11838,20 +11838,12 @@ static void quantize_row_iq1_impl(enum ggml_type type, const float * restrict x,
             }
             if (sumqx > 0 && sumq2 > 0) scale = sumqx/sumq2;
         }
-        if (type == GGML_TYPE_IQ1_S) {
-            block_iq1_s * y = vy;
-            uint16_t h = 0;
-            for (int k = 0; k < block_size/8; ++k) {
-                y[ibl].qs[(block_size/8)*ib + k] = index[k] & 255;
-                h |= (index[k] >> 8) << 3*k;
-            }
-            y[ibl].qh[ib] = h;
-        } else {
-            block_iq1_m * y = vy;
-            y[ibl].qs[2*ib + 0] = index[0] & 255;
-            y[ibl].qs[2*ib + 1] = index[1] & 255;
-            y[ibl].qh[ib] = (index[0] >> 8) | ((index[1] >> 8) << 4);
+        uint16_t h = 0;
+        for (int k = 0; k < block_size/8; ++k) {
+            y[ibl].qs[(block_size/8)*ib + k] = index[k] & 255;
+            h |= (index[k] >> 8) << 3*k;
         }
+        y[ibl].qh[ib] = h;
         GGML_ASSERT(scale >= 0);
         scales[ib] = scale;
         shifts[ib] = best_shift;
@@ -11862,33 +11854,14 @@ static void quantize_row_iq1_impl(enum ggml_type type, const float * restrict x,
             continue;
         }
 
-        if (type == GGML_TYPE_IQ1_S) {
-            float d = max_scale/15;
-            block_iq1_s * y = vy;
-            y[ibl].d = GGML_FP32_TO_FP16(d*1.125f); // 1.125f is another fudge factor. Don't ask me why it is needed.
-            float id = 1/d;
-            for (int ib = 0; ib < QK_K/block_size; ++ib) {
-                int l = nearest_int(0.5f*(id*scales[ib]-1));
-                l = MAX(0, MIN(7, l));
-                if (shifts[ib] == -1) l |= 8;
-                y[ibl].qh[ib] |= (l << 12);
-            }
-        } else {
-            block_iq1_m * y = vy;
-            float d = max_scale/31;
-            y[ibl].d = GGML_FP32_TO_FP16(d*1.125f); // 1.125f is another fudge factor. Don't ask me why it is needed.
-            float id = 1/d;
-            for (int ib = 0; ib < QK_K/block_size; ib += 2) {
-                int l1 = nearest_int(0.5f*(id*scales[ib+0]-1));
-                l1 = MAX(0, MIN(15, l1));
-                int l2 = nearest_int(0.5f*(id*scales[ib+1]-1));
-                l2 = MAX(0, MIN(15, l2));
-                y[ibl].scales[ib/2] = l1 | (l2 << 4);
-                // TODO: we need two shifts per block for IQ1_M.
-                // For now we use the same shift for both groups of 8 in the block, thus wasting 1 pet per 16 weights.
-                if (shifts[ib+0] == -1) y[ibl].qh[ib+0] |= 0x88;
-                if (shifts[ib+1] == -1) y[ibl].qh[ib+1] |= 0x88;
-            }
+        float d = max_scale/15;
+        y[ibl].d = GGML_FP32_TO_FP16(d*1.125f); // 1.125f is another fudge factor. Don't ask me why it is needed.
+        float id = 1/d;
+        for (int ib = 0; ib < QK_K/block_size; ++ib) {
+            int l = nearest_int(0.5f*(id*scales[ib]-1));
+            l = MAX(0, MIN(7, l));
+            if (shifts[ib] == -1) l |= 8;
+            y[ibl].qh[ib] |= (l << 12);
         }
     }
 }
@@ -11906,7 +11879,7 @@ size_t quantize_iq1_s(const float * restrict src, void * restrict dst, int nrow,
     int nblock = n_per_row/QK_K;
     char * qrow = (char *)dst;
     for (int row = 0; row < nrow; ++row) {
-        quantize_row_iq1_impl(GGML_TYPE_IQ1_S, src, qrow, n_per_row, quant_weights, scales, weight, sumx, sumw, pairs, L, index, shifts);
+        quantize_row_iq1_s_impl(src, qrow, n_per_row, quant_weights, scales, weight, sumx, sumw, pairs, L, index, shifts);
         src += n_per_row;
         qrow += nblock*sizeof(block_iq1_s);
     }
@@ -11947,9 +11920,11 @@ static void quantize_row_iq1_m_impl(const float * restrict x, void * restrict vy
 
     float sumqx[4], sumq2[4];
 
+    iq1m_scale_t s;
+
     for (int ibl = 0; ibl < nbl; ++ibl) {
 
-        y[ibl].d = GGML_FP32_TO_FP16(0.f);
+        //y[ibl].d = GGML_FP32_TO_FP16(0.f);
         memset(y[ibl].qs, 0, QK_K/8);
         memset(y[ibl].qh, 0, QK_K/16);
         memset(y[ibl].scales, 0, QK_K/32);
@@ -12120,18 +12095,30 @@ static void quantize_row_iq1_m_impl(const float * restrict x, void * restrict vy
             continue;
        }
 
-        float d = max_scale/31;
-        y[ibl].d = GGML_FP32_TO_FP16(d*1.125f); // 1.125f is another fudge factor. Don't ask me why it is needed.
+        uint16_t * sc = (uint16_t *)y[ibl].scales;
+        float d = max_scale/15;
         float id = 1/d;
-        for (int ib = 0; ib < QK_K/block_size; ib += 2) {
-            int l1 = nearest_int(0.5f*(id*scales[ib+0]-1));
-            l1 = MAX(0, MIN(15, l1));
-            int l2 = nearest_int(0.5f*(id*scales[ib+1]-1));
-            l2 = MAX(0, MIN(15, l2));
-            y[ibl].scales[ib/2] = l1 | (l2 << 4);
-            y[ibl].qh[ib+0] |= masks[shifts[ib+0]];
-            y[ibl].qh[ib+1] |= masks[shifts[ib+1]];
+        for (int ib = 0; ib < QK_K/block_size; ++ib) {
+            int l = nearest_int(0.5f*(id*scales[ib+0]-1));
+            l = MAX(0, MIN(7, l));
+            sc[ib/4] |= (l << 3*(ib%4));
+            y[ibl].qh[ib] |= masks[shifts[ib]];
         }
+        s.fp16 = GGML_FP32_TO_FP16(d*1.125f); // 1.125f is another fudge factor. Don't ask me why it is needed.
+        sc[0] |= ((s.u16 & 0x000f) << 12);
+        sc[1] |= ((s.u16 & 0x00f0) << 8);
+        sc[2] |= ((s.u16 & 0x0f00) << 4);
+        sc[3] |= ((s.u16 & 0xf000) << 0);
+        //y[ibl].d = GGML_FP32_TO_FP16(d*1.125f); // 1.125f is another fudge factor. Don't ask me why it is needed.
+        //for (int ib = 0; ib < QK_K/block_size; ib += 2) {
+        //    int l1 = nearest_int(0.5f*(id*scales[ib+0]-1));
+        //    l1 = MAX(0, MIN(7, l1));
+        //    int l2 = nearest_int(0.5f*(id*scales[ib+1]-1));
+        //    l2 = MAX(0, MIN(7, l2));
+        //    y[ibl].scales[ib/2] = l1 | (l2 << 4);
+        //    y[ibl].qh[ib+0] |= masks[shifts[ib+0]];
+        //    y[ibl].qh[ib+1] |= masks[shifts[ib+1]];
+        //}
     }
 }
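Note for reviewers: the core trick of this patch is dropping the dedicated `ggml_half d` field from `block_iq1_m` and smuggling the fp16 super-block scale into the top 4 bits of the four 16-bit words overlaying `scales[QK_K/32]`; the low 12 bits of each word hold four 3-bit block scales. With QK_K = 256 this recovers the 2 bytes the `d` field used to cost, taking the block from 1.8125 bpw down to 1.75 bpw. Since the pack/unpack round trip is easy to get wrong, below is a minimal standalone sketch of just that step. The helper names `pack_iq1m_scale`/`unpack_iq1m_scale` and the raw-bits `uint16_t` stand-in for `ggml_fp16_t` are illustrative only, not part of the patch:

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

/* Pack: stash nibble k of the raw fp16 bits into bits 12..15 of sc[k],
 * mirroring the sc[0..3] |= ... lines in quantize_row_iq1_m_impl. */
static void pack_iq1m_scale(uint16_t sc[4], uint16_t fp16_bits) {
    sc[0] = (uint16_t)((sc[0] & 0x0fff) | ((fp16_bits & 0x000f) << 12));
    sc[1] = (uint16_t)((sc[1] & 0x0fff) | ((fp16_bits & 0x00f0) <<  8));
    sc[2] = (uint16_t)((sc[2] & 0x0fff) | ((fp16_bits & 0x0f00) <<  4));
    sc[3] = (uint16_t)((sc[3] & 0x0fff) | ((fp16_bits & 0xf000) <<  0));
}

/* Unpack: the same expression dequantize_row_iq1_m uses to recover scale.u16. */
static uint16_t unpack_iq1m_scale(const uint16_t sc[4]) {
    return (uint16_t)((sc[0] >> 12) | ((sc[1] >> 8) & 0x00f0) |
                      ((sc[2] >> 4) & 0x0f00) | (sc[3] & 0xf000));
}

int main(void) {
    /* Low 12 bits of each word hold four 3-bit block scales, as filled by
     * sc[ib/4] |= (l << 3*(ib%4)); the top nibbles start out zero. */
    uint16_t sc[4] = {0x0abc, 0x0123, 0x0456, 0x0789};

    const uint16_t d_bits = 0x3c00; /* raw fp16 bits of 1.0, just as an example */

    pack_iq1m_scale(sc, d_bits);
    assert(unpack_iq1m_scale(sc) == d_bits); /* round trip is exact    */
    assert((sc[0] & 0x0fff) == 0x0abc);      /* block scales untouched */

    printf("recovered fp16 bits: 0x%04x\n", unpack_iq1m_scale(sc));
    return 0;
}

The union in the patch (`iq1m_scale_t`) exists only to reinterpret those reassembled 16 bits as `ggml_fp16_t` without an aliasing cast; the sketch sidesteps it by keeping everything in raw bits.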