diff --git a/.gitignore b/.gitignore
index da01a6b42..9bfe49fc2 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,2 +1,8 @@
 build/**
-.build/**
\ No newline at end of file
+.build/**
+
+models/**
+
+.vscode/**
+
+**/__pycache__/**
diff --git a/ggml/src/CMakeLists.txt b/ggml/src/CMakeLists.txt
index 71934c679..03ee6904b 100644
--- a/ggml/src/CMakeLists.txt
+++ b/ggml/src/CMakeLists.txt
@@ -1,5 +1,13 @@
 include(CheckCXXCompilerFlag)
 
+option(CUSTOM_QK4_0 "Quantization block size for Q4_0 (32, 64, 128, 256)" 32)
+
+if (NOT CUSTOM_QK4_0 MATCHES "^(32|64|128|256)$")
+    message(FATAL_ERROR "Invalid CUSTOM_QK4_0 value: Must be one of {32, 64, 128, 256}")
+endif()
+
+add_compile_definitions(CUSTOM_QK4_0=${CUSTOM_QK4_0})
+
 add_compile_definitions(GGML_SCHED_MAX_COPIES=${GGML_SCHED_MAX_COPIES})
 
 # enable libstdc++ assertions for debug builds
diff --git a/ggml/src/ggml-common.h b/ggml/src/ggml-common.h
index 050161393..4b2bc6e1a 100644
--- a/ggml/src/ggml-common.h
+++ b/ggml/src/ggml-common.h
@@ -141,7 +141,16 @@ typedef sycl::half2 ggml_half2;
 
 #endif // GGML_COMMON_DECL_CUDA || GGML_COMMON_DECL_HIP
 
-#define QK4_0 32
+#ifdef CUSTOM_QK4_0
+    #define QK4_0 CUSTOM_QK4_0
+#else
+    #define QK4_0 32 // Default value for QK4_0
+#endif
+
+#if (QK4_0 != 32 && QK4_0 != 64 && QK4_0 != 128 && QK4_0 != 256)
+    #error "Invalid QK4_0 value: QK4_0 must be one of {32, 64, 128, 256}"
+#endif
+
 typedef struct {
     ggml_half d;           // delta
     uint8_t qs[QK4_0 / 2]; // nibbles / quants
@@ -183,7 +192,16 @@ typedef struct {
 } block_q5_1;
 static_assert(sizeof(block_q5_1) == 2 * sizeof(ggml_half) + sizeof(uint32_t) + QK5_1 / 2, "wrong q5_1 block size/padding");
 
-#define QK8_0 32
+#ifdef CUSTOM_QK8_0
+    #define QK8_0 CUSTOM_QK4_0
+#else
+    #define QK8_0 32
+#endif
+
+#if (QK8_0 != 32 && QK8_0 != 64 && QK8_0 != 128 && QK8_0 != 256)
+    #error "Invalid QK8_0 value: QK8_0 must be one of {32, 64, 128, 256}"
+#endif
+
 typedef struct {
     ggml_half d;       // delta
     int8_t  qs[QK8_0]; // quants
@@ -403,7 +421,7 @@ typedef union {
 } iq1m_scale_t;
 
 // Non-linear quants
-#define QK4_NL 32
+#define QK4_NL 128
 typedef struct {
     ggml_half d;
     uint8_t qs[QK4_NL/2];
diff --git a/ggml/src/ggml-cpu/ggml-cpu-quants.c b/ggml/src/ggml-cpu/ggml-cpu-quants.c
index f0e276b69..fbd9513a8 100644
--- a/ggml/src/ggml-cpu/ggml-cpu-quants.c
+++ b/ggml/src/ggml-cpu/ggml-cpu-quants.c
@@ -707,12 +707,17 @@ void quantize_row_q5_1(const float * restrict x, void * restrict y, int64_t k) {
 }
 
 void quantize_row_q8_0(const float * restrict x, void * restrict vy, int64_t k) {
-    assert(QK8_0 == 32);
+    assert(QK8_0 == 32 || QK8_0 == 64 || QK8_0 == 128 || QK8_0 == 256);
     assert(k % QK8_0 == 0);
     const int nb = k / QK8_0;
 
     block_q8_0 * restrict y = vy;
 
+#if defined(CUSTOM_QK4_0) && (CUSTOM_QK4_0 != 32)
+    GGML_UNUSED(nb);
+    // scalar
+    quantize_row_q8_0_ref(x, y, k);
+#else
 #if defined(__ARM_NEON)
     for (int i = 0; i < nb; i++) {
         float32x4_t srcv [8];
@@ -989,6 +994,7 @@ void quantize_row_q8_0(const float * restrict x, void * restrict vy, int64_t k)
     // scalar
     quantize_row_q8_0_ref(x, y, k);
 #endif
+#endif
 }
 
 void quantize_row_q8_1(const float * restrict x, void * restrict vy, int64_t k) {
@@ -1735,7 +1741,7 @@ static inline __m128i get_scale_shuffle(int i) {
 #endif
 
 void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, size_t bs, const void * restrict vx, size_t bx, const void * restrict vy, size_t by, int nrc) {
-    const int qk = QK8_0;
+    const int qk = 128;
     const int nb = n / qk;
 
     assert(n % qk == 0);
@@ -1825,6 +1831,26 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, size_t bs, const void * r
     int ib = 0;
     float sumf = 0;
 
+
+#if defined(CUSTOM_QK4_0) && (CUSTOM_QK4_0 != 32)
+    // Use only the basic implementation when CUSTOM_QK4_0 is defined and not 32
+    for (; ib < nb; ++ib) {
+        int sumi0 = 0;
+        int sumi1 = 0;
+
+        for (int j = 0; j < qk/2; ++j) {
+            const int v0 = (x[ib].qs[j] & 0x0F) - 8;
+            const int v1 = (x[ib].qs[j] >>   4) - 8;
+
+            sumi0 += (v0 * y[ib].qs[j]);
+            sumi1 += (v1 * y[ib].qs[j + qk/2]);
+        }
+
+        int sumi = sumi0 + sumi1;
+        sumf += sumi*GGML_FP16_TO_FP32(x[ib].d)*GGML_FP16_TO_FP32(y[ib].d);
+    }
+#else
+    // All the SIMD implementations
 #if defined(__ARM_FEATURE_SVE)
     svfloat32_t sumv0 = svdup_n_f32(0.0f);
     svfloat32_t sumv1 = svdup_n_f32(0.0f);
@@ -2291,7 +2317,7 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, size_t bs, const void * r
     }
 
     sumf = hsum_float_4x4(acc_0, acc_1, acc_2, acc_3);
-#endif
+#else
     for (; ib < nb; ++ib) {
         int sumi0 = 0;
         int sumi1 = 0;
@@ -2307,7 +2333,8 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, size_t bs, const void * r
         int sumi = sumi0 + sumi1;
         sumf += sumi*GGML_FP16_TO_FP32(x[ib].d)*GGML_FP16_TO_FP32(y[ib].d);
     }
-
+#endif
+#endif
     *s = sumf;
 }
 
diff --git a/ggml/src/ggml-quants.c b/ggml/src/ggml-quants.c
index 7301a9c6c..838374028 100644
--- a/ggml/src/ggml-quants.c
+++ b/ggml/src/ggml-quants.c
@@ -1819,7 +1819,7 @@ size_t quantize_q6_K(const float * restrict src, void * restrict dst, int64_t nr
 }
 
 static void quantize_row_q4_0_impl(const float * restrict x, block_q4_0 * restrict y, int64_t n_per_row, const float * quant_weights) {
-    static_assert(QK4_0 == 32, "QK4_0 must be 32");
+    static_assert(QK4_0 == 32 || QK4_0 == 64 || QK4_0 == 128 || QK4_0 == 256, "QK4_0 must be one of {32, 64, 128, 256}");
 
     if (!quant_weights) {
         quantize_row_q4_0_ref(x, y, n_per_row);