diff --git a/examples/quantize/quantize.cpp b/examples/quantize/quantize.cpp
index 2c22f8450..6a6892a05 100644
--- a/examples/quantize/quantize.cpp
+++ b/examples/quantize/quantize.cpp
@@ -263,7 +263,6 @@ static bool read_custom_quant_config(const std::string& filename, llama_model_qu
     while (getline(file, line)) {
         // Skip empty lines and comments
         if (line.empty() || line[0] == '#') continue;
-        printf(" %s\n", line.c_str());
 
         // default file type
         if (line.find("ftype=") == 0) {
diff --git a/llama.cpp b/llama.cpp
index bf48e38e3..c2c5be35d 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -14196,6 +14196,26 @@ static size_t llama_tensor_quantize_internal(enum ggml_type new_type, const floa
     return new_size;
 }
 
+static bool match_string(const std::string& str, const std::string& pattern, uint32_t string_index = 0, uint32_t pattern_index = 0) {
+    // if both index pointers reach the end of str and pattern respectively
+    if (string_index == str.size() && pattern_index == pattern.size()) {
+        return true;
+    }
+
+    // if the pattern character is '*', it can match any sequence of characters
+    if (pattern_index < pattern.size() && pattern[pattern_index] == '*') {
+        // either skip the '*' (it matches the empty sequence) or consume one character of str and keep the '*'
+        return match_string(str, pattern, string_index, pattern_index + 1) || (string_index < str.size() && match_string(str, pattern, string_index + 1, pattern_index));
+    }
+
+    // if the current characters match, or the pattern character is '?'
+    if (string_index < str.size() && pattern_index < pattern.size() && (str[string_index] == pattern[pattern_index] || pattern[pattern_index] == '?')) {
+        return match_string(str, pattern, string_index + 1, pattern_index + 1);
+    }
+
+    return false;
+}
+
 static void llama_model_quantize_internal(const std::string & fname_inp, const std::string & fname_out, const llama_model_quantize_params * params) {
     ggml_type default_type;
     llama_ftype ftype = params->ftype;
@@ -14428,8 +14448,8 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
         // type as determined by the ftype.
         if(params->override_ftype) {
             for (uint32_t i = 0; i < params->override_ftype->count; ++i) {
-                if (strcmp(params->override_ftype->names[i], tensor->name) == 0) {
-                    //LLAMA_LOG_INFO("\n%s: %s %s ---> %s\n", __func__, tensor->name, ggml_type_name(new_type), ggml_type_name(params->override_ftype->types[i]));
+                if (match_string(tensor->name, params->override_ftype->names[i])) {
+                    // printf("\n -----> %s, %s\n", params->override_ftype->names[i], tensor->name);
                     new_type = params->override_ftype->types[i];
                     break;
                 }
diff --git a/quant.cfg b/quant.cfg
index f97dbe47e..186166716 100644
--- a/quant.cfg
+++ b/quant.cfg
@@ -1,54 +1,60 @@
-ftype=15
+# this defines the default ftype (the quantization mix code
+# that you pass to quantize when not using a custom mix).
+# tensors that are not overridden below will be quantized
+# according to this scheme.
 
-blk.12.ffn_down.weight=11
-blk.12.ffn_up.weight=11
+ftype=7
+
+# allowed values:
+# LLAMA_FTYPE_ALL_F32 = 0,
+# LLAMA_FTYPE_MOSTLY_F16 = 1, // except 1d tensors
+# LLAMA_FTYPE_MOSTLY_Q4_0 = 2, // except 1d tensors
+# LLAMA_FTYPE_MOSTLY_Q4_1 = 3, // except 1d tensors
+# LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16 = 4, // tok_embeddings.weight and output.weight are F16
+# // LLAMA_FTYPE_MOSTLY_Q4_2 = 5, // support has been removed
+# // LLAMA_FTYPE_MOSTLY_Q4_3 = 6, // support has been removed
+# LLAMA_FTYPE_MOSTLY_Q8_0 = 7, // except 1d tensors
+# LLAMA_FTYPE_MOSTLY_Q5_0 = 8, // except 1d tensors
+# LLAMA_FTYPE_MOSTLY_Q5_1 = 9, // except 1d tensors
+# LLAMA_FTYPE_MOSTLY_Q2_K = 10, // except 1d tensors
+# LLAMA_FTYPE_MOSTLY_Q3_K_S = 11, // except 1d tensors
+# LLAMA_FTYPE_MOSTLY_Q3_K_M = 12, // except 1d tensors
+# LLAMA_FTYPE_MOSTLY_Q3_K_L = 13, // except 1d tensors
+# LLAMA_FTYPE_MOSTLY_Q4_K_S = 14, // except 1d tensors
+# LLAMA_FTYPE_MOSTLY_Q4_K_M = 15, // except 1d tensors
+# LLAMA_FTYPE_MOSTLY_Q5_K_S = 16, // except 1d tensors
+# LLAMA_FTYPE_MOSTLY_Q5_K_M = 17, // except 1d tensors
+# LLAMA_FTYPE_MOSTLY_Q6_K = 18, // except 1d tensors
+# LLAMA_FTYPE_MOSTLY_IQ2_XXS = 19, // except 1d tensors
+# LLAMA_FTYPE_MOSTLY_IQ2_XS = 20, // except 1d tensors
+# LLAMA_FTYPE_MOSTLY_Q2_K_S = 21, // except 1d tensors
+# LLAMA_FTYPE_MOSTLY_IQ3_XS = 22, // except 1d tensors
+# LLAMA_FTYPE_MOSTLY_IQ3_XXS = 23, // except 1d tensors
+# LLAMA_FTYPE_MOSTLY_IQ1_S = 24, // except 1d tensors
+# LLAMA_FTYPE_MOSTLY_IQ4_NL = 25, // except 1d tensors
+# LLAMA_FTYPE_MOSTLY_IQ3_S = 26, // except 1d tensors
+# LLAMA_FTYPE_MOSTLY_IQ3_M = 27, // except 1d tensors
+# LLAMA_FTYPE_MOSTLY_IQ2_S = 28, // except 1d tensors
+# LLAMA_FTYPE_MOSTLY_IQ2_M = 29, // except 1d tensors
+# LLAMA_FTYPE_MOSTLY_IQ4_XS = 30, // except 1d tensors
+# LLAMA_FTYPE_MOSTLY_IQ1_M = 31, // except 1d tensors
 
-blk.13.ffn_down.weight=11
-blk.13.ffn_up.weight=11
+# this defines overrides for tensors whose names match
+# a given pattern. filters are processed in the order given,
+# and the first match wins.
+# Wildcards are allowed:
+#    ?   matches a single character
+#    *   matches any sequence of characters
 
-blk.14.ffn_down.weight=11
-blk.14.ffn_up.weight=11
-
-blk.15.ffn_down.weight=11
-blk.15.ffn_up.weight=11
-
-blk.16.ffn_up.weight=10
-blk.17.ffn_up.weight=10
-blk.18.ffn_up.weight=10
-blk.19.ffn_up.weight=10
-blk.20.ffn_up.weight=10
-blk.21.ffn_up.weight=10
-blk.22.ffn_up.weight=10
-blk.23.ffn_up.weight=10
-blk.24.ffn_up.weight=10
-blk.25.ffn_up.weight=10
-
-blk.16.ffn_down.weight=10
-blk.17.ffn_down.weight=10
-blk.18.ffn_down.weight=10
-blk.19.ffn_down.weight=10
-blk.20.ffn_down.weight=10
-blk.21.ffn_down.weight=10
-blk.22.ffn_down.weight=10
-blk.23.ffn_down.weight=10
-blk.24.ffn_down.weight=10
-blk.25.ffn_down.weight=10
-
-blk.26.ffn_down.weight=10
-blk.26.ffn_up.weight=10
-
-blk.27.ffn_down.weight=11
-blk.27.ffn_up.weight=11
-
-blk.28.ffn_down.weight=11
-blk.28.ffn_up.weight=11
-
-blk.29.ffn_down.weight=11
-blk.29.ffn_up.weight=11
-
-token_embd.weight=21
-output.weight=21
+blk.10.ffn_up.weight=7
+blk.1?.ffn_up.weight=10
+blk.2?.ffn_up.weight=10
+blk.1?.attn*=23
+blk.2?.attn*=23
+*down*=14
+*gate*=12
 
+# allowed values:
 # LLAMA_FTYPE_ALL_F32 = 0,
 # LLAMA_FTYPE_MOSTLY_F16 = 1, // except 1d tensors
 # LLAMA_FTYPE_MOSTLY_Q4_0 = 2, // except 1d tensors
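
Note (not part of the patch): the standalone sketch below illustrates how the wildcard override matching behaves. The match_string() body mirrors the helper added in the llama.cpp hunk above; the surrounding main(), the pattern list (copied from the new quant.cfg), and the sample tensor names are illustrative assumptions only.

#include <cstdint>
#include <cstdio>
#include <string>
#include <vector>

// same matching rules as the patch: '*' matches any sequence of characters, '?' matches exactly one
static bool match_string(const std::string & str, const std::string & pattern, uint32_t string_index = 0, uint32_t pattern_index = 0) {
    if (string_index == str.size() && pattern_index == pattern.size()) {
        return true;
    }
    if (pattern_index < pattern.size() && pattern[pattern_index] == '*') {
        return match_string(str, pattern, string_index, pattern_index + 1) ||
               (string_index < str.size() && match_string(str, pattern, string_index + 1, pattern_index));
    }
    if (string_index < str.size() && pattern_index < pattern.size() &&
        (str[string_index] == pattern[pattern_index] || pattern[pattern_index] == '?')) {
        return match_string(str, pattern, string_index + 1, pattern_index + 1);
    }
    return false;
}

int main() {
    // override patterns in the order they appear in quant.cfg: the first match wins
    const std::vector<std::string> patterns = {
        "blk.10.ffn_up.weight", "blk.1?.ffn_up.weight", "blk.2?.ffn_up.weight",
        "blk.1?.attn*", "blk.2?.attn*", "*down*", "*gate*",
    };
    // sample tensor names (hypothetical, for illustration only)
    const std::vector<std::string> tensors = {
        "blk.10.ffn_up.weight",  // exact name, hits the first pattern
        "blk.14.ffn_up.weight",  // '?' covers the second digit
        "blk.23.attn_k.weight",  // '*' covers the rest of the name
        "blk.5.ffn_down.weight", // caught by *down*
        "output.weight",         // no pattern matches -> default ftype
    };
    for (const auto & name : tensors) {
        const char * hit = "default ftype";
        for (const auto & pattern : patterns) {
            if (match_string(name, pattern)) { hit = pattern.c_str(); break; }
        }
        printf("%-24s -> %s\n", name.c_str(), hit);
    }
    return 0;
}

Because the matcher backtracks on '*', its worst case grows quickly with the number of wildcards in a pattern, but quant.cfg patterns and tensor names are short enough that this does not matter in practice. Since the first matching filter wins, specific patterns should be listed before broad ones such as *down*.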