diff --git a/Makefile b/Makefile
index e29b7704f..07549b0e7 100644
--- a/Makefile
+++ b/Makefile
@@ -71,7 +71,7 @@ endif
 # feel free to update the Makefile for your architecture and send a pull request or issue
 ifeq ($(UNAME_M),$(filter $(UNAME_M),x86_64 i686))
 	# Use all CPU extensions that are available:
-	CFLAGS += -march=native -mtune=native
+	CFLAGS +=
 endif
 ifneq ($(filter ppc64%,$(UNAME_M)),)
 	POWER9_M := $(shell grep "POWER9" /proc/cpuinfo)
diff --git a/koboldcpp.dll b/koboldcpp.dll
index b16f8a476..e8ca6e513 100644
Binary files a/koboldcpp.dll and b/koboldcpp.dll differ
diff --git a/koboldcpp_blas.dll b/koboldcpp_blas.dll
index 0393adf10..15c20c12d 100644
Binary files a/koboldcpp_blas.dll and b/koboldcpp_blas.dll differ
diff --git a/otherarch/ggml_v1.c b/otherarch/ggml_v1.c
index 39b60d04a..ee1de6241 100644
--- a/otherarch/ggml_v1.c
+++ b/otherarch/ggml_v1.c
@@ -87,7 +87,7 @@ typedef void* thread_ret_t;
 #define GGML_V1_SOFT_MAX_UNROLL 4
 #define GGML_V1_VEC_DOT_UNROLL 2

-#ifdef GGML_V1_USE_ACCELERATE
+#ifdef GGML_USE_ACCELERATE
 // uncomment to use vDSP for soft max computation
 // note: not sure if it is actually faster
 //#define GGML_V1_SOFT_MAX_ACCELERATE
@@ -110,9 +110,9 @@ typedef void* thread_ret_t;
         } \
     } while (0)

-#ifdef GGML_V1_USE_ACCELERATE
+#ifdef GGML_USE_ACCELERATE
 #include <Accelerate/Accelerate.h>
-#elif GGML_V1_USE_OPENBLAS
+#elif GGML_USE_OPENBLAS
 #include <cblas.h>
 #endif
@@ -1742,7 +1742,7 @@ inline static void ggml_v1_vec_gelu_f32(const int n, float * y, const float * x)
 #endif

 inline static void ggml_v1_vec_sum_f32(const int n, float * s, const float * x) {
-#ifndef GGML_V1_USE_ACCELERATE
+#ifndef GGML_USE_ACCELERATE
     ggml_v1_float sum = 0.0;
     for (int i = 0; i < n; ++i) {
         sum += x[i];
@@ -1754,7 +1754,7 @@ inline static void ggml_v1_vec_sum_f32(const int n, float * s, const float * x)
 }

 inline static void ggml_v1_vec_max_f32(const int n, float * s, const float * x) {
-#ifndef GGML_V1_USE_ACCELERATE
+#ifndef GGML_USE_ACCELERATE
     ggml_v1_float max = -INFINITY;
     for (int i = 0; i < n; ++i) {
         max = MAX(max, x[i]);
@@ -5077,7 +5077,7 @@ static void ggml_v1_compute_forward_norm(

 // ggml_v1_compute_forward_mul_mat

-#if defined(GGML_V1_USE_ACCELERATE) || defined(GGML_V1_USE_OPENBLAS)
+#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
 // helper function to determine if it is better to use BLAS or not
 // for large matrices, BLAS is faster
 static bool ggml_v1_compute_forward_mul_mat_use_blas(
@@ -5169,7 +5169,7 @@ static void ggml_v1_compute_forward_mul_mat_f32(
     // nb00 < nb01 - src0 is transposed
     //   compute by src0 columns

-#if defined(GGML_V1_USE_ACCELERATE) || defined(GGML_V1_USE_OPENBLAS)
+#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
     if (ggml_v1_compute_forward_mul_mat_use_blas(src0, src1, dst)) {
         GGML_V1_ASSERT(nb10 == sizeof(float));
@@ -5414,7 +5414,7 @@ static void ggml_v1_compute_forward_mul_mat_f16_f32(
     // nb00 < nb01 - src0 is transposed
     //   compute by src0 columns

-#if defined(GGML_V1_USE_ACCELERATE) || defined(GGML_V1_USE_OPENBLAS)
+#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
     if (ggml_v1_compute_forward_mul_mat_use_blas(src0, src1, dst)) {
         GGML_V1_ASSERT(nb10 == sizeof(float));
@@ -5720,7 +5720,7 @@ static void ggml_v1_compute_forward_mul_mat_q4_0_f32(
     // nb00 < nb01 - src0 is transposed
     //   compute by src0 columns

-#if defined(GGML_V1_USE_ACCELERATE) || defined(GGML_V1_USE_OPENBLAS)
+#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
     if (ggml_v1_compute_forward_mul_mat_use_blas(src0, src1, dst)) {
         GGML_V1_ASSERT(nb10 == sizeof(float));
@@ -6020,7 +6020,7 @@ static void ggml_v1_compute_forward_mul_mat_q4_1_f32(
     // nb00 < nb01 - src0 is transposed
    //   compute by src0 columns

-#if defined(GGML_V1_USE_ACCELERATE) || defined(GGML_V1_USE_OPENBLAS)
+#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
     if (ggml_v1_compute_forward_mul_mat_use_blas(src0, src1, dst)) {
         GGML_V1_ASSERT(nb10 == sizeof(float));
@@ -8870,7 +8870,7 @@ void ggml_v1_graph_compute(struct ggml_v1_context * ctx, struct ggml_v1_cgraph *
                     } else {
                         if (node->src0->type == GGML_V1_TYPE_F16 && node->src1->type == GGML_V1_TYPE_F32) {
-#if defined(GGML_V1_USE_ACCELERATE) || defined(GGML_V1_USE_OPENBLAS)
+#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
                             if (ggml_v1_compute_forward_mul_mat_use_blas(node->src0, node->src1, node)) {
                                 node->n_tasks = 1; // TODO: this actually is doing nothing
                                                    //       the threads are still spinning
@@ -8889,7 +8889,7 @@ void ggml_v1_graph_compute(struct ggml_v1_context * ctx, struct ggml_v1_cgraph *
                             cur = 0;
                         } else if (node->src0->type == GGML_V1_TYPE_Q4_0 && node->src1->type == GGML_V1_TYPE_F32) {
-#if defined(GGML_V1_USE_ACCELERATE) || defined(GGML_V1_USE_OPENBLAS)
+#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
                             if (ggml_v1_compute_forward_mul_mat_use_blas(node->src0, node->src1, node)) {
                                 node->n_tasks = 1;
                                 cur = GGML_V1_TYPE_SIZE[GGML_V1_TYPE_F32]*(node->src0->ne[0]*node->src0->ne[1]);
@@ -8901,7 +8901,7 @@ void ggml_v1_graph_compute(struct ggml_v1_context * ctx, struct ggml_v1_cgraph *
 #endif
                         } else if (node->src0->type == GGML_V1_TYPE_Q4_1 && node->src1->type == GGML_V1_TYPE_F32) {
-#if defined(GGML_V1_USE_ACCELERATE) || defined(GGML_V1_USE_OPENBLAS)
+#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
                             if (ggml_v1_compute_forward_mul_mat_use_blas(node->src0, node->src1, node)) {
                                 node->n_tasks = 1;
                                 cur = GGML_V1_TYPE_SIZE[GGML_V1_TYPE_F32]*(node->src0->ne[0]*node->src0->ne[1]);
@@ -10150,7 +10150,7 @@ int ggml_v1_cpu_has_wasm_simd(void) {
 }

 int ggml_v1_cpu_has_blas(void) {
-#if defined(GGML_V1_USE_ACCELERATE) || defined(GGML_V1_USE_OPENBLAS)
+#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
     return 1;
 #else
     return 0;
diff --git a/quantize.exe b/quantize.exe
index ccb9f410b..72ae9a009 100644
Binary files a/quantize.exe and b/quantize.exe differ