diff --git a/Makefile b/Makefile
index 3e58a28a7..17624656b 100644
--- a/Makefile
+++ b/Makefile
@@ -145,32 +145,35 @@ ggml.o: ggml.c ggml.h
 llama.o: llama.cpp llama.h llama_util.h llama_internal.h
 	$(CXX) $(CXXFLAGS) -c llama.cpp -o llama.o
 
+ggml_extra.o: ggml_extra.cpp ggml_extra.h
+	$(CXX) $(CXXFLAGS) -c $< -o $@
+
 common.o: examples/common.cpp examples/common.h
 	$(CXX) $(CXXFLAGS) -c examples/common.cpp -o common.o
 
 clean:
 	rm -vf *.o main quantize quantize-stats perplexity embedding
 
-main: examples/main/main.cpp ggml.o llama.o common.o
-	$(CXX) $(CXXFLAGS) examples/main/main.cpp ggml.o llama.o common.o -o main $(LDFLAGS)
+main: examples/main/main.cpp ggml.o llama.o common.o ggml_extra.o
+	$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
 	@echo
 	@echo '==== Run ./main -h for help. ===='
 	@echo
 
-quantize: examples/quantize/quantize.cpp ggml.o llama.o
-	$(CXX) $(CXXFLAGS) examples/quantize/quantize.cpp ggml.o llama.o -o quantize $(LDFLAGS)
+quantize: examples/quantize/quantize.cpp ggml.o llama.o ggml_extra.o
+	$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
 
-quantize-stats: examples/quantize-stats/quantize-stats.cpp ggml.o llama.o
-	$(CXX) $(CXXFLAGS) examples/quantize-stats/quantize-stats.cpp ggml.o llama.o -o quantize-stats $(LDFLAGS)
+quantize-stats: examples/quantize-stats/quantize-stats.cpp ggml.o llama.o ggml_extra.o
+	$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
 
-perplexity: examples/perplexity/perplexity.cpp ggml.o llama.o common.o
-	$(CXX) $(CXXFLAGS) examples/perplexity/perplexity.cpp ggml.o llama.o common.o -o perplexity $(LDFLAGS)
+perplexity: examples/perplexity/perplexity.cpp ggml.o llama.o common.o ggml_extra.o
+	$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
 
-embedding: examples/embedding/embedding.cpp ggml.o llama.o common.o
-	$(CXX) $(CXXFLAGS) examples/embedding/embedding.cpp ggml.o llama.o common.o -o embedding $(LDFLAGS)
+embedding: examples/embedding/embedding.cpp ggml.o llama.o common.o ggml_extra.o
+	$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
 
-libllama.so: llama.o ggml.o
-	$(CXX) $(CXXFLAGS) -shared -fPIC -o libllama.so llama.o ggml.o $(LDFLAGS)
+libllama.so: llama.o ggml.o ggml_extra.o
+	$(CXX) $(CXXFLAGS) -shared -fPIC -o $@ $^ $(LDFLAGS)
 
 #
 # Tests
 #
diff --git a/examples/quantize/quantize.cpp b/examples/quantize/quantize.cpp
index 680757c6b..313b7534f 100644
--- a/examples/quantize/quantize.cpp
+++ b/examples/quantize/quantize.cpp
@@ -14,6 +14,8 @@ int main(int argc, char ** argv) {
         fprintf(stderr, "usage: %s model-f32.bin model-quant.bin type\n", argv[0]);
         fprintf(stderr, "  type = 2 - q4_0\n");
         fprintf(stderr, "  type = 3 - q4_1\n");
+        fprintf(stderr, "  type = 4 - new q4_0\n");
+        fprintf(stderr, "  type = 5 - new q4_1\n");
         return 1;
     }
 
diff --git a/ggml_extra.cpp b/ggml_extra.cpp
index cabbefae7..e2ae005df 100644
--- a/ggml_extra.cpp
+++ b/ggml_extra.cpp
@@ -10,6 +10,11 @@
 
 namespace {
 
+constexpr int kChunkSize = 32*32*8;
+constexpr int QK = 32;
+constexpr int kBucketSize0 = QK/2 + sizeof(float);
+constexpr int kBucketSize1 = QK/2 + 2*sizeof(float);
+
 inline int toNearestInt(float fval) {
     assert(fval <= 4194303.f);
     constexpr float kSnapper=3<<22;
@@ -126,24 +131,19 @@ std::pair<float, float> kQuantize1(int n, const float* X, int8_t* L, std::vector
     return {a, b};
 }
 
-void kQuantizeQ4(const float* GGML_RESTRICT x, void* GGML_RESTRICT buffer, int k, int type) {
-    constexpr int kChunkSize = 32*32*8;
-    constexpr int QK = 32;
-    constexpr int kBucketSize0 = QK/2 + sizeof(float);
-    constexpr int kBucketSize1 = QK/2 + 2*sizeof(float);
+void kQuantizeQ4(const float* X, void* buffer, int k, int type) {
     assert(k % QK == 0);
     auto processOne = [type] (const float* X, int8_t* L, char* y, std::vector<std::pair<float,int>>& work, std::vector<float>& tmpX) {
+        auto q = (uint8_t*)y;
         if (type == 0) {
             float scale = kQuantize0(QK, X, L, work, -7, 7);
-            std::memcpy(y, &scale, sizeof(scale)); y += sizeof(scale);
-            uint8_t* q = (uint8_t*)y;
+            std::memcpy(q, &scale, sizeof(scale)); q += sizeof(scale);
             for (int k=0; k
> work;
     std::vector<float> tmpX;
     int nb = k / QK;
+    auto x = X;
     for (int i=0; i
 counter(0);
-    auto compute = [&counter, x, y, k, bucketSize, &processOne] () {
+    auto compute = [&counter, X, y, k, bucketSize, &processOne] () {
         std::vector<int8_t> L(QK);
         std::vector<std::pair<float,int>> work;
         std::vector<float> tmpX;
         while (true) {
-            int first = counter.fetch_add(kChunkSize);
+            int first = counter.fetch_add(kChunkSize, std::memory_order_relaxed);
             if (first >= k) break;
             int last = first + kChunkSize;
             if (last > k) last = k;
-            auto xi = x + first;
+            auto xi = X + first;
             auto yi = y + (first/QK)*bucketSize;
             int n = (last - first)/QK;
             for (int i=0; i
> 4];
+        }
+        y += 16;
+    }
+}
+
 }
 
 extern "C" {
@@ -201,4 +217,16 @@ void kQuantizeQ4_1(const float* x, void* buffer, int k) {
     kQuantizeQ4(x, buffer, k, 1);
 }
 
+size_t kQuantizeQ4_0H(const float* x, void* buffer, int k, int64_t* hist) {
+    kQuantizeQ4(x, buffer, k, 0);
+    collectHisto(k, buffer, hist, 0);
+    return (k / QK) * kBucketSize0;
+}
+
+size_t kQuantizeQ4_1H(const float* x, void* buffer, int k, int64_t* hist) {
+    kQuantizeQ4(x, buffer, k, 1);
+    collectHisto(k, buffer, hist, 1);
+    return (k / QK) * kBucketSize1;
+}
+
 }
diff --git a/ggml_extra.h b/ggml_extra.h
index 99041bed0..788fcd0ea 100644
--- a/ggml_extra.h
+++ b/ggml_extra.h
@@ -1,7 +1,12 @@
 #pragma once
 
 #ifdef __cplusplus
+#include <cstdint>
+#include <cstddef>
 extern "C" {
+#else
+#include <stdint.h>
+#include <stddef.h>
 #endif
 
 #ifdef __cplusplus
@@ -12,8 +17,10 @@ extern "C" {
 #endif
 
 void kQuantizeQ4_0(const float* GGML_RESTRICT x, void* GGML_RESTRICT y, int k);
+size_t kQuantizeQ4_0H(const float* GGML_RESTRICT x, void* GGML_RESTRICT y, int k, int64_t* hist);
 
 void kQuantizeQ4_1(const float* GGML_RESTRICT x, void* GGML_RESTRICT y, int k);
+size_t kQuantizeQ4_1H(const float* GGML_RESTRICT x, void* GGML_RESTRICT y, int k, int64_t* hist);
 
 #ifdef __cplusplus
 }
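The new kQuantizeQ4_0H/kQuantizeQ4_1H entry points behave like the existing kQuantizeQ4_0/kQuantizeQ4_1, but they additionally fill a histogram of the emitted 4-bit codes (16 bins, matching the hist_cur buffer llama.cpp passes in) and return the number of bytes written, (k/QK)*kBucketSize0 for q4_0 and (k/QK)*kBucketSize1 for q4_1. As a reference for the bookkeeping collectHisto performs, here is a minimal sketch that counts low and high nibbles per block under the layout used above: one float scale for q4_0, or scale plus min for q4_1, followed by QK/2 = 16 code bytes. The helper name count_nibbles and the loop variable names are illustrative, not taken from the patch.

// Illustrative sketch (not part of the patch) of per-block nibble counting,
// assuming the q4_0/q4_1 block layout from ggml_extra.cpp:
//   q4_0: 1 float scale              + 16 bytes of codes (kBucketSize0 = 20)
//   q4_1: 1 float scale + 1 float min + 16 bytes of codes (kBucketSize1 = 24)
#include <cstdint>

static void count_nibbles(int k, const void* buffer, int64_t* hist, int type) {
    constexpr int QK = 32;
    const int header = (type == 0 ? 1 : 2) * (int) sizeof(float);
    auto y = (const uint8_t*) buffer;
    for (int i = 0; i < k / QK; ++i) {
        y += header;                        // skip the float header of this block
        for (int l = 0; l < QK / 2; ++l) {  // 16 bytes hold 32 4-bit codes
            ++hist[y[l] & 15];              // low nibble
            ++hist[y[l] >> 4];              // high nibble
        }
        y += QK / 2;                        // advance past the code bytes
    }
}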
diff --git a/llama.cpp b/llama.cpp
index 54ba01eef..04ba10672 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -8,6 +8,7 @@
 #include "llama_internal.h"
 
 #include "ggml.h"
+#include "ggml_extra.h"
 
 #include 
 #include 
@@ -1546,9 +1547,12 @@ static llama_vocab::id llama_sample_top_p_top_k(
 
 static void llama_model_quantize_internal(const std::string & fname_inp, const std::string & fname_out, int itype) {
     ggml_type quantized_type;
+    bool useNewQuantization = false;
     switch (itype) {
         case 2: quantized_type = GGML_TYPE_Q4_0; break;
         case 3: quantized_type = GGML_TYPE_Q4_1; break;
+        case 4: quantized_type = GGML_TYPE_Q4_0; useNewQuantization = true; break;
+        case 5: quantized_type = GGML_TYPE_Q4_1; useNewQuantization = true; break;
         default: throw format("invalid quantization type %d\n", itype);
     };
@@ -1616,11 +1620,15 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
             switch (new_type) {
                 case GGML_TYPE_Q4_0:
                     {
-                        new_size = ggml_quantize_q4_0(f32_data, new_data, nelements, (int) tensor.ne.at(0), hist_cur.data());
+                        new_size = useNewQuantization ?
+                            kQuantizeQ4_0H(f32_data, new_data, nelements, hist_cur.data()) :
+                            ggml_quantize_q4_0(f32_data, new_data, nelements, (int) tensor.ne.at(0), hist_cur.data());
                     } break;
                 case GGML_TYPE_Q4_1:
                     {
-                        new_size = ggml_quantize_q4_1(f32_data, new_data, nelements, (int) tensor.ne.at(0), hist_cur.data());
+                        new_size = useNewQuantization ?
+                            kQuantizeQ4_1H(f32_data, new_data, nelements, hist_cur.data()) :
+                            ggml_quantize_q4_1(f32_data, new_data, nelements, (int) tensor.ne.at(0), hist_cur.data());
                     } break;
                 default:
                     LLAMA_ASSERT(false);
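With the dispatch above, running the quantize tool with type 4 or 5 routes llama_model_quantize_internal through the new entry points instead of ggml_quantize_q4_0/ggml_quantize_q4_1. The standalone sketch below calls kQuantizeQ4_0H directly; it assumes ggml_extra.h is on the include path and ggml_extra.o is linked in (as the Makefile changes arrange for the bundled tools), and the input data and sizes are made up for illustration.

// Standalone usage sketch (not from the patch): quantize k floats with the new
// q4_0 scheme and print the histogram of 4-bit codes, mirroring what
// llama_model_quantize_internal now does for itype == 4.
#include <cmath>
#include <cstdint>
#include <cstdio>
#include <vector>

#include "ggml_extra.h"

int main() {
    const int k = 32 * 1024;                   // must be a multiple of QK = 32
    std::vector<float> src(k);
    for (int i = 0; i < k; ++i) src[i] = std::sin(0.01f * i);   // dummy data

    std::vector<uint8_t> dst((k / 32) * 20);   // kBucketSize0 = 20 bytes per block
    int64_t hist[16] = {0};                    // one bin per 4-bit code

    size_t written = kQuantizeQ4_0H(src.data(), dst.data(), k, hist);
    printf("wrote %zu bytes\n", written);
    for (int i = 0; i < 16; ++i) {
        printf("code %2d: %lld\n", i, (long long) hist[i]);
    }
    return 0;
}

The destination buffer is sized from kBucketSize0 (20 bytes for every 32 input floats); for the q4_1 path it would be kBucketSize1, i.e. 24 bytes per block.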