From e39e2b29d990483a018f377d869ea72f50e19330 Mon Sep 17 00:00:00 2001
From: zhycheng614
Date: Sat, 1 Feb 2025 01:00:59 +0000
Subject: [PATCH] updated readme

---
 .gitignore |  1 +
 README.md  | 28 +++++++++++++++++++++++++++-
 2 files changed, 28 insertions(+), 1 deletion(-)

diff --git a/.gitignore b/.gitignore
index 26d8c1e7c..706e336a9 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,4 +1,5 @@
 build/**
+build_*/**
 .build/**
 models/**
 
diff --git a/README.md b/README.md
index 68406f91f..6665bf5ba 100644
--- a/README.md
+++ b/README.md
@@ -1,3 +1,29 @@
 # llama.cpp
 
-This repo is cloned from llama.cpp [commit 74d73dc85cc2057446bf63cc37ff649ae7cebd80](https://github.com/ggerganov/llama.cpp/tree/74d73dc85cc2057446bf63cc37ff649ae7cebd80). It is compatible with llama-cpp-python [commit 7ecdd944624cbd49e4af0a5ce1aa402607d58dcc](https://github.com/abetlen/llama-cpp-python/commit/7ecdd944624cbd49e4af0a5ce1aa402607d58dcc)
\ No newline at end of file
+This repo is cloned from llama.cpp [commit 74d73dc85cc2057446bf63cc37ff649ae7cebd80](https://github.com/ggerganov/llama.cpp/tree/74d73dc85cc2057446bf63cc37ff649ae7cebd80). It is compatible with llama-cpp-python [commit 7ecdd944624cbd49e4af0a5ce1aa402607d58dcc](https://github.com/abetlen/llama-cpp-python/commit/7ecdd944624cbd49e4af0a5ce1aa402607d58dcc).
+
+## Customize quantization group size at compilation (CPU inference only)
+
+The only difference from a standard build is passing the `-DQK4_0` flag to cmake:
+
+```bash
+cmake -B build_cpu_g128 -DQK4_0=128
+cmake --build build_cpu_g128
+```
+
+To quantize a model with the customized group size, run:
+
+```bash
+./build_cpu_g128/bin/llama-quantize <input_model> <output_model> <type>
+```
+
+To run the quantized model:
+
+```bash
+./build_cpu_g128/bin/llama-cli -m <quantized_model>
+```
+
+### Note
+
+Make sure the model you run was quantized with the same group size that the binary was compiled with.
+Otherwise, you will get a runtime error when the model is loaded.
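For reference, an end-to-end usage sketch of the patched workflow is below (not part of the patch). The model paths and the `Q4_0` type argument are illustrative placeholders:

```bash
# Build CPU binaries with a Q4_0 group size of 128.
cmake -B build_cpu_g128 -DQK4_0=128
cmake --build build_cpu_g128

# Quantize an f16 GGUF model to Q4_0 with the group-size-128 build
# (paths are hypothetical).
./build_cpu_g128/bin/llama-quantize \
    models/model-f16.gguf models/model-q4_0-g128.gguf Q4_0

# Run the result with the SAME build; a binary compiled with a
# different QK4_0 cannot load this file.
./build_cpu_g128/bin/llama-cli -m models/model-q4_0-g128.gguf -p "Hello"
```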
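As background on why the group sizes must match: in ggml, `QK4_0` sets how many weights share one scale factor, and the q4_0 block layout is baked into the binary at compile time. A simplified sketch of the block struct, adapted from upstream ggml's `ggml-common.h` (treat the details as illustrative rather than a verbatim copy):

```c
#include <stdint.h>

// Number of weights per q4_0 block (one shared scale per block).
// Upstream ggml hard-codes 32; this fork lets cmake override it,
// e.g. -DQK4_0=128.
#ifndef QK4_0
#define QK4_0 32
#endif

typedef uint16_t ggml_half;  // fp16 value, stored as raw bits

// One q4_0 block: an fp16 scale followed by QK4_0 4-bit weights
// packed two per byte. sizeof(block_q4_0) depends on QK4_0, so a
// file quantized with one group size cannot be parsed by a binary
// compiled with another.
typedef struct {
    ggml_half d;              // scale (delta) for the block
    uint8_t   qs[QK4_0 / 2];  // packed 4-bit quantized values
} block_q4_0;
```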