From 5bbf40aa47b767013c692b315ab06da6d5d88a86 Mon Sep 17 00:00:00 2001
From: Andrei Betlen <abetlen@gmail.com>
Date: Fri, 21 Apr 2023 17:40:27 -0400
Subject: [PATCH] Update llama.cpp

---
 examples/llama_cpp.py | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/examples/llama_cpp.py b/examples/llama_cpp.py
index 5e8a5c316..0005e4290 100644
--- a/examples/llama_cpp.py
+++ b/examples/llama_cpp.py
@@ -117,6 +117,8 @@ LLAMA_FTYPE_MOSTLY_Q4_1 = ctypes.c_int(3)  # except 1d tensors
 LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16 = ctypes.c_int(
     4
 )  # tok_embeddings.weight and output.weight are F16
+LLAMA_FTYPE_MOSTLY_Q4_2 = ctypes.c_int(5)  # except 1d tensors
+LLAMA_FTYPE_MOSTLY_Q4_3 = ctypes.c_int(6)  # except 1d tensors
 
 # Functions
 
@@ -169,13 +171,14 @@ _lib.llama_free.restype = None
 
 # TODO: not great API - very likely to change
 # Returns 0 on success
+# nthread - number of threads to use; if <= 0, std::thread::hardware_concurrency() is used, otherwise the given value
 def llama_model_quantize(
-    fname_inp: bytes, fname_out: bytes, itype: c_int
+    fname_inp: bytes, fname_out: bytes, ftype: c_int, nthread: c_int
 ) -> c_int:
-    return _lib.llama_model_quantize(fname_inp, fname_out, itype)
+    return _lib.llama_model_quantize(fname_inp, fname_out, ftype, nthread)
 
 
-_lib.llama_model_quantize.argtypes = [c_char_p, c_char_p, c_int]
+_lib.llama_model_quantize.argtypes = [c_char_p, c_char_p, c_int, c_int]
 _lib.llama_model_quantize.restype = c_int