From e2a937ca6abadc7e01e139db31e6db9dce16e3e9 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Wed, 3 May 2023 18:43:23 +0300
Subject: [PATCH 1/4] minor : fix trailing whitespaces

---
 scripts/verify-checksum-models.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/scripts/verify-checksum-models.py b/scripts/verify-checksum-models.py
index 811372e47..1f1b3d24f 100644
--- a/scripts/verify-checksum-models.py
+++ b/scripts/verify-checksum-models.py
@@ -6,13 +6,13 @@ def sha256sum(file):
     b  = bytearray(block_size)
     file_hash = hashlib.sha256()
     mv = memoryview(b)
-    with open(file, 'rb', buffering=0) as f: 
+    with open(file, 'rb', buffering=0) as f:
         while True:
             n = f.readinto(mv)
             if not n:
                 break
             file_hash.update(mv[:n])
-        
+
     return file_hash.hexdigest()
 
 # Define the path to the llama directory (parent folder of script directory)

From bca9ad938a2a43621cf406d993b755cc91728dd5 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Wed, 3 May 2023 20:09:42 +0300
Subject: [PATCH 2/4] minor : fix whitespaces (#1302)

---
 README.md                         |   2 +-
 scripts/verify-checksum-models.py | 155 +++++++++++++++---------------
 2 files changed, 78 insertions(+), 79 deletions(-)

diff --git a/README.md b/README.md
index de0a3deef..0002f8cc1 100644
--- a/README.md
+++ b/README.md
@@ -388,7 +388,7 @@ python3 .\scripts\verify-checksum-models.py
 ```
 
 - On linux or macOS it is also possible to run the following commands to verify if you have all possible latest files in your self-installed `./models` subdirectory:
-    - On Linux: `sha256sum --ignore-missing -c SHA256SUMS` 
+    - On Linux: `sha256sum --ignore-missing -c SHA256SUMS`
     - on macOS: `shasum -a 256 --ignore-missing -c SHA256SUMS`
 
 ### Seminal papers and background on the models
diff --git a/scripts/verify-checksum-models.py b/scripts/verify-checksum-models.py
index 1f1b3d24f..2ce572826 100644
--- a/scripts/verify-checksum-models.py
+++ b/scripts/verify-checksum-models.py
@@ -1,78 +1,77 @@
-import os
-import hashlib
-
-def sha256sum(file):
-    block_size = 16 * 1024 * 1024  # 16 MB block size
-    b  = bytearray(block_size)
-    file_hash = hashlib.sha256()
-    mv = memoryview(b)
-    with open(file, 'rb', buffering=0) as f:
-        while True:
-            n = f.readinto(mv)
-            if not n:
-                break
-            file_hash.update(mv[:n])
-
-    return file_hash.hexdigest()
-
-# Define the path to the llama directory (parent folder of script directory)
-llama_path = os.path.abspath(os.path.join(os.path.dirname(__file__), os.pardir))
-
-# Define the file with the list of hashes and filenames
-hash_list_file = os.path.join(llama_path, "SHA256SUMS")
-
-# Check if the hash list file exists
-if not os.path.exists(hash_list_file):
-    print(f"Hash list file not found: {hash_list_file}")
-    exit(1)
-
-# Read the hash file content and split it into an array of lines
-with open(hash_list_file, "r") as f:
-    hash_list = f.read().splitlines()
-
-# Create an array to store the results
-results = []
-
-# Loop over each line in the hash list
-for line in hash_list:
-    # Split the line into hash and filename
-    hash_value, filename = line.split("  ")
-
-    # Get the full path of the file by joining the llama path and the filename
-    file_path = os.path.join(llama_path, filename)
-
-    # Informing user of the progress of the integrity check
-    print(f"Verifying the checksum of {file_path}")
-
-    # Check if the file exists
-    if os.path.exists(file_path):
-        # Calculate the SHA256 checksum of the file using hashlib
-        file_hash = sha256sum(file_path)
-
-        # Compare the file hash with the expected hash
-        if file_hash == hash_value:
-            valid_checksum = "V"
-            file_missing = ""
-        else:
-            valid_checksum = ""
-            file_missing = ""
-    else:
-        valid_checksum = ""
-        file_missing = "X"
-
-    # Add the results to the array
-    results.append({
-        "filename": filename,
-        "valid checksum": valid_checksum,
-        "file missing": file_missing
-    })
-
-
-# Print column headers for results table
-print("\n" + "filename".ljust(40) + "valid checksum".center(20) + "file missing".center(20))
-print("-" * 80)
-
-# Output the results as a table
-for r in results:
-    print(f"{r['filename']:40} {r['valid checksum']:^20} {r['file missing']:^20}")
-
+import os
+import hashlib
+
+def sha256sum(file):
+    block_size = 16 * 1024 * 1024  # 16 MB block size
+    b  = bytearray(block_size)
+    file_hash = hashlib.sha256()
+    mv = memoryview(b)
+    with open(file, 'rb', buffering=0) as f:
+        while True:
+            n = f.readinto(mv)
+            if not n:
+                break
+            file_hash.update(mv[:n])
+
+    return file_hash.hexdigest()
+
+# Define the path to the llama directory (parent folder of script directory)
+llama_path = os.path.abspath(os.path.join(os.path.dirname(__file__), os.pardir))
+
+# Define the file with the list of hashes and filenames
+hash_list_file = os.path.join(llama_path, "SHA256SUMS")
+
+# Check if the hash list file exists
+if not os.path.exists(hash_list_file):
+    print(f"Hash list file not found: {hash_list_file}")
+    exit(1)
+
+# Read the hash file content and split it into an array of lines
+with open(hash_list_file, "r") as f:
+    hash_list = f.read().splitlines()
+
+# Create an array to store the results
+results = []
+
+# Loop over each line in the hash list
+for line in hash_list:
+    # Split the line into hash and filename
+    hash_value, filename = line.split("  ")
+
+    # Get the full path of the file by joining the llama path and the filename
+    file_path = os.path.join(llama_path, filename)
+
+    # Informing user of the progress of the integrity check
+    print(f"Verifying the checksum of {file_path}")
+
+    # Check if the file exists
+    if os.path.exists(file_path):
+        # Calculate the SHA256 checksum of the file using hashlib
+        file_hash = sha256sum(file_path)
+
+        # Compare the file hash with the expected hash
+        if file_hash == hash_value:
+            valid_checksum = "V"
+            file_missing = ""
+        else:
+            valid_checksum = ""
+            file_missing = ""
+    else:
+        valid_checksum = ""
+        file_missing = "X"
+
+    # Add the results to the array
+    results.append({
+        "filename": filename,
+        "valid checksum": valid_checksum,
+        "file missing": file_missing
+    })
+
+
+# Print column headers for results table
+print("\n" + "filename".ljust(40) + "valid checksum".center(20) + "file missing".center(20))
+print("-" * 80)
+
+# Output the results as a table
+for r in results:
+    print(f"{r['filename']:40} {r['valid checksum']:^20} {r['file missing']:^20}")

From 6daa09d87926fe654385c2887e39ec3eeaa58120 Mon Sep 17 00:00:00 2001
From: khimaros <me@khimaros.com>
Date: Wed, 3 May 2023 10:58:11 -0700
Subject: [PATCH 3/4] examples : read chat prompts from a template file (#1196)

---
 examples/chat-13B.sh            | 48 +++++++++++++--------------------
 prompts/chat-with-vicuna-v0.txt |  7 +++++
 prompts/chat-with-vicuna-v1.txt |  7 +++++
 prompts/chat.txt                | 28 +++++++++++++++++++
 4 files changed, 60 insertions(+), 30 deletions(-)
 create mode 100644 prompts/chat-with-vicuna-v0.txt
 create mode 100644 prompts/chat-with-vicuna-v1.txt
 create mode 100644 prompts/chat.txt

diff --git a/examples/chat-13B.sh b/examples/chat-13B.sh
index d7148d184..35c089d57 100755
--- a/examples/chat-13B.sh
+++ b/examples/chat-13B.sh
@@ -1,9 +1,12 @@
 #!/bin/bash
 
+set -e
+
 cd "$(dirname "$0")/.." || exit
 
 MODEL="${MODEL:-./models/13B/ggml-model-q4_0.bin}"
-USER_NAME="${USER_NAME:-User}"
+PROMPT_TEMPLATE=${PROMPT_TEMPLATE:-./prompts/chat.txt}
+USER_NAME="${USER_NAME:-USER}"
 AI_NAME="${AI_NAME:-ChatLLaMa}"
 
 # Adjust to the number of CPU cores you want to use.
@@ -15,39 +18,24 @@ N_PREDICTS="${N_PREDICTS:-2048}"
 # For example, override the context size by doing: ./chatLLaMa --ctx_size 1024
 GEN_OPTIONS="${GEN_OPTIONS:---ctx_size 2048 --temp 0.7 --top_k 40 --top_p 0.5 --repeat_last_n 256 --batch_size 1024 --repeat_penalty 1.17647}"
 
+DATE_TIME=$(date +%H:%M)
+DATE_YEAR=$(date +%Y)
+
+PROMPT_FILE=$(mktemp -t llamacpp_prompt.XXXXXXX.txt)
+trap 'rm -f "$PROMPT_FILE"' EXIT  # do not leave the rendered prompt behind in the temp dir
+sed -e "s/\[\[USER_NAME\]\]/$USER_NAME/g" \
+    -e "s/\[\[AI_NAME\]\]/$AI_NAME/g" \
+    -e "s/\[\[DATE_TIME\]\]/$DATE_TIME/g" \
+    -e "s/\[\[DATE_YEAR\]\]/$DATE_YEAR/g" \
+     "$PROMPT_TEMPLATE" > "$PROMPT_FILE"  # quoted: either path may contain spaces
+
 # shellcheck disable=SC2086 # Intended splitting of GEN_OPTIONS
 ./main $GEN_OPTIONS \
   --model "$MODEL" \
   --threads "$N_THREAD" \
   --n_predict "$N_PREDICTS" \
   --color --interactive \
+  --file "${PROMPT_FILE}" \
   --reverse-prompt "${USER_NAME}:" \
-  --prompt "
-Text transcript of a never ending dialog, where ${USER_NAME} interacts with an AI assistant named ${AI_NAME}.
-${AI_NAME} is helpful, kind, honest, friendly, good at writing and never fails to answer ${USER_NAME}'s requests immediately and with details and precision.
-There are no annotations like (30 seconds passed...) or (to himself), just what ${USER_NAME} and ${AI_NAME} say aloud to each other.
-The dialog lasts for years, the entirety of it is shared below. It's 10000 pages long.
-The transcript only includes text, it does not include markup like HTML and Markdown.
-
-$USER_NAME: Hello, $AI_NAME!
-$AI_NAME: Hello $USER_NAME! How may I help you today?
-$USER_NAME: What year is it?
-$AI_NAME: We are in $(date +%Y).
-$USER_NAME: Please tell me the largest city in Europe.
-$AI_NAME: The largest city in Europe is Moscow, the capital of Russia.
-$USER_NAME: What can you tell me about Moscow?
-$AI_NAME: Moscow, on the Moskva River in western Russia, is the nation's cosmopolitan capital. In its historic core is the Kremlin, a complex that's home to the president and tsarist treasures in the Armoury. Outside its walls is Red Square, Russia’s symbolic center.
-$USER_NAME: What is a cat?
-$AI_NAME: A cat is a domestic species of small carnivorous mammal. It is the only domesticated species in the family Felidae.
-$USER_NAME: How do I pass command line arguments to a Node.js program?
-$AI_NAME: The arguments are stored in process.argv.
-
-    argv[0] is the path to the Node. js executable.
-    argv[1] is the path to the script file.
-    argv[2] is the first argument passed to the script.
-    argv[3] is the second argument passed to the script and so on.
-$USER_NAME: Name a color.
-$AI_NAME: Blue.
-$USER_NAME: What time is it?
-$AI_NAME: It is $(date +%H:%M).
-$USER_NAME:" "$@"
+  --in-prefix ' ' \
+  "$@"
diff --git a/prompts/chat-with-vicuna-v0.txt b/prompts/chat-with-vicuna-v0.txt
new file mode 100644
index 000000000..0462e8421
--- /dev/null
+++ b/prompts/chat-with-vicuna-v0.txt
@@ -0,0 +1,7 @@
+A chat between a curious human ("[[USER_NAME]]") and an artificial intelligence assistant ("[[AI_NAME]]"). The assistant gives helpful, detailed, and polite answers to the human's questions.
+
+### [[USER_NAME]]: Hello, [[AI_NAME]].
+### [[AI_NAME]]: Hello. How may I help you today?
+### [[USER_NAME]]: Please tell me the largest city in Europe.
+### [[AI_NAME]]: Sure. The largest city in Europe is Moscow, the capital of Russia.
+### [[USER_NAME]]:
diff --git a/prompts/chat-with-vicuna-v1.txt b/prompts/chat-with-vicuna-v1.txt
new file mode 100644
index 000000000..fdbe778af
--- /dev/null
+++ b/prompts/chat-with-vicuna-v1.txt
@@ -0,0 +1,7 @@
+A chat between a curious human ("[[USER_NAME]]") and an artificial intelligence assistant ("[[AI_NAME]]"). The assistant gives helpful, detailed, and polite answers to the human's questions.
+
+[[USER_NAME]]: Hello, [[AI_NAME]].
+[[AI_NAME]]: Hello. How may I help you today?
+[[USER_NAME]]: Please tell me the largest city in Europe.
+[[AI_NAME]]: Sure. The largest city in Europe is Moscow, the capital of Russia.
+[[USER_NAME]]:
diff --git a/prompts/chat.txt b/prompts/chat.txt
new file mode 100644
index 000000000..5452a1866
--- /dev/null
+++ b/prompts/chat.txt
@@ -0,0 +1,28 @@
+Text transcript of a never ending dialog, where [[USER_NAME]] interacts with an AI assistant named [[AI_NAME]].
+[[AI_NAME]] is helpful, kind, honest, friendly, good at writing and never fails to answer [[USER_NAME]]'s requests immediately and with details and precision.
+There are no annotations like (30 seconds passed...) or (to himself), just what [[USER_NAME]] and [[AI_NAME]] say aloud to each other.
+The dialog lasts for years, the entirety of it is shared below. It's 10000 pages long.
+The transcript only includes text, it does not include markup like HTML and Markdown.
+
+[[USER_NAME]]: Hello, [[AI_NAME]]!
+[[AI_NAME]]: Hello [[USER_NAME]]! How may I help you today?
+[[USER_NAME]]: What year is it?
+[[AI_NAME]]: We are in [[DATE_YEAR]].
+[[USER_NAME]]: Please tell me the largest city in Europe.
+[[AI_NAME]]: The largest city in Europe is Moscow, the capital of Russia.
+[[USER_NAME]]: What can you tell me about Moscow?
+[[AI_NAME]]: Moscow, on the Moskva River in western Russia, is the nation's cosmopolitan capital. In its historic core is the Kremlin, a complex that's home to the president and tsarist treasures in the Armoury. Outside its walls is Red Square, Russia’s symbolic center.
+[[USER_NAME]]: What is a cat?
+[[AI_NAME]]: A cat is a domestic species of small carnivorous mammal. It is the only domesticated species in the family Felidae.
+[[USER_NAME]]: How do I pass command line arguments to a Node.js program?
+[[AI_NAME]]: The arguments are stored in process.argv.
+
+    argv[0] is the path to the Node.js executable.
+    argv[1] is the path to the script file.
+    argv[2] is the first argument passed to the script.
+    argv[3] is the second argument passed to the script and so on.
+[[USER_NAME]]: Name a color.
+[[AI_NAME]]: Blue.
+[[USER_NAME]]: What time is it?
+[[AI_NAME]]: It is [[DATE_TIME]].
+[[USER_NAME]]:

From 799fdc1b5d888b8a8682baf112e1c2a2df0df1c4 Mon Sep 17 00:00:00 2001
From: Georgi Gerganov <ggerganov@gmail.com>
Date: Wed, 3 May 2023 23:24:20 +0300
Subject: [PATCH 4/4] ggml : vectorize Q8_0 quantization

https://github.com/ggerganov/ggml/pull/127#issuecomment-1533648531
---
 ggml.c | 120 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 120 insertions(+)

diff --git a/ggml.c b/ggml.c
index addf0c308..0bcb5f617 100644
--- a/ggml.c
+++ b/ggml.c
@@ -1509,15 +1509,135 @@ static void quantize_row_q8_0_reference(const float * restrict x, block_q8_0 * r
 }
 
 static void quantize_row_q8_0(const float * restrict x, void * restrict vy, int k) {
+    assert(QK8_0 == 32);
     assert(k % QK8_0 == 0);
+    const int nb = k / QK8_0;
 
     block_q8_0 * restrict y = vy;
 
+#if defined(__ARM_NEON)
+    for (int i = 0; i < nb; i++) {
+        float32x4_t srcv [8];
+        float32x4_t asrcv[8];
+        float32x4_t amaxv[8];
+
+        for (int l = 0; l < 8; l++) srcv[l]  = vld1q_f32(x + i*32 + 4*l);
+        for (int l = 0; l < 8; l++) asrcv[l] = vabsq_f32(srcv[l]);
+
+        for (int l = 0; l < 4; l++) amaxv[2*l] = vmaxq_f32(asrcv[2*l], asrcv[2*l+1]);
+        for (int l = 0; l < 2; l++) amaxv[4*l] = vmaxq_f32(amaxv[4*l], amaxv[4*l+2]);
+        for (int l = 0; l < 1; l++) amaxv[8*l] = vmaxq_f32(amaxv[8*l], amaxv[8*l+4]);
+
+        const float amax = vmaxvq_f32(amaxv[0]);
+
+        const float d = amax / ((1 << 7) - 1);
+        const float id = d ? 1.0f/d : 0.0f;
+
+        y[i].d = d;
+
+        for (int l = 0; l < 8; l++) {
+            const float32x4_t v  = vmulq_n_f32(srcv[l], id);
+            const int32x4_t   vi = vcvtnq_s32_f32(v);
+
+            y[i].qs[4*l + 0] = vgetq_lane_s32(vi, 0);
+            y[i].qs[4*l + 1] = vgetq_lane_s32(vi, 1);
+            y[i].qs[4*l + 2] = vgetq_lane_s32(vi, 2);
+            y[i].qs[4*l + 3] = vgetq_lane_s32(vi, 3);
+        }
+    }
+#elif defined(__AVX2__) || defined(__AVX__)
+    for (int i = 0; i < nb; i++) {
+        // Load elements into 4 AVX vectors
+        __m256 v0 = _mm256_loadu_ps( x );
+        __m256 v1 = _mm256_loadu_ps( x + 8 );
+        __m256 v2 = _mm256_loadu_ps( x + 16 );
+        __m256 v3 = _mm256_loadu_ps( x + 24 );
+        x += 32;
+
+        // Compute max(abs(e)) for the block
+        const __m256 signBit = _mm256_set1_ps( -0.0f );
+        __m256 maxAbs = _mm256_andnot_ps( signBit, v0 );
+        maxAbs = _mm256_max_ps( maxAbs, _mm256_andnot_ps( signBit, v1 ) );
+        maxAbs = _mm256_max_ps( maxAbs, _mm256_andnot_ps( signBit, v2 ) );
+        maxAbs = _mm256_max_ps( maxAbs, _mm256_andnot_ps( signBit, v3 ) );
+
+        __m128 max4 = _mm_max_ps( _mm256_extractf128_ps( maxAbs, 1 ), _mm256_castps256_ps128( maxAbs ) );
+        max4 = _mm_max_ps( max4, _mm_movehl_ps( max4, max4 ) );
+        max4 = _mm_max_ss( max4, _mm_movehdup_ps( max4 ) );
+        const float maxScalar = _mm_cvtss_f32( max4 );
+
+        // Quantize these floats
+        const float d = maxScalar / 127.f;
+        y[i].d = d;
+        const float id = ( maxScalar != 0.0f ) ? 127.f / maxScalar : 0.0f;
+        const __m256 mul = _mm256_set1_ps( id );
+
+        // Apply the multiplier
+        v0 = _mm256_mul_ps( v0, mul );
+        v1 = _mm256_mul_ps( v1, mul );
+        v2 = _mm256_mul_ps( v2, mul );
+        v3 = _mm256_mul_ps( v3, mul );
+
+        // Round to nearest integer
+        v0 = _mm256_round_ps( v0, _MM_ROUND_NEAREST );
+        v1 = _mm256_round_ps( v1, _MM_ROUND_NEAREST );
+        v2 = _mm256_round_ps( v2, _MM_ROUND_NEAREST );
+        v3 = _mm256_round_ps( v3, _MM_ROUND_NEAREST );
+
+        // Convert floats to integers
+        __m256i i0 = _mm256_cvtps_epi32( v0 );
+        __m256i i1 = _mm256_cvtps_epi32( v1 );
+        __m256i i2 = _mm256_cvtps_epi32( v2 );
+        __m256i i3 = _mm256_cvtps_epi32( v3 );
+
+#if defined(__AVX2__)
+        // Convert int32 to int16
+        i0 = _mm256_packs_epi32( i0, i1 );	// 0, 1, 2, 3,  8, 9, 10, 11,  4, 5, 6, 7, 12, 13, 14, 15
+        i2 = _mm256_packs_epi32( i2, i3 );	// 16, 17, 18, 19,  24, 25, 26, 27,  20, 21, 22, 23, 28, 29, 30, 31
+        // Convert int16 to int8
+        i0 = _mm256_packs_epi16( i0, i2 );	// 0, 1, 2, 3,  8, 9, 10, 11,  16, 17, 18, 19,  24, 25, 26, 27,  4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31
+
+        // We got our precious signed bytes, but the order is now wrong
+        // These AVX2 pack instructions process 16-byte pieces independently
+        // The following instruction is fixing the order
+        const __m256i perm = _mm256_setr_epi32( 0, 4, 1, 5, 2, 6, 3, 7 );
+        i0 = _mm256_permutevar8x32_epi32( i0, perm );
+
+        _mm256_storeu_si256((__m256i *)y[i].qs, i0);
+#else
+        // Since we don't have in AVX some necessary functions,
+        // we split the registers in half and call AVX2 analogs from SSE
+        __m128i ni0 = _mm256_castsi256_si128( i0 );
+        __m128i ni1 = _mm256_extractf128_si256( i0, 1);
+        __m128i ni2 = _mm256_castsi256_si128( i1 );
+        __m128i ni3 = _mm256_extractf128_si256( i1, 1);
+        __m128i ni4 = _mm256_castsi256_si128( i2 );
+        __m128i ni5 = _mm256_extractf128_si256( i2, 1);
+        __m128i ni6 = _mm256_castsi256_si128( i3 );
+        __m128i ni7 = _mm256_extractf128_si256( i3, 1);
+
+        // Convert int32 to int16
+        ni0 = _mm_packs_epi32( ni0, ni1 );
+        ni2 = _mm_packs_epi32( ni2, ni3 );
+        ni4 = _mm_packs_epi32( ni4, ni5 );
+        ni6 = _mm_packs_epi32( ni6, ni7 );
+        // Convert int16 to int8
+        ni0 = _mm_packs_epi16( ni0, ni2 );
+        ni4 = _mm_packs_epi16( ni4, ni6 );
+
+        _mm_storeu_si128((__m128i *)(y[i].qs +  0), ni0);
+        _mm_storeu_si128((__m128i *)(y[i].qs + 16), ni4);
+#endif
+    }
+#else
+    // scalar
     quantize_row_q8_0_reference(x, y, k);
+#endif
 }
 
 // reference implementation for deterministic creation of model files
 static void quantize_row_q8_1_reference(const float * restrict x, block_q8_1 * restrict y, int k) {
+    assert(QK8_1 == 32);
     assert(k % QK8_1 == 0);
     const int nb = k / QK8_1;