From e2a937ca6abadc7e01e139db31e6db9dce16e3e9 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Wed, 3 May 2023 18:43:23 +0300 Subject: [PATCH 1/4] minor : fix trailing whitespaces --- scripts/verify-checksum-models.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/verify-checksum-models.py b/scripts/verify-checksum-models.py index 811372e47..1f1b3d24f 100644 --- a/scripts/verify-checksum-models.py +++ b/scripts/verify-checksum-models.py @@ -6,13 +6,13 @@ def sha256sum(file): b = bytearray(block_size) file_hash = hashlib.sha256() mv = memoryview(b) - with open(file, 'rb', buffering=0) as f: + with open(file, 'rb', buffering=0) as f: while True: n = f.readinto(mv) if not n: break file_hash.update(mv[:n]) - + return file_hash.hexdigest() # Define the path to the llama directory (parent folder of script directory) From bca9ad938a2a43621cf406d993b755cc91728dd5 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Wed, 3 May 2023 20:09:42 +0300 Subject: [PATCH 2/4] minor : fix whitespaces (#1302) --- README.md | 2 +- scripts/verify-checksum-models.py | 155 +++++++++++++++--------------- 2 files changed, 78 insertions(+), 79 deletions(-) diff --git a/README.md b/README.md index de0a3deef..0002f8cc1 100644 --- a/README.md +++ b/README.md @@ -388,7 +388,7 @@ python3 .\scripts\verify-checksum-models.py ``` - On linux or macOS it is also possible to run the following commands to verify if you have all possible latest files in your self-installed `./models` subdirectory: - - On Linux: `sha256sum --ignore-missing -c SHA256SUMS` + - On Linux: `sha256sum --ignore-missing -c SHA256SUMS` - on macOS: `shasum -a 256 --ignore-missing -c SHA256SUMS` ### Seminal papers and background on the models diff --git a/scripts/verify-checksum-models.py b/scripts/verify-checksum-models.py index 1f1b3d24f..2ce572826 100644 --- a/scripts/verify-checksum-models.py +++ b/scripts/verify-checksum-models.py @@ -1,78 +1,77 @@ -import os -import hashlib - -def sha256sum(file): - block_size = 16 * 1024 * 1024 # 16 MB block size - b = bytearray(block_size) - file_hash = hashlib.sha256() - mv = memoryview(b) - with open(file, 'rb', buffering=0) as f: - while True: - n = f.readinto(mv) - if not n: - break - file_hash.update(mv[:n]) - - return file_hash.hexdigest() - -# Define the path to the llama directory (parent folder of script directory) -llama_path = os.path.abspath(os.path.join(os.path.dirname(__file__), os.pardir)) - -# Define the file with the list of hashes and filenames -hash_list_file = os.path.join(llama_path, "SHA256SUMS") - -# Check if the hash list file exists -if not os.path.exists(hash_list_file): - print(f"Hash list file not found: {hash_list_file}") - exit(1) - -# Read the hash file content and split it into an array of lines -with open(hash_list_file, "r") as f: - hash_list = f.read().splitlines() - -# Create an array to store the results -results = [] - -# Loop over each line in the hash list -for line in hash_list: - # Split the line into hash and filename - hash_value, filename = line.split(" ") - - # Get the full path of the file by joining the llama path and the filename - file_path = os.path.join(llama_path, filename) - - # Informing user of the progress of the integrity check - print(f"Verifying the checksum of {file_path}") - - # Check if the file exists - if os.path.exists(file_path): - # Calculate the SHA256 checksum of the file using hashlib - file_hash = sha256sum(file_path) - - # Compare the file hash with the expected hash - if file_hash == hash_value: - valid_checksum = "V" - file_missing = "" - else: - valid_checksum = "" - file_missing = "" - else: - valid_checksum = "" - file_missing = "X" - - # Add the results to the array - results.append({ - "filename": filename, - "valid checksum": valid_checksum, - "file missing": file_missing - }) - - -# Print column headers for results table -print("\n" + "filename".ljust(40) + "valid checksum".center(20) + "file missing".center(20)) -print("-" * 80) - -# Output the results as a table -for r in results: - print(f"{r['filename']:40} {r['valid checksum']:^20} {r['file missing']:^20}") - +import os +import hashlib + +def sha256sum(file): + block_size = 16 * 1024 * 1024 # 16 MB block size + b = bytearray(block_size) + file_hash = hashlib.sha256() + mv = memoryview(b) + with open(file, 'rb', buffering=0) as f: + while True: + n = f.readinto(mv) + if not n: + break + file_hash.update(mv[:n]) + + return file_hash.hexdigest() + +# Define the path to the llama directory (parent folder of script directory) +llama_path = os.path.abspath(os.path.join(os.path.dirname(__file__), os.pardir)) + +# Define the file with the list of hashes and filenames +hash_list_file = os.path.join(llama_path, "SHA256SUMS") + +# Check if the hash list file exists +if not os.path.exists(hash_list_file): + print(f"Hash list file not found: {hash_list_file}") + exit(1) + +# Read the hash file content and split it into an array of lines +with open(hash_list_file, "r") as f: + hash_list = f.read().splitlines() + +# Create an array to store the results +results = [] + +# Loop over each line in the hash list +for line in hash_list: + # Split the line into hash and filename + hash_value, filename = line.split(" ") + + # Get the full path of the file by joining the llama path and the filename + file_path = os.path.join(llama_path, filename) + + # Informing user of the progress of the integrity check + print(f"Verifying the checksum of {file_path}") + + # Check if the file exists + if os.path.exists(file_path): + # Calculate the SHA256 checksum of the file using hashlib + file_hash = sha256sum(file_path) + + # Compare the file hash with the expected hash + if file_hash == hash_value: + valid_checksum = "V" + file_missing = "" + else: + valid_checksum = "" + file_missing = "" + else: + valid_checksum = "" + file_missing = "X" + + # Add the results to the array + results.append({ + "filename": filename, + "valid checksum": valid_checksum, + "file missing": file_missing + }) + + +# Print column headers for results table +print("\n" + "filename".ljust(40) + "valid checksum".center(20) + "file missing".center(20)) +print("-" * 80) + +# Output the results as a table +for r in results: + print(f"{r['filename']:40} {r['valid checksum']:^20} {r['file missing']:^20}") From 6daa09d87926fe654385c2887e39ec3eeaa58120 Mon Sep 17 00:00:00 2001 From: khimaros Date: Wed, 3 May 2023 10:58:11 -0700 Subject: [PATCH 3/4] examples : read chat prompts from a template file (#1196) --- examples/chat-13B.sh | 48 +++++++++++++-------------------- prompts/chat-with-vicuna-v0.txt | 7 +++++ prompts/chat-with-vicuna-v1.txt | 7 +++++ prompts/chat.txt | 28 +++++++++++++++++++ 4 files changed, 60 insertions(+), 30 deletions(-) create mode 100644 prompts/chat-with-vicuna-v0.txt create mode 100644 prompts/chat-with-vicuna-v1.txt create mode 100644 prompts/chat.txt diff --git a/examples/chat-13B.sh b/examples/chat-13B.sh index d7148d184..35c089d57 100755 --- a/examples/chat-13B.sh +++ b/examples/chat-13B.sh @@ -1,9 +1,12 @@ #!/bin/bash +set -e + cd "$(dirname "$0")/.." || exit MODEL="${MODEL:-./models/13B/ggml-model-q4_0.bin}" -USER_NAME="${USER_NAME:-User}" +PROMPT_TEMPLATE=${PROMPT_TEMPLATE:-./prompts/chat.txt} +USER_NAME="${USER_NAME:-USER}" AI_NAME="${AI_NAME:-ChatLLaMa}" # Adjust to the number of CPU cores you want to use. @@ -15,39 +18,24 @@ N_PREDICTS="${N_PREDICTS:-2048}" # For example, override the context size by doing: ./chatLLaMa --ctx_size 1024 GEN_OPTIONS="${GEN_OPTIONS:---ctx_size 2048 --temp 0.7 --top_k 40 --top_p 0.5 --repeat_last_n 256 --batch_size 1024 --repeat_penalty 1.17647}" +DATE_TIME=$(date +%H:%M) +DATE_YEAR=$(date +%Y) + +PROMPT_FILE=$(mktemp -t llamacpp_prompt.XXXXXXX.txt) + +sed -e "s/\[\[USER_NAME\]\]/$USER_NAME/g" \ + -e "s/\[\[AI_NAME\]\]/$AI_NAME/g" \ + -e "s/\[\[DATE_TIME\]\]/$DATE_TIME/g" \ + -e "s/\[\[DATE_YEAR\]\]/$DATE_YEAR/g" \ + $PROMPT_TEMPLATE > $PROMPT_FILE + # shellcheck disable=SC2086 # Intended splitting of GEN_OPTIONS ./main $GEN_OPTIONS \ --model "$MODEL" \ --threads "$N_THREAD" \ --n_predict "$N_PREDICTS" \ --color --interactive \ + --file ${PROMPT_FILE} \ --reverse-prompt "${USER_NAME}:" \ - --prompt " -Text transcript of a never ending dialog, where ${USER_NAME} interacts with an AI assistant named ${AI_NAME}. -${AI_NAME} is helpful, kind, honest, friendly, good at writing and never fails to answer ${USER_NAME}'s requests immediately and with details and precision. -There are no annotations like (30 seconds passed...) or (to himself), just what ${USER_NAME} and ${AI_NAME} say aloud to each other. -The dialog lasts for years, the entirety of it is shared below. It's 10000 pages long. -The transcript only includes text, it does not include markup like HTML and Markdown. - -$USER_NAME: Hello, $AI_NAME! -$AI_NAME: Hello $USER_NAME! How may I help you today? -$USER_NAME: What year is it? -$AI_NAME: We are in $(date +%Y). -$USER_NAME: Please tell me the largest city in Europe. -$AI_NAME: The largest city in Europe is Moscow, the capital of Russia. -$USER_NAME: What can you tell me about Moscow? -$AI_NAME: Moscow, on the Moskva River in western Russia, is the nation's cosmopolitan capital. In its historic core is the Kremlin, a complex that's home to the president and tsarist treasures in the Armoury. Outside its walls is Red Square, Russia’s symbolic center. -$USER_NAME: What is a cat? -$AI_NAME: A cat is a domestic species of small carnivorous mammal. It is the only domesticated species in the family Felidae. -$USER_NAME: How do I pass command line arguments to a Node.js program? -$AI_NAME: The arguments are stored in process.argv. - - argv[0] is the path to the Node. js executable. - argv[1] is the path to the script file. - argv[2] is the first argument passed to the script. - argv[3] is the second argument passed to the script and so on. -$USER_NAME: Name a color. -$AI_NAME: Blue. -$USER_NAME: What time is it? -$AI_NAME: It is $(date +%H:%M). -$USER_NAME:" "$@" + --in-prefix ' ' \ + "$@" diff --git a/prompts/chat-with-vicuna-v0.txt b/prompts/chat-with-vicuna-v0.txt new file mode 100644 index 000000000..0462e8421 --- /dev/null +++ b/prompts/chat-with-vicuna-v0.txt @@ -0,0 +1,7 @@ +A chat between a curious human ("[[USER_NAME]]") and an artificial intelligence assistant ("[[AI_NAME]]"). The assistant gives helpful, detailed, and polite answers to the human's questions. + +### [[USER_NAME]]: Hello, [[AI_NAME]]. +### [[AI_NAME]]: Hello. How may I help you today? +### [[USER_NAME]]: Please tell me the largest city in Europe. +### [[AI_NAME]]: Sure. The largest city in Europe is Moscow, the capital of Russia. +### [[USER_NAME]]: diff --git a/prompts/chat-with-vicuna-v1.txt b/prompts/chat-with-vicuna-v1.txt new file mode 100644 index 000000000..fdbe778af --- /dev/null +++ b/prompts/chat-with-vicuna-v1.txt @@ -0,0 +1,7 @@ +A chat between a curious human ("[[USER_NAME]]") and an artificial intelligence assistant ("[[AI_NAME]]"). The assistant gives helpful, detailed, and polite answers to the human's questions. + +[[USER_NAME]]: Hello, [[AI_NAME]]. +[[AI_NAME]]: Hello. How may I help you today? +[[USER_NAME]]: Please tell me the largest city in Europe. +[[AI_NAME]]: Sure. The largest city in Europe is Moscow, the capital of Russia. +[[USER_NAME]]: diff --git a/prompts/chat.txt b/prompts/chat.txt new file mode 100644 index 000000000..5452a1866 --- /dev/null +++ b/prompts/chat.txt @@ -0,0 +1,28 @@ +Text transcript of a never ending dialog, where [[USER_NAME]] interacts with an AI assistant named [[AI_NAME]]. +[[AI_NAME]] is helpful, kind, honest, friendly, good at writing and never fails to answer [[USER_NAME]]'s requests immediately and with details and precision. +There are no annotations like (30 seconds passed...) or (to himself), just what [[USER_NAME]] and [[AI_NAME]] say aloud to each other. +The dialog lasts for years, the entirety of it is shared below. It's 10000 pages long. +The transcript only includes text, it does not include markup like HTML and Markdown. + +[[USER_NAME]]: Hello, [[AI_NAME]]! +[[AI_NAME]]: Hello [[USER_NAME]]! How may I help you today? +[[USER_NAME]]: What year is it? +[[AI_NAME]]: We are in [[DATE_YEAR]]. +[[USER_NAME]]: Please tell me the largest city in Europe. +[[AI_NAME]]: The largest city in Europe is Moscow, the capital of Russia. +[[USER_NAME]]: What can you tell me about Moscow? +[[AI_NAME]]: Moscow, on the Moskva River in western Russia, is the nation's cosmopolitan capital. In its historic core is the Kremlin, a complex that's home to the president and tsarist treasures in the Armoury. Outside its walls is Red Square, Russia’s symbolic center. +[[USER_NAME]]: What is a cat? +[[AI_NAME]]: A cat is a domestic species of small carnivorous mammal. It is the only domesticated species in the family Felidae. +[[USER_NAME]]: How do I pass command line arguments to a Node.js program? +[[AI_NAME]]: The arguments are stored in process.argv. + + argv[0] is the path to the Node. js executable. + argv[1] is the path to the script file. + argv[2] is the first argument passed to the script. + argv[3] is the second argument passed to the script and so on. +[[USER_NAME]]: Name a color. +[[AI_NAME]]: Blue. +[[USER_NAME]]: What time is it? +[[AI_NAME]]: It is [[DATE_TIME]]. +[[USER_NAME]]: From 799fdc1b5d888b8a8682baf112e1c2a2df0df1c4 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Wed, 3 May 2023 23:24:20 +0300 Subject: [PATCH 4/4] ggml : vectorize Q8_0 quantization https://github.com/ggerganov/ggml/pull/127#issuecomment-1533648531 --- ggml.c | 120 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 120 insertions(+) diff --git a/ggml.c b/ggml.c index addf0c308..0bcb5f617 100644 --- a/ggml.c +++ b/ggml.c @@ -1509,15 +1509,135 @@ static void quantize_row_q8_0_reference(const float * restrict x, block_q8_0 * r } static void quantize_row_q8_0(const float * restrict x, void * restrict vy, int k) { + assert(QK8_0 == 32); assert(k % QK8_0 == 0); + const int nb = k / QK8_0; block_q8_0 * restrict y = vy; +#if defined(__ARM_NEON) + for (int i = 0; i < nb; i++) { + float32x4_t srcv [8]; + float32x4_t asrcv[8]; + float32x4_t amaxv[8]; + + for (int l = 0; l < 8; l++) srcv[l] = vld1q_f32(x + i*32 + 4*l); + for (int l = 0; l < 8; l++) asrcv[l] = vabsq_f32(srcv[l]); + + for (int l = 0; l < 4; l++) amaxv[2*l] = vmaxq_f32(asrcv[2*l], asrcv[2*l+1]); + for (int l = 0; l < 2; l++) amaxv[4*l] = vmaxq_f32(amaxv[4*l], amaxv[4*l+2]); + for (int l = 0; l < 1; l++) amaxv[8*l] = vmaxq_f32(amaxv[8*l], amaxv[8*l+4]); + + const float amax = vmaxvq_f32(amaxv[0]); + + const float d = amax / ((1 << 7) - 1); + const float id = d ? 1.0f/d : 0.0f; + + y[i].d = d; + + for (int l = 0; l < 8; l++) { + const float32x4_t v = vmulq_n_f32(srcv[l], id); + const int32x4_t vi = vcvtnq_s32_f32(v); + + y[i].qs[4*l + 0] = vgetq_lane_s32(vi, 0); + y[i].qs[4*l + 1] = vgetq_lane_s32(vi, 1); + y[i].qs[4*l + 2] = vgetq_lane_s32(vi, 2); + y[i].qs[4*l + 3] = vgetq_lane_s32(vi, 3); + } + } +#elif defined(__AVX2__) || defined(__AVX__) + for (int i = 0; i < nb; i++) { + // Load elements into 4 AVX vectors + __m256 v0 = _mm256_loadu_ps( x ); + __m256 v1 = _mm256_loadu_ps( x + 8 ); + __m256 v2 = _mm256_loadu_ps( x + 16 ); + __m256 v3 = _mm256_loadu_ps( x + 24 ); + x += 32; + + // Compute max(abs(e)) for the block + const __m256 signBit = _mm256_set1_ps( -0.0f ); + __m256 maxAbs = _mm256_andnot_ps( signBit, v0 ); + maxAbs = _mm256_max_ps( maxAbs, _mm256_andnot_ps( signBit, v1 ) ); + maxAbs = _mm256_max_ps( maxAbs, _mm256_andnot_ps( signBit, v2 ) ); + maxAbs = _mm256_max_ps( maxAbs, _mm256_andnot_ps( signBit, v3 ) ); + + __m128 max4 = _mm_max_ps( _mm256_extractf128_ps( maxAbs, 1 ), _mm256_castps256_ps128( maxAbs ) ); + max4 = _mm_max_ps( max4, _mm_movehl_ps( max4, max4 ) ); + max4 = _mm_max_ss( max4, _mm_movehdup_ps( max4 ) ); + const float maxScalar = _mm_cvtss_f32( max4 ); + + // Quantize these floats + const float d = maxScalar / 127.f; + y[i].d = d; + const float id = ( maxScalar != 0.0f ) ? 127.f / maxScalar : 0.0f; + const __m256 mul = _mm256_set1_ps( id ); + + // Apply the multiplier + v0 = _mm256_mul_ps( v0, mul ); + v1 = _mm256_mul_ps( v1, mul ); + v2 = _mm256_mul_ps( v2, mul ); + v3 = _mm256_mul_ps( v3, mul ); + + // Round to nearest integer + v0 = _mm256_round_ps( v0, _MM_ROUND_NEAREST ); + v1 = _mm256_round_ps( v1, _MM_ROUND_NEAREST ); + v2 = _mm256_round_ps( v2, _MM_ROUND_NEAREST ); + v3 = _mm256_round_ps( v3, _MM_ROUND_NEAREST ); + + // Convert floats to integers + __m256i i0 = _mm256_cvtps_epi32( v0 ); + __m256i i1 = _mm256_cvtps_epi32( v1 ); + __m256i i2 = _mm256_cvtps_epi32( v2 ); + __m256i i3 = _mm256_cvtps_epi32( v3 ); + +#if defined(__AVX2__) + // Convert int32 to int16 + i0 = _mm256_packs_epi32( i0, i1 ); // 0, 1, 2, 3, 8, 9, 10, 11, 4, 5, 6, 7, 12, 13, 14, 15 + i2 = _mm256_packs_epi32( i2, i3 ); // 16, 17, 18, 19, 24, 25, 26, 27, 20, 21, 22, 23, 28, 29, 30, 31 + // Convert int16 to int8 + i0 = _mm256_packs_epi16( i0, i2 ); // 0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27, 4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31 + + // We got our precious signed bytes, but the order is now wrong + // These AVX2 pack instructions process 16-byte pieces independently + // The following instruction is fixing the order + const __m256i perm = _mm256_setr_epi32( 0, 4, 1, 5, 2, 6, 3, 7 ); + i0 = _mm256_permutevar8x32_epi32( i0, perm ); + + _mm256_storeu_si256((__m256i *)y[i].qs, i0); +#else + // Since we don't have in AVX some necessary functions, + // we split the registers in half and call AVX2 analogs from SSE + __m128i ni0 = _mm256_castsi256_si128( i0 ); + __m128i ni1 = _mm256_extractf128_si256( i0, 1); + __m128i ni2 = _mm256_castsi256_si128( i1 ); + __m128i ni3 = _mm256_extractf128_si256( i1, 1); + __m128i ni4 = _mm256_castsi256_si128( i2 ); + __m128i ni5 = _mm256_extractf128_si256( i2, 1); + __m128i ni6 = _mm256_castsi256_si128( i3 ); + __m128i ni7 = _mm256_extractf128_si256( i3, 1); + + // Convert int32 to int16 + ni0 = _mm_packs_epi32( ni0, ni1 ); + ni2 = _mm_packs_epi32( ni2, ni3 ); + ni4 = _mm_packs_epi32( ni4, ni5 ); + ni6 = _mm_packs_epi32( ni6, ni7 ); + // Convert int16 to int8 + ni0 = _mm_packs_epi16( ni0, ni2 ); + ni4 = _mm_packs_epi16( ni4, ni6 ); + + _mm_storeu_si128((__m128i *)(y[i].qs + 0), ni0); + _mm_storeu_si128((__m128i *)(y[i].qs + 16), ni4); +#endif + } +#else + // scalar quantize_row_q8_0_reference(x, y, k); +#endif } // reference implementation for deterministic creation of model files static void quantize_row_q8_1_reference(const float * restrict x, block_q8_1 * restrict y, int k) { + assert(QK8_1 == 32); assert(k % QK8_1 == 0); const int nb = k / QK8_1;