Merge branch 'master' into compilade/bitnet-ternary

This commit is contained in:
Francis Couture-Harpin 2024-08-22 16:42:24 -04:00
commit cb6d9962c4
77 changed files with 4681 additions and 2212 deletions

View file

@ -2145,6 +2145,13 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op
test_cases.emplace_back(new test_im2col(GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_F16, {3000, 128, 1, 1}, {3, 128, 1280, 1}, 1, 0, 1, 0, 1, 0, false));
test_cases.emplace_back(new test_im2col(GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_F32, {3000, 128, 1, 1}, {3, 128, 1280, 1}, 1, 0, 1, 0, 1, 0, false));
// sycl backend will limit task global_range < MAX_INT
// test cases for 2D im2col with large input W and H (occurs in stable-diffusion)
// however these cases need to alloc more memory which may fail in some devices (Intel Arc770, etc.)
// these cases are verified (pass) in Intel(R) Data Center GPU Max 1100 (sycl backend) and NV A30 (cuda backend)
// test_cases.emplace_back(new test_im2col(GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_F16, {1024, 1024, 256, 1}, {3, 3, 256, 1}, 1, 1, 1, 1, 1, 1, true));
// test_cases.emplace_back(new test_im2col(GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_F32, {1024, 1024, 256, 1}, {3, 3, 256, 1}, 1, 1, 1, 1, 1, 1, true));
test_cases.emplace_back(new test_conv_transpose_1d());
test_cases.emplace_back(new test_conv_transpose_1d({3,2,1,1}, {2,3,2,1}, 3, 0, 1));
test_cases.emplace_back(new test_conv_transpose_1d({3,2,1,1}, {2,3,2,1}, 2, 0, 1));
@ -2287,6 +2294,12 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op
test_cases.emplace_back(new test_mul_mat(GGML_TYPE_F16, GGML_TYPE_F32, 64, 45, 128, { 8, 1}, {4, 1}));
test_cases.emplace_back(new test_mul_mat(GGML_TYPE_F16, GGML_TYPE_F32, 128, 45, 64, { 8, 1}, {4, 1}));
// sycl backend will limit task global_range < MAX_INT
// test case for f16-type-convert-to-fp32 kernel with large k under fp32 compute dtype (occurs in stable-diffusion)
// however this case needs to alloc more memory which may fail in some devices (Intel Arc770, etc.)
// this case is verified (pass) in Intel(R) Data Center GPU Max 1100 (sycl backend) and NV A30 (cuda backend)
// test_cases.emplace_back(new test_mul_mat(GGML_TYPE_F16, GGML_TYPE_F16, 512, 262144, 9216, {1, 1}, {1, 1}));
for (ggml_type type_a : base_types) {
for (ggml_type type_b : {GGML_TYPE_F32 /*, GGML_TYPE_F16 */}) {
for (int n_mats : {4, 8}) {

View file

@ -503,7 +503,7 @@ static void test_special_chars() {
"aaaaabcccc",
"aaaabccc",
"aaaabccccc",
"🔵🟠✅❌abc❌✅🟠🔵"
"🔵🟠✅❌abc❌✅🟠🔵",
"🔵🟠abc🟠🔵"
}
);

View file

@ -0,0 +1,139 @@
#!/bin/bash
set -e

# Models under test, one "<architecture> <hidden_size>" pair per entry
declare -a params=(
    "Gemma2ForCausalLM 64"
    "LlamaForCausalLM 64"
    "Phi3ForCausalLM 64"
)

# Hugging Face repository that is cloned into a directory of the same name
MODELS_REPO=lora-tests
MODELS_REPO_URL=https://huggingface.co/ggml-org/$MODELS_REPO

# Reuse an existing checkout; only clone when the directory is missing
if [ -d "$MODELS_REPO" ]; then
    echo "Repository already exists. Skipping clone."
else
    echo "Cloning the Hugging Face repository..."
    git clone "$MODELS_REPO_URL"
fi

# Formatted per-model result summaries, collected for printing
results=()
# Strip all leading whitespace from a string.
# Arguments: $1 - string to trim
# Outputs:   the string without its leading whitespace, followed by a newline
trim_leading_whitespace() {
    local input_string="$1"
    # Removing the longest suffix that starts at the first non-whitespace
    # character leaves exactly the leading whitespace run.
    local leading_whitespace="${input_string%%[![:space:]]*}"
    # printf rather than echo: echo would treat a value such as "-n" or "-e"
    # as an option and print nothing.
    printf '%s\n' "${input_string#"$leading_whitespace"}"
}
# Cut a reference string down to the length of a target string.
# Arguments: $1 - reference string to take the prefix from
#            $2 - target string; only its length is used
# Outputs:   the first ${#2} characters of $1 (all of $1 when it is shorter),
#            followed by a newline
extract_starting_substring() {
    local reference_string="$1"
    local target_string="$2"
    local target_length=${#target_string}
    # printf rather than echo: echo would treat a prefix such as "-n" or "-e"
    # as an option and print nothing.
    printf '%s\n' "${reference_string:0:$target_length}"
}
# Print the first whitespace-delimited word of a string.
# Only the first line of the input is examined, because read consumes a
# single line of the here-string.
# Arguments: $1 - input string
# Outputs:   the first word (empty when the first line has none), followed by
#            a newline
get_first_word() {
    local input_string="$1"
    # Keep the scratch variable local so the caller's scope is not clobbered
    local first_word
    read -r first_word _ <<< "$input_string"
    # printf rather than echo: echo would treat a word such as "-n" or "-e"
    # as an option and print nothing.
    printf '%s\n' "$first_word"
}
# Reference text for the base model, plus its first word
EXPECTED_BASE_FULL=$(cat "$MODELS_REPO/data/pale_blue_dot.txt")
EXPECTED_BASE_FIRST_WORD=$(get_first_word "$EXPECTED_BASE_FULL")

# Reference text for the LoRA-adapted model, plus its first word
EXPECTED_LORA_FULL=$(cat "$MODELS_REPO/data/bohemian_rhapsody.txt")
EXPECTED_LORA_FIRST_WORD=$(get_first_word "$EXPECTED_LORA_FULL")
# Convert one test model and its LoRA adapter to GGUF, merge the adapter into
# the base model, run inference three ways (base, adapter applied at load
# time, merged adapter) and compare each output with the reference texts.
#
# Globals:
#   MODELS_REPO, EXPECTED_BASE_FULL, EXPECTED_LORA_FULL,
#   EXPECTED_BASE_FIRST_WORD, EXPECTED_LORA_FIRST_WORD (read)
#   results (appended to)
# Arguments:
#   $1 - model directory name inside $MODELS_REPO
#   $2 - hidden size, selects the hidden_size=<N> sub-directory
# Exits with status 1 when an output is empty or does not match the
# reference text.
run_conversion_and_inference_lora() {
    local model_name=$1
    local hidden_size=$2

    local model_dir="$MODELS_REPO/$model_name/hidden_size=$hidden_size"
    local base_gguf="$model_dir/base/Base-F32.gguf"
    local merged_gguf="$model_dir/base/Base-F32-lora-merged.gguf"
    # convert_lora_to_gguf.py is run without --outfile; the later steps expect
    # the adapter to end up at this path.
    local lora_gguf="$model_dir/lora/Lora-F32-LoRA.gguf"

    # Declared separately from their assignments so that a failing command
    # substitution is not masked by the exit status of 'local'.
    local output_base output_lora_hot output_lora_merged
    local expected_base expected_lora

    echo -e "\n\n-------- RUNNING TEST FOR MODEL $model_name --------\n\n"

    # Convert safetensors to gguf
    echo "Running convert_hf_to_gguf.py for $model_name with hidden_size $hidden_size..."
    python3 convert_hf_to_gguf.py "$model_dir/base" \
        --outfile "$base_gguf" \
        --outtype f32

    echo -e "\n\n---------------------------\n\n"
    echo "Running convert_lora_to_gguf.py for $model_name with hidden_size $hidden_size..."
    python3 convert_lora_to_gguf.py "$model_dir/lora" \
        --base "$model_dir/base" \
        --outtype f32

    echo -e "\n\n---------------------------\n\n"
    echo "Running llama-export-lora with lora for $model_name with hidden_size $hidden_size..."
    ./llama-export-lora \
        -m "$base_gguf" \
        -o "$merged_gguf" \
        --lora "$lora_gguf"

    # Run inference
    echo -e "\n\n---------------------------\n\n"
    echo "Running llama-cli without lora for $model_name with hidden_size $hidden_size..."
    output_base=$(./llama-cli -m "$base_gguf" \
        -p "$EXPECTED_BASE_FIRST_WORD" -n 50 --seed 42 --temp 0)

    echo -e "\n\n---------------------------\n\n"
    echo "Running llama-cli with hot lora for $model_name with hidden_size $hidden_size..."
    output_lora_hot=$(./llama-cli -m "$base_gguf" \
        --lora "$lora_gguf" \
        -p "$EXPECTED_LORA_FIRST_WORD" -n 50 --seed 42 --temp 0)

    echo -e "\n\n---------------------------\n\n"
    echo "Running llama-cli with merged lora for $model_name with hidden_size $hidden_size..."
    output_lora_merged=$(./llama-cli -m "$merged_gguf" \
        -p "$EXPECTED_LORA_FIRST_WORD" -n 50 --seed 42 --temp 0)

    # Remove any initial white space
    output_base=$(trim_leading_whitespace "$output_base")
    output_lora_hot=$(trim_leading_whitespace "$output_lora_hot")
    output_lora_merged=$(trim_leading_whitespace "$output_lora_merged")

    # The expected prefixes below are cut to the length of the actual output,
    # so an empty output would compare equal to an empty prefix and pass
    # without checking anything. Reject it explicitly.
    if [[ -z "$output_base" ]]; then
        echo "Error: $model_name OUTPUT_BASE is empty."
        exit 1
    fi
    if [[ -z "$output_lora_hot" ]]; then
        echo "Error: $model_name OUTPUT_LORA_HOT is empty."
        exit 1
    fi
    if [[ -z "$output_lora_merged" ]]; then
        echo "Error: $model_name OUTPUT_LORA_MERGED is empty."
        exit 1
    fi

    # Extract the corresponding substring from full string
    expected_base=$(extract_starting_substring "$EXPECTED_BASE_FULL" "$output_base")
    expected_lora=$(extract_starting_substring "$EXPECTED_LORA_FULL" "$output_lora_hot")

    # Assert output equals the expected output.
    # printf '%s' keeps backslashes in the model output literal in diagnostics.
    if [[ "$output_base" != "$expected_base" ]]; then
        echo "Error: $model_name OUTPUT_BASE does not start with the expected string."
        printf 'Out=%s\n\nExp=%s\n' "$output_base" "$expected_base"
        exit 1
    fi
    if [[ "$output_lora_hot" != "$expected_lora" ]]; then
        echo "Error: $model_name OUTPUT_LORA_HOT does not start with the expected string."
        printf 'Out=%s\n\nExp=%s\n' "$output_lora_hot" "$expected_lora"
        exit 1
    fi
    # The merged output is compared against the prefix sized from the hot-LoRA
    # output, so it has to match the hot-LoRA output exactly.
    if [[ "$output_lora_merged" != "$expected_lora" ]]; then
        echo "Error: $model_name OUTPUT_LORA_MERGED does not start with the expected string."
        printf 'Out=%s\n\nExp=%s\n' "$output_lora_merged" "$expected_lora"
        exit 1
    fi

    # Store the results. The escape sequences stay literal here; they are
    # interpreted when the entry is later printed with 'echo -e'.
    results+=("
\n\033[1mResults for $model_name with hidden_size $hidden_size:\033[0m
\n\033[32m • Base:\n$output_base
\n\033[34m • Lora hot:\n$output_lora_hot
\n\033[36m • Lora merged:\n$output_lora_merged
\n \033[0m
")

    echo "All tests passed for $model_name with hidden_size $hidden_size!"
}
# Run the conversion + inference test once per configured model
for idx in "${!params[@]}"; do
    # Deliberately unquoted: each entry is word-split into
    # <model name> <hidden size> arguments.
    # shellcheck disable=SC2086
    run_conversion_and_inference_lora ${params[idx]}
done

# Print the collected per-model summaries; -e renders the stored escapes
echo -e "\n\n---------------------------\n\n"
echo -e "\n\033[1mSummary of All Results:\033[0m"
for idx in "${!results[@]}"; do
    echo -e "${results[idx]}"
done