From 8768c7c45afd3a65006594c6dda2be8e6154ef4d Mon Sep 17 00:00:00 2001
From: slaren
Date: Mon, 11 Nov 2024 23:44:27 +0100
Subject: [PATCH] fix tests and examples

---
 examples/llama-bench/llama-bench.cpp       | 25 +---------------------
 examples/quantize-stats/quantize-stats.cpp | 19 ++++++++--------
 ggml/src/ggml-blas/CMakeLists.txt          |  2 ++
 ggml/src/ggml-quants.c                     |  2 ++
 pocs/vdot/vdot.cpp                         |  6 +++---
 tests/test-quantize-fns.cpp                | 19 ++++++++--------
 tests/test-quantize-perf.cpp               | 17 ++++++++-------
 7 files changed, 37 insertions(+), 53 deletions(-)

diff --git a/examples/llama-bench/llama-bench.cpp b/examples/llama-bench/llama-bench.cpp
index 1eddfd0db..d227e3036 100644
--- a/examples/llama-bench/llama-bench.cpp
+++ b/examples/llama-bench/llama-bench.cpp
@@ -774,13 +774,6 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
 struct test {
     static const std::string build_commit;
     static const int build_number;
-    static const bool cuda;
-    static const bool vulkan;
-    static const bool kompute;
-    static const bool metal;
-    static const bool sycl;
-    static const bool gpu_blas;
-    static const bool blas;
     static const std::string cpu_info;
     static const std::string gpu_info;
     std::string model_filename;
@@ -793,7 +786,6 @@ struct test {
     std::string cpu_mask;
     bool cpu_strict;
     int poll;
-    bool has_rpc;
     ggml_type type_k;
     ggml_type type_v;
     int n_gpu_layers;
@@ -822,7 +814,6 @@ struct test {
         cpu_mask = inst.cpu_mask;
         cpu_strict = inst.cpu_strict;
         poll = inst.poll;
-        has_rpc = !inst.rpc_servers.empty();
         type_k = inst.type_k;
         type_v = inst.type_v;
         n_gpu_layers = inst.n_gpu_layers;
@@ -881,7 +872,6 @@ struct test {
     static const std::vector<std::string> & get_fields() {
         static const std::vector<std::string> fields = {
             "build_commit", "build_number",
-            "cuda", "vulkan", "kompute", "metal", "sycl", "rpc", "gpu_blas", "blas",
             "cpu_info", "gpu_info",
             "model_filename", "model_type", "model_size", "model_n_params",
             "n_batch", "n_ubatch",
@@ -908,8 +898,7 @@ struct test {
             field == "avg_ns" || field == "stddev_ns") {
             return INT;
         }
-        if (field == "cuda" || field == "vulkan" || field == "kompute" || field == "metal" ||
-            field == "gpu_blas" || field == "blas" || field == "sycl" ||field == "f16_kv" || field == "no_kv_offload" ||
+        if (field == "f16_kv" || field == "no_kv_offload" ||
             field == "cpu_strict" || field == "flash_attn" || field == "use_mmap" || field == "embeddings") {
             return BOOL;
         }
@@ -938,8 +927,6 @@ struct test {
         }
         std::vector<std::string> values = {
             build_commit, std::to_string(build_number),
-            std::to_string(cuda), std::to_string(vulkan), std::to_string(vulkan),
-            std::to_string(metal), std::to_string(sycl), std::to_string(has_rpc), std::to_string(gpu_blas), std::to_string(blas),
             cpu_info, gpu_info,
             model_filename, model_type, std::to_string(model_size), std::to_string(model_n_params),
             std::to_string(n_batch), std::to_string(n_ubatch),
@@ -967,13 +954,6 @@ struct test {
 
 const std::string test::build_commit = LLAMA_COMMIT;
 const int test::build_number = LLAMA_BUILD_NUMBER;
-const bool test::cuda = !!ggml_cpu_has_cuda();
-const bool test::vulkan = !!ggml_cpu_has_vulkan();
-const bool test::kompute = !!ggml_cpu_has_kompute();
-const bool test::metal = !!ggml_cpu_has_metal();
-const bool test::gpu_blas = !!ggml_cpu_has_gpublas();
-const bool test::blas = !!ggml_cpu_has_blas();
-const bool test::sycl = !!ggml_cpu_has_sycl();
 const std::string test::cpu_info = get_cpu_info();
 const std::string test::gpu_info = get_gpu_info();
 
@@ -1268,9 +1248,6 @@ struct markdown_printer : public printer {
             value = buf;
         } else if (field == "backend") {
             value = test::get_backend();
-            if (t.has_rpc) {
-                value += "+RPC";
-            }
         } else if (field == "test") {
             if (t.n_prompt > 0 && t.n_gen == 0) {
                 snprintf(buf, sizeof(buf), "pp%d", t.n_prompt);
diff --git a/examples/quantize-stats/quantize-stats.cpp b/examples/quantize-stats/quantize-stats.cpp
index e372856c6..912caf346 100644
--- a/examples/quantize-stats/quantize-stats.cpp
+++ b/examples/quantize-stats/quantize-stats.cpp
@@ -142,7 +142,7 @@ static bool tensor_is_contiguous(const struct ggml_tensor * tensor) {
 }
 
 static void test_roundtrip_on_chunk(
-    const ggml_tensor * layer, int64_t offset, int64_t chunk_size, const ggml_type_traits & qfns, bool use_reference,
+    const ggml_tensor * layer, int64_t offset, int64_t chunk_size, const ggml_type_traits & qfns, const ggml_type_traits_cpu & qfns_cpu, bool use_reference,
     float * input_scratch, char * quantized_scratch, float * output_scratch, error_stats & stats
 ) {
     if (layer->type == GGML_TYPE_F16) {
@@ -156,7 +156,7 @@ static void test_roundtrip_on_chunk(
     if (use_reference) {
         qfns.from_float_ref(input_scratch, quantized_scratch, chunk_size);
     } else {
-        qfns.from_float(input_scratch, quantized_scratch, chunk_size);
+        qfns_cpu.from_float(input_scratch, quantized_scratch, chunk_size);
     }
 
     qfns.to_float(quantized_scratch, output_scratch, chunk_size);
@@ -166,7 +166,7 @@
 
 // Run quantization function for a single layer and update error stats
 static void test_roundtrip_on_layer(
-    std::string & name, bool print_layer_stats, const ggml_type_traits & qfns, bool use_reference,
+    std::string & name, bool print_layer_stats, const ggml_type_traits & qfns, const ggml_type_traits_cpu & qfns_cpu, bool use_reference,
     const ggml_tensor * layer, std::vector<float> & input_scratch, std::vector<char> & quantized_scratch,
     std::vector<float> & output_scratch, error_stats & total_error, int max_thread = 0
 ) {
@@ -187,13 +187,13 @@
     int num_chunks = (nelements + chunk_size - 1)/chunk_size;
 
     if (num_chunks < 2 || max_thread < 2) {
-        test_roundtrip_on_chunk(layer, 0, nelements, qfns, use_reference, input_scratch_ptr, quantized_scratch.data(),
+        test_roundtrip_on_chunk(layer, 0, nelements, qfns, qfns_cpu, use_reference, input_scratch_ptr, quantized_scratch.data(),
                 output_scratch.data(), print_layer_stats ? layer_error : total_error);
     } else {
         auto & stats = print_layer_stats ? layer_error : total_error;
         std::mutex mutex;
         uint64_t counter = 0;
-        auto compute = [&mutex, &counter, &stats, &qfns, nelements, layer, use_reference, input_scratch_ptr,
+        auto compute = [&mutex, &counter, &stats, &qfns, &qfns_cpu, nelements, layer, use_reference, input_scratch_ptr,
                 &quantized_scratch, &output_scratch, chunk_size] () {
             error_stats local_stats {};
             while (true) {
@@ -205,7 +205,7 @@
                 }
                 lock.unlock();
                 uint64_t chunk = offset + chunk_size < nelements ? chunk_size : nelements - offset;
-                test_roundtrip_on_chunk(layer, offset, chunk, qfns, use_reference, input_scratch_ptr + offset,
+                test_roundtrip_on_chunk(layer, offset, chunk, qfns, qfns_cpu, use_reference, input_scratch_ptr + offset,
                         quantized_scratch.data() + 4*offset, output_scratch.data() + offset, local_stats);
             }
         };
@@ -371,8 +371,9 @@ int main(int argc, char ** argv) {
         if (!params.include_types.empty() && std::find(params.include_types.begin(), params.include_types.end(), i) == params.include_types.end()) {
             continue;
         }
-        const auto * qfns = ggml_get_type_traits(type);
-        if (qfns->from_float && qfns->to_float) {
+        const auto * qfns     = ggml_get_type_traits(type);
+        const auto * qfns_cpu = ggml_get_type_traits_cpu(type);
+        if (qfns_cpu->from_float && qfns->to_float) {
             if (params.verbose) {
                 printf("testing %s ...\n", ggml_type_name(type));
             }
@@ -393,7 +394,7 @@ int main(int argc, char ** argv) {
             test_roundtrip_on_layer(
                     layer_name,
                     params.per_layer_stats,
-                    *qfns,
+                    *qfns, *qfns_cpu,
                     params.reference,
                     kv_tensor.second,
                     input_scratch,
diff --git a/ggml/src/ggml-blas/CMakeLists.txt b/ggml/src/ggml-blas/CMakeLists.txt
index a1e5ff30d..2ad18b54e 100644
--- a/ggml/src/ggml-blas/CMakeLists.txt
+++ b/ggml/src/ggml-blas/CMakeLists.txt
@@ -19,6 +19,8 @@ if (BLAS_FOUND)
     target_include_directories(ggml-blas PRIVATE . ..)
 
     if (${GGML_BLAS_VENDOR} MATCHES "Apple")
+        add_compile_definitions(ACCELERATE_NEW_LAPACK)
+        add_compile_definitions(ACCELERATE_LAPACK_ILP64)
         add_compile_definitions(GGML_BLAS_USE_ACCELERATE)
     elseif ("${BLAS_INCLUDE_DIRS}" STREQUAL "")
         # BLAS_INCLUDE_DIRS is missing in FindBLAS.cmake.
diff --git a/ggml/src/ggml-quants.c b/ggml/src/ggml-quants.c
index 79e4d37cc..7301a9c6c 100644
--- a/ggml/src/ggml-quants.c
+++ b/ggml/src/ggml-quants.c
@@ -4931,6 +4931,8 @@ void quantize_row_iq2_s_ref(const float * restrict x, block_iq2_s * restrict y,
     quantize_iq2_s(x, y, 1, k, NULL);
 }
 
+// =============================== data validation
+
 static bool validate_float(float f, size_t i) {
     if (isinf(f)) {
         fprintf(stderr, "ggml_validate_row_data: found inf value at block %zu\n", i);
diff --git a/pocs/vdot/vdot.cpp b/pocs/vdot/vdot.cpp
index e9af8a363..67c6964b6 100644
--- a/pocs/vdot/vdot.cpp
+++ b/pocs/vdot/vdot.cpp
@@ -263,9 +263,9 @@ int main(int argc, char** argv) {
         // Note, we do not include this in the timing as in practical application
         // we already have the quantized model weights.
         if (useQ4_1) {
-            funcs->from_float(x1.data(), q41.data(), kVecSize);
+            funcs_cpu->from_float(x1.data(), q41.data(), kVecSize);
         } else {
-            funcs->from_float(x1.data(), q40.data(), kVecSize);
+            funcs_cpu->from_float(x1.data(), q40.data(), kVecSize);
         }
 
         // Now measure time the dot product needs using the "scalar" version above
@@ -284,7 +284,7 @@ int main(int argc, char** argv) {
             dot_q4_q8(kVecSize, &result, q40.data(), q8.data());
         }
         else {
-            const auto * vdot = ggml_get_type_traits(funcs_cpu->vec_dot_type);
+            const auto * vdot = ggml_get_type_traits_cpu(funcs_cpu->vec_dot_type);
             vdot->from_float(y1.data(), q8.data(), kVecSize);
             if (useQ4_1) funcs_cpu->vec_dot(kVecSize, &result, 0, q41.data(), 0, q8.data(), 0, 1);
             else funcs_cpu->vec_dot(kVecSize, &result, 0, q40.data(), 0, q8.data(), 0, 1);
diff --git a/tests/test-quantize-fns.cpp b/tests/test-quantize-fns.cpp
index 000e60adf..8d0bf0470 100644
--- a/tests/test-quantize-fns.cpp
+++ b/tests/test-quantize-fns.cpp
@@ -45,22 +45,23 @@ static float array_rmse(const float * a1, const float * a2, size_t n) {
 }
 
 // Total quantization error on test data
-static float total_quantization_error(const ggml_type_traits * qfns, size_t test_size, const float * test_data) {
+static float total_quantization_error(const ggml_type_traits * qfns, const ggml_type_traits_cpu * qfns_cpu, size_t test_size, const float * test_data) {
     std::vector<uint8_t> tmp_q(2*test_size);
     std::vector<float> tmp_out(test_size);
 
-    qfns->from_float(test_data, tmp_q.data(), test_size);
+    qfns_cpu->from_float(test_data, tmp_q.data(), test_size);
     qfns->to_float(tmp_q.data(), tmp_out.data(), test_size);
     return array_rmse(test_data, tmp_out.data(), test_size);
 }
 
 // Total quantization error on test data
-static float reference_quantization_error(const ggml_type_traits * qfns, size_t test_size, const float * test_data) {
+static float reference_quantization_error(const ggml_type_traits * qfns, const ggml_type_traits_cpu * qfns_cpu, size_t test_size, const float * test_data) {
     std::vector<uint8_t> tmp_q(2*test_size);
     std::vector<float> tmp_out(test_size);
     std::vector<float> tmp_out_ref(test_size);
 
-    qfns->from_float(test_data, tmp_q.data(), test_size);
+    // FIXME: why is this done twice?
+    qfns_cpu->from_float(test_data, tmp_q.data(), test_size);
     qfns->to_float(tmp_q.data(), tmp_out.data(), test_size);
 
     qfns->from_float_ref(test_data, tmp_q.data(), test_size);
@@ -84,9 +85,9 @@ static float dot_product_error(
     std::vector<uint8_t> tmp_q1(2*test_size);
     std::vector<uint8_t> tmp_q2(2*test_size);
 
-    const auto * vdot = ggml_get_type_traits(qfns_cpu->vec_dot_type);
+    const auto * vdot = ggml_get_type_traits_cpu(qfns_cpu->vec_dot_type);
 
-    qfns->from_float(test_data1, tmp_q1.data(), test_size);
+    qfns_cpu->from_float(test_data1, tmp_q1.data(), test_size);
     vdot->from_float(test_data2, tmp_q2.data(), test_size);
 
     float result = INFINITY;
@@ -145,8 +146,8 @@ int main(int argc, char * argv[]) {
         printf("Testing %s\n", ggml_type_name((ggml_type) i));
         ggml_quantize_init(ei);
 
-        if (qfns->from_float && qfns->to_float) {
-            const float total_error = total_quantization_error(qfns, test_size, test_data.data());
+        if (qfns_cpu->from_float && qfns->to_float) {
+            const float total_error = total_quantization_error(qfns, qfns_cpu, test_size, test_data.data());
             const float max_quantization_error =
                 type == GGML_TYPE_TQ1_0 ? MAX_QUANTIZATION_TOTAL_ERROR_TERNARY :
                 type == GGML_TYPE_TQ2_0 ? MAX_QUANTIZATION_TOTAL_ERROR_TERNARY :
@@ -161,7 +162,7 @@ int main(int argc, char * argv[]) {
                 printf("%5s absolute quantization error: %s (%f)\n", ggml_type_name(type), RESULT_STR[failed], total_error);
             }
 
-            const float reference_error = reference_quantization_error(qfns, test_size, test_data.data());
+            const float reference_error = reference_quantization_error(qfns, qfns_cpu, test_size, test_data.data());
             failed = !(reference_error < MAX_QUANTIZATION_REFERENCE_ERROR);
             num_failed += failed;
             if (failed || verbose) {
diff --git a/tests/test-quantize-perf.cpp b/tests/test-quantize-perf.cpp
index 221424de8..ac0d12714 100644
--- a/tests/test-quantize-perf.cpp
+++ b/tests/test-quantize-perf.cpp
@@ -123,9 +123,10 @@ static void usage(char * argv[]) {
     printf(" --type TYPE set test type as");
     for (int i = 0; i < GGML_TYPE_COUNT; i++) {
         ggml_type type = (ggml_type) i;
-        const auto * qfns = ggml_get_type_traits(type);
+        const auto * qfns     = ggml_get_type_traits(type);
+        const auto * qfns_cpu = ggml_get_type_traits_cpu(type);
         if (ggml_type_name(type) != NULL) {
-            if (qfns->from_float && qfns->to_float) {
+            if (qfns_cpu->from_float && qfns->to_float) {
                 printf(" %s", ggml_type_name(type));
             }
         }
@@ -277,7 +278,7 @@ int main(int argc, char * argv[]) {
             continue;
         }
 
-        if (qfns->from_float && qfns->to_float) {
+        if (qfns_cpu->from_float && qfns->to_float) {
             printf("%s\n", ggml_type_name(type));
             ggml_quantize_init(type);
 
@@ -301,7 +302,7 @@ int main(int argc, char * argv[]) {
                 for (size_t size : params.test_sizes) {
                     printf(" %zu values (%.2f MB)\n", size, 4*size/(float)(1024*1024));
                     auto quantize_fn = [&](void) -> float {
-                        qfns->from_float(test_data1, test_q1, size);
+                        qfns_cpu->from_float(test_data1, test_q1, size);
                         return test_q1[0];
                     };
                     size_t quantized_size = ggml_row_size(type, size);
@@ -312,7 +313,7 @@ int main(int argc, char * argv[]) {
 
             if (params.op_dequantize_row_q) {
                 printf(" dequantize_row_q\n");
-                qfns->from_float(test_data1, test_q1, largest);
+                qfns_cpu->from_float(test_data1, test_q1, largest);
                 for (size_t size : params.test_sizes) {
                     printf(" %zu values (%.2f MB)\n", size, 4*size/(float)(1024*1024));
                     auto quantize_fn = [&](void) -> float {
@@ -330,7 +331,7 @@ int main(int argc, char * argv[]) {
                 for (size_t size : params.test_sizes) {
                     printf(" %zu values (%.2f MB)\n", size, 4*size/(float)(1024*1024));
                     auto quantize_fn = [&](void) -> float {
-                        const auto * vdot = ggml_get_type_traits(qfns_cpu->vec_dot_type);
+                        const auto * vdot = ggml_get_type_traits_cpu(qfns_cpu->vec_dot_type);
                         vdot->from_float(test_data1, test_q1, size);
                         return test_q1[0];
                     };
@@ -342,8 +343,8 @@ int main(int argc, char * argv[]) {
 
             if (params.op_vec_dot_q) {
                 printf(" vec_dot_q\n");
-                qfns->from_float(test_data1, test_q1, largest);
-                qfns->from_float(test_data2, test_q2, largest);
+                qfns_cpu->from_float(test_data1, test_q1, largest);
+                qfns_cpu->from_float(test_data2, test_q2, largest);
                 for (size_t size : params.test_sizes) {
                     printf(" %zu values (%.2f MB)\n", size, 4*size/(float)(1024*1024));
                     auto quantize_fn = [&](void) -> float {
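
Most hunks above apply the same change: after the backend split, `from_float` is looked up on the CPU type traits returned by `ggml_get_type_traits_cpu()`, while `to_float` and `from_float_ref` stay on the base traits from `ggml_get_type_traits()`. As a rough stand-alone illustration (not part of the patch), a round trip using those two entry points could look like the sketch below; the header names, the Q4_0 type, and the element count are illustrative assumptions rather than anything taken from the diff:

```cpp
// Hypothetical sketch: quantize with the CPU traits' from_float, dequantize
// with the base traits' to_float, mirroring the calls used by the updated tests.
#include <cstdint>
#include <cstdio>
#include <vector>

#include "ggml.h"
#include "ggml-cpu.h"

int main() {
    const ggml_type type = GGML_TYPE_Q4_0;   // assumption: any quantized type works the same way
    const int64_t n = 256;                   // assumption: a multiple of the Q4_0 block size (32)

    const auto * traits     = ggml_get_type_traits(type);      // to_float, from_float_ref, ...
    const auto * traits_cpu = ggml_get_type_traits_cpu(type);  // from_float, vec_dot, ...

    // Same guard the tests use: skip types that do not support both directions.
    if (!traits_cpu->from_float || !traits->to_float) {
        return 1;
    }

    ggml_quantize_init(type);                // some types need lookup tables initialized first

    std::vector<float>   src(n, 0.5f);
    std::vector<uint8_t> quantized(ggml_row_size(type, n));
    std::vector<float>   dst(n);

    traits_cpu->from_float(src.data(), quantized.data(), n);   // quantize via CPU backend traits
    traits->to_float(quantized.data(), dst.data(), n);         // dequantize via base type traits

    printf("%s round trip: %f -> %f\n", ggml_type_name(type), src[0], dst[0]);
    return 0;
}
```

The guard mirrors the tests' updated check (`qfns_cpu->from_float && qfns->to_float`), which is why every call site in the patch fetches both trait structs for the same type.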