From 6ed7279adbfcdad194b0403114f4ce7ecb2e64eb Mon Sep 17 00:00:00 2001
From: Wenjing Yu
Date: Fri, 26 Jul 2024 16:59:48 -0700
Subject: [PATCH] remove benchmark

---
 .gitignore                                |   1 -
 Makefile                                  |  21 +-
 examples/CMakeLists.txt                   |   1 -
 examples/batched.swift/.gitignore         |   9 -
 examples/batched.swift/Makefile           |   6 -
 examples/batched.swift/Package.swift      |  22 --
 examples/batched.swift/README.md          |   4 -
 examples/batched.swift/Sources/main.swift | 262 ---------------------
 examples/benchmark/CMakeLists.txt         |   6 -
 examples/benchmark/benchmark-matmult.cpp  | 275 ----------------------
 10 files changed, 2 insertions(+), 605 deletions(-)
 delete mode 100644 examples/batched.swift/.gitignore
 delete mode 100755 examples/batched.swift/Makefile
 delete mode 100644 examples/batched.swift/Package.swift
 delete mode 100644 examples/batched.swift/README.md
 delete mode 100644 examples/batched.swift/Sources/main.swift
 delete mode 100644 examples/benchmark/CMakeLists.txt
 delete mode 100644 examples/benchmark/benchmark-matmult.cpp

diff --git a/.gitignore b/.gitignore
index 7c7dee0c6..b5a055955 100644
--- a/.gitignore
+++ b/.gitignore
@@ -56,7 +56,6 @@ cmake-build-*
 CMakeSettings.json
 compile_commands.json
 ggml-metal-embed.metal
-llama-batched-swift
 /rpc-server
 out/
 tmp/
diff --git a/Makefile b/Makefile
index 72ef316ea..4c321edd5 100644
--- a/Makefile
+++ b/Makefile
@@ -1,8 +1,6 @@
 # Define the default target now so that it is always the first target
 BUILD_TARGETS = \
 	libllava.a \
-	llama-bench \
-	llama-benchmark-matmult \
 	llama-cli \
 	llama-convert-llama2c-to-ggml \
 	llama-embedding \
@@ -38,8 +36,8 @@ BUILD_TARGETS = \
 
 # Legacy build targets that were renamed in #7809, but should still be removed when the project is cleaned
 LEGACY_TARGETS_CLEAN = main quantize quantize-stats perplexity imatrix embedding vdot q8dot convert-llama2c-to-ggml \
-	simple save-load-state server gguf gguf-split eval-callback llama-bench libllava.a llava-cli \
-	retrieval speculative infill tokenize benchmark-matmult parallel export-lora lookahead lookup passkey gritlm
+	simple save-load-state server gguf gguf-split eval-callback libllava.a llava-cli \
+	retrieval speculative infill tokenize parallel export-lora lookahead lookup passkey gritlm
 
 # Legacy build targets that were renamed in #7809, but we want to build binaries that for them that output a deprecation warning if people try to use them.
 # We don't want to clutter things too much, so we only build replacements for the most commonly used binaries.
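For context on what is being dropped here: the two BUILD_TARGETS entries removed above, `llama-bench` and `llama-benchmark-matmult`, are the benchmarking tools, and the source of the second one is deleted further down in this patch. That tool times repeated ggml matrix multiplications and reports a gigaFLOPS rate. The following is a minimal, hypothetical C++ sketch of the same throughput arithmetic; the helper name `matmul_gflops` and the example timing value are illustrative only and are not part of the patch or the repository.

```cpp
// Throughput arithmetic mirroring the removed benchmark-matmult tool:
// a multiplication with shared dimension sizex and output size sizey x sizez
// costs about 2*sizex*sizey*sizez FLOPs (one multiply and one add per term);
// gigaFLOPS = FLOPs / elapsed_microseconds / 1000.
#include <cstdint>
#include <cstdio>

static double matmul_gflops(int64_t sizex, int64_t sizey, int64_t sizez, int64_t elapsed_us) {
    const int64_t flops_per_matrix = 2 * sizex * sizey * sizez;
    return (double) flops_per_matrix / (double) elapsed_us / 1000.0;
}

int main() {
    // default sizes used by the removed tool; 100000 us is an illustrative timing
    printf("%.2f gFLOPS\n", matmul_gflops(11008, 4096, 128, 100000));
    return 0;
}
```

At the benchmark's default sizes (sizex = 11008, sizey = 4096, sizez = 128) this works out to roughly 11.5 GFLOP of work per multiplication, which is the figure the removed tool prints before its timing loop.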
@@ -1229,11 +1227,6 @@ llama-convert-llama2c-to-ggml: examples/convert-llama2c-to-ggml/convert-llama2c-
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
-llama-bench: examples/llama-bench/llama-bench.cpp \
-	$(OBJ_ALL)
-	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
-	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
-
 llama-export-lora: examples/export-lora/export-lora.cpp \
 	$(OBJ_ALL)
 	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
@@ -1360,16 +1353,6 @@ common/build-info.cpp: $(wildcard .git/index) scripts/build-info.sh
 
 common/build-info.o: common/build-info.cpp
 	$(CXX) $(CXXFLAGS) -c $(filter-out %.h,$^) -o $@
-llama-benchmark-matmult: examples/benchmark/benchmark-matmult.cpp \
-	$(OBJ_GGML) common/build-info.o
-	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
-	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
-
-run-benchmark-matmult: llama-benchmark-matmult
-	./$@
-
-.PHONY: run-benchmark-matmult swift
-
 #
 # PoCs
 #
diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt
index a157473c5..0a5f3647d 100644
--- a/examples/CMakeLists.txt
+++ b/examples/CMakeLists.txt
@@ -13,7 +13,6 @@ include_directories(${CMAKE_CURRENT_SOURCE_DIR})
 if (EMSCRIPTEN)
 else()
     add_subdirectory(cvector-generator)
-    add_subdirectory(benchmark)
    add_subdirectory(convert-llama2c-to-ggml)
     add_subdirectory(embedding)
     add_subdirectory(eval-callback)
diff --git a/examples/batched.swift/.gitignore b/examples/batched.swift/.gitignore
deleted file mode 100644
index e1e863bec..000000000
--- a/examples/batched.swift/.gitignore
+++ /dev/null
@@ -1,9 +0,0 @@
-.DS_Store
-/.build
-/Packages
-xcuserdata/
-DerivedData/
-.swiftpm/configuration/registries.json
-.swiftpm/xcode/package.xcworkspace/contents.xcworkspacedata
-.netrc
-batched_swift
diff --git a/examples/batched.swift/Makefile b/examples/batched.swift/Makefile
deleted file mode 100755
index 1f9156e58..000000000
--- a/examples/batched.swift/Makefile
+++ /dev/null
@@ -1,6 +0,0 @@
-.PHONY: build
-
-build:
-	xcodebuild -scheme llama-batched-swift -destination "generic/platform=macOS" -derivedDataPath build
-	rm -f ./llama-batched-swift
-	ln -s ./build/Build/Products/Debug/llama-batched-swift ./llama-batched-swift
diff --git a/examples/batched.swift/Package.swift b/examples/batched.swift/Package.swift
deleted file mode 100644
index 7e8afd084..000000000
--- a/examples/batched.swift/Package.swift
+++ /dev/null
@@ -1,22 +0,0 @@
-// swift-tools-version: 5.5
-// The swift-tools-version declares the minimum version of Swift required to build this package.
-
-import PackageDescription
-
-let package = Package(
-    name: "llama-batched-swift",
-    platforms: [.macOS(.v12)],
-    dependencies: [
-        .package(name: "llama", path: "../../"),
-    ],
-    targets: [
-        // Targets are the basic building blocks of a package, defining a module or a test suite.
-        // Targets can depend on other targets in this package and products from dependencies.
-        .executableTarget(
-            name: "llama-batched-swift",
-            dependencies: ["llama"],
-            path: "Sources",
-            linkerSettings: [.linkedFramework("Foundation"), .linkedFramework("AppKit")]
-        ),
-    ]
-)
diff --git a/examples/batched.swift/README.md b/examples/batched.swift/README.md
deleted file mode 100644
index 7f2e2fcdc..000000000
--- a/examples/batched.swift/README.md
+++ /dev/null
@@ -1,4 +0,0 @@
-This is a swift clone of `examples/batched`.
-
-$ `make`
-$ `./llama-batched-swift MODEL_PATH [PROMPT] [PARALLEL]`
diff --git a/examples/batched.swift/Sources/main.swift b/examples/batched.swift/Sources/main.swift
deleted file mode 100644
index 616494d2d..000000000
--- a/examples/batched.swift/Sources/main.swift
+++ /dev/null
@@ -1,262 +0,0 @@
-import Foundation
-import llama
-
-let arguments = CommandLine.arguments
-
-// Check that we have at least one argument (the model path)
-guard arguments.count > 1 else {
-    print("Usage: swift MODEL_PATH [PROMPT] [PARALLEL]")
-    exit(1)
-}
-
-let modelPath: String = arguments[1]
-let prompt: String = arguments.count > 2 ? arguments[2] : "Hello my name is"
-let n_parallel: Int = arguments.count > 3 && Int(arguments[3]) != nil ? Int(arguments[3])! : 1
-
-// total length of the sequences including the prompt
-let n_len: Int = 32
-
-// init LLM
-llama_backend_init()
-defer {
-    llama_backend_free()
-}
-
-let model_params = llama_model_default_params()
-guard let model = llama_load_model_from_file(modelPath.cString(using: .utf8), model_params) else {
-    print("Failed to load model")
-    exit(1)
-}
-
-defer {
-    llama_free_model(model)
-}
-
-var tokens = tokenize(text: prompt, add_bos: true)
-
-let n_kv_req = UInt32(tokens.count) + UInt32((n_len - Int(tokens.count)) * n_parallel)
-
-var context_params = llama_context_default_params()
-context_params.seed = 1234
-context_params.n_ctx = n_kv_req
-context_params.n_batch = UInt32(max(n_len, n_parallel))
-context_params.n_threads = 8
-context_params.n_threads_batch = 8
-
-let context = llama_new_context_with_model(model, context_params)
-guard context != nil else {
-    print("Failed to initialize context")
-    exit(1)
-}
-
-defer {
-    llama_free(context)
-}
-
-let n_ctx = llama_n_ctx(context)
-
-print("\nn_len = \(n_len), n_ctx = \(n_ctx), n_batch = \(context_params.n_batch), n_parallel = \(n_parallel), n_kv_req = \(n_kv_req)\n")
-
-if n_kv_req > n_ctx {
-    print("error: n_kv_req (%d) > n_ctx, the required KV cache size is not big enough\n", n_kv_req)
-    exit(1)
-}
-
-var buffer: [CChar] = []
-for id: llama_token in tokens {
-    print(token_to_piece(token: id, buffer: &buffer) ?? "", terminator: "")
-}
-
-print("\n")
-
-var batch = llama_batch_init(max(Int32(tokens.count), Int32(n_parallel)), 0, 1)
-defer {
-    llama_batch_free(batch)
-}
-
-// evaluate the initial prompt
-batch.n_tokens = Int32(tokens.count)
-
-for (i, token) in tokens.enumerated() {
-    batch.token[i] = token
-    batch.pos[i] = Int32(i)
-    batch.n_seq_id[i] = 1
-    // batch.seq_id[i][0] = 0
-    // TODO: is this the proper way to do this?
-    if let seq_id = batch.seq_id[i] {
-        seq_id[0] = 0
-    }
-    batch.logits[i] = 0
-}
-
-// llama_decode will output logits only for the last token of the prompt
-batch.logits[Int(batch.n_tokens) - 1] = 1
-
-if llama_decode(context, batch) != 0 {
-    print("llama_decode() failed")
-    exit(1)
-}
-
-for i in 1 ..< n_parallel {
-    llama_kv_cache_seq_cp(context, 0, Int32(i), 0, batch.n_tokens)
-}
-
-if n_parallel > 1 {
-    print("generating \(n_parallel) sequences ...\n")
-}
-
-var streams: [String] = .init(repeating: "", count: n_parallel)
-var streamBuffers: [[CChar]] = .init(repeating: [], count: n_parallel)
-var i_batch = [Int32](repeating: batch.n_tokens - 1, count: n_parallel)
-
-var n_cur = batch.n_tokens
-var n_decode = 0
-
-let t_main_start = ggml_time_us()
-
-while n_cur <= n_len {
-    // prepare the next batch
-    batch.n_tokens = 0
-
-    // sample the next token for each parallel sequence / stream
-    for i in 0 ..< n_parallel {
-        if i_batch[i] < 0 {
-            // the stream has already finished
-            continue
-        }
-
-        var n_vocab = llama_n_vocab(model)
-        var logits = llama_get_logits_ith(context, i_batch[i])
-
-        var candidates: [llama_token_data] = .init(repeating: llama_token_data(), count: Int(n_vocab))
-
-        for token_id in 0 ..< n_vocab {
-            candidates.append(llama_token_data(id: token_id, logit: logits![Int(token_id)], p: 0.0))
-        }
-
-        var candidates_p: llama_token_data_array = .init(
-            data: &candidates,
-            size: candidates.count,
-            sorted: false
-        )
-
-        let top_k: Int32 = 40
-        let top_p: Float = 0.9
-        let temp: Float = 0.4
-
-        llama_sample_top_k(context, &candidates_p, top_k, 1)
-        llama_sample_top_p(context, &candidates_p, top_p, 1)
-        llama_sample_temp(context, &candidates_p, temp)
-
-        let new_token_id = llama_sample_token(context, &candidates_p)
-
-        // const llama_token new_token_id = llama_sample_token_greedy(ctx, &candidates_p);
-
-        // is it an end of stream? -> mark the stream as finished
-        if llama_token_is_eog(model, new_token_id) || n_cur == n_len {
-            i_batch[i] = -1
-            // print("")
-            if n_parallel > 1 {
-                print("stream \(i) finished at n_cur = \(n_cur)")
-            }
-
-            continue
-        }
-
-        let nextStringPiece = token_to_piece(token: new_token_id, buffer: &streamBuffers[i]) ?? ""
"" - - // if there is only one stream, we print immediately to stdout - if n_parallel == 1 { - print(nextStringPiece, terminator: "") - } - streams[i] += nextStringPiece - - // push this new token for next evaluation - batch.token[Int(batch.n_tokens)] = new_token_id - batch.pos[Int(batch.n_tokens)] = n_cur - batch.n_seq_id[Int(batch.n_tokens)] = 1 - if let seq_id = batch.seq_id[Int(batch.n_tokens)] { - seq_id[0] = Int32(i) - } - batch.logits[Int(batch.n_tokens)] = 1 - - i_batch[i] = batch.n_tokens - - batch.n_tokens += 1 - - n_decode += 1 - } - - // all streams are finished - if batch.n_tokens == 0 { - break - } - - n_cur += 1 - - // evaluate the current batch with the transformer model - if llama_decode(context, batch) != 0 { - print("llama_decode() failed") - exit(1) - } -} - -if n_parallel > 1 { - print("\n") - for (i, stream) in streams.enumerated() { - print("sequence \(i):\n\n\(prompt)\(stream)\n") - } -} - -let t_main_end = ggml_time_us() - -print("decoded \(n_decode) tokens in \(String(format: "%.2f", Double(t_main_end - t_main_start) / 1_000_000.0)) s, speed: \(String(format: "%.2f", Double(n_decode) / (Double(t_main_end - t_main_start) / 1_000_000.0))) t/s\n") - -llama_print_timings(context) - -private func tokenize(text: String, add_bos: Bool) -> [llama_token] { - let utf8Count = text.utf8.count - let n_tokens = utf8Count + (add_bos ? 1 : 0) - let tokens = UnsafeMutablePointer.allocate(capacity: n_tokens) - let tokenCount = llama_tokenize(model, text, Int32(utf8Count), tokens, Int32(n_tokens), add_bos, /*special tokens*/ false) - var swiftTokens: [llama_token] = [] - for i in 0 ..< tokenCount { - swiftTokens.append(tokens[Int(i)]) - } - tokens.deallocate() - return swiftTokens -} - -private func token_to_piece(token: llama_token, buffer: inout [CChar]) -> String? 
-    var result = [CChar](repeating: 0, count: 8)
-    let nTokens = llama_token_to_piece(model, token, &result, Int32(result.count), 0, false)
-    if nTokens < 0 {
-        let actualTokensCount = -Int(nTokens)
-        result = .init(repeating: 0, count: actualTokensCount)
-        let check = llama_token_to_piece(
-            model,
-            token,
-            &result,
-            Int32(result.count),
-            0,
-            false
-        )
-        assert(check == actualTokensCount)
-    } else {
-        result.removeLast(result.count - Int(nTokens))
-    }
-    if buffer.isEmpty, let utfString = String(cString: result + [0], encoding: .utf8) {
-        return utfString
-    } else {
-        buffer.append(contentsOf: result)
-        let data = Data(buffer.map { UInt8(bitPattern: $0) })
-        if buffer.count >= 4 { // 4 bytes is the max length of a utf8 character so if we're here we need to reset the buffer
-            buffer = []
-        }
-        guard let bufferString = String(data: data, encoding: .utf8) else {
-            return nil
-        }
-        buffer = []
-        return bufferString
-    }
-}
diff --git a/examples/benchmark/CMakeLists.txt b/examples/benchmark/CMakeLists.txt
deleted file mode 100644
index 34a58cc02..000000000
--- a/examples/benchmark/CMakeLists.txt
+++ /dev/null
@@ -1,6 +0,0 @@
-set(TARGET llama-bench-matmult)
-add_executable(${TARGET} benchmark-matmult.cpp)
-install(TARGETS ${TARGET} RUNTIME)
-target_link_libraries(${TARGET} PRIVATE llama build_info ${CMAKE_THREAD_LIBS_INIT})
-target_include_directories(${TARGET} PRIVATE ../../common)
-target_compile_features(${TARGET} PRIVATE cxx_std_11)
diff --git a/examples/benchmark/benchmark-matmult.cpp b/examples/benchmark/benchmark-matmult.cpp
deleted file mode 100644
index 47cb16c69..000000000
--- a/examples/benchmark/benchmark-matmult.cpp
+++ /dev/null
@@ -1,275 +0,0 @@
-#include "common.h"
-#include "ggml.h"
-
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-
-#if defined(_MSC_VER)
-#pragma warning(disable: 4244 4267) // possible loss of data
-#endif
-
-static void ggml_graph_compute_helper(std::vector<uint8_t> & buf, ggml_cgraph * graph, int n_threads) {
-    struct ggml_cplan plan = ggml_graph_plan(graph, n_threads);
-
-    if (plan.work_size > 0) {
-        buf.resize(plan.work_size);
-        plan.work_data = buf.data();
-    }
-
-    ggml_graph_compute(graph, &plan);
-}
-
-static float tensor_sum_elements(const ggml_tensor * tensor) {
-    double sum = 0;
-    if (tensor->type == GGML_TYPE_F32) {
-        for (int j = 0; j < tensor->ne[1]; j++) {
-            for (int k = 0; k < tensor->ne[0]; k++) {
-                sum += ((float *) tensor->data)[j*tensor->ne[0] + k];
-            }
-        }
-    }
-    return sum;
-}
-
-static void tensor_dump(const ggml_tensor * tensor, const char * name) {
-    printf("%15s: type = %i (%5s) ne = %5" PRIi64 " x %5" PRIi64 " x %5" PRIi64 ", nb = (%5zi, %5zi, %5zi) - ", name,
-        tensor->type, ggml_type_name(tensor->type),
-        tensor->ne[0], tensor->ne[1], tensor->ne[2], tensor->nb[0], tensor->nb[1], tensor->nb[2]);
-    float sum = tensor_sum_elements(tensor);
-    printf("Sum of tensor %s is %6.2f\n", name, sum);
-}
-
-#define TENSOR_DUMP(tensor) tensor_dump(tensor, #tensor)
-
-struct benchmark_params_struct {
-    int32_t n_threads    = 1;
-    int32_t n_iterations = 10;
-};
-
-static void print_usage(int /*argc*/, char ** argv, struct benchmark_params_struct params) {
-    fprintf(stderr, "usage: %s [options]\n", argv[0]);
-    fprintf(stderr, "\n");
-    fprintf(stderr, "options:\n");
-    fprintf(stderr, "  -h, --help            show this help message and exit\n");
-    fprintf(stderr, "  -t N, --threads N     number of threads to use during computation (default: %d)\n", params.n_threads);
-    fprintf(stderr, "  -i N, --iter N        number of iterations to use during computation (default: %d)\n", params.n_iterations);
-    fprintf(stderr, "\n");
-}
-
-int main(int argc, char ** argv) {
-    struct benchmark_params_struct benchmark_params;
-
-    bool invalid_param = false;
-    std::string arg;
-    for (int i = 1; i < argc; i++) {
-        arg = argv[i];
-
-        if (arg == "-t" || arg == "--threads") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            benchmark_params.n_threads = std::stoi(argv[i]);
-        } else if (arg == "-i" || arg == "--iter") {
-            if (++i >= argc) {
-                invalid_param = true;
-                break;
-            }
-            benchmark_params.n_iterations = std::stoi(argv[i]);
-        } else if (arg == "-h" || arg == "--help") {
-            print_usage(argc, argv, benchmark_params);
-            exit(0);
-        }
-    }
-    if (invalid_param) {
-        fprintf(stderr, "error: invalid parameter for argument: %s\n", arg.c_str());
-        print_usage(argc, argv, benchmark_params);
-        exit(1);
-    }
-
-    print_build_info();
-    printf("Starting Test\n");
-
-    // create the ggml context
-    struct ggml_context * ctx;
-    //const int sizex = 4096;
-    //const int sizey = 11008;
-
-#undef VERBOSE_DEBUGGING
-#ifndef VERBOSE_DEBUGGING
-    const int sizey = 4096;
-    const int sizex = 11008;
-    const int sizez = 128;
-#else
-    /* Working - let's increase size */
-    const int sizey = 1;
-    const int sizex = (8*32);
-    const int sizez = 1;
-
-    /*const int sizey = 1;
-    const int sizex = 3*(8*32);
-    const int sizez = 1;*/
-#endif
-
-    //printf("Memsize required = %i\n", sizex*sizex);
-
-    // TODO: perform the bench for all types or for a user specified type
-    const ggml_type qtype = GGML_TYPE_Q4_1;
-
-    size_t ctx_size = 0;
-    ctx_size += ggml_row_size(GGML_TYPE_F32, sizex*sizey);
-    ctx_size += ggml_row_size(GGML_TYPE_F32, sizex*sizey);
-    ctx_size += ggml_row_size(GGML_TYPE_F32, sizex*sizez);
-    ctx_size += ggml_row_size(qtype,         sizex*sizey);
-    ctx_size += ggml_row_size(qtype,         sizex*sizey);
-    ctx_size += ggml_row_size(GGML_TYPE_F32, sizex*sizey); // BLAS
-    ctx_size += ggml_row_size(GGML_TYPE_F32, sizex*sizey); // BLAS
-    ctx_size += 1024*1024*16;
-
-    printf("Allocating Memory of size %zi bytes, %zi MB\n",ctx_size, (ctx_size/1024/1024));
-
-    struct ggml_init_params params = {
-        /*.mem_size   =*/ ctx_size,
-        /*.mem_buffer =*/ NULL,
-        /* no_alloc   =*/ 0
-    };
-
-    ctx = ggml_init(params);
-    if (!ctx) {
-        fprintf(stderr, "%s: ggml_init() failed\n", __func__);
-        return 1;
-    }
-
-
-    printf("Creating new tensors\n");
-    // printf("Creating new tensor m1\n");
-    struct ggml_tensor * m11 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, sizex, sizey);
-    ggml_set_f32(m11, 1.0f);
-
-    // printf("Creating new tensor m1\n");
-    struct ggml_tensor * m12 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, sizex, sizey);
-    ggml_set_f32(m12, 1.5f);
-
-    // printf("Creating new tensor m2\n");
-    struct ggml_tensor * m2 = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, sizex, sizez);
-    ggml_set_f32(m2, 2.0f);
-
-    printf("\n------ Test 1 - Matrix Mult via F32 code\n");
-    // printf("Creating new tensor m11xm2\n");
-    struct ggml_tensor * m11xm2 = ggml_mul_mat(ctx, m11, m2);
-
-    // printf("Creating compute graph\n");
-    struct ggml_cgraph * gf = ggml_new_graph(ctx);
-    ggml_build_forward_expand(gf, m11xm2);
-
-    printf("n_threads=%i\n", benchmark_params.n_threads);
-
-    TENSOR_DUMP(m11);
-    TENSOR_DUMP(m2);
-
-    std::vector<uint8_t> work_buffer;
-
-    ggml_graph_compute_helper(work_buffer, gf, benchmark_params.n_threads);
-
-    TENSOR_DUMP(gf->nodes[0]);
-
-    printf("\n------ Test 2 - Matrix Mult via %s code\n", ggml_type_name(qtype));
-
-    int32_t nelements = sizex*sizey;
-
-    // Set up a the benchmark matrices
-    // printf("Creating new tensor q11 & Running quantize\n");
-    struct ggml_tensor * q11 = ggml_new_tensor_2d(ctx, qtype, sizex, sizey);
-    ggml_quantize_chunk(qtype, (const float *) m11->data, q11->data, 0, nelements/m11->ne[0], m11->ne[0], nullptr);
-
-    // Set up a the compute graph
-    // printf("Creating new tensor q31\n");
-    struct ggml_tensor * q31 = ggml_mul_mat(ctx, q11, m2);
-
-    // printf("Creating compute graph\n");
-    struct ggml_cgraph * gf31 = ggml_new_graph(ctx);
-    ggml_build_forward_expand(gf31, q31);
-
-    // Set up a second graph computation to make sure we override the CPU cache lines
-    // printf("Creating new tensor q12 & Running quantize\n");
-    struct ggml_tensor * q12 = ggml_new_tensor_2d(ctx, qtype, sizex, sizey);
-    ggml_quantize_chunk(qtype, (const float *) m12->data, q12->data, 0, nelements/m12->ne[0], m12->ne[0], nullptr);
-
-    // printf("Creating new tensor q32\n");
-    struct ggml_tensor * q32 = ggml_mul_mat(ctx, q12, m2);
-
-    //printf("Creating compute graph\n");
-    struct ggml_cgraph * gf32 = ggml_new_graph(ctx);
-    ggml_build_forward_expand(gf32, q32);
-    printf("n_threads=%i\n", benchmark_params.n_threads);
-
-    const int dimx = sizex;
-    const int dimy = sizey;
-    const int dimz = sizez;
-    long long int flops_per_dot_product = dimy + dimy;
-    long long int flops_per_matrix = flops_per_dot_product * dimx * dimz; ;
-    printf("Matrix Multiplication of (%i,%i,%i) x (%i,%i,%i) - about %6.2f gFLOPS\n\n", sizex, sizey, 1, sizex, sizez, 1, 1.0f*flops_per_matrix / 1000 / 1000 / 1000);
-
-
-    // Let's use the F32 result from above as a reference for the quantized multiplication
-    float sum_of_F32_reference = tensor_sum_elements(gf->nodes[0]);
-
-    printf("Iteration;NThreads; SizeX; SizeY; SizeZ; Required_FLOPS; Elapsed_u_Seconds; gigaFLOPS\n");
-    printf("=====================================================================================\n");
-
-    double gflops_sum = 0;
-    for (int i=0;inodes[0]);
-        float delta = std::abs(sum_of_Q4_result - sum_of_F32_reference);
-        float allowed_delta = (sum_of_F32_reference) / 1000 / 1000; // Let's accept an epsilon of 10^-6
-
-        if (delta > allowed_delta) {
-            printf("\nABORT - ERROR in Matrix Multiplication result - expected %6.2f, got %6.2f (delta %6.2f > allowed_delta %6.2f)\n",
-                sum_of_F32_reference,
-                sum_of_Q4_result,
-                delta,
-                allowed_delta
-            );
-            exit(0);
-        }
-
-        // Running a different graph computation to make sure we override the CPU cache lines
-        ggml_graph_compute_helper(work_buffer, gf32, benchmark_params.n_threads);
-    }
-    printf("\n");
-    printf("Average%78.2f\n",gflops_sum/((double)benchmark_params.n_iterations));
-    printf("=====================================================================================\n");
-}
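The deleted `batched.swift` example above demonstrated multi-sequence decoding with per-token top-k/top-p/temperature sampling. As a reference for the C API that the Swift code binds to, here is a minimal sketch of just its sampling step; the helper name `sample_next_token` is hypothetical, while the calls and hyperparameters (top_k = 40, top_p = 0.9, temp = 0.4) mirror the ones the removed Swift source makes, so treat this as an illustration of that flow under those assumptions rather than a drop-in replacement for the example.

```cpp
// Per-token sampling step as performed by the removed batched.swift example,
// expressed against the llama.cpp C API it calls. Assumes a loaded model and
// context, and that i_batch indexes a batch row for which logits were produced.
#include <vector>
#include "llama.h"

static llama_token sample_next_token(llama_context * ctx, const llama_model * model, int32_t i_batch) {
    const int32_t n_vocab = llama_n_vocab(model);
    const float * logits  = llama_get_logits_ith(ctx, i_batch);

    // one candidate per vocabulary entry, carrying its raw logit
    std::vector<llama_token_data> candidates;
    candidates.reserve(n_vocab);
    for (llama_token token_id = 0; token_id < n_vocab; token_id++) {
        candidates.push_back(llama_token_data{ token_id, logits[token_id], 0.0f });
    }

    llama_token_data_array candidates_p = { candidates.data(), candidates.size(), false };

    // same sampling chain and settings as the removed Swift example
    llama_sample_top_k(ctx, &candidates_p, 40, 1);
    llama_sample_top_p(ctx, &candidates_p, 0.9f, 1);
    llama_sample_temp (ctx, &candidates_p, 0.4f);

    return llama_sample_token(ctx, &candidates_p);
}
```

In the removed example the sampled token is then checked with llama_token_is_eog and, if the stream is still active, pushed into the next llama_batch together with its sequence id, so that all parallel streams are advanced by a single llama_decode call.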