From 7e5395575a3360598f2565c73c8a2ec0c0abbdb8 Mon Sep 17 00:00:00 2001 From: Marco Matthies <71844+marcom@users.noreply.github.com> Date: Mon, 27 Mar 2023 06:55:26 +0200 Subject: [PATCH 01/12] Fix missing ggml link in cmake for examples/* on w64-mingw32 (#542) --- examples/embedding/CMakeLists.txt | 2 +- examples/main/CMakeLists.txt | 2 +- examples/perplexity/CMakeLists.txt | 2 +- examples/quantize/CMakeLists.txt | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/examples/embedding/CMakeLists.txt b/examples/embedding/CMakeLists.txt index 88c425d4a..def5b831b 100644 --- a/examples/embedding/CMakeLists.txt +++ b/examples/embedding/CMakeLists.txt @@ -1,4 +1,4 @@ set(TARGET embedding) add_executable(${TARGET} embedding.cpp) -target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) +target_link_libraries(${TARGET} PRIVATE common llama ggml ${CMAKE_THREAD_LIBS_INIT}) target_compile_features(${TARGET} PRIVATE cxx_std_11) diff --git a/examples/main/CMakeLists.txt b/examples/main/CMakeLists.txt index b2dcc2910..aa1f79406 100644 --- a/examples/main/CMakeLists.txt +++ b/examples/main/CMakeLists.txt @@ -1,4 +1,4 @@ set(TARGET main) add_executable(${TARGET} main.cpp) -target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) +target_link_libraries(${TARGET} PRIVATE common llama ggml ${CMAKE_THREAD_LIBS_INIT}) target_compile_features(${TARGET} PRIVATE cxx_std_11) diff --git a/examples/perplexity/CMakeLists.txt b/examples/perplexity/CMakeLists.txt index 5836df8b2..9bd8e376f 100644 --- a/examples/perplexity/CMakeLists.txt +++ b/examples/perplexity/CMakeLists.txt @@ -1,4 +1,4 @@ set(TARGET perplexity) add_executable(${TARGET} perplexity.cpp) -target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT}) +target_link_libraries(${TARGET} PRIVATE common llama ggml ${CMAKE_THREAD_LIBS_INIT}) target_compile_features(${TARGET} PRIVATE cxx_std_11) diff --git a/examples/quantize/CMakeLists.txt b/examples/quantize/CMakeLists.txt index fb27d4517..17a995bbd 100644 --- a/examples/quantize/CMakeLists.txt +++ b/examples/quantize/CMakeLists.txt @@ -1,4 +1,4 @@ set(TARGET quantize) add_executable(${TARGET} quantize.cpp) -target_link_libraries(${TARGET} PRIVATE llama ${CMAKE_THREAD_LIBS_INIT}) +target_link_libraries(${TARGET} PRIVATE llama ggml ${CMAKE_THREAD_LIBS_INIT}) target_compile_features(${TARGET} PRIVATE cxx_std_11) From 4b8efff0e3945090379aa2f897ff125c8f9cdbae Mon Sep 17 00:00:00 2001 From: RJ Adriaansen Date: Tue, 28 Mar 2023 08:11:09 +0200 Subject: [PATCH 02/12] Add embedding example to Makefile (#540) --- Makefile | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/Makefile b/Makefile index 98a2d85f3..973b2951b 100644 --- a/Makefile +++ b/Makefile @@ -212,7 +212,7 @@ $(info I CC: $(CCV)) $(info I CXX: $(CXXV)) $(info ) -default: main quantize perplexity +default: main quantize perplexity embedding # # Build library @@ -228,7 +228,7 @@ common.o: examples/common.cpp examples/common.h $(CXX) $(CXXFLAGS) -c examples/common.cpp -o common.o clean: - rm -vf *.o main quantize perplexity + rm -vf *.o main quantize perplexity embedding main: examples/main/main.cpp ggml.o llama.o common.o $(CXX) $(CXXFLAGS) examples/main/main.cpp ggml.o llama.o common.o -o main $(LDFLAGS) @@ -242,6 +242,9 @@ quantize: examples/quantize/quantize.cpp ggml.o llama.o perplexity: examples/perplexity/perplexity.cpp ggml.o llama.o common.o $(CXX) $(CXXFLAGS) examples/perplexity/perplexity.cpp ggml.o llama.o common.o -o perplexity $(LDFLAGS) +embedding: examples/embedding/embedding.cpp ggml.o llama.o common.o + $(CXX) $(CXXFLAGS) examples/embedding/embedding.cpp ggml.o llama.o common.o -o embedding $(LDFLAGS) + # # Tests # From 7b8dbcb78b2f65c4676e41da215800d65846edd0 Mon Sep 17 00:00:00 2001 From: anzz1 Date: Tue, 28 Mar 2023 17:09:55 +0300 Subject: [PATCH 03/12] main.cpp fixes, refactoring (#571) - main: entering empty line passes back control without new input in interactive/instruct modes - instruct mode: keep prompt fix - instruct mode: duplicate instruct prompt fix - refactor: move common console code from main->common --- examples/common.cpp | 67 +++++++++++++++-- examples/common.h | 30 ++++++++ examples/main/main.cpp | 166 ++++++++++++++--------------------------- 3 files changed, 144 insertions(+), 119 deletions(-) diff --git a/examples/common.cpp b/examples/common.cpp index 2ab000f4f..880ebe9a2 100644 --- a/examples/common.cpp +++ b/examples/common.cpp @@ -9,11 +9,20 @@ #include #include - #if defined(_MSC_VER) || defined(__MINGW32__) - #include // using malloc.h with MSC/MINGW - #elif !defined(__FreeBSD__) && !defined(__NetBSD__) && !defined(__OpenBSD__) - #include - #endif +#if defined(_MSC_VER) || defined(__MINGW32__) +#include // using malloc.h with MSC/MINGW +#elif !defined(__FreeBSD__) && !defined(__NetBSD__) && !defined(__OpenBSD__) +#include +#endif + +#if defined (_WIN32) +#pragma comment(lib,"kernel32.lib") +extern "C" __declspec(dllimport) void* __stdcall GetStdHandle(unsigned long nStdHandle); +extern "C" __declspec(dllimport) int __stdcall GetConsoleMode(void* hConsoleHandle, unsigned long* lpMode); +extern "C" __declspec(dllimport) int __stdcall SetConsoleMode(void* hConsoleHandle, unsigned long dwMode); +extern "C" __declspec(dllimport) int __stdcall SetConsoleCP(unsigned int wCodePageID); +extern "C" __declspec(dllimport) int __stdcall SetConsoleOutputCP(unsigned int wCodePageID); +#endif bool gpt_params_parse(int argc, char ** argv, gpt_params & params) { // determine sensible default number of threads. @@ -204,7 +213,7 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) { fprintf(stderr, " --in-prefix STRING string to prefix user inputs with (default: empty)\n"); fprintf(stderr, " -f FNAME, --file FNAME\n"); fprintf(stderr, " prompt file to start generation.\n"); - fprintf(stderr, " -n N, --n_predict N number of tokens to predict (default: %d, -1 - infinity)\n", params.n_predict); + fprintf(stderr, " -n N, --n_predict N number of tokens to predict (default: %d, -1 = infinity)\n", params.n_predict); fprintf(stderr, " --top_k N top-k sampling (default: %d)\n", params.top_k); fprintf(stderr, " --top_p N top-p sampling (default: %.1f)\n", params.top_p); fprintf(stderr, " --repeat_last_n N last n tokens to consider for penalize (default: %d)\n", params.repeat_last_n); @@ -216,7 +225,7 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) { fprintf(stderr, " --n_parts N number of model parts (default: -1 = determine from dimensions)\n"); fprintf(stderr, " -b N, --batch_size N batch size for prompt processing (default: %d)\n", params.n_batch); fprintf(stderr, " --perplexity compute perplexity over the prompt\n"); - fprintf(stderr, " --keep number of tokens to keep from the initial prompt\n"); + fprintf(stderr, " --keep number of tokens to keep from the initial prompt (default: %d, -1 = all)\n", params.n_keep); if (ggml_mlock_supported()) { fprintf(stderr, " --mlock force system to keep model in RAM rather than swapping or compressing\n"); } @@ -256,3 +265,47 @@ std::vector llama_tokenize(struct llama_context * ctx, const std::s return res; } + +/* Keep track of current color of output, and emit ANSI code if it changes. */ +void set_console_color(console_state & con_st, console_color_t color) { + if (con_st.use_color && con_st.color != color) { + switch(color) { + case CONSOLE_COLOR_DEFAULT: + printf(ANSI_COLOR_RESET); + break; + case CONSOLE_COLOR_PROMPT: + printf(ANSI_COLOR_YELLOW); + break; + case CONSOLE_COLOR_USER_INPUT: + printf(ANSI_BOLD ANSI_COLOR_GREEN); + break; + } + con_st.color = color; + } +} + +#if defined (_WIN32) +void win32_console_init(bool enable_color) { + unsigned long dwMode = 0; + void* hConOut = GetStdHandle((unsigned long)-11); // STD_OUTPUT_HANDLE (-11) + if (!hConOut || hConOut == (void*)-1 || !GetConsoleMode(hConOut, &dwMode)) { + hConOut = GetStdHandle((unsigned long)-12); // STD_ERROR_HANDLE (-12) + if (hConOut && (hConOut == (void*)-1 || !GetConsoleMode(hConOut, &dwMode))) { + hConOut = 0; + } + } + if (hConOut) { + // Enable ANSI colors on Windows 10+ + if (enable_color && !(dwMode & 0x4)) { + SetConsoleMode(hConOut, dwMode | 0x4); // ENABLE_VIRTUAL_TERMINAL_PROCESSING (0x4) + } + // Set console output codepage to UTF8 + SetConsoleOutputCP(65001); // CP_UTF8 + } + void* hConIn = GetStdHandle((unsigned long)-10); // STD_INPUT_HANDLE (-10) + if (hConIn && hConIn != (void*)-1 && GetConsoleMode(hConIn, &dwMode)) { + // Set console input codepage to UTF8 + SetConsoleCP(65001); // CP_UTF8 + } +} +#endif diff --git a/examples/common.h b/examples/common.h index 8caefd859..1505aa927 100644 --- a/examples/common.h +++ b/examples/common.h @@ -63,3 +63,33 @@ std::string gpt_random_prompt(std::mt19937 & rng); // std::vector llama_tokenize(struct llama_context * ctx, const std::string & text, bool add_bos); + +// +// Console utils +// + +#define ANSI_COLOR_RED "\x1b[31m" +#define ANSI_COLOR_GREEN "\x1b[32m" +#define ANSI_COLOR_YELLOW "\x1b[33m" +#define ANSI_COLOR_BLUE "\x1b[34m" +#define ANSI_COLOR_MAGENTA "\x1b[35m" +#define ANSI_COLOR_CYAN "\x1b[36m" +#define ANSI_COLOR_RESET "\x1b[0m" +#define ANSI_BOLD "\x1b[1m" + +enum console_color_t { + CONSOLE_COLOR_DEFAULT=0, + CONSOLE_COLOR_PROMPT, + CONSOLE_COLOR_USER_INPUT +}; + +struct console_state { + bool use_color = false; + console_color_t color = CONSOLE_COLOR_DEFAULT; +}; + +void set_console_color(console_state & con_st, console_color_t color); + +#if defined (_WIN32) +void win32_console_init(bool enable_color); +#endif diff --git a/examples/main/main.cpp b/examples/main/main.cpp index 66b7c2d5d..d5ab2cf75 100644 --- a/examples/main/main.cpp +++ b/examples/main/main.cpp @@ -18,58 +18,13 @@ #include #endif -#if defined (_WIN32) -#pragma comment(lib,"kernel32.lib") -extern "C" __declspec(dllimport) void* __stdcall GetStdHandle(unsigned long nStdHandle); -extern "C" __declspec(dllimport) int __stdcall GetConsoleMode(void* hConsoleHandle, unsigned long* lpMode); -extern "C" __declspec(dllimport) int __stdcall SetConsoleMode(void* hConsoleHandle, unsigned long dwMode); -extern "C" __declspec(dllimport) int __stdcall SetConsoleCP(unsigned int wCodePageID); -extern "C" __declspec(dllimport) int __stdcall SetConsoleOutputCP(unsigned int wCodePageID); -#endif - -#define ANSI_COLOR_RED "\x1b[31m" -#define ANSI_COLOR_GREEN "\x1b[32m" -#define ANSI_COLOR_YELLOW "\x1b[33m" -#define ANSI_COLOR_BLUE "\x1b[34m" -#define ANSI_COLOR_MAGENTA "\x1b[35m" -#define ANSI_COLOR_CYAN "\x1b[36m" -#define ANSI_COLOR_RESET "\x1b[0m" -#define ANSI_BOLD "\x1b[1m" - -/* Keep track of current color of output, and emit ANSI code if it changes. */ -enum console_state { - CONSOLE_STATE_DEFAULT=0, - CONSOLE_STATE_PROMPT, - CONSOLE_STATE_USER_INPUT -}; - -static console_state con_st = CONSOLE_STATE_DEFAULT; -static bool con_use_color = false; - -void set_console_state(console_state new_st) { - if (!con_use_color) return; - // only emit color code if state changed - if (new_st != con_st) { - con_st = new_st; - switch(con_st) { - case CONSOLE_STATE_DEFAULT: - printf(ANSI_COLOR_RESET); - return; - case CONSOLE_STATE_PROMPT: - printf(ANSI_COLOR_YELLOW); - return; - case CONSOLE_STATE_USER_INPUT: - printf(ANSI_BOLD ANSI_COLOR_GREEN); - return; - } - } -} +static console_state con_st; static bool is_interacting = false; #if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) || defined (_WIN32) void sigint_handler(int signo) { - set_console_state(CONSOLE_STATE_DEFAULT); + set_console_color(con_st, CONSOLE_COLOR_DEFAULT); printf("\n"); // this also force flush stdout. if (signo == SIGINT) { if (!is_interacting) { @@ -81,32 +36,6 @@ void sigint_handler(int signo) { } #endif -#if defined (_WIN32) -void win32_console_init(void) { - unsigned long dwMode = 0; - void* hConOut = GetStdHandle((unsigned long)-11); // STD_OUTPUT_HANDLE (-11) - if (!hConOut || hConOut == (void*)-1 || !GetConsoleMode(hConOut, &dwMode)) { - hConOut = GetStdHandle((unsigned long)-12); // STD_ERROR_HANDLE (-12) - if (hConOut && (hConOut == (void*)-1 || !GetConsoleMode(hConOut, &dwMode))) { - hConOut = 0; - } - } - if (hConOut) { - // Enable ANSI colors on Windows 10+ - if (con_use_color && !(dwMode & 0x4)) { - SetConsoleMode(hConOut, dwMode | 0x4); // ENABLE_VIRTUAL_TERMINAL_PROCESSING (0x4) - } - // Set console output codepage to UTF8 - SetConsoleOutputCP(65001); // CP_UTF8 - } - void* hConIn = GetStdHandle((unsigned long)-10); // STD_INPUT_HANDLE (-10) - if (hConIn && hConIn != (void*)-1 && GetConsoleMode(hConIn, &dwMode)) { - // Set console input codepage to UTF8 - SetConsoleCP(65001); // CP_UTF8 - } -} -#endif - int main(int argc, char ** argv) { gpt_params params; params.model = "models/llama-7B/ggml-model.bin"; @@ -115,13 +44,12 @@ int main(int argc, char ** argv) { return 1; } - // save choice to use color for later // (note for later: this is a slightly awkward choice) - con_use_color = params.use_color; + con_st.use_color = params.use_color; #if defined (_WIN32) - win32_console_init(); + win32_console_init(params.use_color); #endif if (params.perplexity) { @@ -218,7 +146,10 @@ int main(int argc, char ** argv) { return 1; } - params.n_keep = std::min(params.n_keep, (int) embd_inp.size()); + // number of tokens to keep when resetting context + if (params.n_keep < 0 || params.n_keep > (int)embd_inp.size() || params.instruct) { + params.n_keep = (int)embd_inp.size(); + } // prefix & suffix for instruct mode const auto inp_pfx = ::llama_tokenize(ctx, "\n\n### Instruction:\n\n", true); @@ -226,16 +157,12 @@ int main(int argc, char ** argv) { // in instruct mode, we inject a prefix and a suffix to each input by the user if (params.instruct) { - params.interactive = true; + params.interactive_start = true; params.antiprompt.push_back("### Instruction:\n\n"); } - // enable interactive mode if reverse prompt is specified - if (params.antiprompt.size() != 0) { - params.interactive = true; - } - - if (params.interactive_start) { + // enable interactive mode if reverse prompt or interactive start is specified + if (params.antiprompt.size() != 0 || params.interactive_start) { params.interactive = true; } @@ -297,17 +224,18 @@ int main(int argc, char ** argv) { #endif " - Press Return to return control to LLaMa.\n" " - If you want to submit another line, end your input in '\\'.\n\n"); - is_interacting = params.interactive_start || params.instruct; + is_interacting = params.interactive_start; } - bool input_noecho = false; + bool is_antiprompt = false; + bool input_noecho = false; int n_past = 0; int n_remain = params.n_predict; int n_consumed = 0; // the first thing we will do is to output the prompt, so set color accordingly - set_console_state(CONSOLE_STATE_PROMPT); + set_console_color(con_st, CONSOLE_COLOR_PROMPT); std::vector embd; @@ -408,36 +336,38 @@ int main(int argc, char ** argv) { } // reset color to default if we there is no pending user input if (!input_noecho && (int)embd_inp.size() == n_consumed) { - set_console_state(CONSOLE_STATE_DEFAULT); + set_console_color(con_st, CONSOLE_COLOR_DEFAULT); } // in interactive mode, and not currently processing queued inputs; // check if we should prompt the user for more if (params.interactive && (int) embd_inp.size() <= n_consumed) { - // check for reverse prompt - std::string last_output; - for (auto id : last_n_tokens) { - last_output += llama_token_to_str(ctx, id); - } - // Check if each of the reverse prompts appears at the end of the output. - for (std::string & antiprompt : params.antiprompt) { - if (last_output.find(antiprompt.c_str(), last_output.length() - antiprompt.length(), antiprompt.length()) != std::string::npos) { - is_interacting = true; - set_console_state(CONSOLE_STATE_USER_INPUT); - fflush(stdout); - break; + // check for reverse prompt + if (params.antiprompt.size()) { + std::string last_output; + for (auto id : last_n_tokens) { + last_output += llama_token_to_str(ctx, id); + } + + is_antiprompt = false; + // Check if each of the reverse prompts appears at the end of the output. + for (std::string & antiprompt : params.antiprompt) { + if (last_output.find(antiprompt.c_str(), last_output.length() - antiprompt.length(), antiprompt.length()) != std::string::npos) { + is_interacting = true; + is_antiprompt = true; + set_console_color(con_st, CONSOLE_COLOR_USER_INPUT); + fflush(stdout); + break; + } } } if (n_past > 0 && is_interacting) { // potentially set color to indicate we are taking user input - set_console_state(CONSOLE_STATE_USER_INPUT); + set_console_color(con_st, CONSOLE_COLOR_USER_INPUT); if (params.instruct) { - n_consumed = embd_inp.size(); - embd_inp.insert(embd_inp.end(), inp_pfx.begin(), inp_pfx.end()); - printf("\n> "); } @@ -463,17 +393,29 @@ int main(int argc, char ** argv) { } while (another_line); // done taking input, reset color - set_console_state(CONSOLE_STATE_DEFAULT); + set_console_color(con_st, CONSOLE_COLOR_DEFAULT); - auto line_inp = ::llama_tokenize(ctx, buffer, false); - embd_inp.insert(embd_inp.end(), line_inp.begin(), line_inp.end()); + // Add tokens to embd only if the input buffer is non-empty + // Entering a empty line lets the user pass control back + if (buffer.length() > 1) { - if (params.instruct) { - embd_inp.insert(embd_inp.end(), inp_sfx.begin(), inp_sfx.end()); + // instruct mode: insert instruction prefix + if (params.instruct && !is_antiprompt) { + n_consumed = embd_inp.size(); + embd_inp.insert(embd_inp.end(), inp_pfx.begin(), inp_pfx.end()); + } + + auto line_inp = ::llama_tokenize(ctx, buffer, false); + embd_inp.insert(embd_inp.end(), line_inp.begin(), line_inp.end()); + + // instruct mode: insert response suffix + if (params.instruct) { + embd_inp.insert(embd_inp.end(), inp_sfx.begin(), inp_sfx.end()); + } + + n_remain -= line_inp.size(); } - n_remain -= line_inp.size(); - input_noecho = true; // do not echo this again } @@ -506,7 +448,7 @@ int main(int argc, char ** argv) { llama_print_timings(ctx); llama_free(ctx); - set_console_state(CONSOLE_STATE_DEFAULT); + set_console_color(con_st, CONSOLE_COLOR_DEFAULT); return 0; } From a6bdc47cba23713a22ade47dd65b6afeb8009ff4 Mon Sep 17 00:00:00 2001 From: slaren <2141330+slaren@users.noreply.github.com> Date: Tue, 28 Mar 2023 16:26:55 +0200 Subject: [PATCH 04/12] Fix usage of F16C intrinsics in AVX code (#563) * Fix usage of F16C intrinsics in AVX code when F16C is not defined --- ggml.c | 25 ++++++++++++++++++++++++- 1 file changed, 24 insertions(+), 1 deletion(-) diff --git a/ggml.c b/ggml.c index ec00e6317..ba89b5d84 100644 --- a/ggml.c +++ b/ggml.c @@ -1122,13 +1122,36 @@ void dequantize_row_q4_1(const void * restrict x, float * restrict y, int k) { #define GGML_F16_EPR 8 // F16 arithmetic is not supported by AVX, so we use F32 instead -// we take advantage of the _mm256_cvt intrinsics to convert F16 <-> F32 #define GGML_F32Cx8 __m256 #define GGML_F32Cx8_ZERO _mm256_setzero_ps() #define GGML_F32Cx8_SET1(x) _mm256_set1_ps(x) + +#if defined(__F16C__) +// the _mm256_cvt intrinsics require F16C #define GGML_F32Cx8_LOAD(x) _mm256_cvtph_ps(_mm_loadu_si128((__m128i *)(x))) #define GGML_F32Cx8_STORE(x, y) _mm_storeu_si128((__m128i *)(x), _mm256_cvtps_ph(y, 0)) +#else +static inline __m256 __avx_f32cx8_load(ggml_fp16_t *x) { + float tmp[8]; + + for (int i = 0; i < 8; i++) + tmp[i] = GGML_FP16_TO_FP32(x[i]); + + return _mm256_loadu_ps(tmp); +} +static inline void __avx_f32cx8_store(ggml_fp16_t *x, __m256 y) { + float arr[8]; + + _mm256_storeu_ps(arr, y); + + for (int i = 0; i < 8; i++) + x[i] = GGML_FP16_TO_FP32(arr[i]); +} +#define GGML_F32Cx8_LOAD(x) __avx_f32cx8_load(x) +#define GGML_F32Cx8_STORE(x, y) __avx_f32cx8_store(x, y) +#endif + #define GGML_F32Cx8_FMA GGML_F32x8_FMA #define GGML_F32Cx8_ADD _mm256_add_ps #define GGML_F32Cx8_MUL _mm256_mul_ps From 28ba975aea1dcae2f31770516f5d542ff177771e Mon Sep 17 00:00:00 2001 From: dotpy314 <33351922+dotpy314@users.noreply.github.com> Date: Tue, 28 Mar 2023 23:06:28 +0800 Subject: [PATCH 05/12] Check the existence of f16_model_path_base in quantize.py (#574) Co-authored-by: Jincheng Miao --- quantize.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/quantize.py b/quantize.py index 16b5963d3..641df8dda 100644 --- a/quantize.py +++ b/quantize.py @@ -74,6 +74,10 @@ def main(): args.models_path, model, "ggml-model-f16.bin" ) + if not os.path.isfile(f16_model_path_base): + print(f'The file %s was not found' % f16_model_path_base) + sys.exit(1) + f16_model_parts_paths = map( lambda filename: os.path.join(f16_model_path_base, filename), glob.glob(f"{f16_model_path_base}*") From e0670260fb50a882b37074112b1881fb0820cf77 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Tue, 28 Mar 2023 18:34:35 +0300 Subject: [PATCH 06/12] gitignore : add "embedding" --- .gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitignore b/.gitignore index ce01fd541..053311fee 100644 --- a/.gitignore +++ b/.gitignore @@ -20,6 +20,7 @@ models/* /quantize /result /perplexity +/embedding arm_neon.h compile_commands.json From c1f885067c61191a07a1aedf684168dda62f3f71 Mon Sep 17 00:00:00 2001 From: Stephan Walter Date: Tue, 28 Mar 2023 15:56:03 +0000 Subject: [PATCH 07/12] ggml : introduce structs for the q4 data blocks (#356) * Introduce structs for the q4 data blocks * ggml : rename quant struct variables + fix ARM_NEON --------- Co-authored-by: Georgi Gerganov --- examples/quantize/quantize.cpp | 4 +- ggml.c | 359 +++++++++++++-------------------- ggml.h | 4 +- llama.cpp | 11 +- llama.h | 3 +- tests/test-quantize.c | 4 +- 6 files changed, 150 insertions(+), 235 deletions(-) diff --git a/examples/quantize/quantize.cpp b/examples/quantize/quantize.cpp index f0230f5dc..3888ff587 100644 --- a/examples/quantize/quantize.cpp +++ b/examples/quantize/quantize.cpp @@ -4,8 +4,6 @@ #include #include -const int QK = 32; - // usage: // ./llama-quantize models/llama/ggml-model.bin models/llama/ggml-model-quant.bin type // @@ -39,7 +37,7 @@ int main(int argc, char ** argv) { { const int64_t t_start_us = ggml_time_us(); - if (llama_model_quantize(fname_inp.c_str(), fname_out.c_str(), itype, QK)) { + if (llama_model_quantize(fname_inp.c_str(), fname_out.c_str(), itype)) { fprintf(stderr, "%s: failed to quantize model from '%s'\n", __func__, fname_inp.c_str()); return 1; } diff --git a/ggml.c b/ggml.c index ba89b5d84..bf8ec8ab2 100644 --- a/ggml.c +++ b/ggml.c @@ -448,17 +448,27 @@ static inline __m128i packNibbles( __m256i bytes ) // method 5 // blocks of QK elements // represented with a single float (delta) and QK/2 8-bit ints (i.e QK 4-bit signed integer factors) +typedef struct { + float d; // delta + uint8_t qs[QK / 2]; // nibbles / quants +} block_q4_0; +static_assert(sizeof(block_q4_0) == sizeof(float) + QK / 2, "wrong q4_0 block size/padding"); + +// method 4 +// blocks of QK elements +// represented with 2 floats (delta + min) and QK/2 8-bit ints (i.e QK 4-bit unsigned integer factors) +typedef struct { + float d; + float m; + uint8_t qs[QK / 2]; // nibbles / quants +} block_q4_1; +static_assert(sizeof(block_q4_1) == sizeof(float) * 2 + QK / 2, "wrong q4_1 block size/padding"); // reference implementation for deterministic creation of model files -static void quantize_row_q4_0_reference(const float * restrict x, void * restrict y, int k) { +static void quantize_row_q4_0_reference(const float * restrict x, block_q4_0 * restrict y, int k) { assert(k % QK == 0); const int nb = k / QK; - const size_t bs = sizeof(float) + QK/2; - - uint8_t * restrict pd = ((uint8_t *)y + 0*bs); - uint8_t * restrict pb = ((uint8_t *)y + 0*bs + sizeof(float)); - uint8_t pp[QK/2]; for (int i = 0; i < nb; i++) { @@ -472,8 +482,7 @@ static void quantize_row_q4_0_reference(const float * restrict x, void * restric const float d = amax / ((1 << 3) - 1); const float id = d ? 1.0f/d : 0.0f; - *(float *)pd = d; - pd += bs; + y[i].d = d; for (int l = 0; l < QK; l += 2) { const float v0 = x[i*QK + l + 0]*id; @@ -488,23 +497,15 @@ static void quantize_row_q4_0_reference(const float * restrict x, void * restric pp[l/2] = vi0 | (vi1 << 4); } - memcpy(pb, pp, sizeof(pp)); - pb += bs; + memcpy(y[i].qs, pp, sizeof(pp)); } } -void quantize_row_q4_0(const float * restrict x, void * restrict y, int k) { +static void quantize_row_q4_0(const float * restrict x, void * restrict vy, int k) { assert(k % QK == 0); - -#if defined(__ARM_NEON) || defined(__AVX2__) || defined(__wasm_simd128__) || defined(__POWER9_VECTOR__) const int nb = k / QK; - const size_t bs = sizeof(float) + QK/2; - uint8_t * restrict pd = ((uint8_t *)y + 0*bs); - uint8_t * restrict pb = ((uint8_t *)y + 0*bs + sizeof(float)); - - uint8_t pp[QK/2]; -#endif + block_q4_0 * restrict y = vy; #if defined(__POWER9_VECTOR__) const vector float v85 = vec_splats(8.5f); @@ -532,10 +533,10 @@ void quantize_row_q4_0(const float * restrict x, void * restrict y, int k) { const float d = amax / ((1 << 3) - 1); const float id = d ? 1.0/d : 0.0; - *(float *)pd = d; - pd += bs; + y[i].d = d; const vector float vid = vec_splats(id); + uint8_t * restrict pb = y[i].qs; for (int l = 0; l < 8; l++) { const vector float vf = vec_madd(srcv[l], vid, v85); const vector signed int vi = vec_signed(vf); @@ -543,11 +544,9 @@ void quantize_row_q4_0(const float * restrict x, void * restrict y, int k) { pb[2*l + 0] = vec_extract(vi, 0) | (vec_extract(vi, 1) << 4); pb[2*l + 1] = vec_extract(vi, 2) | (vec_extract(vi, 3) << 4); } - - //memcpy(pb, pp, sizeof(pp)); - pb += bs; } #elif __ARM_NEON + uint8_t pp[QK/2]; for (int i = 0; i < nb; i++) { float amax = 0.0f; // absolute max @@ -569,8 +568,7 @@ void quantize_row_q4_0(const float * restrict x, void * restrict y, int k) { const float d = amax / ((1 << 3) - 1); const float id = d ? 1.0/d : 0.0; - *(float *)pd = d; - pd += bs; + y[i].d = d; for (int l = 0; l < 8; l++) { const float32x4_t v = vmulq_n_f32(srcv[l], id); @@ -581,8 +579,7 @@ void quantize_row_q4_0(const float * restrict x, void * restrict y, int k) { pp[2*l + 1] = vgetq_lane_s32(vi, 2) | (vgetq_lane_s32(vi, 3) << 4); } - memcpy(pb, pp, sizeof(pp)); - pb += bs; + memcpy(y[i].qs, pp, sizeof(pp)); } #elif defined(__AVX2__) for (int i = 0; i < nb; i++) { @@ -607,8 +604,7 @@ void quantize_row_q4_0(const float * restrict x, void * restrict y, int k) { // Quantize these floats const float d = maxScalar / 7.0f; - *(float *)pd = d; - pd += bs; + y[i].d = d; const float id = ( maxScalar != 0.0f ) ? 7.0f / maxScalar : 0.0f; const __m256 mul = _mm256_set1_ps( id ); @@ -648,10 +644,10 @@ void quantize_row_q4_0(const float * restrict x, void * restrict y, int k) { // Compress the vector into 4 bit/value, and store __m128i res = packNibbles( i0 ); - _mm_storeu_si128( ( __m128i* )pb, res ); - pb += bs; + _mm_storeu_si128( ( __m128i* )y[i].qs, res ); } #elif defined(__wasm_simd128__) + uint8_t pp[QK/2]; for (int i = 0; i < nb; i++) { float amax = 0.0f; // absolute max @@ -673,8 +669,7 @@ void quantize_row_q4_0(const float * restrict x, void * restrict y, int k) { const float d = amax / ((1 << 3) - 1); const float id = d ? 1.0/d : 0.0; - *(float *)pd = d; - pd += bs; + y[i].d = d; for (int l = 0; l < 8; l++) { const v128_t v = wasm_f32x4_mul(srcv[l], wasm_f32x4_splat(id)); @@ -685,8 +680,7 @@ void quantize_row_q4_0(const float * restrict x, void * restrict y, int k) { pp[2*l + 1] = wasm_i32x4_extract_lane(vi, 2) | (wasm_i32x4_extract_lane(vi, 3) << 4); } - memcpy(pb, pp, sizeof(pp)); - pb += bs; + memcpy(y[i].qs, pp, sizeof(pp)); } #else // scalar @@ -694,18 +688,11 @@ void quantize_row_q4_0(const float * restrict x, void * restrict y, int k) { #endif } -// method 4 -// blocks of QK elements -// represented with 2 floats (min + delta) and QK/2 8-bit ints (i.e QK 4-bit unsigned integer factors) -void quantize_row_q4_1(const float * restrict x, void * restrict y, int k) { +static void quantize_row_q4_1(const float * restrict x, void * restrict vy, int k) { assert(k % QK == 0); - const int nb = k / QK; - const size_t bs = 2*sizeof(float) + QK/2; - uint8_t * restrict pd = ((uint8_t *)y + 0*bs); - uint8_t * restrict pm = ((uint8_t *)y + 0*bs + sizeof(float)); - uint8_t * restrict pb = ((uint8_t *)y + 0*bs + 2*sizeof(float)); + block_q4_1 * restrict y = vy; uint8_t pp[QK/2]; @@ -722,10 +709,8 @@ void quantize_row_q4_1(const float * restrict x, void * restrict y, int k) { const float d = (max - min) / ((1 << 4) - 1); const float id = d ? 1.0f/d : 0.0f; - *(float *)pm = min; - *(float *)pd = d; - pm += bs; - pd += bs; + y[i].d = d; + y[i].m = min; for (int l = 0; l < QK; l += 2) { const float v0 = (x[i*QK + l + 0] - min)*id; @@ -740,27 +725,22 @@ void quantize_row_q4_1(const float * restrict x, void * restrict y, int k) { pp[l/2] = vi0 | (vi1 << 4); } - memcpy(pb, pp, sizeof(pp)); - pb += bs; + memcpy(y[i].qs, pp, sizeof(pp)); } } -// TODO: vectorize -void dequantize_row_q4_0(const void * restrict x, float * restrict y, int k) { +static void dequantize_row_q4_0(const void * restrict vx, float * restrict y, int k) { assert(k % QK == 0); - const int nb = k / QK; - const size_t bs = sizeof(float) + QK/2; - const uint8_t * restrict pd = ((const uint8_t *)x + 0*bs); - const uint8_t * restrict pb = ((const uint8_t *)x + 0*bs + sizeof(float)); + const block_q4_0 * restrict x = vx; #if defined(__AVX2__) for (int i = 0; i < nb; i++) { // scale factor - const __m256 d_v = _mm256_broadcast_ss((const float *) (pd + i*bs)); + const __m256 d_v = _mm256_broadcast_ss(&x[i].d); - const uint8_t * restrict pp = pb + i*bs; + const uint8_t * restrict pp = x[i].qs; for (int l = 0; l < QK; l += 32) { // Load 32x4-bit integers into 32x8-bit integers @@ -790,17 +770,15 @@ void dequantize_row_q4_0(const void * restrict x, float * restrict y, int k) { } #elif defined(__ARM_NEON) for (int i = 0; i < nb; i++) { - const float d = *(const float *) (pd + i*bs); + const float32x4_t vd = vdupq_n_f32(x[i].d); - const uint8_t * restrict pp = pb + i*bs; - - const float32x4_t vd = vdupq_n_f32(d); + const uint8_t * restrict pp = x[i].qs; for (int l = 0; l < QK; l += 16) { // Load 16x4-bit integers into 8x8-bit integers const uint8x8_t v8 = vld1_u8(pp + l/2); - // Expand 4-bit nibbles to 8-bit bytes + // Expand 4-bit qs to 8-bit bytes const uint8x8_t v0 = vand_u8(v8, vdup_n_u8(0x0f)); const uint8x8_t v1 = vshr_n_u8(v8, 4); @@ -844,9 +822,9 @@ void dequantize_row_q4_0(const void * restrict x, float * restrict y, int k) { #else // scalar for (int i = 0; i < nb; i++) { - const float d = *(const float *) (pd + i*bs); + const float d = x[i].d; - const uint8_t * restrict pp = pb + i*bs; + const uint8_t * restrict pp = x[i].qs; for (int l = 0; l < QK; l += 2) { const uint8_t vi = pp[l/2]; @@ -869,22 +847,18 @@ void dequantize_row_q4_0(const void * restrict x, float * restrict y, int k) { #endif } -void dequantize_row_q4_1(const void * restrict x, float * restrict y, int k) { +static void dequantize_row_q4_1(const void * restrict vx, float * restrict y, int k) { assert(k % QK == 0); - const int nb = k / QK; - const size_t bs = 2*sizeof(float) + QK/2; - const uint8_t * restrict pd = ((const uint8_t *)x + 0*bs); - const uint8_t * restrict pm = ((const uint8_t *)x + 0*bs + sizeof(float)); - const uint8_t * restrict pb = ((const uint8_t *)x + 0*bs + 2*sizeof(float)); + const block_q4_1 * restrict x = vx; #if defined(__AVX2__) for (int i = 0; i < nb; i++) { - const __m256 d_v = _mm256_broadcast_ss((const float *) (pd + i*bs)); - const __m256 d_m = _mm256_broadcast_ss((const float *) (pm + i*bs)); + const __m256 d_v = _mm256_broadcast_ss(&x[i].d); + const __m256 d_m = _mm256_broadcast_ss(&x[i].m); - const uint8_t * restrict pp = pb + i*bs; + const uint8_t * restrict pp = x[i].qs; for (int l = 0; l < QK; l += 32) { // Load 32x4-bit integers into 32x8-bit integers @@ -911,10 +885,10 @@ void dequantize_row_q4_1(const void * restrict x, float * restrict y, int k) { } #else for (int i = 0; i < nb; i++) { - const float d = *(const float *) (pd + i*bs); - const float m = *(const float *) (pm + i*bs); + const float d = x[i].d; + const float m = x[i].m; - const uint8_t * restrict pp = pb + i*bs; + const uint8_t * restrict pp = x[i].qs; for (int l = 0; l < QK; l += 2) { const uint8_t vi = pp[l/2]; @@ -1502,25 +1476,15 @@ inline static void ggml_vec_dot_f32(const int n, float * restrict s, const float #if __AVX512F__ && QK == 32 static inline __m512 dot_q4_0_oneblock_avx512( __m512 acc, - const uint8_t * pd0, - const uint8_t * pd1, - const uint8_t * pb0, - const uint8_t * pb1, - size_t bs, + const block_q4_0 * restrict x, + const block_q4_0 * restrict y, int i ) { - const float * d0_0 = (const float *) (pd0 + i*bs); - const float * d1_0 = (const float *) (pd1 + i*bs); - - const uint8_t * restrict p0 = pb0 + (i+0)*bs; - const uint8_t * restrict p1 = pb1 + (i+0)*bs; - // Compute combined scale for the block - float scaleScalar = d0_0[0] * d1_0[0]; - __m512 scale = _mm512_set1_ps( scaleScalar ); + __m512 d = _mm512_set1_ps( x[i].d * y[i].d ); - __m256i bx = bytesFromNibbles( p0 ); - __m256i by = bytesFromNibbles( p1 ); + __m256i bx = bytesFromNibbles( x[i].qs ); + __m256i by = bytesFromNibbles( y[i].qs ); // Now we have a vector with bytes in [ 0 .. 15 ] interval. Offset them into [ -8 .. +7 ] interval. const __m256i off = _mm256_set1_epi8( 8 ); @@ -1536,7 +1500,7 @@ static inline __m512 dot_q4_0_oneblock_avx512( // Convert int32_t to float __m512 p = _mm512_cvtepi32_ps( i64 ); // Apply the scale, and accumulate - return _mm512_fmadd_ps( scale, p, acc ); + return _mm512_fmadd_ps( d, p, acc ); } #endif @@ -1576,19 +1540,14 @@ inline static void ggml_vec_dot_f16(const int n, float * restrict s, ggml_fp16_t *s = sumf; } -inline static void ggml_vec_dot_q4_0(const int n, float * restrict s, const void * restrict x, const void * restrict y) { +inline static void ggml_vec_dot_q4_0(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) { const int nb = n / QK; assert(n % QK == 0); assert(nb % 2 == 0); - const size_t bs = sizeof(float) + QK/2; - - const uint8_t * restrict pd0 = ((const uint8_t *)x + 0*bs); - const uint8_t * restrict pd1 = ((const uint8_t *)y + 0*bs); - - const uint8_t * restrict pb0 = ((const uint8_t *)x + 0*bs + sizeof(float)); - const uint8_t * restrict pb1 = ((const uint8_t *)y + 0*bs + sizeof(float)); + const block_q4_0 * restrict x = vx; + const block_q4_0 * restrict y = vy; float sumf = 0.0; @@ -1597,23 +1556,18 @@ inline static void ggml_vec_dot_q4_0(const int n, float * restrict s, const void float sum1 = 0.0f; for (int i = 0; i < nb; i += 2) { - const float d0_0 = *(const float *) (pd0 + i*bs); - const float d1_0 = *(const float *) (pd1 + i*bs); - const float d0_1 = *(const float *) (pd0 + (i + 1)*bs); - const float d1_1 = *(const float *) (pd1 + (i + 1)*bs); - - //printf("d0_0: %f, d1_0: %f, d0_1: %f, d1_1: %f\n", d0_0, d1_0, d0_1, d1_1); - - const uint8_t * restrict p0 = pb0 + i*bs; - const uint8_t * restrict p1 = pb1 + i*bs; + const block_q4_0 * restrict x0 = &x[i + 0]; + const block_q4_0 * restrict y0 = &y[i + 0]; + const block_q4_0 * restrict x1 = &x[i + 1]; + const block_q4_0 * restrict y1 = &y[i + 1]; const uint8x16_t m4b = vdupq_n_u8(0xf); const int8x16_t s8b = vdupq_n_s8(0x8); - const uint8x16_t v0_0 = vld1q_u8(p0); - const uint8x16_t v1_0 = vld1q_u8(p1); - const uint8x16_t v0_1 = vld1q_u8(p0 + bs); - const uint8x16_t v1_1 = vld1q_u8(p1 + bs); + const uint8x16_t v0_0 = vld1q_u8(x0->qs); + const uint8x16_t v1_0 = vld1q_u8(y0->qs); + const uint8x16_t v0_1 = vld1q_u8(x1->qs); + const uint8x16_t v1_1 = vld1q_u8(y1->qs); // 4-bit -> 8-bit const int8x16_t v0_0l = vreinterpretq_s8_u8(vandq_u8(v0_0, m4b)); @@ -1651,11 +1605,11 @@ inline static void ggml_vec_dot_q4_0(const int n, float * restrict s, const void // scalar #if defined(__ARM_FEATURE_QRDMX) - sum0 += d0_0*d1_0*vaddvq_s32(p_0); - sum1 += d0_1*d1_1*vaddvq_s32(p_1); + sum0 += x0->d * y0->d * vaddvq_s32(p_0); + sum1 += x1->d * y1->d * vaddvq_s32(p_1); #else - sum0 += d0_0*d1_0*(vgetq_lane_s32(p_0, 0) + vgetq_lane_s32(p_0, 1) + vgetq_lane_s32(p_0, 2) + vgetq_lane_s32(p_0, 3)); - sum1 += d0_1*d1_1*(vgetq_lane_s32(p_1, 0) + vgetq_lane_s32(p_1, 1) + vgetq_lane_s32(p_1, 2) + vgetq_lane_s32(p_1, 3)); + sum0 += x0->d * y0->d * (vgetq_lane_s32(p_0, 0) + vgetq_lane_s32(p_0, 1) + vgetq_lane_s32(p_0, 2) + vgetq_lane_s32(p_0, 3)); + sum1 += x1->d * y1->d * (vgetq_lane_s32(p_1, 0) + vgetq_lane_s32(p_1, 1) + vgetq_lane_s32(p_1, 2) + vgetq_lane_s32(p_1, 3)); #endif #else const int16x8_t pl0l = vmull_s8(vget_low_s8 (v0_0ls), vget_low_s8 (v1_0ls)); @@ -1681,11 +1635,11 @@ inline static void ggml_vec_dot_q4_0(const int n, float * restrict s, const void // scalar #if defined(__ARM_FEATURE_QRDMX) - sum0 += d0_0*d1_0*vaddvq_s16(p_0); - sum1 += d0_1*d1_1*vaddvq_s16(p_1); + sum0 += x0->d * y0->d * vaddvq_s16(p_0); + sum1 += x1->d * y1->d * vaddvq_s16(p_1); #else - sum0 += d0_0*d1_0*(vgetq_lane_s16(p_0, 0) + vgetq_lane_s16(p_0, 1) + vgetq_lane_s16(p_0, 2) + vgetq_lane_s16(p_0, 3) + vgetq_lane_s16(p_0, 4) + vgetq_lane_s16(p_0, 5) + vgetq_lane_s16(p_0, 6) + vgetq_lane_s16(p_0, 7)); - sum1 += d0_1*d1_1*(vgetq_lane_s16(p_1, 0) + vgetq_lane_s16(p_1, 1) + vgetq_lane_s16(p_1, 2) + vgetq_lane_s16(p_1, 3) + vgetq_lane_s16(p_1, 4) + vgetq_lane_s16(p_1, 5) + vgetq_lane_s16(p_1, 6) + vgetq_lane_s16(p_1, 7)); + sum0 += x0->d * y0->d * (vgetq_lane_s16(p_0, 0) + vgetq_lane_s16(p_0, 1) + vgetq_lane_s16(p_0, 2) + vgetq_lane_s16(p_0, 3) + vgetq_lane_s16(p_0, 4) + vgetq_lane_s16(p_0, 5) + vgetq_lane_s16(p_0, 6) + vgetq_lane_s16(p_0, 7)); + sum1 += x1->d * y1->d * (vgetq_lane_s16(p_1, 0) + vgetq_lane_s16(p_1, 1) + vgetq_lane_s16(p_1, 2) + vgetq_lane_s16(p_1, 3) + vgetq_lane_s16(p_1, 4) + vgetq_lane_s16(p_1, 5) + vgetq_lane_s16(p_1, 6) + vgetq_lane_s16(p_1, 7)); #endif #endif } @@ -1703,19 +1657,19 @@ inline static void ggml_vec_dot_q4_0(const int n, float * restrict s, const void for (int superblock_ix = 0; superblock_ix < superblock_count; superblock_ix += 1) { int i = superblock_ix * superblock_size; - acc0 = dot_q4_0_oneblock_avx512( acc0, pd0, pd1, pb0, pb1, bs, i+0 ); - acc1 = dot_q4_0_oneblock_avx512( acc1, pd0, pd1, pb0, pb1, bs, i+1 ); - acc0 = dot_q4_0_oneblock_avx512( acc0, pd0, pd1, pb0, pb1, bs, i+2 ); - acc1 = dot_q4_0_oneblock_avx512( acc1, pd0, pd1, pb0, pb1, bs, i+3 ); - acc0 = dot_q4_0_oneblock_avx512( acc0, pd0, pd1, pb0, pb1, bs, i+4 ); - acc1 = dot_q4_0_oneblock_avx512( acc1, pd0, pd1, pb0, pb1, bs, i+5 ); - acc0 = dot_q4_0_oneblock_avx512( acc0, pd0, pd1, pb0, pb1, bs, i+6 ); - acc1 = dot_q4_0_oneblock_avx512( acc1, pd0, pd1, pb0, pb1, bs, i+7 ); + acc0 = dot_q4_0_oneblock_avx512( acc0, x, y, i+0 ); + acc1 = dot_q4_0_oneblock_avx512( acc1, x, y, i+1 ); + acc0 = dot_q4_0_oneblock_avx512( acc0, x, y, i+2 ); + acc1 = dot_q4_0_oneblock_avx512( acc1, x, y, i+3 ); + acc0 = dot_q4_0_oneblock_avx512( acc0, x, y, i+4 ); + acc1 = dot_q4_0_oneblock_avx512( acc1, x, y, i+5 ); + acc0 = dot_q4_0_oneblock_avx512( acc0, x, y, i+6 ); + acc1 = dot_q4_0_oneblock_avx512( acc1, x, y, i+7 ); } // Remainders for (int i = superblock_count * superblock_size; i < nb; ++i) { - acc0 = dot_q4_0_oneblock_avx512( acc0, pd0, pd1, pb0, pb1, bs, i ); + acc0 = dot_q4_0_oneblock_avx512( acc0, x, y, i ); } // Horizontal sum of all lanes of the accumulator @@ -1726,18 +1680,12 @@ inline static void ggml_vec_dot_q4_0(const int n, float * restrict s, const void // Main loop for (int i = 0; i < nb; ++i) { - const float * d0_0 = (const float *) (pd0 + i*bs); - const float * d1_0 = (const float *) (pd1 + i*bs); - - const uint8_t * restrict p0 = pb0 + i*bs; - const uint8_t * restrict p1 = pb1 + i*bs; - // Compute combined scale for the block - const __m256 scale = _mm256_mul_ps( _mm256_broadcast_ss( d0_0 ), _mm256_broadcast_ss( d1_0 ) ); + const __m256 d = _mm256_mul_ps( _mm256_broadcast_ss( &x[i].d ), _mm256_broadcast_ss( &y[i].d ) ); // Load 16 bytes, and unpack 4 bit fields into bytes, making 32 bytes - __m256i bx = bytesFromNibbles( p0 ); - __m256i by = bytesFromNibbles( p1 ); + __m256i bx = bytesFromNibbles( x[i].qs ); + __m256i by = bytesFromNibbles( y[i].qs ); // Now we have a vector with bytes in [ 0 .. 15 ] interval. Offset them into [ -8 .. +7 ] interval. const __m256i off = _mm256_set1_epi8( 8 ); @@ -1759,7 +1707,7 @@ inline static void ggml_vec_dot_q4_0(const int n, float * restrict s, const void // Convert int32_t to float __m256 p = _mm256_cvtepi32_ps( i32 ); // Apply the scale, and accumulate - acc = _mm256_fmadd_ps( scale, p, acc ); + acc = _mm256_fmadd_ps( d, p, acc ); } // Return horizontal sum of the acc vector @@ -1775,21 +1723,18 @@ inline static void ggml_vec_dot_q4_0(const int n, float * restrict s, const void float sum1 = 0.0f; for (int i = 0; i < nb; i += 2) { - const float d0_0 = *(const float *) (pd0 + i*bs); - const float d1_0 = *(const float *) (pd1 + i*bs); - const float d0_1 = *(const float *) (pd0 + (i + 1)*bs); - const float d1_1 = *(const float *) (pd1 + (i + 1)*bs); - - const uint8_t * restrict p0 = pb0 + i*bs; - const uint8_t * restrict p1 = pb1 + i*bs; + const block_q4_0 * restrict x0 = &px[i + 0]; + const block_q4_0 * restrict y0 = &py[i + 0]; + const block_q4_0 * restrict x1 = &px[i + 1]; + const block_q4_0 * restrict y1 = &py[i + 1]; const v128_t m4b = wasm_u8x16_splat(0xf); const v128_t s8b = wasm_i8x16_splat(0x8); - const v128_t v0_0 = wasm_v128_load(p0); - const v128_t v0_1 = wasm_v128_load(p0 + bs); - const v128_t v1_0 = wasm_v128_load(p1); - const v128_t v1_1 = wasm_v128_load(p1 + bs); + const v128_t v0_0 = wasm_v128_load(x0.qs); + const v128_t v0_1 = wasm_v128_load(y0.qs); + const v128_t v1_0 = wasm_v128_load(x1.qs); + const v128_t v1_1 = wasm_v128_load(y1.qs); // 4-bit -> 8-bit const v128_t v0_0l = wasm_v128_and(v0_0, m4b); @@ -1839,12 +1784,12 @@ inline static void ggml_vec_dot_q4_0(const int n, float * restrict s, const void const v128_t p_0 = wasm_i16x8_add(pl_0, ph_0); const v128_t p_1 = wasm_i16x8_add(pl_1, ph_1); - sum0 += d0_0*d1_0*( + sum0 += x0->d * y0->d * ( wasm_i16x8_extract_lane(p_0, 0) + wasm_i16x8_extract_lane(p_0, 1) + wasm_i16x8_extract_lane(p_0, 2) + wasm_i16x8_extract_lane(p_0, 3) + wasm_i16x8_extract_lane(p_0, 4) + wasm_i16x8_extract_lane(p_0, 5) + wasm_i16x8_extract_lane(p_0, 6) + wasm_i16x8_extract_lane(p_0, 7)); - sum1 += d0_1*d1_1*( + sum1 += x1->d * y1->d * ( wasm_i16x8_extract_lane(p_1, 0) + wasm_i16x8_extract_lane(p_1, 1) + wasm_i16x8_extract_lane(p_1, 2) + wasm_i16x8_extract_lane(p_1, 3) + wasm_i16x8_extract_lane(p_1, 4) + wasm_i16x8_extract_lane(p_1, 5) + @@ -1855,11 +1800,11 @@ inline static void ggml_vec_dot_q4_0(const int n, float * restrict s, const void #else // scalar for (int i = 0; i < nb; i++) { - const float d0 = *(const float *) (pd0 + i*bs); - const float d1 = *(const float *) (pd1 + i*bs); + const float d0 = x[i].d; + const float d1 = y[i].d; - const uint8_t * restrict p0 = pb0 + i*bs; - const uint8_t * restrict p1 = pb1 + i*bs; + const uint8_t * restrict p0 = x[i].qs; + const uint8_t * restrict p1 = y[i].qs; for (int j = 0; j < QK/2; j++) { const uint8_t v0 = p0[j]; @@ -1879,19 +1824,11 @@ inline static void ggml_vec_dot_q4_0(const int n, float * restrict s, const void *s = sumf; } -inline static void ggml_vec_dot_q4_1(const int n, float * restrict s, const void * restrict x, const void * restrict y) { +inline static void ggml_vec_dot_q4_1(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) { const int nb = n / QK; - const size_t bs = 2*sizeof(float) + QK/2; - - const uint8_t * restrict pd0 = ((const uint8_t *)x + 0*bs); - const uint8_t * restrict pd1 = ((const uint8_t *)y + 0*bs); - - const uint8_t * restrict pm0 = ((const uint8_t *)x + 0*bs + sizeof(float)); - const uint8_t * restrict pm1 = ((const uint8_t *)y + 0*bs + sizeof(float)); - - const uint8_t * restrict pb0 = ((const uint8_t *)x + 0*bs + 2*sizeof(float)); - const uint8_t * restrict pb1 = ((const uint8_t *)y + 0*bs + 2*sizeof(float)); + const block_q4_1 * restrict x = vx; + const block_q4_1 * restrict y = vy; float sumf = 0.0; @@ -1903,21 +1840,17 @@ inline static void ggml_vec_dot_q4_1(const int n, float * restrict s, const void // Main loop for (int i = 0; i < nb; ++i) { - const float * m0 = (const float *) (pm0 + i*bs); - const float * m1 = (const float *) (pm1 + i*bs); + const float * d0 = &x[i].d; + const float * d1 = &y[i].d; - const float * d0 = (const float *) (pd0 + i*bs); - const float * d1 = (const float *) (pd1 + i*bs); - - const uint8_t * restrict p0 = pb0 + i*bs; - const uint8_t * restrict p1 = pb1 + i*bs; + const float * m0 = &x[i].m; + const float * m1 = &y[i].m; const __m256 d0v = _mm256_broadcast_ss( d0 ); const __m256 d1v = _mm256_broadcast_ss( d1 ); const __m256 m0v = _mm256_broadcast_ss( m0 ); const __m256 m1v = _mm256_broadcast_ss( m1 ); - // Compute combined scale for the block const __m256 scale_01 = _mm256_mul_ps( d0v, d1v ); @@ -1927,8 +1860,8 @@ inline static void ggml_vec_dot_q4_1(const int n, float * restrict s, const void const __m256 cross_scales = _mm256_blend_ps( scale_0, scale_1, 0b10101010 ); // Load 16 bytes, and unpack 4 bit fields into bytes, making 32 bytes - __m256i bx = bytesFromNibbles( p0 ); - __m256i by = bytesFromNibbles( p1 ); + __m256i bx = bytesFromNibbles( x[i].qs ); + __m256i by = bytesFromNibbles( y[i].qs ); // Now we have a vector with bytes in [ 0 .. 15 ] interval. @@ -1973,14 +1906,14 @@ inline static void ggml_vec_dot_q4_1(const int n, float * restrict s, const void #else // scalar for (int i = 0; i < nb; i++) { - const float m0 = *(const float *) (pm0 + i*bs); - const float m1 = *(const float *) (pm1 + i*bs); + const float d0 = x[i].d; + const float d1 = y[i].d; - const float d0 = *(const float *) (pd0 + i*bs); - const float d1 = *(const float *) (pd1 + i*bs); + const float m0 = x[i].m; + const float m1 = y[i].m; - const uint8_t * restrict p0 = pb0 + i*bs; - const uint8_t * restrict p1 = pb1 + i*bs; + const uint8_t * restrict p0 = x[i].qs; + const uint8_t * restrict p1 = y[i].qs; for (int j = 0; j < QK/2; j++) { const uint8_t v0 = p0[j]; @@ -2251,8 +2184,8 @@ static const int GGML_BLCK_SIZE[GGML_TYPE_COUNT] = { static_assert(GGML_TYPE_COUNT == 7, "GGML_TYPE_COUNT != 5"); static const size_t GGML_TYPE_SIZE[GGML_TYPE_COUNT] = { - sizeof(float ) + QK/2, - sizeof(float )*2 + QK/2, + sizeof(block_q4_0), + sizeof(block_q4_1), sizeof(int8_t ), sizeof(int16_t), sizeof(int32_t), @@ -10369,64 +10302,50 @@ enum ggml_opt_result ggml_opt( //////////////////////////////////////////////////////////////////////////////// -size_t ggml_quantize_q4_0(const float * src, void * dst, int n, int k, int qk, int64_t * hist) { - const int nb = k / qk; - const size_t bs = (sizeof(float) + sizeof(uint8_t)*qk/2); - const size_t row_size = nb*bs; - - assert(k % qk == 0); - - char * pdst = (char *) dst; +size_t ggml_quantize_q4_0(const float * src, void * dst, int n, int k, int64_t * hist) { + assert(k % QK == 0); + const int nb = k / QK; for (int j = 0; j < n; j += k) { - uint8_t * pd = (uint8_t *) (pdst + (j/k)*row_size + 0*bs); - uint8_t * pb = (uint8_t *) (pdst + (j/k)*row_size + 0*bs + sizeof(float)); + block_q4_0 * restrict y = (block_q4_0 *)dst + j/QK; - quantize_row_q4_0_reference(src + j, pd, k); + quantize_row_q4_0_reference(src + j, y, k); for (int i = 0; i < nb; i++) { - for (int l = 0; l < qk; l += 2) { - const uint8_t vi0 = pb[l/2] & 0xF; - const uint8_t vi1 = pb[l/2] >> 4; + for (int l = 0; l < QK; l += 2) { + const uint8_t vi0 = y[i].qs[l/2] & 0xF; + const uint8_t vi1 = y[i].qs[l/2] >> 4; hist[vi0]++; hist[vi1]++; } - pb += bs; } } - return (n/k)*row_size; + return (n/QK*sizeof(block_q4_0)); } -size_t ggml_quantize_q4_1(const float * src, void * dst, int n, int k, int qk, int64_t * hist) { - const int nb = k / qk; - const size_t bs = (2*sizeof(float) + sizeof(uint8_t)*qk/2); - const size_t row_size = nb*bs; - - assert(k % qk == 0); - - char * pdst = (char *) dst; +size_t ggml_quantize_q4_1(const float * src, void * dst, int n, int k, int64_t * hist) { + assert(k % QK == 0); + const int nb = k / QK; for (int j = 0; j < n; j += k) { - uint8_t * pd = (uint8_t *) (pdst + (j/k)*row_size + 0*bs); - uint8_t * pb = (uint8_t *) (pdst + (j/k)*row_size + 0*bs + 2*sizeof(float)); + block_q4_1 * restrict y = (block_q4_1 *)dst + j/QK; - quantize_row_q4_1(src + j, pd, k); + quantize_row_q4_1(src + j, y, k); for (int i = 0; i < nb; i++) { - for (int l = 0; l < qk; l += 2) { - const uint8_t vi0 = pb[l/2] & 0xF; - const uint8_t vi1 = pb[l/2] >> 4; + for (int l = 0; l < QK; l += 2) { + const uint8_t vi0 = y[i].qs[l/2] & 0xF; + const uint8_t vi1 = y[i].qs[l/2] >> 4; hist[vi0]++; hist[vi1]++; } - pb += bs; } } - return (n/k)*row_size; + return (n/QK*sizeof(block_q4_1)); } //////////////////////////////////////////////////////////////////////////////// diff --git a/ggml.h b/ggml.h index ddb97318b..335230f9f 100644 --- a/ggml.h +++ b/ggml.h @@ -748,8 +748,8 @@ enum ggml_opt_result ggml_opt( // quantization // -size_t ggml_quantize_q4_0(const float * src, void * dst, int n, int k, int qk, int64_t * hist); -size_t ggml_quantize_q4_1(const float * src, void * dst, int n, int k, int qk, int64_t * hist); +size_t ggml_quantize_q4_0(const float * src, void * dst, int n, int k, int64_t * hist); +size_t ggml_quantize_q4_1(const float * src, void * dst, int n, int k, int64_t * hist); // // system info diff --git a/llama.cpp b/llama.cpp index 2bd520353..b0eab2e72 100644 --- a/llama.cpp +++ b/llama.cpp @@ -1345,7 +1345,7 @@ static llama_vocab::id llama_sample_top_p_top_k( // // TODO: reuse code from the llama_model_load() somehow -bool llama_model_quantize_internal(const std::string & fname_inp, const std::string & fname_out, int itype, int qk) { +static bool llama_model_quantize_internal(const std::string & fname_inp, const std::string & fname_out, int itype) { ggml_type type = GGML_TYPE_Q4_1; switch (itype) { @@ -1568,11 +1568,11 @@ bool llama_model_quantize_internal(const std::string & fname_inp, const std::str switch (type) { case GGML_TYPE_Q4_0: { - cur_size = ggml_quantize_q4_0(data_f32.data(), work.data(), nelements, ne[0], qk, hist_cur.data()); + cur_size = ggml_quantize_q4_0(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data()); } break; case GGML_TYPE_Q4_1: { - cur_size = ggml_quantize_q4_1(data_f32.data(), work.data(), nelements, ne[0], qk, hist_cur.data()); + cur_size = ggml_quantize_q4_1(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data()); } break; default: { @@ -1711,9 +1711,8 @@ void llama_free(struct llama_context * ctx) { int llama_model_quantize( const char * fname_inp, const char * fname_out, - int itype, - int qk) { - if (!llama_model_quantize_internal(fname_inp, fname_out, itype, qk)) { + int itype) { + if (!llama_model_quantize_internal(fname_inp, fname_out, itype)) { fprintf(stderr, "%s: failed to quantize\n", __func__); return 1; } diff --git a/llama.h b/llama.h index ebf55f41c..d3f4cae61 100644 --- a/llama.h +++ b/llama.h @@ -81,8 +81,7 @@ extern "C" { LLAMA_API int llama_model_quantize( const char * fname_inp, const char * fname_out, - int itype, - int qk); + int itype); // Run the llama inference to obtain the logits and probabilities for the next token. // tokens + n_tokens is the provided batch of new tokens to process diff --git a/tests/test-quantize.c b/tests/test-quantize.c index d59ecb8ab..993e9dcc3 100644 --- a/tests/test-quantize.c +++ b/tests/test-quantize.c @@ -13,7 +13,7 @@ int main(void) { src[i] = (float)(i + 1); } - size_t size = ggml_quantize_q4_0(src, dst, QK, QK, QK, hist); + size_t size = ggml_quantize_q4_0(src, dst, QK, QK, hist); assert(size == 20); float max_result = ((float *)dst)[0]; float max_expected = src[31] / ((1 << 3) - 1); @@ -24,7 +24,7 @@ int main(void) { assert(q4_result == q4_expected); } - size = ggml_quantize_q4_1(src, dst, QK, QK, QK, hist); + size = ggml_quantize_q4_1(src, dst, QK, QK, hist); assert(size == 24); float delta_result = ((float *)dst)[0]; float delta_expected = (src[31] - src[0]) / ((1 << 4) - 1); From 20e1e84884376b3fb44ffbfd48d478b2934b0b5e Mon Sep 17 00:00:00 2001 From: Jed Fox Date: Tue, 28 Mar 2023 11:39:01 -0500 Subject: [PATCH 08/12] deploy : add a Package.swift for SwiftPM support (#393) * Add a Package.swift for SwiftPM support * Swap from exclusions to allowlist --- .gitignore | 5 +++++ Package.swift | 20 ++++++++++++++++++++ spm-headers/llama.h | 1 + 3 files changed, 26 insertions(+) create mode 100644 Package.swift create mode 120000 spm-headers/llama.h diff --git a/.gitignore b/.gitignore index 053311fee..741c6b4ea 100644 --- a/.gitignore +++ b/.gitignore @@ -5,6 +5,7 @@ .vscode/ .DS_Store +.build/ build/ build-em/ build-debug/ @@ -27,3 +28,7 @@ compile_commands.json .envrc .direnv/ + +.venv +__pycache__ +.swiftpm diff --git a/Package.swift b/Package.swift new file mode 100644 index 000000000..79d13c82d --- /dev/null +++ b/Package.swift @@ -0,0 +1,20 @@ +// swift-tools-version:5.3 + +import PackageDescription + +let package = Package( + name: "llama", + products: [ + .library(name: "llama", targets: ["llama"]), + ], + targets: [ + .target( + name: "llama", + path: ".", + sources: ["ggml.c", "llama.cpp"], + publicHeadersPath: "spm-headers", + cSettings: [.unsafeFlags(["-Wno-shorten-64-to-32"])] + ), + ], + cxxLanguageStandard: .cxx11 +) diff --git a/spm-headers/llama.h b/spm-headers/llama.h new file mode 120000 index 000000000..9acceb980 --- /dev/null +++ b/spm-headers/llama.h @@ -0,0 +1 @@ +../llama.h \ No newline at end of file From 436e56193199a1625f8c561069f702e8840a9e08 Mon Sep 17 00:00:00 2001 From: Stephan Walter Date: Tue, 28 Mar 2023 16:48:20 +0000 Subject: [PATCH 09/12] all : be more strict about converting float to double (#458) * Be more strict about converting float to double * Test equivalence of round, SILU implementations Test module is commented out in CMakeLists.txt because the tests may take a long time, depending on how much the compiler optimizes. * Fix softmax in perplexity.cpp * all : prefer float over double where appropriate * perplexity : add --------- Co-authored-by: Georgi Gerganov --- CMakeLists.txt | 5 +- Makefile | 4 + examples/common.cpp | 6 +- examples/main/main.cpp | 11 +-- examples/perplexity/perplexity.cpp | 20 +++-- examples/quantize/quantize.cpp | 4 +- ggml.c | 138 +++++++++++++++-------------- llama.cpp | 52 +++++------ llama.h | 8 +- tests/CMakeLists.txt | 1 + tests/test-double-float.c | 53 +++++++++++ 11 files changed, 185 insertions(+), 117 deletions(-) create mode 100644 tests/test-double-float.c diff --git a/CMakeLists.txt b/CMakeLists.txt index 27a222a16..241be4c15 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -124,17 +124,18 @@ if (LLAMA_ALL_WARNINGS) -Wall -Wextra -Wpedantic - -Wshadow -Wcast-qual + -Wdouble-promotion + -Wshadow -Wstrict-prototypes -Wpointer-arith - -Wno-unused-function ) set(cxx_flags -Wall -Wextra -Wpedantic -Wcast-qual + -Wdouble-promotion ) else() # todo : msvc diff --git a/Makefile b/Makefile index 973b2951b..9cfa89f7a 100644 --- a/Makefile +++ b/Makefile @@ -35,6 +35,10 @@ CFLAGS = -I. -O3 -DNDEBUG -std=c11 -fPIC CXXFLAGS = -I. -I./examples -O3 -DNDEBUG -std=c++11 -fPIC LDFLAGS = +# warnings +CFLAGS += -Wall -Wextra -Wpedantic -Wcast-qual -Wdouble-promotion -Wshadow -Wstrict-prototypes -Wpointer-arith -Wno-unused-function +CXXFLAGS += -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function + # OS specific # TODO: support Windows ifeq ($(UNAME_S),Linux) diff --git a/examples/common.cpp b/examples/common.cpp index 880ebe9a2..af3ad9eb7 100644 --- a/examples/common.cpp +++ b/examples/common.cpp @@ -215,13 +215,13 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) { fprintf(stderr, " prompt file to start generation.\n"); fprintf(stderr, " -n N, --n_predict N number of tokens to predict (default: %d, -1 = infinity)\n", params.n_predict); fprintf(stderr, " --top_k N top-k sampling (default: %d)\n", params.top_k); - fprintf(stderr, " --top_p N top-p sampling (default: %.1f)\n", params.top_p); + fprintf(stderr, " --top_p N top-p sampling (default: %.1f)\n", (double)params.top_p); fprintf(stderr, " --repeat_last_n N last n tokens to consider for penalize (default: %d)\n", params.repeat_last_n); - fprintf(stderr, " --repeat_penalty N penalize repeat sequence of tokens (default: %.1f)\n", params.repeat_penalty); + fprintf(stderr, " --repeat_penalty N penalize repeat sequence of tokens (default: %.1f)\n", (double)params.repeat_penalty); fprintf(stderr, " -c N, --ctx_size N size of the prompt context (default: %d)\n", params.n_ctx); fprintf(stderr, " --ignore-eos ignore end of stream token and continue generating\n"); fprintf(stderr, " --memory_f32 use f32 instead of f16 for memory key+value\n"); - fprintf(stderr, " --temp N temperature (default: %.1f)\n", params.temp); + fprintf(stderr, " --temp N temperature (default: %.1f)\n", (double)params.temp); fprintf(stderr, " --n_parts N number of model parts (default: -1 = determine from dimensions)\n"); fprintf(stderr, " -b N, --batch_size N batch size for prompt processing (default: %d)\n", params.n_batch); fprintf(stderr, " --perplexity compute perplexity over the prompt\n"); diff --git a/examples/main/main.cpp b/examples/main/main.cpp index d5ab2cf75..3130aef0c 100644 --- a/examples/main/main.cpp +++ b/examples/main/main.cpp @@ -209,7 +209,8 @@ int main(int argc, char ** argv) { fprintf(stderr, "Input prefix: '%s'\n", params.input_prefix.c_str()); } } - fprintf(stderr, "sampling: temp = %f, top_k = %d, top_p = %f, repeat_last_n = %i, repeat_penalty = %f\n", params.temp, params.top_k, params.top_p, params.repeat_last_n, params.repeat_penalty); + fprintf(stderr, "sampling: temp = %f, top_k = %d, top_p = %f, repeat_last_n = %i, repeat_penalty = %f\n", + params.temp, params.top_k, params.top_p, params.repeat_last_n, params.repeat_penalty); fprintf(stderr, "generate: n_ctx = %d, n_batch = %d, n_predict = %d, n_keep = %d\n", n_ctx, params.n_batch, params.n_predict, params.n_keep); fprintf(stderr, "\n\n"); @@ -274,10 +275,10 @@ int main(int argc, char ** argv) { if ((int) embd_inp.size() <= n_consumed && !is_interacting) { // out of user input, sample next token - const float top_k = params.top_k; - const float top_p = params.top_p; - const float temp = params.temp; - const float repeat_penalty = params.repeat_penalty; + const int32_t top_k = params.top_k; + const float top_p = params.top_p; + const float temp = params.temp; + const float repeat_penalty = params.repeat_penalty; llama_token id = 0; diff --git a/examples/perplexity/perplexity.cpp b/examples/perplexity/perplexity.cpp index 75d526d3d..07ed0a829 100644 --- a/examples/perplexity/perplexity.cpp +++ b/examples/perplexity/perplexity.cpp @@ -1,15 +1,17 @@ #include "common.h" #include "llama.h" -std::vector softmax(const std::vector& logits) { - std::vector probs(logits.size()); +#include + +std::vector softmax(const std::vector& logits) { + std::vector probs(logits.size()); float max_logit = logits[0]; for (float v : logits) max_logit = std::max(max_logit, v); double sum_exp = 0.0; for (size_t i = 0; i < logits.size(); i++) { // Subtract the maximum logit value from the current logit value for numerical stability - float logit = logits[i] - max_logit; - double exp_logit = std::exp(logit); + const float logit = logits[i] - max_logit; + const float exp_logit = expf(logit); sum_exp += exp_logit; probs[i] = exp_logit; } @@ -24,14 +26,16 @@ void perplexity(llama_context * ctx, const gpt_params & params) { auto tokens = ::llama_tokenize(ctx, params.prompt, true); int count = 0; - double nll = 0.0; int seq_count = tokens.size() / params.n_ctx; + double nll = 0.0; + fprintf(stderr, "%s : calculating perplexity over %d chunks\n", __func__, seq_count); for (int i = 0; i < seq_count; ++i) { int start = i * params.n_ctx; - int end = start + params.n_ctx - 1; + int end = start + params.n_ctx - 1; // TODO: this is not optimal, e.g. it makes the batch 511 instead of 512 + // it is better to always be power of 2 for better performance std::vector embd(tokens.begin() + start, tokens.begin() + end); auto start_t = std::chrono::high_resolution_clock::now(); if (llama_eval(ctx, embd.data(), embd.size(), 0, params.n_threads)) { @@ -40,7 +44,7 @@ void perplexity(llama_context * ctx, const gpt_params & params) { } auto end_t = std::chrono::high_resolution_clock::now(); if (i == 0) { - double seconds = std::chrono::duration(end_t - start_t).count(); + const float seconds = std::chrono::duration(end_t - start_t).count(); printf("%.2f seconds per pass - ETA %.2f hours\n", seconds, (seconds * seq_count) / (60.0*60.0)); } // We get the logits for all the tokens in the context window (params.n_ctx) @@ -63,7 +67,7 @@ void perplexity(llama_context * ctx, const gpt_params & params) { std::vector tok_logits( logits + j * n_vocab, logits + (j + 1) * n_vocab); - double prob = softmax(tok_logits)[tokens[start + j + 1]]; + const float prob = softmax(tok_logits)[tokens[start + j + 1]]; nll += -std::log(prob); ++count; } diff --git a/examples/quantize/quantize.cpp b/examples/quantize/quantize.cpp index 3888ff587..b444328ac 100644 --- a/examples/quantize/quantize.cpp +++ b/examples/quantize/quantize.cpp @@ -50,8 +50,8 @@ int main(int argc, char ** argv) { const int64_t t_main_end_us = ggml_time_us(); printf("\n"); - printf("%s: quantize time = %8.2f ms\n", __func__, t_quantize_us/1000.0f); - printf("%s: total time = %8.2f ms\n", __func__, (t_main_end_us - t_main_start_us)/1000.0f); + printf("%s: quantize time = %8.2f ms\n", __func__, t_quantize_us/1000.0); + printf("%s: total time = %8.2f ms\n", __func__, (t_main_end_us - t_main_start_us)/1000.0); } return 0; diff --git a/ggml.c b/ggml.c index bf8ec8ab2..83395a701 100644 --- a/ggml.c +++ b/ggml.c @@ -150,10 +150,10 @@ typedef double ggml_float; // #include -#define GGML_COMPUTE_FP16_TO_FP32(x) (x) +#define GGML_COMPUTE_FP16_TO_FP32(x) ((float) (x)) #define GGML_COMPUTE_FP32_TO_FP16(x) (x) -#define GGML_FP16_TO_FP32(x) (x) +#define GGML_FP16_TO_FP32(x) ((float) (x)) #define GGML_FP32_TO_FP16(x) (x) #else @@ -322,7 +322,7 @@ inline static float ggml_lookup_fp16_to_fp32(ggml_fp16_t f) { // note: do not use these inside ggml.c // these are meant to be used via the ggml.h API float ggml_fp16_to_fp32(ggml_fp16_t x) { - return GGML_FP16_TO_FP32(x); + return (float) GGML_FP16_TO_FP32(x); } ggml_fp16_t ggml_fp32_to_fp16(float x) { @@ -488,8 +488,8 @@ static void quantize_row_q4_0_reference(const float * restrict x, block_q4_0 * r const float v0 = x[i*QK + l + 0]*id; const float v1 = x[i*QK + l + 1]*id; - const uint8_t vi0 = ((int8_t) (round(v0))) + 8; - const uint8_t vi1 = ((int8_t) (round(v1))) + 8; + const uint8_t vi0 = (int8_t)roundf(v0) + 8; + const uint8_t vi1 = (int8_t)roundf(v1) + 8; assert(vi0 >= 0 && vi0 < 16); assert(vi1 >= 0 && vi1 < 16); @@ -566,7 +566,7 @@ static void quantize_row_q4_0(const float * restrict x, void * restrict vy, int MAX(vgetq_lane_f32(amaxv[0], 2), vgetq_lane_f32(amaxv[0], 3))); const float d = amax / ((1 << 3) - 1); - const float id = d ? 1.0/d : 0.0; + const float id = d ? 1.0f/d : 0.0f; y[i].d = d; @@ -716,8 +716,8 @@ static void quantize_row_q4_1(const float * restrict x, void * restrict vy, int const float v0 = (x[i*QK + l + 0] - min)*id; const float v1 = (x[i*QK + l + 1] - min)*id; - const uint8_t vi0 = round(v0); - const uint8_t vi1 = round(v1); + const uint8_t vi0 = roundf(v0); + const uint8_t vi1 = roundf(v1); assert(vi0 >= 0 && vi0 < 16); assert(vi1 >= 0 && vi1 < 16); @@ -1001,7 +1001,7 @@ static void dequantize_row_q4_1(const void * restrict vx, float * restrict y, in } \ const float32x4_t t0 = vcvt_f32_f16(vget_low_f16 (x[0])); \ const float32x4_t t1 = vcvt_f32_f16(vget_high_f16(x[0])); \ - res = vaddvq_f32(vaddq_f32(t0, t1)); \ + res = (ggml_float) vaddvq_f32(vaddq_f32(t0, t1)); \ } #define GGML_F16_VEC GGML_F16x8 @@ -1437,9 +1437,8 @@ inline static void ggml_vec_mul_f32 (const int n, float * z, const float * x, co inline static void ggml_vec_div_f32 (const int n, float * z, const float * x, const float * y) { for (int i = 0; i < n; ++i) z[i] = x[i]/y[i]; } inline static void ggml_vec_dot_f32(const int n, float * restrict s, const float * restrict x, const float * restrict y) { - ggml_float sumf = 0.0; - #ifdef GGML_SIMD + float sumf = 0.0f; const int np = (n & ~(GGML_F32_STEP - 1)); GGML_F32_VEC sum[GGML_F32_ARR] = { GGML_F32_VEC_ZERO }; @@ -1465,8 +1464,9 @@ inline static void ggml_vec_dot_f32(const int n, float * restrict s, const float } #else // scalar + ggml_float sumf = 0.0; for (int i = 0; i < n; ++i) { - sumf += x[i]*y[i]; + sumf += (ggml_float)(x[i]*y[i]); } #endif @@ -1529,11 +1529,11 @@ inline static void ggml_vec_dot_f16(const int n, float * restrict s, ggml_fp16_t // leftovers for (int i = np; i < n; ++i) { - sumf += GGML_FP16_TO_FP32(x[i])*GGML_FP16_TO_FP32(y[i]); + sumf += (ggml_float)(GGML_FP16_TO_FP32(x[i])*GGML_FP16_TO_FP32(y[i])); } #else for (int i = 0; i < n; ++i) { - sumf += GGML_FP16_TO_FP32(x[i])*GGML_FP16_TO_FP32(y[i]); + sumf += (ggml_float)(GGML_FP16_TO_FP32(x[i])*GGML_FP16_TO_FP32(y[i])); } #endif @@ -1549,7 +1549,7 @@ inline static void ggml_vec_dot_q4_0(const int n, float * restrict s, const void const block_q4_0 * restrict x = vx; const block_q4_0 * restrict y = vy; - float sumf = 0.0; + ggml_float sumf = 0.0; #if defined(__ARM_NEON) float sum0 = 0.0f; @@ -1644,7 +1644,7 @@ inline static void ggml_vec_dot_q4_0(const int n, float * restrict s, const void #endif } - sumf = sum0 + sum1; + sumf = (ggml_float)(sum0 + sum1); #elif defined(__AVX512F__) // Initialize accumulator with zeros __m512 acc0 = _mm512_setzero_ps(); @@ -1972,13 +1972,13 @@ inline static void ggml_vec_dot_f16_unroll(const int n, const int xs, float * re // leftovers for (int i = np; i < n; ++i) { for (int j = 0; j < GGML_VEC_DOT_UNROLL; ++j) { - sumf[j] += GGML_FP16_TO_FP32(x[j][i])*GGML_FP16_TO_FP32(y[i]); + sumf[j] += (ggml_float)(GGML_FP16_TO_FP32(x[j][i])*GGML_FP16_TO_FP32(y[i])); } } #else for (int i = 0; i < n; ++i) { for (int j = 0; j < GGML_VEC_DOT_UNROLL; ++j) { - sumf[j] += GGML_FP16_TO_FP32(x[j][i])*GGML_FP16_TO_FP32(y[i]); + sumf[j] += (ggml_float)(GGML_FP16_TO_FP32(x[j][i])*GGML_FP16_TO_FP32(y[i])); } } #endif @@ -2049,19 +2049,19 @@ inline static void ggml_vec_scale_f32(const int n, float * y, const float v) { #endif } -inline static void ggml_vec_norm_f32 (const int n, float * s, const float * x) { ggml_vec_dot_f32(n, s, x, x); *s = sqrt(*s); } +inline static void ggml_vec_norm_f32 (const int n, float * s, const float * x) { ggml_vec_dot_f32(n, s, x, x); *s = sqrtf(*s); } inline static void ggml_vec_sqr_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = x[i]*x[i]; } -inline static void ggml_vec_sqrt_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = sqrt(x[i]); } +inline static void ggml_vec_sqrt_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = sqrtf(x[i]); } inline static void ggml_vec_abs_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = fabsf(x[i]); } inline static void ggml_vec_sgn_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? 1.f : ((x[i] < 0.f) ? -1.f : 0.f); } inline static void ggml_vec_step_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? 1.f : 0.f; } inline static void ggml_vec_relu_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? x[i] : 0.f; } -static const ggml_float GELU_COEF_A = 0.044715; -static const ggml_float SQRT_2_OVER_PI = 0.79788456080286535587989211986876; +static const float GELU_COEF_A = 0.044715f; +static const float SQRT_2_OVER_PI = 0.79788456080286535587989211986876f; inline static float ggml_gelu_f32(float x) { - return 0.5*x*(1.0 + tanh(SQRT_2_OVER_PI*x*(1.0 + GELU_COEF_A*x*x))); + return 0.5f*x*(1.0f + tanhf(SQRT_2_OVER_PI*x*(1.0f + GELU_COEF_A*x*x))); } inline static void ggml_vec_gelu_f16(const int n, ggml_fp16_t * y, const ggml_fp16_t * x) { @@ -2090,7 +2090,7 @@ inline static void ggml_vec_gelu_f32(const int n, float * y, const float * x) { // Sigmoid Linear Unit (SiLU) function inline static float ggml_silu_f32(float x) { - return x/(1.0 + exp(-x)); + return x/(1.0f + expf(-x)); } inline static void ggml_vec_silu_f16(const int n, ggml_fp16_t * y, const ggml_fp16_t * x) { @@ -2121,7 +2121,7 @@ inline static void ggml_vec_sum_f32(const int n, float * s, const float * x) { #ifndef GGML_USE_ACCELERATE ggml_float sum = 0.0; for (int i = 0; i < n; ++i) { - sum += x[i]; + sum += (ggml_float)x[i]; } *s = sum; #else @@ -2131,7 +2131,7 @@ inline static void ggml_vec_sum_f32(const int n, float * s, const float * x) { inline static void ggml_vec_max_f32(const int n, float * s, const float * x) { #ifndef GGML_USE_ACCELERATE - ggml_float max = -INFINITY; + float max = -INFINITY; for (int i = 0; i < n; ++i) { max = MAX(max, x[i]); } @@ -2141,7 +2141,10 @@ inline static void ggml_vec_max_f32(const int n, float * s, const float * x) { #endif } -inline static void ggml_vec_norm_inv_f32(const int n, float * s, const float * x) { ggml_vec_norm_f32(n, s, x); *s = 1./(*s); } +inline static void ggml_vec_norm_inv_f32(const int n, float * s, const float * x) { + ggml_vec_norm_f32(n, s, x); + *s = 1.f/(*s); +} // // logging @@ -2540,7 +2543,7 @@ struct ggml_context * ggml_init(struct ggml_init_params params) { const float f = table_f32_f16[i] = GGML_COMPUTE_FP16_TO_FP32(ii); table_gelu_f16[i] = GGML_FP32_TO_FP16(ggml_gelu_f32(f)); table_silu_f16[i] = GGML_FP32_TO_FP16(ggml_silu_f32(f)); - table_exp_f16[i] = GGML_FP32_TO_FP16(exp(f)); + table_exp_f16[i] = GGML_FP32_TO_FP16(expf(f)); } const uint64_t t_end = ggml_time_us(); UNUSED(t_end); @@ -5583,7 +5586,7 @@ static void ggml_compute_forward_norm_f32( const size_t nb2 = dst->nb[2]; const size_t nb3 = dst->nb[3]; - const ggml_float eps = 1e-5f; // TODO: make this a parameter + const float eps = 1e-5f; // TODO: make this a parameter // TODO: optimize for (int i03 = 0; i03 < ne03; i03++) { @@ -5591,23 +5594,24 @@ static void ggml_compute_forward_norm_f32( for (int i01 = ith; i01 < ne01; i01 += nth) { const float * x = (float *) ((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03); - ggml_float mean = 0.0; + ggml_float sum = 0.0; for (int i00 = 0; i00 < ne00; i00++) { - mean += x[i00]; + sum += (ggml_float)x[i00]; } - mean /= ne00; + float mean = sum/ne00; float * y = (float *) ((char *) dst->data + i01*nb1 + i02*nb2 + i03*nb3); ggml_float sum2 = 0.0; for (int i00 = 0; i00 < ne00; i00++) { - ggml_float v = x[i00] - mean; + float v = x[i00] - mean; y[i00] = v; - sum2 += v*v; + sum2 += (ggml_float)(v*v); } - const float scale = 1.0/sqrt(sum2/ne00 + eps); + float variance = sum2/ne00; + const float scale = 1.0f/sqrtf(variance + eps); ggml_vec_scale_f32(ne00, y, scale); } @@ -5665,7 +5669,7 @@ static void ggml_compute_forward_rms_norm_f32( const size_t nb2 = dst->nb[2]; const size_t nb3 = dst->nb[3]; - const ggml_float eps = 1e-6f; // TODO: make this a parameter + const float eps = 1e-6f; // TODO: make this a parameter // TODO: optimize for (int i03 = 0; i03 < ne03; i03++) { @@ -5673,12 +5677,12 @@ static void ggml_compute_forward_rms_norm_f32( for (int i01 = ith; i01 < ne01; i01 += nth) { const float * x = (float *) ((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03); - ggml_float mean = 0.0; + ggml_float sum = 0.0; for (int i00 = 0; i00 < ne00; i00++) { - mean += x[i00] * x[i00]; + sum += (ggml_float)(x[i00] * x[i00]); } - mean /= ne00; + float mean = sum/ne00; float * y = (float *) ((char *) dst->data + i01*nb1 + i02*nb2 + i03*nb3); @@ -5687,7 +5691,7 @@ static void ggml_compute_forward_rms_norm_f32( // y[i00] = x[i00]; // } - const float scale = 1.0/sqrt(mean + eps); + const float scale = 1.0f/sqrtf(mean + eps); ggml_vec_scale_f32(ne00, y, scale); } @@ -6913,12 +6917,12 @@ static void ggml_compute_forward_soft_max_f32( ggml_fp16_t s = GGML_FP32_TO_FP16(p[i] - max); memcpy(&scvt, &s, sizeof(scvt)); const float val = GGML_FP16_TO_FP32(table_exp_f16[scvt]); - sum += val; + sum += (ggml_float)val; p[i] = val; } } - assert(sum > 0.0f); + assert(sum > 0.0); sum = 1.0/sum; ggml_vec_scale_f32(nc, p, sum); @@ -6994,16 +6998,16 @@ static void ggml_compute_forward_rope_f32( const int p = (mode == 0 ? n_past + i2 : i2); for (int i1 = 0; i1 < ne1; i1++) { for (int i0 = 0; i0 < n_dims; i0 += 2) { - const double theta = pow(10000.0, ((double)-i0)/n_dims); + const float theta = powf(10000.0, ((float)-i0)/n_dims); - const double cos_theta = cos(p*theta); - const double sin_theta = sin(p*theta); + const float cos_theta = cosf(p*theta); + const float sin_theta = sinf(p*theta); const float * const src = (float *)((char *) src0->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0); float * dst_data = (float *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0); - double x0 = src[0]; - double x1 = src[1]; + const float x0 = src[0]; + const float x1 = src[1]; dst_data[0] = x0*cos_theta - x1*sin_theta; dst_data[1] = x0*sin_theta + x1*cos_theta; @@ -7050,16 +7054,16 @@ static void ggml_compute_forward_rope_f16( const int p = (mode == 0 ? n_past + i2 : i2); for (int i1 = 0; i1 < ne1; i1++) { for (int i0 = 0; i0 < n_dims; i0 += 2) { - const double theta = pow(10000.0, ((double)-i0)/n_dims); + const float theta = powf(10000.0, ((float)-i0)/n_dims); - const double cos_theta = cos(p*theta); - const double sin_theta = sin(p*theta); + const float cos_theta = cosf(p*theta); + const float sin_theta = sinf(p*theta); const ggml_fp16_t * const src = (ggml_fp16_t *)((char *) src0->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0); ggml_fp16_t * dst_data = (ggml_fp16_t *)((char *) dst->data + i3*nb3 + i2*nb2 + i1*nb1 + i0*nb0); - double x0 = ggml_fp16_to_fp32(src[0]); - double x1 = ggml_fp16_to_fp32(src[1]); + const float x0 = ggml_fp16_to_fp32(src[0]); + const float x1 = ggml_fp16_to_fp32(src[1]); dst_data[0] = ggml_fp32_to_fp16(x0*cos_theta - x1*sin_theta); dst_data[1] = ggml_fp32_to_fp16(x0*sin_theta + x1*cos_theta); @@ -7735,7 +7739,7 @@ static void ggml_compute_forward_flash_attn_f32( const int ir0 = dr*ith; const int ir1 = MIN(ir0 + dr, nr); - const float scale = 1.0/sqrt((double) D); + const float scale = 1.0f/sqrtf(D); //printf("P=%d N=%d D=%d ir0=%d ir1=%d scale = %f\n", P, N, D, ir0, ir1, scale); @@ -7782,7 +7786,7 @@ static void ggml_compute_forward_flash_attn_f32( float max = -INFINITY; ggml_vec_max_f32(M, &max, S); - float sum = 0.0f; + ggml_float sum = 0.0; { #ifdef GGML_SOFT_MAX_ACCELERATE max = -max; @@ -7803,7 +7807,7 @@ static void ggml_compute_forward_flash_attn_f32( ggml_fp16_t s = GGML_FP32_TO_FP16(SS[j] - max); memcpy(&scvt[j], &s, sizeof(uint16_t)); const float val = GGML_FP16_TO_FP32(table_exp_f16[scvt[j]]); - sump[j] += val; + sump[j] += (ggml_float)val; SS[j] = val; } } @@ -7815,7 +7819,7 @@ static void ggml_compute_forward_flash_attn_f32( #endif } - assert(sum > 0.0f); + assert(sum > 0.0); sum = 1.0/sum; ggml_vec_scale_f32(M, S, sum); @@ -7944,7 +7948,7 @@ static void ggml_compute_forward_flash_attn_f16( const int ir0 = dr*ith; const int ir1 = MIN(ir0 + dr, nr); - const float scale = 1.0/sqrt((double) D); + const float scale = 1.0f/sqrtf(D); //printf("P=%d N=%d D=%d ir0=%d ir1=%d scale = %f\n", P, N, D, ir0, ir1, scale); @@ -8008,7 +8012,7 @@ static void ggml_compute_forward_flash_attn_f16( float max = -INFINITY; ggml_vec_max_f32(M, &max, S); - float sum = 0.0f; + ggml_float sum = 0.0; { #ifdef GGML_SOFT_MAX_ACCELERATE max = -max; @@ -8029,7 +8033,7 @@ static void ggml_compute_forward_flash_attn_f16( ggml_fp16_t s = GGML_FP32_TO_FP16(SS[j] - max); memcpy(&scvt[j], &s, sizeof(uint16_t)); const float val = GGML_FP16_TO_FP32(table_exp_f16[scvt[j]]); - sump[j] += val; + sump[j] += (ggml_float)val; SS[j] = val; } } @@ -8041,7 +8045,7 @@ static void ggml_compute_forward_flash_attn_f16( #endif } - assert(sum > 0.0f); + assert(sum > 0.0); sum = 1.0/sum; ggml_vec_scale_f32(M, S, sum); @@ -9566,7 +9570,7 @@ label=\"%d [%d, %d] | %s", fprintf(fp, " \"%p\" [ \ style = filled; fillcolor = %s; shape = record; \ label=\"%.1e\"; ]\n", - (void *) node, color, ggml_get_f32_1d(node, 0)); + (void *) node, color, (double)ggml_get_f32_1d(node, 0)); } else { fprintf(fp, " \"%p\" [ \ style = filled; fillcolor = %s; shape = record; \ @@ -9804,7 +9808,7 @@ static enum ggml_opt_result ggml_opt_adam( if (params.past <= t) { const float rate = (pf[t%params.past] - fx)/fx; - if (fabs(rate) < params.delta) { + if (fabsf(rate) < params.delta) { return GGML_OPT_OK; } } @@ -9883,7 +9887,7 @@ static enum ggml_opt_result linesearch_backtracking( const float dec = 0.5f; const float inc = 2.1f; - if (*step <= 0.) { + if (*step <= 0.f) { return GGML_LINESEARCH_INVALID_PARAMETERS; } @@ -9971,7 +9975,7 @@ static enum ggml_opt_result ggml_opt_lbfgs( struct ggml_cgraph * gb) { if (params.lbfgs.linesearch == GGML_LINESEARCH_BACKTRACKING_WOLFE || params.lbfgs.linesearch == GGML_LINESEARCH_BACKTRACKING_STRONG_WOLFE) { - if (params.lbfgs.wolfe <= params.lbfgs.ftol || 1. <= params.lbfgs.wolfe) { + if (params.lbfgs.wolfe <= params.lbfgs.ftol || 1.f <= params.lbfgs.wolfe) { return GGML_OPT_INVALID_WOLFE; } } @@ -10092,8 +10096,8 @@ static enum ggml_opt_result ggml_opt_lbfgs( GGML_PRINT_DEBUG("f = %10.6f\n", ggml_get_f32_1d(f, 0)); - if (xnorm < 1.0) { - xnorm = 1.0; + if (xnorm < 1.0f) { + xnorm = 1.0f; } if (gnorm/xnorm <= params.lbfgs.eps) { // converged @@ -10106,7 +10110,7 @@ static enum ggml_opt_result ggml_opt_lbfgs( if (params.past <= k) { const float rate = (pf[k%params.past] - fx)/fx; - if (fabs(rate) < params.delta) { + if (fabsf(rate) < params.delta) { return GGML_OPT_OK; } } diff --git a/llama.cpp b/llama.cpp index b0eab2e72..ee7eb8ea7 100644 --- a/llama.cpp +++ b/llama.cpp @@ -779,8 +779,8 @@ static bool llama_model_load( // progress if (progress_callback) { - double current_file_progress = double(size_t(fin.tellg()) - file_offset) / double(file_size - file_offset); - double current_progress = (double(i) + current_file_progress) / double(n_parts); + float current_file_progress = float(size_t(fin.tellg()) - file_offset) / float(file_size - file_offset); + float current_progress = (float(i) + current_file_progress) / float(n_parts); progress_callback(current_progress, progress_callback_user_data); } if (model.n_loaded % 8 == 0) { @@ -922,7 +922,7 @@ static bool llama_eval_internal( struct ggml_tensor * KQ_scaled = ggml_scale(ctx0, KQ, - ggml_new_f32(ctx0, 1.0f/sqrt(float(n_embd)/n_head))); + ggml_new_f32(ctx0, 1.0f/sqrtf(float(n_embd)/n_head))); // KQ_masked = mask_past(KQ_scaled) struct ggml_tensor * KQ_masked = ggml_diag_mask_inf(ctx0, KQ_scaled, n_past); @@ -1240,12 +1240,12 @@ static std::vector llama_tokenize(const llama_vocab & vocab, co // sampling // -static void sample_top_k(std::vector> & logits_id, int top_k) { +static void sample_top_k(std::vector> & logits_id, int top_k) { // find the top k tokens std::partial_sort( logits_id.begin(), logits_id.begin() + top_k, logits_id.end(), - [](const std::pair & a, const std::pair & b) { + [](const std::pair & a, const std::pair & b) { return a.first > b.first; }); @@ -1256,9 +1256,9 @@ static llama_vocab::id llama_sample_top_p_top_k( llama_context & lctx, const std::vector & last_n_tokens, int top_k, - double top_p, - double temp, - double repeat_penalty) { + float top_p, + float temp, + float repeat_penalty) { auto & rng = lctx.rng; const int n_logits = lctx.model.hparams.n_vocab; @@ -1266,17 +1266,17 @@ static llama_vocab::id llama_sample_top_p_top_k( const auto & logits = lctx.logits; const auto * plogits = logits.data() + logits.size() - n_logits; - std::vector> logits_id; + std::vector> logits_id; logits_id.reserve(n_logits); { - const double scale = 1.0/temp; + const float scale = 1.0f/temp; for (int i = 0; i < n_logits; ++i) { // repetition penalty from ctrl paper (https://arxiv.org/abs/1909.05858) // credit https://github.com/facebookresearch/llama/compare/main...shawwn:llama:main if (std::find(last_n_tokens.begin(), last_n_tokens.end(), i) != last_n_tokens.end()) { // if score < 0 then repetition penalty has to multiplied to reduce the previous token probability - if (plogits[i] < 0.0) { + if (plogits[i] < 0.0f) { logits_id.push_back(std::make_pair(plogits[i]*scale*repeat_penalty, i)); } else { logits_id.push_back(std::make_pair(plogits[i]*scale/repeat_penalty, i)); @@ -1289,18 +1289,18 @@ static llama_vocab::id llama_sample_top_p_top_k( sample_top_k(logits_id, top_k); - double maxl = -std::numeric_limits::infinity(); + float maxl = -std::numeric_limits::infinity(); for (const auto & kv : logits_id) { maxl = std::max(maxl, kv.first); } // compute probs for the top k tokens - std::vector probs; + std::vector probs; probs.reserve(logits_id.size()); double sum = 0.0; for (const auto & kv : logits_id) { - double p = exp(kv.first - maxl); + const float p = expf(kv.first - maxl); probs.push_back(p); sum += p; } @@ -1310,8 +1310,8 @@ static llama_vocab::id llama_sample_top_p_top_k( p /= sum; } - if (top_p < 1.0f) { - double cumsum = 0.0f; + if (top_p < 1.0) { + double cumsum = 0.0; for (int i = 0; i < (int) probs.size(); i++) { cumsum += probs[i]; if (cumsum >= top_p) { @@ -1590,7 +1590,7 @@ static bool llama_model_quantize_internal(const std::string & fname_inp, const s } for (int i = 0; i < (int) hist_cur.size(); ++i) { - printf("%5.3f ", hist_cur[i] / (float)nelements); + printf("%5.3f ", hist_cur[i] / float(nelements)); } printf("\n"); } else { @@ -1613,7 +1613,7 @@ static bool llama_model_quantize_internal(const std::string & fname_inp, const s printf("%s: hist: ", __func__); for (int i = 0; i < (int) hist_all.size(); ++i) { - printf("%5.3f ", hist_all[i] / (float)sum_all); + printf("%5.3f ", hist_all[i] / float(sum_all)); } printf("\n"); } @@ -1795,9 +1795,9 @@ llama_token llama_sample_top_p_top_k( const llama_token * last_n_tokens_data, int last_n_tokens_size, int top_k, - double top_p, - double temp, - double repeat_penalty) { + float top_p, + float temp, + float repeat_penalty) { const int64_t t_start_sample_us = ggml_time_us(); llama_token result = 0; @@ -1828,11 +1828,11 @@ void llama_print_timings(struct llama_context * ctx) { const int32_t n_p_eval = std::max(1, ctx->n_p_eval); fprintf(stderr, "\n"); - fprintf(stderr, "%s: load time = %8.2f ms\n", __func__, ctx->t_load_us / 1000.0f); - fprintf(stderr, "%s: sample time = %8.2f ms / %5d runs (%8.2f ms per run)\n", __func__, 1e-3f * ctx->t_sample_us, n_sample, 1e-3f * ctx->t_sample_us / n_sample); - fprintf(stderr, "%s: prompt eval time = %8.2f ms / %5d tokens (%8.2f ms per token)\n", __func__, 1e-3f * ctx->t_p_eval_us, n_p_eval, 1e-3f * ctx->t_p_eval_us / n_p_eval); - fprintf(stderr, "%s: eval time = %8.2f ms / %5d runs (%8.2f ms per run)\n", __func__, 1e-3f * ctx->t_eval_us, n_eval, 1e-3f * ctx->t_eval_us / n_eval); - fprintf(stderr, "%s: total time = %8.2f ms\n", __func__, (t_end_us - ctx->t_start_us)/1000.0f); + fprintf(stderr, "%s: load time = %8.2f ms\n", __func__, ctx->t_load_us / 1000.0); + fprintf(stderr, "%s: sample time = %8.2f ms / %5d runs (%8.2f ms per run)\n", __func__, 1e-3 * ctx->t_sample_us, n_sample, 1e-3 * ctx->t_sample_us / n_sample); + fprintf(stderr, "%s: prompt eval time = %8.2f ms / %5d tokens (%8.2f ms per token)\n", __func__, 1e-3 * ctx->t_p_eval_us, n_p_eval, 1e-3 * ctx->t_p_eval_us / n_p_eval); + fprintf(stderr, "%s: eval time = %8.2f ms / %5d runs (%8.2f ms per run)\n", __func__, 1e-3 * ctx->t_eval_us, n_eval, 1e-3 * ctx->t_eval_us / n_eval); + fprintf(stderr, "%s: total time = %8.2f ms\n", __func__, (t_end_us - ctx->t_start_us)/1000.0); } void llama_reset_timings(struct llama_context * ctx) { diff --git a/llama.h b/llama.h index d3f4cae61..f5a576c1e 100644 --- a/llama.h +++ b/llama.h @@ -45,7 +45,7 @@ extern "C" { } llama_token_data; - typedef void (*llama_progress_callback)(double progress, void *ctx); + typedef void (*llama_progress_callback)(float progress, void *ctx); struct llama_context_params { int n_ctx; // text context @@ -134,9 +134,9 @@ extern "C" { const llama_token * last_n_tokens_data, int last_n_tokens_size, int top_k, - double top_p, - double temp, - double repeat_penalty); + float top_p, + float temp, + float repeat_penalty); // Performance information LLAMA_API void llama_print_timings(struct llama_context * ctx); diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index b44d7fe7e..157d7336e 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -5,5 +5,6 @@ function(llama_add_test source) add_test(NAME ${TEST_TARGET} COMMAND $ ${ARGN}) endfunction() +# llama_add_test(test-double-float.c) # SLOW llama_add_test(test-quantize.c) llama_add_test(test-tokenizer-0.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab.bin) diff --git a/tests/test-double-float.c b/tests/test-double-float.c new file mode 100644 index 000000000..89dafc9f2 --- /dev/null +++ b/tests/test-double-float.c @@ -0,0 +1,53 @@ +// These tests may take a long time! +// They are to prove that conversion from double to float of various functions in ggml.c doesn't affect the result. +// This is done by checking all finite (non-NaN, non-infinite) floats. + +#undef NDEBUG +#include +#include +#include +#include + +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wdouble-promotion" + +// ggml.c::quantize_row_q4_0_reference +inline static uint8_t round_orig(float v0) { return ((int8_t) (round(v0))) + 8; } + +// ggml.c::ggml_silu_f32 +inline static float silu_orig(float x) { + return x/(1.0 + exp(-x)); +} + +#pragma GCC diagnostic pop + +// ggml.c::quantize_row_q4_0_reference +inline static uint8_t round_float(float v0) { return (int8_t)roundf(v0) + 8; } + +// ggml.c::ggml_silu_f32 +inline static float silu_float(float x) { + return x/(1.0f + expf(-x)); +} + +int main(void) { + uint32_t x = UINT32_MAX; + do { + float f = *(float *)&x; + assert(!isfinite(f) || (round_orig(f) == round_float(f))); + } while (x--); + +#ifdef __F16C__ + // GELU and SILU implementations are used with a FP16 lookup table. + // The original and float-only results are not equal for all inputs after converting to FP16. + // GELU is an approximation anyway (tanh), not tested here. + // For SILU, verify that the results are at least the closest floating point numbers, if the FP16 values don't match. + for (x = 0; x <= UINT16_MAX; x++) { + float f = _cvtsh_ss(x); + const float so = silu_orig(f); + const float sf = silu_float(f); + assert( (_cvtss_sh(so, 0) == _cvtss_sh(sf, 0)) + || (nextafterf(so, sf) == sf) + || (nextafterf(sf, so) == so)); + } +#endif +} From d502bc7c9d9d6dfb3a09aea404395b666d7b374d Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Tue, 28 Mar 2023 19:51:55 +0300 Subject: [PATCH 10/12] tests : free llama context at the end of the test --- CMakeLists.txt | 3 ++- tests/test-tokenizer-0.cpp | 2 ++ 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 241be4c15..d7b0eba29 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -129,13 +129,14 @@ if (LLAMA_ALL_WARNINGS) -Wshadow -Wstrict-prototypes -Wpointer-arith + -Wno-unused-function ) set(cxx_flags -Wall -Wextra -Wpedantic -Wcast-qual - -Wdouble-promotion + -Wno-unused-function ) else() # todo : msvc diff --git a/tests/test-tokenizer-0.cpp b/tests/test-tokenizer-0.cpp index 382055324..55b086dae 100644 --- a/tests/test-tokenizer-0.cpp +++ b/tests/test-tokenizer-0.cpp @@ -77,5 +77,7 @@ int main(int argc, char **argv) { } } + llama_free(ctx); + return 0; } From 96f9c0506fa81cada6f96f45768c34f45406c4bb Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Tue, 28 Mar 2023 20:01:09 +0300 Subject: [PATCH 11/12] ci : make ctest verbose, hopefully we see what is wrong with the sanitizer --- .github/workflows/build.yml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 26b451943..cb35a3298 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -62,7 +62,7 @@ jobs: id: cmake_test run: | cd build - ctest --output-on-failure + ctest --verbose ubuntu-latest-cmake-sanitizer: runs-on: ubuntu-latest @@ -98,7 +98,7 @@ jobs: id: cmake_test run: | cd build - ctest --output-on-failure + ctest --verbose macOS-latest-make: runs-on: macos-latest @@ -143,7 +143,7 @@ jobs: id: cmake_test run: | cd build - ctest --output-on-failure + ctest --verbose windows-latest-cmake: runs-on: windows-latest @@ -185,7 +185,7 @@ jobs: if: ${{ matrix.build != 'avx512' || env.HAS_AVX512F == '1' }} # Test AVX-512 only when possible run: | cd build - ctest -C Release --output-on-failure + ctest -C Release --verbose - name: Get commit hash id: commit From 692ce3164ef1201ecb9cfad315cc0a08b965adb8 Mon Sep 17 00:00:00 2001 From: "DooWoong Lee (David)" Date: Wed, 29 Mar 2023 02:02:34 +0900 Subject: [PATCH 12/12] py : removed unused `model` variable and verified that the code functions correctly with `vocab_only` setting. Also confirmed that the code works as expected after running with reduced memory usage due to deletion of no-longer-needed variable. (#547) --- convert-pth-to-ggml.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/convert-pth-to-ggml.py b/convert-pth-to-ggml.py index ccf2c57b1..d83f8a137 100644 --- a/convert-pth-to-ggml.py +++ b/convert-pth-to-ggml.py @@ -145,13 +145,11 @@ def main(): print(f"Extracting only the vocab from '{fname_model}'\n") - model = torch.load(fname_model, map_location="cpu") with open(fname_out, "wb") as fout: write_header(fout, hparams, ftype) write_tokens(fout, tokenizer) - del model print(f"Done. Output file: {fname_out}\n")