From 4aff73fa6538af2fe3a6448b9bebb62807da7794 Mon Sep 17 00:00:00 2001 From: Alexander Krivutsenko Date: Thu, 21 Dec 2023 18:01:55 +0100 Subject: [PATCH] Fixed default MSVS build --- CMakeLists.txt | 3 + common/common.cpp | 1 + common/log.h | 1 + examples/baby-llama/baby-llama.cpp | 1 + examples/infill/infill.cpp | 12 +- examples/infill/infill.cpp.bak | 765 +++++++++++++++++++++++++ examples/main/main.cpp | 12 +- examples/main/main.cpp.bak | 873 +++++++++++++++++++++++++++++ examples/quantize/quantize.cpp | 1 + ggml-quants.c | 558 +++++++++--------- ggml-quants.h | 92 +-- ggml.c | 30 +- ggml.h | 4 +- 13 files changed, 2003 insertions(+), 350 deletions(-) create mode 100644 examples/infill/infill.cpp.bak create mode 100644 examples/main/main.cpp.bak diff --git a/CMakeLists.txt b/CMakeLists.txt index e3cd43ab3..fa3104306 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -500,6 +500,9 @@ if (LLAMA_CUBLAS) endif() if (WIN32) + if (MSVC) + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /bigobj") + endif() add_compile_definitions(_CRT_SECURE_NO_WARNINGS) if (BUILD_SHARED_LIBS) diff --git a/common/common.cpp b/common/common.cpp index 93d5483e4..8d3871f79 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -16,6 +16,7 @@ #include #include #include +#include #if defined(__APPLE__) && defined(__MACH__) #include diff --git a/common/log.h b/common/log.h index e4e1b9f4f..5745719df 100644 --- a/common/log.h +++ b/common/log.h @@ -8,6 +8,7 @@ #include #include #include +#include // -------------------------------- // diff --git a/examples/baby-llama/baby-llama.cpp b/examples/baby-llama/baby-llama.cpp index 2dc2988d3..b461965e8 100644 --- a/examples/baby-llama/baby-llama.cpp +++ b/examples/baby-llama/baby-llama.cpp @@ -7,6 +7,7 @@ #include #include #include +#include #if defined(_MSC_VER) #pragma warning(disable: 4244 4267) // possible loss of data diff --git a/examples/infill/infill.cpp b/examples/infill/infill.cpp index 4a7827876..87a81e5d0 100644 --- a/examples/infill/infill.cpp +++ b/examples/infill/infill.cpp @@ -101,6 +101,13 @@ static void sigint_handler(int signo) { } #endif +#if defined (_WIN32) + BOOL ConsoleCtrlHandler(DWORD ctrl_type) + { + return (ctrl_type == CTRL_C_EVENT) ? (sigint_handler(SIGINT), true) : false; + } +#endif + int main(int argc, char ** argv) { gpt_params params; llama_sampling_params & sparams = params.sparams; @@ -344,10 +351,7 @@ int main(int argc, char ** argv) { sigint_action.sa_flags = 0; sigaction(SIGINT, &sigint_action, NULL); #elif defined (_WIN32) - auto console_ctrl_handler = +[](DWORD ctrl_type) -> BOOL { - return (ctrl_type == CTRL_C_EVENT) ? (sigint_handler(SIGINT), true) : false; - }; - SetConsoleCtrlHandler(reinterpret_cast(console_ctrl_handler), true); + SetConsoleCtrlHandler(reinterpret_cast(ConsoleCtrlHandler), TRUE); #endif LOG_TEE("%s: interactive mode on.\n", __func__); diff --git a/examples/infill/infill.cpp.bak b/examples/infill/infill.cpp.bak new file mode 100644 index 000000000..4a7827876 --- /dev/null +++ b/examples/infill/infill.cpp.bak @@ -0,0 +1,765 @@ +#include "common.h" + +#include "console.h" +#include "llama.h" +#include "grammar-parser.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) +#include +#include +#elif defined (_WIN32) +#define WIN32_LEAN_AND_MEAN +#ifndef NOMINMAX +#define NOMINMAX +#endif +#include +#include +#endif + +#if defined(_MSC_VER) +#pragma warning(disable: 4244 4267) // possible loss of data +#endif + +static llama_context ** g_ctx; +static llama_model ** g_model; +static gpt_params * g_params; +static std::vector * g_input_tokens; +static std::ostringstream * g_output_ss; +static std::vector * g_output_tokens; + +static bool is_interacting = false; + +static void write_logfile( + const llama_context * ctx, const gpt_params & params, const llama_model * model, + const std::vector & input_tokens, const std::string & output, + const std::vector & output_tokens +) { + if (params.logdir.empty()) { + return; + } + + const std::string timestamp = get_sortable_timestamp(); + + const bool success = create_directory_with_parents(params.logdir); + if (!success) { + fprintf(stderr, "%s: warning: failed to create logdir %s, cannot write logfile\n", + __func__, params.logdir.c_str()); + return; + } + + const std::string logfile_path = params.logdir + timestamp + ".yml"; + FILE * logfile = fopen(logfile_path.c_str(), "w"); + + if (logfile == NULL) { + fprintf(stderr, "%s: failed to open logfile %s\n", __func__, logfile_path.c_str()); + return; + } + + fprintf(logfile, "binary: infill\n"); + char model_desc[128]; + llama_model_desc(model, model_desc, sizeof(model_desc)); + dump_non_result_info_yaml(logfile, params, ctx, timestamp, input_tokens, model_desc); + + fprintf(logfile, "\n"); + fprintf(logfile, "######################\n"); + fprintf(logfile, "# Generation Results #\n"); + fprintf(logfile, "######################\n"); + fprintf(logfile, "\n"); + + dump_string_yaml_multiline(logfile, "output", output.c_str()); + dump_vector_int_yaml(logfile, "output_tokens", output_tokens); + + llama_dump_timing_info_yaml(logfile, ctx); + fclose(logfile); +} + +#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) || defined (_WIN32) +static void sigint_handler(int signo) { + if (signo == SIGINT) { + if (!is_interacting) { + is_interacting = true; + } else { + console::cleanup(); + printf("\n"); + llama_print_timings(*g_ctx); + write_logfile(*g_ctx, *g_params, *g_model, *g_input_tokens, g_output_ss->str(), *g_output_tokens); + _exit(130); + } + } +} +#endif + +int main(int argc, char ** argv) { + gpt_params params; + llama_sampling_params & sparams = params.sparams; + g_params = ¶ms; + + if (!gpt_params_parse(argc, argv, params)) { + return 1; + } + +#ifndef LOG_DISABLE_LOGS + log_set_target(log_filename_generator("infill", "log")); + LOG_TEE("Log start\n"); + log_dump_cmdline(argc, argv); +#endif // LOG_DISABLE_LOGS + + console::init(params.simple_io, params.use_color); + atexit([]() { console::cleanup(); }); + + if (params.logits_all) { + printf("\n************\n"); + printf("%s: please use the 'perplexity' tool for perplexity calculations\n", __func__); + printf("************\n\n"); + + return 0; + } + + if (params.embedding) { + printf("\n************\n"); + printf("%s: please use the 'embedding' tool for embedding calculations\n", __func__); + printf("************\n\n"); + + return 0; + } + + if (params.n_ctx != 0 && params.n_ctx < 8) { + LOG_TEE("%s: warning: minimum context size is 8, using minimum size.\n", __func__); + params.n_ctx = 8; + } + if (params.instruct) { + printf("\n************\n"); + printf("%s: please use the 'main' tool for instruct mode\n", __func__); + printf("************\n\n"); + + return 0; + } + if (params.chatml) { + printf("\n************\n"); + printf("%s: please use the 'main' tool for chatml mode\n", __func__); + printf("************\n\n"); + + return 0; + } + if (!params.antiprompt.empty()) { + printf("\n************\n"); + printf("%s: please use the 'main' tool for antiprompt mode\n", __func__); + printf("************\n\n"); + + return 0; + } + if (!params.interactive_first && (params.input_prefix.empty() && params.input_suffix.empty())) { + printf("\n************\n"); + printf("%s: please use '--interactive_first' or specify '--in_prefix' and/or '--in_suffix'\n", __func__); + printf("************\n\n"); + + return 0; + } + if (params.random_prompt) { + printf("\n************\n"); + printf("%s: please use the 'main' tool for random prompt mode\n", __func__); + printf("************\n\n"); + + return 0; + } + if (!params.path_prompt_cache.empty()) { + printf("\n************\n"); + printf("%s: infill does not support prompt caching\n", __func__); + printf("************\n\n"); + + return 0; + } + + if (params.rope_freq_base != 0.0) { + LOG_TEE("%s: warning: changing RoPE frequency base to %g.\n", __func__, params.rope_freq_base); + } + + if (params.rope_freq_scale != 0.0) { + LOG_TEE("%s: warning: scaling RoPE frequency by %g.\n", __func__, params.rope_freq_scale); + } + + LOG_TEE("%s: build = %d (%s)\n", __func__, LLAMA_BUILD_NUMBER, LLAMA_COMMIT); + LOG_TEE("%s: built with %s for %s\n", __func__, LLAMA_COMPILER, LLAMA_BUILD_TARGET); + + if (params.seed == LLAMA_DEFAULT_SEED) { + params.seed = time(NULL); + } + + LOG_TEE("%s: seed = %u\n", __func__, params.seed); + + std::mt19937 rng(params.seed); + + LOG("%s: llama backend init\n", __func__); + llama_backend_init(params.numa); + + llama_model * model; + llama_context * ctx; + llama_context * ctx_guidance = NULL; + g_model = &model; + g_ctx = &ctx; + + // load the model and apply lora adapter, if any + LOG("%s: load the model and apply lora adapter, if any\n", __func__); + std::tie(model, ctx) = llama_init_from_gpt_params(params); + if (sparams.cfg_scale > 1.f) { + struct llama_context_params lparams = llama_context_params_from_gpt_params(params); + ctx_guidance = llama_new_context_with_model(model, lparams); + } + + if (model == NULL) { + LOG_TEE("%s: error: unable to load model\n", __func__); + return 1; + } + + const int n_ctx_train = llama_n_ctx_train(model); + const int n_ctx = llama_n_ctx(ctx); + LOG("n_ctx: %d\n", n_ctx); + + if (n_ctx > n_ctx_train) { + LOG_TEE("%s: warning: model was trained on only %d context tokens (%d specified)\n", + __func__, n_ctx_train, n_ctx); + } + + // print system information + { + LOG_TEE("\n"); + LOG_TEE("%s\n", get_system_info(params).c_str()); + } + const bool add_bos = llama_should_add_bos_token(model); + LOG("add_bos: %d\n", add_bos); + + bool suff_rm_leading_spc = params.escape; + if (suff_rm_leading_spc && params.input_suffix.find_first_of(" ") == 0 && params.input_suffix.size() > 1) { + params.input_suffix.erase(0, 1); + suff_rm_leading_spc = false; + } + std::vector embd_inp; + std::vector inp_pfx = ::llama_tokenize(ctx, params.input_prefix, false); + std::vector inp_sfx = ::llama_tokenize(ctx, params.input_suffix, false); + const int space_token = 29871; + if (suff_rm_leading_spc && inp_sfx[0] == space_token) { + inp_sfx.erase(inp_sfx.begin()); + } + inp_pfx.insert(inp_pfx.begin(), llama_token_prefix(model)); + if (add_bos) { + inp_pfx.insert(inp_pfx.begin(), llama_token_bos(model)); + } + inp_sfx.insert(inp_sfx.begin(), llama_token_suffix(model)); + embd_inp = inp_pfx; + embd_inp.insert(embd_inp.end(), inp_sfx.begin(), inp_sfx.end()); + embd_inp.push_back(llama_token_middle(model)); + + LOG("prefix: \"%s\"\n", log_tostr(params.input_prefix)); + LOG("suffix: \"%s\"\n", log_tostr(params.input_suffix)); + LOG("tokens: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd_inp).c_str()); + + // Should not run without any tokens + if (embd_inp.empty()) { + embd_inp.push_back(llama_token_bos(model)); + LOG("embd_inp was considered empty and bos was added: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd_inp).c_str()); + } + + // Tokenize negative prompt + std::vector guidance_inp; + int guidance_offset = 0; + int original_prompt_len = 0; + if (ctx_guidance) { + LOG("cfg_negative_prompt: \"%s\"\n", log_tostr(sparams.cfg_negative_prompt)); + + guidance_inp = ::llama_tokenize(ctx_guidance, sparams.cfg_negative_prompt, add_bos); + LOG("guidance_inp tokenized: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx_guidance, guidance_inp).c_str()); + + std::vector original_inp = ::llama_tokenize(ctx, params.prompt, add_bos); + LOG("original_inp tokenized: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, original_inp).c_str()); + + original_prompt_len = original_inp.size(); + guidance_offset = (int)guidance_inp.size() - original_prompt_len; + LOG("original_prompt_len: %s", log_tostr(original_prompt_len)); + LOG("guidance_offset: %s", log_tostr(guidance_offset)); + } + + if ((int) embd_inp.size() > n_ctx - 4) { + LOG_TEE("%s: error: prompt is too long (%d tokens, max %d)\n", __func__, (int) embd_inp.size(), n_ctx - 4); + return 1; + } + + // number of tokens to keep when resetting context + if (params.n_keep < 0 || params.n_keep > (int) embd_inp.size()) { + params.n_keep = (int)embd_inp.size(); + } + + LOG("inp_pfx: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, inp_pfx).c_str()); + LOG("inp_sfx: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, inp_sfx).c_str()); + + + // enable interactive mode if interactive start is specified + if (params.interactive_first) { + params.interactive = true; + } + + if (params.verbose_prompt) { + LOG_TEE("\n"); + LOG_TEE("%s: prompt: '%s'\n", __func__, params.prompt.c_str()); + LOG_TEE("%s: number of tokens in prompt = %zu\n", __func__, embd_inp.size()); + for (int i = 0; i < (int) embd_inp.size(); i++) { + LOG_TEE("%6d -> '%s'\n", embd_inp[i], llama_token_to_piece(ctx, embd_inp[i]).c_str()); + } + + if (ctx_guidance) { + LOG_TEE("\n"); + LOG_TEE("%s: negative prompt: '%s'\n", __func__, sparams.cfg_negative_prompt.c_str()); + LOG_TEE("%s: number of tokens in negative prompt = %zu\n", __func__, guidance_inp.size()); + for (int i = 0; i < (int) guidance_inp.size(); i++) { + LOG_TEE("%6d -> '%s'\n", guidance_inp[i], llama_token_to_piece(ctx, guidance_inp[i]).c_str()); + } + } + + if (params.n_keep > 0) { + LOG_TEE("%s: static prompt based on n_keep: '", __func__); + for (int i = 0; i < params.n_keep; i++) { + LOG_TEE("%s", llama_token_to_piece(ctx, embd_inp[i]).c_str()); + } + LOG_TEE("'\n"); + } + LOG_TEE("\n"); + } + + if (params.interactive) { +#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) + struct sigaction sigint_action; + sigint_action.sa_handler = sigint_handler; + sigemptyset (&sigint_action.sa_mask); + sigint_action.sa_flags = 0; + sigaction(SIGINT, &sigint_action, NULL); +#elif defined (_WIN32) + auto console_ctrl_handler = +[](DWORD ctrl_type) -> BOOL { + return (ctrl_type == CTRL_C_EVENT) ? (sigint_handler(SIGINT), true) : false; + }; + SetConsoleCtrlHandler(reinterpret_cast(console_ctrl_handler), true); +#endif + + LOG_TEE("%s: interactive mode on.\n", __func__); + + if (params.input_prefix_bos) { + LOG_TEE("Input prefix with BOS\n"); + } + + if (!params.input_prefix.empty()) { + LOG_TEE("Input prefix: '%s'\n", params.input_prefix.c_str()); + } + + if (!params.input_suffix.empty()) { + LOG_TEE("Input suffix: '%s'\n", params.input_suffix.c_str()); + } + } + LOG_TEE("sampling: \n%s\n", llama_sampling_print(sparams).c_str()); + LOG_TEE("generate: n_ctx = %d, n_batch = %d, n_predict = %d, n_keep = %d\n", n_ctx, params.n_batch, params.n_predict, params.n_keep); + LOG_TEE("\n\n"); + + LOG_TEE("\n##### Infill mode #####\n\n"); + if (params.infill) { + printf("\n************\n"); + printf("no need to specify '--infill', always running infill\n"); + printf("************\n\n"); + } + if (params.interactive) { + const char *control_message; + if (params.multiline_input) { + control_message = " - To return control to LLaMa, end your input with '\\'.\n" + " - To return control without starting a new line, end your input with '/'.\n"; + } else { + control_message = " - Press Return to return control to LLaMa.\n" + " - To return control without starting a new line, end your input with '/'.\n" + " - If you want to submit another line, end your input with '\\'.\n"; + } + LOG_TEE("== Running in interactive mode. ==\n"); +#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) || defined (_WIN32) + LOG_TEE( " - Press Ctrl+C to interject at any time.\n"); +#endif + LOG_TEE( "%s\n", control_message); + + is_interacting = params.interactive_first; + } + + bool input_echo = true; + + int n_past = 0; + int n_remain = params.n_predict; + int n_consumed = 0; + int n_past_guidance = 0; + + std::vector input_tokens; g_input_tokens = &input_tokens; + std::vector output_tokens; g_output_tokens = &output_tokens; + std::ostringstream output_ss; g_output_ss = &output_ss; + + // the first thing we will do is to output the prompt, so set color accordingly + console::set_display(console::prompt); + + std::vector embd; + std::vector embd_guidance; + + struct llama_sampling_context * ctx_sampling = llama_sampling_init(sparams); + + while (n_remain != 0 || params.interactive) { + // predict + if (!embd.empty()) { + // Note: n_ctx - 4 here is to match the logic for commandline prompt handling via + // --prompt or --file which uses the same value. + int max_embd_size = n_ctx - 4; + + // Ensure the input doesn't exceed the context size by truncating embd if necessary. + if ((int) embd.size() > max_embd_size) { + const int skipped_tokens = (int) embd.size() - max_embd_size; + embd.resize(max_embd_size); + + console::set_display(console::error); + printf("<>", skipped_tokens, skipped_tokens != 1 ? "s" : ""); + console::set_display(console::reset); + fflush(stdout); + } + + // infinite text generation via context swapping + // if we run out of context: + // - take the n_keep first tokens from the original prompt (via n_past) + // - take half of the last (n_ctx - n_keep) tokens and recompute the logits in batches + if (n_past + (int) embd.size() + std::max(0, guidance_offset) > n_ctx) { + if (params.n_predict == -2) { + LOG_TEE("\n\n%s: context full and n_predict == -%d => stopping\n", __func__, params.n_predict); + break; + } + + const int n_left = n_past - params.n_keep - 1; + const int n_discard = n_left/2; + + LOG("context full, swapping: n_past = %d, n_left = %d, n_ctx = %d, n_keep = %d, n_discard = %d\n", + n_past, n_left, n_ctx, params.n_keep, n_discard); + + llama_kv_cache_seq_rm (ctx, 0, params.n_keep + 1 , params.n_keep + n_discard + 1); + llama_kv_cache_seq_shift(ctx, 0, params.n_keep + 1 + n_discard, n_past, -n_discard); + + n_past -= n_discard; + + if (ctx_guidance) { + n_past_guidance -= n_discard; + } + + LOG("after swap: n_past = %d, n_past_guidance = %d\n", n_past, n_past_guidance); + + LOG("embd: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd).c_str()); + + } + + // evaluate tokens in batches + // embd is typically prepared beforehand to fit within a batch, but not always + + if (ctx_guidance) { + int input_size = 0; + llama_token * input_buf = NULL; + + if (n_past_guidance < (int) guidance_inp.size()) { + // Guidance context should have the same data with these modifications: + // + // * Replace the initial prompt + // * Shift everything by guidance_offset + embd_guidance = guidance_inp; + if (embd.begin() + original_prompt_len < embd.end()) { + embd_guidance.insert( + embd_guidance.end(), + embd.begin() + original_prompt_len, + embd.end() + ); + } + + input_buf = embd_guidance.data(); + input_size = embd_guidance.size(); + + LOG("guidance context: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd_guidance).c_str()); + } else { + input_buf = embd.data(); + input_size = embd.size(); + } + + for (int i = 0; i < input_size; i += params.n_batch) { + int n_eval = std::min(input_size - i, params.n_batch); + if (llama_decode(ctx_guidance, llama_batch_get_one(input_buf + i, n_eval, n_past_guidance, 0))) { + LOG_TEE("%s : failed to eval\n", __func__); + return 1; + } + + n_past_guidance += n_eval; + } + } + + for (int i = 0; i < (int) embd.size(); i += params.n_batch) { + int n_eval = (int) embd.size() - i; + if (n_eval > params.n_batch) { + n_eval = params.n_batch; + } + + LOG("eval: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd).c_str()); + + if (llama_decode(ctx, llama_batch_get_one(&embd[i], n_eval, n_past, 0))) { + LOG_TEE("%s : failed to eval\n", __func__); + return 1; + } + + n_past += n_eval; + + LOG("n_past = %d\n", n_past); + } + + } + + embd.clear(); + embd_guidance.clear(); + + if ((int) embd_inp.size() <= n_consumed && !is_interacting) { + + const llama_token id = llama_sampling_sample(ctx_sampling, ctx, ctx_guidance); + + llama_sampling_accept(ctx_sampling, ctx, id, true); + + LOG("last: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, ctx_sampling->prev).c_str()); + + embd.push_back(id); + + // echo this to console + input_echo = true; + + // decrement remaining sampling budget + --n_remain; + + LOG("n_remain: %d\n", n_remain); + } else { + // some user input remains from prompt or interaction, forward it to processing + LOG("embd_inp.size(): %d, n_consumed: %d\n", (int) embd_inp.size(), n_consumed); + while ((int) embd_inp.size() > n_consumed) { + embd.push_back(embd_inp[n_consumed]); + + // push the prompt in the sampling context in order to apply repetition penalties later + // for the prompt, we don't apply grammar rules + llama_sampling_accept(ctx_sampling, ctx, embd_inp[n_consumed], false); + + ++n_consumed; + if ((int) embd.size() >= params.n_batch) { + break; + } + } + } + + // display text + if (input_echo) { + for (auto id : embd) { + const std::string token_str = llama_token_to_piece(ctx, id); + printf("%s", token_str.c_str()); + + if (embd.size() > 1) { + input_tokens.push_back(id); + } else { + output_tokens.push_back(id); + output_ss << token_str; + } + } + fflush(stdout); + } + // reset color to default if we there is no pending user input + if (input_echo && (int) embd_inp.size() == n_consumed) { + console::set_display(console::reset); + } + + // if not currently processing queued inputs; + if ((int) embd_inp.size() <= n_consumed) { + + // deal with eot token in infill mode + if ((llama_sampling_last(ctx_sampling) == llama_token_eot(model) || is_interacting) && params.interactive){ + if(is_interacting && !params.interactive_first) { + // print an eot token + printf("%s", llama_token_to_piece(ctx, llama_token_eot(model)).c_str()); + } + fflush(stdout); + printf("\n"); + console::set_display(console::user_input); + std::string buffer; + std::string line; + bool another_line=true; + // set a new prefix via stdin + do { + another_line = console::readline(line, params.multiline_input); + buffer += line; + } while (another_line); + // check if we got an empty line, if so we use the old input + if (!buffer.empty() && !(buffer.length() == 1 && buffer[0] == '\n')) { + params.input_prefix = buffer; + } + buffer.clear(); + // set a new suffix via stdin + do { + another_line = console::readline(line, params.multiline_input); + buffer += line; + } while (another_line); + // check if we got an empty line + if (!buffer.empty() && !(buffer.length() == 1 && buffer[0] == '\n')) { + params.input_suffix = buffer; + } + buffer.clear(); + // done taking input, reset color + console::set_display(console::reset); + + if (params.escape) { + //process escape sequences, for the initial prompt this is done in common.cpp when we load the params, but for the interactive mode we need to do it here + process_escapes(params.input_prefix); + process_escapes(params.input_suffix); + } + suff_rm_leading_spc = params.escape; + if (suff_rm_leading_spc && params.input_suffix.find_first_of(' ') == 0 && params.input_suffix.size() > 1) { + params.input_suffix.erase(0, 1); + suff_rm_leading_spc = false; + } + // tokenize new prefix and suffix + std::vector inp_pfx = ::llama_tokenize(ctx, params.input_prefix, false); + std::vector inp_sfx = ::llama_tokenize(ctx, params.input_suffix, false); + if (suff_rm_leading_spc && inp_sfx[0] == space_token) { + inp_sfx.erase(inp_sfx.begin()); + } + inp_pfx.insert(inp_pfx.begin(), llama_token_prefix(model)); + if (add_bos) { + inp_pfx.insert(inp_pfx.begin(), llama_token_bos(model)); + } + inp_sfx.insert(inp_sfx.begin(), llama_token_suffix(model)); + embd_inp = inp_pfx; + embd_inp.insert(embd_inp.end(), inp_sfx.begin(), inp_sfx.end()); + embd_inp.push_back(llama_token_middle(model)); + embd.clear(); + embd_guidance.clear(); + n_remain = params.n_predict; + n_past = 0; + n_consumed = 0; + // LOG_TEE("took new input\n"); + is_interacting = false; + } + // deal with end of text token in interactive mode + else if (llama_sampling_last(ctx_sampling) == llama_token_eos(model)) { + LOG("found EOS token\n"); + + if (params.interactive) { + + is_interacting = true; + printf("\n"); + console::set_display(console::user_input); + fflush(stdout); + } + } + + if (n_past > 0 && is_interacting && !params.interactive) { + LOG("waiting for user input\n"); + + if (params.input_prefix_bos) { + LOG("adding input prefix BOS token\n"); + embd_inp.push_back(llama_token_bos(model)); + } + + std::string buffer; + if (!params.input_prefix.empty()) { + LOG("appending input prefix: '%s'\n", params.input_prefix.c_str()); + buffer += params.input_prefix; + printf("%s", buffer.c_str()); + } + + std::string line; + bool another_line = true; + do { + another_line = console::readline(line, params.multiline_input); + buffer += line; + } while (another_line); + + // done taking input, reset color + console::set_display(console::reset); + + // Add tokens to embd only if the input buffer is non-empty + // Entering a empty line lets the user pass control back + if (buffer.length() > 1) { + // append input suffix if any + if (!params.input_suffix.empty()) { + LOG("appending input suffix: '%s'\n", params.input_suffix.c_str()); + buffer += params.input_suffix; + printf("%s", params.input_suffix.c_str()); + } + + LOG("buffer: '%s'\n", buffer.c_str()); + + const size_t original_size = embd_inp.size(); + + const auto line_inp = ::llama_tokenize(ctx, buffer, false); + LOG("input tokens: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, line_inp).c_str()); + + embd_inp.insert(embd_inp.end(), line_inp.begin(), line_inp.end()); + + for (size_t i = original_size; i < embd_inp.size(); ++i) { + const llama_token token = embd_inp[i]; + output_tokens.push_back(token); + output_ss << llama_token_to_piece(ctx, token); + } + + n_remain -= line_inp.size(); + LOG("n_remain: %d\n", n_remain); + } else { + LOG("empty line, passing control back\n"); + } + + input_echo = false; // do not echo this again + } + + if (n_past > 0) { + if (is_interacting) { + llama_sampling_reset(ctx_sampling); + } + is_interacting = false; + } + } + + // end of text token + if (!embd.empty() && embd.back() == llama_token_eos(model) && !params.interactive) { + break; + } + + // In interactive mode, respect the maximum number of tokens and drop back to user input when reached. + // We skip this logic when n_predict == -1 (infinite) or -2 (stop at context size). + if (params.interactive && n_remain <= 0 && params.n_predict >= 0) { + n_remain = params.n_predict; + is_interacting = true; + } + } + if (!params.interactive && n_remain <= 0) { + printf("%s", llama_token_to_piece(ctx, llama_token_eot(model)).c_str()); + fflush(stdout); + } + + llama_print_timings(ctx); + write_logfile(ctx, params, model, input_tokens, output_ss.str(), output_tokens); + + if (ctx_guidance) { llama_free(ctx_guidance); } + llama_free(ctx); + llama_free_model(model); + + llama_sampling_free(ctx_sampling); + llama_backend_free(); + +#ifndef LOG_DISABLE_LOGS + LOG_TEE("Log end\n"); +#endif // LOG_DISABLE_LOGS + + return 0; +} + diff --git a/examples/main/main.cpp b/examples/main/main.cpp index c096f110b..2108ef617 100644 --- a/examples/main/main.cpp +++ b/examples/main/main.cpp @@ -100,6 +100,13 @@ static void sigint_handler(int signo) { } #endif +#if defined (_WIN32) +BOOL ConsoleCtrlHandler(DWORD ctrl_type) +{ + return (ctrl_type == CTRL_C_EVENT) ? (sigint_handler(SIGINT), true) : false; +} +#endif + static void llama_log_callback_logTee(ggml_log_level level, const char * text, void * user_data) { (void) level; (void) user_data; @@ -392,10 +399,7 @@ int main(int argc, char ** argv) { sigint_action.sa_flags = 0; sigaction(SIGINT, &sigint_action, NULL); #elif defined (_WIN32) - auto console_ctrl_handler = +[](DWORD ctrl_type) -> BOOL { - return (ctrl_type == CTRL_C_EVENT) ? (sigint_handler(SIGINT), true) : false; - }; - SetConsoleCtrlHandler(reinterpret_cast(console_ctrl_handler), true); + SetConsoleCtrlHandler(reinterpret_cast(ConsoleCtrlHandler), TRUE); #endif LOG_TEE("%s: interactive mode on.\n", __func__); diff --git a/examples/main/main.cpp.bak b/examples/main/main.cpp.bak new file mode 100644 index 000000000..c096f110b --- /dev/null +++ b/examples/main/main.cpp.bak @@ -0,0 +1,873 @@ +#include "common.h" + +#include "console.h" +#include "llama.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) +#include +#include +#elif defined (_WIN32) +#define WIN32_LEAN_AND_MEAN +#ifndef NOMINMAX +#define NOMINMAX +#endif +#include +#include +#endif + +#if defined(_MSC_VER) +#pragma warning(disable: 4244 4267) // possible loss of data +#endif + +static llama_context ** g_ctx; +static llama_model ** g_model; +static gpt_params * g_params; +static std::vector * g_input_tokens; +static std::ostringstream * g_output_ss; +static std::vector * g_output_tokens; +static bool is_interacting = false; + + +static void write_logfile( + const llama_context * ctx, const gpt_params & params, const llama_model * model, + const std::vector & input_tokens, const std::string & output, + const std::vector & output_tokens +) { + if (params.logdir.empty()) { + return; + } + + const std::string timestamp = get_sortable_timestamp(); + + const bool success = create_directory_with_parents(params.logdir); + if (!success) { + fprintf(stderr, "%s: warning: failed to create logdir %s, cannot write logfile\n", + __func__, params.logdir.c_str()); + return; + } + + const std::string logfile_path = params.logdir + timestamp + ".yml"; + FILE * logfile = fopen(logfile_path.c_str(), "w"); + + if (logfile == NULL) { + fprintf(stderr, "%s: failed to open logfile %s\n", __func__, logfile_path.c_str()); + return; + } + + fprintf(logfile, "binary: main\n"); + char model_desc[128]; + llama_model_desc(model, model_desc, sizeof(model_desc)); + dump_non_result_info_yaml(logfile, params, ctx, timestamp, input_tokens, model_desc); + + fprintf(logfile, "\n"); + fprintf(logfile, "######################\n"); + fprintf(logfile, "# Generation Results #\n"); + fprintf(logfile, "######################\n"); + fprintf(logfile, "\n"); + + dump_string_yaml_multiline(logfile, "output", output.c_str()); + dump_vector_int_yaml(logfile, "output_tokens", output_tokens); + + llama_dump_timing_info_yaml(logfile, ctx); + fclose(logfile); +} + +#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) || defined (_WIN32) +static void sigint_handler(int signo) { + if (signo == SIGINT) { + if (!is_interacting) { + is_interacting = true; + } else { + console::cleanup(); + printf("\n"); + llama_print_timings(*g_ctx); + write_logfile(*g_ctx, *g_params, *g_model, *g_input_tokens, g_output_ss->str(), *g_output_tokens); + _exit(130); + } + } +} +#endif + +static void llama_log_callback_logTee(ggml_log_level level, const char * text, void * user_data) { + (void) level; + (void) user_data; + LOG_TEE("%s", text); +} + +int main(int argc, char ** argv) { + gpt_params params; + g_params = ¶ms; + + if (!gpt_params_parse(argc, argv, params)) { + return 1; + } + llama_sampling_params & sparams = params.sparams; + +#ifndef LOG_DISABLE_LOGS + log_set_target(log_filename_generator("main", "log")); + LOG_TEE("Log start\n"); + log_dump_cmdline(argc, argv); + llama_log_set(llama_log_callback_logTee, nullptr); +#endif // LOG_DISABLE_LOGS + + // TODO: Dump params ? + //LOG("Params perplexity: %s\n", LOG_TOSTR(params.perplexity)); + + // save choice to use color for later + // (note for later: this is a slightly awkward choice) + console::init(params.simple_io, params.use_color); + atexit([]() { console::cleanup(); }); + + if (params.logits_all) { + printf("\n************\n"); + printf("%s: please use the 'perplexity' tool for perplexity calculations\n", __func__); + printf("************\n\n"); + + return 0; + } + + if (params.embedding) { + printf("\n************\n"); + printf("%s: please use the 'embedding' tool for embedding calculations\n", __func__); + printf("************\n\n"); + + return 0; + } + + if (params.n_ctx != 0 && params.n_ctx < 8) { + LOG_TEE("%s: warning: minimum context size is 8, using minimum size.\n", __func__); + params.n_ctx = 8; + } + + if (params.rope_freq_base != 0.0) { + LOG_TEE("%s: warning: changing RoPE frequency base to %g.\n", __func__, params.rope_freq_base); + } + + if (params.rope_freq_scale != 0.0) { + LOG_TEE("%s: warning: scaling RoPE frequency by %g.\n", __func__, params.rope_freq_scale); + } + + LOG_TEE("%s: build = %d (%s)\n", __func__, LLAMA_BUILD_NUMBER, LLAMA_COMMIT); + LOG_TEE("%s: built with %s for %s\n", __func__, LLAMA_COMPILER, LLAMA_BUILD_TARGET); + + if (params.seed == LLAMA_DEFAULT_SEED) { + params.seed = time(NULL); + } + + LOG_TEE("%s: seed = %u\n", __func__, params.seed); + + std::mt19937 rng(params.seed); + if (params.random_prompt) { + params.prompt = gpt_random_prompt(rng); + } + + LOG("%s: llama backend init\n", __func__); + llama_backend_init(params.numa); + + llama_model * model; + llama_context * ctx; + llama_context * ctx_guidance = NULL; + g_model = &model; + g_ctx = &ctx; + + // load the model and apply lora adapter, if any + LOG("%s: load the model and apply lora adapter, if any\n", __func__); + std::tie(model, ctx) = llama_init_from_gpt_params(params); + if (sparams.cfg_scale > 1.f) { + struct llama_context_params lparams = llama_context_params_from_gpt_params(params); + ctx_guidance = llama_new_context_with_model(model, lparams); + } + + if (model == NULL) { + LOG_TEE("%s: error: unable to load model\n", __func__); + return 1; + } + + const int n_ctx_train = llama_n_ctx_train(model); + const int n_ctx = llama_n_ctx(ctx); + LOG("n_ctx: %d\n", n_ctx); + + if (n_ctx > n_ctx_train) { + LOG_TEE("%s: warning: model was trained on only %d context tokens (%d specified)\n", + __func__, n_ctx_train, n_ctx); + } + + // print system information + { + LOG_TEE("\n"); + LOG_TEE("%s\n", get_system_info(params).c_str()); + } + + std::string path_session = params.path_prompt_cache; + std::vector session_tokens; + + if (!path_session.empty()) { + LOG_TEE("%s: attempting to load saved session from '%s'\n", __func__, path_session.c_str()); + + // fopen to check for existing session + FILE * fp = std::fopen(path_session.c_str(), "rb"); + if (fp != NULL) { + std::fclose(fp); + + session_tokens.resize(n_ctx); + size_t n_token_count_out = 0; + if (!llama_load_session_file(ctx, path_session.c_str(), session_tokens.data(), session_tokens.capacity(), &n_token_count_out)) { + LOG_TEE("%s: error: failed to load session file '%s'\n", __func__, path_session.c_str()); + return 1; + } + session_tokens.resize(n_token_count_out); + llama_set_rng_seed(ctx, params.seed); + + LOG_TEE("%s: loaded a session with prompt size of %d tokens\n", __func__, (int) session_tokens.size()); + } else { + LOG_TEE("%s: session file does not exist, will create\n", __func__); + } + } + + const bool add_bos = llama_should_add_bos_token(model); + LOG("add_bos: %d\n", add_bos); + + std::vector embd_inp; + + if (params.interactive_first || params.instruct || params.chatml || !params.prompt.empty() || session_tokens.empty()) { + LOG("tokenize the prompt\n"); + if (params.chatml) { + params.prompt = "<|im_start|>system\n" + params.prompt + "<|im_end|>"; + } + embd_inp = ::llama_tokenize(ctx, params.prompt, add_bos, true); + } else { + LOG("use session tokens\n"); + embd_inp = session_tokens; + } + + LOG("prompt: \"%s\"\n", log_tostr(params.prompt)); + LOG("tokens: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd_inp).c_str()); + + // Should not run without any tokens + if (embd_inp.empty()) { + embd_inp.push_back(llama_token_bos(model)); + LOG("embd_inp was considered empty and bos was added: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd_inp).c_str()); + } + + // Tokenize negative prompt + std::vector guidance_inp; + int guidance_offset = 0; + int original_prompt_len = 0; + if (ctx_guidance) { + LOG("cfg_negative_prompt: \"%s\"\n", log_tostr(sparams.cfg_negative_prompt)); + + guidance_inp = ::llama_tokenize(ctx_guidance, sparams.cfg_negative_prompt, add_bos, true); + LOG("guidance_inp tokenized: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx_guidance, guidance_inp).c_str()); + + std::vector original_inp = ::llama_tokenize(ctx, params.prompt, add_bos, true); + LOG("original_inp tokenized: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, original_inp).c_str()); + + original_prompt_len = original_inp.size(); + guidance_offset = (int)guidance_inp.size() - original_prompt_len; + LOG("original_prompt_len: %s", log_tostr(original_prompt_len)); + LOG("guidance_offset: %s", log_tostr(guidance_offset)); + } + + if ((int) embd_inp.size() > n_ctx - 4) { + LOG_TEE("%s: error: prompt is too long (%d tokens, max %d)\n", __func__, (int) embd_inp.size(), n_ctx - 4); + return 1; + } + + // debug message about similarity of saved session, if applicable + size_t n_matching_session_tokens = 0; + if (!session_tokens.empty()) { + for (llama_token id : session_tokens) { + if (n_matching_session_tokens >= embd_inp.size() || id != embd_inp[n_matching_session_tokens]) { + break; + } + n_matching_session_tokens++; + } + if (params.prompt.empty() && n_matching_session_tokens == embd_inp.size()) { + LOG_TEE("%s: using full prompt from session file\n", __func__); + } else if (n_matching_session_tokens >= embd_inp.size()) { + LOG_TEE("%s: session file has exact match for prompt!\n", __func__); + } else if (n_matching_session_tokens < (embd_inp.size() / 2)) { + LOG_TEE("%s: warning: session file has low similarity to prompt (%zu / %zu tokens); will mostly be reevaluated\n", + __func__, n_matching_session_tokens, embd_inp.size()); + } else { + LOG_TEE("%s: session file matches %zu / %zu tokens of prompt\n", + __func__, n_matching_session_tokens, embd_inp.size()); + } + + // remove any "future" tokens that we might have inherited from the previous session + llama_kv_cache_seq_rm(ctx, -1, n_matching_session_tokens, -1); + } + + LOGLN( + "recalculate the cached logits (check): embd_inp.empty() %s, n_matching_session_tokens %zu, embd_inp.size() %zu, session_tokens.size() %zu, embd_inp.size() %zu", + log_tostr(embd_inp.empty()), n_matching_session_tokens, embd_inp.size(), session_tokens.size(), embd_inp.size()); + + // if we will use the cache for the full prompt without reaching the end of the cache, force + // reevaluation of the last token token to recalculate the cached logits + if (!embd_inp.empty() && n_matching_session_tokens == embd_inp.size() && session_tokens.size() > embd_inp.size()) { + LOGLN("recalculate the cached logits (do): session_tokens.resize( %zu )", embd_inp.size() - 1); + + session_tokens.resize(embd_inp.size() - 1); + } + + // number of tokens to keep when resetting context + if (params.n_keep < 0 || params.n_keep > (int) embd_inp.size() || params.instruct || params.chatml) { + params.n_keep = (int)embd_inp.size(); + } + + // prefix & suffix for instruct mode + const auto inp_pfx = ::llama_tokenize(ctx, "\n\n### Instruction:\n\n", add_bos, true); + const auto inp_sfx = ::llama_tokenize(ctx, "\n\n### Response:\n\n", false, true); + + LOG("inp_pfx: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, inp_pfx).c_str()); + LOG("inp_sfx: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, inp_sfx).c_str()); + + // chatml prefix & suffix + const auto cml_pfx = ::llama_tokenize(ctx, "\n<|im_start|>user\n", add_bos, true); + const auto cml_sfx = ::llama_tokenize(ctx, "<|im_end|>\n<|im_start|>assistant\n", false, true); + + LOG("cml_pfx: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, cml_pfx).c_str()); + LOG("cml_sfx: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, cml_sfx).c_str()); + + // in instruct mode, we inject a prefix and a suffix to each input by the user + if (params.instruct) { + params.interactive_first = true; + params.antiprompt.push_back("### Instruction:\n\n"); + } + // similar for chatml mode + else if (params.chatml) { + params.interactive_first = true; + params.antiprompt.push_back("<|im_start|>user\n"); + } + + // enable interactive mode if interactive start is specified + if (params.interactive_first) { + params.interactive = true; + } + + if (params.verbose_prompt) { + LOG_TEE("\n"); + LOG_TEE("%s: prompt: '%s'\n", __func__, params.prompt.c_str()); + LOG_TEE("%s: number of tokens in prompt = %zu\n", __func__, embd_inp.size()); + for (int i = 0; i < (int) embd_inp.size(); i++) { + LOG_TEE("%6d -> '%s'\n", embd_inp[i], llama_token_to_piece(ctx, embd_inp[i]).c_str()); + } + + if (ctx_guidance) { + LOG_TEE("\n"); + LOG_TEE("%s: negative prompt: '%s'\n", __func__, sparams.cfg_negative_prompt.c_str()); + LOG_TEE("%s: number of tokens in negative prompt = %zu\n", __func__, guidance_inp.size()); + for (int i = 0; i < (int) guidance_inp.size(); i++) { + LOG_TEE("%6d -> '%s'\n", guidance_inp[i], llama_token_to_piece(ctx, guidance_inp[i]).c_str()); + } + } + + if (params.n_keep > 0) { + LOG_TEE("%s: static prompt based on n_keep: '", __func__); + for (int i = 0; i < params.n_keep; i++) { + LOG_TEE("%s", llama_token_to_piece(ctx, embd_inp[i]).c_str()); + } + LOG_TEE("'\n"); + } + LOG_TEE("\n"); + } + + if (params.interactive) { +#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) + struct sigaction sigint_action; + sigint_action.sa_handler = sigint_handler; + sigemptyset (&sigint_action.sa_mask); + sigint_action.sa_flags = 0; + sigaction(SIGINT, &sigint_action, NULL); +#elif defined (_WIN32) + auto console_ctrl_handler = +[](DWORD ctrl_type) -> BOOL { + return (ctrl_type == CTRL_C_EVENT) ? (sigint_handler(SIGINT), true) : false; + }; + SetConsoleCtrlHandler(reinterpret_cast(console_ctrl_handler), true); +#endif + + LOG_TEE("%s: interactive mode on.\n", __func__); + + if (!params.antiprompt.empty()) { + for (const auto & antiprompt : params.antiprompt) { + LOG_TEE("Reverse prompt: '%s'\n", antiprompt.c_str()); + if (params.verbose_prompt) { + auto tmp = ::llama_tokenize(ctx, antiprompt, false, true); + for (int i = 0; i < (int) tmp.size(); i++) { + LOG_TEE("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx, tmp[i]).c_str()); + } + } + } + } + + if (params.input_prefix_bos) { + LOG_TEE("Input prefix with BOS\n"); + } + + if (!params.input_prefix.empty()) { + LOG_TEE("Input prefix: '%s'\n", params.input_prefix.c_str()); + if (params.verbose_prompt) { + auto tmp = ::llama_tokenize(ctx, params.input_prefix, true, true); + for (int i = 0; i < (int) tmp.size(); i++) { + LOG_TEE("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx, tmp[i]).c_str()); + } + } + } + + if (!params.input_suffix.empty()) { + LOG_TEE("Input suffix: '%s'\n", params.input_suffix.c_str()); + if (params.verbose_prompt) { + auto tmp = ::llama_tokenize(ctx, params.input_suffix, false, true); + for (int i = 0; i < (int) tmp.size(); i++) { + LOG_TEE("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx, tmp[i]).c_str()); + } + } + } + } + LOG_TEE("sampling: \n%s\n", llama_sampling_print(sparams).c_str()); + LOG_TEE("sampling order: \n%s\n", llama_sampling_order_print(sparams).c_str()); + LOG_TEE("generate: n_ctx = %d, n_batch = %d, n_predict = %d, n_keep = %d\n", n_ctx, params.n_batch, params.n_predict, params.n_keep); + LOG_TEE("\n\n"); + + if (params.interactive) { + const char *control_message; + if (params.multiline_input) { + control_message = " - To return control to LLaMa, end your input with '\\'.\n" + " - To return control without starting a new line, end your input with '/'.\n"; + } else { + control_message = " - Press Return to return control to LLaMa.\n" + " - To return control without starting a new line, end your input with '/'.\n" + " - If you want to submit another line, end your input with '\\'.\n"; + } + LOG_TEE("== Running in interactive mode. ==\n"); +#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) || defined (_WIN32) + LOG_TEE( " - Press Ctrl+C to interject at any time.\n"); +#endif + LOG_TEE( "%s\n", control_message); + + is_interacting = params.interactive_first; + } + + bool is_antiprompt = false; + bool input_echo = true; + bool need_to_save_session = !path_session.empty() && n_matching_session_tokens < embd_inp.size(); + + int n_past = 0; + int n_remain = params.n_predict; + int n_consumed = 0; + int n_session_consumed = 0; + int n_past_guidance = 0; + + std::vector input_tokens; g_input_tokens = &input_tokens; + std::vector output_tokens; g_output_tokens = &output_tokens; + std::ostringstream output_ss; g_output_ss = &output_ss; + + // the first thing we will do is to output the prompt, so set color accordingly + console::set_display(console::prompt); + + std::vector embd; + std::vector embd_guidance; + + struct llama_sampling_context * ctx_sampling = llama_sampling_init(sparams); + + while ((n_remain != 0 && !is_antiprompt) || params.interactive) { + // predict + if (!embd.empty()) { + // Note: n_ctx - 4 here is to match the logic for commandline prompt handling via + // --prompt or --file which uses the same value. + int max_embd_size = n_ctx - 4; + + // Ensure the input doesn't exceed the context size by truncating embd if necessary. + if ((int) embd.size() > max_embd_size) { + const int skipped_tokens = (int) embd.size() - max_embd_size; + embd.resize(max_embd_size); + + console::set_display(console::error); + printf("<>", skipped_tokens, skipped_tokens != 1 ? "s" : ""); + console::set_display(console::reset); + fflush(stdout); + } + + // infinite text generation via context swapping + // if we run out of context: + // - take the n_keep first tokens from the original prompt (via n_past) + // - take half of the last (n_ctx - n_keep) tokens and recompute the logits in batches + if (n_past + (int) embd.size() + std::max(0, guidance_offset) > n_ctx) { + if (params.n_predict == -2) { + LOG_TEE("\n\n%s: context full and n_predict == -%d => stopping\n", __func__, params.n_predict); + break; + } + + const int n_left = n_past - params.n_keep - 1; + const int n_discard = n_left/2; + + LOG("context full, swapping: n_past = %d, n_left = %d, n_ctx = %d, n_keep = %d, n_discard = %d\n", + n_past, n_left, n_ctx, params.n_keep, n_discard); + + llama_kv_cache_seq_rm (ctx, 0, params.n_keep + 1 , params.n_keep + n_discard + 1); + llama_kv_cache_seq_shift(ctx, 0, params.n_keep + 1 + n_discard, n_past, -n_discard); + + n_past -= n_discard; + + if (ctx_guidance) { + n_past_guidance -= n_discard; + } + + LOG("after swap: n_past = %d, n_past_guidance = %d\n", n_past, n_past_guidance); + + LOG("embd: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd).c_str()); + + LOG("clear session path\n"); + path_session.clear(); + } + + // try to reuse a matching prefix from the loaded session instead of re-eval (via n_past) + if (n_session_consumed < (int) session_tokens.size()) { + size_t i = 0; + for ( ; i < embd.size(); i++) { + if (embd[i] != session_tokens[n_session_consumed]) { + session_tokens.resize(n_session_consumed); + break; + } + + n_past++; + n_session_consumed++; + + if (n_session_consumed >= (int) session_tokens.size()) { + ++i; + break; + } + } + if (i > 0) { + embd.erase(embd.begin(), embd.begin() + i); + } + } + + // evaluate tokens in batches + // embd is typically prepared beforehand to fit within a batch, but not always + if (ctx_guidance) { + int input_size = 0; + llama_token * input_buf = NULL; + + if (n_past_guidance < (int) guidance_inp.size()) { + // Guidance context should have the same data with these modifications: + // + // * Replace the initial prompt + // * Shift everything by guidance_offset + embd_guidance = guidance_inp; + if (embd.begin() + original_prompt_len < embd.end()) { + embd_guidance.insert( + embd_guidance.end(), + embd.begin() + original_prompt_len, + embd.end() + ); + } + + input_buf = embd_guidance.data(); + input_size = embd_guidance.size(); + + LOG("guidance context: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd_guidance).c_str()); + } else { + input_buf = embd.data(); + input_size = embd.size(); + } + + for (int i = 0; i < input_size; i += params.n_batch) { + int n_eval = std::min(input_size - i, params.n_batch); + if (llama_decode(ctx_guidance, llama_batch_get_one(input_buf + i, n_eval, n_past_guidance, 0))) { + LOG_TEE("%s : failed to eval\n", __func__); + return 1; + } + + n_past_guidance += n_eval; + } + } + + for (int i = 0; i < (int) embd.size(); i += params.n_batch) { + int n_eval = (int) embd.size() - i; + if (n_eval > params.n_batch) { + n_eval = params.n_batch; + } + + LOG("eval: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd).c_str()); + + if (llama_decode(ctx, llama_batch_get_one(&embd[i], n_eval, n_past, 0))) { + LOG_TEE("%s : failed to eval\n", __func__); + return 1; + } + + n_past += n_eval; + + LOG("n_past = %d\n", n_past); + } + + if (!embd.empty() && !path_session.empty()) { + session_tokens.insert(session_tokens.end(), embd.begin(), embd.end()); + n_session_consumed = session_tokens.size(); + } + } + + embd.clear(); + embd_guidance.clear(); + + if ((int) embd_inp.size() <= n_consumed && !is_interacting) { + // optionally save the session on first sample (for faster prompt loading next time) + if (!path_session.empty() && need_to_save_session && !params.prompt_cache_ro) { + need_to_save_session = false; + llama_save_session_file(ctx, path_session.c_str(), session_tokens.data(), session_tokens.size()); + + LOG("saved session to %s\n", path_session.c_str()); + } + + const llama_token id = llama_sampling_sample(ctx_sampling, ctx, ctx_guidance); + + llama_sampling_accept(ctx_sampling, ctx, id, true); + + LOG("last: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, ctx_sampling->prev).c_str()); + + embd.push_back(id); + + // echo this to console + input_echo = true; + + // decrement remaining sampling budget + --n_remain; + + LOG("n_remain: %d\n", n_remain); + } else { + // some user input remains from prompt or interaction, forward it to processing + LOG("embd_inp.size(): %d, n_consumed: %d\n", (int) embd_inp.size(), n_consumed); + while ((int) embd_inp.size() > n_consumed) { + embd.push_back(embd_inp[n_consumed]); + + // push the prompt in the sampling context in order to apply repetition penalties later + // for the prompt, we don't apply grammar rules + llama_sampling_accept(ctx_sampling, ctx, embd_inp[n_consumed], false); + + ++n_consumed; + if ((int) embd.size() >= params.n_batch) { + break; + } + } + } + + // display text + if (input_echo) { + for (auto id : embd) { + const std::string token_str = llama_token_to_piece(ctx, id); + printf("%s", token_str.c_str()); + + if (embd.size() > 1) { + input_tokens.push_back(id); + } else { + output_tokens.push_back(id); + output_ss << token_str; + } + } + fflush(stdout); + } + // reset color to default if there is no pending user input + if (input_echo && (int) embd_inp.size() == n_consumed) { + console::set_display(console::reset); + } + + // if not currently processing queued inputs; + if ((int) embd_inp.size() <= n_consumed) { + // check for reverse prompt in the last n_prev tokens + if (!params.antiprompt.empty()) { + const int n_prev = 32; + const std::string last_output = llama_sampling_prev_str(ctx_sampling, ctx, n_prev); + + is_antiprompt = false; + // Check if each of the reverse prompts appears at the end of the output. + // If we're not running interactively, the reverse prompt might be tokenized with some following characters + // so we'll compensate for that by widening the search window a bit. + for (std::string & antiprompt : params.antiprompt) { + size_t extra_padding = params.interactive ? 0 : 2; + size_t search_start_pos = last_output.length() > static_cast(antiprompt.length() + extra_padding) + ? last_output.length() - static_cast(antiprompt.length() + extra_padding) + : 0; + + if (last_output.find(antiprompt, search_start_pos) != std::string::npos) { + if (params.interactive) { + is_interacting = true; + } + is_antiprompt = true; + break; + } + } + + if (is_antiprompt) { + LOG("found antiprompt: %s\n", last_output.c_str()); + } + } + + // deal with end of text token in interactive mode + if (llama_sampling_last(ctx_sampling) == llama_token_eos(model)) { + LOG("found EOS token\n"); + + if (params.interactive) { + if (!params.antiprompt.empty()) { + // tokenize and inject first reverse prompt + const auto first_antiprompt = ::llama_tokenize(ctx, params.antiprompt.front(), false, true); + embd_inp.insert(embd_inp.end(), first_antiprompt.begin(), first_antiprompt.end()); + is_antiprompt = true; + } + + is_interacting = true; + printf("\n"); + } else if (params.instruct || params.chatml) { + is_interacting = true; + } + } + + if (n_past > 0 && is_interacting) { + LOG("waiting for user input\n"); + + if (params.instruct || params.chatml) { + printf("\n> "); + } + + if (params.input_prefix_bos) { + LOG("adding input prefix BOS token\n"); + embd_inp.push_back(llama_token_bos(model)); + } + + std::string buffer; + if (!params.input_prefix.empty()) { + LOG("appending input prefix: '%s'\n", params.input_prefix.c_str()); + printf("%s", params.input_prefix.c_str()); + } + + // color user input only + console::set_display(console::user_input); + + std::string line; + bool another_line = true; + do { + another_line = console::readline(line, params.multiline_input); + buffer += line; + } while (another_line); + + // done taking input, reset color + console::set_display(console::reset); + + // Add tokens to embd only if the input buffer is non-empty + // Entering a empty line lets the user pass control back + if (buffer.length() > 1) { + // append input suffix if any + if (!params.input_suffix.empty()) { + LOG("appending input suffix: '%s'\n", params.input_suffix.c_str()); + printf("%s", params.input_suffix.c_str()); + } + + LOG("buffer: '%s'\n", buffer.c_str()); + + const size_t original_size = embd_inp.size(); + + // instruct mode: insert instruction prefix + if (params.instruct && !is_antiprompt) { + LOG("inserting instruction prefix\n"); + n_consumed = embd_inp.size(); + embd_inp.insert(embd_inp.end(), inp_pfx.begin(), inp_pfx.end()); + } + // chatml mode: insert user chat prefix + if (params.chatml && !is_antiprompt) { + LOG("inserting chatml prefix\n"); + n_consumed = embd_inp.size(); + embd_inp.insert(embd_inp.end(), cml_pfx.begin(), cml_pfx.end()); + } + if (params.escape) { + process_escapes(buffer); + } + + const auto line_pfx = ::llama_tokenize(ctx, params.input_prefix, false, true); + const auto line_inp = ::llama_tokenize(ctx, buffer, false, false); + const auto line_sfx = ::llama_tokenize(ctx, params.input_suffix, false, true); + LOG("input tokens: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, line_inp).c_str()); + + embd_inp.insert(embd_inp.end(), line_pfx.begin(), line_pfx.end()); + embd_inp.insert(embd_inp.end(), line_inp.begin(), line_inp.end()); + embd_inp.insert(embd_inp.end(), line_sfx.begin(), line_sfx.end()); + + // instruct mode: insert response suffix + if (params.instruct) { + LOG("inserting instruction suffix\n"); + embd_inp.insert(embd_inp.end(), inp_sfx.begin(), inp_sfx.end()); + } + // chatml mode: insert assistant chat suffix + if (params.chatml) { + LOG("inserting chatml suffix\n"); + embd_inp.insert(embd_inp.end(), cml_sfx.begin(), cml_sfx.end()); + } + + for (size_t i = original_size; i < embd_inp.size(); ++i) { + const llama_token token = embd_inp[i]; + output_tokens.push_back(token); + output_ss << llama_token_to_piece(ctx, token); + } + + n_remain -= line_inp.size(); + LOG("n_remain: %d\n", n_remain); + } else { + LOG("empty line, passing control back\n"); + } + + input_echo = false; // do not echo this again + } + + if (n_past > 0) { + if (is_interacting) { + llama_sampling_reset(ctx_sampling); + } + is_interacting = false; + } + } + + // end of text token + if (!embd.empty() && embd.back() == llama_token_eos(model) && !(params.instruct || params.interactive || params.chatml)) { + LOG_TEE(" [end of text]\n"); + break; + } + + // In interactive mode, respect the maximum number of tokens and drop back to user input when reached. + // We skip this logic when n_predict == -1 (infinite) or -2 (stop at context size). + if (params.interactive && n_remain <= 0 && params.n_predict >= 0) { + n_remain = params.n_predict; + is_interacting = true; + } + } + + if (!path_session.empty() && params.prompt_cache_all && !params.prompt_cache_ro) { + LOG_TEE("\n%s: saving final output to session file '%s'\n", __func__, path_session.c_str()); + llama_save_session_file(ctx, path_session.c_str(), session_tokens.data(), session_tokens.size()); + } + + llama_print_timings(ctx); + write_logfile(ctx, params, model, input_tokens, output_ss.str(), output_tokens); + + if (ctx_guidance) { llama_free(ctx_guidance); } + llama_free(ctx); + llama_free_model(model); + + llama_sampling_free(ctx_sampling); + llama_backend_free(); + +#ifndef LOG_DISABLE_LOGS + LOG_TEE("Log end\n"); +#endif // LOG_DISABLE_LOGS + + return 0; +} diff --git a/examples/quantize/quantize.cpp b/examples/quantize/quantize.cpp index d27ea5e91..5d3a13043 100644 --- a/examples/quantize/quantize.cpp +++ b/examples/quantize/quantize.cpp @@ -5,6 +5,7 @@ #include #include #include +#include struct quant_option { std::string name; diff --git a/ggml-quants.c b/ggml-quants.c index 0e8163a16..2ec0203a0 100644 --- a/ggml-quants.c +++ b/ggml-quants.c @@ -425,7 +425,7 @@ static const uint64_t table_b2b_1[1 << 8] = { B8(10, 00) }; // (!b) << 4 #endif // reference implementation for deterministic creation of model files -void quantize_row_q4_0_reference(const float * restrict x, block_q4_0 * restrict y, int k) { +void quantize_row_q4_0_reference(const float * GGML_RESTRICT x, block_q4_0 * GGML_RESTRICT y, int k) { static const int qk = QK4_0; assert(k % qk == 0); @@ -462,11 +462,11 @@ void quantize_row_q4_0_reference(const float * restrict x, block_q4_0 * restrict } } -void quantize_row_q4_0(const float * restrict x, void * restrict y, int k) { +void quantize_row_q4_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k) { quantize_row_q4_0_reference(x, y, k); } -void quantize_row_q4_1_reference(const float * restrict x, block_q4_1 * restrict y, int k) { +void quantize_row_q4_1_reference(const float * GGML_RESTRICT x, block_q4_1 * GGML_RESTRICT y, int k) { const int qk = QK4_1; assert(k % qk == 0); @@ -503,11 +503,11 @@ void quantize_row_q4_1_reference(const float * restrict x, block_q4_1 * restrict } } -void quantize_row_q4_1(const float * restrict x, void * restrict y, int k) { +void quantize_row_q4_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k) { quantize_row_q4_1_reference(x, y, k); } -void quantize_row_q5_0_reference(const float * restrict x, block_q5_0 * restrict y, int k) { +void quantize_row_q5_0_reference(const float * GGML_RESTRICT x, block_q5_0 * GGML_RESTRICT y, int k) { static const int qk = QK5_0; assert(k % qk == 0); @@ -551,11 +551,11 @@ void quantize_row_q5_0_reference(const float * restrict x, block_q5_0 * restrict } } -void quantize_row_q5_0(const float * restrict x, void * restrict y, int k) { +void quantize_row_q5_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k) { quantize_row_q5_0_reference(x, y, k); } -void quantize_row_q5_1_reference(const float * restrict x, block_q5_1 * restrict y, int k) { +void quantize_row_q5_1_reference(const float * GGML_RESTRICT x, block_q5_1 * GGML_RESTRICT y, int k) { const int qk = QK5_1; assert(k % qk == 0); @@ -599,12 +599,12 @@ void quantize_row_q5_1_reference(const float * restrict x, block_q5_1 * restrict } } -void quantize_row_q5_1(const float * restrict x, void * restrict y, int k) { +void quantize_row_q5_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k) { quantize_row_q5_1_reference(x, y, k); } // reference implementation for deterministic creation of model files -void quantize_row_q8_0_reference(const float * restrict x, block_q8_0 * restrict y, int k) { +void quantize_row_q8_0_reference(const float * GGML_RESTRICT x, block_q8_0 * GGML_RESTRICT y, int k) { assert(k % QK8_0 == 0); const int nb = k / QK8_0; @@ -629,12 +629,12 @@ void quantize_row_q8_0_reference(const float * restrict x, block_q8_0 * restrict } } -void quantize_row_q8_0(const float * restrict x, void * restrict vy, int k) { +void quantize_row_q8_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int k) { assert(QK8_0 == 32); assert(k % QK8_0 == 0); const int nb = k / QK8_0; - block_q8_0 * restrict y = vy; + block_q8_0 * GGML_RESTRICT y = vy; #if defined(__ARM_NEON) for (int i = 0; i < nb; i++) { @@ -818,7 +818,7 @@ void quantize_row_q8_0(const float * restrict x, void * restrict vy, int k) { } // reference implementation for deterministic creation of model files -void quantize_row_q8_1_reference(const float * restrict x, block_q8_1 * restrict y, int k) { +void quantize_row_q8_1_reference(const float * GGML_RESTRICT x, block_q8_1 * GGML_RESTRICT y, int k) { assert(QK8_1 == 32); assert(k % QK8_1 == 0); const int nb = k / QK8_1; @@ -853,11 +853,11 @@ void quantize_row_q8_1_reference(const float * restrict x, block_q8_1 * restrict } } -void quantize_row_q8_1(const float * restrict x, void * restrict vy, int k) { +void quantize_row_q8_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int k) { assert(k % QK8_1 == 0); const int nb = k / QK8_1; - block_q8_1 * restrict y = vy; + block_q8_1 * GGML_RESTRICT y = vy; #if defined(__ARM_NEON) for (int i = 0; i < nb; i++) { @@ -1071,7 +1071,7 @@ void quantize_row_q8_1(const float * restrict x, void * restrict vy, int k) { #endif } -void dequantize_row_q4_0(const block_q4_0 * restrict x, float * restrict y, int k) { +void dequantize_row_q4_0(const block_q4_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int k) { static const int qk = QK4_0; assert(k % qk == 0); @@ -1091,7 +1091,7 @@ void dequantize_row_q4_0(const block_q4_0 * restrict x, float * restrict y, int } } -void dequantize_row_q4_1(const block_q4_1 * restrict x, float * restrict y, int k) { +void dequantize_row_q4_1(const block_q4_1 * GGML_RESTRICT x, float * GGML_RESTRICT y, int k) { static const int qk = QK4_1; assert(k % qk == 0); @@ -1112,7 +1112,7 @@ void dequantize_row_q4_1(const block_q4_1 * restrict x, float * restrict y, int } } -void dequantize_row_q5_0(const block_q5_0 * restrict x, float * restrict y, int k) { +void dequantize_row_q5_0(const block_q5_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int k) { static const int qk = QK5_0; assert(k % qk == 0); @@ -1138,7 +1138,7 @@ void dequantize_row_q5_0(const block_q5_0 * restrict x, float * restrict y, int } } -void dequantize_row_q5_1(const block_q5_1 * restrict x, float * restrict y, int k) { +void dequantize_row_q5_1(const block_q5_1 * GGML_RESTRICT x, float * GGML_RESTRICT y, int k) { static const int qk = QK5_1; assert(k % qk == 0); @@ -1165,7 +1165,7 @@ void dequantize_row_q5_1(const block_q5_1 * restrict x, float * restrict y, int } } -void dequantize_row_q8_0(const block_q8_0 * restrict x, float * restrict y, int k) { +void dequantize_row_q8_0(const block_q8_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int k) { static const int qk = QK8_0; assert(k % qk == 0); @@ -1195,7 +1195,7 @@ static inline int nearest_int(float fval) { return (i & 0x007fffff) - 0x00400000; } -static float make_qx_quants(int n, int nmax, const float * restrict x, int8_t * restrict L, int rmse_type) { +static float make_qx_quants(int n, int nmax, const float * GGML_RESTRICT x, int8_t * GGML_RESTRICT L, int rmse_type) { float max = 0; float amax = 0; for (int i = 0; i < n; ++i) { @@ -1259,7 +1259,7 @@ static float make_qx_quants(int n, int nmax, const float * restrict x, int8_t * return scale; } -static float make_q3_quants(int n, int nmax, const float * restrict x, int8_t * restrict L, bool do_rmse) { +static float make_q3_quants(int n, int nmax, const float * GGML_RESTRICT x, int8_t * GGML_RESTRICT L, bool do_rmse) { float max = 0; float amax = 0; for (int i = 0; i < n; ++i) { @@ -1318,7 +1318,7 @@ static float make_q3_quants(int n, int nmax, const float * restrict x, int8_t * return 1/iscale; } -static float make_qkx1_quants(int n, int nmax, const float * restrict x, uint8_t * restrict L, float * restrict the_min, +static float make_qkx1_quants(int n, int nmax, const float * GGML_RESTRICT x, uint8_t * GGML_RESTRICT L, float * GGML_RESTRICT the_min, int ntry, float alpha) { float min = x[0]; float max = x[0]; @@ -1361,8 +1361,8 @@ static float make_qkx1_quants(int n, int nmax, const float * restrict x, uint8_t return scale; } -static float make_qkx2_quants(int n, int nmax, const float * restrict x, const float * restrict weights, - uint8_t * restrict L, float * restrict the_min, uint8_t * restrict Laux, +static float make_qkx2_quants(int n, int nmax, const float * GGML_RESTRICT x, const float * GGML_RESTRICT weights, + uint8_t * GGML_RESTRICT L, float * GGML_RESTRICT the_min, uint8_t * GGML_RESTRICT Laux, float rmin, float rdelta, int nstep, bool use_mad) { float min = x[0]; float max = x[0]; @@ -1443,7 +1443,7 @@ static float make_qkx2_quants(int n, int nmax, const float * restrict x, const f } #if QK_K == 256 -static inline void get_scale_min_k4(int j, const uint8_t * restrict q, uint8_t * restrict d, uint8_t * restrict m) { +static inline void get_scale_min_k4(int j, const uint8_t * GGML_RESTRICT q, uint8_t * GGML_RESTRICT d, uint8_t * GGML_RESTRICT m) { if (j < 4) { *d = q[j] & 63; *m = q[j + 4] & 63; } else { @@ -1455,7 +1455,7 @@ static inline void get_scale_min_k4(int j, const uint8_t * restrict q, uint8_t * //========================- 2-bit (de)-quantization -void quantize_row_q2_K_reference(const float * restrict x, block_q2_K * restrict y, int k) { +void quantize_row_q2_K_reference(const float * GGML_RESTRICT x, block_q2_K * GGML_RESTRICT y, int k) { assert(k % QK_K == 0); const int nb = k / QK_K; @@ -1532,7 +1532,7 @@ void quantize_row_q2_K_reference(const float * restrict x, block_q2_K * restrict } } -void dequantize_row_q2_K(const block_q2_K * restrict x, float * restrict y, int k) { +void dequantize_row_q2_K(const block_q2_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int k) { assert(k % QK_K == 0); const int nb = k / QK_K; @@ -1578,15 +1578,15 @@ void dequantize_row_q2_K(const block_q2_K * restrict x, float * restrict y, int } } -void quantize_row_q2_K(const float * restrict x, void * restrict vy, int k) { +void quantize_row_q2_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int k) { quantize_row_q2_K_reference(x, vy, k); } -size_t ggml_quantize_q2_K(const float * restrict src, void * restrict dst, int n, int k, int64_t * restrict hist) { +size_t ggml_quantize_q2_K(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int n, int k, int64_t * GGML_RESTRICT hist) { (void)hist; // TODO: collect histograms for (int j = 0; j < n; j += k) { - block_q2_K * restrict y = (block_q2_K *)dst + j/QK_K; + block_q2_K * GGML_RESTRICT y = (block_q2_K *)dst + j/QK_K; quantize_row_q2_K_reference(src + j, y, k); } return (n/QK_K*sizeof(block_q2_K)); @@ -1594,7 +1594,7 @@ size_t ggml_quantize_q2_K(const float * restrict src, void * restrict dst, int n //========================= 3-bit (de)-quantization -void quantize_row_q3_K_reference(const float * restrict x, block_q3_K * restrict y, int k) { +void quantize_row_q3_K_reference(const float * GGML_RESTRICT x, block_q3_K * GGML_RESTRICT y, int k) { assert(k % QK_K == 0); const int nb = k / QK_K; @@ -1708,7 +1708,7 @@ void quantize_row_q3_K_reference(const float * restrict x, block_q3_K * restrict } #if QK_K == 256 -void dequantize_row_q3_K(const block_q3_K * restrict x, float * restrict y, int k) { +void dequantize_row_q3_K(const block_q3_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int k) { assert(k % QK_K == 0); const int nb = k / QK_K; @@ -1722,8 +1722,8 @@ void dequantize_row_q3_K(const block_q3_K * restrict x, float * restrict y, int const float d_all = GGML_FP16_TO_FP32(x[i].d); - const uint8_t * restrict q = x[i].qs; - const uint8_t * restrict hm = x[i].hmask; + const uint8_t * GGML_RESTRICT q = x[i].qs; + const uint8_t * GGML_RESTRICT hm = x[i].hmask; uint8_t m = 1; memcpy(aux, x[i].scales, 12); @@ -1758,7 +1758,7 @@ void dequantize_row_q3_K(const block_q3_K * restrict x, float * restrict y, int } } #else -void dequantize_row_q3_K(const block_q3_K * restrict x, float * restrict y, int k) { +void dequantize_row_q3_K(const block_q3_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int k) { assert(k % QK_K == 0); assert(QK_K == 64); const int nb = k / QK_K; @@ -1767,8 +1767,8 @@ void dequantize_row_q3_K(const block_q3_K * restrict x, float * restrict y, int const float d_all = GGML_FP16_TO_FP32(x[i].d); - const uint8_t * restrict q = x[i].qs; - const uint8_t * restrict hm = x[i].hmask; + const uint8_t * GGML_RESTRICT q = x[i].qs; + const uint8_t * GGML_RESTRICT hm = x[i].hmask; const float d1 = d_all * ((x[i].scales[0] & 0xF) - 8); const float d2 = d_all * ((x[i].scales[0] >> 4) - 8); @@ -1791,15 +1791,15 @@ void dequantize_row_q3_K(const block_q3_K * restrict x, float * restrict y, int } #endif -void quantize_row_q3_K(const float * restrict x, void * restrict vy, int k) { +void quantize_row_q3_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int k) { quantize_row_q3_K_reference(x, vy, k); } -size_t ggml_quantize_q3_K(const float * restrict src, void * restrict dst, int n, int k, int64_t * restrict hist) { +size_t ggml_quantize_q3_K(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int n, int k, int64_t * GGML_RESTRICT hist) { (void)hist; // TODO: collect histograms for (int j = 0; j < n; j += k) { - block_q3_K * restrict y = (block_q3_K *)dst + j/QK_K; + block_q3_K * GGML_RESTRICT y = (block_q3_K *)dst + j/QK_K; quantize_row_q3_K_reference(src + j, y, k); } return (n/QK_K*sizeof(block_q3_K)); @@ -1807,7 +1807,7 @@ size_t ggml_quantize_q3_K(const float * restrict src, void * restrict dst, int n // ====================== 4-bit (de)-quantization -void quantize_row_q4_K_reference(const float * restrict x, block_q4_K * restrict y, int k) { +void quantize_row_q4_K_reference(const float * GGML_RESTRICT x, block_q4_K * GGML_RESTRICT y, int k) { assert(k % QK_K == 0); const int nb = k / QK_K; @@ -1914,7 +1914,7 @@ void quantize_row_q4_K_reference(const float * restrict x, block_q4_K * restrict } } -void dequantize_row_q4_K(const block_q4_K * restrict x, float * restrict y, int k) { +void dequantize_row_q4_K(const block_q4_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int k) { assert(k % QK_K == 0); const int nb = k / QK_K; @@ -1953,18 +1953,18 @@ void dequantize_row_q4_K(const block_q4_K * restrict x, float * restrict y, int } } -void quantize_row_q4_K(const float * restrict x, void * restrict vy, int k) { +void quantize_row_q4_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int k) { assert(k % QK_K == 0); - block_q4_K * restrict y = vy; + block_q4_K * GGML_RESTRICT y = vy; quantize_row_q4_K_reference(x, y, k); } -size_t ggml_quantize_q4_K(const float * restrict src, void * restrict dst, int n, int k, int64_t * restrict hist) { +size_t ggml_quantize_q4_K(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int n, int k, int64_t * GGML_RESTRICT hist) { assert(k % QK_K == 0); (void)hist; // TODO: collect histograms for (int j = 0; j < n; j += k) { - block_q4_K * restrict y = (block_q4_K *)dst + j/QK_K; + block_q4_K * GGML_RESTRICT y = (block_q4_K *)dst + j/QK_K; quantize_row_q4_K_reference(src + j, y, k); } return (n/QK_K*sizeof(block_q4_K)); @@ -1972,7 +1972,7 @@ size_t ggml_quantize_q4_K(const float * restrict src, void * restrict dst, int n // ====================== 5-bit (de)-quantization -void quantize_row_q5_K_reference(const float * restrict x, block_q5_K * restrict y, int k) { +void quantize_row_q5_K_reference(const float * GGML_RESTRICT x, block_q5_K * GGML_RESTRICT y, int k) { assert(k % QK_K == 0); const int nb = k / QK_K; @@ -2042,8 +2042,8 @@ void quantize_row_q5_K_reference(const float * restrict x, block_q5_K * restrict } } - uint8_t * restrict qh = y[i].qh; - uint8_t * restrict ql = y[i].qs; + uint8_t * GGML_RESTRICT qh = y[i].qh; + uint8_t * GGML_RESTRICT ql = y[i].qs; memset(qh, 0, QK_K/8); uint8_t m1 = 1, m2 = 2; @@ -2090,8 +2090,8 @@ void quantize_row_q5_K_reference(const float * restrict x, block_q5_K * restrict } } - uint8_t * restrict qh = y[i].qh; - uint8_t * restrict ql = y[i].qs; + uint8_t * GGML_RESTRICT qh = y[i].qh; + uint8_t * GGML_RESTRICT ql = y[i].qs; memset(qh, 0, QK_K/8); for (int j = 0; j < 32; ++j) { @@ -2114,7 +2114,7 @@ void quantize_row_q5_K_reference(const float * restrict x, block_q5_K * restrict } } -void dequantize_row_q5_K(const block_q5_K * restrict x, float * restrict y, int k) { +void dequantize_row_q5_K(const block_q5_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int k) { assert(k % QK_K == 0); const int nb = k / QK_K; @@ -2143,7 +2143,7 @@ void dequantize_row_q5_K(const block_q5_K * restrict x, float * restrict y, int } #else float d = GGML_FP16_TO_FP32(x[i].d); - const int8_t * restrict s = x[i].scales; + const int8_t * GGML_RESTRICT s = x[i].scales; for (int l = 0; l < 8; ++l) { y[l+ 0] = d * s[0] * ((ql[l+ 0] & 0xF) - (qh[l] & 0x01 ? 0 : 16)); y[l+ 8] = d * s[0] * ((ql[l+ 8] & 0xF) - (qh[l] & 0x02 ? 0 : 16)); @@ -2159,18 +2159,18 @@ void dequantize_row_q5_K(const block_q5_K * restrict x, float * restrict y, int } } -void quantize_row_q5_K(const float * restrict x, void * restrict vy, int k) { +void quantize_row_q5_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int k) { assert(k % QK_K == 0); - block_q5_K * restrict y = vy; + block_q5_K * GGML_RESTRICT y = vy; quantize_row_q5_K_reference(x, y, k); } -size_t ggml_quantize_q5_K(const float * restrict src, void * restrict dst, int n, int k, int64_t * restrict hist) { +size_t ggml_quantize_q5_K(const float * GGML_RESTRICT src, void * GGML_RESTRICT dst, int n, int k, int64_t * GGML_RESTRICT hist) { assert(k % QK_K == 0); (void)hist; // TODO: collect histograms for (int j = 0; j < n; j += k) { - block_q5_K * restrict y = (block_q5_K *)dst + j/QK_K; + block_q5_K * GGML_RESTRICT y = (block_q5_K *)dst + j/QK_K; quantize_row_q5_K_reference(src + j, y, k); } return (n/QK_K*sizeof(block_q5_K)); @@ -2178,7 +2178,7 @@ size_t ggml_quantize_q5_K(const float * restrict src, void * restrict dst, int n // ====================== 6-bit (de)-quantization -void quantize_row_q6_K_reference(const float * restrict x, block_q6_K * restrict y, int k) { +void quantize_row_q6_K_reference(const float * GGML_RESTRICT x, block_q6_K * GGML_RESTRICT y, int k) { assert(k % QK_K == 0); const int nb = k / QK_K; @@ -2228,8 +2228,8 @@ void quantize_row_q6_K_reference(const float * restrict x, block_q6_K * restrict } } - uint8_t * restrict ql = y[i].ql; - uint8_t * restrict qh = y[i].qh; + uint8_t * GGML_RESTRICT ql = y[i].ql; + uint8_t * GGML_RESTRICT qh = y[i].qh; #if QK_K == 256 for (int j = 0; j < QK_K; j += 128) { for (int l = 0; l < 32; ++l) { @@ -2260,7 +2260,7 @@ void quantize_row_q6_K_reference(const float * restrict x, block_q6_K * restrict } } -void dequantize_row_q6_K(const block_q6_K * restrict x, float * restrict y, int k) { +void dequantize_row_q6_K(const block_q6_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int k) { assert(k % QK_K == 0); const int nb = k / QK_K; @@ -2268,9 +2268,9 @@ void dequantize_row_q6_K(const block_q6_K * restrict x, float * restrict y, int const float d = GGML_FP16_TO_FP32(x[i].d); - const uint8_t * restrict ql = x[i].ql; - const uint8_t * restrict qh = x[i].qh; - const int8_t * restrict sc = x[i].scales; + const uint8_t * GGML_RESTRICT ql = x[i].ql; + const uint8_t * GGML_RESTRICT qh = x[i].qh; + const int8_t * GGML_RESTRICT sc = x[i].scales; #if QK_K == 256 for (int n = 0; n < QK_K; n += 128) { @@ -2307,9 +2307,9 @@ void dequantize_row_q6_K(const block_q6_K * restrict x, float * restrict y, int } } -void quantize_row_q6_K(const float * restrict x, void * restrict vy, int k) { +void quantize_row_q6_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT vy, int k) { assert(k % QK_K == 0); - block_q6_K * restrict y = vy; + block_q6_K * GGML_RESTRICT y = vy; quantize_row_q6_K_reference(x, y, k); } @@ -2318,7 +2318,7 @@ size_t ggml_quantize_q6_K(const float * src, void * dst, int n, int k, int64_t * (void)hist; // TODO: collect histograms for (int j = 0; j < n; j += k) { - block_q6_K * restrict y = (block_q6_K *)dst + j/QK_K; + block_q6_K * GGML_RESTRICT y = (block_q6_K *)dst + j/QK_K; quantize_row_q6_K_reference(src + j, y, k); } return (n/QK_K*sizeof(block_q6_K)); @@ -2326,7 +2326,7 @@ size_t ggml_quantize_q6_K(const float * src, void * dst, int n, int k, int64_t * //===================================== Q8_K ============================================== -void quantize_row_q8_K_reference(const float * restrict x, block_q8_K * restrict y, int k) { +void quantize_row_q8_K_reference(const float * GGML_RESTRICT x, block_q8_K * GGML_RESTRICT y, int k) { assert(k % QK_K == 0); const int nb = k / QK_K; @@ -2363,7 +2363,7 @@ void quantize_row_q8_K_reference(const float * restrict x, block_q8_K * restrict } } -void dequantize_row_q8_K(const block_q8_K * restrict x, float * restrict y, int k) { +void dequantize_row_q8_K(const block_q8_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int k) { assert(k % QK_K == 0); const int nb = k / QK_K; @@ -2374,7 +2374,7 @@ void dequantize_row_q8_K(const block_q8_K * restrict x, float * restrict y, int } } -void quantize_row_q8_K(const float * restrict x, void * restrict y, int k) { +void quantize_row_q8_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k) { quantize_row_q8_K_reference(x, y, k); } @@ -2423,14 +2423,14 @@ static inline __m128i get_scale_shuffle(int i) { } #endif -void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, const void * restrict vx, const void * restrict vy) { +void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy) { const int qk = QK8_0; const int nb = n / qk; assert(n % qk == 0); - const block_q4_0 * restrict x = vx; - const block_q8_0 * restrict y = vy; + const block_q4_0 * GGML_RESTRICT x = vx; + const block_q8_0 * GGML_RESTRICT y = vy; #if defined(__ARM_NEON) float32x4_t sumv0 = vdupq_n_f32(0.0f); @@ -2439,10 +2439,10 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, const void * restrict vx, assert(nb % 2 == 0); // TODO: handle odd nb for (int i = 0; i < nb; i += 2) { - const block_q4_0 * restrict x0 = &x[i + 0]; - const block_q4_0 * restrict x1 = &x[i + 1]; - const block_q8_0 * restrict y0 = &y[i + 0]; - const block_q8_0 * restrict y1 = &y[i + 1]; + const block_q4_0 * GGML_RESTRICT x0 = &x[i + 0]; + const block_q4_0 * GGML_RESTRICT x1 = &x[i + 1]; + const block_q8_0 * GGML_RESTRICT y0 = &y[i + 0]; + const block_q8_0 * GGML_RESTRICT y1 = &y[i + 1]; const uint8x16_t m4b = vdupq_n_u8(0x0F); const int8x16_t s8b = vdupq_n_s8(0x8); @@ -2733,14 +2733,14 @@ void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, const void * restrict vx, #endif } -void ggml_vec_dot_q4_1_q8_1(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) { +void ggml_vec_dot_q4_1_q8_1(const int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy) { const int qk = QK8_1; const int nb = n / qk; assert(n % qk == 0); - const block_q4_1 * restrict x = vx; - const block_q8_1 * restrict y = vy; + const block_q4_1 * GGML_RESTRICT x = vx; + const block_q8_1 * GGML_RESTRICT y = vy; // TODO: add WASM SIMD #if defined(__ARM_NEON) @@ -2752,10 +2752,10 @@ void ggml_vec_dot_q4_1_q8_1(const int n, float * restrict s, const void * restri assert(nb % 2 == 0); // TODO: handle odd nb for (int i = 0; i < nb; i += 2) { - const block_q4_1 * restrict x0 = &x[i + 0]; - const block_q4_1 * restrict x1 = &x[i + 1]; - const block_q8_1 * restrict y0 = &y[i + 0]; - const block_q8_1 * restrict y1 = &y[i + 1]; + const block_q4_1 * GGML_RESTRICT x0 = &x[i + 0]; + const block_q4_1 * GGML_RESTRICT x1 = &x[i + 1]; + const block_q8_1 * GGML_RESTRICT y0 = &y[i + 0]; + const block_q8_1 * GGML_RESTRICT y1 = &y[i + 1]; summs += GGML_FP16_TO_FP32(x0->m) * y0->s + GGML_FP16_TO_FP32(x1->m) * y1->s; @@ -2893,15 +2893,15 @@ void ggml_vec_dot_q4_1_q8_1(const int n, float * restrict s, const void * restri #endif } -void ggml_vec_dot_q5_0_q8_0(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) { +void ggml_vec_dot_q5_0_q8_0(const int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy) { const int qk = QK8_0; const int nb = n / qk; assert(n % qk == 0); assert(qk == QK5_0); - const block_q5_0 * restrict x = vx; - const block_q8_0 * restrict y = vy; + const block_q5_0 * GGML_RESTRICT x = vx; + const block_q8_0 * GGML_RESTRICT y = vy; #if defined(__ARM_NEON) float32x4_t sumv0 = vdupq_n_f32(0.0f); @@ -2916,10 +2916,10 @@ void ggml_vec_dot_q5_0_q8_0(const int n, float * restrict s, const void * restri assert(nb % 2 == 0); // TODO: handle odd nb for (int i = 0; i < nb; i += 2) { - const block_q5_0 * restrict x0 = &x[i]; - const block_q5_0 * restrict x1 = &x[i + 1]; - const block_q8_0 * restrict y0 = &y[i]; - const block_q8_0 * restrict y1 = &y[i + 1]; + const block_q5_0 * GGML_RESTRICT x0 = &x[i]; + const block_q5_0 * GGML_RESTRICT x1 = &x[i + 1]; + const block_q8_0 * GGML_RESTRICT y0 = &y[i]; + const block_q8_0 * GGML_RESTRICT y1 = &y[i + 1]; const uint8x16_t m4b = vdupq_n_u8(0x0F); @@ -3000,8 +3000,8 @@ void ggml_vec_dot_q5_0_q8_0(const int n, float * restrict s, const void * restri // TODO: check if unrolling this is better for (int i = 0; i < nb; ++i) { - const block_q5_0 * restrict x0 = &x[i]; - const block_q8_0 * restrict y0 = &y[i]; + const block_q5_0 * GGML_RESTRICT x0 = &x[i]; + const block_q8_0 * GGML_RESTRICT y0 = &y[i]; const v128_t m4b = wasm_i8x16_splat(0x0F); @@ -3199,15 +3199,15 @@ void ggml_vec_dot_q5_0_q8_0(const int n, float * restrict s, const void * restri #endif } -void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) { +void ggml_vec_dot_q5_1_q8_1(const int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy) { const int qk = QK8_1; const int nb = n / qk; assert(n % qk == 0); assert(qk == QK5_1); - const block_q5_1 * restrict x = vx; - const block_q8_1 * restrict y = vy; + const block_q5_1 * GGML_RESTRICT x = vx; + const block_q8_1 * GGML_RESTRICT y = vy; #if defined(__ARM_NEON) float32x4_t sumv0 = vdupq_n_f32(0.0f); @@ -3225,10 +3225,10 @@ void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void * restri assert(nb % 2 == 0); // TODO: handle odd nb for (int i = 0; i < nb; i += 2) { - const block_q5_1 * restrict x0 = &x[i]; - const block_q5_1 * restrict x1 = &x[i + 1]; - const block_q8_1 * restrict y0 = &y[i]; - const block_q8_1 * restrict y1 = &y[i + 1]; + const block_q5_1 * GGML_RESTRICT x0 = &x[i]; + const block_q5_1 * GGML_RESTRICT x1 = &x[i + 1]; + const block_q8_1 * GGML_RESTRICT y0 = &y[i]; + const block_q8_1 * GGML_RESTRICT y1 = &y[i + 1]; const uint8x16_t m4b = vdupq_n_u8(0x0F); @@ -3314,8 +3314,8 @@ void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void * restri // TODO: check if unrolling this is better for (int i = 0; i < nb; ++i) { - const block_q5_1 * restrict x0 = &x[i]; - const block_q8_1 * restrict y0 = &y[i]; + const block_q5_1 * GGML_RESTRICT x0 = &x[i]; + const block_q8_1 * GGML_RESTRICT y0 = &y[i]; summs += GGML_FP16_TO_FP32(x0->m) * y0->s; @@ -3518,14 +3518,14 @@ void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void * restri #endif } -void ggml_vec_dot_q8_0_q8_0(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) { +void ggml_vec_dot_q8_0_q8_0(const int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy) { const int qk = QK8_0; const int nb = n / qk; assert(n % qk == 0); - const block_q8_0 * restrict x = vx; - const block_q8_0 * restrict y = vy; + const block_q8_0 * GGML_RESTRICT x = vx; + const block_q8_0 * GGML_RESTRICT y = vy; #if defined(__ARM_NEON) float32x4_t sumv0 = vdupq_n_f32(0.0f); @@ -3534,10 +3534,10 @@ void ggml_vec_dot_q8_0_q8_0(const int n, float * restrict s, const void * restri assert(nb % 2 == 0); // TODO: handle odd nb for (int i = 0; i < nb; i += 2) { - const block_q8_0 * restrict x0 = &x[i + 0]; - const block_q8_0 * restrict x1 = &x[i + 1]; - const block_q8_0 * restrict y0 = &y[i + 0]; - const block_q8_0 * restrict y1 = &y[i + 1]; + const block_q8_0 * GGML_RESTRICT x0 = &x[i + 0]; + const block_q8_0 * GGML_RESTRICT x1 = &x[i + 1]; + const block_q8_0 * GGML_RESTRICT y0 = &y[i + 0]; + const block_q8_0 * GGML_RESTRICT y1 = &y[i + 1]; const int8x16_t x0_0 = vld1q_s8(x0->qs); const int8x16_t x0_1 = vld1q_s8(x0->qs + 16); @@ -3642,10 +3642,10 @@ void ggml_vec_dot_q8_0_q8_0(const int n, float * restrict s, const void * restri } #if QK_K == 256 -void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) { +void ggml_vec_dot_q2_K_q8_K(const int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy) { - const block_q2_K * restrict x = vx; - const block_q8_K * restrict y = vy; + const block_q2_K * GGML_RESTRICT x = vx; + const block_q8_K * GGML_RESTRICT y = vy; const int nb = n / QK_K; @@ -3667,9 +3667,9 @@ void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restri const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d); const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin); - const uint8_t * restrict q2 = x[i].qs; - const int8_t * restrict q8 = y[i].qs; - const uint8_t * restrict sc = x[i].scales; + const uint8_t * GGML_RESTRICT q2 = x[i].qs; + const int8_t * GGML_RESTRICT q8 = y[i].qs; + const uint8_t * GGML_RESTRICT sc = x[i].scales; const uint8x16_t mins_and_scales = vld1q_u8(sc); const uint8x16_t scales = vandq_u8(mins_and_scales, m4); @@ -3746,8 +3746,8 @@ void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restri const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d); const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin); - const uint8_t * restrict q2 = x[i].qs; - const int8_t * restrict q8 = y[i].qs; + const uint8_t * GGML_RESTRICT q2 = x[i].qs; + const int8_t * GGML_RESTRICT q8 = y[i].qs; const __m128i mins_and_scales = _mm_loadu_si128((const __m128i*)x[i].scales); const __m128i scales8 = _mm_and_si128(mins_and_scales, m4); @@ -3813,8 +3813,8 @@ void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restri const float dall = y[i].d * GGML_FP16_TO_FP32(x[i].d); const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin); - const uint8_t * restrict q2 = x[i].qs; - const int8_t * restrict q8 = y[i].qs; + const uint8_t * GGML_RESTRICT q2 = x[i].qs; + const int8_t * GGML_RESTRICT q8 = y[i].qs; // load mins and scales from block_q2_K.scales[QK_K/16] const __m128i mins_and_scales = _mm_loadu_si128((const __m128i*)x[i].scales); @@ -4035,10 +4035,10 @@ void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restri #else -void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) { +void ggml_vec_dot_q2_K_q8_K(const int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy) { - const block_q2_K * restrict x = vx; - const block_q8_K * restrict y = vy; + const block_q2_K * GGML_RESTRICT x = vx; + const block_q8_K * GGML_RESTRICT y = vy; const int nb = n / QK_K; @@ -4061,9 +4061,9 @@ void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restri const float d = y[i].d * (float)x[i].d; const float dmin = -y[i].d * (float)x[i].dmin; - const uint8_t * restrict q2 = x[i].qs; - const int8_t * restrict q8 = y[i].qs; - const uint32_t * restrict sc = (const uint32_t *)x[i].scales; + const uint8_t * GGML_RESTRICT q2 = x[i].qs; + const int8_t * GGML_RESTRICT q8 = y[i].qs; + const uint32_t * GGML_RESTRICT sc = (const uint32_t *)x[i].scales; aux32[0] = sc[0] & 0x0f0f0f0f; aux32[1] = (sc[0] >> 4) & 0x0f0f0f0f; @@ -4114,8 +4114,8 @@ void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restri __m256 acc = _mm256_setzero_ps(); uint32_t ud, um; - const uint8_t * restrict db = (const uint8_t *)&ud; - const uint8_t * restrict mb = (const uint8_t *)&um; + const uint8_t * GGML_RESTRICT db = (const uint8_t *)&ud; + const uint8_t * GGML_RESTRICT mb = (const uint8_t *)&um; float summs = 0; @@ -4126,10 +4126,10 @@ void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restri const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d); const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin); - const uint8_t * restrict q2 = x[i].qs; - const int8_t * restrict q8 = y[i].qs; + const uint8_t * GGML_RESTRICT q2 = x[i].qs; + const int8_t * GGML_RESTRICT q8 = y[i].qs; - const uint32_t * restrict sc = (const uint32_t *)x[i].scales; + const uint32_t * GGML_RESTRICT sc = (const uint32_t *)x[i].scales; ud = (sc[0] >> 0) & 0x0f0f0f0f; um = (sc[0] >> 4) & 0x0f0f0f0f; @@ -4166,8 +4166,8 @@ void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restri __m256 acc = _mm256_setzero_ps(); uint32_t ud, um; - const uint8_t * restrict db = (const uint8_t *)&ud; - const uint8_t * restrict mb = (const uint8_t *)&um; + const uint8_t * GGML_RESTRICT db = (const uint8_t *)&ud; + const uint8_t * GGML_RESTRICT mb = (const uint8_t *)&um; float summs = 0; @@ -4178,10 +4178,10 @@ void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restri const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d); const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin); - const uint8_t * restrict q2 = x[i].qs; - const int8_t * restrict q8 = y[i].qs; + const uint8_t * GGML_RESTRICT q2 = x[i].qs; + const int8_t * GGML_RESTRICT q8 = y[i].qs; - const uint32_t * restrict sc = (const uint32_t *)x[i].scales; + const uint32_t * GGML_RESTRICT sc = (const uint32_t *)x[i].scales; ud = (sc[0] >> 0) & 0x0f0f0f0f; um = (sc[0] >> 4) & 0x0f0f0f0f; @@ -4227,9 +4227,9 @@ void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restri const float d = y[i].d * (float)x[i].d; const float dmin = -y[i].d * (float)x[i].dmin; - const uint8_t * restrict q2 = x[i].qs; - const int8_t * restrict q8 = y[i].qs; - const uint32_t * restrict sc = (const uint32_t *)x[i].scales; + const uint8_t * GGML_RESTRICT q2 = x[i].qs; + const int8_t * GGML_RESTRICT q8 = y[i].qs; + const uint32_t * GGML_RESTRICT sc = (const uint32_t *)x[i].scales; aux32[0] = sc[0] & 0x0f0f0f0f; aux32[1] = (sc[0] >> 4) & 0x0f0f0f0f; @@ -4311,14 +4311,14 @@ void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restri #endif #if QK_K == 256 -void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) { +void ggml_vec_dot_q3_K_q8_K(const int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy) { assert(n % QK_K == 0); const uint32_t kmask1 = 0x03030303; const uint32_t kmask2 = 0x0f0f0f0f; - const block_q3_K * restrict x = vx; - const block_q8_K * restrict y = vy; + const block_q3_K * GGML_RESTRICT x = vx; + const block_q8_K * GGML_RESTRICT y = vy; const int nb = n / QK_K; @@ -4346,9 +4346,9 @@ void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restri const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d); - const uint8_t * restrict q3 = x[i].qs; - const uint8_t * restrict qh = x[i].hmask; - const int8_t * restrict q8 = y[i].qs; + const uint8_t * GGML_RESTRICT q3 = x[i].qs; + const uint8_t * GGML_RESTRICT qh = x[i].hmask; + const int8_t * GGML_RESTRICT q8 = y[i].qs; ggml_uint8x16x2_t qhbits = ggml_vld1q_u8_x2(qh); @@ -4454,8 +4454,8 @@ void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restri const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d); - const uint8_t * restrict q3 = x[i].qs; - const int8_t * restrict q8 = y[i].qs; + const uint8_t * GGML_RESTRICT q3 = x[i].qs; + const int8_t * GGML_RESTRICT q8 = y[i].qs; // Set up scales memcpy(aux, x[i].scales, 12); @@ -4559,8 +4559,8 @@ void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restri const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d); - const uint8_t * restrict q3 = x[i].qs; - const int8_t * restrict q8 = y[i].qs; + const uint8_t * GGML_RESTRICT q3 = x[i].qs; + const int8_t * GGML_RESTRICT q8 = y[i].qs; // Set up scales aux = (const uint32_t *)x[i].scales; @@ -4694,9 +4694,9 @@ void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restri float sumf = 0; for (int i = 0; i < nb; ++i) { - const uint8_t * restrict q3 = x[i].qs; - const uint8_t * restrict qh = x[i].hmask; - const int8_t * restrict q8 = y[i].qs; + const uint8_t * GGML_RESTRICT q3 = x[i].qs; + const uint8_t * GGML_RESTRICT qh = x[i].hmask; + const int8_t * GGML_RESTRICT q8 = y[i].qs; memcpy(aux, x[i].scales, 12); utmp[3] = ((aux[1] >> 4) & kmask2) | (((aux[2] >> 6) & kmask1) << 4); @@ -4806,11 +4806,11 @@ void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restri float sumf = 0; for (int i = 0; i < nb; ++i) { - const uint8_t * restrict q3 = x[i].qs; - const uint8_t * restrict hm = x[i].hmask; - const int8_t * restrict q8 = y[i].qs; + const uint8_t * GGML_RESTRICT q3 = x[i].qs; + const uint8_t * GGML_RESTRICT hm = x[i].hmask; + const int8_t * GGML_RESTRICT q8 = y[i].qs; memset(aux32, 0, 8*sizeof(int32_t)); - int8_t * restrict a = aux8; + int8_t * GGML_RESTRICT a = aux8; uint8_t m = 1; for (int j = 0; j < QK_K; j += 128) { for (int l = 0; l < 32; ++l) a[l] = q3[l] & 3; @@ -4855,11 +4855,11 @@ void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restri #else -void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) { +void ggml_vec_dot_q3_K_q8_K(const int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy) { assert(n % QK_K == 0); - const block_q3_K * restrict x = vx; - const block_q8_K * restrict y = vy; + const block_q3_K * GGML_RESTRICT x = vx; + const block_q8_K * GGML_RESTRICT y = vy; const int nb = n / QK_K; @@ -4947,8 +4947,8 @@ void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restri const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d); - const uint8_t * restrict q3 = x[i].qs; - const int8_t * restrict q8 = y[i].qs; + const uint8_t * GGML_RESTRICT q3 = x[i].qs; + const int8_t * GGML_RESTRICT q8 = y[i].qs; const uint16_t a = *(const uint16_t *)x[i].scales; aux16[0] = a & 0x0f0f; @@ -5018,8 +5018,8 @@ void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restri const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d); - const uint8_t * restrict q3 = x[i].qs; - const int8_t * restrict q8 = y[i].qs; + const uint8_t * GGML_RESTRICT q3 = x[i].qs; + const int8_t * GGML_RESTRICT q8 = y[i].qs; const uint16_t a = *(const uint16_t *)x[i].scales; aux16[0] = a & 0x0f0f; @@ -5098,8 +5098,8 @@ void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restri for (int i = 0; i < nb; ++i) { - const uint8_t * restrict q3 = x[i].qs; - const int8_t * restrict q8 = y[i].qs; + const uint8_t * GGML_RESTRICT q3 = x[i].qs; + const int8_t * GGML_RESTRICT q8 = y[i].qs; const uint16_t a = *(const uint16_t *)x[i].scales; aux16[0] = a & 0x0f0f; @@ -5173,10 +5173,10 @@ void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restri float sumf = 0; for (int i = 0; i < nb; ++i) { - const uint8_t * restrict q3 = x[i].qs; - const uint8_t * restrict hm = x[i].hmask; - const int8_t * restrict q8 = y[i].qs; - int8_t * restrict a = aux8; + const uint8_t * GGML_RESTRICT q3 = x[i].qs; + const uint8_t * GGML_RESTRICT hm = x[i].hmask; + const int8_t * GGML_RESTRICT q8 = y[i].qs; + int8_t * GGML_RESTRICT a = aux8; for (int l = 0; l < 8; ++l) { a[l+ 0] = (int8_t)((q3[l+0] >> 0) & 3) - (hm[l] & 0x01 ? 0 : 4); a[l+ 8] = (int8_t)((q3[l+8] >> 0) & 3) - (hm[l] & 0x02 ? 0 : 4); @@ -5213,11 +5213,11 @@ void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restri #endif #if QK_K == 256 -void ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) { +void ggml_vec_dot_q4_K_q8_K(const int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy) { assert(n % QK_K == 0); - const block_q4_K * restrict x = vx; - const block_q8_K * restrict y = vy; + const block_q4_K * GGML_RESTRICT x = vx; + const block_q8_K * GGML_RESTRICT y = vy; const int nb = n / QK_K; @@ -5262,8 +5262,8 @@ void ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * restri const uint8_t * scales = (const uint8_t *)utmp; - const uint8_t * restrict q4 = x[i].qs; - const int8_t * restrict q8 = y[i].qs; + const uint8_t * GGML_RESTRICT q4 = x[i].qs; + const int8_t * GGML_RESTRICT q8 = y[i].qs; int32_t sumi1 = 0; int32_t sumi2 = 0; @@ -5334,8 +5334,8 @@ void ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * restri utmp[2] = uaux; utmp[0] &= kmask1; - const uint8_t * restrict q4 = x[i].qs; - const int8_t * restrict q8 = y[i].qs; + const uint8_t * GGML_RESTRICT q4 = x[i].qs; + const int8_t * GGML_RESTRICT q8 = y[i].qs; const __m256i mins_and_scales = _mm256_cvtepu8_epi16(_mm_set_epi32(utmp[3], utmp[2], utmp[1], utmp[0])); @@ -5393,8 +5393,8 @@ void ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * restri const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d); const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin); - const uint8_t * restrict q4 = x[i].qs; - const int8_t * restrict q8 = y[i].qs; + const uint8_t * GGML_RESTRICT q4 = x[i].qs; + const int8_t * GGML_RESTRICT q8 = y[i].qs; memcpy(utmp, x[i].scales, 12); utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4); @@ -5494,8 +5494,8 @@ void ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * restri vint32m1_t sumi = __riscv_vredsum_vs_i32m1_i32m1(prod, __riscv_vmv_v_x_i32m1(0, 1), vl); sumf -= dmin * __riscv_vmv_x_s_i32m1_i32(sumi); - const uint8_t * restrict q4 = x[i].qs; - const int8_t * restrict q8 = y[i].qs; + const uint8_t * GGML_RESTRICT q4 = x[i].qs; + const int8_t * GGML_RESTRICT q8 = y[i].qs; vl = 32; @@ -5548,10 +5548,10 @@ void ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * restri float sumf = 0; for (int i = 0; i < nb; ++i) { - const uint8_t * restrict q4 = x[i].qs; - const int8_t * restrict q8 = y[i].qs; + const uint8_t * GGML_RESTRICT q4 = x[i].qs; + const int8_t * GGML_RESTRICT q8 = y[i].qs; memset(aux32, 0, 8*sizeof(int32_t)); - int8_t * restrict a = aux8; + int8_t * GGML_RESTRICT a = aux8; for (int j = 0; j < QK_K/64; ++j) { for (int l = 0; l < 32; ++l) a[l] = (int8_t)(q4[l] & 0xF); a += 32; @@ -5594,11 +5594,11 @@ void ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * restri #endif } #else -void ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) { +void ggml_vec_dot_q4_K_q8_K(const int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy) { assert(n % QK_K == 0); - const block_q4_K * restrict x = vx; - const block_q8_K * restrict y = vy; + const block_q4_K * GGML_RESTRICT x = vx; + const block_q8_K * GGML_RESTRICT y = vy; const int nb = n / QK_K; @@ -5618,14 +5618,14 @@ void ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * restri float sum_mins = 0.f; uint16_t aux16[2]; - const uint8_t * restrict scales = (const uint8_t *)aux16; + const uint8_t * GGML_RESTRICT scales = (const uint8_t *)aux16; for (int i = 0; i < nb; ++i) { - const uint8_t * restrict q4 = x[i].qs; - const int8_t * restrict q8 = y[i].qs; + const uint8_t * GGML_RESTRICT q4 = x[i].qs; + const int8_t * GGML_RESTRICT q8 = y[i].qs; - const uint16_t * restrict a = (const uint16_t *)x[i].scales; + const uint16_t * GGML_RESTRICT a = (const uint16_t *)x[i].scales; aux16[0] = a[0] & 0x0f0f; aux16[1] = (a[0] >> 4) & 0x0f0f; @@ -5698,8 +5698,8 @@ void ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * restri summs += m * (scales[2] * (y[i].bsums[0] + y[i].bsums[1]) + scales[3] * (y[i].bsums[2] + y[i].bsums[3])); - const uint8_t * restrict q4 = x[i].qs; - const int8_t * restrict q8 = y[i].qs; + const uint8_t * GGML_RESTRICT q4 = x[i].qs; + const int8_t * GGML_RESTRICT q8 = y[i].qs; const __m256i q4bits = _mm256_loadu_si256((const __m256i*)q4); const __m256i q4l = _mm256_and_si256(q4bits, m4); @@ -5744,8 +5744,8 @@ void ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * restri summs += m * (scales[2] * (y[i].bsums[0] + y[i].bsums[1]) + scales[3] * (y[i].bsums[2] + y[i].bsums[3])); - const uint8_t * restrict q4 = x[i].qs; - const int8_t * restrict q8 = y[i].qs; + const uint8_t * GGML_RESTRICT q4 = x[i].qs; + const int8_t * GGML_RESTRICT q8 = y[i].qs; const __m256i q4bits = _mm256_loadu_si256((const __m256i*)q4); const __m128i q4bits_0 = _mm256_extractf128_si256(q4bits, 0); @@ -5778,16 +5778,16 @@ void ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * restri #elif defined __riscv_v_intrinsic uint16_t s16[2]; - const uint8_t * restrict scales = (const uint8_t *)s16; + const uint8_t * GGML_RESTRICT scales = (const uint8_t *)s16; float sumf = 0; for (int i = 0; i < nb; ++i) { - const uint8_t * restrict q4 = x[i].qs; - const int8_t * restrict q8 = y[i].qs; + const uint8_t * GGML_RESTRICT q4 = x[i].qs; + const int8_t * GGML_RESTRICT q8 = y[i].qs; - const uint16_t * restrict b = (const uint16_t *)x[i].scales; + const uint16_t * GGML_RESTRICT b = (const uint16_t *)x[i].scales; s16[0] = b[0] & 0x0f0f; s16[1] = (b[0] >> 4) & 0x0f0f; @@ -5827,17 +5827,17 @@ void ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * restri memset(sums, 0, 8*sizeof(float)); uint16_t s16[2]; - const uint8_t * restrict scales = (const uint8_t *)s16; + const uint8_t * GGML_RESTRICT scales = (const uint8_t *)s16; float sumf = 0; for (int i = 0; i < nb; ++i) { - const uint8_t * restrict q4 = x[i].qs; - const int8_t * restrict q8 = y[i].qs; - uint8_t * restrict a = aux8; + const uint8_t * GGML_RESTRICT q4 = x[i].qs; + const int8_t * GGML_RESTRICT q8 = y[i].qs; + uint8_t * GGML_RESTRICT a = aux8; for (int l = 0; l < 32; ++l) a[l+ 0] = q4[l] & 0xF; for (int l = 0; l < 32; ++l) a[l+32] = q4[l] >> 4; - const uint16_t * restrict b = (const uint16_t *)x[i].scales; + const uint16_t * GGML_RESTRICT b = (const uint16_t *)x[i].scales; s16[0] = b[0] & 0x0f0f; s16[1] = (b[0] >> 4) & 0x0f0f; @@ -5861,11 +5861,11 @@ void ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * restri #endif #if QK_K == 256 -void ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) { +void ggml_vec_dot_q5_K_q8_K(const int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy) { assert(n % QK_K == 0); - const block_q5_K * restrict x = vx; - const block_q8_K * restrict y = vy; + const block_q5_K * GGML_RESTRICT x = vx; + const block_q8_K * GGML_RESTRICT y = vy; const int nb = n / QK_K; @@ -5911,9 +5911,9 @@ void ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * restri const uint8_t * scales = (const uint8_t *)utmp; - const uint8_t * restrict q5 = x[i].qs; - const uint8_t * restrict qh = x[i].qh; - const int8_t * restrict q8 = y[i].qs; + const uint8_t * GGML_RESTRICT q5 = x[i].qs; + const uint8_t * GGML_RESTRICT qh = x[i].qh; + const int8_t * GGML_RESTRICT q8 = y[i].qs; ggml_uint8x16x2_t qhbits = ggml_vld1q_u8_x2(qh); @@ -5976,8 +5976,8 @@ void ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * restri for (int i = 0; i < nb; ++i) { - const uint8_t * restrict q5 = x[i].qs; - const int8_t * restrict q8 = y[i].qs; + const uint8_t * GGML_RESTRICT q5 = x[i].qs; + const int8_t * GGML_RESTRICT q8 = y[i].qs; #if QK_K == 256 const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d); @@ -6065,8 +6065,8 @@ void ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * restri const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d); const float dmin = -y[i].d * GGML_FP16_TO_FP32(x[i].dmin); - const uint8_t * restrict q5 = x[i].qs; - const int8_t * restrict q8 = y[i].qs; + const uint8_t * GGML_RESTRICT q5 = x[i].qs; + const int8_t * GGML_RESTRICT q8 = y[i].qs; memcpy(utmp, x[i].scales, 12); utmp[3] = ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4); @@ -6163,9 +6163,9 @@ void ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * restri vl = 8; - const uint8_t * restrict q5 = x[i].qs; - const uint8_t * restrict hm = x[i].qh; - const int8_t * restrict q8 = y[i].qs; + const uint8_t * GGML_RESTRICT q5 = x[i].qs; + const uint8_t * GGML_RESTRICT hm = x[i].qh; + const int8_t * GGML_RESTRICT q8 = y[i].qs; const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; const float dmin = GGML_FP16_TO_FP32(x[i].dmin) * y[i].d; @@ -6249,11 +6249,11 @@ void ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * restri float sumf = 0; for (int i = 0; i < nb; ++i) { - const uint8_t * restrict q4 = x[i].qs; - const uint8_t * restrict hm = x[i].qh; - const int8_t * restrict q8 = y[i].qs; + const uint8_t * GGML_RESTRICT q4 = x[i].qs; + const uint8_t * GGML_RESTRICT hm = x[i].qh; + const int8_t * GGML_RESTRICT q8 = y[i].qs; memset(aux32, 0, 8*sizeof(int32_t)); - int8_t * restrict a = aux8; + int8_t * GGML_RESTRICT a = aux8; uint8_t m = 1; for (int j = 0; j < QK_K/64; ++j) { for (int l = 0; l < 32; ++l) a[l] = (int8_t)(q4[l] & 0xF); @@ -6302,11 +6302,11 @@ void ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * restri #else -void ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) { +void ggml_vec_dot_q5_K_q8_K(const int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy) { assert(n % QK_K == 0); - const block_q5_K * restrict x = vx; - const block_q8_K * restrict y = vy; + const block_q5_K * GGML_RESTRICT x = vx; + const block_q8_K * GGML_RESTRICT y = vy; const int nb = n / QK_K; @@ -6328,9 +6328,9 @@ void ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * restri const float d = y[i].d * (float)x[i].d; const int8_t * sc = x[i].scales; - const uint8_t * restrict q5 = x[i].qs; - const uint8_t * restrict qh = x[i].qh; - const int8_t * restrict q8 = y[i].qs; + const uint8_t * GGML_RESTRICT q5 = x[i].qs; + const uint8_t * GGML_RESTRICT qh = x[i].qh; + const int8_t * GGML_RESTRICT q8 = y[i].qs; const uint8x8_t qhbits = vld1_u8(qh); @@ -6387,8 +6387,8 @@ void ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * restri for (int i = 0; i < nb; ++i) { - const uint8_t * restrict q5 = x[i].qs; - const int8_t * restrict q8 = y[i].qs; + const uint8_t * GGML_RESTRICT q5 = x[i].qs; + const int8_t * GGML_RESTRICT q8 = y[i].qs; const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d); @@ -6433,8 +6433,8 @@ void ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * restri for (int i = 0; i < nb; ++i) { - const uint8_t * restrict q5 = x[i].qs; - const int8_t * restrict q8 = y[i].qs; + const uint8_t * GGML_RESTRICT q5 = x[i].qs; + const int8_t * GGML_RESTRICT q8 = y[i].qs; const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d); @@ -6490,9 +6490,9 @@ void ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * restri const float d = y[i].d * (float)x[i].d; const int8_t * sc = x[i].scales; - const uint8_t * restrict q5 = x[i].qs; - const uint8_t * restrict qh = x[i].qh; - const int8_t * restrict q8 = y[i].qs; + const uint8_t * GGML_RESTRICT q5 = x[i].qs; + const uint8_t * GGML_RESTRICT qh = x[i].qh; + const int8_t * GGML_RESTRICT q8 = y[i].qs; vint32m1_t vzero = __riscv_vmv_v_x_i32m1(0, 1); @@ -6560,10 +6560,10 @@ void ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * restri float sumf = 0; for (int i = 0; i < nb; ++i) { - const uint8_t * restrict q4 = x[i].qs; - const uint8_t * restrict hm = x[i].qh; - const int8_t * restrict q8 = y[i].qs; - int8_t * restrict a = aux8; + const uint8_t * GGML_RESTRICT q4 = x[i].qs; + const uint8_t * GGML_RESTRICT hm = x[i].qh; + const int8_t * GGML_RESTRICT q8 = y[i].qs; + int8_t * GGML_RESTRICT a = aux8; for (int l = 0; l < 32; ++l) { a[l+ 0] = q4[l] & 0xF; a[l+32] = q4[l] >> 4; @@ -6574,7 +6574,7 @@ void ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * restri } const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d); - const int8_t * restrict sc = x[i].scales; + const int8_t * GGML_RESTRICT sc = x[i].scales; for (int j = 0; j < QK_K/16; ++j) { const float dl = d * sc[j]; @@ -6591,11 +6591,11 @@ void ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * restri #if QK_K == 256 -void ggml_vec_dot_q6_K_q8_K(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) { +void ggml_vec_dot_q6_K_q8_K(const int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy) { assert(n % QK_K == 0); - const block_q6_K * restrict x = vx; - const block_q8_K * restrict y = vy; + const block_q6_K * GGML_RESTRICT x = vx; + const block_q8_K * GGML_RESTRICT y = vy; const int nb = n / QK_K; @@ -6618,11 +6618,11 @@ void ggml_vec_dot_q6_K_q8_K(const int n, float * restrict s, const void * restri const float d_all = GGML_FP16_TO_FP32(x[i].d); - const uint8_t * restrict q6 = x[i].ql; - const uint8_t * restrict qh = x[i].qh; - const int8_t * restrict q8 = y[i].qs; + const uint8_t * GGML_RESTRICT q6 = x[i].ql; + const uint8_t * GGML_RESTRICT qh = x[i].qh; + const int8_t * GGML_RESTRICT q8 = y[i].qs; - const int8_t * restrict scale = x[i].scales; + const int8_t * GGML_RESTRICT scale = x[i].scales; const ggml_int16x8x2_t q8sums = ggml_vld1q_s16_x2(y[i].bsums); const int8x16_t scales = vld1q_s8(scale); @@ -6750,9 +6750,9 @@ void ggml_vec_dot_q6_K_q8_K(const int n, float * restrict s, const void * restri const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d); - const uint8_t * restrict q4 = x[i].ql; - const uint8_t * restrict qh = x[i].qh; - const int8_t * restrict q8 = y[i].qs; + const uint8_t * GGML_RESTRICT q4 = x[i].ql; + const uint8_t * GGML_RESTRICT qh = x[i].qh; + const int8_t * GGML_RESTRICT q8 = y[i].qs; const __m128i scales = _mm_loadu_si128((const __m128i*)x[i].scales); @@ -6830,9 +6830,9 @@ void ggml_vec_dot_q6_K_q8_K(const int n, float * restrict s, const void * restri const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d); - const uint8_t * restrict q4 = x[i].ql; - const uint8_t * restrict qh = x[i].qh; - const int8_t * restrict q8 = y[i].qs; + const uint8_t * GGML_RESTRICT q4 = x[i].ql; + const uint8_t * GGML_RESTRICT qh = x[i].qh; + const int8_t * GGML_RESTRICT q8 = y[i].qs; const __m128i scales = _mm_loadu_si128((const __m128i*)x[i].scales); @@ -6942,11 +6942,11 @@ void ggml_vec_dot_q6_K_q8_K(const int n, float * restrict s, const void * restri const float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; - const uint8_t * restrict q6 = x[i].ql; - const uint8_t * restrict qh = x[i].qh; - const int8_t * restrict q8 = y[i].qs; + const uint8_t * GGML_RESTRICT q6 = x[i].ql; + const uint8_t * GGML_RESTRICT qh = x[i].qh; + const int8_t * GGML_RESTRICT q8 = y[i].qs; - const int8_t * restrict scale = x[i].scales; + const int8_t * GGML_RESTRICT scale = x[i].scales; size_t vl; @@ -7030,11 +7030,11 @@ void ggml_vec_dot_q6_K_q8_K(const int n, float * restrict s, const void * restri float sumf = 0; for (int i = 0; i < nb; ++i) { - const uint8_t * restrict q4 = x[i].ql; - const uint8_t * restrict qh = x[i].qh; - const int8_t * restrict q8 = y[i].qs; + const uint8_t * GGML_RESTRICT q4 = x[i].ql; + const uint8_t * GGML_RESTRICT qh = x[i].qh; + const int8_t * GGML_RESTRICT q8 = y[i].qs; memset(aux32, 0, 8*sizeof(int32_t)); - int8_t * restrict a = aux8; + int8_t * GGML_RESTRICT a = aux8; for (int j = 0; j < QK_K; j += 128) { for (int l = 0; l < 32; ++l) { a[l + 0] = (int8_t)((q4[l + 0] & 0xF) | (((qh[l] >> 0) & 3) << 4)) - 32; @@ -7067,11 +7067,11 @@ void ggml_vec_dot_q6_K_q8_K(const int n, float * restrict s, const void * restri #else -void ggml_vec_dot_q6_K_q8_K(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) { +void ggml_vec_dot_q6_K_q8_K(const int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy) { assert(n % QK_K == 0); - const block_q6_K * restrict x = vx; - const block_q8_K * restrict y = vy; + const block_q6_K * GGML_RESTRICT x = vx; + const block_q8_K * GGML_RESTRICT y = vy; const int nb = n / QK_K; @@ -7094,11 +7094,11 @@ void ggml_vec_dot_q6_K_q8_K(const int n, float * restrict s, const void * restri const float d_all = (float)x[i].d; - const uint8_t * restrict q6 = x[i].ql; - const uint8_t * restrict qh = x[i].qh; - const int8_t * restrict q8 = y[i].qs; + const uint8_t * GGML_RESTRICT q6 = x[i].ql; + const uint8_t * GGML_RESTRICT qh = x[i].qh; + const int8_t * GGML_RESTRICT q8 = y[i].qs; - const int8_t * restrict scale = x[i].scales; + const int8_t * GGML_RESTRICT scale = x[i].scales; int32_t isum = 0; @@ -7157,9 +7157,9 @@ void ggml_vec_dot_q6_K_q8_K(const int n, float * restrict s, const void * restri const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d); - const uint8_t * restrict q4 = x[i].ql; - const uint8_t * restrict qh = x[i].qh; - const int8_t * restrict q8 = y[i].qs; + const uint8_t * GGML_RESTRICT q4 = x[i].ql; + const uint8_t * GGML_RESTRICT qh = x[i].qh; + const int8_t * GGML_RESTRICT q8 = y[i].qs; const __m64 scales_1 = _mm_set1_pi8(x[i].scales[0]); const __m64 scales_2 = _mm_set1_pi8(x[i].scales[1]); @@ -7214,9 +7214,9 @@ void ggml_vec_dot_q6_K_q8_K(const int n, float * restrict s, const void * restri const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d); - const uint8_t * restrict q4 = x[i].ql; - const uint8_t * restrict qh = x[i].qh; - const int8_t * restrict q8 = y[i].qs; + const uint8_t * GGML_RESTRICT q4 = x[i].ql; + const uint8_t * GGML_RESTRICT qh = x[i].qh; + const int8_t * GGML_RESTRICT q8 = y[i].qs; const __m64 scales_1 = _mm_set1_pi8(x[i].scales[0]); const __m64 scales_2 = _mm_set1_pi8(x[i].scales[1]); @@ -7281,11 +7281,11 @@ void ggml_vec_dot_q6_K_q8_K(const int n, float * restrict s, const void * restri const float d_all = (float)x[i].d; - const uint8_t * restrict q6 = x[i].ql; - const uint8_t * restrict qh = x[i].qh; - const int8_t * restrict q8 = y[i].qs; + const uint8_t * GGML_RESTRICT q6 = x[i].ql; + const uint8_t * GGML_RESTRICT qh = x[i].qh; + const int8_t * GGML_RESTRICT q8 = y[i].qs; - const int8_t * restrict scale = x[i].scales; + const int8_t * GGML_RESTRICT scale = x[i].scales; int32_t isum = 0; @@ -7350,11 +7350,11 @@ void ggml_vec_dot_q6_K_q8_K(const int n, float * restrict s, const void * restri float sumf = 0; for (int i = 0; i < nb; ++i) { - const uint8_t * restrict q4 = x[i].ql; - const uint8_t * restrict qh = x[i].qh; - const int8_t * restrict q8 = y[i].qs; + const uint8_t * GGML_RESTRICT q4 = x[i].ql; + const uint8_t * GGML_RESTRICT qh = x[i].qh; + const int8_t * GGML_RESTRICT q8 = y[i].qs; memset(aux32, 0, 8*sizeof(int32_t)); - int8_t * restrict a = aux8; + int8_t * GGML_RESTRICT a = aux8; for (int l = 0; l < 16; ++l) { a[l+ 0] = (int8_t)((q4[l+ 0] & 0xF) | (((qh[l] >> 0) & 3) << 4)) - 32; a[l+16] = (int8_t)((q4[l+16] & 0xF) | (((qh[l] >> 2) & 3) << 4)) - 32; diff --git a/ggml-quants.h b/ggml-quants.h index 70c12c274..34b99af61 100644 --- a/ggml-quants.h +++ b/ggml-quants.h @@ -167,58 +167,58 @@ static_assert(sizeof(block_q8_K) == sizeof(float) + QK_K + QK_K/16*sizeof(int16_ // Quantization -void quantize_row_q4_0_reference(const float * restrict x, block_q4_0 * restrict y, int k); -void quantize_row_q4_1_reference(const float * restrict x, block_q4_1 * restrict y, int k); -void quantize_row_q5_0_reference(const float * restrict x, block_q5_0 * restrict y, int k); -void quantize_row_q5_1_reference(const float * restrict x, block_q5_1 * restrict y, int k); -void quantize_row_q8_0_reference(const float * restrict x, block_q8_0 * restrict y, int k); -void quantize_row_q8_1_reference(const float * restrict x, block_q8_1 * restrict y, int k); +void quantize_row_q4_0_reference(const float * GGML_RESTRICT x, block_q4_0 * GGML_RESTRICT y, int k); +void quantize_row_q4_1_reference(const float * GGML_RESTRICT x, block_q4_1 * GGML_RESTRICT y, int k); +void quantize_row_q5_0_reference(const float * GGML_RESTRICT x, block_q5_0 * GGML_RESTRICT y, int k); +void quantize_row_q5_1_reference(const float * GGML_RESTRICT x, block_q5_1 * GGML_RESTRICT y, int k); +void quantize_row_q8_0_reference(const float * GGML_RESTRICT x, block_q8_0 * GGML_RESTRICT y, int k); +void quantize_row_q8_1_reference(const float * GGML_RESTRICT x, block_q8_1 * GGML_RESTRICT y, int k); -void quantize_row_q2_K_reference(const float * restrict x, block_q2_K * restrict y, int k); -void quantize_row_q3_K_reference(const float * restrict x, block_q3_K * restrict y, int k); -void quantize_row_q4_K_reference(const float * restrict x, block_q4_K * restrict y, int k); -void quantize_row_q5_K_reference(const float * restrict x, block_q5_K * restrict y, int k); -void quantize_row_q6_K_reference(const float * restrict x, block_q6_K * restrict y, int k); -void quantize_row_q8_K_reference(const float * restrict x, block_q8_K * restrict y, int k); +void quantize_row_q2_K_reference(const float * GGML_RESTRICT x, block_q2_K * GGML_RESTRICT y, int k); +void quantize_row_q3_K_reference(const float * GGML_RESTRICT x, block_q3_K * GGML_RESTRICT y, int k); +void quantize_row_q4_K_reference(const float * GGML_RESTRICT x, block_q4_K * GGML_RESTRICT y, int k); +void quantize_row_q5_K_reference(const float * GGML_RESTRICT x, block_q5_K * GGML_RESTRICT y, int k); +void quantize_row_q6_K_reference(const float * GGML_RESTRICT x, block_q6_K * GGML_RESTRICT y, int k); +void quantize_row_q8_K_reference(const float * GGML_RESTRICT x, block_q8_K * GGML_RESTRICT y, int k); -void quantize_row_q4_0(const float * restrict x, void * restrict y, int k); -void quantize_row_q4_1(const float * restrict x, void * restrict y, int k); -void quantize_row_q5_0(const float * restrict x, void * restrict y, int k); -void quantize_row_q5_1(const float * restrict x, void * restrict y, int k); -void quantize_row_q8_0(const float * restrict x, void * restrict y, int k); -void quantize_row_q8_1(const float * restrict x, void * restrict y, int k); +void quantize_row_q4_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k); +void quantize_row_q4_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k); +void quantize_row_q5_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k); +void quantize_row_q5_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k); +void quantize_row_q8_0(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k); +void quantize_row_q8_1(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k); -void quantize_row_q2_K(const float * restrict x, void * restrict y, int k); -void quantize_row_q3_K(const float * restrict x, void * restrict y, int k); -void quantize_row_q4_K(const float * restrict x, void * restrict y, int k); -void quantize_row_q5_K(const float * restrict x, void * restrict y, int k); -void quantize_row_q6_K(const float * restrict x, void * restrict y, int k); -void quantize_row_q8_K(const float * restrict x, void * restrict y, int k); +void quantize_row_q2_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k); +void quantize_row_q3_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k); +void quantize_row_q4_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k); +void quantize_row_q5_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k); +void quantize_row_q6_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k); +void quantize_row_q8_K(const float * GGML_RESTRICT x, void * GGML_RESTRICT y, int k); // Dequantization -void dequantize_row_q4_0(const block_q4_0 * restrict x, float * restrict y, int k); -void dequantize_row_q4_1(const block_q4_1 * restrict x, float * restrict y, int k); -void dequantize_row_q5_0(const block_q5_0 * restrict x, float * restrict y, int k); -void dequantize_row_q5_1(const block_q5_1 * restrict x, float * restrict y, int k); -void dequantize_row_q8_0(const block_q8_0 * restrict x, float * restrict y, int k); -//void dequantize_row_q8_1(const block_q8_1 * restrict x, float * restrict y, int k); +void dequantize_row_q4_0(const block_q4_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int k); +void dequantize_row_q4_1(const block_q4_1 * GGML_RESTRICT x, float * GGML_RESTRICT y, int k); +void dequantize_row_q5_0(const block_q5_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int k); +void dequantize_row_q5_1(const block_q5_1 * GGML_RESTRICT x, float * GGML_RESTRICT y, int k); +void dequantize_row_q8_0(const block_q8_0 * GGML_RESTRICT x, float * GGML_RESTRICT y, int k); +//void dequantize_row_q8_1(const block_q8_1 * GGML_RESTRICT x, float * GGML_RESTRICT y, int k); -void dequantize_row_q2_K(const block_q2_K * restrict x, float * restrict y, int k); -void dequantize_row_q3_K(const block_q3_K * restrict x, float * restrict y, int k); -void dequantize_row_q4_K(const block_q4_K * restrict x, float * restrict y, int k); -void dequantize_row_q5_K(const block_q5_K * restrict x, float * restrict y, int k); -void dequantize_row_q6_K(const block_q6_K * restrict x, float * restrict y, int k); -void dequantize_row_q8_K(const block_q8_K * restrict x, float * restrict y, int k); +void dequantize_row_q2_K(const block_q2_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int k); +void dequantize_row_q3_K(const block_q3_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int k); +void dequantize_row_q4_K(const block_q4_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int k); +void dequantize_row_q5_K(const block_q5_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int k); +void dequantize_row_q6_K(const block_q6_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int k); +void dequantize_row_q8_K(const block_q8_K * GGML_RESTRICT x, float * GGML_RESTRICT y, int k); // Dot product -void ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, const void * restrict vx, const void * restrict vy); -void ggml_vec_dot_q4_1_q8_1(int n, float * restrict s, const void * restrict vx, const void * restrict vy); -void ggml_vec_dot_q5_0_q8_0(int n, float * restrict s, const void * restrict vx, const void * restrict vy); -void ggml_vec_dot_q5_1_q8_1(int n, float * restrict s, const void * restrict vx, const void * restrict vy); -void ggml_vec_dot_q8_0_q8_0(int n, float * restrict s, const void * restrict vx, const void * restrict vy); +void ggml_vec_dot_q4_0_q8_0(int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy); +void ggml_vec_dot_q4_1_q8_1(int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy); +void ggml_vec_dot_q5_0_q8_0(int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy); +void ggml_vec_dot_q5_1_q8_1(int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy); +void ggml_vec_dot_q8_0_q8_0(int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy); -void ggml_vec_dot_q2_K_q8_K(int n, float * restrict s, const void * restrict vx, const void * restrict vy); -void ggml_vec_dot_q3_K_q8_K(int n, float * restrict s, const void * restrict vx, const void * restrict vy); -void ggml_vec_dot_q4_K_q8_K(int n, float * restrict s, const void * restrict vx, const void * restrict vy); -void ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, const void * restrict vx, const void * restrict vy); -void ggml_vec_dot_q6_K_q8_K(int n, float * restrict s, const void * restrict vx, const void * restrict vy); +void ggml_vec_dot_q2_K_q8_K(int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy); +void ggml_vec_dot_q3_K_q8_K(int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy); +void ggml_vec_dot_q4_K_q8_K(int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy); +void ggml_vec_dot_q5_K_q8_K(int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy); +void ggml_vec_dot_q6_K_q8_K(int n, float * GGML_RESTRICT s, const void * GGML_RESTRICT vx, const void * GGML_RESTRICT vy); diff --git a/ggml.c b/ggml.c index 6da65bd92..a39385fee 100644 --- a/ggml.c +++ b/ggml.c @@ -391,8 +391,8 @@ int64_t ggml_cycles_per_ms(void) { static const size_t CACHE_LINE_SIZE_F32 = CACHE_LINE_SIZE/sizeof(float); -static void ggml_vec_dot_f32(const int n, float * restrict s, const float * restrict x, const float * restrict y); -static void ggml_vec_dot_f16(const int n, float * restrict s, ggml_fp16_t * restrict x, ggml_fp16_t * restrict y); +static void ggml_vec_dot_f32(const int n, float * GGML_RESTRICT s, const float * GGML_RESTRICT x, const float * GGML_RESTRICT y); +static void ggml_vec_dot_f16(const int n, float * GGML_RESTRICT s, ggml_fp16_t * GGML_RESTRICT x, ggml_fp16_t * GGML_RESTRICT y); static const ggml_type_traits_t type_traits[GGML_TYPE_COUNT] = { [GGML_TYPE_I8] = { @@ -1142,7 +1142,7 @@ inline static void ggml_vec_neg_f32 (const int n, float * y, const float * x) inline static void ggml_vec_mul_f32 (const int n, float * z, const float * x, const float * y) { for (int i = 0; i < n; ++i) z[i] = x[i]*y[i]; } inline static void ggml_vec_div_f32 (const int n, float * z, const float * x, const float * y) { for (int i = 0; i < n; ++i) z[i] = x[i]/y[i]; } -static void ggml_vec_dot_f32(const int n, float * restrict s, const float * restrict x, const float * restrict y) { +static void ggml_vec_dot_f32(const int n, float * GGML_RESTRICT s, const float * GGML_RESTRICT x, const float * GGML_RESTRICT y) { #ifdef GGML_SIMD float sumf = 0.0f; const int np = (n & ~(GGML_F32_STEP - 1)); @@ -1179,7 +1179,7 @@ static void ggml_vec_dot_f32(const int n, float * restrict s, const float * rest *s = sumf; } -static void ggml_vec_dot_f16(const int n, float * restrict s, ggml_fp16_t * restrict x, ggml_fp16_t * restrict y) { +static void ggml_vec_dot_f16(const int n, float * GGML_RESTRICT s, ggml_fp16_t * GGML_RESTRICT x, ggml_fp16_t * GGML_RESTRICT y) { ggml_float sumf = 0.0; #if defined(GGML_SIMD) @@ -1217,10 +1217,10 @@ static void ggml_vec_dot_f16(const int n, float * restrict s, ggml_fp16_t * rest // compute GGML_VEC_DOT_UNROLL dot products at once // xs - x row stride in bytes -inline static void ggml_vec_dot_f16_unroll(const int n, const int xs, float * restrict s, void * restrict xv, ggml_fp16_t * restrict y) { +inline static void ggml_vec_dot_f16_unroll(const int n, const int xs, float * GGML_RESTRICT s, void * GGML_RESTRICT xv, ggml_fp16_t * GGML_RESTRICT y) { ggml_float sumf[GGML_VEC_DOT_UNROLL] = { 0.0 }; - ggml_fp16_t * restrict x[GGML_VEC_DOT_UNROLL]; + ggml_fp16_t * GGML_RESTRICT x[GGML_VEC_DOT_UNROLL]; for (int i = 0; i < GGML_VEC_DOT_UNROLL; ++i) { x[i] = (ggml_fp16_t *) ((char *) xv + i*xs); @@ -1270,7 +1270,7 @@ inline static void ggml_vec_dot_f16_unroll(const int n, const int xs, float * re } } -inline static void ggml_vec_mad_f32(const int n, float * restrict y, const float * restrict x, const float v) { +inline static void ggml_vec_mad_f32(const int n, float * GGML_RESTRICT y, const float * GGML_RESTRICT x, const float v) { #if defined(GGML_SIMD) const int np = (n & ~(GGML_F32_STEP - 1)); @@ -1302,10 +1302,10 @@ inline static void ggml_vec_mad_f32(const int n, float * restrict y, const float } // xs and vs are byte strides of x and v -inline static void ggml_vec_mad_f32_unroll(const int n, const int xs, const int vs, float * restrict y, const float * restrict xv, const float * restrict vv) { +inline static void ggml_vec_mad_f32_unroll(const int n, const int xs, const int vs, float * GGML_RESTRICT y, const float * GGML_RESTRICT xv, const float * GGML_RESTRICT vv) { - const float * restrict x[GGML_VEC_MAD_UNROLL]; - const float * restrict v[GGML_VEC_MAD_UNROLL]; + const float * GGML_RESTRICT x[GGML_VEC_MAD_UNROLL]; + const float * GGML_RESTRICT v[GGML_VEC_MAD_UNROLL]; for (int i = 0; i < GGML_VEC_MAD_UNROLL; ++i) { x[i] = (const float *) ((const char *) xv + i*xs); @@ -18337,7 +18337,7 @@ size_t ggml_quantize_q4_0(const float * src, void * dst, int n, int k, int64_t * const int nb = k / QK4_0; for (int b = 0; b < n; b += k) { - block_q4_0 * restrict y = (block_q4_0 *) dst + b/QK4_0; + block_q4_0 * GGML_RESTRICT y = (block_q4_0 *) dst + b/QK4_0; quantize_row_q4_0_reference(src + b, y, k); @@ -18360,7 +18360,7 @@ size_t ggml_quantize_q4_1(const float * src, void * dst, int n, int k, int64_t * const int nb = k / QK4_1; for (int b = 0; b < n; b += k) { - block_q4_1 * restrict y = (block_q4_1 *) dst + b/QK4_1; + block_q4_1 * GGML_RESTRICT y = (block_q4_1 *) dst + b/QK4_1; quantize_row_q4_1_reference(src + b, y, k); @@ -18383,7 +18383,7 @@ size_t ggml_quantize_q5_0(const float * src, void * dst, int n, int k, int64_t * const int nb = k / QK5_0; for (int b = 0; b < n; b += k) { - block_q5_0 * restrict y = (block_q5_0 *)dst + b/QK5_0; + block_q5_0 * GGML_RESTRICT y = (block_q5_0 *)dst + b/QK5_0; quantize_row_q5_0_reference(src + b, y, k); @@ -18413,7 +18413,7 @@ size_t ggml_quantize_q5_1(const float * src, void * dst, int n, int k, int64_t * const int nb = k / QK5_1; for (int b = 0; b < n; b += k) { - block_q5_1 * restrict y = (block_q5_1 *)dst + b/QK5_1; + block_q5_1 * GGML_RESTRICT y = (block_q5_1 *)dst + b/QK5_1; quantize_row_q5_1_reference(src + b, y, k); @@ -18443,7 +18443,7 @@ size_t ggml_quantize_q8_0(const float * src, void * dst, int n, int k, int64_t * const int nb = k / QK8_0; for (int b = 0; b < n; b += k) { - block_q8_0 * restrict y = (block_q8_0 *)dst + b/QK8_0; + block_q8_0 * GGML_RESTRICT y = (block_q8_0 *)dst + b/QK8_0; quantize_row_q8_0_reference(src + b, y, k); diff --git a/ggml.h b/ggml.h index beacdc8be..d5010e917 100644 --- a/ggml.h +++ b/ggml.h @@ -2217,8 +2217,8 @@ extern "C" { // Internal types and functions exposed for tests and benchmarks // -#ifdef __cplusplus -// restrict not standard in C++ +#if defined(__cplusplus) || defined(_MSC_VER) +// GGML_RESTRICT not standard in C++ #define GGML_RESTRICT #else #define GGML_RESTRICT restrict