From d5ab29757ebc59a30f03e408294ec20628a6374e Mon Sep 17 00:00:00 2001 From: Marcus Dunn <51931484+MarcusDunn@users.noreply.github.com> Date: Thu, 29 Feb 2024 00:17:23 -0800 Subject: [PATCH 01/26] llama : constified `llama_set_state_data`'s `src` (#5774) --- llama.cpp | 6 +++--- llama.h | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/llama.cpp b/llama.cpp index 30d5eb32d..62699ce52 100644 --- a/llama.cpp +++ b/llama.cpp @@ -12545,8 +12545,8 @@ size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dst) { } // Sets the state reading from the specified source address -size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) { - uint8_t * inp = src; +size_t llama_set_state_data(struct llama_context * ctx, const uint8_t * src) { + const uint8_t * inp = src; // set rng { @@ -12555,7 +12555,7 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) { GGML_ASSERT(rng_size <= LLAMA_MAX_RNG_STATE); - std::string rng_str((char *)inp, rng_size); inp += rng_size; + std::string rng_str((const char *)inp, rng_size); inp += rng_size; std::istringstream rng_ss(rng_str); rng_ss >> ctx->rng; diff --git a/llama.h b/llama.h index a6823bb2b..4d0ebe37d 100644 --- a/llama.h +++ b/llama.h @@ -575,7 +575,7 @@ extern "C" { // Returns the number of bytes read LLAMA_API size_t llama_set_state_data( struct llama_context * ctx, - uint8_t * src); + const uint8_t * src); // Save/load session file LLAMA_API bool llama_load_session_file( From 052051d8ae4639a1c3c61e7da3237bcc572469d4 Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Thu, 29 Feb 2024 21:42:11 +0100 Subject: [PATCH 02/26] Server: normalize naming (#5779) * server: normalize naming * fix spacing --- examples/server/server.cpp | 370 ++++++++++++++++--------------------- examples/server/utils.hpp | 186 ++++++++++++------- 2 files changed, 277 insertions(+), 279 deletions(-) diff --git a/examples/server/server.cpp b/examples/server/server.cpp index 080fa9bd5..bf20e0cf1 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -33,8 +33,7 @@ using json = nlohmann::json; -struct server_params -{ +struct server_params { std::string hostname = "127.0.0.1"; std::vector api_keys; std::string public_path = "examples/server/public"; @@ -49,103 +48,50 @@ struct server_params bool server_verbose = false; bool server_log_json = true; -static size_t common_part(const std::vector &a, const std::vector &b) -{ - size_t i; - for (i = 0; i < a.size() && i < b.size() && a[i] == b[i]; i++) - { - } - return i; -} - -enum stop_type -{ +enum stop_type { STOP_FULL, STOP_PARTIAL, }; -static bool ends_with(const std::string &str, const std::string &suffix) -{ - return str.size() >= suffix.size() && - 0 == str.compare(str.size() - suffix.size(), suffix.size(), suffix); -} +// TODO: can become bool if we can't find use of more states +enum slot_state { + IDLE, + PROCESSING, +}; -static size_t find_partial_stop_string(const std::string &stop, - const std::string &text) -{ - if (!text.empty() && !stop.empty()) - { - const char text_last_char = text.back(); - for (int64_t char_index = stop.size() - 1; char_index >= 0; char_index--) - { - if (stop[char_index] == text_last_char) - { - const std::string current_partial = stop.substr(0, char_index + 1); - if (ends_with(text, current_partial)) - { - return text.size() - char_index - 1; - } - } - } - } - return std::string::npos; -} +enum slot_command { + NONE, + LOAD_PROMPT, + RELEASE, +}; -// TODO: reuse llama_detokenize -template -static std::string 
tokens_to_str(llama_context *ctx, Iter begin, Iter end) -{ - std::string ret; - for (; begin != end; ++begin) - { - ret += llama_token_to_piece(ctx, *begin); - } - return ret; -} +struct slot_params { + bool stream = true; + bool cache_prompt = false; // remember the prompt to avoid reprocessing all prompt -// format incomplete utf-8 multibyte character for output -static std::string tokens_to_output_formatted_string(const llama_context *ctx, const llama_token token) -{ - std::string out = token == -1 ? "" : llama_token_to_piece(ctx, token); - // if the size is 1 and first bit is 1, meaning it's a partial character - // (size > 1 meaning it's already a known token) - if (out.size() == 1 && (out[0] & 0x80) == 0x80) - { - std::stringstream ss; - ss << std::hex << (out[0] & 0xff); - std::string res(ss.str()); - out = "byte: \\x" + res; - } - return out; -} + uint32_t seed = -1; // RNG seed + int32_t n_keep = 0; // number of tokens to keep from initial prompt + int32_t n_predict = -1; // new tokens to predict -// convert a vector of completion_token_output to json -static json probs_vector_to_json(const llama_context *ctx, const std::vector &probs) -{ - json out = json::array(); - for (const auto &prob : probs) - { - json probs_for_token = json::array(); - for (const auto &p : prob.probs) - { - std::string tok_str = tokens_to_output_formatted_string(ctx, p.tok); - probs_for_token.push_back(json - { - {"tok_str", tok_str}, - {"prob", p.prob}, - }); - } - std::string tok_str = tokens_to_output_formatted_string(ctx, prob.tok); - out.push_back(json{ - {"content", tok_str}, - {"probs", probs_for_token}, - }); - } - return out; -} + std::vector antiprompt; -struct llama_client_slot -{ + json input_prefix; + json input_suffix; +}; + +struct slot_image { + int32_t id; + + bool request_encode_image = false; + float * image_embedding = nullptr; + int32_t image_tokens = 0; + + clip_image_u8 * img_data; + + std::string prefix_prompt; // before of this image +}; + +struct server_slot { int id; int task_id = -1; @@ -165,8 +111,8 @@ struct llama_client_slot int32_t i_batch = -1; int32_t n_predict = -1; - int32_t num_prompt_tokens = 0; - int32_t num_prompt_tokens_processed = 0; + int32_t n_prompt_tokens = 0; + int32_t n_prompt_tokens_processed = 0; json prompt; std::string generated_text; @@ -201,8 +147,8 @@ struct llama_client_slot std::vector images; // stats - size_t sent_count = 0; - size_t sent_token_probs_index = 0; + size_t n_sent_text = 0; // number of sent text character + size_t n_sent_token_probs = 0; int64_t t_start_process_prompt; int64_t t_start_genereration; @@ -214,7 +160,7 @@ struct llama_client_slot int multitask_id = -1; void reset() { - num_prompt_tokens = 0; + n_prompt_tokens = 0; generated_text = ""; truncated = false; stopped_eos = false; @@ -222,16 +168,15 @@ struct llama_client_slot stopped_limit = false; stopping_word = ""; n_past = 0; - sent_count = 0; - sent_token_probs_index = 0; + n_sent_text = 0; + n_sent_token_probs = 0; infill = false; ga_i = 0; n_past_se = 0; generated_token_probs.clear(); - for (slot_image & img : images) - { + for (slot_image & img : images) { free(img.image_embedding); if (img.img_data) { clip_image_u8_free(img.img_data); @@ -243,19 +188,15 @@ struct llama_client_slot } bool has_budget(gpt_params &global_params) { - if (params.n_predict == -1 && global_params.n_predict == -1) - { + if (params.n_predict == -1 && global_params.n_predict == -1) { return true; // limitless } n_remaining = -1; - if (params.n_predict != -1) - { + if (params.n_predict != -1) { 
n_remaining = params.n_predict - n_decoded; - } - else if (global_params.n_predict != -1) - { + } else if (global_params.n_predict != -1) { n_remaining = global_params.n_predict - n_decoded; } @@ -271,8 +212,7 @@ struct llama_client_slot } void add_token_string(const completion_token_output &token) { - if (command == RELEASE) - { + if (command == RELEASE) { return; } cache_tokens.push_back(token.tok); @@ -290,10 +230,10 @@ struct llama_client_slot json get_formated_timings() { return json { - {"prompt_n", num_prompt_tokens_processed}, + {"prompt_n", n_prompt_tokens_processed}, {"prompt_ms", t_prompt_processing}, - {"prompt_per_token_ms", t_prompt_processing / num_prompt_tokens_processed}, - {"prompt_per_second", 1e3 / t_prompt_processing * num_prompt_tokens_processed}, + {"prompt_per_token_ms", t_prompt_processing / n_prompt_tokens_processed}, + {"prompt_per_second", 1e3 / t_prompt_processing * n_prompt_tokens_processed}, {"predicted_n", n_decoded}, {"predicted_ms", t_token_generation}, @@ -304,18 +244,18 @@ struct llama_client_slot void print_timings() const { char buffer[512]; - double t_token = t_prompt_processing / num_prompt_tokens_processed; - double n_tokens_second = 1e3 / t_prompt_processing * num_prompt_tokens_processed; + double t_token = t_prompt_processing / n_prompt_tokens_processed; + double n_tokens_second = 1e3 / t_prompt_processing * n_prompt_tokens_processed; sprintf(buffer, "prompt eval time = %10.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)", - t_prompt_processing, num_prompt_tokens_processed, + t_prompt_processing, n_prompt_tokens_processed, t_token, n_tokens_second); LOG_INFO(buffer, { - {"slot_id", id}, - {"task_id", task_id}, - {"t_prompt_processing", t_prompt_processing}, - {"num_prompt_tokens_processed", num_prompt_tokens_processed}, - {"t_token", t_token}, - {"n_tokens_second", n_tokens_second}, + {"slot_id", id}, + {"task_id", task_id}, + {"t_prompt_processing", t_prompt_processing}, + {"n_prompt_tokens_processed", n_prompt_tokens_processed}, + {"t_token", t_token}, + {"n_tokens_second", n_tokens_second}, }); t_token = t_token_generation / n_decoded; @@ -343,7 +283,7 @@ struct llama_client_slot } }; -struct llama_metrics { +struct server_metrics { uint64_t n_prompt_tokens_processed_total = 0; uint64_t n_tokens_predicted_total = 0; @@ -354,18 +294,16 @@ struct llama_metrics { uint64_t t_tokens_generation = 0; - void on_prompt_eval(const llama_client_slot &slot) { - n_prompt_tokens_processed_total += slot.num_prompt_tokens_processed; - - n_prompt_tokens_processed += slot.num_prompt_tokens_processed; - t_prompt_processing += slot.t_prompt_processing; + void on_prompt_eval(const server_slot &slot) { + n_prompt_tokens_processed_total += slot.n_prompt_tokens_processed; + n_prompt_tokens_processed += slot.n_prompt_tokens_processed; + t_prompt_processing += slot.t_prompt_processing; } - void on_prediction(const llama_client_slot &slot) { + void on_prediction(const server_slot &slot) { n_tokens_predicted_total += slot.n_decoded; - - n_tokens_predicted += slot.n_decoded; - t_tokens_generation += slot.t_token_generation; + n_tokens_predicted += slot.n_decoded; + t_tokens_generation += slot.t_token_generation; } void reset_bucket() { @@ -404,13 +342,13 @@ struct llama_server_context std::string name_assistant; // slots / clients - std::vector slots; + std::vector slots; json default_generation_settings_for_props; - llama_server_queue queue_tasks; + llama_server_queue queue_tasks; llama_server_response queue_results; - llama_metrics metrics; + 
server_metrics metrics; ~llama_server_context() { @@ -487,7 +425,7 @@ struct llama_server_context LOG_INFO("initializing slots", {{"n_slots", params.n_parallel}}); for (int i = 0; i < params.n_parallel; i++) { - llama_client_slot slot; + server_slot slot; slot.id = i; slot.n_ctx = n_ctx_slot; @@ -579,11 +517,11 @@ struct llama_server_context return prompt_tokens; } - llama_client_slot* get_slot(int id) { + server_slot* get_slot(int id) { int64_t t_last = ggml_time_us(); - llama_client_slot *last_used = nullptr; + server_slot *last_used = nullptr; - for (llama_client_slot & slot : slots) + for (server_slot & slot : slots) { if (slot.id == id && slot.available()) { @@ -600,7 +538,7 @@ struct llama_server_context return last_used; } - bool launch_slot_with_data(llama_client_slot* &slot, json data) { + bool launch_slot_with_data(server_slot* &slot, json data) { slot_params default_params; llama_sampling_params default_sparams; @@ -888,7 +826,7 @@ struct llama_server_context clean_kv_cache = false; } - void update_system_prompt() { + void system_prompt_update() { kv_cache_clear(); system_tokens.clear(); @@ -933,9 +871,9 @@ struct llama_server_context system_need_update = false; } - void notify_system_prompt_changed() { + void system_prompt_notify() { // release all slots - for (llama_client_slot &slot : slots) + for (server_slot &slot : slots) { slot.release(); } @@ -943,17 +881,17 @@ struct llama_server_context system_need_update = true; } - void process_system_prompt_data(const json &sys_props) { + void system_prompt_process(const json &sys_props) { system_prompt = sys_props.value("prompt", ""); name_user = sys_props.value("anti_prompt", ""); name_assistant = sys_props.value("assistant_name", ""); - notify_system_prompt_changed(); + system_prompt_notify(); } static size_t find_stopping_strings(const std::string &text, const size_t last_token_size, - const stop_type type, llama_client_slot &slot) + const stop_type type, server_slot &slot) { size_t stop_pos = std::string::npos; @@ -975,8 +913,8 @@ struct llama_server_context { if (type == STOP_FULL) { - slot.stopped_word = true; - slot.stopping_word = word; + slot.stopped_word = true; + slot.stopping_word = word; slot.has_next_token = false; } stop_pos = pos; @@ -986,7 +924,7 @@ struct llama_server_context return stop_pos; } - bool process_token(completion_token_output &result, llama_client_slot &slot) { + bool process_token(completion_token_output &result, server_slot &slot) { // remember which tokens were sampled - used for repetition penalties during sampling const std::string token_str = llama_token_to_piece(ctx, result.tok); slot.sampled = result.tok; @@ -1032,7 +970,7 @@ struct llama_server_context if (!incomplete) { - size_t pos = std::min(slot.sent_count, slot.generated_text.size()); + size_t pos = std::min(slot.n_sent_text, slot.generated_text.size()); const std::string str_test = slot.generated_text.substr(pos); bool is_stop_full = false; size_t stop_pos = find_stopping_strings(str_test, token_str.size(), STOP_FULL, slot); @@ -1042,7 +980,7 @@ struct llama_server_context slot.generated_text.erase( slot.generated_text.begin() + pos + stop_pos, slot.generated_text.end()); - pos = std::min(slot.sent_count, slot.generated_text.size()); + pos = std::min(slot.n_sent_text, slot.generated_text.size()); } else { @@ -1055,7 +993,7 @@ struct llama_server_context { // no send the stop word in the response result.text_to_send = slot.generated_text.substr(pos, std::string::npos); - slot.sent_count += result.text_to_send.size(); + slot.n_sent_text 
+= result.text_to_send.size(); // add the token to slot queue and cache } slot.add_token_string(result); @@ -1099,7 +1037,7 @@ struct llama_server_context return slot.has_next_token; // continue } - bool process_images(llama_client_slot &slot) const + bool process_images(server_slot &slot) const { for (slot_image &img : slot.images) { @@ -1132,7 +1070,7 @@ struct llama_server_context queue_results.send(res); } - json get_formated_generation(llama_client_slot &slot) + json get_formated_generation(server_slot &slot) { const auto eos_bias = slot.sparams.logit_bias.find(llama_token_eos(model)); const bool ignore_eos = eos_bias != slot.sparams.logit_bias.end() && @@ -1179,7 +1117,7 @@ struct llama_server_context }; } - void send_partial_response(llama_client_slot &slot, completion_token_output tkn) + void send_partial_response(server_slot &slot, completion_token_output tkn) { task_result res; res.id = slot.task_id; @@ -1199,13 +1137,13 @@ struct llama_server_context { std::vector probs_output = {}; const std::vector to_send_toks = llama_tokenize(ctx, tkn.text_to_send, false); - size_t probs_pos = std::min(slot.sent_token_probs_index, slot.generated_token_probs.size()); - size_t probs_stop_pos = std::min(slot.sent_token_probs_index + to_send_toks.size(), slot.generated_token_probs.size()); + size_t probs_pos = std::min(slot.n_sent_token_probs, slot.generated_token_probs.size()); + size_t probs_stop_pos = std::min(slot.n_sent_token_probs + to_send_toks.size(), slot.generated_token_probs.size()); if (probs_pos < probs_stop_pos) { probs_output = std::vector(slot.generated_token_probs.begin() + probs_pos, slot.generated_token_probs.begin() + probs_stop_pos); } - slot.sent_token_probs_index = probs_stop_pos; + slot.n_sent_token_probs = probs_stop_pos; res.result_json["completion_probabilities"] = probs_vector_to_json(ctx, probs_output); } @@ -1218,7 +1156,7 @@ struct llama_server_context queue_results.send(res); } - void send_final_response(llama_client_slot &slot) + void send_final_response(server_slot &slot) { task_result res; res.id = slot.task_id; @@ -1233,7 +1171,7 @@ struct llama_server_context {"stop", true}, {"model", params.model_alias}, {"tokens_predicted", slot.n_decoded}, - {"tokens_evaluated", slot.num_prompt_tokens}, + {"tokens_evaluated", slot.n_prompt_tokens}, {"generation_settings", get_formated_generation(slot)}, {"prompt", slot.prompt}, {"truncated", slot.truncated}, @@ -1271,7 +1209,7 @@ struct llama_server_context queue_results.send(res); } - void send_embedding(llama_client_slot &slot) + void send_embedding(server_slot &slot) { task_result res; res.id = slot.task_id; @@ -1282,9 +1220,7 @@ struct llama_server_context const int n_embd = llama_n_embd(model); if (!params.embedding) { - LOG_WARNING("embedding disabled", { - {"params.embedding", params.embedding}, - }); + LOG_WARNING("embedding disabled", {{"params.embedding", params.embedding}}); res.result_json = json { {"embedding", std::vector(n_embd, 0.0f)}, @@ -1296,7 +1232,7 @@ struct llama_server_context std::vector embedding(data, data + n_embd); res.result_json = json { - {"embedding", embedding }, + {"embedding", embedding}, }; } queue_results.send(res); @@ -1345,7 +1281,7 @@ struct llama_server_context } // for multiple images processing - bool ingest_images(llama_client_slot &slot, int n_batch) + bool ingest_images(server_slot &slot, int n_batch) { int image_idx = 0; @@ -1384,7 +1320,17 @@ struct llama_server_context } const int n_embd = llama_n_embd(model); - llama_batch batch_img = { n_eval, nullptr, 
(img.image_embedding + i * n_embd), nullptr, nullptr, nullptr, nullptr, slot.n_past, 1, 0, }; + llama_batch batch_img = { + n_eval, + nullptr, + (img.image_embedding + i * n_embd), + nullptr, + nullptr, + nullptr, + nullptr, + slot.n_past, + 1, 0 + }; if (llama_decode(ctx, batch_img)) { LOG_TEE("%s : failed to eval image\n", __func__); @@ -1454,7 +1400,7 @@ struct llama_server_context switch (task.type) { case TASK_TYPE_COMPLETION: { - llama_client_slot *slot = get_slot(json_value(task.data, "slot_id", -1)); + server_slot *slot = get_slot(json_value(task.data, "slot_id", -1)); if (slot == nullptr) { // if no slot is available, we defer this task for processing later @@ -1469,10 +1415,10 @@ struct llama_server_context send_error(task, "system prompt can only be updated when all slots are idle"); break; } - process_system_prompt_data(task.data["system_prompt"]); + system_prompt_process(task.data["system_prompt"]); // reset cache_tokens for all slots - for (llama_client_slot &slot : slots) + for (server_slot &slot : slots) { slot.cache_tokens.clear(); slot.n_past = 0; @@ -1512,20 +1458,20 @@ struct llama_server_context int n_idle_slots = 0; int n_processing_slots = 0; - for (llama_client_slot &slot: slots) { + for (server_slot &slot: slots) { json slot_data = get_formated_generation(slot); slot_data["id"] = slot.id; slot_data["task_id"] = slot.task_id; slot_data["state"] = slot.state; slot_data["prompt"] = slot.prompt; slot_data["next_token"] = { - {"has_next_token", slot.has_next_token}, - {"n_remain", slot.n_remaining}, + {"has_next_token", slot.has_next_token}, + {"n_remain", slot.n_remaining}, {"num_tokens_predicted", slot.n_decoded}, - {"stopped_eos", slot.stopped_eos}, - {"stopped_word", slot.stopped_word}, - {"stopped_limit", slot.stopped_limit}, - {"stopping_word", slot.stopping_word}, + {"stopped_eos", slot.stopped_eos}, + {"stopped_word", slot.stopped_word}, + {"stopped_limit", slot.stopped_limit}, + {"stopping_word", slot.stopping_word}, }; if (slot_data["state"] == IDLE) { n_idle_slots++; @@ -1563,10 +1509,10 @@ struct llama_server_context { "n_tokens_predicted", metrics.n_tokens_predicted}, { "t_tokens_generation", metrics.t_tokens_generation}, - { "kv_cache_tokens_count", llama_get_kv_cache_token_count(ctx)}, - { "kv_cache_used_cells", llama_get_kv_cache_used_cells(ctx)}, + { "kv_cache_tokens_count", llama_get_kv_cache_token_count(ctx)}, + { "kv_cache_used_cells", llama_get_kv_cache_used_cells(ctx)}, - { "slots", slots_data }, + { "slots", slots_data }, }; metrics.reset_bucket(); queue_results.send(res); @@ -1597,7 +1543,7 @@ struct llama_server_context if (system_need_update) { LOG_INFO("updating system prompt", {}); - update_system_prompt(); + system_prompt_update(); } llama_batch_clear(batch); @@ -1618,7 +1564,7 @@ struct llama_server_context task.target_id = -1; queue_tasks.post(task); - for (llama_client_slot &slot : slots) + for (server_slot &slot : slots) { if (slot.ga_n == 1) { @@ -1754,45 +1700,50 @@ struct llama_server_context prompt_tokens = tokenize(slot.prompt, system_prompt.empty() && add_bos_token); // add BOS if there isn't system prompt } - slot.num_prompt_tokens = prompt_tokens.size(); + slot.n_prompt_tokens = prompt_tokens.size(); if (slot.params.n_keep < 0) { - slot.params.n_keep = slot.num_prompt_tokens; + slot.params.n_keep = slot.n_prompt_tokens; } slot.params.n_keep = std::min(slot.n_ctx - 4, slot.params.n_keep); // if input prompt is too big, truncate it - if (slot.num_prompt_tokens >= slot.n_ctx) + if (slot.n_prompt_tokens >= slot.n_ctx) { const int 
n_left = slot.n_ctx - slot.params.n_keep; const int n_block_size = n_left / 2; - const int erased_blocks = (slot.num_prompt_tokens - slot.params.n_keep - n_block_size) / n_block_size; + const int erased_blocks = (slot.n_prompt_tokens - slot.params.n_keep - n_block_size) / n_block_size; - std::vector new_tokens(prompt_tokens.begin(), prompt_tokens.begin() + slot.params.n_keep); - new_tokens.insert(new_tokens.end(), prompt_tokens.begin() + slot.params.n_keep + erased_blocks * n_block_size, prompt_tokens.end()); + std::vector new_tokens( + prompt_tokens.begin(), + prompt_tokens.begin() + slot.params.n_keep); + new_tokens.insert( + new_tokens.end(), + prompt_tokens.begin() + slot.params.n_keep + erased_blocks * n_block_size, + prompt_tokens.end()); LOG_VERBOSE("input truncated", { - {"n_ctx", slot.n_ctx}, - {"n_keep", slot.params.n_keep}, - {"n_left", n_left}, + {"n_ctx", slot.n_ctx}, + {"n_keep", slot.params.n_keep}, + {"n_left", n_left}, {"new_tokens", tokens_to_str(ctx, new_tokens.cbegin(), new_tokens.cend())}, }); slot.truncated = true; prompt_tokens = new_tokens; - slot.num_prompt_tokens = prompt_tokens.size(); - GGML_ASSERT(slot.num_prompt_tokens < slot.n_ctx); + slot.n_prompt_tokens = prompt_tokens.size(); + GGML_ASSERT(slot.n_prompt_tokens < slot.n_ctx); } if (!slot.params.cache_prompt) { llama_sampling_reset(slot.ctx_sampling); - slot.n_past = 0; + slot.n_past = 0; slot.n_past_se = 0; - slot.ga_i = 0; - slot.num_prompt_tokens_processed = slot.num_prompt_tokens; + slot.ga_i = 0; + slot.n_prompt_tokens_processed = slot.n_prompt_tokens; } else { @@ -1811,7 +1762,7 @@ struct llama_server_context slot.n_past -= 1; } - slot.num_prompt_tokens_processed = slot.num_prompt_tokens - slot.n_past; + slot.n_prompt_tokens_processed = slot.n_prompt_tokens - slot.n_past; if (slot.ga_n != 1) { @@ -1836,13 +1787,13 @@ struct llama_server_context { "slot_id", slot.id }, { "task_id", slot.task_id }, { "n_past", slot.n_past }, - { "num_prompt_tokens_processed", slot.num_prompt_tokens_processed } + { "n_prompt_tokens_processed", slot.n_prompt_tokens_processed } }); } slot.cache_tokens = prompt_tokens; - if (slot.n_past == slot.num_prompt_tokens && slot.n_past > 0) + if (slot.n_past == slot.n_prompt_tokens && slot.n_past > 0) { // we have to evaluate at least 1 token to generate logits. 
LOG_INFO("we have to evaluate at least 1 token to generate logits", { @@ -1898,8 +1849,8 @@ struct llama_server_context if (has_images && !ingest_images(slot, n_batch)) { LOG_ERROR("failed processing images", { - "slot_id", slot.id, - "task_id", slot.task_id, + {"slot_id", slot.id}, + {"task_id", slot.task_id}, }); // FIXME @phymbert: to be properly tested // early returning without changing the slot state will block the slot for ever @@ -2049,10 +2000,6 @@ struct llama_server_context LOG_VERBOSE("slots updated", {}); return true; } - - void run_on_all_tasks_finished() { - update_slots(); - } }; static void server_print_usage(const char *argv0, const gpt_params ¶ms, @@ -2561,7 +2508,7 @@ static void server_params_parse(int argc, char **argv, server_params &sparams, std::istreambuf_iterator(), std::back_inserter(systm_content) ); - llama.process_system_prompt_data(json::parse(systm_content)); + llama.system_prompt_process(json::parse(systm_content)); } else if (arg == "-ctk" || arg == "--cache-type-k") { params.cache_type_k = argv[++i]; @@ -2692,7 +2639,7 @@ static void server_params_parse(int argc, char **argv, server_params &sparams, /* llama.cpp completion api semantics */ static json format_partial_response( - llama_server_context &llama, llama_client_slot *slot, const std::string &content, const std::vector &probs + llama_server_context &llama, server_slot *slot, const std::string &content, const std::vector &probs ) { json res = json { @@ -2748,14 +2695,7 @@ static void log_server_request(const httplib::Request &req, const httplib::Respo }); } -struct token_translator -{ - llama_context * ctx; - std::string operator()(llama_token tok) const { return llama_token_to_piece(ctx, tok); } - std::string operator()(const completion_token_output &cto) const { return (*this)(cto.tok); } -}; - -static void append_to_generated_text_from_generated_token_probs(llama_server_context &llama, llama_client_slot *slot) +static void append_to_generated_text_from_generated_token_probs(llama_server_context &llama, server_slot *slot) { auto & gtps = slot->generated_token_probs; auto translator = token_translator{llama.ctx}; @@ -3526,8 +3466,8 @@ int main(int argc, char **argv) &llama_server_context::process_single_task, &llama, std::placeholders::_1)); llama.queue_tasks.on_finish_multitask(std::bind( &llama_server_context::on_finish_multitask, &llama, std::placeholders::_1)); - llama.queue_tasks.on_all_tasks_finished(std::bind( - &llama_server_context::run_on_all_tasks_finished, &llama)); + llama.queue_tasks.on_run_slots(std::bind( + &llama_server_context::update_slots, &llama)); llama.queue_results.on_multitask_update(std::bind( &llama_server_queue::update_multitask, &llama.queue_tasks, diff --git a/examples/server/utils.hpp b/examples/server/utils.hpp index d7abd7cbb..d98541f26 100644 --- a/examples/server/utils.hpp +++ b/examples/server/utils.hpp @@ -37,10 +37,6 @@ extern bool server_log_json; #define LOG_WARNING(MSG, ...) server_log("WARN", __func__, __LINE__, MSG, __VA_ARGS__) #define LOG_INFO( MSG, ...) 
server_log("INFO", __func__, __LINE__, MSG, __VA_ARGS__) -// -// parallel -// - enum server_state { SERVER_STATE_LOADING_MODEL, // Server is starting up, model not fully loaded yet SERVER_STATE_READY, // Server is ready and model is loaded @@ -78,51 +74,8 @@ struct task_multi { std::vector results{}; }; -// TODO: can become bool if we can't find use of more states -enum slot_state -{ - IDLE, - PROCESSING, -}; - -enum slot_command -{ - NONE, - LOAD_PROMPT, - RELEASE, -}; - -struct slot_params -{ - bool stream = true; - bool cache_prompt = false; // remember the prompt to avoid reprocessing all prompt - - uint32_t seed = -1; // RNG seed - int32_t n_keep = 0; // number of tokens to keep from initial prompt - int32_t n_predict = -1; // new tokens to predict - - std::vector antiprompt; - - json input_prefix; - json input_suffix; -}; - -struct slot_image -{ - int32_t id; - - bool request_encode_image = false; - float * image_embedding = nullptr; - int32_t image_tokens = 0; - - clip_image_u8 * img_data; - - std::string prefix_prompt; // before of this image -}; - // completion token output with probabilities -struct completion_token_output -{ +struct completion_token_output { struct token_prob { llama_token tok; @@ -134,8 +87,13 @@ struct completion_token_output std::string text_to_send; }; -static inline void server_log(const char *level, const char *function, int line, const char *message, const nlohmann::ordered_json &extra) -{ +struct token_translator { + llama_context * ctx; + std::string operator()(llama_token tok) const { return llama_token_to_piece(ctx, tok); } + std::string operator()(const completion_token_output &cto) const { return (*this)(cto.tok); } +}; + +static inline void server_log(const char *level, const char *function, int line, const char *message, const nlohmann::ordered_json &extra) { std::stringstream ss_tid; ss_tid << std::this_thread::get_id(); json log = nlohmann::ordered_json{ @@ -183,8 +141,7 @@ static inline void server_log(const char *level, const char *function, int line, // template -static T json_value(const json &body, const std::string &key, const T &default_value) -{ +static T json_value(const json &body, const std::string &key, const T &default_value) { // Fallback null to default value return body.contains(key) && !body.at(key).is_null() ? body.value(key, default_value) @@ -200,8 +157,7 @@ inline bool verify_custom_template(const std::string & tmpl) { } // Format given chat. 
If tmpl is empty, we take the template from model metadata -inline std::string format_chat(const struct llama_model * model, const std::string & tmpl, const std::vector & messages) -{ +inline std::string format_chat(const struct llama_model * model, const std::string & tmpl, const std::vector & messages) { size_t alloc_size = 0; // vector holding all allocated string to be passed to llama_chat_apply_template std::vector str(messages.size() * 2); @@ -250,7 +206,7 @@ struct llama_server_queue { // callback functions std::function callback_new_task; std::function callback_finish_multitask; - std::function callback_all_task_finished; + std::function callback_run_slots; // Add a new task to the end of the queue int post(task_server task) { @@ -283,14 +239,14 @@ struct llama_server_queue { callback_new_task = callback; } - // Register function to process a multitask + // Register function to process a multitask when it is finished void on_finish_multitask(std::function callback) { callback_finish_multitask = callback; } - // Register the function to be called when the batch of tasks is finished - void on_all_tasks_finished(std::function callback) { - callback_all_task_finished = callback; + // Register the function to be called when all slots data is ready to be processed + void on_run_slots(std::function callback) { + callback_run_slots = callback; } // Call when the state of one slot is changed @@ -312,7 +268,13 @@ struct llama_server_queue { condition_tasks.notify_all(); } - // Start the main loop. + /** + * Main loop consists of these steps: + * - Wait until a new task arrives + * - Process the task (i.e. maybe copy data into slot) + * - Check if multitask is finished + * - Run all slots + */ void start_loop() { running = true; while (true) { @@ -331,8 +293,8 @@ struct llama_server_queue { LOG_VERBOSE("callback_new_task", {{"task_id", task.id}}); callback_new_task(task); } - LOG_VERBOSE("callback_all_task_finished", {}); - // process and update all the multitasks + LOG_VERBOSE("update_multitasks", {}); + // check if we have any finished multitasks auto queue_iterator = queue_multitasks.begin(); while (queue_iterator != queue_multitasks.end()) { @@ -349,8 +311,9 @@ struct llama_server_queue { ++queue_iterator; } } - // all tasks in the current loop is finished - callback_all_task_finished(); + // all tasks in the current loop is processed, slots data is now ready + LOG_VERBOSE("callback_run_slots", {}); + callback_run_slots(); } LOG_VERBOSE("wait for new task", {}); // wait for new task @@ -408,12 +371,14 @@ struct llama_server_response { std::mutex mutex_results; std::condition_variable condition_results; + // add the task_id to the list of tasks waiting for response void add_waiting_task_id(int task_id) { LOG_VERBOSE("waiting for task id", {{"task_id", task_id}}); std::unique_lock lock(mutex_results); waiting_task_ids.insert(task_id); } + // when the request is finished, we can remove task associated with it void remove_waiting_task_id(int task_id) { LOG_VERBOSE("remove waiting for task id", {{"task_id", task_id}}); std::unique_lock lock(mutex_results); @@ -574,3 +539,96 @@ static std::string gen_chatcmplid() chatcmplid << "chatcmpl-" << random_string(); return chatcmplid.str(); } + +// +// other common utils +// + +static size_t common_part(const std::vector &a, const std::vector &b) +{ + size_t i; + for (i = 0; i < a.size() && i < b.size() && a[i] == b[i]; i++) + { + } + return i; +} + +static bool ends_with(const std::string &str, const std::string &suffix) +{ + return str.size() >= 
suffix.size() && + 0 == str.compare(str.size() - suffix.size(), suffix.size(), suffix); +} + +static size_t find_partial_stop_string(const std::string &stop, + const std::string &text) +{ + if (!text.empty() && !stop.empty()) + { + const char text_last_char = text.back(); + for (int64_t char_index = stop.size() - 1; char_index >= 0; char_index--) + { + if (stop[char_index] == text_last_char) + { + const std::string current_partial = stop.substr(0, char_index + 1); + if (ends_with(text, current_partial)) + { + return text.size() - char_index - 1; + } + } + } + } + return std::string::npos; +} + +// TODO: reuse llama_detokenize +template +static std::string tokens_to_str(llama_context *ctx, Iter begin, Iter end) +{ + std::string ret; + for (; begin != end; ++begin) + { + ret += llama_token_to_piece(ctx, *begin); + } + return ret; +} + +// format incomplete utf-8 multibyte character for output +static std::string tokens_to_output_formatted_string(const llama_context *ctx, const llama_token token) +{ + std::string out = token == -1 ? "" : llama_token_to_piece(ctx, token); + // if the size is 1 and first bit is 1, meaning it's a partial character + // (size > 1 meaning it's already a known token) + if (out.size() == 1 && (out[0] & 0x80) == 0x80) + { + std::stringstream ss; + ss << std::hex << (out[0] & 0xff); + std::string res(ss.str()); + out = "byte: \\x" + res; + } + return out; +} + +// convert a vector of completion_token_output to json +static json probs_vector_to_json(const llama_context *ctx, const std::vector &probs) +{ + json out = json::array(); + for (const auto &prob : probs) + { + json probs_for_token = json::array(); + for (const auto &p : prob.probs) + { + std::string tok_str = tokens_to_output_formatted_string(ctx, p.tok); + probs_for_token.push_back(json + { + {"tok_str", tok_str}, + {"prob", p.prob}, + }); + } + std::string tok_str = tokens_to_output_formatted_string(ctx, prob.tok); + out.push_back(json{ + {"content", tok_str}, + {"probs", probs_for_token}, + }); + } + return out; +} From 38d152160898b0173ffe4dc7df5daadcbd2eceb0 Mon Sep 17 00:00:00 2001 From: AidanBeltonS <87009434+AidanBeltonS@users.noreply.github.com> Date: Fri, 1 Mar 2024 07:36:47 +0000 Subject: [PATCH 03/26] [SYCL] Use batched mul_mat pathway (#5591) * Use batched mul_mat pathway * rm extra line * Explicitly state scaled data type --------- Co-authored-by: Abhilash Majumder <30946547+abhilash1910@users.noreply.github.com> --- ggml-sycl.cpp | 107 +++++++++++++++++++++----------------------------- 1 file changed, 44 insertions(+), 63 deletions(-) diff --git a/ggml-sycl.cpp b/ggml-sycl.cpp index a054ec8b9..6f391b0c6 100644 --- a/ggml-sycl.cpp +++ b/ggml-sycl.cpp @@ -12726,6 +12726,7 @@ static void ggml_sycl_op_mul_mat(const ggml_tensor *src0, GGML_ASSERT(dst->backend != GGML_BACKEND_TYPE_GPU_SPLIT); GGML_ASSERT(src1->backend != GGML_BACKEND_TYPE_GPU_SPLIT); + GGML_ASSERT(src1->type == GGML_TYPE_F32 || (src1->ne[2] == 1 && src1->ne[3] == 1)); GGML_ASSERT(ne12 >= ne02 && ne12 % ne02 == 0); @@ -13269,31 +13270,23 @@ static void k_compute_batched_ptrs(const sycl::half *src0_as_f16, int64_t i03 = i13 / r3; int64_t i02 = i12 / r2; - ptrs_src[0*ne23 + i12 + i13*ne12] = (const char *) src0_as_f16 + i02*nb02 + i03*nb03; - ptrs_src[1*ne23 + i12 + i13*ne12] = (const char *) src1_as_f16 + i12*nb12/2 + i13*nb13/2; - ptrs_dst[0*ne23 + i12 + i13*ne12] = ( char *) dst + i12*nbd2 + i13*nbd3; + ptrs_src[0*ne23 + i12 + i13*ne12] = (const char *) src0_as_f16 + i02*nb02 + i03*nb03; + ptrs_src[1*ne23 + i12 + i13*ne12] = (const 
char *) src1_as_f16 + i12*nb12 + i13*nb13; + ptrs_dst[0*ne23 + i12 + i13*ne12] = ( char *) dst + i12*nbd2 + i13*nbd3; } -static void ggml_sycl_mul_mat_mat_batched_sycl(const ggml_tensor *src0, - const ggml_tensor *src1, - ggml_tensor *dst) try { +static void ggml_sycl_mul_mat_batched_sycl(const ggml_tensor *src0, + const ggml_tensor *src1, + ggml_tensor *dst) try { GGML_ASSERT(!ggml_is_transposed(src0)); GGML_ASSERT(!ggml_is_transposed(src1)); GGML_ASSERT(src0->backend != GGML_BACKEND_TYPE_GPU_SPLIT); GGML_ASSERT(src0->type == GGML_TYPE_F16); - GGML_ASSERT(src1->type == GGML_TYPE_F32); - GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne); + GGML_TENSOR_BINARY_OP_LOCALS - GGML_TENSOR_LOCALS(int64_t, nb0, src0, nb); - - GGML_TENSOR_LOCALS(int64_t, ne1, src1, ne); - - GGML_TENSOR_LOCALS(int64_t, nb1, src1, nb); - - const int64_t ne1 = ggml_nelements(src1); - const int64_t ne = ggml_nelements(dst); + const int64_t ne_dst = ggml_nelements(dst); SYCL_CHECK(ggml_sycl_set_device(g_main_device)); dpct::queue_ptr main_stream = g_syclStreams[g_main_device_index][0]; @@ -13312,11 +13305,16 @@ static void ggml_sycl_mul_mat_mat_batched_sycl(const ggml_tensor *src0, float * dst_ddf = (float *) dst_extra->data_device[g_main_device_index]; // convert src1 to fp16 - const to_fp16_sycl_t to_fp16_sycl = ggml_get_to_fp16_sycl(src1->type); - GGML_ASSERT(to_fp16_sycl != nullptr); - - sycl_pool_alloc src1_as_f16(ne1); - to_fp16_sycl(src1_ddf, src1_as_f16.get(), ne1, main_stream); + sycl_pool_alloc src1_f16_alloc; + if (src1->type != GGML_TYPE_F16) { + const to_fp16_sycl_t to_fp16_sycl = ggml_get_to_fp16_sycl(src1->type); + const int64_t ne_src1 = ggml_nelements(src1); + src1_f16_alloc.alloc(ne_src1); + GGML_ASSERT(to_fp16_sycl != nullptr); + to_fp16_sycl(src1_ddf, src1_f16_alloc.get(), ne_src1, main_stream); + } + sycl::half *src1_f16 = src1->type == GGML_TYPE_F16 ? (sycl::half *)src1_ddf + : src1_f16_alloc.get(); sycl_pool_alloc dst_f16; char * dst_t; @@ -13337,20 +13335,12 @@ static void ggml_sycl_mul_mat_mat_batched_sycl(const ggml_tensor *src0, const void * alpha = &alpha_f16; const void * beta = &beta_f16; - if (dst->op_params[0] == GGML_PREC_DEFAULT) { - dst_t = (char *) dst_f16.alloc(ne); + // TODO: Renable (dst->op_params[0] =! 
GGML_PREC_DEFAULT) pathway + // once oneMKL open source supports half, half, float, float: datatypes + dst_t = (char *) dst_f16.alloc(ne_dst); - nbd2 /= sizeof(float) / sizeof(sycl::half); - nbd3 /= sizeof(float) / sizeof(sycl::half); - } else { - dst_t = (char *) dst_ddf; - - cu_compute_type = dpct::library_data_t::real_float; - cu_data_type = dpct::library_data_t::real_float; - - alpha = &alpha_f32; - beta = &beta_f32; - } + nbd2 /= sizeof(float) / sizeof(sycl::half); + nbd3 /= sizeof(float) / sizeof(sycl::half); GGML_ASSERT(ne12 % ne02 == 0); GGML_ASSERT(ne13 % ne03 == 0); @@ -13386,10 +13376,10 @@ static void ggml_sycl_mul_mat_mat_batched_sycl(const ggml_tensor *src0, *g_sycl_handles[g_main_device_index], oneapi::mkl::transpose::trans, oneapi::mkl::transpose::nontrans, ne01, ne11, ne10, alpha, (const char *)src0_as_f16, dpct::library_data_t::real_half, - nb01 / sizeof(sycl::half), src0->nb[2] / sizeof(sycl::half), - (const char *)src1_as_f16.get(), dpct::library_data_t::real_half, - nb11 / sizeof(float), src1->nb[2] / sizeof(float), beta, - (char *)dst_t, cu_data_type, ne01, dst->nb[2] / sizeof(float), + nb01 / nb00, nb02 / nb00, + (const char *)src1_f16, dpct::library_data_t::real_half, + nb11 / nb10, nb12 / nb10, beta, + (char *)dst_t, cu_data_type, ne01, nb2 / nb0, ne12 * ne13, cu_compute_type))); } else { // use syclGemmBatchedEx @@ -13409,44 +13399,35 @@ static void ggml_sycl_mul_mat_mat_batched_sycl(const ggml_tensor *src0, {sycl::aspect::fp16}); main_stream->submit([&](sycl::handler &cgh) { - const sycl::half *src1_as_f16_get_ct1 = src1_as_f16.get(); - const void **ptrs_src_get_ct3 = ptrs_src.get(); - void **ptrs_dst_get_ct4 = ptrs_dst.get(); - + const void **ptrs_src_get = ptrs_src.get(); + void **ptrs_dst_get = ptrs_dst.get(); + size_t nb12_scaled = src1->type == GGML_TYPE_F16 ? nb12 : nb12 / 2; + size_t nb13_scaled = src1->type == GGML_TYPE_F16 ? nb13 : nb13 / 2; cgh.parallel_for(sycl::nd_range<3>(block_dims, block_dims), [=](sycl::nd_item<3> item_ct1) { k_compute_batched_ptrs( - src0_as_f16, src1_as_f16_get_ct1, - dst_t, ptrs_src_get_ct3, - ptrs_dst_get_ct4, ne12, ne13, ne23, - nb02, nb03, nb12, nb13, nbd2, nbd3, r2, - r3, item_ct1); + src0_as_f16, src1_f16, + dst_t, ptrs_src_get, + ptrs_dst_get, ne12, ne13, ne23, + nb02, nb03, nb12_scaled, nb13_scaled, + nbd2, nbd3, r2, r3, item_ct1); }); }); } - /* - DPCT1010:95: SYCL uses exceptions to report errors and does not use the - error codes. The call was replaced with 0. You need to rewrite this - code. 
- */ - SYCL_CHECK(0); - SYCL_CHECK(CHECK_TRY_ERROR(dpct::gemm_batch( *g_sycl_handles[g_main_device_index], oneapi::mkl::transpose::trans, oneapi::mkl::transpose::nontrans, ne01, ne11, ne10, alpha, (const void **)(ptrs_src.get() + 0 * ne23), - dpct::library_data_t::real_half, nb01 / sizeof(sycl::half), + dpct::library_data_t::real_half, nb01 / nb00, (const void **)(ptrs_src.get() + 1 * ne23), - dpct::library_data_t::real_half, nb11 / sizeof(float), beta, + dpct::library_data_t::real_half, nb11 / nb10, beta, (void **)(ptrs_dst.get() + 0 * ne23), cu_data_type, ne01, ne23, cu_compute_type))); } #endif - if (dst->op_params[0] == GGML_PREC_DEFAULT) { - const to_fp32_sycl_t to_fp32_sycl = ggml_get_to_fp32_sycl(GGML_TYPE_F16); - to_fp32_sycl(dst_f16.get(), dst_ddf, ne, main_stream); - } + const to_fp32_sycl_t to_fp32_sycl = ggml_get_to_fp32_sycl(GGML_TYPE_F16); + to_fp32_sycl(dst_f16.get(), dst_ddf, ne_dst, main_stream); } catch (sycl::exception const &exc) { std::cerr << exc.what() << "Exception caught at file:" << __FILE__ @@ -13491,10 +13472,10 @@ static void ggml_sycl_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1 // KQV single-batch // GGML_SYCL_DEBUG("ggml_sycl_mul_mat_vec_nc\n"); ggml_sycl_mul_mat_vec_nc(src0, src1, dst); - } else if (!split && all_on_device && use_xmx && src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F32 && !ggml_is_transposed(src0) && !ggml_is_transposed(src1)) { + } else if (!split && all_on_device && use_xmx && src0->type == GGML_TYPE_F16 && !ggml_is_transposed(src0) && !ggml_is_transposed(src1)) { // KQ + KQV multi-batch - // GGML_SYCL_DEBUG("ggml_sycl_mul_mat_mat_batched_sycl\n"); - ggml_sycl_mul_mat_mat_batched_sycl(src0, src1, dst); + // GGML_SYCL_DEBUG("ggml_sycl_mul_mat_batched_sycl\n"); + ggml_sycl_mul_mat_batched_sycl(src0, src1, dst); } else if (src0->type == GGML_TYPE_F32) { // GGML_SYCL_DEBUG("ggml_sycl_op_mul_mat\n"); ggml_sycl_op_mul_mat(src0, src1, dst, ggml_sycl_op_mul_mat_sycl, false); From f105471ef6aa4727afac8240da398590d7277f45 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Fri, 1 Mar 2024 09:59:43 +0200 Subject: [PATCH 04/26] server : fix newlines in help (#5785) --- examples/server/server.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/server/server.cpp b/examples/server/server.cpp index bf20e0cf1..45c4aec4d 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -2080,8 +2080,8 @@ static void server_print_usage(const char *argv0, const gpt_params ¶ms, printf(" --override-kv KEY=TYPE:VALUE\n"); printf(" advanced option to override model metadata by key. may be specified multiple times.\n"); printf(" types: int, float, bool. 
example: --override-kv tokenizer.ggml.add_bos_token=bool:false\n"); - printf(" -gan N, --grp-attn-n N set the group attention factor to extend context size through self-extend(default: 1=disabled), used together with group attention width `--grp-attn-w`"); - printf(" -gaw N, --grp-attn-w N set the group attention width to extend context size through self-extend(default: 512), used together with group attention factor `--grp-attn-n`"); + printf(" -gan N, --grp-attn-n N set the group attention factor to extend context size through self-extend(default: 1=disabled), used together with group attention width `--grp-attn-w`\n"); + printf(" -gaw N, --grp-attn-w N set the group attention width to extend context size through self-extend(default: 512), used together with group attention factor `--grp-attn-n`\n"); printf(" --chat-template JINJA_TEMPLATE\n"); printf(" set custom jinja chat template (default: template taken from model's metadata)\n"); printf(" Note: only commonly used templates are accepted, since we don't have jinja parser\n"); From 6ea0f010ff6967034528d9e0b8330b9b0f0b7c13 Mon Sep 17 00:00:00 2001 From: Eve <139727413+netrunnereve@users.noreply.github.com> Date: Fri, 1 Mar 2024 08:54:53 +0000 Subject: [PATCH 05/26] ci : add Ubuntu 22 Vulkan CI run (#5789) --- .github/workflows/build.yml | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 66ad85938..9144f9266 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -145,6 +145,28 @@ jobs: cd build ctest -L main --verbose + ubuntu-22-cmake-vulkan: + runs-on: ubuntu-22.04 + + steps: + - name: Clone + id: checkout + uses: actions/checkout@v3 + + - name: Dependencies + id: depends + run: | + sudo apt-get update + sudo apt-get install build-essential libvulkan-dev + + - name: Build + id: cmake_build + run: | + mkdir build + cd build + cmake -DLLAMA_VULKAN=ON .. + cmake --build . --config Release -j $(nproc) + ubuntu-22-cmake-sycl: runs-on: ubuntu-22.04 From 5cb02b4a012bb16c6c699c0c62c05ffa653eee0f Mon Sep 17 00:00:00 2001 From: Pierrick Hymbert Date: Fri, 1 Mar 2024 10:08:08 +0100 Subject: [PATCH 06/26] server: allow to override threads server pool with --threads-http (#5794) --- examples/server/README.md | 1 + examples/server/server.cpp | 16 ++++++++++++++++ 2 files changed, 17 insertions(+) diff --git a/examples/server/README.md b/examples/server/README.md index 0e9bd7fd4..ad35306c6 100644 --- a/examples/server/README.md +++ b/examples/server/README.md @@ -18,6 +18,7 @@ The project is under active development, and we are [looking for feedback and co - `--threads N`, `-t N`: Set the number of threads to use during generation. - `-tb N, --threads-batch N`: Set the number of threads to use during batch and prompt processing. If not specified, the number of threads will be set to the number of threads used for generation. +- `--threads-http N`: number of threads in the http server pool to process requests (default: `std::thread::hardware_concurrency()`) - `-m FNAME`, `--model FNAME`: Specify the path to the LLaMA model file (e.g., `models/7B/ggml-model.gguf`). - `-a ALIAS`, `--alias ALIAS`: Set an alias for the model. The alias will be returned in API responses. - `-c N`, `--ctx-size N`: Set the size of the prompt context. The default is 512, but LLaMA models were built with a context of 2048, which will provide better results for longer input/inference. 
The size may differ in other models, for example, baichuan models were build with a context of 4096. diff --git a/examples/server/server.cpp b/examples/server/server.cpp index 45c4aec4d..eea987966 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -43,6 +43,7 @@ struct server_params { int32_t write_timeout = 600; bool slots_endpoint = true; bool metrics_endpoint = false; + int n_threads_http = -1; }; bool server_verbose = false; @@ -2012,6 +2013,7 @@ static void server_print_usage(const char *argv0, const gpt_params ¶ms, printf(" -v, --verbose verbose output (default: %s)\n", server_verbose ? "enabled" : "disabled"); printf(" -t N, --threads N number of threads to use during computation (default: %d)\n", params.n_threads); printf(" -tb N, --threads-batch N number of threads to use during batch and prompt processing (default: same as --threads)\n"); + printf(" --threads-http N number of threads in the http server pool to process requests (default: hardware concurrency)\n"); printf(" -c N, --ctx-size N size of the prompt context (default: %d)\n", params.n_ctx); printf(" --rope-scaling {none,linear,yarn}\n"); printf(" RoPE frequency scaling method, defaults to linear unless specified by the model\n"); @@ -2298,6 +2300,15 @@ static void server_params_parse(int argc, char **argv, server_params &sparams, } params.n_threads_batch = std::stoi(argv[i]); } + else if (arg == "--threads-http") + { + if (++i >= argc) + { + invalid_param = true; + break; + } + sparams.n_threads_http = std::stoi(argv[i]); + } else if (arg == "-b" || arg == "--batch-size") { if (++i >= argc) @@ -3449,6 +3460,11 @@ int main(int argc, char **argv) }*/ //); + if (sparams.n_threads_http > 0) { + log_data["n_threads_http"] = std::to_string(sparams.n_threads_http); + svr.new_task_queue = [&sparams] { return new httplib::ThreadPool(sparams.n_threads_http); }; + } + LOG_INFO("HTTP server listening", log_data); // run the HTTP server in a thread - see comment below std::thread t([&]() From 9600d59e010c18f5872580a21734ea1bf1968d04 Mon Sep 17 00:00:00 2001 From: Douglas Hanley Date: Fri, 1 Mar 2024 03:15:36 -0600 Subject: [PATCH 07/26] unicode : switch to multimap based nfd_map (#5799) * switch to multimap based nfd_map due to compile time issues * simplify multimap keys * dont construct new locale every time --- llama.cpp | 11 +- unicode.h | 566 +++++++++++++++++++++++++++++------------------------- 2 files changed, 312 insertions(+), 265 deletions(-) diff --git a/llama.cpp b/llama.cpp index 62699ce52..a35f07aa4 100644 --- a/llama.cpp +++ b/llama.cpp @@ -8947,10 +8947,10 @@ struct llm_tokenizer_wpm { std::vector codepoints = codepoints_from_utf8(text); std::vector nfd_codepoints; for (uint32_t code : codepoints) { - auto it = nfd_map.find(code); - if (it != nfd_map.end()) { - for (uint32_t c : it->second) { - nfd_codepoints.push_back(c); + auto it = nfd_map.equal_range(code); + if (it.first != it.second) { + for (auto jt = it.first; jt != it.second; jt++) { + nfd_codepoints.push_back(jt->second); } } else { nfd_codepoints.push_back(code); @@ -9001,12 +9001,13 @@ struct llm_tokenizer_wpm { } uint32_t to_lower(uint32_t code) { + static const std::locale locale("en_US.UTF-8"); #if defined(_WIN32) if (code > 0xFFFF) { return code; } #endif - return std::tolower(wchar_t(code), std::locale("en_US.UTF-8")); + return std::tolower(wchar_t(code), locale); } bool is_ascii_punct(uint32_t code) { diff --git a/unicode.h b/unicode.h index 620e2b580..f6be4549b 100644 --- a/unicode.h +++ b/unicode.h @@ -1,6 +1,7 @@ #pragma 
once #include +#include #include #include #include @@ -223,266 +224,311 @@ static const std::vector> control_ranges = { {0x2B81E, 0x2B81F}, {0x2CEA2, 0x2CEAF}, {0x2EBE1, 0x2F7FF}, {0x2FA1E, 0x2FFFF}, {0x3134B, 0xE00FF}, {0xE01F0, 0x10FFFF}, }; -static const std::unordered_map> nfd_map = { -{0xC0, {0x41, 0x300}}, {0xC1, {0x41, 0x301}}, {0xC2, {0x41, 0x302}}, {0xC3, {0x41, 0x303}}, {0xC4, {0x41, 0x308}}, {0xC5, {0x41, 0x30A}}, {0xC7, {0x43, 0x327}}, {0xC8, {0x45, 0x300}}, -{0xC9, {0x45, 0x301}}, {0xCA, {0x45, 0x302}}, {0xCB, {0x45, 0x308}}, {0xCC, {0x49, 0x300}}, {0xCD, {0x49, 0x301}}, {0xCE, {0x49, 0x302}}, {0xCF, {0x49, 0x308}}, {0xD1, {0x4E, 0x303}}, -{0xD2, {0x4F, 0x300}}, {0xD3, {0x4F, 0x301}}, {0xD4, {0x4F, 0x302}}, {0xD5, {0x4F, 0x303}}, {0xD6, {0x4F, 0x308}}, {0xD9, {0x55, 0x300}}, {0xDA, {0x55, 0x301}}, {0xDB, {0x55, 0x302}}, -{0xDC, {0x55, 0x308}}, {0xDD, {0x59, 0x301}}, {0xE0, {0x61, 0x300}}, {0xE1, {0x61, 0x301}}, {0xE2, {0x61, 0x302}}, {0xE3, {0x61, 0x303}}, {0xE4, {0x61, 0x308}}, {0xE5, {0x61, 0x30A}}, -{0xE7, {0x63, 0x327}}, {0xE8, {0x65, 0x300}}, {0xE9, {0x65, 0x301}}, {0xEA, {0x65, 0x302}}, {0xEB, {0x65, 0x308}}, {0xEC, {0x69, 0x300}}, {0xED, {0x69, 0x301}}, {0xEE, {0x69, 0x302}}, -{0xEF, {0x69, 0x308}}, {0xF1, {0x6E, 0x303}}, {0xF2, {0x6F, 0x300}}, {0xF3, {0x6F, 0x301}}, {0xF4, {0x6F, 0x302}}, {0xF5, {0x6F, 0x303}}, {0xF6, {0x6F, 0x308}}, {0xF9, {0x75, 0x300}}, -{0xFA, {0x75, 0x301}}, {0xFB, {0x75, 0x302}}, {0xFC, {0x75, 0x308}}, {0xFD, {0x79, 0x301}}, {0xFF, {0x79, 0x308}}, {0x100, {0x41, 0x304}}, {0x101, {0x61, 0x304}}, {0x102, {0x41, 0x306}}, -{0x103, {0x61, 0x306}}, {0x104, {0x41, 0x328}}, {0x105, {0x61, 0x328}}, {0x106, {0x43, 0x301}}, {0x107, {0x63, 0x301}}, {0x108, {0x43, 0x302}}, {0x109, {0x63, 0x302}}, {0x10A, {0x43, 0x307}}, -{0x10B, {0x63, 0x307}}, {0x10C, {0x43, 0x30C}}, {0x10D, {0x63, 0x30C}}, {0x10E, {0x44, 0x30C}}, {0x10F, {0x64, 0x30C}}, {0x112, {0x45, 0x304}}, {0x113, {0x65, 0x304}}, {0x114, {0x45, 0x306}}, -{0x115, {0x65, 0x306}}, {0x116, {0x45, 0x307}}, {0x117, {0x65, 0x307}}, {0x118, {0x45, 0x328}}, {0x119, {0x65, 0x328}}, {0x11A, {0x45, 0x30C}}, {0x11B, {0x65, 0x30C}}, {0x11C, {0x47, 0x302}}, -{0x11D, {0x67, 0x302}}, {0x11E, {0x47, 0x306}}, {0x11F, {0x67, 0x306}}, {0x120, {0x47, 0x307}}, {0x121, {0x67, 0x307}}, {0x122, {0x47, 0x327}}, {0x123, {0x67, 0x327}}, {0x124, {0x48, 0x302}}, -{0x125, {0x68, 0x302}}, {0x128, {0x49, 0x303}}, {0x129, {0x69, 0x303}}, {0x12A, {0x49, 0x304}}, {0x12B, {0x69, 0x304}}, {0x12C, {0x49, 0x306}}, {0x12D, {0x69, 0x306}}, {0x12E, {0x49, 0x328}}, -{0x12F, {0x69, 0x328}}, {0x130, {0x49, 0x307}}, {0x134, {0x4A, 0x302}}, {0x135, {0x6A, 0x302}}, {0x136, {0x4B, 0x327}}, {0x137, {0x6B, 0x327}}, {0x139, {0x4C, 0x301}}, {0x13A, {0x6C, 0x301}}, -{0x13B, {0x4C, 0x327}}, {0x13C, {0x6C, 0x327}}, {0x13D, {0x4C, 0x30C}}, {0x13E, {0x6C, 0x30C}}, {0x143, {0x4E, 0x301}}, {0x144, {0x6E, 0x301}}, {0x145, {0x4E, 0x327}}, {0x146, {0x6E, 0x327}}, -{0x147, {0x4E, 0x30C}}, {0x148, {0x6E, 0x30C}}, {0x14C, {0x4F, 0x304}}, {0x14D, {0x6F, 0x304}}, {0x14E, {0x4F, 0x306}}, {0x14F, {0x6F, 0x306}}, {0x150, {0x4F, 0x30B}}, {0x151, {0x6F, 0x30B}}, -{0x154, {0x52, 0x301}}, {0x155, {0x72, 0x301}}, {0x156, {0x52, 0x327}}, {0x157, {0x72, 0x327}}, {0x158, {0x52, 0x30C}}, {0x159, {0x72, 0x30C}}, {0x15A, {0x53, 0x301}}, {0x15B, {0x73, 0x301}}, -{0x15C, {0x53, 0x302}}, {0x15D, {0x73, 0x302}}, {0x15E, {0x53, 0x327}}, {0x15F, {0x73, 0x327}}, {0x160, {0x53, 0x30C}}, {0x161, {0x73, 0x30C}}, {0x162, {0x54, 0x327}}, {0x163, {0x74, 0x327}}, -{0x164, {0x54, 0x30C}}, {0x165, 
{0x74, 0x30C}}, {0x168, {0x55, 0x303}}, {0x169, {0x75, 0x303}}, {0x16A, {0x55, 0x304}}, {0x16B, {0x75, 0x304}}, {0x16C, {0x55, 0x306}}, {0x16D, {0x75, 0x306}}, -{0x16E, {0x55, 0x30A}}, {0x16F, {0x75, 0x30A}}, {0x170, {0x55, 0x30B}}, {0x171, {0x75, 0x30B}}, {0x172, {0x55, 0x328}}, {0x173, {0x75, 0x328}}, {0x174, {0x57, 0x302}}, {0x175, {0x77, 0x302}}, -{0x176, {0x59, 0x302}}, {0x177, {0x79, 0x302}}, {0x178, {0x59, 0x308}}, {0x179, {0x5A, 0x301}}, {0x17A, {0x7A, 0x301}}, {0x17B, {0x5A, 0x307}}, {0x17C, {0x7A, 0x307}}, {0x17D, {0x5A, 0x30C}}, -{0x17E, {0x7A, 0x30C}}, {0x1A0, {0x4F, 0x31B}}, {0x1A1, {0x6F, 0x31B}}, {0x1AF, {0x55, 0x31B}}, {0x1B0, {0x75, 0x31B}}, {0x1CD, {0x41, 0x30C}}, {0x1CE, {0x61, 0x30C}}, {0x1CF, {0x49, 0x30C}}, -{0x1D0, {0x69, 0x30C}}, {0x1D1, {0x4F, 0x30C}}, {0x1D2, {0x6F, 0x30C}}, {0x1D3, {0x55, 0x30C}}, {0x1D4, {0x75, 0x30C}}, {0x1D5, {0x55, 0x308, 0x304}}, {0x1D6, {0x75, 0x308, 0x304}}, -{0x1D7, {0x55, 0x308, 0x301}}, {0x1D8, {0x75, 0x308, 0x301}}, {0x1D9, {0x55, 0x308, 0x30C}}, {0x1DA, {0x75, 0x308, 0x30C}}, {0x1DB, {0x55, 0x308, 0x300}}, {0x1DC, {0x75, 0x308, 0x300}}, -{0x1DE, {0x41, 0x308, 0x304}}, {0x1DF, {0x61, 0x308, 0x304}}, {0x1E0, {0x41, 0x307, 0x304}}, {0x1E1, {0x61, 0x307, 0x304}}, {0x1E2, {0xC6, 0x304}}, {0x1E3, {0xE6, 0x304}}, {0x1E6, {0x47, 0x30C}}, -{0x1E7, {0x67, 0x30C}}, {0x1E8, {0x4B, 0x30C}}, {0x1E9, {0x6B, 0x30C}}, {0x1EA, {0x4F, 0x328}}, {0x1EB, {0x6F, 0x328}}, {0x1EC, {0x4F, 0x328, 0x304}}, {0x1ED, {0x6F, 0x328, 0x304}}, -{0x1EE, {0x1B7, 0x30C}}, {0x1EF, {0x292, 0x30C}}, {0x1F0, {0x6A, 0x30C}}, {0x1F4, {0x47, 0x301}}, {0x1F5, {0x67, 0x301}}, {0x1F8, {0x4E, 0x300}}, {0x1F9, {0x6E, 0x300}}, {0x1FA, {0x41, 0x30A, 0x301}}, -{0x1FB, {0x61, 0x30A, 0x301}}, {0x1FC, {0xC6, 0x301}}, {0x1FD, {0xE6, 0x301}}, {0x1FE, {0xD8, 0x301}}, {0x1FF, {0xF8, 0x301}}, {0x200, {0x41, 0x30F}}, {0x201, {0x61, 0x30F}}, {0x202, {0x41, 0x311}}, -{0x203, {0x61, 0x311}}, {0x204, {0x45, 0x30F}}, {0x205, {0x65, 0x30F}}, {0x206, {0x45, 0x311}}, {0x207, {0x65, 0x311}}, {0x208, {0x49, 0x30F}}, {0x209, {0x69, 0x30F}}, {0x20A, {0x49, 0x311}}, -{0x20B, {0x69, 0x311}}, {0x20C, {0x4F, 0x30F}}, {0x20D, {0x6F, 0x30F}}, {0x20E, {0x4F, 0x311}}, {0x20F, {0x6F, 0x311}}, {0x210, {0x52, 0x30F}}, {0x211, {0x72, 0x30F}}, {0x212, {0x52, 0x311}}, -{0x213, {0x72, 0x311}}, {0x214, {0x55, 0x30F}}, {0x215, {0x75, 0x30F}}, {0x216, {0x55, 0x311}}, {0x217, {0x75, 0x311}}, {0x218, {0x53, 0x326}}, {0x219, {0x73, 0x326}}, {0x21A, {0x54, 0x326}}, -{0x21B, {0x74, 0x326}}, {0x21E, {0x48, 0x30C}}, {0x21F, {0x68, 0x30C}}, {0x226, {0x41, 0x307}}, {0x227, {0x61, 0x307}}, {0x228, {0x45, 0x327}}, {0x229, {0x65, 0x327}}, {0x22A, {0x4F, 0x308, 0x304}}, -{0x22B, {0x6F, 0x308, 0x304}}, {0x22C, {0x4F, 0x303, 0x304}}, {0x22D, {0x6F, 0x303, 0x304}}, {0x22E, {0x4F, 0x307}}, {0x22F, {0x6F, 0x307}}, {0x230, {0x4F, 0x307, 0x304}}, -{0x231, {0x6F, 0x307, 0x304}}, {0x232, {0x59, 0x304}}, {0x233, {0x79, 0x304}}, {0x340, {0x300}}, {0x341, {0x301}}, {0x343, {0x313}}, {0x344, {0x308, 0x301}}, {0x374, {0x2B9}}, {0x37E, {0x3B}}, -{0x385, {0xA8, 0x301}}, {0x386, {0x391, 0x301}}, {0x387, {0xB7}}, {0x388, {0x395, 0x301}}, {0x389, {0x397, 0x301}}, {0x38A, {0x399, 0x301}}, {0x38C, {0x39F, 0x301}}, {0x38E, {0x3A5, 0x301}}, -{0x38F, {0x3A9, 0x301}}, {0x390, {0x3B9, 0x308, 0x301}}, {0x3AA, {0x399, 0x308}}, {0x3AB, {0x3A5, 0x308}}, {0x3AC, {0x3B1, 0x301}}, {0x3AD, {0x3B5, 0x301}}, {0x3AE, {0x3B7, 0x301}}, -{0x3AF, {0x3B9, 0x301}}, {0x3B0, {0x3C5, 0x308, 0x301}}, {0x3CA, {0x3B9, 0x308}}, {0x3CB, {0x3C5, 0x308}}, {0x3CC, {0x3BF, 0x301}}, {0x3CD, 
{0x3C5, 0x301}}, {0x3CE, {0x3C9, 0x301}}, -{0x3D3, {0x3D2, 0x301}}, {0x3D4, {0x3D2, 0x308}}, {0x400, {0x415, 0x300}}, {0x401, {0x415, 0x308}}, {0x403, {0x413, 0x301}}, {0x407, {0x406, 0x308}}, {0x40C, {0x41A, 0x301}}, {0x40D, {0x418, 0x300}}, -{0x40E, {0x423, 0x306}}, {0x419, {0x418, 0x306}}, {0x439, {0x438, 0x306}}, {0x450, {0x435, 0x300}}, {0x451, {0x435, 0x308}}, {0x453, {0x433, 0x301}}, {0x457, {0x456, 0x308}}, {0x45C, {0x43A, 0x301}}, -{0x45D, {0x438, 0x300}}, {0x45E, {0x443, 0x306}}, {0x476, {0x474, 0x30F}}, {0x477, {0x475, 0x30F}}, {0x4C1, {0x416, 0x306}}, {0x4C2, {0x436, 0x306}}, {0x4D0, {0x410, 0x306}}, {0x4D1, {0x430, 0x306}}, -{0x4D2, {0x410, 0x308}}, {0x4D3, {0x430, 0x308}}, {0x4D6, {0x415, 0x306}}, {0x4D7, {0x435, 0x306}}, {0x4DA, {0x4D8, 0x308}}, {0x4DB, {0x4D9, 0x308}}, {0x4DC, {0x416, 0x308}}, {0x4DD, {0x436, 0x308}}, -{0x4DE, {0x417, 0x308}}, {0x4DF, {0x437, 0x308}}, {0x4E2, {0x418, 0x304}}, {0x4E3, {0x438, 0x304}}, {0x4E4, {0x418, 0x308}}, {0x4E5, {0x438, 0x308}}, {0x4E6, {0x41E, 0x308}}, {0x4E7, {0x43E, 0x308}}, -{0x4EA, {0x4E8, 0x308}}, {0x4EB, {0x4E9, 0x308}}, {0x4EC, {0x42D, 0x308}}, {0x4ED, {0x44D, 0x308}}, {0x4EE, {0x423, 0x304}}, {0x4EF, {0x443, 0x304}}, {0x4F0, {0x423, 0x308}}, {0x4F1, {0x443, 0x308}}, -{0x4F2, {0x423, 0x30B}}, {0x4F3, {0x443, 0x30B}}, {0x4F4, {0x427, 0x308}}, {0x4F5, {0x447, 0x308}}, {0x4F8, {0x42B, 0x308}}, {0x4F9, {0x44B, 0x308}}, {0x622, {0x627, 0x653}}, {0x623, {0x627, 0x654}}, -{0x624, {0x648, 0x654}}, {0x625, {0x627, 0x655}}, {0x626, {0x64A, 0x654}}, {0x6C0, {0x6D5, 0x654}}, {0x6C2, {0x6C1, 0x654}}, {0x6D3, {0x6D2, 0x654}}, {0x929, {0x928, 0x93C}}, {0x931, {0x930, 0x93C}}, -{0x934, {0x933, 0x93C}}, {0x958, {0x915, 0x93C}}, {0x959, {0x916, 0x93C}}, {0x95A, {0x917, 0x93C}}, {0x95B, {0x91C, 0x93C}}, {0x95C, {0x921, 0x93C}}, {0x95D, {0x922, 0x93C}}, {0x95E, {0x92B, 0x93C}}, -{0x95F, {0x92F, 0x93C}}, {0x9CB, {0x9C7, 0x9BE}}, {0x9CC, {0x9C7, 0x9D7}}, {0x9DC, {0x9A1, 0x9BC}}, {0x9DD, {0x9A2, 0x9BC}}, {0x9DF, {0x9AF, 0x9BC}}, {0xA33, {0xA32, 0xA3C}}, {0xA36, {0xA38, 0xA3C}}, -{0xA59, {0xA16, 0xA3C}}, {0xA5A, {0xA17, 0xA3C}}, {0xA5B, {0xA1C, 0xA3C}}, {0xA5E, {0xA2B, 0xA3C}}, {0xB48, {0xB47, 0xB56}}, {0xB4B, {0xB47, 0xB3E}}, {0xB4C, {0xB47, 0xB57}}, {0xB5C, {0xB21, 0xB3C}}, -{0xB5D, {0xB22, 0xB3C}}, {0xB94, {0xB92, 0xBD7}}, {0xBCA, {0xBC6, 0xBBE}}, {0xBCB, {0xBC7, 0xBBE}}, {0xBCC, {0xBC6, 0xBD7}}, {0xC48, {0xC46, 0xC56}}, {0xCC0, {0xCBF, 0xCD5}}, {0xCC7, {0xCC6, 0xCD5}}, -{0xCC8, {0xCC6, 0xCD6}}, {0xCCA, {0xCC6, 0xCC2}}, {0xCCB, {0xCC6, 0xCC2, 0xCD5}}, {0xD4A, {0xD46, 0xD3E}}, {0xD4B, {0xD47, 0xD3E}}, {0xD4C, {0xD46, 0xD57}}, {0xDDA, {0xDD9, 0xDCA}}, -{0xDDC, {0xDD9, 0xDCF}}, {0xDDD, {0xDD9, 0xDCF, 0xDCA}}, {0xDDE, {0xDD9, 0xDDF}}, {0xF43, {0xF42, 0xFB7}}, {0xF4D, {0xF4C, 0xFB7}}, {0xF52, {0xF51, 0xFB7}}, {0xF57, {0xF56, 0xFB7}}, -{0xF5C, {0xF5B, 0xFB7}}, {0xF69, {0xF40, 0xFB5}}, {0xF73, {0xF71, 0xF72}}, {0xF75, {0xF71, 0xF74}}, {0xF76, {0xFB2, 0xF80}}, {0xF78, {0xFB3, 0xF80}}, {0xF81, {0xF71, 0xF80}}, {0xF93, {0xF92, 0xFB7}}, -{0xF9D, {0xF9C, 0xFB7}}, {0xFA2, {0xFA1, 0xFB7}}, {0xFA7, {0xFA6, 0xFB7}}, {0xFAC, {0xFAB, 0xFB7}}, {0xFB9, {0xF90, 0xFB5}}, {0x1026, {0x1025, 0x102E}}, {0x1B06, {0x1B05, 0x1B35}}, -{0x1B08, {0x1B07, 0x1B35}}, {0x1B0A, {0x1B09, 0x1B35}}, {0x1B0C, {0x1B0B, 0x1B35}}, {0x1B0E, {0x1B0D, 0x1B35}}, {0x1B12, {0x1B11, 0x1B35}}, {0x1B3B, {0x1B3A, 0x1B35}}, {0x1B3D, {0x1B3C, 0x1B35}}, -{0x1B40, {0x1B3E, 0x1B35}}, {0x1B41, {0x1B3F, 0x1B35}}, {0x1B43, {0x1B42, 0x1B35}}, {0x1E00, {0x41, 0x325}}, {0x1E01, {0x61, 0x325}}, {0x1E02, {0x42, 
0x307}}, {0x1E03, {0x62, 0x307}}, -{0x1E04, {0x42, 0x323}}, {0x1E05, {0x62, 0x323}}, {0x1E06, {0x42, 0x331}}, {0x1E07, {0x62, 0x331}}, {0x1E08, {0x43, 0x327, 0x301}}, {0x1E09, {0x63, 0x327, 0x301}}, {0x1E0A, {0x44, 0x307}}, -{0x1E0B, {0x64, 0x307}}, {0x1E0C, {0x44, 0x323}}, {0x1E0D, {0x64, 0x323}}, {0x1E0E, {0x44, 0x331}}, {0x1E0F, {0x64, 0x331}}, {0x1E10, {0x44, 0x327}}, {0x1E11, {0x64, 0x327}}, {0x1E12, {0x44, 0x32D}}, -{0x1E13, {0x64, 0x32D}}, {0x1E14, {0x45, 0x304, 0x300}}, {0x1E15, {0x65, 0x304, 0x300}}, {0x1E16, {0x45, 0x304, 0x301}}, {0x1E17, {0x65, 0x304, 0x301}}, {0x1E18, {0x45, 0x32D}}, -{0x1E19, {0x65, 0x32D}}, {0x1E1A, {0x45, 0x330}}, {0x1E1B, {0x65, 0x330}}, {0x1E1C, {0x45, 0x327, 0x306}}, {0x1E1D, {0x65, 0x327, 0x306}}, {0x1E1E, {0x46, 0x307}}, {0x1E1F, {0x66, 0x307}}, -{0x1E20, {0x47, 0x304}}, {0x1E21, {0x67, 0x304}}, {0x1E22, {0x48, 0x307}}, {0x1E23, {0x68, 0x307}}, {0x1E24, {0x48, 0x323}}, {0x1E25, {0x68, 0x323}}, {0x1E26, {0x48, 0x308}}, {0x1E27, {0x68, 0x308}}, -{0x1E28, {0x48, 0x327}}, {0x1E29, {0x68, 0x327}}, {0x1E2A, {0x48, 0x32E}}, {0x1E2B, {0x68, 0x32E}}, {0x1E2C, {0x49, 0x330}}, {0x1E2D, {0x69, 0x330}}, {0x1E2E, {0x49, 0x308, 0x301}}, -{0x1E2F, {0x69, 0x308, 0x301}}, {0x1E30, {0x4B, 0x301}}, {0x1E31, {0x6B, 0x301}}, {0x1E32, {0x4B, 0x323}}, {0x1E33, {0x6B, 0x323}}, {0x1E34, {0x4B, 0x331}}, {0x1E35, {0x6B, 0x331}}, -{0x1E36, {0x4C, 0x323}}, {0x1E37, {0x6C, 0x323}}, {0x1E38, {0x4C, 0x323, 0x304}}, {0x1E39, {0x6C, 0x323, 0x304}}, {0x1E3A, {0x4C, 0x331}}, {0x1E3B, {0x6C, 0x331}}, {0x1E3C, {0x4C, 0x32D}}, -{0x1E3D, {0x6C, 0x32D}}, {0x1E3E, {0x4D, 0x301}}, {0x1E3F, {0x6D, 0x301}}, {0x1E40, {0x4D, 0x307}}, {0x1E41, {0x6D, 0x307}}, {0x1E42, {0x4D, 0x323}}, {0x1E43, {0x6D, 0x323}}, {0x1E44, {0x4E, 0x307}}, -{0x1E45, {0x6E, 0x307}}, {0x1E46, {0x4E, 0x323}}, {0x1E47, {0x6E, 0x323}}, {0x1E48, {0x4E, 0x331}}, {0x1E49, {0x6E, 0x331}}, {0x1E4A, {0x4E, 0x32D}}, {0x1E4B, {0x6E, 0x32D}}, -{0x1E4C, {0x4F, 0x303, 0x301}}, {0x1E4D, {0x6F, 0x303, 0x301}}, {0x1E4E, {0x4F, 0x303, 0x308}}, {0x1E4F, {0x6F, 0x303, 0x308}}, {0x1E50, {0x4F, 0x304, 0x300}}, {0x1E51, {0x6F, 0x304, 0x300}}, -{0x1E52, {0x4F, 0x304, 0x301}}, {0x1E53, {0x6F, 0x304, 0x301}}, {0x1E54, {0x50, 0x301}}, {0x1E55, {0x70, 0x301}}, {0x1E56, {0x50, 0x307}}, {0x1E57, {0x70, 0x307}}, {0x1E58, {0x52, 0x307}}, -{0x1E59, {0x72, 0x307}}, {0x1E5A, {0x52, 0x323}}, {0x1E5B, {0x72, 0x323}}, {0x1E5C, {0x52, 0x323, 0x304}}, {0x1E5D, {0x72, 0x323, 0x304}}, {0x1E5E, {0x52, 0x331}}, {0x1E5F, {0x72, 0x331}}, -{0x1E60, {0x53, 0x307}}, {0x1E61, {0x73, 0x307}}, {0x1E62, {0x53, 0x323}}, {0x1E63, {0x73, 0x323}}, {0x1E64, {0x53, 0x301, 0x307}}, {0x1E65, {0x73, 0x301, 0x307}}, {0x1E66, {0x53, 0x30C, 0x307}}, -{0x1E67, {0x73, 0x30C, 0x307}}, {0x1E68, {0x53, 0x323, 0x307}}, {0x1E69, {0x73, 0x323, 0x307}}, {0x1E6A, {0x54, 0x307}}, {0x1E6B, {0x74, 0x307}}, {0x1E6C, {0x54, 0x323}}, {0x1E6D, {0x74, 0x323}}, -{0x1E6E, {0x54, 0x331}}, {0x1E6F, {0x74, 0x331}}, {0x1E70, {0x54, 0x32D}}, {0x1E71, {0x74, 0x32D}}, {0x1E72, {0x55, 0x324}}, {0x1E73, {0x75, 0x324}}, {0x1E74, {0x55, 0x330}}, {0x1E75, {0x75, 0x330}}, -{0x1E76, {0x55, 0x32D}}, {0x1E77, {0x75, 0x32D}}, {0x1E78, {0x55, 0x303, 0x301}}, {0x1E79, {0x75, 0x303, 0x301}}, {0x1E7A, {0x55, 0x304, 0x308}}, {0x1E7B, {0x75, 0x304, 0x308}}, -{0x1E7C, {0x56, 0x303}}, {0x1E7D, {0x76, 0x303}}, {0x1E7E, {0x56, 0x323}}, {0x1E7F, {0x76, 0x323}}, {0x1E80, {0x57, 0x300}}, {0x1E81, {0x77, 0x300}}, {0x1E82, {0x57, 0x301}}, {0x1E83, {0x77, 0x301}}, -{0x1E84, {0x57, 0x308}}, {0x1E85, {0x77, 0x308}}, {0x1E86, {0x57, 0x307}}, 
{0x1E87, {0x77, 0x307}}, {0x1E88, {0x57, 0x323}}, {0x1E89, {0x77, 0x323}}, {0x1E8A, {0x58, 0x307}}, {0x1E8B, {0x78, 0x307}}, -{0x1E8C, {0x58, 0x308}}, {0x1E8D, {0x78, 0x308}}, {0x1E8E, {0x59, 0x307}}, {0x1E8F, {0x79, 0x307}}, {0x1E90, {0x5A, 0x302}}, {0x1E91, {0x7A, 0x302}}, {0x1E92, {0x5A, 0x323}}, {0x1E93, {0x7A, 0x323}}, -{0x1E94, {0x5A, 0x331}}, {0x1E95, {0x7A, 0x331}}, {0x1E96, {0x68, 0x331}}, {0x1E97, {0x74, 0x308}}, {0x1E98, {0x77, 0x30A}}, {0x1E99, {0x79, 0x30A}}, {0x1E9B, {0x17F, 0x307}}, {0x1EA0, {0x41, 0x323}}, -{0x1EA1, {0x61, 0x323}}, {0x1EA2, {0x41, 0x309}}, {0x1EA3, {0x61, 0x309}}, {0x1EA4, {0x41, 0x302, 0x301}}, {0x1EA5, {0x61, 0x302, 0x301}}, {0x1EA6, {0x41, 0x302, 0x300}}, -{0x1EA7, {0x61, 0x302, 0x300}}, {0x1EA8, {0x41, 0x302, 0x309}}, {0x1EA9, {0x61, 0x302, 0x309}}, {0x1EAA, {0x41, 0x302, 0x303}}, {0x1EAB, {0x61, 0x302, 0x303}}, {0x1EAC, {0x41, 0x323, 0x302}}, -{0x1EAD, {0x61, 0x323, 0x302}}, {0x1EAE, {0x41, 0x306, 0x301}}, {0x1EAF, {0x61, 0x306, 0x301}}, {0x1EB0, {0x41, 0x306, 0x300}}, {0x1EB1, {0x61, 0x306, 0x300}}, {0x1EB2, {0x41, 0x306, 0x309}}, -{0x1EB3, {0x61, 0x306, 0x309}}, {0x1EB4, {0x41, 0x306, 0x303}}, {0x1EB5, {0x61, 0x306, 0x303}}, {0x1EB6, {0x41, 0x323, 0x306}}, {0x1EB7, {0x61, 0x323, 0x306}}, {0x1EB8, {0x45, 0x323}}, -{0x1EB9, {0x65, 0x323}}, {0x1EBA, {0x45, 0x309}}, {0x1EBB, {0x65, 0x309}}, {0x1EBC, {0x45, 0x303}}, {0x1EBD, {0x65, 0x303}}, {0x1EBE, {0x45, 0x302, 0x301}}, {0x1EBF, {0x65, 0x302, 0x301}}, -{0x1EC0, {0x45, 0x302, 0x300}}, {0x1EC1, {0x65, 0x302, 0x300}}, {0x1EC2, {0x45, 0x302, 0x309}}, {0x1EC3, {0x65, 0x302, 0x309}}, {0x1EC4, {0x45, 0x302, 0x303}}, {0x1EC5, {0x65, 0x302, 0x303}}, -{0x1EC6, {0x45, 0x323, 0x302}}, {0x1EC7, {0x65, 0x323, 0x302}}, {0x1EC8, {0x49, 0x309}}, {0x1EC9, {0x69, 0x309}}, {0x1ECA, {0x49, 0x323}}, {0x1ECB, {0x69, 0x323}}, {0x1ECC, {0x4F, 0x323}}, -{0x1ECD, {0x6F, 0x323}}, {0x1ECE, {0x4F, 0x309}}, {0x1ECF, {0x6F, 0x309}}, {0x1ED0, {0x4F, 0x302, 0x301}}, {0x1ED1, {0x6F, 0x302, 0x301}}, {0x1ED2, {0x4F, 0x302, 0x300}}, -{0x1ED3, {0x6F, 0x302, 0x300}}, {0x1ED4, {0x4F, 0x302, 0x309}}, {0x1ED5, {0x6F, 0x302, 0x309}}, {0x1ED6, {0x4F, 0x302, 0x303}}, {0x1ED7, {0x6F, 0x302, 0x303}}, {0x1ED8, {0x4F, 0x323, 0x302}}, -{0x1ED9, {0x6F, 0x323, 0x302}}, {0x1EDA, {0x4F, 0x31B, 0x301}}, {0x1EDB, {0x6F, 0x31B, 0x301}}, {0x1EDC, {0x4F, 0x31B, 0x300}}, {0x1EDD, {0x6F, 0x31B, 0x300}}, {0x1EDE, {0x4F, 0x31B, 0x309}}, -{0x1EDF, {0x6F, 0x31B, 0x309}}, {0x1EE0, {0x4F, 0x31B, 0x303}}, {0x1EE1, {0x6F, 0x31B, 0x303}}, {0x1EE2, {0x4F, 0x31B, 0x323}}, {0x1EE3, {0x6F, 0x31B, 0x323}}, {0x1EE4, {0x55, 0x323}}, -{0x1EE5, {0x75, 0x323}}, {0x1EE6, {0x55, 0x309}}, {0x1EE7, {0x75, 0x309}}, {0x1EE8, {0x55, 0x31B, 0x301}}, {0x1EE9, {0x75, 0x31B, 0x301}}, {0x1EEA, {0x55, 0x31B, 0x300}}, -{0x1EEB, {0x75, 0x31B, 0x300}}, {0x1EEC, {0x55, 0x31B, 0x309}}, {0x1EED, {0x75, 0x31B, 0x309}}, {0x1EEE, {0x55, 0x31B, 0x303}}, {0x1EEF, {0x75, 0x31B, 0x303}}, {0x1EF0, {0x55, 0x31B, 0x323}}, -{0x1EF1, {0x75, 0x31B, 0x323}}, {0x1EF2, {0x59, 0x300}}, {0x1EF3, {0x79, 0x300}}, {0x1EF4, {0x59, 0x323}}, {0x1EF5, {0x79, 0x323}}, {0x1EF6, {0x59, 0x309}}, {0x1EF7, {0x79, 0x309}}, -{0x1EF8, {0x59, 0x303}}, {0x1EF9, {0x79, 0x303}}, {0x1F00, {0x3B1, 0x313}}, {0x1F01, {0x3B1, 0x314}}, {0x1F02, {0x3B1, 0x313, 0x300}}, {0x1F03, {0x3B1, 0x314, 0x300}}, {0x1F04, {0x3B1, 0x313, 0x301}}, -{0x1F05, {0x3B1, 0x314, 0x301}}, {0x1F06, {0x3B1, 0x313, 0x342}}, {0x1F07, {0x3B1, 0x314, 0x342}}, {0x1F08, {0x391, 0x313}}, {0x1F09, {0x391, 0x314}}, {0x1F0A, {0x391, 0x313, 0x300}}, -{0x1F0B, {0x391, 0x314, 
0x300}}, {0x1F0C, {0x391, 0x313, 0x301}}, {0x1F0D, {0x391, 0x314, 0x301}}, {0x1F0E, {0x391, 0x313, 0x342}}, {0x1F0F, {0x391, 0x314, 0x342}}, {0x1F10, {0x3B5, 0x313}}, -{0x1F11, {0x3B5, 0x314}}, {0x1F12, {0x3B5, 0x313, 0x300}}, {0x1F13, {0x3B5, 0x314, 0x300}}, {0x1F14, {0x3B5, 0x313, 0x301}}, {0x1F15, {0x3B5, 0x314, 0x301}}, {0x1F18, {0x395, 0x313}}, -{0x1F19, {0x395, 0x314}}, {0x1F1A, {0x395, 0x313, 0x300}}, {0x1F1B, {0x395, 0x314, 0x300}}, {0x1F1C, {0x395, 0x313, 0x301}}, {0x1F1D, {0x395, 0x314, 0x301}}, {0x1F20, {0x3B7, 0x313}}, -{0x1F21, {0x3B7, 0x314}}, {0x1F22, {0x3B7, 0x313, 0x300}}, {0x1F23, {0x3B7, 0x314, 0x300}}, {0x1F24, {0x3B7, 0x313, 0x301}}, {0x1F25, {0x3B7, 0x314, 0x301}}, {0x1F26, {0x3B7, 0x313, 0x342}}, -{0x1F27, {0x3B7, 0x314, 0x342}}, {0x1F28, {0x397, 0x313}}, {0x1F29, {0x397, 0x314}}, {0x1F2A, {0x397, 0x313, 0x300}}, {0x1F2B, {0x397, 0x314, 0x300}}, {0x1F2C, {0x397, 0x313, 0x301}}, -{0x1F2D, {0x397, 0x314, 0x301}}, {0x1F2E, {0x397, 0x313, 0x342}}, {0x1F2F, {0x397, 0x314, 0x342}}, {0x1F30, {0x3B9, 0x313}}, {0x1F31, {0x3B9, 0x314}}, {0x1F32, {0x3B9, 0x313, 0x300}}, -{0x1F33, {0x3B9, 0x314, 0x300}}, {0x1F34, {0x3B9, 0x313, 0x301}}, {0x1F35, {0x3B9, 0x314, 0x301}}, {0x1F36, {0x3B9, 0x313, 0x342}}, {0x1F37, {0x3B9, 0x314, 0x342}}, {0x1F38, {0x399, 0x313}}, -{0x1F39, {0x399, 0x314}}, {0x1F3A, {0x399, 0x313, 0x300}}, {0x1F3B, {0x399, 0x314, 0x300}}, {0x1F3C, {0x399, 0x313, 0x301}}, {0x1F3D, {0x399, 0x314, 0x301}}, {0x1F3E, {0x399, 0x313, 0x342}}, -{0x1F3F, {0x399, 0x314, 0x342}}, {0x1F40, {0x3BF, 0x313}}, {0x1F41, {0x3BF, 0x314}}, {0x1F42, {0x3BF, 0x313, 0x300}}, {0x1F43, {0x3BF, 0x314, 0x300}}, {0x1F44, {0x3BF, 0x313, 0x301}}, -{0x1F45, {0x3BF, 0x314, 0x301}}, {0x1F48, {0x39F, 0x313}}, {0x1F49, {0x39F, 0x314}}, {0x1F4A, {0x39F, 0x313, 0x300}}, {0x1F4B, {0x39F, 0x314, 0x300}}, {0x1F4C, {0x39F, 0x313, 0x301}}, -{0x1F4D, {0x39F, 0x314, 0x301}}, {0x1F50, {0x3C5, 0x313}}, {0x1F51, {0x3C5, 0x314}}, {0x1F52, {0x3C5, 0x313, 0x300}}, {0x1F53, {0x3C5, 0x314, 0x300}}, {0x1F54, {0x3C5, 0x313, 0x301}}, -{0x1F55, {0x3C5, 0x314, 0x301}}, {0x1F56, {0x3C5, 0x313, 0x342}}, {0x1F57, {0x3C5, 0x314, 0x342}}, {0x1F59, {0x3A5, 0x314}}, {0x1F5B, {0x3A5, 0x314, 0x300}}, {0x1F5D, {0x3A5, 0x314, 0x301}}, -{0x1F5F, {0x3A5, 0x314, 0x342}}, {0x1F60, {0x3C9, 0x313}}, {0x1F61, {0x3C9, 0x314}}, {0x1F62, {0x3C9, 0x313, 0x300}}, {0x1F63, {0x3C9, 0x314, 0x300}}, {0x1F64, {0x3C9, 0x313, 0x301}}, -{0x1F65, {0x3C9, 0x314, 0x301}}, {0x1F66, {0x3C9, 0x313, 0x342}}, {0x1F67, {0x3C9, 0x314, 0x342}}, {0x1F68, {0x3A9, 0x313}}, {0x1F69, {0x3A9, 0x314}}, {0x1F6A, {0x3A9, 0x313, 0x300}}, -{0x1F6B, {0x3A9, 0x314, 0x300}}, {0x1F6C, {0x3A9, 0x313, 0x301}}, {0x1F6D, {0x3A9, 0x314, 0x301}}, {0x1F6E, {0x3A9, 0x313, 0x342}}, {0x1F6F, {0x3A9, 0x314, 0x342}}, {0x1F70, {0x3B1, 0x300}}, -{0x1F71, {0x3B1, 0x301}}, {0x1F72, {0x3B5, 0x300}}, {0x1F73, {0x3B5, 0x301}}, {0x1F74, {0x3B7, 0x300}}, {0x1F75, {0x3B7, 0x301}}, {0x1F76, {0x3B9, 0x300}}, {0x1F77, {0x3B9, 0x301}}, -{0x1F78, {0x3BF, 0x300}}, {0x1F79, {0x3BF, 0x301}}, {0x1F7A, {0x3C5, 0x300}}, {0x1F7B, {0x3C5, 0x301}}, {0x1F7C, {0x3C9, 0x300}}, {0x1F7D, {0x3C9, 0x301}}, {0x1F80, {0x3B1, 0x313, 0x345}}, -{0x1F81, {0x3B1, 0x314, 0x345}}, {0x1F82, {0x3B1, 0x313, 0x300, 0x345}}, {0x1F83, {0x3B1, 0x314, 0x300, 0x345}}, {0x1F84, {0x3B1, 0x313, 0x301, 0x345}}, {0x1F85, {0x3B1, 0x314, 0x301, 0x345}}, -{0x1F86, {0x3B1, 0x313, 0x342, 0x345}}, {0x1F87, {0x3B1, 0x314, 0x342, 0x345}}, {0x1F88, {0x391, 0x313, 0x345}}, {0x1F89, {0x391, 0x314, 0x345}}, {0x1F8A, {0x391, 0x313, 0x300, 0x345}}, 
-{0x1F8B, {0x391, 0x314, 0x300, 0x345}}, {0x1F8C, {0x391, 0x313, 0x301, 0x345}}, {0x1F8D, {0x391, 0x314, 0x301, 0x345}}, {0x1F8E, {0x391, 0x313, 0x342, 0x345}}, {0x1F8F, {0x391, 0x314, 0x342, 0x345}}, -{0x1F90, {0x3B7, 0x313, 0x345}}, {0x1F91, {0x3B7, 0x314, 0x345}}, {0x1F92, {0x3B7, 0x313, 0x300, 0x345}}, {0x1F93, {0x3B7, 0x314, 0x300, 0x345}}, {0x1F94, {0x3B7, 0x313, 0x301, 0x345}}, -{0x1F95, {0x3B7, 0x314, 0x301, 0x345}}, {0x1F96, {0x3B7, 0x313, 0x342, 0x345}}, {0x1F97, {0x3B7, 0x314, 0x342, 0x345}}, {0x1F98, {0x397, 0x313, 0x345}}, {0x1F99, {0x397, 0x314, 0x345}}, -{0x1F9A, {0x397, 0x313, 0x300, 0x345}}, {0x1F9B, {0x397, 0x314, 0x300, 0x345}}, {0x1F9C, {0x397, 0x313, 0x301, 0x345}}, {0x1F9D, {0x397, 0x314, 0x301, 0x345}}, {0x1F9E, {0x397, 0x313, 0x342, 0x345}}, -{0x1F9F, {0x397, 0x314, 0x342, 0x345}}, {0x1FA0, {0x3C9, 0x313, 0x345}}, {0x1FA1, {0x3C9, 0x314, 0x345}}, {0x1FA2, {0x3C9, 0x313, 0x300, 0x345}}, {0x1FA3, {0x3C9, 0x314, 0x300, 0x345}}, -{0x1FA4, {0x3C9, 0x313, 0x301, 0x345}}, {0x1FA5, {0x3C9, 0x314, 0x301, 0x345}}, {0x1FA6, {0x3C9, 0x313, 0x342, 0x345}}, {0x1FA7, {0x3C9, 0x314, 0x342, 0x345}}, {0x1FA8, {0x3A9, 0x313, 0x345}}, -{0x1FA9, {0x3A9, 0x314, 0x345}}, {0x1FAA, {0x3A9, 0x313, 0x300, 0x345}}, {0x1FAB, {0x3A9, 0x314, 0x300, 0x345}}, {0x1FAC, {0x3A9, 0x313, 0x301, 0x345}}, {0x1FAD, {0x3A9, 0x314, 0x301, 0x345}}, -{0x1FAE, {0x3A9, 0x313, 0x342, 0x345}}, {0x1FAF, {0x3A9, 0x314, 0x342, 0x345}}, {0x1FB0, {0x3B1, 0x306}}, {0x1FB1, {0x3B1, 0x304}}, {0x1FB2, {0x3B1, 0x300, 0x345}}, {0x1FB3, {0x3B1, 0x345}}, -{0x1FB4, {0x3B1, 0x301, 0x345}}, {0x1FB6, {0x3B1, 0x342}}, {0x1FB7, {0x3B1, 0x342, 0x345}}, {0x1FB8, {0x391, 0x306}}, {0x1FB9, {0x391, 0x304}}, {0x1FBA, {0x391, 0x300}}, {0x1FBB, {0x391, 0x301}}, -{0x1FBC, {0x391, 0x345}}, {0x1FBE, {0x3B9}}, {0x1FC1, {0xA8, 0x342}}, {0x1FC2, {0x3B7, 0x300, 0x345}}, {0x1FC3, {0x3B7, 0x345}}, {0x1FC4, {0x3B7, 0x301, 0x345}}, {0x1FC6, {0x3B7, 0x342}}, -{0x1FC7, {0x3B7, 0x342, 0x345}}, {0x1FC8, {0x395, 0x300}}, {0x1FC9, {0x395, 0x301}}, {0x1FCA, {0x397, 0x300}}, {0x1FCB, {0x397, 0x301}}, {0x1FCC, {0x397, 0x345}}, {0x1FCD, {0x1FBF, 0x300}}, -{0x1FCE, {0x1FBF, 0x301}}, {0x1FCF, {0x1FBF, 0x342}}, {0x1FD0, {0x3B9, 0x306}}, {0x1FD1, {0x3B9, 0x304}}, {0x1FD2, {0x3B9, 0x308, 0x300}}, {0x1FD3, {0x3B9, 0x308, 0x301}}, {0x1FD6, {0x3B9, 0x342}}, -{0x1FD7, {0x3B9, 0x308, 0x342}}, {0x1FD8, {0x399, 0x306}}, {0x1FD9, {0x399, 0x304}}, {0x1FDA, {0x399, 0x300}}, {0x1FDB, {0x399, 0x301}}, {0x1FDD, {0x1FFE, 0x300}}, {0x1FDE, {0x1FFE, 0x301}}, -{0x1FDF, {0x1FFE, 0x342}}, {0x1FE0, {0x3C5, 0x306}}, {0x1FE1, {0x3C5, 0x304}}, {0x1FE2, {0x3C5, 0x308, 0x300}}, {0x1FE3, {0x3C5, 0x308, 0x301}}, {0x1FE4, {0x3C1, 0x313}}, {0x1FE5, {0x3C1, 0x314}}, -{0x1FE6, {0x3C5, 0x342}}, {0x1FE7, {0x3C5, 0x308, 0x342}}, {0x1FE8, {0x3A5, 0x306}}, {0x1FE9, {0x3A5, 0x304}}, {0x1FEA, {0x3A5, 0x300}}, {0x1FEB, {0x3A5, 0x301}}, {0x1FEC, {0x3A1, 0x314}}, -{0x1FED, {0xA8, 0x300}}, {0x1FEE, {0xA8, 0x301}}, {0x1FEF, {0x60}}, {0x1FF2, {0x3C9, 0x300, 0x345}}, {0x1FF3, {0x3C9, 0x345}}, {0x1FF4, {0x3C9, 0x301, 0x345}}, {0x1FF6, {0x3C9, 0x342}}, -{0x1FF7, {0x3C9, 0x342, 0x345}}, {0x1FF8, {0x39F, 0x300}}, {0x1FF9, {0x39F, 0x301}}, {0x1FFA, {0x3A9, 0x300}}, {0x1FFB, {0x3A9, 0x301}}, {0x1FFC, {0x3A9, 0x345}}, {0x1FFD, {0xB4}}, {0x2000, {0x2002}}, -{0x2001, {0x2003}}, {0x2126, {0x3A9}}, {0x212A, {0x4B}}, {0x212B, {0x41, 0x30A}}, {0x219A, {0x2190, 0x338}}, {0x219B, {0x2192, 0x338}}, {0x21AE, {0x2194, 0x338}}, {0x21CD, {0x21D0, 0x338}}, -{0x21CE, {0x21D4, 0x338}}, {0x21CF, {0x21D2, 0x338}}, {0x2204, {0x2203, 
0x338}}, {0x2209, {0x2208, 0x338}}, {0x220C, {0x220B, 0x338}}, {0x2224, {0x2223, 0x338}}, {0x2226, {0x2225, 0x338}}, -{0x2241, {0x223C, 0x338}}, {0x2244, {0x2243, 0x338}}, {0x2247, {0x2245, 0x338}}, {0x2249, {0x2248, 0x338}}, {0x2260, {0x3D, 0x338}}, {0x2262, {0x2261, 0x338}}, {0x226D, {0x224D, 0x338}}, -{0x226E, {0x3C, 0x338}}, {0x226F, {0x3E, 0x338}}, {0x2270, {0x2264, 0x338}}, {0x2271, {0x2265, 0x338}}, {0x2274, {0x2272, 0x338}}, {0x2275, {0x2273, 0x338}}, {0x2278, {0x2276, 0x338}}, -{0x2279, {0x2277, 0x338}}, {0x2280, {0x227A, 0x338}}, {0x2281, {0x227B, 0x338}}, {0x2284, {0x2282, 0x338}}, {0x2285, {0x2283, 0x338}}, {0x2288, {0x2286, 0x338}}, {0x2289, {0x2287, 0x338}}, -{0x22AC, {0x22A2, 0x338}}, {0x22AD, {0x22A8, 0x338}}, {0x22AE, {0x22A9, 0x338}}, {0x22AF, {0x22AB, 0x338}}, {0x22E0, {0x227C, 0x338}}, {0x22E1, {0x227D, 0x338}}, {0x22E2, {0x2291, 0x338}}, -{0x22E3, {0x2292, 0x338}}, {0x22EA, {0x22B2, 0x338}}, {0x22EB, {0x22B3, 0x338}}, {0x22EC, {0x22B4, 0x338}}, {0x22ED, {0x22B5, 0x338}}, {0x2329, {0x3008}}, {0x232A, {0x3009}}, -{0x2ADC, {0x2ADD, 0x338}}, {0x304C, {0x304B, 0x3099}}, {0x304E, {0x304D, 0x3099}}, {0x3050, {0x304F, 0x3099}}, {0x3052, {0x3051, 0x3099}}, {0x3054, {0x3053, 0x3099}}, {0x3056, {0x3055, 0x3099}}, -{0x3058, {0x3057, 0x3099}}, {0x305A, {0x3059, 0x3099}}, {0x305C, {0x305B, 0x3099}}, {0x305E, {0x305D, 0x3099}}, {0x3060, {0x305F, 0x3099}}, {0x3062, {0x3061, 0x3099}}, {0x3065, {0x3064, 0x3099}}, -{0x3067, {0x3066, 0x3099}}, {0x3069, {0x3068, 0x3099}}, {0x3070, {0x306F, 0x3099}}, {0x3071, {0x306F, 0x309A}}, {0x3073, {0x3072, 0x3099}}, {0x3074, {0x3072, 0x309A}}, {0x3076, {0x3075, 0x3099}}, -{0x3077, {0x3075, 0x309A}}, {0x3079, {0x3078, 0x3099}}, {0x307A, {0x3078, 0x309A}}, {0x307C, {0x307B, 0x3099}}, {0x307D, {0x307B, 0x309A}}, {0x3094, {0x3046, 0x3099}}, {0x309E, {0x309D, 0x3099}}, -{0x30AC, {0x30AB, 0x3099}}, {0x30AE, {0x30AD, 0x3099}}, {0x30B0, {0x30AF, 0x3099}}, {0x30B2, {0x30B1, 0x3099}}, {0x30B4, {0x30B3, 0x3099}}, {0x30B6, {0x30B5, 0x3099}}, {0x30B8, {0x30B7, 0x3099}}, -{0x30BA, {0x30B9, 0x3099}}, {0x30BC, {0x30BB, 0x3099}}, {0x30BE, {0x30BD, 0x3099}}, {0x30C0, {0x30BF, 0x3099}}, {0x30C2, {0x30C1, 0x3099}}, {0x30C5, {0x30C4, 0x3099}}, {0x30C7, {0x30C6, 0x3099}}, -{0x30C9, {0x30C8, 0x3099}}, {0x30D0, {0x30CF, 0x3099}}, {0x30D1, {0x30CF, 0x309A}}, {0x30D3, {0x30D2, 0x3099}}, {0x30D4, {0x30D2, 0x309A}}, {0x30D6, {0x30D5, 0x3099}}, {0x30D7, {0x30D5, 0x309A}}, -{0x30D9, {0x30D8, 0x3099}}, {0x30DA, {0x30D8, 0x309A}}, {0x30DC, {0x30DB, 0x3099}}, {0x30DD, {0x30DB, 0x309A}}, {0x30F4, {0x30A6, 0x3099}}, {0x30F7, {0x30EF, 0x3099}}, {0x30F8, {0x30F0, 0x3099}}, -{0x30F9, {0x30F1, 0x3099}}, {0x30FA, {0x30F2, 0x3099}}, {0x30FE, {0x30FD, 0x3099}}, {0xF900, {0x8C48}}, {0xF901, {0x66F4}}, {0xF902, {0x8ECA}}, {0xF903, {0x8CC8}}, {0xF904, {0x6ED1}}, -{0xF905, {0x4E32}}, {0xF906, {0x53E5}}, {0xF907, {0x9F9C}}, {0xF908, {0x9F9C}}, {0xF909, {0x5951}}, {0xF90A, {0x91D1}}, {0xF90B, {0x5587}}, {0xF90C, {0x5948}}, {0xF90D, {0x61F6}}, {0xF90E, {0x7669}}, -{0xF90F, {0x7F85}}, {0xF910, {0x863F}}, {0xF911, {0x87BA}}, {0xF912, {0x88F8}}, {0xF913, {0x908F}}, {0xF914, {0x6A02}}, {0xF915, {0x6D1B}}, {0xF916, {0x70D9}}, {0xF917, {0x73DE}}, {0xF918, {0x843D}}, -{0xF919, {0x916A}}, {0xF91A, {0x99F1}}, {0xF91B, {0x4E82}}, {0xF91C, {0x5375}}, {0xF91D, {0x6B04}}, {0xF91E, {0x721B}}, {0xF91F, {0x862D}}, {0xF920, {0x9E1E}}, {0xF921, {0x5D50}}, {0xF922, {0x6FEB}}, -{0xF923, {0x85CD}}, {0xF924, {0x8964}}, {0xF925, {0x62C9}}, {0xF926, {0x81D8}}, {0xF927, {0x881F}}, {0xF928, {0x5ECA}}, {0xF929, {0x6717}}, 
{0xF92A, {0x6D6A}}, {0xF92B, {0x72FC}}, {0xF92C, {0x90CE}}, -{0xF92D, {0x4F86}}, {0xF92E, {0x51B7}}, {0xF92F, {0x52DE}}, {0xF930, {0x64C4}}, {0xF931, {0x6AD3}}, {0xF932, {0x7210}}, {0xF933, {0x76E7}}, {0xF934, {0x8001}}, {0xF935, {0x8606}}, {0xF936, {0x865C}}, -{0xF937, {0x8DEF}}, {0xF938, {0x9732}}, {0xF939, {0x9B6F}}, {0xF93A, {0x9DFA}}, {0xF93B, {0x788C}}, {0xF93C, {0x797F}}, {0xF93D, {0x7DA0}}, {0xF93E, {0x83C9}}, {0xF93F, {0x9304}}, {0xF940, {0x9E7F}}, -{0xF941, {0x8AD6}}, {0xF942, {0x58DF}}, {0xF943, {0x5F04}}, {0xF944, {0x7C60}}, {0xF945, {0x807E}}, {0xF946, {0x7262}}, {0xF947, {0x78CA}}, {0xF948, {0x8CC2}}, {0xF949, {0x96F7}}, {0xF94A, {0x58D8}}, -{0xF94B, {0x5C62}}, {0xF94C, {0x6A13}}, {0xF94D, {0x6DDA}}, {0xF94E, {0x6F0F}}, {0xF94F, {0x7D2F}}, {0xF950, {0x7E37}}, {0xF951, {0x964B}}, {0xF952, {0x52D2}}, {0xF953, {0x808B}}, {0xF954, {0x51DC}}, -{0xF955, {0x51CC}}, {0xF956, {0x7A1C}}, {0xF957, {0x7DBE}}, {0xF958, {0x83F1}}, {0xF959, {0x9675}}, {0xF95A, {0x8B80}}, {0xF95B, {0x62CF}}, {0xF95C, {0x6A02}}, {0xF95D, {0x8AFE}}, {0xF95E, {0x4E39}}, -{0xF95F, {0x5BE7}}, {0xF960, {0x6012}}, {0xF961, {0x7387}}, {0xF962, {0x7570}}, {0xF963, {0x5317}}, {0xF964, {0x78FB}}, {0xF965, {0x4FBF}}, {0xF966, {0x5FA9}}, {0xF967, {0x4E0D}}, {0xF968, {0x6CCC}}, -{0xF969, {0x6578}}, {0xF96A, {0x7D22}}, {0xF96B, {0x53C3}}, {0xF96C, {0x585E}}, {0xF96D, {0x7701}}, {0xF96E, {0x8449}}, {0xF96F, {0x8AAA}}, {0xF970, {0x6BBA}}, {0xF971, {0x8FB0}}, {0xF972, {0x6C88}}, -{0xF973, {0x62FE}}, {0xF974, {0x82E5}}, {0xF975, {0x63A0}}, {0xF976, {0x7565}}, {0xF977, {0x4EAE}}, {0xF978, {0x5169}}, {0xF979, {0x51C9}}, {0xF97A, {0x6881}}, {0xF97B, {0x7CE7}}, {0xF97C, {0x826F}}, -{0xF97D, {0x8AD2}}, {0xF97E, {0x91CF}}, {0xF97F, {0x52F5}}, {0xF980, {0x5442}}, {0xF981, {0x5973}}, {0xF982, {0x5EEC}}, {0xF983, {0x65C5}}, {0xF984, {0x6FFE}}, {0xF985, {0x792A}}, {0xF986, {0x95AD}}, -{0xF987, {0x9A6A}}, {0xF988, {0x9E97}}, {0xF989, {0x9ECE}}, {0xF98A, {0x529B}}, {0xF98B, {0x66C6}}, {0xF98C, {0x6B77}}, {0xF98D, {0x8F62}}, {0xF98E, {0x5E74}}, {0xF98F, {0x6190}}, {0xF990, {0x6200}}, -{0xF991, {0x649A}}, {0xF992, {0x6F23}}, {0xF993, {0x7149}}, {0xF994, {0x7489}}, {0xF995, {0x79CA}}, {0xF996, {0x7DF4}}, {0xF997, {0x806F}}, {0xF998, {0x8F26}}, {0xF999, {0x84EE}}, {0xF99A, {0x9023}}, -{0xF99B, {0x934A}}, {0xF99C, {0x5217}}, {0xF99D, {0x52A3}}, {0xF99E, {0x54BD}}, {0xF99F, {0x70C8}}, {0xF9A0, {0x88C2}}, {0xF9A1, {0x8AAA}}, {0xF9A2, {0x5EC9}}, {0xF9A3, {0x5FF5}}, {0xF9A4, {0x637B}}, -{0xF9A5, {0x6BAE}}, {0xF9A6, {0x7C3E}}, {0xF9A7, {0x7375}}, {0xF9A8, {0x4EE4}}, {0xF9A9, {0x56F9}}, {0xF9AA, {0x5BE7}}, {0xF9AB, {0x5DBA}}, {0xF9AC, {0x601C}}, {0xF9AD, {0x73B2}}, {0xF9AE, {0x7469}}, -{0xF9AF, {0x7F9A}}, {0xF9B0, {0x8046}}, {0xF9B1, {0x9234}}, {0xF9B2, {0x96F6}}, {0xF9B3, {0x9748}}, {0xF9B4, {0x9818}}, {0xF9B5, {0x4F8B}}, {0xF9B6, {0x79AE}}, {0xF9B7, {0x91B4}}, {0xF9B8, {0x96B8}}, -{0xF9B9, {0x60E1}}, {0xF9BA, {0x4E86}}, {0xF9BB, {0x50DA}}, {0xF9BC, {0x5BEE}}, {0xF9BD, {0x5C3F}}, {0xF9BE, {0x6599}}, {0xF9BF, {0x6A02}}, {0xF9C0, {0x71CE}}, {0xF9C1, {0x7642}}, {0xF9C2, {0x84FC}}, -{0xF9C3, {0x907C}}, {0xF9C4, {0x9F8D}}, {0xF9C5, {0x6688}}, {0xF9C6, {0x962E}}, {0xF9C7, {0x5289}}, {0xF9C8, {0x677B}}, {0xF9C9, {0x67F3}}, {0xF9CA, {0x6D41}}, {0xF9CB, {0x6E9C}}, {0xF9CC, {0x7409}}, -{0xF9CD, {0x7559}}, {0xF9CE, {0x786B}}, {0xF9CF, {0x7D10}}, {0xF9D0, {0x985E}}, {0xF9D1, {0x516D}}, {0xF9D2, {0x622E}}, {0xF9D3, {0x9678}}, {0xF9D4, {0x502B}}, {0xF9D5, {0x5D19}}, {0xF9D6, {0x6DEA}}, -{0xF9D7, {0x8F2A}}, {0xF9D8, {0x5F8B}}, {0xF9D9, {0x6144}}, {0xF9DA, 
{0x6817}}, {0xF9DB, {0x7387}}, {0xF9DC, {0x9686}}, {0xF9DD, {0x5229}}, {0xF9DE, {0x540F}}, {0xF9DF, {0x5C65}}, {0xF9E0, {0x6613}}, -{0xF9E1, {0x674E}}, {0xF9E2, {0x68A8}}, {0xF9E3, {0x6CE5}}, {0xF9E4, {0x7406}}, {0xF9E5, {0x75E2}}, {0xF9E6, {0x7F79}}, {0xF9E7, {0x88CF}}, {0xF9E8, {0x88E1}}, {0xF9E9, {0x91CC}}, {0xF9EA, {0x96E2}}, -{0xF9EB, {0x533F}}, {0xF9EC, {0x6EBA}}, {0xF9ED, {0x541D}}, {0xF9EE, {0x71D0}}, {0xF9EF, {0x7498}}, {0xF9F0, {0x85FA}}, {0xF9F1, {0x96A3}}, {0xF9F2, {0x9C57}}, {0xF9F3, {0x9E9F}}, {0xF9F4, {0x6797}}, -{0xF9F5, {0x6DCB}}, {0xF9F6, {0x81E8}}, {0xF9F7, {0x7ACB}}, {0xF9F8, {0x7B20}}, {0xF9F9, {0x7C92}}, {0xF9FA, {0x72C0}}, {0xF9FB, {0x7099}}, {0xF9FC, {0x8B58}}, {0xF9FD, {0x4EC0}}, {0xF9FE, {0x8336}}, -{0xF9FF, {0x523A}}, {0xFA00, {0x5207}}, {0xFA01, {0x5EA6}}, {0xFA02, {0x62D3}}, {0xFA03, {0x7CD6}}, {0xFA04, {0x5B85}}, {0xFA05, {0x6D1E}}, {0xFA06, {0x66B4}}, {0xFA07, {0x8F3B}}, {0xFA08, {0x884C}}, -{0xFA09, {0x964D}}, {0xFA0A, {0x898B}}, {0xFA0B, {0x5ED3}}, {0xFA0C, {0x5140}}, {0xFA0D, {0x55C0}}, {0xFA10, {0x585A}}, {0xFA12, {0x6674}}, {0xFA15, {0x51DE}}, {0xFA16, {0x732A}}, {0xFA17, {0x76CA}}, -{0xFA18, {0x793C}}, {0xFA19, {0x795E}}, {0xFA1A, {0x7965}}, {0xFA1B, {0x798F}}, {0xFA1C, {0x9756}}, {0xFA1D, {0x7CBE}}, {0xFA1E, {0x7FBD}}, {0xFA20, {0x8612}}, {0xFA22, {0x8AF8}}, {0xFA25, {0x9038}}, -{0xFA26, {0x90FD}}, {0xFA2A, {0x98EF}}, {0xFA2B, {0x98FC}}, {0xFA2C, {0x9928}}, {0xFA2D, {0x9DB4}}, {0xFA2E, {0x90DE}}, {0xFA2F, {0x96B7}}, {0xFA30, {0x4FAE}}, {0xFA31, {0x50E7}}, {0xFA32, {0x514D}}, -{0xFA33, {0x52C9}}, {0xFA34, {0x52E4}}, {0xFA35, {0x5351}}, {0xFA36, {0x559D}}, {0xFA37, {0x5606}}, {0xFA38, {0x5668}}, {0xFA39, {0x5840}}, {0xFA3A, {0x58A8}}, {0xFA3B, {0x5C64}}, {0xFA3C, {0x5C6E}}, -{0xFA3D, {0x6094}}, {0xFA3E, {0x6168}}, {0xFA3F, {0x618E}}, {0xFA40, {0x61F2}}, {0xFA41, {0x654F}}, {0xFA42, {0x65E2}}, {0xFA43, {0x6691}}, {0xFA44, {0x6885}}, {0xFA45, {0x6D77}}, {0xFA46, {0x6E1A}}, -{0xFA47, {0x6F22}}, {0xFA48, {0x716E}}, {0xFA49, {0x722B}}, {0xFA4A, {0x7422}}, {0xFA4B, {0x7891}}, {0xFA4C, {0x793E}}, {0xFA4D, {0x7949}}, {0xFA4E, {0x7948}}, {0xFA4F, {0x7950}}, {0xFA50, {0x7956}}, -{0xFA51, {0x795D}}, {0xFA52, {0x798D}}, {0xFA53, {0x798E}}, {0xFA54, {0x7A40}}, {0xFA55, {0x7A81}}, {0xFA56, {0x7BC0}}, {0xFA57, {0x7DF4}}, {0xFA58, {0x7E09}}, {0xFA59, {0x7E41}}, {0xFA5A, {0x7F72}}, -{0xFA5B, {0x8005}}, {0xFA5C, {0x81ED}}, {0xFA5D, {0x8279}}, {0xFA5E, {0x8279}}, {0xFA5F, {0x8457}}, {0xFA60, {0x8910}}, {0xFA61, {0x8996}}, {0xFA62, {0x8B01}}, {0xFA63, {0x8B39}}, {0xFA64, {0x8CD3}}, -{0xFA65, {0x8D08}}, {0xFA66, {0x8FB6}}, {0xFA67, {0x9038}}, {0xFA68, {0x96E3}}, {0xFA69, {0x97FF}}, {0xFA6A, {0x983B}}, {0xFA6B, {0x6075}}, {0xFA6C, {0x242EE}}, {0xFA6D, {0x8218}}, {0xFA70, {0x4E26}}, -{0xFA71, {0x51B5}}, {0xFA72, {0x5168}}, {0xFA73, {0x4F80}}, {0xFA74, {0x5145}}, {0xFA75, {0x5180}}, {0xFA76, {0x52C7}}, {0xFA77, {0x52FA}}, {0xFA78, {0x559D}}, {0xFA79, {0x5555}}, {0xFA7A, {0x5599}}, -{0xFA7B, {0x55E2}}, {0xFA7C, {0x585A}}, {0xFA7D, {0x58B3}}, {0xFA7E, {0x5944}}, {0xFA7F, {0x5954}}, {0xFA80, {0x5A62}}, {0xFA81, {0x5B28}}, {0xFA82, {0x5ED2}}, {0xFA83, {0x5ED9}}, {0xFA84, {0x5F69}}, -{0xFA85, {0x5FAD}}, {0xFA86, {0x60D8}}, {0xFA87, {0x614E}}, {0xFA88, {0x6108}}, {0xFA89, {0x618E}}, {0xFA8A, {0x6160}}, {0xFA8B, {0x61F2}}, {0xFA8C, {0x6234}}, {0xFA8D, {0x63C4}}, {0xFA8E, {0x641C}}, -{0xFA8F, {0x6452}}, {0xFA90, {0x6556}}, {0xFA91, {0x6674}}, {0xFA92, {0x6717}}, {0xFA93, {0x671B}}, {0xFA94, {0x6756}}, {0xFA95, {0x6B79}}, {0xFA96, {0x6BBA}}, {0xFA97, {0x6D41}}, {0xFA98, {0x6EDB}}, 
-{0xFA99, {0x6ECB}}, {0xFA9A, {0x6F22}}, {0xFA9B, {0x701E}}, {0xFA9C, {0x716E}}, {0xFA9D, {0x77A7}}, {0xFA9E, {0x7235}}, {0xFA9F, {0x72AF}}, {0xFAA0, {0x732A}}, {0xFAA1, {0x7471}}, {0xFAA2, {0x7506}}, -{0xFAA3, {0x753B}}, {0xFAA4, {0x761D}}, {0xFAA5, {0x761F}}, {0xFAA6, {0x76CA}}, {0xFAA7, {0x76DB}}, {0xFAA8, {0x76F4}}, {0xFAA9, {0x774A}}, {0xFAAA, {0x7740}}, {0xFAAB, {0x78CC}}, {0xFAAC, {0x7AB1}}, -{0xFAAD, {0x7BC0}}, {0xFAAE, {0x7C7B}}, {0xFAAF, {0x7D5B}}, {0xFAB0, {0x7DF4}}, {0xFAB1, {0x7F3E}}, {0xFAB2, {0x8005}}, {0xFAB3, {0x8352}}, {0xFAB4, {0x83EF}}, {0xFAB5, {0x8779}}, {0xFAB6, {0x8941}}, -{0xFAB7, {0x8986}}, {0xFAB8, {0x8996}}, {0xFAB9, {0x8ABF}}, {0xFABA, {0x8AF8}}, {0xFABB, {0x8ACB}}, {0xFABC, {0x8B01}}, {0xFABD, {0x8AFE}}, {0xFABE, {0x8AED}}, {0xFABF, {0x8B39}}, {0xFAC0, {0x8B8A}}, -{0xFAC1, {0x8D08}}, {0xFAC2, {0x8F38}}, {0xFAC3, {0x9072}}, {0xFAC4, {0x9199}}, {0xFAC5, {0x9276}}, {0xFAC6, {0x967C}}, {0xFAC7, {0x96E3}}, {0xFAC8, {0x9756}}, {0xFAC9, {0x97DB}}, {0xFACA, {0x97FF}}, -{0xFACB, {0x980B}}, {0xFACC, {0x983B}}, {0xFACD, {0x9B12}}, {0xFACE, {0x9F9C}}, {0xFACF, {0x2284A}}, {0xFAD0, {0x22844}}, {0xFAD1, {0x233D5}}, {0xFAD2, {0x3B9D}}, {0xFAD3, {0x4018}}, -{0xFAD4, {0x4039}}, {0xFAD5, {0x25249}}, {0xFAD6, {0x25CD0}}, {0xFAD7, {0x27ED3}}, {0xFAD8, {0x9F43}}, {0xFAD9, {0x9F8E}}, {0xFB1D, {0x5D9, 0x5B4}}, {0xFB1F, {0x5F2, 0x5B7}}, {0xFB2A, {0x5E9, 0x5C1}}, -{0xFB2B, {0x5E9, 0x5C2}}, {0xFB2C, {0x5E9, 0x5BC, 0x5C1}}, {0xFB2D, {0x5E9, 0x5BC, 0x5C2}}, {0xFB2E, {0x5D0, 0x5B7}}, {0xFB2F, {0x5D0, 0x5B8}}, {0xFB30, {0x5D0, 0x5BC}}, {0xFB31, {0x5D1, 0x5BC}}, -{0xFB32, {0x5D2, 0x5BC}}, {0xFB33, {0x5D3, 0x5BC}}, {0xFB34, {0x5D4, 0x5BC}}, {0xFB35, {0x5D5, 0x5BC}}, {0xFB36, {0x5D6, 0x5BC}}, {0xFB38, {0x5D8, 0x5BC}}, {0xFB39, {0x5D9, 0x5BC}}, -{0xFB3A, {0x5DA, 0x5BC}}, {0xFB3B, {0x5DB, 0x5BC}}, {0xFB3C, {0x5DC, 0x5BC}}, {0xFB3E, {0x5DE, 0x5BC}}, {0xFB40, {0x5E0, 0x5BC}}, {0xFB41, {0x5E1, 0x5BC}}, {0xFB43, {0x5E3, 0x5BC}}, -{0xFB44, {0x5E4, 0x5BC}}, {0xFB46, {0x5E6, 0x5BC}}, {0xFB47, {0x5E7, 0x5BC}}, {0xFB48, {0x5E8, 0x5BC}}, {0xFB49, {0x5E9, 0x5BC}}, {0xFB4A, {0x5EA, 0x5BC}}, {0xFB4B, {0x5D5, 0x5B9}}, -{0xFB4C, {0x5D1, 0x5BF}}, {0xFB4D, {0x5DB, 0x5BF}}, {0xFB4E, {0x5E4, 0x5BF}}, {0x1109A, {0x11099, 0x110BA}}, {0x1109C, {0x1109B, 0x110BA}}, {0x110AB, {0x110A5, 0x110BA}}, -{0x1112E, {0x11131, 0x11127}}, {0x1112F, {0x11132, 0x11127}}, {0x1134B, {0x11347, 0x1133E}}, {0x1134C, {0x11347, 0x11357}}, {0x114BB, {0x114B9, 0x114BA}}, {0x114BC, {0x114B9, 0x114B0}}, -{0x114BE, {0x114B9, 0x114BD}}, {0x115BA, {0x115B8, 0x115AF}}, {0x115BB, {0x115B9, 0x115AF}}, {0x1D15E, {0x1D157, 0x1D165}}, {0x1D15F, {0x1D158, 0x1D165}}, {0x1D160, {0x1D158, 0x1D165, 0x1D16E}}, -{0x1D161, {0x1D158, 0x1D165, 0x1D16F}}, {0x1D162, {0x1D158, 0x1D165, 0x1D170}}, {0x1D163, {0x1D158, 0x1D165, 0x1D171}}, {0x1D164, {0x1D158, 0x1D165, 0x1D172}}, {0x1D1BB, {0x1D1B9, 0x1D165}}, -{0x1D1BC, {0x1D1BA, 0x1D165}}, {0x1D1BD, {0x1D1B9, 0x1D165, 0x1D16E}}, {0x1D1BE, {0x1D1BA, 0x1D165, 0x1D16E}}, {0x1D1BF, {0x1D1B9, 0x1D165, 0x1D16F}}, {0x1D1C0, {0x1D1BA, 0x1D165, 0x1D16F}}, -{0x2F800, {0x4E3D}}, {0x2F801, {0x4E38}}, {0x2F802, {0x4E41}}, {0x2F803, {0x20122}}, {0x2F804, {0x4F60}}, {0x2F805, {0x4FAE}}, {0x2F806, {0x4FBB}}, {0x2F807, {0x5002}}, {0x2F808, {0x507A}}, -{0x2F809, {0x5099}}, {0x2F80A, {0x50E7}}, {0x2F80B, {0x50CF}}, {0x2F80C, {0x349E}}, {0x2F80D, {0x2063A}}, {0x2F80E, {0x514D}}, {0x2F80F, {0x5154}}, {0x2F810, {0x5164}}, {0x2F811, {0x5177}}, -{0x2F812, {0x2051C}}, {0x2F813, {0x34B9}}, {0x2F814, {0x5167}}, {0x2F815, {0x518D}}, {0x2F816, 
{0x2054B}}, {0x2F817, {0x5197}}, {0x2F818, {0x51A4}}, {0x2F819, {0x4ECC}}, {0x2F81A, {0x51AC}}, -{0x2F81B, {0x51B5}}, {0x2F81C, {0x291DF}}, {0x2F81D, {0x51F5}}, {0x2F81E, {0x5203}}, {0x2F81F, {0x34DF}}, {0x2F820, {0x523B}}, {0x2F821, {0x5246}}, {0x2F822, {0x5272}}, {0x2F823, {0x5277}}, -{0x2F824, {0x3515}}, {0x2F825, {0x52C7}}, {0x2F826, {0x52C9}}, {0x2F827, {0x52E4}}, {0x2F828, {0x52FA}}, {0x2F829, {0x5305}}, {0x2F82A, {0x5306}}, {0x2F82B, {0x5317}}, {0x2F82C, {0x5349}}, -{0x2F82D, {0x5351}}, {0x2F82E, {0x535A}}, {0x2F82F, {0x5373}}, {0x2F830, {0x537D}}, {0x2F831, {0x537F}}, {0x2F832, {0x537F}}, {0x2F833, {0x537F}}, {0x2F834, {0x20A2C}}, {0x2F835, {0x7070}}, -{0x2F836, {0x53CA}}, {0x2F837, {0x53DF}}, {0x2F838, {0x20B63}}, {0x2F839, {0x53EB}}, {0x2F83A, {0x53F1}}, {0x2F83B, {0x5406}}, {0x2F83C, {0x549E}}, {0x2F83D, {0x5438}}, {0x2F83E, {0x5448}}, -{0x2F83F, {0x5468}}, {0x2F840, {0x54A2}}, {0x2F841, {0x54F6}}, {0x2F842, {0x5510}}, {0x2F843, {0x5553}}, {0x2F844, {0x5563}}, {0x2F845, {0x5584}}, {0x2F846, {0x5584}}, {0x2F847, {0x5599}}, -{0x2F848, {0x55AB}}, {0x2F849, {0x55B3}}, {0x2F84A, {0x55C2}}, {0x2F84B, {0x5716}}, {0x2F84C, {0x5606}}, {0x2F84D, {0x5717}}, {0x2F84E, {0x5651}}, {0x2F84F, {0x5674}}, {0x2F850, {0x5207}}, -{0x2F851, {0x58EE}}, {0x2F852, {0x57CE}}, {0x2F853, {0x57F4}}, {0x2F854, {0x580D}}, {0x2F855, {0x578B}}, {0x2F856, {0x5832}}, {0x2F857, {0x5831}}, {0x2F858, {0x58AC}}, {0x2F859, {0x214E4}}, -{0x2F85A, {0x58F2}}, {0x2F85B, {0x58F7}}, {0x2F85C, {0x5906}}, {0x2F85D, {0x591A}}, {0x2F85E, {0x5922}}, {0x2F85F, {0x5962}}, {0x2F860, {0x216A8}}, {0x2F861, {0x216EA}}, {0x2F862, {0x59EC}}, -{0x2F863, {0x5A1B}}, {0x2F864, {0x5A27}}, {0x2F865, {0x59D8}}, {0x2F866, {0x5A66}}, {0x2F867, {0x36EE}}, {0x2F868, {0x36FC}}, {0x2F869, {0x5B08}}, {0x2F86A, {0x5B3E}}, {0x2F86B, {0x5B3E}}, -{0x2F86C, {0x219C8}}, {0x2F86D, {0x5BC3}}, {0x2F86E, {0x5BD8}}, {0x2F86F, {0x5BE7}}, {0x2F870, {0x5BF3}}, {0x2F871, {0x21B18}}, {0x2F872, {0x5BFF}}, {0x2F873, {0x5C06}}, {0x2F874, {0x5F53}}, -{0x2F875, {0x5C22}}, {0x2F876, {0x3781}}, {0x2F877, {0x5C60}}, {0x2F878, {0x5C6E}}, {0x2F879, {0x5CC0}}, {0x2F87A, {0x5C8D}}, {0x2F87B, {0x21DE4}}, {0x2F87C, {0x5D43}}, {0x2F87D, {0x21DE6}}, -{0x2F87E, {0x5D6E}}, {0x2F87F, {0x5D6B}}, {0x2F880, {0x5D7C}}, {0x2F881, {0x5DE1}}, {0x2F882, {0x5DE2}}, {0x2F883, {0x382F}}, {0x2F884, {0x5DFD}}, {0x2F885, {0x5E28}}, {0x2F886, {0x5E3D}}, -{0x2F887, {0x5E69}}, {0x2F888, {0x3862}}, {0x2F889, {0x22183}}, {0x2F88A, {0x387C}}, {0x2F88B, {0x5EB0}}, {0x2F88C, {0x5EB3}}, {0x2F88D, {0x5EB6}}, {0x2F88E, {0x5ECA}}, {0x2F88F, {0x2A392}}, -{0x2F890, {0x5EFE}}, {0x2F891, {0x22331}}, {0x2F892, {0x22331}}, {0x2F893, {0x8201}}, {0x2F894, {0x5F22}}, {0x2F895, {0x5F22}}, {0x2F896, {0x38C7}}, {0x2F897, {0x232B8}}, {0x2F898, {0x261DA}}, -{0x2F899, {0x5F62}}, {0x2F89A, {0x5F6B}}, {0x2F89B, {0x38E3}}, {0x2F89C, {0x5F9A}}, {0x2F89D, {0x5FCD}}, {0x2F89E, {0x5FD7}}, {0x2F89F, {0x5FF9}}, {0x2F8A0, {0x6081}}, {0x2F8A1, {0x393A}}, -{0x2F8A2, {0x391C}}, {0x2F8A3, {0x6094}}, {0x2F8A4, {0x226D4}}, {0x2F8A5, {0x60C7}}, {0x2F8A6, {0x6148}}, {0x2F8A7, {0x614C}}, {0x2F8A8, {0x614E}}, {0x2F8A9, {0x614C}}, {0x2F8AA, {0x617A}}, -{0x2F8AB, {0x618E}}, {0x2F8AC, {0x61B2}}, {0x2F8AD, {0x61A4}}, {0x2F8AE, {0x61AF}}, {0x2F8AF, {0x61DE}}, {0x2F8B0, {0x61F2}}, {0x2F8B1, {0x61F6}}, {0x2F8B2, {0x6210}}, {0x2F8B3, {0x621B}}, -{0x2F8B4, {0x625D}}, {0x2F8B5, {0x62B1}}, {0x2F8B6, {0x62D4}}, {0x2F8B7, {0x6350}}, {0x2F8B8, {0x22B0C}}, {0x2F8B9, {0x633D}}, {0x2F8BA, {0x62FC}}, {0x2F8BB, {0x6368}}, {0x2F8BC, {0x6383}}, -{0x2F8BD, 
{0x63E4}}, {0x2F8BE, {0x22BF1}}, {0x2F8BF, {0x6422}}, {0x2F8C0, {0x63C5}}, {0x2F8C1, {0x63A9}}, {0x2F8C2, {0x3A2E}}, {0x2F8C3, {0x6469}}, {0x2F8C4, {0x647E}}, {0x2F8C5, {0x649D}}, -{0x2F8C6, {0x6477}}, {0x2F8C7, {0x3A6C}}, {0x2F8C8, {0x654F}}, {0x2F8C9, {0x656C}}, {0x2F8CA, {0x2300A}}, {0x2F8CB, {0x65E3}}, {0x2F8CC, {0x66F8}}, {0x2F8CD, {0x6649}}, {0x2F8CE, {0x3B19}}, -{0x2F8CF, {0x6691}}, {0x2F8D0, {0x3B08}}, {0x2F8D1, {0x3AE4}}, {0x2F8D2, {0x5192}}, {0x2F8D3, {0x5195}}, {0x2F8D4, {0x6700}}, {0x2F8D5, {0x669C}}, {0x2F8D6, {0x80AD}}, {0x2F8D7, {0x43D9}}, -{0x2F8D8, {0x6717}}, {0x2F8D9, {0x671B}}, {0x2F8DA, {0x6721}}, {0x2F8DB, {0x675E}}, {0x2F8DC, {0x6753}}, {0x2F8DD, {0x233C3}}, {0x2F8DE, {0x3B49}}, {0x2F8DF, {0x67FA}}, {0x2F8E0, {0x6785}}, -{0x2F8E1, {0x6852}}, {0x2F8E2, {0x6885}}, {0x2F8E3, {0x2346D}}, {0x2F8E4, {0x688E}}, {0x2F8E5, {0x681F}}, {0x2F8E6, {0x6914}}, {0x2F8E7, {0x3B9D}}, {0x2F8E8, {0x6942}}, {0x2F8E9, {0x69A3}}, -{0x2F8EA, {0x69EA}}, {0x2F8EB, {0x6AA8}}, {0x2F8EC, {0x236A3}}, {0x2F8ED, {0x6ADB}}, {0x2F8EE, {0x3C18}}, {0x2F8EF, {0x6B21}}, {0x2F8F0, {0x238A7}}, {0x2F8F1, {0x6B54}}, {0x2F8F2, {0x3C4E}}, -{0x2F8F3, {0x6B72}}, {0x2F8F4, {0x6B9F}}, {0x2F8F5, {0x6BBA}}, {0x2F8F6, {0x6BBB}}, {0x2F8F7, {0x23A8D}}, {0x2F8F8, {0x21D0B}}, {0x2F8F9, {0x23AFA}}, {0x2F8FA, {0x6C4E}}, {0x2F8FB, {0x23CBC}}, -{0x2F8FC, {0x6CBF}}, {0x2F8FD, {0x6CCD}}, {0x2F8FE, {0x6C67}}, {0x2F8FF, {0x6D16}}, {0x2F900, {0x6D3E}}, {0x2F901, {0x6D77}}, {0x2F902, {0x6D41}}, {0x2F903, {0x6D69}}, {0x2F904, {0x6D78}}, -{0x2F905, {0x6D85}}, {0x2F906, {0x23D1E}}, {0x2F907, {0x6D34}}, {0x2F908, {0x6E2F}}, {0x2F909, {0x6E6E}}, {0x2F90A, {0x3D33}}, {0x2F90B, {0x6ECB}}, {0x2F90C, {0x6EC7}}, {0x2F90D, {0x23ED1}}, -{0x2F90E, {0x6DF9}}, {0x2F90F, {0x6F6E}}, {0x2F910, {0x23F5E}}, {0x2F911, {0x23F8E}}, {0x2F912, {0x6FC6}}, {0x2F913, {0x7039}}, {0x2F914, {0x701E}}, {0x2F915, {0x701B}}, {0x2F916, {0x3D96}}, -{0x2F917, {0x704A}}, {0x2F918, {0x707D}}, {0x2F919, {0x7077}}, {0x2F91A, {0x70AD}}, {0x2F91B, {0x20525}}, {0x2F91C, {0x7145}}, {0x2F91D, {0x24263}}, {0x2F91E, {0x719C}}, {0x2F91F, {0x243AB}}, -{0x2F920, {0x7228}}, {0x2F921, {0x7235}}, {0x2F922, {0x7250}}, {0x2F923, {0x24608}}, {0x2F924, {0x7280}}, {0x2F925, {0x7295}}, {0x2F926, {0x24735}}, {0x2F927, {0x24814}}, {0x2F928, {0x737A}}, -{0x2F929, {0x738B}}, {0x2F92A, {0x3EAC}}, {0x2F92B, {0x73A5}}, {0x2F92C, {0x3EB8}}, {0x2F92D, {0x3EB8}}, {0x2F92E, {0x7447}}, {0x2F92F, {0x745C}}, {0x2F930, {0x7471}}, {0x2F931, {0x7485}}, -{0x2F932, {0x74CA}}, {0x2F933, {0x3F1B}}, {0x2F934, {0x7524}}, {0x2F935, {0x24C36}}, {0x2F936, {0x753E}}, {0x2F937, {0x24C92}}, {0x2F938, {0x7570}}, {0x2F939, {0x2219F}}, {0x2F93A, {0x7610}}, -{0x2F93B, {0x24FA1}}, {0x2F93C, {0x24FB8}}, {0x2F93D, {0x25044}}, {0x2F93E, {0x3FFC}}, {0x2F93F, {0x4008}}, {0x2F940, {0x76F4}}, {0x2F941, {0x250F3}}, {0x2F942, {0x250F2}}, {0x2F943, {0x25119}}, -{0x2F944, {0x25133}}, {0x2F945, {0x771E}}, {0x2F946, {0x771F}}, {0x2F947, {0x771F}}, {0x2F948, {0x774A}}, {0x2F949, {0x4039}}, {0x2F94A, {0x778B}}, {0x2F94B, {0x4046}}, {0x2F94C, {0x4096}}, -{0x2F94D, {0x2541D}}, {0x2F94E, {0x784E}}, {0x2F94F, {0x788C}}, {0x2F950, {0x78CC}}, {0x2F951, {0x40E3}}, {0x2F952, {0x25626}}, {0x2F953, {0x7956}}, {0x2F954, {0x2569A}}, {0x2F955, {0x256C5}}, -{0x2F956, {0x798F}}, {0x2F957, {0x79EB}}, {0x2F958, {0x412F}}, {0x2F959, {0x7A40}}, {0x2F95A, {0x7A4A}}, {0x2F95B, {0x7A4F}}, {0x2F95C, {0x2597C}}, {0x2F95D, {0x25AA7}}, {0x2F95E, {0x25AA7}}, -{0x2F95F, {0x7AEE}}, {0x2F960, {0x4202}}, {0x2F961, {0x25BAB}}, {0x2F962, {0x7BC6}}, {0x2F963, {0x7BC9}}, 
{0x2F964, {0x4227}}, {0x2F965, {0x25C80}}, {0x2F966, {0x7CD2}}, {0x2F967, {0x42A0}}, -{0x2F968, {0x7CE8}}, {0x2F969, {0x7CE3}}, {0x2F96A, {0x7D00}}, {0x2F96B, {0x25F86}}, {0x2F96C, {0x7D63}}, {0x2F96D, {0x4301}}, {0x2F96E, {0x7DC7}}, {0x2F96F, {0x7E02}}, {0x2F970, {0x7E45}}, -{0x2F971, {0x4334}}, {0x2F972, {0x26228}}, {0x2F973, {0x26247}}, {0x2F974, {0x4359}}, {0x2F975, {0x262D9}}, {0x2F976, {0x7F7A}}, {0x2F977, {0x2633E}}, {0x2F978, {0x7F95}}, {0x2F979, {0x7FFA}}, -{0x2F97A, {0x8005}}, {0x2F97B, {0x264DA}}, {0x2F97C, {0x26523}}, {0x2F97D, {0x8060}}, {0x2F97E, {0x265A8}}, {0x2F97F, {0x8070}}, {0x2F980, {0x2335F}}, {0x2F981, {0x43D5}}, {0x2F982, {0x80B2}}, -{0x2F983, {0x8103}}, {0x2F984, {0x440B}}, {0x2F985, {0x813E}}, {0x2F986, {0x5AB5}}, {0x2F987, {0x267A7}}, {0x2F988, {0x267B5}}, {0x2F989, {0x23393}}, {0x2F98A, {0x2339C}}, {0x2F98B, {0x8201}}, -{0x2F98C, {0x8204}}, {0x2F98D, {0x8F9E}}, {0x2F98E, {0x446B}}, {0x2F98F, {0x8291}}, {0x2F990, {0x828B}}, {0x2F991, {0x829D}}, {0x2F992, {0x52B3}}, {0x2F993, {0x82B1}}, {0x2F994, {0x82B3}}, -{0x2F995, {0x82BD}}, {0x2F996, {0x82E6}}, {0x2F997, {0x26B3C}}, {0x2F998, {0x82E5}}, {0x2F999, {0x831D}}, {0x2F99A, {0x8363}}, {0x2F99B, {0x83AD}}, {0x2F99C, {0x8323}}, {0x2F99D, {0x83BD}}, -{0x2F99E, {0x83E7}}, {0x2F99F, {0x8457}}, {0x2F9A0, {0x8353}}, {0x2F9A1, {0x83CA}}, {0x2F9A2, {0x83CC}}, {0x2F9A3, {0x83DC}}, {0x2F9A4, {0x26C36}}, {0x2F9A5, {0x26D6B}}, {0x2F9A6, {0x26CD5}}, -{0x2F9A7, {0x452B}}, {0x2F9A8, {0x84F1}}, {0x2F9A9, {0x84F3}}, {0x2F9AA, {0x8516}}, {0x2F9AB, {0x273CA}}, {0x2F9AC, {0x8564}}, {0x2F9AD, {0x26F2C}}, {0x2F9AE, {0x455D}}, {0x2F9AF, {0x4561}}, -{0x2F9B0, {0x26FB1}}, {0x2F9B1, {0x270D2}}, {0x2F9B2, {0x456B}}, {0x2F9B3, {0x8650}}, {0x2F9B4, {0x865C}}, {0x2F9B5, {0x8667}}, {0x2F9B6, {0x8669}}, {0x2F9B7, {0x86A9}}, {0x2F9B8, {0x8688}}, -{0x2F9B9, {0x870E}}, {0x2F9BA, {0x86E2}}, {0x2F9BB, {0x8779}}, {0x2F9BC, {0x8728}}, {0x2F9BD, {0x876B}}, {0x2F9BE, {0x8786}}, {0x2F9BF, {0x45D7}}, {0x2F9C0, {0x87E1}}, {0x2F9C1, {0x8801}}, -{0x2F9C2, {0x45F9}}, {0x2F9C3, {0x8860}}, {0x2F9C4, {0x8863}}, {0x2F9C5, {0x27667}}, {0x2F9C6, {0x88D7}}, {0x2F9C7, {0x88DE}}, {0x2F9C8, {0x4635}}, {0x2F9C9, {0x88FA}}, {0x2F9CA, {0x34BB}}, -{0x2F9CB, {0x278AE}}, {0x2F9CC, {0x27966}}, {0x2F9CD, {0x46BE}}, {0x2F9CE, {0x46C7}}, {0x2F9CF, {0x8AA0}}, {0x2F9D0, {0x8AED}}, {0x2F9D1, {0x8B8A}}, {0x2F9D2, {0x8C55}}, {0x2F9D3, {0x27CA8}}, -{0x2F9D4, {0x8CAB}}, {0x2F9D5, {0x8CC1}}, {0x2F9D6, {0x8D1B}}, {0x2F9D7, {0x8D77}}, {0x2F9D8, {0x27F2F}}, {0x2F9D9, {0x20804}}, {0x2F9DA, {0x8DCB}}, {0x2F9DB, {0x8DBC}}, {0x2F9DC, {0x8DF0}}, -{0x2F9DD, {0x208DE}}, {0x2F9DE, {0x8ED4}}, {0x2F9DF, {0x8F38}}, {0x2F9E0, {0x285D2}}, {0x2F9E1, {0x285ED}}, {0x2F9E2, {0x9094}}, {0x2F9E3, {0x90F1}}, {0x2F9E4, {0x9111}}, {0x2F9E5, {0x2872E}}, -{0x2F9E6, {0x911B}}, {0x2F9E7, {0x9238}}, {0x2F9E8, {0x92D7}}, {0x2F9E9, {0x92D8}}, {0x2F9EA, {0x927C}}, {0x2F9EB, {0x93F9}}, {0x2F9EC, {0x9415}}, {0x2F9ED, {0x28BFA}}, {0x2F9EE, {0x958B}}, -{0x2F9EF, {0x4995}}, {0x2F9F0, {0x95B7}}, {0x2F9F1, {0x28D77}}, {0x2F9F2, {0x49E6}}, {0x2F9F3, {0x96C3}}, {0x2F9F4, {0x5DB2}}, {0x2F9F5, {0x9723}}, {0x2F9F6, {0x29145}}, {0x2F9F7, {0x2921A}}, -{0x2F9F8, {0x4A6E}}, {0x2F9F9, {0x4A76}}, {0x2F9FA, {0x97E0}}, {0x2F9FB, {0x2940A}}, {0x2F9FC, {0x4AB2}}, {0x2F9FD, {0x29496}}, {0x2F9FE, {0x980B}}, {0x2F9FF, {0x980B}}, {0x2FA00, {0x9829}}, -{0x2FA01, {0x295B6}}, {0x2FA02, {0x98E2}}, {0x2FA03, {0x4B33}}, {0x2FA04, {0x9929}}, {0x2FA05, {0x99A7}}, {0x2FA06, {0x99C2}}, {0x2FA07, {0x99FE}}, {0x2FA08, {0x4BCE}}, {0x2FA09, {0x29B30}}, -{0x2FA0A, 
{0x9B12}}, {0x2FA0B, {0x9C40}}, {0x2FA0C, {0x9CFD}}, {0x2FA0D, {0x4CCE}}, {0x2FA0E, {0x4CED}}, {0x2FA0F, {0x9D67}}, {0x2FA10, {0x2A0CE}}, {0x2FA11, {0x4CF8}}, {0x2FA12, {0x2A105}}, -{0x2FA13, {0x2A20E}}, {0x2FA14, {0x2A291}}, {0x2FA15, {0x9EBB}}, {0x2FA16, {0x4D56}}, {0x2FA17, {0x9EF9}}, {0x2FA18, {0x9EFE}}, {0x2FA19, {0x9F05}}, {0x2FA1A, {0x9F0F}}, {0x2FA1B, {0x9F16}}, -{0x2FA1D, {0x2A600}}, +static const std::multimap<uint32_t, uint32_t> nfd_map = { +{0xC0, 0x41}, {0xC0, 0x300}, {0xC1, 0x41}, {0xC1, 0x301}, {0xC2, 0x41}, {0xC2, 0x302}, {0xC3, 0x41}, {0xC3, 0x303}, {0xC4, 0x41}, {0xC4, 0x308}, {0xC5, 0x41}, {0xC5, 0x30A}, {0xC7, 0x43}, +{0xC7, 0x327}, {0xC8, 0x45}, {0xC8, 0x300}, {0xC9, 0x45}, {0xC9, 0x301}, {0xCA, 0x45}, {0xCA, 0x302}, {0xCB, 0x45}, {0xCB, 0x308}, {0xCC, 0x49}, {0xCC, 0x300}, {0xCD, 0x49}, {0xCD, 0x301}, +{0xCE, 0x49}, {0xCE, 0x302}, {0xCF, 0x49}, {0xCF, 0x308}, {0xD1, 0x4E}, {0xD1, 0x303}, {0xD2, 0x4F}, {0xD2, 0x300}, {0xD3, 0x4F}, {0xD3, 0x301}, {0xD4, 0x4F}, {0xD4, 0x302}, {0xD5, 0x4F}, +{0xD5, 0x303}, {0xD6, 0x4F}, {0xD6, 0x308}, {0xD9, 0x55}, {0xD9, 0x300}, {0xDA, 0x55}, {0xDA, 0x301}, {0xDB, 0x55}, {0xDB, 0x302}, {0xDC, 0x55}, {0xDC, 0x308}, {0xDD, 0x59}, {0xDD, 0x301}, +{0xE0, 0x61}, {0xE0, 0x300}, {0xE1, 0x61}, {0xE1, 0x301}, {0xE2, 0x61}, {0xE2, 0x302}, {0xE3, 0x61}, {0xE3, 0x303}, {0xE4, 0x61}, {0xE4, 0x308}, {0xE5, 0x61}, {0xE5, 0x30A}, {0xE7, 0x63}, +{0xE7, 0x327}, {0xE8, 0x65}, {0xE8, 0x300}, {0xE9, 0x65}, {0xE9, 0x301}, {0xEA, 0x65}, {0xEA, 0x302}, {0xEB, 0x65}, {0xEB, 0x308}, {0xEC, 0x69}, {0xEC, 0x300}, {0xED, 0x69}, {0xED, 0x301}, +{0xEE, 0x69}, {0xEE, 0x302}, {0xEF, 0x69}, {0xEF, 0x308}, {0xF1, 0x6E}, {0xF1, 0x303}, {0xF2, 0x6F}, {0xF2, 0x300}, {0xF3, 0x6F}, {0xF3, 0x301}, {0xF4, 0x6F}, {0xF4, 0x302}, {0xF5, 0x6F}, +{0xF5, 0x303}, {0xF6, 0x6F}, {0xF6, 0x308}, {0xF9, 0x75}, {0xF9, 0x300}, {0xFA, 0x75}, {0xFA, 0x301}, {0xFB, 0x75}, {0xFB, 0x302}, {0xFC, 0x75}, {0xFC, 0x308}, {0xFD, 0x79}, {0xFD, 0x301}, +{0xFF, 0x79}, {0xFF, 0x308}, {0x100, 0x41}, {0x100, 0x304}, {0x101, 0x61}, {0x101, 0x304}, {0x102, 0x41}, {0x102, 0x306}, {0x103, 0x61}, {0x103, 0x306}, {0x104, 0x41}, {0x104, 0x328}, {0x105, 0x61}, +{0x105, 0x328}, {0x106, 0x43}, {0x106, 0x301}, {0x107, 0x63}, {0x107, 0x301}, {0x108, 0x43}, {0x108, 0x302}, {0x109, 0x63}, {0x109, 0x302}, {0x10A, 0x43}, {0x10A, 0x307}, {0x10B, 0x63}, +{0x10B, 0x307}, {0x10C, 0x43}, {0x10C, 0x30C}, {0x10D, 0x63}, {0x10D, 0x30C}, {0x10E, 0x44}, {0x10E, 0x30C}, {0x10F, 0x64}, {0x10F, 0x30C}, {0x112, 0x45}, {0x112, 0x304}, {0x113, 0x65}, +{0x113, 0x304}, {0x114, 0x45}, {0x114, 0x306}, {0x115, 0x65}, {0x115, 0x306}, {0x116, 0x45}, {0x116, 0x307}, {0x117, 0x65}, {0x117, 0x307}, {0x118, 0x45}, {0x118, 0x328}, {0x119, 0x65}, +{0x119, 0x328}, {0x11A, 0x45}, {0x11A, 0x30C}, {0x11B, 0x65}, {0x11B, 0x30C}, {0x11C, 0x47}, {0x11C, 0x302}, {0x11D, 0x67}, {0x11D, 0x302}, {0x11E, 0x47}, {0x11E, 0x306}, {0x11F, 0x67}, +{0x11F, 0x306}, {0x120, 0x47}, {0x120, 0x307}, {0x121, 0x67}, {0x121, 0x307}, {0x122, 0x47}, {0x122, 0x327}, {0x123, 0x67}, {0x123, 0x327}, {0x124, 0x48}, {0x124, 0x302}, {0x125, 0x68}, +{0x125, 0x302}, {0x128, 0x49}, {0x128, 0x303}, {0x129, 0x69}, {0x129, 0x303}, {0x12A, 0x49}, {0x12A, 0x304}, {0x12B, 0x69}, {0x12B, 0x304}, {0x12C, 0x49}, {0x12C, 0x306}, {0x12D, 0x69}, +{0x12D, 0x306}, {0x12E, 0x49}, {0x12E, 0x328}, {0x12F, 0x69}, {0x12F, 0x328}, {0x130, 0x49}, {0x130, 0x307}, {0x134, 0x4A}, {0x134, 0x302}, {0x135, 0x6A}, {0x135, 0x302}, {0x136, 0x4B}, +{0x136, 0x327}, {0x137, 0x6B}, {0x137, 0x327}, {0x139, 0x4C}, {0x139, 0x301}, {0x13A, 0x6C},
{0x13A, 0x301}, {0x13B, 0x4C}, {0x13B, 0x327}, {0x13C, 0x6C}, {0x13C, 0x327}, {0x13D, 0x4C}, +{0x13D, 0x30C}, {0x13E, 0x6C}, {0x13E, 0x30C}, {0x143, 0x4E}, {0x143, 0x301}, {0x144, 0x6E}, {0x144, 0x301}, {0x145, 0x4E}, {0x145, 0x327}, {0x146, 0x6E}, {0x146, 0x327}, {0x147, 0x4E}, +{0x147, 0x30C}, {0x148, 0x6E}, {0x148, 0x30C}, {0x14C, 0x4F}, {0x14C, 0x304}, {0x14D, 0x6F}, {0x14D, 0x304}, {0x14E, 0x4F}, {0x14E, 0x306}, {0x14F, 0x6F}, {0x14F, 0x306}, {0x150, 0x4F}, +{0x150, 0x30B}, {0x151, 0x6F}, {0x151, 0x30B}, {0x154, 0x52}, {0x154, 0x301}, {0x155, 0x72}, {0x155, 0x301}, {0x156, 0x52}, {0x156, 0x327}, {0x157, 0x72}, {0x157, 0x327}, {0x158, 0x52}, +{0x158, 0x30C}, {0x159, 0x72}, {0x159, 0x30C}, {0x15A, 0x53}, {0x15A, 0x301}, {0x15B, 0x73}, {0x15B, 0x301}, {0x15C, 0x53}, {0x15C, 0x302}, {0x15D, 0x73}, {0x15D, 0x302}, {0x15E, 0x53}, +{0x15E, 0x327}, {0x15F, 0x73}, {0x15F, 0x327}, {0x160, 0x53}, {0x160, 0x30C}, {0x161, 0x73}, {0x161, 0x30C}, {0x162, 0x54}, {0x162, 0x327}, {0x163, 0x74}, {0x163, 0x327}, {0x164, 0x54}, +{0x164, 0x30C}, {0x165, 0x74}, {0x165, 0x30C}, {0x168, 0x55}, {0x168, 0x303}, {0x169, 0x75}, {0x169, 0x303}, {0x16A, 0x55}, {0x16A, 0x304}, {0x16B, 0x75}, {0x16B, 0x304}, {0x16C, 0x55}, +{0x16C, 0x306}, {0x16D, 0x75}, {0x16D, 0x306}, {0x16E, 0x55}, {0x16E, 0x30A}, {0x16F, 0x75}, {0x16F, 0x30A}, {0x170, 0x55}, {0x170, 0x30B}, {0x171, 0x75}, {0x171, 0x30B}, {0x172, 0x55}, +{0x172, 0x328}, {0x173, 0x75}, {0x173, 0x328}, {0x174, 0x57}, {0x174, 0x302}, {0x175, 0x77}, {0x175, 0x302}, {0x176, 0x59}, {0x176, 0x302}, {0x177, 0x79}, {0x177, 0x302}, {0x178, 0x59}, +{0x178, 0x308}, {0x179, 0x5A}, {0x179, 0x301}, {0x17A, 0x7A}, {0x17A, 0x301}, {0x17B, 0x5A}, {0x17B, 0x307}, {0x17C, 0x7A}, {0x17C, 0x307}, {0x17D, 0x5A}, {0x17D, 0x30C}, {0x17E, 0x7A}, +{0x17E, 0x30C}, {0x1A0, 0x4F}, {0x1A0, 0x31B}, {0x1A1, 0x6F}, {0x1A1, 0x31B}, {0x1AF, 0x55}, {0x1AF, 0x31B}, {0x1B0, 0x75}, {0x1B0, 0x31B}, {0x1CD, 0x41}, {0x1CD, 0x30C}, {0x1CE, 0x61}, +{0x1CE, 0x30C}, {0x1CF, 0x49}, {0x1CF, 0x30C}, {0x1D0, 0x69}, {0x1D0, 0x30C}, {0x1D1, 0x4F}, {0x1D1, 0x30C}, {0x1D2, 0x6F}, {0x1D2, 0x30C}, {0x1D3, 0x55}, {0x1D3, 0x30C}, {0x1D4, 0x75}, +{0x1D4, 0x30C}, {0x1D5, 0x55}, {0x1D5, 0x308}, {0x1D5, 0x304}, {0x1D6, 0x75}, {0x1D6, 0x308}, {0x1D6, 0x304}, {0x1D7, 0x55}, {0x1D7, 0x308}, {0x1D7, 0x301}, {0x1D8, 0x75}, {0x1D8, 0x308}, +{0x1D8, 0x301}, {0x1D9, 0x55}, {0x1D9, 0x308}, {0x1D9, 0x30C}, {0x1DA, 0x75}, {0x1DA, 0x308}, {0x1DA, 0x30C}, {0x1DB, 0x55}, {0x1DB, 0x308}, {0x1DB, 0x300}, {0x1DC, 0x75}, {0x1DC, 0x308}, +{0x1DC, 0x300}, {0x1DE, 0x41}, {0x1DE, 0x308}, {0x1DE, 0x304}, {0x1DF, 0x61}, {0x1DF, 0x308}, {0x1DF, 0x304}, {0x1E0, 0x41}, {0x1E0, 0x307}, {0x1E0, 0x304}, {0x1E1, 0x61}, {0x1E1, 0x307}, +{0x1E1, 0x304}, {0x1E2, 0xC6}, {0x1E2, 0x304}, {0x1E3, 0xE6}, {0x1E3, 0x304}, {0x1E6, 0x47}, {0x1E6, 0x30C}, {0x1E7, 0x67}, {0x1E7, 0x30C}, {0x1E8, 0x4B}, {0x1E8, 0x30C}, {0x1E9, 0x6B}, +{0x1E9, 0x30C}, {0x1EA, 0x4F}, {0x1EA, 0x328}, {0x1EB, 0x6F}, {0x1EB, 0x328}, {0x1EC, 0x4F}, {0x1EC, 0x328}, {0x1EC, 0x304}, {0x1ED, 0x6F}, {0x1ED, 0x328}, {0x1ED, 0x304}, {0x1EE, 0x1B7}, +{0x1EE, 0x30C}, {0x1EF, 0x292}, {0x1EF, 0x30C}, {0x1F0, 0x6A}, {0x1F0, 0x30C}, {0x1F4, 0x47}, {0x1F4, 0x301}, {0x1F5, 0x67}, {0x1F5, 0x301}, {0x1F8, 0x4E}, {0x1F8, 0x300}, {0x1F9, 0x6E}, +{0x1F9, 0x300}, {0x1FA, 0x41}, {0x1FA, 0x30A}, {0x1FA, 0x301}, {0x1FB, 0x61}, {0x1FB, 0x30A}, {0x1FB, 0x301}, {0x1FC, 0xC6}, {0x1FC, 0x301}, {0x1FD, 0xE6}, {0x1FD, 0x301}, {0x1FE, 0xD8}, +{0x1FE, 0x301}, {0x1FF, 0xF8}, {0x1FF, 0x301}, {0x200, 0x41}, {0x200, 0x30F}, 
{0x201, 0x61}, {0x201, 0x30F}, {0x202, 0x41}, {0x202, 0x311}, {0x203, 0x61}, {0x203, 0x311}, {0x204, 0x45}, +{0x204, 0x30F}, {0x205, 0x65}, {0x205, 0x30F}, {0x206, 0x45}, {0x206, 0x311}, {0x207, 0x65}, {0x207, 0x311}, {0x208, 0x49}, {0x208, 0x30F}, {0x209, 0x69}, {0x209, 0x30F}, {0x20A, 0x49}, +{0x20A, 0x311}, {0x20B, 0x69}, {0x20B, 0x311}, {0x20C, 0x4F}, {0x20C, 0x30F}, {0x20D, 0x6F}, {0x20D, 0x30F}, {0x20E, 0x4F}, {0x20E, 0x311}, {0x20F, 0x6F}, {0x20F, 0x311}, {0x210, 0x52}, +{0x210, 0x30F}, {0x211, 0x72}, {0x211, 0x30F}, {0x212, 0x52}, {0x212, 0x311}, {0x213, 0x72}, {0x213, 0x311}, {0x214, 0x55}, {0x214, 0x30F}, {0x215, 0x75}, {0x215, 0x30F}, {0x216, 0x55}, +{0x216, 0x311}, {0x217, 0x75}, {0x217, 0x311}, {0x218, 0x53}, {0x218, 0x326}, {0x219, 0x73}, {0x219, 0x326}, {0x21A, 0x54}, {0x21A, 0x326}, {0x21B, 0x74}, {0x21B, 0x326}, {0x21E, 0x48}, +{0x21E, 0x30C}, {0x21F, 0x68}, {0x21F, 0x30C}, {0x226, 0x41}, {0x226, 0x307}, {0x227, 0x61}, {0x227, 0x307}, {0x228, 0x45}, {0x228, 0x327}, {0x229, 0x65}, {0x229, 0x327}, {0x22A, 0x4F}, +{0x22A, 0x308}, {0x22A, 0x304}, {0x22B, 0x6F}, {0x22B, 0x308}, {0x22B, 0x304}, {0x22C, 0x4F}, {0x22C, 0x303}, {0x22C, 0x304}, {0x22D, 0x6F}, {0x22D, 0x303}, {0x22D, 0x304}, {0x22E, 0x4F}, +{0x22E, 0x307}, {0x22F, 0x6F}, {0x22F, 0x307}, {0x230, 0x4F}, {0x230, 0x307}, {0x230, 0x304}, {0x231, 0x6F}, {0x231, 0x307}, {0x231, 0x304}, {0x232, 0x59}, {0x232, 0x304}, {0x233, 0x79}, +{0x233, 0x304}, {0x340, 0x300}, {0x341, 0x301}, {0x343, 0x313}, {0x344, 0x308}, {0x344, 0x301}, {0x374, 0x2B9}, {0x37E, 0x3B}, {0x385, 0xA8}, {0x385, 0x301}, {0x386, 0x391}, {0x386, 0x301}, +{0x387, 0xB7}, {0x388, 0x395}, {0x388, 0x301}, {0x389, 0x397}, {0x389, 0x301}, {0x38A, 0x399}, {0x38A, 0x301}, {0x38C, 0x39F}, {0x38C, 0x301}, {0x38E, 0x3A5}, {0x38E, 0x301}, {0x38F, 0x3A9}, +{0x38F, 0x301}, {0x390, 0x3B9}, {0x390, 0x308}, {0x390, 0x301}, {0x3AA, 0x399}, {0x3AA, 0x308}, {0x3AB, 0x3A5}, {0x3AB, 0x308}, {0x3AC, 0x3B1}, {0x3AC, 0x301}, {0x3AD, 0x3B5}, {0x3AD, 0x301}, +{0x3AE, 0x3B7}, {0x3AE, 0x301}, {0x3AF, 0x3B9}, {0x3AF, 0x301}, {0x3B0, 0x3C5}, {0x3B0, 0x308}, {0x3B0, 0x301}, {0x3CA, 0x3B9}, {0x3CA, 0x308}, {0x3CB, 0x3C5}, {0x3CB, 0x308}, {0x3CC, 0x3BF}, +{0x3CC, 0x301}, {0x3CD, 0x3C5}, {0x3CD, 0x301}, {0x3CE, 0x3C9}, {0x3CE, 0x301}, {0x3D3, 0x3D2}, {0x3D3, 0x301}, {0x3D4, 0x3D2}, {0x3D4, 0x308}, {0x400, 0x415}, {0x400, 0x300}, {0x401, 0x415}, +{0x401, 0x308}, {0x403, 0x413}, {0x403, 0x301}, {0x407, 0x406}, {0x407, 0x308}, {0x40C, 0x41A}, {0x40C, 0x301}, {0x40D, 0x418}, {0x40D, 0x300}, {0x40E, 0x423}, {0x40E, 0x306}, {0x419, 0x418}, +{0x419, 0x306}, {0x439, 0x438}, {0x439, 0x306}, {0x450, 0x435}, {0x450, 0x300}, {0x451, 0x435}, {0x451, 0x308}, {0x453, 0x433}, {0x453, 0x301}, {0x457, 0x456}, {0x457, 0x308}, {0x45C, 0x43A}, +{0x45C, 0x301}, {0x45D, 0x438}, {0x45D, 0x300}, {0x45E, 0x443}, {0x45E, 0x306}, {0x476, 0x474}, {0x476, 0x30F}, {0x477, 0x475}, {0x477, 0x30F}, {0x4C1, 0x416}, {0x4C1, 0x306}, {0x4C2, 0x436}, +{0x4C2, 0x306}, {0x4D0, 0x410}, {0x4D0, 0x306}, {0x4D1, 0x430}, {0x4D1, 0x306}, {0x4D2, 0x410}, {0x4D2, 0x308}, {0x4D3, 0x430}, {0x4D3, 0x308}, {0x4D6, 0x415}, {0x4D6, 0x306}, {0x4D7, 0x435}, +{0x4D7, 0x306}, {0x4DA, 0x4D8}, {0x4DA, 0x308}, {0x4DB, 0x4D9}, {0x4DB, 0x308}, {0x4DC, 0x416}, {0x4DC, 0x308}, {0x4DD, 0x436}, {0x4DD, 0x308}, {0x4DE, 0x417}, {0x4DE, 0x308}, {0x4DF, 0x437}, +{0x4DF, 0x308}, {0x4E2, 0x418}, {0x4E2, 0x304}, {0x4E3, 0x438}, {0x4E3, 0x304}, {0x4E4, 0x418}, {0x4E4, 0x308}, {0x4E5, 0x438}, {0x4E5, 0x308}, {0x4E6, 0x41E}, {0x4E6, 0x308}, {0x4E7, 0x43E}, +{0x4E7, 
0x308}, {0x4EA, 0x4E8}, {0x4EA, 0x308}, {0x4EB, 0x4E9}, {0x4EB, 0x308}, {0x4EC, 0x42D}, {0x4EC, 0x308}, {0x4ED, 0x44D}, {0x4ED, 0x308}, {0x4EE, 0x423}, {0x4EE, 0x304}, {0x4EF, 0x443}, +{0x4EF, 0x304}, {0x4F0, 0x423}, {0x4F0, 0x308}, {0x4F1, 0x443}, {0x4F1, 0x308}, {0x4F2, 0x423}, {0x4F2, 0x30B}, {0x4F3, 0x443}, {0x4F3, 0x30B}, {0x4F4, 0x427}, {0x4F4, 0x308}, {0x4F5, 0x447}, +{0x4F5, 0x308}, {0x4F8, 0x42B}, {0x4F8, 0x308}, {0x4F9, 0x44B}, {0x4F9, 0x308}, {0x622, 0x627}, {0x622, 0x653}, {0x623, 0x627}, {0x623, 0x654}, {0x624, 0x648}, {0x624, 0x654}, {0x625, 0x627}, +{0x625, 0x655}, {0x626, 0x64A}, {0x626, 0x654}, {0x6C0, 0x6D5}, {0x6C0, 0x654}, {0x6C2, 0x6C1}, {0x6C2, 0x654}, {0x6D3, 0x6D2}, {0x6D3, 0x654}, {0x929, 0x928}, {0x929, 0x93C}, {0x931, 0x930}, +{0x931, 0x93C}, {0x934, 0x933}, {0x934, 0x93C}, {0x958, 0x915}, {0x958, 0x93C}, {0x959, 0x916}, {0x959, 0x93C}, {0x95A, 0x917}, {0x95A, 0x93C}, {0x95B, 0x91C}, {0x95B, 0x93C}, {0x95C, 0x921}, +{0x95C, 0x93C}, {0x95D, 0x922}, {0x95D, 0x93C}, {0x95E, 0x92B}, {0x95E, 0x93C}, {0x95F, 0x92F}, {0x95F, 0x93C}, {0x9CB, 0x9C7}, {0x9CB, 0x9BE}, {0x9CC, 0x9C7}, {0x9CC, 0x9D7}, {0x9DC, 0x9A1}, +{0x9DC, 0x9BC}, {0x9DD, 0x9A2}, {0x9DD, 0x9BC}, {0x9DF, 0x9AF}, {0x9DF, 0x9BC}, {0xA33, 0xA32}, {0xA33, 0xA3C}, {0xA36, 0xA38}, {0xA36, 0xA3C}, {0xA59, 0xA16}, {0xA59, 0xA3C}, {0xA5A, 0xA17}, +{0xA5A, 0xA3C}, {0xA5B, 0xA1C}, {0xA5B, 0xA3C}, {0xA5E, 0xA2B}, {0xA5E, 0xA3C}, {0xB48, 0xB47}, {0xB48, 0xB56}, {0xB4B, 0xB47}, {0xB4B, 0xB3E}, {0xB4C, 0xB47}, {0xB4C, 0xB57}, {0xB5C, 0xB21}, +{0xB5C, 0xB3C}, {0xB5D, 0xB22}, {0xB5D, 0xB3C}, {0xB94, 0xB92}, {0xB94, 0xBD7}, {0xBCA, 0xBC6}, {0xBCA, 0xBBE}, {0xBCB, 0xBC7}, {0xBCB, 0xBBE}, {0xBCC, 0xBC6}, {0xBCC, 0xBD7}, {0xC48, 0xC46}, +{0xC48, 0xC56}, {0xCC0, 0xCBF}, {0xCC0, 0xCD5}, {0xCC7, 0xCC6}, {0xCC7, 0xCD5}, {0xCC8, 0xCC6}, {0xCC8, 0xCD6}, {0xCCA, 0xCC6}, {0xCCA, 0xCC2}, {0xCCB, 0xCC6}, {0xCCB, 0xCC2}, {0xCCB, 0xCD5}, +{0xD4A, 0xD46}, {0xD4A, 0xD3E}, {0xD4B, 0xD47}, {0xD4B, 0xD3E}, {0xD4C, 0xD46}, {0xD4C, 0xD57}, {0xDDA, 0xDD9}, {0xDDA, 0xDCA}, {0xDDC, 0xDD9}, {0xDDC, 0xDCF}, {0xDDD, 0xDD9}, {0xDDD, 0xDCF}, +{0xDDD, 0xDCA}, {0xDDE, 0xDD9}, {0xDDE, 0xDDF}, {0xF43, 0xF42}, {0xF43, 0xFB7}, {0xF4D, 0xF4C}, {0xF4D, 0xFB7}, {0xF52, 0xF51}, {0xF52, 0xFB7}, {0xF57, 0xF56}, {0xF57, 0xFB7}, {0xF5C, 0xF5B}, +{0xF5C, 0xFB7}, {0xF69, 0xF40}, {0xF69, 0xFB5}, {0xF73, 0xF71}, {0xF73, 0xF72}, {0xF75, 0xF71}, {0xF75, 0xF74}, {0xF76, 0xFB2}, {0xF76, 0xF80}, {0xF78, 0xFB3}, {0xF78, 0xF80}, {0xF81, 0xF71}, +{0xF81, 0xF80}, {0xF93, 0xF92}, {0xF93, 0xFB7}, {0xF9D, 0xF9C}, {0xF9D, 0xFB7}, {0xFA2, 0xFA1}, {0xFA2, 0xFB7}, {0xFA7, 0xFA6}, {0xFA7, 0xFB7}, {0xFAC, 0xFAB}, {0xFAC, 0xFB7}, {0xFB9, 0xF90}, +{0xFB9, 0xFB5}, {0x1026, 0x1025}, {0x1026, 0x102E}, {0x1B06, 0x1B05}, {0x1B06, 0x1B35}, {0x1B08, 0x1B07}, {0x1B08, 0x1B35}, {0x1B0A, 0x1B09}, {0x1B0A, 0x1B35}, {0x1B0C, 0x1B0B}, {0x1B0C, 0x1B35}, +{0x1B0E, 0x1B0D}, {0x1B0E, 0x1B35}, {0x1B12, 0x1B11}, {0x1B12, 0x1B35}, {0x1B3B, 0x1B3A}, {0x1B3B, 0x1B35}, {0x1B3D, 0x1B3C}, {0x1B3D, 0x1B35}, {0x1B40, 0x1B3E}, {0x1B40, 0x1B35}, {0x1B41, 0x1B3F}, +{0x1B41, 0x1B35}, {0x1B43, 0x1B42}, {0x1B43, 0x1B35}, {0x1E00, 0x41}, {0x1E00, 0x325}, {0x1E01, 0x61}, {0x1E01, 0x325}, {0x1E02, 0x42}, {0x1E02, 0x307}, {0x1E03, 0x62}, {0x1E03, 0x307}, +{0x1E04, 0x42}, {0x1E04, 0x323}, {0x1E05, 0x62}, {0x1E05, 0x323}, {0x1E06, 0x42}, {0x1E06, 0x331}, {0x1E07, 0x62}, {0x1E07, 0x331}, {0x1E08, 0x43}, {0x1E08, 0x327}, {0x1E08, 0x301}, {0x1E09, 0x63}, +{0x1E09, 0x327}, {0x1E09, 0x301}, {0x1E0A, 0x44}, {0x1E0A, 0x307}, {0x1E0B, 
0x64}, {0x1E0B, 0x307}, {0x1E0C, 0x44}, {0x1E0C, 0x323}, {0x1E0D, 0x64}, {0x1E0D, 0x323}, {0x1E0E, 0x44}, {0x1E0E, 0x331}, +{0x1E0F, 0x64}, {0x1E0F, 0x331}, {0x1E10, 0x44}, {0x1E10, 0x327}, {0x1E11, 0x64}, {0x1E11, 0x327}, {0x1E12, 0x44}, {0x1E12, 0x32D}, {0x1E13, 0x64}, {0x1E13, 0x32D}, {0x1E14, 0x45}, {0x1E14, 0x304}, +{0x1E14, 0x300}, {0x1E15, 0x65}, {0x1E15, 0x304}, {0x1E15, 0x300}, {0x1E16, 0x45}, {0x1E16, 0x304}, {0x1E16, 0x301}, {0x1E17, 0x65}, {0x1E17, 0x304}, {0x1E17, 0x301}, {0x1E18, 0x45}, {0x1E18, 0x32D}, +{0x1E19, 0x65}, {0x1E19, 0x32D}, {0x1E1A, 0x45}, {0x1E1A, 0x330}, {0x1E1B, 0x65}, {0x1E1B, 0x330}, {0x1E1C, 0x45}, {0x1E1C, 0x327}, {0x1E1C, 0x306}, {0x1E1D, 0x65}, {0x1E1D, 0x327}, {0x1E1D, 0x306}, +{0x1E1E, 0x46}, {0x1E1E, 0x307}, {0x1E1F, 0x66}, {0x1E1F, 0x307}, {0x1E20, 0x47}, {0x1E20, 0x304}, {0x1E21, 0x67}, {0x1E21, 0x304}, {0x1E22, 0x48}, {0x1E22, 0x307}, {0x1E23, 0x68}, {0x1E23, 0x307}, +{0x1E24, 0x48}, {0x1E24, 0x323}, {0x1E25, 0x68}, {0x1E25, 0x323}, {0x1E26, 0x48}, {0x1E26, 0x308}, {0x1E27, 0x68}, {0x1E27, 0x308}, {0x1E28, 0x48}, {0x1E28, 0x327}, {0x1E29, 0x68}, {0x1E29, 0x327}, +{0x1E2A, 0x48}, {0x1E2A, 0x32E}, {0x1E2B, 0x68}, {0x1E2B, 0x32E}, {0x1E2C, 0x49}, {0x1E2C, 0x330}, {0x1E2D, 0x69}, {0x1E2D, 0x330}, {0x1E2E, 0x49}, {0x1E2E, 0x308}, {0x1E2E, 0x301}, {0x1E2F, 0x69}, +{0x1E2F, 0x308}, {0x1E2F, 0x301}, {0x1E30, 0x4B}, {0x1E30, 0x301}, {0x1E31, 0x6B}, {0x1E31, 0x301}, {0x1E32, 0x4B}, {0x1E32, 0x323}, {0x1E33, 0x6B}, {0x1E33, 0x323}, {0x1E34, 0x4B}, {0x1E34, 0x331}, +{0x1E35, 0x6B}, {0x1E35, 0x331}, {0x1E36, 0x4C}, {0x1E36, 0x323}, {0x1E37, 0x6C}, {0x1E37, 0x323}, {0x1E38, 0x4C}, {0x1E38, 0x323}, {0x1E38, 0x304}, {0x1E39, 0x6C}, {0x1E39, 0x323}, {0x1E39, 0x304}, +{0x1E3A, 0x4C}, {0x1E3A, 0x331}, {0x1E3B, 0x6C}, {0x1E3B, 0x331}, {0x1E3C, 0x4C}, {0x1E3C, 0x32D}, {0x1E3D, 0x6C}, {0x1E3D, 0x32D}, {0x1E3E, 0x4D}, {0x1E3E, 0x301}, {0x1E3F, 0x6D}, {0x1E3F, 0x301}, +{0x1E40, 0x4D}, {0x1E40, 0x307}, {0x1E41, 0x6D}, {0x1E41, 0x307}, {0x1E42, 0x4D}, {0x1E42, 0x323}, {0x1E43, 0x6D}, {0x1E43, 0x323}, {0x1E44, 0x4E}, {0x1E44, 0x307}, {0x1E45, 0x6E}, {0x1E45, 0x307}, +{0x1E46, 0x4E}, {0x1E46, 0x323}, {0x1E47, 0x6E}, {0x1E47, 0x323}, {0x1E48, 0x4E}, {0x1E48, 0x331}, {0x1E49, 0x6E}, {0x1E49, 0x331}, {0x1E4A, 0x4E}, {0x1E4A, 0x32D}, {0x1E4B, 0x6E}, {0x1E4B, 0x32D}, +{0x1E4C, 0x4F}, {0x1E4C, 0x303}, {0x1E4C, 0x301}, {0x1E4D, 0x6F}, {0x1E4D, 0x303}, {0x1E4D, 0x301}, {0x1E4E, 0x4F}, {0x1E4E, 0x303}, {0x1E4E, 0x308}, {0x1E4F, 0x6F}, {0x1E4F, 0x303}, {0x1E4F, 0x308}, +{0x1E50, 0x4F}, {0x1E50, 0x304}, {0x1E50, 0x300}, {0x1E51, 0x6F}, {0x1E51, 0x304}, {0x1E51, 0x300}, {0x1E52, 0x4F}, {0x1E52, 0x304}, {0x1E52, 0x301}, {0x1E53, 0x6F}, {0x1E53, 0x304}, {0x1E53, 0x301}, +{0x1E54, 0x50}, {0x1E54, 0x301}, {0x1E55, 0x70}, {0x1E55, 0x301}, {0x1E56, 0x50}, {0x1E56, 0x307}, {0x1E57, 0x70}, {0x1E57, 0x307}, {0x1E58, 0x52}, {0x1E58, 0x307}, {0x1E59, 0x72}, {0x1E59, 0x307}, +{0x1E5A, 0x52}, {0x1E5A, 0x323}, {0x1E5B, 0x72}, {0x1E5B, 0x323}, {0x1E5C, 0x52}, {0x1E5C, 0x323}, {0x1E5C, 0x304}, {0x1E5D, 0x72}, {0x1E5D, 0x323}, {0x1E5D, 0x304}, {0x1E5E, 0x52}, {0x1E5E, 0x331}, +{0x1E5F, 0x72}, {0x1E5F, 0x331}, {0x1E60, 0x53}, {0x1E60, 0x307}, {0x1E61, 0x73}, {0x1E61, 0x307}, {0x1E62, 0x53}, {0x1E62, 0x323}, {0x1E63, 0x73}, {0x1E63, 0x323}, {0x1E64, 0x53}, {0x1E64, 0x301}, +{0x1E64, 0x307}, {0x1E65, 0x73}, {0x1E65, 0x301}, {0x1E65, 0x307}, {0x1E66, 0x53}, {0x1E66, 0x30C}, {0x1E66, 0x307}, {0x1E67, 0x73}, {0x1E67, 0x30C}, {0x1E67, 0x307}, {0x1E68, 0x53}, {0x1E68, 0x323}, +{0x1E68, 0x307}, {0x1E69, 0x73}, 
{0x1E69, 0x323}, {0x1E69, 0x307}, {0x1E6A, 0x54}, {0x1E6A, 0x307}, {0x1E6B, 0x74}, {0x1E6B, 0x307}, {0x1E6C, 0x54}, {0x1E6C, 0x323}, {0x1E6D, 0x74}, {0x1E6D, 0x323}, +{0x1E6E, 0x54}, {0x1E6E, 0x331}, {0x1E6F, 0x74}, {0x1E6F, 0x331}, {0x1E70, 0x54}, {0x1E70, 0x32D}, {0x1E71, 0x74}, {0x1E71, 0x32D}, {0x1E72, 0x55}, {0x1E72, 0x324}, {0x1E73, 0x75}, {0x1E73, 0x324}, +{0x1E74, 0x55}, {0x1E74, 0x330}, {0x1E75, 0x75}, {0x1E75, 0x330}, {0x1E76, 0x55}, {0x1E76, 0x32D}, {0x1E77, 0x75}, {0x1E77, 0x32D}, {0x1E78, 0x55}, {0x1E78, 0x303}, {0x1E78, 0x301}, {0x1E79, 0x75}, +{0x1E79, 0x303}, {0x1E79, 0x301}, {0x1E7A, 0x55}, {0x1E7A, 0x304}, {0x1E7A, 0x308}, {0x1E7B, 0x75}, {0x1E7B, 0x304}, {0x1E7B, 0x308}, {0x1E7C, 0x56}, {0x1E7C, 0x303}, {0x1E7D, 0x76}, {0x1E7D, 0x303}, +{0x1E7E, 0x56}, {0x1E7E, 0x323}, {0x1E7F, 0x76}, {0x1E7F, 0x323}, {0x1E80, 0x57}, {0x1E80, 0x300}, {0x1E81, 0x77}, {0x1E81, 0x300}, {0x1E82, 0x57}, {0x1E82, 0x301}, {0x1E83, 0x77}, {0x1E83, 0x301}, +{0x1E84, 0x57}, {0x1E84, 0x308}, {0x1E85, 0x77}, {0x1E85, 0x308}, {0x1E86, 0x57}, {0x1E86, 0x307}, {0x1E87, 0x77}, {0x1E87, 0x307}, {0x1E88, 0x57}, {0x1E88, 0x323}, {0x1E89, 0x77}, {0x1E89, 0x323}, +{0x1E8A, 0x58}, {0x1E8A, 0x307}, {0x1E8B, 0x78}, {0x1E8B, 0x307}, {0x1E8C, 0x58}, {0x1E8C, 0x308}, {0x1E8D, 0x78}, {0x1E8D, 0x308}, {0x1E8E, 0x59}, {0x1E8E, 0x307}, {0x1E8F, 0x79}, {0x1E8F, 0x307}, +{0x1E90, 0x5A}, {0x1E90, 0x302}, {0x1E91, 0x7A}, {0x1E91, 0x302}, {0x1E92, 0x5A}, {0x1E92, 0x323}, {0x1E93, 0x7A}, {0x1E93, 0x323}, {0x1E94, 0x5A}, {0x1E94, 0x331}, {0x1E95, 0x7A}, {0x1E95, 0x331}, +{0x1E96, 0x68}, {0x1E96, 0x331}, {0x1E97, 0x74}, {0x1E97, 0x308}, {0x1E98, 0x77}, {0x1E98, 0x30A}, {0x1E99, 0x79}, {0x1E99, 0x30A}, {0x1E9B, 0x17F}, {0x1E9B, 0x307}, {0x1EA0, 0x41}, {0x1EA0, 0x323}, +{0x1EA1, 0x61}, {0x1EA1, 0x323}, {0x1EA2, 0x41}, {0x1EA2, 0x309}, {0x1EA3, 0x61}, {0x1EA3, 0x309}, {0x1EA4, 0x41}, {0x1EA4, 0x302}, {0x1EA4, 0x301}, {0x1EA5, 0x61}, {0x1EA5, 0x302}, {0x1EA5, 0x301}, +{0x1EA6, 0x41}, {0x1EA6, 0x302}, {0x1EA6, 0x300}, {0x1EA7, 0x61}, {0x1EA7, 0x302}, {0x1EA7, 0x300}, {0x1EA8, 0x41}, {0x1EA8, 0x302}, {0x1EA8, 0x309}, {0x1EA9, 0x61}, {0x1EA9, 0x302}, {0x1EA9, 0x309}, +{0x1EAA, 0x41}, {0x1EAA, 0x302}, {0x1EAA, 0x303}, {0x1EAB, 0x61}, {0x1EAB, 0x302}, {0x1EAB, 0x303}, {0x1EAC, 0x41}, {0x1EAC, 0x323}, {0x1EAC, 0x302}, {0x1EAD, 0x61}, {0x1EAD, 0x323}, {0x1EAD, 0x302}, +{0x1EAE, 0x41}, {0x1EAE, 0x306}, {0x1EAE, 0x301}, {0x1EAF, 0x61}, {0x1EAF, 0x306}, {0x1EAF, 0x301}, {0x1EB0, 0x41}, {0x1EB0, 0x306}, {0x1EB0, 0x300}, {0x1EB1, 0x61}, {0x1EB1, 0x306}, {0x1EB1, 0x300}, +{0x1EB2, 0x41}, {0x1EB2, 0x306}, {0x1EB2, 0x309}, {0x1EB3, 0x61}, {0x1EB3, 0x306}, {0x1EB3, 0x309}, {0x1EB4, 0x41}, {0x1EB4, 0x306}, {0x1EB4, 0x303}, {0x1EB5, 0x61}, {0x1EB5, 0x306}, {0x1EB5, 0x303}, +{0x1EB6, 0x41}, {0x1EB6, 0x323}, {0x1EB6, 0x306}, {0x1EB7, 0x61}, {0x1EB7, 0x323}, {0x1EB7, 0x306}, {0x1EB8, 0x45}, {0x1EB8, 0x323}, {0x1EB9, 0x65}, {0x1EB9, 0x323}, {0x1EBA, 0x45}, {0x1EBA, 0x309}, +{0x1EBB, 0x65}, {0x1EBB, 0x309}, {0x1EBC, 0x45}, {0x1EBC, 0x303}, {0x1EBD, 0x65}, {0x1EBD, 0x303}, {0x1EBE, 0x45}, {0x1EBE, 0x302}, {0x1EBE, 0x301}, {0x1EBF, 0x65}, {0x1EBF, 0x302}, {0x1EBF, 0x301}, +{0x1EC0, 0x45}, {0x1EC0, 0x302}, {0x1EC0, 0x300}, {0x1EC1, 0x65}, {0x1EC1, 0x302}, {0x1EC1, 0x300}, {0x1EC2, 0x45}, {0x1EC2, 0x302}, {0x1EC2, 0x309}, {0x1EC3, 0x65}, {0x1EC3, 0x302}, {0x1EC3, 0x309}, +{0x1EC4, 0x45}, {0x1EC4, 0x302}, {0x1EC4, 0x303}, {0x1EC5, 0x65}, {0x1EC5, 0x302}, {0x1EC5, 0x303}, {0x1EC6, 0x45}, {0x1EC6, 0x323}, {0x1EC6, 0x302}, {0x1EC7, 0x65}, {0x1EC7, 0x323}, 
{0x1EC7, 0x302}, +{0x1EC8, 0x49}, {0x1EC8, 0x309}, {0x1EC9, 0x69}, {0x1EC9, 0x309}, {0x1ECA, 0x49}, {0x1ECA, 0x323}, {0x1ECB, 0x69}, {0x1ECB, 0x323}, {0x1ECC, 0x4F}, {0x1ECC, 0x323}, {0x1ECD, 0x6F}, {0x1ECD, 0x323}, +{0x1ECE, 0x4F}, {0x1ECE, 0x309}, {0x1ECF, 0x6F}, {0x1ECF, 0x309}, {0x1ED0, 0x4F}, {0x1ED0, 0x302}, {0x1ED0, 0x301}, {0x1ED1, 0x6F}, {0x1ED1, 0x302}, {0x1ED1, 0x301}, {0x1ED2, 0x4F}, {0x1ED2, 0x302}, +{0x1ED2, 0x300}, {0x1ED3, 0x6F}, {0x1ED3, 0x302}, {0x1ED3, 0x300}, {0x1ED4, 0x4F}, {0x1ED4, 0x302}, {0x1ED4, 0x309}, {0x1ED5, 0x6F}, {0x1ED5, 0x302}, {0x1ED5, 0x309}, {0x1ED6, 0x4F}, {0x1ED6, 0x302}, +{0x1ED6, 0x303}, {0x1ED7, 0x6F}, {0x1ED7, 0x302}, {0x1ED7, 0x303}, {0x1ED8, 0x4F}, {0x1ED8, 0x323}, {0x1ED8, 0x302}, {0x1ED9, 0x6F}, {0x1ED9, 0x323}, {0x1ED9, 0x302}, {0x1EDA, 0x4F}, {0x1EDA, 0x31B}, +{0x1EDA, 0x301}, {0x1EDB, 0x6F}, {0x1EDB, 0x31B}, {0x1EDB, 0x301}, {0x1EDC, 0x4F}, {0x1EDC, 0x31B}, {0x1EDC, 0x300}, {0x1EDD, 0x6F}, {0x1EDD, 0x31B}, {0x1EDD, 0x300}, {0x1EDE, 0x4F}, {0x1EDE, 0x31B}, +{0x1EDE, 0x309}, {0x1EDF, 0x6F}, {0x1EDF, 0x31B}, {0x1EDF, 0x309}, {0x1EE0, 0x4F}, {0x1EE0, 0x31B}, {0x1EE0, 0x303}, {0x1EE1, 0x6F}, {0x1EE1, 0x31B}, {0x1EE1, 0x303}, {0x1EE2, 0x4F}, {0x1EE2, 0x31B}, +{0x1EE2, 0x323}, {0x1EE3, 0x6F}, {0x1EE3, 0x31B}, {0x1EE3, 0x323}, {0x1EE4, 0x55}, {0x1EE4, 0x323}, {0x1EE5, 0x75}, {0x1EE5, 0x323}, {0x1EE6, 0x55}, {0x1EE6, 0x309}, {0x1EE7, 0x75}, {0x1EE7, 0x309}, +{0x1EE8, 0x55}, {0x1EE8, 0x31B}, {0x1EE8, 0x301}, {0x1EE9, 0x75}, {0x1EE9, 0x31B}, {0x1EE9, 0x301}, {0x1EEA, 0x55}, {0x1EEA, 0x31B}, {0x1EEA, 0x300}, {0x1EEB, 0x75}, {0x1EEB, 0x31B}, {0x1EEB, 0x300}, +{0x1EEC, 0x55}, {0x1EEC, 0x31B}, {0x1EEC, 0x309}, {0x1EED, 0x75}, {0x1EED, 0x31B}, {0x1EED, 0x309}, {0x1EEE, 0x55}, {0x1EEE, 0x31B}, {0x1EEE, 0x303}, {0x1EEF, 0x75}, {0x1EEF, 0x31B}, {0x1EEF, 0x303}, +{0x1EF0, 0x55}, {0x1EF0, 0x31B}, {0x1EF0, 0x323}, {0x1EF1, 0x75}, {0x1EF1, 0x31B}, {0x1EF1, 0x323}, {0x1EF2, 0x59}, {0x1EF2, 0x300}, {0x1EF3, 0x79}, {0x1EF3, 0x300}, {0x1EF4, 0x59}, {0x1EF4, 0x323}, +{0x1EF5, 0x79}, {0x1EF5, 0x323}, {0x1EF6, 0x59}, {0x1EF6, 0x309}, {0x1EF7, 0x79}, {0x1EF7, 0x309}, {0x1EF8, 0x59}, {0x1EF8, 0x303}, {0x1EF9, 0x79}, {0x1EF9, 0x303}, {0x1F00, 0x3B1}, {0x1F00, 0x313}, +{0x1F01, 0x3B1}, {0x1F01, 0x314}, {0x1F02, 0x3B1}, {0x1F02, 0x313}, {0x1F02, 0x300}, {0x1F03, 0x3B1}, {0x1F03, 0x314}, {0x1F03, 0x300}, {0x1F04, 0x3B1}, {0x1F04, 0x313}, {0x1F04, 0x301}, +{0x1F05, 0x3B1}, {0x1F05, 0x314}, {0x1F05, 0x301}, {0x1F06, 0x3B1}, {0x1F06, 0x313}, {0x1F06, 0x342}, {0x1F07, 0x3B1}, {0x1F07, 0x314}, {0x1F07, 0x342}, {0x1F08, 0x391}, {0x1F08, 0x313}, +{0x1F09, 0x391}, {0x1F09, 0x314}, {0x1F0A, 0x391}, {0x1F0A, 0x313}, {0x1F0A, 0x300}, {0x1F0B, 0x391}, {0x1F0B, 0x314}, {0x1F0B, 0x300}, {0x1F0C, 0x391}, {0x1F0C, 0x313}, {0x1F0C, 0x301}, +{0x1F0D, 0x391}, {0x1F0D, 0x314}, {0x1F0D, 0x301}, {0x1F0E, 0x391}, {0x1F0E, 0x313}, {0x1F0E, 0x342}, {0x1F0F, 0x391}, {0x1F0F, 0x314}, {0x1F0F, 0x342}, {0x1F10, 0x3B5}, {0x1F10, 0x313}, +{0x1F11, 0x3B5}, {0x1F11, 0x314}, {0x1F12, 0x3B5}, {0x1F12, 0x313}, {0x1F12, 0x300}, {0x1F13, 0x3B5}, {0x1F13, 0x314}, {0x1F13, 0x300}, {0x1F14, 0x3B5}, {0x1F14, 0x313}, {0x1F14, 0x301}, +{0x1F15, 0x3B5}, {0x1F15, 0x314}, {0x1F15, 0x301}, {0x1F18, 0x395}, {0x1F18, 0x313}, {0x1F19, 0x395}, {0x1F19, 0x314}, {0x1F1A, 0x395}, {0x1F1A, 0x313}, {0x1F1A, 0x300}, {0x1F1B, 0x395}, +{0x1F1B, 0x314}, {0x1F1B, 0x300}, {0x1F1C, 0x395}, {0x1F1C, 0x313}, {0x1F1C, 0x301}, {0x1F1D, 0x395}, {0x1F1D, 0x314}, {0x1F1D, 0x301}, {0x1F20, 0x3B7}, {0x1F20, 0x313}, {0x1F21, 0x3B7}, +{0x1F21, 
0x314}, {0x1F22, 0x3B7}, {0x1F22, 0x313}, {0x1F22, 0x300}, {0x1F23, 0x3B7}, {0x1F23, 0x314}, {0x1F23, 0x300}, {0x1F24, 0x3B7}, {0x1F24, 0x313}, {0x1F24, 0x301}, {0x1F25, 0x3B7}, +{0x1F25, 0x314}, {0x1F25, 0x301}, {0x1F26, 0x3B7}, {0x1F26, 0x313}, {0x1F26, 0x342}, {0x1F27, 0x3B7}, {0x1F27, 0x314}, {0x1F27, 0x342}, {0x1F28, 0x397}, {0x1F28, 0x313}, {0x1F29, 0x397}, +{0x1F29, 0x314}, {0x1F2A, 0x397}, {0x1F2A, 0x313}, {0x1F2A, 0x300}, {0x1F2B, 0x397}, {0x1F2B, 0x314}, {0x1F2B, 0x300}, {0x1F2C, 0x397}, {0x1F2C, 0x313}, {0x1F2C, 0x301}, {0x1F2D, 0x397}, +{0x1F2D, 0x314}, {0x1F2D, 0x301}, {0x1F2E, 0x397}, {0x1F2E, 0x313}, {0x1F2E, 0x342}, {0x1F2F, 0x397}, {0x1F2F, 0x314}, {0x1F2F, 0x342}, {0x1F30, 0x3B9}, {0x1F30, 0x313}, {0x1F31, 0x3B9}, +{0x1F31, 0x314}, {0x1F32, 0x3B9}, {0x1F32, 0x313}, {0x1F32, 0x300}, {0x1F33, 0x3B9}, {0x1F33, 0x314}, {0x1F33, 0x300}, {0x1F34, 0x3B9}, {0x1F34, 0x313}, {0x1F34, 0x301}, {0x1F35, 0x3B9}, +{0x1F35, 0x314}, {0x1F35, 0x301}, {0x1F36, 0x3B9}, {0x1F36, 0x313}, {0x1F36, 0x342}, {0x1F37, 0x3B9}, {0x1F37, 0x314}, {0x1F37, 0x342}, {0x1F38, 0x399}, {0x1F38, 0x313}, {0x1F39, 0x399}, +{0x1F39, 0x314}, {0x1F3A, 0x399}, {0x1F3A, 0x313}, {0x1F3A, 0x300}, {0x1F3B, 0x399}, {0x1F3B, 0x314}, {0x1F3B, 0x300}, {0x1F3C, 0x399}, {0x1F3C, 0x313}, {0x1F3C, 0x301}, {0x1F3D, 0x399}, +{0x1F3D, 0x314}, {0x1F3D, 0x301}, {0x1F3E, 0x399}, {0x1F3E, 0x313}, {0x1F3E, 0x342}, {0x1F3F, 0x399}, {0x1F3F, 0x314}, {0x1F3F, 0x342}, {0x1F40, 0x3BF}, {0x1F40, 0x313}, {0x1F41, 0x3BF}, +{0x1F41, 0x314}, {0x1F42, 0x3BF}, {0x1F42, 0x313}, {0x1F42, 0x300}, {0x1F43, 0x3BF}, {0x1F43, 0x314}, {0x1F43, 0x300}, {0x1F44, 0x3BF}, {0x1F44, 0x313}, {0x1F44, 0x301}, {0x1F45, 0x3BF}, +{0x1F45, 0x314}, {0x1F45, 0x301}, {0x1F48, 0x39F}, {0x1F48, 0x313}, {0x1F49, 0x39F}, {0x1F49, 0x314}, {0x1F4A, 0x39F}, {0x1F4A, 0x313}, {0x1F4A, 0x300}, {0x1F4B, 0x39F}, {0x1F4B, 0x314}, +{0x1F4B, 0x300}, {0x1F4C, 0x39F}, {0x1F4C, 0x313}, {0x1F4C, 0x301}, {0x1F4D, 0x39F}, {0x1F4D, 0x314}, {0x1F4D, 0x301}, {0x1F50, 0x3C5}, {0x1F50, 0x313}, {0x1F51, 0x3C5}, {0x1F51, 0x314}, +{0x1F52, 0x3C5}, {0x1F52, 0x313}, {0x1F52, 0x300}, {0x1F53, 0x3C5}, {0x1F53, 0x314}, {0x1F53, 0x300}, {0x1F54, 0x3C5}, {0x1F54, 0x313}, {0x1F54, 0x301}, {0x1F55, 0x3C5}, {0x1F55, 0x314}, +{0x1F55, 0x301}, {0x1F56, 0x3C5}, {0x1F56, 0x313}, {0x1F56, 0x342}, {0x1F57, 0x3C5}, {0x1F57, 0x314}, {0x1F57, 0x342}, {0x1F59, 0x3A5}, {0x1F59, 0x314}, {0x1F5B, 0x3A5}, {0x1F5B, 0x314}, +{0x1F5B, 0x300}, {0x1F5D, 0x3A5}, {0x1F5D, 0x314}, {0x1F5D, 0x301}, {0x1F5F, 0x3A5}, {0x1F5F, 0x314}, {0x1F5F, 0x342}, {0x1F60, 0x3C9}, {0x1F60, 0x313}, {0x1F61, 0x3C9}, {0x1F61, 0x314}, +{0x1F62, 0x3C9}, {0x1F62, 0x313}, {0x1F62, 0x300}, {0x1F63, 0x3C9}, {0x1F63, 0x314}, {0x1F63, 0x300}, {0x1F64, 0x3C9}, {0x1F64, 0x313}, {0x1F64, 0x301}, {0x1F65, 0x3C9}, {0x1F65, 0x314}, +{0x1F65, 0x301}, {0x1F66, 0x3C9}, {0x1F66, 0x313}, {0x1F66, 0x342}, {0x1F67, 0x3C9}, {0x1F67, 0x314}, {0x1F67, 0x342}, {0x1F68, 0x3A9}, {0x1F68, 0x313}, {0x1F69, 0x3A9}, {0x1F69, 0x314}, +{0x1F6A, 0x3A9}, {0x1F6A, 0x313}, {0x1F6A, 0x300}, {0x1F6B, 0x3A9}, {0x1F6B, 0x314}, {0x1F6B, 0x300}, {0x1F6C, 0x3A9}, {0x1F6C, 0x313}, {0x1F6C, 0x301}, {0x1F6D, 0x3A9}, {0x1F6D, 0x314}, +{0x1F6D, 0x301}, {0x1F6E, 0x3A9}, {0x1F6E, 0x313}, {0x1F6E, 0x342}, {0x1F6F, 0x3A9}, {0x1F6F, 0x314}, {0x1F6F, 0x342}, {0x1F70, 0x3B1}, {0x1F70, 0x300}, {0x1F71, 0x3B1}, {0x1F71, 0x301}, +{0x1F72, 0x3B5}, {0x1F72, 0x300}, {0x1F73, 0x3B5}, {0x1F73, 0x301}, {0x1F74, 0x3B7}, {0x1F74, 0x300}, {0x1F75, 0x3B7}, {0x1F75, 0x301}, {0x1F76, 0x3B9}, {0x1F76, 0x300}, {0x1F77, 
0x3B9}, +{0x1F77, 0x301}, {0x1F78, 0x3BF}, {0x1F78, 0x300}, {0x1F79, 0x3BF}, {0x1F79, 0x301}, {0x1F7A, 0x3C5}, {0x1F7A, 0x300}, {0x1F7B, 0x3C5}, {0x1F7B, 0x301}, {0x1F7C, 0x3C9}, {0x1F7C, 0x300}, +{0x1F7D, 0x3C9}, {0x1F7D, 0x301}, {0x1F80, 0x3B1}, {0x1F80, 0x313}, {0x1F80, 0x345}, {0x1F81, 0x3B1}, {0x1F81, 0x314}, {0x1F81, 0x345}, {0x1F82, 0x3B1}, {0x1F82, 0x313}, {0x1F82, 0x300}, +{0x1F82, 0x345}, {0x1F83, 0x3B1}, {0x1F83, 0x314}, {0x1F83, 0x300}, {0x1F83, 0x345}, {0x1F84, 0x3B1}, {0x1F84, 0x313}, {0x1F84, 0x301}, {0x1F84, 0x345}, {0x1F85, 0x3B1}, {0x1F85, 0x314}, +{0x1F85, 0x301}, {0x1F85, 0x345}, {0x1F86, 0x3B1}, {0x1F86, 0x313}, {0x1F86, 0x342}, {0x1F86, 0x345}, {0x1F87, 0x3B1}, {0x1F87, 0x314}, {0x1F87, 0x342}, {0x1F87, 0x345}, {0x1F88, 0x391}, +{0x1F88, 0x313}, {0x1F88, 0x345}, {0x1F89, 0x391}, {0x1F89, 0x314}, {0x1F89, 0x345}, {0x1F8A, 0x391}, {0x1F8A, 0x313}, {0x1F8A, 0x300}, {0x1F8A, 0x345}, {0x1F8B, 0x391}, {0x1F8B, 0x314}, +{0x1F8B, 0x300}, {0x1F8B, 0x345}, {0x1F8C, 0x391}, {0x1F8C, 0x313}, {0x1F8C, 0x301}, {0x1F8C, 0x345}, {0x1F8D, 0x391}, {0x1F8D, 0x314}, {0x1F8D, 0x301}, {0x1F8D, 0x345}, {0x1F8E, 0x391}, +{0x1F8E, 0x313}, {0x1F8E, 0x342}, {0x1F8E, 0x345}, {0x1F8F, 0x391}, {0x1F8F, 0x314}, {0x1F8F, 0x342}, {0x1F8F, 0x345}, {0x1F90, 0x3B7}, {0x1F90, 0x313}, {0x1F90, 0x345}, {0x1F91, 0x3B7}, +{0x1F91, 0x314}, {0x1F91, 0x345}, {0x1F92, 0x3B7}, {0x1F92, 0x313}, {0x1F92, 0x300}, {0x1F92, 0x345}, {0x1F93, 0x3B7}, {0x1F93, 0x314}, {0x1F93, 0x300}, {0x1F93, 0x345}, {0x1F94, 0x3B7}, +{0x1F94, 0x313}, {0x1F94, 0x301}, {0x1F94, 0x345}, {0x1F95, 0x3B7}, {0x1F95, 0x314}, {0x1F95, 0x301}, {0x1F95, 0x345}, {0x1F96, 0x3B7}, {0x1F96, 0x313}, {0x1F96, 0x342}, {0x1F96, 0x345}, +{0x1F97, 0x3B7}, {0x1F97, 0x314}, {0x1F97, 0x342}, {0x1F97, 0x345}, {0x1F98, 0x397}, {0x1F98, 0x313}, {0x1F98, 0x345}, {0x1F99, 0x397}, {0x1F99, 0x314}, {0x1F99, 0x345}, {0x1F9A, 0x397}, +{0x1F9A, 0x313}, {0x1F9A, 0x300}, {0x1F9A, 0x345}, {0x1F9B, 0x397}, {0x1F9B, 0x314}, {0x1F9B, 0x300}, {0x1F9B, 0x345}, {0x1F9C, 0x397}, {0x1F9C, 0x313}, {0x1F9C, 0x301}, {0x1F9C, 0x345}, +{0x1F9D, 0x397}, {0x1F9D, 0x314}, {0x1F9D, 0x301}, {0x1F9D, 0x345}, {0x1F9E, 0x397}, {0x1F9E, 0x313}, {0x1F9E, 0x342}, {0x1F9E, 0x345}, {0x1F9F, 0x397}, {0x1F9F, 0x314}, {0x1F9F, 0x342}, +{0x1F9F, 0x345}, {0x1FA0, 0x3C9}, {0x1FA0, 0x313}, {0x1FA0, 0x345}, {0x1FA1, 0x3C9}, {0x1FA1, 0x314}, {0x1FA1, 0x345}, {0x1FA2, 0x3C9}, {0x1FA2, 0x313}, {0x1FA2, 0x300}, {0x1FA2, 0x345}, +{0x1FA3, 0x3C9}, {0x1FA3, 0x314}, {0x1FA3, 0x300}, {0x1FA3, 0x345}, {0x1FA4, 0x3C9}, {0x1FA4, 0x313}, {0x1FA4, 0x301}, {0x1FA4, 0x345}, {0x1FA5, 0x3C9}, {0x1FA5, 0x314}, {0x1FA5, 0x301}, +{0x1FA5, 0x345}, {0x1FA6, 0x3C9}, {0x1FA6, 0x313}, {0x1FA6, 0x342}, {0x1FA6, 0x345}, {0x1FA7, 0x3C9}, {0x1FA7, 0x314}, {0x1FA7, 0x342}, {0x1FA7, 0x345}, {0x1FA8, 0x3A9}, {0x1FA8, 0x313}, +{0x1FA8, 0x345}, {0x1FA9, 0x3A9}, {0x1FA9, 0x314}, {0x1FA9, 0x345}, {0x1FAA, 0x3A9}, {0x1FAA, 0x313}, {0x1FAA, 0x300}, {0x1FAA, 0x345}, {0x1FAB, 0x3A9}, {0x1FAB, 0x314}, {0x1FAB, 0x300}, +{0x1FAB, 0x345}, {0x1FAC, 0x3A9}, {0x1FAC, 0x313}, {0x1FAC, 0x301}, {0x1FAC, 0x345}, {0x1FAD, 0x3A9}, {0x1FAD, 0x314}, {0x1FAD, 0x301}, {0x1FAD, 0x345}, {0x1FAE, 0x3A9}, {0x1FAE, 0x313}, +{0x1FAE, 0x342}, {0x1FAE, 0x345}, {0x1FAF, 0x3A9}, {0x1FAF, 0x314}, {0x1FAF, 0x342}, {0x1FAF, 0x345}, {0x1FB0, 0x3B1}, {0x1FB0, 0x306}, {0x1FB1, 0x3B1}, {0x1FB1, 0x304}, {0x1FB2, 0x3B1}, +{0x1FB2, 0x300}, {0x1FB2, 0x345}, {0x1FB3, 0x3B1}, {0x1FB3, 0x345}, {0x1FB4, 0x3B1}, {0x1FB4, 0x301}, {0x1FB4, 0x345}, {0x1FB6, 0x3B1}, {0x1FB6, 0x342}, {0x1FB7, 
0x3B1}, {0x1FB7, 0x342}, +{0x1FB7, 0x345}, {0x1FB8, 0x391}, {0x1FB8, 0x306}, {0x1FB9, 0x391}, {0x1FB9, 0x304}, {0x1FBA, 0x391}, {0x1FBA, 0x300}, {0x1FBB, 0x391}, {0x1FBB, 0x301}, {0x1FBC, 0x391}, {0x1FBC, 0x345}, +{0x1FBE, 0x3B9}, {0x1FC1, 0xA8}, {0x1FC1, 0x342}, {0x1FC2, 0x3B7}, {0x1FC2, 0x300}, {0x1FC2, 0x345}, {0x1FC3, 0x3B7}, {0x1FC3, 0x345}, {0x1FC4, 0x3B7}, {0x1FC4, 0x301}, {0x1FC4, 0x345}, +{0x1FC6, 0x3B7}, {0x1FC6, 0x342}, {0x1FC7, 0x3B7}, {0x1FC7, 0x342}, {0x1FC7, 0x345}, {0x1FC8, 0x395}, {0x1FC8, 0x300}, {0x1FC9, 0x395}, {0x1FC9, 0x301}, {0x1FCA, 0x397}, {0x1FCA, 0x300}, +{0x1FCB, 0x397}, {0x1FCB, 0x301}, {0x1FCC, 0x397}, {0x1FCC, 0x345}, {0x1FCD, 0x1FBF}, {0x1FCD, 0x300}, {0x1FCE, 0x1FBF}, {0x1FCE, 0x301}, {0x1FCF, 0x1FBF}, {0x1FCF, 0x342}, {0x1FD0, 0x3B9}, +{0x1FD0, 0x306}, {0x1FD1, 0x3B9}, {0x1FD1, 0x304}, {0x1FD2, 0x3B9}, {0x1FD2, 0x308}, {0x1FD2, 0x300}, {0x1FD3, 0x3B9}, {0x1FD3, 0x308}, {0x1FD3, 0x301}, {0x1FD6, 0x3B9}, {0x1FD6, 0x342}, +{0x1FD7, 0x3B9}, {0x1FD7, 0x308}, {0x1FD7, 0x342}, {0x1FD8, 0x399}, {0x1FD8, 0x306}, {0x1FD9, 0x399}, {0x1FD9, 0x304}, {0x1FDA, 0x399}, {0x1FDA, 0x300}, {0x1FDB, 0x399}, {0x1FDB, 0x301}, +{0x1FDD, 0x1FFE}, {0x1FDD, 0x300}, {0x1FDE, 0x1FFE}, {0x1FDE, 0x301}, {0x1FDF, 0x1FFE}, {0x1FDF, 0x342}, {0x1FE0, 0x3C5}, {0x1FE0, 0x306}, {0x1FE1, 0x3C5}, {0x1FE1, 0x304}, {0x1FE2, 0x3C5}, +{0x1FE2, 0x308}, {0x1FE2, 0x300}, {0x1FE3, 0x3C5}, {0x1FE3, 0x308}, {0x1FE3, 0x301}, {0x1FE4, 0x3C1}, {0x1FE4, 0x313}, {0x1FE5, 0x3C1}, {0x1FE5, 0x314}, {0x1FE6, 0x3C5}, {0x1FE6, 0x342}, +{0x1FE7, 0x3C5}, {0x1FE7, 0x308}, {0x1FE7, 0x342}, {0x1FE8, 0x3A5}, {0x1FE8, 0x306}, {0x1FE9, 0x3A5}, {0x1FE9, 0x304}, {0x1FEA, 0x3A5}, {0x1FEA, 0x300}, {0x1FEB, 0x3A5}, {0x1FEB, 0x301}, +{0x1FEC, 0x3A1}, {0x1FEC, 0x314}, {0x1FED, 0xA8}, {0x1FED, 0x300}, {0x1FEE, 0xA8}, {0x1FEE, 0x301}, {0x1FEF, 0x60}, {0x1FF2, 0x3C9}, {0x1FF2, 0x300}, {0x1FF2, 0x345}, {0x1FF3, 0x3C9}, {0x1FF3, 0x345}, +{0x1FF4, 0x3C9}, {0x1FF4, 0x301}, {0x1FF4, 0x345}, {0x1FF6, 0x3C9}, {0x1FF6, 0x342}, {0x1FF7, 0x3C9}, {0x1FF7, 0x342}, {0x1FF7, 0x345}, {0x1FF8, 0x39F}, {0x1FF8, 0x300}, {0x1FF9, 0x39F}, +{0x1FF9, 0x301}, {0x1FFA, 0x3A9}, {0x1FFA, 0x300}, {0x1FFB, 0x3A9}, {0x1FFB, 0x301}, {0x1FFC, 0x3A9}, {0x1FFC, 0x345}, {0x1FFD, 0xB4}, {0x2000, 0x2002}, {0x2001, 0x2003}, {0x2126, 0x3A9}, +{0x212A, 0x4B}, {0x212B, 0x41}, {0x212B, 0x30A}, {0x219A, 0x2190}, {0x219A, 0x338}, {0x219B, 0x2192}, {0x219B, 0x338}, {0x21AE, 0x2194}, {0x21AE, 0x338}, {0x21CD, 0x21D0}, {0x21CD, 0x338}, +{0x21CE, 0x21D4}, {0x21CE, 0x338}, {0x21CF, 0x21D2}, {0x21CF, 0x338}, {0x2204, 0x2203}, {0x2204, 0x338}, {0x2209, 0x2208}, {0x2209, 0x338}, {0x220C, 0x220B}, {0x220C, 0x338}, {0x2224, 0x2223}, +{0x2224, 0x338}, {0x2226, 0x2225}, {0x2226, 0x338}, {0x2241, 0x223C}, {0x2241, 0x338}, {0x2244, 0x2243}, {0x2244, 0x338}, {0x2247, 0x2245}, {0x2247, 0x338}, {0x2249, 0x2248}, {0x2249, 0x338}, +{0x2260, 0x3D}, {0x2260, 0x338}, {0x2262, 0x2261}, {0x2262, 0x338}, {0x226D, 0x224D}, {0x226D, 0x338}, {0x226E, 0x3C}, {0x226E, 0x338}, {0x226F, 0x3E}, {0x226F, 0x338}, {0x2270, 0x2264}, +{0x2270, 0x338}, {0x2271, 0x2265}, {0x2271, 0x338}, {0x2274, 0x2272}, {0x2274, 0x338}, {0x2275, 0x2273}, {0x2275, 0x338}, {0x2278, 0x2276}, {0x2278, 0x338}, {0x2279, 0x2277}, {0x2279, 0x338}, +{0x2280, 0x227A}, {0x2280, 0x338}, {0x2281, 0x227B}, {0x2281, 0x338}, {0x2284, 0x2282}, {0x2284, 0x338}, {0x2285, 0x2283}, {0x2285, 0x338}, {0x2288, 0x2286}, {0x2288, 0x338}, {0x2289, 0x2287}, +{0x2289, 0x338}, {0x22AC, 0x22A2}, {0x22AC, 0x338}, {0x22AD, 0x22A8}, {0x22AD, 0x338}, {0x22AE, 
0x22A9}, {0x22AE, 0x338}, {0x22AF, 0x22AB}, {0x22AF, 0x338}, {0x22E0, 0x227C}, {0x22E0, 0x338}, +{0x22E1, 0x227D}, {0x22E1, 0x338}, {0x22E2, 0x2291}, {0x22E2, 0x338}, {0x22E3, 0x2292}, {0x22E3, 0x338}, {0x22EA, 0x22B2}, {0x22EA, 0x338}, {0x22EB, 0x22B3}, {0x22EB, 0x338}, {0x22EC, 0x22B4}, +{0x22EC, 0x338}, {0x22ED, 0x22B5}, {0x22ED, 0x338}, {0x2329, 0x3008}, {0x232A, 0x3009}, {0x2ADC, 0x2ADD}, {0x2ADC, 0x338}, {0x304C, 0x304B}, {0x304C, 0x3099}, {0x304E, 0x304D}, {0x304E, 0x3099}, +{0x3050, 0x304F}, {0x3050, 0x3099}, {0x3052, 0x3051}, {0x3052, 0x3099}, {0x3054, 0x3053}, {0x3054, 0x3099}, {0x3056, 0x3055}, {0x3056, 0x3099}, {0x3058, 0x3057}, {0x3058, 0x3099}, {0x305A, 0x3059}, +{0x305A, 0x3099}, {0x305C, 0x305B}, {0x305C, 0x3099}, {0x305E, 0x305D}, {0x305E, 0x3099}, {0x3060, 0x305F}, {0x3060, 0x3099}, {0x3062, 0x3061}, {0x3062, 0x3099}, {0x3065, 0x3064}, {0x3065, 0x3099}, +{0x3067, 0x3066}, {0x3067, 0x3099}, {0x3069, 0x3068}, {0x3069, 0x3099}, {0x3070, 0x306F}, {0x3070, 0x3099}, {0x3071, 0x306F}, {0x3071, 0x309A}, {0x3073, 0x3072}, {0x3073, 0x3099}, {0x3074, 0x3072}, +{0x3074, 0x309A}, {0x3076, 0x3075}, {0x3076, 0x3099}, {0x3077, 0x3075}, {0x3077, 0x309A}, {0x3079, 0x3078}, {0x3079, 0x3099}, {0x307A, 0x3078}, {0x307A, 0x309A}, {0x307C, 0x307B}, {0x307C, 0x3099}, +{0x307D, 0x307B}, {0x307D, 0x309A}, {0x3094, 0x3046}, {0x3094, 0x3099}, {0x309E, 0x309D}, {0x309E, 0x3099}, {0x30AC, 0x30AB}, {0x30AC, 0x3099}, {0x30AE, 0x30AD}, {0x30AE, 0x3099}, {0x30B0, 0x30AF}, +{0x30B0, 0x3099}, {0x30B2, 0x30B1}, {0x30B2, 0x3099}, {0x30B4, 0x30B3}, {0x30B4, 0x3099}, {0x30B6, 0x30B5}, {0x30B6, 0x3099}, {0x30B8, 0x30B7}, {0x30B8, 0x3099}, {0x30BA, 0x30B9}, {0x30BA, 0x3099}, +{0x30BC, 0x30BB}, {0x30BC, 0x3099}, {0x30BE, 0x30BD}, {0x30BE, 0x3099}, {0x30C0, 0x30BF}, {0x30C0, 0x3099}, {0x30C2, 0x30C1}, {0x30C2, 0x3099}, {0x30C5, 0x30C4}, {0x30C5, 0x3099}, {0x30C7, 0x30C6}, +{0x30C7, 0x3099}, {0x30C9, 0x30C8}, {0x30C9, 0x3099}, {0x30D0, 0x30CF}, {0x30D0, 0x3099}, {0x30D1, 0x30CF}, {0x30D1, 0x309A}, {0x30D3, 0x30D2}, {0x30D3, 0x3099}, {0x30D4, 0x30D2}, {0x30D4, 0x309A}, +{0x30D6, 0x30D5}, {0x30D6, 0x3099}, {0x30D7, 0x30D5}, {0x30D7, 0x309A}, {0x30D9, 0x30D8}, {0x30D9, 0x3099}, {0x30DA, 0x30D8}, {0x30DA, 0x309A}, {0x30DC, 0x30DB}, {0x30DC, 0x3099}, {0x30DD, 0x30DB}, +{0x30DD, 0x309A}, {0x30F4, 0x30A6}, {0x30F4, 0x3099}, {0x30F7, 0x30EF}, {0x30F7, 0x3099}, {0x30F8, 0x30F0}, {0x30F8, 0x3099}, {0x30F9, 0x30F1}, {0x30F9, 0x3099}, {0x30FA, 0x30F2}, {0x30FA, 0x3099}, +{0x30FE, 0x30FD}, {0x30FE, 0x3099}, {0xF900, 0x8C48}, {0xF901, 0x66F4}, {0xF902, 0x8ECA}, {0xF903, 0x8CC8}, {0xF904, 0x6ED1}, {0xF905, 0x4E32}, {0xF906, 0x53E5}, {0xF907, 0x9F9C}, {0xF908, 0x9F9C}, +{0xF909, 0x5951}, {0xF90A, 0x91D1}, {0xF90B, 0x5587}, {0xF90C, 0x5948}, {0xF90D, 0x61F6}, {0xF90E, 0x7669}, {0xF90F, 0x7F85}, {0xF910, 0x863F}, {0xF911, 0x87BA}, {0xF912, 0x88F8}, {0xF913, 0x908F}, +{0xF914, 0x6A02}, {0xF915, 0x6D1B}, {0xF916, 0x70D9}, {0xF917, 0x73DE}, {0xF918, 0x843D}, {0xF919, 0x916A}, {0xF91A, 0x99F1}, {0xF91B, 0x4E82}, {0xF91C, 0x5375}, {0xF91D, 0x6B04}, {0xF91E, 0x721B}, +{0xF91F, 0x862D}, {0xF920, 0x9E1E}, {0xF921, 0x5D50}, {0xF922, 0x6FEB}, {0xF923, 0x85CD}, {0xF924, 0x8964}, {0xF925, 0x62C9}, {0xF926, 0x81D8}, {0xF927, 0x881F}, {0xF928, 0x5ECA}, {0xF929, 0x6717}, +{0xF92A, 0x6D6A}, {0xF92B, 0x72FC}, {0xF92C, 0x90CE}, {0xF92D, 0x4F86}, {0xF92E, 0x51B7}, {0xF92F, 0x52DE}, {0xF930, 0x64C4}, {0xF931, 0x6AD3}, {0xF932, 0x7210}, {0xF933, 0x76E7}, {0xF934, 0x8001}, +{0xF935, 0x8606}, {0xF936, 0x865C}, {0xF937, 0x8DEF}, {0xF938, 0x9732}, {0xF939, 
0x9B6F}, {0xF93A, 0x9DFA}, {0xF93B, 0x788C}, {0xF93C, 0x797F}, {0xF93D, 0x7DA0}, {0xF93E, 0x83C9}, {0xF93F, 0x9304}, +{0xF940, 0x9E7F}, {0xF941, 0x8AD6}, {0xF942, 0x58DF}, {0xF943, 0x5F04}, {0xF944, 0x7C60}, {0xF945, 0x807E}, {0xF946, 0x7262}, {0xF947, 0x78CA}, {0xF948, 0x8CC2}, {0xF949, 0x96F7}, {0xF94A, 0x58D8}, +{0xF94B, 0x5C62}, {0xF94C, 0x6A13}, {0xF94D, 0x6DDA}, {0xF94E, 0x6F0F}, {0xF94F, 0x7D2F}, {0xF950, 0x7E37}, {0xF951, 0x964B}, {0xF952, 0x52D2}, {0xF953, 0x808B}, {0xF954, 0x51DC}, {0xF955, 0x51CC}, +{0xF956, 0x7A1C}, {0xF957, 0x7DBE}, {0xF958, 0x83F1}, {0xF959, 0x9675}, {0xF95A, 0x8B80}, {0xF95B, 0x62CF}, {0xF95C, 0x6A02}, {0xF95D, 0x8AFE}, {0xF95E, 0x4E39}, {0xF95F, 0x5BE7}, {0xF960, 0x6012}, +{0xF961, 0x7387}, {0xF962, 0x7570}, {0xF963, 0x5317}, {0xF964, 0x78FB}, {0xF965, 0x4FBF}, {0xF966, 0x5FA9}, {0xF967, 0x4E0D}, {0xF968, 0x6CCC}, {0xF969, 0x6578}, {0xF96A, 0x7D22}, {0xF96B, 0x53C3}, +{0xF96C, 0x585E}, {0xF96D, 0x7701}, {0xF96E, 0x8449}, {0xF96F, 0x8AAA}, {0xF970, 0x6BBA}, {0xF971, 0x8FB0}, {0xF972, 0x6C88}, {0xF973, 0x62FE}, {0xF974, 0x82E5}, {0xF975, 0x63A0}, {0xF976, 0x7565}, +{0xF977, 0x4EAE}, {0xF978, 0x5169}, {0xF979, 0x51C9}, {0xF97A, 0x6881}, {0xF97B, 0x7CE7}, {0xF97C, 0x826F}, {0xF97D, 0x8AD2}, {0xF97E, 0x91CF}, {0xF97F, 0x52F5}, {0xF980, 0x5442}, {0xF981, 0x5973}, +{0xF982, 0x5EEC}, {0xF983, 0x65C5}, {0xF984, 0x6FFE}, {0xF985, 0x792A}, {0xF986, 0x95AD}, {0xF987, 0x9A6A}, {0xF988, 0x9E97}, {0xF989, 0x9ECE}, {0xF98A, 0x529B}, {0xF98B, 0x66C6}, {0xF98C, 0x6B77}, +{0xF98D, 0x8F62}, {0xF98E, 0x5E74}, {0xF98F, 0x6190}, {0xF990, 0x6200}, {0xF991, 0x649A}, {0xF992, 0x6F23}, {0xF993, 0x7149}, {0xF994, 0x7489}, {0xF995, 0x79CA}, {0xF996, 0x7DF4}, {0xF997, 0x806F}, +{0xF998, 0x8F26}, {0xF999, 0x84EE}, {0xF99A, 0x9023}, {0xF99B, 0x934A}, {0xF99C, 0x5217}, {0xF99D, 0x52A3}, {0xF99E, 0x54BD}, {0xF99F, 0x70C8}, {0xF9A0, 0x88C2}, {0xF9A1, 0x8AAA}, {0xF9A2, 0x5EC9}, +{0xF9A3, 0x5FF5}, {0xF9A4, 0x637B}, {0xF9A5, 0x6BAE}, {0xF9A6, 0x7C3E}, {0xF9A7, 0x7375}, {0xF9A8, 0x4EE4}, {0xF9A9, 0x56F9}, {0xF9AA, 0x5BE7}, {0xF9AB, 0x5DBA}, {0xF9AC, 0x601C}, {0xF9AD, 0x73B2}, +{0xF9AE, 0x7469}, {0xF9AF, 0x7F9A}, {0xF9B0, 0x8046}, {0xF9B1, 0x9234}, {0xF9B2, 0x96F6}, {0xF9B3, 0x9748}, {0xF9B4, 0x9818}, {0xF9B5, 0x4F8B}, {0xF9B6, 0x79AE}, {0xF9B7, 0x91B4}, {0xF9B8, 0x96B8}, +{0xF9B9, 0x60E1}, {0xF9BA, 0x4E86}, {0xF9BB, 0x50DA}, {0xF9BC, 0x5BEE}, {0xF9BD, 0x5C3F}, {0xF9BE, 0x6599}, {0xF9BF, 0x6A02}, {0xF9C0, 0x71CE}, {0xF9C1, 0x7642}, {0xF9C2, 0x84FC}, {0xF9C3, 0x907C}, +{0xF9C4, 0x9F8D}, {0xF9C5, 0x6688}, {0xF9C6, 0x962E}, {0xF9C7, 0x5289}, {0xF9C8, 0x677B}, {0xF9C9, 0x67F3}, {0xF9CA, 0x6D41}, {0xF9CB, 0x6E9C}, {0xF9CC, 0x7409}, {0xF9CD, 0x7559}, {0xF9CE, 0x786B}, +{0xF9CF, 0x7D10}, {0xF9D0, 0x985E}, {0xF9D1, 0x516D}, {0xF9D2, 0x622E}, {0xF9D3, 0x9678}, {0xF9D4, 0x502B}, {0xF9D5, 0x5D19}, {0xF9D6, 0x6DEA}, {0xF9D7, 0x8F2A}, {0xF9D8, 0x5F8B}, {0xF9D9, 0x6144}, +{0xF9DA, 0x6817}, {0xF9DB, 0x7387}, {0xF9DC, 0x9686}, {0xF9DD, 0x5229}, {0xF9DE, 0x540F}, {0xF9DF, 0x5C65}, {0xF9E0, 0x6613}, {0xF9E1, 0x674E}, {0xF9E2, 0x68A8}, {0xF9E3, 0x6CE5}, {0xF9E4, 0x7406}, +{0xF9E5, 0x75E2}, {0xF9E6, 0x7F79}, {0xF9E7, 0x88CF}, {0xF9E8, 0x88E1}, {0xF9E9, 0x91CC}, {0xF9EA, 0x96E2}, {0xF9EB, 0x533F}, {0xF9EC, 0x6EBA}, {0xF9ED, 0x541D}, {0xF9EE, 0x71D0}, {0xF9EF, 0x7498}, +{0xF9F0, 0x85FA}, {0xF9F1, 0x96A3}, {0xF9F2, 0x9C57}, {0xF9F3, 0x9E9F}, {0xF9F4, 0x6797}, {0xF9F5, 0x6DCB}, {0xF9F6, 0x81E8}, {0xF9F7, 0x7ACB}, {0xF9F8, 0x7B20}, {0xF9F9, 0x7C92}, {0xF9FA, 0x72C0}, +{0xF9FB, 0x7099}, {0xF9FC, 0x8B58}, {0xF9FD, 0x4EC0}, 
{0xF9FE, 0x8336}, {0xF9FF, 0x523A}, {0xFA00, 0x5207}, {0xFA01, 0x5EA6}, {0xFA02, 0x62D3}, {0xFA03, 0x7CD6}, {0xFA04, 0x5B85}, {0xFA05, 0x6D1E}, +{0xFA06, 0x66B4}, {0xFA07, 0x8F3B}, {0xFA08, 0x884C}, {0xFA09, 0x964D}, {0xFA0A, 0x898B}, {0xFA0B, 0x5ED3}, {0xFA0C, 0x5140}, {0xFA0D, 0x55C0}, {0xFA10, 0x585A}, {0xFA12, 0x6674}, {0xFA15, 0x51DE}, +{0xFA16, 0x732A}, {0xFA17, 0x76CA}, {0xFA18, 0x793C}, {0xFA19, 0x795E}, {0xFA1A, 0x7965}, {0xFA1B, 0x798F}, {0xFA1C, 0x9756}, {0xFA1D, 0x7CBE}, {0xFA1E, 0x7FBD}, {0xFA20, 0x8612}, {0xFA22, 0x8AF8}, +{0xFA25, 0x9038}, {0xFA26, 0x90FD}, {0xFA2A, 0x98EF}, {0xFA2B, 0x98FC}, {0xFA2C, 0x9928}, {0xFA2D, 0x9DB4}, {0xFA2E, 0x90DE}, {0xFA2F, 0x96B7}, {0xFA30, 0x4FAE}, {0xFA31, 0x50E7}, {0xFA32, 0x514D}, +{0xFA33, 0x52C9}, {0xFA34, 0x52E4}, {0xFA35, 0x5351}, {0xFA36, 0x559D}, {0xFA37, 0x5606}, {0xFA38, 0x5668}, {0xFA39, 0x5840}, {0xFA3A, 0x58A8}, {0xFA3B, 0x5C64}, {0xFA3C, 0x5C6E}, {0xFA3D, 0x6094}, +{0xFA3E, 0x6168}, {0xFA3F, 0x618E}, {0xFA40, 0x61F2}, {0xFA41, 0x654F}, {0xFA42, 0x65E2}, {0xFA43, 0x6691}, {0xFA44, 0x6885}, {0xFA45, 0x6D77}, {0xFA46, 0x6E1A}, {0xFA47, 0x6F22}, {0xFA48, 0x716E}, +{0xFA49, 0x722B}, {0xFA4A, 0x7422}, {0xFA4B, 0x7891}, {0xFA4C, 0x793E}, {0xFA4D, 0x7949}, {0xFA4E, 0x7948}, {0xFA4F, 0x7950}, {0xFA50, 0x7956}, {0xFA51, 0x795D}, {0xFA52, 0x798D}, {0xFA53, 0x798E}, +{0xFA54, 0x7A40}, {0xFA55, 0x7A81}, {0xFA56, 0x7BC0}, {0xFA57, 0x7DF4}, {0xFA58, 0x7E09}, {0xFA59, 0x7E41}, {0xFA5A, 0x7F72}, {0xFA5B, 0x8005}, {0xFA5C, 0x81ED}, {0xFA5D, 0x8279}, {0xFA5E, 0x8279}, +{0xFA5F, 0x8457}, {0xFA60, 0x8910}, {0xFA61, 0x8996}, {0xFA62, 0x8B01}, {0xFA63, 0x8B39}, {0xFA64, 0x8CD3}, {0xFA65, 0x8D08}, {0xFA66, 0x8FB6}, {0xFA67, 0x9038}, {0xFA68, 0x96E3}, {0xFA69, 0x97FF}, +{0xFA6A, 0x983B}, {0xFA6B, 0x6075}, {0xFA6C, 0x242EE}, {0xFA6D, 0x8218}, {0xFA70, 0x4E26}, {0xFA71, 0x51B5}, {0xFA72, 0x5168}, {0xFA73, 0x4F80}, {0xFA74, 0x5145}, {0xFA75, 0x5180}, {0xFA76, 0x52C7}, +{0xFA77, 0x52FA}, {0xFA78, 0x559D}, {0xFA79, 0x5555}, {0xFA7A, 0x5599}, {0xFA7B, 0x55E2}, {0xFA7C, 0x585A}, {0xFA7D, 0x58B3}, {0xFA7E, 0x5944}, {0xFA7F, 0x5954}, {0xFA80, 0x5A62}, {0xFA81, 0x5B28}, +{0xFA82, 0x5ED2}, {0xFA83, 0x5ED9}, {0xFA84, 0x5F69}, {0xFA85, 0x5FAD}, {0xFA86, 0x60D8}, {0xFA87, 0x614E}, {0xFA88, 0x6108}, {0xFA89, 0x618E}, {0xFA8A, 0x6160}, {0xFA8B, 0x61F2}, {0xFA8C, 0x6234}, +{0xFA8D, 0x63C4}, {0xFA8E, 0x641C}, {0xFA8F, 0x6452}, {0xFA90, 0x6556}, {0xFA91, 0x6674}, {0xFA92, 0x6717}, {0xFA93, 0x671B}, {0xFA94, 0x6756}, {0xFA95, 0x6B79}, {0xFA96, 0x6BBA}, {0xFA97, 0x6D41}, +{0xFA98, 0x6EDB}, {0xFA99, 0x6ECB}, {0xFA9A, 0x6F22}, {0xFA9B, 0x701E}, {0xFA9C, 0x716E}, {0xFA9D, 0x77A7}, {0xFA9E, 0x7235}, {0xFA9F, 0x72AF}, {0xFAA0, 0x732A}, {0xFAA1, 0x7471}, {0xFAA2, 0x7506}, +{0xFAA3, 0x753B}, {0xFAA4, 0x761D}, {0xFAA5, 0x761F}, {0xFAA6, 0x76CA}, {0xFAA7, 0x76DB}, {0xFAA8, 0x76F4}, {0xFAA9, 0x774A}, {0xFAAA, 0x7740}, {0xFAAB, 0x78CC}, {0xFAAC, 0x7AB1}, {0xFAAD, 0x7BC0}, +{0xFAAE, 0x7C7B}, {0xFAAF, 0x7D5B}, {0xFAB0, 0x7DF4}, {0xFAB1, 0x7F3E}, {0xFAB2, 0x8005}, {0xFAB3, 0x8352}, {0xFAB4, 0x83EF}, {0xFAB5, 0x8779}, {0xFAB6, 0x8941}, {0xFAB7, 0x8986}, {0xFAB8, 0x8996}, +{0xFAB9, 0x8ABF}, {0xFABA, 0x8AF8}, {0xFABB, 0x8ACB}, {0xFABC, 0x8B01}, {0xFABD, 0x8AFE}, {0xFABE, 0x8AED}, {0xFABF, 0x8B39}, {0xFAC0, 0x8B8A}, {0xFAC1, 0x8D08}, {0xFAC2, 0x8F38}, {0xFAC3, 0x9072}, +{0xFAC4, 0x9199}, {0xFAC5, 0x9276}, {0xFAC6, 0x967C}, {0xFAC7, 0x96E3}, {0xFAC8, 0x9756}, {0xFAC9, 0x97DB}, {0xFACA, 0x97FF}, {0xFACB, 0x980B}, {0xFACC, 0x983B}, {0xFACD, 0x9B12}, {0xFACE, 0x9F9C}, +{0xFACF, 0x2284A}, 
{0xFAD0, 0x22844}, {0xFAD1, 0x233D5}, {0xFAD2, 0x3B9D}, {0xFAD3, 0x4018}, {0xFAD4, 0x4039}, {0xFAD5, 0x25249}, {0xFAD6, 0x25CD0}, {0xFAD7, 0x27ED3}, {0xFAD8, 0x9F43}, +{0xFAD9, 0x9F8E}, {0xFB1D, 0x5D9}, {0xFB1D, 0x5B4}, {0xFB1F, 0x5F2}, {0xFB1F, 0x5B7}, {0xFB2A, 0x5E9}, {0xFB2A, 0x5C1}, {0xFB2B, 0x5E9}, {0xFB2B, 0x5C2}, {0xFB2C, 0x5E9}, {0xFB2C, 0x5BC}, +{0xFB2C, 0x5C1}, {0xFB2D, 0x5E9}, {0xFB2D, 0x5BC}, {0xFB2D, 0x5C2}, {0xFB2E, 0x5D0}, {0xFB2E, 0x5B7}, {0xFB2F, 0x5D0}, {0xFB2F, 0x5B8}, {0xFB30, 0x5D0}, {0xFB30, 0x5BC}, {0xFB31, 0x5D1}, +{0xFB31, 0x5BC}, {0xFB32, 0x5D2}, {0xFB32, 0x5BC}, {0xFB33, 0x5D3}, {0xFB33, 0x5BC}, {0xFB34, 0x5D4}, {0xFB34, 0x5BC}, {0xFB35, 0x5D5}, {0xFB35, 0x5BC}, {0xFB36, 0x5D6}, {0xFB36, 0x5BC}, +{0xFB38, 0x5D8}, {0xFB38, 0x5BC}, {0xFB39, 0x5D9}, {0xFB39, 0x5BC}, {0xFB3A, 0x5DA}, {0xFB3A, 0x5BC}, {0xFB3B, 0x5DB}, {0xFB3B, 0x5BC}, {0xFB3C, 0x5DC}, {0xFB3C, 0x5BC}, {0xFB3E, 0x5DE}, +{0xFB3E, 0x5BC}, {0xFB40, 0x5E0}, {0xFB40, 0x5BC}, {0xFB41, 0x5E1}, {0xFB41, 0x5BC}, {0xFB43, 0x5E3}, {0xFB43, 0x5BC}, {0xFB44, 0x5E4}, {0xFB44, 0x5BC}, {0xFB46, 0x5E6}, {0xFB46, 0x5BC}, +{0xFB47, 0x5E7}, {0xFB47, 0x5BC}, {0xFB48, 0x5E8}, {0xFB48, 0x5BC}, {0xFB49, 0x5E9}, {0xFB49, 0x5BC}, {0xFB4A, 0x5EA}, {0xFB4A, 0x5BC}, {0xFB4B, 0x5D5}, {0xFB4B, 0x5B9}, {0xFB4C, 0x5D1}, +{0xFB4C, 0x5BF}, {0xFB4D, 0x5DB}, {0xFB4D, 0x5BF}, {0xFB4E, 0x5E4}, {0xFB4E, 0x5BF}, {0x1109A, 0x11099}, {0x1109A, 0x110BA}, {0x1109C, 0x1109B}, {0x1109C, 0x110BA}, {0x110AB, 0x110A5}, +{0x110AB, 0x110BA}, {0x1112E, 0x11131}, {0x1112E, 0x11127}, {0x1112F, 0x11132}, {0x1112F, 0x11127}, {0x1134B, 0x11347}, {0x1134B, 0x1133E}, {0x1134C, 0x11347}, {0x1134C, 0x11357}, {0x114BB, 0x114B9}, +{0x114BB, 0x114BA}, {0x114BC, 0x114B9}, {0x114BC, 0x114B0}, {0x114BE, 0x114B9}, {0x114BE, 0x114BD}, {0x115BA, 0x115B8}, {0x115BA, 0x115AF}, {0x115BB, 0x115B9}, {0x115BB, 0x115AF}, {0x1D15E, 0x1D157}, +{0x1D15E, 0x1D165}, {0x1D15F, 0x1D158}, {0x1D15F, 0x1D165}, {0x1D160, 0x1D158}, {0x1D160, 0x1D165}, {0x1D160, 0x1D16E}, {0x1D161, 0x1D158}, {0x1D161, 0x1D165}, {0x1D161, 0x1D16F}, {0x1D162, 0x1D158}, +{0x1D162, 0x1D165}, {0x1D162, 0x1D170}, {0x1D163, 0x1D158}, {0x1D163, 0x1D165}, {0x1D163, 0x1D171}, {0x1D164, 0x1D158}, {0x1D164, 0x1D165}, {0x1D164, 0x1D172}, {0x1D1BB, 0x1D1B9}, {0x1D1BB, 0x1D165}, +{0x1D1BC, 0x1D1BA}, {0x1D1BC, 0x1D165}, {0x1D1BD, 0x1D1B9}, {0x1D1BD, 0x1D165}, {0x1D1BD, 0x1D16E}, {0x1D1BE, 0x1D1BA}, {0x1D1BE, 0x1D165}, {0x1D1BE, 0x1D16E}, {0x1D1BF, 0x1D1B9}, {0x1D1BF, 0x1D165}, +{0x1D1BF, 0x1D16F}, {0x1D1C0, 0x1D1BA}, {0x1D1C0, 0x1D165}, {0x1D1C0, 0x1D16F}, {0x2F800, 0x4E3D}, {0x2F801, 0x4E38}, {0x2F802, 0x4E41}, {0x2F803, 0x20122}, {0x2F804, 0x4F60}, {0x2F805, 0x4FAE}, +{0x2F806, 0x4FBB}, {0x2F807, 0x5002}, {0x2F808, 0x507A}, {0x2F809, 0x5099}, {0x2F80A, 0x50E7}, {0x2F80B, 0x50CF}, {0x2F80C, 0x349E}, {0x2F80D, 0x2063A}, {0x2F80E, 0x514D}, {0x2F80F, 0x5154}, +{0x2F810, 0x5164}, {0x2F811, 0x5177}, {0x2F812, 0x2051C}, {0x2F813, 0x34B9}, {0x2F814, 0x5167}, {0x2F815, 0x518D}, {0x2F816, 0x2054B}, {0x2F817, 0x5197}, {0x2F818, 0x51A4}, {0x2F819, 0x4ECC}, +{0x2F81A, 0x51AC}, {0x2F81B, 0x51B5}, {0x2F81C, 0x291DF}, {0x2F81D, 0x51F5}, {0x2F81E, 0x5203}, {0x2F81F, 0x34DF}, {0x2F820, 0x523B}, {0x2F821, 0x5246}, {0x2F822, 0x5272}, {0x2F823, 0x5277}, +{0x2F824, 0x3515}, {0x2F825, 0x52C7}, {0x2F826, 0x52C9}, {0x2F827, 0x52E4}, {0x2F828, 0x52FA}, {0x2F829, 0x5305}, {0x2F82A, 0x5306}, {0x2F82B, 0x5317}, {0x2F82C, 0x5349}, {0x2F82D, 0x5351}, +{0x2F82E, 0x535A}, {0x2F82F, 0x5373}, {0x2F830, 0x537D}, {0x2F831, 0x537F}, {0x2F832, 0x537F}, 
{0x2F833, 0x537F}, {0x2F834, 0x20A2C}, {0x2F835, 0x7070}, {0x2F836, 0x53CA}, {0x2F837, 0x53DF}, +{0x2F838, 0x20B63}, {0x2F839, 0x53EB}, {0x2F83A, 0x53F1}, {0x2F83B, 0x5406}, {0x2F83C, 0x549E}, {0x2F83D, 0x5438}, {0x2F83E, 0x5448}, {0x2F83F, 0x5468}, {0x2F840, 0x54A2}, {0x2F841, 0x54F6}, +{0x2F842, 0x5510}, {0x2F843, 0x5553}, {0x2F844, 0x5563}, {0x2F845, 0x5584}, {0x2F846, 0x5584}, {0x2F847, 0x5599}, {0x2F848, 0x55AB}, {0x2F849, 0x55B3}, {0x2F84A, 0x55C2}, {0x2F84B, 0x5716}, +{0x2F84C, 0x5606}, {0x2F84D, 0x5717}, {0x2F84E, 0x5651}, {0x2F84F, 0x5674}, {0x2F850, 0x5207}, {0x2F851, 0x58EE}, {0x2F852, 0x57CE}, {0x2F853, 0x57F4}, {0x2F854, 0x580D}, {0x2F855, 0x578B}, +{0x2F856, 0x5832}, {0x2F857, 0x5831}, {0x2F858, 0x58AC}, {0x2F859, 0x214E4}, {0x2F85A, 0x58F2}, {0x2F85B, 0x58F7}, {0x2F85C, 0x5906}, {0x2F85D, 0x591A}, {0x2F85E, 0x5922}, {0x2F85F, 0x5962}, +{0x2F860, 0x216A8}, {0x2F861, 0x216EA}, {0x2F862, 0x59EC}, {0x2F863, 0x5A1B}, {0x2F864, 0x5A27}, {0x2F865, 0x59D8}, {0x2F866, 0x5A66}, {0x2F867, 0x36EE}, {0x2F868, 0x36FC}, {0x2F869, 0x5B08}, +{0x2F86A, 0x5B3E}, {0x2F86B, 0x5B3E}, {0x2F86C, 0x219C8}, {0x2F86D, 0x5BC3}, {0x2F86E, 0x5BD8}, {0x2F86F, 0x5BE7}, {0x2F870, 0x5BF3}, {0x2F871, 0x21B18}, {0x2F872, 0x5BFF}, {0x2F873, 0x5C06}, +{0x2F874, 0x5F53}, {0x2F875, 0x5C22}, {0x2F876, 0x3781}, {0x2F877, 0x5C60}, {0x2F878, 0x5C6E}, {0x2F879, 0x5CC0}, {0x2F87A, 0x5C8D}, {0x2F87B, 0x21DE4}, {0x2F87C, 0x5D43}, {0x2F87D, 0x21DE6}, +{0x2F87E, 0x5D6E}, {0x2F87F, 0x5D6B}, {0x2F880, 0x5D7C}, {0x2F881, 0x5DE1}, {0x2F882, 0x5DE2}, {0x2F883, 0x382F}, {0x2F884, 0x5DFD}, {0x2F885, 0x5E28}, {0x2F886, 0x5E3D}, {0x2F887, 0x5E69}, +{0x2F888, 0x3862}, {0x2F889, 0x22183}, {0x2F88A, 0x387C}, {0x2F88B, 0x5EB0}, {0x2F88C, 0x5EB3}, {0x2F88D, 0x5EB6}, {0x2F88E, 0x5ECA}, {0x2F88F, 0x2A392}, {0x2F890, 0x5EFE}, {0x2F891, 0x22331}, +{0x2F892, 0x22331}, {0x2F893, 0x8201}, {0x2F894, 0x5F22}, {0x2F895, 0x5F22}, {0x2F896, 0x38C7}, {0x2F897, 0x232B8}, {0x2F898, 0x261DA}, {0x2F899, 0x5F62}, {0x2F89A, 0x5F6B}, {0x2F89B, 0x38E3}, +{0x2F89C, 0x5F9A}, {0x2F89D, 0x5FCD}, {0x2F89E, 0x5FD7}, {0x2F89F, 0x5FF9}, {0x2F8A0, 0x6081}, {0x2F8A1, 0x393A}, {0x2F8A2, 0x391C}, {0x2F8A3, 0x6094}, {0x2F8A4, 0x226D4}, {0x2F8A5, 0x60C7}, +{0x2F8A6, 0x6148}, {0x2F8A7, 0x614C}, {0x2F8A8, 0x614E}, {0x2F8A9, 0x614C}, {0x2F8AA, 0x617A}, {0x2F8AB, 0x618E}, {0x2F8AC, 0x61B2}, {0x2F8AD, 0x61A4}, {0x2F8AE, 0x61AF}, {0x2F8AF, 0x61DE}, +{0x2F8B0, 0x61F2}, {0x2F8B1, 0x61F6}, {0x2F8B2, 0x6210}, {0x2F8B3, 0x621B}, {0x2F8B4, 0x625D}, {0x2F8B5, 0x62B1}, {0x2F8B6, 0x62D4}, {0x2F8B7, 0x6350}, {0x2F8B8, 0x22B0C}, {0x2F8B9, 0x633D}, +{0x2F8BA, 0x62FC}, {0x2F8BB, 0x6368}, {0x2F8BC, 0x6383}, {0x2F8BD, 0x63E4}, {0x2F8BE, 0x22BF1}, {0x2F8BF, 0x6422}, {0x2F8C0, 0x63C5}, {0x2F8C1, 0x63A9}, {0x2F8C2, 0x3A2E}, {0x2F8C3, 0x6469}, +{0x2F8C4, 0x647E}, {0x2F8C5, 0x649D}, {0x2F8C6, 0x6477}, {0x2F8C7, 0x3A6C}, {0x2F8C8, 0x654F}, {0x2F8C9, 0x656C}, {0x2F8CA, 0x2300A}, {0x2F8CB, 0x65E3}, {0x2F8CC, 0x66F8}, {0x2F8CD, 0x6649}, +{0x2F8CE, 0x3B19}, {0x2F8CF, 0x6691}, {0x2F8D0, 0x3B08}, {0x2F8D1, 0x3AE4}, {0x2F8D2, 0x5192}, {0x2F8D3, 0x5195}, {0x2F8D4, 0x6700}, {0x2F8D5, 0x669C}, {0x2F8D6, 0x80AD}, {0x2F8D7, 0x43D9}, +{0x2F8D8, 0x6717}, {0x2F8D9, 0x671B}, {0x2F8DA, 0x6721}, {0x2F8DB, 0x675E}, {0x2F8DC, 0x6753}, {0x2F8DD, 0x233C3}, {0x2F8DE, 0x3B49}, {0x2F8DF, 0x67FA}, {0x2F8E0, 0x6785}, {0x2F8E1, 0x6852}, +{0x2F8E2, 0x6885}, {0x2F8E3, 0x2346D}, {0x2F8E4, 0x688E}, {0x2F8E5, 0x681F}, {0x2F8E6, 0x6914}, {0x2F8E7, 0x3B9D}, {0x2F8E8, 0x6942}, {0x2F8E9, 0x69A3}, {0x2F8EA, 0x69EA}, {0x2F8EB, 0x6AA8}, 
+{0x2F8EC, 0x236A3}, {0x2F8ED, 0x6ADB}, {0x2F8EE, 0x3C18}, {0x2F8EF, 0x6B21}, {0x2F8F0, 0x238A7}, {0x2F8F1, 0x6B54}, {0x2F8F2, 0x3C4E}, {0x2F8F3, 0x6B72}, {0x2F8F4, 0x6B9F}, {0x2F8F5, 0x6BBA}, +{0x2F8F6, 0x6BBB}, {0x2F8F7, 0x23A8D}, {0x2F8F8, 0x21D0B}, {0x2F8F9, 0x23AFA}, {0x2F8FA, 0x6C4E}, {0x2F8FB, 0x23CBC}, {0x2F8FC, 0x6CBF}, {0x2F8FD, 0x6CCD}, {0x2F8FE, 0x6C67}, {0x2F8FF, 0x6D16}, +{0x2F900, 0x6D3E}, {0x2F901, 0x6D77}, {0x2F902, 0x6D41}, {0x2F903, 0x6D69}, {0x2F904, 0x6D78}, {0x2F905, 0x6D85}, {0x2F906, 0x23D1E}, {0x2F907, 0x6D34}, {0x2F908, 0x6E2F}, {0x2F909, 0x6E6E}, +{0x2F90A, 0x3D33}, {0x2F90B, 0x6ECB}, {0x2F90C, 0x6EC7}, {0x2F90D, 0x23ED1}, {0x2F90E, 0x6DF9}, {0x2F90F, 0x6F6E}, {0x2F910, 0x23F5E}, {0x2F911, 0x23F8E}, {0x2F912, 0x6FC6}, {0x2F913, 0x7039}, +{0x2F914, 0x701E}, {0x2F915, 0x701B}, {0x2F916, 0x3D96}, {0x2F917, 0x704A}, {0x2F918, 0x707D}, {0x2F919, 0x7077}, {0x2F91A, 0x70AD}, {0x2F91B, 0x20525}, {0x2F91C, 0x7145}, {0x2F91D, 0x24263}, +{0x2F91E, 0x719C}, {0x2F91F, 0x243AB}, {0x2F920, 0x7228}, {0x2F921, 0x7235}, {0x2F922, 0x7250}, {0x2F923, 0x24608}, {0x2F924, 0x7280}, {0x2F925, 0x7295}, {0x2F926, 0x24735}, {0x2F927, 0x24814}, +{0x2F928, 0x737A}, {0x2F929, 0x738B}, {0x2F92A, 0x3EAC}, {0x2F92B, 0x73A5}, {0x2F92C, 0x3EB8}, {0x2F92D, 0x3EB8}, {0x2F92E, 0x7447}, {0x2F92F, 0x745C}, {0x2F930, 0x7471}, {0x2F931, 0x7485}, +{0x2F932, 0x74CA}, {0x2F933, 0x3F1B}, {0x2F934, 0x7524}, {0x2F935, 0x24C36}, {0x2F936, 0x753E}, {0x2F937, 0x24C92}, {0x2F938, 0x7570}, {0x2F939, 0x2219F}, {0x2F93A, 0x7610}, {0x2F93B, 0x24FA1}, +{0x2F93C, 0x24FB8}, {0x2F93D, 0x25044}, {0x2F93E, 0x3FFC}, {0x2F93F, 0x4008}, {0x2F940, 0x76F4}, {0x2F941, 0x250F3}, {0x2F942, 0x250F2}, {0x2F943, 0x25119}, {0x2F944, 0x25133}, {0x2F945, 0x771E}, +{0x2F946, 0x771F}, {0x2F947, 0x771F}, {0x2F948, 0x774A}, {0x2F949, 0x4039}, {0x2F94A, 0x778B}, {0x2F94B, 0x4046}, {0x2F94C, 0x4096}, {0x2F94D, 0x2541D}, {0x2F94E, 0x784E}, {0x2F94F, 0x788C}, +{0x2F950, 0x78CC}, {0x2F951, 0x40E3}, {0x2F952, 0x25626}, {0x2F953, 0x7956}, {0x2F954, 0x2569A}, {0x2F955, 0x256C5}, {0x2F956, 0x798F}, {0x2F957, 0x79EB}, {0x2F958, 0x412F}, {0x2F959, 0x7A40}, +{0x2F95A, 0x7A4A}, {0x2F95B, 0x7A4F}, {0x2F95C, 0x2597C}, {0x2F95D, 0x25AA7}, {0x2F95E, 0x25AA7}, {0x2F95F, 0x7AEE}, {0x2F960, 0x4202}, {0x2F961, 0x25BAB}, {0x2F962, 0x7BC6}, {0x2F963, 0x7BC9}, +{0x2F964, 0x4227}, {0x2F965, 0x25C80}, {0x2F966, 0x7CD2}, {0x2F967, 0x42A0}, {0x2F968, 0x7CE8}, {0x2F969, 0x7CE3}, {0x2F96A, 0x7D00}, {0x2F96B, 0x25F86}, {0x2F96C, 0x7D63}, {0x2F96D, 0x4301}, +{0x2F96E, 0x7DC7}, {0x2F96F, 0x7E02}, {0x2F970, 0x7E45}, {0x2F971, 0x4334}, {0x2F972, 0x26228}, {0x2F973, 0x26247}, {0x2F974, 0x4359}, {0x2F975, 0x262D9}, {0x2F976, 0x7F7A}, {0x2F977, 0x2633E}, +{0x2F978, 0x7F95}, {0x2F979, 0x7FFA}, {0x2F97A, 0x8005}, {0x2F97B, 0x264DA}, {0x2F97C, 0x26523}, {0x2F97D, 0x8060}, {0x2F97E, 0x265A8}, {0x2F97F, 0x8070}, {0x2F980, 0x2335F}, {0x2F981, 0x43D5}, +{0x2F982, 0x80B2}, {0x2F983, 0x8103}, {0x2F984, 0x440B}, {0x2F985, 0x813E}, {0x2F986, 0x5AB5}, {0x2F987, 0x267A7}, {0x2F988, 0x267B5}, {0x2F989, 0x23393}, {0x2F98A, 0x2339C}, {0x2F98B, 0x8201}, +{0x2F98C, 0x8204}, {0x2F98D, 0x8F9E}, {0x2F98E, 0x446B}, {0x2F98F, 0x8291}, {0x2F990, 0x828B}, {0x2F991, 0x829D}, {0x2F992, 0x52B3}, {0x2F993, 0x82B1}, {0x2F994, 0x82B3}, {0x2F995, 0x82BD}, +{0x2F996, 0x82E6}, {0x2F997, 0x26B3C}, {0x2F998, 0x82E5}, {0x2F999, 0x831D}, {0x2F99A, 0x8363}, {0x2F99B, 0x83AD}, {0x2F99C, 0x8323}, {0x2F99D, 0x83BD}, {0x2F99E, 0x83E7}, {0x2F99F, 0x8457}, +{0x2F9A0, 0x8353}, {0x2F9A1, 0x83CA}, {0x2F9A2, 0x83CC}, {0x2F9A3, 
0x83DC}, {0x2F9A4, 0x26C36}, {0x2F9A5, 0x26D6B}, {0x2F9A6, 0x26CD5}, {0x2F9A7, 0x452B}, {0x2F9A8, 0x84F1}, {0x2F9A9, 0x84F3}, +{0x2F9AA, 0x8516}, {0x2F9AB, 0x273CA}, {0x2F9AC, 0x8564}, {0x2F9AD, 0x26F2C}, {0x2F9AE, 0x455D}, {0x2F9AF, 0x4561}, {0x2F9B0, 0x26FB1}, {0x2F9B1, 0x270D2}, {0x2F9B2, 0x456B}, {0x2F9B3, 0x8650}, +{0x2F9B4, 0x865C}, {0x2F9B5, 0x8667}, {0x2F9B6, 0x8669}, {0x2F9B7, 0x86A9}, {0x2F9B8, 0x8688}, {0x2F9B9, 0x870E}, {0x2F9BA, 0x86E2}, {0x2F9BB, 0x8779}, {0x2F9BC, 0x8728}, {0x2F9BD, 0x876B}, +{0x2F9BE, 0x8786}, {0x2F9BF, 0x45D7}, {0x2F9C0, 0x87E1}, {0x2F9C1, 0x8801}, {0x2F9C2, 0x45F9}, {0x2F9C3, 0x8860}, {0x2F9C4, 0x8863}, {0x2F9C5, 0x27667}, {0x2F9C6, 0x88D7}, {0x2F9C7, 0x88DE}, +{0x2F9C8, 0x4635}, {0x2F9C9, 0x88FA}, {0x2F9CA, 0x34BB}, {0x2F9CB, 0x278AE}, {0x2F9CC, 0x27966}, {0x2F9CD, 0x46BE}, {0x2F9CE, 0x46C7}, {0x2F9CF, 0x8AA0}, {0x2F9D0, 0x8AED}, {0x2F9D1, 0x8B8A}, +{0x2F9D2, 0x8C55}, {0x2F9D3, 0x27CA8}, {0x2F9D4, 0x8CAB}, {0x2F9D5, 0x8CC1}, {0x2F9D6, 0x8D1B}, {0x2F9D7, 0x8D77}, {0x2F9D8, 0x27F2F}, {0x2F9D9, 0x20804}, {0x2F9DA, 0x8DCB}, {0x2F9DB, 0x8DBC}, +{0x2F9DC, 0x8DF0}, {0x2F9DD, 0x208DE}, {0x2F9DE, 0x8ED4}, {0x2F9DF, 0x8F38}, {0x2F9E0, 0x285D2}, {0x2F9E1, 0x285ED}, {0x2F9E2, 0x9094}, {0x2F9E3, 0x90F1}, {0x2F9E4, 0x9111}, {0x2F9E5, 0x2872E}, +{0x2F9E6, 0x911B}, {0x2F9E7, 0x9238}, {0x2F9E8, 0x92D7}, {0x2F9E9, 0x92D8}, {0x2F9EA, 0x927C}, {0x2F9EB, 0x93F9}, {0x2F9EC, 0x9415}, {0x2F9ED, 0x28BFA}, {0x2F9EE, 0x958B}, {0x2F9EF, 0x4995}, +{0x2F9F0, 0x95B7}, {0x2F9F1, 0x28D77}, {0x2F9F2, 0x49E6}, {0x2F9F3, 0x96C3}, {0x2F9F4, 0x5DB2}, {0x2F9F5, 0x9723}, {0x2F9F6, 0x29145}, {0x2F9F7, 0x2921A}, {0x2F9F8, 0x4A6E}, {0x2F9F9, 0x4A76}, +{0x2F9FA, 0x97E0}, {0x2F9FB, 0x2940A}, {0x2F9FC, 0x4AB2}, {0x2F9FD, 0x29496}, {0x2F9FE, 0x980B}, {0x2F9FF, 0x980B}, {0x2FA00, 0x9829}, {0x2FA01, 0x295B6}, {0x2FA02, 0x98E2}, {0x2FA03, 0x4B33}, +{0x2FA04, 0x9929}, {0x2FA05, 0x99A7}, {0x2FA06, 0x99C2}, {0x2FA07, 0x99FE}, {0x2FA08, 0x4BCE}, {0x2FA09, 0x29B30}, {0x2FA0A, 0x9B12}, {0x2FA0B, 0x9C40}, {0x2FA0C, 0x9CFD}, {0x2FA0D, 0x4CCE}, +{0x2FA0E, 0x4CED}, {0x2FA0F, 0x9D67}, {0x2FA10, 0x2A0CE}, {0x2FA11, 0x4CF8}, {0x2FA12, 0x2A105}, {0x2FA13, 0x2A20E}, {0x2FA14, 0x2A291}, {0x2FA15, 0x9EBB}, {0x2FA16, 0x4D56}, {0x2FA17, 0x9EF9}, +{0x2FA18, 0x9EFE}, {0x2FA19, 0x9F05}, {0x2FA1A, 0x9F0F}, {0x2FA1B, 0x9F16}, {0x2FA1D, 0x2A600}, }; static std::string codepoint_to_utf8(uint32_t cp) { From 3ab8b3a92ede46df88bc5a2dfca3777de4a2b2b6 Mon Sep 17 00:00:00 2001 From: Pierrick Hymbert Date: Fri, 1 Mar 2024 12:39:06 +0100 Subject: [PATCH 08/26] llama : cleanup unused mmq flags (#5772) * cleanup unused --no-mul-mat-q,-nommq, -mmq, --mul-mat-q, mul_mat_q * remove: mul_mat_q in compare llama bench and usage * update llama-bench --------- Co-authored-by: slaren --- common/common.cpp | 2 -- common/common.h | 1 - examples/batched-bench/batched-bench.cpp | 18 +++++--------- examples/llama-bench/README.md | 1 - examples/llama-bench/llama-bench.cpp | 30 +++--------------------- examples/server/server.cpp | 8 ------- llama.cpp | 3 --- llama.h | 1 - scripts/compare-llama-bench.py | 2 +- 9 files changed, 10 insertions(+), 56 deletions(-) diff --git a/common/common.cpp b/common/common.cpp index 18289755c..bf1ed8a66 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -1281,7 +1281,6 @@ struct llama_context_params llama_context_params_from_gpt_params(const gpt_param cparams.n_batch = params.n_batch; cparams.n_threads = params.n_threads; cparams.n_threads_batch = params.n_threads_batch == -1 ? 
params.n_threads : params.n_threads_batch; - cparams.mul_mat_q = params.mul_mat_q; cparams.seed = params.seed; cparams.logits_all = params.logits_all; cparams.embedding = params.embedding; @@ -1725,7 +1724,6 @@ void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const l fprintf(stream, "n_predict: %d # default: -1 (unlimited)\n", params.n_predict); fprintf(stream, "n_probs: %d # only used by server binary, default: 0\n", sparams.n_probs); fprintf(stream, "no_mmap: %s # default: false\n", !params.use_mmap ? "true" : "false"); - fprintf(stream, "no_mul_mat_q: %s # default: false\n", !params.mul_mat_q ? "true" : "false"); fprintf(stream, "no_penalize_nl: %s # default: false\n", !sparams.penalize_nl ? "true" : "false"); fprintf(stream, "ppl_output_type: %d # default: 0\n", params.ppl_output_type); fprintf(stream, "ppl_stride: %d # default: 0\n", params.ppl_stride); diff --git a/common/common.h b/common/common.h index 25003df26..ab62bdb82 100644 --- a/common/common.h +++ b/common/common.h @@ -115,7 +115,6 @@ struct gpt_params { bool kl_divergence = false; // compute KL-divergence - bool mul_mat_q = true; // if true, use mul_mat_q kernels instead of cuBLAS bool random_prompt = false; // do not randomize prompt if none provided bool use_color = false; // use color to distinguish generations and inputs bool interactive = false; // interactive mode diff --git a/examples/batched-bench/batched-bench.cpp b/examples/batched-bench/batched-bench.cpp index b4b8a38e1..19aff18ae 100644 --- a/examples/batched-bench/batched-bench.cpp +++ b/examples/batched-bench/batched-bench.cpp @@ -32,16 +32,15 @@ int main(int argc, char ** argv) { gpt_params params; if (argc == 1 || argv[1][0] == '-') { - printf("usage: %s MODEL_PATH [N_KV_MAX] [IS_PP_SHARED] [NGL] [MMQ] \n" , argv[0]); + printf("usage: %s MODEL_PATH [N_KV_MAX] [IS_PP_SHARED] [NGL] \n" , argv[0]); printf(" , and PL are comma-separated lists of numbers without spaces\n\n"); - printf(" example: %s ggml-model-f16.gguf 2048 0 999 0 128,256,512 128,256 1,2,4,8,16,32\n\n", argv[0]); + printf(" example: %s ggml-model-f16.gguf 2048 0 999 128,256,512 128,256 1,2,4,8,16,32\n\n", argv[0]); return 1 ; } int n_kv_max = 2048; int is_pp_shared = 0; int n_gpu_layers = 0; - int mmq = 0; std::vector n_pp = { 128, 256, 512, 1024, 2048, 3584, 7680, }; std::vector n_tg = { 128, 256, }; @@ -65,19 +64,15 @@ int main(int argc, char ** argv) { } if (argc >= 6) { - mmq = std::atoi(argv[5]); + n_pp = parse_list(argv[5]); } if (argc >= 7) { - n_pp = parse_list(argv[6]); + n_tg = parse_list(argv[6]); } if (argc >= 8) { - n_tg = parse_list(argv[7]); - } - - if (argc >= 9) { - n_pl = parse_list(argv[8]); + n_pl = parse_list(argv[7]); } // init LLM @@ -106,7 +101,6 @@ int main(int argc, char ** argv) { ctx_params.seed = 1234; ctx_params.n_ctx = n_kv_max; ctx_params.n_batch = 512; - ctx_params.mul_mat_q = mmq; ctx_params.n_threads = params.n_threads; ctx_params.n_threads_batch = params.n_threads_batch == -1 ? 
params.n_threads : params.n_threads_batch; @@ -159,7 +153,7 @@ int main(int argc, char ** argv) { } LOG_TEE("\n"); - LOG_TEE("%s: n_kv_max = %d, is_pp_shared = %d, n_gpu_layers = %d, mmq = %d, n_threads = %u, n_threads_batch = %u\n", __func__, n_kv_max, is_pp_shared, n_gpu_layers, mmq, ctx_params.n_threads, ctx_params.n_threads_batch); + LOG_TEE("%s: n_kv_max = %d, is_pp_shared = %d, n_gpu_layers = %d, n_threads = %u, n_threads_batch = %u\n", __func__, n_kv_max, is_pp_shared, n_gpu_layers, ctx_params.n_threads, ctx_params.n_threads_batch); LOG_TEE("\n"); LOG_TEE("|%6s | %6s | %4s | %6s | %8s | %8s | %8s | %8s | %8s | %8s |\n", "PP", "TG", "B", "N_KV", "T_PP s", "S_PP t/s", "T_TG s", "S_TG t/s", "T s", "S t/s"); diff --git a/examples/llama-bench/README.md b/examples/llama-bench/README.md index 374e40a7d..10f37b441 100644 --- a/examples/llama-bench/README.md +++ b/examples/llama-bench/README.md @@ -35,7 +35,6 @@ options: -mg, --main-gpu (default: 0) -nkvo, --no-kv-offload <0|1> (default: 0) -mmp, --mmap <0|1> (default: 1) - -mmq, --mul-mat-q <0|1> (default: 1) -ts, --tensor_split (default: 0) -r, --repetitions (default: 5) -o, --output (default: md) diff --git a/examples/llama-bench/llama-bench.cpp b/examples/llama-bench/llama-bench.cpp index 8fec3d43d..c2155b2ac 100644 --- a/examples/llama-bench/llama-bench.cpp +++ b/examples/llama-bench/llama-bench.cpp @@ -176,7 +176,6 @@ struct cmd_params { std::vector split_mode; std::vector main_gpu; std::vector no_kv_offload; - std::vector mul_mat_q; std::vector> tensor_split; std::vector use_mmap; int reps; @@ -196,7 +195,6 @@ static const cmd_params cmd_params_defaults = { /* split_mode */ {LLAMA_SPLIT_MODE_LAYER}, /* main_gpu */ {0}, /* no_kv_offload */ {false}, - /* mul_mat_q */ {true}, /* tensor_split */ {std::vector(llama_max_devices(), 0.0f)}, /* use_mmap */ {true}, /* reps */ 5, @@ -221,7 +219,6 @@ static void print_usage(int /* argc */, char ** argv) { printf(" -mg, --main-gpu (default: %s)\n", join(cmd_params_defaults.main_gpu, ",").c_str()); printf(" -nkvo, --no-kv-offload <0|1> (default: %s)\n", join(cmd_params_defaults.no_kv_offload, ",").c_str()); printf(" -mmp, --mmap <0|1> (default: %s)\n", join(cmd_params_defaults.use_mmap, ",").c_str()); - printf(" -mmq, --mul-mat-q <0|1> (default: %s)\n", join(cmd_params_defaults.mul_mat_q, ",").c_str()); printf(" -ts, --tensor_split (default: 0)\n"); printf(" -r, --repetitions (default: %d)\n", cmd_params_defaults.reps); printf(" -o, --output (default: %s)\n", output_format_str(cmd_params_defaults.output_format)); @@ -383,13 +380,6 @@ static cmd_params parse_cmd_params(int argc, char ** argv) { } auto p = split(argv[i], split_delim); params.no_kv_offload.insert(params.no_kv_offload.end(), p.begin(), p.end()); - } else if (arg == "-mmq" || arg == "--mul-mat-q") { - if (++i >= argc) { - invalid_param = true; - break; - } - auto p = split(argv[i], split_delim); - params.mul_mat_q.insert(params.mul_mat_q.end(), p.begin(), p.end()); } else if (arg == "-mmp" || arg == "--mmap") { if (++i >= argc) { invalid_param = true; @@ -466,7 +456,6 @@ static cmd_params parse_cmd_params(int argc, char ** argv) { if (params.split_mode.empty()) { params.split_mode = cmd_params_defaults.split_mode; } if (params.main_gpu.empty()) { params.main_gpu = cmd_params_defaults.main_gpu; } if (params.no_kv_offload.empty()){ params.no_kv_offload = cmd_params_defaults.no_kv_offload; } - if (params.mul_mat_q.empty()) { params.mul_mat_q = cmd_params_defaults.mul_mat_q; } if (params.tensor_split.empty()) { params.tensor_split = 
cmd_params_defaults.tensor_split; } if (params.use_mmap.empty()) { params.use_mmap = cmd_params_defaults.use_mmap; } if (params.n_threads.empty()) { params.n_threads = cmd_params_defaults.n_threads; } @@ -486,7 +475,6 @@ struct cmd_params_instance { llama_split_mode split_mode; int main_gpu; bool no_kv_offload; - bool mul_mat_q; std::vector tensor_split; bool use_mmap; @@ -518,7 +506,6 @@ struct cmd_params_instance { cparams.n_batch = n_batch; cparams.type_k = type_k; cparams.type_v = type_v; - cparams.mul_mat_q = mul_mat_q; cparams.offload_kqv = !no_kv_offload; return cparams; @@ -538,7 +525,6 @@ static std::vector get_cmd_params_instances(const cmd_param for (const auto & nb : params.n_batch) for (const auto & tk : params.type_k) for (const auto & tv : params.type_v) - for (const auto & mmq : params.mul_mat_q) for (const auto & nkvo : params.no_kv_offload) for (const auto & nt : params.n_threads) { for (const auto & n_prompt : params.n_prompt) { @@ -557,7 +543,6 @@ static std::vector get_cmd_params_instances(const cmd_param /* .split_mode = */ sm, /* .main_gpu = */ mg, /* .no_kv_offload= */ nkvo, - /* .mul_mat_q = */ mmq, /* .tensor_split = */ ts, /* .use_mmap = */ mmp, }; @@ -580,7 +565,6 @@ static std::vector get_cmd_params_instances(const cmd_param /* .split_mode = */ sm, /* .main_gpu = */ mg, /* .no_kv_offload= */ nkvo, - /* .mul_mat_q = */ mmq, /* .tensor_split = */ ts, /* .use_mmap = */ mmp, }; @@ -616,7 +600,6 @@ struct test { llama_split_mode split_mode; int main_gpu; bool no_kv_offload; - bool mul_mat_q; std::vector tensor_split; bool use_mmap; int n_prompt; @@ -639,7 +622,6 @@ struct test { split_mode = inst.split_mode; main_gpu = inst.main_gpu; no_kv_offload = inst.no_kv_offload; - mul_mat_q = inst.mul_mat_q; tensor_split = inst.tensor_split; use_mmap = inst.use_mmap; n_prompt = inst.n_prompt; @@ -713,7 +695,7 @@ struct test { "n_batch", "n_threads", "type_k", "type_v", "n_gpu_layers", "split_mode", "main_gpu", "no_kv_offload", - "mul_mat_q", "tensor_split", "use_mmap", + "tensor_split", "use_mmap", "n_prompt", "n_gen", "test_time", "avg_ns", "stddev_ns", "avg_ts", "stddev_ts" @@ -733,7 +715,7 @@ struct test { } if (field == "cuda" || field == "opencl" || field == "vulkan" || field == "kompute" || field == "metal" || field == "gpu_blas" || field == "blas" || field == "sycl" ||field == "f16_kv" || field == "no_kv_offload" || - field == "mul_mat_q" || field == "use_mmap") { + field == "use_mmap") { return BOOL; } if (field == "avg_ts" || field == "stddev_ts") { @@ -767,7 +749,7 @@ struct test { std::to_string(n_batch), std::to_string(n_threads), ggml_type_name(type_k), ggml_type_name(type_v), std::to_string(n_gpu_layers), split_mode_str(split_mode), std::to_string(main_gpu), std::to_string(no_kv_offload), - std::to_string(mul_mat_q), tensor_split_str, std::to_string(use_mmap), + tensor_split_str, std::to_string(use_mmap), std::to_string(n_prompt), std::to_string(n_gen), test_time, std::to_string(avg_ns()), std::to_string(stdev_ns()), std::to_string(avg_ts()), std::to_string(stdev_ts()) @@ -931,9 +913,6 @@ struct markdown_printer : public printer { if (field == "n_threads") { return "threads"; } - if (field == "mul_mat_q") { - return "mmq"; - } if (field == "no_kv_offload") { return "nkvo"; } @@ -974,9 +953,6 @@ struct markdown_printer : public printer { if (params.split_mode.size() > 1 || params.split_mode != cmd_params_defaults.split_mode) { fields.emplace_back("split_mode"); } - if (params.mul_mat_q.size() > 1 || params.mul_mat_q != cmd_params_defaults.mul_mat_q) { - 
fields.emplace_back("mul_mat_q"); - } if (params.no_kv_offload.size() > 1 || params.no_kv_offload != cmd_params_defaults.no_kv_offload) { fields.emplace_back("no_kv_offload"); } diff --git a/examples/server/server.cpp b/examples/server/server.cpp index eea987966..2b2f4a0f4 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -2390,14 +2390,6 @@ static void server_params_parse(int argc, char **argv, server_params &sparams, } #else LOG_WARNING("llama.cpp was compiled without cuBLAS. It is not possible to set a tensor split.\n", {}); -#endif // GGML_USE_CUBLAS - } - else if (arg == "--no-mul-mat-q" || arg == "-nommq") - { -#if defined(GGML_USE_CUBLAS) || defined(GGML_USE_SYCL) - params.mul_mat_q = false; -#else - LOG_WARNING("warning: llama.cpp was compiled without cuBLAS. Disabling mul_mat_q kernels has no effect.\n", {}); #endif // GGML_USE_CUBLAS } else if (arg == "--main-gpu" || arg == "-mg") diff --git a/llama.cpp b/llama.cpp index a35f07aa4..073fd3b70 100644 --- a/llama.cpp +++ b/llama.cpp @@ -1645,7 +1645,6 @@ struct llama_cparams { float yarn_beta_slow; float defrag_thold; - bool mul_mat_q; bool offload_kqv; bool do_pooling; @@ -11633,7 +11632,6 @@ struct llama_context_params llama_context_default_params() { /*.cb_eval_user_data =*/ nullptr, /*.type_k =*/ GGML_TYPE_F16, /*.type_v =*/ GGML_TYPE_F16, - /*.mul_mat_q =*/ true, /*.logits_all =*/ false, /*.embedding =*/ false, /*.offload_kqv =*/ true, @@ -11785,7 +11783,6 @@ struct llama_context * llama_new_context_with_model( cparams.yarn_beta_fast = params.yarn_beta_fast; cparams.yarn_beta_slow = params.yarn_beta_slow; cparams.defrag_thold = params.defrag_thold; - cparams.mul_mat_q = params.mul_mat_q; cparams.offload_kqv = params.offload_kqv; cparams.do_pooling = params.do_pooling; diff --git a/llama.h b/llama.h index 4d0ebe37d..ed51f478a 100644 --- a/llama.h +++ b/llama.h @@ -255,7 +255,6 @@ extern "C" { enum ggml_type type_v; // data type for V cache // Keep the booleans together to avoid misalignment during copy-by-value. - bool mul_mat_q; // if true, use experimental mul_mat_q kernels (DEPRECATED - always true) bool logits_all; // the llama_eval() call computes all logits, not just the last one (DEPRECATED - set llama_batch.logits instead) bool embedding; // embedding mode only bool offload_kqv; // whether to offload the KQV ops (including the KV cache) to GPU diff --git a/scripts/compare-llama-bench.py b/scripts/compare-llama-bench.py index 70737f976..39c3e52e5 100755 --- a/scripts/compare-llama-bench.py +++ b/scripts/compare-llama-bench.py @@ -31,7 +31,7 @@ PRETTY_NAMES = { "model_size": "Model Size [GiB]", "model_n_params": "Num. of Parameters", "n_batch": "Batch size", "n_threads": "Threads", "type_k": "K type", "type_v": "V type", "n_gpu_layers": "GPU layers", "main_gpu": "Main GPU", "no_kv_offload": "NKVO", - "mul_mat_q": "MMQ", "tensor_split": "Tensor split" + "tensor_split": "Tensor split" } DEFAULT_SHOW = ["model_type"] # Always show these properties by default. 
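For callers of the C API, the practical effect of this patch is that `llama_context_params` no longer carries a `mul_mat_q` field, so existing code simply drops that assignment. Below is a minimal sketch of context setup after the change, not a definitive implementation: it mirrors the fields used in the batched-bench diff above, and the model path is a placeholder.

```cpp
// Minimal sketch of creating a context once mul_mat_q is gone.
// "model.gguf" is a placeholder path; seed/n_ctx/n_batch mirror the
// batched-bench example in the diff above.
#include "llama.h"

int main() {
    llama_backend_init();

    llama_model_params mparams = llama_model_default_params();
    llama_model * model = llama_load_model_from_file("model.gguf", mparams);
    if (model == nullptr) {
        return 1;
    }

    llama_context_params cparams = llama_context_default_params();
    cparams.seed    = 1234;
    cparams.n_ctx   = 2048;
    cparams.n_batch = 512;
    // cparams.mul_mat_q = true;  // field removed by this patch - delete such lines

    llama_context * ctx = llama_new_context_with_model(model, cparams);
    if (ctx == nullptr) {
        llama_free_model(model);
        return 1;
    }

    // ... run inference ...

    llama_free(ctx);
    llama_free_model(model);
    llama_backend_free();
    return 0;
}
```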
From f49a5356865ced0eca1df9f9d84631dfef71b9dc Mon Sep 17 00:00:00 2001 From: Miwa / Ensan <63481257+ensan-hcl@users.noreply.github.com> Date: Fri, 1 Mar 2024 22:48:56 +0900 Subject: [PATCH 09/26] common : fix flag `--logits-all` to `--all-logits` (#5805) --- common/common.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/common/common.cpp b/common/common.cpp index bf1ed8a66..938c428cf 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -1015,7 +1015,7 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) { printf(" --ignore-eos ignore end of stream token and continue generating (implies --logit-bias 2-inf)\n"); printf(" --no-penalize-nl do not penalize newline token\n"); printf(" --temp N temperature (default: %.1f)\n", (double)sparams.temp); - printf(" --logits-all return logits for all tokens in the batch (default: disabled)\n"); + printf(" --all-logits return logits for all tokens in the batch (default: disabled)\n"); printf(" --hellaswag compute HellaSwag score over random tasks from datafile supplied with -f\n"); printf(" --hellaswag-tasks N number of tasks to use when computing the HellaSwag score (default: %zu)\n", params.hellaswag_tasks); printf(" --winogrande compute Winogrande score over random tasks from datafile supplied with -f\n"); From e7433867288d2f142cffe596f3751bda5d7ee2c7 Mon Sep 17 00:00:00 2001 From: kunal-vaishnavi <115581922+kunal-vaishnavi@users.noreply.github.com> Date: Fri, 1 Mar 2024 06:08:08 -0800 Subject: [PATCH 10/26] gemma : fix bfloat16 -> float16 conversion issue (#5810) --- convert-hf-to-gguf.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/convert-hf-to-gguf.py b/convert-hf-to-gguf.py index ae30b2a76..d3e8ec1f6 100755 --- a/convert-hf-to-gguf.py +++ b/convert-hf-to-gguf.py @@ -1811,16 +1811,15 @@ class GemmaModel(Model): tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count) for name, data_torch in self.get_tensors(): - # ref: https://github.com/huggingface/transformers/blob/fc37f38915372c15992b540dfcbbe00a916d4fc6/src/transformers/models/gemma/modeling_gemma.py#L89 - if name.endswith("norm.weight"): - data_torch = data_torch + 1 - old_dtype = data_torch.dtype # convert any unsupported data types to float32 if data_torch.dtype not in (torch.float16, torch.float32): data_torch = data_torch.to(torch.float32) + # ref: https://github.com/huggingface/transformers/blob/fc37f38915372c15992b540dfcbbe00a916d4fc6/src/transformers/models/gemma/modeling_gemma.py#L89 + if name.endswith("norm.weight"): + data_torch = data_torch + 1 data = data_torch.squeeze().numpy() # map tensor names From c2224f003bf9cf558b1a3c57033563e11a4de9a5 Mon Sep 17 00:00:00 2001 From: ddpasa <112642920+ddpasa@users.noreply.github.com> Date: Fri, 1 Mar 2024 18:00:00 +0100 Subject: [PATCH 11/26] ggml-vulkan: fix VULKAN_CHECK_RESULTS flag, which was previously broken (#5813) --- ggml-vulkan.cpp | 34 ++++++++++++++++++---------------- 1 file changed, 18 insertions(+), 16 deletions(-) diff --git a/ggml-vulkan.cpp b/ggml-vulkan.cpp index 896c290b2..ae9cb3c1c 100644 --- a/ggml-vulkan.cpp +++ b/ggml-vulkan.cpp @@ -5428,7 +5428,8 @@ static void ggml_vk_print_tensor(ggml_backend_vk_context * ctx, const ggml_tenso ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) tensor->extra; - ggml_vk_buffer_read(ctx, extra->buffer_gpu, extra->offset, tensor_data, tensor_size); + vk_buffer buffer_gpu = extra->buffer_gpu.lock(); + ggml_vk_buffer_read(ctx, buffer_gpu, extra->offset, tensor_data, tensor_size); } std::cerr 
<< "TENSOR CHECK " << name << " (" << tensor->name << "): " << ggml_op_name(tensor->op) << std::endl; @@ -5540,7 +5541,8 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_compute_ for (int i3 = 0; i3 < src0->ne[3]; i3++) { for (int i2 = 0; i2 < src0->ne[2]; i2++) { const int idx = i3*src0->ne[2] + i2; - ggml_vk_buffer_read(ctx, extra->buffer_gpu, offset + idx * src0->nb[2], ((char *)src0_clone->data + idx * src0_clone->nb[2]), src0->ne[1] * src0->nb[1]); + vk_buffer buffer_gpu = extra->buffer_gpu.lock(); + ggml_vk_buffer_read(ctx, buffer_gpu, offset + idx * src0->nb[2], ((char *)src0_clone->data + idx * src0_clone->nb[2]), src0->ne[1] * src0->nb[1]); } } @@ -5550,10 +5552,11 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_compute_ src0_clone->nb[i] = src0_clone->nb[i - 1]*src0_clone->ne[i - 1]; } } else { - if (offset + src0_size >= extra->buffer_gpu->size) { - src0_size = extra->buffer_gpu->size - offset; + vk_buffer buffer_gpu = extra->buffer_gpu.lock(); + if (offset + src0_size >= buffer_gpu->size) { + src0_size = buffer_gpu->size - offset; } - ggml_vk_buffer_read(ctx, extra->buffer_gpu, offset, src0_clone->data, src0_size); + ggml_vk_buffer_read(ctx, buffer_gpu, offset, src0_clone->data, src0_size); memcpy(src0_clone->nb, src0->nb, sizeof(size_t) * GGML_MAX_DIMS); } } else { @@ -5583,7 +5586,8 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_compute_ for (int i3 = 0; i3 < src1->ne[3]; i3++) { for (int i2 = 0; i2 < src1->ne[2]; i2++) { const int idx = i3*src1->ne[2] + i2; - ggml_vk_buffer_read(ctx, extra->buffer_gpu, offset + idx * src1->nb[2], ((char *)src1_clone->data + idx * src1_clone->nb[2]), src1->ne[1] * src1->nb[1]); + vk_buffer buffer_gpu = extra->buffer_gpu.lock(); + ggml_vk_buffer_read(ctx, buffer_gpu, offset + idx * src1->nb[2], ((char *)src1_clone->data + idx * src1_clone->nb[2]), src1->ne[1] * src1->nb[1]); } } @@ -5593,10 +5597,11 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_compute_ src1_clone->nb[i] = src1_clone->nb[i - 1]*src1_clone->ne[i - 1]; } } else { - if (offset + src1_size >= extra->buffer_gpu->size) { - src1_size = extra->buffer_gpu->size - offset; + vk_buffer buffer_gpu = extra->buffer_gpu.lock(); + if (offset + src1_size >= buffer_gpu->size) { + src1_size = buffer_gpu->size - offset; } - ggml_vk_buffer_read(ctx, extra->buffer_gpu, offset, src1_clone->data, src1_size); + ggml_vk_buffer_read(ctx, buffer_gpu, offset, src1_clone->data, src1_size); memcpy(src1_clone->nb, src1->nb, sizeof(size_t) * GGML_MAX_DIMS); } } else { @@ -5643,11 +5648,7 @@ static void ggml_vk_check_results_0(ggml_backend_vk_context * ctx, ggml_compute_ } else if (tensor->op == GGML_OP_RMS_NORM) { tensor_clone = ggml_rms_norm(ggml_ctx, src0_clone, *(float *)tensor->op_params); } else if (tensor->op == GGML_OP_SOFT_MAX) { - if (src1 != nullptr) { - tensor_clone = ggml_soft_max_ext(ggml_ctx, src0_clone, src1_clone, *(float *)tensor->op_params); - } else { tensor_clone = ggml_soft_max(ggml_ctx, src0_clone); - } } else if (tensor->op == GGML_OP_DIAG_MASK_INF) { tensor_clone = ggml_diag_mask_inf(ggml_ctx, src0_clone, *(float *)tensor->op_params); } else if (tensor->op == GGML_OP_ROPE) { @@ -5753,11 +5754,12 @@ static void ggml_vk_check_results_1(ggml_backend_vk_context * ctx, ggml_compute_ ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) tensor->extra; - if (extra->offset + tensor_size >= extra->buffer_gpu->size) { - tensor_size = extra->buffer_gpu->size - (extra->offset); + 
vk_buffer buffer_gpu = extra->buffer_gpu.lock(); + if (extra->offset + tensor_size >= buffer_gpu->size) { + tensor_size = buffer_gpu->size - (extra->offset); } - ggml_vk_buffer_read(ctx, extra->buffer_gpu, extra->offset, tensor_data, tensor_size); + ggml_vk_buffer_read(ctx, buffer_gpu, extra->offset, tensor_data, tensor_size); } float first_error_result = -1.0f; From 38d16b142624bdd7c41d9955752b7f7b59c5e048 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Fri, 1 Mar 2024 20:00:58 +0200 Subject: [PATCH 12/26] server : remove api_like_OAI.py proxy script (#5808) --- README.md | 1 + examples/server/README.md | 17 +-- examples/server/api_like_OAI.py | 228 -------------------------------- 3 files changed, 3 insertions(+), 243 deletions(-) delete mode 100755 examples/server/api_like_OAI.py diff --git a/README.md b/README.md index 5401e197f..67717c1e3 100644 --- a/README.md +++ b/README.md @@ -10,6 +10,7 @@ Inference of Meta's [LLaMA](https://arxiv.org/abs/2302.13971) model (and others) ### Hot topics +- The `api_like_OAI.py` script has been removed - use `server` instead ([#5766](https://github.com/ggerganov/llama.cpp/issues/5766#issuecomment-1969037761)) - Support for chat templates: [Wiki (contributions welcome)](https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template) - Support for Gemma models: https://github.com/ggerganov/llama.cpp/pull/5631 - Non-linear quantization IQ4_NL: https://github.com/ggerganov/llama.cpp/pull/5590 diff --git a/examples/server/README.md b/examples/server/README.md index ad35306c6..397ee8252 100644 --- a/examples/server/README.md +++ b/examples/server/README.md @@ -326,7 +326,7 @@ Notice that each `probs` is an array of length `n_probs`. - `default_generation_settings` - the default generation settings for the `/completion` endpoint, has the same fields as the `generation_settings` response object from the `/completion` endpoint. - `total_slots` - the total number of slots for process requests (defined by `--parallel` option) -- **POST** `/v1/chat/completions`: OpenAI-compatible Chat Completions API. Given a ChatML-formatted json description in `messages`, it returns the predicted completion. Both synchronous and streaming mode are supported, so scripted and interactive applications work fine. While no strong claims of compatibility with OpenAI API spec is being made, in our experience it suffices to support many apps. Only ChatML-tuned models, such as Dolphin, OpenOrca, OpenHermes, OpenChat-3.5, etc can be used with this endpoint. Compared to `api_like_OAI.py` this API implementation does not require a wrapper to be served. +- **POST** `/v1/chat/completions`: OpenAI-compatible Chat Completions API. Given a ChatML-formatted json description in `messages`, it returns the predicted completion. Both synchronous and streaming mode are supported, so scripted and interactive applications work fine. While no strong claims of compatibility with OpenAI API spec is being made, in our experience it suffices to support many apps. Only ChatML-tuned models, such as Dolphin, OpenOrca, OpenHermes, OpenChat-3.5, etc can be used with this endpoint. *Options:* @@ -528,20 +528,7 @@ bash chat.sh ### API like OAI -API example using Python Flask: [api_like_OAI.py](api_like_OAI.py) -This example must be used with server.cpp - -```sh -python api_like_OAI.py -``` - -After running the API server, you can use it in Python by setting the API base URL. 
- -```python -openai.api_base = "http://:port" -``` - -Then you can utilize llama.cpp as an OpenAI's **chat.completion** or **text_completion** API +The HTTP server supports OAI-like API ### Extending or building alternative Web Front End diff --git a/examples/server/api_like_OAI.py b/examples/server/api_like_OAI.py deleted file mode 100755 index 607fe49d3..000000000 --- a/examples/server/api_like_OAI.py +++ /dev/null @@ -1,228 +0,0 @@ -#!/usr/bin/env python3 -import argparse -from flask import Flask, jsonify, request, Response -import urllib.parse -import requests -import time -import json - - -app = Flask(__name__) -slot_id = -1 - -parser = argparse.ArgumentParser(description="An example of using server.cpp with a similar API to OAI. It must be used together with server.cpp.") -parser.add_argument("--chat-prompt", type=str, help="the top prompt in chat completions(default: 'A chat between a curious user and an artificial intelligence assistant. The assistant follows the given rules no matter what.')", default='A chat between a curious user and an artificial intelligence assistant. The assistant follows the given rules no matter what.') -parser.add_argument("--user-name", type=str, help="USER name in chat completions(default: 'USER: ')", default="USER: ") -parser.add_argument("--ai-name", type=str, help="ASSISTANT name in chat completions(default: 'ASSISTANT: ')", default="ASSISTANT: ") -parser.add_argument("--system-name", type=str, help="SYSTEM name in chat completions(default: 'ASSISTANT's RULE: ')", default="ASSISTANT's RULE: ") -parser.add_argument("--stop", type=str, help="the end of response in chat completions(default: '')", default="") -parser.add_argument("--llama-api", type=str, help="Set the address of server.cpp in llama.cpp(default: http://127.0.0.1:8080)", default='http://127.0.0.1:8080') -parser.add_argument("--api-key", type=str, help="Set the api key to allow only few user(default: NULL)", default="") -parser.add_argument("--host", type=str, help="Set the ip address to listen.(default: 127.0.0.1)", default='127.0.0.1') -parser.add_argument("--port", type=int, help="Set the port to listen.(default: 8081)", default=8081) - -args = parser.parse_args() - -def is_present(json, key): - try: - buf = json[key] - except KeyError: - return False - if json[key] == None: - return False - return True - -#convert chat to prompt -def convert_chat(messages): - - system_n = args.system_name - user_n = args.user_name - ai_n = args.ai_name - stop = args.stop - - prompt = "" + args.chat_prompt + stop - - for line in messages: - if (line["role"] == "system"): - prompt += f"{system_n}{line['content']}{stop}" - if (line["role"] == "user"): - prompt += f"{user_n}{line['content']}{stop}" - if (line["role"] == "assistant"): - prompt += f"{ai_n}{line['content']}{stop}" - prompt += ai_n.rstrip() - - return prompt - -def make_postData(body, chat=False, stream=False): - postData = {} - if (chat): - postData["prompt"] = convert_chat(body["messages"]) - else: - postData["prompt"] = body["prompt"] - if(is_present(body, "temperature")): postData["temperature"] = body["temperature"] - if(is_present(body, "top_k")): postData["top_k"] = body["top_k"] - if(is_present(body, "top_p")): postData["top_p"] = body["top_p"] - if(is_present(body, "max_tokens")): postData["n_predict"] = body["max_tokens"] - if(is_present(body, "presence_penalty")): postData["presence_penalty"] = body["presence_penalty"] - if(is_present(body, "frequency_penalty")): postData["frequency_penalty"] = body["frequency_penalty"] - 
if(is_present(body, "repeat_penalty")): postData["repeat_penalty"] = body["repeat_penalty"] - if(is_present(body, "mirostat")): postData["mirostat"] = body["mirostat"] - if(is_present(body, "mirostat_tau")): postData["mirostat_tau"] = body["mirostat_tau"] - if(is_present(body, "mirostat_eta")): postData["mirostat_eta"] = body["mirostat_eta"] - if(is_present(body, "seed")): postData["seed"] = body["seed"] - if(is_present(body, "grammar")): postData["grammar"] = body["grammar"] - if(is_present(body, "logit_bias")): postData["logit_bias"] = [[int(token), body["logit_bias"][token]] for token in body["logit_bias"].keys()] - if (args.stop != ""): - postData["stop"] = [args.stop] - else: - postData["stop"] = [] - if(is_present(body, "stop")): postData["stop"] += body["stop"] - postData["n_keep"] = -1 - postData["stream"] = stream - postData["cache_prompt"] = True - postData["slot_id"] = slot_id - return postData - -def make_resData(data, chat=False, promptToken=[]): - resData = { - "id": "chatcmpl" if (chat) else "cmpl", - "object": "chat.completion" if (chat) else "text_completion", - "created": int(time.time()), - "truncated": data["truncated"], - "model": "LLaMA_CPP", - "usage": { - "prompt_tokens": data["tokens_evaluated"], - "completion_tokens": data["tokens_predicted"], - "total_tokens": data["tokens_evaluated"] + data["tokens_predicted"] - } - } - if (len(promptToken) != 0): - resData["promptToken"] = promptToken - if (chat): - #only one choice is supported - resData["choices"] = [{ - "index": 0, - "message": { - "role": "assistant", - "content": data["content"], - }, - "finish_reason": "stop" if (data["stopped_eos"] or data["stopped_word"]) else "length" - }] - else: - #only one choice is supported - resData["choices"] = [{ - "text": data["content"], - "index": 0, - "logprobs": None, - "finish_reason": "stop" if (data["stopped_eos"] or data["stopped_word"]) else "length" - }] - return resData - -def make_resData_stream(data, chat=False, time_now = 0, start=False): - resData = { - "id": "chatcmpl" if (chat) else "cmpl", - "object": "chat.completion.chunk" if (chat) else "text_completion.chunk", - "created": time_now, - "model": "LLaMA_CPP", - "choices": [ - { - "finish_reason": None, - "index": 0 - } - ] - } - slot_id = data.get("slot_id") - if (chat): - if (start): - resData["choices"][0]["delta"] = { - "role": "assistant" - } - else: - resData["choices"][0]["delta"] = { - "content": data["content"] - } - if (data["stop"]): - resData["choices"][0]["finish_reason"] = "stop" if (data["stopped_eos"] or data["stopped_word"]) else "length" - else: - resData["choices"][0]["text"] = data["content"] - if (data["stop"]): - resData["choices"][0]["finish_reason"] = "stop" if (data["stopped_eos"] or data["stopped_word"]) else "length" - - return resData - - -@app.route('/chat/completions', methods=['POST', 'OPTIONS']) -@app.route('/v1/chat/completions', methods=['POST', 'OPTIONS']) -def chat_completions(): - if (args.api_key != "" and request.headers["Authorization"].split()[1] != args.api_key): - return Response(status=403) - if request.method == 'OPTIONS': - return Response(headers={"Access-Control-Allow-Origin": "*", "Access-Control-Allow-Headers": "*"}) - body = request.get_json() - stream = False - tokenize = False - if(is_present(body, "stream")): stream = body["stream"] - if(is_present(body, "tokenize")): tokenize = body["tokenize"] - postData = make_postData(body, chat=True, stream=stream) - - promptToken = [] - if (tokenize): - tokenData = requests.request("POST", 
urllib.parse.urljoin(args.llama_api, "/tokenize"), data=json.dumps({"content": postData["prompt"]})).json() - promptToken = tokenData["tokens"] - - if (not stream): - data = requests.request("POST", urllib.parse.urljoin(args.llama_api, "/completion"), data=json.dumps(postData)) - print(data.json()) - resData = make_resData(data.json(), chat=True, promptToken=promptToken) - return jsonify(resData) - else: - def generate(): - data = requests.request("POST", urllib.parse.urljoin(args.llama_api, "/completion"), data=json.dumps(postData), stream=True) - time_now = int(time.time()) - resData = make_resData_stream({}, chat=True, time_now=time_now, start=True) - yield 'data: {}\n\n'.format(json.dumps(resData)) - for line in data.iter_lines(): - if line: - decoded_line = line.decode('utf-8') - resData = make_resData_stream(json.loads(decoded_line[6:]), chat=True, time_now=time_now) - yield 'data: {}\n\n'.format(json.dumps(resData)) - return Response(generate(), mimetype='text/event-stream', headers={"Access-Control-Allow-Origin": "*", "Access-Control-Allow-Headers": "*"}) - - -@app.route('/completions', methods=['POST', 'OPTIONS']) -@app.route('/v1/completions', methods=['POST', 'OPTIONS']) -def completion(): - if (args.api_key != "" and request.headers["Authorization"].split()[1] != args.api_key): - return Response(status=403) - if request.method == 'OPTIONS': - return Response(headers={"Access-Control-Allow-Origin": "*", "Access-Control-Allow-Headers": "*"}) - body = request.get_json() - stream = False - tokenize = False - if(is_present(body, "stream")): stream = body["stream"] - if(is_present(body, "tokenize")): tokenize = body["tokenize"] - postData = make_postData(body, chat=False, stream=stream) - - promptToken = [] - if (tokenize): - tokenData = requests.request("POST", urllib.parse.urljoin(args.llama_api, "/tokenize"), data=json.dumps({"content": postData["prompt"]})).json() - promptToken = tokenData["tokens"] - - if (not stream): - data = requests.request("POST", urllib.parse.urljoin(args.llama_api, "/completion"), data=json.dumps(postData)) - print(data.json()) - resData = make_resData(data.json(), chat=False, promptToken=promptToken) - return jsonify(resData) - else: - def generate(): - data = requests.request("POST", urllib.parse.urljoin(args.llama_api, "/completion"), data=json.dumps(postData), stream=True) - time_now = int(time.time()) - for line in data.iter_lines(): - if line: - decoded_line = line.decode('utf-8') - resData = make_resData_stream(json.loads(decoded_line[6:]), chat=False, time_now=time_now) - yield 'data: {}\n\n'.format(json.dumps(resData)) - return Response(generate(), mimetype='text/event-stream', headers={"Access-Control-Allow-Origin": "*", "Access-Control-Allow-Headers": "*"}) - -if __name__ == '__main__': - app.run(args.host, port=args.port) From c29af7e2252d288f2ea58a7d437c1cb7c0abf160 Mon Sep 17 00:00:00 2001 From: Sourab Mangrulkar <13534540+pacman100@users.noreply.github.com> Date: Sat, 2 Mar 2024 01:00:46 +0530 Subject: [PATCH 13/26] llama : add StarCoder2 support (#5795) * Add support for starcoder2 * handle rope type * skip rope freq and rotary embeddings from being serialized * resolve comments * Update llama.cpp * remove redundant changes * handle `rope-theta` * llama : change starcoder2 rope type * address comment --------- Co-authored-by: Georgi Gerganov --- convert-hf-to-gguf.py | 8 +- gguf-py/gguf/constants.py | 21 ++++ gguf-py/gguf/tensor_mapping.py | 2 + llama.cpp | 199 +++++++++++++++++++++++++++++++++ 4 files changed, 229 insertions(+), 1 
deletion(-) diff --git a/convert-hf-to-gguf.py b/convert-hf-to-gguf.py index d3e8ec1f6..28b92ac38 100755 --- a/convert-hf-to-gguf.py +++ b/convert-hf-to-gguf.py @@ -96,9 +96,11 @@ class Model: if (n_head_kv := self.hparams.get("num_key_value_heads")) is not None: self.gguf_writer.add_head_count_kv(n_head_kv) + if (rope_theta := self.hparams.get("rope_theta")) is not None: + self.gguf_writer.add_rope_freq_base(rope_theta) if (f_rms_eps := self.hparams.get("rms_norm_eps")) is not None: self.gguf_writer.add_layer_norm_rms_eps(f_rms_eps) - if (f_norm_eps := self.find_hparam(["layer_norm_eps", "layer_norm_epsilon"], optional=True)) is not None: + if (f_norm_eps := self.find_hparam(["layer_norm_eps", "layer_norm_epsilon", "norm_epsilon"], optional=True)) is not None: self.gguf_writer.add_layer_norm_eps(f_norm_eps) if (n_experts := self.hparams.get("num_local_experts")) is not None: self.gguf_writer.add_expert_count(n_experts) @@ -220,6 +222,8 @@ class Model: return NomicBertModel if model_architecture == "GemmaForCausalLM": return GemmaModel + if model_architecture == "Starcoder2ForCausalLM": + return Model return Model def _is_model_safetensors(self) -> bool: @@ -281,6 +285,8 @@ class Model: return gguf.MODEL_ARCH.NOMIC_BERT if arch == "GemmaForCausalLM": return gguf.MODEL_ARCH.GEMMA + if arch == "Starcoder2ForCausalLM": + return gguf.MODEL_ARCH.STARCODER2 raise NotImplementedError(f'Architecture "{arch}" not supported!') diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py index 8f9139d1b..5db760cb1 100644 --- a/gguf-py/gguf/constants.py +++ b/gguf-py/gguf/constants.py @@ -112,6 +112,7 @@ class MODEL_ARCH(IntEnum): INTERNLM2 = auto() MINICPM = auto() GEMMA = auto() + STARCODER2 = auto() class MODEL_TENSOR(IntEnum): @@ -169,6 +170,7 @@ MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = { MODEL_ARCH.INTERNLM2: "internlm2", MODEL_ARCH.MINICPM: "minicpm", MODEL_ARCH.GEMMA: "gemma", + MODEL_ARCH.STARCODER2: "starcoder2", } TENSOR_NAMES: dict[MODEL_TENSOR, str] = { @@ -526,6 +528,21 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = { MODEL_TENSOR.FFN_UP, MODEL_TENSOR.FFN_NORM, ], + MODEL_ARCH.STARCODER2: [ + MODEL_TENSOR.TOKEN_EMBD, + MODEL_TENSOR.OUTPUT_NORM, + MODEL_TENSOR.OUTPUT, + MODEL_TENSOR.ROPE_FREQS, + MODEL_TENSOR.ATTN_NORM, + MODEL_TENSOR.ATTN_Q, + MODEL_TENSOR.ATTN_K, + MODEL_TENSOR.ATTN_V, + MODEL_TENSOR.ATTN_OUT, + MODEL_TENSOR.ATTN_ROT_EMBD, + MODEL_TENSOR.FFN_NORM, + MODEL_TENSOR.FFN_DOWN, + MODEL_TENSOR.FFN_UP, + ], # TODO } @@ -554,6 +571,10 @@ MODEL_TENSOR_SKIP: dict[MODEL_ARCH, list[MODEL_TENSOR]] = { MODEL_TENSOR.ROPE_FREQS, MODEL_TENSOR.ATTN_ROT_EMBD, ], + MODEL_ARCH.STARCODER2: [ + MODEL_TENSOR.ROPE_FREQS, + MODEL_TENSOR.ATTN_ROT_EMBD, + ], } # diff --git a/gguf-py/gguf/tensor_mapping.py b/gguf-py/gguf/tensor_mapping.py index 861003776..db2ec9704 100644 --- a/gguf-py/gguf/tensor_mapping.py +++ b/gguf-py/gguf/tensor_mapping.py @@ -210,6 +210,7 @@ class TensorNameMap: "model.layers.layers.{bid}.mlp.up_proj", # plamo "model.layers.{bid}.feed_forward.w3", # internlm2 "encoder.layers.{bid}.mlp.fc11", # nomic-bert + "model.layers.{bid}.mlp.c_fc", # starcoder2 ), MODEL_TENSOR.FFN_UP_EXP: ( @@ -256,6 +257,7 @@ class TensorNameMap: "model.layers.layers.{bid}.mlp.down_proj", # plamo "model.layers.{bid}.feed_forward.w2", # internlm2 "encoder.layers.{bid}.mlp.fc2", # nomic-bert + "model.layers.{bid}.mlp.c_proj", # starcoder2 ), MODEL_TENSOR.FFN_DOWN_EXP: ( diff --git a/llama.cpp b/llama.cpp index 073fd3b70..b1db5b179 100644 --- a/llama.cpp +++ b/llama.cpp @@ -211,6 +211,7 @@ 
enum llm_arch { LLM_ARCH_INTERNLM2, LLM_ARCH_MINICPM, LLM_ARCH_GEMMA, + LLM_ARCH_STARCODER2, LLM_ARCH_UNKNOWN, }; @@ -238,6 +239,7 @@ static std::map LLM_ARCH_NAMES = { { LLM_ARCH_INTERNLM2, "internlm2" }, { LLM_ARCH_MINICPM, "minicpm" }, { LLM_ARCH_GEMMA, "gemma" }, + { LLM_ARCH_STARCODER2, "starcoder2" }, }; enum llm_kv { @@ -779,6 +781,24 @@ static std::map> LLM_TENSOR_NAMES = { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" }, }, }, + { + LLM_ARCH_STARCODER2, + { + { LLM_TENSOR_TOKEN_EMBD, "token_embd" }, + { LLM_TENSOR_OUTPUT_NORM, "output_norm" }, + { LLM_TENSOR_OUTPUT, "output" }, + { LLM_TENSOR_ROPE_FREQS, "rope_freqs" }, + { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" }, + { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" }, + { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" }, + { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" }, + { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" }, + { LLM_TENSOR_ATTN_ROT_EMBD, "blk.%d.attn_rot_embd" }, + { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" }, + { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" }, + { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" }, + }, + }, { LLM_ARCH_UNKNOWN, { @@ -3320,6 +3340,16 @@ static void llm_load_hparams( default: model.type = e_model::MODEL_UNKNOWN; } } break; + case LLM_ARCH_STARCODER2: + { + ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps); + switch (hparams.n_layer) { + case 30: model.type = e_model::MODEL_3B; break; + case 32: model.type = e_model::MODEL_7B; break; + case 40: model.type = e_model::MODEL_15B; break; + default: model.type = e_model::MODEL_UNKNOWN; + } + } break; default: (void)0; } @@ -4490,6 +4520,56 @@ static bool llm_load_tensors( layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}); } } break; + case LLM_ARCH_STARCODER2: + { + model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}); + + // output + { + model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}); + model.output_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}); + + model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, false); + // if output is NULL, init from the input tok embed + if (model.output == NULL) { + model.output = ml.create_tensor(ctx_output, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}); + ml.n_created--; // artificial tensor + ml.size_data += ggml_nbytes(model.output); + } + + } + + for (int i = 0; i < n_layer; ++i) { + ggml_context * ctx_layer = ctx_for_layer(i); + ggml_context * ctx_split = ctx_for_layer_split(i); + + auto & layer = model.layers[i]; + + layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}); + layer.attn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}); + + layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}); + layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}); + layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}); + layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}); + + // optional bias tensors + layer.bq = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}); + layer.bk = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}); + layer.bv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}); + layer.bo = 
ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}); + + layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}); + layer.ffn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}); + + layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}); + layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}); + + // optional bias tensors + layer.ffn_down_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}); + layer.ffn_up_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_UP , "bias", i), { n_ff}); + } + } break; default: throw std::runtime_error("unknown architecture"); } @@ -7559,6 +7639,120 @@ struct llm_build_context { return gf; } + + struct ggml_cgraph * build_starcoder2() { + struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false); + + const int64_t n_embd_head = hparams.n_embd_head_v; + GGML_ASSERT(n_embd_head == hparams.n_embd_head_k); + GGML_ASSERT(n_embd_head == hparams.n_rot); + + struct ggml_tensor * cur; + struct ggml_tensor * inpL; + + inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, lctx.inp_tokens, lctx.inp_embd, cb); + cb(inpL, "inp_embd", -1); + + // inp_pos - contains the positions + struct ggml_tensor * inp_pos = ggml_view_1d(ctx0, lctx.inp_pos, n_tokens, 0); + cb(inp_pos, "inp_pos", -1); + + // KQ_mask (mask for 1 head, it will be broadcasted to all heads) + struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0); + cb(KQ_mask, "KQ_mask", -1); + + for (int il = 0; il < n_layer; ++il) { + struct ggml_tensor * inpSA = inpL; + + // norm + cur = llm_build_norm(ctx0, inpL, hparams, + model.layers[il].attn_norm, model.layers[il].attn_norm_b, + LLM_NORM, cb, il); + cb(cur, "attn_norm", il); + + // self-attention + { + // compute Q and K and RoPE them + struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur); + cb(Qcur, "Qcur", il); + if (model.layers[il].bq) { + Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq); + cb(Qcur, "Qcur", il); + } + + struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur); + cb(Kcur, "Kcur", il); + if (model.layers[il].bk) { + Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk); + cb(Kcur, "Kcur", il); + } + + struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur); + cb(Vcur, "Vcur", il); + if (model.layers[il].bv) { + Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv); + cb(Vcur, "Vcur", il); + } + + Qcur = ggml_rope_custom( + ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos, + n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow + ); + cb(Qcur, "Qcur", il); + + Kcur = ggml_rope_custom( + ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos, + n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow + ); + cb(Kcur, "Kcur", il); + + cur = llm_build_kv(ctx0, model, hparams, kv_self, gf, + model.layers[il].wo, model.layers[il].bo, + Kcur, Vcur, Qcur, KQ_mask, nullptr, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il); + cb(cur, "kqv_out", il); + } + + struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); + cb(ffn_inp, "ffn_inp", il); + + // feed-forward network + + cur = llm_build_norm(ctx0, ffn_inp, hparams, + model.layers[il].ffn_norm, 
model.layers[il].ffn_norm_b, + LLM_NORM, cb, il); + cb(cur, "ffn_norm", il); + + cur = llm_build_ffn(ctx0, cur, + model.layers[il].ffn_up, model.layers[il].ffn_up_b, + NULL, NULL, + model.layers[il].ffn_down, model.layers[il].ffn_down_b, + NULL, + LLM_FFN_GELU, LLM_FFN_SEQ, cb, il); + cb(cur, "ffn_out", il); + cur = ggml_add(ctx0, cur, ffn_inp); + cb(cur, "l_out", il); + + // input for next layer + inpL = cur; + } + + cur = inpL; + + cur = llm_build_norm(ctx0, cur, hparams, + model.output_norm, model.output_norm_b, + LLM_NORM, cb, -1); + cb(cur, "result_norm", -1); + + // lm_head + cur = ggml_mul_mat(ctx0, model.output, cur); + cb(cur, "result_output", -1); + + ggml_build_forward_expand(gf, cur); + + return gf; + } }; static struct ggml_cgraph * llama_build_graph_defrag(llama_context & lctx, const std::vector & ids) { @@ -7705,6 +7899,10 @@ static struct ggml_cgraph * llama_build_graph( { result = llm.build_gemma(); } break; + case LLM_ARCH_STARCODER2: + { + result = llm.build_starcoder2(); + } break; default: GGML_ASSERT(false); } @@ -12084,6 +12282,7 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) { case LLM_ARCH_QWEN2: case LLM_ARCH_PHI2: case LLM_ARCH_GEMMA: + case LLM_ARCH_STARCODER2: return LLAMA_ROPE_TYPE_NEOX; // all model arches should be listed explicitly here From da3b9ba2b710c0f8b44398a0eb9e5a7ae2ad967a Mon Sep 17 00:00:00 2001 From: nold Date: Fri, 1 Mar 2024 22:51:12 +0100 Subject: [PATCH 14/26] convert-hf-to-gguf : require einops for InternLM2ForCausalLM (#5792) --- requirements/requirements-convert-hf-to-gguf.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/requirements/requirements-convert-hf-to-gguf.txt b/requirements/requirements-convert-hf-to-gguf.txt index 6ac402610..6ce840d73 100644 --- a/requirements/requirements-convert-hf-to-gguf.txt +++ b/requirements/requirements-convert-hf-to-gguf.txt @@ -1,2 +1,3 @@ -r ./requirements-convert.txt torch~=2.1.1 +einops~=0.7.0 From cb5e8f7fc4ee57d4bcccafbe04a82cededd35486 Mon Sep 17 00:00:00 2001 From: Tushar Date: Sat, 2 Mar 2024 04:48:26 +0530 Subject: [PATCH 15/26] build(nix): Introduce flake.formatter for `nix fmt` (#5687) * build(nix): Introduce flake.formatter for `nix fmt` * chore: Switch to pkgs.nixfmt-rfc-style --- .devops/nix/sif.nix | 2 +- flake.nix | 10 +++++++--- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/.devops/nix/sif.nix b/.devops/nix/sif.nix index 7535ca0f3..7a5e1dd0f 100644 --- a/.devops/nix/sif.nix +++ b/.devops/nix/sif.nix @@ -7,7 +7,7 @@ }: let - optionalInt = cond: x: if cond then x else 0; + optionalInt = cond: x: if cond then x else 0; in singularity-tools.buildImage rec { inherit (llama-cpp) name; diff --git a/flake.nix b/flake.nix index dc4e503c3..45f9deda0 100644 --- a/flake.nix +++ b/flake.nix @@ -107,11 +107,12 @@ # ``` # # Cf. https://nixos.org/manual/nix/unstable/command-ref/new-cli/nix3-flake.html?highlight=flake#flake-format - flake.overlays.default = - (final: prev: { + flake.overlays.default = ( + final: prev: { llamaPackages = final.callPackage .devops/nix/scope.nix { inherit llamaVersion; }; inherit (final.llamaPackages) llama-cpp; - }); + } + ); systems = [ "aarch64-darwin" @@ -131,6 +132,9 @@ ... }: { + # For standardised reproducible formatting with `nix fmt` + formatter = pkgs.nixfmt-rfc-style; + # Unlike `.#packages`, legacyPackages may contain values of # arbitrary types (including nested attrsets) and may even throw # exceptions. 
This attribute isn't recursed into by `nix flake From 9bf297a02bfbd474e51912409a470dd797e2fe13 Mon Sep 17 00:00:00 2001 From: crasm Date: Sat, 2 Mar 2024 00:11:06 -0500 Subject: [PATCH 16/26] workflows : remove nocleanup arg for check-requirements.sh (#5826) Reduces peak tmpfs usage and should prevent the check from failing from running out of space. Fixes the 'No space left on device' issue mentioned in #5703. --- .github/workflows/python-check-requirements.yml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/.github/workflows/python-check-requirements.yml b/.github/workflows/python-check-requirements.yml index 92e1108b3..b82205992 100644 --- a/.github/workflows/python-check-requirements.yml +++ b/.github/workflows/python-check-requirements.yml @@ -3,12 +3,14 @@ name: Python check requirements.txt on: push: paths: + - '.github/workflows/python-check-requirements.yml' - 'scripts/check-requirements.sh' - 'convert*.py' - 'requirements.txt' - 'requirements/*.txt' pull_request: paths: + - '.github/workflows/python-check-requirements.yml' - 'scripts/check-requirements.sh' - 'convert*.py' - 'requirements.txt' @@ -26,4 +28,4 @@ jobs: with: python-version: "3.11" - name: Run check-requirements.sh script - run: bash scripts/check-requirements.sh nocleanup + run: bash scripts/check-requirements.sh From 715641391dda1ff9762dc5d99d9a30acce99f2c6 Mon Sep 17 00:00:00 2001 From: Neo Zhang Jianyu Date: Sat, 2 Mar 2024 19:49:30 +0800 Subject: [PATCH 17/26] Support multiple GPUs (split mode) on SYCL backend (#5806) * suport multiple cards: split-mode - layer|row * rm warning * rebase with master, support tow new OPs, close feature for -sm=row, fix for unit test * update news * fix merge error * update according to review comments --- README-sycl.md | 21 + common/common.cpp | 4 + examples/llama-bench/llama-bench.cpp | 17 +- examples/sycl/ls-sycl-device.cpp | 2 +- examples/sycl/run-llama2.sh | 17 +- ggml-sycl.cpp | 2205 +++++++++++++++++--------- ggml-sycl.h | 5 + llama.cpp | 49 +- 8 files changed, 1506 insertions(+), 814 deletions(-) diff --git a/README-sycl.md b/README-sycl.md index dd5bf9dea..85eb16f2b 100644 --- a/README-sycl.md +++ b/README-sycl.md @@ -1,6 +1,7 @@ # llama.cpp for SYCL - [Background](#background) +- [News](#news) - [OS](#os) - [Intel GPU](#intel-gpu) - [Docker](#docker) @@ -25,6 +26,21 @@ The llama.cpp for SYCL is used to support Intel GPUs. For Intel CPU, recommend to use llama.cpp for X86 (Intel MKL building). +## News + +- 2024.3 + - Support multiple cards: **--split-mode**: [none|layer]; not support [row], it's on developing. + - Support to assign main GPU by **--main-gpu**, replace $GGML_SYCL_DEVICE. + - Support detecting all GPUs with level-zero and same top **Max compute units**. + - Support OPs + - hardsigmoid + - hardswish + - pool2d + +- 2024.1 + - Create SYCL backend for Intel GPU. + - Support Windows build + ## OS |OS|Status|Verified| @@ -449,6 +465,7 @@ Using device **0** (Intel(R) Arc(TM) A770 Graphics) as main device |-|-|-| |GGML_SYCL_DEVICE|0 (default) or 1|Set the device id used. Check the device ids by default running output| |GGML_SYCL_DEBUG|0 (default) or 1|Enable log function by macro: GGML_SYCL_DEBUG| +|ZES_ENABLE_SYSMAN| 0 (default) or 1|Support to get free memory of GPU by sycl::aspect::ext_intel_free_memory.
Recommended to use when --split-mode = layer| ## Known Issue @@ -458,6 +475,10 @@ Using device **0** (Intel(R) Arc(TM) A770 Graphics) as main device Solution: add **--no-mmap** or **--mmap 0**. +- Split-mode: [row] is not supported + + It's on developing. + ## Q&A - Error: `error while loading shared libraries: libsycl.so.7: cannot open shared object file: No such file or directory`. diff --git a/common/common.cpp b/common/common.cpp index 938c428cf..1c0b7c403 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -640,6 +640,10 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) { } else if (arg_next == "layer") { params.split_mode = LLAMA_SPLIT_MODE_LAYER; } else if (arg_next == "row") { +#ifdef GGML_USE_SYCL + fprintf(stderr, "warning: The split mode value:[row] is not supported by llama.cpp with SYCL. It's developing.\nExit!\n"); + exit(1); +#endif // GGML_USE_SYCL params.split_mode = LLAMA_SPLIT_MODE_ROW; } else { invalid_param = true; diff --git a/examples/llama-bench/llama-bench.cpp b/examples/llama-bench/llama-bench.cpp index c2155b2ac..aa79d002a 100644 --- a/examples/llama-bench/llama-bench.cpp +++ b/examples/llama-bench/llama-bench.cpp @@ -123,20 +123,15 @@ static std::string get_gpu_info() { } #endif #ifdef GGML_USE_SYCL - int device_list[GGML_SYCL_MAX_DEVICES]; - ggml_sycl_get_gpu_list(device_list, GGML_SYCL_MAX_DEVICES); - - for (int i = 0; i < GGML_SYCL_MAX_DEVICES; i++) { - if (device_list[i] >0 ){ - char buf[128]; - ggml_sycl_get_device_description(i, buf, sizeof(buf)); - id += buf; + int count = ggml_backend_sycl_get_device_count(); + for (int i = 0; i < count; i++) { + char buf[128]; + ggml_sycl_get_device_description(i, buf, sizeof(buf)); + id += buf; + if (i < count - 1) { id += "/"; } } - if (id.length() >2 ) { - id.pop_back(); - } #endif // TODO: other backends return id; diff --git a/examples/sycl/ls-sycl-device.cpp b/examples/sycl/ls-sycl-device.cpp index 52442e4ca..74a8b7fd8 100644 --- a/examples/sycl/ls-sycl-device.cpp +++ b/examples/sycl/ls-sycl-device.cpp @@ -7,7 +7,7 @@ #include "ggml-sycl.h" -int main(int argc, char ** argv) { +int main() { ggml_backend_sycl_print_sycl_devices(); return 0; } diff --git a/examples/sycl/run-llama2.sh b/examples/sycl/run-llama2.sh index f5f4c1e98..52f7c01a4 100755 --- a/examples/sycl/run-llama2.sh +++ b/examples/sycl/run-llama2.sh @@ -8,12 +8,19 @@ INPUT2="Building a website can be done in 10 simple steps:\nStep 1:" source /opt/intel/oneapi/setvars.sh if [ $# -gt 0 ]; then - export GGML_SYCL_DEVICE=$1 + GGML_SYCL_DEVICE=$1 else - export GGML_SYCL_DEVICE=0 + GGML_SYCL_DEVICE=0 fi -echo GGML_SYCL_DEVICE=$GGML_SYCL_DEVICE +echo "use $GGML_SYCL_DEVICE as main GPU" #export GGML_SYCL_DEBUG=1 -./build/bin/main -m models/llama-2-7b.Q4_0.gguf -p "${INPUT2}" -n 400 -e -ngl 33 -s 0 -#./build/bin/main -m models/llama-2-7b.Q4_0.gguf -p "${INPUT2}" -n 5 -e -ngl 33 -t 1 -s 0 + + +#ZES_ENABLE_SYSMAN=1, Support to get free memory of GPU by sycl::aspect::ext_intel_free_memory. Recommended to use when --split-mode = layer. 
+ +#use all GPUs with same max compute units +ZES_ENABLE_SYSMAN=1 ./build/bin/main -m models/llama-2-7b.Q4_0.gguf -p "${INPUT2}" -n 400 -e -ngl 33 -s 0 + +#use main GPU only +#ZES_ENABLE_SYSMAN=1 ./build/bin/main -m models/llama-2-7b.Q4_0.gguf -p "${INPUT2}" -n 400 -e -ngl 33 -s 0 -mg $GGML_SYCL_DEVICE -sm none diff --git a/ggml-sycl.cpp b/ggml-sycl.cpp index 6f391b0c6..cad08d610 100644 --- a/ggml-sycl.cpp +++ b/ggml-sycl.cpp @@ -661,26 +661,29 @@ namespace dpct /// \param [out] total_memory The number of bytes of total memory on the SYCL device. void get_memory_info(size_t &free_memory, size_t &total_memory) { + total_memory = get_device_info().get_global_mem_size(); + const char *warning_info = "get_memory_info: [warning] ext_intel_free_memory is not " + "supported (export/set ZES_ENABLE_SYSMAN=1 to support), " + "use total memory as free memory"; #if (defined(__SYCL_COMPILER_VERSION) && __SYCL_COMPILER_VERSION >= 20221105) if (!has(sycl::aspect::ext_intel_free_memory)) { - std::cerr << "get_memory_info: ext_intel_free_memory is not supported." << std::endl; - free_memory = 0; + std::cerr << warning_info << std::endl; + free_memory = total_memory; } else { free_memory = get_info(); } #else - std::cerr << "get_memory_info: ext_intel_free_memory is not supported." << std::endl; - free_memory = 0; + std::cerr << warning_info << std::endl; + free_memory = total_memory; #if defined(_MSC_VER) && !defined(__clang__) #pragma message("Querying the number of bytes of free memory is not supported") #else #warning "Querying the number of bytes of free memory is not supported" #endif #endif - total_memory = get_device_info().get_global_mem_size(); } void get_device_info(device_info &out) const @@ -738,15 +741,25 @@ namespace dpct #endif // DPCT_USM_LEVEL_NONE } - sycl::queue *create_in_order_queue(bool enable_exception_handler = false) - { - std::lock_guard lock(m_mutex); - return create_queue_impl(enable_exception_handler, - sycl::property::queue::in_order()); + sycl::queue *create_queue(sycl::context context, sycl::device device, + bool enable_exception_handler = false) { + return create_in_order_queue(context, device, enable_exception_handler); } - sycl::queue *create_out_of_order_queue(bool enable_exception_handler = false) - { + sycl::queue *create_in_order_queue(bool enable_exception_handler = false) { + std::lock_guard lock(m_mutex); + return create_queue_impl(enable_exception_handler, + sycl::property::queue::in_order()); + } + + sycl::queue *create_in_order_queue(sycl::context context, sycl::device device, + bool enable_exception_handler = false) { + std::lock_guard lock(m_mutex); + return create_queue_impl(context, device, enable_exception_handler, + sycl::property::queue::in_order()); + } + + sycl::queue *create_out_of_order_queue(bool enable_exception_handler = false) { std::lock_guard lock(m_mutex); return create_queue_impl(enable_exception_handler); } @@ -809,6 +822,25 @@ namespace dpct return _queues.back().get(); } + template + sycl::queue *create_queue_impl(sycl::context context, sycl::device device, + bool enable_exception_handler, + Properties... 
properties) { + sycl::async_handler eh = {}; + if (enable_exception_handler) { + eh = exception_handler; + } + _queues.push_back(std::make_shared( + context, device, eh, + sycl::property_list( + #ifdef DPCT_PROFILING_ENABLED + sycl::property::queue::enable_profiling(), + #endif + properties...))); + + return _queues.back().get(); + } + void get_version(int &major, int &minor) const { detail::get_version(*this, major, minor); @@ -2943,14 +2975,11 @@ bool ggml_sycl_loaded(void); void * ggml_sycl_host_malloc(size_t size); void ggml_sycl_host_free(void * ptr); bool ggml_sycl_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst); -void ggml_sycl_set_tensor_split(const float * tensor_split); -void ggml_sycl_transform_tensor(void * data, struct ggml_tensor * tensor); void ggml_sycl_free_data(struct ggml_tensor * tensor); void ggml_sycl_assign_buffers(struct ggml_tensor * tensor); void ggml_sycl_assign_buffers_no_scratch(struct ggml_tensor * tensor); void ggml_sycl_assign_buffers_force_inplace(struct ggml_tensor * tensor); void ggml_sycl_assign_buffers_no_alloc(struct ggml_tensor * tensor); -void ggml_sycl_assign_scratch_offset(struct ggml_tensor * tensor, size_t offset); void ggml_sycl_copy_to_device(struct ggml_tensor * tensor); void ggml_sycl_set_main_device(int main_device); void ggml_sycl_set_mul_mat_q(bool mul_mat_q); @@ -2963,6 +2992,14 @@ int get_main_device(); void print_ggml_tensor(const char*name, struct ggml_tensor *src); void log_tensor_with_cnt(const char* name, struct ggml_tensor * src, int stop_cnt); +void dev2dev_memcpy(sycl::queue &q_dst, sycl::queue &q_src, void *ptr_dst, + const void *ptr_src, size_t size) { + char *host_buf = (char *)malloc(size); + q_src.memcpy(host_buf, (const char *)ptr_src, size).wait(); + q_dst.memcpy((char *)ptr_dst, host_buf, size).wait(); + free(host_buf); +} + static __dpct_inline__ int get_int_from_int8(const int8_t *x8, const int &i32) { const uint16_t * x16 = (const uint16_t *) (x8 + sizeof(int) * i32); // assume at least 2 byte alignment @@ -3180,6 +3217,8 @@ static_assert(sizeof(block_q6_K) == sizeof(ggml_fp16_t) + 13*QK_K/16, "wrong q6_ #define SYCL_SILU_BLOCK_SIZE 256 #define SYCL_TANH_BLOCK_SIZE 256 #define SYCL_RELU_BLOCK_SIZE 256 +#define SYCL_HARDSIGMOID_BLOCK_SIZE 256 +#define SYCL_HARDSWISH_BLOCK_SIZE 256 #define SYCL_SQR_BLOCK_SIZE 256 #define SYCL_CPY_BLOCK_SIZE 32 #define SYCL_SCALE_BLOCK_SIZE 256 @@ -3196,6 +3235,7 @@ static_assert(sizeof(block_q6_K) == sizeof(ggml_fp16_t) + 13*QK_K/16, "wrong q6_ #define SYCL_PAD_BLOCK_SIZE 256 #define SYCL_ACC_BLOCK_SIZE 256 #define SYCL_IM2COL_BLOCK_SIZE 256 +#define SYCL_POOL2D_BLOCK_SIZE 256 // dmmv = dequantize_mul_mat_vec #ifndef GGML_SYCL_DMMV_X @@ -3218,8 +3258,7 @@ static_assert(K_QUANTS_PER_ITERATION == 1 || K_QUANTS_PER_ITERATION == 2, "K_QUA #define MUL_MAT_SRC1_COL_STRIDE 128 #define MAX_STREAMS 8 -static dpct::queue_ptr g_syclStreams[GGML_SYCL_MAX_DEVICES][MAX_STREAMS] = { - {0}}; +static dpct::queue_ptr g_syclStreams[GGML_SYCL_MAX_DEVICES][MAX_STREAMS] = {{0}}; struct ggml_tensor_extra_gpu { void * data_device[GGML_SYCL_MAX_DEVICES]; // 1 pointer for each device for split tensors @@ -3228,30 +3267,108 @@ struct ggml_tensor_extra_gpu { [MAX_STREAMS]; // events for synchronizing multiple GPUs }; -inline dpct::err0 ggml_sycl_set_device(const int device) try { - int current_device; +class sycl_gpu_mgr { + public: + std::vector gpus; + std::vector devices; + sycl::queue *first_queue; + sycl::context co_ctx; + int max_compute_units = 0; + int 
work_group_size = 0; + std::string gpus_list = ""; - SYCL_CHECK(CHECK_TRY_ERROR( - current_device = dpct::dev_mgr::instance().current_device_id())); + sycl_gpu_mgr() { + detect_sycl_gpu_list_with_max_cu(); + get_allow_gpus(); + create_context_with_gpus(); + } - // GGML_SYCL_DEBUG("ggml_sycl_set_device device=%d, current_device=%d\n", device, current_device); - if (device == current_device) { - return 0; - } + void create_context_with_gpus() { + sycl::context ctx = sycl::context(devices); + assert(gpus.size() > 0); + first_queue = dpct::get_current_device().create_queue(ctx, devices[0]); + co_ctx = first_queue->get_context(); + } - return CHECK_TRY_ERROR(dpct::select_device(device)); -} -catch (sycl::exception const &exc) { - std::cerr << exc.what() << "Exception caught at file:" << __FILE__ - << ", line:" << __LINE__ << std::endl; - crash(); - std::exit(1); -} + sycl::context &get_co_ctx() { return co_ctx; } + void get_allow_gpus() { + gpus_list = ""; + for (size_t i = 0; i < gpus.size(); ++i) { + gpus_list += std::to_string(gpus[i]); + gpus_list += ","; + } + if (gpus_list.length() > 2) { + gpus_list.pop_back(); + } + } + + bool is_allowed_gpu(int device_id) { + return std::find(gpus.begin(), gpus.end(), device_id) != gpus.end(); + } + + void detect_sycl_gpu_list_with_max_cu() try { + int device_count = dpct::dev_mgr::instance().device_count(); + + for (int id = 0; id < device_count; id++) { + sycl::device device = dpct::dev_mgr::instance().get_device(id); + if (!device.is_gpu()) + continue; + dpct::device_info prop; + dpct::get_device_info(prop, device); + if (max_compute_units < prop.get_max_compute_units()) + max_compute_units = prop.get_max_compute_units(); + } + + for (int id = 0; id < device_count; id++) { + sycl::device device = dpct::dev_mgr::instance().get_device(id); + if (!device.is_gpu()) + continue; + dpct::device_info prop; + dpct::get_device_info(prop, device); + if (max_compute_units == prop.get_max_compute_units() && + prop.get_major_version() == 1) { + gpus.push_back(id); + devices.push_back(device); + work_group_size = prop.get_max_work_group_size(); + } + } + return; + } catch (sycl::exception const &exc) { + std::cerr << exc.what() << "Exception caught at file:" << __FILE__ + << ", line:" << __LINE__ << std::endl; + std::exit(1); + } + + int get_gpu_count() { return (int)gpus.size(); } + + int get_index(int id) { + for (int i = 0; i < (int)gpus.size(); i++) { + if (gpus[i] == id) + return i; + } + assert(false); + return -1; + } + + int get_next_index(int id) { + int cur_index = get_index(id); + for (int i = cur_index + 1; i < (int)gpus.size(); i++) { + if (gpus[i] == id) + return i; + } + assert(false); + return -1; + } +}; + +static sycl_gpu_mgr *g_sycl_gpu_mgr = NULL; static int g_device_count = -1; static int g_all_sycl_device_count = -1; static int g_main_device = -1; -static int g_main_device_index = -1; +static int g_main_device_id = -1; + +static std::array g_default_tensor_split = {}; static float g_tensor_split[GGML_SYCL_MAX_DEVICES] = {0}; @@ -3268,8 +3385,6 @@ struct sycl_device_id2index { int index; }; -static sycl_device_id2index g_sycl_device_id2index[GGML_SYCL_MAX_DEVICES] = { {-1} }; - static void * g_scratch_buffer = nullptr; static size_t g_scratch_size = 0; // disabled by default static size_t g_scratch_offset = 0; @@ -3290,6 +3405,63 @@ static void bad_arch(const sycl::stream &stream_ct1) { (void) bad_arch; // suppress unused function warning } +/* +device_index: device index from 0 to n (continue numbers). 
+ It is used for device select/set in SYCL backend internal data structure. +*/ +void check_allow_gpu_index(const int device_index) { + if (device_index >= g_device_count) { + char error_buf[256]; + snprintf(error_buf, sizeof(error_buf), + "%s error: device_index:%d is out of range: [0-%d]", __func__, + device_index, g_device_count - 1); + fprintf(stderr, "%s\n", error_buf); + assert(false); + } +} + +/* +device_id: device ID is shown by ggml_backend_sycl_print_sycl_devices(). + It is only used to set current working device. +*/ +void check_allow_gpu_id(const int device_id) { + if (!g_sycl_gpu_mgr->is_allowed_gpu(device_id)) { + char error_buf[256]; + snprintf(error_buf, sizeof(error_buf), + "error: cannot set device=%d, which is not allowed. Please " + "set GPU ID in: [%s]", + device_id, g_sycl_gpu_mgr->gpus_list.c_str()); + fprintf(stderr, "%s\n", error_buf); + throw std::invalid_argument(error_buf); + } +} + +int get_current_device_id() { + return dpct::dev_mgr::instance().current_device_id(); +} + +inline dpct::err0 ggml_sycl_set_device(const int device) try { + + int device_id = g_sycl_gpu_mgr->gpus[device]; + check_allow_gpu_id(device_id); + + int current_device_id; + SYCL_CHECK(CHECK_TRY_ERROR(current_device_id = get_current_device_id())); + + // GGML_SYCL_DEBUG("ggml_sycl_set_device device_id=%d, + // current_device_id=%d\n", device, current_device); + if (device_id == current_device_id) { + return 0; + } + + return CHECK_TRY_ERROR(dpct::select_device(device_id)); +} catch (sycl::exception const &exc) { + std::cerr << exc.what() << "Exception caught at file:" << __FILE__ + << ", line:" << __LINE__ << std::endl; + crash(); + std::exit(1); +} + void log_ggml_var_device(const char*name, float *src, size_t total_elements, bool src_on_device){ if(!g_ggml_sycl_debug) return; if(!src){ @@ -3302,22 +3474,18 @@ void log_ggml_var_device(const char*name, float *src, size_t total_elements, boo size_t total_size = total_elements*sizeof(float); float *local_buf = NULL; - // printf("total_size %d2, src_on_device %d\n", total_size, src_on_device); if(src_on_device) { local_buf = (float *) ggml_sycl_host_malloc(total_size); - // printf("local buf %p size %d bytes\n", local_buf, total_size); ggml_sycl_set_device(g_main_device); - dpct::queue_ptr main_stream = g_syclStreams[g_main_device_index][0]; + dpct::queue_ptr main_stream = g_syclStreams[g_main_device][0]; main_stream->memcpy(local_buf, src, total_size); } else { local_buf = (float *)src; - // printf("local buf from src-> data %p\n", local_buf); } std::ofstream logfile; logfile.open(filename); - // printf("local buf element %d\n", total_elements); for(size_t i=0; iextra; - src_data = (float*)src_extra->data_device[g_main_device_index]; + src_data = (float*)src_extra->data_device[g_main_device]; } else { src_data = (float *)src->data; @@ -3359,10 +3527,6 @@ void log_tensor_with_cnt(const char* name, struct ggml_tensor * src, int stop_cn sprintf(filename, "%s_%07d", name, log_file_name_idx); log_file_name_idx++; print_ggml_tensor(filename, src); - // print_ggml_tensor("ggml_sycl_rms_norm_src0", (ggml_tensor *)src0); - // print_ggml_tensor("ggml_sycl_rms_norm_src1", (ggml_tensor *)src1); - // int *ptr = NULL; - // *ptr = 0; } static __dpct_inline__ float warp_reduce_sum(float x, @@ -3583,6 +3747,28 @@ static void relu_f32(const float * x, float * dst, const int k, dst[i] = sycl::fmax((float)(x[i]), (float)0); } +static void hardsigmoid_f32(const float * x, float * dst, const int k, + const sycl::nd_item<3> &item_ct1) { + const int i = 
item_ct1.get_local_range(2) * item_ct1.get_group(2) + + item_ct1.get_local_id(2); + + if (i >= k) { + return; + } + dst[i] = sycl::fmin(1.0f, sycl::fmax(0.0f, (x[i] + 3.0f) / 6.0f)); +} + +static void hardswish_f32(const float * x, float * dst, const int k, + const sycl::nd_item<3> &item_ct1) { + const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) + + item_ct1.get_local_id(2); + + if (i >= k) { + return; + } + dst[i] = x[i] * sycl::fmin(1.0f, sycl::fmax(0.0f, (x[i] + 3.0f) / 6.0f)); +} + static void leaky_relu_f32(const float *x, float *dst, const int k, const float negative_slope, const sycl::nd_item<3> &item_ct1) { const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) + @@ -4964,8 +5150,8 @@ static void k_get_rows_float( template static void dequantize_block(const void * __restrict__ vx, dst_t * __restrict__ y, const int k, const sycl::nd_item<3> &item_ct1) { - const int i = item_ct1.get_local_range(2) * item_ct1.get_group(2) + - 2 * item_ct1.get_local_id(2); + const int i = 2 * (item_ct1.get_local_range(2) * item_ct1.get_group(2) + + item_ct1.get_local_id(2)); if (i >= k) { return; @@ -7695,7 +7881,7 @@ static void cpy_1_f16_f16(const char * cxi, char * cdsti) { static void cpy_1_f16_f32(const char * cxi, char * cdsti) { const sycl::half *xi = (const sycl::half *)cxi; - float *dsti = (float *)cdsti; + float * dsti = (float *) cdsti; *dsti = *xi; } @@ -8297,6 +8483,62 @@ static void im2col_kernel(const float *x, T *dst, int offset_delta, } } +template +static void pool2d_nchw_kernel( + const int ih, const int iw, const int oh, const int ow, + const int kh, const int kw, const int sh, const int sw, + const int ph, const int pw, const int parallel_elements, + const Ti* src, To* dst, const enum ggml_op_pool op, + const sycl::nd_item<3> &item_ct1) { + int idx = item_ct1.get_local_id(2) + + item_ct1.get_group(2) * item_ct1.get_local_range(2); + if (idx >= parallel_elements) { + return; + } + + const int I_HW = ih * iw; + const int O_HW = oh * ow; + const int nc = idx / O_HW; + const int cur_oh = idx % O_HW / ow; + const int cur_ow = idx % O_HW % ow; + const Ti* i_ptr = src + nc * I_HW; + To* o_ptr = dst + nc * O_HW; + const int start_h = cur_oh * sh - ph; + const int bh = sycl::max(0, start_h); + const int eh = sycl::min(ih, start_h + kh); + const int start_w = cur_ow * sw - pw; + const int bw = sycl::max(0, start_w); + const int ew = sycl::min(iw, start_w + kw); + + To res = 0; + + switch (op) { + case GGML_OP_POOL_AVG: res = 0; break; + case GGML_OP_POOL_MAX: res = -FLT_MAX; break; + } + + for (int i = bh; i < eh; i += 1) { + for (int j = bw; j < ew; j += 1) { +#if DPCT_COMPATIBILITY_TEMP >= 350 + /* + DPCT1098:106: The '*' expression is used instead of the __ldg + call. These two expressions do not provide the exact same + functionality. Check the generated code for potential precision + and/or performance issues. 
+ */ + Ti cur = *(i_ptr + i * iw + j); +#else + Ti cur = i_ptr[i * iw + j]; +#endif + switch (op) { + case GGML_OP_POOL_AVG: res += (cur / (kh * kw)); break; + case GGML_OP_POOL_MAX: res = sycl::max(res, (To)cur); break; + } + } + } + o_ptr[cur_oh * ow + cur_ow] = res; +} + template static void get_rows_sycl(const ggml_tensor *src0, const ggml_tensor *src1, ggml_tensor *dst, const void *src0_dd, @@ -8585,6 +8827,30 @@ static void relu_f32_sycl(const float *x, float *dst, const int k, }); } +static void hardsigmoid_f32_sycl(const float *x, float *dst, const int k, + dpct::queue_ptr stream) { + const int num_blocks = (k + SYCL_HARDSIGMOID_BLOCK_SIZE - 1) / SYCL_HARDSIGMOID_BLOCK_SIZE; + stream->parallel_for( + sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * + sycl::range<3>(1, 1, SYCL_HARDSIGMOID_BLOCK_SIZE), + sycl::range<3>(1, 1, SYCL_HARDSIGMOID_BLOCK_SIZE)), + [=](sycl::nd_item<3> item_ct1) { + hardsigmoid_f32(x, dst, k, item_ct1); + }); +} + +static void hardswish_f32_sycl(const float *x, float *dst, const int k, + dpct::queue_ptr stream) { + const int num_blocks = (k + SYCL_HARDSWISH_BLOCK_SIZE - 1) / SYCL_HARDSWISH_BLOCK_SIZE; + stream->parallel_for( + sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * + sycl::range<3>(1, 1, SYCL_HARDSWISH_BLOCK_SIZE), + sycl::range<3>(1, 1, SYCL_HARDSWISH_BLOCK_SIZE)), + [=](sycl::nd_item<3> item_ct1) { + hardswish_f32(x, dst, k, item_ct1); + }); +} + static void leaky_relu_f32_sycl(const float *x, float *dst, const int k, const float negative_slope, dpct::queue_ptr stream) { @@ -8811,11 +9077,10 @@ template static void dequantize_block_sycl(const void *__restrict__ vx, dst_t *__restrict__ y, const int k, dpct::queue_ptr stream) { - const int num_blocks = (k + SYCL_DEQUANTIZE_BLOCK_SIZE - 1) / SYCL_DEQUANTIZE_BLOCK_SIZE; + const int num_blocks = (k + 2*SYCL_DEQUANTIZE_BLOCK_SIZE - 1) / (2*SYCL_DEQUANTIZE_BLOCK_SIZE); { dpct::has_capability_or_fail(stream->get_device(), {sycl::aspect::fp16}); - stream->parallel_for( sycl::nd_range<3>( sycl::range<3>(1, 1, num_blocks) * @@ -9208,24 +9473,6 @@ static void mul_mat_vec_q_sycl_submitter(const void *vx, const void *vy, }); } -int get_device_index_by_id(int id){ - int res = g_sycl_device_id2index[id].index; - // GGML_SYCL_DEBUG("get_device_index_by_id id=%d device_index=%d\n", id, res); - GGML_ASSERT(res>=0); - return res; -} - -int get_device_id_by_index(int index){ - int res = g_device_caps[index].device_id; - GGML_ASSERT(res>=0); - return res; -} - - -int get_current_device_index(){ - return get_device_index_by_id(dpct::dev_mgr::instance().current_device_id()); -} - static void ggml_mul_mat_q4_0_q8_1_sycl(const void *vx, const void *vy, float *dst, const int ncols_x, const int nrows_x, const int ncols_y, @@ -9234,7 +9481,7 @@ static void ggml_mul_mat_q4_0_q8_1_sycl(const void *vx, const void *vy, int id; SYCL_CHECK( - CHECK_TRY_ERROR(id = get_current_device_index())); + CHECK_TRY_ERROR(id = get_current_device_id())); const int compute_capability = g_device_caps[id].cc; int mmq_x, mmq_y, nwarps; @@ -9349,7 +9596,7 @@ static void ggml_mul_mat_q4_1_q8_1_sycl(const void *vx, const void *vy, int id; SYCL_CHECK( - CHECK_TRY_ERROR(id = get_current_device_index())); + CHECK_TRY_ERROR(id = get_current_device_id())); const int compute_capability = g_device_caps[id].cc; int mmq_x, mmq_y, nwarps; @@ -9464,7 +9711,7 @@ static void ggml_mul_mat_q5_0_q8_1_sycl(const void *vx, const void *vy, int id; SYCL_CHECK( - CHECK_TRY_ERROR(id = get_current_device_index())); + CHECK_TRY_ERROR(id = get_current_device_id())); 
const int compute_capability = g_device_caps[id].cc; int mmq_x, mmq_y, nwarps; @@ -9579,7 +9826,7 @@ static void ggml_mul_mat_q5_1_q8_1_sycl(const void *vx, const void *vy, int id; SYCL_CHECK( - CHECK_TRY_ERROR(id = get_current_device_index())); + CHECK_TRY_ERROR(id = get_current_device_id())); const int compute_capability = g_device_caps[id].cc; int mmq_x, mmq_y, nwarps; @@ -9694,7 +9941,7 @@ static void ggml_mul_mat_q8_0_q8_1_sycl(const void *vx, const void *vy, int id; SYCL_CHECK( - CHECK_TRY_ERROR(id = get_current_device_index())); + CHECK_TRY_ERROR(id = get_current_device_id())); const int compute_capability = g_device_caps[id].cc; int mmq_x, mmq_y, nwarps; @@ -9809,7 +10056,7 @@ static void ggml_mul_mat_q2_K_q8_1_sycl(const void *vx, const void *vy, int id; SYCL_CHECK( - CHECK_TRY_ERROR(id = get_current_device_index())); + CHECK_TRY_ERROR(id = get_current_device_id())); const int compute_capability = g_device_caps[id].cc; int mmq_x, mmq_y, nwarps; @@ -9932,7 +10179,7 @@ static void ggml_mul_mat_q3_K_q8_1_sycl(const void *vx, const void *vy, int id; SYCL_CHECK( - CHECK_TRY_ERROR(id = get_current_device_index())); + CHECK_TRY_ERROR(id = get_current_device_id())); const int compute_capability = g_device_caps[id].cc; int mmq_x, mmq_y, nwarps; @@ -10060,7 +10307,7 @@ static void ggml_mul_mat_q4_K_q8_1_sycl(const void *vx, const void *vy, int id; SYCL_CHECK( - CHECK_TRY_ERROR(id = get_current_device_index())); + CHECK_TRY_ERROR(id = get_current_device_id())); const int compute_capability = g_device_caps[id].cc; int mmq_x, mmq_y, nwarps; @@ -10181,7 +10428,7 @@ static void ggml_mul_mat_q5_K_q8_1_sycl(const void *vx, const void *vy, int id; SYCL_CHECK( - CHECK_TRY_ERROR(id = get_current_device_index())); + CHECK_TRY_ERROR(id = get_current_device_id())); const int compute_capability = g_device_caps[id].cc; int mmq_x, mmq_y, nwarps; @@ -10302,7 +10549,7 @@ static void ggml_mul_mat_q6_K_q8_1_sycl(const void *vx, const void *vy, int id; SYCL_CHECK( - CHECK_TRY_ERROR(id = get_current_device_index())); + CHECK_TRY_ERROR(id = get_current_device_id())); const int compute_capability = g_device_caps[id].cc; int mmq_x, mmq_y, nwarps; @@ -10458,6 +10705,31 @@ static void ggml_mul_mat_vec_nc_f16_f32_sycl( } } +static void +ggml_cpy_f16_f32_sycl(const char *cx, char *cdst, const int ne, const int ne00, + const int ne01, const int ne02, const int nb00, + const int nb01, const int nb02, const int nb03, + const int ne10, const int ne11, const int ne12, + const int nb10, const int nb11, const int nb12, + const int nb13, dpct::queue_ptr stream) { + + const int num_blocks = (ne + SYCL_CPY_BLOCK_SIZE - 1) / SYCL_CPY_BLOCK_SIZE; + { + dpct::has_capability_or_fail(stream->get_device(), + {sycl::aspect::fp16}); + + stream->parallel_for( + sycl::nd_range<3>(sycl::range<3>(1, 1, num_blocks) * + sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE), + sycl::range<3>(1, 1, SYCL_CPY_BLOCK_SIZE)), + [=](sycl::nd_item<3> item_ct1) { + cpy_f32_f16(cx, cdst, ne, ne00, ne01, ne02, nb00, + nb01, nb02, nb03, ne10, ne11, ne12, + nb10, nb11, nb12, nb13, item_ct1); + }); + } +} + static void ggml_cpy_f32_f32_sycl(const char *cx, char *cdst, const int ne, const int ne00, const int ne01, const int ne02, const int nb00, @@ -11014,12 +11286,9 @@ struct sycl_buffer { static sycl_buffer g_sycl_buffer_pool[GGML_SYCL_MAX_DEVICES][MAX_SYCL_BUFFERS]; static size_t g_sycl_pool_size[GGML_SYCL_MAX_DEVICES] = {0}; -static void *ggml_sycl_pool_malloc_leg(size_t size, size_t *actual_size) try { +static void *ggml_sycl_pool_malloc_leg(int device_index, 
size_t size, size_t *actual_size) try { scoped_spin_lock lock(g_sycl_pool_lock); - int id; - SYCL_CHECK( - CHECK_TRY_ERROR(id = get_current_device_index())); - // GGML_SYCL_DEBUG("ggml_sycl_pool_malloc_leg index %d\n", id); + // GGML_SYCL_DEBUG("ggml_sycl_pool_malloc_leg device_index %d size=%lu\n", device_index, size); #ifdef DEBUG_SYCL_MALLOC int nnz = 0; size_t max_size = 0; @@ -11027,7 +11296,7 @@ static void *ggml_sycl_pool_malloc_leg(size_t size, size_t *actual_size) try { size_t best_diff = 1ull << 36; int ibest = -1; for (int i = 0; i < MAX_SYCL_BUFFERS; ++i) { - sycl_buffer& b = g_sycl_buffer_pool[id][i]; + sycl_buffer& b = g_sycl_buffer_pool[device_index][i]; if (b.ptr != nullptr) { #ifdef DEBUG_SYCL_MALLOC ++nnz; @@ -11043,7 +11312,7 @@ static void *ggml_sycl_pool_malloc_leg(size_t size, size_t *actual_size) try { *actual_size = b.size; b.ptr = nullptr; b.size = 0; - // GGML_SYCL_DEBUG("ggml_sycl_pool_malloc_leg return 1 %p\n", ptr); + // GGML_SYCL_DEBUG("ggml_sycl_pool_malloc_leg return 1 %p and rm in pool\n", ptr); return ptr; } } @@ -11051,30 +11320,30 @@ static void *ggml_sycl_pool_malloc_leg(size_t size, size_t *actual_size) try { } } if (ibest >= 0) { - sycl_buffer& b = g_sycl_buffer_pool[id][ibest]; + sycl_buffer& b = g_sycl_buffer_pool[device_index][ibest]; void * ptr = b.ptr; *actual_size = b.size; b.ptr = nullptr; b.size = 0; - // GGML_SYCL_DEBUG("ggml_sycl_pool_malloc_leg return 2 %p\n", ptr); + // GGML_SYCL_DEBUG("ggml_sycl_pool_malloc_leg return 2 %p and rm in pool\n", ptr); return ptr; } void * ptr; size_t look_ahead_size = (size_t) (1.05 * size); look_ahead_size = 256 * ((look_ahead_size + 255)/256); - const dpct::queue_ptr stream = g_syclStreams[id][0]; + const dpct::queue_ptr stream = g_syclStreams[device_index][0]; SYCL_CHECK( CHECK_TRY_ERROR(ptr = (void *)sycl::malloc_device( look_ahead_size, *stream))); *actual_size = look_ahead_size; - g_sycl_pool_size[id] += look_ahead_size; + g_sycl_pool_size[device_index] += look_ahead_size; #ifdef DEBUG_SYCL_MALLOC fprintf(stderr, "%s[%d]: %d buffers, max_size = %u MB, pool_size = %u MB, requested %u MB\n", __func__, id, nnz, (uint32_t)(max_size/1024/1024), (uint32_t)(g_sycl_pool_size[id]/1024/1024), (uint32_t)(size/1024/1024)); #endif - // GGML_SYCL_DEBUG("ggml_sycl_pool_malloc_leg return %p\n", ptr); + // GGML_SYCL_DEBUG("ggml_sycl_pool_malloc_leg look_ahead_size=%lu, return %p\n", look_ahead_size, ptr); return ptr; } catch (sycl::exception const &exc) { @@ -11083,15 +11352,11 @@ catch (sycl::exception const &exc) { std::exit(1); } -static void ggml_sycl_pool_free_leg(void *ptr, size_t size) try { +static void ggml_sycl_pool_free_leg(int device_index, void *ptr, size_t size) try { scoped_spin_lock lock(g_sycl_pool_lock); - int id; - SYCL_CHECK( - CHECK_TRY_ERROR(id = get_current_device_index())); - - const dpct::queue_ptr stream = g_syclStreams[id][0]; + const dpct::queue_ptr stream = g_syclStreams[device_index][0]; for (int i = 0; i < MAX_SYCL_BUFFERS; ++i) { - sycl_buffer& b = g_sycl_buffer_pool[id][i]; + sycl_buffer& b = g_sycl_buffer_pool[device_index][i]; if (b.ptr == nullptr) { b.ptr = ptr; b.size = size; @@ -11100,7 +11365,7 @@ static void ggml_sycl_pool_free_leg(void *ptr, size_t size) try { } fprintf(stderr, "WARNING: sycl buffer pool full, increase MAX_SYCL_BUFFERS\n"); SYCL_CHECK(CHECK_TRY_ERROR(sycl::free(ptr, *stream))); - g_sycl_pool_size[id] -= size; + g_sycl_pool_size[device_index] -= size; } catch (sycl::exception const &exc) { std::cerr << exc.what() << "Exception caught at file:" << __FILE__ @@ 
-11117,7 +11382,8 @@ DPCT1082:64: Migration of CUmemGenericAllocationHandle type is not supported. static dpct::device_ptr g_sycl_pool_addr[GGML_SYCL_MAX_DEVICES] = {0}; static size_t g_sycl_pool_used[GGML_SYCL_MAX_DEVICES] = {0}; -static void *ggml_sycl_pool_malloc_vmm(size_t size, size_t *actual_size) try { +static void *ggml_sycl_pool_malloc_vmm(int device_index, size_t size, size_t *actual_size) try { + GGML_UNUSED(device_index); GGML_UNUSED(size); GGML_UNUSED(actual_size); return NULL; @@ -11128,20 +11394,16 @@ catch (sycl::exception const &exc) { std::exit(1); } -static void ggml_sycl_pool_free_vmm(void *ptr, size_t size) try { +static void ggml_sycl_pool_free_vmm(int device_index, void *ptr, size_t size) try { scoped_spin_lock lock(g_sycl_pool_lock); - int id; - SYCL_CHECK( - CHECK_TRY_ERROR(id = dpct::dev_mgr::instance().current_device_id())); - #ifdef DEBUG_SYCL_MALLOC - printf("sycl pool[%d]: freed %llu bytes at %llx\n", id, (unsigned long long) size, ptr); + printf("sycl pool[%d]: freed %llu bytes at %llx\n", device_index, (unsigned long long) size, ptr); #endif - g_sycl_pool_used[id] -= size; + g_sycl_pool_used[device_index] -= size; // all deallocations must be in reverse order of the allocations - GGML_ASSERT(ptr == (void *) (g_sycl_pool_addr[id] + g_sycl_pool_used[id])); + GGML_ASSERT(ptr == (void *) (g_sycl_pool_addr[device_index] + g_sycl_pool_used[device_index])); } catch (sycl::exception const &exc) { std::cerr << exc.what() << "Exception caught at file:" << __FILE__ @@ -11149,14 +11411,11 @@ catch (sycl::exception const &exc) { std::exit(1); } -static void *ggml_sycl_pool_malloc(size_t size, size_t *actual_size) try { - int id; - SYCL_CHECK( - CHECK_TRY_ERROR(id = get_current_device_index())); - if (g_device_caps[id].vmm) { - return ggml_sycl_pool_malloc_vmm(size, actual_size); +static void *ggml_sycl_pool_malloc(int device_index, size_t size, size_t *actual_size) try { + if (g_device_caps[device_index].vmm) { + return ggml_sycl_pool_malloc_vmm(device_index, size, actual_size); } else { - return ggml_sycl_pool_malloc_leg(size, actual_size); + return ggml_sycl_pool_malloc_leg(device_index, size, actual_size); } } catch (sycl::exception const &exc) { @@ -11165,14 +11424,11 @@ catch (sycl::exception const &exc) { std::exit(1); } -static void ggml_sycl_pool_free(void *ptr, size_t size) try { - int id; - SYCL_CHECK( - CHECK_TRY_ERROR(id = get_current_device_index())); - if (g_device_caps[id].vmm) { - ggml_sycl_pool_free_vmm(ptr, size); +static void ggml_sycl_pool_free(int device_index, void *ptr, size_t size) try { + if (g_device_caps[device_index].vmm) { + ggml_sycl_pool_free_vmm(device_index, ptr, size); } else { - ggml_sycl_pool_free_leg(ptr, size); + ggml_sycl_pool_free_leg(device_index, ptr, size); } } catch (sycl::exception const &exc) { @@ -11184,13 +11440,17 @@ catch (sycl::exception const &exc) { template struct sycl_pool_alloc { + int device_index = -1; + int device_id = -1; T * ptr = nullptr; size_t actual_size = 0; // size is in number of elements T * alloc(size_t size) { GGML_ASSERT(ptr == nullptr); - ptr = (T *) ggml_sycl_pool_malloc(size * sizeof(T), &this->actual_size); + device_id = get_current_device_id(); + device_index = g_sycl_gpu_mgr->get_index(device_id); + ptr = (T *) ggml_sycl_pool_malloc(device_index, size * sizeof(T), &this->actual_size); // GGML_SYCL_DEBUG("alloc %lu return %p actual size=%lu\n", size * sizeof(T), ptr, this->actual_size); return ptr; } @@ -11201,7 +11461,7 @@ struct sycl_pool_alloc { ~sycl_pool_alloc() { if (ptr != nullptr) { - 
ggml_sycl_pool_free(ptr, actual_size); + ggml_sycl_pool_free(device_index, ptr, actual_size); } } @@ -11222,44 +11482,57 @@ bool ggml_sycl_loaded(void) { return g_sycl_loaded; } -void ggml_backend_sycl_print_sycl_devices(){ - int device_count = dpct::dev_mgr::instance().device_count(); - fprintf(stderr, "found %d SYCL devices:\n", device_count); - for (int id = 0; id < device_count; ++id) { - dpct::device_info prop; - SYCL_CHECK(CHECK_TRY_ERROR(dpct::get_device_info( - prop, dpct::dev_mgr::instance().get_device(id)))); - sycl::device cur_device = dpct::dev_mgr::instance().get_device(id); - fprintf(stderr, " Device %d: %s,\tcompute capability %d.%d,\n\tmax compute_units %d,\tmax work group size %d,\tmax sub group size %d,\tglobal mem size %lu\n", id, - prop.get_name(), prop.get_major_version(), - prop.get_minor_version(), - prop.get_max_compute_units(), - prop.get_max_work_group_size(), - prop.get_max_sub_group_size(), - prop.get_global_mem_size() - ); - } - // fprintf(stderr, "\n"); +void print_device_detail(int id) { + dpct::device_info prop; + SYCL_CHECK(CHECK_TRY_ERROR( + dpct::get_device_info(prop, dpct::dev_mgr::instance().get_device(id)))); + sycl::device cur_device = dpct::dev_mgr::instance().get_device(id); + std::string version; + version += std::to_string(prop.get_major_version()); + version += "."; + version += std::to_string(prop.get_minor_version()); + + fprintf(stderr, "|%2d|%45s|%18s|%17d|%14d|%13d|%15lu|\n", id, + prop.get_name(), version.c_str(), prop.get_max_compute_units(), + prop.get_max_work_group_size(), prop.get_max_sub_group_size(), + prop.get_global_mem_size()); } -int get_sycl_env(const char* env_name, int default_val){ - char * user_device_string = getenv(env_name); +void ggml_backend_sycl_print_sycl_devices() { + int device_count = dpct::dev_mgr::instance().device_count(); + fprintf(stderr, "found %d SYCL devices:\n", device_count); + fprintf(stderr, "|ID| Name |compute capability|Max compute units|Max work group|Max sub group|Global mem size|\n"); + fprintf(stderr, "|--|---------------------------------------------|------------------|-----------------|--------------|-------------|---------------|\n"); + for (int id = 0; id < device_count; ++id) { + print_device_detail(id); + } +} + +void print_gpu_device_list() { + fprintf(stderr, "detect %d SYCL GPUs: [%s] with Max compute units:%d\n", + g_sycl_gpu_mgr->get_gpu_count(), + g_sycl_gpu_mgr->gpus_list.c_str(), + g_sycl_gpu_mgr->max_compute_units); +} + +int get_sycl_env(const char *env_name, int default_val) { + char *user_device_string = getenv(env_name); int user_number = default_val; unsigned n; - if (user_device_string != NULL && sscanf(user_device_string, " %u", &n) == 1) { - user_number = (int)n; - } else { - user_number=default_val; - } + if (user_device_string != NULL && + sscanf(user_device_string, " %u", &n) == 1) { + user_number = (int)n; + } else { + user_number = default_val; + } return user_number; } -int get_work_group_size(int user_device_id){ +int get_work_group_size(int user_device_id) { dpct::device_info prop; - dpct::get_device_info( - prop, - dpct::dev_mgr::instance().get_device(user_device_id)); + dpct::get_device_info(prop, + dpct::dev_mgr::instance().get_device(user_device_id)); return prop.get_max_work_group_size(); } @@ -11268,113 +11541,81 @@ void ggml_init_sycl() try { if (!initialized) { g_ggml_sycl_debug = get_sycl_env("GGML_SYCL_DEBUG", 0); + fprintf(stderr, "%s: GGML_SYCL_DEBUG: %d\n", __func__, g_ggml_sycl_debug); - printf("GGML_SYCL_DEBUG=%d\n", g_ggml_sycl_debug); - - int 
user_device_id = get_sycl_env("GGML_SYCL_DEVICE", 0); - +#if defined(GGML_SYCL_F16) + fprintf(stderr, "%s: GGML_SYCL_F16: yes\n", __func__); +#else + fprintf(stderr, "%s: GGML_SYCL_F16: no\n", __func__); +#endif if (CHECK_TRY_ERROR(g_all_sycl_device_count = - dpct::dev_mgr::instance().device_count()) != - 0) { + dpct::dev_mgr::instance().device_count()) != 0) { initialized = true; g_sycl_loaded = false; return; } GGML_ASSERT(g_all_sycl_device_count <= GGML_SYCL_MAX_DEVICES); + ggml_backend_sycl_print_sycl_devices(); + + if (!g_sycl_gpu_mgr) g_sycl_gpu_mgr = new sycl_gpu_mgr(); + + g_device_count = g_sycl_gpu_mgr->get_gpu_count(); + g_work_group_size = g_sycl_gpu_mgr->work_group_size; + + print_gpu_device_list(); + int64_t total_vram = 0; -#if defined(GGML_SYCL_F16) - fprintf(stderr, "%s: GGML_SYCL_F16: yes\n", __func__); -#else - fprintf(stderr, "%s: GGML_SYCL_F16: no\n", __func__); -#endif - - +/* NOT REMOVE, keep it for next optimize for XMX. #if defined(SYCL_USE_XMX) fprintf(stderr, "%s: SYCL_USE_XMX: yes\n", __func__); #else fprintf(stderr, "%s: SYCL_USE_XMX: no\n", __func__); #endif - ggml_backend_sycl_print_sycl_devices(); +*/ for (int id = 0; id < GGML_SYCL_MAX_DEVICES; ++id) { - g_sycl_device_id2index[id].index = -1; g_device_caps[id].vmm = 0; g_device_caps[id].device_id = -1; g_device_caps[id].cc = 0; g_tensor_split[id] = 0; + g_default_tensor_split[id] = 0; } - int device_inx = -1; - for (int id = 0; id < g_all_sycl_device_count; ++id) { - if(id!=user_device_id) continue; - - device_inx++; - - g_device_caps[device_inx].vmm = 0; - g_device_caps[device_inx].device_id = id; - g_sycl_device_id2index[id].index = device_inx; + for (int i = 0; i < g_device_count; ++i) { + int device_id = g_sycl_gpu_mgr->gpus[i]; + g_device_caps[i].vmm = 0; dpct::device_info prop; SYCL_CHECK(CHECK_TRY_ERROR(dpct::get_device_info( - prop, dpct::dev_mgr::instance().get_device(id)))); + prop, dpct::dev_mgr::instance().get_device(device_id)))); - g_tensor_split[device_inx] = total_vram; + g_default_tensor_split[i] = total_vram; total_vram += prop.get_global_mem_size(); - g_device_caps[device_inx].cc = + g_device_caps[i].cc = 100 * prop.get_major_version() + 10 * prop.get_minor_version(); - - } - device_inx = -1; - for (int id = 0; id < g_all_sycl_device_count; ++id) { - if(id!=user_device_id) continue; - device_inx++; - g_tensor_split[device_inx] /= total_vram; } - device_inx = -1; - for (int id = 0; id < g_all_sycl_device_count; ++id) { - if(id!=user_device_id) continue; - device_inx++; - SYCL_CHECK(ggml_sycl_set_device(id)); + for (int i = 0; i < g_device_count; ++i) { + g_default_tensor_split[i] /= total_vram; + } + + for (int i = 0; i < g_device_count; ++i) { + SYCL_CHECK(ggml_sycl_set_device(i)); // create sycl streams for (int is = 0; is < MAX_STREAMS; ++is) { - /* - DPCT1025:88: The SYCL queue is created ignoring the flag and - priority options. - */ SYCL_CHECK(CHECK_TRY_ERROR( - g_syclStreams[device_inx][is] = - dpct::get_current_device().create_queue())); + g_syclStreams[i][is] = + dpct::get_current_device().create_queue( + g_sycl_gpu_mgr->get_co_ctx(), dpct::get_current_device()))); } - const dpct::queue_ptr stream = g_syclStreams[device_inx][0]; + const dpct::queue_ptr stream = g_syclStreams[i][0]; // create sycl handle - SYCL_CHECK(CHECK_TRY_ERROR(g_sycl_handles[device_inx] = - stream)); - /* - DPCT1027:89: The call to syclSetMathMode was replaced with 0 - because this functionality is redundant in SYCL. 
- */ - SYCL_CHECK(0); + SYCL_CHECK(CHECK_TRY_ERROR(g_sycl_handles[i] = stream)); } - // configure logging to stdout - // SYCL_CHECK(syclLoggerConfigure(1, 1, 0, nullptr)); - - //hardcode, force set to 1 device - g_device_count = 1; - ggml_sycl_set_main_device(user_device_id); - ggml_sycl_set_device(user_device_id); - g_work_group_size = get_work_group_size(user_device_id); - // fprintf(stderr, "Using Device %d\n", user_device_id); - - // for (int id = 0; id < g_all_sycl_device_count; ++id) { - // GGML_SYCL_DEBUG("id=%d g_device_caps[%d].device_id=%d g_sycl_device_id2index[%d].index=%d ", id, id, - // g_device_caps[id].device_id, id, g_sycl_device_id2index[id].index); - // } - initialized = true; g_sycl_loaded = true; } @@ -11385,31 +11626,6 @@ catch (sycl::exception const &exc) { std::exit(1); } - -void ggml_sycl_set_tensor_split(const float * tensor_split) { - if (tensor_split == nullptr) { - return; - } - bool all_zero = true; - for (int i = 0; i < g_device_count; ++i) { - if (tensor_split[i] != 0.0f) { - all_zero = false; - break; - } - } - if (all_zero) { - return; - } - float split_sum = 0.0f; - for (int i = 0; i < g_device_count; ++i) { - g_tensor_split[i] = split_sum; - split_sum += tensor_split[i]; - } - for (int i = 0; i < g_device_count; ++i) { - g_tensor_split[i] /= split_sum; - } -} - void *ggml_sycl_host_malloc(size_t size) try { if (getenv("GGML_SYCL_NO_PINNED") != nullptr) { return nullptr; @@ -11419,28 +11635,14 @@ void *ggml_sycl_host_malloc(size_t size) try { //allow to use dpct::get_in_order_queue() for host malloc dpct::err0 err = CHECK_TRY_ERROR( ptr = (void *)sycl::malloc_host(size, dpct::get_in_order_queue())); - /* - DPCT1000:82: Error handling if-stmt was detected but could not be rewritten. - */ + if (err != 0) { // clear the error - /* - DPCT1026:83: The call to syclGetLastError was removed because this - functionality is redundant in SYCL. - */ - /* - DPCT1001:81: The statement could not be removed. - */ fprintf( stderr, "WARNING: failed to allocate %.2f MB of pinned memory: %s\n", - /* - DPCT1009:84: SYCL uses exceptions to report errors and does not use - the error codes. The original code was commented out and a warning - string was inserted. You need to rewrite this code. 
- */ size / 1024.0 / 1024.0, - "syclGetErrorString is not supported" /*syclGetErrorString(err)*/); + "syclGetErrorString is not supported"); return nullptr; } @@ -11480,7 +11682,7 @@ static dpct::err0 ggml_sycl_cpy_tensor_2d(void *dst, ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) src->extra; int id; SYCL_CHECK(CHECK_TRY_ERROR( - id = get_current_device_index())); + id = get_current_device_id())); // GGML_SYCL_DEBUG("current device index %d\n", id); src_ptr = (char *) extra->data_device[id]; } else { @@ -11714,7 +11916,6 @@ inline void ggml_sycl_op_tanh(const ggml_tensor *src0, const ggml_tensor *src1, GGML_ASSERT(src0->type == GGML_TYPE_F32); GGML_ASSERT( dst->type == GGML_TYPE_F32); - tanh_f32_sycl(src0_dd, dst_dd, ggml_nelements(src0), main_stream); (void) src1; @@ -11737,6 +11938,37 @@ inline void ggml_sycl_op_relu(const ggml_tensor *src0, const ggml_tensor *src1, (void) src1_dd; } +static void ggml_sycl_op_hardsigmoid(const ggml_tensor *src0, + const ggml_tensor *src1, ggml_tensor *dst, + const float *src0_dd, const float *src1_dd, + float *dst_dd, + const dpct::queue_ptr &main_stream) { + + GGML_ASSERT(src0->type == GGML_TYPE_F32); + GGML_ASSERT( dst->type == GGML_TYPE_F32); + + hardsigmoid_f32_sycl(src0_dd, dst_dd, ggml_nelements(src0), main_stream); + + (void) src1; + (void) dst; + (void) src1_dd; +} + +static void ggml_sycl_op_hardswish(const ggml_tensor *src0, + const ggml_tensor *src1, ggml_tensor *dst, + const float *src0_dd, const float *src1_dd, + float *dst_dd, const dpct::queue_ptr &main_stream) { + + GGML_ASSERT(src0->type == GGML_TYPE_F32); + GGML_ASSERT( dst->type == GGML_TYPE_F32); + + hardswish_f32_sycl(src0_dd, dst_dd, ggml_nelements(src0), main_stream); + + (void) src1; + (void) dst; + (void) src1_dd; +} + inline void ggml_sycl_op_leaky_relu(const ggml_tensor *src0, const ggml_tensor *src1, ggml_tensor *dst, const float *src0_dd, const float *src1_dd, @@ -11905,7 +12137,7 @@ inline void ggml_sycl_op_mul_mat_q( int device_id; SYCL_CHECK( - CHECK_TRY_ERROR(device_id = dpct::dev_mgr::instance().current_device_id())); + CHECK_TRY_ERROR(device_id = get_current_device_id())); // the main device has a larger memory buffer to hold the results from all GPUs // nrows_dst == nrows of the matrix that the dequantize_mul_mat kernel writes into @@ -11957,16 +12189,16 @@ catch (sycl::exception const &exc) { std::exit(1); } -static int64_t get_row_rounding(ggml_type type) { +static int64_t get_row_rounding(ggml_type type, const std::array & tensor_split) { int64_t min_compute_capability = INT_MAX; int64_t max_compute_capability = INT_MIN; - for (int64_t id = 0; id < g_device_count; ++id) { - if (g_tensor_split[id] < (id + 1 < g_device_count ? g_tensor_split[id + 1] : 1.0f)) { - if (min_compute_capability > g_device_caps[id].cc) { - min_compute_capability = g_device_caps[id].cc; + for (int i = 0; i < g_device_count; ++i) { + if (tensor_split[i] < (i + 1 < g_device_count ? tensor_split[i + 1] : 1.0f)) { + if (min_compute_capability > g_device_caps[i].cc) { + min_compute_capability = g_device_caps[i].cc; } - if (max_compute_capability < g_device_caps[id].cc) { - max_compute_capability = g_device_caps[id].cc; + if (max_compute_capability < g_device_caps[i].cc) { + max_compute_capability = g_device_caps[i].cc; } } } @@ -11986,12 +12218,16 @@ static int64_t get_row_rounding(ggml_type type) { case GGML_TYPE_Q3_K: case GGML_TYPE_Q4_K: case GGML_TYPE_Q5_K: + case GGML_TYPE_IQ2_XXS: + case GGML_TYPE_IQ2_XS: + case GGML_TYPE_IQ3_XXS: return max_compute_capability >= VER_GEN9 ? 
128 : 64; case GGML_TYPE_Q6_K: return 64; default: GGML_ASSERT(false); } + } inline void ggml_sycl_op_mul_mat_vec_q( @@ -12176,27 +12412,22 @@ inline void ggml_sycl_op_mul_mat_sycl( const int64_t row_diff = row_high - row_low; int id; - int device_id = dpct::dev_mgr::instance().current_device_id(); SYCL_CHECK( - CHECK_TRY_ERROR(id = get_current_device_index())); + CHECK_TRY_ERROR(id = get_current_device_id())); // the main device has a larger memory buffer to hold the results from all GPUs // ldc == nrows of the matrix that cuBLAS writes into - int ldc = dst->backend == GGML_BACKEND_TYPE_GPU && device_id == g_main_device ? ne0 : row_diff; + int ldc = dst->backend == GGML_BACKEND_TYPE_GPU && id == g_main_device ? ne0 : row_diff; #ifdef GGML_SYCL_F16 bool use_fp16 = true; // TODO(Yu) SYCL capability check #else bool use_fp16 = false; #endif - // if (compute_capability >= VER_GEN9 && (src0->type == GGML_TYPE_F16 || - // ggml_is_quantized(src0->type)) && ggml_is_contiguous(src0) && row_diff == - // src0->ne[1] && dst->op_params[0] == GGML_PREC_DEFAULT) { if ((src0->type == GGML_TYPE_F16 || ggml_is_quantized(src0->type)) && use_fp16 && ggml_is_contiguous(src0) && row_diff == src0->ne[1] && dst->op_params[0] == GGML_PREC_DEFAULT) { - // convert src0 and src1 to fp16, multiply as fp16, convert dst to fp32 // GGML_SYCL_DEBUG("ggml_sycl_op_mul_mat_sycl - fp16 path\n"); sycl_pool_alloc src0_as_f16; if (src0->type != GGML_TYPE_F16) { @@ -12225,7 +12456,6 @@ inline void ggml_sycl_op_mul_mat_sycl( const sycl::half alpha_f16 = 1.0f; const sycl::half beta_f16 = 0.0f; - SYCL_CHECK(CHECK_TRY_ERROR(g_sycl_handles[id] = stream)); SYCL_CHECK(CHECK_TRY_ERROR(dpct::gemm( *g_sycl_handles[id], oneapi::mkl::transpose::trans, @@ -12241,14 +12471,21 @@ inline void ggml_sycl_op_mul_mat_sycl( else { // GGML_SYCL_DEBUG("ggml_sycl_op_mul_mat_sycl - fp32 path\n"); sycl_pool_alloc src0_ddq_as_f32; - + sycl_pool_alloc src1_ddq_as_f32; if (src0->type != GGML_TYPE_F32) { const to_fp32_sycl_t to_fp32_sycl = ggml_get_to_fp32_sycl(src0->type); GGML_ASSERT(to_fp32_sycl != nullptr); src0_ddq_as_f32.alloc(row_diff*ne00); to_fp32_sycl(src0_dd_i, src0_ddq_as_f32.get(), row_diff*ne00, stream); } + if (src1->type != GGML_TYPE_F32) { + const to_fp32_sycl_t to_fp32_sycl = ggml_get_to_fp32_sycl(src1->type); + GGML_ASSERT(to_fp32_sycl != nullptr); + src1_ddq_as_f32.alloc(src1_ncols*ne10); + to_fp32_sycl(src1_ddf_i, src1_ddq_as_f32.get(), src1_ncols*ne10, stream); + } const float * src0_ddf_i = src0->type == GGML_TYPE_F32 ? (const float *) src0_dd_i : src0_ddq_as_f32.get(); + const float * src1_ddf1_i = src1->type == GGML_TYPE_F32 ? 
(const float *) src1_ddf_i : src1_ddq_as_f32.get(); const float alpha = 1.0f; const float beta = 0.0f; @@ -12261,7 +12498,6 @@ inline void ggml_sycl_op_mul_mat_sycl( src1_ddf_i, ne10, dpct::get_value(&beta, *g_sycl_handles[id]), dst_dd_i, ldc))); } - (void) dst; (void) src1_ddq_i; (void) src1_padded_row_size; @@ -12382,6 +12618,48 @@ inline void ggml_sycl_op_alibi(const ggml_tensor *src0, const ggml_tensor *src1, (void) src1_dd; } +static void ggml_sycl_op_pool2d(const ggml_tensor *src0, + const ggml_tensor *src1, ggml_tensor *dst, + const float *src0_dd, const float *src1_dd, + float *dst_dd, const dpct::queue_ptr &main_stream) { + + GGML_ASSERT(src0->type == GGML_TYPE_F32); + GGML_ASSERT( dst->type == GGML_TYPE_F32); + + const int32_t * opts = (const int32_t *)dst->op_params; + enum ggml_op_pool op = static_cast(opts[0]); + const int k0 = opts[1]; + const int k1 = opts[2]; + const int s0 = opts[3]; + const int s1 = opts[4]; + const int p0 = opts[5]; + const int p1 = opts[6]; + + const int64_t IH = src0->ne[1]; + const int64_t IW = src0->ne[0]; + + const int64_t N = dst->ne[3]; + const int64_t OC = dst->ne[2]; + const int64_t OH = dst->ne[1]; + const int64_t OW = dst->ne[0]; + + const int parallel_elements = N * OC * OH * OW; + const int num_blocks = (parallel_elements + SYCL_POOL2D_BLOCK_SIZE - 1) / SYCL_POOL2D_BLOCK_SIZE; + sycl::range<3> block_nums(1, 1, num_blocks); + main_stream->parallel_for( + sycl::nd_range<3>(block_nums * + sycl::range<3>(1, 1, SYCL_IM2COL_BLOCK_SIZE), + sycl::range<3>(1, 1, SYCL_IM2COL_BLOCK_SIZE)), + [=](sycl::nd_item<3> item_ct1) { + pool2d_nchw_kernel(IH, IW, OH, OW, k1, k0, s1, s0, p1, p0, + parallel_elements, src0_dd, dst_dd, op, + item_ct1); + }); + + (void) src1; + (void) src1_dd; +} + inline void ggml_sycl_op_im2col(const ggml_tensor *src0, const ggml_tensor *src1, ggml_tensor *dst, const float *src0_dd, const float *src1_dd, @@ -12606,12 +12884,12 @@ static void ggml_sycl_op_flatten(const ggml_tensor *src0, sycl_pool_alloc dst_f; ggml_sycl_set_device(g_main_device); - dpct::queue_ptr main_stream = g_syclStreams[g_main_device_index][0]; - // GGML_SYCL_DEBUG("g_main_device_index=%d, main_stream=%p src0_on_device=%d, src1_on_device=%d, dst_on_device=%d\n", - // g_main_device_index, main_stream, src0_on_device, src1_on_device, dst_on_device); + dpct::queue_ptr main_stream = g_syclStreams[g_main_device][0]; + // GGML_SYCL_DEBUG("g_main_device=%d, main_stream=%p src0_on_device=%d, src1_on_device=%d, dst_on_device=%d\n", + // g_main_device, main_stream, src0_on_device, src1_on_device, dst_on_device); if (src0_on_device) { - src0_ddf = (float *) src0_extra->data_device[g_main_device_index]; + src0_ddf = (float *) src0_extra->data_device[g_main_device]; } else { src0_ddf = src0_f.alloc(ggml_nelements(src0)); // GGML_SYCL_DEBUG("before ggml_sycl_cpy_tensor_2d src0_ddf=%p, src0=%p\n", src0_ddf, src0); @@ -12620,15 +12898,14 @@ static void ggml_sycl_op_flatten(const ggml_tensor *src0, if (use_src1) { if (src1_on_device) { - src1_ddf = (float *) src1_extra->data_device[g_main_device_index]; + src1_ddf = (float *) src1_extra->data_device[g_main_device]; } else { src1_ddf = src1_f.alloc(ggml_nelements(src1)); SYCL_CHECK(ggml_sycl_cpy_tensor_2d(src1_ddf, src1, 0, 0, 0, nrows1, main_stream)); } } if (dst_on_device) { - dst_ddf = (float *) dst_extra->data_device[g_main_device_index]; - // printf("zjy dst_ddf=%p main_stream=%p g_main_device_index=%d\n", dst_ddf, main_stream, g_main_device_index); + dst_ddf = (float *) dst_extra->data_device[g_main_device]; } else { 
dst_ddf = dst_f.alloc(ggml_nelements(dst)); } @@ -12672,21 +12949,19 @@ static void ggml_sycl_set_peer_access(const int n_tokens) { } #ifdef NDEBUG - for (int id = 0; id < g_device_count; ++id) { - SYCL_CHECK(ggml_sycl_set_device(get_device_id_by_index(id))); + for (int i = 0; i < g_device_count; ++i) { + SYCL_CHECK(ggml_sycl_set_device(i)); // SYCL_CHECK(syclDeviceSynchronize()); } - for (int id = 0; id < g_device_count; ++id) { - SYCL_CHECK(ggml_sycl_set_device(get_device_id_by_index(id))); - int device_id = g_device_caps[id].device_id; + for (int i = 0; i < g_device_count; ++i) { + SYCL_CHECK(ggml_sycl_set_device(i)); for (int id_other = 0; id_other < g_device_count; ++id_other) { - int device_id_other = g_device_caps[id_other].device_id; - if (device_id == id_other) { + if (i == id_other) { continue; } - if (device_id != g_main_device && device_id_other != g_main_device) { + if (i != g_main_device && id_other != g_main_device) { continue; } @@ -12706,6 +12981,10 @@ static void ggml_sycl_set_peer_access(const int n_tokens) { peer_access_enabled = enable_peer_access; } +struct ggml_backend_sycl_split_buffer_type_context { + std::array tensor_split; +}; + static void ggml_sycl_op_mul_mat(const ggml_tensor *src0, const ggml_tensor *src1, ggml_tensor *dst, ggml_sycl_op_mul_mat_t op, @@ -12752,80 +13031,90 @@ static void ggml_sycl_op_mul_mat(const ggml_tensor *src0, GGML_ASSERT(!(split && ne03 > 1)); GGML_ASSERT(!(split && ne02 < ne12)); - // dd = data device - char * src0_dd[GGML_SYCL_MAX_DEVICES] = {nullptr}; - float * src1_ddf[GGML_SYCL_MAX_DEVICES] = {nullptr}; // float - char * src1_ddq[GGML_SYCL_MAX_DEVICES] = {nullptr}; // q8_1 - float * dst_dd[GGML_SYCL_MAX_DEVICES] = {nullptr}; + std::array tensor_split; + if (split) { + // TODO: check that src0->buffer->buft is a split buffer type, replace GGML_BACKEND_TYPE_GPU_SPLIT check + // GGML_ASSERT(src0->buffer != nullptr && src0->buffer->buft == ...); + ggml_backend_sycl_split_buffer_type_context * buft_ctx = (ggml_backend_sycl_split_buffer_type_context *) src0->buffer->buft->context; + tensor_split = buft_ctx->tensor_split; + } - // as = actual size - size_t src0_as[GGML_SYCL_MAX_DEVICES] = {0}; - size_t src1_asf[GGML_SYCL_MAX_DEVICES] = {0}; - size_t src1_asq[GGML_SYCL_MAX_DEVICES] = {0}; - size_t dst_as[GGML_SYCL_MAX_DEVICES] = {0}; + struct dev_data { + sycl_pool_alloc src0_dd_alloc; + sycl_pool_alloc src1_ddf_alloc; + sycl_pool_alloc src1_ddq_alloc; + sycl_pool_alloc dst_dd_alloc; - int64_t row_low[GGML_SYCL_MAX_DEVICES]; - int64_t row_high[GGML_SYCL_MAX_DEVICES]; + char *src0_dd = nullptr; + float *src1_ddf = nullptr; // float + char *src1_ddq = nullptr; // q8_1 + float *dst_dd = nullptr; + + int64_t row_low; + int64_t row_high; + }; + + dev_data dev[GGML_SYCL_MAX_DEVICES]; int used_devices = 0; + dpct::queue_ptr main_stream = g_syclStreams[g_main_device][0]; - for (int64_t id = 0; id < g_device_count; ++id) { + for (int i = 0; i < g_device_count; ++i) { // by default, use all rows - row_low[id] = 0; - row_high[id] = ne01; + dev[i].row_low = 0; + dev[i].row_high = ne01; // for multi GPU, get the row boundaries from tensor split // and round to mul_mat_q tile sizes if (split) { - const int64_t rounding = get_row_rounding(src0->type); + const int64_t rounding = get_row_rounding(src0->type, tensor_split); - if (id != 0) { - row_low[id] = ne01*g_tensor_split[id]; - if (row_low[id] < ne01) { - row_low[id] -= row_low[id] % rounding; + if (i != 0) { + dev[i].row_low = ne01*tensor_split[i]; + if (dev[i].row_low < ne01) { + dev[i].row_low -= 
dev[i].row_low % rounding; } } - if (id != g_device_count - 1) { - row_high[id] = ne01*g_tensor_split[id + 1]; - if (row_high[id] < ne01) { - row_high[id] -= row_high[id] % rounding; + if (i != g_device_count - 1) { + dev[i].row_high = ne01*tensor_split[i + 1]; + if (dev[i].row_high < ne01) { + dev[i].row_high -= dev[i].row_high % rounding; } } } } - for (int64_t id = 0; id < g_device_count; ++id) { - if ((!split && id != g_main_device_index) || row_low[id] == row_high[id]) { + for (int i = 0; i < g_device_count; ++i) { + if ((!split && i != g_main_device) || dev[i].row_low == dev[i].row_high) { continue; } used_devices++; - const bool src1_on_device = src1->backend == GGML_BACKEND_TYPE_GPU && id == g_main_device_index; - const bool dst_on_device = dst->backend == GGML_BACKEND_TYPE_GPU && id == g_main_device_index; + const bool src1_on_device = src1->backend == GGML_BACKEND_TYPE_GPU && i == g_main_device; + const bool dst_on_device = dst->backend == GGML_BACKEND_TYPE_GPU && i == g_main_device; - ggml_sycl_set_device(get_device_id_by_index(id)); - const dpct::queue_ptr stream = g_syclStreams[id][0]; + ggml_sycl_set_device(i); + dpct::queue_ptr stream = g_syclStreams[i][0]; if (src0_on_device && src0_is_contiguous) { - src0_dd[id] = (char *) src0_extra->data_device[id]; + dev[i].src0_dd = (char *) src0_extra->data_device[i]; } else { - // const size_t size_src0_ddq = split ? (row_high[id]-row_low[id])*ne00 * src0_ts/src0_bs : ggml_nbytes(src0); - src0_dd[id] = (char *) ggml_sycl_pool_malloc(ggml_nbytes(src0), &src0_as[id]); + dev[i].src0_dd = dev[i].src0_dd_alloc.alloc(ggml_nbytes(src0)); } if (src1_on_device && src1_is_contiguous) { - src1_ddf[id] = (float *) src1_extra->data_device[id]; + dev[i].src1_ddf = (float *) src1_extra->data_device[i]; } else { - src1_ddf[id] = (float *) ggml_sycl_pool_malloc(ggml_nbytes(src1), &src1_asf[id]); + dev[i].src1_ddf = dev[i].src1_ddf_alloc.alloc(ggml_nelements(src1)); } if (convert_src1_to_q8_1) { - src1_ddq[id] = (char *) ggml_sycl_pool_malloc(nrows1*src1_padded_col_size*q8_1_ts/q8_1_bs, &src1_asq[id]); + dev[i].src1_ddq = dev[i].src1_ddq_alloc.alloc(nrows1*src1_padded_col_size*q8_1_ts/q8_1_bs); if (src1_on_device && src1_is_contiguous) { - quantize_row_q8_1_sycl(src1_ddf[id], src1_ddq[id], ne10, nrows1, src1_padded_col_size, stream); + quantize_row_q8_1_sycl(dev[i].src1_ddf, dev[i].src1_ddq, ne10, nrows1, src1_padded_col_size, stream); /* DPCT1010:90: SYCL uses exceptions to report errors and does not use the error codes. The call was replaced with 0. You need to @@ -12836,25 +13125,25 @@ static void ggml_sycl_op_mul_mat(const ggml_tensor *src0, } if (dst_on_device) { - dst_dd[id] = (float *) dst_extra->data_device[id]; + dev[i].dst_dd = (float *) dst_extra->data_device[i]; } else { - const size_t size_dst_ddf = split ? (row_high[id]-row_low[id])*ne1*sizeof(float) : ggml_nbytes(dst); - dst_dd[id] = (float *) ggml_sycl_pool_malloc(size_dst_ddf, &dst_as[id]); + const size_t size_dst_ddf = split ? (dev[i].row_high - dev[i].row_low)*ne1 : ggml_nelements(dst); + dev[i].dst_dd = dev[i].dst_dd_alloc.alloc(size_dst_ddf); } } // if multiple devices are used they need to wait for the main device // here an event is recorded that signals that the main device has finished calculating the input data if (split && used_devices > 1) { - SYCL_CHECK(ggml_sycl_set_device(g_main_device)); + ggml_sycl_set_device(g_main_device); /* DPCT1024:91: The original code returned the error code that was further consumed by the program logic. This original code was replaced with 0. 
You may need to rewrite the program logic consuming the error code. */ SYCL_CHECK(CHECK_TRY_ERROR( - *src0_extra->events[g_main_device_index][0] = - g_syclStreams[g_main_device_index][0]->ext_oneapi_submit_barrier())); + *src0_extra->events[g_main_device][0] = + g_syclStreams[g_main_device][0]->ext_oneapi_submit_barrier())); } const int64_t src1_col_stride = split && used_devices > 1 ? MUL_MAT_SRC1_COL_STRIDE : ne11; @@ -12862,22 +13151,27 @@ static void ggml_sycl_op_mul_mat(const ggml_tensor *src0, const int64_t is = split ? (src1_col_0/src1_col_stride) % MAX_STREAMS : 0; const int64_t src1_ncols = src1_col_0 + src1_col_stride > ne11 ? ne11 - src1_col_0 : src1_col_stride; - for (int64_t id = 0; id < g_device_count; ++id) { - if ((!split && id != g_main_device_index) || row_low[id] == row_high[id]) { + for (int i = 0; i < g_device_count; ++i) { + if ((!split && i != g_main_device) || dev[i].row_low == dev[i].row_high) { continue; } - const bool src1_on_device = src1->backend == GGML_BACKEND_TYPE_GPU && id == g_main_device_index; - const bool dst_on_device = dst->backend == GGML_BACKEND_TYPE_GPU && id == g_main_device_index; - const int64_t row_diff = row_high[id] - row_low[id]; + const bool src1_on_device = src1->backend == GGML_BACKEND_TYPE_GPU && i == g_main_device; + const bool dst_on_device = dst->backend == GGML_BACKEND_TYPE_GPU && i == g_main_device; + const int64_t row_diff = dev[i].row_high - dev[i].row_low; - ggml_sycl_set_device(get_device_id_by_index(id)); - const dpct::queue_ptr stream = g_syclStreams[id][is]; + ggml_sycl_set_device(i); + dpct::queue_ptr stream = g_syclStreams[i][is]; // wait for main GPU data if necessary - if (split && (id != g_main_device_index || is != 0)) { + if (split && (i != g_main_device || is != 0)) { + /* + DPCT1009:163: SYCL uses exceptions to report errors and does not + use the error codes. The original code was commented out and a + warning string was inserted. You need to rewrite this code. + */ SYCL_CHECK(CHECK_TRY_ERROR(stream->ext_oneapi_submit_barrier( - {*src0_extra->events[g_main_device_index][0]}))); + {*src0_extra->events[g_main_device][0]}))); } for (int64_t i0 = 0; i0 < ne13*ne12; ++i0) { @@ -12887,30 +13181,32 @@ static void ggml_sycl_op_mul_mat(const ggml_tensor *src0, const size_t src1_ddq_i_offset = (i0*ne11 + src1_col_0) * src1_padded_col_size*q8_1_ts/q8_1_bs; // for split tensors the data begins at i0 == i0_offset_low - char * src0_dd_i = src0_dd[id] + (i0/i02_divisor) * (ne01*ne00*src0_ts)/src0_bs; - float * src1_ddf_i = src1_ddf[id] + (i0*ne11 + src1_col_0) * ne10; - char * src1_ddq_i = src1_ddq[id] + src1_ddq_i_offset; - float * dst_dd_i = dst_dd[id] + (i0*ne1 + src1_col_0) * (dst_on_device ? ne0 : row_diff); + char * src0_dd_i = dev[i].src0_dd + (i0/i02_divisor) * (ne01*ne00*src0_ts)/src0_bs; + float * src1_ddf_i = dev[i].src1_ddf + (i0*ne11 + src1_col_0) * ne10; + char * src1_ddq_i = dev[i].src1_ddq + src1_ddq_i_offset; + float * dst_dd_i = dev[i].dst_dd + (i0*ne1 + src1_col_0) * (dst_on_device ? 
ne0 : row_diff); // the main device memory buffer can be on VRAM scratch, with space for all partial results // in that case an offset on dst_ddf_i is needed - if (dst->backend == GGML_BACKEND_TYPE_GPU && id == g_main_device_index) { - dst_dd_i += row_low[id]; // offset is 0 if no tensor split + if (dst->backend == GGML_BACKEND_TYPE_GPU && i == g_main_device) { + dst_dd_i += dev[i].row_low; // offset is 0 if no tensor split } // copy src0, src1 to device if necessary if (src1->backend == GGML_BACKEND_TYPE_GPU && src1_is_contiguous) { - if (id != g_main_device_index) { + if (i != g_main_device) { if (convert_src1_to_q8_1) { - char * src1_ddq_i_source = src1_ddq[g_main_device_index] + src1_ddq_i_offset; - SYCL_CHECK(CHECK_TRY_ERROR(stream->memcpy( + char * src1_ddq_i_source = dev[g_main_device].src1_ddq + src1_ddq_i_offset; + SYCL_CHECK(CHECK_TRY_ERROR(stream->memcpy( src1_ddq_i, src1_ddq_i_source, src1_ncols * src1_padded_col_size * q8_1_ts / q8_1_bs))); } else { - float * src1_ddf_i_source = (float *) src1_extra->data_device[g_main_device_index]; + + float * src1_ddf_i_source = (float *) src1_extra->data_device[g_main_device]; src1_ddf_i_source += (i0*ne11 + src1_col_0) * ne10; - SYCL_CHECK(CHECK_TRY_ERROR(stream->memcpy( + + SYCL_CHECK(CHECK_TRY_ERROR(dev2dev_memcpy(*stream, *main_stream, src1_ddf_i, src1_ddf_i_source, src1_ncols * ne10 * sizeof(float)))); } @@ -12933,14 +13229,14 @@ static void ggml_sycl_op_mul_mat(const ggml_tensor *src0, } if (src1_col_0 == 0 && (!src0_on_device || !src0_is_contiguous) && i02 % i02_divisor == 0) { - SYCL_CHECK(ggml_sycl_cpy_tensor_2d(src0_dd_i, src0, i03, i02/i02_divisor, row_low[id], row_high[id], stream)); + SYCL_CHECK(ggml_sycl_cpy_tensor_2d(src0_dd_i, src0, i03, i02/i02_divisor, dev[i].row_low, dev[i].row_high, stream)); } if (src1->type == GGML_TYPE_F16) { src1_padded_col_size = (i0 * ne11 + src1_col_0) * ne10; } // do the computation op(src0, src1, dst, src0_dd_i, src1_ddf_i, src1_ddq_i, dst_dd_i, - row_low[id], row_high[id], src1_ncols, src1_padded_col_size, stream); + dev[i].row_low, dev[i].row_high, src1_ncols, src1_padded_col_size, stream); /* DPCT1010:93: SYCL uses exceptions to report errors and does not use the error codes. The call was replaced with 0. You need to @@ -12956,7 +13252,7 @@ static void ggml_sycl_op_mul_mat(const ggml_tensor *src0, dst_off_device = dst->data; kind = dpct::device_to_host; } else if (dst->backend == GGML_BACKEND_TYPE_GPU) { - dst_off_device = dst_extra->data_device[g_main_device_index]; + dst_off_device = dst_extra->data_device[g_main_device]; kind = dpct::device_to_device; } else { GGML_ASSERT(false); @@ -12969,11 +13265,29 @@ static void ggml_sycl_op_mul_mat(const ggml_tensor *src0, // If dst is a vector with ne0 == 1 then you don't have to do this but it still produces correct results. float * dhf_dst_i = (float *) ((char *) dst_off_device + i02*nb2 + i03*nb3); GGML_ASSERT(dst->nb[1] == ne0*sizeof(float)); - dhf_dst_i += src1_col_0*ne0 + row_low[id]; - SYCL_CHECK(CHECK_TRY_ERROR(dpct::async_dpct_memcpy( - dhf_dst_i, ne0 * sizeof(float), dst_dd_i, - row_diff * sizeof(float), row_diff * sizeof(float), - src1_ncols, kind, *stream))); + dhf_dst_i += src1_col_0*ne0 + dev[i].row_low; + + //todo, dirty solution. Need be updated when device2device memcpy() is supported. 
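The "todo, dirty solution" comment above refers to the workaround implemented in the lines that follow: because a direct device-to-device memcpy is not used here, the partial results are bounced through a temporary host buffer, with a device-to-host copy on the worker stream, a synchronization, and a host-to-device copy on the main stream. A condensed sketch of that staging idea, with illustrative queue and buffer names and pointers assumed to be USM device allocations:

#include <sycl/sycl.hpp>
#include <cstdlib>

// Sketch: stage a device-to-device transfer through the host when a direct
// peer copy is unavailable (names, queues and synchronization are simplified).
static void staged_device_to_device_copy(sycl::queue &src_q, sycl::queue &dst_q,
                                         const float *src_dev, float *dst_dev, size_t n) {
    float *host_buf = static_cast<float *>(std::malloc(n * sizeof(float)));
    src_q.memcpy(host_buf, src_dev, n * sizeof(float)).wait();  // device -> host on the worker queue
    dst_q.memcpy(dst_dev, host_buf, n * sizeof(float)).wait();  // host -> device on the main queue
    std::free(host_buf);
}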
+ if (kind == dpct::device_to_device) { + size_t dst_size = ggml_nbytes_pad(dst); + float *host_buf = (float *)malloc(dst_size); + SYCL_CHECK(CHECK_TRY_ERROR(dpct::async_dpct_memcpy( + host_buf, ne0 * sizeof(float), dst_dd_i, + row_diff * sizeof(float), row_diff * sizeof(float), + src1_ncols, dpct::device_to_host, *stream))); + dpct::dev_mgr::instance().get_device(g_sycl_gpu_mgr->gpus[i]).queues_wait_and_throw(); + SYCL_CHECK(CHECK_TRY_ERROR(dpct::async_dpct_memcpy( + dhf_dst_i, ne0 * sizeof(float), host_buf, + row_diff * sizeof(float), row_diff * sizeof(float), + src1_ncols, dpct::host_to_device, *main_stream))); + dpct::dev_mgr::instance().get_device(g_sycl_gpu_mgr->gpus[g_main_device]).queues_wait_and_throw(); + free(host_buf); + } else { + SYCL_CHECK(CHECK_TRY_ERROR(dpct::async_dpct_memcpy( + dhf_dst_i, ne0 * sizeof(float), dst_dd_i, + row_diff * sizeof(float), row_diff * sizeof(float), + src1_ncols, kind, *stream))); + } } else { float * dhf_dst_i = (float *) ((char *) dst_off_device + i02*nb2 + i03*nb3); GGML_ASSERT(dst->nb[1] == ne0*sizeof(float)); @@ -12985,7 +13299,7 @@ static void ggml_sycl_op_mul_mat(const ggml_tensor *src0, } // add event for the main device to wait on until other device is done - if (split && (id != g_main_device_index || is != 0)) { + if (split && (i != g_main_device || is != 0)) { /* DPCT1024:94: The original code returned the error code that was further consumed by the program logic. This original @@ -12993,48 +13307,27 @@ static void ggml_sycl_op_mul_mat(const ggml_tensor *src0, program logic consuming the error code. */ SYCL_CHECK(CHECK_TRY_ERROR( - *src0_extra->events[id][is] = + *src0_extra->events[i][is] = stream->ext_oneapi_submit_barrier())); } } } } - for (int64_t id = 0; id < g_device_count; ++id) { - if ((!split && id != g_main_device_index) || row_low[id] == row_high[id]) { - continue; - } - SYCL_CHECK(ggml_sycl_set_device(get_device_id_by_index(id))); - - // free buffers again when done - if (dst_as[id] > 0) { - ggml_sycl_pool_free(dst_dd[id], dst_as[id]); - } - if (src1_asq[id] > 0) { - ggml_sycl_pool_free(src1_ddq[id], src1_asq[id]); - } - if (src1_asf[id] > 0) { - ggml_sycl_pool_free(src1_ddf[id], src1_asf[id]); - } - if (src0_as[id] > 0) { - ggml_sycl_pool_free(src0_dd[id], src0_as[id]); - } - } - // main device waits for all other devices to be finished if (split && g_device_count > 1) { int64_t is_max = (ne11 + MUL_MAT_SRC1_COL_STRIDE - 1) / MUL_MAT_SRC1_COL_STRIDE; is_max = is_max <= MAX_STREAMS ? 
is_max : MAX_STREAMS; - SYCL_CHECK(ggml_sycl_set_device(g_main_device)); - for (int64_t id = 0; id < g_device_count; ++id) { - if (row_low[id] == row_high[id]) { + ggml_sycl_set_device(g_main_device); + for (int i = 0; i < g_device_count; ++i) { + if (dev[i].row_low == dev[i].row_high) { continue; } for (int64_t is = 0; is < is_max; ++is) { SYCL_CHECK(CHECK_TRY_ERROR( - g_syclStreams[g_main_device_index][0]->ext_oneapi_submit_barrier( - {*src0_extra->events[id][is]}))); + g_syclStreams[g_main_device][0]->ext_oneapi_submit_barrier( + {*src0_extra->events[i][is]}))); } } } @@ -13051,110 +13344,132 @@ catch (sycl::exception const &exc) { std::exit(1); } + static void ggml_sycl_repeat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { GGML_SYCL_DEBUG("call %s\n", __func__); ggml_sycl_op_flatten(src0, src1, dst, ggml_sycl_op_repeat); + GGML_SYCL_DEBUG("call %s done\n", __func__); } static void ggml_sycl_get_rows(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { GGML_SYCL_DEBUG("call %s\n", __func__); ggml_sycl_op_flatten(src0, src1, dst, ggml_sycl_op_get_rows); + GGML_SYCL_DEBUG("call %s done\n", __func__); } static void ggml_sycl_add(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { GGML_SYCL_DEBUG("call %s\n", __func__); ggml_sycl_op_flatten(src0, src1, dst, ggml_sycl_op_add); - // log_tensor_with_cnt("log_ggml_sycl_add_src0", (struct ggml_tensor *) src0, 6); - // log_tensor_with_cnt("log_ggml_sycl_add_src1", (struct ggml_tensor *)src1, 6); - // log_tensor_with_cnt("log_ggml_sycl_add_dst", dst, 6); + GGML_SYCL_DEBUG("call %s done\n", __func__); } static void ggml_sycl_acc(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { GGML_SYCL_DEBUG("call %s\n", __func__); ggml_sycl_op_flatten(src0, src1, dst, ggml_sycl_op_acc); + GGML_SYCL_DEBUG("call %s done\n", __func__); } static void ggml_sycl_mul(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { GGML_SYCL_DEBUG("call %s\n", __func__); ggml_sycl_op_flatten(src0, src1, dst, ggml_sycl_op_mul); - // log_tensor_with_cnt("log_ggml_sycl_mul_src0", (struct ggml_tensor *)src0, 6); - // log_tensor_with_cnt("log_ggml_sycl_mul_src1", (struct ggml_tensor *)src1, 6); - // log_tensor_with_cnt("log_ggml_sycl_mul_dst", dst, 6); - + GGML_SYCL_DEBUG("call %s done\n", __func__); } static void ggml_sycl_div(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { GGML_SYCL_DEBUG("call %s\n", __func__); ggml_sycl_op_flatten(src0, src1, dst, ggml_sycl_op_div); + GGML_SYCL_DEBUG("call %s done\n", __func__); } static void ggml_sycl_gelu(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { GGML_SYCL_DEBUG("call %s\n", __func__); ggml_sycl_op_flatten(src0, src1, dst, ggml_sycl_op_gelu); + GGML_SYCL_DEBUG("call %s done\n", __func__); } static void ggml_sycl_silu(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { GGML_SYCL_DEBUG("call %s\n", __func__); ggml_sycl_op_flatten(src0, src1, dst, ggml_sycl_op_silu); + GGML_SYCL_DEBUG("call %s done\n", __func__); } static void ggml_sycl_gelu_quick(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { GGML_SYCL_DEBUG("call %s\n", __func__); ggml_sycl_op_flatten(src0, src1, dst, ggml_sycl_op_gelu_quick); + GGML_SYCL_DEBUG("call %s done\n", __func__); } static void ggml_sycl_tanh(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { GGML_SYCL_DEBUG("call %s\n", __func__); ggml_sycl_op_flatten(src0, src1, dst, ggml_sycl_op_tanh); + 
GGML_SYCL_DEBUG("call %s done\n", __func__); } static void ggml_sycl_relu(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { GGML_SYCL_DEBUG("call %s\n", __func__); ggml_sycl_op_flatten(src0, src1, dst, ggml_sycl_op_relu); + GGML_SYCL_DEBUG("call %s done\n", __func__); +} + +static void ggml_sycl_hardsigmoid(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + GGML_SYCL_DEBUG("call %s\n", __func__); + ggml_sycl_op_flatten(src0, src1, dst, ggml_sycl_op_hardsigmoid); + GGML_SYCL_DEBUG("call %s done\n", __func__); +} + +static void ggml_sycl_hardswish(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + GGML_SYCL_DEBUG("call %s\n", __func__); + ggml_sycl_op_flatten(src0, src1, dst, ggml_sycl_op_hardswish); + GGML_SYCL_DEBUG("call %s done\n", __func__); } static void ggml_sycl_leaky_relu(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { GGML_SYCL_DEBUG("call %s\n", __func__); ggml_sycl_op_flatten(src0, src1, dst, ggml_sycl_op_leaky_relu); + GGML_SYCL_DEBUG("call %s done\n", __func__); } static void ggml_sycl_sqr(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { GGML_SYCL_DEBUG("call %s\n", __func__); ggml_sycl_op_flatten(src0, src1, dst, ggml_sycl_op_sqr); + GGML_SYCL_DEBUG("call %s done\n", __func__); } static void ggml_sycl_norm(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { GGML_SYCL_DEBUG("call %s\n", __func__); ggml_sycl_op_flatten(src0, src1, dst, ggml_sycl_op_norm); + GGML_SYCL_DEBUG("call %s done\n", __func__); } static void ggml_sycl_group_norm(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { GGML_SYCL_DEBUG("call %s\n", __func__); ggml_sycl_op_flatten(src0, src1, dst, ggml_sycl_op_group_norm); + GGML_SYCL_DEBUG("call %s done\n", __func__); } static void ggml_sycl_concat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { GGML_SYCL_DEBUG("call %s\n", __func__); ggml_sycl_op_flatten(src0, src1, dst, ggml_sycl_op_concat); + GGML_SYCL_DEBUG("call %s done\n", __func__); } static void ggml_sycl_upscale(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { GGML_SYCL_DEBUG("call %s\n", __func__); ggml_sycl_op_flatten(src0, src1, dst, ggml_sycl_op_upscale); + GGML_SYCL_DEBUG("call %s done\n", __func__); } static void ggml_sycl_pad(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { GGML_SYCL_DEBUG("call %s\n", __func__); ggml_sycl_op_flatten(src0, src1, dst, ggml_sycl_op_pad); + GGML_SYCL_DEBUG("call %s done\n", __func__); } static void ggml_sycl_rms_norm(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { GGML_SYCL_DEBUG("call %s\n", __func__); ggml_sycl_op_flatten(src0, src1, dst, ggml_sycl_op_rms_norm); - // log_tensor_with_cnt("log_ggml_sycl_rms_norm_src0", (struct ggml_tensor *)src0, 6); - // log_tensor_with_cnt("log_ggml_sycl_rms_norm_src1", (struct ggml_tensor *)src1, 6); - // log_tensor_with_cnt("log_ggml_sycl_rms_norm_dst", dst, 6); + GGML_SYCL_DEBUG("call %s done\n", __func__); } bool ggml_sycl_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst) { @@ -13189,16 +13504,16 @@ static void ggml_sycl_mul_mat_vec_p021(const ggml_tensor *src0, const int64_t ne12 = src1->ne[2]; SYCL_CHECK(ggml_sycl_set_device(g_main_device)); - dpct::queue_ptr main_stream = g_syclStreams[g_main_device_index][0]; + dpct::queue_ptr main_stream = g_syclStreams[g_main_device][0]; ggml_tensor_extra_gpu * src0_extra = 
(ggml_tensor_extra_gpu *) src0->extra; - void * src0_ddq = src0_extra->data_device[g_main_device_index]; + void * src0_ddq = src0_extra->data_device[g_main_device]; ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra; - float * src1_ddf = (float *) src1_extra->data_device[g_main_device_index]; + float * src1_ddf = (float *) src1_extra->data_device[g_main_device]; ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra; - float * dst_ddf = (float *) dst_extra->data_device[g_main_device_index]; + float * dst_ddf = (float *) dst_extra->data_device[g_main_device]; ggml_mul_mat_p021_f16_f32_sycl(src0_ddq, src1_ddf, dst_ddf, ne00, ne01, ne02, ne12, main_stream); } @@ -13228,16 +13543,16 @@ static void ggml_sycl_mul_mat_vec_nc(const ggml_tensor *src0, const int64_t ne12 = src1->ne[2]; SYCL_CHECK(ggml_sycl_set_device(g_main_device)); - dpct::queue_ptr main_stream = g_syclStreams[g_main_device_index][0]; + dpct::queue_ptr main_stream = g_syclStreams[g_main_device][0]; ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra; - void * src0_ddq = src0_extra->data_device[g_main_device_index]; + void * src0_ddq = src0_extra->data_device[g_main_device]; ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra; - float * src1_ddf = (float *) src1_extra->data_device[g_main_device_index]; + float * src1_ddf = (float *) src1_extra->data_device[g_main_device]; ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra; - float * dst_ddf = (float *) dst_extra->data_device[g_main_device_index]; + float * dst_ddf = (float *) dst_extra->data_device[g_main_device]; const int64_t row_stride_x = nb01 / sizeof(sycl::half); const int64_t channel_stride_x = nb02 / sizeof(sycl::half); @@ -13280,38 +13595,37 @@ static void ggml_sycl_mul_mat_batched_sycl(const ggml_tensor *src0, ggml_tensor *dst) try { GGML_ASSERT(!ggml_is_transposed(src0)); GGML_ASSERT(!ggml_is_transposed(src1)); - GGML_ASSERT(src0->backend != GGML_BACKEND_TYPE_GPU_SPLIT); GGML_ASSERT(src0->type == GGML_TYPE_F16); GGML_TENSOR_BINARY_OP_LOCALS - const int64_t ne_dst = ggml_nelements(dst); + const int64_t ne_dst = ggml_nelements(dst); SYCL_CHECK(ggml_sycl_set_device(g_main_device)); - dpct::queue_ptr main_stream = g_syclStreams[g_main_device_index][0]; + dpct::queue_ptr main_stream = g_syclStreams[g_main_device][0]; SYCL_CHECK( - CHECK_TRY_ERROR(g_sycl_handles[g_main_device_index] = main_stream)); + CHECK_TRY_ERROR(g_sycl_handles[g_main_device] = main_stream)); ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra; - void * src0_ddq = src0_extra->data_device[g_main_device_index]; + void * src0_ddq = src0_extra->data_device[g_main_device]; sycl::half *src0_as_f16 = (sycl::half *)src0_ddq; ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra; - float * src1_ddf = (float *) src1_extra->data_device[g_main_device_index]; + float * src1_ddf = (float *) src1_extra->data_device[g_main_device]; ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra; - float * dst_ddf = (float *) dst_extra->data_device[g_main_device_index]; + float * dst_ddf = (float *) dst_extra->data_device[g_main_device]; // convert src1 to fp16 sycl_pool_alloc src1_f16_alloc; if (src1->type != GGML_TYPE_F16) { - const to_fp16_sycl_t to_fp16_sycl = ggml_get_to_fp16_sycl(src1->type); - const int64_t ne_src1 = ggml_nelements(src1); - src1_f16_alloc.alloc(ne_src1); - GGML_ASSERT(to_fp16_sycl != nullptr); - to_fp16_sycl(src1_ddf, src1_f16_alloc.get(), 
ne_src1, main_stream); + const to_fp16_sycl_t to_fp16_sycl = ggml_get_to_fp16_sycl(src1->type); + const int64_t ne_src1 = ggml_nelements(src1); + src1_f16_alloc.alloc(ne_src1); + GGML_ASSERT(to_fp16_sycl != nullptr); + to_fp16_sycl(src1_ddf, src1_f16_alloc.get(), ne_src1, main_stream); } sycl::half *src1_f16 = src1->type == GGML_TYPE_F16 ? (sycl::half *)src1_ddf : src1_f16_alloc.get(); @@ -13358,7 +13672,7 @@ static void ggml_sycl_mul_mat_batched_sycl(const ggml_tensor *src0, int i02 = i12 / r2; SYCL_CHECK( - syclGemmEx(g_sycl_handles[g_main_device_index], CUBLAS_OP_T, CUBLAS_OP_N, + syclGemmEx(g_sycl_handles[g_main_device], CUBLAS_OP_T, CUBLAS_OP_N, ne01, ne11, ne10, alpha, (const char *) src0_as_f16 + i02*src0->nb[2] + i03*src0->nb[3] , SYCL_R_16F, nb01/sizeof(half), (const char *) src1_as_f16 + i12*src1->nb[2]/2 + i13*src1->nb[3]/2, SYCL_R_16F, nb11/sizeof(float), @@ -13371,9 +13685,8 @@ static void ggml_sycl_mul_mat_batched_sycl(const ggml_tensor *src0, #else if (r2 == 1 && r3 == 1 && src0->nb[2]*src0->ne[2] == src0->nb[3] && src1->nb[2]*src1->ne[2] == src1->nb[3]) { // there is no broadcast and src0, src1 are contiguous across dims 2, 3 - // use syclGemmStridedBatchedEx SYCL_CHECK(CHECK_TRY_ERROR(dpct::gemm_batch( - *g_sycl_handles[g_main_device_index], oneapi::mkl::transpose::trans, + *g_sycl_handles[g_main_device], oneapi::mkl::transpose::trans, oneapi::mkl::transpose::nontrans, ne01, ne11, ne10, alpha, (const char *)src0_as_f16, dpct::library_data_t::real_half, nb01 / nb00, nb02 / nb00, @@ -13382,7 +13695,6 @@ static void ggml_sycl_mul_mat_batched_sycl(const ggml_tensor *src0, (char *)dst_t, cu_data_type, ne01, nb2 / nb0, ne12 * ne13, cu_compute_type))); } else { - // use syclGemmBatchedEx const int ne23 = ne12*ne13; sycl_pool_alloc ptrs_src(2*ne23); @@ -13415,7 +13727,7 @@ static void ggml_sycl_mul_mat_batched_sycl(const ggml_tensor *src0, }); } SYCL_CHECK(CHECK_TRY_ERROR(dpct::gemm_batch( - *g_sycl_handles[g_main_device_index], oneapi::mkl::transpose::trans, + *g_sycl_handles[g_main_device], oneapi::mkl::transpose::trans, oneapi::mkl::transpose::nontrans, ne01, ne11, ne10, alpha, (const void **)(ptrs_src.get() + 0 * ne23), dpct::library_data_t::real_half, nb01 / nb00, @@ -13435,6 +13747,7 @@ catch (sycl::exception const &exc) { std::exit(1); } + static void ggml_sycl_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { const bool all_on_device = (src0->backend == GGML_BACKEND_TYPE_GPU || src0->backend == GGML_BACKEND_TYPE_GPU_SPLIT) && @@ -13444,9 +13757,9 @@ static void ggml_sycl_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1 const bool split = src0->backend == GGML_BACKEND_TYPE_GPU_SPLIT; int64_t min_compute_capability = INT_MAX; - for (int64_t id = 0; id < g_device_count; ++id) { - if (min_compute_capability > g_device_caps[id].cc && g_tensor_split[id] < (id + 1 < g_device_count ? g_tensor_split[id + 1] : 1.0f)) { - min_compute_capability = g_device_caps[id].cc; + for (int i = 0; i < g_device_count; ++i) { + if (min_compute_capability > g_device_caps[i].cc && g_tensor_split[i] < (i + 1 < g_device_count ? 
g_tensor_split[i + 1] : 1.0f)) { + min_compute_capability = g_device_caps[i].cc; } } @@ -13587,30 +13900,30 @@ static void ggml_sycl_mul_mat_id_sycl(ggml_tensor * dst) { const int64_t ne = ggml_nelements(dst); SYCL_CHECK(ggml_sycl_set_device(g_main_device)); - syclStream_t main_stream = g_syclStreams[g_main_device_index][0]; + syclStream_t main_stream = g_syclStreams[g_main_device][0]; - SYCL_CHECK(syclSetStream(g_sycl_handles[g_main_device_index], main_stream)); + SYCL_CHECK(syclSetStream(g_sycl_handles[g_main_device], main_stream)); //ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra; - //void * src0_ddq = src0_extra->data_device[g_main_device_index]; + //void * src0_ddq = src0_extra->data_device[g_main_device]; //half * src0_as_f16 = (half *) src0_ddq; ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra; - float * src1_ddf = (float *) src1_extra->data_device[g_main_device_index]; + float * src1_ddf = (float *) src1_extra->data_device[g_main_device]; ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra; - float * dst_ddf = (float *) dst_extra->data_device[g_main_device_index]; + float * dst_ddf = (float *) dst_extra->data_device[g_main_device]; // convert src1 to fp16 const to_fp16_sycl_t to_fp16_sycl = ggml_get_to_fp16_sycl(src1->type); GGML_ASSERT(to_fp16_sycl != nullptr); size_t src1_as = 0; - half * src1_as_f16 = (half *) ggml_sycl_pool_malloc(ne1 * sizeof(half), &src1_as); + half * src1_as_f16 = (half *) ggml_sycl_pool_malloc(g_main_device, ne1 * sizeof(half), &src1_as); to_fp16_sycl(src1_ddf, src1_as_f16, ne1, main_stream); size_t dst_as = 0; - half * dst_f16 = (half *) ggml_sycl_pool_malloc(ne * sizeof(half), &dst_as); + half * dst_f16 = (half *) ggml_sycl_pool_malloc(g_main_device, ne * sizeof(half), &dst_as); GGML_ASSERT(ne12 % ne02 == 0); GGML_ASSERT(ne13 % ne03 == 0); @@ -13631,14 +13944,14 @@ static void ggml_sycl_mul_mat_id_sycl(ggml_tensor * dst) { size_t ptrs_src_s = 0; size_t ptrs_dst_s = 0; - ptrs_src = (const void **) ggml_sycl_pool_malloc(2*ne23*sizeof(void *), &ptrs_src_s); - ptrs_dst = ( void **) ggml_sycl_pool_malloc(1*ne23*sizeof(void *), &ptrs_dst_s); + ptrs_src = (const void **) ggml_sycl_pool_malloc(g_main_device, 2*ne23*sizeof(void *), &ptrs_src_s); + ptrs_dst = ( void **) ggml_sycl_pool_malloc(g_main_device, 1*ne23*sizeof(void *), &ptrs_dst_s); int64_t src0_ne = ggml_nelements(src00); half * src0_as_f16 = nullptr; size_t src0_as = 0; if (src00->type != GGML_TYPE_F16) { - src0_as_f16 = (half *) ggml_sycl_pool_malloc(src0_ne * sizeof(half), &src0_as); + src0_as_f16 = (half *) ggml_sycl_pool_malloc(g_main_device, src0_ne * sizeof(half), &src0_as); } static_assert(GGML_MAX_SRC == 6, "GGML_MAX_SRC == 6"); @@ -13653,16 +13966,16 @@ static void ggml_sycl_mul_mat_id_sycl(ggml_tensor * dst) { r2, r3, src00->type, src0_as_f16, src0_ne, src1_as_f16, dst_f16, - (const int *)((ggml_tensor_extra_gpu *)ids->extra)->data_device[g_main_device_index], id, - dst->src[2] ? (const half *)((ggml_tensor_extra_gpu *)dst->src[2]->extra)->data_device[g_main_device_index] : nullptr, - dst->src[3] ? (const half *)((ggml_tensor_extra_gpu *)dst->src[3]->extra)->data_device[g_main_device_index] : nullptr, - dst->src[4] ? (const half *)((ggml_tensor_extra_gpu *)dst->src[4]->extra)->data_device[g_main_device_index] : nullptr, - dst->src[5] ? 
(const half *)((ggml_tensor_extra_gpu *)dst->src[5]->extra)->data_device[g_main_device_index] : nullptr + (const int *)((ggml_tensor_extra_gpu *)ids->extra)->data_device[g_main_device], id, + dst->src[2] ? (const half *)((ggml_tensor_extra_gpu *)dst->src[2]->extra)->data_device[g_main_device] : nullptr, + dst->src[3] ? (const half *)((ggml_tensor_extra_gpu *)dst->src[3]->extra)->data_device[g_main_device] : nullptr, + dst->src[4] ? (const half *)((ggml_tensor_extra_gpu *)dst->src[4]->extra)->data_device[g_main_device] : nullptr, + dst->src[5] ? (const half *)((ggml_tensor_extra_gpu *)dst->src[5]->extra)->data_device[g_main_device] : nullptr ); SYCL_CHECK(syclGetLastError()); SYCL_CHECK( - syclGemmBatchedEx(g_sycl_handles[g_main_device_index], CUBLAS_OP_T, CUBLAS_OP_N, + syclGemmBatchedEx(g_sycl_handles[g_main_device], CUBLAS_OP_T, CUBLAS_OP_N, ne01, ne11, ne10, &alpha_f16, (const void **) (ptrs_src + 0*ne23), SYCL_R_16F, ne00, (const void **) (ptrs_src + 1*ne23), SYCL_R_16F, ne10, @@ -13672,20 +13985,20 @@ static void ggml_sycl_mul_mat_id_sycl(ggml_tensor * dst) { CUBLAS_GEMM_DEFAULT_TENSOR_OP)); if (src0_as != 0) { - ggml_sycl_pool_free(src0_as_f16, src0_as); + ggml_sycl_pool_free(g_main_device, src0_as_f16, src0_as); } if (ptrs_src_s != 0) { - ggml_sycl_pool_free(ptrs_src, ptrs_src_s); + ggml_sycl_pool_free(g_main_device, ptrs_src, ptrs_src_s); } if (ptrs_dst_s != 0) { - ggml_sycl_pool_free(ptrs_dst, ptrs_dst_s); + ggml_sycl_pool_free(g_main_device, ptrs_dst, ptrs_dst_s); } const to_fp32_sycl_t to_fp32_sycl = ggml_get_to_fp32_sycl(GGML_TYPE_F16); to_fp32_sycl(dst_f16, dst_ddf, ne, main_stream); - ggml_sycl_pool_free(src1_as_f16, src1_as); - ggml_sycl_pool_free(dst_f16, dst_as); + ggml_sycl_pool_free(g_main_device, src1_as_f16, src1_as); + ggml_sycl_pool_free(g_main_device, dst_f16, dst_as); } #endif @@ -13706,10 +14019,10 @@ static void ggml_sycl_mul_mat_id(const ggml_tensor *src0, std::vector ids_host(ggml_nbytes(ids)); - const dpct::queue_ptr stream = g_syclStreams[g_main_device_index][0]; + const dpct::queue_ptr stream = g_syclStreams[g_main_device][0]; if (ids->backend == GGML_BACKEND_TYPE_GPU) { - const char * ids_dev = (const char *)((const ggml_tensor_extra_gpu *)ids->extra)->data_device[g_main_device_index]; + const char * ids_dev = (const char *)((const ggml_tensor_extra_gpu *)ids->extra)->data_device[g_main_device]; SYCL_CHECK(CHECK_TRY_ERROR( stream->memcpy(ids_host.data(), ids_dev, ggml_nbytes(ids)))); SYCL_CHECK(CHECK_TRY_ERROR(stream->wait())); @@ -13733,9 +14046,9 @@ static void ggml_sycl_mul_mat_id(const ggml_tensor *src0, dst_row.extra = &dst_row_extra; char * src1_original = src1->backend == GGML_BACKEND_TYPE_CPU ? - (char *) src1->data : (char *) src1_extra->data_device[g_main_device_index]; + (char *) src1->data : (char *) src1_extra->data_device[g_main_device]; char * dst_original = dst->backend == GGML_BACKEND_TYPE_CPU ? - (char *) dst->data : (char *) dst_extra->data_device[g_main_device_index]; + (char *) dst->data : (char *) dst_extra->data_device[g_main_device]; if (src1->ne[1] == 1) { GGML_ASSERT(src1->backend == GGML_BACKEND_TYPE_GPU); @@ -13752,10 +14065,10 @@ static void ggml_sycl_mul_mat_id(const ggml_tensor *src0, const struct ggml_tensor * src0_row = dst->src[row_id + 2]; - src1_row_extra.data_device[g_main_device_index] = src1_original + i01*src1->nb[1]; + src1_row_extra.data_device[g_main_device] = src1_original + i01*src1->nb[1]; src1_row.data = (char *) src1->data + i01*src1->nb[1]; // TODO why is this set? 
- dst_row_extra.data_device[g_main_device_index] = dst_original + i01*dst->nb[1]; + dst_row_extra.data_device[g_main_device] = dst_original + i01*dst->nb[1]; dst_row.data = (char *) dst->data + i01*dst->nb[1]; // TODO why is this set? ggml_sycl_mul_mat(src0_row, &src1_row, &dst_row); @@ -13764,8 +14077,8 @@ static void ggml_sycl_mul_mat_id(const ggml_tensor *src0, sycl_pool_alloc src1_contiguous(sizeof(float)*ggml_nelements(src1)); sycl_pool_alloc dst_contiguous(sizeof(float)*ggml_nelements(dst)); - src1_row_extra.data_device[g_main_device_index] = src1_contiguous.get(); - dst_row_extra.data_device[g_main_device_index] = dst_contiguous.get(); + src1_row_extra.data_device[g_main_device] = src1_contiguous.get(); + dst_row_extra.data_device[g_main_device] = dst_contiguous.get(); for (int32_t row_id = 0; row_id < n_as; ++row_id) { const struct ggml_tensor * src0_row = dst->src[row_id + 2]; @@ -13853,13 +14166,13 @@ static void ggml_sycl_cpy(const ggml_tensor *src0, const ggml_tensor *src1, GGML_TENSOR_BINARY_OP_LOCALS; SYCL_CHECK(ggml_sycl_set_device(g_main_device)); - dpct::queue_ptr main_stream = g_syclStreams[g_main_device_index][0]; + dpct::queue_ptr main_stream = g_syclStreams[g_main_device][0]; const ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra; const ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra; - char * src0_ddc = (char *) src0_extra->data_device[g_main_device_index]; - char * src1_ddc = (char *) src1_extra->data_device[g_main_device_index]; + char * src0_ddc = (char *) src0_extra->data_device[g_main_device]; + char * src1_ddc = (char *) src1_extra->data_device[g_main_device]; if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_F32) { ggml_cpy_f32_f32_sycl (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream); @@ -13871,6 +14184,8 @@ static void ggml_sycl_cpy(const ggml_tensor *src0, const ggml_tensor *src1, ggml_cpy_f32_q4_0_sycl(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream); } else if (src0->type == GGML_TYPE_F32 && src1->type == GGML_TYPE_Q4_1) { ggml_cpy_f32_q4_1_sycl(src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream); + } else if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F32) { + ggml_cpy_f16_f32_sycl (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream); } else if (src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F16) { ggml_cpy_f16_f16_sycl (src0_ddc, src1_ddc, ne, ne00, ne01, ne02, nb00, nb01, nb02, nb03, ne10, ne11, ne12, nb10, nb11, nb12, nb13, main_stream); } else if (src0->type == GGML_TYPE_I16 && src1->type == GGML_TYPE_I16) { @@ -13914,6 +14229,10 @@ static void ggml_sycl_alibi(const ggml_tensor * src0, const ggml_tensor * src1, ggml_sycl_op_flatten(src0, src1, dst, ggml_sycl_op_alibi); } +static void ggml_sycl_pool2d(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { + ggml_sycl_op_flatten(src0, src1, dst, ggml_sycl_op_pool2d); +} + static void ggml_sycl_im2col(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { ggml_sycl_op_flatten(src0, src1, dst, ggml_sycl_op_im2col); } @@ -13940,93 +14259,6 @@ static size_t ggml_nbytes_split(const struct ggml_tensor * tensor, int nrows_spl return nrows_split*ggml_row_size(tensor->type, tensor->ne[0]); } -void 
ggml_sycl_transform_tensor(void *data, struct ggml_tensor *tensor) try { - const int64_t nrows = ggml_nrows(tensor); - - const int64_t ne0 = tensor->ne[0]; - - const size_t nb1 = tensor->nb[1]; - - ggml_backend_type backend = tensor->backend; - ggml_tensor_extra_gpu * extra = new struct ggml_tensor_extra_gpu; - memset(extra, 0, sizeof(*extra)); - - for (int64_t id = 0; id < g_device_count; ++id) { - if (backend == GGML_BACKEND_TYPE_GPU && id != g_main_device_index) { - continue; - } - ggml_sycl_set_device(get_device_id_by_index(id)); - const dpct::queue_ptr stream = g_syclStreams[id][0]; - - int64_t row_low, row_high; - if (backend == GGML_BACKEND_TYPE_GPU) { - row_low = 0; - row_high = nrows; - } else if (backend == GGML_BACKEND_TYPE_GPU_SPLIT) { - const int64_t rounding = get_row_rounding(tensor->type); - - row_low = id == 0 ? 0 : nrows*g_tensor_split[id]; - row_low -= row_low % rounding; - - if (id == g_device_count - 1) { - row_high = nrows; - } else { - row_high = nrows*g_tensor_split[id + 1]; - row_high -= row_high % rounding; - } - } else { - GGML_ASSERT(false); - } - if (row_low == row_high) { - continue; - } - - int64_t nrows_split = row_high - row_low; - - const size_t offset_split = row_low*nb1; - size_t size = ggml_nbytes_split(tensor, nrows_split); - const size_t original_size = size; - - // pad last row to a multiple of 512 elements to avoid out-of-bounds memory accesses - if (ne0 % MATRIX_ROW_PADDING != 0) { - size += ggml_row_size(tensor->type, MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING); - } - - char * buf; - SYCL_CHECK(CHECK_TRY_ERROR(buf = (char *)sycl::malloc_device( - size, *stream))); - char * buf_host = (char *)data + offset_split; - - // set padding to 0 to avoid possible NaN values - if (size > original_size) { - SYCL_CHECK(CHECK_TRY_ERROR( - (*stream) - .memset(buf + original_size, 0, size - original_size) - .wait())); - } - - SYCL_CHECK(CHECK_TRY_ERROR((*stream) - .memcpy(buf, buf_host, original_size) - .wait())); - - extra->data_device[id] = buf; - - if (backend == GGML_BACKEND_TYPE_GPU_SPLIT) { - for (int64_t is = 0; is < MAX_STREAMS; ++is) { - SYCL_CHECK(CHECK_TRY_ERROR(extra->events[id][is] = - new sycl::event())); - } - } - } - - tensor->extra = extra; -} -catch (sycl::exception const &exc) { - std::cerr << exc.what() << "Exception caught at file:" << __FILE__ - << ", line:" << __LINE__ << std::endl; - std::exit(1); -} - void ggml_sycl_free_data(struct ggml_tensor *tensor) try { if (!tensor || !tensor->extra || (tensor->backend != GGML_BACKEND_TYPE_GPU && tensor->backend != GGML_BACKEND_TYPE_GPU_SPLIT) ) { return; @@ -14034,18 +14266,18 @@ void ggml_sycl_free_data(struct ggml_tensor *tensor) try { ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) tensor->extra; - for (int64_t id = 0; id < g_device_count; ++id) { - const dpct::queue_ptr stream = g_syclStreams[id][0]; - if (extra->data_device[id] != nullptr) { - SYCL_CHECK(ggml_sycl_set_device(get_device_id_by_index(id))); - SYCL_CHECK(CHECK_TRY_ERROR(sycl::free(extra->data_device[id], *stream))); + for (int i = 0; i < g_device_count; ++i) { + const dpct::queue_ptr stream = g_syclStreams[i][0]; + if (extra->data_device[i] != nullptr) { + SYCL_CHECK(ggml_sycl_set_device(i)); + SYCL_CHECK(CHECK_TRY_ERROR(sycl::free(extra->data_device[i], *stream))); } for (int64_t is = 0; is < MAX_STREAMS; ++is) { - if (extra->events[id][is] != nullptr) { - SYCL_CHECK(ggml_sycl_set_device(get_device_id_by_index(id))); + if (extra->events[i][is] != nullptr) { + SYCL_CHECK(ggml_sycl_set_device(i)); 
SYCL_CHECK(CHECK_TRY_ERROR( - dpct::destroy_event(extra->events[id][is]))); + dpct::destroy_event(extra->events[i][is]))); } } } @@ -14105,22 +14337,22 @@ static void ggml_sycl_assign_buffers_impl(struct ggml_tensor *tensor, const size_t size = ggml_nbytes(tensor); SYCL_CHECK(ggml_sycl_set_device(g_main_device)); - const dpct::queue_ptr stream = g_syclStreams[g_main_device_index][0]; + const dpct::queue_ptr stream = g_syclStreams[g_main_device][0]; if (inplace && (tensor->src[0]->backend == GGML_BACKEND_TYPE_GPU || tensor->src[0]->backend == GGML_BACKEND_TYPE_GPU_SPLIT)) { ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu * ) tensor->src[0]->extra; - char * src0_ddc = (char *) src0_extra->data_device[g_main_device_index]; + char * src0_ddc = (char *) src0_extra->data_device[g_main_device]; size_t offset = 0; if (tensor->op == GGML_OP_VIEW) { memcpy(&offset, tensor->op_params, sizeof(size_t)); } extra = ggml_sycl_alloc_temp_tensor_extra(); - extra->data_device[g_main_device_index] = src0_ddc + offset; + extra->data_device[g_main_device] = src0_ddc + offset; } else if (tensor->op == GGML_OP_CPY) { ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu * ) tensor->src[1]->extra; - void * src1_ddv = src1_extra->data_device[g_main_device_index]; + void * src1_ddv = src1_extra->data_device[g_main_device]; extra = ggml_sycl_alloc_temp_tensor_extra(); - extra->data_device[g_main_device_index] = src1_ddv; + extra->data_device[g_main_device] = src1_ddv; } else if (scratch) { GGML_ASSERT(size <= g_scratch_size); if (g_scratch_offset + size > g_scratch_size) { @@ -14135,7 +14367,7 @@ static void ggml_sycl_assign_buffers_impl(struct ggml_tensor *tensor, g_scratch_buffer = data; } extra = ggml_sycl_alloc_temp_tensor_extra(); - extra->data_device[g_main_device_index] = data + g_scratch_offset; + extra->data_device[g_main_device] = data + g_scratch_offset; g_scratch_offset += size; @@ -14148,44 +14380,7 @@ static void ggml_sycl_assign_buffers_impl(struct ggml_tensor *tensor, (*stream).memset(data, 0, size).wait())); extra = new ggml_tensor_extra_gpu; memset(extra, 0, sizeof(*extra)); - extra->data_device[g_main_device_index] = data; - } - - tensor->extra = extra; -} -catch (sycl::exception const &exc) { - std::cerr << exc.what() << "Exception caught at file:" << __FILE__ - << ", line:" << __LINE__ << std::endl; - std::exit(1); -} - -void ggml_sycl_assign_scratch_offset(struct ggml_tensor *tensor, - size_t offset) try { - if (g_scratch_size == 0) { - return; - } - if (g_scratch_buffer == nullptr) { - ggml_sycl_set_device(g_main_device); - const dpct::queue_ptr stream = g_syclStreams[g_main_device_index][0]; - SYCL_CHECK( - CHECK_TRY_ERROR(g_scratch_buffer = (void *)sycl::malloc_device( - g_scratch_size, *stream))); - } - - ggml_tensor_extra_gpu * extra = ggml_sycl_alloc_temp_tensor_extra(); - - const bool inplace = tensor->view_src != nullptr; - - if (inplace && (tensor->view_src->backend == GGML_BACKEND_TYPE_GPU || tensor->view_src->backend == GGML_BACKEND_TYPE_GPU_SPLIT)) { - ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu * ) tensor->view_src->extra; - char * src0_ddc = (char *) src0_extra->data_device[g_main_device_index]; - size_t view_offset = 0; - if (tensor->op == GGML_OP_VIEW) { - memcpy(&view_offset, tensor->op_params, sizeof(size_t)); - } - extra->data_device[g_main_device_index] = src0_ddc + view_offset; - } else { - extra->data_device[g_main_device_index] = (char *) g_scratch_buffer + offset; + extra->data_device[g_main_device] = data; } tensor->extra = extra; @@ 
-14202,9 +14397,9 @@ void ggml_sycl_copy_to_device(struct ggml_tensor *tensor) try { ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) tensor->extra; SYCL_CHECK(ggml_sycl_set_device(g_main_device)); - const dpct::queue_ptr stream = g_syclStreams[g_main_device_index][0]; + const dpct::queue_ptr stream = g_syclStreams[g_main_device][0]; SYCL_CHECK(CHECK_TRY_ERROR((*stream) - .memcpy(extra->data_device[g_main_device_index], + .memcpy(extra->data_device[g_main_device], tensor->data, ggml_nbytes(tensor)) .wait())); } @@ -14231,21 +14426,17 @@ void ggml_sycl_assign_buffers_force_inplace(struct ggml_tensor * tensor) { } void ggml_sycl_set_main_device(const int main_device) try { + if (g_main_device == main_device) return; + check_allow_gpu_index(main_device); + g_main_device = main_device; + g_main_device_id = g_sycl_gpu_mgr->gpus[main_device]; - if (main_device >= g_all_sycl_device_count) { - fprintf(stderr, "warning: cannot set main_device=%d because there are only %d devices. Using device %d instead.\n", - main_device, g_all_sycl_device_count, g_main_device); - return; - } - - if (g_main_device != main_device && g_device_count >= 1) { - g_main_device = main_device; - g_main_device_index = get_device_index_by_id(g_main_device); + if (g_ggml_sycl_debug) { dpct::device_info prop; SYCL_CHECK(CHECK_TRY_ERROR(dpct::get_device_info( - prop, dpct::dev_mgr::instance().get_device(g_main_device)))); + prop, dpct::dev_mgr::instance().get_device(g_main_device_id)))); fprintf(stderr, "Using device %d (%s) as main device\n", - g_main_device, prop.get_name()); + g_main_device_id, prop.get_name()); } } catch (sycl::exception const &exc) { @@ -14268,7 +14459,7 @@ void ggml_sycl_free_scratch() try { return; } ggml_sycl_set_device(g_main_device); - const dpct::queue_ptr stream = g_syclStreams[g_main_device_index][0]; + const dpct::queue_ptr stream = g_syclStreams[g_main_device][0]; SYCL_CHECK(CHECK_TRY_ERROR( sycl::free(g_scratch_buffer, *stream))); @@ -14340,6 +14531,12 @@ bool ggml_sycl_compute_forward(struct ggml_compute_params * params, struct ggml_ case GGML_UNARY_OP_RELU: func = ggml_sycl_relu; break; + case GGML_UNARY_OP_HARDSIGMOID: + func = ggml_sycl_hardsigmoid; + break; + case GGML_UNARY_OP_HARDSWISH: + func = ggml_sycl_hardswish; + break; default: return false; } @@ -14414,6 +14611,9 @@ bool ggml_sycl_compute_forward(struct ggml_compute_params * params, struct ggml_ case GGML_OP_IM2COL: func = ggml_sycl_im2col; break; + case GGML_OP_POOL_2D: + func = ggml_sycl_pool2d; + break; case GGML_OP_SUM_ROWS: func = ggml_sycl_sum_rows; break; @@ -14439,27 +14639,15 @@ bool ggml_sycl_compute_forward(struct ggml_compute_params * params, struct ggml_ } GGML_API GGML_CALL void ggml_sycl_get_gpu_list(int *id_list, int max_len) try { - int max_compute_units = -1; - for(int i=0;igpus.size();i++){ + if (i>=max_len) break; + id_list[i] = g_sycl_gpu_mgr->gpus[i]; } return; } @@ -14486,8 +14674,9 @@ catch (sycl::exception const &exc) { GGML_API GGML_CALL void ggml_sycl_get_device_description(int device, char *description, size_t description_size) try { dpct::device_info prop; + int device_id = g_sycl_gpu_mgr->gpus[device]; SYCL_CHECK(CHECK_TRY_ERROR(dpct::get_device_info( - prop, dpct::dev_mgr::instance().get_device(device)))); + prop, dpct::dev_mgr::instance().get_device(device_id)))); snprintf(description, description_size, "%s", prop.get_name()); } catch (sycl::exception const &exc) { @@ -14496,17 +14685,36 @@ catch (sycl::exception const &exc) { std::exit(1); } +GGML_CALL void 
ggml_backend_sycl_get_device_memory(int device, size_t *free, + size_t *total) try { + ggml_sycl_set_device(device); + + /* + DPCT1009:218: SYCL uses exceptions to report errors and does not use the + error codes. The original code was commented out and a warning string was + inserted. You need to rewrite this code. + */ + /* + DPCT1106:217: 'cudaMemGetInfo' was migrated with the Intel extensions for + device information which may not be supported by all compilers or runtimes. + You may need to adjust the code. + */ + int device_id = g_sycl_gpu_mgr->gpus[device]; + SYCL_CHECK(CHECK_TRY_ERROR( + dpct::dev_mgr::instance().get_device(device_id).get_memory_info(*free, *total))); +} +catch (sycl::exception const &exc) { + std::cerr << exc.what() << "Exception caught at file:" << __FILE__ + << ", line:" << __LINE__ << std::endl; + std::exit(1); +} + //////////////////////////////////////////////////////////////////////////////// // backend interface #define UNUSED GGML_UNUSED -struct ggml_backend_sycl_context { - int device; - std::string name; -}; - // sycl buffer struct ggml_backend_sycl_buffer_context { @@ -14516,7 +14724,12 @@ struct ggml_backend_sycl_buffer_context { size_t temp_tensor_extra_index = 0; std::string name; - ggml_backend_sycl_buffer_context(int device, void * dev_ptr) : device(device), dev_ptr(dev_ptr) {} + ggml_backend_sycl_buffer_context(int device, void * dev_ptr) : + device(device), dev_ptr(dev_ptr) { + check_allow_gpu_index(device); + int id = g_sycl_gpu_mgr->gpus[device]; + name = (GGML_SYCL_NAME + std::to_string(id)); + } ~ ggml_backend_sycl_buffer_context() { delete[] temp_tensor_extras; @@ -14547,10 +14760,9 @@ GGML_CALL static bool ggml_backend_buffer_is_sycl(ggml_backend_buffer_t buffer) static void ggml_backend_sycl_buffer_free_buffer(ggml_backend_buffer_t buffer) try { - ggml_backend_sycl_buffer_context * ctx = ( ggml_backend_sycl_buffer_context *)buffer->context; + ggml_backend_sycl_buffer_context * ctx = ( ggml_backend_sycl_buffer_context *)buffer->context; ggml_sycl_set_device(ctx->device); - int device_index = get_device_index_by_id(ctx->device); - const dpct::queue_ptr stream = g_syclStreams[device_index][0]; + const dpct::queue_ptr stream = g_syclStreams[ctx->device][0]; SYCL_CHECK( CHECK_TRY_ERROR(sycl::free(ctx->dev_ptr, *stream))); @@ -14563,13 +14775,14 @@ catch (sycl::exception const &exc) { } static void * ggml_backend_sycl_buffer_get_base(ggml_backend_buffer_t buffer) { - ggml_backend_sycl_buffer_context * ctx = ( ggml_backend_sycl_buffer_context *)buffer->context; + ggml_backend_sycl_buffer_context * ctx = ( ggml_backend_sycl_buffer_context *)buffer->context; return ctx->dev_ptr; } -static void ggml_backend_sycl_buffer_init_tensor(ggml_backend_buffer_t buffer, - ggml_tensor *tensor) try { - ggml_backend_sycl_buffer_context * ctx = ( ggml_backend_sycl_buffer_context *)buffer->context; +GGML_CALL static void +ggml_backend_sycl_buffer_init_tensor(ggml_backend_buffer_t buffer, + ggml_tensor *tensor) try { + ggml_backend_sycl_buffer_context * ctx = (ggml_backend_sycl_buffer_context *)buffer->context; if (tensor->view_src != NULL && tensor->view_offs == 0) { assert(tensor->view_src->buffer->buft == buffer->buft); @@ -14581,27 +14794,20 @@ static void ggml_backend_sycl_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor_extra_gpu * extra = ctx->ggml_sycl_alloc_temp_tensor_extra(); extra->data_device[ctx->device] = tensor->data; - tensor->backend = GGML_BACKEND_TYPE_GPU; tensor->extra = extra; if (ggml_is_quantized(tensor->type)) { // initialize 
padding to 0 to avoid possible NaN values - int64_t row_low = 0; - int64_t row_high = ggml_nrows(tensor); - int64_t nrows_split = row_high - row_low; - - size_t original_size = ggml_nbytes_split(tensor, nrows_split); + size_t original_size = ggml_nbytes(tensor); size_t padded_size = ggml_backend_buft_get_alloc_size(buffer->buft, tensor); if (padded_size > original_size && tensor->view_src == nullptr) { SYCL_CHECK(CHECK_TRY_ERROR(g_syclStreams[ctx->device][0]->memset( (char *)tensor->data + original_size, 0, - padded_size - original_size))); + padded_size - original_size).wait())); } } - - UNUSED(buffer); } catch (sycl::exception const &exc) { std::cerr << exc.what() << "Exception caught at file:" << __FILE__ @@ -14615,13 +14821,12 @@ static void ggml_backend_sycl_buffer_set_tensor(ggml_backend_buffer_t buffer, size_t size) try { GGML_ASSERT(tensor->backend == GGML_BACKEND_TYPE_GPU); - ggml_backend_sycl_buffer_context * ctx = ( ggml_backend_sycl_buffer_context *)buffer->context; + ggml_backend_sycl_buffer_context * ctx = ( ggml_backend_sycl_buffer_context *)buffer->context; ggml_sycl_set_device(ctx->device); - int device_index = get_device_index_by_id(ctx->device); - const dpct::queue_ptr stream = g_syclStreams[device_index][0]; + const dpct::queue_ptr stream = g_syclStreams[ctx->device][0]; SYCL_CHECK( - CHECK_TRY_ERROR(dpct::get_current_device().queues_wait_and_throw())); + CHECK_TRY_ERROR(dpct::dev_mgr::instance().get_device(ctx->device).queues_wait_and_throw())); SYCL_CHECK( CHECK_TRY_ERROR((*stream) @@ -14640,14 +14845,13 @@ static void ggml_backend_sycl_buffer_get_tensor(ggml_backend_buffer_t buffer, size_t size) try { GGML_ASSERT(tensor->backend == GGML_BACKEND_TYPE_GPU); - ggml_backend_sycl_buffer_context * ctx = ( ggml_backend_sycl_buffer_context *)buffer->context; + ggml_backend_sycl_buffer_context * ctx = ( ggml_backend_sycl_buffer_context *)buffer->context; ggml_sycl_set_device(ctx->device); - int device_index = get_device_index_by_id(ctx->device); - const dpct::queue_ptr stream = g_syclStreams[device_index][0]; + const dpct::queue_ptr stream = g_syclStreams[ctx->device][0]; SYCL_CHECK( - CHECK_TRY_ERROR(dpct::get_current_device().queues_wait_and_throw())); + CHECK_TRY_ERROR(dpct::dev_mgr::instance().get_device(ctx->device).queues_wait_and_throw())); SYCL_CHECK(CHECK_TRY_ERROR( (*stream) @@ -14660,13 +14864,73 @@ catch (sycl::exception const &exc) { std::exit(1); } +GGML_CALL static bool +ggml_backend_sycl_buffer_cpy_tensor(ggml_backend_buffer_t buffer, + const ggml_tensor *src, + ggml_tensor *dst) try { + if (ggml_backend_buffer_is_sycl(src->buffer)) { + ggml_backend_sycl_buffer_context * src_ctx = (ggml_backend_sycl_buffer_context *)src->buffer->context; + ggml_backend_sycl_buffer_context * dst_ctx = (ggml_backend_sycl_buffer_context *)buffer->context; + + ggml_sycl_set_device(src_ctx->device); + /* + DPCT1009:198: SYCL uses exceptions to report errors and does not use the + error codes. The original code was commented out and a warning string + was inserted. You need to rewrite this code. + */ + SYCL_CHECK(CHECK_TRY_ERROR( + dpct::dev_mgr::instance().get_device(src_ctx->device).queues_wait_and_throw())); + ggml_sycl_set_device(dst_ctx->device); + /* + DPCT1009:199: SYCL uses exceptions to report errors and does not use the + error codes. The original code was commented out and a warning string + was inserted. You need to rewrite this code. 
+ */ + SYCL_CHECK(CHECK_TRY_ERROR( + dpct::dev_mgr::instance().get_device(dst_ctx->device).queues_wait_and_throw())); + /* + DPCT1009:200: SYCL uses exceptions to report errors and does not use the + error codes. The original code was commented out and a warning string + was inserted. You need to rewrite this code. + */ + + dpct::queue_ptr stream_dst = g_syclStreams[dst_ctx->device][0]; + dpct::queue_ptr stream_src = g_syclStreams[src_ctx->device][0]; + size_t size = ggml_nbytes(src); + + //todo. it's dirty solutino to walkaroud known issue:device2device cross GPUs. + dev2dev_memcpy(*stream_dst, *stream_src, dst->data, src->data, size); + +//todo, it's known issue:error in device2device cross GPUs. reused when the issue is fixed. DON"T remove +#if 0 + SYCL_CHECK(CHECK_TRY_ERROR((*stream).memcpy( + (char *)dst->data, (const char *)src->data, size).wait())); + + /* + DPCT1009:201: SYCL uses exceptions to report errors and does not use the + error codes. The original code was commented out and a warning string + was inserted. You need to rewrite this code. + */ + SYCL_CHECK(CHECK_TRY_ERROR( + dpct::dev_mgr::instance().get_device(dst_ctx->device).queues_wait_and_throw())); +#endif + return true; + } + return false; +} +catch (sycl::exception const &exc) { + std::cerr << exc.what() << "Exception caught at file:" << __FILE__ + << ", line:" << __LINE__ << std::endl; + std::exit(1); +} + + static void ggml_backend_sycl_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) try { ggml_backend_sycl_buffer_context * ctx = ( ggml_backend_sycl_buffer_context *)buffer->context; ggml_sycl_set_device(ctx->device); - int device_index = get_device_index_by_id(ctx->device); - const dpct::queue_ptr stream = g_syclStreams[device_index][0]; + const dpct::queue_ptr stream = g_syclStreams[ctx->device][0]; SYCL_CHECK( CHECK_TRY_ERROR(dpct::get_current_device().queues_wait_and_throw())); @@ -14687,7 +14951,7 @@ static struct ggml_backend_buffer_i ggml_backend_sycl_buffer_interface = { /* .init_tensor = */ ggml_backend_sycl_buffer_init_tensor, /* .set_tensor = */ ggml_backend_sycl_buffer_set_tensor, /* .get_tensor = */ ggml_backend_sycl_buffer_get_tensor, - /* .cpy_tensor = */ NULL, + /* .cpy_tensor = */ ggml_backend_sycl_buffer_cpy_tensor, /* .clear = */ ggml_backend_sycl_buffer_clear, /* .reset = */ NULL, }; @@ -14698,29 +14962,28 @@ struct ggml_backend_sycl_buffer_type_context { std::string name; }; +struct ggml_backend_sycl_context { + int device; + std::string name; +}; + GGML_CALL static const char * ggml_backend_sycl_buffer_type_name(ggml_backend_buffer_type_t buft) { ggml_backend_sycl_buffer_type_context * ctx = (ggml_backend_sycl_buffer_type_context *)buft->context; return ctx->name.c_str(); } - -static ggml_backend_buffer_t +GGML_CALL static ggml_backend_buffer_t ggml_backend_sycl_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) try { ggml_backend_sycl_buffer_type_context * buft_ctx = (ggml_backend_sycl_buffer_type_context *)buft->context; - int device = (int) buft_ctx->device; - - ggml_sycl_set_device(device); - int device_index = get_device_index_by_id(device); - const dpct::queue_ptr stream = g_syclStreams[device_index][0]; + ggml_sycl_set_device(buft_ctx->device); + const dpct::queue_ptr stream = g_syclStreams[buft_ctx->device][0]; size = std::max(size, (size_t)1); // syclMalloc returns null for size 0 void * dev_ptr; SYCL_CHECK(CHECK_TRY_ERROR(dev_ptr = (void *)sycl::malloc_device( size, *stream))); - - ggml_backend_sycl_buffer_context * ctx = new 
ggml_backend_sycl_buffer_context(device, dev_ptr); - + ggml_backend_sycl_buffer_context * ctx = new ggml_backend_sycl_buffer_context(buft_ctx->device, dev_ptr); return ggml_backend_buffer_init(buft, ggml_backend_sycl_buffer_interface, ctx, size); } catch (sycl::exception const &exc) { @@ -14729,9 +14992,8 @@ catch (sycl::exception const &exc) { std::exit(1); } -static size_t ggml_backend_sycl_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) { +GGML_CALL static size_t ggml_backend_sycl_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) { return 128; - UNUSED(buft); } @@ -14741,13 +15003,8 @@ static size_t ggml_backend_sycl_buffer_type_get_max_size(ggml_backend_buffer_typ UNUSED(buft); } -static size_t ggml_backend_sycl_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, const ggml_tensor * tensor) { - int64_t row_low = 0; - int64_t row_high = ggml_nrows(tensor); - int64_t nrows_split = row_high - row_low; - - size_t size = ggml_nbytes_split(tensor, nrows_split); - +GGML_CALL static size_t ggml_backend_sycl_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, const ggml_tensor * tensor) { + size_t size = ggml_nbytes(tensor); int64_t ne0 = tensor->ne[0]; if (ggml_is_quantized(tensor->type)) { @@ -14761,10 +15018,13 @@ static size_t ggml_backend_sycl_buffer_type_get_alloc_size(ggml_backend_buffer_t UNUSED(buft); } -static bool ggml_backend_sycl_buffer_type_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) { - return ggml_backend_is_sycl(backend); - - UNUSED(buft); +GGML_CALL static bool ggml_backend_sycl_buffer_type_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) { + if (!ggml_backend_is_sycl(backend)) { + return false; + } + ggml_backend_sycl_buffer_type_context * buft_ctx = (ggml_backend_sycl_buffer_type_context *)buft->context; + ggml_backend_sycl_context * sycl_ctx = (ggml_backend_sycl_context *)backend->context; + return buft_ctx->device == sycl_ctx->device; } static ggml_backend_buffer_type_i ggml_backend_sycl_buffer_type_interface = { @@ -14783,10 +15043,10 @@ ggml_backend_buffer_type_t ggml_backend_sycl_buffer_type(int device) { static bool ggml_backend_sycl_buffer_type_initialized = false; if (!ggml_backend_sycl_buffer_type_initialized) { - for (int i = 0; i < GGML_SYCL_MAX_DEVICES; i++) { + for (int i = 0; i < g_device_count; i++) { ggml_backend_sycl_buffer_types[i] = { /* .iface = */ ggml_backend_sycl_buffer_type_interface, - /* .context = */ new ggml_backend_sycl_buffer_type_context{i, GGML_SYCL_NAME + std::to_string(i)}, + /* .context = */ new ggml_backend_sycl_buffer_type_context{i, GGML_SYCL_NAME + std::to_string(g_sycl_gpu_mgr->gpus[i])}, }; } ggml_backend_sycl_buffer_type_initialized = true; @@ -14795,6 +15055,391 @@ ggml_backend_buffer_type_t ggml_backend_sycl_buffer_type(int device) { return &ggml_backend_sycl_buffer_types[device]; } +// sycl split buffer type +static void get_row_split(int64_t * row_low, int64_t * row_high, const ggml_tensor * tensor, const std::array & tensor_split, int id) { + const int64_t nrows = ggml_nrows(tensor); + const int64_t rounding = get_row_rounding(tensor->type, tensor_split); + + *row_low = id == 0 ? 
0 : nrows*tensor_split[id]; + *row_low -= *row_low % rounding; + if (id == g_device_count - 1) { + *row_high = nrows; + } else { + *row_high = nrows*tensor_split[id + 1]; + *row_high -= *row_high % rounding; + } +} + +struct ggml_backend_sycl_split_buffer_context { + ~ggml_backend_sycl_split_buffer_context() try { + for (ggml_tensor_extra_gpu * extra : tensor_extras) { + for (int i = 0; i < g_device_count; ++i) { + // int id = g_sycl_gpu_mgr->gpus[i]; + for (int64_t is = 0; is < MAX_STREAMS; ++is) { + if (extra->events[i][is] != nullptr) { + /* + DPCT1009:206: SYCL uses exceptions to report errors and + does not use the error codes. The original code was + commented out and a warning string was inserted. You + need to rewrite this code. + */ + SYCL_CHECK(CHECK_TRY_ERROR( + dpct::destroy_event(extra->events[i][is]))); + } + } + if (extra->data_device[i] != nullptr) { + /* + DPCT1009:207: SYCL uses exceptions to report errors and does + not use the error codes. The original code was commented out + and a warning string was inserted. You need to rewrite this + code. + */ + ggml_sycl_set_device(i); + SYCL_CHECK(CHECK_TRY_ERROR(sycl::free( + extra->data_device[i], *g_syclStreams[i][0]))); + } + } + delete extra; + } + } + catch (sycl::exception const &exc) { + std::cerr << exc.what() << "Exception caught at file:" << __FILE__ + << ", line:" << __LINE__ << std::endl; + std::exit(1); + } + + std::vector tensor_extras; +}; + +GGML_CALL static const char * ggml_backend_sycl_split_buffer_get_name(ggml_backend_buffer_t buffer) { + return GGML_SYCL_NAME "_Split"; + + UNUSED(buffer); +} + +// unused at the moment +//static bool ggml_backend_buffer_is_sycl_split(ggml_backend_buffer_t buffer) { +// return buffer->iface.get_name == ggml_backend_sycl_split_buffer_get_name; +//} + +GGML_CALL static void ggml_backend_sycl_split_buffer_free_buffer(ggml_backend_buffer_t buffer) { + ggml_backend_sycl_split_buffer_context * ctx = (ggml_backend_sycl_split_buffer_context *)buffer->context; + delete ctx; +} + +GGML_CALL static void * ggml_backend_sycl_split_buffer_get_base(ggml_backend_buffer_t buffer) { + // the pointers are stored in the tensor extras, this is just a dummy address and never dereferenced + return (void *)0x1000; + + UNUSED(buffer); +} + +GGML_CALL static void +ggml_backend_sycl_split_buffer_init_tensor(ggml_backend_buffer_t buffer, + ggml_tensor *tensor) try { + GGML_ASSERT(tensor->view_src == nullptr); // views of split tensors are not supported + + ggml_backend_sycl_split_buffer_context * ctx = (ggml_backend_sycl_split_buffer_context *)buffer->context; + ggml_backend_sycl_split_buffer_type_context * buft_ctx = (ggml_backend_sycl_split_buffer_type_context *)buffer->buft->context; + + const int64_t ne0 = tensor->ne[0]; + + ggml_tensor_extra_gpu * extra = new ggml_tensor_extra_gpu{}; + + ctx->tensor_extras.push_back(extra); + + for (int i = 0; i < g_device_count; ++i) { + // int id = g_sycl_gpu_mgr->gpus[i]; + int64_t row_low, row_high; + get_row_split(&row_low, &row_high, tensor, buft_ctx->tensor_split, i); + + int64_t nrows_split = row_high - row_low; + if (nrows_split == 0) { + continue; + } + + size_t size = ggml_nbytes_split(tensor, nrows_split); + const size_t original_size = size; + + // pad last row to a multiple of 512 elements to avoid out-of-bounds memory accesses + if (ne0 % MATRIX_ROW_PADDING != 0) { + size += ggml_row_size(tensor->type, MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING); + } + + // FIXME: do not crash if cudaMalloc fails + // currently, init_tensor cannot fail, it needs 
to be fixed in ggml-backend first + ggml_sycl_set_device(i); + char * buf; + /* + DPCT1009:208: SYCL uses exceptions to report errors and does not use the + error codes. The original code was commented out and a warning string + was inserted. You need to rewrite this code. + */ + SYCL_CHECK(CHECK_TRY_ERROR(buf = (char *)sycl::malloc_device( + size, *g_syclStreams[i][0]))); + + // set padding to 0 to avoid possible NaN values + if (size > original_size) { + /* + DPCT1009:209: SYCL uses exceptions to report errors and does not use + the error codes. The original code was commented out and a warning + string was inserted. You need to rewrite this code. + */ + SYCL_CHECK(CHECK_TRY_ERROR( + (*g_syclStreams[i][0]) + .memset(buf + original_size, 0, size - original_size) + .wait())); + } + + extra->data_device[i] = buf; + + for (int64_t is = 0; is < MAX_STREAMS; ++is) { + /* + DPCT1009:210: SYCL uses exceptions to report errors and does not use + the error codes. The original code was commented out and a warning + string was inserted. You need to rewrite this code. + */ + SYCL_CHECK( + CHECK_TRY_ERROR(extra->events[i][is] = new sycl::event())); + } + } + tensor->backend = GGML_BACKEND_TYPE_GPU_SPLIT; + tensor->extra = extra; +} +catch (sycl::exception const &exc) { + std::cerr << exc.what() << "Exception caught at file:" << __FILE__ + << ", line:" << __LINE__ << std::endl; + std::exit(1); +} + +GGML_CALL static void +ggml_backend_sycl_split_buffer_set_tensor(ggml_backend_buffer_t buffer, + ggml_tensor *tensor, const void *data, + size_t offset, size_t size) try { + // split tensors must always be set in their entirety at once + GGML_ASSERT(offset == 0); + GGML_ASSERT(size == ggml_nbytes(tensor)); + + ggml_backend_sycl_split_buffer_type_context * buft_ctx = (ggml_backend_sycl_split_buffer_type_context *)buffer->buft->context; + + const int64_t ne0 = tensor->ne[0]; + const size_t nb1 = tensor->nb[1]; + ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *)tensor->extra; + + for (int i = 0; i < g_device_count; ++i) { + // int id = g_sycl_gpu_mgr->gpus[i]; + int64_t row_low, row_high; + get_row_split(&row_low, &row_high, tensor, buft_ctx->tensor_split, i); + + int64_t nrows_split = row_high - row_low; + if (nrows_split == 0) { + continue; + } + + const size_t offset_split = row_low*nb1; + size_t size = ggml_nbytes_split(tensor, nrows_split); + const size_t original_size = size; + + // pad last row to a multiple of 512 elements to avoid out-of-bounds memory accesses + if (ne0 % MATRIX_ROW_PADDING != 0) { + size += ggml_row_size(tensor->type, MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING); + } + + const char * buf_host = (const char *)data + offset_split; + /* + DPCT1009:211: SYCL uses exceptions to report errors and does not use the + error codes. The original code was commented out and a warning string + was inserted. You need to rewrite this code. 
+ */ + ggml_sycl_set_device(i); + SYCL_CHECK(CHECK_TRY_ERROR( + (*g_syclStreams[i][0]) + .memcpy(extra->data_device[i], buf_host, original_size) + .wait())); + } +} +catch (sycl::exception const &exc) { + std::cerr << exc.what() << "Exception caught at file:" << __FILE__ + << ", line:" << __LINE__ << std::endl; + std::exit(1); +} + +GGML_CALL static void +ggml_backend_sycl_split_buffer_get_tensor(ggml_backend_buffer_t buffer, + const ggml_tensor *tensor, void *data, + size_t offset, size_t size) try { + // split tensors must always be set in their entirety at once + GGML_ASSERT(offset == 0); + GGML_ASSERT(size == ggml_nbytes(tensor)); + + ggml_backend_sycl_split_buffer_type_context * buft_ctx = (ggml_backend_sycl_split_buffer_type_context *)buffer->buft->context; + + const int64_t ne0 = tensor->ne[0]; + const size_t nb1 = tensor->nb[1]; + ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *)tensor->extra; + + for (int i = 0; i < g_device_count; ++i) { + // int id = g_sycl_gpu_mgr->gpus[i]; + int64_t row_low, row_high; + get_row_split(&row_low, &row_high, tensor, buft_ctx->tensor_split, i); + + int64_t nrows_split = row_high - row_low; + if (nrows_split == 0) { + continue; + } + + const size_t offset_split = row_low*nb1; + size_t size = ggml_nbytes_split(tensor, nrows_split); + const size_t original_size = size; + + // pad last row to a multiple of 512 elements to avoid out-of-bounds memory accesses + if (ne0 % MATRIX_ROW_PADDING != 0) { + size += ggml_row_size(tensor->type, MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING); + } + + char * buf_host = (char *)data + offset_split; + /* + DPCT1009:212: SYCL uses exceptions to report errors and does not use the + error codes. The original code was commented out and a warning string + was inserted. You need to rewrite this code. + */ + ggml_sycl_set_device(i); + SYCL_CHECK(CHECK_TRY_ERROR( + (*g_syclStreams[i][0]) + .memcpy(buf_host, extra->data_device[i], original_size) + .wait())); + } +} +catch (sycl::exception const &exc) { + std::cerr << exc.what() << "Exception caught at file:" << __FILE__ + << ", line:" << __LINE__ << std::endl; + std::exit(1); +} + +GGML_CALL static void ggml_backend_sycl_split_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) { + UNUSED(buffer); + UNUSED(value); +} + +static struct ggml_backend_buffer_i ggml_backend_sycl_split_buffer_interface = { + /* .get_name = */ ggml_backend_sycl_split_buffer_get_name, + /* .free_buffer = */ ggml_backend_sycl_split_buffer_free_buffer, + /* .get_base = */ ggml_backend_sycl_split_buffer_get_base, + /* .init_tensor = */ ggml_backend_sycl_split_buffer_init_tensor, + /* .set_tensor = */ ggml_backend_sycl_split_buffer_set_tensor, + /* .get_tensor = */ ggml_backend_sycl_split_buffer_get_tensor, + /* .cpy_tensor = */ NULL, + /* .clear = */ ggml_backend_sycl_split_buffer_clear, + /* .reset = */ NULL, +}; + +GGML_CALL static const char * ggml_backend_sycl_split_buffer_type_name(ggml_backend_buffer_type_t buft) { + return GGML_SYCL_NAME "_Split"; + + UNUSED(buft); +} + +GGML_CALL static ggml_backend_buffer_t ggml_backend_sycl_split_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) { + // since we don't know the exact split after rounding, we cannot allocate the device buffers at this point + // instead, we allocate them for each tensor separately in init_tensor + // however, the size still represents the maximum cumulative size of all the device buffers after the tensors are allocated, + // as returned by get_alloc_size. 
this limit is enforced during tensor allocation by ggml-alloc, so it must be correct. + ggml_backend_sycl_split_buffer_context * ctx = new ggml_backend_sycl_split_buffer_context(); + + return ggml_backend_buffer_init(buft, ggml_backend_sycl_split_buffer_interface, ctx, size); +} + +GGML_CALL static size_t ggml_backend_sycl_split_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) { + return 128; + UNUSED(buft); +} + +GGML_CALL static size_t ggml_backend_sycl_split_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, const ggml_tensor * tensor) { + ggml_backend_sycl_split_buffer_type_context * ctx = (ggml_backend_sycl_split_buffer_type_context *)buft->context; + + size_t total_size = 0; + + const int64_t ne0 = tensor->ne[0]; + + for (int i = 0; i < g_device_count; ++i) { + // int id = g_sycl_gpu_mgr->gpus[i]; + int64_t row_low, row_high; + get_row_split(&row_low, &row_high, tensor, ctx->tensor_split, i); + + int64_t nrows_split = row_high - row_low; + if (nrows_split == 0) { + continue; + } + + total_size += ggml_nbytes_split(tensor, nrows_split); + + // pad last row to a multiple of 512 elements to avoid out-of-bounds memory accesses + if (ne0 % MATRIX_ROW_PADDING != 0) { + total_size += ggml_row_size(tensor->type, MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING); + } + } + + return total_size; +} + +GGML_CALL static bool ggml_backend_sycl_split_buffer_type_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) { + return ggml_backend_is_sycl(backend); + + UNUSED(buft); +} + +GGML_CALL static bool ggml_backend_sycl_split_buffer_type_is_host(ggml_backend_buffer_type_t buft) { + return false; + + UNUSED(buft); +} + +static ggml_backend_buffer_type_i ggml_backend_sycl_split_buffer_type_interface = { + /* .get_name = */ ggml_backend_sycl_split_buffer_type_name, + /* .alloc_buffer = */ ggml_backend_sycl_split_buffer_type_alloc_buffer, + /* .get_alignment = */ ggml_backend_sycl_split_buffer_type_get_alignment, + /* .get_max_size = */ NULL, // defaults to SIZE_MAX + /* .get_alloc_size = */ ggml_backend_sycl_split_buffer_type_get_alloc_size, + /* .supports_backend = */ ggml_backend_sycl_split_buffer_type_supports_backend, + /* .is_host = */ ggml_backend_sycl_split_buffer_type_is_host, +}; + +GGML_CALL ggml_backend_buffer_type_t ggml_backend_sycl_split_buffer_type(const float * tensor_split) { + // FIXME: this is not thread safe + static std::map, struct ggml_backend_buffer_type> buft_map; + + std::array tensor_split_arr = {}; + + bool all_zero = tensor_split == nullptr || std::all_of(tensor_split, tensor_split + GGML_SYCL_MAX_DEVICES, [](float x) { return x == 0.0f; }); + if (all_zero) { + tensor_split_arr = g_default_tensor_split; + } else { + float split_sum = 0.0f; + for (int i = 0; i < g_device_count; ++i) { + // int id = g_sycl_gpu_mgr->gpus[i]; + tensor_split_arr[i] = split_sum; + split_sum += tensor_split[i]; + } + for (int i = 0; i < g_device_count; ++i) { + // int id = g_sycl_gpu_mgr->gpus[i]; + tensor_split_arr[i] /= split_sum; + } + } + + auto it = buft_map.find(tensor_split_arr); + if (it != buft_map.end()) { + return &it->second; + } + + struct ggml_backend_buffer_type buft { + /* .iface = */ ggml_backend_sycl_split_buffer_type_interface, + /* .context = */ new ggml_backend_sycl_split_buffer_type_context{tensor_split_arr}, + }; + + auto result = buft_map.emplace(tensor_split_arr, buft); + return &result.first->second; +} + // host buffer type GGML_CALL static const char * ggml_backend_sycl_host_buffer_type_name(ggml_backend_buffer_type_t buft) { @@ 
-14824,6 +15469,7 @@ static ggml_backend_buffer_t ggml_backend_sycl_host_buffer_type_alloc_buffer(ggm // FIXME: this is a hack to avoid having to implement a new buffer type ggml_backend_buffer_t buffer = ggml_backend_cpu_buffer_from_ptr(ptr, size); buffer->buft = buft; + buffer->iface.get_name = ggml_backend_sycl_host_buffer_name; buffer->iface.free_buffer = ggml_backend_sycl_host_buffer_free_buffer; return buffer; @@ -14848,34 +15494,33 @@ ggml_backend_buffer_type_t ggml_backend_sycl_host_buffer_type() { // backend -static const char * ggml_backend_sycl_name(ggml_backend_t backend) { - return GGML_SYCL_NAME; +GGML_CALL static const char * ggml_backend_sycl_name(ggml_backend_t backend) { - UNUSED(backend); + ggml_backend_sycl_context * sycl_ctx = (ggml_backend_sycl_context *)backend->context; + + return sycl_ctx->name.c_str(); } -static void ggml_backend_sycl_free(ggml_backend_t backend) { +GGML_CALL static void ggml_backend_sycl_free(ggml_backend_t backend) { ggml_backend_sycl_context * sycl_ctx = (ggml_backend_sycl_context *)backend->context; delete sycl_ctx; delete backend; } -static ggml_backend_buffer_type_t ggml_backend_sycl_get_default_buffer_type(ggml_backend_t backend) { - ggml_backend_sycl_context * sycl_ctx = (ggml_backend_sycl_context *)backend->context; +GGML_CALL static ggml_backend_buffer_type_t ggml_backend_sycl_get_default_buffer_type(ggml_backend_t backend) { + ggml_backend_sycl_context * sycl_ctx = (ggml_backend_sycl_context *)backend->context; return ggml_backend_sycl_buffer_type(sycl_ctx->device); } -static void ggml_backend_sycl_set_tensor_async(ggml_backend_t backend, +GGML_CALL static void ggml_backend_sycl_set_tensor_async(ggml_backend_t backend, ggml_tensor *tensor, const void *data, size_t offset, size_t size) try { ggml_backend_sycl_context * sycl_ctx = (ggml_backend_sycl_context *)backend->context; - GGML_ASSERT(tensor->buffer->buft == ggml_backend_sycl_buffer_type(sycl_ctx->device) && "unsupported buffer type"); GGML_ASSERT(tensor->backend == GGML_BACKEND_TYPE_GPU); - SYCL_CHECK(CHECK_TRY_ERROR(g_syclStreams[sycl_ctx->device][0]->memcpy( (char *)tensor->data + offset, data, size))); } @@ -14885,15 +15530,13 @@ catch (sycl::exception const &exc) { std::exit(1); } -static void ggml_backend_sycl_get_tensor_async(ggml_backend_t backend, +GGML_CALL static void ggml_backend_sycl_get_tensor_async(ggml_backend_t backend, const ggml_tensor *tensor, void *data, size_t offset, size_t size) try { ggml_backend_sycl_context * sycl_ctx = (ggml_backend_sycl_context *)backend->context; - GGML_ASSERT(tensor->buffer->buft == ggml_backend_sycl_buffer_type(sycl_ctx->device) && "unsupported buffer type"); GGML_ASSERT(tensor->backend == GGML_BACKEND_TYPE_GPU); - SYCL_CHECK(CHECK_TRY_ERROR(g_syclStreams[sycl_ctx->device][0]->memcpy( data, (const char *)tensor->data + offset, size))); } @@ -14903,9 +15546,31 @@ catch (sycl::exception const &exc) { std::exit(1); } +GGML_CALL static bool ggml_backend_sycl_cpy_tensor_async(ggml_backend_t backend, + const ggml_tensor *src, + ggml_tensor *dst) try { + ggml_backend_sycl_context * sycl_ctx = (ggml_backend_sycl_context *)backend->context; + if (dst->buffer->buft == ggml_backend_sycl_buffer_type(sycl_ctx->device) && ggml_backend_buffer_is_sycl(src->buffer)) { + /* + DPCT1009:215: SYCL uses exceptions to report errors and does not use the + error codes. The original code was commented out and a warning string + was inserted. You need to rewrite this code. 
+ */ + SYCL_CHECK(CHECK_TRY_ERROR(g_syclStreams[sycl_ctx->device][0]->memcpy( + dst->data, src->data, ggml_nbytes(dst)))); + return true; + } + + return false; +} +catch (sycl::exception const &exc) { + std::cerr << exc.what() << "Exception caught at file:" << __FILE__ + << ", line:" << __LINE__ << std::endl; + std::exit(1); +} + static void ggml_backend_sycl_synchronize(ggml_backend_t backend) try { ggml_backend_sycl_context * sycl_ctx = (ggml_backend_sycl_context *)backend->context; - SYCL_CHECK(CHECK_TRY_ERROR(g_syclStreams[sycl_ctx->device][0]->wait())); UNUSED(backend); @@ -14916,32 +15581,8 @@ catch (sycl::exception const &exc) { std::exit(1); } -static ggml_backend_graph_plan_t ggml_backend_sycl_graph_plan_create(ggml_backend_t backend, const ggml_cgraph * cgraph) { - GGML_ASSERT(!"not implemented"); - - return nullptr; - - UNUSED(backend); - UNUSED(cgraph); -} - -static void ggml_backend_sycl_graph_plan_free(ggml_backend_t backend, ggml_backend_graph_plan_t plan) { - GGML_ASSERT(!"not implemented"); - - UNUSED(backend); - UNUSED(plan); -} - -static void ggml_backend_sycl_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan) { - GGML_ASSERT(!"not implemented"); - - UNUSED(backend); - UNUSED(plan); -} - -static bool ggml_backend_sycl_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) { +GGML_CALL static bool ggml_backend_sycl_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) { ggml_backend_sycl_context * sycl_ctx = (ggml_backend_sycl_context *)backend->context; - ggml_sycl_set_main_device(sycl_ctx->device); ggml_compute_params params = {}; @@ -14949,63 +15590,41 @@ static bool ggml_backend_sycl_graph_compute(ggml_backend_t backend, ggml_cgraph params.ith = 0; for (int i = 0; i < cgraph->n_nodes; i++) { ggml_tensor * node = cgraph->nodes[i]; - - if (node->op == GGML_OP_RESHAPE || node->op == GGML_OP_TRANSPOSE || node->op == GGML_OP_VIEW || node->op == GGML_OP_PERMUTE) + if (node->op == GGML_OP_RESHAPE || node->op == GGML_OP_TRANSPOSE || node->op == GGML_OP_VIEW || node->op == GGML_OP_PERMUTE || node->op == GGML_OP_NONE) { continue; - - assert(node->backend == GGML_BACKEND_TYPE_GPU); + } +#ifndef NDEBUG + assert(node->backend == GGML_BACKEND_TYPE_GPU || node->backend == GGML_BACKEND_TYPE_GPU_SPLIT); assert(node->buffer->buft == ggml_backend_sycl_buffer_type(sycl_ctx->device)); assert(node->extra != nullptr); for (int j = 0; j < GGML_MAX_SRC; j++) { if (node->src[j] != nullptr) { - assert(node->src[j]->backend == GGML_BACKEND_TYPE_GPU); + assert(node->src[j]->backend == GGML_BACKEND_TYPE_GPU || node->src[j]->backend == GGML_BACKEND_TYPE_GPU_SPLIT); assert(node->src[j]->buffer->buft == ggml_backend_sycl_buffer_type(sycl_ctx->device)); assert(node->src[j]->extra != nullptr); } } - +#endif bool ok = ggml_sycl_compute_forward(¶ms, node); if (!ok) { fprintf(stderr, "%s: error: op not supported %s (%s)\n", __func__, node->name, ggml_op_name(node->op)); } GGML_ASSERT(ok); - -#if 0 - if (node->type == GGML_TYPE_F32) { - syclDeviceSynchronize(); - std::vector tmp(ggml_nelements(node), 0.0f); - syclMemcpy(tmp.data(), node->data, ggml_nelements(node)*sizeof(float), syclMemcpyDeviceToHost); - printf("\n%s (%s) (%s %s) (%s %s): ", node->name, ggml_op_name(node->op), - ggml_type_name(node->src[0]->type), - node->src[1] ? ggml_type_name(node->src[1]->type) : "none", - node->src[0]->name, - node->src[1] ? 
node->src[1]->name : "none"); - double sum = 0.0; - double sq_sum = 0.0; - for (int i = 0; i < ggml_nelements(node); i++) { - printf("%f ", tmp[i]); - sum += tmp[i]; - sq_sum += tmp[i]*tmp[i]; - } - printf("\n"); - printf("sum: %f, ", sum); - printf("sq_sum: %f\n", sq_sum); - } -#endif } - UNUSED(backend); return true; } -static bool ggml_backend_sycl_supports_op(ggml_backend_t backend, const ggml_tensor * op) { +GGML_CALL static bool ggml_backend_sycl_supports_op(ggml_backend_t backend, const ggml_tensor * op) { switch (op->op) { case GGML_OP_UNARY: switch (ggml_get_unary_op(op)) { case GGML_UNARY_OP_GELU: case GGML_UNARY_OP_SILU: case GGML_UNARY_OP_RELU: + case GGML_UNARY_OP_HARDSIGMOID: + case GGML_UNARY_OP_HARDSWISH: case GGML_UNARY_OP_GELU_QUICK: case GGML_UNARY_OP_TANH: return true; @@ -15081,16 +15700,17 @@ static bool ggml_backend_sycl_supports_op(ggml_backend_t backend, const ggml_ten if (src0_type == GGML_TYPE_F16 && src1_type == GGML_TYPE_F16) { return true; } + if (src0_type == GGML_TYPE_F16 && src1_type == GGML_TYPE_F32) { + return true; + } return false; } break; + case GGML_OP_DUP: + case GGML_OP_REPEAT: case GGML_OP_CONCAT: { ggml_type src0_type = op->src[0]->type; - if (src0_type == GGML_TYPE_F32) { - return true; - } else { - return false; - } + return src0_type != GGML_TYPE_I32 && src0_type != GGML_TYPE_I16; } break; case GGML_OP_NONE: case GGML_OP_RESHAPE: @@ -15098,8 +15718,6 @@ static bool ggml_backend_sycl_supports_op(ggml_backend_t backend, const ggml_ten case GGML_OP_PERMUTE: case GGML_OP_TRANSPOSE: case GGML_OP_NORM: - case GGML_OP_REPEAT: - case GGML_OP_DUP: case GGML_OP_ADD: case GGML_OP_MUL: case GGML_OP_DIV: @@ -15113,6 +15731,7 @@ static bool ggml_backend_sycl_supports_op(ggml_backend_t backend, const ggml_ten case GGML_OP_ROPE: case GGML_OP_ALIBI: case GGML_OP_IM2COL: + case GGML_OP_POOL_2D: case GGML_OP_SUM_ROWS: case GGML_OP_ARGSORT: case GGML_OP_ACC: @@ -15134,11 +15753,11 @@ static ggml_backend_i ggml_backend_sycl_interface = { /* .get_default_buffer_type = */ ggml_backend_sycl_get_default_buffer_type, /* .set_tensor_async = */ ggml_backend_sycl_set_tensor_async, /* .get_tensor_async = */ ggml_backend_sycl_get_tensor_async, - /* .cpy_tensor_async = */ NULL, + /* .cpy_tensor_async = */ ggml_backend_sycl_cpy_tensor_async, /* .synchronize = */ ggml_backend_sycl_synchronize, - /* .graph_plan_create = */ ggml_backend_sycl_graph_plan_create, - /* .graph_plan_free = */ ggml_backend_sycl_graph_plan_free, - /* .graph_plan_compute = */ ggml_backend_sycl_graph_plan_compute, + /* .graph_plan_create = */ NULL, + /* .graph_plan_free = */ NULL, + /* .graph_plan_compute = */ NULL, /* .graph_compute = */ ggml_backend_sycl_graph_compute, /* .supports_op = */ ggml_backend_sycl_supports_op, }; @@ -15148,20 +15767,17 @@ static ggml_guid_t ggml_backend_sycl_guid() { return &guid; } -ggml_backend_t ggml_backend_sycl_init(int device) { +GGML_CALL ggml_backend_t ggml_backend_sycl_init(int device) { ggml_init_sycl(); // TODO: remove from ggml.c - if (device < 0 || device >= ggml_sycl_get_device_count()) { - fprintf(stderr, "%s: error: invalid device %d\n", __func__, device); - return nullptr; - } + check_allow_gpu_index(device); // not strictly necessary, but it may reduce the overhead of the first graph_compute ggml_sycl_set_main_device(device); - + int id = g_sycl_gpu_mgr->gpus[device]; ggml_backend_sycl_context * ctx = new ggml_backend_sycl_context { /* .device = */ device, - /* .name = */ GGML_SYCL_NAME + std::to_string(device), + /* .name = */ GGML_SYCL_NAME + 
std::to_string(id), }; ggml_backend_t sycl_backend = new ggml_backend { @@ -15177,22 +15793,33 @@ bool ggml_backend_is_sycl(ggml_backend_t backend) { return backend != NULL && ggml_guid_matches(backend->guid, ggml_backend_sycl_guid()); } -static ggml_backend_t ggml_backend_reg_sycl_init(const char * params, void * user_data) { +GGML_CALL int ggml_backend_sycl_get_device_count() { + if (!g_sycl_gpu_mgr) g_sycl_gpu_mgr = new sycl_gpu_mgr(); + return g_sycl_gpu_mgr->get_gpu_count(); +} + +GGML_CALL static ggml_backend_t ggml_backend_reg_sycl_init(const char * params, void * user_data) { ggml_backend_t sycl_backend = ggml_backend_sycl_init((int) (intptr_t) user_data); return sycl_backend; UNUSED(params); } +GGML_API GGML_CALL int ggml_backend_sycl_get_device_index(int device_id) { + return g_sycl_gpu_mgr->get_index(device_id); +} + extern "C" int ggml_backend_sycl_reg_devices(); int ggml_backend_sycl_reg_devices() { - int device_count = ggml_sycl_get_device_count(); - - for (int i = 0; i < device_count; i++) { + if (!g_sycl_gpu_mgr) g_sycl_gpu_mgr = new sycl_gpu_mgr(); + g_device_count = g_sycl_gpu_mgr->get_gpu_count(); + assert(g_device_count>0); + for (int i = 0; i < g_device_count; i++) { + int id = g_sycl_gpu_mgr->gpus[i]; char name[128]; - snprintf(name, sizeof(name), "%s%d", GGML_SYCL_NAME, i); + snprintf(name, sizeof(name), "%s%d", GGML_SYCL_NAME, id); ggml_backend_register(name, ggml_backend_reg_sycl_init, ggml_backend_sycl_buffer_type(i), (void *) (intptr_t) i); } - return device_count; + return g_device_count; } diff --git a/ggml-sycl.h b/ggml-sycl.h index 891f2d00a..bf5b11b36 100644 --- a/ggml-sycl.h +++ b/ggml-sycl.h @@ -24,6 +24,11 @@ GGML_API ggml_backend_buffer_type_t ggml_backend_sycl_host_buffer_type(void); GGML_API void ggml_backend_sycl_print_sycl_devices(void); GGML_API GGML_CALL void ggml_sycl_get_gpu_list(int *id_list, int max_len); GGML_API GGML_CALL void ggml_sycl_get_device_description(int device, char *description, size_t description_size); +GGML_API GGML_CALL int ggml_backend_sycl_get_device_count(); +GGML_API GGML_CALL ggml_backend_buffer_type_t ggml_backend_sycl_split_buffer_type(const float * tensor_split); +GGML_API GGML_CALL void ggml_backend_sycl_get_device_memory(int device, size_t *free, size_t *total); +GGML_API GGML_CALL int ggml_backend_sycl_get_device_index(int device_id); + #ifdef __cplusplus } #endif diff --git a/llama.cpp b/llama.cpp index b1db5b179..cb6266a43 100644 --- a/llama.cpp +++ b/llama.cpp @@ -104,6 +104,7 @@ #define LLAMA_MAX_NODES 8192 #define LLAMA_MAX_EXPERTS 8 + // // logging // @@ -1429,7 +1430,9 @@ static ggml_backend_buffer_type_t llama_default_buffer_type_cpu(bool host_buffer buft = ggml_backend_cuda_host_buffer_type(); } #elif defined(GGML_USE_SYCL) - buft = ggml_backend_sycl_host_buffer_type(); + if (host_buffer) { + buft = ggml_backend_sycl_host_buffer_type(); + } #elif defined(GGML_USE_CPU_HBM) buft = ggml_backend_cpu_hbm_buffer_type(); #elif defined(GGML_USE_VULKAN) @@ -1483,6 +1486,12 @@ static ggml_backend_buffer_type_t llama_default_buffer_type_split(int fallback_g } #endif +#ifdef GGML_USE_SYCL + if (ggml_backend_sycl_get_device_count() > 1) { + buft = ggml_backend_sycl_split_buffer_type(tensor_split); + } +#endif + if (buft == nullptr) { buft = llama_default_buffer_type_offload(fallback_gpu); } @@ -1494,6 +1503,8 @@ static ggml_backend_buffer_type_t llama_default_buffer_type_split(int fallback_g static size_t llama_get_device_count() { #if defined(GGML_USE_CUBLAS) return ggml_backend_cuda_get_device_count(); +#elif 
defined(GGML_USE_SYCL) + return ggml_backend_sycl_get_device_count(); #elif defined(GGML_USE_VULKAN) return ggml_backend_vk_get_device_count(); #else @@ -1507,6 +1518,11 @@ static size_t llama_get_device_memory(int device) { size_t free; ggml_backend_cuda_get_device_memory(device, &total, &free); return free; +#elif defined(GGML_USE_SYCL) + size_t total; + size_t free; + ggml_backend_sycl_get_device_memory(device, &total, &free); + return free; #elif defined(GGML_USE_VULKAN) size_t total; size_t free; @@ -12075,13 +12091,31 @@ struct llama_context * llama_new_context_with_model( } #elif defined(GGML_USE_SYCL) if (model->n_gpu_layers > 0) { - ggml_backend_t backend = ggml_backend_sycl_init(model->main_gpu); - if (backend == nullptr) { - LLAMA_LOG_ERROR("%s: failed to initialize SYCL%d backend\n", __func__, model->main_gpu); - llama_free(ctx); - return nullptr; + // with split_mode LLAMA_SPLIT_MODE_NONE or LLAMA_SPLIT_MODE_ROW, only the main GPU backend is used + if (model->split_mode == LLAMA_SPLIT_MODE_NONE || model->split_mode == LLAMA_SPLIT_MODE_ROW) { + int main_gpu_index = ggml_backend_sycl_get_device_index(model->main_gpu); + ggml_backend_t backend = ggml_backend_sycl_init(main_gpu_index); + if (backend == nullptr) { + LLAMA_LOG_ERROR("%s: failed to initialize SYCL%d (index %d)backend\n", __func__, model->main_gpu, main_gpu_index); + llama_free(ctx); + return nullptr; + } + ctx->backends.push_back(backend); + } else { + // LLAMA_SPLIT_LAYER requires a backend for each GPU + int id_list[GGML_SYCL_MAX_DEVICES]; + ggml_sycl_get_gpu_list(id_list, GGML_SYCL_MAX_DEVICES); + for (int i = 0; i < ggml_backend_sycl_get_device_count(); ++i) { + int device_id = id_list[i]; + ggml_backend_t backend = ggml_backend_sycl_init(i); + if (backend == nullptr) { + LLAMA_LOG_ERROR("%s: failed to initialize SYCL%d (index %d)backend\n", __func__, device_id, i); + llama_free(ctx); + return nullptr; + } + ctx->backends.push_back(backend); + } } - ctx->backends.push_back(backend); } #elif defined(GGML_USE_KOMPUTE) if (model->n_gpu_layers > 0) { @@ -12161,7 +12195,6 @@ struct llama_context * llama_new_context_with_model( ggml_set_name(ctx->inp_cls, "inp_cls"); ctx->buf_input = ggml_backend_alloc_ctx_tensors_from_buft(ctx->ctx_input, llama_default_buffer_type_cpu(true)); - LLAMA_LOG_INFO("%s: %10s input buffer size = %8.2f MiB\n", __func__, ggml_backend_buffer_name(ctx->buf_input), ggml_backend_buffer_get_size(ctx->buf_input) / 1024.0 / 1024.0); From 802da0091ba646ecf02e1a8fae2da0b8e76409bd Mon Sep 17 00:00:00 2001 From: compilade <113953597+compilade@users.noreply.github.com> Date: Sat, 2 Mar 2024 08:42:56 -0500 Subject: [PATCH 18/26] llama : fix segfault from unknown model arch name (#5820) * llama : fix segfault from unknown model arch name * llama : make all LLM maps const This also requires using `std::map::at` instead of its `operator[]` which does not exist for const maps. * llama : name LLM_ARCH_UNKNOWN to "(unknown)" This avoids errors from `std::map::at` when getting the general name of the model architecture. Using "(unknown)" instead of an empty string as per suggestion https://github.com/ggerganov/llama.cpp/pull/5820#issuecomment-1973735284 * llama : remove redundant inner const for LLM_TENSOR_NAMES The extra const won't do anything here as const maps return const references to values. 
Co-authored-by: Jared Van Bortel * llama : remove redundant nullptr check in llm_arch_from_string Since LLM_ARCH_NAMES is a const map, no spurious elements with a NULL name are inserted anymore, so this check is dead code. --------- Co-authored-by: Jared Van Bortel --- llama.cpp | 31 ++++++++++++++++--------------- 1 file changed, 16 insertions(+), 15 deletions(-) diff --git a/llama.cpp b/llama.cpp index cb6266a43..790c2740f 100644 --- a/llama.cpp +++ b/llama.cpp @@ -216,7 +216,7 @@ enum llm_arch { LLM_ARCH_UNKNOWN, }; -static std::map LLM_ARCH_NAMES = { +static const std::map LLM_ARCH_NAMES = { { LLM_ARCH_LLAMA, "llama" }, { LLM_ARCH_FALCON, "falcon" }, { LLM_ARCH_GPT2, "gpt2" }, @@ -241,6 +241,7 @@ static std::map LLM_ARCH_NAMES = { { LLM_ARCH_MINICPM, "minicpm" }, { LLM_ARCH_GEMMA, "gemma" }, { LLM_ARCH_STARCODER2, "starcoder2" }, + { LLM_ARCH_UNKNOWN, "(unknown)" }, }; enum llm_kv { @@ -301,7 +302,7 @@ enum llm_kv { LLM_KV_TOKENIZER_RWKV, }; -static std::map LLM_KV_NAMES = { +static const std::map LLM_KV_NAMES = { { LLM_KV_GENERAL_ARCHITECTURE, "general.architecture" }, { LLM_KV_GENERAL_QUANTIZATION_VERSION, "general.quantization_version" }, { LLM_KV_GENERAL_ALIGNMENT, "general.alignment" }, @@ -365,7 +366,7 @@ struct LLM_KV { llm_arch arch; std::string operator()(llm_kv kv) const { - return ::format(LLM_KV_NAMES[kv], LLM_ARCH_NAMES[arch]); + return ::format(LLM_KV_NAMES.at(kv), LLM_ARCH_NAMES.at(arch)); } }; @@ -400,7 +401,7 @@ enum llm_tensor { LLM_TENSOR_LAYER_OUT_NORM, }; -static std::map> LLM_TENSOR_NAMES = { +static const std::map> LLM_TENSOR_NAMES = { { LLM_ARCH_LLAMA, { @@ -833,38 +834,38 @@ struct LLM_TN { llm_arch arch; std::string operator()(llm_tensor tensor) const { - if (LLM_TENSOR_NAMES[arch].find(tensor) == LLM_TENSOR_NAMES[arch].end()) { + if (LLM_TENSOR_NAMES.at(arch).find(tensor) == LLM_TENSOR_NAMES.at(arch).end()) { return "__missing__"; } - return LLM_TENSOR_NAMES[arch].at(tensor); + return LLM_TENSOR_NAMES.at(arch).at(tensor); } std::string operator()(llm_tensor tensor, const std::string & suffix) const { - if (LLM_TENSOR_NAMES[arch].find(tensor) == LLM_TENSOR_NAMES[arch].end()) { + if (LLM_TENSOR_NAMES.at(arch).find(tensor) == LLM_TENSOR_NAMES.at(arch).end()) { return "__missing__"; } - return LLM_TENSOR_NAMES[arch].at(tensor) + "." + suffix; + return LLM_TENSOR_NAMES.at(arch).at(tensor) + "." + suffix; } std::string operator()(llm_tensor tensor, int bid) const { - if (LLM_TENSOR_NAMES[arch].find(tensor) == LLM_TENSOR_NAMES[arch].end()) { + if (LLM_TENSOR_NAMES.at(arch).find(tensor) == LLM_TENSOR_NAMES.at(arch).end()) { return "__missing__"; } - return ::format(LLM_TENSOR_NAMES[arch].at(tensor).c_str(), bid); + return ::format(LLM_TENSOR_NAMES.at(arch).at(tensor).c_str(), bid); } std::string operator()(llm_tensor tensor, const std::string & suffix, int bid) const { - if (LLM_TENSOR_NAMES[arch].find(tensor) == LLM_TENSOR_NAMES[arch].end()) { + if (LLM_TENSOR_NAMES.at(arch).find(tensor) == LLM_TENSOR_NAMES.at(arch).end()) { return "__missing__"; } - return ::format(LLM_TENSOR_NAMES[arch].at(tensor).c_str(), bid) + "." + suffix; + return ::format(LLM_TENSOR_NAMES.at(arch).at(tensor).c_str(), bid) + "." + suffix; } std::string operator()(llm_tensor tensor, const std::string & suffix, int bid, int xid) const { - if (LLM_TENSOR_NAMES[arch].find(tensor) == LLM_TENSOR_NAMES[arch].end()) { + if (LLM_TENSOR_NAMES.at(arch).find(tensor) == LLM_TENSOR_NAMES.at(arch).end()) { return "__missing__"; } - return ::format(LLM_TENSOR_NAMES[arch].at(tensor).c_str(), bid, xid) + "." 
+ suffix; + return ::format(LLM_TENSOR_NAMES.at(arch).at(tensor).c_str(), bid, xid) + "." + suffix; } }; @@ -872,7 +873,7 @@ struct LLM_TN { // gguf helpers // -static std::map LLAMA_ROPE_SCALING_TYPES = { +static const std::map LLAMA_ROPE_SCALING_TYPES = { { LLAMA_ROPE_SCALING_TYPE_NONE, "none" }, { LLAMA_ROPE_SCALING_TYPE_LINEAR, "linear" }, { LLAMA_ROPE_SCALING_TYPE_YARN, "yarn" }, From 6c32d8c7ad8ba7b6ad2a162e929a21dd04fcdca0 Mon Sep 17 00:00:00 2001 From: Xuan Son Nguyen Date: Sat, 2 Mar 2024 15:19:09 +0100 Subject: [PATCH 19/26] llama : refactor internal quantization functions (#5830) --- llama.cpp | 81 +++++++++++++++++++++++++++++-------------------------- 1 file changed, 43 insertions(+), 38 deletions(-) diff --git a/llama.cpp b/llama.cpp index 790c2740f..697e85e89 100644 --- a/llama.cpp +++ b/llama.cpp @@ -10836,7 +10836,7 @@ struct quantize_state_internal { {} }; -static void llama_convert_tensor_internal( +static void llama_tensor_dequantize_internal( struct ggml_tensor * tensor, std::vector> & output, std::vector & workers, const size_t nelements, const int nthread ) { @@ -11177,6 +11177,46 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty return new_type; } +static int32_t llama_tensor_quantize_internal(enum ggml_type new_type, const float * f32_data, void * new_data, const int chunk_size, int nrows, int n_per_row, int64_t * hist_cur, const float * imatrix, std::vector & workers, const int nthread) { + std::mutex mutex; + int counter = 0; + size_t new_size = 0; + if (nthread < 2) { + // single-thread + return ggml_quantize_chunk(new_type, f32_data, new_data, 0, nrows, n_per_row, hist_cur, imatrix); + } + auto compute = [&mutex, &counter, &hist_cur, &new_size, new_type, f32_data, new_data, chunk_size, + nrows, n_per_row, imatrix]() { + std::array local_hist = {}; + const int nrows_per_chunk = chunk_size / n_per_row; + size_t local_size = 0; + while (true) { + std::unique_lock lock(mutex); + int first_row = counter; counter += nrows_per_chunk; + if (first_row >= nrows) { + if (local_size > 0) { + for (int j=0; jftype; @@ -11289,7 +11329,6 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s std::vector workers; workers.reserve(nthread); - std::mutex mutex; int idx = 0; @@ -11403,7 +11442,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s } else if (ggml_is_quantized(tensor->type) && !params->allow_requantize) { throw std::runtime_error(format("requantizing from type %s is disabled", ggml_type_name(tensor->type))); } else { - llama_convert_tensor_internal(tensor, f32_conv_buf, workers, nelements, nthread); + llama_tensor_dequantize_internal(tensor, f32_conv_buf, workers, nelements, nthread); f32_data = (float *) f32_conv_buf.data(); } @@ -11424,41 +11463,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s const int nchunk = (nelements + chunk_size - 1)/chunk_size; const int nthread_use = nthread > 1 ? 
std::max(1, std::min(nthread, nchunk)) : 1; - if (nthread_use < 2) { - new_size = ggml_quantize_chunk(new_type, f32_data, new_data, 0, nrows, n_per_row, hist_cur.data(), imatrix); - } else { - int counter = 0; - new_size = 0; - auto compute = [&mutex, &counter, &hist_cur, &new_size, new_type, f32_data, new_data, chunk_size, - nrows, n_per_row, imatrix]() { - std::array local_hist = {}; - const int nrows_per_chunk = chunk_size / n_per_row; - size_t local_size = 0; - while (true) { - std::unique_lock lock(mutex); - int first_row = counter; counter += nrows_per_chunk; - if (first_row >= nrows) { - if (local_size > 0) { - for (int j=0; j %8.2f MiB", ggml_nbytes(tensor)/1024.0/1024.0, new_size/1024.0/1024.0); int64_t tot_count = 0; From ef2cd694c4155fbf25bae61c5178c47eb3676dba Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sat, 2 Mar 2024 16:54:08 +0200 Subject: [PATCH 20/26] scripts : add pod-llama.sh --- scripts/pod-llama.sh | 213 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 213 insertions(+) create mode 100644 scripts/pod-llama.sh diff --git a/scripts/pod-llama.sh b/scripts/pod-llama.sh new file mode 100644 index 000000000..6cf1ab4f3 --- /dev/null +++ b/scripts/pod-llama.sh @@ -0,0 +1,213 @@ +#!/bin/bash +# +# Use this script only on fresh pods (runpod.io)! +# Otherwise, it can break your environment! +# + +if [ -z "$1" ]; then + echo "Usage: $0 " + echo " 0: no models" + echo " 1: tinyllama-1b" + echo " 2: codellama-7b" + echo " 3: codellama-13b" + echo " 4: codellama-34b" + echo " 5: codellama-7b-instruct" + echo " 6: codellama-13b-instruct" + echo " 7: codellama-34b-instruct" + + exit 1 +fi + +set -x + +# setup deps +apt-get update +apt-get install -y git-lfs cmake cmake-curses-gui vim ruby +git-lfs install + +if [ ! -d "/workspace" ]; then + ln -sfn $(pwd) /workspace +fi + +# download data +cd /workspace + +# this is useful to git clone repos without doubling the disk size due to .git +git clone https://github.com/iboB/git-lfs-download +ln -sfn /workspace/git-lfs-download/git-lfs-download /usr/local/bin/git-lfs-download + +# llama.cpp +cd /workspace +git clone https://github.com/ggerganov/llama.cpp + +cd llama.cpp + +LLAMA_CUBLAS=1 make -j + +ln -sfn /workspace/TinyLlama-1.1B-Chat-v0.3 ./models/tinyllama-1b +ln -sfn /workspace/CodeLlama-7b-hf ./models/codellama-7b +ln -sfn /workspace/CodeLlama-13b-hf ./models/codellama-13b +ln -sfn /workspace/CodeLlama-34b-hf ./models/codellama-34b +ln -sfn /workspace/CodeLlama-7b-Instruct-hf ./models/codellama-7b-instruct +ln -sfn /workspace/CodeLlama-13b-Instruct-hf ./models/codellama-13b-instruct +ln -sfn /workspace/CodeLlama-34b-Instruct-hf ./models/codellama-34b-instruct + +pip install -r requirements.txt + +# cmake +cd /workspace/llama.cpp + +mkdir build-cublas +cd build-cublas + +cmake -DLLAMA_CUBLAS=1 ../ +make -j + +if [ "$1" -eq "0" ]; then + exit 0 +fi + +# more models +if [ "$1" -eq "1" ]; then + cd /workspace + + git-lfs-download https://huggingface.co/PY007/TinyLlama-1.1B-Chat-v0.3 + + cd /workspace/llama.cpp + + python3 convert.py ./models/tinyllama-1b --outfile ./models/tinyllama-1b/ggml-model-f16.gguf --outtype f16 + + ./quantize ./models/tinyllama-1b/ggml-model-f16.gguf ./models/tinyllama-1b/ggml-model-q4_0.gguf q4_0 + ./quantize ./models/tinyllama-1b/ggml-model-f16.gguf ./models/tinyllama-1b/ggml-model-q4_k.gguf q4_k + ./quantize ./models/tinyllama-1b/ggml-model-f16.gguf ./models/tinyllama-1b/ggml-model-q8_0.gguf q8_0 +fi + +if [ "$1" -eq "2" ]; then + cd /workspace + + git-lfs-download 
https://huggingface.co/codellama/CodeLlama-7b-hf --without *safetensors* + rm -v ./CodeLlama-7b-hf/*safetensors* + + cd /workspace/llama.cpp + + python3 convert.py ./models/codellama-7b --outfile ./models/codellama-7b/ggml-model-f16.gguf --outtype f16 + + ./quantize ./models/codellama-7b/ggml-model-f16.gguf ./models/codellama-7b/ggml-model-q4_0.gguf q4_0 + ./quantize ./models/codellama-7b/ggml-model-f16.gguf ./models/codellama-7b/ggml-model-q4_k.gguf q4_k + ./quantize ./models/codellama-7b/ggml-model-f16.gguf ./models/codellama-7b/ggml-model-q8_0.gguf q8_0 +fi + +if [ "$1" -eq "3" ]; then + cd /workspace + + git-lfs-download https://huggingface.co/codellama/CodeLlama-13b-hf --without *safetensors* + rm -v ./CodeLlama-13b-hf/*safetensors* + + cd /workspace/llama.cpp + + python3 convert.py ./models/codellama-13b --outfile ./models/codellama-13b/ggml-model-f16.gguf --outtype f16 + + ./quantize ./models/codellama-13b/ggml-model-f16.gguf ./models/codellama-13b/ggml-model-q4_0.gguf q4_0 + ./quantize ./models/codellama-13b/ggml-model-f16.gguf ./models/codellama-13b/ggml-model-q4_k.gguf q4_k + ./quantize ./models/codellama-13b/ggml-model-f16.gguf ./models/codellama-13b/ggml-model-q8_0.gguf q8_0 +fi + +if [ "$1" -eq "4" ]; then + cd /workspace + + git-lfs-download https://huggingface.co/codellama/CodeLlama-34b-hf --without *safetensors* + rm -v ./CodeLlama-34b-hf/*safetensors* + + cd /workspace/llama.cpp + + python3 convert.py ./models/codellama-34b --outfile ./models/codellama-34b/ggml-model-f16.gguf --outtype f16 + + ./quantize ./models/codellama-34b/ggml-model-f16.gguf ./models/codellama-34b/ggml-model-q4_0.gguf q4_0 + ./quantize ./models/codellama-34b/ggml-model-f16.gguf ./models/codellama-34b/ggml-model-q4_k.gguf q4_k + ./quantize ./models/codellama-34b/ggml-model-f16.gguf ./models/codellama-34b/ggml-model-q8_0.gguf q8_0 +fi + +if [ "$1" -eq "5" ]; then + cd /workspace + + git-lfs-download https://huggingface.co/codellama/CodeLlama-7b-Instruct-hf --without *safetensors* + rm -v ./CodeLlama-7b-Instruct-hf/*safetensors* + + cd /workspace/llama.cpp + + python3 convert.py ./models/codellama-7b-instruct --outfile ./models/codellama-7b-instruct/ggml-model-f16.gguf --outtype f16 + + ./quantize ./models/codellama-7b-instruct/ggml-model-f16.gguf ./models/codellama-7b-instruct/ggml-model-q4_0.gguf q4_0 + ./quantize ./models/codellama-7b-instruct/ggml-model-f16.gguf ./models/codellama-7b-instruct/ggml-model-q4_k.gguf q4_k + ./quantize ./models/codellama-7b-instruct/ggml-model-f16.gguf ./models/codellama-7b-instruct/ggml-model-q8_0.gguf q8_0 +fi + +if [ "$1" -eq "6" ]; then + cd /workspace + + git-lfs-download https://huggingface.co/codellama/CodeLlama-13b-Instruct-hf --without *safetensors* + rm -v ./CodeLlama-13b-Instruct-hf/*safetensors* + + cd /workspace/llama.cpp + + python3 convert.py ./models/codellama-13b-instruct --outfile ./models/codellama-13b-instruct/ggml-model-f16.gguf --outtype f16 + + ./quantize ./models/codellama-13b-instruct/ggml-model-f16.gguf ./models/codellama-13b-instruct/ggml-model-q4_0.gguf q4_0 + ./quantize ./models/codellama-13b-instruct/ggml-model-f16.gguf ./models/codellama-13b-instruct/ggml-model-q4_k.gguf q4_k + ./quantize ./models/codellama-13b-instruct/ggml-model-f16.gguf ./models/codellama-13b-instruct/ggml-model-q8_0.gguf q8_0 +fi + +if [ "$1" -eq "7" ]; then + cd /workspace + + git-lfs-download https://huggingface.co/codellama/CodeLlama-34b-Instruct-hf --without *safetensors* + rm -v ./CodeLlama-34b-Instruct-hf/*safetensors* + + cd /workspace/llama.cpp + + python3 
convert.py ./models/codellama-34b-instruct --outfile ./models/codellama-34b-instruct/ggml-model-f16.gguf --outtype f16 + + ./quantize ./models/codellama-34b-instruct/ggml-model-f16.gguf ./models/codellama-34b-instruct/ggml-model-q4_0.gguf q4_0 + ./quantize ./models/codellama-34b-instruct/ggml-model-f16.gguf ./models/codellama-34b-instruct/ggml-model-q4_k.gguf q4_k + ./quantize ./models/codellama-34b-instruct/ggml-model-f16.gguf ./models/codellama-34b-instruct/ggml-model-q8_0.gguf q8_0 +fi + +if [ "$1" -eq "1" ]; then + # perf + perplexity + cd /workspace/llama.cpp/build-cublas + + make -j && ../scripts/run-all-perf.sh tinyllama-1b "f16" "-ngl 99 -t 1 -p 1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,32,64,128,256,512,1024,2048 -n 128" + + ../scripts/get-wikitext-2.sh + unzip wikitext-2-raw-v1.zip + + make -j && ./bin/perplexity -m ../models/tinyllama-1b/ggml-model-f16.gguf -f ./wikitext-2-raw/wiki.test.raw -ngl 100 --chunks 32 + + # batched + cd /workspace/llama.cpp + + LLAMA_CUBLAS=1 make -j && ./batched ./models/tinyllama-1b/ggml-model-f16.gguf "Hello, my name is" 8 128 999 + + # batched-bench + cd /workspace/llama.cpp + + LLAMA_CUBLAS=1 make -j && ./batched-bench ./models/tinyllama-1b/ggml-model-f16.gguf 4608 1 99 0 512 128 1,2,3,4,5,6,7,8,16,32 + + # parallel + cd /workspace/llama.cpp + + LLAMA_CUBLAS=1 make -j && ./parallel -m ./models/tinyllama-1b/ggml-model-f16.gguf -t 1 -ngl 100 -c 4096 -b 512 -s 1 -np 8 -ns 128 -n 100 -cb + +fi + +# speculative +#if [ "$1" -eq "7" ]; then +# cd /workspace/llama.cpp +# +# LLAMA_CUBLAS=1 make -j && ./speculative -m ./models/codellama-34b-instruct/ggml-model-f16.gguf -md ./models/codellama-7b-instruct/ggml-model-q4_0.gguf -p "# Dijkstra's shortest path algorithm in Python (4 spaces indentation) + complexity analysis:\n\n" -e -ngl 999 -ngld 999 -t 4 -n 512 -c 4096 -s 21 --draft 16 -np 1 --temp 0.0 +#fi + +# more benches +#LLAMA_CUBLAS=1 make -j && ./batched-bench ./models/codellama-7b/ggml-model-q4_k.gguf 4096 1 99 1 512,3200 128,128,800 1 +#LLAMA_CUBLAS=1 make -j && ./batched-bench ./models/codellama-13b/ggml-model-q4_k.gguf 4096 1 99 1 512,3200 128,128,800 1 + From bbde6eb2561153aabbdfac5001c690fe00cad639 Mon Sep 17 00:00:00 2001 From: Kawrakow <48489457+ikawrakow@users.noreply.github.com> Date: Sat, 2 Mar 2024 17:00:51 +0200 Subject: [PATCH 21/26] ggml : IQ3_S improvements (#5829) * iq3_s: somewhat faster AVX2 dot product On Ryzen a 7950X TG-128 increases to 16 t/s from 15.5 t/s using 16 threads. For 8 threads it is 13.85 t/s vs 11.75 t/s. PP-512 increases to 28.5 t/s from 23.8 t/s. * iq3_s: somewhat faster ARM_NEON dot product Still dog slow - 10.7 t/s up from 9.9 t/s. * iq3_s: another small ARM_NEON improvement 10.7 -> 11.0 t/s. Using vmulq_s8 is faster than the xor - sub trick that works best on AVX2. * iq3_s: minor improvement on Metal 49.4 t/s -> 50.3 t/s * iq3_s: PPL improvement E.g., for a context of 4096 LLaMA-v2-7B goes to 5.1340 from 5.1653. 
* iq3_s: use new grid everywhere * Fix ARM_NEON --------- Co-authored-by: Iwan Kawrakow --- ggml-cuda.cu | 143 ++++++++++++------------ ggml-metal.metal | 152 ++++++++++++------------- ggml-quants.c | 280 +++++++++++++++++++++++++++-------------------- 3 files changed, 310 insertions(+), 265 deletions(-) diff --git a/ggml-cuda.cu b/ggml-cuda.cu index 0c6501e98..7ed97430f 100644 --- a/ggml-cuda.cu +++ b/ggml-cuda.cu @@ -2018,74 +2018,73 @@ static const __device__ uint32_t iq3xxs_grid[256] = { 0x3e1c1c1c, 0x3e1c3404, 0x3e24140c, 0x3e24240c, 0x3e2c0404, 0x3e2c0414, 0x3e2c1424, 0x3e341c04, }; -static const __device__ uint32_t iq3xs_grid[512] = { - 0x04040404, 0x0404040c, 0x04040414, 0x0404042c, 0x0404043e, 0x04040c04, 0x04040c0c, 0x04040c14, - 0x04040c24, 0x04040c34, 0x04041404, 0x0404140c, 0x0404142c, 0x04041c1c, 0x04042404, 0x04042414, - 0x0404242c, 0x0404243e, 0x04042c0c, 0x04042c1c, 0x04043404, 0x04043414, 0x04043e0c, 0x04043e24, - 0x04043e3e, 0x040c0404, 0x040c040c, 0x040c0414, 0x040c0424, 0x040c0c04, 0x040c0c0c, 0x040c0c2c, - 0x040c1404, 0x040c141c, 0x040c143e, 0x040c1c0c, 0x040c1c2c, 0x040c2424, 0x040c340c, 0x040c342c, - 0x040c3e14, 0x04140404, 0x0414040c, 0x0414042c, 0x0414043e, 0x04140c04, 0x04140c1c, 0x04140c34, - 0x0414140c, 0x0414142c, 0x04141c04, 0x04141c24, 0x04142414, 0x0414242c, 0x0414243e, 0x04142c0c, - 0x04142c1c, 0x04143e04, 0x04143e1c, 0x041c041c, 0x041c0c0c, 0x041c0c2c, 0x041c1404, 0x041c1414, - 0x041c1c0c, 0x041c1c1c, 0x041c1c34, 0x041c2424, 0x041c2c04, 0x041c2c14, 0x041c343e, 0x041c3e0c, - 0x041c3e2c, 0x04240404, 0x04240c1c, 0x04240c3e, 0x0424140c, 0x04241424, 0x04241c14, 0x04242404, - 0x0424241c, 0x04242c0c, 0x04243e04, 0x042c0414, 0x042c0424, 0x042c1404, 0x042c1414, 0x042c1434, - 0x042c1c1c, 0x042c240c, 0x042c242c, 0x042c243e, 0x042c3434, 0x042c3e1c, 0x04340434, 0x04340c0c, - 0x04340c1c, 0x04341c0c, 0x04342c14, 0x04343e0c, 0x043e0404, 0x043e0414, 0x043e0424, 0x043e1404, - 0x043e1414, 0x043e1434, 0x043e1c1c, 0x043e2c04, 0x043e2c24, 0x0c040404, 0x0c04040c, 0x0c040414, - 0x0c040424, 0x0c040c04, 0x0c040c0c, 0x0c040c1c, 0x0c040c2c, 0x0c040c3e, 0x0c041404, 0x0c041414, - 0x0c041c0c, 0x0c041c24, 0x0c041c34, 0x0c042c24, 0x0c042c34, 0x0c04340c, 0x0c043e14, 0x0c0c0404, - 0x0c0c040c, 0x0c0c041c, 0x0c0c0434, 0x0c0c0c04, 0x0c0c0c24, 0x0c0c140c, 0x0c0c1c04, 0x0c0c1c1c, - 0x0c0c240c, 0x0c0c2c04, 0x0c0c2c14, 0x0c0c3e04, 0x0c0c3e34, 0x0c140404, 0x0c140c14, 0x0c140c2c, - 0x0c140c3e, 0x0c141404, 0x0c141424, 0x0c141c14, 0x0c142404, 0x0c14241c, 0x0c142c2c, 0x0c143404, - 0x0c143e14, 0x0c1c040c, 0x0c1c0424, 0x0c1c043e, 0x0c1c0c04, 0x0c1c0c1c, 0x0c1c140c, 0x0c1c143e, - 0x0c1c1c04, 0x0c1c1c24, 0x0c1c240c, 0x0c1c3414, 0x0c1c3e04, 0x0c24041c, 0x0c24042c, 0x0c240c14, - 0x0c240c24, 0x0c241c0c, 0x0c241c1c, 0x0c242414, 0x0c242434, 0x0c242c04, 0x0c242c24, 0x0c2c040c, - 0x0c2c0c04, 0x0c2c0c1c, 0x0c2c140c, 0x0c2c1c04, 0x0c2c1c14, 0x0c2c2c0c, 0x0c341404, 0x0c341424, - 0x0c34143e, 0x0c342424, 0x0c342434, 0x0c3e040c, 0x0c3e041c, 0x0c3e0c04, 0x0c3e0c14, 0x0c3e140c, - 0x0c3e1c2c, 0x0c3e240c, 0x0c3e3414, 0x0c3e3e04, 0x14040404, 0x1404040c, 0x1404041c, 0x1404042c, - 0x1404043e, 0x14040c04, 0x14040c14, 0x14040c24, 0x14040c34, 0x1404140c, 0x1404141c, 0x1404143e, - 0x14041c04, 0x14041c14, 0x1404240c, 0x1404241c, 0x1404242c, 0x14042c04, 0x14042c14, 0x1404343e, - 0x14043e04, 0x14043e1c, 0x14043e2c, 0x140c0404, 0x140c0414, 0x140c0c04, 0x140c0c1c, 0x140c0c3e, - 0x140c1414, 0x140c142c, 0x140c1c0c, 0x140c1c24, 0x140c2414, 0x140c2c0c, 0x1414040c, 0x14140424, - 0x1414043e, 0x1414140c, 0x1414141c, 0x14141c04, 0x14141c3e, 
0x1414240c, 0x14142c1c, 0x14142c3e, - 0x14143e0c, 0x14143e24, 0x141c0404, 0x141c0414, 0x141c042c, 0x141c0c0c, 0x141c1414, 0x141c1424, - 0x141c1c0c, 0x141c1c1c, 0x141c2414, 0x141c2c04, 0x141c3434, 0x1424040c, 0x1424043e, 0x14241404, - 0x1424141c, 0x14241c14, 0x14241c2c, 0x1424240c, 0x14243e14, 0x14243e2c, 0x142c0424, 0x142c0c0c, - 0x142c1414, 0x142c1c3e, 0x142c2404, 0x142c2c1c, 0x142c3e04, 0x14340404, 0x14340414, 0x1434043e, - 0x1434140c, 0x14342c2c, 0x1434340c, 0x143e042c, 0x143e0c0c, 0x143e1434, 0x143e1c04, 0x143e241c, - 0x143e2c04, 0x1c040414, 0x1c040c0c, 0x1c040c1c, 0x1c040c2c, 0x1c040c3e, 0x1c041414, 0x1c041c0c, - 0x1c041c1c, 0x1c041c2c, 0x1c042414, 0x1c042424, 0x1c04243e, 0x1c042c0c, 0x1c04341c, 0x1c043e0c, - 0x1c0c040c, 0x1c0c041c, 0x1c0c042c, 0x1c0c0c24, 0x1c0c140c, 0x1c0c141c, 0x1c0c2404, 0x1c0c3404, - 0x1c0c3e14, 0x1c0c3e34, 0x1c140404, 0x1c140c14, 0x1c141404, 0x1c141c14, 0x1c141c24, 0x1c142c04, - 0x1c1c040c, 0x1c1c0c04, 0x1c1c0c24, 0x1c1c140c, 0x1c1c141c, 0x1c1c143e, 0x1c1c1c04, 0x1c1c240c, - 0x1c1c241c, 0x1c1c243e, 0x1c1c2c2c, 0x1c1c3e1c, 0x1c24041c, 0x1c240c0c, 0x1c240c34, 0x1c241414, - 0x1c241c0c, 0x1c242c14, 0x1c243404, 0x1c243424, 0x1c2c040c, 0x1c2c0c04, 0x1c2c0c14, 0x1c2c142c, - 0x1c2c1c14, 0x1c2c2424, 0x1c2c2c34, 0x1c2c3e1c, 0x1c340c34, 0x1c34240c, 0x1c3e040c, 0x1c3e041c, - 0x1c3e1404, 0x1c3e1414, 0x1c3e1c2c, 0x24040404, 0x24040424, 0x24040c14, 0x24041404, 0x24041424, - 0x2404143e, 0x24041c14, 0x2404240c, 0x24042c04, 0x24043e04, 0x240c0414, 0x240c043e, 0x240c0c0c, - 0x240c0c1c, 0x240c1414, 0x240c1c04, 0x240c1c2c, 0x240c241c, 0x240c2c0c, 0x240c2c2c, 0x2414040c, - 0x2414041c, 0x24140c04, 0x24140c2c, 0x2414140c, 0x24141c1c, 0x24142404, 0x24142c3e, 0x24143414, - 0x24143e04, 0x241c0424, 0x241c0c0c, 0x241c0c1c, 0x241c1404, 0x241c1414, 0x241c1c0c, 0x241c1c2c, - 0x24240404, 0x24240414, 0x24241424, 0x24241c3e, 0x24242404, 0x24243e0c, 0x242c042c, 0x242c043e, - 0x242c140c, 0x242c3414, 0x24340c1c, 0x24341c24, 0x24343404, 0x243e0c04, 0x243e0c2c, 0x243e1c04, - 0x243e241c, 0x243e2c0c, 0x2c040414, 0x2c040c04, 0x2c040c24, 0x2c041414, 0x2c042404, 0x2c042424, - 0x2c04243e, 0x2c042c14, 0x2c043434, 0x2c043e24, 0x2c0c040c, 0x2c0c041c, 0x2c0c042c, 0x2c0c0c14, - 0x2c0c140c, 0x2c0c1c14, 0x2c0c3e14, 0x2c140404, 0x2c140c0c, 0x2c14141c, 0x2c141c04, 0x2c141c34, - 0x2c142c1c, 0x2c1c0414, 0x2c1c043e, 0x2c1c0c04, 0x2c1c143e, 0x2c1c2424, 0x2c1c2c0c, 0x2c1c342c, - 0x2c1c3e1c, 0x2c24040c, 0x2c240424, 0x2c241404, 0x2c241c14, 0x2c242434, 0x2c2c0c14, 0x2c2c1434, - 0x2c2c2c0c, 0x2c2c2c1c, 0x2c342414, 0x2c3e0414, 0x2c3e0424, 0x2c3e1414, 0x34040c0c, 0x34040c1c, - 0x34040c2c, 0x34041c0c, 0x34041c1c, 0x34043404, 0x340c0404, 0x340c1404, 0x340c143e, 0x340c3424, - 0x34140c14, 0x34141c24, 0x34142414, 0x34142c2c, 0x34143414, 0x34143e04, 0x341c0404, 0x341c0c24, - 0x341c140c, 0x341c2404, 0x3424142c, 0x3424241c, 0x34243414, 0x342c0404, 0x342c041c, 0x342c1c24, - 0x342c3404, 0x3434042c, 0x34342404, 0x343e0c0c, 0x343e0c1c, 0x3e040404, 0x3e040424, 0x3e04043e, - 0x3e041404, 0x3e041414, 0x3e041c34, 0x3e042404, 0x3e042c24, 0x3e043414, 0x3e0c0414, 0x3e0c0c0c, - 0x3e0c1424, 0x3e0c241c, 0x3e0c242c, 0x3e14040c, 0x3e140424, 0x3e140c04, 0x3e140c34, 0x3e14140c, - 0x3e141c04, 0x3e142c0c, 0x3e1c0414, 0x3e1c1c14, 0x3e1c1c2c, 0x3e1c2c1c, 0x3e24040c, 0x3e24042c, - 0x3e240c1c, 0x3e241404, 0x3e242c04, 0x3e2c1414, 0x3e2c2414, 0x3e340414, 0x3e341c0c, 0x3e3e0404, +static const __device__ uint32_t iq3s_grid[512] = { + 0x01010101, 0x01010103, 0x01010105, 0x0101010b, 0x0101010f, 0x01010301, 0x01010303, 0x01010305, + 0x01010309, 0x0101030d, 
0x01010501, 0x01010503, 0x0101050b, 0x01010707, 0x01010901, 0x01010905, + 0x0101090b, 0x0101090f, 0x01010b03, 0x01010b07, 0x01010d01, 0x01010d05, 0x01010f03, 0x01010f09, + 0x01010f0f, 0x01030101, 0x01030103, 0x01030105, 0x01030109, 0x01030301, 0x01030303, 0x0103030b, + 0x01030501, 0x01030507, 0x0103050f, 0x01030703, 0x0103070b, 0x01030909, 0x01030d03, 0x01030d0b, + 0x01030f05, 0x01050101, 0x01050103, 0x0105010b, 0x0105010f, 0x01050301, 0x01050307, 0x0105030d, + 0x01050503, 0x0105050b, 0x01050701, 0x01050709, 0x01050905, 0x0105090b, 0x0105090f, 0x01050b03, + 0x01050b07, 0x01050f01, 0x01050f07, 0x01070107, 0x01070303, 0x0107030b, 0x01070501, 0x01070505, + 0x01070703, 0x01070707, 0x0107070d, 0x01070909, 0x01070b01, 0x01070b05, 0x01070d0f, 0x01070f03, + 0x01070f0b, 0x01090101, 0x01090307, 0x0109030f, 0x01090503, 0x01090509, 0x01090705, 0x01090901, + 0x01090907, 0x01090b03, 0x01090f01, 0x010b0105, 0x010b0109, 0x010b0501, 0x010b0505, 0x010b050d, + 0x010b0707, 0x010b0903, 0x010b090b, 0x010b090f, 0x010b0d0d, 0x010b0f07, 0x010d010d, 0x010d0303, + 0x010d0307, 0x010d0703, 0x010d0b05, 0x010d0f03, 0x010f0101, 0x010f0105, 0x010f0109, 0x010f0501, + 0x010f0505, 0x010f050d, 0x010f0707, 0x010f0b01, 0x010f0b09, 0x03010101, 0x03010103, 0x03010105, + 0x03010109, 0x03010301, 0x03010303, 0x03010307, 0x0301030b, 0x0301030f, 0x03010501, 0x03010505, + 0x03010703, 0x03010709, 0x0301070d, 0x03010b09, 0x03010b0d, 0x03010d03, 0x03010f05, 0x03030101, + 0x03030103, 0x03030107, 0x0303010d, 0x03030301, 0x03030309, 0x03030503, 0x03030701, 0x03030707, + 0x03030903, 0x03030b01, 0x03030b05, 0x03030f01, 0x03030f0d, 0x03050101, 0x03050305, 0x0305030b, + 0x0305030f, 0x03050501, 0x03050509, 0x03050705, 0x03050901, 0x03050907, 0x03050b0b, 0x03050d01, + 0x03050f05, 0x03070103, 0x03070109, 0x0307010f, 0x03070301, 0x03070307, 0x03070503, 0x0307050f, + 0x03070701, 0x03070709, 0x03070903, 0x03070d05, 0x03070f01, 0x03090107, 0x0309010b, 0x03090305, + 0x03090309, 0x03090703, 0x03090707, 0x03090905, 0x0309090d, 0x03090b01, 0x03090b09, 0x030b0103, + 0x030b0301, 0x030b0307, 0x030b0503, 0x030b0701, 0x030b0705, 0x030b0b03, 0x030d0501, 0x030d0509, + 0x030d050f, 0x030d0909, 0x030d090d, 0x030f0103, 0x030f0107, 0x030f0301, 0x030f0305, 0x030f0503, + 0x030f070b, 0x030f0903, 0x030f0d05, 0x030f0f01, 0x05010101, 0x05010103, 0x05010107, 0x0501010b, + 0x0501010f, 0x05010301, 0x05010305, 0x05010309, 0x0501030d, 0x05010503, 0x05010507, 0x0501050f, + 0x05010701, 0x05010705, 0x05010903, 0x05010907, 0x0501090b, 0x05010b01, 0x05010b05, 0x05010d0f, + 0x05010f01, 0x05010f07, 0x05010f0b, 0x05030101, 0x05030105, 0x05030301, 0x05030307, 0x0503030f, + 0x05030505, 0x0503050b, 0x05030703, 0x05030709, 0x05030905, 0x05030b03, 0x05050103, 0x05050109, + 0x0505010f, 0x05050503, 0x05050507, 0x05050701, 0x0505070f, 0x05050903, 0x05050b07, 0x05050b0f, + 0x05050f03, 0x05050f09, 0x05070101, 0x05070105, 0x0507010b, 0x05070303, 0x05070505, 0x05070509, + 0x05070703, 0x05070707, 0x05070905, 0x05070b01, 0x05070d0d, 0x05090103, 0x0509010f, 0x05090501, + 0x05090507, 0x05090705, 0x0509070b, 0x05090903, 0x05090f05, 0x05090f0b, 0x050b0109, 0x050b0303, + 0x050b0505, 0x050b070f, 0x050b0901, 0x050b0b07, 0x050b0f01, 0x050d0101, 0x050d0105, 0x050d010f, + 0x050d0503, 0x050d0b0b, 0x050d0d03, 0x050f010b, 0x050f0303, 0x050f050d, 0x050f0701, 0x050f0907, + 0x050f0b01, 0x07010105, 0x07010303, 0x07010307, 0x0701030b, 0x0701030f, 0x07010505, 0x07010703, + 0x07010707, 0x0701070b, 0x07010905, 0x07010909, 0x0701090f, 0x07010b03, 0x07010d07, 0x07010f03, + 0x07030103, 0x07030107, 0x0703010b, 0x07030309, 
0x07030503, 0x07030507, 0x07030901, 0x07030d01, + 0x07030f05, 0x07030f0d, 0x07050101, 0x07050305, 0x07050501, 0x07050705, 0x07050709, 0x07050b01, + 0x07070103, 0x07070301, 0x07070309, 0x07070503, 0x07070507, 0x0707050f, 0x07070701, 0x07070903, + 0x07070907, 0x0707090f, 0x07070b0b, 0x07070f07, 0x07090107, 0x07090303, 0x0709030d, 0x07090505, + 0x07090703, 0x07090b05, 0x07090d01, 0x07090d09, 0x070b0103, 0x070b0301, 0x070b0305, 0x070b050b, + 0x070b0705, 0x070b0909, 0x070b0b0d, 0x070b0f07, 0x070d030d, 0x070d0903, 0x070f0103, 0x070f0107, + 0x070f0501, 0x070f0505, 0x070f070b, 0x09010101, 0x09010109, 0x09010305, 0x09010501, 0x09010509, + 0x0901050f, 0x09010705, 0x09010903, 0x09010b01, 0x09010f01, 0x09030105, 0x0903010f, 0x09030303, + 0x09030307, 0x09030505, 0x09030701, 0x0903070b, 0x09030907, 0x09030b03, 0x09030b0b, 0x09050103, + 0x09050107, 0x09050301, 0x0905030b, 0x09050503, 0x09050707, 0x09050901, 0x09050b0f, 0x09050d05, + 0x09050f01, 0x09070109, 0x09070303, 0x09070307, 0x09070501, 0x09070505, 0x09070703, 0x0907070b, + 0x09090101, 0x09090105, 0x09090509, 0x0909070f, 0x09090901, 0x09090f03, 0x090b010b, 0x090b010f, + 0x090b0503, 0x090b0d05, 0x090d0307, 0x090d0709, 0x090d0d01, 0x090f0301, 0x090f030b, 0x090f0701, + 0x090f0907, 0x090f0b03, 0x0b010105, 0x0b010301, 0x0b010309, 0x0b010505, 0x0b010901, 0x0b010909, + 0x0b01090f, 0x0b010b05, 0x0b010d0d, 0x0b010f09, 0x0b030103, 0x0b030107, 0x0b03010b, 0x0b030305, + 0x0b030503, 0x0b030705, 0x0b030f05, 0x0b050101, 0x0b050303, 0x0b050507, 0x0b050701, 0x0b05070d, + 0x0b050b07, 0x0b070105, 0x0b07010f, 0x0b070301, 0x0b07050f, 0x0b070909, 0x0b070b03, 0x0b070d0b, + 0x0b070f07, 0x0b090103, 0x0b090109, 0x0b090501, 0x0b090705, 0x0b09090d, 0x0b0b0305, 0x0b0b050d, + 0x0b0b0b03, 0x0b0b0b07, 0x0b0d0905, 0x0b0f0105, 0x0b0f0109, 0x0b0f0505, 0x0d010303, 0x0d010307, + 0x0d01030b, 0x0d010703, 0x0d010707, 0x0d010d01, 0x0d030101, 0x0d030501, 0x0d03050f, 0x0d030d09, + 0x0d050305, 0x0d050709, 0x0d050905, 0x0d050b0b, 0x0d050d05, 0x0d050f01, 0x0d070101, 0x0d070309, + 0x0d070503, 0x0d070901, 0x0d09050b, 0x0d090907, 0x0d090d05, 0x0d0b0101, 0x0d0b0107, 0x0d0b0709, + 0x0d0b0d01, 0x0d0d010b, 0x0d0d0901, 0x0d0f0303, 0x0d0f0307, 0x0f010101, 0x0f010109, 0x0f01010f, + 0x0f010501, 0x0f010505, 0x0f01070d, 0x0f010901, 0x0f010b09, 0x0f010d05, 0x0f030105, 0x0f030303, + 0x0f030509, 0x0f030907, 0x0f03090b, 0x0f050103, 0x0f050109, 0x0f050301, 0x0f05030d, 0x0f050503, + 0x0f050701, 0x0f050b03, 0x0f070105, 0x0f070705, 0x0f07070b, 0x0f070b07, 0x0f090103, 0x0f09010b, + 0x0f090307, 0x0f090501, 0x0f090b01, 0x0f0b0505, 0x0f0b0905, 0x0f0d0105, 0x0f0d0703, 0x0f0f0101, }; - static const __device__ uint64_t iq1s_grid[512] = { 0xffffffffffff0101, 0xffffffffff01ff00, 0xffffffffff010100, 0xffffffff00000000, 0xffffffff01ff00ff, 0xffffffff01ff0001, 0xffffffff0101ffff, 0xffffffff0101ff01, @@ -2392,9 +2391,9 @@ static __global__ void dequantize_block_iq3_s(const void * __restrict__ vx, dst_ const int ib = tid%8; // 0...7 dst_t * y = yy + i*QK_K + 32*ib + 8*il; const uint8_t * qs = x[i].qs + 8*ib; - const uint8_t * grid1 = (const uint8_t *)(iq3xs_grid + (qs[2*il+0] | ((x[i].qh[ib] << (8-2*il)) & 256))); - const uint8_t * grid2 = (const uint8_t *)(iq3xs_grid + (qs[2*il+1] | ((x[i].qh[ib] << (7-2*il)) & 256))); - const float d = (float)x[i].d * (0.5f + ((x[i].scales[ib/2] >> 4*(ib%2)) & 0xf)) * 0.5f; + const uint8_t * grid1 = (const uint8_t *)(iq3s_grid + (qs[2*il+0] | ((x[i].qh[ib] << (8-2*il)) & 256))); + const uint8_t * grid2 = (const uint8_t *)(iq3s_grid + (qs[2*il+1] | ((x[i].qh[ib] << (7-2*il)) & 256))); + const 
float d = (float)x[i].d * (1 + 2*((x[i].scales[ib/2] >> 4*(ib%2)) & 0xf)); const uint8_t signs = x[i].signs[4*ib + il]; for (int j = 0; j < 4; ++j) { y[j+0] = d * grid1[j] * (signs & kmask_iq2xs[j+0] ? -1.f : 1.f); @@ -5211,8 +5210,8 @@ static __device__ __forceinline__ float vec_dot_iq3_s_q8_1( const int8_t * q8 = bq8_1[ib32].qs; int sumi = 0; for (int l = 0; l < 4; ++l) { - const uint32_t * grid1 = iq3xs_grid + (qs[2*l+0] | ((bq2->qh[ib32] << (8 - 2*l)) & 256)); - const uint32_t * grid2 = iq3xs_grid + (qs[2*l+1] | ((bq2->qh[ib32] << (7 - 2*l)) & 256)); + const uint32_t * grid1 = iq3s_grid + (qs[2*l+0] | ((bq2->qh[ib32] << (8 - 2*l)) & 256)); + const uint32_t * grid2 = iq3s_grid + (qs[2*l+1] | ((bq2->qh[ib32] << (7 - 2*l)) & 256)); uint32_t signs0 = __vcmpeq4(((bq2->signs[4*ib32+l] & 0xf) * 0x01010101) & 0x08040201, 0x08040201); uint32_t signs1 = __vcmpeq4(((bq2->signs[4*ib32+l] >> 4) * 0x01010101) & 0x08040201, 0x08040201); const int grid_l = __vsub4(grid1[0] ^ signs0, signs0); @@ -5221,7 +5220,7 @@ static __device__ __forceinline__ float vec_dot_iq3_s_q8_1( sumi = __dp4a(grid_h, *((int *)q8+1), sumi); q8 += 8; } - const float d = (float)bq2->d * (0.5f + ((bq2->scales[ib32/2] >> 4*(ib32%2)) & 0xf)) * __low2float(bq8_1[ib32].ds) * 0.5f; + const float d = (float)bq2->d * (1 + 2*((bq2->scales[ib32/2] >> 4*(ib32%2)) & 0xf)) * __low2float(bq8_1[ib32].ds); return d * sumi; #else assert(false); diff --git a/ggml-metal.metal b/ggml-metal.metal index 74a5e0b03..8b9488437 100644 --- a/ggml-metal.metal +++ b/ggml-metal.metal @@ -4087,71 +4087,71 @@ constexpr constant static uint32_t iq3xxs_grid[256] = { 0x3e1c1c1c, 0x3e1c3404, 0x3e24140c, 0x3e24240c, 0x3e2c0404, 0x3e2c0414, 0x3e2c1424, 0x3e341c04, }; -constexpr constant static uint32_t iq3xs_grid[512] = { - 0x04040404, 0x0404040c, 0x04040414, 0x0404042c, 0x0404043e, 0x04040c04, 0x04040c0c, 0x04040c14, - 0x04040c24, 0x04040c34, 0x04041404, 0x0404140c, 0x0404142c, 0x04041c1c, 0x04042404, 0x04042414, - 0x0404242c, 0x0404243e, 0x04042c0c, 0x04042c1c, 0x04043404, 0x04043414, 0x04043e0c, 0x04043e24, - 0x04043e3e, 0x040c0404, 0x040c040c, 0x040c0414, 0x040c0424, 0x040c0c04, 0x040c0c0c, 0x040c0c2c, - 0x040c1404, 0x040c141c, 0x040c143e, 0x040c1c0c, 0x040c1c2c, 0x040c2424, 0x040c340c, 0x040c342c, - 0x040c3e14, 0x04140404, 0x0414040c, 0x0414042c, 0x0414043e, 0x04140c04, 0x04140c1c, 0x04140c34, - 0x0414140c, 0x0414142c, 0x04141c04, 0x04141c24, 0x04142414, 0x0414242c, 0x0414243e, 0x04142c0c, - 0x04142c1c, 0x04143e04, 0x04143e1c, 0x041c041c, 0x041c0c0c, 0x041c0c2c, 0x041c1404, 0x041c1414, - 0x041c1c0c, 0x041c1c1c, 0x041c1c34, 0x041c2424, 0x041c2c04, 0x041c2c14, 0x041c343e, 0x041c3e0c, - 0x041c3e2c, 0x04240404, 0x04240c1c, 0x04240c3e, 0x0424140c, 0x04241424, 0x04241c14, 0x04242404, - 0x0424241c, 0x04242c0c, 0x04243e04, 0x042c0414, 0x042c0424, 0x042c1404, 0x042c1414, 0x042c1434, - 0x042c1c1c, 0x042c240c, 0x042c242c, 0x042c243e, 0x042c3434, 0x042c3e1c, 0x04340434, 0x04340c0c, - 0x04340c1c, 0x04341c0c, 0x04342c14, 0x04343e0c, 0x043e0404, 0x043e0414, 0x043e0424, 0x043e1404, - 0x043e1414, 0x043e1434, 0x043e1c1c, 0x043e2c04, 0x043e2c24, 0x0c040404, 0x0c04040c, 0x0c040414, - 0x0c040424, 0x0c040c04, 0x0c040c0c, 0x0c040c1c, 0x0c040c2c, 0x0c040c3e, 0x0c041404, 0x0c041414, - 0x0c041c0c, 0x0c041c24, 0x0c041c34, 0x0c042c24, 0x0c042c34, 0x0c04340c, 0x0c043e14, 0x0c0c0404, - 0x0c0c040c, 0x0c0c041c, 0x0c0c0434, 0x0c0c0c04, 0x0c0c0c24, 0x0c0c140c, 0x0c0c1c04, 0x0c0c1c1c, - 0x0c0c240c, 0x0c0c2c04, 0x0c0c2c14, 0x0c0c3e04, 0x0c0c3e34, 0x0c140404, 0x0c140c14, 0x0c140c2c, - 0x0c140c3e, 
0x0c141404, 0x0c141424, 0x0c141c14, 0x0c142404, 0x0c14241c, 0x0c142c2c, 0x0c143404, - 0x0c143e14, 0x0c1c040c, 0x0c1c0424, 0x0c1c043e, 0x0c1c0c04, 0x0c1c0c1c, 0x0c1c140c, 0x0c1c143e, - 0x0c1c1c04, 0x0c1c1c24, 0x0c1c240c, 0x0c1c3414, 0x0c1c3e04, 0x0c24041c, 0x0c24042c, 0x0c240c14, - 0x0c240c24, 0x0c241c0c, 0x0c241c1c, 0x0c242414, 0x0c242434, 0x0c242c04, 0x0c242c24, 0x0c2c040c, - 0x0c2c0c04, 0x0c2c0c1c, 0x0c2c140c, 0x0c2c1c04, 0x0c2c1c14, 0x0c2c2c0c, 0x0c341404, 0x0c341424, - 0x0c34143e, 0x0c342424, 0x0c342434, 0x0c3e040c, 0x0c3e041c, 0x0c3e0c04, 0x0c3e0c14, 0x0c3e140c, - 0x0c3e1c2c, 0x0c3e240c, 0x0c3e3414, 0x0c3e3e04, 0x14040404, 0x1404040c, 0x1404041c, 0x1404042c, - 0x1404043e, 0x14040c04, 0x14040c14, 0x14040c24, 0x14040c34, 0x1404140c, 0x1404141c, 0x1404143e, - 0x14041c04, 0x14041c14, 0x1404240c, 0x1404241c, 0x1404242c, 0x14042c04, 0x14042c14, 0x1404343e, - 0x14043e04, 0x14043e1c, 0x14043e2c, 0x140c0404, 0x140c0414, 0x140c0c04, 0x140c0c1c, 0x140c0c3e, - 0x140c1414, 0x140c142c, 0x140c1c0c, 0x140c1c24, 0x140c2414, 0x140c2c0c, 0x1414040c, 0x14140424, - 0x1414043e, 0x1414140c, 0x1414141c, 0x14141c04, 0x14141c3e, 0x1414240c, 0x14142c1c, 0x14142c3e, - 0x14143e0c, 0x14143e24, 0x141c0404, 0x141c0414, 0x141c042c, 0x141c0c0c, 0x141c1414, 0x141c1424, - 0x141c1c0c, 0x141c1c1c, 0x141c2414, 0x141c2c04, 0x141c3434, 0x1424040c, 0x1424043e, 0x14241404, - 0x1424141c, 0x14241c14, 0x14241c2c, 0x1424240c, 0x14243e14, 0x14243e2c, 0x142c0424, 0x142c0c0c, - 0x142c1414, 0x142c1c3e, 0x142c2404, 0x142c2c1c, 0x142c3e04, 0x14340404, 0x14340414, 0x1434043e, - 0x1434140c, 0x14342c2c, 0x1434340c, 0x143e042c, 0x143e0c0c, 0x143e1434, 0x143e1c04, 0x143e241c, - 0x143e2c04, 0x1c040414, 0x1c040c0c, 0x1c040c1c, 0x1c040c2c, 0x1c040c3e, 0x1c041414, 0x1c041c0c, - 0x1c041c1c, 0x1c041c2c, 0x1c042414, 0x1c042424, 0x1c04243e, 0x1c042c0c, 0x1c04341c, 0x1c043e0c, - 0x1c0c040c, 0x1c0c041c, 0x1c0c042c, 0x1c0c0c24, 0x1c0c140c, 0x1c0c141c, 0x1c0c2404, 0x1c0c3404, - 0x1c0c3e14, 0x1c0c3e34, 0x1c140404, 0x1c140c14, 0x1c141404, 0x1c141c14, 0x1c141c24, 0x1c142c04, - 0x1c1c040c, 0x1c1c0c04, 0x1c1c0c24, 0x1c1c140c, 0x1c1c141c, 0x1c1c143e, 0x1c1c1c04, 0x1c1c240c, - 0x1c1c241c, 0x1c1c243e, 0x1c1c2c2c, 0x1c1c3e1c, 0x1c24041c, 0x1c240c0c, 0x1c240c34, 0x1c241414, - 0x1c241c0c, 0x1c242c14, 0x1c243404, 0x1c243424, 0x1c2c040c, 0x1c2c0c04, 0x1c2c0c14, 0x1c2c142c, - 0x1c2c1c14, 0x1c2c2424, 0x1c2c2c34, 0x1c2c3e1c, 0x1c340c34, 0x1c34240c, 0x1c3e040c, 0x1c3e041c, - 0x1c3e1404, 0x1c3e1414, 0x1c3e1c2c, 0x24040404, 0x24040424, 0x24040c14, 0x24041404, 0x24041424, - 0x2404143e, 0x24041c14, 0x2404240c, 0x24042c04, 0x24043e04, 0x240c0414, 0x240c043e, 0x240c0c0c, - 0x240c0c1c, 0x240c1414, 0x240c1c04, 0x240c1c2c, 0x240c241c, 0x240c2c0c, 0x240c2c2c, 0x2414040c, - 0x2414041c, 0x24140c04, 0x24140c2c, 0x2414140c, 0x24141c1c, 0x24142404, 0x24142c3e, 0x24143414, - 0x24143e04, 0x241c0424, 0x241c0c0c, 0x241c0c1c, 0x241c1404, 0x241c1414, 0x241c1c0c, 0x241c1c2c, - 0x24240404, 0x24240414, 0x24241424, 0x24241c3e, 0x24242404, 0x24243e0c, 0x242c042c, 0x242c043e, - 0x242c140c, 0x242c3414, 0x24340c1c, 0x24341c24, 0x24343404, 0x243e0c04, 0x243e0c2c, 0x243e1c04, - 0x243e241c, 0x243e2c0c, 0x2c040414, 0x2c040c04, 0x2c040c24, 0x2c041414, 0x2c042404, 0x2c042424, - 0x2c04243e, 0x2c042c14, 0x2c043434, 0x2c043e24, 0x2c0c040c, 0x2c0c041c, 0x2c0c042c, 0x2c0c0c14, - 0x2c0c140c, 0x2c0c1c14, 0x2c0c3e14, 0x2c140404, 0x2c140c0c, 0x2c14141c, 0x2c141c04, 0x2c141c34, - 0x2c142c1c, 0x2c1c0414, 0x2c1c043e, 0x2c1c0c04, 0x2c1c143e, 0x2c1c2424, 0x2c1c2c0c, 0x2c1c342c, - 0x2c1c3e1c, 0x2c24040c, 0x2c240424, 
0x2c241404, 0x2c241c14, 0x2c242434, 0x2c2c0c14, 0x2c2c1434, - 0x2c2c2c0c, 0x2c2c2c1c, 0x2c342414, 0x2c3e0414, 0x2c3e0424, 0x2c3e1414, 0x34040c0c, 0x34040c1c, - 0x34040c2c, 0x34041c0c, 0x34041c1c, 0x34043404, 0x340c0404, 0x340c1404, 0x340c143e, 0x340c3424, - 0x34140c14, 0x34141c24, 0x34142414, 0x34142c2c, 0x34143414, 0x34143e04, 0x341c0404, 0x341c0c24, - 0x341c140c, 0x341c2404, 0x3424142c, 0x3424241c, 0x34243414, 0x342c0404, 0x342c041c, 0x342c1c24, - 0x342c3404, 0x3434042c, 0x34342404, 0x343e0c0c, 0x343e0c1c, 0x3e040404, 0x3e040424, 0x3e04043e, - 0x3e041404, 0x3e041414, 0x3e041c34, 0x3e042404, 0x3e042c24, 0x3e043414, 0x3e0c0414, 0x3e0c0c0c, - 0x3e0c1424, 0x3e0c241c, 0x3e0c242c, 0x3e14040c, 0x3e140424, 0x3e140c04, 0x3e140c34, 0x3e14140c, - 0x3e141c04, 0x3e142c0c, 0x3e1c0414, 0x3e1c1c14, 0x3e1c1c2c, 0x3e1c2c1c, 0x3e24040c, 0x3e24042c, - 0x3e240c1c, 0x3e241404, 0x3e242c04, 0x3e2c1414, 0x3e2c2414, 0x3e340414, 0x3e341c0c, 0x3e3e0404, +constexpr constant static uint32_t iq3s_grid[512] = { + 0x01010101, 0x01010103, 0x01010105, 0x0101010b, 0x0101010f, 0x01010301, 0x01010303, 0x01010305, + 0x01010309, 0x0101030d, 0x01010501, 0x01010503, 0x0101050b, 0x01010707, 0x01010901, 0x01010905, + 0x0101090b, 0x0101090f, 0x01010b03, 0x01010b07, 0x01010d01, 0x01010d05, 0x01010f03, 0x01010f09, + 0x01010f0f, 0x01030101, 0x01030103, 0x01030105, 0x01030109, 0x01030301, 0x01030303, 0x0103030b, + 0x01030501, 0x01030507, 0x0103050f, 0x01030703, 0x0103070b, 0x01030909, 0x01030d03, 0x01030d0b, + 0x01030f05, 0x01050101, 0x01050103, 0x0105010b, 0x0105010f, 0x01050301, 0x01050307, 0x0105030d, + 0x01050503, 0x0105050b, 0x01050701, 0x01050709, 0x01050905, 0x0105090b, 0x0105090f, 0x01050b03, + 0x01050b07, 0x01050f01, 0x01050f07, 0x01070107, 0x01070303, 0x0107030b, 0x01070501, 0x01070505, + 0x01070703, 0x01070707, 0x0107070d, 0x01070909, 0x01070b01, 0x01070b05, 0x01070d0f, 0x01070f03, + 0x01070f0b, 0x01090101, 0x01090307, 0x0109030f, 0x01090503, 0x01090509, 0x01090705, 0x01090901, + 0x01090907, 0x01090b03, 0x01090f01, 0x010b0105, 0x010b0109, 0x010b0501, 0x010b0505, 0x010b050d, + 0x010b0707, 0x010b0903, 0x010b090b, 0x010b090f, 0x010b0d0d, 0x010b0f07, 0x010d010d, 0x010d0303, + 0x010d0307, 0x010d0703, 0x010d0b05, 0x010d0f03, 0x010f0101, 0x010f0105, 0x010f0109, 0x010f0501, + 0x010f0505, 0x010f050d, 0x010f0707, 0x010f0b01, 0x010f0b09, 0x03010101, 0x03010103, 0x03010105, + 0x03010109, 0x03010301, 0x03010303, 0x03010307, 0x0301030b, 0x0301030f, 0x03010501, 0x03010505, + 0x03010703, 0x03010709, 0x0301070d, 0x03010b09, 0x03010b0d, 0x03010d03, 0x03010f05, 0x03030101, + 0x03030103, 0x03030107, 0x0303010d, 0x03030301, 0x03030309, 0x03030503, 0x03030701, 0x03030707, + 0x03030903, 0x03030b01, 0x03030b05, 0x03030f01, 0x03030f0d, 0x03050101, 0x03050305, 0x0305030b, + 0x0305030f, 0x03050501, 0x03050509, 0x03050705, 0x03050901, 0x03050907, 0x03050b0b, 0x03050d01, + 0x03050f05, 0x03070103, 0x03070109, 0x0307010f, 0x03070301, 0x03070307, 0x03070503, 0x0307050f, + 0x03070701, 0x03070709, 0x03070903, 0x03070d05, 0x03070f01, 0x03090107, 0x0309010b, 0x03090305, + 0x03090309, 0x03090703, 0x03090707, 0x03090905, 0x0309090d, 0x03090b01, 0x03090b09, 0x030b0103, + 0x030b0301, 0x030b0307, 0x030b0503, 0x030b0701, 0x030b0705, 0x030b0b03, 0x030d0501, 0x030d0509, + 0x030d050f, 0x030d0909, 0x030d090d, 0x030f0103, 0x030f0107, 0x030f0301, 0x030f0305, 0x030f0503, + 0x030f070b, 0x030f0903, 0x030f0d05, 0x030f0f01, 0x05010101, 0x05010103, 0x05010107, 0x0501010b, + 0x0501010f, 0x05010301, 0x05010305, 0x05010309, 0x0501030d, 0x05010503, 0x05010507, 0x0501050f, + 
0x05010701, 0x05010705, 0x05010903, 0x05010907, 0x0501090b, 0x05010b01, 0x05010b05, 0x05010d0f, + 0x05010f01, 0x05010f07, 0x05010f0b, 0x05030101, 0x05030105, 0x05030301, 0x05030307, 0x0503030f, + 0x05030505, 0x0503050b, 0x05030703, 0x05030709, 0x05030905, 0x05030b03, 0x05050103, 0x05050109, + 0x0505010f, 0x05050503, 0x05050507, 0x05050701, 0x0505070f, 0x05050903, 0x05050b07, 0x05050b0f, + 0x05050f03, 0x05050f09, 0x05070101, 0x05070105, 0x0507010b, 0x05070303, 0x05070505, 0x05070509, + 0x05070703, 0x05070707, 0x05070905, 0x05070b01, 0x05070d0d, 0x05090103, 0x0509010f, 0x05090501, + 0x05090507, 0x05090705, 0x0509070b, 0x05090903, 0x05090f05, 0x05090f0b, 0x050b0109, 0x050b0303, + 0x050b0505, 0x050b070f, 0x050b0901, 0x050b0b07, 0x050b0f01, 0x050d0101, 0x050d0105, 0x050d010f, + 0x050d0503, 0x050d0b0b, 0x050d0d03, 0x050f010b, 0x050f0303, 0x050f050d, 0x050f0701, 0x050f0907, + 0x050f0b01, 0x07010105, 0x07010303, 0x07010307, 0x0701030b, 0x0701030f, 0x07010505, 0x07010703, + 0x07010707, 0x0701070b, 0x07010905, 0x07010909, 0x0701090f, 0x07010b03, 0x07010d07, 0x07010f03, + 0x07030103, 0x07030107, 0x0703010b, 0x07030309, 0x07030503, 0x07030507, 0x07030901, 0x07030d01, + 0x07030f05, 0x07030f0d, 0x07050101, 0x07050305, 0x07050501, 0x07050705, 0x07050709, 0x07050b01, + 0x07070103, 0x07070301, 0x07070309, 0x07070503, 0x07070507, 0x0707050f, 0x07070701, 0x07070903, + 0x07070907, 0x0707090f, 0x07070b0b, 0x07070f07, 0x07090107, 0x07090303, 0x0709030d, 0x07090505, + 0x07090703, 0x07090b05, 0x07090d01, 0x07090d09, 0x070b0103, 0x070b0301, 0x070b0305, 0x070b050b, + 0x070b0705, 0x070b0909, 0x070b0b0d, 0x070b0f07, 0x070d030d, 0x070d0903, 0x070f0103, 0x070f0107, + 0x070f0501, 0x070f0505, 0x070f070b, 0x09010101, 0x09010109, 0x09010305, 0x09010501, 0x09010509, + 0x0901050f, 0x09010705, 0x09010903, 0x09010b01, 0x09010f01, 0x09030105, 0x0903010f, 0x09030303, + 0x09030307, 0x09030505, 0x09030701, 0x0903070b, 0x09030907, 0x09030b03, 0x09030b0b, 0x09050103, + 0x09050107, 0x09050301, 0x0905030b, 0x09050503, 0x09050707, 0x09050901, 0x09050b0f, 0x09050d05, + 0x09050f01, 0x09070109, 0x09070303, 0x09070307, 0x09070501, 0x09070505, 0x09070703, 0x0907070b, + 0x09090101, 0x09090105, 0x09090509, 0x0909070f, 0x09090901, 0x09090f03, 0x090b010b, 0x090b010f, + 0x090b0503, 0x090b0d05, 0x090d0307, 0x090d0709, 0x090d0d01, 0x090f0301, 0x090f030b, 0x090f0701, + 0x090f0907, 0x090f0b03, 0x0b010105, 0x0b010301, 0x0b010309, 0x0b010505, 0x0b010901, 0x0b010909, + 0x0b01090f, 0x0b010b05, 0x0b010d0d, 0x0b010f09, 0x0b030103, 0x0b030107, 0x0b03010b, 0x0b030305, + 0x0b030503, 0x0b030705, 0x0b030f05, 0x0b050101, 0x0b050303, 0x0b050507, 0x0b050701, 0x0b05070d, + 0x0b050b07, 0x0b070105, 0x0b07010f, 0x0b070301, 0x0b07050f, 0x0b070909, 0x0b070b03, 0x0b070d0b, + 0x0b070f07, 0x0b090103, 0x0b090109, 0x0b090501, 0x0b090705, 0x0b09090d, 0x0b0b0305, 0x0b0b050d, + 0x0b0b0b03, 0x0b0b0b07, 0x0b0d0905, 0x0b0f0105, 0x0b0f0109, 0x0b0f0505, 0x0d010303, 0x0d010307, + 0x0d01030b, 0x0d010703, 0x0d010707, 0x0d010d01, 0x0d030101, 0x0d030501, 0x0d03050f, 0x0d030d09, + 0x0d050305, 0x0d050709, 0x0d050905, 0x0d050b0b, 0x0d050d05, 0x0d050f01, 0x0d070101, 0x0d070309, + 0x0d070503, 0x0d070901, 0x0d09050b, 0x0d090907, 0x0d090d05, 0x0d0b0101, 0x0d0b0107, 0x0d0b0709, + 0x0d0b0d01, 0x0d0d010b, 0x0d0d0901, 0x0d0f0303, 0x0d0f0307, 0x0f010101, 0x0f010109, 0x0f01010f, + 0x0f010501, 0x0f010505, 0x0f01070d, 0x0f010901, 0x0f010b09, 0x0f010d05, 0x0f030105, 0x0f030303, + 0x0f030509, 0x0f030907, 0x0f03090b, 0x0f050103, 0x0f050109, 0x0f050301, 0x0f05030d, 0x0f050503, + 0x0f050701, 0x0f050b03, 
0x0f070105, 0x0f070705, 0x0f07070b, 0x0f070b07, 0x0f090103, 0x0f09010b, + 0x0f090307, 0x0f090501, 0x0f090b01, 0x0f0b0505, 0x0f0b0905, 0x0f0d0105, 0x0f0d0703, 0x0f0f0101, }; #define NGRID_IQ1S 512 @@ -4742,7 +4742,7 @@ void kernel_mul_mv_iq3_s_f32_impl( { int nval = 8; int pos = (32*sgitg + tiisg)*nval; - for (int i = 0; i < nval; ++i) values[pos + i] = iq3xs_grid[pos + i]; + for (int i = 0; i < nval; ++i) values[pos + i] = iq3s_grid[pos + i]; threadgroup_barrier(mem_flags::mem_threadgroup); } @@ -4769,12 +4769,14 @@ void kernel_mul_mv_iq3_s_f32_impl( for (int row = 0; row < N_DST; row++) { const float db = dh[0]; - const float d = db * (0.5f + ((sc[0] >> 4*(ib%2)) & 0xf)); + const float d = db * (1 + 2*((sc[0] >> 4*(ib%2)) & 0xf)); float2 sum = {0}; for (int l = 0; l < 4; ++l) { - const threadgroup uint8_t * grid1 = (const threadgroup uint8_t *)(values + (qs[2*l+0] | ((qh[0] << (8-2*l)) & 256))); - const threadgroup uint8_t * grid2 = (const threadgroup uint8_t *)(values + (qs[2*l+1] | ((qh[0] << (7-2*l)) & 256))); + const threadgroup uint32_t * table1 = qh[0] & kmask_iq2xs[2*l+0] ? values + 256 : values; + const threadgroup uint32_t * table2 = qh[0] & kmask_iq2xs[2*l+1] ? values + 256 : values; + const threadgroup uint8_t * grid1 = (const threadgroup uint8_t *)(table1 + qs[2*l+0]); + const threadgroup uint8_t * grid2 = (const threadgroup uint8_t *)(table2 + qs[2*l+1]); for (int j = 0; j < 4; ++j) { sum[0] += yl[8*l + j + 0] * grid1[j] * select(1, -1, signs[l] & kmask_iq2xs[j+0]); sum[1] += yl[8*l + j + 4] * grid2[j] * select(1, -1, signs[l] & kmask_iq2xs[j+4]); @@ -4795,7 +4797,7 @@ void kernel_mul_mv_iq3_s_f32_impl( for (int row = 0; row < N_DST; ++row) { all_sum = simd_sum(sumf[row]); if (tiisg == 0) { - dst[r1*ne0 + im*ne0*ne1 + first_row + row] = all_sum * 0.5f; + dst[r1*ne0 + im*ne0*ne1 + first_row + row] = all_sum; } } } @@ -5685,15 +5687,15 @@ void dequantize_iq3_s(device const block_iq3_s * xb, short il, thread type4x4 & device const uint8_t * qs = xb->qs + 8*ib32; device const uint8_t * signs = xb->signs + 4*ib32 + 2*il; const uint8_t qh = xb->qh[ib32] >> 4*il; - const float dl = d * (0.5f + ((xb->scales[ib32/2] >> 4*(ib32%2)) & 0xf)) * 0.5f; - constant uint8_t * grid1 = (constant uint8_t *)(iq3xs_grid + (qs[4*il+0] | ((qh << 8) & 256))); - constant uint8_t * grid2 = (constant uint8_t *)(iq3xs_grid + (qs[4*il+1] | ((qh << 7) & 256))); + const float dl = d * (1 + 2*((xb->scales[ib32/2] >> 4*(ib32%2)) & 0xf)); + constant uint8_t * grid1 = (constant uint8_t *)(iq3s_grid + (qs[4*il+0] | ((qh << 8) & 256))); + constant uint8_t * grid2 = (constant uint8_t *)(iq3s_grid + (qs[4*il+1] | ((qh << 7) & 256))); for (int i = 0; i < 4; ++i) { reg[0][i] = dl * grid1[i] * select(1, -1, signs[0] & kmask_iq2xs[i+0]); reg[1][i] = dl * grid2[i] * select(1, -1, signs[0] & kmask_iq2xs[i+4]); } - grid1 = (constant uint8_t *)(iq3xs_grid + (qs[4*il+2] | ((qh << 6) & 256))); - grid2 = (constant uint8_t *)(iq3xs_grid + (qs[4*il+3] | ((qh << 5) & 256))); + grid1 = (constant uint8_t *)(iq3s_grid + (qs[4*il+2] | ((qh << 6) & 256))); + grid2 = (constant uint8_t *)(iq3s_grid + (qs[4*il+3] | ((qh << 5) & 256))); for (int i = 0; i < 4; ++i) { reg[2][i] = dl * grid1[i] * select(1, -1, signs[1] & kmask_iq2xs[i+0]); reg[3][i] = dl * grid2[i] * select(1, -1, signs[1] & kmask_iq2xs[i+4]); diff --git a/ggml-quants.c b/ggml-quants.c index 371826f14..492a1b9a6 100644 --- a/ggml-quants.c +++ b/ggml-quants.c @@ -3818,71 +3818,71 @@ static const uint32_t iq3xxs_grid[256] = { 0x3e1c1c1c, 0x3e1c3404, 0x3e24140c, 0x3e24240c, 
0x3e2c0404, 0x3e2c0414, 0x3e2c1424, 0x3e341c04, }; -static const uint32_t iq3xs_grid[512] = { - 0x04040404, 0x0404040c, 0x04040414, 0x0404042c, 0x0404043e, 0x04040c04, 0x04040c0c, 0x04040c14, - 0x04040c24, 0x04040c34, 0x04041404, 0x0404140c, 0x0404142c, 0x04041c1c, 0x04042404, 0x04042414, - 0x0404242c, 0x0404243e, 0x04042c0c, 0x04042c1c, 0x04043404, 0x04043414, 0x04043e0c, 0x04043e24, - 0x04043e3e, 0x040c0404, 0x040c040c, 0x040c0414, 0x040c0424, 0x040c0c04, 0x040c0c0c, 0x040c0c2c, - 0x040c1404, 0x040c141c, 0x040c143e, 0x040c1c0c, 0x040c1c2c, 0x040c2424, 0x040c340c, 0x040c342c, - 0x040c3e14, 0x04140404, 0x0414040c, 0x0414042c, 0x0414043e, 0x04140c04, 0x04140c1c, 0x04140c34, - 0x0414140c, 0x0414142c, 0x04141c04, 0x04141c24, 0x04142414, 0x0414242c, 0x0414243e, 0x04142c0c, - 0x04142c1c, 0x04143e04, 0x04143e1c, 0x041c041c, 0x041c0c0c, 0x041c0c2c, 0x041c1404, 0x041c1414, - 0x041c1c0c, 0x041c1c1c, 0x041c1c34, 0x041c2424, 0x041c2c04, 0x041c2c14, 0x041c343e, 0x041c3e0c, - 0x041c3e2c, 0x04240404, 0x04240c1c, 0x04240c3e, 0x0424140c, 0x04241424, 0x04241c14, 0x04242404, - 0x0424241c, 0x04242c0c, 0x04243e04, 0x042c0414, 0x042c0424, 0x042c1404, 0x042c1414, 0x042c1434, - 0x042c1c1c, 0x042c240c, 0x042c242c, 0x042c243e, 0x042c3434, 0x042c3e1c, 0x04340434, 0x04340c0c, - 0x04340c1c, 0x04341c0c, 0x04342c14, 0x04343e0c, 0x043e0404, 0x043e0414, 0x043e0424, 0x043e1404, - 0x043e1414, 0x043e1434, 0x043e1c1c, 0x043e2c04, 0x043e2c24, 0x0c040404, 0x0c04040c, 0x0c040414, - 0x0c040424, 0x0c040c04, 0x0c040c0c, 0x0c040c1c, 0x0c040c2c, 0x0c040c3e, 0x0c041404, 0x0c041414, - 0x0c041c0c, 0x0c041c24, 0x0c041c34, 0x0c042c24, 0x0c042c34, 0x0c04340c, 0x0c043e14, 0x0c0c0404, - 0x0c0c040c, 0x0c0c041c, 0x0c0c0434, 0x0c0c0c04, 0x0c0c0c24, 0x0c0c140c, 0x0c0c1c04, 0x0c0c1c1c, - 0x0c0c240c, 0x0c0c2c04, 0x0c0c2c14, 0x0c0c3e04, 0x0c0c3e34, 0x0c140404, 0x0c140c14, 0x0c140c2c, - 0x0c140c3e, 0x0c141404, 0x0c141424, 0x0c141c14, 0x0c142404, 0x0c14241c, 0x0c142c2c, 0x0c143404, - 0x0c143e14, 0x0c1c040c, 0x0c1c0424, 0x0c1c043e, 0x0c1c0c04, 0x0c1c0c1c, 0x0c1c140c, 0x0c1c143e, - 0x0c1c1c04, 0x0c1c1c24, 0x0c1c240c, 0x0c1c3414, 0x0c1c3e04, 0x0c24041c, 0x0c24042c, 0x0c240c14, - 0x0c240c24, 0x0c241c0c, 0x0c241c1c, 0x0c242414, 0x0c242434, 0x0c242c04, 0x0c242c24, 0x0c2c040c, - 0x0c2c0c04, 0x0c2c0c1c, 0x0c2c140c, 0x0c2c1c04, 0x0c2c1c14, 0x0c2c2c0c, 0x0c341404, 0x0c341424, - 0x0c34143e, 0x0c342424, 0x0c342434, 0x0c3e040c, 0x0c3e041c, 0x0c3e0c04, 0x0c3e0c14, 0x0c3e140c, - 0x0c3e1c2c, 0x0c3e240c, 0x0c3e3414, 0x0c3e3e04, 0x14040404, 0x1404040c, 0x1404041c, 0x1404042c, - 0x1404043e, 0x14040c04, 0x14040c14, 0x14040c24, 0x14040c34, 0x1404140c, 0x1404141c, 0x1404143e, - 0x14041c04, 0x14041c14, 0x1404240c, 0x1404241c, 0x1404242c, 0x14042c04, 0x14042c14, 0x1404343e, - 0x14043e04, 0x14043e1c, 0x14043e2c, 0x140c0404, 0x140c0414, 0x140c0c04, 0x140c0c1c, 0x140c0c3e, - 0x140c1414, 0x140c142c, 0x140c1c0c, 0x140c1c24, 0x140c2414, 0x140c2c0c, 0x1414040c, 0x14140424, - 0x1414043e, 0x1414140c, 0x1414141c, 0x14141c04, 0x14141c3e, 0x1414240c, 0x14142c1c, 0x14142c3e, - 0x14143e0c, 0x14143e24, 0x141c0404, 0x141c0414, 0x141c042c, 0x141c0c0c, 0x141c1414, 0x141c1424, - 0x141c1c0c, 0x141c1c1c, 0x141c2414, 0x141c2c04, 0x141c3434, 0x1424040c, 0x1424043e, 0x14241404, - 0x1424141c, 0x14241c14, 0x14241c2c, 0x1424240c, 0x14243e14, 0x14243e2c, 0x142c0424, 0x142c0c0c, - 0x142c1414, 0x142c1c3e, 0x142c2404, 0x142c2c1c, 0x142c3e04, 0x14340404, 0x14340414, 0x1434043e, - 0x1434140c, 0x14342c2c, 0x1434340c, 0x143e042c, 0x143e0c0c, 0x143e1434, 0x143e1c04, 0x143e241c, - 0x143e2c04, 0x1c040414, 
0x1c040c0c, 0x1c040c1c, 0x1c040c2c, 0x1c040c3e, 0x1c041414, 0x1c041c0c, - 0x1c041c1c, 0x1c041c2c, 0x1c042414, 0x1c042424, 0x1c04243e, 0x1c042c0c, 0x1c04341c, 0x1c043e0c, - 0x1c0c040c, 0x1c0c041c, 0x1c0c042c, 0x1c0c0c24, 0x1c0c140c, 0x1c0c141c, 0x1c0c2404, 0x1c0c3404, - 0x1c0c3e14, 0x1c0c3e34, 0x1c140404, 0x1c140c14, 0x1c141404, 0x1c141c14, 0x1c141c24, 0x1c142c04, - 0x1c1c040c, 0x1c1c0c04, 0x1c1c0c24, 0x1c1c140c, 0x1c1c141c, 0x1c1c143e, 0x1c1c1c04, 0x1c1c240c, - 0x1c1c241c, 0x1c1c243e, 0x1c1c2c2c, 0x1c1c3e1c, 0x1c24041c, 0x1c240c0c, 0x1c240c34, 0x1c241414, - 0x1c241c0c, 0x1c242c14, 0x1c243404, 0x1c243424, 0x1c2c040c, 0x1c2c0c04, 0x1c2c0c14, 0x1c2c142c, - 0x1c2c1c14, 0x1c2c2424, 0x1c2c2c34, 0x1c2c3e1c, 0x1c340c34, 0x1c34240c, 0x1c3e040c, 0x1c3e041c, - 0x1c3e1404, 0x1c3e1414, 0x1c3e1c2c, 0x24040404, 0x24040424, 0x24040c14, 0x24041404, 0x24041424, - 0x2404143e, 0x24041c14, 0x2404240c, 0x24042c04, 0x24043e04, 0x240c0414, 0x240c043e, 0x240c0c0c, - 0x240c0c1c, 0x240c1414, 0x240c1c04, 0x240c1c2c, 0x240c241c, 0x240c2c0c, 0x240c2c2c, 0x2414040c, - 0x2414041c, 0x24140c04, 0x24140c2c, 0x2414140c, 0x24141c1c, 0x24142404, 0x24142c3e, 0x24143414, - 0x24143e04, 0x241c0424, 0x241c0c0c, 0x241c0c1c, 0x241c1404, 0x241c1414, 0x241c1c0c, 0x241c1c2c, - 0x24240404, 0x24240414, 0x24241424, 0x24241c3e, 0x24242404, 0x24243e0c, 0x242c042c, 0x242c043e, - 0x242c140c, 0x242c3414, 0x24340c1c, 0x24341c24, 0x24343404, 0x243e0c04, 0x243e0c2c, 0x243e1c04, - 0x243e241c, 0x243e2c0c, 0x2c040414, 0x2c040c04, 0x2c040c24, 0x2c041414, 0x2c042404, 0x2c042424, - 0x2c04243e, 0x2c042c14, 0x2c043434, 0x2c043e24, 0x2c0c040c, 0x2c0c041c, 0x2c0c042c, 0x2c0c0c14, - 0x2c0c140c, 0x2c0c1c14, 0x2c0c3e14, 0x2c140404, 0x2c140c0c, 0x2c14141c, 0x2c141c04, 0x2c141c34, - 0x2c142c1c, 0x2c1c0414, 0x2c1c043e, 0x2c1c0c04, 0x2c1c143e, 0x2c1c2424, 0x2c1c2c0c, 0x2c1c342c, - 0x2c1c3e1c, 0x2c24040c, 0x2c240424, 0x2c241404, 0x2c241c14, 0x2c242434, 0x2c2c0c14, 0x2c2c1434, - 0x2c2c2c0c, 0x2c2c2c1c, 0x2c342414, 0x2c3e0414, 0x2c3e0424, 0x2c3e1414, 0x34040c0c, 0x34040c1c, - 0x34040c2c, 0x34041c0c, 0x34041c1c, 0x34043404, 0x340c0404, 0x340c1404, 0x340c143e, 0x340c3424, - 0x34140c14, 0x34141c24, 0x34142414, 0x34142c2c, 0x34143414, 0x34143e04, 0x341c0404, 0x341c0c24, - 0x341c140c, 0x341c2404, 0x3424142c, 0x3424241c, 0x34243414, 0x342c0404, 0x342c041c, 0x342c1c24, - 0x342c3404, 0x3434042c, 0x34342404, 0x343e0c0c, 0x343e0c1c, 0x3e040404, 0x3e040424, 0x3e04043e, - 0x3e041404, 0x3e041414, 0x3e041c34, 0x3e042404, 0x3e042c24, 0x3e043414, 0x3e0c0414, 0x3e0c0c0c, - 0x3e0c1424, 0x3e0c241c, 0x3e0c242c, 0x3e14040c, 0x3e140424, 0x3e140c04, 0x3e140c34, 0x3e14140c, - 0x3e141c04, 0x3e142c0c, 0x3e1c0414, 0x3e1c1c14, 0x3e1c1c2c, 0x3e1c2c1c, 0x3e24040c, 0x3e24042c, - 0x3e240c1c, 0x3e241404, 0x3e242c04, 0x3e2c1414, 0x3e2c2414, 0x3e340414, 0x3e341c0c, 0x3e3e0404, +static const uint32_t iq3s_grid[512] = { + 0x01010101, 0x01010103, 0x01010105, 0x0101010b, 0x0101010f, 0x01010301, 0x01010303, 0x01010305, + 0x01010309, 0x0101030d, 0x01010501, 0x01010503, 0x0101050b, 0x01010707, 0x01010901, 0x01010905, + 0x0101090b, 0x0101090f, 0x01010b03, 0x01010b07, 0x01010d01, 0x01010d05, 0x01010f03, 0x01010f09, + 0x01010f0f, 0x01030101, 0x01030103, 0x01030105, 0x01030109, 0x01030301, 0x01030303, 0x0103030b, + 0x01030501, 0x01030507, 0x0103050f, 0x01030703, 0x0103070b, 0x01030909, 0x01030d03, 0x01030d0b, + 0x01030f05, 0x01050101, 0x01050103, 0x0105010b, 0x0105010f, 0x01050301, 0x01050307, 0x0105030d, + 0x01050503, 0x0105050b, 0x01050701, 0x01050709, 0x01050905, 0x0105090b, 0x0105090f, 0x01050b03, + 
0x01050b07, 0x01050f01, 0x01050f07, 0x01070107, 0x01070303, 0x0107030b, 0x01070501, 0x01070505, + 0x01070703, 0x01070707, 0x0107070d, 0x01070909, 0x01070b01, 0x01070b05, 0x01070d0f, 0x01070f03, + 0x01070f0b, 0x01090101, 0x01090307, 0x0109030f, 0x01090503, 0x01090509, 0x01090705, 0x01090901, + 0x01090907, 0x01090b03, 0x01090f01, 0x010b0105, 0x010b0109, 0x010b0501, 0x010b0505, 0x010b050d, + 0x010b0707, 0x010b0903, 0x010b090b, 0x010b090f, 0x010b0d0d, 0x010b0f07, 0x010d010d, 0x010d0303, + 0x010d0307, 0x010d0703, 0x010d0b05, 0x010d0f03, 0x010f0101, 0x010f0105, 0x010f0109, 0x010f0501, + 0x010f0505, 0x010f050d, 0x010f0707, 0x010f0b01, 0x010f0b09, 0x03010101, 0x03010103, 0x03010105, + 0x03010109, 0x03010301, 0x03010303, 0x03010307, 0x0301030b, 0x0301030f, 0x03010501, 0x03010505, + 0x03010703, 0x03010709, 0x0301070d, 0x03010b09, 0x03010b0d, 0x03010d03, 0x03010f05, 0x03030101, + 0x03030103, 0x03030107, 0x0303010d, 0x03030301, 0x03030309, 0x03030503, 0x03030701, 0x03030707, + 0x03030903, 0x03030b01, 0x03030b05, 0x03030f01, 0x03030f0d, 0x03050101, 0x03050305, 0x0305030b, + 0x0305030f, 0x03050501, 0x03050509, 0x03050705, 0x03050901, 0x03050907, 0x03050b0b, 0x03050d01, + 0x03050f05, 0x03070103, 0x03070109, 0x0307010f, 0x03070301, 0x03070307, 0x03070503, 0x0307050f, + 0x03070701, 0x03070709, 0x03070903, 0x03070d05, 0x03070f01, 0x03090107, 0x0309010b, 0x03090305, + 0x03090309, 0x03090703, 0x03090707, 0x03090905, 0x0309090d, 0x03090b01, 0x03090b09, 0x030b0103, + 0x030b0301, 0x030b0307, 0x030b0503, 0x030b0701, 0x030b0705, 0x030b0b03, 0x030d0501, 0x030d0509, + 0x030d050f, 0x030d0909, 0x030d090d, 0x030f0103, 0x030f0107, 0x030f0301, 0x030f0305, 0x030f0503, + 0x030f070b, 0x030f0903, 0x030f0d05, 0x030f0f01, 0x05010101, 0x05010103, 0x05010107, 0x0501010b, + 0x0501010f, 0x05010301, 0x05010305, 0x05010309, 0x0501030d, 0x05010503, 0x05010507, 0x0501050f, + 0x05010701, 0x05010705, 0x05010903, 0x05010907, 0x0501090b, 0x05010b01, 0x05010b05, 0x05010d0f, + 0x05010f01, 0x05010f07, 0x05010f0b, 0x05030101, 0x05030105, 0x05030301, 0x05030307, 0x0503030f, + 0x05030505, 0x0503050b, 0x05030703, 0x05030709, 0x05030905, 0x05030b03, 0x05050103, 0x05050109, + 0x0505010f, 0x05050503, 0x05050507, 0x05050701, 0x0505070f, 0x05050903, 0x05050b07, 0x05050b0f, + 0x05050f03, 0x05050f09, 0x05070101, 0x05070105, 0x0507010b, 0x05070303, 0x05070505, 0x05070509, + 0x05070703, 0x05070707, 0x05070905, 0x05070b01, 0x05070d0d, 0x05090103, 0x0509010f, 0x05090501, + 0x05090507, 0x05090705, 0x0509070b, 0x05090903, 0x05090f05, 0x05090f0b, 0x050b0109, 0x050b0303, + 0x050b0505, 0x050b070f, 0x050b0901, 0x050b0b07, 0x050b0f01, 0x050d0101, 0x050d0105, 0x050d010f, + 0x050d0503, 0x050d0b0b, 0x050d0d03, 0x050f010b, 0x050f0303, 0x050f050d, 0x050f0701, 0x050f0907, + 0x050f0b01, 0x07010105, 0x07010303, 0x07010307, 0x0701030b, 0x0701030f, 0x07010505, 0x07010703, + 0x07010707, 0x0701070b, 0x07010905, 0x07010909, 0x0701090f, 0x07010b03, 0x07010d07, 0x07010f03, + 0x07030103, 0x07030107, 0x0703010b, 0x07030309, 0x07030503, 0x07030507, 0x07030901, 0x07030d01, + 0x07030f05, 0x07030f0d, 0x07050101, 0x07050305, 0x07050501, 0x07050705, 0x07050709, 0x07050b01, + 0x07070103, 0x07070301, 0x07070309, 0x07070503, 0x07070507, 0x0707050f, 0x07070701, 0x07070903, + 0x07070907, 0x0707090f, 0x07070b0b, 0x07070f07, 0x07090107, 0x07090303, 0x0709030d, 0x07090505, + 0x07090703, 0x07090b05, 0x07090d01, 0x07090d09, 0x070b0103, 0x070b0301, 0x070b0305, 0x070b050b, + 0x070b0705, 0x070b0909, 0x070b0b0d, 0x070b0f07, 0x070d030d, 0x070d0903, 0x070f0103, 0x070f0107, + 0x070f0501, 0x070f0505, 
0x070f070b, 0x09010101, 0x09010109, 0x09010305, 0x09010501, 0x09010509, + 0x0901050f, 0x09010705, 0x09010903, 0x09010b01, 0x09010f01, 0x09030105, 0x0903010f, 0x09030303, + 0x09030307, 0x09030505, 0x09030701, 0x0903070b, 0x09030907, 0x09030b03, 0x09030b0b, 0x09050103, + 0x09050107, 0x09050301, 0x0905030b, 0x09050503, 0x09050707, 0x09050901, 0x09050b0f, 0x09050d05, + 0x09050f01, 0x09070109, 0x09070303, 0x09070307, 0x09070501, 0x09070505, 0x09070703, 0x0907070b, + 0x09090101, 0x09090105, 0x09090509, 0x0909070f, 0x09090901, 0x09090f03, 0x090b010b, 0x090b010f, + 0x090b0503, 0x090b0d05, 0x090d0307, 0x090d0709, 0x090d0d01, 0x090f0301, 0x090f030b, 0x090f0701, + 0x090f0907, 0x090f0b03, 0x0b010105, 0x0b010301, 0x0b010309, 0x0b010505, 0x0b010901, 0x0b010909, + 0x0b01090f, 0x0b010b05, 0x0b010d0d, 0x0b010f09, 0x0b030103, 0x0b030107, 0x0b03010b, 0x0b030305, + 0x0b030503, 0x0b030705, 0x0b030f05, 0x0b050101, 0x0b050303, 0x0b050507, 0x0b050701, 0x0b05070d, + 0x0b050b07, 0x0b070105, 0x0b07010f, 0x0b070301, 0x0b07050f, 0x0b070909, 0x0b070b03, 0x0b070d0b, + 0x0b070f07, 0x0b090103, 0x0b090109, 0x0b090501, 0x0b090705, 0x0b09090d, 0x0b0b0305, 0x0b0b050d, + 0x0b0b0b03, 0x0b0b0b07, 0x0b0d0905, 0x0b0f0105, 0x0b0f0109, 0x0b0f0505, 0x0d010303, 0x0d010307, + 0x0d01030b, 0x0d010703, 0x0d010707, 0x0d010d01, 0x0d030101, 0x0d030501, 0x0d03050f, 0x0d030d09, + 0x0d050305, 0x0d050709, 0x0d050905, 0x0d050b0b, 0x0d050d05, 0x0d050f01, 0x0d070101, 0x0d070309, + 0x0d070503, 0x0d070901, 0x0d09050b, 0x0d090907, 0x0d090d05, 0x0d0b0101, 0x0d0b0107, 0x0d0b0709, + 0x0d0b0d01, 0x0d0d010b, 0x0d0d0901, 0x0d0f0303, 0x0d0f0307, 0x0f010101, 0x0f010109, 0x0f01010f, + 0x0f010501, 0x0f010505, 0x0f01070d, 0x0f010901, 0x0f010b09, 0x0f010d05, 0x0f030105, 0x0f030303, + 0x0f030509, 0x0f030907, 0x0f03090b, 0x0f050103, 0x0f050109, 0x0f050301, 0x0f05030d, 0x0f050503, + 0x0f050701, 0x0f050b03, 0x0f070105, 0x0f070705, 0x0f07070b, 0x0f070b07, 0x0f090103, 0x0f09010b, + 0x0f090307, 0x0f090501, 0x0f090b01, 0x0f0b0505, 0x0f0b0905, 0x0f0d0105, 0x0f0d0703, 0x0f0f0101, }; #define NGRID_IQ2XXS 512 @@ -4162,11 +4162,11 @@ void dequantize_row_iq3_s(const block_iq3_s * restrict x, float * restrict y, in const uint8_t * signs = x[i].signs; for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) { - const float db1 = d * (0.5f + (x[i].scales[ib32/2] & 0xf)) * 0.5f; - const float db2 = d * (0.5f + (x[i].scales[ib32/2] >> 4)) * 0.5f; + const float db1 = d * (1 + 2*(x[i].scales[ib32/2] & 0xf)); + const float db2 = d * (1 + 2*(x[i].scales[ib32/2] >> 4)); for (int l = 0; l < 4; ++l) { - const uint8_t * grid1 = (const uint8_t *)(iq3xs_grid + (qs[2*l+0] | ((qh[0] << (8-2*l)) & 256))); - const uint8_t * grid2 = (const uint8_t *)(iq3xs_grid + (qs[2*l+1] | ((qh[0] << (7-2*l)) & 256))); + const uint8_t * grid1 = (const uint8_t *)(iq3s_grid + (qs[2*l+0] | ((qh[0] << (8-2*l)) & 256))); + const uint8_t * grid2 = (const uint8_t *)(iq3s_grid + (qs[2*l+1] | ((qh[0] << (7-2*l)) & 256))); for (int j = 0; j < 4; ++j) { y[j+0] = db1 * grid1[j] * (signs[l] & kmask_iq2xs[j+0] ? -1.f : 1.f); y[j+4] = db1 * grid2[j] * (signs[l] & kmask_iq2xs[j+4] ? 
-1.f : 1.f); @@ -4176,8 +4176,8 @@ void dequantize_row_iq3_s(const block_iq3_s * restrict x, float * restrict y, in qs += 8; signs += 4; for (int l = 0; l < 4; ++l) { - const uint8_t * grid1 = (const uint8_t *)(iq3xs_grid + (qs[2*l+0] | ((qh[1] << (8-2*l)) & 256))); - const uint8_t * grid2 = (const uint8_t *)(iq3xs_grid + (qs[2*l+1] | ((qh[1] << (7-2*l)) & 256))); + const uint8_t * grid1 = (const uint8_t *)(iq3s_grid + (qs[2*l+0] | ((qh[1] << (8-2*l)) & 256))); + const uint8_t * grid2 = (const uint8_t *)(iq3s_grid + (qs[2*l+1] | ((qh[1] << (7-2*l)) & 256))); for (int j = 0; j < 4; ++j) { y[j+0] = db2 * grid1[j] * (signs[l] & kmask_iq2xs[j+0] ? -1.f : 1.f); y[j+4] = db2 * grid2[j] * (signs[l] & kmask_iq2xs[j+4] ? -1.f : 1.f); @@ -10089,18 +10089,34 @@ void ggml_vec_dot_iq3_s_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const v #if defined(__ARM_NEON) + typedef union { + uint16x8_t vec_index; + uint16_t index[8]; + } vec_index_t; + static const uint8_t k_mask1[32] = {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03 }; static const uint8_t k_mask2[16] = {0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,}; - const uint8x16x2_t mask1 = vld1q_u8_x2(k_mask1); - const uint8x16_t mask2 = vld1q_u8(k_mask2); + static const int16_t k_shift[8] = {8, 7, 6, 5, 4, 3, 2, 1}; + + const uint8x16x2_t mask1 = vld1q_u8_x2(k_mask1); + const uint8x16_t mask2 = vld1q_u8(k_mask2); + const int16x8_t hshift = vld1q_s16(k_shift); + const uint16x8_t m256 = vdupq_n_u16(256); + const uint8x16_t m1 = vdupq_n_u8(1); uint8x16x2_t vs; ggml_int8x16x4_t q3s; ggml_int8x16x4_t q8b; + vec_index_t idx; + +#if QK_K == 256 + uint32_t scales32[2]; + const uint8_t * scales8 = (const uint8_t *)scales32; +#endif float sumf = 0; for (int i = 0; i < nb; ++i) { @@ -10109,47 +10125,63 @@ void ggml_vec_dot_iq3_s_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const v const uint8_t * restrict qh = x[i].qh; const uint16_t * restrict signs = (const uint16_t *)x[i].signs; const int8_t * restrict q8 = y[i].qs; + +#if QK_K == 256 + memcpy(scales32, x[i].scales, 4); + scales32[1] = (((scales32[0] >> 4) & 0x0f0f0f0f) << 1) | 0x01010101; + scales32[0] = ((scales32[0] & 0x0f0f0f0f) << 1) | 0x01010101; +#endif + int sumi1 = 0, sumi2 = 0; for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) { q8b = ggml_vld1q_s8_x4(q8); q8 += 64; - const uint32x4_t aux32x4_0 = {iq3xs_grid[qs[ 0] | ((qh[ib32+0] << 8) & 256)], iq3xs_grid[qs[ 1] | ((qh[ib32+0] << 7) & 256)], - iq3xs_grid[qs[ 2] | ((qh[ib32+0] << 6) & 256)], iq3xs_grid[qs[ 3] | ((qh[ib32+0] << 5) & 256)]}; - const uint32x4_t aux32x4_1 = {iq3xs_grid[qs[ 4] | ((qh[ib32+0] << 4) & 256)], iq3xs_grid[qs[ 5] | ((qh[ib32+0] << 3) & 256)], - iq3xs_grid[qs[ 6] | ((qh[ib32+0] << 2) & 256)], iq3xs_grid[qs[ 7] | ((qh[ib32+0] << 1) & 256)]}; - const uint32x4_t aux32x4_2 = {iq3xs_grid[qs[ 8] | ((qh[ib32+1] << 8) & 256)], iq3xs_grid[qs[ 9] | ((qh[ib32+1] << 7) & 256)], - iq3xs_grid[qs[10] | ((qh[ib32+1] << 6) & 256)], iq3xs_grid[qs[11] | ((qh[ib32+1] << 5) & 256)]}; - const uint32x4_t aux32x4_3 = {iq3xs_grid[qs[12] | ((qh[ib32+1] << 4) & 256)], iq3xs_grid[qs[13] | ((qh[ib32+1] << 3) & 256)], - iq3xs_grid[qs[14] | ((qh[ib32+1] << 2) & 256)], iq3xs_grid[qs[15] | ((qh[ib32+1] << 1) & 256)]}; - qs += 16; + + const uint8x16_t idx_l = vld1q_u8(qs); qs += 16; + idx.vec_index = vorrq_u16(vmovl_u8(vget_low_u8 (idx_l)), 
vandq_u16(vshlq_u16(vdupq_n_u16(qh[ib32+0]), hshift), m256)); + const uint32x4_t aux32x4_0 = {iq3s_grid[idx.index[0]], iq3s_grid[idx.index[1]], + iq3s_grid[idx.index[2]], iq3s_grid[idx.index[3]]}; + const uint32x4_t aux32x4_1 = {iq3s_grid[idx.index[4]], iq3s_grid[idx.index[5]], + iq3s_grid[idx.index[6]], iq3s_grid[idx.index[7]]}; + idx.vec_index = vorrq_u16(vmovl_u8(vget_high_u8(idx_l)), vandq_u16(vshlq_u16(vdupq_n_u16(qh[ib32+1]), hshift), m256)); + const uint32x4_t aux32x4_2 = {iq3s_grid[idx.index[0]], iq3s_grid[idx.index[1]], + iq3s_grid[idx.index[2]], iq3s_grid[idx.index[3]]}; + const uint32x4_t aux32x4_3 = {iq3s_grid[idx.index[4]], iq3s_grid[idx.index[5]], + iq3s_grid[idx.index[6]], iq3s_grid[idx.index[7]]}; + vs.val[0] = vreinterpretq_u8_u32(vdupq_n_u32(signs[0] | (signs[1] << 16))); vs.val[1] = vandq_u8(ggml_vqtbl1q_u8(vs.val[0], mask1.val[1]), mask2); vs.val[0] = vandq_u8(ggml_vqtbl1q_u8(vs.val[0], mask1.val[0]), mask2); - vs.val[0] = vceqq_u8(vs.val[0], mask2); - vs.val[1] = vceqq_u8(vs.val[1], mask2); + vs.val[0] = vorrq_u8(vceqq_u8(vs.val[0], mask2), m1); + vs.val[1] = vorrq_u8(vceqq_u8(vs.val[1], mask2), m1); - q3s.val[0] = vsubq_s8(vreinterpretq_s8_u8(veorq_u8(vs.val[0], vreinterpretq_u8_u32(aux32x4_0))), vreinterpretq_s8_u8(vs.val[0])); - q3s.val[1] = vsubq_s8(vreinterpretq_s8_u8(veorq_u8(vs.val[1], vreinterpretq_u8_u32(aux32x4_1))), vreinterpretq_s8_u8(vs.val[1])); + q3s.val[0] = vmulq_s8(vreinterpretq_s8_u8(vs.val[0]), vreinterpretq_s8_u32(aux32x4_0)); + q3s.val[1] = vmulq_s8(vreinterpretq_s8_u8(vs.val[1]), vreinterpretq_s8_u32(aux32x4_1)); vs.val[0] = vreinterpretq_u8_u32(vdupq_n_u32(signs[2] | (signs[3] << 16))); vs.val[1] = vandq_u8(ggml_vqtbl1q_u8(vs.val[0], mask1.val[1]), mask2); vs.val[0] = vandq_u8(ggml_vqtbl1q_u8(vs.val[0], mask1.val[0]), mask2); - vs.val[0] = vceqq_u8(vs.val[0], mask2); - vs.val[1] = vceqq_u8(vs.val[1], mask2); + vs.val[0] = vorrq_u8(vceqq_u8(vs.val[0], mask2), m1); + vs.val[1] = vorrq_u8(vceqq_u8(vs.val[1], mask2), m1); signs += 4; - q3s.val[2] = vsubq_s8(vreinterpretq_s8_u8(veorq_u8(vs.val[0], vreinterpretq_u8_u32(aux32x4_2))), vreinterpretq_s8_u8(vs.val[0])); - q3s.val[3] = vsubq_s8(vreinterpretq_s8_u8(veorq_u8(vs.val[1], vreinterpretq_u8_u32(aux32x4_3))), vreinterpretq_s8_u8(vs.val[1])); + q3s.val[2] = vmulq_s8(vreinterpretq_s8_u8(vs.val[0]), vreinterpretq_s8_u32(aux32x4_2)); + q3s.val[3] = vmulq_s8(vreinterpretq_s8_u8(vs.val[1]), vreinterpretq_s8_u32(aux32x4_3)); const int32x4_t p1 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), q3s.val[0], q8b.val[0]), q3s.val[1], q8b.val[1]); const int32x4_t p2 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), q3s.val[2], q8b.val[2]), q3s.val[3], q8b.val[3]); +#if QK_K == 256 + sumi1 += vaddvq_s32(p1) * scales8[ib32/2+0]; + sumi2 += vaddvq_s32(p2) * scales8[ib32/2+4]; +#else sumi1 += vaddvq_s32(p1) * (1 + 2*(x[i].scales[ib32/2] & 0xf)); sumi2 += vaddvq_s32(p2) * (1 + 2*(x[i].scales[ib32/2] >> 4)); +#endif } sumf += d*(sumi1 + sumi2); } - *s = 0.25f * sumf; + *s = sumf; #elif defined(__AVX2__) @@ -10164,6 +10196,16 @@ void ggml_vec_dot_iq3_s_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const v const __m256i mask1 = _mm256_loadu_si256((const __m256i*)k_mask1); const __m256i mask2 = _mm256_loadu_si256((const __m256i*)k_mask2); + const __m256i idx_shift = _mm256_set_epi32(1, 2, 3, 4, 5, 6, 7, 8); + const __m256i idx_mask = _mm256_set1_epi32(256); + + typedef union { + __m256i vec[2]; + uint32_t index[16]; + } index_t; + + index_t idx; + __m256 accumf = _mm256_setzero_ps(); for (int i = 0; i < nb; ++i) { const 
float d = GGML_FP16_TO_FP32(x[i].d) * y[i].d; @@ -10176,24 +10218,25 @@ void ggml_vec_dot_iq3_s_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const v for (int ib32 = 0; ib32 < QK_K/32; ib32 += 2) { const __m256i q8_1 = _mm256_loadu_si256((const __m256i *)q8); q8 += 32; const __m256i q8_2 = _mm256_loadu_si256((const __m256i *)q8); q8 += 32; - const __m256i q2_1 = _mm256_set_epi32(iq3xs_grid[qs[7] | ((qh[ib32+0] << 1) & 256)], - iq3xs_grid[qs[6] | ((qh[ib32+0] << 2) & 256)], - iq3xs_grid[qs[5] | ((qh[ib32+0] << 3) & 256)], - iq3xs_grid[qs[4] | ((qh[ib32+0] << 4) & 256)], - iq3xs_grid[qs[3] | ((qh[ib32+0] << 5) & 256)], - iq3xs_grid[qs[2] | ((qh[ib32+0] << 6) & 256)], - iq3xs_grid[qs[1] | ((qh[ib32+0] << 7) & 256)], - iq3xs_grid[qs[0] | ((qh[ib32+0] << 8) & 256)]); - qs += 8; - const __m256i q2_2 = _mm256_set_epi32(iq3xs_grid[qs[7] | ((qh[ib32+1] << 1) & 256)], - iq3xs_grid[qs[6] | ((qh[ib32+1] << 2) & 256)], - iq3xs_grid[qs[5] | ((qh[ib32+1] << 3) & 256)], - iq3xs_grid[qs[4] | ((qh[ib32+1] << 4) & 256)], - iq3xs_grid[qs[3] | ((qh[ib32+1] << 5) & 256)], - iq3xs_grid[qs[2] | ((qh[ib32+1] << 6) & 256)], - iq3xs_grid[qs[1] | ((qh[ib32+1] << 7) & 256)], - iq3xs_grid[qs[0] | ((qh[ib32+1] << 8) & 256)]); - qs += 8; + const __m256i idx_l = _mm256_cvtepu8_epi16(_mm_loadu_si128((const __m128i *)qs)); qs += 16; + idx.vec[0] = _mm256_set1_epi32(qh[ib32+0]); + idx.vec[1] = _mm256_set1_epi32(qh[ib32+1]); + idx.vec[0] = _mm256_and_si256(_mm256_sllv_epi32(idx.vec[0], idx_shift), idx_mask); + idx.vec[1] = _mm256_and_si256(_mm256_sllv_epi32(idx.vec[1], idx_shift), idx_mask); + idx.vec[0] = _mm256_or_si256(idx.vec[0], _mm256_cvtepi16_epi32(_mm256_castsi256_si128(idx_l))); + idx.vec[1] = _mm256_or_si256(idx.vec[1], _mm256_cvtepi16_epi32(_mm256_extractf128_si256(idx_l, 1))); + + // At least on my CPU (Ryzen 7950X), using _mm256_i32gather_epi32 is slower than _mm256_set_epi32. Strange. 
+ //const __m256i q2_1 = _mm256_i32gather_epi32((const int *)iq3s_grid, idx.vec[0], 4); + //const __m256i q2_2 = _mm256_i32gather_epi32((const int *)iq3s_grid, idx.vec[1], 4); + const __m256i q2_1 = _mm256_set_epi32( + iq3s_grid[idx.index[7]], iq3s_grid[idx.index[6]], iq3s_grid[idx.index[5]], iq3s_grid[idx.index[4]], + iq3s_grid[idx.index[3]], iq3s_grid[idx.index[1]], iq3s_grid[idx.index[1]], iq3s_grid[idx.index[0]] + ); + const __m256i q2_2 = _mm256_set_epi32( + iq3s_grid[idx.index[15]], iq3s_grid[idx.index[14]], iq3s_grid[idx.index[13]], iq3s_grid[idx.index[12]], + iq3s_grid[idx.index[11]], iq3s_grid[idx.index[10]], iq3s_grid[idx.index[ 9]], iq3s_grid[idx.index[ 8]] + ); __m256i aux256 = _mm256_set1_epi32(signs[0] | (signs[1] << 16)); aux256 = _mm256_and_si256(_mm256_shuffle_epi8(aux256,mask1), mask2); @@ -10221,7 +10264,7 @@ void ggml_vec_dot_iq3_s_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const v } - *s = 0.25f * hsum_float_8(accumf); + *s = hsum_float_8(accumf); #else @@ -10238,8 +10281,8 @@ void ggml_vec_dot_iq3_s_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const v const uint32_t ls2 = 2*(x[i].scales[ib32/2] >> 4) + 1; int32_t sumi = 0; for (int l = 0; l < 4; ++l) { - const uint8_t * grid1 = (const uint8_t *)(iq3xs_grid + (qs[2*l+0] | ((qh[ib32+0] << (8-2*l)) & 256))); - const uint8_t * grid2 = (const uint8_t *)(iq3xs_grid + (qs[2*l+1] | ((qh[ib32+0] << (7-2*l)) & 256))); + const uint8_t * grid1 = (const uint8_t *)(iq3s_grid + (qs[2*l+0] | ((qh[ib32+0] << (8-2*l)) & 256))); + const uint8_t * grid2 = (const uint8_t *)(iq3s_grid + (qs[2*l+1] | ((qh[ib32+0] << (7-2*l)) & 256))); for (int j = 0; j < 4; ++j) { sumi += grid1[j] * q8[j+0] * (signs[l] & kmask_iq2xs[j+0] ? -1 : 1); sumi += grid2[j] * q8[j+4] * (signs[l] & kmask_iq2xs[j+4] ? -1 : 1); @@ -10251,8 +10294,8 @@ void ggml_vec_dot_iq3_s_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const v bsum += sumi * ls1; sumi = 0; for (int l = 0; l < 4; ++l) { - const uint8_t * grid1 = (const uint8_t *)(iq3xs_grid + (qs[2*l+0] | ((qh[ib32+1] << (8-2*l)) & 256))); - const uint8_t * grid2 = (const uint8_t *)(iq3xs_grid + (qs[2*l+1] | ((qh[ib32+1] << (7-2*l)) & 256))); + const uint8_t * grid1 = (const uint8_t *)(iq3s_grid + (qs[2*l+0] | ((qh[ib32+1] << (8-2*l)) & 256))); + const uint8_t * grid2 = (const uint8_t *)(iq3s_grid + (qs[2*l+1] | ((qh[ib32+1] << (7-2*l)) & 256))); for (int j = 0; j < 4; ++j) { sumi += grid1[j] * q8[j+0] * (signs[l] & kmask_iq2xs[j+0] ? -1 : 1); sumi += grid2[j] * q8[j+4] * (signs[l] & kmask_iq2xs[j+4] ? 
-1 : 1); @@ -10265,7 +10308,7 @@ void ggml_vec_dot_iq3_s_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const v } sumf += d * bsum; } - *s = 0.25f * sumf; + *s = sumf; #endif } @@ -11912,7 +11955,8 @@ static void quantize_row_iq3_s_impl(int block_size, const float * restrict x, vo } float best = 0; float scale = max/(2*kMaxQ-1); - for (int is = -15; is <= 15; ++is) { + for (int k = 0; k < bs4; ++k) is_on_grid[k] = false; + for (int is = -9; is <= 9; ++is) { float id = (2*kMaxQ-1+is*0.2f)/max; float this_scale = 1/id; for (int k = 0; k < bs4; ++k) { @@ -11948,7 +11992,7 @@ static void quantize_row_iq3_s_impl(int block_size, const float * restrict x, vo if (n_not_ongrid > 0 && scale > 0) { float id = 1/scale; for (int k = 0; k < bs4; ++k) { - if (is_on_grid[k]) continue; + //if (is_on_grid[k]) continue; uint16_t u = 0; for (int i = 0; i < 4; ++i) { int l = nearest_int(0.5f*(id*xval[4*k+i]-1)); @@ -12004,7 +12048,7 @@ static void quantize_row_iq3_s_impl(int block_size, const float * restrict x, vo } float d = max_scale/31; - y[ibl].d = GGML_FP32_TO_FP16(d); + y[ibl].d = GGML_FP32_TO_FP16(d * 1.033f); float id = 1/d; for (int ib = 0; ib < QK_K/block_size; ib += 2) { int l1 = nearest_int(0.5f*(id*scales[ib+0]-1)); From c7a0ad8ec9ebb5ddb1c1c80c82f2ee041c525d47 Mon Sep 17 00:00:00 2001 From: Jared Van Bortel Date: Sat, 2 Mar 2024 12:21:47 -0500 Subject: [PATCH 22/26] convert-hf : make model class definitions self-contained (#5825) --- convert-hf-to-gguf.py | 204 ++++++++++++++++++------------------ gguf-py/gguf/gguf_writer.py | 2 +- 2 files changed, 101 insertions(+), 105 deletions(-) diff --git a/convert-hf-to-gguf.py b/convert-hf-to-gguf.py index 28b92ac38..fa9d4f22f 100755 --- a/convert-hf-to-gguf.py +++ b/convert-hf-to-gguf.py @@ -8,9 +8,10 @@ import json import os import re import sys +from abc import ABC, abstractmethod from enum import IntEnum from pathlib import Path -from typing import TYPE_CHECKING, Any, ContextManager, Iterator, Sequence, cast +from typing import TYPE_CHECKING, Any, Callable, ContextManager, Iterator, Sequence, TypeVar, cast import numpy as np import torch @@ -35,8 +36,11 @@ class SentencePieceTokenTypes(IntEnum): UNUSED = 5 BYTE = 6 +AnyModel = TypeVar("AnyModel", bound="type[Model]") + +class Model(ABC): + _model_classes: dict[str, type[Model]] = {} -class Model: def __init__(self, dir_model: Path, ftype: int, fname_out: Path, is_big_endian: bool): self.dir_model = dir_model self.ftype = ftype @@ -47,10 +51,14 @@ class Model: self.num_parts = Model.count_model_parts(self.dir_model, ".safetensors" if self.is_safetensors else ".bin") self.part_names = self._get_part_names() self.hparams = Model.load_hparams(self.dir_model) - self.model_arch = self._get_model_architecture() self.gguf_writer = gguf.GGUFWriter(fname_out, gguf.MODEL_ARCH_NAMES[self.model_arch], endianess=self.endianess, use_temp_file=False) self.block_count = self.find_hparam(["n_layers", "num_hidden_layers", "n_layer"]) + @property + @abstractmethod + def model_arch(self) -> gguf.MODEL_ARCH: + pass + def find_hparam(self, keys: Sequence[str], optional: bool = False) -> Any: key = next((k for k in keys if k in self.hparams), None) if key is not None: @@ -176,55 +184,21 @@ class Model: with open(dir_model / "config.json", "r", encoding="utf-8") as f: return json.load(f) - @staticmethod - def from_model_architecture(model_architecture): - if model_architecture == "GPTNeoXForCausalLM": - return GPTNeoXModel - if model_architecture == "BloomForCausalLM": - return BloomModel - if model_architecture == 
"MPTForCausalLM": - return MPTModel - if model_architecture in ("BaichuanForCausalLM", "BaiChuanForCausalLM"): - return BaichuanModel - if model_architecture in ("FalconForCausalLM", "RWForCausalLM"): - return FalconModel - if model_architecture == "GPTBigCodeForCausalLM": - return StarCoderModel - if model_architecture == "GPTRefactForCausalLM": - return RefactModel - if model_architecture == "PersimmonForCausalLM": - return PersimmonModel - if model_architecture in ("StableLmForCausalLM", "StableLMEpochForCausalLM", "LlavaStableLMEpochForCausalLM"): - return StableLMModel - if model_architecture == "QWenLMHeadModel": - return QwenModel - if model_architecture == "Qwen2ForCausalLM": - return Model - if model_architecture == "MixtralForCausalLM": - return MixtralModel - if model_architecture == "GPT2LMHeadModel": - return GPT2Model - if model_architecture == "PhiForCausalLM": - return Phi2Model - if model_architecture == "PlamoForCausalLM": - return PlamoModel - if model_architecture == "CodeShellForCausalLM": - return CodeShellModel - if model_architecture == "OrionForCausalLM": - return OrionModel - if model_architecture == "InternLM2ForCausalLM": - return InternLM2Model - if model_architecture == "MiniCPMForCausalLM": - return MiniCPMModel - if model_architecture == "BertModel": - return BertModel - if model_architecture == "NomicBertModel": - return NomicBertModel - if model_architecture == "GemmaForCausalLM": - return GemmaModel - if model_architecture == "Starcoder2ForCausalLM": - return Model - return Model + @classmethod + def register(cls, *names: str) -> Callable[[AnyModel], AnyModel]: + assert names + def func(modelcls: type[Model]): + for name in names: + cls._model_classes[name] = modelcls + return modelcls + return func + + @classmethod + def from_model_architecture(cls, arch): + try: + return cls._model_classes[arch] + except KeyError: + raise NotImplementedError(f'Architecture {arch!r} not supported!') from None def _is_model_safetensors(self) -> bool: return Model.count_model_parts(self.dir_model, ".safetensors") > 0 @@ -239,57 +213,6 @@ class Model: return ("pytorch_model.bin",) return (f"pytorch_model-{n:05}-of-{self.num_parts:05}.bin" for n in range(1, self.num_parts + 1)) - def _get_model_architecture(self) -> gguf.MODEL_ARCH: - arch = self.hparams["architectures"][0] - if arch == "GPTNeoXForCausalLM": - return gguf.MODEL_ARCH.GPTNEOX - if arch == "BloomForCausalLM": - return gguf.MODEL_ARCH.BLOOM - if arch == "MPTForCausalLM": - return gguf.MODEL_ARCH.MPT - if arch in ("BaichuanForCausalLM", "BaiChuanForCausalLM"): - return gguf.MODEL_ARCH.BAICHUAN - if arch in ("FalconForCausalLM", "RWForCausalLM"): - return gguf.MODEL_ARCH.FALCON - if arch == "GPTBigCodeForCausalLM": - return gguf.MODEL_ARCH.STARCODER - if arch == "GPTRefactForCausalLM": - return gguf.MODEL_ARCH.REFACT - if arch == "PersimmonForCausalLM": - return gguf.MODEL_ARCH.PERSIMMON - if arch in ("StableLmForCausalLM", "StableLMEpochForCausalLM", "LlavaStableLMEpochForCausalLM"): - return gguf.MODEL_ARCH.STABLELM - if arch == "QWenLMHeadModel": - return gguf.MODEL_ARCH.QWEN - if arch == "Qwen2ForCausalLM": - return gguf.MODEL_ARCH.QWEN2 - if arch == "MixtralForCausalLM": - return gguf.MODEL_ARCH.LLAMA - if arch == "GPT2LMHeadModel": - return gguf.MODEL_ARCH.GPT2 - if arch == "PhiForCausalLM": - return gguf.MODEL_ARCH.PHI2 - if arch == "PlamoForCausalLM": - return gguf.MODEL_ARCH.PLAMO - if arch == "CodeShellForCausalLM": - return gguf.MODEL_ARCH.CODESHELL - if arch == "OrionForCausalLM": - return 
gguf.MODEL_ARCH.ORION - if arch == "InternLM2ForCausalLM": - return gguf.MODEL_ARCH.INTERNLM2 - if arch == "MiniCPMForCausalLM": - return gguf.MODEL_ARCH.MINICPM - if arch == "BertModel": - return gguf.MODEL_ARCH.BERT - if arch == "NomicBertModel": - return gguf.MODEL_ARCH.NOMIC_BERT - if arch == "GemmaForCausalLM": - return gguf.MODEL_ARCH.GEMMA - if arch == "Starcoder2ForCausalLM": - return gguf.MODEL_ARCH.STARCODER2 - - raise NotImplementedError(f'Architecture "{arch}" not supported!') - def _set_vocab_gpt2(self): dir_model = self.dir_model hparams = self.hparams @@ -457,7 +380,10 @@ class Model: special_vocab.add_to_gguf(self.gguf_writer) +@Model.register("GPTNeoXForCausalLM") class GPTNeoXModel(Model): + model_arch = gguf.MODEL_ARCH.GPTNEOX + def set_gguf_parameters(self): block_count = self.hparams["num_hidden_layers"] @@ -474,7 +400,10 @@ class GPTNeoXModel(Model): self.gguf_writer.add_layer_norm_eps(self.hparams["layer_norm_eps"]) +@Model.register("BloomForCausalLM") class BloomModel(Model): + model_arch = gguf.MODEL_ARCH.BLOOM + def set_gguf_parameters(self): self.gguf_writer.add_name("Bloom") n_embed = self.hparams.get("hidden_size", self.hparams.get("n_embed")) @@ -566,7 +495,10 @@ class BloomModel(Model): print(name, f"=> output.weight, shape = {data.shape}, {old_dtype} --> {data.dtype}") +@Model.register("MPTForCausalLM") class MPTModel(Model): + model_arch = gguf.MODEL_ARCH.MPT + def set_gguf_parameters(self): block_count = self.hparams["n_layers"] self.gguf_writer.add_name(self.dir_model.name) @@ -629,7 +561,10 @@ class MPTModel(Model): self.gguf_writer.add_tensor(new_name, data) +@Model.register("OrionForCausalLM") class OrionModel(Model): + model_arch = gguf.MODEL_ARCH.ORION + def set_vocab(self): self._set_vocab_sentencepiece() @@ -708,7 +643,10 @@ class OrionModel(Model): self.gguf_writer.add_tensor(new_name, data) +@Model.register("BaichuanForCausalLM", "BaiChuanForCausalLM") class BaichuanModel(Model): + model_arch = gguf.MODEL_ARCH.BAICHUAN + def set_vocab(self): self._set_vocab_sentencepiece() @@ -823,7 +761,10 @@ class BaichuanModel(Model): return weights[r * n_part:r * n_part + r, ...] 
+@Model.register("FalconForCausalLM", "RWForCausalLM") class FalconModel(Model): + model_arch = gguf.MODEL_ARCH.FALCON + def set_gguf_parameters(self): block_count = self.hparams.get("num_hidden_layers") if block_count is None: @@ -916,7 +857,10 @@ class FalconModel(Model): self.gguf_writer.add_tensor(new_name, data) +@Model.register("GPTBigCodeForCausalLM") class StarCoderModel(Model): + model_arch = gguf.MODEL_ARCH.STARCODER + def set_gguf_parameters(self): block_count = self.hparams["n_layer"] @@ -931,7 +875,10 @@ class StarCoderModel(Model): self.gguf_writer.add_file_type(self.ftype) +@Model.register("GPTRefactForCausalLM") class RefactModel(Model): + model_arch = gguf.MODEL_ARCH.REFACT + def set_gguf_parameters(self): hidden_dim = self.hparams["n_embd"] inner_dim = 4 * hidden_dim @@ -1015,7 +962,10 @@ class RefactModel(Model): self.gguf_writer.add_tensor(new_name, data) +@Model.register("PersimmonForCausalLM") class PersimmonModel(Model): + model_arch = gguf.MODEL_ARCH.PERSIMMON + def set_gguf_parameters(self): block_count = self.hparams.get("num_layers", self.hparams.get("num_hidden_layers")) head_count = self.hparams["num_attention_heads"] @@ -1063,7 +1013,10 @@ class PersimmonModel(Model): self.gguf_writer.add_tensor(new_name, data) +@Model.register("StableLmForCausalLM", "StableLMEpochForCausalLM", "LlavaStableLMEpochForCausalLM") class StableLMModel(Model): + model_arch = gguf.MODEL_ARCH.STABLELM + def set_vocab(self): if (self.dir_model / "tokenizer.json").is_file(): self._set_vocab_gpt2() @@ -1087,12 +1040,18 @@ class StableLMModel(Model): self.gguf_writer.add_layer_norm_eps(self.find_hparam(["layer_norm_eps", "norm_eps"])) +@Model.register("MixtralForCausalLM") class MixtralModel(Model): + model_arch = gguf.MODEL_ARCH.LLAMA + def set_vocab(self): self._set_vocab_sentencepiece() +@Model.register("MiniCPMForCausalLM") class MiniCPMModel(Model): + model_arch = gguf.MODEL_ARCH.MINICPM + def set_gguf_parameters(self): block_count = self.hparams["num_hidden_layers"] self.gguf_writer.add_name("MiniCPM") @@ -1169,7 +1128,10 @@ class MiniCPMModel(Model): self.gguf_writer.add_tensor(new_name, data) +@Model.register("QWenLMHeadModel") class QwenModel(Model): + model_arch = gguf.MODEL_ARCH.QWEN + @staticmethod def token_bytes_to_string(b): from transformers.models.gpt2.tokenization_gpt2 import bytes_to_unicode @@ -1249,7 +1211,15 @@ class QwenModel(Model): self.gguf_writer.add_tensor(new_name, data) +@Model.register("Qwen2ForCausalLM") +class Qwen2Model(Model): + model_arch = gguf.MODEL_ARCH.QWEN2 + + +@Model.register("GPT2LMHeadModel") class GPT2Model(Model): + model_arch = gguf.MODEL_ARCH.GPT2 + def set_gguf_parameters(self): self.gguf_writer.add_name(self.dir_model.name) self.gguf_writer.add_block_count(self.hparams["n_layer"]) @@ -1311,7 +1281,10 @@ class GPT2Model(Model): self.gguf_writer.add_tensor("output.weight", data) +@Model.register("PhiForCausalLM") class Phi2Model(Model): + model_arch = gguf.MODEL_ARCH.PHI2 + def set_gguf_parameters(self): block_count = self.find_hparam(["num_hidden_layers", "n_layer"]) @@ -1333,7 +1306,10 @@ class Phi2Model(Model): self.gguf_writer.add_add_bos_token(False) +@Model.register("PlamoForCausalLM") class PlamoModel(Model): + model_arch = gguf.MODEL_ARCH.PLAMO + def set_vocab(self): self._set_vocab_sentencepiece() @@ -1412,7 +1388,10 @@ class PlamoModel(Model): self.gguf_writer.add_tensor(new_name, data) +@Model.register("CodeShellForCausalLM") class CodeShellModel(Model): + model_arch = gguf.MODEL_ARCH.CODESHELL + def set_gguf_parameters(self): 
block_count = self.hparams["n_layer"] @@ -1477,7 +1456,10 @@ class CodeShellModel(Model): print(name, f"=> output.weight, shape = {data.shape}, {old_dtype} --> {data.dtype}") +@Model.register("InternLM2ForCausalLM") class InternLM2Model(Model): + model_arch = gguf.MODEL_ARCH.INTERNLM2 + def set_vocab(self): # (TODO): Is there a better way? # Copy from _set_vocab_sentencepiece, The only difference is that we will treat the character @@ -1649,7 +1631,10 @@ in chat mode so that the conversation can end normally.") self.post_write_tensors(tensor_map, name, data_torch) +@Model.register("BertModel") class BertModel(Model): + model_arch = gguf.MODEL_ARCH.BERT + def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) self.vocab_size = None @@ -1679,7 +1664,7 @@ class BertModel(Model): else: raise NotImplementedError("Only MEAN and CLS pooling types supported") - self.gguf_writer.add_pooling_type(pooling_type.value) + self.gguf_writer.add_pooling_type(pooling_type) def set_vocab(self): path = self.dir_model @@ -1755,7 +1740,10 @@ class BertModel(Model): self.gguf_writer.add_tensor(new_name, data) +@Model.register("NomicBertModel") class NomicBertModel(BertModel): + model_arch = gguf.MODEL_ARCH.NOMIC_BERT + def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) @@ -1792,7 +1780,10 @@ class NomicBertModel(BertModel): yield name, data +@Model.register("GemmaForCausalLM") class GemmaModel(Model): + model_arch = gguf.MODEL_ARCH.GEMMA + def set_vocab(self): self._set_vocab_sentencepiece() @@ -1848,6 +1839,11 @@ class GemmaModel(Model): self.gguf_writer.add_tensor(new_name, data) +@Model.register("Starcoder2ForCausalLM") +class StarCoder2Model(Model): + model_arch = gguf.MODEL_ARCH.STARCODER2 + + ###### CONVERSION LOGIC ###### diff --git a/gguf-py/gguf/gguf_writer.py b/gguf-py/gguf/gguf_writer.py index e4681475c..801160832 100644 --- a/gguf-py/gguf/gguf_writer.py +++ b/gguf-py/gguf/gguf_writer.py @@ -362,7 +362,7 @@ class GGUFWriter: self.add_bool(Keys.Attention.CAUSAL.format(arch=self.arch), value) def add_pooling_type(self, value: PoolingType) -> None: - self.add_uint32(Keys.LLM.POOLING_TYPE.format(arch=self.arch), value) + self.add_uint32(Keys.LLM.POOLING_TYPE.format(arch=self.arch), value.value) def add_rope_dimension_count(self, count: int) -> None: self.add_uint32(Keys.Rope.DIMENSION_COUNT.format(arch=self.arch), count) From 4d4d2366fc9c54d4a275065cfe9299c6cf7c5b78 Mon Sep 17 00:00:00 2001 From: Jared Van Bortel Date: Sat, 2 Mar 2024 12:27:26 -0500 Subject: [PATCH 23/26] convert : automatically fall back to HfVocab if tokenizer.model doesn't exist (#5821) --- README.md | 4 +- convert-llama-ggml-to-gguf.py | 6 +-- convert.py | 72 +++++++++++++++++------------------ examples/infill/infill.cpp | 4 +- 4 files changed, 41 insertions(+), 45 deletions(-) diff --git a/README.md b/README.md index 67717c1e3..939646753 100644 --- a/README.md +++ b/README.md @@ -786,7 +786,7 @@ And after 4.45 hours, you will have the final perplexity. ### Interactive mode If you want a more ChatGPT-like experience, you can run in interactive mode by passing `-i` as a parameter. -In this mode, you can always interrupt generation by pressing Ctrl+C and entering one or more lines of text, which will be converted into tokens and appended to the current context. You can also specify a *reverse prompt* with the parameter `-r "reverse prompt string"`. This will result in user input being prompted whenever the exact tokens of the reverse prompt string are encountered in the generation. 
A typical use is to use a prompt that makes LLaMa emulate a chat between multiple users, say Alice and Bob, and pass `-r "Alice:"`. +In this mode, you can always interrupt generation by pressing Ctrl+C and entering one or more lines of text, which will be converted into tokens and appended to the current context. You can also specify a *reverse prompt* with the parameter `-r "reverse prompt string"`. This will result in user input being prompted whenever the exact tokens of the reverse prompt string are encountered in the generation. A typical use is to use a prompt that makes LLaMA emulate a chat between multiple users, say Alice and Bob, and pass `-r "Alice:"`. Here is an example of a few-shot interaction, invoked with the command @@ -850,7 +850,7 @@ Sample run: ``` == Running in interactive mode. == - Press Ctrl+C to interject at any time. - - Press Return to return control to LLaMa. + - Press Return to return control to LLaMA. - If you want to submit another line, end your input in '\'. Below is an instruction that describes a task. Write a response that appropriately completes the request. diff --git a/convert-llama-ggml-to-gguf.py b/convert-llama-ggml-to-gguf.py index b33108062..cd9644fcb 100755 --- a/convert-llama-ggml-to-gguf.py +++ b/convert-llama-ggml-to-gguf.py @@ -373,7 +373,7 @@ def handle_metadata(cfg, hp): raise ValueError('Unable to load metadata') vocab_path = Path(cfg.vocab_dir if cfg.vocab_dir is not None else cfg.model_metadata_dir) vocab_factory = convert.VocabFactory(vocab_path) - vocab, special_vocab = vocab_factory.load_vocab(cfg.vocabtype, cfg.model_metadata_dir) + vocab, special_vocab = vocab_factory.load_vocab(cfg.vocabtype.split(","), cfg.model_metadata_dir) convert.check_vocab_size(params, vocab) return params, vocab, special_vocab @@ -398,8 +398,8 @@ def handle_args(): help ='Load HuggingFace/.pth vocab and metadata from the specified directory') parser.add_argument("--vocab-dir", type=Path, help="directory containing tokenizer.model, if separate from model file - only meaningful with --model-metadata-dir") - parser.add_argument("--vocabtype", choices=["spm", "bpe"], default="spm", - help="vocab format - only meaningful with --model-metadata-dir and/or --vocab-dir (default: spm)") + parser.add_argument("--vocabtype", default="spm,hfft", + help="vocab format - only meaningful with --model-metadata-dir and/or --vocab-dir (default: spm,hfft)") return parser.parse_args() diff --git a/convert.py b/convert.py index 63a0a5d78..6e3a0319b 100755 --- a/convert.py +++ b/convert.py @@ -1282,35 +1282,32 @@ def load_some_model(path: Path) -> ModelPlus: class VocabFactory: + _FILES = {"spm": "tokenizer.model", "bpe": "vocab.json", "hfft": "tokenizer.json"} + def __init__(self, path: Path): self.path = path - self.files: dict[str, Path | None] = { - "tokenizer.model": None, - "vocab.json": None, - "tokenizer.json": None, - } - self._detect_files() + self.file_paths = self._detect_files() + print(f"Found vocab files: {self.file_paths}") - def _detect_files(self): - for file in self.files.keys(): - file_path = self.path / file - parent_file_path = self.path.parent / file - if file_path.exists(): - self.files[file] = file_path - elif parent_file_path.exists(): - self.files[file] = parent_file_path - print(f"Found vocab files: {self.files}") + def _detect_files(self) -> dict[str, Path | None]: + def locate(file: str) -> Path | None: + if (path := self.path / file).exists(): + return path + if (path := self.path.parent / file).exists(): + return path + return None - def 
_select_file(self, vocabtype: str | None) -> Path: - if vocabtype in ["spm", "bpe"]: - for file_key in self.files.keys(): - if (file := self.files[file_key]) is not None: - return file - raise FileNotFoundError(f"{vocabtype} vocab not found.") - if vocabtype == "hfft": - # For Hugging Face Fast Tokenizer, return the directory path instead of a specific file - return self.path - raise ValueError(f"Unsupported vocabulary type {vocabtype}") + return {vt: locate(f) for vt, f in self._FILES.items()} + + def _select_file(self, vocab_types: list[str]) -> tuple[str, Path]: + for vtype in vocab_types: + try: + path = self.file_paths[vtype] + except KeyError: + raise ValueError(f"Unsupported vocabulary type {vtype}") from None + if path is not None: + return vtype, path + raise FileNotFoundError(f"Could not find any of {[self._FILES[vt] for vt in vocab_types]}") def _create_special_vocab(self, vocab: Vocab, vocabtype: str, model_parent_path: Path) -> gguf.SpecialVocab: load_merges = vocabtype == "bpe" @@ -1322,30 +1319,30 @@ class VocabFactory: n_vocab=n_vocab, ) - def load_vocab(self, vocabtype: str, model_parent_path: Path) -> tuple[Vocab, gguf.SpecialVocab]: - path = self._select_file(vocabtype) - print(f"Loading vocab file '{path}', type '{vocabtype}'") + def load_vocab(self, vocab_types: list[str], model_parent_path: Path) -> tuple[Vocab, gguf.SpecialVocab]: + vocab_type, path = self._select_file(vocab_types) + print(f"Loading vocab file {path!r}, type {vocab_type!r}") added_tokens_path = path.parent / "added_tokens.json" vocab: Vocab - if vocabtype == "bpe": + if vocab_type == "bpe": vocab = BpeVocab( path, added_tokens_path if added_tokens_path.exists() else None ) - elif vocabtype == "spm": + elif vocab_type == "spm": vocab = SentencePieceVocab( path, added_tokens_path if added_tokens_path.exists() else None ) - elif vocabtype == "hfft": + elif vocab_type == "hfft": vocab = HfVocab( - path, added_tokens_path if added_tokens_path.exists() else None + path.parent, added_tokens_path if added_tokens_path.exists() else None ) else: - raise ValueError(f"Unsupported vocabulary type {vocabtype}") + raise ValueError(vocab_type) # FIXME: Respect --vocab-dir? special_vocab = self._create_special_vocab( vocab, - vocabtype, + vocab_type, model_parent_path, ) return vocab, special_vocab @@ -1379,15 +1376,14 @@ def main(args_in: list[str] | None = None) -> None: if np.uint32(1) == np.uint32(1).newbyteorder("<"): # We currently only support Q8_0 output on little endian systems. 
output_choices.append("q8_0") - vocab_types = ["spm", "bpe", "hfft"] - parser = argparse.ArgumentParser(description="Convert a LLaMa model to a GGML compatible file") + parser = argparse.ArgumentParser(description="Convert a LLaMA model to a GGML compatible file") parser.add_argument("--awq-path", type=Path, help="Path to scale awq cache file", default=None) parser.add_argument("--dump", action="store_true", help="don't convert, just show what's in the model") parser.add_argument("--dump-single", action="store_true", help="don't convert, just show what's in a single model file") parser.add_argument("--vocab-only", action="store_true", help="extract only the vocab") parser.add_argument("--outtype", choices=output_choices, help="output format - note: q8_0 may be very slow (default: f16 or f32 based on input)") parser.add_argument("--vocab-dir", type=Path, help="directory containing tokenizer.model, if separate from model file") - parser.add_argument("--vocab-type", choices=vocab_types, help="The vocabulary format used to define the tokenizer model (default: spm)", default="spm") + parser.add_argument("--vocab-type", help="vocab types to try in order, choose from 'spm', 'bpe', 'hfft' (default: spm,hfft)", default="spm,hfft") parser.add_argument("--outfile", type=Path, help="path to write to; default: based on input") parser.add_argument("model", type=Path, help="directory containing model file, or model file itself (*.pth, *.pt, *.bin)") parser.add_argument("--ctx", type=int, help="model training context (default: based on input)") @@ -1448,7 +1444,7 @@ def main(args_in: list[str] | None = None) -> None: model_parent_path = model_plus.paths[0].parent vocab_path = Path(args.vocab_dir or args.model or model_parent_path) vocab_factory = VocabFactory(vocab_path) - vocab, special_vocab = vocab_factory.load_vocab(args.vocab_type, model_parent_path) + vocab, special_vocab = vocab_factory.load_vocab(args.vocab_type.split(","), model_parent_path) if args.vocab_only: if not args.outfile: diff --git a/examples/infill/infill.cpp b/examples/infill/infill.cpp index d4b8729dd..91c39c5ae 100644 --- a/examples/infill/infill.cpp +++ b/examples/infill/infill.cpp @@ -378,10 +378,10 @@ int main(int argc, char ** argv) { if (params.interactive) { const char *control_message; if (params.multiline_input) { - control_message = " - To return control to LLaMa, end your input with '\\'.\n" + control_message = " - To return control to LLaMA, end your input with '\\'.\n" " - To return control without starting a new line, end your input with '/'.\n"; } else { - control_message = " - Press Return to return control to LLaMa.\n" + control_message = " - Press Return to return control to LLaMA.\n" " - To return control without starting a new line, end your input with '/'.\n" " - If you want to submit another line, end your input with '\\'.\n"; } From 494c87032613e31c0be99b2735e732871f2c4e4d Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sat, 2 Mar 2024 20:00:49 +0200 Subject: [PATCH 24/26] ggml : fix IQ3_S AVX implementation (#5834) ggml-ci --- ggml-quants.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ggml-quants.c b/ggml-quants.c index 492a1b9a6..2a8881d73 100644 --- a/ggml-quants.c +++ b/ggml-quants.c @@ -10231,7 +10231,7 @@ void ggml_vec_dot_iq3_s_q8_K (int n, float * GGML_RESTRICT s, size_t bs, const v //const __m256i q2_2 = _mm256_i32gather_epi32((const int *)iq3s_grid, idx.vec[1], 4); const __m256i q2_1 = _mm256_set_epi32( iq3s_grid[idx.index[7]], iq3s_grid[idx.index[6]], 
iq3s_grid[idx.index[5]], iq3s_grid[idx.index[4]], - iq3s_grid[idx.index[3]], iq3s_grid[idx.index[1]], iq3s_grid[idx.index[1]], iq3s_grid[idx.index[0]] + iq3s_grid[idx.index[3]], iq3s_grid[idx.index[2]], iq3s_grid[idx.index[1]], iq3s_grid[idx.index[0]] ); const __m256i q2_2 = _mm256_set_epi32( iq3s_grid[idx.index[15]], iq3s_grid[idx.index[14]], iq3s_grid[idx.index[13]], iq3s_grid[idx.index[12]], From 4a6e2d6142ab815c964924896891e9ab3e050632 Mon Sep 17 00:00:00 2001 From: Michael Podvitskiy Date: Sat, 2 Mar 2024 20:52:25 +0100 Subject: [PATCH 25/26] llama : add abort_callback to interrupt computation (#5409) * using abort_callback from ggml to stop llama computation * format fix * a brief explaining comment --------- Co-authored-by: Georgi Gerganov --- llama.cpp | 18 ++++++++++++++++-- llama.h | 13 +++++++++++-- 2 files changed, 27 insertions(+), 4 deletions(-) diff --git a/llama.cpp b/llama.cpp index 697e85e89..d4c7a965b 100644 --- a/llama.cpp +++ b/llama.cpp @@ -1987,6 +1987,9 @@ struct llama_context { std::vector buf_compute_meta; ggml_backend_sched_t sched = nullptr; + ggml_abort_callback abort_callback = nullptr; + void * abort_callback_data = nullptr; + // input tensors ggml_backend_buffer_t buf_input = nullptr; ggml_context * ctx_input = nullptr; @@ -8071,6 +8074,7 @@ static void llama_graph_compute( if (lctx.backend_cpu != nullptr) { ggml_backend_cpu_set_n_threads(lctx.backend_cpu, n_threads); + ggml_backend_cpu_set_abort_callback(lctx.backend_cpu, lctx.abort_callback, lctx.abort_callback_data); } ggml_backend_sched_graph_compute(lctx.sched, gf); @@ -11856,6 +11860,8 @@ struct llama_context_params llama_context_default_params() { /*.embedding =*/ false, /*.offload_kqv =*/ true, /*.do_pooling =*/ true, + /*.abort_callback =*/ nullptr, + /*.abort_callback_data =*/ nullptr, }; return result; @@ -12038,8 +12044,11 @@ struct llama_context * llama_new_context_with_model( LLAMA_LOG_INFO("%s: freq_base = %.1f\n", __func__, cparams.rope_freq_base); LLAMA_LOG_INFO("%s: freq_scale = %g\n", __func__, cparams.rope_freq_scale); - ctx->rng = std::mt19937(params.seed); - ctx->logits_all = params.logits_all; + ctx->abort_callback = params.abort_callback; + ctx->abort_callback_data = params.abort_callback_data; + + ctx->rng = std::mt19937(params.seed); + ctx->logits_all = params.logits_all; const ggml_type type_k = params.type_k; const ggml_type type_v = params.type_v; @@ -12989,6 +12998,11 @@ void llama_set_n_threads(struct llama_context * ctx, uint32_t n_threads, uint32_ ctx->cparams.n_threads_batch = n_threads_batch; } +void llama_set_abort_callback(struct llama_context * ctx, bool (*abort_callback)(void * data), void * abort_callback_data) { + ctx->abort_callback = abort_callback; + ctx->abort_callback_data = abort_callback_data; +} + struct llama_batch llama_batch_get_one( llama_token * tokens, int32_t n_tokens, diff --git a/llama.h b/llama.h index ed51f478a..6406b5270 100644 --- a/llama.h +++ b/llama.h @@ -255,10 +255,16 @@ extern "C" { enum ggml_type type_v; // data type for V cache // Keep the booleans together to avoid misalignment during copy-by-value. 
- bool logits_all; // the llama_eval() call computes all logits, not just the last one (DEPRECATED - set llama_batch.logits instead) + bool logits_all; // the llama_decode() call computes all logits, not just the last one (DEPRECATED - set llama_batch.logits instead) bool embedding; // embedding mode only bool offload_kqv; // whether to offload the KQV ops (including the KV cache) to GPU bool do_pooling; // whether to pool (sum) embedding results by sequence id (ignored if no pooling layer) + + // Abort callback + // if it returns true, execution of llama_decode() will be aborted + // currently works only with CPU execution + ggml_abort_callback abort_callback; + void * abort_callback_data; }; // model quantization parameters @@ -632,7 +638,10 @@ extern "C" { // n_threads_batch is the number of threads used for prompt and batch processing (multiple tokens) LLAMA_API void llama_set_n_threads(struct llama_context * ctx, uint32_t n_threads, uint32_t n_threads_batch); - // Token logits obtained from the last call to llama_eval() + // Set abort callback + LLAMA_API void llama_set_abort_callback(struct llama_context * ctx, ggml_abort_callback abort_callback, void * abort_callback_data); + + // Token logits obtained from the last call to llama_decode() // The logits for the last token are stored in the last row // Logits for which llama_batch.logits[i] == 0 are undefined // Rows: n_tokens provided with llama_batch From 9731134296af3a6839cd682e51d9c2109a871de5 Mon Sep 17 00:00:00 2001 From: Pierrick Hymbert Date: Sat, 2 Mar 2024 22:00:14 +0100 Subject: [PATCH 26/26] server: tests: passkey challenge / self-extend with context shift demo (#5832) * server: tests: add models endpoint scenario * server: /v1/models add some metadata * server: tests: add debug field in context before scenario * server: tests: download model from HF, add batch size * server: tests: add passkey test * server: tests: add group attention params * server: do not truncate prompt tokens if self-extend through group attention is enabled * server: logs: do not truncate log values * server: tests - passkey - first good working value of nga * server: tests: fix server timeout * server: tests: fix passkey, add doc, fix regex content matching, fix timeout * server: tests: fix regex content matching * server: tests: schedule slow tests on master * server: metrics: fix when no prompt processed * server: tests: self-extend add llama-2-7B and Mixtral-8x7B-v0.1 * server: tests: increase timeout for completion * server: tests: keep only the PHI-2 test * server: tests: passkey add a negative test --- .github/workflows/server.yml | 17 +- examples/server/server.cpp | 46 +++- examples/server/tests/README.md | 50 +++- examples/server/tests/features/environment.py | 5 +- examples/server/tests/features/issues.feature | 1 + .../server/tests/features/parallel.feature | 5 +- .../server/tests/features/passkey.feature | 55 ++++ .../server/tests/features/security.feature | 3 +- examples/server/tests/features/server.feature | 23 +- examples/server/tests/features/steps/steps.py | 259 ++++++++++++++---- .../tests/features/wrong_usages.feature | 5 +- examples/server/tests/requirements.txt | 1 + examples/server/tests/tests.sh | 2 +- examples/server/utils.hpp | 3 +- 14 files changed, 363 insertions(+), 112 deletions(-) create mode 100644 examples/server/tests/features/passkey.feature diff --git a/.github/workflows/server.yml b/.github/workflows/server.yml index 0b6f6669b..8c6312508 100644 --- a/.github/workflows/server.yml +++ b/.github/workflows/server.yml 
@@ -10,6 +10,8 @@ on: pull_request: types: [opened, synchronize, reopened] paths: ['.github/workflows/server.yml', '**/CMakeLists.txt', '**/Makefile', '**/*.h', '**/*.hpp', '**/*.c', '**/*.cpp', '**/*.cu', '**/*.swift', '**/*.m', 'examples/server/tests/**.*'] + schedule: + - cron: '00 0 * * *' jobs: server: @@ -70,14 +72,15 @@ jobs: run: | pip install -r examples/server/tests/requirements.txt - - name: Download models - id: download_models - run: | - cd examples/server/tests - ../../../scripts/hf.sh --repo ggml-org/models --file tinyllamas/stories260K.gguf - - name: Tests - id: server_integration_test + id: server_integration_tests run: | cd examples/server/tests PORT=8888 ./tests.sh + + - name: Slow tests + id: server_integration_tests_slow + if: github.event.schedule != '' + run: | + cd examples/server/tests + PORT=8888 ./tests.sh --stop --no-skipped --no-capture --tags slow diff --git a/examples/server/server.cpp b/examples/server/server.cpp index 2b2f4a0f4..52daf9e7a 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -441,8 +441,8 @@ struct llama_server_context const int ga_w = params.grp_attn_w; if (ga_n != 1) { - GGML_ASSERT(ga_n > 0 && "ga_n must be positive"); // NOLINT - GGML_ASSERT(ga_w % ga_n == 0 && "ga_w must be a multiple of ga_n"); // NOLINT + GGML_ASSERT(ga_n > 0 && "ga_n must be positive"); // NOLINT + GGML_ASSERT(ga_w % ga_n == 0 && "ga_w must be a multiple of ga_n"); // NOLINT //GGML_ASSERT(n_ctx_train % ga_w == 0 && "n_ctx_train must be a multiple of ga_w"); // NOLINT //GGML_ASSERT(n_ctx >= n_ctx_train * ga_n && "n_ctx must be at least n_ctx_train * ga_n"); // NOLINT @@ -1709,8 +1709,8 @@ struct llama_server_context } slot.params.n_keep = std::min(slot.n_ctx - 4, slot.params.n_keep); - // if input prompt is too big, truncate it - if (slot.n_prompt_tokens >= slot.n_ctx) + // if input prompt is too big, truncate it, if group attention self-extend is disabled + if (slot.ga_n == 1 && slot.n_prompt_tokens >= slot.n_ctx) { const int n_left = slot.n_ctx - slot.params.n_keep; const int n_block_size = n_left / 2; @@ -1785,9 +1785,11 @@ struct llama_server_context } LOG_INFO("slot progression", { - { "slot_id", slot.id }, - { "task_id", slot.task_id }, - { "n_past", slot.n_past }, + { "slot_id", slot.id }, + { "task_id", slot.task_id }, + { "n_past", slot.n_past }, + { "n_past_se", slot.n_past_se }, + { "ga_i", slot.ga_i }, { "n_prompt_tokens_processed", slot.n_prompt_tokens_processed } }); } @@ -2001,6 +2003,17 @@ struct llama_server_context LOG_VERBOSE("slots updated", {}); return true; } + + json model_meta() { + return json{ + {"vocab_type", llama_vocab_type(model)}, + {"n_vocab", llama_n_vocab(model)}, + {"n_ctx_train", llama_n_ctx_train(model)}, + {"n_embd", llama_n_embd(model)}, + {"n_params", llama_model_n_params(model)}, + {"size", llama_model_size(model)}, + }; + } }; static void server_print_usage(const char *argv0, const gpt_params ¶ms, @@ -2911,9 +2924,10 @@ int main(int argc, char **argv) for (const auto& metric_def : metrics_def) { std::string name = metric_def["name"]; std::string help = metric_def["help"]; - prometheus << "# HELP llamacpp:" << name << " " << help << "\n" - << "# TYPE llamacpp:" << name << " " << type << "\n" - << "llamacpp:" << name << " " << metric_def["value"] << "\n"; + auto value = json_value(metric_def, "value", 0); + prometheus << "# HELP llamacpp:" << name << " " << help << "\n" + << "# TYPE llamacpp:" << name << " " << type << "\n" + << "llamacpp:" << name << " " << value << "\n"; } } @@ -2994,6 +3008,7 @@ int 
             state.store(SERVER_STATE_READY);
             LOG_INFO("model loaded", {});
         }
+        const auto model_meta = llama.model_meta();
 
         if (sparams.chat_template.empty()) { // custom chat template is not supplied
             // check if the template comes with the model is supported by us
@@ -3143,7 +3158,7 @@ int main(int argc, char **argv)
                 }
             });
 
-    svr.Get("/v1/models", [&params](const httplib::Request& req, httplib::Response& res)
+    svr.Get("/v1/models", [&params, &model_meta](const httplib::Request& req, httplib::Response& res)
             {
                 res.set_header("Access-Control-Allow-Origin", req.get_header_value("Origin"));
                 std::time_t t = std::time(0);
@@ -3152,10 +3167,11 @@ int main(int argc, char **argv)
                     {"object", "list"},
                     {"data", {
                         {
-                            {"id", params.model_alias},
-                            {"object", "model"},
-                            {"created", t},
-                            {"owned_by", "llamacpp"}
+                            {"id", params.model_alias},
+                            {"object", "model"},
+                            {"created", t},
+                            {"owned_by", "llamacpp"},
+                            {"meta", model_meta}
                         },
                     }}
                 };
diff --git a/examples/server/tests/README.md b/examples/server/tests/README.md
index 0b9fdc4e7..95a0353b6 100644
--- a/examples/server/tests/README.md
+++ b/examples/server/tests/README.md
@@ -1,22 +1,30 @@
 # Server tests
 
-Python based server tests scenario using [BDD](https://en.wikipedia.org/wiki/Behavior-driven_development) and [behave](https://behave.readthedocs.io/en/latest/):
- * [issues.feature](./features/issues.feature) Pending issues scenario
- * [parallel.feature](./features/parallel.feature) Scenario involving multi slots and concurrent requests
- * [security.feature](./features/security.feature) Security, CORS and API Key
- * [server.feature](./features/server.feature) Server base scenario: completion, embedding, tokenization, etc...
+Python based server tests scenario using [BDD](https://en.wikipedia.org/wiki/Behavior-driven_development)
+and [behave](https://behave.readthedocs.io/en/latest/):
+
+* [issues.feature](./features/issues.feature) Pending issues scenario
+* [parallel.feature](./features/parallel.feature) Scenario involving multi slots and concurrent requests
+* [security.feature](./features/security.feature) Security, CORS and API Key
+* [server.feature](./features/server.feature) Server base scenario: completion, embedding, tokenization, etc...
 
 Tests target GitHub workflows job runners with 4 vCPU.
 
-Requests are using [aiohttp](https://docs.aiohttp.org/en/stable/client_reference.html), [asyncio](https://docs.python.org/fr/3/library/asyncio.html) based http client.
+Requests are
+using [aiohttp](https://docs.aiohttp.org/en/stable/client_reference.html), [asyncio](https://docs.python.org/fr/3/library/asyncio.html)
+based http client.
 
-Note: If the host architecture inference speed is faster than GitHub runners one, parallel scenario may randomly fail. To mitigate it, you can increase values in `n_predict`, `kv_size`.
+Note: If the host architecture inference speed is faster than GitHub runners one, parallel scenario may randomly fail.
+To mitigate it, you can increase values in `n_predict`, `kv_size`.
 
 ### Install dependencies
+
 `pip install -r requirements.txt`
 
 ### Run tests
+
 1. Build the server
+
 ```shell
 cd ../../..
 mkdir build
@@ -24,24 +32,36 @@ cd build
 cmake ../
 cmake --build . --target server
 ```
-2. download required models:
-   1. `../../../scripts/hf.sh --repo ggml-org/models --file tinyllamas/stories260K.gguf`
-3. Start the test: `./tests.sh`
+
+2. 
Start the test: `./tests.sh` It's possible to override some scenario steps values with environment variables: - - `PORT` -> `context.server_port` to set the listening port of the server during scenario, default: `8080` - - `LLAMA_SERVER_BIN_PATH` -> to change the server binary path, default: `../../../build/bin/server` - - `DEBUG` -> "ON" to enable steps and server verbose mode `--verbose` - - `SERVER_LOG_FORMAT_JSON` -> if set switch server logs to json format + +| variable | description | +|--------------------------|------------------------------------------------------------------------------------------------| +| `PORT` | `context.server_port` to set the listening port of the server during scenario, default: `8080` | +| `LLAMA_SERVER_BIN_PATH` | to change the server binary path, default: `../../../build/bin/server` | +| `DEBUG` | "ON" to enable steps and server verbose mode `--verbose` | +| `SERVER_LOG_FORMAT_JSON` | if set switch server logs to json format | +| `N_GPU_LAYERS` | number of model layers to offload to VRAM `-ngl --n-gpu-layers` | ### Run @bug, @wip or @wrong_usage annotated scenario Feature or Scenario must be annotated with `@llama.cpp` to be included in the default scope. + - `@bug` annotation aims to link a scenario with a GitHub issue. - `@wrong_usage` are meant to show user issue that are actually an expected behavior - `@wip` to focus on a scenario working in progress +- `@slow` heavy test, disabled by default To run a scenario annotated with `@bug`, start: -`DEBUG=ON ./tests.sh --no-skipped --tags bug` + +```shell +DEBUG=ON ./tests.sh --no-skipped --tags bug +``` After changing logic in `steps.py`, ensure that `@bug` and `@wrong_usage` scenario are updated. + +```shell +./tests.sh --no-skipped --tags bug,wrong_usage || echo "should failed but compile" +``` diff --git a/examples/server/tests/features/environment.py b/examples/server/tests/features/environment.py index 09e826747..9fd330db6 100644 --- a/examples/server/tests/features/environment.py +++ b/examples/server/tests/features/environment.py @@ -7,7 +7,10 @@ from signal import SIGKILL def before_scenario(context, scenario): - print(f"\x1b[33;42mStarting new scenario: {scenario.name}!\x1b[0m") + context.debug = 'DEBUG' in os.environ and os.environ['DEBUG'] == 'ON' + if context.debug: + print("DEBUG=ON\n") + print(f"\x1b[33;42mStarting new scenario: {scenario.name}!\x1b[0m\n") port = 8080 if 'PORT' in os.environ: port = int(os.environ['PORT']) diff --git a/examples/server/tests/features/issues.feature b/examples/server/tests/features/issues.feature index bf5a175a3..7b13e44ca 100644 --- a/examples/server/tests/features/issues.feature +++ b/examples/server/tests/features/issues.feature @@ -1,4 +1,5 @@ # List of ongoing issues +# run with: DEBUG=ON ./tests.sh --no-skipped --tags bug @bug Feature: Issues # No confirmed issue at the moment diff --git a/examples/server/tests/features/parallel.feature b/examples/server/tests/features/parallel.feature index 5f895cf90..86cdf7282 100644 --- a/examples/server/tests/features/parallel.feature +++ b/examples/server/tests/features/parallel.feature @@ -1,11 +1,12 @@ @llama.cpp +@parallel Feature: Parallel Background: Server startup Given a server listening on localhost:8080 - And a model file stories260K.gguf - And a model alias tinyllama-2 + And a model file tinyllamas/stories260K.gguf from HF repo ggml-org/models And 42 as server seed + And 512 as batch size And 64 KV cache size And 2 slots And embeddings extraction diff --git a/examples/server/tests/features/passkey.feature 
b/examples/server/tests/features/passkey.feature
new file mode 100644
index 000000000..1bde7aab8
--- /dev/null
+++ b/examples/server/tests/features/passkey.feature
@@ -0,0 +1,55 @@
+# run with: ./tests.sh --no-skipped --tags passkey
+@passkey
+@slow
+Feature: Passkey / Self-extend with context shift
+
+  Background: Server startup
+    Given a server listening on localhost:8080
+
+  # Generates a long text of junk and inserts a secret passkey number inside it.
+  # Then we query the LLM for the secret passkey.
+  # see #3856 and #4810
+  Scenario Outline: Passkey
+    Given a model file <hf_file> from HF repo <hf_repo>
+    And <n_batch> as batch size
+    And <n_junk> as number of junk
+    And <n_predicted> server max tokens to predict
+    And 42 as seed
+    And <n_ctx> KV cache size
+    And 1 slots
+    And <n_ga> group attention factor to extend context size through self-extend
+    And <n_ga_w> group attention width to extend context size through self-extend
+    # Can be override with N_GPU_LAYERS
+    And <ngl> GPU offloaded layers
+    Then the server is starting
+    Then the server is healthy
+    Given available models
+    Then model 0 is trained on <n_ctx_train> tokens context
+    Given a prefix prompt:
+    """
+    here is an important info hidden inside a lot of irrelevant text. Find it and memorize them. I will quiz you about the important information there.
+    """
+    And a passkey prompt template:
+    """
+    The pass key is <passkey> Remember it. <passkey> is the pass key.
+    """
+    And a junk suffix prompt:
+    """
+    The grass is green. The sky is blue. The sun is yellow. Here we go. There and back again.
+    """
+    And a suffix prompt:
+    """
+    What is the pass key? The pass key is
+    """
+    Given a "<passkey>" passkey challenge prompt with the passkey inserted every <i_pos> junk
+    And a completion request with no api error
+    Then <n_predicted> tokens are predicted matching <re_content>
+
+    Examples:
+      | hf_repo | hf_file | n_ctx_train | ngl | n_ctx | n_batch | n_ga | n_ga_w | n_junk | i_pos | passkey | n_predicted | re_content |
+      | TheBloke/phi-2-GGUF | phi-2.Q4_K_M.gguf | 2048 | 5 | 8192 | 512 | 4 | 512 | 250 | 50 | 42 | 1 | 42 |
+      | TheBloke/phi-2-GGUF | phi-2.Q4_K_M.gguf | 2048 | 5 | 8192 | 512 | 2 | 512 | 250 | 50 | 42 | 1 | \b((?!42)\w)+\b |
+      #| TheBloke/Llama-2-7B-GGUF | llama-2-7b.Q2_K.gguf | 4096 | 3 | 16384 | 512 | 4 | 512 | 500 | 300 | 1234 | 5 | 1234 |
+      #| TheBloke/Mixtral-8x7B-v0.1-GGUF | mixtral-8x7b-v0.1.Q2_K.gguf | 32768 | 2 | 16384 | 512 | 4 | 512 | 500 | 100 | 0987 | 5 | 0 # 987 |
+
diff --git a/examples/server/tests/features/security.feature b/examples/server/tests/features/security.feature
index db06d3977..42a6709a5 100644
--- a/examples/server/tests/features/security.feature
+++ b/examples/server/tests/features/security.feature
@@ -1,9 +1,10 @@
 @llama.cpp
+@security
 Feature: Security
 
   Background: Server startup with an api key defined
     Given a server listening on localhost:8080
-    And a model file stories260K.gguf
+    And a model file tinyllamas/stories260K.gguf from HF repo ggml-org/models
     And a server api key llama.cpp
     Then the server is starting
     Then the server is healthy
diff --git a/examples/server/tests/features/server.feature b/examples/server/tests/features/server.feature
index b571582a7..7c977bcce 100644
--- a/examples/server/tests/features/server.feature
+++ b/examples/server/tests/features/server.feature
@@ -1,15 +1,17 @@
 @llama.cpp
+@server
 Feature: llama.cpp server
 
   Background: Server startup
     Given a server listening on localhost:8080
-    And a model file stories260K.gguf
+    And a model file tinyllamas/stories260K.gguf from HF repo ggml-org/models
     And a model alias tinyllama-2
     And 42 as server seed
     # KV Cache corresponds to the total amount of tokens
     # that can be
stored across all independent sequences: #4130 # see --ctx-size and #5568 And 32 KV cache size + And 512 as batch size And 1 slots And embeddings extraction And 32 server max tokens to predict @@ -29,9 +31,9 @@ Feature: llama.cpp server And prometheus metrics are exposed Examples: Prompts - | prompt | n_predict | re_content | n_predicted | - | I believe the meaning of life is | 8 | (readgoing)+ | 8 | - | Write a joke about AI | 64 | (parkfriendsscaredalways)+ | 32 | + | prompt | n_predict | re_content | n_predicted | + | I believe the meaning of life is | 8 | (read\|going)+ | 8 | + | Write a joke about AI | 64 | (park\|friends\|scared\|always)+ | 32 | Scenario Outline: OAI Compatibility Given a model @@ -43,9 +45,9 @@ Feature: llama.cpp server Then tokens are predicted matching Examples: Prompts - | model | system_prompt | user_prompt | max_tokens | re_content | n_predicted | enable_streaming | - | llama-2 | Book | What is the best book | 8 | (Momwhat)+ | 8 | disabled | - | codellama70b | You are a coding assistant. | Write the fibonacci function in c++. | 64 | (thankshappybird)+ | 32 | enabled | + | model | system_prompt | user_prompt | max_tokens | re_content | n_predicted | enable_streaming | + | llama-2 | Book | What is the best book | 8 | (Mom\|what)+ | 8 | disabled | + | codellama70b | You are a coding assistant. | Write the fibonacci function in c++. | 64 | (thanks\|happy\|bird)+ | 32 | enabled | Scenario: Embedding When embeddings are computed for: @@ -75,10 +77,15 @@ Feature: llama.cpp server When an OAI compatible embeddings computation request for multiple inputs Then embeddings are generated - Scenario: Tokenize / Detokenize When tokenizing: """ What is the capital of France ? """ Then tokens can be detokenize + + Scenario: Models available + Given available models + Then 1 models are supported + Then model 0 is identified by tinyllama-2 + Then model 0 is trained on 128 tokens context diff --git a/examples/server/tests/features/steps/steps.py b/examples/server/tests/features/steps/steps.py index 381da105e..319527802 100644 --- a/examples/server/tests/features/steps/steps.py +++ b/examples/server/tests/features/steps/steps.py @@ -13,6 +13,7 @@ import aiohttp import openai from behave import step from behave.api.async_step import async_run_until_complete +from huggingface_hub import hf_hub_download from prometheus_client import parser @@ -26,17 +27,23 @@ def step_server_config(context, server_fqdn, server_port): context.base_url = f'http://{context.server_fqdn}:{context.server_port}' - context.debug = 'DEBUG' in os.environ and os.environ['DEBUG'] == 'ON' context.model_alias = None + context.n_batch = None context.n_ctx = None + context.n_ga = None + context.n_ga_w = None + context.n_gpu_layer = None context.n_predict = None context.n_server_predict = None context.n_slots = None + context.prompt_prefix = None + context.prompt_suffix = None context.server_api_key = None context.server_continuous_batching = False context.server_embeddings = False context.server_metrics = False context.server_process = None + context.seed = None context.server_seed = None context.user_api_key = None @@ -45,9 +52,11 @@ def step_server_config(context, server_fqdn, server_port): context.prompts = [] -@step(u'a model file {model_file}') -def step_model_file(context, model_file): - context.model_file = model_file +@step(u'a model file {hf_file} from HF repo {hf_repo}') +def step_download_hf_model(context, hf_file, hf_repo): + context.model_file = hf_hub_download(repo_id=hf_repo, filename=hf_file) + if 
context.debug: + print(f"model file: {context.model_file}\n") @step(u'a model alias {model_alias}') @@ -55,24 +64,34 @@ def step_model_alias(context, model_alias): context.model_alias = model_alias -@step(u'{seed} as server seed') +@step(u'{seed:d} as server seed') def step_seed(context, seed): - context.server_seed = int(seed) + context.server_seed = seed -@step(u'{n_ctx} KV cache size') +@step(u'{ngl:d} GPU offloaded layers') +def step_n_gpu_layer(context, ngl): + if 'N_GPU_LAYERS' in os.environ: + new_ngl = int(os.environ['N_GPU_LAYERS']) + if context.debug: + print(f"-ngl upgraded from {ngl} to {new_ngl}") + ngl = new_ngl + context.n_gpu_layer = ngl + + +@step(u'{n_ctx:d} KV cache size') def step_n_ctx(context, n_ctx): - context.n_ctx = int(n_ctx) + context.n_ctx = n_ctx -@step(u'{n_slots} slots') +@step(u'{n_slots:d} slots') def step_n_slots(context, n_slots): - context.n_slots = int(n_slots) + context.n_slots = n_slots -@step(u'{n_predict} server max tokens to predict') +@step(u'{n_predict:d} server max tokens to predict') def step_server_n_predict(context, n_predict): - context.n_server_predict = int(n_predict) + context.n_server_predict = n_predict @step(u'continuous batching') @@ -116,11 +135,13 @@ async def step_wait_for_the_server_to_be_started(context, expecting_status): case 'ready' | 'idle': await wait_for_health_status(context, context.base_url, 200, 'ok', + timeout=10, params={'fail_on_no_slot': 0, 'include_slots': 0}, slots_idle=context.n_slots, slots_processing=0, expected_slots=[{'id': slot_id, 'state': 0} - for slot_id in range(context.n_slots)]) + for slot_id in + range(context.n_slots if context.n_slots else 1)]) case 'busy': await wait_for_health_status(context, context.base_url, 503, 'no slot available', @@ -128,7 +149,8 @@ async def step_wait_for_the_server_to_be_started(context, expecting_status): slots_idle=0, slots_processing=context.n_slots, expected_slots=[{'id': slot_id, 'state': 1} - for slot_id in range(context.n_slots)]) + for slot_id in + range(context.n_slots if context.n_slots else 1)]) case _: assert False, "unknown status" @@ -157,24 +179,24 @@ async def step_request_completion(context, api_error): context.base_url, debug=context.debug, n_predict=context.n_predict, - server_seed=context.server_seed, + seed=await completions_seed(context), expect_api_error=expect_api_error, user_api_key=context.user_api_key) context.tasks_result.append(completion) if context.debug: - print(f"Completion response: {completion}") + print(f"Completion response: {completion}\n") if expect_api_error: assert completion == 401, f"completion must be an 401 status code: {completion}" -@step(u'{predicted_n} tokens are predicted matching {re_content}') +@step(u'{predicted_n:d} tokens are predicted matching {re_content}') def step_n_tokens_predicted_with_content(context, predicted_n, re_content): - assert_n_tokens_predicted(context.tasks_result.pop(), int(predicted_n), re_content) + assert_n_tokens_predicted(context.tasks_result.pop(), predicted_n, re_content) -@step(u'{predicted_n} tokens are predicted') +@step(u'{predicted_n:d} tokens are predicted') def step_n_tokens_predicted(context, predicted_n): - assert_n_tokens_predicted(context.tasks_result.pop(), int(predicted_n)) + assert_n_tokens_predicted(context.tasks_result.pop(), predicted_n) @step(u'a user prompt {user_prompt}') @@ -192,9 +214,9 @@ def step_model(context, model): context.model = model -@step(u'{max_tokens} max tokens to predict') +@step(u'{max_tokens:d} max tokens to predict') def step_max_tokens(context, 
max_tokens): - context.n_predict = int(max_tokens) + context.n_predict = max_tokens @step(u'streaming is {enable_streaming}') @@ -222,11 +244,70 @@ def step_server_api_key(context, server_api_key): context.server_api_key = server_api_key +@step(u'{n_junk:d} as number of junk') +def step_n_junk(context, n_junk): + context.n_junk = n_junk + + +@step(u'{n_batch:d} as batch size') +def step_n_batch(context, n_batch): + context.n_batch = n_batch + + +@step(u'{seed:d} as seed') +def step_seed(context, seed): + context.seed = seed + + +@step(u'a prefix prompt') +def step_prompt_prefix(context): + context.prompt_prefix = context.text + + +@step(u'a junk suffix prompt') +def step_prompt_junk_suffix(context): + context.prompt_junk_suffix = context.text + + +@step(u'a suffix prompt') +def step_prompt_suffix(context): + context.prompt_suffix = context.text + + +@step(u'{n_ga:d} group attention factor' + u' to extend context size through self-extend') +def step_impl(context, n_ga): + context.n_ga = n_ga + + +@step(u'{n_ga_w:d} group attention width to extend context size through self-extend') +def step_impl(context, n_ga_w): + context.n_ga_w = n_ga_w + + +@step(u'a passkey prompt template') +def step_prompt_passkey(context): + context.prompt_passkey = context.text + + +@step(u'a "{passkey}" passkey challenge prompt with the passkey inserted every {i_pos:d} junk') +def step_prompt_passkey(context, passkey, i_pos): + prompt = "" + for i in range(context.n_junk): + if i % context.n_junk == i_pos: + prompt += context.prompt_passkey # the passkey is already substituted + prompt += context.prompt_junk_suffix + if context.debug: + passkey_highlight = "\x1b[33m" + passkey + "\x1b[0m" + print(f"Passkey challenge:\n```{prompt.replace(passkey, passkey_highlight)}```\n") + context.prompts.append(context.prompt_prefix + prompt + context.prompt_suffix) + + @step(u'an OAI compatible chat completions request with {api_error} api error') @async_run_until_complete async def step_oai_chat_completions(context, api_error): if context.debug: - print(f"Submitting OAI compatible completions request...") + print(f"Submitting OAI compatible completions request...\n") expect_api_error = api_error == 'raised' completion = await oai_chat_completions(context.prompts.pop(), context.system_prompt, @@ -241,8 +322,7 @@ async def step_oai_chat_completions(context, api_error): enable_streaming=context.enable_streaming if hasattr(context, 'enable_streaming') else None, - server_seed=context.server_seed - if hasattr(context, 'server_seed') else None, + seed=await completions_seed(context), user_api_key=context.user_api_key if hasattr(context, 'user_api_key') else None, @@ -276,8 +356,10 @@ async def step_concurrent_completion_requests(context): # prompt is inserted automatically context.base_url, debug=context.debug, + prompt_prefix=context.prompt_prefix, + prompt_suffix=context.prompt_suffix, n_predict=context.n_predict if hasattr(context, 'n_predict') else None, - server_seed=context.server_seed if hasattr(context, 'server_seed') else None, + seed=await completions_seed(context), user_api_key=context.user_api_key if hasattr(context, 'user_api_key') else None) @@ -297,8 +379,7 @@ async def step_oai_chat_completions(context): if hasattr(context, 'n_predict') else None, enable_streaming=context.enable_streaming if hasattr(context, 'enable_streaming') else None, - server_seed=context.server_seed - if hasattr(context, 'server_seed') else None, + seed=await completions_seed(context), user_api_key=context.user_api_key if hasattr(context, 
'user_api_key') else None) @@ -318,7 +399,9 @@ async def step_oai_chat_completions(context): if hasattr(context, 'n_predict') else None, enable_streaming=context.enable_streaming if hasattr(context, 'enable_streaming') else None, - server_seed=context.server_seed + seed=context.seed + if hasattr(context, 'seed') else + context.server_seed if hasattr(context, 'server_seed') else None, user_api_key=context.user_api_key if hasattr(context, 'user_api_key') else None) @@ -330,11 +413,10 @@ async def step_all_prompts_are_predicted(context): await all_prompts_are_predicted(context) -@step(u'all prompts are predicted with {n_predict} tokens') +@step(u'all prompts are predicted with {n_expected_predicted:d} tokens') @async_run_until_complete -async def step_all_prompts_are_predicted_with_n_tokens(context, n_predict): - expected_predicted_n = int(n_predict) - await all_prompts_are_predicted(context, expected_predicted_n) +async def step_all_prompts_are_predicted_with_n_tokens(context, n_expected_predicted): + await all_prompts_are_predicted(context, n_expected_predicted) async def all_prompts_are_predicted(context, expected_predicted_n=None): @@ -464,6 +546,8 @@ async def step_prometheus_metrics_exported(context): assert metrics_response.headers['Content-Type'] == "text/plain; version=0.0.4" metrics_raw = await metrics_response.text() metric_exported = False + if context.debug: + print(f"/metrics answer:\n{metrics_raw}\n") for metric in parser.text_string_to_metric_families(metrics_raw): match metric.name: case "llamacpp:kv_cache_usage_ratio": @@ -472,6 +556,37 @@ async def step_prometheus_metrics_exported(context): assert metric_exported, "No metrics exported" +@step(u'available models') +def step_available_models(context): + # openai client always expects an api_key + openai.api_key = context.user_api_key if context.user_api_key is not None else 'nope' + openai.api_base = f'{context.base_url}/v1' + context.models = openai.Model.list().data + + +@step(u'{n_model:d} models are supported') +def step_supported_models(context, n_model): + if context.debug: + print("server models available:", context.models) + assert len(context.models) == n_model + + +@step(u'model {i_model:d} is {param} {preposition} {param_value}') +def step_supported_models(context, i_model, param, preposition, param_value): + assert i_model < len(context.models) + model = context.models[i_model] + + param_value = param_value.split(' ', 1)[0] + match param: + case 'identified': + value = model.id + case 'trained': + value = str(model.meta.n_ctx_train) + case _: + assert False, "param {param} not supported" + assert param_value == value, f"model param {param} {value} != {param_value}" + + async def concurrent_requests(context, f_completion, *args, **kwargs): n_prompts = len(context.prompts) if context.debug: @@ -486,8 +601,10 @@ async def concurrent_requests(context, f_completion, *args, **kwargs): async def request_completion(prompt, base_url, debug=False, + prompt_prefix=None, + prompt_suffix=None, n_predict=None, - server_seed=None, + seed=None, expect_api_error=None, user_api_key=None): if debug: @@ -504,11 +621,14 @@ async def request_completion(prompt, async with aiohttp.ClientSession() as session: async with session.post(f'{base_url}/completion', json={ + "input_prefix": prompt_prefix, "prompt": prompt, - "n_predict": int(n_predict) if n_predict is not None else -1, - "seed": server_seed if server_seed is not None else 42 + "input_suffix": prompt_suffix, + "n_predict": n_predict if n_predict is not None else -1, + "seed": seed 
if seed is not None else 42 }, - headers=headers) as response: + headers=headers, + timeout=3600) as response: if expect_api_error is None or not expect_api_error: assert response.status == 200 assert response.headers['Access-Control-Allow-Origin'] == origin @@ -526,14 +646,14 @@ async def oai_chat_completions(user_prompt, model=None, n_predict=None, enable_streaming=None, - server_seed=None, + seed=None, user_api_key=None, expect_api_error=None): if debug: print(f"Sending OAI Chat completions request: {user_prompt}") # openai client always expects an api key user_api_key = user_api_key if user_api_key is not None else 'nope' - seed = server_seed if server_seed is not None else 42 + seed = seed if seed is not None else 42 enable_streaming = enable_streaming if enable_streaming is not None else False payload = { "messages": [ @@ -692,20 +812,32 @@ def assert_n_tokens_predicted(completion_response, expected_predicted_n=None, re content = completion_response['content'] n_predicted = completion_response['timings']['predicted_n'] assert len(content) > 0, "no token predicted" - if expected_predicted_n is not None: + if re_content is not None: + p = re.compile(re_content, flags=RegexFlag.IGNORECASE | RegexFlag.MULTILINE | RegexFlag.DOTALL) + matches = p.finditer(content) + last_match = 0 + highlighted = '' + for match in matches: + start, end = match.span() + highlighted += content[last_match: start] + highlighted += '\x1b[33m' + highlighted += content[start: end] + highlighted += '\x1b[0m' + last_match = end + highlighted += content[last_match:] + if 'DEBUG' in os.environ and os.environ['DEBUG'] == 'ON': + print(f"Checking completion response: {highlighted}\n") + assert last_match > 0, f'/{re_content}/ must match ```{highlighted}```' + if expected_predicted_n and expected_predicted_n > 0: assert n_predicted == expected_predicted_n, (f'invalid number of tokens predicted:' f' {n_predicted} <> {expected_predicted_n}') - if re_content is not None: - re_content = '^.*' + re_content.replace('', '|') + '.*$' - assert re.match(re_content, content, flags=RegexFlag.IGNORECASE | RegexFlag.MULTILINE | RegexFlag.DOTALL), ( - f'invalid tokens predicted:' - f' ```\n{content}\n``` do not match /{re_content}/') + async def gather_tasks_results(context): n_tasks = len(context.concurrent_tasks) if context.debug: - print(f"Waiting for all {n_tasks} tasks results...") + print(f"Waiting for all {n_tasks} tasks results...\n") for task_no in range(n_tasks): context.tasks_result.append(await context.concurrent_tasks.pop()) n_completions = len(context.tasks_result) @@ -716,15 +848,13 @@ async def wait_for_health_status(context, base_url, expected_http_status_code, expected_health_status, + timeout=3, params=None, slots_idle=None, slots_processing=None, expected_slots=None): if context.debug: - print(f"Starting checking for health for expected_health_status={expected_health_status}") - timeout = 3 # seconds - if expected_health_status == 'ok': - timeout = 10 # CI slow inference + print(f"Starting checking for health for expected_health_status={expected_health_status}\n") interval = 0.5 counter = 0 async with aiohttp.ClientSession() as session: @@ -734,7 +864,7 @@ async def wait_for_health_status(context, health = await health_response.json() if context.debug: print(f"HEALTH - response for expected health status='{expected_health_status}' on " - f"'{base_url}/health'?{params} is {health}") + f"'{base_url}/health'?{params} is {health}\n") if (status_code == expected_http_status_code and health['status'] == 
expected_health_status and (slots_idle is None or health['slots_idle'] == slots_idle) @@ -757,7 +887,7 @@ async def wait_for_health_status(context, if expected_http_status_code == 503: if len(context.tasks_result) == 0: print("\x1b[5;37;43mWARNING: forcing concurrent tasks," - " busy health check missed, probably too fast inference\x1b[0m") + " busy health check missed, probably too fast inference\x1b[0m\n") n_completions = await gather_tasks_results(context) if n_completions > 0: return @@ -791,6 +921,11 @@ def assert_slots_status(slots, expected_slots): f" = {expected[key]} != {slot[key]}") +async def completions_seed(context): + return context.seed if hasattr(context, 'seed') and context.seed is not None \ + else context.server_seed if hasattr(context, 'server_seed') else None + + def start_server_background(context): context.server_path = '../../../build/bin/server' if 'LLAMA_SERVER_BIN_PATH' in os.environ: @@ -800,27 +935,35 @@ def start_server_background(context): '--port', context.server_port, '--model', context.model_file ] + if context.n_batch: + server_args.extend(['--batch-size', context.n_batch]) + if context.n_gpu_layer: + server_args.extend(['--n-gpu-layers', context.n_gpu_layer]) if context.server_continuous_batching: server_args.append('--cont-batching') if context.server_embeddings: server_args.append('--embedding') if context.server_metrics: server_args.append('--metrics') - if context.model_alias is not None: + if context.model_alias: server_args.extend(['--alias', context.model_alias]) - if context.n_ctx is not None: + if context.n_ctx: server_args.extend(['--ctx-size', context.n_ctx]) - if context.n_slots is not None: + if context.n_slots: server_args.extend(['--parallel', context.n_slots]) - if context.n_server_predict is not None: + if context.n_server_predict: server_args.extend(['--n-predict', context.n_server_predict]) - if context.server_api_key is not None: + if context.server_api_key: server_args.extend(['--api-key', context.server_api_key]) + if context.n_ga: + server_args.extend(['--grp-attn-n', context.n_ga]) + if context.n_ga_w: + server_args.extend(['--grp-attn-w', context.n_ga_w]) if context.debug: server_args.append('--verbose') if 'SERVER_LOG_FORMAT_JSON' not in os.environ: server_args.extend(['--log-format', "text"]) - print(f"starting server with: {context.server_path}", *server_args) + print(f"starting server with: {context.server_path} {server_args}\n") context.server_process = subprocess.Popen( [str(arg) for arg in [context.server_path, *server_args]], close_fds=True) diff --git a/examples/server/tests/features/wrong_usages.feature b/examples/server/tests/features/wrong_usages.feature index e228b2371..cf14b3b44 100644 --- a/examples/server/tests/features/wrong_usages.feature +++ b/examples/server/tests/features/wrong_usages.feature @@ -1,4 +1,4 @@ -# run with ./test.sh --tags wrong_usage +# run with: ./tests.sh --no-skipped --tags wrong_usage @wrong_usage Feature: Wrong usage of llama.cpp server @@ -7,7 +7,7 @@ Feature: Wrong usage of llama.cpp server # or pass n_predict/max_tokens in the request. 
Scenario: Infinite loop Given a server listening on localhost:8080 - And a model file stories260K.gguf + And a model file tinyllamas/stories260K.gguf from HF repo ggml-org/models # Uncomment below to fix the issue #And 64 server max tokens to predict Then the server is starting @@ -18,4 +18,5 @@ Feature: Wrong usage of llama.cpp server # Uncomment below to fix the issue #And 128 max tokens to predict Given concurrent completion requests + Then the server is idle Then all prompts are predicted diff --git a/examples/server/tests/requirements.txt b/examples/server/tests/requirements.txt index 334fa4a70..5d4210164 100644 --- a/examples/server/tests/requirements.txt +++ b/examples/server/tests/requirements.txt @@ -1,4 +1,5 @@ aiohttp~=3.9.3 behave~=1.2.6 +huggingface_hub~=0.20.3 openai~=0.25.0 prometheus-client~=0.20.0 diff --git a/examples/server/tests/tests.sh b/examples/server/tests/tests.sh index 17a4e6fc6..1c6c5695f 100755 --- a/examples/server/tests/tests.sh +++ b/examples/server/tests/tests.sh @@ -5,7 +5,7 @@ set -eu if [ $# -lt 1 ] then # Start @llama.cpp scenario - behave --summary --stop --no-capture --exclude 'issues|wrong_usages' --tags llama.cpp + behave --summary --stop --no-capture --exclude 'issues|wrong_usages|passkey' --tags llama.cpp else behave "$@" fi diff --git a/examples/server/utils.hpp b/examples/server/utils.hpp index d98541f26..b6e49d8b9 100644 --- a/examples/server/utils.hpp +++ b/examples/server/utils.hpp @@ -126,8 +126,7 @@ static inline void server_log(const char *level, const char *function, int line, for (const auto& el : log.items()) { const std::string value = el.value().dump(-1, ' ', false, json::error_handler_t::replace); - snprintf(buf, 1024, " %s=%s", el.key().c_str(), value.c_str()); - ss << buf; + ss << " " << el.key() << "=" << value; } const std::string str = ss.str();
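Note on the final utils.hpp hunk above: replacing the snprintf call into a fixed 1024-byte buffer with direct stream insertion is what carries the "server: logs: do not truncate log values" item from the commit message, since the std::ostringstream grows with the value instead of clipping it. A minimal standalone sketch of the difference in behavior (illustrative only, not part of the patch; the key and value strings here are made up):

```cpp
#include <cstdio>
#include <iostream>
#include <sstream>
#include <string>

int main() {
    const std::string key = "prompt";
    const std::string value(2000, 'x'); // longer than the old 1024-byte buffer

    // Old approach: snprintf into a fixed buffer silently clips anything past 1023 chars.
    char buf[1024];
    std::snprintf(buf, sizeof(buf), " %s=%s", key.c_str(), value.c_str());
    std::cout << "snprintf kept      " << std::string(buf).size() << " chars\n"; // 1023

    // New approach: the stream grows as needed, so the full key=value pair is kept.
    std::ostringstream ss;
    ss << " " << key << "=" << value;
    std::cout << "ostringstream kept " << ss.str().size() << " chars\n";         // 2008
    return 0;
}
```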