diff --git a/.github/ISSUE_TEMPLATE/custom.md b/.github/ISSUE_TEMPLATE/bug.md similarity index 96% rename from .github/ISSUE_TEMPLATE/custom.md rename to .github/ISSUE_TEMPLATE/bug.md index 8fd955356..d7879b232 100644 --- a/.github/ISSUE_TEMPLATE/custom.md +++ b/.github/ISSUE_TEMPLATE/bug.md @@ -1,8 +1,7 @@ --- -name: Issue and enhancement template -about: Used to report issues and request enhancements for llama.cpp -title: "[User] Insert summary of your issue or enhancement.." -labels: '' +name: Bug template +about: Used to report bugs in llama.cpp +labels: ["bug"] assignees: '' --- @@ -46,7 +45,7 @@ $ g++ --version # Failure Information (for bugs) -Please help provide information about the failure if this is a bug. If it is not a bug, please remove the rest of this template. +Please help provide information about the failure / bug. # Steps to Reproduce diff --git a/.github/ISSUE_TEMPLATE/enhancement.md b/.github/ISSUE_TEMPLATE/enhancement.md new file mode 100644 index 000000000..dcffda750 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/enhancement.md @@ -0,0 +1,28 @@ +--- +name: Enhancement template +about: Used to request enhancements for llama.cpp +labels: ["enhancement"] +assignees: '' + +--- + +# Prerequisites + +Please answer the following questions for yourself before submitting an issue. + +- [ ] I am running the latest code. Development is very rapid so there are no tagged versions as of now. +- [ ] I carefully followed the [README.md](https://github.com/ggerganov/llama.cpp/blob/master/README.md). +- [ ] I [searched using keywords relevant to my issue](https://docs.github.com/en/issues/tracking-your-work-with-issues/filtering-and-searching-issues-and-pull-requests) to make sure that I am creating a new issue that is not already open (or closed). +- [ ] I reviewed the [Discussions](https://github.com/ggerganov/llama.cpp/discussions), and have a new bug or useful enhancement to share. + +# Feature Description + +Please provide a detailed written description of what you were trying to do, and what you expected `llama.cpp` to do as an enhancement. + +# Motivation + +Please provide a detailed written description of reasons why this feature is necessary and how it is useful to `llama.cpp` users. + +# Possible Implementation + +If you have an idea as to how it can be implemented, please write a detailed description. Feel free to give links to external sources or share visuals that might be helpful to understand the details better. diff --git a/CMakeLists.txt b/CMakeLists.txt index 6af42a6c2..202f26049 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -331,6 +331,7 @@ if (LLAMA_CUBLAS) set(CMAKE_CUDA_ARCHITECTURES "60;61;70") # needed for f16 CUDA intrinsics else() set(CMAKE_CUDA_ARCHITECTURES "52;61;70") # lowest CUDA 12 standard + lowest for integer intrinsics + #set(CMAKE_CUDA_ARCHITECTURES "") # use this to compile much faster, but only F16 models work endif() endif() message(STATUS "Using CUDA architectures: ${CMAKE_CUDA_ARCHITECTURES}") diff --git a/Makefile b/Makefile index 705c4acb4..80179631f 100644 --- a/Makefile +++ b/Makefile @@ -391,12 +391,9 @@ else endif #LLAMA_CUDA_NVCC ifdef CUDA_DOCKER_ARCH NVCCFLAGS += -Wno-deprecated-gpu-targets -arch=$(CUDA_DOCKER_ARCH) -endif # CUDA_DOCKER_ARCH -ifdef CUDA_NATIVE_ARCH - NVCCFLAGS += -arch=$(CUDA_NATIVE_ARCH) else NVCCFLAGS += -arch=native -endif # CUDA_NATIVE_ARCH +endif # CUDA_DOCKER_ARCH ifdef LLAMA_CUDA_FORCE_DMMV NVCCFLAGS += -DGGML_CUDA_FORCE_DMMV endif # LLAMA_CUDA_FORCE_DMMV diff --git a/common/common.cpp b/common/common.cpp index ac406ff72..73c2d4d42 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -933,13 +933,13 @@ std::tuple llama_init_from_gpt_par } if (params.ignore_eos) { - params.sparams.logit_bias[llama_token_eos(lctx)] = -INFINITY; + params.sparams.logit_bias[llama_token_eos(model)] = -INFINITY; } { LOG("warming up the model with an empty run\n"); - std::vector tmp = { llama_token_bos(lctx), llama_token_eos(lctx), }; + std::vector tmp = { llama_token_bos(model), llama_token_eos(model), }; llama_decode(lctx, llama_batch_get_one(tmp.data(), std::min(tmp.size(), (size_t) params.n_batch), 0, 0)); llama_kv_cache_tokens_rm(lctx, -1, -1); llama_reset_timings(lctx); @@ -994,7 +994,7 @@ std::string llama_token_to_piece(const struct llama_context * ctx, llama_token t } std::string llama_detokenize_spm(llama_context * ctx, const std::vector & tokens) { - const llama_token bos_id = llama_token_bos(ctx); + const llama_token bos_id = llama_token_bos(llama_get_model(ctx)); std::string piece; std::string result; @@ -1239,7 +1239,7 @@ void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const l fprintf(stream, "hellaswag: %s # default: false\n", params.hellaswag ? "true" : "false"); fprintf(stream, "hellaswag_tasks: %zu # default: 400\n", params.hellaswag_tasks); - const auto logit_bias_eos = sparams.logit_bias.find(llama_token_eos(lctx)); + const auto logit_bias_eos = sparams.logit_bias.find(llama_token_eos(llama_get_model(lctx))); const bool ignore_eos = logit_bias_eos != sparams.logit_bias.end() && logit_bias_eos->second == -INFINITY; fprintf(stream, "ignore_eos: %s # default: false\n", ignore_eos ? "true" : "false"); diff --git a/common/sampling.cpp b/common/sampling.cpp index 6f0af3c4a..5258d4e82 100644 --- a/common/sampling.cpp +++ b/common/sampling.cpp @@ -147,7 +147,7 @@ llama_token llama_sampling_sample( // apply penalties if (!prev.empty()) { - const float nl_logit = logits[llama_token_nl(ctx_main)]; + const float nl_logit = logits[llama_token_nl(llama_get_model(ctx_main))]; llama_sample_repetition_penalties(ctx_main, &cur_p, prev.data() + prev.size() - penalty_last_n, @@ -155,7 +155,7 @@ llama_token llama_sampling_sample( if (!penalize_nl) { for (size_t idx = 0; idx < cur_p.size; idx++) { - if (cur_p.data[idx].id == llama_token_nl(ctx_main)) { + if (cur_p.data[idx].id == llama_token_nl(llama_get_model(ctx_main))) { cur_p.data[idx].logit = nl_logit; break; } diff --git a/common/train.cpp b/common/train.cpp index 154ca56e5..3cce5da26 100644 --- a/common/train.cpp +++ b/common/train.cpp @@ -236,8 +236,8 @@ int64_t get_example_targets_batch( int64_t used_samples = 0; ggml_set_f32(target_probs, 0.0f); - llama_token bos = llama_token_bos(lctx); - llama_token eos = llama_token_eos(lctx); + llama_token bos = llama_token_bos(llama_get_model(lctx)); + llama_token eos = llama_token_eos(llama_get_model(lctx)); // printf("%s: example_id=%d n_batch=%d n_train_samples=%zu\n", __func__, example_id, n_batch, n_train_samples); for (int k=0; k= 2) { params.model = argv[1]; } @@ -37,6 +40,10 @@ int main(int argc, char ** argv) { n_len = std::atoi(argv[4]); } + if (argc >= 6) { + n_gpu_layers = std::atoi(argv[5]); + } + if (params.prompt.empty()) { params.prompt = "Hello my name is"; } @@ -49,7 +56,7 @@ int main(int argc, char ** argv) { llama_model_params model_params = llama_model_default_params(); - // model_params.n_gpu_layers = 99; // offload all layers to the GPU + model_params.n_gpu_layers = n_gpu_layers; llama_model * model = llama_load_model_from_file(params.model.c_str(), model_params); @@ -180,7 +187,7 @@ int main(int argc, char ** argv) { //const llama_token new_token_id = llama_sample_token_greedy(ctx, &candidates_p); // is it an end of stream? -> mark the stream as finished - if (new_token_id == llama_token_eos(ctx) || n_cur == n_len) { + if (new_token_id == llama_token_eos(model) || n_cur == n_len) { i_batch[i] = -1; LOG_TEE("\n"); if (n_parallel > 1) { diff --git a/examples/beam-search/beam-search.cpp b/examples/beam-search/beam-search.cpp index f078ab8a8..679b382e1 100644 --- a/examples/beam-search/beam-search.cpp +++ b/examples/beam-search/beam-search.cpp @@ -47,7 +47,7 @@ struct beam_search_callback_data { // In this case, end-of-beam (eob) is equivalent to end-of-sentence (eos) but this need not always be the same. // For example, eob can be flagged due to maximum token length, stop words, etc. static bool is_at_eob(const beam_search_callback_data & callback_data, const llama_token * tokens, size_t n_tokens) { - return n_tokens && tokens[n_tokens-1] == llama_token_eos(callback_data.ctx); + return n_tokens && tokens[n_tokens-1] == llama_token_eos(llama_get_model(callback_data.ctx)); } // Function matching type llama_beam_search_callback_fn_t. diff --git a/examples/infill/infill.cpp b/examples/infill/infill.cpp index 6331335e3..9c52b7bba 100644 --- a/examples/infill/infill.cpp +++ b/examples/infill/infill.cpp @@ -246,14 +246,14 @@ int main(int argc, char ** argv) { if (suff_rm_leading_spc && inp_sfx[0] == space_token) { inp_sfx.erase(inp_sfx.begin()); } - inp_pfx.insert(inp_pfx.begin(), llama_token_prefix(ctx)); + inp_pfx.insert(inp_pfx.begin(), llama_token_prefix(model)); if (add_bos) { - inp_pfx.insert(inp_pfx.begin(), llama_token_bos(ctx)); + inp_pfx.insert(inp_pfx.begin(), llama_token_bos(model)); } - inp_sfx.insert(inp_sfx.begin(), llama_token_suffix(ctx)); + inp_sfx.insert(inp_sfx.begin(), llama_token_suffix(model)); embd_inp = inp_pfx; embd_inp.insert(embd_inp.end(), inp_sfx.begin(), inp_sfx.end()); - embd_inp.push_back(llama_token_middle(ctx)); + embd_inp.push_back(llama_token_middle(model)); LOG("prefix: \"%s\"\n", log_tostr(params.input_prefix)); LOG("suffix: \"%s\"\n", log_tostr(params.input_suffix)); @@ -261,7 +261,7 @@ int main(int argc, char ** argv) { // Should not run without any tokens if (embd_inp.empty()) { - embd_inp.push_back(llama_token_bos(ctx)); + embd_inp.push_back(llama_token_bos(model)); LOG("embd_inp was considered empty and bos was added: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd_inp).c_str()); } @@ -577,10 +577,10 @@ int main(int argc, char ** argv) { if ((int) embd_inp.size() <= n_consumed) { // deal with eot token in infill mode - if ((llama_sampling_last(ctx_sampling) == llama_token_eot(ctx) || is_interacting) && params.interactive){ + if ((llama_sampling_last(ctx_sampling) == llama_token_eot(model) || is_interacting) && params.interactive){ if(is_interacting && !params.interactive_first) { // print an eot token - printf("%s", llama_token_to_piece(ctx, llama_token_eot(ctx)).c_str()); + printf("%s", llama_token_to_piece(ctx, llama_token_eot(model)).c_str()); } fflush(stdout); printf("\n"); @@ -627,14 +627,14 @@ int main(int argc, char ** argv) { if (suff_rm_leading_spc && inp_sfx[0] == space_token) { inp_sfx.erase(inp_sfx.begin()); } - inp_pfx.insert(inp_pfx.begin(), llama_token_prefix(ctx)); + inp_pfx.insert(inp_pfx.begin(), llama_token_prefix(model)); if (add_bos) { - inp_pfx.insert(inp_pfx.begin(), llama_token_bos(ctx)); + inp_pfx.insert(inp_pfx.begin(), llama_token_bos(model)); } - inp_sfx.insert(inp_sfx.begin(), llama_token_suffix(ctx)); + inp_sfx.insert(inp_sfx.begin(), llama_token_suffix(model)); embd_inp = inp_pfx; embd_inp.insert(embd_inp.end(), inp_sfx.begin(), inp_sfx.end()); - embd_inp.push_back(llama_token_middle(ctx)); + embd_inp.push_back(llama_token_middle(model)); embd.clear(); embd_guidance.clear(); n_remain = params.n_predict; @@ -644,7 +644,7 @@ int main(int argc, char ** argv) { is_interacting = false; } // deal with end of text token in interactive mode - else if (llama_sampling_last(ctx_sampling) == llama_token_eos(ctx)) { + else if (llama_sampling_last(ctx_sampling) == llama_token_eos(model)) { LOG("found EOS token\n"); if (params.interactive) { @@ -661,7 +661,7 @@ int main(int argc, char ** argv) { if (params.input_prefix_bos) { LOG("adding input prefix BOS token\n"); - embd_inp.push_back(llama_token_bos(ctx)); + embd_inp.push_back(llama_token_bos(model)); } std::string buffer; @@ -724,7 +724,7 @@ int main(int argc, char ** argv) { } // end of text token - if (!embd.empty() && embd.back() == llama_token_eos(ctx) && !params.interactive) { + if (!embd.empty() && embd.back() == llama_token_eos(model) && !params.interactive) { break; } @@ -736,7 +736,7 @@ int main(int argc, char ** argv) { } } if (!params.interactive && n_remain <= 0) { - printf("%s", llama_token_to_piece(ctx, llama_token_eot(ctx)).c_str()); + printf("%s", llama_token_to_piece(ctx, llama_token_eot(model)).c_str()); fflush(stdout); } diff --git a/examples/llama-bench/llama-bench.cpp b/examples/llama-bench/llama-bench.cpp index a04115c96..20767d555 100644 --- a/examples/llama-bench/llama-bench.cpp +++ b/examples/llama-bench/llama-bench.cpp @@ -933,7 +933,7 @@ struct sql_printer : public printer { }; static void test_prompt(llama_context * ctx, int n_prompt, int n_past, int n_batch, int n_threads) { - std::vector tokens(n_batch, llama_token_bos(ctx)); + std::vector tokens(n_batch, llama_token_bos(llama_get_model(ctx))); int n_processed = 0; llama_set_n_threads(ctx, n_threads, n_threads); @@ -946,7 +946,7 @@ static void test_prompt(llama_context * ctx, int n_prompt, int n_past, int n_bat } static void test_gen(llama_context * ctx, int n_gen, int n_past, int n_threads) { - llama_token token = llama_token_bos(ctx); + llama_token token = llama_token_bos(llama_get_model(ctx)); llama_set_n_threads(ctx, n_threads, n_threads); diff --git a/examples/llava/llava-utils.h b/examples/llava/llava-utils.h index 45b2b1ad3..320c71967 100644 --- a/examples/llava/llava-utils.h +++ b/examples/llava/llava-utils.h @@ -137,7 +137,7 @@ inline llama_token sample_id(llama_context * ctx_llama, gpt_params & params) { inline const char * sample(struct llama_context * ctx_llama, gpt_params & params, int * n_past) { int id = sample_id(ctx_llama, params); static std::string ret; - if (id == llama_token_eos(ctx_llama)) { + if (id == llama_token_eos(llama_get_model(ctx_llama))) { ret = ""; } else { ret = llama_token_to_piece(ctx_llama, id); diff --git a/examples/main-cmake-pkg/CMakeLists.txt b/examples/main-cmake-pkg/CMakeLists.txt index 908131884..cb00edbbb 100644 --- a/examples/main-cmake-pkg/CMakeLists.txt +++ b/examples/main-cmake-pkg/CMakeLists.txt @@ -16,6 +16,8 @@ add_library(common OBJECT ${_common_path}/console.cpp ${_common_path}/grammar-parser.h ${_common_path}/grammar-parser.cpp + ${_common_path}/sampling.h + ${_common_path}/sampling.cpp ) # WARNING: because build-info.h is auto-generated, it will only diff --git a/examples/main/main.cpp b/examples/main/main.cpp index 2621bd539..3d9f670b9 100644 --- a/examples/main/main.cpp +++ b/examples/main/main.cpp @@ -248,7 +248,7 @@ int main(int argc, char ** argv) { // Should not run without any tokens if (embd_inp.empty()) { - embd_inp.push_back(llama_token_bos(ctx)); + embd_inp.push_back(llama_token_bos(model)); LOG("embd_inp was considered empty and bos was added: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, embd_inp).c_str()); } @@ -693,7 +693,7 @@ int main(int argc, char ** argv) { } // deal with end of text token in interactive mode - if (llama_sampling_last(ctx_sampling) == llama_token_eos(ctx)) { + if (llama_sampling_last(ctx_sampling) == llama_token_eos(model)) { LOG("found EOS token\n"); if (params.interactive) { @@ -720,7 +720,7 @@ int main(int argc, char ** argv) { if (params.input_prefix_bos) { LOG("adding input prefix BOS token\n"); - embd_inp.push_back(llama_token_bos(ctx)); + embd_inp.push_back(llama_token_bos(model)); } std::string buffer; @@ -804,7 +804,7 @@ int main(int argc, char ** argv) { } // end of text token - if (!embd.empty() && embd.back() == llama_token_eos(ctx) && !(params.instruct || params.interactive)) { + if (!embd.empty() && embd.back() == llama_token_eos(model) && !(params.instruct || params.interactive)) { LOG_TEE(" [end of text]\n"); break; } diff --git a/examples/parallel/parallel.cpp b/examples/parallel/parallel.cpp index eb64adef8..9a0b9c183 100644 --- a/examples/parallel/parallel.cpp +++ b/examples/parallel/parallel.cpp @@ -347,7 +347,7 @@ int main(int argc, char ** argv) { // client.id, client.seq_id, id, client.n_decoded, client.i_batch, token_str.c_str()); if (client.n_decoded > 2 && - (id == llama_token_eos(ctx) || + (id == llama_token_eos(model) || (params.n_predict > 0 && client.n_decoded + client.n_prompt >= params.n_predict) || client.response.find("User:") != std::string::npos || client.response.find('\n') != std::string::npos)) { diff --git a/examples/perplexity/perplexity.cpp b/examples/perplexity/perplexity.cpp index 7d0038bd4..3c2542e8c 100644 --- a/examples/perplexity/perplexity.cpp +++ b/examples/perplexity/perplexity.cpp @@ -227,7 +227,7 @@ static results_perplexity perplexity_v2(llama_context * ctx, const gpt_params & // add BOS token for the first batch of each chunk if (add_bos && j == 0) { - tokens[batch_start] = llama_token_bos(ctx); + tokens[batch_start] = llama_token_bos(llama_get_model(ctx)); } const auto batch_logits = llama_get_logits(ctx); @@ -350,7 +350,7 @@ static results_perplexity perplexity(llama_context * ctx, const gpt_params & par // add BOS token for the first batch of each chunk if (add_bos && j == 0) { - tokens[batch_start] = llama_token_bos(ctx); + tokens[batch_start] = llama_token_bos(llama_get_model(ctx)); } if (llama_decode(ctx, llama_batch_get_one(tokens.data() + batch_start, batch_size, j * n_batch, 0))) { diff --git a/examples/server/server.cpp b/examples/server/server.cpp index 210c41736..460a6baa0 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -454,7 +454,7 @@ struct llama_client_slot } void release() { - if (state == PROCESSING) + if (state == IDLE || state == PROCESSING) { t_token_generation = (ggml_time_us() - t_start_genereration) / 1e3; command = RELEASE; @@ -726,7 +726,7 @@ struct llama_server_context if (json_value(data, "ignore_eos", false)) { - slot->sparams.logit_bias[llama_token_eos(ctx)] = -INFINITY; + slot->sparams.logit_bias[llama_token_eos(model)] = -INFINITY; } const auto &logit_bias = data.find("logit_bias"); @@ -754,6 +754,7 @@ struct llama_server_context } slot->params.antiprompt.clear(); + const auto &stop = data.find("stop"); if (stop != data.end() && stop->is_array()) { @@ -867,7 +868,7 @@ struct llama_server_context kv_cache_clear(); - for (int32_t i = 0; i < batch.n_tokens; ++i) + for (int i = 0; i < (int) system_tokens.size(); ++i) { llama_batch_add(batch, system_tokens[i], i, { 0 }, false); } @@ -894,16 +895,8 @@ struct llama_server_context { slot.release(); } - wait_all_are_idle(); - all_slots_are_idle = true; - // wait until system prompt load system_need_update = true; - while (system_need_update) - { - std::this_thread::sleep_for(std::chrono::milliseconds(5)); - } - // system prompt loaded, continue } void process_system_prompt_data(const json &sys_props) { @@ -915,26 +908,6 @@ struct llama_server_context { notify_system_prompt_changed(); } - else - { - system_need_update = true; - } - } - - void wait_all_are_idle() { - bool wait = true; - while (wait) - { - wait = false; - for (auto &slot : slots) - { - if (!slot.available()) - { - wait = true; - break; - } - } - } } static size_t find_stopping_strings(const std::string &text, const size_t last_token_size, @@ -965,7 +938,6 @@ struct llama_server_context slot.has_next_token = false; } stop_pos = pos; - } } @@ -1056,7 +1028,7 @@ struct llama_server_context slot.has_next_token = false; } - if (!slot.cache_tokens.empty() && result.tok == llama_token_eos(ctx)) + if (!slot.cache_tokens.empty() && result.tok == llama_token_eos(model)) { slot.stopped_eos = true; slot.has_next_token = false; @@ -1130,7 +1102,7 @@ struct llama_server_context json get_formated_generation(llama_client_slot &slot) { - const auto eos_bias = slot.sparams.logit_bias.find(llama_token_eos(ctx)); + const auto eos_bias = slot.sparams.logit_bias.find(llama_token_eos(model)); const bool ignore_eos = eos_bias != slot.sparams.logit_bias.end() && eos_bias->second < 0.0f && std::isinf(eos_bias->second); return json { @@ -1444,7 +1416,7 @@ struct llama_server_context process_tasks(); // update the system prompt wait until all slots are idle state - if (system_need_update) + if (system_need_update && all_slots_are_idle) { LOG_TEE("updating system prompt\n"); update_system_prompt(); @@ -1498,7 +1470,7 @@ struct llama_server_context for (auto & slot : slots) { // release the slot - if (slot.state == PROCESSING && slot.command == RELEASE) + if (slot.command == RELEASE) { slot.state = IDLE; slot.command = NONE; @@ -1509,7 +1481,7 @@ struct llama_server_context continue; } - if (slot.state == IDLE || slot.command == RELEASE) + if (slot.state == IDLE) { continue; } @@ -1530,6 +1502,17 @@ struct llama_server_context { for (auto & slot : slots) { + const bool has_prompt = slot.prompt.is_array() || (slot.prompt.is_string() && !slot.prompt.get().empty()); + + // empty prompt passed -> release the slot and send empty response + if (slot.state == IDLE && slot.command == LOAD_PROMPT && !has_prompt) + { + slot.release(); + slot.print_timings(); + send_final_response(slot); + continue; + } + // need process the prompt if (slot.state == IDLE && slot.command == LOAD_PROMPT) { @@ -1555,11 +1538,11 @@ struct llama_server_context suffix_tokens.erase(suffix_tokens.begin()); } - prefix_tokens.insert(prefix_tokens.begin(), llama_token_prefix(ctx)); - prefix_tokens.insert(prefix_tokens.begin(), llama_token_bos(ctx)); // always add BOS - prefix_tokens.insert(prefix_tokens.end(), llama_token_suffix(ctx)); + prefix_tokens.insert(prefix_tokens.begin(), llama_token_prefix(model)); + prefix_tokens.insert(prefix_tokens.begin(), llama_token_bos(model)); // always add BOS + prefix_tokens.insert(prefix_tokens.end(), llama_token_suffix(model)); prefix_tokens.insert(prefix_tokens.end(), suffix_tokens.begin(), suffix_tokens.end()); - prefix_tokens.push_back(llama_token_middle(ctx)); + prefix_tokens.push_back(llama_token_middle(model)); prompt_tokens = prefix_tokens; } else @@ -1749,8 +1732,8 @@ struct llama_server_context if (!process_token(result, slot)) { slot.release(); - send_final_response(slot); slot.print_timings(); + send_final_response(slot); } slot.i_batch = -1; @@ -1766,21 +1749,22 @@ static void server_print_usage(const char *argv0, const gpt_params ¶ms, printf("usage: %s [options]\n", argv0); printf("\n"); printf("options:\n"); - printf(" -h, --help show this help message and exit\n"); - printf(" -v, --verbose verbose output (default: %s)\n", server_verbose ? "enabled" : "disabled"); - printf(" -t N, --threads N number of threads to use during computation (default: %d)\n", params.n_threads); - printf(" -c N, --ctx-size N size of the prompt context (default: %d)\n", params.n_ctx); + printf(" -h, --help show this help message and exit\n"); + printf(" -v, --verbose verbose output (default: %s)\n", server_verbose ? "enabled" : "disabled"); + printf(" -t N, --threads N number of threads to use during computation (default: %d)\n", params.n_threads); + printf(" -tb N, --threads-batch N number of threads to use during batch and prompt processing (default: same as --threads)\n"); + printf(" -c N, --ctx-size N size of the prompt context (default: %d)\n", params.n_ctx); printf(" --rope-scaling {none,linear,yarn}\n"); - printf(" RoPE frequency scaling method, defaults to linear unless specified by the model\n"); - printf(" --rope-freq-base N RoPE base frequency (default: loaded from model)\n"); - printf(" --rope-freq-scale N RoPE frequency scaling factor, expands context by a factor of 1/N\n"); - printf(" --yarn-ext-factor N YaRN: extrapolation mix factor (default: 1.0, 0.0 = full interpolation)\n"); - printf(" --yarn-attn-factor N YaRN: scale sqrt(t) or attention magnitude (default: 1.0)\n"); - printf(" --yarn-beta-slow N YaRN: high correction dim or alpha (default: %.1f)\n", params.yarn_beta_slow); - printf(" --yarn-beta-fast N YaRN: low correction dim or beta (default: %.1f)\n", params.yarn_beta_fast); - printf(" -b N, --batch-size N batch size for prompt processing (default: %d)\n", params.n_batch); - printf(" --memory-f32 use f32 instead of f16 for memory key+value (default: disabled)\n"); - printf(" not recommended: doubles context memory required and no measurable increase in quality\n"); + printf(" RoPE frequency scaling method, defaults to linear unless specified by the model\n"); + printf(" --rope-freq-base N RoPE base frequency (default: loaded from model)\n"); + printf(" --rope-freq-scale N RoPE frequency scaling factor, expands context by a factor of 1/N\n"); + printf(" --yarn-ext-factor N YaRN: extrapolation mix factor (default: 1.0, 0.0 = full interpolation)\n"); + printf(" --yarn-attn-factor N YaRN: scale sqrt(t) or attention magnitude (default: 1.0)\n"); + printf(" --yarn-beta-slow N YaRN: high correction dim or alpha (default: %.1f)\n", params.yarn_beta_slow); + printf(" --yarn-beta-fast N YaRN: low correction dim or beta (default: %.1f)\n", params.yarn_beta_fast); + printf(" -b N, --batch-size N batch size for prompt processing (default: %d)\n", params.n_batch); + printf(" --memory-f32 use f32 instead of f16 for memory key+value (default: disabled)\n"); + printf(" not recommended: doubles context memory required and no measurable increase in quality\n"); if (llama_mlock_supported()) { printf(" --mlock force system to keep model in RAM rather than swapping or compressing\n"); @@ -1975,6 +1959,15 @@ static void server_params_parse(int argc, char **argv, server_params &sparams, } params.n_threads = std::stoi(argv[i]); } + else if (arg == "--threads-batch" || arg == "-tb") + { + if (++i >= argc) + { + invalid_param = true; + break; + } + params.n_threads_batch = std::stoi(argv[i]); + } else if (arg == "-b" || arg == "--batch-size") { if (++i >= argc) @@ -2336,7 +2329,7 @@ int main(int argc, char **argv) if (!json_value(data, "stream", false)) { std::string completion_text; task_result result = llama.next_result(task_id); - if(!result.error && result.stop) { + if (!result.error && result.stop) { res.set_content(result.result_json.dump(-1, ' ', false, json::error_handler_t::replace), "application/json"); } else @@ -2363,7 +2356,7 @@ int main(int argc, char **argv) { return false; } - if(result.stop) { + if (result.stop) { break; } } else { diff --git a/examples/simple/simple.cpp b/examples/simple/simple.cpp index 55385f566..f376c0509 100644 --- a/examples/simple/simple.cpp +++ b/examples/simple/simple.cpp @@ -138,7 +138,7 @@ int main(int argc, char ** argv) { const llama_token new_token_id = llama_sample_token_greedy(ctx, &candidates_p); // is it an end of stream? - if (new_token_id == llama_token_eos(ctx) || n_cur == n_len) { + if (new_token_id == llama_token_eos(model) || n_cur == n_len) { LOG_TEE("\n"); break; diff --git a/examples/speculative/speculative.cpp b/examples/speculative/speculative.cpp index 894321ce9..92ad27e8e 100644 --- a/examples/speculative/speculative.cpp +++ b/examples/speculative/speculative.cpp @@ -163,7 +163,7 @@ int main(int argc, char ** argv) { printf("%s", token_str.c_str()); fflush(stdout); - if (id == llama_token_eos(ctx_tgt)) { + if (id == llama_token_eos(model_tgt)) { has_eos = true; } diff --git a/ggml-cuda.cu b/ggml-cuda.cu index 4c6a36ca1..a43a5873d 100644 --- a/ggml-cuda.cu +++ b/ggml-cuda.cu @@ -29,6 +29,8 @@ #define __shfl_xor_sync(mask, var, laneMask, width) __shfl_xor(var, laneMask, width) #define cublasCreate hipblasCreate #define cublasGemmEx hipblasGemmEx +#define cublasGemmBatchedEx hipblasGemmBatchedEx +#define cublasGemmStridedBatchedEx hipblasGemmStridedBatchedEx #define cublasHandle_t hipblasHandle_t #define cublasSetMathMode(handle, mode) CUBLAS_STATUS_SUCCESS #define cublasSetStream hipblasSetStream @@ -4326,13 +4328,13 @@ static __global__ void mul_mat_vec_nc_f16_f32( // nc == non-contiguous const half * x = (const half *) vx; - const int row_x = blockDim.y*blockIdx.y + threadIdx.y; - const int channel = blockDim.z*blockIdx.z + threadIdx.z; + const int row_x = blockDim.y*blockIdx.y + threadIdx.y; + const int channel = blockDim.z*blockIdx.z + threadIdx.z; const int channel_x = channel / channel_x_divisor; - const int nrows_y = ncols_x; + const int nrows_y = ncols_x; const int nrows_dst = nrows_x; - const int row_dst = row_x; + const int row_dst = row_x; const int idst = channel*nrows_dst + row_dst; @@ -4345,13 +4347,13 @@ static __global__ void mul_mat_vec_nc_f16_f32( // nc == non-contiguous break; } - const int ix = channel_x*channel_stride_x + row_x*row_stride_x + col_x; - const float xi = __half2float(x[ix]); - const int row_y = col_x; + const int ix = channel_x*channel_stride_x + row_x*row_stride_x + col_x; const int iy = channel*nrows_y + row_y; + const float xi = __half2float(x[ix]); + tmp += xi * y[iy]; } @@ -5698,10 +5700,10 @@ void ggml_init_cublas() { GGML_ASSERT(g_device_count <= GGML_CUDA_MAX_DEVICES); int64_t total_vram = 0; fprintf(stderr, "%s: found %d " GGML_CUDA_NAME " devices:\n", __func__, g_device_count); - for (int64_t id = 0; id < g_device_count; ++id) { + for (int id = 0; id < g_device_count; ++id) { cudaDeviceProp prop; CUDA_CHECK(cudaGetDeviceProperties(&prop, id)); - fprintf(stderr, " Device %ld: %s, compute capability %d.%d\n", id, prop.name, prop.major, prop.minor); + fprintf(stderr, " Device %d: %s, compute capability %d.%d\n", id, prop.name, prop.major, prop.minor); g_tensor_split[id] = total_vram; total_vram += prop.totalGlobalMem; @@ -5711,15 +5713,15 @@ void ggml_init_cublas() { g_compute_capabilities[id] = 100*prop.major + 10*prop.minor; #endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) } - for (int64_t id = 0; id < g_device_count; ++id) { + for (int id = 0; id < g_device_count; ++id) { g_tensor_split[id] /= total_vram; } - for (int64_t id = 0; id < g_device_count; ++id) { + for (int id = 0; id < g_device_count; ++id) { CUDA_CHECK(ggml_cuda_set_device(id)); // create cuda streams - for (int64_t is = 0; is < MAX_STREAMS; ++is) { + for (int is = 0; is < MAX_STREAMS; ++is) { CUDA_CHECK(cudaStreamCreateWithFlags(&g_cudaStreams[id][is], cudaStreamNonBlocking)); } @@ -7063,7 +7065,8 @@ static void ggml_cuda_mul_mat_vec_p021(const ggml_tensor * src0, const ggml_tens } static void ggml_cuda_mul_mat_vec_nc(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst){ - GGML_ASSERT(!ggml_is_contiguous(src0) && ggml_is_contiguous(src1)); + GGML_ASSERT(!ggml_is_transposed(src0)); + GGML_ASSERT(!ggml_is_transposed(src1)); GGML_ASSERT(!ggml_is_permuted(src0)); GGML_ASSERT(src0->backend != GGML_BACKEND_GPU_SPLIT); GGML_ASSERT(src0->type == GGML_TYPE_F16); @@ -7073,11 +7076,11 @@ static void ggml_cuda_mul_mat_vec_nc(const ggml_tensor * src0, const ggml_tensor const int64_t ne01 = src0->ne[1]; const int64_t ne02 = src0->ne[2]; - const int64_t ne12 = src1->ne[2]; - const int64_t nb01 = src0->nb[1]; const int64_t nb02 = src0->nb[2]; + const int64_t ne12 = src1->ne[2]; + CUDA_CHECK(ggml_cuda_set_device(g_main_device)); cudaStream_t main_stream = g_cudaStreams[g_main_device][0]; @@ -7096,6 +7099,159 @@ static void ggml_cuda_mul_mat_vec_nc(const ggml_tensor * src0, const ggml_tensor ggml_mul_mat_vec_nc_f16_f32_cuda(src0_ddq, src1_ddf, dst_ddf, ne00, ne01, row_stride_x, ne02, ne12, channel_stride_x, main_stream); } +static void ggml_cuda_mul_mat_mat_batched_cublas(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst){ + GGML_ASSERT(!ggml_is_transposed(src0)); + GGML_ASSERT(!ggml_is_transposed(src1)); + GGML_ASSERT(src0->backend != GGML_BACKEND_GPU_SPLIT); + GGML_ASSERT(src0->type == GGML_TYPE_F16); + GGML_ASSERT(src1->type == GGML_TYPE_F32); + + const int64_t ne00 = src0->ne[0]; GGML_UNUSED(ne00); + const int64_t ne01 = src0->ne[1]; + const int64_t ne02 = src0->ne[2]; + const int64_t ne03 = src0->ne[3]; + + const int64_t nb01 = src0->nb[1]; + const int64_t nb02 = src0->nb[2]; GGML_UNUSED(nb02); + const int64_t nb03 = src0->nb[3]; GGML_UNUSED(nb03); + + const int64_t ne10 = src1->ne[0]; + const int64_t ne11 = src1->ne[1]; + const int64_t ne12 = src1->ne[2]; + const int64_t ne13 = src1->ne[3]; + + const int64_t nb11 = src1->nb[1]; + const int64_t nb12 = src1->nb[2]; GGML_UNUSED(nb12); + const int64_t nb13 = src1->nb[3]; GGML_UNUSED(nb13); + + const int64_t ne1 = ggml_nelements(src1); + const int64_t ne = ggml_nelements(dst); + + CUDA_CHECK(ggml_cuda_set_device(g_main_device)); + cudaStream_t main_stream = g_cudaStreams[g_main_device][0]; + + int id; + CUDA_CHECK(cudaGetDevice(&id)); + CUBLAS_CHECK(cublasSetStream(g_cublas_handles[id], main_stream)); + + ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra; + void * src0_ddq = src0_extra->data_device[g_main_device]; + half * src0_as_f16 = (half *) src0_ddq; + + ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra; + float * src1_ddf = (float *) src1_extra->data_device[g_main_device]; + + ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra; + float * dst_ddf = (float *) dst_extra->data_device[g_main_device]; + + // convert src1 to fp16 + const to_fp16_cuda_t to_fp16_cuda = ggml_get_to_fp16_cuda(src1->type); + GGML_ASSERT(to_fp16_cuda != nullptr); + + size_t src1_as = 0; + half * src1_as_f16 = (half *) ggml_cuda_pool_malloc(ne1 * sizeof(half), &src1_as); + to_fp16_cuda(src1_ddf, src1_as_f16, ne1, main_stream); + + size_t dst_as = 0; + half * dst_f16 = (half *) ggml_cuda_pool_malloc(ne * sizeof(half), &dst_as); + + GGML_ASSERT(ne12 % ne02 == 0); + GGML_ASSERT(ne13 % ne03 == 0); + + // broadcast factors + const int64_t r2 = ne12/ne02; + const int64_t r3 = ne13/ne03; + + const half alpha_f16 = 1.0f; + const half beta_f16 = 0.0f; + +#if 0 + // use cublasGemmEx + { + for (int i13 = 0; i13 < ne13; ++i13) { + for (int i12 = 0; i12 < ne12; ++i12) { + int i03 = i13 / r3; + int i02 = i12 / r2; + + CUBLAS_CHECK( + cublasGemmEx(g_cublas_handles[id], CUBLAS_OP_T, CUBLAS_OP_N, + ne01, ne11, ne10, + &alpha_f16, (const char *) src0_as_f16 + i02*src0->nb[2] + i03*src0->nb[3] , CUDA_R_16F, nb01/sizeof(half), + (const char *) src1_as_f16 + i12*src1->nb[2]/2 + i13*src1->nb[3]/2, CUDA_R_16F, nb11/sizeof(float), + &beta_f16, ( char *) dst_f16 + i12* dst->nb[2]/2 + i13* dst->nb[3]/2, CUDA_R_16F, ne01, + CUBLAS_COMPUTE_16F, + CUBLAS_GEMM_DEFAULT_TENSOR_OP)); + } + } + } +#else + if (r2 == 1 && r3 == 1 && src0->nb[2]*src0->ne[2] == src0->nb[3] && src1->nb[2]*src1->ne[2] == src1->nb[3]) { + // there is no broadcast and src0, src1 are contiguous across dims 2, 3 + // use cublasGemmStridedBatchedEx + CUBLAS_CHECK( + cublasGemmStridedBatchedEx(g_cublas_handles[id], CUBLAS_OP_T, CUBLAS_OP_N, + ne01, ne11, ne10, + &alpha_f16, (const char *) src0_as_f16, CUDA_R_16F, nb01/sizeof(half), src0->nb[2]/sizeof(half), // strideA + (const char *) src1_as_f16, CUDA_R_16F, nb11/sizeof(float), src1->nb[2]/sizeof(float), // strideB + &beta_f16, ( char *) dst_f16, CUDA_R_16F, ne01, dst->nb[2]/sizeof(float), // strideC + ne12*ne13, + CUBLAS_COMPUTE_16F, + CUBLAS_GEMM_DEFAULT_TENSOR_OP)); + } else { + // use cublasGemmBatchedEx + // TODO: https://github.com/ggerganov/llama.cpp/pull/3749#discussion_r1369997000 + const int ne23 = ne12*ne13; + + // TODO: avoid this alloc + void ** ptrs = (void **) malloc(3*ne23*sizeof(void *)); + + for (int i13 = 0; i13 < ne13; ++i13) { + for (int i12 = 0; i12 < ne12; ++i12) { + int i03 = i13 / r3; + int i02 = i12 / r2; + + ptrs[0*ne23 + i12 + i13*ne12] = (char *) src0_as_f16 + i02*src0->nb[2] + i03*src0->nb[3]; + ptrs[1*ne23 + i12 + i13*ne12] = (char *) src1_as_f16 + i12*src1->nb[2]/2 + i13*src1->nb[3]/2; + ptrs[2*ne23 + i12 + i13*ne12] = (char *) dst_f16 + i12* dst->nb[2]/2 + i13* dst->nb[3]/2; + } + } + + // allocate device memory for pointers + void ** ptrs_as = nullptr; + CUDA_CHECK(cudaMalloc(&ptrs_as, 3*ne23*sizeof(void *))); + + // TODO: this does not work for some reason -- not sure why? + //size_t ptrs_s = 0; + //ptrs_as = (void **) ggml_cuda_pool_malloc(3*ne23*sizeof(void *), &ptrs_s); + + // copy pointers to device + CUDA_CHECK(cudaMemcpy(ptrs_as, ptrs, 3*ne23*sizeof(void *), cudaMemcpyHostToDevice)); + + free(ptrs); + + CUBLAS_CHECK( + cublasGemmBatchedEx(g_cublas_handles[id], CUBLAS_OP_T, CUBLAS_OP_N, + ne01, ne11, ne10, + &alpha_f16, (const void **) (ptrs_as + 0*ne23), CUDA_R_16F, nb01/sizeof(half), + (const void **) (ptrs_as + 1*ne23), CUDA_R_16F, nb11/sizeof(float), + &beta_f16, ( void **) (ptrs_as + 2*ne23), CUDA_R_16F, ne01, + ne23, + CUBLAS_COMPUTE_16F, + CUBLAS_GEMM_DEFAULT_TENSOR_OP)); + + // free device memory for pointers + CUDA_CHECK(cudaFree(ptrs_as)); + //ggml_cuda_pool_free(ptrs_as, ptrs_s); + } +#endif + + const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(GGML_TYPE_F16); + to_fp32_cuda(dst_f16, dst_ddf, ne, main_stream); + + ggml_cuda_pool_free(src1_as_f16, src1_as); + ggml_cuda_pool_free(dst_f16, dst_as); +} + static void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { bool all_on_device = (src0->backend == GGML_BACKEND_GPU || src0->backend == GGML_BACKEND_GPU_SPLIT) && src1->backend == GGML_BACKEND_GPU && dst->backend == GGML_BACKEND_GPU; @@ -7108,10 +7264,22 @@ static void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1 } } + // debug helpers + //printf("src0: %8d %8d %8d %8d\n", src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3]); + //printf(" %8d %8d %8d %8d\n", src0->nb[0], src0->nb[1], src0->nb[2], src0->nb[3]); + //printf("src1: %8d %8d %8d %8d\n", src1->ne[0], src1->ne[1], src1->ne[2], src1->ne[3]); + //printf(" %8d %8d %8d %8d\n", src1->nb[0], src1->nb[1], src1->nb[2], src1->nb[3]); + //printf("src0 is contiguous %d, transposed %d, type = %s, name = %s\n", ggml_is_contiguous(src0), ggml_is_transposed(src0), ggml_type_name(src0->type), src0->name); + //printf("src1 is contiguous %d, transposed %d, type = %s, name = %s\n", ggml_is_contiguous(src1), ggml_is_transposed(src1), ggml_type_name(src1->type), src1->name); + if (all_on_device && src0->type == GGML_TYPE_F16 && ggml_is_permuted(src0) && ggml_is_permuted(src1) && src1->ne[1] == 1) { + // KQ ggml_cuda_mul_mat_vec_p021(src0, src1, dst); - } else if (all_on_device && !ggml_is_contiguous(src0) && ggml_is_contiguous(src1) && src1->ne[1] == 1) { + } else if (all_on_device && src0->type == GGML_TYPE_F16 && !ggml_is_contiguous(src0) && !ggml_is_transposed(src1) && src1->ne[1] == 1) { + // KQV ggml_cuda_mul_mat_vec_nc(src0, src1, dst); + } else if (all_on_device && src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F32 && !ggml_is_transposed(src0) && !ggml_is_transposed(src1) && src1->ne[2]*src1->ne[3] > 1) { + ggml_cuda_mul_mat_mat_batched_cublas(src0, src1, dst); } else if (src0->type == GGML_TYPE_F32) { ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_mul_mat_cublas, false); } else if (ggml_is_quantized(src0->type) || src0->type == GGML_TYPE_F16) { diff --git a/ggml-metal.m b/ggml-metal.m index d3165a218..2cc7aad3e 100644 --- a/ggml-metal.m +++ b/ggml-metal.m @@ -62,6 +62,7 @@ struct ggml_metal_context { GGML_METAL_DECL_KERNEL(mul); GGML_METAL_DECL_KERNEL(mul_row); // TODO: avoid this extra kernel, instead extend the "mul" kernel to support broadcast GGML_METAL_DECL_KERNEL(scale); + GGML_METAL_DECL_KERNEL(scale_4); GGML_METAL_DECL_KERNEL(silu); GGML_METAL_DECL_KERNEL(relu); GGML_METAL_DECL_KERNEL(gelu); @@ -249,6 +250,7 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) { GGML_METAL_ADD_KERNEL(mul); GGML_METAL_ADD_KERNEL(mul_row); GGML_METAL_ADD_KERNEL(scale); + GGML_METAL_ADD_KERNEL(scale_4); GGML_METAL_ADD_KERNEL(silu); GGML_METAL_ADD_KERNEL(relu); GGML_METAL_ADD_KERNEL(gelu); @@ -347,6 +349,7 @@ void ggml_metal_free(struct ggml_metal_context * ctx) { GGML_METAL_DEL_KERNEL(mul); GGML_METAL_DEL_KERNEL(mul_row); GGML_METAL_DEL_KERNEL(scale); + GGML_METAL_DEL_KERNEL(scale_4); GGML_METAL_DEL_KERNEL(silu); GGML_METAL_DEL_KERNEL(relu); GGML_METAL_DEL_KERNEL(gelu); @@ -923,15 +926,20 @@ void ggml_metal_graph_compute( const float scale = *(const float *) src1->data; - [encoder setComputePipelineState:ctx->pipeline_scale]; + int64_t n = ggml_nelements(dst); + + if (n % 4 == 0) { + n /= 4; + [encoder setComputePipelineState:ctx->pipeline_scale_4]; + } else { + [encoder setComputePipelineState:ctx->pipeline_scale]; + } + [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; [encoder setBuffer:id_dst offset:offs_dst atIndex:1]; [encoder setBytes:&scale length:sizeof(scale) atIndex:2]; - const int64_t n = ggml_nelements(dst); - GGML_ASSERT(n % 4 == 0); - - [encoder dispatchThreadgroups:MTLSizeMake(n/4, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)]; + [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)]; } break; case GGML_OP_UNARY: switch (ggml_get_unary_op(gf->nodes[i])) { diff --git a/ggml-metal.metal b/ggml-metal.metal index fcb5327e8..ddfe37813 100644 --- a/ggml-metal.metal +++ b/ggml-metal.metal @@ -125,9 +125,17 @@ kernel void kernel_mul_row( } kernel void kernel_scale( + device const float * src0, + device float * dst, + constant float & scale, + uint tpig[[thread_position_in_grid]]) { + dst[tpig] = src0[tpig] * scale; +} + +kernel void kernel_scale_4( device const float4 * src0, device float4 * dst, - constant float & scale, + constant float & scale, uint tpig[[thread_position_in_grid]]) { dst[tpig] = src0[tpig] * scale; } diff --git a/ggml.c b/ggml.c index 5403ae1d1..0034067d5 100644 --- a/ggml.c +++ b/ggml.c @@ -572,7 +572,6 @@ int64_t ggml_cycles_per_ms(void) { #define ggml_perf_cycles_per_ms() 0 #endif - // // cache line // @@ -1829,7 +1828,6 @@ ggml_type_traits_t ggml_internal_get_type_traits(enum ggml_type type) { return type_traits[type]; } - // // simd mappings // @@ -4058,16 +4056,17 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = { "ALIBI", "CLAMP", "CONV_1D", + "CONV_1D_STAGE_0", + "CONV_1D_STAGE_1", "CONV_TRANSPOSE_1D", "CONV_2D", + "CONV_2D_STAGE_0", + "CONV_2D_STAGE_1", "CONV_TRANSPOSE_2D", "POOL_1D", "POOL_2D", "UPSCALE", - "CONV_1D_STAGE_0", - "CONV_1D_STAGE_1", - "FLASH_ATTN", "FLASH_FF", "FLASH_ATTN_BACK", @@ -4093,7 +4092,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = { "CROSS_ENTROPY_LOSS_BACK", }; -static_assert(GGML_OP_COUNT == 71, "GGML_OP_COUNT != 71"); +static_assert(GGML_OP_COUNT == 73, "GGML_OP_COUNT != 73"); static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = { "none", @@ -4144,16 +4143,17 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = { "alibi(x)", "clamp(x)", "conv_1d(x)", + "conv_1d_stage_0(x)", + "conv_1d_stage_1(x)", "conv_transpose_1d(x)", "conv_2d(x)", + "conv_2d_stage_0(x)", + "conv_2d_stage_1(x)", "conv_transpose_2d(x)", "pool_1d(x)", "pool_2d(x)", "upscale(x)", - "conv_1d_stage_0(x)", - "conv_1d_stage_1(x)", - "flash_attn(x)", "flash_ff(x)", "flash_attn_back(x)", @@ -4179,7 +4179,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = { "cross_entropy_loss_back(x,y)", }; -static_assert(GGML_OP_COUNT == 71, "GGML_OP_COUNT != 71"); +static_assert(GGML_OP_COUNT == 73, "GGML_OP_COUNT != 73"); static_assert(GGML_OP_POOL_COUNT == 2, "GGML_OP_POOL_COUNT != 2"); @@ -4210,8 +4210,10 @@ static void ggml_setup_op_has_task_pass(void) { p[GGML_OP_CONV_1D ] = true; p[GGML_OP_CONV_1D_STAGE_0 ] = true; p[GGML_OP_CONV_1D_STAGE_1 ] = true; - p[GGML_OP_CONV_2D ] = true; p[GGML_OP_CONV_TRANSPOSE_1D ] = true; + p[GGML_OP_CONV_2D ] = true; + p[GGML_OP_CONV_2D_STAGE_0 ] = true; + p[GGML_OP_CONV_2D_STAGE_1 ] = true; p[GGML_OP_CONV_TRANSPOSE_2D ] = true; p[GGML_OP_FLASH_ATTN_BACK ] = true; p[GGML_OP_CROSS_ENTROPY_LOSS ] = true; @@ -5955,7 +5957,6 @@ struct ggml_tensor * ggml_sqrt_inplace( return ggml_sqrt_impl(ctx, a, true); } - // ggml_log static struct ggml_tensor * ggml_log_impl( @@ -6009,7 +6010,6 @@ struct ggml_tensor * ggml_sum( return result; } - // ggml_sum_rows struct ggml_tensor * ggml_sum_rows( @@ -6641,7 +6641,6 @@ struct ggml_tensor * ggml_set_2d_inplace( return ggml_set_impl(ctx, a, b, nb1, a->nb[2], a->nb[3], offset, false); } - // ggml_cpy static struct ggml_tensor * ggml_cpy_impl( @@ -6721,7 +6720,6 @@ struct ggml_tensor * ggml_cont_inplace( return ggml_cont_impl(ctx, a, true); } - // make contiguous, with new shape GGML_API struct ggml_tensor * ggml_cont_1d( struct ggml_context * ctx, @@ -7174,7 +7172,6 @@ struct ggml_tensor * ggml_diag( return result; } - // ggml_diag_mask_inf static struct ggml_tensor * ggml_diag_mask_inf_impl( @@ -7286,7 +7283,6 @@ struct ggml_tensor * ggml_soft_max_inplace( return ggml_soft_max_impl(ctx, a, true); } - // ggml_soft_max_back static struct ggml_tensor * ggml_soft_max_back_impl( @@ -7732,7 +7728,11 @@ GGML_API struct ggml_tensor * ggml_conv_transpose_1d( // ggml_conv_2d -struct ggml_tensor * ggml_conv_2d( +// im2col: [N, IC, IH, IW] => [N, OH, OW, IC*KH*KW] +// a: [OC,IC, KH, KW] +// b: [N, IC, IH, IW] +// result: [N, OH, OW, IC*KH*KW] +static struct ggml_tensor * ggml_conv_2d_stage_0( struct ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * b, @@ -7751,17 +7751,21 @@ struct ggml_tensor * ggml_conv_2d( is_node = true; } + const int64_t OH = ggml_calc_conv_output_size(b->ne[1], a->ne[1], s1, p1, d1); + const int64_t OW = ggml_calc_conv_output_size(b->ne[0], a->ne[0], s0, p0, d0); + const int64_t ne[4] = { - ggml_calc_conv_output_size(b->ne[0], a->ne[0], s0, p0, d0), - ggml_calc_conv_output_size(b->ne[1], a->ne[1], s1, p1, d1), - a->ne[3], b->ne[3], + a->ne[2] * a->ne[1] * a->ne[0], + OW, + OH, + b->ne[3], }; - struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne); + struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F16, 4, ne); int32_t params[] = { s0, s1, p0, p1, d0, d1 }; ggml_set_op_params(result, params, sizeof(params)); - result->op = GGML_OP_CONV_2D; + result->op = GGML_OP_CONV_2D_STAGE_0; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; result->src[0] = a; result->src[1] = b; @@ -7770,8 +7774,61 @@ struct ggml_tensor * ggml_conv_2d( } -// ggml_conv_2d_sk_p0 +// gemm: [N, OC, OH, OW] = [OC, IC * KH * KW] x [N*OH*OW, IC * KH * KW] +// a: [OC, IC, KH, KW] +// b: [N, OH, OW, IC * KH * KW] +// result: [N, OC, OH, OW] +static struct ggml_tensor * ggml_conv_2d_stage_1( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b) { + bool is_node = false; + + if (a->grad || b->grad) { + GGML_ASSERT(false); // TODO: implement backward + is_node = true; + } + + const int64_t ne[4] = { + b->ne[1], + b->ne[2], + a->ne[3], + b->ne[3], + }; + struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne); + + result->op = GGML_OP_CONV_2D_STAGE_1; + result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result->src[0] = a; + result->src[1] = b; + + return result; + +} + +// a: [OC,IC, KH, KW] +// b: [N, IC, IH, IW] +// result: [N, OC, OH, OW] +struct ggml_tensor * ggml_conv_2d( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + int s0, + int s1, + int p0, + int p1, + int d0, + int d1) { + + struct ggml_tensor * result = ggml_conv_2d_stage_0(ctx, a, b, s0, s1, p0, p1, d0, d1); // [N, OH, OW, IC * KH * KW] + result = ggml_conv_2d_stage_1(ctx, a, result); + + return result; + +} + +// ggml_conv_2d_sk_p0 struct ggml_tensor * ggml_conv_2d_sk_p0( struct ggml_context * ctx, struct ggml_tensor * a, @@ -8210,7 +8267,6 @@ static struct ggml_tensor * ggml_add_rel_pos_impl( return result; } - struct ggml_tensor * ggml_add_rel_pos( struct ggml_context * ctx, struct ggml_tensor * a, @@ -8655,8 +8711,6 @@ struct ggml_tensor * ggml_map_custom3_inplace( return ggml_map_custom3_impl(ctx, a, b, c, fun, n_tasks, userdata, true); } - - // ggml_cross_entropy_loss struct ggml_tensor * ggml_cross_entropy_loss( @@ -9858,7 +9912,6 @@ static void ggml_compute_forward_add1( } } - // ggml_compute_forward_acc static void ggml_compute_forward_acc_f32( @@ -9998,7 +10051,6 @@ static void ggml_compute_forward_sub_f32( const int i2 = (ir - i3*ne2*ne1)/ne1; const int i1 = (ir - i3*ne2*ne1 - i2*ne1); - #ifdef GGML_USE_ACCELERATE vDSP_vsub( (float *) ((char *) src1->data + i3*nb13 + i2*nb12 + i1*nb11), 1, @@ -10179,7 +10231,6 @@ static void ggml_compute_forward_div_f32( const int i2 = (ir - i3*ne2*ne1)/ne1; const int i1 = (ir - i3*ne2*ne1 - i2*ne1); - #ifdef GGML_USE_ACCELERATE UNUSED(ggml_vec_div_f32); @@ -10317,7 +10368,6 @@ static void ggml_compute_forward_sqrt( } } - // ggml_compute_forward_log static void ggml_compute_forward_log_f32( @@ -12150,7 +12200,6 @@ static void ggml_compute_forward_out_prod_f32( } } - //int64_t t1 = ggml_perf_time_us(); //static int64_t acc = 0; //acc += t1 - t0; @@ -12346,7 +12395,6 @@ static void ggml_compute_forward_scale_f32( const size_t nb1 = dst->nb[1]; - for (int i1 = ir0; i1 < ir1; i1++) { if (dst->data != src0->data) { // src0 is same shape as dst => same indices @@ -12744,7 +12792,6 @@ static void ggml_compute_forward_get_rows_back_f32( } } - static void ggml_compute_forward_get_rows_back( const struct ggml_compute_params * params, const struct ggml_tensor * src0, @@ -14087,6 +14134,7 @@ static void ggml_compute_forward_conv_1d_f32( } } +// TODO: reuse ggml_mul_mat or implement ggml_im2col and remove stage_0 and stage_1 static void gemm_f16_out_f32(int64_t m, int64_t n, int64_t k, ggml_fp16_t * A, ggml_fp16_t * B, @@ -14388,6 +14436,9 @@ static void ggml_compute_forward_conv_transpose_1d_f16_f32( } } + // need to zero dst since we are accumulating into it + memset(dst->data, 0, ggml_nbytes(dst)); + return; } @@ -14460,7 +14511,7 @@ static void ggml_compute_forward_conv_transpose_1d_f32( const float * const src = (float *)((char *) src0->data + i02*nb02 + i01*nb01); float * dst_data = wdata + i01*ne00*ne02; for (int64_t i00 = 0; i00 < ne00; i00++) { - dst_data[i01*ne00*ne02 + i00*ne02 + i02] = src[i00]; + dst_data[i00*ne02 + i02] = src[i00]; } } } @@ -14479,6 +14530,9 @@ static void ggml_compute_forward_conv_transpose_1d_f32( } } + // need to zero dst since we are accumulating into it + memset(dst->data, 0, ggml_nbytes(dst)); + return; } @@ -14540,6 +14594,144 @@ static void ggml_compute_forward_conv_transpose_1d( // ggml_compute_forward_conv_2d +// src0: kernel [OC, IC, KH, KW] +// src1: image [N, IC, IH, IW] +// dst: result [N, OH, OW, IC*KH*KW] +static void ggml_compute_forward_conv_2d_stage_0_f32( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + const struct ggml_tensor * src1, + struct ggml_tensor * dst) { + GGML_ASSERT(src0->type == GGML_TYPE_F16); + GGML_ASSERT(src1->type == GGML_TYPE_F32); + GGML_ASSERT( dst->type == GGML_TYPE_F16); + + int64_t t0 = ggml_perf_time_us(); + UNUSED(t0); + + GGML_TENSOR_BINARY_OP_LOCALS; + + const int64_t N = ne13; + const int64_t IC = ne12; + const int64_t IH = ne11; + const int64_t IW = ne10; + + // const int64_t OC = ne03; + // const int64_t IC = ne02; + const int64_t KH = ne01; + const int64_t KW = ne00; + + const int64_t OH = ne2; + const int64_t OW = ne1; + + const int ith = params->ith; + const int nth = params->nth; + + const int32_t s0 = ((const int32_t*)(dst->op_params))[0]; + const int32_t s1 = ((const int32_t*)(dst->op_params))[1]; + const int32_t p0 = ((const int32_t*)(dst->op_params))[2]; + const int32_t p1 = ((const int32_t*)(dst->op_params))[3]; + const int32_t d0 = ((const int32_t*)(dst->op_params))[4]; + const int32_t d1 = ((const int32_t*)(dst->op_params))[5]; + + GGML_ASSERT(nb00 == sizeof(ggml_fp16_t)); + GGML_ASSERT(nb10 == sizeof(float)); + + if (params->type == GGML_TASK_INIT) { + memset(dst->data, 0, ggml_nbytes(dst)); + return; + } + + if (params->type == GGML_TASK_FINALIZE) { + return; + } + + // im2col: [N, IC, IH, IW] => [N, OH, OW, IC*KH*KW] + { + ggml_fp16_t * const wdata = (ggml_fp16_t *) dst->data; + + for (int64_t in = 0; in < N; in++) { + for (int64_t ioh = 0; ioh < OH; ioh++) { + for (int64_t iow = 0; iow < OW; iow++) { + for (int64_t iic = ith; iic < IC; iic+=nth) { + + // micro kernel + ggml_fp16_t * dst_data = wdata + (in*OH*OW + ioh*OW + iow)*(IC*KH*KW); // [IC, KH, KW] + const float * const src_data = (float *)((char *) src1->data + in*nb13 + iic*nb12); // [IH, IW] + + for (int64_t ikh = 0; ikh < KH; ikh++) { + for (int64_t ikw = 0; ikw < KW; ikw++) { + const int64_t iiw = iow*s0 + ikw*d0 - p0; + const int64_t iih = ioh*s1 + ikh*d1 - p1; + + if (!(iih < 0 || iih >= IH || iiw < 0 || iiw >= IW)) { + dst_data[iic*(KH*KW) + ikh*KW + ikw] = GGML_FP32_TO_FP16(src_data[iih*IW + iiw]); + } + } + } + } + } + } + } + } +} + +// gemm: [N, OC, OH, OW] = [OC, IC * KH * KW] x [N*OH*OW, IC * KH * KW] +// src0: [OC, IC, KH, KW] +// src1: [N, OH, OW, IC * KH * KW] +// result: [N, OC, OH, OW] +static void ggml_compute_forward_conv_2d_stage_1_f16( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + const struct ggml_tensor * src1, + struct ggml_tensor * dst) { + GGML_ASSERT(src0->type == GGML_TYPE_F16); + GGML_ASSERT(src1->type == GGML_TYPE_F16); + GGML_ASSERT( dst->type == GGML_TYPE_F32); + + int64_t t0 = ggml_perf_time_us(); + UNUSED(t0); + + if (params->type == GGML_TASK_INIT) { + return; + } + + if (params->type == GGML_TASK_FINALIZE) { + return; + } + + GGML_TENSOR_BINARY_OP_LOCALS; + + GGML_ASSERT(nb00 == sizeof(ggml_fp16_t)); + GGML_ASSERT(nb10 == sizeof(ggml_fp16_t)); + GGML_ASSERT(nb0 == sizeof(float)); + + const int N = ne13; + const int OH = ne12; + const int OW = ne11; + + const int OC = ne03; + const int IC = ne02; + const int KH = ne01; + const int KW = ne00; + + const int ith = params->ith; + const int nth = params->nth; + + int64_t m = OC; + int64_t n = OH * OW; + int64_t k = IC * KH * KW; + + // [N, OC, OH, OW] = [OC, IC * KH * KW] x [N*OH*OW, IC * KH * KW] + for (int i = 0; i < N; i++) { + ggml_fp16_t * A = (ggml_fp16_t *)src0->data; // [m, k] + ggml_fp16_t * B = (ggml_fp16_t *)src1->data + i * m * k; // [n, k] + float * C = (float *)dst->data + i * m * n; // [m, n] + + gemm_f16_out_f32(m, n, k, A, B, C, ith, nth); + } +} + static void ggml_compute_forward_conv_2d_f16_f32( const struct ggml_compute_params * params, const struct ggml_tensor * src0, @@ -14552,16 +14744,40 @@ static void ggml_compute_forward_conv_2d_f16_f32( int64_t t0 = ggml_perf_time_us(); UNUSED(t0); - GGML_TENSOR_BINARY_OP_LOCALS; + GGML_TENSOR_BINARY_OP_LOCALS + + // src1: image [N, IC, IH, IW] + // src0: kernel [OC, IC, KH, KW] + // dst: result [N, OC, OH, OW] + // ne12: IC + // ne0: OW + // ne1: OH + // nk0: KW + // nk1: KH + // ne13: N + + const int N = ne13; + const int IC = ne12; + const int IH = ne11; + const int IW = ne10; + + const int OC = ne03; + // const int IC = ne02; + const int KH = ne01; + const int KW = ne00; + + const int OH = ne1; + const int OW = ne0; const int ith = params->ith; const int nth = params->nth; - const int nk0 = ne00; - const int nk1 = ne01; + // const int nk0 = ne00; + // const int nk1 = ne01; // size of the convolution row - the kernel size unrolled across all channels - const int ew0 = nk0*nk1*ne02; + // const int ew0 = nk0*nk1*ne02; + // ew0: IC*KH*KW const int32_t s0 = ((const int32_t*)(dst->op_params))[0]; const int32_t s1 = ((const int32_t*)(dst->op_params))[1]; @@ -14577,24 +14793,27 @@ static void ggml_compute_forward_conv_2d_f16_f32( memset(params->wdata, 0, params->wsize); // prepare source data (src1) + // im2col: [N, IC, IH, IW] => [N*OH*OW, IC*KH*KW] + { ggml_fp16_t * const wdata = (ggml_fp16_t *) params->wdata + 0; - for (int i13 = 0; i13 < ne13; i13++) { - for (int i12 = 0; i12 < ne12; i12++) { - const float * const src = (float *)((char *) src1->data + i13*nb13 + i12*nb12); - ggml_fp16_t * dst_data = wdata + i13*(ne1*ne0*ew0); + for (int in = 0; in < N; in++) { + for (int iic = 0; iic < IC; iic++) { + for (int ioh = 0; ioh < OH; ioh++) { + for (int iow = 0; iow < OW; iow++) { - for (int i1 = 0; i1 < ne1; i1++) { - for (int i0 = 0; i0 < ne0; i0++) { - for (int ik1 = 0; ik1 < nk1; ik1++) { - for (int ik0 = 0; ik0 < nk0; ik0++) { - const int idx0 = i0*s0 + ik0*d0 - p0; - const int idx1 = i1*s1 + ik1*d1 - p1; + // micro kernel + ggml_fp16_t * dst_data = wdata + (in*OH*OW + ioh*OW + iow)*(IC*KH*KW); // [IC, KH, KW] + const float * const src_data = (float *)((char *) src1->data + in*nb13 + iic*nb12); // [IH, IW] - if (!(idx1 < 0 || idx1 >= ne11 || idx0 < 0 || idx0 >= ne10)) { - dst_data[(i1*ne0 + i0)*ew0 + i12*(nk0*nk1) + ik1*nk0 + ik0] = - GGML_FP32_TO_FP16(src[idx1*ne10 + idx0]); + for (int ikh = 0; ikh < KH; ikh++) { + for (int ikw = 0; ikw < KW; ikw++) { + const int iiw = iow*s0 + ikw*d0 - p0; + const int iih = ioh*s1 + ikh*d1 - p1; + + if (!(iih < 0 || iih >= IH || iiw < 0 || iiw >= IW)) { + dst_data[iic*(KH*KW) + ikh*KW + ikw] = GGML_FP32_TO_FP16(src_data[iih*IW + iiw]); } } } @@ -14611,30 +14830,22 @@ static void ggml_compute_forward_conv_2d_f16_f32( return; } - // total patches in dst - const int np = ne2; - - // patches per thread - const int dp = (np + nth - 1)/nth; - - // patch range for this thread - const int ip0 = dp*ith; - const int ip1 = MIN(ip0 + dp, np); - ggml_fp16_t * const wdata = (ggml_fp16_t *) params->wdata + 0; + // wdata: [N*OH*OW, IC*KH*KW] + // dst: result [N, OC, OH, OW] + // src0: kernel [OC, IC, KH, KW] - for (int i3 = 0; i3 < ne3; i3++) { - for (int i2 = ip0; i2 < ip1; i2++) { - float * dst_data = (float *)((char *) dst->data + i3*nb3 + i2*nb2); + int64_t m = OC; + int64_t n = OH * OW; + int64_t k = IC * KH * KW; - for (int i1 = 0; i1 < ne1; ++i1) { - for (int i0 = 0; i0 < ne0; ++i0) { - ggml_vec_dot_f16(ew0, dst_data + i1*ne0 + i0, - (ggml_fp16_t *) ((char *) src0->data + i2*nb03), - (ggml_fp16_t *) wdata + i3*nb3 + (i1*ne0 + i0)*ew0); - } - } - } + // [N, OC, OH, OW] = [OC, IC * KH * KW] x [N*OH*OW, IC * KH * KW] + for (int i = 0; i < N; i++) { + ggml_fp16_t * A = (ggml_fp16_t *)src0->data; // [m, k] + ggml_fp16_t * B = (ggml_fp16_t *)wdata + i * m * k; // [n, k] + float * C = (float *)dst->data + i * m * n; // [m * k] + + gemm_f16_out_f32(m, n, k, A, B, C, ith, nth); } } @@ -14660,6 +14871,48 @@ static void ggml_compute_forward_conv_2d( } } +static void ggml_compute_forward_conv_2d_stage_0( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + const struct ggml_tensor * src1, + struct ggml_tensor * dst) { + switch (src0->type) { + case GGML_TYPE_F16: + { + ggml_compute_forward_conv_2d_stage_0_f32(params, src0, src1, dst); + } break; + case GGML_TYPE_F32: + { + GGML_ASSERT(false); + } break; + default: + { + GGML_ASSERT(false); + } break; + } +} + +static void ggml_compute_forward_conv_2d_stage_1( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + const struct ggml_tensor * src1, + struct ggml_tensor * dst) { + switch (src0->type) { + case GGML_TYPE_F16: + { + ggml_compute_forward_conv_2d_stage_1_f16(params, src0, src1, dst); + } break; + case GGML_TYPE_F32: + { + GGML_ASSERT(false); + } break; + default: + { + GGML_ASSERT(false); + } break; + } +} + // ggml_compute_forward_conv_transpose_2d static void ggml_compute_forward_conv_transpose_2d( @@ -14718,6 +14971,8 @@ static void ggml_compute_forward_conv_transpose_2d( } } + memset(dst->data, 0, ggml_nbytes(dst)); + return; } @@ -16216,7 +16471,6 @@ static void ggml_compute_forward_add_rel_pos_f32( const int ip0 = dp*ith; const int ip1 = MIN(ip0 + dp, np); - for (int64_t i13 = ip0; i13 < ip1; ++i13) { for (int64_t i12 = 0; i12 < ne12; ++i12) { for (int64_t i11 = 0; i11 < ne11; ++i11) { @@ -16283,7 +16537,6 @@ static void ggml_compute_forward_map_unary_f32( } } - static void ggml_compute_forward_map_unary( const struct ggml_compute_params * params, const struct ggml_tensor * src0, @@ -16331,7 +16584,6 @@ static void ggml_compute_forward_map_binary_f32( } } - static void ggml_compute_forward_map_binary( const struct ggml_compute_params * params, const struct ggml_tensor * src0, @@ -16383,7 +16635,6 @@ static void ggml_compute_forward_map_custom2_f32( fun(dst, a, b); } - // ggml_compute_forward_map_custom3 static void ggml_compute_forward_map_custom3_f32( @@ -16658,7 +16909,6 @@ static void ggml_compute_forward_cross_entropy_loss_back_f32( ggml_vec_sub_f32(nc, ds0, ds0, s1); ggml_vec_scale_f32(nc, ds0, d[0] / (float) nr); - #ifndef NDEBUG for (int i = 0; i < nc; ++i) { assert(!isnan(ds0[i])); @@ -16686,12 +16936,15 @@ static void ggml_compute_forward_cross_entropy_loss_back( } } - ///////////////////////////////// static void ggml_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor) { GGML_ASSERT(params); + if (tensor->op == GGML_OP_NONE) { + return; + } + #ifdef GGML_USE_CUBLAS bool skip_cpu = ggml_cuda_compute_forward(params, tensor); if (skip_cpu) { @@ -16894,6 +17147,14 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm { ggml_compute_forward_conv_2d(params, tensor->src[0], tensor->src[1], tensor); } break; + case GGML_OP_CONV_2D_STAGE_0: + { + ggml_compute_forward_conv_2d_stage_0(params, tensor->src[0], tensor->src[1], tensor); + } break; + case GGML_OP_CONV_2D_STAGE_1: + { + ggml_compute_forward_conv_2d_stage_1(params, tensor->src[0], tensor->src[1], tensor); + } break; case GGML_OP_CONV_TRANSPOSE_2D: { ggml_compute_forward_conv_transpose_2d(params, tensor->src[0], tensor->src[1], tensor); @@ -17828,11 +18089,19 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor { GGML_ASSERT(false); // TODO: not implemented } break; + case GGML_OP_CONV_TRANSPOSE_1D: + { + GGML_ASSERT(false); // TODO: not implemented + } break; case GGML_OP_CONV_2D: { GGML_ASSERT(false); // TODO: not implemented } break; - case GGML_OP_CONV_TRANSPOSE_1D: + case GGML_OP_CONV_2D_STAGE_0: + { + GGML_ASSERT(false); // TODO: not implemented + } break; + case GGML_OP_CONV_2D_STAGE_1: { GGML_ASSERT(false); // TODO: not implemented } break; @@ -18761,6 +19030,7 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) { const int64_t ne0 = node->ne[0]; const int64_t ne1 = node->ne[1]; const int64_t ne2 = node->ne[2]; + const int64_t ne3 = node->ne[3]; const int64_t nk = ne00*ne01; const int64_t ew0 = nk * ne02; @@ -18771,7 +19041,8 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) { if (node->src[0]->type == GGML_TYPE_F16 && node->src[1]->type == GGML_TYPE_F32) { - cur = sizeof(ggml_fp16_t)*(ne0*ne1*ew0); + // im2col: [N*OH*OW, IC*KH*KW] + cur = sizeof(ggml_fp16_t)*(ne3*ne0*ne1*ew0); } else if (node->src[0]->type == GGML_TYPE_F32 && node->src[1]->type == GGML_TYPE_F32) { cur = sizeof(float)* (ne10*ne11*ne12); @@ -18781,6 +19052,14 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) { work_size = MAX(work_size, cur); } break; + case GGML_OP_CONV_2D_STAGE_0: + { + n_tasks = n_threads; + } break; + case GGML_OP_CONV_2D_STAGE_1: + { + n_tasks = n_threads; + } break; case GGML_OP_CONV_TRANSPOSE_2D: { n_tasks = n_threads; @@ -19969,7 +20248,6 @@ static enum ggml_opt_result ggml_opt_adam( opt->loss_after = fx; - // check convergence if (fabsf(fx - fx_prev[0])/fx < params.adam.eps_f) { GGML_PRINT_DEBUG("converged\n"); diff --git a/ggml.h b/ggml.h index 3a0b8153c..ffebad35d 100644 --- a/ggml.h +++ b/ggml.h @@ -401,15 +401,16 @@ extern "C" { GGML_OP_ALIBI, GGML_OP_CLAMP, GGML_OP_CONV_1D, - GGML_OP_CONV_2D, + GGML_OP_CONV_1D_STAGE_0, // internal + GGML_OP_CONV_1D_STAGE_1, // internal GGML_OP_CONV_TRANSPOSE_1D, + GGML_OP_CONV_2D, + GGML_OP_CONV_2D_STAGE_0, // internal + GGML_OP_CONV_2D_STAGE_1, // internal GGML_OP_CONV_TRANSPOSE_2D, GGML_OP_POOL_1D, GGML_OP_POOL_2D, - GGML_OP_CONV_1D_STAGE_0, // internal - GGML_OP_CONV_1D_STAGE_1, // internal - GGML_OP_UPSCALE, // nearest interpolate GGML_OP_FLASH_ATTN, @@ -1020,9 +1021,9 @@ extern "C" { struct ggml_tensor * b, float eps); - // A: n columns, m rows - // B: n columns, p rows (i.e. we transpose it internally) - // result is m columns, p rows + // A: k columns, n rows => [ne03, ne02, n, k] + // B: k columns, m rows (i.e. we transpose it internally) => [ne03 * x, ne02 * y, m, k] + // result is n columns, m rows => [ne03 * x, ne02 * y, m, n] GGML_API struct ggml_tensor * ggml_mul_mat( struct ggml_context * ctx, struct ggml_tensor * a, diff --git a/llama.cpp b/llama.cpp index ff713c14e..e0794f90c 100644 --- a/llama.cpp +++ b/llama.cpp @@ -7593,7 +7593,7 @@ void llama_sample_grammar(struct llama_context * ctx, llama_token_data_array * c } } - const llama_token eos = llama_token_eos(ctx); + const llama_token eos = llama_token_eos(&ctx->model); std::vector, llama_partial_utf8>> candidates_decoded; std::vector candidates_grammar; @@ -7803,7 +7803,7 @@ llama_token llama_sample_token(struct llama_context * ctx, llama_token_data_arra void llama_grammar_accept_token(struct llama_context * ctx, struct llama_grammar * grammar, llama_token token) { const int64_t t_start_sample_us = ggml_time_us(); - if (token == llama_token_eos(ctx)) { + if (token == llama_token_eos(&ctx->model)) { for (const auto & stack : grammar->stacks) { if (stack.empty()) { return; @@ -9033,7 +9033,7 @@ struct llama_context * llama_new_context_with_model( // build worst-case graph int n_tokens = (int)std::min(cparams.n_ctx, cparams.n_batch); int n_past = cparams.n_ctx - n_tokens; - llama_token token = llama_token_bos(ctx); // not actually used by llama_build_graph, but required to choose between token and embedding inputs graph + llama_token token = llama_token_bos(&ctx->model); // not actually used by llama_build_graph, but required to choose between token and embedding inputs graph ggml_cgraph * gf = llama_build_graph(*ctx, llama_batch_get_one(&token, n_tokens, n_past, 0)); #ifdef GGML_USE_METAL @@ -9794,43 +9794,44 @@ float * llama_get_embeddings(struct llama_context * ctx) { return ctx->embedding.data(); } -const char * llama_token_get_text(const struct llama_context * ctx, llama_token token) { - return ctx->model.vocab.id_to_token[token].text.c_str(); +const char * llama_token_get_text(const struct llama_model * model, llama_token token) { + return model->vocab.id_to_token[token].text.c_str(); } -float llama_token_get_score(const struct llama_context * ctx, llama_token token) { - return ctx->model.vocab.id_to_token[token].score; +float llama_token_get_score(const struct llama_model * model, llama_token token) { + return model->vocab.id_to_token[token].score; } -llama_token_type llama_token_get_type(const struct llama_context * ctx, llama_token token) { - return ctx->model.vocab.id_to_token[token].type; +llama_token_type llama_token_get_type(const struct llama_model * model, llama_token token) { + return model->vocab.id_to_token[token].type; } -llama_token llama_token_bos(const struct llama_context * ctx) { - return ctx->model.vocab.special_bos_id; +llama_token llama_token_bos(const struct llama_model * model) { + return model->vocab.special_bos_id; } -llama_token llama_token_eos(const struct llama_context * ctx) { - return ctx->model.vocab.special_eos_id; +llama_token llama_token_eos(const struct llama_model * model) { + return model->vocab.special_eos_id; } -llama_token llama_token_nl(const struct llama_context * ctx) { - return ctx->model.vocab.linefeed_id; -} -llama_token llama_token_prefix(const struct llama_context * ctx) { - return ctx->model.vocab.special_prefix_id; +llama_token llama_token_nl(const struct llama_model * model) { + return model->vocab.linefeed_id; } -llama_token llama_token_middle(const struct llama_context * ctx) { - return ctx->model.vocab.special_middle_id; +llama_token llama_token_prefix(const struct llama_model * model) { + return model->vocab.special_prefix_id; } -llama_token llama_token_suffix(const struct llama_context * ctx) { - return ctx->model.vocab.special_suffix_id; +llama_token llama_token_middle(const struct llama_model * model) { + return model->vocab.special_middle_id; } -llama_token llama_token_eot(const struct llama_context * ctx) { - return ctx->model.vocab.special_eot_id; +llama_token llama_token_suffix(const struct llama_model * model) { + return model->vocab.special_suffix_id; +} + +llama_token llama_token_eot(const struct llama_model * model) { + return model->vocab.special_eot_id; } int llama_tokenize( diff --git a/llama.h b/llama.h index 7c0f9ab1f..2048272bf 100644 --- a/llama.h +++ b/llama.h @@ -508,21 +508,22 @@ extern "C" { // Vocab // - LLAMA_API const char * llama_token_get_text(const struct llama_context * ctx, llama_token token); + LLAMA_API const char * llama_token_get_text(const struct llama_model * model, llama_token token); - LLAMA_API float llama_token_get_score(const struct llama_context * ctx, llama_token token); + LLAMA_API float llama_token_get_score(const struct llama_model * model, llama_token token); - LLAMA_API enum llama_token_type llama_token_get_type(const struct llama_context * ctx, llama_token token); + LLAMA_API enum llama_token_type llama_token_get_type(const struct llama_model * model, llama_token token); // Special tokens - LLAMA_API llama_token llama_token_bos(const struct llama_context * ctx); // beginning-of-sentence - LLAMA_API llama_token llama_token_eos(const struct llama_context * ctx); // end-of-sentence - LLAMA_API llama_token llama_token_nl (const struct llama_context * ctx); // next-line + LLAMA_API llama_token llama_token_bos(const struct llama_model * model); // beginning-of-sentence + LLAMA_API llama_token llama_token_eos(const struct llama_model * model); // end-of-sentence + LLAMA_API llama_token llama_token_nl (const struct llama_model * model); // next-line + // codellama infill tokens - LLAMA_API llama_token llama_token_prefix(const struct llama_context * ctx); // Beginning of infill prefix - LLAMA_API llama_token llama_token_middle(const struct llama_context * ctx); // Beginning of infill middle - LLAMA_API llama_token llama_token_suffix(const struct llama_context * ctx); // Beginning of infill suffix - LLAMA_API llama_token llama_token_eot (const struct llama_context * ctx); // End of infill middle + LLAMA_API llama_token llama_token_prefix(const struct llama_model * model); // Beginning of infill prefix + LLAMA_API llama_token llama_token_middle(const struct llama_model * model); // Beginning of infill middle + LLAMA_API llama_token llama_token_suffix(const struct llama_model * model); // Beginning of infill suffix + LLAMA_API llama_token llama_token_eot (const struct llama_model * model); // End of infill middle // // Tokenization diff --git a/models/ggml-vocab-baichuan.gguf b/models/ggml-vocab-baichuan.gguf new file mode 100644 index 000000000..7caaf8239 Binary files /dev/null and b/models/ggml-vocab-baichuan.gguf differ diff --git a/models/ggml-vocab-gpt-neox.gguf b/models/ggml-vocab-gpt-neox.gguf new file mode 100644 index 000000000..b9af16845 Binary files /dev/null and b/models/ggml-vocab-gpt-neox.gguf differ diff --git a/models/ggml-vocab-refact.gguf b/models/ggml-vocab-refact.gguf new file mode 100644 index 000000000..8f26cfb76 Binary files /dev/null and b/models/ggml-vocab-refact.gguf differ diff --git a/models/ggml-vocab-starcoder.gguf b/models/ggml-vocab-starcoder.gguf new file mode 100644 index 000000000..a52983fdb Binary files /dev/null and b/models/ggml-vocab-starcoder.gguf differ diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 1c73de0a3..6757ad1cc 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -28,10 +28,14 @@ llama_build_executable(test-tokenizer-0-falcon.cpp) llama_test_executable (test-tokenizer-0-falcon test-tokenizer-0-falcon.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-falcon.gguf) llama_build_executable(test-tokenizer-1-llama.cpp) llama_test_executable (test-tokenizer-1-llama test-tokenizer-1-llama.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-llama.gguf) +llama_test_executable(test-tokenizer-1-baichuan test-tokenizer-1-llama.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-baichuan.gguf) llama_build_executable(test-tokenizer-1-bpe.cpp) llama_test_executable (test-tokenizer-1-falcon test-tokenizer-1-bpe.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-falcon.gguf) llama_test_executable(test-tokenizer-1-aquila test-tokenizer-1-bpe.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-aquila.gguf) llama_test_executable(test-tokenizer-1-mpt test-tokenizer-1-bpe.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-mpt.gguf) +llama_test_executable(test-tokenizer-1-gpt-neox test-tokenizer-1-bpe.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-gpt-neox.gguf) +llama_test_executable(test-tokenizer-1-refact test-tokenizer-1-bpe.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-refact.gguf) +llama_test_executable(test-tokenizer-1-starcoder test-tokenizer-1-bpe.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-starcoder.gguf) llama_build_and_test_executable(test-grammar-parser.cpp) llama_build_and_test_executable(test-llama-grammar.cpp) llama_build_and_test_executable(test-grad0.cpp) # SLOW diff --git a/tests/test-tokenizer-1-bpe.cpp b/tests/test-tokenizer-1-bpe.cpp index 85a59a14d..386530f23 100644 --- a/tests/test-tokenizer-1-bpe.cpp +++ b/tests/test-tokenizer-1-bpe.cpp @@ -91,9 +91,19 @@ int main(int argc, char **argv) { } } } - // TODO: why doesn't this work for the full range of Unicodes? + // Restrict to assigned unicode planes // for (uint32_t cp = 0x10000; cp < 0x0010ffff; ++cp) { - for (uint32_t cp = 0x10000; cp < 0x00080000; ++cp) { + for (uint32_t cp = 0x10000; cp < 0x00040000; ++cp) { + std::string str = codepoint_to_utf8(cp); + std::vector tokens = llama_tokenize(ctx, str, false); + std::string check = llama_detokenize_bpe(ctx, tokens); + if (str != check) { + fprintf(stderr, "%s : error: codepoint %x detokenizes to '%s'(%zu) instead of '%s'(%zu)\n", + __func__, cp, check.c_str(), check.length(), str.c_str(), str.length()); + return 4; + } + } + for (uint32_t cp = 0x000e0000; cp < 0x0010ffff; ++cp) { std::string str = codepoint_to_utf8(cp); std::vector tokens = llama_tokenize(ctx, str, false); std::string check = llama_detokenize_bpe(ctx, tokens); @@ -103,7 +113,6 @@ int main(int argc, char **argv) { return 4; } } - llama_free_model(model); llama_free(ctx);