diff --git a/common/log.h b/common/log.h index 70e7e4ca2..d2c864cea 100644 --- a/common/log.h +++ b/common/log.h @@ -97,22 +97,23 @@ #define LOG_TEE_TARGET stderr #endif +// NOTE: currently disabled as it produces too many log files // Utility to obtain "pid" like unique process id and use it when creating log files. -inline std::string log_get_pid() -{ - static std::string pid; - if (pid.empty()) - { - // std::this_thread::get_id() is the most portable way of obtaining a "process id" - // it's not the same as "pid" but is unique enough to solve multiple instances - // trying to write to the same log. - std::stringstream ss; - ss << std::this_thread::get_id(); - pid = ss.str(); - } - - return pid; -} +//inline std::string log_get_pid() +//{ +// static std::string pid; +// if (pid.empty()) +// { +// // std::this_thread::get_id() is the most portable way of obtaining a "process id" +// // it's not the same as "pid" but is unique enough to solve multiple instances +// // trying to write to the same log. +// std::stringstream ss; +// ss << std::this_thread::get_id(); +// pid = ss.str(); +// } +// +// return pid; +//} // Utility function for generating log file names with unique id based on thread id. // invocation with log_filename_generator( "llama", "log" ) creates a string "llama..log" @@ -126,8 +127,8 @@ inline std::string log_filename_generator_impl(const std::string & log_file_base std::stringstream buf; buf << log_file_basename; - buf << "."; - buf << log_get_pid(); + //buf << "."; + //buf << log_get_pid(); buf << "."; buf << log_file_extension; diff --git a/examples/batched-bench/batched-bench.cpp b/examples/batched-bench/batched-bench.cpp index c552eaa73..43f9c971d 100644 --- a/examples/batched-bench/batched-bench.cpp +++ b/examples/batched-bench/batched-bench.cpp @@ -154,6 +154,10 @@ int main(int argc, char ** argv) { } } + LOG_TEE("\n"); + LOG_TEE("%s: n_kv_max = %d, is_pp_shared = %d, n_gpu_layers = %d, mmq = %d\n", __func__, n_kv_max, is_pp_shared, n_gpu_layers, mmq); + LOG_TEE("\n"); + LOG_TEE("|%6s | %6s | %4s | %6s | %8s | %8s | %8s | %8s | %8s | %8s |\n", "PP", "TG", "B", "N_KV", "T_PP s", "S_PP t/s", "T_TG s", "S_TG t/s", "T s", "S t/s"); LOG_TEE("|%6s-|-%6s-|-%4s-|-%6s-|-%8s-|-%8s-|-%8s-|-%8s-|-%8s-|-%8s-|\n", "------", "------", "----", "------", "--------", "--------", "--------", "--------", "--------", "--------"); diff --git a/examples/main-cmake-pkg/CMakeLists.txt b/examples/main-cmake-pkg/CMakeLists.txt index 908131884..cb00edbbb 100644 --- a/examples/main-cmake-pkg/CMakeLists.txt +++ b/examples/main-cmake-pkg/CMakeLists.txt @@ -16,6 +16,8 @@ add_library(common OBJECT ${_common_path}/console.cpp ${_common_path}/grammar-parser.h ${_common_path}/grammar-parser.cpp + ${_common_path}/sampling.h + ${_common_path}/sampling.cpp ) # WARNING: because build-info.h is auto-generated, it will only diff --git a/examples/server/server.cpp b/examples/server/server.cpp index 693f9b773..b4c4d0a20 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -454,7 +454,7 @@ struct llama_client_slot } void release() { - if (state == PROCESSING) + if (state == IDLE || state == PROCESSING) { t_token_generation = (ggml_time_us() - t_start_genereration) / 1e3; command = RELEASE; @@ -754,6 +754,7 @@ struct llama_server_context } slot->params.antiprompt.clear(); + const auto &stop = data.find("stop"); if (stop != data.end() && stop->is_array()) { @@ -867,7 +868,7 @@ struct llama_server_context kv_cache_clear(); - for (int32_t i = 0; i < batch.n_tokens; ++i) + for (int i = 0; i < (int) system_tokens.size(); ++i) { llama_batch_add(batch, system_tokens[i], i, { 0 }, false); } @@ -894,16 +895,8 @@ struct llama_server_context { slot.release(); } - wait_all_are_idle(); - all_slots_are_idle = true; - // wait until system prompt load system_need_update = true; - while (system_need_update) - { - std::this_thread::sleep_for(std::chrono::milliseconds(5)); - } - // system prompt loaded, continue } void process_system_prompt_data(const json &sys_props) { @@ -915,26 +908,6 @@ struct llama_server_context { notify_system_prompt_changed(); } - else - { - system_need_update = true; - } - } - - void wait_all_are_idle() { - bool wait = true; - while (wait) - { - wait = false; - for (auto &slot : slots) - { - if (!slot.available()) - { - wait = true; - break; - } - } - } } static size_t find_stopping_strings(const std::string &text, const size_t last_token_size, @@ -965,7 +938,6 @@ struct llama_server_context slot.has_next_token = false; } stop_pos = pos; - } } @@ -1444,7 +1416,7 @@ struct llama_server_context process_tasks(); // update the system prompt wait until all slots are idle state - if (system_need_update) + if (system_need_update && all_slots_are_idle) { LOG_TEE("updating system prompt\n"); update_system_prompt(); @@ -1498,7 +1470,7 @@ struct llama_server_context for (auto & slot : slots) { // release the slot - if (slot.state == PROCESSING && slot.command == RELEASE) + if (slot.command == RELEASE) { slot.state = IDLE; slot.command = NONE; @@ -1509,7 +1481,7 @@ struct llama_server_context continue; } - if (slot.state == IDLE || slot.command == RELEASE) + if (slot.state == IDLE) { continue; } @@ -1530,6 +1502,17 @@ struct llama_server_context { for (auto & slot : slots) { + const bool has_prompt = slot.prompt.is_array() || (slot.prompt.is_string() && !slot.prompt.get().empty()); + + // empty prompt passed -> release the slot and send empty response + if (slot.state == IDLE && slot.command == LOAD_PROMPT && !has_prompt) + { + slot.release(); + slot.print_timings(); + send_final_response(slot); + continue; + } + // need process the prompt if (slot.state == IDLE && slot.command == LOAD_PROMPT) { @@ -1749,8 +1732,8 @@ struct llama_server_context if (!process_token(result, slot)) { slot.release(); - send_final_response(slot); slot.print_timings(); + send_final_response(slot); } slot.i_batch = -1; @@ -1766,15 +1749,16 @@ static void server_print_usage(const char *argv0, const gpt_params ¶ms, printf("usage: %s [options]\n", argv0); printf("\n"); printf("options:\n"); - printf(" -h, --help show this help message and exit\n"); - printf(" -v, --verbose verbose output (default: %s)\n", server_verbose ? "enabled" : "disabled"); - printf(" -t N, --threads N number of threads to use during computation (default: %d)\n", params.n_threads); - printf(" -c N, --ctx-size N size of the prompt context (default: %d)\n", params.n_ctx); - printf(" --rope-freq-base N RoPE base frequency (default: loaded from model)\n"); - printf(" --rope-freq-scale N RoPE frequency scaling factor (default: loaded from model)\n"); - printf(" -b N, --batch-size N batch size for prompt processing (default: %d)\n", params.n_batch); - printf(" --memory-f32 use f32 instead of f16 for memory key+value (default: disabled)\n"); - printf(" not recommended: doubles context memory required and no measurable increase in quality\n"); + printf(" -h, --help show this help message and exit\n"); + printf(" -v, --verbose verbose output (default: %s)\n", server_verbose ? "enabled" : "disabled"); + printf(" -t N, --threads N number of threads to use during computation (default: %d)\n", params.n_threads); + printf(" -tb N, --threads-batch N number of threads to use during batch and prompt processing (default: same as --threads)\n"); + printf(" -c N, --ctx-size N size of the prompt context (default: %d)\n", params.n_ctx); + printf(" --rope-freq-base N RoPE base frequency (default: loaded from model)\n"); + printf(" --rope-freq-scale N RoPE frequency scaling factor (default: loaded from model)\n"); + printf(" -b N, --batch-size N batch size for prompt processing (default: %d)\n", params.n_batch); + printf(" --memory-f32 use f32 instead of f16 for memory key+value (default: disabled)\n"); + printf(" not recommended: doubles context memory required and no measurable increase in quality\n"); if (llama_mlock_supported()) { printf(" --mlock force system to keep model in RAM rather than swapping or compressing\n"); @@ -1924,6 +1908,15 @@ static void server_params_parse(int argc, char **argv, server_params &sparams, } params.n_threads = std::stoi(argv[i]); } + else if (arg == "--threads-batch" || arg == "-tb") + { + if (++i >= argc) + { + invalid_param = true; + break; + } + params.n_threads_batch = std::stoi(argv[i]); + } else if (arg == "-b" || arg == "--batch-size") { if (++i >= argc) @@ -2285,7 +2278,7 @@ int main(int argc, char **argv) if (!json_value(data, "stream", false)) { std::string completion_text; task_result result = llama.next_result(task_id); - if(!result.error && result.stop) { + if (!result.error && result.stop) { res.set_content(result.result_json.dump(-1, ' ', false, json::error_handler_t::replace), "application/json"); } else @@ -2312,7 +2305,7 @@ int main(int argc, char **argv) { return false; } - if(result.stop) { + if (result.stop) { break; } } else { diff --git a/ggml-cuda.cu b/ggml-cuda.cu index 960076f4d..85ae5e5a1 100644 --- a/ggml-cuda.cu +++ b/ggml-cuda.cu @@ -5678,10 +5678,10 @@ void ggml_init_cublas() { GGML_ASSERT(g_device_count <= GGML_CUDA_MAX_DEVICES); int64_t total_vram = 0; fprintf(stderr, "%s: found %d " GGML_CUDA_NAME " devices:\n", __func__, g_device_count); - for (int64_t id = 0; id < g_device_count; ++id) { + for (int id = 0; id < g_device_count; ++id) { cudaDeviceProp prop; CUDA_CHECK(cudaGetDeviceProperties(&prop, id)); - fprintf(stderr, " Device %ld: %s, compute capability %d.%d\n", id, prop.name, prop.major, prop.minor); + fprintf(stderr, " Device %d: %s, compute capability %d.%d\n", id, prop.name, prop.major, prop.minor); g_tensor_split[id] = total_vram; total_vram += prop.totalGlobalMem; @@ -5691,15 +5691,15 @@ void ggml_init_cublas() { g_compute_capabilities[id] = 100*prop.major + 10*prop.minor; #endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__) } - for (int64_t id = 0; id < g_device_count; ++id) { + for (int id = 0; id < g_device_count; ++id) { g_tensor_split[id] /= total_vram; } - for (int64_t id = 0; id < g_device_count; ++id) { + for (int id = 0; id < g_device_count; ++id) { CUDA_CHECK(ggml_cuda_set_device(id)); // create cuda streams - for (int64_t is = 0; is < MAX_STREAMS; ++is) { + for (int is = 0; is < MAX_STREAMS; ++is) { CUDA_CHECK(cudaStreamCreateWithFlags(&g_cudaStreams[id][is], cudaStreamNonBlocking)); } @@ -6271,16 +6271,15 @@ inline void ggml_cuda_op_mul_mat_cublas( const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low, const int64_t row_high, const int64_t src1_ncols, const int64_t src1_padded_row_size, const cudaStream_t & stream) { - GGML_ASSERT(src0_dd_i != nullptr); + GGML_ASSERT(src0_dd_i != nullptr); GGML_ASSERT(src1_ddf_i != nullptr); - GGML_ASSERT(dst_dd_i != nullptr); - + GGML_ASSERT(dst_dd_i != nullptr); const int64_t ne00 = src0->ne[0]; - const int64_t ne10 = src1->ne[0]; const int64_t ne0 = dst->ne[0]; + const int64_t row_diff = row_high - row_low; int id; @@ -7240,12 +7239,13 @@ static void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1 //printf("src1 is contiguous %d, transposed %d, type = %s, name = %s\n", ggml_is_contiguous(src1), ggml_is_transposed(src1), ggml_type_name(src1->type), src1->name); if (all_on_device && src0->type == GGML_TYPE_F16 && ggml_is_permuted(src0) && ggml_is_permuted(src1) && src1->ne[1] == 1) { - // KQ + // KQ single-batch ggml_cuda_mul_mat_vec_p021(src0, src1, dst); } else if (all_on_device && src0->type == GGML_TYPE_F16 && !ggml_is_contiguous(src0) && !ggml_is_transposed(src1) && src1->ne[1] == 1) { - // KQV + // KQV single-batch ggml_cuda_mul_mat_vec_nc(src0, src1, dst); } else if (all_on_device && src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F32 && !ggml_is_transposed(src0) && !ggml_is_transposed(src1) && src1->ne[2]*src1->ne[3] > 1) { + // KQ + KQV multi-batch ggml_cuda_mul_mat_mat_batched_cublas(src0, src1, dst); } else if (src0->type == GGML_TYPE_F32) { ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_mul_mat_cublas, false); diff --git a/ggml.c b/ggml.c index 96e0fad6f..84191d34d 100644 --- a/ggml.c +++ b/ggml.c @@ -571,7 +571,6 @@ int64_t ggml_cycles_per_ms(void) { #define ggml_perf_cycles_per_ms() 0 #endif - // // cache line // @@ -1830,7 +1829,6 @@ ggml_type_traits_t ggml_internal_get_type_traits(enum ggml_type type) { return type_traits[type]; } - // // simd mappings // @@ -4059,16 +4057,17 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = { "ALIBI", "CLAMP", "CONV_1D", + "CONV_1D_STAGE_0", + "CONV_1D_STAGE_1", "CONV_TRANSPOSE_1D", "CONV_2D", + "CONV_2D_STAGE_0", + "CONV_2D_STAGE_1", "CONV_TRANSPOSE_2D", "POOL_1D", "POOL_2D", "UPSCALE", - "CONV_1D_STAGE_0", - "CONV_1D_STAGE_1", - "FLASH_ATTN", "FLASH_FF", "FLASH_ATTN_BACK", @@ -4094,7 +4093,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = { "CROSS_ENTROPY_LOSS_BACK", }; -static_assert(GGML_OP_COUNT == 71, "GGML_OP_COUNT != 71"); +static_assert(GGML_OP_COUNT == 73, "GGML_OP_COUNT != 73"); static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = { "none", @@ -4145,16 +4144,17 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = { "alibi(x)", "clamp(x)", "conv_1d(x)", + "conv_1d_stage_0(x)", + "conv_1d_stage_1(x)", "conv_transpose_1d(x)", "conv_2d(x)", + "conv_2d_stage_0(x)", + "conv_2d_stage_1(x)", "conv_transpose_2d(x)", "pool_1d(x)", "pool_2d(x)", "upscale(x)", - "conv_1d_stage_0(x)", - "conv_1d_stage_1(x)", - "flash_attn(x)", "flash_ff(x)", "flash_attn_back(x)", @@ -4180,7 +4180,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = { "cross_entropy_loss_back(x,y)", }; -static_assert(GGML_OP_COUNT == 71, "GGML_OP_COUNT != 71"); +static_assert(GGML_OP_COUNT == 73, "GGML_OP_COUNT != 73"); static_assert(GGML_OP_POOL_COUNT == 2, "GGML_OP_POOL_COUNT != 2"); @@ -4211,8 +4211,10 @@ static void ggml_setup_op_has_task_pass(void) { p[GGML_OP_CONV_1D ] = true; p[GGML_OP_CONV_1D_STAGE_0 ] = true; p[GGML_OP_CONV_1D_STAGE_1 ] = true; - p[GGML_OP_CONV_2D ] = true; p[GGML_OP_CONV_TRANSPOSE_1D ] = true; + p[GGML_OP_CONV_2D ] = true; + p[GGML_OP_CONV_2D_STAGE_0 ] = true; + p[GGML_OP_CONV_2D_STAGE_1 ] = true; p[GGML_OP_CONV_TRANSPOSE_2D ] = true; p[GGML_OP_FLASH_ATTN_BACK ] = true; p[GGML_OP_CROSS_ENTROPY_LOSS ] = true; @@ -5956,7 +5958,6 @@ struct ggml_tensor * ggml_sqrt_inplace( return ggml_sqrt_impl(ctx, a, true); } - // ggml_log static struct ggml_tensor * ggml_log_impl( @@ -6010,7 +6011,6 @@ struct ggml_tensor * ggml_sum( return result; } - // ggml_sum_rows struct ggml_tensor * ggml_sum_rows( @@ -6642,7 +6642,6 @@ struct ggml_tensor * ggml_set_2d_inplace( return ggml_set_impl(ctx, a, b, nb1, a->nb[2], a->nb[3], offset, false); } - // ggml_cpy static struct ggml_tensor * ggml_cpy_impl( @@ -6722,7 +6721,6 @@ struct ggml_tensor * ggml_cont_inplace( return ggml_cont_impl(ctx, a, true); } - // make contiguous, with new shape GGML_API struct ggml_tensor * ggml_cont_1d( struct ggml_context * ctx, @@ -7175,7 +7173,6 @@ struct ggml_tensor * ggml_diag( return result; } - // ggml_diag_mask_inf static struct ggml_tensor * ggml_diag_mask_inf_impl( @@ -7287,7 +7284,6 @@ struct ggml_tensor * ggml_soft_max_inplace( return ggml_soft_max_impl(ctx, a, true); } - // ggml_soft_max_back static struct ggml_tensor * ggml_soft_max_back_impl( @@ -7704,7 +7700,11 @@ GGML_API struct ggml_tensor * ggml_conv_transpose_1d( // ggml_conv_2d -struct ggml_tensor * ggml_conv_2d( +// im2col: [N, IC, IH, IW] => [N, OH, OW, IC*KH*KW] +// a: [OC,IC, KH, KW] +// b: [N, IC, IH, IW] +// result: [N, OH, OW, IC*KH*KW] +static struct ggml_tensor * ggml_conv_2d_stage_0( struct ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * b, @@ -7723,17 +7723,21 @@ struct ggml_tensor * ggml_conv_2d( is_node = true; } + const int64_t OH = ggml_calc_conv_output_size(b->ne[1], a->ne[1], s1, p1, d1); + const int64_t OW = ggml_calc_conv_output_size(b->ne[0], a->ne[0], s0, p0, d0); + const int64_t ne[4] = { - ggml_calc_conv_output_size(b->ne[0], a->ne[0], s0, p0, d0), - ggml_calc_conv_output_size(b->ne[1], a->ne[1], s1, p1, d1), - a->ne[3], b->ne[3], + a->ne[2] * a->ne[1] * a->ne[0], + OW, + OH, + b->ne[3], }; - struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne); + struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F16, 4, ne); int32_t params[] = { s0, s1, p0, p1, d0, d1 }; ggml_set_op_params(result, params, sizeof(params)); - result->op = GGML_OP_CONV_2D; + result->op = GGML_OP_CONV_2D_STAGE_0; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; result->src[0] = a; result->src[1] = b; @@ -7742,8 +7746,61 @@ struct ggml_tensor * ggml_conv_2d( } -// ggml_conv_2d_sk_p0 +// gemm: [N, OC, OH, OW] = [OC, IC * KH * KW] x [N*OH*OW, IC * KH * KW] +// a: [OC, IC, KH, KW] +// b: [N, OH, OW, IC * KH * KW] +// result: [N, OC, OH, OW] +static struct ggml_tensor * ggml_conv_2d_stage_1( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b) { + bool is_node = false; + + if (a->grad || b->grad) { + GGML_ASSERT(false); // TODO: implement backward + is_node = true; + } + + const int64_t ne[4] = { + b->ne[1], + b->ne[2], + a->ne[3], + b->ne[3], + }; + struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne); + + result->op = GGML_OP_CONV_2D_STAGE_1; + result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result->src[0] = a; + result->src[1] = b; + + return result; + +} + +// a: [OC,IC, KH, KW] +// b: [N, IC, IH, IW] +// result: [N, OC, OH, OW] +struct ggml_tensor * ggml_conv_2d( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + int s0, + int s1, + int p0, + int p1, + int d0, + int d1) { + + struct ggml_tensor * result = ggml_conv_2d_stage_0(ctx, a, b, s0, s1, p0, p1, d0, d1); // [N, OH, OW, IC * KH * KW] + result = ggml_conv_2d_stage_1(ctx, a, result); + + return result; + +} + +// ggml_conv_2d_sk_p0 struct ggml_tensor * ggml_conv_2d_sk_p0( struct ggml_context * ctx, struct ggml_tensor * a, @@ -8182,7 +8239,6 @@ static struct ggml_tensor * ggml_add_rel_pos_impl( return result; } - struct ggml_tensor * ggml_add_rel_pos( struct ggml_context * ctx, struct ggml_tensor * a, @@ -8627,8 +8683,6 @@ struct ggml_tensor * ggml_map_custom3_inplace( return ggml_map_custom3_impl(ctx, a, b, c, fun, n_tasks, userdata, true); } - - // ggml_cross_entropy_loss struct ggml_tensor * ggml_cross_entropy_loss( @@ -9853,7 +9907,6 @@ static void ggml_compute_forward_add1( } } - // ggml_compute_forward_acc static void ggml_compute_forward_acc_f32( @@ -9993,7 +10046,6 @@ static void ggml_compute_forward_sub_f32( const int i2 = (ir - i3*ne2*ne1)/ne1; const int i1 = (ir - i3*ne2*ne1 - i2*ne1); - #ifdef GGML_USE_ACCELERATE vDSP_vsub( (float *) ((char *) src1->data + i3*nb13 + i2*nb12 + i1*nb11), 1, @@ -10174,7 +10226,6 @@ static void ggml_compute_forward_div_f32( const int i2 = (ir - i3*ne2*ne1)/ne1; const int i1 = (ir - i3*ne2*ne1 - i2*ne1); - #ifdef GGML_USE_ACCELERATE UNUSED(ggml_vec_div_f32); @@ -10312,7 +10363,6 @@ static void ggml_compute_forward_sqrt( } } - // ggml_compute_forward_log static void ggml_compute_forward_log_f32( @@ -12145,7 +12195,6 @@ static void ggml_compute_forward_out_prod_f32( } } - //int64_t t1 = ggml_perf_time_us(); //static int64_t acc = 0; //acc += t1 - t0; @@ -12341,7 +12390,6 @@ static void ggml_compute_forward_scale_f32( const size_t nb1 = dst->nb[1]; - for (int i1 = ir0; i1 < ir1; i1++) { if (dst->data != src0->data) { // src0 is same shape as dst => same indices @@ -12739,7 +12787,6 @@ static void ggml_compute_forward_get_rows_back_f32( } } - static void ggml_compute_forward_get_rows_back( const struct ggml_compute_params * params, const struct ggml_tensor * src0, @@ -14022,6 +14069,7 @@ static void ggml_compute_forward_conv_1d_f32( } } +// TODO: reuse ggml_mul_mat or implement ggml_im2col and remove stage_0 and stage_1 static void gemm_f16_out_f32(int64_t m, int64_t n, int64_t k, ggml_fp16_t * A, ggml_fp16_t * B, @@ -14323,6 +14371,9 @@ static void ggml_compute_forward_conv_transpose_1d_f16_f32( } } + // need to zero dst since we are accumulating into it + memset(dst->data, 0, ggml_nbytes(dst)); + return; } @@ -14395,7 +14446,7 @@ static void ggml_compute_forward_conv_transpose_1d_f32( const float * const src = (float *)((char *) src0->data + i02*nb02 + i01*nb01); float * dst_data = wdata + i01*ne00*ne02; for (int64_t i00 = 0; i00 < ne00; i00++) { - dst_data[i01*ne00*ne02 + i00*ne02 + i02] = src[i00]; + dst_data[i00*ne02 + i02] = src[i00]; } } } @@ -14414,6 +14465,9 @@ static void ggml_compute_forward_conv_transpose_1d_f32( } } + // need to zero dst since we are accumulating into it + memset(dst->data, 0, ggml_nbytes(dst)); + return; } @@ -14475,6 +14529,144 @@ static void ggml_compute_forward_conv_transpose_1d( // ggml_compute_forward_conv_2d +// src0: kernel [OC, IC, KH, KW] +// src1: image [N, IC, IH, IW] +// dst: result [N, OH, OW, IC*KH*KW] +static void ggml_compute_forward_conv_2d_stage_0_f32( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + const struct ggml_tensor * src1, + struct ggml_tensor * dst) { + GGML_ASSERT(src0->type == GGML_TYPE_F16); + GGML_ASSERT(src1->type == GGML_TYPE_F32); + GGML_ASSERT( dst->type == GGML_TYPE_F16); + + int64_t t0 = ggml_perf_time_us(); + UNUSED(t0); + + GGML_TENSOR_BINARY_OP_LOCALS; + + const int64_t N = ne13; + const int64_t IC = ne12; + const int64_t IH = ne11; + const int64_t IW = ne10; + + // const int64_t OC = ne03; + // const int64_t IC = ne02; + const int64_t KH = ne01; + const int64_t KW = ne00; + + const int64_t OH = ne2; + const int64_t OW = ne1; + + const int ith = params->ith; + const int nth = params->nth; + + const int32_t s0 = ((const int32_t*)(dst->op_params))[0]; + const int32_t s1 = ((const int32_t*)(dst->op_params))[1]; + const int32_t p0 = ((const int32_t*)(dst->op_params))[2]; + const int32_t p1 = ((const int32_t*)(dst->op_params))[3]; + const int32_t d0 = ((const int32_t*)(dst->op_params))[4]; + const int32_t d1 = ((const int32_t*)(dst->op_params))[5]; + + GGML_ASSERT(nb00 == sizeof(ggml_fp16_t)); + GGML_ASSERT(nb10 == sizeof(float)); + + if (params->type == GGML_TASK_INIT) { + memset(dst->data, 0, ggml_nbytes(dst)); + return; + } + + if (params->type == GGML_TASK_FINALIZE) { + return; + } + + // im2col: [N, IC, IH, IW] => [N, OH, OW, IC*KH*KW] + { + ggml_fp16_t * const wdata = (ggml_fp16_t *) dst->data; + + for (int64_t in = 0; in < N; in++) { + for (int64_t ioh = 0; ioh < OH; ioh++) { + for (int64_t iow = 0; iow < OW; iow++) { + for (int64_t iic = ith; iic < IC; iic+=nth) { + + // micro kernel + ggml_fp16_t * dst_data = wdata + (in*OH*OW + ioh*OW + iow)*(IC*KH*KW); // [IC, KH, KW] + const float * const src_data = (float *)((char *) src1->data + in*nb13 + iic*nb12); // [IH, IW] + + for (int64_t ikh = 0; ikh < KH; ikh++) { + for (int64_t ikw = 0; ikw < KW; ikw++) { + const int64_t iiw = iow*s0 + ikw*d0 - p0; + const int64_t iih = ioh*s1 + ikh*d1 - p1; + + if (!(iih < 0 || iih >= IH || iiw < 0 || iiw >= IW)) { + dst_data[iic*(KH*KW) + ikh*KW + ikw] = GGML_FP32_TO_FP16(src_data[iih*IW + iiw]); + } + } + } + } + } + } + } + } +} + +// gemm: [N, OC, OH, OW] = [OC, IC * KH * KW] x [N*OH*OW, IC * KH * KW] +// src0: [OC, IC, KH, KW] +// src1: [N, OH, OW, IC * KH * KW] +// result: [N, OC, OH, OW] +static void ggml_compute_forward_conv_2d_stage_1_f16( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + const struct ggml_tensor * src1, + struct ggml_tensor * dst) { + GGML_ASSERT(src0->type == GGML_TYPE_F16); + GGML_ASSERT(src1->type == GGML_TYPE_F16); + GGML_ASSERT( dst->type == GGML_TYPE_F32); + + int64_t t0 = ggml_perf_time_us(); + UNUSED(t0); + + if (params->type == GGML_TASK_INIT) { + return; + } + + if (params->type == GGML_TASK_FINALIZE) { + return; + } + + GGML_TENSOR_BINARY_OP_LOCALS; + + GGML_ASSERT(nb00 == sizeof(ggml_fp16_t)); + GGML_ASSERT(nb10 == sizeof(ggml_fp16_t)); + GGML_ASSERT(nb0 == sizeof(float)); + + const int N = ne13; + const int OH = ne12; + const int OW = ne11; + + const int OC = ne03; + const int IC = ne02; + const int KH = ne01; + const int KW = ne00; + + const int ith = params->ith; + const int nth = params->nth; + + int64_t m = OC; + int64_t n = OH * OW; + int64_t k = IC * KH * KW; + + // [N, OC, OH, OW] = [OC, IC * KH * KW] x [N*OH*OW, IC * KH * KW] + for (int i = 0; i < N; i++) { + ggml_fp16_t * A = (ggml_fp16_t *)src0->data; // [m, k] + ggml_fp16_t * B = (ggml_fp16_t *)src1->data + i * m * k; // [n, k] + float * C = (float *)dst->data + i * m * n; // [m, n] + + gemm_f16_out_f32(m, n, k, A, B, C, ith, nth); + } +} + static void ggml_compute_forward_conv_2d_f16_f32( const struct ggml_compute_params * params, const struct ggml_tensor * src0, @@ -14487,16 +14679,40 @@ static void ggml_compute_forward_conv_2d_f16_f32( int64_t t0 = ggml_perf_time_us(); UNUSED(t0); - GGML_TENSOR_BINARY_OP_LOCALS; + GGML_TENSOR_BINARY_OP_LOCALS + + // src1: image [N, IC, IH, IW] + // src0: kernel [OC, IC, KH, KW] + // dst: result [N, OC, OH, OW] + // ne12: IC + // ne0: OW + // ne1: OH + // nk0: KW + // nk1: KH + // ne13: N + + const int N = ne13; + const int IC = ne12; + const int IH = ne11; + const int IW = ne10; + + const int OC = ne03; + // const int IC = ne02; + const int KH = ne01; + const int KW = ne00; + + const int OH = ne1; + const int OW = ne0; const int ith = params->ith; const int nth = params->nth; - const int nk0 = ne00; - const int nk1 = ne01; + // const int nk0 = ne00; + // const int nk1 = ne01; // size of the convolution row - the kernel size unrolled across all channels - const int ew0 = nk0*nk1*ne02; + // const int ew0 = nk0*nk1*ne02; + // ew0: IC*KH*KW const int32_t s0 = ((const int32_t*)(dst->op_params))[0]; const int32_t s1 = ((const int32_t*)(dst->op_params))[1]; @@ -14512,24 +14728,27 @@ static void ggml_compute_forward_conv_2d_f16_f32( memset(params->wdata, 0, params->wsize); // prepare source data (src1) + // im2col: [N, IC, IH, IW] => [N*OH*OW, IC*KH*KW] + { ggml_fp16_t * const wdata = (ggml_fp16_t *) params->wdata + 0; - for (int i13 = 0; i13 < ne13; i13++) { - for (int i12 = 0; i12 < ne12; i12++) { - const float * const src = (float *)((char *) src1->data + i13*nb13 + i12*nb12); - ggml_fp16_t * dst_data = wdata + i13*(ne1*ne0*ew0); + for (int in = 0; in < N; in++) { + for (int iic = 0; iic < IC; iic++) { + for (int ioh = 0; ioh < OH; ioh++) { + for (int iow = 0; iow < OW; iow++) { - for (int i1 = 0; i1 < ne1; i1++) { - for (int i0 = 0; i0 < ne0; i0++) { - for (int ik1 = 0; ik1 < nk1; ik1++) { - for (int ik0 = 0; ik0 < nk0; ik0++) { - const int idx0 = i0*s0 + ik0*d0 - p0; - const int idx1 = i1*s1 + ik1*d1 - p1; + // micro kernel + ggml_fp16_t * dst_data = wdata + (in*OH*OW + ioh*OW + iow)*(IC*KH*KW); // [IC, KH, KW] + const float * const src_data = (float *)((char *) src1->data + in*nb13 + iic*nb12); // [IH, IW] - if (!(idx1 < 0 || idx1 >= ne11 || idx0 < 0 || idx0 >= ne10)) { - dst_data[(i1*ne0 + i0)*ew0 + i12*(nk0*nk1) + ik1*nk0 + ik0] = - GGML_FP32_TO_FP16(src[idx1*ne10 + idx0]); + for (int ikh = 0; ikh < KH; ikh++) { + for (int ikw = 0; ikw < KW; ikw++) { + const int iiw = iow*s0 + ikw*d0 - p0; + const int iih = ioh*s1 + ikh*d1 - p1; + + if (!(iih < 0 || iih >= IH || iiw < 0 || iiw >= IW)) { + dst_data[iic*(KH*KW) + ikh*KW + ikw] = GGML_FP32_TO_FP16(src_data[iih*IW + iiw]); } } } @@ -14546,30 +14765,22 @@ static void ggml_compute_forward_conv_2d_f16_f32( return; } - // total patches in dst - const int np = ne2; - - // patches per thread - const int dp = (np + nth - 1)/nth; - - // patch range for this thread - const int ip0 = dp*ith; - const int ip1 = MIN(ip0 + dp, np); - ggml_fp16_t * const wdata = (ggml_fp16_t *) params->wdata + 0; + // wdata: [N*OH*OW, IC*KH*KW] + // dst: result [N, OC, OH, OW] + // src0: kernel [OC, IC, KH, KW] - for (int i3 = 0; i3 < ne3; i3++) { - for (int i2 = ip0; i2 < ip1; i2++) { - float * dst_data = (float *)((char *) dst->data + i3*nb3 + i2*nb2); + int64_t m = OC; + int64_t n = OH * OW; + int64_t k = IC * KH * KW; - for (int i1 = 0; i1 < ne1; ++i1) { - for (int i0 = 0; i0 < ne0; ++i0) { - ggml_vec_dot_f16(ew0, dst_data + i1*ne0 + i0, - (ggml_fp16_t *) ((char *) src0->data + i2*nb03), - (ggml_fp16_t *) wdata + i3*nb3 + (i1*ne0 + i0)*ew0); - } - } - } + // [N, OC, OH, OW] = [OC, IC * KH * KW] x [N*OH*OW, IC * KH * KW] + for (int i = 0; i < N; i++) { + ggml_fp16_t * A = (ggml_fp16_t *)src0->data; // [m, k] + ggml_fp16_t * B = (ggml_fp16_t *)wdata + i * m * k; // [n, k] + float * C = (float *)dst->data + i * m * n; // [m * k] + + gemm_f16_out_f32(m, n, k, A, B, C, ith, nth); } } @@ -14595,6 +14806,48 @@ static void ggml_compute_forward_conv_2d( } } +static void ggml_compute_forward_conv_2d_stage_0( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + const struct ggml_tensor * src1, + struct ggml_tensor * dst) { + switch (src0->type) { + case GGML_TYPE_F16: + { + ggml_compute_forward_conv_2d_stage_0_f32(params, src0, src1, dst); + } break; + case GGML_TYPE_F32: + { + GGML_ASSERT(false); + } break; + default: + { + GGML_ASSERT(false); + } break; + } +} + +static void ggml_compute_forward_conv_2d_stage_1( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + const struct ggml_tensor * src1, + struct ggml_tensor * dst) { + switch (src0->type) { + case GGML_TYPE_F16: + { + ggml_compute_forward_conv_2d_stage_1_f16(params, src0, src1, dst); + } break; + case GGML_TYPE_F32: + { + GGML_ASSERT(false); + } break; + default: + { + GGML_ASSERT(false); + } break; + } +} + // ggml_compute_forward_conv_transpose_2d static void ggml_compute_forward_conv_transpose_2d( @@ -14653,6 +14906,8 @@ static void ggml_compute_forward_conv_transpose_2d( } } + memset(dst->data, 0, ggml_nbytes(dst)); + return; } @@ -16151,7 +16406,6 @@ static void ggml_compute_forward_add_rel_pos_f32( const int ip0 = dp*ith; const int ip1 = MIN(ip0 + dp, np); - for (int64_t i13 = ip0; i13 < ip1; ++i13) { for (int64_t i12 = 0; i12 < ne12; ++i12) { for (int64_t i11 = 0; i11 < ne11; ++i11) { @@ -16218,7 +16472,6 @@ static void ggml_compute_forward_map_unary_f32( } } - static void ggml_compute_forward_map_unary( const struct ggml_compute_params * params, const struct ggml_tensor * src0, @@ -16266,7 +16519,6 @@ static void ggml_compute_forward_map_binary_f32( } } - static void ggml_compute_forward_map_binary( const struct ggml_compute_params * params, const struct ggml_tensor * src0, @@ -16318,7 +16570,6 @@ static void ggml_compute_forward_map_custom2_f32( fun(dst, a, b); } - // ggml_compute_forward_map_custom3 static void ggml_compute_forward_map_custom3_f32( @@ -16593,7 +16844,6 @@ static void ggml_compute_forward_cross_entropy_loss_back_f32( ggml_vec_sub_f32(nc, ds0, ds0, s1); ggml_vec_scale_f32(nc, ds0, d[0] / (float) nr); - #ifndef NDEBUG for (int i = 0; i < nc; ++i) { assert(!isnan(ds0[i])); @@ -16621,7 +16871,6 @@ static void ggml_compute_forward_cross_entropy_loss_back( } } - ///////////////////////////////// static void ggml_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor) { @@ -16833,6 +17082,14 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm { ggml_compute_forward_conv_2d(params, tensor->src[0], tensor->src[1], tensor); } break; + case GGML_OP_CONV_2D_STAGE_0: + { + ggml_compute_forward_conv_2d_stage_0(params, tensor->src[0], tensor->src[1], tensor); + } break; + case GGML_OP_CONV_2D_STAGE_1: + { + ggml_compute_forward_conv_2d_stage_1(params, tensor->src[0], tensor->src[1], tensor); + } break; case GGML_OP_CONV_TRANSPOSE_2D: { ggml_compute_forward_conv_transpose_2d(params, tensor->src[0], tensor->src[1], tensor); @@ -17762,11 +18019,19 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor { GGML_ASSERT(false); // TODO: not implemented } break; + case GGML_OP_CONV_TRANSPOSE_1D: + { + GGML_ASSERT(false); // TODO: not implemented + } break; case GGML_OP_CONV_2D: { GGML_ASSERT(false); // TODO: not implemented } break; - case GGML_OP_CONV_TRANSPOSE_1D: + case GGML_OP_CONV_2D_STAGE_0: + { + GGML_ASSERT(false); // TODO: not implemented + } break; + case GGML_OP_CONV_2D_STAGE_1: { GGML_ASSERT(false); // TODO: not implemented } break; @@ -18695,6 +18960,7 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) { const int64_t ne0 = node->ne[0]; const int64_t ne1 = node->ne[1]; const int64_t ne2 = node->ne[2]; + const int64_t ne3 = node->ne[3]; const int64_t nk = ne00*ne01; const int64_t ew0 = nk * ne02; @@ -18705,7 +18971,8 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) { if (node->src[0]->type == GGML_TYPE_F16 && node->src[1]->type == GGML_TYPE_F32) { - cur = sizeof(ggml_fp16_t)*(ne0*ne1*ew0); + // im2col: [N*OH*OW, IC*KH*KW] + cur = sizeof(ggml_fp16_t)*(ne3*ne0*ne1*ew0); } else if (node->src[0]->type == GGML_TYPE_F32 && node->src[1]->type == GGML_TYPE_F32) { cur = sizeof(float)* (ne10*ne11*ne12); @@ -18715,6 +18982,14 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) { work_size = MAX(work_size, cur); } break; + case GGML_OP_CONV_2D_STAGE_0: + { + n_tasks = n_threads; + } break; + case GGML_OP_CONV_2D_STAGE_1: + { + n_tasks = n_threads; + } break; case GGML_OP_CONV_TRANSPOSE_2D: { n_tasks = n_threads; @@ -19903,7 +20178,6 @@ static enum ggml_opt_result ggml_opt_adam( opt->loss_after = fx; - // check convergence if (fabsf(fx - fx_prev[0])/fx < params.adam.eps_f) { GGML_PRINT_DEBUG("converged\n"); diff --git a/ggml.h b/ggml.h index 16aaf169e..08bff5511 100644 --- a/ggml.h +++ b/ggml.h @@ -401,15 +401,16 @@ extern "C" { GGML_OP_ALIBI, GGML_OP_CLAMP, GGML_OP_CONV_1D, - GGML_OP_CONV_2D, + GGML_OP_CONV_1D_STAGE_0, // internal + GGML_OP_CONV_1D_STAGE_1, // internal GGML_OP_CONV_TRANSPOSE_1D, + GGML_OP_CONV_2D, + GGML_OP_CONV_2D_STAGE_0, // internal + GGML_OP_CONV_2D_STAGE_1, // internal GGML_OP_CONV_TRANSPOSE_2D, GGML_OP_POOL_1D, GGML_OP_POOL_2D, - GGML_OP_CONV_1D_STAGE_0, // internal - GGML_OP_CONV_1D_STAGE_1, // internal - GGML_OP_UPSCALE, // nearest interpolate GGML_OP_FLASH_ATTN, @@ -1020,9 +1021,9 @@ extern "C" { struct ggml_tensor * b, float eps); - // A: n columns, m rows - // B: n columns, p rows (i.e. we transpose it internally) - // result is m columns, p rows + // A: k columns, n rows => [ne03, ne02, n, k] + // B: k columns, m rows (i.e. we transpose it internally) => [ne03 * x, ne02 * y, m, k] + // result is n columns, m rows => [ne03 * x, ne02 * y, m, n] GGML_API struct ggml_tensor * ggml_mul_mat( struct ggml_context * ctx, struct ggml_tensor * a,