Merge branch 'master' of https://github.com/AndrewGodfrey/llama.cpp into finetune_enableGpu
This commit is contained in:
commit
653bc1c000
7 changed files with 438 additions and 163 deletions
35
common/log.h
35
common/log.h
|
@ -97,22 +97,23 @@
|
|||
#define LOG_TEE_TARGET stderr
|
||||
#endif
|
||||
|
||||
// NOTE: currently disabled as it produces too many log files
|
||||
// Utility to obtain "pid" like unique process id and use it when creating log files.
|
||||
inline std::string log_get_pid()
|
||||
{
|
||||
static std::string pid;
|
||||
if (pid.empty())
|
||||
{
|
||||
// std::this_thread::get_id() is the most portable way of obtaining a "process id"
|
||||
// it's not the same as "pid" but is unique enough to solve multiple instances
|
||||
// trying to write to the same log.
|
||||
std::stringstream ss;
|
||||
ss << std::this_thread::get_id();
|
||||
pid = ss.str();
|
||||
}
|
||||
|
||||
return pid;
|
||||
}
|
||||
//inline std::string log_get_pid()
|
||||
//{
|
||||
// static std::string pid;
|
||||
// if (pid.empty())
|
||||
// {
|
||||
// // std::this_thread::get_id() is the most portable way of obtaining a "process id"
|
||||
// // it's not the same as "pid" but is unique enough to solve multiple instances
|
||||
// // trying to write to the same log.
|
||||
// std::stringstream ss;
|
||||
// ss << std::this_thread::get_id();
|
||||
// pid = ss.str();
|
||||
// }
|
||||
//
|
||||
// return pid;
|
||||
//}
|
||||
|
||||
// Utility function for generating log file names with unique id based on thread id.
|
||||
// invocation with log_filename_generator( "llama", "log" ) creates a string "llama.<number>.log"
|
||||
|
@ -126,8 +127,8 @@ inline std::string log_filename_generator_impl(const std::string & log_file_base
|
|||
std::stringstream buf;
|
||||
|
||||
buf << log_file_basename;
|
||||
buf << ".";
|
||||
buf << log_get_pid();
|
||||
//buf << ".";
|
||||
//buf << log_get_pid();
|
||||
buf << ".";
|
||||
buf << log_file_extension;
|
||||
|
||||
|
|
|
@ -154,6 +154,10 @@ int main(int argc, char ** argv) {
|
|||
}
|
||||
}
|
||||
|
||||
LOG_TEE("\n");
|
||||
LOG_TEE("%s: n_kv_max = %d, is_pp_shared = %d, n_gpu_layers = %d, mmq = %d\n", __func__, n_kv_max, is_pp_shared, n_gpu_layers, mmq);
|
||||
LOG_TEE("\n");
|
||||
|
||||
LOG_TEE("|%6s | %6s | %4s | %6s | %8s | %8s | %8s | %8s | %8s | %8s |\n", "PP", "TG", "B", "N_KV", "T_PP s", "S_PP t/s", "T_TG s", "S_TG t/s", "T s", "S t/s");
|
||||
LOG_TEE("|%6s-|-%6s-|-%4s-|-%6s-|-%8s-|-%8s-|-%8s-|-%8s-|-%8s-|-%8s-|\n", "------", "------", "----", "------", "--------", "--------", "--------", "--------", "--------", "--------");
|
||||
|
||||
|
|
|
@ -16,6 +16,8 @@ add_library(common OBJECT
|
|||
${_common_path}/console.cpp
|
||||
${_common_path}/grammar-parser.h
|
||||
${_common_path}/grammar-parser.cpp
|
||||
${_common_path}/sampling.h
|
||||
${_common_path}/sampling.cpp
|
||||
)
|
||||
|
||||
# WARNING: because build-info.h is auto-generated, it will only
|
||||
|
|
|
@ -454,7 +454,7 @@ struct llama_client_slot
|
|||
}
|
||||
|
||||
void release() {
|
||||
if (state == PROCESSING)
|
||||
if (state == IDLE || state == PROCESSING)
|
||||
{
|
||||
t_token_generation = (ggml_time_us() - t_start_genereration) / 1e3;
|
||||
command = RELEASE;
|
||||
|
@ -754,6 +754,7 @@ struct llama_server_context
|
|||
}
|
||||
|
||||
slot->params.antiprompt.clear();
|
||||
|
||||
const auto &stop = data.find("stop");
|
||||
if (stop != data.end() && stop->is_array())
|
||||
{
|
||||
|
@ -867,7 +868,7 @@ struct llama_server_context
|
|||
|
||||
kv_cache_clear();
|
||||
|
||||
for (int32_t i = 0; i < batch.n_tokens; ++i)
|
||||
for (int i = 0; i < (int) system_tokens.size(); ++i)
|
||||
{
|
||||
llama_batch_add(batch, system_tokens[i], i, { 0 }, false);
|
||||
}
|
||||
|
@ -894,16 +895,8 @@ struct llama_server_context
|
|||
{
|
||||
slot.release();
|
||||
}
|
||||
wait_all_are_idle();
|
||||
all_slots_are_idle = true;
|
||||
|
||||
// wait until system prompt load
|
||||
system_need_update = true;
|
||||
while (system_need_update)
|
||||
{
|
||||
std::this_thread::sleep_for(std::chrono::milliseconds(5));
|
||||
}
|
||||
// system prompt loaded, continue
|
||||
}
|
||||
|
||||
void process_system_prompt_data(const json &sys_props) {
|
||||
|
@ -915,26 +908,6 @@ struct llama_server_context
|
|||
{
|
||||
notify_system_prompt_changed();
|
||||
}
|
||||
else
|
||||
{
|
||||
system_need_update = true;
|
||||
}
|
||||
}
|
||||
|
||||
void wait_all_are_idle() {
|
||||
bool wait = true;
|
||||
while (wait)
|
||||
{
|
||||
wait = false;
|
||||
for (auto &slot : slots)
|
||||
{
|
||||
if (!slot.available())
|
||||
{
|
||||
wait = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static size_t find_stopping_strings(const std::string &text, const size_t last_token_size,
|
||||
|
@ -965,7 +938,6 @@ struct llama_server_context
|
|||
slot.has_next_token = false;
|
||||
}
|
||||
stop_pos = pos;
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -1444,7 +1416,7 @@ struct llama_server_context
|
|||
process_tasks();
|
||||
|
||||
// update the system prompt wait until all slots are idle state
|
||||
if (system_need_update)
|
||||
if (system_need_update && all_slots_are_idle)
|
||||
{
|
||||
LOG_TEE("updating system prompt\n");
|
||||
update_system_prompt();
|
||||
|
@ -1498,7 +1470,7 @@ struct llama_server_context
|
|||
for (auto & slot : slots)
|
||||
{
|
||||
// release the slot
|
||||
if (slot.state == PROCESSING && slot.command == RELEASE)
|
||||
if (slot.command == RELEASE)
|
||||
{
|
||||
slot.state = IDLE;
|
||||
slot.command = NONE;
|
||||
|
@ -1509,7 +1481,7 @@ struct llama_server_context
|
|||
continue;
|
||||
}
|
||||
|
||||
if (slot.state == IDLE || slot.command == RELEASE)
|
||||
if (slot.state == IDLE)
|
||||
{
|
||||
continue;
|
||||
}
|
||||
|
@ -1530,6 +1502,17 @@ struct llama_server_context
|
|||
{
|
||||
for (auto & slot : slots)
|
||||
{
|
||||
const bool has_prompt = slot.prompt.is_array() || (slot.prompt.is_string() && !slot.prompt.get<std::string>().empty());
|
||||
|
||||
// empty prompt passed -> release the slot and send empty response
|
||||
if (slot.state == IDLE && slot.command == LOAD_PROMPT && !has_prompt)
|
||||
{
|
||||
slot.release();
|
||||
slot.print_timings();
|
||||
send_final_response(slot);
|
||||
continue;
|
||||
}
|
||||
|
||||
// need process the prompt
|
||||
if (slot.state == IDLE && slot.command == LOAD_PROMPT)
|
||||
{
|
||||
|
@ -1749,8 +1732,8 @@ struct llama_server_context
|
|||
if (!process_token(result, slot))
|
||||
{
|
||||
slot.release();
|
||||
send_final_response(slot);
|
||||
slot.print_timings();
|
||||
send_final_response(slot);
|
||||
}
|
||||
|
||||
slot.i_batch = -1;
|
||||
|
@ -1766,15 +1749,16 @@ static void server_print_usage(const char *argv0, const gpt_params ¶ms,
|
|||
printf("usage: %s [options]\n", argv0);
|
||||
printf("\n");
|
||||
printf("options:\n");
|
||||
printf(" -h, --help show this help message and exit\n");
|
||||
printf(" -v, --verbose verbose output (default: %s)\n", server_verbose ? "enabled" : "disabled");
|
||||
printf(" -t N, --threads N number of threads to use during computation (default: %d)\n", params.n_threads);
|
||||
printf(" -c N, --ctx-size N size of the prompt context (default: %d)\n", params.n_ctx);
|
||||
printf(" --rope-freq-base N RoPE base frequency (default: loaded from model)\n");
|
||||
printf(" --rope-freq-scale N RoPE frequency scaling factor (default: loaded from model)\n");
|
||||
printf(" -b N, --batch-size N batch size for prompt processing (default: %d)\n", params.n_batch);
|
||||
printf(" --memory-f32 use f32 instead of f16 for memory key+value (default: disabled)\n");
|
||||
printf(" not recommended: doubles context memory required and no measurable increase in quality\n");
|
||||
printf(" -h, --help show this help message and exit\n");
|
||||
printf(" -v, --verbose verbose output (default: %s)\n", server_verbose ? "enabled" : "disabled");
|
||||
printf(" -t N, --threads N number of threads to use during computation (default: %d)\n", params.n_threads);
|
||||
printf(" -tb N, --threads-batch N number of threads to use during batch and prompt processing (default: same as --threads)\n");
|
||||
printf(" -c N, --ctx-size N size of the prompt context (default: %d)\n", params.n_ctx);
|
||||
printf(" --rope-freq-base N RoPE base frequency (default: loaded from model)\n");
|
||||
printf(" --rope-freq-scale N RoPE frequency scaling factor (default: loaded from model)\n");
|
||||
printf(" -b N, --batch-size N batch size for prompt processing (default: %d)\n", params.n_batch);
|
||||
printf(" --memory-f32 use f32 instead of f16 for memory key+value (default: disabled)\n");
|
||||
printf(" not recommended: doubles context memory required and no measurable increase in quality\n");
|
||||
if (llama_mlock_supported())
|
||||
{
|
||||
printf(" --mlock force system to keep model in RAM rather than swapping or compressing\n");
|
||||
|
@ -1924,6 +1908,15 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
|
|||
}
|
||||
params.n_threads = std::stoi(argv[i]);
|
||||
}
|
||||
else if (arg == "--threads-batch" || arg == "-tb")
|
||||
{
|
||||
if (++i >= argc)
|
||||
{
|
||||
invalid_param = true;
|
||||
break;
|
||||
}
|
||||
params.n_threads_batch = std::stoi(argv[i]);
|
||||
}
|
||||
else if (arg == "-b" || arg == "--batch-size")
|
||||
{
|
||||
if (++i >= argc)
|
||||
|
@ -2285,7 +2278,7 @@ int main(int argc, char **argv)
|
|||
if (!json_value(data, "stream", false)) {
|
||||
std::string completion_text;
|
||||
task_result result = llama.next_result(task_id);
|
||||
if(!result.error && result.stop) {
|
||||
if (!result.error && result.stop) {
|
||||
res.set_content(result.result_json.dump(-1, ' ', false, json::error_handler_t::replace), "application/json");
|
||||
}
|
||||
else
|
||||
|
@ -2312,7 +2305,7 @@ int main(int argc, char **argv)
|
|||
{
|
||||
return false;
|
||||
}
|
||||
if(result.stop) {
|
||||
if (result.stop) {
|
||||
break;
|
||||
}
|
||||
} else {
|
||||
|
|
22
ggml-cuda.cu
22
ggml-cuda.cu
|
@ -5678,10 +5678,10 @@ void ggml_init_cublas() {
|
|||
GGML_ASSERT(g_device_count <= GGML_CUDA_MAX_DEVICES);
|
||||
int64_t total_vram = 0;
|
||||
fprintf(stderr, "%s: found %d " GGML_CUDA_NAME " devices:\n", __func__, g_device_count);
|
||||
for (int64_t id = 0; id < g_device_count; ++id) {
|
||||
for (int id = 0; id < g_device_count; ++id) {
|
||||
cudaDeviceProp prop;
|
||||
CUDA_CHECK(cudaGetDeviceProperties(&prop, id));
|
||||
fprintf(stderr, " Device %ld: %s, compute capability %d.%d\n", id, prop.name, prop.major, prop.minor);
|
||||
fprintf(stderr, " Device %d: %s, compute capability %d.%d\n", id, prop.name, prop.major, prop.minor);
|
||||
|
||||
g_tensor_split[id] = total_vram;
|
||||
total_vram += prop.totalGlobalMem;
|
||||
|
@ -5691,15 +5691,15 @@ void ggml_init_cublas() {
|
|||
g_compute_capabilities[id] = 100*prop.major + 10*prop.minor;
|
||||
#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
|
||||
}
|
||||
for (int64_t id = 0; id < g_device_count; ++id) {
|
||||
for (int id = 0; id < g_device_count; ++id) {
|
||||
g_tensor_split[id] /= total_vram;
|
||||
}
|
||||
|
||||
for (int64_t id = 0; id < g_device_count; ++id) {
|
||||
for (int id = 0; id < g_device_count; ++id) {
|
||||
CUDA_CHECK(ggml_cuda_set_device(id));
|
||||
|
||||
// create cuda streams
|
||||
for (int64_t is = 0; is < MAX_STREAMS; ++is) {
|
||||
for (int is = 0; is < MAX_STREAMS; ++is) {
|
||||
CUDA_CHECK(cudaStreamCreateWithFlags(&g_cudaStreams[id][is], cudaStreamNonBlocking));
|
||||
}
|
||||
|
||||
|
@ -6271,16 +6271,15 @@ inline void ggml_cuda_op_mul_mat_cublas(
|
|||
const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low, const int64_t row_high, const int64_t src1_ncols,
|
||||
const int64_t src1_padded_row_size, const cudaStream_t & stream) {
|
||||
|
||||
GGML_ASSERT(src0_dd_i != nullptr);
|
||||
GGML_ASSERT(src0_dd_i != nullptr);
|
||||
GGML_ASSERT(src1_ddf_i != nullptr);
|
||||
GGML_ASSERT(dst_dd_i != nullptr);
|
||||
|
||||
GGML_ASSERT(dst_dd_i != nullptr);
|
||||
|
||||
const int64_t ne00 = src0->ne[0];
|
||||
|
||||
const int64_t ne10 = src1->ne[0];
|
||||
|
||||
const int64_t ne0 = dst->ne[0];
|
||||
|
||||
const int64_t row_diff = row_high - row_low;
|
||||
|
||||
int id;
|
||||
|
@ -7240,12 +7239,13 @@ static void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1
|
|||
//printf("src1 is contiguous %d, transposed %d, type = %s, name = %s\n", ggml_is_contiguous(src1), ggml_is_transposed(src1), ggml_type_name(src1->type), src1->name);
|
||||
|
||||
if (all_on_device && src0->type == GGML_TYPE_F16 && ggml_is_permuted(src0) && ggml_is_permuted(src1) && src1->ne[1] == 1) {
|
||||
// KQ
|
||||
// KQ single-batch
|
||||
ggml_cuda_mul_mat_vec_p021(src0, src1, dst);
|
||||
} else if (all_on_device && src0->type == GGML_TYPE_F16 && !ggml_is_contiguous(src0) && !ggml_is_transposed(src1) && src1->ne[1] == 1) {
|
||||
// KQV
|
||||
// KQV single-batch
|
||||
ggml_cuda_mul_mat_vec_nc(src0, src1, dst);
|
||||
} else if (all_on_device && src0->type == GGML_TYPE_F16 && src1->type == GGML_TYPE_F32 && !ggml_is_transposed(src0) && !ggml_is_transposed(src1) && src1->ne[2]*src1->ne[3] > 1) {
|
||||
// KQ + KQV multi-batch
|
||||
ggml_cuda_mul_mat_mat_batched_cublas(src0, src1, dst);
|
||||
} else if (src0->type == GGML_TYPE_F32) {
|
||||
ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_mul_mat_cublas, false);
|
||||
|
|
438
ggml.c
438
ggml.c
|
@ -571,7 +571,6 @@ int64_t ggml_cycles_per_ms(void) {
|
|||
#define ggml_perf_cycles_per_ms() 0
|
||||
#endif
|
||||
|
||||
|
||||
//
|
||||
// cache line
|
||||
//
|
||||
|
@ -1830,7 +1829,6 @@ ggml_type_traits_t ggml_internal_get_type_traits(enum ggml_type type) {
|
|||
return type_traits[type];
|
||||
}
|
||||
|
||||
|
||||
//
|
||||
// simd mappings
|
||||
//
|
||||
|
@ -4059,16 +4057,17 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
|
|||
"ALIBI",
|
||||
"CLAMP",
|
||||
"CONV_1D",
|
||||
"CONV_1D_STAGE_0",
|
||||
"CONV_1D_STAGE_1",
|
||||
"CONV_TRANSPOSE_1D",
|
||||
"CONV_2D",
|
||||
"CONV_2D_STAGE_0",
|
||||
"CONV_2D_STAGE_1",
|
||||
"CONV_TRANSPOSE_2D",
|
||||
"POOL_1D",
|
||||
"POOL_2D",
|
||||
"UPSCALE",
|
||||
|
||||
"CONV_1D_STAGE_0",
|
||||
"CONV_1D_STAGE_1",
|
||||
|
||||
"FLASH_ATTN",
|
||||
"FLASH_FF",
|
||||
"FLASH_ATTN_BACK",
|
||||
|
@ -4094,7 +4093,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = {
|
|||
"CROSS_ENTROPY_LOSS_BACK",
|
||||
};
|
||||
|
||||
static_assert(GGML_OP_COUNT == 71, "GGML_OP_COUNT != 71");
|
||||
static_assert(GGML_OP_COUNT == 73, "GGML_OP_COUNT != 73");
|
||||
|
||||
static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
|
||||
"none",
|
||||
|
@ -4145,16 +4144,17 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
|
|||
"alibi(x)",
|
||||
"clamp(x)",
|
||||
"conv_1d(x)",
|
||||
"conv_1d_stage_0(x)",
|
||||
"conv_1d_stage_1(x)",
|
||||
"conv_transpose_1d(x)",
|
||||
"conv_2d(x)",
|
||||
"conv_2d_stage_0(x)",
|
||||
"conv_2d_stage_1(x)",
|
||||
"conv_transpose_2d(x)",
|
||||
"pool_1d(x)",
|
||||
"pool_2d(x)",
|
||||
"upscale(x)",
|
||||
|
||||
"conv_1d_stage_0(x)",
|
||||
"conv_1d_stage_1(x)",
|
||||
|
||||
"flash_attn(x)",
|
||||
"flash_ff(x)",
|
||||
"flash_attn_back(x)",
|
||||
|
@ -4180,7 +4180,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
|
|||
"cross_entropy_loss_back(x,y)",
|
||||
};
|
||||
|
||||
static_assert(GGML_OP_COUNT == 71, "GGML_OP_COUNT != 71");
|
||||
static_assert(GGML_OP_COUNT == 73, "GGML_OP_COUNT != 73");
|
||||
|
||||
static_assert(GGML_OP_POOL_COUNT == 2, "GGML_OP_POOL_COUNT != 2");
|
||||
|
||||
|
@ -4211,8 +4211,10 @@ static void ggml_setup_op_has_task_pass(void) {
|
|||
p[GGML_OP_CONV_1D ] = true;
|
||||
p[GGML_OP_CONV_1D_STAGE_0 ] = true;
|
||||
p[GGML_OP_CONV_1D_STAGE_1 ] = true;
|
||||
p[GGML_OP_CONV_2D ] = true;
|
||||
p[GGML_OP_CONV_TRANSPOSE_1D ] = true;
|
||||
p[GGML_OP_CONV_2D ] = true;
|
||||
p[GGML_OP_CONV_2D_STAGE_0 ] = true;
|
||||
p[GGML_OP_CONV_2D_STAGE_1 ] = true;
|
||||
p[GGML_OP_CONV_TRANSPOSE_2D ] = true;
|
||||
p[GGML_OP_FLASH_ATTN_BACK ] = true;
|
||||
p[GGML_OP_CROSS_ENTROPY_LOSS ] = true;
|
||||
|
@ -5956,7 +5958,6 @@ struct ggml_tensor * ggml_sqrt_inplace(
|
|||
return ggml_sqrt_impl(ctx, a, true);
|
||||
}
|
||||
|
||||
|
||||
// ggml_log
|
||||
|
||||
static struct ggml_tensor * ggml_log_impl(
|
||||
|
@ -6010,7 +6011,6 @@ struct ggml_tensor * ggml_sum(
|
|||
return result;
|
||||
}
|
||||
|
||||
|
||||
// ggml_sum_rows
|
||||
|
||||
struct ggml_tensor * ggml_sum_rows(
|
||||
|
@ -6642,7 +6642,6 @@ struct ggml_tensor * ggml_set_2d_inplace(
|
|||
return ggml_set_impl(ctx, a, b, nb1, a->nb[2], a->nb[3], offset, false);
|
||||
}
|
||||
|
||||
|
||||
// ggml_cpy
|
||||
|
||||
static struct ggml_tensor * ggml_cpy_impl(
|
||||
|
@ -6722,7 +6721,6 @@ struct ggml_tensor * ggml_cont_inplace(
|
|||
return ggml_cont_impl(ctx, a, true);
|
||||
}
|
||||
|
||||
|
||||
// make contiguous, with new shape
|
||||
GGML_API struct ggml_tensor * ggml_cont_1d(
|
||||
struct ggml_context * ctx,
|
||||
|
@ -7175,7 +7173,6 @@ struct ggml_tensor * ggml_diag(
|
|||
return result;
|
||||
}
|
||||
|
||||
|
||||
// ggml_diag_mask_inf
|
||||
|
||||
static struct ggml_tensor * ggml_diag_mask_inf_impl(
|
||||
|
@ -7287,7 +7284,6 @@ struct ggml_tensor * ggml_soft_max_inplace(
|
|||
return ggml_soft_max_impl(ctx, a, true);
|
||||
}
|
||||
|
||||
|
||||
// ggml_soft_max_back
|
||||
|
||||
static struct ggml_tensor * ggml_soft_max_back_impl(
|
||||
|
@ -7704,7 +7700,11 @@ GGML_API struct ggml_tensor * ggml_conv_transpose_1d(
|
|||
|
||||
// ggml_conv_2d
|
||||
|
||||
struct ggml_tensor * ggml_conv_2d(
|
||||
// im2col: [N, IC, IH, IW] => [N, OH, OW, IC*KH*KW]
|
||||
// a: [OC,IC, KH, KW]
|
||||
// b: [N, IC, IH, IW]
|
||||
// result: [N, OH, OW, IC*KH*KW]
|
||||
static struct ggml_tensor * ggml_conv_2d_stage_0(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * a,
|
||||
struct ggml_tensor * b,
|
||||
|
@ -7723,17 +7723,21 @@ struct ggml_tensor * ggml_conv_2d(
|
|||
is_node = true;
|
||||
}
|
||||
|
||||
const int64_t OH = ggml_calc_conv_output_size(b->ne[1], a->ne[1], s1, p1, d1);
|
||||
const int64_t OW = ggml_calc_conv_output_size(b->ne[0], a->ne[0], s0, p0, d0);
|
||||
|
||||
const int64_t ne[4] = {
|
||||
ggml_calc_conv_output_size(b->ne[0], a->ne[0], s0, p0, d0),
|
||||
ggml_calc_conv_output_size(b->ne[1], a->ne[1], s1, p1, d1),
|
||||
a->ne[3], b->ne[3],
|
||||
a->ne[2] * a->ne[1] * a->ne[0],
|
||||
OW,
|
||||
OH,
|
||||
b->ne[3],
|
||||
};
|
||||
struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne);
|
||||
struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F16, 4, ne);
|
||||
|
||||
int32_t params[] = { s0, s1, p0, p1, d0, d1 };
|
||||
ggml_set_op_params(result, params, sizeof(params));
|
||||
|
||||
result->op = GGML_OP_CONV_2D;
|
||||
result->op = GGML_OP_CONV_2D_STAGE_0;
|
||||
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
|
||||
result->src[0] = a;
|
||||
result->src[1] = b;
|
||||
|
@ -7742,8 +7746,61 @@ struct ggml_tensor * ggml_conv_2d(
|
|||
|
||||
}
|
||||
|
||||
// ggml_conv_2d_sk_p0
|
||||
// gemm: [N, OC, OH, OW] = [OC, IC * KH * KW] x [N*OH*OW, IC * KH * KW]
|
||||
// a: [OC, IC, KH, KW]
|
||||
// b: [N, OH, OW, IC * KH * KW]
|
||||
// result: [N, OC, OH, OW]
|
||||
static struct ggml_tensor * ggml_conv_2d_stage_1(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * a,
|
||||
struct ggml_tensor * b) {
|
||||
|
||||
bool is_node = false;
|
||||
|
||||
if (a->grad || b->grad) {
|
||||
GGML_ASSERT(false); // TODO: implement backward
|
||||
is_node = true;
|
||||
}
|
||||
|
||||
const int64_t ne[4] = {
|
||||
b->ne[1],
|
||||
b->ne[2],
|
||||
a->ne[3],
|
||||
b->ne[3],
|
||||
};
|
||||
struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne);
|
||||
|
||||
result->op = GGML_OP_CONV_2D_STAGE_1;
|
||||
result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
|
||||
result->src[0] = a;
|
||||
result->src[1] = b;
|
||||
|
||||
return result;
|
||||
|
||||
}
|
||||
|
||||
// a: [OC,IC, KH, KW]
|
||||
// b: [N, IC, IH, IW]
|
||||
// result: [N, OC, OH, OW]
|
||||
struct ggml_tensor * ggml_conv_2d(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * a,
|
||||
struct ggml_tensor * b,
|
||||
int s0,
|
||||
int s1,
|
||||
int p0,
|
||||
int p1,
|
||||
int d0,
|
||||
int d1) {
|
||||
|
||||
struct ggml_tensor * result = ggml_conv_2d_stage_0(ctx, a, b, s0, s1, p0, p1, d0, d1); // [N, OH, OW, IC * KH * KW]
|
||||
result = ggml_conv_2d_stage_1(ctx, a, result);
|
||||
|
||||
return result;
|
||||
|
||||
}
|
||||
|
||||
// ggml_conv_2d_sk_p0
|
||||
struct ggml_tensor * ggml_conv_2d_sk_p0(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * a,
|
||||
|
@ -8182,7 +8239,6 @@ static struct ggml_tensor * ggml_add_rel_pos_impl(
|
|||
return result;
|
||||
}
|
||||
|
||||
|
||||
struct ggml_tensor * ggml_add_rel_pos(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * a,
|
||||
|
@ -8627,8 +8683,6 @@ struct ggml_tensor * ggml_map_custom3_inplace(
|
|||
return ggml_map_custom3_impl(ctx, a, b, c, fun, n_tasks, userdata, true);
|
||||
}
|
||||
|
||||
|
||||
|
||||
// ggml_cross_entropy_loss
|
||||
|
||||
struct ggml_tensor * ggml_cross_entropy_loss(
|
||||
|
@ -9853,7 +9907,6 @@ static void ggml_compute_forward_add1(
|
|||
}
|
||||
}
|
||||
|
||||
|
||||
// ggml_compute_forward_acc
|
||||
|
||||
static void ggml_compute_forward_acc_f32(
|
||||
|
@ -9993,7 +10046,6 @@ static void ggml_compute_forward_sub_f32(
|
|||
const int i2 = (ir - i3*ne2*ne1)/ne1;
|
||||
const int i1 = (ir - i3*ne2*ne1 - i2*ne1);
|
||||
|
||||
|
||||
#ifdef GGML_USE_ACCELERATE
|
||||
vDSP_vsub(
|
||||
(float *) ((char *) src1->data + i3*nb13 + i2*nb12 + i1*nb11), 1,
|
||||
|
@ -10174,7 +10226,6 @@ static void ggml_compute_forward_div_f32(
|
|||
const int i2 = (ir - i3*ne2*ne1)/ne1;
|
||||
const int i1 = (ir - i3*ne2*ne1 - i2*ne1);
|
||||
|
||||
|
||||
#ifdef GGML_USE_ACCELERATE
|
||||
UNUSED(ggml_vec_div_f32);
|
||||
|
||||
|
@ -10312,7 +10363,6 @@ static void ggml_compute_forward_sqrt(
|
|||
}
|
||||
}
|
||||
|
||||
|
||||
// ggml_compute_forward_log
|
||||
|
||||
static void ggml_compute_forward_log_f32(
|
||||
|
@ -12145,7 +12195,6 @@ static void ggml_compute_forward_out_prod_f32(
|
|||
}
|
||||
}
|
||||
|
||||
|
||||
//int64_t t1 = ggml_perf_time_us();
|
||||
//static int64_t acc = 0;
|
||||
//acc += t1 - t0;
|
||||
|
@ -12341,7 +12390,6 @@ static void ggml_compute_forward_scale_f32(
|
|||
|
||||
const size_t nb1 = dst->nb[1];
|
||||
|
||||
|
||||
for (int i1 = ir0; i1 < ir1; i1++) {
|
||||
if (dst->data != src0->data) {
|
||||
// src0 is same shape as dst => same indices
|
||||
|
@ -12739,7 +12787,6 @@ static void ggml_compute_forward_get_rows_back_f32(
|
|||
}
|
||||
}
|
||||
|
||||
|
||||
static void ggml_compute_forward_get_rows_back(
|
||||
const struct ggml_compute_params * params,
|
||||
const struct ggml_tensor * src0,
|
||||
|
@ -14022,6 +14069,7 @@ static void ggml_compute_forward_conv_1d_f32(
|
|||
}
|
||||
}
|
||||
|
||||
// TODO: reuse ggml_mul_mat or implement ggml_im2col and remove stage_0 and stage_1
|
||||
static void gemm_f16_out_f32(int64_t m, int64_t n, int64_t k,
|
||||
ggml_fp16_t * A,
|
||||
ggml_fp16_t * B,
|
||||
|
@ -14323,6 +14371,9 @@ static void ggml_compute_forward_conv_transpose_1d_f16_f32(
|
|||
}
|
||||
}
|
||||
|
||||
// need to zero dst since we are accumulating into it
|
||||
memset(dst->data, 0, ggml_nbytes(dst));
|
||||
|
||||
return;
|
||||
}
|
||||
|
||||
|
@ -14395,7 +14446,7 @@ static void ggml_compute_forward_conv_transpose_1d_f32(
|
|||
const float * const src = (float *)((char *) src0->data + i02*nb02 + i01*nb01);
|
||||
float * dst_data = wdata + i01*ne00*ne02;
|
||||
for (int64_t i00 = 0; i00 < ne00; i00++) {
|
||||
dst_data[i01*ne00*ne02 + i00*ne02 + i02] = src[i00];
|
||||
dst_data[i00*ne02 + i02] = src[i00];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -14414,6 +14465,9 @@ static void ggml_compute_forward_conv_transpose_1d_f32(
|
|||
}
|
||||
}
|
||||
|
||||
// need to zero dst since we are accumulating into it
|
||||
memset(dst->data, 0, ggml_nbytes(dst));
|
||||
|
||||
return;
|
||||
}
|
||||
|
||||
|
@ -14475,6 +14529,144 @@ static void ggml_compute_forward_conv_transpose_1d(
|
|||
|
||||
// ggml_compute_forward_conv_2d
|
||||
|
||||
// src0: kernel [OC, IC, KH, KW]
|
||||
// src1: image [N, IC, IH, IW]
|
||||
// dst: result [N, OH, OW, IC*KH*KW]
|
||||
static void ggml_compute_forward_conv_2d_stage_0_f32(
|
||||
const struct ggml_compute_params * params,
|
||||
const struct ggml_tensor * src0,
|
||||
const struct ggml_tensor * src1,
|
||||
struct ggml_tensor * dst) {
|
||||
GGML_ASSERT(src0->type == GGML_TYPE_F16);
|
||||
GGML_ASSERT(src1->type == GGML_TYPE_F32);
|
||||
GGML_ASSERT( dst->type == GGML_TYPE_F16);
|
||||
|
||||
int64_t t0 = ggml_perf_time_us();
|
||||
UNUSED(t0);
|
||||
|
||||
GGML_TENSOR_BINARY_OP_LOCALS;
|
||||
|
||||
const int64_t N = ne13;
|
||||
const int64_t IC = ne12;
|
||||
const int64_t IH = ne11;
|
||||
const int64_t IW = ne10;
|
||||
|
||||
// const int64_t OC = ne03;
|
||||
// const int64_t IC = ne02;
|
||||
const int64_t KH = ne01;
|
||||
const int64_t KW = ne00;
|
||||
|
||||
const int64_t OH = ne2;
|
||||
const int64_t OW = ne1;
|
||||
|
||||
const int ith = params->ith;
|
||||
const int nth = params->nth;
|
||||
|
||||
const int32_t s0 = ((const int32_t*)(dst->op_params))[0];
|
||||
const int32_t s1 = ((const int32_t*)(dst->op_params))[1];
|
||||
const int32_t p0 = ((const int32_t*)(dst->op_params))[2];
|
||||
const int32_t p1 = ((const int32_t*)(dst->op_params))[3];
|
||||
const int32_t d0 = ((const int32_t*)(dst->op_params))[4];
|
||||
const int32_t d1 = ((const int32_t*)(dst->op_params))[5];
|
||||
|
||||
GGML_ASSERT(nb00 == sizeof(ggml_fp16_t));
|
||||
GGML_ASSERT(nb10 == sizeof(float));
|
||||
|
||||
if (params->type == GGML_TASK_INIT) {
|
||||
memset(dst->data, 0, ggml_nbytes(dst));
|
||||
return;
|
||||
}
|
||||
|
||||
if (params->type == GGML_TASK_FINALIZE) {
|
||||
return;
|
||||
}
|
||||
|
||||
// im2col: [N, IC, IH, IW] => [N, OH, OW, IC*KH*KW]
|
||||
{
|
||||
ggml_fp16_t * const wdata = (ggml_fp16_t *) dst->data;
|
||||
|
||||
for (int64_t in = 0; in < N; in++) {
|
||||
for (int64_t ioh = 0; ioh < OH; ioh++) {
|
||||
for (int64_t iow = 0; iow < OW; iow++) {
|
||||
for (int64_t iic = ith; iic < IC; iic+=nth) {
|
||||
|
||||
// micro kernel
|
||||
ggml_fp16_t * dst_data = wdata + (in*OH*OW + ioh*OW + iow)*(IC*KH*KW); // [IC, KH, KW]
|
||||
const float * const src_data = (float *)((char *) src1->data + in*nb13 + iic*nb12); // [IH, IW]
|
||||
|
||||
for (int64_t ikh = 0; ikh < KH; ikh++) {
|
||||
for (int64_t ikw = 0; ikw < KW; ikw++) {
|
||||
const int64_t iiw = iow*s0 + ikw*d0 - p0;
|
||||
const int64_t iih = ioh*s1 + ikh*d1 - p1;
|
||||
|
||||
if (!(iih < 0 || iih >= IH || iiw < 0 || iiw >= IW)) {
|
||||
dst_data[iic*(KH*KW) + ikh*KW + ikw] = GGML_FP32_TO_FP16(src_data[iih*IW + iiw]);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// gemm: [N, OC, OH, OW] = [OC, IC * KH * KW] x [N*OH*OW, IC * KH * KW]
|
||||
// src0: [OC, IC, KH, KW]
|
||||
// src1: [N, OH, OW, IC * KH * KW]
|
||||
// result: [N, OC, OH, OW]
|
||||
static void ggml_compute_forward_conv_2d_stage_1_f16(
|
||||
const struct ggml_compute_params * params,
|
||||
const struct ggml_tensor * src0,
|
||||
const struct ggml_tensor * src1,
|
||||
struct ggml_tensor * dst) {
|
||||
GGML_ASSERT(src0->type == GGML_TYPE_F16);
|
||||
GGML_ASSERT(src1->type == GGML_TYPE_F16);
|
||||
GGML_ASSERT( dst->type == GGML_TYPE_F32);
|
||||
|
||||
int64_t t0 = ggml_perf_time_us();
|
||||
UNUSED(t0);
|
||||
|
||||
if (params->type == GGML_TASK_INIT) {
|
||||
return;
|
||||
}
|
||||
|
||||
if (params->type == GGML_TASK_FINALIZE) {
|
||||
return;
|
||||
}
|
||||
|
||||
GGML_TENSOR_BINARY_OP_LOCALS;
|
||||
|
||||
GGML_ASSERT(nb00 == sizeof(ggml_fp16_t));
|
||||
GGML_ASSERT(nb10 == sizeof(ggml_fp16_t));
|
||||
GGML_ASSERT(nb0 == sizeof(float));
|
||||
|
||||
const int N = ne13;
|
||||
const int OH = ne12;
|
||||
const int OW = ne11;
|
||||
|
||||
const int OC = ne03;
|
||||
const int IC = ne02;
|
||||
const int KH = ne01;
|
||||
const int KW = ne00;
|
||||
|
||||
const int ith = params->ith;
|
||||
const int nth = params->nth;
|
||||
|
||||
int64_t m = OC;
|
||||
int64_t n = OH * OW;
|
||||
int64_t k = IC * KH * KW;
|
||||
|
||||
// [N, OC, OH, OW] = [OC, IC * KH * KW] x [N*OH*OW, IC * KH * KW]
|
||||
for (int i = 0; i < N; i++) {
|
||||
ggml_fp16_t * A = (ggml_fp16_t *)src0->data; // [m, k]
|
||||
ggml_fp16_t * B = (ggml_fp16_t *)src1->data + i * m * k; // [n, k]
|
||||
float * C = (float *)dst->data + i * m * n; // [m, n]
|
||||
|
||||
gemm_f16_out_f32(m, n, k, A, B, C, ith, nth);
|
||||
}
|
||||
}
|
||||
|
||||
static void ggml_compute_forward_conv_2d_f16_f32(
|
||||
const struct ggml_compute_params * params,
|
||||
const struct ggml_tensor * src0,
|
||||
|
@ -14487,16 +14679,40 @@ static void ggml_compute_forward_conv_2d_f16_f32(
|
|||
int64_t t0 = ggml_perf_time_us();
|
||||
UNUSED(t0);
|
||||
|
||||
GGML_TENSOR_BINARY_OP_LOCALS;
|
||||
GGML_TENSOR_BINARY_OP_LOCALS
|
||||
|
||||
// src1: image [N, IC, IH, IW]
|
||||
// src0: kernel [OC, IC, KH, KW]
|
||||
// dst: result [N, OC, OH, OW]
|
||||
// ne12: IC
|
||||
// ne0: OW
|
||||
// ne1: OH
|
||||
// nk0: KW
|
||||
// nk1: KH
|
||||
// ne13: N
|
||||
|
||||
const int N = ne13;
|
||||
const int IC = ne12;
|
||||
const int IH = ne11;
|
||||
const int IW = ne10;
|
||||
|
||||
const int OC = ne03;
|
||||
// const int IC = ne02;
|
||||
const int KH = ne01;
|
||||
const int KW = ne00;
|
||||
|
||||
const int OH = ne1;
|
||||
const int OW = ne0;
|
||||
|
||||
const int ith = params->ith;
|
||||
const int nth = params->nth;
|
||||
|
||||
const int nk0 = ne00;
|
||||
const int nk1 = ne01;
|
||||
// const int nk0 = ne00;
|
||||
// const int nk1 = ne01;
|
||||
|
||||
// size of the convolution row - the kernel size unrolled across all channels
|
||||
const int ew0 = nk0*nk1*ne02;
|
||||
// const int ew0 = nk0*nk1*ne02;
|
||||
// ew0: IC*KH*KW
|
||||
|
||||
const int32_t s0 = ((const int32_t*)(dst->op_params))[0];
|
||||
const int32_t s1 = ((const int32_t*)(dst->op_params))[1];
|
||||
|
@ -14512,24 +14728,27 @@ static void ggml_compute_forward_conv_2d_f16_f32(
|
|||
memset(params->wdata, 0, params->wsize);
|
||||
|
||||
// prepare source data (src1)
|
||||
// im2col: [N, IC, IH, IW] => [N*OH*OW, IC*KH*KW]
|
||||
|
||||
{
|
||||
ggml_fp16_t * const wdata = (ggml_fp16_t *) params->wdata + 0;
|
||||
|
||||
for (int i13 = 0; i13 < ne13; i13++) {
|
||||
for (int i12 = 0; i12 < ne12; i12++) {
|
||||
const float * const src = (float *)((char *) src1->data + i13*nb13 + i12*nb12);
|
||||
ggml_fp16_t * dst_data = wdata + i13*(ne1*ne0*ew0);
|
||||
for (int in = 0; in < N; in++) {
|
||||
for (int iic = 0; iic < IC; iic++) {
|
||||
for (int ioh = 0; ioh < OH; ioh++) {
|
||||
for (int iow = 0; iow < OW; iow++) {
|
||||
|
||||
for (int i1 = 0; i1 < ne1; i1++) {
|
||||
for (int i0 = 0; i0 < ne0; i0++) {
|
||||
for (int ik1 = 0; ik1 < nk1; ik1++) {
|
||||
for (int ik0 = 0; ik0 < nk0; ik0++) {
|
||||
const int idx0 = i0*s0 + ik0*d0 - p0;
|
||||
const int idx1 = i1*s1 + ik1*d1 - p1;
|
||||
// micro kernel
|
||||
ggml_fp16_t * dst_data = wdata + (in*OH*OW + ioh*OW + iow)*(IC*KH*KW); // [IC, KH, KW]
|
||||
const float * const src_data = (float *)((char *) src1->data + in*nb13 + iic*nb12); // [IH, IW]
|
||||
|
||||
if (!(idx1 < 0 || idx1 >= ne11 || idx0 < 0 || idx0 >= ne10)) {
|
||||
dst_data[(i1*ne0 + i0)*ew0 + i12*(nk0*nk1) + ik1*nk0 + ik0] =
|
||||
GGML_FP32_TO_FP16(src[idx1*ne10 + idx0]);
|
||||
for (int ikh = 0; ikh < KH; ikh++) {
|
||||
for (int ikw = 0; ikw < KW; ikw++) {
|
||||
const int iiw = iow*s0 + ikw*d0 - p0;
|
||||
const int iih = ioh*s1 + ikh*d1 - p1;
|
||||
|
||||
if (!(iih < 0 || iih >= IH || iiw < 0 || iiw >= IW)) {
|
||||
dst_data[iic*(KH*KW) + ikh*KW + ikw] = GGML_FP32_TO_FP16(src_data[iih*IW + iiw]);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -14546,30 +14765,22 @@ static void ggml_compute_forward_conv_2d_f16_f32(
|
|||
return;
|
||||
}
|
||||
|
||||
// total patches in dst
|
||||
const int np = ne2;
|
||||
|
||||
// patches per thread
|
||||
const int dp = (np + nth - 1)/nth;
|
||||
|
||||
// patch range for this thread
|
||||
const int ip0 = dp*ith;
|
||||
const int ip1 = MIN(ip0 + dp, np);
|
||||
|
||||
ggml_fp16_t * const wdata = (ggml_fp16_t *) params->wdata + 0;
|
||||
// wdata: [N*OH*OW, IC*KH*KW]
|
||||
// dst: result [N, OC, OH, OW]
|
||||
// src0: kernel [OC, IC, KH, KW]
|
||||
|
||||
for (int i3 = 0; i3 < ne3; i3++) {
|
||||
for (int i2 = ip0; i2 < ip1; i2++) {
|
||||
float * dst_data = (float *)((char *) dst->data + i3*nb3 + i2*nb2);
|
||||
int64_t m = OC;
|
||||
int64_t n = OH * OW;
|
||||
int64_t k = IC * KH * KW;
|
||||
|
||||
for (int i1 = 0; i1 < ne1; ++i1) {
|
||||
for (int i0 = 0; i0 < ne0; ++i0) {
|
||||
ggml_vec_dot_f16(ew0, dst_data + i1*ne0 + i0,
|
||||
(ggml_fp16_t *) ((char *) src0->data + i2*nb03),
|
||||
(ggml_fp16_t *) wdata + i3*nb3 + (i1*ne0 + i0)*ew0);
|
||||
}
|
||||
}
|
||||
}
|
||||
// [N, OC, OH, OW] = [OC, IC * KH * KW] x [N*OH*OW, IC * KH * KW]
|
||||
for (int i = 0; i < N; i++) {
|
||||
ggml_fp16_t * A = (ggml_fp16_t *)src0->data; // [m, k]
|
||||
ggml_fp16_t * B = (ggml_fp16_t *)wdata + i * m * k; // [n, k]
|
||||
float * C = (float *)dst->data + i * m * n; // [m * k]
|
||||
|
||||
gemm_f16_out_f32(m, n, k, A, B, C, ith, nth);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -14595,6 +14806,48 @@ static void ggml_compute_forward_conv_2d(
|
|||
}
|
||||
}
|
||||
|
||||
static void ggml_compute_forward_conv_2d_stage_0(
|
||||
const struct ggml_compute_params * params,
|
||||
const struct ggml_tensor * src0,
|
||||
const struct ggml_tensor * src1,
|
||||
struct ggml_tensor * dst) {
|
||||
switch (src0->type) {
|
||||
case GGML_TYPE_F16:
|
||||
{
|
||||
ggml_compute_forward_conv_2d_stage_0_f32(params, src0, src1, dst);
|
||||
} break;
|
||||
case GGML_TYPE_F32:
|
||||
{
|
||||
GGML_ASSERT(false);
|
||||
} break;
|
||||
default:
|
||||
{
|
||||
GGML_ASSERT(false);
|
||||
} break;
|
||||
}
|
||||
}
|
||||
|
||||
static void ggml_compute_forward_conv_2d_stage_1(
|
||||
const struct ggml_compute_params * params,
|
||||
const struct ggml_tensor * src0,
|
||||
const struct ggml_tensor * src1,
|
||||
struct ggml_tensor * dst) {
|
||||
switch (src0->type) {
|
||||
case GGML_TYPE_F16:
|
||||
{
|
||||
ggml_compute_forward_conv_2d_stage_1_f16(params, src0, src1, dst);
|
||||
} break;
|
||||
case GGML_TYPE_F32:
|
||||
{
|
||||
GGML_ASSERT(false);
|
||||
} break;
|
||||
default:
|
||||
{
|
||||
GGML_ASSERT(false);
|
||||
} break;
|
||||
}
|
||||
}
|
||||
|
||||
// ggml_compute_forward_conv_transpose_2d
|
||||
|
||||
static void ggml_compute_forward_conv_transpose_2d(
|
||||
|
@ -14653,6 +14906,8 @@ static void ggml_compute_forward_conv_transpose_2d(
|
|||
}
|
||||
}
|
||||
|
||||
memset(dst->data, 0, ggml_nbytes(dst));
|
||||
|
||||
return;
|
||||
}
|
||||
|
||||
|
@ -16151,7 +16406,6 @@ static void ggml_compute_forward_add_rel_pos_f32(
|
|||
const int ip0 = dp*ith;
|
||||
const int ip1 = MIN(ip0 + dp, np);
|
||||
|
||||
|
||||
for (int64_t i13 = ip0; i13 < ip1; ++i13) {
|
||||
for (int64_t i12 = 0; i12 < ne12; ++i12) {
|
||||
for (int64_t i11 = 0; i11 < ne11; ++i11) {
|
||||
|
@ -16218,7 +16472,6 @@ static void ggml_compute_forward_map_unary_f32(
|
|||
}
|
||||
}
|
||||
|
||||
|
||||
static void ggml_compute_forward_map_unary(
|
||||
const struct ggml_compute_params * params,
|
||||
const struct ggml_tensor * src0,
|
||||
|
@ -16266,7 +16519,6 @@ static void ggml_compute_forward_map_binary_f32(
|
|||
}
|
||||
}
|
||||
|
||||
|
||||
static void ggml_compute_forward_map_binary(
|
||||
const struct ggml_compute_params * params,
|
||||
const struct ggml_tensor * src0,
|
||||
|
@ -16318,7 +16570,6 @@ static void ggml_compute_forward_map_custom2_f32(
|
|||
fun(dst, a, b);
|
||||
}
|
||||
|
||||
|
||||
// ggml_compute_forward_map_custom3
|
||||
|
||||
static void ggml_compute_forward_map_custom3_f32(
|
||||
|
@ -16593,7 +16844,6 @@ static void ggml_compute_forward_cross_entropy_loss_back_f32(
|
|||
ggml_vec_sub_f32(nc, ds0, ds0, s1);
|
||||
ggml_vec_scale_f32(nc, ds0, d[0] / (float) nr);
|
||||
|
||||
|
||||
#ifndef NDEBUG
|
||||
for (int i = 0; i < nc; ++i) {
|
||||
assert(!isnan(ds0[i]));
|
||||
|
@ -16621,7 +16871,6 @@ static void ggml_compute_forward_cross_entropy_loss_back(
|
|||
}
|
||||
}
|
||||
|
||||
|
||||
/////////////////////////////////
|
||||
|
||||
static void ggml_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor) {
|
||||
|
@ -16833,6 +17082,14 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
|
|||
{
|
||||
ggml_compute_forward_conv_2d(params, tensor->src[0], tensor->src[1], tensor);
|
||||
} break;
|
||||
case GGML_OP_CONV_2D_STAGE_0:
|
||||
{
|
||||
ggml_compute_forward_conv_2d_stage_0(params, tensor->src[0], tensor->src[1], tensor);
|
||||
} break;
|
||||
case GGML_OP_CONV_2D_STAGE_1:
|
||||
{
|
||||
ggml_compute_forward_conv_2d_stage_1(params, tensor->src[0], tensor->src[1], tensor);
|
||||
} break;
|
||||
case GGML_OP_CONV_TRANSPOSE_2D:
|
||||
{
|
||||
ggml_compute_forward_conv_transpose_2d(params, tensor->src[0], tensor->src[1], tensor);
|
||||
|
@ -17762,11 +18019,19 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
|
|||
{
|
||||
GGML_ASSERT(false); // TODO: not implemented
|
||||
} break;
|
||||
case GGML_OP_CONV_TRANSPOSE_1D:
|
||||
{
|
||||
GGML_ASSERT(false); // TODO: not implemented
|
||||
} break;
|
||||
case GGML_OP_CONV_2D:
|
||||
{
|
||||
GGML_ASSERT(false); // TODO: not implemented
|
||||
} break;
|
||||
case GGML_OP_CONV_TRANSPOSE_1D:
|
||||
case GGML_OP_CONV_2D_STAGE_0:
|
||||
{
|
||||
GGML_ASSERT(false); // TODO: not implemented
|
||||
} break;
|
||||
case GGML_OP_CONV_2D_STAGE_1:
|
||||
{
|
||||
GGML_ASSERT(false); // TODO: not implemented
|
||||
} break;
|
||||
|
@ -18695,6 +18960,7 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) {
|
|||
const int64_t ne0 = node->ne[0];
|
||||
const int64_t ne1 = node->ne[1];
|
||||
const int64_t ne2 = node->ne[2];
|
||||
const int64_t ne3 = node->ne[3];
|
||||
const int64_t nk = ne00*ne01;
|
||||
const int64_t ew0 = nk * ne02;
|
||||
|
||||
|
@ -18705,7 +18971,8 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) {
|
|||
|
||||
if (node->src[0]->type == GGML_TYPE_F16 &&
|
||||
node->src[1]->type == GGML_TYPE_F32) {
|
||||
cur = sizeof(ggml_fp16_t)*(ne0*ne1*ew0);
|
||||
// im2col: [N*OH*OW, IC*KH*KW]
|
||||
cur = sizeof(ggml_fp16_t)*(ne3*ne0*ne1*ew0);
|
||||
} else if (node->src[0]->type == GGML_TYPE_F32 &&
|
||||
node->src[1]->type == GGML_TYPE_F32) {
|
||||
cur = sizeof(float)* (ne10*ne11*ne12);
|
||||
|
@ -18715,6 +18982,14 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) {
|
|||
|
||||
work_size = MAX(work_size, cur);
|
||||
} break;
|
||||
case GGML_OP_CONV_2D_STAGE_0:
|
||||
{
|
||||
n_tasks = n_threads;
|
||||
} break;
|
||||
case GGML_OP_CONV_2D_STAGE_1:
|
||||
{
|
||||
n_tasks = n_threads;
|
||||
} break;
|
||||
case GGML_OP_CONV_TRANSPOSE_2D:
|
||||
{
|
||||
n_tasks = n_threads;
|
||||
|
@ -19903,7 +20178,6 @@ static enum ggml_opt_result ggml_opt_adam(
|
|||
|
||||
opt->loss_after = fx;
|
||||
|
||||
|
||||
// check convergence
|
||||
if (fabsf(fx - fx_prev[0])/fx < params.adam.eps_f) {
|
||||
GGML_PRINT_DEBUG("converged\n");
|
||||
|
|
15
ggml.h
15
ggml.h
|
@ -401,15 +401,16 @@ extern "C" {
|
|||
GGML_OP_ALIBI,
|
||||
GGML_OP_CLAMP,
|
||||
GGML_OP_CONV_1D,
|
||||
GGML_OP_CONV_2D,
|
||||
GGML_OP_CONV_1D_STAGE_0, // internal
|
||||
GGML_OP_CONV_1D_STAGE_1, // internal
|
||||
GGML_OP_CONV_TRANSPOSE_1D,
|
||||
GGML_OP_CONV_2D,
|
||||
GGML_OP_CONV_2D_STAGE_0, // internal
|
||||
GGML_OP_CONV_2D_STAGE_1, // internal
|
||||
GGML_OP_CONV_TRANSPOSE_2D,
|
||||
GGML_OP_POOL_1D,
|
||||
GGML_OP_POOL_2D,
|
||||
|
||||
GGML_OP_CONV_1D_STAGE_0, // internal
|
||||
GGML_OP_CONV_1D_STAGE_1, // internal
|
||||
|
||||
GGML_OP_UPSCALE, // nearest interpolate
|
||||
|
||||
GGML_OP_FLASH_ATTN,
|
||||
|
@ -1020,9 +1021,9 @@ extern "C" {
|
|||
struct ggml_tensor * b,
|
||||
float eps);
|
||||
|
||||
// A: n columns, m rows
|
||||
// B: n columns, p rows (i.e. we transpose it internally)
|
||||
// result is m columns, p rows
|
||||
// A: k columns, n rows => [ne03, ne02, n, k]
|
||||
// B: k columns, m rows (i.e. we transpose it internally) => [ne03 * x, ne02 * y, m, k]
|
||||
// result is n columns, m rows => [ne03 * x, ne02 * y, m, n]
|
||||
GGML_API struct ggml_tensor * ggml_mul_mat(
|
||||
struct ggml_context * ctx,
|
||||
struct ggml_tensor * a,
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue