From b47b8a9cfeb439d271bf997fb985fd6d82b3af5e Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sat, 22 Jul 2023 21:17:57 +0300 Subject: [PATCH 01/14] llama : optimize memory buffers (#2325) --- examples/common.cpp | 24 +++++----- examples/main/main.cpp | 11 ++--- llama.cpp | 104 ++++++++++++++++++++--------------------- 3 files changed, 66 insertions(+), 73 deletions(-) diff --git a/examples/common.cpp b/examples/common.cpp index 730b28bde..2dc6654da 100644 --- a/examples/common.cpp +++ b/examples/common.cpp @@ -578,18 +578,18 @@ std::vector llama_tokenize(struct llama_context * ctx, const std::s struct llama_context_params llama_context_params_from_gpt_params(const gpt_params & params) { auto lparams = llama_context_default_params(); - lparams.n_ctx = params.n_ctx; - lparams.n_batch = params.n_batch; - lparams.n_gpu_layers = params.n_gpu_layers; - lparams.main_gpu = params.main_gpu; - lparams.tensor_split = params.tensor_split; - lparams.low_vram = params.low_vram; - lparams.seed = params.seed; - lparams.f16_kv = params.memory_f16; - lparams.use_mmap = params.use_mmap; - lparams.use_mlock = params.use_mlock; - lparams.logits_all = params.perplexity; - lparams.embedding = params.embedding; + lparams.n_ctx = params.n_ctx; + lparams.n_batch = params.n_batch; + lparams.n_gpu_layers = params.n_gpu_layers; + lparams.main_gpu = params.main_gpu; + lparams.tensor_split = params.tensor_split; + lparams.low_vram = params.low_vram; + lparams.seed = params.seed; + lparams.f16_kv = params.memory_f16; + lparams.use_mmap = params.use_mmap; + lparams.use_mlock = params.use_mlock; + lparams.logits_all = params.perplexity; + lparams.embedding = params.embedding; lparams.rope_freq_base = params.rope_freq_base; lparams.rope_freq_scale = params.rope_freq_scale; diff --git a/examples/main/main.cpp b/examples/main/main.cpp index 656382f81..4b4cd1de4 100644 --- a/examples/main/main.cpp +++ b/examples/main/main.cpp @@ -139,17 +139,14 @@ int main(int argc, char ** argv) { params.n_threads, std::thread::hardware_concurrency(), llama_print_system_info()); } - // determine the maximum memory usage needed to do inference for the given n_batch and n_predict parameters + // determine the maximum memory usage needed to do inference for the given n_batch and n_ctx parameters // uncomment the "used_mem" line in llama.cpp to see the results if (params.mem_test) { { - const std::vector tmp(params.n_batch, llama_token_bos()); - llama_eval(ctx, tmp.data(), tmp.size(), 0, params.n_threads); - } + fprintf(stderr, "%s: testing memory usage for n_batch = %d, n_ctx = %d\n", __func__, params.n_batch, params.n_ctx); - { - const std::vector tmp = { 0, }; - llama_eval(ctx, tmp.data(), tmp.size(), params.n_predict - 1, params.n_threads); + const std::vector tmp(params.n_batch, llama_token_bos()); + llama_eval(ctx, tmp.data(), tmp.size(), params.n_ctx, params.n_threads); } llama_print_timings(ctx); diff --git a/llama.cpp b/llama.cpp index 0a381afd5..135aa9fef 100644 --- a/llama.cpp +++ b/llama.cpp @@ -98,18 +98,17 @@ static void ggml_graph_compute_helper(std::vector & buf, ggml_cgraph * } // -// memory sizes +// memory sizes (calculated for n_batch == 512) // static const std::map & MEM_REQ_SCRATCH0(int n_ctx) { static std::map k_sizes = { - /* empirical scaling, still a guess */ - { MODEL_3B, ((size_t) n_ctx / 16ull + 128ull) * MB }, - { MODEL_7B, ((size_t) n_ctx / 16ull + 256ull) * MB }, - { MODEL_13B, ((size_t) n_ctx / 12ull + 256ull) * MB }, - { MODEL_30B, ((size_t) n_ctx / 10ull + 256ull) * MB }, - { MODEL_65B, ((size_t) n_ctx / 
8ull + 512ull) * MB }, + { MODEL_3B, ((size_t) n_ctx / 16ull + 92ull) * MB }, + { MODEL_7B, ((size_t) n_ctx / 16ull + 100ull) * MB }, + { MODEL_13B, ((size_t) n_ctx / 12ull + 120ull) * MB }, + { MODEL_30B, ((size_t) n_ctx / 9ull + 160ull) * MB }, + { MODEL_65B, ((size_t) n_ctx / 6ull + 256ull) * MB }, // guess }; return k_sizes; } @@ -117,38 +116,24 @@ static const std::map & MEM_REQ_SCRATCH0(int n_ctx) static const std::map & MEM_REQ_SCRATCH1() { static std::map k_sizes = { - { MODEL_3B, 256ull * MB }, - { MODEL_7B, 512ull * MB }, - { MODEL_13B, 512ull * MB }, - { MODEL_30B, 512ull * MB }, - { MODEL_65B, 1024ull * MB }, + { MODEL_3B, 128ull * MB }, + { MODEL_7B, 160ull * MB }, + { MODEL_13B, 192ull * MB }, + { MODEL_30B, 256ull * MB }, + { MODEL_65B, 384ull * MB }, // guess }; return k_sizes; } -// 2*n_embd*n_ctx*n_layer*sizeof(float16) -static const std::map & MEM_REQ_KV_SELF() +// used to store the compute graph tensors + non-scratch data +static const std::map & MEM_REQ_EVAL() { static std::map k_sizes = { - { MODEL_3B, 682ull * MB }, - { MODEL_7B, 1026ull * MB }, - { MODEL_13B, 1608ull * MB }, - { MODEL_30B, 3124ull * MB }, - { MODEL_65B, 5120ull * MB }, - }; - return k_sizes; -} - -// this is mostly needed for temporary mul_mat buffers to dequantize the data -// not actually needed if BLAS is disabled -static const std::map & MEM_REQ_EVAL(int n_ctx) -{ - static std::map k_sizes = { - { MODEL_3B, ((size_t) n_ctx / 256ull + 512ull) * MB }, - { MODEL_7B, ((size_t) n_ctx / 256ull + 768ull) * MB }, - { MODEL_13B, ((size_t) n_ctx / 256ull + 1024ull) * MB }, - { MODEL_30B, ((size_t) n_ctx / 256ull + 1280ull) * MB }, - { MODEL_65B, ((size_t) n_ctx / 256ull + 1536ull) * MB }, + { MODEL_3B, 8ull * MB }, + { MODEL_7B, 10ull * MB }, + { MODEL_13B, 12ull * MB }, + { MODEL_30B, 16ull * MB }, + { MODEL_65B, 24ull * MB }, // guess }; return k_sizes; } @@ -199,6 +184,15 @@ struct llama_hparams { bool operator!=(const llama_hparams & other) const { return static_cast(memcmp(this, &other, sizeof(llama_hparams))); } + + size_t kv_size() const { + size_t result = 2ull; + result *= (size_t) n_embd; + result *= (size_t) n_ctx; + result *= (size_t) n_layer; + result *= sizeof(ggml_fp16_t); + return result; + } }; struct llama_layer { @@ -1069,7 +1063,7 @@ static void llama_model_load_internal( { model.buf.resize(ctx_size); if (use_mlock) { - model.mlock_buf.init(model.buf.addr); + model.mlock_buf.init (model.buf.addr); model.mlock_buf.grow_to(model.buf.size); } @@ -1186,11 +1180,11 @@ static void llama_model_load_internal( mmapped_size - vram_weights + // weights in VRAM not in memory MEM_REQ_SCRATCH0(hparams.n_ctx).at(model.type) + MEM_REQ_SCRATCH1().at(model.type) + - MEM_REQ_EVAL(hparams.n_ctx).at(model.type); + MEM_REQ_EVAL().at(model.type); // this is the memory required by one llama_state const size_t mem_required_state = - scale*MEM_REQ_KV_SELF().at(model.type); + scale*hparams.kv_size(); fprintf(stderr, "%s: mem required = %7.2f MB (+ %7.2f MB per state)\n", __func__, mem_required / 1024.0 / 1024.0, mem_required_state / 1024.0 / 1024.0); @@ -1231,7 +1225,7 @@ static void llama_model_load_internal( fprintf(stderr, "%s: cannot offload v cache to GPU due to low VRAM option\n", __func__); } else { fprintf(stderr, "%s: offloading v cache to GPU\n", __func__); - vram_kv_cache += MEM_REQ_KV_SELF().at(model.type) / 2; + vram_kv_cache += hparams.kv_size() / 2; } } if (n_gpu_layers > (int) hparams.n_layer + 2) { @@ -1239,7 +1233,7 @@ static void llama_model_load_internal( fprintf(stderr, "%s: cannot offload k 
cache to GPU due to low VRAM option\n", __func__); } else { fprintf(stderr, "%s: offloading k cache to GPU\n", __func__); - vram_kv_cache += MEM_REQ_KV_SELF().at(model.type) / 2; + vram_kv_cache += hparams.kv_size() / 2; } } #elif defined(GGML_USE_CLBLAST) @@ -1739,10 +1733,12 @@ static bool llama_eval_internal( } #if 0 - printf("\n%s: used_mem = %.3f MB, scratch -- %.3f MB %.3f MB\n", __func__, + printf("\n%s: used_mem: eval ctx %.3f MB, scratch %.3f MB %.3f MB, work buf %.3f MB, n_past = %d, N = %d\n", __func__, ggml_used_mem(ctx0)/1024.0/1024.0, lctx.get_buf_max_mem(0)/1024.0/1024.0, - lctx.get_buf_max_mem(1)/1024.0/1024.0); + lctx.get_buf_max_mem(1)/1024.0/1024.0, + lctx.work_buffer.size()/1024.0/1024.0, + n_past, N); #endif ggml_free(ctx0); @@ -2448,8 +2444,8 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s case LLAMA_FTYPE_MOSTLY_Q5_0: quantized_type = GGML_TYPE_Q5_0; break; case LLAMA_FTYPE_MOSTLY_Q5_1: quantized_type = GGML_TYPE_Q5_1; break; case LLAMA_FTYPE_MOSTLY_Q8_0: quantized_type = GGML_TYPE_Q8_0; break; - case LLAMA_FTYPE_MOSTLY_F16: quantized_type = GGML_TYPE_F16; break; - case LLAMA_FTYPE_ALL_F32: quantized_type = GGML_TYPE_F32; break; + case LLAMA_FTYPE_MOSTLY_F16: quantized_type = GGML_TYPE_F16; break; + case LLAMA_FTYPE_ALL_F32: quantized_type = GGML_TYPE_F32; break; #ifdef GGML_USE_K_QUANTS // K-quants @@ -2533,16 +2529,6 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s } else { new_type = quantized_type; #ifdef GGML_USE_K_QUANTS - bool convert_incompatible_tensor = false; - if (quantized_type == GGML_TYPE_Q2_K || quantized_type == GGML_TYPE_Q3_K || quantized_type == GGML_TYPE_Q4_K || - quantized_type == GGML_TYPE_Q5_K || quantized_type == GGML_TYPE_Q6_K) { - int nx = tensor.ne.at(0); - int ny = tensor.ne.at(1); - if (nx % QK_K != 0 || ny % QK_K != 0) { - fprintf(stderr, "\n\nTensor sizes %d x %d are not divisible by %d, required for k-quants.\n",nx,ny,QK_K); - convert_incompatible_tensor = true; - } - } if (tensor.name == "output.weight") { int nx = tensor.ne.at(0); int ny = tensor.ne.at(1); @@ -2568,6 +2554,16 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q4_K; else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K; } + bool convert_incompatible_tensor = false; + if (new_type == GGML_TYPE_Q2_K || new_type == GGML_TYPE_Q3_K || new_type == GGML_TYPE_Q4_K || + new_type == GGML_TYPE_Q5_K || new_type == GGML_TYPE_Q6_K) { + int nx = tensor.ne.at(0); + int ny = tensor.ne.at(1); + if (nx % QK_K != 0 || ny % QK_K != 0) { + fprintf(stderr, "\n\nTensor sizes %d x %d are not divisible by %d, required for k-quants.\n",nx,ny,QK_K); + convert_incompatible_tensor = true; + } + } if (convert_incompatible_tensor) { if (tensor.name == "output.weight") { new_type = GGML_TYPE_F16; //fall back to F16 instead of just failing. @@ -2594,7 +2590,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s f32_data = (float *) f32_conv_buf.addr; } - printf("quantizing .. "); + printf("quantizing to %s .. 
", ggml_type_name(new_type)); fflush(stdout); work.resize(nelements * 4); // upper bound on size @@ -2775,7 +2771,7 @@ struct llama_context * llama_new_context_with_model( ctx->embedding.resize(hparams.n_embd); } - ctx->buf_compute.resize(MEM_REQ_EVAL(hparams.n_ctx).at(ctx->model.type)); + ctx->buf_compute.resize(MEM_REQ_EVAL().at(ctx->model.type)); ctx->buf_scratch[0].resize(MEM_REQ_SCRATCH0(hparams.n_ctx).at(ctx->model.type)); ctx->buf_scratch[1].resize(MEM_REQ_SCRATCH1().at(ctx->model.type)); From b9b7d94fc10a8039befd1bc3af4f4b09c620c351 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Johannes=20G=C3=A4=C3=9Fler?= Date: Sat, 22 Jul 2023 21:27:34 +0200 Subject: [PATCH 02/14] CUDA: Fixed 7b q3_K_S with mul_mat_vec_q (#2313) --- ggml-cuda.cu | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/ggml-cuda.cu b/ggml-cuda.cu index c07b54611..f07bdc78d 100644 --- a/ggml-cuda.cu +++ b/ggml-cuda.cu @@ -220,7 +220,7 @@ typedef struct { static_assert(sizeof(block_q6_K) == sizeof(ggml_fp16_t) + 13*QK_K/16, "wrong q6_K block size/padding"); #define WARP_SIZE 32 -#define MATRIX_ROW_PADDING 256 // last row of quant. matrices is a multiple of this to avoid out-of-bounds memory accesses +#define MATRIX_ROW_PADDING 512 // last row of quant. matrices is a multiple of this to avoid out-of-bounds memory accesses #define CUDA_ADD_BLOCK_SIZE 256 #define CUDA_MUL_BLOCK_SIZE 256 @@ -2815,8 +2815,8 @@ inline void ggml_cuda_op_mul_mat_vec( #endif if (use_mul_mat_vec_q) { - int64_t padded_row_size = ne00 + MATRIX_ROW_PADDING - 1; - padded_row_size -= padded_row_size % MATRIX_ROW_PADDING; + const int64_t padded_row_size = ne00 % MATRIX_ROW_PADDING == 0 ? + ne00 : ne00 - ne00 % MATRIX_ROW_PADDING + MATRIX_ROW_PADDING; size_t as; void * src1_q8_1 = ggml_cuda_pool_malloc(padded_row_size*sizeof(block_q8_1)/QK8_1, &as); quantize_row_q8_1_cuda(src1_ddf_i, src1_q8_1, ne00, padded_row_size, cudaStream_main); @@ -3642,7 +3642,7 @@ void ggml_cuda_transform_tensor(void * data, struct ggml_tensor * tensor) { size_t size = ggml_nbytes_split(tensor, nrows_split); const size_t original_size = size; - // pad last row to a multiple of 256 elements to avoid out-of-bounds memory accesses + // pad last row to a multiple of 512 elements to avoid out-of-bounds memory accesses if (ne0 % MATRIX_ROW_PADDING != 0) { size += (MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING) * ggml_type_size(tensor->type)/ggml_blck_size(tensor->type); @@ -3658,7 +3658,7 @@ void ggml_cuda_transform_tensor(void * data, struct ggml_tensor * tensor) { } - CUDA_CHECK(cudaMemcpy(buf, buf_host, size, cudaMemcpyHostToDevice)); + CUDA_CHECK(cudaMemcpy(buf, buf_host, original_size, cudaMemcpyHostToDevice)); extra->data_device[id] = buf; From d2a43664f93ba30a84e42713bb69f936cbdacf2a Mon Sep 17 00:00:00 2001 From: Kawrakow <48489457+ikawrakow@users.noreply.github.com> Date: Sun, 23 Jul 2023 08:49:20 +0300 Subject: [PATCH 03/14] Speed up Q4_K (#2322) Co-authored-by: Iwan Kawrakow --- ggml-cuda.cu | 72 ++++++++++++++++++++++++++++++++++++++++++---------- 1 file changed, 59 insertions(+), 13 deletions(-) diff --git a/ggml-cuda.cu b/ggml-cuda.cu index f07bdc78d..2c5d15773 100644 --- a/ggml-cuda.cu +++ b/ggml-cuda.cu @@ -935,12 +935,18 @@ static __global__ void dequantize_mul_mat_vec_q4_k(const void * __restrict__ vx, uint16_t aux[4]; const uint8_t * sc = (const uint8_t *)aux; +#if K_QUANTS_PER_ITERATION == 2 + uint32_t q32[4]; + const uint8_t * q4 = (const uint8_t *)q32; +#else + uint16_t q16[4]; + const uint8_t * q4 = (const uint8_t *)q16; +#endif + float tmp 
= 0; // partial sum for thread in warp for (int i = ix; i < num_blocks_per_row; i += K_QUANTS_PER_ITERATION) { - const uint8_t * q1 = x[i].qs + q_offset; - const uint8_t * q2 = q1 + 64; const float * y1 = yy + i*QK_K + y_offset; const float * y2 = y1 + 128; @@ -953,14 +959,41 @@ static __global__ void dequantize_mul_mat_vec_q4_k(const void * __restrict__ vx, aux[2] = ((a[im+4] >> 0) & kmask2) | ((a[im+0] & kmask3) >> 2); aux[3] = ((a[im+4] >> 4) & kmask2) | ((a[im+2] & kmask3) >> 2); +#if K_QUANTS_PER_ITERATION == 2 + const uint32_t * q1 = (const uint32_t *)(x[i].qs + q_offset); + const uint32_t * q2 = q1 + 16; + + q32[0] = q1[0] & 0x0f0f0f0f; + q32[1] = q1[0] & 0xf0f0f0f0; + q32[2] = q2[0] & 0x0f0f0f0f; + q32[3] = q2[0] & 0xf0f0f0f0; + float4 s = {0.f, 0.f, 0.f, 0.f}; float smin = 0; - for (int l = 0; l < n; ++l) { - s.x += y1[l] * (q1[l] & 0xF); s.y += y1[l+32] * (q1[l] >> 4); - s.z += y2[l] * (q2[l] & 0xF); s.w += y2[l+32] * (q2[l] >> 4); + for (int l = 0; l < 4; ++l) { + s.x += y1[l] * q4[l+0]; s.y += y1[l+32] * q4[l+ 4]; + s.z += y2[l] * q4[l+8]; s.w += y2[l+32] * q4[l+12]; smin += y1[l] * sc[2] + y1[l+32] * sc[3] + y2[l] * sc[6] + y2[l+32] * sc[7]; } - tmp += dall * (s.x * sc[0] + s.y * sc[1] + s.z * sc[4] + s.w * sc[5]) - dmin * smin; + tmp += dall * (s.x * sc[0] + s.y * sc[1] * 1.f/16.f + s.z * sc[4] + s.w * sc[5] * 1.f/16.f) - dmin * smin; +#else + const uint16_t * q1 = (const uint16_t *)(x[i].qs + q_offset); + const uint16_t * q2 = q1 + 32; + + q16[0] = q1[0] & 0x0f0f; + q16[1] = q1[0] & 0xf0f0; + q16[2] = q2[0] & 0x0f0f; + q16[3] = q2[0] & 0xf0f0; + + float4 s = {0.f, 0.f, 0.f, 0.f}; + float smin = 0; + for (int l = 0; l < 2; ++l) { + s.x += y1[l] * q4[l+0]; s.y += y1[l+32] * q4[l+2]; + s.z += y2[l] * q4[l+4]; s.w += y2[l+32] * q4[l+6]; + smin += y1[l] * sc[2] + y1[l+32] * sc[3] + y2[l] * sc[6] + y2[l+32] * sc[7]; + } + tmp += dall * (s.x * sc[0] + s.y * sc[1] * 1.f/16.f + s.z * sc[4] + s.w * sc[5] * 1.f/16.f) - dmin * smin; +#endif } #else @@ -1521,7 +1554,7 @@ static __device__ __forceinline__ float vec_dot_q4_K_q8_1( #if __CUDA_ARCH__ >= MIN_CC_DP4A // lowest compute capability for integer intrinsics const block_q4_K * bq4_K = (const block_q4_K *) vbq; - const int bq8_offset = QR4_K * (iqs / QI8_1); + const int bq8_offset = QR4_K * (iqs / QI8_1); // 0, 2, 4, 6 float sumf_d = 0.0f; float sumf_m = 0.0f; @@ -1531,11 +1564,20 @@ static __device__ __forceinline__ float vec_dot_q4_K_q8_1( const int v = *((int *) &bq4_K->qs[sizeof(int) * iqs]); - for (int i = 0; i < QR4_K; ++i) { - const int isc = bq8_offset + i; + const uint16_t * scales = (const uint16_t *)bq4_K->scales; + uint16_t aux[2]; + const int j = bq8_offset/2; + if (j < 2) { + aux[0] = scales[j+0] & 0x3f3f; + aux[1] = scales[j+2] & 0x3f3f; + } else { + aux[0] = ((scales[j+2] >> 0) & 0x0f0f) | ((scales[j-2] & 0xc0c0) >> 2); + aux[1] = ((scales[j+2] >> 4) & 0x0f0f) | ((scales[j-0] & 0xc0c0) >> 2); + } + const uint8_t * sc = (const uint8_t *)aux; + const uint8_t * m = sc + 2; - uint8_t sc, m; - get_scale_min_k4(isc, bq4_K->scales, sc, m); + for (int i = 0; i < QR4_K; ++i) { const block_q8_1 * bq8i = bq8_1 + bq8_offset + i; const int ui = *((int*) &bq8i->qs[sizeof(int) * (iqs % QI8_1)]); @@ -1543,8 +1585,8 @@ static __device__ __forceinline__ float vec_dot_q4_K_q8_1( const int vi = (v >> (4*i)) & 0x0F0F0F0F; - sumf_d += d8i * (__dp4a(vi, ui, 0) * sc); // SIMD dot product - sumf_m += d8i * (__dp4a(0x01010101, ui, 0) * m); // multiply constant part of q4_K with sum of q8_1 values + sumf_d += d8i * (__dp4a(vi, ui, 0) * sc[i]); 
// SIMD dot product + sumf_m += d8i * (__dp4a(0x01010101, ui, 0) * m[i]); // multiply constant part of q4_K with sum of q8_1 values } return d*sumf_d - dmin*sumf_m; @@ -2497,7 +2539,9 @@ static size_t g_scratch_offset = 0; static int g_device_count = -1; static int g_main_device = 0; +#ifndef GGML_CUDA_FORCE_DMMV static int g_compute_capabilities[GGML_CUDA_MAX_DEVICES]; +#endif static float g_tensor_split[GGML_CUDA_MAX_DEVICES] = {0}; static cublasHandle_t g_cublas_handles[GGML_CUDA_MAX_DEVICES] = {nullptr}; @@ -2520,7 +2564,9 @@ void ggml_init_cublas() { g_tensor_split[id] = total_vram; total_vram += prop.totalGlobalMem; +#ifndef GGML_CUDA_FORCE_DMMV g_compute_capabilities[id] = 100*prop.major + 10*prop.minor; +#endif } for (int id = 0; id < g_device_count; ++id) { g_tensor_split[id] /= total_vram; From 83a00ce69bef9124c0702424a012ea799128b77d Mon Sep 17 00:00:00 2001 From: Jiahao Li Date: Sun, 23 Jul 2023 19:00:37 +0800 Subject: [PATCH 04/14] metal : support bcast add & dup & cont op (#2323) --- ggml-metal.m | 12 +++++++++++- ggml-metal.metal | 11 +++++++++++ 2 files changed, 22 insertions(+), 1 deletion(-) diff --git a/ggml-metal.m b/ggml-metal.m index 2810fa2a8..78a3b65f1 100644 --- a/ggml-metal.m +++ b/ggml-metal.m @@ -42,6 +42,7 @@ struct ggml_metal_context { id pipeline_##name GGML_METAL_DECL_KERNEL(add); + GGML_METAL_DECL_KERNEL(add_row); // TODO: avoid this extra kernel, instead extend the "add" kernel to support broadcast GGML_METAL_DECL_KERNEL(mul); GGML_METAL_DECL_KERNEL(mul_row); // TODO: avoid this extra kernel, instead extend the "mul" kernel to support broadcast GGML_METAL_DECL_KERNEL(scale); @@ -157,6 +158,7 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) { fprintf(stderr, "%s: loaded %-32s %16p\n", __func__, "kernel_"#name, (void *) ctx->pipeline_##name); GGML_METAL_ADD_KERNEL(add); + GGML_METAL_ADD_KERNEL(add_row); GGML_METAL_ADD_KERNEL(mul); GGML_METAL_ADD_KERNEL(mul_row); GGML_METAL_ADD_KERNEL(scale); @@ -464,10 +466,16 @@ void ggml_metal_graph_compute( encoder = [command_buffer computeCommandEncoder]; } - [encoder setComputePipelineState:ctx->pipeline_add]; + if (ggml_nelements(src1) == ne10) { + // src1 is a row + [encoder setComputePipelineState:ctx->pipeline_add_row]; + } else { + [encoder setComputePipelineState:ctx->pipeline_add]; + } [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; [encoder setBuffer:id_src1 offset:offs_src1 atIndex:1]; [encoder setBuffer:id_dst offset:offs_dst atIndex:2]; + [encoder setBytes:&ne00 length:sizeof(ne00) atIndex:3]; const int64_t n = ggml_nelements(dst); @@ -919,7 +927,9 @@ void ggml_metal_graph_compute( [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)]; } break; + case GGML_OP_DUP: case GGML_OP_CPY: + case GGML_OP_CONT: { if (encoder == nil) { encoder = [command_buffer computeCommandEncoder]; diff --git a/ggml-metal.metal b/ggml-metal.metal index 5a9a6d842..987376d56 100644 --- a/ggml-metal.metal +++ b/ggml-metal.metal @@ -67,6 +67,17 @@ kernel void kernel_add( dst[tpig] = src0[tpig] + src1[tpig]; } +// assumption: src1 is a row +// broadcast src1 into src0 +kernel void kernel_add_row( + device const float * src0, + device const float * src1, + device float * dst, + constant int64_t & ne00, + uint tpig[[thread_position_in_grid]]) { + dst[tpig] = src0[tpig] + src1[tpig % ne00]; +} + kernel void kernel_mul( device const float * src0, device const float * src1, From 355c80f49e32b0b15c0a457f3bad380e57f5b9ac Mon Sep 17 00:00:00 2001 From: AustinMroz Date: Sun, 23 Jul 
2023 06:16:48 -0500 Subject: [PATCH 05/14] examples : simplify vim plugin (#2327) Uses builtin json_encode and json_decode functions to simplify escaping Removes the need for temp files --- examples/llm.vim | 45 +++++---------------------------------------- 1 file changed, 5 insertions(+), 40 deletions(-) diff --git a/examples/llm.vim b/examples/llm.vim index 16e308c38..efecad0cd 100644 --- a/examples/llm.vim +++ b/examples/llm.vim @@ -2,57 +2,22 @@ function! Llm() let url = "http://127.0.0.1:8080/completion" - " Save the current cursor position - let save_cursor = getpos('.') - - silent! %s/\n/\\n/g - silent! %s/\t/\\t/g - silent! %s/\\n$// - " Get the content of the current buffer let buffer_content = join(getline(1, '$'), "\n") - " Replace true newlines with "\n" - let buffer_content = substitute(buffer_content, '\n', '\\n', 'g') - - " Trim leading/trailing whitespace - let buffer_content = substitute(buffer_content, '^\s\+', '', '') - let buffer_content = substitute(buffer_content, '\s\+$', '', '') - " Create the JSON payload - " can't escape backslash, \n gets replaced as \\n - let json_payload = '{"prompt":"' . escape(buffer_content, '"/') . '","temp":0.72,"top_k":100,"top_p":0.73,"repeat_penalty":1.100000023841858,"n_predict":10,"stream":false}' - - let prompt_tmpfile = tempname() - let response_tmpfile = tempname() - call writefile([json_payload], prompt_tmpfile) + let json_payload = {"temp":0.72,"top_k":100,"top_p":0.73,"repeat_penalty":1.100000023841858,"n_predict":10,"stream": v:false} + let json_payload.prompt = buffer_content " Define the curl command - let curl_command = 'curl -k -s -X POST -H "Content-Type: application/json" -o ' . shellescape(response_tmpfile) . ' -d @' . shellescape(prompt_tmpfile) . ' ' . url - silent execute '!'.curl_command - - let response = join(readfile(response_tmpfile), '') - let start_marker = '{"content":"' - let end_marker = '","generation_settings' - let content_start = stridx(response, start_marker) + len(start_marker) - let content_end = stridx(response, end_marker, content_start) + let curl_command = 'curl -k -s -X POST -H "Content-Type: application/json" -d @- ' . url + let response = system(curl_command, json_encode(json_payload)) " Extract the content field from the response - let content = strpart(response, content_start, content_end - content_start) + let content = json_decode(response).content " Insert the content at the cursor position call setline(line('.'), getline('.') . content) - - " Replace newline "\n" strings with actual newlines in the content - silent! %s/\\n/\r/g - " and tabs - silent! %s/\\t/\t/g - " and quote marks for C sources - silent! %s/\\"/\"/g - - " Remove the temporary file - call delete(prompt_tmpfile) - call delete(response_tmpfile) endfunction command! 
Llm call Llm() From 91171b8072f6f0c8ae3a61e23451acb538bb9ece Mon Sep 17 00:00:00 2001 From: Jose Maldonado <63384398+yukiteruamano@users.noreply.github.com> Date: Sun, 23 Jul 2023 07:52:08 -0400 Subject: [PATCH 06/14] make : fix CLBLAST compile support in FreeBSD (#2331) * Fix Makefile for CLBLAST compile support and instructions for compile llama.cpp FreeBSD * More general use-case for CLBLAST support (Linux and FreeBSD) --- Makefile | 8 +++++--- README.md | 17 +++++++++++++++++ 2 files changed, 22 insertions(+), 3 deletions(-) diff --git a/Makefile b/Makefile index 1ea3c4562..e620835ef 100644 --- a/Makefile +++ b/Makefile @@ -235,13 +235,15 @@ ggml-cuda.o: ggml-cuda.cu ggml-cuda.h endif # LLAMA_CUBLAS ifdef LLAMA_CLBLAST - CFLAGS += -DGGML_USE_CLBLAST - CXXFLAGS += -DGGML_USE_CLBLAST + + CFLAGS += -DGGML_USE_CLBLAST $(shell pkg-config --cflags clblast OpenCL) + CXXFLAGS += -DGGML_USE_CLBLAST $(shell pkg-config --cflags clblast OpenCL) + # Mac provides OpenCL as a framework ifeq ($(UNAME_S),Darwin) LDFLAGS += -lclblast -framework OpenCL else - LDFLAGS += -lclblast -lOpenCL + LDFLAGS += $(shell pkg-config --libs clblast OpenCL) endif OBJS += ggml-opencl.o diff --git a/README.md b/README.md index f45e4bf08..c9fe6187b 100644 --- a/README.md +++ b/README.md @@ -242,6 +242,23 @@ In order to build llama.cpp you have three different options. zig build -Doptimize=ReleaseFast ``` +- Using `gmake` (FreeBSD): + + 1. Install and activate [DRM in FreeBSD](https://wiki.freebsd.org/Graphics) + 2. Add your user to **video** group + 3. Install compilation dependencies. + + ```bash + sudo pkg install gmake automake autoconf pkgconf llvm15 clinfo clover \ + opencl clblast openblas + + gmake CC=/usr/local/bin/clang15 CXX=/usr/local/bin/clang++15 -j4 + ``` + + **Notes:** With this packages you can build llama.cpp with OPENBLAS and + CLBLAST support for use OpenCL GPU acceleration in FreeBSD. Please read + the instructions for use and activate this options in this document below. 
+ ### Metal Build Using Metal allows the computation to be executed on the GPU for Apple devices: From a940458e4814e87bd0d3fbdb3f3d2733b4a3ccb1 Mon Sep 17 00:00:00 2001 From: Christian Demsar Date: Sun, 23 Jul 2023 07:56:34 -0400 Subject: [PATCH 07/14] llama : print max tensor size to stderr (#2336) --- llama.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llama.cpp b/llama.cpp index 135aa9fef..0731c75ad 100644 --- a/llama.cpp +++ b/llama.cpp @@ -2795,7 +2795,7 @@ struct llama_context * llama_new_context_with_model( const size_t max_size = ggml_get_max_tensor_size(ctx->model.ctx); - printf("%s: max tensor size = %8.2f MB\n", __func__, max_size/1024.0/1024.0); + fprintf(stderr, "%s: max tensor size = %8.2f MB\n", __func__, max_size/1024.0/1024.0); #define LLAMA_METAL_CHECK_BUF(result) \ if (!(result)) { \ From bc3ec2cdc9ea20b0faba2e1b4576fab3a911e4d1 Mon Sep 17 00:00:00 2001 From: wzy <32936898+Freed-Wu@users.noreply.github.com> Date: Sun, 23 Jul 2023 19:57:02 +0800 Subject: [PATCH 08/14] flake : support `nix build '.#opencl'` (#2337) --- flake.nix | 38 +++++++++++++++++++++++++++----------- 1 file changed, 27 insertions(+), 11 deletions(-) diff --git a/flake.nix b/flake.nix index 7f148f144..4178e97ff 100644 --- a/flake.nix +++ b/flake.nix @@ -7,7 +7,8 @@ flake-utils.lib.eachDefaultSystem (system: let inherit (pkgs.stdenv) isAarch32 isAarch64 isDarwin; - osSpecific = with pkgs; [ openmpi ] ++ + buildInputs = with pkgs; [ openmpi ]; + osSpecific = with pkgs; buildInputs ++ ( if isAarch64 && isDarwin then with pkgs.darwin.apple_sdk_11_0.frameworks; [ @@ -29,18 +30,24 @@ nativeBuildInputs = with pkgs; [ cmake pkgconfig ]; llama-python = pkgs.python3.withPackages (ps: with ps; [ numpy sentencepiece ]); + postPatch = '' + substituteInPlace ./ggml-metal.m \ + --replace '[bundle pathForResource:@"ggml-metal" ofType:@"metal"];' "@\"$out/bin/ggml-metal.metal\";" + substituteInPlace ./*.py --replace '/usr/bin/env python' '${llama-python}/bin/python' + ''; + postInstall = '' + mv $out/bin/main $out/bin/llama + mv $out/bin/server $out/bin/llama-server + ''; + cmakeFlags = [ "-DLLAMA_BUILD_SERVER=ON" "-DLLAMA_MPI=ON" "-DBUILD_SHARED_LIBS=ON" "-DCMAKE_SKIP_BUILD_RPATH=ON" ]; in { packages.default = pkgs.stdenv.mkDerivation { name = "llama.cpp"; src = ./.; - postPatch = '' - substituteInPlace ./ggml-metal.m \ - --replace '[bundle pathForResource:@"ggml-metal" ofType:@"metal"];' "@\"$out/bin/ggml-metal.metal\";" - substituteInPlace ./*.py --replace '/usr/bin/env python' '${llama-python}/bin/python' - ''; + postPatch = postPatch; nativeBuildInputs = nativeBuildInputs; buildInputs = osSpecific; - cmakeFlags = [ "-DLLAMA_BUILD_SERVER=ON" "-DLLAMA_MPI=ON" "-DBUILD_SHARED_LIBS=ON" "-DCMAKE_SKIP_BUILD_RPATH=ON" ] + cmakeFlags = cmakeFlags ++ (if isAarch64 && isDarwin then [ "-DCMAKE_C_FLAGS=-D__ARM_FEATURE_DOTPROD=1" "-DLLAMA_METAL=ON" @@ -48,10 +55,19 @@ "-DLLAMA_BLAS=ON" "-DLLAMA_BLAS_VENDOR=OpenBLAS" ]); - postInstall = '' - mv $out/bin/main $out/bin/llama - mv $out/bin/server $out/bin/llama-server - ''; + postInstall = postInstall; + meta.mainProgram = "llama"; + }; + packages.opencl = pkgs.stdenv.mkDerivation { + name = "llama.cpp"; + src = ./.; + postPatch = postPatch; + nativeBuildInputs = nativeBuildInputs; + buildInputs = with pkgs; buildInputs ++ [ clblast ]; + cmakeFlags = cmakeFlags ++ [ + "-DLLAMA_CLBLAST=ON" + ]; + postInstall = postInstall; meta.mainProgram = "llama"; }; apps.llama-server = { From 1d0824b2476e7fda09751a0235c9e571b76d6f2c Mon Sep 17 00:00:00 2001 From: 
maddes8cht <55592906+maddes8cht@users.noreply.github.com> Date: Sun, 23 Jul 2023 13:59:48 +0200 Subject: [PATCH 09/14] llama : print help to stdout (#2338) --- examples/common.cpp | 158 ++++++++++++++++++++++---------------------- 1 file changed, 79 insertions(+), 79 deletions(-) diff --git a/examples/common.cpp b/examples/common.cpp index 2dc6654da..661039765 100644 --- a/examples/common.cpp +++ b/examples/common.cpp @@ -458,91 +458,91 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) { } void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) { - fprintf(stderr, "usage: %s [options]\n", argv[0]); - fprintf(stderr, "\n"); - fprintf(stderr, "options:\n"); - fprintf(stderr, " -h, --help show this help message and exit\n"); - fprintf(stderr, " -i, --interactive run in interactive mode\n"); - fprintf(stderr, " --interactive-first run in interactive mode and wait for input right away\n"); - fprintf(stderr, " -ins, --instruct run in instruction mode (use with Alpaca models)\n"); - fprintf(stderr, " --multiline-input allows you to write or paste multiple lines without ending each in '\\'\n"); - fprintf(stderr, " -r PROMPT, --reverse-prompt PROMPT\n"); - fprintf(stderr, " halt generation at PROMPT, return control in interactive mode\n"); - fprintf(stderr, " (can be specified more than once for multiple prompts).\n"); - fprintf(stderr, " --color colorise output to distinguish prompt and user input from generations\n"); - fprintf(stderr, " -s SEED, --seed SEED RNG seed (default: -1, use random seed for < 0)\n"); - fprintf(stderr, " -t N, --threads N number of threads to use during computation (default: %d)\n", params.n_threads); - fprintf(stderr, " -p PROMPT, --prompt PROMPT\n"); - fprintf(stderr, " prompt to start generation with (default: empty)\n"); - fprintf(stderr, " -e process prompt escapes sequences (\\n, \\r, \\t, \\', \\\", \\\\)\n"); - fprintf(stderr, " --prompt-cache FNAME file to cache prompt state for faster startup (default: none)\n"); - fprintf(stderr, " --prompt-cache-all if specified, saves user input and generations to cache as well.\n"); - fprintf(stderr, " not supported with --interactive or other interactive options\n"); - fprintf(stderr, " --prompt-cache-ro if specified, uses the prompt cache but does not update it.\n"); - fprintf(stderr, " --random-prompt start with a randomized prompt.\n"); - fprintf(stderr, " --in-prefix STRING string to prefix user inputs with (default: empty)\n"); - fprintf(stderr, " --in-suffix STRING string to suffix after user inputs with (default: empty)\n"); - fprintf(stderr, " -f FNAME, --file FNAME\n"); - fprintf(stderr, " prompt file to start generation.\n"); - fprintf(stderr, " -n N, --n-predict N number of tokens to predict (default: %d, -1 = infinity)\n", params.n_predict); - fprintf(stderr, " --top-k N top-k sampling (default: %d, 0 = disabled)\n", params.top_k); - fprintf(stderr, " --top-p N top-p sampling (default: %.1f, 1.0 = disabled)\n", (double)params.top_p); - fprintf(stderr, " --tfs N tail free sampling, parameter z (default: %.1f, 1.0 = disabled)\n", (double)params.tfs_z); - fprintf(stderr, " --typical N locally typical sampling, parameter p (default: %.1f, 1.0 = disabled)\n", (double)params.typical_p); - fprintf(stderr, " --repeat-last-n N last n tokens to consider for penalize (default: %d, 0 = disabled, -1 = ctx_size)\n", params.repeat_last_n); - fprintf(stderr, " --repeat-penalty N penalize repeat sequence of tokens (default: %.1f, 1.0 = disabled)\n", (double)params.repeat_penalty); - 
fprintf(stderr, " --presence-penalty N repeat alpha presence penalty (default: %.1f, 0.0 = disabled)\n", (double)params.presence_penalty); - fprintf(stderr, " --frequency-penalty N repeat alpha frequency penalty (default: %.1f, 0.0 = disabled)\n", (double)params.frequency_penalty); - fprintf(stderr, " --mirostat N use Mirostat sampling.\n"); - fprintf(stderr, " Top K, Nucleus, Tail Free and Locally Typical samplers are ignored if used.\n"); - fprintf(stderr, " (default: %d, 0 = disabled, 1 = Mirostat, 2 = Mirostat 2.0)\n", params.mirostat); - fprintf(stderr, " --mirostat-lr N Mirostat learning rate, parameter eta (default: %.1f)\n", (double)params.mirostat_eta); - fprintf(stderr, " --mirostat-ent N Mirostat target entropy, parameter tau (default: %.1f)\n", (double)params.mirostat_tau); - fprintf(stderr, " -l TOKEN_ID(+/-)BIAS, --logit-bias TOKEN_ID(+/-)BIAS\n"); - fprintf(stderr, " modifies the likelihood of token appearing in the completion,\n"); - fprintf(stderr, " i.e. `--logit-bias 15043+1` to increase likelihood of token ' Hello',\n"); - fprintf(stderr, " or `--logit-bias 15043-1` to decrease likelihood of token ' Hello'\n"); - fprintf(stderr, " --cfg-negative-prompt PROMPT \n"); - fprintf(stderr, " negative prompt to use for guidance. (default: empty)\n"); - fprintf(stderr, " --cfg-scale N strength of guidance (default: %f, 1.0 = disable)\n", params.cfg_scale); - fprintf(stderr, " -c N, --ctx-size N size of the prompt context (default: %d)\n", params.n_ctx); - fprintf(stderr, " --rope-freq-base N RoPE base frequency (default: %.1f)\n", params.rope_freq_base); - fprintf(stderr, " --rope-freq-scale N RoPE frequency scaling factor (default: %g)\n", params.rope_freq_scale); - fprintf(stderr, " --ignore-eos ignore end of stream token and continue generating (implies --logit-bias 2-inf)\n"); - fprintf(stderr, " --no-penalize-nl do not penalize newline token\n"); - fprintf(stderr, " --memory-f32 use f32 instead of f16 for memory key+value (default: disabled)\n"); - fprintf(stderr, " not recommended: doubles context memory required and no measurable increase in quality\n"); - fprintf(stderr, " --temp N temperature (default: %.1f)\n", (double)params.temp); - fprintf(stderr, " -b N, --batch-size N batch size for prompt processing (default: %d)\n", params.n_batch); - fprintf(stderr, " --perplexity compute perplexity over each ctx window of the prompt\n"); - fprintf(stderr, " --perplexity-lines compute perplexity over each line of the prompt\n"); - fprintf(stderr, " --keep number of tokens to keep from the initial prompt (default: %d, -1 = all)\n", params.n_keep); - fprintf(stderr, " --chunks N max number of chunks to process (default: %d, -1 = all)\n", params.n_chunks); + fprintf(stdout, "usage: %s [options]\n", argv[0]); + fprintf(stdout, "\n"); + fprintf(stdout, "options:\n"); + fprintf(stdout, " -h, --help show this help message and exit\n"); + fprintf(stdout, " -i, --interactive run in interactive mode\n"); + fprintf(stdout, " --interactive-first run in interactive mode and wait for input right away\n"); + fprintf(stdout, " -ins, --instruct run in instruction mode (use with Alpaca models)\n"); + fprintf(stdout, " --multiline-input allows you to write or paste multiple lines without ending each in '\\'\n"); + fprintf(stdout, " -r PROMPT, --reverse-prompt PROMPT\n"); + fprintf(stdout, " halt generation at PROMPT, return control in interactive mode\n"); + fprintf(stdout, " (can be specified more than once for multiple prompts).\n"); + fprintf(stdout, " --color colorise output to distinguish 
prompt and user input from generations\n"); + fprintf(stdout, " -s SEED, --seed SEED RNG seed (default: -1, use random seed for < 0)\n"); + fprintf(stdout, " -t N, --threads N number of threads to use during computation (default: %d)\n", params.n_threads); + fprintf(stdout, " -p PROMPT, --prompt PROMPT\n"); + fprintf(stdout, " prompt to start generation with (default: empty)\n"); + fprintf(stdout, " -e process prompt escapes sequences (\\n, \\r, \\t, \\', \\\", \\\\)\n"); + fprintf(stdout, " --prompt-cache FNAME file to cache prompt state for faster startup (default: none)\n"); + fprintf(stdout, " --prompt-cache-all if specified, saves user input and generations to cache as well.\n"); + fprintf(stdout, " not supported with --interactive or other interactive options\n"); + fprintf(stdout, " --prompt-cache-ro if specified, uses the prompt cache but does not update it.\n"); + fprintf(stdout, " --random-prompt start with a randomized prompt.\n"); + fprintf(stdout, " --in-prefix STRING string to prefix user inputs with (default: empty)\n"); + fprintf(stdout, " --in-suffix STRING string to suffix after user inputs with (default: empty)\n"); + fprintf(stdout, " -f FNAME, --file FNAME\n"); + fprintf(stdout, " prompt file to start generation.\n"); + fprintf(stdout, " -n N, --n-predict N number of tokens to predict (default: %d, -1 = infinity)\n", params.n_predict); + fprintf(stdout, " --top-k N top-k sampling (default: %d, 0 = disabled)\n", params.top_k); + fprintf(stdout, " --top-p N top-p sampling (default: %.1f, 1.0 = disabled)\n", (double)params.top_p); + fprintf(stdout, " --tfs N tail free sampling, parameter z (default: %.1f, 1.0 = disabled)\n", (double)params.tfs_z); + fprintf(stdout, " --typical N locally typical sampling, parameter p (default: %.1f, 1.0 = disabled)\n", (double)params.typical_p); + fprintf(stdout, " --repeat-last-n N last n tokens to consider for penalize (default: %d, 0 = disabled, -1 = ctx_size)\n", params.repeat_last_n); + fprintf(stdout, " --repeat-penalty N penalize repeat sequence of tokens (default: %.1f, 1.0 = disabled)\n", (double)params.repeat_penalty); + fprintf(stdout, " --presence-penalty N repeat alpha presence penalty (default: %.1f, 0.0 = disabled)\n", (double)params.presence_penalty); + fprintf(stdout, " --frequency-penalty N repeat alpha frequency penalty (default: %.1f, 0.0 = disabled)\n", (double)params.frequency_penalty); + fprintf(stdout, " --mirostat N use Mirostat sampling.\n"); + fprintf(stdout, " Top K, Nucleus, Tail Free and Locally Typical samplers are ignored if used.\n"); + fprintf(stdout, " (default: %d, 0 = disabled, 1 = Mirostat, 2 = Mirostat 2.0)\n", params.mirostat); + fprintf(stdout, " --mirostat-lr N Mirostat learning rate, parameter eta (default: %.1f)\n", (double)params.mirostat_eta); + fprintf(stdout, " --mirostat-ent N Mirostat target entropy, parameter tau (default: %.1f)\n", (double)params.mirostat_tau); + fprintf(stdout, " -l TOKEN_ID(+/-)BIAS, --logit-bias TOKEN_ID(+/-)BIAS\n"); + fprintf(stdout, " modifies the likelihood of token appearing in the completion,\n"); + fprintf(stdout, " i.e. `--logit-bias 15043+1` to increase likelihood of token ' Hello',\n"); + fprintf(stdout, " or `--logit-bias 15043-1` to decrease likelihood of token ' Hello'\n"); + fprintf(stdout, " --cfg-negative-prompt PROMPT \n"); + fprintf(stdout, " negative prompt to use for guidance. 
(default: empty)\n"); + fprintf(stdout, " --cfg-scale N strength of guidance (default: %f, 1.0 = disable)\n", params.cfg_scale); + fprintf(stdout, " -c N, --ctx-size N size of the prompt context (default: %d)\n", params.n_ctx); + fprintf(stdout, " --rope-freq-base N RoPE base frequency (default: %.1f)\n", params.rope_freq_base); + fprintf(stdout, " --rope-freq-scale N RoPE frequency scaling factor (default: %g)\n", params.rope_freq_scale); + fprintf(stdout, " --ignore-eos ignore end of stream token and continue generating (implies --logit-bias 2-inf)\n"); + fprintf(stdout, " --no-penalize-nl do not penalize newline token\n"); + fprintf(stdout, " --memory-f32 use f32 instead of f16 for memory key+value (default: disabled)\n"); + fprintf(stdout, " not recommended: doubles context memory required and no measurable increase in quality\n"); + fprintf(stdout, " --temp N temperature (default: %.1f)\n", (double)params.temp); + fprintf(stdout, " -b N, --batch-size N batch size for prompt processing (default: %d)\n", params.n_batch); + fprintf(stdout, " --perplexity compute perplexity over each ctx window of the prompt\n"); + fprintf(stdout, " --perplexity-lines compute perplexity over each line of the prompt\n"); + fprintf(stdout, " --keep number of tokens to keep from the initial prompt (default: %d, -1 = all)\n", params.n_keep); + fprintf(stdout, " --chunks N max number of chunks to process (default: %d, -1 = all)\n", params.n_chunks); if (llama_mlock_supported()) { - fprintf(stderr, " --mlock force system to keep model in RAM rather than swapping or compressing\n"); + fprintf(stdout, " --mlock force system to keep model in RAM rather than swapping or compressing\n"); } if (llama_mmap_supported()) { - fprintf(stderr, " --no-mmap do not memory-map model (slower load but may reduce pageouts if not using mlock)\n"); + fprintf(stdout, " --no-mmap do not memory-map model (slower load but may reduce pageouts if not using mlock)\n"); } - fprintf(stderr, " --numa attempt optimizations that help on some NUMA systems\n"); - fprintf(stderr, " if run without this previously, it is recommended to drop the system page cache before using this\n"); - fprintf(stderr, " see https://github.com/ggerganov/llama.cpp/issues/1437\n"); + fprintf(stdout, " --numa attempt optimizations that help on some NUMA systems\n"); + fprintf(stdout, " if run without this previously, it is recommended to drop the system page cache before using this\n"); + fprintf(stdout, " see https://github.com/ggerganov/llama.cpp/issues/1437\n"); #ifdef LLAMA_SUPPORTS_GPU_OFFLOAD - fprintf(stderr, " -ngl N, --n-gpu-layers N\n"); - fprintf(stderr, " number of layers to store in VRAM\n"); - fprintf(stderr, " -ts SPLIT --tensor-split SPLIT\n"); - fprintf(stderr, " how to split tensors across multiple GPUs, comma-separated list of proportions, e.g. 3,1\n"); - fprintf(stderr, " -mg i, --main-gpu i the GPU to use for scratch and small tensors\n" ); - fprintf(stderr, " -lv, --low-vram don't allocate VRAM scratch buffer\n" ); + fprintf(stdout, " -ngl N, --n-gpu-layers N\n"); + fprintf(stdout, " number of layers to store in VRAM\n"); + fprintf(stdout, " -ts SPLIT --tensor-split SPLIT\n"); + fprintf(stdout, " how to split tensors across multiple GPUs, comma-separated list of proportions, e.g. 
3,1\n"); + fprintf(stdout, " -mg i, --main-gpu i the GPU to use for scratch and small tensors\n" ); + fprintf(stdout, " -lv, --low-vram don't allocate VRAM scratch buffer\n" ); #endif - fprintf(stderr, " --mtest compute maximum memory usage\n"); - fprintf(stderr, " --export export the computation graph to 'llama.ggml'\n"); - fprintf(stderr, " --verbose-prompt print prompt before generation\n"); - fprintf(stderr, " --lora FNAME apply LoRA adapter (implies --no-mmap)\n"); - fprintf(stderr, " --lora-base FNAME optional model to use as a base for the layers modified by the LoRA adapter\n"); - fprintf(stderr, " -m FNAME, --model FNAME\n"); - fprintf(stderr, " model path (default: %s)\n", params.model.c_str()); - fprintf(stderr, "\n"); + fprintf(stdout, " --mtest compute maximum memory usage\n"); + fprintf(stdout, " --export export the computation graph to 'llama.ggml'\n"); + fprintf(stdout, " --verbose-prompt print prompt before generation\n"); + fprintf(stdout, " --lora FNAME apply LoRA adapter (implies --no-mmap)\n"); + fprintf(stdout, " --lora-base FNAME optional model to use as a base for the layers modified by the LoRA adapter\n"); + fprintf(stdout, " -m FNAME, --model FNAME\n"); + fprintf(stdout, " model path (default: %s)\n", params.model.c_str()); + fprintf(stdout, "\n"); } std::string gpt_random_prompt(std::mt19937 & rng) { From e76d630df17e235e6b9ef416c45996765d2e36fb Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Sun, 23 Jul 2023 15:09:47 +0300 Subject: [PATCH 10/14] llama : grouped-query attention + LLaMAv2 70B support (#2276) * CUDA: GQA implementation * llama : support for GQA and LLaMAv2 70B ggml-ci * py : fix hparams parsing (if-else blocks) ggml-ci * py : oh boy .. ggml-ci * help : fix gqa value for 70B ggml-ci --------- Co-authored-by: JohannesGaessler --- convert.py | 66 +++++++++++------ examples/common.cpp | 12 +++- examples/common.h | 3 +- examples/main/main.cpp | 4 +- ggml-cuda.cu | 71 ++++++++++++------- llama.cpp | 156 +++++++++++++++++++++++++++-------------- llama.h | 11 +-- 7 files changed, 215 insertions(+), 108 deletions(-) diff --git a/convert.py b/convert.py index e3f1096e1..8d7af06d1 100755 --- a/convert.py +++ b/convert.py @@ -142,9 +142,9 @@ def find_n_mult(n_ff: int, n_embd: int) -> int: @dataclass class Params: n_vocab: int - n_embd: int - n_mult: int - n_head: int + n_embd: int + n_mult: int + n_head: int n_layer: int @staticmethod @@ -167,11 +167,11 @@ class Params: n_head=n_embd // 128 # guessed return Params( - n_vocab=n_vocab, - n_embd=n_embd, - n_mult=256, - n_head=n_head, - n_layer=n_layer, + n_vocab = n_vocab, + n_embd = n_embd, + n_mult = 256, + n_head = n_head, + n_layer = n_layer, ) @staticmethod @@ -179,28 +179,53 @@ class Params: config = json.load(open(config_path)) n_vocab = config["vocab_size"]; - n_embd = config["hidden_size"]; - n_head = config["num_attention_heads"]; + n_embd = config["hidden_size"]; + n_head = config["num_attention_heads"]; n_layer = config["num_hidden_layers"]; - n_ff = config["intermediate_size"]; + n_ff = config["intermediate_size"]; n_mult = find_n_mult(n_ff, n_embd); return Params( - n_vocab=n_vocab, - n_embd=n_embd, - n_mult=n_mult, - n_head=n_head, - n_layer=n_layer, + n_vocab = n_vocab, + n_embd = n_embd, + n_mult = n_mult, + n_head = n_head, + n_layer = n_layer, + ) + + # LLaMA v2 70B params.json + # {"dim": 8192, "multiple_of": 4096, "ffn_dim_multiplier": 1.3, "n_heads": 64, "n_kv_heads": 8, "n_layers": 80, "norm_eps": 1e-05, "vocab_size": -1 + @staticmethod + def loadOriginalParamsJson(model: 'LazyModel', 
config_path: 'Path') -> 'Params': + config = json.load(open(config_path)) + + n_vocab = config["vocab_size"]; + n_embd = config["dim"]; + n_head = config["n_heads"]; + n_layer = config["n_layers"]; + n_mult = config["multiple_of"]; + + if n_vocab == -1: + n_vocab = model["tok_embeddings.weight"].shape[0] + + return Params( + n_vocab = n_vocab, + n_embd = n_embd, + n_mult = n_mult, + n_head = n_head, + n_layer = n_layer, ) @staticmethod def load(model_plus: 'ModelPlus') -> 'Params': + hf_config_path = model_plus.paths[0].parent / "config.json" orig_config_path = model_plus.paths[0].parent / "params.json" - hf_transformer_config_path = model_plus.paths[0].parent / "config.json" - if hf_transformer_config_path.exists(): - params = Params.loadHFTransformerJson(model_plus.model, hf_transformer_config_path) + if hf_config_path.exists(): + params = Params.loadHFTransformerJson(model_plus.model, hf_config_path) + elif orig_config_path.exists(): + params = Params.loadOriginalParamsJson(model_plus.model, orig_config_path) else: params = Params.guessed(model_plus.model) @@ -1036,8 +1061,7 @@ class OutputFile: @staticmethod def write_vocab_only(fname_out: Path, vocab: Vocab) -> None: of = OutputFile(fname_out) - params = Params(n_vocab=vocab.vocab_size, n_embd=0, n_mult=0, - n_head=1, n_layer=0) + params = Params(n_vocab=vocab.vocab_size, n_embd=0, n_mult=0, n_head=1, n_layer=0) of = OutputFile(fname_out) of.write_file_header(params, file_type=GGMLFileType.AllF32) of.write_vocab(vocab) diff --git a/examples/common.cpp b/examples/common.cpp index 661039765..5608ca87f 100644 --- a/examples/common.cpp +++ b/examples/common.cpp @@ -168,6 +168,12 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) { break; } params.n_ctx = std::stoi(argv[i]); + } else if (arg == "-gqa" || arg == "--gqa") { + if (++i >= argc) { + invalid_param = true; + break; + } + params.n_gqa = std::stoi(argv[i]); } else if (arg == "--rope-freq-base") { if (++i >= argc) { invalid_param = true; @@ -485,6 +491,9 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) { fprintf(stdout, " -f FNAME, --file FNAME\n"); fprintf(stdout, " prompt file to start generation.\n"); fprintf(stdout, " -n N, --n-predict N number of tokens to predict (default: %d, -1 = infinity)\n", params.n_predict); + fprintf(stdout, " -c N, --ctx-size N size of the prompt context (default: %d)\n", params.n_ctx); + fprintf(stdout, " -b N, --batch-size N batch size for prompt processing (default: %d)\n", params.n_batch); + fprintf(stdout, " -gqa N, --gqa N grouped-query attention factor (TEMP!!! use 8 for LLaMAv2 70B) (default: %d)\n", params.n_gqa); fprintf(stdout, " --top-k N top-k sampling (default: %d, 0 = disabled)\n", params.top_k); fprintf(stdout, " --top-p N top-p sampling (default: %.1f, 1.0 = disabled)\n", (double)params.top_p); fprintf(stdout, " --tfs N tail free sampling, parameter z (default: %.1f, 1.0 = disabled)\n", (double)params.tfs_z); @@ -505,7 +514,6 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) { fprintf(stdout, " --cfg-negative-prompt PROMPT \n"); fprintf(stdout, " negative prompt to use for guidance. 
(default: empty)\n"); fprintf(stdout, " --cfg-scale N strength of guidance (default: %f, 1.0 = disable)\n", params.cfg_scale); - fprintf(stdout, " -c N, --ctx-size N size of the prompt context (default: %d)\n", params.n_ctx); fprintf(stdout, " --rope-freq-base N RoPE base frequency (default: %.1f)\n", params.rope_freq_base); fprintf(stdout, " --rope-freq-scale N RoPE frequency scaling factor (default: %g)\n", params.rope_freq_scale); fprintf(stdout, " --ignore-eos ignore end of stream token and continue generating (implies --logit-bias 2-inf)\n"); @@ -513,7 +521,6 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) { fprintf(stdout, " --memory-f32 use f32 instead of f16 for memory key+value (default: disabled)\n"); fprintf(stdout, " not recommended: doubles context memory required and no measurable increase in quality\n"); fprintf(stdout, " --temp N temperature (default: %.1f)\n", (double)params.temp); - fprintf(stdout, " -b N, --batch-size N batch size for prompt processing (default: %d)\n", params.n_batch); fprintf(stdout, " --perplexity compute perplexity over each ctx window of the prompt\n"); fprintf(stdout, " --perplexity-lines compute perplexity over each line of the prompt\n"); fprintf(stdout, " --keep number of tokens to keep from the initial prompt (default: %d, -1 = all)\n", params.n_keep); @@ -580,6 +587,7 @@ struct llama_context_params llama_context_params_from_gpt_params(const gpt_param lparams.n_ctx = params.n_ctx; lparams.n_batch = params.n_batch; + lparams.n_gqa = params.n_gqa; lparams.n_gpu_layers = params.n_gpu_layers; lparams.main_gpu = params.main_gpu; lparams.tensor_split = params.tensor_split; diff --git a/examples/common.h b/examples/common.h index c936de6fa..fb8f6d65f 100644 --- a/examples/common.h +++ b/examples/common.h @@ -27,6 +27,7 @@ struct gpt_params { int32_t n_predict = -1; // new tokens to predict int32_t n_ctx = 512; // context size int32_t n_batch = 512; // batch size for prompt processing (must be >=32 to use BLAS) + int32_t n_gqa = 1; // grouped-query attention factor (TODO: move to hparams) int32_t n_keep = 0; // number of tokens to keep from initial prompt int32_t n_chunks = -1; // max number of chunks to process (-1 = unlimited) int32_t n_gpu_layers = 0; // number of layers to store in VRAM @@ -47,7 +48,7 @@ struct gpt_params { int32_t repeat_last_n = 64; // last n tokens to penalize (0 = disable penalty, -1 = context size) float frequency_penalty = 0.00f; // 0.0 = disabled float presence_penalty = 0.00f; // 0.0 = disabled - int mirostat = 0; // 0 = disabled, 1 = mirostat, 2 = mirostat 2.0 + int32_t mirostat = 0; // 0 = disabled, 1 = mirostat, 2 = mirostat 2.0 float mirostat_tau = 5.00f; // target entropy float mirostat_eta = 0.10f; // learning rate diff --git a/examples/main/main.cpp b/examples/main/main.cpp index 4b4cd1de4..3bd8ba262 100644 --- a/examples/main/main.cpp +++ b/examples/main/main.cpp @@ -93,8 +93,8 @@ int main(int argc, char ** argv) { } if (params.n_ctx > 2048) { - fprintf(stderr, "%s: warning: base model only supports context sizes no greater than 2048 tokens (%d specified);" - " you are on your own\n", __func__, params.n_ctx); + // TODO: determine the actual max context of the model (e.g. 
4096 for LLaMA v2) and use that instead of 2048 + fprintf(stderr, "%s: warning: base model only supports context sizes no greater than 2048 tokens (%d specified)\n", __func__, params.n_ctx); } else if (params.n_ctx < 8) { fprintf(stderr, "%s: warning: minimum context size is 8, using minimum size.\n", __func__); params.n_ctx = 8; diff --git a/ggml-cuda.cu b/ggml-cuda.cu index 2c5d15773..720447440 100644 --- a/ggml-cuda.cu +++ b/ggml-cuda.cu @@ -1787,11 +1787,15 @@ static __global__ void dequantize_mul_mat_vec(const void * __restrict__ vx, cons } } -static __global__ void mul_mat_p021_f16_f32(const void * __restrict__ vx, const float * __restrict__ y, float * __restrict__ dst, const int ncols_x, const int nrows_x, const int nchannels_x) { +static __global__ void mul_mat_p021_f16_f32( + const void * __restrict__ vx, const float * __restrict__ y, float * __restrict__ dst, + const int ncols_x, const int nrows_x, const int nchannels_x, const int nchannels_y) { + const half * x = (const half *) vx; const int row_x = blockDim.y*blockIdx.y + threadIdx.y; const int channel = blockDim.z*blockIdx.z + threadIdx.z; + const int channel_x = channel / (nchannels_y / nchannels_x); const int nrows_y = ncols_x; const int nrows_dst = nrows_x; @@ -1807,7 +1811,7 @@ static __global__ void mul_mat_p021_f16_f32(const void * __restrict__ vx, const } // x is transposed and permuted - const int ix = row_x*nchannels_x*ncols_x + channel*ncols_x + col_x; + const int ix = row_x*nchannels_x*ncols_x + channel_x*ncols_x + col_x; const float xi = __half2float(x[ix]); const int row_y = col_x; @@ -1835,12 +1839,13 @@ static __global__ void mul_mat_p021_f16_f32(const void * __restrict__ vx, const static __global__ void mul_mat_vec_nc_f16_f32( // nc == non-contiguous const void * __restrict__ vx, const float * __restrict__ y, float * __restrict__ dst, const int ncols_x, const int nrows_x, - const int row_stride_x, const int channel_stride_x) { + const int row_stride_x, const int channel_stride_x, const int channel_x_divisor) { const half * x = (const half *) vx; const int row_x = blockDim.y*blockIdx.y + threadIdx.y; const int channel = blockDim.z*blockIdx.z + threadIdx.z; + const int channel_x = channel / channel_x_divisor; const int nrows_y = ncols_x; const int nrows_dst = nrows_x; @@ -1857,7 +1862,7 @@ static __global__ void mul_mat_vec_nc_f16_f32( // nc == non-contiguous break; } - const int ix = channel*channel_stride_x + row_x*row_stride_x + col_x; + const int ix = channel_x*channel_stride_x + row_x*row_stride_x + col_x; const float xi = __half2float(x[ix]); const int row_y = col_x; @@ -2366,20 +2371,23 @@ static to_fp32_cuda_t ggml_get_to_fp32_cuda(ggml_type type) { } } -static void ggml_mul_mat_p021_f16_f32_cuda(const void * vx, const float * y, float * dst, const int ncols_x, const int nrows_x, const int nchannels_x, cudaStream_t stream) { - const dim3 block_nums(1, nrows_x, nchannels_x); +static void ggml_mul_mat_p021_f16_f32_cuda( + const void * vx, const float * y, float * dst, const int ncols_x, const int nrows_x, + const int nchannels_x, const int nchannels_y, cudaStream_t stream) { + + const dim3 block_nums(1, nrows_x, nchannels_y); const dim3 block_dims(WARP_SIZE, 1, 1); - mul_mat_p021_f16_f32<<>>(vx, y, dst, ncols_x, nrows_x, nchannels_x); + mul_mat_p021_f16_f32<<>>(vx, y, dst, ncols_x, nrows_x, nchannels_x, nchannels_y); } static void ggml_mul_mat_vec_nc_f16_f32_cuda( const void * vx, const float * y, float * dst, const int ncols_x, const int nrows_x, const int row_stride_x, - const int nchannels_x, const int 
channel_stride_x, cudaStream_t stream) { + const int nchannels_x, const int nchannels_y, const int channel_stride_x, cudaStream_t stream) { - const dim3 block_nums(1, nrows_x, nchannels_x); + const dim3 block_nums(1, nrows_x, nchannels_y); const dim3 block_dims(WARP_SIZE, 1, 1); mul_mat_vec_nc_f16_f32<<>> - (vx, y, dst, ncols_x, nrows_x, row_stride_x, channel_stride_x); + (vx, y, dst, ncols_x, nrows_x, row_stride_x, channel_stride_x, nchannels_y/nchannels_x); } static void ggml_cpy_f32_f32_cuda( @@ -3143,6 +3151,9 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm const int64_t ne11 = use_src1 ? src1->ne[1] : 1; const int64_t ne12 = use_src1 ? src1->ne[2] : 1; const int64_t ne13 = use_src1 ? src1->ne[3] : 1; + const int64_t nrows1 = use_src1 ? ggml_nrows(src1) : 1; + + GGML_ASSERT(ne03 == ne13); const int64_t ne0 = dst->ne[0]; const int64_t ne1 = dst->ne[1]; @@ -3154,12 +3165,19 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm GGML_ASSERT(!use_src1 || src1->backend != GGML_BACKEND_GPU_SPLIT); // strides for iteration over dims 3 and 2 - const int64_t num_iters = flatten_rows ? 1 : ne02 * ne03; - const int64_t stride_mod = flatten_rows ? ne02 * ne03 : 1; + const int64_t num_iters_0 = ne02 >= ne12 ? ne02*ne03 : ne12*ne13; + const int64_t num_iters = flatten_rows ? 1 : num_iters_0; + const int64_t stride_mod = flatten_rows ? num_iters_0 : 1; const int64_t src0_stride = ne00 * ne01 * stride_mod; const int64_t src1_stride = ne10 * ne11 * stride_mod; const int64_t dst_stride = ne0 * ne1 * stride_mod; + const int64_t rows_per_iter = flatten_rows ? nrows0 : ne01; + const int64_t i03_max = flatten_rows ? 1 : ne03; + const int64_t i02_max = flatten_rows ? 1 : (ne02 >= ne12 ? ne02 : ne12); + const int64_t i02_divisor = ne02 >= ne12 ? 1 : ne12 / ne02; + GGML_ASSERT(!(flatten_rows && ne02 < ne12)); + const size_t src0_ts = ggml_type_size(src0->type); const size_t src0_bs = ggml_blck_size(src0->type); @@ -3176,6 +3194,7 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm dst->op == GGML_OP_SCALE || dst->op == GGML_OP_DIAG_MASK_INF || dst->op == GGML_OP_ROPE); const bool split = src0->backend == GGML_BACKEND_GPU_SPLIT; + GGML_ASSERT(!(split && ne02 < ne12)); const to_fp32_cuda_t to_fp32_cuda = ggml_get_to_fp32_cuda(src0->type); @@ -3212,7 +3231,7 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm row_high = id == g_device_count - 1 ? nrows0 : nrows0*g_tensor_split[id + 1]; } else { row_low = 0; - row_high = nrows0; + row_high = nrows0*i02_divisor; } if (row_low == row_high) { continue; @@ -3260,16 +3279,12 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm dst_ddf[id] = (float *) ggml_cuda_pool_malloc(size_dst_ddf, &dst_asf[id]); } - const int64_t i03_max = flatten_rows ? 1 : ne03; - const int64_t i02_max = flatten_rows ? 1 : ne02; - const int64_t rows_per_iter = flatten_rows ? 
nrows0 : ne01; - for (int64_t i03 = 0; i03 < i03_max; i03++) { const int64_t i13 = i03 % ne13; for (int64_t i02 = 0; i02 < i02_max; i02++) { const int64_t i12 = i02 % ne12; - const int64_t i0 = i03*ne02 + i02; + const int64_t i0 = i03*i02_max + i02; // i0 values that contain the lower/upper rows for a split tensor when using multiple GPUs const int64_t i0_offset_low = row_low/rows_per_iter; @@ -3303,10 +3318,10 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm const int64_t i11 = i13*ne12 + i12; // for split tensors the data begins at i0 == i0_offset_low - char * src0_ddq_i = src0_ddq[id] + (i0 - i0_offset_low)*src0_stride*src0_ts/src0_bs; - float * src0_ddf_i = src0_ddf[id] + (i0 - i0_offset_low)*src0_stride; + char * src0_ddq_i = src0_ddq[id] + (i0/i02_divisor - i0_offset_low)*src0_stride*src0_ts/src0_bs; + float * src0_ddf_i = src0_ddf[id] + (i0/i02_divisor - i0_offset_low)*src0_stride; float * src1_ddf_i = src1_ddf[id] + i11*src1_stride; - float * dst_ddf_i = dst_ddf[id] + (i0 - i0_offset_low)*dst_stride; + float * dst_ddf_i = dst_ddf[id] + (i0 - i0_offset_low)*dst_stride; // for split tensors the data pointer needs to be rounded down // to the bin edge for i03, i02 bins beyond the first @@ -3345,11 +3360,11 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm } } - if (!src0_on_device || !src0_is_contiguous) { + if ((!src0_on_device || !src0_is_contiguous) && i02 % i02_divisor == 0) { if (src0_is_f32) { - CUDA_CHECK(ggml_cuda_cpy_tensor_2d(src0_ddf_i, src0, i03, i02, i01_low, i01_high, cudaStream_main)); + CUDA_CHECK(ggml_cuda_cpy_tensor_2d(src0_ddf_i, src0, i03, i02/i02_divisor, i01_low, i01_high, cudaStream_main)); } else { - CUDA_CHECK(ggml_cuda_cpy_tensor_2d(src0_ddq_i, src0, i03, i02, i01_low, i01_high, cudaStream_main)); + CUDA_CHECK(ggml_cuda_cpy_tensor_2d(src0_ddq_i, src0, i03, i02/i02_divisor, i01_low, i01_high, cudaStream_main)); } } @@ -3503,6 +3518,8 @@ void ggml_cuda_mul_mat_vec_p021(const ggml_tensor * src0, const ggml_tensor * sr const int64_t ne01 = src0->ne[1]; const int64_t ne02 = src0->ne[2]; + const int64_t ne12 = src1->ne[2]; + CUDA_CHECK(cudaSetDevice(g_main_device)); cudaStream_t cudaStream_main = g_cudaStreams_main[g_main_device]; @@ -3515,7 +3532,7 @@ void ggml_cuda_mul_mat_vec_p021(const ggml_tensor * src0, const ggml_tensor * sr struct ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra; float * dst_ddf = (float *) dst_extra->data_device[g_main_device]; - ggml_mul_mat_p021_f16_f32_cuda(src0_ddq, src1_ddf, dst_ddf, ne00, ne01, ne02, cudaStream_main); + ggml_mul_mat_p021_f16_f32_cuda(src0_ddq, src1_ddf, dst_ddf, ne00, ne01, ne02, ne12, cudaStream_main); } void ggml_cuda_mul_mat_vec_nc(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst){ @@ -3529,6 +3546,8 @@ void ggml_cuda_mul_mat_vec_nc(const ggml_tensor * src0, const ggml_tensor * src1 const int64_t ne01 = src0->ne[1]; const int64_t ne02 = src0->ne[2]; + const int64_t ne12 = src1->ne[2]; + const int64_t nb01 = src0->nb[1]; const int64_t nb02 = src0->nb[2]; @@ -3547,7 +3566,7 @@ void ggml_cuda_mul_mat_vec_nc(const ggml_tensor * src0, const ggml_tensor * src1 const int row_stride_x = nb01 / sizeof(half); const int channel_stride_x = nb02 / sizeof(half); - ggml_mul_mat_vec_nc_f16_f32_cuda(src0_ddq, src1_ddf, dst_ddf, ne00, ne01, row_stride_x, ne02, channel_stride_x, cudaStream_main); + ggml_mul_mat_vec_nc_f16_f32_cuda(src0_ddq, src1_ddf, dst_ddf, ne00, ne01, row_stride_x, ne02, ne12, channel_stride_x, 
cudaStream_main); } void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) { diff --git a/llama.cpp b/llama.cpp index 0731c75ad..5a8453bec 100644 --- a/llama.cpp +++ b/llama.cpp @@ -67,6 +67,7 @@ enum e_model { MODEL_13B, MODEL_30B, MODEL_65B, + MODEL_70B, }; static const size_t kB = 1024; @@ -109,6 +110,7 @@ static const std::map & MEM_REQ_SCRATCH0(int n_ctx) { MODEL_13B, ((size_t) n_ctx / 12ull + 120ull) * MB }, { MODEL_30B, ((size_t) n_ctx / 9ull + 160ull) * MB }, { MODEL_65B, ((size_t) n_ctx / 6ull + 256ull) * MB }, // guess + { MODEL_70B, ((size_t) n_ctx / 7ull + 164ull) * MB }, }; return k_sizes; } @@ -121,6 +123,7 @@ static const std::map & MEM_REQ_SCRATCH1() { MODEL_13B, 192ull * MB }, { MODEL_30B, 256ull * MB }, { MODEL_65B, 384ull * MB }, // guess + { MODEL_70B, 304ull * MB }, }; return k_sizes; } @@ -134,6 +137,7 @@ static const std::map & MEM_REQ_EVAL() { MODEL_13B, 12ull * MB }, { MODEL_30B, 16ull * MB }, { MODEL_65B, 24ull * MB }, // guess + { MODEL_70B, 24ull * MB }, }; return k_sizes; } @@ -148,6 +152,7 @@ static const std::map & VRAM_REQ_SCRATCH_BASE() { MODEL_13B, 640ull * kB }, { MODEL_30B, 768ull * kB }, { MODEL_65B, 1536ull * kB }, + { MODEL_70B, 1536ull * kB }, // TODO (likely can be reduced) }; return k_sizes; } @@ -162,19 +167,25 @@ static const std::map & VRAM_REQ_SCRATCH_PER_CONTEXT() { MODEL_13B, 160ull }, { MODEL_30B, 208ull }, { MODEL_65B, 416ull }, + { MODEL_70B, 416ull }, // TODO (likely can be reduced) }; return k_sizes; } // default hparams (LLaMA 7B) struct llama_hparams { - uint32_t n_vocab = 32000; - uint32_t n_ctx = 512; // this is provided as user input? - uint32_t n_embd = 4096; - uint32_t n_mult = 256; - uint32_t n_head = 32; - uint32_t n_layer = 32; - uint32_t n_rot = 64; + uint32_t n_vocab = 32000; + uint32_t n_ctx = 512; // this is provided as user input? 
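// Illustrative aside (not from the diff itself): the identities below restate the helpers this
// patch adds to llama_hparams, and the concrete numbers assume the published LLaMA v2 70B
// configuration (n_embd = 8192, n_head = 64, n_head_kv = 8) rather than anything in this hunk.
//
//   n_gqa()       == n_head / n_head_kv    // query heads that share one KV head          -> 8
//   n_embd_head() == n_embd / n_head       // per-head dimension, asserted equal to n_rot -> 128
//   n_embd_gqa()  == n_embd / n_gqa()      // width of the K/V projections and KV cache   -> 1024
//
// With these values wk/wv are loaded as {n_embd, n_embd_gqa} = {8192, 1024} instead of
// {8192, 8192}, and the KV cache shrinks by the same factor of n_gqa() = 8.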
+ uint32_t n_embd = 4096; + uint32_t n_mult = 256; + uint32_t n_head = 32; + uint32_t n_head_kv = 32; + uint32_t n_layer = 32; + uint32_t n_rot = 64; + + // LLaMAv2 + // TODO: load from model data hparams + float f_ffn_mult = 1.0f; float rope_freq_base = 10000.0f; float rope_freq_scale = 1.0f; @@ -182,12 +193,24 @@ struct llama_hparams { enum llama_ftype ftype = LLAMA_FTYPE_MOSTLY_F16; bool operator!=(const llama_hparams & other) const { - return static_cast(memcmp(this, &other, sizeof(llama_hparams))); + return static_cast(memcmp(this, &other, sizeof(llama_hparams))); // NOLINT + } + + uint32_t n_gqa() const { + return n_head/n_head_kv; + } + + uint32_t n_embd_head() const { + return n_embd/n_head; + } + + uint32_t n_embd_gqa() const { + return n_embd/n_gqa(); } size_t kv_size() const { size_t result = 2ull; - result *= (size_t) n_embd; + result *= (size_t) n_embd_gqa(); result *= (size_t) n_ctx; result *= (size_t) n_layer; result *= sizeof(ggml_fp16_t); @@ -493,12 +516,16 @@ struct llama_file_loader { } void read_hparams() { hparams.n_vocab = file.read_u32(); - hparams.n_embd = file.read_u32(); - hparams.n_mult = file.read_u32(); - hparams.n_head = file.read_u32(); + hparams.n_embd = file.read_u32(); + hparams.n_mult = file.read_u32(); + hparams.n_head = file.read_u32(); hparams.n_layer = file.read_u32(); - hparams.n_rot = file.read_u32(); - hparams.ftype = (enum llama_ftype) file.read_u32(); + hparams.n_rot = file.read_u32(); + hparams.ftype = (enum llama_ftype) file.read_u32(); + + // LLaMAv2 + // TODO: read from header + hparams.n_head_kv = hparams.n_head; } void read_vocab() { vocab.id_to_token.resize(hparams.n_vocab); @@ -797,7 +824,7 @@ static bool kv_cache_init( ggml_type wtype, int n_ctx, int n_gpu_layers) { - const int n_embd = hparams.n_embd; + const int n_embd = hparams.n_embd_gqa(); const int n_layer = hparams.n_layer; const int64_t n_mem = n_layer*n_ctx; @@ -841,6 +868,7 @@ struct llama_context_params llama_context_default_params() { /*.seed =*/ LLAMA_DEFAULT_SEED, /*.n_ctx =*/ 512, /*.n_batch =*/ 512, + /*.n_gqa =*/ 1, /*.gpu_layers =*/ 0, /*.main_gpu =*/ 0, /*.tensor_split =*/ nullptr, @@ -960,6 +988,7 @@ static const char *llama_model_type_name(e_model type) { case MODEL_13B: return "13B"; case MODEL_30B: return "30B"; case MODEL_65B: return "65B"; + case MODEL_70B: return "70B"; default: LLAMA_ASSERT(false); } } @@ -970,6 +999,7 @@ static void llama_model_load_internal( llama_vocab & vocab, int n_ctx, int n_batch, + int n_gqa, int n_gpu_layers, int main_gpu, const float * tensor_split, @@ -991,6 +1021,7 @@ static void llama_model_load_internal( model.hparams = ml->file_loader->hparams; model.n_gpu_layers = n_gpu_layers; llama_file_version file_version = ml->file_loader->file_version; + auto & hparams = model.hparams; { @@ -1010,11 +1041,25 @@ static void llama_model_load_internal( hparams.n_ctx = n_ctx; + // LLaMAv2 + // TODO: temporary until GGUF + LLAMA_ASSERT(hparams.n_head % n_gqa == 0); + hparams.n_head_kv = hparams.n_head / n_gqa; + if (model.type == e_model::MODEL_65B && n_gqa == 8) { + fprintf(stderr, "%s: warning: assuming 70B model based on GQA == %d\n", __func__, n_gqa); + model.type = e_model::MODEL_70B; + hparams.f_ffn_mult = 1.3f; // from the params.json of the 70B model + } + hparams.rope_freq_base = rope_freq_base; hparams.rope_freq_scale = rope_freq_scale; } - const uint32_t n_ff = ((2*(4*hparams.n_embd)/3 + hparams.n_mult - 1)/hparams.n_mult)*hparams.n_mult; + // ref: 
https://github.com/facebookresearch/llama/blob/6c7fe276574e78057f917549435a2554000a876d/llama/model.py#L194-L199 + const uint32_t n_ff_raw = 2*(4*hparams.n_embd)/3; + const uint32_t n_ff_mult = hparams.f_ffn_mult*n_ff_raw; + const uint32_t n_ff = ((n_ff_mult + hparams.n_mult - 1)/hparams.n_mult)*hparams.n_mult; + //const uint32_t n_ff = 28672; { fprintf(stderr, "%s: format = %s\n", __func__, llama_file_version_name(file_version)); @@ -1023,12 +1068,14 @@ static void llama_model_load_internal( fprintf(stderr, "%s: n_embd = %u\n", __func__, hparams.n_embd); fprintf(stderr, "%s: n_mult = %u\n", __func__, hparams.n_mult); fprintf(stderr, "%s: n_head = %u\n", __func__, hparams.n_head); + fprintf(stderr, "%s: n_head_kv = %u\n", __func__, hparams.n_head_kv); fprintf(stderr, "%s: n_layer = %u\n", __func__, hparams.n_layer); - fprintf(stderr, "%s: n_rot = %u\n", __func__, hparams.n_rot); + fprintf(stderr, "%s: n_rot = %u\n", __func__, hparams.n_rot); // a.k.a. n_embd_head, n_head_dim + fprintf(stderr, "%s: n_gqa = %u\n", __func__, hparams.n_gqa()); + fprintf(stderr, "%s: n_ff = %u\n", __func__, n_ff); fprintf(stderr, "%s: freq_base = %.1f\n", __func__, hparams.rope_freq_base); fprintf(stderr, "%s: freq_scale = %g\n", __func__, hparams.rope_freq_scale); fprintf(stderr, "%s: ftype = %u (%s)\n", __func__, hparams.ftype, llama_ftype_name(hparams.ftype)); - fprintf(stderr, "%s: n_ff = %u\n", __func__, n_ff); fprintf(stderr, "%s: model size = %s\n", __func__, llama_model_type_name(model.type)); } @@ -1098,9 +1145,10 @@ static void llama_model_load_internal( size_t vram_weights = 0; size_t vram_scratch = 0; { - const uint32_t n_embd = hparams.n_embd; - const uint32_t n_layer = hparams.n_layer; - const uint32_t n_vocab = hparams.n_vocab; + const uint32_t n_embd = hparams.n_embd; + const uint32_t n_embd_gqa = hparams.n_embd_gqa(); + const uint32_t n_layer = hparams.n_layer; + const uint32_t n_vocab = hparams.n_vocab; ml->ggml_ctx = ctx; @@ -1148,16 +1196,16 @@ static void llama_model_load_internal( layer.attention_norm = ml->get_tensor(layers_i + ".attention_norm.weight", {n_embd}, backend); - layer.wq = ml->get_tensor(layers_i + ".attention.wq.weight", {n_embd, n_embd}, backend_split); - layer.wk = ml->get_tensor(layers_i + ".attention.wk.weight", {n_embd, n_embd}, backend_split); - layer.wv = ml->get_tensor(layers_i + ".attention.wv.weight", {n_embd, n_embd}, backend_split); - layer.wo = ml->get_tensor(layers_i + ".attention.wo.weight", {n_embd, n_embd}, backend_split); + layer.wq = ml->get_tensor(layers_i + ".attention.wq.weight", {n_embd, n_embd}, backend_split); + layer.wk = ml->get_tensor(layers_i + ".attention.wk.weight", {n_embd, n_embd_gqa}, backend_split); + layer.wv = ml->get_tensor(layers_i + ".attention.wv.weight", {n_embd, n_embd_gqa}, backend_split); + layer.wo = ml->get_tensor(layers_i + ".attention.wo.weight", {n_embd, n_embd}, backend_split); layer.ffn_norm = ml->get_tensor(layers_i + ".ffn_norm.weight", {n_embd}, backend); - layer.w1 = ml->get_tensor(layers_i + ".feed_forward.w1.weight", {n_embd, n_ff}, backend_split); - layer.w2 = ml->get_tensor(layers_i + ".feed_forward.w2.weight", { n_ff, n_embd}, backend_split); - layer.w3 = ml->get_tensor(layers_i + ".feed_forward.w3.weight", {n_embd, n_ff}, backend_split); + layer.w1 = ml->get_tensor(layers_i + ".feed_forward.w1.weight", {n_embd, n_ff}, backend_split); + layer.w2 = ml->get_tensor(layers_i + ".feed_forward.w2.weight", { n_ff, n_embd}, backend_split); + layer.w3 = ml->get_tensor(layers_i + ".feed_forward.w3.weight", {n_embd, n_ff}, 
backend_split); if (backend == GGML_BACKEND_GPU) { vram_weights += @@ -1281,6 +1329,7 @@ static bool llama_model_load( llama_vocab & vocab, int n_ctx, int n_batch, + int n_gqa, int n_gpu_layers, int main_gpu, const float * tensor_split, @@ -1294,7 +1343,7 @@ static bool llama_model_load( llama_progress_callback progress_callback, void *progress_callback_user_data) { try { - llama_model_load_internal(fname, model, vocab, n_ctx, n_batch, n_gpu_layers, main_gpu, tensor_split, rope_freq_base, rope_freq_scale, low_vram, memory_type, + llama_model_load_internal(fname, model, vocab, n_ctx, n_batch, n_gqa, n_gpu_layers, main_gpu, tensor_split, rope_freq_base, rope_freq_scale, low_vram, memory_type, use_mmap, use_mlock, vocab_only, progress_callback, progress_callback_user_data); return true; } catch (const std::exception & err) { @@ -1338,17 +1387,22 @@ static bool llama_eval_internal( LLAMA_ASSERT(!!kv_self.ctx); - const int n_embd = hparams.n_embd; - const int n_layer = hparams.n_layer; - const int n_ctx = hparams.n_ctx; - const int n_head = hparams.n_head; - const int n_vocab = hparams.n_vocab; - const int n_rot = hparams.n_embd/hparams.n_head; - const int n_gpu_layers = model.n_gpu_layers; + const int64_t n_embd = hparams.n_embd; + const int64_t n_layer = hparams.n_layer; + const int64_t n_ctx = hparams.n_ctx; + const int64_t n_head = hparams.n_head; + const int64_t n_head_kv = hparams.n_head_kv; + const int64_t n_embd_head = hparams.n_embd_head(); + const int64_t n_vocab = hparams.n_vocab; + const int64_t n_embd_gqa = hparams.n_embd_gqa(); + + LLAMA_ASSERT(n_embd_head == hparams.n_rot); const float freq_base = hparams.rope_freq_base; const float freq_scale = hparams.rope_freq_scale; + const int n_gpu_layers = model.n_gpu_layers; + auto & mem_per_token = lctx.mem_per_token; auto & buf_compute = lctx.buf_compute; @@ -1446,11 +1500,11 @@ static bool llama_eval_internal( offload_func_kq(tmpq); ggml_set_name(tmpq, "tmpq"); - struct ggml_tensor * Kcur = ggml_rope_custom_inplace(ctx0, ggml_reshape_3d(ctx0, tmpk, n_embd/n_head, n_head, N), n_past, n_rot, 0, 0, freq_base, freq_scale); + struct ggml_tensor * Kcur = ggml_rope_custom_inplace(ctx0, ggml_reshape_3d(ctx0, tmpk, n_embd_head, n_head_kv, N), n_past, n_embd_head, 0, 0, freq_base, freq_scale); offload_func_kq(Kcur); ggml_set_name(Kcur, "Kcur"); - struct ggml_tensor * Qcur = ggml_rope_custom_inplace(ctx0, ggml_reshape_3d(ctx0, tmpq, n_embd/n_head, n_head, N), n_past, n_rot, 0, 0, freq_base, freq_scale); + struct ggml_tensor * Qcur = ggml_rope_custom_inplace(ctx0, ggml_reshape_3d(ctx0, tmpq, n_embd_head, n_head, N), n_past, n_embd_head, 0, 0, freq_base, freq_scale); offload_func_kq(Qcur); ggml_set_name(Qcur, "Qcur"); @@ -1462,17 +1516,17 @@ static bool llama_eval_internal( offload_func_v(tmpv); ggml_set_name(tmpv, "tmpv"); - struct ggml_tensor * Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, tmpv, n_embd, N)); + struct ggml_tensor * Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, tmpv, n_embd_gqa, N)); offload_func_v(Vcur); ggml_set_name(Vcur, "Vcur"); - struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, N*n_embd, (ggml_element_size(kv_self.k)*n_embd)*(il*n_ctx + n_past)); + struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, N*n_embd_gqa, (ggml_element_size(kv_self.k)*n_embd_gqa)*(il*n_ctx + n_past)); offload_func_kq(k); ggml_set_name(k, "k"); - struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, N, n_embd, + struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, N, n_embd_gqa, ( n_ctx)*ggml_element_size(kv_self.v), - 
(il*n_ctx)*ggml_element_size(kv_self.v)*n_embd + n_past*ggml_element_size(kv_self.v)); + (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd_gqa + n_past*ggml_element_size(kv_self.v)); offload_func_v(v); ggml_set_name(v, "v"); @@ -1491,8 +1545,8 @@ static bool llama_eval_internal( struct ggml_tensor * K = ggml_permute(ctx0, ggml_reshape_3d(ctx0, - ggml_view_1d(ctx0, kv_self.k, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(kv_self.k)*n_embd), - n_embd/n_head, n_head, n_past + N), + ggml_view_1d(ctx0, kv_self.k, (n_past + N)*n_embd_gqa, il*n_ctx*ggml_element_size(kv_self.k)*n_embd_gqa), + n_embd_head, n_head_kv, n_past + N), 0, 2, 1, 3); offload_func_kq(K); ggml_set_name(K, "K"); @@ -1502,9 +1556,9 @@ static bool llama_eval_internal( offload_func_kq(KQ); ggml_set_name(KQ, "KQ"); - // KQ_scaled = KQ / sqrt(n_embd/n_head) + // KQ_scaled = KQ / sqrt(n_embd_head) struct ggml_tensor * KQ_scale = ggml_new_f32(ctx0, 1.0f/sqrtf(float(n_embd)/n_head)); - ggml_set_name(KQ_scale, "1/sqrt(n_embd/n_head)"); + ggml_set_name(KQ_scale, "1/sqrt(n_embd_head)"); // KQ_scaled shape [n_past + N, N, n_head, 1] struct ggml_tensor * KQ_scaled = ggml_scale_inplace(ctx0, KQ, KQ_scale); @@ -1524,10 +1578,10 @@ static bool llama_eval_internal( // split cached V into n_head heads struct ggml_tensor * V = ggml_view_3d(ctx0, kv_self.v, - n_past + N, n_embd/n_head, n_head, + n_past + N, n_embd_head, n_head_kv, n_ctx*ggml_element_size(kv_self.v), - n_ctx*ggml_element_size(kv_self.v)*n_embd/n_head, - il*n_ctx*ggml_element_size(kv_self.v)*n_embd); + n_ctx*ggml_element_size(kv_self.v)*n_embd_head, + n_ctx*ggml_element_size(kv_self.v)*n_embd_gqa*il); offload_func_v(V); ggml_set_name(V, "V"); @@ -1539,7 +1593,7 @@ static bool llama_eval_internal( // make V contiguous in memory to speed up the matmul, however we waste time on the copy // on M1 this is faster for the perplexity computation, but ~5% slower for the single-token generation // is there a better way? - struct ggml_tensor * V_cont = ggml_cpy(ctx0, V, ggml_new_tensor_3d(ctx0, kv_self.v->type, n_past + N, n_embd/n_head, n_head)); + struct ggml_tensor * V_cont = ggml_cpy(ctx0, V, ggml_new_tensor_3d(ctx0, kv_self.v->type, n_past + N, n_embd_head, n_head)); struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V_cont, KQ_soft_max); #endif @@ -2693,7 +2747,7 @@ struct llama_model * llama_load_model_from_file( ggml_type memory_type = params.f16_kv ? 
GGML_TYPE_F16 : GGML_TYPE_F32; - if (!llama_model_load(path_model, *model, model->vocab, params.n_ctx, params.n_batch, params.n_gpu_layers, + if (!llama_model_load(path_model, *model, model->vocab, params.n_ctx, params.n_batch, params.n_gqa, params.n_gpu_layers, params.main_gpu, params.tensor_split, params.rope_freq_base, params.rope_freq_scale,params.low_vram, memory_type, params.use_mmap, params.use_mlock, params.vocab_only, params.progress_callback, params.progress_callback_user_data)) { diff --git a/llama.h b/llama.h index bbf28e686..1089909a6 100644 --- a/llama.h +++ b/llama.h @@ -83,11 +83,12 @@ extern "C" { typedef void (*llama_progress_callback)(float progress, void *ctx); struct llama_context_params { - uint32_t seed; // RNG seed, -1 for random - int32_t n_ctx; // text context - int32_t n_batch; // prompt processing batch size - int32_t n_gpu_layers; // number of layers to store in VRAM - int32_t main_gpu; // the GPU that is used for scratch and small tensors + uint32_t seed; // RNG seed, -1 for random + int32_t n_ctx; // text context + int32_t n_batch; // prompt processing batch size + int32_t n_gqa; // grouped-query attention (TEMP - will be moved to model hparams) + int32_t n_gpu_layers; // number of layers to store in VRAM + int32_t main_gpu; // the GPU that is used for scratch and small tensors const float * tensor_split; // how to split layers across multiple GPUs (size: LLAMA_MAX_DEVICES) From 95a6c595e7ca8dbe47ccf8824e04213e10357f9a Mon Sep 17 00:00:00 2001 From: slaren Date: Sun, 23 Jul 2023 14:36:02 +0200 Subject: [PATCH 11/14] ggml: move op parameters from tensors to ggml_tensor::op_params (#2333) * ggml: move op parameters from tensors to ggml_tensor::op_params * alibi: use memcpy for float params * remove `src[1] = NULL` in ops --- ggml-cuda.cu | 21 +- ggml-metal.m | 20 +- ggml.c | 667 +++++++++++++++------------------------------------ ggml.h | 4 + 4 files changed, 226 insertions(+), 486 deletions(-) diff --git a/ggml-cuda.cu b/ggml-cuda.cu index 720447440..6fb55d838 100644 --- a/ggml-cuda.cu +++ b/ggml-cuda.cu @@ -2742,6 +2742,7 @@ inline void ggml_cuda_op_mul( (void) dst; (void) src0_ddq_i; (void) i02; + (void) i1; } inline void ggml_cuda_op_gelu( @@ -3037,15 +3038,15 @@ inline void ggml_cuda_op_rope( const int64_t ne00 = src0->ne[0]; const int64_t i01_diff = i01_high - i01_low; - const int n_past = ((int32_t *) src1->data)[0]; - const int n_dims = ((int32_t *) src1->data)[1]; - const int mode = ((int32_t *) src1->data)[2]; - const int n_ctx = ((int32_t *) src1->data)[3]; - + const int n_past = ((int32_t *) dst->op_params)[0]; + const int n_dims = ((int32_t *) dst->op_params)[1]; + const int mode = ((int32_t *) dst->op_params)[2]; + const int n_ctx = ((int32_t *) dst->op_params)[3]; // RoPE alteration for extended context + float freq_base, freq_scale; - memcpy(&freq_base, (int32_t *) src1->data + 4, sizeof(float)); - memcpy(&freq_scale, (int32_t *) src1->data + 5, sizeof(float)); + memcpy(&freq_base, (int32_t *) dst->op_params + 4, sizeof(float)); + memcpy(&freq_scale, (int32_t *) dst->op_params + 5, sizeof(float)); const float theta_scale = powf(freq_base, -2.0f/n_dims); const float p = (((mode & 1) == 0 ? 
n_past + i02 : i02)) * freq_scale; @@ -3061,6 +3062,7 @@ inline void ggml_cuda_op_rope( rope_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, p, theta_scale, cudaStream_main); } + (void) src1; (void) dst; (void) src0_ddq_i; (void) src1_ddf_i; @@ -3079,11 +3081,12 @@ inline void ggml_cuda_op_diag_mask_inf( const int64_t ne01 = src0->ne[1]; const int64_t i01_diff = i01_high - i01_low; - const int n_past = ((int32_t *) src1->data)[0]; + const int n_past = ((int32_t *) dst->op_params)[0]; // compute diag_mask_inf_f32_cuda(src0_ddf_i, dst_ddf_i, ne00, i01_diff, ne01, n_past, cudaStream_main); + (void) src1; (void) dst; (void) src0_ddq_i; (void) src1_ddf_i; @@ -3803,7 +3806,7 @@ void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scratch, bo char * src0_ddc = (char *) src0_extra->data_device[g_main_device]; size_t offset = 0; if (tensor->op == GGML_OP_VIEW) { - memcpy(&offset, tensor->src[2]->data, sizeof(size_t)); + memcpy(&offset, tensor->op_params, sizeof(size_t)); } extra = ggml_cuda_alloc_temp_tensor_extra(); extra->data_device[g_main_device] = src0_ddc + offset; diff --git a/ggml-metal.m b/ggml-metal.m index 78a3b65f1..bf3f68fe4 100644 --- a/ggml-metal.m +++ b/ggml-metal.m @@ -585,7 +585,7 @@ void ggml_metal_graph_compute( encoder = [command_buffer computeCommandEncoder]; } - const int n_past = ((int32_t *)(src1->data))[0]; + const int n_past = ((int32_t *)(dst->op_params))[0]; [encoder setComputePipelineState:ctx->pipeline_diag_mask_inf]; [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; @@ -850,9 +850,10 @@ void ggml_metal_graph_compute( GGML_ASSERT((src0t == GGML_TYPE_F32)); - const int n_past = ((int32_t *) src1->data)[0]; UNUSED(n_past); - const int n_head = ((int32_t *) src1->data)[1]; - const float max_bias = ((float *) src1->data)[2]; + const int n_past = ((int32_t *) dst->op_params)[0]; UNUSED(n_past); + const int n_head = ((int32_t *) dst->op_params)[1]; + float max_bias; + memcpy(&max_bias, (int32_t *) dst->op_params + 2, sizeof(float)); if (__builtin_popcount(n_head) != 1) { GGML_ASSERT(false && "only power-of-two n_head implemented"); @@ -890,15 +891,14 @@ void ggml_metal_graph_compute( encoder = [command_buffer computeCommandEncoder]; } - const int n_dims = ((int32_t *) src1->data)[1]; - const int mode = ((int32_t *) src1->data)[2]; - - const int n_past = ((int32_t *)(src1->data))[0]; + const int n_past = ((int32_t *) dst->op_params)[0]; + const int n_dims = ((int32_t *) dst->op_params)[1]; + const int mode = ((int32_t *) dst->op_params)[2]; float freq_base; float freq_scale; - memcpy(&freq_base, (int32_t *) src1->data + 4, sizeof(float)); - memcpy(&freq_scale, (int32_t *) src1->data + 5, sizeof(float)); + memcpy(&freq_base, (int32_t *) dst->op_params + 4, sizeof(float)); + memcpy(&freq_scale, (int32_t *) dst->op_params + 5, sizeof(float)); [encoder setComputePipelineState:ctx->pipeline_rope]; [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; diff --git a/ggml.c b/ggml.c index 6055da867..747a39241 100644 --- a/ggml.c +++ b/ggml.c @@ -4590,6 +4590,7 @@ struct ggml_tensor * ggml_new_tensor_impl( /*.ne =*/ { 1, 1, 1, 1 }, /*.nb =*/ { 0, 0, 0, 0 }, /*.op =*/ GGML_OP_NONE, + /*.op_params =*/ {0}, /*.is_param =*/ false, /*.grad =*/ NULL, /*.src =*/ { NULL }, @@ -4969,6 +4970,11 @@ struct ggml_tensor * ggml_format_name(struct ggml_tensor * tensor, const char * return tensor; } +static void ggml_set_op_params(struct ggml_tensor * tensor, const void * params, size_t params_size) { + assert(params_size <= GGML_MAX_OP_PARAMS); + memcpy(tensor->op_params, 
params, params_size); +} + struct ggml_tensor * ggml_view_tensor( struct ggml_context * ctx, const struct ggml_tensor * src) { @@ -5019,7 +5025,6 @@ struct ggml_tensor * ggml_dup_impl( result->op = GGML_OP_DUP; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; result->src[0] = a; - result->src[1] = NULL; return result; } @@ -5143,23 +5148,13 @@ struct ggml_tensor * ggml_acc_impl( struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); - ggml_scratch_save(ctx); - - struct ggml_tensor * c = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 5); - - ((int32_t *) c->data)[0] = nb1; - ((int32_t *) c->data)[1] = nb2; - ((int32_t *) c->data)[2] = nb3; - ((int32_t *) c->data)[3] = offset; - ((int32_t *) c->data)[4] = inplace ? 1 : 0; - - ggml_scratch_load(ctx); + int32_t params[] = { nb1, nb2, nb3, offset, inplace ? 1 : 0 }; + ggml_set_op_params(result, params, sizeof(params)); result->op = GGML_OP_ACC; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; result->src[0] = a; result->src[1] = b; - result->src[2] = c; return result; } @@ -5332,7 +5327,6 @@ struct ggml_tensor * ggml_sqr_impl( result->op = GGML_OP_SQR; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; result->src[0] = a; - result->src[1] = NULL; return result; } @@ -5366,7 +5360,6 @@ struct ggml_tensor * ggml_sqrt_impl( result->op = GGML_OP_SQRT; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; result->src[0] = a; - result->src[1] = NULL; return result; } @@ -5401,7 +5394,6 @@ struct ggml_tensor * ggml_log_impl( result->op = GGML_OP_LOG; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; result->src[0] = a; - result->src[1] = NULL; return result; } @@ -5434,7 +5426,6 @@ struct ggml_tensor * ggml_sum( result->op = GGML_OP_SUM; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; result->src[0] = a; - result->src[1] = NULL; return result; } @@ -5461,7 +5452,6 @@ struct ggml_tensor * ggml_sum_rows( result->op = GGML_OP_SUM_ROWS; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; result->src[0] = a; - result->src[1] = NULL; return result; } @@ -5484,7 +5474,6 @@ struct ggml_tensor * ggml_mean( result->op = GGML_OP_MEAN; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; result->src[0] = a; - result->src[1] = NULL; return result; } @@ -5508,7 +5497,6 @@ struct ggml_tensor * ggml_argmax( result->op = GGML_OP_ARGMAX; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; result->src[0] = a; - result->src[1] = NULL; return result; } @@ -5586,7 +5574,6 @@ struct ggml_tensor * ggml_abs_impl( result->op = GGML_OP_ABS; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; result->src[0] = a; - result->src[1] = NULL; return result; } @@ -5621,7 +5608,6 @@ struct ggml_tensor * ggml_sgn_impl( result->op = GGML_OP_SGN; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; result->src[0] = a; - result->src[1] = NULL; return result; } @@ -5655,7 +5641,6 @@ struct ggml_tensor * ggml_neg_impl( result->op = GGML_OP_NEG; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; result->src[0] = a; - result->src[1] = NULL; return result; } @@ -5689,7 +5674,6 @@ struct ggml_tensor * ggml_step_impl( result->op = GGML_OP_STEP; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; result->src[0] = a; - result->src[1] = NULL; return result; } @@ -5723,7 +5707,6 @@ struct ggml_tensor * ggml_tanh_impl( result->op = GGML_OP_TANH; result->grad = is_node ? 
ggml_dup_tensor(ctx, result) : NULL; result->src[0] = a; - result->src[1] = NULL; return result; } @@ -5757,7 +5740,6 @@ struct ggml_tensor * ggml_elu_impl( result->op = GGML_OP_ELU; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; result->src[0] = a; - result->src[1] = NULL; return result; } @@ -5791,7 +5773,6 @@ struct ggml_tensor * ggml_relu_impl( result->op = GGML_OP_RELU; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; result->src[0] = a; - result->src[1] = NULL; return result; } @@ -5825,7 +5806,6 @@ struct ggml_tensor * ggml_gelu_impl( result->op = GGML_OP_GELU; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; result->src[0] = a; - result->src[1] = NULL; return result; } @@ -5859,7 +5839,6 @@ struct ggml_tensor * ggml_gelu_quick_impl( result->op = GGML_OP_GELU_QUICK; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; result->src[0] = a; - result->src[1] = NULL; return result; } @@ -5893,7 +5872,6 @@ struct ggml_tensor * ggml_silu_impl( result->op = GGML_OP_SILU; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; result->src[0] = a; - result->src[1] = NULL; return result; } @@ -5948,10 +5926,11 @@ struct ggml_tensor * ggml_norm_impl( struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); + // TODO: maybe store epsilon here? + result->op = GGML_OP_NORM; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; result->src[0] = a; - result->src[1] = NULL; // TODO: maybe store epsilon here? return result; } @@ -5980,10 +5959,11 @@ struct ggml_tensor * ggml_rms_norm_impl( struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); + // TODO: maybe store epsilon here? + result->op = GGML_OP_RMS_NORM; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; result->src[0] = a; - result->src[1] = NULL; // TODO: maybe store epsilon here? return result; } @@ -6136,23 +6116,13 @@ struct ggml_tensor * ggml_set_impl( // make a view of the destination struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); - ggml_scratch_save(ctx); - - struct ggml_tensor * c = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 5); - - (( int32_t * ) c->data)[0] = nb1; - (( int32_t * ) c->data)[1] = nb2; - (( int32_t * ) c->data)[2] = nb3; - (( int32_t * ) c->data)[3] = offset; - (( int32_t * ) c->data)[4] = inplace ? 1 : 0; - - ggml_scratch_load(ctx); + int32_t params[] = { nb1, nb2, nb3, offset, inplace ? 1 : 0 }; + ggml_set_op_params(result, params, sizeof(params)); result->op = GGML_OP_SET; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; result->src[0] = a; result->src[1] = b; - result->src[2] = c; return result; } @@ -6277,7 +6247,6 @@ struct ggml_tensor * ggml_cont_impl( result->op = GGML_OP_CONT; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; result->src[0] = a; - result->src[1] = NULL; return result; } @@ -6321,7 +6290,6 @@ struct ggml_tensor * ggml_reshape( result->op = GGML_OP_RESHAPE; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; result->src[0] = a; - result->src[1] = NULL; return result; } @@ -6346,7 +6314,6 @@ struct ggml_tensor * ggml_reshape_1d( result->op = GGML_OP_RESHAPE; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; result->src[0] = a; - result->src[1] = NULL; return result; } @@ -6372,7 +6339,6 @@ struct ggml_tensor * ggml_reshape_2d( result->op = GGML_OP_RESHAPE; result->grad = is_node ? 
ggml_dup_tensor(ctx, result) : NULL; result->src[0] = a; - result->src[1] = NULL; return result; } @@ -6399,7 +6365,6 @@ struct ggml_tensor * ggml_reshape_3d( result->op = GGML_OP_RESHAPE; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; result->src[0] = a; - result->src[1] = NULL; return result; } @@ -6428,7 +6393,6 @@ struct ggml_tensor * ggml_reshape_4d( result->op = GGML_OP_RESHAPE; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; result->src[0] = a; - result->src[1] = NULL; return result; } @@ -6450,19 +6414,11 @@ struct ggml_tensor * ggml_view_1d( struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 1, &ne0, (char *) a->data + offset); ggml_format_name(result, "%s (view)", a->name); - ggml_scratch_save(ctx); - - struct ggml_tensor * offs = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 2); - ggml_set_name(offs, "offset"); - memcpy(offs->data, &offset, 2*sizeof(int32_t)); - - ggml_scratch_load(ctx); + ggml_set_op_params(result, &offset, sizeof(offset)); result->op = GGML_OP_VIEW; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; result->src[0] = a; - result->src[1] = NULL; - result->src[2] = offs; return result; } @@ -6488,13 +6444,7 @@ struct ggml_tensor * ggml_view_2d( struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 2, ne, (char *) a->data + offset); ggml_format_name(result, "%s (view)", a->name); - ggml_scratch_save(ctx); - - struct ggml_tensor * offs = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 2); - ggml_set_name(offs, "offset"); - memcpy(offs->data, &offset, 2*sizeof(int32_t)); - - ggml_scratch_load(ctx); + ggml_set_op_params(result, &offset, sizeof(offset)); result->nb[1] = nb1; result->nb[2] = result->nb[1]*ne1; @@ -6503,8 +6453,6 @@ struct ggml_tensor * ggml_view_2d( result->op = GGML_OP_VIEW; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; result->src[0] = a; - result->src[1] = NULL; - result->src[2] = offs; return result; } @@ -6532,13 +6480,7 @@ struct ggml_tensor * ggml_view_3d( struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 3, ne, (char *) a->data + offset); ggml_format_name(result, "%s (view)", a->name); - ggml_scratch_save(ctx); - - struct ggml_tensor * offs = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 2); - ggml_set_name(offs, "offset"); - memcpy(offs->data, &offset, 2*sizeof(int32_t)); - - ggml_scratch_load(ctx); + ggml_set_op_params(result, &offset, sizeof(offset)); result->nb[1] = nb1; result->nb[2] = nb2; @@ -6547,8 +6489,6 @@ struct ggml_tensor * ggml_view_3d( result->op = GGML_OP_VIEW; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; result->src[0] = a; - result->src[1] = NULL; - result->src[2] = offs; return result; } @@ -6578,13 +6518,7 @@ struct ggml_tensor * ggml_view_4d( struct ggml_tensor * result = ggml_new_tensor_impl(ctx, a->type, 4, ne, (char *) a->data + offset); ggml_format_name(result, "%s (view)", a->name); - ggml_scratch_save(ctx); - - struct ggml_tensor * offs = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 2); - ggml_set_name(offs, "offset"); - memcpy(offs->data, &offset, 2*sizeof(int32_t)); - - ggml_scratch_load(ctx); + ggml_set_op_params(result, &offset, sizeof(offset)); result->nb[1] = nb1; result->nb[2] = nb2; @@ -6593,8 +6527,6 @@ struct ggml_tensor * ggml_view_4d( result->op = GGML_OP_VIEW; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; result->src[0] = a; - result->src[1] = NULL; - result->src[2] = offs; return result; } @@ -6655,22 +6587,9 @@ struct ggml_tensor * ggml_permute( result->op = GGML_OP_PERMUTE; result->grad = is_node ? 
ggml_dup_tensor(ctx, result) : NULL; result->src[0] = a; - result->src[1] = NULL; - if (is_node) { - ggml_scratch_save(ctx); - - struct ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 4); - - ((int32_t *) b->data)[0] = axis0; - ((int32_t *) b->data)[1] = axis1; - ((int32_t *) b->data)[2] = axis2; - ((int32_t *) b->data)[3] = axis3; - - ggml_scratch_load(ctx); - - result->src[2] = b; - } + int32_t params[] = { axis0, axis1, axis2, axis3 }; + ggml_set_op_params(result, ¶ms, sizeof(params)); return result; } @@ -6698,7 +6617,6 @@ struct ggml_tensor * ggml_transpose( result->op = GGML_OP_TRANSPOSE; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; result->src[0] = a; - result->src[1] = NULL; return result; } @@ -6776,7 +6694,6 @@ struct ggml_tensor * ggml_diag( result->op = GGML_OP_DIAG; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; result->src[0] = a; - result->src[1] = NULL; return result; } @@ -6797,19 +6714,12 @@ struct ggml_tensor * ggml_diag_mask_inf_impl( struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); - ggml_scratch_save(ctx); - - struct ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 2); - - ((int32_t *) b->data)[0] = n_past; - ((int32_t *) b->data)[1] = inplace ? 1 : 0; - - ggml_scratch_load(ctx); + int32_t params[] = { n_past, inplace ? 1 : 0 }; + ggml_set_op_params(result, ¶ms, sizeof(params)); result->op = GGML_OP_DIAG_MASK_INF; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; result->src[0] = a; - result->src[1] = b; return result; } @@ -6844,20 +6754,12 @@ struct ggml_tensor * ggml_diag_mask_zero_impl( struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); - ggml_scratch_save(ctx); - - struct ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 2); - ggml_set_name(b, "n_past, inplace"); - - ((int32_t *) b->data)[0] = n_past; - ((int32_t *) b->data)[1] = inplace ? 1 : 0; - - ggml_scratch_load(ctx); + int32_t params[] = { n_past, inplace ? 1 : 0 }; + ggml_set_op_params(result, ¶ms, sizeof(params)); result->op = GGML_OP_DIAG_MASK_ZERO; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; result->src[0] = a; - result->src[1] = b; return result; } @@ -6893,7 +6795,6 @@ struct ggml_tensor * ggml_soft_max_impl( result->op = GGML_OP_SOFT_MAX; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; result->src[0] = a; - result->src[1] = NULL; return result; } @@ -6969,23 +6870,14 @@ struct ggml_tensor * ggml_rope_impl( struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); - ggml_scratch_save(ctx); - - struct ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 6); - - ((int32_t *) b->data)[0] = n_past; - ((int32_t *) b->data)[1] = n_dims; - ((int32_t *) b->data)[2] = mode; - ((int32_t *) b->data)[3] = n_ctx; - memcpy((int32_t *) b->data + 4, &freq_base, sizeof(float)); - memcpy((int32_t *) b->data + 5, &freq_scale, sizeof(float)); - - ggml_scratch_load(ctx); + int32_t params[6] = { n_past, n_dims, mode, n_ctx }; + memcpy(params + 4, &freq_base, sizeof(float)); + memcpy(params + 5, &freq_scale, sizeof(float)); + ggml_set_op_params(result, ¶ms, sizeof(params)); result->op = GGML_OP_ROPE; result->grad = is_node ? 
ggml_dup_tensor(ctx, result) : NULL; result->src[0] = a; - result->src[1] = b; return result; } @@ -7042,22 +6934,12 @@ struct ggml_tensor * ggml_rope_back( struct ggml_tensor * result = ggml_dup_tensor(ctx, a); - ggml_scratch_save(ctx); - - struct ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 4); - ggml_set_name(b, "n_past, n_dims, mode"); - - ((int32_t *) b->data)[0] = n_past; - ((int32_t *) b->data)[1] = n_dims; - ((int32_t *) b->data)[2] = mode; - ((int32_t *) b->data)[3] = n_ctx; - - ggml_scratch_load(ctx); + int32_t params[] = { n_past, n_dims, mode, n_ctx }; + ggml_set_op_params(result, ¶ms, sizeof(params)); result->op = GGML_OP_ROPE_BACK; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; result->src[0] = a; - result->src[1] = b; return result; } @@ -7082,21 +6964,13 @@ struct ggml_tensor * ggml_alibi( //struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); struct ggml_tensor * result = ggml_view_tensor(ctx, a); - ggml_scratch_save(ctx); - - struct ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 3); - - ((int32_t *) b->data)[0] = n_past; - ((int32_t *) b->data)[1] = n_head; - GGML_ASSERT(sizeof(float) == sizeof(int32_t)); - (((float *) b->data)[2]) = bias_max; - - ggml_scratch_load(ctx); + int32_t op_params[3] = { n_past, n_head }; + memcpy(op_params + 2, &bias_max, sizeof(float)); + ggml_set_op_params(result, &op_params, sizeof(op_params)); result->op = GGML_OP_ALIBI; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; result->src[0] = a; - result->src[1] = b; return result; } @@ -7118,19 +6992,12 @@ struct ggml_tensor * ggml_clamp( // TODO: when implement backward, fix this: struct ggml_tensor * result = ggml_view_tensor(ctx, a); - ggml_scratch_save(ctx); - - struct ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 2); - - ((float *) b->data)[0] = min; - ((float *) b->data)[1] = max; - - ggml_scratch_load(ctx); + float params[] = { min, max }; + ggml_set_op_params(result, ¶ms, sizeof(params)); result->op = GGML_OP_CLAMP; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; result->src[0] = a; - result->src[1] = b; return result; } @@ -7163,18 +7030,13 @@ GGML_API struct ggml_tensor * ggml_conv_1d( }; struct ggml_tensor* result = ggml_new_tensor(ctx, GGML_TYPE_F32, 2, ne); - ggml_scratch_save(ctx); - struct ggml_tensor* c = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 3); - ((int32_t*)c->data)[0] = s0; - ((int32_t*)c->data)[1] = p0; - ((int32_t*)c->data)[2] = d0; - ggml_scratch_load(ctx); + int32_t params[] = { s0, p0, d0 }; + ggml_set_op_params(result, ¶ms, sizeof(params)); result->op = GGML_OP_CONV_1D; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; result->src[0] = a; result->src[1] = b; - result->src[2] = c; return result; } @@ -7207,21 +7069,13 @@ struct ggml_tensor* ggml_conv_2d( }; struct ggml_tensor* result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne); - ggml_scratch_save(ctx); - struct ggml_tensor* c = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 6); - ((int32_t*)c->data)[0] = s0; - ((int32_t*)c->data)[1] = s1; - ((int32_t*)c->data)[2] = p0; - ((int32_t*)c->data)[3] = p1; - ((int32_t*)c->data)[4] = d0; - ((int32_t*)c->data)[5] = d1; - ggml_scratch_load(ctx); + int32_t params[] = { s0, s1, p0, p1, d0, d1 }; + ggml_set_op_params(result, ¶ms, sizeof(params)); result->op = GGML_OP_CONV_2D; result->grad = is_node ? 
ggml_dup_tensor(ctx, result) : NULL; result->src[0] = a; result->src[1] = b; - result->src[2] = c; return result; @@ -7245,7 +7099,7 @@ static int64_t ggml_calc_pool_output_size(int64_t ins, int ks, int s, int p) { return (ins + 2 * p - ks) / s + 1; } -// ggml_pool_2d +// ggml_pool_1d struct ggml_tensor* ggml_pool_1d( struct ggml_context * ctx, @@ -7268,18 +7122,12 @@ struct ggml_tensor* ggml_pool_1d( }; struct ggml_tensor* result = ggml_new_tensor(ctx, GGML_TYPE_F32, 2, ne); - ggml_scratch_save(ctx); - struct ggml_tensor* c = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 4); - ((int32_t*)c->data)[0] = op; - ((int32_t*)c->data)[1] = k0; - ((int32_t*)c->data)[2] = s0; - ((int32_t*)c->data)[3] = p0; - ggml_scratch_load(ctx); + int32_t params[] = { op, k0, s0, p0 }; + ggml_set_op_params(result, ¶ms, sizeof(params)); result->op = GGML_OP_POOL_1D; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; result->src[0] = a; - result->src[1] = c; return result; } @@ -7311,21 +7159,12 @@ struct ggml_tensor* ggml_pool_2d( }; struct ggml_tensor* result = ggml_new_tensor(ctx, GGML_TYPE_F32, 3, ne); - ggml_scratch_save(ctx); - struct ggml_tensor* c = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 7); - ((int32_t*)c->data)[0] = op; - ((int32_t*)c->data)[1] = k0; - ((int32_t*)c->data)[2] = k1; - ((int32_t*)c->data)[3] = s0; - ((int32_t*)c->data)[4] = s1; - ((int32_t*)c->data)[5] = p0; - ((int32_t*)c->data)[6] = p1; - ggml_scratch_load(ctx); + int32_t params[] = { op, k0, k1, s0, s1, p0, p1 }; + ggml_set_op_params(result, ¶ms, sizeof(params)); result->op = GGML_OP_POOL_2D; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; result->src[0] = a; - result->src[1] = c; return result; } @@ -7484,21 +7323,12 @@ struct ggml_tensor * ggml_win_part( struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne); - ggml_scratch_save(ctx); - - struct ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 3); - - ((int32_t *) b->data)[0] = npx; - ((int32_t *) b->data)[1] = npy; - ((int32_t *) b->data)[2] = w; - - ggml_scratch_load(ctx); + int32_t params[] = { npx, npy, w }; + ggml_set_op_params(result, ¶ms, sizeof(params)); result->op = GGML_OP_WIN_PART; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; result->src[0] = a; - result->src[1] = NULL; - result->src[2] = b; return result; } @@ -7523,19 +7353,12 @@ struct ggml_tensor * ggml_win_unpart( const int64_t ne[4] = { a->ne[0], w0, h0, 1, }; struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 3, ne); - ggml_scratch_save(ctx); - - struct ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 1); - - ((int32_t *) b->data)[0] = w; - - ggml_scratch_load(ctx); + int32_t params[] = { w }; + ggml_set_op_params(result, ¶ms, sizeof(params)); result->op = GGML_OP_WIN_UNPART; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; result->src[0] = a; - result->src[1] = NULL; - result->src[2] = b; return result; } @@ -7553,19 +7376,13 @@ struct ggml_tensor * ggml_map_unary_impl_f32( is_node = true; } - struct ggml_tensor *result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); + struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); - ggml_scratch_save(ctx); - - struct ggml_tensor * addr_tensor = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, sizeof(void *) / sizeof(int32_t)); - *((void (**)(void))addr_tensor->data) = (void (*)(void))fun; - - ggml_scratch_load(ctx); + ggml_set_op_params(result, (const void *) &fun, sizeof(fun)); result->op = GGML_OP_MAP_UNARY; result->grad = is_node ? 
ggml_dup_tensor(ctx, result) : NULL; result->src[0] = a; - result->src[2] = addr_tensor; return result; } @@ -7600,20 +7417,14 @@ struct ggml_tensor * ggml_map_binary_impl_f32( is_node = true; } - struct ggml_tensor *result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); + struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); - ggml_scratch_save(ctx); - - struct ggml_tensor * addr_tensor = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, sizeof(void *) / sizeof(int32_t)); - *((void (**)(void))addr_tensor->data) = (void (*)(void))fun; - - ggml_scratch_load(ctx); + ggml_set_op_params(result, (const void *) &fun, sizeof(fun)); result->op = GGML_OP_MAP_BINARY; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; result->src[0] = a; result->src[1] = b; - result->src[2] = addr_tensor; return result; } @@ -7647,19 +7458,13 @@ struct ggml_tensor * ggml_map_custom1_impl_f32( is_node = true; } - struct ggml_tensor *result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); + struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); - ggml_scratch_save(ctx); - - struct ggml_tensor * addr_tensor = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, sizeof(void *) / sizeof(int32_t)); - *((void (**)(void))addr_tensor->data) = (void (*)(void))fun; - - ggml_scratch_load(ctx); + ggml_set_op_params(result, (const void *) &fun, sizeof(fun)); result->op = GGML_OP_MAP_CUSTOM1; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; result->src[0] = a; - result->src[2] = addr_tensor; return result; } @@ -7692,20 +7497,14 @@ struct ggml_tensor * ggml_map_custom2_impl_f32( is_node = true; } - struct ggml_tensor *result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); + struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); - ggml_scratch_save(ctx); - - struct ggml_tensor * addr_tensor = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, sizeof(void *) / sizeof(int32_t)); - *((void (**)(void))addr_tensor->data) = (void (*)(void))fun; - - ggml_scratch_load(ctx); + ggml_set_op_params(result, (const void *) &fun, sizeof(fun)); result->op = GGML_OP_MAP_CUSTOM2; result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; result->src[0] = a; result->src[1] = b; - result->src[2] = addr_tensor; return result; } @@ -7741,21 +7540,15 @@ struct ggml_tensor * ggml_map_custom3_impl_f32( is_node = true; } - struct ggml_tensor *result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); + struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); - ggml_scratch_save(ctx); - - struct ggml_tensor * addr_tensor = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, sizeof(void *) / sizeof(int32_t)); - *((void (**)(void))addr_tensor->data) = (void (*)(void))fun; - - ggml_scratch_load(ctx); + ggml_set_op_params(result, (const void *) &fun, sizeof(fun)); result->op = GGML_OP_MAP_CUSTOM3; result->grad = is_node ? 
ggml_dup_tensor(ctx, result) : NULL; result->src[0] = a; result->src[1] = b; - result->src[2] = addr_tensor; - result->src[3] = c; + result->src[2] = c; return result; } @@ -8983,21 +8776,17 @@ static void ggml_compute_forward_acc_f32( const struct ggml_compute_params * params, const struct ggml_tensor * src0, const struct ggml_tensor * src1, - const struct ggml_tensor * opt0, struct ggml_tensor * dst) { GGML_ASSERT(ggml_are_same_shape(src0, dst)); GGML_ASSERT(ggml_is_contiguous(dst) && ggml_is_contiguous(src0)); - GGML_ASSERT(opt0->type == GGML_TYPE_I32); - GGML_ASSERT(ggml_nelements(opt0) == 5); - // view src0 and dst with these strides and data offset inbytes during acc // nb0 is implicitely element_size because src0 and dst are contiguous - size_t nb1 = ((int32_t *) opt0->data)[0]; - size_t nb2 = ((int32_t *) opt0->data)[1]; - size_t nb3 = ((int32_t *) opt0->data)[2]; - size_t offset = ((int32_t *) opt0->data)[3]; - bool inplace = (bool) ((int32_t *) opt0->data)[4]; + size_t nb1 = ((int32_t *) dst->op_params)[0]; + size_t nb2 = ((int32_t *) dst->op_params)[1]; + size_t nb3 = ((int32_t *) dst->op_params)[2]; + size_t offset = ((int32_t *) dst->op_params)[3]; + bool inplace = (bool) ((int32_t *) dst->op_params)[4]; if (!inplace && (params->type == GGML_TASK_INIT)) { // memcpy needs to be synchronized across threads to avoid race conditions. @@ -9066,13 +8855,12 @@ static void ggml_compute_forward_acc( const struct ggml_compute_params * params, const struct ggml_tensor * src0, const struct ggml_tensor * src1, - const struct ggml_tensor * opt0, struct ggml_tensor * dst) { switch (src0->type) { case GGML_TYPE_F32: { - ggml_compute_forward_acc_f32(params, src0, src1, opt0, dst); + ggml_compute_forward_acc_f32(params, src0, src1, dst); } break; case GGML_TYPE_F16: case GGML_TYPE_Q4_0: @@ -11092,21 +10880,17 @@ static void ggml_compute_forward_set_f32( const struct ggml_compute_params * params, const struct ggml_tensor * src0, const struct ggml_tensor * src1, - const struct ggml_tensor * opt0, struct ggml_tensor * dst) { GGML_ASSERT(ggml_are_same_shape(src0, dst)); GGML_ASSERT(ggml_is_contiguous(dst) && ggml_is_contiguous(src0)); - GGML_ASSERT(opt0->type == GGML_TYPE_I32); - GGML_ASSERT(ggml_nelements(opt0) == 5); - // view src0 and dst with these strides and data offset inbytes during set // nb0 is implicitely element_size because src0 and dst are contiguous - size_t nb1 = ((int32_t *) opt0->data)[0]; - size_t nb2 = ((int32_t *) opt0->data)[1]; - size_t nb3 = ((int32_t *) opt0->data)[2]; - size_t offset = ((int32_t *) opt0->data)[3]; - bool inplace = (bool) ((int32_t *) opt0->data)[4]; + size_t nb1 = ((int32_t *) dst->op_params)[0]; + size_t nb2 = ((int32_t *) dst->op_params)[1]; + size_t nb3 = ((int32_t *) dst->op_params)[2]; + size_t offset = ((int32_t *) dst->op_params)[3]; + bool inplace = (bool) ((int32_t *) dst->op_params)[4]; if (!inplace && (params->type == GGML_TASK_INIT)) { // memcpy needs to be synchronized across threads to avoid race conditions. 
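Every compute function touched below follows the same producer/consumer pattern: the graph-construction code packs its small parameters into the fixed-size dst->op_params array via ggml_set_op_params(), and the forward function reads them back from dst->op_params instead of dereferencing an extra parameter tensor. A minimal sketch of that pair, using the RoPE parameters from the hunks above as the example (names as in this patch; an outline, not the exact code):

    // graph construction: pack ints directly, copy floats bit-for-bit
    int32_t params[6] = { n_past, n_dims, mode, n_ctx };
    memcpy(params + 4, &freq_base,  sizeof(float));
    memcpy(params + 5, &freq_scale, sizeof(float));
    ggml_set_op_params(result, params, sizeof(params)); // asserts the size fits GGML_MAX_OP_PARAMS

    // compute function: read the same slots back from dst
    const int n_past = ((int32_t *) dst->op_params)[0];
    const int n_dims = ((int32_t *) dst->op_params)[1];
    const int mode   = ((int32_t *) dst->op_params)[2];
    float freq_base;
    memcpy(&freq_base, (int32_t *) dst->op_params + 4, sizeof(float));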
@@ -11166,13 +10950,12 @@ static void ggml_compute_forward_set( const struct ggml_compute_params * params, const struct ggml_tensor * src0, const struct ggml_tensor * src1, - const struct ggml_tensor * opt0, struct ggml_tensor * dst) { switch (src0->type) { case GGML_TYPE_F32: { - ggml_compute_forward_set_f32(params, src0, src1, opt0, dst); + ggml_compute_forward_set_f32(params, src0, src1, dst); } break; case GGML_TYPE_F16: case GGML_TYPE_Q4_0: @@ -11568,17 +11351,14 @@ static void ggml_compute_forward_diag( static void ggml_compute_forward_diag_mask_f32( const struct ggml_compute_params * params, const struct ggml_tensor * src0, - const struct ggml_tensor * src1, struct ggml_tensor * dst, const float value) { - GGML_ASSERT(src1->type == GGML_TYPE_I32); - GGML_ASSERT(ggml_nelements(src1) == 2); const int ith = params->ith; const int nth = params->nth; - const int n_past = ((int32_t *) src1->data)[0]; - const bool inplace = (bool)((int32_t *) src1->data)[1]; + const int n_past = ((int32_t *) dst->op_params)[0]; + const bool inplace = (bool)((int32_t *) dst->op_params)[1]; GGML_ASSERT(n_past >= 0); @@ -11621,12 +11401,11 @@ static void ggml_compute_forward_diag_mask_f32( static void ggml_compute_forward_diag_mask_inf( const struct ggml_compute_params * params, const struct ggml_tensor * src0, - const struct ggml_tensor * src1, struct ggml_tensor * dst) { switch (src0->type) { case GGML_TYPE_F32: { - ggml_compute_forward_diag_mask_f32(params, src0, src1, dst, -INFINITY); + ggml_compute_forward_diag_mask_f32(params, src0, dst, -INFINITY); } break; default: { @@ -11638,12 +11417,11 @@ static void ggml_compute_forward_diag_mask_inf( static void ggml_compute_forward_diag_mask_zero( const struct ggml_compute_params * params, const struct ggml_tensor * src0, - const struct ggml_tensor * src1, struct ggml_tensor * dst) { switch (src0->type) { case GGML_TYPE_F32: { - ggml_compute_forward_diag_mask_f32(params, src0, src1, dst, 0); + ggml_compute_forward_diag_mask_f32(params, src0, dst, 0); } break; default: { @@ -11841,20 +11619,17 @@ static void ggml_compute_forward_soft_max_back( static void ggml_compute_forward_alibi_f32( const struct ggml_compute_params * params, const struct ggml_tensor * src0, - const struct ggml_tensor * src1, struct ggml_tensor * dst) { assert(params->ith == 0); - GGML_ASSERT(src1->type == GGML_TYPE_I32); - GGML_ASSERT(ggml_nelements(src1) == 3); - if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { return; } - const int n_past = ((int32_t *) src1->data)[0]; - const int n_head = ((int32_t *) src1->data)[1]; - const float max_bias = ((float *) src1->data)[2]; + const int n_past = ((int32_t *) dst->op_params)[0]; + const int n_head = ((int32_t *) dst->op_params)[1]; + float max_bias; + memcpy(&max_bias, (int32_t *) dst->op_params + 2, sizeof(float)); assert(n_past >= 0); @@ -11907,20 +11682,17 @@ static void ggml_compute_forward_alibi_f32( static void ggml_compute_forward_alibi_f16( const struct ggml_compute_params * params, const struct ggml_tensor * src0, - const struct ggml_tensor * src1, struct ggml_tensor * dst) { assert(params->ith == 0); - GGML_ASSERT(src1->type == GGML_TYPE_I32); - GGML_ASSERT(ggml_nelements(src1) == 3); - if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { return; } - const int n_past = ((int32_t *) src1->data)[0]; - const int n_head = ((int32_t *) src1->data)[1]; - const float max_bias = ((float *) src1->data)[2]; + const int n_past = ((int32_t *) dst->op_params)[0]; + const int n_head = ((int32_t *) 
dst->op_params)[1]; + float max_bias; + memcpy(&max_bias, (int32_t *) dst->op_params + 2, sizeof(float)); assert(n_past >= 0); @@ -11973,16 +11745,15 @@ static void ggml_compute_forward_alibi_f16( static void ggml_compute_forward_alibi( const struct ggml_compute_params * params, const struct ggml_tensor * src0, - const struct ggml_tensor * src1, struct ggml_tensor * dst) { switch (src0->type) { case GGML_TYPE_F16: { - ggml_compute_forward_alibi_f16(params, src0, src1, dst); + ggml_compute_forward_alibi_f16(params, src0, dst); } break; case GGML_TYPE_F32: { - ggml_compute_forward_alibi_f32(params, src0, src1, dst); + ggml_compute_forward_alibi_f32(params, src0, dst); } break; case GGML_TYPE_Q4_0: case GGML_TYPE_Q4_1: @@ -12012,19 +11783,17 @@ static void ggml_compute_forward_alibi( static void ggml_compute_forward_clamp_f32( const struct ggml_compute_params * params, const struct ggml_tensor * src0, - const struct ggml_tensor * src1, struct ggml_tensor * dst) { assert(params->ith == 0); - GGML_ASSERT(src1->type == GGML_TYPE_F32); - GGML_ASSERT(ggml_nelements(src1) == 2); - if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { return; } - const float min = ((float *) src1->data)[0]; - const float max = ((float *) src1->data)[1]; + float min; + float max; + memcpy(&min, (float *) dst->op_params + 0, sizeof(float)); + memcpy(&max, (float *) dst->op_params + 1, sizeof(float)); const int ith = params->ith; const int nth = params->nth; @@ -12054,12 +11823,11 @@ static void ggml_compute_forward_clamp_f32( static void ggml_compute_forward_clamp( const struct ggml_compute_params * params, const struct ggml_tensor * src0, - const struct ggml_tensor * src1, struct ggml_tensor * dst) { switch (src0->type) { case GGML_TYPE_F32: { - ggml_compute_forward_clamp_f32(params, src0, src1, dst); + ggml_compute_forward_clamp_f32(params, src0, dst); } break; case GGML_TYPE_F16: case GGML_TYPE_Q4_0: @@ -12089,10 +11857,7 @@ static void ggml_compute_forward_clamp( static void ggml_compute_forward_rope_f32( const struct ggml_compute_params * params, const struct ggml_tensor * src0, - const struct ggml_tensor * src1, struct ggml_tensor * dst) { - GGML_ASSERT(src1->type == GGML_TYPE_I32); - GGML_ASSERT(ggml_nelements(src1) == 6); if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { return; @@ -12101,12 +11866,12 @@ static void ggml_compute_forward_rope_f32( float freq_base; float freq_scale; - const int n_past = ((int32_t *) src1->data)[0]; - const int n_dims = ((int32_t *) src1->data)[1]; - const int mode = ((int32_t *) src1->data)[2]; - const int n_ctx = ((int32_t *) src1->data)[3]; - memcpy(&freq_base, (int32_t *) src1->data + 4, sizeof(float)); - memcpy(&freq_scale, (int32_t *) src1->data + 5, sizeof(float)); + const int n_past = ((int32_t *) dst->op_params)[0]; + const int n_dims = ((int32_t *) dst->op_params)[1]; + const int mode = ((int32_t *) dst->op_params)[2]; + const int n_ctx = ((int32_t *) dst->op_params)[3]; + memcpy(&freq_base, (int32_t *) dst->op_params + 4, sizeof(float)); + memcpy(&freq_scale, (int32_t *) dst->op_params + 5, sizeof(float)); assert(n_past >= 0); @@ -12221,10 +11986,7 @@ static void ggml_compute_forward_rope_f32( static void ggml_compute_forward_rope_f16( const struct ggml_compute_params * params, const struct ggml_tensor * src0, - const struct ggml_tensor * src1, struct ggml_tensor * dst) { - GGML_ASSERT(src1->type == GGML_TYPE_I32); - GGML_ASSERT(ggml_nelements(src1) == 6); if (params->type == GGML_TASK_INIT || params->type == 
GGML_TASK_FINALIZE) { return; @@ -12233,12 +11995,12 @@ static void ggml_compute_forward_rope_f16( float freq_base; float freq_scale; - const int n_past = ((int32_t *) src1->data)[0]; - const int n_dims = ((int32_t *) src1->data)[1]; - const int mode = ((int32_t *) src1->data)[2]; - const int n_ctx = ((int32_t *) src1->data)[3]; - memcpy(&freq_base, (int32_t *) src1->data + 4, sizeof(float)); - memcpy(&freq_scale, (int32_t *) src1->data + 5, sizeof(float)); + const int n_past = ((int32_t *) dst->op_params)[0]; + const int n_dims = ((int32_t *) dst->op_params)[1]; + const int mode = ((int32_t *) dst->op_params)[2]; + const int n_ctx = ((int32_t *) dst->op_params)[3]; + memcpy(&freq_base, (int32_t *) dst->op_params + 4, sizeof(float)); + memcpy(&freq_scale, (int32_t *) dst->op_params + 5, sizeof(float)); assert(n_past >= 0); @@ -12353,16 +12115,15 @@ static void ggml_compute_forward_rope_f16( static void ggml_compute_forward_rope( const struct ggml_compute_params * params, const struct ggml_tensor * src0, - const struct ggml_tensor * src1, struct ggml_tensor * dst) { switch (src0->type) { case GGML_TYPE_F16: { - ggml_compute_forward_rope_f16(params, src0, src1, dst); + ggml_compute_forward_rope_f16(params, src0, dst); } break; case GGML_TYPE_F32: { - ggml_compute_forward_rope_f32(params, src0, src1, dst); + ggml_compute_forward_rope_f32(params, src0, dst); } break; default: { @@ -12376,10 +12137,7 @@ static void ggml_compute_forward_rope( static void ggml_compute_forward_rope_back_f32( const struct ggml_compute_params * params, const struct ggml_tensor * src0, - const struct ggml_tensor * src1, struct ggml_tensor * dst) { - assert(src1->type == GGML_TYPE_I32); - assert(ggml_nelements(src1) == 4); if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { return; @@ -12389,9 +12147,9 @@ static void ggml_compute_forward_rope_back_f32( // dx = rope_back(dy, src1) // src0 is dy, src1 contains options - const int n_past = ((int32_t *) src1->data)[0]; - const int n_dims = ((int32_t *) src1->data)[1]; - const int mode = ((int32_t *) src1->data)[2]; + const int n_past = ((int32_t *) dst->op_params)[0]; + const int n_dims = ((int32_t *) dst->op_params)[1]; + const int mode = ((int32_t *) dst->op_params)[2]; assert(n_past >= 0); @@ -12475,10 +12233,7 @@ static void ggml_compute_forward_rope_back_f32( static void ggml_compute_forward_rope_back_f16( const struct ggml_compute_params * params, const struct ggml_tensor * src0, - const struct ggml_tensor * src1, struct ggml_tensor * dst) { - assert(src1->type == GGML_TYPE_I32); - assert(ggml_nelements(src1) == 3); if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { return; @@ -12488,9 +12243,9 @@ static void ggml_compute_forward_rope_back_f16( // dx = rope_back(dy, src1) // src0 is dy, src1 contains options - const int n_past = ((int32_t *) src1->data)[0]; - const int n_dims = ((int32_t *) src1->data)[1]; - const int mode = ((int32_t *) src1->data)[2]; + const int n_past = ((int32_t *) dst->op_params)[0]; + const int n_dims = ((int32_t *) dst->op_params)[1]; + const int mode = ((int32_t *) dst->op_params)[2]; assert(n_past >= 0); @@ -12574,16 +12329,15 @@ static void ggml_compute_forward_rope_back_f16( static void ggml_compute_forward_rope_back( const struct ggml_compute_params * params, const struct ggml_tensor * src0, - const struct ggml_tensor * src1, struct ggml_tensor * dst) { switch (src0->type) { case GGML_TYPE_F16: { - ggml_compute_forward_rope_back_f16(params, src0, src1, dst); + 
ggml_compute_forward_rope_back_f16(params, src0, dst); } break; case GGML_TYPE_F32: { - ggml_compute_forward_rope_back_f32(params, src0, src1, dst); + ggml_compute_forward_rope_back_f32(params, src0, dst); } break; default: { @@ -12780,7 +12534,7 @@ static void ggml_compute_forward_conv_1d_s1_ph( const struct ggml_compute_params * params, const struct ggml_tensor * src0, const struct ggml_tensor * src1, - struct ggml_tensor * dst) { + struct ggml_tensor * dst) { switch (src0->type) { case GGML_TYPE_F16: { @@ -12983,7 +12737,7 @@ static void ggml_compute_forward_conv_1d_s2_ph( const struct ggml_compute_params * params, const struct ggml_tensor * src0, const struct ggml_tensor * src1, - struct ggml_tensor * dst) { + struct ggml_tensor * dst) { switch (src0->type) { case GGML_TYPE_F16: { @@ -13003,14 +12757,13 @@ static void ggml_compute_forward_conv_1d_s2_ph( // ggml_compute_forward_conv_1d static void ggml_compute_forward_conv_1d( - const struct ggml_compute_params * params, - const struct ggml_tensor * src0, - const struct ggml_tensor * src1, - const struct ggml_tensor * opt0, - struct ggml_tensor * dst) { - const int32_t s0 = ((const int32_t*)(opt0->data))[0]; - const int32_t p0 = ((const int32_t*)(opt0->data))[1]; - const int32_t d0 = ((const int32_t*)(opt0->data))[2]; + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + const struct ggml_tensor * src1, + struct ggml_tensor * dst) { + const int32_t s0 = ((const int32_t*)(dst->op_params))[0]; + const int32_t p0 = ((const int32_t*)(dst->op_params))[1]; + const int32_t d0 = ((const int32_t*)(dst->op_params))[2]; GGML_ASSERT(d0 == 1); // dilation not supported GGML_ASSERT(p0 == src0->ne[0]/2); // only half padding supported if (s0 == 1) { @@ -13028,7 +12781,6 @@ static void ggml_compute_forward_conv_2d_f16_f32( const struct ggml_compute_params * params, const struct ggml_tensor * src0, const struct ggml_tensor * src1, - const struct ggml_tensor * opt0, struct ggml_tensor * dst) { GGML_ASSERT(src0->type == GGML_TYPE_F16); GGML_ASSERT(src1->type == GGML_TYPE_F32); @@ -13048,12 +12800,12 @@ static void ggml_compute_forward_conv_2d_f16_f32( // size of the convolution row - the kernel size unrolled across all channels const int ew0 = nk0*nk1*ne02; - const int32_t s0 = ((const int32_t*)(opt0->data))[0]; - const int32_t s1 = ((const int32_t*)(opt0->data))[1]; - const int32_t p0 = ((const int32_t*)(opt0->data))[2]; - const int32_t p1 = ((const int32_t*)(opt0->data))[3]; - const int32_t d0 = ((const int32_t*)(opt0->data))[4]; - const int32_t d1 = ((const int32_t*)(opt0->data))[5]; + const int32_t s0 = ((const int32_t*)(dst->op_params))[0]; + const int32_t s1 = ((const int32_t*)(dst->op_params))[1]; + const int32_t p0 = ((const int32_t*)(dst->op_params))[2]; + const int32_t p1 = ((const int32_t*)(dst->op_params))[3]; + const int32_t d0 = ((const int32_t*)(dst->op_params))[4]; + const int32_t d1 = ((const int32_t*)(dst->op_params))[5]; GGML_ASSERT(nb00 == sizeof(ggml_fp16_t)); GGML_ASSERT(nb10 == sizeof(float)); @@ -13125,17 +12877,15 @@ static void ggml_compute_forward_conv_2d( const struct ggml_compute_params * params, const struct ggml_tensor * src0, const struct ggml_tensor * src1, - const struct ggml_tensor * opt0, - struct ggml_tensor * dst - ) { + struct ggml_tensor * dst) { switch (src0->type) { case GGML_TYPE_F16: { - ggml_compute_forward_conv_2d_f16_f32(params, src0, src1, opt0, dst); + ggml_compute_forward_conv_2d_f16_f32(params, src0, src1, dst); } break; case GGML_TYPE_F32: { - 
//ggml_compute_forward_conv_2d_f32(params, src0, src1, opt0, dst); + //ggml_compute_forward_conv_2d_f32(params, src0, src1, dst); GGML_ASSERT(false); } break; default: @@ -13200,12 +12950,11 @@ static void ggml_compute_forward_pool_1d_sk_p0( // ggml_compute_forward_pool_1d static void ggml_compute_forward_pool_1d( - const struct ggml_compute_params* params, - const struct ggml_tensor* src0, - const struct ggml_tensor* opt0, - struct ggml_tensor* dst) { - GGML_ASSERT(opt0->ne[0] == 4); - const int* opts = (const int*)opt0->data; + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + struct ggml_tensor * dst) { + + const int32_t* opts = (const int32_t*)dst->op_params; enum ggml_op_pool op = opts[0]; const int k0 = opts[1]; const int s0 = opts[2]; @@ -13219,12 +12968,12 @@ static void ggml_compute_forward_pool_1d( // ggml_compute_forward_pool_2d_sk_p0 static void ggml_compute_forward_pool_2d_sk_p0( - const struct ggml_compute_params * params, - const enum ggml_op_pool op, - const struct ggml_tensor * src, - const int k0, - const int k1, - struct ggml_tensor * dst) { + const struct ggml_compute_params * params, + const enum ggml_op_pool op, + const struct ggml_tensor * src, + const int k0, + const int k1, + struct ggml_tensor * dst) { assert(src->type == GGML_TYPE_F32); assert(params->ith == 0); @@ -13284,12 +13033,11 @@ static void ggml_compute_forward_pool_2d_sk_p0( // ggml_compute_forward_pool_2d static void ggml_compute_forward_pool_2d( - const struct ggml_compute_params * params, - const struct ggml_tensor * src0, - const struct ggml_tensor * opt0, - struct ggml_tensor * dst) { - GGML_ASSERT(opt0->ne[0] == 7); - const int* opts = (const int*)opt0->data; + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + struct ggml_tensor * dst) { + + const int32_t * opts = (const int32_t *)dst->op_params; enum ggml_op_pool op = opts[0]; const int k0 = opts[1]; const int k1 = opts[2]; @@ -13314,7 +13062,7 @@ static void ggml_compute_forward_flash_attn_f32( const struct ggml_tensor * k, const struct ggml_tensor * v, const bool masked, - struct ggml_tensor * dst) { + struct ggml_tensor * dst) { int64_t t0 = ggml_perf_time_us(); UNUSED(t0); @@ -13492,7 +13240,7 @@ static void ggml_compute_forward_flash_attn_f16( const struct ggml_tensor * k, const struct ggml_tensor * v, const bool masked, - struct ggml_tensor * dst) { + struct ggml_tensor * dst) { int64_t t0 = ggml_perf_time_us(); UNUSED(t0); @@ -14257,7 +14005,6 @@ static void ggml_compute_forward_flash_attn_back( static void ggml_compute_forward_win_part_f32( const struct ggml_compute_params * params, const struct ggml_tensor * src0, - const struct ggml_tensor * opt0, struct ggml_tensor * dst) { if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { return; @@ -14266,9 +14013,9 @@ static void ggml_compute_forward_win_part_f32( GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne); GGML_TENSOR_LOCALS(int64_t, ne, dst, ne); - const int32_t nep0 = ((const int32_t *)(opt0->data))[0]; - const int32_t nep1 = ((const int32_t *)(opt0->data))[1]; - const int32_t w = ((const int32_t *)(opt0->data))[2]; + const int32_t nep0 = ((const int32_t *)(dst->op_params))[0]; + const int32_t nep1 = ((const int32_t *)(dst->op_params))[1]; + const int32_t w = ((const int32_t *)(dst->op_params))[2]; assert(ne00 == ne0); assert(ne3 == nep0*nep1); @@ -14302,12 +14049,11 @@ static void ggml_compute_forward_win_part_f32( static void ggml_compute_forward_win_part( const struct ggml_compute_params * params, const 
struct ggml_tensor * src0, - const struct ggml_tensor * opt0, struct ggml_tensor * dst) { switch (src0->type) { case GGML_TYPE_F32: { - ggml_compute_forward_win_part_f32(params, src0, opt0, dst); + ggml_compute_forward_win_part_f32(params, src0, dst); } break; default: { @@ -14321,7 +14067,6 @@ static void ggml_compute_forward_win_part( static void ggml_compute_forward_win_unpart_f32( const struct ggml_compute_params * params, const struct ggml_tensor * src0, - const struct ggml_tensor * opt0, struct ggml_tensor * dst) { if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) { return; @@ -14330,7 +14075,7 @@ static void ggml_compute_forward_win_unpart_f32( GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne); GGML_TENSOR_LOCALS(int64_t, ne, dst, ne); - const int32_t w = ((const int32_t *)(opt0->data))[0]; + const int32_t w = ((const int32_t *)(dst->op_params))[0]; // padding const int px = (w - ne1%w)%w; @@ -14364,12 +14109,11 @@ static void ggml_compute_forward_win_unpart_f32( static void ggml_compute_forward_win_unpart( const struct ggml_compute_params * params, const struct ggml_tensor * src0, - const struct ggml_tensor * opt0, struct ggml_tensor * dst) { switch (src0->type) { case GGML_TYPE_F32: { - ggml_compute_forward_win_unpart_f32(params, src0, opt0, dst); + ggml_compute_forward_win_unpart_f32(params, src0, dst); } break; default: { @@ -14888,7 +14632,7 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm } break; case GGML_OP_ACC: { - ggml_compute_forward_acc(params, tensor->src[0], tensor->src[1], tensor->src[2], tensor); + ggml_compute_forward_acc(params, tensor->src[0], tensor->src[1], tensor); } break; case GGML_OP_SUB: { @@ -15008,7 +14752,7 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm } break; case GGML_OP_SET: { - ggml_compute_forward_set(params, tensor->src[0], tensor->src[1], tensor->src[2], tensor); + ggml_compute_forward_set(params, tensor->src[0], tensor->src[1], tensor); } break; case GGML_OP_CPY: { @@ -15048,11 +14792,11 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm } break; case GGML_OP_DIAG_MASK_INF: { - ggml_compute_forward_diag_mask_inf(params, tensor->src[0], tensor->src[1], tensor); + ggml_compute_forward_diag_mask_inf(params, tensor->src[0], tensor); } break; case GGML_OP_DIAG_MASK_ZERO: { - ggml_compute_forward_diag_mask_zero(params, tensor->src[0], tensor->src[1], tensor); + ggml_compute_forward_diag_mask_zero(params, tensor->src[0], tensor); } break; case GGML_OP_SOFT_MAX: { @@ -15064,35 +14808,35 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm } break; case GGML_OP_ROPE: { - ggml_compute_forward_rope(params, tensor->src[0], tensor->src[1], tensor); + ggml_compute_forward_rope(params, tensor->src[0], tensor); } break; case GGML_OP_ROPE_BACK: { - ggml_compute_forward_rope_back(params, tensor->src[0], tensor->src[1], tensor); + ggml_compute_forward_rope_back(params, tensor->src[0], tensor); } break; case GGML_OP_ALIBI: { - ggml_compute_forward_alibi(params, tensor->src[0], tensor->src[1], tensor); + ggml_compute_forward_alibi(params, tensor->src[0], tensor); } break; case GGML_OP_CLAMP: { - ggml_compute_forward_clamp(params, tensor->src[0], tensor->src[1], tensor); + ggml_compute_forward_clamp(params, tensor->src[0], tensor); } break; case GGML_OP_CONV_1D: { - ggml_compute_forward_conv_1d(params, tensor->src[0], tensor->src[1], tensor->src[2], tensor); + ggml_compute_forward_conv_1d(params, tensor->src[0], 
tensor->src[1], tensor); } break; case GGML_OP_CONV_2D: { - ggml_compute_forward_conv_2d(params, tensor->src[0], tensor->src[1], tensor->src[2], tensor); + ggml_compute_forward_conv_2d(params, tensor->src[0], tensor->src[1], tensor); } break; case GGML_OP_POOL_1D: { - ggml_compute_forward_pool_1d(params, tensor->src[0], tensor->src[1], tensor); + ggml_compute_forward_pool_1d(params, tensor->src[0], tensor); } break; case GGML_OP_POOL_2D: { - ggml_compute_forward_pool_2d(params, tensor->src[0], tensor->src[1], tensor); + ggml_compute_forward_pool_2d(params, tensor->src[0], tensor); } break; case GGML_OP_FLASH_ATTN: { @@ -15114,40 +14858,45 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm } break; case GGML_OP_WIN_PART: { - ggml_compute_forward_win_part(params, tensor->src[0], tensor->src[2], tensor); + ggml_compute_forward_win_part(params, tensor->src[0], tensor); } break; case GGML_OP_WIN_UNPART: { - ggml_compute_forward_win_unpart(params, tensor->src[0], tensor->src[2], tensor); + ggml_compute_forward_win_unpart(params, tensor->src[0], tensor); } break; case GGML_OP_MAP_UNARY: { - const ggml_unary_op_f32_t fun = *((ggml_unary_op_f32_t *)tensor->src[2]->data); + ggml_unary_op_f32_t fun; + memcpy(&fun, tensor->op_params, sizeof(fun)); ggml_compute_forward_map_unary(params, tensor->src[0], tensor, fun); } break; case GGML_OP_MAP_BINARY: { - const ggml_binary_op_f32_t fun = *((ggml_binary_op_f32_t *)tensor->src[2]->data); + ggml_binary_op_f32_t fun; + memcpy(&fun, tensor->op_params, sizeof(fun)); ggml_compute_forward_map_binary(params, tensor->src[0], tensor->src[1], tensor, fun); } break; case GGML_OP_MAP_CUSTOM1: { - const ggml_custom1_op_f32_t fun = *((ggml_custom1_op_f32_t *)tensor->src[2]->data); + ggml_custom1_op_f32_t fun; + memcpy(&fun, tensor->op_params, sizeof(fun)); ggml_compute_forward_map_custom1(params, tensor->src[0], tensor, fun); } break; case GGML_OP_MAP_CUSTOM2: { - const ggml_custom2_op_f32_t fun = *((ggml_custom2_op_f32_t *)tensor->src[2]->data); + ggml_custom2_op_f32_t fun; + memcpy(&fun, tensor->op_params, sizeof(fun)); ggml_compute_forward_map_custom2(params, tensor->src[0], tensor->src[1], tensor, fun); } break; case GGML_OP_MAP_CUSTOM3: { - const ggml_custom3_op_f32_t fun = *((ggml_custom3_op_f32_t *)tensor->src[2]->data); - ggml_compute_forward_map_custom3(params, tensor->src[0], tensor->src[1], tensor->src[3], tensor, fun); + ggml_custom3_op_f32_t fun; + memcpy(&fun, tensor->op_params, sizeof(fun)); + ggml_compute_forward_map_custom3(params, tensor->src[0], tensor->src[1], tensor->src[2], tensor, fun); } break; case GGML_OP_CROSS_ENTROPY_LOSS: @@ -15211,12 +14960,10 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor src0->grad = ggml_add_impl(ctx, src0->grad, tensor->grad, inplace); } if (src1->grad) { - GGML_ASSERT(ggml_nelements(tensor->src[2]) == 5); - GGML_ASSERT(tensor->src[2]->type == GGML_TYPE_I32); - const size_t nb1 = (( int32_t * ) tensor->src[2]->data)[0]; - const size_t nb2 = (( int32_t * ) tensor->src[2]->data)[1]; - const size_t nb3 = (( int32_t * ) tensor->src[2]->data)[2]; - const size_t offset = (( int32_t * ) tensor->src[2]->data)[3]; + const size_t nb1 = ((int32_t *) tensor->op_params)[0]; + const size_t nb2 = ((int32_t *) tensor->op_params)[1]; + const size_t nb3 = ((int32_t *) tensor->op_params)[2]; + const size_t offset = ((int32_t *) tensor->op_params)[3]; struct ggml_tensor * tensor_grad_view = ggml_view_4d(ctx, tensor->grad, @@ -15524,12 +15271,10 @@ static void 
ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor } break; case GGML_OP_SET: { - GGML_ASSERT(ggml_nelements(tensor->src[2]) == 5); - GGML_ASSERT(tensor->src[2]->type == GGML_TYPE_I32); - const size_t nb1 = (( int32_t * ) tensor->src[2]->data)[0]; - const size_t nb2 = (( int32_t * ) tensor->src[2]->data)[1]; - const size_t nb3 = (( int32_t * ) tensor->src[2]->data)[2]; - const size_t offset = (( int32_t * ) tensor->src[2]->data)[3]; + const size_t nb1 = ((int32_t *) tensor->op_params)[0]; + const size_t nb2 = ((int32_t *) tensor->op_params)[1]; + const size_t nb3 = ((int32_t *) tensor->op_params)[2]; + const size_t offset = ((int32_t *) tensor->op_params)[3]; struct ggml_tensor * tensor_grad_view = NULL; @@ -15606,8 +15351,7 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor if (src0->grad) { size_t offset; - GGML_ASSERT(sizeof(offset) <= ggml_nbytes(tensor->src[2])); - memcpy(&offset, tensor->src[2]->data, sizeof(offset)); + memcpy(&offset, tensor->op_params, sizeof(offset)); size_t nb1 = tensor->nb[1]; size_t nb2 = tensor->nb[2]; @@ -15634,7 +15378,7 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor { // necessary for llama if (src0->grad) { - int32_t * axes = (int32_t *) tensor->src[2]->data; + int32_t * axes = (int32_t *) tensor->op_params; int axis0 = axes[0] & 0x3; int axis1 = axes[1] & 0x3; int axis2 = axes[2] & 0x3; @@ -15690,9 +15434,7 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor { // necessary for llama if (src0->grad) { - assert(src1->type == GGML_TYPE_I32); - assert(ggml_nelements(src1) == 2); - const int n_past = ((int32_t *) src1->data)[0]; + const int n_past = ((int32_t *) tensor->op_params)[0]; src0->grad = ggml_add_impl(ctx, src0->grad, ggml_diag_mask_zero_impl(ctx, tensor->grad, n_past, false), @@ -15706,9 +15448,7 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor { // necessary for llama if (src0->grad) { - assert(src1->type == GGML_TYPE_I32); - assert(ggml_nelements(src1) == 2); - const int n_past = ((int32_t *) src1->data)[0]; + const int n_past = ((int32_t *) tensor->op_params)[0]; src0->grad = ggml_add_impl(ctx, src0->grad, ggml_diag_mask_zero_impl(ctx, tensor->grad, n_past, false), @@ -15737,12 +15477,10 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor { // necessary for llama if (src0->grad) { - assert(src1->type == GGML_TYPE_I32); - assert(ggml_nelements(src1) == 6); - const int n_past = ((int32_t *) src1->data)[0]; - const int n_dims = ((int32_t *) src1->data)[1]; - const int mode = ((int32_t *) src1->data)[2]; - const int n_ctx = ((int32_t *) src1->data)[3]; + const int n_past = ((int32_t *) tensor->op_params)[0]; + const int n_dims = ((int32_t *) tensor->op_params)[1]; + const int mode = ((int32_t *) tensor->op_params)[2]; + const int n_ctx = ((int32_t *) tensor->op_params)[3]; src0->grad = ggml_add_impl(ctx, src0->grad, ggml_rope_back(ctx, @@ -15760,12 +15498,10 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor case GGML_OP_ROPE_BACK: { if (src0->grad) { - assert(src1->type == GGML_TYPE_I32); - assert(ggml_nelements(src1) == 4); - const int n_past = ((int32_t *) src1->data)[0]; - const int n_dims = ((int32_t *) src1->data)[1]; - const int mode = ((int32_t *) src1->data)[2]; - const int n_ctx = ((int32_t *) src1->data)[3]; + const int n_past = ((int32_t *) tensor->op_params)[0]; + const int n_dims = ((int32_t *) tensor->op_params)[1]; + const 
int mode = ((int32_t *) tensor->op_params)[2]; + const int n_ctx = ((int32_t *) tensor->op_params)[3]; src0->grad = ggml_add_impl(ctx, src0->grad, ggml_rope(ctx, @@ -16543,9 +16279,6 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) { case GGML_OP_GET_ROWS_BACK: case GGML_OP_DIAG: case GGML_OP_DIAG_MASK_ZERO: - { - n_tasks = 1; - } break; case GGML_OP_DIAG_MASK_INF: case GGML_OP_SOFT_MAX: case GGML_OP_SOFT_MAX_BACK: @@ -17289,7 +17022,7 @@ struct ggml_cgraph ggml_graph_import(const char * fname, struct ggml_context ** tensor = ggml_view_4d(*ctx_eval, args[0], ne[0], ne[1], ne[2], ne[3], 0, 0, 0, 0); uint64_t offs; - memcpy(&offs, args[2]->data, sizeof(offs)); + memcpy(&offs, tensor->op_params, sizeof(offs)); tensor->data = ((char *) tensor->data) + offs; } break; diff --git a/ggml.h b/ggml.h index 5023b1652..871c85a89 100644 --- a/ggml.h +++ b/ggml.h @@ -199,6 +199,7 @@ #define GGML_MAX_CONTEXTS 64 #define GGML_MAX_SRC 6 #define GGML_MAX_NAME 48 +#define GGML_MAX_OP_PARAMS 32 #define GGML_DEFAULT_N_THREADS 4 @@ -418,6 +419,9 @@ extern "C" { // compute data enum ggml_op op; + // op params - allocated as int32_t for alignment + int32_t op_params[GGML_MAX_OP_PARAMS / sizeof(uint32_t)]; + bool is_param; struct ggml_tensor * grad; From 3602ac4255fd4cd589821346290b464a64b955d1 Mon Sep 17 00:00:00 2001 From: slaren Date: Sun, 23 Jul 2023 15:19:39 +0200 Subject: [PATCH 12/14] fix n_tasks (#2342) ggml-ci --- ggml.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/ggml.c b/ggml.c index 747a39241..9ee4a8d7f 100644 --- a/ggml.c +++ b/ggml.c @@ -16278,6 +16278,9 @@ struct ggml_cplan ggml_graph_plan(struct ggml_cgraph * cgraph, int n_threads) { case GGML_OP_GET_ROWS: case GGML_OP_GET_ROWS_BACK: case GGML_OP_DIAG: + { + n_tasks = 1; + } break; case GGML_OP_DIAG_MASK_ZERO: case GGML_OP_DIAG_MASK_INF: case GGML_OP_SOFT_MAX: From 57921ca6db53e08eb90010fba99add85be28b5a1 Mon Sep 17 00:00:00 2001 From: wzy <32936898+Freed-Wu@users.noreply.github.com> Date: Sun, 23 Jul 2023 21:33:02 +0800 Subject: [PATCH 13/14] common : n_threads == -1 uses std::thread::hardware_concurrency() (#2347) * Fix #2345, fix incorrect n_threads * Update examples/common.cpp --------- Co-authored-by: Georgi Gerganov --- examples/common.cpp | 3 +++ 1 file changed, 3 insertions(+) diff --git a/examples/common.cpp b/examples/common.cpp index 5608ca87f..7a1928f2b 100644 --- a/examples/common.cpp +++ b/examples/common.cpp @@ -117,6 +117,9 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) { break; } params.n_threads = std::stoi(argv[i]); + if (params.n_threads <= 0) { + params.n_threads = std::thread::hardware_concurrency(); + } } else if (arg == "-p" || arg == "--prompt") { if (++i >= argc) { invalid_param = true; From 70d26ac3883009946e737525506fa88f52727132 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Johannes=20G=C3=A4=C3=9Fler?= Date: Sun, 23 Jul 2023 17:49:06 +0200 Subject: [PATCH 14/14] Fix __dp4a documentation (#2348) --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index c9fe6187b..a0e0ea2e0 100644 --- a/README.md +++ b/README.md @@ -401,7 +401,7 @@ Building the program with BLAS support may lead to some performance improvements | Option | Legal values | Default | Description | |-------------------------|------------------------|---------|-------------| - | LLAMA_CUDA_FORCE_DMMV | Boolean | false | Force the use of dequantization + matrix vector multiplication kernels instead of using kernels that do matrix vector multiplication on 
quantized data. By default the decision is made based on compute capability (MMVQ for 7.0/Turing/RTX 2000 or higher). Does not affect k-quants. | + | LLAMA_CUDA_FORCE_DMMV | Boolean | false | Force the use of dequantization + matrix vector multiplication kernels instead of using kernels that do matrix vector multiplication on quantized data. By default the decision is made based on compute capability (MMVQ for 6.1/Pascal/GTX 1000 or higher). Does not affect k-quants. | | LLAMA_CUDA_DMMV_X | Positive integer >= 32 | 32 | Number of values in x direction processed by the CUDA dequantization + matrix vector multiplication kernel per iteration. Increasing this value can improve performance on fast GPUs. Power of 2 heavily recommended. Does not affect k-quants. | | LLAMA_CUDA_MMV_Y | Positive integer | 1 | Block size in y direction for the CUDA mul mat vec kernels. Increasing this value can improve performance on fast GPUs. Power of 2 recommended. Does not affect k-quants. | | LLAMA_CUDA_DMMV_F16 | Boolean | false | If enabled, use half-precision floating point arithmetic for the CUDA dequantization + mul mat vec kernels. Can improve performance on relatively recent GPUs. |
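The dominant change in the compute hunks above is mechanical: per-operator parameters (n_past, n_dims, mode, clamp bounds, convolution strides, pooling options, and so on) are no longer passed in through an auxiliary `src1`/`opt0` tensor but are read straight from the destination tensor's fixed-size `op_params` array added in ggml.h. The sketch below is a minimal, self-contained illustration of that packing/unpacking pattern; the toy struct and helper names are invented for the example and are not part of the ggml API.

    #include <stdint.h>
    #include <string.h>
    #include <stdio.h>

    #define MAX_OP_PARAMS 32  /* mirrors GGML_MAX_OP_PARAMS */

    /* simplified stand-in for ggml_tensor: only the op_params slot matters here;
     * as in ggml.h, the array is declared as int32_t so it is naturally aligned */
    struct toy_tensor {
        int32_t op_params[MAX_OP_PARAMS / sizeof(int32_t)];
    };

    /* integer parameters are stored directly in the int32_t slots */
    static void set_i32(struct toy_tensor * t, int idx, int32_t v) {
        t->op_params[idx] = v;
    }

    /* float parameters are stored bit-for-bit with memcpy, the same way the
     * rope/clamp/alibi kernels in this patch read freq_base, min/max and max_bias */
    static void set_f32(struct toy_tensor * t, int idx, float v) {
        memcpy(&t->op_params[idx], &v, sizeof(float));
    }

    int main(void) {
        struct toy_tensor dst = {0};

        /* rope-style layout: n_past, n_dims, mode, n_ctx, freq_base, freq_scale */
        set_i32(&dst, 0, 512);       /* n_past     */
        set_i32(&dst, 1, 128);       /* n_dims     */
        set_i32(&dst, 2, 0);         /* mode       */
        set_i32(&dst, 3, 2048);      /* n_ctx      */
        set_f32(&dst, 4, 10000.0f);  /* freq_base  */
        set_f32(&dst, 5, 1.0f);      /* freq_scale */

        /* a compute kernel unpacks them exactly as the patched forward functions do */
        const int n_past = ((int32_t *) dst.op_params)[0];
        const int n_ctx  = ((int32_t *) dst.op_params)[3];

        float freq_base;
        memcpy(&freq_base, (int32_t *) dst.op_params + 4, sizeof(float));

        printf("n_past=%d n_ctx=%d freq_base=%.1f\n", n_past, n_ctx, freq_base);
        return 0;
    }

Note that float parameters are copied out with memcpy rather than read through a dereferenced `(float *)` cast: since the backing storage is now an `int32_t` array rather than a separate F32 tensor, a direct dereference would be a strict-aliasing violation, while the memcpy typically compiles down to a plain load.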