From b33dee282f5d8032b5f780152732dc45cbf2d349 Mon Sep 17 00:00:00 2001 From: Georgi Gerganov Date: Fri, 9 Jun 2023 11:11:04 +0300 Subject: [PATCH 1/2] metal : fix build "tanhf" -> "tanh" --- ggml-metal.metal | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ggml-metal.metal b/ggml-metal.metal index 745fe8ad3..c94ef83f9 100644 --- a/ggml-metal.metal +++ b/ggml-metal.metal @@ -89,7 +89,7 @@ kernel void kernel_gelu( device float * dst, uint tpig[[thread_position_in_grid]]) { float x = src0[tpig]; - dst[tpig] = 0.5f*x*(1.0f + tanhf(SQRT_2_OVER_PI*x*(1.0f + GELU_COEF_A*x*x))); + dst[tpig] = 0.5f*x*(1.0f + tanh(SQRT_2_OVER_PI*x*(1.0f + GELU_COEF_A*x*x))); } kernel void kernel_soft_max( From ae9663f1887513e152839e91f61c513075a19422 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Johannes=20G=C3=A4=C3=9Fler?= Date: Fri, 9 Jun 2023 13:58:15 +0200 Subject: [PATCH 2/2] Windows nvcc workaround (#1753) Fix gibberish output on Windows when using CUDA --- ggml-cuda.cu | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/ggml-cuda.cu b/ggml-cuda.cu index b1e513bc9..a62f26e1e 100644 --- a/ggml-cuda.cu +++ b/ggml-cuda.cu @@ -1512,6 +1512,14 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm i01_high = row_high % ne01; } } + + // There is possibly a bug in the Windows nvcc compiler regarding instruction reordering or optimizing out local variables. + // Removing the first assert or changing the order of the arguments causes the second assert to fail. + // Removing both asserts results in i01_high becoming 0 which in turn results in garbage output. + // The root cause seems to be a problem with i0_offset_high becoming 0 when it should always be >0 (for single GPU). + GGML_ASSERT(i01_low == 0 || g_device_count > 1); + GGML_ASSERT(i01_high == ne01 || g_device_count > 1); + const int64_t i01_diff = i01_high - i01_low; if (i01_diff == 0) { continue; @@ -1727,6 +1735,7 @@ void ggml_cuda_load_data(const char * fname, struct ggml_tensor * tensor, const row_low -= row_low % GGML_CUDA_DMMV_Y; row_high = id == g_device_count - 1 ? nrows : nrows*g_tensor_split[id + 1]; row_high -= row_high % GGML_CUDA_DMMV_Y; + GGML_ASSERT(nrows % GGML_CUDA_DMMV_Y == 0); } else { GGML_ASSERT(false); }