Merge branch 'master' into concedo_experimental
This commit is contained in:
commit
507939c135
2 changed files with 10 additions and 1 deletions
|
@@ -1512,6 +1512,14 @@ static void ggml_cuda_op(const ggml_tensor * src0, const ggml_tensor * src1, ggm
|
|||
i01_high = row_high % ne01;
|
||||
}
|
||||
}
|
||||
|
||||
+ // There is possibly a bug in the Windows nvcc compiler regarding instruction reordering or optimizing out local variables.
|
||||
+ // Removing the first assert or changing the order of the arguments causes the second assert to fail.
|
||||
+ // Removing both asserts results in i01_high becoming 0 which in turn results in garbage output.
|
||||
+ // The root cause seems to be a problem with i0_offset_high becoming 0 when it should always be >0 (for single GPU).
|
||||
+ GGML_ASSERT(i01_low == 0 || g_device_count > 1);
|
||||
+ GGML_ASSERT(i01_high == ne01 || g_device_count > 1);
|
||||
|
||||
const int64_t i01_diff = i01_high - i01_low;
|
||||
if (i01_diff == 0) {
|
||||
continue;
|
||||
|
@@ -1727,6 +1735,7 @@ void ggml_cuda_load_data(const char * fname, struct ggml_tensor * tensor, const
|
|||
row_low -= row_low % GGML_CUDA_DMMV_Y;
|
||||
row_high = id == g_device_count - 1 ? nrows : nrows*g_tensor_split[id + 1];
|
||||
row_high -= row_high % GGML_CUDA_DMMV_Y;
|
||||
+ GGML_ASSERT(nrows % GGML_CUDA_DMMV_Y == 0);
|
||||
} else {
|
||||
GGML_ASSERT(false);
|
||||
}
|
||||
|
|
|
@@ -89,7 +89,7 @@ kernel void kernel_gelu(
|
|||
device float * dst,
|
||||
uint tpig[[thread_position_in_grid]]) {
|
||||
float x = src0[tpig];
|
||||
- dst[tpig] = 0.5f*x*(1.0f + tanhf(SQRT_2_OVER_PI*x*(1.0f + GELU_COEF_A*x*x)));
|
||||
+ dst[tpig] = 0.5f*x*(1.0f + tanh(SQRT_2_OVER_PI*x*(1.0f + GELU_COEF_A*x*x)));
|
||||
}
|
||||
|
||||
kernel void kernel_soft_max(
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue