[SYCL] Fix SYCL im2col
and convert
Overflow with Large Dims (#9052)
* sycl: fix im2col overflow and sync with cuda Signed-off-by: zhentaoyu <zhentao.yu@intel.com> * sycl: fix convert overflow Signed-off-by: zhentaoyu <zhentao.yu@intel.com> * sycl: fix convert and dequantize Signed-off-by: zhentaoyu <zhentao.yu@intel.com> * sycl: fix ib in dmmv Signed-off-by: zhentaoyu <zhentao.yu@intel.com> * sycl:refine convert Signed-off-by: zhentaoyu <zhentao.yu@intel.com> * sycl: move downsample global_range into common Signed-off-by: zhentaoyu <zhentao.yu@intel.com> * test: add im2col and convert test cases Signed-off-by: zhentaoyu <zhentao.yu@intel.com> * test: make new cases only in sycl Signed-off-by: zhentaoyu <zhentao.yu@intel.com> * test: comment new test_cases for only local testing Signed-off-by: zhentaoyu <zhentao.yu@intel.com> --------- Signed-off-by: zhentaoyu <zhentao.yu@intel.com>
This commit is contained in:
parent
90db8146d5
commit
4f8d19ff17
11 changed files with 333 additions and 262 deletions
|
@ -2145,6 +2145,13 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op
|
|||
test_cases.emplace_back(new test_im2col(GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_F16, {3000, 128, 1, 1}, {3, 128, 1280, 1}, 1, 0, 1, 0, 1, 0, false));
|
||||
test_cases.emplace_back(new test_im2col(GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_F32, {3000, 128, 1, 1}, {3, 128, 1280, 1}, 1, 0, 1, 0, 1, 0, false));
|
||||
|
||||
// sycl backend will limit task global_range < MAX_INT
|
||||
// test cases for 2D im2col with large input W and H (occurs in stable-diffusion)
|
||||
// however these cases need to alloc more memory which may fail in some devices (Intel Arc770, etc.)
|
||||
// these cases are verified (pass) in Intel(R) Data Center GPU Max 1100 (sycl backend) and NV A30 (cuda backend)
|
||||
// test_cases.emplace_back(new test_im2col(GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_F16, {1024, 1024, 256, 1}, {3, 3, 256, 1}, 1, 1, 1, 1, 1, 1, true));
|
||||
// test_cases.emplace_back(new test_im2col(GGML_TYPE_F32, GGML_TYPE_F16, GGML_TYPE_F32, {1024, 1024, 256, 1}, {3, 3, 256, 1}, 1, 1, 1, 1, 1, 1, true));
|
||||
|
||||
test_cases.emplace_back(new test_conv_transpose_1d());
|
||||
test_cases.emplace_back(new test_conv_transpose_1d({3,2,1,1}, {2,3,2,1}, 3, 0, 1));
|
||||
test_cases.emplace_back(new test_conv_transpose_1d({3,2,1,1}, {2,3,2,1}, 2, 0, 1));
|
||||
|
@ -2287,6 +2294,12 @@ static bool test_backend(ggml_backend_t backend, test_mode mode, const char * op
|
|||
test_cases.emplace_back(new test_mul_mat(GGML_TYPE_F16, GGML_TYPE_F32, 64, 45, 128, { 8, 1}, {4, 1}));
|
||||
test_cases.emplace_back(new test_mul_mat(GGML_TYPE_F16, GGML_TYPE_F32, 128, 45, 64, { 8, 1}, {4, 1}));
|
||||
|
||||
// sycl backend will limit task global_range < MAX_INT
|
||||
// test case for f16-type-convert-to-fp32 kernel with large k under fp32 compute dtype (occurs in stable-diffusion)
|
||||
// however this case needs to alloc more memory which may fail in some devices (Intel Arc770, etc.)
|
||||
// this case is verified (pass) in Intel(R) Data Center GPU Max 1100 (sycl backend) and NV A30 (cuda backend)
|
||||
// test_cases.emplace_back(new test_mul_mat(GGML_TYPE_F16, GGML_TYPE_F16, 512, 262144, 9216, {1, 1}, {1, 1}));
|
||||
|
||||
for (ggml_type type_a : base_types) {
|
||||
for (ggml_type type_b : {GGML_TYPE_F32 /*, GGML_TYPE_F16 */}) {
|
||||
for (int n_mats : {4, 8}) {
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue