From 6487e4b9db8612c08ecc51747a1205067bfee2e4 Mon Sep 17 00:00:00 2001 From: wenxizhu Date: Wed, 7 Jul 2021 14:17:04 +0800 Subject: [PATCH 1/2] Fix the integer overflow problem of the scatter_nd op. --- python/tvm/topi/cuda/scatter.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/python/tvm/topi/cuda/scatter.py b/python/tvm/topi/cuda/scatter.py index cee13d7e01a2..d260cfae4de3 100644 --- a/python/tvm/topi/cuda/scatter.py +++ b/python/tvm/topi/cuda/scatter.py @@ -802,8 +802,12 @@ def gen_ir(data_ptr, indices_ptr, updates_ptr, out_ptr): ib.scope_attr(bx, "thread_extent", bdim) with ib.for_range(0, ceil_div(fused_shape, bdim)) as i: - index = i * fused_updates_dimension + bx * tdim + tx - with ib.if_scope(index < fused_shape): + # The (i * fused_updates_dimension + bx*tdim + tx < fused_shape) + # would cause an int32 overflow when i is a very big number. + # So we use "i < (fused_shape - bx*tdim - tx) / fused_updates_dimension" + # here to avoid the error. + index = tvm.tir.Div(fused_shape - bx*tdim - tx, fused_updates_dimension) + with ib.if_scope(i < index): out[index] = data[index] with ib.for_range(0, fused_indices_dimension) as i: From cd30a24ecd2b7ad5ab687bad0e27604616d975b5 Mon Sep 17 00:00:00 2001 From: wenxizhu Date: Fri, 9 Jul 2021 11:43:52 +0800 Subject: [PATCH 2/2] Fix scatter_nd's crash problem: 1. Existing scatter_nd cuda implementation has a very large bound, which could overflow int32 range when input tensor shape is large enough; 2. The overflow could cause the if statement always evaluate to true, thus conducts invalid memory accesses; 3. We fix this problem in this commit by reducing the bound, the original large bound is not only unnecessary, but also degrading the performance; With this fix, scatter_op's performance improves 100x on some cases. --- python/tvm/topi/cuda/scatter.py | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/python/tvm/topi/cuda/scatter.py b/python/tvm/topi/cuda/scatter.py index d260cfae4de3..c697b648786e 100644 --- a/python/tvm/topi/cuda/scatter.py +++ b/python/tvm/topi/cuda/scatter.py @@ -801,13 +801,11 @@ def gen_ir(data_ptr, indices_ptr, updates_ptr, out_ptr): bdim = ceil_div(fused_updates_dimension, tdim) ib.scope_attr(bx, "thread_extent", bdim) - with ib.for_range(0, ceil_div(fused_shape, bdim)) as i: - # The (i * fused_updates_dimension + bx*tdim + tx < fused_shape) - # would cause an int32 overflow when i is a very big number. - # So we use "i < (fused_shape - bx*tdim - tx) / fused_updates_dimension" - # here to avoid the error. - index = tvm.tir.Div(fused_shape - bx*tdim - tx, fused_updates_dimension) - with ib.if_scope(i < index): + # Copy data into the output. This loop writes to the same portions of + # memory as the following loop, so we do not need a memory sync. + with ib.for_range(0, ceil_div(fused_shape, fused_updates_dimension), name="i") as i: + index = i * fused_updates_dimension + bx * tdim + tx + with ib.if_scope(bx * tdim + tx < fused_updates_dimension): out[index] = data[index] with ib.for_range(0, fused_indices_dimension) as i: