From 6487e4b9db8612c08ecc51747a1205067bfee2e4 Mon Sep 17 00:00:00 2001
From: wenxizhu <wenxizhu@tencent.com>
Date: Wed, 7 Jul 2021 14:17:04 +0800
Subject: [PATCH 1/2] Fix the integer overflow problem of the scatter_nd op.

---
 python/tvm/topi/cuda/scatter.py | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/python/tvm/topi/cuda/scatter.py b/python/tvm/topi/cuda/scatter.py
index cee13d7e01a2..d260cfae4de3 100644
--- a/python/tvm/topi/cuda/scatter.py
+++ b/python/tvm/topi/cuda/scatter.py
@@ -802,8 +802,12 @@ def gen_ir(data_ptr, indices_ptr, updates_ptr, out_ptr):
         ib.scope_attr(bx, "thread_extent", bdim)
 
         with ib.for_range(0, ceil_div(fused_shape, bdim)) as i:
-            index = i * fused_updates_dimension + bx * tdim + tx
-            with ib.if_scope(index < fused_shape):
+            # The (i * fused_updates_dimension + bx*tdim + tx < fused_shape)
+            # would cause an int32 overflow when i is a very big number.
+            # So we use "i < (fused_shape - bx*tdim - tx) / fused_updates_dimension"
+            # here to avoid the error.
+            index = tvm.tir.Div(fused_shape - bx*tdim - tx, fused_updates_dimension)
+            with ib.if_scope(i < index):
                 out[index] = data[index]
 
         with ib.for_range(0, fused_indices_dimension) as i:

From cd30a24ecd2b7ad5ab687bad0e27604616d975b5 Mon Sep 17 00:00:00 2001
From: wenxizhu <wenxizhu@tencent.com>
Date: Fri, 9 Jul 2021 11:43:52 +0800
Subject: [PATCH 2/2] Fix scatter_nd's crash problem: 1. Existing scatter_nd
 cuda implementation has a very large bound,    which could overflow int32
 range when input tensor shape is    large enough; 2. The overflow could cause
 the if statement always evaluate to    true, thus conducts invalid memory
 accesses; 3. We fix this problem in this commit by reducing the bound, the   
 original large bound is not only unnecessary, but also degrading    the
 performance; With this fix, scatter_op's performance improves    100x on some
 cases.

---
 python/tvm/topi/cuda/scatter.py | 12 +++++-------
 1 file changed, 5 insertions(+), 7 deletions(-)

diff --git a/python/tvm/topi/cuda/scatter.py b/python/tvm/topi/cuda/scatter.py
index d260cfae4de3..c697b648786e 100644
--- a/python/tvm/topi/cuda/scatter.py
+++ b/python/tvm/topi/cuda/scatter.py
@@ -801,13 +801,11 @@ def gen_ir(data_ptr, indices_ptr, updates_ptr, out_ptr):
         bdim = ceil_div(fused_updates_dimension, tdim)
         ib.scope_attr(bx, "thread_extent", bdim)
 
-        with ib.for_range(0, ceil_div(fused_shape, bdim)) as i:
-            # The (i * fused_updates_dimension + bx*tdim + tx < fused_shape)
-            # would cause an int32 overflow when i is a very big number.
-            # So we use "i < (fused_shape - bx*tdim - tx) / fused_updates_dimension"
-            # here to avoid the error.
-            index = tvm.tir.Div(fused_shape - bx*tdim - tx, fused_updates_dimension)
-            with ib.if_scope(i < index):
+        # Copy data into the output. This loop writes to the same portions of
+        # memory as the following loop, so we do not need a memory sync.
+        with ib.for_range(0, ceil_div(fused_shape, fused_updates_dimension), name="i") as i:
+            index = i * fused_updates_dimension + bx * tdim + tx
+            with ib.if_scope(bx * tdim + tx < fused_updates_dimension):
                 out[index] = data[index]
 
         with ib.for_range(0, fused_indices_dimension) as i: