From 08a44943918cffd40538d83624f63f95e7387fc2 Mon Sep 17 00:00:00 2001
From: Egor Churaev <egor.churaev@gmail.com>
Date: Wed, 21 Jun 2023 17:31:19 +0300
Subject: [PATCH 1/4] [Relay] Introduce arguments limit to FuseOps pass

In PR #8313 a parameter `max_function_args` was introduced. It leads to
limit number of function argument and in case when this value is
exceeded then concatenation layer is split to a several concat
operations.

I faced a problem on Adreno GPU that for kernel with big number of
arguments the enqueueNDRange was crashed without any errors. The
problem appeared because of the huge number of arguments. But in this
case not only concat layer was a root cause of the problem. Also after
fusing several operations the final functions had a big number of
arguments.

As it was discussed in #8313, adding a limitation on the number of
function arguments to the FuseOps pass might be a good improvement. In
this PR I introduced such mechanism for limitation number of function
arguments for FuseOps pass and add an arguments limit to OpenCL devices
at 128 parameters.

The idea of current approach is calculate the number of arguments for
each node in fusing algorithm and in case then the number of function
arguments exceeds the limit, specified by `max_function_args`, then the
fusing should be stopped. In case when node has several inputs and for
some of the inputs the number of arguments wasn't computed, then we
postpone fusing for this node and will try fuse this node later when
the number of arguments will be computed for all inputs. This approach
with postponed fusing helps to avoid additional computations during
compilation.

Additionally, case of dynamic shapes should be handled.  In case of
dynamic shape, function arguments also included sizes of dynamic
dimension and strides. The number of strides can be computed by
calculating number of tensor dimensions (the number of strides equals
to the rank of the tensor). The number of additional parameters with
sizes of dynamic dimensions can be calculated by computing number of
dynamic dimensions.
---
 include/tvm/relay/transform.h                 |   5 +-
 include/tvm/topi/transform.h                  |   2 +-
 python/tvm/relay/op/tensor.py                 |   9 +-
 python/tvm/relay/transform/transform.py       |   8 +-
 python/tvm/target/target.py                   |   2 +-
 src/relay/analysis/graph_partitioner.cc       | 136 ++++++++++++
 src/relay/analysis/graph_partitioner.h        |  44 +++-
 src/relay/backend/build_module.cc             |   2 +-
 src/relay/backend/vm/compiler.cc              |   7 +
 src/relay/transforms/fuse_ops.cc              |  23 +-
 src/relay/transforms/split_args.cc            |  95 +++++---
 src/target/source/codegen_metal.cc            |   4 +-
 src/target/target_kind.cc                     |   5 +
 tests/python/relay/test_pass_fuse_ops.py      | 116 ++++++++++
 tests/python/relay/test_pass_split_args.py    |  96 ++++----
 .../unittest/test_target_codegen_opencl.py    | 210 +++++++++++++++++-
 16 files changed, 670 insertions(+), 94 deletions(-)

diff --git a/include/tvm/relay/transform.h b/include/tvm/relay/transform.h
index 5f591f1d89ad..da4d05f0e63e 100644
--- a/include/tvm/relay/transform.h
+++ b/include/tvm/relay/transform.h
@@ -120,9 +120,12 @@ TVM_DLL Pass FoldConstant(bool fold_qnn = false);
 /*!
  * \brief Split function with huge number of arguments to smaller pieces.
  *
+ * \param max_function_args Maximum number of function arguments. If it is 0 then SplitArgs won't
+ *                          split function.
+ *
  * \return The pass.
  */
-TVM_DLL Pass SplitArgs(int max_function_args);
+TVM_DLL Pass SplitArgs(uint64_t max_function_args);
 
 /*!
  * \brief Fuse operations into expr into separate functions.
diff --git a/include/tvm/topi/transform.h b/include/tvm/topi/transform.h
index cab3466765b4..ee2e9c6114fd 100644
--- a/include/tvm/topi/transform.h
+++ b/include/tvm/topi/transform.h
@@ -722,7 +722,7 @@ inline te::Tensor dynamic_strided_slice(const te::Tensor& x, const te::Tensor& b
 }
 
 /*!
- * \brief Calcluate the output shape of strided_slice, the entry point for Relay type relation
+ * \brief Calculate the output shape of strided_slice, the entry point for Relay type relation
  *
  * \param ishape The input tensor shape
  * \param begin The indices to begin with in the slicing
diff --git a/python/tvm/relay/op/tensor.py b/python/tvm/relay/op/tensor.py
index 6b488719eb84..26caa4584c79 100644
--- a/python/tvm/relay/op/tensor.py
+++ b/python/tvm/relay/op/tensor.py
@@ -23,7 +23,7 @@
 
 from . import _make
 from .dyn import _make as _dyn_make
-from ..expr import Tuple, Expr, Constant
+from ..expr import Tuple, Expr, Constant, Call
 from . import op as reg
 
 
@@ -1141,12 +1141,15 @@ def concatenate(data, axis):
     result: relay.Expr
         The concatenated tensor.
     """
-    data = list(data)
+    if not isinstance(data, Call):
+        data = list(data)
     if not data:
         raise ValueError("relay.concatenate requires data to be non-empty.")
+    if not isinstance(data, Call):
+        data = Tuple(data)
     if not isinstance(axis, int):
         raise ValueError("For now, we only support integer axis")
-    return _make.concatenate(Tuple(data), axis)
+    return _make.concatenate(data, axis)
 
 
 def einsum(data, equation):
diff --git a/python/tvm/relay/transform/transform.py b/python/tvm/relay/transform/transform.py
index b8af0518b29c..c162164daace 100644
--- a/python/tvm/relay/transform/transform.py
+++ b/python/tvm/relay/transform/transform.py
@@ -1376,10 +1376,16 @@ def ToMixedPrecision(mixed_precision_type="float16", missing_op_mode=1):
 def SplitArgs(max_function_args):
     """Split function with huge number of arguments to smaller pieces.
 
+    Parameters
+    ----------
+    max_function_args: int
+      Maximum number of function arguments. If it is 0 then SplitArgs won't split function.
+
+
     Returns
     -------
     ret : tvm.transform.Pass
-        The registered pass for constant folding.
+        The registered pass.
     """
     return _ffi_api.SplitArgs(max_function_args)
 
diff --git a/python/tvm/target/target.py b/python/tvm/target/target.py
index 0c834c5f026e..09ee0ac20dfe 100644
--- a/python/tvm/target/target.py
+++ b/python/tvm/target/target.py
@@ -194,7 +194,7 @@ def max_shared_memory_per_block(self):
 
     @property
     def max_function_args(self):
-        return int(self.attrs.get("max_function_args", -1))
+        return int(self.attrs.get("max_function_args", 0))
 
     @property
     def vtcm_capacity(self):
diff --git a/src/relay/analysis/graph_partitioner.cc b/src/relay/analysis/graph_partitioner.cc
index 861fd58d9e5c..cf397fc3e8f7 100644
--- a/src/relay/analysis/graph_partitioner.cc
+++ b/src/relay/analysis/graph_partitioner.cc
@@ -169,6 +169,7 @@ void GraphPartitioner::MergeFromTo(Group* child, Group* parent) {
   if (child == parent) return;
   // update the number of nodes of the parent group
   parent->num_nodes += child->num_nodes;
+  parent->args_num += child->args_num;
   child->parent = parent;
   // update anchor ref and pattern
   if (child->anchor_ref != nullptr) {
@@ -180,6 +181,10 @@ void GraphPartitioner::MergeFromTo(Group* child, Group* parent) {
 
 void GraphPartitioner::CommitFuse_(IndexedForwardGraph::Node* src, IndexedForwardGraph::Node* sink,
                                    Group* target) {
+  if (postpone_node_ != nullptr) {
+    postponed_fusing_map_.insert({postpone_node_, src});
+    return;
+  }
   if (src == sink) return;
   if (visited_.count(src)) return;
   visited_.insert(src);
@@ -220,7 +225,113 @@ size_t GraphPartitioner::CountFusedNodesWithNewChild(IndexedForwardGraph::Node*
   return target->FindRoot()->num_nodes + CountNodesUptoSink_(child, dom_parent);
 }
 
+size_t GraphPartitioner::CountAdditionalArgs_(const TensorTypeNode* ttype, bool with_strides) {
+  size_t any_dims = 0;
+  for (const auto& dim : ttype->shape) {
+    if (dim.as<AnyNode>()) {
+      any_dims++;
+    }
+  }
+  if (with_strides && any_dims > 0) any_dims += ttype->shape.size();
+  return any_dims;
+}
+
+size_t GraphPartitioner::CountArgs_(IndexedForwardGraph::Node* src,
+                                    const IndexedForwardGraph& graph, bool update_postpone) {
+  std::unordered_set<Group*> visited_groups;
+  Group* gnode = groups_[src->index];
+  ICHECK(gnode != nullptr);
+  auto sum = gnode->args_num;
+  visited_groups.insert(gnode->FindRoot());
+  auto calcArgs = [this, src, &graph, &visited_groups,
+                   update_postpone](const relay::Expr& arg) -> size_t {
+    if (arg.as<VarNode>()) return 0;
+    auto* node = graph.node_map.at(arg.get());
+    Group* prev_group = groups_[node->index]->FindRoot();
+    if (visited_groups.count(prev_group) == 0) {
+      visited_groups.insert(prev_group);
+      if (prev_group->args_num > 0) {
+        // Get number of arguments from group
+        return prev_group->args_num;
+      } else if (update_postpone) {
+        // Update pointer to node which should be postponed for deferred fusing
+        postpone_node_ = src;
+      } else {
+        // Calculate number of arguments for the node which wasn't processed before
+        return CountArgs_(node, graph, update_postpone);
+      }
+    }
+    return 0;
+  };
+  if (auto call_node = GetRef<ObjectRef>(src->ref).as<CallNode>()) {
+    for (auto& it : call_node->args) {
+      sum += calcArgs(it);
+    }
+  } else if (auto tuple_node = GetRef<ObjectRef>(src->ref).as<TupleNode>()) {
+    for (auto& it : tuple_node->fields) {
+      sum += calcArgs(it);
+    }
+  }
+  return sum;
+}
+
+size_t GraphPartitioner::CountArgsLimit_(const IndexedForwardGraph::Node* child) {
+  auto* outputs_list = child->outputs.head;
+  size_t output_args = 0;
+  while (outputs_list != nullptr) {
+    output_args++;
+    if (auto call_node = GetRef<ObjectRef>(outputs_list->value.node->ref).as<CallNode>()) {
+      if (const auto* ttype = call_node->checked_type().as<TensorTypeNode>()) {
+        output_args += CountAdditionalArgs_(ttype, false);
+      }
+    }
+    outputs_list = outputs_list->next;
+  }
+  return (max_function_args_ > output_args) ? max_function_args_ - output_args : 0;
+}
+
+size_t GraphPartitioner::CountFusedArgs(const IndexedForwardGraph& graph,
+                                        IndexedForwardGraph::Node* child) {
+  size_t args_num = 0;
+  auto* outputs_list = child->outputs.head;
+  while (outputs_list != nullptr) {
+    args_num = std::max(args_num, CountArgs_(outputs_list->value.node, graph));
+    outputs_list = outputs_list->next;
+  }
+  return args_num;
+}
+
 void GraphPartitioner::InitGroups(const IndexedForwardGraph& graph) {
+  auto args_counter = [this](const tvm::Object* obj) {
+    size_t args_num = 0;
+    if (auto call_node = GetRef<ObjectRef>(obj).as<CallNode>()) {
+      for (auto& it : call_node->args) {
+        if (it.as<VarNode>() || it.as<TupleGetItemNode>()) {
+          args_num++;
+          if (const auto* ttype = it.as<ExprNode>()->checked_type().as<TensorTypeNode>()) {
+            args_num += CountAdditionalArgs_(ttype);
+          }
+        }
+      }
+    } else if (auto tuple_node = GetRef<ObjectRef>(obj).as<TupleNode>()) {
+      for (auto& it : tuple_node->fields) {
+        if (it.as<VarNode>() || it.as<TupleGetItemNode>()) {
+          args_num++;
+          if (const auto* ttype = it.as<ExprNode>()->checked_type().as<TensorTypeNode>()) {
+            args_num += CountAdditionalArgs_(ttype);
+          }
+        }
+      }
+    } else if (GetRef<ObjectRef>(obj).as<VarNode>()) {
+      args_num++;
+      if (const auto* ttype =
+              GetRef<ObjectRef>(obj).as<ExprNode>()->checked_type().as<TensorTypeNode>()) {
+        args_num += CountAdditionalArgs_(ttype);
+      }
+    }
+    return args_num;
+  };
+
   groups_.resize(graph.post_dfs_order.size());
   for (size_t nid = 0; nid < groups_.size(); ++nid) {
     const auto* graph_node = graph.post_dfs_order[nid];
@@ -231,6 +342,7 @@ void GraphPartitioner::InitGroups(const IndexedForwardGraph& graph) {
     if (group_node->pattern == relay::kOutEWiseFusable) {
       group_node->anchor_ref = graph_node->ref;
     }
+    group_node->args_num = args_counter(graph_node->ref);
     groups_[nid] = group_node;
   }
 }
@@ -244,6 +356,21 @@ void GraphPartitioner::RunFuse(const IndexedForwardGraph& graph,    //
     auto* dom_node = post_dom_tree.nodes[nid];
     Group* group_node = groups_[nid];
     ICHECK(group_node != nullptr);
+    postpone_node_ = nullptr;
+    // Check if fusing of some inputs was postponed
+    if (postponed_fusing_map_.count(graph_node)) {
+      auto range = postponed_fusing_map_.equal_range(graph_node);
+      for (auto it = range.first; it != range.second; ++it) {
+        // If number of arguments is less than limit then the input can be fused
+        if (CountArgs_(graph_node, graph, false) <= CountArgsLimit_(graph_node)) {
+          auto* src = it->second;
+          auto* snode = post_dom_tree.nodes[src->index]->parent->gnode;
+          if (groups_[snode->index]->anchor_ref != nullptr) continue;
+          CommitFuse(src, snode);
+        }
+      }
+      postponed_fusing_map_.erase(graph_node);
+    }
     // no actions for opaque nodes
     if (group_node->pattern == kOpaque) continue;
     // no actions needed if the current node have no dominator
@@ -254,6 +381,15 @@ void GraphPartitioner::RunFuse(const IndexedForwardGraph& graph,    //
     // refuse the fusion if too many ops are going to be fused together
     if (CountFusedNodesWithNewChild(graph_node, dom_node->parent->gnode) > max_fuse_depth_)
       continue;
+    // refuse the fusion if too many arguments are going to be in fused function
+    if (max_function_args_ > 0) {
+      auto limit = CountArgsLimit_(graph_node);
+      if (limit > 0) {
+        if (CountFusedArgs(graph, graph_node) > limit) {
+          continue;
+        }
+      }
+    }
 
     if (phase == 2) {
       // Fuse injective ops into intermediate tuples, if any
diff --git a/src/relay/analysis/graph_partitioner.h b/src/relay/analysis/graph_partitioner.h
index 9433aafa119d..0319a37b5dbd 100644
--- a/src/relay/analysis/graph_partitioner.h
+++ b/src/relay/analysis/graph_partitioner.h
@@ -78,7 +78,7 @@ class IndexedForwardGraph {
   std::vector<Node*> post_dfs_order;
 
   /*! \brief Dump the graph into string. */
-  void DebugDump() {
+  void DebugDump() const {
     std::ostringstream os;
     for (size_t i = 0; i < post_dfs_order.size(); ++i) {
       Node* node = post_dfs_order[i];
@@ -162,8 +162,12 @@ class DominatorTree {
  */
 class GraphPartitioner {
  public:
-  explicit GraphPartitioner(support::Arena* arena, int opt_level, size_t max_fuse_depth)
-      : arena_(arena), opt_level_(opt_level), max_fuse_depth_(max_fuse_depth) {}
+  explicit GraphPartitioner(support::Arena* arena, int opt_level, size_t max_fuse_depth,
+                            size_t max_function_args)
+      : arena_(arena),
+        opt_level_(opt_level),
+        max_fuse_depth_(max_fuse_depth),
+        max_function_args_(max_function_args) {}
   /*!
    * \brief Group as a union find data structure.
    */
@@ -183,6 +187,10 @@ class GraphPartitioner {
      * \brief The number of nodes belonging to this group
      */
     uint32_t num_nodes{1};
+    /*!
+     * \brief The number of function arguments belonging to this group
+     */
+    size_t args_num{0};
 
     /*! \brief Optional attributes to annotate the grouped function. */
     runtime::Map<runtime::String, ObjectRef> attrs;
@@ -205,10 +213,21 @@ class GraphPartitioner {
   int opt_level_;
   /*! \brief The maximum number of operations in one fused function */
   size_t max_fuse_depth_;
+  /*! \brief The maximum number of arguments in one fused function */
+  size_t max_function_args_;
   /*! \brief The internal groups. */
   std::vector<Group*> groups_;
   /*! \brief internal field used for deduplication */
   std::unordered_set<IndexedForwardGraph::Node*> visited_;
+  /*! \brief The map with nodes which were postponed for fusing. */
+  std::unordered_multimap<const IndexedForwardGraph::Node*, IndexedForwardGraph::Node*>
+      postponed_fusing_map_;
+  /*!
+   * \brief Fusing of this node should be postponed till all child nodes will be evaluated.
+   *        It is used to calculate number of arguments which will be passed to this node in
+   * generated function.
+   */
+  const IndexedForwardGraph::Node* postpone_node_{nullptr};
   // Internal implementation of CheckPath
   template <typename F>
   bool CheckPath_(IndexedForwardGraph::Node* src, IndexedForwardGraph::Node* sink, F fcond);
@@ -247,6 +266,23 @@ class GraphPartitioner {
   void CommitFuse(IndexedForwardGraph::Node* src, IndexedForwardGraph::Node* sink);
 
   size_t CountNodesUptoSink_(IndexedForwardGraph::Node* src, IndexedForwardGraph::Node* sink);
+  // Count the number of additional arguments. In case of dynamic shape,
+  // generated function takes several additional arguments, such as size of
+  // dynamic dimension and strides.
+  // This function calculates number of such additional arguments.
+  size_t CountAdditionalArgs_(const TensorTypeNode* ttype, bool with_strides = true);
+  // Calculate the number of arguments for the node.
+  size_t CountArgs_(IndexedForwardGraph::Node* src, const IndexedForwardGraph& graph,
+                    bool update_postpone = true);
+  // Count actual limit of arguments for a generated function.
+  // max_function_args_ specifies the number of maximum function arguments. But
+  // usually, output tensors also passed to the function as arguments.
+  // Additionally, in case of dynamic shape, it is necessary to take into
+  // account the number of parameters which specifies the size of dynamic
+  // dimension.
+  // This function computes limit of arguments by the following formula:
+  // limit = max_function_args_ - output_args_count
+  size_t CountArgsLimit_(const IndexedForwardGraph::Node* child);
 
   // Count the number of nodes in a fused subgraph if child is additionally fused.
   // dom_parent is already known to be a part of the subgraph.
@@ -256,6 +292,8 @@ class GraphPartitioner {
   // is important for correct calculation.
   size_t CountFusedNodesWithNewChild(IndexedForwardGraph::Node* child,
                                      IndexedForwardGraph::Node* dom_parent);
+  // Count the number of arguments in a fused subgraph if the output of child is additionally fused.
+  size_t CountFusedArgs(const IndexedForwardGraph& graph, IndexedForwardGraph::Node* child);
 
   // Initialize the groups.
   void InitGroups(const IndexedForwardGraph& graph);
diff --git a/src/relay/backend/build_module.cc b/src/relay/backend/build_module.cc
index abb39a65679e..83c252d831c5 100644
--- a/src/relay/backend/build_module.cc
+++ b/src/relay/backend/build_module.cc
@@ -337,7 +337,7 @@ class RelayBuildModule : public runtime::ModuleNode {
     if (config_->optional_homogeneous_target.defined()) {
       // This pass currently only supports the homogeneous case.
       pass_seqs.push_back(transform::SplitArgs(
-          config_->optional_homogeneous_target->GetAttr<Integer>("max_function_args", -1)
+          config_->optional_homogeneous_target->GetAttr<Integer>("max_function_args", 0)
               .value()
               .IntValue()));
     }
diff --git a/src/relay/backend/vm/compiler.cc b/src/relay/backend/vm/compiler.cc
index cb79970b25fc..c5b6c7f2f040 100644
--- a/src/relay/backend/vm/compiler.cc
+++ b/src/relay/backend/vm/compiler.cc
@@ -1059,6 +1059,13 @@ IRModule VMCompiler::OptimizeModuleImpl(IRModule mod) {
   // Always plan devices so the remaining passes don't need to distinguish homogeneous vs
   // heterogeneous execution.
   pass_seqs.push_back(transform::PlanDevices(config_));
+  if (config_->optional_homogeneous_target.defined()) {
+    // This pass currently only supports the homogeneous case.
+    pass_seqs.push_back(transform::SplitArgs(
+        config_->optional_homogeneous_target->GetAttr<Integer>("max_function_args", 0)
+            .value()
+            .IntValue()));
+  }
 
   pass_seqs.push_back(transform::FuseOps());
 
diff --git a/src/relay/transforms/fuse_ops.cc b/src/relay/transforms/fuse_ops.cc
index 9c0d38b11587..ee005aa17052 100644
--- a/src/relay/transforms/fuse_ops.cc
+++ b/src/relay/transforms/fuse_ops.cc
@@ -319,9 +319,10 @@ class IndexedForwardGraphCreator : private ExprVisitor {
 
 class FuseMutator : private MixedModeMutator {
  public:
-  FuseMutator(int fuse_opt_level, size_t max_fuse_depth, bool link_params)
+  FuseMutator(int fuse_opt_level, size_t max_fuse_depth, size_t max_function_args, bool link_params)
       : fuse_opt_level_(fuse_opt_level),
         max_fuse_depth_(max_fuse_depth),
+        max_function_args_(max_function_args),
         link_params_(link_params) {}
 
   // Run the transform
@@ -334,7 +335,8 @@ class FuseMutator : private MixedModeMutator {
   Expr Transform(const Expr& body, int fuse_opt_level, size_t max_fuse_depth, bool link_params) {
     // setup the group map.
     auto graph = IndexedForwardGraphCreator::Create(&arena_, body);
-    auto groups = GraphPartitioner(&arena_, fuse_opt_level, max_fuse_depth).Partition(graph);
+    auto groups = GraphPartitioner(&arena_, fuse_opt_level, max_fuse_depth, max_function_args_)
+                      .Partition(graph);
     for (size_t nid = 0; nid < graph.post_dfs_order.size(); ++nid) {
       ICHECK(graph.post_dfs_order[nid]->ref != nullptr);
       gmap_[graph.post_dfs_order[nid]->ref] = groups[nid];
@@ -347,6 +349,7 @@ class FuseMutator : private MixedModeMutator {
  private:
   int fuse_opt_level_;
   size_t max_fuse_depth_;
+  size_t max_function_args_;
   bool link_params_;
 
   using MixedModeMutator::VisitExpr_;
@@ -548,9 +551,10 @@ class FuseMutator : private MixedModeMutator {
   }
 };
 
-Expr FuseOps(const Expr& expr, int fuse_opt_level, size_t max_fuse_depth, bool link_params,
-             const IRModule& module) {
-  return FuseMutator(fuse_opt_level, max_fuse_depth, link_params).Transform(expr);
+Expr FuseOps(const Expr& expr, int fuse_opt_level, size_t max_fuse_depth, size_t max_function_args,
+             bool link_params, const IRModule& module) {
+  return FuseMutator(fuse_opt_level, max_fuse_depth, max_function_args, link_params)
+      .Transform(expr);
 }
 
 namespace transform {
@@ -567,8 +571,13 @@ Pass FuseOps(int fuse_opt_level) {
         link_params = pc->GetConfig("relay.FuseOps.link_params", Bool(link_params)).value();
         int opt_level = fuse_opt_level == -1 ? pc->opt_level : fuse_opt_level;
         auto max_fuse_depth = pc->GetConfig("relay.FuseOps.max_depth", Integer(kMaxFusedOps));
-        return Downcast<Function>(
-            FuseOps(f, opt_level, max_fuse_depth.value().IntValue(), link_params, m));
+        auto target = Target::Current();
+        size_t max_function_args =
+            (target.defined())
+                ? target->GetAttr<Integer>("max_function_args", Integer(0)).value().IntValue()
+                : 0;
+        return Downcast<Function>(FuseOps(f, opt_level, max_fuse_depth.value().IntValue(),
+                                          max_function_args, link_params, m));
       };
   return CreateFunctionPass(pass_func, 0, "FuseOps", {"InferType"});
 }
diff --git a/src/relay/transforms/split_args.cc b/src/relay/transforms/split_args.cc
index 00b9a3be3b2e..6ef404ee814d 100644
--- a/src/relay/transforms/split_args.cc
+++ b/src/relay/transforms/split_args.cc
@@ -31,58 +31,101 @@ namespace relay {
 
 class ArgumentSplitter : public ExprRewriter {
  public:
-  explicit ArgumentSplitter(int max_function_args)
+  explicit ArgumentSplitter(size_t max_function_args)
       : max_function_args_(max_function_args), concat_op_(Op::Get("concatenate")) {}
 
+  Expr ConcatSplitter(const TupleNode* tuple_node, const tvm::Array<relay::Expr>& args, int axis,
+                      size_t limit) {
+    tvm::Array<relay::Expr> new_args;
+    size_t added_args = 0;
+    for (const auto& it : args) {
+      size_t curr_args = 1;
+      if (const auto* ttype = it->checked_type().as<TensorTypeNode>()) {
+        ICHECK(additional_args_cache_.count(ttype));
+        curr_args += additional_args_cache_[ttype];
+      }
+      if (added_args + curr_args > limit) {
+        Tuple new_tuple = WithFields(GetRef<Tuple>(tuple_node), new_args);
+        Expr stop = StopFusion(new_tuple);
+        Expr lastExpr = MakeConcatenate(stop, axis);
+        new_args.clear();
+        new_args.push_back(lastExpr);
+        added_args = curr_args;
+      }
+      added_args += curr_args;
+      new_args.push_back(it);
+    }
+    Tuple new_tuple = WithFields(GetRef<Tuple>(tuple_node), new_args);
+    Expr stop = StopFusion(new_tuple);
+    Expr lastExpr = MakeConcatenate(stop, axis);
+    return lastExpr;
+  }
+
+  // In case of dynamic shape in tensor, size of any_dims and strides are passed as function args
+  size_t CalculateNumberOfAdditionalArgs_(const TensorTypeNode* arg, bool isOutput = false) {
+    size_t num = 0;
+    for (const auto& dim : arg->shape) {
+      if (dim.as<AnyNode>()) {
+        num++;
+      }
+    }
+    // In case of dynamic shape also strides will be passed to function
+    // as arguments. Number of strides equals to the rank of the tensor.
+    if (num > 0 && isOutput)
+      return arg->shape.size();
+    else if (num > 0)
+      num += arg->shape.size();
+    return num;
+  }
+
   Expr Rewrite_(const CallNode* call, const Expr& post) final {
-    if (max_function_args_ < 0) return post;
+    if (max_function_args_ == 0) return post;
     if (call->op == concat_op_) {
       auto tuple_node = call->args[0].as<TupleNode>();
+      if (tuple_node == nullptr) return post;
       const auto param = call->attrs.as<ConcatenateAttrs>();
-      int outputsNum = 1;
+      size_t outputsNum = 1;
       if (const auto* tuple_type = call->checked_type().as<TupleTypeNode>()) {
         outputsNum = tuple_type->fields.size();
+        for (const auto& it : tuple_type->fields) {
+          if (const auto* ttype = it.as<TensorTypeNode>()) {
+            outputsNum += CalculateNumberOfAdditionalArgs_(ttype, true);
+          }
+        }
+      } else if (const auto* ttype = call->checked_type().as<TensorTypeNode>()) {
+        outputsNum += CalculateNumberOfAdditionalArgs_(ttype, true);
       }
-      const int limit = max_function_args_ - outputsNum;
-      int argsNum = tuple_node->fields.size();
-      if (argsNum < limit) return post;
-      int splitNum = argsNum / limit;
-      splitNum = (argsNum % limit) ? splitNum + 1 : splitNum;
-
-      std::vector<Expr> splitted(splitNum);
-      for (int i = 0; i < splitNum; ++i) {
-        int startIdx = i * limit;
-        int argsCount = std::min(limit, argsNum - startIdx);
-        tvm::Array<Expr> args;
-        args.reserve(argsCount);
+      CHECK_GT(max_function_args_, outputsNum);
+      size_t limit = max_function_args_ - outputsNum;
 
-        for (int j = 0; j < argsCount; ++j) {
-          args.push_back(tuple_node->fields[j + startIdx]);
+      size_t argsNum = tuple_node->fields.size();
+      for (const auto& it : tuple_node->fields) {
+        if (const auto* ttype = it->checked_type().as<TensorTypeNode>()) {
+          size_t any_dims = CalculateNumberOfAdditionalArgs_(ttype);
+          argsNum += any_dims;
+          additional_args_cache_[ttype] = any_dims;
         }
-        Tuple new_tuple = WithFields(GetRef<Tuple>(tuple_node), args);
-        Expr body = MakeConcatenate(new_tuple, param->axis);
-        splitted[i] = StopFusion(body);
       }
-      tvm::Array<Expr> tuple_args(splitted);
-      Tuple new_tuple = WithFields(GetRef<Tuple>(tuple_node), tuple_args);
-      return MakeConcatenate(new_tuple, param->axis);
+      if (argsNum < limit) return post;
+      return ConcatSplitter(tuple_node, tuple_node->fields, param->axis, limit);
     }
     return post;
   }
 
  private:
-  const int max_function_args_;
+  const size_t max_function_args_;
   const Op& concat_op_;
+  std::unordered_map<const TensorTypeNode*, size_t> additional_args_cache_;
 };
 
-Expr SplitArgs(const Expr& expr, int max_function_args) {
+Expr SplitArgs(const Expr& expr, size_t max_function_args) {
   auto rewriter = ArgumentSplitter(max_function_args);
   return PostOrderRewrite(expr, &rewriter);
 }
 
 namespace transform {
 
-Pass SplitArgs(int max_function_args) {
+Pass SplitArgs(uint64_t max_function_args) {
   runtime::TypedPackedFunc<Function(Function, IRModule, PassContext)> pass_func =
       [=](Function f, IRModule m, PassContext pc) {
         auto r = Downcast<Function>(SplitArgs(f, max_function_args));
diff --git a/src/target/source/codegen_metal.cc b/src/target/source/codegen_metal.cc
index b7105e4bcdfc..b8c30691e21f 100644
--- a/src/target/source/codegen_metal.cc
+++ b/src/target/source/codegen_metal.cc
@@ -68,8 +68,8 @@ void CodeGenMetal::AddFunction(const PrimFunc& f) {
 
   // Buffer arguments
   size_t num_buffer = 0;
-  int limit = target_->GetAttr<Integer>("max_function_args").value().IntValue();
-  if (static_cast<int>(f->params.size()) > limit) {
+  size_t limit = target_->GetAttr<Integer>("max_function_args").value().IntValue();
+  if (f->params.size() > limit) {
     LOG(WARNING) << "Probably you won't be able to execute your kernel due to high number of "
                     "buffers in the kernel";
   }
diff --git a/src/target/target_kind.cc b/src/target/target_kind.cc
index 9af50d3f54ed..5431d7eceb5d 100644
--- a/src/target/target_kind.cc
+++ b/src/target/target_kind.cc
@@ -365,6 +365,11 @@ TVM_REGISTER_TARGET_KIND("opencl", kDLOpenCL)
     .add_attr_option<Integer>("max_num_threads", Integer(256))
     .add_attr_option<Integer>("thread_warp_size", Integer(1))
     .add_attr_option<Integer>("texture_spatial_limit", Integer(16384))
+    // Faced that Qualcomm OpenCL runtime was crashed without any error message in
+    // case when the number of kernel arguments was pretty big. OpenCL doesn't
+    // specify any limitations on the number of kernel arguments. max_function_args
+    // equals to 128 looks like a reasonable number of kernel arguments.
+    .add_attr_option<Integer>("max_function_args", Integer(128))
     .set_default_keys({"opencl", "gpu"});
 
 // The metal has some limitations on the number of input parameters. This is why attribute
diff --git a/tests/python/relay/test_pass_fuse_ops.py b/tests/python/relay/test_pass_fuse_ops.py
index 06c93fbc5549..714818328f66 100644
--- a/tests/python/relay/test_pass_fuse_ops.py
+++ b/tests/python/relay/test_pass_fuse_ops.py
@@ -624,6 +624,12 @@ def expected(n, max_fused_ops):
 
     assert tvm.ir.structural_equal(zz, after)
 
+    with tvm.target.Target("opencl"):
+        with tvm.transform.PassContext(config={"relay.FuseOps.max_depth": max_fused_ops}):
+            cl_zz = run_opt_pass(z, transform.FuseOps())
+
+    assert tvm.ir.structural_equal(cl_zz, after)
+
 
 link_params = tvm.testing.parameter(False, True)
 
@@ -828,5 +834,115 @@ def expected():
         tvm.testing.assert_allclose(result, ref, rtol=1e-4, atol=1e-4)
 
 
+target_name = tvm.testing.parameter("opencl", "metal", "cuda")
+shape_type = tvm.testing.parameter("dynamic", "static")
+
+
+def test_fuse_max_num_args(target_name, shape_type):
+    if shape_type == "dynamic":
+        shape = (tvm.tir.Any(), 20)
+        number_of_any_dims = 1
+    else:
+        shape = (10, 20)
+        number_of_any_dims = 0
+    ndims = len(shape)
+    ops_num = 300
+
+    def _base_func(name):
+        x = relay.var(name, shape=shape)
+        y = relay.add(x, relay.const(1, "float32"))
+        w = relay.exp(y)
+        return x, w
+
+    def before(n):
+        inp = []
+        out = []
+        for i in range(n):
+            x, w = _base_func(f"x{i}")
+            inp.append(x)
+            out.append(w)
+        w = out[0]
+        for i in range(len(out) - 1):
+            w = relay.add(w, out[i + 1])
+        return relay.Function(inp, w)
+
+    def after(n):
+        def create_fused_func(limit):
+            added_args = 0
+            inputs = []
+            input_vars = []
+            res = None
+            i = 0
+            while added_args < limit:
+                inp, out = _base_func(f"p{i}")
+
+                curr_args = 1 + number_of_any_dims
+                if number_of_any_dims > 0:
+                    curr_args += ndims
+
+                if added_args + curr_args > limit:
+                    f = relay.Function(inputs, res)
+                    f = f.with_attr("Primitive", tvm.tir.IntImm("int32", 1))
+                    return i, input_vars, f
+
+                input_vars.append(relay.var(f"x{i}", shape=shape))
+                inputs.append(inp)
+                if res is None:
+                    res = out
+                else:
+                    res = relay.add(res, out)
+                added_args += curr_args
+                i += 1
+            f = relay.Function(inputs, res)
+            f = f.with_attr("Primitive", tvm.tir.IntImm("int32", 1))
+            return i, input_vars, f
+
+        def create_accum_func(args_limit):
+            out = None
+            inputs = []
+            if args_limit == 0:
+                for i in range(n):
+                    inputs.append(relay.var(f"x{i}", shape=shape))
+                f = before(n)
+                f = f.with_attr("Primitive", tvm.tir.IntImm("int32", 1))
+                out = relay.Call(f, inputs)
+                return relay.Function(inputs, out)
+
+            i, inputs, func = create_fused_func(args_limit)
+            out = relay.Call(func, inputs)
+            while i < n:
+                inp, func = _base_func(f"p{i}")
+                inputs.append(relay.var(f"xa{i}", shape=shape))
+                curr_args = 1 + number_of_any_dims
+                if number_of_any_dims > 0:
+                    curr_args += ndims
+                f = relay.Function([inp], func)
+                f = f.with_attr("Primitive", tvm.tir.IntImm("int32", 1))
+                w = relay.Call(f, [inputs[-1]])
+                a = relay.var(f"a", shape=shape)
+                b = relay.var(f"b", shape=shape)
+                out_add = relay.add(a, b)
+                f = relay.Function([a, b], out_add)
+                f = f.with_attr("Primitive", tvm.tir.IntImm("int32", 1))
+                out = relay.Call(f, [out, w])
+                i += 1
+            return relay.Function(inputs, out)
+
+        args_limit = tvm.target.Target.current().max_function_args - (
+            1 + number_of_any_dims
+        )  # one buffer with output
+        args_limit = max(args_limit, 0)
+        return create_accum_func(args_limit)
+
+    max_fused_ops = ops_num * 5
+    with tvm.target.Target(target_name):
+        with tvm.transform.PassContext(config={"relay.FuseOps.max_depth": max_fused_ops}):
+            fused = run_opt_pass(before(ops_num), transform.FuseOps())
+
+        expected = run_opt_pass(after(ops_num), transform.InferType())
+
+    assert tvm.ir.structural_equal(fused, expected)
+
+
 if __name__ == "__main__":
     tvm.testing.main()
diff --git a/tests/python/relay/test_pass_split_args.py b/tests/python/relay/test_pass_split_args.py
index 2039f464751f..508f74f11269 100644
--- a/tests/python/relay/test_pass_split_args.py
+++ b/tests/python/relay/test_pass_split_args.py
@@ -22,6 +22,10 @@
 from tvm.relay.testing import run_infer_type, create_workload
 
 
+target_name = tvm.testing.parameter("opencl", "metal", "cuda")
+shape_type = tvm.testing.parameter("dynamic", "static")
+
+
 def run_opt_pass(expr, opt_pass):
     assert isinstance(opt_pass, tvm.transform.Pass)
 
@@ -32,65 +36,63 @@ def run_opt_pass(expr, opt_pass):
     return entry if isinstance(expr, relay.Function) else entry.body
 
 
-def test_split_concat_metal():
-    shape = (1, 1, 1, 3)
-    dtype = "float32"
-    axis = 1
-    inputs = []
-    for i in range(100):
-        inputs.append(relay.var("p{}".format(i), shape=shape, dtype=dtype))
-
-    def before():
-        inp = relay.Tuple(inputs)
-        return relay.op.concatenate(inp, axis)
-
-    def expected():
-        limit = tvm.target.Target("metal").max_function_args - 1  # one buffer with output
-        splitNum = int(len(inputs) / limit)
-        if len(inputs) % limit > 0:
-            splitNum += 1
-
-        splitted = []
-        for i in range(splitNum):
-            startIdx = i * limit
-            argsCount = min(limit, len(inputs) - startIdx)
-            args = []
-            for j in range(argsCount):
-                args.append(inputs[j + startIdx])
-            t = relay.Tuple(args)
-            concat = relay.op.concatenate(t, axis)
-            splitted.append(relay.annotation.stop_fusion(concat))
-        inp = relay.Tuple(splitted)
-        return relay.op.concatenate(inp, axis)
-
-    # the fold constant should work on any context.
-    res = run_opt_pass(before(), transform.SplitArgs(tvm.target.Target("metal").max_function_args))
-    exp = run_opt_pass(expected(), transform.InferType())
-    assert tvm.ir.structural_equal(res, exp)
-
-
-def test_split_concat_cuda():
-    shape = (1, 1, 1, 3)
+def test_split_concat(target_name, shape_type):
+    if shape_type == "dynamic":
+        shape = (tvm.tir.Any(), 1, 1, 3)
+        number_of_any_dims = 1
+    else:
+        shape = (1, 1, 1, 3)
+        number_of_any_dims = 0
+    ndims = len(shape)
     dtype = "float32"
     axis = 1
+    tensors_num = 300
     inputs = []
-    for i in range(100):
+    for i in range(tensors_num):
         inputs.append(relay.var("p{}".format(i), shape=shape, dtype=dtype))
 
     def before():
         inp = relay.Tuple(inputs)
         return relay.op.concatenate(inp, axis)
 
-    def expected():
-        inp = relay.Tuple(inputs)
-        return relay.op.concatenate(inp, axis)
+    def expected(limit):
+        if limit == 0:
+            return before()
+        limit = limit - 1  # one buffer with output
+        if number_of_any_dims > 0:
+            limit -= ndims
+
+        new_args = []
+        added_args = 0
+        num_inputs = 0
+        for inp in inputs:
+            curr_args = 1 + number_of_any_dims
+            if number_of_any_dims > 0:
+                curr_args += ndims
+            num_inputs += curr_args
+            if added_args + curr_args > limit:
+                t = relay.Tuple(new_args)
+                stop = relay.annotation.stop_fusion(t)
+                concat = relay.op.concatenate(stop, axis)
+                new_args = [concat]
+                added_args = curr_args
+            added_args += curr_args
+            new_args.append(inp)
+        t = relay.Tuple(new_args)
+        stop = relay.annotation.stop_fusion(t)
+        concat = relay.op.concatenate(stop, axis)
+
+        if num_inputs < limit:
+            return before()
+
+        return concat
 
     # the fold constant should work on any context.
-    res = run_opt_pass(before(), transform.SplitArgs(tvm.target.Target("cuda").max_function_args))
-    exp = run_opt_pass(expected(), transform.InferType())
+    limit = tvm.target.Target(target_name).max_function_args
+    res = run_opt_pass(before(), transform.SplitArgs(limit))
+    exp = run_opt_pass(expected(limit), transform.InferType())
     assert tvm.ir.structural_equal(res, exp)
 
 
 if __name__ == "__main__":
-    test_split_concat_metal()
-    test_split_concat_cuda()
+    tvm.testing.main()
diff --git a/tests/python/unittest/test_target_codegen_opencl.py b/tests/python/unittest/test_target_codegen_opencl.py
index 83612b7f5979..dcb43f29daf4 100644
--- a/tests/python/unittest/test_target_codegen_opencl.py
+++ b/tests/python/unittest/test_target_codegen_opencl.py
@@ -15,9 +15,11 @@
 # specific language governing permissions and limitations
 # under the License.
 import tvm
-from tvm import te
+from tvm import te, relay
 import tvm.testing
 import re
+import pytest
+import numpy as np
 
 target = "opencl"
 
@@ -217,5 +219,211 @@ def _check(target, n, dtype):
     _check(target, 32, "float32")
 
 
+def _get_maximum_kernel_args(source):
+    def get_kernel_args(source):
+        import re
+
+        p = re.compile(r"__kernel void .+\((.*)\)")
+        args = p.findall(source)
+        return args
+
+    args = get_kernel_args(source)
+    max_args = len(args[0].split(","))
+    for arg_line in args:
+        max_args = max(max_args, len(arg_line.split(",")))
+    return max_args
+
+
+def _validate_opencl_executors(executor_type, get_model, ref_impl):
+    from tvm.contrib import graph_executor
+    from tvm.runtime.vm import VirtualMachine
+
+    input_dict, model = get_model()
+    if executor_type == "ge":
+        with tvm.transform.PassContext(opt_level=3):
+            lib = relay.build(model, target_host="llvm", target=target)
+        ocl_lib = lib.get_lib()
+    else:
+        module = tvm.IRModule({})
+        module["main"] = model
+        with tvm.transform.PassContext(opt_level=3):
+            lib = relay.vm.compile(module, target=target, target_host="llvm")
+        ocl_lib = lib.module.imported_modules[0]
+    opencl_modules = list(filter(lambda mod: mod.type_key == "opencl", ocl_lib.imported_modules))
+    assembly = opencl_modules[0].get_source()
+    with tvm.target.Target(target):
+        limit = tvm.target.Target.current().max_function_args
+    max_num = _get_maximum_kernel_args(assembly)
+    assert max_num <= limit
+
+    dev = tvm.cl()
+    if executor_type == "ge":
+        module = graph_executor.GraphModule(lib["default"](dev))
+        module.set_input(**input_dict)
+        module.run()
+        tvm_out = module.get_output(0)
+    else:
+        vm = VirtualMachine(lib, dev, "naive")
+        data = {}
+        for k, v in input_dict.items():
+            data[k] = tvm.nd.array(v, dev)
+        vm.set_input("main", **data)
+        vm.invoke_stateful("main")
+        tvm_out = vm.get_outputs()[0]
+
+    np_result = ref_impl(list(input_dict.values()))
+    np.testing.assert_allclose(tvm_out.asnumpy(), np_result, rtol=1e-2, atol=1e-2)
+
+
+shape_type = tvm.testing.parameter("dynamic", "static")
+executor_type = tvm.testing.parameter("ge", "vm")
+
+
+@tvm.testing.requires_gpu
+@tvm.testing.requires_opencl
+def test_opencl_args_split(executor_type, shape_type):
+    def _get_model():
+        if shape_type == "dynamic":
+            shape = (tvm.tir.Any(), 1, 1, 3)
+        else:
+            shape = (1, 1, 1, 3)
+        shape_np = (1, 1, 1, 3)
+        dtype = "float32"
+        axis = 1
+        tensors_num = 300
+        inputs = []
+        inputs_np = {}
+        for i in range(tensors_num):
+            inputs.append(relay.var("p{}".format(i), shape=shape, dtype=dtype))
+            inputs_np[f"p{i}"] = np.random.uniform(size=shape_np).astype(dtype)
+
+        inp = relay.Tuple(inputs)
+        concat = relay.op.concatenate(inp, axis)
+        return inputs_np, relay.Function(inputs, concat)
+
+    def ref_impl(inputs):
+        axis = 1
+        return np.concatenate(tuple(inputs), axis=axis)
+
+    if executor_type == "ge" and shape_type == "dynamic":
+        pytest.skip()
+    _validate_opencl_executors(executor_type, _get_model, ref_impl)
+
+
+@tvm.testing.requires_gpu
+@tvm.testing.requires_opencl
+def test_opencl_fuse_max_args(executor_type, shape_type):
+    if shape_type == "dynamic":
+        shape = (tvm.tir.Any(), 20)
+        ops_num = 80
+    else:
+        shape = (1, 20)
+        ops_num = 300
+    shape_np = (1, 20)
+    dtype = "float32"
+
+    def _base_func(name):
+        x = relay.var(name, shape=shape)
+        y = relay.add(x, relay.const(1, "float32"))
+        w = relay.exp(y)
+        return x, w
+
+    def _get_model():
+        inp = []
+        inputs_np = {}
+        out = []
+        for i in range(ops_num):
+            x, w = _base_func(f"x{i}")
+            inputs_np[f"x{i}"] = np.random.uniform(size=shape_np).astype(dtype)
+            inp.append(x)
+            out.append(w)
+        w = out[0]
+        for i in range(len(out) - 1):
+            w = relay.add(w, out[i + 1])
+        return inputs_np, relay.Function(inp, w)
+
+    def ref_impl(inputs):
+        w = np.exp(inputs[0] + 1)
+        for i in range(len(inputs) - 1):
+            w = w + np.exp(inputs[i + 1] + 1)
+        return w
+
+    if executor_type == "ge" and shape_type == "dynamic":
+        pytest.skip()
+    _validate_opencl_executors(executor_type, _get_model, ref_impl)
+
+
+@tvm.testing.requires_gpu
+@tvm.testing.requires_opencl
+def test_fuse_concat_max_num_args(executor_type, shape_type):
+    """
+    In this test before concat we have an operation with 3 inputs. In the
+    SplitArgs we cannot calculate these inputs as inputs to concat, because
+    they will be added to the concat after fusing operation. So FuseOps pass
+    should handle this case and stop fusing before concat.
+
+    The example:
+       x     y     z                  x     y     z
+       \     |     /                  \     |     /
+        \    |    /                    \    |    /
+           where            ...           where
+             |                              |
+            exp                            exp
+             \                              /
+              \                            /
+               \----->    concat    <-----/
+    """
+    if shape_type == "dynamic":
+        shape = (tvm.tir.Any(), 20)
+        ops_num = 80
+    else:
+        shape = (10, 20)
+        ops_num = 300
+    shape_np = (10, 20)
+    dtype = "float32"
+    axis = 1
+
+    def _base_func(name):
+        x = relay.var(name, shape=shape)
+        y = relay.var(f"y{name}", shape=shape)
+        z = relay.var(f"z{name}", shape=shape)
+        cond = relay.less(x, relay.const(1, "float32"))
+        l = relay.add(y, relay.const(1, "float32"))
+        r = relay.add(z, relay.const(5, "float32"))
+        w = relay.where(cond, l, r)
+        w = relay.exp(w)
+        return [x, y, z], w
+
+    def _get_model():
+        inp = []
+        out = []
+        inputs_np = {}
+        for i in range(ops_num):
+            inputs, w = _base_func(f"x{i}")
+            inputs_np[f"x{i}"] = np.random.uniform(size=shape_np).astype(dtype)
+            inputs_np[f"yx{i}"] = np.random.uniform(size=shape_np).astype(dtype)
+            inputs_np[f"zx{i}"] = np.random.uniform(size=shape_np).astype(dtype)
+            inp.extend(inputs)
+            out.append(w)
+        t = relay.Tuple(out)
+        w = relay.op.concatenate(t, axis)
+        return inputs_np, relay.Function(inp, w)
+
+    def ref_impl(inputs):
+        res = []
+        for i in range(0, len(inputs), 3):
+            x = inputs[i]
+            y = inputs[i + 1]
+            z = inputs[i + 2]
+            comp = np.where(x < 1, y + 1, z + 5)
+            comp = np.exp(comp)
+            res.append(comp)
+        return np.concatenate(tuple(res), axis=axis)
+
+    if executor_type == "ge" and shape_type == "dynamic":
+        pytest.skip()
+    _validate_opencl_executors(executor_type, _get_model, ref_impl)
+
+
 if __name__ == "__main__":
     tvm.testing.main()

From d36a899704ebadf437485ca785720db7b476cef0 Mon Sep 17 00:00:00 2001
From: Egor Churaev <egor.churaev@gmail.com>
Date: Wed, 19 Jul 2023 11:47:27 +0300
Subject: [PATCH 2/4] Fix memory_scope order in test

---
 tests/python/relay/opencl_texture/test_conv2d_nchw_texture.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tests/python/relay/opencl_texture/test_conv2d_nchw_texture.py b/tests/python/relay/opencl_texture/test_conv2d_nchw_texture.py
index cd5a99242173..3476037946ff 100644
--- a/tests/python/relay/opencl_texture/test_conv2d_nchw_texture.py
+++ b/tests/python/relay/opencl_texture/test_conv2d_nchw_texture.py
@@ -836,9 +836,9 @@ def test_pooling_branching_texture_params(remote, target, dtype):
         "global.texture",
         "global",
         "global.texture-weight",
+        "global",
         "global.texture-weight",
         "global.texture",
-        "global",
         "global.texture",
         "",
         "",
@@ -962,9 +962,9 @@ def test_branching_texture_params(remote, target, dtype):
         "global.texture",
         "global",
         "global.texture-weight",
+        "global",
         "global.texture-weight",
         "global.texture",
-        "global",
         "global.texture",
         "",
         "",

From 58f17b7e63299f25ca090c8f0b8b4dd27c301a0c Mon Sep 17 00:00:00 2001
From: Egor Churaev <egor.churaev@gmail.com>
Date: Thu, 20 Jul 2023 09:00:10 +0300
Subject: [PATCH 3/4] Apply code review comments

---
 include/tvm/relay/transform.h                 |  4 +--
 python/tvm/relay/transform/transform.py       |  3 +-
 src/relay/analysis/graph_partitioner.cc       | 20 ++++++-------
 src/relay/analysis/graph_partitioner.h        | 28 ++++++++++---------
 src/relay/transforms/split_args.cc            |  7 +++--
 src/target/target_kind.cc                     |  4 +--
 .../unittest/test_target_codegen_opencl.py    |  9 +++---
 7 files changed, 40 insertions(+), 35 deletions(-)

diff --git a/include/tvm/relay/transform.h b/include/tvm/relay/transform.h
index da4d05f0e63e..4f5b5d146d92 100644
--- a/include/tvm/relay/transform.h
+++ b/include/tvm/relay/transform.h
@@ -120,8 +120,8 @@ TVM_DLL Pass FoldConstant(bool fold_qnn = false);
 /*!
  * \brief Split function with huge number of arguments to smaller pieces.
  *
- * \param max_function_args Maximum number of function arguments. If it is 0 then SplitArgs won't
- *                          split function.
+ * \param max_function_args Maximum number of function arguments. If it equals 0 then SplitArgs
+ *                          shouldn't split the function.
  *
  * \return The pass.
  */
diff --git a/python/tvm/relay/transform/transform.py b/python/tvm/relay/transform/transform.py
index c162164daace..902c1a6a4576 100644
--- a/python/tvm/relay/transform/transform.py
+++ b/python/tvm/relay/transform/transform.py
@@ -1379,7 +1379,8 @@ def SplitArgs(max_function_args):
     Parameters
     ----------
     max_function_args: int
-      Maximum number of function arguments. If it is 0 then SplitArgs won't split function.
+      Maximum number of function arguments. If it equals 0 then SplitArgs
+      shouldn't split the function.
 
 
     Returns
diff --git a/src/relay/analysis/graph_partitioner.cc b/src/relay/analysis/graph_partitioner.cc
index cf397fc3e8f7..d233d43ad7eb 100644
--- a/src/relay/analysis/graph_partitioner.cc
+++ b/src/relay/analysis/graph_partitioner.cc
@@ -243,21 +243,21 @@ size_t GraphPartitioner::CountArgs_(IndexedForwardGraph::Node* src,
   ICHECK(gnode != nullptr);
   auto sum = gnode->args_num;
   visited_groups.insert(gnode->FindRoot());
-  auto calcArgs = [this, src, &graph, &visited_groups,
-                   update_postpone](const relay::Expr& arg) -> size_t {
+  auto calc_args_number = [this, src, &graph, &visited_groups,
+                           update_postpone](const relay::Expr& arg) -> size_t {
     if (arg.as<VarNode>()) return 0;
     auto* node = graph.node_map.at(arg.get());
     Group* prev_group = groups_[node->index]->FindRoot();
     if (visited_groups.count(prev_group) == 0) {
       visited_groups.insert(prev_group);
       if (prev_group->args_num > 0) {
-        // Get number of arguments from group
+        // Get the number of arguments from the group
         return prev_group->args_num;
       } else if (update_postpone) {
-        // Update pointer to node which should be postponed for deferred fusing
+        // Update pointer to the node which should be postponed for deferred fusing
         postpone_node_ = src;
       } else {
-        // Calculate number of arguments for the node which wasn't processed before
+        // Calculate the number of arguments for the node which wasn't processed before
         return CountArgs_(node, graph, update_postpone);
       }
     }
@@ -265,11 +265,11 @@ size_t GraphPartitioner::CountArgs_(IndexedForwardGraph::Node* src,
   };
   if (auto call_node = GetRef<ObjectRef>(src->ref).as<CallNode>()) {
     for (auto& it : call_node->args) {
-      sum += calcArgs(it);
+      sum += calc_args_number(it);
     }
   } else if (auto tuple_node = GetRef<ObjectRef>(src->ref).as<TupleNode>()) {
     for (auto& it : tuple_node->fields) {
-      sum += calcArgs(it);
+      sum += calc_args_number(it);
     }
   }
   return sum;
@@ -357,11 +357,11 @@ void GraphPartitioner::RunFuse(const IndexedForwardGraph& graph,    //
     Group* group_node = groups_[nid];
     ICHECK(group_node != nullptr);
     postpone_node_ = nullptr;
-    // Check if fusing of some inputs was postponed
+    // Check if the fusing of some inputs was postponed
     if (postponed_fusing_map_.count(graph_node)) {
       auto range = postponed_fusing_map_.equal_range(graph_node);
       for (auto it = range.first; it != range.second; ++it) {
-        // If number of arguments is less than limit then the input can be fused
+        // If the number of arguments is less than the limit then the input can be fused
         if (CountArgs_(graph_node, graph, false) <= CountArgsLimit_(graph_node)) {
           auto* src = it->second;
           auto* snode = post_dom_tree.nodes[src->index]->parent->gnode;
@@ -381,7 +381,7 @@ void GraphPartitioner::RunFuse(const IndexedForwardGraph& graph,    //
     // refuse the fusion if too many ops are going to be fused together
     if (CountFusedNodesWithNewChild(graph_node, dom_node->parent->gnode) > max_fuse_depth_)
       continue;
-    // refuse the fusion if too many arguments are going to be in fused function
+    // Refuse the fusion if too many arguments are going to be in the fused function
     if (max_function_args_ > 0) {
       auto limit = CountArgsLimit_(graph_node);
       if (limit > 0) {
diff --git a/src/relay/analysis/graph_partitioner.h b/src/relay/analysis/graph_partitioner.h
index 0319a37b5dbd..08c26926477e 100644
--- a/src/relay/analysis/graph_partitioner.h
+++ b/src/relay/analysis/graph_partitioner.h
@@ -224,8 +224,8 @@ class GraphPartitioner {
       postponed_fusing_map_;
   /*!
    * \brief Fusing of this node should be postponed till all child nodes will be evaluated.
-   *        It is used to calculate number of arguments which will be passed to this node in
-   * generated function.
+   *        It is used to calculate the number of arguments which will be passed to this node in
+   *        the generated function.
    */
   const IndexedForwardGraph::Node* postpone_node_{nullptr};
   // Internal implementation of CheckPath
@@ -266,21 +266,21 @@ class GraphPartitioner {
   void CommitFuse(IndexedForwardGraph::Node* src, IndexedForwardGraph::Node* sink);
 
   size_t CountNodesUptoSink_(IndexedForwardGraph::Node* src, IndexedForwardGraph::Node* sink);
-  // Count the number of additional arguments. In case of dynamic shape,
-  // generated function takes several additional arguments, such as size of
-  // dynamic dimension and strides.
-  // This function calculates number of such additional arguments.
+  // Count the number of additional arguments. In the case of dynamic shape,
+  // generated function takes several additional arguments, such as the sizes of
+  // the dynamic dimensions and strides.
+  // This function calculates the number of such additional arguments.
   size_t CountAdditionalArgs_(const TensorTypeNode* ttype, bool with_strides = true);
   // Calculate the number of arguments for the node.
   size_t CountArgs_(IndexedForwardGraph::Node* src, const IndexedForwardGraph& graph,
                     bool update_postpone = true);
-  // Count actual limit of arguments for a generated function.
+  // Count the actual limit of arguments for a generated function.
   // max_function_args_ specifies the number of maximum function arguments. But
-  // usually, output tensors also passed to the function as arguments.
-  // Additionally, in case of dynamic shape, it is necessary to take into
-  // account the number of parameters which specifies the size of dynamic
-  // dimension.
-  // This function computes limit of arguments by the following formula:
+  // usually, output tensors are also passed to the function as arguments.
+  // Additionally, in the case of dynamic shape, it is necessary to take into
+  // account the number of parameters which specifies the sizes of the dynamic
+  // dimensions.
+  // This function computes the limit of arguments by the following formula:
   // limit = max_function_args_ - output_args_count
   size_t CountArgsLimit_(const IndexedForwardGraph::Node* child);
 
@@ -292,7 +292,9 @@ class GraphPartitioner {
   // is important for correct calculation.
   size_t CountFusedNodesWithNewChild(IndexedForwardGraph::Node* child,
                                      IndexedForwardGraph::Node* dom_parent);
-  // Count the number of arguments in a fused subgraph if the output of child is additionally fused.
+  // Count the number of arguments in a fused subgraph. This function also takes into account the
+  // number of the child's output node argument. It helps to stop fusing before the node when the
+  // limit will be exceeded.
   size_t CountFusedArgs(const IndexedForwardGraph& graph, IndexedForwardGraph::Node* child);
 
   // Initialize the groups.
diff --git a/src/relay/transforms/split_args.cc b/src/relay/transforms/split_args.cc
index 6ef404ee814d..423adff9a4cb 100644
--- a/src/relay/transforms/split_args.cc
+++ b/src/relay/transforms/split_args.cc
@@ -61,7 +61,8 @@ class ArgumentSplitter : public ExprRewriter {
     return lastExpr;
   }
 
-  // In case of dynamic shape in tensor, size of any_dims and strides are passed as function args
+  // In the case of dynamic shape in tensor, the sizes of any_dims and strides are passed as
+  // function args
   size_t CalculateNumberOfAdditionalArgs_(const TensorTypeNode* arg, bool isOutput = false) {
     size_t num = 0;
     for (const auto& dim : arg->shape) {
@@ -69,8 +70,8 @@ class ArgumentSplitter : public ExprRewriter {
         num++;
       }
     }
-    // In case of dynamic shape also strides will be passed to function
-    // as arguments. Number of strides equals to the rank of the tensor.
+    // In the case of dynamic shape, strides are also passed to a function as arguments. The number
+    // of strides equals the rank of the tensor.
     if (num > 0 && isOutput)
       return arg->shape.size();
     else if (num > 0)
diff --git a/src/target/target_kind.cc b/src/target/target_kind.cc
index 5431d7eceb5d..ac9877a4ecc1 100644
--- a/src/target/target_kind.cc
+++ b/src/target/target_kind.cc
@@ -365,8 +365,8 @@ TVM_REGISTER_TARGET_KIND("opencl", kDLOpenCL)
     .add_attr_option<Integer>("max_num_threads", Integer(256))
     .add_attr_option<Integer>("thread_warp_size", Integer(1))
     .add_attr_option<Integer>("texture_spatial_limit", Integer(16384))
-    // Faced that Qualcomm OpenCL runtime was crashed without any error message in
-    // case when the number of kernel arguments was pretty big. OpenCL doesn't
+    // Faced that Qualcomm OpenCL runtime crashed without any error message in
+    // the case when the number of kernel arguments was pretty big. OpenCL doesn't
     // specify any limitations on the number of kernel arguments. max_function_args
     // equals to 128 looks like a reasonable number of kernel arguments.
     .add_attr_option<Integer>("max_function_args", Integer(128))
diff --git a/tests/python/unittest/test_target_codegen_opencl.py b/tests/python/unittest/test_target_codegen_opencl.py
index dcb43f29daf4..9222947ae47e 100644
--- a/tests/python/unittest/test_target_codegen_opencl.py
+++ b/tests/python/unittest/test_target_codegen_opencl.py
@@ -357,10 +357,11 @@ def ref_impl(inputs):
 @tvm.testing.requires_opencl
 def test_fuse_concat_max_num_args(executor_type, shape_type):
     """
-    In this test before concat we have an operation with 3 inputs. In the
-    SplitArgs we cannot calculate these inputs as inputs to concat, because
-    they will be added to the concat after fusing operation. So FuseOps pass
-    should handle this case and stop fusing before concat.
+    In this test, we have an operation with 3 inputs before concat. In the
+    SplitArgs we cannot calculate these inputs as inputs to the concat layer,
+    because they will be added to the concat after the fusing operation. So
+    FuseOps pass should handle this case and stop fusing before the concat
+    layer.
 
     The example:
        x     y     z                  x     y     z

From 0e3f1f02437d176c381d5ce9cc7f787fcceb2547 Mon Sep 17 00:00:00 2001
From: Egor Churaev <egor.churaev@gmail.com>
Date: Thu, 20 Jul 2023 10:39:20 +0300
Subject: [PATCH 4/4] Apply comments

---
 src/relay/analysis/graph_partitioner.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/relay/analysis/graph_partitioner.h b/src/relay/analysis/graph_partitioner.h
index 08c26926477e..b3f934b972de 100644
--- a/src/relay/analysis/graph_partitioner.h
+++ b/src/relay/analysis/graph_partitioner.h
@@ -223,7 +223,7 @@ class GraphPartitioner {
   std::unordered_multimap<const IndexedForwardGraph::Node*, IndexedForwardGraph::Node*>
       postponed_fusing_map_;
   /*!
-   * \brief Fusing of this node should be postponed till all child nodes will be evaluated.
+   * \brief Fusing of this node should be postponed till all child nodes are evaluated.
    *        It is used to calculate the number of arguments which will be passed to this node in
    *        the generated function.
    */
@@ -280,7 +280,7 @@ class GraphPartitioner {
   // Additionally, in the case of dynamic shape, it is necessary to take into
   // account the number of parameters which specifies the sizes of the dynamic
   // dimensions.
-  // This function computes the limit of arguments by the following formula:
+  // This function computes the maximum number of arguments by the following formula:
   // limit = max_function_args_ - output_args_count
   size_t CountArgsLimit_(const IndexedForwardGraph::Node* child);