apache · leandron · Nov 3, 2022 · Oct 12, 2022 · Oct 20, 2022 · Oct 20, 2022
diff --git a/python/tvm/relay/op/contrib/ethosn.py b/python/tvm/relay/op/contrib/ethosn.py
@@ -64,14 +64,42 @@ def ConvertEquivalents() -> tvm.ir.IRModule:  # pylint: disable=invalid-name
     """Converts operations into a numerically equivalent form
     that can be understood by the NPU codegen.
 
-    Return
-    ------
+    Returns
+    -------
     Pass
         The module pass.
     """
     return _ethosn.ConvertEquivalents()
 
 
+def InlineNonComputeIntensivePartitions() -> tvm.transform.Pass:  # pylint: disable=invalid-name
+    """Create a pass that inlines NPU partitions that are not compute intensive.
+
+    Functions partitioned for the NPU that are considered non-compute intensive are
+    unpartitioned and passed onto other backends to consider. A partitioned function is
+    currently considered non-compute intensive if it contains no multiply accumulate
+    operations.
+
+    Returns
+    -------
+    Pass
+        The module pass.
+    """
+    return _ethosn.InlineNonComputeIntensivePartitions()
+
+
+def is_inline_non_compute_intensive_partitions_enabled() -> bool:
+    """Report whether non-compute-intensive partitions should be inlined.
+
+    Returns
+    -------
+    bool
+        The `inline_non_compute_intensive_partitions` field of the Ethos-N compiler attributes.
+    """
+    get_compiler_attrs = tvm.get_global_func("relay.ext.ethos-n.get_compiler_attrs")
+    return get_compiler_attrs().inline_non_compute_intensive_partitions
+
+
 def partition_for_ethosn(mod, params=None, **opts):
     """Partition the graph greedily offloading supported
     operators to Arm Ethos-N NPU.
@@ -112,17 +140,18 @@ def partition_for_ethosn(mod, params=None, **opts):
     if params:
         mod["main"] = bind_params_by_name(mod["main"], params)
 
-    seq = tvm.transform.Sequential(
-        [
-            transform.InferType(),
-            transform.MergeComposite(pattern_table()),
-            transform.AnnotateTarget("ethos-n"),
-            transform.MergeCompilerRegions(),
-            transform.PartitionGraph(),
-            ConvertEquivalents(),
-        ]
-    )
-    return seq(mod)
+    passes = [
+        transform.InferType(),
+        transform.MergeComposite(pattern_table()),
+        transform.AnnotateTarget("ethos-n"),
+        transform.MergeCompilerRegions(),
+        transform.PartitionGraph(),
+        ConvertEquivalents(),
+    ]
+    if is_inline_non_compute_intensive_partitions_enabled():
+        passes.append(InlineNonComputeIntensivePartitions())
+
+    return tvm.transform.Sequential(passes)(mod)
 
 
 @register_pattern_table("ethos-n")

diff --git a/src/relay/backend/contrib/ethosn/codegen_ethosn.h b/src/relay/backend/contrib/ethosn/codegen_ethosn.h
@@ -251,6 +251,7 @@ struct EthosnCompilerConfigNode : public tvm::AttrsNode<EthosnCompilerConfigNode
   bool enable_intermediate_compression;
   bool disable_winograd;
   String debug_dir;
+  bool inline_non_compute_intensive_partitions;
 
   TVM_DECLARE_ATTRS(EthosnCompilerConfigNode, "ext.attrs.EthosnCompilerConfigNode") {
     TVM_ATTR_FIELD(variant).describe("See Ethos-N documentation.").set_default("n78");
@@ -278,6 +279,12 @@ struct EthosnCompilerConfigNode : public tvm::AttrsNode<EthosnCompilerConfigNode
     TVM_ATTR_FIELD(enable_intermediate_compression).set_default(true);
     TVM_ATTR_FIELD(disable_winograd).set_default(false);
     TVM_ATTR_FIELD(debug_dir).set_default(".");
+    TVM_ATTR_FIELD(inline_non_compute_intensive_partitions)
+        .describe(
+            "A heuristic to improve performance. Inlines functions partitioned for Arm(R) "
+            "Ethos(TM)-N that are deemed 'non-compute-intensive'. The inlined functions will "
+            "continue through TVM's standard compilation flow.")
+        .set_default(true);
   }
 };
 
@@ -289,6 +296,16 @@ class EthosnCompilerConfig : public Attrs {
 TVM_REGISTER_NODE_TYPE(EthosnCompilerConfigNode);
 TVM_REGISTER_PASS_CONFIG_OPTION("relay.ext.ethos-n.options", EthosnCompilerConfig);
 
+/*! \brief Ethos-N compiler options of the current PassContext, or the defaults if unset. */
+inline auto GetCompilerAttrs() {  // inline: this definition lives in a header (ODR).
+  auto ctx = transform::PassContext::Current();
+  auto cfg = ctx->GetConfig<EthosnCompilerConfig>("relay.ext.ethos-n.options");
+  if (!cfg.defined()) cfg = AttrsWithDefaultValues<EthosnCompilerConfig>();
+  return cfg;
+}
+// Registered as a global function so it can be looked up by name (e.g. from Python).
+TVM_REGISTER_GLOBAL("relay.ext.ethos-n.get_compiler_attrs").set_body_typed(GetCompilerAttrs);
+
 /*! \brief The compiler for Ethos-N functions */
 class EthosnCompiler {
  public:

diff --git a/src/relay/backend/contrib/ethosn/inline_partitions.cc b/src/relay/backend/contrib/ethosn/inline_partitions.cc
@@ -0,0 +1,126 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ * \file src/relay/backend/contrib/ethosn/inline_partitions.cc
+ * \brief A pass to inline NPU partitions that are not considered compute
+ * intensive.
+ */
+
+#include <tvm/relay/expr.h>
+#include <tvm/relay/expr_functor.h>
+
+#include "../../../transforms/compiler_function_utils.h"
+
+namespace tvm {
+namespace relay {
+namespace contrib {
+namespace ethosn {
+
+class IsComputeIntensivePartition : MixedModeVisitor {
+ public:
+  /*!
+   * \brief Check if the partitioned function is compute
+   * intensive. If it has no multiply-accumulate operations
+   * it is not considered compute intensive.
+   *
+   * \param expr The partitioned function to check.
+   * \return True if a compute intensive operator was found.
+   */
+  bool CheckSubgraph(const Expr& expr) {
+    is_compute_intensive = false;
+    VisitExpr(expr);
+    return is_compute_intensive;
+  }
+
+  /*!
+   * \brief Visit the call nodes of a partitioned function
+   * and check if operators or composite functions make the
+   * partitioned function compute intensive.
+   *
+   * \param call_node The call node to check.
+   */
+  void VisitExpr_(const CallNode* call_node) override {
+    // Name of the called operator, or the kComposite attribute of a called function.
+    std::string callee_name;
+    if (const auto* op = call_node->op.as<OpNode>()) {
+      callee_name = op->name;
+    } else if (const auto* func = call_node->op.as<FunctionNode>()) {
+      callee_name = func->GetAttr<String>(attr::kComposite, "").value();
+    }
+
+    // An empty name (no operator and no composite attribute) never matches the set.
+    if (compute_intensive_operators.count(callee_name) != 0) {
+      is_compute_intensive = true;
+    }
+  }
+
+ private:
+  /*! \brief Whether or not the partitioned function is considered compute intensive. */
+  bool is_compute_intensive;
+  /*! \brief A set of operators considered compute intensive. */
+  const std::unordered_set<std::string> compute_intensive_operators{
+      "ethos-n.qnn_conv2d",     "ethos-n.qnn_conv2d_transpose",
+      "ethos-n.qnn_avg_pool2d", "ethos-n.qnn_sigmoid",
+      "ethos-n.qnn_fc",         "ethos-n.qnn_mean",
+      "ethos-n.qnn_resize",     "nn.max_pool2d",
+  };
+};
+
+/*!
+ * \brief Inline functions partitioned for the NPU that are not compute intensive.
+ *
+ * Every global function whose kCompiler attribute is "ethos-n" is checked with
+ * IsComputeIntensivePartition. Functions that are not compute intensive are
+ * unpartitioned (inlined) so that other backends can consider them instead.
+ * Global functions that are not Relay functions are left untouched.
+ *
+ * Note that this is not an optimal heuristic. Some suggestions for future exploration:
+ * - Making a better choice about large non-compute-intensive subgraphs
+ *   as currently these are inlined.
+ * - Allowing the user to input ops that are considered compute-intensive.
+ * - Inline "small" compute intensive operations.
+ */
+tvm::transform::Pass InlineNonComputeIntensivePartitions() {
+  runtime::TypedPackedFunc<IRModule(IRModule, tvm::transform::PassContext)> pass_func =
+      [](IRModule mod, tvm::transform::PassContext ctx) {
+        auto analyzer = IsComputeIntensivePartition();
+        Array<GlobalVar> gvs_to_inline;
+        for (auto gv : mod->GetGlobalVars()) {
+          const auto* func = mod->Lookup(gv).as<FunctionNode>();
+          if (func == nullptr) continue;  // Not a Relay function, nothing to inline.
+          auto compiler_name = func->GetAttr<String>(attr::kCompiler);
+          bool is_npu = compiler_name.defined() && compiler_name == "ethos-n";
+          if (is_npu && !analyzer.CheckSubgraph(func->body)) {
+            gvs_to_inline.push_back(gv);
+          }
+        }
+        return relay::transform::InlineCompilerFunctionsBoundTo(gvs_to_inline)(mod);
+      };
+  return tvm::transform::CreateModulePass(
+      pass_func, 0, "relay.backend.contrib.ethos-n.InlineNonComputeIntensivePartitions", {});
+}
+
+TVM_REGISTER_GLOBAL("relay.backend.contrib.ethos-n.InlineNonComputeIntensivePartitions")
+    .set_body_typed(InlineNonComputeIntensivePartitions);
+
+}  // namespace ethosn
+}  // namespace contrib
+}  // namespace relay
+}  // namespace tvm
diff --git a/tests/python/contrib/test_ethosn/infrastructure.py b/tests/python/contrib/test_ethosn/infrastructure.py
@@ -143,7 +143,7 @@ def visit_call(self, call):
     return c.count
 
 
-def build(mod, params, npu=True, expected_host_ops=0, npu_partitions=1):
+def build(mod, params, npu=True, expected_host_ops=0, npu_partitions=1, optimize_partitions=True):
     """Build a network with or without Ethos-N offloading.
 
     Parameters
@@ -158,10 +158,18 @@ def build(mod, params, npu=True, expected_host_ops=0, npu_partitions=1):
         The number of ops expected to remain on the host.
     npu_partitions : int, optional
         The number of Ethos-N partitions expected.
+    optimize_partitions : bool, optional
+        Whether to enable the pass that optimizes NPU partitions post partitioning.
     """
     relay.backend.te_compiler.get().clear()
     with tvm.transform.PassContext(
-        opt_level=3, config={"relay.ext.ethos-n.options": {"variant": get_ethosn_variant()}}
+        opt_level=3,
+        config={
+            "relay.ext.ethos-n.options": {
+                "variant": get_ethosn_variant(),
+                "inline_non_compute_intensive_partitions": optimize_partitions,
+            }
+        },
     ):
         with tvm.target.Target("llvm"):
             if npu:
@@ -228,8 +236,20 @@ def run(lib, inputs, outputs, npu=True):
     return out
 
 
-def build_and_run(mod, inputs, outputs, params, npu=True, expected_host_ops=0, npu_partitions=1):
-    lib = build(mod, params, npu, expected_host_ops, npu_partitions)
+def build_and_run(
+    mod,
+    inputs,
+    outputs,
+    params,
+    npu=True,
+    expected_host_ops=0,
+    npu_partitions=1,
+    optimize_partitions=True,
+):
+    """
+    Build `mod` via `build` (forwarding every build option) and execute the result via `run`.
+    """
+    built_lib = build(mod, params, npu, expected_host_ops, npu_partitions, optimize_partitions)
+    return run(built_lib, inputs, outputs, npu)
 
 

diff --git a/tests/python/contrib/test_ethosn/test_addition.py b/tests/python/contrib/test_ethosn/test_addition.py
@@ -111,7 +111,7 @@ def test_addition(dtype, shape):
     model = _get_model(shape, shape, lhs_zp, lhs_sc, rhs_zp, rhs_sc, out_zp, out_sc, dtype)
     for npu in [False, True]:
         mod = tei.make_module(model, [])
-        outputs.append(tei.build_and_run(mod, inputs, 1, {}, npu=npu))
+        outputs.append(tei.build_and_run(mod, inputs, 1, {}, npu=npu, optimize_partitions=False))
 
     tei.verify(outputs, dtype, 1)
 
@@ -227,7 +227,7 @@ def test_addition_to_reinterpret_quantize(lhs_shape, lhs_is_constant, rhs_shape,
     outputs = []
     for npu in [False, True]:
         mod = tei.make_module(model, {})
-        outputs.append(tei.build_and_run(mod, inputs, 1, {}, npu=npu))
+        outputs.append(tei.build_and_run(mod, inputs, 1, {}, npu=npu, optimize_partitions=False))
     tei.verify(outputs, dtype, 1)
 
 

diff --git a/tests/python/contrib/test_ethosn/test_concatenate.py b/tests/python/contrib/test_ethosn/test_concatenate.py
@@ -76,7 +76,7 @@ def test_concatenate(dtype, shapes, axis):
     for npu in [False, True]:
         model = _get_model(shapes, dtype, axis)
         mod = tei.make_module(model, {})
-        outputs.append(tei.build_and_run(mod, inputs, 1, {}, npu=npu))
+        outputs.append(tei.build_and_run(mod, inputs, 1, {}, npu=npu, optimize_partitions=False))
 
         tei.verify(outputs, dtype, 0)
 

diff --git a/tests/python/contrib/test_ethosn/test_depth_to_space.py b/tests/python/contrib/test_ethosn/test_depth_to_space.py
@@ -53,7 +53,7 @@ def test_depth_to_space(dtype, shape):
     for npu in [False, True]:
         model = _get_model(shape, 2, dtype, "NHWC")
         mod = tei.make_module(model, {})
-        outputs.append(tei.build_and_run(mod, inputs, 1, {}, npu=npu))
+        outputs.append(tei.build_and_run(mod, inputs, 1, {}, npu=npu, optimize_partitions=False))
 
     tei.verify(outputs, dtype, 1)