From 37df763d4abd5d3a2e1d74a237322443b6fdf297 Mon Sep 17 00:00:00 2001
From: Luke Hutton <luke.hutton@arm.com>
Date: Wed, 12 Oct 2022 14:06:53 +0000
Subject: [PATCH 1/4] [ETHOSN] Inline non-compute-intensive partitions

Adds a pass that analyzes functions partitioned for the NPU and inlines
those that are deemed "non-compute-intensive" back to the main function
so that they can be considered for other backends. The current heuristic
for deciding a non-compute-intensive function is to check that none of
the operations in the function are multiply-accumulate operations.
This heuristic is not optimal; optimization is left for future
exploration.

This pass is inspired by the "IsComputeIntensiveGraph" pass in the
TensorRT integration.

Change-Id: I20c197702f5252f102cfc1e4b4635ab836aa7835
---
 python/tvm/relay/op/contrib/ethosn.py         |  55 ++++--
 .../backend/contrib/ethosn/codegen_ethosn.h   |  17 ++
 .../contrib/ethosn/inline_partitions.cc       | 120 +++++++++++++
 .../contrib/test_ethosn/infrastructure.py     |  28 ++-
 .../contrib/test_ethosn/test_addition.py      |   2 +-
 .../contrib/test_ethosn/test_concatenate.py   |   2 +-
 .../test_ethosn/test_inline_partitions.py     | 167 ++++++++++++++++++
 .../contrib/test_ethosn/test_leaky_relu.py    |   2 +-
 .../contrib/test_ethosn/test_multiply.py      |   4 +-
 tests/python/contrib/test_ethosn/test_relu.py |   2 +-
 .../contrib/test_ethosn/test_requantize.py    |   4 +-
 .../contrib/test_ethosn/test_reshape.py       |   6 +-
 .../python/contrib/test_ethosn/test_split.py  |   4 +-
 tests/python/contrib/test_ethosn/test_tanh.py |   2 +-
 .../contrib/test_ethosn/test_topologies.py    |  34 +++-
 15 files changed, 418 insertions(+), 31 deletions(-)
 create mode 100644 src/relay/backend/contrib/ethosn/inline_partitions.cc
 create mode 100644 tests/python/contrib/test_ethosn/test_inline_partitions.py

diff --git a/python/tvm/relay/op/contrib/ethosn.py b/python/tvm/relay/op/contrib/ethosn.py
index 80cc1ca3b202..5952d21e7a37 100644
--- a/python/tvm/relay/op/contrib/ethosn.py
+++ b/python/tvm/relay/op/contrib/ethosn.py
@@ -64,14 +64,42 @@ def ConvertEquivalents() -> tvm.ir.IRModule:  # pylint: disable=invalid-name
     """Converts operations into a numerically equivalent form
     that can be understood by the NPU codegen.
 
-    Return
-    ------
+    Returns
+    -------
     Pass
         The module pass.
     """
     return _ethosn.ConvertEquivalents()
 
 
+def InlineNonComputeIntensivePartitions() -> tvm.ir.IRModule:  # pylint: disable=invalid-name
+    """This pass checks whether functions partitioned for the NPU are considered
+    non-compute intensive. If they are, they will be unpartitioned and passed onto
+    other backends to consider.
+
+    A partitioned function is currently considered non-compute intensive if it contains
+    no multiply accumulate operations.
+
+    Returns
+    -------
+    Pass
+        The module pass.
+    """
+    return _ethosn.InlineNonComputeIntensivePartitions()
+
+
+def inline_non_compute_intensive_partitions() -> bool:
+    """
+    Determine whether to inline none-compute-intensive partitions.
+
+    Returns
+    -------
+    True if inlining should happen, False if not.
+    """
+    compiler_attrs = tvm.get_global_func("relay.ext.ethos-n.get_compiler_attrs")()
+    return compiler_attrs.inline_non_compute_intensive_partitions
+
+
 def partition_for_ethosn(mod, params=None, **opts):
     """Partition the graph greedily offloading supported
     operators to Arm Ethos-N NPU.
@@ -112,17 +140,18 @@ def partition_for_ethosn(mod, params=None, **opts):
     if params:
         mod["main"] = bind_params_by_name(mod["main"], params)
 
-    seq = tvm.transform.Sequential(
-        [
-            transform.InferType(),
-            transform.MergeComposite(pattern_table()),
-            transform.AnnotateTarget("ethos-n"),
-            transform.MergeCompilerRegions(),
-            transform.PartitionGraph(),
-            ConvertEquivalents(),
-        ]
-    )
-    return seq(mod)
+    passes = [
+        transform.InferType(),
+        transform.MergeComposite(pattern_table()),
+        transform.AnnotateTarget("ethos-n"),
+        transform.MergeCompilerRegions(),
+        transform.PartitionGraph(),
+        ConvertEquivalents(),
+    ]
+    if inline_non_compute_intensive_partitions():
+        passes.append(InlineNonComputeIntensivePartitions())
+
+    return tvm.transform.Sequential(passes)(mod)
 
 
 @register_pattern_table("ethos-n")
diff --git a/src/relay/backend/contrib/ethosn/codegen_ethosn.h b/src/relay/backend/contrib/ethosn/codegen_ethosn.h
index ab853599aa2d..c640db47b6dd 100644
--- a/src/relay/backend/contrib/ethosn/codegen_ethosn.h
+++ b/src/relay/backend/contrib/ethosn/codegen_ethosn.h
@@ -251,6 +251,7 @@ struct EthosnCompilerConfigNode : public tvm::AttrsNode<EthosnCompilerConfigNode
   bool enable_intermediate_compression;
   bool disable_winograd;
   String debug_dir;
+  bool inline_non_compute_intensive_partitions;
 
   TVM_DECLARE_ATTRS(EthosnCompilerConfigNode, "ext.attrs.EthosnCompilerConfigNode") {
     TVM_ATTR_FIELD(variant).describe("See Ethos-N documentation.").set_default("n78");
@@ -278,6 +279,12 @@ struct EthosnCompilerConfigNode : public tvm::AttrsNode<EthosnCompilerConfigNode
     TVM_ATTR_FIELD(enable_intermediate_compression).set_default(true);
     TVM_ATTR_FIELD(disable_winograd).set_default(false);
     TVM_ATTR_FIELD(debug_dir).set_default(".");
+    TVM_ATTR_FIELD(inline_non_compute_intensive_partitions)
+        .describe(
+            "A heuristic to improve performance. Inlines functions partitioned for Arm(R) "
+            "Ethos(TM)-N that are deemed 'non-compute-intensive'. The inlined functions will "
+            "continue through TVM's standard compilation flow.")
+        .set_default(true);
   }
 };
 
@@ -289,6 +296,16 @@ class EthosnCompilerConfig : public Attrs {
 TVM_REGISTER_NODE_TYPE(EthosnCompilerConfigNode);
 TVM_REGISTER_PASS_CONFIG_OPTION("relay.ext.ethos-n.options", EthosnCompilerConfig);
 
+auto GetCompilerAttrs() {
+  auto ctx = transform::PassContext::Current();
+  auto cfg = ctx->GetConfig<EthosnCompilerConfig>("relay.ext.ethos-n.options");
+  if (!cfg.defined()) {
+    cfg = AttrsWithDefaultValues<EthosnCompilerConfig>();
+  }
+  return cfg;
+}
+TVM_REGISTER_GLOBAL("relay.ext.ethos-n.get_compiler_attrs").set_body_typed(GetCompilerAttrs);
+
 /*! \brief The compiler for Ethos-N functions */
 class EthosnCompiler {
  public:
diff --git a/src/relay/backend/contrib/ethosn/inline_partitions.cc b/src/relay/backend/contrib/ethosn/inline_partitions.cc
new file mode 100644
index 000000000000..8dc7ebb1116f
--- /dev/null
+++ b/src/relay/backend/contrib/ethosn/inline_partitions.cc
@@ -0,0 +1,120 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ * \file src/relay/backend/contrib/ethosn/inline_partitions.cc
+ * \brief A pass to inline NPU partitions that are not considered compute
+ * intensive.
+ */
+
+#include <tvm/relay/expr.h>
+#include <tvm/relay/expr_functor.h>
+
+#include "../../../transforms/compiler_function_utils.h"
+
+namespace tvm {
+namespace relay {
+namespace contrib {
+namespace ethosn {
+
+class IsComputeIntensivePartition : MixedModeVisitor {
+ public:
+  /*!
+   * \brief Check if the partitioned function is compute
+   * intensive. If it has no multiply-accumulate operations
+   * it is not considered compute intensive.
+   *
+   * \param expr The partitioned function to check.
+   */
+  bool CheckSubgraph(const Expr& expr) {
+    is_compute_intensive = false;
+    VisitExpr(expr);
+    return is_compute_intensive;
+  }
+
+  /*!
+   * \brief Visit the call nodes of a partitioned function
+   * and check if operators or composite functions make the
+   * partitioned function compute intensive.
+   *
+   * \param op The call node to check.
+   */
+  void VisitExpr_(const CallNode* op) override {
+    Call call = GetRef<Call>(op);
+    std::string op_name = "";
+    if (const auto* op = call->op.as<OpNode>()) {
+      op_name = op->name;
+    } else if (const auto* func = call->op.as<FunctionNode>()) {
+      op_name = func->GetAttr<String>(attr::kComposite, "").value();
+    }
+
+    if (op_name != "") {
+      if (compute_intensive_operators.find(op_name) != compute_intensive_operators.end()) {
+        is_compute_intensive = true;
+      }
+    }
+  }
+
+ private:
+  /*! \brief Whether or not the partitioned function is considered compute intensive. */
+  bool is_compute_intensive;
+  /*! \brief A set of operators considered compute intensive. */
+  const std::unordered_set<std::string> compute_intensive_operators{
+      "ethos-n.qnn_add",        "ethos-n.qnn_conv2d",  "ethos-n.qnn_conv2d_transpose",
+      "ethos-n.qnn_avg_pool2d", "ethos-n.qnn_sigmoid", "ethos-n.qnn_fc",
+      "ethos-n.qnn_mean",       "ethos-n.qnn_resize",  "nn.max_pool2d",
+      "nn.depth_to_space"};
+};
+
+/*!
+ * \brief This pass checks whether functions partitioned for the NPU are considered
+ * non-compute intensive. If they are, they will be unpartitioned and passed onto
+ * other backends to consider.
+ *
+ * A partitioned function is currently considered non-compute intensive if it contains
+ * no multiply accumulate operations. Note that this is not an optimal heuristic,
+ * however, it will not degrade performance.
+ */
+tvm::transform::Pass InlineNonComputeIntensivePartitions() {
+  runtime::TypedPackedFunc<IRModule(IRModule, tvm::transform::PassContext)> pass_func =
+      [=](IRModule mod, tvm::transform::PassContext ctx) {
+        auto analyzer = IsComputeIntensivePartition();
+        Array<GlobalVar> gvs_to_inline;
+        for (auto gv : mod->GetGlobalVars()) {
+          Function func = Downcast<Function>(mod->Lookup(gv));
+          auto compiler_name = func->GetAttr<String>(attr::kCompiler);
+          if (compiler_name.defined() && compiler_name == "ethos-n") {
+            if (!analyzer.CheckSubgraph(func->body)) {
+              gvs_to_inline.push_back(gv);
+            }
+          }
+        }
+        return relay::transform::InlineCompilerFunctionsBoundTo(gvs_to_inline)(mod);
+      };
+  return tvm::transform::CreateModulePass(
+      pass_func, 0, "relay.backend.contrib.ethos-n.InlineNonComputeIntensivePartitions", {});
+}
+
+TVM_REGISTER_GLOBAL("relay.backend.contrib.ethos-n.InlineNonComputeIntensivePartitions")
+    .set_body_typed(InlineNonComputeIntensivePartitions);
+
+}  // namespace ethosn
+}  // namespace contrib
+}  // namespace relay
+}  // namespace tvm
diff --git a/tests/python/contrib/test_ethosn/infrastructure.py b/tests/python/contrib/test_ethosn/infrastructure.py
index 6b019686968e..85ebd98efcff 100644
--- a/tests/python/contrib/test_ethosn/infrastructure.py
+++ b/tests/python/contrib/test_ethosn/infrastructure.py
@@ -143,7 +143,7 @@ def visit_call(self, call):
     return c.count
 
 
-def build(mod, params, npu=True, expected_host_ops=0, npu_partitions=1):
+def build(mod, params, npu=True, expected_host_ops=0, npu_partitions=1, optimize_partitions=True):
     """Build a network with or without Ethos-N offloading.
 
     Parameters
@@ -158,10 +158,18 @@ def build(mod, params, npu=True, expected_host_ops=0, npu_partitions=1):
         The number of ops expected to remain on the host.
     npu_partitions : int, optional
         The number of Ethos-N partitions expected.
+    optimize_partitions : bool, optional
+        Whether to run the pass that optimizes NPU partitions post partitioning.
     """
     relay.backend.te_compiler.get().clear()
     with tvm.transform.PassContext(
-        opt_level=3, config={"relay.ext.ethos-n.options": {"variant": get_ethosn_variant()}}
+        opt_level=3,
+        config={
+            "relay.ext.ethos-n.options": {
+                "variant": get_ethosn_variant(),
+                "inline_non_compute_intensive_partitions": optimize_partitions,
+            }
+        },
     ):
         with tvm.target.Target("llvm"):
             if npu:
@@ -228,8 +236,20 @@ def run(lib, inputs, outputs, npu=True):
     return out
 
 
-def build_and_run(mod, inputs, outputs, params, npu=True, expected_host_ops=0, npu_partitions=1):
-    lib = build(mod, params, npu, expected_host_ops, npu_partitions)
+def build_and_run(
+    mod,
+    inputs,
+    outputs,
+    params,
+    npu=True,
+    expected_host_ops=0,
+    npu_partitions=1,
+    optimize_partitions=True,
+):
+    """
+    Convenient wrapper for building and running a module on the NPU.
+    """
+    lib = build(mod, params, npu, expected_host_ops, npu_partitions, optimize_partitions)
     return run(lib, inputs, outputs, npu)
 
 
diff --git a/tests/python/contrib/test_ethosn/test_addition.py b/tests/python/contrib/test_ethosn/test_addition.py
index 11d8b8d1cd56..76bda5e0b7d4 100644
--- a/tests/python/contrib/test_ethosn/test_addition.py
+++ b/tests/python/contrib/test_ethosn/test_addition.py
@@ -227,7 +227,7 @@ def test_addition_to_reinterpret_quantize(lhs_shape, lhs_is_constant, rhs_shape,
     outputs = []
     for npu in [False, True]:
         mod = tei.make_module(model, {})
-        outputs.append(tei.build_and_run(mod, inputs, 1, {}, npu=npu))
+        outputs.append(tei.build_and_run(mod, inputs, 1, {}, npu=npu, optimize_partitions=False))
     tei.verify(outputs, dtype, 1)
 
 
diff --git a/tests/python/contrib/test_ethosn/test_concatenate.py b/tests/python/contrib/test_ethosn/test_concatenate.py
index 0389b3c5b103..f8521b595060 100644
--- a/tests/python/contrib/test_ethosn/test_concatenate.py
+++ b/tests/python/contrib/test_ethosn/test_concatenate.py
@@ -76,7 +76,7 @@ def test_concatenate(dtype, shapes, axis):
     for npu in [False, True]:
         model = _get_model(shapes, dtype, axis)
         mod = tei.make_module(model, {})
-        outputs.append(tei.build_and_run(mod, inputs, 1, {}, npu=npu))
+        outputs.append(tei.build_and_run(mod, inputs, 1, {}, npu=npu, optimize_partitions=False))
 
         tei.verify(outputs, dtype, 0)
 
diff --git a/tests/python/contrib/test_ethosn/test_inline_partitions.py b/tests/python/contrib/test_ethosn/test_inline_partitions.py
new file mode 100644
index 000000000000..79c35fc5bcb2
--- /dev/null
+++ b/tests/python/contrib/test_ethosn/test_inline_partitions.py
@@ -0,0 +1,167 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+"""
+Tests for the 'InlineNonComputeIntensivePartitions' pass.
+"""
+
+import tvm
+from tvm import relay
+from tvm.testing import requires_ethosn
+from tvm.relay.op.contrib.ethosn import InlineNonComputeIntensivePartitions
+
+from . import infrastructure as tei
+
+
+def _assert_structural_equal(a, b):
+    """Check structural equality of two Relay expressions."""
+    reason = (
+        "Actual and expected relay functions are not equal. "
+        "InlineNonComputeIntensiveSubgraphs is not correctly "
+        "transforming the input graph."
+    )
+    assert tvm.ir.structural_equal(a, b, map_free_vars=True), reason
+
+
+@requires_ethosn
+def test_single_reshape():
+    """Check that a single reshape is inlined correctly."""
+
+    def get_reshape():
+        x = relay.var("x", shape=(1, 2, 2, 4), dtype="int8")
+        return relay.reshape(x, newshape=(2, 2, 4))
+
+    def before():
+        reshape = get_reshape()
+        return tei.make_ethosn_partition(reshape)
+
+    def expected():
+        reshape = get_reshape()
+        mod = tvm.IRModule.from_expr(reshape)
+        return relay.transform.InferType()(mod)
+
+    mod = before()
+    mod = InlineNonComputeIntensivePartitions()(mod)
+    expected_mod = expected()
+    _assert_structural_equal(mod, expected_mod)
+
+
+@requires_ethosn
+def test_multiple_non_compute_intensive_ops():
+    """
+    Check that a partitioned function is correctly inlined
+    when it contains multiple non-compute intensive operations.
+    """
+
+    def get_graph():
+        x = relay.var("x", shape=(2, 2, 4), dtype="int8")
+        x = relay.reshape(x, newshape=(1, 2, 2, 4))
+        x = relay.clip(x, 0.0, 1.0)
+        x = relay.reshape(x, newshape=(2, 2, 4))
+        return relay.clip(x, 0.0, 1.0)
+
+    def before():
+        func = get_graph()
+        return tei.make_ethosn_partition(func)
+
+    def expected():
+        func = get_graph()
+        mod = tvm.IRModule.from_expr(func)
+        return relay.transform.InferType()(mod)
+
+    mod = before()
+    mod = InlineNonComputeIntensivePartitions()(mod)
+    expected_mod = expected()
+    _assert_structural_equal(mod, expected_mod)
+
+
+@requires_ethosn
+def test_compute_intensive_ops():
+    """
+    Check that a partitioned function that is considered
+    compute intensive is not inlined.
+    """
+
+    def before():
+        x = relay.var("x", shape=(1, 2, 2, 4), dtype="int8")
+        x = relay.nn.max_pool2d(x, layout="NHWC")
+        x = relay.reshape(x, newshape=(2, 2, 4))
+        return tei.make_ethosn_partition(x)
+
+    mod = before()
+    transformed_mod = InlineNonComputeIntensivePartitions()(mod)
+    for global_var in mod.get_global_vars():
+        _assert_structural_equal(mod[global_var], transformed_mod[global_var])
+
+
+@requires_ethosn
+def test_multiple_partitioned_functions():
+    """
+    Tests the pass on a number of partitioned functions.
+    """
+
+    def before():
+        composite_func_name = "ethos-n_0"
+        inp = relay.var("x", shape=(1, 2, 2, 4), dtype="int8")
+
+        # partitioned func 1 (non compute intensive)
+        x = relay.reshape(inp, newshape=(1, 2, 2, 4))
+        partitioned_func_1 = tei.make_ethosn_partition(x)[composite_func_name]
+        gv_1 = relay.GlobalVar("ethos-n_0")
+
+        # partitioned func 2 (compute intensive)
+        x = relay.nn.max_pool2d(inp, layout="NHWC")
+        partitioned_func_2 = tei.make_ethosn_partition(x)[composite_func_name]
+        gv_2 = relay.GlobalVar("ethos-n_1")
+
+        # partitioned func 3 (non compute intensive)
+        x = relay.clip(inp, 0.0, 1.0)
+        partitioned_func_3 = tei.make_ethosn_partition(x)[composite_func_name]
+        gv_3 = relay.GlobalVar("ethos-n_2")
+
+        mod = tvm.IRModule({})
+        mod[gv_1] = partitioned_func_1
+        mod[gv_2] = partitioned_func_2
+        mod[gv_3] = partitioned_func_3
+        main_expr = relay.Call(gv_1, [inp])
+        main_expr = relay.Call(gv_2, [main_expr])
+        main_expr = relay.Call(gv_3, [main_expr])
+        mod["main"] = relay.Function([inp], main_expr)
+        return relay.transform.InferType()(mod)
+
+    def expected():
+        composite_func_name = "ethos-n_0"
+        inp = relay.var("x", shape=(1, 2, 2, 4), dtype="int8")
+
+        # partitioned func 2 (compute intensive)
+        x = relay.nn.max_pool2d(inp, layout="NHWC")
+        partitioned_func_2 = tei.make_ethosn_partition(x)[composite_func_name]
+        gv_2 = relay.GlobalVar("ethos-n_1")
+
+        mod = tvm.IRModule({})
+        mod[gv_2] = partitioned_func_2
+        main_expr = relay.reshape(inp, newshape=(1, 2, 2, 4))
+        main_expr = relay.Call(gv_2, [main_expr])
+        main_expr = relay.clip(main_expr, 0.0, 1.0)
+        mod["main"] = relay.Function([inp], main_expr)
+        return relay.transform.InferType()(mod)
+
+    mod = before()
+    mod = InlineNonComputeIntensivePartitions()(mod)
+    expected_mod = expected()
+    for global_var in mod.get_global_vars():
+        _assert_structural_equal(mod[global_var.name_hint], expected_mod[global_var.name_hint])
diff --git a/tests/python/contrib/test_ethosn/test_leaky_relu.py b/tests/python/contrib/test_ethosn/test_leaky_relu.py
index 3c3bbc709679..7c1969ec44ba 100644
--- a/tests/python/contrib/test_ethosn/test_leaky_relu.py
+++ b/tests/python/contrib/test_ethosn/test_leaky_relu.py
@@ -65,7 +65,7 @@ def test_leaky_relu(dtype, shape, alpha):
     for npu in [False, True]:
         model = _get_model(shape, input_zp, input_sc, output_zp, output_sc, dtype, alpha)
         mod = tei.make_module(model, [])
-        outputs.append(tei.build_and_run(mod, inputs, 1, {}, npu=npu))
+        outputs.append(tei.build_and_run(mod, inputs, 1, {}, npu=npu, optimize_partitions=False))
 
     tei.verify(outputs, dtype, 1)
 
diff --git a/tests/python/contrib/test_ethosn/test_multiply.py b/tests/python/contrib/test_ethosn/test_multiply.py
index 41c06092447a..a7b97e39cb13 100644
--- a/tests/python/contrib/test_ethosn/test_multiply.py
+++ b/tests/python/contrib/test_ethosn/test_multiply.py
@@ -151,7 +151,9 @@ def test_multiply_to_reinterpret_quantize(shape, constant_shape, reverse_inputs)
     outputs = []
     for npu in [False, True]:
         mod = tei.make_module(model, params)
-        outputs.append(tei.build_and_run(mod, inputs, 1, params, npu=npu))
+        outputs.append(
+            tei.build_and_run(mod, inputs, 1, params, npu=npu, optimize_partitions=False)
+        )
 
     tei.verify(outputs, dtype, 1)
 
diff --git a/tests/python/contrib/test_ethosn/test_relu.py b/tests/python/contrib/test_ethosn/test_relu.py
index db1894931dd9..8ecea0d23ce4 100644
--- a/tests/python/contrib/test_ethosn/test_relu.py
+++ b/tests/python/contrib/test_ethosn/test_relu.py
@@ -60,7 +60,7 @@ def test_relu(dtype, shape, a_min, a_max):
     for npu in [False, True]:
         model = _get_model(inputs["a"].shape, dtype, a_min, a_max)
         mod = tei.make_module(model, {})
-        outputs.append(tei.build_and_run(mod, inputs, 1, {}, npu=npu))
+        outputs.append(tei.build_and_run(mod, inputs, 1, {}, npu=npu, optimize_partitions=False))
 
     tei.verify(outputs, dtype, 1)
 
diff --git a/tests/python/contrib/test_ethosn/test_requantize.py b/tests/python/contrib/test_ethosn/test_requantize.py
index 3187c22f3391..618b00c6e4ee 100644
--- a/tests/python/contrib/test_ethosn/test_requantize.py
+++ b/tests/python/contrib/test_ethosn/test_requantize.py
@@ -64,7 +64,7 @@ def test_requantize(in_dtype, out_dtype, shape):
             out_dtype=out_dtype,
         )
         mod = tei.make_module(model, [])
-        x = tei.build_and_run(mod, inputs, 1, {}, npu=npu)
+        x = tei.build_and_run(mod, inputs, 1, {}, npu=npu, optimize_partitions=False)
         outputs.append(x)
 
     tei.verify(outputs, out_dtype, 1)
@@ -128,7 +128,7 @@ def get_model():
     for npu in [False, True]:
         model = get_model()
         mod = tei.make_module(model, {})
-        x = tei.build_and_run(mod, inputs, 1, {}, npu=npu)
+        x = tei.build_and_run(mod, inputs, 1, {}, npu=npu, optimize_partitions=False)
         outputs.append(x)
 
     tei.verify(outputs, out_dtype, 1)
diff --git a/tests/python/contrib/test_ethosn/test_reshape.py b/tests/python/contrib/test_ethosn/test_reshape.py
index 2d6eae9b2522..d60ad50b97bc 100644
--- a/tests/python/contrib/test_ethosn/test_reshape.py
+++ b/tests/python/contrib/test_ethosn/test_reshape.py
@@ -71,7 +71,9 @@ def test_reshape(dtype, input_shape, output_shape):
     for npu in [False, True]:
         model, params = _get_model(input_shape, output_shape, dtype)
         mod = tei.make_module(model, params)
-        outputs.append(tei.build_and_run(mod, inputs, 1, params, npu=npu))
+        outputs.append(
+            tei.build_and_run(mod, inputs, 1, params, npu=npu, optimize_partitions=False)
+        )
 
     tei.verify(outputs, dtype, 1)
 
@@ -91,4 +93,4 @@ def test_reshape_failure(input_shape, output_shape):
 
     model, params = _get_model(input_shape, output_shape, "int8")
     mod = tei.make_module(model, params)
-    tei.build(mod, params, expected_host_ops=1, npu_partitions=0)
+    tei.build(mod, params, expected_host_ops=1, npu_partitions=0, optimize_partitions=False)
diff --git a/tests/python/contrib/test_ethosn/test_split.py b/tests/python/contrib/test_ethosn/test_split.py
index 57335feadbba..56e51e2de159 100644
--- a/tests/python/contrib/test_ethosn/test_split.py
+++ b/tests/python/contrib/test_ethosn/test_split.py
@@ -56,7 +56,9 @@ def test_split(dtype, shape, splits, axis):
         model = _get_model(shape, dtype, splits, axis)
         mod = tei.make_module(model, {})
         output_count = splits if isinstance(splits, int) else len(splits) + 1
-        outputs.append(tei.build_and_run(mod, inputs, output_count, {}, npu=npu))
+        outputs.append(
+            tei.build_and_run(mod, inputs, output_count, {}, npu=npu, optimize_partitions=False)
+        )
 
         tei.verify(outputs, dtype, 0)
 
diff --git a/tests/python/contrib/test_ethosn/test_tanh.py b/tests/python/contrib/test_ethosn/test_tanh.py
index 68170601c5f8..c2fc5188e5f1 100644
--- a/tests/python/contrib/test_ethosn/test_tanh.py
+++ b/tests/python/contrib/test_ethosn/test_tanh.py
@@ -59,7 +59,7 @@ def test_tanh(dtype, shape):
     for npu in [False, True]:
         model = _get_model(shape, zp_min + 120, 0.0250629, zp_min + 128, 0.0078125, dtype)
         mod = tei.make_module(model, [])
-        outputs.append(tei.build_and_run(mod, inputs, 1, {}, npu=npu))
+        outputs.append(tei.build_and_run(mod, inputs, 1, {}, npu=npu, optimize_partitions=False))
 
     tei.verify(outputs, dtype, 1)
 
diff --git a/tests/python/contrib/test_ethosn/test_topologies.py b/tests/python/contrib/test_ethosn/test_topologies.py
index 6425eb0faba3..a16a453d0f85 100644
--- a/tests/python/contrib/test_ethosn/test_topologies.py
+++ b/tests/python/contrib/test_ethosn/test_topologies.py
@@ -183,7 +183,7 @@ def get_model(input_shape, dtype, var_names):
     for npu in [False, True]:
         model = get_model(inputs["a"].shape, dtype, iter(inputs))
         mod = tei.make_module(model, [])
-        outputs.append(tei.build_and_run(mod, inputs, 8, {}, npu=npu))
+        outputs.append(tei.build_and_run(mod, inputs, 8, {}, npu=npu, optimize_partitions=False))
 
     tei.verify(outputs, dtype, 1)
 
@@ -291,6 +291,7 @@ def get_model(shape, dtype, splits, axis):
                 npu=npu,
                 expected_host_ops=expected_host_ops,
                 npu_partitions=npu_partitions,
+                optimize_partitions=False,
             )
         else:
             outputs.append(
@@ -302,6 +303,7 @@ def get_model(shape, dtype, splits, axis):
                     npu=npu,
                     expected_host_ops=expected_host_ops,
                     npu_partitions=npu_partitions,
+                    optimize_partitions=False,
                 )
             )
 
@@ -332,7 +334,7 @@ def get_model(dtype):
     for npu in [False, True]:
         model = get_model(dtype)
         mod = tei.make_module(model, {})
-        outputs.append(tei.build_and_run(mod, inputs, 4, {}, npu=npu))
+        outputs.append(tei.build_and_run(mod, inputs, 4, {}, npu=npu, optimize_partitions=False))
 
     tei.verify(outputs, dtype, 0)
 
@@ -381,7 +383,33 @@ def get_model(shapes, dtype, axis):
             mod = tei.make_module(model, {})
         else:
             mod = tei.make_ethosn_partition(model)
-        lib = tei.build(mod, {}, npu=False)
+        lib = tei.build(mod, {}, npu=False, optimize_partitions=False)
         outputs.append(tei.run(lib, inputs, 1, npu=npu))
 
     tei.verify(outputs, dtype, 0)
+
+
+@requires_ethosn
+def test_inline_non_compute_intensive_operations():
+    """Tests the case when a subgraph is unpartitioned."""
+    np.random.seed(0)
+    dtype = "int8"
+    shape = (1, 2, 2, 4)
+
+    inp = relay.var("x", shape=shape, dtype=dtype)
+    reshape = relay.reshape(inp, newshape=(1, 1, 4, 4))
+
+    inputs = {
+        "x": tvm.nd.array(
+            np.random.randint(np.iinfo(dtype).min, np.iinfo(dtype).max + 1, size=shape, dtype=dtype)
+        ),
+    }
+    outputs = []
+
+    for npu in [False, True]:
+        mod = tei.make_module(reshape, {})
+        outputs.append(
+            tei.build_and_run(mod, inputs, 1, {}, npu=npu, expected_host_ops=1, npu_partitions=0)
+        )
+
+    tei.verify(outputs, dtype, 0)

From 3defac896abc41d4367251ee5ee23134f08709cc Mon Sep 17 00:00:00 2001
From: Luke Hutton <luke.hutton@arm.com>
Date: Thu, 20 Oct 2022 10:55:48 +0000
Subject: [PATCH 2/4] Address comments

* 'inline_non_compute_intensive_partitions' ->
  'is_inline_non_compute_intensive_partitions_enabled'.
* remove non-MAC operations from the compute-intensive operator set.
* fix network test.

Change-Id: Ie1015b27f37e47544bed6f0aff819ee4649de579
---
 python/tvm/relay/op/contrib/ethosn.py                |  4 ++--
 .../backend/contrib/ethosn/inline_partitions.cc      | 12 ++++++------
 tests/python/contrib/test_ethosn/test_networks.py    |  5 ++---
 3 files changed, 10 insertions(+), 11 deletions(-)

diff --git a/python/tvm/relay/op/contrib/ethosn.py b/python/tvm/relay/op/contrib/ethosn.py
index 5952d21e7a37..9afab68ccd8f 100644
--- a/python/tvm/relay/op/contrib/ethosn.py
+++ b/python/tvm/relay/op/contrib/ethosn.py
@@ -88,7 +88,7 @@ def InlineNonComputeIntensivePartitions() -> tvm.ir.IRModule:  # pylint: disable
     return _ethosn.InlineNonComputeIntensivePartitions()
 
 
-def inline_non_compute_intensive_partitions() -> bool:
+def is_inline_non_compute_intensive_partitions_enabled() -> bool:
     """
     Determine whether to inline none-compute-intensive partitions.
 
@@ -148,7 +148,7 @@ def partition_for_ethosn(mod, params=None, **opts):
         transform.PartitionGraph(),
         ConvertEquivalents(),
     ]
-    if inline_non_compute_intensive_partitions():
+    if is_inline_non_compute_intensive_partitions_enabled():
         passes.append(InlineNonComputeIntensivePartitions())
 
     return tvm.transform.Sequential(passes)(mod)
diff --git a/src/relay/backend/contrib/ethosn/inline_partitions.cc b/src/relay/backend/contrib/ethosn/inline_partitions.cc
index 8dc7ebb1116f..739515503c0f 100644
--- a/src/relay/backend/contrib/ethosn/inline_partitions.cc
+++ b/src/relay/backend/contrib/ethosn/inline_partitions.cc
@@ -76,10 +76,11 @@ class IsComputeIntensivePartition : MixedModeVisitor {
   bool is_compute_intensive;
   /*! \brief A set of operators considered compute intensive. */
   const std::unordered_set<std::string> compute_intensive_operators{
-      "ethos-n.qnn_add",        "ethos-n.qnn_conv2d",  "ethos-n.qnn_conv2d_transpose",
-      "ethos-n.qnn_avg_pool2d", "ethos-n.qnn_sigmoid", "ethos-n.qnn_fc",
-      "ethos-n.qnn_mean",       "ethos-n.qnn_resize",  "nn.max_pool2d",
-      "nn.depth_to_space"};
+      "ethos-n.qnn_conv2d",     "ethos-n.qnn_conv2d_transpose",
+      "ethos-n.qnn_avg_pool2d", "ethos-n.qnn_sigmoid",
+      "ethos-n.qnn_fc",         "ethos-n.qnn_mean",
+      "ethos-n.qnn_resize",     "nn.max_pool2d",
+  };
 };
 
 /*!
@@ -88,8 +89,7 @@ class IsComputeIntensivePartition : MixedModeVisitor {
  * other backends to consider.
  *
  * A partitioned function is currently considered non-compute intensive if it contains
- * no multiply accumulate operations. Note that this is not an optimal heuristic,
- * however, it will not degrade performance.
+ * no multiply accumulate operations. Note that this is not an optimal heuristic.
  */
 tvm::transform::Pass InlineNonComputeIntensivePartitions() {
   runtime::TypedPackedFunc<IRModule(IRModule, tvm::transform::PassContext)> pass_func =
diff --git a/tests/python/contrib/test_ethosn/test_networks.py b/tests/python/contrib/test_ethosn/test_networks.py
index 5bd133ba20bb..68402cd5e8a9 100644
--- a/tests/python/contrib/test_ethosn/test_networks.py
+++ b/tests/python/contrib/test_ethosn/test_networks.py
@@ -146,7 +146,6 @@ def test_resnet_50_int8():
     # on hardware that isn't available in CI.
     _compile_hash = {
         "f16dc9caa8e696bc5da8a5c6a644eb72",
-        "6e5fcbab831607b9da1039aff4e56871",
         "41acecca37b2735bd580f6ec38d8c2e0",
     }
     _test_image_network(
@@ -156,8 +155,8 @@ def test_resnet_50_int8():
         input_dict={"input": (1, 224, 224, 3)},
         compile_hash=_compile_hash,
         output_count=1,
-        host_ops=9,
-        npu_partitions=3,
+        host_ops=10,
+        npu_partitions=2,
     )
 
 

From ea1c7404f5c0c3863e04cc80c5dd98cf899812ce Mon Sep 17 00:00:00 2001
From: Luke Hutton <luke.hutton@arm.com>
Date: Thu, 20 Oct 2022 13:31:06 +0000
Subject: [PATCH 3/4] Fix failing unit tests due to optimization

Change-Id: I0ee0af071dc77c91e0ef0f6753506cb40d1d1859
---
 .../contrib/test_ethosn/test_addition.py      |  2 +-
 .../test_ethosn/test_depth_to_space.py        |  2 +-
 .../contrib/test_ethosn/test_topologies.py    | 27 ++++++++-----------
 3 files changed, 13 insertions(+), 18 deletions(-)

diff --git a/tests/python/contrib/test_ethosn/test_addition.py b/tests/python/contrib/test_ethosn/test_addition.py
index 76bda5e0b7d4..53afd01b8449 100644
--- a/tests/python/contrib/test_ethosn/test_addition.py
+++ b/tests/python/contrib/test_ethosn/test_addition.py
@@ -111,7 +111,7 @@ def test_addition(dtype, shape):
     model = _get_model(shape, shape, lhs_zp, lhs_sc, rhs_zp, rhs_sc, out_zp, out_sc, dtype)
     for npu in [False, True]:
         mod = tei.make_module(model, [])
-        outputs.append(tei.build_and_run(mod, inputs, 1, {}, npu=npu))
+        outputs.append(tei.build_and_run(mod, inputs, 1, {}, npu=npu, optimize_partitions=False))
 
     tei.verify(outputs, dtype, 1)
 
diff --git a/tests/python/contrib/test_ethosn/test_depth_to_space.py b/tests/python/contrib/test_ethosn/test_depth_to_space.py
index 732932d8f324..814693b664ca 100644
--- a/tests/python/contrib/test_ethosn/test_depth_to_space.py
+++ b/tests/python/contrib/test_ethosn/test_depth_to_space.py
@@ -53,7 +53,7 @@ def test_depth_to_space(dtype, shape):
     for npu in [False, True]:
         model = _get_model(shape, 2, dtype, "NHWC")
         mod = tei.make_module(model, {})
-        outputs.append(tei.build_and_run(mod, inputs, 1, {}, npu=npu))
+        outputs.append(tei.build_and_run(mod, inputs, 1, {}, npu=npu, optimize_partitions=False))
 
     tei.verify(outputs, dtype, 1)
 
diff --git a/tests/python/contrib/test_ethosn/test_topologies.py b/tests/python/contrib/test_ethosn/test_topologies.py
index a16a453d0f85..4a4fc1e4d126 100644
--- a/tests/python/contrib/test_ethosn/test_topologies.py
+++ b/tests/python/contrib/test_ethosn/test_topologies.py
@@ -81,23 +81,18 @@ def get_model(input_shape, dtype, var_names):
         expected_host_ops = 0
         npu_partitions = 1
 
-        # Mock inference is only supported when the whole graph is offloaded to the NPU
-        if ethosn_available() == Available.SW_ONLY:
-            tei.build(
-                mod, {}, npu=npu, expected_host_ops=expected_host_ops, npu_partitions=npu_partitions
-            )
-        else:
-            outputs.append(
-                tei.build_and_run(
-                    mod,
-                    inputs,
-                    1,
-                    {},
-                    npu=npu,
-                    expected_host_ops=expected_host_ops,
-                    npu_partitions=npu_partitions,
-                )
+        outputs.append(
+            tei.build_and_run(
+                mod,
+                inputs,
+                1,
+                {},
+                npu=npu,
+                expected_host_ops=expected_host_ops,
+                npu_partitions=npu_partitions,
+                optimize_partitions=False,
             )
+        )
 
     if outputs:
         tei.verify(outputs, dtype, 2)

From bd0bf6c10ece5a3950b2d8772291db93b428f77d Mon Sep 17 00:00:00 2001
From: Luke Hutton <luke.hutton@arm.com>
Date: Fri, 21 Oct 2022 08:17:36 +0000
Subject: [PATCH 4/4] Add future exploration suggestions

Change-Id: Ie918d7f1059f032282f1f5eeffda38f4febcd59c
---
 src/relay/backend/contrib/ethosn/inline_partitions.cc | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/src/relay/backend/contrib/ethosn/inline_partitions.cc b/src/relay/backend/contrib/ethosn/inline_partitions.cc
index 739515503c0f..f8cc3fc00d10 100644
--- a/src/relay/backend/contrib/ethosn/inline_partitions.cc
+++ b/src/relay/backend/contrib/ethosn/inline_partitions.cc
@@ -90,6 +90,12 @@ class IsComputeIntensivePartition : MixedModeVisitor {
  *
  * A partitioned function is currently considered non-compute intensive if it contains
  * no multiply accumulate operations. Note that this is not an optimal heuristic.
+ *
+ * Some suggestions for future exploration:
+ * - Making a better choice about large non-compute-intensive subgraphs
+ *   as currently these are inlined.
+ * - Allowing the user to input ops that are considered compute-intensive.
+ * - Inlining "small" compute-intensive operations.
  */
 tvm::transform::Pass InlineNonComputeIntensivePartitions() {
   runtime::TypedPackedFunc<IRModule(IRModule, tvm::transform::PassContext)> pass_func =