apache · MasterJH5574 · Apr 25, 2026 · Apr 25, 2026 · gemini-code-assist · Apr 25, 2026
diff --git a/docs/how_to/tutorials/byoc_npu_example.py b/docs/how_to/tutorials/bring_your_own_codegen.py
@@ -16,21 +16,25 @@
 # under the License.
 
 """
-.. _tutorial-byoc-npu-example:
+.. _tutorial-bring-your-own-codegen:
 
 Bring Your Own Codegen: NPU Backend Example
 ===========================================
-**Author**: `Sheldon Aristide <https://github.com/Aristide021/>`_
 
-This tutorial walks through the example NPU BYOC backend included in TVM.
-It demonstrates the key concepts needed to offload operations to a custom
-accelerator: pattern registration, graph partitioning, codegen, and runtime
-dispatch.
+This tutorial shows how to integrate a custom hardware backend with TVM's
+BYOC framework, using the bundled example NPU backend (CPU emulation, no
+real hardware required) as the worked example.  You will see the key
+concepts needed to offload operations to a custom accelerator: pattern
+registration, graph partitioning, codegen, and runtime dispatch.
 
 NPUs are purpose-built accelerators designed around a fixed set of operations
 common in neural network inference, such as matrix multiplication, convolution,
 and activation functions.
-The example backend uses CPU emulation so no real NPU hardware is required.
+The example backend's runtime is a *stub*: it logs the dispatch decisions an
+NPU would make (memory tier, execution engine, fusion) but performs no real
+computation, so output buffers are uninitialized.  Assertions in this tutorial
+therefore check shapes, not values.  When you replace the runtime with your
+hardware SDK calls, the same flow produces real results.
 
 **Prerequisites**: Build TVM with ``USE_EXAMPLE_NPU_CODEGEN=ON`` and
 ``USE_EXAMPLE_NPU_RUNTIME=ON``.
@@ -58,6 +62,8 @@
 # Importing the module is enough to register all supported patterns with
 # TVM's pattern registry.
 
+import numpy as np
+
 import tvm
 import tvm.relax.backend.contrib.example_npu  # registers patterns
 from tvm import relax
@@ -69,6 +75,8 @@
 has_example_npu_runtime = tvm.get_global_func("runtime.ExampleNPUJSONRuntimeCreate", True)
 has_example_npu = has_example_npu_codegen and has_example_npu_runtime
 
+target = tvm.target.Target("llvm")
+
 patterns = get_patterns_with_prefix("example_npu")
 print("Registered patterns:", [p.name for p in patterns])
 
@@ -98,8 +106,22 @@ def main(
 # ---------------------------
 #
 # ``FuseOpsByPattern`` groups ops that match a registered pattern into
-# composite functions.  ``MergeCompositeFunctions`` consolidates them
-# so each group becomes a single external call.
+# composite functions, controlled by two flags:
+#
+# - ``bind_constants=False`` keeps weights as function arguments instead
+#   of baking them in, so the host stays in charge of parameter
+#   ownership.
+# - ``annotate_codegen=True`` tags each composite with its backend name
+#   (``example_npu``); without this tag, ``RunCodegen`` has no way to
+#   route the composite to a backend.
+#
+# ``MergeCompositeFunctions`` then consolidates adjacent composites
+# that target the same backend so each group becomes a single external
+# call.  Note that fusion into one composite is decided earlier, by the
+# patterns themselves: an ``op_a + op_b`` chain only collapses into one
+# composite if a fused pattern (e.g. ``matmul_relu_fused``) was
+# registered for it; otherwise each op stays as its own composite even
+# when both target the same backend.
 
 mod = MatmulReLU
 mod = FuseOpsByPattern(patterns, bind_constants=False, annotate_codegen=True)(mod)
@@ -130,28 +152,27 @@ def main(
     # Build the module for the host target, create a virtual machine, and
     # execute the compiled function.
 
-    import numpy as np
-
     np.random.seed(0)
     x_np = np.random.randn(2, 4).astype("float32")
     w_np = np.random.randn(4, 8).astype("float32")
 
-    target = tvm.target.Target("llvm")
     with tvm.transform.PassContext(opt_level=3):
         built = relax.build(mod, target)
 
     vm = relax.VirtualMachine(built, tvm.cpu())
     result = vm["main"](tvm.runtime.tensor(x_np, tvm.cpu()), tvm.runtime.tensor(w_np, tvm.cpu()))
 
-    expected_shape = (2, 8)
-    assert result.numpy().shape == expected_shape
+    assert result.numpy().shape == (2, 8)
     print("Execution completed. Output shape:", result.numpy().shape)
 
 ######################################################################
 # Step 6: Conv2D + ReLU
 # ---------------------
 #
-# The same flow applies to convolution workloads.
+# The same flow applies to convolution workloads.  Because the fused
+# ``conv2d + relu`` pattern is registered after the standalone
+# ``conv2d`` pattern in ``patterns.py`` (later entries have higher
+# priority), both ops are offloaded as a single composite function.
 
 
 @tvm.script.ir_module
@@ -177,7 +198,15 @@ def main(
     with tvm.transform.PassContext(opt_level=3):
         built2 = relax.build(mod2, target)
 
-    print("Conv2dReLU compiled successfully.")
+    x2_np = np.random.randn(1, 3, 32, 32).astype("float32")
+    w2_np = np.random.randn(16, 3, 3, 3).astype("float32")
+
+    vm2 = relax.VirtualMachine(built2, tvm.cpu())
+    result2 = vm2["main"](
+        tvm.runtime.tensor(x2_np, tvm.cpu()), tvm.runtime.tensor(w2_np, tvm.cpu())
+    )
+    assert result2.numpy().shape == (1, 16, 30, 30)
+    print("Conv2dReLU output shape:", result2.numpy().shape)
 
 ######################################################################
 # Next steps

diff --git a/docs/index.rst b/docs/index.rst
@@ -48,6 +48,7 @@ driving its costs down.
    how_to/tutorials/cross_compilation_and_rpc
    how_to/tutorials/export_and_load_executable
    how_to/tutorials/mix_python_and_tvm_with_pymodule
+   how_to/tutorials/bring_your_own_codegen
    how_to/dev/index
 
 .. The Deep Dive content is comprehensive

diff --git a/python/tvm/relax/backend/contrib/example_npu/patterns.py b/python/tvm/relax/backend/contrib/example_npu/patterns.py
@@ -117,6 +117,40 @@ def _check_conv2d_relu(context: PatternCheckContext) -> bool:
     return ("example_npu.conv2d_relu_fused", *_make_conv2d_relu_pattern(), _check_conv2d_relu)
 
 
+def matmul_relu_fused_pattern():
+    """
+    NPU-optimized MatMul+ReLU fusion pattern.
+
+    Fusing the matrix engine output with the activation unit avoids a
+    write/read round-trip through L1 SRAM, mirroring the conv2d+relu
+    fusion above.  Returns (name, pattern, annotations, check_fn).
+    """
+
+    def _make_matmul_relu_pattern():
+        input_tensor = wildcard()
+        weight = wildcard()
+        matmul = is_op("relax.matmul")(input_tensor, weight)
+        relu = is_op("relax.nn.relu")(matmul)
+
+        annotations = {
+            "input": input_tensor,
+            "weight": weight,
+            "matmul": matmul,
+            "root": relu,
+        }
+        return relu, annotations
+
+    def _check_matmul_relu(context: PatternCheckContext) -> bool:
+        """Accept only if the NPU memory-constraint and quantization checks pass."""
+        if not _check_npu_memory_constraints(context):
+            return False
+        if not _check_npu_quantization(context):
+            return False
+        return True
+
+    return ("example_npu.matmul_relu_fused", *_make_matmul_relu_pattern(), _check_matmul_relu)
+
+
 def matmul_patterns():
     """
     NPU-optimized matrix multiplication patterns.
@@ -486,18 +520,25 @@ def _check_quantization(
 
 
 # Register all NPU patterns with architectural awareness
+# register_patterns priority: patterns that appear LATER in the list win.
+# So we place general / standalone patterns first, and fused (more
+# specific) patterns last so they take precedence over their constituents.
 register_patterns(
     [
-        conv2d_relu_fused_pattern(),  # Fused patterns first (higher priority)
+        *quantization_patterns(),
+        *elementwise_patterns(),
+        *activation_patterns(),
+        *softmax_patterns(),
+        *batch_norm_patterns(),
+        *pooling_patterns(),
         *matmul_patterns(),
         *conv1d_patterns(),
+        # Plain conv2d is more general than depthwise (groups>1); list
+        # plain first so depthwise wins on grouped convs.
         *conv2d_patterns(),
         *depthwise_conv2d_patterns(),
-        *pooling_patterns(),
-        *batch_norm_patterns(),
-        *softmax_patterns(),
-        *activation_patterns(),
-        *elementwise_patterns(),
-        *quantization_patterns(),
+        # Fused patterns last (highest priority).
+        matmul_relu_fused_pattern(),
+        conv2d_relu_fused_pattern(),
     ]
 )
diff --git a/src/runtime/contrib/example_npu/example_npu_runtime.cc b/src/runtime/contrib/example_npu/example_npu_runtime.cc
@@ -319,16 +319,19 @@ class ExampleNPURuntime : public JSONRuntimeBase {
           LOG(INFO) << "  Executing fused operation - reducing memory traffic";
         }
 
-        // Dispatch to appropriate implementation
+        // Dispatch to appropriate implementation.
+        // More specific names must be checked before more general ones, since
+        // op_name.find() is a substring match (e.g. "depthwise_conv2d" also
+        // contains "conv2d", and "dequantize" also contains "quantize").
         if (op_name.find("matmul") != std::string::npos ||
             op_name.find("dense") != std::string::npos) {
-          ExecuteMatMul(node, engine);
+          ExecuteMatMul(node, engine, is_fused);
+        } else if (op_name.find("depthwise") != std::string::npos) {
+          ExecuteDepthwiseConv2D(node, engine);
         } else if (op_name.find("conv2d") != std::string::npos) {
           ExecuteConv2D(node, engine, is_fused);
         } else if (op_name.find("conv1d") != std::string::npos) {
           ExecuteConv1D(node, engine);
-        } else if (op_name.find("depthwise") != std::string::npos) {
-          ExecuteDepthwiseConv2D(node, engine);
         } else if (op_name.find("pool") != std::string::npos) {
           ExecutePooling(node, engine);
         } else if (op_name.find("relu") != std::string::npos ||
@@ -340,10 +343,10 @@ class ExampleNPURuntime : public JSONRuntimeBase {
         } else if (op_name.find("add") != std::string::npos ||
                    op_name.find("multiply") != std::string::npos) {
           ExecuteElementwise(node, engine);
-        } else if (op_name.find("quantize") != std::string::npos) {
-          ExecuteQuantization(node);
         } else if (op_name.find("dequantize") != std::string::npos) {
           ExecuteDequantization(node);
+        } else if (op_name.find("quantize") != std::string::npos) {
+          ExecuteQuantization(node);
         } else {
           LOG(WARNING) << "Unsupported operation: " << op_name;
         }
@@ -431,7 +434,7 @@ class ExampleNPURuntime : public JSONRuntimeBase {
   /*!
    * \brief Execute matrix multiplication on NPU matrix engine
    */
-  void ExecuteMatMul(const JSONGraphNode& node, ExecutionEngine engine) {
+  void ExecuteMatMul(const JSONGraphNode& node, ExecutionEngine engine, bool is_fused) {
     LOG(INFO) << "  Executing MatMul on " << GetEngineString(engine);
 
     // Get input shapes
@@ -448,6 +451,10 @@ class ExampleNPURuntime : public JSONRuntimeBase {
       LOG(INFO) << "    Using 16x16 systolic array for acceleration";
     }
 
+    if (is_fused) {
+      LOG(INFO) << "    Fused with activation - saving memory bandwidth";
+    }
+
     // In a real implementation: dispatch to NPU matrix multiplication unit
   }