From 469efb7d10ee3dd540a77c18fa17eaf5dcde50ab Mon Sep 17 00:00:00 2001 From: tlopex <820958424@qq.com> Date: Sat, 25 Apr 2026 02:49:25 -0400 Subject: [PATCH] finish1 --- ...u_example.py => bring_your_own_codegen.py} | 61 ++++++++++++++----- docs/index.rst | 1 + .../backend/contrib/example_npu/patterns.py | 55 ++++++++++++++--- .../example_npu/example_npu_runtime.cc | 21 ++++--- 4 files changed, 108 insertions(+), 30 deletions(-) rename docs/how_to/tutorials/{byoc_npu_example.py => bring_your_own_codegen.py} (72%) diff --git a/docs/how_to/tutorials/byoc_npu_example.py b/docs/how_to/tutorials/bring_your_own_codegen.py similarity index 72% rename from docs/how_to/tutorials/byoc_npu_example.py rename to docs/how_to/tutorials/bring_your_own_codegen.py index 143d097dc461..b6039e493039 100644 --- a/docs/how_to/tutorials/byoc_npu_example.py +++ b/docs/how_to/tutorials/bring_your_own_codegen.py @@ -16,21 +16,25 @@ # under the License. """ -.. _tutorial-byoc-npu-example: +.. _tutorial-bring-your-own-codegen: Bring Your Own Codegen: NPU Backend Example =========================================== -**Author**: `Sheldon Aristide `_ -This tutorial walks through the example NPU BYOC backend included in TVM. -It demonstrates the key concepts needed to offload operations to a custom -accelerator: pattern registration, graph partitioning, codegen, and runtime -dispatch. +This tutorial shows how to integrate a custom hardware backend with TVM's +BYOC framework, using the bundled example NPU backend (CPU emulation, no +real hardware required) as the worked example. You will see the key +concepts needed to offload operations to a custom accelerator: pattern +registration, graph partitioning, codegen, and runtime dispatch. NPUs are purpose-built accelerators designed around a fixed set of operations common in neural network inference, such as matrix multiplication, convolution, and activation functions. -The example backend uses CPU emulation so no real NPU hardware is required. 
+The example backend's runtime is a *stub*: it logs the dispatch decisions an +NPU would make (memory tier, execution engine, fusion) but performs no real +computation, so output buffers are uninitialized. Assertions in this tutorial +therefore check shapes, not values. When you replace the runtime with your +hardware SDK calls, the same flow produces real results. **Prerequisites**: Build TVM with ``USE_EXAMPLE_NPU_CODEGEN=ON`` and ``USE_EXAMPLE_NPU_RUNTIME=ON``. @@ -58,6 +62,8 @@ # Importing the module is enough to register all supported patterns with # TVM's pattern registry. +import numpy as np + import tvm import tvm.relax.backend.contrib.example_npu # registers patterns from tvm import relax @@ -69,6 +75,8 @@ has_example_npu_runtime = tvm.get_global_func("runtime.ExampleNPUJSONRuntimeCreate", True) has_example_npu = has_example_npu_codegen and has_example_npu_runtime +target = tvm.target.Target("llvm") + patterns = get_patterns_with_prefix("example_npu") print("Registered patterns:", [p.name for p in patterns]) @@ -98,8 +106,22 @@ def main( # --------------------------- # # ``FuseOpsByPattern`` groups ops that match a registered pattern into -# composite functions. ``MergeCompositeFunctions`` consolidates them -# so each group becomes a single external call. +# composite functions, controlled by two flags: +# +# - ``bind_constants=False`` keeps weights as function arguments instead +# of baking them in, so the host stays in charge of parameter +# ownership. +# - ``annotate_codegen=True`` tags each composite with its backend name +# (``example_npu``); without this tag, ``RunCodegen`` has no way to +# route the composite to a backend. +# +# ``MergeCompositeFunctions`` then consolidates adjacent composites +# that target the same backend so each group becomes a single external +# call. Note that consolidation depends on the patterns themselves: an +# ``op_a + op_b`` chain only collapses into one composite if a fused +# pattern (e.g. 
``matmul_relu_fused``) was registered for it; otherwise +# each op stays as its own composite even when both target the same +# backend. mod = MatmulReLU mod = FuseOpsByPattern(patterns, bind_constants=False, annotate_codegen=True)(mod) @@ -130,28 +152,27 @@ def main( # Build the module for the host target, create a virtual machine, and # execute the compiled function. - import numpy as np - np.random.seed(0) x_np = np.random.randn(2, 4).astype("float32") w_np = np.random.randn(4, 8).astype("float32") - target = tvm.target.Target("llvm") with tvm.transform.PassContext(opt_level=3): built = relax.build(mod, target) vm = relax.VirtualMachine(built, tvm.cpu()) result = vm["main"](tvm.runtime.tensor(x_np, tvm.cpu()), tvm.runtime.tensor(w_np, tvm.cpu())) - expected_shape = (2, 8) - assert result.numpy().shape == expected_shape + assert result.numpy().shape == (2, 8) print("Execution completed. Output shape:", result.numpy().shape) ###################################################################### # Step 6: Conv2D + ReLU # --------------------- # -# The same flow applies to convolution workloads. +# The same flow applies to convolution workloads. Because the fused +# ``conv2d + relu`` pattern is registered after the standalone +# ``conv2d`` pattern in ``patterns.py`` (later entries have higher +# priority), both ops are offloaded as a single composite function. 
@tvm.script.ir_module @@ -177,7 +198,15 @@ def main( with tvm.transform.PassContext(opt_level=3): built2 = relax.build(mod2, target) - print("Conv2dReLU compiled successfully.") + x2_np = np.random.randn(1, 3, 32, 32).astype("float32") + w2_np = np.random.randn(16, 3, 3, 3).astype("float32") + + vm2 = relax.VirtualMachine(built2, tvm.cpu()) + result2 = vm2["main"]( + tvm.runtime.tensor(x2_np, tvm.cpu()), tvm.runtime.tensor(w2_np, tvm.cpu()) + ) + assert result2.numpy().shape == (1, 16, 30, 30) + print("Conv2dReLU output shape:", result2.numpy().shape) ###################################################################### # Next steps diff --git a/docs/index.rst b/docs/index.rst index 01a4a64f0822..2c66c4295d26 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -48,6 +48,7 @@ driving its costs down. how_to/tutorials/cross_compilation_and_rpc how_to/tutorials/export_and_load_executable how_to/tutorials/mix_python_and_tvm_with_pymodule + how_to/tutorials/bring_your_own_codegen how_to/dev/index .. The Deep Dive content is comprehensive diff --git a/python/tvm/relax/backend/contrib/example_npu/patterns.py b/python/tvm/relax/backend/contrib/example_npu/patterns.py index f55ce47dfb29..d224388fa306 100644 --- a/python/tvm/relax/backend/contrib/example_npu/patterns.py +++ b/python/tvm/relax/backend/contrib/example_npu/patterns.py @@ -117,6 +117,40 @@ def _check_conv2d_relu(context: PatternCheckContext) -> bool: return ("example_npu.conv2d_relu_fused", *_make_conv2d_relu_pattern(), _check_conv2d_relu) +def matmul_relu_fused_pattern(): + """ + NPU-optimized MatMul+ReLU fusion pattern. + + Fusing the matrix engine output with the activation unit avoids a + write/read round-trip through L1 SRAM, mirroring the conv2d+relu + fusion above.
+ """ + + def _make_matmul_relu_pattern(): + input_tensor = wildcard() + weight = wildcard() + matmul = is_op("relax.matmul")(input_tensor, weight) + relu = is_op("relax.nn.relu")(matmul) + + annotations = { + "input": input_tensor, + "weight": weight, + "matmul": matmul, + "root": relu, + } + return relu, annotations + + def _check_matmul_relu(context: PatternCheckContext) -> bool: + """Check if MatMul+ReLU fusion is beneficial for NPU""" + if not _check_npu_memory_constraints(context): + return False + if not _check_npu_quantization(context): + return False + return True + + return ("example_npu.matmul_relu_fused", *_make_matmul_relu_pattern(), _check_matmul_relu) + + def matmul_patterns(): """ NPU-optimized matrix multiplication patterns. @@ -486,18 +520,25 @@ def _check_quantization( # Register all NPU patterns with architectural awareness +# register_patterns priority: patterns that appear LATER in the list win. +# So we place general / standalone patterns first, and fused (more +# specific) patterns last so they take precedence over their constituents. register_patterns( [ - conv2d_relu_fused_pattern(), # Fused patterns first (higher priority) + *quantization_patterns(), + *elementwise_patterns(), + *activation_patterns(), + *softmax_patterns(), + *batch_norm_patterns(), + *pooling_patterns(), *matmul_patterns(), *conv1d_patterns(), + # Plain conv2d is more general than depthwise (groups>1); list + # plain first so depthwise wins on grouped convs. *conv2d_patterns(), *depthwise_conv2d_patterns(), - *pooling_patterns(), - *batch_norm_patterns(), - *softmax_patterns(), - *activation_patterns(), - *elementwise_patterns(), - *quantization_patterns(), + # Fused patterns last (highest priority). 
+ matmul_relu_fused_pattern(), + conv2d_relu_fused_pattern(), ] ) diff --git a/src/runtime/contrib/example_npu/example_npu_runtime.cc b/src/runtime/contrib/example_npu/example_npu_runtime.cc index 4f4e70d4e556..440a5d9715ec 100644 --- a/src/runtime/contrib/example_npu/example_npu_runtime.cc +++ b/src/runtime/contrib/example_npu/example_npu_runtime.cc @@ -319,16 +319,19 @@ class ExampleNPURuntime : public JSONRuntimeBase { LOG(INFO) << " Executing fused operation - reducing memory traffic"; } - // Dispatch to appropriate implementation + // Dispatch to appropriate implementation. + // More specific names must be checked before more general ones, since + // op_name.find() is a substring match (e.g. "depthwise_conv2d" also + // contains "conv2d", and "dequantize" also contains "quantize"). if (op_name.find("matmul") != std::string::npos || op_name.find("dense") != std::string::npos) { - ExecuteMatMul(node, engine); + ExecuteMatMul(node, engine, is_fused); + } else if (op_name.find("depthwise") != std::string::npos) { + ExecuteDepthwiseConv2D(node, engine); } else if (op_name.find("conv2d") != std::string::npos) { ExecuteConv2D(node, engine, is_fused); } else if (op_name.find("conv1d") != std::string::npos) { ExecuteConv1D(node, engine); - } else if (op_name.find("depthwise") != std::string::npos) { - ExecuteDepthwiseConv2D(node, engine); } else if (op_name.find("pool") != std::string::npos) { ExecutePooling(node, engine); } else if (op_name.find("relu") != std::string::npos || @@ -340,10 +343,10 @@ class ExampleNPURuntime : public JSONRuntimeBase { } else if (op_name.find("add") != std::string::npos || op_name.find("multiply") != std::string::npos) { ExecuteElementwise(node, engine); - } else if (op_name.find("quantize") != std::string::npos) { - ExecuteQuantization(node); } else if (op_name.find("dequantize") != std::string::npos) { ExecuteDequantization(node); + } else if (op_name.find("quantize") != std::string::npos) { + ExecuteQuantization(node); } else { 
LOG(WARNING) << "Unsupported operation: " << op_name; } @@ -431,7 +434,7 @@ class ExampleNPURuntime : public JSONRuntimeBase { /*! * \brief Execute matrix multiplication on NPU matrix engine */ - void ExecuteMatMul(const JSONGraphNode& node, ExecutionEngine engine) { + void ExecuteMatMul(const JSONGraphNode& node, ExecutionEngine engine, bool is_fused) { LOG(INFO) << " Executing MatMul on " << GetEngineString(engine); // Get input shapes @@ -448,6 +451,10 @@ class ExampleNPURuntime : public JSONRuntimeBase { LOG(INFO) << " Using 16x16 systolic array for acceleration"; } + if (is_fused) { + LOG(INFO) << " Fused with activation - saving memory bandwidth"; + } + // In a real implementation: dispatch to NPU matrix multiplication unit }