From 6fd20ccccdad4e3056da1ad10d7464b1aa6b4b89 Mon Sep 17 00:00:00 2001 From: tlopex <820958424@qq.com> Date: Sun, 5 Apr 2026 20:51:59 -0400 Subject: [PATCH 1/2] finish1' --- docs/how_to/tutorials/meta_schedule.py | 317 +++++++++++++++++++++++++ docs/index.rst | 1 + 2 files changed, 318 insertions(+) create mode 100644 docs/how_to/tutorials/meta_schedule.py diff --git a/docs/how_to/tutorials/meta_schedule.py b/docs/how_to/tutorials/meta_schedule.py new file mode 100644 index 000000000000..2245ab5e6196 --- /dev/null +++ b/docs/how_to/tutorials/meta_schedule.py @@ -0,0 +1,317 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# ruff: noqa: E402, F401, I001 + +""" +.. _meta_schedule_advanced: + +MetaSchedule: Advanced Auto-Tuning Guide +========================================= +MetaSchedule is TVM's search-based auto-tuning framework. It explores different TIR schedules +(loop tiling, vectorization, thread binding, etc.) and measures them on real hardware to find +the fastest implementation for each operator. + +For the basic tune-and-apply workflow, see :ref:`optimize_model`. 
This tutorial focuses on +advanced usage: inspecting tunable tasks, selective operator tuning, database management, +cross-model reuse, and the lower-level tuning API. + +.. contents:: Table of Contents + :local: + :depth: 1 +""" + +###################################################################### +# Prepare a Model +# --------------- +# We start with a simple MLP model exported as a Relax IRModule, then legalize it +# so that high-level Relax operators are lowered to TIR functions that MetaSchedule can tune. + +import os +import numpy as np + +import tvm +from tvm import relax +from tvm.relax.frontend import nn + +IS_IN_CI = os.getenv("CI", "") == "true" + + +class MLPModel(nn.Module): + def __init__(self): + super().__init__() + self.fc1 = nn.Linear(784, 256) + self.relu1 = nn.ReLU() + self.fc2 = nn.Linear(256, 10, bias=False) + + def forward(self, x): + x = self.fc1(x) + x = self.relu1(x) + x = self.fc2(x) + return x + + +input_shape = (1, 784) +mod, params = MLPModel().export_tvm({"forward": {"x": nn.spec.Tensor(input_shape, "float32")}}) + +# Legalize: lower Relax operators to TIR PrimFuncs +target = tvm.target.Target({"kind": "llvm", "num-cores": 4}) +with target: + mod = relax.get_pipeline("zero")(mod) + +mod.show() + +###################################################################### +# Inspecting Tunable Tasks +# ------------------------ +# Before tuning, it is useful to see what MetaSchedule will actually tune. The +# ``extract_tasks`` function analyzes an IRModule and returns one ``ExtractedTask`` per +# unique TIR workload. Each task has a ``task_name`` and a ``weight`` (how many times +# this workload is called in the graph — the task scheduler uses weights to allocate +# more tuning budget to frequently-called operators). 
+ +from tvm.s_tir.meta_schedule.relax_integration import extract_tasks + +tasks = extract_tasks(mod, target) +for i, task in enumerate(tasks): + print(f"Task {i}: {task.task_name} (weight={task.weight})") + +###################################################################### +# This tells you exactly how many operators need tuning and their relative importance. +# Use this to decide whether to tune all operators or focus on a subset. + +###################################################################### +# Selective Operator Tuning +# ------------------------- +# Tuning every operator can be time-consuming. ``MetaScheduleTuneIRMod`` accepts an +# ``op_names`` parameter to tune only operators whose task name contains any of the given +# strings. Operators without tuning records are left unscheduled — you can later apply +# DLight or other rule-based schedules to cover them. +# +# .. note:: +# +# ``MetaScheduleTuneIRMod`` works at the IRModule level and supports ``op_names`` filtering, +# while ``MetaScheduleTuneTIR`` tunes all TIR functions without filtering. Choose based on +# your needs. +# +# .. note:: +# +# To save CI time and avoid flakiness, we skip the tuning process in CI environment. +# + +if not IS_IN_CI: + WORK_DIR = "./tuning_logs" + with target: + tuned_mod = tvm.ir.transform.Sequential( + [ + relax.transform.MetaScheduleTuneIRMod( + params={}, + work_dir=WORK_DIR, + max_trials_global=300, + op_names=["matmul"], # Only tune matmul-related operators + ), + relax.transform.MetaScheduleApplyDatabase(work_dir=WORK_DIR), + ] + )(mod) + + tuned_mod.show() + +###################################################################### +# Database Persistence and Resumption +# ------------------------------------ +# When you use a fixed ``work_dir`` (instead of ``tempfile.TemporaryDirectory``), tuning +# results are persisted in two JSON files: +# +# - ``database_workload.json``: One line per unique workload (structural hash + serialized +# IRModule). 
+# - ``database_tuning_record.json``: One line per tuning record (workload index + schedule +# trace + measured run times). +# +# Both files use a newline-delimited JSON format. Records are appended incrementally as +# tuning progresses, so **interrupting and resuming is safe**. When you re-run tuning with +# the same ``work_dir``, existing records are loaded and used as warm-start seeds for the +# evolutionary search — the tuner does not skip already-seen workloads entirely, but starts +# from a better initial population, so re-runs are faster than starting from scratch. +# +# You can quickly check tuning progress from the command line: +# +# .. code-block:: bash +# +# # Count how many tuning records have been collected +# wc -l tuning_logs/database_tuning_record.json +# +# Once tuning is done, subsequent compilations only need ``MetaScheduleApplyDatabase`` +# which reads the database and applies the best schedules — this takes seconds, not hours: +# +# .. code-block:: python +# +# # Fast: apply previously tuned results (no search) +# with target: +# mod = relax.transform.MetaScheduleApplyDatabase(work_dir="./tuning_logs")(mod) +# + +###################################################################### +# Querying the Tuning Database +# ---------------------------- +# The ``JSONDatabase`` class provides a Python API to inspect tuning results +# programmatically. This is useful for analyzing tuning quality, comparing different +# tuning runs, or debugging performance issues. 
+ +from tvm.s_tir.meta_schedule.database import JSONDatabase + +if not IS_IN_CI: + db = JSONDatabase(work_dir=WORK_DIR) + print(f"Total tuning records: {len(db)}") + + # List all records with their best measured runtime + records = db.get_all_tuning_records() + for rec in records: + if rec.run_secs: + best = min(float(s) for s in rec.run_secs) + print(f" Best time: {best * 1e3:.3f} ms") + +###################################################################### +# You can also query the best schedule for a specific TIR function by passing its +# IRModule. For example, to query a single PrimFunc extracted from the full module: +# +# .. code-block:: python +# +# # tir_mod: an IRModule containing a single PrimFunc named "main" +# record = db.query_tuning_record(tir_mod, target, workload_name="main") +# if record: +# print(f"Best time: {min(float(s) for s in record.run_secs) * 1e3:.3f} ms") +# # Reconstruct the optimized schedule +# sch = db.query_schedule(tir_mod, target, workload_name="main") +# sch.mod.show() +# + +###################################################################### +# Cross-Model Database Reuse +# -------------------------- +# MetaSchedule identifies workloads by their structural hash. If two models contain +# operators with the same shape, dtype, and computation, they share the same hash and +# can reuse tuning records. This means a matmul ``(M=1, N=256, K=784)`` tuned for one +# model will automatically be reused by any other model with the same matmul shape. +# +# **module_equality options**: +# +# - ``"structural"`` (default): Exact structural match. Safe but strict. +# - ``"anchor-block"``: Match based on the dominant compute block, ignoring +# surrounding context. More permissive — enables sharing across fused operators +# that have the same core computation but different fusion boundaries. 
+# +# **OrderedUnionDatabase** enables a layered lookup strategy: check a local database +# first, then fall back to a shared team database: + +from tvm.s_tir.meta_schedule.database import OrderedUnionDatabase +from tvm.s_tir.meta_schedule.relax_integration import tune_relax + +###################################################################### +# +# .. code-block:: python +# +# local_db = JSONDatabase(work_dir="./my_tuning_logs") +# shared_db = JSONDatabase(work_dir="/shared/tuning_db") +# combined_db = OrderedUnionDatabase(local_db, shared_db) +# +# With this setup, ``combined_db.query_tuning_record(...)`` checks ``local_db`` first. +# Only if no match is found does it fall back to ``shared_db``. This lets a team maintain +# a shared tuning database while individuals only tune new operators locally. +# +# To make ``MetaScheduleApplyDatabase`` use the combined database during compilation, +# enter it as a context manager. The pass checks ``Database.current()`` first, and only +# falls back to ``work_dir`` when no database is in scope: +# +# .. code-block:: python +# +# with target, combined_db: +# mod = relax.transform.MetaScheduleApplyDatabase()(mod) +# + +###################################################################### +# Lower-Level API: ``tune_relax`` +# -------------------------------- +# The transform-based API (``MetaScheduleTuneTIR`` / ``MetaScheduleTuneIRMod``) covers +# most use cases. For advanced scenarios -- custom cost models, remote runners, or +# fine-grained control -- use the lower-level ``tune_relax`` function directly: + +###################################################################### +# +# .. 
code-block:: python +# +# db = tune_relax( +# mod=mod, +# params={}, +# target=target, +# work_dir="./tuning_logs", +# max_trials_global=2000, +# max_trials_per_task=500, +# op_names=["matmul"], # Selective tuning +# cost_model="xgb", # "xgb" (default), "mlp", or "random" +# num_trials_per_iter=64, # Batch size per search iteration +# runner="local", # "local" or RPCRunner for remote devices +# module_equality="structural", # "structural" or "anchor-block" +# ) +# +# Key parameters: +# +# - **cost_model**: ``"xgb"`` (XGBoost, default) uses gradient-boosted trees to predict +# schedule performance, reducing the number of actual measurements needed. ``"mlp"`` +# uses a neural network-based model. ``"random"`` disables prediction (baseline). +# - **num_trials_per_iter**: How many candidates are measured in each search iteration. +# Larger values improve hardware utilization but use more memory. +# - **runner**: Use ``"local"`` for the current machine. For cross-compilation scenarios +# (e.g., tuning for a remote device), use ``RPCRunner``. +# - **module_equality**: Controls how workloads are matched. ``"anchor-block"`` improves +# database hit rate across models at the cost of slightly less precise matching. + +###################################################################### +# Build and Run +# ------------- +# Finally, we build and run the model to verify the result. If tuning was skipped +# (e.g., in CI), we compile the untuned module directly — LLVM can still generate +# valid (though unoptimized) code for CPU targets without explicit scheduling. 
+ +final_mod = tuned_mod if not IS_IN_CI else mod + +ex = tvm.compile(final_mod, target) +vm = relax.VirtualMachine(ex, tvm.cpu()) +data = tvm.runtime.tensor(np.random.rand(*input_shape).astype("float32")) +tvm_params = [tvm.runtime.tensor(np.random.rand(*p.shape).astype(p.dtype)) for _, p in params] +result = vm["forward"](data, *tvm_params).numpy() +print("Output shape:", result.shape) +print("Output:", result) + +###################################################################### +# Summary +# ------- +# This tutorial covered advanced MetaSchedule usage beyond the basic tune-and-apply flow: +# +# - **Inspect tasks** with ``extract_tasks`` to understand what will be tuned and plan your +# tuning budget: ``max_trials_global`` is shared across all tasks, so set it proportional +# to the number of tasks (e.g., 200-500 trials per task for good results). +# - **Selective tuning** with ``op_names`` to focus on performance-critical operators and +# skip the rest. +# - **Persist results** with a fixed ``work_dir``. Tuning is resumable — existing records +# warm-start the search on re-run. +# - **Query the database** to analyze tuning quality and debug performance. +# - **Reuse across models** via ``OrderedUnionDatabase`` and ``module_equality="anchor-block"`` +# to amortize tuning cost across a team or model family. +# - **Lower-level API** (``tune_relax``) for custom cost models, remote runners, and +# fine-grained control. +# +# For the basic end-to-end workflow, see :ref:`optimize_model`. For rule-based scheduling +# without search, see DLight documentation. diff --git a/docs/index.rst b/docs/index.rst index 2b5ef6464636..511f2ffa0b4f 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -43,6 +43,7 @@ driving its costs down. 
how_to/tutorials/e2e_opt_model how_to/tutorials/customize_opt + how_to/tutorials/meta_schedule how_to/tutorials/optimize_llm how_to/tutorials/cross_compilation_and_rpc how_to/tutorials/export_and_load_executable From 9e20ee920385a738711d4f1ac2c05e72f08c9f8c Mon Sep 17 00:00:00 2001 From: tlopex <820958424@qq.com> Date: Mon, 6 Apr 2026 10:22:59 -0400 Subject: [PATCH 2/2] finish5 --- docs/deep_dive/tensor_ir/index.rst | 2 + .../tutorials/dlight_gpu_scheduling.py | 316 +++++++++++++++++ .../tensor_ir/tutorials/meta_schedule.py | 307 +++++++++++++++++ docs/how_to/tutorials/meta_schedule.py | 317 ------------------ docs/index.rst | 1 - 5 files changed, 625 insertions(+), 318 deletions(-) create mode 100644 docs/deep_dive/tensor_ir/tutorials/dlight_gpu_scheduling.py create mode 100644 docs/deep_dive/tensor_ir/tutorials/meta_schedule.py delete mode 100644 docs/how_to/tutorials/meta_schedule.py diff --git a/docs/deep_dive/tensor_ir/index.rst b/docs/deep_dive/tensor_ir/index.rst index 95a6a3a402cc..2f8bd07c1b0c 100644 --- a/docs/deep_dive/tensor_ir/index.rst +++ b/docs/deep_dive/tensor_ir/index.rst @@ -39,3 +39,5 @@ In TVMScript, both modules are accessed via learning tutorials/tir_creation tutorials/tir_transformation + tutorials/dlight_gpu_scheduling + tutorials/meta_schedule diff --git a/docs/deep_dive/tensor_ir/tutorials/dlight_gpu_scheduling.py b/docs/deep_dive/tensor_ir/tutorials/dlight_gpu_scheduling.py new file mode 100644 index 000000000000..9c5fe1ff4c7c --- /dev/null +++ b/docs/deep_dive/tensor_ir/tutorials/dlight_gpu_scheduling.py @@ -0,0 +1,316 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# ruff: noqa: E402, E501 + +""" +.. _dlight_gpu_scheduling: + +DLight: Rule-Based GPU Scheduling +================================== +TIR functions produced by Relax legalization need GPU-specific scheduling — thread binding, +loop tiling, shared memory usage — before they can run efficiently on a GPU. There are two +main approaches in TVM: + +- **MetaSchedule**: explores a search space to find the best schedule. High quality, but + compilation takes minutes to hours. +- **DLight**: applies pre-defined scheduling rules deterministically. No tuning required, + compilation completes in seconds. Performance is excellent for well-known patterns + (e.g., GEMM, GEMV in LLM workloads) and fair for the rest. + +This tutorial covers how DLight works, what rules are available, how to diagnose scheduling +quality, and how to write custom rules. + +.. contents:: Table of Contents + :local: + :depth: 1 +""" + +###################################################################### +# Prepare a Model +# --------------- +# We build a small model with ``nn.Module`` that is rich enough to trigger multiple DLight +# rules: ``Linear`` layers produce GEMM (matrix multiplication) kernels, ``LayerNorm`` +# produces a general-reduction kernel, and ``ReLU`` is a simple elementwise op. 
+ +import tvm +from tvm import relax, tirx +from tvm.relax.frontend import nn +from tvm.s_tir import dlight as dl + + +class DemoModel(nn.Module): + def __init__(self): + super().__init__() + self.fc1 = nn.Linear(768, 768) + self.relu = nn.ReLU() + self.norm = nn.LayerNorm(768) + self.fc2 = nn.Linear(768, 256) + + def forward(self, x): + x = self.norm(self.relu(self.fc1(x))) + return self.fc2(x) + + +mod, params = DemoModel().export_tvm({"forward": {"x": nn.spec.Tensor((1, 768), "float32")}}) + +###################################################################### +# Legalize Relax operators into TIR functions so that DLight has concrete kernels to schedule. + +device = tvm.cuda(0) +target = tvm.target.Target.from_device(device) +with target: + mod = relax.get_pipeline("zero")(mod) + +###################################################################### +# At this point every TIR function in ``mod`` is **unscheduled** — it has no thread bindings +# and would not run efficiently on a GPU. Let's see what functions we have: +for gv, func in mod.functions_items(): + if isinstance(func, tirx.PrimFunc): + print(f" {gv.name_hint}") + +###################################################################### +# Basic Usage: ApplyDefaultSchedule +# --------------------------------- +# ``ApplyDefaultSchedule`` is an ``IRModule`` pass. It iterates over every TIR function in the +# module and tries the given rules **in order**. For each function the first rule whose +# ``apply()`` returns a non-``None`` schedule wins; subsequent rules are skipped. +# After scheduling, the function is marked with ``tirx.is_scheduled`` so it won't be +# scheduled again by a later ``ApplyDefaultSchedule`` call. + +###################################################################### +# Here we use a common subset of rules. The full catalog (including ``LowBatchGEMV``, +# ``Transpose``, ``RMSNorm``) is listed in the next section. 
+ +with target: + scheduled_mod = dl.ApplyDefaultSchedule( + dl.gpu.Matmul(), # GEMM: dense matrix multiplication + dl.gpu.GEMV(), # matrix-vector products + dl.gpu.Reduction(), # simple reductions (sum, max, ...) + dl.gpu.GeneralReduction(), # compound reductions (softmax, layer norm, ...) + dl.gpu.Fallback(), # catch-all for anything unmatched above + )(mod) + +scheduled_mod.show() + +###################################################################### +# Compared with the unscheduled IR, you can now see thread bindings +# (``blockIdx.x``, ``threadIdx.x``, ...) and loop transformations in each TIR function. + +###################################################################### +# Rule Catalog +# ------------ +# DLight ships a set of GPU scheduling rules. Each rule is a subclass of +# ``ScheduleRule`` and implements an ``apply(func, target, tunable)`` method that returns +# a ``Schedule`` if the rule matches, or ``None`` to pass. +# +# The built-in GPU rules, roughly from most specific to most general: +# +# .. list-table:: +# :header-rows: 1 +# :widths: 20 40 40 +# +# * - Rule +# - Pattern +# - Typical operators +# * - ``Matmul`` +# - GEMM index pattern ``C[S,I,J] += A[S,I,K] * B[S,J,K]`` +# - ``nn.Linear``, batched matmul +# * - ``GEMV`` +# - Matrix-vector multiply (one dimension is 1) +# - single-batch decode in attention +# * - ``LowBatchGEMV`` +# - Low-batch GEMM scheduled with a GEMV strategy +# - small-batch decode +# * - ``Reduction`` +# - Simple accumulation ``X[...] 
+= Y[...]`` +# - sum, max, argmax +# * - ``GeneralReduction`` +# - Spatial dims followed by reduction dims (``S* R*``) +# - softmax, layer norm, RMS norm +# * - ``Transpose`` +# - Read/write indices are permutations of each other +# - 2-D transpose +# * - ``RMSNorm`` +# - Contains an ``rsqrt`` operation +# - RMS normalization +# * - ``Fallback`` +# - Any function (always matches) +# - generic catch-all +# +# **Rule order matters.** ``ApplyDefaultSchedule`` stops at the first match, so: +# +# - Put **specialized** rules first (``Matmul``, ``GEMV``) — they have strict matching +# conditions but produce high-quality schedules. +# - Put **general** rules later (``GeneralReduction``, ``Fallback``) — they match broadly +# but produce less optimal schedules. +# - If you put ``Fallback`` first, it would "steal" every function and no specialized +# rule would ever run. + +###################################################################### +# Diagnosing Schedule Quality +# --------------------------- +# A common question is: *which rule scheduled which function?* ``ApplyDefaultSchedule`` +# does not log this directly, but you can figure it out by applying rules one at a time. +# +# **Step 1**: Apply each rule individually and record which functions it claims. 
+ +from collections import OrderedDict + +rules = OrderedDict( + [ + ("Matmul", dl.gpu.Matmul()), + ("GEMV", dl.gpu.GEMV()), + ("LowBatchGEMV", dl.gpu.LowBatchGEMV()), + ("Reduction", dl.gpu.Reduction()), + ("GeneralReduction", dl.gpu.GeneralReduction()), + ("Transpose", dl.gpu.Transpose()), + ("RMSNorm", dl.gpu.RMSNorm()), + ] +) + +rule_assignment = {} +for rule_name, rule in rules.items(): + with target: + test_mod = dl.ApplyDefaultSchedule(rule)(mod) + for gv, func in test_mod.functions_items(): + if isinstance(func, tirx.PrimFunc) and gv.name_hint not in rule_assignment: + if "tirx.is_scheduled" in func.attrs and func.attrs["tirx.is_scheduled"] == 1: + rule_assignment[gv.name_hint] = rule_name + +###################################################################### +# **Step 2**: Functions not claimed by any specialized rule will fall through to ``Fallback``. + +all_tir_funcs = [ + gv.name_hint for gv, func in mod.functions_items() if isinstance(func, tirx.PrimFunc) +] +fallback_funcs = [name for name in all_tir_funcs if name not in rule_assignment] + +print("Rule assignments:") +for name, rule_name in sorted(rule_assignment.items()): + print(f" {name:40s} -> {rule_name}") +if fallback_funcs: + print("Handled by Fallback (may have suboptimal performance):") + for name in sorted(fallback_funcs): + print(f" {name}") + +###################################################################### +# If an important kernel lands in the Fallback bucket, you have three options: +# +# 1. Write a **custom DLight rule** for it (see below). +# 2. Use **MetaSchedule** to auto-tune that specific function. +# 3. Manually schedule it with the ``tvm.s_tir.Schedule`` API. + +###################################################################### +# DLight vs MetaSchedule +# ---------------------- +# The two systems are complementary, not competing: +# +# .. 
list-table:: +# :header-rows: 1 +# :widths: 20 40 40 +# +# * - +# - DLight +# - MetaSchedule +# * - Mechanism +# - Deterministic rule matching +# - Search-space exploration +# * - Compile time +# - Seconds +# - Minutes to hours +# * - Performance +# - Excellent on known patterns, fair otherwise +# - Near-optimal with sufficient search budget +# * - Best for +# - Default path, rapid iteration, CI +# - Hot-spot tuning in production +# +# A practical workflow: +# +# 1. Run ``ApplyDefaultSchedule`` with the full rule set to cover all functions. +# 2. Profile the compiled model to identify hot-spot kernels. +# 3. Use ``MetaScheduleTuneTIR`` to auto-tune only those kernels. +# +# Note that ``MetaScheduleTuneTIR`` does **not** automatically skip functions already +# scheduled by DLight — it processes every ``PrimFunc`` in the module. In practice this +# is harmless (tuning an already-scheduled function simply re-explores its space), but if +# you want to avoid the extra search cost, filter the module or use ``MetaScheduleTuneIRMod`` +# with ``op_names`` to target specific functions. + +###################################################################### +# Writing a Custom Rule +# --------------------- +# You can extend DLight by writing your own ``ScheduleRule``. The simplest way is +# ``ScheduleRule.from_callable``, which wraps a plain function into a rule **instance**. + +from tvm import s_tir +from tvm.s_tir.dlight.analysis import normalize_prim_func +from tvm.s_tir.dlight.base.schedule_rule import ScheduleRule + + +@ScheduleRule.from_callable("MyTileAndBind") +def my_tile_and_bind(func: tirx.PrimFunc, target: tvm.target.Target, tunable: bool): + """A minimal rule: for single-block injective functions, tile and bind to GPU threads.""" + if not isinstance(func, tirx.PrimFunc): + return None + sch = s_tir.Schedule(func) + # Use normalize_prim_func to get block info with correct spatial/reduction classification. 
+ # This is the same analysis used by built-in DLight rules. + block_infos = normalize_prim_func(sch) + if block_infos is None or len(block_infos) != 1: + return None # only handle single-block functions + info = block_infos[0] + if not info.is_injective(): + return None # skip reductions — dom_kind() uses iter_type, not loop kind + loops = sch.get_loops(info.block_rv) + if len(loops) == 0: + return None + fused = sch.fuse(*loops) + bx, tx = sch.split(fused, factors=[None, 256]) + sch.bind(bx, "blockIdx.x") + sch.bind(tx, "threadIdx.x") + return sch + + +###################################################################### +# Insert the custom rule into the rule chain. Note that ``from_callable`` returns an +# **instance**, so pass it directly — do not call ``my_tile_and_bind()`` again. + +with target: + custom_mod = dl.ApplyDefaultSchedule( + dl.gpu.Matmul(), + dl.gpu.GeneralReduction(), + my_tile_and_bind, # our custom rule, tried before Fallback + dl.gpu.Fallback(), + )(mod) + +custom_mod.show() + +###################################################################### +# To build a production-quality rule, subclass ``ScheduleRule`` directly and implement +# ``apply()`` with full analysis logic (see ``tvm.s_tir.dlight.gpu.Matmul`` for an example). + +###################################################################### +# Summary +# ------- +# - **DLight** provides fast, deterministic GPU scheduling via rule matching. +# - Rules are tried in order; the first match wins. Put specialized rules before general ones. +# - Use the **single-rule probing** technique to diagnose which rule handles each function. +# - Combine DLight with MetaSchedule: DLight for baseline coverage, MetaSchedule for hot-spot tuning. +# - Extend DLight by writing custom ``ScheduleRule`` implementations. +# +# For DLight's role in the broader optimization pipeline, see :ref:`customize_opt`. 
diff --git a/docs/deep_dive/tensor_ir/tutorials/meta_schedule.py b/docs/deep_dive/tensor_ir/tutorials/meta_schedule.py new file mode 100644 index 000000000000..a263397bbe2a --- /dev/null +++ b/docs/deep_dive/tensor_ir/tutorials/meta_schedule.py @@ -0,0 +1,307 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# ruff: noqa: E402 + +""" +.. _meta_schedule_deep_dive: + +MetaSchedule: Search-Based Auto-Tuning +======================================= +MetaSchedule is TVM's search-based auto-tuning framework, located in +``python/tvm/s_tir/meta_schedule/``. It explores different TIR schedules +(loop tiling, vectorization, thread binding, etc.) and measures them on real +hardware to find the fastest implementation for each operator. + +While **DLight** (see :ref:`dlight_gpu_scheduling`) provides rule-based scheduling with zero +search time, MetaSchedule trades compilation time for better performance by searching over +the space of possible schedules. + +.. 
contents:: Table of Contents + :local: + :depth: 1 +""" + +###################################################################### +# Architecture Overview +# --------------------- +# A MetaSchedule tuning session involves the following components: +# +# - **ExtractedTask**: A unique TIR workload extracted from a Relax IRModule, +# with a ``task_name`` and ``weight`` (call frequency in the graph). +# - **TuneContext**: Container holding all resources for a single tuning task +# (module, target, space generator, search strategy, etc.). +# - **SpaceGenerator** (default: ``PostOrderApply``): Generates the design space +# of possible schedules by applying ``ScheduleRule`` instances to each block. +# - **SearchStrategy** (default: ``EvolutionarySearch``): Explores the design +# space using an evolutionary algorithm guided by a cost model. +# - **CostModel** (default: ``XGBModel``): Predicts schedule performance using +# XGBoost, reducing the number of actual hardware measurements needed. +# Alternatives include ``MLPModel`` (neural network) and ``RandomModel`` +# (baseline). +# - **Builder** / **Runner**: Compile and execute candidates on real hardware to +# obtain measured run times. +# - **Database** (default: ``JSONDatabase``): Persistently stores tuning records +# (schedule traces + measured run times) for later retrieval. +# - **TaskScheduler** (default: ``GradientBasedScheduler``): Allocates tuning +# budget across multiple tasks based on their weights and estimated improvement +# potential. +# +# The tuning loop works as follows: +# +# 1. The **TaskScheduler** picks a task to tune. +# 2. The **SpaceGenerator** produces candidate schedules from the design space. +# 3. The **SearchStrategy** selects candidates (guided by the **CostModel**) +# and sends them to the **Builder** and **Runner** for measurement. +# 4. Measured results are committed to the **Database** and used to update the +# **CostModel** for the next iteration. +# 5. 
Repeat until the trial budget is exhausted. + +###################################################################### +# Prepare a Model +# --------------- +# We reuse a simple model to demonstrate MetaSchedule APIs. + +import os +import tempfile + +import tvm +from tvm import relax +from tvm.relax.frontend import nn + + +class DemoModel(nn.Module): + def __init__(self): + super().__init__() + self.fc1 = nn.Linear(784, 256) + self.relu = nn.ReLU() + self.fc2 = nn.Linear(256, 10, bias=False) + + def forward(self, x): + x = self.fc1(x) + x = self.relu(x) + x = self.fc2(x) + return x + + +input_shape = (1, 784) +mod, params = DemoModel().export_tvm({"forward": {"x": nn.spec.Tensor(input_shape, "float32")}}) + +device = tvm.cuda(0) +target = tvm.target.Target.from_device(device) + +###################################################################### +# User-Facing Entry Points +# ------------------------ +# MetaSchedule provides several levels of API, from high-level transforms to +# low-level tuning functions. +# +# Transform-Based API (Recommended) +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +# These are Relax passes that can be composed into a ``Sequential`` pipeline: +# +# - **MetaScheduleTuneIRMod**: Tunes an entire IRModule. Supports ``op_names`` +# for selective operator tuning. +# - **MetaScheduleTuneTIR**: Tunes all TIR functions individually (no +# ``op_names`` filtering). +# - **MetaScheduleApplyDatabase**: Applies the best schedules from the tuning +# database. Only replaces functions that have records; the rest are left +# unchanged. +# +# Here is a typical tune-and-apply pipeline: +# +# .. note:: +# +# To save CI time and avoid flakiness, we skip the tuning process in CI. 
+ +if os.getenv("CI", "") != "true": + with target, tempfile.TemporaryDirectory() as tmp_dir: + tuned_mod = tvm.ir.transform.Sequential( + [ + relax.get_pipeline("zero"), + relax.transform.MetaScheduleTuneTIR( + work_dir=tmp_dir, + max_trials_global=300, + ), + relax.transform.MetaScheduleApplyDatabase(work_dir=tmp_dir), + ] + )(mod) + + tuned_mod.show() + +###################################################################### +# Inspecting Tunable Tasks +# ------------------------ +# Before tuning, use ``extract_tasks`` to see what MetaSchedule will tune: + +from tvm.s_tir.meta_schedule.relax_integration import extract_tasks + +with target: + legalized_mod = relax.get_pipeline("zero")(mod) + +tasks = extract_tasks(legalized_mod, target) +for i, task in enumerate(tasks): + print(f"Task {i}: {task.task_name} (weight={task.weight})") + +###################################################################### +# Each ``ExtractedTask`` has: +# +# - ``task_name``: Derived from the PrimFunc name (e.g., ``"fused_matmul_add_relu"``). +# - ``weight``: How many ``call_tir`` sites invoke this workload. The task +# scheduler uses weights to allocate more budget to frequently-called operators. +# - ``dispatched``: List of candidate TIR modules for this workload. + +###################################################################### +# Selective Operator Tuning +# ------------------------- +# ``MetaScheduleTuneIRMod`` accepts an ``op_names`` parameter to tune only +# operators whose task name contains any of the given strings: +# +# .. 
code-block:: python +# +# with target: +# mod = tvm.ir.transform.Sequential([ +# relax.transform.MetaScheduleTuneIRMod( +# params={}, +# work_dir="./tuning_logs", +# max_trials_global=300, +# op_names=["matmul"], # Only tune matmul-related operators +# ), +# relax.transform.MetaScheduleApplyDatabase(work_dir="./tuning_logs"), +# ])(mod) +# +# Operators without tuning records are left unscheduled -- you can apply DLight or +# other rule-based schedules to cover them afterward. +# +# .. note:: +# +# ``MetaScheduleTuneTIR`` does not support ``op_names`` filtering. Use +# ``MetaScheduleTuneIRMod`` when you need selective tuning. + +###################################################################### +# Database +# -------- +# When using a fixed ``work_dir``, tuning results are persisted in two +# newline-delimited JSON files: +# +# - ``database_workload.json``: One line per unique workload (structural hash + +# serialized IRModule). +# - ``database_tuning_record.json``: One line per tuning record (workload index + +# schedule trace + measured run times). +# +# Records are appended incrementally as tuning progresses. +# +# Resumption Semantics +# ~~~~~~~~~~~~~~~~~~~~ +# When you re-run tuning with the same ``work_dir``, existing records are loaded +# and used as warm-start seeds for the evolutionary search. The tuner does +# **not** skip already-seen workloads entirely -- it starts from a better initial +# population, so re-runs are faster than starting from scratch but still consume +# trials. +# +# Once tuning is done, subsequent compilations only need +# ``MetaScheduleApplyDatabase``: +# +# .. code-block:: python +# +# with target: +# mod = relax.transform.MetaScheduleApplyDatabase( +# work_dir="./tuning_logs" +# )(mod) +# +# Database Implementations +# ~~~~~~~~~~~~~~~~~~~~~~~~ +# MetaSchedule ships several database backends: +# +# - **JSONDatabase**: Persistent file-based storage (default). Created +# automatically when you pass ``work_dir``. 
+# - **MemoryDatabase**: In-memory, non-persistent. Useful for testing. +# - **UnionDatabase**: Queries all sub-databases and returns the globally best +# record. +# - **OrderedUnionDatabase**: Queries sub-databases in order; returns from the +# first one that has a match. +# - **ScheduleFnDatabase**: Wraps a user-provided scheduling function. + +###################################################################### +# Cross-Model Database Reuse +# -------------------------- +# MetaSchedule identifies workloads by their structural hash. If two models +# contain operators with the same shape, dtype, and computation, they share the +# same hash and can reuse tuning records. +# +# module_equality Options +# ~~~~~~~~~~~~~~~~~~~~~~~ +# - ``"structural"`` (default): Exact structural match. Safe but strict. +# - ``"anchor-block"``: Match based on the dominant compute block, ignoring +# surrounding context. More permissive -- enables sharing across fused operators +# that have the same core computation but different fusion boundaries. +# +# ``OrderedUnionDatabase`` enables a layered lookup strategy: check a local +# database first, then fall back to a shared team database: +# +# .. code-block:: python +# +# from tvm.s_tir.meta_schedule.database import JSONDatabase, OrderedUnionDatabase +# +# local_db = JSONDatabase(work_dir="./my_tuning_logs") +# shared_db = JSONDatabase(work_dir="/shared/tuning_db") +# combined_db = OrderedUnionDatabase(local_db, shared_db) +# +# with target, combined_db: +# mod = relax.transform.MetaScheduleApplyDatabase()(mod) + +###################################################################### +# Key Parameters Reference +# ------------------------ +# +# .. list-table:: +# :header-rows: 1 +# :widths: 25 75 +# +# * - Parameter +# - Description +# * - ``max_trials_global`` +# - Total trial budget shared across all tasks. Set proportional to the +# number of tasks (e.g., 200-500 trials per task for good results). 
+# * - ``max_trials_per_task`` +# - Per-task trial cap. Defaults to ``max_trials_global`` if not set. +# * - ``op_names`` +# - List of strings to filter tasks by name (substring match). +# ``MetaScheduleTuneIRMod`` only. +# * - ``work_dir`` +# - Directory for database files and logs. Use a fixed path to enable +# persistence and resumption. +# * - ``cost_model`` +# - ``"xgb"`` (XGBoost, default), ``"mlp"`` (neural network), or +# ``"random"`` (baseline). Only available via the lower-level ``tune_relax`` function. +# * - ``runner`` +# - ``"local"`` (default) or an ``RPCRunner`` instance for remote devices. +# Only available via ``tune_relax``. +# * - ``module_equality`` +# - ``"structural"`` (default) or ``"anchor-block"`` for more permissive +# cross-model matching. Only available via ``tune_relax``. + +###################################################################### +# Summary +# ------- +# - **MetaSchedule** finds high-quality TIR schedules by searching over the +# design space and measuring on real hardware. +# - Use ``MetaScheduleTuneTIR`` for full-module tuning, or +# ``MetaScheduleTuneIRMod`` with ``op_names`` for selective tuning. +# - Tuning records persist in ``work_dir`` and can be reused across runs and +# models with the same operator shapes. +# - Combine with DLight: use DLight for fast baseline coverage, then MetaSchedule +# for hot-spot tuning (see :ref:`dlight_gpu_scheduling`). diff --git a/docs/how_to/tutorials/meta_schedule.py b/docs/how_to/tutorials/meta_schedule.py deleted file mode 100644 index 2245ab5e6196..000000000000 --- a/docs/how_to/tutorials/meta_schedule.py +++ /dev/null @@ -1,317 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. 
The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -# ruff: noqa: E402, F401, I001 - -""" -.. _meta_schedule_advanced: - -MetaSchedule: Advanced Auto-Tuning Guide -========================================= -MetaSchedule is TVM's search-based auto-tuning framework. It explores different TIR schedules -(loop tiling, vectorization, thread binding, etc.) and measures them on real hardware to find -the fastest implementation for each operator. - -For the basic tune-and-apply workflow, see :ref:`optimize_model`. This tutorial focuses on -advanced usage: inspecting tunable tasks, selective operator tuning, database management, -cross-model reuse, and the lower-level tuning API. - -.. contents:: Table of Contents - :local: - :depth: 1 -""" - -###################################################################### -# Prepare a Model -# --------------- -# We start with a simple MLP model exported as a Relax IRModule, then legalize it -# so that high-level Relax operators are lowered to TIR functions that MetaSchedule can tune. 
- -import os -import numpy as np - -import tvm -from tvm import relax -from tvm.relax.frontend import nn - -IS_IN_CI = os.getenv("CI", "") == "true" - - -class MLPModel(nn.Module): - def __init__(self): - super().__init__() - self.fc1 = nn.Linear(784, 256) - self.relu1 = nn.ReLU() - self.fc2 = nn.Linear(256, 10, bias=False) - - def forward(self, x): - x = self.fc1(x) - x = self.relu1(x) - x = self.fc2(x) - return x - - -input_shape = (1, 784) -mod, params = MLPModel().export_tvm({"forward": {"x": nn.spec.Tensor(input_shape, "float32")}}) - -# Legalize: lower Relax operators to TIR PrimFuncs -target = tvm.target.Target({"kind": "llvm", "num-cores": 4}) -with target: - mod = relax.get_pipeline("zero")(mod) - -mod.show() - -###################################################################### -# Inspecting Tunable Tasks -# ------------------------ -# Before tuning, it is useful to see what MetaSchedule will actually tune. The -# ``extract_tasks`` function analyzes an IRModule and returns one ``ExtractedTask`` per -# unique TIR workload. Each task has a ``task_name`` and a ``weight`` (how many times -# this workload is called in the graph — the task scheduler uses weights to allocate -# more tuning budget to frequently-called operators). - -from tvm.s_tir.meta_schedule.relax_integration import extract_tasks - -tasks = extract_tasks(mod, target) -for i, task in enumerate(tasks): - print(f"Task {i}: {task.task_name} (weight={task.weight})") - -###################################################################### -# This tells you exactly how many operators need tuning and their relative importance. -# Use this to decide whether to tune all operators or focus on a subset. - -###################################################################### -# Selective Operator Tuning -# ------------------------- -# Tuning every operator can be time-consuming. 
``MetaScheduleTuneIRMod`` accepts an -# ``op_names`` parameter to tune only operators whose task name contains any of the given -# strings. Operators without tuning records are left unscheduled — you can later apply -# DLight or other rule-based schedules to cover them. -# -# .. note:: -# -# ``MetaScheduleTuneIRMod`` works at the IRModule level and supports ``op_names`` filtering, -# while ``MetaScheduleTuneTIR`` tunes all TIR functions without filtering. Choose based on -# your needs. -# -# .. note:: -# -# To save CI time and avoid flakiness, we skip the tuning process in CI environment. -# - -if not IS_IN_CI: - WORK_DIR = "./tuning_logs" - with target: - tuned_mod = tvm.ir.transform.Sequential( - [ - relax.transform.MetaScheduleTuneIRMod( - params={}, - work_dir=WORK_DIR, - max_trials_global=300, - op_names=["matmul"], # Only tune matmul-related operators - ), - relax.transform.MetaScheduleApplyDatabase(work_dir=WORK_DIR), - ] - )(mod) - - tuned_mod.show() - -###################################################################### -# Database Persistence and Resumption -# ------------------------------------ -# When you use a fixed ``work_dir`` (instead of ``tempfile.TemporaryDirectory``), tuning -# results are persisted in two JSON files: -# -# - ``database_workload.json``: One line per unique workload (structural hash + serialized -# IRModule). -# - ``database_tuning_record.json``: One line per tuning record (workload index + schedule -# trace + measured run times). -# -# Both files use a newline-delimited JSON format. Records are appended incrementally as -# tuning progresses, so **interrupting and resuming is safe**. When you re-run tuning with -# the same ``work_dir``, existing records are loaded and used as warm-start seeds for the -# evolutionary search — the tuner does not skip already-seen workloads entirely, but starts -# from a better initial population, so re-runs are faster than starting from scratch. 
-# -# You can quickly check tuning progress from the command line: -# -# .. code-block:: bash -# -# # Count how many tuning records have been collected -# wc -l tuning_logs/database_tuning_record.json -# -# Once tuning is done, subsequent compilations only need ``MetaScheduleApplyDatabase`` -# which reads the database and applies the best schedules — this takes seconds, not hours: -# -# .. code-block:: python -# -# # Fast: apply previously tuned results (no search) -# with target: -# mod = relax.transform.MetaScheduleApplyDatabase(work_dir="./tuning_logs")(mod) -# - -###################################################################### -# Querying the Tuning Database -# ---------------------------- -# The ``JSONDatabase`` class provides a Python API to inspect tuning results -# programmatically. This is useful for analyzing tuning quality, comparing different -# tuning runs, or debugging performance issues. - -from tvm.s_tir.meta_schedule.database import JSONDatabase - -if not IS_IN_CI: - db = JSONDatabase(work_dir=WORK_DIR) - print(f"Total tuning records: {len(db)}") - - # List all records with their best measured runtime - records = db.get_all_tuning_records() - for rec in records: - if rec.run_secs: - best = min(float(s) for s in rec.run_secs) - print(f" Best time: {best * 1e3:.3f} ms") - -###################################################################### -# You can also query the best schedule for a specific TIR function by passing its -# IRModule. For example, to query a single PrimFunc extracted from the full module: -# -# .. 
code-block:: python -# -# # tir_mod: an IRModule containing a single PrimFunc named "main" -# record = db.query_tuning_record(tir_mod, target, workload_name="main") -# if record: -# print(f"Best time: {min(float(s) for s in record.run_secs) * 1e3:.3f} ms") -# # Reconstruct the optimized schedule -# sch = db.query_schedule(tir_mod, target, workload_name="main") -# sch.mod.show() -# - -###################################################################### -# Cross-Model Database Reuse -# -------------------------- -# MetaSchedule identifies workloads by their structural hash. If two models contain -# operators with the same shape, dtype, and computation, they share the same hash and -# can reuse tuning records. This means a matmul ``(M=1, N=256, K=784)`` tuned for one -# model will automatically be reused by any other model with the same matmul shape. -# -# **module_equality options**: -# -# - ``"structural"`` (default): Exact structural match. Safe but strict. -# - ``"anchor-block"``: Match based on the dominant compute block, ignoring -# surrounding context. More permissive — enables sharing across fused operators -# that have the same core computation but different fusion boundaries. -# -# **OrderedUnionDatabase** enables a layered lookup strategy: check a local database -# first, then fall back to a shared team database: - -from tvm.s_tir.meta_schedule.database import OrderedUnionDatabase -from tvm.s_tir.meta_schedule.relax_integration import tune_relax - -###################################################################### -# -# .. code-block:: python -# -# local_db = JSONDatabase(work_dir="./my_tuning_logs") -# shared_db = JSONDatabase(work_dir="/shared/tuning_db") -# combined_db = OrderedUnionDatabase(local_db, shared_db) -# -# With this setup, ``combined_db.query_tuning_record(...)`` checks ``local_db`` first. -# Only if no match is found does it fall back to ``shared_db``. 
This lets a team maintain -# a shared tuning database while individuals only tune new operators locally. -# -# To make ``MetaScheduleApplyDatabase`` use the combined database during compilation, -# enter it as a context manager. The pass checks ``Database.current()`` first, and only -# falls back to ``work_dir`` when no database is in scope: -# -# .. code-block:: python -# -# with target, combined_db: -# mod = relax.transform.MetaScheduleApplyDatabase()(mod) -# - -###################################################################### -# Lower-Level API: ``tune_relax`` -# -------------------------------- -# The transform-based API (``MetaScheduleTuneTIR`` / ``MetaScheduleTuneIRMod``) covers -# most use cases. For advanced scenarios -- custom cost models, remote runners, or -# fine-grained control -- use the lower-level ``tune_relax`` function directly: - -###################################################################### -# -# .. code-block:: python -# -# db = tune_relax( -# mod=mod, -# params={}, -# target=target, -# work_dir="./tuning_logs", -# max_trials_global=2000, -# max_trials_per_task=500, -# op_names=["matmul"], # Selective tuning -# cost_model="xgb", # "xgb" (default), "mlp", or "random" -# num_trials_per_iter=64, # Batch size per search iteration -# runner="local", # "local" or RPCRunner for remote devices -# module_equality="structural", # "structural" or "anchor-block" -# ) -# -# Key parameters: -# -# - **cost_model**: ``"xgb"`` (XGBoost, default) uses gradient-boosted trees to predict -# schedule performance, reducing the number of actual measurements needed. ``"mlp"`` -# uses a neural network-based model. ``"random"`` disables prediction (baseline). -# - **num_trials_per_iter**: How many candidates are measured in each search iteration. -# Larger values improve hardware utilization but use more memory. -# - **runner**: Use ``"local"`` for the current machine. For cross-compilation scenarios -# (e.g., tuning for a remote device), use ``RPCRunner``. 
-# - **module_equality**: Controls how workloads are matched. ``"anchor-block"`` improves -# database hit rate across models at the cost of slightly less precise matching. - -###################################################################### -# Build and Run -# ------------- -# Finally, we build and run the model to verify the result. If tuning was skipped -# (e.g., in CI), we compile the untuned module directly — LLVM can still generate -# valid (though unoptimized) code for CPU targets without explicit scheduling. - -final_mod = tuned_mod if not IS_IN_CI else mod - -ex = tvm.compile(final_mod, target) -vm = relax.VirtualMachine(ex, tvm.cpu()) -data = tvm.runtime.tensor(np.random.rand(*input_shape).astype("float32")) -tvm_params = [tvm.runtime.tensor(np.random.rand(*p.shape).astype(p.dtype)) for _, p in params] -result = vm["forward"](data, *tvm_params).numpy() -print("Output shape:", result.shape) -print("Output:", result) - -###################################################################### -# Summary -# ------- -# This tutorial covered advanced MetaSchedule usage beyond the basic tune-and-apply flow: -# -# - **Inspect tasks** with ``extract_tasks`` to understand what will be tuned and plan your -# tuning budget: ``max_trials_global`` is shared across all tasks, so set it proportional -# to the number of tasks (e.g., 200-500 trials per task for good results). -# - **Selective tuning** with ``op_names`` to focus on performance-critical operators and -# skip the rest. -# - **Persist results** with a fixed ``work_dir``. Tuning is resumable — existing records -# warm-start the search on re-run. -# - **Query the database** to analyze tuning quality and debug performance. -# - **Reuse across models** via ``OrderedUnionDatabase`` and ``module_equality="anchor-block"`` -# to amortize tuning cost across a team or model family. -# - **Lower-level API** (``tune_relax``) for custom cost models, remote runners, and -# fine-grained control. 
-# -# For the basic end-to-end workflow, see :ref:`optimize_model`. For rule-based scheduling -# without search, see DLight documentation. diff --git a/docs/index.rst b/docs/index.rst index 511f2ffa0b4f..2b5ef6464636 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -43,7 +43,6 @@ driving its costs down. how_to/tutorials/e2e_opt_model how_to/tutorials/customize_opt - how_to/tutorials/meta_schedule how_to/tutorials/optimize_llm how_to/tutorials/cross_compilation_and_rpc how_to/tutorials/export_and_load_executable