From a3ec78c5d9f958731d32585f99fae879d9ad2751 Mon Sep 17 00:00:00 2001 From: Masahiro Masuda Date: Wed, 21 Sep 2022 09:53:23 +0900 Subject: [PATCH 1/4] [Metaschedule] Support template-free tuning on Hexagon --- python/tvm/meta_schedule/default_config.py | 55 +++- python/tvm/meta_schedule/tune.py | 29 +- .../test_hexagon/test_meta_schedule.py | 294 +++++++++++++++++- 3 files changed, 371 insertions(+), 7 deletions(-) diff --git a/python/tvm/meta_schedule/default_config.py b/python/tvm/meta_schedule/default_config.py index ac4028ec50f8..56e753fc1a54 100644 --- a/python/tvm/meta_schedule/default_config.py +++ b/python/tvm/meta_schedule/default_config.py @@ -174,10 +174,12 @@ def schedule_rules( # pylint: disable=redefined-outer-name return sch_rules() if sch_rules is not None: raise TypeError(f"Expected `sch_rules` to be None or callable, but gets: {sch_rules}") - if target.kind.name in ["llvm", "hexagon"]: + if target.kind.name == "llvm": return _DefaultLLVM.schedule_rules() if target.kind.name in ["cuda", "rocm", "vulkan"]: return _DefaultCUDA.schedule_rules() + if target.kind.name == "hexagon": + return _DefaultHexagon.schedule_rules() raise ValueError(f"Unsupported target: {target}") @@ -190,10 +192,12 @@ def postproc( # pylint: disable=redefined-outer-name return postproc() if postproc is not None: raise TypeError(f"Expected `postproc` to be None or callable, but gets: {postproc}") - if target.kind.name in ["llvm", "hexagon"]: + if target.kind.name == "llvm": return _DefaultLLVM.postprocs() if target.kind.name in ["cuda", "rocm", "vulkan"]: return _DefaultCUDA.postprocs() + if target.kind.name == "hexagon": + return _DefaultHexagon.postprocs() raise ValueError(f"Unsupported target: {target}") @@ -277,6 +281,53 @@ def mutator_probs() -> Dict[Mutator, float]: } +class _DefaultHexagon: + """Default tuning configuration for Hexagon.""" + + @staticmethod + def schedule_rules() -> List[ScheduleRule]: + from tvm.meta_schedule import schedule_rule as M + + return [ + M.AutoInline( + into_producer=False, + into_consumer=True, + inline_const_tensor=True, + disallow_if_then_else=True, + require_injective=True, + require_ordered=True, + disallow_op=["tir.exp"], + ), + M.MultiLevelTilingWideVector( + structure="SRSRS", + vector_length_in_bits=1024, + max_innermost_factor=128, + reuse_read=None, + reuse_write=M.ReuseType( + req="may", + levels=[1, 2], + scope="global", + ), + ), + M.ParallelizeVectorizeUnroll( + max_jobs_per_core=-1, + max_vectorize_extent=128, + unroll_max_steps=[0, 16, 64, 512], + unroll_explicit=True, + ), + ] + + @staticmethod + def postprocs() -> List[Postproc]: + from tvm.meta_schedule import postproc as M + + return [ + M.DisallowDynamicLoop(), + M.RewriteParallelVectorizeUnroll(), + M.RewriteReductionBlock(), + ] + + class _DefaultCUDA: """Default tuning configuration for CUDA.""" diff --git a/python/tvm/meta_schedule/tune.py b/python/tvm/meta_schedule/tune.py index 20eccc30a113..4285e940db95 100644 --- a/python/tvm/meta_schedule/tune.py +++ b/python/tvm/meta_schedule/tune.py @@ -554,6 +554,7 @@ def tune_relay( postprocs: Optional[FnPostproc] = None, mutator_probs: Optional[FnMutatorProb] = None, num_threads: Optional[int] = None, + executor=None, ) -> Union[Module, vm.Executable]: """Tune a Relay IRModule with a given target. @@ -581,6 +582,9 @@ def tune_relay( The callbacks used during tuning. backend : str = "graph" The backend to use for relay compilation(graph / vm). + executor : relay.backend.Executor + The executor to be passed to relay.build(...). In particular, its link-params + attribute affects task extration and workload database look up. Returns ------- @@ -596,8 +600,23 @@ def tune_relay( target = default_config.target(target) # pylint: enable=protected-access, # parse the tuning contexts + + if executor is None: + executor = relay.backend.Executor("graph") + + if "link-params" in executor.attrs: + link_params = executor.attrs["link-params"] + else: + link_params = False + with Profiler.timeit("TaskExtraction"): - extracted_tasks = extract_task_from_relay(mod, target, params) + pass_config = { + "relay.FuseOps.link_params": link_params, + "relay.backend.use_meta_schedule": True, + "relay.backend.tir_converter": "default", + } + extracted_tasks = extract_task_from_relay(mod, target, params, pass_config=pass_config) + database = tune_extracted_tasks( extracted_tasks, config, @@ -613,7 +632,7 @@ def tune_relay( mutator_probs=mutator_probs, num_threads=num_threads, ) - relay_build = {"graph": relay.build, "vm": relay.vm.compile}[backend] + with Profiler.timeit("PostTuningCompilation"): with target, autotvm_silencer(), database: with PassContext( @@ -624,4 +643,8 @@ def tune_relay( "relay.backend.tir_converter": "default", }, ): - return relay_build(mod, target=target, params=params) + if backend == "graph": + return relay.build(mod, target=target, params=params, executor=executor) + + # Executor is not supported by VM + return relay.vm.compile(mod, target=target, params=params) diff --git a/tests/python/contrib/test_hexagon/test_meta_schedule.py b/tests/python/contrib/test_hexagon/test_meta_schedule.py index 96d18c9b3076..d0acd54ececd 100644 --- a/tests/python/contrib/test_hexagon/test_meta_schedule.py +++ b/tests/python/contrib/test_hexagon/test_meta_schedule.py @@ -21,15 +21,20 @@ import tempfile import tvm.testing -from tvm import te +import tvm.topi.testing +from tvm import te, relay from tvm import meta_schedule as ms from tvm.meta_schedule.arg_info import TensorInfo from tvm.meta_schedule.builder import BuilderInput +from tvm.meta_schedule import postproc, schedule_rule from tvm.script import tir as T from tvm.tir import FloatImm from tvm.tir.tensor_intrin.hexagon import VRMPY_u8u8i32_INTRIN from tvm.meta_schedule.runner import RunnerInput from tvm.contrib.hexagon.meta_schedule import get_hexagon_local_builder, get_hexagon_rpc_runner +from tvm.relay.backend import Executor +from tvm.topi.utils import get_const_tuple +from tvm.meta_schedule.testing import te_workload MATMUL_N = 16 MATMUL_M = 32 @@ -166,7 +171,6 @@ def verify_dense(sch, target, M, N, K, hexagon_session): print("%f ms, %f GOPS" % (time_ms, gflops / (time_ms / 1e3))) -@pytest.mark.skip(reason="xgboost not installed on CI") @tvm.testing.requires_hexagon def test_vrmpy_dense(hexagon_launcher): if hexagon_launcher._serial_number == "simulator": @@ -209,3 +213,289 @@ def schedule_dense_for_tune(sch): with hexagon_launcher.start_session() as session: verify_dense(sch, target, M, N, K, session) + + +@tvm.testing.requires_hexagon +def test_vrmpy_dense_auto_tensorize(hexagon_launcher): + if hexagon_launcher._serial_number == "simulator": + pytest.skip(msg="Tuning on simulator not supported.") + + target_hexagon = tvm.target.hexagon("v68") + target = tvm.target.Target(target_hexagon, host=target_hexagon) + + M, N, K = 128, 768, 768 + workload = te.create_prim_func(dense(M, N, K)) + + sch_rules = [ + schedule_rule.AutoInline( + into_producer=False, + into_consumer=True, + inline_const_tensor=True, + disallow_if_then_else=True, + require_injective=True, + require_ordered=True, + disallow_op=["tir.exp"], + ), + schedule_rule.AddRFactor(max_jobs_per_core=16, max_innermost_factor=64), + schedule_rule.MultiLevelTilingWithIntrin( + VRMPY_u8u8i32_INTRIN, + structure="SRSRS", + tile_binds=None, + max_innermost_factor=64, + vector_load_lens=None, + reuse_read=None, + reuse_write=schedule_rule.ReuseType( + req="may", + levels=[1, 2], + scope="global", + ), + ), + schedule_rule.ParallelizeVectorizeUnroll( + max_jobs_per_core=16, + max_vectorize_extent=128, + unroll_max_steps=[0, 16, 64, 512], + unroll_explicit=True, + ), + ] + + postprocs = [ + postproc.DisallowDynamicLoop(), + postproc.RewriteParallelVectorizeUnroll(), + postproc.RewriteReductionBlock(), + postproc.RewriteTensorize(vectorize_init_loop=True), + ] + + # with tempfile.TemporaryDirectory() as work_dir: + work_dir = "work" + config = ms.TuneConfig( + strategy="replay_trace", + num_trials_per_iter=32, + max_trials_per_task=128, + max_trials_global=128, + ) + + sch = ms.tune_tir( + mod=workload, + target=target, + config=config, + work_dir=work_dir, + sch_rules=lambda: sch_rules, + postprocs=lambda: postprocs, + builder=get_hexagon_local_builder(), + runner=get_hexagon_rpc_runner(hexagon_launcher, number=10), + ) + + print(sch.mod.script()) + + with hexagon_launcher.start_session() as session: + verify_dense(sch, target, M, N, K, session) + + +@tvm.testing.requires_hexagon +def test_conv2d_nhwc_auto_schedule(hexagon_launcher): + if hexagon_launcher._serial_number == "simulator": + pytest.skip(msg="Tuning on simulator not supported.") + + target_hexagon = tvm.target.hexagon("v69", num_cores=4) + target = tvm.target.Target(target_hexagon, host=target_hexagon) + I = 64 + O = 64 + H = 56 + W = 56 + kH = 3 + kW = 3 + + dtype = "float16" + + strides = (1, 1) + padding = (1, 1) + + data, kernel, out = te_workload.conv2d_nhwc( + 1, H, W, I, O, 3, 1, 1, 1, in_dtype="float16", out_dtype="float16" + ) + workload = te.create_prim_func([data, kernel, out]) + # workload = te.create_prim_func(te_workload.matmul(128, 768, 768)) + + # with tempfile.TemporaryDirectory() as work_dir: + work_dir = "work" + config = ms.TuneConfig( + # strategy="replay_trace", + strategy="evolutionary", + num_trials_per_iter=32, + max_trials_per_task=32, + max_trials_global=32, + ) + + sch = ms.tune_tir( + mod=workload, + target=target, + config=config, + work_dir=work_dir, + builder=get_hexagon_local_builder(), + runner=get_hexagon_rpc_runner(hexagon_launcher, number=10), + ) + print(sch.trace) + + import time + + t1 = time.time() + f = tvm.build(sch.mod["main"], [data, kernel, out], target) + t2 = time.time() + + # print("compiled in", t2 - t1) + # return + + with hexagon_launcher.start_session() as session: + # print("session acquired") + module = session.load_module(f) + dev = session.device + + a_np = np.random.randn(1, I, H, W).astype("float16") + w_np = np.random.randn(O, I, kH, kW).astype("float16") + c_np = tvm.topi.testing.conv2d_nchw_python( + a_np.astype("float32"), w_np.astype("float32"), strides, padding + ) + + data_np = np.zeros(get_const_tuple(data.shape)).astype(dtype) + w_np_hwio = np.zeros(get_const_tuple(kernel.shape)).astype(dtype) + + for i in range(I): + for h in range(H): + for w in range(W): + data_np[0, h, w, i] = a_np[0, i, h, w] + + for o in range(O): + for i in range(I): + for h in range(kH): + for w in range(kW): + w_np_hwio[h, w, i, o] = w_np[o, i, h, w] + + a = tvm.nd.array(data_np.astype(dtype), dev) + w = tvm.nd.array(w_np_hwio.astype(dtype), dev) + + c = tvm.nd.array(np.zeros(get_const_tuple(out.shape), dtype=out.dtype), dev) + + module(a, w, c) + + P, Q = c.shape[1:3] + evaluator = module.time_evaluator(module.entry_name, dev, number=20) + time_ms = evaluator(a, w, c).mean * 1e3 + gflops = (O * P * Q * I * kH * kW) * 2 / 1e9 + print("time elapsed: ", time_ms) + print("GFLOPS:", gflops / (time_ms / 1e3)) + + out_nhwc = c.numpy() + + out = np.zeros(c_np.shape).astype("float16") + + for o in range(O): + for h in range(P): + for w in range(Q): + out[0, o, h, w] = out_nhwc[0, h, w, o] + + print(np.max(np.abs(out - c_np)), np.mean(np.abs(out - c_np))) + + mx = np.max(np.abs(out - c_np)) + + indices = np.where(np.abs(out - c_np) == mx) + + print(out[indices], c_np[indices]) + + +# @tvm.testing.requires_hexagon +# def test_conv2d_relay_auto_schedule(hexagon_launcher): +# if hexagon_launcher._serial_number == "simulator": +# pytest.skip(msg="Tuning on simulator not supported.") + +# target_hexagon = tvm.target.hexagon("v69", num_cores=4) +# target = tvm.target.Target(target_hexagon, host=target_hexagon) +# I = 64 +# O = 64 +# H = 56 +# W = 56 +# kH = 3 +# kW = 3 + +# strides = (1, 1) +# padding = (1, 1) + +# d_shape = (1, H, W, I) +# w_shape = (kH, kW, I, O) +# bias_shape = (w_shape[3],) +# out_channel = w_shape[0] + +# data = relay.var("data", shape=d_shape, dtype="float16") +# weight = relay.var("weight", shape=w_shape, dtype="float16") +# bias = relay.var("bias", shape=bias_shape, dtype="float16") +# conv2d = relay.nn.conv2d( +# data=data, +# weight=weight, +# kernel_size=w_shape[2:], +# channels=out_channel, +# padding=padding, +# strides=strides, +# out_dtype="float16", +# data_layout="NHWC", +# weight_layout="HWIO", +# ) + +# bias_add = relay.nn.bias_add(conv2d, bias) + +# use_bias = True + +# if use_bias: +# out = bias_add +# else: +# out = conv2d + +# mod = tvm.IRModule.from_expr(out) + +# data_np = np.random.randn(*d_shape).astype("float16") +# weight_np = np.random.randn(*w_shape).astype("float16") +# bias_np = np.random.randn(w_shape[0]).astype("float16") +# params = {"weight": weight_np, "bias": bias_np} + +# target_llvm = tvm.target.Target("llvm") + +# with tvm.transform.PassContext( +# opt_level=3, +# ): +# lib_ref = relay.build(mod, target=target_llvm, params=params) + +# rt_mod_ref = tvm.contrib.graph_executor.GraphModule(lib_ref["default"](tvm.cpu(0))) + +# rt_mod_ref.set_input("data", data_np) + +# rt_mod_ref.run() + +# ref = rt_mod_ref.get_output(0).numpy() + +# config = ms.TuneConfig( +# strategy="evolutionary", +# num_trials_per_iter=32, +# max_trials_per_task=32, +# max_trials_global=32, +# ) + +# with tempfile.TemporaryDirectory() as work_dir: +# executor = Executor("graph", {"link-params": True}) +# lib = ms.tune_relay( +# mod=mod, +# params=params, +# target=target, +# config=config, +# work_dir=work_dir, +# builder=get_hexagon_local_builder(), +# runner=get_hexagon_rpc_runner(hexagon_launcher, number=20), +# executor=executor, +# ) + +# with hexagon_launcher.start_session() as session: +# rt_mod = session.get_executor_from_factory(lib) + +# rt_mod.set_input("data", data_np) + +# rt_mod.run() + +# out = rt_mod.get_output(0).numpy() +# print(np.max(np.abs(ref - out)), np.mean(np.abs(ref - out))) From eb42a1e2e5b5f3382475fa16313b98212e2eac95 Mon Sep 17 00:00:00 2001 From: Masahiro Masuda Date: Wed, 21 Sep 2022 09:57:24 +0900 Subject: [PATCH 2/4] enable multi threading --- python/tvm/meta_schedule/default_config.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/tvm/meta_schedule/default_config.py b/python/tvm/meta_schedule/default_config.py index 56e753fc1a54..256a9f95d4e5 100644 --- a/python/tvm/meta_schedule/default_config.py +++ b/python/tvm/meta_schedule/default_config.py @@ -310,7 +310,7 @@ def schedule_rules() -> List[ScheduleRule]: ), ), M.ParallelizeVectorizeUnroll( - max_jobs_per_core=-1, + max_jobs_per_core=16, max_vectorize_extent=128, unroll_max_steps=[0, 16, 64, 512], unroll_explicit=True, From 14958bb669d48c6f0dbdc1515b06c708be375fd9 Mon Sep 17 00:00:00 2001 From: Masahiro Masuda Date: Tue, 20 Sep 2022 18:56:45 -0700 Subject: [PATCH 3/4] update tests --- python/tvm/meta_schedule/default_config.py | 2 + .../test_hexagon/test_meta_schedule.py | 322 +++++++----------- 2 files changed, 119 insertions(+), 205 deletions(-) diff --git a/python/tvm/meta_schedule/default_config.py b/python/tvm/meta_schedule/default_config.py index 256a9f95d4e5..c2ad8709c247 100644 --- a/python/tvm/meta_schedule/default_config.py +++ b/python/tvm/meta_schedule/default_config.py @@ -325,6 +325,8 @@ def postprocs() -> List[Postproc]: M.DisallowDynamicLoop(), M.RewriteParallelVectorizeUnroll(), M.RewriteReductionBlock(), + # TODO(masahi): Fix RewriteLayout for link-params=True case + M.RewriteLayout(), ] diff --git a/tests/python/contrib/test_hexagon/test_meta_schedule.py b/tests/python/contrib/test_hexagon/test_meta_schedule.py index d0acd54ececd..f16bcda12dba 100644 --- a/tests/python/contrib/test_hexagon/test_meta_schedule.py +++ b/tests/python/contrib/test_hexagon/test_meta_schedule.py @@ -215,6 +215,43 @@ def schedule_dense_for_tune(sch): verify_dense(sch, target, M, N, K, session) +@tvm.script.ir_module +class Module_vrmpy_auto_tensorize: + @T.prim_func + def main(X: T.Buffer[(128, 768), "uint8"], packedW: T.Buffer[(24, 192, 32, 4), "uint8"], compute: T.Buffer[(128, 768), "int32"]) -> None: + # function attr dict + T.func_attr({"global_symbol": "main", "tir.noalias": True}) + # body + # with T.block("root") + for i0_0_i1_0_0_fused in T.parallel(512, annotations={"pragma_auto_unroll_max_step":64, "pragma_unroll_explicit":1}): + for i0_1_init, i1_0_1_init, i0_2_init, i1_0_2_init in T.grid(2, 3, 1, 1): + with T.block("compute_o_init"): + i = T.axis.spatial(128, i0_0_i1_0_0_fused // 8 * 2 + i0_1_init + i0_2_init) + j_o = T.axis.spatial(24, i1_0_2_init + i0_0_i1_0_0_fused % 8 * 3 + i1_0_1_init) + T.reads() + T.writes(compute[i, j_o * 32 : j_o * 32 + 32]) + for i1_1 in T.vectorized(32): + with T.block("compute_init"): + j_i_init = T.axis.spatial(32, i1_1) + T.reads() + T.writes(compute[i, j_o * 32 + j_i_init]) + compute[i, j_o * 32 + j_i_init] = 0 + for i2_0_0, i0_1, i1_0_1, i2_0_1, i0_2, i1_0_2 in T.grid(32, 2, 3, 6, 1, 1): + with T.block("compute_o_update"): + i = T.axis.spatial(128, i0_0_i1_0_0_fused // 8 * 2 + i0_1 + i0_2) + j_o = T.axis.spatial(24, i1_0_2 + i0_0_i1_0_0_fused % 8 * 3 + i1_0_1) + k_o = T.axis.reduce(192, i2_0_0 * 6 + i2_0_1) + T.reads(compute[i, j_o * 32 : j_o * 32 + 32], X[i, k_o * 4 : k_o * 4 + 4], packedW[j_o, k_o, 0 : 32, 0 : 4]) + T.writes(compute[i, j_o * 32 : j_o * 32 + 32]) + A = T.match_buffer(X[i, k_o * 4 : k_o * 4 + 4], [4], dtype="uint8", offset_factor=1) + B = T.match_buffer(packedW[j_o, k_o, 0 : 32, 0 : 4], [32, 4], dtype="uint8", offset_factor=1) + C = T.match_buffer(compute[i, j_o * 32 : j_o * 32 + 32], [32], dtype="int32", offset_factor=1) + A_u8x4: T.uint8x4 = A[0:4] + A_i32: T.int32 = T.reinterpret(A_u8x4, dtype="int32") + B_i32x32: T.int32x32 = T.reinterpret(B[0, 0:128], dtype="int32x32") + C[0:32] = T.call_llvm_pure_intrin(4390, T.uint32(3), C[0:32], B_i32x32, A_i32, dtype="int32x32") + + @tvm.testing.requires_hexagon def test_vrmpy_dense_auto_tensorize(hexagon_launcher): if hexagon_launcher._serial_number == "simulator": @@ -265,237 +302,112 @@ def test_vrmpy_dense_auto_tensorize(hexagon_launcher): postproc.RewriteTensorize(vectorize_init_loop=True), ] - # with tempfile.TemporaryDirectory() as work_dir: - work_dir = "work" - config = ms.TuneConfig( - strategy="replay_trace", - num_trials_per_iter=32, - max_trials_per_task=128, - max_trials_global=128, - ) - - sch = ms.tune_tir( - mod=workload, - target=target, - config=config, - work_dir=work_dir, - sch_rules=lambda: sch_rules, - postprocs=lambda: postprocs, - builder=get_hexagon_local_builder(), - runner=get_hexagon_rpc_runner(hexagon_launcher, number=10), - ) + if True: + with tempfile.TemporaryDirectory() as work_dir: + config = ms.TuneConfig( + strategy="replay_trace", + num_trials_per_iter=8, + max_trials_per_task=8, + max_trials_global=8, + ) - print(sch.mod.script()) + sch = ms.tune_tir( + mod=workload, + target=target, + config=config, + work_dir=work_dir, + sch_rules=lambda: sch_rules, + postprocs=lambda: postprocs, + builder=get_hexagon_local_builder(), + runner=get_hexagon_rpc_runner(hexagon_launcher, number=10), + ) + else: + sch = tvm.tir.Schedule(Module_vrmpy_auto_tensorize, debug_mask="all") with hexagon_launcher.start_session() as session: verify_dense(sch, target, M, N, K, session) @tvm.testing.requires_hexagon -def test_conv2d_nhwc_auto_schedule(hexagon_launcher): +def test_conv2d_relay_auto_schedule(hexagon_launcher): if hexagon_launcher._serial_number == "simulator": pytest.skip(msg="Tuning on simulator not supported.") - target_hexagon = tvm.target.hexagon("v69", num_cores=4) + target_hexagon = tvm.target.hexagon("v69") target = tvm.target.Target(target_hexagon, host=target_hexagon) - I = 64 - O = 64 - H = 56 - W = 56 - kH = 3 - kW = 3 - - dtype = "float16" + I, O, H, W = 64, 64, 56, 56 + kH = kW = 3 strides = (1, 1) padding = (1, 1) - data, kernel, out = te_workload.conv2d_nhwc( - 1, H, W, I, O, 3, 1, 1, 1, in_dtype="float16", out_dtype="float16" - ) - workload = te.create_prim_func([data, kernel, out]) - # workload = te.create_prim_func(te_workload.matmul(128, 768, 768)) - - # with tempfile.TemporaryDirectory() as work_dir: - work_dir = "work" - config = ms.TuneConfig( - # strategy="replay_trace", - strategy="evolutionary", - num_trials_per_iter=32, - max_trials_per_task=32, - max_trials_global=32, - ) - - sch = ms.tune_tir( - mod=workload, - target=target, - config=config, - work_dir=work_dir, - builder=get_hexagon_local_builder(), - runner=get_hexagon_rpc_runner(hexagon_launcher, number=10), + d_shape = (1, H, W, I) + w_shape = (kH, kW, I, O) + bias_shape = (1, 1, 1, w_shape[3]) + out_channel = w_shape[3] + + data = relay.var("data", shape=d_shape, dtype="float16") + weight = relay.var("weight", shape=w_shape, dtype="float16") + bias = relay.var("bias", shape=bias_shape, dtype="float16") + conv2d = relay.nn.conv2d( + data=data, + weight=weight, + kernel_size=(kH, kW), + channels=out_channel, + padding=padding, + strides=strides, + out_dtype="float16", + data_layout="NHWC", + kernel_layout="HWIO", ) - print(sch.trace) + mod = tvm.IRModule.from_expr(conv2d + bias) - import time + data_np = np.random.randn(*d_shape).astype("float16") + weight_np = np.random.randn(*w_shape).astype("float16") + bias_np = np.random.randn(*bias_shape).astype("float16") + params = {"weight": weight_np, "bias": bias_np} - t1 = time.time() - f = tvm.build(sch.mod["main"], [data, kernel, out], target) - t2 = time.time() + target_llvm = tvm.target.Target("llvm") - # print("compiled in", t2 - t1) - # return + with tvm.transform.PassContext( + opt_level=3, + ): + lib_ref = relay.build(mod, target=target_llvm, params=params) - with hexagon_launcher.start_session() as session: - # print("session acquired") - module = session.load_module(f) - dev = session.device - - a_np = np.random.randn(1, I, H, W).astype("float16") - w_np = np.random.randn(O, I, kH, kW).astype("float16") - c_np = tvm.topi.testing.conv2d_nchw_python( - a_np.astype("float32"), w_np.astype("float32"), strides, padding - ) + rt_mod_ref = tvm.contrib.graph_executor.GraphModule(lib_ref["default"](tvm.cpu(0))) - data_np = np.zeros(get_const_tuple(data.shape)).astype(dtype) - w_np_hwio = np.zeros(get_const_tuple(kernel.shape)).astype(dtype) + rt_mod_ref.set_input("data", data_np) - for i in range(I): - for h in range(H): - for w in range(W): - data_np[0, h, w, i] = a_np[0, i, h, w] + rt_mod_ref.run() - for o in range(O): - for i in range(I): - for h in range(kH): - for w in range(kW): - w_np_hwio[h, w, i, o] = w_np[o, i, h, w] + ref = rt_mod_ref.get_output(0).numpy() - a = tvm.nd.array(data_np.astype(dtype), dev) - w = tvm.nd.array(w_np_hwio.astype(dtype), dev) - - c = tvm.nd.array(np.zeros(get_const_tuple(out.shape), dtype=out.dtype), dev) - - module(a, w, c) - - P, Q = c.shape[1:3] - evaluator = module.time_evaluator(module.entry_name, dev, number=20) - time_ms = evaluator(a, w, c).mean * 1e3 - gflops = (O * P * Q * I * kH * kW) * 2 / 1e9 - print("time elapsed: ", time_ms) - print("GFLOPS:", gflops / (time_ms / 1e3)) - - out_nhwc = c.numpy() - - out = np.zeros(c_np.shape).astype("float16") - - for o in range(O): - for h in range(P): - for w in range(Q): - out[0, o, h, w] = out_nhwc[0, h, w, o] - - print(np.max(np.abs(out - c_np)), np.mean(np.abs(out - c_np))) - - mx = np.max(np.abs(out - c_np)) - - indices = np.where(np.abs(out - c_np) == mx) - - print(out[indices], c_np[indices]) - - -# @tvm.testing.requires_hexagon -# def test_conv2d_relay_auto_schedule(hexagon_launcher): -# if hexagon_launcher._serial_number == "simulator": -# pytest.skip(msg="Tuning on simulator not supported.") - -# target_hexagon = tvm.target.hexagon("v69", num_cores=4) -# target = tvm.target.Target(target_hexagon, host=target_hexagon) -# I = 64 -# O = 64 -# H = 56 -# W = 56 -# kH = 3 -# kW = 3 - -# strides = (1, 1) -# padding = (1, 1) - -# d_shape = (1, H, W, I) -# w_shape = (kH, kW, I, O) -# bias_shape = (w_shape[3],) -# out_channel = w_shape[0] - -# data = relay.var("data", shape=d_shape, dtype="float16") -# weight = relay.var("weight", shape=w_shape, dtype="float16") -# bias = relay.var("bias", shape=bias_shape, dtype="float16") -# conv2d = relay.nn.conv2d( -# data=data, -# weight=weight, -# kernel_size=w_shape[2:], -# channels=out_channel, -# padding=padding, -# strides=strides, -# out_dtype="float16", -# data_layout="NHWC", -# weight_layout="HWIO", -# ) - -# bias_add = relay.nn.bias_add(conv2d, bias) - -# use_bias = True - -# if use_bias: -# out = bias_add -# else: -# out = conv2d - -# mod = tvm.IRModule.from_expr(out) - -# data_np = np.random.randn(*d_shape).astype("float16") -# weight_np = np.random.randn(*w_shape).astype("float16") -# bias_np = np.random.randn(w_shape[0]).astype("float16") -# params = {"weight": weight_np, "bias": bias_np} - -# target_llvm = tvm.target.Target("llvm") - -# with tvm.transform.PassContext( -# opt_level=3, -# ): -# lib_ref = relay.build(mod, target=target_llvm, params=params) - -# rt_mod_ref = tvm.contrib.graph_executor.GraphModule(lib_ref["default"](tvm.cpu(0))) - -# rt_mod_ref.set_input("data", data_np) - -# rt_mod_ref.run() - -# ref = rt_mod_ref.get_output(0).numpy() - -# config = ms.TuneConfig( -# strategy="evolutionary", -# num_trials_per_iter=32, -# max_trials_per_task=32, -# max_trials_global=32, -# ) + config = ms.TuneConfig( + strategy="replay_trace", + num_trials_per_iter=8, + max_trials_per_task=8, + max_trials_global=8, + ) -# with tempfile.TemporaryDirectory() as work_dir: -# executor = Executor("graph", {"link-params": True}) -# lib = ms.tune_relay( -# mod=mod, -# params=params, -# target=target, -# config=config, -# work_dir=work_dir, -# builder=get_hexagon_local_builder(), -# runner=get_hexagon_rpc_runner(hexagon_launcher, number=20), -# executor=executor, -# ) + with tempfile.TemporaryDirectory() as work_dir: + executor = Executor("graph", {"link-params": True}) + lib = ms.tune_relay( + mod=mod, + params=params, + target=target, + config=config, + work_dir=work_dir, + builder=get_hexagon_local_builder(), + runner=get_hexagon_rpc_runner(hexagon_launcher, number=20), + executor=executor, + ) -# with hexagon_launcher.start_session() as session: -# rt_mod = session.get_executor_from_factory(lib) + with hexagon_launcher.start_session() as session: + rt_mod = session.get_executor_from_factory(lib) -# rt_mod.set_input("data", data_np) + rt_mod.set_input("data", data_np) -# rt_mod.run() + rt_mod.run() -# out = rt_mod.get_output(0).numpy() -# print(np.max(np.abs(ref - out)), np.mean(np.abs(ref - out))) + out = rt_mod.get_output(0).numpy() + print(np.max(np.abs(ref - out)), np.mean(np.abs(ref - out))) From 21dbad9b845de2fcd75109e124942d1accce7c79 Mon Sep 17 00:00:00 2001 From: Masahiro Masuda Date: Wed, 21 Sep 2022 10:58:00 +0900 Subject: [PATCH 4/4] black --- python/tvm/meta_schedule/default_config.py | 2 +- .../test_hexagon/test_meta_schedule.py | 48 +++++++++++-------- 2 files changed, 28 insertions(+), 22 deletions(-) diff --git a/python/tvm/meta_schedule/default_config.py b/python/tvm/meta_schedule/default_config.py index c2ad8709c247..c701fd6568e0 100644 --- a/python/tvm/meta_schedule/default_config.py +++ b/python/tvm/meta_schedule/default_config.py @@ -326,7 +326,7 @@ def postprocs() -> List[Postproc]: M.RewriteParallelVectorizeUnroll(), M.RewriteReductionBlock(), # TODO(masahi): Fix RewriteLayout for link-params=True case - M.RewriteLayout(), + # M.RewriteLayout(), ] diff --git a/tests/python/contrib/test_hexagon/test_meta_schedule.py b/tests/python/contrib/test_hexagon/test_meta_schedule.py index f16bcda12dba..74f3ab673ec8 100644 --- a/tests/python/contrib/test_hexagon/test_meta_schedule.py +++ b/tests/python/contrib/test_hexagon/test_meta_schedule.py @@ -215,15 +215,20 @@ def schedule_dense_for_tune(sch): verify_dense(sch, target, M, N, K, session) +# This is an example of a schedule found by vrmpy auto tensorization. +# It gets 440 GFLOPS on SD888. @tvm.script.ir_module class Module_vrmpy_auto_tensorize: @T.prim_func - def main(X: T.Buffer[(128, 768), "uint8"], packedW: T.Buffer[(24, 192, 32, 4), "uint8"], compute: T.Buffer[(128, 768), "int32"]) -> None: - # function attr dict + def main( + X: T.Buffer[(128, 768), "uint8"], + packedW: T.Buffer[(24, 192, 32, 4), "uint8"], + compute: T.Buffer[(128, 768), "int32"], + ) -> None: T.func_attr({"global_symbol": "main", "tir.noalias": True}) - # body - # with T.block("root") - for i0_0_i1_0_0_fused in T.parallel(512, annotations={"pragma_auto_unroll_max_step":64, "pragma_unroll_explicit":1}): + for i0_0_i1_0_0_fused in T.parallel( + 512, annotations={"pragma_auto_unroll_max_step": 64, "pragma_unroll_explicit": 1} + ): for i0_1_init, i1_0_1_init, i0_2_init, i1_0_2_init in T.grid(2, 3, 1, 1): with T.block("compute_o_init"): i = T.axis.spatial(128, i0_0_i1_0_0_fused // 8 * 2 + i0_1_init + i0_2_init) @@ -241,15 +246,27 @@ def main(X: T.Buffer[(128, 768), "uint8"], packedW: T.Buffer[(24, 192, 32, 4), " i = T.axis.spatial(128, i0_0_i1_0_0_fused // 8 * 2 + i0_1 + i0_2) j_o = T.axis.spatial(24, i1_0_2 + i0_0_i1_0_0_fused % 8 * 3 + i1_0_1) k_o = T.axis.reduce(192, i2_0_0 * 6 + i2_0_1) - T.reads(compute[i, j_o * 32 : j_o * 32 + 32], X[i, k_o * 4 : k_o * 4 + 4], packedW[j_o, k_o, 0 : 32, 0 : 4]) + T.reads( + compute[i, j_o * 32 : j_o * 32 + 32], + X[i, k_o * 4 : k_o * 4 + 4], + packedW[j_o, k_o, 0:32, 0:4], + ) T.writes(compute[i, j_o * 32 : j_o * 32 + 32]) - A = T.match_buffer(X[i, k_o * 4 : k_o * 4 + 4], [4], dtype="uint8", offset_factor=1) - B = T.match_buffer(packedW[j_o, k_o, 0 : 32, 0 : 4], [32, 4], dtype="uint8", offset_factor=1) - C = T.match_buffer(compute[i, j_o * 32 : j_o * 32 + 32], [32], dtype="int32", offset_factor=1) + A = T.match_buffer( + X[i, k_o * 4 : k_o * 4 + 4], [4], dtype="uint8", offset_factor=1 + ) + B = T.match_buffer( + packedW[j_o, k_o, 0:32, 0:4], [32, 4], dtype="uint8", offset_factor=1 + ) + C = T.match_buffer( + compute[i, j_o * 32 : j_o * 32 + 32], [32], dtype="int32", offset_factor=1 + ) A_u8x4: T.uint8x4 = A[0:4] A_i32: T.int32 = T.reinterpret(A_u8x4, dtype="int32") B_i32x32: T.int32x32 = T.reinterpret(B[0, 0:128], dtype="int32x32") - C[0:32] = T.call_llvm_pure_intrin(4390, T.uint32(3), C[0:32], B_i32x32, A_i32, dtype="int32x32") + C[0:32] = T.call_llvm_pure_intrin( + 4390, T.uint32(3), C[0:32], B_i32x32, A_i32, dtype="int32x32" + ) @tvm.testing.requires_hexagon @@ -264,16 +281,6 @@ def test_vrmpy_dense_auto_tensorize(hexagon_launcher): workload = te.create_prim_func(dense(M, N, K)) sch_rules = [ - schedule_rule.AutoInline( - into_producer=False, - into_consumer=True, - inline_const_tensor=True, - disallow_if_then_else=True, - require_injective=True, - require_ordered=True, - disallow_op=["tir.exp"], - ), - schedule_rule.AddRFactor(max_jobs_per_core=16, max_innermost_factor=64), schedule_rule.MultiLevelTilingWithIntrin( VRMPY_u8u8i32_INTRIN, structure="SRSRS", @@ -296,7 +303,6 @@ def test_vrmpy_dense_auto_tensorize(hexagon_launcher): ] postprocs = [ - postproc.DisallowDynamicLoop(), postproc.RewriteParallelVectorizeUnroll(), postproc.RewriteReductionBlock(), postproc.RewriteTensorize(vectorize_init_loop=True),