diff --git a/cmake/modules/contrib/BLAS.cmake b/cmake/modules/contrib/BLAS.cmake index 06c8755882d5..f31218088a9e 100644 --- a/cmake/modules/contrib/BLAS.cmake +++ b/cmake/modules/contrib/BLAS.cmake @@ -72,6 +72,7 @@ if(IS_DIRECTORY ${USE_MKLDNN}) include_directories(SYSTEM ${USE_MKLDNN}/include) list(APPEND TVM_RUNTIME_LINKER_LIBS ${MKLDNN_LIBRARY}) list(APPEND RUNTIME_SRCS src/runtime/contrib/cblas/mkldnn.cc) + list(APPEND RUNTIME_SRCS src/runtime/contrib/dnnl/dnnl.cc) add_definitions(-DUSE_DNNL=1) message(STATUS "Use MKLDNN library " ${MKLDNN_LIBRARY}) endif() @@ -84,6 +85,7 @@ elseif(USE_MKLDNN STREQUAL "ON") add_definitions(-DUSE_DNNL=1) message(STATUS "Use MKLDNN library " ${MKLDNN_LIBRARY}) list(APPEND RUNTIME_SRCS src/runtime/contrib/cblas/mkldnn.cc) + list(APPEND RUNTIME_SRCS src/runtime/contrib/dnnl/dnnl.cc) endif() elseif(USE_MKLDNN STREQUAL "OFF") # pass diff --git a/python/tvm/contrib/mkldnn.py b/python/tvm/contrib/mkldnn.py index 8d5f4da0345b..a60a35f0ad04 100644 --- a/python/tvm/contrib/mkldnn.py +++ b/python/tvm/contrib/mkldnn.py @@ -17,6 +17,7 @@ """External function interface to BLAS libraries.""" import tvm from tvm import te +from ..topi.nn.utils import get_pad_tuple def matmul(lhs, rhs, transa=False, transb=False, **kwargs): @@ -50,3 +51,107 @@ def matmul(lhs, rhs, transa=False, transb=False, **kwargs): name="C", **kwargs, ) + + +def dnnl_conv2d( + src, + weights, + stride, + padding, + dilation, + groups, + channel_last=False, + out_dtype="float32", + **kwargs, +): + """Convolution operator in NCHW layout. + + Parameters + ---------- + src : tvm.te.Tensor + 4-D with shape [batch, in_channel, in_height, in_width] + + weights : tvm.te.Tensor + 4-D with shape [num_filter, in_channel, filter_height, filter_width] + + stride : int or a list/tuple of two ints + Stride size, or [stride_height, stride_width] + + padding : int or a list/tuple of 2 or 4 ints + padding size, or + [pad_height, pad_width] for 2 ints, or + [pad_top, pad_left, pad_bottom, pad_right] for 4 ints + + dilation: int or a list/tuple of two ints + dilation size, or [dilation_height, dilation_width] + + groups: str + input data layout: NCHW or NHWC + + channel_last: bool + chose if input/output data format is in channel_last format(NHWC) or + in plain format(NCHW) + + out_dtype: str + output datatype: now only support float32 + + Returns + ------- + Output : tvm.te.Tensor + 4-D with shape [batch, out_channel, out_height, out_width] + """ + + assert isinstance(stride, int) or len(stride) == 2 + assert isinstance(dilation, int) or len(dilation) == 2 + if isinstance(stride, int): + stride_h = stride_w = stride + else: + stride_h, stride_w = stride + + if isinstance(dilation, int): + dilation_h = dilation_w = dilation + else: + dilation_h, dilation_w = dilation + + if channel_last: + batch, in_height, in_width, _ = src.shape + kernel_h, kernel_w, _, num_filter = weights.shape + else: + batch, _, in_height, in_width = src.shape + num_filter, _, kernel_h, kernel_w = weights.shape + + dilated_kernel_h = (kernel_h - 1) * dilation_h + 1 + dilated_kernel_w = (kernel_w - 1) * dilation_w + 1 + pad_top, pad_left, pad_down, pad_right = get_pad_tuple( + padding, (dilated_kernel_h, dilated_kernel_w) + ) + out_channel = num_filter + out_height = (in_height - dilated_kernel_h + pad_top + pad_down) // stride_h + 1 + out_width = (in_width - dilated_kernel_w + pad_left + pad_right) // stride_w + 1 + + if channel_last: + out_shape = (batch, out_height, out_width, out_channel) + else: + out_shape = (batch, out_channel, out_height, out_width) + + return te.extern( + out_shape, + [src, weights], + lambda ins, outs: tvm.tir.call_packed( + "tvm.contrib.mkldnn.conv2d", + ins[0], + ins[1], + outs[0], + pad_top, + pad_down, + pad_left, + pad_right, + stride[0], + stride[1], + groups, + channel_last, + ), + name="C", + dtype=out_dtype, + **kwargs, + ) diff --git a/python/tvm/relay/op/strategy/x86.py b/python/tvm/relay/op/strategy/x86.py index 59a57fd233f5..12ef048b48cd 100644 --- a/python/tvm/relay/op/strategy/x86.py +++ b/python/tvm/relay/op/strategy/x86.py @@ -120,6 +120,12 @@ def conv2d_strategy_cpu(attrs, inputs, out_type, target): wrap_topi_schedule(topi.x86.schedule_conv2d_nchw_int8), name="conv2d_nchw_int8.x86", ) + elif "mkldnn" in target.libs: + strategy.add_implementation( + wrap_compute_conv2d(topi.x86.conv2d_nchw_mkldnn), + wrap_topi_schedule(topi.x86.schedule_conv2d_nchw_mkldnn), + name="conv2d_nchw_mkldnn.x86", + ) else: strategy.add_implementation( wrap_compute_conv2d(topi.x86.conv2d_nchw), @@ -133,11 +139,18 @@ def conv2d_strategy_cpu(attrs, inputs, out_type, target): assert kernel_layout == "HWIO" if not is_auto_scheduler_enabled(): logger.warning("conv2d NHWC layout is not optimized for x86 with autotvm.") - strategy.add_implementation( - wrap_compute_conv2d(topi.nn.conv2d_nhwc, need_auto_scheduler_layout=True), - wrap_topi_schedule(topi.x86.schedule_conv2d_nhwc), - name="conv2d_nhwc.x86", - ) + if "mkldnn" in target.libs: + strategy.add_implementation( + wrap_compute_conv2d(topi.x86.conv2d_nhwc_mkldnn), + wrap_topi_schedule(topi.x86.schedule_conv2d_nhwc_mkldnn), + name="conv2d_nhwc_mkldnn.x86", + ) + else: + strategy.add_implementation( + wrap_compute_conv2d(topi.nn.conv2d_nhwc, need_auto_scheduler_layout=True), + wrap_topi_schedule(topi.x86.schedule_conv2d_nhwc), + name="conv2d_nhwc.x86", + ) judge_winograd_auto_scheduler = False if len(kernel.shape) == 4: diff --git a/python/tvm/topi/x86/conv2d.py b/python/tvm/topi/x86/conv2d.py index 182454acf3a6..a28c75b81d3f 100644 --- a/python/tvm/topi/x86/conv2d.py +++ b/python/tvm/topi/x86/conv2d.py @@ -23,7 +23,9 @@ import tvm from tvm import te from tvm import autotvm +from tvm.contrib import mkldnn from .. import nn +from ..generic import schedule_extern from ..nn.conv2d import conv2d_infer_layout, _get_workload as _get_conv2d_workload from ..nn.conv2d import unpack_NCHWc_to_nchw from ..nn.depthwise_conv2d import _get_workload as _get_depthwise_conv2d_workload @@ -267,6 +269,34 @@ def _callback(op): return s +@autotvm.register_topi_compute("conv2d_nchw_mkldnn.x86") +def conv2d_nchw_mkldnn(cfg, data, kernel, strides, padding, dilation, out_dtype): + """Compute conv2d in NCHW format using mkldnn.""" + groups = 1 + _out = mkldnn.dnnl_conv2d(data, kernel, strides, padding, dilation, groups, False, out_dtype) + return _out + + +@autotvm.register_topi_schedule("conv2d_nchw_mkldnn.x86") +def schedule_conv2d_nchw_mkldnn(_, outs): + """Create schedule for conv2d_nchw_mkldnn""" + return schedule_extern(outs) + + +@autotvm.register_topi_compute("conv2d_nhwc_mkldnn.x86") +def conv2d_nhwc_mkldnn(cfg, data, kernel, strides, padding, dilation, out_dtype): + """Compute conv2d in NHWC format using mkldnn.""" + groups = 1 + _out = mkldnn.dnnl_conv2d(data, kernel, strides, padding, dilation, groups, True, out_dtype) + return _out + + +@autotvm.register_topi_schedule("conv2d_nhwc_mkldnn.x86") +def schedule_conv2d_nhwc_mkldnn(_, outs): + """Create schedule for conv2d_nhwc_mkldnn""" + return schedule_extern(outs) + + # FIXME - https://github.com/apache/tvm/issues/4122 # _declaration_conv_nhwc_pack expects kernel layout to be HWOI. However, the tests use HWIO # layout. Commenting until we have clarity about the nhwc_pack implementation from the author. diff --git a/src/runtime/contrib/dnnl/dnnl.cc b/src/runtime/contrib/dnnl/dnnl.cc index d1190df91375..7d3763d411ba 100644 --- a/src/runtime/contrib/dnnl/dnnl.cc +++ b/src/runtime/contrib/dnnl/dnnl.cc @@ -80,8 +80,8 @@ inline void read_from_dnnl_memory(void* handle, const memory& mem) { void dnnl_conv2d_common(float* data, float* weights, float* bias, float* out, int p_N_, int p_C_, int p_H_, int p_W_, int p_O_, int p_G_, int p_Ph0_, int p_Pw0_, int p_Ph1_, - int p_Pw1_, int p_Kh_, int p_Kw_, int p_Sh_, int p_Sw_, - primitive_attr attr) { + int p_Pw1_, int p_Kh_, int p_Kw_, int p_Sh_, int p_Sw_, primitive_attr attr, + bool channel_last) { using tag = memory::format_tag; using dt = memory::data_type; engine eng(engine::kind::cpu, 0); @@ -97,32 +97,62 @@ void dnnl_conv2d_common(float* data, float* weights, float* bias, float* out, in memory::dims conv2d_padding0 = {p_Ph0_, p_Pw0_}; memory::dims conv2d_padding1 = {p_Ph1_, p_Pw1_}; - auto user_src_memory = memory({{conv2d_src_tz}, dt::f32, tag::nchw}, eng, data); + auto user_src_memory = + memory({{conv2d_src_tz}, dt::f32, channel_last ? tag::nhwc : tag::nchw}, eng, data); auto user_weights_memory = - memory({{conv2d_weights_tz}, dt::f32, (p_G_ > 1) ? tag::goihw : tag::oihw}, eng, weights); + memory({{conv2d_weights_tz}, dt::f32, channel_last ? tag::hwio : tag::oihw}, eng, weights); + if (p_G_ > 1) + user_weights_memory = memory( + {{conv2d_weights_tz}, dt::f32, channel_last ? tag::ghwio : tag::goihw}, eng, weights); auto conv2d_user_bias_memory = memory({{conv2d_bias_tz}, dt::f32, tag::x}, eng, bias); + auto user_dst_memory = + memory({{conv2d_dst_tz}, dt::f32, channel_last ? tag::nhwc : tag::nchw}, eng, out); auto conv2d_src_md = memory::desc({conv2d_src_tz}, dt::f32, tag::any); auto conv2d_bias_md = memory::desc({conv2d_bias_tz}, dt::f32, tag::any); auto conv2d_weights_md = memory::desc({conv2d_weights_tz}, dt::f32, tag::any); - auto conv2d_dst_md = memory::desc({conv2d_dst_tz}, dt::f32, tag::nchw); + auto conv2d_dst_md = memory::desc({conv2d_dst_tz}, dt::f32, tag::any); auto conv2d_desc = convolution_forward::desc( prop_kind::forward_inference, algorithm::convolution_direct, conv2d_src_md, conv2d_weights_md, conv2d_bias_md, conv2d_dst_md, conv2d_strides, conv2d_padding0, conv2d_padding1); auto conv2d_prim_desc = convolution_forward::primitive_desc(conv2d_desc, attr, eng); + // reorder if src layout not DNNL chosen. auto conv2d_src_memory = user_src_memory; + if (conv2d_prim_desc.src_desc() != user_src_memory.get_desc()) { + conv2d_src_memory = memory(conv2d_prim_desc.src_desc(), eng); + auto reorder_src = reorder(user_src_memory, conv2d_src_memory); + reorder_src.execute(s, {{DNNL_ARG_FROM, user_src_memory}, {DNNL_ARG_TO, conv2d_src_memory}}); + } + + // reorder if weights layout not DNNL chosen. auto conv2d_weights_memory = user_weights_memory; - auto conv2d_dst_memory = memory(conv2d_prim_desc.dst_desc(), eng); + if (conv2d_prim_desc.weights_desc() != user_weights_memory.get_desc()) { + conv2d_weights_memory = memory(conv2d_prim_desc.weights_desc(), eng); + auto reorder_weights = reorder(user_weights_memory, conv2d_weights_memory); + reorder_weights.execute( + s, {{DNNL_ARG_FROM, user_weights_memory}, {DNNL_ARG_TO, conv2d_weights_memory}}); + } + + auto conv2d_dst_memory = user_dst_memory; + if (conv2d_prim_desc.dst_desc() != user_dst_memory.get_desc()) { + conv2d_dst_memory = memory(conv2d_prim_desc.dst_desc(), eng); + } auto conv = convolution_forward(conv2d_prim_desc); conv.execute(s, {{DNNL_ARG_SRC, conv2d_src_memory}, {DNNL_ARG_WEIGHTS, conv2d_weights_memory}, {DNNL_ARG_BIAS, conv2d_user_bias_memory}, {DNNL_ARG_DST, conv2d_dst_memory}}); + + // reorder if dst layout not DNNL chosen. + if (conv2d_prim_desc.dst_desc() != user_dst_memory.get_desc()) { + reorder(conv2d_dst_memory, user_dst_memory) + .execute(s, {{DNNL_ARG_FROM, conv2d_dst_memory}, {DNNL_ARG_TO, user_dst_memory}}); + } + s.wait(); - read_from_dnnl_memory(out, conv2d_dst_memory); } extern "C" void dnnl_conv2d(float* data, float* weights, float* out, int p_N_, int p_C_, int p_H_, @@ -131,7 +161,8 @@ extern "C" void dnnl_conv2d(float* data, float* weights, float* out, int p_N_, i primitive_attr attr; std::vector bias(p_O_, 0); return dnnl_conv2d_common(data, weights, bias.data(), out, p_N_, p_C_, p_H_, p_W_, p_O_, p_G_, - p_Ph0_, p_Pw0_, p_Ph1_, p_Pw1_, p_Kh_, p_Kw_, p_Sh_, p_Sw_, attr); + p_Ph0_, p_Pw0_, p_Ph1_, p_Pw1_, p_Kh_, p_Kw_, p_Sh_, p_Sw_, attr, + false); } primitive_attr create_attr_with_relu_post_op() { @@ -151,7 +182,7 @@ extern "C" void dnnl_fused_conv2d_relu(float* data, float* weights, float* out, std::vector bias(p_O_, 0); return dnnl_conv2d_common(data, weights, bias.data(), out, p_N_, p_C_, p_H_, p_W_, p_O_, p_G_, p_Ph0_, p_Pw0_, p_Ph1_, p_Pw1_, p_Kh_, p_Kw_, p_Sh_, p_Sw_, - create_attr_with_relu_post_op()); + create_attr_with_relu_post_op(), false); } extern "C" void dnnl_fused_conv2d_bias_relu(float* data, float* weights, float* bias, float* out, @@ -161,7 +192,7 @@ extern "C" void dnnl_fused_conv2d_bias_relu(float* data, float* weights, float* int p_Sw_) { return dnnl_conv2d_common(data, weights, bias, out, p_N_, p_C_, p_H_, p_W_, p_O_, p_G_, p_Ph0_, p_Pw0_, p_Ph1_, p_Pw1_, p_Kh_, p_Kw_, p_Sh_, p_Sw_, - create_attr_with_relu_post_op()); + create_attr_with_relu_post_op(), false); } extern "C" void dnnl_dense(float* data, float* weight, float* out, int p_B_, int p_I_, int p_O_) { @@ -306,6 +337,37 @@ extern "C" void dnnl_binary_op(float* data, float* weight, float* out, int algo_ read_from_dnnl_memory(out, dst_memory); } +// DNNL Conv2d single OP +TVM_REGISTER_GLOBAL("tvm.contrib.mkldnn.conv2d").set_body([](TVMArgs args, TVMRetValue* ret) { + DLTensor* input = args[0]; + DLTensor* weights = args[1]; + DLTensor* output = args[2]; + int p_Ph0_ = args[3], p_Pw0_ = args[4], p_Ph1_ = args[5], p_Pw1_ = args[6], p_Sh_ = args[7], + p_Sw_ = args[8], p_G_ = args[9]; + bool channel_last = args[10]; + + int p_N_ = input->shape[0], p_C_ = input->shape[1], p_H_ = input->shape[2], + p_W_ = input->shape[3], p_O_ = output->shape[1], p_Kh_ = weights->shape[2], + p_Kw_ = weights->shape[3]; + + if (channel_last) { + p_N_ = input->shape[0]; + p_H_ = input->shape[1]; + p_W_ = input->shape[2]; + p_C_ = input->shape[3]; + p_O_ = output->shape[3]; + p_Kh_ = weights->shape[0]; + p_Kw_ = weights->shape[1]; + } + + std::vector bias(p_O_, 0); + primitive_attr attr; + return dnnl_conv2d_common(static_cast(input->data), static_cast(weights->data), + bias.data(), static_cast(output->data), p_N_, p_C_, p_H_, p_W_, + p_O_, p_G_, p_Ph0_, p_Pw0_, p_Ph1_, p_Pw1_, p_Kh_, p_Kw_, p_Sh_, p_Sw_, + attr, channel_last); +}); + } // namespace contrib } // namespace runtime } // namespace tvm diff --git a/src/runtime/contrib/dnnl/dnnl_kernel.h b/src/runtime/contrib/dnnl/dnnl_kernel.h index 522313ae5a64..04e06d9c9e94 100644 --- a/src/runtime/contrib/dnnl/dnnl_kernel.h +++ b/src/runtime/contrib/dnnl/dnnl_kernel.h @@ -27,6 +27,7 @@ #include #include +#include #include diff --git a/tests/python/relay/test_op_level2.py b/tests/python/relay/test_op_level2.py index b4f30c2eab27..db1eb16b8ca3 100644 --- a/tests/python/relay/test_op_level2.py +++ b/tests/python/relay/test_op_level2.py @@ -1996,5 +1996,103 @@ def test_conv2d_rocm_sdot4(): np.testing.assert_equal(out, ref) +@tvm.testing.requires_x86 +def test_conv2d_nchw_mkldnn(): + if not tvm.get_global_func("tvm.contrib.mkldnn.conv2d", allow_missing=True): + print( + "skip because extern mkldnn function is not available, \ + built with MKLDNN=ON" + ) + return + d_shape = (1, 64, 56, 56) + w_shape = (64, 64, 3, 3) + padding = (1, 1) + strides = (1, 1) + + data = relay.var("data", shape=d_shape, dtype="float32") + weight = relay.var("weight", shape=w_shape, dtype="float32") + out_channel = w_shape[0] + conv2d = relay.nn.conv2d( + data=data, + weight=weight, + kernel_size=w_shape[2:], + channels=out_channel, + padding=padding, + strides=strides, + out_dtype="float32", + ) + + mod = tvm.IRModule.from_expr(conv2d) + + data_np = np.random.uniform(1, 10, d_shape).astype("float32") + weight_np = np.random.uniform(1, 10, size=w_shape).astype("float32") + + target = "llvm -mcpu=skylake-avx512 -libs=mkldnn" + with tvm.transform.PassContext(opt_level=3): + lib = relay.build(mod, target=target, params={"weight": weight_np}) + + dev = tvm.device(target, 0) + runtime = tvm.contrib.graph_executor.GraphModule(lib["default"](dev)) + + runtime.set_input("data", data_np) + runtime.run() + + out = runtime.get_output(0).numpy() + + ref = tvm.topi.testing.conv2d_nchw_python(data_np, weight_np, strides, padding) + + np.testing.assert_allclose(out, ref, rtol=1e-5, atol=1e-5) + + +@tvm.testing.requires_x86 +def test_conv2d_nhwc_mkldnn(): + if not tvm.get_global_func("tvm.contrib.mkldnn.conv2d", allow_missing=True): + print( + "skip because extern mkldnn function is not available, \ + built with MKLDNN=ON" + ) + return + d_shape = (1, 56, 56, 64) + w_shape = (3, 3, 64, 64) + padding = (1, 1) + strides = (1, 1) + + data = relay.var("data", shape=d_shape, dtype="float32") + weight = relay.var("weight", shape=w_shape, dtype="float32") + out_channel = w_shape[3] + conv2d = relay.nn.conv2d( + data=data, + weight=weight, + kernel_size=w_shape[:2], + channels=out_channel, + padding=padding, + strides=strides, + out_dtype="float32", + data_layout="NHWC", + kernel_layout="HWIO", + ) + + mod = tvm.IRModule.from_expr(conv2d) + + data_np = np.random.uniform(1, 10, d_shape).astype("float32") + weight_np = np.random.uniform(1, 10, size=w_shape).astype("float32") + + target = "llvm -mcpu=skylake-avx512 -libs=mkldnn" + with tvm.transform.PassContext(opt_level=3): + lib = relay.build(mod, target=target, params={"weight": weight_np}) + + dev = tvm.device(target, 0) + runtime = tvm.contrib.graph_executor.GraphModule(lib["default"](dev)) + + runtime.set_input("data", data_np) + runtime.run() + + out = runtime.get_output(0).numpy() + + ref = tvm.topi.testing.conv2d_nhwc_python(data_np, weight_np, strides, padding) + + np.testing.assert_allclose(out, ref, rtol=1e-5, atol=1e-5) + + if __name__ == "__main__": tvm.testing.main()