diff --git a/python/tvm/autotvm/task/topi_integration.py b/python/tvm/autotvm/task/topi_integration.py
index 32d8674640ed..307d44810c79 100644
--- a/python/tvm/autotvm/task/topi_integration.py
+++ b/python/tvm/autotvm/task/topi_integration.py
@@ -233,7 +233,10 @@ def wrapper(outs, *args, **kwargs):
             """wrapper function for topi schedule"""
             workload = get_workload(outs, task_name)
             if workload is None:
-                raise RuntimeError("Cannot find workload in attribute of this schedule")
+                raise RuntimeError(
+                    f"Cannot find TOPI workload {task_name}. "
+                    "Is it registered with `register_topi_compute`?"
+                )
             tgt = Target.current()
             cfg = DispatchContext.current.query(tgt, workload)
             return topi_schedule(cfg, outs, *args, **kwargs)
@@ -253,7 +256,7 @@ def traverse(tensors):
         for t in tensors:
             op = t.op
             wkl = traverse(op.input_tensors)
-            if wkl:
+            if wkl is not None:
                 return wkl
 
             if "workload" in op.attrs:
diff --git a/python/tvm/relay/frontend/onnx.py b/python/tvm/relay/frontend/onnx.py
index 4ba9c07e4f80..b8bbcf89df60 100644
--- a/python/tvm/relay/frontend/onnx.py
+++ b/python/tvm/relay/frontend/onnx.py
@@ -526,23 +526,6 @@ def _impl_v1(cls, inputs, attr, params):
                 raise tvm.error.OpAttributeInvalid(msg.format(attr["auto_pad"]))
             attr.pop("auto_pad")
 
-        # Check if the requested convolution is a group conv1d, if so convert it to conv2d.
-        # TODO(jwfromm) Remove once proper group_conv1d is supported.
-        group_conv1d = False
-        if dimension_picker("conv")(attr) == "conv1d" and attr.get("group") != 1:
-            group_conv1d = True
-            # Expand input from NCW to NCHW
-            data = _op.expand_dims(data, axis=2)
-            # Expand kernel from OIW to OIHW
-            kernel = _op.expand_dims(kernel, axis=2)
-            # Add new value to kernel_shape, strices, dilation, pads, if needed
-            attr["kernel_shape"] = [1] + list(attr["kernel_shape"])
-            if "strides" in attr:
-                attr["strides"] = [1] + list(attr["strides"])
-            if "dilations" in attr:
-                attr["dilations"] = [1] + list(attr["dilations"])
-            if "pads" in attr:
-                attr["pads"] = [0, attr["pads"][0], 0, attr["pads"][1]]
         attr["channels"] = kernel_shapes[0][0]
         out = AttrCvt(
             op_name=dimension_picker("conv"),
@@ -555,10 +538,6 @@ def _impl_v1(cls, inputs, attr, params):
             custom_check=dimension_constraint(),
         )([data, kernel], attr, params)
 
-        # If this was a group_conv1d, squish output back to NCW.
-        if group_conv1d:
-            out = _op.squeeze(out, axis=[2])
-
         use_bias = len(inputs) == 3
         if use_bias:
             out = _op.nn.bias_add(out, inputs[2])
diff --git a/python/tvm/relay/op/strategy/cuda.py b/python/tvm/relay/op/strategy/cuda.py
index 607b2a59045a..54c78d4d8cd9 100644
--- a/python/tvm/relay/op/strategy/cuda.py
+++ b/python/tvm/relay/op/strategy/cuda.py
@@ -689,20 +689,36 @@ def conv1d_strategy_cuda(attrs, inputs, out_type, target):
     if dilation[0] < 1:
         raise ValueError("dilation should be a positive value")
     strategy = _op.OpStrategy()
-    if layout == "NCW":
-        strategy.add_implementation(
-            wrap_compute_conv1d(topi.cuda.conv1d_ncw),
-            wrap_topi_schedule(topi.cuda.schedule_conv1d_ncw),
-            name="conv1d_ncw.cuda",
-        )
-    elif layout == "NWC":
-        strategy.add_implementation(
-            wrap_compute_conv1d(topi.cuda.conv1d_nwc),
-            wrap_topi_schedule(topi.cuda.schedule_conv1d_nwc),
-            name="conv1d_nwc.cuda",
-        )
+    if attrs.groups == 1:
+        if layout == "NCW":
+            strategy.add_implementation(
+                wrap_compute_conv1d(topi.cuda.conv1d_ncw),
+                wrap_topi_schedule(topi.cuda.schedule_conv1d_ncw),
+                name="conv1d_ncw.cuda",
+            )
+        elif layout == "NWC":
+            strategy.add_implementation(
+                wrap_compute_conv1d(topi.cuda.conv1d_nwc),
+                wrap_topi_schedule(topi.cuda.schedule_conv1d_nwc),
+                name="conv1d_nwc.cuda",
+            )
+        else:
+            raise ValueError("Unsupported conv1d layout {}".format(layout))
     else:
-        raise ValueError("Unsupported conv1d layout {}".format(layout))
+        if layout == "NCW":
+            strategy.add_implementation(
+                wrap_compute_group_conv1d(topi.cuda.group_conv1d_ncw),
+                wrap_topi_schedule(topi.cuda.schedule_group_conv1d_ncw),
+                name="group_conv1d_ncw.cuda",
+            )
+        elif layout == "NWC":
+            strategy.add_implementation(
+                wrap_compute_group_conv1d(topi.cuda.group_conv1d_nwc),
+                wrap_topi_schedule(topi.cuda.schedule_group_conv1d_nwc),
+                name="group_conv1d_nwc.cuda",
+            )
+        else:
+            raise ValueError("Unsupported conv1d layout {}".format(layout))
     return strategy
 
 
diff --git a/python/tvm/relay/op/strategy/generic.py b/python/tvm/relay/op/strategy/generic.py
index 461e755f5212..d7f0dda92c6d 100644
--- a/python/tvm/relay/op/strategy/generic.py
+++ b/python/tvm/relay/op/strategy/generic.py
@@ -637,6 +637,49 @@ def conv1d_strategy(attrs, inputs, out_type, target):
     return strategy
 
 
+def wrap_compute_group_conv1d(topi_compute):
+    """wrap conv1d topi compute"""
+
+    def _compute_group_conv1d(attrs, inputs, out_type):
+        """Compute definition of conv1d"""
+        strides = get_const_tuple(attrs.strides)
+        padding = get_const_tuple(attrs.padding)
+        dilation = get_const_tuple(attrs.dilation)
+        out_dtype = attrs.out_dtype
+        out_dtype = inputs[0].dtype if out_dtype in ("same", "") else out_dtype
+        return [
+            topi_compute(inputs[0], inputs[1], strides, padding, dilation, attrs.groups, out_dtype)
+        ]
+
+    return _compute_group_conv1d
+
+
+@override_native_generic_func("group_conv1d_strategy")
+def group_conv1d_strategy(attrs, inputs, out_type, target):
+    """group_conv1d generic strategy"""
+    logger.warning("group_conv1d is not optimized for this platform.")
+    layout = attrs.data_layout
+    dilation = get_const_tuple(attrs.dilation)
+    if dilation[0] < 1:
+        raise ValueError("dilation should be a positive value")
+    strategy = _op.OpStrategy()
+    if layout == "NCW":
+        strategy.add_implementation(
+            wrap_compute_conv1d(topi.nn.group_conv1d_ncw),
+            wrap_topi_schedule(topi.generic.schedule_group_conv1d_ncw),
+            name="group_conv1d_ncw.generic",
+        )
+    elif layout == "NWC":
+        strategy.add_implementation(
+            wrap_compute_conv1d(topi.nn.group_conv1d_nwc),
+            wrap_topi_schedule(topi.generic.schedule_group_conv1d_nwc),
+            name="group_conv1d_nwc.generic",
+        )
+    else:
+        raise ValueError("Unsupported conv1d layout {}".format(layout))
+    return strategy
+
+
 # conv1d_transpose
 def wrap_compute_conv1d_transpose(topi_compute):
     """wrap conv1d_transpose topi compute"""
diff --git a/python/tvm/relay/op/strategy/x86.py b/python/tvm/relay/op/strategy/x86.py
index a421b120fab4..20b674966131 100644
--- a/python/tvm/relay/op/strategy/x86.py
+++ b/python/tvm/relay/op/strategy/x86.py
@@ -360,24 +360,41 @@ def conv3d_strategy_cpu(attrs, inputs, out_type, target):
 def conv1d_strategy_cpu(attrs, inputs, out_type, target):
     """conv1d x86 strategy"""
     layout = attrs.data_layout
+    groups = attrs.groups
     dilation = get_const_tuple(attrs.dilation)
     if dilation[0] < 1:
         raise ValueError("dilation should be a positive value")
     strategy = _op.OpStrategy()
-    if layout == "NCW":
-        strategy.add_implementation(
-            wrap_compute_conv1d(topi.nn.conv1d_ncw),
-            wrap_topi_schedule(topi.x86.schedule_conv1d_ncw),
-            name="conv1d_ncw.x86",
-        )
-    elif layout == "NWC":
-        strategy.add_implementation(
-            wrap_compute_conv1d(topi.nn.conv1d_nwc),
-            wrap_topi_schedule(topi.x86.schedule_conv1d_nwc),
-            name="conv1d_nwc.x86",
-        )
+    if groups == 1:
+        if layout == "NCW":
+            strategy.add_implementation(
+                wrap_compute_conv1d(topi.nn.conv1d_ncw),
+                wrap_topi_schedule(topi.x86.schedule_conv1d_ncw),
+                name="conv1d_ncw.x86",
+            )
+        elif layout == "NWC":
+            strategy.add_implementation(
+                wrap_compute_conv1d(topi.nn.conv1d_nwc),
+                wrap_topi_schedule(topi.x86.schedule_conv1d_nwc),
+                name="conv1d_nwc.x86",
+            )
+        else:
+            raise ValueError("Unsupported conv1d layout {}".format(layout))
     else:
-        raise ValueError("Unsupported conv1d layout {}".format(layout))
+        if layout == "NCW":
+            strategy.add_implementation(
+                wrap_compute_group_conv1d(topi.nn.group_conv1d_ncw),
+                wrap_topi_schedule(topi.x86.schedule_group_conv1d_ncw),
+                name="group_conv1d_ncw.x86",
+            )
+        elif layout == "NWC":
+            strategy.add_implementation(
+                wrap_compute_group_conv1d(topi.nn.group_conv1d_nwc),
+                wrap_topi_schedule(topi.x86.schedule_group_conv1d_nwc),
+                name="group_conv1d_nwc.x86",
+            )
+        else:
+            raise ValueError("Unsupported conv1d layout {}".format(layout))
     return strategy
 
 
diff --git a/python/tvm/topi/cuda/conv1d.py b/python/tvm/topi/cuda/conv1d.py
index e50913d88df2..b2fc4ca02dc9 100644
--- a/python/tvm/topi/cuda/conv1d.py
+++ b/python/tvm/topi/cuda/conv1d.py
@@ -29,8 +29,7 @@ def conv1d_ncw(cfg, data, kernel, strides, padding, dilation, out_dtype="float32
     return nn.conv1d_ncw(data, kernel, strides, padding, dilation, out_dtype)
 
 
-@autotvm.register_topi_schedule("conv1d_ncw.cuda")
-def schedule_conv1d_ncw(cfg, outs):
+def _schedule_conv1d_ncw(cfg, outs):
     """TOPI schedule callback of conv1d ncw for cuda gpu
 
     Parameters
@@ -51,7 +50,7 @@ def schedule_conv1d_ncw(cfg, outs):
     s = te.create_schedule([x.op for x in outs])
 
     def _callback(op):
-        if op.tag == "conv1d_ncw":
+        if op.tag == "conv1d_ncw" or op.tag == "group_conv1d_ncw":
             pad_data = op.input_tensors[0]
             kernel = op.input_tensors[1]
             conv = op.output(0)
@@ -140,13 +139,27 @@ def _callback(op):
     return s
 
 
+@autotvm.register_topi_schedule("conv1d_ncw.cuda")
+def schedule_conv1d_ncw(cfg, outs):
+    return _schedule_conv1d_ncw(cfg, outs)
+
+
+@autotvm.register_topi_compute("group_conv1d_ncw.cuda")
+def group_conv1d_ncw(cfg, data, kernel, strides, padding, dilation, groups, out_dtype="float32"):
+    return nn.group_conv1d_ncw(data, kernel, strides, padding, dilation, groups, out_dtype)
+
+
+@autotvm.register_topi_schedule("group_conv1d_ncw.cuda")
+def schedule_group_conv1d_ncw(cfg, outs):
+    return _schedule_conv1d_ncw(cfg, outs)
+
+
 @autotvm.register_topi_compute("conv1d_nwc.cuda")
 def conv1d_nwc(cfg, data, kernel, strides, padding, dilation, out_dtype="float32"):
     return nn.conv1d_nwc(data, kernel, strides, padding, dilation, out_dtype)
 
 
-@autotvm.register_topi_schedule("conv1d_nwc.cuda")
-def schedule_conv1d_nwc(cfg, outs):
+def _schedule_conv1d_nwc(cfg, outs):
     """TOPI schedule callback of conv1d nwc for cuda gpu
 
     Parameters
@@ -167,7 +180,7 @@ def schedule_conv1d_nwc(cfg, outs):
     s = te.create_schedule([x.op for x in outs])
 
     def _callback(op):
-        if op.tag == "conv1d_nwc":
+        if op.tag == "conv1d_nwc" or op.tag == "group_conv1d_nwc":
             pad_data = op.input_tensors[0]
             kernel = op.input_tensors[1]
             conv = op.output(0)
@@ -254,3 +267,18 @@ def _callback(op):
     traverse_inline(s, outs[0].op, _callback)
 
     return s
+
+
+@autotvm.register_topi_schedule("conv1d_nwc.cuda")
+def schedule_conv1d_nwc(cfg, outs):
+    return _schedule_conv1d_nwc(cfg, outs)
+
+
+@autotvm.register_topi_compute("group_conv1d_nwc.cuda")
+def group_conv1d_nwc(cfg, data, kernel, strides, padding, dilation, groups, out_dtype="float32"):
+    return nn.group_conv1d_nwc(data, kernel, strides, padding, dilation, groups, out_dtype)
+
+
+@autotvm.register_topi_schedule("group_conv1d_nwc.cuda")
+def schedule_group_conv1d_nwc(cfg, outs):
+    return _schedule_conv1d_nwc(cfg, outs)
diff --git a/python/tvm/topi/generic/nn.py b/python/tvm/topi/generic/nn.py
index ba63c539133e..4226c6caf23c 100644
--- a/python/tvm/topi/generic/nn.py
+++ b/python/tvm/topi/generic/nn.py
@@ -54,6 +54,40 @@ def schedule_conv1d_nwc(outs):
     return _default_schedule(outs, False)
 
 
+def schedule_group_conv1d_ncw(outs):
+    """Schedule for group_conv1d_ncw
+
+    Parameters
+    ----------
+    outs: Array of Tensor
+          The computation graph description of group_conv1d_ncw
+          in the format of an array of tensors.
+
+    Returns
+    -------
+    sch: Schedule
+        The computation schedule for the op.
+    """
+    return _default_schedule(outs, False)
+
+
+def schedule_group_conv1d_nwc(outs):
+    """Schedule for group_conv1d_nwc
+
+    Parameters
+    ----------
+    outs: Array of Tensor
+          The computation graph description of group_conv1d_nwc
+          in the format of an array of tensors.
+
+    Returns
+    -------
+    sch: Schedule
+        The computation schedule for the op.
+    """
+    return _default_schedule(outs, False)
+
+
 def schedule_conv2d_hwcn(outs):
     """Schedule for conv2d_hwcn
 
diff --git a/python/tvm/topi/nn/conv1d.py b/python/tvm/topi/nn/conv1d.py
index 8fdf3f8918ee..0a1efa35655f 100644
--- a/python/tvm/topi/nn/conv1d.py
+++ b/python/tvm/topi/nn/conv1d.py
@@ -16,10 +16,7 @@
 # under the License.
 # pylint: disable=invalid-name, unused-variable, unused-argument
 """1D convolution operators."""
-from tvm import te
-from .pad import pad
-from ..utils import simplify
-from .utils import get_pad_tuple1d
+from .conv2d import conv
 
 
 def conv1d(data, kernel, strides=1, padding="VALID", dilation=1, layout="NCW", out_dtype=None):
@@ -50,30 +47,31 @@ def conv1d(data, kernel, strides=1, padding="VALID", dilation=1, layout="NCW", o
     out_dtype : str
         The output data type. If None then output is same type as input.
     """
-    if out_dtype is None:
-        out_dtype = data.dtype
-    if isinstance(strides, (tuple, list)):
-        strides = strides[0]
-    if isinstance(dilation, (tuple, list)):
-        dilation = dilation[0]
+    return conv(data, kernel, strides, padding, dilation, 1, layout, out_dtype)
 
-    if layout == "NCW":
-        return conv1d_ncw(data, kernel, strides, padding, dilation, out_dtype)
-    if layout == "NWC":
-        return conv1d_nwc(data, kernel, strides, padding, dilation, out_dtype)
-    raise ValueError("This layout is not yet supported: {}".format(layout))
+
+def conv1d_nwc(data, kernel, strides=1, padding="VALID", dilation=1, out_dtype=None):
+    """1D convolution in NWC layout. See :py:func:`conv` for details on parameters"""
+    return conv(data, kernel, strides, padding, dilation, 1, "NWC", out_dtype=out_dtype)
 
 
 def conv1d_ncw(data, kernel, strides=1, padding="VALID", dilation=1, out_dtype=None):
-    """1D convolution forward operator for NCW layout.
+    """1D convolution in NCW layout. See :py:func:`conv` for details on parameters"""
+    return conv(data, kernel, strides, padding, dilation, 1, "NCW", out_dtype=out_dtype)
+
+
+def group_conv1d_nwc(
+    data, kernel, strides=1, padding="VALID", dilation=1, groups=1, out_dtype=None
+):
+    """1D convolution forward operator for NWC layout.
 
     Parameters
     ----------
     data : tvm.te.Tensor
-        3-D with shape [batch, in_channel, in_width]
+        3-D with shape [batch, in_width, in_channel]
 
     kernel : tvm.te.Tensor
-        3-D with shape [num_filter, in_channel, filter_size]
+        3-D with shape [filter_size, in_channel, num_filter]
 
     strides : int or tuple
         The spatial stride along width
@@ -85,55 +83,27 @@ def conv1d_ncw(data, kernel, strides=1, padding="VALID", dilation=1, out_dtype=N
     dilation : int or tuple
         Dilation rate if convolution should be dilated.
 
+    groups : int
+        Number of groups
+
     out_dtype : str
         The output data type. If None then output is same type as input.
     """
-    if out_dtype is None:
-        out_dtype = data.dtype
-    if isinstance(strides, (tuple, list)):
-        strides = strides[0]
-    if isinstance(dilation, (tuple, list)):
-        dilation = dilation[0]
-
-    batch, in_channels, data_width = data.shape
-    out_channels, _, kernel_size = kernel.shape
-
-    # Compute the output shape
-    dilated_kernel_size = (kernel_size - 1) * dilation + 1
-    pad_left, pad_right = get_pad_tuple1d(padding, (dilated_kernel_size,))
-    out_channels = simplify(out_channels)
-    out_width = simplify((data_width - dilated_kernel_size + pad_left + pad_right) // strides + 1)
-
-    # Apply padding
-    pad_before = [0, 0, pad_left]
-    pad_after = [0, 0, pad_right]
-    temp = pad(data, pad_before, pad_after, name="pad_temp")
-
-    # Compute graph
-    rc = te.reduce_axis((0, in_channels), name="rc")
-    rw = te.reduce_axis((0, kernel_size), name="rw")
-
-    return te.compute(
-        (batch, out_channels, out_width),
-        lambda b, c, w: te.sum(
-            temp[b, rc, w * strides + rw * dilation].astype(out_dtype)
-            * kernel[c, rc, rw].astype(out_dtype),
-            axis=[rc, rw],
-        ),
-        tag="conv1d_ncw",
-    )
+    return conv(data, kernel, strides, padding, dilation, groups, "NWC", out_dtype=out_dtype)
 
 
-def conv1d_nwc(data, kernel, strides=1, padding="VALID", dilation=1, out_dtype=None):
-    """1D convolution forward operator for NWC layout.
+def group_conv1d_ncw(
+    data, kernel, strides=1, padding="VALID", dilation=1, groups=1, out_dtype=None
+):
+    """1D convolution forward operator for NCW layout.
 
     Parameters
     ----------
     data : tvm.te.Tensor
-        3-D with shape [batch, in_width, in_channel]
+        3-D with shape [batch, in_channel, in_width]
 
     kernel : tvm.te.Tensor
-        3-D with shape [filter_size, in_channel, num_filter]
+        3-D with shape [num_filter, in_channel, filter_size]
 
     strides : int or tuple
         The spatial stride along width
@@ -145,40 +115,10 @@ def conv1d_nwc(data, kernel, strides=1, padding="VALID", dilation=1, out_dtype=N
     dilation : int or tuple
         Dilation rate if convolution should be dilated.
 
+    groups : int
+        Number of groups
+
     out_dtype : str
         The output data type. If None then output is same type as input.
     """
-    if out_dtype is None:
-        out_dtype = data.dtype
-    if isinstance(strides, (tuple, list)):
-        strides = strides[0]
-    if isinstance(dilation, (tuple, list)):
-        dilation = dilation[0]
-
-    batch, data_width, in_channels = data.shape
-    kernel_size, _, out_channels = kernel.shape
-
-    # Compute the output shape
-    dilated_kernel_size = (kernel_size - 1) * dilation + 1
-    pad_left, pad_right = get_pad_tuple1d(padding, (dilated_kernel_size,))
-    out_channels = simplify(out_channels)
-    out_width = simplify((data_width - dilated_kernel_size + pad_left + pad_right) // strides + 1)
-
-    # Apply padding
-    pad_before = [0, pad_left, 0]
-    pad_after = [0, pad_right, 0]
-    temp = pad(data, pad_before, pad_after, name="pad_temp")
-
-    # Compute graph
-    rc = te.reduce_axis((0, in_channels), name="rc")
-    rw = te.reduce_axis((0, kernel_size), name="rw")
-
-    return te.compute(
-        (batch, out_width, out_channels),
-        lambda b, w, c: te.sum(
-            temp[b, w * strides + rw * dilation, rc].astype(out_dtype)
-            * kernel[rw, rc, c].astype(out_dtype),
-            axis=[rc, rw],
-        ),
-        tag="conv1d_nwc",
-    )
+    return conv(data, kernel, strides, padding, dilation, groups, "NCW", out_dtype=out_dtype)
diff --git a/python/tvm/topi/nn/conv2d.py b/python/tvm/topi/nn/conv2d.py
index 7cb4b09b8805..97317fa0f9cd 100644
--- a/python/tvm/topi/nn/conv2d.py
+++ b/python/tvm/topi/nn/conv2d.py
@@ -20,13 +20,16 @@
 from __future__ import absolute_import as _abs
 
 from collections import namedtuple
+import re
+from typing import Union, Sequence
+import numpy as np
 
 import tvm
 from tvm import auto_scheduler, te
 
 from ..utils import get_const_int, get_const_tuple, simplify, tag
 from .pad import pad
-from .utils import get_pad_tuple
+from .utils import get_pad_tuple, get_pad_tuple_generic
 from .winograd_util import winograd_transform_matrices
 
 # workload description of conv2d
@@ -86,13 +89,7 @@ def conv2d(input, filter, strides, padding, dilation, layout="NCHW", out_dtype=N
     """
     # search platform specific declaration first
     # default declaration
-    if layout == "NCHW":
-        return conv2d_nchw(input, filter, strides, padding, dilation, out_dtype)
-    if layout == "HWCN":
-        return conv2d_hwcn(input, filter, strides, padding, dilation, out_dtype)
-    if layout == "NHWC":
-        return conv2d_nhwc(input, filter, strides, padding, dilation, out_dtype)
-    raise ValueError("not support this layout {} yet".format(layout))
+    return conv(input, filter, strides, padding, dilation, 1, layout, out_dtype)
 
 
 @tvm.target.generic_func
@@ -242,49 +239,7 @@ def conv2d_nchw(Input, Filter, stride, padding, dilation, out_dtype=None):
     Output : tvm.te.Tensor
         4-D with shape [batch, out_channel, out_height, out_width]
     """
-    if out_dtype is None:
-        out_dtype = Input.dtype
-    assert isinstance(stride, int) or len(stride) == 2
-    assert isinstance(dilation, int) or len(dilation) == 2
-    if isinstance(stride, int):
-        stride_h = stride_w = stride
-    else:
-        stride_h, stride_w = stride
-
-    if isinstance(dilation, int):
-        dilation_h = dilation_w = dilation
-    else:
-        dilation_h, dilation_w = dilation
-
-    batch, in_channel, in_height, in_width = Input.shape
-    num_filter, channel, kernel_h, kernel_w = Filter.shape
-    # compute the output shape
-    dilated_kernel_h = (kernel_h - 1) * dilation_h + 1
-    dilated_kernel_w = (kernel_w - 1) * dilation_w + 1
-    pad_top, pad_left, pad_down, pad_right = get_pad_tuple(
-        padding, (dilated_kernel_h, dilated_kernel_w)
-    )
-    out_channel = num_filter
-    out_height = simplify((in_height - dilated_kernel_h + pad_top + pad_down) // stride_h + 1)
-    out_width = simplify((in_width - dilated_kernel_w + pad_left + pad_right) // stride_w + 1)
-    # compute graph
-    pad_before = [0, 0, pad_top, pad_left]
-    pad_after = [0, 0, pad_down, pad_right]
-    temp = pad(Input, pad_before, pad_after, name="pad_temp")
-    rc = te.reduce_axis((0, in_channel), name="rc")
-    ry = te.reduce_axis((0, kernel_h), name="ry")
-    rx = te.reduce_axis((0, kernel_w), name="rx")
-    return te.compute(
-        (batch, out_channel, out_height, out_width),
-        lambda nn, ff, yy, xx: te.sum(
-            temp[nn, rc, yy * stride_h + ry * dilation_h, xx * stride_w + rx * dilation_w].astype(
-                out_dtype
-            )
-            * Filter[ff, rc, ry, rx].astype(out_dtype),
-            axis=[rc, ry, rx],
-        ),
-        tag="conv2d_nchw",
-    )
+    return conv(Input, Filter, stride, padding, dilation, 1, "NCHW", out_dtype=out_dtype)
 
 
 def conv2d_hwcn(Input, Filter, stride, padding, dilation, out_dtype=None):
@@ -314,51 +269,7 @@ def conv2d_hwcn(Input, Filter, stride, padding, dilation, out_dtype=None):
     output : tvm.te.Tensor
         4-D with shape [out_height, out_width, out_channel, batch]
     """
-    if out_dtype is None:
-        out_dtype = Input.dtype
-    assert isinstance(stride, int) or len(stride) == 2
-    assert isinstance(dilation, int) or len(dilation) == 2
-
-    if isinstance(stride, int):
-        stride_h = stride_w = stride
-    else:
-        stride_h, stride_w = stride
-
-    if isinstance(dilation, int):
-        dilation_h = dilation_w = dilation
-    else:
-        dilation_h, dilation_w = dilation
-
-    in_height, in_width, in_channel, batch = Input.shape
-    kernel_h, kernel_w, channel, num_filter = Filter.shape
-    # compute the output shape
-    dilated_kernel_h = (kernel_h - 1) * dilation_h + 1
-    dilated_kernel_w = (kernel_w - 1) * dilation_w + 1
-    pad_top, pad_left, pad_down, pad_right = get_pad_tuple(
-        padding, (dilated_kernel_h, dilated_kernel_w)
-    )
-    out_channel = num_filter
-    out_height = simplify((in_height - dilated_kernel_h + pad_top + pad_down) // stride_h + 1)
-    out_width = simplify((in_width - dilated_kernel_w + pad_left + pad_right) // stride_w + 1)
-    pad_before = [pad_top, pad_left, 0, 0]
-    pad_after = [pad_down, pad_right, 0, 0]
-    PaddedInput = pad(Input, pad_before, pad_after, name="PaddedInput")
-    rc = te.reduce_axis((0, in_channel), name="rc")
-    ry = te.reduce_axis((0, kernel_h), name="ry")
-    rx = te.reduce_axis((0, kernel_w), name="rx")
-    Output = te.compute(
-        (out_height, out_width, out_channel, batch),
-        lambda yy, xx, ff, nn: te.sum(
-            PaddedInput[
-                yy * stride_h + ry * dilation_h, xx * stride_w + rx * dilation_w, rc, nn
-            ].astype(out_dtype)
-            * Filter[ry, rx, rc, ff].astype(out_dtype),
-            axis=[ry, rx, rc],
-        ),
-        name="Conv2dOutput",
-        tag="conv2d_hwcn",
-    )
-    return Output
+    return conv(Input, Filter, stride, padding, dilation, 1, "HWCN", out_dtype=out_dtype)
 
 
 def conv2d_nhwc(
@@ -835,55 +746,148 @@ def group_conv2d_nchw(Input, Filter, stride, padding, dilation, groups, out_dtyp
     Output : tvm.te.Tensor
         4-D with shape [batch, out_channel, out_height, out_width]
     """
+    return conv(Input, Filter, stride, padding, dilation, groups, "NCHW", out_dtype=out_dtype)
+
+
+def conv(
+    inp: te.Tensor,
+    filt: te.Tensor,
+    stride: Union[int, Sequence[int]],
+    padding: Union[int, Sequence[int]],
+    dilation: Union[int, Sequence[int]],
+    groups: int,
+    order: str,
+    out_dtype: Union[str, None] = None,
+):
+    """Convolution operator in NCHW or NHWC layout.
+
+    Supports 1D, 2D, 3D, ... and grouping.
+
+    Parameters
+    ----------
+    inp : tvm.te.Tensor
+        N-D with shape [batch, in_channel, in_height, in_width, ...] ordered by `order`
+
+    filt : tvm.te.Tensor
+        N-D with shape [num_filter, in_channel // groups, filter_height, filter_width, ...]
+        for NCHW or [filter_height, filter_width, ..., in_channel // groups, num_filter] for NHWC
+
+    stride : int or a list/tuple of dim ints
+        (where dim=2 for NCHW, dim=1 for NCH, etc.)
+        Stride size, or [stride_height, stride_width, ...]
+
+    padding : int or a list/tuple of dim or 2*dim ints
+        (where dim=2 for NCHW, dim=1 for NCH, etc.)
+        padding size, or
+        [pad_height, pad_width, ...] for dim ints, or
+        [pad_top, pad_left, pad_bottom, pad_right] for 2*dim ints
+
+    dilation : int or a list/tuple of two ints
+        dilation size, or [dilation_height, dilation_width]
+
+    groups : int
+        number of groups
+
+    order : str
+        Ordering of dimensions. N indicates batch dimension, C indicates
+        channels, any other character indicates HW (or H or HWD for 1D and 3D).
+
+    out_dtype : str
+        Elements are converted to this type before elementwise multiplication
+        and summation.
+
+    Returns
+    -------
+    Output : tvm.te.Tensor
+        N-D with shape [batch, out_channel, out_height, out_width, ...] ordered by `order`.
+    """
+    dim = len(inp.shape) - 2
     if out_dtype is None:
-        out_dtype = Input.dtype
-    assert isinstance(stride, int) or len(stride) == 2
-    assert isinstance(dilation, int) or len(dilation) == 2
+        out_dtype = inp.dtype
+    assert isinstance(stride, int) or len(stride) == dim
+    assert isinstance(dilation, int) or len(dilation) == dim
     if isinstance(stride, int):
-        stride_h = stride_w = stride
+        strides = [stride for _ in range(dim)]
     else:
-        stride_h, stride_w = stride
+        strides = stride
 
     if isinstance(dilation, int):
-        dilation_h = dilation_w = dilation
+        dilations = [dilation for _ in range(dim)]
     else:
-        dilation_h, dilation_w = dilation
+        dilations = list(dilation)
+
+    # transform from order to NCHW
+    permutation_to = [order.find("N"), order.find("C")] + [
+        x.span()[0] for x in re.finditer("[^NC]", order)
+    ]
+    # transform from NCHW to order
+    permutation_from = np.argsort(permutation_to)
+    # transform from CHW to order
+    permutation_from_reductions = permutation_from[1:].copy()
+    permutation_from_reductions[permutation_from_reductions > permutation_from[0]] -= 1
+
+    # kernel permutation, if C appears before HW then num_filter is first, otherwise it is last
+    # tkonolige: I don't really understand kernel ordering for NHWC, it seems
+    # like num_filters should match the N dimension
+    if order.find("C") < re.search("[^NC]", order).span()[0]:
+        permutation_to_kernel = [0, 1] + list(range(2, dim + 2))
+    else:
+        permutation_to_kernel = [dim + 1, dim] + list(range(dim))
+    permutation_from_kernel = np.argsort(permutation_to_kernel)
 
-    batch, in_channel, in_height, in_width = get_const_tuple(Input.shape)
-    num_filter, _, kernel_h, kernel_w = get_const_tuple(Filter.shape)
+    batch, in_channel, *dimensions = np.array(get_const_tuple(inp.shape))[permutation_to].tolist()
+    num_filter, _, *kernel_dimensions = np.array(get_const_tuple(filt.shape))[
+        permutation_to_kernel
+    ].tolist()
 
     assert in_channel % groups == 0, "input channels must divide group size"
     assert num_filter % groups == 0, "output channels must divide group size"
 
-    pad_top, pad_left, pad_down, pad_right = get_pad_tuple(padding, (kernel_h, kernel_w))
+    dilated_kernel_dimensions = [(k - 1) * dil + 1 for k, dil in zip(kernel_dimensions, dilations)]
+    pad_begin, pad_end = get_pad_tuple_generic(padding, dilated_kernel_dimensions)
     # compute the output shape
     out_channel = num_filter
-    out_height = simplify(
-        (in_height - (kernel_h - 1) * dilation_h - 1 + pad_top + pad_down) // stride_h + 1
-    )
-    out_width = simplify(
-        (in_width - (kernel_w - 1) * dilation_w - 1 + pad_left + pad_right) // stride_w + 1
-    )
+    out_dimensions = [
+        simplify(d - (k - 1) * dil - 1 + pb + pe) // stride + 1
+        for d, k, dil, pb, pe, stride in zip(
+            dimensions, kernel_dimensions, dilations, pad_begin, pad_end, strides
+        )
+    ]
     # compute graph
-    pad_before = [0, 0, pad_top, pad_left]
-    pad_after = [0, 0, pad_down, pad_right]
-    temp = pad(Input, pad_before, pad_after, name="pad_temp")
+    pad_before = list(np.array([0, 0] + pad_begin)[permutation_from])
+    pad_after = list(np.array([0, 0] + pad_end)[permutation_from])
+    temp = pad(inp, pad_before, pad_after, name="pad_temp")
     rc = te.reduce_axis((0, in_channel // groups), name="rc")
-    ry = te.reduce_axis((0, kernel_h), name="ry")
-    rx = te.reduce_axis((0, kernel_w), name="rx")
+    rs = [te.reduce_axis((0, k), name=f"r{i}") for i, k in enumerate(kernel_dimensions)]
+
+    def compute(*args):
+        nn, ff, *dim_indices = list(np.array(args)[permutation_to])
+        return te.sum(
+            temp.__getitem__(
+                tuple(
+                    np.array(
+                        [nn, ff // (num_filter // groups) * (in_channel // groups) + rc]
+                        + [
+                            di * stride + r * dil
+                            for di, stride, r, dil in zip(dim_indices, strides, rs, dilations)
+                        ]
+                    )[permutation_from]
+                )
+            ).astype(out_dtype)
+            * filt.__getitem__(tuple(np.array([ff, rc] + rs)[permutation_from_kernel])).astype(
+                out_dtype
+            ),
+            # Schedules depend on reduction axes being in the same order as the
+            # layout, so we reorder here.
+            axis=np.array([rc, *rs])[permutation_from_reductions].tolist(),
+        )
+
     return te.compute(
-        (batch, out_channel, out_height, out_width),
-        lambda nn, ff, yy, xx: te.sum(
-            temp[
-                nn,
-                ff // (num_filter // groups) * (in_channel // groups) + rc,
-                yy * stride_h + ry * dilation_h,
-                xx * stride_w + rx * dilation_w,
-            ].astype(out_dtype)
-            * Filter[ff, rc, ry, rx].astype(out_dtype),
-            axis=[rc, ry, rx],
-        ),
-        tag="group_conv2d_nchw",
+        list(np.array([batch, out_channel] + out_dimensions)[permutation_from]),
+        compute,
+        # tag is expected to be lowercase
+        tag=f"{'group_' if groups > 1 else ''}conv{dim}d_{order.lower()}",
+        name=f"{'group_' if groups > 1 else ''}conv{dim}d_{order.lower()}",
     )
 
 
@@ -893,7 +897,7 @@ def group_conv2d_nhwc(Input, Filter, stride, padding, dilation, groups, out_dtyp
     Parameters
     ----------
     Input : tvm.te.Tensor
-        4-D with shape [batch, in_height, in_width, in_channel]
+        4-D with shape [batch, in_height, in_width, in_channel, ...]
 
     Filter : tvm.te.Tensor
         4-D with shape [filter_height, filter_width, in_channel // groups, num_filter]
@@ -920,56 +924,7 @@ def group_conv2d_nhwc(Input, Filter, stride, padding, dilation, groups, out_dtyp
     Output : tvm.te.Tensor
         4-D with shape [batch, out_height, out_width, out_channel]
     """
-    if out_dtype is None:
-        out_dtype = Input.dtype
-    assert isinstance(stride, int) or len(stride) == 2
-    assert isinstance(dilation, int) or len(dilation) == 2
-    if isinstance(stride, int):
-        stride_h = stride_w = stride
-    else:
-        stride_h, stride_w = stride
-
-    if isinstance(dilation, int):
-        dilation_h = dilation_w = dilation
-    else:
-        dilation_h, dilation_w = dilation
-
-    batch, in_height, in_width, in_channel = get_const_tuple(Input.shape)
-    kernel_h, kernel_w, _, num_filter = get_const_tuple(Filter.shape)
-
-    assert in_channel % groups == 0, "input channels must divide group size"
-    assert num_filter % groups == 0, "output channels must divide group size"
-
-    pad_top, pad_left, pad_down, pad_right = get_pad_tuple(padding, (kernel_h, kernel_w))
-    # compute the output shape
-    out_channel = num_filter
-    out_height = simplify(
-        (in_height - (kernel_h - 1) * dilation_h - 1 + pad_top + pad_down) // stride_h + 1
-    )
-    out_width = simplify(
-        (in_width - (kernel_w - 1) * dilation_w - 1 + pad_left + pad_right) // stride_w + 1
-    )
-    # compute graph
-    pad_before = [0, pad_top, pad_left, 0]
-    pad_after = [0, pad_down, pad_right, 0]
-    temp = pad(Input, pad_before, pad_after, name="pad_temp")
-    ry = te.reduce_axis((0, kernel_h), name="ry")
-    rx = te.reduce_axis((0, kernel_w), name="rx")
-    rc = te.reduce_axis((0, in_channel // groups), name="rc")
-    return te.compute(
-        (batch, out_height, out_width, out_channel),
-        lambda nn, yy, xx, ff: te.sum(
-            temp[
-                nn,
-                yy * stride_h + ry * dilation_h,
-                xx * stride_w + rx * dilation_w,
-                ff // (num_filter // groups) * (in_channel // groups) + rc,
-            ].astype(out_dtype)
-            * Filter[ry, rx, rc, ff].astype(out_dtype),
-            axis=[ry, rx, rc],
-        ),
-        tag="group_conv2d_nhwc",
-    )
+    return conv(Input, Filter, stride, padding, dilation, groups, "NHWC", out_dtype=out_dtype)
 
 
 def unpack_NCHWc_to_nchw(packed_out, out_dtype):
diff --git a/python/tvm/topi/nn/utils.py b/python/tvm/topi/nn/utils.py
index 369b62c638ca..01e1c1ab5444 100644
--- a/python/tvm/topi/nn/utils.py
+++ b/python/tvm/topi/nn/utils.py
@@ -164,6 +164,53 @@ def get_pad_tuple(padding, kernel):
     return pad_top, pad_left, pad_h - pad_top, pad_w - pad_left
 
 
+def get_pad_tuple_generic(padding, kernel):
+    """Common code to get the pad option
+
+    Parameters
+    ----------
+    padding : int or str
+        Padding size, or ['VALID', 'SAME']
+
+    kernel : tuple of int
+        Conv kernel size
+
+    Returns
+    -------
+    pad_top : int
+        Padding size on top
+
+    pad_down : int
+        Padding size on down.
+
+    pad_left : int
+        Padding size on left
+
+    pad_right : int
+        Padding size on right.
+    """
+    # compute the padding size
+    if isinstance(padding, (tuple, list)):
+        if len(padding) == len(kernel):
+            pad_dimensions = [p * 2 for p in padding]
+        elif len(padding) == len(kernel) * 2:
+            return [padding[i] for i in range(len(kernel))], [
+                padding[len(kernel) + i] for i in range(len(kernel))
+            ]
+        else:
+            raise ValueError("Size of padding can only be len(kernel) or len(kernel) * 2")
+    elif isinstance(padding, int):
+        pad_dimensions = [padding * 2 for _ in range(len(kernel))]
+    elif padding == "VALID":
+        pad_dimensions = [0 for _ in range(len(kernel))]
+    elif padding == "SAME":
+        pad_dimensions = [k - 1 for k in kernel]
+    else:
+        raise ValueError("Unknown padding option %s" % padding)
+    pad_begin = [(p + 1) // 2 for p in pad_dimensions]
+    return [pad_begin, [pd - pb for pb, pd in zip(pad_begin, pad_dimensions)]]
+
+
 def get_pad_tuple3d(padding, kernel):
     """Common code to get the pad option
 
diff --git a/python/tvm/topi/testing/__init__.py b/python/tvm/topi/testing/__init__.py
index 8f78805fff3b..345886c2be91 100644
--- a/python/tvm/topi/testing/__init__.py
+++ b/python/tvm/topi/testing/__init__.py
@@ -21,7 +21,7 @@
 """
 from __future__ import absolute_import as _abs
 
-from .conv1d_ncw_python import conv1d_ncw_python
+from .conv1d_ncw_python import conv1d_ncw_python, group_conv1d_ncw_python
 from .conv2d_hwcn_python import conv2d_hwcn_python
 from .conv2d_nchw_python import conv2d_nchw_python
 from .conv2d_nhwc_python import conv2d_nhwc_python
diff --git a/python/tvm/topi/testing/conv1d_ncw_python.py b/python/tvm/topi/testing/conv1d_ncw_python.py
index 190e1c664610..9104cacd173e 100644
--- a/python/tvm/topi/testing/conv1d_ncw_python.py
+++ b/python/tvm/topi/testing/conv1d_ncw_python.py
@@ -43,6 +43,17 @@ def dilate_np(x, dilation):
     return x
 
 
+def group_conv1d_ncw_python(a_np, w_np, stride, padding, dilation, groups):
+    "Grouped version of `conv1d_ncw_python`, see that for documentation"
+    a_slices = np.array_split(a_np, groups, axis=1)
+    w_slices = np.array_split(w_np, groups, axis=0)
+    b_slices = [
+        conv1d_ncw_python(a_slice, w_slice, stride, padding, dilation)
+        for a_slice, w_slice in zip(a_slices, w_slices)
+    ]
+    return np.concatenate(b_slices, axis=1)
+
+
 def conv1d_ncw_python(a_np, w_np, stride, padding, dilation):
     """1D convolution operator in NCW layout
 
@@ -64,6 +75,9 @@ def conv1d_ncw_python(a_np, w_np, stride, padding, dilation):
     dilation : int
         Dilation rate of the kernel
 
+    groups : int
+        Number of groups in the convolution
+
     Returns
     -------
     b_np : numpy.ndarray
diff --git a/python/tvm/topi/x86/conv1d.py b/python/tvm/topi/x86/conv1d.py
index 121c1c21c784..76fc40cab3a4 100644
--- a/python/tvm/topi/x86/conv1d.py
+++ b/python/tvm/topi/x86/conv1d.py
@@ -126,3 +126,11 @@ def traverse(op):
 
     traverse(output_op)
     return s
+
+
+def schedule_group_conv1d_ncw(outs):
+    return schedule_conv1d_ncw(outs)
+
+
+def schedule_group_conv1d_nwc(outs):
+    return schedule_conv1d_nwc(outs)
diff --git a/src/relay/op/nn/convolution.h b/src/relay/op/nn/convolution.h
index e5e64fa5be65..954daeaa86cf 100644
--- a/src/relay/op/nn/convolution.h
+++ b/src/relay/op/nn/convolution.h
@@ -76,7 +76,7 @@ bool Conv1DRel(const Array<Type>& types, int num_inputs, const Attrs& attrs,
   if (param->kernel_size.defined() && param->channels.defined()) {
     Array<IndexExpr> wshape;
 
-    wshape = {{param->channels, dshape_ncw[1], param->kernel_size[0]}};
+    wshape = {{param->channels, indexdiv(dshape_ncw[1], param->groups), param->kernel_size[0]}};
 
     wshape = trans_kernel_layout.BackwardShape(wshape);
     channels = param->channels;
diff --git a/tests/python/topi/python/test_topi_conv1d.py b/tests/python/topi/python/test_topi_conv1d.py
index f5284ca36fee..db8d7238feba 100644
--- a/tests/python/topi/python/test_topi_conv1d.py
+++ b/tests/python/topi/python/test_topi_conv1d.py
@@ -38,6 +38,19 @@
     "gpu": (topi.cuda.conv1d_nwc, topi.cuda.schedule_conv1d_nwc),
 }
 
+_group_conv1d_implementations = {
+    "NCW": {
+        "generic": (topi.nn.group_conv1d_ncw, topi.generic.schedule_group_conv1d_ncw),
+        "cpu": (topi.nn.group_conv1d_ncw, topi.x86.schedule_group_conv1d_ncw),
+        "gpu": (topi.cuda.group_conv1d_ncw, topi.cuda.schedule_group_conv1d_ncw),
+    },
+    "NWC": {
+        "generic": (topi.nn.group_conv1d_nwc, topi.generic.schedule_group_conv1d_nwc),
+        "cpu": (topi.nn.group_conv1d_nwc, topi.x86.schedule_group_conv1d_nwc),
+        "gpu": (topi.cuda.group_conv1d_nwc, topi.cuda.schedule_group_conv1d_nwc),
+    },
+}
+
 
 def verify_conv1d(
     batch,
@@ -122,5 +135,65 @@ def test_conv1d():
         verify_conv1d(1, 5, 27, 18, 3, 1, 1, "VALID", layout)
 
 
+layout = tvm.testing.parameter("NCW", "NWC")
+padding = tvm.testing.parameter("SAME", "VALID")
+dtype = tvm.testing.parameter("float32")
+
+# batch, in_channels, in_width, filters, kernel_size, stride, dilation, groups
+shape = tvm.testing.parameter(
+    [1, 4, 8, 4, 3, 1, 1, 4],
+    [1, 4, 8, 4, 3, 1, 1, 4],
+    [1, 16, 32, 16, 3, 1, 1, 4],
+    [1, 16, 32, 16, 3, 2, 1, 4],
+    [1, 16, 32, 16, 3, 1, 2, 4],
+    [8, 16, 32, 16, 3, 1, 1, 4],
+    [1, 16, 32, 16, 3, 1, 1, 4],
+    [1, 16, 32, 16, 2, 1, 1, 4],
+    [1, 16, 32, 16, 1, 1, 1, 4],
+    [1, 21, 12, 21, 3, 1, 1, 3],
+    [1, 20, 27, 20, 3, 1, 1, 5],
+)
+
+
+def test_group_conv1d(shape, layout, padding, target, dev, dtype):
+    batch, in_channels, in_width, filters, kernel_size, stride, dilation, groups = shape
+    if layout == "NCW":
+        in_shape = [batch, in_channels, in_width]
+        kernel_shape = [filters, in_channels // groups, kernel_size]
+    else:
+        in_shape = [batch, in_width, in_channels]
+        kernel_shape = [kernel_size, in_channels // groups, filters]
+
+    # reference data
+    a_np = np.random.uniform(size=in_shape).astype(dtype)
+    w_np = np.random.uniform(size=kernel_shape).astype(dtype)
+    if layout == "NWC":
+        np_in = np.transpose(a_np, [0, 2, 1])
+        np_w = np.transpose(w_np, [2, 1, 0])
+    else:
+        np_in = a_np
+        np_w = w_np
+    b_np = tvm.topi.testing.group_conv1d_ncw_python(np_in, np_w, stride, padding, dilation, groups)
+    if layout == "NWC":
+        b_np = np.transpose(b_np, [0, 2, 1])
+
+    A = te.placeholder(in_shape, name="A", dtype=dtype)
+    W = te.placeholder(kernel_shape, name="W", dtype=dtype)
+
+    fcompute, fschedule = tvm.topi.testing.dispatch(target, _group_conv1d_implementations[layout])
+    with tvm.target.Target(target):
+        B = fcompute(A, W, stride, padding, dilation, groups, "float32")
+        s = fschedule([B])
+
+    a = tvm.nd.array(a_np, dev)
+    w = tvm.nd.array(w_np, dev)
+    b = tvm.nd.array(np.zeros(get_const_tuple(B.shape), dtype=dtype), dev)
+    print(tvm.lower(s, [A, W, B], target))
+
+    func = tvm.build(s, [A, W, B], target)
+    func(a, w, b)
+    tvm.testing.assert_allclose(b.numpy(), b_np, rtol=1e-5)
+
+
 if __name__ == "__main__":
     test_conv1d()