[mlir] TileUsingInterface bugfix for dominance error by ziereis · Pull Request #178190 · llvm/llvm-project

ziereis · 2026-01-27T12:37:54Z

In this PR I move the insertion point in yieldReplacementForFusedProducer because I ran into an issue where a tensor.extract_slice tried to use the result of an affine.apply that was inserted at the end of the block instead of at the start of it.

This is the full error from the test I added, as produced before this change:

third-party/llvm-project/mlir/test/Interfaces/TilingInterface/tile-fuse-and-yield-using-scfforall.mlir:83:11: error: operand #1 does not dominate this use
  %pack = linalg.pack %gen#1
          ^
third-party/llvm-project/mlir/test/Interfaces/TilingInterface/tile-fuse-and-yield-using-scfforall.mlir:83:11: note: see current operation: %24 = "tensor.extract_slice"(%23, %36, %8) <{operandSegmentSizes = array<i32: 1, 1, 1, 0>, static_offsets = array<i64: -9223372036854775808, 0>, static_sizes = array<i64: -9223372036854775808, 1024>, static_strides = array<i64: 1, 1>}> : (tensor<32x1024xf32>, index, index) -> tensor<?x1024xf32>
third-party/llvm-project/mlir/test/Interfaces/TilingInterface/tile-fuse-and-yield-using-scfforall.mlir:71:12: note: operand defined here (op in the same block)
  %gen:2 = linalg.generic {
           ^
// -----// IR Dump After InterpreterPass Failed (transform-interpreter) //----- //
#map = affine_map<(d0, d1) -> (d0, d1)>
#map1 = affine_map<(d0) -> (d0 * 16)>
#map2 = affine_map<(d0) -> (d0 * -16 + 32)>
#map3 = affine_map<(d0) -> (16, d0 * -16 + 32)>
#map4 = affine_map<(d0) -> (d0 - 1)>
"builtin.module"() ({
  "func.func"() <{function_type = (tensor<32x1024xf32>) -> (tensor<32x1024xf32>, tensor<2x512x16x2xi8>), sym_name = "fuse_pack_consumer_into_multi_output_generic"}> ({
  ^bb0(%arg1: tensor<32x1024xf32>):
    %2 = "arith.constant"() <{value = 0 : i8}> : () -> i8
    %3 = "tensor.empty"() : () -> tensor<32x1024xf32>
    %4 = "tensor.empty"() : () -> tensor<32x1024xi8>
    %5 = "tensor.empty"() : () -> tensor<2x512x16x2xi8>
    %6:2 = "linalg.generic"(%arg1, %3, %4) <{indexing_maps = [#map, #map, #map], iterator_types = [#linalg.iterator_type<parallel>, #linalg.iterator_type<parallel>], operandSegmentSizes = array<i32: 1, 2>}> ({
    ^bb0(%arg9: f32, %arg10: f32, %arg11: i8):
      %41 = "arith.fptoui"(%arg9) : (f32) -> i8
      "linalg.yield"(%arg9, %41) : (f32, i8) -> ()
    }) : (tensor<32x1024xf32>, tensor<32x1024xf32>, tensor<32x1024xi8>) -> (tensor<32x1024xf32>, tensor<32x1024xi8>)
    %7:3 = "scf.forall"(%5, %3, %4) <{operandSegmentSizes = array<i32: 0, 0, 0, 3>, staticLowerBound = array<i64: 0>, staticStep = array<i64: 1>, staticUpperBound = array<i64: 2>}> ({
    ^bb0(%arg2: index, %arg3: tensor<2x512x16x2xi8>, %arg4: tensor<32x1024xf32>, %arg5: tensor<32x1024xi8>):
      %8 = "affine.apply"(%arg2) <{map = #map1}> : (index) -> index
      %9 = "affine.apply"(%arg2) <{map = #map2}> : (index) -> index
      %10 = "affine.min"(%arg2) <{map = #map3}> : (index) -> index
      %11 = "affine.apply"(%10) <{map = #map4}> : (index) -> index
      %12 = "affine.apply"(%arg2) <{map = #map1}> : (index) -> index
      %13 = "affine.apply"(%10) <{map = #map4}> : (index) -> index
      %14 = "affine.apply"(%arg2) <{map = #map1}> : (index) -> index
      %15 = "affine.apply"(%10) <{map = #map4}> : (index) -> index
      %16 = "affine.apply"(%arg2) <{map = #map1}> : (index) -> index
      %17 = "affine.apply"(%10) <{map = #map4}> : (index) -> index
      %18 = "tensor.extract_slice"(%arg1, %12, %10) <{operandSegmentSizes = array<i32: 1, 1, 1, 0>, static_offsets = array<i64: -9223372036854775808, 0>, static_sizes = array<i64: -9223372036854775808, 1024>, static_strides = array<i64: 1, 1>}> : (tensor<32x1024xf32>, index, index) -> tensor<?x1024xf32>
      %19 = "tensor.empty"() : () -> tensor<32x1024xf32>
      %20 = "tensor.extract_slice"(%19, %14, %10) <{operandSegmentSizes = array<i32: 1, 1, 1, 0>, static_offsets = array<i64: -9223372036854775808, 0>, static_sizes = array<i64: -9223372036854775808, 1024>, static_strides = array<i64: 1, 1>}> : (tensor<32x1024xf32>, index, index) -> tensor<?x1024xf32>
      %21 = "tensor.extract_slice"(%3, %14, %10) <{operandSegmentSizes = array<i32: 1, 1, 1, 0>, static_offsets = array<i64: -9223372036854775808, 0>, static_sizes = array<i64: -9223372036854775808, 1024>, static_strides = array<i64: 1, 1>}> : (tensor<32x1024xf32>, index, index) -> tensor<?x1024xf32>
      %22 = "tensor.empty"() : () -> tensor<32x1024xi8>
      %23 = "tensor.extract_slice"(%22, %16, %10) <{operandSegmentSizes = array<i32: 1, 1, 1, 0>, static_offsets = array<i64: -9223372036854775808, 0>, static_sizes = array<i64: -9223372036854775808, 1024>, static_strides = array<i64: 1, 1>}> : (tensor<32x1024xi8>, index, index) -> tensor<?x1024xi8>
      %24 = "tensor.extract_slice"(%4, %16, %10) <{operandSegmentSizes = array<i32: 1, 1, 1, 0>, static_offsets = array<i64: -9223372036854775808, 0>, static_sizes = array<i64: -9223372036854775808, 1024>, static_strides = array<i64: 1, 1>}> : (tensor<32x1024xi8>, index, index) -> tensor<?x1024xi8>
      %25 = "tensor.empty"() : () -> tensor<32x1024xf32>
      %26 = "tensor.extract_slice"(%25, %38, %10) <{operandSegmentSizes = array<i32: 1, 1, 1, 0>, static_offsets = array<i64: -9223372036854775808, 0>, static_sizes = array<i64: -9223372036854775808, 1024>, static_strides = array<i64: 1, 1>}> : (tensor<32x1024xf32>, index, index) -> tensor<?x1024xf32>
      %27 = "tensor.extract_slice"(%arg4, %38, %10) <{operandSegmentSizes = array<i32: 1, 1, 1, 0>, static_offsets = array<i64: -9223372036854775808, 0>, static_sizes = array<i64: -9223372036854775808, 1024>, static_strides = array<i64: 1, 1>}> : (tensor<32x1024xf32>, index, index) -> tensor<?x1024xf32>
      %28 = "tensor.empty"() : () -> tensor<32x1024xi8>
      %29 = "tensor.extract_slice"(%28, %8, %10) <{operandSegmentSizes = array<i32: 1, 1, 1, 0>, static_offsets = array<i64: -9223372036854775808, 0>, static_sizes = array<i64: -9223372036854775808, 1024>, static_strides = array<i64: 1, 1>}> : (tensor<32x1024xi8>, index, index) -> tensor<?x1024xi8>
      %30 = "tensor.extract_slice"(%arg5, %8, %10) <{operandSegmentSizes = array<i32: 1, 1, 1, 0>, static_offsets = array<i64: -9223372036854775808, 0>, static_sizes = array<i64: -9223372036854775808, 1024>, static_strides = array<i64: 1, 1>}> : (tensor<32x1024xi8>, index, index) -> tensor<?x1024xi8>
      %31:2 = "linalg.generic"(%18, %27, %30) <{indexing_maps = [#map, #map, #map], iterator_types = [#linalg.iterator_type<parallel>, #linalg.iterator_type<parallel>], operandSegmentSizes = array<i32: 1, 2>}> ({
      ^bb0(%arg6: f32, %arg7: f32, %arg8: i8):
        %40 = "arith.fptoui"(%arg6) : (f32) -> i8
        "linalg.yield"(%arg6, %40) : (f32, i8) -> ()
      }) : (tensor<?x1024xf32>, tensor<?x1024xf32>, tensor<?x1024xi8>) -> (tensor<?x1024xf32>, tensor<?x1024xi8>)
      %32 = "tensor.extract_slice"(%6#1, %8, %10) <{operandSegmentSizes = array<i32: 1, 1, 1, 0>, static_offsets = array<i64: -9223372036854775808, 0>, static_sizes = array<i64: -9223372036854775808, 1024>, static_strides = array<i64: 1, 1>}> : (tensor<32x1024xi8>, index, index) -> tensor<?x1024xi8>
      %33 = "tensor.empty"() : () -> tensor<2x512x16x2xi8>
      %34 = "tensor.extract_slice"(%33, %arg2) <{operandSegmentSizes = array<i32: 1, 1, 0, 0>, static_offsets = array<i64: -9223372036854775808, 0, 0, 0>, static_sizes = array<i64: 1, 512, 16, 2>, static_strides = array<i64: 1, 1, 1, 1>}> : (tensor<2x512x16x2xi8>, index) -> tensor<1x512x16x2xi8>
      %35 = "tensor.extract_slice"(%arg3, %arg2) <{operandSegmentSizes = array<i32: 1, 1, 0, 0>, static_offsets = array<i64: -9223372036854775808, 0, 0, 0>, static_sizes = array<i64: 1, 512, 16, 2>, static_strides = array<i64: 1, 1, 1, 1>}> : (tensor<2x512x16x2xi8>, index) -> tensor<1x512x16x2xi8>
      %36 = "linalg.pack"(%31#1, %35, %2) <{inner_dims_pos = array<i64: 0, 1>, operandSegmentSizes = array<i32: 1, 1, 1, 0>, static_inner_tiles = array<i64: 16, 2>}> : (tensor<?x1024xi8>, tensor<1x512x16x2xi8>, i8) -> tensor<1x512x16x2xi8>
      %37 = "affine.apply"(%10) <{map = #map4}> : (index) -> index
      %38 = "affine.apply"(%arg2) <{map = #map1}> : (index) -> index
      %39 = "affine.apply"(%10) <{map = #map4}> : (index) -> index
      "scf.forall.in_parallel"() ({
        "tensor.parallel_insert_slice"(%36, %arg3, %arg2) <{operandSegmentSizes = array<i32: 1, 1, 1, 0, 0>, static_offsets = array<i64: -9223372036854775808, 0, 0, 0>, static_sizes = array<i64: 1, 512, 16, 2>, static_strides = array<i64: 1, 1, 1, 1>}> : (tensor<1x512x16x2xi8>, tensor<2x512x16x2xi8>, index) -> ()
        "tensor.parallel_insert_slice"(%31#0, %arg4, %38, %10) <{operandSegmentSizes = array<i32: 1, 1, 1, 1, 0>, static_offsets = array<i64: -9223372036854775808, 0>, static_sizes = array<i64: -9223372036854775808, 1024>, static_strides = array<i64: 1, 1>}> : (tensor<?x1024xf32>, tensor<32x1024xf32>, index, index) -> ()
        "tensor.parallel_insert_slice"(%31#1, %arg5, %8, %10) <{operandSegmentSizes = array<i32: 1, 1, 1, 1, 0>, static_offsets = array<i64: -9223372036854775808, 0>, static_sizes = array<i64: -9223372036854775808, 1024>, static_strides = array<i64: 1, 1>}> : (tensor<?x1024xi8>, tensor<32x1024xi8>, index, index) -> ()
      }) : () -> ()
    }) : (tensor<2x512x16x2xi8>, tensor<32x1024xf32>, tensor<32x1024xi8>) -> (tensor<2x512x16x2xi8>, tensor<32x1024xf32>, tensor<32x1024xi8>)
    "func.return"(%7#1, %7#0) : (tensor<32x1024xf32>, tensor<2x512x16x2xi8>) -> ()
  }) : () -> ()
  "builtin.module"() ({
    "transform.named_sequence"() <{arg_attrs = [{transform.readonly}], function_type = (!transform.any_op) -> (), sym_name = "__transform_main"}> ({
    ^bb0(%arg0: !transform.any_op):
      %0 = "transform.structured.match"(%arg0) <{ops = ["linalg.pack"]}> : (!transform.any_op) -> !transform.any_op
      %1:2 = "transform.test.fuse_and_yield"(%0) <{tile_interchange = [], tile_sizes = [1], use_forall = true}> : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
      "transform.yield"() : () -> ()
    }) : () -> ()
  }) {transform.with_named_sequence} : () -> ()
}) : () -> ()

I also noticed that the Interfaces tests are missing from the Bazel overlay, so I added them as well.

llvmbot · 2026-01-27T12:38:29Z

@llvm/pr-subscribers-mlir-scf

@llvm/pr-subscribers-mlir

Author: None (ziereis)

Changes

In this PR I move the insertion point in yieldReplacementForFusedProducer because I ran into an issue where a tensor.extract_slice tried to use the result of an affine.apply that was inserted at the end of the block instead of at the start of it.

This is the full error from the test I added, as produced before this change:

third-party/llvm-project/mlir/test/Interfaces/TilingInterface/tile-fuse-and-yield-using-scfforall.mlir:83:11: error: operand #1 does not dominate this use
  %pack = linalg.pack %gen#1
          ^
third-party/llvm-project/mlir/test/Interfaces/TilingInterface/tile-fuse-and-yield-using-scfforall.mlir:83:11: note: see current operation: %24 = "tensor.extract_slice"(%23, %36, %8) <{operandSegmentSizes = array<i32: 1, 1, 1, 0>, static_offsets = array<i64: -9223372036854775808, 0>, static_sizes = array<i64: -9223372036854775808, 1024>, static_strides = array<i64: 1, 1>}> : (tensor<32x1024xf32>, index, index) -> tensor<?x1024xf32>
third-party/llvm-project/mlir/test/Interfaces/TilingInterface/tile-fuse-and-yield-using-scfforall.mlir:71:12: note: operand defined here (op in the same block)
  %gen:2 = linalg.generic {
           ^
// -----// IR Dump After InterpreterPass Failed (transform-interpreter) //----- //
#map = affine_map<(d0, d1) -> (d0, d1)>
#map1 = affine_map<(d0) -> (d0 * 16)>
#map2 = affine_map<(d0) -> (d0 * -16 + 32)>
#map3 = affine_map<(d0) -> (16, d0 * -16 + 32)>
#map4 = affine_map<(d0) -> (d0 - 1)>
"builtin.module"() ({
  "func.func"() &lt;{function_type = (tensor&lt;32x1024xf32&gt;) -&gt; (tensor&lt;32x1024xf32&gt;, tensor&lt;2x512x16x2xi8&gt;), sym_name = "fuse_pack_consumer_into_multi_output_generic"}&gt; ({
  ^bb0(%arg1: tensor&lt;32x1024xf32&gt;):
    %2 = "arith.constant"() &lt;{value = 0 : i8}&gt; : () -&gt; i8
    %3 = "tensor.empty"() : () -&gt; tensor&lt;32x1024xf32&gt;
    %4 = "tensor.empty"() : () -&gt; tensor&lt;32x1024xi8&gt;
    %5 = "tensor.empty"() : () -&gt; tensor&lt;2x512x16x2xi8&gt;
    %6:2 = "linalg.generic"(%arg1, %3, %4) &lt;{indexing_maps = [#map, #map, #map], iterator_types = [#linalg.iterator_type&lt;parallel&gt;, #linalg.iterator_type&lt;parallel&gt;], operandSegmentSizes = array&lt;i32: 1, 2&gt;}&gt; ({
    ^bb0(%arg9: f32, %arg10: f32, %arg11: i8):
      %41 = "arith.fptoui"(%arg9) : (f32) -&gt; i8
      "linalg.yield"(%arg9, %41) : (f32, i8) -&gt; ()
    }) : (tensor&lt;32x1024xf32&gt;, tensor&lt;32x1024xf32&gt;, tensor&lt;32x1024xi8&gt;) -&gt; (tensor&lt;32x1024xf32&gt;, tensor&lt;32x1024xi8&gt;)
    %7:3 = "scf.forall"(%5, %3, %4) &lt;{operandSegmentSizes = array&lt;i32: 0, 0, 0, 3&gt;, staticLowerBound = array&lt;i64: 0&gt;, staticStep = array&lt;i64: 1&gt;, staticUpperBound = array&lt;i64: 2&gt;}&gt; ({
    ^bb0(%arg2: index, %arg3: tensor&lt;2x512x16x2xi8&gt;, %arg4: tensor&lt;32x1024xf32&gt;, %arg5: tensor&lt;32x1024xi8&gt;):
      %8 = "affine.apply"(%arg2) &lt;{map = #map1}&gt; : (index) -&gt; index
      %9 = "affine.apply"(%arg2) &lt;{map = #map2}&gt; : (index) -&gt; index
      %10 = "affine.min"(%arg2) &lt;{map = #map3}&gt; : (index) -&gt; index
      %11 = "affine.apply"(%10) &lt;{map = #map4}&gt; : (index) -&gt; index
      %12 = "affine.apply"(%arg2) &lt;{map = #map1}&gt; : (index) -&gt; index
      %13 = "affine.apply"(%10) &lt;{map = #map4}&gt; : (index) -&gt; index
      %14 = "affine.apply"(%arg2) &lt;{map = #map1}&gt; : (index) -&gt; index
      %15 = "affine.apply"(%10) &lt;{map = #map4}&gt; : (index) -&gt; index
      %16 = "affine.apply"(%arg2) &lt;{map = #map1}&gt; : (index) -&gt; index
      %17 = "affine.apply"(%10) &lt;{map = #map4}&gt; : (index) -&gt; index
      %18 = "tensor.extract_slice"(%arg1, %12, %10) &lt;{operandSegmentSizes = array&lt;i32: 1, 1, 1, 0&gt;, static_offsets = array&lt;i64: -9223372036854775808, 0&gt;, static_sizes = array&lt;i64: -9223372036854775808, 1024&gt;, static_strides = array&lt;i64: 1, 1&gt;}&gt; : (tensor&lt;32x1024xf32&gt;, index, index) -&gt; tensor&lt;?x1024xf32&gt;
      %19 = "tensor.empty"() : () -&gt; tensor&lt;32x1024xf32&gt;
      %20 = "tensor.extract_slice"(%19, %14, %10) &lt;{operandSegmentSizes = array&lt;i32: 1, 1, 1, 0&gt;, static_offsets = array&lt;i64: -9223372036854775808, 0&gt;, static_sizes = array&lt;i64: -9223372036854775808, 1024&gt;, static_strides = array&lt;i64: 1, 1&gt;}&gt; : (tensor&lt;32x1024xf32&gt;, index, index) -&gt; tensor&lt;?x1024xf32&gt;
      %21 = "tensor.extract_slice"(%3, %14, %10) &lt;{operandSegmentSizes = array&lt;i32: 1, 1, 1, 0&gt;, static_offsets = array&lt;i64: -9223372036854775808, 0&gt;, static_sizes = array&lt;i64: -9223372036854775808, 1024&gt;, static_strides = array&lt;i64: 1, 1&gt;}&gt; : (tensor&lt;32x1024xf32&gt;, index, index) -&gt; tensor&lt;?x1024xf32&gt;
      %22 = "tensor.empty"() : () -&gt; tensor&lt;32x1024xi8&gt;
      %23 = "tensor.extract_slice"(%22, %16, %10) &lt;{operandSegmentSizes = array&lt;i32: 1, 1, 1, 0&gt;, static_offsets = array&lt;i64: -9223372036854775808, 0&gt;, static_sizes = array&lt;i64: -9223372036854775808, 1024&gt;, static_strides = array&lt;i64: 1, 1&gt;}&gt; : (tensor&lt;32x1024xi8&gt;, index, index) -&gt; tensor&lt;?x1024xi8&gt;
      %24 = "tensor.extract_slice"(%4, %16, %10) &lt;{operandSegmentSizes = array&lt;i32: 1, 1, 1, 0&gt;, static_offsets = array&lt;i64: -9223372036854775808, 0&gt;, static_sizes = array&lt;i64: -9223372036854775808, 1024&gt;, static_strides = array&lt;i64: 1, 1&gt;}&gt; : (tensor&lt;32x1024xi8&gt;, index, index) -&gt; tensor&lt;?x1024xi8&gt;
      %25 = "tensor.empty"() : () -&gt; tensor&lt;32x1024xf32&gt;
      %26 = "tensor.extract_slice"(%25, %38, %10) &lt;{operandSegmentSizes = array&lt;i32: 1, 1, 1, 0&gt;, static_offsets = array&lt;i64: -9223372036854775808, 0&gt;, static_sizes = array&lt;i64: -9223372036854775808, 1024&gt;, static_strides = array&lt;i64: 1, 1&gt;}&gt; : (tensor&lt;32x1024xf32&gt;, index, index) -&gt; tensor&lt;?x1024xf32&gt;
      %27 = "tensor.extract_slice"(%arg4, %38, %10) &lt;{operandSegmentSizes = array&lt;i32: 1, 1, 1, 0&gt;, static_offsets = array&lt;i64: -9223372036854775808, 0&gt;, static_sizes = array&lt;i64: -9223372036854775808, 1024&gt;, static_strides = array&lt;i64: 1, 1&gt;}&gt; : (tensor&lt;32x1024xf32&gt;, index, index) -&gt; tensor&lt;?x1024xf32&gt;
      %28 = "tensor.empty"() : () -&gt; tensor&lt;32x1024xi8&gt;
      %29 = "tensor.extract_slice"(%28, %8, %10) &lt;{operandSegmentSizes = array&lt;i32: 1, 1, 1, 0&gt;, static_offsets = array&lt;i64: -9223372036854775808, 0&gt;, static_sizes = array&lt;i64: -9223372036854775808, 1024&gt;, static_strides = array&lt;i64: 1, 1&gt;}&gt; : (tensor&lt;32x1024xi8&gt;, index, index) -&gt; tensor&lt;?x1024xi8&gt;
      %30 = "tensor.extract_slice"(%arg5, %8, %10) &lt;{operandSegmentSizes = array&lt;i32: 1, 1, 1, 0&gt;, static_offsets = array&lt;i64: -9223372036854775808, 0&gt;, static_sizes = array&lt;i64: -9223372036854775808, 1024&gt;, static_strides = array&lt;i64: 1, 1&gt;}&gt; : (tensor&lt;32x1024xi8&gt;, index, index) -&gt; tensor&lt;?x1024xi8&gt;
      %31:2 = "linalg.generic"(%18, %27, %30) &lt;{indexing_maps = [#map, #map, #map], iterator_types = [#linalg.iterator_type&lt;parallel&gt;, #linalg.iterator_type&lt;parallel&gt;], operandSegmentSizes = array&lt;i32: 1, 2&gt;}&gt; ({
      ^bb0(%arg6: f32, %arg7: f32, %arg8: i8):
        %40 = "arith.fptoui"(%arg6) : (f32) -&gt; i8
        "linalg.yield"(%arg6, %40) : (f32, i8) -&gt; ()
      }) : (tensor&lt;?x1024xf32&gt;, tensor&lt;?x1024xf32&gt;, tensor&lt;?x1024xi8&gt;) -&gt; (tensor&lt;?x1024xf32&gt;, tensor&lt;?x1024xi8&gt;)
      %32 = "tensor.extract_slice"(%6#<!-- -->1, %8, %10) &lt;{operandSegmentSizes = array&lt;i32: 1, 1, 1, 0&gt;, static_offsets = array&lt;i64: -9223372036854775808, 0&gt;, static_sizes = array&lt;i64: -9223372036854775808, 1024&gt;, static_strides = array&lt;i64: 1, 1&gt;}&gt; : (tensor&lt;32x1024xi8&gt;, index, index) -&gt; tensor&lt;?x1024xi8&gt;
      %33 = "tensor.empty"() : () -&gt; tensor&lt;2x512x16x2xi8&gt;
      %34 = "tensor.extract_slice"(%33, %arg2) &lt;{operandSegmentSizes = array&lt;i32: 1, 1, 0, 0&gt;, static_offsets = array&lt;i64: -9223372036854775808, 0, 0, 0&gt;, static_sizes = array&lt;i64: 1, 512, 16, 2&gt;, static_strides = array&lt;i64: 1, 1, 1, 1&gt;}&gt; : (tensor&lt;2x512x16x2xi8&gt;, index) -&gt; tensor&lt;1x512x16x2xi8&gt;
      %35 = "tensor.extract_slice"(%arg3, %arg2) &lt;{operandSegmentSizes = array&lt;i32: 1, 1, 0, 0&gt;, static_offsets = array&lt;i64: -9223372036854775808, 0, 0, 0&gt;, static_sizes = array&lt;i64: 1, 512, 16, 2&gt;, static_strides = array&lt;i64: 1, 1, 1, 1&gt;}&gt; : (tensor&lt;2x512x16x2xi8&gt;, index) -&gt; tensor&lt;1x512x16x2xi8&gt;
      %36 = "linalg.pack"(%31#<!-- -->1, %35, %2) &lt;{inner_dims_pos = array&lt;i64: 0, 1&gt;, operandSegmentSizes = array&lt;i32: 1, 1, 1, 0&gt;, static_inner_tiles = array&lt;i64: 16, 2&gt;}&gt; : (tensor&lt;?x1024xi8&gt;, tensor&lt;1x512x16x2xi8&gt;, i8) -&gt; tensor&lt;1x512x16x2xi8&gt;
      %37 = "affine.apply"(%10) &lt;{map = #map4}&gt; : (index) -&gt; index
      %38 = "affine.apply"(%arg2) &lt;{map = #map1}&gt; : (index) -&gt; index
      %39 = "affine.apply"(%10) &lt;{map = #map4}&gt; : (index) -&gt; index
      "scf.forall.in_parallel"() ({
        "tensor.parallel_insert_slice"(%36, %arg3, %arg2) &lt;{operandSegmentSizes = array&lt;i32: 1, 1, 1, 0, 0&gt;, static_offsets = array&lt;i64: -9223372036854775808, 0, 0, 0&gt;, static_sizes = array&lt;i64: 1, 512, 16, 2&gt;, static_strides = array&lt;i64: 1, 1, 1, 1&gt;}&gt; : (tensor&lt;1x512x16x2xi8&gt;, tensor&lt;2x512x16x2xi8&gt;, index) -&gt; ()
        "tensor.parallel_insert_slice"(%31#<!-- -->0, %arg4, %38, %10) &lt;{operandSegmentSizes = array&lt;i32: 1, 1, 1, 1, 0&gt;, static_offsets = array&lt;i64: -9223372036854775808, 0&gt;, static_sizes = array&lt;i64: -9223372036854775808, 1024&gt;, static_strides = array&lt;i64: 1, 1&gt;}&gt; : (tensor&lt;?x1024xf32&gt;, tensor&lt;32x1024xf32&gt;, index, index) -&gt; ()
        "tensor.parallel_insert_slice"(%31#<!-- -->1, %arg5, %8, %10) &lt;{operandSegmentSizes = array&lt;i32: 1, 1, 1, 1, 0&gt;, static_offsets = array&lt;i64: -9223372036854775808, 0&gt;, static_sizes = array&lt;i64: -9223372036854775808, 1024&gt;, static_strides = array&lt;i64: 1, 1&gt;}&gt; : (tensor&lt;?x1024xi8&gt;, tensor&lt;32x1024xi8&gt;, index, index) -&gt; ()
      }) : () -&gt; ()
    }) : (tensor&lt;2x512x16x2xi8&gt;, tensor&lt;32x1024xf32&gt;, tensor&lt;32x1024xi8&gt;) -&gt; (tensor&lt;2x512x16x2xi8&gt;, tensor&lt;32x1024xf32&gt;, tensor&lt;32x1024xi8&gt;)
    "func.return"(%7#<!-- -->1, %7#<!-- -->0) : (tensor&lt;32x1024xf32&gt;, tensor&lt;2x512x16x2xi8&gt;) -&gt; ()
  }) : () -&gt; ()
  "builtin.module"() ({
    "transform.named_sequence"() &lt;{arg_attrs = [{transform.readonly}], function_type = (!transform.any_op) -&gt; (), sym_name = "__transform_main"}&gt; ({
    ^bb0(%arg0: !transform.any_op):
      %0 = "transform.structured.match"(%arg0) &lt;{ops = ["linalg.pack"]}&gt; : (!transform.any_op) -&gt; !transform.any_op
      %1:2 = "transform.test.fuse_and_yield"(%0) &lt;{tile_interchange = [], tile_sizes = [1], use_forall = true}&gt; : (!transform.any_op) -&gt; (!transform.any_op, !transform.any_op)
      "transform.yield"() : () -&gt; ()
    }) : () -&gt; ()
  }) {transform.with_named_sequence} : () -&gt; ()
}) : () -&gt; ()

I also noticed that the Interfaces tests are missing from the Bazel overlay, so I added them as well.

Full diff: https://github.com/llvm/llvm-project/pull/178190.diff

3 Files Affected:

(modified) mlir/lib/Dialect/SCF/Transforms/TileUsingInterface.cpp (+8-1)
(modified) mlir/test/Interfaces/TilingInterface/tile-fuse-and-yield-using-scfforall.mlir (+69-1)
(added) utils/bazel/llvm-project-overlay/mlir/test/Interfaces/BUILD.bazel (+20)

diff --git a/mlir/lib/Dialect/SCF/Transforms/TileUsingInterface.cpp b/mlir/lib/Dialect/SCF/Transforms/TileUsingInterface.cpp
index 4d22a5e97ba4a..8bfc9e9dfad3d 100644
--- a/mlir/lib/Dialect/SCF/Transforms/TileUsingInterface.cpp
+++ b/mlir/lib/Dialect/SCF/Transforms/TileUsingInterface.cpp
@@ -1509,6 +1509,14 @@ FailureOr<SmallVector<Operation *>> mlir::scf::yieldReplacementForFusedProducer(
     auto tilableOp = cast<TilingInterface>(originalOwner);
     // b. get iterDomain Offset and Sizes based on sliceOp tile
     SmallVector<OpFoldResult> iterDomainOffset, iterDomainSizes;
+    // Set insertion point before any operations that might create new SSA
+    // values used in offset/size computations. This ensures all values created
+    // by getIterationDomainTileFromResultTile and getResultTilePosition
+    // dominate the extract_slice operations created later.
+    if (auto tiledDestStyleOp =
+            dyn_cast<DestinationStyleOpInterface>(tiledOwner)) {
+      rewriter.setInsertionPoint(tiledDestStyleOp);
+    }
     // skip tensor.pack/unpack/pad, which expects single opResult
     if (tilableOp->getNumResults() > 1 &&
         failed(tilableOp.getIterationDomainTileFromResultTile(
@@ -1550,7 +1558,6 @@ FailureOr<SmallVector<Operation *>> mlir::scf::yieldReplacementForFusedProducer(
     // necessary
     if (auto tiledDestStyleOp =
             dyn_cast<DestinationStyleOpInterface>(tiledOwner)) {
-      rewriter.setInsertionPoint(tiledDestStyleOp);
       for (const auto &&[index, newRegionArg] :
            llvm::enumerate(newRegionIterArgs)) {
         auto destSlice = tensor::ExtractSliceOp::create(
diff --git a/mlir/test/Interfaces/TilingInterface/tile-fuse-and-yield-using-scfforall.mlir b/mlir/test/Interfaces/TilingInterface/tile-fuse-and-yield-using-scfforall.mlir
index 8fc8f3245be15..3de6c9cb2b398 100644
--- a/mlir/test/Interfaces/TilingInterface/tile-fuse-and-yield-using-scfforall.mlir
+++ b/mlir/test/Interfaces/TilingInterface/tile-fuse-and-yield-using-scfforall.mlir
@@ -1,4 +1,4 @@
-// RUN: mlir-opt -transform-interpreter -cse -split-input-file %s | FileCheck %s
+// RUN: mlir-opt -transform-interpreter -cse -canonicalize -split-input-file %s | FileCheck %s
 
 func.func @gemm_gemm_fusion_yield_both(%lhs0 : tensor<?x?xf32>, %rhs0 : tensor<?x?xf32>, %rhs1 : tensor<?x?xf32>,
     %init0 : tensor<?x?xf32>, %init1 : tensor<?x?xf32>)
@@ -58,3 +58,71 @@ module attributes {transform.with_named_sequence} {
 //      CHECK:       tensor.parallel_insert_slice %[[GEMM1_TILE]] into %[[ITERARG0]][%[[IV]], 0]
 //      CHECK:       tensor.parallel_insert_slice %[[GEMM0_TILE]] into %[[ITERARG1]][%[[IV]], 0]
 //      CHECK:   return %[[RESULT]]#1, %[[RESULT]]#0
+
+// -----
+
+func.func @fuse_pack_consumer_into_multi_output_generic(
+    %input: tensor<32x1024xf32>) -> (tensor<32x1024xf32>, tensor<2x512x16x2xi8>) {
+  %c0_i8 = arith.constant 0 : i8
+  %output_f32 = tensor.empty() : tensor<32x1024xf32>
+  %output_i8 = tensor.empty() : tensor<32x1024xi8>
+  %pack_dest = tensor.empty() : tensor<2x512x16x2xi8>
+
+  %gen:2 = linalg.generic {
+    indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>,
+                     affine_map<(d0, d1) -> (d0, d1)>,
+                     affine_map<(d0, d1) -> (d0, d1)>],
+    iterator_types = ["parallel", "parallel"]
+  } ins(%input : tensor<32x1024xf32>)
+    outs(%output_f32, %output_i8 : tensor<32x1024xf32>, tensor<32x1024xi8>) {
+  ^bb0(%in: f32, %out_f: f32, %out_i: i8):
+    %q = arith.fptoui %in : f32 to i8
+    linalg.yield %in, %q : f32, i8
+  } -> (tensor<32x1024xf32>, tensor<32x1024xi8>)
+
+  %pack = linalg.pack %gen#1
+    padding_value(%c0_i8 : i8)
+    inner_dims_pos = [0, 1]
+    inner_tiles = [16, 2]
+    into %pack_dest : tensor<32x1024xi8> -> tensor<2x512x16x2xi8>
+
+  return %gen#0, %pack : tensor<32x1024xf32>, tensor<2x512x16x2xi8>
+}
+
+module attributes {transform.with_named_sequence} {
+  transform.named_sequence @__transform_main(%arg0 : !transform.any_op {transform.readonly}) {
+    %pack = transform.structured.match ops{["linalg.pack"]} in %arg0
+      : (!transform.any_op) -> !transform.any_op
+    %a, %b = transform.test.fuse_and_yield %pack [1] use_forall true
+      : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+    transform.yield
+  }
+}
+//      CHECK: #[[$MAP0:.+]] = affine_map<(d0) -> (d0 * 16)>
+//      CHECK: #[[$MAP1:.+]] = affine_map<(d0) -> (d0 * -16 + 32, 16)>
+//      CHECK: #[[$MAP2:.+]] = affine_map<(d0, d1) -> (d0, d1)>
+//      CHECK: func.func @fuse_pack_consumer_into_multi_output_generic(
+// CHECK-SAME:     %[[INPUT:[a-zA-Z0-9]+]]: tensor<32x1024xf32>)
+//  CHECK-DAG:   %[[C0_I8:.+]] = arith.constant 0 : i8
+//  CHECK-DAG:   %[[OUTPUT_F32:.+]] = tensor.empty() : tensor<32x1024xf32>
+//  CHECK-DAG:   %[[OUTPUT_I8:.+]] = tensor.empty() : tensor<32x1024xi8>
+//  CHECK-DAG:   %[[PACK_DEST:.+]] = tensor.empty() : tensor<2x512x16x2xi8>
+//      CHECK:   %[[RESULT:.+]]:2 = scf.forall (%[[IV:.+]]) in (2)
+// CHECK-SAME:       shared_outs(%[[ITERARG0:[a-zA-Z0-9]+]] = %[[PACK_DEST]], %[[ITERARG1:[a-zA-Z0-9]+]] = %[[OUTPUT_F32]])
+//      CHECK:     %[[OFFSET:.+]] = affine.apply #[[$MAP0]](%[[IV]])
+//      CHECK:     %[[SIZE:.+]] = affine.min #[[$MAP1]](%[[IV]])
+//  CHECK-DAG:     %[[INPUT_TILE:.+]] = tensor.extract_slice %[[INPUT]][%[[OFFSET]], 0] [%[[SIZE]], 1024]
+//  CHECK-DAG:     %[[F32_TILE:.+]] = tensor.extract_slice %[[ITERARG1]][%[[OFFSET]], 0] [%[[SIZE]], 1024]
+//  CHECK-DAG:     %[[I8_TILE:.+]] = tensor.extract_slice %[[OUTPUT_I8]][%[[OFFSET]], 0] [%[[SIZE]], 1024]
+//      CHECK:     %[[GENERIC_TILE:.+]]:2 = linalg.generic
+// CHECK-SAME:         ins(%[[INPUT_TILE]] :
+// CHECK-SAME:         outs(%[[F32_TILE]], %[[I8_TILE]] :
+//  CHECK-DAG:     %[[PACK_DEST_TILE:.+]] = tensor.extract_slice %[[ITERARG0]][%[[IV]], 0, 0, 0] [1, 512, 16, 2]
+//      CHECK:     %[[PACK_TILE:.+]] = linalg.pack %[[GENERIC_TILE]]#1
+// CHECK-SAME:         padding_value(%[[C0_I8]] : i8)
+// CHECK-SAME:         inner_dims_pos = [0, 1] inner_tiles = [16, 2]
+// CHECK-SAME:         into %[[PACK_DEST_TILE]]
+//      CHECK:     scf.forall.in_parallel {
+//      CHECK:       tensor.parallel_insert_slice %[[PACK_TILE]] into %[[ITERARG0]][%[[IV]], 0, 0, 0] [1, 512, 16, 2]
+//      CHECK:       tensor.parallel_insert_slice %[[GENERIC_TILE]]#0 into %[[ITERARG1]][%[[OFFSET]], 0] [%[[SIZE]], 1024]
+//      CHECK:   return %[[RESULT]]#1, %[[RESULT]]#0
diff --git a/utils/bazel/llvm-project-overlay/mlir/test/Interfaces/BUILD.bazel b/utils/bazel/llvm-project-overlay/mlir/test/Interfaces/BUILD.bazel
new file mode 100644
index 0000000000000..ef41be3ebc865
--- /dev/null
+++ b/utils/bazel/llvm-project-overlay/mlir/test/Interfaces/BUILD.bazel
@@ -0,0 +1,20 @@
+load("//llvm:lit_test.bzl", "lit_test")
+
+licenses(["notice"])
+
+package(default_visibility = ["//visibility:public"])
+
+[
+    lit_test(
+        name = "%s.test" % src,
+        srcs = [src],
+        data = [
+            "//llvm:llvm-symbolizer",
+            "//mlir:mlir-opt",
+            "//mlir/test:lit_data",
+        ],
+    )
+    for src in glob(
+        include = ["**/*.mlir"],
+    )
+]

MaheshRavishankar

Thanks!

ziereis · 2026-01-30T16:39:50Z

@rupprecht @keith @aaronmondal Hi! Could anyone take a look at this, please?

aaronmondal · 2026-01-30T17:34:05Z

Ah seems like an oversight that this didn't exist already. Thanks!

ziereis · 2026-02-02T08:27:31Z

If this is good to go, I would be happy if someone could merge it. I don't have write access myself.

In this PR i move the insertion point in the `yieldReplacementForFusedProducer` because i ran into some issue where a `tensor.extract_slices` tried to use a result of `affine.apply` that was inserted at the end of the block instead of the start of it. This is the full error of the test i added before this change: ```mlir third-party/llvm-project/mlir/test/Interfaces/TilingInterface/tile-fuse-and-yield-using-scfforall.mlir:83:11: error: operand llvm#1 does not dominate this use %pack = linalg.pack %gen#1 ^ third-party/llvm-project/mlir/test/Interfaces/TilingInterface/tile-fuse-and-yield-using-scfforall.mlir:83:11: note: see current operation: %24 = "tensor.extract_slice"(%23, %36, %8) <{operandSegmentSizes = array<i32: 1, 1, 1, 0>, static_offsets = array<i64: -9223372036854775808, 0>, static_sizes = array<i64: -9223372036854775808, 1024>, static_strides = array<i64: 1, 1>}> : (tensor<32x1024xf32>, index, index) -> tensor<?x1024xf32> third-party/llvm-project/mlir/test/Interfaces/TilingInterface/tile-fuse-and-yield-using-scfforall.mlir:71:12: note: operand defined here (op in the same block) %gen:2 = linalg.generic { ^ // -----// IR Dump After InterpreterPass Failed (transform-interpreter) //----- // #map = affine_map<(d0, d1) -> (d0, d1)> #map1 = affine_map<(d0) -> (d0 * 16)> #map2 = affine_map<(d0) -> (d0 * -16 + 32)> #map3 = affine_map<(d0) -> (16, d0 * -16 + 32)> #map4 = affine_map<(d0) -> (d0 - 1)> "builtin.module"() ({ "func.func"() <{function_type = (tensor<32x1024xf32>) -> (tensor<32x1024xf32>, tensor<2x512x16x2xi8>), sym_name = "fuse_pack_consumer_into_multi_output_generic"}> ({ ^bb0(%arg1: tensor<32x1024xf32>): %2 = "arith.constant"() <{value = 0 : i8}> : () -> i8 %3 = "tensor.empty"() : () -> tensor<32x1024xf32> %4 = "tensor.empty"() : () -> tensor<32x1024xi8> %5 = "tensor.empty"() : () -> tensor<2x512x16x2xi8> %6:2 = "linalg.generic"(%arg1, %3, %4) <{indexing_maps = [#map, #map, #map], iterator_types = [#linalg.iterator_type<parallel>, 
#linalg.iterator_type<parallel>], operandSegmentSizes = array<i32: 1, 2>}> ({ ^bb0(%arg9: f32, %arg10: f32, %arg11: i8): %41 = "arith.fptoui"(%arg9) : (f32) -> i8 "linalg.yield"(%arg9, %41) : (f32, i8) -> () }) : (tensor<32x1024xf32>, tensor<32x1024xf32>, tensor<32x1024xi8>) -> (tensor<32x1024xf32>, tensor<32x1024xi8>) %7:3 = "scf.forall"(%5, %3, %4) <{operandSegmentSizes = array<i32: 0, 0, 0, 3>, staticLowerBound = array<i64: 0>, staticStep = array<i64: 1>, staticUpperBound = array<i64: 2>}> ({ ^bb0(%arg2: index, %arg3: tensor<2x512x16x2xi8>, %arg4: tensor<32x1024xf32>, %arg5: tensor<32x1024xi8>): %8 = "affine.apply"(%arg2) <{map = #map1}> : (index) -> index %9 = "affine.apply"(%arg2) <{map = #map2}> : (index) -> index %10 = "affine.min"(%arg2) <{map = #map3}> : (index) -> index %11 = "affine.apply"(%10) <{map = #map4}> : (index) -> index %12 = "affine.apply"(%arg2) <{map = #map1}> : (index) -> index %13 = "affine.apply"(%10) <{map = #map4}> : (index) -> index %14 = "affine.apply"(%arg2) <{map = #map1}> : (index) -> index %15 = "affine.apply"(%10) <{map = #map4}> : (index) -> index %16 = "affine.apply"(%arg2) <{map = #map1}> : (index) -> index %17 = "affine.apply"(%10) <{map = #map4}> : (index) -> index %18 = "tensor.extract_slice"(%arg1, %12, %10) <{operandSegmentSizes = array<i32: 1, 1, 1, 0>, static_offsets = array<i64: -9223372036854775808, 0>, static_sizes = array<i64: -9223372036854775808, 1024>, static_strides = array<i64: 1, 1>}> : (tensor<32x1024xf32>, index, index) -> tensor<?x1024xf32> %19 = "tensor.empty"() : () -> tensor<32x1024xf32> %20 = "tensor.extract_slice"(%19, %14, %10) <{operandSegmentSizes = array<i32: 1, 1, 1, 0>, static_offsets = array<i64: -9223372036854775808, 0>, static_sizes = array<i64: -9223372036854775808, 1024>, static_strides = array<i64: 1, 1>}> : (tensor<32x1024xf32>, index, index) -> tensor<?x1024xf32> %21 = "tensor.extract_slice"(%3, %14, %10) <{operandSegmentSizes = array<i32: 1, 1, 1, 0>, static_offsets = array<i64: 
-9223372036854775808, 0>, static_sizes = array<i64: -9223372036854775808, 1024>, static_strides = array<i64: 1, 1>}> : (tensor<32x1024xf32>, index, index) -> tensor<?x1024xf32> %22 = "tensor.empty"() : () -> tensor<32x1024xi8> %23 = "tensor.extract_slice"(%22, %16, %10) <{operandSegmentSizes = array<i32: 1, 1, 1, 0>, static_offsets = array<i64: -9223372036854775808, 0>, static_sizes = array<i64: -9223372036854775808, 1024>, static_strides = array<i64: 1, 1>}> : (tensor<32x1024xi8>, index, index) -> tensor<?x1024xi8> %24 = "tensor.extract_slice"(%4, %16, %10) <{operandSegmentSizes = array<i32: 1, 1, 1, 0>, static_offsets = array<i64: -9223372036854775808, 0>, static_sizes = array<i64: -9223372036854775808, 1024>, static_strides = array<i64: 1, 1>}> : (tensor<32x1024xi8>, index, index) -> tensor<?x1024xi8> %25 = "tensor.empty"() : () -> tensor<32x1024xf32> %26 = "tensor.extract_slice"(%25, %38, %10) <{operandSegmentSizes = array<i32: 1, 1, 1, 0>, static_offsets = array<i64: -9223372036854775808, 0>, static_sizes = array<i64: -9223372036854775808, 1024>, static_strides = array<i64: 1, 1>}> : (tensor<32x1024xf32>, index, index) -> tensor<?x1024xf32> %27 = "tensor.extract_slice"(%arg4, %38, %10) <{operandSegmentSizes = array<i32: 1, 1, 1, 0>, static_offsets = array<i64: -9223372036854775808, 0>, static_sizes = array<i64: -9223372036854775808, 1024>, static_strides = array<i64: 1, 1>}> : (tensor<32x1024xf32>, index, index) -> tensor<?x1024xf32> %28 = "tensor.empty"() : () -> tensor<32x1024xi8> %29 = "tensor.extract_slice"(%28, %8, %10) <{operandSegmentSizes = array<i32: 1, 1, 1, 0>, static_offsets = array<i64: -9223372036854775808, 0>, static_sizes = array<i64: -9223372036854775808, 1024>, static_strides = array<i64: 1, 1>}> : (tensor<32x1024xi8>, index, index) -> tensor<?x1024xi8> %30 = "tensor.extract_slice"(%arg5, %8, %10) <{operandSegmentSizes = array<i32: 1, 1, 1, 0>, static_offsets = array<i64: -9223372036854775808, 0>, static_sizes = array<i64: 
-9223372036854775808, 1024>, static_strides = array<i64: 1, 1>}> : (tensor<32x1024xi8>, index, index) -> tensor<?x1024xi8> %31:2 = "linalg.generic"(%18, %27, %30) <{indexing_maps = [#map, #map, #map], iterator_types = [#linalg.iterator_type<parallel>, #linalg.iterator_type<parallel>], operandSegmentSizes = array<i32: 1, 2>}> ({ ^bb0(%arg6: f32, %arg7: f32, %arg8: i8): %40 = "arith.fptoui"(%arg6) : (f32) -> i8 "linalg.yield"(%arg6, %40) : (f32, i8) -> () }) : (tensor<?x1024xf32>, tensor<?x1024xf32>, tensor<?x1024xi8>) -> (tensor<?x1024xf32>, tensor<?x1024xi8>) %32 = "tensor.extract_slice"(%6#1, %8, %10) <{operandSegmentSizes = array<i32: 1, 1, 1, 0>, static_offsets = array<i64: -9223372036854775808, 0>, static_sizes = array<i64: -9223372036854775808, 1024>, static_strides = array<i64: 1, 1>}> : (tensor<32x1024xi8>, index, index) -> tensor<?x1024xi8> %33 = "tensor.empty"() : () -> tensor<2x512x16x2xi8> %34 = "tensor.extract_slice"(%33, %arg2) <{operandSegmentSizes = array<i32: 1, 1, 0, 0>, static_offsets = array<i64: -9223372036854775808, 0, 0, 0>, static_sizes = array<i64: 1, 512, 16, 2>, static_strides = array<i64: 1, 1, 1, 1>}> : (tensor<2x512x16x2xi8>, index) -> tensor<1x512x16x2xi8> %35 = "tensor.extract_slice"(%arg3, %arg2) <{operandSegmentSizes = array<i32: 1, 1, 0, 0>, static_offsets = array<i64: -9223372036854775808, 0, 0, 0>, static_sizes = array<i64: 1, 512, 16, 2>, static_strides = array<i64: 1, 1, 1, 1>}> : (tensor<2x512x16x2xi8>, index) -> tensor<1x512x16x2xi8> %36 = "linalg.pack"(%31#1, %35, %2) <{inner_dims_pos = array<i64: 0, 1>, operandSegmentSizes = array<i32: 1, 1, 1, 0>, static_inner_tiles = array<i64: 16, 2>}> : (tensor<?x1024xi8>, tensor<1x512x16x2xi8>, i8) -> tensor<1x512x16x2xi8> %37 = "affine.apply"(%10) <{map = #map4}> : (index) -> index %38 = "affine.apply"(%arg2) <{map = #map1}> : (index) -> index %39 = "affine.apply"(%10) <{map = #map4}> : (index) -> index "scf.forall.in_parallel"() ({ "tensor.parallel_insert_slice"(%36, %arg3, %arg2) 
<{operandSegmentSizes = array<i32: 1, 1, 1, 0, 0>, static_offsets = array<i64: -9223372036854775808, 0, 0, 0>, static_sizes = array<i64: 1, 512, 16, 2>, static_strides = array<i64: 1, 1, 1, 1>}> : (tensor<1x512x16x2xi8>, tensor<2x512x16x2xi8>, index) -> () "tensor.parallel_insert_slice"(%31#0, %arg4, %38, %10) <{operandSegmentSizes = array<i32: 1, 1, 1, 1, 0>, static_offsets = array<i64: -9223372036854775808, 0>, static_sizes = array<i64: -9223372036854775808, 1024>, static_strides = array<i64: 1, 1>}> : (tensor<?x1024xf32>, tensor<32x1024xf32>, index, index) -> () "tensor.parallel_insert_slice"(%31#1, %arg5, %8, %10) <{operandSegmentSizes = array<i32: 1, 1, 1, 1, 0>, static_offsets = array<i64: -9223372036854775808, 0>, static_sizes = array<i64: -9223372036854775808, 1024>, static_strides = array<i64: 1, 1>}> : (tensor<?x1024xi8>, tensor<32x1024xi8>, index, index) -> () }) : () -> () }) : (tensor<2x512x16x2xi8>, tensor<32x1024xf32>, tensor<32x1024xi8>) -> (tensor<2x512x16x2xi8>, tensor<32x1024xf32>, tensor<32x1024xi8>) "func.return"(%7#1, %7#0) : (tensor<32x1024xf32>, tensor<2x512x16x2xi8>) -> () }) : () -> () "builtin.module"() ({ "transform.named_sequence"() <{arg_attrs = [{transform.readonly}], function_type = (!transform.any_op) -> (), sym_name = "__transform_main"}> ({ ^bb0(%arg0: !transform.any_op): %0 = "transform.structured.match"(%arg0) <{ops = ["linalg.pack"]}> : (!transform.any_op) -> !transform.any_op %1:2 = "transform.test.fuse_and_yield"(%0) <{tile_interchange = [], tile_sizes = [1], use_forall = true}> : (!transform.any_op) -> (!transform.any_op, !transform.any_op) "transform.yield"() : () -> () }) : () -> () }) {transform.with_named_sequence} : () -> () }) : () -> () ``` I also noticed that the Interface tests are missing from the Bazel overlay, so I added them as well.

fix(tile-and-fuse): fix bug in tile-and-fuse that causes dom issues

040f282

ziereis requested review from aaronmondal, keith and rupprecht as code owners January 27, 2026 12:37

llvmbot added mlir mlir:scf bazel "Peripheral" support tier build system: utils/bazel labels Jan 27, 2026

ziereis mentioned this pull request Jan 27, 2026

[CPU] Failed in TileAndFuse: operand #0 does not dominate this use iree-org/iree#21843

Open

ziereis changed the title ~~TileUsingInterface bugfix for dominance error~~ [mlir] TileUsingInterface bugfix for dominance error Jan 27, 2026

hanhanW requested a review from MaheshRavishankar January 28, 2026 01:50

MaheshRavishankar approved these changes Jan 28, 2026

View reviewed changes

aaronmondal approved these changes Jan 30, 2026

View reviewed changes

MaheshRavishankar merged commit d376a7e into llvm:main Feb 2, 2026
17 checks passed

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

[mlir] TileUsingInterface bugfix for dominance error#178190

[mlir] TileUsingInterface bugfix for dominance error#178190
MaheshRavishankar merged 1 commit into
llvm:main from
ziereis:fix-tile-and-fuse-insert-point

ziereis commented Jan 27, 2026

Uh oh!

llvmbot commented Jan 27, 2026 •

edited

Loading

Uh oh!

MaheshRavishankar left a comment

Uh oh!

ziereis commented Jan 30, 2026

Uh oh!

aaronmondal Jan 30, 2026

Uh oh!

ziereis commented Feb 2, 2026

Uh oh!

Uh oh!

Reviewers

Assignees

Labels

Projects

Milestone

Development

Uh oh!

4 participants

Conversation

ziereis commented Jan 27, 2026

Uh oh!

llvmbot commented Jan 27, 2026 • edited Loading Uh oh! There was an error while loading. Please reload this page.

Uh oh!

Uh oh!

MaheshRavishankar left a comment

Choose a reason for hiding this comment

Uh oh!

ziereis commented Jan 30, 2026

Uh oh!

aaronmondal Jan 30, 2026

Choose a reason for hiding this comment

Uh oh!

ziereis commented Feb 2, 2026

Uh oh!

Uh oh!

Reviewers

Assignees

Labels

Projects

Milestone

Development

Uh oh!

4 participants

llvmbot commented Jan 27, 2026 •

edited

Loading