Skip to content

[mlir] TileUsingInterface bugfix for dominance error#178190

Merged
MaheshRavishankar merged 1 commit into
llvm:mainfrom
ziereis:fix-tile-and-fuse-insert-point
Feb 2, 2026
Merged

[mlir] TileUsingInterface bugfix for dominance error#178190
MaheshRavishankar merged 1 commit into
llvm:mainfrom
ziereis:fix-tile-and-fuse-insert-point

Conversation

@ziereis

@ziereis ziereis commented Jan 27, 2026

Copy link
Copy Markdown
Contributor

In this PR I move the insertion point in yieldReplacementForFusedProducer because I ran into an issue where a tensor.extract_slice tried to use the result of an affine.apply that was inserted at the end of the block instead of at the start of it.

This is the full error from the test I added, as produced before this change:

third-party/llvm-project/mlir/test/Interfaces/TilingInterface/tile-fuse-and-yield-using-scfforall.mlir:83:11: error: operand #1 does not dominate this use
  %pack = linalg.pack %gen#1
          ^
third-party/llvm-project/mlir/test/Interfaces/TilingInterface/tile-fuse-and-yield-using-scfforall.mlir:83:11: note: see current operation: %24 = "tensor.extract_slice"(%23, %36, %8) <{operandSegmentSizes = array<i32: 1, 1, 1, 0>, static_offsets = array<i64: -9223372036854775808, 0>, static_sizes = array<i64: -9223372036854775808, 1024>, static_strides = array<i64: 1, 1>}> : (tensor<32x1024xf32>, index, index) -> tensor<?x1024xf32>
third-party/llvm-project/mlir/test/Interfaces/TilingInterface/tile-fuse-and-yield-using-scfforall.mlir:71:12: note: operand defined here (op in the same block)
  %gen:2 = linalg.generic {
           ^
// -----// IR Dump After InterpreterPass Failed (transform-interpreter) //----- //
#map = affine_map<(d0, d1) -> (d0, d1)>
#map1 = affine_map<(d0) -> (d0 * 16)>
#map2 = affine_map<(d0) -> (d0 * -16 + 32)>
#map3 = affine_map<(d0) -> (16, d0 * -16 + 32)>
#map4 = affine_map<(d0) -> (d0 - 1)>
"builtin.module"() ({
  "func.func"() <{function_type = (tensor<32x1024xf32>) -> (tensor<32x1024xf32>, tensor<2x512x16x2xi8>), sym_name = "fuse_pack_consumer_into_multi_output_generic"}> ({
  ^bb0(%arg1: tensor<32x1024xf32>):
    %2 = "arith.constant"() <{value = 0 : i8}> : () -> i8
    %3 = "tensor.empty"() : () -> tensor<32x1024xf32>
    %4 = "tensor.empty"() : () -> tensor<32x1024xi8>
    %5 = "tensor.empty"() : () -> tensor<2x512x16x2xi8>
    %6:2 = "linalg.generic"(%arg1, %3, %4) <{indexing_maps = [#map, #map, #map], iterator_types = [#linalg.iterator_type<parallel>, #linalg.iterator_type<parallel>], operandSegmentSizes = array<i32: 1, 2>}> ({
    ^bb0(%arg9: f32, %arg10: f32, %arg11: i8):
      %41 = "arith.fptoui"(%arg9) : (f32) -> i8
      "linalg.yield"(%arg9, %41) : (f32, i8) -> ()
    }) : (tensor<32x1024xf32>, tensor<32x1024xf32>, tensor<32x1024xi8>) -> (tensor<32x1024xf32>, tensor<32x1024xi8>)
    %7:3 = "scf.forall"(%5, %3, %4) <{operandSegmentSizes = array<i32: 0, 0, 0, 3>, staticLowerBound = array<i64: 0>, staticStep = array<i64: 1>, staticUpperBound = array<i64: 2>}> ({
    ^bb0(%arg2: index, %arg3: tensor<2x512x16x2xi8>, %arg4: tensor<32x1024xf32>, %arg5: tensor<32x1024xi8>):
      %8 = "affine.apply"(%arg2) <{map = #map1}> : (index) -> index
      %9 = "affine.apply"(%arg2) <{map = #map2}> : (index) -> index
      %10 = "affine.min"(%arg2) <{map = #map3}> : (index) -> index
      %11 = "affine.apply"(%10) <{map = #map4}> : (index) -> index
      %12 = "affine.apply"(%arg2) <{map = #map1}> : (index) -> index
      %13 = "affine.apply"(%10) <{map = #map4}> : (index) -> index
      %14 = "affine.apply"(%arg2) <{map = #map1}> : (index) -> index
      %15 = "affine.apply"(%10) <{map = #map4}> : (index) -> index
      %16 = "affine.apply"(%arg2) <{map = #map1}> : (index) -> index
      %17 = "affine.apply"(%10) <{map = #map4}> : (index) -> index
      %18 = "tensor.extract_slice"(%arg1, %12, %10) <{operandSegmentSizes = array<i32: 1, 1, 1, 0>, static_offsets = array<i64: -9223372036854775808, 0>, static_sizes = array<i64: -9223372036854775808, 1024>, static_strides = array<i64: 1, 1>}> : (tensor<32x1024xf32>, index, index) -> tensor<?x1024xf32>
      %19 = "tensor.empty"() : () -> tensor<32x1024xf32>
      %20 = "tensor.extract_slice"(%19, %14, %10) <{operandSegmentSizes = array<i32: 1, 1, 1, 0>, static_offsets = array<i64: -9223372036854775808, 0>, static_sizes = array<i64: -9223372036854775808, 1024>, static_strides = array<i64: 1, 1>}> : (tensor<32x1024xf32>, index, index) -> tensor<?x1024xf32>
      %21 = "tensor.extract_slice"(%3, %14, %10) <{operandSegmentSizes = array<i32: 1, 1, 1, 0>, static_offsets = array<i64: -9223372036854775808, 0>, static_sizes = array<i64: -9223372036854775808, 1024>, static_strides = array<i64: 1, 1>}> : (tensor<32x1024xf32>, index, index) -> tensor<?x1024xf32>
      %22 = "tensor.empty"() : () -> tensor<32x1024xi8>
      %23 = "tensor.extract_slice"(%22, %16, %10) <{operandSegmentSizes = array<i32: 1, 1, 1, 0>, static_offsets = array<i64: -9223372036854775808, 0>, static_sizes = array<i64: -9223372036854775808, 1024>, static_strides = array<i64: 1, 1>}> : (tensor<32x1024xi8>, index, index) -> tensor<?x1024xi8>
      %24 = "tensor.extract_slice"(%4, %16, %10) <{operandSegmentSizes = array<i32: 1, 1, 1, 0>, static_offsets = array<i64: -9223372036854775808, 0>, static_sizes = array<i64: -9223372036854775808, 1024>, static_strides = array<i64: 1, 1>}> : (tensor<32x1024xi8>, index, index) -> tensor<?x1024xi8>
      %25 = "tensor.empty"() : () -> tensor<32x1024xf32>
      %26 = "tensor.extract_slice"(%25, %38, %10) <{operandSegmentSizes = array<i32: 1, 1, 1, 0>, static_offsets = array<i64: -9223372036854775808, 0>, static_sizes = array<i64: -9223372036854775808, 1024>, static_strides = array<i64: 1, 1>}> : (tensor<32x1024xf32>, index, index) -> tensor<?x1024xf32>
      %27 = "tensor.extract_slice"(%arg4, %38, %10) <{operandSegmentSizes = array<i32: 1, 1, 1, 0>, static_offsets = array<i64: -9223372036854775808, 0>, static_sizes = array<i64: -9223372036854775808, 1024>, static_strides = array<i64: 1, 1>}> : (tensor<32x1024xf32>, index, index) -> tensor<?x1024xf32>
      %28 = "tensor.empty"() : () -> tensor<32x1024xi8>
      %29 = "tensor.extract_slice"(%28, %8, %10) <{operandSegmentSizes = array<i32: 1, 1, 1, 0>, static_offsets = array<i64: -9223372036854775808, 0>, static_sizes = array<i64: -9223372036854775808, 1024>, static_strides = array<i64: 1, 1>}> : (tensor<32x1024xi8>, index, index) -> tensor<?x1024xi8>
      %30 = "tensor.extract_slice"(%arg5, %8, %10) <{operandSegmentSizes = array<i32: 1, 1, 1, 0>, static_offsets = array<i64: -9223372036854775808, 0>, static_sizes = array<i64: -9223372036854775808, 1024>, static_strides = array<i64: 1, 1>}> : (tensor<32x1024xi8>, index, index) -> tensor<?x1024xi8>
      %31:2 = "linalg.generic"(%18, %27, %30) <{indexing_maps = [#map, #map, #map], iterator_types = [#linalg.iterator_type<parallel>, #linalg.iterator_type<parallel>], operandSegmentSizes = array<i32: 1, 2>}> ({
      ^bb0(%arg6: f32, %arg7: f32, %arg8: i8):
        %40 = "arith.fptoui"(%arg6) : (f32) -> i8
        "linalg.yield"(%arg6, %40) : (f32, i8) -> ()
      }) : (tensor<?x1024xf32>, tensor<?x1024xf32>, tensor<?x1024xi8>) -> (tensor<?x1024xf32>, tensor<?x1024xi8>)
      %32 = "tensor.extract_slice"(%6#1, %8, %10) <{operandSegmentSizes = array<i32: 1, 1, 1, 0>, static_offsets = array<i64: -9223372036854775808, 0>, static_sizes = array<i64: -9223372036854775808, 1024>, static_strides = array<i64: 1, 1>}> : (tensor<32x1024xi8>, index, index) -> tensor<?x1024xi8>
      %33 = "tensor.empty"() : () -> tensor<2x512x16x2xi8>
      %34 = "tensor.extract_slice"(%33, %arg2) <{operandSegmentSizes = array<i32: 1, 1, 0, 0>, static_offsets = array<i64: -9223372036854775808, 0, 0, 0>, static_sizes = array<i64: 1, 512, 16, 2>, static_strides = array<i64: 1, 1, 1, 1>}> : (tensor<2x512x16x2xi8>, index) -> tensor<1x512x16x2xi8>
      %35 = "tensor.extract_slice"(%arg3, %arg2) <{operandSegmentSizes = array<i32: 1, 1, 0, 0>, static_offsets = array<i64: -9223372036854775808, 0, 0, 0>, static_sizes = array<i64: 1, 512, 16, 2>, static_strides = array<i64: 1, 1, 1, 1>}> : (tensor<2x512x16x2xi8>, index) -> tensor<1x512x16x2xi8>
      %36 = "linalg.pack"(%31#1, %35, %2) <{inner_dims_pos = array<i64: 0, 1>, operandSegmentSizes = array<i32: 1, 1, 1, 0>, static_inner_tiles = array<i64: 16, 2>}> : (tensor<?x1024xi8>, tensor<1x512x16x2xi8>, i8) -> tensor<1x512x16x2xi8>
      %37 = "affine.apply"(%10) <{map = #map4}> : (index) -> index
      %38 = "affine.apply"(%arg2) <{map = #map1}> : (index) -> index
      %39 = "affine.apply"(%10) <{map = #map4}> : (index) -> index
      "scf.forall.in_parallel"() ({
        "tensor.parallel_insert_slice"(%36, %arg3, %arg2) <{operandSegmentSizes = array<i32: 1, 1, 1, 0, 0>, static_offsets = array<i64: -9223372036854775808, 0, 0, 0>, static_sizes = array<i64: 1, 512, 16, 2>, static_strides = array<i64: 1, 1, 1, 1>}> : (tensor<1x512x16x2xi8>, tensor<2x512x16x2xi8>, index) -> ()
        "tensor.parallel_insert_slice"(%31#0, %arg4, %38, %10) <{operandSegmentSizes = array<i32: 1, 1, 1, 1, 0>, static_offsets = array<i64: -9223372036854775808, 0>, static_sizes = array<i64: -9223372036854775808, 1024>, static_strides = array<i64: 1, 1>}> : (tensor<?x1024xf32>, tensor<32x1024xf32>, index, index) -> ()
        "tensor.parallel_insert_slice"(%31#1, %arg5, %8, %10) <{operandSegmentSizes = array<i32: 1, 1, 1, 1, 0>, static_offsets = array<i64: -9223372036854775808, 0>, static_sizes = array<i64: -9223372036854775808, 1024>, static_strides = array<i64: 1, 1>}> : (tensor<?x1024xi8>, tensor<32x1024xi8>, index, index) -> ()
      }) : () -> ()
    }) : (tensor<2x512x16x2xi8>, tensor<32x1024xf32>, tensor<32x1024xi8>) -> (tensor<2x512x16x2xi8>, tensor<32x1024xf32>, tensor<32x1024xi8>)
    "func.return"(%7#1, %7#0) : (tensor<32x1024xf32>, tensor<2x512x16x2xi8>) -> ()
  }) : () -> ()
  "builtin.module"() ({
    "transform.named_sequence"() <{arg_attrs = [{transform.readonly}], function_type = (!transform.any_op) -> (), sym_name = "__transform_main"}> ({
    ^bb0(%arg0: !transform.any_op):
      %0 = "transform.structured.match"(%arg0) <{ops = ["linalg.pack"]}> : (!transform.any_op) -> !transform.any_op
      %1:2 = "transform.test.fuse_and_yield"(%0) <{tile_interchange = [], tile_sizes = [1], use_forall = true}> : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
      "transform.yield"() : () -> ()
    }) : () -> ()
  }) {transform.with_named_sequence} : () -> ()
}) : () -> ()

I also noticed that the Interfaces tests are missing from the Bazel overlay, so I added them as well.

@llvmbot llvmbot added mlir mlir:scf bazel "Peripheral" support tier build system: utils/bazel labels Jan 27, 2026
@llvmbot

llvmbot commented Jan 27, 2026

Copy link
Copy Markdown
Member

@llvm/pr-subscribers-mlir-scf

@llvm/pr-subscribers-mlir

Author: None (ziereis)

Changes

In this PR I move the insertion point in yieldReplacementForFusedProducer because I ran into an issue where a tensor.extract_slice tried to use the result of an affine.apply that was inserted at the end of the block instead of at the start of it.

This is the full error from the test I added, as produced before this change:

third-party/llvm-project/mlir/test/Interfaces/TilingInterface/tile-fuse-and-yield-using-scfforall.mlir:83:11: error: operand #1 does not dominate this use
  %pack = linalg.pack %gen#1
          ^
third-party/llvm-project/mlir/test/Interfaces/TilingInterface/tile-fuse-and-yield-using-scfforall.mlir:83:11: note: see current operation: %24 = "tensor.extract_slice"(%23, %36, %8) <{operandSegmentSizes = array<i32: 1, 1, 1, 0>, static_offsets = array<i64: -9223372036854775808, 0>, static_sizes = array<i64: -9223372036854775808, 1024>, static_strides = array<i64: 1, 1>}> : (tensor<32x1024xf32>, index, index) -> tensor<?x1024xf32>
third-party/llvm-project/mlir/test/Interfaces/TilingInterface/tile-fuse-and-yield-using-scfforall.mlir:71:12: note: operand defined here (op in the same block)
  %gen:2 = linalg.generic {
           ^
// -----// IR Dump After InterpreterPass Failed (transform-interpreter) //----- //
#map = affine_map<(d0, d1) -> (d0, d1)>
#map1 = affine_map<(d0) -> (d0 * 16)>
#map2 = affine_map<(d0) -> (d0 * -16 + 32)>
#map3 = affine_map<(d0) -> (16, d0 * -16 + 32)>
#map4 = affine_map<(d0) -> (d0 - 1)>
"builtin.module"() ({
  "func.func"() <{function_type = (tensor<32x1024xf32>) -> (tensor<32x1024xf32>, tensor<2x512x16x2xi8>), sym_name = "fuse_pack_consumer_into_multi_output_generic"}> ({
  ^bb0(%arg1: tensor<32x1024xf32>):
    %2 = "arith.constant"() <{value = 0 : i8}> : () -> i8
    %3 = "tensor.empty"() : () -> tensor<32x1024xf32>
    %4 = "tensor.empty"() : () -> tensor<32x1024xi8>
    %5 = "tensor.empty"() : () -> tensor<2x512x16x2xi8>
    %6:2 = "linalg.generic"(%arg1, %3, %4) <{indexing_maps = [#map, #map, #map], iterator_types = [#linalg.iterator_type<parallel>, #linalg.iterator_type<parallel>], operandSegmentSizes = array<i32: 1, 2>}> ({
    ^bb0(%arg9: f32, %arg10: f32, %arg11: i8):
      %41 = "arith.fptoui"(%arg9) : (f32) -> i8
      "linalg.yield"(%arg9, %41) : (f32, i8) -> ()
    }) : (tensor<32x1024xf32>, tensor<32x1024xf32>, tensor<32x1024xi8>) -> (tensor<32x1024xf32>, tensor<32x1024xi8>)
    %7:3 = "scf.forall"(%5, %3, %4) <{operandSegmentSizes = array<i32: 0, 0, 0, 3>, staticLowerBound = array<i64: 0>, staticStep = array<i64: 1>, staticUpperBound = array<i64: 2>}> ({
    ^bb0(%arg2: index, %arg3: tensor<2x512x16x2xi8>, %arg4: tensor<32x1024xf32>, %arg5: tensor<32x1024xi8>):
      %8 = "affine.apply"(%arg2) <{map = #map1}> : (index) -> index
      %9 = "affine.apply"(%arg2) <{map = #map2}> : (index) -> index
      %10 = "affine.min"(%arg2) <{map = #map3}> : (index) -> index
      %11 = "affine.apply"(%10) <{map = #map4}> : (index) -> index
      %12 = "affine.apply"(%arg2) <{map = #map1}> : (index) -> index
      %13 = "affine.apply"(%10) <{map = #map4}> : (index) -> index
      %14 = "affine.apply"(%arg2) <{map = #map1}> : (index) -> index
      %15 = "affine.apply"(%10) <{map = #map4}> : (index) -> index
      %16 = "affine.apply"(%arg2) <{map = #map1}> : (index) -> index
      %17 = "affine.apply"(%10) <{map = #map4}> : (index) -> index
      %18 = "tensor.extract_slice"(%arg1, %12, %10) <{operandSegmentSizes = array<i32: 1, 1, 1, 0>, static_offsets = array<i64: -9223372036854775808, 0>, static_sizes = array<i64: -9223372036854775808, 1024>, static_strides = array<i64: 1, 1>}> : (tensor<32x1024xf32>, index, index) -> tensor<?x1024xf32>
      %19 = "tensor.empty"() : () -> tensor<32x1024xf32>
      %20 = "tensor.extract_slice"(%19, %14, %10) <{operandSegmentSizes = array<i32: 1, 1, 1, 0>, static_offsets = array<i64: -9223372036854775808, 0>, static_sizes = array<i64: -9223372036854775808, 1024>, static_strides = array<i64: 1, 1>}> : (tensor<32x1024xf32>, index, index) -> tensor<?x1024xf32>
      %21 = "tensor.extract_slice"(%3, %14, %10) <{operandSegmentSizes = array<i32: 1, 1, 1, 0>, static_offsets = array<i64: -9223372036854775808, 0>, static_sizes = array<i64: -9223372036854775808, 1024>, static_strides = array<i64: 1, 1>}> : (tensor<32x1024xf32>, index, index) -> tensor<?x1024xf32>
      %22 = "tensor.empty"() : () -> tensor<32x1024xi8>
      %23 = "tensor.extract_slice"(%22, %16, %10) <{operandSegmentSizes = array<i32: 1, 1, 1, 0>, static_offsets = array<i64: -9223372036854775808, 0>, static_sizes = array<i64: -9223372036854775808, 1024>, static_strides = array<i64: 1, 1>}> : (tensor<32x1024xi8>, index, index) -> tensor<?x1024xi8>
      %24 = "tensor.extract_slice"(%4, %16, %10) <{operandSegmentSizes = array<i32: 1, 1, 1, 0>, static_offsets = array<i64: -9223372036854775808, 0>, static_sizes = array<i64: -9223372036854775808, 1024>, static_strides = array<i64: 1, 1>}> : (tensor<32x1024xi8>, index, index) -> tensor<?x1024xi8>
      %25 = "tensor.empty"() : () -> tensor<32x1024xf32>
      %26 = "tensor.extract_slice"(%25, %38, %10) <{operandSegmentSizes = array<i32: 1, 1, 1, 0>, static_offsets = array<i64: -9223372036854775808, 0>, static_sizes = array<i64: -9223372036854775808, 1024>, static_strides = array<i64: 1, 1>}> : (tensor<32x1024xf32>, index, index) -> tensor<?x1024xf32>
      %27 = "tensor.extract_slice"(%arg4, %38, %10) <{operandSegmentSizes = array<i32: 1, 1, 1, 0>, static_offsets = array<i64: -9223372036854775808, 0>, static_sizes = array<i64: -9223372036854775808, 1024>, static_strides = array<i64: 1, 1>}> : (tensor<32x1024xf32>, index, index) -> tensor<?x1024xf32>
      %28 = "tensor.empty"() : () -> tensor<32x1024xi8>
      %29 = "tensor.extract_slice"(%28, %8, %10) <{operandSegmentSizes = array<i32: 1, 1, 1, 0>, static_offsets = array<i64: -9223372036854775808, 0>, static_sizes = array<i64: -9223372036854775808, 1024>, static_strides = array<i64: 1, 1>}> : (tensor<32x1024xi8>, index, index) -> tensor<?x1024xi8>
      %30 = "tensor.extract_slice"(%arg5, %8, %10) <{operandSegmentSizes = array<i32: 1, 1, 1, 0>, static_offsets = array<i64: -9223372036854775808, 0>, static_sizes = array<i64: -9223372036854775808, 1024>, static_strides = array<i64: 1, 1>}> : (tensor<32x1024xi8>, index, index) -> tensor<?x1024xi8>
      %31:2 = "linalg.generic"(%18, %27, %30) <{indexing_maps = [#map, #map, #map], iterator_types = [#linalg.iterator_type<parallel>, #linalg.iterator_type<parallel>], operandSegmentSizes = array<i32: 1, 2>}> ({
      ^bb0(%arg6: f32, %arg7: f32, %arg8: i8):
        %40 = "arith.fptoui"(%arg6) : (f32) -> i8
        "linalg.yield"(%arg6, %40) : (f32, i8) -> ()
      }) : (tensor<?x1024xf32>, tensor<?x1024xf32>, tensor<?x1024xi8>) -> (tensor<?x1024xf32>, tensor<?x1024xi8>)
      %32 = "tensor.extract_slice"(%6#1, %8, %10) <{operandSegmentSizes = array<i32: 1, 1, 1, 0>, static_offsets = array<i64: -9223372036854775808, 0>, static_sizes = array<i64: -9223372036854775808, 1024>, static_strides = array<i64: 1, 1>}> : (tensor<32x1024xi8>, index, index) -> tensor<?x1024xi8>
      %33 = "tensor.empty"() : () -> tensor<2x512x16x2xi8>
      %34 = "tensor.extract_slice"(%33, %arg2) <{operandSegmentSizes = array<i32: 1, 1, 0, 0>, static_offsets = array<i64: -9223372036854775808, 0, 0, 0>, static_sizes = array<i64: 1, 512, 16, 2>, static_strides = array<i64: 1, 1, 1, 1>}> : (tensor<2x512x16x2xi8>, index) -> tensor<1x512x16x2xi8>
      %35 = "tensor.extract_slice"(%arg3, %arg2) <{operandSegmentSizes = array<i32: 1, 1, 0, 0>, static_offsets = array<i64: -9223372036854775808, 0, 0, 0>, static_sizes = array<i64: 1, 512, 16, 2>, static_strides = array<i64: 1, 1, 1, 1>}> : (tensor<2x512x16x2xi8>, index) -> tensor<1x512x16x2xi8>
      %36 = "linalg.pack"(%31#1, %35, %2) <{inner_dims_pos = array<i64: 0, 1>, operandSegmentSizes = array<i32: 1, 1, 1, 0>, static_inner_tiles = array<i64: 16, 2>}> : (tensor<?x1024xi8>, tensor<1x512x16x2xi8>, i8) -> tensor<1x512x16x2xi8>
      %37 = "affine.apply"(%10) <{map = #map4}> : (index) -> index
      %38 = "affine.apply"(%arg2) <{map = #map1}> : (index) -> index
      %39 = "affine.apply"(%10) <{map = #map4}> : (index) -> index
      "scf.forall.in_parallel"() ({
        "tensor.parallel_insert_slice"(%36, %arg3, %arg2) <{operandSegmentSizes = array<i32: 1, 1, 1, 0, 0>, static_offsets = array<i64: -9223372036854775808, 0, 0, 0>, static_sizes = array<i64: 1, 512, 16, 2>, static_strides = array<i64: 1, 1, 1, 1>}> : (tensor<1x512x16x2xi8>, tensor<2x512x16x2xi8>, index) -> ()
        "tensor.parallel_insert_slice"(%31#0, %arg4, %38, %10) <{operandSegmentSizes = array<i32: 1, 1, 1, 1, 0>, static_offsets = array<i64: -9223372036854775808, 0>, static_sizes = array<i64: -9223372036854775808, 1024>, static_strides = array<i64: 1, 1>}> : (tensor<?x1024xf32>, tensor<32x1024xf32>, index, index) -> ()
        "tensor.parallel_insert_slice"(%31#1, %arg5, %8, %10) <{operandSegmentSizes = array<i32: 1, 1, 1, 1, 0>, static_offsets = array<i64: -9223372036854775808, 0>, static_sizes = array<i64: -9223372036854775808, 1024>, static_strides = array<i64: 1, 1>}> : (tensor<?x1024xi8>, tensor<32x1024xi8>, index, index) -> ()
      }) : () -> ()
    }) : (tensor<2x512x16x2xi8>, tensor<32x1024xf32>, tensor<32x1024xi8>) -> (tensor<2x512x16x2xi8>, tensor<32x1024xf32>, tensor<32x1024xi8>)
    "func.return"(%7#1, %7#0) : (tensor<32x1024xf32>, tensor<2x512x16x2xi8>) -> ()
  }) : () -> ()
  "builtin.module"() ({
    "transform.named_sequence"() <{arg_attrs = [{transform.readonly}], function_type = (!transform.any_op) -> (), sym_name = "__transform_main"}> ({
    ^bb0(%arg0: !transform.any_op):
      %0 = "transform.structured.match"(%arg0) <{ops = ["linalg.pack"]}> : (!transform.any_op) -> !transform.any_op
      %1:2 = "transform.test.fuse_and_yield"(%0) <{tile_interchange = [], tile_sizes = [1], use_forall = true}> : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
      "transform.yield"() : () -> ()
    }) : () -> ()
  }) {transform.with_named_sequence} : () -> ()
}) : () -> ()

I also noticed that the Interfaces tests are missing from the Bazel overlay, so I added them as well.


Full diff: https://github.com/llvm/llvm-project/pull/178190.diff

3 Files Affected:

  • (modified) mlir/lib/Dialect/SCF/Transforms/TileUsingInterface.cpp (+8-1)
  • (modified) mlir/test/Interfaces/TilingInterface/tile-fuse-and-yield-using-scfforall.mlir (+69-1)
  • (added) utils/bazel/llvm-project-overlay/mlir/test/Interfaces/BUILD.bazel (+20)
diff --git a/mlir/lib/Dialect/SCF/Transforms/TileUsingInterface.cpp b/mlir/lib/Dialect/SCF/Transforms/TileUsingInterface.cpp
index 4d22a5e97ba4a..8bfc9e9dfad3d 100644
--- a/mlir/lib/Dialect/SCF/Transforms/TileUsingInterface.cpp
+++ b/mlir/lib/Dialect/SCF/Transforms/TileUsingInterface.cpp
@@ -1509,6 +1509,14 @@ FailureOr<SmallVector<Operation *>> mlir::scf::yieldReplacementForFusedProducer(
     auto tilableOp = cast<TilingInterface>(originalOwner);
     // b. get iterDomain Offset and Sizes based on sliceOp tile
     SmallVector<OpFoldResult> iterDomainOffset, iterDomainSizes;
+    // Set insertion point before any operations that might create new SSA
+    // values used in offset/size computations. This ensures all values created
+    // by getIterationDomainTileFromResultTile and getResultTilePosition
+    // dominate the extract_slice operations created later.
+    if (auto tiledDestStyleOp =
+            dyn_cast<DestinationStyleOpInterface>(tiledOwner)) {
+      rewriter.setInsertionPoint(tiledDestStyleOp);
+    }
     // skip tensor.pack/unpack/pad, which expects single opResult
     if (tilableOp->getNumResults() > 1 &&
         failed(tilableOp.getIterationDomainTileFromResultTile(
@@ -1550,7 +1558,6 @@ FailureOr<SmallVector<Operation *>> mlir::scf::yieldReplacementForFusedProducer(
     // necessary
     if (auto tiledDestStyleOp =
             dyn_cast<DestinationStyleOpInterface>(tiledOwner)) {
-      rewriter.setInsertionPoint(tiledDestStyleOp);
       for (const auto &&[index, newRegionArg] :
            llvm::enumerate(newRegionIterArgs)) {
         auto destSlice = tensor::ExtractSliceOp::create(
diff --git a/mlir/test/Interfaces/TilingInterface/tile-fuse-and-yield-using-scfforall.mlir b/mlir/test/Interfaces/TilingInterface/tile-fuse-and-yield-using-scfforall.mlir
index 8fc8f3245be15..3de6c9cb2b398 100644
--- a/mlir/test/Interfaces/TilingInterface/tile-fuse-and-yield-using-scfforall.mlir
+++ b/mlir/test/Interfaces/TilingInterface/tile-fuse-and-yield-using-scfforall.mlir
@@ -1,4 +1,4 @@
-// RUN: mlir-opt -transform-interpreter -cse -split-input-file %s | FileCheck %s
+// RUN: mlir-opt -transform-interpreter -cse -canonicalize -split-input-file %s | FileCheck %s
 
 func.func @gemm_gemm_fusion_yield_both(%lhs0 : tensor<?x?xf32>, %rhs0 : tensor<?x?xf32>, %rhs1 : tensor<?x?xf32>,
     %init0 : tensor<?x?xf32>, %init1 : tensor<?x?xf32>)
@@ -58,3 +58,71 @@ module attributes {transform.with_named_sequence} {
 //      CHECK:       tensor.parallel_insert_slice %[[GEMM1_TILE]] into %[[ITERARG0]][%[[IV]], 0]
 //      CHECK:       tensor.parallel_insert_slice %[[GEMM0_TILE]] into %[[ITERARG1]][%[[IV]], 0]
 //      CHECK:   return %[[RESULT]]#1, %[[RESULT]]#0
+
+// -----
+
+func.func @fuse_pack_consumer_into_multi_output_generic(
+    %input: tensor<32x1024xf32>) -> (tensor<32x1024xf32>, tensor<2x512x16x2xi8>) {
+  %c0_i8 = arith.constant 0 : i8
+  %output_f32 = tensor.empty() : tensor<32x1024xf32>
+  %output_i8 = tensor.empty() : tensor<32x1024xi8>
+  %pack_dest = tensor.empty() : tensor<2x512x16x2xi8>
+
+  %gen:2 = linalg.generic {
+    indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>,
+                     affine_map<(d0, d1) -> (d0, d1)>,
+                     affine_map<(d0, d1) -> (d0, d1)>],
+    iterator_types = ["parallel", "parallel"]
+  } ins(%input : tensor<32x1024xf32>)
+    outs(%output_f32, %output_i8 : tensor<32x1024xf32>, tensor<32x1024xi8>) {
+  ^bb0(%in: f32, %out_f: f32, %out_i: i8):
+    %q = arith.fptoui %in : f32 to i8
+    linalg.yield %in, %q : f32, i8
+  } -> (tensor<32x1024xf32>, tensor<32x1024xi8>)
+
+  %pack = linalg.pack %gen#1
+    padding_value(%c0_i8 : i8)
+    inner_dims_pos = [0, 1]
+    inner_tiles = [16, 2]
+    into %pack_dest : tensor<32x1024xi8> -> tensor<2x512x16x2xi8>
+
+  return %gen#0, %pack : tensor<32x1024xf32>, tensor<2x512x16x2xi8>
+}
+
+module attributes {transform.with_named_sequence} {
+  transform.named_sequence @__transform_main(%arg0 : !transform.any_op {transform.readonly}) {
+    %pack = transform.structured.match ops{["linalg.pack"]} in %arg0
+      : (!transform.any_op) -> !transform.any_op
+    %a, %b = transform.test.fuse_and_yield %pack [1] use_forall true
+      : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+    transform.yield
+  }
+}
+//      CHECK: #[[$MAP0:.+]] = affine_map<(d0) -> (d0 * 16)>
+//      CHECK: #[[$MAP1:.+]] = affine_map<(d0) -> (d0 * -16 + 32, 16)>
+//      CHECK: #[[$MAP2:.+]] = affine_map<(d0, d1) -> (d0, d1)>
+//      CHECK: func.func @fuse_pack_consumer_into_multi_output_generic(
+// CHECK-SAME:     %[[INPUT:[a-zA-Z0-9]+]]: tensor<32x1024xf32>)
+//  CHECK-DAG:   %[[C0_I8:.+]] = arith.constant 0 : i8
+//  CHECK-DAG:   %[[OUTPUT_F32:.+]] = tensor.empty() : tensor<32x1024xf32>
+//  CHECK-DAG:   %[[OUTPUT_I8:.+]] = tensor.empty() : tensor<32x1024xi8>
+//  CHECK-DAG:   %[[PACK_DEST:.+]] = tensor.empty() : tensor<2x512x16x2xi8>
+//      CHECK:   %[[RESULT:.+]]:2 = scf.forall (%[[IV:.+]]) in (2)
+// CHECK-SAME:       shared_outs(%[[ITERARG0:[a-zA-Z0-9]+]] = %[[PACK_DEST]], %[[ITERARG1:[a-zA-Z0-9]+]] = %[[OUTPUT_F32]])
+//      CHECK:     %[[OFFSET:.+]] = affine.apply #[[$MAP0]](%[[IV]])
+//      CHECK:     %[[SIZE:.+]] = affine.min #[[$MAP1]](%[[IV]])
+//  CHECK-DAG:     %[[INPUT_TILE:.+]] = tensor.extract_slice %[[INPUT]][%[[OFFSET]], 0] [%[[SIZE]], 1024]
+//  CHECK-DAG:     %[[F32_TILE:.+]] = tensor.extract_slice %[[ITERARG1]][%[[OFFSET]], 0] [%[[SIZE]], 1024]
+//  CHECK-DAG:     %[[I8_TILE:.+]] = tensor.extract_slice %[[OUTPUT_I8]][%[[OFFSET]], 0] [%[[SIZE]], 1024]
+//      CHECK:     %[[GENERIC_TILE:.+]]:2 = linalg.generic
+// CHECK-SAME:         ins(%[[INPUT_TILE]] :
+// CHECK-SAME:         outs(%[[F32_TILE]], %[[I8_TILE]] :
+//  CHECK-DAG:     %[[PACK_DEST_TILE:.+]] = tensor.extract_slice %[[ITERARG0]][%[[IV]], 0, 0, 0] [1, 512, 16, 2]
+//      CHECK:     %[[PACK_TILE:.+]] = linalg.pack %[[GENERIC_TILE]]#1
+// CHECK-SAME:         padding_value(%[[C0_I8]] : i8)
+// CHECK-SAME:         inner_dims_pos = [0, 1] inner_tiles = [16, 2]
+// CHECK-SAME:         into %[[PACK_DEST_TILE]]
+//      CHECK:     scf.forall.in_parallel {
+//      CHECK:       tensor.parallel_insert_slice %[[PACK_TILE]] into %[[ITERARG0]][%[[IV]], 0, 0, 0] [1, 512, 16, 2]
+//      CHECK:       tensor.parallel_insert_slice %[[GENERIC_TILE]]#0 into %[[ITERARG1]][%[[OFFSET]], 0] [%[[SIZE]], 1024]
+//      CHECK:   return %[[RESULT]]#1, %[[RESULT]]#0
diff --git a/utils/bazel/llvm-project-overlay/mlir/test/Interfaces/BUILD.bazel b/utils/bazel/llvm-project-overlay/mlir/test/Interfaces/BUILD.bazel
new file mode 100644
index 0000000000000..ef41be3ebc865
--- /dev/null
+++ b/utils/bazel/llvm-project-overlay/mlir/test/Interfaces/BUILD.bazel
@@ -0,0 +1,20 @@
+load("//llvm:lit_test.bzl", "lit_test")
+
+licenses(["notice"])
+
+package(default_visibility = ["//visibility:public"])
+
+[
+    lit_test(
+        name = "%s.test" % src,
+        srcs = [src],
+        data = [
+            "//llvm:llvm-symbolizer",
+            "//mlir:mlir-opt",
+            "//mlir/test:lit_data",
+        ],
+    )
+    for src in glob(
+        include = ["**/*.mlir"],
+    )
+]

@ziereis ziereis changed the title TileUsingInterface bugfix for dominance error [mlir] TileUsingInterface bugfix for dominance error Jan 27, 2026

@MaheshRavishankar MaheshRavishankar left a comment

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks!

@ziereis

ziereis commented Jan 30, 2026

Copy link
Copy Markdown
Contributor Author

@rupprecht @keith @aaronmondal Hi! Could anyone take a look at this, please?

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ah seems like an oversight that this didn't exist already. Thanks!

@ziereis

ziereis commented Feb 2, 2026

Copy link
Copy Markdown
Contributor Author

If this is good to go, I would be happy if someone could merge it. I don't have write access myself.

@MaheshRavishankar MaheshRavishankar merged commit d376a7e into llvm:main Feb 2, 2026
17 checks passed
rishabhmadan19 pushed a commit to rishabhmadan19/llvm-project that referenced this pull request Feb 9, 2026
In this PR I move the insertion point in
`yieldReplacementForFusedProducer` because I ran into an issue where a
`tensor.extract_slice` tried to use the result of an `affine.apply` that was
inserted at the end of the block instead of at the start of it.

This is the full error from the test I added, as produced before this change:

```mlir
third-party/llvm-project/mlir/test/Interfaces/TilingInterface/tile-fuse-and-yield-using-scfforall.mlir:83:11: error: operand #1 does not dominate this use
  %pack = linalg.pack %gen#1
          ^
third-party/llvm-project/mlir/test/Interfaces/TilingInterface/tile-fuse-and-yield-using-scfforall.mlir:83:11: note: see current operation: %24 = "tensor.extract_slice"(%23, %36, %8) <{operandSegmentSizes = array<i32: 1, 1, 1, 0>, static_offsets = array<i64: -9223372036854775808, 0>, static_sizes = array<i64: -9223372036854775808, 1024>, static_strides = array<i64: 1, 1>}> : (tensor<32x1024xf32>, index, index) -> tensor<?x1024xf32>
third-party/llvm-project/mlir/test/Interfaces/TilingInterface/tile-fuse-and-yield-using-scfforall.mlir:71:12: note: operand defined here (op in the same block)
  %gen:2 = linalg.generic {
           ^
// -----// IR Dump After InterpreterPass Failed (transform-interpreter) //----- //
#map = affine_map<(d0, d1) -> (d0, d1)>
#map1 = affine_map<(d0) -> (d0 * 16)>
#map2 = affine_map<(d0) -> (d0 * -16 + 32)>
#map3 = affine_map<(d0) -> (16, d0 * -16 + 32)>
#map4 = affine_map<(d0) -> (d0 - 1)>
"builtin.module"() ({
  "func.func"() <{function_type = (tensor<32x1024xf32>) -> (tensor<32x1024xf32>, tensor<2x512x16x2xi8>), sym_name = "fuse_pack_consumer_into_multi_output_generic"}> ({
  ^bb0(%arg1: tensor<32x1024xf32>):
    %2 = "arith.constant"() <{value = 0 : i8}> : () -> i8
    %3 = "tensor.empty"() : () -> tensor<32x1024xf32>
    %4 = "tensor.empty"() : () -> tensor<32x1024xi8>
    %5 = "tensor.empty"() : () -> tensor<2x512x16x2xi8>
    %6:2 = "linalg.generic"(%arg1, %3, %4) <{indexing_maps = [#map, #map, #map], iterator_types = [#linalg.iterator_type<parallel>, #linalg.iterator_type<parallel>], operandSegmentSizes = array<i32: 1, 2>}> ({
    ^bb0(%arg9: f32, %arg10: f32, %arg11: i8):
      %41 = "arith.fptoui"(%arg9) : (f32) -> i8
      "linalg.yield"(%arg9, %41) : (f32, i8) -> ()
    }) : (tensor<32x1024xf32>, tensor<32x1024xf32>, tensor<32x1024xi8>) -> (tensor<32x1024xf32>, tensor<32x1024xi8>)
    %7:3 = "scf.forall"(%5, %3, %4) <{operandSegmentSizes = array<i32: 0, 0, 0, 3>, staticLowerBound = array<i64: 0>, staticStep = array<i64: 1>, staticUpperBound = array<i64: 2>}> ({
    ^bb0(%arg2: index, %arg3: tensor<2x512x16x2xi8>, %arg4: tensor<32x1024xf32>, %arg5: tensor<32x1024xi8>):
      %8 = "affine.apply"(%arg2) <{map = #map1}> : (index) -> index
      %9 = "affine.apply"(%arg2) <{map = #map2}> : (index) -> index
      %10 = "affine.min"(%arg2) <{map = #map3}> : (index) -> index
      %11 = "affine.apply"(%10) <{map = #map4}> : (index) -> index
      %12 = "affine.apply"(%arg2) <{map = #map1}> : (index) -> index
      %13 = "affine.apply"(%10) <{map = #map4}> : (index) -> index
      %14 = "affine.apply"(%arg2) <{map = #map1}> : (index) -> index
      %15 = "affine.apply"(%10) <{map = #map4}> : (index) -> index
      %16 = "affine.apply"(%arg2) <{map = #map1}> : (index) -> index
      %17 = "affine.apply"(%10) <{map = #map4}> : (index) -> index
      %18 = "tensor.extract_slice"(%arg1, %12, %10) <{operandSegmentSizes = array<i32: 1, 1, 1, 0>, static_offsets = array<i64: -9223372036854775808, 0>, static_sizes = array<i64: -9223372036854775808, 1024>, static_strides = array<i64: 1, 1>}> : (tensor<32x1024xf32>, index, index) -> tensor<?x1024xf32>
      %19 = "tensor.empty"() : () -> tensor<32x1024xf32>
      %20 = "tensor.extract_slice"(%19, %14, %10) <{operandSegmentSizes = array<i32: 1, 1, 1, 0>, static_offsets = array<i64: -9223372036854775808, 0>, static_sizes = array<i64: -9223372036854775808, 1024>, static_strides = array<i64: 1, 1>}> : (tensor<32x1024xf32>, index, index) -> tensor<?x1024xf32>
      %21 = "tensor.extract_slice"(%3, %14, %10) <{operandSegmentSizes = array<i32: 1, 1, 1, 0>, static_offsets = array<i64: -9223372036854775808, 0>, static_sizes = array<i64: -9223372036854775808, 1024>, static_strides = array<i64: 1, 1>}> : (tensor<32x1024xf32>, index, index) -> tensor<?x1024xf32>
      %22 = "tensor.empty"() : () -> tensor<32x1024xi8>
      %23 = "tensor.extract_slice"(%22, %16, %10) <{operandSegmentSizes = array<i32: 1, 1, 1, 0>, static_offsets = array<i64: -9223372036854775808, 0>, static_sizes = array<i64: -9223372036854775808, 1024>, static_strides = array<i64: 1, 1>}> : (tensor<32x1024xi8>, index, index) -> tensor<?x1024xi8>
      %24 = "tensor.extract_slice"(%4, %16, %10) <{operandSegmentSizes = array<i32: 1, 1, 1, 0>, static_offsets = array<i64: -9223372036854775808, 0>, static_sizes = array<i64: -9223372036854775808, 1024>, static_strides = array<i64: 1, 1>}> : (tensor<32x1024xi8>, index, index) -> tensor<?x1024xi8>
      %25 = "tensor.empty"() : () -> tensor<32x1024xf32>
      %26 = "tensor.extract_slice"(%25, %38, %10) <{operandSegmentSizes = array<i32: 1, 1, 1, 0>, static_offsets = array<i64: -9223372036854775808, 0>, static_sizes = array<i64: -9223372036854775808, 1024>, static_strides = array<i64: 1, 1>}> : (tensor<32x1024xf32>, index, index) -> tensor<?x1024xf32>
      %27 = "tensor.extract_slice"(%arg4, %38, %10) <{operandSegmentSizes = array<i32: 1, 1, 1, 0>, static_offsets = array<i64: -9223372036854775808, 0>, static_sizes = array<i64: -9223372036854775808, 1024>, static_strides = array<i64: 1, 1>}> : (tensor<32x1024xf32>, index, index) -> tensor<?x1024xf32>
      %28 = "tensor.empty"() : () -> tensor<32x1024xi8>
      %29 = "tensor.extract_slice"(%28, %8, %10) <{operandSegmentSizes = array<i32: 1, 1, 1, 0>, static_offsets = array<i64: -9223372036854775808, 0>, static_sizes = array<i64: -9223372036854775808, 1024>, static_strides = array<i64: 1, 1>}> : (tensor<32x1024xi8>, index, index) -> tensor<?x1024xi8>
      %30 = "tensor.extract_slice"(%arg5, %8, %10) <{operandSegmentSizes = array<i32: 1, 1, 1, 0>, static_offsets = array<i64: -9223372036854775808, 0>, static_sizes = array<i64: -9223372036854775808, 1024>, static_strides = array<i64: 1, 1>}> : (tensor<32x1024xi8>, index, index) -> tensor<?x1024xi8>
      %31:2 = "linalg.generic"(%18, %27, %30) <{indexing_maps = [#map, #map, #map], iterator_types = [#linalg.iterator_type<parallel>, #linalg.iterator_type<parallel>], operandSegmentSizes = array<i32: 1, 2>}> ({
      ^bb0(%arg6: f32, %arg7: f32, %arg8: i8):
        %40 = "arith.fptoui"(%arg6) : (f32) -> i8
        "linalg.yield"(%arg6, %40) : (f32, i8) -> ()
      }) : (tensor<?x1024xf32>, tensor<?x1024xf32>, tensor<?x1024xi8>) -> (tensor<?x1024xf32>, tensor<?x1024xi8>)
      %32 = "tensor.extract_slice"(%6#1, %8, %10) <{operandSegmentSizes = array<i32: 1, 1, 1, 0>, static_offsets = array<i64: -9223372036854775808, 0>, static_sizes = array<i64: -9223372036854775808, 1024>, static_strides = array<i64: 1, 1>}> : (tensor<32x1024xi8>, index, index) -> tensor<?x1024xi8>
      %33 = "tensor.empty"() : () -> tensor<2x512x16x2xi8>
      %34 = "tensor.extract_slice"(%33, %arg2) <{operandSegmentSizes = array<i32: 1, 1, 0, 0>, static_offsets = array<i64: -9223372036854775808, 0, 0, 0>, static_sizes = array<i64: 1, 512, 16, 2>, static_strides = array<i64: 1, 1, 1, 1>}> : (tensor<2x512x16x2xi8>, index) -> tensor<1x512x16x2xi8>
      %35 = "tensor.extract_slice"(%arg3, %arg2) <{operandSegmentSizes = array<i32: 1, 1, 0, 0>, static_offsets = array<i64: -9223372036854775808, 0, 0, 0>, static_sizes = array<i64: 1, 512, 16, 2>, static_strides = array<i64: 1, 1, 1, 1>}> : (tensor<2x512x16x2xi8>, index) -> tensor<1x512x16x2xi8>
      %36 = "linalg.pack"(%31#1, %35, %2) <{inner_dims_pos = array<i64: 0, 1>, operandSegmentSizes = array<i32: 1, 1, 1, 0>, static_inner_tiles = array<i64: 16, 2>}> : (tensor<?x1024xi8>, tensor<1x512x16x2xi8>, i8) -> tensor<1x512x16x2xi8>
      %37 = "affine.apply"(%10) <{map = #map4}> : (index) -> index
      %38 = "affine.apply"(%arg2) <{map = #map1}> : (index) -> index
      %39 = "affine.apply"(%10) <{map = #map4}> : (index) -> index
      "scf.forall.in_parallel"() ({
        "tensor.parallel_insert_slice"(%36, %arg3, %arg2) <{operandSegmentSizes = array<i32: 1, 1, 1, 0, 0>, static_offsets = array<i64: -9223372036854775808, 0, 0, 0>, static_sizes = array<i64: 1, 512, 16, 2>, static_strides = array<i64: 1, 1, 1, 1>}> : (tensor<1x512x16x2xi8>, tensor<2x512x16x2xi8>, index) -> ()
        "tensor.parallel_insert_slice"(%31#0, %arg4, %38, %10) <{operandSegmentSizes = array<i32: 1, 1, 1, 1, 0>, static_offsets = array<i64: -9223372036854775808, 0>, static_sizes = array<i64: -9223372036854775808, 1024>, static_strides = array<i64: 1, 1>}> : (tensor<?x1024xf32>, tensor<32x1024xf32>, index, index) -> ()
        "tensor.parallel_insert_slice"(%31#1, %arg5, %8, %10) <{operandSegmentSizes = array<i32: 1, 1, 1, 1, 0>, static_offsets = array<i64: -9223372036854775808, 0>, static_sizes = array<i64: -9223372036854775808, 1024>, static_strides = array<i64: 1, 1>}> : (tensor<?x1024xi8>, tensor<32x1024xi8>, index, index) -> ()
      }) : () -> ()
    }) : (tensor<2x512x16x2xi8>, tensor<32x1024xf32>, tensor<32x1024xi8>) -> (tensor<2x512x16x2xi8>, tensor<32x1024xf32>, tensor<32x1024xi8>)
    "func.return"(%7#1, %7#0) : (tensor<32x1024xf32>, tensor<2x512x16x2xi8>) -> ()
  }) : () -> ()
  "builtin.module"() ({
    "transform.named_sequence"() <{arg_attrs = [{transform.readonly}], function_type = (!transform.any_op) -> (), sym_name = "__transform_main"}> ({
    ^bb0(%arg0: !transform.any_op):
      %0 = "transform.structured.match"(%arg0) <{ops = ["linalg.pack"]}> : (!transform.any_op) -> !transform.any_op
      %1:2 = "transform.test.fuse_and_yield"(%0) <{tile_interchange = [], tile_sizes = [1], use_forall = true}> : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
      "transform.yield"() : () -> ()
    }) : () -> ()
  }) {transform.with_named_sequence} : () -> ()
}) : () -> ()
``` 

I also noticed that the Interface tests are missing from the Bazel overlay,
so I added them as well.
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment

Labels

bazel "Peripheral" support tier build system: utils/bazel mlir:scf mlir

Projects

None yet

Development

Successfully merging this pull request may close these issues.

4 participants