<!-- Issue metadata table: one row per field, with the field name as the row header. -->
<table border="1" cellspacing="0" cellpadding="8">
  <tr>
    <th scope="row">Issue</th>
    <td>
      <a href="https://github.com/llvm/llvm-project/issues/64092">64092</a>
    </td>
  </tr>
  <tr>
    <th scope="row">Summary</th>
    <td>
      mlir PadOp Tiling Interface implement does not include generateResultTileValue
    </td>
  </tr>
  <tr>
    <th scope="row">Labels</th>
    <td>
      new issue
    </td>
  </tr>
  <tr>
    <th scope="row">Assignees</th>
    <td>
    </td>
  </tr>
  <tr>
    <th scope="row">Reporter</th>
    <td>
      Artorias123
    </td>
  </tr>
</table>

<pre>

    We found that the implementation of the PadOp TilingInterface (PadOpTiling) does not include the generateResultTileValue interface. This has led to unexpected results when we attempt to fuse the pad operation into a containing loop using transform.structured.fuse_into_containing_op. I would like to understand if this is intended behavior or a defect. Additionally, I would like to know the correct approach to fuse the pad operation into a loop.

Here are our test results on llvm-15.x

```

#map0 = affine_map<(d0, d1) -> (d0, d1)>

func.func @pad(%arg0: tensor<58x1xf32>, %arg1: tensor<64x128xf32>) -> tensor<64x128xf32> attributes {pad} {

  %c0 = arith.constant 0 : index

 %c1 = arith.constant 1 : index

  %c16 = arith.constant 16 : index

 %c32 = arith.constant 32 : index

  %zero = arith.constant 0.0 : f32

  %pad = tensor.pad %arg0 low[4, 60] high[2, 67] {

  ^bb0(%arg4: index, %arg5: index):

    tensor.yield %zero : f32

  } : tensor<58x1xf32> to tensor<64x128xf32>

  %1 = linalg.init_tensor [64, 128] : tensor<64x128xf32>

  %2 = scf.foreach_thread (%arg2, %arg3) in (%c1, %c1) -> (tensor<64x128xf32>) {

    %3 = tensor.extract_slice %arg1[%arg2, %arg3] [8, 128] [1, 1] : tensor<64x128xf32> to tensor<8x128xf32>

    %4 = tensor.extract_slice %pad[%arg2, %arg3] [8, 128] [1, 1] : tensor<64x128xf32> to tensor<8x128xf32>

    %5 = linalg.init_tensor [8, 128] : tensor<8x128xf32>

    %6 = linalg.generic {indexing_maps = [#map0, #map0, #map0], iterator_types = ["parallel", "parallel"]}

        ins(%3, %4 : tensor<8x128xf32>, tensor<8x128xf32>)

        outs(%5 : tensor<8x128xf32>) {

 ^bb0(%arg4: f32, %arg5: f32, %arg6: f32):

        %7 = arith.addf %arg4, %arg5 : f32

        linalg.yield %7 : f32

    } -> tensor<8x128xf32>

    scf.foreach_thread.perform_concurrently {

 tensor.parallel_insert_slice %6 into %1[%arg2, %arg3] [8, 128] [1, 1] : tensor<8x128xf32> into tensor<64x128xf32>

    }  

  } {thread_dim_mapping = [2, 4]} 

  return %2 : tensor<64x128xf32>

}

transform.with_pdl_patterns {

^bb0(%arg0: !pdl.operation):

  transform.genesis.canonicalized_sequence %arg0 failures(propagate) {

  ^bb0(%arg1: !pdl.operation):

    %device_func = transform.genesis.match ops{["func.func"]} attributes {pad} in %arg1

 %foreach_thread_op = transform.genesis.match ops{["scf.foreach_thread"]} in %device_func

    // fuse and tile

    %expand_shape = transform.genesis.match ops{["tensor.pad"]} in %device_func

 transform.structured.fuse_into_containing_op %expand_shape into %foreach_thread_op

  }  

}

```

```

#map = affine_map<(d0, d1) -> (d0, d1)>

module {

  func.func @pad(%arg0: tensor<58x1xf32>, %arg1: tensor<64x128xf32>) -> tensor<64x128xf32> attributes {pad} {

 %c1 = arith.constant 1 : index

    %cst = arith.constant 0.000000e+00 : f32

    %0 = linalg.init_tensor [64, 128] : tensor<64x128xf32>

    %1 = scf.foreach_thread (%arg2, %arg3) in (%c1, %c1) -> (tensor<64x128xf32>) {

 %2 = tensor.extract_slice %arg1[%arg2, %arg3] [8, 128] [1, 1] : tensor<64x128xf32> to tensor<8x128xf32>

      %3 = tensor.pad %arg0 low[4, 60] high[2, 67] {

      ^bb0(%arg4: index, %arg5: index):

 tensor.yield %cst : f32

      } : tensor<58x1xf32> to tensor<64x128xf32>

      %4 = tensor.extract_slice %3[%arg2, %arg3] [8, 128] [1, 1] : tensor<64x128xf32> to tensor<8x128xf32>

      %5 = linalg.init_tensor [8, 128] : tensor<8x128xf32>

      %6 = linalg.generic {indexing_maps = [#map, #map, #map], iterator_types = ["parallel", "parallel"]} ins(%2, %4 : tensor<8x128xf32>, tensor<8x128xf32>) outs(%5 : tensor<8x128xf32>) {

      ^bb0(%arg4: f32, %arg5: f32, %arg6: f32):

 %7 = arith.addf %arg4, %arg5 : f32

        linalg.yield %7 : f32

 } -> tensor<8x128xf32>

      scf.foreach_thread.perform_concurrently {

        tensor.parallel_insert_slice %6 into %0[%arg2, %arg3] [8, 128] [1, 1] : tensor<8x128xf32> into tensor<64x128xf32>

      }

    } {thread_dim_mapping = [2, 4]}

    return %1 : tensor<64x128xf32>

 }

  transform.with_pdl_patterns {

  ^bb0(%arg0: !pdl.operation):

 transform.genesis.canonicalized_sequence %arg0 failures(propagate) {

 ^bb0(%arg1: !pdl.operation):

      %0 = transform.genesis.match ops{["func.func"]} attributes {pad} in %arg1

      %1 = transform.genesis.match ops{["scf.foreach_thread"]} in %0

      %2 = transform.genesis.match ops{["tensor.pad"]} in %0

      %3 = transform.structured.fuse_into_containing_op %2 into %1

    }

 }

}

```

</pre>

<img width="1px" height="1px" alt="" src="http://email.email.llvm.org/o/eJzMWVtz46gS_jX4hRqVhCzZfvBDHE_qzNM5dWpq91GFRdtiB4EWUC7z67cARRffkkkyO6NyOTLqbvry0Xwo1Bh-kABrlG1Qtp3R1lZKr2-0VZpTk5B0tlPsaf0n4L1qJcO2ohbbCjCvGwE1SEstVxKrvR_9H2X_bfBXLrg8fJEW9J6WgNIbPx6GMVNgsFQWc1mKloFXPIAETS38H0wr7Fcu4A8qWsD82UiEv1bc4IoaLIBhq3Ar4bGB0gLD2msZ_FCBxA-AqbVQN9ZJ7VsTZmgow6pxkzh_ubQKU1wqaSmXzi2hVINb426tptLsla4jY3Vb2lYDi5yhwqkVg1KhmugLflCtYFjwbxDcYqCNpZJh7pLCDXYfaUEyYHgHFb3nSmOlMcUM9lDaCN8wxp1fVIgnRG7xsdFvUj34KEqlNZQW06bRipbVyyG6uCIUb1F8M_7-D2jAVANWrcYWjO2TqCQW4r7-lGTRY6eRx90n_CRpTZsYo3SL6X7PJRQ1bVB6i8iSxc59liCywp9Q-hlPx1D6OdjYt7KM3BdG87ihDJElIhnVhxilN9iCNEqj9DZbPiaP-5Q4PXKLg0gyEcnnjwlZ9kLdtOcfO2BovmstGIwWGzfvYuvuglPYTVB2gWluq6hU0pXSYjd4g7lk0OXEiybnRJMT0SCbnxXOzxpOyTlhP3pq-jtoddbnKHjtYu9MD0oOK04nJCryP0MBsFAPKNvMXcLzGGVbXPFDhbIN8SMLNzJOWfZ5t4v7-s0HD_uCZaOxFUpvnlXx8-xPHAQbhTL2GfsSXQCFg_8FJIyw7gMOtRJcUnGIuOS2CIoYZZvcB5uQpY_tMroGa6E-ptxHe6WBllVhKw0-iV0iyBB_6mDJZfesTLon5WSVXAb0KNl-7nRcOHi0mpa2MIKX0C-QbHPGCRdbtlmOQ8023pnketyTLC_PJMS7Nb_qlltrv8Kr7ErZl5eqftFaPrbm9yxeuvp4cLsNoaaN8TI-WN8nQ7Ant9nW_eLWtWulC_vUwEiTNFRTIUAg0mVrMpJt0WI7eOYuLk3AV9pld34lJnJ76cFqalW1tjObXTM3Bum5huDkpu1gMpL3I9P20GV9MepulLE9frY8mDxuGuHqCtW3l8WpmOsu0x3jfPFPl3rUgHYkwfGBstUapBVPozz0nTWUreDSgB6tiDzs0K41vX9ljJ0Odq-3sBA4nvTYxSYEVjBeOyA3jgt1iPSOzQPuei0NttXyuR1e75o9XsP3QLEeuK2KhomicaxNSzOk8AhJnhogkjRMRD3NmUJmMOsWp-EmKqlUkpdU8O_ACgN_tyD7RhnjPeWi1eAw3mjV0AO1cNRzj7xIXvbCg5bBPS-hCBzHNcYT12pqywqrxrjJ_KLvWVG_xs8zFr-XBGd6zjAFZ6GaV896Cu1h-jDTKJbpvuoivUPkLnBQx3ktFzDJAzw2VLLCVLSBV7s0sJIXXfkhtn7i0PMSPMneeGXgYwgf8eHLJPldHLlWrBUwhuLvyZp_hAkHLmzsBb7qL0BkEx9T16AZfxyJG5HCf5vG9fzx9-Jwp-Ty7acCb-3NJ4PjY0EAzMkG_56TQR_wddqa_qIyfCBtfStxHcjq6O79rHUgquR9RPUt3PQiLt9CUH8mM30tLX0LMe2u1_PT-FfwU7-8p3T11Qx1UBsoavLynjDSfBU_PYHSywz1JxDUt_DT0Wb6U3kpnuyzH0NG4yPb5ANY5bHN9Mjm65glGZ3oxsA9htcpkZyxdcpW6YrOYJ3kq5hkc7JKZtU6WyxZzlLY0VUChGRlHi8gX2ZlnMbLfMlmfE1iksYLksVpPM-SCFhO8uVit0z2cb5aLNE8hppyEQlxX0dKH2bcmBbW-TxekZmgOxBmHfIj4QH7hyE9M732L4J37cGgeSy4sWawYrkVsK4F15O3_rh_7T_8
i-D0lf-F1_2zVot1ZW1jHFr9yeLAbdXuolLViNy5ybs_nxqt_oLSInLnXTaI3PmQ_gkAAP__ZSQetA">