<table border="1" cellspacing="0" cellpadding="8">

    <tr>

        <th>Issue</th>

        <td>

            <a href="https://github.com/llvm/llvm-project/issues/57227">57227</a>

        </td>

    </tr>

    <tr>

        <th>Summary</th>

        <td>

            [MLIR] foreach_thread tiling miscompiles when output comes from tensor.extract_slice

        </td>

    </tr>

    <tr>

      <th>Labels</th>

      <td>

            new issue

      </td>

    </tr>

    <tr>

      <th>Assignees</th>

      <td>

      </td>

    </tr>

    <tr>

      <th>Reporter</th>

      <td>

          ThomasRaoux

      </td>

    </tr>

</table>

<pre>

    Here is an example:

```

module {

  func.func @matmul(%A: tensor<4xf32>, %B: tensor<16xf32>) -> tensor<4xf32> {

    %B1 = tensor.extract_slice %B[10] [4] [1] : tensor<16xf32> to tensor<4xf32>

    %result = linalg.generic {indexing_maps = [

      affine_map<(d0) -> (d0)>,affine_map<(d0) -> (d0)>],

      iterator_types = ["parallel"]}

      ins(%A : tensor<4xf32>) outs(%B1 : tensor<4xf32>) {

      ^bb0(%arg3: f32, %arg4: f32):  // no predecessors

        %2 = arith.addf %arg3, %arg3 : f32

        linalg.yield %2 : f32

    } -> tensor<4xf32>

    return %result : tensor<4xf32>

  }

  transform.with_pdl_patterns {

  ^bb0(%arg0: !pdl.operation):

    transform.sequence %arg0 failures(propagate) {

    ^bb1(%arg1: !pdl.operation):

      %0 = transform.structured.match ops{["linalg.generic"]} in %arg1

      %1:2 = transform.structured.tile_to_foreach_thread_op %0 num_threads [2] (mapped to dims [0])

    }

  }

}

```

Running with `mlir-opt --test-transform-dialect-interpreter -canonicalize file.mlir` gives:

```

#map0 = affine_map<(d0) -> (d0 * 2)>

#map1 = affine_map<(d0) -> (d0 * 2 + 10)>

#map2 = affine_map<(d0) -> (d0)>

module {

  func.func @matmul(%arg0: tensor<4xf32>, %arg1: tensor<16xf32>) -> tensor<4xf32> {

    %c2 = arith.constant 2 : index

    %0 = tensor.extract_slice %arg1[10] [4] [1] : tensor<16xf32> to tensor<4xf32>

    %1 = scf.foreach_thread (%arg2) in (%c2) -> (tensor<4xf32>) {

      %2 = affine.apply #map0(%arg2)

      %3 = tensor.extract_slice %arg0[%2] [2] [1] : tensor<4xf32> to tensor<2xf32>

      %4 = affine.apply #map1(%arg2)

      %5 = tensor.extract_slice %arg1[%4] [2] [1] : tensor<16xf32> to tensor<2xf32>

      %6 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel"]} ins(%3 : tensor<2xf32>) outs(%5 : tensor<2xf32>) {

      ^bb0(%arg3: f32, %arg4: f32):

        %7 = arith.addf %arg3, %arg3 : f32

        linalg.yield %7 : f32

      } -> tensor<2xf32>

      scf.foreach_thread.perform_concurrently {

        tensor.parallel_insert_slice %6 into %0[%4] [2] [1] : tensor<2xf32> into tensor<4xf32>

      }

    } {thread_dim_mapping = [0]}

    return %1 : tensor<4xf32>

  }

}

```

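The offset of `tensor.parallel_insert_slice` should have been `%arg2 * 2`, but we get `%arg2 * 2 + 10`, which causes a miscompile: the destination `%0` is already the 4-element slice of `%arg1` taken at offset 10, so the extra `+ 10` is applied a second time.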

This is happening because the `foreach_thread` tiling code assumes the interface generates an `ExtractSliceOp` for the output operand:

https://github.com/llvm/llvm-project/blob/750ee8d56d0a9f8e93c032ad55841c46b1fbbbd7/mlir/lib/Dialect/Linalg/Transforms/Tiling.cpp#L343

There is always an `ExtractSliceOp`, but it may get merged with a producer `ExtractSliceOp` if one exists:

https://github.com/llvm/llvm-project/blob/750ee8d56d0a9f8e93c032ad55841c46b1fbbbd7/mlir/lib/Dialect/Linalg/Utils/Utils.cpp#L349

I don't see an obvious solution, as I don't know how to re-create those offsets within the general tiling function. One way to do it would be to stop composing the `ExtractSliceOp`s, but this wouldn't be very robust.

@nicolasvasilache @ftynse any thoughts? 

</pre>

<img width="1px" height="1px" alt="" src="http://email.email.llvm.org/o/eJzNV1mP2zYQ_jX2C2FBh2XLD37YZBM0QIoAbfpsUNLIYkGRKkntrvvrO0NJtuVjs4vkoYahg5yLc32jXJeH7W9ggAnLuGLwwptWwix5mIWPs_BhtgqHv39tdNlJYLP1h_6dsapTRUAXNluGDXdNJ2dxNovTB5TBHCirzSz5uHypkniWfJrFHxlufphsRqvj7oYt8H7Nd66SeQkRmyWPA2EAL87wwu2sFAX0CpAinKWPDB-Wwz3y95uamdM3jD1XaMB20nmlUigu98EeFBhRkGlClfAi1H7X8NZ6GlR3YmeMV5VQQNsoHx1Uhsezjm-9d95ImD4i7bkC4cBwp83OHVo4mRDHLTdcSsCoxMS1fpxwKTtEi90J14bpzg1E3ud3qCbxQYeln_I87Nm42SfER8R9-HFleVzZ0BOufsY_U5q1BkoowKIOey7SRyH2J-NGuDrgZVmxUf5RcMJGyRPeIWYHAbIcBV2QoW_uZN-JxoDrjJrkwy2HDPRHb48LmKXKVto0wTOeYNeWctdyh6FT9tyDF94LScksjpA80C3FWWjVO-5k2Um0hX86UH0dEDeruJAd2osCW6NbvucOrmLmdUZHndFbdPqQhH0hntQ70xXoJSgD7AdFzXRrSZPPxmnpHHMSE5GNeqfSyY74vgYnJOyc3uE68KLeuRrv5U63vWWqa4YlS-UQ-wYQZ1hdLZRU86Vo_E7oK2ozSYbrKB4fpl2xv5pOKWwBjCLLcK-Rwix069hi4cC6xdH-RSm4hMIthMLIY7rjlS0KrrQSBZfiX2AVHisgASiH7cUTxu7YkW9aECd4pj4SP-ogeH1g8dBHzrijd3Dj9QOLwmsh8ZuEnPO9A1LGUriHKmPa_hSwFOctptDKOq4c69uFb_MT6vBVFPIG_Wog6uNkiyqYJj07OomC21dU5g907vw3Ne9jn_VxDLBa5IENOTbRcsGV_Mgboe8CaTz4Ib7rj-Utd8RX7vBal_dsjV6zNX1L5Ej6D229Hbs7xq7eNUH0NdWnd__ogf8daH-C-GRqdXwT4tP7RD-H8FdIvv5VSL6-RXaN5bficV1EAUId9egdVn7RGQPKUTpNj87GtBm9vUMngzlLnxW6HZOBGsRbs2i0r-d8pQVMsKk_Kdo3AB_iGWVQS0A0ZEV4OfadZph789z7gO97DUxXlQWHN0K-17xDgGZr3WHsav4ELAdQxDPU6QBOSJR3jj0D26PUy-0RfZDquRY4YRS8s1gGnDXCFrppCT2nFuLXDf5rwn0P0jl4HubQ9iEDhi6KEwURFLoExq3tGhRMVB6sK16QSYqqD_wH06e-c_xJZ_vWkixPjRXV4gH85KTKY_7XzrUeyv20u8fc73IEmQZfpHwabwuc0v7GAQFfc6lzvK3TECAr01UZ8k2VwSYpwiTmZZpmy6hYrvKoyvO8XCOpnxpQjiC-x37SwKevvmrw4fs4hlh68YcNirbF_vI1WSYXcR2-C-UzP_jTosunBx4jJRxr-MFHqwGzx-HKj0Ecp3nE9wIHnAtH4YQmKmLDrmed_T956C9MATveT77ZnPvmCytpHF47ZgHIMzp_ErqzzGrZ0aiMucOmRLV-JoAwsCgw0RzlnrZj5VjvL8RsSp4-weSYijQKkciAfVPAMBR-cNXkvGdfSDnQinXoU8p-bYmLBF34nALlqBQ8W28Z8j6BOTCj8866SdXg9IUDqZbcPnErJJYI0ERWuQPWM575QCfo9jVF7zObwzZarcIsycJwOS-3SblJNnzuhJOwxTb0-9cvf1DTG6ptPNypYtGsGnvBUDm4iCuV0Q0bwDnw_WPeGbl9d5IILGT6_vmcruN4Pa-3K_wlfBWGiB9FGaX5Mi-yZRlnGVQQ5flc8hyk3faoquCZeRE9rM7FNg7jOMyiLErTJF4GZVzmURaXq5Bn4aoq0U3Q
4FdXQHYE2uznZutNyru9xU1JOX_axD4j9gq8n0g-79CzZvu91g23f3Ddvcy9-q03_z_hEiAs">