<table border="1" cellspacing="0" cellpadding="8">
<tr>
<th>Issue</th>
<td>
<a href="https://github.com/llvm/llvm-project/issues/133964">133964</a>
</td>
</tr>
<tr>
<th>Summary</th>
<td>
[mlir][Bufferization] Bufferization does not seem to handle well degenerate slices.
</td>
</tr>
<tr>
<th>Labels</th>
<td>
mlir,
mlir:bufferization
</td>
</tr>
<tr>
<th>Assignees</th>
<td>
</td>
</tr>
<tr>
<th>Reporter</th>
<td>
MaheshRavishankar
</td>
</tr>
</table>
<pre>
I have long had this issue with bufferization where it somehow needs to always need a `tensor.extract_slice` to "do the right thing" and gets thrown off if the `extract_slice` isnt there.
For example, take these two examples
```
func.func @test(%14: index, %0 : memref<8x16xf16>, %1 : memref<8xi32>, %2 : memref<?x16xf16>) {
%16 = bufferization.to_tensor %0 restrict : memref<8x16xf16> to tensor<8x16xf16>
%17 = bufferization.to_tensor %1 restrict : memref<8xi32> to tensor<8xi32>
%18 = bufferization.to_tensor %2 restrict : memref<?x16xf16> to tensor<?x16xf16>
%19 = scf.forall (%arg0) in (2) shared_outs(%arg1 = %18) -> (tensor<?x16xf16>) {
%20 = affine.apply affine_map<(d0) -> (d0 * 8)>(%arg0)
%extracted_slice = tensor.extract_slice %arg1[0, %20] [%14, 8] [1, 1] : tensor<?x16xf16> to tensor<?x8xf16>
%21 = scf.forall (%arg2, %arg3) in (8, 1) shared_outs(%arg4 = %extracted_slice) -> (tensor<?x8xf16>) {
%extracted_slice_0 = tensor.extract_slice %16[%arg2, %20] [1, 8] [1, 1] : tensor<8x16xf16> to tensor<1x8xf16>
%extracted_slice_1 = tensor.extract_slice %17[%arg2] [1] [1] : tensor<8xi32> to tensor<1xi32>
%22 = iree_linalg_ext.scatter {lowering_config = #iree_gpu.lowering_config<{thread = [1, 8], workgroup = [8, 8]}>} dimension_map = [0] unique_indices(true) ins(%extracted_slice_0, %extracted_slice_1 : tensor<1x8xf16>, tensor<1xi32>) outs(%arg4 : tensor<?x8xf16>)
{
^bb0(%arg5: f16, %arg6: f16):
iree_linalg_ext.yield %arg5 : f16
} -> tensor<?x8xf16>
scf.forall.in_parallel {
tensor.parallel_insert_slice %22 into %arg4[0, 0] [%14, 8] [1, 1] : tensor<?x8xf16> into tensor<?x8xf16>
}
} {mapping = [#gpu.thread<linear_dim_1>, #gpu.thread<linear_dim_0>]}
scf.forall.in_parallel {
tensor.parallel_insert_slice %21 into %arg1[0, %20] [%14, 8] [1, 1] : tensor<?x8xf16> into tensor<?x16xf16>
}
} {mapping = [#iree_codegen.workgroup_mapping<x>]}
bufferization.materialize_in_destination
%19 in restrict writable %2 : (tensor<?x16xf16>, memref<?x16xf16>) -> ()
return
}
// -----
func.func @test(%14: index, %0 : memref<8x16xf16>, %1 : memref<8xi32>, %2 : memref<?x16xf16>) {
%16 = bufferization.to_tensor %0 restrict : memref<8x16xf16> to tensor<8x16xf16>
%17 = bufferization.to_tensor %1 restrict : memref<8xi32> to tensor<8xi32>
%18 = bufferization.to_tensor %2 restrict : memref<?x16xf16> to tensor<?x16xf16>
%19 = scf.forall (%arg0) in (2) shared_outs(%arg1 = %18) -> (tensor<?x16xf16>) {
%20 = affine.apply affine_map<(d0) -> (d0 * 8)>(%arg0)
%extracted_slice = tensor.extract_slice %arg1[0, %20] [%14, 8] [1, 1] : tensor<?x16xf16> to tensor<?x8xf16>
%21 = scf.forall (%arg2, %arg3) in (8, 1) shared_outs(%arg4 = %extracted_slice) -> (tensor<?x8xf16>) {
%extracted_slice_0 = tensor.extract_slice %16[%arg2, %20] [1, 8] [1, 1] : tensor<8x16xf16> to tensor<1x8xf16>
%extracted_slice_1 = tensor.extract_slice %17[%arg2] [1] [1] : tensor<8xi32> to tensor<1xi32>
%extracted_slice_2 = tensor.extract_slice %arg4[0, 0] [%14, 8] [1, 1] : tensor<?x8xf16> to tensor<?x8xf16>
%22 = iree_linalg_ext.scatter {lowering_config = #iree_gpu.lowering_config<{thread = [1, 8], workgroup = [8, 8]}>} dimension_map = [0] unique_indices(true) ins(%extracted_slice_0, %extracted_slice_1 : tensor<1x8xf16>, tensor<1xi32>) outs(%extracted_slice_2 : tensor<?x8xf16>) {
^bb0(%arg5: f16, %arg6: f16):
iree_linalg_ext.yield %arg5 : f16
} -> tensor<?x8xf16>
scf.forall.in_parallel {
tensor.parallel_insert_slice %22 into %arg4[0, 0] [%14, 8] [1, 1] : tensor<?x8xf16> into tensor<?x8xf16>
}
} {mapping = [#gpu.thread<linear_dim_1>, #gpu.thread<linear_dim_0>]}
scf.forall.in_parallel {
tensor.parallel_insert_slice %21 into %arg1[0, %20] [%14, 8] [1, 1] : tensor<?x8xf16> into tensor<?x16xf16>
}
} {mapping = [#iree_codegen.workgroup_mapping<x>]}
bufferization.materialize_in_destination
%19 in restrict writable %2 : (tensor<?x16xf16>, memref<?x16xf16>) -> ()
return
```
The only difference here is the degenerate `extract_slice` that I manually added
```
%extracted_slice_2 = tensor.extract_slice %arg4[0, 0] [%14, 8] [1, 1] : tensor<?x8xf16> to tensor<?x8xf16>
```
This is effectively a no-op/foldable extract_slice. But the two produce different outputs
```
module {
func.func @test(%arg0: index, %arg1: memref<8x16xf16>, %arg2: memref<8xi32>, %arg3: memref<?x16xf16>) {
%c0 = arith.constant 0 : index
%dim = memref.dim %arg3, %c0 : memref<?x16xf16>
%alloc = memref.alloc(%dim) : memref<?x16xf16>
linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%arg3 : memref<?x16xf16>) outs(%alloc : memref<?x16xf16>) {
^bb0(%in: f16, %out: f16):
linalg.yield %in : f16
}
scf.forall (%arg4) in (2) {
%0 = affine.apply affine_map<(d0) -> (d0 * 8)>(%arg4)
%subview = memref.subview %alloc[0, %0] [%arg0, 8] [1, 1] : memref<?x16xf16> to memref<?x8xf16, strided<[16, 1], offset: ?>>
scf.forall (%arg5, %arg6) in (8, 1) {
%subview_0 = memref.subview %arg1[%arg5, %0] [1, 8] [1, 1] : memref<8x16xf16> to memref<1x8xf16, strided<[16, 1], offset: ?>>
%subview_1 = memref.subview %arg2[%arg5] [1] [1] : memref<8xi32> to memref<1xi32, strided<[1], offset: ?>>
%alloc_2 = memref.alloc(%arg0) : memref<?x8xf16>
iree_linalg_ext.scatter {lowering_config = #iree_gpu.lowering_config<{thread = [1, 8], workgroup = [8, 8]}>} dimension_map = [0] unique_indices(true) ins(%subview_0, %subview_1 : memref<1x8xf16, strided<[16, 1], offset: ?>>, memref<1xi32, strided<[1], offset: ?>>) outs(%alloc_2 : memref<?x8xf16>) {
^bb0(%arg7: f16, %arg8: f16):
iree_linalg_ext.yield %arg7 : f16
}
linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%alloc_2 : memref<?x8xf16>) outs(%subview : memref<?x8xf16, strided<[16, 1], offset: ?>>) {
^bb0(%in: f16, %out: f16):
linalg.yield %in : f16
}
} {mapping = [#gpu.thread<linear_dim_1>, #gpu.thread<linear_dim_0>]}
linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%subview : memref<?x8xf16, strided<[16, 1], offset: ?>>) outs(%subview : memref<?x8xf16, strided<[16, 1], offset: ?>>) {
^bb0(%in: f16, %out: f16):
linalg.yield %in : f16
}
} {mapping = [#iree_codegen.workgroup_mapping<x>]}
linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%alloc : memref<?x16xf16>) outs(%arg3 : memref<?x16xf16>) {
^bb0(%in: f16, %out: f16):
linalg.yield %in : f16
}
return
}
}
// -----
module {
func.func @test(%arg0: index, %arg1: memref<8x16xf16>, %arg2: memref<8xi32>, %arg3: memref<?x16xf16>) {
%c0 = arith.constant 0 : index
%dim = memref.dim %arg3, %c0 : memref<?x16xf16>
%alloc = memref.alloc(%dim) : memref<?x16xf16>
linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%arg3 : memref<?x16xf16>) outs(%alloc : memref<?x16xf16>) {
^bb0(%in: f16, %out: f16):
linalg.yield %in : f16
}
scf.forall (%arg4) in (2) {
%0 = affine.apply affine_map<(d0) -> (d0 * 8)>(%arg4)
%subview = memref.subview %alloc[0, %0] [%arg0, 8] [1, 1] : memref<?x16xf16> to memref<?x8xf16, strided<[16, 1], offset: ?>>
scf.forall (%arg5, %arg6) in (8, 1) {
%subview_0 = memref.subview %arg1[%arg5, %0] [1, 8] [1, 1] : memref<8x16xf16> to memref<1x8xf16, strided<[16, 1], offset: ?>>
%subview_1 = memref.subview %arg2[%arg5] [1] [1] : memref<8xi32> to memref<1xi32, strided<[1], offset: ?>>
iree_linalg_ext.scatter {lowering_config = #iree_gpu.lowering_config<{thread = [1, 8], workgroup = [8, 8]}>} dimension_map = [0] unique_indices(true) ins(%subview_0, %subview_1 : memref<1x8xf16, strided<[16, 1], offset: ?>>, memref<1xi32, strided<[1], offset: ?>>) outs(%subview : memref<?x8xf16, strided<[16, 1], offset: ?>>) {
^bb0(%arg7: f16, %arg8: f16):
iree_linalg_ext.yield %arg7 : f16
}
} {mapping = [#gpu.thread<linear_dim_1>, #gpu.thread<linear_dim_0>]}
linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%subview : memref<?x8xf16, strided<[16, 1], offset: ?>>) outs(%subview : memref<?x8xf16, strided<[16, 1], offset: ?>>) {
^bb0(%in: f16, %out: f16):
linalg.yield %in : f16
}
} {mapping = [#iree_codegen.workgroup_mapping<x>]}
linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%alloc : memref<?x16xf16>) outs(%arg3 : memref<?x16xf16>) {
^bb0(%in: f16, %out: f16):
linalg.yield %in : f16
}
return
}
}
```
(Ignore the outer `alloca` that I don't know how to avoid; it is just an artifact of the test.) The inner alloca created in the first example is completely unnecessary. Bufferization has to be able to deal with the slice not being there.
@matthias-springer can you suggest how I can go about fixing this issue?
</pre>
<img width="1" height="1" alt="" src="http://email.email.llvm.org/o/eJzsW19vq7gS_zTuy6gITP4-5CFNt9J5uC9X-x45MID3gJ1rm6bdT39lGwgkoU273d2jKlHVBGPG49-MZ35jZKY1zwXiikwfyPTxjtWmkGr1H1agLv7LnrkumPjJ1N1Opq-rH1CwZ4RSihwKloIpuAaudY1w4KaAXZ1lqPifzHAp4FCgQuAGtKywkAcQiKkGI4GVB_aq3TUwILPQoNBSBfhiFEvMVpc8QTILbV9CaSrBFAiK54WxY4qcUApMpJCj0WAKJQ8CZJYBz1xPMgvPRHEt7MOoMCDhmoTrJ6kAX1i1L5HQDRj2E-19jWAOsr2jfV8yC5u_cJ3VIgnsPyCT0KA2hC4InUYTEq-BixRfrDhCpyHYlgorhRmJN4uXaPaSRTMS_9Z0iE478Jge79LhXRI_9QUsgcwfSLgGJ2gGJH4cwh8YufWwel0UaqN4YkaVsmD7B06UbQeZvzdINDqIn9jJCM1sW_GL98TTEfEDZAZjDDEL126cpRtHJ1mQScXKErz9mMpDCysXtoHan7pgCtOtrI3u-kTuaaev7XJvhyR0MTJkz0xukjR0j7Ms4wIDtt-Xr83FtmJ7-zhdpGFfcBoCoWuwgzmBPVWPYhtnx9S7uxvj0pKCZg5k-hC2XhaS6SPY1e9cmG5g0TRE9iJyF_F6BNMztBd9p_FTjkbxpo0OTOXxEfqFH3cE_0mL_8mcR42xuGiLS7BtwzeBi2YepZ7iHXjRFcCNLbXoFLSLukVv6zbv6dZq0f8eKnJhOUa95WhnRt14XCFuSy5YmW_xxQQ6YcagskCW8oCKi3ybSJHxvDFL7J7I93Vwct9aY_5gCoUs9X17qNnvg1Q_cyXrfXt30d2dP1rF5o-Q8gqF5lLY1dL2czaoBf9fjVsuUp6g9RajavQ-1fjOmbEbG14Cen3ROjZJnMFFl3Dmn-uRNUGXZLpxAPf9cPrbbhd2z0_t47Z_tzRmXcuSxDYZnVnllWOZNr2n0HbvBpg_-pUxulCPazPgYrtn9heWAy2hdb327pYLjarngpQCFy5fOxjaEPOZ8NLq5gW-FV_c7Dyij1bfiu33XOStaxAaW1f0XkfiTckFMrVNebWNukQ72iW0Xbz7XQfSuxBFfYj-chR-A6Zh0vMTGMfI-VMiU8xRBN063DYdSbx5GSAxTNEVM6g4K_mfdvltU9SGC3frmAMiF9u77H1Q3LBdiUeW80YG3bxBgdqA3yZDhaZWdlyvqf2jT4Q-wb39-JYbfbvRtxt9u9G3b03fToen7_nU1-Tr97P1jVl-jlleMqiT5RnlFevlRjNvNPNGM7-YZva2BUm4_r1AkKJ8hZRb1VEkCH4PVLttSTdzVMxc3qE0BTPwAyomalZaZpGmmJ7vP_6S4f0MCbcxDJhlmBj-jHY6IOS93BP6lMkydYYZqBrAQ-12aN3-617JtE6wg9LYYLi38fAMkEqmtbVys0RGCL5jXScU3y2Pt0m-S8lv0HxHfq4i-s5Fk4ZBKm6KIJFCGyYM-FLDa9Z1TXnl-nrJgbtsydamkzU6cieHlaVM-pJcgwcl5ZVT8U0xPgUEznd5YufjNLUZuGJ73a7wS0R4A2k0pMNdWwPhJ57yWZ0bu5Kk2prXPepjmKFtXCS04XmDFhdejrnbwvlOodbjrw2SV1q7l3G5GOZbWZuzdOs-DdZdqrWcepBoz9LEkY9PTuqfE7r8VbXL5Fi7OLm63j1zPPQ9rGtqMOuloF4A8nXQWAgarw8Hd3wMohuwQd_Gy3hjRc1aWfZbZplGBziJn-xUTvkI9HhRjxBdqGmGZOU4-6b4uDB_n4GH0q-oO8ZK_K49-itTP9c_Gtef9vQfKU4u7hX0VLWNZ4peq6JzoSbHnYewtvA_85k-h_tG1UbncI0v9Q24_gr36FOhj1vuPGBuL-yEXahSTgqU-VmBsrgcMe3n
rUplflapdEEUBjH3e-a3ayxwNNgxll_s_glnOguZn8mKV2XGU8P-_bXat_eev8Md_jln-6irXeFhR8N_VZX7zT3oGso82CB-j41_nl9fS62hV-C3jc3_4cukYd15Kzu_uOz87ivjVnneKs9b5fnrVp63mvFfqRk_xgq_oAr5J4vNW01yq0k-U_7-K6XJd_aiX6gu-RD7GqtMhq_fCF38yIVU7kCDnYdNW7PQTZr1XjWmUhj4KeQBCnlwBzOeJU_dyQqu4Y9aG2ACmDI84ywxIP3xCl_dLAP4vUDgQqC696IhUcgMppa82I4ZV9q0RymsyETaXwbLV6iFwAS1Zuo1gIfBwZGCuVMiOwT3ltBISJGV_oiJFevfbAppYIfWw_unOsgkrJgxBWf6Xu9tEkYFkDABr7IGXec5auOm-8O15hLYTtYGMv7iZbVHWkj8BHfpKk6X8ZLd4SqaT-JpHE-Ws7tiNcvCeYRREs5nKZtHcUjpMttN6C6cTTCZ7-74ioZ0Gk7CKFpMo3gWxMkkThcYRksWzidJRiYhVoyXQVk-V4FU-Z0bdRXF8XI2uSvZDku98q5dlVx5t24v4vXg9bj38Du1ssLud3WuySQsuTb6KN5wU7pzPk7A9JFMHwawWyoxtEMqUTuYNWJlzVAwkZYIByzL_stsZw8d3NWqXBXG7LX1bFcr59wU9S5IZEXok9Wk-brfK_kHJobQJzdrTehTM_HnFf1_AAAA__8I3eYs">