<table border="1" cellspacing="0" cellpadding="8">
<tr>
<th>Issue</th>
<td>
<a href="https://github.com/llvm/llvm-project/issues/133964">133964</a>
</td>
</tr>
<tr>
<th>Summary</th>
<td>
[mlir][Bufferization] Bufferization does not seem to handle well degenerate slices.
</td>
</tr>
<tr>
<th>Labels</th>
<td>
mlir,
mlir:bufferization
</td>
</tr>
<tr>
<th>Assignees</th>
<td>
</td>
</tr>
<tr>
<th>Reporter</th>
<td>
MaheshRavishankar
</td>
</tr>
</table>
<pre>
I have long had this issue with bufferization where it somehow needs to always need a `tensor.extract_slice` to "do the right thing" and gets thrown off if the `extract_slice` isnt there.
For example, take these two examples
```
func.func @test(%14: index, %0 : memref<8x16xf16>, %1 : memref<8xi32>, %2 : memref<?x16xf16>) {
%16 = bufferization.to_tensor %0 restrict : memref<8x16xf16> to tensor<8x16xf16>
%17 = bufferization.to_tensor %1 restrict : memref<8xi32> to tensor<8xi32>
%18 = bufferization.to_tensor %2 restrict : memref<?x16xf16> to tensor<?x16xf16>
%19 = scf.forall (%arg0) in (2) shared_outs(%arg1 = %18) -> (tensor<?x16xf16>) {
%20 = affine.apply affine_map<(d0) -> (d0 * 8)>(%arg0)
%extracted_slice = tensor.extract_slice %arg1[0, %20] [%14, 8] [1, 1] : tensor<?x16xf16> to tensor<?x8xf16>
%21 = scf.forall (%arg2, %arg3) in (8, 1) shared_outs(%arg4 = %extracted_slice) -> (tensor<?x8xf16>) {
%extracted_slice_0 = tensor.extract_slice %16[%arg2, %20] [1, 8] [1, 1] : tensor<8x16xf16> to tensor<1x8xf16>
%extracted_slice_1 = tensor.extract_slice %17[%arg2] [1] [1] : tensor<8xi32> to tensor<1xi32>
%22 = iree_linalg_ext.scatter {lowering_config = #iree_gpu.lowering_config<{thread = [1, 8], workgroup = [8, 8]}>} dimension_map = [0] unique_indices(true) ins(%extracted_slice_0, %extracted_slice_1 : tensor<1x8xf16>, tensor<1xi32>) outs(%arg4 : tensor<?x8xf16>)
{
^bb0(%arg5: f16, %arg6: f16):
iree_linalg_ext.yield %arg5 : f16
} -> tensor<?x8xf16>
scf.forall.in_parallel {
tensor.parallel_insert_slice %22 into %arg4[0, 0] [%14, 8] [1, 1] : tensor<?x8xf16> into tensor<?x8xf16>
}
} {mapping = [#gpu.thread<linear_dim_1>, #gpu.thread<linear_dim_0>]}
scf.forall.in_parallel {
tensor.parallel_insert_slice %21 into %arg1[0, %20] [%14, 8] [1, 1] : tensor<?x8xf16> into tensor<?x16xf16>
}
} {mapping = [#iree_codegen.workgroup_mapping<x>]}
bufferization.materialize_in_destination
%19 in restrict writable %2 : (tensor<?x16xf16>, memref<?x16xf16>) -> ()
return
}
// -----
func.func @test(%14: index, %0 : memref<8x16xf16>, %1 : memref<8xi32>, %2 : memref<?x16xf16>) {
%16 = bufferization.to_tensor %0 restrict : memref<8x16xf16> to tensor<8x16xf16>
%17 = bufferization.to_tensor %1 restrict : memref<8xi32> to tensor<8xi32>
%18 = bufferization.to_tensor %2 restrict : memref<?x16xf16> to tensor<?x16xf16>
%19 = scf.forall (%arg0) in (2) shared_outs(%arg1 = %18) -> (tensor<?x16xf16>) {
%20 = affine.apply affine_map<(d0) -> (d0 * 8)>(%arg0)
%extracted_slice = tensor.extract_slice %arg1[0, %20] [%14, 8] [1, 1] : tensor<?x16xf16> to tensor<?x8xf16>
%21 = scf.forall (%arg2, %arg3) in (8, 1) shared_outs(%arg4 = %extracted_slice) -> (tensor<?x8xf16>) {
%extracted_slice_0 = tensor.extract_slice %16[%arg2, %20] [1, 8] [1, 1] : tensor<8x16xf16> to tensor<1x8xf16>
%extracted_slice_1 = tensor.extract_slice %17[%arg2] [1] [1] : tensor<8xi32> to tensor<1xi32>
%extracted_slice_2 = tensor.extract_slice %arg4[0, 0] [%14, 8] [1, 1] : tensor<?x8xf16> to tensor<?x8xf16>
%22 = iree_linalg_ext.scatter {lowering_config = #iree_gpu.lowering_config<{thread = [1, 8], workgroup = [8, 8]}>} dimension_map = [0] unique_indices(true) ins(%extracted_slice_0, %extracted_slice_1 : tensor<1x8xf16>, tensor<1xi32>) outs(%extracted_slice_2 : tensor<?x8xf16>) {
^bb0(%arg5: f16, %arg6: f16):
iree_linalg_ext.yield %arg5 : f16
} -> tensor<?x8xf16>
scf.forall.in_parallel {
tensor.parallel_insert_slice %22 into %arg4[0, 0] [%14, 8] [1, 1] : tensor<?x8xf16> into tensor<?x8xf16>
}
} {mapping = [#gpu.thread<linear_dim_1>, #gpu.thread<linear_dim_0>]}
scf.forall.in_parallel {
tensor.parallel_insert_slice %21 into %arg1[0, %20] [%14, 8] [1, 1] : tensor<?x8xf16> into tensor<?x16xf16>
}
} {mapping = [#iree_codegen.workgroup_mapping<x>]}
bufferization.materialize_in_destination
%19 in restrict writable %2 : (tensor<?x16xf16>, memref<?x16xf16>) -> ()
return
```
The only difference here is the degenerate `extract_slice` that I manually added
```
%extracted_slice_2 = tensor.extract_slice %arg4[0, 0] [%14, 8] [1, 1] : tensor<?x8xf16> to tensor<?x8xf16>
```
This is effectively a no-op/foldable extract_slice. But the two produce different outputs
```
module {
func.func @test(%arg0: index, %arg1: memref<8x16xf16>, %arg2: memref<8xi32>, %arg3: memref<?x16xf16>) {
%c0 = arith.constant 0 : index
%dim = memref.dim %arg3, %c0 : memref<?x16xf16>
%alloc = memref.alloc(%dim) : memref<?x16xf16>
linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%arg3 : memref<?x16xf16>) outs(%alloc : memref<?x16xf16>) {
^bb0(%in: f16, %out: f16):
linalg.yield %in : f16
}
scf.forall (%arg4) in (2) {
%0 = affine.apply affine_map<(d0) -> (d0 * 8)>(%arg4)
%subview = memref.subview %alloc[0, %0] [%arg0, 8] [1, 1] : memref<?x16xf16> to memref<?x8xf16, strided<[16, 1], offset: ?>>
scf.forall (%arg5, %arg6) in (8, 1) {
%subview_0 = memref.subview %arg1[%arg5, %0] [1, 8] [1, 1] : memref<8x16xf16> to memref<1x8xf16, strided<[16, 1], offset: ?>>
%subview_1 = memref.subview %arg2[%arg5] [1] [1] : memref<8xi32> to memref<1xi32, strided<[1], offset: ?>>
%alloc_2 = memref.alloc(%arg0) : memref<?x8xf16>
iree_linalg_ext.scatter {lowering_config = #iree_gpu.lowering_config<{thread = [1, 8], workgroup = [8, 8]}>} dimension_map = [0] unique_indices(true) ins(%subview_0, %subview_1 : memref<1x8xf16, strided<[16, 1], offset: ?>>, memref<1xi32, strided<[1], offset: ?>>) outs(%alloc_2 : memref<?x8xf16>) {
^bb0(%arg7: f16, %arg8: f16):
iree_linalg_ext.yield %arg7 : f16
}
linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%alloc_2 : memref<?x8xf16>) outs(%subview : memref<?x8xf16, strided<[16, 1], offset: ?>>) {
^bb0(%in: f16, %out: f16):
linalg.yield %in : f16
}
} {mapping = [#gpu.thread<linear_dim_1>, #gpu.thread<linear_dim_0>]}
linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%subview : memref<?x8xf16, strided<[16, 1], offset: ?>>) outs(%subview : memref<?x8xf16, strided<[16, 1], offset: ?>>) {
^bb0(%in: f16, %out: f16):
linalg.yield %in : f16
}
} {mapping = [#iree_codegen.workgroup_mapping<x>]}
linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%alloc : memref<?x16xf16>) outs(%arg3 : memref<?x16xf16>) {
^bb0(%in: f16, %out: f16):
linalg.yield %in : f16
}
return
}
}
// -----
module {
func.func @test(%arg0: index, %arg1: memref<8x16xf16>, %arg2: memref<8xi32>, %arg3: memref<?x16xf16>) {
%c0 = arith.constant 0 : index
%dim = memref.dim %arg3, %c0 : memref<?x16xf16>
%alloc = memref.alloc(%dim) : memref<?x16xf16>
linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%arg3 : memref<?x16xf16>) outs(%alloc : memref<?x16xf16>) {
^bb0(%in: f16, %out: f16):
linalg.yield %in : f16
}
scf.forall (%arg4) in (2) {
%0 = affine.apply affine_map<(d0) -> (d0 * 8)>(%arg4)
%subview = memref.subview %alloc[0, %0] [%arg0, 8] [1, 1] : memref<?x16xf16> to memref<?x8xf16, strided<[16, 1], offset: ?>>
scf.forall (%arg5, %arg6) in (8, 1) {
%subview_0 = memref.subview %arg1[%arg5, %0] [1, 8] [1, 1] : memref<8x16xf16> to memref<1x8xf16, strided<[16, 1], offset: ?>>
%subview_1 = memref.subview %arg2[%arg5] [1] [1] : memref<8xi32> to memref<1xi32, strided<[1], offset: ?>>
iree_linalg_ext.scatter {lowering_config = #iree_gpu.lowering_config<{thread = [1, 8], workgroup = [8, 8]}>} dimension_map = [0] unique_indices(true) ins(%subview_0, %subview_1 : memref<1x8xf16, strided<[16, 1], offset: ?>>, memref<1xi32, strided<[1], offset: ?>>) outs(%subview : memref<?x8xf16, strided<[16, 1], offset: ?>>) {
^bb0(%arg7: f16, %arg8: f16):
iree_linalg_ext.yield %arg7 : f16
}
} {mapping = [#gpu.thread<linear_dim_1>, #gpu.thread<linear_dim_0>]}
linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%subview : memref<?x8xf16, strided<[16, 1], offset: ?>>) outs(%subview : memref<?x8xf16, strided<[16, 1], offset: ?>>) {
^bb0(%in: f16, %out: f16):
linalg.yield %in : f16
}
} {mapping = [#iree_codegen.workgroup_mapping<x>]}
linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%alloc : memref<?x16xf16>) outs(%arg3 : memref<?x16xf16>) {
^bb0(%in: f16, %out: f16):
linalg.yield %in : f16
}
return
}
}
```
(Ignore the outer `alloca` that I don't know how to avoid; it is just an artifact of the test.) The inner alloca created in the first example is completely unnecessary. Bufferization has to be able to deal with the slice not being there.
@matthias-springer can you suggest how I can go about fixing this issue?
</pre>
<img width="1" height="1" alt="" src="http://email.email.llvm.org/o/eJzsW19vq7gS_zTuy6gITP4-5CFNt9J5uC9X-x45MID3gJ1rm6bdT39lGwgkoU273d2jKlHVBGPG49-MZ35jZKY1zwXiikwfyPTxjtWmkGr1H1agLv7LnrkumPjJ1N1Opq-rH1CwZ4RSihwKloIpuAaudY1w4KaAXZ1lqPifzHAp4FCgQuAGtKywkAcQiKkGI4GVB_aq3TUwILPQoNBSBfhiFEvMVpc8QTILbV9CaSrBFAiK54WxY4qcUApMpJCj0WAKJQ8CZJYBz1xPMgvPRHEt7MOoMCDhmoTrJ6kAX1i1L5HQDRj2E-19jWAOsr2jfV8yC5u_cJ3VIgnsPyCT0KA2hC4InUYTEq-BixRfrDhCpyHYlgorhRmJN4uXaPaSRTMS_9Z0iE478Jge79LhXRI_9QUsgcwfSLgGJ2gGJH4cwh8YufWwel0UaqN4YkaVsmD7B06UbQeZvzdINDqIn9jJCM1sW_GL98TTEfEDZAZjDDEL126cpRtHJ1mQScXKErz9mMpDCysXtoHan7pgCtOtrI3u-kTuaaev7XJvhyR0MTJkz0xukjR0j7Ms4wIDtt-Xr83FtmJ7-zhdpGFfcBoCoWuwgzmBPVWPYhtnx9S7uxvj0pKCZg5k-hC2XhaS6SPY1e9cmG5g0TRE9iJyF_F6BNMztBd9p_FTjkbxpo0OTOXxEfqFH3cE_0mL_8mcR42xuGiLS7BtwzeBi2YepZ7iHXjRFcCNLbXoFLSLukVv6zbv6dZq0f8eKnJhOUa95WhnRt14XCFuSy5YmW_xxQQ6YcagskCW8oCKi3ybSJHxvDFL7J7I93Vwct9aY_5gCoUs9X17qNnvg1Q_cyXrfXt30d2dP1rF5o-Q8gqF5lLY1dL2czaoBf9fjVsuUp6g9RajavQ-1fjOmbEbG14Cen3ROjZJnMFFl3Dmn-uRNUGXZLpxAPf9cPrbbhd2z0_t47Z_tzRmXcuSxDYZnVnllWOZNr2n0HbvBpg_-pUxulCPazPgYrtn9heWAy2hdb327pYLjarngpQCFy5fOxjaEPOZ8NLq5gW-FV_c7Dyij1bfiu33XOStaxAaW1f0XkfiTckFMrVNebWNukQ72iW0Xbz7XQfSuxBFfYj-chR-A6Zh0vMTGMfI-VMiU8xRBN063DYdSbx5GSAxTNEVM6g4K_mfdvltU9SGC3frmAMiF9u77H1Q3LBdiUeW80YG3bxBgdqA3yZDhaZWdlyvqf2jT4Q-wb39-JYbfbvRtxt9u9G3b03fToen7_nU1-Tr97P1jVl-jlleMqiT5RnlFevlRjNvNPNGM7-YZva2BUm4_r1AkKJ8hZRb1VEkCH4PVLttSTdzVMxc3qE0BTPwAyomalZaZpGmmJ7vP_6S4f0MCbcxDJhlmBj-jHY6IOS93BP6lMkydYYZqBrAQ-12aN3-617JtE6wg9LYYLi38fAMkEqmtbVys0RGCL5jXScU3y2Pt0m-S8lv0HxHfq4i-s5Fk4ZBKm6KIJFCGyYM-FLDa9Z1TXnl-nrJgbtsydamkzU6cieHlaVM-pJcgwcl5ZVT8U0xPgUEznd5YufjNLUZuGJ73a7wS0R4A2k0pMNdWwPhJ57yWZ0bu5Kk2prXPepjmKFtXCS04XmDFhdejrnbwvlOodbjrw2SV1q7l3G5GOZbWZuzdOs-DdZdqrWcepBoz9LEkY9PTuqfE7r8VbXL5Fi7OLm63j1zPPQ9rGtqMOuloF4A8nXQWAgarw8Hd3wMohuwQd_Gy3hjRc1aWfZbZplGBziJn-xUTvkI9HhRjxBdqGmGZOU4-6b4uDB_n4GH0q-oO8ZK_K49-itTP9c_Gtef9vQfKU4u7hX0VLWNZ4peq6JzoSbHnYewtvA_85k-h_tG1UbncI0v9Q24_gr36FOhj1vuPGBuL-yEXahSTgqU-VmBsrgcMe3n
rUplflapdEEUBjH3e-a3ayxwNNgxll_s_glnOguZn8mKV2XGU8P-_bXat_eev8Md_jln-6irXeFhR8N_VZX7zT3oGso82CB-j41_nl9fS62hV-C3jc3_4cukYd15Kzu_uOz87ivjVnneKs9b5fnrVp63mvFfqRk_xgq_oAr5J4vNW01yq0k-U_7-K6XJd_aiX6gu-RD7GqtMhq_fCF38yIVU7kCDnYdNW7PQTZr1XjWmUhj4KeQBCnlwBzOeJU_dyQqu4Y9aG2ACmDI84ywxIP3xCl_dLAP4vUDgQqC696IhUcgMppa82I4ZV9q0RymsyETaXwbLV6iFwAS1Zuo1gIfBwZGCuVMiOwT3ltBISJGV_oiJFevfbAppYIfWw_unOsgkrJgxBWf6Xu9tEkYFkDABr7IGXec5auOm-8O15hLYTtYGMv7iZbVHWkj8BHfpKk6X8ZLd4SqaT-JpHE-Ws7tiNcvCeYRREs5nKZtHcUjpMttN6C6cTTCZ7-74ioZ0Gk7CKFpMo3gWxMkkThcYRksWzidJRiYhVoyXQVk-V4FU-Z0bdRXF8XI2uSvZDku98q5dlVx5t24v4vXg9bj38Du1ssLud3WuySQsuTb6KN5wU7pzPk7A9JFMHwawWyoxtEMqUTuYNWJlzVAwkZYIByzL_stsZw8d3NWqXBXG7LX1bFcr59wU9S5IZEXok9Wk-brfK_kHJobQJzdrTehTM_HnFf1_AAAA__8I3eYs">