[Mlir-commits] [mlir] [mlir][vector] Add mask elimination transform (PR #99314)

Fri Aug 2 14:44:37 PDT 2024

================
@@ -0,0 +1,138 @@
+// RUN: mlir-opt %s -split-input-file -test-eliminate-vector-masks  | FileCheck %s
+
+// This tests a general pattern the vectorizer tends to emit.
+
+// CHECK-LABEL: @eliminate_redundant_masks_through_insert_and_extracts
+// CHECK: %[[ALL_TRUE_MASK:.*]] = arith.constant dense<true> : vector<[4]xi1>
+// CHECK: vector.transfer_read {{.*}} %[[ALL_TRUE_MASK]]
+// CHECK: vector.transfer_write {{.*}} %[[ALL_TRUE_MASK]]
+func.func @eliminate_redundant_masks_through_insert_and_extracts(%tensor: tensor<1x1000xf32>) {
+  %c0 = arith.constant 0 : index
+  %c4 = arith.constant 4 : index
+  %c1000 = arith.constant 1000 : index
+  %c0_f32 = arith.constant 0.0 : f32
+  %vscale = vector.vscale
+  %c4_vscale = arith.muli %vscale, %c4 : index
+  %extracted_slice_0 = tensor.extract_slice %tensor[0, 0] [1, %c4_vscale] [1, 1] : tensor<1x1000xf32> to tensor<1x?xf32>
+  %output_tensor = scf.for %i = %c0 to %c1000 step %c4_vscale iter_args(%arg = %extracted_slice_0) -> tensor<1x?xf32> {
+    // 1. Extract a slice.
+    %extracted_slice_1 = tensor.extract_slice %arg[0, %i] [1, %c4_vscale] [1, 1] : tensor<1x?xf32> to tensor<?xf32>
+
+    // 2. Create a mask for the slice.
+    %dim_1 = tensor.dim %extracted_slice_1, %c0 : tensor<?xf32>
+    %mask = vector.create_mask %dim_1 : vector<[4]xi1>
+
+    // 3. Read the slice and do some computation.
+    %vec = vector.transfer_read %extracted_slice_1[%c0], %c0_f32, %mask {in_bounds = [true]} : tensor<?xf32>, vector<[4]xf32>
+    %new_vec = "test.some_computation"(%vec) : (vector<[4]xf32>) -> (vector<[4]xf32>)
+
+    // 4. Write the new value.
+    %write = vector.transfer_write %new_vec, %extracted_slice_1[%c0], %mask {in_bounds = [true]} : vector<[4]xf32>, tensor<?xf32>
+
+    // 5. Insert and yield the new tensor value.
+    %result = tensor.insert_slice %write into %arg[0, %i] [1, %c4_vscale] [1, 1] : tensor<?xf32> into tensor<1x?xf32>
+    scf.yield %result : tensor<1x?xf32>
+  }
+  "test.some_use"(%output_tensor) : (tensor<1x?xf32>) -> ()
+  return
+}
+
+// -----
+
+// CHECK-LABEL: @negative_extract_slice_size_shrink
+// CHECK-NOT: arith.constant dense<true> : vector<[4]xi1>
+// CHECK: %[[MASK:.*]] = vector.create_mask
+// CHECK: "test.some_use"(%[[MASK]]) : (vector<[4]xi1>) -> ()
+func.func @negative_extract_slice_size_shrink(%tensor: tensor<1000xf32>) {
+  %c0 = arith.constant 0 : index
+  %c4 = arith.constant 4 : index
+  %c1000 = arith.constant 1000 : index
+  %vscale = vector.vscale
+  %c4_vscale = arith.muli %vscale, %c4 : index
+  %extracted_slice = tensor.extract_slice %tensor[0] [%c4_vscale] [1] : tensor<1000xf32> to tensor<?xf32>
+  %slice = scf.for %i = %c0 to %c1000 step %c4_vscale iter_args(%arg = %extracted_slice) -> tensor<?xf32> {
+    // This mask cannot be eliminated even though looking at the above operations
+    // it appears `tensor.dim` will always be c4_vscale (so the mask all-true).
+    %dim = tensor.dim %arg, %c0 : tensor<?xf32>
+    %mask = vector.create_mask %dim : vector<[4]xi1>
+    "test.some_use"(%mask) : (vector<[4]xi1>) -> ()
+    // !!! Here the size of the mask could shrink in the next iteration.
+    %next_num_elts = affine.min  affine_map<(d0)[s0] -> (-d0 + 1000, s0)>(%i)[%c4_vscale]
+    %new_extracted_slice = tensor.extract_slice %tensor[%c4_vscale] [%next_num_elts] [1] : tensor<1000xf32> to tensor<?xf32>
+    scf.yield %new_extracted_slice : tensor<?xf32>
+  }
+  "test.some_use"(%slice) : (tensor<?xf32>) -> ()
+  return
+}
+
+// -----
+
+// CHECK-LABEL: @negative_constant_dim_not_all_true
+// CHECK-NOT: arith.constant dense<true> : vector<2x[4]xi1>
+// CHECK: %[[MASK:.*]] = vector.create_mask
+// CHECK: "test.some_use"(%[[MASK]]) : (vector<2x[4]xi1>) -> ()
+func.func @negative_constant_dim_not_all_true()
+{
+  %c1 = arith.constant 1 : index
+  %c4 = arith.constant 4 : index
+  %vscale = vector.vscale
+  %c4_vscale = arith.muli %vscale, %c4 : index
+  %mask = vector.create_mask %c1, %c4_vscale : vector<2x[4]xi1>
+  "test.some_use"(%mask) : (vector<2x[4]xi1>) -> ()
+  return
+}
+
+// -----
+
+// CHECK-LABEL: @negative_constant_vscale_multiple_not_all_true
+// CHECK-NOT: arith.constant dense<true> : vector<2x[4]xi1>
+// CHECK: %[[MASK:.*]] = vector.create_mask
+// CHECK: "test.some_use"(%[[MASK]]) : (vector<2x[4]xi1>) -> ()
+func.func @negative_constant_vscale_multiple_not_all_true() {
+  %c2 = arith.constant 2 : index
+  %c3 = arith.constant 3 : index
+  %vscale = vector.vscale
+  %c3_vscale = arith.muli %vscale, %c3 : index
+  %mask = vector.create_mask %c2, %c3_vscale : vector<2x[4]xi1>
+  "test.some_use"(%mask) : (vector<2x[4]xi1>) -> ()
+  return
+}
+
+// -----
+
+// CHECK-LABEL: @negative_value_bounds_fixed_dim_not_all_true
+// CHECK-NOT: arith.constant dense<true> : vector<3x[4]xi1>
+// CHECK: %[[MASK:.*]] = vector.create_mask
+// CHECK: "test.some_use"(%[[MASK]]) : (vector<3x[4]xi1>) -> ()
+func.func @negative_value_bounds_fixed_dim_not_all_true(%tensor: tensor<2x?xf32>)
+{
+  %c0 = arith.constant 0 : index
+  %c4 = arith.constant 4 : index
+  %vscale = vector.vscale
+  %c4_vscale = arith.muli %vscale, %c4 : index
+  // This is _very_ simple, but since tensor.dim is not a constant, value bounds
+  // will be used to resolve it.
----------------
banach-space wrote:

```suggestion
// This is very simple, but since tensor.dim is not a constant, value bounds will be used to resolve it.
```

https://github.com/llvm/llvm-project/pull/99314