[Mlir-commits] [mlir] [mlir][test] Add TD example for peel+vectorize (depthwise conv) (PR #90200)

Andrzej Warzyński llvmlistbot at llvm.org
Mon Apr 29 13:29:30 PDT 2024


https://github.com/banach-space updated https://github.com/llvm/llvm-project/pull/90200

>From 70f32bf8e4abe3d1c743142855508ad8421d23c0 Mon Sep 17 00:00:00 2001
From: Andrzej Warzynski <andrzej.warzynski at arm.com>
Date: Fri, 26 Apr 2024 12:24:14 +0100
Subject: [PATCH 1/3] [mlir][test] Add TD example for peel+vectorize (depthwise
 conv)

Adds an example that combines loop peeling and scalable vectorisation of
`linalg.depthwise_conv_2d_nhwc_hwc`. This is similar to
transform-op-peel-and-vectorize.mlir and is meant to demonstrate how to
avoid masking when vectorising using scalable vectors.
---
 .../transform-op-peel-and-vectorize-conv.mlir | 87 +++++++++++++++++++
 1 file changed, 87 insertions(+)
 create mode 100644 mlir/test/Dialect/Linalg/transform-op-peel-and-vectorize-conv.mlir

diff --git a/mlir/test/Dialect/Linalg/transform-op-peel-and-vectorize-conv.mlir b/mlir/test/Dialect/Linalg/transform-op-peel-and-vectorize-conv.mlir
new file mode 100644
index 00000000000000..7ff813dbded6ae
--- /dev/null
+++ b/mlir/test/Dialect/Linalg/transform-op-peel-and-vectorize-conv.mlir
@@ -0,0 +1,87 @@
+// RUN: mlir-opt %s --transform-interpreter --split-input-file -resolve-shaped-type-result-dims -canonicalize | FileCheck %s
+
+// Demonstrates what happens when peeling the 4th loop (that corresponds to the
+// "depth" dimension in depthwise convs) followed by vectorization in the
+// presence of _scalable_ vectors (these are introduced through scalable
+// tiling). The main goal is to verify that canonicalizations fold away the
+// masks in the main loop.
+
+func.func @conv(%arg0: tensor<1x1080x1962x48xi32>, %arg1: tensor<1x43x48xi32>) -> tensor<1x1080x1920x48xi32> {
+// CHECK: #[[$MAP:.+]] = affine_map<()[s0] -> (-(48 mod s0) + 48)>
+
+// CHECK-LABEL:   func.func @conv(
+// CHECK-DAG:       %[[C_43:.*]] = arith.constant 43 : index
+// CHECK-DAG:       %[[C_48:.*]] = arith.constant 48 : index
+// CHECK-DAG:       %[[C1:.*]] = arith.constant 1 : index
+// CHECK-DAG:       %[[C0:.*]] = arith.constant 0 : index
+// CHECK-DAG:       %[[C4:.*]] = arith.constant 4 : index
+// CHECK:           %[[VSCALE:.*]] = vector.vscale
+// CHECK:           %[[VSCALE_X_4:.*]] = arith.muli %[[VSCALE]], %[[C4]] : index
+
+// Loop over the channel/depth dim - the main part after vectorisation (no masking)
+
+// CHECK:               %[[UB_DEPTH_LOOP:.*]] = affine.apply #[[$MAP]](){{\[}}%[[VSCALE_X_4]]]
+// CHECK-NEXT:          %[[VAL_21:.*]] = scf.for {{.*}} to %[[UB_DEPTH_LOOP]] step %[[VSCALE_X_4]]
+
+// CHECK:                 scf.for %{{.*}} = %[[C0]] to %[[C_43]] step %[[C1]] {{.*}} -> (tensor<1x1x4x?xi32>) {
+// CHECK-NOT:               vector.mask
+// CHECK:                   vector.broadcast {{.*}} : vector<[4]xi32> to vector<1x4x[4]xi32>
+// CHECK-NEXT:              arith.muli {{.*}} : vector<1x4x[4]xi32>
+// CHECK-NEXT:              arith.addi {{.*}} : vector<1x4x[4]xi32>
+// CHECK-NOT:               vector.mask
+// CHECK:                   scf.yield {{.*}} : tensor<1x1x4x?xi32>
+// CHECK:                 }
+// CHECK:                 tensor.insert_slice {{.*}}  tensor<1x1x4x?xi32> into tensor<1x1080x1920x48xi32>
+// CHECK:                 scf.yield {{.*}} : tensor<1x1080x1920x48xi32>
+
+// CHECK-NEXT:          }
+
+// Loop over the channel/depth dim - the remainder part (no vectorisation)
+
+// CHECK:               scf.for {{.*}} to %[[C_48]] step %[[VSCALE_X_4]]
+// CHECK:                   linalg.depthwise_conv_1d_nwc_wc {{.*}} -> tensor<1x4x?xi32>
+// CHECK:                   scf.yield %{{.*}} : tensor<1x1x4x?xi32>
+// CHECK:                 }
+// CHECK:                 tensor.insert_slice {{.*}} tensor<1x1x4x?xi32> into tensor<1x1080x1920x48xi32>
+// CHECK-NEXT:            scf.yield %{{.*}} : tensor<1x1080x1920x48xi32>
+// CHECK-NEXT:          }
+
+
+  %0 = tensor.empty() : tensor<1x1080x1920x48xi32>
+  %c0_i32 = arith.constant 0 : i32
+  %1 = linalg.fill ins(%c0_i32 : i32) outs(%0 : tensor<1x1080x1920x48xi32>) -> tensor<1x1080x1920x48xi32>
+  %2 = linalg.depthwise_conv_2d_nhwc_hwc {
+    dilations = dense<1> : tensor<2xi64>,
+    strides = dense<1> : tensor<2xi64>}
+    ins(%arg0, %arg1 : tensor<1x1080x1962x48xi32>, tensor<1x43x48xi32>) outs(%1 : tensor<1x1080x1920x48xi32>) -> tensor<1x1080x1920x48xi32>
+  return %2 : tensor<1x1080x1920x48xi32>
+}
+
+module attributes {transform.with_named_sequence} {
+  transform.named_sequence @__transform_main(%root: !transform.any_op {transform.consume}) {
+    // 1. Tile parallel dims
+    %1 = transform.structured.match ops{["linalg.depthwise_conv_2d_nhwc_hwc"]} in %root : (!transform.any_op) -> !transform.any_op
+    %tiled_linalg_op_0, %loops_1:4 = transform.structured.tile_using_for %1[1, 1, 4, [4], 0, 0] : (!transform.any_op) -> (!transform.any_op, !transform.op<"scf.for">, !transform.op<"scf.for">, !transform.op<"scf.for">, !transform.op<"scf.for">)
+
+    // 2. Tile reduction dims
+    %2 = transform.structured.match ops{["linalg.depthwise_conv_2d_nhwc_hwc"]} in %loops_1#3 : (!transform.op<"scf.for">) -> !transform.any_op
+    %tiled_linalg_op_1, %loops_2:2 = transform.structured.tile_using_for %2[0, 0, 0, 0, 1, 1] : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op)
+
+    // 3. Decompose 2D conv into 2 x 1D conv
+    %3 = transform.structured.match ops{["linalg.depthwise_conv_2d_nhwc_hwc"]} in %loops_1#3 : (!transform.op<"scf.for">) -> !transform.any_op
+    %4 = transform.structured.decompose %3 : (!transform.any_op) -> !transform.any_op
+    %f00 = transform.structured.match ops{["func.func"]} in %root
+        : (!transform.any_op) -> !transform.any_op
+    transform.apply_patterns to %f00 {
+        transform.apply_patterns.canonicalization
+        transform.apply_patterns.linalg.tiling_canonicalization
+    } : !transform.any_op
+
+    // 4. Apply loop peeling
+    %main_loop, %remainder_loop = transform.loop.peel %loops_1#3 : (!transform.op<"scf.for">) -> (!transform.op<"scf.for">, !transform.op<"scf.for">)
+    %5 = transform.structured.match ops{["linalg.depthwise_conv_1d_nwc_wc"]} in %main_loop : (!transform.op<"scf.for">) -> !transform.any_op
+
+    transform.structured.vectorize %5 vector_sizes [2, 4, [4], 16] : !transform.any_op
+    transform.yield
+  }
+}

>From 548ac63914f84f74dcf5dd9e05411cd5fc7be64a Mon Sep 17 00:00:00 2001
From: Andrzej Warzynski <andrzej.warzynski at arm.com>
Date: Fri, 26 Apr 2024 16:59:12 +0100
Subject: [PATCH 2/3] fixup! [mlir][test] Add TD example for peel+vectorize
 (depthwise conv)

Remove redundant transformations
---
 .../Linalg/transform-op-peel-and-vectorize-conv.mlir        | 6 ------
 1 file changed, 6 deletions(-)

diff --git a/mlir/test/Dialect/Linalg/transform-op-peel-and-vectorize-conv.mlir b/mlir/test/Dialect/Linalg/transform-op-peel-and-vectorize-conv.mlir
index 7ff813dbded6ae..d8419a72640316 100644
--- a/mlir/test/Dialect/Linalg/transform-op-peel-and-vectorize-conv.mlir
+++ b/mlir/test/Dialect/Linalg/transform-op-peel-and-vectorize-conv.mlir
@@ -70,12 +70,6 @@ module attributes {transform.with_named_sequence} {
     // 3. Decompose 2D conv into 2 x 1D conv
     %3 = transform.structured.match ops{["linalg.depthwise_conv_2d_nhwc_hwc"]} in %loops_1#3 : (!transform.op<"scf.for">) -> !transform.any_op
     %4 = transform.structured.decompose %3 : (!transform.any_op) -> !transform.any_op
-    %f00 = transform.structured.match ops{["func.func"]} in %root
-        : (!transform.any_op) -> !transform.any_op
-    transform.apply_patterns to %f00 {
-        transform.apply_patterns.canonicalization
-        transform.apply_patterns.linalg.tiling_canonicalization
-    } : !transform.any_op
 
     // 4. Apply loop peeling
     %main_loop, %remainder_loop = transform.loop.peel %loops_1#3 : (!transform.op<"scf.for">) -> (!transform.op<"scf.for">, !transform.op<"scf.for">)

>From 6c96cc0faab5f95fd7fa3a270e071ce89f3f17f0 Mon Sep 17 00:00:00 2001
From: Andrzej Warzynski <andrzej.warzynski at arm.com>
Date: Mon, 29 Apr 2024 21:28:55 +0100
Subject: [PATCH 3/3] fixup! fixup! [mlir][test] Add TD example for
 peel+vectorize (depthwise conv)

Minor improvements
---
 .../transform-op-peel-and-vectorize-conv.mlir      | 14 ++++++++------
 1 file changed, 8 insertions(+), 6 deletions(-)

diff --git a/mlir/test/Dialect/Linalg/transform-op-peel-and-vectorize-conv.mlir b/mlir/test/Dialect/Linalg/transform-op-peel-and-vectorize-conv.mlir
index d8419a72640316..7f3997633a307d 100644
--- a/mlir/test/Dialect/Linalg/transform-op-peel-and-vectorize-conv.mlir
+++ b/mlir/test/Dialect/Linalg/transform-op-peel-and-vectorize-conv.mlir
@@ -18,11 +18,10 @@ func.func @conv(%arg0: tensor<1x1080x1962x48xi32>, %arg1: tensor<1x43x48xi32>) -
 // CHECK:           %[[VSCALE:.*]] = vector.vscale
 // CHECK:           %[[VSCALE_X_4:.*]] = arith.muli %[[VSCALE]], %[[C4]] : index
 
-// Loop over the channel/depth dim - the main part after vectorisation (no masking)
-
+// Loop over the channel/depth dim - the main part after vectorisation (vectorized, no masking)
 // CHECK:               %[[UB_DEPTH_LOOP:.*]] = affine.apply #[[$MAP]](){{\[}}%[[VSCALE_X_4]]]
 // CHECK-NEXT:          %[[VAL_21:.*]] = scf.for {{.*}} to %[[UB_DEPTH_LOOP]] step %[[VSCALE_X_4]]
-
+// Loop over the Filter width dim
 // CHECK:                 scf.for %{{.*}} = %[[C0]] to %[[C_43]] step %[[C1]] {{.*}} -> (tensor<1x1x4x?xi32>) {
 // CHECK-NOT:               vector.mask
 // CHECK:                   vector.broadcast {{.*}} : vector<[4]xi32> to vector<1x4x[4]xi32>
@@ -36,9 +35,10 @@ func.func @conv(%arg0: tensor<1x1080x1962x48xi32>, %arg1: tensor<1x43x48xi32>) -
 
 // CHECK-NEXT:          }
 
-// Loop over the channel/depth dim - the remainder part (no vectorisation)
-
+// Loop over the channel/depth dim - the remainder part (not vectorized)
 // CHECK:               scf.for {{.*}} to %[[C_48]] step %[[VSCALE_X_4]]
+// Loop over the Filter width dim
+// CHECK:                 scf.for %{{.*}} = %[[C0]] to %[[C_43]] step %[[C1]] {{.*}} -> (tensor<1x1x4x?xi32>) {
 // CHECK:                   linalg.depthwise_conv_1d_nwc_wc {{.*}} -> tensor<1x4x?xi32>
 // CHECK:                   scf.yield %{{.*}} : tensor<1x1x4x?xi32>
 // CHECK:                 }
@@ -71,11 +71,13 @@ module attributes {transform.with_named_sequence} {
     %3 = transform.structured.match ops{["linalg.depthwise_conv_2d_nhwc_hwc"]} in %loops_1#3 : (!transform.op<"scf.for">) -> !transform.any_op
     %4 = transform.structured.decompose %3 : (!transform.any_op) -> !transform.any_op
 
-    // 4. Apply loop peeling
+    // 4. Apply loop peeling - only the 4th loop
     %main_loop, %remainder_loop = transform.loop.peel %loops_1#3 : (!transform.op<"scf.for">) -> (!transform.op<"scf.for">, !transform.op<"scf.for">)
     %5 = transform.structured.match ops{["linalg.depthwise_conv_1d_nwc_wc"]} in %main_loop : (!transform.op<"scf.for">) -> !transform.any_op
 
+    // 5. Vectorize, but only the main loop
     transform.structured.vectorize %5 vector_sizes [2, 4, [4], 16] : !transform.any_op
+
     transform.yield
   }
 }



More information about the Mlir-commits mailing list