[Mlir-commits] [mlir] [MLIR] Add allow Insert/extract slice option to pack/unpack op (PR #117340)

Mon Dec 9 14:07:56 PST 2024

================
@@ -0,0 +1,238 @@
+// RUN: mlir-opt %s --transform-interpreter --split-input-file -canonicalize | FileCheck %s
+
+// For the pack op, we set lowerPadLikeWithInsertSlice = false to ensure that no insert_slice is generated.
+// This allows linalg.transpose to be fused as a producer operation. Without this attribute,
+// an insert_slice is generated and fusion is blocked.
+
+module {
+  // Expect the lowered pack (a linalg.transpose) to end up inside the tiled
+  // scf.forall together with the consumer linalg.generic.
+  // CHECK-LABEL: func @fuse_pack_as_producer
+  // CHECK:       scf.forall {{.*}} {
+  // CHECK:         linalg.transpose
+  // CHECK:         linalg.generic
+  // CHECK:         scf.forall.in_parallel
+  // CHECK:       }
+  func.func @fuse_pack_as_producer(%src: tensor<128x256xf32>, %other: tensor<4x4x128x256xf32>)
+      -> tensor<4x4x128x256xf32> {
+    // Producer: pack the whole source into a single 128x256 inner tile.
+    %dest = tensor.empty() : tensor<1x1x128x256xf32>
+    %pack = tensor.pack %src inner_dims_pos = [0, 1] inner_tiles = [128, 256]
+        into %dest : tensor<128x256xf32> -> tensor<1x1x128x256xf32>
+
+    // Consumer: element-wise add of the packed tile (read at outer indices
+    // (0, 0) by the first indexing map) with %other.
+    %out = tensor.empty() : tensor<4x4x128x256xf32>
+    %res = linalg.generic
+        {indexing_maps = [affine_map<(i, j, k, l) -> (0, 0, k, l)>,
+                          affine_map<(i, j, k, l) -> (i, j, k, l)>,
+                          affine_map<(i, j, k, l) -> (i, j, k, l)>],
+         iterator_types = ["parallel", "parallel", "parallel", "parallel"]}
+        ins(%pack, %other: tensor<1x1x128x256xf32>, tensor<4x4x128x256xf32>)
+        outs(%out: tensor<4x4x128x256xf32>) {
+      ^bb0(%pack_elem: f32, %other_elem: f32, %out_elem: f32):
+        %r = arith.addf %pack_elem, %other_elem : f32
+        linalg.yield %r : f32
+    } -> tensor<4x4x128x256xf32>
+
+    return %res : tensor<4x4x128x256xf32>
+  }
+
+  module attributes {transform.with_named_sequence} {
+    transform.named_sequence @__transform_main(%arg1: !transform.any_op {transform.readonly}) {
+      // Find and lower pack operation. lowerPadLikeWithInsertSlice = false is
+      // set so that no tensor.insert_slice is generated by the lowering.
+      %pack = transform.structured.match ops{["tensor.pack"]} in %arg1
+        : (!transform.any_op) -> !transform.op<"tensor.pack">
+      %padded, %expanded, %transpose = transform.structured.lower_pack %pack {lowerPadLikeWithInsertSlice = false}
+        : (!transform.op<"tensor.pack">)
+        -> (!transform.op<"tensor.pad">,
+            !transform.op<"tensor.expand_shape">,
+            !transform.op<"linalg.transpose">)
+
+      %root = transform.structured.match ops{["linalg.generic"]} in %arg1
+          : (!transform.any_op) -> !transform.any_op
+      // Tile the linalg operation with parallel forall loop tiling [4, 4].
+      %tiled_op, %forall_op = transform.structured.tile_using_forall %root num_threads [4, 4]
+          : (!transform.any_op) -> (!transform.any_op, !transform.any_op)
+
+      // Fuse the transpose operation into the tiled loop.
+      transform.structured.fuse_into_containing_op %transpose into %forall_op
+          : (!transform.op<"linalg.transpose">, !transform.any_op) -> (!transform.any_op, !transform.any_op)
+      transform.yield
+    }
+  }
+}
+
+// -----
+// For pack op, by default lowerPadLikeWithInsertSlice = true, which generates insert_slice and blocks fusion.
+
+module {
+  // CHECK-LABEL: func @fuse_pack_as_producer_blocked_by_insert_slice
+  // CHECK:       tensor.insert_slice
+  // CHECK:       scf.forall {{.*}} {
+  // CHECK:         scf.forall.in_parallel
----------------
jerryyin wrote:

Good point!

This is addressed in the latest commit. I'll leave this review open for a day or two in case you've seen anything else worth mentioning.

https://github.com/llvm/llvm-project/pull/117340