[Mlir-commits] [mlir] [MLIR] Add pattern to bubble up tensor.extract_slice (PR #126898)

Wed Feb 12 03:53:48 PST 2025

https://github.com/ofri-frishman created https://github.com/llvm/llvm-project/pull/126898

Add a pattern that bubbles up tensor.extract_slice through tensor.expand_shape.
This pattern enables tiling and fusing op chains which contain tensor.expand_shape if added as a cleanup pattern of tile and fuse utility.
Without this pattern that would not be possible, as tensor.expand_shape does not implement the tiling interface. In addition, registering this pattern as a cleanup pattern for transform.structured.fuse.
The pattren was first implement in IREE project by Quinn Dawkins and is being upstreamed.

>From c59dc1a615fef8ed52d35482b26b1abf25a81d35 Mon Sep 17 00:00:00 2001
From: Ofri Frishman <ofri.frishman at mobileye.com>
Date: Wed, 12 Feb 2025 13:12:48 +0200
Subject: [PATCH] [MLIR] Add pattern to bubble up tensor.extract_slice

Add a pattern that bubbles up tensor.extract_slice through
tensor.expand_shape.
This pattern enables tiling and fusing op chains which contain
tensor.expand_shape if added as a cleanup pattern of tile and fuse
utility.
Without this pattern that would not be possible, as
tensor.expand_shape does not implement the tiling interface.
In addition, registering this pattern as a cleanup pattern for
transform.structured.fuse.
The pattren was first implement in IREE project by
Quinn Dawkins and is being upstreamed.
---
 .../Dialect/Tensor/Transforms/Transforms.h    |   6 +
 .../TransformOps/LinalgTransformOps.cpp       |   1 +
 .../Transforms/BubbleUpExtractSlice.cpp       | 207 ++++++++++++++++++
 .../Dialect/Tensor/Transforms/CMakeLists.txt  |   1 +
 .../Dialect/Linalg/transform-op-fuse.mlir     | 138 ++++++++++++
 5 files changed, 353 insertions(+)
 create mode 100644 mlir/lib/Dialect/Tensor/Transforms/BubbleUpExtractSlice.cpp

diff --git a/mlir/include/mlir/Dialect/Tensor/Transforms/Transforms.h b/mlir/include/mlir/Dialect/Tensor/Transforms/Transforms.h
index ae695e0326ca1..dc4558a605a59 100644
--- a/mlir/include/mlir/Dialect/Tensor/Transforms/Transforms.h
+++ b/mlir/include/mlir/Dialect/Tensor/Transforms/Transforms.h
@@ -58,6 +58,12 @@ void populateFoldTensorSubsetIntoVectorTransferPatterns(
 void populateMergeConsecutiveInsertExtractSlicePatterns(
     RewritePatternSet &patterns);
 
+/// Appends patterns that are used to bubble up tensor.extract slice op above
+/// its producer. When used as cleanup patterns of tile and fuse, enables fusing
+/// the producer with the consumer even if the producer does not implement the
+/// tiling interface.
+void populateBubbleUpExtractSliceOpPatterns(RewritePatternSet &patterns);
+
 /// Populates `patterns` with patterns that drop redundant tensor.insert_slice
 /// rank expansions.
 void populateDropRedundantInsertSliceRankExpansionPatterns(
diff --git a/mlir/lib/Dialect/Linalg/TransformOps/LinalgTransformOps.cpp b/mlir/lib/Dialect/Linalg/TransformOps/LinalgTransformOps.cpp
index 51d1df52598c7..5146bebe0108e 100644
--- a/mlir/lib/Dialect/Linalg/TransformOps/LinalgTransformOps.cpp
+++ b/mlir/lib/Dialect/Linalg/TransformOps/LinalgTransformOps.cpp
@@ -582,6 +582,7 @@ transform::FuseOp::apply(transform::TransformRewriter &rewriter,
     RewritePatternSet patterns(context);
     tensor::ExtractSliceOp::getCanonicalizationPatterns(patterns, context);
     tensor::populateMergeConsecutiveInsertExtractSlicePatterns(patterns);
+    tensor::populateBubbleUpExtractSliceOpPatterns(patterns);
     tileAndFuseOptions.cleanupPatterns = std::move(patterns);
   }
 
diff --git a/mlir/lib/Dialect/Tensor/Transforms/BubbleUpExtractSlice.cpp b/mlir/lib/Dialect/Tensor/Transforms/BubbleUpExtractSlice.cpp
new file mode 100644
index 0000000000000..a0d3c6d25bbe8
--- /dev/null
+++ b/mlir/lib/Dialect/Tensor/Transforms/BubbleUpExtractSlice.cpp
@@ -0,0 +1,207 @@
+//===- BubbleUpExtractSlice.cpp ---------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Swap a `tensor.extract_slice` with the producer of the source in some cases
+// where that is valid. When used as cleanup patterns of tile and fuse, enables
+// fusing the producer with the consumer even if the producer does not implement
+// the tiling interface.
+//
+//===----------------------------------------------------------------------===//
+
+#include "mlir/Dialect/Affine/IR/AffineOps.h"
+#include "mlir/Dialect/Arith/Utils/Utils.h"
+#include "mlir/Dialect/Tensor/Transforms/Transforms.h"
+#include "mlir/Dialect/Tensor/Utils/Utils.h"
+#include "mlir/IR/BuiltinTypes.h"
+#include "mlir/IR/OpDefinition.h"
+#include "mlir/IR/PatternMatch.h"
+#include "mlir/Interfaces/ValueBoundsOpInterface.h"
+
+using namespace mlir;
+using namespace mlir::tensor;
+
+/// Converts `tensor.extract_slice(tensor.expand_shape)` to
+/// `tensor.expand_shape(tensor.extract_slice)`.
+/// For this transformation to be possible, the slice must be fully contiguous
+/// within each reassociation group of the expand_shape. If the transformation
+/// is not possible, or if the slice is rank reducting, the function returns
+/// failure.
+///
+/// Example:
+/// ```
+/// %reshape = tensor.expand_shape %in [[0, 1], [2, 3], [4, 5, 6]]
+///     tensor<8x16x32xf32> to tensor<2x4x2x8x4x2x4xf32>
+/// %slice = tensor.extract_slice %reshape ...
+///     tensor<2x4x2x8x4x2x4xf32> to tensor<2x4x1x5x1x1x4xf32>
+///
+/// // The transformation is possible because each reassociation group has a
+/// // contiguous slice. (i.e., [2x4->2x4], [2x8->1x5], [4x2x4->1x1x4])
+/// // After the transformation:
+///
+/// %slice = tensor.extract_slice %in ...
+///     tensor<8x16x32xf32> to tensor<8x5x4xf32>
+/// %reshape = tensor.expand_shape %slice [[0, 1], [2, 3], [4, 5, 6]]
+///     tensor<8x5x4xf32> to tensor<2x4x1x5x1x1x4xf32>
+/// ```
+static LogicalResult
+swapExpandShapeWithSlice(RewriterBase &rewriter,
+                         tensor::ExpandShapeOp expandShapeOp,
+                         tensor::ExtractSliceOp sliceOp) {
+  SmallVector<OpFoldResult> offsets = sliceOp.getMixedOffsets();
+  SmallVector<OpFoldResult> sizes = sliceOp.getMixedSizes();
+
+  if (static_cast<size_t>(sliceOp.getResultType().getRank()) != sizes.size()) {
+    return rewriter.notifyMatchFailure(sliceOp,
+                                       "unimplemented: rank reducing slice");
+  }
+
+  // Helper variables and function for accumulating the new offset and length
+  // values.
+  Location loc = expandShapeOp->getLoc();
+  AffineExpr d0, d1, d2;
+  bindDims(rewriter.getContext(), d0, d1, d2);
+  // Multiply two integers.
+  auto mul = [&](OpFoldResult v1, OpFoldResult v2) {
+    auto mulMap = AffineMap::get(2, 0, {d0 * d1});
+    return affine::makeComposedFoldedAffineApply(rewriter, loc, mulMap,
+                                                 {v1, v2});
+  };
+
+  SmallVector<OpFoldResult> outputShape =
+      getMixedValues(expandShapeOp.getStaticOutputShape(),
+                     expandShapeOp.getOutputShape(), rewriter);
+
+  auto isZeroOffsetAndFullSize = [](OpFoldResult offset, OpFoldResult sliceSize,
+                                    OpFoldResult size) {
+    if (!isConstantIntValue(offset, 0))
+      return false;
+    FailureOr<bool> maybeEqual =
+        ValueBoundsConstraintSet::areEqual(sliceSize, size);
+    return llvm::succeeded(maybeEqual) && maybeEqual.value();
+  };
+
+  // First verify that this is a full slice of the expanded tensor.
+  for (const ReassociationIndices &indices :
+       expandShapeOp.getReassociationIndices()) {
+    int64_t i = 0;
+    int64_t e = indices.size();
+    // Find the first expanded dim after the first dim with non-unit extracted
+    // size.
+    for (; i < e; ++i) {
+      if (!isConstantIntValue(sizes[indices[i]], 1)) {
+        // +1 to skip the first non-unit size dim.
+        i++;
+        break;
+      }
+    }
+
+    // Verify that all subsequent dimensions extract the full size of the
+    // source tensor.
+    for (; i < e; ++i) {
+      int64_t expandedDim = indices[i];
+      if (!isZeroOffsetAndFullSize(offsets[expandedDim], sizes[expandedDim],
+                                   outputShape[expandedDim])) {
+        return rewriter.notifyMatchFailure(
+            sliceOp, "Not a contiguous slice of the expanded tensor.");
+      }
+    }
+  }
+
+  // Compute new offsets, lengths, and strides.
+  SmallVector<OpFoldResult> newOffsets, newLengths, newStrides;
+  for (const ReassociationIndices &indices :
+       expandShapeOp.getReassociationIndices()) {
+    OpFoldResult newSize = rewriter.getIndexAttr(1);
+    SmallVector<OpFoldResult> basis, delinOffsets;
+
+    int64_t i = 0;
+    int64_t e = indices.size();
+    // Offset = cumulative product of leading unit extracted dims.
+    for (; i < e; ++i) {
+      int64_t expandedDim = indices[i];
+      if (!isConstantIntValue(sizes[expandedDim], 1))
+        break;
+
+      basis.push_back(outputShape[expandedDim]);
+      delinOffsets.push_back(offsets[expandedDim]);
+    }
+
+    if (i != e) {
+      int64_t expandedDim = indices[i];
+      basis.push_back(outputShape[expandedDim]);
+      delinOffsets.push_back(offsets[expandedDim]);
+      newSize = sizes[expandedDim];
+      i++;
+    }
+
+    for (; i < e; ++i) {
+      OpFoldResult fullSize = outputShape[indices[i]];
+      basis.push_back(fullSize);
+      delinOffsets.push_back(rewriter.getIndexAttr(0));
+      newSize = mul(newSize, fullSize);
+    }
+    SmallVector<Value> offsetVals =
+        llvm::map_to_vector(delinOffsets, [&](OpFoldResult ofr) {
+          return getValueOrCreateConstantIndexOp(rewriter, loc, ofr);
+        });
+    OpFoldResult newOffset = rewriter
+                                 .create<affine::AffineLinearizeIndexOp>(
+                                     loc, offsetVals, basis, /*disjoint=*/true)
+                                 .getResult();
+    newOffsets.push_back(newOffset);
+    newLengths.push_back(newSize);
+
+    // Only unit stride supported.
+    newStrides.push_back(rewriter.getIndexAttr(1));
+  }
+
+  // The shape of the result can be obtained from the sizes passed in.
+  SmallVector<Value> dynDims;
+  SmallVector<int64_t> shape;
+  dispatchIndexOpFoldResults(sizes, dynDims, shape);
+  RankedTensorType resultType = RankedTensorType::get(
+      shape, expandShapeOp.getResultType().getElementType());
+
+  // Create a new ExtractSliceOp and ExpandShapeOp.
+  Value newSliceOp = rewriter.create<tensor::ExtractSliceOp>(
+      loc, expandShapeOp.getSrc(), newOffsets, newLengths, newStrides);
+  auto newExpandShapeOp = rewriter.create<tensor::ExpandShapeOp>(
+      loc, resultType, newSliceOp, expandShapeOp.getReassociationIndices(),
+      sizes);
+  rewriter.replaceOp(sliceOp, newExpandShapeOp);
+  return success();
+}
+
+namespace {
+
+struct SwapExpandShapeWithSlicePattern
+    : public OpRewritePattern<tensor::ExtractSliceOp> {
+  using OpRewritePattern<tensor::ExtractSliceOp>::OpRewritePattern;
+
+  LogicalResult matchAndRewrite(tensor::ExtractSliceOp sliceOp,
+                                PatternRewriter &rewriter) const override {
+    auto expandOp = sliceOp.getSource().getDefiningOp<tensor::ExpandShapeOp>();
+    if (!expandOp) {
+      return failure();
+    }
+
+    if (!sliceOp.hasUnitStride()) {
+      return rewriter.notifyMatchFailure(sliceOp,
+                                         "unsupported: non-unit stride");
+    }
+
+    return swapExpandShapeWithSlice(rewriter, expandOp, sliceOp);
+  }
+};
+
+} // namespace
+
+void mlir::tensor::populateBubbleUpExtractSliceOpPatterns(
+    RewritePatternSet &patterns) {
+  patterns.add<SwapExpandShapeWithSlicePattern>(patterns.getContext());
+}
diff --git a/mlir/lib/Dialect/Tensor/Transforms/CMakeLists.txt b/mlir/lib/Dialect/Tensor/Transforms/CMakeLists.txt
index cc6275fee671a..634cc93a08352 100644
--- a/mlir/lib/Dialect/Tensor/Transforms/CMakeLists.txt
+++ b/mlir/lib/Dialect/Tensor/Transforms/CMakeLists.txt
@@ -11,6 +11,7 @@ add_mlir_dialect_library(MLIRTensorTransforms
   RewriteAsConstant.cpp
   SwapExtractSliceWithProducerPatterns.cpp
   SubsetInsertionOpInterfaceImpl.cpp
+  BubbleUpExtractSlice.cpp
 
   ADDITIONAL_HEADER_DIRS
   ${MLIR_MAIN_INCLUDE_DIR}/mlir/Dialect/Tensor/Transforms
diff --git a/mlir/test/Dialect/Linalg/transform-op-fuse.mlir b/mlir/test/Dialect/Linalg/transform-op-fuse.mlir
index ac1ca9319d335..22796611c5934 100644
--- a/mlir/test/Dialect/Linalg/transform-op-fuse.mlir
+++ b/mlir/test/Dialect/Linalg/transform-op-fuse.mlir
@@ -278,3 +278,141 @@ module attributes {transform.with_named_sequence} {
     transform.yield
   }
 }
+
+// -----
+
+// CHECK-LABEL: func.func @swap_expand_shape_with_extract_slice
+//     CHECK: scf.for %[[X:[A-Za-z0-9]+]] = {{.*}}
+//     CHECK:   scf.for %[[Y:[A-Za-z0-9]+]] = {{.*}}
+//     CHECK:     scf.for %[[Z:[A-Za-z0-9]+]] = {{.*}}
+//     CHECK:       %[[LINEAR_IDX:.+]] = affine.linearize_index disjoint [%[[X]], %[[Y]], %[[Z]]] by (2, 3, 10)
+//     CHECK:       %[[SLICE:.+]] = tensor.extract_slice %{{.*}}[%[[LINEAR_IDX]]] [5] [1] : tensor<60xf32> to tensor<5xf32>
+//     CHECK:       %[[EXPAND:.+]] = tensor.expand_shape %[[SLICE]] {{\[\[}}0, 1, 2]] output_shape [1, 1, 5]
+//     CHECK:       linalg.exp ins(%[[EXPAND]]
+func.func @swap_expand_shape_with_extract_slice(%0: tensor<60xf32>) -> tensor<2x3x10xf32> {
+  %expand = tensor.expand_shape %0 [[0, 1, 2]] output_shape [2, 3, 10] : tensor<60xf32> into tensor<2x3x10xf32>
+  %empty = tensor.empty() : tensor<2x3x10xf32>
+  %exp = linalg.exp ins(%expand : tensor<2x3x10xf32>) outs(%empty : tensor<2x3x10xf32>) -> tensor<2x3x10xf32>
+  return %exp : tensor<2x3x10xf32>
+}
+
+module attributes {transform.with_named_sequence} {
+  transform.named_sequence @__transform_main(%arg0: !transform.any_op {transform.readonly}) {
+    %0 = transform.structured.match ops{["linalg.exp"]} in %arg0 : (!transform.any_op) -> !transform.any_op
+    %transformed, %loops:3 = transform.structured.fuse %0 [1, 1, 5] interchange [0, 1, 2] apply_cleanup = true : 
+      (!transform.any_op) -> (!transform.any_op, !transform.op<"scf.for">, !transform.any_op, !transform.any_op)
+    transform.yield 
+  }
+}
+
+// -----
+
+// CHECK-LABEL: func.func @swap_expand_shape_with_extract_slice_full_inner_dim
+//     CHECK: scf.for %[[X:[A-Za-z0-9]+]] = {{.*}}
+//     CHECK:   scf.for %[[Y:[A-Za-z0-9]+]] = {{.*}}
+//     CHECK:       %[[LINEAR_IDX:.+]] = affine.linearize_index disjoint [%[[X]], %[[Y]]{{.*}} by (3, 4, 10)
+//     CHECK:       %[[SLICE:.+]] = tensor.extract_slice %{{.*}}[%[[LINEAR_IDX]]] [20] [1] : tensor<120xf32> to tensor<20xf32>
+//     CHECK:       %[[EXPAND:.+]] = tensor.expand_shape %[[SLICE]] {{\[\[}}0, 1, 2]] output_shape [1, 2, 10]
+//     CHECK:       linalg.exp ins(%[[EXPAND]]
+func.func @swap_expand_shape_with_extract_slice_full_inner_dim(%0: tensor<120xf32>) -> tensor<3x4x10xf32> {
+  %expand = tensor.expand_shape %0 [[0, 1, 2]] output_shape [3, 4, 10] : tensor<120xf32> into tensor<3x4x10xf32>
+  %empty = tensor.empty() : tensor<3x4x10xf32>
+  %exp = linalg.exp ins(%expand : tensor<3x4x10xf32>) outs(%empty : tensor<3x4x10xf32>) -> tensor<3x4x10xf32>
+  return %exp : tensor<3x4x10xf32>
+}
+
+module attributes {transform.with_named_sequence} {
+  transform.named_sequence @__transform_main(%arg0: !transform.any_op {transform.readonly}) {
+    %0 = transform.structured.match ops{["linalg.exp"]} in %arg0 : (!transform.any_op) -> !transform.any_op
+    %transformed, %loops:2 = transform.structured.fuse %0 [1, 2, 0] interchange [0, 1, 2] apply_cleanup = true : 
+      (!transform.any_op) -> (!transform.any_op, !transform.op<"scf.for">, !transform.any_op)
+    transform.yield 
+  }
+}
+
+// -----
+
+// CHECK-LABEL: func.func @swap_expand_shape_with_extract_slice_full_inner_dim
+//     CHECK: tensor.expand_shape
+//     CHECK: scf.for
+//     CHECK:   scf.for
+//     CHECK:     scf.for
+//     CHECK:       linalg.exp
+func.func @swap_expand_shape_with_extract_slice_full_inner_dim(%0: tensor<120xf32>) -> tensor<3x4x10xf32> {
+  %expand = tensor.expand_shape %0 [[0, 1, 2]] output_shape [3, 4, 10] : tensor<120xf32> into tensor<3x4x10xf32>
+  %empty = tensor.empty() : tensor<3x4x10xf32>
+  %exp = linalg.exp ins(%expand : tensor<3x4x10xf32>) outs(%empty : tensor<3x4x10xf32>) -> tensor<3x4x10xf32>
+  return %exp : tensor<3x4x10xf32>
+}
+
+module attributes {transform.with_named_sequence} {
+  transform.named_sequence @__transform_main(%arg0: !transform.any_op {transform.readonly}) {
+    %0 = transform.structured.match ops{["linalg.exp"]} in %arg0 : (!transform.any_op) -> !transform.any_op
+    %transformed, %loops:3 = transform.structured.fuse %0 [1, 2, 5] interchange [0, 1, 2] apply_cleanup = true : 
+      (!transform.any_op) -> (!transform.any_op, !transform.op<"scf.for">, !transform.any_op, !transform.any_op)
+    transform.yield 
+  }
+}
+
+// -----
+
+// CHECK-LABEL: func.func @swap_expand_shape_with_extract_slice_multiple_expanded_dims
+//     CHECK: %[[C0:.+]] = arith.constant 0 : index
+//     CHECK: scf.for %[[X:[A-Za-z0-9]+]] = {{.*}}
+//     CHECK:   scf.for %[[Y:[A-Za-z0-9]+]] = {{.*}}
+//     CHECK:     scf.for %[[Z:[A-Za-z0-9]+]] = {{.*}}
+//     CHECK:       scf.for %[[W:[A-Za-z0-9]+]] = {{.*}}
+//     CHECK:       %[[LINEAR_IDX0:.+]] = affine.linearize_index disjoint [%[[X]], %[[Y]], %[[C0]]] by (3, 4, 10)
+//     CHECK:       %[[LINEAR_IDX1:.+]] = affine.linearize_index disjoint [%[[Z]], %[[W]]] by (7, 8)
+//     CHECK:       %[[SLICE:.+]] = tensor.extract_slice %{{.*}}[%[[LINEAR_IDX0]], %[[LINEAR_IDX1]]] [20, 4] [1, 1] : tensor<120x56xf32> to tensor<20x4xf32>
+//     CHECK:       %[[EXPAND:.+]] = tensor.expand_shape %[[SLICE]] {{\[\[}}0, 1, 2], [3, 4]] output_shape [1, 2, 10, 1, 4]
+//     CHECK:       linalg.exp ins(%[[EXPAND]]
+module {
+  func.func @swap_expand_shape_with_extract_slice_multiple_expanded_dims(%0: tensor<120x56xf32>) -> tensor<3x4x10x7x8xf32> {
+    %expand = tensor.expand_shape %0 [[0, 1, 2], [3, 4]] output_shape [3, 4, 10, 7, 8] : tensor<120x56xf32> into tensor<3x4x10x7x8xf32>
+    %empty = tensor.empty() : tensor<3x4x10x7x8xf32>
+    %exp = linalg.exp ins(%expand : tensor<3x4x10x7x8xf32>) outs(%empty : tensor<3x4x10x7x8xf32>) -> tensor<3x4x10x7x8xf32>
+    return %exp : tensor<3x4x10x7x8xf32>
+  }
+}
+
+module attributes {transform.with_named_sequence} {
+  transform.named_sequence @__transform_main(%arg0: !transform.any_op {transform.readonly}) {
+    %0 = transform.structured.match ops{["linalg.exp"]} in %arg0 : (!transform.any_op) -> !transform.any_op
+    %transformed, %loops:4 = transform.structured.fuse %0 [1, 2, 0, 1, 4] interchange [0, 1, 2, 3, 4] apply_cleanup = true : 
+      (!transform.any_op) -> (!transform.any_op, !transform.op<"scf.for">, !transform.any_op, !transform.any_op, !transform.any_op)
+    transform.yield 
+  }
+}
+
+// -----
+
+//     CHECK: scf.for %[[X:[A-Za-z0-9]+]] = {{.*}}
+//     CHECK:    %[[LINEAR_IDX:.+]] = affine.linearize_index disjoint [%[[X]], {{.*}} by (8, 32)
+//     CHECK:    %[[SLICE:.+]] = tensor.extract_slice %{{.*}}[0, 0, %[[LINEAR_IDX]]] [1, 1800, 32] [1, 1, 1] : tensor<1x1800x256xf32> to tensor<1x1800x32xf32>
+//     CHECK:    %[[ABS:.+]] = linalg.abs ins(%[[SLICE]]
+//     CHECK:    %[[EXPAND:.+]] = tensor.expand_shape %[[ABS]] {{\[\[}}0], [1], [2, 3]] output_shape [1, 1800, 1, 32]
+//     CHECK:    linalg.exp ins(%[[EXPAND]]
+module {
+  func.func @swap_expand_shape_with_extract_slice_and_fuse_with_expand_producer(%0: tensor<1x1800x256xf32>) -> tensor<1x1800x8x32xf32> {
+    %empty1 = tensor.empty() : tensor<1x1800x256xf32>
+    %exp1 = linalg.abs ins(%0 : tensor<1x1800x256xf32>) outs(%empty1 : tensor<1x1800x256xf32>) -> tensor<1x1800x256xf32>
+    %expand = tensor.expand_shape %exp1 [[0], [1], [2, 3]] output_shape [1, 1800, 8, 32] : tensor<1x1800x256xf32> into tensor<1x1800x8x32xf32>
+    %empty2 = tensor.empty() : tensor<1x1800x8x32xf32>
+    %exp2 = linalg.exp ins(%expand : tensor<1x1800x8x32xf32>) outs(%empty2 : tensor<1x1800x8x32xf32>) -> tensor<1x1800x8x32xf32>
+    return %exp2 : tensor<1x1800x8x32xf32>
+  }
+}
+
+module attributes {transform.with_named_sequence} {
+  transform.named_sequence @__transform_main(%arg0: !transform.any_op {transform.readonly}) {
+    %0 = transform.structured.match ops{["linalg.exp"]} in %arg0 : (!transform.any_op) -> !transform.any_op
+    %transformed, %loops:1 = transform.structured.fuse %0 [0, 0, 1, 0] interchange [0, 1, 2, 3] apply_cleanup = true : 
+      (!transform.any_op) -> (!transform.any_op, !transform.op<"scf.for">)
+    transform.yield 
+  }
+}
+
+
+
+