[Mlir-commits] [mlir] 2240568 - [MLIR][Linalg] Hoist padding across multiple levels of tiling

Nicolas Vasilache llvmlistbot at llvm.org
Tue Mar 23 10:48:47 PDT 2021


Author: Nicolas Vasilache
Date: 2021-03-23T17:47:32Z
New Revision: 22405685794a4908ae64e71d97532f8ab6d34f5c

URL: https://github.com/llvm/llvm-project/commit/22405685794a4908ae64e71d97532f8ab6d34f5c
DIFF: https://github.com/llvm/llvm-project/commit/22405685794a4908ae64e71d97532f8ab6d34f5c.diff

LOG: [MLIR][Linalg] Hoist padding across multiple levels of tiling

This revision introduces proper backward-slice computation during the hoisting of
PadTensorOp, which allows hoisting padding even across multiple levels of tiling.
Such hoisting requires careful handling of loop bounds that may depend on enclosing
loop variables.

Differential revision: https://reviews.llvm.org/D98965
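
For context: per packing loop, the hoisting builds the loop trip count
(ub - lb) ceildiv step to size one leading dimension of the packed tensor, and the
current iteration number (iv - lb) ceildiv step to index into that dimension; the
backward-slice machinery introduced here ensures both quantities can be materialized
above the outermost enclosing loop. A minimal plain-C++ sketch of that arithmetic,
with assumed example bounds (not taken from the commit):

  #include <cstdio>

  // Illustration of the two per-loop quantities used by the hoisting:
  //   trip count       = ceildiv(ub - lb, step)  -> sizes one packed-tensor dim
  //   iteration number = ceildiv(iv - lb, step)  -> indexes into that dim
  static int ceilDiv(int a, int b) { return (a + b - 1) / b; }

  int main() {
    int lb = 0, ub = 7, step = 2; // assumed bounds; the last tile is partial
    std::printf("trip count: %d\n", ceilDiv(ub - lb, step)); // 4
    for (int iv = lb; iv < ub; iv += step)
      std::printf("iv %d -> packed index %d\n", iv, ceilDiv(iv - lb, step));
    return 0;
  }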

Added: 
    

Modified: 
    mlir/include/mlir/Dialect/Affine/IR/AffineOps.td
    mlir/lib/Dialect/Linalg/Transforms/Hoisting.cpp
    mlir/test/Dialect/Linalg/hoist-padding.mlir

Removed: 
    


################################################################################
diff  --git a/mlir/include/mlir/Dialect/Affine/IR/AffineOps.td b/mlir/include/mlir/Dialect/Affine/IR/AffineOps.td
index 734075beb9d26..b930a8f4db6e1 100644
--- a/mlir/include/mlir/Dialect/Affine/IR/AffineOps.td
+++ b/mlir/include/mlir/Dialect/Affine/IR/AffineOps.td
@@ -75,6 +75,11 @@ def AffineApplyOp : Affine_Op<"apply", [NoSideEffect]> {
     OpBuilder<(ins "AffineMap":$map, "ValueRange":$mapOperands),
     [{
       build($_builder, $_state, $_builder.getIndexType(), map, mapOperands);
+    }]>,
+    OpBuilder<(ins "ArrayRef<AffineExpr> ":$exprList,"ValueRange":$mapOperands),
+    [{
+      build($_builder, $_state, $_builder.getIndexType(),
+            AffineMap::inferFromExprList(exprList).front(), mapOperands);
     }]>
   ];
 

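The new AffineApplyOp builder accepts a list of affine expressions and infers the map
via AffineMap::inferFromExprList, so call sites no longer need to spell out
AffineMap::get; the rewritten buildLoopTripCount / buildLoopIterationCount helpers in
Hoisting.cpp below rely on it. A minimal usage sketch (the helper name
buildCeilDivApply is illustrative and not from the commit):

  #include "mlir/Dialect/Affine/IR/AffineOps.h"
  #include "mlir/IR/Builders.h"

  using namespace mlir;

  // Illustrative helper: build (ub - lb) ceildiv step as an affine.apply with
  // the new expression-based builder.
  static Value buildCeilDivApply(OpBuilder &b, Location loc, Value lbVal,
                                 Value ubVal, Value stepVal) {
    MLIRContext *ctx = b.getContext();
    AffineExpr lb, ub, step;
    bindDims(ctx, lb, ub);   // lbVal, ubVal are bound to dims d0, d1.
    bindSymbols(ctx, step);  // stepVal is bound to symbol s0.
    // A single AffineExpr converts to ArrayRef<AffineExpr>, so this resolves
    // to the new builder, which infers the map from the expression.
    return b.create<AffineApplyOp>(loc, (ub - lb).ceilDiv(step),
                                   ValueRange{lbVal, ubVal, stepVal});
  }
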
diff  --git a/mlir/lib/Dialect/Linalg/Transforms/Hoisting.cpp b/mlir/lib/Dialect/Linalg/Transforms/Hoisting.cpp
index 1fcd3f4ed8753..9df4b34a4b9fa 100644
--- a/mlir/lib/Dialect/Linalg/Transforms/Hoisting.cpp
+++ b/mlir/lib/Dialect/Linalg/Transforms/Hoisting.cpp
@@ -13,7 +13,9 @@
 
 #include "mlir/Dialect/Linalg/Transforms/Hoisting.h"
 #include "mlir/Analysis/SliceAnalysis.h"
+#include "mlir/Dialect/Affine/Utils.h"
 #include "mlir/Dialect/Linalg/IR/LinalgOps.h"
+#include "mlir/Dialect/Linalg/Transforms/Transforms.h"
 #include "mlir/Dialect/SCF/SCF.h"
 #include "mlir/Dialect/SCF/Utils.h"
 #include "mlir/Dialect/StandardOps/IR/Ops.h"
@@ -503,6 +505,134 @@ void mlir::linalg::hoistRedundantVectorTransfers(FuncOp func) {
   }
 }
 
+/// Return success if `v` is a value that is only transitively defined by ops of
+/// type in `OpTypeList`.
+template <typename... OpTypeList>
+static bool backwardsSliceOnlyHasOpsOfType(scf::ForOp outerLimit, Value v) {
+  // Compute a backward slice up to, but not including, `outerLimit`.
+  llvm::SetVector<Operation *> backwardSlice;
+  getBackwardSlice(v, &backwardSlice, [&](Operation *op) {
+    return outerLimit->isProperAncestor(op);
+  });
+  // Traverse the backward slice and ensure we can perform the computation to
+  // hoist.
+  for (Operation *op : backwardSlice) {
+    if (isa<OpTypeList...>(op))
+      continue;
+    LLVM_DEBUG(DBGS() << "Abort: unadmissible op in slice " << *op << "\n");
+    return false;
+  }
+  return true;
+}
+
+bool isDefinedOutsideOrConstant(scf::ForOp outer, Value v) {
+  return outer.isDefinedOutsideOfLoop(v) || v.getDefiningOp<ConstantOp>();
+}
+
+/// Compute the tightest lower bound with quantities that are all defined
+/// outside of `outer`.
+/// Return null if such a bound cannot be computed.
+Value computeLoopIndependentLowerBound(OpBuilder &b, scf::ForOp outer,
+                                       Value v) {
+  if (isDefinedOutsideOrConstant(outer, v))
+    return v;
+  return Value();
+}
+
+/// Compute the tightest upper bound with quantities that are all defined
+/// outside of `outer`.
+/// Expects all ops in the backward slice of `v` up to `outer` to be either
+/// scf.for, affine.min or affine.apply.
+static Value computeLoopIndependentUpperBound(OpBuilder &b, scf::ForOp outer,
+                                              Value v) {
+  if (isDefinedOutsideOrConstant(outer, v))
+    return v;
+
+  LLVM_DEBUG(DBGS() << "Begin loopIndependentUpperBound for: " << v << "\n");
+
+  bool ok =
+      backwardsSliceOnlyHasOpsOfType<scf::ForOp, AffineMinOp, AffineApplyOp>(
+          outer, v);
+  assert(ok && "expected to only be defined by scf::ForOp and AffineMinOp");
+
+  // Compute a backward slice up to, but not including, `outer`.
+  llvm::SetVector<Operation *> backwardSlice;
+  getBackwardSlice(v, &backwardSlice,
+                   [&](Operation *op) { return outer->isProperAncestor(op); });
+  backwardSlice.insert(v.getDefiningOp());
+
+  OpBuilder::InsertionGuard g(b);
+  b.setInsertionPoint(outer);
+  Value res = v;
+  BlockAndValueMapping bvm;
+  for (Operation *op : backwardSlice) {
+    if (isa<scf::ForOp>(op))
+      continue;
+    if (isa<AffineApplyOp>(op)) {
+      b.clone(*op, bvm);
+      continue;
+    }
+    auto sliceMinOp = cast<AffineMinOp>(op);
+    // Perform the substitution of the operands of AffineMinOp.
+    auto mapAndOperands = substituteMin(
+        sliceMinOp, [&](Operation *op) { return outer->isAncestor(op); });
+    SmallVector<Value> resultOperands = mapAndOperands.dims;
+    llvm::append_range(resultOperands, mapAndOperands.symbols);
+    AffineMap map = mapAndOperands.map;
+    canonicalizeMapAndOperands(&map, &resultOperands);
+    map = simplifyAffineMap(map);
+    res = b.create<AffineMinOp>(
+        outer->getLoc(), map,
+        llvm::to_vector<4>(llvm::map_range(resultOperands, [&](Value operand) {
+          return bvm.lookupOrDefault(operand);
+        })));
+    bvm.map(sliceMinOp, res);
+  }
+  LLVM_DEBUG(DBGS() << "End loopIndependentUpperBound with: " << res << "\n");
+  return res;
+}
+
+/// Return the number of iterations in the loop (ub - lb).ceilDiv(step).
+/// The returned Value is guaranteed not to depend on any loop comprised in
+/// [`outer`, `forOp`].
+/// Return null if such a loop-independent quantity cannot be computed.
+static Value buildLoopTripCount(OpBuilder &b, scf::ForOp outer,
+                                scf::ForOp forOp) {
+  MLIRContext *ctx = forOp->getContext();
+  AffineExpr lb, ub, step;
+  bindDims(ctx, lb, ub);
+  bindSymbols(ctx, step);
+  Value lbVal = computeLoopIndependentLowerBound(b, outer, forOp.lowerBound()),
+        ubVal = computeLoopIndependentUpperBound(b, outer, forOp.upperBound()),
+        stepVal = forOp.step();
+  if (!lbVal || !ubVal || !stepVal)
+    return Value();
+  auto loc = forOp->getLoc();
+  Value res = b.create<AffineApplyOp>(loc, (ub - lb).ceilDiv(step),
+                                      ValueRange{lbVal, ubVal, stepVal});
+  return res;
+}
+
+/// Return the current iteration number in the loop (iv - lb).ceilDiv(step).
+/// The returned Value is guaranteed not to depend on any loop comprised in
+/// [`outer`, `forOp`].
+/// Return null if such a loop-independent quantity cannot be computed.
+static Value buildLoopIterationCount(OpBuilder &b, scf::ForOp outer,
+                                     scf::ForOp forOp) {
+  MLIRContext *ctx = forOp->getContext();
+  AffineExpr iv, lb, step;
+  bindDims(ctx, iv, lb);
+  bindSymbols(ctx, step);
+  Value ivVal = forOp.getInductionVar(),
+        lbVal = computeLoopIndependentLowerBound(b, outer, forOp.lowerBound()),
+        stepVal = forOp.step();
+  if (!ivVal || !lbVal || !stepVal)
+    return Value();
+  auto loc = forOp->getLoc();
+  return b.create<AffineApplyOp>(loc, (iv - lb).ceilDiv(step),
+                                 ValueRange{ivVal, lbVal, stepVal});
+}
+
 /// Ensure prerequisites that guarantee pad op hoisting can occur.
 /// Return failure in the cases when we cannot perform hoisting; i.e. if either:
 ///   1. There exists a use of `padTensorOp` that is not a linalg input operand.
@@ -510,8 +640,10 @@ void mlir::linalg::hoistRedundantVectorTransfers(FuncOp func) {
 ///   3. There exists an op with a region that is dominated by
 ///   `outermostEnclosingForOp` and that isn't a LoopLikeInterface or a
 ///    LinalgOp.
-///   3. There exists an op with side effects that is dominated by
-///    `outermostEnclosingForOp` and that isn't a LoopLikeInterface.
+///   4. There exists an op with side effects that is dominated by
+///   `outermostEnclosingForOp` and that isn't a LoopLikeInterface.
+///   5. The lower bound, upper bound and step of all the loops involved in
+///   the hoisting can be computed independently of the enclosing loops.
 ///
 /// While ensuring prerequisites:
 ///   1. Fill the `backwardSlice` to contain the topologically sorted ops
@@ -523,7 +655,8 @@ void mlir::linalg::hoistRedundantVectorTransfers(FuncOp func) {
 static LogicalResult
 hoistPaddingOnTensorsPrerequisites(linalg::PadTensorOp padTensorOp, int nLevels,
                                    llvm::SetVector<Operation *> &backwardSlice,
-                                   llvm::SetVector<Operation *> &packingLoops) {
+                                   llvm::SetVector<Operation *> &packingLoops,
+                                   SmallVector<Value> &dynamicTensorSizes) {
   // Bail on any use that isn't an input of a Linalg op.
   // Hoisting of inplace updates happens after vectorization.
   for (OpOperand &use : padTensorOp.result().getUses()) {
@@ -583,36 +716,39 @@ hoistPaddingOnTensorsPrerequisites(linalg::PadTensorOp padTensorOp, int nLevels,
   // `outermostEnclosingForOp`.
   assert(outermostEnclosingForOp == backwardSlice.front());
 
-  return success();
-}
-
-/// Return the number of iterations in the loop (ub - lb).ceilDiv(step).
-static Value buildLoopTripCount(OpBuilder &b, scf::ForOp forOp) {
-  MLIRContext *ctx = forOp->getContext();
-  AffineExpr lb, ub, step;
-  bindDims(ctx, lb, ub);
-  bindSymbols(ctx, step);
-  return b.create<AffineApplyOp>(
-      forOp->getLoc(), AffineMap::get(2, 1, {(ub - lb).ceilDiv(step)}, ctx),
-      ValueRange{forOp.lowerBound(), forOp.upperBound(), forOp.step()});
-}
+  scf::ForOp outer = cast<scf::ForOp>(outermostEnclosingForOp);
+  if (llvm::any_of(packingLoops, [&](Operation *op) {
+        scf::ForOp forOp = cast<scf::ForOp>(op);
+        Value lb = forOp.lowerBound(), ub = forOp.upperBound(),
+              step = forOp.step();
+        return !isDefinedOutsideOrConstant(outer, lb) ||
+               !(isDefinedOutsideOrConstant(outer, ub) ||
+                 backwardsSliceOnlyHasOpsOfType<scf::ForOp, AffineMinOp,
+                                                AffineApplyOp>(outer, ub)) ||
+               !isDefinedOutsideOrConstant(outer, step);
+      }))
+    return failure();
 
-/// Return the current iteration number in the loop (iv - lb).ceilDiv(step).
-static Value buildLoopIterationCount(OpBuilder &b, scf::ForOp forOp) {
-  MLIRContext *ctx = forOp->getContext();
-  AffineExpr iv, lb, step;
-  bindDims(ctx, iv, lb);
-  bindSymbols(ctx, step);
-  return b.create<AffineApplyOp>(
-      forOp->getLoc(), AffineMap::get(2, 1, {(iv - lb).ceilDiv(step)}, ctx),
-      ValueRange{forOp.getInductionVar(), forOp.lowerBound(), forOp.step()});
+  // IP just before the outermost loop considered that we hoist above.
+  OpBuilder b(outermostEnclosingForOp);
+  dynamicTensorSizes =
+      llvm::to_vector<4>(llvm::map_range(packingLoops, [&](Operation *op) {
+        return buildLoopTripCount(b, cast<scf::ForOp>(outermostEnclosingForOp),
+                                  cast<scf::ForOp>(op));
+      }));
+  // Assert all loop trip counts can be computed.
+  if (!llvm::all_of(dynamicTensorSizes, [](Value v) { return v; }))
+    llvm_unreachable("loop independence prerequisite not met");
+  return success();
 }
 
 LogicalResult mlir::linalg::hoistPaddingOnTensors(PadTensorOp &padTensorOp,
                                                   unsigned nLoops) {
+  SmallVector<Value> dynamicTensorSizes;
   llvm::SetVector<Operation *> backwardSlice, packingLoops;
   if (failed(hoistPaddingOnTensorsPrerequisites(padTensorOp, nLoops,
-                                                backwardSlice, packingLoops)))
+                                                backwardSlice, packingLoops,
+                                                dynamicTensorSizes)))
     return failure();
 
   // Update actual number of loops, which may be smaller.
@@ -636,12 +772,8 @@ LogicalResult mlir::linalg::hoistPaddingOnTensors(PadTensorOp &padTensorOp,
   llvm::append_range(packedShape, paddedTensorType.getShape());
   auto packedTensorType =
       RankedTensorType::get(packedShape, paddedTensorType.getElementType());
-  auto dynamicSizes =
-      llvm::to_vector<4>(llvm::map_range(packingLoops, [&](Operation *op) {
-        return buildLoopTripCount(b, cast<scf::ForOp>(op));
-      }));
   Value packedTensor = b.create<linalg::InitTensorOp>(
-      loc, dynamicSizes, packedTensorType.getShape(),
+      loc, dynamicTensorSizes, packedTensorType.getShape(),
       packedTensorType.getElementType());
 
   // Clone the operations involved in the backward slice, iteratively stepping
@@ -656,9 +788,9 @@ LogicalResult mlir::linalg::hoistPaddingOnTensors(PadTensorOp &padTensorOp,
   clonedLoopIvs.reserve(nLoops);
   leadingPackedTensorIndexings.reserve(nLoops);
   BlockAndValueMapping bvm;
-  // Stack step 1. iteratively clone loops and push `packedTensor`.
   // Insert `padTensorOp` into the backwardSlice so we clone it too.
   backwardSlice.insert(padTensorOp);
+  // Stack step 1. iteratively clone loops and push `packedTensor`.
   for (Operation *op : backwardSlice) {
     if (op->getNumRegions() == 0 || isa<linalg::PadTensorOp>(op)) {
       b.clone(*op, bvm);
@@ -670,15 +802,23 @@ LogicalResult mlir::linalg::hoistPaddingOnTensors(PadTensorOp &padTensorOp,
     // Unused loop, just skip it.
     if (!packingLoops.contains(forOp))
       continue;
+
     auto clonedForOp =
-        b.create<scf::ForOp>(loc, forOp.lowerBound(), forOp.upperBound(),
-                             forOp.step(), packedTensor);
+        b.create<scf::ForOp>(loc, bvm.lookupOrDefault(forOp.lowerBound()),
+                             bvm.lookupOrDefault(forOp.upperBound()),
+                             bvm.lookupOrDefault(forOp.step()), packedTensor);
+
+    bvm.map(forOp.getInductionVar(), clonedForOp.getInductionVar());
     assert(clonedForOp->getNumRegions() == 1);
     clonedLoopIvs.push_back(clonedForOp.getInductionVar());
+
     b.setInsertionPointToStart(&clonedForOp->getRegion(0).front());
-    leadingPackedTensorIndexings.push_back(
-        buildLoopIterationCount(b, clonedForOp));
-    bvm.map(forOp.getInductionVar(), clonedLoopIvs.back());
+    Value loopIndependentIterationCount = buildLoopIterationCount(
+        b, cast<scf::ForOp>(outermostEnclosingForOp), clonedForOp);
+    // Assert the loop-independent iteration count can be computed.
+    if (!loopIndependentIterationCount)
+      llvm_unreachable("loop independence prerequisite not met");
+    leadingPackedTensorIndexings.push_back(loopIndependentIterationCount);
     packedTensor = clonedForOp.getRegionIterArgs().front();
   }
 
@@ -716,8 +856,13 @@ LogicalResult mlir::linalg::hoistPaddingOnTensors(PadTensorOp &padTensorOp,
   b.setInsertionPoint(padTensorOp);
   SmallVector<Value> loopIterationCounts =
       llvm::to_vector<4>(llvm::map_range(packingLoops, [&](Operation *loop) {
-        return buildLoopIterationCount(b, cast<scf::ForOp>(loop));
+        return buildLoopIterationCount(
+            b, cast<scf::ForOp>(outermostEnclosingForOp),
+            cast<scf::ForOp>(loop));
       }));
+  // Assert all loop iteration counts can be computed.
+  if (llvm::any_of(loopIterationCounts, [](Value v) { return !v; }))
+    llvm_unreachable("loop independence prerequisite not met");
   // offsets = [originalLoopIvs, 0 .. 0].
   offsets.assign(loopIterationCounts.begin(), loopIterationCounts.end());
   offsets.append(paddedRank, b.getIndexAttr(0));

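The updated test below exercises this through the existing
test-hoist-padding-2-level pattern. A sketch of a driver that applies the hoisting in
the same way (assumed; the function name hoistAllPadTensorOps is illustrative and not
part of this commit):

  #include "mlir/Dialect/Linalg/IR/LinalgOps.h"
  #include "mlir/Dialect/Linalg/Transforms/Hoisting.h"
  #include "mlir/IR/BuiltinOps.h"

  using namespace mlir;

  // Walk a function and try to hoist each pad_tensor across up to `nLoops`
  // enclosing loops. hoistPaddingOnTensors takes the op by mutable reference
  // and returns failure when the hoisting prerequisites are not met.
  static void hoistAllPadTensorOps(FuncOp func, unsigned nLoops = 2) {
    func.walk([&](linalg::PadTensorOp padOp) {
      (void)linalg::hoistPaddingOnTensors(padOp, nLoops);
    });
  }
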
diff  --git a/mlir/test/Dialect/Linalg/hoist-padding.mlir b/mlir/test/Dialect/Linalg/hoist-padding.mlir
index 969974aa7e52d..2459d2af4546c 100644
--- a/mlir/test/Dialect/Linalg/hoist-padding.mlir
+++ b/mlir/test/Dialect/Linalg/hoist-padding.mlir
@@ -1,16 +1,15 @@
-// RUN: mlir-opt %s -test-linalg-transform-patterns=test-hoist-padding-2-level -canonicalize | FileCheck %s
+// RUN: mlir-opt %s -split-input-file -test-linalg-transform-patterns=test-hoist-padding-2-level -canonicalize | FileCheck %s
 
+// CHECK-DAG: #[[$DIV3:[0-9a-z]+]] = affine_map<(d0) -> (d0 ceildiv 3)>
+// CHECK-DAG: #[[$DIV4:[0-9a-z]+]] = affine_map<(d0) -> (d0 ceildiv 4)>
+// CHECK-DAG: #[[$DIVS3:[0-9a-z]+]] = affine_map<()[s0] -> (s0 ceildiv 3)>
+// CHECK-DAG: #[[$DIVS4:[0-9a-z]+]] = affine_map<()[s0] -> (s0 ceildiv 4)>
 #map0 = affine_map<(d0)[s0] -> (2, -d0 + s0)>
 #map1 = affine_map<(d0)[s0] -> (4, -d0 + s0)>
 #map2 = affine_map<(d0)[s0] -> (3, -d0 + s0)>
 #map3 = affine_map<(d0, d1) -> (2, d0 - d1)>
 #map4 = affine_map<(d0, d1) -> (3, d0 - d1)>
 
-// CHECK-DAG: #[[$DIV3:[0-9a-z]+]] = affine_map<(d0) -> (d0 ceildiv 3)>
-// CHECK-DAG: #[[$DIV4:[0-9a-z]+]] = affine_map<(d0) -> (d0 ceildiv 4)>
-// CHECK-DAG: #[[$DIVS3:[0-9a-z]+]] = affine_map<()[s0] -> (s0 ceildiv 3)>
-// CHECK-DAG: #[[$DIVS4:[0-9a-z]+]] = affine_map<()[s0] -> (s0 ceildiv 4)>
-
 // CHECK-LABEL: func @matmul_tensors
 //  CHECK-SAME:   %[[TA:[0-9a-z]+]]: tensor
 //  CHECK-SAME:   %[[TB:[0-9a-z]+]]: tensor
@@ -129,3 +128,92 @@ func @matmul_tensors(
   }
   return %3 : tensor<?x?xf32>
 }
+
+// -----
+
+// CHECK-DAG: #[[$MIN_REST8:[0-9a-z]+]] = affine_map<(d0)[s0] -> (8, -d0 + s0)>
+// CHECK-DAG: #[[$MIN_MOD4:[0-9a-z]+]] = affine_map<(d0) -> (4, d0 - ((d0 - 1) floordiv 4) * 4)>
+// CHECK-DAG: #[[$DIV4:[0-9a-z]+]] = affine_map<(d0) -> (d0 ceildiv 4)>
+// CHECK-DAG: #[[$DIV2:[0-9a-z]+]] = affine_map<(d0) -> (d0 ceildiv 2)>
+#map0 = affine_map<(d0)[s0] -> (8, -d0 + s0)>
+#map1 = affine_map<(d0, d1) -> (4, d0 - d1)>
+#map2 = affine_map<(d0, d1) -> (2, d0 - d1)>
+
+// CHECK-LABEL: func @dot
+func @dot(%arg0: tensor<?xf32>, %arg1: tensor<?xf32>, %arg2: tensor<f32>)
+    -> tensor<f32>
+{
+  %c8 = constant 8 : index
+  %c4 = constant 4 : index
+  %cst = constant 0.000000e+00 : f32
+  %c2 = constant 2 : index
+  %c0 = constant 0 : index
+  %1 = memref.dim %arg0, %c0 : tensor<?xf32>
+  %2 = memref.dim %arg0, %c0 : tensor<?xf32>
+  %3 = memref.dim %arg1, %c0 : tensor<?xf32>
+
+  //      CHECK: scf.for %[[I:[0-9a-z]+]] =
+  //
+  //      CHECK:   %[[MR8:.*]] = affine.min #[[$MIN_REST8]](%[[I]])
+  //      CHECK:   %[[D0:.*]] = affine.apply #[[$DIV4]](%[[MR8]])
+  //      CHECK:   %[[MM4:.*]] = affine.min #[[$MIN_MOD4]](%[[MR8]])
+  //      CHECK:   %[[D1:.*]] = affine.apply #[[$DIV2]](%[[MM4]])
+  // Init tensor and pack.
+  //      CHECK:   %[[INIT_PACKED_A:.*]] = linalg.init_tensor [%[[D0]], %[[D1]], 2] : tensor<?x?x2xf32>
+  //      CHECK:   %[[PACKED_A:.*]] = scf.for %[[II:[0-9a-z]+]] = {{.*}} iter_args(%{{.*}} = %[[INIT_PACKED_A]]) -> (tensor<?x?x2xf32>) {
+  //      CHECK:     scf.for %[[III:[0-9a-z]+]] =
+  //      CHECK:       subtensor_insert %{{.*}} into %{{.*}}[%{{.*}}, %{{.*}}, 0] [1, 1, 2] [1, 1, 1] : tensor<2xf32> into tensor<?x?x2xf32>
+  //
+  //      CHECK:   %[[D0_2:.*]] = affine.apply #[[$DIV4]](%[[MR8]])
+  //      CHECK:   %[[MM4_2:.*]] = affine.min #[[$MIN_MOD4]](%[[MR8]])
+  //      CHECK:   %[[D1_2:.*]] = affine.apply #[[$DIV2]](%[[MM4_2]])
+  // Init tensor and pack.
+  //      CHECK:   %[[INIT_PACKED_B:.*]] = linalg.init_tensor [%[[D0_2]], %[[D1_2]], 2] : tensor<?x?x2xf32>
+  //      CHECK:   %[[PACKED_B:.*]] = scf.for %[[II_2:[0-9a-z]+]] = {{.*}} iter_args(%{{.*}} = %[[INIT_PACKED_B]]) -> (tensor<?x?x2xf32>) {
+  //      CHECK:     scf.for %[[III_2:[0-9a-z]+]] =
+  //      CHECK:       subtensor_insert %{{.*}} into %{{.*}}[%{{.*}}, %{{.*}}, 0] [1, 1, 2] [1, 1, 1] : tensor<2xf32> into tensor<?x?x2xf32>
+  // Compute.
+  //      CHECK:   scf.for %[[II_3:[0-9a-z]+]] =
+  //      CHECK:     scf.for %[[III_3:[0-9a-z]+]] = {{.*}} iter_args(%[[C:.*]] = %{{.*}}) -> (tensor<f32>) {
+  //      CHECK:       %[[IDX0:.*]] = affine.apply #[[$DIV4]](%[[II_3]])
+  //      CHECK:       %[[IDX1:.*]] = affine.apply #[[$DIV2]](%[[III_3]])
+  //      CHECK:       %[[A:.*]] = subtensor %[[PACKED_A]][%[[IDX0]], %[[IDX1]], 0] [1, 1, 2] [1, 1, 1] : tensor<?x?x2xf32> to tensor<2xf32>
+  //      CHECK:       %[[IDX0_2:.*]] = affine.apply #[[$DIV4]](%[[II_3]])
+  //      CHECK:       %[[IDX1_2:.*]] = affine.apply #[[$DIV2]](%[[III_3]])
+  //      CHECK:       %[[B:.*]] = subtensor %[[PACKED_B]][%[[IDX0_2]], %[[IDX1_2]], 0] [1, 1, 2] [1, 1, 1] : tensor<?x?x2xf32> to tensor<2xf32>
+  //      CHECK:       linalg.dot ins(%[[A]], %[[B]] : tensor<2xf32>, tensor<2xf32>) outs(%[[C]] : tensor<f32>) -> tensor<f32>
+
+  %4 = scf.for %arg3 = %c0 to %1 step %c8 iter_args(%arg4 = %arg2) -> (tensor<f32>) {
+    %5 = affine.min #map0(%arg3)[%2]
+    %6 = subtensor %arg0[%arg3] [%5] [1] : tensor<?xf32> to tensor<?xf32>
+    %7 = affine.min #map0(%arg3)[%3]
+    %8 = subtensor %arg1[%arg3] [%7] [1] : tensor<?xf32> to tensor<?xf32>
+    %9 = scf.for %arg5 = %c0 to %5 step %c4 iter_args(%arg6 = %arg4) -> (tensor<f32>) {
+      %10 = affine.min #map1(%5, %arg5)
+      %11 = subtensor %6[%arg5] [%10] [1] : tensor<?xf32> to tensor<?xf32>
+      %12 = affine.min #map1(%7, %arg5)
+      %13 = subtensor %8[%arg5] [%12] [1] : tensor<?xf32> to tensor<?xf32>
+      %14 = scf.for %arg7 = %c0 to %10 step %c2 iter_args(%arg8 = %arg6) -> (tensor<f32>) {
+        %15 = affine.min #map2(%10, %arg7)
+        %16 = subtensor %11[%arg7] [%15] [1] : tensor<?xf32> to tensor<?xf32>
+        %17 = affine.min #map2(%12, %arg7)
+        %18 = subtensor %13[%arg7] [%17] [1] : tensor<?xf32> to tensor<?xf32>
+        %19 = subi %c2, %15 : index
+        %20 = linalg.pad_tensor %16 low[%c0] high[%19]  {
+        ^bb0(%arg9: index):  // no predecessors
+          linalg.yield %cst : f32
+        } : tensor<?xf32> to tensor<2xf32>
+        %21 = subi %c2, %17 : index
+        %22 = linalg.pad_tensor %18 low[%c0] high[%21]  {
+        ^bb0(%arg9: index):  // no predecessors
+          linalg.yield %cst : f32
+        } : tensor<?xf32> to tensor<2xf32>
+        %23 = linalg.dot ins(%20, %22 : tensor<2xf32>, tensor<2xf32>) outs(%arg8 : tensor<f32>) -> tensor<f32>
+        scf.yield %23 : tensor<f32>
+      }
+      scf.yield %14 : tensor<f32>
+    }
+    scf.yield %9 : tensor<f32>
+  }
+  return %4 : tensor<f32>
+}
