[Mlir-commits] [mlir] 239eff5 - [mlir][VectorOps] Redo the scalar loop emission in VectorToSCF to pad instead of clipping
Benjamin Kramer
llvmlistbot at llvm.org
Tue Sep 8 02:15:56 PDT 2020
Author: Benjamin Kramer
Date: 2020-09-08T11:15:25+02:00
New Revision: 239eff502bca64f544f311e7d7a65fdec01cb9c4
URL: https://github.com/llvm/llvm-project/commit/239eff502bca64f544f311e7d7a65fdec01cb9c4
DIFF: https://github.com/llvm/llvm-project/commit/239eff502bca64f544f311e7d7a65fdec01cb9c4.diff
LOG: [mlir][VectorOps] Redo the scalar loop emission in VectorToSCF to pad instead of clipping
This replaces the select chain for edge-padding with an scf.if that
performs the memory operation when the index is in bounds and uses the
pad value when it's not. For transfer_write the same mechanism is used,
skipping the store when the index is out of bounds.
The integration test covers a number of cases showing how I believe this
should work.
Differential Revision: https://reviews.llvm.org/D87241
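
For illustration only, here is a minimal, hypothetical sketch (not part of the
patch; the function and all value names are invented) of the shape of the
scalar loop body the new lowering produces for one element of a 1-D
transfer_read with padding:

    // Sketch under assumed names: %src is the source memref, %tmp the local
    // buffer of vectors, %base the transfer index, %iv the loop induction
    // variable, %dim the memref bound, and %pad the padding value.
    func @padded_element_read(%src: memref<?xf32>, %tmp: memref<1xvector<4xf32>>,
                              %base: index, %iv: index, %dim: index, %pad: f32) {
      %c0 = constant 0 : index
      // Effective scalar index for this loop iteration.
      %idx = affine.apply affine_map<(d0, d1) -> (d0 + d1)> (%base, %iv)
      // Bounds check that replaces the old clamp-with-select chain.
      %in_bounds = cmpi "slt", %idx, %dim : index
      %vec = load %tmp[%c0] : memref<1xvector<4xf32>>
      %pos = index_cast %iv : index to i32
      scf.if %in_bounds {
        // In bounds: load the element and insert it into the vector.
        %scal = load %src[%idx] : memref<?xf32>
        %res = vector.insertelement %scal, %vec[%pos : i32] : vector<4xf32>
        store %res, %tmp[%c0] : memref<1xvector<4xf32>>
      } else {
        // Out of bounds: insert the padding value without touching memory.
        %padded = vector.insertelement %pad, %vec[%pos : i32] : vector<4xf32>
        store %padded, %tmp[%c0] : memref<1xvector<4xf32>>
      }
      return
    }

Unlike the previous clipping approach, the out-of-bounds branch never reads
memory, so the vector receives the pad value instead of a re-read of the
clamped edge element.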
Added:
Modified:
mlir/integration_test/Dialect/Vector/CPU/test-transfer-to-loops.mlir
mlir/lib/Conversion/VectorToLLVM/ConvertVectorToLLVM.cpp
mlir/lib/Conversion/VectorToSCF/VectorToSCF.cpp
mlir/test/Conversion/VectorToSCF/vector-to-loops.mlir
Removed:
################################################################################
diff --git a/mlir/integration_test/Dialect/Vector/CPU/test-transfer-to-loops.mlir b/mlir/integration_test/Dialect/Vector/CPU/test-transfer-to-loops.mlir
index 8d965779dfc6..38cbabc32998 100644
--- a/mlir/integration_test/Dialect/Vector/CPU/test-transfer-to-loops.mlir
+++ b/mlir/integration_test/Dialect/Vector/CPU/test-transfer-to-loops.mlir
@@ -4,6 +4,7 @@
// RUN: FileCheck %s
#map0 = affine_map<(d0, d1) -> (d1, d0)>
+#map1 = affine_map<(d0, d1) -> (d1)>
func @print_memref_f32(memref<*xf32>)
@@ -29,6 +30,7 @@ func @main() {
%c0 = constant 0 : index
%c1 = constant 1 : index
%c2 = constant 2 : index
+ %c3 = constant 3 : index
%c6 = constant 6 : index
%cst = constant -4.2e+01 : f32
%0 = call @alloc_2d_filled_f32(%c6, %c6) : (index, index) -> memref<?x?xf32>
@@ -76,6 +78,28 @@ func @main() {
// CHECK-SAME: ( 205, 305, 405, 505, 504 ),
// CHECK-SAME: ( 105, 205, 305, 405, 505 ) )
+ %3 = vector.transfer_read %0[%c2, %c3], %cst : memref<?x?xf32>, vector<5x5xf32>
+ vector.print %3 : vector<5x5xf32>
+ // New 5x5 block rooted @{2, 3} in memory.
+ // CHECK-NEXT: ( ( 403, 503, 502, -42, -42 ),
+ // CHECK-SAME: ( 404, 504, 503, -42, -42 ),
+ // CHECK-SAME: ( 405, 505, 504, -42, -42 ),
+ // CHECK-SAME: ( 305, 405, 505, -42, -42 ),
+ // CHECK-SAME: ( -42, -42, -42, -42, -42 ) )
+
+ %4 = vector.transfer_read %0[%c2, %c3], %cst {permutation_map = #map0} : memref<?x?xf32>, vector<5x5xf32>
+ vector.print %4 : vector<5x5xf32>
+ // Transposed 5x5 block rooted @{2, 3} in memory.
+ // CHECK-NEXT: ( ( 403, 404, 405, 305, -42 ),
+ // CHECK-SAME: ( 503, 504, 505, 405, -42 ),
+ // CHECK-SAME: ( 502, 503, 504, 505, -42 ),
+ // CHECK-SAME: ( -42, -42, -42, -42, -42 ),
+ // CHECK-SAME: ( -42, -42, -42, -42, -42 ) )
+
+ %5 = vector.transfer_read %0[%c2, %c3], %cst {permutation_map = #map1} : memref<?x?xf32>, vector<5xf32>
+ vector.print %5 : vector<5xf32>
+ // CHECK-NEXT: ( 403, 503, 502, -42, -42 )
+
dealloc %0 : memref<?x?xf32>
return
}
diff --git a/mlir/lib/Conversion/VectorToLLVM/ConvertVectorToLLVM.cpp b/mlir/lib/Conversion/VectorToLLVM/ConvertVectorToLLVM.cpp
index a43bec855ff0..d51a96dca384 100644
--- a/mlir/lib/Conversion/VectorToLLVM/ConvertVectorToLLVM.cpp
+++ b/mlir/lib/Conversion/VectorToLLVM/ConvertVectorToLLVM.cpp
@@ -1096,7 +1096,7 @@ static bool isContiguous(MemRefType memRefType,
SmallVectorImpl<int64_t> &strides) {
int64_t offset;
auto successStrides = getStridesAndOffset(memRefType, strides, offset);
- bool isContiguous = (strides.back() == 1);
+ bool isContiguous = strides.empty() || strides.back() == 1;
if (isContiguous) {
auto sizes = memRefType.getShape();
for (int index = 0, e = strides.size() - 2; index < e; ++index) {
diff --git a/mlir/lib/Conversion/VectorToSCF/VectorToSCF.cpp b/mlir/lib/Conversion/VectorToSCF/VectorToSCF.cpp
index 08d0117e6a17..801ead825ffc 100644
--- a/mlir/lib/Conversion/VectorToSCF/VectorToSCF.cpp
+++ b/mlir/lib/Conversion/VectorToSCF/VectorToSCF.cpp
@@ -111,15 +111,6 @@ class NDTransferOpHelper {
template <typename Lambda>
void emitLoops(Lambda loopBodyBuilder);
- /// Operate within the body of `emitLoops` to:
- /// 1. Compute the indexings `majorIvs + majorOffsets` and save them in
- /// `majorIvsPlusOffsets`.
- /// 2. Return a boolean that determines whether the first `majorIvs.rank()`
- /// dimensions `majorIvs + majorOffsets` are all within `memrefBounds`.
- Value emitInBoundsCondition(ValueRange majorIvs, ValueRange majorOffsets,
- MemRefBoundsCapture &memrefBounds,
- SmallVectorImpl<Value> &majorIvsPlusOffsets);
-
/// Common state to lower vector transfer ops.
PatternRewriter &rewriter;
const VectorTransferToSCFOptions &options;
@@ -196,11 +187,16 @@ static Value onTheFlyFoldSLT(Value v, Value ub) {
return slt(v, ub);
}
-template <typename ConcreteOp>
-Value NDTransferOpHelper<ConcreteOp>::emitInBoundsCondition(
- ValueRange majorIvs, ValueRange majorOffsets,
- MemRefBoundsCapture &memrefBounds,
- SmallVectorImpl<Value> &majorIvsPlusOffsets) {
+/// 1. Compute the indexings `majorIvs + majorOffsets` and save them in
+/// `majorIvsPlusOffsets`.
+/// 2. Return a value of i1 that determines whether the first `majorIvs.rank()`
+/// dimensions `majorIvs + majorOffsets` are all within `memrefBounds`.
+static Value
+emitInBoundsCondition(PatternRewriter &rewriter,
+ VectorTransferOpInterface xferOp, unsigned leadingRank,
+ ValueRange majorIvs, ValueRange majorOffsets,
+ MemRefBoundsCapture &memrefBounds,
+ SmallVectorImpl<Value> &majorIvsPlusOffsets) {
Value inBoundsCondition;
majorIvsPlusOffsets.reserve(majorIvs.size());
unsigned idx = 0;
@@ -271,7 +267,8 @@ LogicalResult NDTransferOpHelper<TransferReadOp>::doReplace() {
// context.
SmallVector<Value, 4> majorIvsPlusOffsets;
Value inBoundsCondition = emitInBoundsCondition(
- majorIvs, majorOffsets, memrefBounds, majorIvsPlusOffsets);
+ rewriter, cast<VectorTransferOpInterface>(xferOp.getOperation()),
+ leadingRank, majorIvs, majorOffsets, memrefBounds, majorIvsPlusOffsets);
if (inBoundsCondition) {
// 2. If the condition is not null, we need an IfOp, which may yield
@@ -374,7 +371,8 @@ LogicalResult NDTransferOpHelper<TransferWriteOp>::doReplace() {
// context.
SmallVector<Value, 4> majorIvsPlusOffsets;
Value inBoundsCondition = emitInBoundsCondition(
- majorIvs, majorOffsets, memrefBounds, majorIvsPlusOffsets);
+ rewriter, cast<VectorTransferOpInterface>(xferOp.getOperation()),
+ leadingRank, majorIvs, majorOffsets, memrefBounds, majorIvsPlusOffsets);
if (inBoundsCondition) {
// 2.a. If the condition is not null, we need an IfOp, to write
@@ -424,60 +422,6 @@ static int computeCoalescedIndex(TransferOpTy transfer) {
return coalescedIdx;
}
-/// Emits remote memory accesses that are clipped to the boundaries of the
-/// MemRef.
-template <typename TransferOpTy>
-static SmallVector<Value, 8>
-clip(TransferOpTy transfer, MemRefBoundsCapture &bounds, ArrayRef<Value> ivs) {
- using namespace mlir::edsc;
-
- Value zero(std_constant_index(0)), one(std_constant_index(1));
- SmallVector<Value, 8> memRefAccess(transfer.indices());
- SmallVector<Value, 8> clippedScalarAccessExprs(memRefAccess.size());
- // Indices accessing to remote memory are clipped and their expressions are
- // returned in clippedScalarAccessExprs.
- for (unsigned memRefDim = 0; memRefDim < clippedScalarAccessExprs.size();
- ++memRefDim) {
- // Linear search on a small number of entries.
- int loopIndex = -1;
- auto exprs = transfer.permutation_map().getResults();
- for (auto en : llvm::enumerate(exprs)) {
- auto expr = en.value();
- auto dim = expr.template dyn_cast<AffineDimExpr>();
- // Sanity check.
- assert(
- (dim || expr.template cast<AffineConstantExpr>().getValue() == 0) &&
- "Expected dim or 0 in permutationMap");
- if (dim && memRefDim == dim.getPosition()) {
- loopIndex = en.index();
- break;
- }
- }
-
- // We cannot distinguish atm between unrolled dimensions that implement
- // the "always full" tile abstraction and need clipping from the other
- // ones. So we conservatively clip everything.
- using namespace edsc::op;
- auto N = bounds.ub(memRefDim);
- auto i = memRefAccess[memRefDim];
- if (loopIndex < 0) {
- auto N_minus_1 = N - one;
- auto select_1 = std_select(slt(i, N), i, N_minus_1);
- clippedScalarAccessExprs[memRefDim] =
- std_select(slt(i, zero), zero, select_1);
- } else {
- auto ii = ivs[loopIndex];
- auto i_plus_ii = i + ii;
- auto N_minus_1 = N - one;
- auto select_1 = std_select(slt(i_plus_ii, N), i_plus_ii, N_minus_1);
- clippedScalarAccessExprs[memRefDim] =
- std_select(slt(i_plus_ii, zero), zero, select_1);
- }
- }
-
- return clippedScalarAccessExprs;
-}
-
namespace mlir {
template <typename TransferOpTy>
@@ -497,6 +441,60 @@ MemRefType VectorTransferRewriter<TransferOpTy>::tmpMemRefType(
{}, 0);
}
+static void emitWithBoundsChecks(
+ PatternRewriter &rewriter, VectorTransferOpInterface transfer,
+ ValueRange ivs, MemRefBoundsCapture &memRefBoundsCapture,
+ function_ref<void(ArrayRef<Value>)> inBoundsFun,
+ function_ref<void(ArrayRef<Value>)> outOfBoundsFun = nullptr) {
+ // Permute the incoming indices according to the permutation map.
+ SmallVector<Value, 4> indices =
+ linalg::applyMapToValues(rewriter, transfer.getLoc(),
+ transfer.permutation_map(), transfer.indices());
+
+ // Generate a bounds check if necessary.
+ SmallVector<Value, 4> majorIvsPlusOffsets;
+ Value inBoundsCondition =
+ emitInBoundsCondition(rewriter, transfer, 0, ivs, indices,
+ memRefBoundsCapture, majorIvsPlusOffsets);
+
+ // Apply the permutation map to the ivs. The permutation map may not use all
+ // the inputs.
+ SmallVector<Value, 4> scalarAccessExprs(transfer.indices().size());
+ for (unsigned memRefDim = 0; memRefDim < transfer.indices().size();
+ ++memRefDim) {
+ // Linear search on a small number of entries.
+ int loopIndex = -1;
+ auto exprs = transfer.permutation_map().getResults();
+ for (auto en : llvm::enumerate(exprs)) {
+ auto expr = en.value();
+ auto dim = expr.dyn_cast<AffineDimExpr>();
+ // Sanity check.
+ assert((dim || expr.cast<AffineConstantExpr>().getValue() == 0) &&
+ "Expected dim or 0 in permutationMap");
+ if (dim && memRefDim == dim.getPosition()) {
+ loopIndex = en.index();
+ break;
+ }
+ }
+
+ using namespace edsc::op;
+ auto i = transfer.indices()[memRefDim];
+ scalarAccessExprs[memRefDim] = loopIndex < 0 ? i : i + ivs[loopIndex];
+ }
+
+ if (inBoundsCondition)
+ conditionBuilder(
+ /* scf.if */ inBoundsCondition, // {
+ [&] { inBoundsFun(scalarAccessExprs); },
+ // } else {
+ outOfBoundsFun ? [&] { outOfBoundsFun(scalarAccessExprs); }
+ : function_ref<void()>()
+ // }
+ );
+ else
+ inBoundsFun(scalarAccessExprs);
+}
+
/// Lowers TransferReadOp into a combination of:
/// 1. local memory allocation;
/// 2. perfect loop nest over:
@@ -588,17 +586,25 @@ LogicalResult VectorTransferRewriter<TransferReadOp>::matchAndRewrite(
Value tmp = setAllocAtFunctionEntry(tmpMemRefType(transfer), transfer);
StdIndexedValue local(tmp);
loopNestBuilder(lbs, ubs, steps, [&](ValueRange loopIvs) {
- auto ivs = llvm::to_vector<8>(loopIvs);
+ auto ivsStorage = llvm::to_vector<8>(loopIvs);
// Swap the ivs which will reorder memory accesses.
if (coalescedIdx >= 0)
- std::swap(ivs.back(), ivs[coalescedIdx]);
- // Computes clippedScalarAccessExprs in the loop nest scope (ivs exist).
- SmallVector<Value, 8> indices = clip(transfer, memRefBoundsCapture, ivs);
- ArrayRef<Value> indicesRef(indices), ivsRef(ivs);
- Value pos = std_index_cast(IntegerType::get(32, ctx), ivsRef.back());
- Value scal = remote(indicesRef);
- Value vector = vector_insert_element(scal, local(ivsRef.drop_back()), pos);
- local(ivsRef.drop_back()) = vector;
+ std::swap(ivsStorage.back(), ivsStorage[coalescedIdx]);
+
+ ArrayRef<Value> ivs(ivsStorage);
+ Value pos = std_index_cast(IntegerType::get(32, ctx), ivs.back());
+ Value inVector = local(ivs.drop_back());
+ auto loadValue = [&](ArrayRef<Value> indices) {
+ Value vector = vector_insert_element(remote(indices), inVector, pos);
+ local(ivs.drop_back()) = vector;
+ };
+ auto loadPadding = [&](ArrayRef<Value>) {
+ Value vector = vector_insert_element(transfer.padding(), inVector, pos);
+ local(ivs.drop_back()) = vector;
+ };
+ emitWithBoundsChecks(
+ rewriter, cast<VectorTransferOpInterface>(transfer.getOperation()), ivs,
+ memRefBoundsCapture, loadValue, loadPadding);
});
Value vectorValue = std_load(vector_type_cast(tmp));
@@ -674,17 +680,21 @@ LogicalResult VectorTransferRewriter<TransferWriteOp>::matchAndRewrite(
Value vec = vector_type_cast(tmp);
std_store(vectorValue, vec);
loopNestBuilder(lbs, ubs, steps, [&](ValueRange loopIvs) {
- auto ivs = llvm::to_vector<8>(loopIvs);
- // Swap the ivs which will reorder memory accesses.
+ auto ivsStorage = llvm::to_vector<8>(loopIvs);
+ // Swap the ivsStorage which will reorder memory accesses.
if (coalescedIdx >= 0)
- std::swap(ivs.back(), ivs[coalescedIdx]);
- // Computes clippedScalarAccessExprs in the loop nest scope (ivs exist).
- SmallVector<Value, 8> indices = clip(transfer, memRefBoundsCapture, ivs);
- ArrayRef<Value> indicesRef(indices), ivsRef(ivs);
+ std::swap(ivsStorage.back(), ivsStorage[coalescedIdx]);
+
+ ArrayRef<Value> ivs(ivsStorage);
Value pos =
- std_index_cast(IntegerType::get(32, op->getContext()), ivsRef.back());
- Value scalar = vector_extract_element(local(ivsRef.drop_back()), pos);
- remote(indices) = scalar;
+ std_index_cast(IntegerType::get(32, op->getContext()), ivs.back());
+ auto storeValue = [&](ArrayRef<Value> indices) {
+ Value scalar = vector_extract_element(local(ivs.drop_back()), pos);
+ remote(indices) = scalar;
+ };
+ emitWithBoundsChecks(
+ rewriter, cast<VectorTransferOpInterface>(transfer.getOperation()), ivs,
+ memRefBoundsCapture, storeValue);
});
// 3. Erase.
diff --git a/mlir/test/Conversion/VectorToSCF/vector-to-loops.mlir b/mlir/test/Conversion/VectorToSCF/vector-to-loops.mlir
index 5e8aea1f5113..ef1b2e995053 100644
--- a/mlir/test/Conversion/VectorToSCF/vector-to-loops.mlir
+++ b/mlir/test/Conversion/VectorToSCF/vector-to-loops.mlir
@@ -15,11 +15,13 @@ func @materialize_read_1d() {
%ip3 = affine.apply affine_map<(d0) -> (d0 + 3)> (%i1)
%f4 = vector.transfer_read %A[%i0, %ip3], %f0 {permutation_map = affine_map<(d0, d1) -> (d0)>} : memref<7x42xf32>, vector<4xf32>
// Both accesses in the load must be clipped otherwise %i1 + 2 and %i1 + 3 will go out of bounds.
- // CHECK: {{.*}} = select
- // CHECK: %[[FILTERED1:.*]] = select
- // CHECK: {{.*}} = select
- // CHECK: %[[FILTERED2:.*]] = select
- // CHECK: %{{.*}} = load {{.*}}[%[[FILTERED1]], %[[FILTERED2]]] : memref<7x42xf32>
+ // CHECK: scf.if
+ // CHECK-NEXT: load
+ // CHECK-NEXT: vector.insertelement
+ // CHECK-NEXT: store
+ // CHECK-NEXT: else
+ // CHECK-NEXT: vector.insertelement
+ // CHECK-NEXT: store
}
}
return
@@ -53,7 +55,6 @@ func @materialize_read_1d_partially_specialized(%dyn1 : index, %dyn2 : index, %d
// -----
// CHECK: #[[$ADD:map[0-9]+]] = affine_map<(d0, d1) -> (d0 + d1)>
-// CHECK: #[[$SUB:map[0-9]+]] = affine_map<()[s0] -> (s0 - 1)>
// CHECK-LABEL: func @materialize_read(%{{.*}}: index, %{{.*}}: index, %{{.*}}: index, %{{.*}}: index) {
func @materialize_read(%M: index, %N: index, %O: index, %P: index) {
@@ -72,37 +73,18 @@ func @materialize_read(%M: index, %N: index, %O: index, %P: index) {
// CHECK-NEXT: scf.for %[[I4:.*]] = %[[C0]] to %[[C3]] step %[[C1]] {
// CHECK-NEXT: scf.for %[[I5:.*]] = %[[C0]] to %[[C4]] step %[[C1]] {
// CHECK-NEXT: scf.for %[[I6:.*]] = %[[C0]] to %[[C5]] step %[[C1]] {
- // CHECK-NEXT: {{.*}} = affine.apply #[[$ADD]](%[[I0]], %[[I4]])
- // CHECK-NEXT: {{.*}} = affine.apply #[[$SUB]]()[%{{.*}}]
- // CHECK-NEXT: {{.*}} = cmpi "slt", {{.*}} : index
- // CHECK-NEXT: {{.*}} = select
- // CHECK-NEXT: {{.*}} = cmpi "slt", {{.*}}, %[[C0]] : index
- // CHECK-NEXT: %[[L0:.*]] = select
- //
- // CHECK-NEXT: {{.*}} = affine.apply #[[$SUB]]()[%{{.*}}]
- // CHECK-NEXT: {{.*}} = cmpi "slt", {{.*}} : index
- // CHECK-NEXT: {{.*}} = select
- // CHECK-NEXT: {{.*}} = cmpi "slt", {{.*}}, %[[C0]] : index
- // CHECK-NEXT: %[[L1:.*]] = select
- //
- // CHECK-NEXT: {{.*}} = affine.apply #[[$SUB]]()[%{{.*}}]
- // CHECK-NEXT: {{.*}} = cmpi "slt", {{.*}} : index
- // CHECK-NEXT: {{.*}} = select
- // CHECK-NEXT: {{.*}} = cmpi "slt", {{.*}}, %[[C0]] : index
- // CHECK-NEXT: %[[L2:.*]] = select
- //
- // CHECK-NEXT: {{.*}} = affine.apply #[[$ADD]](%[[I3]], %[[I6]])
- // CHECK-NEXT: {{.*}} = affine.apply #[[$SUB]]()[%{{.*}}]
- // CHECK-NEXT: {{.*}} = cmpi "slt", {{.*}} : index
- // CHECK-NEXT: {{.*}} = select
- // CHECK-NEXT: {{.*}} = cmpi "slt", {{.*}}, %[[C0]] : index
- // CHECK-NEXT: %[[L3:.*]] = select
- // CHECK-NEXT: %[[VIDX:.*]] = index_cast %[[I4]]
- //
- // CHECK-NEXT: %[[SCAL:.*]] = load %{{.*}}[%[[L0]], %[[L1]], %[[L2]], %[[L3]]] : memref<?x?x?x?xf32>
- // CHECK-NEXT: %[[VEC:.*]] = load %[[ALLOC]][%[[I6]], %[[I5]]] : memref<5x4xvector<3xf32>>
- // CHECK-NEXT: %[[RVEC:.*]] = vector.insertelement %[[SCAL]], %[[VEC]][%[[VIDX]] : i32] : vector<3xf32>
- // CHECK-NEXT: store %[[RVEC]], %[[ALLOC]][%[[I6]], %[[I5]]] : memref<5x4xvector<3xf32>>
+ // CHECK: %[[VIDX:.*]] = index_cast %[[I4]]
+ // CHECK: %[[VEC:.*]] = load %[[ALLOC]][%[[I6]], %[[I5]]] : memref<5x4xvector<3xf32>>
+ // CHECK: %[[L0:.*]] = affine.apply #[[$ADD]](%[[I0]], %[[I4]])
+ // CHECK: %[[L3:.*]] = affine.apply #[[$ADD]](%[[I3]], %[[I6]])
+ // CHECK-NEXT: scf.if
+ // CHECK-NEXT: %[[SCAL:.*]] = load %{{.*}}[%[[L0]], %[[I1]], %[[I2]], %[[L3]]] : memref<?x?x?x?xf32>
+ // CHECK-NEXT: %[[RVEC:.*]] = vector.insertelement %[[SCAL]], %[[VEC]][%[[VIDX]] : i32] : vector<3xf32>
+ // CHECK-NEXT: store %[[RVEC]], %[[ALLOC]][%[[I6]], %[[I5]]] : memref<5x4xvector<3xf32>>
+ // CHECK-NEXT: } else {
+ // CHECK-NEXT: %[[CVEC:.*]] = vector.insertelement
+ // CHECK-NEXT: store %[[CVEC]], %[[ALLOC]][%[[I6]], %[[I5]]] : memref<5x4xvector<3xf32>>
+ // CHECK-NEXT: }
// CHECK-NEXT: }
// CHECK-NEXT: }
// CHECK-NEXT: }
@@ -132,7 +114,6 @@ func @materialize_read(%M: index, %N: index, %O: index, %P: index) {
// -----
// CHECK: #[[$ADD:map[0-9]+]] = affine_map<(d0, d1) -> (d0 + d1)>
-// CHECK: #[[$SUB:map[0-9]+]] = affine_map<()[s0] -> (s0 - 1)>
// CHECK-LABEL:func @materialize_write(%{{.*}}: index, %{{.*}}: index, %{{.*}}: index, %{{.*}}: index) {
func @materialize_write(%M: index, %N: index, %O: index, %P: index) {
@@ -153,37 +134,15 @@ func @materialize_write(%M: index, %N: index, %O: index, %P: index) {
// CHECK-NEXT: scf.for %[[I4:.*]] = %[[C0]] to %[[C3]] step %[[C1]] {
// CHECK-NEXT: scf.for %[[I5:.*]] = %[[C0]] to %[[C4]] step %[[C1]] {
// CHECK-NEXT: scf.for %[[I6:.*]] = %[[C0]] to %[[C5]] step %[[C1]] {
- // CHECK-NEXT: {{.*}} = affine.apply #[[$ADD]](%[[I0]], %[[I4]])
- // CHECK-NEXT: {{.*}} = affine.apply #[[$SUB]]()[%{{.*}}]
- // CHECK-NEXT: {{.*}} = cmpi "slt", {{.*}}, {{.*}} : index
- // CHECK-NEXT: {{.*}} = select {{.*}}, {{.*}}, {{.*}} : index
- // CHECK-NEXT: {{.*}} = cmpi "slt", {{.*}}, %[[C0]] : index
- // CHECK-NEXT: %[[S0:.*]] = select {{.*}}, %[[C0]], {{.*}} : index
- //
- // CHECK-NEXT: {{.*}} = affine.apply #[[$ADD]](%[[I1]], %[[I5]])
- // CHECK-NEXT: {{.*}} = affine.apply #[[$SUB]]()[%{{.*}}]
- // CHECK-NEXT: {{.*}} = cmpi "slt", {{.*}}, {{.*}} : index
- // CHECK-NEXT: {{.*}} = select {{.*}}, {{.*}}, {{.*}} : index
- // CHECK-NEXT: {{.*}} = cmpi "slt", {{.*}}, %[[C0]] : index
- // CHECK-NEXT: %[[S1:.*]] = select {{.*}}, %[[C0]], {{.*}} : index
- //
- // CHECK-NEXT: {{.*}} = affine.apply #[[$SUB]]()[%{{.*}}]
- // CHECK-NEXT: {{.*}} = cmpi "slt", %[[I2]], %{{.*}} : index
- // CHECK-NEXT: {{.*}} = select {{.*}}, %[[I2]], {{.*}} : index
- // CHECK-NEXT: {{.*}} = cmpi "slt", %[[I2]], %[[C0]] : index
- // CHECK-NEXT: %[[S2:.*]] = select {{.*}}, %[[C0]], {{.*}} : index
- //
- // CHECK-NEXT: {{.*}} = affine.apply #[[$ADD]](%[[I3]], %[[I6]])
- // CHECK-NEXT: {{.*}} = affine.apply #[[$SUB]]()[%{{.*}}]
- // CHECK-NEXT: {{.*}} = cmpi "slt", {{.*}}, {{.*}} : index
- // CHECK-NEXT: {{.*}} = select {{.*}}, {{.*}}, {{.*}} : index
- // CHECK-NEXT: {{.*}} = cmpi "slt", {{.*}}, %[[C0]] : index
- // CHECK-NEXT: %[[S3:.*]] = select {{.*}}, %[[C0]], {{.*}} : index
- // CHECK-NEXT: %[[VIDX:.*]] = index_cast %[[I4]]
- //
- // CHECK-NEXT: %[[VEC:.*]] = load {{.*}}[%[[I6]], %[[I5]]] : memref<5x4xvector<3xf32>>
- // CHECK-NEXT: %[[SCAL:.*]] = vector.extractelement %[[VEC]][%[[VIDX]] : i32] : vector<3xf32>
- // CHECK-NEXT: store %[[SCAL]], {{.*}}[%[[S0]], %[[S1]], %[[S2]], %[[S3]]] : memref<?x?x?x?xf32>
+ // CHECK: %[[VIDX:.*]] = index_cast %[[I4]]
+ // CHECK: %[[S0:.*]] = affine.apply #[[$ADD]](%[[I0]], %[[I4]])
+ // CHECK: %[[S1:.*]] = affine.apply #[[$ADD]](%[[I1]], %[[I5]])
+ // CHECK: %[[S3:.*]] = affine.apply #[[$ADD]](%[[I3]], %[[I6]])
+ // CHECK-NEXT: scf.if
+ // CHECK-NEXT: %[[VEC:.*]] = load {{.*}}[%[[I6]], %[[I5]]] : memref<5x4xvector<3xf32>>
+ // CHECK-NEXT: %[[SCAL:.*]] = vector.extractelement %[[VEC]][%[[VIDX]] : i32] : vector<3xf32>
+ // CHECK: store %[[SCAL]], {{.*}}[%[[S0]], %[[S1]], %[[I2]], %[[S3]]] : memref<?x?x?x?xf32>
+ // CHECK-NEXT: }
// CHECK-NEXT: }
// CHECK-NEXT: }
// CHECK-NEXT: }