[Mlir-commits] [mlir] 44826ec - [MLIR] Correct memrefdataflow behavior in the presence of cast and other operations

William S. Moses llvmlistbot at llvm.org
Mon Jun 28 09:23:49 PDT 2021


Author: William S. Moses
Date: 2021-06-28T12:23:29-04:00
New Revision: 44826ecd929bdd33b3c86650198a5f8a57965cc7

URL: https://github.com/llvm/llvm-project/commit/44826ecd929bdd33b3c86650198a5f8a57965cc7
DIFF: https://github.com/llvm/llvm-project/commit/44826ecd929bdd33b3c86650198a5f8a57965cc7.diff

LOG: [MLIR] Correct memrefdataflow behavior in the presence of cast and other operations

MemRefDataFlow performs mem2reg-style forwarding for affine loads and stores. Unfortunately, it is not presently correct in the presence of external operations such as memref.cast or function calls. This diff extends the pass to remain correct in the presence of such ops.
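
For example (adapted from the @no_forward_cast test added below, where %in is a memref<512xf32> function argument), forwarding %cf1 from the first store to the load would be incorrect, because the intervening store through the casted memref %m2 may write the very element being read:

  %cf1 = constant 1.0 : f32
  %cf2 = constant 2.0 : f32
  %m2 = memref.cast %in : memref<512xf32> to memref<?xf32>
  affine.for %i = 0 to 16 {
    affine.store %cf1, %in[32*%i] : memref<512xf32>
    // May write the same element as the store above, via the cast.
    affine.store %cf2, %m2[32*%i] : memref<?xf32>
    // Must not be replaced with %cf1.
    %ld1 = affine.load %in[32*%i] : memref<512xf32>
  }

The pass now checks for such intervening writes with a generic side-effect walk (hasNoInterveningEffect) instead of relying on affine dependence analysis alone.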

Differential Revision: https://reviews.llvm.org/D104053

Added: 
    

Modified: 
    mlir/lib/Dialect/Affine/Transforms/AffineScalarReplacement.cpp
    mlir/test/Dialect/Affine/scalrep.mlir

Removed: 
    


################################################################################
diff --git a/mlir/lib/Dialect/Affine/Transforms/AffineScalarReplacement.cpp b/mlir/lib/Dialect/Affine/Transforms/AffineScalarReplacement.cpp
index 2ab4d8fada6b..5be0dcdaea15 100644
--- a/mlir/lib/Dialect/Affine/Transforms/AffineScalarReplacement.cpp
+++ b/mlir/lib/Dialect/Affine/Transforms/AffineScalarReplacement.cpp
@@ -33,27 +33,17 @@ using namespace mlir;
 namespace {
 // The store to load forwarding and load CSE rely on three conditions:
 //
-// 1) store/load and load need to have mathematically equivalent affine access
-// functions (checked after full composition of load/store operands); this
-// implies that they access the same single memref element for all iterations of
-// the common surrounding loop,
+// 1) store/load providing a replacement value and load being replaced need to
+// have mathematically equivalent affine access functions (checked after full
+// composition of load/store operands); this implies that they access the same
+// single memref element for all iterations of the common surrounding loop,
 //
 // 2) the store/load op should dominate the load op,
 //
-// 3) among all op's that satisfy both (1) and (2), for store to load
-// forwarding, the one that does not dominate any store op that has a
-// dependence into the load, is provably the last writer to the particular
-// memref location being loaded at the load op, and its store value can be
-// forwarded to the load; for load CSE, any op that does not dominate any store
-// op that have a dependence into the load can be forwarded and the first one
-// found is chosen. Note that the only dependences that are to be considered are
-// those that are satisfied at the block* of the innermost common surrounding
-// loop of the <store/load, load> being considered.
-//
-// (* A dependence being satisfied at a block: a dependence that is satisfied by
-// virtue of the destination operation appearing textually / lexically after
-// the source operation within the body of a 'affine.for' operation; thus, a
-// dependence is always either satisfied by a loop or by a block).
+// 3) no operation that may write to memory read by the load being replaced can
+// occur after executing the instruction (load or store) providing the
+// replacement value and before the load being replaced (thus potentially
+// allowing overwriting the memory read by the load).
 //
 // The above conditions are simple to check, sufficient, and powerful for most
 // cases in practice - they are sufficient, but not necessary --- since they
@@ -70,16 +60,14 @@ struct AffineScalarReplacement
     : public AffineScalarReplacementBase<AffineScalarReplacement> {
   void runOnFunction() override;
 
-  LogicalResult forwardStoreToLoad(AffineReadOpInterface loadOp);
-  void loadCSE(AffineReadOpInterface loadOp);
-
-  // A list of memref's that are potentially dead / could be eliminated.
-  SmallPtrSet<Value, 4> memrefsToErase;
-  // Load ops whose results were replaced by those forwarded from stores
-  // dominating stores or loads..
-  SmallVector<Operation *, 8> loadOpsToErase;
+  LogicalResult forwardStoreToLoad(AffineReadOpInterface loadOp,
+                                   SmallVectorImpl<Operation *> &loadOpsToErase,
+                                   SmallPtrSetImpl<Value> &memrefsToErase,
+                                   DominanceInfo &domInfo);
 
-  DominanceInfo *domInfo = nullptr;
+  void loadCSE(AffineReadOpInterface loadOp,
+               SmallVectorImpl<Operation *> &loadOpsToErase,
+               DominanceInfo &domInfo);
 };
 
 } // end anonymous namespace
@@ -91,61 +79,204 @@ mlir::createAffineScalarReplacementPass() {
   return std::make_unique<AffineScalarReplacement>();
 }
 
-// Check if the store may be reaching the load.
-static bool storeMayReachLoad(Operation *storeOp, Operation *loadOp,
-                              unsigned minSurroundingLoops) {
-  MemRefAccess srcAccess(storeOp);
-  MemRefAccess destAccess(loadOp);
-  FlatAffineConstraints dependenceConstraints;
-  unsigned nsLoops = getNumCommonSurroundingLoops(*loadOp, *storeOp);
-  unsigned d;
-  // Dependences at loop depth <= minSurroundingLoops do NOT matter.
-  for (d = nsLoops + 1; d > minSurroundingLoops; d--) {
-    DependenceResult result = checkMemrefAccessDependence(
-        srcAccess, destAccess, d, &dependenceConstraints,
-        /*dependenceComponents=*/nullptr);
-    if (hasDependence(result))
-      break;
-  }
-  if (d <= minSurroundingLoops)
-    return false;
+/// Ensure that all operations that could be executed after `start`
+/// (noninclusive) and prior to `memOp` (e.g. on a control flow/op path
+/// between the operations) do not have the potential memory effect
+/// `EffectType` on `memOp`. `memOp` is an operation that reads or writes to
+/// a memref. For example, if `EffectType` is MemoryEffects::Write, this method
+/// will check if there is no write to the memory between `start` and `memOp`
+/// that would change the read within `memOp`.
+template <typename EffectType, typename T>
+bool hasNoInterveningEffect(Operation *start, T memOp) {
+
+  Value memref = memOp.getMemRef();
+  bool isOriginalAllocation = memref.getDefiningOp<memref::AllocaOp>() ||
+                              memref.getDefiningOp<memref::AllocOp>();
+
+  // A boolean representing whether an intervening operation could have impacted
+  // memOp.
+  bool hasSideEffect = false;
+
+  // Check whether the effect on memOp can be caused by a given operation op.
+  std::function<void(Operation *)> checkOperation = [&](Operation *op) {
+    // If the effect has already been found, exit early.
+    if (hasSideEffect)
+      return;
+
+    if (auto memEffect = dyn_cast<MemoryEffectOpInterface>(op)) {
+      SmallVector<MemoryEffects::EffectInstance, 1> effects;
+      memEffect.getEffects(effects);
+
+      bool opMayHaveEffect = false;
+      for (auto effect : effects) {
+        // If op causes EffectType on a potentially aliasing location for
+        // memOp, mark as having the effect.
+        if (isa<EffectType>(effect.getEffect())) {
+          if (isOriginalAllocation && effect.getValue() &&
+              (effect.getValue().getDefiningOp<memref::AllocaOp>() ||
+               effect.getValue().getDefiningOp<memref::AllocOp>())) {
+            if (effect.getValue() != memref)
+              continue;
+          }
+          opMayHaveEffect = true;
+          break;
+        }
+      }
 
-  return true;
+      if (!opMayHaveEffect)
+        return;
+
+      // If the side effect comes from an affine read or write, try to
+      // prove the side effecting `op` cannot reach `memOp`.
+      if (isa<AffineReadOpInterface, AffineWriteOpInterface>(op)) {
+        MemRefAccess srcAccess(op);
+        MemRefAccess destAccess(memOp);
+        // Dependence analysis is only correct if both ops operate on the same
+        // memref.
+        if (srcAccess.memref == destAccess.memref) {
+          FlatAffineConstraints dependenceConstraints;
+
+          // Number of loops surrounding both `start` and `memOp`.
+          unsigned minSurroundingLoops =
+              getNumCommonSurroundingLoops(*start, *memOp);
+
+          // Number of loops surrounding both `memOp` and `op`, the operation
+          // with the potential memory side effect that can occur on a path
+          // between `start` and `memOp`.
+          unsigned nsLoops = getNumCommonSurroundingLoops(*op, *memOp);
+
+          // For ease, let's consider the case where `op` is a store and we're
+          // looking for other potential stores (e.g. `op`) that overwrite memory
+          // after `start` and before it is read in `memOp`. In this case, we
+          // only need to consider other potential stores at depth >
+          // minSurroundingLoops, since `start` would overwrite any store with
+          // a smaller number of surrounding loops before `memOp` is read.
+          unsigned d;
+          for (d = nsLoops + 1; d > minSurroundingLoops; d--) {
+            DependenceResult result = checkMemrefAccessDependence(
+                srcAccess, destAccess, d, &dependenceConstraints,
+                /*dependenceComponents=*/nullptr);
+            if (hasDependence(result)) {
+              hasSideEffect = true;
+              return;
+            }
+          }
+
+          // No side effect was seen, simply return.
+          return;
+        }
+      }
+      hasSideEffect = true;
+      return;
+    }
+
+    if (op->hasTrait<OpTrait::HasRecursiveSideEffects>()) {
+      // Recurse into the regions for this op and check whether the internal
+      // operations may have the side effect `EffectType` on memOp.
+      for (Region &region : op->getRegions())
+        for (Block &block : region)
+          for (Operation &op : block)
+            checkOperation(&op);
+      return;
+    }
+
+    // Otherwise, conservatively assume generic operations have the effect
+    // on `memOp`.
+    hasSideEffect = true;
+    return;
+  };
+
+  // Check all paths from ancestor op `parent` to the operation `to` for the
+  // effect. It is known that `to` must be contained within `parent`.
+  auto until = [&](Operation *parent, Operation *to) {
+    // TODO check only the paths from `parent` to `to`.
+    // Currently we fall back and check the entire parent op, rather than
+    // just the paths from `parent` to `to`, stopping after reaching `to`.
+    // This is conservatively correct, but could be made more aggressive.
+    assert(parent->isAncestor(to));
+    checkOperation(parent);
+  };
+
+  // Check all paths from operation `from` to operation `untilOp` for the
+  // given memory effect.
+  std::function<void(Operation *, Operation *)> recur =
+      [&](Operation *from, Operation *untilOp) {
+        assert(
+            from->getParentRegion()->isAncestor(untilOp->getParentRegion()) &&
+            "Checking for side effect between two operations without a common "
+            "ancestor");
+
+        // If the operations are in different regions, recursively consider all
+        // paths from `from` to the parent of `to` and all paths from the parent
+        // of `to` to `to`.
+        if (from->getParentRegion() != untilOp->getParentRegion()) {
+          recur(from, untilOp->getParentOp());
+          until(untilOp->getParentOp(), untilOp);
+          return;
+        }
+
+        // Now, assuming that `from` and `to` exist in the same region, perform
+        // a CFG traversal to check all the relevant operations.
+
+        // Additional blocks to consider.
+        SmallVector<Block *, 2> todoBlocks;
+        {
+          // First consider the parent block of `from` and check all operations
+          // after `from`.
+          for (auto iter = ++from->getIterator(), end = from->getBlock()->end();
+               iter != end && &*iter != untilOp; ++iter) {
+            checkOperation(&*iter);
+          }
+
+          // If the parent block of `from` doesn't contain `untilOp`, add its
+          // successors to the list of blocks to check.
+          if (untilOp->getBlock() != from->getBlock())
+            for (Block *succ : from->getBlock()->getSuccessors())
+              todoBlocks.push_back(succ);
+        }
+
+        SmallPtrSet<Block *, 4> done;
+        // Traverse the CFG until hitting `to`.
+        while (todoBlocks.size()) {
+          Block *blk = todoBlocks.pop_back_val();
+          if (done.count(blk))
+            continue;
+          done.insert(blk);
+          for (auto &op : *blk) {
+            if (&op == untilOp)
+              break;
+            checkOperation(&op);
+            if (&op == blk->getTerminator())
+              for (Block *succ : blk->getSuccessors())
+                todoBlocks.push_back(succ);
+          }
+        }
+      };
+  recur(start, memOp);
+  return !hasSideEffect;
 }
 
-// This is a straightforward implementation not optimized for speed. Optimize
-// if needed.
-LogicalResult
-AffineScalarReplacement::forwardStoreToLoad(AffineReadOpInterface loadOp) {
-  // First pass over the use list to get the minimum number of surrounding
-  // loops common between the load op and the store op, with min taken across
-  // all store ops.
-  SmallVector<Operation *, 8> storeOps;
-  unsigned minSurroundingLoops = getNestingDepth(loadOp);
+/// Attempt to eliminate loadOp by replacing it with a value stored into memory
+/// which the load is guaranteed to retrieve. This check involves three
+/// components: 1) the store and load must be to the same location; 2) the store
+/// must dominate (and therefore must always occur prior to) the load; 3) no
+/// other operation may overwrite the memory read by the load between the store
+/// and the load. If such a value exists, the replaced `loadOp` will be added to
+/// `loadOpsToErase` and its memref will be added to `memrefsToErase`.
+LogicalResult AffineScalarReplacement::forwardStoreToLoad(
+    AffineReadOpInterface loadOp, SmallVectorImpl<Operation *> &loadOpsToErase,
+    SmallPtrSetImpl<Value> &memrefsToErase, DominanceInfo &domInfo) {
+
+  // The store op candidate for forwarding that satisfies all conditions
+  // to replace the load, if any.
+  Operation *lastWriteStoreOp = nullptr;
+
   for (auto *user : loadOp.getMemRef().getUsers()) {
     auto storeOp = dyn_cast<AffineWriteOpInterface>(user);
     if (!storeOp)
       continue;
-    unsigned nsLoops = getNumCommonSurroundingLoops(*loadOp, *storeOp);
-    minSurroundingLoops = std::min(nsLoops, minSurroundingLoops);
-    storeOps.push_back(storeOp);
-  }
-
-  // The list of store op candidates for forwarding that satisfy conditions
-  // (1) and (2) above - they will be filtered later when checking (3).
-  SmallVector<Operation *, 8> fwdingCandidates;
-
-  // Store ops that have a dependence into the load (even if they aren't
-  // forwarding candidates). Each forwarding candidate will be checked for a
-  // dominance on these. 'fwdingCandidates' are a subset of depSrcStores.
-  SmallVector<Operation *, 8> depSrcStores;
-
-  for (auto *storeOp : storeOps) {
-    if (!storeMayReachLoad(storeOp, loadOp, minSurroundingLoops))
-      continue;
-
-    // Stores that *may* be reaching the load.
-    depSrcStores.push_back(storeOp);
+    MemRefAccess srcAccess(storeOp);
+    MemRefAccess destAccess(loadOp);
 
     // 1. Check if the store and the load have mathematically equivalent
     // affine access functions; this implies that they statically refer to the
@@ -155,41 +286,24 @@ AffineScalarReplacement::forwardStoreToLoad(AffineReadOpInterface loadOp) {
     //     store %A[%M]
     //     load %A[%N]
     // Use the AffineValueMap difference based memref access equality checking.
-    MemRefAccess srcAccess(storeOp);
-    MemRefAccess destAccess(loadOp);
     if (srcAccess != destAccess)
       continue;
 
     // 2. The store has to dominate the load op to be candidate.
-    if (!domInfo->dominates(storeOp, loadOp))
+    if (!domInfo.dominates(storeOp, loadOp))
+      continue;
+
+    // 3. Ensure there is no intervening operation which could overwrite the
+    // value in memory.
+    if (!hasNoInterveningEffect<MemoryEffects::Write>(storeOp, loadOp))
       continue;
 
     // We now have a candidate for forwarding.
-    fwdingCandidates.push_back(storeOp);
+    assert(lastWriteStoreOp == nullptr &&
+           "multiple simulataneous replacement stores");
+    lastWriteStoreOp = storeOp;
   }
 
-  // 3. Of all the store ops that meet the above criteria, the store op
-  // that does not dominate any of the ops in 'depSrcStores' (if such exists)
-  // will not have any of those latter ops on its paths to `loadOp`. It would
-  // thus be the unique store providing the value to the load. This condition is
-  // however conservative for eg:
-  //
-  // for ... {
-  //    store
-  //    load
-  //    store
-  //    load
-  // }
-  //
-  Operation *lastWriteStoreOp = nullptr;
-  for (auto *storeOp : fwdingCandidates) {
-    if (llvm::all_of(depSrcStores, [&](Operation *depStore) {
-          return !domInfo->properlyDominates(storeOp, depStore);
-        })) {
-      lastWriteStoreOp = storeOp;
-      break;
-    }
-  }
   if (!lastWriteStoreOp)
     return failure();
 
@@ -213,109 +327,85 @@ AffineScalarReplacement::forwardStoreToLoad(AffineReadOpInterface loadOp) {
 // loadA will be replaced with loadB if:
 // 1) loadA and loadB have mathematically equivalent affine access functions.
 // 2) loadB dominates loadA.
-// 3) loadB does not dominate any of the store ops that have a dependence into
-//    loadA.
-void AffineScalarReplacement::loadCSE(AffineReadOpInterface loadOp) {
-  // The list of load op candidates for forwarding that satisfy conditions
-  // (1) and (2) above - they will be filtered later when checking (3).
-  SmallVector<Operation *, 8> fwdingCandidates;
-  SmallVector<Operation *, 8> storeOps;
-  unsigned minSurroundingLoops = getNestingDepth(loadOp);
-  MemRefAccess memRefAccess(loadOp);
-  // First pass over the use list to get 1) the minimum number of surrounding
-  // loops common between the load op and an load op candidate, with min taken
-  // across all load op candidates; 2) load op candidates; 3) store ops.
-  // We take min across all load op candidates instead of all load ops to make
-  // sure later dependence check is performed at loop depths that do matter.
-  for (auto *user : loadOp.getMemRef().getUsers()) {
-    if (auto storeOp = dyn_cast<AffineWriteOpInterface>(user)) {
-      storeOps.push_back(storeOp);
-    } else if (auto aLoadOp = dyn_cast<AffineReadOpInterface>(user)) {
-      MemRefAccess otherMemRefAccess(aLoadOp);
-      // No need to consider Load ops that have been replaced in previous store
-      // to load forwarding or loadCSE. If loadA or storeA can be forwarded to
-      // loadB, then loadA or storeA can be forwarded to loadC iff loadB can be
-      // forwarded to loadC.
-      // If loadB is visited before loadC and replace with loadA, we do not put
-      // loadB in candidates list, only loadA. If loadC is visited before loadB,
-      // loadC may be replaced with loadB, which will be replaced with loadA
-      // later.
-      if (aLoadOp != loadOp && !llvm::is_contained(loadOpsToErase, aLoadOp) &&
-          memRefAccess == otherMemRefAccess &&
-          domInfo->dominates(aLoadOp, loadOp)) {
-        fwdingCandidates.push_back(aLoadOp);
-        unsigned nsLoops = getNumCommonSurroundingLoops(*loadOp, *aLoadOp);
-        minSurroundingLoops = std::min(nsLoops, minSurroundingLoops);
-      }
+// 3) There is no write between loadA and loadB.
+void AffineScalarReplacement::loadCSE(
+    AffineReadOpInterface loadA, SmallVectorImpl<Operation *> &loadOpsToErase,
+    DominanceInfo &domInfo) {
+  SmallVector<AffineReadOpInterface, 4> loadCandidates;
+  for (auto *user : loadA.getMemRef().getUsers()) {
+    auto loadB = dyn_cast<AffineReadOpInterface>(user);
+    if (!loadB || loadB == loadA)
+      continue;
+
+    MemRefAccess srcAccess(loadB);
+    MemRefAccess destAccess(loadA);
+
+    // 1. The accesses have to be to the same location.
+    if (srcAccess != destAccess) {
+      continue;
     }
-  }
 
-  // No forwarding candidate.
-  if (fwdingCandidates.empty())
-    return;
+    // 2. loadB has to dominate loadA to be a candidate.
+    if (!domInfo.dominates(loadB, loadA))
+      continue;
 
-  // Store ops that have a dependence into the load.
-  SmallVector<Operation *, 8> depSrcStores;
+    // 3. There is no write between loadA and loadB.
+    if (!hasNoInterveningEffect<MemoryEffects::Write>(loadB.getOperation(),
+                                                      loadA))
+      continue;
 
-  for (auto *storeOp : storeOps) {
-    if (!storeMayReachLoad(storeOp, loadOp, minSurroundingLoops))
+    // Check if two values have the same shape. This is needed for affine vector
+    // loads.
+    if (loadB.getValue().getType() != loadA.getValue().getType())
       continue;
 
-    // Stores that *may* be reaching the load.
-    depSrcStores.push_back(storeOp);
+    loadCandidates.push_back(loadB);
   }
 
-  // 3. Of all the load op's that meet the above criteria, return the first load
-  // found that does not dominate any op in 'depSrcStores' and has the same
-  // shape as the load to be replaced (if one exists). The shape check is needed
-  // for affine vector loads.
-  Operation *firstLoadOp = nullptr;
-  Value oldVal = loadOp.getValue();
-  for (auto *loadOp : fwdingCandidates) {
-    if (llvm::all_of(depSrcStores,
-                     [&](Operation *depStore) {
-                       return !domInfo->properlyDominates(loadOp, depStore);
-                     }) &&
-        cast<AffineReadOpInterface>(loadOp).getValue().getType() ==
-            oldVal.getType()) {
-      firstLoadOp = loadOp;
+  // Of the legal load candidates, use the one that dominates all others
+  // to minimize the subsequent need for loadCSE.
+  Value loadB;
+  for (AffineReadOpInterface option : loadCandidates) {
+    if (llvm::all_of(loadCandidates, [&](AffineReadOpInterface depStore) {
+          return depStore == option ||
+                 domInfo.dominates(option.getOperation(),
+                                   depStore.getOperation());
+        })) {
+      loadB = option.getValue();
       break;
     }
   }
-  if (!firstLoadOp)
-    return;
 
-  // Perform the actual load to load forwarding.
-  Value loadVal = cast<AffineReadOpInterface>(firstLoadOp).getValue();
-  loadOp.getValue().replaceAllUsesWith(loadVal);
-  // Record this to erase later.
-  loadOpsToErase.push_back(loadOp);
+  if (loadB) {
+    loadA.getValue().replaceAllUsesWith(loadB);
+    // Record this to erase later.
+    loadOpsToErase.push_back(loadA);
+  }
 }
 
 void AffineScalarReplacement::runOnFunction() {
   // Only supports single block functions at the moment.
   FuncOp f = getFunction();
-  if (!llvm::hasSingleElement(f)) {
-    markAllAnalysesPreserved();
-    return;
-  }
 
-  domInfo = &getAnalysis<DominanceInfo>();
+  // Load ops whose results were replaced by those forwarded from stores.
+  SmallVector<Operation *, 8> opsToErase;
+
+  // A list of memref's that are potentially dead / could be eliminated.
+  SmallPtrSet<Value, 4> memrefsToErase;
 
-  loadOpsToErase.clear();
-  memrefsToErase.clear();
+  auto &domInfo = getAnalysis<DominanceInfo>();
 
-  // Walk all load's and perform store to load forwarding and loadCSE.
+  // Walk all loads; try store to load forwarding, falling back to loadCSE.
   f.walk([&](AffineReadOpInterface loadOp) {
-    // Do store to load forwarding first, if no success, try loadCSE.
-    if (failed(forwardStoreToLoad(loadOp)))
-      loadCSE(loadOp);
+    if (failed(
+            forwardStoreToLoad(loadOp, opsToErase, memrefsToErase, domInfo))) {
+      loadCSE(loadOp, opsToErase, domInfo);
+    }
   });
 
-  // Erase all load op's whose results were replaced with store or load fwd'ed
-  // ones.
-  for (auto *loadOp : loadOpsToErase)
-    loadOp->erase();
+  // Erase all load ops whose results were replaced with forwarded values.
+  for (auto *op : opsToErase)
+    op->erase();
 
   // Check if the store fwd'ed memrefs are now left with only stores and can
   // thus be completely deleted. Note: the canonicalize pass should be able

diff --git a/mlir/test/Dialect/Affine/scalrep.mlir b/mlir/test/Dialect/Affine/scalrep.mlir
index 8d39fe300345..452ff0939a18 100644
--- a/mlir/test/Dialect/Affine/scalrep.mlir
+++ b/mlir/test/Dialect/Affine/scalrep.mlir
@@ -554,3 +554,91 @@ func @vector_load_affine_apply_store_load(%in : memref<512xf32>, %out : memref<5
   }
   return
 }
+
+// CHECK-LABEL: func @external_no_forward_load
+
+func @external_no_forward_load(%in : memref<512xf32>, %out : memref<512xf32>) {
+  affine.for %i = 0 to 16 {
+    %ld0 = affine.load %in[32*%i] : memref<512xf32>
+    affine.store %ld0, %out[32*%i] : memref<512xf32>
+    "memop"(%in, %out) : (memref<512xf32>, memref<512xf32>) -> ()
+    %ld1 = affine.load %in[32*%i] : memref<512xf32>
+    affine.store %ld1, %out[32*%i] : memref<512xf32>
+  }
+  return
+}
+// CHECK:   affine.load
+// CHECK:   affine.store
+// CHECK:   affine.load
+// CHECK:   affine.store
+
+// CHECK-LABEL: func @external_no_forward_store
+
+func @external_no_forward_store(%in : memref<512xf32>, %out : memref<512xf32>) {
+  %cf1 = constant 1.0 : f32
+  affine.for %i = 0 to 16 {
+    affine.store %cf1, %in[32*%i] : memref<512xf32>
+    "memop"(%in, %out) : (memref<512xf32>, memref<512xf32>) -> ()
+    %ld1 = affine.load %in[32*%i] : memref<512xf32>
+    affine.store %ld1, %out[32*%i] : memref<512xf32>
+  }
+  return
+}
+// CHECK:   affine.store
+// CHECK:   affine.load
+// CHECK:   affine.store
+
+// CHECK-LABEL: func @no_forward_cast
+
+func @no_forward_cast(%in : memref<512xf32>, %out : memref<512xf32>) {
+  %cf1 = constant 1.0 : f32
+  %cf2 = constant 2.0 : f32
+  %m2 = memref.cast %in : memref<512xf32> to memref<?xf32>
+  affine.for %i = 0 to 16 {
+    affine.store %cf1, %in[32*%i] : memref<512xf32>
+    affine.store %cf2, %m2[32*%i] : memref<?xf32>
+    %ld1 = affine.load %in[32*%i] : memref<512xf32>
+    affine.store %ld1, %out[32*%i] : memref<512xf32>
+  }
+  return
+}
+// CHECK:   affine.store
+// CHECK-NEXT:   affine.store
+// CHECK-NEXT:   affine.load
+// CHECK-NEXT:   affine.store
+
+// Although the load's affine access is equivalent to that of the first store,
+// the second store may overwrite the loaded memory on inner loop iterations
+// executing between the first store and the load, so the first store cannot
+// be forwarded to the load.
+
+// CHECK-LABEL: func @overlap_no_fwd
+func @overlap_no_fwd(%N : index) -> f32 {
+  %cf7 = constant 7.0 : f32
+  %cf9 = constant 9.0 : f32
+  %c0 = constant 0 : index
+  %c1 = constant 1 : index
+  %m = memref.alloc() : memref<10xf32>
+  affine.for %i0 = 0 to 5 {
+    affine.store %cf7, %m[2 * %i0] : memref<10xf32>
+    affine.for %i1 = 0 to %N {
+      %v0 = affine.load %m[2 * %i0] : memref<10xf32>
+      %v1 = addf %v0, %v0 : f32
+      affine.store %cf9, %m[%i0 + 1] : memref<10xf32>
+    }
+  }
+  // Due to this load, the memref isn't optimized away.
+  %v3 = affine.load %m[%c1] : memref<10xf32>
+  return %v3 : f32
+
+// CHECK:  affine.for %{{.*}} = 0 to 5 {
+// CHECK-NEXT:    affine.store %{{.*}}, %{{.*}}[%{{.*}}] : memref<10xf32>
+// CHECK-NEXT:    affine.for %{{.*}} = 0 to %{{.*}} {
+// CHECK-NEXT:      %{{.*}} = affine.load
+// CHECK-NEXT:      %{{.*}} = addf %{{.*}}, %{{.*}} : f32
+// CHECK-NEXT:      affine.store %{{.*}}, %{{.*}}[%{{.*}}] : memref<10xf32>
+// CHECK-NEXT:    }
+// CHECK-NEXT:  }
+// CHECK-NEXT:  %{{.*}} = affine.load %{{.*}}[%{{.*}}] : memref<10xf32>
+// CHECK-NEXT:  return %{{.*}} : f32
+}
+