[llvm] 43aa722 - [DSE,MemorySSA] Check if Current is valid for elimination first.

Florian Hahn via llvm-commits llvm-commits at lists.llvm.org
Fri Aug 28 03:19:59 PDT 2020


Author: Florian Hahn
Date: 2020-08-28T11:19:04+01:00
New Revision: 43aa7227dfab8f847a346ce6da7318584182ddf1

URL: https://github.com/llvm/llvm-project/commit/43aa7227dfab8f847a346ce6da7318584182ddf1
DIFF: https://github.com/llvm/llvm-project/commit/43aa7227dfab8f847a346ce6da7318584182ddf1.diff

LOG: [DSE,MemorySSA] Check if Current is valid for elimination first.

This changes getDomMemoryDef to check whether Current is a valid
candidate for elimination before checking for reads. Before this change,
we spent a lot of compile time checking for read accesses of Current
candidates that might not even be removable.

This patch flips the logic, so we skip a Current that cannot be
removed before checking all of its uses. This is much more efficient in
practice.

It also adds a more aggressive limit for checking partially overlapping
stores. The main problem with overlapping stores is that we do not know
whether they will lead to elimination until we have seen all of them. This
patch adds a new limit for overlapping store candidates, which keeps
the number of modified overlapping stores roughly the same.

This is another substantial compile-time improvement (while also
increasing the number of stores eliminated). Geomean -O3 -0.67%,
ReleaseThinLTO -0.97%.

http://llvm-compile-time-tracker.com/compare.php?from=0a929b6978a068af8ddb02d0d4714a2843dd8ba9&to=2e630629b43f64b60b282e90f0d96082fde2dacc&stat=instructions

Reviewed By: asbirlea

Differential Revision: https://reviews.llvm.org/D86487

Added: 
    

Modified: 
    llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp
    llvm/test/Transforms/DeadStoreElimination/MSSA/combined-partial-overwrites.ll
    llvm/test/Transforms/DeadStoreElimination/MSSA/debug-counter.ll
    llvm/test/Transforms/DeadStoreElimination/MSSA/memoryssa-scan-limit.ll
    llvm/test/Transforms/DeadStoreElimination/MSSA/multiblock-overlap.ll
    llvm/test/Transforms/DeadStoreElimination/MSSA/simple.ll

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp b/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp
index 0c7992031eb5..ae4cc567f0a9 100644
--- a/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp
+++ b/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp
@@ -87,6 +87,8 @@ STATISTIC(NumModifiedStores, "Number of stores modified");
 STATISTIC(NumCFGChecks, "Number of stores modified");
 STATISTIC(NumCFGTries, "Number of stores modified");
 STATISTIC(NumCFGSuccess, "Number of stores modified");
+STATISTIC(NumGetDomMemoryDefPassed,
+          "Number of times a valid candidate is returned from getDomMemoryDef");
 STATISTIC(NumDomMemDefChecks,
           "Number iterations check for reads in getDomMemoryDef");
 
@@ -116,6 +118,12 @@ static cl::opt<unsigned> MemorySSAUpwardsStepLimit(
     cl::desc("The maximum number of steps while walking upwards to find "
              "MemoryDefs that may be killed (default = 70)"));
 
+static cl::opt<unsigned> MemorySSAPartialStoreLimit(
+    "dse-memoryssa-partial-store-limit", cl::init(5), cl::Hidden,
+    cl::desc("The maximum number candidates that only partially overwrite the "
+             "killing MemoryDef to consider"
+             " (default = 5)"));
+
 static cl::opt<unsigned> MemorySSADefsPerBlockLimit(
     "dse-memoryssa-defs-per-block-limit", cl::init(5000), cl::Hidden,
     cl::desc("The number of MemoryDefs we consider as candidates to eliminated "
@@ -1464,12 +1472,12 @@ namespace {
 // 2. Check that there are no reads between EarlierAccess and the StartDef by
 //    checking all uses starting at EarlierAccess and walking until we see
 //    StartDef.
-// 3. For each found EarlierDef, check that:
-//   1. There are no barrier instructions between EarlierDef and StartDef (like
+// 3. For each found CurrentDef, check that:
+//   1. There are no barrier instructions between CurrentDef and StartDef (like
 //       throws or stores with ordering constraints).
-//   2. StartDef is executed whenever EarlierDef is executed.
-//   3. StartDef completely overwrites EarlierDef.
-// 4. Erase EarlierDef from the function and MemorySSA.
+//   2. StartDef is executed whenever CurrentDef is executed.
+//   3. StartDef completely overwrites CurrentDef.
+// 4. Erase CurrentDef from the function and MemorySSA.
 
 // Returns true if \p M is an intrisnic that does not read or write memory.
 bool isNoopIntrinsic(MemoryUseOrDef *M) {
@@ -1801,26 +1809,29 @@ struct DSEState {
     return isRefSet(BatchAA.getModRefInfo(UseInst, DefLoc));
   }
 
-  // Find a MemoryDef writing to \p DefLoc and dominating \p Current, with no
-  // read access between them or on any other path to a function exit block if
-  // \p DefLoc is not accessible after the function returns. If there is no such
-  // MemoryDef, return None. The returned value may not (completely) overwrite
-  // \p DefLoc. Currently we bail out when we encounter an aliasing MemoryUse
-  // (read).
+  // Find a MemoryDef writing to \p DefLoc and dominating \p StartAccess, with
+  // no read access between them or on any other path to a function exit block
+  // if \p DefLoc is not accessible after the function returns. If there is no
+  // such MemoryDef, return None. The returned value may not (completely)
+  // overwrite \p DefLoc. Currently we bail out when we encounter an aliasing
+  // MemoryUse (read).
   Optional<MemoryAccess *>
-  getDomMemoryDef(MemoryDef *KillingDef, MemoryAccess *Current,
+  getDomMemoryDef(MemoryDef *KillingDef, MemoryAccess *StartAccess,
                   MemoryLocation DefLoc, const Value *DefUO, CheckCache &Cache,
-                  unsigned &ScanLimit, unsigned &WalkerStepLimit) {
+                  unsigned &ScanLimit, unsigned &WalkerStepLimit,
+                  bool IsMemTerm, unsigned &PartialLimit) {
     if (ScanLimit == 0 || WalkerStepLimit == 0) {
       LLVM_DEBUG(dbgs() << "\n    ...  hit scan limit\n");
       return None;
     }
 
-    MemoryAccess *StartAccess = Current;
+    MemoryAccess *Current = StartAccess;
+    Instruction *KillingI = KillingDef->getMemoryInst();
     bool StepAgain;
-    LLVM_DEBUG(dbgs() << "  trying to get dominating access for " << *Current
-                      << "\n");
-    // Find the next clobbering Mod access for DefLoc, starting at Current.
+    LLVM_DEBUG(dbgs() << "  trying to get dominating access for "
+                      << *StartAccess << "\n");
+
+    // Find the next clobbering Mod access for DefLoc, starting at StartAccess.
     do {
       StepAgain = false;
       // Reached TOP.
@@ -1839,12 +1850,86 @@ struct DSEState {
       if (isa<MemoryPhi>(Current))
         break;
 
-      // Check if we can skip EarlierDef for DSE.
-      MemoryDef *CurrentDef = dyn_cast<MemoryDef>(Current);
-      if (CurrentDef &&
-          canSkipDef(CurrentDef, !isInvisibleToCallerBeforeRet(DefUO))) {
+      // Below, check if CurrentDef is a valid candidate to be eliminated by
+      // KillingDef. If it is not, check the next candidate.
+      MemoryDef *CurrentDef = cast<MemoryDef>(Current);
+      Instruction *CurrentI = CurrentDef->getMemoryInst();
+
+      if (canSkipDef(CurrentDef, !isInvisibleToCallerBeforeRet(DefUO))) {
+        StepAgain = true;
+        Current = CurrentDef->getDefiningAccess();
+        continue;
+      }
+
+      // Before we try to remove anything, check for any extra throwing
+      // instructions that block us from DSEing
+      if (mayThrowBetween(KillingI, CurrentI, DefUO)) {
+        LLVM_DEBUG(dbgs() << "  ... skip, may throw!\n");
+        return None;
+      }
+
+      // Check for anything that looks like it will be a barrier to further
+      // removal
+      if (isDSEBarrier(DefUO, CurrentI)) {
+        LLVM_DEBUG(dbgs() << "  ... skip, barrier\n");
+        return None;
+      }
+
+      // If Current is known to be on path that reads DefLoc or is a read
+      // clobber, bail out, as the path is not profitable. We skip this check
+      // for intrinsic calls, because the code knows how to handle memcpy
+      // intrinsics.
+      if (!isa<IntrinsicInst>(CurrentI) &&
+          (Cache.KnownReads.contains(Current) ||
+           isReadClobber(DefLoc, CurrentI))) {
+        Cache.KnownReads.insert(Current);
+        return None;
+      }
+
+      // If Current cannot be analyzed or is not removable, check the next
+      // candidate.
+      if (!hasAnalyzableMemoryWrite(CurrentI, TLI) || !isRemovable(CurrentI)) {
         StepAgain = true;
         Current = CurrentDef->getDefiningAccess();
+        continue;
+      }
+
+      auto CurrentLoc = getLocForWriteEx(CurrentI);
+      if (!CurrentLoc)
+        break;
+
+      if (IsMemTerm) {
+        // If the killing def is a memory terminator (e.g. lifetime.end), check
+        // the next candidate if the current Current does not write the same
+        // underlying object as the terminator.
+        const Value *NIUnd = getUnderlyingObject(CurrentLoc->Ptr);
+        if (DefUO != NIUnd) {
+          StepAgain = true;
+          Current = CurrentDef->getDefiningAccess();
+        }
+        continue;
+      } else {
+        int64_t InstWriteOffset, DepWriteOffset;
+        auto OR = isOverwrite(DefLoc, *CurrentLoc, DL, TLI, DepWriteOffset,
+                              InstWriteOffset, BatchAA, &F);
+        // If Current does not write to the same object as KillingDef, check
+        // the next candidate.
+        if (OR == OW_Unknown) {
+          StepAgain = true;
+          Current = CurrentDef->getDefiningAccess();
+        } else if (OR == OW_MaybePartial) {
+          // If KillingDef only partially overwrites Current, check the next
+          // candidate if the partial step limit is exceeded. This aggressively
+          // limits the number of candidates for partial store elimination,
+          // which are less likely to be removable in the end.
+          if (PartialLimit <= 1) {
+            StepAgain = true;
+            Current = CurrentDef->getDefiningAccess();
+            WalkerStepLimit -= 1;
+            continue;
+          }
+          PartialLimit -= 1;
+        }
       }
     } while (StepAgain);
 
@@ -2260,10 +2345,14 @@ bool eliminateDeadStoresMemorySSA(Function &F, AliasAnalysis &AA,
 
     unsigned ScanLimit = MemorySSAScanLimit;
     unsigned WalkerStepLimit = MemorySSAUpwardsStepLimit;
+    unsigned PartialLimit = MemorySSAPartialStoreLimit;
     // Worklist of MemoryAccesses that may be killed by KillingDef.
     SetVector<MemoryAccess *> ToCheck;
     ToCheck.insert(KillingDef->getDefiningAccess());
 
+    if (!SILocUnd)
+      continue;
+    bool IsMemTerm = State.isMemTerminatorInst(SI);
     DSEState::CheckCache Cache;
     // Check if MemoryAccesses in the worklist are killed by KillingDef.
     for (unsigned I = 0; I < ToCheck.size(); I++) {
@@ -2271,9 +2360,9 @@ bool eliminateDeadStoresMemorySSA(Function &F, AliasAnalysis &AA,
       if (State.SkipStores.count(Current))
         continue;
 
-      Optional<MemoryAccess *> Next =
-          State.getDomMemoryDef(KillingDef, Current, SILoc, SILocUnd, Cache,
-                                ScanLimit, WalkerStepLimit);
+      Optional<MemoryAccess *> Next = State.getDomMemoryDef(
+          KillingDef, Current, SILoc, SILocUnd, Cache, ScanLimit,
+          WalkerStepLimit, IsMemTerm, PartialLimit);
 
       if (!Next) {
         LLVM_DEBUG(dbgs() << "  finished walk\n");
@@ -2301,41 +2390,17 @@ bool eliminateDeadStoresMemorySSA(Function &F, AliasAnalysis &AA,
       MemoryDef *NextDef = dyn_cast<MemoryDef>(EarlierAccess);
       Instruction *NI = NextDef->getMemoryInst();
       LLVM_DEBUG(dbgs() << " (" << *NI << ")\n");
-
-      // Before we try to remove anything, check for any extra throwing
-      // instructions that block us from DSEing
-      if (State.mayThrowBetween(SI, NI, SILocUnd)) {
-        LLVM_DEBUG(dbgs() << "  ... skip, may throw!\n");
-        break;
-      }
-
-      // Check for anything that looks like it will be a barrier to further
-      // removal
-      if (State.isDSEBarrier(SILocUnd, NI)) {
-        LLVM_DEBUG(dbgs() << "  ... skip, barrier\n");
-        continue;
-      }
-
       ToCheck.insert(NextDef->getDefiningAccess());
-
-      if (!hasAnalyzableMemoryWrite(NI, TLI)) {
-        LLVM_DEBUG(dbgs() << "  ... skip, cannot analyze def\n");
-        continue;
-      }
-
-      if (!isRemovable(NI)) {
-        LLVM_DEBUG(dbgs() << "  ... skip, cannot remove def\n");
-        continue;
-      }
+      NumGetDomMemoryDefPassed++;
 
       if (!DebugCounter::shouldExecute(MemorySSACounter))
         continue;
 
       MemoryLocation NILoc = *State.getLocForWriteEx(NI);
 
-      if (State.isMemTerminatorInst(SI)) {
+      if (IsMemTerm) {
         const Value *NIUnd = getUnderlyingObject(NILoc.Ptr);
-        if (!SILocUnd || SILocUnd != NIUnd)
+        if (SILocUnd != NIUnd)
           continue;
         LLVM_DEBUG(dbgs() << "DSE: Remove Dead Store:\n  DEAD: " << *NI
                           << "\n  KILLER: " << *SI << '\n');

diff  --git a/llvm/test/Transforms/DeadStoreElimination/MSSA/combined-partial-overwrites.ll b/llvm/test/Transforms/DeadStoreElimination/MSSA/combined-partial-overwrites.ll
index faf7041bfc38..a3bd300c8b78 100644
--- a/llvm/test/Transforms/DeadStoreElimination/MSSA/combined-partial-overwrites.ll
+++ b/llvm/test/Transforms/DeadStoreElimination/MSSA/combined-partial-overwrites.ll
@@ -1,5 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt -S -dse -enable-dse-memoryssa -enable-dse-partial-store-merging=false < %s | FileCheck %s
+; RUN: opt -S -dse -enable-dse-memoryssa -enable-dse-partial-store-merging=false < %s | FileCheck --check-prefixes=CHECK,DEFAULT-LIMIT %s
+; RUN: opt -S -dse -enable-dse-memoryssa -enable-dse-partial-store-merging=false -dse-memoryssa-partial-store-limit=10 < %s | FileCheck --check-prefixes=CHECK,LARGER-LIMIT %s
 target datalayout = "E-m:e-i64:64-n32:64"
 target triple = "powerpc64le-unknown-linux"
 
@@ -209,22 +210,43 @@ declare void @llvm.memset.p0i8.i64(i8* nocapture, i8, i64, i1)
 declare void @goFunc(%struct.foostruct*)
 declare i32 @fa(i8*, i8**, i32, i8, i8*)
 
+; We miss this case, because of an aggressive limit of partial overlap analysis.
+; With a larger partial store limit, we remove the memset.
 define void @test4()  {
-; CHECK-LABEL: @test4(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[BANG:%.*]] = alloca [[STRUCT_FOOSTRUCT:%.*]], align 8
-; CHECK-NEXT:    [[V2:%.*]] = getelementptr inbounds [[STRUCT_FOOSTRUCT]], %struct.foostruct* [[BANG]], i64 0, i32 0
-; CHECK-NEXT:    store i32 (i8*, i8**, i32, i8, i8*)* @fa, i32 (i8*, i8**, i32, i8, i8*)** [[V2]], align 8
-; CHECK-NEXT:    [[V3:%.*]] = getelementptr inbounds [[STRUCT_FOOSTRUCT]], %struct.foostruct* [[BANG]], i64 0, i32 1
-; CHECK-NEXT:    store i32 (i8*, i8**, i32, i8, i8*)* @fa, i32 (i8*, i8**, i32, i8, i8*)** [[V3]], align 8
-; CHECK-NEXT:    [[V4:%.*]] = getelementptr inbounds [[STRUCT_FOOSTRUCT]], %struct.foostruct* [[BANG]], i64 0, i32 2
-; CHECK-NEXT:    store i32 (i8*, i8**, i32, i8, i8*)* @fa, i32 (i8*, i8**, i32, i8, i8*)** [[V4]], align 8
-; CHECK-NEXT:    [[V5:%.*]] = getelementptr inbounds [[STRUCT_FOOSTRUCT]], %struct.foostruct* [[BANG]], i64 0, i32 3
-; CHECK-NEXT:    store i32 (i8*, i8**, i32, i8, i8*)* @fa, i32 (i8*, i8**, i32, i8, i8*)** [[V5]], align 8
-; CHECK-NEXT:    [[V6:%.*]] = getelementptr inbounds [[STRUCT_FOOSTRUCT]], %struct.foostruct* [[BANG]], i64 0, i32 4
-; CHECK-NEXT:    store void (i8*, i32, i32)* null, void (i8*, i32, i32)** [[V6]], align 8
-; CHECK-NEXT:    call void @goFunc(%struct.foostruct* [[BANG]])
-; CHECK-NEXT:    ret void
+; DEFAULT-LIMIT-LABEL: @test4(
+; DEFAULT-LIMIT-NEXT:  entry:
+; DEFAULT-LIMIT-NEXT:    [[BANG:%.*]] = alloca [[STRUCT_FOOSTRUCT:%.*]], align 8
+; DEFAULT-LIMIT-NEXT:    [[V1:%.*]] = bitcast %struct.foostruct* [[BANG]] to i8*
+; DEFAULT-LIMIT-NEXT:    [[TMP0:%.*]] = getelementptr inbounds i8, i8* [[V1]], i64 32
+; DEFAULT-LIMIT-NEXT:    call void @llvm.memset.p0i8.i64(i8* align 8 [[TMP0]], i8 0, i64 8, i1 false)
+; DEFAULT-LIMIT-NEXT:    [[V2:%.*]] = getelementptr inbounds [[STRUCT_FOOSTRUCT]], %struct.foostruct* [[BANG]], i64 0, i32 0
+; DEFAULT-LIMIT-NEXT:    store i32 (i8*, i8**, i32, i8, i8*)* @fa, i32 (i8*, i8**, i32, i8, i8*)** [[V2]], align 8
+; DEFAULT-LIMIT-NEXT:    [[V3:%.*]] = getelementptr inbounds [[STRUCT_FOOSTRUCT]], %struct.foostruct* [[BANG]], i64 0, i32 1
+; DEFAULT-LIMIT-NEXT:    store i32 (i8*, i8**, i32, i8, i8*)* @fa, i32 (i8*, i8**, i32, i8, i8*)** [[V3]], align 8
+; DEFAULT-LIMIT-NEXT:    [[V4:%.*]] = getelementptr inbounds [[STRUCT_FOOSTRUCT]], %struct.foostruct* [[BANG]], i64 0, i32 2
+; DEFAULT-LIMIT-NEXT:    store i32 (i8*, i8**, i32, i8, i8*)* @fa, i32 (i8*, i8**, i32, i8, i8*)** [[V4]], align 8
+; DEFAULT-LIMIT-NEXT:    [[V5:%.*]] = getelementptr inbounds [[STRUCT_FOOSTRUCT]], %struct.foostruct* [[BANG]], i64 0, i32 3
+; DEFAULT-LIMIT-NEXT:    store i32 (i8*, i8**, i32, i8, i8*)* @fa, i32 (i8*, i8**, i32, i8, i8*)** [[V5]], align 8
+; DEFAULT-LIMIT-NEXT:    [[V6:%.*]] = getelementptr inbounds [[STRUCT_FOOSTRUCT]], %struct.foostruct* [[BANG]], i64 0, i32 4
+; DEFAULT-LIMIT-NEXT:    store void (i8*, i32, i32)* null, void (i8*, i32, i32)** [[V6]], align 8
+; DEFAULT-LIMIT-NEXT:    call void @goFunc(%struct.foostruct* [[BANG]])
+; DEFAULT-LIMIT-NEXT:    ret void
+;
+; LARGER-LIMIT-LABEL: @test4(
+; LARGER-LIMIT-NEXT:  entry:
+; LARGER-LIMIT-NEXT:    [[BANG:%.*]] = alloca [[STRUCT_FOOSTRUCT:%.*]], align 8
+; LARGER-LIMIT-NEXT:    [[V2:%.*]] = getelementptr inbounds [[STRUCT_FOOSTRUCT]], %struct.foostruct* [[BANG]], i64 0, i32 0
+; LARGER-LIMIT-NEXT:    store i32 (i8*, i8**, i32, i8, i8*)* @fa, i32 (i8*, i8**, i32, i8, i8*)** [[V2]], align 8
+; LARGER-LIMIT-NEXT:    [[V3:%.*]] = getelementptr inbounds [[STRUCT_FOOSTRUCT]], %struct.foostruct* [[BANG]], i64 0, i32 1
+; LARGER-LIMIT-NEXT:    store i32 (i8*, i8**, i32, i8, i8*)* @fa, i32 (i8*, i8**, i32, i8, i8*)** [[V3]], align 8
+; LARGER-LIMIT-NEXT:    [[V4:%.*]] = getelementptr inbounds [[STRUCT_FOOSTRUCT]], %struct.foostruct* [[BANG]], i64 0, i32 2
+; LARGER-LIMIT-NEXT:    store i32 (i8*, i8**, i32, i8, i8*)* @fa, i32 (i8*, i8**, i32, i8, i8*)** [[V4]], align 8
+; LARGER-LIMIT-NEXT:    [[V5:%.*]] = getelementptr inbounds [[STRUCT_FOOSTRUCT]], %struct.foostruct* [[BANG]], i64 0, i32 3
+; LARGER-LIMIT-NEXT:    store i32 (i8*, i8**, i32, i8, i8*)* @fa, i32 (i8*, i8**, i32, i8, i8*)** [[V5]], align 8
+; LARGER-LIMIT-NEXT:    [[V6:%.*]] = getelementptr inbounds [[STRUCT_FOOSTRUCT]], %struct.foostruct* [[BANG]], i64 0, i32 4
+; LARGER-LIMIT-NEXT:    store void (i8*, i32, i32)* null, void (i8*, i32, i32)** [[V6]], align 8
+; LARGER-LIMIT-NEXT:    call void @goFunc(%struct.foostruct* [[BANG]])
+; LARGER-LIMIT-NEXT:    ret void
 ;
 entry:
 

diff  --git a/llvm/test/Transforms/DeadStoreElimination/MSSA/debug-counter.ll b/llvm/test/Transforms/DeadStoreElimination/MSSA/debug-counter.ll
index 4171b714d1be..9def78290089 100644
--- a/llvm/test/Transforms/DeadStoreElimination/MSSA/debug-counter.ll
+++ b/llvm/test/Transforms/DeadStoreElimination/MSSA/debug-counter.ll
@@ -1,7 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 
-; XFAIL: *
-
 ; REQUIRES: asserts
 
 ; Eliminates store to %R in the entry block.

diff  --git a/llvm/test/Transforms/DeadStoreElimination/MSSA/memoryssa-scan-limit.ll b/llvm/test/Transforms/DeadStoreElimination/MSSA/memoryssa-scan-limit.ll
index ae3066192a00..0e722c56f5f9 100644
--- a/llvm/test/Transforms/DeadStoreElimination/MSSA/memoryssa-scan-limit.ll
+++ b/llvm/test/Transforms/DeadStoreElimination/MSSA/memoryssa-scan-limit.ll
@@ -1,7 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 
-; XFAIL: *
-
 ; RUN: opt < %s -basic-aa -dse -enable-dse-memoryssa -S | FileCheck --check-prefix=NO-LIMIT %s
 ; RUN: opt < %s -basic-aa -dse -enable-dse-memoryssa -dse-memoryssa-scanlimit=0 -S | FileCheck --check-prefix=LIMIT-0 %s
 ; RUN: opt < %s -basic-aa -dse -enable-dse-memoryssa -dse-memoryssa-scanlimit=2 -S | FileCheck --check-prefix=LIMIT-2 %s

diff  --git a/llvm/test/Transforms/DeadStoreElimination/MSSA/multiblock-overlap.ll b/llvm/test/Transforms/DeadStoreElimination/MSSA/multiblock-overlap.ll
index 579ec8e268fc..e6e206ef5abc 100644
--- a/llvm/test/Transforms/DeadStoreElimination/MSSA/multiblock-overlap.ll
+++ b/llvm/test/Transforms/DeadStoreElimination/MSSA/multiblock-overlap.ll
@@ -1,5 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt -dse -enable-dse-memoryssa %s -S | FileCheck %s
+; RUN: opt -dse -enable-dse-memoryssa %s -S | FileCheck --check-prefixes=CHECK,DEFAULT-LIMIT %s
+; RUN: opt -dse -enable-dse-memoryssa -dse-memoryssa-partial-store-limit=10 %s -S | FileCheck --check-prefixes=CHECK,LARGER-LIMIT %s
 
 
 %struct.ham = type { [3 x double], [3 x double]}
@@ -7,28 +8,55 @@
 declare void @may_throw()
 declare void @llvm.memset.p0i8.i64(i8* nocapture writeonly, i8, i64, i1 immarg)
 
+; We miss this case, because of an aggressive limit of partial overlap analysis.
+; With a larger partial store limit, we remove the memset.
 define void @overlap1(%struct.ham* %arg, i1 %cond) {
-; CHECK-LABEL: @overlap1(
-; CHECK-NEXT:  bb:
-; CHECK-NEXT:    [[TMP:%.*]] = getelementptr inbounds [[STRUCT_HAM:%.*]], %struct.ham* [[ARG:%.*]], i64 0, i32 0, i64 2
-; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [[STRUCT_HAM]], %struct.ham* [[ARG]], i64 0, i32 0, i64 1
-; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [[STRUCT_HAM]], %struct.ham* [[ARG]], i64 0, i32 0, i64 0
-; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [[STRUCT_HAM]], %struct.ham* [[ARG]], i64 0, i32 1, i64 2
-; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [[STRUCT_HAM]], %struct.ham* [[ARG]], i64 0, i32 1, i64 1
-; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [[STRUCT_HAM]], %struct.ham* [[ARG]], i64 0, i32 1, i32 0
-; CHECK-NEXT:    br i1 [[COND:%.*]], label [[BB7:%.*]], label [[BB8:%.*]]
-; CHECK:       bb7:
-; CHECK-NEXT:    br label [[BB9:%.*]]
-; CHECK:       bb8:
-; CHECK-NEXT:    br label [[BB9]]
-; CHECK:       bb9:
-; CHECK-NEXT:    store double 1.000000e+00, double* [[TMP2]], align 8
-; CHECK-NEXT:    store double 2.000000e+00, double* [[TMP1]], align 8
-; CHECK-NEXT:    store double 3.000000e+00, double* [[TMP]], align 8
-; CHECK-NEXT:    store double 4.000000e+00, double* [[TMP5]], align 8
-; CHECK-NEXT:    store double 5.000000e+00, double* [[TMP4]], align 8
-; CHECK-NEXT:    store double 6.000000e+00, double* [[TMP3]], align 8
-; CHECK-NEXT:    ret void
+; DEFAULT-LIMIT-LABEL: @overlap1(
+; DEFAULT-LIMIT-NEXT:  bb:
+; DEFAULT-LIMIT-NEXT:    [[TMP:%.*]] = getelementptr inbounds [[STRUCT_HAM:%.*]], %struct.ham* [[ARG:%.*]], i64 0, i32 0, i64 2
+; DEFAULT-LIMIT-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [[STRUCT_HAM]], %struct.ham* [[ARG]], i64 0, i32 0, i64 1
+; DEFAULT-LIMIT-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [[STRUCT_HAM]], %struct.ham* [[ARG]], i64 0, i32 0, i64 0
+; DEFAULT-LIMIT-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [[STRUCT_HAM]], %struct.ham* [[ARG]], i64 0, i32 1, i64 2
+; DEFAULT-LIMIT-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [[STRUCT_HAM]], %struct.ham* [[ARG]], i64 0, i32 1, i64 1
+; DEFAULT-LIMIT-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [[STRUCT_HAM]], %struct.ham* [[ARG]], i64 0, i32 1, i32 0
+; DEFAULT-LIMIT-NEXT:    [[TMP6:%.*]] = bitcast double* [[TMP2]] to i8*
+; DEFAULT-LIMIT-NEXT:    [[TMP0:%.*]] = getelementptr inbounds i8, i8* [[TMP6]], i64 32
+; DEFAULT-LIMIT-NEXT:    call void @llvm.memset.p0i8.i64(i8* nonnull align 8 dereferenceable(48) [[TMP0]], i8 0, i64 16, i1 false)
+; DEFAULT-LIMIT-NEXT:    br i1 [[COND:%.*]], label [[BB7:%.*]], label [[BB8:%.*]]
+; DEFAULT-LIMIT:       bb7:
+; DEFAULT-LIMIT-NEXT:    br label [[BB9:%.*]]
+; DEFAULT-LIMIT:       bb8:
+; DEFAULT-LIMIT-NEXT:    br label [[BB9]]
+; DEFAULT-LIMIT:       bb9:
+; DEFAULT-LIMIT-NEXT:    store double 1.000000e+00, double* [[TMP2]], align 8
+; DEFAULT-LIMIT-NEXT:    store double 2.000000e+00, double* [[TMP1]], align 8
+; DEFAULT-LIMIT-NEXT:    store double 3.000000e+00, double* [[TMP]], align 8
+; DEFAULT-LIMIT-NEXT:    store double 4.000000e+00, double* [[TMP5]], align 8
+; DEFAULT-LIMIT-NEXT:    store double 5.000000e+00, double* [[TMP4]], align 8
+; DEFAULT-LIMIT-NEXT:    store double 6.000000e+00, double* [[TMP3]], align 8
+; DEFAULT-LIMIT-NEXT:    ret void
+;
+; LARGER-LIMIT-LABEL: @overlap1(
+; LARGER-LIMIT-NEXT:  bb:
+; LARGER-LIMIT-NEXT:    [[TMP:%.*]] = getelementptr inbounds [[STRUCT_HAM:%.*]], %struct.ham* [[ARG:%.*]], i64 0, i32 0, i64 2
+; LARGER-LIMIT-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [[STRUCT_HAM]], %struct.ham* [[ARG]], i64 0, i32 0, i64 1
+; LARGER-LIMIT-NEXT:    [[TMP2:%.*]] = getelementptr inbounds [[STRUCT_HAM]], %struct.ham* [[ARG]], i64 0, i32 0, i64 0
+; LARGER-LIMIT-NEXT:    [[TMP3:%.*]] = getelementptr inbounds [[STRUCT_HAM]], %struct.ham* [[ARG]], i64 0, i32 1, i64 2
+; LARGER-LIMIT-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [[STRUCT_HAM]], %struct.ham* [[ARG]], i64 0, i32 1, i64 1
+; LARGER-LIMIT-NEXT:    [[TMP5:%.*]] = getelementptr inbounds [[STRUCT_HAM]], %struct.ham* [[ARG]], i64 0, i32 1, i32 0
+; LARGER-LIMIT-NEXT:    br i1 [[COND:%.*]], label [[BB7:%.*]], label [[BB8:%.*]]
+; LARGER-LIMIT:       bb7:
+; LARGER-LIMIT-NEXT:    br label [[BB9:%.*]]
+; LARGER-LIMIT:       bb8:
+; LARGER-LIMIT-NEXT:    br label [[BB9]]
+; LARGER-LIMIT:       bb9:
+; LARGER-LIMIT-NEXT:    store double 1.000000e+00, double* [[TMP2]], align 8
+; LARGER-LIMIT-NEXT:    store double 2.000000e+00, double* [[TMP1]], align 8
+; LARGER-LIMIT-NEXT:    store double 3.000000e+00, double* [[TMP]], align 8
+; LARGER-LIMIT-NEXT:    store double 4.000000e+00, double* [[TMP5]], align 8
+; LARGER-LIMIT-NEXT:    store double 5.000000e+00, double* [[TMP4]], align 8
+; LARGER-LIMIT-NEXT:    store double 6.000000e+00, double* [[TMP3]], align 8
+; LARGER-LIMIT-NEXT:    ret void
 ;
 bb:
   %tmp = getelementptr inbounds %struct.ham, %struct.ham* %arg, i64 0, i32 0, i64 2

diff  --git a/llvm/test/Transforms/DeadStoreElimination/MSSA/simple.ll b/llvm/test/Transforms/DeadStoreElimination/MSSA/simple.ll
index 0cceb5ac4a73..5c04e11b4a78 100644
--- a/llvm/test/Transforms/DeadStoreElimination/MSSA/simple.ll
+++ b/llvm/test/Transforms/DeadStoreElimination/MSSA/simple.ll
@@ -477,10 +477,8 @@ bb2:
   ret i32 0
 }
 
-; TODO
-; We can remove redundant store, as noalias %p guarantees that the function does
-; only access it via %p. This also holds for the call to unknown_func even though
-; it could unwind
+; We cannot remove any stores, because @unknown_func may unwind and the caller
+; may read %p while unwinding.
 define void @test34(i32* noalias %p) {
 ; CHECK-LABEL: @test34(
 ; CHECK-NEXT:    store i32 1, i32* [[P:%.*]], align 4
@@ -636,9 +634,10 @@ entry:
   ret void
 }
 
-; I think this case is currently handled incorrectly by memdeps dse
-; throwing should leave store i32 1, not remove from the free.
 declare void @free(i8* nocapture)
+
+; We cannot remove `store i32 1, i32* %p`, because @unknown_func may unwind
+; and the caller may read %p while unwinding.
 define void @test41(i32* noalias %P) {
 ; CHECK-LABEL: @test41(
 ; CHECK-NEXT:    [[P2:%.*]] = bitcast i32* [[P:%.*]] to i8*

More information about the llvm-commits mailing list