[llvm] 222aeb4 - [DSE] Remove stores in the same loop iteration

Mon May 31 02:23:07 PDT 2021

Author: David Green
Date: 2021-05-31T10:22:37+01:00
New Revision: 222aeb4d51a46c5a81c9e4ccb16d1d19dd21ec95

URL: https://github.com/llvm/llvm-project/commit/222aeb4d51a46c5a81c9e4ccb16d1d19dd21ec95
DIFF: https://github.com/llvm/llvm-project/commit/222aeb4d51a46c5a81c9e4ccb16d1d19dd21ec95.diff

LOG: [DSE] Remove stores in the same loop iteration

DSE will currently only remove stores in the same block unless they can
be guaranteed to be loop invariant. This expands that to any stores that
are in the same Loop, at the same loop level. This should still account
for where AA/MSSA will not handle aliasing between loops, but allow the
dead stores to be removed where they overlap in the same loop iteration.
It requires adding loop info to DSE, but that looks fairly harmless.

The test case this helps is from code like this, which can come up in
certain matrix operations:
  for(i=..)
    dst[i] = 0;
    for(j=..)
      dst[i] += src[i*n+j];

After LICM, this becomes:
for(i=..)
  dst[i] = 0;
  sum = 0;
  for(j=..)
    sum += src[i*n+j];
  dst[i] = sum;

The first store is dead, and with this patch is now removed.

Differntial Revision: https://reviews.llvm.org/D100464

Added: 
    

Modified: 
    llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp
    llvm/test/CodeGen/AMDGPU/opt-pipeline.ll
    llvm/test/Other/opt-O2-pipeline.ll
    llvm/test/Other/opt-O3-pipeline-enable-matrix.ll
    llvm/test/Other/opt-O3-pipeline.ll
    llvm/test/Other/opt-Os-pipeline.ll
    llvm/test/Transforms/DeadStoreElimination/multiblock-loops.ll

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp b/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp
index 94f3e0d3464fd..e6b9c2a0e5807 100644

--- a/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp
+++ b/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp
@@ -40,10 +40,12 @@
 #include "llvm/Analysis/AliasAnalysis.h"
 #include "llvm/Analysis/CaptureTracking.h"
 #include "llvm/Analysis/GlobalsModRef.h"
+#include "llvm/Analysis/LoopInfo.h"
 #include "llvm/Analysis/MemoryBuiltins.h"
 #include "llvm/Analysis/MemoryLocation.h"
 #include "llvm/Analysis/MemorySSA.h"
 #include "llvm/Analysis/MemorySSAUpdater.h"
+#include "llvm/Analysis/MustExecute.h"
 #include "llvm/Analysis/PostDominators.h"
 #include "llvm/Analysis/TargetLibraryInfo.h"
 #include "llvm/Analysis/ValueTracking.h"
@@ -853,6 +855,11 @@ struct DSEState {
   PostDominatorTree &PDT;
   const TargetLibraryInfo &TLI;
   const DataLayout &DL;
+  const LoopInfo &LI;
+
+  // Whether the function contains any irreducible control flow, useful for
+  // being accurately able to detect loops.
+  bool ContainsIrreducibleLoops;
 
   // All MemoryDefs that potentially could kill other MemDefs.
   SmallVector<MemoryDef *, 64> MemDefs;
@@ -876,14 +883,15 @@ struct DSEState {
   DenseMap<BasicBlock *, InstOverlapIntervalsTy> IOLs;
 
   DSEState(Function &F, AliasAnalysis &AA, MemorySSA &MSSA, DominatorTree &DT,
-           PostDominatorTree &PDT, const TargetLibraryInfo &TLI)
+           PostDominatorTree &PDT, const TargetLibraryInfo &TLI,
+           const LoopInfo &LI)
       : F(F), AA(AA), BatchAA(AA), MSSA(MSSA), DT(DT), PDT(PDT), TLI(TLI),
-        DL(F.getParent()->getDataLayout()) {}
+        DL(F.getParent()->getDataLayout()), LI(LI) {}
 
   static DSEState get(Function &F, AliasAnalysis &AA, MemorySSA &MSSA,
                       DominatorTree &DT, PostDominatorTree &PDT,
-                      const TargetLibraryInfo &TLI) {
-    DSEState State(F, AA, MSSA, DT, PDT, TLI);
+                      const TargetLibraryInfo &TLI, const LoopInfo &LI) {
+    DSEState State(F, AA, MSSA, DT, PDT, TLI, LI);
     // Collect blocks with throwing instructions not modeled in MemorySSA and
     // alloc-like objects.
     unsigned PO = 0;
@@ -911,6 +919,9 @@ struct DSEState {
         State.InvisibleToCallerAfterRet.insert({&AI, true});
       }
 
+    // Collect whether there is any irreducible control flow in the function.
+    State.ContainsIrreducibleLoops = mayContainIrreducibleControl(F, &LI);
+
     return State;
   }
 
@@ -925,6 +936,12 @@ struct DSEState {
   isOverwrite(const Instruction *LaterI, const Instruction *EarlierI,
               const MemoryLocation &Later, const MemoryLocation &Earlier,
               int64_t &EarlierOff, int64_t &LaterOff) {
+    // AliasAnalysis does not always account for loops. Limit overwrite checks
+    // to dependencies for which we can guarantee they are independant of any
+    // loops they are in.
+    if (!isGuaranteedLoopIndependent(EarlierI, LaterI, Earlier))
+      return OW_Unknown;
+
     // FIXME: Vet that this works for size upper-bounds. Seems unlikely that we'll
     // get imprecise values here, though (except for unknown sizes).
     if (!Later.Size.isPrecise() || !Earlier.Size.isPrecise()) {
@@ -1250,11 +1267,27 @@ struct DSEState {
     return isRefSet(BatchAA.getModRefInfo(UseInst, DefLoc));
   }
 
-  /// Returns true if \p Ptr is guaranteed to be loop invariant for any possible
-  /// loop. In particular, this guarantees that it only references a single
-  /// MemoryLocation during execution of the containing function.
-  bool IsGuaranteedLoopInvariant(Value *Ptr) {
-    auto IsGuaranteedLoopInvariantBase = [this](Value *Ptr) {
+  /// Returns true if a dependency between \p Current and \p KillingDef is
+  /// guaranteed to be loop invariant for the loops that they are in. Either
+  /// because they are known to be in the same block, in the same loop level or
+  /// by guaranteeing that \p CurrentLoc only references a single MemoryLocation
+  /// during execution of the containing function.
+  bool isGuaranteedLoopIndependent(const Instruction *Current,
+                                   const Instruction *KillingDef,
+                                   const MemoryLocation &CurrentLoc) {
+    // If the dependency is within the same block or loop level (being careful of
+    // irreducible loops), we know that AA will return a valid result for the
+    // memory dependency. (Both at the function level, outside of any loop,
+    // would also be valid but we currently disable that to limit compile time).
+    if (Current->getParent() == KillingDef->getParent())
+      return true;
+    const Loop *CurrentLI = LI.getLoopFor(Current->getParent());
+    if (!ContainsIrreducibleLoops && CurrentLI &&
+        CurrentLI == LI.getLoopFor(KillingDef->getParent()))
+      return true;
+
+    // Otherwise check the memory location is invariant to any loops.
+    auto IsGuaranteedLoopInvariantBase = [this](const Value *Ptr) {
       Ptr = Ptr->stripPointerCasts();
       if (auto *I = dyn_cast<Instruction>(Ptr)) {
         if (isa<AllocaInst>(Ptr))
@@ -1268,7 +1301,7 @@ struct DSEState {
       return true;
     };
 
-    Ptr = Ptr->stripPointerCasts();
+    const Value *Ptr = CurrentLoc.Ptr->stripPointerCasts();
     if (auto *I = dyn_cast<Instruction>(Ptr)) {
       if (I->getParent()->isEntryBlock())
         return true;
@@ -1398,9 +1431,9 @@ struct DSEState {
 
       // AliasAnalysis does not account for loops. Limit elimination to
       // candidates for which we can guarantee they always store to the same
-      // memory location and not multiple locations in a loop.
-      if (Current->getBlock() != KillingDef->getBlock() &&
-          !IsGuaranteedLoopInvariant(const_cast<Value *>(CurrentLoc->Ptr))) {
+      // memory location and not located in 
diff erent loops.
+      if (!isGuaranteedLoopIndependent(CurrentI, KillingI, *CurrentLoc)) {
+        LLVM_DEBUG(dbgs() << "  ... not guaranteed loop independent\n");
         StepAgain = true;
         Current = CurrentDef->getDefiningAccess();
         WalkerStepLimit -= 1;
@@ -1836,12 +1869,13 @@ struct DSEState {
   }
 };
 
-bool eliminateDeadStores(Function &F, AliasAnalysis &AA, MemorySSA &MSSA,
-                         DominatorTree &DT, PostDominatorTree &PDT,
-                         const TargetLibraryInfo &TLI) {
+static bool eliminateDeadStores(Function &F, AliasAnalysis &AA, MemorySSA &MSSA,
+                                DominatorTree &DT, PostDominatorTree &PDT,
+                                const TargetLibraryInfo &TLI,
+                                const LoopInfo &LI) {
   bool MadeChange = false;
 
-  DSEState State = DSEState::get(F, AA, MSSA, DT, PDT, TLI);
+  DSEState State = DSEState::get(F, AA, MSSA, DT, PDT, TLI, LI);
   // For each store:
   for (unsigned I = 0; I < State.MemDefs.size(); I++) {
     MemoryDef *KillingDef = State.MemDefs[I];
@@ -2014,8 +2048,9 @@ PreservedAnalyses DSEPass::run(Function &F, FunctionAnalysisManager &AM) {
   DominatorTree &DT = AM.getResult<DominatorTreeAnalysis>(F);
   MemorySSA &MSSA = AM.getResult<MemorySSAAnalysis>(F).getMSSA();
   PostDominatorTree &PDT = AM.getResult<PostDominatorTreeAnalysis>(F);
+  LoopInfo &LI = AM.getResult<LoopAnalysis>(F);
 
-  bool Changed = eliminateDeadStores(F, AA, MSSA, DT, PDT, TLI);
+  bool Changed = eliminateDeadStores(F, AA, MSSA, DT, PDT, TLI, LI);
 
 #ifdef LLVM_ENABLE_STATS
   if (AreStatisticsEnabled())
@@ -2029,6 +2064,7 @@ PreservedAnalyses DSEPass::run(Function &F, FunctionAnalysisManager &AM) {
   PreservedAnalyses PA;
   PA.preserveSet<CFGAnalyses>();
   PA.preserve<MemorySSAAnalysis>();
+  PA.preserve<LoopAnalysis>();
   return PA;
 }
 
@@ -2054,8 +2090,9 @@ class DSELegacyPass : public FunctionPass {
     MemorySSA &MSSA = getAnalysis<MemorySSAWrapperPass>().getMSSA();
     PostDominatorTree &PDT =
         getAnalysis<PostDominatorTreeWrapperPass>().getPostDomTree();
+    LoopInfo &LI = getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
 
-    bool Changed = eliminateDeadStores(F, AA, MSSA, DT, PDT, TLI);
+    bool Changed = eliminateDeadStores(F, AA, MSSA, DT, PDT, TLI, LI);
 
 #ifdef LLVM_ENABLE_STATS
     if (AreStatisticsEnabled())
@@ -2077,6 +2114,8 @@ class DSELegacyPass : public FunctionPass {
     AU.addRequired<MemorySSAWrapperPass>();
     AU.addPreserved<PostDominatorTreeWrapperPass>();
     AU.addPreserved<MemorySSAWrapperPass>();
+    AU.addRequired<LoopInfoWrapperPass>();
+    AU.addPreserved<LoopInfoWrapperPass>();
   }
 };
 
@@ -2093,6 +2132,7 @@ INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass)
 INITIALIZE_PASS_DEPENDENCY(MemorySSAWrapperPass)
 INITIALIZE_PASS_DEPENDENCY(MemoryDependenceWrapperPass)
 INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
 INITIALIZE_PASS_END(DSELegacyPass, "dse", "Dead Store Elimination", false,
                     false)
 

diff  --git a/llvm/test/CodeGen/AMDGPU/opt-pipeline.ll b/llvm/test/CodeGen/AMDGPU/opt-pipeline.ll
index bce7bfc70d365..c18898164e25c 100644
--- a/llvm/test/CodeGen/AMDGPU/opt-pipeline.ll
+++ b/llvm/test/CodeGen/AMDGPU/opt-pipeline.ll
@@ -515,8 +515,8 @@
 ; GCN-O2-NEXT:       Function Alias Analysis Results
 ; GCN-O2-NEXT:       Memory SSA
 ; GCN-O2-NEXT:       MemCpy Optimization
-; GCN-O2-NEXT:       Dead Store Elimination
 ; GCN-O2-NEXT:       Natural Loop Information
+; GCN-O2-NEXT:       Dead Store Elimination
 ; GCN-O2-NEXT:       Canonicalize natural loops
 ; GCN-O2-NEXT:       LCSSA Verifier
 ; GCN-O2-NEXT:       Loop-Closed SSA Form Pass
@@ -874,8 +874,8 @@
 ; GCN-O3-NEXT:       Function Alias Analysis Results
 ; GCN-O3-NEXT:       Memory SSA
 ; GCN-O3-NEXT:       MemCpy Optimization
-; GCN-O3-NEXT:       Dead Store Elimination
 ; GCN-O3-NEXT:       Natural Loop Information
+; GCN-O3-NEXT:       Dead Store Elimination
 ; GCN-O3-NEXT:       Canonicalize natural loops
 ; GCN-O3-NEXT:       LCSSA Verifier
 ; GCN-O3-NEXT:       Loop-Closed SSA Form Pass

diff  --git a/llvm/test/Other/opt-O2-pipeline.ll b/llvm/test/Other/opt-O2-pipeline.ll
index 36370d6cd6d39..542a79820e4f2 100644
--- a/llvm/test/Other/opt-O2-pipeline.ll
+++ b/llvm/test/Other/opt-O2-pipeline.ll
@@ -162,8 +162,8 @@
 ; CHECK-NEXT:         Function Alias Analysis Results
 ; CHECK-NEXT:         Memory SSA
 ; CHECK-NEXT:         MemCpy Optimization
-; CHECK-NEXT:         Dead Store Elimination
 ; CHECK-NEXT:         Natural Loop Information
+; CHECK-NEXT:         Dead Store Elimination
 ; CHECK-NEXT:         Canonicalize natural loops
 ; CHECK-NEXT:         LCSSA Verifier
 ; CHECK-NEXT:         Loop-Closed SSA Form Pass

diff  --git a/llvm/test/Other/opt-O3-pipeline-enable-matrix.ll b/llvm/test/Other/opt-O3-pipeline-enable-matrix.ll
index c39361dcc8a2d..4c5e126f229fa 100644
--- a/llvm/test/Other/opt-O3-pipeline-enable-matrix.ll
+++ b/llvm/test/Other/opt-O3-pipeline-enable-matrix.ll
@@ -167,8 +167,8 @@
 ; CHECK-NEXT:         Function Alias Analysis Results
 ; CHECK-NEXT:         Memory SSA
 ; CHECK-NEXT:         MemCpy Optimization
-; CHECK-NEXT:         Dead Store Elimination
 ; CHECK-NEXT:         Natural Loop Information
+; CHECK-NEXT:         Dead Store Elimination
 ; CHECK-NEXT:         Canonicalize natural loops
 ; CHECK-NEXT:         LCSSA Verifier
 ; CHECK-NEXT:         Loop-Closed SSA Form Pass

diff  --git a/llvm/test/Other/opt-O3-pipeline.ll b/llvm/test/Other/opt-O3-pipeline.ll
index af77e09c24968..97b11fec08b9c 100644
--- a/llvm/test/Other/opt-O3-pipeline.ll
+++ b/llvm/test/Other/opt-O3-pipeline.ll
@@ -167,8 +167,8 @@
 ; CHECK-NEXT:         Function Alias Analysis Results
 ; CHECK-NEXT:         Memory SSA
 ; CHECK-NEXT:         MemCpy Optimization
-; CHECK-NEXT:         Dead Store Elimination
 ; CHECK-NEXT:         Natural Loop Information
+; CHECK-NEXT:         Dead Store Elimination
 ; CHECK-NEXT:         Canonicalize natural loops
 ; CHECK-NEXT:         LCSSA Verifier
 ; CHECK-NEXT:         Loop-Closed SSA Form Pass

diff  --git a/llvm/test/Other/opt-Os-pipeline.ll b/llvm/test/Other/opt-Os-pipeline.ll
index e24d3d3f57bfc..b65e550ce9ebe 100644
--- a/llvm/test/Other/opt-Os-pipeline.ll
+++ b/llvm/test/Other/opt-Os-pipeline.ll
@@ -148,8 +148,8 @@
 ; CHECK-NEXT:         Function Alias Analysis Results
 ; CHECK-NEXT:         Memory SSA
 ; CHECK-NEXT:         MemCpy Optimization
-; CHECK-NEXT:         Dead Store Elimination
 ; CHECK-NEXT:         Natural Loop Information
+; CHECK-NEXT:         Dead Store Elimination
 ; CHECK-NEXT:         Canonicalize natural loops
 ; CHECK-NEXT:         LCSSA Verifier
 ; CHECK-NEXT:         Loop-Closed SSA Form Pass

diff  --git a/llvm/test/Transforms/DeadStoreElimination/multiblock-loops.ll b/llvm/test/Transforms/DeadStoreElimination/multiblock-loops.ll
index 6910209364d16..b1878084f5fde 100644
--- a/llvm/test/Transforms/DeadStoreElimination/multiblock-loops.ll
+++ b/llvm/test/Transforms/DeadStoreElimination/multiblock-loops.ll
@@ -111,7 +111,6 @@ define void @test_loop(i32 %N, i32* noalias nocapture readonly %A, i32* noalias
 ; CHECK:       for.body4.lr.ph:
 ; CHECK-NEXT:    [[I_028:%.*]] = phi i32 [ [[INC11:%.*]], [[FOR_COND_CLEANUP3:%.*]] ], [ 0, [[FOR_BODY4_LR_PH_PREHEADER]] ]
 ; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i32 [[I_028]]
-; CHECK-NEXT:    store i32 0, i32* [[ARRAYIDX]], align 4
 ; CHECK-NEXT:    [[MUL:%.*]] = mul nsw i32 [[I_028]], [[N]]
 ; CHECK-NEXT:    br label [[FOR_BODY4:%.*]]
 ; CHECK:       for.body4:
@@ -626,10 +625,13 @@ exit:
 define i16 @partial_override_overloop(i1 %c, i32 %i) {
 ; CHECK-LABEL: @partial_override_overloop(
 ; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br label [[FIRST:%.*]]
+; CHECK:       first:
 ; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i16], [10 x i16]* @x, i16 0, i32 [[I:%.*]]
+; CHECK-NEXT:    store i16 1, i16* [[ARRAYIDX]], align 1
 ; CHECK-NEXT:    br label [[DO_BODY:%.*]]
 ; CHECK:       do.body:
-; CHECK-NEXT:    [[I_0:%.*]] = phi i16 [ 0, [[ENTRY:%.*]] ], [ [[INC:%.*]], [[DO_BODY]] ]
+; CHECK-NEXT:    [[I_0:%.*]] = phi i16 [ 0, [[FIRST]] ], [ [[INC:%.*]], [[DO_BODY]] ]
 ; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds [10 x i16], [10 x i16]* @x, i16 0, i16 [[I_0]]
 ; CHECK-NEXT:    store i16 2, i16* [[ARRAYIDX2]], align 1
 ; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i16 [[I_0]], 4
@@ -643,12 +645,16 @@ define i16 @partial_override_overloop(i1 %c, i32 %i) {
 ; CHECK-NEXT:    ret i16 0
 ;
 entry:
+  ; Branch to first so MemoryLoc is not in the entry block.
+  br label %first
+
+first:
   %arrayidx = getelementptr inbounds [10 x i16], [10 x i16]* @x, i16 0, i32 %i
   store i16 1, i16* %arrayidx, align 1
   br label %do.body
 
 do.body:
-  %i.0 = phi i16 [ 0, %entry ], [ %inc, %do.body ]
+  %i.0 = phi i16 [ 0, %first ], [ %inc, %do.body ]
   %arrayidx2 = getelementptr inbounds [10 x i16], [10 x i16]* @x, i16 0, i16 %i.0
   store i16 2, i16* %arrayidx2, align 1
   %exitcond = icmp eq i16 %i.0, 4