[llvm] a24b021 - [DSE] Remove stores in the same loop iteration

David Green via llvm-commits llvm-commits at lists.llvm.org
Sun Jun 20 09:05:09 PDT 2021


Author: David Green
Date: 2021-06-20T17:03:30+01:00
New Revision: a24b02193a305628066486a5eeee331054e06b5b

URL: https://github.com/llvm/llvm-project/commit/a24b02193a305628066486a5eeee331054e06b5b
DIFF: https://github.com/llvm/llvm-project/commit/a24b02193a305628066486a5eeee331054e06b5b.diff

LOG: [DSE] Remove stores in the same loop iteration

DSE will currently only remove stores in the same block unless they can
be guaranteed to be loop invariant. This expands that to any stores that
are in the same Loop, at the same loop level. This should still account
for where AA/MSSA will not handle aliasing between loops, but allow the
dead stores to be removed where they overlap in the same loop iteration.
It requires adding loop info to DSE, but that looks fairly harmless.

The test case this helps is from code like this, which can come up in
certain matrix operations:
  for(i=..)
    dst[i] = 0;
    for(j=..)
      dst[i] += src[i*n+j];

After LICM, this becomes:
for(i=..)
  dst[i] = 0;
  sum = 0;
  for(j=..)
    sum += src[i*n+j];
  dst[i] = sum;

The first store is dead, and with this patch is now removed.

Differntial Revision: https://reviews.llvm.org/D100464

Added: 
    

Modified: 
    llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp
    llvm/test/CodeGen/AMDGPU/opt-pipeline.ll
    llvm/test/Other/opt-O2-pipeline.ll
    llvm/test/Other/opt-O3-pipeline-enable-matrix.ll
    llvm/test/Other/opt-O3-pipeline.ll
    llvm/test/Other/opt-Os-pipeline.ll
    llvm/test/Transforms/DeadStoreElimination/multiblock-loops.ll

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp b/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp
index 94f3e0d3464fd..4e840f6c03a19 100644
--- a/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp
+++ b/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp
@@ -40,10 +40,12 @@
 #include "llvm/Analysis/AliasAnalysis.h"
 #include "llvm/Analysis/CaptureTracking.h"
 #include "llvm/Analysis/GlobalsModRef.h"
+#include "llvm/Analysis/LoopInfo.h"
 #include "llvm/Analysis/MemoryBuiltins.h"
 #include "llvm/Analysis/MemoryLocation.h"
 #include "llvm/Analysis/MemorySSA.h"
 #include "llvm/Analysis/MemorySSAUpdater.h"
+#include "llvm/Analysis/MustExecute.h"
 #include "llvm/Analysis/PostDominators.h"
 #include "llvm/Analysis/TargetLibraryInfo.h"
 #include "llvm/Analysis/ValueTracking.h"
@@ -853,6 +855,11 @@ struct DSEState {
   PostDominatorTree &PDT;
   const TargetLibraryInfo &TLI;
   const DataLayout &DL;
+  const LoopInfo &LI;
+
+  // Whether the function contains any irreducible control flow, useful for
+  // being accurately able to detect loops.
+  bool ContainsIrreducibleLoops;
 
   // All MemoryDefs that potentially could kill other MemDefs.
   SmallVector<MemoryDef *, 64> MemDefs;
@@ -876,14 +883,15 @@ struct DSEState {
   DenseMap<BasicBlock *, InstOverlapIntervalsTy> IOLs;
 
   DSEState(Function &F, AliasAnalysis &AA, MemorySSA &MSSA, DominatorTree &DT,
-           PostDominatorTree &PDT, const TargetLibraryInfo &TLI)
+           PostDominatorTree &PDT, const TargetLibraryInfo &TLI,
+           const LoopInfo &LI)
       : F(F), AA(AA), BatchAA(AA), MSSA(MSSA), DT(DT), PDT(PDT), TLI(TLI),
-        DL(F.getParent()->getDataLayout()) {}
+        DL(F.getParent()->getDataLayout()), LI(LI) {}
 
   static DSEState get(Function &F, AliasAnalysis &AA, MemorySSA &MSSA,
                       DominatorTree &DT, PostDominatorTree &PDT,
-                      const TargetLibraryInfo &TLI) {
-    DSEState State(F, AA, MSSA, DT, PDT, TLI);
+                      const TargetLibraryInfo &TLI, const LoopInfo &LI) {
+    DSEState State(F, AA, MSSA, DT, PDT, TLI, LI);
     // Collect blocks with throwing instructions not modeled in MemorySSA and
     // alloc-like objects.
     unsigned PO = 0;
@@ -911,6 +919,9 @@ struct DSEState {
         State.InvisibleToCallerAfterRet.insert({&AI, true});
       }
 
+    // Collect whether there is any irreducible control flow in the function.
+    State.ContainsIrreducibleLoops = mayContainIrreducibleControl(F, &LI);
+
     return State;
   }
 
@@ -925,6 +936,12 @@ struct DSEState {
   isOverwrite(const Instruction *LaterI, const Instruction *EarlierI,
               const MemoryLocation &Later, const MemoryLocation &Earlier,
               int64_t &EarlierOff, int64_t &LaterOff) {
+    // AliasAnalysis does not always account for loops. Limit overwrite checks
+    // to dependencies for which we can guarantee they are independant of any
+    // loops they are in.
+    if (!isGuaranteedLoopIndependent(EarlierI, LaterI, Earlier))
+      return OW_Unknown;
+
     // FIXME: Vet that this works for size upper-bounds. Seems unlikely that we'll
     // get imprecise values here, though (except for unknown sizes).
     if (!Later.Size.isPrecise() || !Earlier.Size.isPrecise()) {
@@ -1250,11 +1267,33 @@ struct DSEState {
     return isRefSet(BatchAA.getModRefInfo(UseInst, DefLoc));
   }
 
+  /// Returns true if a dependency between \p Current and \p KillingDef is
+  /// guaranteed to be loop invariant for the loops that they are in. Either
+  /// because they are known to be in the same block, in the same loop level or
+  /// by guaranteeing that \p CurrentLoc only references a single MemoryLocation
+  /// during execution of the containing function.
+  bool isGuaranteedLoopIndependent(const Instruction *Current,
+                                   const Instruction *KillingDef,
+                                   const MemoryLocation &CurrentLoc) {
+    // If the dependency is within the same block or loop level (being careful
+    // of irreducible loops), we know that AA will return a valid result for the
+    // memory dependency. (Both at the function level, outside of any loop,
+    // would also be valid but we currently disable that to limit compile time).
+    if (Current->getParent() == KillingDef->getParent())
+      return true;
+    const Loop *CurrentLI = LI.getLoopFor(Current->getParent());
+    if (!ContainsIrreducibleLoops && CurrentLI &&
+        CurrentLI == LI.getLoopFor(KillingDef->getParent()))
+      return true;
+    // Otherwise check the memory location is invariant to any loops.
+    return isGuaranteedLoopInvariant(CurrentLoc.Ptr);
+  }
+
   /// Returns true if \p Ptr is guaranteed to be loop invariant for any possible
   /// loop. In particular, this guarantees that it only references a single
   /// MemoryLocation during execution of the containing function.
-  bool IsGuaranteedLoopInvariant(Value *Ptr) {
-    auto IsGuaranteedLoopInvariantBase = [this](Value *Ptr) {
+  bool isGuaranteedLoopInvariant(const Value *Ptr) {
+    auto IsGuaranteedLoopInvariantBase = [this](const Value *Ptr) {
       Ptr = Ptr->stripPointerCasts();
       if (auto *I = dyn_cast<Instruction>(Ptr)) {
         if (isa<AllocaInst>(Ptr))
@@ -1398,9 +1437,9 @@ struct DSEState {
 
       // AliasAnalysis does not account for loops. Limit elimination to
       // candidates for which we can guarantee they always store to the same
-      // memory location and not multiple locations in a loop.
-      if (Current->getBlock() != KillingDef->getBlock() &&
-          !IsGuaranteedLoopInvariant(const_cast<Value *>(CurrentLoc->Ptr))) {
+      // memory location and not located in 
diff erent loops.
+      if (!isGuaranteedLoopIndependent(CurrentI, KillingI, *CurrentLoc)) {
+        LLVM_DEBUG(dbgs() << "  ... not guaranteed loop independent\n");
         StepAgain = true;
         Current = CurrentDef->getDefiningAccess();
         WalkerStepLimit -= 1;
@@ -1529,8 +1568,16 @@ struct DSEState {
         return None;
       }
 
-      // For the KillingDef and EarlierAccess we only have to check if it reads
-      // the memory location.
+      // If this worklist walks back to the original memory access (and the
+      // pointer is not guarenteed loop invariant) then we cannot assume that a
+      // store kills itself.
+      if (EarlierAccess == UseAccess &&
+          !isGuaranteedLoopInvariant(CurrentLoc->Ptr)) {
+        LLVM_DEBUG(dbgs() << "    ... found not loop invariant self access\n");
+        return None;
+      }
+      // Otherwise, for the KillingDef and EarlierAccess we only have to check
+      // if it reads the memory location.
       // TODO: It would probably be better to check for self-reads before
       // calling the function.
       if (KillingDef == UseAccess || EarlierAccess == UseAccess) {
@@ -1550,16 +1597,18 @@ struct DSEState {
       //                  stores [0,1]
       if (MemoryDef *UseDef = dyn_cast<MemoryDef>(UseAccess)) {
         if (isCompleteOverwrite(*CurrentLoc, EarlierMemInst, UseInst)) {
-          if (!isInvisibleToCallerAfterRet(DefUO) &&
-              UseAccess != EarlierAccess) {
-            BasicBlock *MaybeKillingBlock = UseInst->getParent();
-            if (PostOrderNumbers.find(MaybeKillingBlock)->second <
-                PostOrderNumbers.find(EarlierAccess->getBlock())->second) {
-
+          BasicBlock *MaybeKillingBlock = UseInst->getParent();
+          if (PostOrderNumbers.find(MaybeKillingBlock)->second <
+              PostOrderNumbers.find(EarlierAccess->getBlock())->second) {
+            if (!isInvisibleToCallerAfterRet(DefUO)) {
               LLVM_DEBUG(dbgs()
                          << "    ... found killing def " << *UseInst << "\n");
               KillingDefs.insert(UseInst);
             }
+          } else {
+            LLVM_DEBUG(dbgs()
+                       << "    ... found preceeding def " << *UseInst << "\n");
+            return None;
           }
         } else
           PushMemUses(UseDef);
@@ -1836,12 +1885,13 @@ struct DSEState {
   }
 };
 
-bool eliminateDeadStores(Function &F, AliasAnalysis &AA, MemorySSA &MSSA,
-                         DominatorTree &DT, PostDominatorTree &PDT,
-                         const TargetLibraryInfo &TLI) {
+static bool eliminateDeadStores(Function &F, AliasAnalysis &AA, MemorySSA &MSSA,
+                                DominatorTree &DT, PostDominatorTree &PDT,
+                                const TargetLibraryInfo &TLI,
+                                const LoopInfo &LI) {
   bool MadeChange = false;
 
-  DSEState State = DSEState::get(F, AA, MSSA, DT, PDT, TLI);
+  DSEState State = DSEState::get(F, AA, MSSA, DT, PDT, TLI, LI);
   // For each store:
   for (unsigned I = 0; I < State.MemDefs.size(); I++) {
     MemoryDef *KillingDef = State.MemDefs[I];
@@ -2014,8 +2064,9 @@ PreservedAnalyses DSEPass::run(Function &F, FunctionAnalysisManager &AM) {
   DominatorTree &DT = AM.getResult<DominatorTreeAnalysis>(F);
   MemorySSA &MSSA = AM.getResult<MemorySSAAnalysis>(F).getMSSA();
   PostDominatorTree &PDT = AM.getResult<PostDominatorTreeAnalysis>(F);
+  LoopInfo &LI = AM.getResult<LoopAnalysis>(F);
 
-  bool Changed = eliminateDeadStores(F, AA, MSSA, DT, PDT, TLI);
+  bool Changed = eliminateDeadStores(F, AA, MSSA, DT, PDT, TLI, LI);
 
 #ifdef LLVM_ENABLE_STATS
   if (AreStatisticsEnabled())
@@ -2029,6 +2080,7 @@ PreservedAnalyses DSEPass::run(Function &F, FunctionAnalysisManager &AM) {
   PreservedAnalyses PA;
   PA.preserveSet<CFGAnalyses>();
   PA.preserve<MemorySSAAnalysis>();
+  PA.preserve<LoopAnalysis>();
   return PA;
 }
 
@@ -2054,8 +2106,9 @@ class DSELegacyPass : public FunctionPass {
     MemorySSA &MSSA = getAnalysis<MemorySSAWrapperPass>().getMSSA();
     PostDominatorTree &PDT =
         getAnalysis<PostDominatorTreeWrapperPass>().getPostDomTree();
+    LoopInfo &LI = getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
 
-    bool Changed = eliminateDeadStores(F, AA, MSSA, DT, PDT, TLI);
+    bool Changed = eliminateDeadStores(F, AA, MSSA, DT, PDT, TLI, LI);
 
 #ifdef LLVM_ENABLE_STATS
     if (AreStatisticsEnabled())
@@ -2077,6 +2130,8 @@ class DSELegacyPass : public FunctionPass {
     AU.addRequired<MemorySSAWrapperPass>();
     AU.addPreserved<PostDominatorTreeWrapperPass>();
     AU.addPreserved<MemorySSAWrapperPass>();
+    AU.addRequired<LoopInfoWrapperPass>();
+    AU.addPreserved<LoopInfoWrapperPass>();
   }
 };
 
@@ -2093,6 +2148,7 @@ INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass)
 INITIALIZE_PASS_DEPENDENCY(MemorySSAWrapperPass)
 INITIALIZE_PASS_DEPENDENCY(MemoryDependenceWrapperPass)
 INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
 INITIALIZE_PASS_END(DSELegacyPass, "dse", "Dead Store Elimination", false,
                     false)
 

diff  --git a/llvm/test/CodeGen/AMDGPU/opt-pipeline.ll b/llvm/test/CodeGen/AMDGPU/opt-pipeline.ll
index 021012aff54d2..25051b885f9d3 100644
--- a/llvm/test/CodeGen/AMDGPU/opt-pipeline.ll
+++ b/llvm/test/CodeGen/AMDGPU/opt-pipeline.ll
@@ -517,8 +517,8 @@
 ; GCN-O2-NEXT:       Function Alias Analysis Results
 ; GCN-O2-NEXT:       Memory SSA
 ; GCN-O2-NEXT:       MemCpy Optimization
-; GCN-O2-NEXT:       Dead Store Elimination
 ; GCN-O2-NEXT:       Natural Loop Information
+; GCN-O2-NEXT:       Dead Store Elimination
 ; GCN-O2-NEXT:       Canonicalize natural loops
 ; GCN-O2-NEXT:       LCSSA Verifier
 ; GCN-O2-NEXT:       Loop-Closed SSA Form Pass
@@ -877,8 +877,8 @@
 ; GCN-O3-NEXT:       Function Alias Analysis Results
 ; GCN-O3-NEXT:       Memory SSA
 ; GCN-O3-NEXT:       MemCpy Optimization
-; GCN-O3-NEXT:       Dead Store Elimination
 ; GCN-O3-NEXT:       Natural Loop Information
+; GCN-O3-NEXT:       Dead Store Elimination
 ; GCN-O3-NEXT:       Canonicalize natural loops
 ; GCN-O3-NEXT:       LCSSA Verifier
 ; GCN-O3-NEXT:       Loop-Closed SSA Form Pass

diff  --git a/llvm/test/Other/opt-O2-pipeline.ll b/llvm/test/Other/opt-O2-pipeline.ll
index 36370d6cd6d39..542a79820e4f2 100644
--- a/llvm/test/Other/opt-O2-pipeline.ll
+++ b/llvm/test/Other/opt-O2-pipeline.ll
@@ -162,8 +162,8 @@
 ; CHECK-NEXT:         Function Alias Analysis Results
 ; CHECK-NEXT:         Memory SSA
 ; CHECK-NEXT:         MemCpy Optimization
-; CHECK-NEXT:         Dead Store Elimination
 ; CHECK-NEXT:         Natural Loop Information
+; CHECK-NEXT:         Dead Store Elimination
 ; CHECK-NEXT:         Canonicalize natural loops
 ; CHECK-NEXT:         LCSSA Verifier
 ; CHECK-NEXT:         Loop-Closed SSA Form Pass

diff  --git a/llvm/test/Other/opt-O3-pipeline-enable-matrix.ll b/llvm/test/Other/opt-O3-pipeline-enable-matrix.ll
index c39361dcc8a2d..4c5e126f229fa 100644
--- a/llvm/test/Other/opt-O3-pipeline-enable-matrix.ll
+++ b/llvm/test/Other/opt-O3-pipeline-enable-matrix.ll
@@ -167,8 +167,8 @@
 ; CHECK-NEXT:         Function Alias Analysis Results
 ; CHECK-NEXT:         Memory SSA
 ; CHECK-NEXT:         MemCpy Optimization
-; CHECK-NEXT:         Dead Store Elimination
 ; CHECK-NEXT:         Natural Loop Information
+; CHECK-NEXT:         Dead Store Elimination
 ; CHECK-NEXT:         Canonicalize natural loops
 ; CHECK-NEXT:         LCSSA Verifier
 ; CHECK-NEXT:         Loop-Closed SSA Form Pass

diff  --git a/llvm/test/Other/opt-O3-pipeline.ll b/llvm/test/Other/opt-O3-pipeline.ll
index af77e09c24968..97b11fec08b9c 100644
--- a/llvm/test/Other/opt-O3-pipeline.ll
+++ b/llvm/test/Other/opt-O3-pipeline.ll
@@ -167,8 +167,8 @@
 ; CHECK-NEXT:         Function Alias Analysis Results
 ; CHECK-NEXT:         Memory SSA
 ; CHECK-NEXT:         MemCpy Optimization
-; CHECK-NEXT:         Dead Store Elimination
 ; CHECK-NEXT:         Natural Loop Information
+; CHECK-NEXT:         Dead Store Elimination
 ; CHECK-NEXT:         Canonicalize natural loops
 ; CHECK-NEXT:         LCSSA Verifier
 ; CHECK-NEXT:         Loop-Closed SSA Form Pass

diff  --git a/llvm/test/Other/opt-Os-pipeline.ll b/llvm/test/Other/opt-Os-pipeline.ll
index e24d3d3f57bfc..b65e550ce9ebe 100644
--- a/llvm/test/Other/opt-Os-pipeline.ll
+++ b/llvm/test/Other/opt-Os-pipeline.ll
@@ -148,8 +148,8 @@
 ; CHECK-NEXT:         Function Alias Analysis Results
 ; CHECK-NEXT:         Memory SSA
 ; CHECK-NEXT:         MemCpy Optimization
-; CHECK-NEXT:         Dead Store Elimination
 ; CHECK-NEXT:         Natural Loop Information
+; CHECK-NEXT:         Dead Store Elimination
 ; CHECK-NEXT:         Canonicalize natural loops
 ; CHECK-NEXT:         LCSSA Verifier
 ; CHECK-NEXT:         Loop-Closed SSA Form Pass

diff  --git a/llvm/test/Transforms/DeadStoreElimination/multiblock-loops.ll b/llvm/test/Transforms/DeadStoreElimination/multiblock-loops.ll
index 7bcb1ad79f0b0..e555e04f51d1c 100644
--- a/llvm/test/Transforms/DeadStoreElimination/multiblock-loops.ll
+++ b/llvm/test/Transforms/DeadStoreElimination/multiblock-loops.ll
@@ -111,7 +111,6 @@ define void @test_loop(i32 %N, i32* noalias nocapture readonly %A, i32* noalias
 ; CHECK:       for.body4.lr.ph:
 ; CHECK-NEXT:    [[I_028:%.*]] = phi i32 [ [[INC11:%.*]], [[FOR_COND_CLEANUP3:%.*]] ], [ 0, [[FOR_BODY4_LR_PH_PREHEADER]] ]
 ; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i32 [[I_028]]
-; CHECK-NEXT:    store i32 0, i32* [[ARRAYIDX]], align 4
 ; CHECK-NEXT:    [[MUL:%.*]] = mul nsw i32 [[I_028]], [[N]]
 ; CHECK-NEXT:    br label [[FOR_BODY4:%.*]]
 ; CHECK:       for.body4:
@@ -179,7 +178,6 @@ define i32 @test_if(i1 %c, i32* %p, i32 %i) {
 ; CHECK-NEXT:    [[PH:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC:%.*]], [[BB3:%.*]] ]
 ; CHECK-NEXT:    [[INC]] = add i32 [[PH]], 1
 ; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds i32, i32* [[P:%.*]], i32 [[PH]]
-; CHECK-NEXT:    store i32 [[I:%.*]], i32* [[GEP]], align 4
 ; CHECK-NEXT:    br i1 [[C:%.*]], label [[BB2:%.*]], label [[BB3]]
 ; CHECK:       bb2:
 ; CHECK-NEXT:    br label [[BB3]]
@@ -216,7 +214,6 @@ define i32 @test_if2(i1 %c, i32* %p, i32 %i) {
 ; CHECK-NEXT:    [[PH:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC:%.*]], [[BB2:%.*]] ], [ [[INC]], [[BB3:%.*]] ]
 ; CHECK-NEXT:    [[INC]] = add i32 [[PH]], 1
 ; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds i32, i32* [[P:%.*]], i32 [[PH]]
-; CHECK-NEXT:    store i32 [[I:%.*]], i32* [[GEP]], align 4
 ; CHECK-NEXT:    br i1 [[C:%.*]], label [[BB2]], label [[BB3]]
 ; CHECK:       bb2:
 ; CHECK-NEXT:    store i32 2, i32* [[GEP]], align 4


        


More information about the llvm-commits mailing list