[llvm] [GVN] Permit load PRE to happen in more cases (PR #79324)

via llvm-commits llvm-commits at lists.llvm.org
Wed Jan 24 08:42:03 PST 2024


llvmbot wrote:


<!--LLVM PR SUMMARY COMMENT-->

@llvm/pr-subscribers-llvm-transforms

Author: David Sherwood (david-arm)

<details>
<summary>Changes</summary>

There are some loads that we fail to mark as available
in AnalyzeLoadAvailability because we don't do enough
analysis when the MemDepResult is not local. Consider
an example like this:

entry:
  br i1 %c1, label %A, label %C

A:
  %pi = getelementptr i8, ptr %p, i8 %i
  %pj = getelementptr i8, ptr %p, i8 %j
  %x = load i8, ptr %pi
  %y = load i8, ptr %pj
  %c2 = icmp slt i8 %x, %y
  br i1 %c2, label %C, label %B

B:
  br label %C

C:
  %k = phi i8 [ %i, %entry ], [ %i, %A ], [ %j, %B ]
  %pk = getelementptr i8, ptr %p, i8 %k
  %z = load i8, ptr %pk

Currently we analyse the load (%z) in block C and mark
it as unavailable on the path through block B, despite
the fact that B has only a single predecessor, block A,
where the load is in fact available. There is also
nothing between

  %y = load i8, ptr %pj

and

  %k = phi i8 [ %i, %entry ], [ %i, %A ], [ %j, %B ]

that could clobber the load.

This patch adds support for such cases by walking back
through all predecessor paths to see whether each of
them leads to a block containing an available load of
the same address, with nothing in between that could
clobber it. This is similar to what we already do for
selects between two pointer values, so we can reuse
some existing code.
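
For the example above this means GVN can now split the
critical edge from entry to C and insert the missing load
there, giving IR roughly like this (a sketch based on the
check lines of the new test7a below; the exact value and
block names are illustrative):

entry:
  br i1 %c1, label %A, label %entry.C_crit_edge

entry.C_crit_edge:
  ; PRE'd copy of the load for the path that skipped A
  %pk.pre = getelementptr i8, ptr %p, i8 %i
  %z.pre = load i8, ptr %pk.pre
  br label %C

A:
  ; ... loads of %x and %y as before ...
  br i1 %c2, label %C, label %B

B:
  br label %C

C:
  ; the original load is replaced by a phi of the
  ; available values from each predecessor
  %z = phi i8 [ %z.pre, %entry.C_crit_edge ], [ %x, %A ], [ %y, %B ]
  %k = phi i8 [ %i, %entry.C_crit_edge ], [ %i, %A ], [ %j, %B ]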

This leads to 0.7% and 1.2% improvements on the SPEC2017
benchmark omnetpp when running on AArch64 neoverse-v1
and neoverse-n1 machines, respectively.

---
Full diff: https://github.com/llvm/llvm-project/pull/79324.diff


3 Files Affected:

- (modified) llvm/include/llvm/Transforms/Scalar/GVN.h (+2-1) 
- (modified) llvm/lib/Transforms/Scalar/GVN.cpp (+85-23) 
- (modified) llvm/test/Transforms/GVN/PRE/load-pre-licm.ll (+196) 


``````````diff
diff --git a/llvm/include/llvm/Transforms/Scalar/GVN.h b/llvm/include/llvm/Transforms/Scalar/GVN.h
index 4ba9b74ccb005d..113d0adc36f6e8 100644
--- a/llvm/include/llvm/Transforms/Scalar/GVN.h
+++ b/llvm/include/llvm/Transforms/Scalar/GVN.h
@@ -320,7 +320,8 @@ class GVNPass : public PassInfoMixin<GVNPass> {
   /// Given a local dependency (Def or Clobber) determine if a value is
   /// available for the load.
   std::optional<gvn::AvailableValue>
-  AnalyzeLoadAvailability(LoadInst *Load, MemDepResult DepInfo, Value *Address);
+  AnalyzeLoadAvailability(LoadInst *Load, BasicBlock *DepBB,
+                          MemDepResult DepInfo, Value *Address);
 
   /// Given a list of non-local dependencies, determine if a value is
   /// available for the load in each specified block.  If it is, add it to
diff --git a/llvm/lib/Transforms/Scalar/GVN.cpp b/llvm/lib/Transforms/Scalar/GVN.cpp
index e36578f3de7ac4..675ef0695db07d 100644
--- a/llvm/lib/Transforms/Scalar/GVN.cpp
+++ b/llvm/lib/Transforms/Scalar/GVN.cpp
@@ -1143,34 +1143,100 @@ static void reportMayClobberedLoad(LoadInst *Load, MemDepResult DepInfo,
   ORE->emit(R);
 }
 
+static bool findDominatingValueInBlock(const MemoryLocation &Loc, Type *LoadTy,
+                                       Instruction *From,
+                                       BatchAAResults *BatchAA,
+                                       uint32_t &NumVisitedInsts,
+                                       Value *&FoundVal) {
+  for (auto *Inst = From; Inst != nullptr;
+       Inst = Inst->getPrevNonDebugInstruction()) {
+    // Stop the search if limit is reached.
+    if (++NumVisitedInsts > MaxNumVisitedInsts) {
+      return false;
+    }
+    if (isModSet(BatchAA->getModRefInfo(Inst, Loc))) {
+      return false;
+    }
+    if (auto *LI = dyn_cast<LoadInst>(Inst))
+      if (LI->getPointerOperand() == Loc.Ptr && LI->getType() == LoadTy) {
+        FoundVal = LI;
+        return true;
+      }
+  }
+  FoundVal = nullptr;
+  return true;
+}
+
 // Find non-clobbered value for Loc memory location in extended basic block
 // (chain of basic blocks with single predecessors) starting From instruction.
 static Value *findDominatingValue(const MemoryLocation &Loc, Type *LoadTy,
                                   Instruction *From, AAResults *AA) {
-  uint32_t NumVisitedInsts = 0;
   BasicBlock *FromBB = From->getParent();
+  uint32_t NumVisitedInsts = 0;
   BatchAAResults BatchAA(*AA);
-  for (BasicBlock *BB = FromBB; BB; BB = BB->getSinglePredecessor())
-    for (auto *Inst = BB == FromBB ? From : BB->getTerminator();
-         Inst != nullptr; Inst = Inst->getPrevNonDebugInstruction()) {
-      // Stop the search if limit is reached.
-      if (++NumVisitedInsts > MaxNumVisitedInsts)
-        return nullptr;
-      if (isModSet(BatchAA.getModRefInfo(Inst, Loc)))
-        return nullptr;
-      if (auto *LI = dyn_cast<LoadInst>(Inst))
-        if (LI->getPointerOperand() == Loc.Ptr && LI->getType() == LoadTy)
-          return LI;
-    }
+  for (BasicBlock *BB = FromBB; BB; BB = BB->getSinglePredecessor()) {
+    Value *FoundVal;
+    if (!findDominatingValueInBlock(Loc, LoadTy,
+                                    BB == FromBB ? From : BB->getTerminator(),
+                                    &BatchAA, NumVisitedInsts, FoundVal))
+      return nullptr;
+    else if (FoundVal)
+      return FoundVal;
+  }
   return nullptr;
 }
 
 std::optional<AvailableValue>
-GVNPass::AnalyzeLoadAvailability(LoadInst *Load, MemDepResult DepInfo,
-                                 Value *Address) {
+GVNPass::AnalyzeLoadAvailability(LoadInst *Load, BasicBlock *DepBB,
+                                 MemDepResult DepInfo, Value *Address) {
   assert(Load->isUnordered() && "rules below are incorrect for ordered access");
-  assert(DepInfo.isLocal() && "expected a local dependence");
 
+  // Even for an unknown memory dependence we can still walk backwards through
+  // any predecessor blocks to see if we can find an available load for the
+  // address. This load needs to be the same for every possible predecessor.
+  // Here is one such example:
+  //   LoadBB
+  //     |   \
+  //     |    MidBB
+  //     |   /
+  //    DepBB
+  // This is similar to what we already do for a SelectInst further down.
+  // There must be no instructions between the load and the end of DepBB
+  // that could clobber the load.
+  if (!DepInfo.isLocal()) {
+    if (!Address || pred_empty(DepBB))
+      return std::nullopt;
+
+    // First check to see if there is anything in DepBB that would clobber the
+    // load.
+    auto Loc = MemoryLocation::get(Load).getWithNewPtr(Address);
+    Value *FoundVal;
+    uint32_t NumVisitedInsts = 0;
+    BatchAAResults BatchAA(*getAliasAnalysis());
+    if (!findDominatingValueInBlock(Loc, Load->getType(),
+                                    DepBB->getTerminator(), &BatchAA,
+                                    NumVisitedInsts, FoundVal))
+      return std::nullopt;
+    else if (FoundVal)
+      return AvailableValue::getLoad(cast<LoadInst>(FoundVal));
+
+    // Then look for a common dominating value along all the predecessor paths.
+    Value *LoadOfAddr = nullptr;
+    for (BasicBlock *PredBB : predecessors(DepBB)) {
+      FoundVal = findDominatingValue(
+          Loc, Load->getType(), PredBB->getTerminator(), getAliasAnalysis());
+      if (!FoundVal)
+        return std::nullopt;
+      if (!LoadOfAddr)
+        LoadOfAddr = FoundVal;
+      else if (FoundVal != LoadOfAddr)
+        return std::nullopt;
+    }
+
+    return AvailableValue::getLoad(cast<LoadInst>(LoadOfAddr));
+  }
+
+  assert(DepInfo.isLocal() && "expected a local dependence");
   Instruction *DepInst = DepInfo.getInst();
 
   const DataLayout &DL = Load->getModule()->getDataLayout();
@@ -1324,15 +1390,11 @@ void GVNPass::AnalyzeLoadAvailability(LoadInst *Load, LoadDepVect &Deps,
       continue;
     }
 
-    if (!DepInfo.isLocal()) {
-      UnavailableBlocks.push_back(DepBB);
-      continue;
-    }
-
     // The address being loaded in this non-local block may not be the same as
     // the pointer operand of the load if PHI translation occurs.  Make sure
     // to consider the right address.
-    if (auto AV = AnalyzeLoadAvailability(Load, DepInfo, Dep.getAddress())) {
+    if (auto AV =
+            AnalyzeLoadAvailability(Load, DepBB, DepInfo, Dep.getAddress())) {
       // subtlety: because we know this was a non-local dependency, we know
       // it's safe to materialize anywhere between the instruction within
       // DepInfo and the end of it's block.
@@ -2160,7 +2222,7 @@ bool GVNPass::processLoad(LoadInst *L) {
     return false;
   }
 
-  auto AV = AnalyzeLoadAvailability(L, Dep, L->getPointerOperand());
+  auto AV = AnalyzeLoadAvailability(L, nullptr, Dep, L->getPointerOperand());
   if (!AV)
     return false;
 
diff --git a/llvm/test/Transforms/GVN/PRE/load-pre-licm.ll b/llvm/test/Transforms/GVN/PRE/load-pre-licm.ll
index c52f46b4f63ee9..040889f12d03f7 100644
--- a/llvm/test/Transforms/GVN/PRE/load-pre-licm.ll
+++ b/llvm/test/Transforms/GVN/PRE/load-pre-licm.ll
@@ -282,3 +282,199 @@ header:
   call void @hold(i32 %v1)
   br label %header
 }
+
+
+; For the load in block C there is no prior load to reuse on the
+; path from the entry block, so a critical edge has to be split
+; in order to insert one. The other 2 predecessors lead back to
+; block A, which already has the loads.
+define i8 @test7a(i1 %c1, ptr %p, i8 %i, i8 %j) {
+; CHECK-LABEL: @test7a(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br i1 [[C1:%.*]], label [[A:%.*]], label [[ENTRY_C_CRIT_EDGE:%.*]]
+; CHECK:       entry.C_crit_edge:
+; CHECK-NEXT:    [[PK_PHI_TRANS_INSERT:%.*]] = getelementptr i8, ptr [[P:%.*]], i8 [[I:%.*]]
+; CHECK-NEXT:    [[Z_PRE:%.*]] = load i8, ptr [[PK_PHI_TRANS_INSERT]], align 1
+; CHECK-NEXT:    br label [[C:%.*]]
+; CHECK:       A:
+; CHECK-NEXT:    [[PI:%.*]] = getelementptr i8, ptr [[P]], i8 [[I]]
+; CHECK-NEXT:    [[PJ:%.*]] = getelementptr i8, ptr [[P]], i8 [[J:%.*]]
+; CHECK-NEXT:    [[X:%.*]] = load i8, ptr [[PI]], align 1
+; CHECK-NEXT:    [[Y:%.*]] = load i8, ptr [[PJ]], align 1
+; CHECK-NEXT:    [[C2:%.*]] = icmp slt i8 [[X]], [[Y]]
+; CHECK-NEXT:    br i1 [[C2]], label [[C]], label [[B:%.*]]
+; CHECK:       B:
+; CHECK-NEXT:    br label [[C]]
+; CHECK:       C:
+; CHECK-NEXT:    [[Z:%.*]] = phi i8 [ [[Z_PRE]], [[ENTRY_C_CRIT_EDGE]] ], [ [[X]], [[A]] ], [ [[Y]], [[B]] ]
+; CHECK-NEXT:    [[K:%.*]] = phi i8 [ [[I]], [[ENTRY_C_CRIT_EDGE]] ], [ [[I]], [[A]] ], [ [[J]], [[B]] ]
+; CHECK-NEXT:    [[PK:%.*]] = getelementptr i8, ptr [[P]], i8 [[K]]
+; CHECK-NEXT:    ret i8 [[Z]]
+;
+entry:
+  br i1 %c1, label %A, label %C
+
+A:
+  %pi = getelementptr i8, ptr %p, i8 %i
+  %pj = getelementptr i8, ptr %p, i8 %j
+  %x = load i8, ptr %pi
+  %y = load i8, ptr %pj
+  %c2 = icmp slt i8 %x, %y
+  br i1 %c2, label %C, label %B
+
+B:
+  br label %C
+
+C:
+  %k = phi i8 [ %i, %entry ], [%i, %A], [ %j, %B ]
+  %pk = getelementptr i8, ptr %p, i8 %k
+  %z = load i8, ptr %pk
+  ret i8 %z
+}
+
+
+; For the load %z in block C, the load is missing via the entry and
+; B blocks, which will require 2 critical edges. Via the other
+; predecessor (A) the pre-existing load could potentially be reused.
+define i8 @test7b(i1 %c1, ptr %p, i8 %i, i8 %j) {
+; CHECK-LABEL: @test7b(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br i1 [[C1:%.*]], label [[A:%.*]], label [[C:%.*]]
+; CHECK:       A:
+; CHECK-NEXT:    [[PI:%.*]] = getelementptr i8, ptr [[P:%.*]], i8 [[I:%.*]]
+; CHECK-NEXT:    [[X:%.*]] = load i8, ptr [[PI]], align 1
+; CHECK-NEXT:    [[C2:%.*]] = icmp slt i8 [[X]], 3
+; CHECK-NEXT:    br i1 [[C2]], label [[C]], label [[B:%.*]]
+; CHECK:       B:
+; CHECK-NEXT:    br label [[C]]
+; CHECK:       C:
+; CHECK-NEXT:    [[K:%.*]] = phi i8 [ [[I]], [[ENTRY:%.*]] ], [ [[I]], [[A]] ], [ [[J:%.*]], [[B]] ]
+; CHECK-NEXT:    [[PK:%.*]] = getelementptr i8, ptr [[P]], i8 [[K]]
+; CHECK-NEXT:    [[Z:%.*]] = load i8, ptr [[PK]], align 1
+; CHECK-NEXT:    ret i8 [[Z]]
+;
+entry:
+  br i1 %c1, label %A, label %C
+
+A:
+  %pi = getelementptr i8, ptr %p, i8 %i
+  %x = load i8, ptr %pi
+  %c2 = icmp slt i8 %x, 3
+  br i1 %c2, label %C, label %B
+
+B:
+  br label %C
+
+C:
+  %k = phi i8 [ %i, %entry ], [%i, %A], [ %j, %B ]
+  %pk = getelementptr i8, ptr %p, i8 %k
+  %z = load i8, ptr %pk
+  ret i8 %z
+}
+
+
+; For the load in block D there is no prior load to reuse on the
+; path from the entry block, so a critical edge has to be split
+; in order to insert one. The other predecessors all lead back to
+; block A, which already has the loads. In the case of the incoming
+; value from C we have to walk back two blocks to get to A.
+define i8 @test7c(i1 %c1, i1 %c2, ptr %p, i8 %i, i8 %j) {
+; CHECK-LABEL: @test7c(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br i1 [[C1:%.*]], label [[A:%.*]], label [[ENTRY_D_CRIT_EDGE:%.*]]
+; CHECK:       entry.D_crit_edge:
+; CHECK-NEXT:    [[PK_PHI_TRANS_INSERT:%.*]] = getelementptr i8, ptr [[P:%.*]], i8 [[I:%.*]]
+; CHECK-NEXT:    [[Z_PRE:%.*]] = load i8, ptr [[PK_PHI_TRANS_INSERT]], align 1
+; CHECK-NEXT:    br label [[D:%.*]]
+; CHECK:       A:
+; CHECK-NEXT:    [[PI:%.*]] = getelementptr i8, ptr [[P]], i8 [[I]]
+; CHECK-NEXT:    [[PJ:%.*]] = getelementptr i8, ptr [[P]], i8 [[J:%.*]]
+; CHECK-NEXT:    [[X:%.*]] = load i8, ptr [[PI]], align 1
+; CHECK-NEXT:    [[Y:%.*]] = load i8, ptr [[PJ]], align 1
+; CHECK-NEXT:    [[C3:%.*]] = icmp slt i8 [[X]], [[Y]]
+; CHECK-NEXT:    br i1 [[C3]], label [[D]], label [[B:%.*]]
+; CHECK:       B:
+; CHECK-NEXT:    br i1 [[C2:%.*]], label [[C:%.*]], label [[D]]
+; CHECK:       C:
+; CHECK-NEXT:    br label [[D]]
+; CHECK:       D:
+; CHECK-NEXT:    [[Z:%.*]] = phi i8 [ [[Z_PRE]], [[ENTRY_D_CRIT_EDGE]] ], [ [[X]], [[A]] ], [ [[X]], [[B]] ], [ [[Y]], [[C]] ]
+; CHECK-NEXT:    [[K:%.*]] = phi i8 [ [[I]], [[ENTRY_D_CRIT_EDGE]] ], [ [[I]], [[A]] ], [ [[I]], [[B]] ], [ [[J]], [[C]] ]
+; CHECK-NEXT:    [[PK:%.*]] = getelementptr i8, ptr [[P]], i8 [[K]]
+; CHECK-NEXT:    ret i8 [[Z]]
+;
+entry:
+  br i1 %c1, label %A, label %D
+
+A:
+  %pi = getelementptr i8, ptr %p, i8 %i
+  %pj = getelementptr i8, ptr %p, i8 %j
+  %x = load i8, ptr %pi
+  %y = load i8, ptr %pj
+  %c3 = icmp slt i8 %x, %y
+  br i1 %c3, label %D, label %B
+
+B:
+  br i1 %c2, label %C, label %D
+
+C:
+  br label %D
+
+D:
+  %k = phi i8 [ %i, %entry ], [ %i, %A ], [ %i, %B ], [ %j, %C ]
+  %pk = getelementptr i8, ptr %p, i8 %k
+  %z = load i8, ptr %pk
+  ret i8 %z
+}
+
+
+; Similar to test7a except there is a volatile load in block B from an
+; address that may overlap with %pj. We should still be able to
+; perform load PRE since the volatile load does not clobber anything.
+define i8 @test7d(i1 %c1, ptr %p, i8 %i, i8 %j, i32 %v) {
+; CHECK-LABEL: @test7d(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br i1 [[C1:%.*]], label [[A:%.*]], label [[ENTRY_C_CRIT_EDGE:%.*]]
+; CHECK:       entry.C_crit_edge:
+; CHECK-NEXT:    [[PK_PHI_TRANS_INSERT:%.*]] = getelementptr i8, ptr [[P:%.*]], i8 [[I:%.*]]
+; CHECK-NEXT:    [[Z_PRE:%.*]] = load i8, ptr [[PK_PHI_TRANS_INSERT]], align 1
+; CHECK-NEXT:    br label [[C:%.*]]
+; CHECK:       A:
+; CHECK-NEXT:    [[PI:%.*]] = getelementptr i8, ptr [[P]], i8 [[I]]
+; CHECK-NEXT:    [[PJ:%.*]] = getelementptr i8, ptr [[P]], i8 [[J:%.*]]
+; CHECK-NEXT:    [[X:%.*]] = load i8, ptr [[PI]], align 1
+; CHECK-NEXT:    [[Y:%.*]] = load i8, ptr [[PJ]], align 1
+; CHECK-NEXT:    [[C2:%.*]] = icmp slt i8 [[X]], [[Y]]
+; CHECK-NEXT:    br i1 [[C2]], label [[C]], label [[B:%.*]]
+; CHECK:       B:
+; CHECK-NEXT:    [[PJ2:%.*]] = getelementptr i8, ptr [[PJ]], i32 [[V:%.*]]
+; CHECK-NEXT:    [[Y2:%.*]] = load volatile i32, ptr [[PJ2]], align 4
+; CHECK-NEXT:    br label [[C]]
+; CHECK:       C:
+; CHECK-NEXT:    [[Z:%.*]] = phi i8 [ [[Z_PRE]], [[ENTRY_C_CRIT_EDGE]] ], [ [[X]], [[A]] ], [ [[Y]], [[B]] ]
+; CHECK-NEXT:    [[K:%.*]] = phi i8 [ [[I]], [[ENTRY_C_CRIT_EDGE]] ], [ [[I]], [[A]] ], [ [[J]], [[B]] ]
+; CHECK-NEXT:    [[PK:%.*]] = getelementptr i8, ptr [[P]], i8 [[K]]
+; CHECK-NEXT:    ret i8 [[Z]]
+;
+entry:
+  br i1 %c1, label %A, label %C
+
+A:
+  %pi = getelementptr i8, ptr %p, i8 %i
+  %pj = getelementptr i8, ptr %p, i8 %j
+  %x = load i8, ptr %pi
+  %y = load i8, ptr %pj
+  %c2 = icmp slt i8 %x, %y
+  br i1 %c2, label %C, label %B
+
+B:
+  %pj2 = getelementptr i8, ptr %pj, i32 %v
+  %y2 = load volatile i32, ptr %pj2
+  br label %C
+
+C:
+  %k = phi i8 [ %i, %entry ], [%i, %A], [ %j, %B ]
+  %pk = getelementptr i8, ptr %p, i8 %k
+  %z = load i8, ptr %pk
+  ret i8 %z
+}

``````````

</details>


https://github.com/llvm/llvm-project/pull/79324

