[llvm] [NewGVN][3/3] Load coercion for loads that can be replaced by a phi (PR #68669)

via llvm-commits llvm-commits at lists.llvm.org
Tue Oct 10 00:49:55 PDT 2023


llvmbot wrote:


<!--LLVM PR SUMMARY COMMENT-->

@llvm/pr-subscribers-llvm-transforms

Author: Konstantina Mitropoulou (kmitropoulou)

<details>
<summary>Changes</summary>

[NewGVN][3/3] Load coercion for loads that can be replaced by a phi

In the following two examples, there are two cases where the load can be
replaced by a phi:

1. MemoryPhi: In Example 1, load %V is dependent on a MemoryPhi. This
indicates that there are two memory definitions in BB1 and BB2 for %V. As
a result, we replace the load with a phi.

Example 1:
```
Before load coercion
BB1:                        BB2:
 1 = MemoryDef(liveOnEntry)  2 = MemoryDef(liveOnEntry)
 store i32 100, ptr %P       store i32 500, ptr %P
 br label %BB3               br label %BB3
                      \     /
                     BB3:
                      3 = MemoryPhi({BB1,1},{BB2,2})
                      %V = load i32, ptr %P

After load coercion
 BB1:                       BB2:
  store i32 100, ptr %P      store i32 500, ptr %P
  br label %BB3              br label
                      \     /
                     BB3:
                      %V = phi i32 [ 100, %BB1 ], [ 500, %BB2 ]

```

2. Partial load elimination: In Example 2, %V1 and %V2 have live-on-entry
definitions and their memory locations overlap. By emitting a new load
%V2' in BB2, we can replace %V2 with a phi node.

Example 2:
```
Before load coercion
  BB1:                              BB2:
   %V1 = load <2 x i32>, ptr %P      br label %BB3
   br label %BB3                    /
               \              /
                 BB3:
                  %V2 = load i32, ptr %P

After load coercion
  BB1:                                BB2:
   %V1 = load <2 x i32>, ptr %P        %V2' = load i32, ptr %P
   %0 = bitcast <2 x i32> %V1 to i64  br label %BB3
   %1 = trunc i64 %0 to i32            /
   br label %BB3                      /
               \                /
                 BB3:
                  %V2 = phi i32 [ %1, %BB1 ], [ %V2', %BB2 ]
```

The code includes more cases like these. Please refer to the examples in the
code comments for more details.


---

Patch is 196.52 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/68669.diff


6 Files Affected:

- (modified) llvm/lib/Transforms/Scalar/NewGVN.cpp (+994-51) 
- (added) llvm/test/Transforms/NewGVN/load_coercion_between_loads.ll (+424) 
- (added) llvm/test/Transforms/NewGVN/load_coercion_between_store_and_load.ll (+341) 
- (added) llvm/test/Transforms/NewGVN/load_coercion_replace_load_with_phi.ll (+3788) 
- (modified) llvm/test/Transforms/NewGVN/pr14166-xfail.ll (-1) 
- (modified) llvm/test/Transforms/NewGVN/pr35125.ll (+1-1) 


``````````diff
diff --git a/llvm/lib/Transforms/Scalar/NewGVN.cpp b/llvm/lib/Transforms/Scalar/NewGVN.cpp
index 19ac9526b5f88b6..ace3ffceb8fb953 100644
--- a/llvm/lib/Transforms/Scalar/NewGVN.cpp
+++ b/llvm/lib/Transforms/Scalar/NewGVN.cpp
@@ -73,9 +73,11 @@
 #include "llvm/Analysis/CFGPrinter.h"
 #include "llvm/Analysis/ConstantFolding.h"
 #include "llvm/Analysis/GlobalsModRef.h"
+#include "llvm/Analysis/InstructionPrecedenceTracking.h"
 #include "llvm/Analysis/InstructionSimplify.h"
 #include "llvm/Analysis/MemoryBuiltins.h"
 #include "llvm/Analysis/MemorySSA.h"
+#include "llvm/Analysis/MemorySSAUpdater.h"
 #include "llvm/Analysis/TargetLibraryInfo.h"
 #include "llvm/Analysis/ValueTracking.h"
 #include "llvm/IR/Argument.h"
@@ -106,6 +108,7 @@
 #include "llvm/Transforms/Utils/AssumeBundleBuilder.h"
 #include "llvm/Transforms/Utils/Local.h"
 #include "llvm/Transforms/Utils/PredicateInfo.h"
+#include "llvm/Transforms/Utils/SSAUpdater.h"
 #include "llvm/Transforms/Utils/VNCoercion.h"
 #include <algorithm>
 #include <cassert>
@@ -154,6 +157,10 @@ static cl::opt<bool> EnableStoreRefinement("enable-store-refinement",
 static cl::opt<bool> EnablePhiOfOps("enable-phi-of-ops", cl::init(true),
                                     cl::Hidden);
 
+// Enables load coercion for non-constant values.
+static cl::opt<bool> EnableLoadCoercion("enable-load-coercion", cl::init(true),
+                                        cl::Hidden);
+
 //===----------------------------------------------------------------------===//
 //                                GVN Pass
 //===----------------------------------------------------------------------===//
@@ -495,6 +502,7 @@ class NewGVN {
   AssumptionCache *AC = nullptr;
   const DataLayout &DL;
   std::unique_ptr<PredicateInfo> PredInfo;
+  ImplicitControlFlowTracking *ICF = nullptr;
 
   // These are the only two things the create* functions should have
   // side-effects on due to allocating memory.
@@ -653,6 +661,16 @@ class NewGVN {
   // Deletion info.
   SmallPtrSet<Instruction *, 8> InstructionsToErase;
 
+  // Map candidate load to their depending instructions.
+  mutable std::map<Value *, DenseSet<std::pair<Instruction *, BasicBlock *>>>
+      LoadCoercion;
+
+  // Keep newly generated loads.
+  SmallVector<Instruction *, 2> NewLoadsInLoadCoercion;
+
+  // Keep newly generated instructions.
+  SmallVector<Instruction *, 2> NewlyGeneratedInsns;
+
 public:
   NewGVN(Function &F, DominatorTree *DT, AssumptionCache *AC,
          TargetLibraryInfo *TLI, AliasAnalysis *AA, MemorySSA *MSSA,
@@ -776,9 +794,9 @@ class NewGVN {
   ExprResult checkExprResults(Expression *, Instruction *, Value *) const;
   ExprResult performSymbolicEvaluation(Instruction *,
                                        SmallPtrSetImpl<Value *> &) const;
-  const Expression *performSymbolicLoadCoercion(Type *, Value *, LoadInst *,
-                                                Instruction *,
-                                                MemoryAccess *) const;
+  const Expression *createLoadExpAndUpdateMemUses(LoadInst *, Value *,
+                                                  MemoryAccess *,
+                                                  MemoryAccess *) const;
   const Expression *performSymbolicLoadEvaluation(Instruction *) const;
   const Expression *performSymbolicStoreEvaluation(Instruction *) const;
   ExprResult performSymbolicCallEvaluation(Instruction *) const;
@@ -853,6 +871,7 @@ class NewGVN {
   // Utilities.
   void cleanupTables();
   std::pair<unsigned, unsigned> assignDFSNumbers(BasicBlock *, unsigned);
+  unsigned updateDFSNumbers(unsigned);
   void updateProcessedCount(const Value *V);
   void verifyMemoryCongruency() const;
   void verifyIterationSettled(Function &F);
@@ -893,6 +912,54 @@ class NewGVN {
   // Debug counter info.  When verifying, we have to reset the value numbering
   // debug counter to the same state it started in to get the same results.
   int64_t StartingVNCounter = 0;
+
+  // The following functions are used in load coercion:
+  // Try to add the load along with the depending instruction(s) in
+  // LoadCoercion map.
+  bool tryAddLoadDepInsnIntoLoadCoercionMap(LoadInst *, Instruction *,
+                                            BasicBlock *) const;
+  // Check if the candidate load can be optimized by another load which is also
+  // a live of entry definition and add it in LoadCoercion map.
+  bool findLiveOnEntryDependency(LoadInst *, LoadInst *, ArrayRef<BasicBlock *>,
+                                 bool) const;
+  // Collect the load instructions that can be optimized with load coercion.
+  // The filtering of the load instructions is based the type of their memory
+  // access.
+  bool performSymbolicLoadCoercionForNonConstantMemoryDef(LoadInst *,
+                                                          StoreInst *,
+                                                          MemoryAccess *) const;
+  const Expression *performSymbolicLoadCoercionForConstantMemoryDef(
+      Type *, Value *, LoadInst *, Instruction *, MemoryAccess *) const;
+  bool performSymbolicLoadCoercionForLiveOnEntryDef(LoadInst *,
+                                                    MemoryAccess *) const;
+  bool performSymbolicLoadCoercionForMemoryPhi(LoadInst *,
+                                               MemoryAccess *) const;
+  // Code generation for load coercion. Replaces the load with the right
+  // instruction or the right sequence of instructions.
+  bool implementLoadCoercion();
+  // Update MemorySSA with the load instructions that are emitted during load
+  // coercion.
+  void updateMemorySSA(Instruction *, Instruction *);
+  // Extract the value that will replace the load from the depending
+  // instruction.
+  Value *getExtractedValue(LoadInst *, Instruction *);
+  // If load coercion is successful, the uses of the optimized load might need
+  // to be added to new congruence classes in order to optimize the code
+  // further. For this reason, we run value numbering for all the uses of the
+  // optimized load. If load coercion has failed, then we need to add the load
+  // (and its uses) to the right congruence class.
+  // Emit the phi that replaces the load and it updates the SSA with the new
+  // phi.
+  Value *emitLoadCoercionPhi(LoadInst *, BasicBlock *,
+                             ArrayRef<std::pair<BasicBlock *, Instruction *>>);
+  // Check if the load can be replaced by a phi.
+  Value *tryReplaceLoadWithPhi(
+      LoadInst *, BasicBlock *,
+      SmallVectorImpl<std::pair<BasicBlock *, Instruction *>> &,
+      ArrayRef<BasicBlock *>);
+  void updateUsesAfterLoadCoercionImpl(LoadInst *,
+                                       SmallVectorImpl<Instruction *> &);
+  void updateUsesAfterLoadCoercion(LoadInst *, Value *);
 };
 
 } // end anonymous namespace
@@ -1439,12 +1506,380 @@ const Expression *NewGVN::performSymbolicStoreEvaluation(Instruction *I) const {
   return createStoreExpression(SI, StoreAccess);
 }
 
+// A load can have one or more dependencies as the following examples show:
+//
+// Example 1:
+//  BB1:
+//   ...
+//   store i32 %V1, ptr %P
+//   ...
+//   %V2 = load i32, ptr %P
+//   ...
+//
+// Example 2:
+//  BB1:                       BB2:
+//   store i32 %V1, ptr %P     %V2 = load i32, ptr %P
+//   br label %BB3              br label %BB3
+//                      \      /
+//                     BB3:
+//                      %V3 = load i32, ptr %P
+//
+// In the first example, the load (%V2) has only one dependency. In the second
+// example, the load (%V3) has two dependencies. Therefore, we add the load
+// along with its two dependencies in LoadCoercion map. However, this is not
+// always the case as it is shown below:
+//
+// Example 3:
+//                   BB1:
+//                    %V1 = load <4 x i32>, ptr %P
+//                    br i1 %cond, label %BB2, label %BB3
+//                   /                          \
+//   BB2:                                      BB3:
+//    %V2 = load <2 x i32>, ptr %P              %V3 = load i32, ptr %P
+//    br label %BB4                             br label %BB4
+//		     \                         /
+//                  BB4:
+//                   %V4 = load i32, ptr %P
+//
+// The %V4 load can be optimized by any of the loads (%V1, %V2, %V3). The loads
+// %V2 and %V3 can also be optimized by %V1. For this reason, we need to do an
+// extra check before we add the load in the map. Hence, we check if the load is
+// already in the map and if the existing depending instruction dominates the
+// current depending instruction. If so, then we do not add the new depending
+// instruction in LoadCoercion map. If the current depending instruction
+// dominates the existing depending instruction, then we remove the existing
+// depending instruction from LoadCoercion map and we add the current depending
+// instruction. In Example 3, the %V4 load has only one dependency (%V1) and we
+// add only this one in LoadCoercion map.
+bool NewGVN::tryAddLoadDepInsnIntoLoadCoercionMap(
+    LoadInst *LI, Instruction *CurrentDepI, BasicBlock *CurrentDepIBB) const {
+  // Can't forward from non-atomic to atomic without violating memory model.
+  if (LI->isAtomic() > CurrentDepI->isAtomic())
+    return false;
+
+  if (auto *DepLI = dyn_cast<LoadInst>(CurrentDepI))
+    if (LI->getAlign() < DepLI->getAlign())
+      return false;
+
+  if (auto *DepSI = dyn_cast<StoreInst>(CurrentDepI))
+    if (LI->getAlign() < DepSI->getAlign())
+      return false;
+
+  // Check if LI already exists in LoadCoercion map.
+  auto It = LoadCoercion.find(LI);
+  if (It != LoadCoercion.end()) {
+    auto &ExistingDepInsns = It->second;
+    // Iterate over all the existing depending instructions of LI.
+    for (auto &P : llvm::make_early_inc_range(ExistingDepInsns)) {
+      Instruction *ExistingDepI = P.first;
+      if (MSSAWalker->getClobberingMemoryAccess(getMemoryAccess(CurrentDepI)) ==
+              MSSAWalker->getClobberingMemoryAccess(
+                  getMemoryAccess(ExistingDepI)) &&
+          isa<LoadInst>(ExistingDepI) && isa<LoadInst>(CurrentDepI)) {
+        // If the existing depending instruction dominates the current depending
+        // instruction, then we should not add the current depending instruction
+        // in LoadCoercion map (Example 3).
+        if (DT->dominates(ExistingDepI, CurrentDepI))
+          return true;
+
+        // If the current depending instruction dominates the existing one, then
+        // we remove the existing depending instruction from the LoadCoercion
+        // map. Next, we add the current depending instruction in LoadCoercion
+        // map.
+        if (DT->dominates(CurrentDepI, ExistingDepI))
+          ExistingDepInsns.erase(P);
+      }
+    }
+  }
+  // Add the load and the corresponding depending instruction in LoadCoercion
+  // map.
+  LoadCoercion[LI].insert(std::make_pair(CurrentDepI, CurrentDepIBB));
+  return true;
+}
+
+// Check if it is possible to apply load coercion between CandidateLI and
+// DependingLoad.
+bool NewGVN::findLiveOnEntryDependency(LoadInst *CandidateLI,
+                                       LoadInst *DependingLoad,
+                                       ArrayRef<BasicBlock *> DependingBlocks,
+                                       bool IsMemoryPhiDep) const {
+  int Offset = -1;
+
+  if (!DependingLoad || CandidateLI == DependingLoad ||
+      DependingLoad->getNumUses() == 0)
+    return false;
+
+  BasicBlock *DependingLoadBB = DependingLoad->getParent();
+  if (!ReachableBlocks.count(DependingLoadBB) ||
+      ICF->isDominatedByICFIFromSameBlock(CandidateLI))
+    return false;
+
+  if (InstructionsToErase.count(DependingLoad))
+    return false;
+
+  // We do not look deep in the CFG. We consider either instructions that
+  // dominate CandidateLI or instructions that are in one of the predecessors of
+  // CandidateLI.
+  if (DT->dominates(DependingLoad, CandidateLI))
+    Offset = analyzeLoadFromClobberingLoad(CandidateLI->getType(),
+                                           CandidateLI->getPointerOperand(),
+                                           DependingLoad, DL);
+  else {
+    BasicBlock *CandidateLIBB = CandidateLI->getParent();
+    auto It1 = llvm::find(DependingBlocks, CandidateLIBB);
+    auto It2 = llvm::find(DependingBlocks, DependingLoadBB);
+    auto Ite = DependingBlocks.end();
+    if (It1 == Ite && It2 != Ite && !isBackedge(DependingLoadBB, CandidateLIBB))
+      Offset = analyzeLoadFromClobberingLoad(CandidateLI->getType(),
+                                             CandidateLI->getPointerOperand(),
+                                             DependingLoad, DL);
+  }
+
+  bool IsLoadCoercionCandidate = false;
+  if (Offset >= 0) {
+    // If the candidate load depends on a MemoryPhi, then we do not consider the
+    // parent block of the depending instruction, but instead it is more
+    // convenient to consider the basic block of the MemoryPhi from which the
+    // value comes e.g.:
+    //                            BB1:
+    //                             %V1 = load i32, ptr %P
+    //                             br i1 %Cond, label %BB2, label %BB3
+    //                           /    \
+    //      BB2:                      BB3:
+    //       store i32 100, ptr %P    br label %BB4
+    //       br label %BB4             /
+    //                           \    /
+    //                          BB4:
+    //                           %V2 = load i32, ptr %P
+    //
+    BasicBlock *BB = IsMemoryPhiDep ? DependingBlocks.back() : DependingLoadBB;
+    IsLoadCoercionCandidate |=
+        tryAddLoadDepInsnIntoLoadCoercionMap(CandidateLI, DependingLoad, BB);
+  }
+  return IsLoadCoercionCandidate;
+}
+
+// Process load instructions that have MemoryPhi dependencies.
+bool NewGVN::performSymbolicLoadCoercionForMemoryPhi(
+    LoadInst *LI, MemoryAccess *DefiningAccess) const {
+  assert((!LI || LI->isSimple()) && "Not a simple load");
+  bool IsLoadCoercionCandidate = false;
+  if (auto *MemPhi = dyn_cast<MemoryPhi>(DefiningAccess)) {
+    // If the candidate load is dominated by a call that never returns, then we
+    // do not replace the load with a phi node.
+    if (ICF->isDominatedByICFIFromSameBlock(LI))
+      return false;
+
+    // The MemoryPhi of Example 1 indicates that the load is dependent on the
+    // store (1) in Basic block T and store (2) in basic block F. Therefore,
+    // both of the store instructions should be added in LoadCoercion map.
+    //
+    // Example 1:
+    //     BB1:                        BB2:
+    //      1 = MemoryDef(liveOnEntry)  2 = MemoryDef(liveOnEntry)
+    //      store i32 100, ptr %P       store i32 500, ptr %P
+    //      br label %BB3               br label %BB3
+    //                             \    /
+    //                            BB3:
+    //                             3 = MemoryPhi({BB1,1},{BB2,2})
+    //                             %V = load i32, ptr %P
+    //
+    // In Example 2, the load of BB3 has two dependencies: the store in BB1 as
+    // the MemoryPhi indicates and the load in BB2 which is not included in
+    // MemoryPhi. To find this dependency, we check if it is possible to apply
+    // load coercion to any of the instructions that have live on entry
+    // definition. We restrict our search to the MemoryPhi predecessors and the
+    // instructions that dominate the MemoryPhi.
+    //
+    // Example 2:
+    //     BB1:                        BB2:
+    //      1 = MemoryDef(liveOnEntry)  0 = MemoryDef(liveOnEntry)
+    //      store i32 100, ptr %P       %V1 = load i32, ptr %P
+    //      br label %BB3               br label %BB3
+    //                             \    /
+    //                            BB3:
+    //                             2 = MemoryPhi({BB1,1},{BB2,liveOnEntry})
+    //                             %V2 = load i32, ptr %P
+    //
+    // Iterate over all the operands of the memory phi and check if any of its
+    // operands can optimize the current load.
+    SmallVector<std::pair<MemoryAccess *, BasicBlock *>, 1>
+        LiveOnEntryMemAccesses;
+    for (Use &Op : MemPhi->incoming_values()) {
+      // Bail out if one of the operands is not a memory use or definition.
+      // TODO: Add support for MemoryPhi operands.
+      if (!isa<MemoryUseOrDef>(&Op)) {
+        LoadCoercion.erase(LI);
+        return false;
+      }
+
+      MemoryUseOrDef *MemAccess = cast<MemoryUseOrDef>(&Op);
+      int Offset = -1;
+      Instruction *DepI = nullptr;
+      BasicBlock *IncomingBB = MemPhi->getIncomingBlock(Op);
+
+      // We collect the MemoryPhi operands that have live on entry definitions
+      // and we process them later only if it is possible to optimize LI with
+      // the MemoryDef operand. The search for the live on entry definitions is
+      // expensive and we need to do it only if it is necessary.
+      if (MSSA->isLiveOnEntryDef(MemAccess))
+        LiveOnEntryMemAccesses.push_back(std::make_pair(MemAccess, IncomingBB));
+      else if (isa<MemoryDef>(&Op)) {
+        // Process MemoryDef operands.
+        DepI = MemAccess->getMemoryInst();
+        Offset = -1;
+
+        if (!ReachableBlocks.count(DepI->getParent())) {
+          LoadCoercion.erase(LI);
+          return false;
+        }
+
+        if (DT->dominates(LI, DepI)) {
+          // In this case, there is a loop. For now, we bail-out load
+          // coercion.
+          LoadCoercion.erase(LI);
+          return false;
+        }
+
+        if (auto *DepS = dyn_cast<StoreInst>(DepI))
+          Offset = analyzeLoadFromClobberingStore(
+              LI->getType(), LI->getPointerOperand(), DepS, DL);
+        else if (auto *DepL = dyn_cast<LoadInst>(DepI))
+          Offset = analyzeLoadFromClobberingLoad(
+              LI->getType(), LI->getPointerOperand(), DepL, DL);
+        else if (auto *DepCall = dyn_cast<CallInst>(DepI)) {
+          // TODO: Improve call coverage.
+          if (AA->doesNotAccessMemory(DepCall) || AA->onlyReadsMemory(DepCall))
+            continue;
+          LoadCoercion.erase(LI);
+          return false;
+        } else {
+          LoadCoercion.erase(LI);
+          return false;
+        }
+        if (Offset >= 0)
+          IsLoadCoercionCandidate |=
+              tryAddLoadDepInsnIntoLoadCoercionMap(LI, DepI, IncomingBB);
+        else {
+          LoadCoercion.erase(LI);
+          return false;
+        }
+      }
+    }
+
+    if (IsLoadCoercionCandidate) {
+      // Process the operands with live on entry definitions.
+      for (auto P : LiveOnEntryMemAccesses) {
+        MemoryAccess *MemAccess = P.first;
+        int Offset;
+        for (const auto &U : MemAccess->uses()) {
+          Offset = -1;
+          auto *MemUse = dyn_cast<MemoryUse>(U.getUser());
+          if (MemUse == nullptr)
+            continue;
+          LoadInst *DependingLoad = dyn_cast<LoadInst>(MemUse->getMemoryInst());
+          if (!DependingLoad)
+            continue;
+          SmallVector<BasicBlock *, 1> IncomingBB;
+          IncomingBB.push_back(P.second);
+          findLiveOnEntryDependency(LI, DependingLoad, IncomingBB, true);
+        }
+      }
+    }
+  }
+  return IsLoadCoercionCandidate;
+}
+
+// Find load coercion opportunities between instructions with live on entry
+// definitions.
+bool NewGVN::performSymbolicLoadCoercionForLiveOnEntryDef(
+    LoadInst *LI, MemoryAccess *DefiningAccess) const {
+  bool IsLoadCoercionCandidate = false;
+  for (const auto &U : MSSA->getLiveOnEntryDef()->uses()) {
+    if (auto *MemUse = dyn_cast<MemoryUse>(U.getUser())) {
+      // TODO: Add support for calls.
+      LoadInst *DependingLoad = dyn_cast<LoadInst>(MemUse->getMemoryInst());
+      if (!DependingLoad || LI == DependingLoad)
+        continue;
+
+      // If the two instructions have the same type, then there is a load
+      // coercion opportunity only if the LI and the DependingLoad are in
+      // different basic blocks and the basic block of the DependingLoad is one
+      // of the predecessors of the basic block of the LI. For any other case,
+      // the LI will be eliminated by adding the two loads in the same
+      // congruence class.
+      //
+      // Example 1: Here, we do not need to apply load coercion. The two load
+      // will be added in the same congruence class and %V2 will be eliminated.
+      //
+      //  BB1:
+      // ...
[truncated]

``````````

</details>


https://github.com/llvm/llvm-project/pull/68669


More information about the llvm-commits mailing list