[llvm] [NewGVN][1/3] Load coercion between load and store (PR #68659)

Wed Oct 18 20:47:50 PDT 2023

https://github.com/kmitropoulou updated https://github.com/llvm/llvm-project/pull/68659

>From b327849d0081b5872a7a8075ab6acb277ea79f0c Mon Sep 17 00:00:00 2001
From: Konstantina Mitropoulou <Konstantina.Mitropoulou at amd.com>
Date: Mon, 9 Oct 2023 21:05:14 -0700
Subject: [PATCH] [NewGVN][1/3] Load coercion between load and store

Load coercion consists of two phases:
1. Collection of the load instructions that can be optimized with load coercion.
We collect pairs of candidate load and its depending instructions. The candidate
load is the load that will be eliminated by the value that we will extract from
the depending instruction. The reason that we can eliminate the candidate load is
because its memory location overlaps with the memory location of the depending
instruction.
For example, in the following snippet, the candidate load is %V2 and the
depending instruction is the store.

```
Before load coercion               After load coercion
BB1:                               BB1:
 store i32 100, ptr %P              store i32 100, ptr %P
 %V1 = ...                   =>     %V1 = ...
 %V2 = load i32, ptr %P             %V3 = add i32 %V1, 100
 %V3 = add i32 %V1, %V2
```

2. Code generation for load coercion: This phase updates the IR by eliminating
the candidate load and by updating its uses.

This patch implements load coercion between a load candidate and a store
depending instruction. The follow-up patches implement load coercion support for
instructions that have live on entry definitions and MemoryPhi definitions.
---
 llvm/lib/Transforms/Scalar/NewGVN.cpp         | 455 ++++++++++++++++--
 .../load_coercion_between_store_and_load.ll   | 341 +++++++++++++
 llvm/test/Transforms/NewGVN/pr14166-xfail.ll  |   1 -
 3 files changed, 757 insertions(+), 40 deletions(-)
 create mode 100644 llvm/test/Transforms/NewGVN/load_coercion_between_store_and_load.ll

diff --git a/llvm/lib/Transforms/Scalar/NewGVN.cpp b/llvm/lib/Transforms/Scalar/NewGVN.cpp
index 19ac9526b5f88b6..d8d84a4241acb81 100644
--- a/llvm/lib/Transforms/Scalar/NewGVN.cpp
+++ b/llvm/lib/Transforms/Scalar/NewGVN.cpp
@@ -76,6 +76,7 @@
 #include "llvm/Analysis/InstructionSimplify.h"
 #include "llvm/Analysis/MemoryBuiltins.h"
 #include "llvm/Analysis/MemorySSA.h"
+#include "llvm/Analysis/MemorySSAUpdater.h"
 #include "llvm/Analysis/TargetLibraryInfo.h"
 #include "llvm/Analysis/ValueTracking.h"
 #include "llvm/IR/Argument.h"
@@ -154,6 +155,10 @@ static cl::opt<bool> EnableStoreRefinement("enable-store-refinement",
 static cl::opt<bool> EnablePhiOfOps("enable-phi-of-ops", cl::init(true),
                                     cl::Hidden);
 
+// Enables load coercion for non-constant values.
+static cl::opt<bool> EnableLoadCoercion("enable-load-coercion", cl::init(true),
+                                        cl::Hidden);
+
 //===----------------------------------------------------------------------===//
 //                                GVN Pass
 //===----------------------------------------------------------------------===//
@@ -653,6 +658,16 @@ class NewGVN {
   // Deletion info.
   SmallPtrSet<Instruction *, 8> InstructionsToErase;
 
+  // Map candidate load to their depending instructions.
+  mutable std::map<Value *, DenseSet<std::pair<Instruction *, BasicBlock *>>>
+      LoadCoercion;
+
+  // Keep newly generated loads.
+  SmallVector<Instruction *, 2> NewLoadsInLoadCoercion;
+
+  // Keep newly generated instructions.
+  SmallVector<Instruction *, 2> NewlyGeneratedInsns;
+
 public:
   NewGVN(Function &F, DominatorTree *DT, AssumptionCache *AC,
          TargetLibraryInfo *TLI, AliasAnalysis *AA, MemorySSA *MSSA,
@@ -776,9 +791,9 @@ class NewGVN {
   ExprResult checkExprResults(Expression *, Instruction *, Value *) const;
   ExprResult performSymbolicEvaluation(Instruction *,
                                        SmallPtrSetImpl<Value *> &) const;
-  const Expression *performSymbolicLoadCoercion(Type *, Value *, LoadInst *,
-                                                Instruction *,
-                                                MemoryAccess *) const;
+  const Expression *createLoadExpAndUpdateMemUses(LoadInst *, Value *,
+                                                  MemoryAccess *,
+                                                  MemoryAccess *) const;
   const Expression *performSymbolicLoadEvaluation(Instruction *) const;
   const Expression *performSymbolicStoreEvaluation(Instruction *) const;
   ExprResult performSymbolicCallEvaluation(Instruction *) const;
@@ -853,6 +868,7 @@ class NewGVN {
   // Utilities.
   void cleanupTables();
   std::pair<unsigned, unsigned> assignDFSNumbers(BasicBlock *, unsigned);
+  unsigned updateDFSNumbers(unsigned);
   void updateProcessedCount(const Value *V);
   void verifyMemoryCongruency() const;
   void verifyIterationSettled(Function &F);
@@ -893,6 +909,37 @@ class NewGVN {
   // Debug counter info.  When verifying, we have to reset the value numbering
   // debug counter to the same state it started in to get the same results.
   int64_t StartingVNCounter = 0;
+
+  // The following functions are used in load coercion:
+  // Try to add the load along with the depending instruction(s) in
+  // LoadCoercion map.
+  bool tryAddLoadDepInsnIntoLoadCoercionMap(LoadInst *, Instruction *,
+                                            BasicBlock *) const;
+  // Collect the load instructions that can be optimized with load coercion.
+  // The filtering of the load instructions is based on the type of their
+  // memory access.
+  bool performSymbolicLoadCoercionForNonConstantMemoryDef(LoadInst *,
+                                                          StoreInst *,
+                                                          MemoryAccess *) const;
+  const Expression *performSymbolicLoadCoercionForConstantMemoryDef(
+      Type *, Value *, LoadInst *, Instruction *, MemoryAccess *) const;
+  // Code generation for load coercion. Replaces the load with the right
+  // instruction or the right sequence of instructions.
+  bool implementLoadCoercion();
+  // Update MemorySSA with the load instructions that are emitted during load
+  // coercion.
+  void updateMemorySSA(Instruction *, Instruction *);
+  // Extract the value that will replace the load from the depending
+  // instruction.
+  Value *getExtractedValue(LoadInst *, Instruction *);
+  // If load coercion is successful, the uses of the optimized load might need
+  // to be added to new congruence classes in order to optimize the code
+  // further. For this reason, we run value numbering for all the uses of the
+  // optimized load. If load coercion has failed, then we need to add the load
+  // (and its uses) to the right congruence class.
+  void updateUsesAfterLoadCoercionImpl(LoadInst *,
+                                       SmallVectorImpl<Instruction *> &);
+  void updateUsesAfterLoadCoercion(LoadInst *, Value *);
 };
 
 } // end anonymous namespace
@@ -1439,12 +1486,96 @@ const Expression *NewGVN::performSymbolicStoreEvaluation(Instruction *I) const {
   return createStoreExpression(SI, StoreAccess);
 }
 
+// A load can have one or more dependencies as the following examples show:
+//
+// Example 1:
+//  BB1:
+//   ...
+//   store i32 %V1, ptr %P
+//   ...
+//   %V2 = load i32, ptr %P
+//   ...
+//
+// Example 2:
+//  BB1:                       BB2:
+//   store i32 %V1, ptr %P     %V2 = load i32, ptr %P
+//   br label %BB3              br label %BB3
+//                      \      /
+//                     BB3:
+//                      %V3 = load i32, ptr %P
+//
+// In the first example, the load (%V2) has only one dependency. In the second
+// example, the load (%V3) has two dependencies. Therefore, we add the load
+// along with its two dependencies in LoadCoercion map. However, this is not
+// always the case as it is shown below:
+//
+// Example 3:
+//                   BB1:
+//                    %V1 = load <4 x i32>, ptr %P
+//                    br i1 %cond, label %BB2, label %BB3
+//                   /                          \
+//   BB2:                                      BB3:
+//    %V2 = load <2 x i32>, ptr %P              %V3 = load i32, ptr %P
+//    br label %BB4                             br label %BB4
+//		     \                         /
+//                  BB4:
+//                   %V4 = load i32, ptr %P
+//
+// The %V4 load can be optimized by any of the loads (%V1, %V2, %V3). The loads
+// %V2 and %V3 can also be optimized by %V1. For this reason, we need to do an
+// extra check before we add the load in the map. Hence, we check if the load is
+// already in the map and if the existing depending instruction dominates the
+// current depending instruction. If so, then we do not add the new depending
+// instruction in LoadCoercion map. If the current depending instruction
+// dominates the existing depending instruction, then we remove the existing
+// depending instruction from LoadCoercion map and we add the current depending
+// instruction. In Example 3, the %V4 load has only one dependency (%V1) and we
+// add only this one in LoadCoercion map.
+bool NewGVN::tryAddLoadDepInsnIntoLoadCoercionMap(
+    LoadInst *LI, Instruction *CurrentDepI, BasicBlock *CurrentDepIBB) const {
+  // An atomic load must not be fed from a non-atomic depending instruction;
+  // forwarding in that direction would violate the memory model.
+  if (LI->isAtomic() && !CurrentDepI->isAtomic())
+    return false;
+
+  // Reject the pair when the candidate load has a smaller alignment than the
+  // load or store it depends on. Any other kind of depending instruction is
+  // not subject to this check.
+  const auto LoadAlignment = LI->getAlign();
+  if (auto *DepLoad = dyn_cast<LoadInst>(CurrentDepI)) {
+    if (LoadAlignment < DepLoad->getAlign())
+      return false;
+  } else if (auto *DepStore = dyn_cast<StoreInst>(CurrentDepI)) {
+    if (LoadAlignment < DepStore->getAlign())
+      return false;
+  }
+
+  // Record the (depending instruction, block) pair under the candidate load.
+  // NOTE(review): no dominance-based filtering of already recorded
+  // dependencies happens here; every accepted pair is simply inserted.
+  auto &DepsOfLoad = LoadCoercion[LI];
+  DepsOfLoad.insert(std::make_pair(CurrentDepI, CurrentDepIBB));
+  return true;
+}
+
+// Decide whether the load LI can be a load coercion candidate with respect to
+// the store DepSI and, if so, record the pair in the LoadCoercion map.
+// Returns true only when the pair was recorded. DefiningAccess is not used by
+// this function.
+bool NewGVN::performSymbolicLoadCoercionForNonConstantMemoryDef(
+    LoadInst *LI, StoreInst *DepSI, MemoryAccess *DefiningAccess) const {
+  Type *const LoadTy = LI->getType();
+
+  // Bail out when an atomic load would be fed by a non-atomic store, or when
+  // the loaded type already equals the type of the stored value.
+  const bool AtomicMismatch = LI->isAtomic() && !DepSI->isAtomic();
+  if (AtomicMismatch || LoadTy == DepSI->getValueOperand()->getType())
+    return false;
+
+  // The pair is recorded only if the clobbering-store analysis yields a
+  // non-negative offset for the leader of the load address.
+  auto *LoadPtrLeader = lookupOperandLeader(LI->getPointerOperand());
+  if (analyzeLoadFromClobberingStore(LoadTy, LoadPtrLeader, DepSI, DL) < 0)
+    return false;
+
+  return tryAddLoadDepInsnIntoLoadCoercionMap(LI, DepSI, DepSI->getParent());
+}
+
 // See if we can extract the value of a loaded pointer from a load, a store, or
 // a memory instruction.
-const Expression *
-NewGVN::performSymbolicLoadCoercion(Type *LoadType, Value *LoadPtr,
-                                    LoadInst *LI, Instruction *DepInst,
-                                    MemoryAccess *DefiningAccess) const {
+const Expression *NewGVN::performSymbolicLoadCoercionForConstantMemoryDef(
+    Type *LoadType, Value *LoadPtr, LoadInst *LI, Instruction *DepInst,
+    MemoryAccess *DefiningAccess) const {
   assert((!LI || LI->isSimple()) && "Not a simple load");
   if (auto *DepSI = dyn_cast<StoreInst>(DepInst)) {
     // Can't forward from non-atomic to atomic without violating memory model.
@@ -1464,21 +1595,6 @@ NewGVN::performSymbolicLoadCoercion(Type *LoadType, Value *LoadPtr,
         }
       }
     }
-  } else if (auto *DepLI = dyn_cast<LoadInst>(DepInst)) {
-    // Can't forward from non-atomic to atomic without violating memory model.
-    if (LI->isAtomic() > DepLI->isAtomic())
-      return nullptr;
-    int Offset = analyzeLoadFromClobberingLoad(LoadType, LoadPtr, DepLI, DL);
-    if (Offset >= 0) {
-      // We can coerce a constant load into a load.
-      if (auto *C = dyn_cast<Constant>(lookupOperandLeader(DepLI)))
-        if (auto *PossibleConstant =
-                getConstantValueForLoad(C, Offset, LoadType, DL)) {
-          LLVM_DEBUG(dbgs() << "Coercing load from load " << *LI
-                            << " to constant " << *PossibleConstant << "\n");
-          return createConstantExpression(PossibleConstant);
-        }
-    }
   } else if (auto *DepMI = dyn_cast<MemIntrinsic>(DepInst)) {
     int Offset = analyzeLoadFromClobberingMemInst(LoadType, LoadPtr, DepMI, DL);
     if (Offset >= 0) {
@@ -1510,11 +1626,24 @@ NewGVN::performSymbolicLoadCoercion(Type *LoadType, Value *LoadPtr,
       return createConstantExpression(UndefValue::get(LoadType));
   } else if (auto *InitVal =
                  getInitialValueOfAllocation(DepInst, TLI, LoadType))
-      return createConstantExpression(InitVal);
+    return createConstantExpression(InitVal);
 
   return nullptr;
 }
 
+// Build the load expression for LI (typed as LI, addressed by
+// LoadAddressLeader, defined by DefiningAccess) and return it. When the
+// expression's memory leader differs from DefiningAccess, OriginalAccess is
+// registered as a user of that leader so the load is reprocessed when the
+// leader changes.
+const Expression *
+NewGVN::createLoadExpAndUpdateMemUses(LoadInst *LI, Value *LoadAddressLeader,
+                                      MemoryAccess *OriginalAccess,
+                                      MemoryAccess *DefiningAccess) const {
+  const auto *LoadExp = createLoadExpression(LI->getType(), LoadAddressLeader,
+                                             LI, DefiningAccess);
+  const auto *Leader = LoadExp->getMemoryLeader();
+  // Nothing to register when the leader already is the defining access.
+  if (Leader == DefiningAccess)
+    return LoadExp;
+  addMemoryUsers(Leader, OriginalAccess);
+  return LoadExp;
+}
+
 const Expression *NewGVN::performSymbolicLoadEvaluation(Instruction *I) const {
   auto *LI = cast<LoadInst>(I);
 
@@ -1531,6 +1660,22 @@ const Expression *NewGVN::performSymbolicLoadEvaluation(Instruction *I) const {
   MemoryAccess *DefiningAccess =
       MSSAWalker->getClobberingMemoryAccess(OriginalAccess);
 
+  // Do not apply load coercion to load instructions that are candidates of
+  // phi-of-ops optimization.
+  if (TempToBlock.count(LI))
+    return createLoadExpAndUpdateMemUses(LI, LoadAddressLeader, OriginalAccess,
+                                         DefiningAccess);
+
+  // Do not apply load coercion to load instructions that are generated during
+  // load coercion.
+  auto It = llvm::find(NewLoadsInLoadCoercion, LI);
+  if (It != NewLoadsInLoadCoercion.end())
+    return createLoadExpAndUpdateMemUses(LI, LoadAddressLeader, OriginalAccess,
+                                         DefiningAccess);
+
+  // Check if we can apply load coercion.
+  bool IsLoadCoercionCandidate = false;
+
   if (!MSSA->isLiveOnEntryDef(DefiningAccess)) {
     if (auto *MD = dyn_cast<MemoryDef>(DefiningAccess)) {
       Instruction *DefiningInst = MD->getMemoryInst();
@@ -1542,19 +1687,34 @@ const Expression *NewGVN::performSymbolicLoadEvaluation(Instruction *I) const {
       // certain memory operations that cause the memory to have a fixed value
       // (IE things like calloc).
       if (const auto *CoercionResult =
-              performSymbolicLoadCoercion(LI->getType(), LoadAddressLeader, LI,
-                                          DefiningInst, DefiningAccess))
+              performSymbolicLoadCoercionForConstantMemoryDef(
+                  LI->getType(), LoadAddressLeader, LI, DefiningInst,
+                  DefiningAccess))
         return CoercionResult;
+
+      if (EnableLoadCoercion) {
+        if (auto *DepSI = dyn_cast<StoreInst>(DefiningInst)) {
+          if (!isa<Constant>(lookupOperandLeader(DepSI->getValueOperand()))) {
+            IsLoadCoercionCandidate =
+                performSymbolicLoadCoercionForNonConstantMemoryDef(
+                    LI, DepSI, DefiningAccess);
+          }
+        }
+      }
     }
   }
 
-  const auto *LE = createLoadExpression(LI->getType(), LoadAddressLeader, LI,
-                                        DefiningAccess);
-  // If our MemoryLeader is not our defining access, add a use to the
-  // MemoryLeader, so that we get reprocessed when it changes.
-  if (LE->getMemoryLeader() != DefiningAccess)
-    addMemoryUsers(LE->getMemoryLeader(), OriginalAccess);
-  return LE;
+  // If LI is a candidate for load coercion, then we do not create a load
+  // expression and we remove it from PHINodeUses which keeps the candidates of
+  // phi-of-ops optimization.
+  if (EnableLoadCoercion && IsLoadCoercionCandidate) {
+    if (PHINodeUses.count(LI))
+      const_cast<NewGVN *>(this)->PHINodeUses.erase(LI);
+    return nullptr;
+  }
+  // Otherwise, we create a load expression.
+  return createLoadExpAndUpdateMemUses(LI, LoadAddressLeader, OriginalAccess,
+                                       DefiningAccess);
 }
 
 NewGVN::ExprResult
@@ -2986,6 +3146,9 @@ void NewGVN::cleanupTables() {
   MemoryToUsers.clear();
   RevisitOnReachabilityChange.clear();
   IntrinsicInstPred.clear();
+  LoadCoercion.clear();
+  NewLoadsInLoadCoercion.clear();
+  NewlyGeneratedInsns.clear();
 }
 
 // Assign local DFS number mapping to instructions, and leave space for Value
@@ -3021,6 +3184,17 @@ std::pair<unsigned, unsigned> NewGVN::assignDFSNumbers(BasicBlock *B,
   return std::make_pair(Start, End);
 }
 
+// Assign DFS numbers to every block reachable from the dominator tree root,
+// starting the numbering at ICount, and return the count after the last
+// block.
+unsigned NewGVN::updateDFSNumbers(unsigned ICount) {
+  // A standard depth first ordering of the domtree is equivalent to RPO.
+  for (auto *Node : depth_first(DT->getRootNode())) {
+    BasicBlock *BB = Node->getBlock();
+    const std::pair<unsigned, unsigned> Range = assignDFSNumbers(BB, ICount);
+    // NOTE(review): insert() keeps an existing entry for BB, so a second run
+    // over the same function does not refresh BlockInstRange - confirm that
+    // this is intended.
+    BlockInstRange.insert({BB, Range});
+    ICount += Range.second - Range.first;
+  }
+  return ICount;
+}
+
 void NewGVN::updateProcessedCount(const Value *V) {
 #ifndef NDEBUG
   if (ProcessedCount.count(V) == 0) {
@@ -3458,13 +3632,7 @@ bool NewGVN::runGVN() {
       });
   }
 
-  // Now a standard depth first ordering of the domtree is equivalent to RPO.
-  for (auto *DTN : depth_first(DT->getRootNode())) {
-    BasicBlock *B = DTN->getBlock();
-    const auto &BlockRange = assignDFSNumbers(B, ICount);
-    BlockInstRange.insert({B, BlockRange});
-    ICount += BlockRange.second - BlockRange.first;
-  }
+  ICount = updateDFSNumbers(ICount);
   initializeCongruenceClasses(F);
 
   TouchedInstructions.resize(ICount);
@@ -3485,6 +3653,15 @@ bool NewGVN::runGVN() {
   verifyIterationSettled(F);
   verifyStoreExpressions();
 
+  if (EnableLoadCoercion && implementLoadCoercion()) {
+    // Update the newly generated instructions with the correct DFS numbers.
+    // TODO: Update DFS numbers faster.
+    InstrDFS.clear();
+    DFSToInstr.clear();
+    RevisitOnReachabilityChange.clear();
+    ICount = updateDFSNumbers(0);
+  }
+
   Changed |= eliminateInstructions(F);
 
   // Delete all instructions marked for deletion.
@@ -3821,6 +3998,206 @@ Value *NewGVN::findPHIOfOpsLeader(const Expression *E,
   return nullptr;
 }
 
+// Register the newly emitted instruction NewLoad with MemorySSA. A memory
+// access is created for it in its parent block, using the live-on-entry
+// definition as the defining access, and is then inserted as a def or a use
+// with use renaming enabled. LoadToOptimize is not used by this function.
+void NewGVN::updateMemorySSA(Instruction *LoadToOptimize,
+                             Instruction *NewLoad) {
+  MemorySSAUpdater Updater(MSSA);
+  MemoryAccess *CreatedAccess = Updater.createMemoryAccessInBB(
+      NewLoad, MSSA->getLiveOnEntryDef(), NewLoad->getParent(),
+      MemorySSA::BeforeTerminator);
+
+  if (auto *AsDef = dyn_cast<MemoryDef>(CreatedAccess)) {
+    Updater.insertDef(AsDef, /*RenameUses=*/true);
+    return;
+  }
+  // Anything that is not a MemoryDef is expected to be a MemoryUse.
+  Updater.insertUse(cast<MemoryUse>(CreatedAccess), /*RenameUses=*/true);
+}
+
+// Extract from the depending instruction DepI the value that replaces the
+// candidate load LI. Any instructions needed for the extraction are emitted
+// between DepI and the instruction that originally followed it; they are
+// appended to NewlyGeneratedInsns, and newly emitted loads are additionally
+// registered with MemorySSA and recorded in NewLoadsInLoadCoercion.
+//
+// Returns the replacement value, or nullptr when DepI is neither a store nor
+// a load, or when the clobber analysis reports no usable (non-negative)
+// offset. Callers treat nullptr as "the load could not be eliminated".
+Value *NewGVN::getExtractedValue(LoadInst *LI, Instruction *DepI) {
+  Type *LoadTy = LI->getType();
+  Value *LoadPtr = LI->getPointerOperand();
+  // New instructions are inserted before the instruction that currently
+  // follows DepI, so afterwards they occupy the range (DepI, InsertPtr).
+  Instruction *InsertPtr = DepI->getNextNode();
+  Value *NewValue = nullptr;
+
+  // Emit the instructions that extract the coerced value from the depending
+  // instruction.
+  if (auto *Store = dyn_cast<StoreInst>(DepI)) {
+    int Offset = analyzeLoadFromClobberingStore(LoadTy, LoadPtr, Store, DL);
+    // A negative offset must not be forwarded as an unsigned byte offset.
+    if (Offset < 0)
+      return nullptr;
+    NewValue = getValueForLoad(Store->getValueOperand(), Offset, LoadTy,
+                               InsertPtr, DL);
+  } else if (auto *Load = dyn_cast<LoadInst>(DepI)) {
+    int Offset = analyzeLoadFromClobberingLoad(LoadTy, LoadPtr, Load, DL);
+    if (Offset < 0)
+      return nullptr;
+    NewValue = getValueForLoad(Load, Offset, LoadTy, InsertPtr, DL);
+  }
+
+  // Unsupported depending instruction (or no value produced): isa<> below
+  // must not be reached with a null pointer.
+  if (!NewValue)
+    return nullptr;
+
+  // Constants and arguments are skipped: the range after DepI is not scanned
+  // for them.
+  if (isa<Constant>(NewValue) || isa<Argument>(NewValue))
+    return NewValue;
+
+  // Collect the newly generated instructions.
+  for (Instruction *CurInsn = DepI->getNextNode(); CurInsn != InsertPtr;
+       CurInsn = CurInsn->getNextNode()) {
+    if (auto *NewLI = dyn_cast<LoadInst>(CurInsn)) {
+      updateMemorySSA(LI, NewLI);
+      // Record the *new* load (not the load being eliminated) so that value
+      // numbering can recognize it as generated by load coercion.
+      NewLoadsInLoadCoercion.push_back(NewLI);
+    }
+    NewlyGeneratedInsns.push_back(CurInsn);
+  }
+
+  return NewValue;
+}
+
+// Re-run value numbering for every instruction queued in LIUses, which is
+// consumed front-to-back as a FIFO worklist. Whenever an instruction ends up
+// in a different congruence class, everything that may depend on it is queued
+// as well: its IR users, the instructions attached to the users of its memory
+// access (loads only), and the instructions registered for it in
+// PredicateToUsers. Instructions already marked for erasure are skipped. LI,
+// the load being optimized, must never appear in the worklist.
+void NewGVN::updateUsesAfterLoadCoercionImpl(
+    LoadInst *LI, SmallVectorImpl<Instruction *> &LIUses) {
+  while (!LIUses.empty()) {
+    Instruction *I = LIUses.front();
+    assert(I != LI &&
+           "Vanity check that we do not process the optimized load.\n");
+    LIUses.erase(LIUses.begin());
+    if (InstructionsToErase.count(I))
+      continue;
+
+    CongruenceClass *OrigClass = ValueToClass.lookup(I);
+    valueNumberInstruction(I);
+    updateProcessedCount(I);
+    // Nothing else to revisit when the class of I did not change.
+    if (OrigClass == ValueToClass.lookup(I))
+      continue;
+
+    for (auto *User : I->users())
+      LIUses.push_back(cast<Instruction>(User));
+
+    // Users of a MemoryAccess are MemorySSA accesses, not IR instructions, so
+    // they cannot be cast to Instruction. Queue the instruction attached to
+    // each use/def instead; MemoryPhis carry no instruction and are skipped.
+    if (auto *NewLI = dyn_cast<LoadInst>(I))
+      if (MemoryAccess *MA = getMemoryAccess(NewLI))
+        for (auto *MAU : MA->users())
+          if (auto *UseOrDef = dyn_cast<MemoryUseOrDef>(MAU))
+            LIUses.push_back(UseOrDef->getMemoryInst());
+
+    auto It = PredicateToUsers.find(I);
+    if (It != PredicateToUsers.end())
+      for (auto *PredI : It->second)
+        LIUses.push_back(PredI);
+  }
+}
+
+// Bring value numbering up to date after load coercion was attempted for LI.
+//
+// NewValue != nullptr: LI is marked for erasure and all of its uses are
+// replaced by NewValue. The other members of LI's congruence class and all
+// instructions in NewlyGeneratedInsns (which are first placed in TOPClass)
+// are then value numbered again.
+//
+// NewValue == nullptr: LI gets a regular load expression, is assigned to a
+// congruence class, and its users are value numbered again.
+//
+// In both cases NewlyGeneratedInsns is emptied before returning.
+void NewGVN::updateUsesAfterLoadCoercion(LoadInst *LI, Value *NewValue) {
+  SmallVector<Instruction *, 2> Worklist;
+  if (NewValue) {
+    CongruenceClass *ClassOfLI = ValueToClass.lookup(LI);
+    InstructionsToErase.insert(LI);
+    LI->replaceAllUsesWith(NewValue);
+
+    // Queue every other member of the class LI belonged to ...
+    for (auto *Member : *ClassOfLI) {
+      auto *MemberI = cast<Instruction>(Member);
+      if (MemberI != LI)
+        Worklist.push_back(MemberI);
+    }
+    // ... followed by the freshly emitted instructions, which start out in
+    // TOPClass.
+    for (Instruction *NewI : NewlyGeneratedInsns) {
+      TOPClass->insert(NewI);
+      ValueToClass[NewI] = TOPClass;
+      Worklist.push_back(NewI);
+    }
+
+    updateUsesAfterLoadCoercionImpl(LI, Worklist);
+
+    if (isa<PHINode>(NewValue))
+      NewValue->takeName(LI);
+    if (auto *NewValueI = dyn_cast<Instruction>(NewValue))
+      NewValueI->setDebugLoc(LI->getDebugLoc());
+    LLVM_DEBUG(dbgs() << "Load coersion: The load " << *LI
+                      << " was eliminated and its uses were replaced by "
+                      << *NewValue << "\n");
+  } else {
+    // The load stays: give it the load expression it does not have yet and
+    // find its congruence class.
+    Value *AddressLeader = lookupOperandLeader(LI->getPointerOperand());
+    MemoryAccess *LoadAccess = getMemoryAccess(LI);
+    MemoryAccess *ClobberingAccess =
+        MSSAWalker->getClobberingMemoryAccess(LoadAccess);
+    const Expression *LoadExp = createLoadExpAndUpdateMemUses(
+        LI, AddressLeader, LoadAccess, ClobberingAccess);
+    performCongruenceFinding(LI, LoadExp);
+
+    for (User *LoadUser : LI->users())
+      Worklist.push_back(cast<Instruction>(LoadUser));
+    updateUsesAfterLoadCoercionImpl(LI, Worklist);
+  }
+  Worklist.clear();
+  NewlyGeneratedInsns.clear();
+}
+
+// Iterate over the load instructions of LoadCoercion map and replace them with
+// the right sequence of instructions. Returns true if at least one candidate
+// load was replaced by a value extracted from its depending instruction.
+bool NewGVN::implementLoadCoercion() {
+  bool AnythingReplaced = false;
+  for (const auto &P : LoadCoercion) {
+    LoadInst *LI = cast<LoadInst>(P.first);
+    // Bind by reference: the set is only inspected here, so copying it for
+    // every candidate load is unnecessary.
+    const auto &DependingInsns = P.second;
+    Instruction *FirstDepI = (*DependingInsns.begin()).first;
+    MemoryAccess *OriginalAccess = getMemoryAccess(LI);
+    MemoryAccess *DefiningAccess =
+        MSSAWalker->getClobberingMemoryAccess(OriginalAccess);
+    // Firstly, we check if we can extract the correct value from the depending
+    // instruction. This happens when LI is dominated by its only depending
+    // instruction:
+    //
+    // Example 1
+    // BB1:                               BB1:
+    //  store i32 100, ptr %P              store i32 100, ptr %P
+    //  %V1 = ...                   =>     %V1 = ...
+    //  %V2 = load i32, ptr %P             %V3 = add i32 %V1, 100
+    //  %V3 = add i32 %V1, %V2
+    //
+    // Example 2
+    // Before load coercion
+    //   BB1:
+    //    store i32 100, ptr %P
+    //    br i1 %Cond, label %BB2, label %BB3
+    //          /           \
+    // BB2                  BB3
+    //  ...                  ...
+    //  br label %BB4        br label %BB4
+    //            \         /
+    //           BB4:
+    //             %V1 = ...
+    //             %V2 = load i32, ptr %P
+    //             %V3 = add i32 %V1, %V2
+    //
+    // After load coercion
+    //   BB1:
+    //    store i32 100, ptr %P
+    //    br i1 %Cond, label %BB2, label %BB3
+    //          /           \
+    // BB2                  BB3
+    //  ...                  ...
+    //  br label %BB4        br label %BB4
+    //            \         /
+    //           BB4:
+    //             %V1 = ...
+    //             %V3 = add i32 %V1, 100
+    //
+    Value *NewValue = nullptr;
+    if (DependingInsns.size() == 1 && DT->dominates(FirstDepI, LI) &&
+        !isa<MemoryPhi>(DefiningAccess))
+      NewValue = getExtractedValue(LI, FirstDepI);
+
+    // A single call covers both outcomes. With a null NewValue the load is
+    // kept and value numbering runs for the load (it has no expression up to
+    // this point) and its uses; otherwise value numbering runs for all the new
+    // instructions and their uses.
+    updateUsesAfterLoadCoercion(LI, NewValue);
+    AnythingReplaced |= (NewValue != nullptr);
+  }
+  return AnythingReplaced;
+}
+
 bool NewGVN::eliminateInstructions(Function &F) {
   // This is a non-standard eliminator. The normal way to eliminate is
   // to walk the dominator tree in order, keeping track of available
diff --git a/llvm/test/Transforms/NewGVN/load_coercion_between_store_and_load.ll b/llvm/test/Transforms/NewGVN/load_coercion_between_store_and_load.ll
new file mode 100644
index 000000000000000..dcb166f6d29c1a6
--- /dev/null
+++ b/llvm/test/Transforms/NewGVN/load_coercion_between_store_and_load.ll
@@ -0,0 +1,341 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -S -p=gvn < %s | FileCheck %s -check-prefixes=GVN,OLDGVN
+; RUN: opt -S -p=newgvn < %s | FileCheck %s -check-prefixes=GVN,NEWGVN
+
+define float @test1(i32 %V1, ptr %P) {
+; GVN-LABEL: @test1(
+; GVN-NEXT:    store i32 [[V1:%.*]], ptr [[P:%.*]], align 1
+; GVN-NEXT:    [[TMP1:%.*]] = bitcast i32 [[V1]] to float
+; GVN-NEXT:    ret float [[TMP1]]
+; Store i32, then load float from the same pointer: the load is expected to fold to a bitcast of %V1.
+  store i32 %V1, ptr %P, align 1
+  %V2 = load float, ptr %P, align 1
+  ret float %V2
+}
+
+define float @test2(ptr %V1, ptr %P) {
+; GVN-LABEL: @test2(
+; GVN-NEXT:    store ptr [[V1:%.*]], ptr [[P:%.*]], align 1
+; GVN-NEXT:    [[TMP1:%.*]] = ptrtoint ptr [[V1]] to i64
+; GVN-NEXT:    [[TMP2:%.*]] = trunc i64 [[TMP1]] to i32
+; GVN-NEXT:    [[TMP3:%.*]] = bitcast i32 [[TMP2]] to float
+; GVN-NEXT:    ret float [[TMP3]]
+; Store ptr, then load float from %P: expected ptrtoint, trunc to i32 and bitcast of the stored pointer.
+  store ptr %V1, ptr %P, align 1
+  %V2 = load float, ptr %P, align 1
+  ret float %V2
+}
+
+define i8 @test3(i32 %V1, ptr %P) {
+; GVN-LABEL: @test3(
+; GVN-NEXT:    store i32 [[V1:%.*]], ptr [[P:%.*]], align 1
+; GVN-NEXT:    [[TMP1:%.*]] = trunc i32 [[V1]] to i8
+; GVN-NEXT:    ret i8 [[TMP1]]
+; Store i32, then load i8 from %P: the narrower load is expected to become a trunc of %V1.
+  store i32 %V1, ptr %P, align 1
+  %V2 = load i8, ptr %P, align 1
+  ret i8 %V2
+}
+
+define float @test4(i64 %V1, ptr %P) {
+; GVN-LABEL: @test4(
+; GVN-NEXT:    store i64 [[V1:%.*]], ptr [[P:%.*]], align 1
+; GVN-NEXT:    [[TMP1:%.*]] = trunc i64 [[V1]] to i32
+; GVN-NEXT:    [[TMP2:%.*]] = bitcast i32 [[TMP1]] to float
+; GVN-NEXT:    ret float [[TMP2]]
+; Store i64, then load float from %P: expected trunc to i32 followed by a bitcast to float.
+  store i64 %V1, ptr %P, align 1
+  %V2 = load float, ptr %P, align 1
+  ret float %V2
+}
+
+define i8 @test5(ptr %P, ptr %T) {
+; GVN-LABEL: @test5(
+; GVN-NEXT:    [[V1:%.*]] = load i8, ptr [[T:%.*]], align 1
+; GVN-NEXT:    [[P2:%.*]] = getelementptr i8, ptr [[P:%.*]], i32 2
+; GVN-NEXT:    store i8 [[V1]], ptr [[P2]], align 1
+; GVN-NEXT:    ret i8 [[V1]]
+; Same-type i8 store and load through %P2: the load is expected to be replaced by the stored %V1.
+  %V1 = load i8, ptr %T, align 1
+  %P2 = getelementptr i8, ptr %P, i32 2
+  store i8 %V1, ptr %P2, align 1
+  %V2 = load i8, ptr %P2, align 1
+  ret i8 %V2
+}
+
+define ptr @test6(i64 %V1, ptr %P) {
+; GVN-LABEL: @test6(
+; GVN-NEXT:    store i64 [[V1:%.*]], ptr [[P:%.*]], align 1
+; GVN-NEXT:    [[TMP1:%.*]] = inttoptr i64 [[V1]] to ptr
+; GVN-NEXT:    ret ptr [[TMP1]]
+; Store i64, then load ptr from %P: the load is expected to become an inttoptr of %V1.
+  store i64 %V1, ptr %P, align 1
+  %V2 = load ptr, ptr %P, align 1
+  ret ptr %V2
+}
+
+define i32 @test7(double %V1, ptr %P) {
+; GVN-LABEL: @test7(
+; GVN-NEXT:    store double [[V1:%.*]], ptr [[P:%.*]], align 1
+; GVN-NEXT:    [[TMP1:%.*]] = bitcast double [[V1]] to i64
+; GVN-NEXT:    [[TMP2:%.*]] = trunc i64 [[TMP1]] to i32
+; GVN-NEXT:    ret i32 [[TMP2]]
+; Store double, then load i32 from %P: expected bitcast to i64 followed by a trunc to i32.
+  store double %V1, ptr %P, align 1
+  %V2 = load i32, ptr %P, align 1
+  ret i32 %V2
+}
+
+define i8 @test8(i32 %V1, ptr %P) {
+; OLDGVN-LABEL: @test8(
+; OLDGVN-NEXT:    store i32 [[V1:%.*]], ptr [[P:%.*]], align 1
+; OLDGVN-NEXT:    [[P1:%.*]] = getelementptr i8, ptr [[P]], i32 2
+; OLDGVN-NEXT:    [[TMP1:%.*]] = lshr i32 [[V1]], 16
+; OLDGVN-NEXT:    [[TMP2:%.*]] = trunc i32 [[TMP1]] to i8
+; OLDGVN-NEXT:    ret i8 [[TMP2]]
+;
+; NEWGVN-LABEL: @test8(
+; NEWGVN-NEXT:    store i32 [[V1:%.*]], ptr [[P:%.*]], align 1
+; NEWGVN-NEXT:    [[TMP1:%.*]] = lshr i32 [[V1]], 16
+; NEWGVN-NEXT:    [[TMP2:%.*]] = trunc i32 [[TMP1]] to i8
+; NEWGVN-NEXT:    [[P1:%.*]] = getelementptr i8, ptr [[P]], i32 2
+; NEWGVN-NEXT:    ret i8 [[TMP2]]
+; Store i32 at %P, then load i8 at byte offset 2: expected lshr by 16 and trunc; the two passes only place the GEP differently.
+  store i32 %V1, ptr %P, align 1
+  %P1 = getelementptr i8, ptr %P, i32 2
+  %V2 = load i8, ptr %P1, align 1
+  ret i8 %V2
+}
+
+define double @test9(i64 %V, ptr %P, i1 %cond) {
+;   Entry
+;    /  \
+;   T    F
+;
+; GVN-LABEL: @test9(
+; GVN-NEXT:  Entry:
+; GVN-NEXT:    store i64 [[V:%.*]], ptr [[P:%.*]], align 1
+; GVN-NEXT:    [[TMP0:%.*]] = bitcast i64 [[V]] to double
+; GVN-NEXT:    br i1 [[COND:%.*]], label [[T:%.*]], label [[F:%.*]]
+; GVN:       T:
+; GVN-NEXT:    ret double [[TMP0]]
+; GVN:       F:
+; GVN-NEXT:    ret double [[TMP0]]
+; Store in Entry, double loads in both successors T and F: both are expected to use one bitcast emitted in Entry.
+Entry:
+  %A = load i64 , ptr %P, align 1
+  store i64 %V, ptr %P, align 1
+  br i1 %cond, label %T, label %F
+T:
+  %B = load double, ptr %P, align 1
+  ret double %B
+
+F:
+  %C = load double, ptr %P, align 1
+  ret double %C
+}
+
+define <{i8, float}> @test10(i32 %V0, ptr %P) {
+; OLDGVN-LABEL: @test10(
+; OLDGVN-NEXT:    store i32 [[V0:%.*]], ptr [[P:%.*]], align 1
+; OLDGVN-NEXT:    [[TMP1:%.*]] = bitcast i32 [[V0]] to float
+; OLDGVN-NEXT:    [[TMP2:%.*]] = trunc i32 [[V0]] to i8
+; OLDGVN-NEXT:    [[I1:%.*]] = insertvalue <{ i8, float }> poison, i8 [[TMP2]], 0
+; OLDGVN-NEXT:    [[I2:%.*]] = insertvalue <{ i8, float }> [[I1]], float [[TMP1]], 1
+; OLDGVN-NEXT:    ret <{ i8, float }> [[I2]]
+;
+; NEWGVN-LABEL: @test10(
+; NEWGVN-NEXT:    store i32 [[V0:%.*]], ptr [[P:%.*]], align 1
+; NEWGVN-NEXT:    [[TMP1:%.*]] = trunc i32 [[V0]] to i8
+; NEWGVN-NEXT:    [[TMP2:%.*]] = bitcast i32 [[V0]] to float
+; NEWGVN-NEXT:    [[I1:%.*]] = insertvalue <{ i8, float }> poison, i8 [[TMP1]], 0
+; NEWGVN-NEXT:    [[I2:%.*]] = insertvalue <{ i8, float }> [[I1]], float [[TMP2]], 1
+; NEWGVN-NEXT:    ret <{ i8, float }> [[I2]]
+; One i32 store feeds a float load and an i8 load: expected bitcast and trunc of %V0, emitted in opposite order by the two passes.
+  store i32 %V0, ptr %P, align 1
+  %V1 = load float, ptr %P, align 1
+  %V2 = load i8, ptr %P, align 1
+  %I1 = insertvalue <{i8, float}> poison, i8 %V2, 0
+  %I2 = insertvalue <{i8, float}> %I1, float %V1, 1
+  ret <{i8, float}> %I2
+}
+
+define <{i8, float}> @test11(i32 %V0, ptr %P, i1 %cond) {
+;   Entry
+;    /  \
+;   T    F
+;
+; GVN-LABEL: @test11(
+; GVN-NEXT:  Entry:
+; GVN-NEXT:    store i32 [[V0:%.*]], ptr [[P:%.*]], align 1
+; GVN-NEXT:    [[TMP0:%.*]] = trunc i32 [[V0]] to i8
+; GVN-NEXT:    [[TMP1:%.*]] = bitcast i32 [[V0]] to float
+; GVN-NEXT:    br i1 [[COND:%.*]], label [[T:%.*]], label [[F:%.*]]
+; GVN:       T:
+; GVN-NEXT:    [[I1:%.*]] = insertvalue <{ i8, float }> poison, float [[TMP1]], 1
+; GVN-NEXT:    ret <{ i8, float }> [[I1]]
+; GVN:       F:
+; GVN-NEXT:    [[I2:%.*]] = insertvalue <{ i8, float }> poison, i8 [[TMP0]], 0
+; GVN-NEXT:    ret <{ i8, float }> [[I2]]
+; One i32 store in Entry feeds a float load in T and an i8 load in F: both conversions are expected in Entry.
+Entry:
+  store i32 %V0, ptr %P, align 1
+  br i1 %cond, label %T, label %F
+
+T:
+  %V1 = load float, ptr %P, align 1
+  %I1 = insertvalue <{i8, float}> poison, float %V1, 1
+  ret <{i8, float}> %I1
+
+F:
+  %V2 = load i8, ptr %P, align 1
+  %I2 = insertvalue <{i8, float}> poison, i8 %V2, 0
+  ret <{i8, float}> %I2
+}
+
+define <{float, float}> @test12(i32 %V0, ptr %P, i1 %cond) {
+;   Entry
+;    /  \
+;   T    F
+;
+; GVN-LABEL: @test12(
+; GVN-NEXT:  Entry:
+; GVN-NEXT:    store i32 [[V0:%.*]], ptr [[P:%.*]], align 1
+; GVN-NEXT:    [[TMP0:%.*]] = bitcast i32 [[V0]] to float
+; GVN-NEXT:    br i1 [[COND:%.*]], label [[T:%.*]], label [[F:%.*]]
+; GVN:       T:
+; GVN-NEXT:    [[I1:%.*]] = insertvalue <{ float, float }> poison, float [[TMP0]], 1
+; GVN-NEXT:    ret <{ float, float }> [[I1]]
+; GVN:       F:
+; GVN-NEXT:    [[I2:%.*]] = insertvalue <{ float, float }> poison, float [[TMP0]], 0
+; GVN-NEXT:    ret <{ float, float }> [[I2]]
+; The same float load appears in T and F: both are expected to share a single bitcast of %V0 in Entry.
+Entry:
+  store i32 %V0, ptr %P, align 1
+  br i1 %cond, label %T, label %F
+
+T:
+  %V1 = load float, ptr %P, align 1
+  %I1 = insertvalue <{float, float}> poison, float %V1, 1
+  ret <{float, float}> %I1
+
+F:
+  %V2 = load float, ptr %P, align 1
+  %I2 = insertvalue <{float, float}> poison, float %V2, 0
+  ret <{float, float}> %I2
+}
+
+define i8 @test13(ptr %P, i32 %V1) {
+; GVN-LABEL: @test13(
+; GVN-NEXT:    store i32 [[V1:%.*]], ptr [[P:%.*]], align 1
+; GVN-NEXT:    [[TMP1:%.*]] = trunc i32 [[V1]] to i8
+; GVN-NEXT:    [[V5:%.*]] = add i8 [[TMP1]], [[TMP1]]
+; GVN-NEXT:    ret i8 [[V5]]
+; The coerced i8 load and the explicit trunc of %V1 are expected to become one value; the unused i64 load disappears.
+  store i32 %V1, ptr %P, align 1
+  %V2 = load i8, ptr %P, align 1
+  %V3 = load i64, ptr %P, align 1
+  %V4 = trunc i32 %V1 to i8
+  %V5 = add i8 %V2, %V4
+  ret i8 %V5
+}
+
+define i8 @test14(ptr %P, i32 %V1) {
+; GVN-LABEL: @test14(
+; GVN-NEXT:    store i32 [[V1:%.*]], ptr [[P:%.*]], align 1
+; GVN-NEXT:    [[TMP1:%.*]] = trunc i32 [[V1]] to i8
+; GVN-NEXT:    [[V5:%.*]] = add i8 [[TMP1]], [[TMP1]]
+; GVN-NEXT:    ret i8 [[V5]]
+; Two identical i8 loads after an i32 store: both are expected to be replaced by a single trunc of %V1.
+  store i32 %V1, ptr %P, align 1
+  %V2 = load i8, ptr %P, align 1
+  %V3 = load i8, ptr %P, align 1
+  %V5 = add i8 %V2, %V3
+  ret i8 %V5
+}
+
+define i16 @test15(ptr %P, i1 %Cond) {
+;   Entry
+;    /  \
+;   T    F
+;    \  /
+;     vv
+;    Exit
+;
+; GVN-LABEL: @test15(
+; GVN-NEXT:  Entry:
+; GVN-NEXT:    store i32 13, ptr [[P:%.*]], align 1
+; GVN-NEXT:    br i1 [[COND:%.*]], label [[T:%.*]], label [[F:%.*]]
+; GVN:       T:
+; GVN-NEXT:    br label [[EXIT:%.*]]
+; GVN:       F:
+; GVN-NEXT:    br label [[EXIT]]
+; GVN:       Exit:
+; GVN-NEXT:    ret i16 13
+; Store of constant 13 in Entry: the unused loads in T and F vanish and the i16 load in Exit folds to the constant 13.
+Entry:
+  store i32 13, ptr %P, align 1
+  br i1 %Cond, label %T, label %F
+
+T:
+  %V1 = load i16, ptr %P, align 1
+  br label %Exit
+
+F:
+  %V2 = load i32, ptr %P, align 1
+  br label %Exit
+
+Exit:
+  %V3 = load i16, ptr %P, align 1
+  ret i16 %V3
+}
+
+define i64 @test16(ptr %V1) {
+; GVN-LABEL: @test16(
+; GVN-NEXT:  Entry:
+; GVN-NEXT:    store ptr [[V1:%.*]], ptr inttoptr (i64 16 to ptr), align 8
+; GVN-NEXT:    [[V3:%.*]] = load i64, ptr [[V1]], align 4
+; GVN-NEXT:    ret i64 [[V3]]
+; The pointer loaded back from a constant address is expected to be replaced by the stored %V1 in the dependent load.
+Entry:
+  store ptr %V1, ptr inttoptr (i64 16 to ptr), align 8
+  %V2 = load ptr, ptr inttoptr (i64 16 to ptr), align 8
+  %V3 = load i64, ptr %V2
+  ret i64 %V3
+}
+
+declare i32 @foo1(ptr, i32) #0
+
+define i32 @test17(ptr %P, i32 %V1) {
+; GVN-LABEL: @test17(
+; GVN-NEXT:    store i32 [[V1:%.*]], ptr [[P:%.*]], align 1
+; GVN-NEXT:    [[V2:%.*]] = call i32 @foo1(ptr [[P]], i32 [[V1]]) #[[ATTR0:[0-9]+]]
+; GVN-NEXT:    [[V3:%.*]] = load i32, ptr [[P]], align 1
+; GVN-NEXT:    [[V4:%.*]] = add i32 [[V2]], [[V3]]
+; GVN-NEXT:    ret i32 [[V4]]
+; The call between the store and the load is only marked willreturn (attribute #0); the load is expected to remain.
+  store i32 %V1, ptr %P, align 1
+  %V2 = call i32 @foo1(ptr %P, i32 %V1) #0
+  %V3 = load i32, ptr %P, align 1
+  %V4 = add i32 %V2, %V3
+  ret i32 %V4
+}
+
+declare i32 @foo2(ptr, i32) #1
+
+define i32 @test18(ptr %P, i32 %V1) {
+; GVN-LABEL: @test18(
+; GVN-NEXT:    store i32 [[V1:%.*]], ptr [[P:%.*]], align 1
+; GVN-NEXT:    [[V2:%.*]] = call i32 @foo2(ptr [[P]], i32 [[V1]]) #[[ATTR1:[0-9]+]]
+; GVN-NEXT:    [[V4:%.*]] = add i32 [[V2]], [[V1]]
+; GVN-NEXT:    ret i32 [[V4]]
+; The call between the store and the load is marked readonly (attribute #1); the load is expected to be replaced by %V1.
+  store i32 %V1, ptr %P, align 1
+  %V2 = call i32 @foo2(ptr %P, i32 %V1) #1
+  %V3 = load i32, ptr %P, align 1
+  %V4 = add i32 %V2, %V3
+  ret i32 %V4
+}
+
+attributes #0 = { willreturn }
+attributes #1 = { readonly }
diff --git a/llvm/test/Transforms/NewGVN/pr14166-xfail.ll b/llvm/test/Transforms/NewGVN/pr14166-xfail.ll
index 1e722361d7c545f..ceb9d2c67e02c4d 100644
--- a/llvm/test/Transforms/NewGVN/pr14166-xfail.ll
+++ b/llvm/test/Transforms/NewGVN/pr14166-xfail.ll
@@ -1,4 +1,3 @@
-; XFAIL: *
 ; RUN: opt -disable-basic-aa -passes=newgvn -S < %s | FileCheck %s
 ; NewGVN fails this due to missing load coercion
 target datalayout = "e-p:32:32:32"