[llvm] [X86][SimplifyCFG] Support hoisting load/store with conditional faulting (PR #96878)

Shengchen Kan via llvm-commits llvm-commits at lists.llvm.org
Sat Aug 3 09:45:28 PDT 2024


https://github.com/KanRobert updated https://github.com/llvm/llvm-project/pull/96878

>From 8416d9fe4ae22540b30435477776118411cb14ce Mon Sep 17 00:00:00 2001
From: Shengchen Kan <shengchen.kan at intel.com>
Date: Thu, 27 Jun 2024 17:10:00 +0800
Subject: [PATCH 1/3] [X86][SimplifyCFG] Support hoisting load/store with
 conditional faulting

---
 .../Transforms/Utils/SimplifyCFGOptions.h     |  10 +
 llvm/lib/CodeGen/TargetPassConfig.cpp         |   8 +
 llvm/lib/Passes/PassBuilder.cpp               |   2 +
 .../lib/Transforms/Scalar/SimplifyCFGPass.cpp |  25 +
 llvm/lib/Transforms/Utils/SimplifyCFG.cpp     | 243 ++++++-
 llvm/test/CodeGen/AArch64/O3-pipeline.ll      |   2 +
 llvm/test/CodeGen/AMDGPU/llc-pipeline.ll      |   8 +
 llvm/test/CodeGen/ARM/O3-pipeline.ll          |   2 +
 llvm/test/CodeGen/LoongArch/opt-pipeline.ll   |   2 +
 llvm/test/CodeGen/PowerPC/O3-pipeline.ll      |   2 +
 llvm/test/CodeGen/RISCV/O3-pipeline.ll        |   2 +
 .../CodeGen/X86/hoist-loads-stores-with-cf.ll |  32 +
 llvm/test/CodeGen/X86/opt-pipeline.ll         |   2 +
 llvm/test/Other/new-pm-print-pipeline.ll      |   4 +-
 .../X86/hoist-loads-stores-with-cf.ll         | 623 ++++++++++++++++++
 15 files changed, 960 insertions(+), 7 deletions(-)
 create mode 100644 llvm/test/CodeGen/X86/hoist-loads-stores-with-cf.ll
 create mode 100644 llvm/test/Transforms/SimplifyCFG/X86/hoist-loads-stores-with-cf.ll

diff --git a/llvm/include/llvm/Transforms/Utils/SimplifyCFGOptions.h b/llvm/include/llvm/Transforms/Utils/SimplifyCFGOptions.h
index 2ea9d64f03cb6..3ddc3d6d00893 100644
--- a/llvm/include/llvm/Transforms/Utils/SimplifyCFGOptions.h
+++ b/llvm/include/llvm/Transforms/Utils/SimplifyCFGOptions.h
@@ -27,10 +27,12 @@ struct SimplifyCFGOptions {
   bool ConvertSwitchToLookupTable = false;
   bool NeedCanonicalLoop = true;
   bool HoistCommonInsts = false;
+  bool HoistLoadsStoresWithCondFaulting = false;
   bool SinkCommonInsts = false;
   bool SimplifyCondBranch = true;
   bool SpeculateBlocks = true;
   bool SpeculateUnpredictables = false;
+  bool RunInCodeGen = false;
 
   AssumptionCache *AC = nullptr;
 
@@ -59,6 +61,10 @@ struct SimplifyCFGOptions {
     HoistCommonInsts = B;
     return *this;
   }
+  SimplifyCFGOptions &hoistLoadsStoresWithCondFaulting(bool B) {
+    HoistLoadsStoresWithCondFaulting = B;
+    return *this;
+  }
   SimplifyCFGOptions &sinkCommonInsts(bool B) {
     SinkCommonInsts = B;
     return *this;
@@ -80,6 +86,10 @@ struct SimplifyCFGOptions {
     SpeculateUnpredictables = B;
     return *this;
   }
+  SimplifyCFGOptions &runInCodeGen(bool B) {
+    RunInCodeGen = B;
+    return *this;
+  }
 };
 
 } // namespace llvm
diff --git a/llvm/lib/CodeGen/TargetPassConfig.cpp b/llvm/lib/CodeGen/TargetPassConfig.cpp
index eb74c6f3e5d44..fcbd7cafaf303 100644
--- a/llvm/lib/CodeGen/TargetPassConfig.cpp
+++ b/llvm/lib/CodeGen/TargetPassConfig.cpp
@@ -47,6 +47,7 @@
 #include "llvm/Target/CGPassBuilderOption.h"
 #include "llvm/Target/TargetMachine.h"
 #include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Scalar/SimplifyCFG.h"
 #include "llvm/Transforms/Utils.h"
 #include <cassert>
 #include <optional>
@@ -852,6 +853,13 @@ void TargetPassConfig::addIRPasses() {
       !DisableAtExitBasedGlobalDtorLowering)
     addPass(createLowerGlobalDtorsLegacyPass());
 
+  // Hoist loads/stores to reduce branches when profitable.
+  if (getOptLevel() != CodeGenOptLevel::None)
+    addPass(createCFGSimplificationPass(
+        SimplifyCFGOptions()
+            .runInCodeGen(true)
+            .hoistLoadsStoresWithCondFaulting(true)));
+
   // Make sure that no unreachable blocks are instruction selected.
   addPass(createUnreachableBlockEliminationPass());
 
diff --git a/llvm/lib/Passes/PassBuilder.cpp b/llvm/lib/Passes/PassBuilder.cpp
index 5dbb1e2f49871..e981db0f88366 100644
--- a/llvm/lib/Passes/PassBuilder.cpp
+++ b/llvm/lib/Passes/PassBuilder.cpp
@@ -843,6 +843,8 @@ Expected<SimplifyCFGOptions> parseSimplifyCFGOptions(StringRef Params) {
       Result.needCanonicalLoops(Enable);
     } else if (ParamName == "hoist-common-insts") {
       Result.hoistCommonInsts(Enable);
+    } else if (ParamName == "hoist-loads-stores-with-cond-faulting") {
+      Result.hoistLoadsStoresWithCondFaulting(Enable);
     } else if (ParamName == "sink-common-insts") {
       Result.sinkCommonInsts(Enable);
     } else if (ParamName == "speculate-unpredictables") {
diff --git a/llvm/lib/Transforms/Scalar/SimplifyCFGPass.cpp b/llvm/lib/Transforms/Scalar/SimplifyCFGPass.cpp
index 11de37f7a7c10..4ff2fff17a460 100644
--- a/llvm/lib/Transforms/Scalar/SimplifyCFGPass.cpp
+++ b/llvm/lib/Transforms/Scalar/SimplifyCFGPass.cpp
@@ -73,6 +73,11 @@ static cl::opt<bool> UserHoistCommonInsts(
     "hoist-common-insts", cl::Hidden, cl::init(false),
     cl::desc("hoist common instructions (default = false)"));
 
+static cl::opt<bool> UserHoistLoadsStoresWithCondFaulting(
+    "hoist-loads-stores-with-cond-faulting", cl::Hidden, cl::init(false),
+    cl::desc("Hoist loads/stores if the target supports conditional faulting "
+             "(default = false)"));
+
 static cl::opt<bool> UserSinkCommonInsts(
     "sink-common-insts", cl::Hidden, cl::init(false),
     cl::desc("Sink common instructions (default = false)"));
@@ -272,6 +277,21 @@ static bool simplifyFunctionCFGImpl(Function &F, const TargetTransformInfo &TTI,
                                     const SimplifyCFGOptions &Options) {
   DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Eager);
 
+  // In codegen, we use unreachableblockelim to remove dead blocks because it
+  // doesn't really have any well-defined semantics for unreachable code.
+  //
+  // The first character is not capitalized to reduce the code diff for now.
+  auto removeUnreachableBlocks = [&](Function &F, DomTreeUpdater *DTU = nullptr,
+                                     MemorySSAUpdater *MSSAU = nullptr) {
+    return !Options.RunInCodeGen &&
+           llvm::removeUnreachableBlocks(F, DTU, MSSAU);
+  };
+
+  // Only run simplifycfg in codegen if we'd like to hoist loads/stores.
+  if (Options.RunInCodeGen && (!Options.HoistLoadsStoresWithCondFaulting ||
+                               !TTI.hasConditionalLoadStoreForType()))
+    return false;
+
   bool EverChanged = removeUnreachableBlocks(F, DT ? &DTU : nullptr);
   EverChanged |=
       tailMergeBlocksWithSimilarFunctionTerminators(F, DT ? &DTU : nullptr);
@@ -326,6 +346,9 @@ static void applyCommandLineOverridesToOptions(SimplifyCFGOptions &Options) {
     Options.NeedCanonicalLoop = UserKeepLoops;
   if (UserHoistCommonInsts.getNumOccurrences())
     Options.HoistCommonInsts = UserHoistCommonInsts;
+  if (UserHoistLoadsStoresWithCondFaulting.getNumOccurrences())
+    Options.HoistLoadsStoresWithCondFaulting =
+        UserHoistLoadsStoresWithCondFaulting;
   if (UserSinkCommonInsts.getNumOccurrences())
     Options.SinkCommonInsts = UserSinkCommonInsts;
   if (UserSpeculateUnpredictables.getNumOccurrences())
@@ -354,6 +377,8 @@ void SimplifyCFGPass::printPipeline(
      << "switch-to-lookup;";
   OS << (Options.NeedCanonicalLoop ? "" : "no-") << "keep-loops;";
   OS << (Options.HoistCommonInsts ? "" : "no-") << "hoist-common-insts;";
+  OS << (Options.HoistLoadsStoresWithCondFaulting ? "" : "no-")
+     << "hoist-loads-stores-with-cond-faulting;";
   OS << (Options.SinkCommonInsts ? "" : "no-") << "sink-common-insts;";
   OS << (Options.SpeculateBlocks ? "" : "no-") << "speculate-blocks;";
   OS << (Options.SimplifyCondBranch ? "" : "no-") << "simplify-cond-branch;";
diff --git a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
index ccdfe47ef81e7..8ebe2acdcc667 100644
--- a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
+++ b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
@@ -117,6 +117,12 @@ static cl::opt<bool>
     HoistCommon("simplifycfg-hoist-common", cl::Hidden, cl::init(true),
                 cl::desc("Hoist common instructions up to the parent block"));
 
+static cl::opt<bool> HoistLoadsStoresWithCondFaulting(
+    "simplifycfg-hoist-loads-stores-with-cond-faulting", cl::Hidden,
+    cl::init(true),
+    cl::desc("Hoist loads/stores if the target supports "
+             "conditional faulting"));
+
 static cl::opt<unsigned>
     HoistCommonSkipLimit("simplifycfg-hoist-common-skip-limit", cl::Hidden,
                          cl::init(20),
@@ -275,6 +281,9 @@ class SimplifyCFGOpt {
   bool hoistSuccIdenticalTerminatorToSwitchOrIf(
       Instruction *TI, Instruction *I1,
       SmallVectorImpl<Instruction *> &OtherSuccTIs);
+  bool
+  hoistLoadStoreWithCondFaultingFromSuccessors(BranchInst *BI,
+                                               BasicBlock *ThenBB = nullptr);
   bool speculativelyExecuteBB(BranchInst *BI, BasicBlock *ThenBB);
   bool simplifyTerminatorOnSelect(Instruction *OldTerm, Value *Cond,
                                   BasicBlock *TrueBB, BasicBlock *FalseBB,
@@ -2978,6 +2987,218 @@ static bool isProfitableToSpeculate(const BranchInst *BI, bool Invert,
   return BIEndProb < Likely;
 }
 
+/// Hoist load/store instructions from the conditional successor blocks up into
+/// the block.
+///
+/// \param BI conditional branch instruction for the successor blocks.
+/// \param ThenBB the successor block to hoist from. If NULL, both of the
+///        successors are considered.
+///
+/// We are looking for code like the following:
+/// \code
+///   BB:
+///     ...
+///     %cond = icmp ult %x, %y
+///     br i1 %cond, label %TrueBB, label %FalseBB
+///   FalseBB:
+///     store i32 1, ptr %q, align 4
+///     ...
+///   TrueBB:
+///     %0 = load i32, ptr %b, align 4
+///     store i32 %0, ptr %p, align 4
+///     ...
+/// \endcode
+///
+/// We are going to transform this into:
+///
+/// \code
+///   BB:
+///     ...
+///     %cond = icmp ult %x, %y
+///     %0 = cload i32, ptr %b, %cond
+///     cstore i32 %0, ptr %p, %cond
+///     cstore i32 1, ptr %q, ~%cond
+///     br i1 %cond, label %TrueBB, label %FalseBB
+///   FalseBB:
+///     ...
+///   TrueBB:
+///     ...
+/// \endcode
+///
+/// where cload/cstore is represented by intrinsics like llvm.masked.load/store,
+/// e.g.
+///
+/// \code
+///   %vcond = bitcast i1 %cond to <1 x i1>
+///   %v0 = call <1 x i32> @llvm.masked.load.v1i32.p0
+///                         (ptr %b, i32 4, <1 x i1> %vcond, <1 x i32> poison)
+///   %0 = bitcast <1 x i32> %v0 to i32
+///   call void @llvm.masked.store.v1i32.p0
+///                         (<1 x i32> %v0, ptr %p, i32 4, <1 x i1> %vcond)
+///   %cond.not = xor i1 %cond, true
+///   %vcond.not = bitcast i1 %cond.not to <1 x i1>
+///   call void @llvm.masked.store.v1i32.p0
+///              (<1 x i32> <i32 1>, ptr %q, i32 4, <1 x i1> %vcond.not)
+/// \endcode
+///
+/// \returns true if any load/store is hoisted.
+///
+/// Note that this transform should be run
+/// * before SpeculativelyExecuteBB so that the latter has more opportunities.
+/// * after hoistCommonCodeFromSuccessors to ensure unconditional loads/stores
+///   are handled first.
+bool SimplifyCFGOpt::hoistLoadStoreWithCondFaultingFromSuccessors(
+    BranchInst *BI, BasicBlock *ThenBB) {
+  if (!HoistLoadsStoresWithCondFaulting ||
+      !Options.HoistLoadsStoresWithCondFaulting ||
+      !TTI.hasConditionalLoadStoreForType())
+    return false;
+
+  if (!BI || !BI->isConditional())
+    return false;
+
+  if (!isProfitableToSpeculate(BI, ThenBB != BI->getSuccessor(0), TTI))
+    return false;
+
+  SmallVector<BasicBlock *, 2> Successors;
+  if (ThenBB)
+    Successors.push_back(ThenBB);
+  else
+    Successors.append({BI->getSuccessor(0), BI->getSuccessor(1)});
+
+  // If either of the blocks has its address taken, then we can't do this fold,
+  // because the code we'd hoist would no longer run when we jump into the block
+  // by its address.
+  for (auto *Succ : Successors)
+    if (Succ->hasAddressTaken())
+      return false;
+
+  // Do not use isa<AllocaInst>(getUnderlyingObject(I.getOperand(0))) to avoid
+  // having to check that all intermediate operands dominate the branch.
+  auto IsLoadFromAlloca = [](const Instruction &I) {
+    return isa<LoadInst>(I) && isa<AllocaInst>((I.getOperand(0)));
+  };
+
+  // Collect hoisted loads/stores.
+  SmallSetVector<Instruction *, 4> HoistedInsts;
+  // Do not hoist a load/store if
+  // 1. the target lacks a corresponding conditional faulting load/store.
+  // 2. it is volatile or atomic.
+  // 3. there is a load/store in the same BB that cannot be hoisted.
+  // 4. there is a non-load/store in the same BB that is not safe to
+  //    speculatively execute.
+  // 5. any of its operands does not dominate the branch.
+  // 6. it is a store and a memory read is skipped.
+  auto HoistInstsInBB = [&](BasicBlock *BB) {
+    bool SkipMemoryRead = false;
+    // A cheaper way to check domination: an operand dominates the
+    // BranchInst if
+    // 1. it is not defined in the same BB as the instruction, or
+    // 2. it is about to be hoisted,
+    // because the block containing the BranchInst is BB's only predecessor
+    // and the BranchInst does not define any value.
+    auto OpsDominatesBranch = [&](Instruction &I) {
+      return llvm::all_of(I.operands(), [&](Value *Op) {
+        if (auto *J = dyn_cast<Instruction>(Op)) {
+          if (HoistedInsts.contains(J))
+            return true;
+          if (J->getParent() == I.getParent())
+            return false;
+        }
+        return true;
+      });
+    };
+    for (auto &I : *BB) {
+      auto *LI = dyn_cast<LoadInst>(&I);
+      auto *SI = dyn_cast<StoreInst>(&I);
+      if (LI || SI) {
+        bool IsSimple = (LI && LI->isSimple()) || (SI && SI->isSimple());
+        if (!IsSimple || !OpsDominatesBranch(I))
+          return false;
+        auto *Type = getLoadStoreType(&I);
+        // A load from an alloca is always safe.
+        if (!IsLoadFromAlloca(I) && !TTI.hasConditionalLoadStoreForType(Type))
+          return false;
+        // Conservative aliasing check.
+        if (SI && SkipMemoryRead)
+          return false;
+        HoistedInsts.insert(&I);
+      } else if (!I.isTerminator() && !isSafeToSpeculativelyExecute(&I))
+        return false;
+      else if (I.mayReadFromMemory())
+        SkipMemoryRead = true;
+    }
+    return true;
+  };
+
+  for (auto *Succ : Successors)
+    if (!HoistInstsInBB(Succ))
+      return false;
+
+  if (HoistedInsts.empty())
+    return false;
+
+  // Put newly added instructions before the BranchInst.
+  IRBuilder<> Builder(BI);
+  auto &Context = BI->getParent()->getContext();
+  auto *VCondTy = FixedVectorType::get(Type::getInt1Ty(Context), 1);
+  auto *Cond = BI->getOperand(0);
+  auto *VCond = Builder.CreateBitCast(Cond, VCondTy);
+  Value *VCondNot = nullptr;
+  for (auto *I : HoistedInsts) {
+    // Only need to move the position for load from alloca.
+    if (IsLoadFromAlloca(*I)) {
+      I->moveBefore(BI);
+      continue;
+    }
+
+    bool InvertCond = I->getParent() == BI->getSuccessor(1);
+    // Construct the inverted condition if needed.
+    if (InvertCond && !VCondNot)
+      VCondNot = Builder.CreateBitCast(
+          Builder.CreateXor(Cond, ConstantInt::getTrue(Context)), VCondTy);
+
+    auto *Mask = InvertCond ? VCondNot : VCond;
+    auto *Op0 = I->getOperand(0);
+    if (auto *LI = dyn_cast<LoadInst>(I)) {
+      // Load
+      auto *Ty = I->getType();
+      // NOTE: Now we assume conditional faulting load/store is supported for
+      // scalar only when creating new instructions, but it's easy to extend it
+      // for vector types in the future.
+      assert(!Ty->isVectorTy() && "not implemented");
+      auto *V0 = Builder.CreateMaskedLoad(FixedVectorType::get(Ty, 1), Op0,
+                                          LI->getAlign(), Mask);
+      auto *S0 = Builder.CreateBitCast(V0, Ty);
+      V0->copyMetadata(*I);
+      I->replaceAllUsesWith(S0);
+    } else {
+      // Store
+      assert(!Op0->getType()->isVectorTy() && "not implemented");
+      auto *StoredVal =
+          Builder.CreateBitCast(Op0, FixedVectorType::get(Op0->getType(), 1));
+      auto *VStore = Builder.CreateMaskedStore(
+          StoredVal, I->getOperand(1), cast<StoreInst>(I)->getAlign(), Mask);
+      VStore->copyMetadata(*I);
+      // FIXME: DIAssignID is not supported for masked store yet.
+      // (Verifier::visitDIAssignIDMetadata)
+      at::deleteAssignmentMarkers(VStore);
+      VStore->eraseMetadataIf([](unsigned MDKind, MDNode *Node) {
+        return Node->getMetadataID() == Metadata::DIAssignIDKind;
+      });
+    }
+  }
+
+  // Erase the hoisted instructions in reverse order to avoid
+  // use-without-def errors.
+  std::for_each(HoistedInsts.rbegin(), HoistedInsts.rend(), [&](auto I) {
+    if (!IsLoadFromAlloca(*I))
+      I->eraseFromParent();
+  });
+
+  return true;
+}
+
 /// Speculate a conditional basic block flattening the CFG.
 ///
 /// Note that this is a very risky transform currently. Speculating
@@ -7519,31 +7740,43 @@ bool SimplifyCFGOpt::simplifyCondBranch(BranchInst *BI, IRBuilder<> &Builder) {
     return requestResimplify();
 
   // We have a conditional branch to two blocks that are only reachable
-  // from BI.  We know that the condbr dominates the two blocks, so see if
-  // there is any identical code in the "then" and "else" blocks.  If so, we
-  // can hoist it up to the branching block.
+  // from BI.  We know that the condbr dominates the two blocks, so check
+  //
+  // * whether there is any identical code in the "then" and "else" blocks.
+  // * whether the "then" and "else" blocks contain loads/stores that can be
+  //   hoisted with conditional faulting.
+  // If so, we can hoist it up to the branching block.
   if (BI->getSuccessor(0)->getSinglePredecessor()) {
     if (BI->getSuccessor(1)->getSinglePredecessor()) {
       if (HoistCommon && hoistCommonCodeFromSuccessors(
                              BI->getParent(), !Options.HoistCommonInsts))
         return requestResimplify();
+      if (hoistLoadStoreWithCondFaultingFromSuccessors(BI))
+        return requestResimplify();
     } else {
       // If Successor #1 has multiple preds, we may be able to conditionally
       // execute Successor #0 if it branches to Successor #1.
       Instruction *Succ0TI = BI->getSuccessor(0)->getTerminator();
       if (Succ0TI->getNumSuccessors() == 1 &&
-          Succ0TI->getSuccessor(0) == BI->getSuccessor(1))
+          Succ0TI->getSuccessor(0) == BI->getSuccessor(1)) {
+        if (hoistLoadStoreWithCondFaultingFromSuccessors(BI,
+                                                         BI->getSuccessor(0)))
+          return requestResimplify();
         if (speculativelyExecuteBB(BI, BI->getSuccessor(0)))
           return requestResimplify();
+      }
     }
   } else if (BI->getSuccessor(1)->getSinglePredecessor()) {
     // If Successor #0 has multiple preds, we may be able to conditionally
     // execute Successor #1 if it branches to Successor #0.
     Instruction *Succ1TI = BI->getSuccessor(1)->getTerminator();
     if (Succ1TI->getNumSuccessors() == 1 &&
-        Succ1TI->getSuccessor(0) == BI->getSuccessor(0))
+        Succ1TI->getSuccessor(0) == BI->getSuccessor(0)) {
+      if (hoistLoadStoreWithCondFaultingFromSuccessors(BI, BI->getSuccessor(1)))
+        return requestResimplify();
       if (speculativelyExecuteBB(BI, BI->getSuccessor(1)))
         return requestResimplify();
+    }
   }
 
   // If this is a branch on something for which we know the constant value in
diff --git a/llvm/test/CodeGen/AArch64/O3-pipeline.ll b/llvm/test/CodeGen/AArch64/O3-pipeline.ll
index 017349aa32af3..0f72420127f03 100644
--- a/llvm/test/CodeGen/AArch64/O3-pipeline.ll
+++ b/llvm/test/CodeGen/AArch64/O3-pipeline.ll
@@ -52,7 +52,9 @@
 ; CHECK-NEXT:       Expand memcmp() to load/stores
 ; CHECK-NEXT:       Lower Garbage Collection Instructions
 ; CHECK-NEXT:       Shadow Stack GC Lowering
+; CHECK-NEXT:       Simplify the CFG
 ; CHECK-NEXT:       Remove unreachable blocks from the CFG
+; CHECK-NEXT:       Dominator Tree Construction
 ; CHECK-NEXT:       Natural Loop Information
 ; CHECK-NEXT:       Post-Dominator Tree Construction
 ; CHECK-NEXT:       Branch Probability Analysis
diff --git a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
index 15f23eda241b4..b17819b478d3c 100644
--- a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
+++ b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
@@ -214,7 +214,9 @@
 ; GCN-O1-NEXT:      Lazy Branch Probability Analysis
 ; GCN-O1-NEXT:      Lazy Block Frequency Analysis
 ; GCN-O1-NEXT:      Expand memcmp() to load/stores
+; GCN-O1-NEXT:      Simplify the CFG
 ; GCN-O1-NEXT:      Remove unreachable blocks from the CFG
+; GCN-O1-NEXT:      Dominator Tree Construction
 ; GCN-O1-NEXT:      Natural Loop Information
 ; GCN-O1-NEXT:      Post-Dominator Tree Construction
 ; GCN-O1-NEXT:      Branch Probability Analysis
@@ -500,7 +502,9 @@
 ; GCN-O1-OPTS-NEXT:      Lazy Branch Probability Analysis
 ; GCN-O1-OPTS-NEXT:      Lazy Block Frequency Analysis
 ; GCN-O1-OPTS-NEXT:      Expand memcmp() to load/stores
+; GCN-O1-OPTS-NEXT:      Simplify the CFG
 ; GCN-O1-OPTS-NEXT:      Remove unreachable blocks from the CFG
+; GCN-O1-OPTS-NEXT:      Dominator Tree Construction
 ; GCN-O1-OPTS-NEXT:      Natural Loop Information
 ; GCN-O1-OPTS-NEXT:      Post-Dominator Tree Construction
 ; GCN-O1-OPTS-NEXT:      Branch Probability Analysis
@@ -805,7 +809,9 @@
 ; GCN-O2-NEXT:      Lazy Branch Probability Analysis
 ; GCN-O2-NEXT:      Lazy Block Frequency Analysis
 ; GCN-O2-NEXT:      Expand memcmp() to load/stores
+; GCN-O2-NEXT:      Simplify the CFG
 ; GCN-O2-NEXT:      Remove unreachable blocks from the CFG
+; GCN-O2-NEXT:      Dominator Tree Construction
 ; GCN-O2-NEXT:      Natural Loop Information
 ; GCN-O2-NEXT:      Post-Dominator Tree Construction
 ; GCN-O2-NEXT:      Branch Probability Analysis
@@ -1118,7 +1124,9 @@
 ; GCN-O3-NEXT:      Lazy Branch Probability Analysis
 ; GCN-O3-NEXT:      Lazy Block Frequency Analysis
 ; GCN-O3-NEXT:      Expand memcmp() to load/stores
+; GCN-O3-NEXT:      Simplify the CFG
 ; GCN-O3-NEXT:      Remove unreachable blocks from the CFG
+; GCN-O3-NEXT:      Dominator Tree Construction
 ; GCN-O3-NEXT:      Natural Loop Information
 ; GCN-O3-NEXT:      Post-Dominator Tree Construction
 ; GCN-O3-NEXT:      Branch Probability Analysis
diff --git a/llvm/test/CodeGen/ARM/O3-pipeline.ll b/llvm/test/CodeGen/ARM/O3-pipeline.ll
index aa92c7ab4fb9f..8bccc4d8d9ac8 100644
--- a/llvm/test/CodeGen/ARM/O3-pipeline.ll
+++ b/llvm/test/CodeGen/ARM/O3-pipeline.ll
@@ -30,7 +30,9 @@
 ; CHECK-NEXT:      Expand memcmp() to load/stores
 ; CHECK-NEXT:      Lower Garbage Collection Instructions
 ; CHECK-NEXT:      Shadow Stack GC Lowering
+; CHECK-NEXT:      Simplify the CFG
 ; CHECK-NEXT:      Remove unreachable blocks from the CFG
+; CHECK-NEXT:      Dominator Tree Construction
 ; CHECK-NEXT:      Natural Loop Information
 ; CHECK-NEXT:      Post-Dominator Tree Construction
 ; CHECK-NEXT:      Branch Probability Analysis
diff --git a/llvm/test/CodeGen/LoongArch/opt-pipeline.ll b/llvm/test/CodeGen/LoongArch/opt-pipeline.ll
index c5c5342e303c7..61ff8ccda5904 100644
--- a/llvm/test/CodeGen/LoongArch/opt-pipeline.ll
+++ b/llvm/test/CodeGen/LoongArch/opt-pipeline.ll
@@ -53,7 +53,9 @@
 ; LAXX-NEXT:       Expand memcmp() to load/stores
 ; LAXX-NEXT:       Lower Garbage Collection Instructions
 ; LAXX-NEXT:       Shadow Stack GC Lowering
+; LAXX-NEXT:       Simplify the CFG
 ; LAXX-NEXT:       Remove unreachable blocks from the CFG
+; LAXX-NEXT:       Dominator Tree Construction
 ; LAXX-NEXT:       Natural Loop Information
 ; LAXX-NEXT:       Post-Dominator Tree Construction
 ; LAXX-NEXT:       Branch Probability Analysis
diff --git a/llvm/test/CodeGen/PowerPC/O3-pipeline.ll b/llvm/test/CodeGen/PowerPC/O3-pipeline.ll
index be0fbf3abcda2..9304825f4e642 100644
--- a/llvm/test/CodeGen/PowerPC/O3-pipeline.ll
+++ b/llvm/test/CodeGen/PowerPC/O3-pipeline.ll
@@ -54,7 +54,9 @@
 ; CHECK-NEXT:       Expand memcmp() to load/stores
 ; CHECK-NEXT:       Lower Garbage Collection Instructions
 ; CHECK-NEXT:       Shadow Stack GC Lowering
+; CHECK-NEXT:       Simplify the CFG
 ; CHECK-NEXT:       Remove unreachable blocks from the CFG
+; CHECK-NEXT:       Dominator Tree Construction
 ; CHECK-NEXT:       Natural Loop Information
 ; CHECK-NEXT:       Post-Dominator Tree Construction
 ; CHECK-NEXT:       Branch Probability Analysis
diff --git a/llvm/test/CodeGen/RISCV/O3-pipeline.ll b/llvm/test/CodeGen/RISCV/O3-pipeline.ll
index d6d0cca6ddae7..1cd4075ccfff6 100644
--- a/llvm/test/CodeGen/RISCV/O3-pipeline.ll
+++ b/llvm/test/CodeGen/RISCV/O3-pipeline.ll
@@ -54,7 +54,9 @@
 ; CHECK-NEXT:       Expand memcmp() to load/stores
 ; CHECK-NEXT:       Lower Garbage Collection Instructions
 ; CHECK-NEXT:       Shadow Stack GC Lowering
+; CHECK-NEXT:       Simplify the CFG
 ; CHECK-NEXT:       Remove unreachable blocks from the CFG
+; CHECK-NEXT:       Dominator Tree Construction
 ; CHECK-NEXT:       Natural Loop Information
 ; CHECK-NEXT:       Post-Dominator Tree Construction
 ; CHECK-NEXT:       Branch Probability Analysis
diff --git a/llvm/test/CodeGen/X86/hoist-loads-stores-with-cf.ll b/llvm/test/CodeGen/X86/hoist-loads-stores-with-cf.ll
new file mode 100644
index 0000000000000..b8b4eb3a627e5
--- /dev/null
+++ b/llvm/test/CodeGen/X86/hoist-loads-stores-with-cf.ll
@@ -0,0 +1,32 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc < %s -mtriple=x86_64 -mattr=+cf | FileCheck %s
+
+;; Test that masked.load/store.v1* is generated in simplifycfg and does not fall back to branch+load/store in later passes.
+define void @basic(i32 %a, ptr %b, ptr %p, ptr %q) {
+; CHECK-LABEL: basic:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    testl %edi, %edi
+; CHECK-NEXT:    cfcmovel (%rsi), %eax
+; CHECK-NEXT:    cfcmovel %eax, (%rdx)
+; CHECK-NEXT:    movl $1, %eax
+; CHECK-NEXT:    cfcmovneq %rax, (%rdx)
+; CHECK-NEXT:    movw $2, %ax
+; CHECK-NEXT:    cfcmovnew %ax, (%rcx)
+; CHECK-NEXT:    retq
+entry:
+  %cond = icmp eq i32 %a, 0
+  br i1 %cond, label %if.true, label %if.false
+
+if.false:
+  store i64 1, ptr %p, align 8
+  store i16 2, ptr %q, align 8
+  br label %if.end
+
+if.true:
+  %0 = load i32, ptr %b, align 4
+  store i32 %0, ptr %p, align 4
+  br label %if.end
+
+if.end:
+  ret void
+}
diff --git a/llvm/test/CodeGen/X86/opt-pipeline.ll b/llvm/test/CodeGen/X86/opt-pipeline.ll
index 275a42e02d746..bb400d9696a4f 100644
--- a/llvm/test/CodeGen/X86/opt-pipeline.ll
+++ b/llvm/test/CodeGen/X86/opt-pipeline.ll
@@ -51,7 +51,9 @@
 ; CHECK-NEXT:       Expand memcmp() to load/stores
 ; CHECK-NEXT:       Lower Garbage Collection Instructions
 ; CHECK-NEXT:       Shadow Stack GC Lowering
+; CHECK-NEXT:       Simplify the CFG
 ; CHECK-NEXT:       Remove unreachable blocks from the CFG
+; CHECK-NEXT:       Dominator Tree Construction
 ; CHECK-NEXT:       Natural Loop Information
 ; CHECK-NEXT:       Post-Dominator Tree Construction
 ; CHECK-NEXT:       Branch Probability Analysis
diff --git a/llvm/test/Other/new-pm-print-pipeline.ll b/llvm/test/Other/new-pm-print-pipeline.ll
index f2e80814f347a..12f88d60d66ce 100644
--- a/llvm/test/Other/new-pm-print-pipeline.ll
+++ b/llvm/test/Other/new-pm-print-pipeline.ll
@@ -49,8 +49,8 @@
 ; RUN: opt -disable-output -disable-verify -print-pipeline-passes -passes='function(print<stack-lifetime><may>,print<stack-lifetime><must>)' < %s | FileCheck %s --match-full-lines --check-prefixes=CHECK-17
 ; CHECK-17: function(print<stack-lifetime><may>,print<stack-lifetime><must>)
 
-; RUN: opt -disable-output -disable-verify -print-pipeline-passes -passes='function(simplifycfg<bonus-inst-threshold=5;forward-switch-cond;switch-to-lookup;keep-loops;hoist-common-insts;sink-common-insts;speculate-blocks;simplify-cond-branch;speculate-unpredictables>,simplifycfg<bonus-inst-threshold=7;no-forward-switch-cond;no-switch-to-lookup;no-keep-loops;no-hoist-common-insts;no-sink-common-insts;no-speculate-blocks;no-simplify-cond-branch;no-speculate-unpredictables>)' < %s | FileCheck %s --match-full-lines --check-prefixes=CHECK-18
-; CHECK-18: function(simplifycfg<bonus-inst-threshold=5;forward-switch-cond;no-switch-range-to-icmp;switch-to-lookup;keep-loops;hoist-common-insts;sink-common-insts;speculate-blocks;simplify-cond-branch;speculate-unpredictables>,simplifycfg<bonus-inst-threshold=7;no-forward-switch-cond;no-switch-range-to-icmp;no-switch-to-lookup;no-keep-loops;no-hoist-common-insts;no-sink-common-insts;no-speculate-blocks;no-simplify-cond-branch;no-speculate-unpredictables>)
+; RUN: opt -disable-output -disable-verify -print-pipeline-passes -passes='function(simplifycfg<bonus-inst-threshold=5;forward-switch-cond;switch-to-lookup;keep-loops;hoist-common-insts;hoist-loads-stores-with-cond-faulting;sink-common-insts;speculate-blocks;simplify-cond-branch;speculate-unpredictables>,simplifycfg<bonus-inst-threshold=7;no-forward-switch-cond;no-switch-to-lookup;no-keep-loops;no-hoist-common-insts;no-hoist-loads-stores-with-cond-faulting;no-sink-common-insts;no-speculate-blocks;no-simplify-cond-branch;no-speculate-unpredictables>)' < %s | FileCheck %s --match-full-lines --check-prefixes=CHECK-18
+; CHECK-18: function(simplifycfg<bonus-inst-threshold=5;forward-switch-cond;no-switch-range-to-icmp;switch-to-lookup;keep-loops;hoist-common-insts;hoist-loads-stores-with-cond-faulting;sink-common-insts;speculate-blocks;simplify-cond-branch;speculate-unpredictables>,simplifycfg<bonus-inst-threshold=7;no-forward-switch-cond;no-switch-range-to-icmp;no-switch-to-lookup;no-keep-loops;no-hoist-common-insts;no-hoist-loads-stores-with-cond-faulting;no-sink-common-insts;no-speculate-blocks;no-simplify-cond-branch;no-speculate-unpredictables>)
 
 ; RUN: opt -disable-output -disable-verify -print-pipeline-passes -passes='function(loop-vectorize<no-interleave-forced-only;no-vectorize-forced-only>,loop-vectorize<interleave-forced-only;vectorize-forced-only>)' < %s | FileCheck %s --match-full-lines --check-prefixes=CHECK-19
 ; CHECK-19: function(loop-vectorize<no-interleave-forced-only;no-vectorize-forced-only;>,loop-vectorize<interleave-forced-only;vectorize-forced-only;>)
diff --git a/llvm/test/Transforms/SimplifyCFG/X86/hoist-loads-stores-with-cf.ll b/llvm/test/Transforms/SimplifyCFG/X86/hoist-loads-stores-with-cf.ll
new file mode 100644
index 0000000000000..dc18d2e5055a8
--- /dev/null
+++ b/llvm/test/Transforms/SimplifyCFG/X86/hoist-loads-stores-with-cf.ll
@@ -0,0 +1,623 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -mtriple=x86_64 -mattr=+cf -passes='simplifycfg<hoist-loads-stores-with-cond-faulting>' -simplifycfg-require-and-preserve-domtree=1 -S -simplifycfg-hoist-loads-stores-with-cond-faulting=true | FileCheck %s
+
+;; Redundant bitcasts will be optimized out in the instcombine pass.
+define void @basic(i32 %a, ptr %b, ptr %p, ptr %q) {
+; CHECK-LABEL: @basic(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[COND:%.*]] = icmp eq i32 [[A:%.*]], 0
+; CHECK-NEXT:    [[TMP0:%.*]] = bitcast i1 [[COND]] to <1 x i1>
+; CHECK-NEXT:    [[TMP1:%.*]] = call <1 x i32> @llvm.masked.load.v1i32.p0(ptr [[B:%.*]], i32 4, <1 x i1> [[TMP0]], <1 x i32> poison), !dbg [[DBG8:![0-9]+]]
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast <1 x i32> [[TMP1]] to i32
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i32 [[TMP2]] to <1 x i32>
+; CHECK-NEXT:    call void @llvm.masked.store.v1i32.p0(<1 x i32> [[TMP3]], ptr [[P:%.*]], i32 4, <1 x i1> [[TMP0]])
+; CHECK-NEXT:    [[TMP4:%.*]] = xor i1 [[COND]], true
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast i1 [[TMP4]] to <1 x i1>
+; CHECK-NEXT:    call void @llvm.masked.store.v1i64.p0(<1 x i64> <i64 1>, ptr [[P]], i32 8, <1 x i1> [[TMP5]]), !dbg [[DBG12:![0-9]+]]
+; CHECK-NEXT:    call void @llvm.masked.store.v1i16.p0(<1 x i16> <i16 2>, ptr [[Q:%.*]], i32 8, <1 x i1> [[TMP5]]), !dbg [[DBG12]]
+; CHECK-NEXT:    ret void
+;
+entry:
+  %cond = icmp eq i32 %a, 0
+  br i1 %cond, label %if.true, label %if.false
+
+if.false:
+  store i64 1, ptr %p, align 8, !dbg !8
+  store i16 2, ptr %q, align 8, !dbg !8
+  br label %if.end
+
+if.true:
+  %0 = load i32, ptr %b, align 4,  !dbg !9
+  store i32 %0, ptr %p, align 4, !DIAssignID !13
+  br label %if.end
+
+if.end:
+  ret void
+}
+
+;; simplifycfg usually runs before sroa, so the allocas here are not optimized away yet.
+define void @alloca(ptr %p, ptr %q, i32 %a) {
+; CHECK-LABEL: @alloca(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[P_ADDR:%.*]] = alloca ptr, align 8
+; CHECK-NEXT:    [[Q_ADDR:%.*]] = alloca ptr, align 8
+; CHECK-NEXT:    [[A_ADDR:%.*]] = alloca i32, align 4
+; CHECK-NEXT:    store ptr [[P:%.*]], ptr [[P_ADDR]], align 8
+; CHECK-NEXT:    store ptr [[Q:%.*]], ptr [[Q_ADDR]], align 8
+; CHECK-NEXT:    store i32 [[A:%.*]], ptr [[A_ADDR]], align 4
+; CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr [[A_ADDR]], align 4
+; CHECK-NEXT:    [[TOBOOL:%.*]] = icmp ne i32 [[TMP0]], 0
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i1 [[TOBOOL]] to <1 x i1>
+; CHECK-NEXT:    [[TMP2:%.*]] = load ptr, ptr [[Q_ADDR]], align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = call <1 x i32> @llvm.masked.load.v1i32.p0(ptr [[TMP2]], i32 4, <1 x i1> [[TMP1]], <1 x i32> poison)
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <1 x i32> [[TMP3]] to i32
+; CHECK-NEXT:    [[TMP5:%.*]] = load ptr, ptr [[P_ADDR]], align 8
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast i32 [[TMP4]] to <1 x i32>
+; CHECK-NEXT:    call void @llvm.masked.store.v1i32.p0(<1 x i32> [[TMP6]], ptr [[TMP5]], i32 4, <1 x i1> [[TMP1]])
+; CHECK-NEXT:    ret void
+;
+entry:
+  %p.addr = alloca ptr
+  %q.addr = alloca ptr
+  %a.addr = alloca i32
+  store ptr %p, ptr %p.addr
+  store ptr %q, ptr %q.addr
+  store i32 %a, ptr %a.addr
+  %0 = load i32, ptr %a.addr
+  %tobool = icmp ne i32 %0, 0
+  br i1 %tobool, label %if.then, label %if.end
+
+if.then:
+  %1 = load ptr, ptr %q.addr
+  %2 = load i32, ptr %1
+  %3 = load ptr, ptr %p.addr
+  store i32 %2, ptr %3
+  br label %if.end
+
+if.end:
+  ret void
+}
+
+;; Successor 1 branches to successor 0.
+define void @succ1to0(ptr %p, ptr %q, i32 %a) {
+; CHECK-LABEL: @succ1to0(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TOBOOL:%.*]] = icmp ne i32 [[A:%.*]], 0
+; CHECK-NEXT:    [[TMP0:%.*]] = bitcast i1 [[TOBOOL]] to <1 x i1>
+; CHECK-NEXT:    [[TMP1:%.*]] = xor i1 [[TOBOOL]], true
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i1 [[TMP1]] to <1 x i1>
+; CHECK-NEXT:    [[TMP3:%.*]] = call <1 x i32> @llvm.masked.load.v1i32.p0(ptr [[Q:%.*]], i32 4, <1 x i1> [[TMP2]], <1 x i32> poison)
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <1 x i32> [[TMP3]] to i32
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast i32 [[TMP4]] to <1 x i32>
+; CHECK-NEXT:    call void @llvm.masked.store.v1i32.p0(<1 x i32> [[TMP5]], ptr [[P:%.*]], i32 4, <1 x i1> [[TMP2]])
+; CHECK-NEXT:    ret void
+;
+entry:
+  %tobool = icmp ne i32 %a, 0
+  br i1 %tobool, label %if.end, label %if.then
+
+if.end:
+  ret void
+
+if.then:
+  %0 = load i32, ptr %q
+  store i32 %0, ptr %p
+  br label %if.end
+}
+
+;; Successor 1 branches to successor 0 and there is a phi node.
+define i32 @succ1to0_phi(ptr %p)  {
+; CHECK-LABEL: @succ1to0_phi(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[COND:%.*]] = icmp eq ptr [[P:%.*]], null
+; CHECK-NEXT:    [[TMP0:%.*]] = bitcast i1 [[COND]] to <1 x i1>
+; CHECK-NEXT:    [[TMP1:%.*]] = xor i1 [[COND]], true
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i1 [[TMP1]] to <1 x i1>
+; CHECK-NEXT:    [[TMP3:%.*]] = call <1 x i32> @llvm.masked.load.v1i32.p0(ptr [[P]], i32 4, <1 x i1> [[TMP2]], <1 x i32> poison)
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <1 x i32> [[TMP3]] to i32
+; CHECK-NEXT:    [[SPEC_SELECT:%.*]] = select i1 [[COND]], i32 0, i32 [[TMP4]]
+; CHECK-NEXT:    ret i32 [[SPEC_SELECT]]
+;
+entry:
+  %cond = icmp eq ptr %p, null
+  br i1 %cond, label %if.true, label %if.false
+
+if.false:
+  %0 = load i32, ptr %p
+  br label %if.true
+
+if.true:
+  %res = phi i32 [ %0, %if.false ], [ 0, %entry ]
+  ret i32 %res
+}
+
+;; Successor 0 branches to successor 1.
+define void @succ0to1(i32 %a, ptr %b, ptr %p, ptr %q) {
+; CHECK-LABEL: @succ0to1(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[COND:%.*]] = icmp eq i32 [[A:%.*]], 0
+; CHECK-NEXT:    [[TMP0:%.*]] = bitcast i1 [[COND]] to <1 x i1>
+; CHECK-NEXT:    [[TMP1:%.*]] = call <1 x i32> @llvm.masked.load.v1i32.p0(ptr [[B:%.*]], i32 4, <1 x i1> [[TMP0]], <1 x i32> poison)
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast <1 x i32> [[TMP1]] to i32
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i32 [[TMP2]] to <1 x i32>
+; CHECK-NEXT:    call void @llvm.masked.store.v1i32.p0(<1 x i32> [[TMP3]], ptr [[P:%.*]], i32 4, <1 x i1> [[TMP0]])
+; CHECK-NEXT:    store i32 1, ptr [[Q:%.*]], align 4
+; CHECK-NEXT:    ret void
+;
+entry:
+  %cond = icmp eq i32 %a, 0
+  br i1 %cond, label %if.true, label %if.false
+
+if.false:
+  store i32 1, ptr %q
+  br label %if.end
+
+if.true:
+  %0 = load i32, ptr %b
+  store i32 %0, ptr %p
+  br label %if.false
+
+if.end:
+  ret void
+}
+
+;; Load after store can be hoisted.
+define i64 @load_after_store(i32 %a, ptr %b, ptr %p, ptr %q) {
+; CHECK-LABEL: @load_after_store(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[COND:%.*]] = icmp eq i32 [[A:%.*]], 0
+; CHECK-NEXT:    [[TMP0:%.*]] = bitcast i1 [[COND]] to <1 x i1>
+; CHECK-NEXT:    call void @llvm.masked.store.v1i32.p0(<1 x i32> <i32 1>, ptr [[B:%.*]], i32 4, <1 x i1> [[TMP0]])
+; CHECK-NEXT:    [[TMP1:%.*]] = call <1 x i16> @llvm.masked.load.v1i16.p0(ptr [[P:%.*]], i32 2, <1 x i1> [[TMP0]], <1 x i16> poison)
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast <1 x i16> [[TMP1]] to i16
+; CHECK-NEXT:    [[TMP3:%.*]] = call <1 x i64> @llvm.masked.load.v1i64.p0(ptr [[Q:%.*]], i32 8, <1 x i1> [[TMP0]], <1 x i64> poison)
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <1 x i64> [[TMP3]] to i64
+; CHECK-NEXT:    [[ZEXT:%.*]] = zext i16 [[TMP2]] to i64
+; CHECK-NEXT:    [[ADD:%.*]] = add i64 [[ZEXT]], [[TMP4]]
+; CHECK-NEXT:    [[COMMON_RET_OP:%.*]] = select i1 [[COND]], i64 [[ADD]], i64 0
+; CHECK-NEXT:    ret i64 [[COMMON_RET_OP]]
+;
+entry:
+  %cond = icmp eq i32 %a, 0
+  br i1 %cond, label %if.true, label %if.end
+
+if.true:
+  store i32 1, ptr %b
+  %0 = load i16, ptr %p
+  %1 = load i64, ptr %q
+  %zext = zext i16 %0 to i64
+  %add = add i64 %zext, %1
+  ret i64 %add
+
+if.end:
+  ret i64 0
+}
+
+;; A speculatable memory read does not prevent the hoist.
+define i32 @load_skip_speculatable_memory_read(i32 %a, ptr %b, ptr %p, ptr %q) {
+; CHECK-LABEL: @load_skip_speculatable_memory_read(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[COND:%.*]] = icmp eq i32 [[A:%.*]], 0
+; CHECK-NEXT:    [[TMP0:%.*]] = bitcast i1 [[COND]] to <1 x i1>
+; CHECK-NEXT:    [[TMP1:%.*]] = call <1 x i32> @llvm.masked.load.v1i32.p0(ptr [[B:%.*]], i32 4, <1 x i1> [[TMP0]], <1 x i32> poison)
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast <1 x i32> [[TMP1]] to i32
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i32 [[TMP2]] to <1 x i32>
+; CHECK-NEXT:    call void @llvm.masked.store.v1i32.p0(<1 x i32> [[TMP3]], ptr [[P:%.*]], i32 4, <1 x i1> [[TMP0]])
+; CHECK-NEXT:    [[TMP4:%.*]] = xor i1 [[COND]], true
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast i1 [[TMP4]] to <1 x i1>
+; CHECK-NEXT:    [[TMP6:%.*]] = call <1 x i32> @llvm.masked.load.v1i32.p0(ptr [[Q:%.*]], i32 4, <1 x i1> [[TMP5]], <1 x i32> poison)
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <1 x i32> [[TMP6]] to i32
+; CHECK-NEXT:    [[READ:%.*]] = call i32 @read_memory_only()
+; CHECK-NEXT:    [[PHI:%.*]] = select i1 [[COND]], i32 0, i32 [[READ]]
+; CHECK-NEXT:    ret i32 [[PHI]]
+;
+entry:
+  %cond = icmp eq i32 %a, 0
+  br i1 %cond, label %if.true, label %if.false
+
+if.false:
+  %read = call i32 @read_memory_only()
+  %0 = load i32, ptr %q
+  br label %if.end
+
+if.true:
+  %1 = load i32, ptr %b
+  store i32 %1, ptr %p
+  br label %if.end
+
+if.end:
+  %phi = phi i32 [%read, %if.false], [0, %if.true]
+  ret i32 %phi
+}
+
+;; Source of the load can be a GEP.
+;; TODO: Make the hoist happen.
+define i32 @load_from_gep(ptr %p)  {
+; CHECK-LABEL: @load_from_gep(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[COND:%.*]] = icmp eq ptr [[P:%.*]], null
+; CHECK-NEXT:    br i1 [[COND]], label [[IF_TRUE:%.*]], label [[IF_FALSE:%.*]]
+; CHECK:       if.false:
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 16
+; CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; CHECK-NEXT:    br label [[IF_TRUE]]
+; CHECK:       if.true:
+; CHECK-NEXT:    [[RES:%.*]] = phi i32 [ [[TMP0]], [[IF_FALSE]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT:    ret i32 [[RES]]
+;
+entry:
+  %cond = icmp eq ptr %p, null
+  br i1 %cond, label %if.true, label %if.false
+
+if.false:
+  %arrayidx = getelementptr inbounds i8, ptr %p, i64 16
+  %0 = load i32, ptr %arrayidx
+  br label %if.true
+
+if.true:
+  %res = phi i32 [ %0, %if.false ], [ 0, %entry ]
+  ret i32 %res
+}
+
+;; Do not hoist if the instructions to be hoisted are expensive.
+define i32 @not_cheap_to_hoist(i32 %a, ptr %b, ptr %p, ptr %q, i32 %v0, i32 %v1, i32 %v2, i1 %cc) {
+; CHECK-LABEL: @not_cheap_to_hoist(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[COND:%.*]] = icmp eq i32 [[A:%.*]], 0
+; CHECK-NEXT:    [[TMP0:%.*]] = bitcast i1 [[COND]] to <1 x i1>
+; CHECK-NEXT:    [[TMP1:%.*]] = call <1 x i32> @llvm.masked.load.v1i32.p0(ptr [[B:%.*]], i32 4, <1 x i1> [[TMP0]], <1 x i32> poison)
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast <1 x i32> [[TMP1]] to i32
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i32 [[TMP2]] to <1 x i32>
+; CHECK-NEXT:    call void @llvm.masked.store.v1i32.p0(<1 x i32> [[TMP3]], ptr [[P:%.*]], i32 4, <1 x i1> [[TMP0]])
+; CHECK-NEXT:    [[TMP4:%.*]] = xor i1 [[COND]], true
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast i1 [[TMP4]] to <1 x i1>
+; CHECK-NEXT:    call void @llvm.masked.store.v1i64.p0(<1 x i64> <i64 1>, ptr [[P]], i32 8, <1 x i1> [[TMP5]])
+; CHECK-NEXT:    call void @llvm.masked.store.v1i16.p0(<1 x i16> <i16 2>, ptr [[Q:%.*]], i32 2, <1 x i1> [[TMP5]])
+; CHECK-NEXT:    br i1 [[COND]], label [[COMMON_RET:%.*]], label [[IF_FALSE:%.*]]
+; CHECK:       common.ret:
+; CHECK-NEXT:    [[COMMON_RET_OP:%.*]] = phi i32 [ [[VVVV:%.*]], [[IF_FALSE]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT:    ret i32 [[COMMON_RET_OP]]
+; CHECK:       if.false:
+; CHECK-NEXT:    [[V:%.*]] = udiv i32 [[A]], 12345
+; CHECK-NEXT:    [[VV:%.*]] = mul i32 [[V]], [[V0:%.*]]
+; CHECK-NEXT:    [[VVV:%.*]] = mul i32 [[VV]], [[V1:%.*]]
+; CHECK-NEXT:    [[VVVV]] = select i1 [[CC:%.*]], i32 [[V2:%.*]], i32 [[VVV]]
+; CHECK-NEXT:    br label [[COMMON_RET]]
+;
+entry:
+  %cond = icmp eq i32 %a, 0
+  br i1 %cond, label %if.true, label %if.false
+
+if.false:
+  store i64 1, ptr %p
+  store i16 2, ptr %q
+
+  %v = udiv i32 %a, 12345
+  %vv = mul i32 %v, %v0
+  %vvv = mul i32 %vv, %v1
+  %vvvv = select i1 %cc, i32 %v2, i32 %vvv
+  ret i32 %vvvv
+
+if.true:
+  %0 = load i32, ptr %b
+  store i32 %0, ptr %p
+  br label %if.end
+
+if.end:
+  ret i32 0
+}
+
+;; Do not hoist if there is more than one predecessor.
+define void @not_single_predecessor(ptr %p, ptr %q, i32 %a) {
+; CHECK-LABEL: @not_single_predecessor(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TOBOOL:%.*]] = icmp ne i32 [[A:%.*]], 0
+; CHECK-NEXT:    br i1 [[TOBOOL]], label [[IF_END:%.*]], label [[IF_THEN:%.*]]
+; CHECK:       if.end:
+; CHECK-NEXT:    br label [[IF_THEN]]
+; CHECK:       if.then:
+; CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr [[Q:%.*]], align 4
+; CHECK-NEXT:    store i32 [[TMP0]], ptr [[P:%.*]], align 4
+; CHECK-NEXT:    br label [[IF_END]]
+;
+entry:
+  %tobool = icmp ne i32 %a, 0
+  br i1 %tobool, label %if.end, label %if.then
+
+if.end:
+  br label %if.then
+
+if.then:
+  %1 = load i32, ptr %q
+  store i32 %1, ptr %p
+  br label %if.end
+}
+
+;; Do not hoist because i8 is not supported by conditional faulting.
+define void @not_supported_type(i8 %a, ptr %b, ptr %p, ptr %q) {
+; CHECK-LABEL: @not_supported_type(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[COND:%.*]] = icmp eq i8 [[A:%.*]], 0
+; CHECK-NEXT:    br i1 [[COND]], label [[IF_TRUE:%.*]], label [[IF_FALSE:%.*]]
+; CHECK:       if.false:
+; CHECK-NEXT:    store i8 1, ptr [[Q:%.*]], align 1
+; CHECK-NEXT:    br label [[IF_END:%.*]]
+; CHECK:       if.true:
+; CHECK-NEXT:    [[TMP0:%.*]] = load i8, ptr [[B:%.*]], align 1
+; CHECK-NEXT:    store i8 [[TMP0]], ptr [[P:%.*]], align 1
+; CHECK-NEXT:    br label [[IF_END]]
+; CHECK:       if.end:
+; CHECK-NEXT:    ret void
+;
+entry:
+  %cond = icmp eq i8 %a, 0
+  br i1 %cond, label %if.true, label %if.false
+
+if.false:
+  store i8 1, ptr %q
+  br label %if.end
+
+if.true:
+  %0 = load i8, ptr %b
+  store i8 %0, ptr %p
+  br label %if.end
+
+if.end:
+  ret void
+}
+
+;; Do not hoist if the terminator is not a br.
+define void @not_br_terminator(i32 %a, ptr %b, ptr %p, ptr %q) {
+; CHECK-LABEL: @not_br_terminator(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    switch i32 [[A:%.*]], label [[IF_END:%.*]] [
+; CHECK-NEXT:      i32 1, label [[IF_FALSE:%.*]]
+; CHECK-NEXT:      i32 2, label [[IF_TRUE:%.*]]
+; CHECK-NEXT:    ]
+; CHECK:       if.false:
+; CHECK-NEXT:    store i32 1, ptr [[Q:%.*]], align 4
+; CHECK-NEXT:    br label [[IF_END]]
+; CHECK:       if.true:
+; CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr [[B:%.*]], align 4
+; CHECK-NEXT:    store i32 [[TMP0]], ptr [[P:%.*]], align 4
+; CHECK-NEXT:    br label [[IF_FALSE]]
+; CHECK:       if.end:
+; CHECK-NEXT:    ret void
+;
+entry:
+  switch i32 %a, label %if.end [
+  i32 1, label %if.false
+  i32 2, label %if.true
+  ]
+
+if.false:
+  store i32 1, ptr %q, align 4
+  br label %if.end
+
+if.true:
+  %0 = load i32, ptr %b, align 4
+  store i32 %0, ptr %p, align 4
+  br label %if.false
+
+if.end:
+  ret void
+}
+
+;; Do not hoist if the instruction to be hoisted is atomic/volatile.
+define void @not_simple(i32 %a, ptr %b, ptr %p, ptr %q) {
+; CHECK-LABEL: @not_simple(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[COND:%.*]] = icmp eq i32 [[A:%.*]], 0
+; CHECK-NEXT:    br i1 [[COND]], label [[IF_TRUE:%.*]], label [[IF_FALSE:%.*]]
+; CHECK:       if.false:
+; CHECK-NEXT:    store atomic i32 1, ptr [[Q:%.*]] seq_cst, align 4
+; CHECK-NEXT:    br label [[IF_END:%.*]]
+; CHECK:       if.true:
+; CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr [[B:%.*]], align 4
+; CHECK-NEXT:    store i32 [[TMP0]], ptr [[P:%.*]], align 4
+; CHECK-NEXT:    br label [[IF_END]]
+; CHECK:       if.end:
+; CHECK-NEXT:    ret void
+;
+entry:
+  %cond = icmp eq i32 %a, 0
+  br i1 %cond, label %if.true, label %if.false
+
+if.false:
+  store atomic i32 1, ptr %q seq_cst, align 4
+  br label %if.end
+
+if.true:
+  %0 = load i32, ptr %b, align 4
+  store i32 %0, ptr %p, align 4
+  br label %if.end
+
+if.end:
+  ret void
+}
+
+;; Do not hoist if there is a load/store in the same BB that cannot be hoisted.
+define void @not_hoistable_store(i32 %a, ptr %b, ptr %p, ptr %q) {
+; CHECK-LABEL: @not_hoistable_store(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[COND:%.*]] = icmp eq i32 [[A:%.*]], 0
+; CHECK-NEXT:    br i1 [[COND]], label [[IF_TRUE:%.*]], label [[IF_FALSE:%.*]]
+; CHECK:       if.false:
+; CHECK-NEXT:    store i32 1, ptr [[Q:%.*]], align 4
+; CHECK-NEXT:    br label [[IF_END:%.*]]
+; CHECK:       if.true:
+; CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr [[B:%.*]], align 4
+; CHECK-NEXT:    store volatile i32 [[TMP0]], ptr [[P:%.*]], align 4
+; CHECK-NEXT:    br label [[IF_END]]
+; CHECK:       if.end:
+; CHECK-NEXT:    ret void
+;
+entry:
+  %cond = icmp eq i32 %a, 0
+  br i1 %cond, label %if.true, label %if.false
+
+if.false:
+  store i32 1, ptr %q, align 4
+  br label %if.end
+
+if.true:
+  %0 = load i32, ptr %b, align 4
+  store volatile i32 %0, ptr %p, align 4
+  br label %if.end
+
+if.end:
+  ret void
+}
+
+;; Do not hoist if there is an instruction with side effects in the same BB.
+define void @not_hoistable_sideeffect(i32 %a, ptr %b, ptr %p, ptr %q) {
+; CHECK-LABEL: @not_hoistable_sideeffect(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[COND:%.*]] = icmp eq i32 [[A:%.*]], 0
+; CHECK-NEXT:    br i1 [[COND]], label [[IF_TRUE:%.*]], label [[IF_FALSE:%.*]]
+; CHECK:       if.false:
+; CHECK-NEXT:    store i32 1, ptr [[Q:%.*]], align 4
+; CHECK-NEXT:    br label [[IF_END:%.*]]
+; CHECK:       if.true:
+; CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr [[B:%.*]], align 4
+; CHECK-NEXT:    [[RMW:%.*]] = atomicrmw xchg ptr [[Q]], double 4.000000e+00 seq_cst, align 8
+; CHECK-NEXT:    store i32 [[TMP0]], ptr [[P:%.*]], align 4
+; CHECK-NEXT:    br label [[IF_END]]
+; CHECK:       if.end:
+; CHECK-NEXT:    ret void
+;
+entry:
+  %cond = icmp eq i32 %a, 0
+  br i1 %cond, label %if.true, label %if.false
+
+if.false:
+  store i32 1, ptr %q, align 4
+  br label %if.end
+
+if.true:
+  %0 = load i32, ptr %b, align 4
+  %rmw= atomicrmw xchg ptr %q, double 4.0 seq_cst
+  store i32 %0, ptr %p, align 4
+  br label %if.end
+
+if.end:
+  ret void
+}
+
+;; Do not hoist because the operand of the store does not dominate the branch.
+;; TODO: Could we improve it?
+define void @not_ops_dominate_br(i32 %a, ptr %b, ptr %p, ptr %q) {
+; CHECK-LABEL: @not_ops_dominate_br(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[COND:%.*]] = icmp eq i32 [[A:%.*]], 0
+; CHECK-NEXT:    br i1 [[COND]], label [[IF_TRUE:%.*]], label [[IF_FALSE:%.*]]
+; CHECK:       if.false:
+; CHECK-NEXT:    [[ADD:%.*]] = add i32 [[A]], 2
+; CHECK-NEXT:    store i32 [[ADD]], ptr [[Q:%.*]], align 4
+; CHECK-NEXT:    br label [[IF_END:%.*]]
+; CHECK:       if.true:
+; CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr [[B:%.*]], align 4
+; CHECK-NEXT:    store i32 [[TMP0]], ptr [[P:%.*]], align 4
+; CHECK-NEXT:    br label [[IF_END]]
+; CHECK:       if.end:
+; CHECK-NEXT:    ret void
+;
+entry:
+  %cond = icmp eq i32 %a, 0
+  br i1 %cond, label %if.true, label %if.false
+
+if.false:
+  %add = add i32 %a, 2
+  store i32 %add, ptr %q, align 4
+  br label %if.end
+
+if.true:
+  %1 = load i32, ptr %b, align 4
+  store i32 %1, ptr %p, align 4
+  br label %if.end
+
+if.end:
+  ret void
+}
+
+;; Do not hoist if there is a store and a memory read is skipped.
+define void @not_store_skip_memory_read(i32 %a, ptr %b, ptr %p, ptr %q) {
+; CHECK-LABEL: @not_store_skip_memory_read(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[COND:%.*]] = icmp eq i32 [[A:%.*]], 0
+; CHECK-NEXT:    br i1 [[COND]], label [[IF_TRUE:%.*]], label [[IF_FALSE:%.*]]
+; CHECK:       if.false:
+; CHECK-NEXT:    call void @read_memory_only()
+; CHECK-NEXT:    store i32 1, ptr [[Q:%.*]], align 4
+; CHECK-NEXT:    br label [[IF_END:%.*]]
+; CHECK:       if.true:
+; CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr [[B:%.*]], align 4
+; CHECK-NEXT:    store i32 [[TMP0]], ptr [[P:%.*]], align 4
+; CHECK-NEXT:    br label [[IF_END]]
+; CHECK:       if.end:
+; CHECK-NEXT:    ret void
+;
+entry:
+  %cond = icmp eq i32 %a, 0
+  br i1 %cond, label %if.true, label %if.false
+
+if.false:
+  call void @read_memory_only()
+  store i32 1, ptr %q, align 4
+  br label %if.end
+
+if.true:
+  %1 = load i32, ptr %b, align 4
+  store i32 %1, ptr %p, align 4
+  br label %if.end
+
+if.end:
+  ret void
+}
+
+;; Do not hoist if the branch is predictable and the `then` BB is unlikely to execute.
+define void @not_likely_to_execute(ptr %p, ptr %q, i32 %a) {
+; CHECK-LABEL: @not_likely_to_execute(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TOBOOL:%.*]] = icmp ne i32 [[A:%.*]], 0
+; CHECK-NEXT:    br i1 [[TOBOOL]], label [[IF_THEN:%.*]], label [[IF_END:%.*]], !prof [[PROF13:![0-9]+]]
+; CHECK:       if.end:
+; CHECK-NEXT:    ret void
+; CHECK:       if.then:
+; CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr [[Q:%.*]], align 4
+; CHECK-NEXT:    store i32 [[TMP0]], ptr [[P:%.*]], align 4
+; CHECK-NEXT:    br label [[IF_END]]
+;
+entry:
+  %tobool = icmp ne i32 %a, 0
+  br i1 %tobool, label %if.then, label %if.end, !prof !14
+
+if.end:
+  ret void
+
+if.then:
+  %0 = load i32, ptr %q
+  store i32 %0, ptr %p
+  br label %if.end
+}
+
+declare i32 @read_memory_only() readonly nounwind willreturn speculatable
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!2, !3, !4, !5, !6, !7}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C11, file: !1, producer: "clang", isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly, splitDebugInlining: false, nameTableKind: None)
+!1 = !DIFile(filename: "test.c", directory: "/tmp")
+!2 = !{i32 7, !"Dwarf Version", i32 5}
+!3 = !{i32 2, !"Debug Info Version", i32 3}
+!4 = !{i32 1, !"wchar_size", i32 4}
+!5 = !{i32 8, !"PIC Level", i32 2}
+!6 = !{i32 7, !"PIE Level", i32 2}
+!7 = !{i32 7, !"uwtable", i32 2}
+!8 = !DILocation(line: 1, column: 2, scope: !10)
+!9 = !DILocation(line: 1, column: 3, scope: !10)
+!10 = distinct !DISubprogram(name: "basic", scope: !1, file: !1, line: 1, type: !11, scopeLine: 1, flags: DIFlagPrototyped, unit: !0)
+!11 = !DISubroutineType(types: !12)
+!12 = !{}
+!13 = distinct !DIAssignID()
+!14 = !{!"branch_weights", i32 1, i32 99}

>From fabf9942783d30c1a5bb34eb95667b7f7327f162 Mon Sep 17 00:00:00 2001
From: Shengchen Kan <shengchen.kan at intel.com>
Date: Sat, 3 Aug 2024 16:50:47 +0800
Subject: [PATCH 2/3] address review comment: all/nothing logic

---
 llvm/lib/Transforms/Utils/SimplifyCFG.cpp     | 358 ++++++------------
 .../CodeGen/X86/hoist-loads-stores-with-cf.ll |  32 +-
 .../X86/hoist-loads-stores-with-cf.ll         | 319 +++++++---------
 3 files changed, 281 insertions(+), 428 deletions(-)

diff --git a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
index 8ebe2acdcc667..8dbe5e55a5032 100644
--- a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
+++ b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
@@ -281,9 +281,6 @@ class SimplifyCFGOpt {
   bool hoistSuccIdenticalTerminatorToSwitchOrIf(
       Instruction *TI, Instruction *I1,
       SmallVectorImpl<Instruction *> &OtherSuccTIs);
-  bool
-  hoistLoadStoreWithCondFaultingFromSuccessors(BranchInst *BI,
-                                               BasicBlock *ThenBB = nullptr);
   bool speculativelyExecuteBB(BranchInst *BI, BasicBlock *ThenBB);
   bool simplifyTerminatorOnSelect(Instruction *OldTerm, Value *Cond,
                                   BasicBlock *TrueBB, BasicBlock *FalseBB,
@@ -2987,216 +2984,10 @@ static bool isProfitableToSpeculate(const BranchInst *BI, bool Invert,
   return BIEndProb < Likely;
 }
 
-/// Hoist load/store instructions from the conditional successor blocks up into
-/// the block.
-///
-/// \param BI conditional branch instruction for the successor blocks.
-/// \param ThenBB the successor block to hoist from. If NULL, both of the
-///        successors are considered.
-///
-/// We are looking for code like the following:
-/// \code
-///   BB:
-///     ...
-///     %cond = icmp ult %x, %y
-///     br i1 %cond, label %TrueBB, label %FalseBB
-///   FalseBB:
-///     store i32 1, ptr %q, align 4
-///     ...
-///   TrueBB:
-///     %0 = load i32, ptr %b, align 4
-///     store i32 %0, ptr %p, align 4
-///     ...
-/// \endcode
-///
-/// We are going to transform this into:
-///
-/// \code
-///   BB:
-///     ...
-///     %cond = icmp ult %x, %y
-///     %0 = cload i32, ptr %b, %cond
-///     cstore i32 %0, ptr %p, %cond
-///     cstore i32 1, ptr %q, ~%cond
-///     br i1 %cond, label %TrueBB, label %FalseBB
-///   FalseBB:
-///     ...
-///   TrueBB:
-///     ...
-/// \endcode
-///
-/// where cload/cstore is represented by intrinsics like llvm.masked.load/store,
-/// e.g.
-///
-/// \code
-///   %vcond = bitcast i1 %cond to <1 x i1>
-///   %v0 = call <1 x i32> @llvm.masked.load.v1i32.p0
-///                         (ptr %b, i32 4, <1 x i1> %vcond, <1 x i32> poison)
-///   %0 = bitcast <1 x i32> %v0 to i32
-///   call void @llvm.masked.store.v1i32.p0
-///                         (<1 x i32> %v0, ptr %p, i32 4, <1 x i1> %vcond)
-///   %cond.not = xor i1 %cond, true
-///   %vcond.not = bitcast i1 %cond.not to <1 x i1>
-///   call void @llvm.masked.store.v1i32.p0
-///              (<1 x i32> <i32 1>, ptr %q, i32 4, <1 x i1> %vcond.not)
-/// \endcode
-///
-/// \returns true if any load/store is hoisted.
-///
-/// Note that this transform should be run
-/// * before SpeculativelyExecuteBB so that the latter has more opportunities.
-/// * after hoistCommonCodeFromSuccessors to ensure unconditional loads/stores
-///   are handled first.
-bool SimplifyCFGOpt::hoistLoadStoreWithCondFaultingFromSuccessors(
-    BranchInst *BI, BasicBlock *ThenBB) {
-  if (!HoistLoadsStoresWithCondFaulting ||
-      !Options.HoistLoadsStoresWithCondFaulting ||
-      !TTI.hasConditionalLoadStoreForType())
-    return false;
-
-  if (!BI || !BI->isConditional())
-    return false;
-
-  if (!isProfitableToSpeculate(BI, ThenBB != BI->getSuccessor(0), TTI))
-    return false;
-
-  SmallVector<BasicBlock *, 2> Successors;
-  if (ThenBB)
-    Successors.push_back(ThenBB);
-  else
-    Successors.append({BI->getSuccessor(0), BI->getSuccessor(1)});
-
-  // If either of the blocks has its address taken, then we can't do this fold,
-  // because the code we'd hoist would no longer run when we jump into the block
-  // by its address.
-  for (auto *Succ : Successors)
-    if (Succ->hasAddressTaken())
-      return false;
-
-  // Not use isa<AllocaInst>(getUnderlyingObject(I.getOperand(0)) to avoid
-  // checking all intermediate operands dominate the branch.
-  auto IsLoadFromAlloca = [](const Instruction &I) {
-    return isa<LoadInst>(I) && isa<AllocaInst>((I.getOperand(0)));
-  };
-
-  // Collect hoisted loads/stores.
-  SmallSetVector<Instruction *, 4> HoistedInsts;
-  // Not hoist load/store if
-  // 1. target does not have corresponding conditional faulting load/store.
-  // 2. it's volatile or atomic.
-  // 3. there is a load/store that can not be hoisted in the same bb.
-  // 4. there is a non-load/store that's not safe to speculatively execute
-  //    in the same bb.
-  // 5. any operand of it does not dominate the branch.
-  // 6. it's a store and a memory read is skipped.
-  auto HoistInstsInBB = [&](BasicBlock *BB) {
-    bool SkipMemoryRead = false;
-    // A more efficient way to check domination. An operand dominates the
-    // BranchInst if
-    // 1. it's not defined in the same bb as the instruction.
-    // 2. it's to be hoisted.
-    //
-    // b/c BB is only predecessor and BranchInst does not define any value.
-    auto OpsDominatesBranch = [&](Instruction &I) {
-      return llvm::all_of(I.operands(), [&](Value *Op) {
-        if (auto *J = dyn_cast<Instruction>(Op)) {
-          if (HoistedInsts.contains(J))
-            return true;
-          if (J->getParent() == I.getParent())
-            return false;
-        }
-        return true;
-      });
-    };
-    for (auto &I : *BB) {
-      auto *LI = dyn_cast<LoadInst>(&I);
-      auto *SI = dyn_cast<StoreInst>(&I);
-      if (LI || SI) {
-        bool IsSimple = (LI && LI->isSimple()) || (SI && SI->isSimple());
-        if (!IsSimple || !OpsDominatesBranch(I))
-          return false;
-        auto *Type = getLoadStoreType(&I);
-        // a load from alloca is always safe.
-        if (!IsLoadFromAlloca(I) && !TTI.hasConditionalLoadStoreForType(Type))
-          return false;
-        // Conservative aliasing check.
-        if (SI && SkipMemoryRead)
-          return false;
-        HoistedInsts.insert(&I);
-      } else if (!I.isTerminator() && !isSafeToSpeculativelyExecute(&I))
-        return false;
-      else if (I.mayReadFromMemory())
-        SkipMemoryRead = true;
-    }
-    return true;
-  };
-
-  for (auto *Succ : Successors)
-    if (!HoistInstsInBB(Succ))
-      return false;
-
-  if (HoistedInsts.empty())
-    return false;
-
-  // Put newly added instructions before the BranchInst.
-  IRBuilder<> Builder(BI);
-  auto &Context = BI->getParent()->getContext();
-  auto *VCondTy = FixedVectorType::get(Type::getInt1Ty(Context), 1);
-  auto *Cond = BI->getOperand(0);
-  auto *VCond = Builder.CreateBitCast(Cond, VCondTy);
-  Value *VCondNot = nullptr;
-  for (auto *I : HoistedInsts) {
-    // Only need to move the position for load from alloca.
-    if (IsLoadFromAlloca(*I)) {
-      I->moveBefore(BI);
-      continue;
-    }
-
-    bool InvertCond = I->getParent() == BI->getSuccessor(1);
-    // Construct the inverted condition if need.
-    if (InvertCond && !VCondNot)
-      VCondNot = Builder.CreateBitCast(
-          Builder.CreateXor(Cond, ConstantInt::getTrue(Context)), VCondTy);
-
-    auto *Mask = InvertCond ? VCondNot : VCond;
-    auto *Op0 = I->getOperand(0);
-    if (auto *LI = dyn_cast<LoadInst>(I)) {
-      // Load
-      auto *Ty = I->getType();
-      // NOTE: Now we assume conditional faulting load/store is supported for
-      // scalar only when creating new instructions, but it's easy to extend it
-      // for vector types in the future.
-      assert(!Ty->isVectorTy() && "not implemented");
-      auto *V0 = Builder.CreateMaskedLoad(FixedVectorType::get(Ty, 1), Op0,
-                                          LI->getAlign(), Mask);
-      auto *S0 = Builder.CreateBitCast(V0, Ty);
-      V0->copyMetadata(*I);
-      I->replaceAllUsesWith(S0);
-    } else {
-      // Store
-      assert(!Op0->getType()->isVectorTy() && "not implemented");
-      auto *StoredVal =
-          Builder.CreateBitCast(Op0, FixedVectorType::get(Op0->getType(), 1));
-      auto *VStore = Builder.CreateMaskedStore(
-          StoredVal, I->getOperand(1), cast<StoreInst>(I)->getAlign(), Mask);
-      VStore->copyMetadata(*I);
-      // FIXME: DIAssignID is not supported for masked store yet.
-      // (Verifier::visitDIAssignIDMetadata)
-      at::deleteAssignmentMarkers(VStore);
-      VStore->eraseMetadataIf([](unsigned MDKind, MDNode *Node) {
-        return Node->getMetadataID() == Metadata::DIAssignIDKind;
-      });
-    }
-  }
-
-  // Erase the hoisted instrutions in reverse order to avoid use-w/o-define
-  // error.
-  std::for_each(HoistedInsts.rbegin(), HoistedInsts.rend(), [&](auto I) {
-    if (!IsLoadFromAlloca(*I))
-      I->eraseFromParent();
-  });
-
-  return true;
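+/// Return true if \p I is a load or store for which the target has a
+/// conditional-faulting form of the access for its type (per \p TTI).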
+static bool isSafeCheapLoadStore(const Instruction &I,
+                                 const TargetTransformInfo &TTI) {
+  return (isa<LoadInst>(I) || isa<StoreInst>(I)) &&
+         TTI.hasConditionalLoadStoreForType(getLoadStoreType(&I));
 }
 
 /// Speculate a conditional basic block flattening the CFG.
@@ -3273,6 +3064,9 @@ bool SimplifyCFGOpt::speculativelyExecuteBB(BranchInst *BI,
   SmallVector<Instruction *, 4> SpeculatedDbgIntrinsics;
 
   unsigned SpeculatedInstructions = 0;
+  bool HoistLoadsStores = HoistLoadsStoresWithCondFaulting &&
+                          Options.HoistLoadsStoresWithCondFaulting;
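+  // Loads/stores from the speculated block that will be rewritten into
+  // conditional (masked) accesses rather than counted against the budget.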
+  SmallVector<Instruction *, 2> SpeculatedConditionalLoadsStores;
   Value *SpeculatedStoreValue = nullptr;
   StoreInst *SpeculatedStore = nullptr;
   EphemeralValueTracker EphTracker;
@@ -3301,16 +3095,24 @@ bool SimplifyCFGOpt::speculativelyExecuteBB(BranchInst *BI,
 
     // Only speculatively execute a single instruction (not counting the
     // terminator) for now.
-    ++SpeculatedInstructions;
+    bool IsSafeCheapLoadStore =
+        HoistLoadsStores && isSafeCheapLoadStore(I, TTI);
+    // Don't count the load/store towards the speculation cost if the target
+    // supports conditional faulting, because it is cheap to speculate.
+    if (IsSafeCheapLoadStore)
+      SpeculatedConditionalLoadsStores.push_back(&I);
+    else
+      ++SpeculatedInstructions;
+
     if (SpeculatedInstructions > 1)
       return false;
 
     // Don't hoist the instruction if it's unsafe or expensive.
-    if (!isSafeToSpeculativelyExecute(&I) &&
+    if (!IsSafeCheapLoadStore && !isSafeToSpeculativelyExecute(&I) &&
         !(HoistCondStores && (SpeculatedStoreValue = isSafeToSpeculateStore(
                                   &I, BB, ThenBB, EndBB))))
       return false;
-    if (!SpeculatedStoreValue &&
+    if (!IsSafeCheapLoadStore && !SpeculatedStoreValue &&
         computeSpeculationCost(&I, TTI) >
             PHINodeFoldingThreshold * TargetTransformInfo::TCC_Basic)
       return false;
@@ -3343,11 +3145,11 @@ bool SimplifyCFGOpt::speculativelyExecuteBB(BranchInst *BI,
 
   // Check that we can insert the selects and that it's not too expensive to do
   // so.
-  bool Convert = SpeculatedStore != nullptr;
+  bool Convert =
+      SpeculatedStore != nullptr || !SpeculatedConditionalLoadsStores.empty();
   InstructionCost Cost = 0;
   Convert |= validateAndCostRequiredSelects(BB, ThenBB, EndBB,
-                                            SpeculatedInstructions,
-                                            Cost, TTI);
+                                            SpeculatedInstructions, Cost, TTI);
   if (!Convert || Cost > Budget)
     return false;
 
@@ -3435,6 +3237,102 @@ bool SimplifyCFGOpt::speculativelyExecuteBB(BranchInst *BI,
   BB->splice(BI->getIterator(), ThenBB, ThenBB->begin(),
              std::prev(ThenBB->end()));
 
+  // If the target supports conditional faulting,
+  // we are looking for code like the following:
+  // \code
+  //   BB:
+  //     ...
+  //     %cond = icmp ult %x, %y
+  //     br i1 %cond, label %TrueBB, label %FalseBB
+  //   FalseBB:
+  //     store i32 1, ptr %q, align 4
+  //     ...
+  //   TrueBB:
+  //     %0 = load i32, ptr %b, align 4
+  //     store i32 %0, ptr %p, align 4
+  //     ...
+  // \endcode
+  //
+  // and transform it into:
+  //
+  // \code
+  //   BB:
+  //     ...
+  //     %cond = icmp ult %x, %y
+  //     %0 = cload i32, ptr %b, %cond
+  //     cstore i32 %0, ptr %p, %cond
+  //     cstore i32 1, ptr %q, ~%cond
+  //     br i1 %cond, label %TrueBB, label %FalseBB
+  //   FalseBB:
+  //     ...
+  //   TrueBB:
+  //     ...
+  // \endcode
+  //
+  // where cload/cstore are represented by intrinsics like llvm.masked.load/store,
+  // e.g.
+  //
+  // \code
+  //   %vcond = bitcast i1 %cond to <1 x i1>
+  //   %v0 = call <1 x i32> @llvm.masked.load.v1i32.p0
+  //                         (ptr %b, i32 4, <1 x i1> %vcond, <1 x i32> poison)
+  //   %0 = bitcast <1 x i32> %v0 to i32
+  //   call void @llvm.masked.store.v1i32.p0
+  //                          (<1 x i32> %v0, ptr %p, i32 4, <1 x i1> %vcond)
+  //   %cond.not = xor i1 %cond, true
+  //   %vcond.not = bitcast i1 %cond.not to <1 x i1>
+  //   call void @llvm.masked.store.v1i32.p0
+  //              (<1 x i32> <i32 1>, ptr %q, i32 4, <1 x i1> %vcond.not)
+  // \endcode
+  //
+  // So we need to turn hoisted load/store into cload/cstore.
+  auto &Context = BI->getParent()->getContext();
+  auto *VCondTy = FixedVectorType::get(Type::getInt1Ty(Context), 1);
+  auto *Cond = BI->getOperand(0);
+  Value *VCond = nullptr;
+  Value *VCondNot = nullptr;
+  // Construct the condition as a <1 x i1> mask if needed.
+  if (!SpeculatedConditionalLoadsStores.empty()) {
+    IRBuilder<> Builder(SpeculatedConditionalLoadsStores.back());
+    if (Invert)
+      VCondNot = Builder.CreateBitCast(
+          Builder.CreateXor(Cond, ConstantInt::getTrue(Context)), VCondTy);
+    else
+      VCond = Builder.CreateBitCast(Cond, VCondTy);
+  }
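+  // The speculated block is reached on the false edge of the branch when
+  // Invert is set, so use the inverted condition as the mask in that case.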
+  auto *Mask = Invert ? VCondNot : VCond;
+  for (auto *I : SpeculatedConditionalLoadsStores) {
+    IRBuilder<> Builder(I);
+    // NOTE: For now, conditional faulting loads/stores are only created for
+    // scalar types when building new instructions, but it would be easy to
+    // extend this to vector types in the future.
+    assert(!getLoadStoreType(I)->isVectorTy() && "not implemented");
+    auto *Op0 = I->getOperand(0);
+    if (auto *LI = dyn_cast<LoadInst>(I)) {
+      // Load
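+      // Replace the scalar load with a <1 x Ty> masked load and bitcast the
+      // result back to the scalar type for its users.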
+      auto *Ty = I->getType();
+      auto *V0 = Builder.CreateMaskedLoad(FixedVectorType::get(Ty, 1), Op0,
+                                          LI->getAlign(), Mask);
+      auto *S0 = Builder.CreateBitCast(V0, Ty);
+      V0->copyMetadata(*I);
+      I->replaceAllUsesWith(S0);
+    } else {
+      // Store
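+      // Bitcast the scalar value to <1 x Ty> and emit a masked store guarded
+      // by the block's mask.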
+      auto *StoredVal =
+          Builder.CreateBitCast(Op0, FixedVectorType::get(Op0->getType(), 1));
+      auto *VStore = Builder.CreateMaskedStore(
+          StoredVal, I->getOperand(1), cast<StoreInst>(I)->getAlign(), Mask);
+      VStore->copyMetadata(*I);
+      // FIXME: DIAssignID is not supported for masked store yet.
+      // (Verifier::visitDIAssignIDMetadata)
+      at::deleteAssignmentMarkers(VStore);
+      VStore->eraseMetadataIf([](unsigned MDKind, MDNode *Node) {
+        return Node->getMetadataID() == Metadata::DIAssignIDKind;
+      });
+    }
+    I->eraseFromParent();
+  }
+
   // Insert selects and rewrite the PHI operands.
   IRBuilder<NoFolder> Builder(BI);
   for (PHINode &PN : EndBB->phis()) {
@@ -7740,43 +7638,31 @@ bool SimplifyCFGOpt::simplifyCondBranch(BranchInst *BI, IRBuilder<> &Builder) {
     return requestResimplify();
 
   // We have a conditional branch to two blocks that are only reachable
-  // from BI.  We know that the condbr dominates the two blocks, so see
-  //
-  // * if there is any identical code in the "then" and "else" blocks.
-  // * if there is any different load/store in the "then" and "else" blocks.
-  //
-  // If so, we can hoist it up to the branching block.
+  // from BI.  We know that the condbr dominates the two blocks, so see if
+  // there is any identical code in the "then" and "else" blocks.  If so, we
+  // can hoist it up to the branching block.
   if (BI->getSuccessor(0)->getSinglePredecessor()) {
     if (BI->getSuccessor(1)->getSinglePredecessor()) {
       if (HoistCommon && hoistCommonCodeFromSuccessors(
                              BI->getParent(), !Options.HoistCommonInsts))
         return requestResimplify();
-      if (hoistLoadStoreWithCondFaultingFromSuccessors(BI))
-        return requestResimplify();
     } else {
       // If Successor #1 has multiple preds, we may be able to conditionally
       // execute Successor #0 if it branches to Successor #1.
       Instruction *Succ0TI = BI->getSuccessor(0)->getTerminator();
       if (Succ0TI->getNumSuccessors() == 1 &&
-          Succ0TI->getSuccessor(0) == BI->getSuccessor(1)) {
-        if (hoistLoadStoreWithCondFaultingFromSuccessors(BI,
-                                                         BI->getSuccessor(0)))
-          return requestResimplify();
+          Succ0TI->getSuccessor(0) == BI->getSuccessor(1))
         if (speculativelyExecuteBB(BI, BI->getSuccessor(0)))
           return requestResimplify();
-      }
     }
   } else if (BI->getSuccessor(1)->getSinglePredecessor()) {
     // If Successor #0 has multiple preds, we may be able to conditionally
     // execute Successor #1 if it branches to Successor #0.
     Instruction *Succ1TI = BI->getSuccessor(1)->getTerminator();
     if (Succ1TI->getNumSuccessors() == 1 &&
-        Succ1TI->getSuccessor(0) == BI->getSuccessor(0)) {
-      if (hoistLoadStoreWithCondFaultingFromSuccessors(BI, BI->getSuccessor(1)))
-        return requestResimplify();
+        Succ1TI->getSuccessor(0) == BI->getSuccessor(0))
       if (speculativelyExecuteBB(BI, BI->getSuccessor(1)))
         return requestResimplify();
-    }
   }
 
   // If this is a branch on something for which we know the constant value in
diff --git a/llvm/test/CodeGen/X86/hoist-loads-stores-with-cf.ll b/llvm/test/CodeGen/X86/hoist-loads-stores-with-cf.ll
index b8b4eb3a627e5..182e7591e4f1f 100644
--- a/llvm/test/CodeGen/X86/hoist-loads-stores-with-cf.ll
+++ b/llvm/test/CodeGen/X86/hoist-loads-stores-with-cf.ll
@@ -1,31 +1,33 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc < %s -mtriple=x86_64 -mattr=+cf | FileCheck %s
 
 ;; Test masked.load/store.v1* is generated in simplifycfg and not falls back to branch+load/store in following passes.
-define void @basic(i32 %a, ptr %b, ptr %p, ptr %q) {
+define void @basic(i1 %cond, ptr %b, ptr %p, ptr %q) {
 ; CHECK-LABEL: basic:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    testl %edi, %edi
-; CHECK-NEXT:    cfcmovel (%rsi), %eax
-; CHECK-NEXT:    cfcmovel %eax, (%rdx)
-; CHECK-NEXT:    movl $1, %eax
-; CHECK-NEXT:    cfcmovneq %rax, (%rdx)
-; CHECK-NEXT:    movw $2, %ax
-; CHECK-NEXT:    cfcmovnew %ax, (%rcx)
+; CHECK-NEXT:    andb $1, %dil
+; CHECK-NEXT:    negb %dil
+; CHECK-NEXT:    cfcmovnew (%rdx), %ax
+; CHECK-NEXT:    cfcmovnel (%rcx), %edi
+; CHECK-NEXT:    cfcmovneq (%rsi), %r8
+; CHECK-NEXT:    cfcmovnew %ax, (%rsi)
+; CHECK-NEXT:    cfcmovnel %edi, (%rdx)
+; CHECK-NEXT:    cfcmovneq %r8, (%rcx)
 ; CHECK-NEXT:    retq
 entry:
-  %cond = icmp eq i32 %a, 0
   br i1 %cond, label %if.true, label %if.false
 
 if.false:
-  store i64 1, ptr %p, align 8
-  store i16 2, ptr %q, align 8
   br label %if.end
 
 if.true:
-  %0 = load i32, ptr %b, align 4
-  store i32 %0, ptr %p, align 4
-  br label %if.end
+  %0 = load i16, ptr %p, align 2
+  %1 = load i32, ptr %q, align 4
+  %2 = load i64, ptr %b, align 8
+  store i16 %0, ptr %b, align 2
+  store i32 %1, ptr %p, align 4
+  store i64 %2, ptr %q, align 8
+  br label %if.false
 
 if.end:
   ret void
diff --git a/llvm/test/Transforms/SimplifyCFG/X86/hoist-loads-stores-with-cf.ll b/llvm/test/Transforms/SimplifyCFG/X86/hoist-loads-stores-with-cf.ll
index dc18d2e5055a8..9e4e76bc9d9f3 100644
--- a/llvm/test/Transforms/SimplifyCFG/X86/hoist-loads-stores-with-cf.ll
+++ b/llvm/test/Transforms/SimplifyCFG/X86/hoist-loads-stores-with-cf.ll
@@ -1,78 +1,39 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt < %s -mtriple=x86_64 -mattr=+cf -passes='simplifycfg<hoist-loads-stores-with-cond-faulting>' -simplifycfg-require-and-preserve-domtree=1 -S -simplifycfg-hoist-loads-stores-with-cond-faulting=true | FileCheck %s
+; RUN: opt < %s -mtriple=x86_64 -mattr=+cf -passes='simplifycfg<hoist-loads-stores-with-cond-faulting>' -simplifycfg-require-and-preserve-domtree=1 -S | FileCheck %s
 
-;; Redundant bitcast will be opimized out in instcombine pass.
-define void @basic(i32 %a, ptr %b, ptr %p, ptr %q) {
+;; Basic case: check that masked.load/store is generated for i16/i32/i64.
+define void @basic(i1 %cond, ptr %b, ptr %p, ptr %q) {
 ; CHECK-LABEL: @basic(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[COND:%.*]] = icmp eq i32 [[A:%.*]], 0
-; CHECK-NEXT:    [[TMP0:%.*]] = bitcast i1 [[COND]] to <1 x i1>
-; CHECK-NEXT:    [[TMP1:%.*]] = call <1 x i32> @llvm.masked.load.v1i32.p0(ptr [[B:%.*]], i32 4, <1 x i1> [[TMP0]], <1 x i32> poison), !dbg [[DBG8:![0-9]+]]
-; CHECK-NEXT:    [[TMP2:%.*]] = bitcast <1 x i32> [[TMP1]] to i32
-; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i32 [[TMP2]] to <1 x i32>
-; CHECK-NEXT:    call void @llvm.masked.store.v1i32.p0(<1 x i32> [[TMP3]], ptr [[P:%.*]], i32 4, <1 x i1> [[TMP0]])
-; CHECK-NEXT:    [[TMP4:%.*]] = xor i1 [[COND]], true
-; CHECK-NEXT:    [[TMP5:%.*]] = bitcast i1 [[TMP4]] to <1 x i1>
-; CHECK-NEXT:    call void @llvm.masked.store.v1i64.p0(<1 x i64> <i64 1>, ptr [[P]], i32 8, <1 x i1> [[TMP5]]), !dbg [[DBG12:![0-9]+]]
-; CHECK-NEXT:    call void @llvm.masked.store.v1i16.p0(<1 x i16> <i16 2>, ptr [[Q:%.*]], i32 8, <1 x i1> [[TMP5]]), !dbg [[DBG12]]
+; CHECK-NEXT:    [[TMP0:%.*]] = bitcast i1 [[COND:%.*]] to <1 x i1>
+; CHECK-NEXT:    [[TMP1:%.*]] = call <1 x i16> @llvm.masked.load.v1i16.p0(ptr [[P:%.*]], i32 2, <1 x i1> [[TMP0]], <1 x i16> poison)
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast <1 x i16> [[TMP1]] to i16
+; CHECK-NEXT:    [[TMP3:%.*]] = call <1 x i32> @llvm.masked.load.v1i32.p0(ptr [[Q:%.*]], i32 4, <1 x i1> [[TMP0]], <1 x i32> poison)
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <1 x i32> [[TMP3]] to i32
+; CHECK-NEXT:    [[TMP5:%.*]] = call <1 x i64> @llvm.masked.load.v1i64.p0(ptr [[B:%.*]], i32 8, <1 x i1> [[TMP0]], <1 x i64> poison)
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast <1 x i64> [[TMP5]] to i64
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast i16 [[TMP2]] to <1 x i16>
+; CHECK-NEXT:    call void @llvm.masked.store.v1i16.p0(<1 x i16> [[TMP7]], ptr [[B]], i32 2, <1 x i1> [[TMP0]])
+; CHECK-NEXT:    [[TMP8:%.*]] = bitcast i32 [[TMP4]] to <1 x i32>
+; CHECK-NEXT:    call void @llvm.masked.store.v1i32.p0(<1 x i32> [[TMP8]], ptr [[P]], i32 4, <1 x i1> [[TMP0]])
+; CHECK-NEXT:    [[TMP9:%.*]] = bitcast i64 [[TMP6]] to <1 x i64>
+; CHECK-NEXT:    call void @llvm.masked.store.v1i64.p0(<1 x i64> [[TMP9]], ptr [[Q]], i32 8, <1 x i1> [[TMP0]])
 ; CHECK-NEXT:    ret void
 ;
 entry:
-  %cond = icmp eq i32 %a, 0
   br i1 %cond, label %if.true, label %if.false
 
 if.false:
-  store i64 1, ptr %p, align 8, !dbg !8
-  store i16 2, ptr %q, align 8, !dbg !8
   br label %if.end
 
 if.true:
-  %0 = load i32, ptr %b, align 4,  !dbg !9
-  store i32 %0, ptr %p, align 4, !DIAssignID !13
-  br label %if.end
-
-if.end:
-  ret void
-}
-
-;; simplifycfg is probably run before sroa, alloca here is not optimized away yet.
-define void @alloca(ptr %p, ptr %q, i32 %a) {
-; CHECK-LABEL: @alloca(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[P_ADDR:%.*]] = alloca ptr, align 8
-; CHECK-NEXT:    [[Q_ADDR:%.*]] = alloca ptr, align 8
-; CHECK-NEXT:    [[A_ADDR:%.*]] = alloca i32, align 4
-; CHECK-NEXT:    store ptr [[P:%.*]], ptr [[P_ADDR]], align 8
-; CHECK-NEXT:    store ptr [[Q:%.*]], ptr [[Q_ADDR]], align 8
-; CHECK-NEXT:    store i32 [[A:%.*]], ptr [[A_ADDR]], align 4
-; CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr [[A_ADDR]], align 4
-; CHECK-NEXT:    [[TOBOOL:%.*]] = icmp ne i32 [[TMP0]], 0
-; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i1 [[TOBOOL]] to <1 x i1>
-; CHECK-NEXT:    [[TMP2:%.*]] = load ptr, ptr [[Q_ADDR]], align 8
-; CHECK-NEXT:    [[TMP3:%.*]] = call <1 x i32> @llvm.masked.load.v1i32.p0(ptr [[TMP2]], i32 4, <1 x i1> [[TMP1]], <1 x i32> poison)
-; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <1 x i32> [[TMP3]] to i32
-; CHECK-NEXT:    [[TMP5:%.*]] = load ptr, ptr [[P_ADDR]], align 8
-; CHECK-NEXT:    [[TMP6:%.*]] = bitcast i32 [[TMP4]] to <1 x i32>
-; CHECK-NEXT:    call void @llvm.masked.store.v1i32.p0(<1 x i32> [[TMP6]], ptr [[TMP5]], i32 4, <1 x i1> [[TMP1]])
-; CHECK-NEXT:    ret void
-;
-entry:
-  %p.addr = alloca ptr
-  %q.addr = alloca ptr
-  %a.addr = alloca i32
-  store ptr %p, ptr %p.addr
-  store ptr %q, ptr %q.addr
-  store i32 %a, ptr %a.addr
-  %0 = load i32, ptr %a.addr
-  %tobool = icmp ne i32 %0, 0
-  br i1 %tobool, label %if.then, label %if.end
-
-if.then:
-  %1 = load ptr, ptr %q.addr
-  %2 = load i32, ptr %1
-  %3 = load ptr, ptr %p.addr
-  store i32 %2, ptr %3
-  br label %if.end
+  %0 = load i16, ptr %p, align 2
+  %1 = load i32, ptr %q, align 4
+  %2 = load i64, ptr %b, align 8
+  store i16 %0, ptr %b, align 2
+  store i32 %1, ptr %p, align 4
+  store i64 %2, ptr %q, align 8
+  br label %if.false
 
 if.end:
   ret void
@@ -83,13 +44,12 @@ define void @succ1to0(ptr %p, ptr %q, i32 %a) {
 ; CHECK-LABEL: @succ1to0(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[TOBOOL:%.*]] = icmp ne i32 [[A:%.*]], 0
-; CHECK-NEXT:    [[TMP0:%.*]] = bitcast i1 [[TOBOOL]] to <1 x i1>
-; CHECK-NEXT:    [[TMP1:%.*]] = xor i1 [[TOBOOL]], true
-; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i1 [[TMP1]] to <1 x i1>
-; CHECK-NEXT:    [[TMP3:%.*]] = call <1 x i32> @llvm.masked.load.v1i32.p0(ptr [[Q:%.*]], i32 4, <1 x i1> [[TMP2]], <1 x i32> poison)
-; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <1 x i32> [[TMP3]] to i32
-; CHECK-NEXT:    [[TMP5:%.*]] = bitcast i32 [[TMP4]] to <1 x i32>
-; CHECK-NEXT:    call void @llvm.masked.store.v1i32.p0(<1 x i32> [[TMP5]], ptr [[P:%.*]], i32 4, <1 x i1> [[TMP2]])
+; CHECK-NEXT:    [[TMP0:%.*]] = xor i1 [[TOBOOL]], true
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i1 [[TMP0]] to <1 x i1>
+; CHECK-NEXT:    [[TMP2:%.*]] = call <1 x i32> @llvm.masked.load.v1i32.p0(ptr [[Q:%.*]], i32 4, <1 x i1> [[TMP1]], <1 x i32> poison)
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <1 x i32> [[TMP2]] to i32
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast i32 [[TMP3]] to <1 x i32>
+; CHECK-NEXT:    call void @llvm.masked.store.v1i32.p0(<1 x i32> [[TMP4]], ptr [[P:%.*]], i32 4, <1 x i1> [[TMP1]])
 ; CHECK-NEXT:    ret void
 ;
 entry:
@@ -110,12 +70,11 @@ define i32 @succ1to0_phi(ptr %p)  {
 ; CHECK-LABEL: @succ1to0_phi(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[COND:%.*]] = icmp eq ptr [[P:%.*]], null
-; CHECK-NEXT:    [[TMP0:%.*]] = bitcast i1 [[COND]] to <1 x i1>
-; CHECK-NEXT:    [[TMP1:%.*]] = xor i1 [[COND]], true
-; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i1 [[TMP1]] to <1 x i1>
-; CHECK-NEXT:    [[TMP3:%.*]] = call <1 x i32> @llvm.masked.load.v1i32.p0(ptr [[P]], i32 4, <1 x i1> [[TMP2]], <1 x i32> poison)
-; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <1 x i32> [[TMP3]] to i32
-; CHECK-NEXT:    [[SPEC_SELECT:%.*]] = select i1 [[COND]], i32 0, i32 [[TMP4]]
+; CHECK-NEXT:    [[TMP0:%.*]] = xor i1 [[COND]], true
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i1 [[TMP0]] to <1 x i1>
+; CHECK-NEXT:    [[TMP2:%.*]] = call <1 x i32> @llvm.masked.load.v1i32.p0(ptr [[P]], i32 4, <1 x i1> [[TMP1]], <1 x i32> poison)
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <1 x i32> [[TMP2]] to i32
+; CHECK-NEXT:    [[SPEC_SELECT:%.*]] = select i1 [[COND]], i32 0, i32 [[TMP3]]
 ; CHECK-NEXT:    ret i32 [[SPEC_SELECT]]
 ;
 entry:
@@ -161,8 +120,38 @@ if.end:
   ret void
 }
 
+;; Both of successor 0 and successor 1 have a single predecessor.
+define void @single_predecessor(ptr %p, ptr %q, i32 %a) {
+; CHECK-LABEL: @single_predecessor(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TOBOOL:%.*]] = icmp ne i32 [[A:%.*]], 0
+; CHECK-NEXT:    br i1 [[TOBOOL]], label [[IF_END:%.*]], label [[IF_THEN:%.*]]
+; CHECK:       common.ret:
+; CHECK-NEXT:    ret void
+; CHECK:       if.end:
+; CHECK-NEXT:    store i32 1, ptr [[Q:%.*]], align 4
+; CHECK-NEXT:    br label [[COMMON_RET:%.*]]
+; CHECK:       if.then:
+; CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr [[Q]], align 4
+; CHECK-NEXT:    store i32 [[TMP0]], ptr [[P:%.*]], align 4
+; CHECK-NEXT:    br label [[COMMON_RET]]
+;
+entry:
+  %tobool = icmp ne i32 %a, 0
+  br i1 %tobool, label %if.end, label %if.then
+
+if.end:
+  store i32 1, ptr %q
+  ret void
+
+if.then:
+  %0 = load i32, ptr %q
+  store i32 %0, ptr %p
+  ret void
+}
+
 ;; Load after store can be hoisted.
-define i64 @load_after_store(i32 %a, ptr %b, ptr %p, ptr %q) {
+define i64 @load_after_store(i32 %a, ptr %b, ptr %p) {
 ; CHECK-LABEL: @load_after_store(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[COND:%.*]] = icmp eq i32 [[A:%.*]], 0
@@ -170,12 +159,9 @@ define i64 @load_after_store(i32 %a, ptr %b, ptr %p, ptr %q) {
 ; CHECK-NEXT:    call void @llvm.masked.store.v1i32.p0(<1 x i32> <i32 1>, ptr [[B:%.*]], i32 4, <1 x i1> [[TMP0]])
 ; CHECK-NEXT:    [[TMP1:%.*]] = call <1 x i16> @llvm.masked.load.v1i16.p0(ptr [[P:%.*]], i32 2, <1 x i1> [[TMP0]], <1 x i16> poison)
 ; CHECK-NEXT:    [[TMP2:%.*]] = bitcast <1 x i16> [[TMP1]] to i16
-; CHECK-NEXT:    [[TMP3:%.*]] = call <1 x i64> @llvm.masked.load.v1i64.p0(ptr [[Q:%.*]], i32 8, <1 x i1> [[TMP0]], <1 x i64> poison)
-; CHECK-NEXT:    [[TMP4:%.*]] = bitcast <1 x i64> [[TMP3]] to i64
 ; CHECK-NEXT:    [[ZEXT:%.*]] = zext i16 [[TMP2]] to i64
-; CHECK-NEXT:    [[ADD:%.*]] = add i64 [[ZEXT]], [[TMP4]]
-; CHECK-NEXT:    [[COMMON_RET_OP:%.*]] = select i1 [[COND]], i64 [[ADD]], i64 0
-; CHECK-NEXT:    ret i64 [[COMMON_RET_OP]]
+; CHECK-NEXT:    [[SPEC_SELECT:%.*]] = select i1 [[COND]], i64 [[ZEXT]], i64 0
+; CHECK-NEXT:    ret i64 [[SPEC_SELECT]]
 ;
 entry:
   %cond = icmp eq i32 %a, 0
@@ -184,66 +170,54 @@ entry:
 if.true:
   store i32 1, ptr %b
   %0 = load i16, ptr %p
-  %1 = load i64, ptr %q
   %zext = zext i16 %0 to i64
-  %add = add i64 %zext, %1
-  ret i64 %add
+  ret i64 %zext
 
 if.end:
   ret i64 0
 }
 
 ;; Speculatable memory read doesn't prevent the hoist.
-define i32 @load_skip_speculatable_memory_read(i32 %a, ptr %b, ptr %p, ptr %q) {
+define void @load_skip_speculatable_memory_read(i32 %a, ptr %p, ptr %q) {
 ; CHECK-LABEL: @load_skip_speculatable_memory_read(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[COND:%.*]] = icmp eq i32 [[A:%.*]], 0
-; CHECK-NEXT:    [[TMP0:%.*]] = bitcast i1 [[COND]] to <1 x i1>
-; CHECK-NEXT:    [[TMP1:%.*]] = call <1 x i32> @llvm.masked.load.v1i32.p0(ptr [[B:%.*]], i32 4, <1 x i1> [[TMP0]], <1 x i32> poison)
-; CHECK-NEXT:    [[TMP2:%.*]] = bitcast <1 x i32> [[TMP1]] to i32
-; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i32 [[TMP2]] to <1 x i32>
-; CHECK-NEXT:    call void @llvm.masked.store.v1i32.p0(<1 x i32> [[TMP3]], ptr [[P:%.*]], i32 4, <1 x i1> [[TMP0]])
-; CHECK-NEXT:    [[TMP4:%.*]] = xor i1 [[COND]], true
-; CHECK-NEXT:    [[TMP5:%.*]] = bitcast i1 [[TMP4]] to <1 x i1>
-; CHECK-NEXT:    [[TMP6:%.*]] = call <1 x i32> @llvm.masked.load.v1i32.p0(ptr [[Q:%.*]], i32 4, <1 x i1> [[TMP5]], <1 x i32> poison)
-; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <1 x i32> [[TMP6]] to i32
 ; CHECK-NEXT:    [[READ:%.*]] = call i32 @read_memory_only()
-; CHECK-NEXT:    [[PHI:%.*]] = select i1 [[COND]], i32 0, i32 [[READ]]
-; CHECK-NEXT:    ret i32 [[PHI]]
+; CHECK-NEXT:    [[TMP0:%.*]] = bitcast i1 [[COND]] to <1 x i1>
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i32 [[READ]] to <1 x i32>
+; CHECK-NEXT:    call void @llvm.masked.store.v1i32.p0(<1 x i32> [[TMP1]], ptr [[P:%.*]], i32 4, <1 x i1> [[TMP0]])
+; CHECK-NEXT:    store i32 1, ptr [[Q:%.*]], align 4
+; CHECK-NEXT:    ret void
 ;
 entry:
   %cond = icmp eq i32 %a, 0
   br i1 %cond, label %if.true, label %if.false
 
 if.false:
-  %read = call i32 @read_memory_only()
-  %0 = load i32, ptr %q
+  store i32 1, ptr %q
   br label %if.end
 
 if.true:
-  %1 = load i32, ptr %b
-  store i32 %1, ptr %p
-  br label %if.end
+  %read = call i32 @read_memory_only()
+  store i32 %read, ptr %p
+  br label %if.false
 
 if.end:
-  %phi = phi i32 [%read, %if.false], [0, %if.true]
-  ret i32 %phi
+  ret void
 }
 
 ;; Source of the load can be a GEP.
-;; TODO: Make the hoist happen.
 define i32 @load_from_gep(ptr %p)  {
 ; CHECK-LABEL: @load_from_gep(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[COND:%.*]] = icmp eq ptr [[P:%.*]], null
-; CHECK-NEXT:    br i1 [[COND]], label [[IF_TRUE:%.*]], label [[IF_FALSE:%.*]]
-; CHECK:       if.false:
 ; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 16
-; CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
-; CHECK-NEXT:    br label [[IF_TRUE]]
-; CHECK:       if.true:
-; CHECK-NEXT:    [[RES:%.*]] = phi i32 [ [[TMP0]], [[IF_FALSE]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    ret i32 [[RES]]
+; CHECK-NEXT:    [[TMP0:%.*]] = xor i1 [[COND]], true
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i1 [[TMP0]] to <1 x i1>
+; CHECK-NEXT:    [[TMP2:%.*]] = call <1 x i32> @llvm.masked.load.v1i32.p0(ptr [[ARRAYIDX]], i32 4, <1 x i1> [[TMP1]], <1 x i32> poison)
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast <1 x i32> [[TMP2]] to i32
+; CHECK-NEXT:    [[SPEC_SELECT:%.*]] = select i1 [[COND]], i32 0, i32 [[TMP3]]
+; CHECK-NEXT:    ret i32 [[SPEC_SELECT]]
 ;
 entry:
   %cond = icmp eq ptr %p, null
@@ -264,24 +238,21 @@ define i32 @not_cheap_to_hoist(i32 %a, ptr %b, ptr %p, ptr %q, i32 %v0, i32 %v1,
 ; CHECK-LABEL: @not_cheap_to_hoist(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[COND:%.*]] = icmp eq i32 [[A:%.*]], 0
-; CHECK-NEXT:    [[TMP0:%.*]] = bitcast i1 [[COND]] to <1 x i1>
-; CHECK-NEXT:    [[TMP1:%.*]] = call <1 x i32> @llvm.masked.load.v1i32.p0(ptr [[B:%.*]], i32 4, <1 x i1> [[TMP0]], <1 x i32> poison)
-; CHECK-NEXT:    [[TMP2:%.*]] = bitcast <1 x i32> [[TMP1]] to i32
-; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i32 [[TMP2]] to <1 x i32>
-; CHECK-NEXT:    call void @llvm.masked.store.v1i32.p0(<1 x i32> [[TMP3]], ptr [[P:%.*]], i32 4, <1 x i1> [[TMP0]])
-; CHECK-NEXT:    [[TMP4:%.*]] = xor i1 [[COND]], true
-; CHECK-NEXT:    [[TMP5:%.*]] = bitcast i1 [[TMP4]] to <1 x i1>
-; CHECK-NEXT:    call void @llvm.masked.store.v1i64.p0(<1 x i64> <i64 1>, ptr [[P]], i32 8, <1 x i1> [[TMP5]])
-; CHECK-NEXT:    call void @llvm.masked.store.v1i16.p0(<1 x i16> <i16 2>, ptr [[Q:%.*]], i32 2, <1 x i1> [[TMP5]])
-; CHECK-NEXT:    br i1 [[COND]], label [[COMMON_RET:%.*]], label [[IF_FALSE:%.*]]
+; CHECK-NEXT:    br i1 [[COND]], label [[IF_TRUE:%.*]], label [[IF_FALSE:%.*]]
 ; CHECK:       common.ret:
-; CHECK-NEXT:    [[COMMON_RET_OP:%.*]] = phi i32 [ [[VVVV:%.*]], [[IF_FALSE]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT:    [[COMMON_RET_OP:%.*]] = phi i32 [ [[VVVV:%.*]], [[IF_FALSE]] ], [ 0, [[IF_TRUE]] ]
 ; CHECK-NEXT:    ret i32 [[COMMON_RET_OP]]
 ; CHECK:       if.false:
+; CHECK-NEXT:    store i64 1, ptr [[P:%.*]], align 8
+; CHECK-NEXT:    store i16 2, ptr [[Q:%.*]], align 2
 ; CHECK-NEXT:    [[V:%.*]] = udiv i32 [[A]], 12345
 ; CHECK-NEXT:    [[VV:%.*]] = mul i32 [[V]], [[V0:%.*]]
 ; CHECK-NEXT:    [[VVV:%.*]] = mul i32 [[VV]], [[V1:%.*]]
 ; CHECK-NEXT:    [[VVVV]] = select i1 [[CC:%.*]], i32 [[V2:%.*]], i32 [[VVV]]
+; CHECK-NEXT:    br label [[COMMON_RET:%.*]]
+; CHECK:       if.true:
+; CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr [[B:%.*]], align 4
+; CHECK-NEXT:    store i32 [[TMP0]], ptr [[P]], align 4
 ; CHECK-NEXT:    br label [[COMMON_RET]]
 ;
 entry:
@@ -540,47 +511,12 @@ if.end:
   ret void
 }
 
-;; Not hoist there's a store and a memory read is skipped.
-define void @not_store_skip_memory_read(i32 %a, ptr %b, ptr %p, ptr %q) {
-; CHECK-LABEL: @not_store_skip_memory_read(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[COND:%.*]] = icmp eq i32 [[A:%.*]], 0
-; CHECK-NEXT:    br i1 [[COND]], label [[IF_TRUE:%.*]], label [[IF_FALSE:%.*]]
-; CHECK:       if.false:
-; CHECK-NEXT:    call void @read_memory_only()
-; CHECK-NEXT:    store i32 1, ptr [[Q:%.*]], align 4
-; CHECK-NEXT:    br label [[IF_END:%.*]]
-; CHECK:       if.true:
-; CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr [[B:%.*]], align 4
-; CHECK-NEXT:    store i32 [[TMP0]], ptr [[P:%.*]], align 4
-; CHECK-NEXT:    br label [[IF_END]]
-; CHECK:       if.end:
-; CHECK-NEXT:    ret void
-;
-entry:
-  %cond = icmp eq i32 %a, 0
-  br i1 %cond, label %if.true, label %if.false
-
-if.false:
-  call void @read_memory_only()
-  store i32 1, ptr %q, align 4
-  br label %if.end
-
-if.true:
-  %1 = load i32, ptr %b, align 4
-  store i32 %1, ptr %p, align 4
-  br label %if.end
-
-if.end:
-  ret void
-}
-
 ;; Not hoist if the branch is predictable and the `then` BB is not likely to execute.
 define void @not_likely_to_execute(ptr %p, ptr %q, i32 %a) {
 ; CHECK-LABEL: @not_likely_to_execute(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[TOBOOL:%.*]] = icmp ne i32 [[A:%.*]], 0
-; CHECK-NEXT:    br i1 [[TOBOOL]], label [[IF_THEN:%.*]], label [[IF_END:%.*]], !prof [[PROF13:![0-9]+]]
+; CHECK-NEXT:    br i1 [[TOBOOL]], label [[IF_THEN:%.*]], label [[IF_END:%.*]], !prof [[PROF0:![0-9]+]]
 ; CHECK:       if.end:
 ; CHECK-NEXT:    ret void
 ; CHECK:       if.then:
@@ -590,7 +526,7 @@ define void @not_likely_to_execute(ptr %p, ptr %q, i32 %a) {
 ;
 entry:
   %tobool = icmp ne i32 %a, 0
-  br i1 %tobool, label %if.then, label %if.end, !prof !14
+  br i1 %tobool, label %if.then, label %if.end, !prof !0
 
 if.end:
   ret void
@@ -601,23 +537,52 @@ if.then:
   br label %if.end
 }
 
+;; The hoist-loads-stores-with-cond-faulting optimization now runs in codegen,
+;; after sroa, so the allocas here have already been optimized away and the
+;; transform is not needed for this case. In the future, it may be moved before sroa.
+define void @not_alloca(ptr %p, ptr %q, i32 %a) {
+; CHECK-LABEL: @not_alloca(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[P_ADDR:%.*]] = alloca ptr, align 8
+; CHECK-NEXT:    [[Q_ADDR:%.*]] = alloca ptr, align 8
+; CHECK-NEXT:    [[A_ADDR:%.*]] = alloca i32, align 4
+; CHECK-NEXT:    store ptr [[P:%.*]], ptr [[P_ADDR]], align 8
+; CHECK-NEXT:    store ptr [[Q:%.*]], ptr [[Q_ADDR]], align 8
+; CHECK-NEXT:    store i32 [[A:%.*]], ptr [[A_ADDR]], align 4
+; CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr [[A_ADDR]], align 4
+; CHECK-NEXT:    [[TOBOOL:%.*]] = icmp ne i32 [[TMP0]], 0
+; CHECK-NEXT:    br i1 [[TOBOOL]], label [[IF_THEN:%.*]], label [[IF_END:%.*]]
+; CHECK:       if.then:
+; CHECK-NEXT:    [[TMP1:%.*]] = load ptr, ptr [[Q_ADDR]], align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = load i32, ptr [[TMP1]], align 4
+; CHECK-NEXT:    [[TMP3:%.*]] = load ptr, ptr [[P_ADDR]], align 8
+; CHECK-NEXT:    store i32 [[TMP2]], ptr [[TMP3]], align 4
+; CHECK-NEXT:    br label [[IF_END]]
+; CHECK:       if.end:
+; CHECK-NEXT:    ret void
+;
+entry:
+  %p.addr = alloca ptr
+  %q.addr = alloca ptr
+  %a.addr = alloca i32
+  store ptr %p, ptr %p.addr
+  store ptr %q, ptr %q.addr
+  store i32 %a, ptr %a.addr
+  %0 = load i32, ptr %a.addr
+  %tobool = icmp ne i32 %0, 0
+  br i1 %tobool, label %if.then, label %if.end
+
+if.then:
+  %1 = load ptr, ptr %q.addr
+  %2 = load i32, ptr %1
+  %3 = load ptr, ptr %p.addr
+  store i32 %2, ptr %3
+  br label %if.end
+
+if.end:
+  ret void
+}
+
 declare i32 @read_memory_only() readonly nounwind willreturn speculatable
 
-!llvm.dbg.cu = !{!0}
-!llvm.module.flags = !{!2, !3, !4, !5, !6, !7}
-
-!0 = distinct !DICompileUnit(language: DW_LANG_C11, file: !1, producer: "clang", isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly, splitDebugInlining: false, nameTableKind: None)
-!1 = !DIFile(filename: "test.c", directory: "/tmp")
-!2 = !{i32 7, !"Dwarf Version", i32 5}
-!3 = !{i32 2, !"Debug Info Version", i32 3}
-!4 = !{i32 1, !"wchar_size", i32 4}
-!5 = !{i32 8, !"PIC Level", i32 2}
-!6 = !{i32 7, !"PIE Level", i32 2}
-!7 = !{i32 7, !"uwtable", i32 2}
-!8 = !DILocation(line: 1, column: 2, scope: !10)
-!9 = !DILocation(line: 1, column: 3, scope: !10)
-!10 = distinct !DISubprogram(name: "basic", scope: !1, file: !1, line: 1, type: !11, scopeLine: 1, flags: DIFlagPrototyped, unit: !0)
-!11 = !DISubroutineType(types: !12)
-!12 = !{}
-!13 = distinct !DIAssignID()
-!14 = !{!"branch_weights", i32 1, i32 99}
+!0 = !{!"branch_weights", i32 1, i32 99}

>From 75a84ba4279c72198e2d23a084383461c15554b9 Mon Sep 17 00:00:00 2001
From: Shengchen Kan <shengchen.kan at intel.com>
Date: Sun, 4 Aug 2024 00:43:41 +0800
Subject: [PATCH 3/3] add TODO

---
 .../X86/hoist-loads-stores-with-cf.ll         | 61 ++++++++++---------
 1 file changed, 31 insertions(+), 30 deletions(-)

diff --git a/llvm/test/Transforms/SimplifyCFG/X86/hoist-loads-stores-with-cf.ll b/llvm/test/Transforms/SimplifyCFG/X86/hoist-loads-stores-with-cf.ll
index 9e4e76bc9d9f3..592221b23d1d2 100644
--- a/llvm/test/Transforms/SimplifyCFG/X86/hoist-loads-stores-with-cf.ll
+++ b/llvm/test/Transforms/SimplifyCFG/X86/hoist-loads-stores-with-cf.ll
@@ -120,36 +120,6 @@ if.end:
   ret void
 }
 
-;; Both of successor 0 and successor 1 have a single predecessor.
-define void @single_predecessor(ptr %p, ptr %q, i32 %a) {
-; CHECK-LABEL: @single_predecessor(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TOBOOL:%.*]] = icmp ne i32 [[A:%.*]], 0
-; CHECK-NEXT:    br i1 [[TOBOOL]], label [[IF_END:%.*]], label [[IF_THEN:%.*]]
-; CHECK:       common.ret:
-; CHECK-NEXT:    ret void
-; CHECK:       if.end:
-; CHECK-NEXT:    store i32 1, ptr [[Q:%.*]], align 4
-; CHECK-NEXT:    br label [[COMMON_RET:%.*]]
-; CHECK:       if.then:
-; CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr [[Q]], align 4
-; CHECK-NEXT:    store i32 [[TMP0]], ptr [[P:%.*]], align 4
-; CHECK-NEXT:    br label [[COMMON_RET]]
-;
-entry:
-  %tobool = icmp ne i32 %a, 0
-  br i1 %tobool, label %if.end, label %if.then
-
-if.end:
-  store i32 1, ptr %q
-  ret void
-
-if.then:
-  %0 = load i32, ptr %q
-  store i32 %0, ptr %p
-  ret void
-}
-
 ;; Load after store can be hoisted.
 define i64 @load_after_store(i32 %a, ptr %b, ptr %p) {
 ; CHECK-LABEL: @load_after_store(
@@ -233,6 +203,37 @@ if.true:
   ret i32 %res
 }
 
+;; Both of successor 0 and successor 1 have a single predecessor.
+;; TODO: Support the transform for this case.
+define void @single_predecessor(ptr %p, ptr %q, i32 %a) {
+; CHECK-LABEL: @single_predecessor(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TOBOOL:%.*]] = icmp ne i32 [[A:%.*]], 0
+; CHECK-NEXT:    br i1 [[TOBOOL]], label [[IF_END:%.*]], label [[IF_THEN:%.*]]
+; CHECK:       common.ret:
+; CHECK-NEXT:    ret void
+; CHECK:       if.end:
+; CHECK-NEXT:    store i32 1, ptr [[Q:%.*]], align 4
+; CHECK-NEXT:    br label [[COMMON_RET:%.*]]
+; CHECK:       if.then:
+; CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr [[Q]], align 4
+; CHECK-NEXT:    store i32 [[TMP0]], ptr [[P:%.*]], align 4
+; CHECK-NEXT:    br label [[COMMON_RET]]
+;
+entry:
+  %tobool = icmp ne i32 %a, 0
+  br i1 %tobool, label %if.end, label %if.then
+
+if.end:
+  store i32 1, ptr %q
+  ret void
+
+if.then:
+  %0 = load i32, ptr %q
+  store i32 %0, ptr %p
+  ret void
+}
+
 ;; Not do hoist if the cost of instructions to be hoisted is expensive.
 define i32 @not_cheap_to_hoist(i32 %a, ptr %b, ptr %p, ptr %q, i32 %v0, i32 %v1, i32 %v2, i1 %cc) {
 ; CHECK-LABEL: @not_cheap_to_hoist(



More information about the llvm-commits mailing list