[llvm] [X86] Support hoisting load/store with conditional faulting (PR #95515)

Wed Jun 19 19:16:21 PDT 2024

https://github.com/KanRobert updated https://github.com/llvm/llvm-project/pull/95515

>From 36622803bba7ec7e1365ec3a2813b290cbfbc7bd Mon Sep 17 00:00:00 2001
From: Shengchen Kan <shengchen.kan at intel.com>
Date: Wed, 5 Jun 2024 15:04:27 +0800
Subject: [PATCH 1/2] [X86] Support hoisting load/store with conditional
 faulting

1. Add TTI interface for conditional load/store.
2. Hoist load/store from successors with masked load/store if
   the targets support conditional faulting.
3. Mark 1 x i16/i32/i64 masked load/store legal so that it's not
   legalized in pass scalarize-masked-mem-intrin.
3. Visit 1 x i16/i32/i64 masked load/store to build a scalarized
   masked load/store node to avoid error in
   DAGTypeLegalizer::ScalarizeVectorResult.
4. Lower scalarized masked load/store to CFCMOV.
---
 .../llvm/Analysis/TargetTransformInfo.h       |   8 +
 .../llvm/Analysis/TargetTransformInfoImpl.h   |   1 +
 llvm/include/llvm/CodeGen/TargetLowering.h    |   6 +
 llvm/lib/Analysis/TargetTransformInfo.cpp     |   5 +
 llvm/lib/CodeGen/CodeGenPrepare.cpp           |   4 +
 llvm/lib/Target/X86/X86ISelLowering.cpp       |  18 +
 llvm/lib/Target/X86/X86ISelLowering.h         |   2 +
 .../lib/Target/X86/X86TargetTransformInfo.cpp |  32 +-
 llvm/lib/Target/X86/X86TargetTransformInfo.h  |   1 +
 llvm/lib/Transforms/Utils/SimplifyCFG.cpp     | 185 ++++++++-
 llvm/test/CodeGen/X86/apx/cf.ll               |  15 +
 .../X86/hoist-load-store-with-cf.ll           | 357 ++++++++++++++++++
 12 files changed, 627 insertions(+), 7 deletions(-)
 create mode 100644 llvm/test/CodeGen/X86/apx/cf.ll
 create mode 100644 llvm/test/Transforms/SimplifyCFG/X86/hoist-load-store-with-cf.ll

diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h
index f55f21c94a85a..37afda39a1c9c 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfo.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h
@@ -1113,6 +1113,10 @@ class TargetTransformInfo {
   /// \return the number of registers in the target-provided register class.
   unsigned getNumberOfRegisters(unsigned ClassID) const;
 
+  /// \return true if the target supports load/store that enables fault
+  /// suppression of memory operands when the source condition is false.
+  bool hasConditionalFaultingLoadStoreForType(Type *Ty) const;
+
   /// \return the target-provided register class ID for the provided type,
   /// accounting for type promotion and other type-legalization techniques that
   /// the target might apply. However, it specifically does not account for the
@@ -1956,6 +1960,7 @@ class TargetTransformInfo::Concept {
   virtual bool preferToKeepConstantsAttached(const Instruction &Inst,
                                              const Function &Fn) const = 0;
   virtual unsigned getNumberOfRegisters(unsigned ClassID) const = 0;
+  virtual bool hasConditionalFaultingLoadStoreForType(Type *Ty) const = 0;
   virtual unsigned getRegisterClassForType(bool Vector,
                                            Type *Ty = nullptr) const = 0;
   virtual const char *getRegisterClassName(unsigned ClassID) const = 0;
@@ -2543,6 +2548,9 @@ class TargetTransformInfo::Model final : public TargetTransformInfo::Concept {
   unsigned getNumberOfRegisters(unsigned ClassID) const override {
     return Impl.getNumberOfRegisters(ClassID);
   }
+  bool hasConditionalFaultingLoadStoreForType(Type *Ty) const override {
+    return Impl.hasConditionalFaultingLoadStoreForType(Ty);
+  }
   unsigned getRegisterClassForType(bool Vector,
                                    Type *Ty = nullptr) const override {
     return Impl.getRegisterClassForType(Vector, Ty);
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
index 7828bdc1f1f43..a4aa836ed82d3 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -457,6 +457,7 @@ class TargetTransformInfoImplBase {
   }
 
   unsigned getNumberOfRegisters(unsigned ClassID) const { return 8; }
+  bool hasConditionalFaultingLoadStoreForType(Type *Ty) const { return false; }
 
   unsigned getRegisterClassForType(bool Vector, Type *Ty = nullptr) const {
     return Vector ? 1 : 0;
diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h
index 06f7ee2a589c8..4a58513958ccf 100644
--- a/llvm/include/llvm/CodeGen/TargetLowering.h
+++ b/llvm/include/llvm/CodeGen/TargetLowering.h
@@ -2161,6 +2161,12 @@ class TargetLoweringBase {
     return false;
   }
 
+  /// Translate a masked load intrinsic to its target-specific instrinsic.
+  virtual bool scalarizeMaskedLoad(CallInst *CI) const { return false; }
+
+  /// Translate a masked store intrinsic to its target-specific instrinsic.
+  virtual bool scalarizeMaskedStore(CallInst *CI) const { return false; }
+
   /// Perform a load-linked operation on Addr, returning a "Value *" with the
   /// corresponding pointee type. This may entail some non-trivial operations to
   /// truncate or reconstruct types that will be illegal in the backend. See
diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp
index 7e721cbc87f3f..46936f266bf46 100644
--- a/llvm/lib/Analysis/TargetTransformInfo.cpp
+++ b/llvm/lib/Analysis/TargetTransformInfo.cpp
@@ -722,6 +722,11 @@ unsigned TargetTransformInfo::getNumberOfRegisters(unsigned ClassID) const {
   return TTIImpl->getNumberOfRegisters(ClassID);
 }
 
+bool TargetTransformInfo::hasConditionalFaultingLoadStoreForType(
+    Type *Ty) const {
+  return TTIImpl->hasConditionalFaultingLoadStoreForType(Ty);
+}
+
 unsigned TargetTransformInfo::getRegisterClassForType(bool Vector,
                                                       Type *Ty) const {
   return TTIImpl->getRegisterClassForType(Vector, Ty);
diff --git a/llvm/lib/CodeGen/CodeGenPrepare.cpp b/llvm/lib/CodeGen/CodeGenPrepare.cpp
index f8fdba2e35dd7..4205c02d4ddae 100644
--- a/llvm/lib/CodeGen/CodeGenPrepare.cpp
+++ b/llvm/lib/CodeGen/CodeGenPrepare.cpp
@@ -2526,6 +2526,10 @@ bool CodeGenPrepare::optimizeCallInst(CallInst *CI, ModifyDT &ModifiedDT) {
       return optimizeGatherScatterInst(II, II->getArgOperand(0));
     case Intrinsic::masked_scatter:
       return optimizeGatherScatterInst(II, II->getArgOperand(1));
+    case Intrinsic::masked_load:
+      return TLI->scalarizeMaskedLoad(II);
+    case Intrinsic::masked_store:
+      return TLI->scalarizeMaskedStore(II);
     }
 
     SmallVector<Value *, 2> PtrOps;
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index f27c935812f51..2b9e8c139b1cd 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -30672,6 +30672,24 @@ X86TargetLowering::shouldExpandLogicAtomicRMWInIR(AtomicRMWInst *AI) const {
              : AtomicExpansionKind::CmpXChg;
 }
 
+bool X86TargetLowering::scalarizeMaskedLoad(CallInst *CI) const {
+  // Transform <1 x ty> masked load to x86-specific ty masked load
+  // b/c the former only accepts vector op, otherwsie
+  // DAGTypeLegalizer::ScalarizeVectorResult would crash.
+  //
+  // FIXME: Extend masked load for scalar op and remove this.
+  return false;
+}
+
+bool X86TargetLowering::scalarizeMaskedStore(CallInst *CI) const {
+  // Transform <1 x ty> masked store to x86-specific ty masked store
+  // b/c the former only accepts vector op, otherwsie
+  // DAGTypeLegalizer::ScalarizeVectorResult would crash.
+  //
+  // FIXME: Extend masked store for scalar op and remove this.
+  return false;
+}
+
 void X86TargetLowering::emitBitTestAtomicRMWIntrinsic(AtomicRMWInst *AI) const {
   IRBuilder<> Builder(AI);
   Builder.CollectMetadataToCopy(AI, {LLVMContext::MD_pcsections});
diff --git a/llvm/lib/Target/X86/X86ISelLowering.h b/llvm/lib/Target/X86/X86ISelLowering.h
index 3c5c903bc0d98..59b03c9f87c95 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.h
+++ b/llvm/lib/Target/X86/X86ISelLowering.h
@@ -1737,6 +1737,8 @@ namespace llvm {
     shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const override;
     TargetLoweringBase::AtomicExpansionKind
     shouldExpandLogicAtomicRMWInIR(AtomicRMWInst *AI) const;
+    bool scalarizeMaskedLoad(CallInst *CI) const override;
+    bool scalarizeMaskedStore(CallInst *CI) const override;
     void emitBitTestAtomicRMWIntrinsic(AtomicRMWInst *AI) const override;
     void emitCmpArithAtomicRMWIntrinsic(AtomicRMWInst *AI) const override;
 
diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
index de0144331dba3..1100be925b127 100644
--- a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
+++ b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
@@ -176,6 +176,23 @@ unsigned X86TTIImpl::getNumberOfRegisters(unsigned ClassID) const {
   return 8;
 }
 
+bool X86TTIImpl::hasConditionalFaultingLoadStoreForType(Type *Ty) const {
+  // Conditional faulting is supported by CFCMOV, which only accepts
+  // 16/32/64-bit operands.
+  // NOTE: Though VMOVSS/VMOVSD suppresses memory fault with zero mask, it has
+  // performance penalty.
+  if (!ST->hasCF() || !Ty || !Ty->isIntegerTy())
+    return false;
+  switch (cast<IntegerType>(Ty)->getBitWidth()) {
+  default:
+    return false;
+  case 16:
+  case 32:
+  case 64:
+    return true;
+  }
+}
+
 TypeSize
 X86TTIImpl::getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const {
   unsigned PreferVectorWidth = ST->getPreferVectorWidth();
@@ -5891,14 +5908,21 @@ bool X86TTIImpl::canMacroFuseCmp() {
 }
 
 bool X86TTIImpl::isLegalMaskedLoad(Type *DataTy, Align Alignment) {
+  bool IsSingleElementVector =
+      isa<VectorType>(DataTy) &&
+      cast<FixedVectorType>(DataTy)->getNumElements() == 1;
+  Type *ScalarTy = DataTy->getScalarType();
+
+  if (ST->hasCF() && IsSingleElementVector &&
+      hasConditionalFaultingLoadStoreForType(ScalarTy))
+    return true;
+
   if (!ST->hasAVX())
     return false;
 
-  // The backend can't handle a single element vector.
-  if (isa<VectorType>(DataTy) &&
-      cast<FixedVectorType>(DataTy)->getNumElements() == 1)
+  // The backend can't handle a single element vector w/o CFCMOV.
+  if (IsSingleElementVector)
     return false;
-  Type *ScalarTy = DataTy->getScalarType();
 
   if (ScalarTy->isPointerTy())
     return true;
diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.h b/llvm/lib/Target/X86/X86TargetTransformInfo.h
index e14dc9fc09051..701648c6a2b3a 100644
--- a/llvm/lib/Target/X86/X86TargetTransformInfo.h
+++ b/llvm/lib/Target/X86/X86TargetTransformInfo.h
@@ -132,6 +132,7 @@ class X86TTIImpl : public BasicTTIImplBase<X86TTIImpl> {
   /// @{
 
   unsigned getNumberOfRegisters(unsigned ClassID) const;
+  bool hasConditionalFaultingLoadStoreForType(Type *Ty) const;
   TypeSize getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const;
   unsigned getLoadStoreVecRegBitWidth(unsigned AS) const;
   unsigned getMaxInterleaveFactor(ElementCount VF);
diff --git a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
index 4e2dc7f2b2f4e..0b1ec69237f81 100644
--- a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
+++ b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
@@ -131,6 +131,12 @@ static cl::opt<bool> HoistCondStores(
     "simplifycfg-hoist-cond-stores", cl::Hidden, cl::init(true),
     cl::desc("Hoist conditional stores if an unconditional store precedes"));
 
+static cl::opt<bool> HoistLoadsStoresWithCondFaulting(
+    "simplifycfg-hoist-loads-stores-with-cond-faulting", cl::Hidden,
+    cl::init(true),
+    cl::desc("Hoist loads/stores if the target supports "
+             "conditional faulting"));
+
 static cl::opt<bool> MergeCondStores(
     "simplifycfg-merge-cond-stores", cl::Hidden, cl::init(true),
     cl::desc("Hoist conditional stores even if an unconditional store does not "
@@ -275,6 +281,7 @@ class SimplifyCFGOpt {
   bool hoistSuccIdenticalTerminatorToSwitchOrIf(
       Instruction *TI, Instruction *I1,
       SmallVectorImpl<Instruction *> &OtherSuccTIs);
+  bool hoistLoadStoreWithCondFaultingFromSuccessors(BasicBlock *BB);
   bool SpeculativelyExecuteBB(BranchInst *BI, BasicBlock *ThenBB);
   bool SimplifyTerminatorOnSelect(Instruction *OldTerm, Value *Cond,
                                   BasicBlock *TrueBB, BasicBlock *FalseBB,
@@ -2960,6 +2967,172 @@ static bool validateAndCostRequiredSelects(BasicBlock *BB, BasicBlock *ThenBB,
   return HaveRewritablePHIs;
 }
 
+/// Hoist load/store instructions from the conditional successor blocks up into
+/// the block.
+///
+/// We are looking for code like the following:
+/// \code
+///   BB:
+///     ...
+///     %cond = icmp ult %x, %y
+///     br i1 %cond, label %TrueBB, label %FalseBB
+///   FalseBB:
+///     store i32 1, ptr %q, align 4
+///     ...
+///   TrueBB:
+///     %0 = load i32, ptr %b, align 4
+///     store i32 %0, ptr %p, align 4
+///     ...
+/// \endcode
+//
+/// We are going to transform this into:
+///
+/// \code
+///   BB:
+///     ...
+///     %cond = icmp ult %x, %y
+///     %0 = cload i32, ptr %b, %cond
+///     cstore i32 %0, ptr %p, %cond
+///     cstore i32 1, ptr %q, ~%cond
+///     br i1 %cond, label %TrueBB, label %FalseBB
+///   FalseBB:
+///     ...
+///   TrueBB:
+///     ...
+/// \endcode
+///
+/// where cload/cstore is represented by intrinsic like llvm.masked.load/store,
+/// e.g.
+///
+/// \code
+///   %vcond = bitcast i1 %cond to <1 x i1>
+///   %v0 = call <1 x i32> @llvm.masked.load.v1i32.p0
+///                         (ptr %b, i32 4, <1 x i1> %vcond, <1 x i32> poison)
+///   %0 = bitcast <1 x i32> %v0 to i32
+///   call void @llvm.masked.store.v1i32.p0
+//                          (<1 x i32> %v0, ptr %p, i32 4, <1 x i1> %vcond)
+///   %cond.not = xor i1 %cond, true
+///   %vcond.not = bitcast i1 %cond.not to <1 x i>
+///   call void @llvm.masked.store.v1i32.p0
+///              (<1 x i32> <i32 1>, ptr %q, i32 4, <1x i1> %vcond.not)
+/// \endcode
+///
+/// \returns true if any load/store is hosited.
+///
+/// Note that this tranform should be run
+/// * before SpeculativelyExecuteBB so that the latter can have more chance.
+/// * after hoistCommonCodeFromSuccessors to ensure unconditional loads/stores
+///   are handled first.
+bool SimplifyCFGOpt::hoistLoadStoreWithCondFaultingFromSuccessors(
+    BasicBlock *BB) {
+  auto *BI = dyn_cast<BranchInst>(BB->getTerminator());
+  if (!BI || !BI->isConditional())
+    return false;
+
+  BasicBlock *IfTrueBB = BI->getSuccessor(0);
+  BasicBlock *IfFalseBB = BI->getSuccessor(1);
+
+  // If either of the blocks has it's address taken, then we can't do this fold,
+  // because the code we'd hoist would no longer run when we jump into the block
+  // by it's address.
+  for (auto *Succ : {IfTrueBB, IfFalseBB})
+    if (Succ->hasAddressTaken() || !Succ->getSinglePredecessor())
+      return false;
+
+  // Collect hoisted loads/stores.
+  SmallSetVector<Instruction *, 4> HoistedInsts;
+  // Not hoist load/store if
+  // 1. target does not have corresponding conditional faulting load/store.
+  // 2. it's volatile or atomic.
+  // 3. there is a load/store that can not be hoisted in the same bb.
+  // 4. there is a non-load/store that may have side effects in the same bb.
+  // 5. any operand of it does not dominate the branch.
+  // 6. it's a store and a memory read is skipped.
+  auto HoistInstsInBB = [&](BasicBlock *BB) {
+    bool SkipMemoryRead = false;
+    // A more efficient way to check domination. An operand dominates the
+    // BranchInst if
+    // 1. it's not defined in the same bb as the instruction.
+    // 2. it's to be hoisted.
+    //
+    // b/c BB is only predecessor and BranchInst does not define any value.
+    auto OpsDominatesBranch = [&](Instruction &I) {
+      return llvm::none_of(I.operands(), [&](Value *Op) {
+        if (auto *J = dyn_cast<Instruction>(Op)) {
+          if (HoistedInsts.contains(J))
+            return false;
+          if (J->getParent() == I.getParent())
+            return true;
+        }
+        return false;
+      });
+    };
+    for (auto &I : *BB) {
+      auto *LI = dyn_cast<LoadInst>(&I);
+      auto *SI = dyn_cast<StoreInst>(&I);
+      if (LI || SI) {
+        auto *Type = LI ? I.getType() : I.getOperand(0)->getType();
+        bool IsSimple = (LI && LI->isSimple()) || (SI && SI->isSimple());
+        if (!TTI.hasConditionalFaultingLoadStoreForType(Type) || !IsSimple ||
+            !OpsDominatesBranch(I))
+          return false;
+        if (SI && SkipMemoryRead)
+          return false;
+        HoistedInsts.insert(&I);
+      } else if (I.mayHaveSideEffects())
+        return false;
+      else if (I.mayReadFromMemory())
+        SkipMemoryRead = true;
+    }
+    return true;
+  };
+
+  if (!HoistInstsInBB(IfTrueBB) || !HoistInstsInBB(IfFalseBB) ||
+      HoistedInsts.empty())
+    return false;
+
+  // Put newly added instructions before the BranchInst.
+  IRBuilder<> Builder(BI);
+  auto &Context = BB->getContext();
+  auto *VCondTy = FixedVectorType::get(Type::getInt1Ty(Context), 1);
+  auto *Cond = BI->getOperand(0);
+  auto *VCond = Builder.CreateBitCast(Cond, VCondTy);
+  Value *VCondNot = nullptr;
+  for (auto *I : HoistedInsts) {
+    bool InvertCond = I->getParent() == IfFalseBB;
+    // Construct the inverted condition if need.
+    if (InvertCond && !VCondNot)
+      VCondNot = Builder.CreateBitCast(
+          Builder.CreateXor(Cond, ConstantInt::getTrue(Context)), VCondTy);
+
+    auto *Mask = InvertCond ? VCondNot : VCond;
+    auto *Op0 = I->getOperand(0);
+    if (auto *LI = dyn_cast<LoadInst>(I)) {
+      // Load
+      auto *Ty = I->getType();
+      auto *V0 = Builder.CreateMaskedLoad(FixedVectorType::get(Ty, 1), Op0,
+                                          LI->getAlign(), Mask);
+      auto *S0 = Builder.CreateBitCast(V0, Ty);
+      V0->copyMetadata(*I);
+      I->replaceAllUsesWith(S0);
+    } else {
+      // Store
+      auto *StoredVal =
+          Builder.CreateBitCast(Op0, FixedVectorType::get(Op0->getType(), 1));
+      auto *VStore = Builder.CreateMaskedStore(
+          StoredVal, I->getOperand(1), cast<StoreInst>(I)->getAlign(), Mask);
+      VStore->copyMetadata(*I);
+    }
+  }
+
+  // Erase the hoisted instrutions in reverse order to avoid use-w/o-define
+  // error.
+  std::for_each(HoistedInsts.rbegin(), HoistedInsts.rend(),
+                [](auto I) { I->eraseFromParent(); });
+
+  return true;
+}
+
 /// Speculate a conditional basic block flattening the CFG.
 ///
 /// Note that this is a very risky transform currently. Speculating
@@ -7419,14 +7592,20 @@ bool SimplifyCFGOpt::simplifyCondBranch(BranchInst *BI, IRBuilder<> &Builder) {
     return requestResimplify();
 
   // We have a conditional branch to two blocks that are only reachable
-  // from BI.  We know that the condbr dominates the two blocks, so see if
-  // there is any identical code in the "then" and "else" blocks.  If so, we
-  // can hoist it up to the branching block.
+  // from BI.  We know that the condbr dominates the two blocks, so see
+  //
+  // * if there is any identical code in the "then" and "else" blocks.
+  // * if there is any different load/store in the "then" and "else" blocks.
+  //
+  // If so, we can hoist it up to the branching block.
   if (BI->getSuccessor(0)->getSinglePredecessor()) {
     if (BI->getSuccessor(1)->getSinglePredecessor()) {
       if (HoistCommon && hoistCommonCodeFromSuccessors(
                              BI->getParent(), !Options.HoistCommonInsts))
         return requestResimplify();
+      if (HoistLoadsStoresWithCondFaulting &&
+          hoistLoadStoreWithCondFaultingFromSuccessors(BI->getParent()))
+        return requestResimplify();
     } else {
       // If Successor #1 has multiple preds, we may be able to conditionally
       // execute Successor #0 if it branches to Successor #1.
diff --git a/llvm/test/CodeGen/X86/apx/cf.ll b/llvm/test/CodeGen/X86/apx/cf.ll
new file mode 100644
index 0000000000000..df0333e236817
--- /dev/null
+++ b/llvm/test/CodeGen/X86/apx/cf.ll
@@ -0,0 +1,15 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64 -mattr=+cf -verify-machineinstrs | FileCheck %s
+
+define void @basic(i32 %a, ptr %b, ptr %p, ptr %q) {
+entry:
+  %cond = icmp eq i32 %a, 0
+  %0 = bitcast i1 %cond to <1 x i1>
+  %1 = call <1 x i32> @llvm.masked.load.v1i32.p0(ptr %b, i32 4, <1 x i1> %0, <1 x i32> poison)
+  call void @llvm.masked.store.v1i32.p0(<1 x i32> %1, ptr %p, i32 4, <1 x i1> %0)
+  %2 = xor i1 %cond, true
+  %3 = bitcast i1 %2 to <1 x i1>
+  call void @llvm.masked.store.v1i64.p0(<1 x i64> <i64 1>, ptr %p, i32 8, <1 x i1> %3)
+  call void @llvm.masked.store.v1i16.p0(<1 x i16> <i16 2>, ptr %q, i32 8, <1 x i1> %3)
+  ret void
+}
diff --git a/llvm/test/Transforms/SimplifyCFG/X86/hoist-load-store-with-cf.ll b/llvm/test/Transforms/SimplifyCFG/X86/hoist-load-store-with-cf.ll
new file mode 100644
index 0000000000000..9c8c7c2ad6597
--- /dev/null
+++ b/llvm/test/Transforms/SimplifyCFG/X86/hoist-load-store-with-cf.ll
@@ -0,0 +1,357 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -mtriple=x86_64 -mattr=+cf -passes=simplifycfg -simplifycfg-require-and-preserve-domtree=1 -S -simplifycfg-hoist-loads-stores-with-cond-faulting=true | FileCheck %s
+
+;; The redundant bitcast + insertelement will be opimized out in instcombine pass.
+define void @basic(i32 %a, ptr %b, ptr %p, ptr %q) {
+; CHECK-LABEL: @basic(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[COND:%.*]] = icmp eq i32 [[A:%.*]], 0
+; CHECK-NEXT:    [[TMP0:%.*]] = bitcast i1 [[COND]] to <1 x i1>
+; CHECK-NEXT:    [[TMP1:%.*]] = call <1 x i32> @llvm.masked.load.v1i32.p0(ptr [[B:%.*]], i32 4, <1 x i1> [[TMP0]], <1 x i32> poison), !dbg [[DBG8:![0-9]+]]
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast <1 x i32> [[TMP1]] to i32
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i32 [[TMP2]] to <1 x i32>
+; CHECK-NEXT:    call void @llvm.masked.store.v1i32.p0(<1 x i32> [[TMP3]], ptr [[P:%.*]], i32 4, <1 x i1> [[TMP0]])
+; CHECK-NEXT:    [[TMP4:%.*]] = xor i1 [[COND]], true
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast i1 [[TMP4]] to <1 x i1>
+; CHECK-NEXT:    call void @llvm.masked.store.v1i64.p0(<1 x i64> <i64 1>, ptr [[P]], i32 8, <1 x i1> [[TMP5]]), !dbg [[DBG12:![0-9]+]]
+; CHECK-NEXT:    call void @llvm.masked.store.v1i16.p0(<1 x i16> <i16 2>, ptr [[Q:%.*]], i32 8, <1 x i1> [[TMP5]]), !dbg [[DBG12]]
+; CHECK-NEXT:    ret void
+;
+entry:
+  %cond = icmp eq i32 %a, 0
+  br i1 %cond, label %if.true, label %if.false
+
+if.false:
+  store i64 1, ptr %p, align 8, !dbg !8
+  store i16 2, ptr %q, align 8, !dbg !8
+  br label %if.end
+
+if.true:
+  %0 = load i32, ptr %b, align 4,  !dbg !9
+  store i32 %0, ptr %p, align 4
+  br label %if.end
+
+if.end:
+  ret void
+}
+
+define void @not_supported_type(i8 %a, ptr %b, ptr %p, ptr %q) {
+; CHECK-LABEL: @not_supported_type(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[COND:%.*]] = icmp eq i8 [[A:%.*]], 0
+; CHECK-NEXT:    br i1 [[COND]], label [[IF_TRUE:%.*]], label [[IF_FALSE:%.*]]
+; CHECK:       if.false:
+; CHECK-NEXT:    store i8 1, ptr [[Q:%.*]], align 1
+; CHECK-NEXT:    br label [[IF_END:%.*]]
+; CHECK:       if.true:
+; CHECK-NEXT:    [[TMP0:%.*]] = load i8, ptr [[B:%.*]], align 1
+; CHECK-NEXT:    store i8 [[TMP0]], ptr [[P:%.*]], align 1
+; CHECK-NEXT:    br label [[IF_END]]
+; CHECK:       if.end:
+; CHECK-NEXT:    ret void
+;
+entry:
+  %cond = icmp eq i8 %a, 0
+  br i1 %cond, label %if.true, label %if.false
+
+if.false:
+  store i8 1, ptr %q
+  br label %if.end
+
+if.true:
+  %0 = load i8, ptr %b
+  store i8 %0, ptr %p
+  br label %if.end
+
+if.end:
+  ret void
+}
+
+define void @not_br_terminator(i32 %a, ptr %b, ptr %p, ptr %q) {
+; CHECK-LABEL: @not_br_terminator(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    switch i32 [[A:%.*]], label [[IF_END:%.*]] [
+; CHECK-NEXT:      i32 1, label [[IF_FALSE:%.*]]
+; CHECK-NEXT:      i32 2, label [[IF_TRUE:%.*]]
+; CHECK-NEXT:    ]
+; CHECK:       if.false:
+; CHECK-NEXT:    store i32 1, ptr [[Q:%.*]], align 4
+; CHECK-NEXT:    br label [[IF_END]]
+; CHECK:       if.true:
+; CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr [[B:%.*]], align 4
+; CHECK-NEXT:    store i32 [[TMP0]], ptr [[P:%.*]], align 4
+; CHECK-NEXT:    br label [[IF_FALSE]]
+; CHECK:       if.end:
+; CHECK-NEXT:    ret void
+;
+entry:
+  switch i32 %a, label %if.end [
+  i32 1, label %if.false
+  i32 2, label %if.true
+  ]
+
+if.false:
+  store i32 1, ptr %q, align 4
+  br label %if.end
+
+if.true:
+  %0 = load i32, ptr %b, align 4
+  store i32 %0, ptr %p, align 4
+  br label %if.false
+
+if.end:
+  ret void
+}
+
+define void @not_single_predecessor(i32 %a, ptr %b, ptr %p, ptr %q) {
+; CHECK-LABEL: @not_single_predecessor(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[COND:%.*]] = icmp eq i32 [[A:%.*]], 0
+; CHECK-NEXT:    br i1 [[COND]], label [[IF_TRUE:%.*]], label [[IF_FALSE:%.*]]
+; CHECK:       if.false:
+; CHECK-NEXT:    store i32 1, ptr [[Q:%.*]], align 4
+; CHECK-NEXT:    ret void
+; CHECK:       if.true:
+; CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr [[B:%.*]], align 4
+; CHECK-NEXT:    store i32 [[TMP0]], ptr [[P:%.*]], align 4
+; CHECK-NEXT:    br label [[IF_FALSE]]
+;
+entry:
+  %cond = icmp eq i32 %a, 0
+  br i1 %cond, label %if.true, label %if.false
+
+if.false:
+  store i32 1, ptr %q, align 4
+  br label %if.end
+
+if.true:
+  %0 = load i32, ptr %b, align 4
+  store i32 %0, ptr %p, align 4
+  br label %if.false
+
+if.end:
+  ret void
+}
+
+define void @not_simple(i32 %a, ptr %b, ptr %p, ptr %q) {
+; CHECK-LABEL: @not_simple(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[COND:%.*]] = icmp eq i32 [[A:%.*]], 0
+; CHECK-NEXT:    br i1 [[COND]], label [[IF_TRUE:%.*]], label [[IF_FALSE:%.*]]
+; CHECK:       if.false:
+; CHECK-NEXT:    store atomic i32 1, ptr [[Q:%.*]] seq_cst, align 4
+; CHECK-NEXT:    br label [[IF_END:%.*]]
+; CHECK:       if.true:
+; CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr [[B:%.*]], align 4
+; CHECK-NEXT:    store i32 [[TMP0]], ptr [[P:%.*]], align 4
+; CHECK-NEXT:    br label [[IF_END]]
+; CHECK:       if.end:
+; CHECK-NEXT:    ret void
+;
+entry:
+  %cond = icmp eq i32 %a, 0
+  br i1 %cond, label %if.true, label %if.false
+
+if.false:
+  store atomic i32 1, ptr %q seq_cst, align 4
+  br label %if.end
+
+if.true:
+  %0 = load i32, ptr %b, align 4
+  store i32 %0, ptr %p, align 4
+  br label %if.end
+
+if.end:
+  ret void
+}
+
+define void @not_hoistable_store(i32 %a, ptr %b, ptr %p, ptr %q) {
+; CHECK-LABEL: @not_hoistable_store(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[COND:%.*]] = icmp eq i32 [[A:%.*]], 0
+; CHECK-NEXT:    br i1 [[COND]], label [[IF_TRUE:%.*]], label [[IF_FALSE:%.*]]
+; CHECK:       if.false:
+; CHECK-NEXT:    store i32 1, ptr [[Q:%.*]], align 4
+; CHECK-NEXT:    br label [[IF_END:%.*]]
+; CHECK:       if.true:
+; CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr [[B:%.*]], align 4
+; CHECK-NEXT:    store volatile i32 [[TMP0]], ptr [[P:%.*]], align 4
+; CHECK-NEXT:    br label [[IF_END]]
+; CHECK:       if.end:
+; CHECK-NEXT:    ret void
+;
+entry:
+  %cond = icmp eq i32 %a, 0
+  br i1 %cond, label %if.true, label %if.false
+
+if.false:
+  store i32 1, ptr %q, align 4
+  br label %if.end
+
+if.true:
+  %0 = load i32, ptr %b, align 4
+  store volatile i32 %0, ptr %p, align 4
+  br label %if.end
+
+if.end:
+  ret void
+}
+
+define void @not_hoistable_sideeffect(i32 %a, ptr %b, ptr %p, ptr %q) {
+; CHECK-LABEL: @not_hoistable_sideeffect(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[COND:%.*]] = icmp eq i32 [[A:%.*]], 0
+; CHECK-NEXT:    br i1 [[COND]], label [[IF_TRUE:%.*]], label [[IF_FALSE:%.*]]
+; CHECK:       if.false:
+; CHECK-NEXT:    store i32 1, ptr [[Q:%.*]], align 4
+; CHECK-NEXT:    br label [[IF_END:%.*]]
+; CHECK:       if.true:
+; CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr [[B:%.*]], align 4
+; CHECK-NEXT:    [[RMW:%.*]] = atomicrmw xchg ptr [[Q]], double 4.000000e+00 seq_cst, align 8
+; CHECK-NEXT:    store i32 [[TMP0]], ptr [[P:%.*]], align 4
+; CHECK-NEXT:    br label [[IF_END]]
+; CHECK:       if.end:
+; CHECK-NEXT:    ret void
+;
+entry:
+  %cond = icmp eq i32 %a, 0
+  br i1 %cond, label %if.true, label %if.false
+
+if.false:
+  store i32 1, ptr %q, align 4
+  br label %if.end
+
+if.true:
+  %0 = load i32, ptr %b, align 4
+  %rmw= atomicrmw xchg ptr %q, double 4.0 seq_cst
+  store i32 %0, ptr %p, align 4
+  br label %if.end
+
+if.end:
+  ret void
+}
+
+define void @not_ops_dominate_br(i32 %a, ptr %b, ptr %p, ptr %q) {
+; CHECK-LABEL: @not_ops_dominate_br(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[COND:%.*]] = icmp eq i32 [[A:%.*]], 0
+; CHECK-NEXT:    br i1 [[COND]], label [[IF_TRUE:%.*]], label [[IF_FALSE:%.*]]
+; CHECK:       if.false:
+; CHECK-NEXT:    [[ADD:%.*]] = add i32 [[A]], 2
+; CHECK-NEXT:    store i32 [[ADD]], ptr [[Q:%.*]], align 4
+; CHECK-NEXT:    br label [[IF_END:%.*]]
+; CHECK:       if.true:
+; CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr [[B:%.*]], align 4
+; CHECK-NEXT:    store i32 [[TMP0]], ptr [[P:%.*]], align 4
+; CHECK-NEXT:    br label [[IF_END]]
+; CHECK:       if.end:
+; CHECK-NEXT:    ret void
+;
+entry:
+  %cond = icmp eq i32 %a, 0
+  br i1 %cond, label %if.true, label %if.false
+
+if.false:
+  %add = add i32 %a, 2
+  store i32 %add, ptr %q, align 4
+  br label %if.end
+
+if.true:
+  %1 = load i32, ptr %b, align 4
+  store i32 %1, ptr %p, align 4
+  br label %if.end
+
+if.end:
+  ret void
+}
+
+define void @load_skip_memory_read(i32 %a, ptr %b, ptr %p, ptr %q) {
+; CHECK-LABEL: @load_skip_memory_read(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[COND:%.*]] = icmp eq i32 [[A:%.*]], 0
+; CHECK-NEXT:    [[TMP0:%.*]] = bitcast i1 [[COND]] to <1 x i1>
+; CHECK-NEXT:    [[TMP1:%.*]] = call <1 x i32> @llvm.masked.load.v1i32.p0(ptr [[B:%.*]], i32 4, <1 x i1> [[TMP0]], <1 x i32> poison)
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast <1 x i32> [[TMP1]] to i32
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i32 [[TMP2]] to <1 x i32>
+; CHECK-NEXT:    call void @llvm.masked.store.v1i32.p0(<1 x i32> [[TMP3]], ptr [[P:%.*]], i32 4, <1 x i1> [[TMP0]])
+; CHECK-NEXT:    [[TMP4:%.*]] = xor i1 [[COND]], true
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast i1 [[TMP4]] to <1 x i1>
+; CHECK-NEXT:    [[TMP6:%.*]] = call <1 x i32> @llvm.masked.load.v1i32.p0(ptr [[Q:%.*]], i32 4, <1 x i1> [[TMP5]], <1 x i32> poison)
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast <1 x i32> [[TMP6]] to i32
+; CHECK-NEXT:    br i1 [[COND]], label [[IF_END:%.*]], label [[IF_FALSE:%.*]]
+; CHECK:       if.false:
+; CHECK-NEXT:    call void @read_memory_only()
+; CHECK-NEXT:    br label [[IF_END]]
+; CHECK:       if.end:
+; CHECK-NEXT:    ret void
+;
+entry:
+  %cond = icmp eq i32 %a, 0
+  br i1 %cond, label %if.true, label %if.false
+
+if.false:
+  call void @read_memory_only()
+  %0 = load i32, ptr %q, align 4
+  br label %if.end
+
+if.true:
+  %1 = load i32, ptr %b, align 4
+  store i32 %1, ptr %p, align 4
+  br label %if.end
+
+if.end:
+  ret void
+}
+
+define void @not_store_skip_memory_read(i32 %a, ptr %b, ptr %p, ptr %q) {
+; CHECK-LABEL: @not_store_skip_memory_read(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[COND:%.*]] = icmp eq i32 [[A:%.*]], 0
+; CHECK-NEXT:    br i1 [[COND]], label [[IF_TRUE:%.*]], label [[IF_FALSE:%.*]]
+; CHECK:       if.false:
+; CHECK-NEXT:    call void @read_memory_only()
+; CHECK-NEXT:    store i32 1, ptr [[Q:%.*]], align 4
+; CHECK-NEXT:    br label [[IF_END:%.*]]
+; CHECK:       if.true:
+; CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr [[B:%.*]], align 4
+; CHECK-NEXT:    store i32 [[TMP0]], ptr [[P:%.*]], align 4
+; CHECK-NEXT:    br label [[IF_END]]
+; CHECK:       if.end:
+; CHECK-NEXT:    ret void
+;
+entry:
+  %cond = icmp eq i32 %a, 0
+  br i1 %cond, label %if.true, label %if.false
+
+if.false:
+  call void @read_memory_only()
+  store i32 1, ptr %q, align 4
+  br label %if.end
+
+if.true:
+  %1 = load i32, ptr %b, align 4
+  store i32 %1, ptr %p, align 4
+  br label %if.end
+
+if.end:
+  ret void
+}
+
+declare void @read_memory_only() readonly nounwind willreturn
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!2, !3, !4, !5, !6, !7}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C11, file: !1, producer: "clang", isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly, splitDebugInlining: false, nameTableKind: None)
+!1 = !DIFile(filename: "test.c", directory: "/tmp")
+!2 = !{i32 7, !"Dwarf Version", i32 5}
+!3 = !{i32 2, !"Debug Info Version", i32 3}
+!4 = !{i32 1, !"wchar_size", i32 4}
+!5 = !{i32 8, !"PIC Level", i32 2}
+!6 = !{i32 7, !"PIE Level", i32 2}
+!7 = !{i32 7, !"uwtable", i32 2}
+!8 = !DILocation(line: 1, column: 2, scope: !10)
+!9 = !DILocation(line: 1, column: 3, scope: !10)
+!10 = distinct !DISubprogram(name: "basic", scope: !1, file: !1, line: 1, type: !11, scopeLine: 1, flags: DIFlagPrototyped, unit: !0)
+!11 = !DISubroutineType(types: !12)
+!12 = !{}

>From 7deea0aab99c8fa5e617a214a921490184ef1aca Mon Sep 17 00:00:00 2001
From: Shengchen Kan <shengchen.kan at intel.com>
Date: Thu, 20 Jun 2024 10:13:07 +0800
Subject: [PATCH 2/2] Remove redundant code

---
 llvm/include/llvm/CodeGen/TargetLowering.h |  6 ------
 llvm/lib/CodeGen/CodeGenPrepare.cpp        |  4 ----
 llvm/lib/Target/X86/X86ISelLowering.cpp    | 18 ------------------
 llvm/lib/Target/X86/X86ISelLowering.h      |  2 --
 4 files changed, 30 deletions(-)

diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h
index 4a58513958ccf..06f7ee2a589c8 100644
--- a/llvm/include/llvm/CodeGen/TargetLowering.h
+++ b/llvm/include/llvm/CodeGen/TargetLowering.h
@@ -2161,12 +2161,6 @@ class TargetLoweringBase {
     return false;
   }
 
-  /// Translate a masked load intrinsic to its target-specific instrinsic.
-  virtual bool scalarizeMaskedLoad(CallInst *CI) const { return false; }
-
-  /// Translate a masked store intrinsic to its target-specific instrinsic.
-  virtual bool scalarizeMaskedStore(CallInst *CI) const { return false; }
-
   /// Perform a load-linked operation on Addr, returning a "Value *" with the
   /// corresponding pointee type. This may entail some non-trivial operations to
   /// truncate or reconstruct types that will be illegal in the backend. See
diff --git a/llvm/lib/CodeGen/CodeGenPrepare.cpp b/llvm/lib/CodeGen/CodeGenPrepare.cpp
index 4205c02d4ddae..f8fdba2e35dd7 100644
--- a/llvm/lib/CodeGen/CodeGenPrepare.cpp
+++ b/llvm/lib/CodeGen/CodeGenPrepare.cpp
@@ -2526,10 +2526,6 @@ bool CodeGenPrepare::optimizeCallInst(CallInst *CI, ModifyDT &ModifiedDT) {
       return optimizeGatherScatterInst(II, II->getArgOperand(0));
     case Intrinsic::masked_scatter:
       return optimizeGatherScatterInst(II, II->getArgOperand(1));
-    case Intrinsic::masked_load:
-      return TLI->scalarizeMaskedLoad(II);
-    case Intrinsic::masked_store:
-      return TLI->scalarizeMaskedStore(II);
     }
 
     SmallVector<Value *, 2> PtrOps;
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 2b9e8c139b1cd..f27c935812f51 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -30672,24 +30672,6 @@ X86TargetLowering::shouldExpandLogicAtomicRMWInIR(AtomicRMWInst *AI) const {
              : AtomicExpansionKind::CmpXChg;
 }
 
-bool X86TargetLowering::scalarizeMaskedLoad(CallInst *CI) const {
-  // Transform <1 x ty> masked load to x86-specific ty masked load
-  // b/c the former only accepts vector op, otherwsie
-  // DAGTypeLegalizer::ScalarizeVectorResult would crash.
-  //
-  // FIXME: Extend masked load for scalar op and remove this.
-  return false;
-}
-
-bool X86TargetLowering::scalarizeMaskedStore(CallInst *CI) const {
-  // Transform <1 x ty> masked store to x86-specific ty masked store
-  // b/c the former only accepts vector op, otherwsie
-  // DAGTypeLegalizer::ScalarizeVectorResult would crash.
-  //
-  // FIXME: Extend masked store for scalar op and remove this.
-  return false;
-}
-
 void X86TargetLowering::emitBitTestAtomicRMWIntrinsic(AtomicRMWInst *AI) const {
   IRBuilder<> Builder(AI);
   Builder.CollectMetadataToCopy(AI, {LLVMContext::MD_pcsections});
diff --git a/llvm/lib/Target/X86/X86ISelLowering.h b/llvm/lib/Target/X86/X86ISelLowering.h
index 59b03c9f87c95..3c5c903bc0d98 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.h
+++ b/llvm/lib/Target/X86/X86ISelLowering.h
@@ -1737,8 +1737,6 @@ namespace llvm {
     shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const override;
     TargetLoweringBase::AtomicExpansionKind
     shouldExpandLogicAtomicRMWInIR(AtomicRMWInst *AI) const;
-    bool scalarizeMaskedLoad(CallInst *CI) const override;
-    bool scalarizeMaskedStore(CallInst *CI) const override;
     void emitBitTestAtomicRMWIntrinsic(AtomicRMWInst *AI) const override;
     void emitCmpArithAtomicRMWIntrinsic(AtomicRMWInst *AI) const override;