[llvm] [X86][SimplifyCFG] Support hoisting load/store with conditional faulting (PR #96878)
Shengchen Kan via llvm-commits
llvm-commits at lists.llvm.org
Fri Aug 2 07:00:05 PDT 2024
https://github.com/KanRobert updated https://github.com/llvm/llvm-project/pull/96878
From 97dd259f4448ea18316f7a4f6678705e181ad36b Mon Sep 17 00:00:00 2001
From: Shengchen Kan <shengchen.kan at intel.com>
Date: Thu, 27 Jun 2024 17:10:00 +0800
Subject: [PATCH] [X86][SimplifyCFG] Support hoisting load/store with
conditional faulting
---
.../Transforms/Utils/SimplifyCFGOptions.h | 10 +
llvm/lib/CodeGen/TargetPassConfig.cpp | 8 +
llvm/lib/Passes/PassBuilder.cpp | 2 +
.../lib/Transforms/Scalar/SimplifyCFGPass.cpp | 25 +
llvm/lib/Transforms/Utils/SimplifyCFG.cpp | 273 +++++++-
llvm/test/CodeGen/AArch64/O3-pipeline.ll | 2 +
llvm/test/CodeGen/AMDGPU/llc-pipeline.ll | 8 +
llvm/test/CodeGen/ARM/O3-pipeline.ll | 2 +
llvm/test/CodeGen/LoongArch/opt-pipeline.ll | 2 +
llvm/test/CodeGen/PowerPC/O3-pipeline.ll | 2 +
llvm/test/CodeGen/RISCV/O3-pipeline.ll | 2 +
.../CodeGen/X86/hoist-loads-stores-with-cf.ll | 32 +
llvm/test/CodeGen/X86/opt-pipeline.ll | 2 +
llvm/test/Other/new-pm-print-pipeline.ll | 4 +-
.../X86/hoist-loads-stores-with-cf.ll | 623 ++++++++++++++++++
15 files changed, 976 insertions(+), 21 deletions(-)
create mode 100644 llvm/test/CodeGen/X86/hoist-loads-stores-with-cf.ll
create mode 100644 llvm/test/Transforms/SimplifyCFG/X86/hoist-loads-stores-with-cf.ll
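At a high level, the patch teaches SimplifyCFG to rewrite a triangle/diamond
whose arms contain only simple loads/stores into straight-line masked
load/store intrinsics, which the X86 backend can then lower to
conditional-faulting instructions (see the new tests). A minimal sketch of
the rewrite, distilled from the in-source documentation and tests below
(value names are illustrative, not taken from the patch):

  ; before
  %cond = icmp eq i32 %a, 0
  br i1 %cond, label %if.true, label %if.end
  if.true:
    %v = load i32, ptr %b, align 4
    store i32 %v, ptr %p, align 4
    br label %if.end

  ; after
  %cond = icmp eq i32 %a, 0
  %m = bitcast i1 %cond to <1 x i1>
  %v0 = call <1 x i32> @llvm.masked.load.v1i32.p0(ptr %b, i32 4, <1 x i1> %m, <1 x i32> poison)
  %v = bitcast <1 x i32> %v0 to i32
  %v1 = bitcast i32 %v to <1 x i32>
  call void @llvm.masked.store.v1i32.p0(<1 x i32> %v1, ptr %p, i32 4, <1 x i1> %m)
  br label %if.end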
diff --git a/llvm/include/llvm/Transforms/Utils/SimplifyCFGOptions.h b/llvm/include/llvm/Transforms/Utils/SimplifyCFGOptions.h
index 2ea9d64f03cb6..3ddc3d6d00893 100644
--- a/llvm/include/llvm/Transforms/Utils/SimplifyCFGOptions.h
+++ b/llvm/include/llvm/Transforms/Utils/SimplifyCFGOptions.h
@@ -27,10 +27,12 @@ struct SimplifyCFGOptions {
bool ConvertSwitchToLookupTable = false;
bool NeedCanonicalLoop = true;
bool HoistCommonInsts = false;
+ bool HoistLoadsStoresWithCondFaulting = false;
bool SinkCommonInsts = false;
bool SimplifyCondBranch = true;
bool SpeculateBlocks = true;
bool SpeculateUnpredictables = false;
+ bool RunInCodeGen = false;
AssumptionCache *AC = nullptr;
@@ -59,6 +61,10 @@ struct SimplifyCFGOptions {
HoistCommonInsts = B;
return *this;
}
+ SimplifyCFGOptions &hoistLoadsStoresWithCondFaulting(bool B) {
+ HoistLoadsStoresWithCondFaulting = B;
+ return *this;
+ }
SimplifyCFGOptions &sinkCommonInsts(bool B) {
SinkCommonInsts = B;
return *this;
@@ -80,6 +86,10 @@ struct SimplifyCFGOptions {
SpeculateUnpredictables = B;
return *this;
}
+ SimplifyCFGOptions &runInCodeGen(bool B) {
+ RunInCodeGen = B;
+ return *this;
+ }
};
} // namespace llvm
diff --git a/llvm/lib/CodeGen/TargetPassConfig.cpp b/llvm/lib/CodeGen/TargetPassConfig.cpp
index eb74c6f3e5d44..fcbd7cafaf303 100644
--- a/llvm/lib/CodeGen/TargetPassConfig.cpp
+++ b/llvm/lib/CodeGen/TargetPassConfig.cpp
@@ -47,6 +47,7 @@
#include "llvm/Target/CGPassBuilderOption.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Scalar/SimplifyCFG.h"
#include "llvm/Transforms/Utils.h"
#include <cassert>
#include <optional>
@@ -852,6 +853,13 @@ void TargetPassConfig::addIRPasses() {
!DisableAtExitBasedGlobalDtorLowering)
addPass(createLowerGlobalDtorsLegacyPass());
+ // Hoist loads/stores to reduce branches when profitable.
+ if (getOptLevel() != CodeGenOptLevel::None)
+ addPass(createCFGSimplificationPass(
+ SimplifyCFGOptions()
+ .runInCodeGen(true)
+ .hoistLoadsStoresWithCondFaulting(true)));
+
// Make sure that no unreachable blocks are instruction selected.
addPass(createUnreachableBlockEliminationPass());
diff --git a/llvm/lib/Passes/PassBuilder.cpp b/llvm/lib/Passes/PassBuilder.cpp
index 5dbb1e2f49871..e981db0f88366 100644
--- a/llvm/lib/Passes/PassBuilder.cpp
+++ b/llvm/lib/Passes/PassBuilder.cpp
@@ -843,6 +843,8 @@ Expected<SimplifyCFGOptions> parseSimplifyCFGOptions(StringRef Params) {
Result.needCanonicalLoops(Enable);
} else if (ParamName == "hoist-common-insts") {
Result.hoistCommonInsts(Enable);
+ } else if (ParamName == "hoist-loads-stores-with-cond-faulting") {
+ Result.hoistLoadsStoresWithCondFaulting(Enable);
} else if (ParamName == "sink-common-insts") {
Result.sinkCommonInsts(Enable);
} else if (ParamName == "speculate-unpredictables") {
diff --git a/llvm/lib/Transforms/Scalar/SimplifyCFGPass.cpp b/llvm/lib/Transforms/Scalar/SimplifyCFGPass.cpp
index 11de37f7a7c10..4ff2fff17a460 100644
--- a/llvm/lib/Transforms/Scalar/SimplifyCFGPass.cpp
+++ b/llvm/lib/Transforms/Scalar/SimplifyCFGPass.cpp
@@ -73,6 +73,11 @@ static cl::opt<bool> UserHoistCommonInsts(
"hoist-common-insts", cl::Hidden, cl::init(false),
cl::desc("hoist common instructions (default = false)"));
+static cl::opt<bool> UserHoistLoadsStoresWithCondFaulting(
+ "hoist-loads-stores-with-cond-faulting", cl::Hidden, cl::init(false),
+ cl::desc("Hoist loads/stores if the target supports conditional faulting "
+ "(default = false)"));
+
static cl::opt<bool> UserSinkCommonInsts(
"sink-common-insts", cl::Hidden, cl::init(false),
cl::desc("Sink common instructions (default = false)"));
@@ -272,6 +277,21 @@ static bool simplifyFunctionCFGImpl(Function &F, const TargetTransformInfo &TTI,
const SimplifyCFGOptions &Options) {
DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Eager);
+  // In codegen, we use unreachableblockelim to remove dead blocks, because
+  // codegen does not have well-defined semantics for unreachable code.
+  //
+  // The first character is not capitalized to reduce the code diff for now.
+ auto removeUnreachableBlocks = [&](Function &F, DomTreeUpdater *DTU = nullptr,
+ MemorySSAUpdater *MSSAU = nullptr) {
+ return !Options.RunInCodeGen &&
+ llvm::removeUnreachableBlocks(F, DTU, MSSAU);
+ };
+
+ // Only run simplifycfg in codegen if we'd like to hoist loads/stores.
+ if (Options.RunInCodeGen && (!Options.HoistLoadsStoresWithCondFaulting ||
+ !TTI.hasConditionalLoadStoreForType()))
+ return false;
+
bool EverChanged = removeUnreachableBlocks(F, DT ? &DTU : nullptr);
EverChanged |=
tailMergeBlocksWithSimilarFunctionTerminators(F, DT ? &DTU : nullptr);
@@ -326,6 +346,9 @@ static void applyCommandLineOverridesToOptions(SimplifyCFGOptions &Options) {
Options.NeedCanonicalLoop = UserKeepLoops;
if (UserHoistCommonInsts.getNumOccurrences())
Options.HoistCommonInsts = UserHoistCommonInsts;
+ if (UserHoistLoadsStoresWithCondFaulting.getNumOccurrences())
+ Options.HoistLoadsStoresWithCondFaulting =
+ UserHoistLoadsStoresWithCondFaulting;
if (UserSinkCommonInsts.getNumOccurrences())
Options.SinkCommonInsts = UserSinkCommonInsts;
if (UserSpeculateUnpredictables.getNumOccurrences())
@@ -354,6 +377,8 @@ void SimplifyCFGPass::printPipeline(
<< "switch-to-lookup;";
OS << (Options.NeedCanonicalLoop ? "" : "no-") << "keep-loops;";
OS << (Options.HoistCommonInsts ? "" : "no-") << "hoist-common-insts;";
+ OS << (Options.HoistLoadsStoresWithCondFaulting ? "" : "no-")
+ << "hoist-loads-stores-with-cond-faulting;";
OS << (Options.SinkCommonInsts ? "" : "no-") << "sink-common-insts;";
OS << (Options.SpeculateBlocks ? "" : "no-") << "speculate-blocks;";
OS << (Options.SimplifyCondBranch ? "" : "no-") << "simplify-cond-branch;";
diff --git a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
index f23e28888931d..c284c9121e51e 100644
--- a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
+++ b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
@@ -117,6 +117,12 @@ static cl::opt<bool>
HoistCommon("simplifycfg-hoist-common", cl::Hidden, cl::init(true),
cl::desc("Hoist common instructions up to the parent block"));
+static cl::opt<bool> HoistLoadsStoresWithCondFaulting(
+ "simplifycfg-hoist-loads-stores-with-cond-faulting", cl::Hidden,
+ cl::init(true),
+ cl::desc("Hoist loads/stores if the target supports "
+ "conditional faulting"));
+
static cl::opt<unsigned>
HoistCommonSkipLimit("simplifycfg-hoist-common-skip-limit", cl::Hidden,
cl::init(20),
@@ -275,6 +281,9 @@ class SimplifyCFGOpt {
bool hoistSuccIdenticalTerminatorToSwitchOrIf(
Instruction *TI, Instruction *I1,
SmallVectorImpl<Instruction *> &OtherSuccTIs);
+ bool
+ hoistLoadStoreWithCondFaultingFromSuccessors(BranchInst *BI,
+ BasicBlock *ThenBB = nullptr);
bool SpeculativelyExecuteBB(BranchInst *BI, BasicBlock *ThenBB);
bool SimplifyTerminatorOnSelect(Instruction *OldTerm, Value *Cond,
BasicBlock *TrueBB, BasicBlock *FalseBB,
@@ -2960,6 +2969,232 @@ static bool validateAndCostRequiredSelects(BasicBlock *BB, BasicBlock *ThenBB,
return HaveRewritablePHIs;
}
+static bool isProfitableToSpeculate(const BranchInst *BI, bool Invert,
+ const TargetTransformInfo &TTI) {
+  // If the branch is non-unpredictable, and is predicted to *not* branch to
+  // the `then` block, then speculating it is not profitable.
+ if (BI->getMetadata(LLVMContext::MD_unpredictable))
+ return true;
+
+ uint64_t TWeight, FWeight;
+ if (!extractBranchWeights(*BI, TWeight, FWeight) || (TWeight + FWeight) == 0)
+ return true;
+
+ uint64_t EndWeight = Invert ? TWeight : FWeight;
+ BranchProbability BIEndProb =
+ BranchProbability::getBranchProbability(EndWeight, TWeight + FWeight);
+ BranchProbability Likely = TTI.getPredictableBranchThreshold();
+ return BIEndProb < Likely;
+}
+
+/// Hoist load/store instructions from the conditional successor blocks up
+/// into the branching block.
+///
+/// We are looking for code like the following:
+/// \code
+/// BB:
+/// ...
+/// %cond = icmp ult %x, %y
+/// br i1 %cond, label %TrueBB, label %FalseBB
+/// FalseBB:
+/// store i32 1, ptr %q, align 4
+/// ...
+/// TrueBB:
+/// %0 = load i32, ptr %b, align 4
+/// store i32 %0, ptr %p, align 4
+/// ...
+/// \endcode
+///
+/// We are going to transform this into:
+///
+/// \code
+/// BB:
+/// ...
+/// %cond = icmp ult %x, %y
+/// %0 = cload i32, ptr %b, %cond
+/// cstore i32 %0, ptr %p, %cond
+/// cstore i32 1, ptr %q, ~%cond
+/// br i1 %cond, label %TrueBB, label %FalseBB
+/// FalseBB:
+/// ...
+/// TrueBB:
+/// ...
+/// \endcode
+///
+/// where cload/cstore are represented by intrinsics such as llvm.masked.load/store,
+/// e.g.
+///
+/// \code
+/// %vcond = bitcast i1 %cond to <1 x i1>
+/// %v0 = call <1 x i32> @llvm.masked.load.v1i32.p0
+/// (ptr %b, i32 4, <1 x i1> %vcond, <1 x i32> poison)
+/// %0 = bitcast <1 x i32> %v0 to i32
+/// call void @llvm.masked.store.v1i32.p0
+///    (<1 x i32> %v0, ptr %p, i32 4, <1 x i1> %vcond)
+/// %cond.not = xor i1 %cond, true
+/// %vcond.not = bitcast i1 %cond.not to <1 x i1>
+/// call void @llvm.masked.store.v1i32.p0
+///    (<1 x i32> <i32 1>, ptr %q, i32 4, <1 x i1> %vcond.not)
+/// \endcode
+///
+/// \returns true if any load/store is hoisted.
+///
+/// Note that this transform should be run
+/// * before SpeculativelyExecuteBB so that the latter has more opportunities.
+/// * after hoistCommonCodeFromSuccessors to ensure unconditional loads/stores
+/// are handled first.
+bool SimplifyCFGOpt::hoistLoadStoreWithCondFaultingFromSuccessors(
+ BranchInst *BI, BasicBlock *ThenBB) {
+ if (!HoistLoadsStoresWithCondFaulting ||
+ !Options.HoistLoadsStoresWithCondFaulting ||
+ !TTI.hasConditionalLoadStoreForType())
+ return false;
+
+ if (!BI || !BI->isConditional())
+ return false;
+
+ if (!isProfitableToSpeculate(BI, ThenBB != BI->getSuccessor(0), TTI))
+ return false;
+
+ SmallVector<BasicBlock *, 2> Successors;
+ if (ThenBB)
+ Successors.push_back(ThenBB);
+ else
+ Successors.append({BI->getSuccessor(0), BI->getSuccessor(1)});
+
+ // If either of the blocks has it's address taken, then we can't do this fold,
+ // because the code we'd hoist would no longer run when we jump into the block
+ // by it's address.
+ for (auto *Succ : Successors)
+ if (Succ->hasAddressTaken())
+ return false;
+
+  // We do not use isa<AllocaInst>(getUnderlyingObject(I.getOperand(0))) here,
+  // to avoid having to check that all intermediate operands dominate the branch.
+ auto IsLoadFromAlloca = [](const Instruction &I) {
+ return isa<LoadInst>(I) && isa<AllocaInst>((I.getOperand(0)));
+ };
+
+ // Collect hoisted loads/stores.
+ SmallSetVector<Instruction *, 4> HoistedInsts;
+  // Do not hoist a load/store if
+  // 1. the target has no corresponding conditional faulting load/store.
+  // 2. it is volatile or atomic.
+  // 3. there is a load/store in the same bb that cannot be hoisted.
+  // 4. there is a non-load/store in the same bb that is not safe to
+  //    speculatively execute.
+  // 5. any of its operands does not dominate the branch.
+  // 6. it is a store and a memory read was skipped earlier in the bb.
+ auto HoistInstsInBB = [&](BasicBlock *BB) {
+ bool SkipMemoryRead = false;
+    // A cheap way to check domination: an operand dominates the BranchInst if
+    // 1. it is not defined in the same bb as the instruction, or
+    // 2. it is itself going to be hoisted.
+    //
+    // This holds because BB is the only predecessor and the BranchInst does
+    // not define any value.
+ auto OpsDominatesBranch = [&](Instruction &I) {
+ return llvm::all_of(I.operands(), [&](Value *Op) {
+ if (auto *J = dyn_cast<Instruction>(Op)) {
+ if (HoistedInsts.contains(J))
+ return true;
+ if (J->getParent() == I.getParent())
+ return false;
+ }
+ return true;
+ });
+ };
+ for (auto &I : *BB) {
+ auto *LI = dyn_cast<LoadInst>(&I);
+ auto *SI = dyn_cast<StoreInst>(&I);
+ if (LI || SI) {
+ bool IsSimple = (LI && LI->isSimple()) || (SI && SI->isSimple());
+ if (!IsSimple || !OpsDominatesBranch(I))
+ return false;
+ auto *Type = getLoadStoreType(&I);
+        // A load from an alloca is always safe.
+ if (!IsLoadFromAlloca(I) && !TTI.hasConditionalLoadStoreForType(Type))
+ return false;
+ // Conservative aliasing check.
+ if (SI && SkipMemoryRead)
+ return false;
+ HoistedInsts.insert(&I);
+ } else if (!I.isTerminator() && !isSafeToSpeculativelyExecute(&I))
+ return false;
+ else if (I.mayReadFromMemory())
+ SkipMemoryRead = true;
+ }
+ return true;
+ };
+
+ for (auto *Succ : Successors)
+ if (!HoistInstsInBB(Succ))
+ return false;
+
+ if (HoistedInsts.empty())
+ return false;
+
+ // Put newly added instructions before the BranchInst.
+ IRBuilder<> Builder(BI);
+ auto &Context = BI->getParent()->getContext();
+ auto *VCondTy = FixedVectorType::get(Type::getInt1Ty(Context), 1);
+ auto *Cond = BI->getOperand(0);
+ auto *VCond = Builder.CreateBitCast(Cond, VCondTy);
+ Value *VCondNot = nullptr;
+ for (auto *I : HoistedInsts) {
+    // A load from an alloca only needs to be moved; no masking is required.
+ if (IsLoadFromAlloca(*I)) {
+ I->moveBefore(BI);
+ continue;
+ }
+
+ bool InvertCond = I->getParent() == BI->getSuccessor(1);
+    // Construct the inverted condition if needed.
+ if (InvertCond && !VCondNot)
+ VCondNot = Builder.CreateBitCast(
+ Builder.CreateXor(Cond, ConstantInt::getTrue(Context)), VCondTy);
+
+ auto *Mask = InvertCond ? VCondNot : VCond;
+ auto *Op0 = I->getOperand(0);
+ if (auto *LI = dyn_cast<LoadInst>(I)) {
+ // Load
+ auto *Ty = I->getType();
+      // NOTE: For now we assume conditional faulting load/store is supported
+      // only for scalars when creating new instructions, but it would be easy
+      // to extend to vector types in the future.
+ assert(!Ty->isVectorTy() && "not implemented");
+ auto *V0 = Builder.CreateMaskedLoad(FixedVectorType::get(Ty, 1), Op0,
+ LI->getAlign(), Mask);
+ auto *S0 = Builder.CreateBitCast(V0, Ty);
+ V0->copyMetadata(*I);
+ I->replaceAllUsesWith(S0);
+ } else {
+ // Store
+ assert(!Op0->getType()->isVectorTy() && "not implemented");
+ auto *StoredVal =
+ Builder.CreateBitCast(Op0, FixedVectorType::get(Op0->getType(), 1));
+ auto *VStore = Builder.CreateMaskedStore(
+ StoredVal, I->getOperand(1), cast<StoreInst>(I)->getAlign(), Mask);
+ VStore->copyMetadata(*I);
+ // FIXME: DIAssignID is not supported for masked store yet.
+ // (Verifier::visitDIAssignIDMetadata)
+ at::deleteAssignmentMarkers(VStore);
+ VStore->eraseMetadataIf([](unsigned MDKind, MDNode *Node) {
+ return Node->getMetadataID() == Metadata::DIAssignIDKind;
+ });
+ }
+ }
+
+  // Erase the hoisted instructions in reverse order to avoid use-before-def
+  // errors.
+ std::for_each(HoistedInsts.rbegin(), HoistedInsts.rend(), [&](auto I) {
+ if (!IsLoadFromAlloca(*I))
+ I->eraseFromParent();
+ });
+
+ return true;
+}
+
/// Speculate a conditional basic block flattening the CFG.
///
/// Note that this is a very risky transform currently. Speculating
@@ -3021,20 +3256,8 @@ bool SimplifyCFGOpt::SpeculativelyExecuteBB(BranchInst *BI,
}
assert(EndBB == BI->getSuccessor(!Invert) && "No edge from to end block");
- // If the branch is non-unpredictable, and is predicted to *not* branch to
- // the `then` block, then avoid speculating it.
- if (!BI->getMetadata(LLVMContext::MD_unpredictable)) {
- uint64_t TWeight, FWeight;
- if (extractBranchWeights(*BI, TWeight, FWeight) &&
- (TWeight + FWeight) != 0) {
- uint64_t EndWeight = Invert ? TWeight : FWeight;
- BranchProbability BIEndProb =
- BranchProbability::getBranchProbability(EndWeight, TWeight + FWeight);
- BranchProbability Likely = TTI.getPredictableBranchThreshold();
- if (BIEndProb >= Likely)
- return false;
- }
- }
+ if (!isProfitableToSpeculate(BI, Invert, TTI))
+ return false;
// Keep a count of how many times instructions are used within ThenBB when
// they are candidates for sinking into ThenBB. Specifically:
@@ -7513,31 +7736,43 @@ bool SimplifyCFGOpt::simplifyCondBranch(BranchInst *BI, IRBuilder<> &Builder) {
return requestResimplify();
// We have a conditional branch to two blocks that are only reachable
- // from BI. We know that the condbr dominates the two blocks, so see if
- // there is any identical code in the "then" and "else" blocks. If so, we
- // can hoist it up to the branching block.
+ // from BI. We know that the condbr dominates the two blocks, so see
+ //
+ // * if there is any identical code in the "then" and "else" blocks.
+  // * if there are loads/stores unique to the "then" or "else" block.
+ //
+ // If so, we can hoist it up to the branching block.
if (BI->getSuccessor(0)->getSinglePredecessor()) {
if (BI->getSuccessor(1)->getSinglePredecessor()) {
if (HoistCommon && hoistCommonCodeFromSuccessors(
BI->getParent(), !Options.HoistCommonInsts))
return requestResimplify();
+ if (hoistLoadStoreWithCondFaultingFromSuccessors(BI))
+ return requestResimplify();
} else {
// If Successor #1 has multiple preds, we may be able to conditionally
// execute Successor #0 if it branches to Successor #1.
Instruction *Succ0TI = BI->getSuccessor(0)->getTerminator();
if (Succ0TI->getNumSuccessors() == 1 &&
- Succ0TI->getSuccessor(0) == BI->getSuccessor(1))
+ Succ0TI->getSuccessor(0) == BI->getSuccessor(1)) {
+ if (hoistLoadStoreWithCondFaultingFromSuccessors(BI,
+ BI->getSuccessor(0)))
+ return requestResimplify();
if (SpeculativelyExecuteBB(BI, BI->getSuccessor(0)))
return requestResimplify();
+ }
}
} else if (BI->getSuccessor(1)->getSinglePredecessor()) {
// If Successor #0 has multiple preds, we may be able to conditionally
// execute Successor #1 if it branches to Successor #0.
Instruction *Succ1TI = BI->getSuccessor(1)->getTerminator();
if (Succ1TI->getNumSuccessors() == 1 &&
- Succ1TI->getSuccessor(0) == BI->getSuccessor(0))
+ Succ1TI->getSuccessor(0) == BI->getSuccessor(0)) {
+ if (hoistLoadStoreWithCondFaultingFromSuccessors(BI, BI->getSuccessor(1)))
+ return requestResimplify();
if (SpeculativelyExecuteBB(BI, BI->getSuccessor(1)))
return requestResimplify();
+ }
}
// If this is a branch on something for which we know the constant value in
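To make the profitability gate concrete: in the @not_likely_to_execute test
below, the branch carries !prof !{!"branch_weights", i32 1, i32 99} and
ThenBB is successor 0, so Invert is false and EndWeight = FWeight = 99,
giving BIEndProb = 99/100. Assuming the default predictable-branch threshold
of 99/100 from TTI.getPredictableBranchThreshold(), BIEndProb < Likely fails
(99/100 is not strictly less than 99/100), isProfitableToSpeculate returns
false, and no load/store is hoisted.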
diff --git a/llvm/test/CodeGen/AArch64/O3-pipeline.ll b/llvm/test/CodeGen/AArch64/O3-pipeline.ll
index 017349aa32af3..0f72420127f03 100644
--- a/llvm/test/CodeGen/AArch64/O3-pipeline.ll
+++ b/llvm/test/CodeGen/AArch64/O3-pipeline.ll
@@ -52,7 +52,9 @@
; CHECK-NEXT: Expand memcmp() to load/stores
; CHECK-NEXT: Lower Garbage Collection Instructions
; CHECK-NEXT: Shadow Stack GC Lowering
+; CHECK-NEXT: Simplify the CFG
; CHECK-NEXT: Remove unreachable blocks from the CFG
+; CHECK-NEXT: Dominator Tree Construction
; CHECK-NEXT: Natural Loop Information
; CHECK-NEXT: Post-Dominator Tree Construction
; CHECK-NEXT: Branch Probability Analysis
diff --git a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
index 15f23eda241b4..b17819b478d3c 100644
--- a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
+++ b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
@@ -214,7 +214,9 @@
; GCN-O1-NEXT: Lazy Branch Probability Analysis
; GCN-O1-NEXT: Lazy Block Frequency Analysis
; GCN-O1-NEXT: Expand memcmp() to load/stores
+; GCN-O1-NEXT: Simplify the CFG
; GCN-O1-NEXT: Remove unreachable blocks from the CFG
+; GCN-O1-NEXT: Dominator Tree Construction
; GCN-O1-NEXT: Natural Loop Information
; GCN-O1-NEXT: Post-Dominator Tree Construction
; GCN-O1-NEXT: Branch Probability Analysis
@@ -500,7 +502,9 @@
; GCN-O1-OPTS-NEXT: Lazy Branch Probability Analysis
; GCN-O1-OPTS-NEXT: Lazy Block Frequency Analysis
; GCN-O1-OPTS-NEXT: Expand memcmp() to load/stores
+; GCN-O1-OPTS-NEXT: Simplify the CFG
; GCN-O1-OPTS-NEXT: Remove unreachable blocks from the CFG
+; GCN-O1-OPTS-NEXT: Dominator Tree Construction
; GCN-O1-OPTS-NEXT: Natural Loop Information
; GCN-O1-OPTS-NEXT: Post-Dominator Tree Construction
; GCN-O1-OPTS-NEXT: Branch Probability Analysis
@@ -805,7 +809,9 @@
; GCN-O2-NEXT: Lazy Branch Probability Analysis
; GCN-O2-NEXT: Lazy Block Frequency Analysis
; GCN-O2-NEXT: Expand memcmp() to load/stores
+; GCN-O2-NEXT: Simplify the CFG
; GCN-O2-NEXT: Remove unreachable blocks from the CFG
+; GCN-O2-NEXT: Dominator Tree Construction
; GCN-O2-NEXT: Natural Loop Information
; GCN-O2-NEXT: Post-Dominator Tree Construction
; GCN-O2-NEXT: Branch Probability Analysis
@@ -1118,7 +1124,9 @@
; GCN-O3-NEXT: Lazy Branch Probability Analysis
; GCN-O3-NEXT: Lazy Block Frequency Analysis
; GCN-O3-NEXT: Expand memcmp() to load/stores
+; GCN-O3-NEXT: Simplify the CFG
; GCN-O3-NEXT: Remove unreachable blocks from the CFG
+; GCN-O3-NEXT: Dominator Tree Construction
; GCN-O3-NEXT: Natural Loop Information
; GCN-O3-NEXT: Post-Dominator Tree Construction
; GCN-O3-NEXT: Branch Probability Analysis
diff --git a/llvm/test/CodeGen/ARM/O3-pipeline.ll b/llvm/test/CodeGen/ARM/O3-pipeline.ll
index aa92c7ab4fb9f..8bccc4d8d9ac8 100644
--- a/llvm/test/CodeGen/ARM/O3-pipeline.ll
+++ b/llvm/test/CodeGen/ARM/O3-pipeline.ll
@@ -30,7 +30,9 @@
; CHECK-NEXT: Expand memcmp() to load/stores
; CHECK-NEXT: Lower Garbage Collection Instructions
; CHECK-NEXT: Shadow Stack GC Lowering
+; CHECK-NEXT: Simplify the CFG
; CHECK-NEXT: Remove unreachable blocks from the CFG
+; CHECK-NEXT: Dominator Tree Construction
; CHECK-NEXT: Natural Loop Information
; CHECK-NEXT: Post-Dominator Tree Construction
; CHECK-NEXT: Branch Probability Analysis
diff --git a/llvm/test/CodeGen/LoongArch/opt-pipeline.ll b/llvm/test/CodeGen/LoongArch/opt-pipeline.ll
index c5c5342e303c7..61ff8ccda5904 100644
--- a/llvm/test/CodeGen/LoongArch/opt-pipeline.ll
+++ b/llvm/test/CodeGen/LoongArch/opt-pipeline.ll
@@ -53,7 +53,9 @@
; LAXX-NEXT: Expand memcmp() to load/stores
; LAXX-NEXT: Lower Garbage Collection Instructions
; LAXX-NEXT: Shadow Stack GC Lowering
+; LAXX-NEXT: Simplify the CFG
; LAXX-NEXT: Remove unreachable blocks from the CFG
+; LAXX-NEXT: Dominator Tree Construction
; LAXX-NEXT: Natural Loop Information
; LAXX-NEXT: Post-Dominator Tree Construction
; LAXX-NEXT: Branch Probability Analysis
diff --git a/llvm/test/CodeGen/PowerPC/O3-pipeline.ll b/llvm/test/CodeGen/PowerPC/O3-pipeline.ll
index be0fbf3abcda2..9304825f4e642 100644
--- a/llvm/test/CodeGen/PowerPC/O3-pipeline.ll
+++ b/llvm/test/CodeGen/PowerPC/O3-pipeline.ll
@@ -54,7 +54,9 @@
; CHECK-NEXT: Expand memcmp() to load/stores
; CHECK-NEXT: Lower Garbage Collection Instructions
; CHECK-NEXT: Shadow Stack GC Lowering
+; CHECK-NEXT: Simplify the CFG
; CHECK-NEXT: Remove unreachable blocks from the CFG
+; CHECK-NEXT: Dominator Tree Construction
; CHECK-NEXT: Natural Loop Information
; CHECK-NEXT: Post-Dominator Tree Construction
; CHECK-NEXT: Branch Probability Analysis
diff --git a/llvm/test/CodeGen/RISCV/O3-pipeline.ll b/llvm/test/CodeGen/RISCV/O3-pipeline.ll
index d6d0cca6ddae7..1cd4075ccfff6 100644
--- a/llvm/test/CodeGen/RISCV/O3-pipeline.ll
+++ b/llvm/test/CodeGen/RISCV/O3-pipeline.ll
@@ -54,7 +54,9 @@
; CHECK-NEXT: Expand memcmp() to load/stores
; CHECK-NEXT: Lower Garbage Collection Instructions
; CHECK-NEXT: Shadow Stack GC Lowering
+; CHECK-NEXT: Simplify the CFG
; CHECK-NEXT: Remove unreachable blocks from the CFG
+; CHECK-NEXT: Dominator Tree Construction
; CHECK-NEXT: Natural Loop Information
; CHECK-NEXT: Post-Dominator Tree Construction
; CHECK-NEXT: Branch Probability Analysis
diff --git a/llvm/test/CodeGen/X86/hoist-loads-stores-with-cf.ll b/llvm/test/CodeGen/X86/hoist-loads-stores-with-cf.ll
new file mode 100644
index 0000000000000..b8b4eb3a627e5
--- /dev/null
+++ b/llvm/test/CodeGen/X86/hoist-loads-stores-with-cf.ll
@@ -0,0 +1,32 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc < %s -mtriple=x86_64 -mattr=+cf | FileCheck %s
+
+;; Test that masked.load/store.v1* is generated in simplifycfg and does not fall back to branch+load/store in later passes.
+define void @basic(i32 %a, ptr %b, ptr %p, ptr %q) {
+; CHECK-LABEL: basic:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: testl %edi, %edi
+; CHECK-NEXT: cfcmovel (%rsi), %eax
+; CHECK-NEXT: cfcmovel %eax, (%rdx)
+; CHECK-NEXT: movl $1, %eax
+; CHECK-NEXT: cfcmovneq %rax, (%rdx)
+; CHECK-NEXT: movw $2, %ax
+; CHECK-NEXT: cfcmovnew %ax, (%rcx)
+; CHECK-NEXT: retq
+entry:
+ %cond = icmp eq i32 %a, 0
+ br i1 %cond, label %if.true, label %if.false
+
+if.false:
+ store i64 1, ptr %p, align 8
+ store i16 2, ptr %q, align 8
+ br label %if.end
+
+if.true:
+ %0 = load i32, ptr %b, align 4
+ store i32 %0, ptr %p, align 4
+ br label %if.end
+
+if.end:
+ ret void
+}
diff --git a/llvm/test/CodeGen/X86/opt-pipeline.ll b/llvm/test/CodeGen/X86/opt-pipeline.ll
index 275a42e02d746..bb400d9696a4f 100644
--- a/llvm/test/CodeGen/X86/opt-pipeline.ll
+++ b/llvm/test/CodeGen/X86/opt-pipeline.ll
@@ -51,7 +51,9 @@
; CHECK-NEXT: Expand memcmp() to load/stores
; CHECK-NEXT: Lower Garbage Collection Instructions
; CHECK-NEXT: Shadow Stack GC Lowering
+; CHECK-NEXT: Simplify the CFG
; CHECK-NEXT: Remove unreachable blocks from the CFG
+; CHECK-NEXT: Dominator Tree Construction
; CHECK-NEXT: Natural Loop Information
; CHECK-NEXT: Post-Dominator Tree Construction
; CHECK-NEXT: Branch Probability Analysis
diff --git a/llvm/test/Other/new-pm-print-pipeline.ll b/llvm/test/Other/new-pm-print-pipeline.ll
index f2e80814f347a..12f88d60d66ce 100644
--- a/llvm/test/Other/new-pm-print-pipeline.ll
+++ b/llvm/test/Other/new-pm-print-pipeline.ll
@@ -49,8 +49,8 @@
; RUN: opt -disable-output -disable-verify -print-pipeline-passes -passes='function(print<stack-lifetime><may>,print<stack-lifetime><must>)' < %s | FileCheck %s --match-full-lines --check-prefixes=CHECK-17
; CHECK-17: function(print<stack-lifetime><may>,print<stack-lifetime><must>)
-; RUN: opt -disable-output -disable-verify -print-pipeline-passes -passes='function(simplifycfg<bonus-inst-threshold=5;forward-switch-cond;switch-to-lookup;keep-loops;hoist-common-insts;sink-common-insts;speculate-blocks;simplify-cond-branch;speculate-unpredictables>,simplifycfg<bonus-inst-threshold=7;no-forward-switch-cond;no-switch-to-lookup;no-keep-loops;no-hoist-common-insts;no-sink-common-insts;no-speculate-blocks;no-simplify-cond-branch;no-speculate-unpredictables>)' < %s | FileCheck %s --match-full-lines --check-prefixes=CHECK-18
-; CHECK-18: function(simplifycfg<bonus-inst-threshold=5;forward-switch-cond;no-switch-range-to-icmp;switch-to-lookup;keep-loops;hoist-common-insts;sink-common-insts;speculate-blocks;simplify-cond-branch;speculate-unpredictables>,simplifycfg<bonus-inst-threshold=7;no-forward-switch-cond;no-switch-range-to-icmp;no-switch-to-lookup;no-keep-loops;no-hoist-common-insts;no-sink-common-insts;no-speculate-blocks;no-simplify-cond-branch;no-speculate-unpredictables>)
+; RUN: opt -disable-output -disable-verify -print-pipeline-passes -passes='function(simplifycfg<bonus-inst-threshold=5;forward-switch-cond;switch-to-lookup;keep-loops;hoist-common-insts;hoist-loads-stores-with-cond-faulting;sink-common-insts;speculate-blocks;simplify-cond-branch;speculate-unpredictables>,simplifycfg<bonus-inst-threshold=7;no-forward-switch-cond;no-switch-to-lookup;no-keep-loops;no-hoist-common-insts;no-hoist-loads-stores-with-cond-faulting;no-sink-common-insts;no-speculate-blocks;no-simplify-cond-branch;no-speculate-unpredictables>)' < %s | FileCheck %s --match-full-lines --check-prefixes=CHECK-18
+; CHECK-18: function(simplifycfg<bonus-inst-threshold=5;forward-switch-cond;no-switch-range-to-icmp;switch-to-lookup;keep-loops;hoist-common-insts;hoist-loads-stores-with-cond-faulting;sink-common-insts;speculate-blocks;simplify-cond-branch;speculate-unpredictables>,simplifycfg<bonus-inst-threshold=7;no-forward-switch-cond;no-switch-range-to-icmp;no-switch-to-lookup;no-keep-loops;no-hoist-common-insts;no-hoist-loads-stores-with-cond-faulting;no-sink-common-insts;no-speculate-blocks;no-simplify-cond-branch;no-speculate-unpredictables>)
; RUN: opt -disable-output -disable-verify -print-pipeline-passes -passes='function(loop-vectorize<no-interleave-forced-only;no-vectorize-forced-only>,loop-vectorize<interleave-forced-only;vectorize-forced-only>)' < %s | FileCheck %s --match-full-lines --check-prefixes=CHECK-19
; CHECK-19: function(loop-vectorize<no-interleave-forced-only;no-vectorize-forced-only;>,loop-vectorize<interleave-forced-only;vectorize-forced-only;>)
diff --git a/llvm/test/Transforms/SimplifyCFG/X86/hoist-loads-stores-with-cf.ll b/llvm/test/Transforms/SimplifyCFG/X86/hoist-loads-stores-with-cf.ll
new file mode 100644
index 0000000000000..dc18d2e5055a8
--- /dev/null
+++ b/llvm/test/Transforms/SimplifyCFG/X86/hoist-loads-stores-with-cf.ll
@@ -0,0 +1,623 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -mtriple=x86_64 -mattr=+cf -passes='simplifycfg<hoist-loads-stores-with-cond-faulting>' -simplifycfg-require-and-preserve-domtree=1 -S -simplifycfg-hoist-loads-stores-with-cond-faulting=true | FileCheck %s
+
+;; The redundant bitcasts will be optimized out by the instcombine pass.
+define void @basic(i32 %a, ptr %b, ptr %p, ptr %q) {
+; CHECK-LABEL: @basic(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[COND:%.*]] = icmp eq i32 [[A:%.*]], 0
+; CHECK-NEXT: [[TMP0:%.*]] = bitcast i1 [[COND]] to <1 x i1>
+; CHECK-NEXT: [[TMP1:%.*]] = call <1 x i32> @llvm.masked.load.v1i32.p0(ptr [[B:%.*]], i32 4, <1 x i1> [[TMP0]], <1 x i32> poison), !dbg [[DBG8:![0-9]+]]
+; CHECK-NEXT: [[TMP2:%.*]] = bitcast <1 x i32> [[TMP1]] to i32
+; CHECK-NEXT: [[TMP3:%.*]] = bitcast i32 [[TMP2]] to <1 x i32>
+; CHECK-NEXT: call void @llvm.masked.store.v1i32.p0(<1 x i32> [[TMP3]], ptr [[P:%.*]], i32 4, <1 x i1> [[TMP0]])
+; CHECK-NEXT: [[TMP4:%.*]] = xor i1 [[COND]], true
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast i1 [[TMP4]] to <1 x i1>
+; CHECK-NEXT: call void @llvm.masked.store.v1i64.p0(<1 x i64> <i64 1>, ptr [[P]], i32 8, <1 x i1> [[TMP5]]), !dbg [[DBG12:![0-9]+]]
+; CHECK-NEXT: call void @llvm.masked.store.v1i16.p0(<1 x i16> <i16 2>, ptr [[Q:%.*]], i32 8, <1 x i1> [[TMP5]]), !dbg [[DBG12]]
+; CHECK-NEXT: ret void
+;
+entry:
+ %cond = icmp eq i32 %a, 0
+ br i1 %cond, label %if.true, label %if.false
+
+if.false:
+ store i64 1, ptr %p, align 8, !dbg !8
+ store i16 2, ptr %q, align 8, !dbg !8
+ br label %if.end
+
+if.true:
+ %0 = load i32, ptr %b, align 4, !dbg !9
+ store i32 %0, ptr %p, align 4, !DIAssignID !13
+ br label %if.end
+
+if.end:
+ ret void
+}
+
+;; simplifycfg typically runs before sroa, so the allocas here have not been optimized away yet.
+define void @alloca(ptr %p, ptr %q, i32 %a) {
+; CHECK-LABEL: @alloca(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[P_ADDR:%.*]] = alloca ptr, align 8
+; CHECK-NEXT: [[Q_ADDR:%.*]] = alloca ptr, align 8
+; CHECK-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4
+; CHECK-NEXT: store ptr [[P:%.*]], ptr [[P_ADDR]], align 8
+; CHECK-NEXT: store ptr [[Q:%.*]], ptr [[Q_ADDR]], align 8
+; CHECK-NEXT: store i32 [[A:%.*]], ptr [[A_ADDR]], align 4
+; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[A_ADDR]], align 4
+; CHECK-NEXT: [[TOBOOL:%.*]] = icmp ne i32 [[TMP0]], 0
+; CHECK-NEXT: [[TMP1:%.*]] = bitcast i1 [[TOBOOL]] to <1 x i1>
+; CHECK-NEXT: [[TMP2:%.*]] = load ptr, ptr [[Q_ADDR]], align 8
+; CHECK-NEXT: [[TMP3:%.*]] = call <1 x i32> @llvm.masked.load.v1i32.p0(ptr [[TMP2]], i32 4, <1 x i1> [[TMP1]], <1 x i32> poison)
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast <1 x i32> [[TMP3]] to i32
+; CHECK-NEXT: [[TMP5:%.*]] = load ptr, ptr [[P_ADDR]], align 8
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast i32 [[TMP4]] to <1 x i32>
+; CHECK-NEXT: call void @llvm.masked.store.v1i32.p0(<1 x i32> [[TMP6]], ptr [[TMP5]], i32 4, <1 x i1> [[TMP1]])
+; CHECK-NEXT: ret void
+;
+entry:
+ %p.addr = alloca ptr
+ %q.addr = alloca ptr
+ %a.addr = alloca i32
+ store ptr %p, ptr %p.addr
+ store ptr %q, ptr %q.addr
+ store i32 %a, ptr %a.addr
+ %0 = load i32, ptr %a.addr
+ %tobool = icmp ne i32 %0, 0
+ br i1 %tobool, label %if.then, label %if.end
+
+if.then:
+ %1 = load ptr, ptr %q.addr
+ %2 = load i32, ptr %1
+ %3 = load ptr, ptr %p.addr
+ store i32 %2, ptr %3
+ br label %if.end
+
+if.end:
+ ret void
+}
+
+;; Successor 1 branches to successor 0.
+define void @succ1to0(ptr %p, ptr %q, i32 %a) {
+; CHECK-LABEL: @succ1to0(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[TOBOOL:%.*]] = icmp ne i32 [[A:%.*]], 0
+; CHECK-NEXT: [[TMP0:%.*]] = bitcast i1 [[TOBOOL]] to <1 x i1>
+; CHECK-NEXT: [[TMP1:%.*]] = xor i1 [[TOBOOL]], true
+; CHECK-NEXT: [[TMP2:%.*]] = bitcast i1 [[TMP1]] to <1 x i1>
+; CHECK-NEXT: [[TMP3:%.*]] = call <1 x i32> @llvm.masked.load.v1i32.p0(ptr [[Q:%.*]], i32 4, <1 x i1> [[TMP2]], <1 x i32> poison)
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast <1 x i32> [[TMP3]] to i32
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast i32 [[TMP4]] to <1 x i32>
+; CHECK-NEXT: call void @llvm.masked.store.v1i32.p0(<1 x i32> [[TMP5]], ptr [[P:%.*]], i32 4, <1 x i1> [[TMP2]])
+; CHECK-NEXT: ret void
+;
+entry:
+ %tobool = icmp ne i32 %a, 0
+ br i1 %tobool, label %if.end, label %if.then
+
+if.end:
+ ret void
+
+if.then:
+ %0 = load i32, ptr %q
+ store i32 %0, ptr %p
+ br label %if.end
+}
+
+;; Successor 1 branches to successor 0 and there is a phi node.
+define i32 @succ1to0_phi(ptr %p) {
+; CHECK-LABEL: @succ1to0_phi(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[COND:%.*]] = icmp eq ptr [[P:%.*]], null
+; CHECK-NEXT: [[TMP0:%.*]] = bitcast i1 [[COND]] to <1 x i1>
+; CHECK-NEXT: [[TMP1:%.*]] = xor i1 [[COND]], true
+; CHECK-NEXT: [[TMP2:%.*]] = bitcast i1 [[TMP1]] to <1 x i1>
+; CHECK-NEXT: [[TMP3:%.*]] = call <1 x i32> @llvm.masked.load.v1i32.p0(ptr [[P]], i32 4, <1 x i1> [[TMP2]], <1 x i32> poison)
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast <1 x i32> [[TMP3]] to i32
+; CHECK-NEXT: [[SPEC_SELECT:%.*]] = select i1 [[COND]], i32 0, i32 [[TMP4]]
+; CHECK-NEXT: ret i32 [[SPEC_SELECT]]
+;
+entry:
+ %cond = icmp eq ptr %p, null
+ br i1 %cond, label %if.true, label %if.false
+
+if.false:
+ %0 = load i32, ptr %p
+ br label %if.true
+
+if.true:
+ %res = phi i32 [ %0, %if.false ], [ 0, %entry ]
+ ret i32 %res
+}
+
+;; Successor 0 branches to successor 1.
+define void @succ0to1(i32 %a, ptr %b, ptr %p, ptr %q) {
+; CHECK-LABEL: @succ0to1(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[COND:%.*]] = icmp eq i32 [[A:%.*]], 0
+; CHECK-NEXT: [[TMP0:%.*]] = bitcast i1 [[COND]] to <1 x i1>
+; CHECK-NEXT: [[TMP1:%.*]] = call <1 x i32> @llvm.masked.load.v1i32.p0(ptr [[B:%.*]], i32 4, <1 x i1> [[TMP0]], <1 x i32> poison)
+; CHECK-NEXT: [[TMP2:%.*]] = bitcast <1 x i32> [[TMP1]] to i32
+; CHECK-NEXT: [[TMP3:%.*]] = bitcast i32 [[TMP2]] to <1 x i32>
+; CHECK-NEXT: call void @llvm.masked.store.v1i32.p0(<1 x i32> [[TMP3]], ptr [[P:%.*]], i32 4, <1 x i1> [[TMP0]])
+; CHECK-NEXT: store i32 1, ptr [[Q:%.*]], align 4
+; CHECK-NEXT: ret void
+;
+entry:
+ %cond = icmp eq i32 %a, 0
+ br i1 %cond, label %if.true, label %if.false
+
+if.false:
+ store i32 1, ptr %q
+ br label %if.end
+
+if.true:
+ %0 = load i32, ptr %b
+ store i32 %0, ptr %p
+ br label %if.false
+
+if.end:
+ ret void
+}
+
+;; A load after a store can be hoisted.
+define i64 @load_after_store(i32 %a, ptr %b, ptr %p, ptr %q) {
+; CHECK-LABEL: @load_after_store(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[COND:%.*]] = icmp eq i32 [[A:%.*]], 0
+; CHECK-NEXT: [[TMP0:%.*]] = bitcast i1 [[COND]] to <1 x i1>
+; CHECK-NEXT: call void @llvm.masked.store.v1i32.p0(<1 x i32> <i32 1>, ptr [[B:%.*]], i32 4, <1 x i1> [[TMP0]])
+; CHECK-NEXT: [[TMP1:%.*]] = call <1 x i16> @llvm.masked.load.v1i16.p0(ptr [[P:%.*]], i32 2, <1 x i1> [[TMP0]], <1 x i16> poison)
+; CHECK-NEXT: [[TMP2:%.*]] = bitcast <1 x i16> [[TMP1]] to i16
+; CHECK-NEXT: [[TMP3:%.*]] = call <1 x i64> @llvm.masked.load.v1i64.p0(ptr [[Q:%.*]], i32 8, <1 x i1> [[TMP0]], <1 x i64> poison)
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast <1 x i64> [[TMP3]] to i64
+; CHECK-NEXT: [[ZEXT:%.*]] = zext i16 [[TMP2]] to i64
+; CHECK-NEXT: [[ADD:%.*]] = add i64 [[ZEXT]], [[TMP4]]
+; CHECK-NEXT: [[COMMON_RET_OP:%.*]] = select i1 [[COND]], i64 [[ADD]], i64 0
+; CHECK-NEXT: ret i64 [[COMMON_RET_OP]]
+;
+entry:
+ %cond = icmp eq i32 %a, 0
+ br i1 %cond, label %if.true, label %if.end
+
+if.true:
+ store i32 1, ptr %b
+ %0 = load i16, ptr %p
+ %1 = load i64, ptr %q
+ %zext = zext i16 %0 to i64
+ %add = add i64 %zext, %1
+ ret i64 %add
+
+if.end:
+ ret i64 0
+}
+
+;; A speculatable memory read does not prevent the hoist.
+define i32 @load_skip_speculatable_memory_read(i32 %a, ptr %b, ptr %p, ptr %q) {
+; CHECK-LABEL: @load_skip_speculatable_memory_read(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[COND:%.*]] = icmp eq i32 [[A:%.*]], 0
+; CHECK-NEXT: [[TMP0:%.*]] = bitcast i1 [[COND]] to <1 x i1>
+; CHECK-NEXT: [[TMP1:%.*]] = call <1 x i32> @llvm.masked.load.v1i32.p0(ptr [[B:%.*]], i32 4, <1 x i1> [[TMP0]], <1 x i32> poison)
+; CHECK-NEXT: [[TMP2:%.*]] = bitcast <1 x i32> [[TMP1]] to i32
+; CHECK-NEXT: [[TMP3:%.*]] = bitcast i32 [[TMP2]] to <1 x i32>
+; CHECK-NEXT: call void @llvm.masked.store.v1i32.p0(<1 x i32> [[TMP3]], ptr [[P:%.*]], i32 4, <1 x i1> [[TMP0]])
+; CHECK-NEXT: [[TMP4:%.*]] = xor i1 [[COND]], true
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast i1 [[TMP4]] to <1 x i1>
+; CHECK-NEXT: [[TMP6:%.*]] = call <1 x i32> @llvm.masked.load.v1i32.p0(ptr [[Q:%.*]], i32 4, <1 x i1> [[TMP5]], <1 x i32> poison)
+; CHECK-NEXT: [[TMP7:%.*]] = bitcast <1 x i32> [[TMP6]] to i32
+; CHECK-NEXT: [[READ:%.*]] = call i32 @read_memory_only()
+; CHECK-NEXT: [[PHI:%.*]] = select i1 [[COND]], i32 0, i32 [[READ]]
+; CHECK-NEXT: ret i32 [[PHI]]
+;
+entry:
+ %cond = icmp eq i32 %a, 0
+ br i1 %cond, label %if.true, label %if.false
+
+if.false:
+ %read = call i32 @read_memory_only()
+ %0 = load i32, ptr %q
+ br label %if.end
+
+if.true:
+ %1 = load i32, ptr %b
+ store i32 %1, ptr %p
+ br label %if.end
+
+if.end:
+ %phi = phi i32 [%read, %if.false], [0, %if.true]
+ ret i32 %phi
+}
+
+;; The source of the load can be a GEP.
+;; TODO: Make the hoist happen.
+define i32 @load_from_gep(ptr %p) {
+; CHECK-LABEL: @load_from_gep(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[COND:%.*]] = icmp eq ptr [[P:%.*]], null
+; CHECK-NEXT: br i1 [[COND]], label [[IF_TRUE:%.*]], label [[IF_FALSE:%.*]]
+; CHECK: if.false:
+; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 16
+; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; CHECK-NEXT: br label [[IF_TRUE]]
+; CHECK: if.true:
+; CHECK-NEXT: [[RES:%.*]] = phi i32 [ [[TMP0]], [[IF_FALSE]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT: ret i32 [[RES]]
+;
+entry:
+ %cond = icmp eq ptr %p, null
+ br i1 %cond, label %if.true, label %if.false
+
+if.false:
+ %arrayidx = getelementptr inbounds i8, ptr %p, i64 16
+ %0 = load i32, ptr %arrayidx
+ br label %if.true
+
+if.true:
+ %res = phi i32 [ %0, %if.false ], [ 0, %entry ]
+ ret i32 %res
+}
+
+;; The loads/stores are still hoisted, but the CFG is not flattened because the remaining instructions are too expensive to speculate.
+define i32 @not_cheap_to_hoist(i32 %a, ptr %b, ptr %p, ptr %q, i32 %v0, i32 %v1, i32 %v2, i1 %cc) {
+; CHECK-LABEL: @not_cheap_to_hoist(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[COND:%.*]] = icmp eq i32 [[A:%.*]], 0
+; CHECK-NEXT: [[TMP0:%.*]] = bitcast i1 [[COND]] to <1 x i1>
+; CHECK-NEXT: [[TMP1:%.*]] = call <1 x i32> @llvm.masked.load.v1i32.p0(ptr [[B:%.*]], i32 4, <1 x i1> [[TMP0]], <1 x i32> poison)
+; CHECK-NEXT: [[TMP2:%.*]] = bitcast <1 x i32> [[TMP1]] to i32
+; CHECK-NEXT: [[TMP3:%.*]] = bitcast i32 [[TMP2]] to <1 x i32>
+; CHECK-NEXT: call void @llvm.masked.store.v1i32.p0(<1 x i32> [[TMP3]], ptr [[P:%.*]], i32 4, <1 x i1> [[TMP0]])
+; CHECK-NEXT: [[TMP4:%.*]] = xor i1 [[COND]], true
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast i1 [[TMP4]] to <1 x i1>
+; CHECK-NEXT: call void @llvm.masked.store.v1i64.p0(<1 x i64> <i64 1>, ptr [[P]], i32 8, <1 x i1> [[TMP5]])
+; CHECK-NEXT: call void @llvm.masked.store.v1i16.p0(<1 x i16> <i16 2>, ptr [[Q:%.*]], i32 2, <1 x i1> [[TMP5]])
+; CHECK-NEXT: br i1 [[COND]], label [[COMMON_RET:%.*]], label [[IF_FALSE:%.*]]
+; CHECK: common.ret:
+; CHECK-NEXT: [[COMMON_RET_OP:%.*]] = phi i32 [ [[VVVV:%.*]], [[IF_FALSE]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT: ret i32 [[COMMON_RET_OP]]
+; CHECK: if.false:
+; CHECK-NEXT: [[V:%.*]] = udiv i32 [[A]], 12345
+; CHECK-NEXT: [[VV:%.*]] = mul i32 [[V]], [[V0:%.*]]
+; CHECK-NEXT: [[VVV:%.*]] = mul i32 [[VV]], [[V1:%.*]]
+; CHECK-NEXT: [[VVVV]] = select i1 [[CC:%.*]], i32 [[V2:%.*]], i32 [[VVV]]
+; CHECK-NEXT: br label [[COMMON_RET]]
+;
+entry:
+ %cond = icmp eq i32 %a, 0
+ br i1 %cond, label %if.true, label %if.false
+
+if.false:
+ store i64 1, ptr %p
+ store i16 2, ptr %q
+
+ %v = udiv i32 %a, 12345
+ %vv = mul i32 %v, %v0
+ %vvv = mul i32 %vv, %v1
+ %vvvv = select i1 %cc, i32 %v2, i32 %vvv
+ ret i32 %vvvv
+
+if.true:
+ %0 = load i32, ptr %b
+ store i32 %0, ptr %p
+ br label %if.end
+
+if.end:
+ ret i32 0
+}
+
+;; Do not hoist if there is more than one predecessor.
+define void @not_single_predecessor(ptr %p, ptr %q, i32 %a) {
+; CHECK-LABEL: @not_single_predecessor(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[TOBOOL:%.*]] = icmp ne i32 [[A:%.*]], 0
+; CHECK-NEXT: br i1 [[TOBOOL]], label [[IF_END:%.*]], label [[IF_THEN:%.*]]
+; CHECK: if.end:
+; CHECK-NEXT: br label [[IF_THEN]]
+; CHECK: if.then:
+; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[Q:%.*]], align 4
+; CHECK-NEXT: store i32 [[TMP0]], ptr [[P:%.*]], align 4
+; CHECK-NEXT: br label [[IF_END]]
+;
+entry:
+ %tobool = icmp ne i32 %a, 0
+ br i1 %tobool, label %if.end, label %if.then
+
+if.end:
+ br label %if.then
+
+if.then:
+ %1 = load i32, ptr %q
+ store i32 %1, ptr %p
+ br label %if.end
+}
+
+;; Do not hoist because i8 is not supported by conditional faulting.
+define void @not_supported_type(i8 %a, ptr %b, ptr %p, ptr %q) {
+; CHECK-LABEL: @not_supported_type(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[COND:%.*]] = icmp eq i8 [[A:%.*]], 0
+; CHECK-NEXT: br i1 [[COND]], label [[IF_TRUE:%.*]], label [[IF_FALSE:%.*]]
+; CHECK: if.false:
+; CHECK-NEXT: store i8 1, ptr [[Q:%.*]], align 1
+; CHECK-NEXT: br label [[IF_END:%.*]]
+; CHECK: if.true:
+; CHECK-NEXT: [[TMP0:%.*]] = load i8, ptr [[B:%.*]], align 1
+; CHECK-NEXT: store i8 [[TMP0]], ptr [[P:%.*]], align 1
+; CHECK-NEXT: br label [[IF_END]]
+; CHECK: if.end:
+; CHECK-NEXT: ret void
+;
+entry:
+ %cond = icmp eq i8 %a, 0
+ br i1 %cond, label %if.true, label %if.false
+
+if.false:
+ store i8 1, ptr %q
+ br label %if.end
+
+if.true:
+ %0 = load i8, ptr %b
+ store i8 %0, ptr %p
+ br label %if.end
+
+if.end:
+ ret void
+}
+
+;; Do not hoist if the terminator is not a br.
+define void @not_br_terminator(i32 %a, ptr %b, ptr %p, ptr %q) {
+; CHECK-LABEL: @not_br_terminator(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: switch i32 [[A:%.*]], label [[IF_END:%.*]] [
+; CHECK-NEXT: i32 1, label [[IF_FALSE:%.*]]
+; CHECK-NEXT: i32 2, label [[IF_TRUE:%.*]]
+; CHECK-NEXT: ]
+; CHECK: if.false:
+; CHECK-NEXT: store i32 1, ptr [[Q:%.*]], align 4
+; CHECK-NEXT: br label [[IF_END]]
+; CHECK: if.true:
+; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[B:%.*]], align 4
+; CHECK-NEXT: store i32 [[TMP0]], ptr [[P:%.*]], align 4
+; CHECK-NEXT: br label [[IF_FALSE]]
+; CHECK: if.end:
+; CHECK-NEXT: ret void
+;
+entry:
+ switch i32 %a, label %if.end [
+ i32 1, label %if.false
+ i32 2, label %if.true
+ ]
+
+if.false:
+ store i32 1, ptr %q, align 4
+ br label %if.end
+
+if.true:
+ %0 = load i32, ptr %b, align 4
+ store i32 %0, ptr %p, align 4
+ br label %if.false
+
+if.end:
+ ret void
+}
+
+;; Do not hoist if the instruction to be hoisted is atomic or volatile.
+define void @not_simple(i32 %a, ptr %b, ptr %p, ptr %q) {
+; CHECK-LABEL: @not_simple(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[COND:%.*]] = icmp eq i32 [[A:%.*]], 0
+; CHECK-NEXT: br i1 [[COND]], label [[IF_TRUE:%.*]], label [[IF_FALSE:%.*]]
+; CHECK: if.false:
+; CHECK-NEXT: store atomic i32 1, ptr [[Q:%.*]] seq_cst, align 4
+; CHECK-NEXT: br label [[IF_END:%.*]]
+; CHECK: if.true:
+; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[B:%.*]], align 4
+; CHECK-NEXT: store i32 [[TMP0]], ptr [[P:%.*]], align 4
+; CHECK-NEXT: br label [[IF_END]]
+; CHECK: if.end:
+; CHECK-NEXT: ret void
+;
+entry:
+ %cond = icmp eq i32 %a, 0
+ br i1 %cond, label %if.true, label %if.false
+
+if.false:
+ store atomic i32 1, ptr %q seq_cst, align 4
+ br label %if.end
+
+if.true:
+ %0 = load i32, ptr %b, align 4
+ store i32 %0, ptr %p, align 4
+ br label %if.end
+
+if.end:
+ ret void
+}
+
+;; Do not hoist if there is a load/store in the same bb that cannot be hoisted.
+define void @not_hoistable_store(i32 %a, ptr %b, ptr %p, ptr %q) {
+; CHECK-LABEL: @not_hoistable_store(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[COND:%.*]] = icmp eq i32 [[A:%.*]], 0
+; CHECK-NEXT: br i1 [[COND]], label [[IF_TRUE:%.*]], label [[IF_FALSE:%.*]]
+; CHECK: if.false:
+; CHECK-NEXT: store i32 1, ptr [[Q:%.*]], align 4
+; CHECK-NEXT: br label [[IF_END:%.*]]
+; CHECK: if.true:
+; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[B:%.*]], align 4
+; CHECK-NEXT: store volatile i32 [[TMP0]], ptr [[P:%.*]], align 4
+; CHECK-NEXT: br label [[IF_END]]
+; CHECK: if.end:
+; CHECK-NEXT: ret void
+;
+entry:
+ %cond = icmp eq i32 %a, 0
+ br i1 %cond, label %if.true, label %if.false
+
+if.false:
+ store i32 1, ptr %q, align 4
+ br label %if.end
+
+if.true:
+ %0 = load i32, ptr %b, align 4
+ store volatile i32 %0, ptr %p, align 4
+ br label %if.end
+
+if.end:
+ ret void
+}
+
+;; Do not hoist if there is an instruction with side effects in the same bb.
+define void @not_hoistable_sideeffect(i32 %a, ptr %b, ptr %p, ptr %q) {
+; CHECK-LABEL: @not_hoistable_sideeffect(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[COND:%.*]] = icmp eq i32 [[A:%.*]], 0
+; CHECK-NEXT: br i1 [[COND]], label [[IF_TRUE:%.*]], label [[IF_FALSE:%.*]]
+; CHECK: if.false:
+; CHECK-NEXT: store i32 1, ptr [[Q:%.*]], align 4
+; CHECK-NEXT: br label [[IF_END:%.*]]
+; CHECK: if.true:
+; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[B:%.*]], align 4
+; CHECK-NEXT: [[RMW:%.*]] = atomicrmw xchg ptr [[Q]], double 4.000000e+00 seq_cst, align 8
+; CHECK-NEXT: store i32 [[TMP0]], ptr [[P:%.*]], align 4
+; CHECK-NEXT: br label [[IF_END]]
+; CHECK: if.end:
+; CHECK-NEXT: ret void
+;
+entry:
+ %cond = icmp eq i32 %a, 0
+ br i1 %cond, label %if.true, label %if.false
+
+if.false:
+ store i32 1, ptr %q, align 4
+ br label %if.end
+
+if.true:
+ %0 = load i32, ptr %b, align 4
+ %rmw= atomicrmw xchg ptr %q, double 4.0 seq_cst
+ store i32 %0, ptr %p, align 4
+ br label %if.end
+
+if.end:
+ ret void
+}
+
+;; Do not hoist because an operand of the store does not dominate the branch.
+;; TODO: Could we improve this?
+define void @not_ops_dominate_br(i32 %a, ptr %b, ptr %p, ptr %q) {
+; CHECK-LABEL: @not_ops_dominate_br(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[COND:%.*]] = icmp eq i32 [[A:%.*]], 0
+; CHECK-NEXT: br i1 [[COND]], label [[IF_TRUE:%.*]], label [[IF_FALSE:%.*]]
+; CHECK: if.false:
+; CHECK-NEXT: [[ADD:%.*]] = add i32 [[A]], 2
+; CHECK-NEXT: store i32 [[ADD]], ptr [[Q:%.*]], align 4
+; CHECK-NEXT: br label [[IF_END:%.*]]
+; CHECK: if.true:
+; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[B:%.*]], align 4
+; CHECK-NEXT: store i32 [[TMP0]], ptr [[P:%.*]], align 4
+; CHECK-NEXT: br label [[IF_END]]
+; CHECK: if.end:
+; CHECK-NEXT: ret void
+;
+entry:
+ %cond = icmp eq i32 %a, 0
+ br i1 %cond, label %if.true, label %if.false
+
+if.false:
+ %add = add i32 %a, 2
+ store i32 %add, ptr %q, align 4
+ br label %if.end
+
+if.true:
+ %1 = load i32, ptr %b, align 4
+ store i32 %1, ptr %p, align 4
+ br label %if.end
+
+if.end:
+ ret void
+}
+
+;; Do not hoist a store when a memory read before it in the block is skipped over.
+define void @not_store_skip_memory_read(i32 %a, ptr %b, ptr %p, ptr %q) {
+; CHECK-LABEL: @not_store_skip_memory_read(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[COND:%.*]] = icmp eq i32 [[A:%.*]], 0
+; CHECK-NEXT: br i1 [[COND]], label [[IF_TRUE:%.*]], label [[IF_FALSE:%.*]]
+; CHECK: if.false:
+; CHECK-NEXT: call void @read_memory_only()
+; CHECK-NEXT: store i32 1, ptr [[Q:%.*]], align 4
+; CHECK-NEXT: br label [[IF_END:%.*]]
+; CHECK: if.true:
+; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[B:%.*]], align 4
+; CHECK-NEXT: store i32 [[TMP0]], ptr [[P:%.*]], align 4
+; CHECK-NEXT: br label [[IF_END]]
+; CHECK: if.end:
+; CHECK-NEXT: ret void
+;
+entry:
+ %cond = icmp eq i32 %a, 0
+ br i1 %cond, label %if.true, label %if.false
+
+if.false:
+ call void @read_memory_only()
+ store i32 1, ptr %q, align 4
+ br label %if.end
+
+if.true:
+ %1 = load i32, ptr %b, align 4
+ store i32 %1, ptr %p, align 4
+ br label %if.end
+
+if.end:
+ ret void
+}
+
+;; Do not hoist if the branch is predictable and the `then` BB is not likely to be executed.
+define void @not_likely_to_execute(ptr %p, ptr %q, i32 %a) {
+; CHECK-LABEL: @not_likely_to_execute(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[TOBOOL:%.*]] = icmp ne i32 [[A:%.*]], 0
+; CHECK-NEXT: br i1 [[TOBOOL]], label [[IF_THEN:%.*]], label [[IF_END:%.*]], !prof [[PROF13:![0-9]+]]
+; CHECK: if.end:
+; CHECK-NEXT: ret void
+; CHECK: if.then:
+; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[Q:%.*]], align 4
+; CHECK-NEXT: store i32 [[TMP0]], ptr [[P:%.*]], align 4
+; CHECK-NEXT: br label [[IF_END]]
+;
+entry:
+ %tobool = icmp ne i32 %a, 0
+ br i1 %tobool, label %if.then, label %if.end, !prof !14
+
+if.end:
+ ret void
+
+if.then:
+ %0 = load i32, ptr %q
+ store i32 %0, ptr %p
+ br label %if.end
+}
+
+declare i32 @read_memory_only() readonly nounwind willreturn speculatable
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!2, !3, !4, !5, !6, !7}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C11, file: !1, producer: "clang", isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly, splitDebugInlining: false, nameTableKind: None)
+!1 = !DIFile(filename: "test.c", directory: "/tmp")
+!2 = !{i32 7, !"Dwarf Version", i32 5}
+!3 = !{i32 2, !"Debug Info Version", i32 3}
+!4 = !{i32 1, !"wchar_size", i32 4}
+!5 = !{i32 8, !"PIC Level", i32 2}
+!6 = !{i32 7, !"PIE Level", i32 2}
+!7 = !{i32 7, !"uwtable", i32 2}
+!8 = !DILocation(line: 1, column: 2, scope: !10)
+!9 = !DILocation(line: 1, column: 3, scope: !10)
+!10 = distinct !DISubprogram(name: "basic", scope: !1, file: !1, line: 1, type: !11, scopeLine: 1, flags: DIFlagPrototyped, unit: !0)
+!11 = !DISubroutineType(types: !12)
+!12 = !{}
+!13 = distinct !DIAssignID()
+!14 = !{!"branch_weights", i32 1, i32 99}