[llvm] [X86][SimplifyCFG] Support hoisting load/store with conditional faulting (PR #96878)
Shengchen Kan via llvm-commits
llvm-commits at lists.llvm.org
Fri Jun 28 00:33:56 PDT 2024
https://github.com/KanRobert updated https://github.com/llvm/llvm-project/pull/96878
>From 0195a0824793e3add14cd985b506d9b04162c921 Mon Sep 17 00:00:00 2001
From: Shengchen Kan <shengchen.kan at intel.com>
Date: Thu, 27 Jun 2024 17:10:00 +0800
Subject: [PATCH 1/4] [X86][SimplifyCFG] Support hoisting load/store with
conditional faulting
---
llvm/lib/Transforms/Utils/SimplifyCFG.cpp | 221 ++++++++-
.../X86/hoist-load-store-with-cf.ll | 460 ++++++++++++++++++
2 files changed, 676 insertions(+), 5 deletions(-)
create mode 100644 llvm/test/Transforms/SimplifyCFG/X86/hoist-load-store-with-cf.ll
diff --git a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
index c52c4dc0b8a51..558fafd5a2652 100644
--- a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
+++ b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
@@ -131,6 +131,12 @@ static cl::opt<bool> HoistCondStores(
"simplifycfg-hoist-cond-stores", cl::Hidden, cl::init(true),
cl::desc("Hoist conditional stores if an unconditional store precedes"));
+static cl::opt<bool> HoistLoadsStoresWithCondFaulting(
+ "simplifycfg-hoist-loads-stores-with-cond-faulting", cl::Hidden,
+ cl::init(true),
+ cl::desc("Hoist loads/stores if the target supports "
+ "conditional faulting"));
+
static cl::opt<bool> MergeCondStores(
"simplifycfg-merge-cond-stores", cl::Hidden, cl::init(true),
cl::desc("Hoist conditional stores even if an unconditional store does not "
@@ -275,6 +281,7 @@ class SimplifyCFGOpt {
bool hoistSuccIdenticalTerminatorToSwitchOrIf(
Instruction *TI, Instruction *I1,
SmallVectorImpl<Instruction *> &OtherSuccTIs);
+ bool hoistLoadStoreWithCondFaultingFromSuccessors(BasicBlock *BB);
bool SpeculativelyExecuteBB(BranchInst *BI, BasicBlock *ThenBB);
bool SimplifyTerminatorOnSelect(Instruction *OldTerm, Value *Cond,
BasicBlock *TrueBB, BasicBlock *FalseBB,
@@ -2960,6 +2967,199 @@ static bool validateAndCostRequiredSelects(BasicBlock *BB, BasicBlock *ThenBB,
return HaveRewritablePHIs;
}
+/// Hoist load/store instructions from the conditional successor blocks up into
+/// the block.
+///
+/// We are looking for code like the following:
+/// \code
+/// BB:
+/// ...
+/// %cond = icmp ult %x, %y
+/// br i1 %cond, label %TrueBB, label %FalseBB
+/// FalseBB:
+/// store i32 1, ptr %q, align 4
+/// ...
+/// TrueBB:
+/// %0 = load i32, ptr %b, align 4
+/// store i32 %0, ptr %p, align 4
+/// ...
+/// \endcode
+//
+/// We are going to transform this into:
+///
+/// \code
+/// BB:
+/// ...
+/// %cond = icmp ult %x, %y
+/// %0 = cload i32, ptr %b, %cond
+/// cstore i32 %0, ptr %p, %cond
+/// cstore i32 1, ptr %q, ~%cond
+/// br i1 %cond, label %TrueBB, label %FalseBB
+/// FalseBB:
+/// ...
+/// TrueBB:
+/// ...
+/// \endcode
+///
+/// where cload/cstore is represented by intrinsic like llvm.masked.load/store,
+/// e.g.
+///
+/// \code
+/// %vcond = bitcast i1 %cond to <1 x i1>
+/// %v0 = call <1 x i32> @llvm.masked.load.v1i32.p0
+/// (ptr %b, i32 4, <1 x i1> %vcond, <1 x i32> poison)
+/// %0 = bitcast <1 x i32> %v0 to i32
+/// call void @llvm.masked.store.v1i32.p0
+// (<1 x i32> %v0, ptr %p, i32 4, <1 x i1> %vcond)
+/// %cond.not = xor i1 %cond, true
+/// %vcond.not = bitcast i1 %cond.not to <1 x i>
+/// call void @llvm.masked.store.v1i32.p0
+/// (<1 x i32> <i32 1>, ptr %q, i32 4, <1x i1> %vcond.not)
+/// \endcode
+///
+/// \returns true if any load/store is hosited.
+///
+/// Note that this tranform should be run
+/// * before SpeculativelyExecuteBB so that the latter can have more chance.
+/// * after hoistCommonCodeFromSuccessors to ensure unconditional loads/stores
+/// are handled first.
+bool SimplifyCFGOpt::hoistLoadStoreWithCondFaultingFromSuccessors(
+ BasicBlock *BB) {
+ if (!HoistLoadsStoresWithCondFaulting ||
+ !TTI.hasConditionalLoadStoreForType())
+ return false;
+
+ auto *BI = dyn_cast<BranchInst>(BB->getTerminator());
+ if (!BI || !BI->isConditional())
+ return false;
+
+ BasicBlock *IfTrueBB = BI->getSuccessor(0);
+ BasicBlock *IfFalseBB = BI->getSuccessor(1);
+
+ // If either of the blocks has it's address taken, then we can't do this fold,
+ // because the code we'd hoist would no longer run when we jump into the block
+ // by it's address.
+ for (auto *Succ : {IfTrueBB, IfFalseBB})
+ if (Succ->hasAddressTaken())
+ return false;
+
+ // Not use isa<AllocaInst>(getUnderlyingObject(I.getOperand(0)) to avoid
+ // checking all intermediate operands dominate the branch.
+ auto IsLoadFromAlloca = [](const Instruction &I) {
+ return isa<LoadInst>(I) && isa<AllocaInst>((I.getOperand(0)));
+ };
+
+ // Collect hoisted loads/stores.
+ SmallSetVector<Instruction *, 4> HoistedInsts;
+ // Not hoist load/store if
+ // 1. target does not have corresponding conditional faulting load/store.
+ // 2. it's volatile or atomic.
+ // 3. there is a load/store that can not be hoisted in the same bb.
+ // 4. there is a non-load/store that's not safe to speculatively execute
+ // in the same bb.
+ // 5. any operand of it does not dominate the branch.
+ // 6. it's a store and a memory read is skipped.
+ auto HoistInstsInBB = [&](BasicBlock *BB) {
+ bool SkipMemoryRead = false;
+ // A more efficient way to check domination. An operand dominates the
+ // BranchInst if
+ // 1. it's not defined in the same bb as the instruction.
+ // 2. it's to be hoisted.
+ //
+ // b/c BB is only predecessor and BranchInst does not define any value.
+ auto OpsDominatesBranch = [&](Instruction &I) {
+ return llvm::all_of(I.operands(), [&](Value *Op) {
+ if (auto *J = dyn_cast<Instruction>(Op)) {
+ if (HoistedInsts.contains(J))
+ return true;
+ if (J->getParent() == I.getParent())
+ return false;
+ }
+ return true;
+ });
+ };
+ for (auto &I : *BB) {
+ auto *LI = dyn_cast<LoadInst>(&I);
+ auto *SI = dyn_cast<StoreInst>(&I);
+ if (LI || SI) {
+ bool IsSimple = (LI && LI->isSimple()) || (SI && SI->isSimple());
+ if (!IsSimple || !OpsDominatesBranch(I))
+ return false;
+ auto *Type = LI ? I.getType() : I.getOperand(0)->getType();
+ // a load from alloca is always safe.
+ if (!IsLoadFromAlloca(I) && !TTI.hasConditionalLoadStoreForType(Type))
+ return false;
+ // Conservative aliasing check.
+ if (SI && SkipMemoryRead)
+ return false;
+ HoistedInsts.insert(&I);
+ } else if (!I.isTerminator() && !isSafeToSpeculativelyExecute(&I))
+ return false;
+ else if (I.mayReadFromMemory())
+ SkipMemoryRead = true;
+ }
+ return true;
+ };
+
+ if (!HoistInstsInBB(IfTrueBB) || !HoistInstsInBB(IfFalseBB) ||
+ HoistedInsts.empty())
+ return false;
+
+ // Put newly added instructions before the BranchInst.
+ IRBuilder<> Builder(BI);
+ auto &Context = BB->getContext();
+ auto *VCondTy = FixedVectorType::get(Type::getInt1Ty(Context), 1);
+ auto *Cond = BI->getOperand(0);
+ auto *VCond = Builder.CreateBitCast(Cond, VCondTy);
+ Value *VCondNot = nullptr;
+ for (auto *I : HoistedInsts) {
+ // Only need to move the position for load from alloca.
+ if (IsLoadFromAlloca(*I)) {
+ I->moveBefore(BI);
+ continue;
+ }
+
+ bool InvertCond = I->getParent() == IfFalseBB;
+ // Construct the inverted condition if need.
+ if (InvertCond && !VCondNot)
+ VCondNot = Builder.CreateBitCast(
+ Builder.CreateXor(Cond, ConstantInt::getTrue(Context)), VCondTy);
+
+ auto *Mask = InvertCond ? VCondNot : VCond;
+ auto *Op0 = I->getOperand(0);
+ if (auto *LI = dyn_cast<LoadInst>(I)) {
+ // Load
+ auto *Ty = I->getType();
+ // NOTE: Now we assume conditional faulting load/store is supported for
+ // scalar only when creating new instructions, but it's easy to extend it
+ // for vector types in the future.
+ assert(!Ty->isVectorTy() && "not implemented");
+ auto *V0 = Builder.CreateMaskedLoad(FixedVectorType::get(Ty, 1), Op0,
+ LI->getAlign(), Mask);
+ auto *S0 = Builder.CreateBitCast(V0, Ty);
+ V0->copyMetadata(*I);
+ I->replaceAllUsesWith(S0);
+ } else {
+ // Store
+ assert(!Op0->getType()->isVectorTy() && "not implemented");
+ auto *StoredVal =
+ Builder.CreateBitCast(Op0, FixedVectorType::get(Op0->getType(), 1));
+ auto *VStore = Builder.CreateMaskedStore(
+ StoredVal, I->getOperand(1), cast<StoreInst>(I)->getAlign(), Mask);
+ VStore->copyMetadata(*I);
+ }
+ }
+
+ // Erase the hoisted instrutions in reverse order to avoid use-w/o-define
+ // error.
+ std::for_each(HoistedInsts.rbegin(), HoistedInsts.rend(), [&](auto I) {
+ if (!IsLoadFromAlloca(*I))
+ I->eraseFromParent();
+ });
+
+ return true;
+}
+
/// Speculate a conditional basic block flattening the CFG.
///
/// Note that this is a very risky transform currently. Speculating
@@ -7420,31 +7620,42 @@ bool SimplifyCFGOpt::simplifyCondBranch(BranchInst *BI, IRBuilder<> &Builder) {
return requestResimplify();
// We have a conditional branch to two blocks that are only reachable
- // from BI. We know that the condbr dominates the two blocks, so see if
- // there is any identical code in the "then" and "else" blocks. If so, we
- // can hoist it up to the branching block.
+ // from BI. We know that the condbr dominates the two blocks, so see
+ //
+ // * if there is any identical code in the "then" and "else" blocks.
+ // * if there is any different load/store in the "then" and "else" blocks.
+ //
+ // If so, we can hoist it up to the branching block.
if (BI->getSuccessor(0)->getSinglePredecessor()) {
if (BI->getSuccessor(1)->getSinglePredecessor()) {
if (HoistCommon && hoistCommonCodeFromSuccessors(
BI->getParent(), !Options.HoistCommonInsts))
return requestResimplify();
+ if (hoistLoadStoreWithCondFaultingFromSuccessors(BI->getParent()))
+ return requestResimplify();
} else {
// If Successor #1 has multiple preds, we may be able to conditionally
// execute Successor #0 if it branches to Successor #1.
Instruction *Succ0TI = BI->getSuccessor(0)->getTerminator();
if (Succ0TI->getNumSuccessors() == 1 &&
- Succ0TI->getSuccessor(0) == BI->getSuccessor(1))
+ Succ0TI->getSuccessor(0) == BI->getSuccessor(1)) {
+ if (hoistLoadStoreWithCondFaultingFromSuccessors(BI->getParent()))
+ return requestResimplify();
if (SpeculativelyExecuteBB(BI, BI->getSuccessor(0)))
return requestResimplify();
+ }
}
} else if (BI->getSuccessor(1)->getSinglePredecessor()) {
// If Successor #0 has multiple preds, we may be able to conditionally
// execute Successor #1 if it branches to Successor #0.
Instruction *Succ1TI = BI->getSuccessor(1)->getTerminator();
if (Succ1TI->getNumSuccessors() == 1 &&
- Succ1TI->getSuccessor(0) == BI->getSuccessor(0))
+ Succ1TI->getSuccessor(0) == BI->getSuccessor(0)) {
+ if (hoistLoadStoreWithCondFaultingFromSuccessors(BI->getParent()))
+ return requestResimplify();
if (SpeculativelyExecuteBB(BI, BI->getSuccessor(1)))
return requestResimplify();
+ }
}
// If this is a branch on something for which we know the constant value in
diff --git a/llvm/test/Transforms/SimplifyCFG/X86/hoist-load-store-with-cf.ll b/llvm/test/Transforms/SimplifyCFG/X86/hoist-load-store-with-cf.ll
new file mode 100644
index 0000000000000..2fd0055cf05f9
--- /dev/null
+++ b/llvm/test/Transforms/SimplifyCFG/X86/hoist-load-store-with-cf.ll
@@ -0,0 +1,460 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -mtriple=x86_64 -mattr=+cf -passes=simplifycfg -simplifycfg-require-and-preserve-domtree=1 -S -simplifycfg-hoist-loads-stores-with-cond-faulting=true | FileCheck %s
+
+;; The redundant bitcast/insertelement will be opimized out in instcombine pass.
+define void @basic(i32 %a, ptr %b, ptr %p, ptr %q) {
+; CHECK-LABEL: @basic(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[COND:%.*]] = icmp eq i32 [[A:%.*]], 0
+; CHECK-NEXT: [[TMP0:%.*]] = bitcast i1 [[COND]] to <1 x i1>
+; CHECK-NEXT: [[TMP1:%.*]] = call <1 x i32> @llvm.masked.load.v1i32.p0(ptr [[B:%.*]], i32 4, <1 x i1> [[TMP0]], <1 x i32> poison), !dbg [[DBG8:![0-9]+]]
+; CHECK-NEXT: [[TMP2:%.*]] = bitcast <1 x i32> [[TMP1]] to i32
+; CHECK-NEXT: [[TMP3:%.*]] = bitcast i32 [[TMP2]] to <1 x i32>
+; CHECK-NEXT: call void @llvm.masked.store.v1i32.p0(<1 x i32> [[TMP3]], ptr [[P:%.*]], i32 4, <1 x i1> [[TMP0]])
+; CHECK-NEXT: [[TMP4:%.*]] = xor i1 [[COND]], true
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast i1 [[TMP4]] to <1 x i1>
+; CHECK-NEXT: call void @llvm.masked.store.v1i64.p0(<1 x i64> <i64 1>, ptr [[P]], i32 8, <1 x i1> [[TMP5]]), !dbg [[DBG12:![0-9]+]]
+; CHECK-NEXT: call void @llvm.masked.store.v1i16.p0(<1 x i16> <i16 2>, ptr [[Q:%.*]], i32 8, <1 x i1> [[TMP5]]), !dbg [[DBG12]]
+; CHECK-NEXT: ret void
+;
+entry:
+ %cond = icmp eq i32 %a, 0
+ br i1 %cond, label %if.true, label %if.false
+
+if.false:
+ store i64 1, ptr %p, align 8, !dbg !8
+ store i16 2, ptr %q, align 8, !dbg !8
+ br label %if.end
+
+if.true:
+ %0 = load i32, ptr %b, align 4, !dbg !9
+ store i32 %0, ptr %p, align 4
+ br label %if.end
+
+if.end:
+ ret void
+}
+
+;; simplifycfg is run before sroa. alloca here is not optimized away yet.
+define void @alloca(ptr %p, ptr %q, i32 %a) {
+; CHECK-LABEL: @alloca(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[P_ADDR:%.*]] = alloca ptr, align 8
+; CHECK-NEXT: [[Q_ADDR:%.*]] = alloca ptr, align 8
+; CHECK-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4
+; CHECK-NEXT: store ptr [[P:%.*]], ptr [[P_ADDR]], align 8
+; CHECK-NEXT: store ptr [[Q:%.*]], ptr [[Q_ADDR]], align 8
+; CHECK-NEXT: store i32 [[A:%.*]], ptr [[A_ADDR]], align 4
+; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[A_ADDR]], align 4
+; CHECK-NEXT: [[TOBOOL:%.*]] = icmp ne i32 [[TMP0]], 0
+; CHECK-NEXT: [[TMP1:%.*]] = bitcast i1 [[TOBOOL]] to <1 x i1>
+; CHECK-NEXT: [[TMP2:%.*]] = load ptr, ptr [[Q_ADDR]], align 8
+; CHECK-NEXT: [[TMP3:%.*]] = call <1 x i32> @llvm.masked.load.v1i32.p0(ptr [[TMP2]], i32 4, <1 x i1> [[TMP1]], <1 x i32> poison)
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast <1 x i32> [[TMP3]] to i32
+; CHECK-NEXT: [[TMP5:%.*]] = load ptr, ptr [[P_ADDR]], align 8
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast i32 [[TMP4]] to <1 x i32>
+; CHECK-NEXT: call void @llvm.masked.store.v1i32.p0(<1 x i32> [[TMP6]], ptr [[TMP5]], i32 4, <1 x i1> [[TMP1]])
+; CHECK-NEXT: ret void
+;
+entry:
+ %p.addr = alloca ptr
+ %q.addr = alloca ptr
+ %a.addr = alloca i32
+ store ptr %p, ptr %p.addr
+ store ptr %q, ptr %q.addr
+ store i32 %a, ptr %a.addr
+ %0 = load i32, ptr %a.addr
+ %tobool = icmp ne i32 %0, 0
+ br i1 %tobool, label %if.then, label %if.end
+
+if.then:
+ %1 = load ptr, ptr %q.addr
+ %2 = load i32, ptr %1
+ %3 = load ptr, ptr %p.addr
+ store i32 %2, ptr %3
+ br label %if.end
+
+if.end:
+ ret void
+}
+
+;; successor 1 branches to successor 0.
+define void @succ1to0(ptr %p, ptr %q, i32 %a) {
+; CHECK-LABEL: @succ1to0(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[TOBOOL:%.*]] = icmp ne i32 [[A:%.*]], 0
+; CHECK-NEXT: [[TMP0:%.*]] = bitcast i1 [[TOBOOL]] to <1 x i1>
+; CHECK-NEXT: [[TMP1:%.*]] = xor i1 [[TOBOOL]], true
+; CHECK-NEXT: [[TMP2:%.*]] = bitcast i1 [[TMP1]] to <1 x i1>
+; CHECK-NEXT: [[TMP3:%.*]] = call <1 x i32> @llvm.masked.load.v1i32.p0(ptr [[Q:%.*]], i32 4, <1 x i1> [[TMP2]], <1 x i32> poison)
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast <1 x i32> [[TMP3]] to i32
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast i32 [[TMP4]] to <1 x i32>
+; CHECK-NEXT: call void @llvm.masked.store.v1i32.p0(<1 x i32> [[TMP5]], ptr [[P:%.*]], i32 4, <1 x i1> [[TMP2]])
+; CHECK-NEXT: ret void
+;
+entry:
+ %tobool = icmp ne i32 %a, 0
+ br i1 %tobool, label %if.end, label %if.then
+
+if.end:
+ ret void
+
+if.then:
+ %0 = load i32, ptr %q
+ store i32 %0, ptr %p
+ br label %if.end
+}
+
+;; successor 0 branches to successor 1.
+define void @succ0to1(i32 %a, ptr %b, ptr %p, ptr %q) {
+; CHECK-LABEL: @succ0to1(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[COND:%.*]] = icmp eq i32 [[A:%.*]], 0
+; CHECK-NEXT: [[TMP0:%.*]] = bitcast i1 [[COND]] to <1 x i1>
+; CHECK-NEXT: [[TMP1:%.*]] = call <1 x i32> @llvm.masked.load.v1i32.p0(ptr [[B:%.*]], i32 4, <1 x i1> [[TMP0]], <1 x i32> poison)
+; CHECK-NEXT: [[TMP2:%.*]] = bitcast <1 x i32> [[TMP1]] to i32
+; CHECK-NEXT: [[TMP3:%.*]] = bitcast i32 [[TMP2]] to <1 x i32>
+; CHECK-NEXT: call void @llvm.masked.store.v1i32.p0(<1 x i32> [[TMP3]], ptr [[P:%.*]], i32 4, <1 x i1> [[TMP0]])
+; CHECK-NEXT: [[TMP4:%.*]] = xor i1 [[COND]], true
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast i1 [[TMP4]] to <1 x i1>
+; CHECK-NEXT: call void @llvm.masked.store.v1i32.p0(<1 x i32> <i32 1>, ptr [[Q:%.*]], i32 4, <1 x i1> [[TMP5]])
+; CHECK-NEXT: ret void
+;
+entry:
+ %cond = icmp eq i32 %a, 0
+ br i1 %cond, label %if.true, label %if.false
+
+if.false:
+ store i32 1, ptr %q
+ br label %if.end
+
+if.true:
+ %0 = load i32, ptr %b
+ store i32 %0, ptr %p
+ br label %if.false
+
+if.end:
+ ret void
+}
+
+;; load after store can be hoisted.
+define i64 @load_after_store(i32 %a, ptr %b, ptr %p, ptr %q) {
+; CHECK-LABEL: @load_after_store(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[COND:%.*]] = icmp eq i32 [[A:%.*]], 0
+; CHECK-NEXT: [[TMP0:%.*]] = bitcast i1 [[COND]] to <1 x i1>
+; CHECK-NEXT: call void @llvm.masked.store.v1i32.p0(<1 x i32> <i32 1>, ptr [[B:%.*]], i32 4, <1 x i1> [[TMP0]])
+; CHECK-NEXT: [[TMP1:%.*]] = call <1 x i16> @llvm.masked.load.v1i16.p0(ptr [[P:%.*]], i32 2, <1 x i1> [[TMP0]], <1 x i16> poison)
+; CHECK-NEXT: [[TMP2:%.*]] = bitcast <1 x i16> [[TMP1]] to i16
+; CHECK-NEXT: [[TMP3:%.*]] = call <1 x i64> @llvm.masked.load.v1i64.p0(ptr [[Q:%.*]], i32 8, <1 x i1> [[TMP0]], <1 x i64> poison)
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast <1 x i64> [[TMP3]] to i64
+; CHECK-NEXT: [[ZEXT:%.*]] = zext i16 [[TMP2]] to i64
+; CHECK-NEXT: [[ADD:%.*]] = add i64 [[ZEXT]], [[TMP4]]
+; CHECK-NEXT: [[COMMON_RET_OP:%.*]] = select i1 [[COND]], i64 [[ADD]], i64 0
+; CHECK-NEXT: ret i64 [[COMMON_RET_OP]]
+;
+entry:
+ %cond = icmp eq i32 %a, 0
+ br i1 %cond, label %if.true, label %if.end
+
+if.true:
+ store i32 1, ptr %b
+ %0 = load i16, ptr %p
+ %1 = load i64, ptr %q
+ %zext = zext i16 %0 to i64
+ %add = add i64 %zext, %1
+ ret i64 %add
+
+if.end:
+ ret i64 0
+}
+
+define i32 @load_skip_speculatable_memory_read(i32 %a, ptr %b, ptr %p, ptr %q) {
+; CHECK-LABEL: @load_skip_speculatable_memory_read(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[COND:%.*]] = icmp eq i32 [[A:%.*]], 0
+; CHECK-NEXT: [[TMP0:%.*]] = bitcast i1 [[COND]] to <1 x i1>
+; CHECK-NEXT: [[TMP1:%.*]] = call <1 x i32> @llvm.masked.load.v1i32.p0(ptr [[B:%.*]], i32 4, <1 x i1> [[TMP0]], <1 x i32> poison)
+; CHECK-NEXT: [[TMP2:%.*]] = bitcast <1 x i32> [[TMP1]] to i32
+; CHECK-NEXT: [[TMP3:%.*]] = bitcast i32 [[TMP2]] to <1 x i32>
+; CHECK-NEXT: call void @llvm.masked.store.v1i32.p0(<1 x i32> [[TMP3]], ptr [[P:%.*]], i32 4, <1 x i1> [[TMP0]])
+; CHECK-NEXT: [[TMP4:%.*]] = xor i1 [[COND]], true
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast i1 [[TMP4]] to <1 x i1>
+; CHECK-NEXT: [[TMP6:%.*]] = call <1 x i32> @llvm.masked.load.v1i32.p0(ptr [[Q:%.*]], i32 4, <1 x i1> [[TMP5]], <1 x i32> poison)
+; CHECK-NEXT: [[TMP7:%.*]] = bitcast <1 x i32> [[TMP6]] to i32
+; CHECK-NEXT: [[READ:%.*]] = call i32 @read_memory_only()
+; CHECK-NEXT: [[PHI:%.*]] = select i1 [[COND]], i32 0, i32 [[READ]]
+; CHECK-NEXT: ret i32 [[PHI]]
+;
+entry:
+ %cond = icmp eq i32 %a, 0
+ br i1 %cond, label %if.true, label %if.false
+
+if.false:
+ %read = call i32 @read_memory_only()
+ %0 = load i32, ptr %q
+ br label %if.end
+
+if.true:
+ %1 = load i32, ptr %b
+ store i32 %1, ptr %p
+ br label %if.end
+
+if.end:
+ %phi = phi i32 [%read, %if.false], [0, %if.true]
+ ret i32 %phi
+}
+
+; i8 is not supported by conditional faulting
+define void @not_supported_type(i8 %a, ptr %b, ptr %p, ptr %q) {
+; CHECK-LABEL: @not_supported_type(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[COND:%.*]] = icmp eq i8 [[A:%.*]], 0
+; CHECK-NEXT: br i1 [[COND]], label [[IF_TRUE:%.*]], label [[IF_FALSE:%.*]]
+; CHECK: if.false:
+; CHECK-NEXT: store i8 1, ptr [[Q:%.*]], align 1
+; CHECK-NEXT: br label [[IF_END:%.*]]
+; CHECK: if.true:
+; CHECK-NEXT: [[TMP0:%.*]] = load i8, ptr [[B:%.*]], align 1
+; CHECK-NEXT: store i8 [[TMP0]], ptr [[P:%.*]], align 1
+; CHECK-NEXT: br label [[IF_END]]
+; CHECK: if.end:
+; CHECK-NEXT: ret void
+;
+entry:
+ %cond = icmp eq i8 %a, 0
+ br i1 %cond, label %if.true, label %if.false
+
+if.false:
+ store i8 1, ptr %q
+ br label %if.end
+
+if.true:
+ %0 = load i8, ptr %b
+ store i8 %0, ptr %p
+ br label %if.end
+
+if.end:
+ ret void
+}
+
+define void @not_br_terminator(i32 %a, ptr %b, ptr %p, ptr %q) {
+; CHECK-LABEL: @not_br_terminator(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: switch i32 [[A:%.*]], label [[IF_END:%.*]] [
+; CHECK-NEXT: i32 1, label [[IF_FALSE:%.*]]
+; CHECK-NEXT: i32 2, label [[IF_TRUE:%.*]]
+; CHECK-NEXT: ]
+; CHECK: if.false:
+; CHECK-NEXT: store i32 1, ptr [[Q:%.*]], align 4
+; CHECK-NEXT: br label [[IF_END]]
+; CHECK: if.true:
+; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[B:%.*]], align 4
+; CHECK-NEXT: store i32 [[TMP0]], ptr [[P:%.*]], align 4
+; CHECK-NEXT: br label [[IF_FALSE]]
+; CHECK: if.end:
+; CHECK-NEXT: ret void
+;
+entry:
+ switch i32 %a, label %if.end [
+ i32 1, label %if.false
+ i32 2, label %if.true
+ ]
+
+if.false:
+ store i32 1, ptr %q, align 4
+ br label %if.end
+
+if.true:
+ %0 = load i32, ptr %b, align 4
+ store i32 %0, ptr %p, align 4
+ br label %if.false
+
+if.end:
+ ret void
+}
+
+define void @not_simple(i32 %a, ptr %b, ptr %p, ptr %q) {
+; CHECK-LABEL: @not_simple(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[COND:%.*]] = icmp eq i32 [[A:%.*]], 0
+; CHECK-NEXT: br i1 [[COND]], label [[IF_TRUE:%.*]], label [[IF_FALSE:%.*]]
+; CHECK: if.false:
+; CHECK-NEXT: store atomic i32 1, ptr [[Q:%.*]] seq_cst, align 4
+; CHECK-NEXT: br label [[IF_END:%.*]]
+; CHECK: if.true:
+; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[B:%.*]], align 4
+; CHECK-NEXT: store i32 [[TMP0]], ptr [[P:%.*]], align 4
+; CHECK-NEXT: br label [[IF_END]]
+; CHECK: if.end:
+; CHECK-NEXT: ret void
+;
+entry:
+ %cond = icmp eq i32 %a, 0
+ br i1 %cond, label %if.true, label %if.false
+
+if.false:
+ store atomic i32 1, ptr %q seq_cst, align 4
+ br label %if.end
+
+if.true:
+ %0 = load i32, ptr %b, align 4
+ store i32 %0, ptr %p, align 4
+ br label %if.end
+
+if.end:
+ ret void
+}
+
+define void @not_hoistable_store(i32 %a, ptr %b, ptr %p, ptr %q) {
+; CHECK-LABEL: @not_hoistable_store(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[COND:%.*]] = icmp eq i32 [[A:%.*]], 0
+; CHECK-NEXT: br i1 [[COND]], label [[IF_TRUE:%.*]], label [[IF_FALSE:%.*]]
+; CHECK: if.false:
+; CHECK-NEXT: store i32 1, ptr [[Q:%.*]], align 4
+; CHECK-NEXT: br label [[IF_END:%.*]]
+; CHECK: if.true:
+; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[B:%.*]], align 4
+; CHECK-NEXT: store volatile i32 [[TMP0]], ptr [[P:%.*]], align 4
+; CHECK-NEXT: br label [[IF_END]]
+; CHECK: if.end:
+; CHECK-NEXT: ret void
+;
+entry:
+ %cond = icmp eq i32 %a, 0
+ br i1 %cond, label %if.true, label %if.false
+
+if.false:
+ store i32 1, ptr %q, align 4
+ br label %if.end
+
+if.true:
+ %0 = load i32, ptr %b, align 4
+ store volatile i32 %0, ptr %p, align 4
+ br label %if.end
+
+if.end:
+ ret void
+}
+
+define void @not_hoistable_sideeffect(i32 %a, ptr %b, ptr %p, ptr %q) {
+; CHECK-LABEL: @not_hoistable_sideeffect(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[COND:%.*]] = icmp eq i32 [[A:%.*]], 0
+; CHECK-NEXT: br i1 [[COND]], label [[IF_TRUE:%.*]], label [[IF_FALSE:%.*]]
+; CHECK: if.false:
+; CHECK-NEXT: store i32 1, ptr [[Q:%.*]], align 4
+; CHECK-NEXT: br label [[IF_END:%.*]]
+; CHECK: if.true:
+; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[B:%.*]], align 4
+; CHECK-NEXT: [[RMW:%.*]] = atomicrmw xchg ptr [[Q]], double 4.000000e+00 seq_cst, align 8
+; CHECK-NEXT: store i32 [[TMP0]], ptr [[P:%.*]], align 4
+; CHECK-NEXT: br label [[IF_END]]
+; CHECK: if.end:
+; CHECK-NEXT: ret void
+;
+entry:
+ %cond = icmp eq i32 %a, 0
+ br i1 %cond, label %if.true, label %if.false
+
+if.false:
+ store i32 1, ptr %q, align 4
+ br label %if.end
+
+if.true:
+ %0 = load i32, ptr %b, align 4
+ %rmw= atomicrmw xchg ptr %q, double 4.0 seq_cst
+ store i32 %0, ptr %p, align 4
+ br label %if.end
+
+if.end:
+ ret void
+}
+
+define void @not_ops_dominate_br(i32 %a, ptr %b, ptr %p, ptr %q) {
+; CHECK-LABEL: @not_ops_dominate_br(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[COND:%.*]] = icmp eq i32 [[A:%.*]], 0
+; CHECK-NEXT: br i1 [[COND]], label [[IF_TRUE:%.*]], label [[IF_FALSE:%.*]]
+; CHECK: if.false:
+; CHECK-NEXT: [[ADD:%.*]] = add i32 [[A]], 2
+; CHECK-NEXT: store i32 [[ADD]], ptr [[Q:%.*]], align 4
+; CHECK-NEXT: br label [[IF_END:%.*]]
+; CHECK: if.true:
+; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[B:%.*]], align 4
+; CHECK-NEXT: store i32 [[TMP0]], ptr [[P:%.*]], align 4
+; CHECK-NEXT: br label [[IF_END]]
+; CHECK: if.end:
+; CHECK-NEXT: ret void
+;
+entry:
+ %cond = icmp eq i32 %a, 0
+ br i1 %cond, label %if.true, label %if.false
+
+if.false:
+ %add = add i32 %a, 2
+ store i32 %add, ptr %q, align 4
+ br label %if.end
+
+if.true:
+ %1 = load i32, ptr %b, align 4
+ store i32 %1, ptr %p, align 4
+ br label %if.end
+
+if.end:
+ ret void
+}
+
+define void @not_store_skip_memory_read(i32 %a, ptr %b, ptr %p, ptr %q) {
+; CHECK-LABEL: @not_store_skip_memory_read(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[COND:%.*]] = icmp eq i32 [[A:%.*]], 0
+; CHECK-NEXT: br i1 [[COND]], label [[IF_TRUE:%.*]], label [[IF_FALSE:%.*]]
+; CHECK: if.false:
+; CHECK-NEXT: call void @read_memory_only()
+; CHECK-NEXT: store i32 1, ptr [[Q:%.*]], align 4
+; CHECK-NEXT: br label [[IF_END:%.*]]
+; CHECK: if.true:
+; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[B:%.*]], align 4
+; CHECK-NEXT: store i32 [[TMP0]], ptr [[P:%.*]], align 4
+; CHECK-NEXT: br label [[IF_END]]
+; CHECK: if.end:
+; CHECK-NEXT: ret void
+;
+entry:
+ %cond = icmp eq i32 %a, 0
+ br i1 %cond, label %if.true, label %if.false
+
+if.false:
+ call void @read_memory_only()
+ store i32 1, ptr %q, align 4
+ br label %if.end
+
+if.true:
+ %1 = load i32, ptr %b, align 4
+ store i32 %1, ptr %p, align 4
+ br label %if.end
+
+if.end:
+ ret void
+}
+
+declare i32 @read_memory_only() readonly nounwind willreturn speculatable
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!2, !3, !4, !5, !6, !7}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C11, file: !1, producer: "clang", isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly, splitDebugInlining: false, nameTableKind: None)
+!1 = !DIFile(filename: "test.c", directory: "/tmp")
+!2 = !{i32 7, !"Dwarf Version", i32 5}
+!3 = !{i32 2, !"Debug Info Version", i32 3}
+!4 = !{i32 1, !"wchar_size", i32 4}
+!5 = !{i32 8, !"PIC Level", i32 2}
+!6 = !{i32 7, !"PIE Level", i32 2}
+!7 = !{i32 7, !"uwtable", i32 2}
+!8 = !DILocation(line: 1, column: 2, scope: !10)
+!9 = !DILocation(line: 1, column: 3, scope: !10)
+!10 = distinct !DISubprogram(name: "basic", scope: !1, file: !1, line: 1, type: !11, scopeLine: 1, flags: DIFlagPrototyped, unit: !0)
+!11 = !DISubroutineType(types: !12)
+!12 = !{}
>From 51703ad3e37936496a6acd73b0e6e3e84ecbe059 Mon Sep 17 00:00:00 2001
From: Shengchen Kan <shengchen.kan at intel.com>
Date: Fri, 28 Jun 2024 14:59:29 +0800
Subject: [PATCH 2/4] address review comments: add 2 tests
---
.../X86/hoist-load-store-with-cf.ll | 74 +++++++++++++++++++
1 file changed, 74 insertions(+)
diff --git a/llvm/test/Transforms/SimplifyCFG/X86/hoist-load-store-with-cf.ll b/llvm/test/Transforms/SimplifyCFG/X86/hoist-load-store-with-cf.ll
index 2fd0055cf05f9..c489feca631d8 100644
--- a/llvm/test/Transforms/SimplifyCFG/X86/hoist-load-store-with-cf.ll
+++ b/llvm/test/Transforms/SimplifyCFG/X86/hoist-load-store-with-cf.ll
@@ -205,6 +205,80 @@ if.end:
ret i32 %phi
}
+define i32 @expensive_to_hoist(i32 %a, ptr %b, ptr %p, ptr %q, i32 %v0, i32 %v1, i32 %v2, i1 %cc) {
+; CHECK-LABEL: @expensive_to_hoist(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[COND:%.*]] = icmp eq i32 [[A:%.*]], 0
+; CHECK-NEXT: [[TMP0:%.*]] = bitcast i1 [[COND]] to <1 x i1>
+; CHECK-NEXT: [[TMP1:%.*]] = call <1 x i32> @llvm.masked.load.v1i32.p0(ptr [[B:%.*]], i32 4, <1 x i1> [[TMP0]], <1 x i32> poison), !dbg [[DBG8]], !range [[RNG13:![0-9]+]]
+; CHECK-NEXT: [[TMP2:%.*]] = bitcast <1 x i32> [[TMP1]] to i32
+; CHECK-NEXT: [[TMP3:%.*]] = bitcast i32 [[TMP2]] to <1 x i32>
+; CHECK-NEXT: call void @llvm.masked.store.v1i32.p0(<1 x i32> [[TMP3]], ptr [[P:%.*]], i32 4, <1 x i1> [[TMP0]])
+; CHECK-NEXT: [[TMP4:%.*]] = xor i1 [[COND]], true
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast i1 [[TMP4]] to <1 x i1>
+; CHECK-NEXT: call void @llvm.masked.store.v1i64.p0(<1 x i64> <i64 1>, ptr [[P]], i32 8, <1 x i1> [[TMP5]]), !dbg [[DBG12]]
+; CHECK-NEXT: call void @llvm.masked.store.v1i16.p0(<1 x i16> <i16 2>, ptr [[Q:%.*]], i32 8, <1 x i1> [[TMP5]]), !dbg [[DBG12]]
+; CHECK-NEXT: br i1 [[COND]], label [[COMMON_RET:%.*]], label [[IF_FALSE:%.*]]
+; CHECK: common.ret:
+; CHECK-NEXT: [[COMMON_RET_OP:%.*]] = phi i32 [ [[VVVV:%.*]], [[IF_FALSE]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT: ret i32 [[COMMON_RET_OP]]
+; CHECK: if.false:
+; CHECK-NEXT: [[V:%.*]] = udiv i32 [[A]], 12345
+; CHECK-NEXT: [[VV:%.*]] = mul i32 [[V]], [[V0:%.*]]
+; CHECK-NEXT: [[VVV:%.*]] = mul i32 [[VV]], [[V1:%.*]]
+; CHECK-NEXT: [[VVVV]] = select i1 [[CC:%.*]], i32 [[V2:%.*]], i32 [[VVV]]
+; CHECK-NEXT: br label [[COMMON_RET]]
+;
+entry:
+ %cond = icmp eq i32 %a, 0
+ br i1 %cond, label %if.true, label %if.false
+
+if.false:
+ store i64 1, ptr %p, align 8, !dbg !8
+ store i16 2, ptr %q, align 8, !dbg !8
+
+ %v = udiv i32 %a, 12345
+ %vv = mul i32 %v, %v0
+ %vvv = mul i32 %vv, %v1
+ %vvvv = select i1 %cc, i32 %v2, i32 %vvv
+ ret i32 %vvvv
+
+if.true:
+ %0 = load i32, ptr %b, align 4, !dbg !9, !range ! { i32 20, i32 30}
+ store i32 %0, ptr %p, align 4
+ br label %if.end
+
+if.end:
+ ret i32 0
+}
+
+define i32 @load_from_gep(ptr %p) {
+; CHECK-LABEL: @load_from_gep(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[COND:%.*]] = icmp eq ptr [[P:%.*]], null
+; CHECK-NEXT: br i1 [[COND]], label [[IF_TRUE:%.*]], label [[IF_FALSE:%.*]]
+; CHECK: if.false:
+; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 16
+; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; CHECK-NEXT: br label [[IF_TRUE]]
+; CHECK: if.true:
+; CHECK-NEXT: [[RES:%.*]] = phi i32 [ [[TMP0]], [[IF_FALSE]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT: ret i32 [[RES]]
+;
+entry:
+ %cond = icmp eq ptr %p, null
+ br i1 %cond, label %if.true, label %if.false
+
+if.false:
+ %arrayidx = getelementptr inbounds i8, ptr %p, i64 16
+ %0 = load i32, ptr %arrayidx
+ br label %if.true
+
+if.true:
+ %res = phi i32 [ %0, %if.false ], [ 0, %entry ]
+ ret i32 %res
+}
+
; i8 is not supported by conditional faulting
define void @not_supported_type(i8 %a, ptr %b, ptr %p, ptr %q) {
; CHECK-LABEL: @not_supported_type(
>From e1489004f8973982c6c07d28060712436f8c7cb5 Mon Sep 17 00:00:00 2001
From: Shengchen Kan <shengchen.kan at intel.com>
Date: Fri, 28 Jun 2024 15:20:17 +0800
Subject: [PATCH 3/4] address review comment:nfc
---
llvm/lib/Transforms/Utils/SimplifyCFG.cpp | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
index 558fafd5a2652..40f057c61021b 100644
--- a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
+++ b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
@@ -3085,7 +3085,7 @@ bool SimplifyCFGOpt::hoistLoadStoreWithCondFaultingFromSuccessors(
bool IsSimple = (LI && LI->isSimple()) || (SI && SI->isSimple());
if (!IsSimple || !OpsDominatesBranch(I))
return false;
- auto *Type = LI ? I.getType() : I.getOperand(0)->getType();
+ auto *Type = getLoadStoreType(&I);
// a load from alloca is always safe.
if (!IsLoadFromAlloca(I) && !TTI.hasConditionalLoadStoreForType(Type))
return false;
>From dbd8a6780b343f7c0fa28a33926359b54ad10028 Mon Sep 17 00:00:00 2001
From: Shengchen Kan <shengchen.kan at intel.com>
Date: Fri, 28 Jun 2024 15:32:42 +0800
Subject: [PATCH 4/4] add test
---
.../X86/hoist-load-store-with-cf.ll | 25 +++++++++++++++++++
1 file changed, 25 insertions(+)
diff --git a/llvm/test/Transforms/SimplifyCFG/X86/hoist-load-store-with-cf.ll b/llvm/test/Transforms/SimplifyCFG/X86/hoist-load-store-with-cf.ll
index c489feca631d8..4fc3b4f07a911 100644
--- a/llvm/test/Transforms/SimplifyCFG/X86/hoist-load-store-with-cf.ll
+++ b/llvm/test/Transforms/SimplifyCFG/X86/hoist-load-store-with-cf.ll
@@ -105,6 +105,31 @@ if.then:
br label %if.end
}
+define i32 @succ1to0_phi(ptr %p) {
+; CHECK-LABEL: @succ1to0_phi(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[COND:%.*]] = icmp eq ptr [[P:%.*]], null
+; CHECK-NEXT: br i1 [[COND]], label [[IF_TRUE:%.*]], label [[IF_FALSE:%.*]]
+; CHECK: if.false:
+; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[P]], align 4
+; CHECK-NEXT: br label [[IF_TRUE]]
+; CHECK: if.true:
+; CHECK-NEXT: [[RES:%.*]] = phi i32 [ [[TMP0]], [[IF_FALSE]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT: ret i32 [[RES]]
+;
+entry:
+ %cond = icmp eq ptr %p, null
+ br i1 %cond, label %if.true, label %if.false
+
+if.false:
+ %0 = load i32, ptr %p
+ br label %if.true
+
+if.true:
+ %res = phi i32 [ %0, %if.false ], [ 0, %entry ]
+ ret i32 %res
+}
+
;; successor 0 branches to successor 1.
define void @succ0to1(i32 %a, ptr %b, ptr %p, ptr %q) {
; CHECK-LABEL: @succ0to1(
More information about the llvm-commits
mailing list