[llvm] [X86] Support hoisting load/store with conditional faulting (PR #95515)
Shengchen Kan via llvm-commits
llvm-commits at lists.llvm.org
Fri Jun 14 02:17:19 PDT 2024
https://github.com/KanRobert updated https://github.com/llvm/llvm-project/pull/95515
>From e598cb1cf295e3ef843c94a4f4d15e073d51478a Mon Sep 17 00:00:00 2001
From: Shengchen Kan <shengchen.kan at intel.com>
Date: Wed, 5 Jun 2024 15:04:27 +0800
Subject: [PATCH] [X86] Support hoisting load/store with conditional faulting
1. Add TTI interface for conditional load/store
2. Hoist load/store from successors if the targets support conditional
faulting
3. Lower masked load/store to CFCMOV
---
.../llvm/Analysis/TargetTransformInfo.h | 8 ++
.../llvm/Analysis/TargetTransformInfoImpl.h | 1 +
llvm/lib/Analysis/TargetTransformInfo.cpp | 5 +
.../lib/Target/X86/X86TargetTransformInfo.cpp | 18 ++++
llvm/lib/Target/X86/X86TargetTransformInfo.h | 1 +
llvm/lib/Transforms/Utils/SimplifyCFG.cpp | 92 ++++++++++++++++++-
llvm/test/CodeGen/X86/apx/cf.ll | 51 ++++++++++
.../X86/hoist-load-store-with-cf.ll | 34 +++++++
8 files changed, 207 insertions(+), 3 deletions(-)
create mode 100644 llvm/test/CodeGen/X86/apx/cf.ll
create mode 100644 llvm/test/Transforms/SimplifyCFG/X86/hoist-load-store-with-cf.ll
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h
index f55f21c94a85a..37afda39a1c9c 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfo.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h
@@ -1113,6 +1113,10 @@ class TargetTransformInfo {
/// \return the number of registers in the target-provided register class.
unsigned getNumberOfRegisters(unsigned ClassID) const;
+ /// \return true if the target supports load/store that enables fault
+ /// suppression of memory operands when the source condition is false.
+ bool hasConditionalFaultingLoadStoreForType(Type *Ty) const;
+
/// \return the target-provided register class ID for the provided type,
/// accounting for type promotion and other type-legalization techniques that
/// the target might apply. However, it specifically does not account for the
@@ -1956,6 +1960,7 @@ class TargetTransformInfo::Concept {
virtual bool preferToKeepConstantsAttached(const Instruction &Inst,
const Function &Fn) const = 0;
virtual unsigned getNumberOfRegisters(unsigned ClassID) const = 0;
+ virtual bool hasConditionalFaultingLoadStoreForType(Type *Ty) const = 0;
virtual unsigned getRegisterClassForType(bool Vector,
Type *Ty = nullptr) const = 0;
virtual const char *getRegisterClassName(unsigned ClassID) const = 0;
@@ -2543,6 +2548,9 @@ class TargetTransformInfo::Model final : public TargetTransformInfo::Concept {
unsigned getNumberOfRegisters(unsigned ClassID) const override {
return Impl.getNumberOfRegisters(ClassID);
}
+ bool hasConditionalFaultingLoadStoreForType(Type *Ty) const override {
+ return Impl.hasConditionalFaultingLoadStoreForType(Ty);
+ }
unsigned getRegisterClassForType(bool Vector,
Type *Ty = nullptr) const override {
return Impl.getRegisterClassForType(Vector, Ty);
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
index 7828bdc1f1f43..a4aa836ed82d3 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -457,6 +457,7 @@ class TargetTransformInfoImplBase {
}
unsigned getNumberOfRegisters(unsigned ClassID) const { return 8; }
+ bool hasConditionalFaultingLoadStoreForType(Type *Ty) const { return false; }
unsigned getRegisterClassForType(bool Vector, Type *Ty = nullptr) const {
return Vector ? 1 : 0;
diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp
index 7e721cbc87f3f..46936f266bf46 100644
--- a/llvm/lib/Analysis/TargetTransformInfo.cpp
+++ b/llvm/lib/Analysis/TargetTransformInfo.cpp
@@ -722,6 +722,11 @@ unsigned TargetTransformInfo::getNumberOfRegisters(unsigned ClassID) const {
return TTIImpl->getNumberOfRegisters(ClassID);
}
+bool TargetTransformInfo::hasConditionalFaultingLoadStoreForType(
+ Type *Ty) const {
+ return TTIImpl->hasConditionalFaultingLoadStoreForType(Ty);
+}
+
unsigned TargetTransformInfo::getRegisterClassForType(bool Vector,
Type *Ty) const {
return TTIImpl->getRegisterClassForType(Vector, Ty);
diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
index 74948778ccf85..25597f836eb4d 100644
--- a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
+++ b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
@@ -176,6 +176,24 @@ unsigned X86TTIImpl::getNumberOfRegisters(unsigned ClassID) const {
return 8;
}
+bool X86TTIImpl::hasConditionalFaultingLoadStoreForType(Type *Ty) const {
+ // Conditional faulting is supported by CFCMOV, which only accepts
+ // 8/16/32/64-bit operands.
+ // NOTE: Though VMOVSS/VMOVSD suppress memory faults with a zero mask, they
+ // incur a performance penalty.
+ if (!ST->hasCF() || !Ty || !Ty->isIntegerTy())
+ return false;
+ switch (cast<IntegerType>(Ty)->getBitWidth()) {
+ default:
+ return false;
+ case 8:
+ case 16:
+ case 32:
+ case 64:
+ return true;
+ }
+}
+
TypeSize
X86TTIImpl::getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const {
unsigned PreferVectorWidth = ST->getPreferVectorWidth();
diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.h b/llvm/lib/Target/X86/X86TargetTransformInfo.h
index e14dc9fc09051..701648c6a2b3a 100644
--- a/llvm/lib/Target/X86/X86TargetTransformInfo.h
+++ b/llvm/lib/Target/X86/X86TargetTransformInfo.h
@@ -132,6 +132,7 @@ class X86TTIImpl : public BasicTTIImplBase<X86TTIImpl> {
/// @{
unsigned getNumberOfRegisters(unsigned ClassID) const;
+ bool hasConditionalFaultingLoadStoreForType(Type *Ty) const;
TypeSize getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const;
unsigned getLoadStoreVecRegBitWidth(unsigned AS) const;
unsigned getMaxInterleaveFactor(ElementCount VF);
diff --git a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
index 107c8bb6c027f..7be6ba1ecda7c 100644
--- a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
+++ b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
@@ -131,6 +131,12 @@ static cl::opt<bool> HoistCondStores(
"simplifycfg-hoist-cond-stores", cl::Hidden, cl::init(true),
cl::desc("Hoist conditional stores if an unconditional store precedes"));
+static cl::opt<bool> HoistLoadsStoresWithCondFaulting(
+ "simplifycfg-hoist-loads-stores-with-cond-faulting", cl::Hidden,
+ cl::init(true),
+ cl::desc("Hoist loads/stores if the target supports "
+ "conditional faulting"));
+
static cl::opt<bool> MergeCondStores(
"simplifycfg-merge-cond-stores", cl::Hidden, cl::init(true),
cl::desc("Hoist conditional stores even if an unconditional store does not "
@@ -275,6 +281,7 @@ class SimplifyCFGOpt {
bool hoistSuccIdenticalTerminatorToSwitchOrIf(
Instruction *TI, Instruction *I1,
SmallVectorImpl<Instruction *> &OtherSuccTIs);
+ bool hoistLoadStoreWithCondFaultingFromSuccessors(BasicBlock *BB);
bool SpeculativelyExecuteBB(BranchInst *BI, BasicBlock *ThenBB);
bool SimplifyTerminatorOnSelect(Instruction *OldTerm, Value *Cond,
BasicBlock *TrueBB, BasicBlock *FalseBB,
@@ -2977,6 +2984,79 @@ static bool validateAndCostRequiredSelects(BasicBlock *BB, BasicBlock *ThenBB,
return HaveRewritablePHIs;
}
+/// Hoist load/store instructions from the conditional successor blocks up into
+/// the branching block.
+///
+/// We are looking for code like the following:
+/// \code
+/// BB:
+/// ...
+/// %cond = icmp ult %x, %y
+/// br i1 %cond, label %TrueBB, label %FalseBB
+/// FalseBB:
+/// store i32 1, ptr %q, align 4
+/// ...
+/// TrueBB:
+/// %0 = load i32, ptr %b, align 4
+/// store i32 %0, ptr %p, align 4
+/// ...
+/// \endcode
+///
+/// We are going to transform this into:
+///
+/// \code
+/// BB:
+/// ...
+/// %cond = icmp ult %x, %y
+/// %0 = cload i32, ptr %b, %cond
+/// cstore i32 %0, ptr %p, %cond
+/// cstore i32 1, ptr %q, ~%cond
+/// br i1 %cond, label %TrueBB, label %FalseBB
+/// FalseBB:
+/// ...
+/// TrueBB:
+/// ...
+/// \endcode
+///
+/// where cload/cstore is represented by llvm.masked.load/store, e.g.
+///
+/// \code
+/// %vcond = insertelement <1 x i1> undef, i1 %cond, i32 0
+/// %v0 = call <1 x i32> @llvm.masked.load.v1i32.p0
+/// (ptr %b, i32 4, <1 x i1> %vcond, <1 x i32> undef)
+/// %0 = bitcast <1 x i32> %v0 to i32
+/// call void @llvm.masked.store.v1i32.p0(<1 x i32> %v0, ptr %p, i32 4, <1 x i1> %vcond)
+/// %cond.not = xor i1 %cond, true
+/// %vcond.not = insertelement <1 x i1> undef, i1 %cond.not, i32 0
+/// call void @llvm.masked.store.v1i32.p0
+/// (<1 x i32> <i32 1>, ptr %q, i32 4, <1 x i1> %vcond.not)
+/// \endcode
+///
+/// \returns true if any load/store is hoisted.
+///
+/// Note that this transform should be run
+/// * before SpeculativelyExecuteBB so that the latter has more opportunities.
+/// * after hoistCommonCodeFromSuccessors to ensure unconditional loads/stores
+/// are handled first.
+bool SimplifyCFGOpt::hoistLoadStoreWithCondFaultingFromSuccessors(
+ BasicBlock *BB) {
+ auto *BI = dyn_cast<BranchInst>(BB->getTerminator());
+ if (!BI || !BI->isConditional())
+ return false;
+
+ BasicBlock *IfTrueBB = BI->getSuccessor(0);
+ BasicBlock *IfFalseBB = BI->getSuccessor(1);
+
+ // If either of the blocks has its address taken, then we can't do this fold,
+ // because the code we'd hoist would no longer run when we jump into the block
+ // by its address.
+ for (auto *Succ : {IfTrueBB, IfFalseBB})
+ if (Succ->hasAddressTaken() || !Succ->getSinglePredecessor())
+ return false;
+
+ return false;
+}
+
/// Speculate a conditional basic block flattening the CFG.
///
/// Note that this is a very risky transform currently. Speculating
@@ -7436,14 +7516,20 @@ bool SimplifyCFGOpt::simplifyCondBranch(BranchInst *BI, IRBuilder<> &Builder) {
return requestResimplify();
// We have a conditional branch to two blocks that are only reachable
- // from BI. We know that the condbr dominates the two blocks, so see if
- // there is any identical code in the "then" and "else" blocks. If so, we
- // can hoist it up to the branching block.
+ // from BI. We know that the condbr dominates the two blocks, so see
+ //
+ // * if there is any identical code in the "then" and "else" blocks.
+ // * if there is any different load/store in the "then" and "else" blocks.
+ //
+ // If so, we can hoist it up to the branching block.
if (BI->getSuccessor(0)->getSinglePredecessor()) {
if (BI->getSuccessor(1)->getSinglePredecessor()) {
if (HoistCommon && hoistCommonCodeFromSuccessors(
BI->getParent(), !Options.HoistCommonInsts))
return requestResimplify();
+ if (HoistLoadsStoresWithCondFaulting &&
+ hoistLoadStoreWithCondFaultingFromSuccessors(BI->getParent()))
+ return requestResimplify();
} else {
// If Successor #1 has multiple preds, we may be able to conditionally
// execute Successor #0 if it branches to Successor #1.
diff --git a/llvm/test/CodeGen/X86/apx/cf.ll b/llvm/test/CodeGen/X86/apx/cf.ll
new file mode 100644
index 0000000000000..be89625d2c631
--- /dev/null
+++ b/llvm/test/CodeGen/X86/apx/cf.ll
@@ -0,0 +1,51 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+cf -verify-machineinstrs | FileCheck %s
+
+define void @test (i32 %a, ptr %b, ptr %p, ptr %q) {
+; CHECK-LABEL: test:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: testl %edi, %edi
+; CHECK-NEXT: sete %al
+; CHECK-NEXT: negb %al
+; CHECK-NEXT: testl %edi, %edi
+; CHECK-NEXT: # implicit-def: $r8d
+; CHECK-NEXT: je .LBB0_1
+; CHECK-NEXT: # %bb.2: # %else
+; CHECK-NEXT: testb $1, %al
+; CHECK-NEXT: jne .LBB0_3
+; CHECK-NEXT: .LBB0_4: # %else1
+; CHECK-NEXT: testl %edi, %edi
+; CHECK-NEXT: je .LBB0_5
+; CHECK-NEXT: .LBB0_6: # %else3
+; CHECK-NEXT: retq
+; CHECK-NEXT: .LBB0_1: # %cond.load
+; CHECK-NEXT: movl (%rsi), %r8d
+; CHECK-NEXT: testb $1, %al
+; CHECK-NEXT: je .LBB0_4
+; CHECK-NEXT: .LBB0_3: # %cond.store
+; CHECK-NEXT: movl %r8d, (%rdx)
+; CHECK-NEXT: testl %edi, %edi
+; CHECK-NEXT: jne .LBB0_6
+; CHECK-NEXT: .LBB0_5: # %cond.store2
+; CHECK-NEXT: movl $1, (%rcx)
+; CHECK-NEXT: retq
+entry:
+ %cond = icmp eq i32 %a, 0
+ %vcond = insertelement <1 x i1> undef, i1 %cond, i32 0
+ %v0 = call <1 x i32> @llvm.masked.load.v1i32.p0 (ptr %b, i32 4, <1 x i1> %vcond, <1 x i32> undef)
+ %0 = bitcast <1 x i32> %v0 to i32
+ call void @llvm.masked.store.v1i32.p0(<1 x i32> %v0, ptr %p, i32 4, <1 x i1> %vcond)
+ %cond.not = xor i1 %cond, true
+ %vcond.not = insertelement <1 x i1> undef, i1 %cond, i32 0
+ call void @llvm.masked.store.v1i32.p0(<1 x i32> <i32 1>, ptr %q, i32 4, <1 x i1> %vcond.not)
+ br i1 %cond, label %if.true, label %if.false
+
+if.false: ; preds = %entry
+ br label %if.end
+
+if.true: ; preds = %entry
+ br label %if.end
+
+if.end: ; preds = %if.true, %if.false
+ ret void
+}
diff --git a/llvm/test/Transforms/SimplifyCFG/X86/hoist-load-store-with-cf.ll b/llvm/test/Transforms/SimplifyCFG/X86/hoist-load-store-with-cf.ll
new file mode 100644
index 0000000000000..302ea7a4ccd6b
--- /dev/null
+++ b/llvm/test/Transforms/SimplifyCFG/X86/hoist-load-store-with-cf.ll
@@ -0,0 +1,34 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -mtriple=x86_64 -mattr=+cf -passes=simplifycfg -simplifycfg-require-and-preserve-domtree=1 -S -simplifycfg-hoist-loads-stores-with-cond-faulting=true | FileCheck %s
+
+define void @test(i32 %a, ptr %b, ptr %p, ptr %q) {
+; CHECK-LABEL: @test(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[COND:%.*]] = icmp eq i32 [[A:%.*]], 0
+; CHECK-NEXT: br i1 [[COND]], label [[IF_TRUE:%.*]], label [[IF_FALSE:%.*]]
+; CHECK: if.false:
+; CHECK-NEXT: store i32 1, ptr [[Q:%.*]], align 4
+; CHECK-NEXT: br label [[IF_END:%.*]]
+; CHECK: if.true:
+; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[B:%.*]], align 4
+; CHECK-NEXT: store i32 [[TMP0]], ptr [[P:%.*]], align 4
+; CHECK-NEXT: br label [[IF_END]]
+; CHECK: if.end:
+; CHECK-NEXT: ret void
+;
+entry:
+ %cond = icmp eq i32 %a, 0
+ br i1 %cond, label %if.true, label %if.false
+
+if.false: ; preds = %entry
+ store i32 1, ptr %q, align 4
+ br label %if.end
+
+if.true: ; preds = %entry
+ %0 = load i32, ptr %b, align 4
+ store i32 %0, ptr %p, align 4
+ br label %if.end
+
+if.end: ; preds = %if.true, %if.false
+ ret void
+}
More information about the llvm-commits
mailing list