[llvm] [X86][SimplifyCFG][CodeGen] Support hoisting load/store with conditional faulting (PR #95515)
Shengchen Kan via llvm-commits
llvm-commits at lists.llvm.org
Mon Jun 24 08:41:13 PDT 2024
https://github.com/KanRobert updated https://github.com/llvm/llvm-project/pull/95515
>From 329f5685e9d49eae16ec6533fb3d53464b9320df Mon Sep 17 00:00:00 2001
From: Shengchen Kan <shengchen.kan at intel.com>
Date: Wed, 5 Jun 2024 15:04:27 +0800
Subject: [PATCH 01/12] [X86] Support hoisting load/store with conditional
faulting
1. Add TTI interface for conditional load/store.
2. Hoist load/store from successors with masked load/store if
the target supports conditional faulting.
3. Mark 1 x i16/i32/i64 masked load/store legal so that it is not
legalized by the scalarize-masked-mem-intrin pass.
4. Visit 1 x i16/i32/i64 masked load/store to build a target-specific
conditional load/store node, avoiding an error in
DAGTypeLegalizer::ScalarizeVectorResult.
5. Lower the conditional load/store to CFCMOV.
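For illustration, a minimal IR sketch of the end-to-end idea (the function
name is hypothetical; the masked-intrinsic form matches the new tests):

  ;; Before: the store executes only on one side of the branch.
  define void @sketch(i32 %a, i32 %v, ptr %p) {
  entry:
    %cond = icmp eq i32 %a, 0
    br i1 %cond, label %if.true, label %if.end
  if.true:
    store i32 %v, ptr %p, align 4
    br label %if.end
  if.end:
    ret void
  }

  ;; After SimplifyCFG hoisting: a 1-element masked store in the entry block,
  ;; which the X86 backend then lowers to CFCMOV under +cf:
  ;;   %m = bitcast i1 %cond to <1 x i1>
  ;;   %vv = bitcast i32 %v to <1 x i32>
  ;;   call void @llvm.masked.store.v1i32.p0(<1 x i32> %vv, ptr %p, i32 4, <1 x i1> %m)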
---
.../llvm/Analysis/TargetTransformInfo.h | 8 +
.../llvm/Analysis/TargetTransformInfoImpl.h | 1 +
llvm/include/llvm/CodeGen/TargetLowering.h | 12 +
llvm/lib/Analysis/TargetTransformInfo.cpp | 5 +
.../SelectionDAG/SelectionDAGBuilder.cpp | 32 +-
llvm/lib/Target/X86/X86ISelLowering.cpp | 50 +++
llvm/lib/Target/X86/X86ISelLowering.h | 16 +
llvm/lib/Target/X86/X86InstrCMovSetCC.td | 29 ++
llvm/lib/Target/X86/X86InstrFragments.td | 12 +
.../lib/Target/X86/X86TargetTransformInfo.cpp | 32 +-
llvm/lib/Target/X86/X86TargetTransformInfo.h | 1 +
llvm/lib/Transforms/Utils/SimplifyCFG.cpp | 190 +++++++++-
llvm/test/CodeGen/X86/apx/cf.ll | 90 +++++
.../X86/hoist-load-store-with-cf.ll | 357 ++++++++++++++++++
14 files changed, 822 insertions(+), 13 deletions(-)
create mode 100644 llvm/test/CodeGen/X86/apx/cf.ll
create mode 100644 llvm/test/Transforms/SimplifyCFG/X86/hoist-load-store-with-cf.ll
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h
index f55f21c94a85a..37afda39a1c9c 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfo.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h
@@ -1113,6 +1113,10 @@ class TargetTransformInfo {
/// \return the number of registers in the target-provided register class.
unsigned getNumberOfRegisters(unsigned ClassID) const;
+ /// \return true if the target supports load/store that enables fault
+ /// suppression of memory operands when the source condition is false.
+ bool hasConditionalFaultingLoadStoreForType(Type *Ty) const;
+
/// \return the target-provided register class ID for the provided type,
/// accounting for type promotion and other type-legalization techniques that
/// the target might apply. However, it specifically does not account for the
@@ -1956,6 +1960,7 @@ class TargetTransformInfo::Concept {
virtual bool preferToKeepConstantsAttached(const Instruction &Inst,
const Function &Fn) const = 0;
virtual unsigned getNumberOfRegisters(unsigned ClassID) const = 0;
+ virtual bool hasConditionalFaultingLoadStoreForType(Type *Ty) const = 0;
virtual unsigned getRegisterClassForType(bool Vector,
Type *Ty = nullptr) const = 0;
virtual const char *getRegisterClassName(unsigned ClassID) const = 0;
@@ -2543,6 +2548,9 @@ class TargetTransformInfo::Model final : public TargetTransformInfo::Concept {
unsigned getNumberOfRegisters(unsigned ClassID) const override {
return Impl.getNumberOfRegisters(ClassID);
}
+ bool hasConditionalFaultingLoadStoreForType(Type *Ty) const override {
+ return Impl.hasConditionalFaultingLoadStoreForType(Ty);
+ }
unsigned getRegisterClassForType(bool Vector,
Type *Ty = nullptr) const override {
return Impl.getRegisterClassForType(Vector, Ty);
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
index 7828bdc1f1f43..a4aa836ed82d3 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -457,6 +457,7 @@ class TargetTransformInfoImplBase {
}
unsigned getNumberOfRegisters(unsigned ClassID) const { return 8; }
+ bool hasConditionalFaultingLoadStoreForType(Type *Ty) const { return false; }
unsigned getRegisterClassForType(bool Vector, Type *Ty = nullptr) const {
return Vector ? 1 : 0;
diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h
index 06f7ee2a589c8..2b0a45133bb0e 100644
--- a/llvm/include/llvm/CodeGen/TargetLowering.h
+++ b/llvm/include/llvm/CodeGen/TargetLowering.h
@@ -3895,6 +3895,18 @@ class TargetLowering : public TargetLoweringBase {
const SDValue OldRHS, SDValue &Chain,
bool IsSignaling = false) const;
+ virtual SDValue visitMaskedLoadForCondFaulting(
+ SelectionDAG &DAG, const SDLoc &DL, SDValue Chain, MachineMemOperand *MMO,
+ SDValue &NewLoad, SDValue Ptr, SDValue PassThru, SDValue Mask) const {
+ llvm_unreachable("Not Implemented");
+ }
+
+ virtual SDValue visitMaskedStoreForCondFaulting(
+ SelectionDAG &DAG, const SDLoc &DL, SDValue Chain, MachineMemOperand *MMO,
+ SDValue Ptr, SDValue Val, SDValue Mask) const {
+ llvm_unreachable("Not Implemented");
+ }
+
/// Returns a pair of (return value, chain).
/// It is an error to pass RTLIB::UNKNOWN_LIBCALL as \p LC.
std::pair<SDValue, SDValue> makeLibCall(SelectionDAG &DAG, RTLIB::Libcall LC,
diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp
index 7e721cbc87f3f..46936f266bf46 100644
--- a/llvm/lib/Analysis/TargetTransformInfo.cpp
+++ b/llvm/lib/Analysis/TargetTransformInfo.cpp
@@ -722,6 +722,11 @@ unsigned TargetTransformInfo::getNumberOfRegisters(unsigned ClassID) const {
return TTIImpl->getNumberOfRegisters(ClassID);
}
+bool TargetTransformInfo::hasConditionalFaultingLoadStoreForType(
+ Type *Ty) const {
+ return TTIImpl->hasConditionalFaultingLoadStoreForType(Ty);
+}
+
unsigned TargetTransformInfo::getRegisterClassForType(bool Vector,
Type *Ty) const {
return TTIImpl->getRegisterClassForType(Vector, Ty);
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
index 296b06187ec0f..24dbe6efabbca 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@@ -4783,9 +4783,18 @@ void SelectionDAGBuilder::visitMaskedStore(const CallInst &I,
MachineMemOperand *MMO = DAG.getMachineFunction().getMachineMemOperand(
MachinePointerInfo(PtrOperand), MMOFlags,
LocationSize::beforeOrAfterPointer(), Alignment, I.getAAMetadata());
+
+ const auto &TLI = DAG.getTargetLoweringInfo();
+ const auto &TTI =
+ TLI.getTargetMachine().getTargetTransformInfo(*I.getFunction());
SDValue StoreNode =
- DAG.getMaskedStore(getMemoryRoot(), sdl, Src0, Ptr, Offset, Mask, VT, MMO,
- ISD::UNINDEXED, false /* Truncating */, IsCompressing);
+ (!IsCompressing && TTI.hasConditionalFaultingLoadStoreForType(
+ I.getArgOperand(0)->getType()->getScalarType()))
+ ? TLI.visitMaskedStoreForCondFaulting(DAG, sdl, getMemoryRoot(), MMO,
+ Ptr, Src0, Mask)
+ : DAG.getMaskedStore(getMemoryRoot(), sdl, Src0, Ptr, Offset, Mask,
+ VT, MMO, ISD::UNINDEXED, false /* Truncating */,
+ IsCompressing);
DAG.setRoot(StoreNode);
setValue(&I, StoreNode);
}
@@ -4958,12 +4967,23 @@ void SelectionDAGBuilder::visitMaskedLoad(const CallInst &I, bool IsExpanding) {
MachinePointerInfo(PtrOperand), MMOFlags,
LocationSize::beforeOrAfterPointer(), Alignment, AAInfo, Ranges);
- SDValue Load =
- DAG.getMaskedLoad(VT, sdl, InChain, Ptr, Offset, Mask, Src0, VT, MMO,
- ISD::UNINDEXED, ISD::NON_EXTLOAD, IsExpanding);
+ const auto &TLI = DAG.getTargetLoweringInfo();
+ const auto &TTI =
+ TLI.getTargetMachine().getTargetTransformInfo(*I.getFunction());
+ // Load and Res may refer to different values.
+ SDValue Load;
+ SDValue Res;
+ if (!IsExpanding && TTI.hasConditionalFaultingLoadStoreForType(
+ Src0Operand->getType()->getScalarType()))
+ Res = TLI.visitMaskedLoadForCondFaulting(DAG, sdl, InChain, MMO, Load, Ptr,
+ Src0, Mask);
+ else
+ Res = Load =
+ DAG.getMaskedLoad(VT, sdl, InChain, Ptr, Offset, Mask, Src0, VT, MMO,
+ ISD::UNINDEXED, ISD::NON_EXTLOAD, IsExpanding);
if (AddToChain)
PendingLoads.push_back(Load.getValue(1));
- setValue(&I, Load);
+ setValue(&I, Res);
}
void SelectionDAGBuilder::visitMaskedGather(const CallInst &I) {
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index f27c935812f51..2f89758c4783d 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -32308,6 +32308,54 @@ bool X86TargetLowering::isInlineAsmTargetBranch(
return Inst.equals_insensitive("call") || Inst.equals_insensitive("jmp");
}
+static SDValue getFlagsOfCmpZeroFori1(SelectionDAG &DAG, const SDLoc &DL,
+ SDValue V) {
+ assert(V.getValueType() == MVT::i1 && "assume i1 value");
+ EVT Ty = MVT::i8;
+ SDValue VE = DAG.getZExtOrTrunc(V, DL, Ty);
+ SDValue Zero = DAG.getConstant(0, DL, Ty);
+ SDVTList X86SubVTs = DAG.getVTList(Ty, MVT::i32);
+ SDValue CmpZero = DAG.getNode(X86ISD::SUB, DL, X86SubVTs, Zero, VE);
+ return SDValue(CmpZero.getNode(), 1);
+}
+
+SDValue X86TargetLowering::visitMaskedLoadForCondFaulting(
+ SelectionDAG &DAG, const SDLoc &DL, SDValue Chain, MachineMemOperand *MMO,
+ SDValue &NewLoad, SDValue Ptr, SDValue PassThru, SDValue Mask) const {
+ // @llvm.masked.load.*(ptr, alignment, mask, passthru)
+ // ->
+ // _, flags = SUB 0, mask
+ // res, chain = CLOAD inchain, ptr, (bit_cast_to_scalar passthru), cond, flags
+ // bit_cast_to_vector<res>
+ EVT VTy = PassThru.getValueType();
+ EVT Ty = VTy.getVectorElementType();
+ SDVTList Tys = DAG.getVTList(Ty, MVT::Other);
+ SDValue ScalarPassThru = DAG.getBitcast(Ty, PassThru);
+ SDValue ScalarMask = DAG.getBitcast(MVT::i1, Mask);
+ SDValue Flags = getFlagsOfCmpZeroFori1(DAG, DL, ScalarMask);
+ SDValue COND_NE = DAG.getTargetConstant(X86::COND_NE, DL, MVT::i8);
+ SDValue Ops[] = {Chain, Ptr, ScalarPassThru, COND_NE, Flags};
+ NewLoad = DAG.getMemIntrinsicNode(X86ISD::CLOAD, DL, Tys, Ops, Ty, MMO);
+ return DAG.getBitcast(VTy, NewLoad);
+}
+
+SDValue X86TargetLowering::visitMaskedStoreForCondFaulting(
+ SelectionDAG &DAG, const SDLoc &DL, SDValue Chain, MachineMemOperand *MMO,
+ SDValue Ptr, SDValue Val, SDValue Mask) const {
+ // llvm.masked.store.*(Src0, Ptr, alignment, Mask)
+ // ->
+ // _, flags = SUB 0, mask
+ // chain = CSTORE inchain, (bit_cast_to_scalar val), ptr, cond, flags
+ EVT Ty = Val.getValueType().getVectorElementType();
+ SDVTList Tys = DAG.getVTList(MVT::Other);
+ SDValue ScalarVal = DAG.getBitcast(Ty, Val);
+ SDValue ScalarMask = DAG.getBitcast(MVT::i1, Mask);
+ SDValue Flags = getFlagsOfCmpZeroFori1(DAG, DL, ScalarMask);
+ SDValue COND_NE = DAG.getTargetConstant(X86::COND_NE, DL, MVT::i8);
+ SDValue Ops[] = {Chain, ScalarVal, Ptr, COND_NE, Flags};
+ return DAG.getMemIntrinsicNode(X86ISD::CSTORE, DL, Tys, Ops, Ty, MMO);
+}
+
/// Provide custom lowering hooks for some operations.
SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
switch (Op.getOpcode()) {
@@ -34024,6 +34072,8 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
NODE_NAME_CASE(STRICT_FP80_ADD)
NODE_NAME_CASE(CCMP)
NODE_NAME_CASE(CTEST)
+ NODE_NAME_CASE(CLOAD)
+ NODE_NAME_CASE(CSTORE)
}
return nullptr;
#undef NODE_NAME_CASE
diff --git a/llvm/lib/Target/X86/X86ISelLowering.h b/llvm/lib/Target/X86/X86ISelLowering.h
index 3c5c903bc0d98..05ef982ef2023 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.h
+++ b/llvm/lib/Target/X86/X86ISelLowering.h
@@ -903,6 +903,10 @@ namespace llvm {
// is needed so that this can be expanded with control flow.
VASTART_SAVE_XMM_REGS,
+ // Conditional load/store instructions
+ CLOAD,
+ CSTORE,
+
// WARNING: Do not add anything in the end unless you want the node to
// have memop! In fact, starting from FIRST_TARGET_MEMORY_OPCODE all
// opcodes will be thought as target memory ops!
@@ -1556,6 +1560,18 @@ namespace llvm {
bool isInlineAsmTargetBranch(const SmallVectorImpl<StringRef> &AsmStrs,
unsigned OpNo) const override;
+ SDValue visitMaskedLoadForCondFaulting(SelectionDAG &DAG, const SDLoc &DL,
+ SDValue Chain,
+ MachineMemOperand *MMO,
+ SDValue &NewLoad, SDValue Ptr,
+ SDValue PassThru,
+ SDValue Mask) const override;
+ SDValue visitMaskedStoreForCondFaulting(SelectionDAG &DAG, const SDLoc &DL,
+ SDValue Chain,
+ MachineMemOperand *MMO, SDValue Ptr,
+ SDValue Val,
+ SDValue Mask) const override;
+
/// Lower interleaved load(s) into target specific
/// instructions/intrinsics.
bool lowerInterleavedLoad(LoadInst *LI,
diff --git a/llvm/lib/Target/X86/X86InstrCMovSetCC.td b/llvm/lib/Target/X86/X86InstrCMovSetCC.td
index e27aa4115990e..543057c58035a 100644
--- a/llvm/lib/Target/X86/X86InstrCMovSetCC.td
+++ b/llvm/lib/Target/X86/X86InstrCMovSetCC.td
@@ -113,6 +113,35 @@ let Predicates = [HasCMOV, HasCF] in {
(CFCMOV32rr GR32:$src1, (inv_cond_XFORM timm:$cond))>;
def : Pat<(X86cmov GR64:$src1, 0, timm:$cond, EFLAGS),
(CFCMOV64rr GR64:$src1, (inv_cond_XFORM timm:$cond))>;
+
+ def : Pat<(X86cload addr:$src1, 0, timm:$cond, EFLAGS),
+ (CFCMOV16rm addr:$src1, timm:$cond)>;
+ def : Pat<(X86cload addr:$src1, 0, timm:$cond, EFLAGS),
+ (CFCMOV32rm addr:$src1, timm:$cond)>;
+ def : Pat<(X86cload addr:$src1, 0, timm:$cond, EFLAGS),
+ (CFCMOV64rm addr:$src1, timm:$cond)>;
+
+ // FIXME: Shouldn't patterns for 0 work for undef?
+ def : Pat<(X86cload addr:$src1, undef, timm:$cond, EFLAGS),
+ (CFCMOV16rm addr:$src1, timm:$cond)>;
+ def : Pat<(X86cload addr:$src1, undef, timm:$cond, EFLAGS),
+ (CFCMOV32rm addr:$src1, timm:$cond)>;
+ def : Pat<(X86cload addr:$src1, undef, timm:$cond, EFLAGS),
+ (CFCMOV64rm addr:$src1, timm:$cond)>;
+
+ def : Pat<(X86cload addr:$src2, GR16:$src1, timm:$cond, EFLAGS),
+ (CFCMOV16rm_ND GR16:$src1, addr:$src2, timm:$cond)>;
+ def : Pat<(X86cload addr:$src2, GR32:$src1, timm:$cond, EFLAGS),
+ (CFCMOV32rm_ND GR32:$src1, addr:$src2, timm:$cond)>;
+ def : Pat<(X86cload addr:$src2, GR64:$src1, timm:$cond, EFLAGS),
+ (CFCMOV64rm_ND GR64:$src1, addr:$src2, timm:$cond)>;
+
+ def : Pat<(X86cstore GR16:$src2, addr:$src1, timm:$cond, EFLAGS),
+ (CFCMOV16mr addr:$src1, GR16:$src2, timm:$cond)>;
+ def : Pat<(X86cstore GR32:$src2, addr:$src1, timm:$cond, EFLAGS),
+ (CFCMOV32mr addr:$src1, GR32:$src2, timm:$cond)>;
+ def : Pat<(X86cstore GR64:$src2, addr:$src1, timm:$cond, EFLAGS),
+ (CFCMOV64mr addr:$src1, GR64:$src2, timm:$cond)>;
}
// SetCC instructions.
diff --git a/llvm/lib/Target/X86/X86InstrFragments.td b/llvm/lib/Target/X86/X86InstrFragments.td
index 162e322712a6d..972b56e0f0cfe 100644
--- a/llvm/lib/Target/X86/X86InstrFragments.td
+++ b/llvm/lib/Target/X86/X86InstrFragments.td
@@ -15,6 +15,15 @@ def SDTX86FCmp : SDTypeProfile<1, 2, [SDTCisVT<0, i32>, SDTCisFP<1>,
def SDTX86Ccmp : SDTypeProfile<1, 5,
[SDTCisVT<3, i8>, SDTCisVT<4, i8>, SDTCisVT<5, i32>]>;
+// res, chain = CLOAD inchain, ptr, passthru, cond, flags
+def SDTX86Cload : SDTypeProfile<1, 4,
+ [SDTCisInt<0>, SDTCisPtrTy<1>, SDTCisSameAs<0, 2>,
+ SDTCisVT<3, i8>, SDTCisVT<4, i32>]>;
+// chain = CSTORE inchain, val, ptr, cond, flags
+def SDTX86Cstore : SDTypeProfile<0, 4,
+ [SDTCisInt<0>, SDTCisPtrTy<1>,
+ SDTCisVT<2, i8>, SDTCisVT<3, i32>]>;
+
def SDTX86Cmov : SDTypeProfile<1, 4,
[SDTCisSameAs<0, 1>, SDTCisSameAs<1, 2>,
SDTCisVT<3, i8>, SDTCisVT<4, i32>]>;
@@ -144,6 +153,9 @@ def X86bt : SDNode<"X86ISD::BT", SDTX86CmpTest>;
def X86ccmp : SDNode<"X86ISD::CCMP", SDTX86Ccmp>;
def X86ctest : SDNode<"X86ISD::CTEST", SDTX86Ccmp>;
+def X86cload : SDNode<"X86ISD::CLOAD", SDTX86Cload, [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>;
+def X86cstore : SDNode<"X86ISD::CSTORE", SDTX86Cstore, [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>;
+
def X86cmov : SDNode<"X86ISD::CMOV", SDTX86Cmov>;
def X86brcond : SDNode<"X86ISD::BRCOND", SDTX86BrCond,
[SDNPHasChain]>;
diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
index de0144331dba3..1100be925b127 100644
--- a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
+++ b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
@@ -176,6 +176,23 @@ unsigned X86TTIImpl::getNumberOfRegisters(unsigned ClassID) const {
return 8;
}
+bool X86TTIImpl::hasConditionalFaultingLoadStoreForType(Type *Ty) const {
+ // Conditional faulting is supported by CFCMOV, which only accepts
+ // 16/32/64-bit operands.
+ // NOTE: Though VMOVSS/VMOVSD suppress memory faults with a zero mask, they
+ // incur a performance penalty.
+ if (!ST->hasCF() || !Ty || !Ty->isIntegerTy())
+ return false;
+ switch (cast<IntegerType>(Ty)->getBitWidth()) {
+ default:
+ return false;
+ case 16:
+ case 32:
+ case 64:
+ return true;
+ }
+}
+
TypeSize
X86TTIImpl::getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const {
unsigned PreferVectorWidth = ST->getPreferVectorWidth();
@@ -5891,14 +5908,21 @@ bool X86TTIImpl::canMacroFuseCmp() {
}
bool X86TTIImpl::isLegalMaskedLoad(Type *DataTy, Align Alignment) {
+ bool IsSingleElementVector =
+ isa<VectorType>(DataTy) &&
+ cast<FixedVectorType>(DataTy)->getNumElements() == 1;
+ Type *ScalarTy = DataTy->getScalarType();
+
+ if (ST->hasCF() && IsSingleElementVector &&
+ hasConditionalFaultingLoadStoreForType(ScalarTy))
+ return true;
+
if (!ST->hasAVX())
return false;
- // The backend can't handle a single element vector.
- if (isa<VectorType>(DataTy) &&
- cast<FixedVectorType>(DataTy)->getNumElements() == 1)
+ // The backend can't handle a single-element vector without CFCMOV.
+ if (IsSingleElementVector)
return false;
- Type *ScalarTy = DataTy->getScalarType();
if (ScalarTy->isPointerTy())
return true;
diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.h b/llvm/lib/Target/X86/X86TargetTransformInfo.h
index e14dc9fc09051..701648c6a2b3a 100644
--- a/llvm/lib/Target/X86/X86TargetTransformInfo.h
+++ b/llvm/lib/Target/X86/X86TargetTransformInfo.h
@@ -132,6 +132,7 @@ class X86TTIImpl : public BasicTTIImplBase<X86TTIImpl> {
/// @{
unsigned getNumberOfRegisters(unsigned ClassID) const;
+ bool hasConditionalFaultingLoadStoreForType(Type *Ty) const;
TypeSize getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const;
unsigned getLoadStoreVecRegBitWidth(unsigned AS) const;
unsigned getMaxInterleaveFactor(ElementCount VF);
diff --git a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
index 4e2dc7f2b2f4e..86bae71768dcd 100644
--- a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
+++ b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
@@ -131,6 +131,12 @@ static cl::opt<bool> HoistCondStores(
"simplifycfg-hoist-cond-stores", cl::Hidden, cl::init(true),
cl::desc("Hoist conditional stores if an unconditional store precedes"));
+static cl::opt<bool> HoistLoadsStoresWithCondFaulting(
+ "simplifycfg-hoist-loads-stores-with-cond-faulting", cl::Hidden,
+ cl::init(true),
+ cl::desc("Hoist loads/stores if the target supports "
+ "conditional faulting"));
+
static cl::opt<bool> MergeCondStores(
"simplifycfg-merge-cond-stores", cl::Hidden, cl::init(true),
cl::desc("Hoist conditional stores even if an unconditional store does not "
@@ -275,6 +281,7 @@ class SimplifyCFGOpt {
bool hoistSuccIdenticalTerminatorToSwitchOrIf(
Instruction *TI, Instruction *I1,
SmallVectorImpl<Instruction *> &OtherSuccTIs);
+ bool hoistLoadStoreWithCondFaultingFromSuccessors(BasicBlock *BB);
bool SpeculativelyExecuteBB(BranchInst *BI, BasicBlock *ThenBB);
bool SimplifyTerminatorOnSelect(Instruction *OldTerm, Value *Cond,
BasicBlock *TrueBB, BasicBlock *FalseBB,
@@ -2960,6 +2967,177 @@ static bool validateAndCostRequiredSelects(BasicBlock *BB, BasicBlock *ThenBB,
return HaveRewritablePHIs;
}
+/// Hoist load/store instructions from the conditional successor blocks up into
+/// the block.
+///
+/// We are looking for code like the following:
+/// \code
+/// BB:
+/// ...
+/// %cond = icmp ult %x, %y
+/// br i1 %cond, label %TrueBB, label %FalseBB
+/// FalseBB:
+/// store i32 1, ptr %q, align 4
+/// ...
+/// TrueBB:
+/// %0 = load i32, ptr %b, align 4
+/// store i32 %0, ptr %p, align 4
+/// ...
+/// \endcode
+///
+/// We are going to transform this into:
+///
+/// \code
+/// BB:
+/// ...
+/// %cond = icmp ult %x, %y
+/// %0 = cload i32, ptr %b, %cond
+/// cstore i32 %0, ptr %p, %cond
+/// cstore i32 1, ptr %q, ~%cond
+/// br i1 %cond, label %TrueBB, label %FalseBB
+/// FalseBB:
+/// ...
+/// TrueBB:
+/// ...
+/// \endcode
+///
+/// where cload/cstore is represented by intrinsic like llvm.masked.load/store,
+/// e.g.
+///
+/// \code
+/// %vcond = bitcast i1 %cond to <1 x i1>
+/// %v0 = call <1 x i32> @llvm.masked.load.v1i32.p0
+/// (ptr %b, i32 4, <1 x i1> %vcond, <1 x i32> poison)
+/// %0 = bitcast <1 x i32> %v0 to i32
+/// call void @llvm.masked.store.v1i32.p0
+/// (<1 x i32> %v0, ptr %p, i32 4, <1 x i1> %vcond)
+/// %cond.not = xor i1 %cond, true
+/// %vcond.not = bitcast i1 %cond.not to <1 x i1>
+/// call void @llvm.masked.store.v1i32.p0
+/// (<1 x i32> <i32 1>, ptr %q, i32 4, <1 x i1> %vcond.not)
+/// \endcode
+///
+/// \returns true if any load/store is hoisted.
+///
+/// Note that this transform should be run
+/// * before SpeculativelyExecuteBB so that the latter has more opportunities.
+/// * after hoistCommonCodeFromSuccessors to ensure unconditional loads/stores
+/// are handled first.
+bool SimplifyCFGOpt::hoistLoadStoreWithCondFaultingFromSuccessors(
+ BasicBlock *BB) {
+ auto *BI = dyn_cast<BranchInst>(BB->getTerminator());
+ if (!BI || !BI->isConditional())
+ return false;
+
+ BasicBlock *IfTrueBB = BI->getSuccessor(0);
+ BasicBlock *IfFalseBB = BI->getSuccessor(1);
+
+ // If either of the blocks has its address taken, then we can't do this fold,
+ // because the code we'd hoist would no longer run when we jump into the block
+ // by its address.
+ for (auto *Succ : {IfTrueBB, IfFalseBB})
+ if (Succ->hasAddressTaken() || !Succ->getSinglePredecessor())
+ return false;
+
+ // Collect hoisted loads/stores.
+ SmallSetVector<Instruction *, 4> HoistedInsts;
+ // Do not hoist a load/store if
+ // 1. the target does not have a corresponding conditional faulting load/store.
+ // 2. it is volatile or atomic.
+ // 3. there is a load/store that cannot be hoisted in the same bb.
+ // 4. there is a non-load/store that may have side effects in the same bb.
+ // 5. any operand of it does not dominate the branch.
+ // 6. it is a store and it would be hoisted above a skipped memory read.
+ auto HoistInstsInBB = [&](BasicBlock *BB) {
+ bool SkipMemoryRead = false;
+ // A more efficient way to check domination. An operand dominates the
+ // BranchInst if
+ // 1. it is not defined in the same bb as the instruction, or
+ // 2. it is to be hoisted.
+ //
+ // This holds because BB is the only predecessor and BranchInst defines no value.
+ auto OpsDominatesBranch = [&](Instruction &I) {
+ return llvm::none_of(I.operands(), [&](Value *Op) {
+ if (auto *J = dyn_cast<Instruction>(Op)) {
+ if (HoistedInsts.contains(J))
+ return false;
+ if (J->getParent() == I.getParent())
+ return true;
+ }
+ return false;
+ });
+ };
+ for (auto &I : *BB) {
+ auto *LI = dyn_cast<LoadInst>(&I);
+ auto *SI = dyn_cast<StoreInst>(&I);
+ if (LI || SI) {
+ auto *Type = LI ? I.getType() : I.getOperand(0)->getType();
+ bool IsSimple = (LI && LI->isSimple()) || (SI && SI->isSimple());
+ if (!TTI.hasConditionalFaultingLoadStoreForType(Type) || !IsSimple ||
+ !OpsDominatesBranch(I))
+ return false;
+ if (SI && SkipMemoryRead)
+ return false;
+ HoistedInsts.insert(&I);
+ } else if (I.mayHaveSideEffects())
+ return false;
+ else if (I.mayReadFromMemory())
+ SkipMemoryRead = true;
+ }
+ return true;
+ };
+
+ if (!HoistInstsInBB(IfTrueBB) || !HoistInstsInBB(IfFalseBB) ||
+ HoistedInsts.empty())
+ return false;
+
+ // Put newly added instructions before the BranchInst.
+ IRBuilder<> Builder(BI);
+ auto &Context = BB->getContext();
+ auto *VCondTy = FixedVectorType::get(Type::getInt1Ty(Context), 1);
+ auto *Cond = BI->getOperand(0);
+ auto *VCond = Builder.CreateBitCast(Cond, VCondTy);
+ Value *VCondNot = nullptr;
+ for (auto *I : HoistedInsts) {
+ bool InvertCond = I->getParent() == IfFalseBB;
+ // Construct the inverted condition if needed.
+ if (InvertCond && !VCondNot)
+ VCondNot = Builder.CreateBitCast(
+ Builder.CreateXor(Cond, ConstantInt::getTrue(Context)), VCondTy);
+
+ auto *Mask = InvertCond ? VCondNot : VCond;
+ auto *Op0 = I->getOperand(0);
+ if (auto *LI = dyn_cast<LoadInst>(I)) {
+ // Load
+ auto *Ty = I->getType();
+ // NOTE: Now we assume conditional faulting load/store is supported when
+ // creating new instructions, but it's easy to extend it for vector types
+ // in the future.
+ assert(!Ty->isVectorTy() && "not implemented");
+ auto *V0 = Builder.CreateMaskedLoad(FixedVectorType::get(Ty, 1), Op0,
+ LI->getAlign(), Mask);
+ auto *S0 = Builder.CreateBitCast(V0, Ty);
+ V0->copyMetadata(*I);
+ I->replaceAllUsesWith(S0);
+ } else {
+ // Store
+ assert(!Op0->getType()->isVectorTy() && "not implemented");
+ auto *StoredVal =
+ Builder.CreateBitCast(Op0, FixedVectorType::get(Op0->getType(), 1));
+ auto *VStore = Builder.CreateMaskedStore(
+ StoredVal, I->getOperand(1), cast<StoreInst>(I)->getAlign(), Mask);
+ VStore->copyMetadata(*I);
+ }
+ }
+
+ // Erase the hoisted instructions in reverse order to avoid use-before-def
+ // errors.
+ std::for_each(HoistedInsts.rbegin(), HoistedInsts.rend(),
+ [](auto I) { I->eraseFromParent(); });
+
+ return true;
+}
+
/// Speculate a conditional basic block flattening the CFG.
///
/// Note that this is a very risky transform currently. Speculating
@@ -7419,14 +7597,20 @@ bool SimplifyCFGOpt::simplifyCondBranch(BranchInst *BI, IRBuilder<> &Builder) {
return requestResimplify();
// We have a conditional branch to two blocks that are only reachable
- // from BI. We know that the condbr dominates the two blocks, so see if
- // there is any identical code in the "then" and "else" blocks. If so, we
- // can hoist it up to the branching block.
+ // from BI. We know that the condbr dominates the two blocks, so see
+ //
+ // * if there is any identical code in the "then" and "else" blocks.
+ // * if there is any load/store in only one block that can be hoisted with conditional faulting.
+ //
+ // If so, we can hoist it up to the branching block.
if (BI->getSuccessor(0)->getSinglePredecessor()) {
if (BI->getSuccessor(1)->getSinglePredecessor()) {
if (HoistCommon && hoistCommonCodeFromSuccessors(
BI->getParent(), !Options.HoistCommonInsts))
return requestResimplify();
+ if (HoistLoadsStoresWithCondFaulting &&
+ hoistLoadStoreWithCondFaultingFromSuccessors(BI->getParent()))
+ return requestResimplify();
} else {
// If Successor #1 has multiple preds, we may be able to conditionally
// execute Successor #0 if it branches to Successor #1.
diff --git a/llvm/test/CodeGen/X86/apx/cf.ll b/llvm/test/CodeGen/X86/apx/cf.ll
new file mode 100644
index 0000000000000..0253a633fd4b0
--- /dev/null
+++ b/llvm/test/CodeGen/X86/apx/cf.ll
@@ -0,0 +1,90 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64 -mattr=+cf -verify-machineinstrs | FileCheck %s
+
+define void @basic(i32 %a, ptr %b, ptr %p, ptr %q) {
+; CHECK-LABEL: basic:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: testl %edi, %edi
+; CHECK-NEXT: setne %al
+; CHECK-NEXT: sete %dil
+; CHECK-NEXT: negb %dil
+; CHECK-NEXT: cfcmovnel (%rsi), %esi
+; CHECK-NEXT: cfcmovnel %esi, (%rdx)
+; CHECK-NEXT: negb %al
+; CHECK-NEXT: movl $1, %eax
+; CHECK-NEXT: cfcmovneq %rax, (%rdx)
+; CHECK-NEXT: movw $2, %ax
+; CHECK-NEXT: cfcmovnew %ax, (%rcx)
+; CHECK-NEXT: retq
+entry:
+ %cond = icmp eq i32 %a, 0
+ %0 = bitcast i1 %cond to <1 x i1>
+ %1 = call <1 x i32> @llvm.masked.load.v1i32.p0(ptr %b, i32 4, <1 x i1> %0, <1 x i32> poison)
+ call void @llvm.masked.store.v1i32.p0(<1 x i32> %1, ptr %p, i32 4, <1 x i1> %0)
+ %2 = xor i1 %cond, true
+ %3 = bitcast i1 %2 to <1 x i1>
+ call void @llvm.masked.store.v1i64.p0(<1 x i64> <i64 1>, ptr %p, i32 8, <1 x i1> %3)
+ call void @llvm.masked.store.v1i16.p0(<1 x i16> <i16 2>, ptr %q, i32 8, <1 x i1> %3)
+ ret void
+}
+
+define i16 @cload_passthru_zero(i16 %a, ptr %b) {
+; CHECK-LABEL: cload_passthru_zero:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: testw %di, %di
+; CHECK-NEXT: sete %al
+; CHECK-NEXT: negb %al
+; CHECK-NEXT: cfcmovnew (%rsi), %ax
+; CHECK-NEXT: retq
+entry:
+ %cond = icmp eq i16 %a, 0
+ %0 = bitcast i1 %cond to <1 x i1>
+ %1 = call <1 x i16> @llvm.masked.load.v1i16.p0(ptr %b, i32 4, <1 x i1> %0, <1 x i16> <i16 0>)
+ %2 = bitcast <1 x i16> %1 to i16
+ ret i16 %2
+}
+
+define i64 @cload_passthru_not_zero(i64 %a, ptr %b) {
+; CHECK-LABEL: cload_passthru_not_zero:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: testq %rdi, %rdi
+; CHECK-NEXT: sete %al
+; CHECK-NEXT: negb %al
+; CHECK-NEXT: cfcmovneq (%rsi), %rdi, %rax
+; CHECK-NEXT: retq
+entry:
+ %cond = icmp eq i64 %a, 0
+ %0 = bitcast i1 %cond to <1 x i1>
+ %va = bitcast i64 %a to <1 x i64>
+ %1 = call <1 x i64> @llvm.masked.load.v1i64.p0(ptr %b, i32 4, <1 x i1> %0, <1 x i64> %va)
+ %2 = bitcast <1 x i64> %1 to i64
+ ret i64 %2
+}
+
+define i16 @cond_false(ptr %b) {
+; CHECK-LABEL: cond_false:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: xorl %eax, %eax
+; CHECK-NEXT: negb %al
+; CHECK-NEXT: cfcmovnew (%rdi), %ax
+; CHECK-NEXT: retq
+entry:
+ %0 = bitcast i1 false to <1 x i1>
+ %1 = call <1 x i16> @llvm.masked.load.v1i16.p0(ptr %b, i32 4, <1 x i1> %0, <1 x i16> <i16 0>)
+ %2 = bitcast <1 x i16> %1 to i16
+ ret i16 %2
+}
+
+define i64 @cond_true(ptr %b) {
+; CHECK-LABEL: cond_true:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: movb $1, %al
+; CHECK-NEXT: negb %al
+; CHECK-NEXT: cfcmovneq (%rdi), %rax
+; CHECK-NEXT: retq
+entry:
+ %0 = bitcast i1 true to <1 x i1>
+ %1 = call <1 x i64> @llvm.masked.load.v1i64.p0(ptr %b, i32 4, <1 x i1> %0, <1 x i64> <i64 0>)
+ %2 = bitcast <1 x i64> %1 to i64
+ ret i64 %2
+}
diff --git a/llvm/test/Transforms/SimplifyCFG/X86/hoist-load-store-with-cf.ll b/llvm/test/Transforms/SimplifyCFG/X86/hoist-load-store-with-cf.ll
new file mode 100644
index 0000000000000..16cb30a5be926
--- /dev/null
+++ b/llvm/test/Transforms/SimplifyCFG/X86/hoist-load-store-with-cf.ll
@@ -0,0 +1,357 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -mtriple=x86_64 -mattr=+cf -passes=simplifycfg -simplifycfg-require-and-preserve-domtree=1 -S -simplifycfg-hoist-loads-stores-with-cond-faulting=true | FileCheck %s
+
+;; The redundant bitcast/insertelement will be optimized out by the instcombine pass.
+define void @basic(i32 %a, ptr %b, ptr %p, ptr %q) {
+; CHECK-LABEL: @basic(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[COND:%.*]] = icmp eq i32 [[A:%.*]], 0
+; CHECK-NEXT: [[TMP0:%.*]] = bitcast i1 [[COND]] to <1 x i1>
+; CHECK-NEXT: [[TMP1:%.*]] = call <1 x i32> @llvm.masked.load.v1i32.p0(ptr [[B:%.*]], i32 4, <1 x i1> [[TMP0]], <1 x i32> poison), !dbg [[DBG8:![0-9]+]]
+; CHECK-NEXT: [[TMP2:%.*]] = bitcast <1 x i32> [[TMP1]] to i32
+; CHECK-NEXT: [[TMP3:%.*]] = bitcast i32 [[TMP2]] to <1 x i32>
+; CHECK-NEXT: call void @llvm.masked.store.v1i32.p0(<1 x i32> [[TMP3]], ptr [[P:%.*]], i32 4, <1 x i1> [[TMP0]])
+; CHECK-NEXT: [[TMP4:%.*]] = xor i1 [[COND]], true
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast i1 [[TMP4]] to <1 x i1>
+; CHECK-NEXT: call void @llvm.masked.store.v1i64.p0(<1 x i64> <i64 1>, ptr [[P]], i32 8, <1 x i1> [[TMP5]]), !dbg [[DBG12:![0-9]+]]
+; CHECK-NEXT: call void @llvm.masked.store.v1i16.p0(<1 x i16> <i16 2>, ptr [[Q:%.*]], i32 8, <1 x i1> [[TMP5]]), !dbg [[DBG12]]
+; CHECK-NEXT: ret void
+;
+entry:
+ %cond = icmp eq i32 %a, 0
+ br i1 %cond, label %if.true, label %if.false
+
+if.false:
+ store i64 1, ptr %p, align 8, !dbg !8
+ store i16 2, ptr %q, align 8, !dbg !8
+ br label %if.end
+
+if.true:
+ %0 = load i32, ptr %b, align 4, !dbg !9
+ store i32 %0, ptr %p, align 4
+ br label %if.end
+
+if.end:
+ ret void
+}
+
+define void @not_supported_type(i8 %a, ptr %b, ptr %p, ptr %q) {
+; CHECK-LABEL: @not_supported_type(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[COND:%.*]] = icmp eq i8 [[A:%.*]], 0
+; CHECK-NEXT: br i1 [[COND]], label [[IF_TRUE:%.*]], label [[IF_FALSE:%.*]]
+; CHECK: if.false:
+; CHECK-NEXT: store i8 1, ptr [[Q:%.*]], align 1
+; CHECK-NEXT: br label [[IF_END:%.*]]
+; CHECK: if.true:
+; CHECK-NEXT: [[TMP0:%.*]] = load i8, ptr [[B:%.*]], align 1
+; CHECK-NEXT: store i8 [[TMP0]], ptr [[P:%.*]], align 1
+; CHECK-NEXT: br label [[IF_END]]
+; CHECK: if.end:
+; CHECK-NEXT: ret void
+;
+entry:
+ %cond = icmp eq i8 %a, 0
+ br i1 %cond, label %if.true, label %if.false
+
+if.false:
+ store i8 1, ptr %q
+ br label %if.end
+
+if.true:
+ %0 = load i8, ptr %b
+ store i8 %0, ptr %p
+ br label %if.end
+
+if.end:
+ ret void
+}
+
+define void @not_br_terminator(i32 %a, ptr %b, ptr %p, ptr %q) {
+; CHECK-LABEL: @not_br_terminator(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: switch i32 [[A:%.*]], label [[IF_END:%.*]] [
+; CHECK-NEXT: i32 1, label [[IF_FALSE:%.*]]
+; CHECK-NEXT: i32 2, label [[IF_TRUE:%.*]]
+; CHECK-NEXT: ]
+; CHECK: if.false:
+; CHECK-NEXT: store i32 1, ptr [[Q:%.*]], align 4
+; CHECK-NEXT: br label [[IF_END]]
+; CHECK: if.true:
+; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[B:%.*]], align 4
+; CHECK-NEXT: store i32 [[TMP0]], ptr [[P:%.*]], align 4
+; CHECK-NEXT: br label [[IF_FALSE]]
+; CHECK: if.end:
+; CHECK-NEXT: ret void
+;
+entry:
+ switch i32 %a, label %if.end [
+ i32 1, label %if.false
+ i32 2, label %if.true
+ ]
+
+if.false:
+ store i32 1, ptr %q, align 4
+ br label %if.end
+
+if.true:
+ %0 = load i32, ptr %b, align 4
+ store i32 %0, ptr %p, align 4
+ br label %if.false
+
+if.end:
+ ret void
+}
+
+define void @not_single_predecessor(i32 %a, ptr %b, ptr %p, ptr %q) {
+; CHECK-LABEL: @not_single_predecessor(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[COND:%.*]] = icmp eq i32 [[A:%.*]], 0
+; CHECK-NEXT: br i1 [[COND]], label [[IF_TRUE:%.*]], label [[IF_FALSE:%.*]]
+; CHECK: if.false:
+; CHECK-NEXT: store i32 1, ptr [[Q:%.*]], align 4
+; CHECK-NEXT: ret void
+; CHECK: if.true:
+; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[B:%.*]], align 4
+; CHECK-NEXT: store i32 [[TMP0]], ptr [[P:%.*]], align 4
+; CHECK-NEXT: br label [[IF_FALSE]]
+;
+entry:
+ %cond = icmp eq i32 %a, 0
+ br i1 %cond, label %if.true, label %if.false
+
+if.false:
+ store i32 1, ptr %q, align 4
+ br label %if.end
+
+if.true:
+ %0 = load i32, ptr %b, align 4
+ store i32 %0, ptr %p, align 4
+ br label %if.false
+
+if.end:
+ ret void
+}
+
+define void @not_simple(i32 %a, ptr %b, ptr %p, ptr %q) {
+; CHECK-LABEL: @not_simple(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[COND:%.*]] = icmp eq i32 [[A:%.*]], 0
+; CHECK-NEXT: br i1 [[COND]], label [[IF_TRUE:%.*]], label [[IF_FALSE:%.*]]
+; CHECK: if.false:
+; CHECK-NEXT: store atomic i32 1, ptr [[Q:%.*]] seq_cst, align 4
+; CHECK-NEXT: br label [[IF_END:%.*]]
+; CHECK: if.true:
+; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[B:%.*]], align 4
+; CHECK-NEXT: store i32 [[TMP0]], ptr [[P:%.*]], align 4
+; CHECK-NEXT: br label [[IF_END]]
+; CHECK: if.end:
+; CHECK-NEXT: ret void
+;
+entry:
+ %cond = icmp eq i32 %a, 0
+ br i1 %cond, label %if.true, label %if.false
+
+if.false:
+ store atomic i32 1, ptr %q seq_cst, align 4
+ br label %if.end
+
+if.true:
+ %0 = load i32, ptr %b, align 4
+ store i32 %0, ptr %p, align 4
+ br label %if.end
+
+if.end:
+ ret void
+}
+
+define void @not_hoistable_store(i32 %a, ptr %b, ptr %p, ptr %q) {
+; CHECK-LABEL: @not_hoistable_store(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[COND:%.*]] = icmp eq i32 [[A:%.*]], 0
+; CHECK-NEXT: br i1 [[COND]], label [[IF_TRUE:%.*]], label [[IF_FALSE:%.*]]
+; CHECK: if.false:
+; CHECK-NEXT: store i32 1, ptr [[Q:%.*]], align 4
+; CHECK-NEXT: br label [[IF_END:%.*]]
+; CHECK: if.true:
+; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[B:%.*]], align 4
+; CHECK-NEXT: store volatile i32 [[TMP0]], ptr [[P:%.*]], align 4
+; CHECK-NEXT: br label [[IF_END]]
+; CHECK: if.end:
+; CHECK-NEXT: ret void
+;
+entry:
+ %cond = icmp eq i32 %a, 0
+ br i1 %cond, label %if.true, label %if.false
+
+if.false:
+ store i32 1, ptr %q, align 4
+ br label %if.end
+
+if.true:
+ %0 = load i32, ptr %b, align 4
+ store volatile i32 %0, ptr %p, align 4
+ br label %if.end
+
+if.end:
+ ret void
+}
+
+define void @not_hoistable_sideeffect(i32 %a, ptr %b, ptr %p, ptr %q) {
+; CHECK-LABEL: @not_hoistable_sideeffect(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[COND:%.*]] = icmp eq i32 [[A:%.*]], 0
+; CHECK-NEXT: br i1 [[COND]], label [[IF_TRUE:%.*]], label [[IF_FALSE:%.*]]
+; CHECK: if.false:
+; CHECK-NEXT: store i32 1, ptr [[Q:%.*]], align 4
+; CHECK-NEXT: br label [[IF_END:%.*]]
+; CHECK: if.true:
+; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[B:%.*]], align 4
+; CHECK-NEXT: [[RMW:%.*]] = atomicrmw xchg ptr [[Q]], double 4.000000e+00 seq_cst, align 8
+; CHECK-NEXT: store i32 [[TMP0]], ptr [[P:%.*]], align 4
+; CHECK-NEXT: br label [[IF_END]]
+; CHECK: if.end:
+; CHECK-NEXT: ret void
+;
+entry:
+ %cond = icmp eq i32 %a, 0
+ br i1 %cond, label %if.true, label %if.false
+
+if.false:
+ store i32 1, ptr %q, align 4
+ br label %if.end
+
+if.true:
+ %0 = load i32, ptr %b, align 4
+ %rmw= atomicrmw xchg ptr %q, double 4.0 seq_cst
+ store i32 %0, ptr %p, align 4
+ br label %if.end
+
+if.end:
+ ret void
+}
+
+define void @not_ops_dominate_br(i32 %a, ptr %b, ptr %p, ptr %q) {
+; CHECK-LABEL: @not_ops_dominate_br(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[COND:%.*]] = icmp eq i32 [[A:%.*]], 0
+; CHECK-NEXT: br i1 [[COND]], label [[IF_TRUE:%.*]], label [[IF_FALSE:%.*]]
+; CHECK: if.false:
+; CHECK-NEXT: [[ADD:%.*]] = add i32 [[A]], 2
+; CHECK-NEXT: store i32 [[ADD]], ptr [[Q:%.*]], align 4
+; CHECK-NEXT: br label [[IF_END:%.*]]
+; CHECK: if.true:
+; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[B:%.*]], align 4
+; CHECK-NEXT: store i32 [[TMP0]], ptr [[P:%.*]], align 4
+; CHECK-NEXT: br label [[IF_END]]
+; CHECK: if.end:
+; CHECK-NEXT: ret void
+;
+entry:
+ %cond = icmp eq i32 %a, 0
+ br i1 %cond, label %if.true, label %if.false
+
+if.false:
+ %add = add i32 %a, 2
+ store i32 %add, ptr %q, align 4
+ br label %if.end
+
+if.true:
+ %1 = load i32, ptr %b, align 4
+ store i32 %1, ptr %p, align 4
+ br label %if.end
+
+if.end:
+ ret void
+}
+
+define void @load_skip_memory_read(i32 %a, ptr %b, ptr %p, ptr %q) {
+; CHECK-LABEL: @load_skip_memory_read(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[COND:%.*]] = icmp eq i32 [[A:%.*]], 0
+; CHECK-NEXT: [[TMP0:%.*]] = bitcast i1 [[COND]] to <1 x i1>
+; CHECK-NEXT: [[TMP1:%.*]] = call <1 x i32> @llvm.masked.load.v1i32.p0(ptr [[B:%.*]], i32 4, <1 x i1> [[TMP0]], <1 x i32> poison)
+; CHECK-NEXT: [[TMP2:%.*]] = bitcast <1 x i32> [[TMP1]] to i32
+; CHECK-NEXT: [[TMP3:%.*]] = bitcast i32 [[TMP2]] to <1 x i32>
+; CHECK-NEXT: call void @llvm.masked.store.v1i32.p0(<1 x i32> [[TMP3]], ptr [[P:%.*]], i32 4, <1 x i1> [[TMP0]])
+; CHECK-NEXT: [[TMP4:%.*]] = xor i1 [[COND]], true
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast i1 [[TMP4]] to <1 x i1>
+; CHECK-NEXT: [[TMP6:%.*]] = call <1 x i32> @llvm.masked.load.v1i32.p0(ptr [[Q:%.*]], i32 4, <1 x i1> [[TMP5]], <1 x i32> poison)
+; CHECK-NEXT: [[TMP7:%.*]] = bitcast <1 x i32> [[TMP6]] to i32
+; CHECK-NEXT: br i1 [[COND]], label [[IF_END:%.*]], label [[IF_FALSE:%.*]]
+; CHECK: if.false:
+; CHECK-NEXT: call void @read_memory_only()
+; CHECK-NEXT: br label [[IF_END]]
+; CHECK: if.end:
+; CHECK-NEXT: ret void
+;
+entry:
+ %cond = icmp eq i32 %a, 0
+ br i1 %cond, label %if.true, label %if.false
+
+if.false:
+ call void @read_memory_only()
+ %0 = load i32, ptr %q, align 4
+ br label %if.end
+
+if.true:
+ %1 = load i32, ptr %b, align 4
+ store i32 %1, ptr %p, align 4
+ br label %if.end
+
+if.end:
+ ret void
+}
+
+define void @not_store_skip_memory_read(i32 %a, ptr %b, ptr %p, ptr %q) {
+; CHECK-LABEL: @not_store_skip_memory_read(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[COND:%.*]] = icmp eq i32 [[A:%.*]], 0
+; CHECK-NEXT: br i1 [[COND]], label [[IF_TRUE:%.*]], label [[IF_FALSE:%.*]]
+; CHECK: if.false:
+; CHECK-NEXT: call void @read_memory_only()
+; CHECK-NEXT: store i32 1, ptr [[Q:%.*]], align 4
+; CHECK-NEXT: br label [[IF_END:%.*]]
+; CHECK: if.true:
+; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[B:%.*]], align 4
+; CHECK-NEXT: store i32 [[TMP0]], ptr [[P:%.*]], align 4
+; CHECK-NEXT: br label [[IF_END]]
+; CHECK: if.end:
+; CHECK-NEXT: ret void
+;
+entry:
+ %cond = icmp eq i32 %a, 0
+ br i1 %cond, label %if.true, label %if.false
+
+if.false:
+ call void @read_memory_only()
+ store i32 1, ptr %q, align 4
+ br label %if.end
+
+if.true:
+ %1 = load i32, ptr %b, align 4
+ store i32 %1, ptr %p, align 4
+ br label %if.end
+
+if.end:
+ ret void
+}
+
+declare void @read_memory_only() readonly nounwind willreturn
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!2, !3, !4, !5, !6, !7}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C11, file: !1, producer: "clang", isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly, splitDebugInlining: false, nameTableKind: None)
+!1 = !DIFile(filename: "test.c", directory: "/tmp")
+!2 = !{i32 7, !"Dwarf Version", i32 5}
+!3 = !{i32 2, !"Debug Info Version", i32 3}
+!4 = !{i32 1, !"wchar_size", i32 4}
+!5 = !{i32 8, !"PIC Level", i32 2}
+!6 = !{i32 7, !"PIE Level", i32 2}
+!7 = !{i32 7, !"uwtable", i32 2}
+!8 = !DILocation(line: 1, column: 2, scope: !10)
+!9 = !DILocation(line: 1, column: 3, scope: !10)
+!10 = distinct !DISubprogram(name: "basic", scope: !1, file: !1, line: 1, type: !11, scopeLine: 1, flags: DIFlagPrototyped, unit: !0)
+!11 = !DISubroutineType(types: !12)
+!12 = !{}
>From 380d23a899530a9d32cf41f384264d9466d1f132 Mon Sep 17 00:00:00 2001
From: Shengchen Kan <shengchen.kan at intel.com>
Date: Sun, 23 Jun 2024 20:52:44 +0800
Subject: [PATCH 02/12] fix compile failure: Not a vector MVT in getMaskedMemoryOpCost
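A minimal IR sketch of the kind of input that reaches the crashing path (my
assumption: with 1 x i16/i32/i64 masked ops now legal, the legalized type in
getMaskedMemoryOpCost is a scalar MVT, so calling getVectorNumElements() on
it asserted):

  define <1 x i32> @cost_query_sketch(ptr %p, <1 x i1> %m) {
    %v = call <1 x i32> @llvm.masked.load.v1i32.p0(ptr %p, i32 4, <1 x i1> %m, <1 x i32> poison)
    ret <1 x i32> %v
  }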
---
llvm/lib/Target/X86/X86TargetTransformInfo.cpp | 11 ++++++++---
1 file changed, 8 insertions(+), 3 deletions(-)
diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
index 1100be925b127..3c83c8403121b 100644
--- a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
+++ b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
@@ -5079,7 +5079,12 @@ X86TTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *SrcTy, Align Alignment,
std::pair<InstructionCost, MVT> LT = getTypeLegalizationCost(SrcVTy);
auto VT = TLI->getValueType(DL, SrcVTy);
InstructionCost Cost = 0;
- if (VT.isSimple() && LT.second != VT.getSimpleVT() &&
+ MVT Ty = LT.second;
+ if (Ty == MVT::i16 || Ty == MVT::i32 || Ty == MVT::i64)
+ // APX masked load/store for scalar is cheap.
+ return Cost + LT.first;
+
+ if (VT.isSimple() && Ty != VT.getSimpleVT() &&
LT.second.getVectorNumElements() == NumElem)
// Promotion requires extend/truncate for data and a shuffle for mask.
Cost += getShuffleCost(TTI::SK_PermuteTwoSrc, SrcVTy, std::nullopt,
@@ -5087,9 +5092,9 @@ X86TTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *SrcTy, Align Alignment,
getShuffleCost(TTI::SK_PermuteTwoSrc, MaskTy, std::nullopt,
CostKind, 0, nullptr);
- else if (LT.first * LT.second.getVectorNumElements() > NumElem) {
+ else if (LT.first * Ty.getVectorNumElements() > NumElem) {
auto *NewMaskTy = FixedVectorType::get(MaskTy->getElementType(),
- LT.second.getVectorNumElements());
+ Ty.getVectorNumElements());
// Expanding requires fill mask with zeroes
Cost += getShuffleCost(TTI::SK_InsertSubvector, NewMaskTy, std::nullopt,
CostKind, 0, MaskTy);
>From e4a668607b9b251eee9380648eb41cff2f44a4c4 Mon Sep 17 00:00:00 2001
From: Shengchen Kan <shengchen.kan at intel.com>
Date: Sun, 23 Jun 2024 23:08:02 +0800
Subject: [PATCH 03/12] support more cases in mid-end
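One newly handled shape, copied from the new @succ1to0 test below: a triangle
CFG where the "then" block branches directly to the other successor, so the
strict single-predecessor requirement can be relaxed:

  define void @succ1to0(ptr %p, ptr %q, i32 %a) {
  entry:
    %tobool = icmp ne i32 %a, 0
    br i1 %tobool, label %if.end, label %if.then
  if.end:
    ret void
  if.then:
    %0 = load i32, ptr %q
    store i32 %0, ptr %p
    br label %if.end
  }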
---
llvm/lib/Transforms/Utils/SimplifyCFG.cpp | 39 ++++-
.../X86/hoist-load-store-with-cf.ll | 133 ++++++++++++++----
2 files changed, 135 insertions(+), 37 deletions(-)
diff --git a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
index 86bae71768dcd..892d48dba9bc1 100644
--- a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
+++ b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
@@ -2967,6 +2967,10 @@ static bool validateAndCostRequiredSelects(BasicBlock *BB, BasicBlock *ThenBB,
return HaveRewritablePHIs;
}
+static bool isLoadFromAlloca(const Instruction &I) {
+ return isa<LoadInst>(I) && isa<AllocaInst>(I.getOperand(0));
+}
+
/// Hoist load/store instructions from the conditional successor blocks up into
/// the block.
///
@@ -3025,6 +3029,9 @@ static bool validateAndCostRequiredSelects(BasicBlock *BB, BasicBlock *ThenBB,
/// are handled first.
bool SimplifyCFGOpt::hoistLoadStoreWithCondFaultingFromSuccessors(
BasicBlock *BB) {
+ if (!HoistLoadsStoresWithCondFaulting)
+ return false;
+
auto *BI = dyn_cast<BranchInst>(BB->getTerminator());
if (!BI || !BI->isConditional())
return false;
@@ -3036,7 +3043,7 @@ bool SimplifyCFGOpt::hoistLoadStoreWithCondFaultingFromSuccessors(
// because the code we'd hoist would no longer run when we jump into the block
// by its address.
for (auto *Succ : {IfTrueBB, IfFalseBB})
- if (Succ->hasAddressTaken() || !Succ->getSinglePredecessor())
+ if (Succ->hasAddressTaken())
return false;
// Collect hoisted loads/stores.
@@ -3071,6 +3078,11 @@ bool SimplifyCFGOpt::hoistLoadStoreWithCondFaultingFromSuccessors(
auto *LI = dyn_cast<LoadInst>(&I);
auto *SI = dyn_cast<StoreInst>(&I);
if (LI || SI) {
+ // A load from an alloca is always safe.
+ if (isLoadFromAlloca(I)) {
+ HoistedInsts.insert(&I);
+ continue;
+ }
auto *Type = LI ? I.getType() : I.getOperand(0)->getType();
bool IsSimple = (LI && LI->isSimple()) || (SI && SI->isSimple());
if (!TTI.hasConditionalFaultingLoadStoreForType(Type) || !IsSimple ||
@@ -3099,6 +3111,12 @@ bool SimplifyCFGOpt::hoistLoadStoreWithCondFaultingFromSuccessors(
auto *VCond = Builder.CreateBitCast(Cond, VCondTy);
Value *VCondNot = nullptr;
for (auto *I : HoistedInsts) {
+ // A load from an alloca only needs to be moved before the branch.
+ if (isLoadFromAlloca(*I)) {
+ I->moveBefore(BI);
+ continue;
+ }
+
bool InvertCond = I->getParent() == IfFalseBB;
// Construct the inverted condition if need.
if (InvertCond && !VCondNot)
@@ -3132,8 +3150,10 @@ bool SimplifyCFGOpt::hoistLoadStoreWithCondFaultingFromSuccessors(
// Erase the hoisted instructions in reverse order to avoid use-before-def
// errors.
- std::for_each(HoistedInsts.rbegin(), HoistedInsts.rend(),
- [](auto I) { I->eraseFromParent(); });
+ std::for_each(HoistedInsts.rbegin(), HoistedInsts.rend(), [](auto I) {
+ if (!isLoadFromAlloca(*I))
+ I->eraseFromParent();
+ });
return true;
}
@@ -7608,26 +7628,31 @@ bool SimplifyCFGOpt::simplifyCondBranch(BranchInst *BI, IRBuilder<> &Builder) {
if (HoistCommon && hoistCommonCodeFromSuccessors(
BI->getParent(), !Options.HoistCommonInsts))
return requestResimplify();
- if (HoistLoadsStoresWithCondFaulting &&
- hoistLoadStoreWithCondFaultingFromSuccessors(BI->getParent()))
+ if (hoistLoadStoreWithCondFaultingFromSuccessors(BI->getParent()))
return requestResimplify();
} else {
// If Successor #1 has multiple preds, we may be able to conditionally
// execute Successor #0 if it branches to Successor #1.
Instruction *Succ0TI = BI->getSuccessor(0)->getTerminator();
if (Succ0TI->getNumSuccessors() == 1 &&
- Succ0TI->getSuccessor(0) == BI->getSuccessor(1))
+ Succ0TI->getSuccessor(0) == BI->getSuccessor(1)) {
+ if (hoistLoadStoreWithCondFaultingFromSuccessors(BI->getParent()))
+ return requestResimplify();
if (SpeculativelyExecuteBB(BI, BI->getSuccessor(0)))
return requestResimplify();
+ }
}
} else if (BI->getSuccessor(1)->getSinglePredecessor()) {
// If Successor #0 has multiple preds, we may be able to conditionally
// execute Successor #1 if it branches to Successor #0.
Instruction *Succ1TI = BI->getSuccessor(1)->getTerminator();
if (Succ1TI->getNumSuccessors() == 1 &&
- Succ1TI->getSuccessor(0) == BI->getSuccessor(0))
+ Succ1TI->getSuccessor(0) == BI->getSuccessor(0)) {
+ if (hoistLoadStoreWithCondFaultingFromSuccessors(BI->getParent()))
+ return requestResimplify();
if (SpeculativelyExecuteBB(BI, BI->getSuccessor(1)))
return requestResimplify();
+ }
}
// If this is a branch on something for which we know the constant value in
diff --git a/llvm/test/Transforms/SimplifyCFG/X86/hoist-load-store-with-cf.ll b/llvm/test/Transforms/SimplifyCFG/X86/hoist-load-store-with-cf.ll
index 16cb30a5be926..7e7f2768cf302 100644
--- a/llvm/test/Transforms/SimplifyCFG/X86/hoist-load-store-with-cf.ll
+++ b/llvm/test/Transforms/SimplifyCFG/X86/hoist-load-store-with-cf.ll
@@ -35,6 +35,109 @@ if.end:
ret void
}
+;; SimplifyCFG runs before SROA, so the allocas here are not optimized away yet.
+define void @alloca(ptr %p, ptr %q, i32 %a) {
+; CHECK-LABEL: @alloca(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[P_ADDR:%.*]] = alloca ptr, align 8
+; CHECK-NEXT: [[Q_ADDR:%.*]] = alloca ptr, align 8
+; CHECK-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4
+; CHECK-NEXT: store ptr [[P:%.*]], ptr [[P_ADDR]], align 8
+; CHECK-NEXT: store ptr [[Q:%.*]], ptr [[Q_ADDR]], align 8
+; CHECK-NEXT: store i32 [[A:%.*]], ptr [[A_ADDR]], align 4
+; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[A_ADDR]], align 4
+; CHECK-NEXT: [[TOBOOL:%.*]] = icmp ne i32 [[TMP0]], 0
+; CHECK-NEXT: [[TMP1:%.*]] = bitcast i1 [[TOBOOL]] to <1 x i1>
+; CHECK-NEXT: [[TMP2:%.*]] = load ptr, ptr [[Q_ADDR]], align 8
+; CHECK-NEXT: [[TMP3:%.*]] = call <1 x i32> @llvm.masked.load.v1i32.p0(ptr [[TMP2]], i32 4, <1 x i1> [[TMP1]], <1 x i32> poison)
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast <1 x i32> [[TMP3]] to i32
+; CHECK-NEXT: [[TMP5:%.*]] = load ptr, ptr [[P_ADDR]], align 8
+; CHECK-NEXT: [[TMP6:%.*]] = bitcast i32 [[TMP4]] to <1 x i32>
+; CHECK-NEXT: call void @llvm.masked.store.v1i32.p0(<1 x i32> [[TMP6]], ptr [[TMP5]], i32 4, <1 x i1> [[TMP1]])
+; CHECK-NEXT: ret void
+;
+entry:
+ %p.addr = alloca ptr
+ %q.addr = alloca ptr
+ %a.addr = alloca i32
+ store ptr %p, ptr %p.addr
+ store ptr %q, ptr %q.addr
+ store i32 %a, ptr %a.addr
+ %0 = load i32, ptr %a.addr
+ %tobool = icmp ne i32 %0, 0
+ br i1 %tobool, label %if.then, label %if.end
+
+if.then:
+ %1 = load ptr, ptr %q.addr
+ %2 = load i32, ptr %1
+ %3 = load ptr, ptr %p.addr
+ store i32 %2, ptr %3
+ br label %if.end
+
+if.end:
+ ret void
+}
+
+;; successor 1 branches to successor 0
+define void @succ1to0(ptr %p, ptr %q, i32 %a) {
+; CHECK-LABEL: @succ1to0(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[TOBOOL:%.*]] = icmp ne i32 [[A:%.*]], 0
+; CHECK-NEXT: [[TMP0:%.*]] = bitcast i1 [[TOBOOL]] to <1 x i1>
+; CHECK-NEXT: [[TMP1:%.*]] = xor i1 [[TOBOOL]], true
+; CHECK-NEXT: [[TMP2:%.*]] = bitcast i1 [[TMP1]] to <1 x i1>
+; CHECK-NEXT: [[TMP3:%.*]] = call <1 x i32> @llvm.masked.load.v1i32.p0(ptr [[Q:%.*]], i32 4, <1 x i1> [[TMP2]], <1 x i32> poison)
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast <1 x i32> [[TMP3]] to i32
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast i32 [[TMP4]] to <1 x i32>
+; CHECK-NEXT: call void @llvm.masked.store.v1i32.p0(<1 x i32> [[TMP5]], ptr [[P:%.*]], i32 4, <1 x i1> [[TMP2]])
+; CHECK-NEXT: ret void
+;
+entry:
+ %tobool = icmp ne i32 %a, 0
+ br i1 %tobool, label %if.end, label %if.then
+
+if.end:
+ ret void
+
+if.then:
+ %0 = load i32, ptr %q
+ store i32 %0, ptr %p
+ br label %if.end
+}
+
+;; successor 0 branches to successor 1
+define void @succ0to1(i32 %a, ptr %b, ptr %p, ptr %q) {
+; CHECK-LABEL: @succ0to1(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[COND:%.*]] = icmp eq i32 [[A:%.*]], 0
+; CHECK-NEXT: [[TMP0:%.*]] = bitcast i1 [[COND]] to <1 x i1>
+; CHECK-NEXT: [[TMP1:%.*]] = call <1 x i32> @llvm.masked.load.v1i32.p0(ptr [[B:%.*]], i32 4, <1 x i1> [[TMP0]], <1 x i32> poison)
+; CHECK-NEXT: [[TMP2:%.*]] = bitcast <1 x i32> [[TMP1]] to i32
+; CHECK-NEXT: [[TMP3:%.*]] = bitcast i32 [[TMP2]] to <1 x i32>
+; CHECK-NEXT: call void @llvm.masked.store.v1i32.p0(<1 x i32> [[TMP3]], ptr [[P:%.*]], i32 4, <1 x i1> [[TMP0]])
+; CHECK-NEXT: [[TMP4:%.*]] = xor i1 [[COND]], true
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast i1 [[TMP4]] to <1 x i1>
+; CHECK-NEXT: call void @llvm.masked.store.v1i32.p0(<1 x i32> <i32 1>, ptr [[Q:%.*]], i32 4, <1 x i1> [[TMP5]])
+; CHECK-NEXT: ret void
+;
+entry:
+ %cond = icmp eq i32 %a, 0
+ br i1 %cond, label %if.true, label %if.false
+
+if.false:
+ store i32 1, ptr %q, align 4
+ br label %if.end
+
+if.true:
+ %0 = load i32, ptr %b, align 4
+ store i32 %0, ptr %p, align 4
+ br label %if.false
+
+if.end:
+ ret void
+}
+
+; i8 is not supported by conditional faulting
define void @not_supported_type(i8 %a, ptr %b, ptr %p, ptr %q) {
; CHECK-LABEL: @not_supported_type(
; CHECK-NEXT: entry:
@@ -103,36 +206,6 @@ if.end:
ret void
}
-define void @not_single_predecessor(i32 %a, ptr %b, ptr %p, ptr %q) {
-; CHECK-LABEL: @not_single_predecessor(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: [[COND:%.*]] = icmp eq i32 [[A:%.*]], 0
-; CHECK-NEXT: br i1 [[COND]], label [[IF_TRUE:%.*]], label [[IF_FALSE:%.*]]
-; CHECK: if.false:
-; CHECK-NEXT: store i32 1, ptr [[Q:%.*]], align 4
-; CHECK-NEXT: ret void
-; CHECK: if.true:
-; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[B:%.*]], align 4
-; CHECK-NEXT: store i32 [[TMP0]], ptr [[P:%.*]], align 4
-; CHECK-NEXT: br label [[IF_FALSE]]
-;
-entry:
- %cond = icmp eq i32 %a, 0
- br i1 %cond, label %if.true, label %if.false
-
-if.false:
- store i32 1, ptr %q, align 4
- br label %if.end
-
-if.true:
- %0 = load i32, ptr %b, align 4
- store i32 %0, ptr %p, align 4
- br label %if.false
-
-if.end:
- ret void
-}
-
define void @not_simple(i32 %a, ptr %b, ptr %p, ptr %q) {
; CHECK-LABEL: @not_simple(
; CHECK-NEXT: entry:
>From ca2c4c9f3f04ed3b3d204634f0900268804d4b55 Mon Sep 17 00:00:00 2001
From: Shengchen Kan <shengchen.kan at intel.com>
Date: Mon, 24 Jun 2024 00:47:21 +0800
Subject: [PATCH 04/12] DAG combine for cload/cstore
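A sketch mirroring the updated cload_passthru_zero checks below: the
setcc+neg sequence that materialized the mask folds away, and the conditional
load consumes the compare's flags directly (asm shown as comments):

  define i16 @sketch(i16 %a, ptr %b) {
  ; before: testw %di, %di / sete %al / negb %al / cfcmovnew (%rsi), %ax
  ; after:  testw %di, %di / cfcmovew (%rsi), %ax
  entry:
    %cond = icmp eq i16 %a, 0
    %m = bitcast i1 %cond to <1 x i1>
    %v = call <1 x i16> @llvm.masked.load.v1i16.p0(ptr %b, i32 4, <1 x i1> %m, <1 x i16> <i16 0>)
    %r = bitcast <1 x i16> %v to i16
    ret i16 %r
  }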
---
llvm/lib/Target/X86/X86ISelLowering.cpp | 32 +++++++++++++++++++++++++
llvm/test/CodeGen/X86/apx/cf.ll | 16 ++++---------
2 files changed, 36 insertions(+), 12 deletions(-)
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 2f89758c4783d..60303b5c082d4 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -55683,6 +55683,36 @@ static SDValue combineSubSetcc(SDNode *N, SelectionDAG &DAG) {
return SDValue();
}
+static SDValue combineX86CloadCstore(SDNode *N, SelectionDAG &DAG) {
+ // res, flags2 = sub 0, (setcc cc, flags)
+ // cload/cstore ..., cond_ne, flags2
+ // ->
+ // cload/cstore ..., cc, flags
+ //
+ // if res has no users.
+ if (N->getConstantOperandVal(3) != X86::COND_NE)
+ return SDValue();
+
+ SDNode *Sub = N->getOperand(4).getNode();
+ if (Sub->getOpcode() != X86ISD::SUB)
+ return SDValue();
+
+ SDValue Op1 = Sub->getOperand(1);
+
+ if (Sub->hasAnyUseOfValue(0) || !X86::isZeroNode(Sub->getOperand(0)) ||
+ Op1.getOpcode() != X86ISD::SETCC)
+ return SDValue();
+
+ SmallVector<SDValue> Ops(N->op_values());
+ Ops[3] = Op1.getOperand(0);
+ Ops[4] = Op1.getOperand(1);
+
+ return DAG.getMemIntrinsicNode(N->getOpcode(), SDLoc(N), N->getVTList(), Ops,
+ cast<MemSDNode>(N)->getMemoryVT(),
+ cast<MemSDNode>(N)->getMemOperand());
+}
+
static SDValue combineSub(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget &Subtarget) {
@@ -57390,6 +57420,8 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
case ISD::SUB: return combineSub(N, DAG, DCI, Subtarget);
case X86ISD::ADD:
case X86ISD::SUB: return combineX86AddSub(N, DAG, DCI, Subtarget);
+ case X86ISD::CLOAD:
+ case X86ISD::CSTORE: return combineX86CloadCstore(N, DAG);
case X86ISD::SBB: return combineSBB(N, DAG);
case X86ISD::ADC: return combineADC(N, DAG, DCI);
case ISD::MUL: return combineMul(N, DAG, DCI, Subtarget);
diff --git a/llvm/test/CodeGen/X86/apx/cf.ll b/llvm/test/CodeGen/X86/apx/cf.ll
index 0253a633fd4b0..6afa9b02d5403 100644
--- a/llvm/test/CodeGen/X86/apx/cf.ll
+++ b/llvm/test/CodeGen/X86/apx/cf.ll
@@ -5,12 +5,8 @@ define void @basic(i32 %a, ptr %b, ptr %p, ptr %q) {
; CHECK-LABEL: basic:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: testl %edi, %edi
-; CHECK-NEXT: setne %al
-; CHECK-NEXT: sete %dil
-; CHECK-NEXT: negb %dil
-; CHECK-NEXT: cfcmovnel (%rsi), %esi
-; CHECK-NEXT: cfcmovnel %esi, (%rdx)
-; CHECK-NEXT: negb %al
+; CHECK-NEXT: cfcmovel (%rsi), %eax
+; CHECK-NEXT: cfcmovel %eax, (%rdx)
; CHECK-NEXT: movl $1, %eax
; CHECK-NEXT: cfcmovneq %rax, (%rdx)
; CHECK-NEXT: movw $2, %ax
@@ -32,9 +28,7 @@ define i16 @cload_passthru_zero(i16 %a, ptr %b) {
; CHECK-LABEL: cload_passthru_zero:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: testw %di, %di
-; CHECK-NEXT: sete %al
-; CHECK-NEXT: negb %al
-; CHECK-NEXT: cfcmovnew (%rsi), %ax
+; CHECK-NEXT: cfcmovew (%rsi), %ax
; CHECK-NEXT: retq
entry:
%cond = icmp eq i16 %a, 0
@@ -48,9 +42,7 @@ define i64 @cload_passthru_not_zero(i64 %a, ptr %b) {
; CHECK-LABEL: cload_passthru_not_zero:
; CHECK: # %bb.0: # %entry
; CHECK-NEXT: testq %rdi, %rdi
-; CHECK-NEXT: sete %al
-; CHECK-NEXT: negb %al
-; CHECK-NEXT: cfcmovneq (%rsi), %rdi, %rax
+; CHECK-NEXT: cfcmoveq (%rsi), %rdi, %rax
; CHECK-NEXT: retq
entry:
%cond = icmp eq i64 %a, 0
>From 21933ef8d3800b96a7c4fbfc5003a6a6a0137b2e Mon Sep 17 00:00:00 2001
From: Shengchen Kan <shengchen.kan at intel.com>
Date: Mon, 24 Jun 2024 11:16:52 +0800
Subject: [PATCH 05/12] refine comment
---
llvm/lib/Transforms/Utils/SimplifyCFG.cpp | 6 +++---
1 file changed, 3 insertions(+), 3 deletions(-)
diff --git a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
index 892d48dba9bc1..e6c3290655760 100644
--- a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
+++ b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
@@ -3128,9 +3128,9 @@ bool SimplifyCFGOpt::hoistLoadStoreWithCondFaultingFromSuccessors(
if (auto *LI = dyn_cast<LoadInst>(I)) {
// Load
auto *Ty = I->getType();
- // NOTE: Now we assume conditional faulting load/store is supported when
- // creating new instructions, but it's easy to extend it for vector types
- // in the future.
+ // NOTE: Now we assume conditional faulting load/store is supported for
+ // scalar only when creating new instructions, but it's easy to extend it
+ // for vector types in the future.
assert(!Ty->isVectorTy() && "not implemented");
auto *V0 = Builder.CreateMaskedLoad(FixedVectorType::get(Ty, 1), Op0,
LI->getAlign(), Mask);
>From 4556bcd472fa6875a4a9a5f70ab218d53cc719a9 Mon Sep 17 00:00:00 2001
From: Shengchen Kan <shengchen.kan at intel.com>
Date: Mon, 24 Jun 2024 11:38:19 +0800
Subject: [PATCH 06/12] Check CF outside of loop
---
llvm/include/llvm/Analysis/TargetTransformInfo.h | 8 +++++---
llvm/lib/Target/X86/X86TargetTransformInfo.cpp | 10 +++++++---
llvm/lib/Target/X86/X86TargetTransformInfo.h | 2 +-
llvm/lib/Transforms/Utils/SimplifyCFG.cpp | 3 ++-
4 files changed, 15 insertions(+), 8 deletions(-)
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h
index 37afda39a1c9c..8775609ae0f4d 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfo.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h
@@ -1115,7 +1115,7 @@ class TargetTransformInfo {
/// \return true if the target supports load/store that enables fault
/// suppression of memory operands when the source condition is false.
- bool hasConditionalFaultingLoadStoreForType(Type *Ty) const;
+ bool hasConditionalFaultingLoadStoreForType(Type *Ty = nullptr) const;
/// \return the target-provided register class ID for the provided type,
/// accounting for type promotion and other type-legalization techniques that
@@ -1960,7 +1960,8 @@ class TargetTransformInfo::Concept {
virtual bool preferToKeepConstantsAttached(const Instruction &Inst,
const Function &Fn) const = 0;
virtual unsigned getNumberOfRegisters(unsigned ClassID) const = 0;
- virtual bool hasConditionalFaultingLoadStoreForType(Type *Ty) const = 0;
+ virtual bool
+ hasConditionalFaultingLoadStoreForType(Type *Ty = nullptr) const = 0;
virtual unsigned getRegisterClassForType(bool Vector,
Type *Ty = nullptr) const = 0;
virtual const char *getRegisterClassName(unsigned ClassID) const = 0;
@@ -2548,7 +2549,8 @@ class TargetTransformInfo::Model final : public TargetTransformInfo::Concept {
unsigned getNumberOfRegisters(unsigned ClassID) const override {
return Impl.getNumberOfRegisters(ClassID);
}
- bool hasConditionalFaultingLoadStoreForType(Type *Ty) const override {
+ bool
+ hasConditionalFaultingLoadStoreForType(Type *Ty = nullptr) const override {
return Impl.hasConditionalFaultingLoadStoreForType(Ty);
}
unsigned getRegisterClassForType(bool Vector,
diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
index 3c83c8403121b..5c302aafe0e38 100644
--- a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
+++ b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
@@ -177,11 +177,15 @@ unsigned X86TTIImpl::getNumberOfRegisters(unsigned ClassID) const {
}
bool X86TTIImpl::hasConditionalFaultingLoadStoreForType(Type *Ty) const {
+ if (!ST->hasCF())
+ return false;
+ if (!Ty)
+ return true;
// Conditional faulting is supported by CFCMOV, which only accepts
// 16/32/64-bit operands.
- // NOTE: Though VMOVSS/VMOVSD suppresses memory fault with zero mask, it has
- // performance penalty.
- if (!ST->hasCF() || !Ty || !Ty->isIntegerTy())
+  // TODO: Support f32/f64 via VMOVSS/VMOVSD with a zero mask when it's
+  // profitable.
+ if (!Ty->isIntegerTy())
return false;
switch (cast<IntegerType>(Ty)->getBitWidth()) {
default:
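The hunk above ends inside the switch. For reference, a sketch of the complete predicate after this change (not the verbatim source; the 16/32/64 cases are inferred from the 1 x i16/i32/i64 legality notes elsewhere in this series):

bool X86TTIImpl::hasConditionalFaultingLoadStoreForType(Type *Ty) const {
  if (!ST->hasCF())
    return false;
  // A null type asks whether the target supports conditional faulting at
  // all, letting callers bail out before scanning any blocks.
  if (!Ty)
    return true;
  // Conditional faulting is supported by CFCMOV, which only accepts
  // 16/32/64-bit operands.
  if (!Ty->isIntegerTy())
    return false;
  switch (cast<IntegerType>(Ty)->getBitWidth()) {
  default:
    return false;
  case 16:
  case 32:
  case 64:
    return true;
  }
}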
diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.h b/llvm/lib/Target/X86/X86TargetTransformInfo.h
index 701648c6a2b3a..80558478e547e 100644
--- a/llvm/lib/Target/X86/X86TargetTransformInfo.h
+++ b/llvm/lib/Target/X86/X86TargetTransformInfo.h
@@ -132,7 +132,7 @@ class X86TTIImpl : public BasicTTIImplBase<X86TTIImpl> {
/// @{
unsigned getNumberOfRegisters(unsigned ClassID) const;
- bool hasConditionalFaultingLoadStoreForType(Type *Ty) const;
+ bool hasConditionalFaultingLoadStoreForType(Type *Ty = nullptr) const;
TypeSize getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const;
unsigned getLoadStoreVecRegBitWidth(unsigned AS) const;
unsigned getMaxInterleaveFactor(ElementCount VF);
diff --git a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
index e6c3290655760..423c1bb9133a5 100644
--- a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
+++ b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
@@ -3029,7 +3029,8 @@ static bool isLoadFromAlloca(const Instruction &I) {
/// are handled first.
bool SimplifyCFGOpt::hoistLoadStoreWithCondFaultingFromSuccessors(
BasicBlock *BB) {
- if (!HoistLoadsStoresWithCondFaulting)
+ if (!HoistLoadsStoresWithCondFaulting ||
+ !TTI.hasConditionalFaultingLoadStoreForType())
return false;
auto *BI = dyn_cast<BranchInst>(BB->getTerminator());
>From cbda47567684c2976ce65b1d436275e9f3bcce38 Mon Sep 17 00:00:00 2001
From: Shengchen Kan <shengchen.kan at intel.com>
Date: Mon, 24 Jun 2024 12:01:39 +0800
Subject: [PATCH 07/12] address review comments
---
llvm/lib/Transforms/Utils/SimplifyCFG.cpp | 25 +++++++++++------------
1 file changed, 12 insertions(+), 13 deletions(-)
diff --git a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
index 423c1bb9133a5..194a3b12634cd 100644
--- a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
+++ b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
@@ -2968,7 +2968,8 @@ static bool validateAndCostRequiredSelects(BasicBlock *BB, BasicBlock *ThenBB,
}
static bool isLoadFromAlloca(const Instruction &I) {
- return isa<LoadInst>(I) && isa<AllocaInst>(I.getOperand(0));
+ return isa<LoadInst>(I) &&
+ isa<AllocaInst>(getUnderlyingObject(I.getOperand(0)));
}
/// Hoist load/store instructions from the conditional successor blocks up into
@@ -3065,35 +3066,33 @@ bool SimplifyCFGOpt::hoistLoadStoreWithCondFaultingFromSuccessors(
//
// b/c BB is the only predecessor and BranchInst does not define any value.
auto OpsDominatesBranch = [&](Instruction &I) {
- return llvm::none_of(I.operands(), [&](Value *Op) {
+ return llvm::all_of(I.operands(), [&](Value *Op) {
if (auto *J = dyn_cast<Instruction>(Op)) {
if (HoistedInsts.contains(J))
- return false;
- if (J->getParent() == I.getParent())
return true;
+ if (J->getParent() == I.getParent())
+ return false;
}
- return false;
+ return true;
});
};
for (auto &I : *BB) {
auto *LI = dyn_cast<LoadInst>(&I);
auto *SI = dyn_cast<StoreInst>(&I);
if (LI || SI) {
- // a load from alloca is always safe.
- if (isLoadFromAlloca(I)) {
- HoistedInsts.insert(&I);
- continue;
- }
- auto *Type = LI ? I.getType() : I.getOperand(0)->getType();
bool IsSimple = (LI && LI->isSimple()) || (SI && SI->isSimple());
- if (!TTI.hasConditionalFaultingLoadStoreForType(Type) || !IsSimple ||
- !OpsDominatesBranch(I))
+ if (!IsSimple || !OpsDominatesBranch(I))
+ return false;
+ auto *Type = LI ? I.getType() : I.getOperand(0)->getType();
+ // a load from alloca is always safe.
+ if (!isLoadFromAlloca(I) && !TTI.hasConditionalFaultingLoadStoreForType(Type))
return false;
if (SI && SkipMemoryRead)
return false;
HoistedInsts.insert(&I);
} else if (I.mayHaveSideEffects())
return false;
+ // Conservative aliasing check.
else if (I.mayReadFromMemory())
SkipMemoryRead = true;
}
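To make the inverted predicate concrete, here is a hypothetical function it rejects: %gep is defined in the successor itself and is neither hoisted nor defined above the branch, so OpsDominatesBranch fails on %v and the whole block is given up:

define i32 @reject_non_dominating_op(i1 %c, ptr %p) {
entry:
  br i1 %c, label %if.true, label %if.end

if.true:
  ; %gep lives in the same block as %v and is not a hoisted load/store.
  %gep = getelementptr i32, ptr %p, i64 1
  %v = load i32, ptr %gep
  br label %if.end

if.end:
  %r = phi i32 [ %v, %if.true ], [ 0, %entry ]
  ret i32 %r
}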
>From ea2c243506c17bfbde4e8cdcaa0c9bed4ee76fe3 Mon Sep 17 00:00:00 2001
From: Shengchen Kan <shengchen.kan at intel.com>
Date: Mon, 24 Jun 2024 12:17:10 +0800
Subject: [PATCH 08/12] address review comment
---
llvm/lib/Transforms/Utils/SimplifyCFG.cpp | 19 ++++++++++---------
1 file changed, 10 insertions(+), 9 deletions(-)
diff --git a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
index 194a3b12634cd..04cb7ae2edd4e 100644
--- a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
+++ b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
@@ -2967,11 +2967,6 @@ static bool validateAndCostRequiredSelects(BasicBlock *BB, BasicBlock *ThenBB,
return HaveRewritablePHIs;
}
-static bool isLoadFromAlloca(const Instruction &I) {
- return isa<LoadInst>(I) &&
- isa<AllocaInst>(getUnderlyingObject(I.getOperand(0)));
-}
-
/// Hoist load/store instructions from the conditional successor blocks up into
/// the block.
///
@@ -3048,6 +3043,12 @@ bool SimplifyCFGOpt::hoistLoadStoreWithCondFaultingFromSuccessors(
if (Succ->hasAddressTaken())
return false;
+  // Do not use isa<AllocaInst>(getUnderlyingObject(I.getOperand(0))) to avoid
+  // having to check that all intermediate operands dominate the branch.
+ auto IsLoadFromAlloca = [](const Instruction &I) {
+    return isa<LoadInst>(I) && isa<AllocaInst>(I.getOperand(0));
+ };
+
// Collect hoisted loads/stores.
SmallSetVector<Instruction *, 4> HoistedInsts;
// Not hoist load/store if
@@ -3085,7 +3086,7 @@ bool SimplifyCFGOpt::hoistLoadStoreWithCondFaultingFromSuccessors(
return false;
auto *Type = LI ? I.getType() : I.getOperand(0)->getType();
// a load from alloca is always safe.
- if (!isLoadFromAlloca(I) && !TTI.hasConditionalFaultingLoadStoreForType(Type))
+ if (!IsLoadFromAlloca(I) && !TTI.hasConditionalFaultingLoadStoreForType(Type))
return false;
if (SI && SkipMemoryRead)
return false;
@@ -3112,7 +3113,7 @@ bool SimplifyCFGOpt::hoistLoadStoreWithCondFaultingFromSuccessors(
Value *VCondNot = nullptr;
for (auto *I : HoistedInsts) {
// Only need to move the position for load from alloca.
- if (isLoadFromAlloca(*I)) {
+ if (IsLoadFromAlloca(*I)) {
I->moveBefore(BI);
continue;
}
@@ -3150,8 +3151,8 @@ bool SimplifyCFGOpt::hoistLoadStoreWithCondFaultingFromSuccessors(
// Erase the hoisted instructions in reverse order to avoid use-w/o-define
// error.
- std::for_each(HoistedInsts.rbegin(), HoistedInsts.rend(), [](auto I) {
- if (!isLoadFromAlloca(*I))
+ std::for_each(HoistedInsts.rbegin(), HoistedInsts.rend(), [&](auto I) {
+ if (!IsLoadFromAlloca(*I))
I->eraseFromParent();
});
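A hypothetical example of the alloca special case: the pointer operand of %v is directly an alloca, so the load cannot fault and is hoisted by simply moving it before the branch, with no masked load and no TTI query:

define i32 @hoist_alloca_load(i1 %c) {
entry:
  %s = alloca i32
  store i32 7, ptr %s
  br i1 %c, label %if.true, label %if.end

if.true:
  %v = load i32, ptr %s
  br label %if.end

if.end:
  %r = phi i32 [ %v, %if.true ], [ 0, %entry ]
  ret i32 %r
}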
>From cba2421456e40a390d7e9aad1cfb086291ee7930 Mon Sep 17 00:00:00 2001
From: Shengchen Kan <shengchen.kan at intel.com>
Date: Mon, 24 Jun 2024 14:39:02 +0800
Subject: [PATCH 09/12] add test for review comments
---
.../X86/hoist-load-store-with-cf.ll | 118 +++++++++++-------
1 file changed, 75 insertions(+), 43 deletions(-)
diff --git a/llvm/test/Transforms/SimplifyCFG/X86/hoist-load-store-with-cf.ll b/llvm/test/Transforms/SimplifyCFG/X86/hoist-load-store-with-cf.ll
index 7e7f2768cf302..c62e0cf742d7f 100644
--- a/llvm/test/Transforms/SimplifyCFG/X86/hoist-load-store-with-cf.ll
+++ b/llvm/test/Transforms/SimplifyCFG/X86/hoist-load-store-with-cf.ll
@@ -78,7 +78,7 @@ if.end:
ret void
}
-;; successor 1 branches to successor 0
+;; successor 1 branches to successor 0.
define void @succ1to0(ptr %p, ptr %q, i32 %a) {
; CHECK-LABEL: @succ1to0(
; CHECK-NEXT: entry:
@@ -105,7 +105,7 @@ if.then:
br label %if.end
}
-;; successor 0 branches to successor 1
+;; successor 0 branches to successor 1.
define void @succ0to1(i32 %a, ptr %b, ptr %p, ptr %q) {
; CHECK-LABEL: @succ0to1(
; CHECK-NEXT: entry:
@@ -125,18 +125,88 @@ entry:
br i1 %cond, label %if.true, label %if.false
if.false:
- store i32 1, ptr %q, align 4
+ store i32 1, ptr %q
br label %if.end
if.true:
- %0 = load i32, ptr %b, align 4
- store i32 %0, ptr %p, align 4
+ %0 = load i32, ptr %b
+ store i32 %0, ptr %p
br label %if.false
if.end:
ret void
}
+;; load after store can be hoisted.
+define i64 @load_after_store(i32 %a, ptr %b, ptr %p, ptr %q) {
+; CHECK-LABEL: @load_after_store(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[COND:%.*]] = icmp eq i32 [[A:%.*]], 0
+; CHECK-NEXT: [[TMP0:%.*]] = bitcast i1 [[COND]] to <1 x i1>
+; CHECK-NEXT: call void @llvm.masked.store.v1i32.p0(<1 x i32> <i32 1>, ptr [[B:%.*]], i32 4, <1 x i1> [[TMP0]])
+; CHECK-NEXT: [[TMP1:%.*]] = call <1 x i16> @llvm.masked.load.v1i16.p0(ptr [[P:%.*]], i32 2, <1 x i1> [[TMP0]], <1 x i16> poison)
+; CHECK-NEXT: [[TMP2:%.*]] = bitcast <1 x i16> [[TMP1]] to i16
+; CHECK-NEXT: [[TMP3:%.*]] = call <1 x i64> @llvm.masked.load.v1i64.p0(ptr [[Q:%.*]], i32 8, <1 x i1> [[TMP0]], <1 x i64> poison)
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast <1 x i64> [[TMP3]] to i64
+; CHECK-NEXT: [[ZEXT:%.*]] = zext i16 [[TMP2]] to i64
+; CHECK-NEXT: [[ADD:%.*]] = add i64 [[ZEXT]], [[TMP4]]
+; CHECK-NEXT: [[COMMON_RET_OP:%.*]] = select i1 [[COND]], i64 [[ADD]], i64 0
+; CHECK-NEXT: ret i64 [[COMMON_RET_OP]]
+;
+entry:
+ %cond = icmp eq i32 %a, 0
+ br i1 %cond, label %if.true, label %if.end
+
+if.true:
+ store i32 1, ptr %b
+ %0 = load i16, ptr %p
+ %1 = load i64, ptr %q
+ %zext = zext i16 %0 to i64
+ %add = add i64 %zext, %1
+ ret i64 %add
+
+if.end:
+ ret i64 0
+}
+
+define void @load_skip_memory_read(i32 %a, ptr %b, ptr %p, ptr %q) {
+; CHECK-LABEL: @load_skip_memory_read(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[COND:%.*]] = icmp eq i32 [[A:%.*]], 0
+; CHECK-NEXT: [[TMP0:%.*]] = bitcast i1 [[COND]] to <1 x i1>
+; CHECK-NEXT: [[TMP1:%.*]] = call <1 x i32> @llvm.masked.load.v1i32.p0(ptr [[B:%.*]], i32 4, <1 x i1> [[TMP0]], <1 x i32> poison)
+; CHECK-NEXT: [[TMP2:%.*]] = bitcast <1 x i32> [[TMP1]] to i32
+; CHECK-NEXT: [[TMP3:%.*]] = bitcast i32 [[TMP2]] to <1 x i32>
+; CHECK-NEXT: call void @llvm.masked.store.v1i32.p0(<1 x i32> [[TMP3]], ptr [[P:%.*]], i32 4, <1 x i1> [[TMP0]])
+; CHECK-NEXT: [[TMP4:%.*]] = xor i1 [[COND]], true
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast i1 [[TMP4]] to <1 x i1>
+; CHECK-NEXT: [[TMP6:%.*]] = call <1 x i32> @llvm.masked.load.v1i32.p0(ptr [[Q:%.*]], i32 4, <1 x i1> [[TMP5]], <1 x i32> poison)
+; CHECK-NEXT: [[TMP7:%.*]] = bitcast <1 x i32> [[TMP6]] to i32
+; CHECK-NEXT: br i1 [[COND]], label [[IF_END:%.*]], label [[IF_FALSE:%.*]]
+; CHECK: if.false:
+; CHECK-NEXT: call void @read_memory_only()
+; CHECK-NEXT: br label [[IF_END]]
+; CHECK: if.end:
+; CHECK-NEXT: ret void
+;
+entry:
+ %cond = icmp eq i32 %a, 0
+ br i1 %cond, label %if.true, label %if.false
+
+if.false:
+ call void @read_memory_only()
+ %0 = load i32, ptr %q, align 4
+ br label %if.end
+
+if.true:
+ %1 = load i32, ptr %b, align 4
+ store i32 %1, ptr %p, align 4
+ br label %if.end
+
+if.end:
+ ret void
+}
+
; i8 is not supported by conditional faulting
define void @not_supported_type(i8 %a, ptr %b, ptr %p, ptr %q) {
; CHECK-LABEL: @not_supported_type(
@@ -338,44 +408,6 @@ if.end:
ret void
}
-define void @load_skip_memory_read(i32 %a, ptr %b, ptr %p, ptr %q) {
-; CHECK-LABEL: @load_skip_memory_read(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: [[COND:%.*]] = icmp eq i32 [[A:%.*]], 0
-; CHECK-NEXT: [[TMP0:%.*]] = bitcast i1 [[COND]] to <1 x i1>
-; CHECK-NEXT: [[TMP1:%.*]] = call <1 x i32> @llvm.masked.load.v1i32.p0(ptr [[B:%.*]], i32 4, <1 x i1> [[TMP0]], <1 x i32> poison)
-; CHECK-NEXT: [[TMP2:%.*]] = bitcast <1 x i32> [[TMP1]] to i32
-; CHECK-NEXT: [[TMP3:%.*]] = bitcast i32 [[TMP2]] to <1 x i32>
-; CHECK-NEXT: call void @llvm.masked.store.v1i32.p0(<1 x i32> [[TMP3]], ptr [[P:%.*]], i32 4, <1 x i1> [[TMP0]])
-; CHECK-NEXT: [[TMP4:%.*]] = xor i1 [[COND]], true
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast i1 [[TMP4]] to <1 x i1>
-; CHECK-NEXT: [[TMP6:%.*]] = call <1 x i32> @llvm.masked.load.v1i32.p0(ptr [[Q:%.*]], i32 4, <1 x i1> [[TMP5]], <1 x i32> poison)
-; CHECK-NEXT: [[TMP7:%.*]] = bitcast <1 x i32> [[TMP6]] to i32
-; CHECK-NEXT: br i1 [[COND]], label [[IF_END:%.*]], label [[IF_FALSE:%.*]]
-; CHECK: if.false:
-; CHECK-NEXT: call void @read_memory_only()
-; CHECK-NEXT: br label [[IF_END]]
-; CHECK: if.end:
-; CHECK-NEXT: ret void
-;
-entry:
- %cond = icmp eq i32 %a, 0
- br i1 %cond, label %if.true, label %if.false
-
-if.false:
- call void @read_memory_only()
- %0 = load i32, ptr %q, align 4
- br label %if.end
-
-if.true:
- %1 = load i32, ptr %b, align 4
- store i32 %1, ptr %p, align 4
- br label %if.end
-
-if.end:
- ret void
-}
-
define void @not_store_skip_memory_read(i32 %a, ptr %b, ptr %p, ptr %q) {
; CHECK-LABEL: @not_store_skip_memory_read(
; CHECK-NEXT: entry:
>From 1bc2f5ee58126f326cb1890838b7c566e9fe6939 Mon Sep 17 00:00:00 2001
From: Shengchen Kan <shengchen.kan at intel.com>
Date: Mon, 24 Jun 2024 14:52:21 +0800
Subject: [PATCH 10/12] move comment to test
---
llvm/test/CodeGen/X86/apx/cf.ll | 3 +++
1 file changed, 3 insertions(+)
diff --git a/llvm/test/CodeGen/X86/apx/cf.ll b/llvm/test/CodeGen/X86/apx/cf.ll
index 6afa9b02d5403..1669c6c04c45a 100644
--- a/llvm/test/CodeGen/X86/apx/cf.ll
+++ b/llvm/test/CodeGen/X86/apx/cf.ll
@@ -53,6 +53,9 @@ entry:
ret i64 %2
}
+;; No need to optimize the generated assembly for cond_false/cond_true b/c it
+;; should never be emitted by the middle end. The IR is added here just to
+;; check that it's legal to feed a constant mask to the backend.
define i16 @cond_false(ptr %b) {
; CHECK-LABEL: cond_false:
; CHECK: # %bb.0: # %entry
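For context, a sketch of the constant-mask IR such a test feeds to the backend (assumed shape, not the verbatim test body): the mask is a constant all-false vector, so SimplifyCFG would never produce it, but ISel must still accept it:

define i16 @cond_false_sketch(ptr %b) {
entry:
  %v = call <1 x i16> @llvm.masked.load.v1i16.p0(ptr %b, i32 2, <1 x i1> zeroinitializer, <1 x i16> poison)
  %r = bitcast <1 x i16> %v to i16
  ret i16 %r
}
declare <1 x i16> @llvm.masked.load.v1i16.p0(ptr, i32, <1 x i1>, <1 x i16>)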
>From dc32975db0d31daf98617f578882a6d9dc0f437d Mon Sep 17 00:00:00 2001
From: Shengchen Kan <shengchen.kan at intel.com>
Date: Mon, 24 Jun 2024 22:48:47 +0800
Subject: [PATCH 11/12] mayHaveSideEffects() -> !isSafeToSpeculativelyExecute
---
llvm/lib/Transforms/Utils/SimplifyCFG.cpp | 8 +++---
.../X86/hoist-load-store-with-cf.ll | 26 +++++++++----------
2 files changed, 17 insertions(+), 17 deletions(-)
diff --git a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
index 04cb7ae2edd4e..d765c05b54e33 100644
--- a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
+++ b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
@@ -3055,7 +3055,8 @@ bool SimplifyCFGOpt::hoistLoadStoreWithCondFaultingFromSuccessors(
// 1. target does not have corresponding conditional faulting load/store.
// 2. it's volatile or atomic.
// 3. there is a load/store that cannot be hoisted in the same bb.
- // 4. there is a non-load/store that may have side effects in the same bb.
+ // 4. there is a non-load/store that's not safe to speculatively execute
+ // in the same bb.
// 5. any operand of it does not dominate the branch.
// 6. it's a store and a memory read is skipped.
auto HoistInstsInBB = [&](BasicBlock *BB) {
@@ -3086,12 +3087,13 @@ bool SimplifyCFGOpt::hoistLoadStoreWithCondFaultingFromSuccessors(
return false;
auto *Type = LI ? I.getType() : I.getOperand(0)->getType();
// a load from alloca is always safe.
- if (!IsLoadFromAlloca(I) && !TTI.hasConditionalFaultingLoadStoreForType(Type))
+ if (!IsLoadFromAlloca(I) &&
+ !TTI.hasConditionalFaultingLoadStoreForType(Type))
return false;
if (SI && SkipMemoryRead)
return false;
HoistedInsts.insert(&I);
- } else if (I.mayHaveSideEffects())
+ } else if (!I.isTerminator() && !isSafeToSpeculativelyExecute(&I))
return false;
// Conservative aliasing check.
else if (I.mayReadFromMemory())
diff --git a/llvm/test/Transforms/SimplifyCFG/X86/hoist-load-store-with-cf.ll b/llvm/test/Transforms/SimplifyCFG/X86/hoist-load-store-with-cf.ll
index c62e0cf742d7f..2fd0055cf05f9 100644
--- a/llvm/test/Transforms/SimplifyCFG/X86/hoist-load-store-with-cf.ll
+++ b/llvm/test/Transforms/SimplifyCFG/X86/hoist-load-store-with-cf.ll
@@ -169,8 +169,8 @@ if.end:
ret i64 0
}
-define void @load_skip_memory_read(i32 %a, ptr %b, ptr %p, ptr %q) {
-; CHECK-LABEL: @load_skip_memory_read(
+define i32 @load_skip_speculatable_memory_read(i32 %a, ptr %b, ptr %p, ptr %q) {
+; CHECK-LABEL: @load_skip_speculatable_memory_read(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[COND:%.*]] = icmp eq i32 [[A:%.*]], 0
; CHECK-NEXT: [[TMP0:%.*]] = bitcast i1 [[COND]] to <1 x i1>
@@ -182,29 +182,27 @@ define void @load_skip_memory_read(i32 %a, ptr %b, ptr %p, ptr %q) {
; CHECK-NEXT: [[TMP5:%.*]] = bitcast i1 [[TMP4]] to <1 x i1>
; CHECK-NEXT: [[TMP6:%.*]] = call <1 x i32> @llvm.masked.load.v1i32.p0(ptr [[Q:%.*]], i32 4, <1 x i1> [[TMP5]], <1 x i32> poison)
; CHECK-NEXT: [[TMP7:%.*]] = bitcast <1 x i32> [[TMP6]] to i32
-; CHECK-NEXT: br i1 [[COND]], label [[IF_END:%.*]], label [[IF_FALSE:%.*]]
-; CHECK: if.false:
-; CHECK-NEXT: call void @read_memory_only()
-; CHECK-NEXT: br label [[IF_END]]
-; CHECK: if.end:
-; CHECK-NEXT: ret void
+; CHECK-NEXT: [[READ:%.*]] = call i32 @read_memory_only()
+; CHECK-NEXT: [[PHI:%.*]] = select i1 [[COND]], i32 0, i32 [[READ]]
+; CHECK-NEXT: ret i32 [[PHI]]
;
entry:
%cond = icmp eq i32 %a, 0
br i1 %cond, label %if.true, label %if.false
if.false:
- call void @read_memory_only()
- %0 = load i32, ptr %q, align 4
+ %read = call i32 @read_memory_only()
+ %0 = load i32, ptr %q
br label %if.end
if.true:
- %1 = load i32, ptr %b, align 4
- store i32 %1, ptr %p, align 4
+ %1 = load i32, ptr %b
+ store i32 %1, ptr %p
br label %if.end
if.end:
- ret void
+ %phi = phi i32 [%read, %if.false], [0, %if.true]
+ ret i32 %phi
}
; i8 is not supported by conditional faulting
@@ -442,7 +440,7 @@ if.end:
ret void
}
-declare void @read_memory_only() readonly nounwind willreturn
+declare i32 @read_memory_only() readonly nounwind willreturn speculatable
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!2, !3, !4, !5, !6, !7}
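For contrast, a hypothetical case that still bails out after this change: isSafeToSpeculativelyExecute only returns true for a call whose callee is marked speculatable, so a plain readonly call in the successor keeps the block from being hoisted:

define i32 @not_hoisted_non_speculatable(i32 %a, ptr %b) {
entry:
  %cond = icmp eq i32 %a, 0
  br i1 %cond, label %if.true, label %if.end

if.true:
  %read = call i32 @plain_readonly_call() ; readonly but not speculatable
  %0 = load i32, ptr %b
  br label %if.end

if.end:
  %phi = phi i32 [ %read, %if.true ], [ 0, %entry ]
  ret i32 %phi
}
declare i32 @plain_readonly_call() readonly nounwind willreturn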
>From 7635932ee4f5bf3cb1af7d25764276530653bca5 Mon Sep 17 00:00:00 2001
From: Shengchen Kan <shengchen.kan at intel.com>
Date: Mon, 24 Jun 2024 23:39:16 +0800
Subject: [PATCH 12/12] address review comment
---
.../llvm/Analysis/TargetTransformInfo.h | 10 ++++------
.../llvm/Analysis/TargetTransformInfoImpl.h | 2 +-
llvm/include/llvm/CodeGen/TargetLowering.h | 14 ++++++++------
llvm/lib/Analysis/TargetTransformInfo.cpp | 5 ++---
.../SelectionDAG/SelectionDAGBuilder.cpp | 13 ++++++-------
llvm/lib/Target/X86/X86ISelLowering.cpp | 9 +++++----
llvm/lib/Target/X86/X86ISelLowering.h | 18 +++++++-----------
llvm/lib/Target/X86/X86TargetTransformInfo.cpp | 4 ++--
llvm/lib/Target/X86/X86TargetTransformInfo.h | 2 +-
llvm/lib/Transforms/Utils/SimplifyCFG.cpp | 5 ++---
10 files changed, 38 insertions(+), 44 deletions(-)
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h
index 8775609ae0f4d..f5c0127e1d422 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfo.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h
@@ -1115,7 +1115,7 @@ class TargetTransformInfo {
/// \return true if the target supports load/store that enables fault
/// suppression of memory operands when the source condition is false.
- bool hasConditionalFaultingLoadStoreForType(Type *Ty = nullptr) const;
+ bool hasConditionalLoadStoreForType(Type *Ty = nullptr) const;
/// \return the target-provided register class ID for the provided type,
/// accounting for type promotion and other type-legalization techniques that
@@ -1960,8 +1960,7 @@ class TargetTransformInfo::Concept {
virtual bool preferToKeepConstantsAttached(const Instruction &Inst,
const Function &Fn) const = 0;
virtual unsigned getNumberOfRegisters(unsigned ClassID) const = 0;
- virtual bool
- hasConditionalFaultingLoadStoreForType(Type *Ty = nullptr) const = 0;
+ virtual bool hasConditionalLoadStoreForType(Type *Ty = nullptr) const = 0;
virtual unsigned getRegisterClassForType(bool Vector,
Type *Ty = nullptr) const = 0;
virtual const char *getRegisterClassName(unsigned ClassID) const = 0;
@@ -2549,9 +2548,8 @@ class TargetTransformInfo::Model final : public TargetTransformInfo::Concept {
unsigned getNumberOfRegisters(unsigned ClassID) const override {
return Impl.getNumberOfRegisters(ClassID);
}
- bool
- hasConditionalFaultingLoadStoreForType(Type *Ty = nullptr) const override {
- return Impl.hasConditionalFaultingLoadStoreForType(Ty);
+ bool hasConditionalLoadStoreForType(Type *Ty = nullptr) const override {
+ return Impl.hasConditionalLoadStoreForType(Ty);
}
unsigned getRegisterClassForType(bool Vector,
Type *Ty = nullptr) const override {
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
index a4aa836ed82d3..49b4bd00baed4 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -457,7 +457,7 @@ class TargetTransformInfoImplBase {
}
unsigned getNumberOfRegisters(unsigned ClassID) const { return 8; }
- bool hasConditionalFaultingLoadStoreForType(Type *Ty) const { return false; }
+ bool hasConditionalLoadStoreForType(Type *Ty) const { return false; }
unsigned getRegisterClassForType(bool Vector, Type *Ty = nullptr) const {
return Vector ? 1 : 0;
diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h
index 2b0a45133bb0e..9a0df8b29d752 100644
--- a/llvm/include/llvm/CodeGen/TargetLowering.h
+++ b/llvm/include/llvm/CodeGen/TargetLowering.h
@@ -3895,15 +3895,17 @@ class TargetLowering : public TargetLoweringBase {
const SDValue OldRHS, SDValue &Chain,
bool IsSignaling = false) const;
- virtual SDValue visitMaskedLoadForCondFaulting(
- SelectionDAG &DAG, const SDLoc &DL, SDValue Chain, MachineMemOperand *MMO,
- SDValue &NewLoad, SDValue Ptr, SDValue PassThru, SDValue Mask) const {
+ virtual SDValue visitMaskedLoad(SelectionDAG &DAG, const SDLoc &DL,
+ SDValue Chain, MachineMemOperand *MMO,
+ SDValue &NewLoad, SDValue Ptr,
+ SDValue PassThru, SDValue Mask) const {
llvm_unreachable("Not Implemented");
}
- virtual SDValue visitMaskedStoreForCondFaulting(
- SelectionDAG &DAG, const SDLoc &DL, SDValue Chain, MachineMemOperand *MMO,
- SDValue Ptr, SDValue Val, SDValue Mask) const {
+ virtual SDValue visitMaskedStore(SelectionDAG &DAG, const SDLoc &DL,
+ SDValue Chain, MachineMemOperand *MMO,
+ SDValue Ptr, SDValue Val,
+ SDValue Mask) const {
llvm_unreachable("Not Implemented");
}
diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp
index 46936f266bf46..0db8a4201fead 100644
--- a/llvm/lib/Analysis/TargetTransformInfo.cpp
+++ b/llvm/lib/Analysis/TargetTransformInfo.cpp
@@ -722,9 +722,8 @@ unsigned TargetTransformInfo::getNumberOfRegisters(unsigned ClassID) const {
return TTIImpl->getNumberOfRegisters(ClassID);
}
-bool TargetTransformInfo::hasConditionalFaultingLoadStoreForType(
- Type *Ty) const {
- return TTIImpl->hasConditionalFaultingLoadStoreForType(Ty);
+bool TargetTransformInfo::hasConditionalLoadStoreForType(Type *Ty) const {
+ return TTIImpl->hasConditionalLoadStoreForType(Ty);
}
unsigned TargetTransformInfo::getRegisterClassForType(bool Vector,
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
index 24dbe6efabbca..1f9e73ef949e8 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@@ -4788,12 +4788,12 @@ void SelectionDAGBuilder::visitMaskedStore(const CallInst &I,
const auto &TTI =
TLI.getTargetMachine().getTargetTransformInfo(*I.getFunction());
SDValue StoreNode =
- (!IsCompressing && TTI.hasConditionalFaultingLoadStoreForType(
+ (!IsCompressing && TTI.hasConditionalLoadStoreForType(
I.getArgOperand(0)->getType()->getScalarType()))
- ? TLI.visitMaskedStoreForCondFaulting(DAG, sdl, getMemoryRoot(), MMO,
- Ptr, Src0, Mask)
+ ? TLI.visitMaskedStore(DAG, sdl, getMemoryRoot(), MMO, Ptr, Src0,
+ Mask)
: DAG.getMaskedStore(getMemoryRoot(), sdl, Src0, Ptr, Offset, Mask,
- VT, MMO, ISD::UNINDEXED, false /* Truncating */,
+ VT, MMO, ISD::UNINDEXED, /*Truncating=*/false,
IsCompressing);
DAG.setRoot(StoreNode);
setValue(&I, StoreNode);
@@ -4973,10 +4973,9 @@ void SelectionDAGBuilder::visitMaskedLoad(const CallInst &I, bool IsExpanding) {
// The Load/Res may point to different values.
SDValue Load;
SDValue Res;
- if (!IsExpanding && TTI.hasConditionalFaultingLoadStoreForType(
+ if (!IsExpanding && TTI.hasConditionalLoadStoreForType(
Src0Operand->getType()->getScalarType()))
- Res = TLI.visitMaskedLoadForCondFaulting(DAG, sdl, InChain, MMO, Load, Ptr,
- Src0, Mask);
+ Res = TLI.visitMaskedLoad(DAG, sdl, InChain, MMO, Load, Ptr, Src0, Mask);
else
Res = Load =
DAG.getMaskedLoad(VT, sdl, InChain, Ptr, Offset, Mask, Src0, VT, MMO,
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 60303b5c082d4..a45e18ae67a91 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -32319,7 +32319,7 @@ static SDValue getFlagsOfCmpZeroFori1(SelectionDAG &DAG, const SDLoc &DL,
return SDValue(CmpZero.getNode(), 1);
}
-SDValue X86TargetLowering::visitMaskedLoadForCondFaulting(
+SDValue X86TargetLowering::visitMaskedLoad(
SelectionDAG &DAG, const SDLoc &DL, SDValue Chain, MachineMemOperand *MMO,
SDValue &NewLoad, SDValue Ptr, SDValue PassThru, SDValue Mask) const {
// @llvm.masked.load.*(ptr, alignment, mask, passthru)
@@ -32339,9 +32339,10 @@ SDValue X86TargetLowering::visitMaskedLoadForCondFaulting(
return DAG.getBitcast(VTy, NewLoad);
}
-SDValue X86TargetLowering::visitMaskedStoreForCondFaulting(
- SelectionDAG &DAG, const SDLoc &DL, SDValue Chain, MachineMemOperand *MMO,
- SDValue Ptr, SDValue Val, SDValue Mask) const {
+SDValue X86TargetLowering::visitMaskedStore(SelectionDAG &DAG, const SDLoc &DL,
+ SDValue Chain,
+ MachineMemOperand *MMO, SDValue Ptr,
+ SDValue Val, SDValue Mask) const {
// llvm.masked.store.*(Src0, Ptr, alignment, Mask)
// ->
// _, flags = SUB 0, mask
diff --git a/llvm/lib/Target/X86/X86ISelLowering.h b/llvm/lib/Target/X86/X86ISelLowering.h
index 05ef982ef2023..362daa98e1f8e 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.h
+++ b/llvm/lib/Target/X86/X86ISelLowering.h
@@ -1560,17 +1560,13 @@ namespace llvm {
bool isInlineAsmTargetBranch(const SmallVectorImpl<StringRef> &AsmStrs,
unsigned OpNo) const override;
- SDValue visitMaskedLoadForCondFaulting(SelectionDAG &DAG, const SDLoc &DL,
- SDValue Chain,
- MachineMemOperand *MMO,
- SDValue &NewLoad, SDValue Ptr,
- SDValue PassThru,
- SDValue Mask) const override;
- SDValue visitMaskedStoreForCondFaulting(SelectionDAG &DAG, const SDLoc &DL,
- SDValue Chain,
- MachineMemOperand *MMO, SDValue Ptr,
- SDValue Val,
- SDValue Mask) const override;
+ SDValue visitMaskedLoad(SelectionDAG &DAG, const SDLoc &DL, SDValue Chain,
+ MachineMemOperand *MMO, SDValue &NewLoad,
+ SDValue Ptr, SDValue PassThru,
+ SDValue Mask) const override;
+ SDValue visitMaskedStore(SelectionDAG &DAG, const SDLoc &DL, SDValue Chain,
+ MachineMemOperand *MMO, SDValue Ptr, SDValue Val,
+ SDValue Mask) const override;
/// Lower interleaved load(s) into target specific
/// instructions/intrinsics.
diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
index 5c302aafe0e38..aad4b9039bbb1 100644
--- a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
+++ b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
@@ -176,7 +176,7 @@ unsigned X86TTIImpl::getNumberOfRegisters(unsigned ClassID) const {
return 8;
}
-bool X86TTIImpl::hasConditionalFaultingLoadStoreForType(Type *Ty) const {
+bool X86TTIImpl::hasConditionalLoadStoreForType(Type *Ty) const {
if (!ST->hasCF())
return false;
if (!Ty)
@@ -5923,7 +5923,7 @@ bool X86TTIImpl::isLegalMaskedLoad(Type *DataTy, Align Alignment) {
Type *ScalarTy = DataTy->getScalarType();
if (ST->hasCF() && IsSingleElementVector &&
- hasConditionalFaultingLoadStoreForType(ScalarTy))
+ hasConditionalLoadStoreForType(ScalarTy))
return true;
if (!ST->hasAVX())
diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.h b/llvm/lib/Target/X86/X86TargetTransformInfo.h
index 80558478e547e..e6bb4720071d5 100644
--- a/llvm/lib/Target/X86/X86TargetTransformInfo.h
+++ b/llvm/lib/Target/X86/X86TargetTransformInfo.h
@@ -132,7 +132,7 @@ class X86TTIImpl : public BasicTTIImplBase<X86TTIImpl> {
/// @{
unsigned getNumberOfRegisters(unsigned ClassID) const;
- bool hasConditionalFaultingLoadStoreForType(Type *Ty = nullptr) const;
+ bool hasConditionalLoadStoreForType(Type *Ty = nullptr) const;
TypeSize getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const;
unsigned getLoadStoreVecRegBitWidth(unsigned AS) const;
unsigned getMaxInterleaveFactor(ElementCount VF);
diff --git a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
index d765c05b54e33..e579cf584d80d 100644
--- a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
+++ b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
@@ -3026,7 +3026,7 @@ static bool validateAndCostRequiredSelects(BasicBlock *BB, BasicBlock *ThenBB,
bool SimplifyCFGOpt::hoistLoadStoreWithCondFaultingFromSuccessors(
BasicBlock *BB) {
if (!HoistLoadsStoresWithCondFaulting ||
- !TTI.hasConditionalFaultingLoadStoreForType())
+ !TTI.hasConditionalLoadStoreForType())
return false;
auto *BI = dyn_cast<BranchInst>(BB->getTerminator());
@@ -3087,8 +3087,7 @@ bool SimplifyCFGOpt::hoistLoadStoreWithCondFaultingFromSuccessors(
return false;
auto *Type = LI ? I.getType() : I.getOperand(0)->getType();
// a load from alloca is always safe.
- if (!IsLoadFromAlloca(I) &&
- !TTI.hasConditionalFaultingLoadStoreForType(Type))
+ if (!IsLoadFromAlloca(I) && !TTI.hasConditionalLoadStoreForType(Type))
return false;
if (SI && SkipMemoryRead)
return false;