[llvm] [PPC] Implement `areInlineCompatible` (PR #126562)
via llvm-commits
llvm-commits at lists.llvm.org
Mon Feb 10 10:07:54 PST 2025
llvmbot wrote:
@llvm/pr-subscribers-backend-powerpc
Author: Henry Jiang (mustartt)
<details>
<summary>Changes</summary>
Since the default implementation swap in https://github.com/llvm/llvm-project/pull/117493, `areInlineCompatible` checks whether the callee's features are a subset of the caller's features. That is not a safe assumption in general on PPC, so for now we fall back to requiring strict feature-set equality and can revisit later to see what improvements we can make.
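For context, the default implementation after that PR decides compatibility with (roughly) a feature-subset test over the subtarget's `FeatureBitset`, whereas the PPC override in this patch demands exact equality. Below is a minimal, self-contained sketch of the two policies; `std::bitset` stands in for `llvm::FeatureBitset`, and the helper names are purely illustrative:

```cpp
#include <bitset>
#include <cassert>

// Stand-in for llvm::FeatureBitset, which wraps a fixed-size bitset.
using FeatureBits = std::bitset<64>;

// Roughly the generic default after #117493: inlining is allowed when every
// feature the callee was compiled with is also enabled in the caller.
bool subsetCompatible(const FeatureBits &Caller, const FeatureBits &Callee) {
  return (Caller & Callee) == Callee;
}

// Policy this patch installs for PPC: require the exact same feature set.
bool strictCompatible(const FeatureBits &Caller, const FeatureBits &Callee) {
  return Caller == Callee;
}

int main() {
  FeatureBits Caller("1011"), Callee("0011");
  assert(subsetCompatible(Caller, Callee));  // callee bits are a subset
  assert(!strictCompatible(Caller, Callee)); // but not identical, so the
                                             // PPC override rejects inlining
}
```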
---
Full diff: https://github.com/llvm/llvm-project/pull/126562.diff
2 Files Affected:
- (modified) llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp (+73-62)
- (modified) llvm/lib/Target/PowerPC/PPCTargetTransformInfo.h (+2)
``````````diff
diff --git a/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp b/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp
index c308ec332e84434..885e4b3fb323093 100644
--- a/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp
+++ b/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp
@@ -25,26 +25,29 @@ using namespace llvm;
#define DEBUG_TYPE "ppctti"
static cl::opt<bool> VecMaskCost("ppc-vec-mask-cost",
-cl::desc("add masking cost for i1 vectors"), cl::init(true), cl::Hidden);
+ cl::desc("add masking cost for i1 vectors"),
+ cl::init(true), cl::Hidden);
-static cl::opt<bool> DisablePPCConstHoist("disable-ppc-constant-hoisting",
-cl::desc("disable constant hoisting on PPC"), cl::init(false), cl::Hidden);
+static cl::opt<bool>
+ DisablePPCConstHoist("disable-ppc-constant-hoisting",
+ cl::desc("disable constant hoisting on PPC"),
+ cl::init(false), cl::Hidden);
static cl::opt<bool>
-EnablePPCColdCC("ppc-enable-coldcc", cl::Hidden, cl::init(false),
- cl::desc("Enable using coldcc calling conv for cold "
- "internal functions"));
+ EnablePPCColdCC("ppc-enable-coldcc", cl::Hidden, cl::init(false),
+ cl::desc("Enable using coldcc calling conv for cold "
+ "internal functions"));
static cl::opt<bool>
-LsrNoInsnsCost("ppc-lsr-no-insns-cost", cl::Hidden, cl::init(false),
- cl::desc("Do not add instruction count to lsr cost model"));
+ LsrNoInsnsCost("ppc-lsr-no-insns-cost", cl::Hidden, cl::init(false),
+ cl::desc("Do not add instruction count to lsr cost model"));
// The latency of mtctr is only justified if there are more than 4
// comparisons that will be removed as a result.
-static cl::opt<unsigned>
-SmallCTRLoopThreshold("min-ctr-loop-threshold", cl::init(4), cl::Hidden,
- cl::desc("Loops with a constant trip count smaller than "
- "this value will not use the count register."));
+static cl::opt<unsigned> SmallCTRLoopThreshold(
+ "min-ctr-loop-threshold", cl::init(4), cl::Hidden,
+ cl::desc("Loops with a constant trip count smaller than "
+ "this value will not use the count register."));
//===----------------------------------------------------------------------===//
//
@@ -56,8 +59,9 @@ TargetTransformInfo::PopcntSupportKind
PPCTTIImpl::getPopcntSupport(unsigned TyWidth) {
assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2");
if (ST->hasPOPCNTD() != PPCSubtarget::POPCNTD_Unavailable && TyWidth <= 64)
- return ST->hasPOPCNTD() == PPCSubtarget::POPCNTD_Slow ?
- TTI::PSK_SlowHardware : TTI::PSK_FastHardware;
+ return ST->hasPOPCNTD() == PPCSubtarget::POPCNTD_Slow
+ ? TTI::PSK_SlowHardware
+ : TTI::PSK_FastHardware;
return TTI::PSK_Software;
}
@@ -290,14 +294,12 @@ InstructionCost PPCTTIImpl::getIntImmCostInst(unsigned Opcode, unsigned Idx,
return TTI::TCC_Free;
if (RunFree) {
- if (Imm.getBitWidth() <= 32 &&
- (isShiftedMask_32(Imm.getZExtValue()) ||
- isShiftedMask_32(~Imm.getZExtValue())))
+ if (Imm.getBitWidth() <= 32 && (isShiftedMask_32(Imm.getZExtValue()) ||
+ isShiftedMask_32(~Imm.getZExtValue())))
return TTI::TCC_Free;
- if (ST->isPPC64() &&
- (isShiftedMask_64(Imm.getZExtValue()) ||
- isShiftedMask_64(~Imm.getZExtValue())))
+ if (ST->isPPC64() && (isShiftedMask_64(Imm.getZExtValue()) ||
+ isShiftedMask_64(~Imm.getZExtValue())))
return TTI::TCC_Free;
}
@@ -364,14 +366,15 @@ bool PPCTTIImpl::isHardwareLoopProfitable(Loop *L, ScalarEvolution &SE,
Call->getIntrinsicID() == Intrinsic::loop_decrement)
return false;
- SmallVector<BasicBlock*, 4> ExitingBlocks;
+ SmallVector<BasicBlock *, 4> ExitingBlocks;
L->getExitingBlocks(ExitingBlocks);
// If there is an exit edge known to be frequently taken,
// we should not transform this loop.
for (auto &BB : ExitingBlocks) {
Instruction *TI = BB->getTerminator();
- if (!TI) continue;
+ if (!TI)
+ continue;
if (BranchInst *BI = dyn_cast<BranchInst>(TI)) {
uint64_t TrueWeight = 0, FalseWeight = 0;
@@ -382,15 +385,15 @@ bool PPCTTIImpl::isHardwareLoopProfitable(Loop *L, ScalarEvolution &SE,
// If the exit path is more frequent than the loop path,
// we return here without further analysis for this loop.
bool TrueIsExit = !L->contains(BI->getSuccessor(0));
- if (( TrueIsExit && FalseWeight < TrueWeight) ||
+ if ((TrueIsExit && FalseWeight < TrueWeight) ||
(!TrueIsExit && FalseWeight > TrueWeight))
return false;
}
}
LLVMContext &C = L->getHeader()->getContext();
- HWLoopInfo.CountType = TM.isPPC64() ?
- Type::getInt64Ty(C) : Type::getInt32Ty(C);
+ HWLoopInfo.CountType =
+ TM.isPPC64() ? Type::getInt64Ty(C) : Type::getInt32Ty(C);
HWLoopInfo.LoopDecrement = ConstantInt::get(HWLoopInfo.CountType, 1);
return true;
}
@@ -419,9 +422,7 @@ void PPCTTIImpl::getPeelingPreferences(Loop *L, ScalarEvolution &SE,
// Returning true results in coldcc being used for functions which are cold at
// all call sites when the callers of the functions are not calling any other
// non coldcc functions.
-bool PPCTTIImpl::useColdCCForColdCall(Function &F) {
- return EnablePPCColdCC;
-}
+bool PPCTTIImpl::useColdCCForColdCall(Function &F) { return EnablePPCColdCC; }
bool PPCTTIImpl::enableAggressiveInterleaving(bool LoopHasReductions) {
// On the A2, always unroll aggressively.
@@ -439,13 +440,11 @@ PPCTTIImpl::enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const {
return Options;
}
-bool PPCTTIImpl::enableInterleavedAccessVectorization() {
- return true;
-}
+bool PPCTTIImpl::enableInterleavedAccessVectorization() { return true; }
unsigned PPCTTIImpl::getNumberOfRegisters(unsigned ClassID) const {
- assert(ClassID == GPRRC || ClassID == FPRRC ||
- ClassID == VRRC || ClassID == VSXRC);
+ assert(ClassID == GPRRC || ClassID == FPRRC || ClassID == VRRC ||
+ ClassID == VSXRC);
if (ST->hasVSX()) {
assert(ClassID == GPRRC || ClassID == VSXRC || ClassID == VRRC);
return ClassID == VSXRC ? 64 : 32;
@@ -469,16 +468,20 @@ unsigned PPCTTIImpl::getRegisterClassForType(bool Vector, Type *Ty) const {
return GPRRC;
}
-const char* PPCTTIImpl::getRegisterClassName(unsigned ClassID) const {
+const char *PPCTTIImpl::getRegisterClassName(unsigned ClassID) const {
switch (ClassID) {
- default:
- llvm_unreachable("unknown register class");
- return "PPC::unknown register class";
- case GPRRC: return "PPC::GPRRC";
- case FPRRC: return "PPC::FPRRC";
- case VRRC: return "PPC::VRRC";
- case VSXRC: return "PPC::VSXRC";
+ default:
+ llvm_unreachable("unknown register class");
+ return "PPC::unknown register class";
+ case GPRRC:
+ return "PPC::GPRRC";
+ case FPRRC:
+ return "PPC::FPRRC";
+ case VRRC:
+ return "PPC::VRRC";
+ case VSXRC:
+ return "PPC::VSXRC";
}
}
@@ -509,9 +512,7 @@ unsigned PPCTTIImpl::getCacheLineSize() const {
return 64;
}
-unsigned PPCTTIImpl::getPrefetchDistance() const {
- return 300;
-}
+unsigned PPCTTIImpl::getPrefetchDistance() const { return 300; }
unsigned PPCTTIImpl::getMaxInterleaveFactor(ElementCount VF) {
unsigned Directive = ST->getCPUDirective();
@@ -582,8 +583,7 @@ InstructionCost PPCTTIImpl::vectorCostAdjustmentFactor(unsigned Opcode,
InstructionCost PPCTTIImpl::getArithmeticInstrCost(
unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
TTI::OperandValueInfo Op1Info, TTI::OperandValueInfo Op2Info,
- ArrayRef<const Value *> Args,
- const Instruction *CxtI) {
+ ArrayRef<const Value *> Args, const Instruction *CxtI) {
assert(TLI->InstructionOpcodeToISD(Opcode) && "Invalid opcode");
InstructionCost CostFactor = vectorCostAdjustmentFactor(Opcode, Ty, nullptr);
@@ -592,12 +592,12 @@ InstructionCost PPCTTIImpl::getArithmeticInstrCost(
// TODO: Handle more cost kinds.
if (CostKind != TTI::TCK_RecipThroughput)
- return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info,
- Op2Info, Args, CxtI);
+ return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info,
+ Args, CxtI);
// Fallback to the default implementation.
- InstructionCost Cost = BaseT::getArithmeticInstrCost(
- Opcode, Ty, CostKind, Op1Info, Op2Info);
+ InstructionCost Cost =
+ BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info);
return Cost * CostFactor;
}
@@ -753,8 +753,7 @@ InstructionCost PPCTTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val,
// because they require store and reload with the attendant
// processor stall for load-hit-store. Until VSX is available,
// these need to be estimated as very costly.
- if (ISD == ISD::EXTRACT_VECTOR_ELT ||
- ISD == ISD::INSERT_VECTOR_ELT)
+ if (ISD == ISD::EXTRACT_VECTOR_ELT || ISD == ISD::INSERT_VECTOR_ELT)
return LHSPenalty + Cost;
return Cost;
@@ -771,7 +770,7 @@ InstructionCost PPCTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src,
if (!CostFactor.isValid())
return InstructionCost::getMax();
- if (TLI->getValueType(DL, Src, true) == MVT::Other)
+ if (TLI->getValueType(DL, Src, true) == MVT::Other)
return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
CostKind);
// Legalize the type.
@@ -787,11 +786,11 @@ InstructionCost PPCTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src,
Cost *= CostFactor;
- bool IsAltivecType = ST->hasAltivec() &&
- (LT.second == MVT::v16i8 || LT.second == MVT::v8i16 ||
- LT.second == MVT::v4i32 || LT.second == MVT::v4f32);
- bool IsVSXType = ST->hasVSX() &&
- (LT.second == MVT::v2f64 || LT.second == MVT::v2i64);
+ bool IsAltivecType =
+ ST->hasAltivec() && (LT.second == MVT::v16i8 || LT.second == MVT::v8i16 ||
+ LT.second == MVT::v4i32 || LT.second == MVT::v4f32);
+ bool IsVSXType =
+ ST->hasVSX() && (LT.second == MVT::v2f64 || LT.second == MVT::v2i64);
// VSX has 32b/64b load instructions. Legalization can handle loading of
// 32b/64b to VSR correctly and cheaply. But BaseT::getMemoryOpCost and
@@ -884,7 +883,7 @@ InstructionCost PPCTTIImpl::getInterleavedMemoryOpCost(
// instruction). For each result vector, we need one shuffle per incoming
// vector (except that the first shuffle can take two incoming vectors
// because it does not need to take itself).
- Cost += Factor*(LT.first-1);
+ Cost += Factor * (LT.first - 1);
return Cost;
}
@@ -895,6 +894,20 @@ PPCTTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
return BaseT::getIntrinsicInstrCost(ICA, CostKind);
}
+bool PPCTTIImpl::areInlineCompatible(const Function *Caller,
+ const Function *Callee) const {
+ const TargetMachine &TM = getTLI()->getTargetMachine();
+
+ const FeatureBitset &CallerBits =
+ TM.getSubtargetImpl(*Caller)->getFeatureBits();
+ const FeatureBitset &CalleeBits =
+ TM.getSubtargetImpl(*Callee)->getFeatureBits();
+
+  // Check that the target features are exactly the same. We can revisit to
+  // see if we can improve this.
+ return CallerBits == CalleeBits;
+}
+
bool PPCTTIImpl::areTypesABICompatible(const Function *Caller,
const Function *Callee,
const ArrayRef<Type *> &Types) const {
@@ -950,9 +963,7 @@ bool PPCTTIImpl::isLSRCostLess(const TargetTransformInfo::LSRCost &C1,
return TargetTransformInfoImplBase::isLSRCostLess(C1, C2);
}
-bool PPCTTIImpl::isNumRegsMajorCostOfLSR() {
- return false;
-}
+bool PPCTTIImpl::isNumRegsMajorCostOfLSR() { return false; }
bool PPCTTIImpl::shouldBuildRelLookupTables() const {
const PPCTargetMachine &TM = ST->getTargetMachine();
diff --git a/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.h b/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.h
index 3cb60d7a1785ae3..bf3ddad134e14c3 100644
--- a/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.h
+++ b/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.h
@@ -139,6 +139,8 @@ class PPCTTIImpl : public BasicTTIImplBase<PPCTTIImpl> {
bool UseMaskForCond = false, bool UseMaskForGaps = false);
InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
TTI::TargetCostKind CostKind);
+ bool areInlineCompatible(const Function *Caller,
+ const Function *Callee) const;
bool areTypesABICompatible(const Function *Caller, const Function *Callee,
const ArrayRef<Type *> &Types) const;
bool hasActiveVectorLength(unsigned Opcode, Type *DataType,
``````````
</details>
https://github.com/llvm/llvm-project/pull/126562