[llvm] db0ed3e - AMDGPU: Refactor treatment of denormal mode
Matt Arsenault via llvm-commits
llvm-commits at lists.llvm.org
Tue Nov 19 06:26:06 PST 2019
Author: Matt Arsenault
Date: 2019-11-19T19:55:43+05:30
New Revision: db0ed3e429b55d1730d1ecc253b0643de7fca099
URL: https://github.com/llvm/llvm-project/commit/db0ed3e429b55d1730d1ecc253b0643de7fca099
DIFF: https://github.com/llvm/llvm-project/commit/db0ed3e429b55d1730d1ecc253b0643de7fca099.diff
LOG: AMDGPU: Refactor treatment of denormal mode
Start moving towards treating this as a property of the calling
convention, and not the subtarget. The default denormal mode should
not be part of the subtarget, and should instead be moved into a
separate function attribute.
This patch is still NFC. The denormal mode remains a subtarget
feature for now, but this makes the necessary changes to switch to
using an attribute.
Added:
Modified:
llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
llvm/lib/Target/AMDGPU/AMDGPUInstructions.td
llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp
llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.h
llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h
llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
llvm/lib/Target/AMDGPU/R600ISelLowering.cpp
llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
llvm/lib/Target/AMDGPU/SIISelLowering.cpp
llvm/lib/Target/AMDGPU/SIISelLowering.h
llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fcanonicalize.mir
Removed:
################################################################################
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
index c4fac3eab785..72526cac113e 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
@@ -69,15 +69,14 @@ using namespace llvm::AMDGPU::HSAMD;
// We want to use these instructions, and using fp32 denormals also causes
// instructions to run at the double precision rate for the device so it's
// probably best to just report no single precision denormals.
-static uint32_t getFPMode(const MachineFunction &F) {
- const GCNSubtarget& ST = F.getSubtarget<GCNSubtarget>();
- // TODO: Is there any real use for the flush in only / flush out only modes?
+static uint32_t getFPMode(AMDGPU::SIModeRegisterDefaults Mode) {
+ // TODO: Is there any real use for the flush in only / flush out only modes?
uint32_t FP32Denormals =
- ST.hasFP32Denormals() ? FP_DENORM_FLUSH_NONE : FP_DENORM_FLUSH_IN_FLUSH_OUT;
+ Mode.FP32Denormals ? FP_DENORM_FLUSH_NONE : FP_DENORM_FLUSH_IN_FLUSH_OUT;
uint32_t FP64Denormals =
- ST.hasFP64Denormals() ? FP_DENORM_FLUSH_NONE : FP_DENORM_FLUSH_IN_FLUSH_OUT;
+ Mode.FP64FP16Denormals ? FP_DENORM_FLUSH_NONE : FP_DENORM_FLUSH_IN_FLUSH_OUT;
return FP_ROUND_MODE_SP(FP_ROUND_ROUND_TO_NEAREST) |
FP_ROUND_MODE_DP(FP_ROUND_ROUND_TO_NEAREST) |
@@ -1033,11 +1032,12 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
ProgInfo.VGPRBlocks = IsaInfo::getNumVGPRBlocks(
&STM, ProgInfo.NumVGPRsForWavesPerEU);
+ const SIModeRegisterDefaults Mode = MFI->getMode();
+
// Set the value to initialize FP_ROUND and FP_DENORM parts of the mode
// register.
- ProgInfo.FloatMode = getFPMode(MF);
+ ProgInfo.FloatMode = getFPMode(Mode);
- const SIModeRegisterDefaults Mode = MFI->getMode();
ProgInfo.IEEEMode = Mode.IEEE;
// Make clamp modifier on NaN input returns 0.
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
index ed495bd40b82..cf908766caa0 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
@@ -70,6 +70,7 @@ class AMDGPUCodeGenPrepare : public FunctionPass,
Module *Mod = nullptr;
const DataLayout *DL = nullptr;
bool HasUnsafeFPMath = false;
+ bool HasFP32Denormals = false;
/// Copies exact/nsw/nuw flags (if any) from binary operation \p I to
/// binary operation \p V.
@@ -575,7 +576,6 @@ bool AMDGPUCodeGenPrepare::visitFDiv(BinaryOperator &FDiv) {
Value *NewFDiv = nullptr;
- bool HasDenormals = ST->hasFP32Denormals();
if (VectorType *VT = dyn_cast<VectorType>(Ty)) {
NewFDiv = UndefValue::get(VT);
@@ -586,7 +586,7 @@ bool AMDGPUCodeGenPrepare::visitFDiv(BinaryOperator &FDiv) {
Value *DenEltI = Builder.CreateExtractElement(Den, I);
Value *NewElt;
- if (shouldKeepFDivF32(NumEltI, UnsafeDiv, HasDenormals)) {
+ if (shouldKeepFDivF32(NumEltI, UnsafeDiv, HasFP32Denormals)) {
NewElt = Builder.CreateFDiv(NumEltI, DenEltI);
} else {
NewElt = Builder.CreateCall(Decl, { NumEltI, DenEltI });
@@ -595,7 +595,7 @@ bool AMDGPUCodeGenPrepare::visitFDiv(BinaryOperator &FDiv) {
NewFDiv = Builder.CreateInsertElement(NewFDiv, NewElt, I);
}
} else {
- if (!shouldKeepFDivF32(Num, UnsafeDiv, HasDenormals))
+ if (!shouldKeepFDivF32(Num, UnsafeDiv, HasFP32Denormals))
NewFDiv = Builder.CreateCall(Decl, { Num, Den });
}
@@ -1034,6 +1034,7 @@ bool AMDGPUCodeGenPrepare::runOnFunction(Function &F) {
AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
DA = &getAnalysis<LegacyDivergenceAnalysis>();
HasUnsafeFPMath = hasUnsafeFPMath(F);
+ HasFP32Denormals = ST->hasFP32Denormals(F);
bool MadeChange = false;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
index e5732018ebb1..75537cbe2edb 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
@@ -128,6 +128,10 @@ class AMDGPUDAGToDAGISel : public SelectionDAGISel {
// Subtarget - Keep a pointer to the AMDGPU Subtarget around so that we can
// make the right decision when generating code for
different targets.
const GCNSubtarget *Subtarget;
+
+ // Default FP mode for the current function.
+ AMDGPU::SIModeRegisterDefaults Mode;
+
bool EnableLateStructurizeCFG;
public:
@@ -393,6 +397,7 @@ bool AMDGPUDAGToDAGISel::runOnMachineFunction(MachineFunction &MF) {
}
#endif
Subtarget = &MF.getSubtarget<GCNSubtarget>();
+ Mode = AMDGPU::SIModeRegisterDefaults(MF.getFunction(), *Subtarget);
return SelectionDAGISel::runOnMachineFunction(MF);
}
@@ -2104,7 +2109,7 @@ void AMDGPUDAGToDAGISel::SelectFMAD_FMA(SDNode *N) {
bool Sel1 = SelectVOP3PMadMixModsImpl(Src1, Src1, Src1Mods);
bool Sel2 = SelectVOP3PMadMixModsImpl(Src2, Src2, Src2Mods);
- assert((IsFMA || !Subtarget->hasFP32Denormals()) &&
+ assert((IsFMA || !Mode.FP32Denormals) &&
"fmad selected with denormals enabled");
// TODO: We can select this with f32 denormals enabled if all the sources are
// converted from f16 (in which case fmad isn't legal).
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index f669c98969b6..e5b94247ee47 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -1581,8 +1581,11 @@ SDValue AMDGPUTargetLowering::LowerDIVREM24(SDValue Op, SelectionDAG &DAG,
// float fqneg = -fq;
SDValue fqneg = DAG.getNode(ISD::FNEG, DL, FltVT, fq);
+ MachineFunction &MF = DAG.getMachineFunction();
+ const AMDGPUMachineFunction *MFI = MF.getInfo<AMDGPUMachineFunction>();
+
// float fr = mad(fqneg, fb, fa);
- unsigned OpCode = Subtarget->hasFP32Denormals() ?
+ unsigned OpCode = MFI->getMode().FP32Denormals ?
(unsigned)AMDGPUISD::FMAD_FTZ :
(unsigned)ISD::FMAD;
SDValue fr = DAG.getNode(OpCode, DL, FltVT, fqneg, fb, fa);
@@ -1663,8 +1666,11 @@ void AMDGPUTargetLowering::LowerUDIVREM64(SDValue Op,
}
if (isTypeLegal(MVT::i64)) {
+ MachineFunction &MF = DAG.getMachineFunction();
+ const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
+
// Compute denominator reciprocal.
- unsigned FMAD = Subtarget->hasFP32Denormals() ?
+ unsigned FMAD = MFI->getMode().FP32Denormals ?
(unsigned)AMDGPUISD::FMAD_FTZ :
(unsigned)ISD::FMAD;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td b/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td
index 3020e3dec064..9e76c47038b5 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td
@@ -100,13 +100,16 @@ class PredicateControl {
class AMDGPUPat<dag pattern, dag result> : Pat<pattern, result>,
PredicateControl;
-def FP16Denormals : Predicate<"Subtarget->hasFP16Denormals()">;
-def FP32Denormals : Predicate<"Subtarget->hasFP32Denormals()">;
-def FP64Denormals : Predicate<"Subtarget->hasFP64Denormals()">;
-def NoFP16Denormals : Predicate<"!Subtarget->hasFP16Denormals()">;
-def NoFP32Denormals : Predicate<"!Subtarget->hasFP32Denormals()">;
-def NoFP64Denormals : Predicate<"!Subtarget->hasFP64Denormals()">;
+let RecomputePerFunction = 1 in {
+def FP16Denormals : Predicate<"MF->getInfo<SIMachineFunctionInfo>()->getMode().FP64FP16Denormals">;
+def FP32Denormals : Predicate<"MF->getInfo<SIMachineFunctionInfo>()->getMode().FP32Denormals">;
+def FP64Denormals : Predicate<"MF->getInfo<SIMachineFunctionInfo>()->getMode().FP64FP16Denormals">;
+def NoFP16Denormals : Predicate<"!MF->getInfo<SIMachineFunctionInfo>()->getMode().FP64FP16Denormals">;
+def NoFP32Denormals : Predicate<"!MF->getInfo<SIMachineFunctionInfo>()->getMode().FP32Denormals">;
+def NoFP64Denormals : Predicate<"!MF->getInfo<SIMachineFunctionInfo>()->getMode().FP64FP16Denormals">;
def UnsafeFPMath : Predicate<"TM.Options.UnsafeFPMath">;
+}
+
def FMA : Predicate<"Subtarget->hasFMA()">;
def InstFlag : OperandWithDefaultOps <i32, (ops (i32 0))>;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp b/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp
index 89ca702f577d..940ddff85d73 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp
@@ -18,6 +18,7 @@ AMDGPUMachineFunction::AMDGPUMachineFunction(const MachineFunction &MF) :
LocalMemoryObjects(),
ExplicitKernArgSize(0),
LDSSize(0),
+ Mode(MF.getFunction(), MF.getSubtarget<GCNSubtarget>()),
IsEntryFunction(AMDGPU::isEntryFunctionCC(MF.getFunction().getCallingConv())),
NoSignedZerosFPMath(MF.getTarget().Options.NoSignedZerosFPMath),
MemoryBound(false),
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.h b/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.h
index 9818ab1ef148..1933e41c66f3 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.h
@@ -11,6 +11,7 @@
#include "llvm/ADT/DenseMap.h"
#include "llvm/CodeGen/MachineFunction.h"
+#include "Utils/AMDGPUBaseInfo.h"
namespace llvm {
@@ -28,6 +29,9 @@ class AMDGPUMachineFunction : public MachineFunctionInfo {
/// Number of bytes in the LDS that are being used.
unsigned LDSSize;
+ // State of MODE register, assumed FP mode.
+ AMDGPU::SIModeRegisterDefaults Mode;
+
// Kernels + shaders. i.e. functions called by the driver and not called
// by other functions.
bool IsEntryFunction;
@@ -53,6 +57,10 @@ class AMDGPUMachineFunction : public MachineFunctionInfo {
return LDSSize;
}
+ AMDGPU::SIModeRegisterDefaults getMode() const {
+ return Mode;
+ }
+
bool isEntryFunction() const {
return IsEntryFunction;
}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h
index 936feb00c62b..08878d87fb09 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h
@@ -148,7 +148,12 @@ class AMDGPUSubtarget {
return HasMadMixInsts;
}
- bool hasFP32Denormals() const {
+ bool hasFP32Denormals(const Function &F) const {
+ // FIXME: This should not be a property of the subtarget. This should be a
+ // property with a default set by the calling convention which can be
+ // overridden by attributes. For now, use the subtarget feature as a
+ // placeholder attribute. The function argument's only purpose is to
+ // discourage use without a function context until this is removed.
return FP32Denormals;
}
@@ -612,11 +617,17 @@ class GCNSubtarget : public AMDGPUGenSubtargetInfo,
unsigned getMaxLocalMemSizeWithWaveCount(unsigned WaveCount,
const Function &) const;
- bool hasFP16Denormals() const {
+ /// Alias for hasFP64FP16Denormals
+ bool hasFP16Denormals(const Function &F) const {
return FP64FP16Denormals;
}
- bool hasFP64Denormals() const {
+ /// Alias for hasFP64FP16Denormals
+ bool hasFP64Denormals(const Function &F) const {
+ return FP64FP16Denormals;
+ }
+
+ bool hasFP64FP16Denormals(const Function &F) const {
return FP64FP16Denormals;
}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
index 0d44f3be539e..a5066a0f669d 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
@@ -412,7 +412,7 @@ int GCNTTIImpl::getArithmeticInstrCost(
if (!Args.empty() && match(Args[0], PatternMatch::m_FPOne())) {
// TODO: This is more complicated, unsafe flags etc.
- if ((SLT == MVT::f32 && !ST->hasFP32Denormals()) ||
+ if ((SLT == MVT::f32 && !HasFP32Denormals) ||
(SLT == MVT::f16 && ST->has16BitInsts())) {
return LT.first * getQuarterRateInstrCost() * NElts;
}
@@ -431,7 +431,7 @@ int GCNTTIImpl::getArithmeticInstrCost(
if (SLT == MVT::f32 || SLT == MVT::f16) {
int Cost = 7 * getFullRateInstrCost() + 1 * getQuarterRateInstrCost();
- if (!ST->hasFP32Denormals()) {
+ if (!HasFP32Denormals) {
// FP mode switches.
Cost += 2 * getFullRateInstrCost();
}
@@ -671,10 +671,13 @@ unsigned GCNTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index,
bool GCNTTIImpl::areInlineCompatible(const Function *Caller,
const Function *Callee) const {
const TargetMachine &TM = getTLI()->getTargetMachine();
- const FeatureBitset &CallerBits =
- TM.getSubtargetImpl(*Caller)->getFeatureBits();
- const FeatureBitset &CalleeBits =
- TM.getSubtargetImpl(*Callee)->getFeatureBits();
+ const GCNSubtarget *CallerST
+ = static_cast<const GCNSubtarget *>(TM.getSubtargetImpl(*Caller));
+ const GCNSubtarget *CalleeST
+ = static_cast<const GCNSubtarget *>(TM.getSubtargetImpl(*Callee));
+
+ const FeatureBitset &CallerBits = CallerST->getFeatureBits();
+ const FeatureBitset &CalleeBits = CalleeST->getFeatureBits();
FeatureBitset RealCallerBits = CallerBits & ~InlineFeatureIgnoreList;
FeatureBitset RealCalleeBits = CalleeBits & ~InlineFeatureIgnoreList;
@@ -683,8 +686,8 @@ bool GCNTTIImpl::areInlineCompatible(const Function *Caller,
// FIXME: dx10_clamp can just take the caller setting, but there seems to be
// no way to support merge for backend defined attributes.
- AMDGPU::SIModeRegisterDefaults CallerMode(*Caller);
- AMDGPU::SIModeRegisterDefaults CalleeMode(*Callee);
+ AMDGPU::SIModeRegisterDefaults CallerMode(*Caller, *CallerST);
+ AMDGPU::SIModeRegisterDefaults CalleeMode(*Callee, *CalleeST);
return CallerMode.isInlineCompatible(CalleeMode);
}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
index b6e2db454e6d..b41f4348f04d 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
@@ -46,7 +46,7 @@ class AMDGPUTTIImpl final : public BasicTTIImplBase<AMDGPUTTIImpl> {
Triple TargetTriple;
- const TargetSubtargetInfo *ST;
+ const GCNSubtarget *ST;
const TargetLoweringBase *TLI;
const TargetSubtargetInfo *getST() const { return ST; }
@@ -73,6 +73,7 @@ class GCNTTIImpl final : public BasicTTIImplBase<GCNTTIImpl> {
const AMDGPUTargetLowering *TLI;
AMDGPUTTIImpl CommonTTI;
bool IsGraphicsShader;
+ bool HasFP32Denormals;
const FeatureBitset InlineFeatureIgnoreList = {
// Codegen control options which don't matter.
@@ -131,7 +132,8 @@ class GCNTTIImpl final : public BasicTTIImplBase<GCNTTIImpl> {
ST(static_cast<const GCNSubtarget*>(TM->getSubtargetImpl(F))),
TLI(ST->getTargetLowering()),
CommonTTI(TM, F),
- IsGraphicsShader(AMDGPU::isShader(F.getCallingConv())) {}
+ IsGraphicsShader(AMDGPU::isShader(F.getCallingConv())),
+ HasFP32Denormals(ST->hasFP32Denormals(F)) { }
bool hasBranchDivergence() { return true; }
diff --git a/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp b/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp
index 6196be8ec832..ee24022c65f4 100644
--- a/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp
@@ -226,10 +226,8 @@ R600TargetLowering::R600TargetLowering(const TargetMachine &TM,
setOperationAction(ISD::FMA, MVT::f64, Expand);
}
- // FIXME: This was moved from AMDGPUTargetLowering, I'm not sure if we
- // need it for R600.
- if (!Subtarget->hasFP32Denormals())
- setOperationAction(ISD::FMAD, MVT::f32, Legal);
+ // FIXME: May need no denormals check
+ setOperationAction(ISD::FMAD, MVT::f32, Legal);
if (!Subtarget->hasBFI()) {
// fcopysign can be done in a single instruction with BFI.
diff --git a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
index 3e5fa7068e05..ca17ba8b7229 100644
--- a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
@@ -1359,8 +1359,8 @@ SIFoldOperands::isOMod(const MachineInstr &MI) const {
case AMDGPU::V_MUL_F32_e64:
case AMDGPU::V_MUL_F16_e64: {
// If output denormals are enabled, omod is ignored.
- if ((Op == AMDGPU::V_MUL_F32_e64 && ST->hasFP32Denormals()) ||
- (Op == AMDGPU::V_MUL_F16_e64 && ST->hasFP16Denormals()))
+ if ((Op == AMDGPU::V_MUL_F32_e64 && MFI->getMode().FP32Denormals) ||
+ (Op == AMDGPU::V_MUL_F16_e64 && MFI->getMode().FP64FP16Denormals))
return std::make_pair(nullptr, SIOutMods::NONE);
const MachineOperand *RegOp = nullptr;
@@ -1389,8 +1389,8 @@ SIFoldOperands::isOMod(const MachineInstr &MI) const {
case AMDGPU::V_ADD_F32_e64:
case AMDGPU::V_ADD_F16_e64: {
// If output denormals are enabled, omod is ignored.
- if ((Op == AMDGPU::V_ADD_F32_e64 && ST->hasFP32Denormals()) ||
- (Op == AMDGPU::V_ADD_F16_e64 && ST->hasFP16Denormals()))
+ if ((Op == AMDGPU::V_ADD_F32_e64 && MFI->getMode().FP32Denormals) ||
+ (Op == AMDGPU::V_ADD_F16_e64 && MFI->getMode().FP64FP16Denormals))
return std::make_pair(nullptr, SIOutMods::NONE);
// Look through the DAGCombiner canonicalization fmul x, 2 -> fadd x, x
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 1a02037fcd40..c4712198693c 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -100,6 +100,16 @@ static cl::opt<bool> DisableLoopAlignment(
cl::desc("Do not align and prefetch loops"),
cl::init(false));
+static bool hasFP32Denormals(const MachineFunction &MF) {
+ const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
+ return Info->getMode().FP32Denormals;
+}
+
+static bool hasFP64FP16Denormals(const MachineFunction &MF) {
+ const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
+ return Info->getMode().FP64FP16Denormals;
+}
+
static unsigned findFirstFreeSGPR(CCState &CCInfo) {
unsigned NumSGPRs = AMDGPU::SGPR_32RegClass.getNumRegs();
for (unsigned Reg = 0; Reg < NumSGPRs; ++Reg) {
@@ -370,9 +380,10 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
setOperationAction(ISD::FLOG10, MVT::f16, Custom);
}
- // v_mad_f32 does not support denormals according to some sources.
- if (!Subtarget->hasFP32Denormals())
- setOperationAction(ISD::FMAD, MVT::f32, Legal);
+ // v_mad_f32 does not support denormals. We report it as unconditionally
+ // legal, and the context where it is formed will disallow it when fp32
+ // denormals are enabled.
+ setOperationAction(ISD::FMAD, MVT::f32, Legal);
if (!Subtarget->hasBFI()) {
// fcopysign can be done in a single instruction with BFI.
@@ -510,7 +521,7 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
// F16 - VOP3 Actions.
setOperationAction(ISD::FMA, MVT::f16, Legal);
- if (!Subtarget->hasFP16Denormals() && STI.hasMadF16())
+ if (STI.hasMadF16())
setOperationAction(ISD::FMAD, MVT::f16, Legal);
for (MVT VT : {MVT::v2i16, MVT::v2f16, MVT::v4i16, MVT::v4f16}) {
@@ -772,8 +783,9 @@ bool SITargetLowering::isFPExtFoldable(const SelectionDAG &DAG, unsigned Opcode,
EVT DestVT, EVT SrcVT) const {
return ((Opcode == ISD::FMAD && Subtarget->hasMadMixInsts()) ||
(Opcode == ISD::FMA && Subtarget->hasFmaMixInsts())) &&
- DestVT.getScalarType() == MVT::f32 && !Subtarget->hasFP32Denormals() &&
- SrcVT.getScalarType() == MVT::f16;
+ DestVT.getScalarType() == MVT::f32 &&
+ SrcVT.getScalarType() == MVT::f16 &&
+ !hasFP32Denormals(DAG.getMachineFunction());
}
bool SITargetLowering::isShuffleMaskLegal(ArrayRef<int>, EVT) const {
@@ -3930,7 +3942,7 @@ bool SITargetLowering::isFMAFasterThanFMulAndFAdd(const MachineFunction &MF,
// mad available which returns the same result as the separate operations
// which we should prefer over fma. We can't use this if we want to support
// denormals, so only report this in these cases.
- if (Subtarget->hasFP32Denormals())
+ if (hasFP32Denormals(MF))
return Subtarget->hasFastFMAF32() || Subtarget->hasDLInsts();
// If the subtarget has v_fmac_f32, that's just as good as v_mac_f32.
@@ -3939,7 +3951,7 @@ bool SITargetLowering::isFMAFasterThanFMulAndFAdd(const MachineFunction &MF,
case MVT::f64:
return true;
case MVT::f16:
- return Subtarget->has16BitInsts() && Subtarget->hasFP16Denormals();
+ return Subtarget->has16BitInsts() && hasFP64FP16Denormals(MF);
default:
break;
}
@@ -3953,9 +3965,11 @@ bool SITargetLowering::isFMADLegalForFAddFSub(const SelectionDAG &DAG,
// v_mad_f32/v_mac_f32 do not support denormals.
EVT VT = N->getValueType(0);
if (VT == MVT::f32)
- return !Subtarget->hasFP32Denormals();
- if (VT == MVT::f16)
- return !Subtarget->hasFP16Denormals() && Subtarget->hasMadF16();
+ return !hasFP32Denormals(DAG.getMachineFunction());
+ if (VT == MVT::f16) {
+ return Subtarget->hasMadF16() &&
+ !hasFP64FP16Denormals(DAG.getMachineFunction());
+ }
return false;
}
@@ -7564,7 +7578,7 @@ SDValue SITargetLowering::lowerFastUnsafeFDIV(SDValue Op,
const SDNodeFlags Flags = Op->getFlags();
bool Unsafe = DAG.getTarget().Options.UnsafeFPMath || Flags.hasAllowReciprocal();
- if (!Unsafe && VT == MVT::f32 && Subtarget->hasFP32Denormals())
+ if (!Unsafe && VT == MVT::f32 && hasFP32Denormals(DAG.getMachineFunction()))
return SDValue();
if (const ConstantFPSDNode *CLHS = dyn_cast<ConstantFPSDNode>(LHS)) {
@@ -7707,7 +7721,7 @@ SDValue SITargetLowering::lowerFDIV_FAST(SDValue Op, SelectionDAG &DAG) const {
static const SDValue getSPDenormModeValue(int SPDenormMode, SelectionDAG &DAG,
const SDLoc &SL, const GCNSubtarget *ST) {
assert(ST->hasDenormModeInst() && "Requires S_DENORM_MODE");
- int DPDenormModeDefault = ST->hasFP64Denormals()
+ int DPDenormModeDefault = hasFP64FP16Denormals(DAG.getMachineFunction())
? FP_DENORM_FLUSH_NONE
: FP_DENORM_FLUSH_IN_FLUSH_OUT;
@@ -7743,7 +7757,9 @@ SDValue SITargetLowering::LowerFDIV32(SDValue Op, SelectionDAG &DAG) const {
(1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_);
const SDValue BitField = DAG.getTargetConstant(Denorm32Reg, SL, MVT::i16);
- if (!Subtarget->hasFP32Denormals()) {
+ const bool HasFP32Denormals = hasFP32Denormals(DAG.getMachineFunction());
+
+ if (!HasFP32Denormals) {
SDVTList BindParamVTs = DAG.getVTList(MVT::Other, MVT::Glue);
SDValue EnableDenorm;
@@ -7787,8 +7803,7 @@ SDValue SITargetLowering::LowerFDIV32(SDValue Op, SelectionDAG &DAG) const {
SDValue Fma4 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, NegDivScale0, Fma3,
NumeratorScaled, Fma3);
- if (!Subtarget->hasFP32Denormals()) {
-
+ if (!HasFP32Denormals) {
SDValue DisableDenorm;
if (Subtarget->hasDenormModeInst()) {
const SDValue DisableDenormValue =
@@ -8762,7 +8777,7 @@ bool SITargetLowering::isCanonicalized(SelectionDAG &DAG, SDValue Op,
auto F = CFP->getValueAPF();
if (F.isNaN() && F.isSignaling())
return false;
- return !F.isDenormal() || denormalsEnabledForType(Op.getValueType());
+ return !F.isDenormal() || denormalsEnabledForType(DAG, Op.getValueType());
}
// If source is a result of another standard FP operation it is already in
@@ -8831,7 +8846,7 @@ bool SITargetLowering::isCanonicalized(SelectionDAG &DAG, SDValue Op,
// snans will be quieted, so we only need to worry about denormals.
if (Subtarget->supportsMinMaxDenormModes() ||
- denormalsEnabledForType(Op.getValueType()))
+ denormalsEnabledForType(DAG, Op.getValueType()))
return true;
// Flushing may be required.
@@ -8903,7 +8918,7 @@ bool SITargetLowering::isCanonicalized(SelectionDAG &DAG, SDValue Op,
LLVM_FALLTHROUGH;
}
default:
- return denormalsEnabledForType(Op.getValueType()) &&
+ return denormalsEnabledForType(DAG, Op.getValueType()) &&
DAG.isKnownNeverSNaN(Op);
}
@@ -8914,7 +8929,7 @@ bool SITargetLowering::isCanonicalized(SelectionDAG &DAG, SDValue Op,
SDValue SITargetLowering::getCanonicalConstantFP(
SelectionDAG &DAG, const SDLoc &SL, EVT VT, const APFloat &C) const {
// Flush denormals to 0 if not enabled.
- if (C.isDenormal() && !denormalsEnabledForType(VT))
+ if (C.isDenormal() && !denormalsEnabledForType(DAG, VT))
return DAG.getConstantFP(0.0, SL, VT);
if (C.isNaN()) {
@@ -9452,8 +9467,8 @@ unsigned SITargetLowering::getFusedOpcode(const SelectionDAG &DAG,
// Only do this if we are not trying to support denormals. v_mad_f32 does not
// support denormals ever.
- if (((VT == MVT::f32 && !Subtarget->hasFP32Denormals()) ||
- (VT == MVT::f16 && !Subtarget->hasFP16Denormals() &&
+ if (((VT == MVT::f32 && !hasFP32Denormals(DAG.getMachineFunction())) ||
+ (VT == MVT::f16 && !hasFP64FP16Denormals(DAG.getMachineFunction()) &&
getSubtarget()->hasMadF16())) &&
isOperationLegal(ISD::FMAD, VT))
return ISD::FMAD;
@@ -10964,14 +10979,14 @@ bool SITargetLowering::isSDNodeSourceOfDivergence(const SDNode * N,
return false;
}
-bool SITargetLowering::denormalsEnabledForType(EVT VT) const {
+bool SITargetLowering::denormalsEnabledForType(const SelectionDAG &DAG,
+ EVT VT) const {
switch (VT.getScalarType().getSimpleVT().SimpleTy) {
case MVT::f32:
- return Subtarget->hasFP32Denormals();
+ return hasFP32Denormals(DAG.getMachineFunction());
case MVT::f64:
- return Subtarget->hasFP64Denormals();
case MVT::f16:
- return Subtarget->hasFP16Denormals();
+ return hasFP64FP16Denormals(DAG.getMachineFunction());
default:
return false;
}
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.h b/llvm/lib/Target/AMDGPU/SIISelLowering.h
index b2c2e40923ae..d82473fca987 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.h
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.h
@@ -393,7 +393,7 @@ class SITargetLowering final : public AMDGPUTargetLowering {
bool isCanonicalized(SelectionDAG &DAG, SDValue Op,
unsigned MaxDepth = 5) const;
- bool denormalsEnabledForType(EVT VT) const;
+ bool denormalsEnabledForType(const SelectionDAG &DAG, EVT VT) const;
bool isKnownNeverNaNForTargetNode(SDValue Op,
const SelectionDAG &DAG,
diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
index 7dd0f11c95de..0c67b1467a5d 100644
--- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
@@ -28,7 +28,6 @@ using namespace llvm;
SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF)
: AMDGPUMachineFunction(MF),
- Mode(MF.getFunction()),
PrivateSegmentBuffer(false),
DispatchPtr(false),
QueuePtr(false),
diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
index 0d6153d06ce2..ef0186f7d57f 100644
--- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
@@ -340,9 +340,6 @@ class SIMachineFunctionInfo final : public AMDGPUMachineFunction {
AMDGPUFunctionArgInfo ArgInfo;
- // State of MODE register, assumed FP mode.
- AMDGPU::SIModeRegisterDefaults Mode;
-
// Graphics info.
unsigned PSInputAddr = 0;
unsigned PSInputEnable = 0;
@@ -515,10 +512,6 @@ class SIMachineFunctionInfo final : public AMDGPUMachineFunction {
: I->second.Lanes[Lane];
}
- AMDGPU::SIModeRegisterDefaults getMode() const {
- return Mode;
- }
-
bool haveFreeLanesForSGPRSpill(const MachineFunction &MF,
unsigned NumLane) const;
bool allocateSGPRSpillToVGPR(MachineFunction &MF, int FI);
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
index c72f93eb739c..a4b216f583dc 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
@@ -1303,7 +1303,8 @@ bool splitMUBUFOffset(uint32_t Imm, uint32_t &SOffset, uint32_t &ImmOffset,
return true;
}
-SIModeRegisterDefaults::SIModeRegisterDefaults(const Function &F) {
+SIModeRegisterDefaults::SIModeRegisterDefaults(const Function &F,
+ const GCNSubtarget &ST) {
*this = getDefaultForCallingConv(F.getCallingConv());
StringRef IEEEAttr = F.getFnAttribute("amdgpu-ieee").getValueAsString();
@@ -1314,6 +1315,9 @@ SIModeRegisterDefaults::SIModeRegisterDefaults(const Function &F) {
= F.getFnAttribute("amdgpu-dx10-clamp").getValueAsString();
if (!DX10ClampAttr.empty())
DX10Clamp = DX10ClampAttr == "true";
+
+ FP32Denormals = ST.hasFP32Denormals(F);
+ FP64FP16Denormals = ST.hasFP64FP16Denormals(F);
}
namespace {
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
index f8c082060ff5..05bb39235a4b 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
@@ -676,7 +676,8 @@ struct SIModeRegisterDefaults {
FP32Denormals(true),
FP64FP16Denormals(true) {}
- SIModeRegisterDefaults(const Function &F);
+ // FIXME: Should not depend on the subtarget
+ SIModeRegisterDefaults(const Function &F, const GCNSubtarget &ST);
static SIModeRegisterDefaults getDefaultForCallingConv(CallingConv::ID CC) {
const bool IsCompute = AMDGPU::isCompute(CC);
@@ -695,10 +696,23 @@ struct SIModeRegisterDefaults {
FP64FP16Denormals == Other.FP64FP16Denormals;
}
+ /// Returns true if a flag is compatible if it's enabled in the callee, but
+ /// disabled in the caller.
+ static bool oneWayCompatible(bool CallerMode, bool CalleeMode) {
+ return CallerMode == CalleeMode || (CallerMode && !CalleeMode);
+ }
+
// FIXME: Inlining should be OK for dx10-clamp, since the caller's mode should
// be able to override.
bool isInlineCompatible(SIModeRegisterDefaults CalleeMode) const {
- return *this == CalleeMode;
+ if (DX10Clamp != CalleeMode.DX10Clamp)
+ return false;
+ if (IEEE != CalleeMode.IEEE)
+ return false;
+
+ // Allow inlining denormals enabled into denormals flushed functions.
+ return oneWayCompatible(FP64FP16Denormals, CalleeMode.FP64FP16Denormals) &&
+ oneWayCompatible(FP32Denormals, CalleeMode.FP32Denormals);
}
};
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fcanonicalize.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fcanonicalize.mir
index 70c5edbd87f6..076b1dee6d67 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fcanonicalize.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fcanonicalize.mir
@@ -37,8 +37,8 @@ body: |
liveins: $vgpr0
; GFX9-LABEL: name: fcanonicalize_f16_flush
; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX9: [[V_MAX_F16_e64_:%[0-9]+]]:vgpr_32 = V_MAX_F16_e64 0, [[COPY]], 0, [[COPY]], 0, 0, implicit $exec
- ; GFX9: S_ENDPGM 0, implicit [[V_MAX_F16_e64_]]
+ ; GFX9: [[V_MUL_F16_e64_:%[0-9]+]]:vgpr_32 = V_MUL_F16_e64 0, 15360, 0, [[COPY]], 0, 0, implicit $exec
+ ; GFX9: S_ENDPGM 0, implicit [[V_MUL_F16_e64_]]
%0:vgpr(s32) = COPY $vgpr0
%1:vgpr(s16) = G_TRUNC %0
%2:vgpr(s16) = G_FCANONICALIZE %1
@@ -60,8 +60,8 @@ body: |
; GFX9-LABEL: name: fcanonicalize_f32_denorm
; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX9: [[V_MUL_F32_e64_:%[0-9]+]]:vgpr_32 = V_MUL_F32_e64 0, 1065353216, 0, [[COPY]], 0, 0, implicit $exec
- ; GFX9: S_ENDPGM 0, implicit [[V_MUL_F32_e64_]]
+ ; GFX9: [[V_MAX_F32_e64_:%[0-9]+]]:vgpr_32 = V_MAX_F32_e64 0, [[COPY]], 0, [[COPY]], 0, 0, implicit $exec
+ ; GFX9: S_ENDPGM 0, implicit [[V_MAX_F32_e64_]]
%0:vgpr(s32) = COPY $vgpr0
%1:vgpr(s32) = G_FCANONICALIZE %0
S_ENDPGM 0, implicit %1
@@ -170,8 +170,8 @@ body: |
; GFX9-LABEL: name: fcanonicalize_f64_flush
; GFX9: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
- ; GFX9: [[V_MAX_F64_:%[0-9]+]]:vreg_64 = V_MAX_F64 0, [[COPY]], 0, [[COPY]], 0, 0, implicit $exec
- ; GFX9: S_ENDPGM 0, implicit [[V_MAX_F64_]]
+ ; GFX9: [[V_MUL_F64_:%[0-9]+]]:vreg_64 = V_MUL_F64 0, 4607182418800017408, 0, [[COPY]], 0, 0, implicit $exec
+ ; GFX9: S_ENDPGM 0, implicit [[V_MUL_F64_]]
%0:vgpr(s64) = COPY $vgpr0_vgpr1
%1:vgpr(s64) = G_FCANONICALIZE %0
S_ENDPGM 0, implicit %1
@@ -191,8 +191,8 @@ body: |
liveins: $vgpr0
; GFX9-LABEL: name: fcanonicalize_fabs_f32_denorm
; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX9: [[V_MUL_F32_e64_:%[0-9]+]]:vgpr_32 = V_MUL_F32_e64 0, 1065353216, 2, [[COPY]], 0, 0, implicit $exec
- ; GFX9: S_ENDPGM 0, implicit [[V_MUL_F32_e64_]]
+ ; GFX9: [[V_MAX_F32_e64_:%[0-9]+]]:vgpr_32 = V_MAX_F32_e64 2, [[COPY]], 2, [[COPY]], 0, 0, implicit $exec
+ ; GFX9: S_ENDPGM 0, implicit [[V_MAX_F32_e64_]]
%0:vgpr(s32) = COPY $vgpr0
%1:vgpr(s32) = G_FABS %0
%2:vgpr(s32) = G_FCANONICALIZE %1
@@ -237,8 +237,8 @@ body: |
liveins: $vgpr0
; GFX9-LABEL: name: fcanonicalize_fneg_f32_denorm
; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
- ; GFX9: [[V_MUL_F32_e64_:%[0-9]+]]:vgpr_32 = V_MUL_F32_e64 0, 3212836864, 0, [[COPY]], 0, 0, implicit $exec
- ; GFX9: S_ENDPGM 0, implicit [[V_MUL_F32_e64_]]
+ ; GFX9: [[V_MAX_F32_e64_:%[0-9]+]]:vgpr_32 = V_MAX_F32_e64 1, [[COPY]], 1, [[COPY]], 0, 0, implicit $exec
+ ; GFX9: S_ENDPGM 0, implicit [[V_MAX_F32_e64_]]
%0:vgpr(s32) = COPY $vgpr0
%1:vgpr(s32) = G_FNEG %0
%2:vgpr(s32) = G_FCANONICALIZE %1
@@ -283,8 +283,8 @@ body: |
; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
; GFX9: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 2147483648
; GFX9: [[V_XOR_B32_e32_:%[0-9]+]]:vgpr_32 = V_XOR_B32_e32 [[S_MOV_B32_]], [[COPY]], implicit $exec
- ; GFX9: [[V_MUL_F32_e64_:%[0-9]+]]:vgpr_32 = V_MUL_F32_e64 0, 1065353216, 2, [[V_XOR_B32_e32_]], 0, 0, implicit $exec
- ; GFX9: S_ENDPGM 0, implicit [[V_MUL_F32_e64_]]
+ ; GFX9: [[V_MAX_F32_e64_:%[0-9]+]]:vgpr_32 = V_MAX_F32_e64 2, [[V_XOR_B32_e32_]], 2, [[V_XOR_B32_e32_]], 0, 0, implicit $exec
+ ; GFX9: S_ENDPGM 0, implicit [[V_MAX_F32_e64_]]
%0:vgpr(s32) = COPY $vgpr0
%1:vgpr(s32) = G_FNEG %0
%2:vgpr(s32) = G_FABS %1
More information about the llvm-commits
mailing list