[llvm] [AMDGPU] Correctly restore FP mode in FDIV32 lowering (PR #66346)
via llvm-commits
llvm-commits at lists.llvm.org
Thu Sep 14 02:15:26 PDT 2023
llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-backend-amdgpu
<details>
<summary>Changes</summary>
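Addresses the FIXME about mishandling dynamic denormal mode in the FDIV32 lowering, for both DAGISel and GISel: when the FP32 denormal mode is dynamic, the current mode is now read with S_GETREG_B32 before denormals are enabled and restored with S_SETREG_B32 afterwards.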
--
Patch is 171.59 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/66346.diff
4 Files Affected:
- (modified) llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp (+27-10)
- (modified) llvm/lib/Target/AMDGPU/SIISelLowering.cpp (+29-10)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f32.ll (+828-286)
- (modified) llvm/test/CodeGen/AMDGPU/fdiv.ll (+308-110)
<pre>
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
index b5c746b81b91e32..2fd5f18f90e7435 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@@ -4657,6 +4657,10 @@ bool AMDGPULegalizerInfo::legalizeFDIV16(MachineInstr &MI,
return true;
}
+static const unsigned SPDenormModeBitField =
+ AMDGPU::Hwreg::ID_MODE | (4 << AMDGPU::Hwreg::OFFSET_SHIFT_) |
+ (1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_);
+
// Enable or disable FP32 denorm mode. When 'Enable' is true, emit instructions
// to enable denorm mode. When 'Enable' is false, disable denorm mode.
static void toggleSPDenormMode(bool Enable, MachineIRBuilder &B,
@@ -4675,11 +4679,6 @@ static void toggleSPDenormMode(bool Enable, MachineIRBuilder &B,
.addImm(NewDenormModeValue);
} else {
- // Select FP32 bit field in mode register.
- unsigned SPDenormModeBitField = AMDGPU::Hwreg::ID_MODE |
- (4 << AMDGPU::Hwreg::OFFSET_SHIFT_) |
- (1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_);
-
B.buildInstr(AMDGPU::S_SETREG_IMM32_B32)
.addImm(SPDenormMode)
.addImm(SPDenormModeBitField);
@@ -4723,10 +4722,23 @@ bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI,
.setMIFlags(Flags);
auto NegDivScale0 = B.buildFNeg(S32, DenominatorScaled, Flags);
+ const bool PreservesDenormals = Mode.FP32Denormals == DenormalMode::getIEEE();
+ const bool HasDynamicDenormals =
+ (Mode.FP32Denormals.Input == DenormalMode::Dynamic) ||
+ (Mode.FP32Denormals.Output == DenormalMode::Dynamic);
+
// FIXME: Doesn't correctly model the FP mode switch, and the FP operations
// aren't modeled as reading it.
- if (Mode.FP32Denormals == DenormalMode::getPreserveSign())
+ Register SavedSPDenormMode;
+ if (!PreservesDenormals) {
+ if (HasDynamicDenormals) {
+ SavedSPDenormMode = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
+ B.buildInstr(AMDGPU::S_GETREG_B32)
+ .addDef(SavedSPDenormMode)
+ .addImm(SPDenormModeBitField);
+ }
toggleSPDenormMode(true, B, ST, Mode);
+ }
auto Fma0 = B.buildFMA(S32, NegDivScale0, ApproxRcp, One, Flags);
auto Fma1 = B.buildFMA(S32, Fma0, ApproxRcp, ApproxRcp, Flags);
@@ -4735,10 +4747,15 @@ bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI,
auto Fma3 = B.buildFMA(S32, Fma2, Fma1, Mul, Flags);
auto Fma4 = B.buildFMA(S32, NegDivScale0, Fma3, NumeratorScaled, Flags);
- // FIXME: This mishandles dynamic denormal mode. We need to query the
- // current mode and restore the original.
- if (Mode.FP32Denormals == DenormalMode::getPreserveSign())
- toggleSPDenormMode(false, B, ST, Mode);
+ if (!PreservesDenormals) {
+ if (HasDynamicDenormals) {
+ assert(SavedSPDenormMode);
+ B.buildInstr(AMDGPU::S_SETREG_B32)
+ .addReg(SavedSPDenormMode)
+ .addImm(SPDenormModeBitField);
+ } else
+ toggleSPDenormMode(false, B, ST, Mode);
+ }
auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S32})
.addUse(Fma4.getReg(0))
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 998904bf08820c0..6f7cec17ab854e4 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -9589,28 +9589,44 @@ SDValue SITargetLowering::LowerFDIV32(SDValue Op, SelectionDAG &DAG) const {
const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
const DenormalMode DenormMode = Info->getMode().FP32Denormals;
- const bool HasFP32Denormals = DenormMode != DenormalMode::getPreserveSign();
+ const bool PreservesDenormals = DenormMode == DenormalMode::getIEEE();
+ const bool HasDynamicDenormals =
+ (DenormMode.Input == DenormalMode::Dynamic) ||
+ (DenormMode.Output == DenormalMode::Dynamic);
- if (!HasFP32Denormals) {
+ SDValue SavedDenormMode;
+
+ if (!PreservesDenormals) {
// Note we can't use the STRICT_FMA/STRICT_FMUL for the non-strict FDIV
// lowering. The chain dependence is insufficient, and we need glue. We do
// not need the glue variants in a strictfp function.
SDVTList BindParamVTs = DAG.getVTList(MVT::Other, MVT::Glue);
+ SDValue Glue = DAG.getEntryNode();
+ if (HasDynamicDenormals) {
+ SDNode *GetReg = DAG.getMachineNode(AMDGPU::S_GETREG_B32, SL,
+ DAG.getVTList(MVT::i32, MVT::Glue),
+ {BitField, Glue});
+ SavedDenormMode = SDValue(GetReg, 0);
+
+ Glue = DAG.getMergeValues(
+ {DAG.getEntryNode(), SDValue(GetReg, 0), SDValue(GetReg, 1)}, SL);
+ }
+
SDNode *EnableDenorm;
if (Subtarget->hasDenormModeInst()) {
const SDValue EnableDenormValue =
getSPDenormModeValue(FP_DENORM_FLUSH_NONE, DAG, Info, Subtarget);
- EnableDenorm = DAG.getNode(AMDGPUISD::DENORM_MODE, SL, BindParamVTs,
- DAG.getEntryNode(), EnableDenormValue).getNode();
+ EnableDenorm = DAG.getNode(AMDGPUISD::DENORM_MODE, SL, BindParamVTs, Glue,
+ EnableDenormValue)
+ .getNode();
} else {
const SDValue EnableDenormValue = DAG.getConstant(FP_DENORM_FLUSH_NONE,
SL, MVT::i32);
- EnableDenorm =
- DAG.getMachineNode(AMDGPU::S_SETREG_B32, SL, BindParamVTs,
- {EnableDenormValue, BitField, DAG.getEntryNode()});
+ EnableDenorm = DAG.getMachineNode(AMDGPU::S_SETREG_B32, SL, BindParamVTs,
+ {EnableDenormValue, BitField, Glue});
}
SDValue Ops[3] = {
@@ -9640,12 +9656,12 @@ SDValue SITargetLowering::LowerFDIV32(SDValue Op, SelectionDAG &DAG) const {
SDValue Fma4 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, NegDivScale0, Fma3,
NumeratorScaled, Fma3, Flags);
- if (!HasFP32Denormals) {
+ if (!PreservesDenormals) {
// FIXME: This mishandles dynamic denormal mode. We need to query the
// current mode and restore the original.
SDNode *DisableDenorm;
- if (Subtarget->hasDenormModeInst()) {
+ if (!HasDynamicDenormals && Subtarget->hasDenormModeInst()) {
const SDValue DisableDenormValue = getSPDenormModeValue(
FP_DENORM_FLUSH_IN_FLUSH_OUT, DAG, Info, Subtarget);
@@ -9653,8 +9669,11 @@ SDValue SITargetLowering::LowerFDIV32(SDValue Op, SelectionDAG &DAG) const {
Fma4.getValue(1), DisableDenormValue,
Fma4.getValue(2)).getNode();
} else {
+ assert(HasDynamicDenormals == (bool)SavedDenormMode);
const SDValue DisableDenormValue =
- DAG.getConstant(FP_DENORM_FLUSH_IN_FLUSH_OUT, SL, MVT::i32);
+ HasDynamicDenormals
+ ? SavedDenormMode
+ : DAG.getConstant(FP_DENORM_FLUSH_IN_FLUSH_OUT, SL, MVT::i32);
DisableDenorm = DAG.getMachineNode(
AMDGPU::S_SETREG_B32, SL, MVT::Other,
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f32.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f32.ll
index 55b8dd68920975f..9da75b093fc9cb5 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f32.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f32.ll
@@ -209,12 +209,15 @@ define float @v_fdiv_f32_dynamic_denorm(float %a, float %b) #0 {
; GFX6-IEEE-FASTFMA-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, v0
; GFX6-IEEE-FASTFMA-NEXT: v_rcp_f32_e32 v3, v2
; GFX6-IEEE-FASTFMA-NEXT: v_div_scale_f32 v4, vcc, v0, v1, v0
+; GFX6-IEEE-FASTFMA-NEXT: s_getreg_b32 s4, hwreg(HW_REG_MODE, 4, 2)
+; GFX6-IEEE-FASTFMA-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
; GFX6-IEEE-FASTFMA-NEXT: v_fma_f32 v5, -v2, v3, 1.0
; GFX6-IEEE-FASTFMA-NEXT: v_fma_f32 v3, v5, v3, v3
; GFX6-IEEE-FASTFMA-NEXT: v_mul_f32_e32 v5, v4, v3
; GFX6-IEEE-FASTFMA-NEXT: v_fma_f32 v6, -v2, v5, v4
; GFX6-IEEE-FASTFMA-NEXT: v_fma_f32 v5, v6, v3, v5
; GFX6-IEEE-FASTFMA-NEXT: v_fma_f32 v2, -v2, v5, v4
+; GFX6-IEEE-FASTFMA-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s4
; GFX6-IEEE-FASTFMA-NEXT: v_div_fmas_f32 v2, v2, v3, v5
; GFX6-IEEE-FASTFMA-NEXT: v_div_fixup_f32 v0, v2, v1, v0
; GFX6-IEEE-FASTFMA-NEXT: s_setpc_b64 s[30:31]
@@ -225,12 +228,15 @@ define float @v_fdiv_f32_dynamic_denorm(float %a, float %b) #0 {
; GFX6-FLUSH-FASTFMA-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, v0
; GFX6-FLUSH-FASTFMA-NEXT: v_rcp_f32_e32 v3, v2
; GFX6-FLUSH-FASTFMA-NEXT: v_div_scale_f32 v4, vcc, v0, v1, v0
+; GFX6-FLUSH-FASTFMA-NEXT: s_getreg_b32 s4, hwreg(HW_REG_MODE, 4, 2)
+; GFX6-FLUSH-FASTFMA-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
; GFX6-FLUSH-FASTFMA-NEXT: v_fma_f32 v5, -v2, v3, 1.0
; GFX6-FLUSH-FASTFMA-NEXT: v_fma_f32 v3, v5, v3, v3
; GFX6-FLUSH-FASTFMA-NEXT: v_mul_f32_e32 v5, v4, v3
; GFX6-FLUSH-FASTFMA-NEXT: v_fma_f32 v6, -v2, v5, v4
; GFX6-FLUSH-FASTFMA-NEXT: v_fma_f32 v5, v6, v3, v5
; GFX6-FLUSH-FASTFMA-NEXT: v_fma_f32 v2, -v2, v5, v4
+; GFX6-FLUSH-FASTFMA-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s4
; GFX6-FLUSH-FASTFMA-NEXT: v_div_fmas_f32 v2, v2, v3, v5
; GFX6-FLUSH-FASTFMA-NEXT: v_div_fixup_f32 v0, v2, v1, v0
; GFX6-FLUSH-FASTFMA-NEXT: s_setpc_b64 s[30:31]
@@ -240,13 +246,16 @@ define float @v_fdiv_f32_dynamic_denorm(float %a, float %b) #0 {
; GFX6-IEEE-SLOWFMA-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-IEEE-SLOWFMA-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, v0
; GFX6-IEEE-SLOWFMA-NEXT: v_div_scale_f32 v3, vcc, v0, v1, v0
+; GFX6-IEEE-SLOWFMA-NEXT: s_getreg_b32 s4, hwreg(HW_REG_MODE, 4, 2)
; GFX6-IEEE-SLOWFMA-NEXT: v_rcp_f32_e32 v4, v2
+; GFX6-IEEE-SLOWFMA-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
; GFX6-IEEE-SLOWFMA-NEXT: v_fma_f32 v5, -v2, v4, 1.0
; GFX6-IEEE-SLOWFMA-NEXT: v_fma_f32 v4, v5, v4, v4
; GFX6-IEEE-SLOWFMA-NEXT: v_mul_f32_e32 v5, v3, v4
; GFX6-IEEE-SLOWFMA-NEXT: v_fma_f32 v6, -v2, v5, v3
; GFX6-IEEE-SLOWFMA-NEXT: v_fma_f32 v5, v6, v4, v5
; GFX6-IEEE-SLOWFMA-NEXT: v_fma_f32 v2, -v2, v5, v3
+; GFX6-IEEE-SLOWFMA-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s4
; GFX6-IEEE-SLOWFMA-NEXT: v_div_fmas_f32 v2, v2, v4, v5
; GFX6-IEEE-SLOWFMA-NEXT: v_div_fixup_f32 v0, v2, v1, v0
; GFX6-IEEE-SLOWFMA-NEXT: s_setpc_b64 s[30:31]
@@ -256,13 +265,16 @@ define float @v_fdiv_f32_dynamic_denorm(float %a, float %b) #0 {
; GFX6-FLUSH-SLOWFMA-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-FLUSH-SLOWFMA-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, v0
; GFX6-FLUSH-SLOWFMA-NEXT: v_div_scale_f32 v3, vcc, v0, v1, v0
+; GFX6-FLUSH-SLOWFMA-NEXT: s_getreg_b32 s4, hwreg(HW_REG_MODE, 4, 2)
; GFX6-FLUSH-SLOWFMA-NEXT: v_rcp_f32_e32 v4, v2
+; GFX6-FLUSH-SLOWFMA-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
; GFX6-FLUSH-SLOWFMA-NEXT: v_fma_f32 v5, -v2, v4, 1.0
; GFX6-FLUSH-SLOWFMA-NEXT: v_fma_f32 v4, v5, v4, v4
; GFX6-FLUSH-SLOWFMA-NEXT: v_mul_f32_e32 v5, v3, v4
; GFX6-FLUSH-SLOWFMA-NEXT: v_fma_f32 v6, -v2, v5, v3
; GFX6-FLUSH-SLOWFMA-NEXT: v_fma_f32 v5, v6, v4, v5
; GFX6-FLUSH-SLOWFMA-NEXT: v_fma_f32 v2, -v2, v5, v3
+; GFX6-FLUSH-SLOWFMA-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s4
; GFX6-FLUSH-SLOWFMA-NEXT: v_div_fmas_f32 v2, v2, v4, v5
; GFX6-FLUSH-SLOWFMA-NEXT: v_div_fixup_f32 v0, v2, v1, v0
; GFX6-FLUSH-SLOWFMA-NEXT: s_setpc_b64 s[30:31]
@@ -272,13 +284,16 @@ define float @v_fdiv_f32_dynamic_denorm(float %a, float %b) #0 {
; GFX89-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX89-IEEE-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, v0
; GFX89-IEEE-NEXT: v_div_scale_f32 v3, vcc, v0, v1, v0
+; GFX89-IEEE-NEXT: s_getreg_b32 s4, hwreg(HW_REG_MODE, 4, 2)
; GFX89-IEEE-NEXT: v_rcp_f32_e32 v4, v2
+; GFX89-IEEE-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
; GFX89-IEEE-NEXT: v_fma_f32 v5, -v2, v4, 1.0
; GFX89-IEEE-NEXT: v_fma_f32 v4, v5, v4, v4
; GFX89-IEEE-NEXT: v_mul_f32_e32 v5, v3, v4
; GFX89-IEEE-NEXT: v_fma_f32 v6, -v2, v5, v3
; GFX89-IEEE-NEXT: v_fma_f32 v5, v6, v4, v5
; GFX89-IEEE-NEXT: v_fma_f32 v2, -v2, v5, v3
+; GFX89-IEEE-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s4
; GFX89-IEEE-NEXT: v_div_fmas_f32 v2, v2, v4, v5
; GFX89-IEEE-NEXT: v_div_fixup_f32 v0, v2, v1, v0
; GFX89-IEEE-NEXT: s_setpc_b64 s[30:31]
@@ -288,53 +303,105 @@ define float @v_fdiv_f32_dynamic_denorm(float %a, float %b) #0 {
; GFX89-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX89-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, v0
; GFX89-FLUSH-NEXT: v_div_scale_f32 v3, vcc, v0, v1, v0
+; GFX89-FLUSH-NEXT: s_getreg_b32 s4, hwreg(HW_REG_MODE, 4, 2)
; GFX89-FLUSH-NEXT: v_rcp_f32_e32 v4, v2
+; GFX89-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
; GFX89-FLUSH-NEXT: v_fma_f32 v5, -v2, v4, 1.0
; GFX89-FLUSH-NEXT: v_fma_f32 v4, v5, v4, v4
; GFX89-FLUSH-NEXT: v_mul_f32_e32 v5, v3, v4
; GFX89-FLUSH-NEXT: v_fma_f32 v6, -v2, v5, v3
; GFX89-FLUSH-NEXT: v_fma_f32 v5, v6, v4, v5
; GFX89-FLUSH-NEXT: v_fma_f32 v2, -v2, v5, v3
+; GFX89-FLUSH-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s4
; GFX89-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v4, v5
; GFX89-FLUSH-NEXT: v_div_fixup_f32 v0, v2, v1, v0
; GFX89-FLUSH-NEXT: s_setpc_b64 s[30:31]
;
-; GFX10-LABEL: v_fdiv_f32_dynamic_denorm:
-; GFX10: ; %bb.0:
-; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_div_scale_f32 v2, s4, v1, v1, v0
-; GFX10-NEXT: v_div_scale_f32 v5, vcc_lo, v0, v1, v0
-; GFX10-NEXT: v_rcp_f32_e32 v3, v2
-; GFX10-NEXT: v_fma_f32 v4, -v2, v3, 1.0
-; GFX10-NEXT: v_fmac_f32_e32 v3, v4, v3
-; GFX10-NEXT: v_mul_f32_e32 v4, v5, v3
-; GFX10-NEXT: v_fma_f32 v6, -v2, v4, v5
-; GFX10-NEXT: v_fmac_f32_e32 v4, v6, v3
-; GFX10-NEXT: v_fma_f32 v2, -v2, v4, v5
-; GFX10-NEXT: v_div_fmas_f32 v2, v2, v3, v4
-; GFX10-NEXT: v_div_fixup_f32 v0, v2, v1, v0
-; GFX10-NEXT: s_setpc_b64 s[30:31]
+; GFX10-IEEE-LABEL: v_fdiv_f32_dynamic_denorm:
+; GFX10-IEEE: ; %bb.0:
+; GFX10-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-IEEE-NEXT: v_div_scale_f32 v2, s4, v1, v1, v0
+; GFX10-IEEE-NEXT: v_div_scale_f32 v4, vcc_lo, v0, v1, v0
+; GFX10-IEEE-NEXT: s_getreg_b32 s4, hwreg(HW_REG_MODE, 4, 2)
+; GFX10-IEEE-NEXT: v_rcp_f32_e32 v3, v2
+; GFX10-IEEE-NEXT: s_denorm_mode 15
+; GFX10-IEEE-NEXT: v_fma_f32 v5, -v2, v3, 1.0
+; GFX10-IEEE-NEXT: v_fmac_f32_e32 v3, v5, v3
+; GFX10-IEEE-NEXT: v_mul_f32_e32 v5, v4, v3
+; GFX10-IEEE-NEXT: v_fma_f32 v6, -v2, v5, v4
+; GFX10-IEEE-NEXT: v_fmac_f32_e32 v5, v6, v3
+; GFX10-IEEE-NEXT: v_fma_f32 v2, -v2, v5, v4
+; GFX10-IEEE-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s4
+; GFX10-IEEE-NEXT: v_div_fmas_f32 v2, v2, v3, v5
+; GFX10-IEEE-NEXT: v_div_fixup_f32 v0, v2, v1, v0
+; GFX10-IEEE-NEXT: s_setpc_b64 s[30:31]
;
-; GFX11-LABEL: v_fdiv_f32_dynamic_denorm:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_div_scale_f32 v2, null, v1, v1, v0
-; GFX11-NEXT: v_div_scale_f32 v5, vcc_lo, v0, v1, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_rcp_f32_e32 v3, v2
-; GFX11-NEXT: s_waitcnt_depctr 0xfff
-; GFX11-NEXT: v_fma_f32 v4, -v2, v3, 1.0
-; GFX11-NEXT: v_fmac_f32_e32 v3, v4, v3
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_mul_f32_e32 v4, v5, v3
-; GFX11-NEXT: v_fma_f32 v6, -v2, v4, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_fmac_f32_e32 v4, v6, v3
-; GFX11-NEXT: v_fma_f32 v2, -v2, v4, v5
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_div_fmas_f32 v2, v2, v3, v4
-; GFX11-NEXT: v_div_fixup_f32 v0, v2, v1, v0
-; GFX11-NEXT: s_setpc_b64 s[30:31]
+; GFX10-FLUSH-LABEL: v_fdiv_f32_dynamic_denorm:
+; GFX10-FLUSH: ; %bb.0:
+; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-FLUSH-NEXT: v_div_scale_f32 v2, s4, v1, v1, v0
+; GFX10-FLUSH-NEXT: v_div_scale_f32 v4, vcc_lo, v0, v1, v0
+; GFX10-FLUSH-NEXT: s_getreg_b32 s4, hwreg(HW_REG_MODE, 4, 2)
+; GFX10-FLUSH-NEXT: v_rcp_f32_e32 v3, v2
+; GFX10-FLUSH-NEXT: s_denorm_mode 3
+; GFX10-FLUSH-NEXT: v_fma_f32 v5, -v2, v3, 1.0
+; GFX10-FLUSH-NEXT: v_fmac_f32_e32 v3, v5, v3
+; GFX10-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v3
+; GFX10-FLUSH-NEXT: v_fma_f32 v6, -v2, v5, v4
+; GFX10-FLUSH-NEXT: v_fmac_f32_e32 v5, v6, v3
+; GFX10-FLUSH-NEXT: v_fma_f32 v2, -v2, v5, v4
+; GFX10-FLUSH-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s4
+; GFX10-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v5
+; GFX10-FLUSH-NEXT: v_div_fixup_f32 v0, v2, v1, v0
+; GFX10-FLUSH-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-IEEE-LABEL: v_fdiv_f32_dynamic_denorm:
+; GFX11-IEEE: ; %bb.0:
+; GFX11-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-IEEE-NEXT: v_div_scale_f32 v2, null, v1, v1, v0
+; GFX11-IEEE-NEXT: v_div_scale_f32 v4, vcc_lo, v0, v1, v0
+; GFX11-IEEE-NEXT: s_getreg_b32 s0, hwreg(HW_REG_MODE, 4, 2)
+; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX11-IEEE-NEXT: v_rcp_f32_e32 v3, v2
+; GFX11-IEEE-NEXT: s_denorm_mode 15
+; GFX11-IEEE-NEXT: s_waitcnt_depctr 0xfff
+; GFX11-IEEE-NEXT: v_fma_f32 v5, -v2, v3, 1.0
+; GFX11-IEEE-NEXT: v_fmac_f32_e32 v3, v5, v3
+; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-IEEE-NEXT: v_mul_f32_e32 v5, v4, v3
+; GFX11-IEEE-NEXT: v_fma_f32 v6, -v2, v5, v4
+; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-IEEE-NEXT: v_fmac_f32_e32 v5, v6, v3
+; GFX11-IEEE-NEXT: v_fma_f32 v2, -v2, v5, v4
+; GFX11-IEEE-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s0
+; GFX11-IEEE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-IEEE-NEXT: v_div_fmas_f32 v2, v2, v3, v5
+; GFX11-IEEE-NEXT: v_div_fixup_f32 v0, v2, v1, v0
+; GFX11-IEEE-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-FLUSH-LABEL: v_fdiv_f32_dynamic_denorm:
+; GFX11-FLUSH: ; %bb.0:
+; GFX11-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-FLUSH-NEXT: v_div_scale_f32 v2, null, v1, v1, v0
+; GFX11-FLUSH-NEXT: v_div_scale_f32 v4, vcc_lo, v0, v1, v0
+; GFX11-FLUSH-NEXT: s_getreg_b32 s0, hwreg(HW_REG_MODE, 4, 2)
+; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX11-FLUSH-NEXT: v_rcp_f32_e32 v3, v2
+; GFX11-FLUSH-NEXT: s_denorm_mode 3
+; GFX11-FLUSH-NEXT: s_waitcnt_depctr 0xfff
+; GFX11-FLUSH-NEXT: v_fma_f32 v5, -v2, v3, 1.0
+; GFX11-FLUSH-NEXT: v_fmac_f32_e32 v3, v5, v3
+; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v3
+; GFX11-FLUSH-NEXT: v_fma_f32 v6, -v2, v5, v4
+; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FLUSH-NEXT: v_fmac_f32_e32 v5, v6, v3
+; GFX11-FLUSH-NEXT: v_fma_f32 v2, -v2, v5, v4
+; GFX11-FLUSH-NEXT: s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s0
+; GFX11-FLUSH-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v5
+; GFX11-FLUSH-NEXT: v_div_fixup_f32 v0, v2, v1, v0
+; GFX11-FLUSH-NEXT: s_setpc_b64 s[30:31]
%fdiv = fdiv f...
<truncated>
</pre>
</details>
https://github.com/llvm/llvm-project/pull/66346
More information about the llvm-commits
mailing list