[llvm-branch-commits] [llvm] ab3a3f5 - AMDGPU/GlobalISel: Update fdiv lowering for denormal/ulp interaction
Matt Arsenault via llvm-branch-commits
llvm-branch-commits at lists.llvm.org
Wed Jan 6 09:36:45 PST 2021
Author: Matt Arsenault
Date: 2021-01-06T12:32:01-05:00
New Revision: ab3a3f543b18d36cec98faa9ca2a68cc9a6ecc65
URL: https://github.com/llvm/llvm-project/commit/ab3a3f543b18d36cec98faa9ca2a68cc9a6ecc65
DIFF: https://github.com/llvm/llvm-project/commit/ab3a3f543b18d36cec98faa9ca2a68cc9a6ecc65.diff
LOG: AMDGPU/GlobalISel: Update fdiv lowering for denormal/ulp interaction
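Change the GlobalISel fast fdiv handling to match the changes in
2531535984ad989ce88aeee23cb92a827da6686e and
884acbb9e167d5668e43581630239d688edec8ad: the rcp-based fast path is now
taken only when UnsafeFPMath is set or the instruction carries the afn flag
(previously arcp), and the separate f64 and f32-denormal-mode early-outs are
replaced by that single check.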
Added:
Modified:
llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f16.ll
llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f32.ll
llvm/test/CodeGen/AMDGPU/GlobalISel/frem.ll
llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fdiv.mir
Removed:
################################################################################
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
index 8c733a2afa03..a8e6f27e032b 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@@ -3053,22 +3053,14 @@ bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI,
Register Res = MI.getOperand(0).getReg();
Register LHS = MI.getOperand(1).getReg();
Register RHS = MI.getOperand(2).getReg();
-
uint16_t Flags = MI.getFlags();
-
LLT ResTy = MRI.getType(Res);
- LLT S32 = LLT::scalar(32);
- LLT S64 = LLT::scalar(64);
const MachineFunction &MF = B.getMF();
- bool Unsafe =
- MF.getTarget().Options.UnsafeFPMath || MI.getFlag(MachineInstr::FmArcp);
+ bool AllowInaccurateRcp = MF.getTarget().Options.UnsafeFPMath ||
+ MI.getFlag(MachineInstr::FmAfn);
- if (!MF.getTarget().Options.UnsafeFPMath && ResTy == S64)
- return false;
-
- if (!Unsafe && ResTy == S32 &&
- MF.getInfo<SIMachineFunctionInfo>()->getMode().allFP32Denormals())
+ if (!AllowInaccurateRcp)
return false;
if (auto CLHS = getConstantFPVRegVal(LHS, MRI)) {
@@ -3095,17 +3087,13 @@ bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI,
}
// x / y -> x * (1.0 / y)
- if (Unsafe) {
- auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false)
- .addUse(RHS)
- .setMIFlags(Flags);
- B.buildFMul(Res, LHS, RCP, Flags);
-
- MI.eraseFromParent();
- return true;
- }
+ auto RCP = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {ResTy}, false)
+ .addUse(RHS)
+ .setMIFlags(Flags);
+ B.buildFMul(Res, LHS, RCP, Flags);
- return false;
+ MI.eraseFromParent();
+ return true;
}
bool AMDGPULegalizerInfo::legalizeFDIV16(MachineInstr &MI,
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f16.ll
index 7775789bd0d2..c7b9b4f60bc6 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f16.ll
@@ -66,7 +66,28 @@ define half @v_fdiv_f16(half %a, half %b) {
}
define half @v_fdiv_f16_afn(half %a, half %b) {
-; GFX6-IEEE-LABEL: v_fdiv_f16_afn:
+; GFX6-LABEL: v_fdiv_f16_afn:
+; GFX6: ; %bb.0:
+; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1
+; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX6-NEXT: v_rcp_f32_e32 v1, v1
+; GFX6-NEXT: v_mul_f32_e32 v0, v0, v1
+; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX6-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX89-LABEL: v_fdiv_f16_afn:
+; GFX89: ; %bb.0:
+; GFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX89-NEXT: v_rcp_f16_e32 v1, v1
+; GFX89-NEXT: v_mul_f16_e32 v0, v0, v1
+; GFX89-NEXT: s_setpc_b64 s[30:31]
+ %fdiv = fdiv afn half %a, %b
+ ret half %fdiv
+}
+
+define half @v_fdiv_f16_ulp25(half %a, half %b) {
+; GFX6-IEEE-LABEL: v_fdiv_f16_ulp25:
; GFX6-IEEE: ; %bb.0:
; GFX6-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, v0
@@ -85,7 +106,7 @@ define half @v_fdiv_f16_afn(half %a, half %b) {
; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-IEEE-NEXT: s_setpc_b64 s[30:31]
;
-; GFX6-FLUSH-LABEL: v_fdiv_f16_afn:
+; GFX6-FLUSH-LABEL: v_fdiv_f16_ulp25:
; GFX6-FLUSH: ; %bb.0:
; GFX6-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0
@@ -107,25 +128,29 @@ define half @v_fdiv_f16_afn(half %a, half %b) {
; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-FLUSH-NEXT: s_setpc_b64 s[30:31]
;
-; GFX89-LABEL: v_fdiv_f16_afn:
+; GFX89-LABEL: v_fdiv_f16_ulp25:
; GFX89: ; %bb.0:
; GFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX89-NEXT: v_rcp_f16_e32 v1, v1
-; GFX89-NEXT: v_mul_f16_e32 v0, v0, v1
+; GFX89-NEXT: v_cvt_f32_f16_e32 v2, v1
+; GFX89-NEXT: v_cvt_f32_f16_e32 v3, v0
+; GFX89-NEXT: v_rcp_f32_e32 v2, v2
+; GFX89-NEXT: v_mul_f32_e32 v2, v3, v2
+; GFX89-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GFX89-NEXT: v_div_fixup_f16 v0, v2, v1, v0
; GFX89-NEXT: s_setpc_b64 s[30:31]
- %fdiv = fdiv afn half %a, %b
+ %fdiv = fdiv half %a, %b, !fpmath !0
ret half %fdiv
}
-define half @v_fdiv_f16_ulp25(half %a, half %b) {
-; GFX6-IEEE-LABEL: v_fdiv_f16_ulp25:
+define half @v_rcp_f16(half %x) {
+; GFX6-IEEE-LABEL: v_rcp_f16:
; GFX6-IEEE: ; %bb.0:
; GFX6-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, 1.0
; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, v1
-; GFX6-IEEE-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, v0
+; GFX6-IEEE-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, v1
; GFX6-IEEE-NEXT: v_rcp_f32_e32 v3, v2
-; GFX6-IEEE-NEXT: v_div_scale_f32 v4, vcc, v0, v1, v0
+; GFX6-IEEE-NEXT: v_div_scale_f32 v4, vcc, v1, v0, v1
; GFX6-IEEE-NEXT: v_fma_f32 v5, -v2, v3, 1.0
; GFX6-IEEE-NEXT: v_fma_f32 v3, v5, v3, v3
; GFX6-IEEE-NEXT: v_mul_f32_e32 v5, v4, v3
@@ -133,18 +158,18 @@ define half @v_fdiv_f16_ulp25(half %a, half %b) {
; GFX6-IEEE-NEXT: v_fma_f32 v5, v6, v3, v5
; GFX6-IEEE-NEXT: v_fma_f32 v2, -v2, v5, v4
; GFX6-IEEE-NEXT: v_div_fmas_f32 v2, v2, v3, v5
-; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v2, v1, v0
+; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v2, v0, v1
; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-IEEE-NEXT: s_setpc_b64 s[30:31]
;
-; GFX6-FLUSH-LABEL: v_fdiv_f16_ulp25:
+; GFX6-FLUSH-LABEL: v_rcp_f16:
; GFX6-FLUSH: ; %bb.0:
; GFX6-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, 1.0
; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, v1
-; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, v0
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, v1
; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v3, v2
-; GFX6-FLUSH-NEXT: v_div_scale_f32 v4, vcc, v0, v1, v0
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v4, vcc, v1, v0, v1
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v2, v3, 1.0
; GFX6-FLUSH-NEXT: v_fma_f32 v3, v5, v3, v3
@@ -154,27 +179,27 @@ define half @v_fdiv_f16_ulp25(half %a, half %b) {
; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v5, v4
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v5
-; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v2, v1, v0
+; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v2, v0, v1
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-FLUSH-NEXT: s_setpc_b64 s[30:31]
;
-; GFX89-LABEL: v_fdiv_f16_ulp25:
+; GFX89-LABEL: v_rcp_f16:
; GFX89: ; %bb.0:
; GFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX89-NEXT: v_cvt_f32_f16_e32 v2, v1
-; GFX89-NEXT: v_cvt_f32_f16_e32 v3, v0
-; GFX89-NEXT: v_rcp_f32_e32 v2, v2
-; GFX89-NEXT: v_mul_f32_e32 v2, v3, v2
-; GFX89-NEXT: v_cvt_f16_f32_e32 v2, v2
-; GFX89-NEXT: v_div_fixup_f16 v0, v2, v1, v0
+; GFX89-NEXT: v_cvt_f32_f16_e32 v1, v0
+; GFX89-NEXT: v_cvt_f32_f16_e32 v2, 1.0
+; GFX89-NEXT: v_rcp_f32_e32 v1, v1
+; GFX89-NEXT: v_mul_f32_e32 v1, v2, v1
+; GFX89-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GFX89-NEXT: v_div_fixup_f16 v0, v1, v0, 1.0
; GFX89-NEXT: s_setpc_b64 s[30:31]
- %fdiv = fdiv half %a, %b, !fpmath !0
+ %fdiv = fdiv half 1.0, %x
ret half %fdiv
}
-define half @v_rcp_f16(half %x) {
-; GFX6-IEEE-LABEL: v_rcp_f16:
+define half @v_rcp_f16_arcp(half %x) {
+; GFX6-IEEE-LABEL: v_rcp_f16_arcp:
; GFX6-IEEE: ; %bb.0:
; GFX6-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, 1.0
@@ -193,7 +218,7 @@ define half @v_rcp_f16(half %x) {
; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-IEEE-NEXT: s_setpc_b64 s[30:31]
;
-; GFX6-FLUSH-LABEL: v_rcp_f16:
+; GFX6-FLUSH-LABEL: v_rcp_f16_arcp:
; GFX6-FLUSH: ; %bb.0:
; GFX6-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, 1.0
@@ -215,30 +240,15 @@ define half @v_rcp_f16(half %x) {
; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-FLUSH-NEXT: s_setpc_b64 s[30:31]
;
-; GFX89-LABEL: v_rcp_f16:
-; GFX89: ; %bb.0:
-; GFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX89-NEXT: v_rcp_f16_e32 v0, v0
-; GFX89-NEXT: s_setpc_b64 s[30:31]
- %fdiv = fdiv half 1.0, %x
- ret half %fdiv
-}
-
-define half @v_rcp_f16_arcp(half %x) {
-; GFX6-LABEL: v_rcp_f16_arcp:
-; GFX6: ; %bb.0:
-; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX6-NEXT: v_cvt_f32_f16_e32 v1, 1.0
-; GFX6-NEXT: v_rcp_f32_e32 v0, v0
-; GFX6-NEXT: v_mul_f32_e32 v0, v1, v0
-; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX6-NEXT: s_setpc_b64 s[30:31]
-;
; GFX89-LABEL: v_rcp_f16_arcp:
; GFX89: ; %bb.0:
; GFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX89-NEXT: v_rcp_f16_e32 v0, v0
+; GFX89-NEXT: v_cvt_f32_f16_e32 v1, v0
+; GFX89-NEXT: v_cvt_f32_f16_e32 v2, 1.0
+; GFX89-NEXT: v_rcp_f32_e32 v1, v1
+; GFX89-NEXT: v_mul_f32_e32 v1, v2, v1
+; GFX89-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GFX89-NEXT: v_div_fixup_f16 v0, v1, v0, 1.0
; GFX89-NEXT: s_setpc_b64 s[30:31]
%fdiv = fdiv arcp half 1.0, %x
ret half %fdiv
@@ -316,7 +326,28 @@ define half @v_rcp_f16_ulp25(half %x) {
}
define half @v_fdiv_f16_afn_ulp25(half %a, half %b) {
-; GFX6-IEEE-LABEL: v_fdiv_f16_afn_ulp25:
+; GFX6-LABEL: v_fdiv_f16_afn_ulp25:
+; GFX6: ; %bb.0:
+; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1
+; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX6-NEXT: v_rcp_f32_e32 v1, v1
+; GFX6-NEXT: v_mul_f32_e32 v0, v0, v1
+; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX6-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX89-LABEL: v_fdiv_f16_afn_ulp25:
+; GFX89: ; %bb.0:
+; GFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX89-NEXT: v_rcp_f16_e32 v1, v1
+; GFX89-NEXT: v_mul_f16_e32 v0, v0, v1
+; GFX89-NEXT: s_setpc_b64 s[30:31]
+ %fdiv = fdiv afn half %a, %b, !fpmath !0
+ ret half %fdiv
+}
+
+define half @v_fdiv_f16_arcp_ulp25(half %a, half %b) {
+; GFX6-IEEE-LABEL: v_fdiv_f16_arcp_ulp25:
; GFX6-IEEE: ; %bb.0:
; GFX6-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, v0
@@ -335,7 +366,7 @@ define half @v_fdiv_f16_afn_ulp25(half %a, half %b) {
; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-IEEE-NEXT: s_setpc_b64 s[30:31]
;
-; GFX6-FLUSH-LABEL: v_fdiv_f16_afn_ulp25:
+; GFX6-FLUSH-LABEL: v_fdiv_f16_arcp_ulp25:
; GFX6-FLUSH: ; %bb.0:
; GFX6-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0
@@ -357,32 +388,15 @@ define half @v_fdiv_f16_afn_ulp25(half %a, half %b) {
; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-FLUSH-NEXT: s_setpc_b64 s[30:31]
;
-; GFX89-LABEL: v_fdiv_f16_afn_ulp25:
-; GFX89: ; %bb.0:
-; GFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX89-NEXT: v_rcp_f16_e32 v1, v1
-; GFX89-NEXT: v_mul_f16_e32 v0, v0, v1
-; GFX89-NEXT: s_setpc_b64 s[30:31]
- %fdiv = fdiv afn half %a, %b, !fpmath !0
- ret half %fdiv
-}
-
-define half @v_fdiv_f16_arcp_ulp25(half %a, half %b) {
-; GFX6-LABEL: v_fdiv_f16_arcp_ulp25:
-; GFX6: ; %bb.0:
-; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1
-; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX6-NEXT: v_rcp_f32_e32 v1, v1
-; GFX6-NEXT: v_mul_f32_e32 v0, v0, v1
-; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX6-NEXT: s_setpc_b64 s[30:31]
-;
; GFX89-LABEL: v_fdiv_f16_arcp_ulp25:
; GFX89: ; %bb.0:
; GFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX89-NEXT: v_rcp_f16_e32 v1, v1
-; GFX89-NEXT: v_mul_f16_e32 v0, v0, v1
+; GFX89-NEXT: v_cvt_f32_f16_e32 v2, v1
+; GFX89-NEXT: v_cvt_f32_f16_e32 v3, v0
+; GFX89-NEXT: v_rcp_f32_e32 v2, v2
+; GFX89-NEXT: v_mul_f32_e32 v2, v3, v2
+; GFX89-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GFX89-NEXT: v_div_fixup_f16 v0, v2, v1, v0
; GFX89-NEXT: s_setpc_b64 s[30:31]
%fdiv = fdiv arcp half %a, %b, !fpmath !0
ret half %fdiv
@@ -508,76 +522,20 @@ define <2 x half> @v_fdiv_v2f16(<2 x half> %a, <2 x half> %b) {
}
define <2 x half> @v_fdiv_v2f16_afn(<2 x half> %a, <2 x half> %b) {
-; GFX6-IEEE-LABEL: v_fdiv_v2f16_afn:
-; GFX6-IEEE: ; %bb.0:
-; GFX6-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v2, v2
-; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, v1
-; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v3, v3
-; GFX6-IEEE-NEXT: v_div_scale_f32 v4, s[4:5], v2, v2, v0
-; GFX6-IEEE-NEXT: v_rcp_f32_e32 v5, v4
-; GFX6-IEEE-NEXT: v_div_scale_f32 v6, vcc, v0, v2, v0
-; GFX6-IEEE-NEXT: v_fma_f32 v7, -v4, v5, 1.0
-; GFX6-IEEE-NEXT: v_fma_f32 v5, v7, v5, v5
-; GFX6-IEEE-NEXT: v_mul_f32_e32 v7, v6, v5
-; GFX6-IEEE-NEXT: v_fma_f32 v8, -v4, v7, v6
-; GFX6-IEEE-NEXT: v_fma_f32 v7, v8, v5, v7
-; GFX6-IEEE-NEXT: v_fma_f32 v4, -v4, v7, v6
-; GFX6-IEEE-NEXT: v_div_fmas_f32 v4, v4, v5, v7
-; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v4, v2, v0
-; GFX6-IEEE-NEXT: v_div_scale_f32 v2, s[4:5], v3, v3, v1
-; GFX6-IEEE-NEXT: v_rcp_f32_e32 v4, v2
-; GFX6-IEEE-NEXT: v_div_scale_f32 v5, vcc, v1, v3, v1
-; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX6-IEEE-NEXT: v_fma_f32 v6, -v2, v4, 1.0
-; GFX6-IEEE-NEXT: v_fma_f32 v4, v6, v4, v4
-; GFX6-IEEE-NEXT: v_mul_f32_e32 v6, v5, v4
-; GFX6-IEEE-NEXT: v_fma_f32 v7, -v2, v6, v5
-; GFX6-IEEE-NEXT: v_fma_f32 v6, v7, v4, v6
-; GFX6-IEEE-NEXT: v_fma_f32 v2, -v2, v6, v5
-; GFX6-IEEE-NEXT: v_div_fmas_f32 v2, v2, v4, v6
-; GFX6-IEEE-NEXT: v_div_fixup_f32 v1, v2, v3, v1
-; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v1, v1
-; GFX6-IEEE-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX6-FLUSH-LABEL: v_fdiv_v2f16_afn:
-; GFX6-FLUSH: ; %bb.0:
-; GFX6-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v2, v2
-; GFX6-FLUSH-NEXT: v_div_scale_f32 v4, s[4:5], v2, v2, v0
-; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v5, v4
-; GFX6-FLUSH-NEXT: v_div_scale_f32 v6, vcc, v0, v2, v0
-; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
-; GFX6-FLUSH-NEXT: v_fma_f32 v7, -v4, v5, 1.0
-; GFX6-FLUSH-NEXT: v_fma_f32 v5, v7, v5, v5
-; GFX6-FLUSH-NEXT: v_mul_f32_e32 v7, v6, v5
-; GFX6-FLUSH-NEXT: v_fma_f32 v8, -v4, v7, v6
-; GFX6-FLUSH-NEXT: v_fma_f32 v7, v8, v5, v7
-; GFX6-FLUSH-NEXT: v_fma_f32 v4, -v4, v7, v6
-; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
-; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, v1
-; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v3, v3
-; GFX6-FLUSH-NEXT: v_div_fmas_f32 v4, v4, v5, v7
-; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v4, v2, v0
-; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
-; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v3, v3, v1
-; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v4, v2
-; GFX6-FLUSH-NEXT: v_div_scale_f32 v5, vcc, v1, v3, v1
-; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
-; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v4, 1.0
-; GFX6-FLUSH-NEXT: v_fma_f32 v4, v6, v4, v4
-; GFX6-FLUSH-NEXT: v_mul_f32_e32 v6, v5, v4
-; GFX6-FLUSH-NEXT: v_fma_f32 v7, -v2, v6, v5
-; GFX6-FLUSH-NEXT: v_fma_f32 v6, v7, v4, v6
-; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v6, v5
-; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
-; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v4, v6
-; GFX6-FLUSH-NEXT: v_div_fixup_f32 v1, v2, v3, v1
-; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v1, v1
-; GFX6-FLUSH-NEXT: s_setpc_b64 s[30:31]
+; GFX6-LABEL: v_fdiv_v2f16_afn:
+; GFX6: ; %bb.0:
+; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2
+; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3
+; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1
+; GFX6-NEXT: v_rcp_f32_e32 v2, v2
+; GFX6-NEXT: v_rcp_f32_e32 v3, v3
+; GFX6-NEXT: v_mul_f32_e32 v0, v0, v2
+; GFX6-NEXT: v_mul_f32_e32 v1, v1, v3
+; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GFX6-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_fdiv_v2f16_afn:
; GFX8: ; %bb.0:
@@ -799,8 +757,18 @@ define <2 x half> @v_rcp_v2f16(<2 x half> %x) {
; GFX8-LABEL: v_rcp_v2f16:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_rcp_f16_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX8-NEXT: v_rcp_f16_e32 v0, v0
+; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v0
+; GFX8-NEXT: v_cvt_f32_f16_e32 v1, v0
+; GFX8-NEXT: v_cvt_f32_f16_e32 v3, v2
+; GFX8-NEXT: v_cvt_f32_f16_e32 v4, 1.0
+; GFX8-NEXT: v_rcp_f32_e32 v1, v1
+; GFX8-NEXT: v_rcp_f32_e32 v3, v3
+; GFX8-NEXT: v_mul_f32_e32 v1, v4, v1
+; GFX8-NEXT: v_mul_f32_e32 v3, v4, v3
+; GFX8-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GFX8-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GFX8-NEXT: v_div_fixup_f16 v0, v1, v0, 1.0
+; GFX8-NEXT: v_div_fixup_f16 v1, v3, v2, 1.0
; GFX8-NEXT: v_mov_b32_e32 v2, 16
; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
@@ -809,35 +777,113 @@ define <2 x half> @v_rcp_v2f16(<2 x half> %x) {
; GFX9-LABEL: v_rcp_v2f16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_rcp_f16_e32 v1, v0
-; GFX9-NEXT: v_rcp_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v0
+; GFX9-NEXT: v_cvt_f32_f16_e32 v1, v0
+; GFX9-NEXT: v_cvt_f32_f16_e32 v3, v2
+; GFX9-NEXT: v_cvt_f32_f16_e32 v4, 1.0
+; GFX9-NEXT: v_rcp_f32_e32 v1, v1
+; GFX9-NEXT: v_rcp_f32_e32 v3, v3
+; GFX9-NEXT: v_mul_f32_e32 v1, v4, v1
+; GFX9-NEXT: v_mul_f32_e32 v3, v4, v3
+; GFX9-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GFX9-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GFX9-NEXT: v_div_fixup_f16 v0, v1, v0, 1.0
+; GFX9-NEXT: v_div_fixup_f16 v1, v3, v2, 1.0
; GFX9-NEXT: v_mov_b32_e32 v2, 0xffff
-; GFX9-NEXT: v_and_or_b32 v0, v1, v2, v0
+; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX9-NEXT: v_and_or_b32 v0, v0, v2, v1
; GFX9-NEXT: s_setpc_b64 s[30:31]
%fdiv = fdiv <2 x half> <half 1.0, half 1.0>, %x
ret <2 x half> %fdiv
}
define <2 x half> @v_rcp_v2f16_arcp(<2 x half> %x) {
-; GFX6-LABEL: v_rcp_v2f16_arcp:
-; GFX6: ; %bb.0:
-; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1
-; GFX6-NEXT: v_cvt_f32_f16_e32 v2, 1.0
-; GFX6-NEXT: v_rcp_f32_e32 v0, v0
-; GFX6-NEXT: v_rcp_f32_e32 v1, v1
-; GFX6-NEXT: v_mul_f32_e32 v0, v2, v0
-; GFX6-NEXT: v_mul_f32_e32 v1, v2, v1
-; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1
-; GFX6-NEXT: s_setpc_b64 s[30:31]
+; GFX6-IEEE-LABEL: v_rcp_v2f16_arcp:
+; GFX6-IEEE: ; %bb.0:
+; GFX6-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v2, 1.0
+; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, v1
+; GFX6-IEEE-NEXT: v_div_scale_f32 v3, s[4:5], v0, v0, v2
+; GFX6-IEEE-NEXT: v_rcp_f32_e32 v4, v3
+; GFX6-IEEE-NEXT: v_div_scale_f32 v5, vcc, v2, v0, v2
+; GFX6-IEEE-NEXT: v_fma_f32 v6, -v3, v4, 1.0
+; GFX6-IEEE-NEXT: v_fma_f32 v4, v6, v4, v4
+; GFX6-IEEE-NEXT: v_mul_f32_e32 v6, v5, v4
+; GFX6-IEEE-NEXT: v_fma_f32 v7, -v3, v6, v5
+; GFX6-IEEE-NEXT: v_fma_f32 v6, v7, v4, v6
+; GFX6-IEEE-NEXT: v_fma_f32 v3, -v3, v6, v5
+; GFX6-IEEE-NEXT: v_div_fmas_f32 v3, v3, v4, v6
+; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v3, v0, v2
+; GFX6-IEEE-NEXT: v_div_scale_f32 v3, s[4:5], v1, v1, v2
+; GFX6-IEEE-NEXT: v_rcp_f32_e32 v4, v3
+; GFX6-IEEE-NEXT: v_div_scale_f32 v5, vcc, v2, v1, v2
+; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX6-IEEE-NEXT: v_fma_f32 v6, -v3, v4, 1.0
+; GFX6-IEEE-NEXT: v_fma_f32 v4, v6, v4, v4
+; GFX6-IEEE-NEXT: v_mul_f32_e32 v6, v5, v4
+; GFX6-IEEE-NEXT: v_fma_f32 v7, -v3, v6, v5
+; GFX6-IEEE-NEXT: v_fma_f32 v6, v7, v4, v6
+; GFX6-IEEE-NEXT: v_fma_f32 v3, -v3, v6, v5
+; GFX6-IEEE-NEXT: v_div_fmas_f32 v3, v3, v4, v6
+; GFX6-IEEE-NEXT: v_div_fixup_f32 v1, v3, v1, v2
+; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GFX6-IEEE-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX6-FLUSH-LABEL: v_rcp_v2f16_arcp:
+; GFX6-FLUSH: ; %bb.0:
+; GFX6-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-FLUSH-NEXT: s_movk_i32 s6, 0x3c00
+; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v2, s6
+; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v3, s[4:5], v0, v0, v2
+; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v4, v3
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v5, vcc, v2, v0, v2
+; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
+; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v3, v4, 1.0
+; GFX6-FLUSH-NEXT: v_fma_f32 v4, v6, v4, v4
+; GFX6-FLUSH-NEXT: v_mul_f32_e32 v6, v5, v4
+; GFX6-FLUSH-NEXT: v_fma_f32 v7, -v3, v6, v5
+; GFX6-FLUSH-NEXT: v_fma_f32 v6, v7, v4, v6
+; GFX6-FLUSH-NEXT: v_fma_f32 v3, -v3, v6, v5
+; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
+; GFX6-FLUSH-NEXT: v_div_fmas_f32 v3, v3, v4, v6
+; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v4, s6
+; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, v1
+; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v3, v0, v2
+; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
+; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, v4
+; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v3, v2
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v5, vcc, v4, v1, v4
+; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
+; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v3, 1.0
+; GFX6-FLUSH-NEXT: v_fma_f32 v3, v6, v3, v3
+; GFX6-FLUSH-NEXT: v_mul_f32_e32 v6, v5, v3
+; GFX6-FLUSH-NEXT: v_fma_f32 v7, -v2, v6, v5
+; GFX6-FLUSH-NEXT: v_fma_f32 v6, v7, v3, v6
+; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v6, v5
+; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
+; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v6
+; GFX6-FLUSH-NEXT: v_div_fixup_f32 v1, v2, v1, v4
+; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GFX6-FLUSH-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_rcp_v2f16_arcp:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_rcp_f16_sdwa v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX8-NEXT: v_rcp_f16_e32 v0, v0
+; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v0
+; GFX8-NEXT: v_cvt_f32_f16_e32 v1, v0
+; GFX8-NEXT: v_cvt_f32_f16_e32 v3, v2
+; GFX8-NEXT: v_cvt_f32_f16_e32 v4, 1.0
+; GFX8-NEXT: v_rcp_f32_e32 v1, v1
+; GFX8-NEXT: v_rcp_f32_e32 v3, v3
+; GFX8-NEXT: v_mul_f32_e32 v1, v4, v1
+; GFX8-NEXT: v_mul_f32_e32 v3, v4, v3
+; GFX8-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GFX8-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GFX8-NEXT: v_div_fixup_f16 v0, v1, v0, 1.0
+; GFX8-NEXT: v_div_fixup_f16 v1, v3, v2, 1.0
; GFX8-NEXT: v_mov_b32_e32 v2, 16
; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
@@ -846,10 +892,21 @@ define <2 x half> @v_rcp_v2f16_arcp(<2 x half> %x) {
; GFX9-LABEL: v_rcp_v2f16_arcp:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_rcp_f16_e32 v1, v0
-; GFX9-NEXT: v_rcp_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v0
+; GFX9-NEXT: v_cvt_f32_f16_e32 v1, v0
+; GFX9-NEXT: v_cvt_f32_f16_e32 v3, v2
+; GFX9-NEXT: v_cvt_f32_f16_e32 v4, 1.0
+; GFX9-NEXT: v_rcp_f32_e32 v1, v1
+; GFX9-NEXT: v_rcp_f32_e32 v3, v3
+; GFX9-NEXT: v_mul_f32_e32 v1, v4, v1
+; GFX9-NEXT: v_mul_f32_e32 v3, v4, v3
+; GFX9-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GFX9-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GFX9-NEXT: v_div_fixup_f16 v0, v1, v0, 1.0
+; GFX9-NEXT: v_div_fixup_f16 v1, v3, v2, 1.0
; GFX9-NEXT: v_mov_b32_e32 v2, 0xffff
-; GFX9-NEXT: v_and_or_b32 v0, v1, v2, v0
+; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX9-NEXT: v_and_or_b32 v0, v0, v2, v1
; GFX9-NEXT: s_setpc_b64 s[30:31]
%fdiv = fdiv arcp <2 x half> <half 1.0, half 1.0>, %x
ret <2 x half> %fdiv
@@ -987,7 +1044,49 @@ define <2 x half> @v_rcp_v2f16_ulp25(<2 x half> %x) {
}
define <2 x half> @v_fdiv_v2f16_afn_ulp25(<2 x half> %a, <2 x half> %b) {
-; GFX6-IEEE-LABEL: v_fdiv_v2f16_afn_ulp25:
+; GFX6-LABEL: v_fdiv_v2f16_afn_ulp25:
+; GFX6: ; %bb.0:
+; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2
+; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3
+; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1
+; GFX6-NEXT: v_rcp_f32_e32 v2, v2
+; GFX6-NEXT: v_rcp_f32_e32 v3, v3
+; GFX6-NEXT: v_mul_f32_e32 v0, v0, v2
+; GFX6-NEXT: v_mul_f32_e32 v1, v1, v3
+; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GFX6-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_fdiv_v2f16_afn_ulp25:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_rcp_f16_e32 v2, v1
+; GFX8-NEXT: v_rcp_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX8-NEXT: v_mul_f16_e32 v2, v0, v2
+; GFX8-NEXT: v_mul_f16_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT: v_mov_b32_e32 v1, 16
+; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; GFX8-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_fdiv_v2f16_afn_ulp25:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_rcp_f16_e32 v2, v1
+; GFX9-NEXT: v_rcp_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX9-NEXT: v_mul_f16_e32 v2, v0, v2
+; GFX9-NEXT: v_mul_f16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT: v_mov_b32_e32 v1, 0xffff
+; GFX9-NEXT: v_and_or_b32 v0, v2, v1, v0
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+ %fdiv = fdiv afn <2 x half> %a, %b, !fpmath !0
+ ret <2 x half> %fdiv
+}
+
+define <2 x half> @v_fdiv_v2f16_arcp_ulp25(<2 x half> %a, <2 x half> %b) {
+; GFX6-IEEE-LABEL: v_fdiv_v2f16_arcp_ulp25:
; GFX6-IEEE: ; %bb.0:
; GFX6-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, v0
@@ -1020,7 +1119,7 @@ define <2 x half> @v_fdiv_v2f16_afn_ulp25(<2 x half> %a, <2 x half> %b) {
; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX6-IEEE-NEXT: s_setpc_b64 s[30:31]
;
-; GFX6-FLUSH-LABEL: v_fdiv_v2f16_afn_ulp25:
+; GFX6-FLUSH-LABEL: v_fdiv_v2f16_arcp_ulp25:
; GFX6-FLUSH: ; %bb.0:
; GFX6-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0
@@ -1058,69 +1157,48 @@ define <2 x half> @v_fdiv_v2f16_afn_ulp25(<2 x half> %a, <2 x half> %b) {
; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX6-FLUSH-NEXT: s_setpc_b64 s[30:31]
;
-; GFX8-LABEL: v_fdiv_v2f16_afn_ulp25:
-; GFX8: ; %bb.0:
-; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_rcp_f16_e32 v2, v1
-; GFX8-NEXT: v_rcp_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX8-NEXT: v_mul_f16_e32 v2, v0, v2
-; GFX8-NEXT: v_mul_f16_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX8-NEXT: v_mov_b32_e32 v1, 16
-; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
-; GFX8-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX8-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: v_fdiv_v2f16_afn_ulp25:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_rcp_f16_e32 v2, v1
-; GFX9-NEXT: v_rcp_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX9-NEXT: v_mul_f16_e32 v2, v0, v2
-; GFX9-NEXT: v_mul_f16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX9-NEXT: v_mov_b32_e32 v1, 0xffff
-; GFX9-NEXT: v_and_or_b32 v0, v2, v1, v0
-; GFX9-NEXT: s_setpc_b64 s[30:31]
- %fdiv = fdiv afn <2 x half> %a, %b, !fpmath !0
- ret <2 x half> %fdiv
-}
-
-define <2 x half> @v_fdiv_v2f16_arcp_ulp25(<2 x half> %a, <2 x half> %b) {
-; GFX6-LABEL: v_fdiv_v2f16_arcp_ulp25:
-; GFX6: ; %bb.0:
-; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2
-; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3
-; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1
-; GFX6-NEXT: v_rcp_f32_e32 v2, v2
-; GFX6-NEXT: v_rcp_f32_e32 v3, v3
-; GFX6-NEXT: v_mul_f32_e32 v0, v0, v2
-; GFX6-NEXT: v_mul_f32_e32 v1, v1, v3
-; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1
-; GFX6-NEXT: s_setpc_b64 s[30:31]
-;
; GFX8-LABEL: v_fdiv_v2f16_arcp_ulp25:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_rcp_f16_e32 v2, v1
-; GFX8-NEXT: v_rcp_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX8-NEXT: v_mul_f16_e32 v2, v0, v2
-; GFX8-NEXT: v_mul_f16_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX8-NEXT: v_mov_b32_e32 v1, 16
-; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
-; GFX8-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v1
+; GFX8-NEXT: v_cvt_f32_f16_e32 v2, v1
+; GFX8-NEXT: v_cvt_f32_f16_e32 v5, v4
+; GFX8-NEXT: v_lshrrev_b32_e32 v6, 16, v0
+; GFX8-NEXT: v_cvt_f32_f16_e32 v3, v0
+; GFX8-NEXT: v_rcp_f32_e32 v2, v2
+; GFX8-NEXT: v_cvt_f32_f16_e32 v7, v6
+; GFX8-NEXT: v_rcp_f32_e32 v5, v5
+; GFX8-NEXT: v_mul_f32_e32 v2, v3, v2
+; GFX8-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GFX8-NEXT: v_mul_f32_e32 v3, v7, v5
+; GFX8-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GFX8-NEXT: v_div_fixup_f16 v0, v2, v1, v0
+; GFX8-NEXT: v_mov_b32_e32 v2, 16
+; GFX8-NEXT: v_div_fixup_f16 v1, v3, v4, v6
+; GFX8-NEXT: v_lshlrev_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX8-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_fdiv_v2f16_arcp_ulp25:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_rcp_f16_e32 v2, v1
-; GFX9-NEXT: v_rcp_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX9-NEXT: v_mul_f16_e32 v2, v0, v2
-; GFX9-NEXT: v_mul_f16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX9-NEXT: v_mov_b32_e32 v1, 0xffff
-; GFX9-NEXT: v_and_or_b32 v0, v2, v1, v0
+; GFX9-NEXT: v_lshrrev_b32_e32 v4, 16, v1
+; GFX9-NEXT: v_cvt_f32_f16_e32 v2, v1
+; GFX9-NEXT: v_cvt_f32_f16_e32 v5, v4
+; GFX9-NEXT: v_lshrrev_b32_e32 v6, 16, v0
+; GFX9-NEXT: v_cvt_f32_f16_e32 v3, v0
+; GFX9-NEXT: v_rcp_f32_e32 v2, v2
+; GFX9-NEXT: v_cvt_f32_f16_e32 v7, v6
+; GFX9-NEXT: v_rcp_f32_e32 v5, v5
+; GFX9-NEXT: v_mul_f32_e32 v2, v3, v2
+; GFX9-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GFX9-NEXT: v_mul_f32_e32 v3, v7, v5
+; GFX9-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GFX9-NEXT: v_div_fixup_f16 v0, v2, v1, v0
+; GFX9-NEXT: v_mov_b32_e32 v2, 0xffff
+; GFX9-NEXT: v_div_fixup_f16 v1, v3, v4, v6
+; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX9-NEXT: v_and_or_b32 v0, v0, v2, v1
; GFX9-NEXT: s_setpc_b64 s[30:31]
%fdiv = fdiv arcp <2 x half> %a, %b, !fpmath !0
ret <2 x half> %fdiv
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f32.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f32.ll
index 02114a058c89..a29c96b93f56 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f32.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f32.ll
@@ -157,11 +157,23 @@ define float @v_rcp_f32(float %x) {
; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v1, v0, 1.0
; GFX6-IEEE-NEXT: s_setpc_b64 s[30:31]
;
-; GCN-FLUSH-LABEL: v_rcp_f32:
-; GCN-FLUSH: ; %bb.0:
-; GCN-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-FLUSH-NEXT: v_rcp_f32_e32 v0, v0
-; GCN-FLUSH-NEXT: s_setpc_b64 s[30:31]
+; GFX6-FLUSH-LABEL: v_rcp_f32:
+; GFX6-FLUSH: ; %bb.0:
+; GFX6-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, 1.0
+; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v2, v1
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v3, vcc, 1.0, v0, 1.0
+; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
+; GFX6-FLUSH-NEXT: v_fma_f32 v4, -v1, v2, 1.0
+; GFX6-FLUSH-NEXT: v_fma_f32 v2, v4, v2, v2
+; GFX6-FLUSH-NEXT: v_mul_f32_e32 v4, v3, v2
+; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v1, v4, v3
+; GFX6-FLUSH-NEXT: v_fma_f32 v4, v5, v2, v4
+; GFX6-FLUSH-NEXT: v_fma_f32 v1, -v1, v4, v3
+; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
+; GFX6-FLUSH-NEXT: v_div_fmas_f32 v1, v1, v2, v4
+; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v1, v0, 1.0
+; GFX6-FLUSH-NEXT: s_setpc_b64 s[30:31]
;
; GFX89-IEEE-LABEL: v_rcp_f32:
; GFX89-IEEE: ; %bb.0:
@@ -178,16 +190,96 @@ define float @v_rcp_f32(float %x) {
; GFX89-IEEE-NEXT: v_div_fmas_f32 v1, v1, v3, v4
; GFX89-IEEE-NEXT: v_div_fixup_f32 v0, v1, v0, 1.0
; GFX89-IEEE-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX89-FLUSH-LABEL: v_rcp_f32:
+; GFX89-FLUSH: ; %bb.0:
+; GFX89-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX89-FLUSH-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, 1.0
+; GFX89-FLUSH-NEXT: v_div_scale_f32 v2, vcc, 1.0, v0, 1.0
+; GFX89-FLUSH-NEXT: v_rcp_f32_e32 v3, v1
+; GFX89-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
+; GFX89-FLUSH-NEXT: v_fma_f32 v4, -v1, v3, 1.0
+; GFX89-FLUSH-NEXT: v_fma_f32 v3, v4, v3, v3
+; GFX89-FLUSH-NEXT: v_mul_f32_e32 v4, v2, v3
+; GFX89-FLUSH-NEXT: v_fma_f32 v5, -v1, v4, v2
+; GFX89-FLUSH-NEXT: v_fma_f32 v4, v5, v3, v4
+; GFX89-FLUSH-NEXT: v_fma_f32 v1, -v1, v4, v2
+; GFX89-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
+; GFX89-FLUSH-NEXT: v_div_fmas_f32 v1, v1, v3, v4
+; GFX89-FLUSH-NEXT: v_div_fixup_f32 v0, v1, v0, 1.0
+; GFX89-FLUSH-NEXT: s_setpc_b64 s[30:31]
%fdiv = fdiv float 1.0, %x
ret float %fdiv
}
define float @v_rcp_f32_arcp(float %x) {
-; GCN-LABEL: v_rcp_f32_arcp:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_rcp_f32_e32 v0, v0
-; GCN-NEXT: s_setpc_b64 s[30:31]
+; GFX6-IEEE-LABEL: v_rcp_f32_arcp:
+; GFX6-IEEE: ; %bb.0:
+; GFX6-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-IEEE-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, 1.0
+; GFX6-IEEE-NEXT: v_rcp_f32_e32 v2, v1
+; GFX6-IEEE-NEXT: v_div_scale_f32 v3, vcc, 1.0, v0, 1.0
+; GFX6-IEEE-NEXT: v_fma_f32 v4, -v1, v2, 1.0
+; GFX6-IEEE-NEXT: v_fma_f32 v2, v4, v2, v2
+; GFX6-IEEE-NEXT: v_mul_f32_e32 v4, v3, v2
+; GFX6-IEEE-NEXT: v_fma_f32 v5, -v1, v4, v3
+; GFX6-IEEE-NEXT: v_fma_f32 v4, v5, v2, v4
+; GFX6-IEEE-NEXT: v_fma_f32 v1, -v1, v4, v3
+; GFX6-IEEE-NEXT: v_div_fmas_f32 v1, v1, v2, v4
+; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v1, v0, 1.0
+; GFX6-IEEE-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX6-FLUSH-LABEL: v_rcp_f32_arcp:
+; GFX6-FLUSH: ; %bb.0:
+; GFX6-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, 1.0
+; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v2, v1
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v3, vcc, 1.0, v0, 1.0
+; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
+; GFX6-FLUSH-NEXT: v_fma_f32 v4, -v1, v2, 1.0
+; GFX6-FLUSH-NEXT: v_fma_f32 v2, v4, v2, v2
+; GFX6-FLUSH-NEXT: v_mul_f32_e32 v4, v3, v2
+; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v1, v4, v3
+; GFX6-FLUSH-NEXT: v_fma_f32 v4, v5, v2, v4
+; GFX6-FLUSH-NEXT: v_fma_f32 v1, -v1, v4, v3
+; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
+; GFX6-FLUSH-NEXT: v_div_fmas_f32 v1, v1, v2, v4
+; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v1, v0, 1.0
+; GFX6-FLUSH-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX89-IEEE-LABEL: v_rcp_f32_arcp:
+; GFX89-IEEE: ; %bb.0:
+; GFX89-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX89-IEEE-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, 1.0
+; GFX89-IEEE-NEXT: v_div_scale_f32 v2, vcc, 1.0, v0, 1.0
+; GFX89-IEEE-NEXT: v_rcp_f32_e32 v3, v1
+; GFX89-IEEE-NEXT: v_fma_f32 v4, -v1, v3, 1.0
+; GFX89-IEEE-NEXT: v_fma_f32 v3, v4, v3, v3
+; GFX89-IEEE-NEXT: v_mul_f32_e32 v4, v2, v3
+; GFX89-IEEE-NEXT: v_fma_f32 v5, -v1, v4, v2
+; GFX89-IEEE-NEXT: v_fma_f32 v4, v5, v3, v4
+; GFX89-IEEE-NEXT: v_fma_f32 v1, -v1, v4, v2
+; GFX89-IEEE-NEXT: v_div_fmas_f32 v1, v1, v3, v4
+; GFX89-IEEE-NEXT: v_div_fixup_f32 v0, v1, v0, 1.0
+; GFX89-IEEE-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX89-FLUSH-LABEL: v_rcp_f32_arcp:
+; GFX89-FLUSH: ; %bb.0:
+; GFX89-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX89-FLUSH-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, 1.0
+; GFX89-FLUSH-NEXT: v_div_scale_f32 v2, vcc, 1.0, v0, 1.0
+; GFX89-FLUSH-NEXT: v_rcp_f32_e32 v3, v1
+; GFX89-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
+; GFX89-FLUSH-NEXT: v_fma_f32 v4, -v1, v3, 1.0
+; GFX89-FLUSH-NEXT: v_fma_f32 v3, v4, v3, v3
+; GFX89-FLUSH-NEXT: v_mul_f32_e32 v4, v2, v3
+; GFX89-FLUSH-NEXT: v_fma_f32 v5, -v1, v4, v2
+; GFX89-FLUSH-NEXT: v_fma_f32 v4, v5, v3, v4
+; GFX89-FLUSH-NEXT: v_fma_f32 v1, -v1, v4, v2
+; GFX89-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
+; GFX89-FLUSH-NEXT: v_div_fmas_f32 v1, v1, v3, v4
+; GFX89-FLUSH-NEXT: v_div_fixup_f32 v0, v1, v0, 1.0
+; GFX89-FLUSH-NEXT: s_setpc_b64 s[30:31]
%fdiv = fdiv arcp float 1.0, %x
ret float %fdiv
}
@@ -237,12 +329,21 @@ define float @v_fdiv_f32_afn_ulp25(float %a, float %b) {
}
define float @v_fdiv_f32_arcp_ulp25(float %a, float %b) {
-; GCN-IEEE-LABEL: v_fdiv_f32_arcp_ulp25:
-; GCN-IEEE: ; %bb.0:
-; GCN-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-IEEE-NEXT: v_rcp_f32_e32 v1, v1
-; GCN-IEEE-NEXT: v_mul_f32_e32 v0, v0, v1
-; GCN-IEEE-NEXT: s_setpc_b64 s[30:31]
+; GFX6-IEEE-LABEL: v_fdiv_f32_arcp_ulp25:
+; GFX6-IEEE: ; %bb.0:
+; GFX6-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-IEEE-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, v0
+; GFX6-IEEE-NEXT: v_rcp_f32_e32 v3, v2
+; GFX6-IEEE-NEXT: v_div_scale_f32 v4, vcc, v0, v1, v0
+; GFX6-IEEE-NEXT: v_fma_f32 v5, -v2, v3, 1.0
+; GFX6-IEEE-NEXT: v_fma_f32 v3, v5, v3, v3
+; GFX6-IEEE-NEXT: v_mul_f32_e32 v5, v4, v3
+; GFX6-IEEE-NEXT: v_fma_f32 v6, -v2, v5, v4
+; GFX6-IEEE-NEXT: v_fma_f32 v5, v6, v3, v5
+; GFX6-IEEE-NEXT: v_fma_f32 v2, -v2, v5, v4
+; GFX6-IEEE-NEXT: v_div_fmas_f32 v2, v2, v3, v5
+; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v2, v1, v0
+; GFX6-IEEE-NEXT: s_setpc_b64 s[30:31]
;
; GCN-FLUSH-LABEL: v_fdiv_f32_arcp_ulp25:
; GCN-FLUSH: ; %bb.0:
@@ -256,6 +357,22 @@ define float @v_fdiv_f32_arcp_ulp25(float %a, float %b) {
; GCN-FLUSH-NEXT: v_mul_f32_e32 v0, v0, v1
; GCN-FLUSH-NEXT: v_mul_f32_e32 v0, v2, v0
; GCN-FLUSH-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX89-IEEE-LABEL: v_fdiv_f32_arcp_ulp25:
+; GFX89-IEEE: ; %bb.0:
+; GFX89-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX89-IEEE-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, v0
+; GFX89-IEEE-NEXT: v_div_scale_f32 v3, vcc, v0, v1, v0
+; GFX89-IEEE-NEXT: v_rcp_f32_e32 v4, v2
+; GFX89-IEEE-NEXT: v_fma_f32 v5, -v2, v4, 1.0
+; GFX89-IEEE-NEXT: v_fma_f32 v4, v5, v4, v4
+; GFX89-IEEE-NEXT: v_mul_f32_e32 v5, v3, v4
+; GFX89-IEEE-NEXT: v_fma_f32 v6, -v2, v5, v3
+; GFX89-IEEE-NEXT: v_fma_f32 v5, v6, v4, v5
+; GFX89-IEEE-NEXT: v_fma_f32 v2, -v2, v5, v3
+; GFX89-IEEE-NEXT: v_div_fmas_f32 v2, v2, v4, v5
+; GFX89-IEEE-NEXT: v_div_fixup_f32 v0, v2, v1, v0
+; GFX89-IEEE-NEXT: s_setpc_b64 s[30:31]
%fdiv = fdiv arcp float %a, %b, !fpmath !0
ret float %fdiv
}
@@ -500,12 +617,36 @@ define <2 x float> @v_rcp_v2f32(<2 x float> %x) {
; GFX6-IEEE-NEXT: v_div_fixup_f32 v1, v2, v1, 1.0
; GFX6-IEEE-NEXT: s_setpc_b64 s[30:31]
;
-; GCN-FLUSH-LABEL: v_rcp_v2f32:
-; GCN-FLUSH: ; %bb.0:
-; GCN-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-FLUSH-NEXT: v_rcp_f32_e32 v0, v0
-; GCN-FLUSH-NEXT: v_rcp_f32_e32 v1, v1
-; GCN-FLUSH-NEXT: s_setpc_b64 s[30:31]
+; GFX6-FLUSH-LABEL: v_rcp_v2f32:
+; GFX6-FLUSH: ; %bb.0:
+; GFX6-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, 1.0
+; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v3, v2
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v4, vcc, 1.0, v0, 1.0
+; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
+; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v2, v3, 1.0
+; GFX6-FLUSH-NEXT: v_fma_f32 v3, v5, v3, v3
+; GFX6-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v3
+; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v5, v4
+; GFX6-FLUSH-NEXT: v_fma_f32 v5, v6, v3, v5
+; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v5, v4
+; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
+; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v5
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v3, s[4:5], v1, v1, 1.0
+; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v2, v0, 1.0
+; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v4, v3
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, vcc, 1.0, v1, 1.0
+; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
+; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v3, v4, 1.0
+; GFX6-FLUSH-NEXT: v_fma_f32 v4, v5, v4, v4
+; GFX6-FLUSH-NEXT: v_mul_f32_e32 v5, v2, v4
+; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v3, v5, v2
+; GFX6-FLUSH-NEXT: v_fma_f32 v5, v6, v4, v5
+; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v3, v5, v2
+; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
+; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v4, v5
+; GFX6-FLUSH-NEXT: v_div_fixup_f32 v1, v2, v1, 1.0
+; GFX6-FLUSH-NEXT: s_setpc_b64 s[30:31]
;
; GFX89-IEEE-LABEL: v_rcp_v2f32:
; GFX89-IEEE: ; %bb.0:
@@ -534,17 +675,158 @@ define <2 x float> @v_rcp_v2f32(<2 x float> %x) {
; GFX89-IEEE-NEXT: v_div_fixup_f32 v0, v2, v0, 1.0
; GFX89-IEEE-NEXT: v_div_fixup_f32 v1, v3, v1, 1.0
; GFX89-IEEE-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX89-FLUSH-LABEL: v_rcp_v2f32:
+; GFX89-FLUSH: ; %bb.0:
+; GFX89-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX89-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, 1.0
+; GFX89-FLUSH-NEXT: v_div_scale_f32 v3, vcc, 1.0, v0, 1.0
+; GFX89-FLUSH-NEXT: v_rcp_f32_e32 v4, v2
+; GFX89-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
+; GFX89-FLUSH-NEXT: v_fma_f32 v5, -v2, v4, 1.0
+; GFX89-FLUSH-NEXT: v_fma_f32 v4, v5, v4, v4
+; GFX89-FLUSH-NEXT: v_mul_f32_e32 v5, v3, v4
+; GFX89-FLUSH-NEXT: v_fma_f32 v6, -v2, v5, v3
+; GFX89-FLUSH-NEXT: v_fma_f32 v5, v6, v4, v5
+; GFX89-FLUSH-NEXT: v_fma_f32 v2, -v2, v5, v3
+; GFX89-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
+; GFX89-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v4, v5
+; GFX89-FLUSH-NEXT: v_div_scale_f32 v3, s[4:5], v1, v1, 1.0
+; GFX89-FLUSH-NEXT: v_div_scale_f32 v4, vcc, 1.0, v1, 1.0
+; GFX89-FLUSH-NEXT: v_div_fixup_f32 v0, v2, v0, 1.0
+; GFX89-FLUSH-NEXT: v_rcp_f32_e32 v5, v3
+; GFX89-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
+; GFX89-FLUSH-NEXT: v_fma_f32 v2, -v3, v5, 1.0
+; GFX89-FLUSH-NEXT: v_fma_f32 v2, v2, v5, v5
+; GFX89-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v2
+; GFX89-FLUSH-NEXT: v_fma_f32 v6, -v3, v5, v4
+; GFX89-FLUSH-NEXT: v_fma_f32 v5, v6, v2, v5
+; GFX89-FLUSH-NEXT: v_fma_f32 v3, -v3, v5, v4
+; GFX89-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
+; GFX89-FLUSH-NEXT: v_div_fmas_f32 v2, v3, v2, v5
+; GFX89-FLUSH-NEXT: v_div_fixup_f32 v1, v2, v1, 1.0
+; GFX89-FLUSH-NEXT: s_setpc_b64 s[30:31]
%fdiv = fdiv <2 x float> <float 1.0, float 1.0>, %x
ret <2 x float> %fdiv
}
define <2 x float> @v_rcp_v2f32_arcp(<2 x float> %x) {
-; GCN-LABEL: v_rcp_v2f32_arcp:
-; GCN: ; %bb.0:
-; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_rcp_f32_e32 v0, v0
-; GCN-NEXT: v_rcp_f32_e32 v1, v1
-; GCN-NEXT: s_setpc_b64 s[30:31]
+; GFX6-IEEE-LABEL: v_rcp_v2f32_arcp:
+; GFX6-IEEE: ; %bb.0:
+; GFX6-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-IEEE-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, 1.0
+; GFX6-IEEE-NEXT: v_rcp_f32_e32 v3, v2
+; GFX6-IEEE-NEXT: v_div_scale_f32 v4, vcc, 1.0, v0, 1.0
+; GFX6-IEEE-NEXT: v_fma_f32 v5, -v2, v3, 1.0
+; GFX6-IEEE-NEXT: v_fma_f32 v3, v5, v3, v3
+; GFX6-IEEE-NEXT: v_mul_f32_e32 v5, v4, v3
+; GFX6-IEEE-NEXT: v_fma_f32 v6, -v2, v5, v4
+; GFX6-IEEE-NEXT: v_fma_f32 v5, v6, v3, v5
+; GFX6-IEEE-NEXT: v_fma_f32 v2, -v2, v5, v4
+; GFX6-IEEE-NEXT: v_div_fmas_f32 v2, v2, v3, v5
+; GFX6-IEEE-NEXT: v_div_scale_f32 v3, s[4:5], v1, v1, 1.0
+; GFX6-IEEE-NEXT: v_rcp_f32_e32 v4, v3
+; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v2, v0, 1.0
+; GFX6-IEEE-NEXT: v_div_scale_f32 v2, vcc, 1.0, v1, 1.0
+; GFX6-IEEE-NEXT: v_fma_f32 v5, -v3, v4, 1.0
+; GFX6-IEEE-NEXT: v_fma_f32 v4, v5, v4, v4
+; GFX6-IEEE-NEXT: v_mul_f32_e32 v5, v2, v4
+; GFX6-IEEE-NEXT: v_fma_f32 v6, -v3, v5, v2
+; GFX6-IEEE-NEXT: v_fma_f32 v5, v6, v4, v5
+; GFX6-IEEE-NEXT: v_fma_f32 v2, -v3, v5, v2
+; GFX6-IEEE-NEXT: v_div_fmas_f32 v2, v2, v4, v5
+; GFX6-IEEE-NEXT: v_div_fixup_f32 v1, v2, v1, 1.0
+; GFX6-IEEE-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX6-FLUSH-LABEL: v_rcp_v2f32_arcp:
+; GFX6-FLUSH: ; %bb.0:
+; GFX6-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, 1.0
+; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v3, v2
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v4, vcc, 1.0, v0, 1.0
+; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
+; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v2, v3, 1.0
+; GFX6-FLUSH-NEXT: v_fma_f32 v3, v5, v3, v3
+; GFX6-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v3
+; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v5, v4
+; GFX6-FLUSH-NEXT: v_fma_f32 v5, v6, v3, v5
+; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v5, v4
+; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
+; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v5
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v3, s[4:5], v1, v1, 1.0
+; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v2, v0, 1.0
+; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v4, v3
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, vcc, 1.0, v1, 1.0
+; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
+; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v3, v4, 1.0
+; GFX6-FLUSH-NEXT: v_fma_f32 v4, v5, v4, v4
+; GFX6-FLUSH-NEXT: v_mul_f32_e32 v5, v2, v4
+; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v3, v5, v2
+; GFX6-FLUSH-NEXT: v_fma_f32 v5, v6, v4, v5
+; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v3, v5, v2
+; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
+; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v4, v5
+; GFX6-FLUSH-NEXT: v_div_fixup_f32 v1, v2, v1, 1.0
+; GFX6-FLUSH-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX89-IEEE-LABEL: v_rcp_v2f32_arcp:
+; GFX89-IEEE: ; %bb.0:
+; GFX89-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX89-IEEE-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, 1.0
+; GFX89-IEEE-NEXT: v_div_scale_f32 v3, s[4:5], v1, v1, 1.0
+; GFX89-IEEE-NEXT: v_div_scale_f32 v4, vcc, 1.0, v0, 1.0
+; GFX89-IEEE-NEXT: v_div_scale_f32 v5, s[4:5], 1.0, v1, 1.0
+; GFX89-IEEE-NEXT: v_rcp_f32_e32 v6, v2
+; GFX89-IEEE-NEXT: v_rcp_f32_e32 v7, v3
+; GFX89-IEEE-NEXT: v_fma_f32 v8, -v2, v6, 1.0
+; GFX89-IEEE-NEXT: v_fma_f32 v9, -v3, v7, 1.0
+; GFX89-IEEE-NEXT: v_fma_f32 v6, v8, v6, v6
+; GFX89-IEEE-NEXT: v_fma_f32 v7, v9, v7, v7
+; GFX89-IEEE-NEXT: v_mul_f32_e32 v8, v4, v6
+; GFX89-IEEE-NEXT: v_fma_f32 v10, -v2, v8, v4
+; GFX89-IEEE-NEXT: v_mul_f32_e32 v9, v5, v7
+; GFX89-IEEE-NEXT: v_fma_f32 v11, -v3, v9, v5
+; GFX89-IEEE-NEXT: v_fma_f32 v8, v10, v6, v8
+; GFX89-IEEE-NEXT: v_fma_f32 v2, -v2, v8, v4
+; GFX89-IEEE-NEXT: v_fma_f32 v9, v11, v7, v9
+; GFX89-IEEE-NEXT: v_div_fmas_f32 v2, v2, v6, v8
+; GFX89-IEEE-NEXT: v_fma_f32 v3, -v3, v9, v5
+; GFX89-IEEE-NEXT: s_mov_b64 vcc, s[4:5]
+; GFX89-IEEE-NEXT: v_div_fmas_f32 v3, v3, v7, v9
+; GFX89-IEEE-NEXT: v_div_fixup_f32 v0, v2, v0, 1.0
+; GFX89-IEEE-NEXT: v_div_fixup_f32 v1, v3, v1, 1.0
+; GFX89-IEEE-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX89-FLUSH-LABEL: v_rcp_v2f32_arcp:
+; GFX89-FLUSH: ; %bb.0:
+; GFX89-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX89-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, 1.0
+; GFX89-FLUSH-NEXT: v_div_scale_f32 v3, vcc, 1.0, v0, 1.0
+; GFX89-FLUSH-NEXT: v_rcp_f32_e32 v4, v2
+; GFX89-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
+; GFX89-FLUSH-NEXT: v_fma_f32 v5, -v2, v4, 1.0
+; GFX89-FLUSH-NEXT: v_fma_f32 v4, v5, v4, v4
+; GFX89-FLUSH-NEXT: v_mul_f32_e32 v5, v3, v4
+; GFX89-FLUSH-NEXT: v_fma_f32 v6, -v2, v5, v3
+; GFX89-FLUSH-NEXT: v_fma_f32 v5, v6, v4, v5
+; GFX89-FLUSH-NEXT: v_fma_f32 v2, -v2, v5, v3
+; GFX89-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
+; GFX89-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v4, v5
+; GFX89-FLUSH-NEXT: v_div_scale_f32 v3, s[4:5], v1, v1, 1.0
+; GFX89-FLUSH-NEXT: v_div_scale_f32 v4, vcc, 1.0, v1, 1.0
+; GFX89-FLUSH-NEXT: v_div_fixup_f32 v0, v2, v0, 1.0
+; GFX89-FLUSH-NEXT: v_rcp_f32_e32 v5, v3
+; GFX89-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
+; GFX89-FLUSH-NEXT: v_fma_f32 v2, -v3, v5, 1.0
+; GFX89-FLUSH-NEXT: v_fma_f32 v2, v2, v5, v5
+; GFX89-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v2
+; GFX89-FLUSH-NEXT: v_fma_f32 v6, -v3, v5, v4
+; GFX89-FLUSH-NEXT: v_fma_f32 v5, v6, v2, v5
+; GFX89-FLUSH-NEXT: v_fma_f32 v3, -v3, v5, v4
+; GFX89-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
+; GFX89-FLUSH-NEXT: v_div_fmas_f32 v2, v3, v2, v5
+; GFX89-FLUSH-NEXT: v_div_fixup_f32 v1, v2, v1, 1.0
+; GFX89-FLUSH-NEXT: s_setpc_b64 s[30:31]
%fdiv = fdiv arcp <2 x float> <float 1.0, float 1.0>, %x
ret <2 x float> %fdiv
}
@@ -604,14 +886,32 @@ define <2 x float> @v_fdiv_v2f32_afn_ulp25(<2 x float> %a, <2 x float> %b) {
}
define <2 x float> @v_fdiv_v2f32_arcp_ulp25(<2 x float> %a, <2 x float> %b) {
-; GCN-IEEE-LABEL: v_fdiv_v2f32_arcp_ulp25:
-; GCN-IEEE: ; %bb.0:
-; GCN-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-IEEE-NEXT: v_rcp_f32_e32 v2, v2
-; GCN-IEEE-NEXT: v_rcp_f32_e32 v3, v3
-; GCN-IEEE-NEXT: v_mul_f32_e32 v0, v0, v2
-; GCN-IEEE-NEXT: v_mul_f32_e32 v1, v1, v3
-; GCN-IEEE-NEXT: s_setpc_b64 s[30:31]
+; GFX6-IEEE-LABEL: v_fdiv_v2f32_arcp_ulp25:
+; GFX6-IEEE: ; %bb.0:
+; GFX6-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-IEEE-NEXT: v_div_scale_f32 v4, s[4:5], v2, v2, v0
+; GFX6-IEEE-NEXT: v_rcp_f32_e32 v5, v4
+; GFX6-IEEE-NEXT: v_div_scale_f32 v6, vcc, v0, v2, v0
+; GFX6-IEEE-NEXT: v_fma_f32 v7, -v4, v5, 1.0
+; GFX6-IEEE-NEXT: v_fma_f32 v5, v7, v5, v5
+; GFX6-IEEE-NEXT: v_mul_f32_e32 v7, v6, v5
+; GFX6-IEEE-NEXT: v_fma_f32 v8, -v4, v7, v6
+; GFX6-IEEE-NEXT: v_fma_f32 v7, v8, v5, v7
+; GFX6-IEEE-NEXT: v_fma_f32 v4, -v4, v7, v6
+; GFX6-IEEE-NEXT: v_div_fmas_f32 v4, v4, v5, v7
+; GFX6-IEEE-NEXT: v_div_scale_f32 v5, s[4:5], v3, v3, v1
+; GFX6-IEEE-NEXT: v_rcp_f32_e32 v6, v5
+; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v4, v2, v0
+; GFX6-IEEE-NEXT: v_div_scale_f32 v2, vcc, v1, v3, v1
+; GFX6-IEEE-NEXT: v_fma_f32 v4, -v5, v6, 1.0
+; GFX6-IEEE-NEXT: v_fma_f32 v4, v4, v6, v6
+; GFX6-IEEE-NEXT: v_mul_f32_e32 v6, v2, v4
+; GFX6-IEEE-NEXT: v_fma_f32 v7, -v5, v6, v2
+; GFX6-IEEE-NEXT: v_fma_f32 v6, v7, v4, v6
+; GFX6-IEEE-NEXT: v_fma_f32 v2, -v5, v6, v2
+; GFX6-IEEE-NEXT: v_div_fmas_f32 v2, v2, v4, v6
+; GFX6-IEEE-NEXT: v_div_fixup_f32 v1, v2, v3, v1
+; GFX6-IEEE-NEXT: s_setpc_b64 s[30:31]
;
; GCN-FLUSH-LABEL: v_fdiv_v2f32_arcp_ulp25:
; GCN-FLUSH: ; %bb.0:
@@ -631,6 +931,34 @@ define <2 x float> @v_fdiv_v2f32_arcp_ulp25(<2 x float> %a, <2 x float> %b) {
; GCN-FLUSH-NEXT: v_mul_f32_e32 v0, v5, v0
; GCN-FLUSH-NEXT: v_mul_f32_e32 v1, v4, v1
; GCN-FLUSH-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX89-IEEE-LABEL: v_fdiv_v2f32_arcp_ulp25:
+; GFX89-IEEE: ; %bb.0:
+; GFX89-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX89-IEEE-NEXT: v_div_scale_f32 v4, s[4:5], v2, v2, v0
+; GFX89-IEEE-NEXT: v_div_scale_f32 v5, s[4:5], v3, v3, v1
+; GFX89-IEEE-NEXT: v_div_scale_f32 v6, vcc, v0, v2, v0
+; GFX89-IEEE-NEXT: v_div_scale_f32 v7, s[4:5], v1, v3, v1
+; GFX89-IEEE-NEXT: v_rcp_f32_e32 v8, v4
+; GFX89-IEEE-NEXT: v_rcp_f32_e32 v9, v5
+; GFX89-IEEE-NEXT: v_fma_f32 v10, -v4, v8, 1.0
+; GFX89-IEEE-NEXT: v_fma_f32 v11, -v5, v9, 1.0
+; GFX89-IEEE-NEXT: v_fma_f32 v8, v10, v8, v8
+; GFX89-IEEE-NEXT: v_fma_f32 v9, v11, v9, v9
+; GFX89-IEEE-NEXT: v_mul_f32_e32 v10, v6, v8
+; GFX89-IEEE-NEXT: v_fma_f32 v12, -v4, v10, v6
+; GFX89-IEEE-NEXT: v_mul_f32_e32 v11, v7, v9
+; GFX89-IEEE-NEXT: v_fma_f32 v13, -v5, v11, v7
+; GFX89-IEEE-NEXT: v_fma_f32 v10, v12, v8, v10
+; GFX89-IEEE-NEXT: v_fma_f32 v4, -v4, v10, v6
+; GFX89-IEEE-NEXT: v_fma_f32 v11, v13, v9, v11
+; GFX89-IEEE-NEXT: v_div_fmas_f32 v4, v4, v8, v10
+; GFX89-IEEE-NEXT: v_fma_f32 v5, -v5, v11, v7
+; GFX89-IEEE-NEXT: s_mov_b64 vcc, s[4:5]
+; GFX89-IEEE-NEXT: v_div_fmas_f32 v5, v5, v9, v11
+; GFX89-IEEE-NEXT: v_div_fixup_f32 v0, v4, v2, v0
+; GFX89-IEEE-NEXT: v_div_fixup_f32 v1, v5, v3, v1
+; GFX89-IEEE-NEXT: s_setpc_b64 s[30:31]
%fdiv = fdiv arcp <2 x float> %a, %b, !fpmath !0
ret <2 x float> %fdiv
}
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/frem.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/frem.ll
index 3b9e56b1a742..b7546223f7d9 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/frem.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/frem.ll
@@ -410,21 +410,12 @@ define amdgpu_kernel void @fast_frem_f64(double addrspace(1)* %out, double addrs
; CI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0
; CI-NEXT: s_load_dwordx2 s[2:3], s[8:9], 0x0
; CI-NEXT: s_waitcnt lgkmcnt(0)
-; CI-NEXT: v_mov_b32_e32 v0, s2
-; CI-NEXT: v_mov_b32_e32 v1, s3
-; CI-NEXT: v_div_scale_f64 v[2:3], s[2:3], v[0:1], v[0:1], s[0:1]
-; CI-NEXT: v_div_scale_f64 v[8:9], vcc, s[0:1], v[0:1], s[0:1]
-; CI-NEXT: v_rcp_f64_e32 v[4:5], v[2:3]
-; CI-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
-; CI-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
-; CI-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
-; CI-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
-; CI-NEXT: v_mul_f64 v[6:7], v[8:9], v[4:5]
-; CI-NEXT: v_fma_f64 v[2:3], -v[2:3], v[6:7], v[8:9]
-; CI-NEXT: v_div_fmas_f64 v[2:3], v[2:3], v[4:5], v[6:7]
-; CI-NEXT: v_div_fixup_f64 v[2:3], v[2:3], v[0:1], s[0:1]
-; CI-NEXT: v_trunc_f64_e32 v[2:3], v[2:3]
-; CI-NEXT: v_fma_f64 v[0:1], -v[2:3], v[0:1], s[0:1]
+; CI-NEXT: v_mov_b32_e32 v3, s1
+; CI-NEXT: v_rcp_f64_e32 v[0:1], s[2:3]
+; CI-NEXT: v_mov_b32_e32 v2, s0
+; CI-NEXT: v_mul_f64 v[0:1], s[0:1], v[0:1]
+; CI-NEXT: v_trunc_f64_e32 v[0:1], v[0:1]
+; CI-NEXT: v_fma_f64 v[0:1], -v[0:1], s[2:3], v[2:3]
; CI-NEXT: v_mov_b32_e32 v2, s4
; CI-NEXT: v_mov_b32_e32 v3, s5
; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
@@ -438,21 +429,12 @@ define amdgpu_kernel void @fast_frem_f64(double addrspace(1)* %out, double addrs
; VI-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0
; VI-NEXT: s_load_dwordx2 s[2:3], s[8:9], 0x0
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v0, s2
-; VI-NEXT: v_mov_b32_e32 v1, s3
-; VI-NEXT: v_div_scale_f64 v[2:3], s[2:3], v[0:1], v[0:1], s[0:1]
-; VI-NEXT: v_div_scale_f64 v[8:9], vcc, s[0:1], v[0:1], s[0:1]
-; VI-NEXT: v_rcp_f64_e32 v[4:5], v[2:3]
-; VI-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
-; VI-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
-; VI-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0
-; VI-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5]
-; VI-NEXT: v_mul_f64 v[6:7], v[8:9], v[4:5]
-; VI-NEXT: v_fma_f64 v[2:3], -v[2:3], v[6:7], v[8:9]
-; VI-NEXT: v_div_fmas_f64 v[2:3], v[2:3], v[4:5], v[6:7]
-; VI-NEXT: v_div_fixup_f64 v[2:3], v[2:3], v[0:1], s[0:1]
-; VI-NEXT: v_trunc_f64_e32 v[2:3], v[2:3]
-; VI-NEXT: v_fma_f64 v[0:1], -v[2:3], v[0:1], s[0:1]
+; VI-NEXT: v_mov_b32_e32 v3, s1
+; VI-NEXT: v_rcp_f64_e32 v[0:1], s[2:3]
+; VI-NEXT: v_mov_b32_e32 v2, s0
+; VI-NEXT: v_mul_f64 v[0:1], s[0:1], v[0:1]
+; VI-NEXT: v_trunc_f64_e32 v[0:1], v[0:1]
+; VI-NEXT: v_fma_f64 v[0:1], -v[0:1], s[2:3], v[2:3]
; VI-NEXT: v_mov_b32_e32 v2, s4
; VI-NEXT: v_mov_b32_e32 v3, s5
; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fdiv.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fdiv.mir
index 9bef474d08e3..4554bc81ef93 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fdiv.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-fdiv.mir
@@ -308,21 +308,60 @@ body: |
; SI-LABEL: name: test_fdiv_s32_denorms_off_arcp
; SI: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
; SI: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
- ; SI: [[INT:%[0-9]+]]:_(s32) = arcp G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), [[COPY1]](s32)
- ; SI: [[FMUL:%[0-9]+]]:_(s32) = arcp G_FMUL [[COPY]], [[INT]]
- ; SI: $vgpr0 = COPY [[FMUL]](s32)
+ ; SI: [[C:%[0-9]+]]:_(s32) = G_FCONSTANT float 1.000000e+00
+ ; SI: [[INT:%[0-9]+]]:_(s32), [[INT1:%[0-9]+]]:_(s1) = arcp G_INTRINSIC intrinsic(@llvm.amdgcn.div.scale), [[COPY]](s32), [[COPY1]](s32), 0
+ ; SI: [[INT2:%[0-9]+]]:_(s32), [[INT3:%[0-9]+]]:_(s1) = arcp G_INTRINSIC intrinsic(@llvm.amdgcn.div.scale), [[COPY]](s32), [[COPY1]](s32), 1
+ ; SI: [[INT4:%[0-9]+]]:_(s32) = arcp G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), [[INT]](s32)
+ ; SI: [[FNEG:%[0-9]+]]:_(s32) = arcp G_FNEG [[INT]]
+ ; SI: S_SETREG_IMM32_B32 3, 2305, implicit-def $mode, implicit $mode
+ ; SI: [[FMA:%[0-9]+]]:_(s32) = arcp G_FMA [[FNEG]], [[INT4]], [[C]]
+ ; SI: [[FMA1:%[0-9]+]]:_(s32) = arcp G_FMA [[FMA]], [[INT4]], [[INT4]]
+ ; SI: [[FMUL:%[0-9]+]]:_(s32) = arcp G_FMUL [[INT2]], [[FMA1]]
+ ; SI: [[FMA2:%[0-9]+]]:_(s32) = arcp G_FMA [[FNEG]], [[FMUL]], [[INT2]]
+ ; SI: [[FMA3:%[0-9]+]]:_(s32) = arcp G_FMA [[FMA2]], [[FMA1]], [[FMUL]]
+ ; SI: [[FMA4:%[0-9]+]]:_(s32) = arcp G_FMA [[FNEG]], [[FMA3]], [[INT2]]
+ ; SI: S_SETREG_IMM32_B32 0, 2305, implicit-def $mode, implicit $mode
+ ; SI: [[INT5:%[0-9]+]]:_(s32) = arcp G_INTRINSIC intrinsic(@llvm.amdgcn.div.fmas), [[FMA4]](s32), [[FMA1]](s32), [[FMA3]](s32), [[INT3]](s1)
+ ; SI: [[INT6:%[0-9]+]]:_(s32) = arcp G_INTRINSIC intrinsic(@llvm.amdgcn.div.fixup), [[INT5]](s32), [[COPY1]](s32), [[COPY]](s32)
+ ; SI: $vgpr0 = COPY [[INT6]](s32)
; VI-LABEL: name: test_fdiv_s32_denorms_off_arcp
; VI: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
; VI: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
- ; VI: [[INT:%[0-9]+]]:_(s32) = arcp G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), [[COPY1]](s32)
- ; VI: [[FMUL:%[0-9]+]]:_(s32) = arcp G_FMUL [[COPY]], [[INT]]
- ; VI: $vgpr0 = COPY [[FMUL]](s32)
+ ; VI: [[C:%[0-9]+]]:_(s32) = G_FCONSTANT float 1.000000e+00
+ ; VI: [[INT:%[0-9]+]]:_(s32), [[INT1:%[0-9]+]]:_(s1) = arcp G_INTRINSIC intrinsic(@llvm.amdgcn.div.scale), [[COPY]](s32), [[COPY1]](s32), 0
+ ; VI: [[INT2:%[0-9]+]]:_(s32), [[INT3:%[0-9]+]]:_(s1) = arcp G_INTRINSIC intrinsic(@llvm.amdgcn.div.scale), [[COPY]](s32), [[COPY1]](s32), 1
+ ; VI: [[INT4:%[0-9]+]]:_(s32) = arcp G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), [[INT]](s32)
+ ; VI: [[FNEG:%[0-9]+]]:_(s32) = arcp G_FNEG [[INT]]
+ ; VI: S_SETREG_IMM32_B32 3, 2305, implicit-def $mode, implicit $mode
+ ; VI: [[FMA:%[0-9]+]]:_(s32) = arcp G_FMA [[FNEG]], [[INT4]], [[C]]
+ ; VI: [[FMA1:%[0-9]+]]:_(s32) = arcp G_FMA [[FMA]], [[INT4]], [[INT4]]
+ ; VI: [[FMUL:%[0-9]+]]:_(s32) = arcp G_FMUL [[INT2]], [[FMA1]]
+ ; VI: [[FMA2:%[0-9]+]]:_(s32) = arcp G_FMA [[FNEG]], [[FMUL]], [[INT2]]
+ ; VI: [[FMA3:%[0-9]+]]:_(s32) = arcp G_FMA [[FMA2]], [[FMA1]], [[FMUL]]
+ ; VI: [[FMA4:%[0-9]+]]:_(s32) = arcp G_FMA [[FNEG]], [[FMA3]], [[INT2]]
+ ; VI: S_SETREG_IMM32_B32 0, 2305, implicit-def $mode, implicit $mode
+ ; VI: [[INT5:%[0-9]+]]:_(s32) = arcp G_INTRINSIC intrinsic(@llvm.amdgcn.div.fmas), [[FMA4]](s32), [[FMA1]](s32), [[FMA3]](s32), [[INT3]](s1)
+ ; VI: [[INT6:%[0-9]+]]:_(s32) = arcp G_INTRINSIC intrinsic(@llvm.amdgcn.div.fixup), [[INT5]](s32), [[COPY1]](s32), [[COPY]](s32)
+ ; VI: $vgpr0 = COPY [[INT6]](s32)
; GFX9-LABEL: name: test_fdiv_s32_denorms_off_arcp
; GFX9: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
; GFX9: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
- ; GFX9: [[INT:%[0-9]+]]:_(s32) = arcp G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), [[COPY1]](s32)
- ; GFX9: [[FMUL:%[0-9]+]]:_(s32) = arcp G_FMUL [[COPY]], [[INT]]
- ; GFX9: $vgpr0 = COPY [[FMUL]](s32)
+ ; GFX9: [[C:%[0-9]+]]:_(s32) = G_FCONSTANT float 1.000000e+00
+ ; GFX9: [[INT:%[0-9]+]]:_(s32), [[INT1:%[0-9]+]]:_(s1) = arcp G_INTRINSIC intrinsic(@llvm.amdgcn.div.scale), [[COPY]](s32), [[COPY1]](s32), 0
+ ; GFX9: [[INT2:%[0-9]+]]:_(s32), [[INT3:%[0-9]+]]:_(s1) = arcp G_INTRINSIC intrinsic(@llvm.amdgcn.div.scale), [[COPY]](s32), [[COPY1]](s32), 1
+ ; GFX9: [[INT4:%[0-9]+]]:_(s32) = arcp G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), [[INT]](s32)
+ ; GFX9: [[FNEG:%[0-9]+]]:_(s32) = arcp G_FNEG [[INT]]
+ ; GFX9: S_SETREG_IMM32_B32 3, 2305, implicit-def $mode, implicit $mode
+ ; GFX9: [[FMA:%[0-9]+]]:_(s32) = arcp G_FMA [[FNEG]], [[INT4]], [[C]]
+ ; GFX9: [[FMA1:%[0-9]+]]:_(s32) = arcp G_FMA [[FMA]], [[INT4]], [[INT4]]
+ ; GFX9: [[FMUL:%[0-9]+]]:_(s32) = arcp G_FMUL [[INT2]], [[FMA1]]
+ ; GFX9: [[FMA2:%[0-9]+]]:_(s32) = arcp G_FMA [[FNEG]], [[FMUL]], [[INT2]]
+ ; GFX9: [[FMA3:%[0-9]+]]:_(s32) = arcp G_FMA [[FMA2]], [[FMA1]], [[FMUL]]
+ ; GFX9: [[FMA4:%[0-9]+]]:_(s32) = arcp G_FMA [[FNEG]], [[FMA3]], [[INT2]]
+ ; GFX9: S_SETREG_IMM32_B32 0, 2305, implicit-def $mode, implicit $mode
+ ; GFX9: [[INT5:%[0-9]+]]:_(s32) = arcp G_INTRINSIC intrinsic(@llvm.amdgcn.div.fmas), [[FMA4]](s32), [[FMA1]](s32), [[FMA3]](s32), [[INT3]](s1)
+ ; GFX9: [[INT6:%[0-9]+]]:_(s32) = arcp G_INTRINSIC intrinsic(@llvm.amdgcn.div.fixup), [[INT5]](s32), [[COPY1]](s32), [[COPY]](s32)
+ ; GFX9: $vgpr0 = COPY [[INT6]](s32)
; GFX9-UNSAFE-LABEL: name: test_fdiv_s32_denorms_off_arcp
; GFX9-UNSAFE: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
; GFX9-UNSAFE: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
@@ -332,9 +371,22 @@ body: |
; GFX10-LABEL: name: test_fdiv_s32_denorms_off_arcp
; GFX10: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
; GFX10: [[COPY1:%[0-9]+]]:_(s32) = COPY $vgpr1
- ; GFX10: [[INT:%[0-9]+]]:_(s32) = arcp G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), [[COPY1]](s32)
- ; GFX10: [[FMUL:%[0-9]+]]:_(s32) = arcp G_FMUL [[COPY]], [[INT]]
- ; GFX10: $vgpr0 = COPY [[FMUL]](s32)
+ ; GFX10: [[C:%[0-9]+]]:_(s32) = G_FCONSTANT float 1.000000e+00
+ ; GFX10: [[INT:%[0-9]+]]:_(s32), [[INT1:%[0-9]+]]:_(s1) = arcp G_INTRINSIC intrinsic(@llvm.amdgcn.div.scale), [[COPY]](s32), [[COPY1]](s32), 0
+ ; GFX10: [[INT2:%[0-9]+]]:_(s32), [[INT3:%[0-9]+]]:_(s1) = arcp G_INTRINSIC intrinsic(@llvm.amdgcn.div.scale), [[COPY]](s32), [[COPY1]](s32), 1
+ ; GFX10: [[INT4:%[0-9]+]]:_(s32) = arcp G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), [[INT]](s32)
+ ; GFX10: [[FNEG:%[0-9]+]]:_(s32) = arcp G_FNEG [[INT]]
+ ; GFX10: S_DENORM_MODE 15, implicit-def $mode, implicit $mode
+ ; GFX10: [[FMA:%[0-9]+]]:_(s32) = arcp G_FMA [[FNEG]], [[INT4]], [[C]]
+ ; GFX10: [[FMA1:%[0-9]+]]:_(s32) = arcp G_FMA [[FMA]], [[INT4]], [[INT4]]
+ ; GFX10: [[FMUL:%[0-9]+]]:_(s32) = arcp G_FMUL [[INT2]], [[FMA1]]
+ ; GFX10: [[FMA2:%[0-9]+]]:_(s32) = arcp G_FMA [[FNEG]], [[FMUL]], [[INT2]]
+ ; GFX10: [[FMA3:%[0-9]+]]:_(s32) = arcp G_FMA [[FMA2]], [[FMA1]], [[FMUL]]
+ ; GFX10: [[FMA4:%[0-9]+]]:_(s32) = arcp G_FMA [[FNEG]], [[FMA3]], [[INT2]]
+ ; GFX10: S_DENORM_MODE 12, implicit-def $mode, implicit $mode
+ ; GFX10: [[INT5:%[0-9]+]]:_(s32) = arcp G_INTRINSIC intrinsic(@llvm.amdgcn.div.fmas), [[FMA4]](s32), [[FMA1]](s32), [[FMA3]](s32), [[INT3]](s1)
+ ; GFX10: [[INT6:%[0-9]+]]:_(s32) = arcp G_INTRINSIC intrinsic(@llvm.amdgcn.div.fixup), [[INT5]](s32), [[COPY1]](s32), [[COPY]](s32)
+ ; GFX10: $vgpr0 = COPY [[INT6]](s32)
%0:_(s32) = COPY $vgpr0
%1:_(s32) = COPY $vgpr1
%2:_(s32) = arcp G_FDIV %0, %1
@@ -1898,16 +1950,28 @@ body: |
; SI: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[FPTRUNC]](s16)
; SI: $vgpr0 = COPY [[ANYEXT]](s32)
; VI-LABEL: name: test_fdiv_s16_constant_one_rcp
+ ; VI: [[C:%[0-9]+]]:_(s16) = G_FCONSTANT half 0xH3C00
; VI: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
; VI: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32)
- ; VI: [[INT:%[0-9]+]]:_(s16) = G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), [[TRUNC]](s16)
- ; VI: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[INT]](s16)
+ ; VI: [[FPEXT:%[0-9]+]]:_(s32) = G_FPEXT [[C]](s16)
+ ; VI: [[FPEXT1:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC]](s16)
+ ; VI: [[INT:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), [[FPEXT1]](s32)
+ ; VI: [[FMUL:%[0-9]+]]:_(s32) = G_FMUL [[FPEXT]], [[INT]]
+ ; VI: [[FPTRUNC:%[0-9]+]]:_(s16) = G_FPTRUNC [[FMUL]](s32)
+ ; VI: [[INT1:%[0-9]+]]:_(s16) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fixup), [[FPTRUNC]](s16), [[TRUNC]](s16), [[C]](s16)
+ ; VI: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[INT1]](s16)
; VI: $vgpr0 = COPY [[ANYEXT]](s32)
; GFX9-LABEL: name: test_fdiv_s16_constant_one_rcp
+ ; GFX9: [[C:%[0-9]+]]:_(s16) = G_FCONSTANT half 0xH3C00
; GFX9: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
; GFX9: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32)
- ; GFX9: [[INT:%[0-9]+]]:_(s16) = G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), [[TRUNC]](s16)
- ; GFX9: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[INT]](s16)
+ ; GFX9: [[FPEXT:%[0-9]+]]:_(s32) = G_FPEXT [[C]](s16)
+ ; GFX9: [[FPEXT1:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC]](s16)
+ ; GFX9: [[INT:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), [[FPEXT1]](s32)
+ ; GFX9: [[FMUL:%[0-9]+]]:_(s32) = G_FMUL [[FPEXT]], [[INT]]
+ ; GFX9: [[FPTRUNC:%[0-9]+]]:_(s16) = G_FPTRUNC [[FMUL]](s32)
+ ; GFX9: [[INT1:%[0-9]+]]:_(s16) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fixup), [[FPTRUNC]](s16), [[TRUNC]](s16), [[C]](s16)
+ ; GFX9: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[INT1]](s16)
; GFX9: $vgpr0 = COPY [[ANYEXT]](s32)
; GFX9-UNSAFE-LABEL: name: test_fdiv_s16_constant_one_rcp
; GFX9-UNSAFE: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
@@ -1916,10 +1980,16 @@ body: |
; GFX9-UNSAFE: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[INT]](s16)
; GFX9-UNSAFE: $vgpr0 = COPY [[ANYEXT]](s32)
; GFX10-LABEL: name: test_fdiv_s16_constant_one_rcp
+ ; GFX10: [[C:%[0-9]+]]:_(s16) = G_FCONSTANT half 0xH3C00
; GFX10: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
; GFX10: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32)
- ; GFX10: [[INT:%[0-9]+]]:_(s16) = G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), [[TRUNC]](s16)
- ; GFX10: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[INT]](s16)
+ ; GFX10: [[FPEXT:%[0-9]+]]:_(s32) = G_FPEXT [[C]](s16)
+ ; GFX10: [[FPEXT1:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC]](s16)
+ ; GFX10: [[INT:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), [[FPEXT1]](s32)
+ ; GFX10: [[FMUL:%[0-9]+]]:_(s32) = G_FMUL [[FPEXT]], [[INT]]
+ ; GFX10: [[FPTRUNC:%[0-9]+]]:_(s16) = G_FPTRUNC [[FMUL]](s32)
+ ; GFX10: [[INT1:%[0-9]+]]:_(s16) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fixup), [[FPTRUNC]](s16), [[TRUNC]](s16), [[C]](s16)
+ ; GFX10: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[INT1]](s16)
; GFX10: $vgpr0 = COPY [[ANYEXT]](s32)
%0:_(s16) = G_FCONSTANT half 1.0
%1:_(s32) = COPY $vgpr0
@@ -1958,18 +2028,28 @@ body: |
; SI: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[FPTRUNC]](s16)
; SI: $vgpr0 = COPY [[ANYEXT]](s32)
; VI-LABEL: name: test_fdiv_s16_constant_negative_one_rcp
+ ; VI: [[C:%[0-9]+]]:_(s16) = G_FCONSTANT half 0xHBC00
; VI: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
; VI: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32)
- ; VI: [[FNEG:%[0-9]+]]:_(s16) = G_FNEG [[TRUNC]]
- ; VI: [[INT:%[0-9]+]]:_(s16) = G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), [[FNEG]](s16)
- ; VI: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[INT]](s16)
+ ; VI: [[FPEXT:%[0-9]+]]:_(s32) = G_FPEXT [[C]](s16)
+ ; VI: [[FPEXT1:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC]](s16)
+ ; VI: [[INT:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), [[FPEXT1]](s32)
+ ; VI: [[FMUL:%[0-9]+]]:_(s32) = G_FMUL [[FPEXT]], [[INT]]
+ ; VI: [[FPTRUNC:%[0-9]+]]:_(s16) = G_FPTRUNC [[FMUL]](s32)
+ ; VI: [[INT1:%[0-9]+]]:_(s16) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fixup), [[FPTRUNC]](s16), [[TRUNC]](s16), [[C]](s16)
+ ; VI: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[INT1]](s16)
; VI: $vgpr0 = COPY [[ANYEXT]](s32)
; GFX9-LABEL: name: test_fdiv_s16_constant_negative_one_rcp
+ ; GFX9: [[C:%[0-9]+]]:_(s16) = G_FCONSTANT half 0xHBC00
; GFX9: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
; GFX9: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32)
- ; GFX9: [[FNEG:%[0-9]+]]:_(s16) = G_FNEG [[TRUNC]]
- ; GFX9: [[INT:%[0-9]+]]:_(s16) = G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), [[FNEG]](s16)
- ; GFX9: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[INT]](s16)
+ ; GFX9: [[FPEXT:%[0-9]+]]:_(s32) = G_FPEXT [[C]](s16)
+ ; GFX9: [[FPEXT1:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC]](s16)
+ ; GFX9: [[INT:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), [[FPEXT1]](s32)
+ ; GFX9: [[FMUL:%[0-9]+]]:_(s32) = G_FMUL [[FPEXT]], [[INT]]
+ ; GFX9: [[FPTRUNC:%[0-9]+]]:_(s16) = G_FPTRUNC [[FMUL]](s32)
+ ; GFX9: [[INT1:%[0-9]+]]:_(s16) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fixup), [[FPTRUNC]](s16), [[TRUNC]](s16), [[C]](s16)
+ ; GFX9: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[INT1]](s16)
; GFX9: $vgpr0 = COPY [[ANYEXT]](s32)
; GFX9-UNSAFE-LABEL: name: test_fdiv_s16_constant_negative_one_rcp
; GFX9-UNSAFE: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
@@ -1979,11 +2059,16 @@ body: |
; GFX9-UNSAFE: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[INT]](s16)
; GFX9-UNSAFE: $vgpr0 = COPY [[ANYEXT]](s32)
; GFX10-LABEL: name: test_fdiv_s16_constant_negative_one_rcp
+ ; GFX10: [[C:%[0-9]+]]:_(s16) = G_FCONSTANT half 0xHBC00
; GFX10: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
; GFX10: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32)
- ; GFX10: [[FNEG:%[0-9]+]]:_(s16) = G_FNEG [[TRUNC]]
- ; GFX10: [[INT:%[0-9]+]]:_(s16) = G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), [[FNEG]](s16)
- ; GFX10: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[INT]](s16)
+ ; GFX10: [[FPEXT:%[0-9]+]]:_(s32) = G_FPEXT [[C]](s16)
+ ; GFX10: [[FPEXT1:%[0-9]+]]:_(s32) = G_FPEXT [[TRUNC]](s16)
+ ; GFX10: [[INT:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.amdgcn.rcp), [[FPEXT1]](s32)
+ ; GFX10: [[FMUL:%[0-9]+]]:_(s32) = G_FMUL [[FPEXT]], [[INT]]
+ ; GFX10: [[FPTRUNC:%[0-9]+]]:_(s16) = G_FPTRUNC [[FMUL]](s32)
+ ; GFX10: [[INT1:%[0-9]+]]:_(s16) = G_INTRINSIC intrinsic(@llvm.amdgcn.div.fixup), [[FPTRUNC]](s16), [[TRUNC]](s16), [[C]](s16)
+ ; GFX10: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[INT1]](s16)
; GFX10: $vgpr0 = COPY [[ANYEXT]](s32)
%0:_(s16) = G_FCONSTANT half -1.0
%1:_(s32) = COPY $vgpr0
More information about the llvm-branch-commits mailing list