[llvm] Revert "[AArch64][GlobalISel] Add G_FPEXT(G_FCONSTANT) folding" (PR #162805)
Ryan Cowan via llvm-commits
llvm-commits at lists.llvm.org
Fri Oct 10 01:47:13 PDT 2025
https://github.com/HolyMolyCowMan created https://github.com/llvm/llvm-project/pull/162805
Reverts llvm/llvm-project#160902
From a4ae3e2442d767f13070ea64b2ae71aba9c09d36 Mon Sep 17 00:00:00 2001
From: Ryan Cowan <ryan at holycowman.com>
Date: Fri, 10 Oct 2025 09:46:40 +0100
Subject: [PATCH] Revert "[AArch64][GlobalISel] Add G_FPEXT(G_FCONSTANT) folding (#160902)"
This reverts commit 66da680330b27be569eb6a93056e53f3769a2910.
---
.../include/llvm/Target/GlobalISel/Combine.td | 2 -
.../lib/CodeGen/GlobalISel/CombinerHelper.cpp | 1 -
llvm/lib/Target/AArch64/AArch64Combine.td | 2 +-
.../CodeGen/AArch64/arm64-indexed-memory.ll | 10 +-
llvm/test/CodeGen/AArch64/f16-instructions.ll | 18 +-
llvm/test/CodeGen/AArch64/fcvt-fixed.ll | 561 ++-
llvm/test/CodeGen/AArch64/frem-power2.ll | 3 +-
.../CodeGen/AArch64/vecreduce-fadd-strict.ll | 52 +-
.../CodeGen/AArch64/vecreduce-fmul-strict.ll | 30 +-
.../CodeGen/AMDGPU/GlobalISel/fdiv.f16.ll | 3217 +++++++++--------
llvm/test/CodeGen/AMDGPU/GlobalISel/frem.ll | 17 +-
llvm/test/CodeGen/AMDGPU/fmed3.ll | 46 +-
llvm/test/CodeGen/AMDGPU/llvm.exp.ll | 15 +-
llvm/test/CodeGen/AMDGPU/llvm.exp10.ll | 15 +-
llvm/test/CodeGen/AMDGPU/mad-mix-hi.ll | 14 +-
llvm/test/CodeGen/AMDGPU/mad-mix-lo.ll | 76 +-
llvm/test/CodeGen/AMDGPU/maximumnum.ll | 3 +-
llvm/test/CodeGen/AMDGPU/minimumnum.ll | 3 +-
18 files changed, 2298 insertions(+), 1787 deletions(-)
diff --git a/llvm/include/llvm/Target/GlobalISel/Combine.td b/llvm/include/llvm/Target/GlobalISel/Combine.td
index 3d21f522e97ce..e2b7a5ead2cd3 100644
--- a/llvm/include/llvm/Target/GlobalISel/Combine.td
+++ b/llvm/include/llvm/Target/GlobalISel/Combine.td
@@ -695,7 +695,6 @@ def constant_fold_fabs : constant_fold_unary_fp_op_rule<G_FABS>;
def constant_fold_fsqrt : constant_fold_unary_fp_op_rule<G_FSQRT>;
def constant_fold_flog2 : constant_fold_unary_fp_op_rule<G_FLOG2>;
def constant_fold_fptrunc : constant_fold_unary_fp_op_rule<G_FPTRUNC>;
-def constant_fold_fpext : constant_fold_unary_fp_op_rule<G_FPEXT>;
// Fold constant zero int to fp conversions.
class itof_const_zero_fold_rule<Instruction opcode> : GICombineRule <
@@ -714,7 +713,6 @@ def constant_fold_fp_ops : GICombineGroup<[
constant_fold_fsqrt,
constant_fold_flog2,
constant_fold_fptrunc,
- constant_fold_fpext,
itof_const_zero_fold_si,
itof_const_zero_fold_ui
]>;
diff --git a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
index b425b952bfc1d..906d62a33d51d 100644
--- a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
@@ -1728,7 +1728,6 @@ static APFloat constantFoldFpUnary(const MachineInstr &MI,
Result.clearSign();
return Result;
}
- case TargetOpcode::G_FPEXT:
case TargetOpcode::G_FPTRUNC: {
bool Unused;
LLT DstTy = MRI.getType(MI.getOperand(0).getReg());
diff --git a/llvm/lib/Target/AArch64/AArch64Combine.td b/llvm/lib/Target/AArch64/AArch64Combine.td
index ecaeff77fcb4b..639ddcba28468 100644
--- a/llvm/lib/Target/AArch64/AArch64Combine.td
+++ b/llvm/lib/Target/AArch64/AArch64Combine.td
@@ -350,7 +350,7 @@ def AArch64PostLegalizerLowering
// Post-legalization combines which are primarily optimizations.
def AArch64PostLegalizerCombiner
: GICombiner<"AArch64PostLegalizerCombinerImpl",
- [copy_prop, cast_of_cast_combines, constant_fold_fp_ops,
+ [copy_prop, cast_of_cast_combines,
buildvector_of_truncate, integer_of_truncate,
mutate_anyext_to_zext, combines_for_extload,
combine_indexed_load_store, sext_trunc_sextload,
diff --git a/llvm/test/CodeGen/AArch64/arm64-indexed-memory.ll b/llvm/test/CodeGen/AArch64/arm64-indexed-memory.ll
index e8e563135acc5..322a96aca5db2 100644
--- a/llvm/test/CodeGen/AArch64/arm64-indexed-memory.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-indexed-memory.ll
@@ -739,12 +739,14 @@ define ptr @postidx32_shalf(ptr %src, ptr %out, half %a) {
;
; GISEL-LABEL: postidx32_shalf:
; GISEL: ; %bb.0:
-; GISEL-NEXT: ldr h1, [x0], #4
+; GISEL-NEXT: movi d1, #0000000000000000
+; GISEL-NEXT: ldr h2, [x0], #4
; GISEL-NEXT: ; kill: def $h0 killed $h0 def $s0
; GISEL-NEXT: fmov w9, s0
-; GISEL-NEXT: fcvt s2, h1
-; GISEL-NEXT: fmov w8, s1
-; GISEL-NEXT: fcmp s2, #0.0
+; GISEL-NEXT: fcvt s3, h2
+; GISEL-NEXT: fmov w8, s2
+; GISEL-NEXT: fcvt s1, h1
+; GISEL-NEXT: fcmp s3, s1
; GISEL-NEXT: csel w8, w8, w9, mi
; GISEL-NEXT: strh w8, [x1]
; GISEL-NEXT: ret
diff --git a/llvm/test/CodeGen/AArch64/f16-instructions.ll b/llvm/test/CodeGen/AArch64/f16-instructions.ll
index 085170c7ba381..b234ef7a5ff8b 100644
--- a/llvm/test/CodeGen/AArch64/f16-instructions.ll
+++ b/llvm/test/CodeGen/AArch64/f16-instructions.ll
@@ -782,16 +782,18 @@ define void @test_fccmp(half %in, ptr %out) {
;
; CHECK-CVT-GI-LABEL: test_fccmp:
; CHECK-CVT-GI: // %bb.0:
-; CHECK-CVT-GI-NEXT: // kill: def $h0 killed $h0 def $s0
-; CHECK-CVT-GI-NEXT: fcvt s1, h0
-; CHECK-CVT-GI-NEXT: fmov s2, #5.00000000
; CHECK-CVT-GI-NEXT: adrp x8, .LCPI29_0
-; CHECK-CVT-GI-NEXT: fmov s3, #8.00000000
-; CHECK-CVT-GI-NEXT: fcmp s1, s2
-; CHECK-CVT-GI-NEXT: ldr h2, [x8, :lo12:.LCPI29_0]
+; CHECK-CVT-GI-NEXT: // kill: def $h0 killed $h0 def $s0
+; CHECK-CVT-GI-NEXT: fcvt s2, h0
+; CHECK-CVT-GI-NEXT: ldr h1, [x8, :lo12:.LCPI29_0]
+; CHECK-CVT-GI-NEXT: adrp x8, .LCPI29_1
+; CHECK-CVT-GI-NEXT: ldr h4, [x8, :lo12:.LCPI29_1]
; CHECK-CVT-GI-NEXT: fmov w8, s0
-; CHECK-CVT-GI-NEXT: fmov w9, s2
-; CHECK-CVT-GI-NEXT: fccmp s1, s3, #4, mi
+; CHECK-CVT-GI-NEXT: fcvt s3, h1
+; CHECK-CVT-GI-NEXT: fmov w9, s1
+; CHECK-CVT-GI-NEXT: fcvt s4, h4
+; CHECK-CVT-GI-NEXT: fcmp s2, s3
+; CHECK-CVT-GI-NEXT: fccmp s2, s4, #4, mi
; CHECK-CVT-GI-NEXT: csel w8, w8, w9, gt
; CHECK-CVT-GI-NEXT: strh w8, [x0]
; CHECK-CVT-GI-NEXT: ret
diff --git a/llvm/test/CodeGen/AArch64/fcvt-fixed.ll b/llvm/test/CodeGen/AArch64/fcvt-fixed.ll
index 743d1604388de..7409bfb91454c 100644
--- a/llvm/test/CodeGen/AArch64/fcvt-fixed.ll
+++ b/llvm/test/CodeGen/AArch64/fcvt-fixed.ll
@@ -149,21 +149,33 @@ define i64 @fcvtzs_f64_i64_64(double %dbl) {
}
define i32 @fcvtzs_f16_i32_7(half %flt) {
-; CHECK-NO16-LABEL: fcvtzs_f16_i32_7:
-; CHECK-NO16: // %bb.0:
-; CHECK-NO16-NEXT: movi v1.2s, #67, lsl #24
-; CHECK-NO16-NEXT: fcvt s0, h0
-; CHECK-NO16-NEXT: fmul s0, s0, s1
-; CHECK-NO16-NEXT: fcvt h0, s0
-; CHECK-NO16-NEXT: fcvt s0, h0
-; CHECK-NO16-NEXT: fcvtzs w0, s0
-; CHECK-NO16-NEXT: ret
+; CHECK-SD-NO16-LABEL: fcvtzs_f16_i32_7:
+; CHECK-SD-NO16: // %bb.0:
+; CHECK-SD-NO16-NEXT: movi v1.2s, #67, lsl #24
+; CHECK-SD-NO16-NEXT: fcvt s0, h0
+; CHECK-SD-NO16-NEXT: fmul s0, s0, s1
+; CHECK-SD-NO16-NEXT: fcvt h0, s0
+; CHECK-SD-NO16-NEXT: fcvt s0, h0
+; CHECK-SD-NO16-NEXT: fcvtzs w0, s0
+; CHECK-SD-NO16-NEXT: ret
;
; CHECK-SD-FP16-LABEL: fcvtzs_f16_i32_7:
; CHECK-SD-FP16: // %bb.0:
; CHECK-SD-FP16-NEXT: fcvtzs w0, h0, #7
; CHECK-SD-FP16-NEXT: ret
;
+; CHECK-GI-NO16-LABEL: fcvtzs_f16_i32_7:
+; CHECK-GI-NO16: // %bb.0:
+; CHECK-GI-NO16-NEXT: adrp x8, .LCPI8_0
+; CHECK-GI-NO16-NEXT: fcvt s0, h0
+; CHECK-GI-NO16-NEXT: ldr h1, [x8, :lo12:.LCPI8_0]
+; CHECK-GI-NO16-NEXT: fcvt s1, h1
+; CHECK-GI-NO16-NEXT: fmul s0, s0, s1
+; CHECK-GI-NO16-NEXT: fcvt h0, s0
+; CHECK-GI-NO16-NEXT: fcvt s0, h0
+; CHECK-GI-NO16-NEXT: fcvtzs w0, s0
+; CHECK-GI-NO16-NEXT: ret
+;
; CHECK-GI-FP16-LABEL: fcvtzs_f16_i32_7:
; CHECK-GI-FP16: // %bb.0:
; CHECK-GI-FP16-NEXT: adrp x8, .LCPI8_0
@@ -177,21 +189,33 @@ define i32 @fcvtzs_f16_i32_7(half %flt) {
}
define i32 @fcvtzs_f16_i32_15(half %flt) {
-; CHECK-NO16-LABEL: fcvtzs_f16_i32_15:
-; CHECK-NO16: // %bb.0:
-; CHECK-NO16-NEXT: movi v1.2s, #71, lsl #24
-; CHECK-NO16-NEXT: fcvt s0, h0
-; CHECK-NO16-NEXT: fmul s0, s0, s1
-; CHECK-NO16-NEXT: fcvt h0, s0
-; CHECK-NO16-NEXT: fcvt s0, h0
-; CHECK-NO16-NEXT: fcvtzs w0, s0
-; CHECK-NO16-NEXT: ret
+; CHECK-SD-NO16-LABEL: fcvtzs_f16_i32_15:
+; CHECK-SD-NO16: // %bb.0:
+; CHECK-SD-NO16-NEXT: movi v1.2s, #71, lsl #24
+; CHECK-SD-NO16-NEXT: fcvt s0, h0
+; CHECK-SD-NO16-NEXT: fmul s0, s0, s1
+; CHECK-SD-NO16-NEXT: fcvt h0, s0
+; CHECK-SD-NO16-NEXT: fcvt s0, h0
+; CHECK-SD-NO16-NEXT: fcvtzs w0, s0
+; CHECK-SD-NO16-NEXT: ret
;
; CHECK-SD-FP16-LABEL: fcvtzs_f16_i32_15:
; CHECK-SD-FP16: // %bb.0:
; CHECK-SD-FP16-NEXT: fcvtzs w0, h0, #15
; CHECK-SD-FP16-NEXT: ret
;
+; CHECK-GI-NO16-LABEL: fcvtzs_f16_i32_15:
+; CHECK-GI-NO16: // %bb.0:
+; CHECK-GI-NO16-NEXT: adrp x8, .LCPI9_0
+; CHECK-GI-NO16-NEXT: fcvt s0, h0
+; CHECK-GI-NO16-NEXT: ldr h1, [x8, :lo12:.LCPI9_0]
+; CHECK-GI-NO16-NEXT: fcvt s1, h1
+; CHECK-GI-NO16-NEXT: fmul s0, s0, s1
+; CHECK-GI-NO16-NEXT: fcvt h0, s0
+; CHECK-GI-NO16-NEXT: fcvt s0, h0
+; CHECK-GI-NO16-NEXT: fcvtzs w0, s0
+; CHECK-GI-NO16-NEXT: ret
+;
; CHECK-GI-FP16-LABEL: fcvtzs_f16_i32_15:
; CHECK-GI-FP16: // %bb.0:
; CHECK-GI-FP16-NEXT: adrp x8, .LCPI9_0
@@ -205,21 +229,33 @@ define i32 @fcvtzs_f16_i32_15(half %flt) {
}
define i64 @fcvtzs_f16_i64_7(half %flt) {
-; CHECK-NO16-LABEL: fcvtzs_f16_i64_7:
-; CHECK-NO16: // %bb.0:
-; CHECK-NO16-NEXT: movi v1.2s, #67, lsl #24
-; CHECK-NO16-NEXT: fcvt s0, h0
-; CHECK-NO16-NEXT: fmul s0, s0, s1
-; CHECK-NO16-NEXT: fcvt h0, s0
-; CHECK-NO16-NEXT: fcvt s0, h0
-; CHECK-NO16-NEXT: fcvtzs x0, s0
-; CHECK-NO16-NEXT: ret
+; CHECK-SD-NO16-LABEL: fcvtzs_f16_i64_7:
+; CHECK-SD-NO16: // %bb.0:
+; CHECK-SD-NO16-NEXT: movi v1.2s, #67, lsl #24
+; CHECK-SD-NO16-NEXT: fcvt s0, h0
+; CHECK-SD-NO16-NEXT: fmul s0, s0, s1
+; CHECK-SD-NO16-NEXT: fcvt h0, s0
+; CHECK-SD-NO16-NEXT: fcvt s0, h0
+; CHECK-SD-NO16-NEXT: fcvtzs x0, s0
+; CHECK-SD-NO16-NEXT: ret
;
; CHECK-SD-FP16-LABEL: fcvtzs_f16_i64_7:
; CHECK-SD-FP16: // %bb.0:
; CHECK-SD-FP16-NEXT: fcvtzs x0, h0, #7
; CHECK-SD-FP16-NEXT: ret
;
+; CHECK-GI-NO16-LABEL: fcvtzs_f16_i64_7:
+; CHECK-GI-NO16: // %bb.0:
+; CHECK-GI-NO16-NEXT: adrp x8, .LCPI10_0
+; CHECK-GI-NO16-NEXT: fcvt s0, h0
+; CHECK-GI-NO16-NEXT: ldr h1, [x8, :lo12:.LCPI10_0]
+; CHECK-GI-NO16-NEXT: fcvt s1, h1
+; CHECK-GI-NO16-NEXT: fmul s0, s0, s1
+; CHECK-GI-NO16-NEXT: fcvt h0, s0
+; CHECK-GI-NO16-NEXT: fcvt s0, h0
+; CHECK-GI-NO16-NEXT: fcvtzs x0, s0
+; CHECK-GI-NO16-NEXT: ret
+;
; CHECK-GI-FP16-LABEL: fcvtzs_f16_i64_7:
; CHECK-GI-FP16: // %bb.0:
; CHECK-GI-FP16-NEXT: adrp x8, .LCPI10_0
@@ -233,21 +269,33 @@ define i64 @fcvtzs_f16_i64_7(half %flt) {
}
define i64 @fcvtzs_f16_i64_15(half %flt) {
-; CHECK-NO16-LABEL: fcvtzs_f16_i64_15:
-; CHECK-NO16: // %bb.0:
-; CHECK-NO16-NEXT: movi v1.2s, #71, lsl #24
-; CHECK-NO16-NEXT: fcvt s0, h0
-; CHECK-NO16-NEXT: fmul s0, s0, s1
-; CHECK-NO16-NEXT: fcvt h0, s0
-; CHECK-NO16-NEXT: fcvt s0, h0
-; CHECK-NO16-NEXT: fcvtzs x0, s0
-; CHECK-NO16-NEXT: ret
+; CHECK-SD-NO16-LABEL: fcvtzs_f16_i64_15:
+; CHECK-SD-NO16: // %bb.0:
+; CHECK-SD-NO16-NEXT: movi v1.2s, #71, lsl #24
+; CHECK-SD-NO16-NEXT: fcvt s0, h0
+; CHECK-SD-NO16-NEXT: fmul s0, s0, s1
+; CHECK-SD-NO16-NEXT: fcvt h0, s0
+; CHECK-SD-NO16-NEXT: fcvt s0, h0
+; CHECK-SD-NO16-NEXT: fcvtzs x0, s0
+; CHECK-SD-NO16-NEXT: ret
;
; CHECK-SD-FP16-LABEL: fcvtzs_f16_i64_15:
; CHECK-SD-FP16: // %bb.0:
; CHECK-SD-FP16-NEXT: fcvtzs x0, h0, #15
; CHECK-SD-FP16-NEXT: ret
;
+; CHECK-GI-NO16-LABEL: fcvtzs_f16_i64_15:
+; CHECK-GI-NO16: // %bb.0:
+; CHECK-GI-NO16-NEXT: adrp x8, .LCPI11_0
+; CHECK-GI-NO16-NEXT: fcvt s0, h0
+; CHECK-GI-NO16-NEXT: ldr h1, [x8, :lo12:.LCPI11_0]
+; CHECK-GI-NO16-NEXT: fcvt s1, h1
+; CHECK-GI-NO16-NEXT: fmul s0, s0, s1
+; CHECK-GI-NO16-NEXT: fcvt h0, s0
+; CHECK-GI-NO16-NEXT: fcvt s0, h0
+; CHECK-GI-NO16-NEXT: fcvtzs x0, s0
+; CHECK-GI-NO16-NEXT: ret
+;
; CHECK-GI-FP16-LABEL: fcvtzs_f16_i64_15:
; CHECK-GI-FP16: // %bb.0:
; CHECK-GI-FP16-NEXT: adrp x8, .LCPI11_0
@@ -405,21 +453,33 @@ define i64 @fcvtzu_f64_i64_64(double %dbl) {
}
define i32 @fcvtzu_f16_i32_7(half %flt) {
-; CHECK-NO16-LABEL: fcvtzu_f16_i32_7:
-; CHECK-NO16: // %bb.0:
-; CHECK-NO16-NEXT: movi v1.2s, #67, lsl #24
-; CHECK-NO16-NEXT: fcvt s0, h0
-; CHECK-NO16-NEXT: fmul s0, s0, s1
-; CHECK-NO16-NEXT: fcvt h0, s0
-; CHECK-NO16-NEXT: fcvt s0, h0
-; CHECK-NO16-NEXT: fcvtzu w0, s0
-; CHECK-NO16-NEXT: ret
+; CHECK-SD-NO16-LABEL: fcvtzu_f16_i32_7:
+; CHECK-SD-NO16: // %bb.0:
+; CHECK-SD-NO16-NEXT: movi v1.2s, #67, lsl #24
+; CHECK-SD-NO16-NEXT: fcvt s0, h0
+; CHECK-SD-NO16-NEXT: fmul s0, s0, s1
+; CHECK-SD-NO16-NEXT: fcvt h0, s0
+; CHECK-SD-NO16-NEXT: fcvt s0, h0
+; CHECK-SD-NO16-NEXT: fcvtzu w0, s0
+; CHECK-SD-NO16-NEXT: ret
;
; CHECK-SD-FP16-LABEL: fcvtzu_f16_i32_7:
; CHECK-SD-FP16: // %bb.0:
; CHECK-SD-FP16-NEXT: fcvtzu w0, h0, #7
; CHECK-SD-FP16-NEXT: ret
;
+; CHECK-GI-NO16-LABEL: fcvtzu_f16_i32_7:
+; CHECK-GI-NO16: // %bb.0:
+; CHECK-GI-NO16-NEXT: adrp x8, .LCPI20_0
+; CHECK-GI-NO16-NEXT: fcvt s0, h0
+; CHECK-GI-NO16-NEXT: ldr h1, [x8, :lo12:.LCPI20_0]
+; CHECK-GI-NO16-NEXT: fcvt s1, h1
+; CHECK-GI-NO16-NEXT: fmul s0, s0, s1
+; CHECK-GI-NO16-NEXT: fcvt h0, s0
+; CHECK-GI-NO16-NEXT: fcvt s0, h0
+; CHECK-GI-NO16-NEXT: fcvtzu w0, s0
+; CHECK-GI-NO16-NEXT: ret
+;
; CHECK-GI-FP16-LABEL: fcvtzu_f16_i32_7:
; CHECK-GI-FP16: // %bb.0:
; CHECK-GI-FP16-NEXT: adrp x8, .LCPI20_0
@@ -433,21 +493,33 @@ define i32 @fcvtzu_f16_i32_7(half %flt) {
}
define i32 @fcvtzu_f16_i32_15(half %flt) {
-; CHECK-NO16-LABEL: fcvtzu_f16_i32_15:
-; CHECK-NO16: // %bb.0:
-; CHECK-NO16-NEXT: movi v1.2s, #71, lsl #24
-; CHECK-NO16-NEXT: fcvt s0, h0
-; CHECK-NO16-NEXT: fmul s0, s0, s1
-; CHECK-NO16-NEXT: fcvt h0, s0
-; CHECK-NO16-NEXT: fcvt s0, h0
-; CHECK-NO16-NEXT: fcvtzu w0, s0
-; CHECK-NO16-NEXT: ret
+; CHECK-SD-NO16-LABEL: fcvtzu_f16_i32_15:
+; CHECK-SD-NO16: // %bb.0:
+; CHECK-SD-NO16-NEXT: movi v1.2s, #71, lsl #24
+; CHECK-SD-NO16-NEXT: fcvt s0, h0
+; CHECK-SD-NO16-NEXT: fmul s0, s0, s1
+; CHECK-SD-NO16-NEXT: fcvt h0, s0
+; CHECK-SD-NO16-NEXT: fcvt s0, h0
+; CHECK-SD-NO16-NEXT: fcvtzu w0, s0
+; CHECK-SD-NO16-NEXT: ret
;
; CHECK-SD-FP16-LABEL: fcvtzu_f16_i32_15:
; CHECK-SD-FP16: // %bb.0:
; CHECK-SD-FP16-NEXT: fcvtzu w0, h0, #15
; CHECK-SD-FP16-NEXT: ret
;
+; CHECK-GI-NO16-LABEL: fcvtzu_f16_i32_15:
+; CHECK-GI-NO16: // %bb.0:
+; CHECK-GI-NO16-NEXT: adrp x8, .LCPI21_0
+; CHECK-GI-NO16-NEXT: fcvt s0, h0
+; CHECK-GI-NO16-NEXT: ldr h1, [x8, :lo12:.LCPI21_0]
+; CHECK-GI-NO16-NEXT: fcvt s1, h1
+; CHECK-GI-NO16-NEXT: fmul s0, s0, s1
+; CHECK-GI-NO16-NEXT: fcvt h0, s0
+; CHECK-GI-NO16-NEXT: fcvt s0, h0
+; CHECK-GI-NO16-NEXT: fcvtzu w0, s0
+; CHECK-GI-NO16-NEXT: ret
+;
; CHECK-GI-FP16-LABEL: fcvtzu_f16_i32_15:
; CHECK-GI-FP16: // %bb.0:
; CHECK-GI-FP16-NEXT: adrp x8, .LCPI21_0
@@ -461,21 +533,33 @@ define i32 @fcvtzu_f16_i32_15(half %flt) {
}
define i64 @fcvtzu_f16_i64_7(half %flt) {
-; CHECK-NO16-LABEL: fcvtzu_f16_i64_7:
-; CHECK-NO16: // %bb.0:
-; CHECK-NO16-NEXT: movi v1.2s, #67, lsl #24
-; CHECK-NO16-NEXT: fcvt s0, h0
-; CHECK-NO16-NEXT: fmul s0, s0, s1
-; CHECK-NO16-NEXT: fcvt h0, s0
-; CHECK-NO16-NEXT: fcvt s0, h0
-; CHECK-NO16-NEXT: fcvtzu x0, s0
-; CHECK-NO16-NEXT: ret
+; CHECK-SD-NO16-LABEL: fcvtzu_f16_i64_7:
+; CHECK-SD-NO16: // %bb.0:
+; CHECK-SD-NO16-NEXT: movi v1.2s, #67, lsl #24
+; CHECK-SD-NO16-NEXT: fcvt s0, h0
+; CHECK-SD-NO16-NEXT: fmul s0, s0, s1
+; CHECK-SD-NO16-NEXT: fcvt h0, s0
+; CHECK-SD-NO16-NEXT: fcvt s0, h0
+; CHECK-SD-NO16-NEXT: fcvtzu x0, s0
+; CHECK-SD-NO16-NEXT: ret
;
; CHECK-SD-FP16-LABEL: fcvtzu_f16_i64_7:
; CHECK-SD-FP16: // %bb.0:
; CHECK-SD-FP16-NEXT: fcvtzu x0, h0, #7
; CHECK-SD-FP16-NEXT: ret
;
+; CHECK-GI-NO16-LABEL: fcvtzu_f16_i64_7:
+; CHECK-GI-NO16: // %bb.0:
+; CHECK-GI-NO16-NEXT: adrp x8, .LCPI22_0
+; CHECK-GI-NO16-NEXT: fcvt s0, h0
+; CHECK-GI-NO16-NEXT: ldr h1, [x8, :lo12:.LCPI22_0]
+; CHECK-GI-NO16-NEXT: fcvt s1, h1
+; CHECK-GI-NO16-NEXT: fmul s0, s0, s1
+; CHECK-GI-NO16-NEXT: fcvt h0, s0
+; CHECK-GI-NO16-NEXT: fcvt s0, h0
+; CHECK-GI-NO16-NEXT: fcvtzu x0, s0
+; CHECK-GI-NO16-NEXT: ret
+;
; CHECK-GI-FP16-LABEL: fcvtzu_f16_i64_7:
; CHECK-GI-FP16: // %bb.0:
; CHECK-GI-FP16-NEXT: adrp x8, .LCPI22_0
@@ -489,21 +573,33 @@ define i64 @fcvtzu_f16_i64_7(half %flt) {
}
define i64 @fcvtzu_f16_i64_15(half %flt) {
-; CHECK-NO16-LABEL: fcvtzu_f16_i64_15:
-; CHECK-NO16: // %bb.0:
-; CHECK-NO16-NEXT: movi v1.2s, #71, lsl #24
-; CHECK-NO16-NEXT: fcvt s0, h0
-; CHECK-NO16-NEXT: fmul s0, s0, s1
-; CHECK-NO16-NEXT: fcvt h0, s0
-; CHECK-NO16-NEXT: fcvt s0, h0
-; CHECK-NO16-NEXT: fcvtzu x0, s0
-; CHECK-NO16-NEXT: ret
+; CHECK-SD-NO16-LABEL: fcvtzu_f16_i64_15:
+; CHECK-SD-NO16: // %bb.0:
+; CHECK-SD-NO16-NEXT: movi v1.2s, #71, lsl #24
+; CHECK-SD-NO16-NEXT: fcvt s0, h0
+; CHECK-SD-NO16-NEXT: fmul s0, s0, s1
+; CHECK-SD-NO16-NEXT: fcvt h0, s0
+; CHECK-SD-NO16-NEXT: fcvt s0, h0
+; CHECK-SD-NO16-NEXT: fcvtzu x0, s0
+; CHECK-SD-NO16-NEXT: ret
;
; CHECK-SD-FP16-LABEL: fcvtzu_f16_i64_15:
; CHECK-SD-FP16: // %bb.0:
; CHECK-SD-FP16-NEXT: fcvtzu x0, h0, #15
; CHECK-SD-FP16-NEXT: ret
;
+; CHECK-GI-NO16-LABEL: fcvtzu_f16_i64_15:
+; CHECK-GI-NO16: // %bb.0:
+; CHECK-GI-NO16-NEXT: adrp x8, .LCPI23_0
+; CHECK-GI-NO16-NEXT: fcvt s0, h0
+; CHECK-GI-NO16-NEXT: ldr h1, [x8, :lo12:.LCPI23_0]
+; CHECK-GI-NO16-NEXT: fcvt s1, h1
+; CHECK-GI-NO16-NEXT: fmul s0, s0, s1
+; CHECK-GI-NO16-NEXT: fcvt h0, s0
+; CHECK-GI-NO16-NEXT: fcvt s0, h0
+; CHECK-GI-NO16-NEXT: fcvtzu x0, s0
+; CHECK-GI-NO16-NEXT: ret
+;
; CHECK-GI-FP16-LABEL: fcvtzu_f16_i64_15:
; CHECK-GI-FP16: // %bb.0:
; CHECK-GI-FP16-NEXT: adrp x8, .LCPI23_0
@@ -678,11 +774,13 @@ define half @scvtf_f16_i32_7(i32 %int) {
;
; CHECK-GI-NO16-LABEL: scvtf_f16_i32_7:
; CHECK-GI-NO16: // %bb.0:
-; CHECK-GI-NO16-NEXT: scvtf s1, w0
-; CHECK-GI-NO16-NEXT: movi v0.2s, #67, lsl #24
-; CHECK-GI-NO16-NEXT: fcvt h1, s1
+; CHECK-GI-NO16-NEXT: scvtf s0, w0
+; CHECK-GI-NO16-NEXT: adrp x8, .LCPI32_0
+; CHECK-GI-NO16-NEXT: ldr h1, [x8, :lo12:.LCPI32_0]
; CHECK-GI-NO16-NEXT: fcvt s1, h1
-; CHECK-GI-NO16-NEXT: fdiv s0, s1, s0
+; CHECK-GI-NO16-NEXT: fcvt h0, s0
+; CHECK-GI-NO16-NEXT: fcvt s0, h0
+; CHECK-GI-NO16-NEXT: fdiv s0, s0, s1
; CHECK-GI-NO16-NEXT: fcvt h0, s0
; CHECK-GI-NO16-NEXT: ret
;
@@ -716,11 +814,13 @@ define half @scvtf_f16_i32_15(i32 %int) {
;
; CHECK-GI-NO16-LABEL: scvtf_f16_i32_15:
; CHECK-GI-NO16: // %bb.0:
-; CHECK-GI-NO16-NEXT: scvtf s1, w0
-; CHECK-GI-NO16-NEXT: movi v0.2s, #71, lsl #24
-; CHECK-GI-NO16-NEXT: fcvt h1, s1
+; CHECK-GI-NO16-NEXT: scvtf s0, w0
+; CHECK-GI-NO16-NEXT: adrp x8, .LCPI33_0
+; CHECK-GI-NO16-NEXT: ldr h1, [x8, :lo12:.LCPI33_0]
; CHECK-GI-NO16-NEXT: fcvt s1, h1
-; CHECK-GI-NO16-NEXT: fdiv s0, s1, s0
+; CHECK-GI-NO16-NEXT: fcvt h0, s0
+; CHECK-GI-NO16-NEXT: fcvt s0, h0
+; CHECK-GI-NO16-NEXT: fdiv s0, s0, s1
; CHECK-GI-NO16-NEXT: fcvt h0, s0
; CHECK-GI-NO16-NEXT: ret
;
@@ -754,11 +854,13 @@ define half @scvtf_f16_i64_7(i64 %long) {
;
; CHECK-GI-NO16-LABEL: scvtf_f16_i64_7:
; CHECK-GI-NO16: // %bb.0:
-; CHECK-GI-NO16-NEXT: scvtf s1, x0
-; CHECK-GI-NO16-NEXT: movi v0.2s, #67, lsl #24
-; CHECK-GI-NO16-NEXT: fcvt h1, s1
+; CHECK-GI-NO16-NEXT: scvtf s0, x0
+; CHECK-GI-NO16-NEXT: adrp x8, .LCPI34_0
+; CHECK-GI-NO16-NEXT: ldr h1, [x8, :lo12:.LCPI34_0]
; CHECK-GI-NO16-NEXT: fcvt s1, h1
-; CHECK-GI-NO16-NEXT: fdiv s0, s1, s0
+; CHECK-GI-NO16-NEXT: fcvt h0, s0
+; CHECK-GI-NO16-NEXT: fcvt s0, h0
+; CHECK-GI-NO16-NEXT: fdiv s0, s0, s1
; CHECK-GI-NO16-NEXT: fcvt h0, s0
; CHECK-GI-NO16-NEXT: ret
;
@@ -792,11 +894,13 @@ define half @scvtf_f16_i64_15(i64 %long) {
;
; CHECK-GI-NO16-LABEL: scvtf_f16_i64_15:
; CHECK-GI-NO16: // %bb.0:
-; CHECK-GI-NO16-NEXT: scvtf s1, x0
-; CHECK-GI-NO16-NEXT: movi v0.2s, #71, lsl #24
-; CHECK-GI-NO16-NEXT: fcvt h1, s1
+; CHECK-GI-NO16-NEXT: scvtf s0, x0
+; CHECK-GI-NO16-NEXT: adrp x8, .LCPI35_0
+; CHECK-GI-NO16-NEXT: ldr h1, [x8, :lo12:.LCPI35_0]
; CHECK-GI-NO16-NEXT: fcvt s1, h1
-; CHECK-GI-NO16-NEXT: fdiv s0, s1, s0
+; CHECK-GI-NO16-NEXT: fcvt h0, s0
+; CHECK-GI-NO16-NEXT: fcvt s0, h0
+; CHECK-GI-NO16-NEXT: fdiv s0, s0, s1
; CHECK-GI-NO16-NEXT: fcvt h0, s0
; CHECK-GI-NO16-NEXT: ret
;
@@ -974,11 +1078,13 @@ define half @ucvtf_f16_i32_7(i32 %int) {
;
; CHECK-GI-NO16-LABEL: ucvtf_f16_i32_7:
; CHECK-GI-NO16: // %bb.0:
-; CHECK-GI-NO16-NEXT: ucvtf s1, w0
-; CHECK-GI-NO16-NEXT: movi v0.2s, #67, lsl #24
-; CHECK-GI-NO16-NEXT: fcvt h1, s1
+; CHECK-GI-NO16-NEXT: ucvtf s0, w0
+; CHECK-GI-NO16-NEXT: adrp x8, .LCPI44_0
+; CHECK-GI-NO16-NEXT: ldr h1, [x8, :lo12:.LCPI44_0]
; CHECK-GI-NO16-NEXT: fcvt s1, h1
-; CHECK-GI-NO16-NEXT: fdiv s0, s1, s0
+; CHECK-GI-NO16-NEXT: fcvt h0, s0
+; CHECK-GI-NO16-NEXT: fcvt s0, h0
+; CHECK-GI-NO16-NEXT: fdiv s0, s0, s1
; CHECK-GI-NO16-NEXT: fcvt h0, s0
; CHECK-GI-NO16-NEXT: ret
;
@@ -1012,11 +1118,13 @@ define half @ucvtf_f16_i32_15(i32 %int) {
;
; CHECK-GI-NO16-LABEL: ucvtf_f16_i32_15:
; CHECK-GI-NO16: // %bb.0:
-; CHECK-GI-NO16-NEXT: ucvtf s1, w0
-; CHECK-GI-NO16-NEXT: movi v0.2s, #71, lsl #24
-; CHECK-GI-NO16-NEXT: fcvt h1, s1
+; CHECK-GI-NO16-NEXT: ucvtf s0, w0
+; CHECK-GI-NO16-NEXT: adrp x8, .LCPI45_0
+; CHECK-GI-NO16-NEXT: ldr h1, [x8, :lo12:.LCPI45_0]
; CHECK-GI-NO16-NEXT: fcvt s1, h1
-; CHECK-GI-NO16-NEXT: fdiv s0, s1, s0
+; CHECK-GI-NO16-NEXT: fcvt h0, s0
+; CHECK-GI-NO16-NEXT: fcvt s0, h0
+; CHECK-GI-NO16-NEXT: fdiv s0, s0, s1
; CHECK-GI-NO16-NEXT: fcvt h0, s0
; CHECK-GI-NO16-NEXT: ret
;
@@ -1050,11 +1158,13 @@ define half @ucvtf_f16_i64_7(i64 %long) {
;
; CHECK-GI-NO16-LABEL: ucvtf_f16_i64_7:
; CHECK-GI-NO16: // %bb.0:
-; CHECK-GI-NO16-NEXT: ucvtf s1, x0
-; CHECK-GI-NO16-NEXT: movi v0.2s, #67, lsl #24
-; CHECK-GI-NO16-NEXT: fcvt h1, s1
+; CHECK-GI-NO16-NEXT: ucvtf s0, x0
+; CHECK-GI-NO16-NEXT: adrp x8, .LCPI46_0
+; CHECK-GI-NO16-NEXT: ldr h1, [x8, :lo12:.LCPI46_0]
; CHECK-GI-NO16-NEXT: fcvt s1, h1
-; CHECK-GI-NO16-NEXT: fdiv s0, s1, s0
+; CHECK-GI-NO16-NEXT: fcvt h0, s0
+; CHECK-GI-NO16-NEXT: fcvt s0, h0
+; CHECK-GI-NO16-NEXT: fdiv s0, s0, s1
; CHECK-GI-NO16-NEXT: fcvt h0, s0
; CHECK-GI-NO16-NEXT: ret
;
@@ -1088,11 +1198,13 @@ define half @ucvtf_f16_i64_15(i64 %long) {
;
; CHECK-GI-NO16-LABEL: ucvtf_f16_i64_15:
; CHECK-GI-NO16: // %bb.0:
-; CHECK-GI-NO16-NEXT: ucvtf s1, x0
-; CHECK-GI-NO16-NEXT: movi v0.2s, #71, lsl #24
-; CHECK-GI-NO16-NEXT: fcvt h1, s1
+; CHECK-GI-NO16-NEXT: ucvtf s0, x0
+; CHECK-GI-NO16-NEXT: adrp x8, .LCPI47_0
+; CHECK-GI-NO16-NEXT: ldr h1, [x8, :lo12:.LCPI47_0]
; CHECK-GI-NO16-NEXT: fcvt s1, h1
-; CHECK-GI-NO16-NEXT: fdiv s0, s1, s0
+; CHECK-GI-NO16-NEXT: fcvt h0, s0
+; CHECK-GI-NO16-NEXT: fcvt s0, h0
+; CHECK-GI-NO16-NEXT: fdiv s0, s0, s1
; CHECK-GI-NO16-NEXT: fcvt h0, s0
; CHECK-GI-NO16-NEXT: ret
;
@@ -1244,21 +1356,33 @@ define i64 @fcvtzs_sat_f64_i64_64(double %dbl) {
}
define i32 @fcvtzs_sat_f16_i32_7(half %dbl) {
-; CHECK-NO16-LABEL: fcvtzs_sat_f16_i32_7:
-; CHECK-NO16: // %bb.0:
-; CHECK-NO16-NEXT: movi v1.2s, #67, lsl #24
-; CHECK-NO16-NEXT: fcvt s0, h0
-; CHECK-NO16-NEXT: fmul s0, s0, s1
-; CHECK-NO16-NEXT: fcvt h0, s0
-; CHECK-NO16-NEXT: fcvt s0, h0
-; CHECK-NO16-NEXT: fcvtzs w0, s0
-; CHECK-NO16-NEXT: ret
+; CHECK-SD-NO16-LABEL: fcvtzs_sat_f16_i32_7:
+; CHECK-SD-NO16: // %bb.0:
+; CHECK-SD-NO16-NEXT: movi v1.2s, #67, lsl #24
+; CHECK-SD-NO16-NEXT: fcvt s0, h0
+; CHECK-SD-NO16-NEXT: fmul s0, s0, s1
+; CHECK-SD-NO16-NEXT: fcvt h0, s0
+; CHECK-SD-NO16-NEXT: fcvt s0, h0
+; CHECK-SD-NO16-NEXT: fcvtzs w0, s0
+; CHECK-SD-NO16-NEXT: ret
;
; CHECK-SD-FP16-LABEL: fcvtzs_sat_f16_i32_7:
; CHECK-SD-FP16: // %bb.0:
; CHECK-SD-FP16-NEXT: fcvtzs w0, h0, #7
; CHECK-SD-FP16-NEXT: ret
;
+; CHECK-GI-NO16-LABEL: fcvtzs_sat_f16_i32_7:
+; CHECK-GI-NO16: // %bb.0:
+; CHECK-GI-NO16-NEXT: adrp x8, .LCPI55_0
+; CHECK-GI-NO16-NEXT: fcvt s0, h0
+; CHECK-GI-NO16-NEXT: ldr h1, [x8, :lo12:.LCPI55_0]
+; CHECK-GI-NO16-NEXT: fcvt s1, h1
+; CHECK-GI-NO16-NEXT: fmul s0, s0, s1
+; CHECK-GI-NO16-NEXT: fcvt h0, s0
+; CHECK-GI-NO16-NEXT: fcvt s0, h0
+; CHECK-GI-NO16-NEXT: fcvtzs w0, s0
+; CHECK-GI-NO16-NEXT: ret
+;
; CHECK-GI-FP16-LABEL: fcvtzs_sat_f16_i32_7:
; CHECK-GI-FP16: // %bb.0:
; CHECK-GI-FP16-NEXT: adrp x8, .LCPI55_0
@@ -1272,21 +1396,33 @@ define i32 @fcvtzs_sat_f16_i32_7(half %dbl) {
}
define i32 @fcvtzs_sat_f16_i32_15(half %dbl) {
-; CHECK-NO16-LABEL: fcvtzs_sat_f16_i32_15:
-; CHECK-NO16: // %bb.0:
-; CHECK-NO16-NEXT: movi v1.2s, #71, lsl #24
-; CHECK-NO16-NEXT: fcvt s0, h0
-; CHECK-NO16-NEXT: fmul s0, s0, s1
-; CHECK-NO16-NEXT: fcvt h0, s0
-; CHECK-NO16-NEXT: fcvt s0, h0
-; CHECK-NO16-NEXT: fcvtzs w0, s0
-; CHECK-NO16-NEXT: ret
+; CHECK-SD-NO16-LABEL: fcvtzs_sat_f16_i32_15:
+; CHECK-SD-NO16: // %bb.0:
+; CHECK-SD-NO16-NEXT: movi v1.2s, #71, lsl #24
+; CHECK-SD-NO16-NEXT: fcvt s0, h0
+; CHECK-SD-NO16-NEXT: fmul s0, s0, s1
+; CHECK-SD-NO16-NEXT: fcvt h0, s0
+; CHECK-SD-NO16-NEXT: fcvt s0, h0
+; CHECK-SD-NO16-NEXT: fcvtzs w0, s0
+; CHECK-SD-NO16-NEXT: ret
;
; CHECK-SD-FP16-LABEL: fcvtzs_sat_f16_i32_15:
; CHECK-SD-FP16: // %bb.0:
; CHECK-SD-FP16-NEXT: fcvtzs w0, h0, #15
; CHECK-SD-FP16-NEXT: ret
;
+; CHECK-GI-NO16-LABEL: fcvtzs_sat_f16_i32_15:
+; CHECK-GI-NO16: // %bb.0:
+; CHECK-GI-NO16-NEXT: adrp x8, .LCPI56_0
+; CHECK-GI-NO16-NEXT: fcvt s0, h0
+; CHECK-GI-NO16-NEXT: ldr h1, [x8, :lo12:.LCPI56_0]
+; CHECK-GI-NO16-NEXT: fcvt s1, h1
+; CHECK-GI-NO16-NEXT: fmul s0, s0, s1
+; CHECK-GI-NO16-NEXT: fcvt h0, s0
+; CHECK-GI-NO16-NEXT: fcvt s0, h0
+; CHECK-GI-NO16-NEXT: fcvtzs w0, s0
+; CHECK-GI-NO16-NEXT: ret
+;
; CHECK-GI-FP16-LABEL: fcvtzs_sat_f16_i32_15:
; CHECK-GI-FP16: // %bb.0:
; CHECK-GI-FP16-NEXT: adrp x8, .LCPI56_0
@@ -1300,21 +1436,33 @@ define i32 @fcvtzs_sat_f16_i32_15(half %dbl) {
}
define i64 @fcvtzs_sat_f16_i64_7(half %dbl) {
-; CHECK-NO16-LABEL: fcvtzs_sat_f16_i64_7:
-; CHECK-NO16: // %bb.0:
-; CHECK-NO16-NEXT: movi v1.2s, #67, lsl #24
-; CHECK-NO16-NEXT: fcvt s0, h0
-; CHECK-NO16-NEXT: fmul s0, s0, s1
-; CHECK-NO16-NEXT: fcvt h0, s0
-; CHECK-NO16-NEXT: fcvt s0, h0
-; CHECK-NO16-NEXT: fcvtzs x0, s0
-; CHECK-NO16-NEXT: ret
+; CHECK-SD-NO16-LABEL: fcvtzs_sat_f16_i64_7:
+; CHECK-SD-NO16: // %bb.0:
+; CHECK-SD-NO16-NEXT: movi v1.2s, #67, lsl #24
+; CHECK-SD-NO16-NEXT: fcvt s0, h0
+; CHECK-SD-NO16-NEXT: fmul s0, s0, s1
+; CHECK-SD-NO16-NEXT: fcvt h0, s0
+; CHECK-SD-NO16-NEXT: fcvt s0, h0
+; CHECK-SD-NO16-NEXT: fcvtzs x0, s0
+; CHECK-SD-NO16-NEXT: ret
;
; CHECK-SD-FP16-LABEL: fcvtzs_sat_f16_i64_7:
; CHECK-SD-FP16: // %bb.0:
; CHECK-SD-FP16-NEXT: fcvtzs x0, h0, #7
; CHECK-SD-FP16-NEXT: ret
;
+; CHECK-GI-NO16-LABEL: fcvtzs_sat_f16_i64_7:
+; CHECK-GI-NO16: // %bb.0:
+; CHECK-GI-NO16-NEXT: adrp x8, .LCPI57_0
+; CHECK-GI-NO16-NEXT: fcvt s0, h0
+; CHECK-GI-NO16-NEXT: ldr h1, [x8, :lo12:.LCPI57_0]
+; CHECK-GI-NO16-NEXT: fcvt s1, h1
+; CHECK-GI-NO16-NEXT: fmul s0, s0, s1
+; CHECK-GI-NO16-NEXT: fcvt h0, s0
+; CHECK-GI-NO16-NEXT: fcvt s0, h0
+; CHECK-GI-NO16-NEXT: fcvtzs x0, s0
+; CHECK-GI-NO16-NEXT: ret
+;
; CHECK-GI-FP16-LABEL: fcvtzs_sat_f16_i64_7:
; CHECK-GI-FP16: // %bb.0:
; CHECK-GI-FP16-NEXT: adrp x8, .LCPI57_0
@@ -1328,21 +1476,33 @@ define i64 @fcvtzs_sat_f16_i64_7(half %dbl) {
}
define i64 @fcvtzs_sat_f16_i64_15(half %dbl) {
-; CHECK-NO16-LABEL: fcvtzs_sat_f16_i64_15:
-; CHECK-NO16: // %bb.0:
-; CHECK-NO16-NEXT: movi v1.2s, #71, lsl #24
-; CHECK-NO16-NEXT: fcvt s0, h0
-; CHECK-NO16-NEXT: fmul s0, s0, s1
-; CHECK-NO16-NEXT: fcvt h0, s0
-; CHECK-NO16-NEXT: fcvt s0, h0
-; CHECK-NO16-NEXT: fcvtzs x0, s0
-; CHECK-NO16-NEXT: ret
+; CHECK-SD-NO16-LABEL: fcvtzs_sat_f16_i64_15:
+; CHECK-SD-NO16: // %bb.0:
+; CHECK-SD-NO16-NEXT: movi v1.2s, #71, lsl #24
+; CHECK-SD-NO16-NEXT: fcvt s0, h0
+; CHECK-SD-NO16-NEXT: fmul s0, s0, s1
+; CHECK-SD-NO16-NEXT: fcvt h0, s0
+; CHECK-SD-NO16-NEXT: fcvt s0, h0
+; CHECK-SD-NO16-NEXT: fcvtzs x0, s0
+; CHECK-SD-NO16-NEXT: ret
;
; CHECK-SD-FP16-LABEL: fcvtzs_sat_f16_i64_15:
; CHECK-SD-FP16: // %bb.0:
; CHECK-SD-FP16-NEXT: fcvtzs x0, h0, #15
; CHECK-SD-FP16-NEXT: ret
;
+; CHECK-GI-NO16-LABEL: fcvtzs_sat_f16_i64_15:
+; CHECK-GI-NO16: // %bb.0:
+; CHECK-GI-NO16-NEXT: adrp x8, .LCPI58_0
+; CHECK-GI-NO16-NEXT: fcvt s0, h0
+; CHECK-GI-NO16-NEXT: ldr h1, [x8, :lo12:.LCPI58_0]
+; CHECK-GI-NO16-NEXT: fcvt s1, h1
+; CHECK-GI-NO16-NEXT: fmul s0, s0, s1
+; CHECK-GI-NO16-NEXT: fcvt h0, s0
+; CHECK-GI-NO16-NEXT: fcvt s0, h0
+; CHECK-GI-NO16-NEXT: fcvtzs x0, s0
+; CHECK-GI-NO16-NEXT: ret
+;
; CHECK-GI-FP16-LABEL: fcvtzs_sat_f16_i64_15:
; CHECK-GI-FP16: // %bb.0:
; CHECK-GI-FP16-NEXT: adrp x8, .LCPI58_0
@@ -1490,21 +1650,33 @@ define i64 @fcvtzu_sat_f64_i64_64(double %dbl) {
}
define i32 @fcvtzu_sat_f16_i32_7(half %dbl) {
-; CHECK-NO16-LABEL: fcvtzu_sat_f16_i32_7:
-; CHECK-NO16: // %bb.0:
-; CHECK-NO16-NEXT: movi v1.2s, #67, lsl #24
-; CHECK-NO16-NEXT: fcvt s0, h0
-; CHECK-NO16-NEXT: fmul s0, s0, s1
-; CHECK-NO16-NEXT: fcvt h0, s0
-; CHECK-NO16-NEXT: fcvt s0, h0
-; CHECK-NO16-NEXT: fcvtzu w0, s0
-; CHECK-NO16-NEXT: ret
+; CHECK-SD-NO16-LABEL: fcvtzu_sat_f16_i32_7:
+; CHECK-SD-NO16: // %bb.0:
+; CHECK-SD-NO16-NEXT: movi v1.2s, #67, lsl #24
+; CHECK-SD-NO16-NEXT: fcvt s0, h0
+; CHECK-SD-NO16-NEXT: fmul s0, s0, s1
+; CHECK-SD-NO16-NEXT: fcvt h0, s0
+; CHECK-SD-NO16-NEXT: fcvt s0, h0
+; CHECK-SD-NO16-NEXT: fcvtzu w0, s0
+; CHECK-SD-NO16-NEXT: ret
;
; CHECK-SD-FP16-LABEL: fcvtzu_sat_f16_i32_7:
; CHECK-SD-FP16: // %bb.0:
; CHECK-SD-FP16-NEXT: fcvtzu w0, h0, #7
; CHECK-SD-FP16-NEXT: ret
;
+; CHECK-GI-NO16-LABEL: fcvtzu_sat_f16_i32_7:
+; CHECK-GI-NO16: // %bb.0:
+; CHECK-GI-NO16-NEXT: adrp x8, .LCPI66_0
+; CHECK-GI-NO16-NEXT: fcvt s0, h0
+; CHECK-GI-NO16-NEXT: ldr h1, [x8, :lo12:.LCPI66_0]
+; CHECK-GI-NO16-NEXT: fcvt s1, h1
+; CHECK-GI-NO16-NEXT: fmul s0, s0, s1
+; CHECK-GI-NO16-NEXT: fcvt h0, s0
+; CHECK-GI-NO16-NEXT: fcvt s0, h0
+; CHECK-GI-NO16-NEXT: fcvtzu w0, s0
+; CHECK-GI-NO16-NEXT: ret
+;
; CHECK-GI-FP16-LABEL: fcvtzu_sat_f16_i32_7:
; CHECK-GI-FP16: // %bb.0:
; CHECK-GI-FP16-NEXT: adrp x8, .LCPI66_0
@@ -1518,21 +1690,33 @@ define i32 @fcvtzu_sat_f16_i32_7(half %dbl) {
}
define i32 @fcvtzu_sat_f16_i32_15(half %dbl) {
-; CHECK-NO16-LABEL: fcvtzu_sat_f16_i32_15:
-; CHECK-NO16: // %bb.0:
-; CHECK-NO16-NEXT: movi v1.2s, #71, lsl #24
-; CHECK-NO16-NEXT: fcvt s0, h0
-; CHECK-NO16-NEXT: fmul s0, s0, s1
-; CHECK-NO16-NEXT: fcvt h0, s0
-; CHECK-NO16-NEXT: fcvt s0, h0
-; CHECK-NO16-NEXT: fcvtzu w0, s0
-; CHECK-NO16-NEXT: ret
+; CHECK-SD-NO16-LABEL: fcvtzu_sat_f16_i32_15:
+; CHECK-SD-NO16: // %bb.0:
+; CHECK-SD-NO16-NEXT: movi v1.2s, #71, lsl #24
+; CHECK-SD-NO16-NEXT: fcvt s0, h0
+; CHECK-SD-NO16-NEXT: fmul s0, s0, s1
+; CHECK-SD-NO16-NEXT: fcvt h0, s0
+; CHECK-SD-NO16-NEXT: fcvt s0, h0
+; CHECK-SD-NO16-NEXT: fcvtzu w0, s0
+; CHECK-SD-NO16-NEXT: ret
;
; CHECK-SD-FP16-LABEL: fcvtzu_sat_f16_i32_15:
; CHECK-SD-FP16: // %bb.0:
; CHECK-SD-FP16-NEXT: fcvtzu w0, h0, #15
; CHECK-SD-FP16-NEXT: ret
;
+; CHECK-GI-NO16-LABEL: fcvtzu_sat_f16_i32_15:
+; CHECK-GI-NO16: // %bb.0:
+; CHECK-GI-NO16-NEXT: adrp x8, .LCPI67_0
+; CHECK-GI-NO16-NEXT: fcvt s0, h0
+; CHECK-GI-NO16-NEXT: ldr h1, [x8, :lo12:.LCPI67_0]
+; CHECK-GI-NO16-NEXT: fcvt s1, h1
+; CHECK-GI-NO16-NEXT: fmul s0, s0, s1
+; CHECK-GI-NO16-NEXT: fcvt h0, s0
+; CHECK-GI-NO16-NEXT: fcvt s0, h0
+; CHECK-GI-NO16-NEXT: fcvtzu w0, s0
+; CHECK-GI-NO16-NEXT: ret
+;
; CHECK-GI-FP16-LABEL: fcvtzu_sat_f16_i32_15:
; CHECK-GI-FP16: // %bb.0:
; CHECK-GI-FP16-NEXT: adrp x8, .LCPI67_0
@@ -1546,21 +1730,33 @@ define i32 @fcvtzu_sat_f16_i32_15(half %dbl) {
}
define i64 @fcvtzu_sat_f16_i64_7(half %dbl) {
-; CHECK-NO16-LABEL: fcvtzu_sat_f16_i64_7:
-; CHECK-NO16: // %bb.0:
-; CHECK-NO16-NEXT: movi v1.2s, #67, lsl #24
-; CHECK-NO16-NEXT: fcvt s0, h0
-; CHECK-NO16-NEXT: fmul s0, s0, s1
-; CHECK-NO16-NEXT: fcvt h0, s0
-; CHECK-NO16-NEXT: fcvt s0, h0
-; CHECK-NO16-NEXT: fcvtzu x0, s0
-; CHECK-NO16-NEXT: ret
+; CHECK-SD-NO16-LABEL: fcvtzu_sat_f16_i64_7:
+; CHECK-SD-NO16: // %bb.0:
+; CHECK-SD-NO16-NEXT: movi v1.2s, #67, lsl #24
+; CHECK-SD-NO16-NEXT: fcvt s0, h0
+; CHECK-SD-NO16-NEXT: fmul s0, s0, s1
+; CHECK-SD-NO16-NEXT: fcvt h0, s0
+; CHECK-SD-NO16-NEXT: fcvt s0, h0
+; CHECK-SD-NO16-NEXT: fcvtzu x0, s0
+; CHECK-SD-NO16-NEXT: ret
;
; CHECK-SD-FP16-LABEL: fcvtzu_sat_f16_i64_7:
; CHECK-SD-FP16: // %bb.0:
; CHECK-SD-FP16-NEXT: fcvtzu x0, h0, #7
; CHECK-SD-FP16-NEXT: ret
;
+; CHECK-GI-NO16-LABEL: fcvtzu_sat_f16_i64_7:
+; CHECK-GI-NO16: // %bb.0:
+; CHECK-GI-NO16-NEXT: adrp x8, .LCPI68_0
+; CHECK-GI-NO16-NEXT: fcvt s0, h0
+; CHECK-GI-NO16-NEXT: ldr h1, [x8, :lo12:.LCPI68_0]
+; CHECK-GI-NO16-NEXT: fcvt s1, h1
+; CHECK-GI-NO16-NEXT: fmul s0, s0, s1
+; CHECK-GI-NO16-NEXT: fcvt h0, s0
+; CHECK-GI-NO16-NEXT: fcvt s0, h0
+; CHECK-GI-NO16-NEXT: fcvtzu x0, s0
+; CHECK-GI-NO16-NEXT: ret
+;
; CHECK-GI-FP16-LABEL: fcvtzu_sat_f16_i64_7:
; CHECK-GI-FP16: // %bb.0:
; CHECK-GI-FP16-NEXT: adrp x8, .LCPI68_0
@@ -1574,21 +1770,33 @@ define i64 @fcvtzu_sat_f16_i64_7(half %dbl) {
}
define i64 @fcvtzu_sat_f16_i64_15(half %dbl) {
-; CHECK-NO16-LABEL: fcvtzu_sat_f16_i64_15:
-; CHECK-NO16: // %bb.0:
-; CHECK-NO16-NEXT: movi v1.2s, #71, lsl #24
-; CHECK-NO16-NEXT: fcvt s0, h0
-; CHECK-NO16-NEXT: fmul s0, s0, s1
-; CHECK-NO16-NEXT: fcvt h0, s0
-; CHECK-NO16-NEXT: fcvt s0, h0
-; CHECK-NO16-NEXT: fcvtzu x0, s0
-; CHECK-NO16-NEXT: ret
+; CHECK-SD-NO16-LABEL: fcvtzu_sat_f16_i64_15:
+; CHECK-SD-NO16: // %bb.0:
+; CHECK-SD-NO16-NEXT: movi v1.2s, #71, lsl #24
+; CHECK-SD-NO16-NEXT: fcvt s0, h0
+; CHECK-SD-NO16-NEXT: fmul s0, s0, s1
+; CHECK-SD-NO16-NEXT: fcvt h0, s0
+; CHECK-SD-NO16-NEXT: fcvt s0, h0
+; CHECK-SD-NO16-NEXT: fcvtzu x0, s0
+; CHECK-SD-NO16-NEXT: ret
;
; CHECK-SD-FP16-LABEL: fcvtzu_sat_f16_i64_15:
; CHECK-SD-FP16: // %bb.0:
; CHECK-SD-FP16-NEXT: fcvtzu x0, h0, #15
; CHECK-SD-FP16-NEXT: ret
;
+; CHECK-GI-NO16-LABEL: fcvtzu_sat_f16_i64_15:
+; CHECK-GI-NO16: // %bb.0:
+; CHECK-GI-NO16-NEXT: adrp x8, .LCPI69_0
+; CHECK-GI-NO16-NEXT: fcvt s0, h0
+; CHECK-GI-NO16-NEXT: ldr h1, [x8, :lo12:.LCPI69_0]
+; CHECK-GI-NO16-NEXT: fcvt s1, h1
+; CHECK-GI-NO16-NEXT: fmul s0, s0, s1
+; CHECK-GI-NO16-NEXT: fcvt h0, s0
+; CHECK-GI-NO16-NEXT: fcvt s0, h0
+; CHECK-GI-NO16-NEXT: fcvtzu x0, s0
+; CHECK-GI-NO16-NEXT: ret
+;
; CHECK-GI-FP16-LABEL: fcvtzu_sat_f16_i64_15:
; CHECK-GI-FP16: // %bb.0:
; CHECK-GI-FP16-NEXT: adrp x8, .LCPI69_0
@@ -1603,3 +1811,4 @@ define i64 @fcvtzu_sat_f16_i64_15(half %dbl) {
;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
; CHECK: {{.*}}
; CHECK-FP16: {{.*}}
+; CHECK-NO16: {{.*}}
diff --git a/llvm/test/CodeGen/AArch64/frem-power2.ll b/llvm/test/CodeGen/AArch64/frem-power2.ll
index e1bc7426ad63e..98276b68481a1 100644
--- a/llvm/test/CodeGen/AArch64/frem-power2.ll
+++ b/llvm/test/CodeGen/AArch64/frem-power2.ll
@@ -100,8 +100,9 @@ define half @hrem2_nsz(half %x) {
; CHECK-GI-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
; CHECK-GI-NEXT: .cfi_def_cfa_offset 16
; CHECK-GI-NEXT: .cfi_offset w30, -16
+; CHECK-GI-NEXT: fmov h1, #2.00000000
; CHECK-GI-NEXT: fcvt s0, h0
-; CHECK-GI-NEXT: fmov s1, #2.00000000
+; CHECK-GI-NEXT: fcvt s1, h1
; CHECK-GI-NEXT: bl fmodf
; CHECK-GI-NEXT: fcvt h0, s0
; CHECK-GI-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
diff --git a/llvm/test/CodeGen/AArch64/vecreduce-fadd-strict.ll b/llvm/test/CodeGen/AArch64/vecreduce-fadd-strict.ll
index 8e0328eaa2658..be07978cd8516 100644
--- a/llvm/test/CodeGen/AArch64/vecreduce-fadd-strict.ll
+++ b/llvm/test/CodeGen/AArch64/vecreduce-fadd-strict.ll
@@ -38,11 +38,17 @@ define half @add_v2HalfH(<2 x half> %bin.rdx) {
;
; CHECK-GI-NOFP16-LABEL: add_v2HalfH:
; CHECK-GI-NOFP16: // %bb.0:
+; CHECK-GI-NOFP16-NEXT: adrp x8, .LCPI1_0
; CHECK-GI-NOFP16-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-GI-NOFP16-NEXT: mov h1, v0.h[1]
+; CHECK-GI-NOFP16-NEXT: fcvt s2, h0
+; CHECK-GI-NOFP16-NEXT: ldr h1, [x8, :lo12:.LCPI1_0]
+; CHECK-GI-NOFP16-NEXT: mov h0, v0.h[1]
+; CHECK-GI-NOFP16-NEXT: fcvt s1, h1
; CHECK-GI-NOFP16-NEXT: fcvt s0, h0
+; CHECK-GI-NOFP16-NEXT: fadd s1, s1, s2
+; CHECK-GI-NOFP16-NEXT: fcvt h1, s1
; CHECK-GI-NOFP16-NEXT: fcvt s1, h1
-; CHECK-GI-NOFP16-NEXT: fadd s0, s0, s1
+; CHECK-GI-NOFP16-NEXT: fadd s0, s1, s0
; CHECK-GI-NOFP16-NEXT: fcvt h0, s0
; CHECK-GI-NOFP16-NEXT: ret
;
@@ -82,13 +88,19 @@ define half @add_v3HalfH(<3 x half> %bin.rdx) {
;
; CHECK-GI-NOFP16-LABEL: add_v3HalfH:
; CHECK-GI-NOFP16: // %bb.0:
+; CHECK-GI-NOFP16-NEXT: adrp x8, .LCPI2_0
; CHECK-GI-NOFP16-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-GI-NOFP16-NEXT: mov h1, v0.h[1]
; CHECK-GI-NOFP16-NEXT: fcvt s2, h0
-; CHECK-GI-NOFP16-NEXT: mov h0, v0.h[2]
+; CHECK-GI-NOFP16-NEXT: ldr h1, [x8, :lo12:.LCPI2_0]
; CHECK-GI-NOFP16-NEXT: fcvt s1, h1
+; CHECK-GI-NOFP16-NEXT: fadd s1, s1, s2
+; CHECK-GI-NOFP16-NEXT: mov h2, v0.h[1]
+; CHECK-GI-NOFP16-NEXT: mov h0, v0.h[2]
+; CHECK-GI-NOFP16-NEXT: fcvt h1, s1
+; CHECK-GI-NOFP16-NEXT: fcvt s2, h2
; CHECK-GI-NOFP16-NEXT: fcvt s0, h0
-; CHECK-GI-NOFP16-NEXT: fadd s1, s2, s1
+; CHECK-GI-NOFP16-NEXT: fcvt s1, h1
+; CHECK-GI-NOFP16-NEXT: fadd s1, s1, s2
; CHECK-GI-NOFP16-NEXT: fcvt h1, s1
; CHECK-GI-NOFP16-NEXT: fcvt s1, h1
; CHECK-GI-NOFP16-NEXT: fadd s0, s1, s0
@@ -140,11 +152,17 @@ define half @add_HalfH(<4 x half> %bin.rdx) {
;
; CHECK-GI-NOFP16-LABEL: add_HalfH:
; CHECK-GI-NOFP16: // %bb.0:
+; CHECK-GI-NOFP16-NEXT: adrp x8, .LCPI3_0
; CHECK-GI-NOFP16-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-GI-NOFP16-NEXT: mov h1, v0.h[1]
; CHECK-GI-NOFP16-NEXT: fcvt s2, h0
+; CHECK-GI-NOFP16-NEXT: ldr h1, [x8, :lo12:.LCPI3_0]
+; CHECK-GI-NOFP16-NEXT: fcvt s1, h1
+; CHECK-GI-NOFP16-NEXT: fadd s1, s1, s2
+; CHECK-GI-NOFP16-NEXT: mov h2, v0.h[1]
+; CHECK-GI-NOFP16-NEXT: fcvt h1, s1
+; CHECK-GI-NOFP16-NEXT: fcvt s2, h2
; CHECK-GI-NOFP16-NEXT: fcvt s1, h1
-; CHECK-GI-NOFP16-NEXT: fadd s1, s2, s1
+; CHECK-GI-NOFP16-NEXT: fadd s1, s1, s2
; CHECK-GI-NOFP16-NEXT: mov h2, v0.h[2]
; CHECK-GI-NOFP16-NEXT: mov h0, v0.h[3]
; CHECK-GI-NOFP16-NEXT: fcvt h1, s1
@@ -232,10 +250,16 @@ define half @add_H(<8 x half> %bin.rdx) {
;
; CHECK-GI-NOFP16-LABEL: add_H:
; CHECK-GI-NOFP16: // %bb.0:
-; CHECK-GI-NOFP16-NEXT: mov h1, v0.h[1]
+; CHECK-GI-NOFP16-NEXT: adrp x8, .LCPI4_0
; CHECK-GI-NOFP16-NEXT: fcvt s2, h0
+; CHECK-GI-NOFP16-NEXT: ldr h1, [x8, :lo12:.LCPI4_0]
+; CHECK-GI-NOFP16-NEXT: fcvt s1, h1
+; CHECK-GI-NOFP16-NEXT: fadd s1, s1, s2
+; CHECK-GI-NOFP16-NEXT: mov h2, v0.h[1]
+; CHECK-GI-NOFP16-NEXT: fcvt h1, s1
+; CHECK-GI-NOFP16-NEXT: fcvt s2, h2
; CHECK-GI-NOFP16-NEXT: fcvt s1, h1
-; CHECK-GI-NOFP16-NEXT: fadd s1, s2, s1
+; CHECK-GI-NOFP16-NEXT: fadd s1, s1, s2
; CHECK-GI-NOFP16-NEXT: mov h2, v0.h[2]
; CHECK-GI-NOFP16-NEXT: fcvt h1, s1
; CHECK-GI-NOFP16-NEXT: fcvt s2, h2
@@ -424,10 +448,16 @@ define half @add_2H(<16 x half> %bin.rdx) {
;
; CHECK-GI-NOFP16-LABEL: add_2H:
; CHECK-GI-NOFP16: // %bb.0:
-; CHECK-GI-NOFP16-NEXT: mov h2, v0.h[1]
+; CHECK-GI-NOFP16-NEXT: adrp x8, .LCPI7_0
; CHECK-GI-NOFP16-NEXT: fcvt s3, h0
+; CHECK-GI-NOFP16-NEXT: ldr h2, [x8, :lo12:.LCPI7_0]
+; CHECK-GI-NOFP16-NEXT: fcvt s2, h2
+; CHECK-GI-NOFP16-NEXT: fadd s2, s2, s3
+; CHECK-GI-NOFP16-NEXT: mov h3, v0.h[1]
+; CHECK-GI-NOFP16-NEXT: fcvt h2, s2
+; CHECK-GI-NOFP16-NEXT: fcvt s3, h3
; CHECK-GI-NOFP16-NEXT: fcvt s2, h2
-; CHECK-GI-NOFP16-NEXT: fadd s2, s3, s2
+; CHECK-GI-NOFP16-NEXT: fadd s2, s2, s3
; CHECK-GI-NOFP16-NEXT: mov h3, v0.h[2]
; CHECK-GI-NOFP16-NEXT: fcvt h2, s2
; CHECK-GI-NOFP16-NEXT: fcvt s3, h3
diff --git a/llvm/test/CodeGen/AArch64/vecreduce-fmul-strict.ll b/llvm/test/CodeGen/AArch64/vecreduce-fmul-strict.ll
index 716401e2ebafe..c10d6e94226f2 100644
--- a/llvm/test/CodeGen/AArch64/vecreduce-fmul-strict.ll
+++ b/llvm/test/CodeGen/AArch64/vecreduce-fmul-strict.ll
@@ -52,11 +52,17 @@ define half @mul_HalfH(<4 x half> %bin.rdx) {
;
; CHECK-GI-NOFP16-LABEL: mul_HalfH:
; CHECK-GI-NOFP16: // %bb.0:
+; CHECK-GI-NOFP16-NEXT: adrp x8, .LCPI1_0
; CHECK-GI-NOFP16-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-GI-NOFP16-NEXT: mov h1, v0.h[1]
; CHECK-GI-NOFP16-NEXT: fcvt s2, h0
+; CHECK-GI-NOFP16-NEXT: ldr h1, [x8, :lo12:.LCPI1_0]
; CHECK-GI-NOFP16-NEXT: fcvt s1, h1
-; CHECK-GI-NOFP16-NEXT: fmul s1, s2, s1
+; CHECK-GI-NOFP16-NEXT: fmul s1, s1, s2
+; CHECK-GI-NOFP16-NEXT: mov h2, v0.h[1]
+; CHECK-GI-NOFP16-NEXT: fcvt h1, s1
+; CHECK-GI-NOFP16-NEXT: fcvt s2, h2
+; CHECK-GI-NOFP16-NEXT: fcvt s1, h1
+; CHECK-GI-NOFP16-NEXT: fmul s1, s1, s2
; CHECK-GI-NOFP16-NEXT: mov h2, v0.h[2]
; CHECK-GI-NOFP16-NEXT: mov h0, v0.h[3]
; CHECK-GI-NOFP16-NEXT: fcvt h1, s1
@@ -138,10 +144,16 @@ define half @mul_H(<8 x half> %bin.rdx) {
;
; CHECK-GI-NOFP16-LABEL: mul_H:
; CHECK-GI-NOFP16: // %bb.0:
-; CHECK-GI-NOFP16-NEXT: mov h1, v0.h[1]
+; CHECK-GI-NOFP16-NEXT: adrp x8, .LCPI2_0
; CHECK-GI-NOFP16-NEXT: fcvt s2, h0
+; CHECK-GI-NOFP16-NEXT: ldr h1, [x8, :lo12:.LCPI2_0]
+; CHECK-GI-NOFP16-NEXT: fcvt s1, h1
+; CHECK-GI-NOFP16-NEXT: fmul s1, s1, s2
+; CHECK-GI-NOFP16-NEXT: mov h2, v0.h[1]
+; CHECK-GI-NOFP16-NEXT: fcvt h1, s1
+; CHECK-GI-NOFP16-NEXT: fcvt s2, h2
; CHECK-GI-NOFP16-NEXT: fcvt s1, h1
-; CHECK-GI-NOFP16-NEXT: fmul s1, s2, s1
+; CHECK-GI-NOFP16-NEXT: fmul s1, s1, s2
; CHECK-GI-NOFP16-NEXT: mov h2, v0.h[2]
; CHECK-GI-NOFP16-NEXT: fcvt h1, s1
; CHECK-GI-NOFP16-NEXT: fcvt s2, h2
@@ -309,10 +321,16 @@ define half @mul_2H(<16 x half> %bin.rdx) {
;
; CHECK-GI-NOFP16-LABEL: mul_2H:
; CHECK-GI-NOFP16: // %bb.0:
-; CHECK-GI-NOFP16-NEXT: mov h2, v0.h[1]
+; CHECK-GI-NOFP16-NEXT: adrp x8, .LCPI5_0
; CHECK-GI-NOFP16-NEXT: fcvt s3, h0
+; CHECK-GI-NOFP16-NEXT: ldr h2, [x8, :lo12:.LCPI5_0]
; CHECK-GI-NOFP16-NEXT: fcvt s2, h2
-; CHECK-GI-NOFP16-NEXT: fmul s2, s3, s2
+; CHECK-GI-NOFP16-NEXT: fmul s2, s2, s3
+; CHECK-GI-NOFP16-NEXT: mov h3, v0.h[1]
+; CHECK-GI-NOFP16-NEXT: fcvt h2, s2
+; CHECK-GI-NOFP16-NEXT: fcvt s3, h3
+; CHECK-GI-NOFP16-NEXT: fcvt s2, h2
+; CHECK-GI-NOFP16-NEXT: fmul s2, s2, s3
; CHECK-GI-NOFP16-NEXT: mov h3, v0.h[2]
; CHECK-GI-NOFP16-NEXT: fcvt h2, s2
; CHECK-GI-NOFP16-NEXT: fcvt s3, h3
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f16.ll
index 1b879a604d715..1aee6ab24eea0 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f16.ll
@@ -403,38 +403,40 @@ define half @v_neg_rcp_f16(half %x) {
; GFX6-IEEE-LABEL: v_neg_rcp_f16:
; GFX6-IEEE: ; %bb.0:
; GFX6-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, -1.0
; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX6-IEEE-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, -1.0
-; GFX6-IEEE-NEXT: v_rcp_f32_e32 v2, v1
-; GFX6-IEEE-NEXT: v_div_scale_f32 v3, vcc, -1.0, v0, -1.0
-; GFX6-IEEE-NEXT: v_fma_f32 v4, -v1, v2, 1.0
-; GFX6-IEEE-NEXT: v_fma_f32 v2, v4, v2, v2
-; GFX6-IEEE-NEXT: v_mul_f32_e32 v4, v3, v2
-; GFX6-IEEE-NEXT: v_fma_f32 v5, -v1, v4, v3
-; GFX6-IEEE-NEXT: v_fma_f32 v4, v5, v2, v4
-; GFX6-IEEE-NEXT: v_fma_f32 v1, -v1, v4, v3
-; GFX6-IEEE-NEXT: v_div_fmas_f32 v1, v1, v2, v4
-; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v1, v0, -1.0
+; GFX6-IEEE-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, v1
+; GFX6-IEEE-NEXT: v_rcp_f32_e32 v3, v2
+; GFX6-IEEE-NEXT: v_div_scale_f32 v4, vcc, v1, v0, v1
+; GFX6-IEEE-NEXT: v_fma_f32 v5, -v2, v3, 1.0
+; GFX6-IEEE-NEXT: v_fma_f32 v3, v5, v3, v3
+; GFX6-IEEE-NEXT: v_mul_f32_e32 v5, v4, v3
+; GFX6-IEEE-NEXT: v_fma_f32 v6, -v2, v5, v4
+; GFX6-IEEE-NEXT: v_fma_f32 v5, v6, v3, v5
+; GFX6-IEEE-NEXT: v_fma_f32 v2, -v2, v5, v4
+; GFX6-IEEE-NEXT: v_div_fmas_f32 v2, v2, v3, v5
+; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v2, v0, v1
; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-IEEE-NEXT: s_setpc_b64 s[30:31]
;
; GFX6-FLUSH-LABEL: v_neg_rcp_f16:
; GFX6-FLUSH: ; %bb.0:
; GFX6-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, -1.0
; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX6-FLUSH-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, -1.0
-; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v2, v1
-; GFX6-FLUSH-NEXT: v_div_scale_f32 v3, vcc, -1.0, v0, -1.0
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, v1
+; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v3, v2
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v4, vcc, v1, v0, v1
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
-; GFX6-FLUSH-NEXT: v_fma_f32 v4, -v1, v2, 1.0
-; GFX6-FLUSH-NEXT: v_fma_f32 v2, v4, v2, v2
-; GFX6-FLUSH-NEXT: v_mul_f32_e32 v4, v3, v2
-; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v1, v4, v3
-; GFX6-FLUSH-NEXT: v_fma_f32 v4, v5, v2, v4
-; GFX6-FLUSH-NEXT: v_fma_f32 v1, -v1, v4, v3
+; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v2, v3, 1.0
+; GFX6-FLUSH-NEXT: v_fma_f32 v3, v5, v3, v3
+; GFX6-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v3
+; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v5, v4
+; GFX6-FLUSH-NEXT: v_fma_f32 v5, v6, v3, v5
+; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v5, v4
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
-; GFX6-FLUSH-NEXT: v_div_fmas_f32 v1, v1, v2, v4
-; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v1, v0, -1.0
+; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v5
+; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v2, v0, v1
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-FLUSH-NEXT: s_setpc_b64 s[30:31]
@@ -458,38 +460,40 @@ define half @v_rcp_f16(half %x) {
; GFX6-IEEE-LABEL: v_rcp_f16:
; GFX6-IEEE: ; %bb.0:
; GFX6-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, 1.0
; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX6-IEEE-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, 1.0
-; GFX6-IEEE-NEXT: v_rcp_f32_e32 v2, v1
-; GFX6-IEEE-NEXT: v_div_scale_f32 v3, vcc, 1.0, v0, 1.0
-; GFX6-IEEE-NEXT: v_fma_f32 v4, -v1, v2, 1.0
-; GFX6-IEEE-NEXT: v_fma_f32 v2, v4, v2, v2
-; GFX6-IEEE-NEXT: v_mul_f32_e32 v4, v3, v2
-; GFX6-IEEE-NEXT: v_fma_f32 v5, -v1, v4, v3
-; GFX6-IEEE-NEXT: v_fma_f32 v4, v5, v2, v4
-; GFX6-IEEE-NEXT: v_fma_f32 v1, -v1, v4, v3
-; GFX6-IEEE-NEXT: v_div_fmas_f32 v1, v1, v2, v4
-; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v1, v0, 1.0
+; GFX6-IEEE-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, v1
+; GFX6-IEEE-NEXT: v_rcp_f32_e32 v3, v2
+; GFX6-IEEE-NEXT: v_div_scale_f32 v4, vcc, v1, v0, v1
+; GFX6-IEEE-NEXT: v_fma_f32 v5, -v2, v3, 1.0
+; GFX6-IEEE-NEXT: v_fma_f32 v3, v5, v3, v3
+; GFX6-IEEE-NEXT: v_mul_f32_e32 v5, v4, v3
+; GFX6-IEEE-NEXT: v_fma_f32 v6, -v2, v5, v4
+; GFX6-IEEE-NEXT: v_fma_f32 v5, v6, v3, v5
+; GFX6-IEEE-NEXT: v_fma_f32 v2, -v2, v5, v4
+; GFX6-IEEE-NEXT: v_div_fmas_f32 v2, v2, v3, v5
+; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v2, v0, v1
; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-IEEE-NEXT: s_setpc_b64 s[30:31]
;
; GFX6-FLUSH-LABEL: v_rcp_f16:
; GFX6-FLUSH: ; %bb.0:
; GFX6-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, 1.0
; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX6-FLUSH-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, 1.0
-; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v2, v1
-; GFX6-FLUSH-NEXT: v_div_scale_f32 v3, vcc, 1.0, v0, 1.0
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, v1
+; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v3, v2
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v4, vcc, v1, v0, v1
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
-; GFX6-FLUSH-NEXT: v_fma_f32 v4, -v1, v2, 1.0
-; GFX6-FLUSH-NEXT: v_fma_f32 v2, v4, v2, v2
-; GFX6-FLUSH-NEXT: v_mul_f32_e32 v4, v3, v2
-; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v1, v4, v3
-; GFX6-FLUSH-NEXT: v_fma_f32 v4, v5, v2, v4
-; GFX6-FLUSH-NEXT: v_fma_f32 v1, -v1, v4, v3
+; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v2, v3, 1.0
+; GFX6-FLUSH-NEXT: v_fma_f32 v3, v5, v3, v3
+; GFX6-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v3
+; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v5, v4
+; GFX6-FLUSH-NEXT: v_fma_f32 v5, v6, v3, v5
+; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v5, v4
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
-; GFX6-FLUSH-NEXT: v_div_fmas_f32 v1, v1, v2, v4
-; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v1, v0, 1.0
+; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v5
+; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v2, v0, v1
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-FLUSH-NEXT: s_setpc_b64 s[30:31]
@@ -513,38 +517,40 @@ define half @v_rcp_f16_arcp(half %x) {
; GFX6-IEEE-LABEL: v_rcp_f16_arcp:
; GFX6-IEEE: ; %bb.0:
; GFX6-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, 1.0
; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX6-IEEE-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, 1.0
-; GFX6-IEEE-NEXT: v_rcp_f32_e32 v2, v1
-; GFX6-IEEE-NEXT: v_div_scale_f32 v3, vcc, 1.0, v0, 1.0
-; GFX6-IEEE-NEXT: v_fma_f32 v4, -v1, v2, 1.0
-; GFX6-IEEE-NEXT: v_fma_f32 v2, v4, v2, v2
-; GFX6-IEEE-NEXT: v_mul_f32_e32 v4, v3, v2
-; GFX6-IEEE-NEXT: v_fma_f32 v5, -v1, v4, v3
-; GFX6-IEEE-NEXT: v_fma_f32 v4, v5, v2, v4
-; GFX6-IEEE-NEXT: v_fma_f32 v1, -v1, v4, v3
-; GFX6-IEEE-NEXT: v_div_fmas_f32 v1, v1, v2, v4
-; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v1, v0, 1.0
+; GFX6-IEEE-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, v1
+; GFX6-IEEE-NEXT: v_rcp_f32_e32 v3, v2
+; GFX6-IEEE-NEXT: v_div_scale_f32 v4, vcc, v1, v0, v1
+; GFX6-IEEE-NEXT: v_fma_f32 v5, -v2, v3, 1.0
+; GFX6-IEEE-NEXT: v_fma_f32 v3, v5, v3, v3
+; GFX6-IEEE-NEXT: v_mul_f32_e32 v5, v4, v3
+; GFX6-IEEE-NEXT: v_fma_f32 v6, -v2, v5, v4
+; GFX6-IEEE-NEXT: v_fma_f32 v5, v6, v3, v5
+; GFX6-IEEE-NEXT: v_fma_f32 v2, -v2, v5, v4
+; GFX6-IEEE-NEXT: v_div_fmas_f32 v2, v2, v3, v5
+; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v2, v0, v1
; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-IEEE-NEXT: s_setpc_b64 s[30:31]
;
; GFX6-FLUSH-LABEL: v_rcp_f16_arcp:
; GFX6-FLUSH: ; %bb.0:
; GFX6-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, 1.0
; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX6-FLUSH-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, 1.0
-; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v2, v1
-; GFX6-FLUSH-NEXT: v_div_scale_f32 v3, vcc, 1.0, v0, 1.0
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, v1
+; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v3, v2
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v4, vcc, v1, v0, v1
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
-; GFX6-FLUSH-NEXT: v_fma_f32 v4, -v1, v2, 1.0
-; GFX6-FLUSH-NEXT: v_fma_f32 v2, v4, v2, v2
-; GFX6-FLUSH-NEXT: v_mul_f32_e32 v4, v3, v2
-; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v1, v4, v3
-; GFX6-FLUSH-NEXT: v_fma_f32 v4, v5, v2, v4
-; GFX6-FLUSH-NEXT: v_fma_f32 v1, -v1, v4, v3
+; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v2, v3, 1.0
+; GFX6-FLUSH-NEXT: v_fma_f32 v3, v5, v3, v3
+; GFX6-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v3
+; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v5, v4
+; GFX6-FLUSH-NEXT: v_fma_f32 v5, v6, v3, v5
+; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v5, v4
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
-; GFX6-FLUSH-NEXT: v_div_fmas_f32 v1, v1, v2, v4
-; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v1, v0, 1.0
+; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v5
+; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v2, v0, v1
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-FLUSH-NEXT: s_setpc_b64 s[30:31]
@@ -569,7 +575,9 @@ define half @v_rcp_f16_arcp_afn(half %x) {
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX6-NEXT: v_cvt_f32_f16_e32 v1, 1.0
; GFX6-NEXT: v_rcp_f32_e32 v0, v0
+; GFX6-NEXT: v_mul_f32_e32 v0, v1, v0
; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-NEXT: s_setpc_b64 s[30:31]
;
@@ -592,38 +600,40 @@ define half @v_rcp_f16_ulp25(half %x) {
; GFX6-IEEE-LABEL: v_rcp_f16_ulp25:
; GFX6-IEEE: ; %bb.0:
; GFX6-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, 1.0
; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX6-IEEE-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, 1.0
-; GFX6-IEEE-NEXT: v_rcp_f32_e32 v2, v1
-; GFX6-IEEE-NEXT: v_div_scale_f32 v3, vcc, 1.0, v0, 1.0
-; GFX6-IEEE-NEXT: v_fma_f32 v4, -v1, v2, 1.0
-; GFX6-IEEE-NEXT: v_fma_f32 v2, v4, v2, v2
-; GFX6-IEEE-NEXT: v_mul_f32_e32 v4, v3, v2
-; GFX6-IEEE-NEXT: v_fma_f32 v5, -v1, v4, v3
-; GFX6-IEEE-NEXT: v_fma_f32 v4, v5, v2, v4
-; GFX6-IEEE-NEXT: v_fma_f32 v1, -v1, v4, v3
-; GFX6-IEEE-NEXT: v_div_fmas_f32 v1, v1, v2, v4
-; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v1, v0, 1.0
+; GFX6-IEEE-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, v1
+; GFX6-IEEE-NEXT: v_rcp_f32_e32 v3, v2
+; GFX6-IEEE-NEXT: v_div_scale_f32 v4, vcc, v1, v0, v1
+; GFX6-IEEE-NEXT: v_fma_f32 v5, -v2, v3, 1.0
+; GFX6-IEEE-NEXT: v_fma_f32 v3, v5, v3, v3
+; GFX6-IEEE-NEXT: v_mul_f32_e32 v5, v4, v3
+; GFX6-IEEE-NEXT: v_fma_f32 v6, -v2, v5, v4
+; GFX6-IEEE-NEXT: v_fma_f32 v5, v6, v3, v5
+; GFX6-IEEE-NEXT: v_fma_f32 v2, -v2, v5, v4
+; GFX6-IEEE-NEXT: v_div_fmas_f32 v2, v2, v3, v5
+; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v2, v0, v1
; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-IEEE-NEXT: s_setpc_b64 s[30:31]
;
; GFX6-FLUSH-LABEL: v_rcp_f16_ulp25:
; GFX6-FLUSH: ; %bb.0:
; GFX6-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, 1.0
; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX6-FLUSH-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, 1.0
-; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v2, v1
-; GFX6-FLUSH-NEXT: v_div_scale_f32 v3, vcc, 1.0, v0, 1.0
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, v1
+; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v3, v2
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v4, vcc, v1, v0, v1
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
-; GFX6-FLUSH-NEXT: v_fma_f32 v4, -v1, v2, 1.0
-; GFX6-FLUSH-NEXT: v_fma_f32 v2, v4, v2, v2
-; GFX6-FLUSH-NEXT: v_mul_f32_e32 v4, v3, v2
-; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v1, v4, v3
-; GFX6-FLUSH-NEXT: v_fma_f32 v4, v5, v2, v4
-; GFX6-FLUSH-NEXT: v_fma_f32 v1, -v1, v4, v3
+; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v2, v3, 1.0
+; GFX6-FLUSH-NEXT: v_fma_f32 v3, v5, v3, v3
+; GFX6-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v3
+; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v5, v4
+; GFX6-FLUSH-NEXT: v_fma_f32 v5, v6, v3, v5
+; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v5, v4
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
-; GFX6-FLUSH-NEXT: v_div_fmas_f32 v1, v1, v2, v4
-; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v1, v0, 1.0
+; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v5
+; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v2, v0, v1
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-FLUSH-NEXT: s_setpc_b64 s[30:31]
@@ -1444,67 +1454,70 @@ define <2 x half> @v_rcp_v2f16(<2 x half> %x) {
; GFX6-IEEE-LABEL: v_rcp_v2f16:
; GFX6-IEEE: ; %bb.0:
; GFX6-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v2, 1.0
; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, v1
-; GFX6-IEEE-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, 1.0
-; GFX6-IEEE-NEXT: v_rcp_f32_e32 v3, v2
-; GFX6-IEEE-NEXT: v_div_scale_f32 v4, vcc, 1.0, v0, 1.0
-; GFX6-IEEE-NEXT: v_fma_f32 v5, -v2, v3, 1.0
-; GFX6-IEEE-NEXT: v_fma_f32 v3, v5, v3, v3
-; GFX6-IEEE-NEXT: v_mul_f32_e32 v5, v4, v3
-; GFX6-IEEE-NEXT: v_fma_f32 v6, -v2, v5, v4
-; GFX6-IEEE-NEXT: v_fma_f32 v5, v6, v3, v5
-; GFX6-IEEE-NEXT: v_fma_f32 v2, -v2, v5, v4
-; GFX6-IEEE-NEXT: v_div_fmas_f32 v2, v2, v3, v5
-; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v2, v0, 1.0
-; GFX6-IEEE-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, 1.0
-; GFX6-IEEE-NEXT: v_rcp_f32_e32 v3, v2
-; GFX6-IEEE-NEXT: v_div_scale_f32 v4, vcc, 1.0, v1, 1.0
+; GFX6-IEEE-NEXT: v_div_scale_f32 v3, s[4:5], v0, v0, v2
+; GFX6-IEEE-NEXT: v_rcp_f32_e32 v4, v3
+; GFX6-IEEE-NEXT: v_div_scale_f32 v5, vcc, v2, v0, v2
+; GFX6-IEEE-NEXT: v_fma_f32 v6, -v3, v4, 1.0
+; GFX6-IEEE-NEXT: v_fma_f32 v4, v6, v4, v4
+; GFX6-IEEE-NEXT: v_mul_f32_e32 v6, v5, v4
+; GFX6-IEEE-NEXT: v_fma_f32 v7, -v3, v6, v5
+; GFX6-IEEE-NEXT: v_fma_f32 v6, v7, v4, v6
+; GFX6-IEEE-NEXT: v_fma_f32 v3, -v3, v6, v5
+; GFX6-IEEE-NEXT: v_div_fmas_f32 v3, v3, v4, v6
+; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v3, v0, v2
+; GFX6-IEEE-NEXT: v_div_scale_f32 v3, s[4:5], v1, v1, v2
+; GFX6-IEEE-NEXT: v_rcp_f32_e32 v4, v3
+; GFX6-IEEE-NEXT: v_div_scale_f32 v5, vcc, v2, v1, v2
; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX6-IEEE-NEXT: v_fma_f32 v5, -v2, v3, 1.0
-; GFX6-IEEE-NEXT: v_fma_f32 v3, v5, v3, v3
-; GFX6-IEEE-NEXT: v_mul_f32_e32 v5, v4, v3
-; GFX6-IEEE-NEXT: v_fma_f32 v6, -v2, v5, v4
-; GFX6-IEEE-NEXT: v_fma_f32 v5, v6, v3, v5
-; GFX6-IEEE-NEXT: v_fma_f32 v2, -v2, v5, v4
-; GFX6-IEEE-NEXT: v_div_fmas_f32 v2, v2, v3, v5
-; GFX6-IEEE-NEXT: v_div_fixup_f32 v1, v2, v1, 1.0
+; GFX6-IEEE-NEXT: v_fma_f32 v6, -v3, v4, 1.0
+; GFX6-IEEE-NEXT: v_fma_f32 v4, v6, v4, v4
+; GFX6-IEEE-NEXT: v_mul_f32_e32 v6, v5, v4
+; GFX6-IEEE-NEXT: v_fma_f32 v7, -v3, v6, v5
+; GFX6-IEEE-NEXT: v_fma_f32 v6, v7, v4, v6
+; GFX6-IEEE-NEXT: v_fma_f32 v3, -v3, v6, v5
+; GFX6-IEEE-NEXT: v_div_fmas_f32 v3, v3, v4, v6
+; GFX6-IEEE-NEXT: v_div_fixup_f32 v1, v3, v1, v2
; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX6-IEEE-NEXT: s_setpc_b64 s[30:31]
;
; GFX6-FLUSH-LABEL: v_rcp_v2f16:
; GFX6-FLUSH: ; %bb.0:
; GFX6-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v2, 1.0
; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, 1.0
-; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v3, v2
-; GFX6-FLUSH-NEXT: v_div_scale_f32 v4, vcc, 1.0, v0, 1.0
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v3, s[4:5], v0, v0, v2
+; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v4, v3
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v5, vcc, v2, v0, v2
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
-; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v2, v3, 1.0
-; GFX6-FLUSH-NEXT: v_fma_f32 v3, v5, v3, v3
-; GFX6-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v3
-; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v5, v4
-; GFX6-FLUSH-NEXT: v_fma_f32 v5, v6, v3, v5
-; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v5, v4
+; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v3, v4, 1.0
+; GFX6-FLUSH-NEXT: v_fma_f32 v4, v6, v4, v4
+; GFX6-FLUSH-NEXT: v_mul_f32_e32 v6, v5, v4
+; GFX6-FLUSH-NEXT: v_fma_f32 v7, -v3, v6, v5
+; GFX6-FLUSH-NEXT: v_fma_f32 v6, v7, v4, v6
+; GFX6-FLUSH-NEXT: v_fma_f32 v3, -v3, v6, v5
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
+; GFX6-FLUSH-NEXT: v_div_fmas_f32 v3, v3, v4, v6
+; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v4, 1.0
; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, v1
-; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v5
-; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v2, v0, 1.0
+; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v3, v0, v2
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, 1.0
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, v4
; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v3, v2
-; GFX6-FLUSH-NEXT: v_div_scale_f32 v4, vcc, 1.0, v1, 1.0
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v5, vcc, v4, v1, v4
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
-; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v2, v3, 1.0
-; GFX6-FLUSH-NEXT: v_fma_f32 v3, v5, v3, v3
-; GFX6-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v3
-; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v5, v4
-; GFX6-FLUSH-NEXT: v_fma_f32 v5, v6, v3, v5
-; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v5, v4
+; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v3, 1.0
+; GFX6-FLUSH-NEXT: v_fma_f32 v3, v6, v3, v3
+; GFX6-FLUSH-NEXT: v_mul_f32_e32 v6, v5, v3
+; GFX6-FLUSH-NEXT: v_fma_f32 v7, -v2, v6, v5
+; GFX6-FLUSH-NEXT: v_fma_f32 v6, v7, v3, v6
+; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v6, v5
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
-; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v5
-; GFX6-FLUSH-NEXT: v_div_fixup_f32 v1, v2, v1, 1.0
+; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v6
+; GFX6-FLUSH-NEXT: v_div_fixup_f32 v1, v2, v1, v4
; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX6-FLUSH-NEXT: s_setpc_b64 s[30:31]
;
@@ -1513,27 +1526,30 @@ define <2 x half> @v_rcp_v2f16(<2 x half> %x) {
; GFX8-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-IEEE-NEXT: v_cvt_f32_f16_e32 v1, v0
; GFX8-IEEE-NEXT: v_lshrrev_b32_e32 v2, 16, v0
+; GFX8-IEEE-NEXT: v_cvt_f32_f16_e32 v4, 1.0
; GFX8-IEEE-NEXT: v_cvt_f32_f16_e32 v3, v2
-; GFX8-IEEE-NEXT: v_rcp_f32_e32 v4, v1
-; GFX8-IEEE-NEXT: v_rcp_f32_e32 v5, v3
-; GFX8-IEEE-NEXT: v_mul_f32_e64 v6, -v1, v4
-; GFX8-IEEE-NEXT: v_add_f32_e32 v6, 1.0, v6
-; GFX8-IEEE-NEXT: v_mul_f32_e32 v6, v6, v4
-; GFX8-IEEE-NEXT: v_mul_f32_e64 v7, -v3, v5
-; GFX8-IEEE-NEXT: v_add_f32_e32 v6, v6, v4
-; GFX8-IEEE-NEXT: v_add_f32_e32 v7, 1.0, v7
-; GFX8-IEEE-NEXT: v_mul_f32_e64 v1, -v1, v6
-; GFX8-IEEE-NEXT: v_mul_f32_e32 v7, v7, v5
-; GFX8-IEEE-NEXT: v_add_f32_e32 v1, 1.0, v1
-; GFX8-IEEE-NEXT: v_mul_f32_e32 v1, v1, v4
-; GFX8-IEEE-NEXT: v_add_f32_e32 v4, v7, v5
-; GFX8-IEEE-NEXT: v_mul_f32_e64 v3, -v3, v4
-; GFX8-IEEE-NEXT: v_add_f32_e32 v3, 1.0, v3
-; GFX8-IEEE-NEXT: v_mul_f32_e32 v3, v3, v5
+; GFX8-IEEE-NEXT: v_rcp_f32_e32 v5, v1
+; GFX8-IEEE-NEXT: v_rcp_f32_e32 v6, v3
+; GFX8-IEEE-NEXT: v_mul_f32_e32 v7, v4, v5
+; GFX8-IEEE-NEXT: v_mul_f32_e64 v8, -v1, v7
+; GFX8-IEEE-NEXT: v_add_f32_e32 v8, v8, v4
+; GFX8-IEEE-NEXT: v_mul_f32_e32 v8, v8, v5
+; GFX8-IEEE-NEXT: v_add_f32_e32 v7, v8, v7
+; GFX8-IEEE-NEXT: v_mul_f32_e32 v9, v4, v6
+; GFX8-IEEE-NEXT: v_mul_f32_e64 v1, -v1, v7
+; GFX8-IEEE-NEXT: v_mul_f32_e64 v10, -v3, v9
+; GFX8-IEEE-NEXT: v_add_f32_e32 v1, v1, v4
+; GFX8-IEEE-NEXT: v_mul_f32_e32 v1, v1, v5
+; GFX8-IEEE-NEXT: v_add_f32_e32 v5, v10, v4
+; GFX8-IEEE-NEXT: v_mul_f32_e32 v5, v5, v6
+; GFX8-IEEE-NEXT: v_add_f32_e32 v5, v5, v9
+; GFX8-IEEE-NEXT: v_mul_f32_e64 v3, -v3, v5
+; GFX8-IEEE-NEXT: v_add_f32_e32 v3, v3, v4
+; GFX8-IEEE-NEXT: v_mul_f32_e32 v3, v3, v6
; GFX8-IEEE-NEXT: v_and_b32_e32 v1, 0xff800000, v1
; GFX8-IEEE-NEXT: v_and_b32_e32 v3, 0xff800000, v3
-; GFX8-IEEE-NEXT: v_add_f32_e32 v1, v1, v6
-; GFX8-IEEE-NEXT: v_add_f32_e32 v3, v3, v4
+; GFX8-IEEE-NEXT: v_add_f32_e32 v1, v1, v7
+; GFX8-IEEE-NEXT: v_add_f32_e32 v3, v3, v5
; GFX8-IEEE-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX8-IEEE-NEXT: v_cvt_f16_f32_e32 v3, v3
; GFX8-IEEE-NEXT: v_div_fixup_f16 v0, v1, v0, 1.0
@@ -1545,23 +1561,26 @@ define <2 x half> @v_rcp_v2f16(<2 x half> %x) {
; GFX8-FLUSH-LABEL: v_rcp_v2f16:
; GFX8-FLUSH: ; %bb.0:
; GFX8-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-FLUSH-NEXT: v_lshrrev_b32_e32 v2, 16, v0
; GFX8-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, v0
+; GFX8-FLUSH-NEXT: v_lshrrev_b32_e32 v2, 16, v0
; GFX8-FLUSH-NEXT: v_cvt_f32_f16_e32 v3, v2
-; GFX8-FLUSH-NEXT: v_rcp_f32_e32 v4, v1
-; GFX8-FLUSH-NEXT: v_rcp_f32_e32 v5, v3
-; GFX8-FLUSH-NEXT: v_mad_f32 v6, -v1, v4, 1.0
-; GFX8-FLUSH-NEXT: v_mad_f32 v7, -v3, v5, 1.0
-; GFX8-FLUSH-NEXT: v_mad_f32 v6, v6, v4, v4
-; GFX8-FLUSH-NEXT: v_mad_f32 v7, v7, v5, v5
-; GFX8-FLUSH-NEXT: v_mad_f32 v1, -v1, v6, 1.0
-; GFX8-FLUSH-NEXT: v_mad_f32 v3, -v3, v7, 1.0
-; GFX8-FLUSH-NEXT: v_mul_f32_e32 v1, v1, v4
-; GFX8-FLUSH-NEXT: v_mul_f32_e32 v3, v3, v5
+; GFX8-FLUSH-NEXT: v_cvt_f32_f16_e32 v4, 1.0
+; GFX8-FLUSH-NEXT: v_rcp_f32_e32 v5, v1
+; GFX8-FLUSH-NEXT: v_rcp_f32_e32 v6, v3
+; GFX8-FLUSH-NEXT: v_mul_f32_e32 v7, v4, v5
+; GFX8-FLUSH-NEXT: v_mad_f32 v8, -v1, v7, v4
+; GFX8-FLUSH-NEXT: v_mul_f32_e32 v9, v4, v6
+; GFX8-FLUSH-NEXT: v_mac_f32_e32 v7, v8, v5
+; GFX8-FLUSH-NEXT: v_mad_f32 v8, -v3, v9, v4
+; GFX8-FLUSH-NEXT: v_mac_f32_e32 v9, v8, v6
+; GFX8-FLUSH-NEXT: v_mad_f32 v1, -v1, v7, v4
+; GFX8-FLUSH-NEXT: v_mad_f32 v3, -v3, v9, v4
+; GFX8-FLUSH-NEXT: v_mul_f32_e32 v1, v1, v5
+; GFX8-FLUSH-NEXT: v_mul_f32_e32 v3, v3, v6
; GFX8-FLUSH-NEXT: v_and_b32_e32 v1, 0xff800000, v1
; GFX8-FLUSH-NEXT: v_and_b32_e32 v3, 0xff800000, v3
-; GFX8-FLUSH-NEXT: v_add_f32_e32 v1, v1, v6
-; GFX8-FLUSH-NEXT: v_add_f32_e32 v3, v3, v7
+; GFX8-FLUSH-NEXT: v_add_f32_e32 v1, v1, v7
+; GFX8-FLUSH-NEXT: v_add_f32_e32 v3, v3, v9
; GFX8-FLUSH-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX8-FLUSH-NEXT: v_cvt_f16_f32_e32 v3, v3
; GFX8-FLUSH-NEXT: v_div_fixup_f16 v0, v1, v0, 1.0
@@ -1575,27 +1594,30 @@ define <2 x half> @v_rcp_v2f16(<2 x half> %x) {
; GFX9-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v1, v0
; GFX9-IEEE-NEXT: v_lshrrev_b32_e32 v2, 16, v0
+; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v4, 1.0
; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v3, v2
-; GFX9-IEEE-NEXT: v_rcp_f32_e32 v4, v1
-; GFX9-IEEE-NEXT: v_rcp_f32_e32 v5, v3
-; GFX9-IEEE-NEXT: v_mul_f32_e64 v6, -v1, v4
-; GFX9-IEEE-NEXT: v_add_f32_e32 v6, 1.0, v6
-; GFX9-IEEE-NEXT: v_mul_f32_e32 v6, v6, v4
-; GFX9-IEEE-NEXT: v_mul_f32_e64 v7, -v3, v5
-; GFX9-IEEE-NEXT: v_add_f32_e32 v6, v6, v4
-; GFX9-IEEE-NEXT: v_add_f32_e32 v7, 1.0, v7
-; GFX9-IEEE-NEXT: v_mul_f32_e64 v1, -v1, v6
-; GFX9-IEEE-NEXT: v_mul_f32_e32 v7, v7, v5
-; GFX9-IEEE-NEXT: v_add_f32_e32 v1, 1.0, v1
-; GFX9-IEEE-NEXT: v_mul_f32_e32 v1, v1, v4
-; GFX9-IEEE-NEXT: v_add_f32_e32 v4, v7, v5
-; GFX9-IEEE-NEXT: v_mul_f32_e64 v3, -v3, v4
-; GFX9-IEEE-NEXT: v_add_f32_e32 v3, 1.0, v3
-; GFX9-IEEE-NEXT: v_mul_f32_e32 v3, v3, v5
+; GFX9-IEEE-NEXT: v_rcp_f32_e32 v5, v1
+; GFX9-IEEE-NEXT: v_rcp_f32_e32 v6, v3
+; GFX9-IEEE-NEXT: v_mul_f32_e32 v7, v4, v5
+; GFX9-IEEE-NEXT: v_mul_f32_e64 v8, -v1, v7
+; GFX9-IEEE-NEXT: v_add_f32_e32 v8, v8, v4
+; GFX9-IEEE-NEXT: v_mul_f32_e32 v8, v8, v5
+; GFX9-IEEE-NEXT: v_add_f32_e32 v7, v8, v7
+; GFX9-IEEE-NEXT: v_mul_f32_e32 v9, v4, v6
+; GFX9-IEEE-NEXT: v_mul_f32_e64 v1, -v1, v7
+; GFX9-IEEE-NEXT: v_mul_f32_e64 v10, -v3, v9
+; GFX9-IEEE-NEXT: v_add_f32_e32 v1, v1, v4
+; GFX9-IEEE-NEXT: v_mul_f32_e32 v1, v1, v5
+; GFX9-IEEE-NEXT: v_add_f32_e32 v5, v10, v4
+; GFX9-IEEE-NEXT: v_mul_f32_e32 v5, v5, v6
+; GFX9-IEEE-NEXT: v_add_f32_e32 v5, v5, v9
+; GFX9-IEEE-NEXT: v_mul_f32_e64 v3, -v3, v5
+; GFX9-IEEE-NEXT: v_add_f32_e32 v3, v3, v4
+; GFX9-IEEE-NEXT: v_mul_f32_e32 v3, v3, v6
; GFX9-IEEE-NEXT: v_and_b32_e32 v1, 0xff800000, v1
; GFX9-IEEE-NEXT: v_and_b32_e32 v3, 0xff800000, v3
-; GFX9-IEEE-NEXT: v_add_f32_e32 v1, v1, v6
-; GFX9-IEEE-NEXT: v_add_f32_e32 v3, v3, v4
+; GFX9-IEEE-NEXT: v_add_f32_e32 v1, v1, v7
+; GFX9-IEEE-NEXT: v_add_f32_e32 v3, v3, v5
; GFX9-IEEE-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX9-IEEE-NEXT: v_cvt_f16_f32_e32 v3, v3
; GFX9-IEEE-NEXT: v_div_fixup_f16 v0, v1, v0, 1.0
@@ -1606,24 +1628,26 @@ define <2 x half> @v_rcp_v2f16(<2 x half> %x) {
; GFX9-FLUSH-LABEL: v_rcp_v2f16:
; GFX9-FLUSH: ; %bb.0:
; GFX9-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-FLUSH-NEXT: v_lshrrev_b32_e32 v2, 16, v0
; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, v0
+; GFX9-FLUSH-NEXT: v_lshrrev_b32_e32 v2, 16, v0
; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v3, v2
-; GFX9-FLUSH-NEXT: v_mov_b32_e32 v4, 1.0
+; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v4, 1.0
; GFX9-FLUSH-NEXT: v_rcp_f32_e32 v1, v1
; GFX9-FLUSH-NEXT: v_rcp_f32_e32 v3, v3
-; GFX9-FLUSH-NEXT: v_mad_mix_f32 v5, -v0, v1, v4 op_sel_hi:[1,0,0]
-; GFX9-FLUSH-NEXT: v_mad_mix_f32 v6, -v0, v3, v4 op_sel:[1,0,0] op_sel_hi:[1,0,0]
-; GFX9-FLUSH-NEXT: v_mad_f32 v5, v5, v1, v1
-; GFX9-FLUSH-NEXT: v_mad_f32 v6, v6, v3, v3
-; GFX9-FLUSH-NEXT: v_mad_mix_f32 v7, -v0, v5, v4 op_sel_hi:[1,0,0]
-; GFX9-FLUSH-NEXT: v_mad_mix_f32 v4, -v0, v6, v4 op_sel:[1,0,0] op_sel_hi:[1,0,0]
+; GFX9-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v1
+; GFX9-FLUSH-NEXT: v_mad_mix_f32 v6, -v0, v5, 1.0 op_sel_hi:[1,0,1]
+; GFX9-FLUSH-NEXT: v_mac_f32_e32 v5, v6, v1
+; GFX9-FLUSH-NEXT: v_mul_f32_e32 v4, v4, v3
+; GFX9-FLUSH-NEXT: v_mad_mix_f32 v7, -v0, v5, 1.0 op_sel_hi:[1,0,1]
+; GFX9-FLUSH-NEXT: v_mad_mix_f32 v6, -v0, v4, 1.0 op_sel:[1,0,0] op_sel_hi:[1,0,1]
; GFX9-FLUSH-NEXT: v_mul_f32_e32 v1, v7, v1
-; GFX9-FLUSH-NEXT: v_mul_f32_e32 v3, v4, v3
; GFX9-FLUSH-NEXT: v_and_b32_e32 v1, 0xff800000, v1
-; GFX9-FLUSH-NEXT: v_and_b32_e32 v3, 0xff800000, v3
+; GFX9-FLUSH-NEXT: v_mac_f32_e32 v4, v6, v3
; GFX9-FLUSH-NEXT: v_add_f32_e32 v1, v1, v5
-; GFX9-FLUSH-NEXT: v_add_f32_e32 v3, v3, v6
+; GFX9-FLUSH-NEXT: v_mad_mix_f32 v5, -v0, v4, 1.0 op_sel:[1,0,0] op_sel_hi:[1,0,1]
+; GFX9-FLUSH-NEXT: v_mul_f32_e32 v3, v5, v3
+; GFX9-FLUSH-NEXT: v_and_b32_e32 v3, 0xff800000, v3
+; GFX9-FLUSH-NEXT: v_add_f32_e32 v3, v3, v4
; GFX9-FLUSH-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX9-FLUSH-NEXT: v_cvt_f16_f32_e32 v3, v3
; GFX9-FLUSH-NEXT: v_div_fixup_f16 v0, v1, v0, 1.0
@@ -1636,27 +1660,30 @@ define <2 x half> @v_rcp_v2f16(<2 x half> %x) {
; GFX10-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-IEEE-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GFX10-IEEE-NEXT: v_cvt_f32_f16_e32 v2, v0
+; GFX10-IEEE-NEXT: v_cvt_f32_f16_e32 v6, 1.0
; GFX10-IEEE-NEXT: v_cvt_f32_f16_e32 v3, v1
; GFX10-IEEE-NEXT: v_rcp_f32_e32 v4, v2
; GFX10-IEEE-NEXT: v_rcp_f32_e32 v5, v3
-; GFX10-IEEE-NEXT: v_mul_f32_e64 v6, -v2, v4
-; GFX10-IEEE-NEXT: v_mul_f32_e64 v7, -v3, v5
-; GFX10-IEEE-NEXT: v_add_f32_e32 v6, 1.0, v6
-; GFX10-IEEE-NEXT: v_add_f32_e32 v7, 1.0, v7
-; GFX10-IEEE-NEXT: v_mul_f32_e32 v6, v6, v4
-; GFX10-IEEE-NEXT: v_mul_f32_e32 v7, v7, v5
-; GFX10-IEEE-NEXT: v_add_f32_e32 v6, v6, v4
-; GFX10-IEEE-NEXT: v_add_f32_e32 v7, v7, v5
-; GFX10-IEEE-NEXT: v_mul_f32_e64 v2, -v2, v6
-; GFX10-IEEE-NEXT: v_mul_f32_e64 v3, -v3, v7
-; GFX10-IEEE-NEXT: v_add_f32_e32 v2, 1.0, v2
-; GFX10-IEEE-NEXT: v_add_f32_e32 v3, 1.0, v3
+; GFX10-IEEE-NEXT: v_mul_f32_e32 v7, v6, v4
+; GFX10-IEEE-NEXT: v_mul_f32_e32 v8, v6, v5
+; GFX10-IEEE-NEXT: v_mul_f32_e64 v9, -v2, v7
+; GFX10-IEEE-NEXT: v_mul_f32_e64 v10, -v3, v8
+; GFX10-IEEE-NEXT: v_add_f32_e32 v9, v9, v6
+; GFX10-IEEE-NEXT: v_add_f32_e32 v10, v10, v6
+; GFX10-IEEE-NEXT: v_mul_f32_e32 v9, v9, v4
+; GFX10-IEEE-NEXT: v_mul_f32_e32 v10, v10, v5
+; GFX10-IEEE-NEXT: v_add_f32_e32 v7, v9, v7
+; GFX10-IEEE-NEXT: v_add_f32_e32 v8, v10, v8
+; GFX10-IEEE-NEXT: v_mul_f32_e64 v2, -v2, v7
+; GFX10-IEEE-NEXT: v_mul_f32_e64 v3, -v3, v8
+; GFX10-IEEE-NEXT: v_add_f32_e32 v2, v2, v6
+; GFX10-IEEE-NEXT: v_add_f32_e32 v3, v3, v6
; GFX10-IEEE-NEXT: v_mul_f32_e32 v2, v2, v4
; GFX10-IEEE-NEXT: v_mul_f32_e32 v3, v3, v5
; GFX10-IEEE-NEXT: v_and_b32_e32 v2, 0xff800000, v2
; GFX10-IEEE-NEXT: v_and_b32_e32 v3, 0xff800000, v3
-; GFX10-IEEE-NEXT: v_add_f32_e32 v2, v2, v6
-; GFX10-IEEE-NEXT: v_add_f32_e32 v3, v3, v7
+; GFX10-IEEE-NEXT: v_add_f32_e32 v2, v2, v7
+; GFX10-IEEE-NEXT: v_add_f32_e32 v3, v3, v8
; GFX10-IEEE-NEXT: v_cvt_f16_f32_e32 v2, v2
; GFX10-IEEE-NEXT: v_cvt_f16_f32_e32 v3, v3
; GFX10-IEEE-NEXT: v_div_fixup_f16 v0, v2, v0, 1.0
@@ -1669,21 +1696,24 @@ define <2 x half> @v_rcp_v2f16(<2 x half> %x) {
; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-FLUSH-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GFX10-FLUSH-NEXT: v_cvt_f32_f16_e32 v2, v0
+; GFX10-FLUSH-NEXT: v_cvt_f32_f16_e32 v6, 1.0
; GFX10-FLUSH-NEXT: v_cvt_f32_f16_e32 v3, v1
; GFX10-FLUSH-NEXT: v_rcp_f32_e32 v4, v2
; GFX10-FLUSH-NEXT: v_rcp_f32_e32 v5, v3
-; GFX10-FLUSH-NEXT: v_mad_f32 v6, -v2, v4, 1.0
-; GFX10-FLUSH-NEXT: v_mad_f32 v7, -v3, v5, 1.0
-; GFX10-FLUSH-NEXT: v_mad_f32 v6, v6, v4, v4
-; GFX10-FLUSH-NEXT: v_mad_f32 v7, v7, v5, v5
-; GFX10-FLUSH-NEXT: v_mad_f32 v2, -v2, v6, 1.0
-; GFX10-FLUSH-NEXT: v_mad_f32 v3, -v3, v7, 1.0
+; GFX10-FLUSH-NEXT: v_mul_f32_e32 v7, v6, v4
+; GFX10-FLUSH-NEXT: v_mul_f32_e32 v8, v6, v5
+; GFX10-FLUSH-NEXT: v_mad_f32 v9, -v2, v7, v6
+; GFX10-FLUSH-NEXT: v_mad_f32 v10, -v3, v8, v6
+; GFX10-FLUSH-NEXT: v_mac_f32_e32 v7, v9, v4
+; GFX10-FLUSH-NEXT: v_mac_f32_e32 v8, v10, v5
+; GFX10-FLUSH-NEXT: v_mad_f32 v2, -v2, v7, v6
+; GFX10-FLUSH-NEXT: v_mad_f32 v3, -v3, v8, v6
; GFX10-FLUSH-NEXT: v_mul_f32_e32 v2, v2, v4
; GFX10-FLUSH-NEXT: v_mul_f32_e32 v3, v3, v5
; GFX10-FLUSH-NEXT: v_and_b32_e32 v2, 0xff800000, v2
; GFX10-FLUSH-NEXT: v_and_b32_e32 v3, 0xff800000, v3
-; GFX10-FLUSH-NEXT: v_add_f32_e32 v2, v2, v6
-; GFX10-FLUSH-NEXT: v_add_f32_e32 v3, v3, v7
+; GFX10-FLUSH-NEXT: v_add_f32_e32 v2, v2, v7
+; GFX10-FLUSH-NEXT: v_add_f32_e32 v3, v3, v8
; GFX10-FLUSH-NEXT: v_cvt_f16_f32_e32 v2, v2
; GFX10-FLUSH-NEXT: v_cvt_f16_f32_e32 v3, v3
; GFX10-FLUSH-NEXT: v_div_fixup_f16 v0, v2, v0, 1.0
@@ -1696,25 +1726,27 @@ define <2 x half> @v_rcp_v2f16(<2 x half> %x) {
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GFX11-NEXT: v_cvt_f32_f16_e32 v2, v0
-; GFX11-NEXT: v_mov_b32_e32 v4, 1.0
+; GFX11-NEXT: v_cvt_f32_f16_e32 v4, 1.0
; GFX11-NEXT: v_cvt_f32_f16_e32 v3, v1
; GFX11-NEXT: v_rcp_f32_e32 v2, v2
; GFX11-NEXT: v_rcp_f32_e32 v3, v3
; GFX11-NEXT: s_waitcnt_depctr 0xfff
-; GFX11-NEXT: v_fma_mix_f32 v5, -v0, v2, v4 op_sel_hi:[1,0,0]
-; GFX11-NEXT: v_fma_mix_f32 v6, -v0, v3, v4 op_sel:[1,0,0] op_sel_hi:[1,0,0]
-; GFX11-NEXT: v_fma_f32 v5, v5, v2, v2
-; GFX11-NEXT: v_fma_f32 v6, v6, v3, v3
-; GFX11-NEXT: v_fma_mix_f32 v7, -v0, v5, v4 op_sel_hi:[1,0,0]
-; GFX11-NEXT: v_fma_mix_f32 v4, -v0, v6, v4 op_sel:[1,0,0] op_sel_hi:[1,0,0]
-; GFX11-NEXT: v_dual_mul_f32 v2, v7, v2 :: v_dual_mul_f32 v3, v4, v3
-; GFX11-NEXT: v_and_b32_e32 v2, 0xff800000, v2
-; GFX11-NEXT: v_dual_add_f32 v2, v2, v5 :: v_dual_and_b32 v3, 0xff800000, v3
-; GFX11-NEXT: v_add_f32_e32 v3, v3, v6
-; GFX11-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GFX11-NEXT: v_mul_f32_e32 v5, v4, v2
+; GFX11-NEXT: v_mul_f32_e32 v4, v4, v3
+; GFX11-NEXT: v_fma_mix_f32 v7, -v0, v4, 1.0 op_sel:[1,0,0] op_sel_hi:[1,0,1]
+; GFX11-NEXT: v_fmac_f32_e32 v4, v7, v3
+; GFX11-NEXT: v_fma_mix_f32 v6, -v0, v5, 1.0 op_sel_hi:[1,0,1]
+; GFX11-NEXT: v_fma_mix_f32 v7, -v0, v4, 1.0 op_sel:[1,0,0] op_sel_hi:[1,0,1]
+; GFX11-NEXT: v_fmac_f32_e32 v5, v6, v2
+; GFX11-NEXT: v_mul_f32_e32 v3, v7, v3
+; GFX11-NEXT: v_fma_mix_f32 v6, -v0, v5, 1.0 op_sel_hi:[1,0,1]
+; GFX11-NEXT: v_dual_mul_f32 v2, v6, v2 :: v_dual_and_b32 v3, 0xff800000, v3
+; GFX11-NEXT: v_dual_add_f32 v3, v3, v4 :: v_dual_and_b32 v2, 0xff800000, v2
; GFX11-NEXT: v_cvt_f16_f32_e32 v3, v3
-; GFX11-NEXT: v_div_fixup_f16 v0, v2, v0, 1.0
+; GFX11-NEXT: v_add_f32_e32 v2, v2, v5
; GFX11-NEXT: v_div_fixup_f16 v1, v3, v1, 1.0
+; GFX11-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GFX11-NEXT: v_div_fixup_f16 v0, v2, v0, 1.0
; GFX11-NEXT: v_pack_b32_f16 v0, v0, v1
; GFX11-NEXT: s_setpc_b64 s[30:31]
%fdiv = fdiv <2 x half> <half 1.0, half 1.0>, %x
@@ -1725,67 +1757,70 @@ define <2 x half> @v_neg_rcp_v2f16(<2 x half> %x) {
; GFX6-IEEE-LABEL: v_neg_rcp_v2f16:
; GFX6-IEEE: ; %bb.0:
; GFX6-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v2, -1.0
; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, v1
-; GFX6-IEEE-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, -1.0
-; GFX6-IEEE-NEXT: v_rcp_f32_e32 v3, v2
-; GFX6-IEEE-NEXT: v_div_scale_f32 v4, vcc, -1.0, v0, -1.0
-; GFX6-IEEE-NEXT: v_fma_f32 v5, -v2, v3, 1.0
-; GFX6-IEEE-NEXT: v_fma_f32 v3, v5, v3, v3
-; GFX6-IEEE-NEXT: v_mul_f32_e32 v5, v4, v3
-; GFX6-IEEE-NEXT: v_fma_f32 v6, -v2, v5, v4
-; GFX6-IEEE-NEXT: v_fma_f32 v5, v6, v3, v5
-; GFX6-IEEE-NEXT: v_fma_f32 v2, -v2, v5, v4
-; GFX6-IEEE-NEXT: v_div_fmas_f32 v2, v2, v3, v5
-; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v2, v0, -1.0
-; GFX6-IEEE-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, -1.0
-; GFX6-IEEE-NEXT: v_rcp_f32_e32 v3, v2
-; GFX6-IEEE-NEXT: v_div_scale_f32 v4, vcc, -1.0, v1, -1.0
+; GFX6-IEEE-NEXT: v_div_scale_f32 v3, s[4:5], v0, v0, v2
+; GFX6-IEEE-NEXT: v_rcp_f32_e32 v4, v3
+; GFX6-IEEE-NEXT: v_div_scale_f32 v5, vcc, v2, v0, v2
+; GFX6-IEEE-NEXT: v_fma_f32 v6, -v3, v4, 1.0
+; GFX6-IEEE-NEXT: v_fma_f32 v4, v6, v4, v4
+; GFX6-IEEE-NEXT: v_mul_f32_e32 v6, v5, v4
+; GFX6-IEEE-NEXT: v_fma_f32 v7, -v3, v6, v5
+; GFX6-IEEE-NEXT: v_fma_f32 v6, v7, v4, v6
+; GFX6-IEEE-NEXT: v_fma_f32 v3, -v3, v6, v5
+; GFX6-IEEE-NEXT: v_div_fmas_f32 v3, v3, v4, v6
+; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v3, v0, v2
+; GFX6-IEEE-NEXT: v_div_scale_f32 v3, s[4:5], v1, v1, v2
+; GFX6-IEEE-NEXT: v_rcp_f32_e32 v4, v3
+; GFX6-IEEE-NEXT: v_div_scale_f32 v5, vcc, v2, v1, v2
; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX6-IEEE-NEXT: v_fma_f32 v5, -v2, v3, 1.0
-; GFX6-IEEE-NEXT: v_fma_f32 v3, v5, v3, v3
-; GFX6-IEEE-NEXT: v_mul_f32_e32 v5, v4, v3
-; GFX6-IEEE-NEXT: v_fma_f32 v6, -v2, v5, v4
-; GFX6-IEEE-NEXT: v_fma_f32 v5, v6, v3, v5
-; GFX6-IEEE-NEXT: v_fma_f32 v2, -v2, v5, v4
-; GFX6-IEEE-NEXT: v_div_fmas_f32 v2, v2, v3, v5
-; GFX6-IEEE-NEXT: v_div_fixup_f32 v1, v2, v1, -1.0
+; GFX6-IEEE-NEXT: v_fma_f32 v6, -v3, v4, 1.0
+; GFX6-IEEE-NEXT: v_fma_f32 v4, v6, v4, v4
+; GFX6-IEEE-NEXT: v_mul_f32_e32 v6, v5, v4
+; GFX6-IEEE-NEXT: v_fma_f32 v7, -v3, v6, v5
+; GFX6-IEEE-NEXT: v_fma_f32 v6, v7, v4, v6
+; GFX6-IEEE-NEXT: v_fma_f32 v3, -v3, v6, v5
+; GFX6-IEEE-NEXT: v_div_fmas_f32 v3, v3, v4, v6
+; GFX6-IEEE-NEXT: v_div_fixup_f32 v1, v3, v1, v2
; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX6-IEEE-NEXT: s_setpc_b64 s[30:31]
;
; GFX6-FLUSH-LABEL: v_neg_rcp_v2f16:
; GFX6-FLUSH: ; %bb.0:
; GFX6-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v2, -1.0
; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, -1.0
-; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v3, v2
-; GFX6-FLUSH-NEXT: v_div_scale_f32 v4, vcc, -1.0, v0, -1.0
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v3, s[4:5], v0, v0, v2
+; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v4, v3
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v5, vcc, v2, v0, v2
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
-; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v2, v3, 1.0
-; GFX6-FLUSH-NEXT: v_fma_f32 v3, v5, v3, v3
-; GFX6-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v3
-; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v5, v4
-; GFX6-FLUSH-NEXT: v_fma_f32 v5, v6, v3, v5
-; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v5, v4
+; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v3, v4, 1.0
+; GFX6-FLUSH-NEXT: v_fma_f32 v4, v6, v4, v4
+; GFX6-FLUSH-NEXT: v_mul_f32_e32 v6, v5, v4
+; GFX6-FLUSH-NEXT: v_fma_f32 v7, -v3, v6, v5
+; GFX6-FLUSH-NEXT: v_fma_f32 v6, v7, v4, v6
+; GFX6-FLUSH-NEXT: v_fma_f32 v3, -v3, v6, v5
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
+; GFX6-FLUSH-NEXT: v_div_fmas_f32 v3, v3, v4, v6
+; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v4, -1.0
; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, v1
-; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v5
-; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v2, v0, -1.0
+; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v3, v0, v2
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, -1.0
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, v4
; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v3, v2
-; GFX6-FLUSH-NEXT: v_div_scale_f32 v4, vcc, -1.0, v1, -1.0
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v5, vcc, v4, v1, v4
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
-; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v2, v3, 1.0
-; GFX6-FLUSH-NEXT: v_fma_f32 v3, v5, v3, v3
-; GFX6-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v3
-; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v5, v4
-; GFX6-FLUSH-NEXT: v_fma_f32 v5, v6, v3, v5
-; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v5, v4
+; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v3, 1.0
+; GFX6-FLUSH-NEXT: v_fma_f32 v3, v6, v3, v3
+; GFX6-FLUSH-NEXT: v_mul_f32_e32 v6, v5, v3
+; GFX6-FLUSH-NEXT: v_fma_f32 v7, -v2, v6, v5
+; GFX6-FLUSH-NEXT: v_fma_f32 v6, v7, v3, v6
+; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v6, v5
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
-; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v5
-; GFX6-FLUSH-NEXT: v_div_fixup_f32 v1, v2, v1, -1.0
+; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v6
+; GFX6-FLUSH-NEXT: v_div_fixup_f32 v1, v2, v1, v4
; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX6-FLUSH-NEXT: s_setpc_b64 s[30:31]
;
@@ -1794,27 +1829,30 @@ define <2 x half> @v_neg_rcp_v2f16(<2 x half> %x) {
; GFX8-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-IEEE-NEXT: v_cvt_f32_f16_e32 v1, v0
; GFX8-IEEE-NEXT: v_lshrrev_b32_e32 v2, 16, v0
+; GFX8-IEEE-NEXT: v_cvt_f32_f16_e32 v4, -1.0
; GFX8-IEEE-NEXT: v_cvt_f32_f16_e32 v3, v2
-; GFX8-IEEE-NEXT: v_rcp_f32_e32 v4, v1
-; GFX8-IEEE-NEXT: v_rcp_f32_e32 v5, v3
-; GFX8-IEEE-NEXT: v_mul_f32_e32 v6, v1, v4
-; GFX8-IEEE-NEXT: v_add_f32_e32 v6, -1.0, v6
-; GFX8-IEEE-NEXT: v_mul_f32_e32 v6, v6, v4
-; GFX8-IEEE-NEXT: v_mul_f32_e32 v7, v3, v5
-; GFX8-IEEE-NEXT: v_sub_f32_e32 v6, v6, v4
-; GFX8-IEEE-NEXT: v_add_f32_e32 v7, -1.0, v7
-; GFX8-IEEE-NEXT: v_mul_f32_e64 v1, -v1, v6
-; GFX8-IEEE-NEXT: v_mul_f32_e32 v7, v7, v5
-; GFX8-IEEE-NEXT: v_add_f32_e32 v1, -1.0, v1
-; GFX8-IEEE-NEXT: v_mul_f32_e32 v1, v1, v4
-; GFX8-IEEE-NEXT: v_sub_f32_e32 v4, v7, v5
-; GFX8-IEEE-NEXT: v_mul_f32_e64 v3, -v3, v4
-; GFX8-IEEE-NEXT: v_add_f32_e32 v3, -1.0, v3
-; GFX8-IEEE-NEXT: v_mul_f32_e32 v3, v3, v5
+; GFX8-IEEE-NEXT: v_rcp_f32_e32 v5, v1
+; GFX8-IEEE-NEXT: v_rcp_f32_e32 v6, v3
+; GFX8-IEEE-NEXT: v_mul_f32_e32 v7, v4, v5
+; GFX8-IEEE-NEXT: v_mul_f32_e64 v8, -v1, v7
+; GFX8-IEEE-NEXT: v_add_f32_e32 v8, v8, v4
+; GFX8-IEEE-NEXT: v_mul_f32_e32 v8, v8, v5
+; GFX8-IEEE-NEXT: v_add_f32_e32 v7, v8, v7
+; GFX8-IEEE-NEXT: v_mul_f32_e32 v9, v4, v6
+; GFX8-IEEE-NEXT: v_mul_f32_e64 v1, -v1, v7
+; GFX8-IEEE-NEXT: v_mul_f32_e64 v10, -v3, v9
+; GFX8-IEEE-NEXT: v_add_f32_e32 v1, v1, v4
+; GFX8-IEEE-NEXT: v_mul_f32_e32 v1, v1, v5
+; GFX8-IEEE-NEXT: v_add_f32_e32 v5, v10, v4
+; GFX8-IEEE-NEXT: v_mul_f32_e32 v5, v5, v6
+; GFX8-IEEE-NEXT: v_add_f32_e32 v5, v5, v9
+; GFX8-IEEE-NEXT: v_mul_f32_e64 v3, -v3, v5
+; GFX8-IEEE-NEXT: v_add_f32_e32 v3, v3, v4
+; GFX8-IEEE-NEXT: v_mul_f32_e32 v3, v3, v6
; GFX8-IEEE-NEXT: v_and_b32_e32 v1, 0xff800000, v1
; GFX8-IEEE-NEXT: v_and_b32_e32 v3, 0xff800000, v3
-; GFX8-IEEE-NEXT: v_add_f32_e32 v1, v1, v6
-; GFX8-IEEE-NEXT: v_add_f32_e32 v3, v3, v4
+; GFX8-IEEE-NEXT: v_add_f32_e32 v1, v1, v7
+; GFX8-IEEE-NEXT: v_add_f32_e32 v3, v3, v5
; GFX8-IEEE-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX8-IEEE-NEXT: v_cvt_f16_f32_e32 v3, v3
; GFX8-IEEE-NEXT: v_div_fixup_f16 v0, v1, v0, -1.0
@@ -1826,23 +1864,26 @@ define <2 x half> @v_neg_rcp_v2f16(<2 x half> %x) {
; GFX8-FLUSH-LABEL: v_neg_rcp_v2f16:
; GFX8-FLUSH: ; %bb.0:
; GFX8-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-FLUSH-NEXT: v_lshrrev_b32_e32 v2, 16, v0
; GFX8-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, v0
+; GFX8-FLUSH-NEXT: v_lshrrev_b32_e32 v2, 16, v0
; GFX8-FLUSH-NEXT: v_cvt_f32_f16_e32 v3, v2
-; GFX8-FLUSH-NEXT: v_rcp_f32_e32 v4, v1
-; GFX8-FLUSH-NEXT: v_rcp_f32_e32 v5, v3
-; GFX8-FLUSH-NEXT: v_mad_f32 v6, v1, v4, -1.0
-; GFX8-FLUSH-NEXT: v_mad_f32 v7, v3, v5, -1.0
-; GFX8-FLUSH-NEXT: v_mad_f32 v6, v6, v4, -v4
-; GFX8-FLUSH-NEXT: v_mad_f32 v7, v7, v5, -v5
-; GFX8-FLUSH-NEXT: v_mad_f32 v1, -v1, v6, -1.0
-; GFX8-FLUSH-NEXT: v_mad_f32 v3, -v3, v7, -1.0
-; GFX8-FLUSH-NEXT: v_mul_f32_e32 v1, v1, v4
-; GFX8-FLUSH-NEXT: v_mul_f32_e32 v3, v3, v5
+; GFX8-FLUSH-NEXT: v_cvt_f32_f16_e32 v4, -1.0
+; GFX8-FLUSH-NEXT: v_rcp_f32_e32 v5, v1
+; GFX8-FLUSH-NEXT: v_rcp_f32_e32 v6, v3
+; GFX8-FLUSH-NEXT: v_mul_f32_e32 v7, v4, v5
+; GFX8-FLUSH-NEXT: v_mad_f32 v8, -v1, v7, v4
+; GFX8-FLUSH-NEXT: v_mul_f32_e32 v9, v4, v6
+; GFX8-FLUSH-NEXT: v_mac_f32_e32 v7, v8, v5
+; GFX8-FLUSH-NEXT: v_mad_f32 v8, -v3, v9, v4
+; GFX8-FLUSH-NEXT: v_mac_f32_e32 v9, v8, v6
+; GFX8-FLUSH-NEXT: v_mad_f32 v1, -v1, v7, v4
+; GFX8-FLUSH-NEXT: v_mad_f32 v3, -v3, v9, v4
+; GFX8-FLUSH-NEXT: v_mul_f32_e32 v1, v1, v5
+; GFX8-FLUSH-NEXT: v_mul_f32_e32 v3, v3, v6
; GFX8-FLUSH-NEXT: v_and_b32_e32 v1, 0xff800000, v1
; GFX8-FLUSH-NEXT: v_and_b32_e32 v3, 0xff800000, v3
-; GFX8-FLUSH-NEXT: v_add_f32_e32 v1, v1, v6
-; GFX8-FLUSH-NEXT: v_add_f32_e32 v3, v3, v7
+; GFX8-FLUSH-NEXT: v_add_f32_e32 v1, v1, v7
+; GFX8-FLUSH-NEXT: v_add_f32_e32 v3, v3, v9
; GFX8-FLUSH-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX8-FLUSH-NEXT: v_cvt_f16_f32_e32 v3, v3
; GFX8-FLUSH-NEXT: v_div_fixup_f16 v0, v1, v0, -1.0
@@ -1856,27 +1897,30 @@ define <2 x half> @v_neg_rcp_v2f16(<2 x half> %x) {
; GFX9-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v1, v0
; GFX9-IEEE-NEXT: v_lshrrev_b32_e32 v2, 16, v0
+; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v4, -1.0
; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v3, v2
-; GFX9-IEEE-NEXT: v_rcp_f32_e32 v4, v1
-; GFX9-IEEE-NEXT: v_rcp_f32_e32 v5, v3
-; GFX9-IEEE-NEXT: v_mul_f32_e32 v6, v1, v4
-; GFX9-IEEE-NEXT: v_add_f32_e32 v6, -1.0, v6
-; GFX9-IEEE-NEXT: v_mul_f32_e32 v6, v6, v4
-; GFX9-IEEE-NEXT: v_mul_f32_e32 v7, v3, v5
-; GFX9-IEEE-NEXT: v_sub_f32_e32 v6, v6, v4
-; GFX9-IEEE-NEXT: v_add_f32_e32 v7, -1.0, v7
-; GFX9-IEEE-NEXT: v_mul_f32_e64 v1, -v1, v6
-; GFX9-IEEE-NEXT: v_mul_f32_e32 v7, v7, v5
-; GFX9-IEEE-NEXT: v_add_f32_e32 v1, -1.0, v1
-; GFX9-IEEE-NEXT: v_mul_f32_e32 v1, v1, v4
-; GFX9-IEEE-NEXT: v_sub_f32_e32 v4, v7, v5
-; GFX9-IEEE-NEXT: v_mul_f32_e64 v3, -v3, v4
-; GFX9-IEEE-NEXT: v_add_f32_e32 v3, -1.0, v3
-; GFX9-IEEE-NEXT: v_mul_f32_e32 v3, v3, v5
+; GFX9-IEEE-NEXT: v_rcp_f32_e32 v5, v1
+; GFX9-IEEE-NEXT: v_rcp_f32_e32 v6, v3
+; GFX9-IEEE-NEXT: v_mul_f32_e32 v7, v4, v5
+; GFX9-IEEE-NEXT: v_mul_f32_e64 v8, -v1, v7
+; GFX9-IEEE-NEXT: v_add_f32_e32 v8, v8, v4
+; GFX9-IEEE-NEXT: v_mul_f32_e32 v8, v8, v5
+; GFX9-IEEE-NEXT: v_add_f32_e32 v7, v8, v7
+; GFX9-IEEE-NEXT: v_mul_f32_e32 v9, v4, v6
+; GFX9-IEEE-NEXT: v_mul_f32_e64 v1, -v1, v7
+; GFX9-IEEE-NEXT: v_mul_f32_e64 v10, -v3, v9
+; GFX9-IEEE-NEXT: v_add_f32_e32 v1, v1, v4
+; GFX9-IEEE-NEXT: v_mul_f32_e32 v1, v1, v5
+; GFX9-IEEE-NEXT: v_add_f32_e32 v5, v10, v4
+; GFX9-IEEE-NEXT: v_mul_f32_e32 v5, v5, v6
+; GFX9-IEEE-NEXT: v_add_f32_e32 v5, v5, v9
+; GFX9-IEEE-NEXT: v_mul_f32_e64 v3, -v3, v5
+; GFX9-IEEE-NEXT: v_add_f32_e32 v3, v3, v4
+; GFX9-IEEE-NEXT: v_mul_f32_e32 v3, v3, v6
; GFX9-IEEE-NEXT: v_and_b32_e32 v1, 0xff800000, v1
; GFX9-IEEE-NEXT: v_and_b32_e32 v3, 0xff800000, v3
-; GFX9-IEEE-NEXT: v_add_f32_e32 v1, v1, v6
-; GFX9-IEEE-NEXT: v_add_f32_e32 v3, v3, v4
+; GFX9-IEEE-NEXT: v_add_f32_e32 v1, v1, v7
+; GFX9-IEEE-NEXT: v_add_f32_e32 v3, v3, v5
; GFX9-IEEE-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX9-IEEE-NEXT: v_cvt_f16_f32_e32 v3, v3
; GFX9-IEEE-NEXT: v_div_fixup_f16 v0, v1, v0, -1.0
@@ -1887,24 +1931,26 @@ define <2 x half> @v_neg_rcp_v2f16(<2 x half> %x) {
; GFX9-FLUSH-LABEL: v_neg_rcp_v2f16:
; GFX9-FLUSH: ; %bb.0:
; GFX9-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-FLUSH-NEXT: v_lshrrev_b32_e32 v2, 16, v0
; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, v0
+; GFX9-FLUSH-NEXT: v_lshrrev_b32_e32 v2, 16, v0
; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v3, v2
-; GFX9-FLUSH-NEXT: v_mov_b32_e32 v4, -1.0
+; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v4, -1.0
; GFX9-FLUSH-NEXT: v_rcp_f32_e32 v1, v1
; GFX9-FLUSH-NEXT: v_rcp_f32_e32 v3, v3
-; GFX9-FLUSH-NEXT: v_mad_mix_f32 v5, v0, v1, v4 op_sel_hi:[1,0,0]
-; GFX9-FLUSH-NEXT: v_mad_mix_f32 v6, v0, v3, v4 op_sel:[1,0,0] op_sel_hi:[1,0,0]
-; GFX9-FLUSH-NEXT: v_mad_f32 v5, v5, v1, -v1
-; GFX9-FLUSH-NEXT: v_mad_f32 v6, v6, v3, -v3
-; GFX9-FLUSH-NEXT: v_mad_mix_f32 v7, -v0, v5, v4 op_sel_hi:[1,0,0]
-; GFX9-FLUSH-NEXT: v_mad_mix_f32 v4, -v0, v6, v4 op_sel:[1,0,0] op_sel_hi:[1,0,0]
+; GFX9-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v1
+; GFX9-FLUSH-NEXT: v_mad_mix_f32 v6, -v0, v5, -1.0 op_sel_hi:[1,0,1]
+; GFX9-FLUSH-NEXT: v_mac_f32_e32 v5, v6, v1
+; GFX9-FLUSH-NEXT: v_mul_f32_e32 v4, v4, v3
+; GFX9-FLUSH-NEXT: v_mad_mix_f32 v7, -v0, v5, -1.0 op_sel_hi:[1,0,1]
+; GFX9-FLUSH-NEXT: v_mad_mix_f32 v6, -v0, v4, -1.0 op_sel:[1,0,0] op_sel_hi:[1,0,1]
; GFX9-FLUSH-NEXT: v_mul_f32_e32 v1, v7, v1
-; GFX9-FLUSH-NEXT: v_mul_f32_e32 v3, v4, v3
; GFX9-FLUSH-NEXT: v_and_b32_e32 v1, 0xff800000, v1
-; GFX9-FLUSH-NEXT: v_and_b32_e32 v3, 0xff800000, v3
+; GFX9-FLUSH-NEXT: v_mac_f32_e32 v4, v6, v3
; GFX9-FLUSH-NEXT: v_add_f32_e32 v1, v1, v5
-; GFX9-FLUSH-NEXT: v_add_f32_e32 v3, v3, v6
+; GFX9-FLUSH-NEXT: v_mad_mix_f32 v5, -v0, v4, -1.0 op_sel:[1,0,0] op_sel_hi:[1,0,1]
+; GFX9-FLUSH-NEXT: v_mul_f32_e32 v3, v5, v3
+; GFX9-FLUSH-NEXT: v_and_b32_e32 v3, 0xff800000, v3
+; GFX9-FLUSH-NEXT: v_add_f32_e32 v3, v3, v4
; GFX9-FLUSH-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX9-FLUSH-NEXT: v_cvt_f16_f32_e32 v3, v3
; GFX9-FLUSH-NEXT: v_div_fixup_f16 v0, v1, v0, -1.0
@@ -1917,27 +1963,30 @@ define <2 x half> @v_neg_rcp_v2f16(<2 x half> %x) {
; GFX10-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-IEEE-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GFX10-IEEE-NEXT: v_cvt_f32_f16_e32 v2, v0
+; GFX10-IEEE-NEXT: v_cvt_f32_f16_e32 v6, -1.0
; GFX10-IEEE-NEXT: v_cvt_f32_f16_e32 v3, v1
; GFX10-IEEE-NEXT: v_rcp_f32_e32 v4, v2
; GFX10-IEEE-NEXT: v_rcp_f32_e32 v5, v3
-; GFX10-IEEE-NEXT: v_mul_f32_e32 v6, v2, v4
-; GFX10-IEEE-NEXT: v_mul_f32_e32 v7, v3, v5
-; GFX10-IEEE-NEXT: v_add_f32_e32 v6, -1.0, v6
-; GFX10-IEEE-NEXT: v_add_f32_e32 v7, -1.0, v7
-; GFX10-IEEE-NEXT: v_mul_f32_e32 v6, v6, v4
-; GFX10-IEEE-NEXT: v_mul_f32_e32 v7, v7, v5
-; GFX10-IEEE-NEXT: v_sub_f32_e32 v6, v6, v4
-; GFX10-IEEE-NEXT: v_sub_f32_e32 v7, v7, v5
-; GFX10-IEEE-NEXT: v_mul_f32_e64 v2, -v2, v6
-; GFX10-IEEE-NEXT: v_mul_f32_e64 v3, -v3, v7
-; GFX10-IEEE-NEXT: v_add_f32_e32 v2, -1.0, v2
-; GFX10-IEEE-NEXT: v_add_f32_e32 v3, -1.0, v3
+; GFX10-IEEE-NEXT: v_mul_f32_e32 v7, v6, v4
+; GFX10-IEEE-NEXT: v_mul_f32_e32 v8, v6, v5
+; GFX10-IEEE-NEXT: v_mul_f32_e64 v9, -v2, v7
+; GFX10-IEEE-NEXT: v_mul_f32_e64 v10, -v3, v8
+; GFX10-IEEE-NEXT: v_add_f32_e32 v9, v9, v6
+; GFX10-IEEE-NEXT: v_add_f32_e32 v10, v10, v6
+; GFX10-IEEE-NEXT: v_mul_f32_e32 v9, v9, v4
+; GFX10-IEEE-NEXT: v_mul_f32_e32 v10, v10, v5
+; GFX10-IEEE-NEXT: v_add_f32_e32 v7, v9, v7
+; GFX10-IEEE-NEXT: v_add_f32_e32 v8, v10, v8
+; GFX10-IEEE-NEXT: v_mul_f32_e64 v2, -v2, v7
+; GFX10-IEEE-NEXT: v_mul_f32_e64 v3, -v3, v8
+; GFX10-IEEE-NEXT: v_add_f32_e32 v2, v2, v6
+; GFX10-IEEE-NEXT: v_add_f32_e32 v3, v3, v6
; GFX10-IEEE-NEXT: v_mul_f32_e32 v2, v2, v4
; GFX10-IEEE-NEXT: v_mul_f32_e32 v3, v3, v5
; GFX10-IEEE-NEXT: v_and_b32_e32 v2, 0xff800000, v2
; GFX10-IEEE-NEXT: v_and_b32_e32 v3, 0xff800000, v3
-; GFX10-IEEE-NEXT: v_add_f32_e32 v2, v2, v6
-; GFX10-IEEE-NEXT: v_add_f32_e32 v3, v3, v7
+; GFX10-IEEE-NEXT: v_add_f32_e32 v2, v2, v7
+; GFX10-IEEE-NEXT: v_add_f32_e32 v3, v3, v8
; GFX10-IEEE-NEXT: v_cvt_f16_f32_e32 v2, v2
; GFX10-IEEE-NEXT: v_cvt_f16_f32_e32 v3, v3
; GFX10-IEEE-NEXT: v_div_fixup_f16 v0, v2, v0, -1.0
@@ -1950,21 +1999,24 @@ define <2 x half> @v_neg_rcp_v2f16(<2 x half> %x) {
; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-FLUSH-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GFX10-FLUSH-NEXT: v_cvt_f32_f16_e32 v2, v0
+; GFX10-FLUSH-NEXT: v_cvt_f32_f16_e32 v6, -1.0
; GFX10-FLUSH-NEXT: v_cvt_f32_f16_e32 v3, v1
; GFX10-FLUSH-NEXT: v_rcp_f32_e32 v4, v2
; GFX10-FLUSH-NEXT: v_rcp_f32_e32 v5, v3
-; GFX10-FLUSH-NEXT: v_mad_f32 v6, v2, v4, -1.0
-; GFX10-FLUSH-NEXT: v_mad_f32 v7, v3, v5, -1.0
-; GFX10-FLUSH-NEXT: v_mad_f32 v6, v6, v4, -v4
-; GFX10-FLUSH-NEXT: v_mad_f32 v7, v7, v5, -v5
-; GFX10-FLUSH-NEXT: v_mad_f32 v2, -v2, v6, -1.0
-; GFX10-FLUSH-NEXT: v_mad_f32 v3, -v3, v7, -1.0
+; GFX10-FLUSH-NEXT: v_mul_f32_e32 v7, v6, v4
+; GFX10-FLUSH-NEXT: v_mul_f32_e32 v8, v6, v5
+; GFX10-FLUSH-NEXT: v_mad_f32 v9, -v2, v7, v6
+; GFX10-FLUSH-NEXT: v_mad_f32 v10, -v3, v8, v6
+; GFX10-FLUSH-NEXT: v_mac_f32_e32 v7, v9, v4
+; GFX10-FLUSH-NEXT: v_mac_f32_e32 v8, v10, v5
+; GFX10-FLUSH-NEXT: v_mad_f32 v2, -v2, v7, v6
+; GFX10-FLUSH-NEXT: v_mad_f32 v3, -v3, v8, v6
; GFX10-FLUSH-NEXT: v_mul_f32_e32 v2, v2, v4
; GFX10-FLUSH-NEXT: v_mul_f32_e32 v3, v3, v5
; GFX10-FLUSH-NEXT: v_and_b32_e32 v2, 0xff800000, v2
; GFX10-FLUSH-NEXT: v_and_b32_e32 v3, 0xff800000, v3
-; GFX10-FLUSH-NEXT: v_add_f32_e32 v2, v2, v6
-; GFX10-FLUSH-NEXT: v_add_f32_e32 v3, v3, v7
+; GFX10-FLUSH-NEXT: v_add_f32_e32 v2, v2, v7
+; GFX10-FLUSH-NEXT: v_add_f32_e32 v3, v3, v8
; GFX10-FLUSH-NEXT: v_cvt_f16_f32_e32 v2, v2
; GFX10-FLUSH-NEXT: v_cvt_f16_f32_e32 v3, v3
; GFX10-FLUSH-NEXT: v_div_fixup_f16 v0, v2, v0, -1.0
@@ -1977,25 +2029,27 @@ define <2 x half> @v_neg_rcp_v2f16(<2 x half> %x) {
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GFX11-NEXT: v_cvt_f32_f16_e32 v2, v0
-; GFX11-NEXT: v_mov_b32_e32 v4, -1.0
+; GFX11-NEXT: v_cvt_f32_f16_e32 v4, -1.0
; GFX11-NEXT: v_cvt_f32_f16_e32 v3, v1
; GFX11-NEXT: v_rcp_f32_e32 v2, v2
; GFX11-NEXT: v_rcp_f32_e32 v3, v3
; GFX11-NEXT: s_waitcnt_depctr 0xfff
-; GFX11-NEXT: v_fma_mix_f32 v5, v0, v2, v4 op_sel_hi:[1,0,0]
-; GFX11-NEXT: v_fma_mix_f32 v6, v0, v3, v4 op_sel:[1,0,0] op_sel_hi:[1,0,0]
-; GFX11-NEXT: v_fma_f32 v5, v5, v2, -v2
-; GFX11-NEXT: v_fma_f32 v6, v6, v3, -v3
-; GFX11-NEXT: v_fma_mix_f32 v7, -v0, v5, v4 op_sel_hi:[1,0,0]
-; GFX11-NEXT: v_fma_mix_f32 v4, -v0, v6, v4 op_sel:[1,0,0] op_sel_hi:[1,0,0]
-; GFX11-NEXT: v_dual_mul_f32 v2, v7, v2 :: v_dual_mul_f32 v3, v4, v3
-; GFX11-NEXT: v_and_b32_e32 v2, 0xff800000, v2
-; GFX11-NEXT: v_dual_add_f32 v2, v2, v5 :: v_dual_and_b32 v3, 0xff800000, v3
-; GFX11-NEXT: v_add_f32_e32 v3, v3, v6
-; GFX11-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GFX11-NEXT: v_mul_f32_e32 v5, v4, v2
+; GFX11-NEXT: v_mul_f32_e32 v4, v4, v3
+; GFX11-NEXT: v_fma_mix_f32 v7, -v0, v4, -1.0 op_sel:[1,0,0] op_sel_hi:[1,0,1]
+; GFX11-NEXT: v_fmac_f32_e32 v4, v7, v3
+; GFX11-NEXT: v_fma_mix_f32 v6, -v0, v5, -1.0 op_sel_hi:[1,0,1]
+; GFX11-NEXT: v_fma_mix_f32 v7, -v0, v4, -1.0 op_sel:[1,0,0] op_sel_hi:[1,0,1]
+; GFX11-NEXT: v_fmac_f32_e32 v5, v6, v2
+; GFX11-NEXT: v_mul_f32_e32 v3, v7, v3
+; GFX11-NEXT: v_fma_mix_f32 v6, -v0, v5, -1.0 op_sel_hi:[1,0,1]
+; GFX11-NEXT: v_dual_mul_f32 v2, v6, v2 :: v_dual_and_b32 v3, 0xff800000, v3
+; GFX11-NEXT: v_dual_add_f32 v3, v3, v4 :: v_dual_and_b32 v2, 0xff800000, v2
; GFX11-NEXT: v_cvt_f16_f32_e32 v3, v3
-; GFX11-NEXT: v_div_fixup_f16 v0, v2, v0, -1.0
+; GFX11-NEXT: v_add_f32_e32 v2, v2, v5
; GFX11-NEXT: v_div_fixup_f16 v1, v3, v1, -1.0
+; GFX11-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GFX11-NEXT: v_div_fixup_f16 v0, v2, v0, -1.0
; GFX11-NEXT: v_pack_b32_f16 v0, v0, v1
; GFX11-NEXT: s_setpc_b64 s[30:31]
%fdiv = fdiv <2 x half> <half -1.0, half -1.0>, %x
@@ -2010,32 +2064,33 @@ define <2 x half> @v_rcp_v2f16_fabs(<2 x half> %x) {
; GFX6-IEEE-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX6-IEEE-NEXT: v_or_b32_e32 v0, v1, v0
; GFX6-IEEE-NEXT: v_and_b32_e32 v0, 0x7fff7fff, v0
-; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, v0
+; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, 1.0
+; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v2, v0
; GFX6-IEEE-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX6-IEEE-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, 1.0
-; GFX6-IEEE-NEXT: v_rcp_f32_e32 v3, v2
-; GFX6-IEEE-NEXT: v_div_scale_f32 v4, vcc, 1.0, v1, 1.0
-; GFX6-IEEE-NEXT: v_fma_f32 v5, -v2, v3, 1.0
-; GFX6-IEEE-NEXT: v_fma_f32 v3, v5, v3, v3
-; GFX6-IEEE-NEXT: v_mul_f32_e32 v5, v4, v3
-; GFX6-IEEE-NEXT: v_fma_f32 v6, -v2, v5, v4
-; GFX6-IEEE-NEXT: v_fma_f32 v5, v6, v3, v5
-; GFX6-IEEE-NEXT: v_fma_f32 v2, -v2, v5, v4
-; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v4, v0
-; GFX6-IEEE-NEXT: v_div_fmas_f32 v0, v2, v3, v5
-; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v0, v1, 1.0
+; GFX6-IEEE-NEXT: v_div_scale_f32 v3, s[4:5], v2, v2, v1
+; GFX6-IEEE-NEXT: v_rcp_f32_e32 v4, v3
+; GFX6-IEEE-NEXT: v_div_scale_f32 v5, vcc, v1, v2, v1
+; GFX6-IEEE-NEXT: v_fma_f32 v6, -v3, v4, 1.0
+; GFX6-IEEE-NEXT: v_fma_f32 v4, v6, v4, v4
+; GFX6-IEEE-NEXT: v_mul_f32_e32 v6, v5, v4
+; GFX6-IEEE-NEXT: v_fma_f32 v7, -v3, v6, v5
+; GFX6-IEEE-NEXT: v_fma_f32 v6, v7, v4, v6
+; GFX6-IEEE-NEXT: v_fma_f32 v3, -v3, v6, v5
+; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v5, v0
+; GFX6-IEEE-NEXT: v_div_fmas_f32 v0, v3, v4, v6
+; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v0, v2, v1
; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX6-IEEE-NEXT: v_div_scale_f32 v1, s[4:5], v4, v4, 1.0
-; GFX6-IEEE-NEXT: v_rcp_f32_e32 v2, v1
-; GFX6-IEEE-NEXT: v_div_scale_f32 v3, vcc, 1.0, v4, 1.0
-; GFX6-IEEE-NEXT: v_fma_f32 v5, -v1, v2, 1.0
-; GFX6-IEEE-NEXT: v_fma_f32 v2, v5, v2, v2
-; GFX6-IEEE-NEXT: v_mul_f32_e32 v5, v3, v2
-; GFX6-IEEE-NEXT: v_fma_f32 v6, -v1, v5, v3
-; GFX6-IEEE-NEXT: v_fma_f32 v5, v6, v2, v5
-; GFX6-IEEE-NEXT: v_fma_f32 v1, -v1, v5, v3
-; GFX6-IEEE-NEXT: v_div_fmas_f32 v1, v1, v2, v5
-; GFX6-IEEE-NEXT: v_div_fixup_f32 v1, v1, v4, 1.0
+; GFX6-IEEE-NEXT: v_div_scale_f32 v2, s[4:5], v5, v5, v1
+; GFX6-IEEE-NEXT: v_rcp_f32_e32 v3, v2
+; GFX6-IEEE-NEXT: v_div_scale_f32 v4, vcc, v1, v5, v1
+; GFX6-IEEE-NEXT: v_fma_f32 v6, -v2, v3, 1.0
+; GFX6-IEEE-NEXT: v_fma_f32 v3, v6, v3, v3
+; GFX6-IEEE-NEXT: v_mul_f32_e32 v6, v4, v3
+; GFX6-IEEE-NEXT: v_fma_f32 v7, -v2, v6, v4
+; GFX6-IEEE-NEXT: v_fma_f32 v6, v7, v3, v6
+; GFX6-IEEE-NEXT: v_fma_f32 v2, -v2, v6, v4
+; GFX6-IEEE-NEXT: v_div_fmas_f32 v2, v2, v3, v6
+; GFX6-IEEE-NEXT: v_div_fixup_f32 v1, v2, v5, v1
; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX6-IEEE-NEXT: s_setpc_b64 s[30:31]
;
@@ -2046,37 +2101,39 @@ define <2 x half> @v_rcp_v2f16_fabs(<2 x half> %x) {
; GFX6-FLUSH-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX6-FLUSH-NEXT: v_or_b32_e32 v0, v1, v0
; GFX6-FLUSH-NEXT: v_and_b32_e32 v0, 0x7fff7fff, v0
-; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, v0
+; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, 1.0
+; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v2, v0
; GFX6-FLUSH-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, 1.0
-; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v3, v2
-; GFX6-FLUSH-NEXT: v_div_scale_f32 v4, vcc, 1.0, v1, 1.0
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v3, s[4:5], v2, v2, v1
+; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v4, v3
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v5, vcc, v1, v2, v1
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
-; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v2, v3, 1.0
-; GFX6-FLUSH-NEXT: v_fma_f32 v3, v5, v3, v3
-; GFX6-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v3
-; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v5, v4
-; GFX6-FLUSH-NEXT: v_fma_f32 v5, v6, v3, v5
-; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v5, v4
+; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v3, v4, 1.0
+; GFX6-FLUSH-NEXT: v_fma_f32 v4, v6, v4, v4
+; GFX6-FLUSH-NEXT: v_mul_f32_e32 v6, v5, v4
+; GFX6-FLUSH-NEXT: v_fma_f32 v7, -v3, v6, v5
+; GFX6-FLUSH-NEXT: v_fma_f32 v6, v7, v4, v6
+; GFX6-FLUSH-NEXT: v_fma_f32 v3, -v3, v6, v5
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
-; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v5
-; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v3, v0
-; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v2, v1, 1.0
+; GFX6-FLUSH-NEXT: v_div_fmas_f32 v3, v3, v4, v6
+; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v4, 1.0
+; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v5, v0
+; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v3, v2, v1
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX6-FLUSH-NEXT: v_div_scale_f32 v1, s[4:5], v3, v3, 1.0
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v1, s[4:5], v5, v5, v4
; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v2, v1
-; GFX6-FLUSH-NEXT: v_div_scale_f32 v4, vcc, 1.0, v3, 1.0
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v3, vcc, v4, v5, v4
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
-; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v1, v2, 1.0
-; GFX6-FLUSH-NEXT: v_fma_f32 v2, v5, v2, v2
-; GFX6-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v2
-; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v1, v5, v4
-; GFX6-FLUSH-NEXT: v_fma_f32 v5, v6, v2, v5
-; GFX6-FLUSH-NEXT: v_fma_f32 v1, -v1, v5, v4
+; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v1, v2, 1.0
+; GFX6-FLUSH-NEXT: v_fma_f32 v2, v6, v2, v2
+; GFX6-FLUSH-NEXT: v_mul_f32_e32 v6, v3, v2
+; GFX6-FLUSH-NEXT: v_fma_f32 v7, -v1, v6, v3
+; GFX6-FLUSH-NEXT: v_fma_f32 v6, v7, v2, v6
+; GFX6-FLUSH-NEXT: v_fma_f32 v1, -v1, v6, v3
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
-; GFX6-FLUSH-NEXT: v_div_fmas_f32 v1, v1, v2, v5
-; GFX6-FLUSH-NEXT: v_div_fixup_f32 v1, v1, v3, 1.0
+; GFX6-FLUSH-NEXT: v_div_fmas_f32 v1, v1, v2, v6
+; GFX6-FLUSH-NEXT: v_div_fixup_f32 v1, v1, v5, v4
; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX6-FLUSH-NEXT: s_setpc_b64 s[30:31]
;
@@ -2086,27 +2143,30 @@ define <2 x half> @v_rcp_v2f16_fabs(<2 x half> %x) {
; GFX8-IEEE-NEXT: v_and_b32_e32 v0, 0x7fff7fff, v0
; GFX8-IEEE-NEXT: v_cvt_f32_f16_e32 v1, v0
; GFX8-IEEE-NEXT: v_lshrrev_b32_e32 v2, 16, v0
+; GFX8-IEEE-NEXT: v_cvt_f32_f16_e32 v4, 1.0
; GFX8-IEEE-NEXT: v_cvt_f32_f16_e32 v3, v2
-; GFX8-IEEE-NEXT: v_rcp_f32_e32 v4, v1
-; GFX8-IEEE-NEXT: v_rcp_f32_e32 v5, v3
-; GFX8-IEEE-NEXT: v_mul_f32_e64 v6, -v1, v4
-; GFX8-IEEE-NEXT: v_add_f32_e32 v6, 1.0, v6
-; GFX8-IEEE-NEXT: v_mul_f32_e32 v6, v6, v4
-; GFX8-IEEE-NEXT: v_mul_f32_e64 v7, -v3, v5
-; GFX8-IEEE-NEXT: v_add_f32_e32 v6, v6, v4
-; GFX8-IEEE-NEXT: v_add_f32_e32 v7, 1.0, v7
-; GFX8-IEEE-NEXT: v_mul_f32_e64 v1, -v1, v6
-; GFX8-IEEE-NEXT: v_mul_f32_e32 v7, v7, v5
-; GFX8-IEEE-NEXT: v_add_f32_e32 v1, 1.0, v1
-; GFX8-IEEE-NEXT: v_mul_f32_e32 v1, v1, v4
-; GFX8-IEEE-NEXT: v_add_f32_e32 v4, v7, v5
-; GFX8-IEEE-NEXT: v_mul_f32_e64 v3, -v3, v4
-; GFX8-IEEE-NEXT: v_add_f32_e32 v3, 1.0, v3
-; GFX8-IEEE-NEXT: v_mul_f32_e32 v3, v3, v5
+; GFX8-IEEE-NEXT: v_rcp_f32_e32 v5, v1
+; GFX8-IEEE-NEXT: v_rcp_f32_e32 v6, v3
+; GFX8-IEEE-NEXT: v_mul_f32_e32 v7, v4, v5
+; GFX8-IEEE-NEXT: v_mul_f32_e64 v8, -v1, v7
+; GFX8-IEEE-NEXT: v_add_f32_e32 v8, v8, v4
+; GFX8-IEEE-NEXT: v_mul_f32_e32 v8, v8, v5
+; GFX8-IEEE-NEXT: v_add_f32_e32 v7, v8, v7
+; GFX8-IEEE-NEXT: v_mul_f32_e32 v9, v4, v6
+; GFX8-IEEE-NEXT: v_mul_f32_e64 v1, -v1, v7
+; GFX8-IEEE-NEXT: v_mul_f32_e64 v10, -v3, v9
+; GFX8-IEEE-NEXT: v_add_f32_e32 v1, v1, v4
+; GFX8-IEEE-NEXT: v_mul_f32_e32 v1, v1, v5
+; GFX8-IEEE-NEXT: v_add_f32_e32 v5, v10, v4
+; GFX8-IEEE-NEXT: v_mul_f32_e32 v5, v5, v6
+; GFX8-IEEE-NEXT: v_add_f32_e32 v5, v5, v9
+; GFX8-IEEE-NEXT: v_mul_f32_e64 v3, -v3, v5
+; GFX8-IEEE-NEXT: v_add_f32_e32 v3, v3, v4
+; GFX8-IEEE-NEXT: v_mul_f32_e32 v3, v3, v6
; GFX8-IEEE-NEXT: v_and_b32_e32 v1, 0xff800000, v1
; GFX8-IEEE-NEXT: v_and_b32_e32 v3, 0xff800000, v3
-; GFX8-IEEE-NEXT: v_add_f32_e32 v1, v1, v6
-; GFX8-IEEE-NEXT: v_add_f32_e32 v3, v3, v4
+; GFX8-IEEE-NEXT: v_add_f32_e32 v1, v1, v7
+; GFX8-IEEE-NEXT: v_add_f32_e32 v3, v3, v5
; GFX8-IEEE-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX8-IEEE-NEXT: v_cvt_f16_f32_e32 v3, v3
; GFX8-IEEE-NEXT: v_div_fixup_f16 v0, v1, v0, 1.0
@@ -2119,23 +2179,26 @@ define <2 x half> @v_rcp_v2f16_fabs(<2 x half> %x) {
; GFX8-FLUSH: ; %bb.0:
; GFX8-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-FLUSH-NEXT: v_and_b32_e32 v0, 0x7fff7fff, v0
-; GFX8-FLUSH-NEXT: v_lshrrev_b32_e32 v2, 16, v0
; GFX8-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, v0
+; GFX8-FLUSH-NEXT: v_lshrrev_b32_e32 v2, 16, v0
; GFX8-FLUSH-NEXT: v_cvt_f32_f16_e32 v3, v2
-; GFX8-FLUSH-NEXT: v_rcp_f32_e32 v4, v1
-; GFX8-FLUSH-NEXT: v_rcp_f32_e32 v5, v3
-; GFX8-FLUSH-NEXT: v_mad_f32 v6, -v1, v4, 1.0
-; GFX8-FLUSH-NEXT: v_mad_f32 v7, -v3, v5, 1.0
-; GFX8-FLUSH-NEXT: v_mad_f32 v6, v6, v4, v4
-; GFX8-FLUSH-NEXT: v_mad_f32 v7, v7, v5, v5
-; GFX8-FLUSH-NEXT: v_mad_f32 v1, -v1, v6, 1.0
-; GFX8-FLUSH-NEXT: v_mad_f32 v3, -v3, v7, 1.0
-; GFX8-FLUSH-NEXT: v_mul_f32_e32 v1, v1, v4
-; GFX8-FLUSH-NEXT: v_mul_f32_e32 v3, v3, v5
+; GFX8-FLUSH-NEXT: v_cvt_f32_f16_e32 v4, 1.0
+; GFX8-FLUSH-NEXT: v_rcp_f32_e32 v5, v1
+; GFX8-FLUSH-NEXT: v_rcp_f32_e32 v6, v3
+; GFX8-FLUSH-NEXT: v_mul_f32_e32 v7, v4, v5
+; GFX8-FLUSH-NEXT: v_mad_f32 v8, -v1, v7, v4
+; GFX8-FLUSH-NEXT: v_mul_f32_e32 v9, v4, v6
+; GFX8-FLUSH-NEXT: v_mac_f32_e32 v7, v8, v5
+; GFX8-FLUSH-NEXT: v_mad_f32 v8, -v3, v9, v4
+; GFX8-FLUSH-NEXT: v_mac_f32_e32 v9, v8, v6
+; GFX8-FLUSH-NEXT: v_mad_f32 v1, -v1, v7, v4
+; GFX8-FLUSH-NEXT: v_mad_f32 v3, -v3, v9, v4
+; GFX8-FLUSH-NEXT: v_mul_f32_e32 v1, v1, v5
+; GFX8-FLUSH-NEXT: v_mul_f32_e32 v3, v3, v6
; GFX8-FLUSH-NEXT: v_and_b32_e32 v1, 0xff800000, v1
; GFX8-FLUSH-NEXT: v_and_b32_e32 v3, 0xff800000, v3
-; GFX8-FLUSH-NEXT: v_add_f32_e32 v1, v1, v6
-; GFX8-FLUSH-NEXT: v_add_f32_e32 v3, v3, v7
+; GFX8-FLUSH-NEXT: v_add_f32_e32 v1, v1, v7
+; GFX8-FLUSH-NEXT: v_add_f32_e32 v3, v3, v9
; GFX8-FLUSH-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX8-FLUSH-NEXT: v_cvt_f16_f32_e32 v3, v3
; GFX8-FLUSH-NEXT: v_div_fixup_f16 v0, v1, v0, 1.0
@@ -2150,27 +2213,30 @@ define <2 x half> @v_rcp_v2f16_fabs(<2 x half> %x) {
; GFX9-IEEE-NEXT: v_and_b32_e32 v0, 0x7fff7fff, v0
; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v1, v0
; GFX9-IEEE-NEXT: v_lshrrev_b32_e32 v2, 16, v0
+; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v4, 1.0
; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v3, v2
-; GFX9-IEEE-NEXT: v_rcp_f32_e32 v4, v1
-; GFX9-IEEE-NEXT: v_rcp_f32_e32 v5, v3
-; GFX9-IEEE-NEXT: v_mul_f32_e64 v6, -v1, v4
-; GFX9-IEEE-NEXT: v_add_f32_e32 v6, 1.0, v6
-; GFX9-IEEE-NEXT: v_mul_f32_e32 v6, v6, v4
-; GFX9-IEEE-NEXT: v_mul_f32_e64 v7, -v3, v5
-; GFX9-IEEE-NEXT: v_add_f32_e32 v6, v6, v4
-; GFX9-IEEE-NEXT: v_add_f32_e32 v7, 1.0, v7
-; GFX9-IEEE-NEXT: v_mul_f32_e64 v1, -v1, v6
-; GFX9-IEEE-NEXT: v_mul_f32_e32 v7, v7, v5
-; GFX9-IEEE-NEXT: v_add_f32_e32 v1, 1.0, v1
-; GFX9-IEEE-NEXT: v_mul_f32_e32 v1, v1, v4
-; GFX9-IEEE-NEXT: v_add_f32_e32 v4, v7, v5
-; GFX9-IEEE-NEXT: v_mul_f32_e64 v3, -v3, v4
-; GFX9-IEEE-NEXT: v_add_f32_e32 v3, 1.0, v3
-; GFX9-IEEE-NEXT: v_mul_f32_e32 v3, v3, v5
+; GFX9-IEEE-NEXT: v_rcp_f32_e32 v5, v1
+; GFX9-IEEE-NEXT: v_rcp_f32_e32 v6, v3
+; GFX9-IEEE-NEXT: v_mul_f32_e32 v7, v4, v5
+; GFX9-IEEE-NEXT: v_mul_f32_e64 v8, -v1, v7
+; GFX9-IEEE-NEXT: v_add_f32_e32 v8, v8, v4
+; GFX9-IEEE-NEXT: v_mul_f32_e32 v8, v8, v5
+; GFX9-IEEE-NEXT: v_add_f32_e32 v7, v8, v7
+; GFX9-IEEE-NEXT: v_mul_f32_e32 v9, v4, v6
+; GFX9-IEEE-NEXT: v_mul_f32_e64 v1, -v1, v7
+; GFX9-IEEE-NEXT: v_mul_f32_e64 v10, -v3, v9
+; GFX9-IEEE-NEXT: v_add_f32_e32 v1, v1, v4
+; GFX9-IEEE-NEXT: v_mul_f32_e32 v1, v1, v5
+; GFX9-IEEE-NEXT: v_add_f32_e32 v5, v10, v4
+; GFX9-IEEE-NEXT: v_mul_f32_e32 v5, v5, v6
+; GFX9-IEEE-NEXT: v_add_f32_e32 v5, v5, v9
+; GFX9-IEEE-NEXT: v_mul_f32_e64 v3, -v3, v5
+; GFX9-IEEE-NEXT: v_add_f32_e32 v3, v3, v4
+; GFX9-IEEE-NEXT: v_mul_f32_e32 v3, v3, v6
; GFX9-IEEE-NEXT: v_and_b32_e32 v1, 0xff800000, v1
; GFX9-IEEE-NEXT: v_and_b32_e32 v3, 0xff800000, v3
-; GFX9-IEEE-NEXT: v_add_f32_e32 v1, v1, v6
-; GFX9-IEEE-NEXT: v_add_f32_e32 v3, v3, v4
+; GFX9-IEEE-NEXT: v_add_f32_e32 v1, v1, v7
+; GFX9-IEEE-NEXT: v_add_f32_e32 v3, v3, v5
; GFX9-IEEE-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX9-IEEE-NEXT: v_cvt_f16_f32_e32 v3, v3
; GFX9-IEEE-NEXT: v_div_fixup_f16 v0, v1, v0, 1.0
@@ -2182,24 +2248,26 @@ define <2 x half> @v_rcp_v2f16_fabs(<2 x half> %x) {
; GFX9-FLUSH: ; %bb.0:
; GFX9-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-FLUSH-NEXT: v_and_b32_e32 v1, 0x7fff7fff, v0
-; GFX9-FLUSH-NEXT: v_lshrrev_b32_e32 v3, 16, v1
; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v2, v1
+; GFX9-FLUSH-NEXT: v_lshrrev_b32_e32 v3, 16, v1
; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v4, v3
-; GFX9-FLUSH-NEXT: v_mov_b32_e32 v5, 1.0
+; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v5, 1.0
; GFX9-FLUSH-NEXT: v_rcp_f32_e32 v2, v2
; GFX9-FLUSH-NEXT: v_rcp_f32_e32 v4, v4
-; GFX9-FLUSH-NEXT: v_mad_mix_f32 v6, -v1, v2, v5 op_sel_hi:[1,0,0]
-; GFX9-FLUSH-NEXT: v_mad_mix_f32 v7, -|v0|, v4, v5 op_sel:[1,0,0] op_sel_hi:[1,0,0]
-; GFX9-FLUSH-NEXT: v_mad_f32 v6, v6, v2, v2
-; GFX9-FLUSH-NEXT: v_mad_f32 v7, v7, v4, v4
-; GFX9-FLUSH-NEXT: v_mad_mix_f32 v8, -v1, v6, v5 op_sel_hi:[1,0,0]
-; GFX9-FLUSH-NEXT: v_mad_mix_f32 v0, -|v0|, v7, v5 op_sel:[1,0,0] op_sel_hi:[1,0,0]
+; GFX9-FLUSH-NEXT: v_mul_f32_e32 v6, v5, v2
+; GFX9-FLUSH-NEXT: v_mad_mix_f32 v7, -v1, v6, 1.0 op_sel_hi:[1,0,1]
+; GFX9-FLUSH-NEXT: v_mul_f32_e32 v5, v5, v4
+; GFX9-FLUSH-NEXT: v_mac_f32_e32 v6, v7, v2
+; GFX9-FLUSH-NEXT: v_mad_mix_f32 v7, -|v0|, v5, 1.0 op_sel:[1,0,0] op_sel_hi:[1,0,1]
+; GFX9-FLUSH-NEXT: v_mac_f32_e32 v5, v7, v4
+; GFX9-FLUSH-NEXT: v_mad_mix_f32 v8, -v1, v6, 1.0 op_sel_hi:[1,0,1]
+; GFX9-FLUSH-NEXT: v_mad_mix_f32 v0, -|v0|, v5, 1.0 op_sel:[1,0,0] op_sel_hi:[1,0,1]
; GFX9-FLUSH-NEXT: v_mul_f32_e32 v2, v8, v2
; GFX9-FLUSH-NEXT: v_mul_f32_e32 v0, v0, v4
; GFX9-FLUSH-NEXT: v_and_b32_e32 v2, 0xff800000, v2
; GFX9-FLUSH-NEXT: v_and_b32_e32 v0, 0xff800000, v0
; GFX9-FLUSH-NEXT: v_add_f32_e32 v2, v2, v6
-; GFX9-FLUSH-NEXT: v_add_f32_e32 v0, v0, v7
+; GFX9-FLUSH-NEXT: v_add_f32_e32 v0, v0, v5
; GFX9-FLUSH-NEXT: v_cvt_f16_f32_e32 v2, v2
; GFX9-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX9-FLUSH-NEXT: v_div_fixup_f16 v1, v2, v1, 1.0
@@ -2211,29 +2279,32 @@ define <2 x half> @v_rcp_v2f16_fabs(<2 x half> %x) {
; GFX10-IEEE: ; %bb.0:
; GFX10-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-IEEE-NEXT: v_and_b32_e32 v0, 0x7fff7fff, v0
+; GFX10-IEEE-NEXT: v_cvt_f32_f16_e32 v6, 1.0
; GFX10-IEEE-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GFX10-IEEE-NEXT: v_cvt_f32_f16_e32 v2, v0
; GFX10-IEEE-NEXT: v_cvt_f32_f16_e32 v3, v1
; GFX10-IEEE-NEXT: v_rcp_f32_e32 v4, v2
; GFX10-IEEE-NEXT: v_rcp_f32_e32 v5, v3
-; GFX10-IEEE-NEXT: v_mul_f32_e64 v6, -v2, v4
-; GFX10-IEEE-NEXT: v_mul_f32_e64 v7, -v3, v5
-; GFX10-IEEE-NEXT: v_add_f32_e32 v6, 1.0, v6
-; GFX10-IEEE-NEXT: v_add_f32_e32 v7, 1.0, v7
-; GFX10-IEEE-NEXT: v_mul_f32_e32 v6, v6, v4
-; GFX10-IEEE-NEXT: v_mul_f32_e32 v7, v7, v5
-; GFX10-IEEE-NEXT: v_add_f32_e32 v6, v6, v4
-; GFX10-IEEE-NEXT: v_add_f32_e32 v7, v7, v5
-; GFX10-IEEE-NEXT: v_mul_f32_e64 v2, -v2, v6
-; GFX10-IEEE-NEXT: v_mul_f32_e64 v3, -v3, v7
-; GFX10-IEEE-NEXT: v_add_f32_e32 v2, 1.0, v2
-; GFX10-IEEE-NEXT: v_add_f32_e32 v3, 1.0, v3
+; GFX10-IEEE-NEXT: v_mul_f32_e32 v7, v6, v4
+; GFX10-IEEE-NEXT: v_mul_f32_e32 v8, v6, v5
+; GFX10-IEEE-NEXT: v_mul_f32_e64 v9, -v2, v7
+; GFX10-IEEE-NEXT: v_mul_f32_e64 v10, -v3, v8
+; GFX10-IEEE-NEXT: v_add_f32_e32 v9, v9, v6
+; GFX10-IEEE-NEXT: v_add_f32_e32 v10, v10, v6
+; GFX10-IEEE-NEXT: v_mul_f32_e32 v9, v9, v4
+; GFX10-IEEE-NEXT: v_mul_f32_e32 v10, v10, v5
+; GFX10-IEEE-NEXT: v_add_f32_e32 v7, v9, v7
+; GFX10-IEEE-NEXT: v_add_f32_e32 v8, v10, v8
+; GFX10-IEEE-NEXT: v_mul_f32_e64 v2, -v2, v7
+; GFX10-IEEE-NEXT: v_mul_f32_e64 v3, -v3, v8
+; GFX10-IEEE-NEXT: v_add_f32_e32 v2, v2, v6
+; GFX10-IEEE-NEXT: v_add_f32_e32 v3, v3, v6
; GFX10-IEEE-NEXT: v_mul_f32_e32 v2, v2, v4
; GFX10-IEEE-NEXT: v_mul_f32_e32 v3, v3, v5
; GFX10-IEEE-NEXT: v_and_b32_e32 v2, 0xff800000, v2
; GFX10-IEEE-NEXT: v_and_b32_e32 v3, 0xff800000, v3
-; GFX10-IEEE-NEXT: v_add_f32_e32 v2, v2, v6
-; GFX10-IEEE-NEXT: v_add_f32_e32 v3, v3, v7
+; GFX10-IEEE-NEXT: v_add_f32_e32 v2, v2, v7
+; GFX10-IEEE-NEXT: v_add_f32_e32 v3, v3, v8
; GFX10-IEEE-NEXT: v_cvt_f16_f32_e32 v2, v2
; GFX10-IEEE-NEXT: v_cvt_f16_f32_e32 v3, v3
; GFX10-IEEE-NEXT: v_div_fixup_f16 v0, v2, v0, 1.0
@@ -2245,23 +2316,26 @@ define <2 x half> @v_rcp_v2f16_fabs(<2 x half> %x) {
; GFX10-FLUSH: ; %bb.0:
; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-FLUSH-NEXT: v_and_b32_e32 v0, 0x7fff7fff, v0
+; GFX10-FLUSH-NEXT: v_cvt_f32_f16_e32 v6, 1.0
; GFX10-FLUSH-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GFX10-FLUSH-NEXT: v_cvt_f32_f16_e32 v2, v0
; GFX10-FLUSH-NEXT: v_cvt_f32_f16_e32 v3, v1
; GFX10-FLUSH-NEXT: v_rcp_f32_e32 v4, v2
; GFX10-FLUSH-NEXT: v_rcp_f32_e32 v5, v3
-; GFX10-FLUSH-NEXT: v_mad_f32 v6, -v2, v4, 1.0
-; GFX10-FLUSH-NEXT: v_mad_f32 v7, -v3, v5, 1.0
-; GFX10-FLUSH-NEXT: v_mad_f32 v6, v6, v4, v4
-; GFX10-FLUSH-NEXT: v_mad_f32 v7, v7, v5, v5
-; GFX10-FLUSH-NEXT: v_mad_f32 v2, -v2, v6, 1.0
-; GFX10-FLUSH-NEXT: v_mad_f32 v3, -v3, v7, 1.0
+; GFX10-FLUSH-NEXT: v_mul_f32_e32 v7, v6, v4
+; GFX10-FLUSH-NEXT: v_mul_f32_e32 v8, v6, v5
+; GFX10-FLUSH-NEXT: v_mad_f32 v9, -v2, v7, v6
+; GFX10-FLUSH-NEXT: v_mad_f32 v10, -v3, v8, v6
+; GFX10-FLUSH-NEXT: v_mac_f32_e32 v7, v9, v4
+; GFX10-FLUSH-NEXT: v_mac_f32_e32 v8, v10, v5
+; GFX10-FLUSH-NEXT: v_mad_f32 v2, -v2, v7, v6
+; GFX10-FLUSH-NEXT: v_mad_f32 v3, -v3, v8, v6
; GFX10-FLUSH-NEXT: v_mul_f32_e32 v2, v2, v4
; GFX10-FLUSH-NEXT: v_mul_f32_e32 v3, v3, v5
; GFX10-FLUSH-NEXT: v_and_b32_e32 v2, 0xff800000, v2
; GFX10-FLUSH-NEXT: v_and_b32_e32 v3, 0xff800000, v3
-; GFX10-FLUSH-NEXT: v_add_f32_e32 v2, v2, v6
-; GFX10-FLUSH-NEXT: v_add_f32_e32 v3, v3, v7
+; GFX10-FLUSH-NEXT: v_add_f32_e32 v2, v2, v7
+; GFX10-FLUSH-NEXT: v_add_f32_e32 v3, v3, v8
; GFX10-FLUSH-NEXT: v_cvt_f16_f32_e32 v2, v2
; GFX10-FLUSH-NEXT: v_cvt_f16_f32_e32 v3, v3
; GFX10-FLUSH-NEXT: v_div_fixup_f16 v0, v2, v0, 1.0
@@ -2272,30 +2346,30 @@ define <2 x half> @v_rcp_v2f16_fabs(<2 x half> %x) {
; GFX11-LABEL: v_rcp_v2f16_fabs:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_cvt_f32_f16_e32 v5, 1.0
; GFX11-NEXT: v_and_b32_e32 v1, 0x7fff7fff, v0
-; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v1
+; GFX11-NEXT: v_cvt_f32_f16_e32 v3, v1
+; GFX11-NEXT: v_rcp_f32_e32 v3, v3
+; GFX11-NEXT: s_waitcnt_depctr 0xfff
+; GFX11-NEXT: v_mul_f32_e32 v6, v5, v3
+; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v1
+; GFX11-NEXT: v_fma_mix_f32 v7, -v1, v6, 1.0 op_sel_hi:[1,0,1]
; GFX11-NEXT: v_cvt_f32_f16_e32 v4, v2
+; GFX11-NEXT: v_fmac_f32_e32 v6, v7, v3
; GFX11-NEXT: v_rcp_f32_e32 v4, v4
-; GFX11-NEXT: v_mov_b32_e32 v5, 1.0
+; GFX11-NEXT: v_fma_mix_f32 v7, -v1, v6, 1.0 op_sel_hi:[1,0,1]
; GFX11-NEXT: s_waitcnt_depctr 0xfff
-; GFX11-NEXT: v_fma_mix_f32 v7, -|v0|, v4, v5 op_sel:[1,0,0] op_sel_hi:[1,0,0]
-; GFX11-NEXT: v_fma_f32 v7, v7, v4, v4
-; GFX11-NEXT: v_fma_mix_f32 v0, -|v0|, v7, v5 op_sel:[1,0,0] op_sel_hi:[1,0,0]
-; GFX11-NEXT: v_mul_f32_e32 v0, v0, v4
+; GFX11-NEXT: v_mul_f32_e32 v5, v5, v4
+; GFX11-NEXT: v_fma_mix_f32 v8, -|v0|, v5, 1.0 op_sel:[1,0,0] op_sel_hi:[1,0,1]
+; GFX11-NEXT: v_fmac_f32_e32 v5, v8, v4
+; GFX11-NEXT: v_fma_mix_f32 v0, -|v0|, v5, 1.0 op_sel:[1,0,0] op_sel_hi:[1,0,1]
+; GFX11-NEXT: v_dual_mul_f32 v3, v7, v3 :: v_dual_mul_f32 v0, v0, v4
; GFX11-NEXT: v_and_b32_e32 v0, 0xff800000, v0
-; GFX11-NEXT: v_add_f32_e32 v0, v0, v7
-; GFX11-NEXT: v_cvt_f32_f16_e32 v3, v1
-; GFX11-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX11-NEXT: v_rcp_f32_e32 v3, v3
-; GFX11-NEXT: v_div_fixup_f16 v0, v0, v2, 1.0
-; GFX11-NEXT: s_waitcnt_depctr 0xfff
-; GFX11-NEXT: v_fma_mix_f32 v6, -v1, v3, v5 op_sel_hi:[1,0,0]
-; GFX11-NEXT: v_fma_f32 v6, v6, v3, v3
-; GFX11-NEXT: v_fma_mix_f32 v8, -v1, v6, v5 op_sel_hi:[1,0,0]
-; GFX11-NEXT: v_mul_f32_e32 v3, v8, v3
-; GFX11-NEXT: v_and_b32_e32 v3, 0xff800000, v3
+; GFX11-NEXT: v_dual_add_f32 v0, v0, v5 :: v_dual_and_b32 v3, 0xff800000, v3
; GFX11-NEXT: v_add_f32_e32 v3, v3, v6
+; GFX11-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX11-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GFX11-NEXT: v_div_fixup_f16 v0, v0, v2, 1.0
; GFX11-NEXT: v_div_fixup_f16 v1, v3, v1, 1.0
; GFX11-NEXT: v_pack_b32_f16 v0, v1, v0
; GFX11-NEXT: s_setpc_b64 s[30:31]
@@ -2312,32 +2386,33 @@ define <2 x half> @v_neg_rcp_v2f16_fabs(<2 x half> %x) {
; GFX6-IEEE-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX6-IEEE-NEXT: v_or_b32_e32 v0, v1, v0
; GFX6-IEEE-NEXT: v_and_b32_e32 v0, 0x7fff7fff, v0
-; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, v0
+; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, -1.0
+; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v2, v0
; GFX6-IEEE-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX6-IEEE-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, -1.0
-; GFX6-IEEE-NEXT: v_rcp_f32_e32 v3, v2
-; GFX6-IEEE-NEXT: v_div_scale_f32 v4, vcc, -1.0, v1, -1.0
-; GFX6-IEEE-NEXT: v_fma_f32 v5, -v2, v3, 1.0
-; GFX6-IEEE-NEXT: v_fma_f32 v3, v5, v3, v3
-; GFX6-IEEE-NEXT: v_mul_f32_e32 v5, v4, v3
-; GFX6-IEEE-NEXT: v_fma_f32 v6, -v2, v5, v4
-; GFX6-IEEE-NEXT: v_fma_f32 v5, v6, v3, v5
-; GFX6-IEEE-NEXT: v_fma_f32 v2, -v2, v5, v4
-; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v4, v0
-; GFX6-IEEE-NEXT: v_div_fmas_f32 v0, v2, v3, v5
-; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v0, v1, -1.0
+; GFX6-IEEE-NEXT: v_div_scale_f32 v3, s[4:5], v2, v2, v1
+; GFX6-IEEE-NEXT: v_rcp_f32_e32 v4, v3
+; GFX6-IEEE-NEXT: v_div_scale_f32 v5, vcc, v1, v2, v1
+; GFX6-IEEE-NEXT: v_fma_f32 v6, -v3, v4, 1.0
+; GFX6-IEEE-NEXT: v_fma_f32 v4, v6, v4, v4
+; GFX6-IEEE-NEXT: v_mul_f32_e32 v6, v5, v4
+; GFX6-IEEE-NEXT: v_fma_f32 v7, -v3, v6, v5
+; GFX6-IEEE-NEXT: v_fma_f32 v6, v7, v4, v6
+; GFX6-IEEE-NEXT: v_fma_f32 v3, -v3, v6, v5
+; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v5, v0
+; GFX6-IEEE-NEXT: v_div_fmas_f32 v0, v3, v4, v6
+; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v0, v2, v1
; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX6-IEEE-NEXT: v_div_scale_f32 v1, s[4:5], v4, v4, -1.0
-; GFX6-IEEE-NEXT: v_rcp_f32_e32 v2, v1
-; GFX6-IEEE-NEXT: v_div_scale_f32 v3, vcc, -1.0, v4, -1.0
-; GFX6-IEEE-NEXT: v_fma_f32 v5, -v1, v2, 1.0
-; GFX6-IEEE-NEXT: v_fma_f32 v2, v5, v2, v2
-; GFX6-IEEE-NEXT: v_mul_f32_e32 v5, v3, v2
-; GFX6-IEEE-NEXT: v_fma_f32 v6, -v1, v5, v3
-; GFX6-IEEE-NEXT: v_fma_f32 v5, v6, v2, v5
-; GFX6-IEEE-NEXT: v_fma_f32 v1, -v1, v5, v3
-; GFX6-IEEE-NEXT: v_div_fmas_f32 v1, v1, v2, v5
-; GFX6-IEEE-NEXT: v_div_fixup_f32 v1, v1, v4, -1.0
+; GFX6-IEEE-NEXT: v_div_scale_f32 v2, s[4:5], v5, v5, v1
+; GFX6-IEEE-NEXT: v_rcp_f32_e32 v3, v2
+; GFX6-IEEE-NEXT: v_div_scale_f32 v4, vcc, v1, v5, v1
+; GFX6-IEEE-NEXT: v_fma_f32 v6, -v2, v3, 1.0
+; GFX6-IEEE-NEXT: v_fma_f32 v3, v6, v3, v3
+; GFX6-IEEE-NEXT: v_mul_f32_e32 v6, v4, v3
+; GFX6-IEEE-NEXT: v_fma_f32 v7, -v2, v6, v4
+; GFX6-IEEE-NEXT: v_fma_f32 v6, v7, v3, v6
+; GFX6-IEEE-NEXT: v_fma_f32 v2, -v2, v6, v4
+; GFX6-IEEE-NEXT: v_div_fmas_f32 v2, v2, v3, v6
+; GFX6-IEEE-NEXT: v_div_fixup_f32 v1, v2, v5, v1
; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX6-IEEE-NEXT: s_setpc_b64 s[30:31]
;
@@ -2348,37 +2423,39 @@ define <2 x half> @v_neg_rcp_v2f16_fabs(<2 x half> %x) {
; GFX6-FLUSH-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX6-FLUSH-NEXT: v_or_b32_e32 v0, v1, v0
; GFX6-FLUSH-NEXT: v_and_b32_e32 v0, 0x7fff7fff, v0
-; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, v0
+; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, -1.0
+; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v2, v0
; GFX6-FLUSH-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, -1.0
-; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v3, v2
-; GFX6-FLUSH-NEXT: v_div_scale_f32 v4, vcc, -1.0, v1, -1.0
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v3, s[4:5], v2, v2, v1
+; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v4, v3
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v5, vcc, v1, v2, v1
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
-; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v2, v3, 1.0
-; GFX6-FLUSH-NEXT: v_fma_f32 v3, v5, v3, v3
-; GFX6-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v3
-; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v5, v4
-; GFX6-FLUSH-NEXT: v_fma_f32 v5, v6, v3, v5
-; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v5, v4
+; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v3, v4, 1.0
+; GFX6-FLUSH-NEXT: v_fma_f32 v4, v6, v4, v4
+; GFX6-FLUSH-NEXT: v_mul_f32_e32 v6, v5, v4
+; GFX6-FLUSH-NEXT: v_fma_f32 v7, -v3, v6, v5
+; GFX6-FLUSH-NEXT: v_fma_f32 v6, v7, v4, v6
+; GFX6-FLUSH-NEXT: v_fma_f32 v3, -v3, v6, v5
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
-; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v5
-; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v3, v0
-; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v2, v1, -1.0
+; GFX6-FLUSH-NEXT: v_div_fmas_f32 v3, v3, v4, v6
+; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v4, -1.0
+; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v5, v0
+; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v3, v2, v1
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX6-FLUSH-NEXT: v_div_scale_f32 v1, s[4:5], v3, v3, -1.0
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v1, s[4:5], v5, v5, v4
; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v2, v1
-; GFX6-FLUSH-NEXT: v_div_scale_f32 v4, vcc, -1.0, v3, -1.0
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v3, vcc, v4, v5, v4
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
-; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v1, v2, 1.0
-; GFX6-FLUSH-NEXT: v_fma_f32 v2, v5, v2, v2
-; GFX6-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v2
-; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v1, v5, v4
-; GFX6-FLUSH-NEXT: v_fma_f32 v5, v6, v2, v5
-; GFX6-FLUSH-NEXT: v_fma_f32 v1, -v1, v5, v4
+; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v1, v2, 1.0
+; GFX6-FLUSH-NEXT: v_fma_f32 v2, v6, v2, v2
+; GFX6-FLUSH-NEXT: v_mul_f32_e32 v6, v3, v2
+; GFX6-FLUSH-NEXT: v_fma_f32 v7, -v1, v6, v3
+; GFX6-FLUSH-NEXT: v_fma_f32 v6, v7, v2, v6
+; GFX6-FLUSH-NEXT: v_fma_f32 v1, -v1, v6, v3
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
-; GFX6-FLUSH-NEXT: v_div_fmas_f32 v1, v1, v2, v5
-; GFX6-FLUSH-NEXT: v_div_fixup_f32 v1, v1, v3, -1.0
+; GFX6-FLUSH-NEXT: v_div_fmas_f32 v1, v1, v2, v6
+; GFX6-FLUSH-NEXT: v_div_fixup_f32 v1, v1, v5, v4
; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX6-FLUSH-NEXT: s_setpc_b64 s[30:31]
;
@@ -2388,27 +2465,30 @@ define <2 x half> @v_neg_rcp_v2f16_fabs(<2 x half> %x) {
; GFX8-IEEE-NEXT: v_and_b32_e32 v0, 0x7fff7fff, v0
; GFX8-IEEE-NEXT: v_cvt_f32_f16_e32 v1, v0
; GFX8-IEEE-NEXT: v_lshrrev_b32_e32 v2, 16, v0
+; GFX8-IEEE-NEXT: v_cvt_f32_f16_e32 v4, -1.0
; GFX8-IEEE-NEXT: v_cvt_f32_f16_e32 v3, v2
-; GFX8-IEEE-NEXT: v_rcp_f32_e32 v4, v1
-; GFX8-IEEE-NEXT: v_rcp_f32_e32 v5, v3
-; GFX8-IEEE-NEXT: v_mul_f32_e32 v6, v1, v4
-; GFX8-IEEE-NEXT: v_add_f32_e32 v6, -1.0, v6
-; GFX8-IEEE-NEXT: v_mul_f32_e32 v6, v6, v4
-; GFX8-IEEE-NEXT: v_mul_f32_e32 v7, v3, v5
-; GFX8-IEEE-NEXT: v_sub_f32_e32 v6, v6, v4
-; GFX8-IEEE-NEXT: v_add_f32_e32 v7, -1.0, v7
-; GFX8-IEEE-NEXT: v_mul_f32_e64 v1, -v1, v6
-; GFX8-IEEE-NEXT: v_mul_f32_e32 v7, v7, v5
-; GFX8-IEEE-NEXT: v_add_f32_e32 v1, -1.0, v1
-; GFX8-IEEE-NEXT: v_mul_f32_e32 v1, v1, v4
-; GFX8-IEEE-NEXT: v_sub_f32_e32 v4, v7, v5
-; GFX8-IEEE-NEXT: v_mul_f32_e64 v3, -v3, v4
-; GFX8-IEEE-NEXT: v_add_f32_e32 v3, -1.0, v3
-; GFX8-IEEE-NEXT: v_mul_f32_e32 v3, v3, v5
+; GFX8-IEEE-NEXT: v_rcp_f32_e32 v5, v1
+; GFX8-IEEE-NEXT: v_rcp_f32_e32 v6, v3
+; GFX8-IEEE-NEXT: v_mul_f32_e32 v7, v4, v5
+; GFX8-IEEE-NEXT: v_mul_f32_e64 v8, -v1, v7
+; GFX8-IEEE-NEXT: v_add_f32_e32 v8, v8, v4
+; GFX8-IEEE-NEXT: v_mul_f32_e32 v8, v8, v5
+; GFX8-IEEE-NEXT: v_add_f32_e32 v7, v8, v7
+; GFX8-IEEE-NEXT: v_mul_f32_e32 v9, v4, v6
+; GFX8-IEEE-NEXT: v_mul_f32_e64 v1, -v1, v7
+; GFX8-IEEE-NEXT: v_mul_f32_e64 v10, -v3, v9
+; GFX8-IEEE-NEXT: v_add_f32_e32 v1, v1, v4
+; GFX8-IEEE-NEXT: v_mul_f32_e32 v1, v1, v5
+; GFX8-IEEE-NEXT: v_add_f32_e32 v5, v10, v4
+; GFX8-IEEE-NEXT: v_mul_f32_e32 v5, v5, v6
+; GFX8-IEEE-NEXT: v_add_f32_e32 v5, v5, v9
+; GFX8-IEEE-NEXT: v_mul_f32_e64 v3, -v3, v5
+; GFX8-IEEE-NEXT: v_add_f32_e32 v3, v3, v4
+; GFX8-IEEE-NEXT: v_mul_f32_e32 v3, v3, v6
; GFX8-IEEE-NEXT: v_and_b32_e32 v1, 0xff800000, v1
; GFX8-IEEE-NEXT: v_and_b32_e32 v3, 0xff800000, v3
-; GFX8-IEEE-NEXT: v_add_f32_e32 v1, v1, v6
-; GFX8-IEEE-NEXT: v_add_f32_e32 v3, v3, v4
+; GFX8-IEEE-NEXT: v_add_f32_e32 v1, v1, v7
+; GFX8-IEEE-NEXT: v_add_f32_e32 v3, v3, v5
; GFX8-IEEE-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX8-IEEE-NEXT: v_cvt_f16_f32_e32 v3, v3
; GFX8-IEEE-NEXT: v_div_fixup_f16 v0, v1, v0, -1.0
@@ -2421,23 +2501,26 @@ define <2 x half> @v_neg_rcp_v2f16_fabs(<2 x half> %x) {
; GFX8-FLUSH: ; %bb.0:
; GFX8-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-FLUSH-NEXT: v_and_b32_e32 v0, 0x7fff7fff, v0
-; GFX8-FLUSH-NEXT: v_lshrrev_b32_e32 v2, 16, v0
; GFX8-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, v0
+; GFX8-FLUSH-NEXT: v_lshrrev_b32_e32 v2, 16, v0
; GFX8-FLUSH-NEXT: v_cvt_f32_f16_e32 v3, v2
-; GFX8-FLUSH-NEXT: v_rcp_f32_e32 v4, v1
-; GFX8-FLUSH-NEXT: v_rcp_f32_e32 v5, v3
-; GFX8-FLUSH-NEXT: v_mad_f32 v6, v1, v4, -1.0
-; GFX8-FLUSH-NEXT: v_mad_f32 v7, v3, v5, -1.0
-; GFX8-FLUSH-NEXT: v_mad_f32 v6, v6, v4, -v4
-; GFX8-FLUSH-NEXT: v_mad_f32 v7, v7, v5, -v5
-; GFX8-FLUSH-NEXT: v_mad_f32 v1, -v1, v6, -1.0
-; GFX8-FLUSH-NEXT: v_mad_f32 v3, -v3, v7, -1.0
-; GFX8-FLUSH-NEXT: v_mul_f32_e32 v1, v1, v4
-; GFX8-FLUSH-NEXT: v_mul_f32_e32 v3, v3, v5
+; GFX8-FLUSH-NEXT: v_cvt_f32_f16_e32 v4, -1.0
+; GFX8-FLUSH-NEXT: v_rcp_f32_e32 v5, v1
+; GFX8-FLUSH-NEXT: v_rcp_f32_e32 v6, v3
+; GFX8-FLUSH-NEXT: v_mul_f32_e32 v7, v4, v5
+; GFX8-FLUSH-NEXT: v_mad_f32 v8, -v1, v7, v4
+; GFX8-FLUSH-NEXT: v_mul_f32_e32 v9, v4, v6
+; GFX8-FLUSH-NEXT: v_mac_f32_e32 v7, v8, v5
+; GFX8-FLUSH-NEXT: v_mad_f32 v8, -v3, v9, v4
+; GFX8-FLUSH-NEXT: v_mac_f32_e32 v9, v8, v6
+; GFX8-FLUSH-NEXT: v_mad_f32 v1, -v1, v7, v4
+; GFX8-FLUSH-NEXT: v_mad_f32 v3, -v3, v9, v4
+; GFX8-FLUSH-NEXT: v_mul_f32_e32 v1, v1, v5
+; GFX8-FLUSH-NEXT: v_mul_f32_e32 v3, v3, v6
; GFX8-FLUSH-NEXT: v_and_b32_e32 v1, 0xff800000, v1
; GFX8-FLUSH-NEXT: v_and_b32_e32 v3, 0xff800000, v3
-; GFX8-FLUSH-NEXT: v_add_f32_e32 v1, v1, v6
-; GFX8-FLUSH-NEXT: v_add_f32_e32 v3, v3, v7
+; GFX8-FLUSH-NEXT: v_add_f32_e32 v1, v1, v7
+; GFX8-FLUSH-NEXT: v_add_f32_e32 v3, v3, v9
; GFX8-FLUSH-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX8-FLUSH-NEXT: v_cvt_f16_f32_e32 v3, v3
; GFX8-FLUSH-NEXT: v_div_fixup_f16 v0, v1, v0, -1.0
@@ -2452,27 +2535,30 @@ define <2 x half> @v_neg_rcp_v2f16_fabs(<2 x half> %x) {
; GFX9-IEEE-NEXT: v_and_b32_e32 v0, 0x7fff7fff, v0
; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v1, v0
; GFX9-IEEE-NEXT: v_lshrrev_b32_e32 v2, 16, v0
+; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v4, -1.0
; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v3, v2
-; GFX9-IEEE-NEXT: v_rcp_f32_e32 v4, v1
-; GFX9-IEEE-NEXT: v_rcp_f32_e32 v5, v3
-; GFX9-IEEE-NEXT: v_mul_f32_e32 v6, v1, v4
-; GFX9-IEEE-NEXT: v_add_f32_e32 v6, -1.0, v6
-; GFX9-IEEE-NEXT: v_mul_f32_e32 v6, v6, v4
-; GFX9-IEEE-NEXT: v_mul_f32_e32 v7, v3, v5
-; GFX9-IEEE-NEXT: v_sub_f32_e32 v6, v6, v4
-; GFX9-IEEE-NEXT: v_add_f32_e32 v7, -1.0, v7
-; GFX9-IEEE-NEXT: v_mul_f32_e64 v1, -v1, v6
-; GFX9-IEEE-NEXT: v_mul_f32_e32 v7, v7, v5
-; GFX9-IEEE-NEXT: v_add_f32_e32 v1, -1.0, v1
-; GFX9-IEEE-NEXT: v_mul_f32_e32 v1, v1, v4
-; GFX9-IEEE-NEXT: v_sub_f32_e32 v4, v7, v5
-; GFX9-IEEE-NEXT: v_mul_f32_e64 v3, -v3, v4
-; GFX9-IEEE-NEXT: v_add_f32_e32 v3, -1.0, v3
-; GFX9-IEEE-NEXT: v_mul_f32_e32 v3, v3, v5
+; GFX9-IEEE-NEXT: v_rcp_f32_e32 v5, v1
+; GFX9-IEEE-NEXT: v_rcp_f32_e32 v6, v3
+; GFX9-IEEE-NEXT: v_mul_f32_e32 v7, v4, v5
+; GFX9-IEEE-NEXT: v_mul_f32_e64 v8, -v1, v7
+; GFX9-IEEE-NEXT: v_add_f32_e32 v8, v8, v4
+; GFX9-IEEE-NEXT: v_mul_f32_e32 v8, v8, v5
+; GFX9-IEEE-NEXT: v_add_f32_e32 v7, v8, v7
+; GFX9-IEEE-NEXT: v_mul_f32_e32 v9, v4, v6
+; GFX9-IEEE-NEXT: v_mul_f32_e64 v1, -v1, v7
+; GFX9-IEEE-NEXT: v_mul_f32_e64 v10, -v3, v9
+; GFX9-IEEE-NEXT: v_add_f32_e32 v1, v1, v4
+; GFX9-IEEE-NEXT: v_mul_f32_e32 v1, v1, v5
+; GFX9-IEEE-NEXT: v_add_f32_e32 v5, v10, v4
+; GFX9-IEEE-NEXT: v_mul_f32_e32 v5, v5, v6
+; GFX9-IEEE-NEXT: v_add_f32_e32 v5, v5, v9
+; GFX9-IEEE-NEXT: v_mul_f32_e64 v3, -v3, v5
+; GFX9-IEEE-NEXT: v_add_f32_e32 v3, v3, v4
+; GFX9-IEEE-NEXT: v_mul_f32_e32 v3, v3, v6
; GFX9-IEEE-NEXT: v_and_b32_e32 v1, 0xff800000, v1
; GFX9-IEEE-NEXT: v_and_b32_e32 v3, 0xff800000, v3
-; GFX9-IEEE-NEXT: v_add_f32_e32 v1, v1, v6
-; GFX9-IEEE-NEXT: v_add_f32_e32 v3, v3, v4
+; GFX9-IEEE-NEXT: v_add_f32_e32 v1, v1, v7
+; GFX9-IEEE-NEXT: v_add_f32_e32 v3, v3, v5
; GFX9-IEEE-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX9-IEEE-NEXT: v_cvt_f16_f32_e32 v3, v3
; GFX9-IEEE-NEXT: v_div_fixup_f16 v0, v1, v0, -1.0
@@ -2484,24 +2570,26 @@ define <2 x half> @v_neg_rcp_v2f16_fabs(<2 x half> %x) {
; GFX9-FLUSH: ; %bb.0:
; GFX9-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-FLUSH-NEXT: v_and_b32_e32 v1, 0x7fff7fff, v0
-; GFX9-FLUSH-NEXT: v_lshrrev_b32_e32 v3, 16, v1
; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v2, v1
+; GFX9-FLUSH-NEXT: v_lshrrev_b32_e32 v3, 16, v1
; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v4, v3
-; GFX9-FLUSH-NEXT: v_mov_b32_e32 v5, -1.0
+; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v5, -1.0
; GFX9-FLUSH-NEXT: v_rcp_f32_e32 v2, v2
; GFX9-FLUSH-NEXT: v_rcp_f32_e32 v4, v4
-; GFX9-FLUSH-NEXT: v_mad_mix_f32 v6, v1, v2, v5 op_sel_hi:[1,0,0]
-; GFX9-FLUSH-NEXT: v_mad_mix_f32 v7, |v0|, v4, v5 op_sel:[1,0,0] op_sel_hi:[1,0,0]
-; GFX9-FLUSH-NEXT: v_mad_f32 v6, v6, v2, -v2
-; GFX9-FLUSH-NEXT: v_mad_f32 v7, v7, v4, -v4
-; GFX9-FLUSH-NEXT: v_mad_mix_f32 v8, -v1, v6, v5 op_sel_hi:[1,0,0]
-; GFX9-FLUSH-NEXT: v_mad_mix_f32 v0, -|v0|, v7, v5 op_sel:[1,0,0] op_sel_hi:[1,0,0]
+; GFX9-FLUSH-NEXT: v_mul_f32_e32 v6, v5, v2
+; GFX9-FLUSH-NEXT: v_mad_mix_f32 v7, -v1, v6, -1.0 op_sel_hi:[1,0,1]
+; GFX9-FLUSH-NEXT: v_mul_f32_e32 v5, v5, v4
+; GFX9-FLUSH-NEXT: v_mac_f32_e32 v6, v7, v2
+; GFX9-FLUSH-NEXT: v_mad_mix_f32 v7, -|v0|, v5, -1.0 op_sel:[1,0,0] op_sel_hi:[1,0,1]
+; GFX9-FLUSH-NEXT: v_mac_f32_e32 v5, v7, v4
+; GFX9-FLUSH-NEXT: v_mad_mix_f32 v8, -v1, v6, -1.0 op_sel_hi:[1,0,1]
+; GFX9-FLUSH-NEXT: v_mad_mix_f32 v0, -|v0|, v5, -1.0 op_sel:[1,0,0] op_sel_hi:[1,0,1]
; GFX9-FLUSH-NEXT: v_mul_f32_e32 v2, v8, v2
; GFX9-FLUSH-NEXT: v_mul_f32_e32 v0, v0, v4
; GFX9-FLUSH-NEXT: v_and_b32_e32 v2, 0xff800000, v2
; GFX9-FLUSH-NEXT: v_and_b32_e32 v0, 0xff800000, v0
; GFX9-FLUSH-NEXT: v_add_f32_e32 v2, v2, v6
-; GFX9-FLUSH-NEXT: v_add_f32_e32 v0, v0, v7
+; GFX9-FLUSH-NEXT: v_add_f32_e32 v0, v0, v5
; GFX9-FLUSH-NEXT: v_cvt_f16_f32_e32 v2, v2
; GFX9-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX9-FLUSH-NEXT: v_div_fixup_f16 v1, v2, v1, -1.0
@@ -2513,29 +2601,32 @@ define <2 x half> @v_neg_rcp_v2f16_fabs(<2 x half> %x) {
; GFX10-IEEE: ; %bb.0:
; GFX10-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-IEEE-NEXT: v_and_b32_e32 v0, 0x7fff7fff, v0
+; GFX10-IEEE-NEXT: v_cvt_f32_f16_e32 v6, -1.0
; GFX10-IEEE-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GFX10-IEEE-NEXT: v_cvt_f32_f16_e32 v2, v0
; GFX10-IEEE-NEXT: v_cvt_f32_f16_e32 v3, v1
; GFX10-IEEE-NEXT: v_rcp_f32_e32 v4, v2
; GFX10-IEEE-NEXT: v_rcp_f32_e32 v5, v3
-; GFX10-IEEE-NEXT: v_mul_f32_e32 v6, v2, v4
-; GFX10-IEEE-NEXT: v_mul_f32_e32 v7, v3, v5
-; GFX10-IEEE-NEXT: v_add_f32_e32 v6, -1.0, v6
-; GFX10-IEEE-NEXT: v_add_f32_e32 v7, -1.0, v7
-; GFX10-IEEE-NEXT: v_mul_f32_e32 v6, v6, v4
-; GFX10-IEEE-NEXT: v_mul_f32_e32 v7, v7, v5
-; GFX10-IEEE-NEXT: v_sub_f32_e32 v6, v6, v4
-; GFX10-IEEE-NEXT: v_sub_f32_e32 v7, v7, v5
-; GFX10-IEEE-NEXT: v_mul_f32_e64 v2, -v2, v6
-; GFX10-IEEE-NEXT: v_mul_f32_e64 v3, -v3, v7
-; GFX10-IEEE-NEXT: v_add_f32_e32 v2, -1.0, v2
-; GFX10-IEEE-NEXT: v_add_f32_e32 v3, -1.0, v3
+; GFX10-IEEE-NEXT: v_mul_f32_e32 v7, v6, v4
+; GFX10-IEEE-NEXT: v_mul_f32_e32 v8, v6, v5
+; GFX10-IEEE-NEXT: v_mul_f32_e64 v9, -v2, v7
+; GFX10-IEEE-NEXT: v_mul_f32_e64 v10, -v3, v8
+; GFX10-IEEE-NEXT: v_add_f32_e32 v9, v9, v6
+; GFX10-IEEE-NEXT: v_add_f32_e32 v10, v10, v6
+; GFX10-IEEE-NEXT: v_mul_f32_e32 v9, v9, v4
+; GFX10-IEEE-NEXT: v_mul_f32_e32 v10, v10, v5
+; GFX10-IEEE-NEXT: v_add_f32_e32 v7, v9, v7
+; GFX10-IEEE-NEXT: v_add_f32_e32 v8, v10, v8
+; GFX10-IEEE-NEXT: v_mul_f32_e64 v2, -v2, v7
+; GFX10-IEEE-NEXT: v_mul_f32_e64 v3, -v3, v8
+; GFX10-IEEE-NEXT: v_add_f32_e32 v2, v2, v6
+; GFX10-IEEE-NEXT: v_add_f32_e32 v3, v3, v6
; GFX10-IEEE-NEXT: v_mul_f32_e32 v2, v2, v4
; GFX10-IEEE-NEXT: v_mul_f32_e32 v3, v3, v5
; GFX10-IEEE-NEXT: v_and_b32_e32 v2, 0xff800000, v2
; GFX10-IEEE-NEXT: v_and_b32_e32 v3, 0xff800000, v3
-; GFX10-IEEE-NEXT: v_add_f32_e32 v2, v2, v6
-; GFX10-IEEE-NEXT: v_add_f32_e32 v3, v3, v7
+; GFX10-IEEE-NEXT: v_add_f32_e32 v2, v2, v7
+; GFX10-IEEE-NEXT: v_add_f32_e32 v3, v3, v8
; GFX10-IEEE-NEXT: v_cvt_f16_f32_e32 v2, v2
; GFX10-IEEE-NEXT: v_cvt_f16_f32_e32 v3, v3
; GFX10-IEEE-NEXT: v_div_fixup_f16 v0, v2, v0, -1.0
@@ -2547,23 +2638,26 @@ define <2 x half> @v_neg_rcp_v2f16_fabs(<2 x half> %x) {
; GFX10-FLUSH: ; %bb.0:
; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-FLUSH-NEXT: v_and_b32_e32 v0, 0x7fff7fff, v0
+; GFX10-FLUSH-NEXT: v_cvt_f32_f16_e32 v6, -1.0
; GFX10-FLUSH-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GFX10-FLUSH-NEXT: v_cvt_f32_f16_e32 v2, v0
; GFX10-FLUSH-NEXT: v_cvt_f32_f16_e32 v3, v1
; GFX10-FLUSH-NEXT: v_rcp_f32_e32 v4, v2
; GFX10-FLUSH-NEXT: v_rcp_f32_e32 v5, v3
-; GFX10-FLUSH-NEXT: v_mad_f32 v6, v2, v4, -1.0
-; GFX10-FLUSH-NEXT: v_mad_f32 v7, v3, v5, -1.0
-; GFX10-FLUSH-NEXT: v_mad_f32 v6, v6, v4, -v4
-; GFX10-FLUSH-NEXT: v_mad_f32 v7, v7, v5, -v5
-; GFX10-FLUSH-NEXT: v_mad_f32 v2, -v2, v6, -1.0
-; GFX10-FLUSH-NEXT: v_mad_f32 v3, -v3, v7, -1.0
+; GFX10-FLUSH-NEXT: v_mul_f32_e32 v7, v6, v4
+; GFX10-FLUSH-NEXT: v_mul_f32_e32 v8, v6, v5
+; GFX10-FLUSH-NEXT: v_mad_f32 v9, -v2, v7, v6
+; GFX10-FLUSH-NEXT: v_mad_f32 v10, -v3, v8, v6
+; GFX10-FLUSH-NEXT: v_mac_f32_e32 v7, v9, v4
+; GFX10-FLUSH-NEXT: v_mac_f32_e32 v8, v10, v5
+; GFX10-FLUSH-NEXT: v_mad_f32 v2, -v2, v7, v6
+; GFX10-FLUSH-NEXT: v_mad_f32 v3, -v3, v8, v6
; GFX10-FLUSH-NEXT: v_mul_f32_e32 v2, v2, v4
; GFX10-FLUSH-NEXT: v_mul_f32_e32 v3, v3, v5
; GFX10-FLUSH-NEXT: v_and_b32_e32 v2, 0xff800000, v2
; GFX10-FLUSH-NEXT: v_and_b32_e32 v3, 0xff800000, v3
-; GFX10-FLUSH-NEXT: v_add_f32_e32 v2, v2, v6
-; GFX10-FLUSH-NEXT: v_add_f32_e32 v3, v3, v7
+; GFX10-FLUSH-NEXT: v_add_f32_e32 v2, v2, v7
+; GFX10-FLUSH-NEXT: v_add_f32_e32 v3, v3, v8
; GFX10-FLUSH-NEXT: v_cvt_f16_f32_e32 v2, v2
; GFX10-FLUSH-NEXT: v_cvt_f16_f32_e32 v3, v3
; GFX10-FLUSH-NEXT: v_div_fixup_f16 v0, v2, v0, -1.0
@@ -2574,30 +2668,30 @@ define <2 x half> @v_neg_rcp_v2f16_fabs(<2 x half> %x) {
; GFX11-LABEL: v_neg_rcp_v2f16_fabs:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_cvt_f32_f16_e32 v5, -1.0
; GFX11-NEXT: v_and_b32_e32 v1, 0x7fff7fff, v0
+; GFX11-NEXT: v_cvt_f32_f16_e32 v3, v1
+; GFX11-NEXT: v_rcp_f32_e32 v3, v3
+; GFX11-NEXT: s_waitcnt_depctr 0xfff
+; GFX11-NEXT: v_mul_f32_e32 v6, v5, v3
; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v1
+; GFX11-NEXT: v_fma_mix_f32 v7, -v1, v6, -1.0 op_sel_hi:[1,0,1]
; GFX11-NEXT: v_cvt_f32_f16_e32 v4, v2
+; GFX11-NEXT: v_fmac_f32_e32 v6, v7, v3
; GFX11-NEXT: v_rcp_f32_e32 v4, v4
-; GFX11-NEXT: v_mov_b32_e32 v5, -1.0
+; GFX11-NEXT: v_fma_mix_f32 v7, -v1, v6, -1.0 op_sel_hi:[1,0,1]
; GFX11-NEXT: s_waitcnt_depctr 0xfff
-; GFX11-NEXT: v_fma_mix_f32 v7, |v0|, v4, v5 op_sel:[1,0,0] op_sel_hi:[1,0,0]
-; GFX11-NEXT: v_fma_f32 v7, v7, v4, -v4
-; GFX11-NEXT: v_fma_mix_f32 v0, -|v0|, v7, v5 op_sel:[1,0,0] op_sel_hi:[1,0,0]
-; GFX11-NEXT: v_mul_f32_e32 v0, v0, v4
+; GFX11-NEXT: v_mul_f32_e32 v5, v5, v4
+; GFX11-NEXT: v_fma_mix_f32 v8, -|v0|, v5, -1.0 op_sel:[1,0,0] op_sel_hi:[1,0,1]
+; GFX11-NEXT: v_fmac_f32_e32 v5, v8, v4
+; GFX11-NEXT: v_fma_mix_f32 v0, -|v0|, v5, -1.0 op_sel:[1,0,0] op_sel_hi:[1,0,1]
+; GFX11-NEXT: v_dual_mul_f32 v3, v7, v3 :: v_dual_mul_f32 v0, v0, v4
; GFX11-NEXT: v_and_b32_e32 v0, 0xff800000, v0
-; GFX11-NEXT: v_add_f32_e32 v0, v0, v7
-; GFX11-NEXT: v_cvt_f32_f16_e32 v3, v1
-; GFX11-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX11-NEXT: v_rcp_f32_e32 v3, v3
-; GFX11-NEXT: v_div_fixup_f16 v0, v0, v2, -1.0
-; GFX11-NEXT: s_waitcnt_depctr 0xfff
-; GFX11-NEXT: v_fma_mix_f32 v6, v1, v3, v5 op_sel_hi:[1,0,0]
-; GFX11-NEXT: v_fma_f32 v6, v6, v3, -v3
-; GFX11-NEXT: v_fma_mix_f32 v8, -v1, v6, v5 op_sel_hi:[1,0,0]
-; GFX11-NEXT: v_mul_f32_e32 v3, v8, v3
-; GFX11-NEXT: v_and_b32_e32 v3, 0xff800000, v3
+; GFX11-NEXT: v_dual_add_f32 v0, v0, v5 :: v_dual_and_b32 v3, 0xff800000, v3
; GFX11-NEXT: v_add_f32_e32 v3, v3, v6
+; GFX11-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX11-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GFX11-NEXT: v_div_fixup_f16 v0, v0, v2, -1.0
; GFX11-NEXT: v_div_fixup_f16 v1, v3, v1, -1.0
; GFX11-NEXT: v_pack_b32_f16 v0, v1, v0
; GFX11-NEXT: s_setpc_b64 s[30:31]
@@ -2610,67 +2704,70 @@ define <2 x half> @v_rcp_v2f16_arcp(<2 x half> %x) {
; GFX6-IEEE-LABEL: v_rcp_v2f16_arcp:
; GFX6-IEEE: ; %bb.0:
; GFX6-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v2, 1.0
; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, v1
-; GFX6-IEEE-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, 1.0
-; GFX6-IEEE-NEXT: v_rcp_f32_e32 v3, v2
-; GFX6-IEEE-NEXT: v_div_scale_f32 v4, vcc, 1.0, v0, 1.0
-; GFX6-IEEE-NEXT: v_fma_f32 v5, -v2, v3, 1.0
-; GFX6-IEEE-NEXT: v_fma_f32 v3, v5, v3, v3
-; GFX6-IEEE-NEXT: v_mul_f32_e32 v5, v4, v3
-; GFX6-IEEE-NEXT: v_fma_f32 v6, -v2, v5, v4
-; GFX6-IEEE-NEXT: v_fma_f32 v5, v6, v3, v5
-; GFX6-IEEE-NEXT: v_fma_f32 v2, -v2, v5, v4
-; GFX6-IEEE-NEXT: v_div_fmas_f32 v2, v2, v3, v5
-; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v2, v0, 1.0
-; GFX6-IEEE-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, 1.0
-; GFX6-IEEE-NEXT: v_rcp_f32_e32 v3, v2
-; GFX6-IEEE-NEXT: v_div_scale_f32 v4, vcc, 1.0, v1, 1.0
+; GFX6-IEEE-NEXT: v_div_scale_f32 v3, s[4:5], v0, v0, v2
+; GFX6-IEEE-NEXT: v_rcp_f32_e32 v4, v3
+; GFX6-IEEE-NEXT: v_div_scale_f32 v5, vcc, v2, v0, v2
+; GFX6-IEEE-NEXT: v_fma_f32 v6, -v3, v4, 1.0
+; GFX6-IEEE-NEXT: v_fma_f32 v4, v6, v4, v4
+; GFX6-IEEE-NEXT: v_mul_f32_e32 v6, v5, v4
+; GFX6-IEEE-NEXT: v_fma_f32 v7, -v3, v6, v5
+; GFX6-IEEE-NEXT: v_fma_f32 v6, v7, v4, v6
+; GFX6-IEEE-NEXT: v_fma_f32 v3, -v3, v6, v5
+; GFX6-IEEE-NEXT: v_div_fmas_f32 v3, v3, v4, v6
+; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v3, v0, v2
+; GFX6-IEEE-NEXT: v_div_scale_f32 v3, s[4:5], v1, v1, v2
+; GFX6-IEEE-NEXT: v_rcp_f32_e32 v4, v3
+; GFX6-IEEE-NEXT: v_div_scale_f32 v5, vcc, v2, v1, v2
; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX6-IEEE-NEXT: v_fma_f32 v5, -v2, v3, 1.0
-; GFX6-IEEE-NEXT: v_fma_f32 v3, v5, v3, v3
-; GFX6-IEEE-NEXT: v_mul_f32_e32 v5, v4, v3
-; GFX6-IEEE-NEXT: v_fma_f32 v6, -v2, v5, v4
-; GFX6-IEEE-NEXT: v_fma_f32 v5, v6, v3, v5
-; GFX6-IEEE-NEXT: v_fma_f32 v2, -v2, v5, v4
-; GFX6-IEEE-NEXT: v_div_fmas_f32 v2, v2, v3, v5
-; GFX6-IEEE-NEXT: v_div_fixup_f32 v1, v2, v1, 1.0
+; GFX6-IEEE-NEXT: v_fma_f32 v6, -v3, v4, 1.0
+; GFX6-IEEE-NEXT: v_fma_f32 v4, v6, v4, v4
+; GFX6-IEEE-NEXT: v_mul_f32_e32 v6, v5, v4
+; GFX6-IEEE-NEXT: v_fma_f32 v7, -v3, v6, v5
+; GFX6-IEEE-NEXT: v_fma_f32 v6, v7, v4, v6
+; GFX6-IEEE-NEXT: v_fma_f32 v3, -v3, v6, v5
+; GFX6-IEEE-NEXT: v_div_fmas_f32 v3, v3, v4, v6
+; GFX6-IEEE-NEXT: v_div_fixup_f32 v1, v3, v1, v2
; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX6-IEEE-NEXT: s_setpc_b64 s[30:31]
;
; GFX6-FLUSH-LABEL: v_rcp_v2f16_arcp:
; GFX6-FLUSH: ; %bb.0:
; GFX6-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v2, 1.0
; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, 1.0
-; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v3, v2
-; GFX6-FLUSH-NEXT: v_div_scale_f32 v4, vcc, 1.0, v0, 1.0
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v3, s[4:5], v0, v0, v2
+; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v4, v3
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v5, vcc, v2, v0, v2
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
-; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v2, v3, 1.0
-; GFX6-FLUSH-NEXT: v_fma_f32 v3, v5, v3, v3
-; GFX6-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v3
-; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v5, v4
-; GFX6-FLUSH-NEXT: v_fma_f32 v5, v6, v3, v5
-; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v5, v4
+; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v3, v4, 1.0
+; GFX6-FLUSH-NEXT: v_fma_f32 v4, v6, v4, v4
+; GFX6-FLUSH-NEXT: v_mul_f32_e32 v6, v5, v4
+; GFX6-FLUSH-NEXT: v_fma_f32 v7, -v3, v6, v5
+; GFX6-FLUSH-NEXT: v_fma_f32 v6, v7, v4, v6
+; GFX6-FLUSH-NEXT: v_fma_f32 v3, -v3, v6, v5
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
+; GFX6-FLUSH-NEXT: v_div_fmas_f32 v3, v3, v4, v6
+; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v4, 1.0
; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, v1
-; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v5
-; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v2, v0, 1.0
+; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v3, v0, v2
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, 1.0
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, v4
; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v3, v2
-; GFX6-FLUSH-NEXT: v_div_scale_f32 v4, vcc, 1.0, v1, 1.0
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v5, vcc, v4, v1, v4
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
-; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v2, v3, 1.0
-; GFX6-FLUSH-NEXT: v_fma_f32 v3, v5, v3, v3
-; GFX6-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v3
-; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v5, v4
-; GFX6-FLUSH-NEXT: v_fma_f32 v5, v6, v3, v5
-; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v5, v4
+; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v3, 1.0
+; GFX6-FLUSH-NEXT: v_fma_f32 v3, v6, v3, v3
+; GFX6-FLUSH-NEXT: v_mul_f32_e32 v6, v5, v3
+; GFX6-FLUSH-NEXT: v_fma_f32 v7, -v2, v6, v5
+; GFX6-FLUSH-NEXT: v_fma_f32 v6, v7, v3, v6
+; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v6, v5
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
-; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v5
-; GFX6-FLUSH-NEXT: v_div_fixup_f32 v1, v2, v1, 1.0
+; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v6
+; GFX6-FLUSH-NEXT: v_div_fixup_f32 v1, v2, v1, v4
; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX6-FLUSH-NEXT: s_setpc_b64 s[30:31]
;
@@ -2717,8 +2814,11 @@ define <2 x half> @v_rcp_v2f16_arcp_afn(<2 x half> %x) {
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1
+; GFX6-NEXT: v_cvt_f32_f16_e32 v2, 1.0
; GFX6-NEXT: v_rcp_f32_e32 v0, v0
; GFX6-NEXT: v_rcp_f32_e32 v1, v1
+; GFX6-NEXT: v_mul_f32_e32 v0, v2, v0
+; GFX6-NEXT: v_mul_f32_e32 v1, v2, v1
; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX6-NEXT: s_setpc_b64 s[30:31]
@@ -2764,67 +2864,70 @@ define <2 x half> @v_rcp_v2f16_ulp25(<2 x half> %x) {
; GFX6-IEEE-LABEL: v_rcp_v2f16_ulp25:
; GFX6-IEEE: ; %bb.0:
; GFX6-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v2, 1.0
; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, v1
-; GFX6-IEEE-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, 1.0
-; GFX6-IEEE-NEXT: v_rcp_f32_e32 v3, v2
-; GFX6-IEEE-NEXT: v_div_scale_f32 v4, vcc, 1.0, v0, 1.0
-; GFX6-IEEE-NEXT: v_fma_f32 v5, -v2, v3, 1.0
-; GFX6-IEEE-NEXT: v_fma_f32 v3, v5, v3, v3
-; GFX6-IEEE-NEXT: v_mul_f32_e32 v5, v4, v3
-; GFX6-IEEE-NEXT: v_fma_f32 v6, -v2, v5, v4
-; GFX6-IEEE-NEXT: v_fma_f32 v5, v6, v3, v5
-; GFX6-IEEE-NEXT: v_fma_f32 v2, -v2, v5, v4
-; GFX6-IEEE-NEXT: v_div_fmas_f32 v2, v2, v3, v5
-; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v2, v0, 1.0
-; GFX6-IEEE-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, 1.0
-; GFX6-IEEE-NEXT: v_rcp_f32_e32 v3, v2
-; GFX6-IEEE-NEXT: v_div_scale_f32 v4, vcc, 1.0, v1, 1.0
+; GFX6-IEEE-NEXT: v_div_scale_f32 v3, s[4:5], v0, v0, v2
+; GFX6-IEEE-NEXT: v_rcp_f32_e32 v4, v3
+; GFX6-IEEE-NEXT: v_div_scale_f32 v5, vcc, v2, v0, v2
+; GFX6-IEEE-NEXT: v_fma_f32 v6, -v3, v4, 1.0
+; GFX6-IEEE-NEXT: v_fma_f32 v4, v6, v4, v4
+; GFX6-IEEE-NEXT: v_mul_f32_e32 v6, v5, v4
+; GFX6-IEEE-NEXT: v_fma_f32 v7, -v3, v6, v5
+; GFX6-IEEE-NEXT: v_fma_f32 v6, v7, v4, v6
+; GFX6-IEEE-NEXT: v_fma_f32 v3, -v3, v6, v5
+; GFX6-IEEE-NEXT: v_div_fmas_f32 v3, v3, v4, v6
+; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v3, v0, v2
+; GFX6-IEEE-NEXT: v_div_scale_f32 v3, s[4:5], v1, v1, v2
+; GFX6-IEEE-NEXT: v_rcp_f32_e32 v4, v3
+; GFX6-IEEE-NEXT: v_div_scale_f32 v5, vcc, v2, v1, v2
; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX6-IEEE-NEXT: v_fma_f32 v5, -v2, v3, 1.0
-; GFX6-IEEE-NEXT: v_fma_f32 v3, v5, v3, v3
-; GFX6-IEEE-NEXT: v_mul_f32_e32 v5, v4, v3
-; GFX6-IEEE-NEXT: v_fma_f32 v6, -v2, v5, v4
-; GFX6-IEEE-NEXT: v_fma_f32 v5, v6, v3, v5
-; GFX6-IEEE-NEXT: v_fma_f32 v2, -v2, v5, v4
-; GFX6-IEEE-NEXT: v_div_fmas_f32 v2, v2, v3, v5
-; GFX6-IEEE-NEXT: v_div_fixup_f32 v1, v2, v1, 1.0
+; GFX6-IEEE-NEXT: v_fma_f32 v6, -v3, v4, 1.0
+; GFX6-IEEE-NEXT: v_fma_f32 v4, v6, v4, v4
+; GFX6-IEEE-NEXT: v_mul_f32_e32 v6, v5, v4
+; GFX6-IEEE-NEXT: v_fma_f32 v7, -v3, v6, v5
+; GFX6-IEEE-NEXT: v_fma_f32 v6, v7, v4, v6
+; GFX6-IEEE-NEXT: v_fma_f32 v3, -v3, v6, v5
+; GFX6-IEEE-NEXT: v_div_fmas_f32 v3, v3, v4, v6
+; GFX6-IEEE-NEXT: v_div_fixup_f32 v1, v3, v1, v2
; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX6-IEEE-NEXT: s_setpc_b64 s[30:31]
;
; GFX6-FLUSH-LABEL: v_rcp_v2f16_ulp25:
; GFX6-FLUSH: ; %bb.0:
; GFX6-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v2, 1.0
; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, 1.0
-; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v3, v2
-; GFX6-FLUSH-NEXT: v_div_scale_f32 v4, vcc, 1.0, v0, 1.0
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v3, s[4:5], v0, v0, v2
+; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v4, v3
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v5, vcc, v2, v0, v2
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
-; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v2, v3, 1.0
-; GFX6-FLUSH-NEXT: v_fma_f32 v3, v5, v3, v3
-; GFX6-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v3
-; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v5, v4
-; GFX6-FLUSH-NEXT: v_fma_f32 v5, v6, v3, v5
-; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v5, v4
+; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v3, v4, 1.0
+; GFX6-FLUSH-NEXT: v_fma_f32 v4, v6, v4, v4
+; GFX6-FLUSH-NEXT: v_mul_f32_e32 v6, v5, v4
+; GFX6-FLUSH-NEXT: v_fma_f32 v7, -v3, v6, v5
+; GFX6-FLUSH-NEXT: v_fma_f32 v6, v7, v4, v6
+; GFX6-FLUSH-NEXT: v_fma_f32 v3, -v3, v6, v5
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
+; GFX6-FLUSH-NEXT: v_div_fmas_f32 v3, v3, v4, v6
+; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v4, 1.0
; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, v1
-; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v5
-; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v2, v0, 1.0
+; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v3, v0, v2
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, 1.0
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, v4
; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v3, v2
-; GFX6-FLUSH-NEXT: v_div_scale_f32 v4, vcc, 1.0, v1, 1.0
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v5, vcc, v4, v1, v4
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
-; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v2, v3, 1.0
-; GFX6-FLUSH-NEXT: v_fma_f32 v3, v5, v3, v3
-; GFX6-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v3
-; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v5, v4
-; GFX6-FLUSH-NEXT: v_fma_f32 v5, v6, v3, v5
-; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v5, v4
+; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v3, 1.0
+; GFX6-FLUSH-NEXT: v_fma_f32 v3, v6, v3, v3
+; GFX6-FLUSH-NEXT: v_mul_f32_e32 v6, v5, v3
+; GFX6-FLUSH-NEXT: v_fma_f32 v7, -v2, v6, v5
+; GFX6-FLUSH-NEXT: v_fma_f32 v6, v7, v3, v6
+; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v6, v5
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
-; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v5
-; GFX6-FLUSH-NEXT: v_div_fixup_f32 v1, v2, v1, 1.0
+; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v6
+; GFX6-FLUSH-NEXT: v_div_fixup_f32 v1, v2, v1, v4
; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX6-FLUSH-NEXT: s_setpc_b64 s[30:31]
;
@@ -2833,27 +2936,30 @@ define <2 x half> @v_rcp_v2f16_ulp25(<2 x half> %x) {
; GFX8-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-IEEE-NEXT: v_cvt_f32_f16_e32 v1, v0
; GFX8-IEEE-NEXT: v_lshrrev_b32_e32 v2, 16, v0
+; GFX8-IEEE-NEXT: v_cvt_f32_f16_e32 v4, 1.0
; GFX8-IEEE-NEXT: v_cvt_f32_f16_e32 v3, v2
-; GFX8-IEEE-NEXT: v_rcp_f32_e32 v4, v1
-; GFX8-IEEE-NEXT: v_rcp_f32_e32 v5, v3
-; GFX8-IEEE-NEXT: v_mul_f32_e64 v6, -v1, v4
-; GFX8-IEEE-NEXT: v_add_f32_e32 v6, 1.0, v6
-; GFX8-IEEE-NEXT: v_mul_f32_e32 v6, v6, v4
-; GFX8-IEEE-NEXT: v_mul_f32_e64 v7, -v3, v5
-; GFX8-IEEE-NEXT: v_add_f32_e32 v6, v6, v4
-; GFX8-IEEE-NEXT: v_add_f32_e32 v7, 1.0, v7
-; GFX8-IEEE-NEXT: v_mul_f32_e64 v1, -v1, v6
-; GFX8-IEEE-NEXT: v_mul_f32_e32 v7, v7, v5
-; GFX8-IEEE-NEXT: v_add_f32_e32 v1, 1.0, v1
-; GFX8-IEEE-NEXT: v_mul_f32_e32 v1, v1, v4
-; GFX8-IEEE-NEXT: v_add_f32_e32 v4, v7, v5
-; GFX8-IEEE-NEXT: v_mul_f32_e64 v3, -v3, v4
-; GFX8-IEEE-NEXT: v_add_f32_e32 v3, 1.0, v3
-; GFX8-IEEE-NEXT: v_mul_f32_e32 v3, v3, v5
+; GFX8-IEEE-NEXT: v_rcp_f32_e32 v5, v1
+; GFX8-IEEE-NEXT: v_rcp_f32_e32 v6, v3
+; GFX8-IEEE-NEXT: v_mul_f32_e32 v7, v4, v5
+; GFX8-IEEE-NEXT: v_mul_f32_e64 v8, -v1, v7
+; GFX8-IEEE-NEXT: v_add_f32_e32 v8, v8, v4
+; GFX8-IEEE-NEXT: v_mul_f32_e32 v8, v8, v5
+; GFX8-IEEE-NEXT: v_add_f32_e32 v7, v8, v7
+; GFX8-IEEE-NEXT: v_mul_f32_e32 v9, v4, v6
+; GFX8-IEEE-NEXT: v_mul_f32_e64 v1, -v1, v7
+; GFX8-IEEE-NEXT: v_mul_f32_e64 v10, -v3, v9
+; GFX8-IEEE-NEXT: v_add_f32_e32 v1, v1, v4
+; GFX8-IEEE-NEXT: v_mul_f32_e32 v1, v1, v5
+; GFX8-IEEE-NEXT: v_add_f32_e32 v5, v10, v4
+; GFX8-IEEE-NEXT: v_mul_f32_e32 v5, v5, v6
+; GFX8-IEEE-NEXT: v_add_f32_e32 v5, v5, v9
+; GFX8-IEEE-NEXT: v_mul_f32_e64 v3, -v3, v5
+; GFX8-IEEE-NEXT: v_add_f32_e32 v3, v3, v4
+; GFX8-IEEE-NEXT: v_mul_f32_e32 v3, v3, v6
; GFX8-IEEE-NEXT: v_and_b32_e32 v1, 0xff800000, v1
; GFX8-IEEE-NEXT: v_and_b32_e32 v3, 0xff800000, v3
-; GFX8-IEEE-NEXT: v_add_f32_e32 v1, v1, v6
-; GFX8-IEEE-NEXT: v_add_f32_e32 v3, v3, v4
+; GFX8-IEEE-NEXT: v_add_f32_e32 v1, v1, v7
+; GFX8-IEEE-NEXT: v_add_f32_e32 v3, v3, v5
; GFX8-IEEE-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX8-IEEE-NEXT: v_cvt_f16_f32_e32 v3, v3
; GFX8-IEEE-NEXT: v_div_fixup_f16 v0, v1, v0, 1.0
@@ -2865,23 +2971,26 @@ define <2 x half> @v_rcp_v2f16_ulp25(<2 x half> %x) {
; GFX8-FLUSH-LABEL: v_rcp_v2f16_ulp25:
; GFX8-FLUSH: ; %bb.0:
; GFX8-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-FLUSH-NEXT: v_lshrrev_b32_e32 v2, 16, v0
; GFX8-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, v0
+; GFX8-FLUSH-NEXT: v_lshrrev_b32_e32 v2, 16, v0
; GFX8-FLUSH-NEXT: v_cvt_f32_f16_e32 v3, v2
-; GFX8-FLUSH-NEXT: v_rcp_f32_e32 v4, v1
-; GFX8-FLUSH-NEXT: v_rcp_f32_e32 v5, v3
-; GFX8-FLUSH-NEXT: v_mad_f32 v6, -v1, v4, 1.0
-; GFX8-FLUSH-NEXT: v_mad_f32 v7, -v3, v5, 1.0
-; GFX8-FLUSH-NEXT: v_mad_f32 v6, v6, v4, v4
-; GFX8-FLUSH-NEXT: v_mad_f32 v7, v7, v5, v5
-; GFX8-FLUSH-NEXT: v_mad_f32 v1, -v1, v6, 1.0
-; GFX8-FLUSH-NEXT: v_mad_f32 v3, -v3, v7, 1.0
-; GFX8-FLUSH-NEXT: v_mul_f32_e32 v1, v1, v4
-; GFX8-FLUSH-NEXT: v_mul_f32_e32 v3, v3, v5
+; GFX8-FLUSH-NEXT: v_cvt_f32_f16_e32 v4, 1.0
+; GFX8-FLUSH-NEXT: v_rcp_f32_e32 v5, v1
+; GFX8-FLUSH-NEXT: v_rcp_f32_e32 v6, v3
+; GFX8-FLUSH-NEXT: v_mul_f32_e32 v7, v4, v5
+; GFX8-FLUSH-NEXT: v_mad_f32 v8, -v1, v7, v4
+; GFX8-FLUSH-NEXT: v_mul_f32_e32 v9, v4, v6
+; GFX8-FLUSH-NEXT: v_mac_f32_e32 v7, v8, v5
+; GFX8-FLUSH-NEXT: v_mad_f32 v8, -v3, v9, v4
+; GFX8-FLUSH-NEXT: v_mac_f32_e32 v9, v8, v6
+; GFX8-FLUSH-NEXT: v_mad_f32 v1, -v1, v7, v4
+; GFX8-FLUSH-NEXT: v_mad_f32 v3, -v3, v9, v4
+; GFX8-FLUSH-NEXT: v_mul_f32_e32 v1, v1, v5
+; GFX8-FLUSH-NEXT: v_mul_f32_e32 v3, v3, v6
; GFX8-FLUSH-NEXT: v_and_b32_e32 v1, 0xff800000, v1
; GFX8-FLUSH-NEXT: v_and_b32_e32 v3, 0xff800000, v3
-; GFX8-FLUSH-NEXT: v_add_f32_e32 v1, v1, v6
-; GFX8-FLUSH-NEXT: v_add_f32_e32 v3, v3, v7
+; GFX8-FLUSH-NEXT: v_add_f32_e32 v1, v1, v7
+; GFX8-FLUSH-NEXT: v_add_f32_e32 v3, v3, v9
; GFX8-FLUSH-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX8-FLUSH-NEXT: v_cvt_f16_f32_e32 v3, v3
; GFX8-FLUSH-NEXT: v_div_fixup_f16 v0, v1, v0, 1.0
@@ -2895,27 +3004,30 @@ define <2 x half> @v_rcp_v2f16_ulp25(<2 x half> %x) {
; GFX9-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v1, v0
; GFX9-IEEE-NEXT: v_lshrrev_b32_e32 v2, 16, v0
+; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v4, 1.0
; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v3, v2
-; GFX9-IEEE-NEXT: v_rcp_f32_e32 v4, v1
-; GFX9-IEEE-NEXT: v_rcp_f32_e32 v5, v3
-; GFX9-IEEE-NEXT: v_mul_f32_e64 v6, -v1, v4
-; GFX9-IEEE-NEXT: v_add_f32_e32 v6, 1.0, v6
-; GFX9-IEEE-NEXT: v_mul_f32_e32 v6, v6, v4
-; GFX9-IEEE-NEXT: v_mul_f32_e64 v7, -v3, v5
-; GFX9-IEEE-NEXT: v_add_f32_e32 v6, v6, v4
-; GFX9-IEEE-NEXT: v_add_f32_e32 v7, 1.0, v7
-; GFX9-IEEE-NEXT: v_mul_f32_e64 v1, -v1, v6
-; GFX9-IEEE-NEXT: v_mul_f32_e32 v7, v7, v5
-; GFX9-IEEE-NEXT: v_add_f32_e32 v1, 1.0, v1
-; GFX9-IEEE-NEXT: v_mul_f32_e32 v1, v1, v4
-; GFX9-IEEE-NEXT: v_add_f32_e32 v4, v7, v5
-; GFX9-IEEE-NEXT: v_mul_f32_e64 v3, -v3, v4
-; GFX9-IEEE-NEXT: v_add_f32_e32 v3, 1.0, v3
-; GFX9-IEEE-NEXT: v_mul_f32_e32 v3, v3, v5
+; GFX9-IEEE-NEXT: v_rcp_f32_e32 v5, v1
+; GFX9-IEEE-NEXT: v_rcp_f32_e32 v6, v3
+; GFX9-IEEE-NEXT: v_mul_f32_e32 v7, v4, v5
+; GFX9-IEEE-NEXT: v_mul_f32_e64 v8, -v1, v7
+; GFX9-IEEE-NEXT: v_add_f32_e32 v8, v8, v4
+; GFX9-IEEE-NEXT: v_mul_f32_e32 v8, v8, v5
+; GFX9-IEEE-NEXT: v_add_f32_e32 v7, v8, v7
+; GFX9-IEEE-NEXT: v_mul_f32_e32 v9, v4, v6
+; GFX9-IEEE-NEXT: v_mul_f32_e64 v1, -v1, v7
+; GFX9-IEEE-NEXT: v_mul_f32_e64 v10, -v3, v9
+; GFX9-IEEE-NEXT: v_add_f32_e32 v1, v1, v4
+; GFX9-IEEE-NEXT: v_mul_f32_e32 v1, v1, v5
+; GFX9-IEEE-NEXT: v_add_f32_e32 v5, v10, v4
+; GFX9-IEEE-NEXT: v_mul_f32_e32 v5, v5, v6
+; GFX9-IEEE-NEXT: v_add_f32_e32 v5, v5, v9
+; GFX9-IEEE-NEXT: v_mul_f32_e64 v3, -v3, v5
+; GFX9-IEEE-NEXT: v_add_f32_e32 v3, v3, v4
+; GFX9-IEEE-NEXT: v_mul_f32_e32 v3, v3, v6
; GFX9-IEEE-NEXT: v_and_b32_e32 v1, 0xff800000, v1
; GFX9-IEEE-NEXT: v_and_b32_e32 v3, 0xff800000, v3
-; GFX9-IEEE-NEXT: v_add_f32_e32 v1, v1, v6
-; GFX9-IEEE-NEXT: v_add_f32_e32 v3, v3, v4
+; GFX9-IEEE-NEXT: v_add_f32_e32 v1, v1, v7
+; GFX9-IEEE-NEXT: v_add_f32_e32 v3, v3, v5
; GFX9-IEEE-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX9-IEEE-NEXT: v_cvt_f16_f32_e32 v3, v3
; GFX9-IEEE-NEXT: v_div_fixup_f16 v0, v1, v0, 1.0
@@ -2926,24 +3038,26 @@ define <2 x half> @v_rcp_v2f16_ulp25(<2 x half> %x) {
; GFX9-FLUSH-LABEL: v_rcp_v2f16_ulp25:
; GFX9-FLUSH: ; %bb.0:
; GFX9-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-FLUSH-NEXT: v_lshrrev_b32_e32 v2, 16, v0
; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, v0
+; GFX9-FLUSH-NEXT: v_lshrrev_b32_e32 v2, 16, v0
; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v3, v2
-; GFX9-FLUSH-NEXT: v_mov_b32_e32 v4, 1.0
+; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v4, 1.0
; GFX9-FLUSH-NEXT: v_rcp_f32_e32 v1, v1
; GFX9-FLUSH-NEXT: v_rcp_f32_e32 v3, v3
-; GFX9-FLUSH-NEXT: v_mad_mix_f32 v5, -v0, v1, v4 op_sel_hi:[1,0,0]
-; GFX9-FLUSH-NEXT: v_mad_mix_f32 v6, -v0, v3, v4 op_sel:[1,0,0] op_sel_hi:[1,0,0]
-; GFX9-FLUSH-NEXT: v_mad_f32 v5, v5, v1, v1
-; GFX9-FLUSH-NEXT: v_mad_f32 v6, v6, v3, v3
-; GFX9-FLUSH-NEXT: v_mad_mix_f32 v7, -v0, v5, v4 op_sel_hi:[1,0,0]
-; GFX9-FLUSH-NEXT: v_mad_mix_f32 v4, -v0, v6, v4 op_sel:[1,0,0] op_sel_hi:[1,0,0]
+; GFX9-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v1
+; GFX9-FLUSH-NEXT: v_mad_mix_f32 v6, -v0, v5, 1.0 op_sel_hi:[1,0,1]
+; GFX9-FLUSH-NEXT: v_mac_f32_e32 v5, v6, v1
+; GFX9-FLUSH-NEXT: v_mul_f32_e32 v4, v4, v3
+; GFX9-FLUSH-NEXT: v_mad_mix_f32 v7, -v0, v5, 1.0 op_sel_hi:[1,0,1]
+; GFX9-FLUSH-NEXT: v_mad_mix_f32 v6, -v0, v4, 1.0 op_sel:[1,0,0] op_sel_hi:[1,0,1]
; GFX9-FLUSH-NEXT: v_mul_f32_e32 v1, v7, v1
-; GFX9-FLUSH-NEXT: v_mul_f32_e32 v3, v4, v3
; GFX9-FLUSH-NEXT: v_and_b32_e32 v1, 0xff800000, v1
-; GFX9-FLUSH-NEXT: v_and_b32_e32 v3, 0xff800000, v3
+; GFX9-FLUSH-NEXT: v_mac_f32_e32 v4, v6, v3
; GFX9-FLUSH-NEXT: v_add_f32_e32 v1, v1, v5
-; GFX9-FLUSH-NEXT: v_add_f32_e32 v3, v3, v6
+; GFX9-FLUSH-NEXT: v_mad_mix_f32 v5, -v0, v4, 1.0 op_sel:[1,0,0] op_sel_hi:[1,0,1]
+; GFX9-FLUSH-NEXT: v_mul_f32_e32 v3, v5, v3
+; GFX9-FLUSH-NEXT: v_and_b32_e32 v3, 0xff800000, v3
+; GFX9-FLUSH-NEXT: v_add_f32_e32 v3, v3, v4
; GFX9-FLUSH-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX9-FLUSH-NEXT: v_cvt_f16_f32_e32 v3, v3
; GFX9-FLUSH-NEXT: v_div_fixup_f16 v0, v1, v0, 1.0
@@ -2956,27 +3070,30 @@ define <2 x half> @v_rcp_v2f16_ulp25(<2 x half> %x) {
; GFX10-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-IEEE-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GFX10-IEEE-NEXT: v_cvt_f32_f16_e32 v2, v0
+; GFX10-IEEE-NEXT: v_cvt_f32_f16_e32 v6, 1.0
; GFX10-IEEE-NEXT: v_cvt_f32_f16_e32 v3, v1
; GFX10-IEEE-NEXT: v_rcp_f32_e32 v4, v2
; GFX10-IEEE-NEXT: v_rcp_f32_e32 v5, v3
-; GFX10-IEEE-NEXT: v_mul_f32_e64 v6, -v2, v4
-; GFX10-IEEE-NEXT: v_mul_f32_e64 v7, -v3, v5
-; GFX10-IEEE-NEXT: v_add_f32_e32 v6, 1.0, v6
-; GFX10-IEEE-NEXT: v_add_f32_e32 v7, 1.0, v7
-; GFX10-IEEE-NEXT: v_mul_f32_e32 v6, v6, v4
-; GFX10-IEEE-NEXT: v_mul_f32_e32 v7, v7, v5
-; GFX10-IEEE-NEXT: v_add_f32_e32 v6, v6, v4
-; GFX10-IEEE-NEXT: v_add_f32_e32 v7, v7, v5
-; GFX10-IEEE-NEXT: v_mul_f32_e64 v2, -v2, v6
-; GFX10-IEEE-NEXT: v_mul_f32_e64 v3, -v3, v7
-; GFX10-IEEE-NEXT: v_add_f32_e32 v2, 1.0, v2
-; GFX10-IEEE-NEXT: v_add_f32_e32 v3, 1.0, v3
+; GFX10-IEEE-NEXT: v_mul_f32_e32 v7, v6, v4
+; GFX10-IEEE-NEXT: v_mul_f32_e32 v8, v6, v5
+; GFX10-IEEE-NEXT: v_mul_f32_e64 v9, -v2, v7
+; GFX10-IEEE-NEXT: v_mul_f32_e64 v10, -v3, v8
+; GFX10-IEEE-NEXT: v_add_f32_e32 v9, v9, v6
+; GFX10-IEEE-NEXT: v_add_f32_e32 v10, v10, v6
+; GFX10-IEEE-NEXT: v_mul_f32_e32 v9, v9, v4
+; GFX10-IEEE-NEXT: v_mul_f32_e32 v10, v10, v5
+; GFX10-IEEE-NEXT: v_add_f32_e32 v7, v9, v7
+; GFX10-IEEE-NEXT: v_add_f32_e32 v8, v10, v8
+; GFX10-IEEE-NEXT: v_mul_f32_e64 v2, -v2, v7
+; GFX10-IEEE-NEXT: v_mul_f32_e64 v3, -v3, v8
+; GFX10-IEEE-NEXT: v_add_f32_e32 v2, v2, v6
+; GFX10-IEEE-NEXT: v_add_f32_e32 v3, v3, v6
; GFX10-IEEE-NEXT: v_mul_f32_e32 v2, v2, v4
; GFX10-IEEE-NEXT: v_mul_f32_e32 v3, v3, v5
; GFX10-IEEE-NEXT: v_and_b32_e32 v2, 0xff800000, v2
; GFX10-IEEE-NEXT: v_and_b32_e32 v3, 0xff800000, v3
-; GFX10-IEEE-NEXT: v_add_f32_e32 v2, v2, v6
-; GFX10-IEEE-NEXT: v_add_f32_e32 v3, v3, v7
+; GFX10-IEEE-NEXT: v_add_f32_e32 v2, v2, v7
+; GFX10-IEEE-NEXT: v_add_f32_e32 v3, v3, v8
; GFX10-IEEE-NEXT: v_cvt_f16_f32_e32 v2, v2
; GFX10-IEEE-NEXT: v_cvt_f16_f32_e32 v3, v3
; GFX10-IEEE-NEXT: v_div_fixup_f16 v0, v2, v0, 1.0
@@ -2989,21 +3106,24 @@ define <2 x half> @v_rcp_v2f16_ulp25(<2 x half> %x) {
; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-FLUSH-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GFX10-FLUSH-NEXT: v_cvt_f32_f16_e32 v2, v0
+; GFX10-FLUSH-NEXT: v_cvt_f32_f16_e32 v6, 1.0
; GFX10-FLUSH-NEXT: v_cvt_f32_f16_e32 v3, v1
; GFX10-FLUSH-NEXT: v_rcp_f32_e32 v4, v2
; GFX10-FLUSH-NEXT: v_rcp_f32_e32 v5, v3
-; GFX10-FLUSH-NEXT: v_mad_f32 v6, -v2, v4, 1.0
-; GFX10-FLUSH-NEXT: v_mad_f32 v7, -v3, v5, 1.0
-; GFX10-FLUSH-NEXT: v_mad_f32 v6, v6, v4, v4
-; GFX10-FLUSH-NEXT: v_mad_f32 v7, v7, v5, v5
-; GFX10-FLUSH-NEXT: v_mad_f32 v2, -v2, v6, 1.0
-; GFX10-FLUSH-NEXT: v_mad_f32 v3, -v3, v7, 1.0
+; GFX10-FLUSH-NEXT: v_mul_f32_e32 v7, v6, v4
+; GFX10-FLUSH-NEXT: v_mul_f32_e32 v8, v6, v5
+; GFX10-FLUSH-NEXT: v_mad_f32 v9, -v2, v7, v6
+; GFX10-FLUSH-NEXT: v_mad_f32 v10, -v3, v8, v6
+; GFX10-FLUSH-NEXT: v_mac_f32_e32 v7, v9, v4
+; GFX10-FLUSH-NEXT: v_mac_f32_e32 v8, v10, v5
+; GFX10-FLUSH-NEXT: v_mad_f32 v2, -v2, v7, v6
+; GFX10-FLUSH-NEXT: v_mad_f32 v3, -v3, v8, v6
; GFX10-FLUSH-NEXT: v_mul_f32_e32 v2, v2, v4
; GFX10-FLUSH-NEXT: v_mul_f32_e32 v3, v3, v5
; GFX10-FLUSH-NEXT: v_and_b32_e32 v2, 0xff800000, v2
; GFX10-FLUSH-NEXT: v_and_b32_e32 v3, 0xff800000, v3
-; GFX10-FLUSH-NEXT: v_add_f32_e32 v2, v2, v6
-; GFX10-FLUSH-NEXT: v_add_f32_e32 v3, v3, v7
+; GFX10-FLUSH-NEXT: v_add_f32_e32 v2, v2, v7
+; GFX10-FLUSH-NEXT: v_add_f32_e32 v3, v3, v8
; GFX10-FLUSH-NEXT: v_cvt_f16_f32_e32 v2, v2
; GFX10-FLUSH-NEXT: v_cvt_f16_f32_e32 v3, v3
; GFX10-FLUSH-NEXT: v_div_fixup_f16 v0, v2, v0, 1.0
@@ -3016,25 +3136,27 @@ define <2 x half> @v_rcp_v2f16_ulp25(<2 x half> %x) {
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GFX11-NEXT: v_cvt_f32_f16_e32 v2, v0
-; GFX11-NEXT: v_mov_b32_e32 v4, 1.0
+; GFX11-NEXT: v_cvt_f32_f16_e32 v4, 1.0
; GFX11-NEXT: v_cvt_f32_f16_e32 v3, v1
; GFX11-NEXT: v_rcp_f32_e32 v2, v2
; GFX11-NEXT: v_rcp_f32_e32 v3, v3
; GFX11-NEXT: s_waitcnt_depctr 0xfff
-; GFX11-NEXT: v_fma_mix_f32 v5, -v0, v2, v4 op_sel_hi:[1,0,0]
-; GFX11-NEXT: v_fma_mix_f32 v6, -v0, v3, v4 op_sel:[1,0,0] op_sel_hi:[1,0,0]
-; GFX11-NEXT: v_fma_f32 v5, v5, v2, v2
-; GFX11-NEXT: v_fma_f32 v6, v6, v3, v3
-; GFX11-NEXT: v_fma_mix_f32 v7, -v0, v5, v4 op_sel_hi:[1,0,0]
-; GFX11-NEXT: v_fma_mix_f32 v4, -v0, v6, v4 op_sel:[1,0,0] op_sel_hi:[1,0,0]
-; GFX11-NEXT: v_dual_mul_f32 v2, v7, v2 :: v_dual_mul_f32 v3, v4, v3
-; GFX11-NEXT: v_and_b32_e32 v2, 0xff800000, v2
-; GFX11-NEXT: v_dual_add_f32 v2, v2, v5 :: v_dual_and_b32 v3, 0xff800000, v3
-; GFX11-NEXT: v_add_f32_e32 v3, v3, v6
-; GFX11-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GFX11-NEXT: v_mul_f32_e32 v5, v4, v2
+; GFX11-NEXT: v_mul_f32_e32 v4, v4, v3
+; GFX11-NEXT: v_fma_mix_f32 v7, -v0, v4, 1.0 op_sel:[1,0,0] op_sel_hi:[1,0,1]
+; GFX11-NEXT: v_fmac_f32_e32 v4, v7, v3
+; GFX11-NEXT: v_fma_mix_f32 v6, -v0, v5, 1.0 op_sel_hi:[1,0,1]
+; GFX11-NEXT: v_fma_mix_f32 v7, -v0, v4, 1.0 op_sel:[1,0,0] op_sel_hi:[1,0,1]
+; GFX11-NEXT: v_fmac_f32_e32 v5, v6, v2
+; GFX11-NEXT: v_mul_f32_e32 v3, v7, v3
+; GFX11-NEXT: v_fma_mix_f32 v6, -v0, v5, 1.0 op_sel_hi:[1,0,1]
+; GFX11-NEXT: v_dual_mul_f32 v2, v6, v2 :: v_dual_and_b32 v3, 0xff800000, v3
+; GFX11-NEXT: v_dual_add_f32 v3, v3, v4 :: v_dual_and_b32 v2, 0xff800000, v2
; GFX11-NEXT: v_cvt_f16_f32_e32 v3, v3
-; GFX11-NEXT: v_div_fixup_f16 v0, v2, v0, 1.0
+; GFX11-NEXT: v_add_f32_e32 v2, v2, v5
; GFX11-NEXT: v_div_fixup_f16 v1, v3, v1, 1.0
+; GFX11-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GFX11-NEXT: v_div_fixup_f16 v0, v2, v0, 1.0
; GFX11-NEXT: v_pack_b32_f16 v0, v0, v1
; GFX11-NEXT: s_setpc_b64 s[30:31]
%fdiv = fdiv <2 x half> <half 1.0, half 1.0>, %x
@@ -3911,38 +4033,40 @@ define amdgpu_ps i32 @s_fdiv_v2f16(i32 inreg %a.arg, i32 inreg %b.arg) {
define amdgpu_ps i16 @s_rcp_f16(i16 inreg %a.arg) {
; GFX6-IEEE-LABEL: s_rcp_f16:
; GFX6-IEEE: ; %bb.0:
-; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, s0
-; GFX6-IEEE-NEXT: v_div_scale_f32 v1, s[0:1], v0, v0, 1.0
-; GFX6-IEEE-NEXT: v_rcp_f32_e32 v2, v1
-; GFX6-IEEE-NEXT: v_div_scale_f32 v3, vcc, 1.0, v0, 1.0
-; GFX6-IEEE-NEXT: v_fma_f32 v4, -v1, v2, 1.0
-; GFX6-IEEE-NEXT: v_fma_f32 v2, v4, v2, v2
-; GFX6-IEEE-NEXT: v_mul_f32_e32 v4, v3, v2
-; GFX6-IEEE-NEXT: v_fma_f32 v5, -v1, v4, v3
-; GFX6-IEEE-NEXT: v_fma_f32 v4, v5, v2, v4
-; GFX6-IEEE-NEXT: v_fma_f32 v1, -v1, v4, v3
-; GFX6-IEEE-NEXT: v_div_fmas_f32 v1, v1, v2, v4
-; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v1, v0, 1.0
+; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, 1.0
+; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, s0
+; GFX6-IEEE-NEXT: v_div_scale_f32 v2, s[0:1], v1, v1, v0
+; GFX6-IEEE-NEXT: v_rcp_f32_e32 v3, v2
+; GFX6-IEEE-NEXT: v_div_scale_f32 v4, vcc, v0, v1, v0
+; GFX6-IEEE-NEXT: v_fma_f32 v5, -v2, v3, 1.0
+; GFX6-IEEE-NEXT: v_fma_f32 v3, v5, v3, v3
+; GFX6-IEEE-NEXT: v_mul_f32_e32 v5, v4, v3
+; GFX6-IEEE-NEXT: v_fma_f32 v6, -v2, v5, v4
+; GFX6-IEEE-NEXT: v_fma_f32 v5, v6, v3, v5
+; GFX6-IEEE-NEXT: v_fma_f32 v2, -v2, v5, v4
+; GFX6-IEEE-NEXT: v_div_fmas_f32 v2, v2, v3, v5
+; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v2, v1, v0
; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-IEEE-NEXT: v_readfirstlane_b32 s0, v0
; GFX6-IEEE-NEXT: ; return to shader part epilog
;
; GFX6-FLUSH-LABEL: s_rcp_f16:
; GFX6-FLUSH: ; %bb.0:
-; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, s0
-; GFX6-FLUSH-NEXT: v_div_scale_f32 v1, s[0:1], v0, v0, 1.0
-; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v2, v1
-; GFX6-FLUSH-NEXT: v_div_scale_f32 v3, vcc, 1.0, v0, 1.0
+; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, 1.0
+; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, s0
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[0:1], v1, v1, v0
+; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v3, v2
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v4, vcc, v0, v1, v0
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
-; GFX6-FLUSH-NEXT: v_fma_f32 v4, -v1, v2, 1.0
-; GFX6-FLUSH-NEXT: v_fma_f32 v2, v4, v2, v2
-; GFX6-FLUSH-NEXT: v_mul_f32_e32 v4, v3, v2
-; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v1, v4, v3
-; GFX6-FLUSH-NEXT: v_fma_f32 v4, v5, v2, v4
-; GFX6-FLUSH-NEXT: v_fma_f32 v1, -v1, v4, v3
+; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v2, v3, 1.0
+; GFX6-FLUSH-NEXT: v_fma_f32 v3, v5, v3, v3
+; GFX6-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v3
+; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v5, v4
+; GFX6-FLUSH-NEXT: v_fma_f32 v5, v6, v3, v5
+; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v5, v4
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
-; GFX6-FLUSH-NEXT: v_div_fmas_f32 v1, v1, v2, v4
-; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v1, v0, 1.0
+; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v5
+; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v2, v1, v0
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-FLUSH-NEXT: v_readfirstlane_b32 s0, v0
@@ -3975,38 +4099,40 @@ define amdgpu_ps i16 @s_rcp_f16(i16 inreg %a.arg) {
define amdgpu_ps i16 @s_neg_rcp_f16(i16 inreg %a.arg) {
; GFX6-IEEE-LABEL: s_neg_rcp_f16:
; GFX6-IEEE: ; %bb.0:
-; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, s0
-; GFX6-IEEE-NEXT: v_div_scale_f32 v1, s[0:1], v0, v0, -1.0
-; GFX6-IEEE-NEXT: v_rcp_f32_e32 v2, v1
-; GFX6-IEEE-NEXT: v_div_scale_f32 v3, vcc, -1.0, v0, -1.0
-; GFX6-IEEE-NEXT: v_fma_f32 v4, -v1, v2, 1.0
-; GFX6-IEEE-NEXT: v_fma_f32 v2, v4, v2, v2
-; GFX6-IEEE-NEXT: v_mul_f32_e32 v4, v3, v2
-; GFX6-IEEE-NEXT: v_fma_f32 v5, -v1, v4, v3
-; GFX6-IEEE-NEXT: v_fma_f32 v4, v5, v2, v4
-; GFX6-IEEE-NEXT: v_fma_f32 v1, -v1, v4, v3
-; GFX6-IEEE-NEXT: v_div_fmas_f32 v1, v1, v2, v4
-; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v1, v0, -1.0
+; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, -1.0
+; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, s0
+; GFX6-IEEE-NEXT: v_div_scale_f32 v2, s[0:1], v1, v1, v0
+; GFX6-IEEE-NEXT: v_rcp_f32_e32 v3, v2
+; GFX6-IEEE-NEXT: v_div_scale_f32 v4, vcc, v0, v1, v0
+; GFX6-IEEE-NEXT: v_fma_f32 v5, -v2, v3, 1.0
+; GFX6-IEEE-NEXT: v_fma_f32 v3, v5, v3, v3
+; GFX6-IEEE-NEXT: v_mul_f32_e32 v5, v4, v3
+; GFX6-IEEE-NEXT: v_fma_f32 v6, -v2, v5, v4
+; GFX6-IEEE-NEXT: v_fma_f32 v5, v6, v3, v5
+; GFX6-IEEE-NEXT: v_fma_f32 v2, -v2, v5, v4
+; GFX6-IEEE-NEXT: v_div_fmas_f32 v2, v2, v3, v5
+; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v2, v1, v0
; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-IEEE-NEXT: v_readfirstlane_b32 s0, v0
; GFX6-IEEE-NEXT: ; return to shader part epilog
;
; GFX6-FLUSH-LABEL: s_neg_rcp_f16:
; GFX6-FLUSH: ; %bb.0:
-; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, s0
-; GFX6-FLUSH-NEXT: v_div_scale_f32 v1, s[0:1], v0, v0, -1.0
-; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v2, v1
-; GFX6-FLUSH-NEXT: v_div_scale_f32 v3, vcc, -1.0, v0, -1.0
+; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, -1.0
+; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, s0
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[0:1], v1, v1, v0
+; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v3, v2
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v4, vcc, v0, v1, v0
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
-; GFX6-FLUSH-NEXT: v_fma_f32 v4, -v1, v2, 1.0
-; GFX6-FLUSH-NEXT: v_fma_f32 v2, v4, v2, v2
-; GFX6-FLUSH-NEXT: v_mul_f32_e32 v4, v3, v2
-; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v1, v4, v3
-; GFX6-FLUSH-NEXT: v_fma_f32 v4, v5, v2, v4
-; GFX6-FLUSH-NEXT: v_fma_f32 v1, -v1, v4, v3
+; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v2, v3, 1.0
+; GFX6-FLUSH-NEXT: v_fma_f32 v3, v5, v3, v3
+; GFX6-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v3
+; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v5, v4
+; GFX6-FLUSH-NEXT: v_fma_f32 v5, v6, v3, v5
+; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v5, v4
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
-; GFX6-FLUSH-NEXT: v_div_fmas_f32 v1, v1, v2, v4
-; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v1, v0, -1.0
+; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v5
+; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v2, v1, v0
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-FLUSH-NEXT: v_readfirstlane_b32 s0, v0
@@ -4040,20 +4166,21 @@ define amdgpu_ps i16 @s_rsq_f16(i16 inreg %a.arg) {
; GFX6-IEEE-LABEL: s_rsq_f16:
; GFX6-IEEE: ; %bb.0:
; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, s0
+; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, 1.0
; GFX6-IEEE-NEXT: v_sqrt_f32_e32 v0, v0
; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX6-IEEE-NEXT: v_div_scale_f32 v1, s[0:1], v0, v0, 1.0
-; GFX6-IEEE-NEXT: v_rcp_f32_e32 v2, v1
-; GFX6-IEEE-NEXT: v_div_scale_f32 v3, vcc, 1.0, v0, 1.0
-; GFX6-IEEE-NEXT: v_fma_f32 v4, -v1, v2, 1.0
-; GFX6-IEEE-NEXT: v_fma_f32 v2, v4, v2, v2
-; GFX6-IEEE-NEXT: v_mul_f32_e32 v4, v3, v2
-; GFX6-IEEE-NEXT: v_fma_f32 v5, -v1, v4, v3
-; GFX6-IEEE-NEXT: v_fma_f32 v4, v5, v2, v4
-; GFX6-IEEE-NEXT: v_fma_f32 v1, -v1, v4, v3
-; GFX6-IEEE-NEXT: v_div_fmas_f32 v1, v1, v2, v4
-; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v1, v0, 1.0
+; GFX6-IEEE-NEXT: v_div_scale_f32 v2, s[0:1], v0, v0, v1
+; GFX6-IEEE-NEXT: v_rcp_f32_e32 v3, v2
+; GFX6-IEEE-NEXT: v_div_scale_f32 v4, vcc, v1, v0, v1
+; GFX6-IEEE-NEXT: v_fma_f32 v5, -v2, v3, 1.0
+; GFX6-IEEE-NEXT: v_fma_f32 v3, v5, v3, v3
+; GFX6-IEEE-NEXT: v_mul_f32_e32 v5, v4, v3
+; GFX6-IEEE-NEXT: v_fma_f32 v6, -v2, v5, v4
+; GFX6-IEEE-NEXT: v_fma_f32 v5, v6, v3, v5
+; GFX6-IEEE-NEXT: v_fma_f32 v2, -v2, v5, v4
+; GFX6-IEEE-NEXT: v_div_fmas_f32 v2, v2, v3, v5
+; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v2, v0, v1
; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-IEEE-NEXT: v_readfirstlane_b32 s0, v0
; GFX6-IEEE-NEXT: ; return to shader part epilog
@@ -4061,23 +4188,24 @@ define amdgpu_ps i16 @s_rsq_f16(i16 inreg %a.arg) {
; GFX6-FLUSH-LABEL: s_rsq_f16:
; GFX6-FLUSH: ; %bb.0:
; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, s0
+; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, 1.0
; GFX6-FLUSH-NEXT: v_sqrt_f32_e32 v0, v0
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX6-FLUSH-NEXT: v_div_scale_f32 v1, s[0:1], v0, v0, 1.0
-; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v2, v1
-; GFX6-FLUSH-NEXT: v_div_scale_f32 v3, vcc, 1.0, v0, 1.0
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[0:1], v0, v0, v1
+; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v3, v2
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v4, vcc, v1, v0, v1
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
-; GFX6-FLUSH-NEXT: v_fma_f32 v4, -v1, v2, 1.0
-; GFX6-FLUSH-NEXT: v_fma_f32 v2, v4, v2, v2
-; GFX6-FLUSH-NEXT: v_mul_f32_e32 v4, v3, v2
-; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v1, v4, v3
-; GFX6-FLUSH-NEXT: v_fma_f32 v4, v5, v2, v4
-; GFX6-FLUSH-NEXT: v_fma_f32 v1, -v1, v4, v3
+; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v2, v3, 1.0
+; GFX6-FLUSH-NEXT: v_fma_f32 v3, v5, v3, v3
+; GFX6-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v3
+; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v5, v4
+; GFX6-FLUSH-NEXT: v_fma_f32 v5, v6, v3, v5
+; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v5, v4
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
-; GFX6-FLUSH-NEXT: v_div_fmas_f32 v1, v1, v2, v4
-; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v1, v0, 1.0
+; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v5
+; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v2, v0, v1
; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-FLUSH-NEXT: v_readfirstlane_b32 s0, v0
; GFX6-FLUSH-NEXT: ; return to shader part epilog
@@ -4113,35 +4241,36 @@ define amdgpu_ps i32 @s_rsq_v2f16(i32 inreg %a.arg) {
; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, s0
; GFX6-IEEE-NEXT: s_lshr_b32 s0, s0, 16
; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, s0
+; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v2, -1.0
; GFX6-IEEE-NEXT: v_sqrt_f32_e32 v0, v0
; GFX6-IEEE-NEXT: v_sqrt_f32_e32 v1, v1
; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, v1
-; GFX6-IEEE-NEXT: v_div_scale_f32 v2, s[0:1], v0, v0, -1.0
-; GFX6-IEEE-NEXT: v_rcp_f32_e32 v4, v2
-; GFX6-IEEE-NEXT: v_div_scale_f32 v3, vcc, -1.0, v0, -1.0
-; GFX6-IEEE-NEXT: v_div_scale_f32 v5, s[0:1], v1, v1, -1.0
-; GFX6-IEEE-NEXT: v_fma_f32 v8, -v2, v4, 1.0
-; GFX6-IEEE-NEXT: v_fma_f32 v4, v8, v4, v4
-; GFX6-IEEE-NEXT: v_rcp_f32_e32 v6, v5
-; GFX6-IEEE-NEXT: v_mul_f32_e32 v9, v3, v4
-; GFX6-IEEE-NEXT: v_fma_f32 v10, -v2, v9, v3
-; GFX6-IEEE-NEXT: v_fma_f32 v9, v10, v4, v9
-; GFX6-IEEE-NEXT: v_fma_f32 v2, -v2, v9, v3
-; GFX6-IEEE-NEXT: v_fma_f32 v8, -v5, v6, 1.0
-; GFX6-IEEE-NEXT: v_div_fmas_f32 v2, v2, v4, v9
-; GFX6-IEEE-NEXT: v_div_scale_f32 v7, s[0:1], -1.0, v1, -1.0
-; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v2, v0, -1.0
-; GFX6-IEEE-NEXT: v_fma_f32 v2, v8, v6, v6
-; GFX6-IEEE-NEXT: v_mul_f32_e32 v3, v7, v2
-; GFX6-IEEE-NEXT: v_fma_f32 v4, -v5, v3, v7
-; GFX6-IEEE-NEXT: v_fma_f32 v3, v4, v2, v3
-; GFX6-IEEE-NEXT: v_fma_f32 v4, -v5, v3, v7
+; GFX6-IEEE-NEXT: v_div_scale_f32 v3, s[0:1], v0, v0, v2
+; GFX6-IEEE-NEXT: v_rcp_f32_e32 v5, v3
+; GFX6-IEEE-NEXT: v_div_scale_f32 v4, vcc, v2, v0, v2
+; GFX6-IEEE-NEXT: v_div_scale_f32 v6, s[0:1], v1, v1, v2
+; GFX6-IEEE-NEXT: v_fma_f32 v9, -v3, v5, 1.0
+; GFX6-IEEE-NEXT: v_fma_f32 v5, v9, v5, v5
+; GFX6-IEEE-NEXT: v_mul_f32_e32 v9, v4, v5
+; GFX6-IEEE-NEXT: v_rcp_f32_e32 v8, v6
+; GFX6-IEEE-NEXT: v_fma_f32 v10, -v3, v9, v4
+; GFX6-IEEE-NEXT: v_fma_f32 v9, v10, v5, v9
+; GFX6-IEEE-NEXT: v_fma_f32 v3, -v3, v9, v4
+; GFX6-IEEE-NEXT: v_div_fmas_f32 v3, v3, v5, v9
+; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v3, v0, v2
+; GFX6-IEEE-NEXT: v_fma_f32 v3, -v6, v8, 1.0
+; GFX6-IEEE-NEXT: v_div_scale_f32 v7, s[0:1], v2, v1, v2
+; GFX6-IEEE-NEXT: v_fma_f32 v3, v3, v8, v8
+; GFX6-IEEE-NEXT: v_mul_f32_e32 v4, v7, v3
+; GFX6-IEEE-NEXT: v_fma_f32 v5, -v6, v4, v7
+; GFX6-IEEE-NEXT: v_fma_f32 v4, v5, v3, v4
+; GFX6-IEEE-NEXT: v_fma_f32 v5, -v6, v4, v7
; GFX6-IEEE-NEXT: s_mov_b64 vcc, s[0:1]
-; GFX6-IEEE-NEXT: v_div_fmas_f32 v2, v4, v2, v3
-; GFX6-IEEE-NEXT: v_div_fixup_f32 v1, v2, v1, -1.0
+; GFX6-IEEE-NEXT: v_div_fmas_f32 v3, v5, v3, v4
+; GFX6-IEEE-NEXT: v_div_fixup_f32 v1, v3, v1, v2
; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-IEEE-NEXT: v_lshlrev_b32_e32 v1, 16, v1
@@ -4154,40 +4283,42 @@ define amdgpu_ps i32 @s_rsq_v2f16(i32 inreg %a.arg) {
; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, s0
; GFX6-FLUSH-NEXT: s_lshr_b32 s0, s0, 16
; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, s0
+; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v2, -1.0
; GFX6-FLUSH-NEXT: v_sqrt_f32_e32 v0, v0
; GFX6-FLUSH-NEXT: v_sqrt_f32_e32 v1, v1
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[0:1], v0, v0, -1.0
-; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v3, v2
-; GFX6-FLUSH-NEXT: v_div_scale_f32 v4, vcc, -1.0, v0, -1.0
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v3, s[0:1], v0, v0, v2
+; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v4, v3
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v5, vcc, v2, v0, v2
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
-; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v2, v3, 1.0
-; GFX6-FLUSH-NEXT: v_fma_f32 v3, v5, v3, v3
-; GFX6-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v3
-; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v5, v4
-; GFX6-FLUSH-NEXT: v_fma_f32 v5, v6, v3, v5
-; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v5, v4
+; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v3, v4, 1.0
+; GFX6-FLUSH-NEXT: v_fma_f32 v4, v6, v4, v4
+; GFX6-FLUSH-NEXT: v_mul_f32_e32 v6, v5, v4
+; GFX6-FLUSH-NEXT: v_fma_f32 v7, -v3, v6, v5
+; GFX6-FLUSH-NEXT: v_fma_f32 v6, v7, v4, v6
+; GFX6-FLUSH-NEXT: v_fma_f32 v3, -v3, v6, v5
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
+; GFX6-FLUSH-NEXT: v_div_fmas_f32 v3, v3, v4, v6
+; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v4, -1.0
; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, v1
-; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v5
-; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v2, v0, -1.0
+; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v3, v0, v2
; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[0:1], v1, v1, -1.0
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[0:1], v1, v1, v4
; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v3, v2
-; GFX6-FLUSH-NEXT: v_div_scale_f32 v4, vcc, -1.0, v1, -1.0
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v5, vcc, v4, v1, v4
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
-; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v2, v3, 1.0
-; GFX6-FLUSH-NEXT: v_fma_f32 v3, v5, v3, v3
-; GFX6-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v3
-; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v5, v4
-; GFX6-FLUSH-NEXT: v_fma_f32 v5, v6, v3, v5
-; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v5, v4
+; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v3, 1.0
+; GFX6-FLUSH-NEXT: v_fma_f32 v3, v6, v3, v3
+; GFX6-FLUSH-NEXT: v_mul_f32_e32 v6, v5, v3
+; GFX6-FLUSH-NEXT: v_fma_f32 v7, -v2, v6, v5
+; GFX6-FLUSH-NEXT: v_fma_f32 v6, v7, v3, v6
+; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v6, v5
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
-; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v5
-; GFX6-FLUSH-NEXT: v_div_fixup_f32 v1, v2, v1, -1.0
+; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v6
+; GFX6-FLUSH-NEXT: v_div_fixup_f32 v1, v2, v1, v4
; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX6-FLUSH-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX6-FLUSH-NEXT: v_or_b32_e32 v0, v0, v1
@@ -4199,28 +4330,31 @@ define amdgpu_ps i32 @s_rsq_v2f16(i32 inreg %a.arg) {
; GFX8-IEEE-NEXT: v_sqrt_f16_e32 v0, s0
; GFX8-IEEE-NEXT: s_lshr_b32 s0, s0, 16
; GFX8-IEEE-NEXT: v_sqrt_f16_e32 v1, s0
+; GFX8-IEEE-NEXT: v_cvt_f32_f16_e32 v4, -1.0
; GFX8-IEEE-NEXT: v_cvt_f32_f16_e32 v2, v0
; GFX8-IEEE-NEXT: v_cvt_f32_f16_e32 v3, v1
-; GFX8-IEEE-NEXT: v_rcp_f32_e32 v4, v2
-; GFX8-IEEE-NEXT: v_rcp_f32_e32 v5, v3
-; GFX8-IEEE-NEXT: v_mul_f32_e32 v6, v2, v4
-; GFX8-IEEE-NEXT: v_add_f32_e32 v6, -1.0, v6
-; GFX8-IEEE-NEXT: v_mul_f32_e32 v7, v3, v5
-; GFX8-IEEE-NEXT: v_add_f32_e32 v7, -1.0, v7
-; GFX8-IEEE-NEXT: v_mul_f32_e32 v7, v7, v5
-; GFX8-IEEE-NEXT: v_mul_f32_e32 v6, v6, v4
-; GFX8-IEEE-NEXT: v_sub_f32_e32 v7, v7, v5
-; GFX8-IEEE-NEXT: v_sub_f32_e32 v6, v6, v4
-; GFX8-IEEE-NEXT: v_mul_f32_e64 v3, -v3, v7
-; GFX8-IEEE-NEXT: v_mul_f32_e64 v2, -v2, v6
-; GFX8-IEEE-NEXT: v_add_f32_e32 v3, -1.0, v3
-; GFX8-IEEE-NEXT: v_add_f32_e32 v2, -1.0, v2
-; GFX8-IEEE-NEXT: v_mul_f32_e32 v3, v3, v5
-; GFX8-IEEE-NEXT: v_mul_f32_e32 v2, v2, v4
+; GFX8-IEEE-NEXT: v_rcp_f32_e32 v5, v2
+; GFX8-IEEE-NEXT: v_rcp_f32_e32 v6, v3
+; GFX8-IEEE-NEXT: v_mul_f32_e32 v7, v4, v5
+; GFX8-IEEE-NEXT: v_mul_f32_e64 v8, -v2, v7
+; GFX8-IEEE-NEXT: v_add_f32_e32 v8, v8, v4
+; GFX8-IEEE-NEXT: v_mul_f32_e32 v9, v4, v6
+; GFX8-IEEE-NEXT: v_mul_f32_e32 v8, v8, v5
+; GFX8-IEEE-NEXT: v_mul_f32_e64 v10, -v3, v9
+; GFX8-IEEE-NEXT: v_add_f32_e32 v7, v8, v7
+; GFX8-IEEE-NEXT: v_add_f32_e32 v10, v10, v4
+; GFX8-IEEE-NEXT: v_mul_f32_e64 v2, -v2, v7
+; GFX8-IEEE-NEXT: v_mul_f32_e32 v8, v10, v6
+; GFX8-IEEE-NEXT: v_add_f32_e32 v2, v2, v4
+; GFX8-IEEE-NEXT: v_mul_f32_e32 v2, v2, v5
+; GFX8-IEEE-NEXT: v_add_f32_e32 v5, v8, v9
+; GFX8-IEEE-NEXT: v_mul_f32_e64 v3, -v3, v5
+; GFX8-IEEE-NEXT: v_add_f32_e32 v3, v3, v4
+; GFX8-IEEE-NEXT: v_mul_f32_e32 v3, v3, v6
; GFX8-IEEE-NEXT: v_and_b32_e32 v3, 0xff800000, v3
; GFX8-IEEE-NEXT: v_and_b32_e32 v2, 0xff800000, v2
-; GFX8-IEEE-NEXT: v_add_f32_e32 v3, v3, v7
-; GFX8-IEEE-NEXT: v_add_f32_e32 v2, v2, v6
+; GFX8-IEEE-NEXT: v_add_f32_e32 v3, v3, v5
+; GFX8-IEEE-NEXT: v_add_f32_e32 v2, v2, v7
; GFX8-IEEE-NEXT: v_cvt_f16_f32_e32 v3, v3
; GFX8-IEEE-NEXT: v_cvt_f16_f32_e32 v2, v2
; GFX8-IEEE-NEXT: v_div_fixup_f16 v1, v3, v1, -1.0
@@ -4235,22 +4369,25 @@ define amdgpu_ps i32 @s_rsq_v2f16(i32 inreg %a.arg) {
; GFX8-FLUSH-NEXT: v_sqrt_f16_e32 v0, s0
; GFX8-FLUSH-NEXT: s_lshr_b32 s0, s0, 16
; GFX8-FLUSH-NEXT: v_sqrt_f16_e32 v1, s0
+; GFX8-FLUSH-NEXT: v_cvt_f32_f16_e32 v4, -1.0
; GFX8-FLUSH-NEXT: v_cvt_f32_f16_e32 v2, v0
; GFX8-FLUSH-NEXT: v_cvt_f32_f16_e32 v3, v1
-; GFX8-FLUSH-NEXT: v_rcp_f32_e32 v4, v2
-; GFX8-FLUSH-NEXT: v_rcp_f32_e32 v5, v3
-; GFX8-FLUSH-NEXT: v_mad_f32 v6, v2, v4, -1.0
-; GFX8-FLUSH-NEXT: v_mad_f32 v6, v6, v4, -v4
-; GFX8-FLUSH-NEXT: v_mad_f32 v7, v3, v5, -1.0
-; GFX8-FLUSH-NEXT: v_mad_f32 v7, v7, v5, -v5
-; GFX8-FLUSH-NEXT: v_mad_f32 v3, -v3, v7, -1.0
-; GFX8-FLUSH-NEXT: v_mad_f32 v2, -v2, v6, -1.0
-; GFX8-FLUSH-NEXT: v_mul_f32_e32 v3, v3, v5
-; GFX8-FLUSH-NEXT: v_mul_f32_e32 v2, v2, v4
+; GFX8-FLUSH-NEXT: v_rcp_f32_e32 v5, v2
+; GFX8-FLUSH-NEXT: v_rcp_f32_e32 v6, v3
+; GFX8-FLUSH-NEXT: v_mul_f32_e32 v7, v4, v5
+; GFX8-FLUSH-NEXT: v_mad_f32 v8, -v2, v7, v4
+; GFX8-FLUSH-NEXT: v_mul_f32_e32 v9, v4, v6
+; GFX8-FLUSH-NEXT: v_mac_f32_e32 v7, v8, v5
+; GFX8-FLUSH-NEXT: v_mad_f32 v8, -v3, v9, v4
+; GFX8-FLUSH-NEXT: v_mac_f32_e32 v9, v8, v6
+; GFX8-FLUSH-NEXT: v_mad_f32 v3, -v3, v9, v4
+; GFX8-FLUSH-NEXT: v_mad_f32 v2, -v2, v7, v4
+; GFX8-FLUSH-NEXT: v_mul_f32_e32 v3, v3, v6
+; GFX8-FLUSH-NEXT: v_mul_f32_e32 v2, v2, v5
; GFX8-FLUSH-NEXT: v_and_b32_e32 v3, 0xff800000, v3
; GFX8-FLUSH-NEXT: v_and_b32_e32 v2, 0xff800000, v2
-; GFX8-FLUSH-NEXT: v_add_f32_e32 v3, v3, v7
-; GFX8-FLUSH-NEXT: v_add_f32_e32 v2, v2, v6
+; GFX8-FLUSH-NEXT: v_add_f32_e32 v3, v3, v9
+; GFX8-FLUSH-NEXT: v_add_f32_e32 v2, v2, v7
; GFX8-FLUSH-NEXT: v_cvt_f16_f32_e32 v3, v3
; GFX8-FLUSH-NEXT: v_cvt_f16_f32_e32 v2, v2
; GFX8-FLUSH-NEXT: v_div_fixup_f16 v1, v3, v1, -1.0
@@ -4265,22 +4402,25 @@ define amdgpu_ps i32 @s_rsq_v2f16(i32 inreg %a.arg) {
; GFX9-IEEE-NEXT: v_sqrt_f16_e32 v0, s0
; GFX9-IEEE-NEXT: s_lshr_b32 s0, s0, 16
; GFX9-IEEE-NEXT: v_sqrt_f16_e32 v1, s0
+; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v4, -1.0
; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v2, v0
; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v3, v1
-; GFX9-IEEE-NEXT: v_rcp_f32_e32 v4, v2
-; GFX9-IEEE-NEXT: v_rcp_f32_e32 v5, v3
-; GFX9-IEEE-NEXT: v_fma_f32 v6, v2, v4, -1.0
-; GFX9-IEEE-NEXT: v_fma_f32 v6, v6, v4, -v4
-; GFX9-IEEE-NEXT: v_fma_f32 v7, v3, v5, -1.0
-; GFX9-IEEE-NEXT: v_fma_f32 v7, v7, v5, -v5
-; GFX9-IEEE-NEXT: v_fma_f32 v2, -v2, v6, -1.0
-; GFX9-IEEE-NEXT: v_fma_f32 v3, -v3, v7, -1.0
-; GFX9-IEEE-NEXT: v_mul_f32_e32 v2, v2, v4
-; GFX9-IEEE-NEXT: v_mul_f32_e32 v3, v3, v5
+; GFX9-IEEE-NEXT: v_rcp_f32_e32 v5, v2
+; GFX9-IEEE-NEXT: v_rcp_f32_e32 v6, v3
+; GFX9-IEEE-NEXT: v_mul_f32_e32 v7, v4, v5
+; GFX9-IEEE-NEXT: v_fma_f32 v8, -v2, v7, v4
+; GFX9-IEEE-NEXT: v_mul_f32_e32 v9, v4, v6
+; GFX9-IEEE-NEXT: v_fma_f32 v7, v8, v5, v7
+; GFX9-IEEE-NEXT: v_fma_f32 v8, -v3, v9, v4
+; GFX9-IEEE-NEXT: v_fma_f32 v8, v8, v6, v9
+; GFX9-IEEE-NEXT: v_fma_f32 v2, -v2, v7, v4
+; GFX9-IEEE-NEXT: v_fma_f32 v3, -v3, v8, v4
+; GFX9-IEEE-NEXT: v_mul_f32_e32 v2, v2, v5
+; GFX9-IEEE-NEXT: v_mul_f32_e32 v3, v3, v6
; GFX9-IEEE-NEXT: v_and_b32_e32 v2, 0xff800000, v2
; GFX9-IEEE-NEXT: v_and_b32_e32 v3, 0xff800000, v3
-; GFX9-IEEE-NEXT: v_add_f32_e32 v2, v2, v6
-; GFX9-IEEE-NEXT: v_add_f32_e32 v3, v3, v7
+; GFX9-IEEE-NEXT: v_add_f32_e32 v2, v2, v7
+; GFX9-IEEE-NEXT: v_add_f32_e32 v3, v3, v8
; GFX9-IEEE-NEXT: v_cvt_f16_f32_e32 v2, v2
; GFX9-IEEE-NEXT: v_cvt_f16_f32_e32 v3, v3
; GFX9-IEEE-NEXT: v_div_fixup_f16 v0, v2, v0, -1.0
@@ -4294,23 +4434,25 @@ define amdgpu_ps i32 @s_rsq_v2f16(i32 inreg %a.arg) {
; GFX9-FLUSH-NEXT: v_sqrt_f16_e32 v0, s0
; GFX9-FLUSH-NEXT: s_lshr_b32 s0, s0, 16
; GFX9-FLUSH-NEXT: v_sqrt_f16_e32 v1, s0
-; GFX9-FLUSH-NEXT: v_mov_b32_e32 v4, -1.0
+; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v4, -1.0
; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v2, v0
; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v3, v1
; GFX9-FLUSH-NEXT: v_rcp_f32_e32 v2, v2
; GFX9-FLUSH-NEXT: v_rcp_f32_e32 v3, v3
-; GFX9-FLUSH-NEXT: v_mad_mix_f32 v5, v0, v2, v4 op_sel_hi:[1,0,0]
-; GFX9-FLUSH-NEXT: v_mad_f32 v5, v5, v2, -v2
-; GFX9-FLUSH-NEXT: v_mad_mix_f32 v6, v1, v3, v4 op_sel_hi:[1,0,0]
-; GFX9-FLUSH-NEXT: v_mad_f32 v6, v6, v3, -v3
-; GFX9-FLUSH-NEXT: v_mad_mix_f32 v7, -v0, v5, v4 op_sel_hi:[1,0,0]
-; GFX9-FLUSH-NEXT: v_mad_mix_f32 v4, -v1, v6, v4 op_sel_hi:[1,0,0]
+; GFX9-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v2
+; GFX9-FLUSH-NEXT: v_mad_mix_f32 v6, -v0, v5, -1.0 op_sel_hi:[1,0,1]
+; GFX9-FLUSH-NEXT: v_mul_f32_e32 v4, v4, v3
+; GFX9-FLUSH-NEXT: v_mac_f32_e32 v5, v6, v2
+; GFX9-FLUSH-NEXT: v_mad_mix_f32 v6, -v1, v4, -1.0 op_sel_hi:[1,0,1]
+; GFX9-FLUSH-NEXT: v_mac_f32_e32 v4, v6, v3
+; GFX9-FLUSH-NEXT: v_mad_mix_f32 v7, -v0, v5, -1.0 op_sel_hi:[1,0,1]
+; GFX9-FLUSH-NEXT: v_mad_mix_f32 v6, -v1, v4, -1.0 op_sel_hi:[1,0,1]
; GFX9-FLUSH-NEXT: v_mul_f32_e32 v2, v7, v2
-; GFX9-FLUSH-NEXT: v_mul_f32_e32 v3, v4, v3
+; GFX9-FLUSH-NEXT: v_mul_f32_e32 v3, v6, v3
; GFX9-FLUSH-NEXT: v_and_b32_e32 v2, 0xff800000, v2
; GFX9-FLUSH-NEXT: v_and_b32_e32 v3, 0xff800000, v3
; GFX9-FLUSH-NEXT: v_add_f32_e32 v2, v2, v5
-; GFX9-FLUSH-NEXT: v_add_f32_e32 v3, v3, v6
+; GFX9-FLUSH-NEXT: v_add_f32_e32 v3, v3, v4
; GFX9-FLUSH-NEXT: v_cvt_f16_f32_e32 v2, v2
; GFX9-FLUSH-NEXT: v_cvt_f16_f32_e32 v3, v3
; GFX9-FLUSH-NEXT: v_div_fixup_f16 v0, v2, v0, -1.0
@@ -4324,23 +4466,25 @@ define amdgpu_ps i32 @s_rsq_v2f16(i32 inreg %a.arg) {
; GFX10-IEEE-NEXT: s_lshr_b32 s1, s0, 16
; GFX10-IEEE-NEXT: v_sqrt_f16_e32 v0, s0
; GFX10-IEEE-NEXT: v_sqrt_f16_e32 v1, s1
-; GFX10-IEEE-NEXT: v_mov_b32_e32 v4, -1.0
+; GFX10-IEEE-NEXT: v_cvt_f32_f16_e32 v4, -1.0
; GFX10-IEEE-NEXT: v_cvt_f32_f16_e32 v2, v0
; GFX10-IEEE-NEXT: v_cvt_f32_f16_e32 v3, v1
; GFX10-IEEE-NEXT: v_rcp_f32_e32 v2, v2
; GFX10-IEEE-NEXT: v_rcp_f32_e32 v3, v3
-; GFX10-IEEE-NEXT: v_fma_mix_f32 v5, v0, v2, v4 op_sel_hi:[1,0,0]
-; GFX10-IEEE-NEXT: v_fma_mix_f32 v6, v1, v3, v4 op_sel_hi:[1,0,0]
-; GFX10-IEEE-NEXT: v_fma_f32 v5, v5, v2, -v2
-; GFX10-IEEE-NEXT: v_fma_f32 v6, v6, v3, -v3
-; GFX10-IEEE-NEXT: v_fma_mix_f32 v7, -v0, v5, v4 op_sel_hi:[1,0,0]
-; GFX10-IEEE-NEXT: v_fma_mix_f32 v4, -v1, v6, v4 op_sel_hi:[1,0,0]
-; GFX10-IEEE-NEXT: v_mul_f32_e32 v2, v7, v2
-; GFX10-IEEE-NEXT: v_mul_f32_e32 v3, v4, v3
+; GFX10-IEEE-NEXT: v_mul_f32_e32 v5, v4, v2
+; GFX10-IEEE-NEXT: v_mul_f32_e32 v4, v4, v3
+; GFX10-IEEE-NEXT: v_fma_mix_f32 v6, -v0, v5, -1.0 op_sel_hi:[1,0,1]
+; GFX10-IEEE-NEXT: v_fma_mix_f32 v7, -v1, v4, -1.0 op_sel_hi:[1,0,1]
+; GFX10-IEEE-NEXT: v_fmac_f32_e32 v5, v6, v2
+; GFX10-IEEE-NEXT: v_fmac_f32_e32 v4, v7, v3
+; GFX10-IEEE-NEXT: v_fma_mix_f32 v6, -v0, v5, -1.0 op_sel_hi:[1,0,1]
+; GFX10-IEEE-NEXT: v_fma_mix_f32 v7, -v1, v4, -1.0 op_sel_hi:[1,0,1]
+; GFX10-IEEE-NEXT: v_mul_f32_e32 v2, v6, v2
+; GFX10-IEEE-NEXT: v_mul_f32_e32 v3, v7, v3
; GFX10-IEEE-NEXT: v_and_b32_e32 v2, 0xff800000, v2
; GFX10-IEEE-NEXT: v_and_b32_e32 v3, 0xff800000, v3
; GFX10-IEEE-NEXT: v_add_f32_e32 v2, v2, v5
-; GFX10-IEEE-NEXT: v_add_f32_e32 v3, v3, v6
+; GFX10-IEEE-NEXT: v_add_f32_e32 v3, v3, v4
; GFX10-IEEE-NEXT: v_cvt_f16_f32_e32 v2, v2
; GFX10-IEEE-NEXT: v_cvt_f16_f32_e32 v3, v3
; GFX10-IEEE-NEXT: v_div_fixup_f16 v0, v2, v0, -1.0
@@ -4354,22 +4498,25 @@ define amdgpu_ps i32 @s_rsq_v2f16(i32 inreg %a.arg) {
; GFX10-FLUSH-NEXT: s_lshr_b32 s1, s0, 16
; GFX10-FLUSH-NEXT: v_sqrt_f16_e32 v0, s0
; GFX10-FLUSH-NEXT: v_sqrt_f16_e32 v1, s1
+; GFX10-FLUSH-NEXT: v_cvt_f32_f16_e32 v6, -1.0
; GFX10-FLUSH-NEXT: v_cvt_f32_f16_e32 v2, v0
; GFX10-FLUSH-NEXT: v_cvt_f32_f16_e32 v3, v1
; GFX10-FLUSH-NEXT: v_rcp_f32_e32 v4, v2
; GFX10-FLUSH-NEXT: v_rcp_f32_e32 v5, v3
-; GFX10-FLUSH-NEXT: v_mad_f32 v6, v2, v4, -1.0
-; GFX10-FLUSH-NEXT: v_mad_f32 v7, v3, v5, -1.0
-; GFX10-FLUSH-NEXT: v_mad_f32 v6, v6, v4, -v4
-; GFX10-FLUSH-NEXT: v_mad_f32 v7, v7, v5, -v5
-; GFX10-FLUSH-NEXT: v_mad_f32 v2, -v2, v6, -1.0
-; GFX10-FLUSH-NEXT: v_mad_f32 v3, -v3, v7, -1.0
+; GFX10-FLUSH-NEXT: v_mul_f32_e32 v7, v6, v4
+; GFX10-FLUSH-NEXT: v_mul_f32_e32 v8, v6, v5
+; GFX10-FLUSH-NEXT: v_mad_f32 v9, -v2, v7, v6
+; GFX10-FLUSH-NEXT: v_mad_f32 v10, -v3, v8, v6
+; GFX10-FLUSH-NEXT: v_mac_f32_e32 v7, v9, v4
+; GFX10-FLUSH-NEXT: v_mac_f32_e32 v8, v10, v5
+; GFX10-FLUSH-NEXT: v_mad_f32 v2, -v2, v7, v6
+; GFX10-FLUSH-NEXT: v_mad_f32 v3, -v3, v8, v6
; GFX10-FLUSH-NEXT: v_mul_f32_e32 v2, v2, v4
; GFX10-FLUSH-NEXT: v_mul_f32_e32 v3, v3, v5
; GFX10-FLUSH-NEXT: v_and_b32_e32 v2, 0xff800000, v2
; GFX10-FLUSH-NEXT: v_and_b32_e32 v3, 0xff800000, v3
-; GFX10-FLUSH-NEXT: v_add_f32_e32 v2, v2, v6
-; GFX10-FLUSH-NEXT: v_add_f32_e32 v3, v3, v7
+; GFX10-FLUSH-NEXT: v_add_f32_e32 v2, v2, v7
+; GFX10-FLUSH-NEXT: v_add_f32_e32 v3, v3, v8
; GFX10-FLUSH-NEXT: v_cvt_f16_f32_e32 v2, v2
; GFX10-FLUSH-NEXT: v_cvt_f16_f32_e32 v3, v3
; GFX10-FLUSH-NEXT: v_div_fixup_f16 v0, v2, v0, -1.0
@@ -4383,27 +4530,29 @@ define amdgpu_ps i32 @s_rsq_v2f16(i32 inreg %a.arg) {
; GFX11-NEXT: s_lshr_b32 s1, s0, 16
; GFX11-NEXT: v_sqrt_f16_e32 v0, s0
; GFX11-NEXT: v_sqrt_f16_e32 v1, s1
-; GFX11-NEXT: v_mov_b32_e32 v4, -1.0
+; GFX11-NEXT: v_cvt_f32_f16_e32 v4, -1.0
; GFX11-NEXT: s_waitcnt_depctr 0xfff
; GFX11-NEXT: v_cvt_f32_f16_e32 v2, v0
; GFX11-NEXT: v_cvt_f32_f16_e32 v3, v1
; GFX11-NEXT: v_rcp_f32_e32 v2, v2
; GFX11-NEXT: v_rcp_f32_e32 v3, v3
; GFX11-NEXT: s_waitcnt_depctr 0xfff
-; GFX11-NEXT: v_fma_mix_f32 v5, v0, v2, v4 op_sel_hi:[1,0,0]
-; GFX11-NEXT: v_fma_mix_f32 v6, v1, v3, v4 op_sel_hi:[1,0,0]
-; GFX11-NEXT: v_fma_f32 v5, v5, v2, -v2
-; GFX11-NEXT: v_fma_f32 v6, v6, v3, -v3
-; GFX11-NEXT: v_fma_mix_f32 v7, -v0, v5, v4 op_sel_hi:[1,0,0]
-; GFX11-NEXT: v_fma_mix_f32 v4, -v1, v6, v4 op_sel_hi:[1,0,0]
-; GFX11-NEXT: v_dual_mul_f32 v2, v7, v2 :: v_dual_mul_f32 v3, v4, v3
-; GFX11-NEXT: v_and_b32_e32 v2, 0xff800000, v2
-; GFX11-NEXT: v_dual_add_f32 v2, v2, v5 :: v_dual_and_b32 v3, 0xff800000, v3
-; GFX11-NEXT: v_add_f32_e32 v3, v3, v6
-; GFX11-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GFX11-NEXT: v_mul_f32_e32 v5, v4, v2
+; GFX11-NEXT: v_mul_f32_e32 v4, v4, v3
+; GFX11-NEXT: v_fma_mix_f32 v7, -v1, v4, -1.0 op_sel_hi:[1,0,1]
+; GFX11-NEXT: v_fmac_f32_e32 v4, v7, v3
+; GFX11-NEXT: v_fma_mix_f32 v6, -v0, v5, -1.0 op_sel_hi:[1,0,1]
+; GFX11-NEXT: v_fma_mix_f32 v7, -v1, v4, -1.0 op_sel_hi:[1,0,1]
+; GFX11-NEXT: v_fmac_f32_e32 v5, v6, v2
+; GFX11-NEXT: v_mul_f32_e32 v3, v7, v3
+; GFX11-NEXT: v_fma_mix_f32 v6, -v0, v5, -1.0 op_sel_hi:[1,0,1]
+; GFX11-NEXT: v_dual_mul_f32 v2, v6, v2 :: v_dual_and_b32 v3, 0xff800000, v3
+; GFX11-NEXT: v_dual_add_f32 v3, v3, v4 :: v_dual_and_b32 v2, 0xff800000, v2
; GFX11-NEXT: v_cvt_f16_f32_e32 v3, v3
-; GFX11-NEXT: v_div_fixup_f16 v0, v2, v0, -1.0
+; GFX11-NEXT: v_add_f32_e32 v2, v2, v5
; GFX11-NEXT: v_div_fixup_f16 v1, v3, v1, -1.0
+; GFX11-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GFX11-NEXT: v_div_fixup_f16 v0, v2, v0, -1.0
; GFX11-NEXT: v_pack_b32_f16 v0, v0, v1
; GFX11-NEXT: v_readfirstlane_b32 s0, v0
; GFX11-NEXT: ; return to shader part epilog
@@ -4419,20 +4568,21 @@ define half @v_rsq_f16(half %a) {
; GFX6-IEEE: ; %bb.0:
; GFX6-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, 1.0
; GFX6-IEEE-NEXT: v_sqrt_f32_e32 v0, v0
; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX6-IEEE-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, 1.0
-; GFX6-IEEE-NEXT: v_rcp_f32_e32 v2, v1
-; GFX6-IEEE-NEXT: v_div_scale_f32 v3, vcc, 1.0, v0, 1.0
-; GFX6-IEEE-NEXT: v_fma_f32 v4, -v1, v2, 1.0
-; GFX6-IEEE-NEXT: v_fma_f32 v2, v4, v2, v2
-; GFX6-IEEE-NEXT: v_mul_f32_e32 v4, v3, v2
-; GFX6-IEEE-NEXT: v_fma_f32 v5, -v1, v4, v3
-; GFX6-IEEE-NEXT: v_fma_f32 v4, v5, v2, v4
-; GFX6-IEEE-NEXT: v_fma_f32 v1, -v1, v4, v3
-; GFX6-IEEE-NEXT: v_div_fmas_f32 v1, v1, v2, v4
-; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v1, v0, 1.0
+; GFX6-IEEE-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, v1
+; GFX6-IEEE-NEXT: v_rcp_f32_e32 v3, v2
+; GFX6-IEEE-NEXT: v_div_scale_f32 v4, vcc, v1, v0, v1
+; GFX6-IEEE-NEXT: v_fma_f32 v5, -v2, v3, 1.0
+; GFX6-IEEE-NEXT: v_fma_f32 v3, v5, v3, v3
+; GFX6-IEEE-NEXT: v_mul_f32_e32 v5, v4, v3
+; GFX6-IEEE-NEXT: v_fma_f32 v6, -v2, v5, v4
+; GFX6-IEEE-NEXT: v_fma_f32 v5, v6, v3, v5
+; GFX6-IEEE-NEXT: v_fma_f32 v2, -v2, v5, v4
+; GFX6-IEEE-NEXT: v_div_fmas_f32 v2, v2, v3, v5
+; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v2, v0, v1
; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-IEEE-NEXT: s_setpc_b64 s[30:31]
;
@@ -4440,23 +4590,24 @@ define half @v_rsq_f16(half %a) {
; GFX6-FLUSH: ; %bb.0:
; GFX6-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, 1.0
; GFX6-FLUSH-NEXT: v_sqrt_f32_e32 v0, v0
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX6-FLUSH-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, 1.0
-; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v2, v1
-; GFX6-FLUSH-NEXT: v_div_scale_f32 v3, vcc, 1.0, v0, 1.0
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, v1
+; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v3, v2
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v4, vcc, v1, v0, v1
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
-; GFX6-FLUSH-NEXT: v_fma_f32 v4, -v1, v2, 1.0
-; GFX6-FLUSH-NEXT: v_fma_f32 v2, v4, v2, v2
-; GFX6-FLUSH-NEXT: v_mul_f32_e32 v4, v3, v2
-; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v1, v4, v3
-; GFX6-FLUSH-NEXT: v_fma_f32 v4, v5, v2, v4
-; GFX6-FLUSH-NEXT: v_fma_f32 v1, -v1, v4, v3
+; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v2, v3, 1.0
+; GFX6-FLUSH-NEXT: v_fma_f32 v3, v5, v3, v3
+; GFX6-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v3
+; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v5, v4
+; GFX6-FLUSH-NEXT: v_fma_f32 v5, v6, v3, v5
+; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v5, v4
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
-; GFX6-FLUSH-NEXT: v_div_fmas_f32 v1, v1, v2, v4
-; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v1, v0, 1.0
+; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v5
+; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v2, v0, v1
; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-FLUSH-NEXT: s_setpc_b64 s[30:31]
;
@@ -4481,20 +4632,21 @@ define half @v_neg_rsq_f16(half %a) {
; GFX6-IEEE: ; %bb.0:
; GFX6-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, -1.0
; GFX6-IEEE-NEXT: v_sqrt_f32_e32 v0, v0
; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX6-IEEE-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, -1.0
-; GFX6-IEEE-NEXT: v_rcp_f32_e32 v2, v1
-; GFX6-IEEE-NEXT: v_div_scale_f32 v3, vcc, -1.0, v0, -1.0
-; GFX6-IEEE-NEXT: v_fma_f32 v4, -v1, v2, 1.0
-; GFX6-IEEE-NEXT: v_fma_f32 v2, v4, v2, v2
-; GFX6-IEEE-NEXT: v_mul_f32_e32 v4, v3, v2
-; GFX6-IEEE-NEXT: v_fma_f32 v5, -v1, v4, v3
-; GFX6-IEEE-NEXT: v_fma_f32 v4, v5, v2, v4
-; GFX6-IEEE-NEXT: v_fma_f32 v1, -v1, v4, v3
-; GFX6-IEEE-NEXT: v_div_fmas_f32 v1, v1, v2, v4
-; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v1, v0, -1.0
+; GFX6-IEEE-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, v1
+; GFX6-IEEE-NEXT: v_rcp_f32_e32 v3, v2
+; GFX6-IEEE-NEXT: v_div_scale_f32 v4, vcc, v1, v0, v1
+; GFX6-IEEE-NEXT: v_fma_f32 v5, -v2, v3, 1.0
+; GFX6-IEEE-NEXT: v_fma_f32 v3, v5, v3, v3
+; GFX6-IEEE-NEXT: v_mul_f32_e32 v5, v4, v3
+; GFX6-IEEE-NEXT: v_fma_f32 v6, -v2, v5, v4
+; GFX6-IEEE-NEXT: v_fma_f32 v5, v6, v3, v5
+; GFX6-IEEE-NEXT: v_fma_f32 v2, -v2, v5, v4
+; GFX6-IEEE-NEXT: v_div_fmas_f32 v2, v2, v3, v5
+; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v2, v0, v1
; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-IEEE-NEXT: s_setpc_b64 s[30:31]
;
@@ -4502,23 +4654,24 @@ define half @v_neg_rsq_f16(half %a) {
; GFX6-FLUSH: ; %bb.0:
; GFX6-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, -1.0
; GFX6-FLUSH-NEXT: v_sqrt_f32_e32 v0, v0
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX6-FLUSH-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, -1.0
-; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v2, v1
-; GFX6-FLUSH-NEXT: v_div_scale_f32 v3, vcc, -1.0, v0, -1.0
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, v1
+; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v3, v2
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v4, vcc, v1, v0, v1
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
-; GFX6-FLUSH-NEXT: v_fma_f32 v4, -v1, v2, 1.0
-; GFX6-FLUSH-NEXT: v_fma_f32 v2, v4, v2, v2
-; GFX6-FLUSH-NEXT: v_mul_f32_e32 v4, v3, v2
-; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v1, v4, v3
-; GFX6-FLUSH-NEXT: v_fma_f32 v4, v5, v2, v4
-; GFX6-FLUSH-NEXT: v_fma_f32 v1, -v1, v4, v3
+; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v2, v3, 1.0
+; GFX6-FLUSH-NEXT: v_fma_f32 v3, v5, v3, v3
+; GFX6-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v3
+; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v5, v4
+; GFX6-FLUSH-NEXT: v_fma_f32 v5, v6, v3, v5
+; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v5, v4
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
-; GFX6-FLUSH-NEXT: v_div_fmas_f32 v1, v1, v2, v4
-; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v1, v0, -1.0
+; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v5
+; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v2, v0, v1
; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-FLUSH-NEXT: s_setpc_b64 s[30:31]
;
@@ -4553,20 +4706,21 @@ define { half, half } @v_rsq_f16_multi_use(half %a) {
; GFX6-IEEE: ; %bb.0:
; GFX6-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, 1.0
; GFX6-IEEE-NEXT: v_sqrt_f32_e32 v0, v0
; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, v0
-; GFX6-IEEE-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, 1.0
-; GFX6-IEEE-NEXT: v_rcp_f32_e32 v3, v2
-; GFX6-IEEE-NEXT: v_div_scale_f32 v4, vcc, 1.0, v1, 1.0
-; GFX6-IEEE-NEXT: v_fma_f32 v5, -v2, v3, 1.0
-; GFX6-IEEE-NEXT: v_fma_f32 v3, v5, v3, v3
-; GFX6-IEEE-NEXT: v_mul_f32_e32 v5, v4, v3
-; GFX6-IEEE-NEXT: v_fma_f32 v6, -v2, v5, v4
-; GFX6-IEEE-NEXT: v_fma_f32 v5, v6, v3, v5
-; GFX6-IEEE-NEXT: v_fma_f32 v2, -v2, v5, v4
-; GFX6-IEEE-NEXT: v_div_fmas_f32 v2, v2, v3, v5
-; GFX6-IEEE-NEXT: v_div_fixup_f32 v1, v2, v1, 1.0
+; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v2, v0
+; GFX6-IEEE-NEXT: v_div_scale_f32 v3, s[4:5], v2, v2, v1
+; GFX6-IEEE-NEXT: v_rcp_f32_e32 v4, v3
+; GFX6-IEEE-NEXT: v_div_scale_f32 v5, vcc, v1, v2, v1
+; GFX6-IEEE-NEXT: v_fma_f32 v6, -v3, v4, 1.0
+; GFX6-IEEE-NEXT: v_fma_f32 v4, v6, v4, v4
+; GFX6-IEEE-NEXT: v_mul_f32_e32 v6, v5, v4
+; GFX6-IEEE-NEXT: v_fma_f32 v7, -v3, v6, v5
+; GFX6-IEEE-NEXT: v_fma_f32 v6, v7, v4, v6
+; GFX6-IEEE-NEXT: v_fma_f32 v3, -v3, v6, v5
+; GFX6-IEEE-NEXT: v_div_fmas_f32 v3, v3, v4, v6
+; GFX6-IEEE-NEXT: v_div_fixup_f32 v1, v3, v2, v1
; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX6-IEEE-NEXT: s_setpc_b64 s[30:31]
;
@@ -4574,23 +4728,24 @@ define { half, half } @v_rsq_f16_multi_use(half %a) {
; GFX6-FLUSH: ; %bb.0:
; GFX6-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, 1.0
; GFX6-FLUSH-NEXT: v_sqrt_f32_e32 v0, v0
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, v0
-; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, 1.0
-; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v3, v2
-; GFX6-FLUSH-NEXT: v_div_scale_f32 v4, vcc, 1.0, v1, 1.0
+; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v2, v0
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v3, s[4:5], v2, v2, v1
+; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v4, v3
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v5, vcc, v1, v2, v1
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
-; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v2, v3, 1.0
-; GFX6-FLUSH-NEXT: v_fma_f32 v3, v5, v3, v3
-; GFX6-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v3
-; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v5, v4
-; GFX6-FLUSH-NEXT: v_fma_f32 v5, v6, v3, v5
-; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v5, v4
+; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v3, v4, 1.0
+; GFX6-FLUSH-NEXT: v_fma_f32 v4, v6, v4, v4
+; GFX6-FLUSH-NEXT: v_mul_f32_e32 v6, v5, v4
+; GFX6-FLUSH-NEXT: v_fma_f32 v7, -v3, v6, v5
+; GFX6-FLUSH-NEXT: v_fma_f32 v6, v7, v4, v6
+; GFX6-FLUSH-NEXT: v_fma_f32 v3, -v3, v6, v5
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
-; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v5
-; GFX6-FLUSH-NEXT: v_div_fixup_f32 v1, v2, v1, 1.0
+; GFX6-FLUSH-NEXT: v_div_fmas_f32 v3, v3, v4, v6
+; GFX6-FLUSH-NEXT: v_div_fixup_f32 v1, v3, v2, v1
; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX6-FLUSH-NEXT: s_setpc_b64 s[30:31]
;
@@ -4630,20 +4785,21 @@ define half @v_rsq_f16_missing_contract0(half %a) {
; GFX6-IEEE: ; %bb.0:
; GFX6-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, 1.0
; GFX6-IEEE-NEXT: v_sqrt_f32_e32 v0, v0
; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX6-IEEE-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, 1.0
-; GFX6-IEEE-NEXT: v_rcp_f32_e32 v2, v1
-; GFX6-IEEE-NEXT: v_div_scale_f32 v3, vcc, 1.0, v0, 1.0
-; GFX6-IEEE-NEXT: v_fma_f32 v4, -v1, v2, 1.0
-; GFX6-IEEE-NEXT: v_fma_f32 v2, v4, v2, v2
-; GFX6-IEEE-NEXT: v_mul_f32_e32 v4, v3, v2
-; GFX6-IEEE-NEXT: v_fma_f32 v5, -v1, v4, v3
-; GFX6-IEEE-NEXT: v_fma_f32 v4, v5, v2, v4
-; GFX6-IEEE-NEXT: v_fma_f32 v1, -v1, v4, v3
-; GFX6-IEEE-NEXT: v_div_fmas_f32 v1, v1, v2, v4
-; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v1, v0, 1.0
+; GFX6-IEEE-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, v1
+; GFX6-IEEE-NEXT: v_rcp_f32_e32 v3, v2
+; GFX6-IEEE-NEXT: v_div_scale_f32 v4, vcc, v1, v0, v1
+; GFX6-IEEE-NEXT: v_fma_f32 v5, -v2, v3, 1.0
+; GFX6-IEEE-NEXT: v_fma_f32 v3, v5, v3, v3
+; GFX6-IEEE-NEXT: v_mul_f32_e32 v5, v4, v3
+; GFX6-IEEE-NEXT: v_fma_f32 v6, -v2, v5, v4
+; GFX6-IEEE-NEXT: v_fma_f32 v5, v6, v3, v5
+; GFX6-IEEE-NEXT: v_fma_f32 v2, -v2, v5, v4
+; GFX6-IEEE-NEXT: v_div_fmas_f32 v2, v2, v3, v5
+; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v2, v0, v1
; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-IEEE-NEXT: s_setpc_b64 s[30:31]
;
@@ -4651,23 +4807,24 @@ define half @v_rsq_f16_missing_contract0(half %a) {
; GFX6-FLUSH: ; %bb.0:
; GFX6-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, 1.0
; GFX6-FLUSH-NEXT: v_sqrt_f32_e32 v0, v0
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX6-FLUSH-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, 1.0
-; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v2, v1
-; GFX6-FLUSH-NEXT: v_div_scale_f32 v3, vcc, 1.0, v0, 1.0
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, v1
+; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v3, v2
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v4, vcc, v1, v0, v1
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
-; GFX6-FLUSH-NEXT: v_fma_f32 v4, -v1, v2, 1.0
-; GFX6-FLUSH-NEXT: v_fma_f32 v2, v4, v2, v2
-; GFX6-FLUSH-NEXT: v_mul_f32_e32 v4, v3, v2
-; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v1, v4, v3
-; GFX6-FLUSH-NEXT: v_fma_f32 v4, v5, v2, v4
-; GFX6-FLUSH-NEXT: v_fma_f32 v1, -v1, v4, v3
+; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v2, v3, 1.0
+; GFX6-FLUSH-NEXT: v_fma_f32 v3, v5, v3, v3
+; GFX6-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v3
+; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v5, v4
+; GFX6-FLUSH-NEXT: v_fma_f32 v5, v6, v3, v5
+; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v5, v4
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
-; GFX6-FLUSH-NEXT: v_div_fmas_f32 v1, v1, v2, v4
-; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v1, v0, 1.0
+; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v5
+; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v2, v0, v1
; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-FLUSH-NEXT: s_setpc_b64 s[30:31]
;
@@ -4702,20 +4859,21 @@ define half @v_rsq_f16_missing_contract1(half %a) {
; GFX6-IEEE: ; %bb.0:
; GFX6-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, 1.0
; GFX6-IEEE-NEXT: v_sqrt_f32_e32 v0, v0
; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX6-IEEE-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, 1.0
-; GFX6-IEEE-NEXT: v_rcp_f32_e32 v2, v1
-; GFX6-IEEE-NEXT: v_div_scale_f32 v3, vcc, 1.0, v0, 1.0
-; GFX6-IEEE-NEXT: v_fma_f32 v4, -v1, v2, 1.0
-; GFX6-IEEE-NEXT: v_fma_f32 v2, v4, v2, v2
-; GFX6-IEEE-NEXT: v_mul_f32_e32 v4, v3, v2
-; GFX6-IEEE-NEXT: v_fma_f32 v5, -v1, v4, v3
-; GFX6-IEEE-NEXT: v_fma_f32 v4, v5, v2, v4
-; GFX6-IEEE-NEXT: v_fma_f32 v1, -v1, v4, v3
-; GFX6-IEEE-NEXT: v_div_fmas_f32 v1, v1, v2, v4
-; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v1, v0, 1.0
+; GFX6-IEEE-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, v1
+; GFX6-IEEE-NEXT: v_rcp_f32_e32 v3, v2
+; GFX6-IEEE-NEXT: v_div_scale_f32 v4, vcc, v1, v0, v1
+; GFX6-IEEE-NEXT: v_fma_f32 v5, -v2, v3, 1.0
+; GFX6-IEEE-NEXT: v_fma_f32 v3, v5, v3, v3
+; GFX6-IEEE-NEXT: v_mul_f32_e32 v5, v4, v3
+; GFX6-IEEE-NEXT: v_fma_f32 v6, -v2, v5, v4
+; GFX6-IEEE-NEXT: v_fma_f32 v5, v6, v3, v5
+; GFX6-IEEE-NEXT: v_fma_f32 v2, -v2, v5, v4
+; GFX6-IEEE-NEXT: v_div_fmas_f32 v2, v2, v3, v5
+; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v2, v0, v1
; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-IEEE-NEXT: s_setpc_b64 s[30:31]
;
@@ -4723,23 +4881,24 @@ define half @v_rsq_f16_missing_contract1(half %a) {
; GFX6-FLUSH: ; %bb.0:
; GFX6-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, 1.0
; GFX6-FLUSH-NEXT: v_sqrt_f32_e32 v0, v0
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX6-FLUSH-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, 1.0
-; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v2, v1
-; GFX6-FLUSH-NEXT: v_div_scale_f32 v3, vcc, 1.0, v0, 1.0
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, v1
+; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v3, v2
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v4, vcc, v1, v0, v1
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
-; GFX6-FLUSH-NEXT: v_fma_f32 v4, -v1, v2, 1.0
-; GFX6-FLUSH-NEXT: v_fma_f32 v2, v4, v2, v2
-; GFX6-FLUSH-NEXT: v_mul_f32_e32 v4, v3, v2
-; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v1, v4, v3
-; GFX6-FLUSH-NEXT: v_fma_f32 v4, v5, v2, v4
-; GFX6-FLUSH-NEXT: v_fma_f32 v1, -v1, v4, v3
+; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v2, v3, 1.0
+; GFX6-FLUSH-NEXT: v_fma_f32 v3, v5, v3, v3
+; GFX6-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v3
+; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v5, v4
+; GFX6-FLUSH-NEXT: v_fma_f32 v5, v6, v3, v5
+; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v5, v4
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
-; GFX6-FLUSH-NEXT: v_div_fmas_f32 v1, v1, v2, v4
-; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v1, v0, 1.0
+; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v5
+; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v2, v0, v1
; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-FLUSH-NEXT: s_setpc_b64 s[30:31]
;
@@ -4774,20 +4933,21 @@ define half @v_neg_rsq_f16_missing_contract0(half %a) {
; GFX6-IEEE: ; %bb.0:
; GFX6-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, -1.0
; GFX6-IEEE-NEXT: v_sqrt_f32_e32 v0, v0
; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX6-IEEE-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, -1.0
-; GFX6-IEEE-NEXT: v_rcp_f32_e32 v2, v1
-; GFX6-IEEE-NEXT: v_div_scale_f32 v3, vcc, -1.0, v0, -1.0
-; GFX6-IEEE-NEXT: v_fma_f32 v4, -v1, v2, 1.0
-; GFX6-IEEE-NEXT: v_fma_f32 v2, v4, v2, v2
-; GFX6-IEEE-NEXT: v_mul_f32_e32 v4, v3, v2
-; GFX6-IEEE-NEXT: v_fma_f32 v5, -v1, v4, v3
-; GFX6-IEEE-NEXT: v_fma_f32 v4, v5, v2, v4
-; GFX6-IEEE-NEXT: v_fma_f32 v1, -v1, v4, v3
-; GFX6-IEEE-NEXT: v_div_fmas_f32 v1, v1, v2, v4
-; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v1, v0, -1.0
+; GFX6-IEEE-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, v1
+; GFX6-IEEE-NEXT: v_rcp_f32_e32 v3, v2
+; GFX6-IEEE-NEXT: v_div_scale_f32 v4, vcc, v1, v0, v1
+; GFX6-IEEE-NEXT: v_fma_f32 v5, -v2, v3, 1.0
+; GFX6-IEEE-NEXT: v_fma_f32 v3, v5, v3, v3
+; GFX6-IEEE-NEXT: v_mul_f32_e32 v5, v4, v3
+; GFX6-IEEE-NEXT: v_fma_f32 v6, -v2, v5, v4
+; GFX6-IEEE-NEXT: v_fma_f32 v5, v6, v3, v5
+; GFX6-IEEE-NEXT: v_fma_f32 v2, -v2, v5, v4
+; GFX6-IEEE-NEXT: v_div_fmas_f32 v2, v2, v3, v5
+; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v2, v0, v1
; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-IEEE-NEXT: s_setpc_b64 s[30:31]
;
@@ -4795,23 +4955,24 @@ define half @v_neg_rsq_f16_missing_contract0(half %a) {
; GFX6-FLUSH: ; %bb.0:
; GFX6-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, -1.0
; GFX6-FLUSH-NEXT: v_sqrt_f32_e32 v0, v0
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX6-FLUSH-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, -1.0
-; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v2, v1
-; GFX6-FLUSH-NEXT: v_div_scale_f32 v3, vcc, -1.0, v0, -1.0
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, v1
+; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v3, v2
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v4, vcc, v1, v0, v1
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
-; GFX6-FLUSH-NEXT: v_fma_f32 v4, -v1, v2, 1.0
-; GFX6-FLUSH-NEXT: v_fma_f32 v2, v4, v2, v2
-; GFX6-FLUSH-NEXT: v_mul_f32_e32 v4, v3, v2
-; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v1, v4, v3
-; GFX6-FLUSH-NEXT: v_fma_f32 v4, v5, v2, v4
-; GFX6-FLUSH-NEXT: v_fma_f32 v1, -v1, v4, v3
+; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v2, v3, 1.0
+; GFX6-FLUSH-NEXT: v_fma_f32 v3, v5, v3, v3
+; GFX6-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v3
+; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v5, v4
+; GFX6-FLUSH-NEXT: v_fma_f32 v5, v6, v3, v5
+; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v5, v4
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
-; GFX6-FLUSH-NEXT: v_div_fmas_f32 v1, v1, v2, v4
-; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v1, v0, -1.0
+; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v5
+; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v2, v0, v1
; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-FLUSH-NEXT: s_setpc_b64 s[30:31]
;
@@ -4846,20 +5007,21 @@ define half @v_neg_rsq_f16_missing_contract1(half %a) {
; GFX6-IEEE: ; %bb.0:
; GFX6-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, -1.0
; GFX6-IEEE-NEXT: v_sqrt_f32_e32 v0, v0
; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX6-IEEE-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, -1.0
-; GFX6-IEEE-NEXT: v_rcp_f32_e32 v2, v1
-; GFX6-IEEE-NEXT: v_div_scale_f32 v3, vcc, -1.0, v0, -1.0
-; GFX6-IEEE-NEXT: v_fma_f32 v4, -v1, v2, 1.0
-; GFX6-IEEE-NEXT: v_fma_f32 v2, v4, v2, v2
-; GFX6-IEEE-NEXT: v_mul_f32_e32 v4, v3, v2
-; GFX6-IEEE-NEXT: v_fma_f32 v5, -v1, v4, v3
-; GFX6-IEEE-NEXT: v_fma_f32 v4, v5, v2, v4
-; GFX6-IEEE-NEXT: v_fma_f32 v1, -v1, v4, v3
-; GFX6-IEEE-NEXT: v_div_fmas_f32 v1, v1, v2, v4
-; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v1, v0, -1.0
+; GFX6-IEEE-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, v1
+; GFX6-IEEE-NEXT: v_rcp_f32_e32 v3, v2
+; GFX6-IEEE-NEXT: v_div_scale_f32 v4, vcc, v1, v0, v1
+; GFX6-IEEE-NEXT: v_fma_f32 v5, -v2, v3, 1.0
+; GFX6-IEEE-NEXT: v_fma_f32 v3, v5, v3, v3
+; GFX6-IEEE-NEXT: v_mul_f32_e32 v5, v4, v3
+; GFX6-IEEE-NEXT: v_fma_f32 v6, -v2, v5, v4
+; GFX6-IEEE-NEXT: v_fma_f32 v5, v6, v3, v5
+; GFX6-IEEE-NEXT: v_fma_f32 v2, -v2, v5, v4
+; GFX6-IEEE-NEXT: v_div_fmas_f32 v2, v2, v3, v5
+; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v2, v0, v1
; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-IEEE-NEXT: s_setpc_b64 s[30:31]
;
@@ -4867,23 +5029,24 @@ define half @v_neg_rsq_f16_missing_contract1(half %a) {
; GFX6-FLUSH: ; %bb.0:
; GFX6-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, -1.0
; GFX6-FLUSH-NEXT: v_sqrt_f32_e32 v0, v0
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX6-FLUSH-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, -1.0
-; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v2, v1
-; GFX6-FLUSH-NEXT: v_div_scale_f32 v3, vcc, -1.0, v0, -1.0
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, v1
+; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v3, v2
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v4, vcc, v1, v0, v1
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
-; GFX6-FLUSH-NEXT: v_fma_f32 v4, -v1, v2, 1.0
-; GFX6-FLUSH-NEXT: v_fma_f32 v2, v4, v2, v2
-; GFX6-FLUSH-NEXT: v_mul_f32_e32 v4, v3, v2
-; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v1, v4, v3
-; GFX6-FLUSH-NEXT: v_fma_f32 v4, v5, v2, v4
-; GFX6-FLUSH-NEXT: v_fma_f32 v1, -v1, v4, v3
+; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v2, v3, 1.0
+; GFX6-FLUSH-NEXT: v_fma_f32 v3, v5, v3, v3
+; GFX6-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v3
+; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v5, v4
+; GFX6-FLUSH-NEXT: v_fma_f32 v5, v6, v3, v5
+; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v5, v4
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
-; GFX6-FLUSH-NEXT: v_div_fmas_f32 v1, v1, v2, v4
-; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v1, v0, -1.0
+; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v5
+; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v2, v0, v1
; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-FLUSH-NEXT: s_setpc_b64 s[30:31]
;
@@ -4918,20 +5081,21 @@ define half @v_neg_rsq_f16_fabs(half %a) {
; GFX6-IEEE: ; %bb.0:
; GFX6-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-IEEE-NEXT: v_cvt_f32_f16_e64 v0, |v0|
+; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, -1.0
; GFX6-IEEE-NEXT: v_sqrt_f32_e32 v0, v0
; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX6-IEEE-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, -1.0
-; GFX6-IEEE-NEXT: v_rcp_f32_e32 v2, v1
-; GFX6-IEEE-NEXT: v_div_scale_f32 v3, vcc, -1.0, v0, -1.0
-; GFX6-IEEE-NEXT: v_fma_f32 v4, -v1, v2, 1.0
-; GFX6-IEEE-NEXT: v_fma_f32 v2, v4, v2, v2
-; GFX6-IEEE-NEXT: v_mul_f32_e32 v4, v3, v2
-; GFX6-IEEE-NEXT: v_fma_f32 v5, -v1, v4, v3
-; GFX6-IEEE-NEXT: v_fma_f32 v4, v5, v2, v4
-; GFX6-IEEE-NEXT: v_fma_f32 v1, -v1, v4, v3
-; GFX6-IEEE-NEXT: v_div_fmas_f32 v1, v1, v2, v4
-; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v1, v0, -1.0
+; GFX6-IEEE-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, v1
+; GFX6-IEEE-NEXT: v_rcp_f32_e32 v3, v2
+; GFX6-IEEE-NEXT: v_div_scale_f32 v4, vcc, v1, v0, v1
+; GFX6-IEEE-NEXT: v_fma_f32 v5, -v2, v3, 1.0
+; GFX6-IEEE-NEXT: v_fma_f32 v3, v5, v3, v3
+; GFX6-IEEE-NEXT: v_mul_f32_e32 v5, v4, v3
+; GFX6-IEEE-NEXT: v_fma_f32 v6, -v2, v5, v4
+; GFX6-IEEE-NEXT: v_fma_f32 v5, v6, v3, v5
+; GFX6-IEEE-NEXT: v_fma_f32 v2, -v2, v5, v4
+; GFX6-IEEE-NEXT: v_div_fmas_f32 v2, v2, v3, v5
+; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v2, v0, v1
; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-IEEE-NEXT: s_setpc_b64 s[30:31]
;
@@ -4939,23 +5103,24 @@ define half @v_neg_rsq_f16_fabs(half %a) {
; GFX6-FLUSH: ; %bb.0:
; GFX6-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e64 v0, |v0|
+; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, -1.0
; GFX6-FLUSH-NEXT: v_sqrt_f32_e32 v0, v0
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX6-FLUSH-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, -1.0
-; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v2, v1
-; GFX6-FLUSH-NEXT: v_div_scale_f32 v3, vcc, -1.0, v0, -1.0
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, v1
+; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v3, v2
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v4, vcc, v1, v0, v1
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
-; GFX6-FLUSH-NEXT: v_fma_f32 v4, -v1, v2, 1.0
-; GFX6-FLUSH-NEXT: v_fma_f32 v2, v4, v2, v2
-; GFX6-FLUSH-NEXT: v_mul_f32_e32 v4, v3, v2
-; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v1, v4, v3
-; GFX6-FLUSH-NEXT: v_fma_f32 v4, v5, v2, v4
-; GFX6-FLUSH-NEXT: v_fma_f32 v1, -v1, v4, v3
+; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v2, v3, 1.0
+; GFX6-FLUSH-NEXT: v_fma_f32 v3, v5, v3, v3
+; GFX6-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v3
+; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v5, v4
+; GFX6-FLUSH-NEXT: v_fma_f32 v5, v6, v3, v5
+; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v5, v4
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
-; GFX6-FLUSH-NEXT: v_div_fmas_f32 v1, v1, v2, v4
-; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v1, v0, -1.0
+; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v5
+; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v2, v0, v1
; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-FLUSH-NEXT: s_setpc_b64 s[30:31]
;
@@ -4991,20 +5156,21 @@ define half @v_rsq_f16_arcp(half %a) {
; GFX6-IEEE: ; %bb.0:
; GFX6-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, 1.0
; GFX6-IEEE-NEXT: v_sqrt_f32_e32 v0, v0
; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX6-IEEE-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, 1.0
-; GFX6-IEEE-NEXT: v_rcp_f32_e32 v2, v1
-; GFX6-IEEE-NEXT: v_div_scale_f32 v3, vcc, 1.0, v0, 1.0
-; GFX6-IEEE-NEXT: v_fma_f32 v4, -v1, v2, 1.0
-; GFX6-IEEE-NEXT: v_fma_f32 v2, v4, v2, v2
-; GFX6-IEEE-NEXT: v_mul_f32_e32 v4, v3, v2
-; GFX6-IEEE-NEXT: v_fma_f32 v5, -v1, v4, v3
-; GFX6-IEEE-NEXT: v_fma_f32 v4, v5, v2, v4
-; GFX6-IEEE-NEXT: v_fma_f32 v1, -v1, v4, v3
-; GFX6-IEEE-NEXT: v_div_fmas_f32 v1, v1, v2, v4
-; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v1, v0, 1.0
+; GFX6-IEEE-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, v1
+; GFX6-IEEE-NEXT: v_rcp_f32_e32 v3, v2
+; GFX6-IEEE-NEXT: v_div_scale_f32 v4, vcc, v1, v0, v1
+; GFX6-IEEE-NEXT: v_fma_f32 v5, -v2, v3, 1.0
+; GFX6-IEEE-NEXT: v_fma_f32 v3, v5, v3, v3
+; GFX6-IEEE-NEXT: v_mul_f32_e32 v5, v4, v3
+; GFX6-IEEE-NEXT: v_fma_f32 v6, -v2, v5, v4
+; GFX6-IEEE-NEXT: v_fma_f32 v5, v6, v3, v5
+; GFX6-IEEE-NEXT: v_fma_f32 v2, -v2, v5, v4
+; GFX6-IEEE-NEXT: v_div_fmas_f32 v2, v2, v3, v5
+; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v2, v0, v1
; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-IEEE-NEXT: s_setpc_b64 s[30:31]
;
@@ -5012,23 +5178,24 @@ define half @v_rsq_f16_arcp(half %a) {
; GFX6-FLUSH: ; %bb.0:
; GFX6-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, 1.0
; GFX6-FLUSH-NEXT: v_sqrt_f32_e32 v0, v0
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX6-FLUSH-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, 1.0
-; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v2, v1
-; GFX6-FLUSH-NEXT: v_div_scale_f32 v3, vcc, 1.0, v0, 1.0
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, v1
+; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v3, v2
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v4, vcc, v1, v0, v1
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
-; GFX6-FLUSH-NEXT: v_fma_f32 v4, -v1, v2, 1.0
-; GFX6-FLUSH-NEXT: v_fma_f32 v2, v4, v2, v2
-; GFX6-FLUSH-NEXT: v_mul_f32_e32 v4, v3, v2
-; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v1, v4, v3
-; GFX6-FLUSH-NEXT: v_fma_f32 v4, v5, v2, v4
-; GFX6-FLUSH-NEXT: v_fma_f32 v1, -v1, v4, v3
+; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v2, v3, 1.0
+; GFX6-FLUSH-NEXT: v_fma_f32 v3, v5, v3, v3
+; GFX6-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v3
+; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v5, v4
+; GFX6-FLUSH-NEXT: v_fma_f32 v5, v6, v3, v5
+; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v5, v4
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
-; GFX6-FLUSH-NEXT: v_div_fmas_f32 v1, v1, v2, v4
-; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v1, v0, 1.0
+; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v5
+; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v2, v0, v1
; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-FLUSH-NEXT: s_setpc_b64 s[30:31]
;
@@ -5053,20 +5220,21 @@ define half @v_neg_rsq_f16_arcp(half %a) {
; GFX6-IEEE: ; %bb.0:
; GFX6-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, -1.0
; GFX6-IEEE-NEXT: v_sqrt_f32_e32 v0, v0
; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX6-IEEE-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, -1.0
-; GFX6-IEEE-NEXT: v_rcp_f32_e32 v2, v1
-; GFX6-IEEE-NEXT: v_div_scale_f32 v3, vcc, -1.0, v0, -1.0
-; GFX6-IEEE-NEXT: v_fma_f32 v4, -v1, v2, 1.0
-; GFX6-IEEE-NEXT: v_fma_f32 v2, v4, v2, v2
-; GFX6-IEEE-NEXT: v_mul_f32_e32 v4, v3, v2
-; GFX6-IEEE-NEXT: v_fma_f32 v5, -v1, v4, v3
-; GFX6-IEEE-NEXT: v_fma_f32 v4, v5, v2, v4
-; GFX6-IEEE-NEXT: v_fma_f32 v1, -v1, v4, v3
-; GFX6-IEEE-NEXT: v_div_fmas_f32 v1, v1, v2, v4
-; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v1, v0, -1.0
+; GFX6-IEEE-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, v1
+; GFX6-IEEE-NEXT: v_rcp_f32_e32 v3, v2
+; GFX6-IEEE-NEXT: v_div_scale_f32 v4, vcc, v1, v0, v1
+; GFX6-IEEE-NEXT: v_fma_f32 v5, -v2, v3, 1.0
+; GFX6-IEEE-NEXT: v_fma_f32 v3, v5, v3, v3
+; GFX6-IEEE-NEXT: v_mul_f32_e32 v5, v4, v3
+; GFX6-IEEE-NEXT: v_fma_f32 v6, -v2, v5, v4
+; GFX6-IEEE-NEXT: v_fma_f32 v5, v6, v3, v5
+; GFX6-IEEE-NEXT: v_fma_f32 v2, -v2, v5, v4
+; GFX6-IEEE-NEXT: v_div_fmas_f32 v2, v2, v3, v5
+; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v2, v0, v1
; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-IEEE-NEXT: s_setpc_b64 s[30:31]
;
@@ -5074,23 +5242,24 @@ define half @v_neg_rsq_f16_arcp(half %a) {
; GFX6-FLUSH: ; %bb.0:
; GFX6-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, -1.0
; GFX6-FLUSH-NEXT: v_sqrt_f32_e32 v0, v0
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX6-FLUSH-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, -1.0
-; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v2, v1
-; GFX6-FLUSH-NEXT: v_div_scale_f32 v3, vcc, -1.0, v0, -1.0
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, v1
+; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v3, v2
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v4, vcc, v1, v0, v1
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
-; GFX6-FLUSH-NEXT: v_fma_f32 v4, -v1, v2, 1.0
-; GFX6-FLUSH-NEXT: v_fma_f32 v2, v4, v2, v2
-; GFX6-FLUSH-NEXT: v_mul_f32_e32 v4, v3, v2
-; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v1, v4, v3
-; GFX6-FLUSH-NEXT: v_fma_f32 v4, v5, v2, v4
-; GFX6-FLUSH-NEXT: v_fma_f32 v1, -v1, v4, v3
+; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v2, v3, 1.0
+; GFX6-FLUSH-NEXT: v_fma_f32 v3, v5, v3, v3
+; GFX6-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v3
+; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v5, v4
+; GFX6-FLUSH-NEXT: v_fma_f32 v5, v6, v3, v5
+; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v5, v4
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
-; GFX6-FLUSH-NEXT: v_div_fmas_f32 v1, v1, v2, v4
-; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v1, v0, -1.0
+; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v5
+; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v2, v0, v1
; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-FLUSH-NEXT: s_setpc_b64 s[30:31]
;
@@ -5125,10 +5294,12 @@ define half @v_rsq_f16_afn(half %a) {
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX6-NEXT: v_cvt_f32_f16_e32 v1, 1.0
; GFX6-NEXT: v_sqrt_f32_e32 v0, v0
; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX6-NEXT: v_rcp_f32_e32 v0, v0
+; GFX6-NEXT: v_mul_f32_e32 v0, v1, v0
; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-NEXT: s_setpc_b64 s[30:31]
;
@@ -5153,10 +5324,12 @@ define half @v_rsq_f16_afn_nocontract(half %a) {
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX6-NEXT: v_cvt_f32_f16_e32 v1, 1.0
; GFX6-NEXT: v_sqrt_f32_e32 v0, v0
; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX6-NEXT: v_rcp_f32_e32 v0, v0
+; GFX6-NEXT: v_mul_f32_e32 v0, v1, v0
; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-NEXT: s_setpc_b64 s[30:31]
;
@@ -5192,35 +5365,36 @@ define <2 x half> @v_rsq_v2f16(<2 x half> %a) {
; GFX6-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, v1
+; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v2, 1.0
; GFX6-IEEE-NEXT: v_sqrt_f32_e32 v0, v0
; GFX6-IEEE-NEXT: v_sqrt_f32_e32 v1, v1
; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, v1
-; GFX6-IEEE-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, 1.0
-; GFX6-IEEE-NEXT: v_rcp_f32_e32 v5, v2
-; GFX6-IEEE-NEXT: v_div_scale_f32 v3, vcc, 1.0, v0, 1.0
-; GFX6-IEEE-NEXT: v_div_scale_f32 v4, s[4:5], v1, v1, 1.0
-; GFX6-IEEE-NEXT: v_fma_f32 v8, -v2, v5, 1.0
-; GFX6-IEEE-NEXT: v_fma_f32 v5, v8, v5, v5
-; GFX6-IEEE-NEXT: v_rcp_f32_e32 v6, v4
-; GFX6-IEEE-NEXT: v_mul_f32_e32 v8, v3, v5
-; GFX6-IEEE-NEXT: v_fma_f32 v10, -v2, v8, v3
-; GFX6-IEEE-NEXT: v_fma_f32 v8, v10, v5, v8
-; GFX6-IEEE-NEXT: v_fma_f32 v2, -v2, v8, v3
-; GFX6-IEEE-NEXT: v_fma_f32 v9, -v4, v6, 1.0
-; GFX6-IEEE-NEXT: v_div_fmas_f32 v2, v2, v5, v8
-; GFX6-IEEE-NEXT: v_div_scale_f32 v7, s[4:5], 1.0, v1, 1.0
-; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v2, v0, 1.0
-; GFX6-IEEE-NEXT: v_fma_f32 v2, v9, v6, v6
-; GFX6-IEEE-NEXT: v_mul_f32_e32 v3, v7, v2
-; GFX6-IEEE-NEXT: v_fma_f32 v5, -v4, v3, v7
-; GFX6-IEEE-NEXT: v_fma_f32 v3, v5, v2, v3
-; GFX6-IEEE-NEXT: v_fma_f32 v4, -v4, v3, v7
+; GFX6-IEEE-NEXT: v_div_scale_f32 v3, s[4:5], v0, v0, v2
+; GFX6-IEEE-NEXT: v_rcp_f32_e32 v6, v3
+; GFX6-IEEE-NEXT: v_div_scale_f32 v4, vcc, v2, v0, v2
+; GFX6-IEEE-NEXT: v_div_scale_f32 v5, s[4:5], v1, v1, v2
+; GFX6-IEEE-NEXT: v_fma_f32 v9, -v3, v6, 1.0
+; GFX6-IEEE-NEXT: v_fma_f32 v6, v9, v6, v6
+; GFX6-IEEE-NEXT: v_mul_f32_e32 v9, v4, v6
+; GFX6-IEEE-NEXT: v_rcp_f32_e32 v8, v5
+; GFX6-IEEE-NEXT: v_fma_f32 v10, -v3, v9, v4
+; GFX6-IEEE-NEXT: v_fma_f32 v9, v10, v6, v9
+; GFX6-IEEE-NEXT: v_fma_f32 v3, -v3, v9, v4
+; GFX6-IEEE-NEXT: v_div_fmas_f32 v3, v3, v6, v9
+; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v3, v0, v2
+; GFX6-IEEE-NEXT: v_fma_f32 v3, -v5, v8, 1.0
+; GFX6-IEEE-NEXT: v_div_scale_f32 v7, s[4:5], v2, v1, v2
+; GFX6-IEEE-NEXT: v_fma_f32 v3, v3, v8, v8
+; GFX6-IEEE-NEXT: v_mul_f32_e32 v4, v7, v3
+; GFX6-IEEE-NEXT: v_fma_f32 v6, -v5, v4, v7
+; GFX6-IEEE-NEXT: v_fma_f32 v4, v6, v3, v4
+; GFX6-IEEE-NEXT: v_fma_f32 v5, -v5, v4, v7
; GFX6-IEEE-NEXT: s_mov_b64 vcc, s[4:5]
-; GFX6-IEEE-NEXT: v_div_fmas_f32 v2, v4, v2, v3
-; GFX6-IEEE-NEXT: v_div_fixup_f32 v1, v2, v1, 1.0
+; GFX6-IEEE-NEXT: v_div_fmas_f32 v3, v5, v3, v4
+; GFX6-IEEE-NEXT: v_div_fixup_f32 v1, v3, v1, v2
; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX6-IEEE-NEXT: s_setpc_b64 s[30:31]
@@ -5230,40 +5404,42 @@ define <2 x half> @v_rsq_v2f16(<2 x half> %a) {
; GFX6-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, v1
+; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v2, 1.0
; GFX6-FLUSH-NEXT: v_sqrt_f32_e32 v0, v0
; GFX6-FLUSH-NEXT: v_sqrt_f32_e32 v1, v1
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, 1.0
-; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v3, v2
-; GFX6-FLUSH-NEXT: v_div_scale_f32 v4, vcc, 1.0, v0, 1.0
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v3, s[4:5], v0, v0, v2
+; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v4, v3
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v5, vcc, v2, v0, v2
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
-; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v2, v3, 1.0
-; GFX6-FLUSH-NEXT: v_fma_f32 v3, v5, v3, v3
-; GFX6-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v3
-; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v5, v4
-; GFX6-FLUSH-NEXT: v_fma_f32 v5, v6, v3, v5
-; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v5, v4
+; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v3, v4, 1.0
+; GFX6-FLUSH-NEXT: v_fma_f32 v4, v6, v4, v4
+; GFX6-FLUSH-NEXT: v_mul_f32_e32 v6, v5, v4
+; GFX6-FLUSH-NEXT: v_fma_f32 v7, -v3, v6, v5
+; GFX6-FLUSH-NEXT: v_fma_f32 v6, v7, v4, v6
+; GFX6-FLUSH-NEXT: v_fma_f32 v3, -v3, v6, v5
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
+; GFX6-FLUSH-NEXT: v_div_fmas_f32 v3, v3, v4, v6
+; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v4, 1.0
; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, v1
-; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v5
-; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v2, v0, 1.0
+; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v3, v0, v2
; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, 1.0
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, v4
; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v3, v2
-; GFX6-FLUSH-NEXT: v_div_scale_f32 v4, vcc, 1.0, v1, 1.0
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v5, vcc, v4, v1, v4
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
-; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v2, v3, 1.0
-; GFX6-FLUSH-NEXT: v_fma_f32 v3, v5, v3, v3
-; GFX6-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v3
-; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v5, v4
-; GFX6-FLUSH-NEXT: v_fma_f32 v5, v6, v3, v5
-; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v5, v4
+; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v3, 1.0
+; GFX6-FLUSH-NEXT: v_fma_f32 v3, v6, v3, v3
+; GFX6-FLUSH-NEXT: v_mul_f32_e32 v6, v5, v3
+; GFX6-FLUSH-NEXT: v_fma_f32 v7, -v2, v6, v5
+; GFX6-FLUSH-NEXT: v_fma_f32 v6, v7, v3, v6
+; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v6, v5
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
-; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v5
-; GFX6-FLUSH-NEXT: v_div_fixup_f32 v1, v2, v1, 1.0
+; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v6
+; GFX6-FLUSH-NEXT: v_div_fixup_f32 v1, v2, v1, v4
; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX6-FLUSH-NEXT: s_setpc_b64 s[30:31]
;
@@ -5272,28 +5448,31 @@ define <2 x half> @v_rsq_v2f16(<2 x half> %a) {
; GFX8-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-IEEE-NEXT: v_sqrt_f16_e32 v1, v0
; GFX8-IEEE-NEXT: v_sqrt_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX8-IEEE-NEXT: v_cvt_f32_f16_e32 v4, 1.0
; GFX8-IEEE-NEXT: v_cvt_f32_f16_e32 v2, v1
; GFX8-IEEE-NEXT: v_cvt_f32_f16_e32 v3, v0
-; GFX8-IEEE-NEXT: v_rcp_f32_e32 v4, v2
-; GFX8-IEEE-NEXT: v_rcp_f32_e32 v5, v3
-; GFX8-IEEE-NEXT: v_mul_f32_e64 v6, -v2, v4
-; GFX8-IEEE-NEXT: v_mul_f32_e64 v7, -v3, v5
-; GFX8-IEEE-NEXT: v_add_f32_e32 v7, 1.0, v7
-; GFX8-IEEE-NEXT: v_add_f32_e32 v6, 1.0, v6
-; GFX8-IEEE-NEXT: v_mul_f32_e32 v7, v7, v5
-; GFX8-IEEE-NEXT: v_mul_f32_e32 v6, v6, v4
-; GFX8-IEEE-NEXT: v_add_f32_e32 v7, v7, v5
-; GFX8-IEEE-NEXT: v_add_f32_e32 v6, v6, v4
-; GFX8-IEEE-NEXT: v_mul_f32_e64 v3, -v3, v7
-; GFX8-IEEE-NEXT: v_mul_f32_e64 v2, -v2, v6
-; GFX8-IEEE-NEXT: v_add_f32_e32 v3, 1.0, v3
-; GFX8-IEEE-NEXT: v_add_f32_e32 v2, 1.0, v2
-; GFX8-IEEE-NEXT: v_mul_f32_e32 v3, v3, v5
-; GFX8-IEEE-NEXT: v_mul_f32_e32 v2, v2, v4
+; GFX8-IEEE-NEXT: v_rcp_f32_e32 v5, v2
+; GFX8-IEEE-NEXT: v_rcp_f32_e32 v6, v3
+; GFX8-IEEE-NEXT: v_mul_f32_e32 v7, v4, v5
+; GFX8-IEEE-NEXT: v_mul_f32_e32 v8, v4, v6
+; GFX8-IEEE-NEXT: v_mul_f32_e64 v10, -v3, v8
+; GFX8-IEEE-NEXT: v_mul_f32_e64 v9, -v2, v7
+; GFX8-IEEE-NEXT: v_add_f32_e32 v10, v10, v4
+; GFX8-IEEE-NEXT: v_add_f32_e32 v9, v9, v4
+; GFX8-IEEE-NEXT: v_mul_f32_e32 v10, v10, v6
+; GFX8-IEEE-NEXT: v_mul_f32_e32 v9, v9, v5
+; GFX8-IEEE-NEXT: v_add_f32_e32 v8, v10, v8
+; GFX8-IEEE-NEXT: v_add_f32_e32 v7, v9, v7
+; GFX8-IEEE-NEXT: v_mul_f32_e64 v3, -v3, v8
+; GFX8-IEEE-NEXT: v_mul_f32_e64 v2, -v2, v7
+; GFX8-IEEE-NEXT: v_add_f32_e32 v3, v3, v4
+; GFX8-IEEE-NEXT: v_add_f32_e32 v2, v2, v4
+; GFX8-IEEE-NEXT: v_mul_f32_e32 v3, v3, v6
+; GFX8-IEEE-NEXT: v_mul_f32_e32 v2, v2, v5
; GFX8-IEEE-NEXT: v_and_b32_e32 v3, 0xff800000, v3
; GFX8-IEEE-NEXT: v_and_b32_e32 v2, 0xff800000, v2
-; GFX8-IEEE-NEXT: v_add_f32_e32 v3, v3, v7
-; GFX8-IEEE-NEXT: v_add_f32_e32 v2, v2, v6
+; GFX8-IEEE-NEXT: v_add_f32_e32 v3, v3, v8
+; GFX8-IEEE-NEXT: v_add_f32_e32 v2, v2, v7
; GFX8-IEEE-NEXT: v_cvt_f16_f32_e32 v3, v3
; GFX8-IEEE-NEXT: v_cvt_f16_f32_e32 v2, v2
; GFX8-IEEE-NEXT: v_div_fixup_f16 v0, v3, v0, 1.0
@@ -5307,22 +5486,25 @@ define <2 x half> @v_rsq_v2f16(<2 x half> %a) {
; GFX8-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-FLUSH-NEXT: v_sqrt_f16_e32 v1, v0
; GFX8-FLUSH-NEXT: v_sqrt_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX8-FLUSH-NEXT: v_cvt_f32_f16_e32 v4, 1.0
; GFX8-FLUSH-NEXT: v_cvt_f32_f16_e32 v2, v1
; GFX8-FLUSH-NEXT: v_cvt_f32_f16_e32 v3, v0
-; GFX8-FLUSH-NEXT: v_rcp_f32_e32 v4, v2
-; GFX8-FLUSH-NEXT: v_rcp_f32_e32 v5, v3
-; GFX8-FLUSH-NEXT: v_mad_f32 v6, -v2, v4, 1.0
-; GFX8-FLUSH-NEXT: v_mad_f32 v7, -v3, v5, 1.0
-; GFX8-FLUSH-NEXT: v_mad_f32 v7, v7, v5, v5
-; GFX8-FLUSH-NEXT: v_mad_f32 v6, v6, v4, v4
-; GFX8-FLUSH-NEXT: v_mad_f32 v3, -v3, v7, 1.0
-; GFX8-FLUSH-NEXT: v_mad_f32 v2, -v2, v6, 1.0
-; GFX8-FLUSH-NEXT: v_mul_f32_e32 v3, v3, v5
-; GFX8-FLUSH-NEXT: v_mul_f32_e32 v2, v2, v4
+; GFX8-FLUSH-NEXT: v_rcp_f32_e32 v5, v2
+; GFX8-FLUSH-NEXT: v_rcp_f32_e32 v6, v3
+; GFX8-FLUSH-NEXT: v_mul_f32_e32 v7, v4, v5
+; GFX8-FLUSH-NEXT: v_mul_f32_e32 v8, v4, v6
+; GFX8-FLUSH-NEXT: v_mad_f32 v10, -v3, v8, v4
+; GFX8-FLUSH-NEXT: v_mad_f32 v9, -v2, v7, v4
+; GFX8-FLUSH-NEXT: v_mac_f32_e32 v8, v10, v6
+; GFX8-FLUSH-NEXT: v_mac_f32_e32 v7, v9, v5
+; GFX8-FLUSH-NEXT: v_mad_f32 v3, -v3, v8, v4
+; GFX8-FLUSH-NEXT: v_mad_f32 v2, -v2, v7, v4
+; GFX8-FLUSH-NEXT: v_mul_f32_e32 v3, v3, v6
+; GFX8-FLUSH-NEXT: v_mul_f32_e32 v2, v2, v5
; GFX8-FLUSH-NEXT: v_and_b32_e32 v3, 0xff800000, v3
; GFX8-FLUSH-NEXT: v_and_b32_e32 v2, 0xff800000, v2
-; GFX8-FLUSH-NEXT: v_add_f32_e32 v3, v3, v7
-; GFX8-FLUSH-NEXT: v_add_f32_e32 v2, v2, v6
+; GFX8-FLUSH-NEXT: v_add_f32_e32 v3, v3, v8
+; GFX8-FLUSH-NEXT: v_add_f32_e32 v2, v2, v7
; GFX8-FLUSH-NEXT: v_cvt_f16_f32_e32 v3, v3
; GFX8-FLUSH-NEXT: v_cvt_f16_f32_e32 v2, v2
; GFX8-FLUSH-NEXT: v_div_fixup_f16 v0, v3, v0, 1.0
@@ -5336,22 +5518,25 @@ define <2 x half> @v_rsq_v2f16(<2 x half> %a) {
; GFX9-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-IEEE-NEXT: v_sqrt_f16_e32 v1, v0
; GFX9-IEEE-NEXT: v_sqrt_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v4, 1.0
; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v2, v1
; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v3, v0
-; GFX9-IEEE-NEXT: v_rcp_f32_e32 v4, v2
-; GFX9-IEEE-NEXT: v_rcp_f32_e32 v5, v3
-; GFX9-IEEE-NEXT: v_fma_f32 v6, -v2, v4, 1.0
-; GFX9-IEEE-NEXT: v_fma_f32 v7, -v3, v5, 1.0
-; GFX9-IEEE-NEXT: v_fma_f32 v6, v6, v4, v4
-; GFX9-IEEE-NEXT: v_fma_f32 v7, v7, v5, v5
-; GFX9-IEEE-NEXT: v_fma_f32 v2, -v2, v6, 1.0
-; GFX9-IEEE-NEXT: v_fma_f32 v3, -v3, v7, 1.0
-; GFX9-IEEE-NEXT: v_mul_f32_e32 v2, v2, v4
-; GFX9-IEEE-NEXT: v_mul_f32_e32 v3, v3, v5
+; GFX9-IEEE-NEXT: v_rcp_f32_e32 v5, v2
+; GFX9-IEEE-NEXT: v_rcp_f32_e32 v6, v3
+; GFX9-IEEE-NEXT: v_mul_f32_e32 v7, v4, v5
+; GFX9-IEEE-NEXT: v_mul_f32_e32 v8, v4, v6
+; GFX9-IEEE-NEXT: v_fma_f32 v9, -v2, v7, v4
+; GFX9-IEEE-NEXT: v_fma_f32 v10, -v3, v8, v4
+; GFX9-IEEE-NEXT: v_fma_f32 v7, v9, v5, v7
+; GFX9-IEEE-NEXT: v_fma_f32 v8, v10, v6, v8
+; GFX9-IEEE-NEXT: v_fma_f32 v2, -v2, v7, v4
+; GFX9-IEEE-NEXT: v_fma_f32 v3, -v3, v8, v4
+; GFX9-IEEE-NEXT: v_mul_f32_e32 v2, v2, v5
+; GFX9-IEEE-NEXT: v_mul_f32_e32 v3, v3, v6
; GFX9-IEEE-NEXT: v_and_b32_e32 v2, 0xff800000, v2
; GFX9-IEEE-NEXT: v_and_b32_e32 v3, 0xff800000, v3
-; GFX9-IEEE-NEXT: v_add_f32_e32 v2, v2, v6
-; GFX9-IEEE-NEXT: v_add_f32_e32 v3, v3, v7
+; GFX9-IEEE-NEXT: v_add_f32_e32 v2, v2, v7
+; GFX9-IEEE-NEXT: v_add_f32_e32 v3, v3, v8
; GFX9-IEEE-NEXT: v_cvt_f16_f32_e32 v2, v2
; GFX9-IEEE-NEXT: v_cvt_f16_f32_e32 v3, v3
; GFX9-IEEE-NEXT: v_div_fixup_f16 v1, v2, v1, 1.0
@@ -5364,23 +5549,25 @@ define <2 x half> @v_rsq_v2f16(<2 x half> %a) {
; GFX9-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-FLUSH-NEXT: v_sqrt_f16_e32 v1, v0
; GFX9-FLUSH-NEXT: v_sqrt_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX9-FLUSH-NEXT: v_mov_b32_e32 v4, 1.0
+; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v4, 1.0
; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v2, v1
; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v3, v0
; GFX9-FLUSH-NEXT: v_rcp_f32_e32 v2, v2
; GFX9-FLUSH-NEXT: v_rcp_f32_e32 v3, v3
-; GFX9-FLUSH-NEXT: v_mad_mix_f32 v5, -v1, v2, v4 op_sel_hi:[1,0,0]
-; GFX9-FLUSH-NEXT: v_mad_mix_f32 v6, -v0, v3, v4 op_sel_hi:[1,0,0]
-; GFX9-FLUSH-NEXT: v_mad_f32 v5, v5, v2, v2
-; GFX9-FLUSH-NEXT: v_mad_f32 v6, v6, v3, v3
-; GFX9-FLUSH-NEXT: v_mad_mix_f32 v7, -v1, v5, v4 op_sel_hi:[1,0,0]
-; GFX9-FLUSH-NEXT: v_mad_mix_f32 v4, -v0, v6, v4 op_sel_hi:[1,0,0]
-; GFX9-FLUSH-NEXT: v_mul_f32_e32 v2, v7, v2
-; GFX9-FLUSH-NEXT: v_mul_f32_e32 v3, v4, v3
+; GFX9-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v2
+; GFX9-FLUSH-NEXT: v_mul_f32_e32 v4, v4, v3
+; GFX9-FLUSH-NEXT: v_mad_mix_f32 v6, -v1, v5, 1.0 op_sel_hi:[1,0,1]
+; GFX9-FLUSH-NEXT: v_mad_mix_f32 v7, -v0, v4, 1.0 op_sel_hi:[1,0,1]
+; GFX9-FLUSH-NEXT: v_mac_f32_e32 v5, v6, v2
+; GFX9-FLUSH-NEXT: v_mac_f32_e32 v4, v7, v3
+; GFX9-FLUSH-NEXT: v_mad_mix_f32 v6, -v1, v5, 1.0 op_sel_hi:[1,0,1]
+; GFX9-FLUSH-NEXT: v_mad_mix_f32 v7, -v0, v4, 1.0 op_sel_hi:[1,0,1]
+; GFX9-FLUSH-NEXT: v_mul_f32_e32 v2, v6, v2
+; GFX9-FLUSH-NEXT: v_mul_f32_e32 v3, v7, v3
; GFX9-FLUSH-NEXT: v_and_b32_e32 v2, 0xff800000, v2
; GFX9-FLUSH-NEXT: v_and_b32_e32 v3, 0xff800000, v3
; GFX9-FLUSH-NEXT: v_add_f32_e32 v2, v2, v5
-; GFX9-FLUSH-NEXT: v_add_f32_e32 v3, v3, v6
+; GFX9-FLUSH-NEXT: v_add_f32_e32 v3, v3, v4
; GFX9-FLUSH-NEXT: v_cvt_f16_f32_e32 v2, v2
; GFX9-FLUSH-NEXT: v_cvt_f16_f32_e32 v3, v3
; GFX9-FLUSH-NEXT: v_div_fixup_f16 v1, v2, v1, 1.0
@@ -5393,23 +5580,25 @@ define <2 x half> @v_rsq_v2f16(<2 x half> %a) {
; GFX10-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-IEEE-NEXT: v_sqrt_f16_e32 v1, v0
; GFX10-IEEE-NEXT: v_sqrt_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX10-IEEE-NEXT: v_mov_b32_e32 v4, 1.0
+; GFX10-IEEE-NEXT: v_cvt_f32_f16_e32 v4, 1.0
; GFX10-IEEE-NEXT: v_cvt_f32_f16_e32 v2, v1
; GFX10-IEEE-NEXT: v_cvt_f32_f16_e32 v3, v0
; GFX10-IEEE-NEXT: v_rcp_f32_e32 v2, v2
; GFX10-IEEE-NEXT: v_rcp_f32_e32 v3, v3
-; GFX10-IEEE-NEXT: v_fma_mix_f32 v5, -v1, v2, v4 op_sel_hi:[1,0,0]
-; GFX10-IEEE-NEXT: v_fma_mix_f32 v6, -v0, v3, v4 op_sel_hi:[1,0,0]
-; GFX10-IEEE-NEXT: v_fma_f32 v5, v5, v2, v2
-; GFX10-IEEE-NEXT: v_fma_f32 v6, v6, v3, v3
-; GFX10-IEEE-NEXT: v_fma_mix_f32 v7, -v1, v5, v4 op_sel_hi:[1,0,0]
-; GFX10-IEEE-NEXT: v_fma_mix_f32 v4, -v0, v6, v4 op_sel_hi:[1,0,0]
-; GFX10-IEEE-NEXT: v_mul_f32_e32 v2, v7, v2
-; GFX10-IEEE-NEXT: v_mul_f32_e32 v3, v4, v3
+; GFX10-IEEE-NEXT: v_mul_f32_e32 v5, v4, v2
+; GFX10-IEEE-NEXT: v_mul_f32_e32 v4, v4, v3
+; GFX10-IEEE-NEXT: v_fma_mix_f32 v6, -v1, v5, 1.0 op_sel_hi:[1,0,1]
+; GFX10-IEEE-NEXT: v_fma_mix_f32 v7, -v0, v4, 1.0 op_sel_hi:[1,0,1]
+; GFX10-IEEE-NEXT: v_fmac_f32_e32 v5, v6, v2
+; GFX10-IEEE-NEXT: v_fmac_f32_e32 v4, v7, v3
+; GFX10-IEEE-NEXT: v_fma_mix_f32 v6, -v1, v5, 1.0 op_sel_hi:[1,0,1]
+; GFX10-IEEE-NEXT: v_fma_mix_f32 v7, -v0, v4, 1.0 op_sel_hi:[1,0,1]
+; GFX10-IEEE-NEXT: v_mul_f32_e32 v2, v6, v2
+; GFX10-IEEE-NEXT: v_mul_f32_e32 v3, v7, v3
; GFX10-IEEE-NEXT: v_and_b32_e32 v2, 0xff800000, v2
; GFX10-IEEE-NEXT: v_and_b32_e32 v3, 0xff800000, v3
; GFX10-IEEE-NEXT: v_add_f32_e32 v2, v2, v5
-; GFX10-IEEE-NEXT: v_add_f32_e32 v3, v3, v6
+; GFX10-IEEE-NEXT: v_add_f32_e32 v3, v3, v4
; GFX10-IEEE-NEXT: v_cvt_f16_f32_e32 v2, v2
; GFX10-IEEE-NEXT: v_cvt_f16_f32_e32 v3, v3
; GFX10-IEEE-NEXT: v_div_fixup_f16 v1, v2, v1, 1.0
@@ -5422,22 +5611,25 @@ define <2 x half> @v_rsq_v2f16(<2 x half> %a) {
; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-FLUSH-NEXT: v_sqrt_f16_e32 v1, v0
; GFX10-FLUSH-NEXT: v_sqrt_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX10-FLUSH-NEXT: v_cvt_f32_f16_e32 v6, 1.0
; GFX10-FLUSH-NEXT: v_cvt_f32_f16_e32 v2, v1
; GFX10-FLUSH-NEXT: v_cvt_f32_f16_e32 v3, v0
; GFX10-FLUSH-NEXT: v_rcp_f32_e32 v4, v2
; GFX10-FLUSH-NEXT: v_rcp_f32_e32 v5, v3
-; GFX10-FLUSH-NEXT: v_mad_f32 v6, -v2, v4, 1.0
-; GFX10-FLUSH-NEXT: v_mad_f32 v7, -v3, v5, 1.0
-; GFX10-FLUSH-NEXT: v_mad_f32 v6, v6, v4, v4
-; GFX10-FLUSH-NEXT: v_mad_f32 v7, v7, v5, v5
-; GFX10-FLUSH-NEXT: v_mad_f32 v2, -v2, v6, 1.0
-; GFX10-FLUSH-NEXT: v_mad_f32 v3, -v3, v7, 1.0
+; GFX10-FLUSH-NEXT: v_mul_f32_e32 v7, v6, v4
+; GFX10-FLUSH-NEXT: v_mul_f32_e32 v8, v6, v5
+; GFX10-FLUSH-NEXT: v_mad_f32 v9, -v2, v7, v6
+; GFX10-FLUSH-NEXT: v_mad_f32 v10, -v3, v8, v6
+; GFX10-FLUSH-NEXT: v_mac_f32_e32 v7, v9, v4
+; GFX10-FLUSH-NEXT: v_mac_f32_e32 v8, v10, v5
+; GFX10-FLUSH-NEXT: v_mad_f32 v2, -v2, v7, v6
+; GFX10-FLUSH-NEXT: v_mad_f32 v3, -v3, v8, v6
; GFX10-FLUSH-NEXT: v_mul_f32_e32 v2, v2, v4
; GFX10-FLUSH-NEXT: v_mul_f32_e32 v3, v3, v5
; GFX10-FLUSH-NEXT: v_and_b32_e32 v2, 0xff800000, v2
; GFX10-FLUSH-NEXT: v_and_b32_e32 v3, 0xff800000, v3
-; GFX10-FLUSH-NEXT: v_add_f32_e32 v2, v2, v6
-; GFX10-FLUSH-NEXT: v_add_f32_e32 v3, v3, v7
+; GFX10-FLUSH-NEXT: v_add_f32_e32 v2, v2, v7
+; GFX10-FLUSH-NEXT: v_add_f32_e32 v3, v3, v8
; GFX10-FLUSH-NEXT: v_cvt_f16_f32_e32 v2, v2
; GFX10-FLUSH-NEXT: v_cvt_f16_f32_e32 v3, v3
; GFX10-FLUSH-NEXT: v_div_fixup_f16 v1, v2, v1, 1.0
@@ -5450,7 +5642,7 @@ define <2 x half> @v_rsq_v2f16(<2 x half> %a) {
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GFX11-NEXT: v_sqrt_f16_e32 v0, v0
-; GFX11-NEXT: v_mov_b32_e32 v4, 1.0
+; GFX11-NEXT: v_cvt_f32_f16_e32 v4, 1.0
; GFX11-NEXT: v_sqrt_f16_e32 v1, v1
; GFX11-NEXT: s_waitcnt_depctr 0xfff
; GFX11-NEXT: v_cvt_f32_f16_e32 v2, v0
@@ -5458,20 +5650,22 @@ define <2 x half> @v_rsq_v2f16(<2 x half> %a) {
; GFX11-NEXT: v_rcp_f32_e32 v2, v2
; GFX11-NEXT: v_rcp_f32_e32 v3, v3
; GFX11-NEXT: s_waitcnt_depctr 0xfff
-; GFX11-NEXT: v_fma_mix_f32 v5, -v0, v2, v4 op_sel_hi:[1,0,0]
-; GFX11-NEXT: v_fma_mix_f32 v6, -v1, v3, v4 op_sel_hi:[1,0,0]
-; GFX11-NEXT: v_fma_f32 v5, v5, v2, v2
-; GFX11-NEXT: v_fma_f32 v6, v6, v3, v3
-; GFX11-NEXT: v_fma_mix_f32 v7, -v0, v5, v4 op_sel_hi:[1,0,0]
-; GFX11-NEXT: v_fma_mix_f32 v4, -v1, v6, v4 op_sel_hi:[1,0,0]
-; GFX11-NEXT: v_dual_mul_f32 v2, v7, v2 :: v_dual_mul_f32 v3, v4, v3
-; GFX11-NEXT: v_and_b32_e32 v2, 0xff800000, v2
-; GFX11-NEXT: v_dual_add_f32 v2, v2, v5 :: v_dual_and_b32 v3, 0xff800000, v3
-; GFX11-NEXT: v_add_f32_e32 v3, v3, v6
-; GFX11-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GFX11-NEXT: v_mul_f32_e32 v5, v4, v2
+; GFX11-NEXT: v_mul_f32_e32 v4, v4, v3
+; GFX11-NEXT: v_fma_mix_f32 v7, -v1, v4, 1.0 op_sel_hi:[1,0,1]
+; GFX11-NEXT: v_fmac_f32_e32 v4, v7, v3
+; GFX11-NEXT: v_fma_mix_f32 v6, -v0, v5, 1.0 op_sel_hi:[1,0,1]
+; GFX11-NEXT: v_fma_mix_f32 v7, -v1, v4, 1.0 op_sel_hi:[1,0,1]
+; GFX11-NEXT: v_fmac_f32_e32 v5, v6, v2
+; GFX11-NEXT: v_mul_f32_e32 v3, v7, v3
+; GFX11-NEXT: v_fma_mix_f32 v6, -v0, v5, 1.0 op_sel_hi:[1,0,1]
+; GFX11-NEXT: v_dual_mul_f32 v2, v6, v2 :: v_dual_and_b32 v3, 0xff800000, v3
+; GFX11-NEXT: v_dual_add_f32 v3, v3, v4 :: v_dual_and_b32 v2, 0xff800000, v2
; GFX11-NEXT: v_cvt_f16_f32_e32 v3, v3
-; GFX11-NEXT: v_div_fixup_f16 v0, v2, v0, 1.0
+; GFX11-NEXT: v_add_f32_e32 v2, v2, v5
; GFX11-NEXT: v_div_fixup_f16 v1, v3, v1, 1.0
+; GFX11-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GFX11-NEXT: v_div_fixup_f16 v0, v2, v0, 1.0
; GFX11-NEXT: v_pack_b32_f16 v0, v0, v1
; GFX11-NEXT: s_setpc_b64 s[30:31]
%sqrt = call contract <2 x half> @llvm.sqrt.v2f16(<2 x half> %a)
@@ -5485,35 +5679,36 @@ define <2 x half> @v_neg_rsq_v2f16(<2 x half> %a) {
; GFX6-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, v1
+; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v2, -1.0
; GFX6-IEEE-NEXT: v_sqrt_f32_e32 v0, v0
; GFX6-IEEE-NEXT: v_sqrt_f32_e32 v1, v1
; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, v1
-; GFX6-IEEE-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, -1.0
-; GFX6-IEEE-NEXT: v_rcp_f32_e32 v5, v2
-; GFX6-IEEE-NEXT: v_div_scale_f32 v3, vcc, -1.0, v0, -1.0
-; GFX6-IEEE-NEXT: v_div_scale_f32 v4, s[4:5], v1, v1, -1.0
-; GFX6-IEEE-NEXT: v_fma_f32 v8, -v2, v5, 1.0
-; GFX6-IEEE-NEXT: v_fma_f32 v5, v8, v5, v5
-; GFX6-IEEE-NEXT: v_rcp_f32_e32 v6, v4
-; GFX6-IEEE-NEXT: v_mul_f32_e32 v8, v3, v5
-; GFX6-IEEE-NEXT: v_fma_f32 v10, -v2, v8, v3
-; GFX6-IEEE-NEXT: v_fma_f32 v8, v10, v5, v8
-; GFX6-IEEE-NEXT: v_fma_f32 v2, -v2, v8, v3
-; GFX6-IEEE-NEXT: v_fma_f32 v9, -v4, v6, 1.0
-; GFX6-IEEE-NEXT: v_div_fmas_f32 v2, v2, v5, v8
-; GFX6-IEEE-NEXT: v_div_scale_f32 v7, s[4:5], -1.0, v1, -1.0
-; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v2, v0, -1.0
-; GFX6-IEEE-NEXT: v_fma_f32 v2, v9, v6, v6
-; GFX6-IEEE-NEXT: v_mul_f32_e32 v3, v7, v2
-; GFX6-IEEE-NEXT: v_fma_f32 v5, -v4, v3, v7
-; GFX6-IEEE-NEXT: v_fma_f32 v3, v5, v2, v3
-; GFX6-IEEE-NEXT: v_fma_f32 v4, -v4, v3, v7
+; GFX6-IEEE-NEXT: v_div_scale_f32 v3, s[4:5], v0, v0, v2
+; GFX6-IEEE-NEXT: v_rcp_f32_e32 v6, v3
+; GFX6-IEEE-NEXT: v_div_scale_f32 v4, vcc, v2, v0, v2
+; GFX6-IEEE-NEXT: v_div_scale_f32 v5, s[4:5], v1, v1, v2
+; GFX6-IEEE-NEXT: v_fma_f32 v9, -v3, v6, 1.0
+; GFX6-IEEE-NEXT: v_fma_f32 v6, v9, v6, v6
+; GFX6-IEEE-NEXT: v_mul_f32_e32 v9, v4, v6
+; GFX6-IEEE-NEXT: v_rcp_f32_e32 v8, v5
+; GFX6-IEEE-NEXT: v_fma_f32 v10, -v3, v9, v4
+; GFX6-IEEE-NEXT: v_fma_f32 v9, v10, v6, v9
+; GFX6-IEEE-NEXT: v_fma_f32 v3, -v3, v9, v4
+; GFX6-IEEE-NEXT: v_div_fmas_f32 v3, v3, v6, v9
+; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v3, v0, v2
+; GFX6-IEEE-NEXT: v_fma_f32 v3, -v5, v8, 1.0
+; GFX6-IEEE-NEXT: v_div_scale_f32 v7, s[4:5], v2, v1, v2
+; GFX6-IEEE-NEXT: v_fma_f32 v3, v3, v8, v8
+; GFX6-IEEE-NEXT: v_mul_f32_e32 v4, v7, v3
+; GFX6-IEEE-NEXT: v_fma_f32 v6, -v5, v4, v7
+; GFX6-IEEE-NEXT: v_fma_f32 v4, v6, v3, v4
+; GFX6-IEEE-NEXT: v_fma_f32 v5, -v5, v4, v7
; GFX6-IEEE-NEXT: s_mov_b64 vcc, s[4:5]
-; GFX6-IEEE-NEXT: v_div_fmas_f32 v2, v4, v2, v3
-; GFX6-IEEE-NEXT: v_div_fixup_f32 v1, v2, v1, -1.0
+; GFX6-IEEE-NEXT: v_div_fmas_f32 v3, v5, v3, v4
+; GFX6-IEEE-NEXT: v_div_fixup_f32 v1, v3, v1, v2
; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX6-IEEE-NEXT: s_setpc_b64 s[30:31]
@@ -5523,40 +5718,42 @@ define <2 x half> @v_neg_rsq_v2f16(<2 x half> %a) {
; GFX6-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, v1
+; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v2, -1.0
; GFX6-FLUSH-NEXT: v_sqrt_f32_e32 v0, v0
; GFX6-FLUSH-NEXT: v_sqrt_f32_e32 v1, v1
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, -1.0
-; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v3, v2
-; GFX6-FLUSH-NEXT: v_div_scale_f32 v4, vcc, -1.0, v0, -1.0
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v3, s[4:5], v0, v0, v2
+; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v4, v3
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v5, vcc, v2, v0, v2
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
-; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v2, v3, 1.0
-; GFX6-FLUSH-NEXT: v_fma_f32 v3, v5, v3, v3
-; GFX6-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v3
-; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v5, v4
-; GFX6-FLUSH-NEXT: v_fma_f32 v5, v6, v3, v5
-; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v5, v4
+; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v3, v4, 1.0
+; GFX6-FLUSH-NEXT: v_fma_f32 v4, v6, v4, v4
+; GFX6-FLUSH-NEXT: v_mul_f32_e32 v6, v5, v4
+; GFX6-FLUSH-NEXT: v_fma_f32 v7, -v3, v6, v5
+; GFX6-FLUSH-NEXT: v_fma_f32 v6, v7, v4, v6
+; GFX6-FLUSH-NEXT: v_fma_f32 v3, -v3, v6, v5
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
+; GFX6-FLUSH-NEXT: v_div_fmas_f32 v3, v3, v4, v6
+; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v4, -1.0
; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, v1
-; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v5
-; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v2, v0, -1.0
+; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v3, v0, v2
; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, -1.0
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, v4
; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v3, v2
-; GFX6-FLUSH-NEXT: v_div_scale_f32 v4, vcc, -1.0, v1, -1.0
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v5, vcc, v4, v1, v4
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
-; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v2, v3, 1.0
-; GFX6-FLUSH-NEXT: v_fma_f32 v3, v5, v3, v3
-; GFX6-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v3
-; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v5, v4
-; GFX6-FLUSH-NEXT: v_fma_f32 v5, v6, v3, v5
-; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v5, v4
+; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v3, 1.0
+; GFX6-FLUSH-NEXT: v_fma_f32 v3, v6, v3, v3
+; GFX6-FLUSH-NEXT: v_mul_f32_e32 v6, v5, v3
+; GFX6-FLUSH-NEXT: v_fma_f32 v7, -v2, v6, v5
+; GFX6-FLUSH-NEXT: v_fma_f32 v6, v7, v3, v6
+; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v6, v5
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
-; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v5
-; GFX6-FLUSH-NEXT: v_div_fixup_f32 v1, v2, v1, -1.0
+; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v6
+; GFX6-FLUSH-NEXT: v_div_fixup_f32 v1, v2, v1, v4
; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX6-FLUSH-NEXT: s_setpc_b64 s[30:31]
;
@@ -5565,28 +5762,31 @@ define <2 x half> @v_neg_rsq_v2f16(<2 x half> %a) {
; GFX8-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-IEEE-NEXT: v_sqrt_f16_e32 v1, v0
; GFX8-IEEE-NEXT: v_sqrt_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX8-IEEE-NEXT: v_cvt_f32_f16_e32 v4, -1.0
; GFX8-IEEE-NEXT: v_cvt_f32_f16_e32 v2, v1
; GFX8-IEEE-NEXT: v_cvt_f32_f16_e32 v3, v0
-; GFX8-IEEE-NEXT: v_rcp_f32_e32 v4, v2
-; GFX8-IEEE-NEXT: v_rcp_f32_e32 v5, v3
-; GFX8-IEEE-NEXT: v_mul_f32_e32 v6, v2, v4
-; GFX8-IEEE-NEXT: v_mul_f32_e32 v7, v3, v5
-; GFX8-IEEE-NEXT: v_add_f32_e32 v7, -1.0, v7
-; GFX8-IEEE-NEXT: v_add_f32_e32 v6, -1.0, v6
-; GFX8-IEEE-NEXT: v_mul_f32_e32 v7, v7, v5
-; GFX8-IEEE-NEXT: v_mul_f32_e32 v6, v6, v4
-; GFX8-IEEE-NEXT: v_sub_f32_e32 v7, v7, v5
-; GFX8-IEEE-NEXT: v_sub_f32_e32 v6, v6, v4
-; GFX8-IEEE-NEXT: v_mul_f32_e64 v3, -v3, v7
-; GFX8-IEEE-NEXT: v_mul_f32_e64 v2, -v2, v6
-; GFX8-IEEE-NEXT: v_add_f32_e32 v3, -1.0, v3
-; GFX8-IEEE-NEXT: v_add_f32_e32 v2, -1.0, v2
-; GFX8-IEEE-NEXT: v_mul_f32_e32 v3, v3, v5
-; GFX8-IEEE-NEXT: v_mul_f32_e32 v2, v2, v4
+; GFX8-IEEE-NEXT: v_rcp_f32_e32 v5, v2
+; GFX8-IEEE-NEXT: v_rcp_f32_e32 v6, v3
+; GFX8-IEEE-NEXT: v_mul_f32_e32 v7, v4, v5
+; GFX8-IEEE-NEXT: v_mul_f32_e32 v8, v4, v6
+; GFX8-IEEE-NEXT: v_mul_f32_e64 v10, -v3, v8
+; GFX8-IEEE-NEXT: v_mul_f32_e64 v9, -v2, v7
+; GFX8-IEEE-NEXT: v_add_f32_e32 v10, v10, v4
+; GFX8-IEEE-NEXT: v_add_f32_e32 v9, v9, v4
+; GFX8-IEEE-NEXT: v_mul_f32_e32 v10, v10, v6
+; GFX8-IEEE-NEXT: v_mul_f32_e32 v9, v9, v5
+; GFX8-IEEE-NEXT: v_add_f32_e32 v8, v10, v8
+; GFX8-IEEE-NEXT: v_add_f32_e32 v7, v9, v7
+; GFX8-IEEE-NEXT: v_mul_f32_e64 v3, -v3, v8
+; GFX8-IEEE-NEXT: v_mul_f32_e64 v2, -v2, v7
+; GFX8-IEEE-NEXT: v_add_f32_e32 v3, v3, v4
+; GFX8-IEEE-NEXT: v_add_f32_e32 v2, v2, v4
+; GFX8-IEEE-NEXT: v_mul_f32_e32 v3, v3, v6
+; GFX8-IEEE-NEXT: v_mul_f32_e32 v2, v2, v5
; GFX8-IEEE-NEXT: v_and_b32_e32 v3, 0xff800000, v3
; GFX8-IEEE-NEXT: v_and_b32_e32 v2, 0xff800000, v2
-; GFX8-IEEE-NEXT: v_add_f32_e32 v3, v3, v7
-; GFX8-IEEE-NEXT: v_add_f32_e32 v2, v2, v6
+; GFX8-IEEE-NEXT: v_add_f32_e32 v3, v3, v8
+; GFX8-IEEE-NEXT: v_add_f32_e32 v2, v2, v7
; GFX8-IEEE-NEXT: v_cvt_f16_f32_e32 v3, v3
; GFX8-IEEE-NEXT: v_cvt_f16_f32_e32 v2, v2
; GFX8-IEEE-NEXT: v_div_fixup_f16 v0, v3, v0, -1.0
@@ -5600,22 +5800,25 @@ define <2 x half> @v_neg_rsq_v2f16(<2 x half> %a) {
; GFX8-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-FLUSH-NEXT: v_sqrt_f16_e32 v1, v0
; GFX8-FLUSH-NEXT: v_sqrt_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX8-FLUSH-NEXT: v_cvt_f32_f16_e32 v4, -1.0
; GFX8-FLUSH-NEXT: v_cvt_f32_f16_e32 v2, v1
; GFX8-FLUSH-NEXT: v_cvt_f32_f16_e32 v3, v0
-; GFX8-FLUSH-NEXT: v_rcp_f32_e32 v4, v2
-; GFX8-FLUSH-NEXT: v_rcp_f32_e32 v5, v3
-; GFX8-FLUSH-NEXT: v_mad_f32 v6, v2, v4, -1.0
-; GFX8-FLUSH-NEXT: v_mad_f32 v7, v3, v5, -1.0
-; GFX8-FLUSH-NEXT: v_mad_f32 v7, v7, v5, -v5
-; GFX8-FLUSH-NEXT: v_mad_f32 v6, v6, v4, -v4
-; GFX8-FLUSH-NEXT: v_mad_f32 v3, -v3, v7, -1.0
-; GFX8-FLUSH-NEXT: v_mad_f32 v2, -v2, v6, -1.0
-; GFX8-FLUSH-NEXT: v_mul_f32_e32 v3, v3, v5
-; GFX8-FLUSH-NEXT: v_mul_f32_e32 v2, v2, v4
+; GFX8-FLUSH-NEXT: v_rcp_f32_e32 v5, v2
+; GFX8-FLUSH-NEXT: v_rcp_f32_e32 v6, v3
+; GFX8-FLUSH-NEXT: v_mul_f32_e32 v7, v4, v5
+; GFX8-FLUSH-NEXT: v_mul_f32_e32 v8, v4, v6
+; GFX8-FLUSH-NEXT: v_mad_f32 v10, -v3, v8, v4
+; GFX8-FLUSH-NEXT: v_mad_f32 v9, -v2, v7, v4
+; GFX8-FLUSH-NEXT: v_mac_f32_e32 v8, v10, v6
+; GFX8-FLUSH-NEXT: v_mac_f32_e32 v7, v9, v5
+; GFX8-FLUSH-NEXT: v_mad_f32 v3, -v3, v8, v4
+; GFX8-FLUSH-NEXT: v_mad_f32 v2, -v2, v7, v4
+; GFX8-FLUSH-NEXT: v_mul_f32_e32 v3, v3, v6
+; GFX8-FLUSH-NEXT: v_mul_f32_e32 v2, v2, v5
; GFX8-FLUSH-NEXT: v_and_b32_e32 v3, 0xff800000, v3
; GFX8-FLUSH-NEXT: v_and_b32_e32 v2, 0xff800000, v2
-; GFX8-FLUSH-NEXT: v_add_f32_e32 v3, v3, v7
-; GFX8-FLUSH-NEXT: v_add_f32_e32 v2, v2, v6
+; GFX8-FLUSH-NEXT: v_add_f32_e32 v3, v3, v8
+; GFX8-FLUSH-NEXT: v_add_f32_e32 v2, v2, v7
; GFX8-FLUSH-NEXT: v_cvt_f16_f32_e32 v3, v3
; GFX8-FLUSH-NEXT: v_cvt_f16_f32_e32 v2, v2
; GFX8-FLUSH-NEXT: v_div_fixup_f16 v0, v3, v0, -1.0
@@ -5629,22 +5832,25 @@ define <2 x half> @v_neg_rsq_v2f16(<2 x half> %a) {
; GFX9-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-IEEE-NEXT: v_sqrt_f16_e32 v1, v0
; GFX9-IEEE-NEXT: v_sqrt_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v4, -1.0
; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v2, v1
; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v3, v0
-; GFX9-IEEE-NEXT: v_rcp_f32_e32 v4, v2
-; GFX9-IEEE-NEXT: v_rcp_f32_e32 v5, v3
-; GFX9-IEEE-NEXT: v_fma_f32 v6, v2, v4, -1.0
-; GFX9-IEEE-NEXT: v_fma_f32 v7, v3, v5, -1.0
-; GFX9-IEEE-NEXT: v_fma_f32 v6, v6, v4, -v4
-; GFX9-IEEE-NEXT: v_fma_f32 v7, v7, v5, -v5
-; GFX9-IEEE-NEXT: v_fma_f32 v2, -v2, v6, -1.0
-; GFX9-IEEE-NEXT: v_fma_f32 v3, -v3, v7, -1.0
-; GFX9-IEEE-NEXT: v_mul_f32_e32 v2, v2, v4
-; GFX9-IEEE-NEXT: v_mul_f32_e32 v3, v3, v5
+; GFX9-IEEE-NEXT: v_rcp_f32_e32 v5, v2
+; GFX9-IEEE-NEXT: v_rcp_f32_e32 v6, v3
+; GFX9-IEEE-NEXT: v_mul_f32_e32 v7, v4, v5
+; GFX9-IEEE-NEXT: v_mul_f32_e32 v8, v4, v6
+; GFX9-IEEE-NEXT: v_fma_f32 v9, -v2, v7, v4
+; GFX9-IEEE-NEXT: v_fma_f32 v10, -v3, v8, v4
+; GFX9-IEEE-NEXT: v_fma_f32 v7, v9, v5, v7
+; GFX9-IEEE-NEXT: v_fma_f32 v8, v10, v6, v8
+; GFX9-IEEE-NEXT: v_fma_f32 v2, -v2, v7, v4
+; GFX9-IEEE-NEXT: v_fma_f32 v3, -v3, v8, v4
+; GFX9-IEEE-NEXT: v_mul_f32_e32 v2, v2, v5
+; GFX9-IEEE-NEXT: v_mul_f32_e32 v3, v3, v6
; GFX9-IEEE-NEXT: v_and_b32_e32 v2, 0xff800000, v2
; GFX9-IEEE-NEXT: v_and_b32_e32 v3, 0xff800000, v3
-; GFX9-IEEE-NEXT: v_add_f32_e32 v2, v2, v6
-; GFX9-IEEE-NEXT: v_add_f32_e32 v3, v3, v7
+; GFX9-IEEE-NEXT: v_add_f32_e32 v2, v2, v7
+; GFX9-IEEE-NEXT: v_add_f32_e32 v3, v3, v8
; GFX9-IEEE-NEXT: v_cvt_f16_f32_e32 v2, v2
; GFX9-IEEE-NEXT: v_cvt_f16_f32_e32 v3, v3
; GFX9-IEEE-NEXT: v_div_fixup_f16 v1, v2, v1, -1.0
@@ -5657,23 +5863,25 @@ define <2 x half> @v_neg_rsq_v2f16(<2 x half> %a) {
; GFX9-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-FLUSH-NEXT: v_sqrt_f16_e32 v1, v0
; GFX9-FLUSH-NEXT: v_sqrt_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX9-FLUSH-NEXT: v_mov_b32_e32 v4, -1.0
+; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v4, -1.0
; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v2, v1
; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v3, v0
; GFX9-FLUSH-NEXT: v_rcp_f32_e32 v2, v2
; GFX9-FLUSH-NEXT: v_rcp_f32_e32 v3, v3
-; GFX9-FLUSH-NEXT: v_mad_mix_f32 v5, v1, v2, v4 op_sel_hi:[1,0,0]
-; GFX9-FLUSH-NEXT: v_mad_mix_f32 v6, v0, v3, v4 op_sel_hi:[1,0,0]
-; GFX9-FLUSH-NEXT: v_mad_f32 v5, v5, v2, -v2
-; GFX9-FLUSH-NEXT: v_mad_f32 v6, v6, v3, -v3
-; GFX9-FLUSH-NEXT: v_mad_mix_f32 v7, -v1, v5, v4 op_sel_hi:[1,0,0]
-; GFX9-FLUSH-NEXT: v_mad_mix_f32 v4, -v0, v6, v4 op_sel_hi:[1,0,0]
-; GFX9-FLUSH-NEXT: v_mul_f32_e32 v2, v7, v2
-; GFX9-FLUSH-NEXT: v_mul_f32_e32 v3, v4, v3
+; GFX9-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v2
+; GFX9-FLUSH-NEXT: v_mul_f32_e32 v4, v4, v3
+; GFX9-FLUSH-NEXT: v_mad_mix_f32 v6, -v1, v5, -1.0 op_sel_hi:[1,0,1]
+; GFX9-FLUSH-NEXT: v_mad_mix_f32 v7, -v0, v4, -1.0 op_sel_hi:[1,0,1]
+; GFX9-FLUSH-NEXT: v_mac_f32_e32 v5, v6, v2
+; GFX9-FLUSH-NEXT: v_mac_f32_e32 v4, v7, v3
+; GFX9-FLUSH-NEXT: v_mad_mix_f32 v6, -v1, v5, -1.0 op_sel_hi:[1,0,1]
+; GFX9-FLUSH-NEXT: v_mad_mix_f32 v7, -v0, v4, -1.0 op_sel_hi:[1,0,1]
+; GFX9-FLUSH-NEXT: v_mul_f32_e32 v2, v6, v2
+; GFX9-FLUSH-NEXT: v_mul_f32_e32 v3, v7, v3
; GFX9-FLUSH-NEXT: v_and_b32_e32 v2, 0xff800000, v2
; GFX9-FLUSH-NEXT: v_and_b32_e32 v3, 0xff800000, v3
; GFX9-FLUSH-NEXT: v_add_f32_e32 v2, v2, v5
-; GFX9-FLUSH-NEXT: v_add_f32_e32 v3, v3, v6
+; GFX9-FLUSH-NEXT: v_add_f32_e32 v3, v3, v4
; GFX9-FLUSH-NEXT: v_cvt_f16_f32_e32 v2, v2
; GFX9-FLUSH-NEXT: v_cvt_f16_f32_e32 v3, v3
; GFX9-FLUSH-NEXT: v_div_fixup_f16 v1, v2, v1, -1.0
@@ -5686,23 +5894,25 @@ define <2 x half> @v_neg_rsq_v2f16(<2 x half> %a) {
; GFX10-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-IEEE-NEXT: v_sqrt_f16_e32 v1, v0
; GFX10-IEEE-NEXT: v_sqrt_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX10-IEEE-NEXT: v_mov_b32_e32 v4, -1.0
+; GFX10-IEEE-NEXT: v_cvt_f32_f16_e32 v4, -1.0
; GFX10-IEEE-NEXT: v_cvt_f32_f16_e32 v2, v1
; GFX10-IEEE-NEXT: v_cvt_f32_f16_e32 v3, v0
; GFX10-IEEE-NEXT: v_rcp_f32_e32 v2, v2
; GFX10-IEEE-NEXT: v_rcp_f32_e32 v3, v3
-; GFX10-IEEE-NEXT: v_fma_mix_f32 v5, v1, v2, v4 op_sel_hi:[1,0,0]
-; GFX10-IEEE-NEXT: v_fma_mix_f32 v6, v0, v3, v4 op_sel_hi:[1,0,0]
-; GFX10-IEEE-NEXT: v_fma_f32 v5, v5, v2, -v2
-; GFX10-IEEE-NEXT: v_fma_f32 v6, v6, v3, -v3
-; GFX10-IEEE-NEXT: v_fma_mix_f32 v7, -v1, v5, v4 op_sel_hi:[1,0,0]
-; GFX10-IEEE-NEXT: v_fma_mix_f32 v4, -v0, v6, v4 op_sel_hi:[1,0,0]
-; GFX10-IEEE-NEXT: v_mul_f32_e32 v2, v7, v2
-; GFX10-IEEE-NEXT: v_mul_f32_e32 v3, v4, v3
+; GFX10-IEEE-NEXT: v_mul_f32_e32 v5, v4, v2
+; GFX10-IEEE-NEXT: v_mul_f32_e32 v4, v4, v3
+; GFX10-IEEE-NEXT: v_fma_mix_f32 v6, -v1, v5, -1.0 op_sel_hi:[1,0,1]
+; GFX10-IEEE-NEXT: v_fma_mix_f32 v7, -v0, v4, -1.0 op_sel_hi:[1,0,1]
+; GFX10-IEEE-NEXT: v_fmac_f32_e32 v5, v6, v2
+; GFX10-IEEE-NEXT: v_fmac_f32_e32 v4, v7, v3
+; GFX10-IEEE-NEXT: v_fma_mix_f32 v6, -v1, v5, -1.0 op_sel_hi:[1,0,1]
+; GFX10-IEEE-NEXT: v_fma_mix_f32 v7, -v0, v4, -1.0 op_sel_hi:[1,0,1]
+; GFX10-IEEE-NEXT: v_mul_f32_e32 v2, v6, v2
+; GFX10-IEEE-NEXT: v_mul_f32_e32 v3, v7, v3
; GFX10-IEEE-NEXT: v_and_b32_e32 v2, 0xff800000, v2
; GFX10-IEEE-NEXT: v_and_b32_e32 v3, 0xff800000, v3
; GFX10-IEEE-NEXT: v_add_f32_e32 v2, v2, v5
-; GFX10-IEEE-NEXT: v_add_f32_e32 v3, v3, v6
+; GFX10-IEEE-NEXT: v_add_f32_e32 v3, v3, v4
; GFX10-IEEE-NEXT: v_cvt_f16_f32_e32 v2, v2
; GFX10-IEEE-NEXT: v_cvt_f16_f32_e32 v3, v3
; GFX10-IEEE-NEXT: v_div_fixup_f16 v1, v2, v1, -1.0
@@ -5715,22 +5925,25 @@ define <2 x half> @v_neg_rsq_v2f16(<2 x half> %a) {
; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-FLUSH-NEXT: v_sqrt_f16_e32 v1, v0
; GFX10-FLUSH-NEXT: v_sqrt_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX10-FLUSH-NEXT: v_cvt_f32_f16_e32 v6, -1.0
; GFX10-FLUSH-NEXT: v_cvt_f32_f16_e32 v2, v1
; GFX10-FLUSH-NEXT: v_cvt_f32_f16_e32 v3, v0
; GFX10-FLUSH-NEXT: v_rcp_f32_e32 v4, v2
; GFX10-FLUSH-NEXT: v_rcp_f32_e32 v5, v3
-; GFX10-FLUSH-NEXT: v_mad_f32 v6, v2, v4, -1.0
-; GFX10-FLUSH-NEXT: v_mad_f32 v7, v3, v5, -1.0
-; GFX10-FLUSH-NEXT: v_mad_f32 v6, v6, v4, -v4
-; GFX10-FLUSH-NEXT: v_mad_f32 v7, v7, v5, -v5
-; GFX10-FLUSH-NEXT: v_mad_f32 v2, -v2, v6, -1.0
-; GFX10-FLUSH-NEXT: v_mad_f32 v3, -v3, v7, -1.0
+; GFX10-FLUSH-NEXT: v_mul_f32_e32 v7, v6, v4
+; GFX10-FLUSH-NEXT: v_mul_f32_e32 v8, v6, v5
+; GFX10-FLUSH-NEXT: v_mad_f32 v9, -v2, v7, v6
+; GFX10-FLUSH-NEXT: v_mad_f32 v10, -v3, v8, v6
+; GFX10-FLUSH-NEXT: v_mac_f32_e32 v7, v9, v4
+; GFX10-FLUSH-NEXT: v_mac_f32_e32 v8, v10, v5
+; GFX10-FLUSH-NEXT: v_mad_f32 v2, -v2, v7, v6
+; GFX10-FLUSH-NEXT: v_mad_f32 v3, -v3, v8, v6
; GFX10-FLUSH-NEXT: v_mul_f32_e32 v2, v2, v4
; GFX10-FLUSH-NEXT: v_mul_f32_e32 v3, v3, v5
; GFX10-FLUSH-NEXT: v_and_b32_e32 v2, 0xff800000, v2
; GFX10-FLUSH-NEXT: v_and_b32_e32 v3, 0xff800000, v3
-; GFX10-FLUSH-NEXT: v_add_f32_e32 v2, v2, v6
-; GFX10-FLUSH-NEXT: v_add_f32_e32 v3, v3, v7
+; GFX10-FLUSH-NEXT: v_add_f32_e32 v2, v2, v7
+; GFX10-FLUSH-NEXT: v_add_f32_e32 v3, v3, v8
; GFX10-FLUSH-NEXT: v_cvt_f16_f32_e32 v2, v2
; GFX10-FLUSH-NEXT: v_cvt_f16_f32_e32 v3, v3
; GFX10-FLUSH-NEXT: v_div_fixup_f16 v1, v2, v1, -1.0
@@ -5743,7 +5956,7 @@ define <2 x half> @v_neg_rsq_v2f16(<2 x half> %a) {
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GFX11-NEXT: v_sqrt_f16_e32 v0, v0
-; GFX11-NEXT: v_mov_b32_e32 v4, -1.0
+; GFX11-NEXT: v_cvt_f32_f16_e32 v4, -1.0
; GFX11-NEXT: v_sqrt_f16_e32 v1, v1
; GFX11-NEXT: s_waitcnt_depctr 0xfff
; GFX11-NEXT: v_cvt_f32_f16_e32 v2, v0
@@ -5751,20 +5964,22 @@ define <2 x half> @v_neg_rsq_v2f16(<2 x half> %a) {
; GFX11-NEXT: v_rcp_f32_e32 v2, v2
; GFX11-NEXT: v_rcp_f32_e32 v3, v3
; GFX11-NEXT: s_waitcnt_depctr 0xfff
-; GFX11-NEXT: v_fma_mix_f32 v5, v0, v2, v4 op_sel_hi:[1,0,0]
-; GFX11-NEXT: v_fma_mix_f32 v6, v1, v3, v4 op_sel_hi:[1,0,0]
-; GFX11-NEXT: v_fma_f32 v5, v5, v2, -v2
-; GFX11-NEXT: v_fma_f32 v6, v6, v3, -v3
-; GFX11-NEXT: v_fma_mix_f32 v7, -v0, v5, v4 op_sel_hi:[1,0,0]
-; GFX11-NEXT: v_fma_mix_f32 v4, -v1, v6, v4 op_sel_hi:[1,0,0]
-; GFX11-NEXT: v_dual_mul_f32 v2, v7, v2 :: v_dual_mul_f32 v3, v4, v3
-; GFX11-NEXT: v_and_b32_e32 v2, 0xff800000, v2
-; GFX11-NEXT: v_dual_add_f32 v2, v2, v5 :: v_dual_and_b32 v3, 0xff800000, v3
-; GFX11-NEXT: v_add_f32_e32 v3, v3, v6
-; GFX11-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GFX11-NEXT: v_mul_f32_e32 v5, v4, v2
+; GFX11-NEXT: v_mul_f32_e32 v4, v4, v3
+; GFX11-NEXT: v_fma_mix_f32 v7, -v1, v4, -1.0 op_sel_hi:[1,0,1]
+; GFX11-NEXT: v_fmac_f32_e32 v4, v7, v3
+; GFX11-NEXT: v_fma_mix_f32 v6, -v0, v5, -1.0 op_sel_hi:[1,0,1]
+; GFX11-NEXT: v_fma_mix_f32 v7, -v1, v4, -1.0 op_sel_hi:[1,0,1]
+; GFX11-NEXT: v_fmac_f32_e32 v5, v6, v2
+; GFX11-NEXT: v_mul_f32_e32 v3, v7, v3
+; GFX11-NEXT: v_fma_mix_f32 v6, -v0, v5, -1.0 op_sel_hi:[1,0,1]
+; GFX11-NEXT: v_dual_mul_f32 v2, v6, v2 :: v_dual_and_b32 v3, 0xff800000, v3
+; GFX11-NEXT: v_dual_add_f32 v3, v3, v4 :: v_dual_and_b32 v2, 0xff800000, v2
; GFX11-NEXT: v_cvt_f16_f32_e32 v3, v3
-; GFX11-NEXT: v_div_fixup_f16 v0, v2, v0, -1.0
+; GFX11-NEXT: v_add_f32_e32 v2, v2, v5
; GFX11-NEXT: v_div_fixup_f16 v1, v3, v1, -1.0
+; GFX11-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GFX11-NEXT: v_div_fixup_f16 v0, v2, v0, -1.0
; GFX11-NEXT: v_pack_b32_f16 v0, v0, v1
; GFX11-NEXT: s_setpc_b64 s[30:31]
%sqrt = call contract <2 x half> @llvm.sqrt.v2f16(<2 x half> %a)
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/frem.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/frem.ll
index 549af87c94949..302b2395642d0 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/frem.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/frem.ll
@@ -88,10 +88,11 @@ define amdgpu_kernel void @frem_f16(ptr addrspace(1) %out, ptr addrspace(1) %in1
; CI-NEXT: v_or_b32_e32 v1, s4, v0
; CI-NEXT: .LBB0_8: ; %Flow19
; CI-NEXT: v_cvt_f32_f16_e32 v0, s3
+; CI-NEXT: v_cvt_f32_f16_e32 v2, 0
; CI-NEXT: s_and_b32 s2, s2, 0x7fff
; CI-NEXT: s_cmpk_lg_i32 s2, 0x7c00
; CI-NEXT: s_cselect_b32 s2, 1, 0
-; CI-NEXT: v_cmp_nlg_f32_e32 vcc, 0, v0
+; CI-NEXT: v_cmp_nlg_f32_e32 vcc, v0, v2
; CI-NEXT: v_mov_b32_e32 v0, 0x7e00
; CI-NEXT: s_and_b32 s2, 1, s2
; CI-NEXT: v_cndmask_b32_e32 v1, v1, v0, vcc
@@ -1196,15 +1197,16 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; CI-NEXT: v_or_b32_e32 v1, s4, v1
; CI-NEXT: .LBB9_16: ; %Flow54
; CI-NEXT: v_cvt_f32_f16_e32 v2, s1
+; CI-NEXT: v_cvt_f32_f16_e32 v3, 0
; CI-NEXT: s_and_b32 s0, s0, 0x7fff
; CI-NEXT: s_cmpk_lg_i32 s0, 0x7c00
; CI-NEXT: s_cselect_b32 s4, 1, 0
-; CI-NEXT: v_cmp_nlg_f32_e32 vcc, 0, v2
+; CI-NEXT: v_cmp_nlg_f32_e32 vcc, v2, v3
; CI-NEXT: v_cvt_f32_f16_e32 v2, s3
; CI-NEXT: s_and_b32 s2, s2, 0x7fff
; CI-NEXT: s_cmpk_lg_i32 s2, 0x7c00
; CI-NEXT: s_cselect_b32 s2, 1, 0
-; CI-NEXT: v_cmp_nlg_f32_e64 s[0:1], 0, v2
+; CI-NEXT: v_cmp_nlg_f32_e64 s[0:1], v2, v3
; CI-NEXT: v_and_b32_e32 v0, 0xffff, v0
; CI-NEXT: v_mov_b32_e32 v2, 0x7e00
; CI-NEXT: s_and_b32 s3, 1, s4
@@ -1728,25 +1730,26 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; CI-NEXT: v_or_b32_e32 v3, s1, v3
; CI-NEXT: .LBB10_32: ; %Flow124
; CI-NEXT: v_cvt_f32_f16_e32 v4, s2
+; CI-NEXT: v_cvt_f32_f16_e32 v5, 0
; CI-NEXT: s_and_b32 s1, s4, 0x7fff
; CI-NEXT: s_cmpk_lg_i32 s1, 0x7c00
; CI-NEXT: s_cselect_b32 s11, 1, 0
-; CI-NEXT: v_cmp_nlg_f32_e32 vcc, 0, v4
+; CI-NEXT: v_cmp_nlg_f32_e32 vcc, v4, v5
; CI-NEXT: v_cvt_f32_f16_e32 v4, s0
; CI-NEXT: s_and_b32 s2, s6, 0x7fff
; CI-NEXT: s_cmpk_lg_i32 s2, 0x7c00
; CI-NEXT: s_cselect_b32 s6, 1, 0
-; CI-NEXT: v_cmp_nlg_f32_e64 s[0:1], 0, v4
+; CI-NEXT: v_cmp_nlg_f32_e64 s[0:1], v4, v5
; CI-NEXT: v_cvt_f32_f16_e32 v4, s3
; CI-NEXT: s_and_b32 s4, s5, 0x7fff
; CI-NEXT: s_cmpk_lg_i32 s4, 0x7c00
; CI-NEXT: s_cselect_b32 s12, 1, 0
-; CI-NEXT: v_cmp_nlg_f32_e64 s[2:3], 0, v4
+; CI-NEXT: v_cmp_nlg_f32_e64 s[2:3], v4, v5
; CI-NEXT: v_cvt_f32_f16_e32 v4, s10
; CI-NEXT: s_and_b32 s7, s7, 0x7fff
; CI-NEXT: s_cmpk_lg_i32 s7, 0x7c00
; CI-NEXT: s_cselect_b32 s7, 1, 0
-; CI-NEXT: v_cmp_nlg_f32_e64 s[4:5], 0, v4
+; CI-NEXT: v_cmp_nlg_f32_e64 s[4:5], v4, v5
; CI-NEXT: v_and_b32_e32 v0, 0xffff, v0
; CI-NEXT: v_mov_b32_e32 v4, 0x7e00
; CI-NEXT: s_and_b32 s10, 1, s11
diff --git a/llvm/test/CodeGen/AMDGPU/fmed3.ll b/llvm/test/CodeGen/AMDGPU/fmed3.ll
index 9e152253bb6ca..9233f8059a202 100644
--- a/llvm/test/CodeGen/AMDGPU/fmed3.ll
+++ b/llvm/test/CodeGen/AMDGPU/fmed3.ll
@@ -7464,15 +7464,18 @@ define amdgpu_kernel void @v_test_nnan_input_fmed3_r_i_i_f16(ptr addrspace(1) %o
; SI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; SI-GISEL-NEXT: s_mov_b64 s[4:5], s[2:3]
; SI-GISEL-NEXT: buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64
+; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v3, 1.0
+; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v4, 2.0
; SI-GISEL-NEXT: s_waitcnt vmcnt(0)
; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v2, v2
-; SI-GISEL-NEXT: v_add_f32_e32 v2, 1.0, v2
+; SI-GISEL-NEXT: v_add_f32_e32 v2, v2, v3
; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v2, v2
; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v2, v2
-; SI-GISEL-NEXT: v_max_f32_e32 v2, 2.0, v2
+; SI-GISEL-NEXT: v_max_f32_e32 v2, v2, v4
; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v2, v2
; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v2, v2
-; SI-GISEL-NEXT: v_min_f32_e32 v2, 4.0, v2
+; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v3, 4.0
+; SI-GISEL-NEXT: v_min_f32_e32 v2, v2, v3
; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v2, v2
; SI-GISEL-NEXT: s_mov_b64 s[2:3], s[6:7]
; SI-GISEL-NEXT: buffer_store_short v2, v[0:1], s[0:3], 0 addr64
@@ -7636,24 +7639,27 @@ define amdgpu_kernel void @v_nnan_inputs_med3_f16_pat0(ptr addrspace(1) %out, pt
; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0
; SI-GISEL-NEXT: s_mov_b32 s10, 0
; SI-GISEL-NEXT: s_mov_b32 s11, 0xf000
+; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v2, 1.0
+; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v3, 2.0
; SI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; SI-GISEL-NEXT: s_mov_b64 s[8:9], s[2:3]
-; SI-GISEL-NEXT: buffer_load_ushort v2, v[0:1], s[8:11], 0 addr64 glc
+; SI-GISEL-NEXT: buffer_load_ushort v4, v[0:1], s[8:11], 0 addr64 glc
; SI-GISEL-NEXT: s_waitcnt vmcnt(0)
+; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v5, 4.0
; SI-GISEL-NEXT: s_mov_b64 s[8:9], s[4:5]
-; SI-GISEL-NEXT: buffer_load_ushort v3, v[0:1], s[8:11], 0 addr64 glc
+; SI-GISEL-NEXT: buffer_load_ushort v6, v[0:1], s[8:11], 0 addr64 glc
; SI-GISEL-NEXT: s_waitcnt vmcnt(0)
; SI-GISEL-NEXT: s_mov_b64 s[8:9], s[6:7]
-; SI-GISEL-NEXT: buffer_load_ushort v4, v[0:1], s[8:11], 0 addr64 glc
+; SI-GISEL-NEXT: buffer_load_ushort v7, v[0:1], s[8:11], 0 addr64 glc
; SI-GISEL-NEXT: s_waitcnt vmcnt(0)
-; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v2, v2
-; SI-GISEL-NEXT: v_add_f32_e32 v2, 1.0, v2
-; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v3, v3
-; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v2, v2
-; SI-GISEL-NEXT: v_add_f32_e32 v3, 2.0, v3
; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v4, v4
+; SI-GISEL-NEXT: v_add_f32_e32 v2, v4, v2
+; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v4, v6
+; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v2, v2
+; SI-GISEL-NEXT: v_add_f32_e32 v3, v4, v3
+; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v4, v7
; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v3, v3
-; SI-GISEL-NEXT: v_add_f32_e32 v4, 4.0, v4
+; SI-GISEL-NEXT: v_add_f32_e32 v4, v4, v5
; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v2, v2
; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v4, v4
; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v3, v3
@@ -8706,10 +8712,12 @@ define half @v_test_fmed3_r_i_i_f16_minimumnum_maximumnum(half %a) #1 {
; SI-GISEL: ; %bb.0:
; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0
-; SI-GISEL-NEXT: v_max_f32_e32 v0, 2.0, v0
+; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v1, 2.0
+; SI-GISEL-NEXT: v_max_f32_e32 v0, v0, v1
; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0
; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0
-; SI-GISEL-NEXT: v_min_f32_e32 v0, 4.0, v0
+; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v1, 4.0
+; SI-GISEL-NEXT: v_min_f32_e32 v0, v0, v1
; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0
; SI-GISEL-NEXT: s_setpc_b64 s[30:31]
;
@@ -8788,15 +8796,17 @@ define <2 x half> @v_test_fmed3_r_i_i_v2f16_minimumnum_maximumnum(<2 x half> %a)
; SI-GISEL: ; %bb.0:
; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0
+; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v2, 2.0
; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v1
-; SI-GISEL-NEXT: v_max_f32_e32 v0, 2.0, v0
-; SI-GISEL-NEXT: v_max_f32_e32 v1, 2.0, v1
+; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v3, 4.0
+; SI-GISEL-NEXT: v_max_f32_e32 v0, v0, v2
+; SI-GISEL-NEXT: v_max_f32_e32 v1, v1, v2
; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0
; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v1, v1
; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0
; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v1
-; SI-GISEL-NEXT: v_min_f32_e32 v0, 4.0, v0
-; SI-GISEL-NEXT: v_min_f32_e32 v1, 4.0, v1
+; SI-GISEL-NEXT: v_min_f32_e32 v0, v0, v3
+; SI-GISEL-NEXT: v_min_f32_e32 v1, v1, v3
; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0
; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v1, v1
; SI-GISEL-NEXT: s_setpc_b64 s[30:31]
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.exp.ll b/llvm/test/CodeGen/AMDGPU/llvm.exp.ll
index ac356fad5b2da..af79c911f29f9 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.exp.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.exp.ll
@@ -6011,7 +6011,8 @@ define half @v_exp_f16_fast(half %in) {
; SI-GISEL: ; %bb.0:
; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0
-; SI-GISEL-NEXT: v_mul_f32_e32 v0, 0x3fb8a000, v0
+; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v1, 0x3dc5
+; SI-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1
; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0
; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0
; SI-GISEL-NEXT: v_exp_f32_e32 v0, v0
@@ -6511,9 +6512,10 @@ define <2 x half> @v_exp_v2f16_fast(<2 x half> %in) {
; SI-GISEL: ; %bb.0:
; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0
+; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v2, 0x3dc5
; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v1
-; SI-GISEL-NEXT: v_mul_f32_e32 v0, 0x3fb8a000, v0
-; SI-GISEL-NEXT: v_mul_f32_e32 v1, 0x3fb8a000, v1
+; SI-GISEL-NEXT: v_mul_f32_e32 v0, v0, v2
+; SI-GISEL-NEXT: v_mul_f32_e32 v1, v1, v2
; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0
; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v1, v1
; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0
@@ -6707,11 +6709,12 @@ define <3 x half> @v_exp_v3f16_afn(<3 x half> %in) {
; SI-GISEL: ; %bb.0:
; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0
+; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v3, 0x3dc5
; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v1
; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v2, v2
-; SI-GISEL-NEXT: v_mul_f32_e32 v0, 0x3fb8a000, v0
-; SI-GISEL-NEXT: v_mul_f32_e32 v1, 0x3fb8a000, v1
-; SI-GISEL-NEXT: v_mul_f32_e32 v2, 0x3fb8a000, v2
+; SI-GISEL-NEXT: v_mul_f32_e32 v0, v0, v3
+; SI-GISEL-NEXT: v_mul_f32_e32 v1, v1, v3
+; SI-GISEL-NEXT: v_mul_f32_e32 v2, v2, v3
; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0
; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v1, v1
; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v2, v2
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.exp10.ll b/llvm/test/CodeGen/AMDGPU/llvm.exp10.ll
index d12ebe49814d8..a99c1991a7909 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.exp10.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.exp10.ll
@@ -6092,7 +6092,8 @@ define half @v_exp10_f16_fast(half %in) {
; SI-GISEL: ; %bb.0:
; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0
-; SI-GISEL-NEXT: v_mul_f32_e32 v0, 0x3fb8a000, v0
+; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v1, 0x3dc5
+; SI-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1
; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0
; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0
; SI-GISEL-NEXT: v_exp_f32_e32 v0, v0
@@ -6593,9 +6594,10 @@ define <2 x half> @v_exp10_v2f16_fast(<2 x half> %in) {
; SI-GISEL: ; %bb.0:
; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0
+; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v2, 0x3dc5
; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v1
-; SI-GISEL-NEXT: v_mul_f32_e32 v0, 0x3fb8a000, v0
-; SI-GISEL-NEXT: v_mul_f32_e32 v1, 0x3fb8a000, v1
+; SI-GISEL-NEXT: v_mul_f32_e32 v0, v0, v2
+; SI-GISEL-NEXT: v_mul_f32_e32 v1, v1, v2
; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0
; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v1, v1
; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0
@@ -6789,11 +6791,12 @@ define <3 x half> @v_exp10_v3f16_afn(<3 x half> %in) {
; SI-GISEL: ; %bb.0:
; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0
+; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v3, 0x3dc5
; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v1
; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v2, v2
-; SI-GISEL-NEXT: v_mul_f32_e32 v0, 0x3fb8a000, v0
-; SI-GISEL-NEXT: v_mul_f32_e32 v1, 0x3fb8a000, v1
-; SI-GISEL-NEXT: v_mul_f32_e32 v2, 0x3fb8a000, v2
+; SI-GISEL-NEXT: v_mul_f32_e32 v0, v0, v3
+; SI-GISEL-NEXT: v_mul_f32_e32 v1, v1, v3
+; SI-GISEL-NEXT: v_mul_f32_e32 v2, v2, v3
; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0
; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v1, v1
; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v2, v2
diff --git a/llvm/test/CodeGen/AMDGPU/mad-mix-hi.ll b/llvm/test/CodeGen/AMDGPU/mad-mix-hi.ll
index 259ee0b26d2d8..3f66c23e1a73b 100644
--- a/llvm/test/CodeGen/AMDGPU/mad-mix-hi.ll
+++ b/llvm/test/CodeGen/AMDGPU/mad-mix-hi.ll
@@ -488,11 +488,13 @@ define <2 x half> @v_mad_mixhi_f16_f16lo_f16lo_f16lo_undeflo_clamp_postcvt(half
; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v2, v2
; GISEL-CI-NEXT: v_mac_f32_e32 v2, v0, v1
; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v0, v2
+; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v1, 0
; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GISEL-CI-NEXT: v_max_f32_e32 v0, 0, v0
+; GISEL-CI-NEXT: v_max_f32_e32 v0, v0, v1
; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v1, 1.0
; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GISEL-CI-NEXT: v_min_f32_e32 v0, 1.0, v0
+; GISEL-CI-NEXT: v_min_f32_e32 v0, v0, v1
; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v1, v0
; GISEL-CI-NEXT: s_setpc_b64 s[30:31]
%src0.ext = fpext half %src0 to float
@@ -580,13 +582,15 @@ define <2 x half> @v_mad_mixhi_f16_f16lo_f16lo_f16lo_undeflo_clamp_postcvt_multi
; GISEL-CI-NEXT: s_mov_b32 s7, 0xf000
; GISEL-CI-NEXT: v_mac_f32_e32 v2, v0, v1
; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v0, v2
-; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v1, v0
+; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v1, 0
+; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v2, v0
; GISEL-CI-NEXT: buffer_store_short v0, off, s[4:7], 0
; GISEL-CI-NEXT: s_waitcnt vmcnt(0)
-; GISEL-CI-NEXT: v_max_f32_e32 v1, 0, v1
+; GISEL-CI-NEXT: v_max_f32_e32 v1, v2, v1
; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v2, 1.0
; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v1, v1
-; GISEL-CI-NEXT: v_min_f32_e32 v1, 1.0, v1
+; GISEL-CI-NEXT: v_min_f32_e32 v1, v1, v2
; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v1, v1
; GISEL-CI-NEXT: s_setpc_b64 s[30:31]
%src0.ext = fpext half %src0 to float
diff --git a/llvm/test/CodeGen/AMDGPU/mad-mix-lo.ll b/llvm/test/CodeGen/AMDGPU/mad-mix-lo.ll
index ba77552e5809b..21e6faf46f58d 100644
--- a/llvm/test/CodeGen/AMDGPU/mad-mix-lo.ll
+++ b/llvm/test/CodeGen/AMDGPU/mad-mix-lo.ll
@@ -313,11 +313,13 @@ define half @v_mad_mixlo_f16_f16lo_f16lo_f32_clamp_post_cvt(half %src0, half %sr
; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v1, v1
; GISEL-CI-NEXT: v_mac_f32_e32 v2, v0, v1
; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v0, v2
+; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v1, 0
; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GISEL-CI-NEXT: v_max_f32_e32 v0, 0, v0
+; GISEL-CI-NEXT: v_max_f32_e32 v0, v0, v1
; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v1, 1.0
; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GISEL-CI-NEXT: v_min_f32_e32 v0, 1.0, v0
+; GISEL-CI-NEXT: v_min_f32_e32 v0, v0, v1
; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v0, v0
; GISEL-CI-NEXT: s_setpc_b64 s[30:31]
%src0.ext = fpext half %src0 to float
@@ -1007,26 +1009,28 @@ define <2 x half> @v_mad_mix_v2f32_clamp_postcvt(<2 x half> %src0, <2 x half> %s
; GISEL-CI: ; %bb.0:
; GISEL-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v1, v1
; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v2, v2
; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v4, v4
+; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v1, v1
; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v3, v3
; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v5, v5
; GISEL-CI-NEXT: v_mac_f32_e32 v4, v0, v2
; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v0, v4
+; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v2, 0
; GISEL-CI-NEXT: v_mac_f32_e32 v5, v1, v3
; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v1, v5
; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v0, v0
; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v1, v1
-; GISEL-CI-NEXT: v_max_f32_e32 v0, 0, v0
+; GISEL-CI-NEXT: v_max_f32_e32 v0, v0, v2
; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GISEL-CI-NEXT: v_max_f32_e32 v1, 0, v1
+; GISEL-CI-NEXT: v_max_f32_e32 v1, v1, v2
; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v1, v1
; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v2, 1.0
; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v1, v1
-; GISEL-CI-NEXT: v_min_f32_e32 v0, 1.0, v0
+; GISEL-CI-NEXT: v_min_f32_e32 v0, v0, v2
; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GISEL-CI-NEXT: v_min_f32_e32 v1, 1.0, v1
+; GISEL-CI-NEXT: v_min_f32_e32 v1, v1, v2
; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v1, v1
; GISEL-CI-NEXT: s_setpc_b64 s[30:31]
%src0.ext = fpext <2 x half> %src0 to <2 x float>
@@ -1221,23 +1225,25 @@ define <3 x half> @v_mad_mix_v3f32_clamp_postcvt(<3 x half> %src0, <3 x half> %s
; GISEL-CI-NEXT: v_mac_f32_e32 v7, v1, v4
; GISEL-CI-NEXT: v_mac_f32_e32 v8, v2, v5
; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v1, v7
-; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v2, v8
+; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v3, v8
; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v2, 0
; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v1, v1
-; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v2, v2
-; GISEL-CI-NEXT: v_max_f32_e32 v0, 0, v0
+; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v3, v3
+; GISEL-CI-NEXT: v_max_f32_e32 v0, v0, v2
+; GISEL-CI-NEXT: v_max_f32_e32 v1, v1, v2
+; GISEL-CI-NEXT: v_max_f32_e32 v2, v3, v2
; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GISEL-CI-NEXT: v_max_f32_e32 v1, 0, v1
-; GISEL-CI-NEXT: v_max_f32_e32 v2, 0, v2
; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v1, v1
; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v3, 1.0
; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v0, v0
; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v1, v1
; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v2, v2
-; GISEL-CI-NEXT: v_min_f32_e32 v0, 1.0, v0
+; GISEL-CI-NEXT: v_min_f32_e32 v0, v0, v3
+; GISEL-CI-NEXT: v_min_f32_e32 v1, v1, v3
+; GISEL-CI-NEXT: v_min_f32_e32 v2, v2, v3
; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GISEL-CI-NEXT: v_min_f32_e32 v1, 1.0, v1
-; GISEL-CI-NEXT: v_min_f32_e32 v2, 1.0, v2
; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v1, v1
; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v2, v2
; GISEL-CI-NEXT: s_setpc_b64 s[30:31]
@@ -1435,28 +1441,30 @@ define <4 x half> @v_mad_mix_v4f32_clamp_postcvt(<4 x half> %src0, <4 x half> %s
; GISEL-CI-NEXT: v_mac_f32_e32 v11, v3, v7
; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v0, v8
; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v1, v9
-; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v2, v10
-; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v3, v11
+; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v3, v10
+; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v4, v11
; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v2, 0
; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v1, v1
-; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v2, v2
; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v3, v3
-; GISEL-CI-NEXT: v_max_f32_e32 v0, 0, v0
-; GISEL-CI-NEXT: v_max_f32_e32 v1, 0, v1
-; GISEL-CI-NEXT: v_max_f32_e32 v2, 0, v2
-; GISEL-CI-NEXT: v_max_f32_e32 v3, 0, v3
+; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v4, v4
+; GISEL-CI-NEXT: v_max_f32_e32 v0, v0, v2
+; GISEL-CI-NEXT: v_max_f32_e32 v1, v1, v2
+; GISEL-CI-NEXT: v_max_f32_e32 v3, v3, v2
+; GISEL-CI-NEXT: v_max_f32_e32 v2, v4, v2
; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v0, v0
; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v1, v1
-; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v2, v2
; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v2, v2
; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v5, 1.0
; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v1, v1
-; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v2, v2
; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v3, v3
-; GISEL-CI-NEXT: v_min_f32_e32 v0, 1.0, v0
-; GISEL-CI-NEXT: v_min_f32_e32 v1, 1.0, v1
-; GISEL-CI-NEXT: v_min_f32_e32 v2, 1.0, v2
-; GISEL-CI-NEXT: v_min_f32_e32 v3, 1.0, v3
+; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v4, v2
+; GISEL-CI-NEXT: v_min_f32_e32 v0, v0, v5
+; GISEL-CI-NEXT: v_min_f32_e32 v1, v1, v5
+; GISEL-CI-NEXT: v_min_f32_e32 v2, v3, v5
+; GISEL-CI-NEXT: v_min_f32_e32 v3, v4, v5
; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v0, v0
; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v1, v1
; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v2, v2
@@ -1614,14 +1622,16 @@ define <2 x half> @v_mad_mix_v2f32_clamp_postcvt_lo(<2 x half> %src0, <2 x half>
; GISEL-CI-NEXT: v_mac_f32_e32 v4, v0, v2
; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v0, v5
; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v1, v4
+; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v2, 0
; GISEL-CI-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GISEL-CI-NEXT: v_or_b32_e32 v0, v1, v0
; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v1, v0
; GISEL-CI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GISEL-CI-NEXT: v_max_f32_e32 v1, 0, v1
+; GISEL-CI-NEXT: v_max_f32_e32 v1, v1, v2
; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v2, 1.0
; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v1, v1
-; GISEL-CI-NEXT: v_min_f32_e32 v1, 1.0, v1
+; GISEL-CI-NEXT: v_min_f32_e32 v1, v1, v2
; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v1, v1
; GISEL-CI-NEXT: v_or_b32_e32 v0, v0, v1
; GISEL-CI-NEXT: v_lshrrev_b32_e32 v1, 16, v0
@@ -1780,15 +1790,17 @@ define <2 x half> @v_mad_mix_v2f32_clamp_postcvt_hi(<2 x half> %src0, <2 x half>
; GISEL-CI-NEXT: v_mac_f32_e32 v4, v0, v2
; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v0, v5
; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v1, v4
+; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v2, 0
; GISEL-CI-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GISEL-CI-NEXT: v_or_b32_e32 v0, v1, v0
; GISEL-CI-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v1, v1
; GISEL-CI-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GISEL-CI-NEXT: v_max_f32_e32 v1, 0, v1
+; GISEL-CI-NEXT: v_max_f32_e32 v1, v1, v2
; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v2, 1.0
; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v1, v1
-; GISEL-CI-NEXT: v_min_f32_e32 v1, 1.0, v1
+; GISEL-CI-NEXT: v_min_f32_e32 v1, v1, v2
; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v1, v1
; GISEL-CI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GISEL-CI-NEXT: v_or_b32_e32 v0, v0, v1
diff --git a/llvm/test/CodeGen/AMDGPU/maximumnum.ll b/llvm/test/CodeGen/AMDGPU/maximumnum.ll
index c90b2c9170414..4f73e8e9c1883 100644
--- a/llvm/test/CodeGen/AMDGPU/maximumnum.ll
+++ b/llvm/test/CodeGen/AMDGPU/maximumnum.ll
@@ -271,7 +271,8 @@ define half @v_maximumnum_f16_1.0(half %x) {
; GFX7-GISEL: ; %bb.0:
; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX7-GISEL-NEXT: v_max_f32_e32 v0, 1.0, v0
+; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v1, 1.0
+; GFX7-GISEL-NEXT: v_max_f32_e32 v0, v0, v1
; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX7-GISEL-NEXT: s_setpc_b64 s[30:31]
;
diff --git a/llvm/test/CodeGen/AMDGPU/minimumnum.ll b/llvm/test/CodeGen/AMDGPU/minimumnum.ll
index 64e8b7b50de08..558006d2b6957 100644
--- a/llvm/test/CodeGen/AMDGPU/minimumnum.ll
+++ b/llvm/test/CodeGen/AMDGPU/minimumnum.ll
@@ -271,7 +271,8 @@ define half @v_minimumnum_f16_1.0(half %x) {
; GFX7-GISEL: ; %bb.0:
; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX7-GISEL-NEXT: v_min_f32_e32 v0, 1.0, v0
+; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v1, 1.0
+; GFX7-GISEL-NEXT: v_min_f32_e32 v0, v0, v1
; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX7-GISEL-NEXT: s_setpc_b64 s[30:31]
;
More information about the llvm-commits
mailing list