[llvm] [AArch64][GlobalISel] Add G_FPEXT(G_FCONSTANT) folding (PR #160902)
Ryan Cowan via llvm-commits
llvm-commits at lists.llvm.org
Wed Oct 8 08:33:24 PDT 2025
https://github.com/HolyMolyCowMan updated https://github.com/llvm/llvm-project/pull/160902
>From af4a9c68f44fc55cbd28de8044dc9687668853af Mon Sep 17 00:00:00 2001
From: Ryan Cowan <ryan.cowan at arm.com>
Date: Fri, 26 Sep 2025 15:03:28 +0000
Subject: [PATCH] [AArch64][GlobalISel] Add G_FPEXT(G_FCONSTANT) folding
---
.../include/llvm/Target/GlobalISel/Combine.td | 2 +
.../lib/CodeGen/GlobalISel/CombinerHelper.cpp | 1 +
llvm/lib/Target/AArch64/AArch64Combine.td | 2 +-
.../CodeGen/AArch64/arm64-indexed-memory.ll | 10 +-
llvm/test/CodeGen/AArch64/f16-instructions.ll | 18 +-
llvm/test/CodeGen/AArch64/fcvt-fixed.ll | 561 +--
llvm/test/CodeGen/AArch64/frem-power2.ll | 3 +-
.../CodeGen/AArch64/vecreduce-fadd-strict.ll | 52 +-
.../CodeGen/AArch64/vecreduce-fmul-strict.ll | 30 +-
.../CodeGen/AMDGPU/GlobalISel/fdiv.f16.ll | 3217 ++++++++---------
llvm/test/CodeGen/AMDGPU/GlobalISel/frem.ll | 17 +-
llvm/test/CodeGen/AMDGPU/fmed3.ll | 46 +-
llvm/test/CodeGen/AMDGPU/llvm.exp.ll | 15 +-
llvm/test/CodeGen/AMDGPU/llvm.exp10.ll | 15 +-
llvm/test/CodeGen/AMDGPU/mad-mix-hi.ll | 14 +-
llvm/test/CodeGen/AMDGPU/mad-mix-lo.ll | 76 +-
llvm/test/CodeGen/AMDGPU/maximumnum.ll | 3 +-
llvm/test/CodeGen/AMDGPU/minimumnum.ll | 3 +-
18 files changed, 1787 insertions(+), 2298 deletions(-)
diff --git a/llvm/include/llvm/Target/GlobalISel/Combine.td b/llvm/include/llvm/Target/GlobalISel/Combine.td
index e2b7a5ead2cd3..3d21f522e97ce 100644
--- a/llvm/include/llvm/Target/GlobalISel/Combine.td
+++ b/llvm/include/llvm/Target/GlobalISel/Combine.td
@@ -695,6 +695,7 @@ def constant_fold_fabs : constant_fold_unary_fp_op_rule<G_FABS>;
def constant_fold_fsqrt : constant_fold_unary_fp_op_rule<G_FSQRT>;
def constant_fold_flog2 : constant_fold_unary_fp_op_rule<G_FLOG2>;
def constant_fold_fptrunc : constant_fold_unary_fp_op_rule<G_FPTRUNC>;
+def constant_fold_fpext : constant_fold_unary_fp_op_rule<G_FPEXT>;
// Fold constant zero int to fp conversions.
class itof_const_zero_fold_rule<Instruction opcode> : GICombineRule <
@@ -713,6 +714,7 @@ def constant_fold_fp_ops : GICombineGroup<[
constant_fold_fsqrt,
constant_fold_flog2,
constant_fold_fptrunc,
+ constant_fold_fpext,
itof_const_zero_fold_si,
itof_const_zero_fold_ui
]>;
diff --git a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
index fa0ccd625b504..152c508585968 100644
--- a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
@@ -1728,6 +1728,7 @@ static APFloat constantFoldFpUnary(const MachineInstr &MI,
Result.clearSign();
return Result;
}
+ case TargetOpcode::G_FPEXT:
case TargetOpcode::G_FPTRUNC: {
bool Unused;
LLT DstTy = MRI.getType(MI.getOperand(0).getReg());
diff --git a/llvm/lib/Target/AArch64/AArch64Combine.td b/llvm/lib/Target/AArch64/AArch64Combine.td
index 639ddcba28468..ecaeff77fcb4b 100644
--- a/llvm/lib/Target/AArch64/AArch64Combine.td
+++ b/llvm/lib/Target/AArch64/AArch64Combine.td
@@ -350,7 +350,7 @@ def AArch64PostLegalizerLowering
// Post-legalization combines which are primarily optimizations.
def AArch64PostLegalizerCombiner
: GICombiner<"AArch64PostLegalizerCombinerImpl",
- [copy_prop, cast_of_cast_combines,
+ [copy_prop, cast_of_cast_combines, constant_fold_fp_ops,
buildvector_of_truncate, integer_of_truncate,
mutate_anyext_to_zext, combines_for_extload,
combine_indexed_load_store, sext_trunc_sextload,
diff --git a/llvm/test/CodeGen/AArch64/arm64-indexed-memory.ll b/llvm/test/CodeGen/AArch64/arm64-indexed-memory.ll
index 322a96aca5db2..e8e563135acc5 100644
--- a/llvm/test/CodeGen/AArch64/arm64-indexed-memory.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-indexed-memory.ll
@@ -739,14 +739,12 @@ define ptr @postidx32_shalf(ptr %src, ptr %out, half %a) {
;
; GISEL-LABEL: postidx32_shalf:
; GISEL: ; %bb.0:
-; GISEL-NEXT: movi d1, #0000000000000000
-; GISEL-NEXT: ldr h2, [x0], #4
+; GISEL-NEXT: ldr h1, [x0], #4
; GISEL-NEXT: ; kill: def $h0 killed $h0 def $s0
; GISEL-NEXT: fmov w9, s0
-; GISEL-NEXT: fcvt s3, h2
-; GISEL-NEXT: fmov w8, s2
-; GISEL-NEXT: fcvt s1, h1
-; GISEL-NEXT: fcmp s3, s1
+; GISEL-NEXT: fcvt s2, h1
+; GISEL-NEXT: fmov w8, s1
+; GISEL-NEXT: fcmp s2, #0.0
; GISEL-NEXT: csel w8, w8, w9, mi
; GISEL-NEXT: strh w8, [x1]
; GISEL-NEXT: ret
diff --git a/llvm/test/CodeGen/AArch64/f16-instructions.ll b/llvm/test/CodeGen/AArch64/f16-instructions.ll
index b234ef7a5ff8b..085170c7ba381 100644
--- a/llvm/test/CodeGen/AArch64/f16-instructions.ll
+++ b/llvm/test/CodeGen/AArch64/f16-instructions.ll
@@ -782,18 +782,16 @@ define void @test_fccmp(half %in, ptr %out) {
;
; CHECK-CVT-GI-LABEL: test_fccmp:
; CHECK-CVT-GI: // %bb.0:
-; CHECK-CVT-GI-NEXT: adrp x8, .LCPI29_0
; CHECK-CVT-GI-NEXT: // kill: def $h0 killed $h0 def $s0
-; CHECK-CVT-GI-NEXT: fcvt s2, h0
-; CHECK-CVT-GI-NEXT: ldr h1, [x8, :lo12:.LCPI29_0]
-; CHECK-CVT-GI-NEXT: adrp x8, .LCPI29_1
-; CHECK-CVT-GI-NEXT: ldr h4, [x8, :lo12:.LCPI29_1]
+; CHECK-CVT-GI-NEXT: fcvt s1, h0
+; CHECK-CVT-GI-NEXT: fmov s2, #5.00000000
+; CHECK-CVT-GI-NEXT: adrp x8, .LCPI29_0
+; CHECK-CVT-GI-NEXT: fmov s3, #8.00000000
+; CHECK-CVT-GI-NEXT: fcmp s1, s2
+; CHECK-CVT-GI-NEXT: ldr h2, [x8, :lo12:.LCPI29_0]
; CHECK-CVT-GI-NEXT: fmov w8, s0
-; CHECK-CVT-GI-NEXT: fcvt s3, h1
-; CHECK-CVT-GI-NEXT: fmov w9, s1
-; CHECK-CVT-GI-NEXT: fcvt s4, h4
-; CHECK-CVT-GI-NEXT: fcmp s2, s3
-; CHECK-CVT-GI-NEXT: fccmp s2, s4, #4, mi
+; CHECK-CVT-GI-NEXT: fmov w9, s2
+; CHECK-CVT-GI-NEXT: fccmp s1, s3, #4, mi
; CHECK-CVT-GI-NEXT: csel w8, w8, w9, gt
; CHECK-CVT-GI-NEXT: strh w8, [x0]
; CHECK-CVT-GI-NEXT: ret
diff --git a/llvm/test/CodeGen/AArch64/fcvt-fixed.ll b/llvm/test/CodeGen/AArch64/fcvt-fixed.ll
index 7409bfb91454c..743d1604388de 100644
--- a/llvm/test/CodeGen/AArch64/fcvt-fixed.ll
+++ b/llvm/test/CodeGen/AArch64/fcvt-fixed.ll
@@ -149,33 +149,21 @@ define i64 @fcvtzs_f64_i64_64(double %dbl) {
}
define i32 @fcvtzs_f16_i32_7(half %flt) {
-; CHECK-SD-NO16-LABEL: fcvtzs_f16_i32_7:
-; CHECK-SD-NO16: // %bb.0:
-; CHECK-SD-NO16-NEXT: movi v1.2s, #67, lsl #24
-; CHECK-SD-NO16-NEXT: fcvt s0, h0
-; CHECK-SD-NO16-NEXT: fmul s0, s0, s1
-; CHECK-SD-NO16-NEXT: fcvt h0, s0
-; CHECK-SD-NO16-NEXT: fcvt s0, h0
-; CHECK-SD-NO16-NEXT: fcvtzs w0, s0
-; CHECK-SD-NO16-NEXT: ret
+; CHECK-NO16-LABEL: fcvtzs_f16_i32_7:
+; CHECK-NO16: // %bb.0:
+; CHECK-NO16-NEXT: movi v1.2s, #67, lsl #24
+; CHECK-NO16-NEXT: fcvt s0, h0
+; CHECK-NO16-NEXT: fmul s0, s0, s1
+; CHECK-NO16-NEXT: fcvt h0, s0
+; CHECK-NO16-NEXT: fcvt s0, h0
+; CHECK-NO16-NEXT: fcvtzs w0, s0
+; CHECK-NO16-NEXT: ret
;
; CHECK-SD-FP16-LABEL: fcvtzs_f16_i32_7:
; CHECK-SD-FP16: // %bb.0:
; CHECK-SD-FP16-NEXT: fcvtzs w0, h0, #7
; CHECK-SD-FP16-NEXT: ret
;
-; CHECK-GI-NO16-LABEL: fcvtzs_f16_i32_7:
-; CHECK-GI-NO16: // %bb.0:
-; CHECK-GI-NO16-NEXT: adrp x8, .LCPI8_0
-; CHECK-GI-NO16-NEXT: fcvt s0, h0
-; CHECK-GI-NO16-NEXT: ldr h1, [x8, :lo12:.LCPI8_0]
-; CHECK-GI-NO16-NEXT: fcvt s1, h1
-; CHECK-GI-NO16-NEXT: fmul s0, s0, s1
-; CHECK-GI-NO16-NEXT: fcvt h0, s0
-; CHECK-GI-NO16-NEXT: fcvt s0, h0
-; CHECK-GI-NO16-NEXT: fcvtzs w0, s0
-; CHECK-GI-NO16-NEXT: ret
-;
; CHECK-GI-FP16-LABEL: fcvtzs_f16_i32_7:
; CHECK-GI-FP16: // %bb.0:
; CHECK-GI-FP16-NEXT: adrp x8, .LCPI8_0
@@ -189,33 +177,21 @@ define i32 @fcvtzs_f16_i32_7(half %flt) {
}
define i32 @fcvtzs_f16_i32_15(half %flt) {
-; CHECK-SD-NO16-LABEL: fcvtzs_f16_i32_15:
-; CHECK-SD-NO16: // %bb.0:
-; CHECK-SD-NO16-NEXT: movi v1.2s, #71, lsl #24
-; CHECK-SD-NO16-NEXT: fcvt s0, h0
-; CHECK-SD-NO16-NEXT: fmul s0, s0, s1
-; CHECK-SD-NO16-NEXT: fcvt h0, s0
-; CHECK-SD-NO16-NEXT: fcvt s0, h0
-; CHECK-SD-NO16-NEXT: fcvtzs w0, s0
-; CHECK-SD-NO16-NEXT: ret
+; CHECK-NO16-LABEL: fcvtzs_f16_i32_15:
+; CHECK-NO16: // %bb.0:
+; CHECK-NO16-NEXT: movi v1.2s, #71, lsl #24
+; CHECK-NO16-NEXT: fcvt s0, h0
+; CHECK-NO16-NEXT: fmul s0, s0, s1
+; CHECK-NO16-NEXT: fcvt h0, s0
+; CHECK-NO16-NEXT: fcvt s0, h0
+; CHECK-NO16-NEXT: fcvtzs w0, s0
+; CHECK-NO16-NEXT: ret
;
; CHECK-SD-FP16-LABEL: fcvtzs_f16_i32_15:
; CHECK-SD-FP16: // %bb.0:
; CHECK-SD-FP16-NEXT: fcvtzs w0, h0, #15
; CHECK-SD-FP16-NEXT: ret
;
-; CHECK-GI-NO16-LABEL: fcvtzs_f16_i32_15:
-; CHECK-GI-NO16: // %bb.0:
-; CHECK-GI-NO16-NEXT: adrp x8, .LCPI9_0
-; CHECK-GI-NO16-NEXT: fcvt s0, h0
-; CHECK-GI-NO16-NEXT: ldr h1, [x8, :lo12:.LCPI9_0]
-; CHECK-GI-NO16-NEXT: fcvt s1, h1
-; CHECK-GI-NO16-NEXT: fmul s0, s0, s1
-; CHECK-GI-NO16-NEXT: fcvt h0, s0
-; CHECK-GI-NO16-NEXT: fcvt s0, h0
-; CHECK-GI-NO16-NEXT: fcvtzs w0, s0
-; CHECK-GI-NO16-NEXT: ret
-;
; CHECK-GI-FP16-LABEL: fcvtzs_f16_i32_15:
; CHECK-GI-FP16: // %bb.0:
; CHECK-GI-FP16-NEXT: adrp x8, .LCPI9_0
@@ -229,33 +205,21 @@ define i32 @fcvtzs_f16_i32_15(half %flt) {
}
define i64 @fcvtzs_f16_i64_7(half %flt) {
-; CHECK-SD-NO16-LABEL: fcvtzs_f16_i64_7:
-; CHECK-SD-NO16: // %bb.0:
-; CHECK-SD-NO16-NEXT: movi v1.2s, #67, lsl #24
-; CHECK-SD-NO16-NEXT: fcvt s0, h0
-; CHECK-SD-NO16-NEXT: fmul s0, s0, s1
-; CHECK-SD-NO16-NEXT: fcvt h0, s0
-; CHECK-SD-NO16-NEXT: fcvt s0, h0
-; CHECK-SD-NO16-NEXT: fcvtzs x0, s0
-; CHECK-SD-NO16-NEXT: ret
+; CHECK-NO16-LABEL: fcvtzs_f16_i64_7:
+; CHECK-NO16: // %bb.0:
+; CHECK-NO16-NEXT: movi v1.2s, #67, lsl #24
+; CHECK-NO16-NEXT: fcvt s0, h0
+; CHECK-NO16-NEXT: fmul s0, s0, s1
+; CHECK-NO16-NEXT: fcvt h0, s0
+; CHECK-NO16-NEXT: fcvt s0, h0
+; CHECK-NO16-NEXT: fcvtzs x0, s0
+; CHECK-NO16-NEXT: ret
;
; CHECK-SD-FP16-LABEL: fcvtzs_f16_i64_7:
; CHECK-SD-FP16: // %bb.0:
; CHECK-SD-FP16-NEXT: fcvtzs x0, h0, #7
; CHECK-SD-FP16-NEXT: ret
;
-; CHECK-GI-NO16-LABEL: fcvtzs_f16_i64_7:
-; CHECK-GI-NO16: // %bb.0:
-; CHECK-GI-NO16-NEXT: adrp x8, .LCPI10_0
-; CHECK-GI-NO16-NEXT: fcvt s0, h0
-; CHECK-GI-NO16-NEXT: ldr h1, [x8, :lo12:.LCPI10_0]
-; CHECK-GI-NO16-NEXT: fcvt s1, h1
-; CHECK-GI-NO16-NEXT: fmul s0, s0, s1
-; CHECK-GI-NO16-NEXT: fcvt h0, s0
-; CHECK-GI-NO16-NEXT: fcvt s0, h0
-; CHECK-GI-NO16-NEXT: fcvtzs x0, s0
-; CHECK-GI-NO16-NEXT: ret
-;
; CHECK-GI-FP16-LABEL: fcvtzs_f16_i64_7:
; CHECK-GI-FP16: // %bb.0:
; CHECK-GI-FP16-NEXT: adrp x8, .LCPI10_0
@@ -269,33 +233,21 @@ define i64 @fcvtzs_f16_i64_7(half %flt) {
}
define i64 @fcvtzs_f16_i64_15(half %flt) {
-; CHECK-SD-NO16-LABEL: fcvtzs_f16_i64_15:
-; CHECK-SD-NO16: // %bb.0:
-; CHECK-SD-NO16-NEXT: movi v1.2s, #71, lsl #24
-; CHECK-SD-NO16-NEXT: fcvt s0, h0
-; CHECK-SD-NO16-NEXT: fmul s0, s0, s1
-; CHECK-SD-NO16-NEXT: fcvt h0, s0
-; CHECK-SD-NO16-NEXT: fcvt s0, h0
-; CHECK-SD-NO16-NEXT: fcvtzs x0, s0
-; CHECK-SD-NO16-NEXT: ret
+; CHECK-NO16-LABEL: fcvtzs_f16_i64_15:
+; CHECK-NO16: // %bb.0:
+; CHECK-NO16-NEXT: movi v1.2s, #71, lsl #24
+; CHECK-NO16-NEXT: fcvt s0, h0
+; CHECK-NO16-NEXT: fmul s0, s0, s1
+; CHECK-NO16-NEXT: fcvt h0, s0
+; CHECK-NO16-NEXT: fcvt s0, h0
+; CHECK-NO16-NEXT: fcvtzs x0, s0
+; CHECK-NO16-NEXT: ret
;
; CHECK-SD-FP16-LABEL: fcvtzs_f16_i64_15:
; CHECK-SD-FP16: // %bb.0:
; CHECK-SD-FP16-NEXT: fcvtzs x0, h0, #15
; CHECK-SD-FP16-NEXT: ret
;
-; CHECK-GI-NO16-LABEL: fcvtzs_f16_i64_15:
-; CHECK-GI-NO16: // %bb.0:
-; CHECK-GI-NO16-NEXT: adrp x8, .LCPI11_0
-; CHECK-GI-NO16-NEXT: fcvt s0, h0
-; CHECK-GI-NO16-NEXT: ldr h1, [x8, :lo12:.LCPI11_0]
-; CHECK-GI-NO16-NEXT: fcvt s1, h1
-; CHECK-GI-NO16-NEXT: fmul s0, s0, s1
-; CHECK-GI-NO16-NEXT: fcvt h0, s0
-; CHECK-GI-NO16-NEXT: fcvt s0, h0
-; CHECK-GI-NO16-NEXT: fcvtzs x0, s0
-; CHECK-GI-NO16-NEXT: ret
-;
; CHECK-GI-FP16-LABEL: fcvtzs_f16_i64_15:
; CHECK-GI-FP16: // %bb.0:
; CHECK-GI-FP16-NEXT: adrp x8, .LCPI11_0
@@ -453,33 +405,21 @@ define i64 @fcvtzu_f64_i64_64(double %dbl) {
}
define i32 @fcvtzu_f16_i32_7(half %flt) {
-; CHECK-SD-NO16-LABEL: fcvtzu_f16_i32_7:
-; CHECK-SD-NO16: // %bb.0:
-; CHECK-SD-NO16-NEXT: movi v1.2s, #67, lsl #24
-; CHECK-SD-NO16-NEXT: fcvt s0, h0
-; CHECK-SD-NO16-NEXT: fmul s0, s0, s1
-; CHECK-SD-NO16-NEXT: fcvt h0, s0
-; CHECK-SD-NO16-NEXT: fcvt s0, h0
-; CHECK-SD-NO16-NEXT: fcvtzu w0, s0
-; CHECK-SD-NO16-NEXT: ret
+; CHECK-NO16-LABEL: fcvtzu_f16_i32_7:
+; CHECK-NO16: // %bb.0:
+; CHECK-NO16-NEXT: movi v1.2s, #67, lsl #24
+; CHECK-NO16-NEXT: fcvt s0, h0
+; CHECK-NO16-NEXT: fmul s0, s0, s1
+; CHECK-NO16-NEXT: fcvt h0, s0
+; CHECK-NO16-NEXT: fcvt s0, h0
+; CHECK-NO16-NEXT: fcvtzu w0, s0
+; CHECK-NO16-NEXT: ret
;
; CHECK-SD-FP16-LABEL: fcvtzu_f16_i32_7:
; CHECK-SD-FP16: // %bb.0:
; CHECK-SD-FP16-NEXT: fcvtzu w0, h0, #7
; CHECK-SD-FP16-NEXT: ret
;
-; CHECK-GI-NO16-LABEL: fcvtzu_f16_i32_7:
-; CHECK-GI-NO16: // %bb.0:
-; CHECK-GI-NO16-NEXT: adrp x8, .LCPI20_0
-; CHECK-GI-NO16-NEXT: fcvt s0, h0
-; CHECK-GI-NO16-NEXT: ldr h1, [x8, :lo12:.LCPI20_0]
-; CHECK-GI-NO16-NEXT: fcvt s1, h1
-; CHECK-GI-NO16-NEXT: fmul s0, s0, s1
-; CHECK-GI-NO16-NEXT: fcvt h0, s0
-; CHECK-GI-NO16-NEXT: fcvt s0, h0
-; CHECK-GI-NO16-NEXT: fcvtzu w0, s0
-; CHECK-GI-NO16-NEXT: ret
-;
; CHECK-GI-FP16-LABEL: fcvtzu_f16_i32_7:
; CHECK-GI-FP16: // %bb.0:
; CHECK-GI-FP16-NEXT: adrp x8, .LCPI20_0
@@ -493,33 +433,21 @@ define i32 @fcvtzu_f16_i32_7(half %flt) {
}
define i32 @fcvtzu_f16_i32_15(half %flt) {
-; CHECK-SD-NO16-LABEL: fcvtzu_f16_i32_15:
-; CHECK-SD-NO16: // %bb.0:
-; CHECK-SD-NO16-NEXT: movi v1.2s, #71, lsl #24
-; CHECK-SD-NO16-NEXT: fcvt s0, h0
-; CHECK-SD-NO16-NEXT: fmul s0, s0, s1
-; CHECK-SD-NO16-NEXT: fcvt h0, s0
-; CHECK-SD-NO16-NEXT: fcvt s0, h0
-; CHECK-SD-NO16-NEXT: fcvtzu w0, s0
-; CHECK-SD-NO16-NEXT: ret
+; CHECK-NO16-LABEL: fcvtzu_f16_i32_15:
+; CHECK-NO16: // %bb.0:
+; CHECK-NO16-NEXT: movi v1.2s, #71, lsl #24
+; CHECK-NO16-NEXT: fcvt s0, h0
+; CHECK-NO16-NEXT: fmul s0, s0, s1
+; CHECK-NO16-NEXT: fcvt h0, s0
+; CHECK-NO16-NEXT: fcvt s0, h0
+; CHECK-NO16-NEXT: fcvtzu w0, s0
+; CHECK-NO16-NEXT: ret
;
; CHECK-SD-FP16-LABEL: fcvtzu_f16_i32_15:
; CHECK-SD-FP16: // %bb.0:
; CHECK-SD-FP16-NEXT: fcvtzu w0, h0, #15
; CHECK-SD-FP16-NEXT: ret
;
-; CHECK-GI-NO16-LABEL: fcvtzu_f16_i32_15:
-; CHECK-GI-NO16: // %bb.0:
-; CHECK-GI-NO16-NEXT: adrp x8, .LCPI21_0
-; CHECK-GI-NO16-NEXT: fcvt s0, h0
-; CHECK-GI-NO16-NEXT: ldr h1, [x8, :lo12:.LCPI21_0]
-; CHECK-GI-NO16-NEXT: fcvt s1, h1
-; CHECK-GI-NO16-NEXT: fmul s0, s0, s1
-; CHECK-GI-NO16-NEXT: fcvt h0, s0
-; CHECK-GI-NO16-NEXT: fcvt s0, h0
-; CHECK-GI-NO16-NEXT: fcvtzu w0, s0
-; CHECK-GI-NO16-NEXT: ret
-;
; CHECK-GI-FP16-LABEL: fcvtzu_f16_i32_15:
; CHECK-GI-FP16: // %bb.0:
; CHECK-GI-FP16-NEXT: adrp x8, .LCPI21_0
@@ -533,33 +461,21 @@ define i32 @fcvtzu_f16_i32_15(half %flt) {
}
define i64 @fcvtzu_f16_i64_7(half %flt) {
-; CHECK-SD-NO16-LABEL: fcvtzu_f16_i64_7:
-; CHECK-SD-NO16: // %bb.0:
-; CHECK-SD-NO16-NEXT: movi v1.2s, #67, lsl #24
-; CHECK-SD-NO16-NEXT: fcvt s0, h0
-; CHECK-SD-NO16-NEXT: fmul s0, s0, s1
-; CHECK-SD-NO16-NEXT: fcvt h0, s0
-; CHECK-SD-NO16-NEXT: fcvt s0, h0
-; CHECK-SD-NO16-NEXT: fcvtzu x0, s0
-; CHECK-SD-NO16-NEXT: ret
+; CHECK-NO16-LABEL: fcvtzu_f16_i64_7:
+; CHECK-NO16: // %bb.0:
+; CHECK-NO16-NEXT: movi v1.2s, #67, lsl #24
+; CHECK-NO16-NEXT: fcvt s0, h0
+; CHECK-NO16-NEXT: fmul s0, s0, s1
+; CHECK-NO16-NEXT: fcvt h0, s0
+; CHECK-NO16-NEXT: fcvt s0, h0
+; CHECK-NO16-NEXT: fcvtzu x0, s0
+; CHECK-NO16-NEXT: ret
;
; CHECK-SD-FP16-LABEL: fcvtzu_f16_i64_7:
; CHECK-SD-FP16: // %bb.0:
; CHECK-SD-FP16-NEXT: fcvtzu x0, h0, #7
; CHECK-SD-FP16-NEXT: ret
;
-; CHECK-GI-NO16-LABEL: fcvtzu_f16_i64_7:
-; CHECK-GI-NO16: // %bb.0:
-; CHECK-GI-NO16-NEXT: adrp x8, .LCPI22_0
-; CHECK-GI-NO16-NEXT: fcvt s0, h0
-; CHECK-GI-NO16-NEXT: ldr h1, [x8, :lo12:.LCPI22_0]
-; CHECK-GI-NO16-NEXT: fcvt s1, h1
-; CHECK-GI-NO16-NEXT: fmul s0, s0, s1
-; CHECK-GI-NO16-NEXT: fcvt h0, s0
-; CHECK-GI-NO16-NEXT: fcvt s0, h0
-; CHECK-GI-NO16-NEXT: fcvtzu x0, s0
-; CHECK-GI-NO16-NEXT: ret
-;
; CHECK-GI-FP16-LABEL: fcvtzu_f16_i64_7:
; CHECK-GI-FP16: // %bb.0:
; CHECK-GI-FP16-NEXT: adrp x8, .LCPI22_0
@@ -573,33 +489,21 @@ define i64 @fcvtzu_f16_i64_7(half %flt) {
}
define i64 @fcvtzu_f16_i64_15(half %flt) {
-; CHECK-SD-NO16-LABEL: fcvtzu_f16_i64_15:
-; CHECK-SD-NO16: // %bb.0:
-; CHECK-SD-NO16-NEXT: movi v1.2s, #71, lsl #24
-; CHECK-SD-NO16-NEXT: fcvt s0, h0
-; CHECK-SD-NO16-NEXT: fmul s0, s0, s1
-; CHECK-SD-NO16-NEXT: fcvt h0, s0
-; CHECK-SD-NO16-NEXT: fcvt s0, h0
-; CHECK-SD-NO16-NEXT: fcvtzu x0, s0
-; CHECK-SD-NO16-NEXT: ret
+; CHECK-NO16-LABEL: fcvtzu_f16_i64_15:
+; CHECK-NO16: // %bb.0:
+; CHECK-NO16-NEXT: movi v1.2s, #71, lsl #24
+; CHECK-NO16-NEXT: fcvt s0, h0
+; CHECK-NO16-NEXT: fmul s0, s0, s1
+; CHECK-NO16-NEXT: fcvt h0, s0
+; CHECK-NO16-NEXT: fcvt s0, h0
+; CHECK-NO16-NEXT: fcvtzu x0, s0
+; CHECK-NO16-NEXT: ret
;
; CHECK-SD-FP16-LABEL: fcvtzu_f16_i64_15:
; CHECK-SD-FP16: // %bb.0:
; CHECK-SD-FP16-NEXT: fcvtzu x0, h0, #15
; CHECK-SD-FP16-NEXT: ret
;
-; CHECK-GI-NO16-LABEL: fcvtzu_f16_i64_15:
-; CHECK-GI-NO16: // %bb.0:
-; CHECK-GI-NO16-NEXT: adrp x8, .LCPI23_0
-; CHECK-GI-NO16-NEXT: fcvt s0, h0
-; CHECK-GI-NO16-NEXT: ldr h1, [x8, :lo12:.LCPI23_0]
-; CHECK-GI-NO16-NEXT: fcvt s1, h1
-; CHECK-GI-NO16-NEXT: fmul s0, s0, s1
-; CHECK-GI-NO16-NEXT: fcvt h0, s0
-; CHECK-GI-NO16-NEXT: fcvt s0, h0
-; CHECK-GI-NO16-NEXT: fcvtzu x0, s0
-; CHECK-GI-NO16-NEXT: ret
-;
; CHECK-GI-FP16-LABEL: fcvtzu_f16_i64_15:
; CHECK-GI-FP16: // %bb.0:
; CHECK-GI-FP16-NEXT: adrp x8, .LCPI23_0
@@ -774,13 +678,11 @@ define half @scvtf_f16_i32_7(i32 %int) {
;
; CHECK-GI-NO16-LABEL: scvtf_f16_i32_7:
; CHECK-GI-NO16: // %bb.0:
-; CHECK-GI-NO16-NEXT: scvtf s0, w0
-; CHECK-GI-NO16-NEXT: adrp x8, .LCPI32_0
-; CHECK-GI-NO16-NEXT: ldr h1, [x8, :lo12:.LCPI32_0]
+; CHECK-GI-NO16-NEXT: scvtf s1, w0
+; CHECK-GI-NO16-NEXT: movi v0.2s, #67, lsl #24
+; CHECK-GI-NO16-NEXT: fcvt h1, s1
; CHECK-GI-NO16-NEXT: fcvt s1, h1
-; CHECK-GI-NO16-NEXT: fcvt h0, s0
-; CHECK-GI-NO16-NEXT: fcvt s0, h0
-; CHECK-GI-NO16-NEXT: fdiv s0, s0, s1
+; CHECK-GI-NO16-NEXT: fdiv s0, s1, s0
; CHECK-GI-NO16-NEXT: fcvt h0, s0
; CHECK-GI-NO16-NEXT: ret
;
@@ -814,13 +716,11 @@ define half @scvtf_f16_i32_15(i32 %int) {
;
; CHECK-GI-NO16-LABEL: scvtf_f16_i32_15:
; CHECK-GI-NO16: // %bb.0:
-; CHECK-GI-NO16-NEXT: scvtf s0, w0
-; CHECK-GI-NO16-NEXT: adrp x8, .LCPI33_0
-; CHECK-GI-NO16-NEXT: ldr h1, [x8, :lo12:.LCPI33_0]
+; CHECK-GI-NO16-NEXT: scvtf s1, w0
+; CHECK-GI-NO16-NEXT: movi v0.2s, #71, lsl #24
+; CHECK-GI-NO16-NEXT: fcvt h1, s1
; CHECK-GI-NO16-NEXT: fcvt s1, h1
-; CHECK-GI-NO16-NEXT: fcvt h0, s0
-; CHECK-GI-NO16-NEXT: fcvt s0, h0
-; CHECK-GI-NO16-NEXT: fdiv s0, s0, s1
+; CHECK-GI-NO16-NEXT: fdiv s0, s1, s0
; CHECK-GI-NO16-NEXT: fcvt h0, s0
; CHECK-GI-NO16-NEXT: ret
;
@@ -854,13 +754,11 @@ define half @scvtf_f16_i64_7(i64 %long) {
;
; CHECK-GI-NO16-LABEL: scvtf_f16_i64_7:
; CHECK-GI-NO16: // %bb.0:
-; CHECK-GI-NO16-NEXT: scvtf s0, x0
-; CHECK-GI-NO16-NEXT: adrp x8, .LCPI34_0
-; CHECK-GI-NO16-NEXT: ldr h1, [x8, :lo12:.LCPI34_0]
+; CHECK-GI-NO16-NEXT: scvtf s1, x0
+; CHECK-GI-NO16-NEXT: movi v0.2s, #67, lsl #24
+; CHECK-GI-NO16-NEXT: fcvt h1, s1
; CHECK-GI-NO16-NEXT: fcvt s1, h1
-; CHECK-GI-NO16-NEXT: fcvt h0, s0
-; CHECK-GI-NO16-NEXT: fcvt s0, h0
-; CHECK-GI-NO16-NEXT: fdiv s0, s0, s1
+; CHECK-GI-NO16-NEXT: fdiv s0, s1, s0
; CHECK-GI-NO16-NEXT: fcvt h0, s0
; CHECK-GI-NO16-NEXT: ret
;
@@ -894,13 +792,11 @@ define half @scvtf_f16_i64_15(i64 %long) {
;
; CHECK-GI-NO16-LABEL: scvtf_f16_i64_15:
; CHECK-GI-NO16: // %bb.0:
-; CHECK-GI-NO16-NEXT: scvtf s0, x0
-; CHECK-GI-NO16-NEXT: adrp x8, .LCPI35_0
-; CHECK-GI-NO16-NEXT: ldr h1, [x8, :lo12:.LCPI35_0]
+; CHECK-GI-NO16-NEXT: scvtf s1, x0
+; CHECK-GI-NO16-NEXT: movi v0.2s, #71, lsl #24
+; CHECK-GI-NO16-NEXT: fcvt h1, s1
; CHECK-GI-NO16-NEXT: fcvt s1, h1
-; CHECK-GI-NO16-NEXT: fcvt h0, s0
-; CHECK-GI-NO16-NEXT: fcvt s0, h0
-; CHECK-GI-NO16-NEXT: fdiv s0, s0, s1
+; CHECK-GI-NO16-NEXT: fdiv s0, s1, s0
; CHECK-GI-NO16-NEXT: fcvt h0, s0
; CHECK-GI-NO16-NEXT: ret
;
@@ -1078,13 +974,11 @@ define half @ucvtf_f16_i32_7(i32 %int) {
;
; CHECK-GI-NO16-LABEL: ucvtf_f16_i32_7:
; CHECK-GI-NO16: // %bb.0:
-; CHECK-GI-NO16-NEXT: ucvtf s0, w0
-; CHECK-GI-NO16-NEXT: adrp x8, .LCPI44_0
-; CHECK-GI-NO16-NEXT: ldr h1, [x8, :lo12:.LCPI44_0]
+; CHECK-GI-NO16-NEXT: ucvtf s1, w0
+; CHECK-GI-NO16-NEXT: movi v0.2s, #67, lsl #24
+; CHECK-GI-NO16-NEXT: fcvt h1, s1
; CHECK-GI-NO16-NEXT: fcvt s1, h1
-; CHECK-GI-NO16-NEXT: fcvt h0, s0
-; CHECK-GI-NO16-NEXT: fcvt s0, h0
-; CHECK-GI-NO16-NEXT: fdiv s0, s0, s1
+; CHECK-GI-NO16-NEXT: fdiv s0, s1, s0
; CHECK-GI-NO16-NEXT: fcvt h0, s0
; CHECK-GI-NO16-NEXT: ret
;
@@ -1118,13 +1012,11 @@ define half @ucvtf_f16_i32_15(i32 %int) {
;
; CHECK-GI-NO16-LABEL: ucvtf_f16_i32_15:
; CHECK-GI-NO16: // %bb.0:
-; CHECK-GI-NO16-NEXT: ucvtf s0, w0
-; CHECK-GI-NO16-NEXT: adrp x8, .LCPI45_0
-; CHECK-GI-NO16-NEXT: ldr h1, [x8, :lo12:.LCPI45_0]
+; CHECK-GI-NO16-NEXT: ucvtf s1, w0
+; CHECK-GI-NO16-NEXT: movi v0.2s, #71, lsl #24
+; CHECK-GI-NO16-NEXT: fcvt h1, s1
; CHECK-GI-NO16-NEXT: fcvt s1, h1
-; CHECK-GI-NO16-NEXT: fcvt h0, s0
-; CHECK-GI-NO16-NEXT: fcvt s0, h0
-; CHECK-GI-NO16-NEXT: fdiv s0, s0, s1
+; CHECK-GI-NO16-NEXT: fdiv s0, s1, s0
; CHECK-GI-NO16-NEXT: fcvt h0, s0
; CHECK-GI-NO16-NEXT: ret
;
@@ -1158,13 +1050,11 @@ define half @ucvtf_f16_i64_7(i64 %long) {
;
; CHECK-GI-NO16-LABEL: ucvtf_f16_i64_7:
; CHECK-GI-NO16: // %bb.0:
-; CHECK-GI-NO16-NEXT: ucvtf s0, x0
-; CHECK-GI-NO16-NEXT: adrp x8, .LCPI46_0
-; CHECK-GI-NO16-NEXT: ldr h1, [x8, :lo12:.LCPI46_0]
+; CHECK-GI-NO16-NEXT: ucvtf s1, x0
+; CHECK-GI-NO16-NEXT: movi v0.2s, #67, lsl #24
+; CHECK-GI-NO16-NEXT: fcvt h1, s1
; CHECK-GI-NO16-NEXT: fcvt s1, h1
-; CHECK-GI-NO16-NEXT: fcvt h0, s0
-; CHECK-GI-NO16-NEXT: fcvt s0, h0
-; CHECK-GI-NO16-NEXT: fdiv s0, s0, s1
+; CHECK-GI-NO16-NEXT: fdiv s0, s1, s0
; CHECK-GI-NO16-NEXT: fcvt h0, s0
; CHECK-GI-NO16-NEXT: ret
;
@@ -1198,13 +1088,11 @@ define half @ucvtf_f16_i64_15(i64 %long) {
;
; CHECK-GI-NO16-LABEL: ucvtf_f16_i64_15:
; CHECK-GI-NO16: // %bb.0:
-; CHECK-GI-NO16-NEXT: ucvtf s0, x0
-; CHECK-GI-NO16-NEXT: adrp x8, .LCPI47_0
-; CHECK-GI-NO16-NEXT: ldr h1, [x8, :lo12:.LCPI47_0]
+; CHECK-GI-NO16-NEXT: ucvtf s1, x0
+; CHECK-GI-NO16-NEXT: movi v0.2s, #71, lsl #24
+; CHECK-GI-NO16-NEXT: fcvt h1, s1
; CHECK-GI-NO16-NEXT: fcvt s1, h1
-; CHECK-GI-NO16-NEXT: fcvt h0, s0
-; CHECK-GI-NO16-NEXT: fcvt s0, h0
-; CHECK-GI-NO16-NEXT: fdiv s0, s0, s1
+; CHECK-GI-NO16-NEXT: fdiv s0, s1, s0
; CHECK-GI-NO16-NEXT: fcvt h0, s0
; CHECK-GI-NO16-NEXT: ret
;
@@ -1356,33 +1244,21 @@ define i64 @fcvtzs_sat_f64_i64_64(double %dbl) {
}
define i32 @fcvtzs_sat_f16_i32_7(half %dbl) {
-; CHECK-SD-NO16-LABEL: fcvtzs_sat_f16_i32_7:
-; CHECK-SD-NO16: // %bb.0:
-; CHECK-SD-NO16-NEXT: movi v1.2s, #67, lsl #24
-; CHECK-SD-NO16-NEXT: fcvt s0, h0
-; CHECK-SD-NO16-NEXT: fmul s0, s0, s1
-; CHECK-SD-NO16-NEXT: fcvt h0, s0
-; CHECK-SD-NO16-NEXT: fcvt s0, h0
-; CHECK-SD-NO16-NEXT: fcvtzs w0, s0
-; CHECK-SD-NO16-NEXT: ret
+; CHECK-NO16-LABEL: fcvtzs_sat_f16_i32_7:
+; CHECK-NO16: // %bb.0:
+; CHECK-NO16-NEXT: movi v1.2s, #67, lsl #24
+; CHECK-NO16-NEXT: fcvt s0, h0
+; CHECK-NO16-NEXT: fmul s0, s0, s1
+; CHECK-NO16-NEXT: fcvt h0, s0
+; CHECK-NO16-NEXT: fcvt s0, h0
+; CHECK-NO16-NEXT: fcvtzs w0, s0
+; CHECK-NO16-NEXT: ret
;
; CHECK-SD-FP16-LABEL: fcvtzs_sat_f16_i32_7:
; CHECK-SD-FP16: // %bb.0:
; CHECK-SD-FP16-NEXT: fcvtzs w0, h0, #7
; CHECK-SD-FP16-NEXT: ret
;
-; CHECK-GI-NO16-LABEL: fcvtzs_sat_f16_i32_7:
-; CHECK-GI-NO16: // %bb.0:
-; CHECK-GI-NO16-NEXT: adrp x8, .LCPI55_0
-; CHECK-GI-NO16-NEXT: fcvt s0, h0
-; CHECK-GI-NO16-NEXT: ldr h1, [x8, :lo12:.LCPI55_0]
-; CHECK-GI-NO16-NEXT: fcvt s1, h1
-; CHECK-GI-NO16-NEXT: fmul s0, s0, s1
-; CHECK-GI-NO16-NEXT: fcvt h0, s0
-; CHECK-GI-NO16-NEXT: fcvt s0, h0
-; CHECK-GI-NO16-NEXT: fcvtzs w0, s0
-; CHECK-GI-NO16-NEXT: ret
-;
; CHECK-GI-FP16-LABEL: fcvtzs_sat_f16_i32_7:
; CHECK-GI-FP16: // %bb.0:
; CHECK-GI-FP16-NEXT: adrp x8, .LCPI55_0
@@ -1396,33 +1272,21 @@ define i32 @fcvtzs_sat_f16_i32_7(half %dbl) {
}
define i32 @fcvtzs_sat_f16_i32_15(half %dbl) {
-; CHECK-SD-NO16-LABEL: fcvtzs_sat_f16_i32_15:
-; CHECK-SD-NO16: // %bb.0:
-; CHECK-SD-NO16-NEXT: movi v1.2s, #71, lsl #24
-; CHECK-SD-NO16-NEXT: fcvt s0, h0
-; CHECK-SD-NO16-NEXT: fmul s0, s0, s1
-; CHECK-SD-NO16-NEXT: fcvt h0, s0
-; CHECK-SD-NO16-NEXT: fcvt s0, h0
-; CHECK-SD-NO16-NEXT: fcvtzs w0, s0
-; CHECK-SD-NO16-NEXT: ret
+; CHECK-NO16-LABEL: fcvtzs_sat_f16_i32_15:
+; CHECK-NO16: // %bb.0:
+; CHECK-NO16-NEXT: movi v1.2s, #71, lsl #24
+; CHECK-NO16-NEXT: fcvt s0, h0
+; CHECK-NO16-NEXT: fmul s0, s0, s1
+; CHECK-NO16-NEXT: fcvt h0, s0
+; CHECK-NO16-NEXT: fcvt s0, h0
+; CHECK-NO16-NEXT: fcvtzs w0, s0
+; CHECK-NO16-NEXT: ret
;
; CHECK-SD-FP16-LABEL: fcvtzs_sat_f16_i32_15:
; CHECK-SD-FP16: // %bb.0:
; CHECK-SD-FP16-NEXT: fcvtzs w0, h0, #15
; CHECK-SD-FP16-NEXT: ret
;
-; CHECK-GI-NO16-LABEL: fcvtzs_sat_f16_i32_15:
-; CHECK-GI-NO16: // %bb.0:
-; CHECK-GI-NO16-NEXT: adrp x8, .LCPI56_0
-; CHECK-GI-NO16-NEXT: fcvt s0, h0
-; CHECK-GI-NO16-NEXT: ldr h1, [x8, :lo12:.LCPI56_0]
-; CHECK-GI-NO16-NEXT: fcvt s1, h1
-; CHECK-GI-NO16-NEXT: fmul s0, s0, s1
-; CHECK-GI-NO16-NEXT: fcvt h0, s0
-; CHECK-GI-NO16-NEXT: fcvt s0, h0
-; CHECK-GI-NO16-NEXT: fcvtzs w0, s0
-; CHECK-GI-NO16-NEXT: ret
-;
; CHECK-GI-FP16-LABEL: fcvtzs_sat_f16_i32_15:
; CHECK-GI-FP16: // %bb.0:
; CHECK-GI-FP16-NEXT: adrp x8, .LCPI56_0
@@ -1436,33 +1300,21 @@ define i32 @fcvtzs_sat_f16_i32_15(half %dbl) {
}
define i64 @fcvtzs_sat_f16_i64_7(half %dbl) {
-; CHECK-SD-NO16-LABEL: fcvtzs_sat_f16_i64_7:
-; CHECK-SD-NO16: // %bb.0:
-; CHECK-SD-NO16-NEXT: movi v1.2s, #67, lsl #24
-; CHECK-SD-NO16-NEXT: fcvt s0, h0
-; CHECK-SD-NO16-NEXT: fmul s0, s0, s1
-; CHECK-SD-NO16-NEXT: fcvt h0, s0
-; CHECK-SD-NO16-NEXT: fcvt s0, h0
-; CHECK-SD-NO16-NEXT: fcvtzs x0, s0
-; CHECK-SD-NO16-NEXT: ret
+; CHECK-NO16-LABEL: fcvtzs_sat_f16_i64_7:
+; CHECK-NO16: // %bb.0:
+; CHECK-NO16-NEXT: movi v1.2s, #67, lsl #24
+; CHECK-NO16-NEXT: fcvt s0, h0
+; CHECK-NO16-NEXT: fmul s0, s0, s1
+; CHECK-NO16-NEXT: fcvt h0, s0
+; CHECK-NO16-NEXT: fcvt s0, h0
+; CHECK-NO16-NEXT: fcvtzs x0, s0
+; CHECK-NO16-NEXT: ret
;
; CHECK-SD-FP16-LABEL: fcvtzs_sat_f16_i64_7:
; CHECK-SD-FP16: // %bb.0:
; CHECK-SD-FP16-NEXT: fcvtzs x0, h0, #7
; CHECK-SD-FP16-NEXT: ret
;
-; CHECK-GI-NO16-LABEL: fcvtzs_sat_f16_i64_7:
-; CHECK-GI-NO16: // %bb.0:
-; CHECK-GI-NO16-NEXT: adrp x8, .LCPI57_0
-; CHECK-GI-NO16-NEXT: fcvt s0, h0
-; CHECK-GI-NO16-NEXT: ldr h1, [x8, :lo12:.LCPI57_0]
-; CHECK-GI-NO16-NEXT: fcvt s1, h1
-; CHECK-GI-NO16-NEXT: fmul s0, s0, s1
-; CHECK-GI-NO16-NEXT: fcvt h0, s0
-; CHECK-GI-NO16-NEXT: fcvt s0, h0
-; CHECK-GI-NO16-NEXT: fcvtzs x0, s0
-; CHECK-GI-NO16-NEXT: ret
-;
; CHECK-GI-FP16-LABEL: fcvtzs_sat_f16_i64_7:
; CHECK-GI-FP16: // %bb.0:
; CHECK-GI-FP16-NEXT: adrp x8, .LCPI57_0
@@ -1476,33 +1328,21 @@ define i64 @fcvtzs_sat_f16_i64_7(half %dbl) {
}
define i64 @fcvtzs_sat_f16_i64_15(half %dbl) {
-; CHECK-SD-NO16-LABEL: fcvtzs_sat_f16_i64_15:
-; CHECK-SD-NO16: // %bb.0:
-; CHECK-SD-NO16-NEXT: movi v1.2s, #71, lsl #24
-; CHECK-SD-NO16-NEXT: fcvt s0, h0
-; CHECK-SD-NO16-NEXT: fmul s0, s0, s1
-; CHECK-SD-NO16-NEXT: fcvt h0, s0
-; CHECK-SD-NO16-NEXT: fcvt s0, h0
-; CHECK-SD-NO16-NEXT: fcvtzs x0, s0
-; CHECK-SD-NO16-NEXT: ret
+; CHECK-NO16-LABEL: fcvtzs_sat_f16_i64_15:
+; CHECK-NO16: // %bb.0:
+; CHECK-NO16-NEXT: movi v1.2s, #71, lsl #24
+; CHECK-NO16-NEXT: fcvt s0, h0
+; CHECK-NO16-NEXT: fmul s0, s0, s1
+; CHECK-NO16-NEXT: fcvt h0, s0
+; CHECK-NO16-NEXT: fcvt s0, h0
+; CHECK-NO16-NEXT: fcvtzs x0, s0
+; CHECK-NO16-NEXT: ret
;
; CHECK-SD-FP16-LABEL: fcvtzs_sat_f16_i64_15:
; CHECK-SD-FP16: // %bb.0:
; CHECK-SD-FP16-NEXT: fcvtzs x0, h0, #15
; CHECK-SD-FP16-NEXT: ret
;
-; CHECK-GI-NO16-LABEL: fcvtzs_sat_f16_i64_15:
-; CHECK-GI-NO16: // %bb.0:
-; CHECK-GI-NO16-NEXT: adrp x8, .LCPI58_0
-; CHECK-GI-NO16-NEXT: fcvt s0, h0
-; CHECK-GI-NO16-NEXT: ldr h1, [x8, :lo12:.LCPI58_0]
-; CHECK-GI-NO16-NEXT: fcvt s1, h1
-; CHECK-GI-NO16-NEXT: fmul s0, s0, s1
-; CHECK-GI-NO16-NEXT: fcvt h0, s0
-; CHECK-GI-NO16-NEXT: fcvt s0, h0
-; CHECK-GI-NO16-NEXT: fcvtzs x0, s0
-; CHECK-GI-NO16-NEXT: ret
-;
; CHECK-GI-FP16-LABEL: fcvtzs_sat_f16_i64_15:
; CHECK-GI-FP16: // %bb.0:
; CHECK-GI-FP16-NEXT: adrp x8, .LCPI58_0
@@ -1650,33 +1490,21 @@ define i64 @fcvtzu_sat_f64_i64_64(double %dbl) {
}
define i32 @fcvtzu_sat_f16_i32_7(half %dbl) {
-; CHECK-SD-NO16-LABEL: fcvtzu_sat_f16_i32_7:
-; CHECK-SD-NO16: // %bb.0:
-; CHECK-SD-NO16-NEXT: movi v1.2s, #67, lsl #24
-; CHECK-SD-NO16-NEXT: fcvt s0, h0
-; CHECK-SD-NO16-NEXT: fmul s0, s0, s1
-; CHECK-SD-NO16-NEXT: fcvt h0, s0
-; CHECK-SD-NO16-NEXT: fcvt s0, h0
-; CHECK-SD-NO16-NEXT: fcvtzu w0, s0
-; CHECK-SD-NO16-NEXT: ret
+; CHECK-NO16-LABEL: fcvtzu_sat_f16_i32_7:
+; CHECK-NO16: // %bb.0:
+; CHECK-NO16-NEXT: movi v1.2s, #67, lsl #24
+; CHECK-NO16-NEXT: fcvt s0, h0
+; CHECK-NO16-NEXT: fmul s0, s0, s1
+; CHECK-NO16-NEXT: fcvt h0, s0
+; CHECK-NO16-NEXT: fcvt s0, h0
+; CHECK-NO16-NEXT: fcvtzu w0, s0
+; CHECK-NO16-NEXT: ret
;
; CHECK-SD-FP16-LABEL: fcvtzu_sat_f16_i32_7:
; CHECK-SD-FP16: // %bb.0:
; CHECK-SD-FP16-NEXT: fcvtzu w0, h0, #7
; CHECK-SD-FP16-NEXT: ret
;
-; CHECK-GI-NO16-LABEL: fcvtzu_sat_f16_i32_7:
-; CHECK-GI-NO16: // %bb.0:
-; CHECK-GI-NO16-NEXT: adrp x8, .LCPI66_0
-; CHECK-GI-NO16-NEXT: fcvt s0, h0
-; CHECK-GI-NO16-NEXT: ldr h1, [x8, :lo12:.LCPI66_0]
-; CHECK-GI-NO16-NEXT: fcvt s1, h1
-; CHECK-GI-NO16-NEXT: fmul s0, s0, s1
-; CHECK-GI-NO16-NEXT: fcvt h0, s0
-; CHECK-GI-NO16-NEXT: fcvt s0, h0
-; CHECK-GI-NO16-NEXT: fcvtzu w0, s0
-; CHECK-GI-NO16-NEXT: ret
-;
; CHECK-GI-FP16-LABEL: fcvtzu_sat_f16_i32_7:
; CHECK-GI-FP16: // %bb.0:
; CHECK-GI-FP16-NEXT: adrp x8, .LCPI66_0
@@ -1690,33 +1518,21 @@ define i32 @fcvtzu_sat_f16_i32_7(half %dbl) {
}
define i32 @fcvtzu_sat_f16_i32_15(half %dbl) {
-; CHECK-SD-NO16-LABEL: fcvtzu_sat_f16_i32_15:
-; CHECK-SD-NO16: // %bb.0:
-; CHECK-SD-NO16-NEXT: movi v1.2s, #71, lsl #24
-; CHECK-SD-NO16-NEXT: fcvt s0, h0
-; CHECK-SD-NO16-NEXT: fmul s0, s0, s1
-; CHECK-SD-NO16-NEXT: fcvt h0, s0
-; CHECK-SD-NO16-NEXT: fcvt s0, h0
-; CHECK-SD-NO16-NEXT: fcvtzu w0, s0
-; CHECK-SD-NO16-NEXT: ret
+; CHECK-NO16-LABEL: fcvtzu_sat_f16_i32_15:
+; CHECK-NO16: // %bb.0:
+; CHECK-NO16-NEXT: movi v1.2s, #71, lsl #24
+; CHECK-NO16-NEXT: fcvt s0, h0
+; CHECK-NO16-NEXT: fmul s0, s0, s1
+; CHECK-NO16-NEXT: fcvt h0, s0
+; CHECK-NO16-NEXT: fcvt s0, h0
+; CHECK-NO16-NEXT: fcvtzu w0, s0
+; CHECK-NO16-NEXT: ret
;
; CHECK-SD-FP16-LABEL: fcvtzu_sat_f16_i32_15:
; CHECK-SD-FP16: // %bb.0:
; CHECK-SD-FP16-NEXT: fcvtzu w0, h0, #15
; CHECK-SD-FP16-NEXT: ret
;
-; CHECK-GI-NO16-LABEL: fcvtzu_sat_f16_i32_15:
-; CHECK-GI-NO16: // %bb.0:
-; CHECK-GI-NO16-NEXT: adrp x8, .LCPI67_0
-; CHECK-GI-NO16-NEXT: fcvt s0, h0
-; CHECK-GI-NO16-NEXT: ldr h1, [x8, :lo12:.LCPI67_0]
-; CHECK-GI-NO16-NEXT: fcvt s1, h1
-; CHECK-GI-NO16-NEXT: fmul s0, s0, s1
-; CHECK-GI-NO16-NEXT: fcvt h0, s0
-; CHECK-GI-NO16-NEXT: fcvt s0, h0
-; CHECK-GI-NO16-NEXT: fcvtzu w0, s0
-; CHECK-GI-NO16-NEXT: ret
-;
; CHECK-GI-FP16-LABEL: fcvtzu_sat_f16_i32_15:
; CHECK-GI-FP16: // %bb.0:
; CHECK-GI-FP16-NEXT: adrp x8, .LCPI67_0
@@ -1730,33 +1546,21 @@ define i32 @fcvtzu_sat_f16_i32_15(half %dbl) {
}
define i64 @fcvtzu_sat_f16_i64_7(half %dbl) {
-; CHECK-SD-NO16-LABEL: fcvtzu_sat_f16_i64_7:
-; CHECK-SD-NO16: // %bb.0:
-; CHECK-SD-NO16-NEXT: movi v1.2s, #67, lsl #24
-; CHECK-SD-NO16-NEXT: fcvt s0, h0
-; CHECK-SD-NO16-NEXT: fmul s0, s0, s1
-; CHECK-SD-NO16-NEXT: fcvt h0, s0
-; CHECK-SD-NO16-NEXT: fcvt s0, h0
-; CHECK-SD-NO16-NEXT: fcvtzu x0, s0
-; CHECK-SD-NO16-NEXT: ret
+; CHECK-NO16-LABEL: fcvtzu_sat_f16_i64_7:
+; CHECK-NO16: // %bb.0:
+; CHECK-NO16-NEXT: movi v1.2s, #67, lsl #24
+; CHECK-NO16-NEXT: fcvt s0, h0
+; CHECK-NO16-NEXT: fmul s0, s0, s1
+; CHECK-NO16-NEXT: fcvt h0, s0
+; CHECK-NO16-NEXT: fcvt s0, h0
+; CHECK-NO16-NEXT: fcvtzu x0, s0
+; CHECK-NO16-NEXT: ret
;
; CHECK-SD-FP16-LABEL: fcvtzu_sat_f16_i64_7:
; CHECK-SD-FP16: // %bb.0:
; CHECK-SD-FP16-NEXT: fcvtzu x0, h0, #7
; CHECK-SD-FP16-NEXT: ret
;
-; CHECK-GI-NO16-LABEL: fcvtzu_sat_f16_i64_7:
-; CHECK-GI-NO16: // %bb.0:
-; CHECK-GI-NO16-NEXT: adrp x8, .LCPI68_0
-; CHECK-GI-NO16-NEXT: fcvt s0, h0
-; CHECK-GI-NO16-NEXT: ldr h1, [x8, :lo12:.LCPI68_0]
-; CHECK-GI-NO16-NEXT: fcvt s1, h1
-; CHECK-GI-NO16-NEXT: fmul s0, s0, s1
-; CHECK-GI-NO16-NEXT: fcvt h0, s0
-; CHECK-GI-NO16-NEXT: fcvt s0, h0
-; CHECK-GI-NO16-NEXT: fcvtzu x0, s0
-; CHECK-GI-NO16-NEXT: ret
-;
; CHECK-GI-FP16-LABEL: fcvtzu_sat_f16_i64_7:
; CHECK-GI-FP16: // %bb.0:
; CHECK-GI-FP16-NEXT: adrp x8, .LCPI68_0
@@ -1770,33 +1574,21 @@ define i64 @fcvtzu_sat_f16_i64_7(half %dbl) {
}
define i64 @fcvtzu_sat_f16_i64_15(half %dbl) {
-; CHECK-SD-NO16-LABEL: fcvtzu_sat_f16_i64_15:
-; CHECK-SD-NO16: // %bb.0:
-; CHECK-SD-NO16-NEXT: movi v1.2s, #71, lsl #24
-; CHECK-SD-NO16-NEXT: fcvt s0, h0
-; CHECK-SD-NO16-NEXT: fmul s0, s0, s1
-; CHECK-SD-NO16-NEXT: fcvt h0, s0
-; CHECK-SD-NO16-NEXT: fcvt s0, h0
-; CHECK-SD-NO16-NEXT: fcvtzu x0, s0
-; CHECK-SD-NO16-NEXT: ret
+; CHECK-NO16-LABEL: fcvtzu_sat_f16_i64_15:
+; CHECK-NO16: // %bb.0:
+; CHECK-NO16-NEXT: movi v1.2s, #71, lsl #24
+; CHECK-NO16-NEXT: fcvt s0, h0
+; CHECK-NO16-NEXT: fmul s0, s0, s1
+; CHECK-NO16-NEXT: fcvt h0, s0
+; CHECK-NO16-NEXT: fcvt s0, h0
+; CHECK-NO16-NEXT: fcvtzu x0, s0
+; CHECK-NO16-NEXT: ret
;
; CHECK-SD-FP16-LABEL: fcvtzu_sat_f16_i64_15:
; CHECK-SD-FP16: // %bb.0:
; CHECK-SD-FP16-NEXT: fcvtzu x0, h0, #15
; CHECK-SD-FP16-NEXT: ret
;
-; CHECK-GI-NO16-LABEL: fcvtzu_sat_f16_i64_15:
-; CHECK-GI-NO16: // %bb.0:
-; CHECK-GI-NO16-NEXT: adrp x8, .LCPI69_0
-; CHECK-GI-NO16-NEXT: fcvt s0, h0
-; CHECK-GI-NO16-NEXT: ldr h1, [x8, :lo12:.LCPI69_0]
-; CHECK-GI-NO16-NEXT: fcvt s1, h1
-; CHECK-GI-NO16-NEXT: fmul s0, s0, s1
-; CHECK-GI-NO16-NEXT: fcvt h0, s0
-; CHECK-GI-NO16-NEXT: fcvt s0, h0
-; CHECK-GI-NO16-NEXT: fcvtzu x0, s0
-; CHECK-GI-NO16-NEXT: ret
-;
; CHECK-GI-FP16-LABEL: fcvtzu_sat_f16_i64_15:
; CHECK-GI-FP16: // %bb.0:
; CHECK-GI-FP16-NEXT: adrp x8, .LCPI69_0
@@ -1811,4 +1603,3 @@ define i64 @fcvtzu_sat_f16_i64_15(half %dbl) {
;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
; CHECK: {{.*}}
; CHECK-FP16: {{.*}}
-; CHECK-NO16: {{.*}}
diff --git a/llvm/test/CodeGen/AArch64/frem-power2.ll b/llvm/test/CodeGen/AArch64/frem-power2.ll
index 98276b68481a1..e1bc7426ad63e 100644
--- a/llvm/test/CodeGen/AArch64/frem-power2.ll
+++ b/llvm/test/CodeGen/AArch64/frem-power2.ll
@@ -100,9 +100,8 @@ define half @hrem2_nsz(half %x) {
; CHECK-GI-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
; CHECK-GI-NEXT: .cfi_def_cfa_offset 16
; CHECK-GI-NEXT: .cfi_offset w30, -16
-; CHECK-GI-NEXT: fmov h1, #2.00000000
; CHECK-GI-NEXT: fcvt s0, h0
-; CHECK-GI-NEXT: fcvt s1, h1
+; CHECK-GI-NEXT: fmov s1, #2.00000000
; CHECK-GI-NEXT: bl fmodf
; CHECK-GI-NEXT: fcvt h0, s0
; CHECK-GI-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
diff --git a/llvm/test/CodeGen/AArch64/vecreduce-fadd-strict.ll b/llvm/test/CodeGen/AArch64/vecreduce-fadd-strict.ll
index be07978cd8516..8e0328eaa2658 100644
--- a/llvm/test/CodeGen/AArch64/vecreduce-fadd-strict.ll
+++ b/llvm/test/CodeGen/AArch64/vecreduce-fadd-strict.ll
@@ -38,17 +38,11 @@ define half @add_v2HalfH(<2 x half> %bin.rdx) {
;
; CHECK-GI-NOFP16-LABEL: add_v2HalfH:
; CHECK-GI-NOFP16: // %bb.0:
-; CHECK-GI-NOFP16-NEXT: adrp x8, .LCPI1_0
; CHECK-GI-NOFP16-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-GI-NOFP16-NEXT: fcvt s2, h0
-; CHECK-GI-NOFP16-NEXT: ldr h1, [x8, :lo12:.LCPI1_0]
-; CHECK-GI-NOFP16-NEXT: mov h0, v0.h[1]
-; CHECK-GI-NOFP16-NEXT: fcvt s1, h1
+; CHECK-GI-NOFP16-NEXT: mov h1, v0.h[1]
; CHECK-GI-NOFP16-NEXT: fcvt s0, h0
-; CHECK-GI-NOFP16-NEXT: fadd s1, s1, s2
-; CHECK-GI-NOFP16-NEXT: fcvt h1, s1
; CHECK-GI-NOFP16-NEXT: fcvt s1, h1
-; CHECK-GI-NOFP16-NEXT: fadd s0, s1, s0
+; CHECK-GI-NOFP16-NEXT: fadd s0, s0, s1
; CHECK-GI-NOFP16-NEXT: fcvt h0, s0
; CHECK-GI-NOFP16-NEXT: ret
;
@@ -88,19 +82,13 @@ define half @add_v3HalfH(<3 x half> %bin.rdx) {
;
; CHECK-GI-NOFP16-LABEL: add_v3HalfH:
; CHECK-GI-NOFP16: // %bb.0:
-; CHECK-GI-NOFP16-NEXT: adrp x8, .LCPI2_0
; CHECK-GI-NOFP16-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-GI-NOFP16-NEXT: mov h1, v0.h[1]
; CHECK-GI-NOFP16-NEXT: fcvt s2, h0
-; CHECK-GI-NOFP16-NEXT: ldr h1, [x8, :lo12:.LCPI2_0]
-; CHECK-GI-NOFP16-NEXT: fcvt s1, h1
-; CHECK-GI-NOFP16-NEXT: fadd s1, s1, s2
-; CHECK-GI-NOFP16-NEXT: mov h2, v0.h[1]
; CHECK-GI-NOFP16-NEXT: mov h0, v0.h[2]
-; CHECK-GI-NOFP16-NEXT: fcvt h1, s1
-; CHECK-GI-NOFP16-NEXT: fcvt s2, h2
-; CHECK-GI-NOFP16-NEXT: fcvt s0, h0
; CHECK-GI-NOFP16-NEXT: fcvt s1, h1
-; CHECK-GI-NOFP16-NEXT: fadd s1, s1, s2
+; CHECK-GI-NOFP16-NEXT: fcvt s0, h0
+; CHECK-GI-NOFP16-NEXT: fadd s1, s2, s1
; CHECK-GI-NOFP16-NEXT: fcvt h1, s1
; CHECK-GI-NOFP16-NEXT: fcvt s1, h1
; CHECK-GI-NOFP16-NEXT: fadd s0, s1, s0
@@ -152,17 +140,11 @@ define half @add_HalfH(<4 x half> %bin.rdx) {
;
; CHECK-GI-NOFP16-LABEL: add_HalfH:
; CHECK-GI-NOFP16: // %bb.0:
-; CHECK-GI-NOFP16-NEXT: adrp x8, .LCPI3_0
; CHECK-GI-NOFP16-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-GI-NOFP16-NEXT: mov h1, v0.h[1]
; CHECK-GI-NOFP16-NEXT: fcvt s2, h0
-; CHECK-GI-NOFP16-NEXT: ldr h1, [x8, :lo12:.LCPI3_0]
-; CHECK-GI-NOFP16-NEXT: fcvt s1, h1
-; CHECK-GI-NOFP16-NEXT: fadd s1, s1, s2
-; CHECK-GI-NOFP16-NEXT: mov h2, v0.h[1]
-; CHECK-GI-NOFP16-NEXT: fcvt h1, s1
-; CHECK-GI-NOFP16-NEXT: fcvt s2, h2
; CHECK-GI-NOFP16-NEXT: fcvt s1, h1
-; CHECK-GI-NOFP16-NEXT: fadd s1, s1, s2
+; CHECK-GI-NOFP16-NEXT: fadd s1, s2, s1
; CHECK-GI-NOFP16-NEXT: mov h2, v0.h[2]
; CHECK-GI-NOFP16-NEXT: mov h0, v0.h[3]
; CHECK-GI-NOFP16-NEXT: fcvt h1, s1
@@ -250,16 +232,10 @@ define half @add_H(<8 x half> %bin.rdx) {
;
; CHECK-GI-NOFP16-LABEL: add_H:
; CHECK-GI-NOFP16: // %bb.0:
-; CHECK-GI-NOFP16-NEXT: adrp x8, .LCPI4_0
+; CHECK-GI-NOFP16-NEXT: mov h1, v0.h[1]
; CHECK-GI-NOFP16-NEXT: fcvt s2, h0
-; CHECK-GI-NOFP16-NEXT: ldr h1, [x8, :lo12:.LCPI4_0]
-; CHECK-GI-NOFP16-NEXT: fcvt s1, h1
-; CHECK-GI-NOFP16-NEXT: fadd s1, s1, s2
-; CHECK-GI-NOFP16-NEXT: mov h2, v0.h[1]
-; CHECK-GI-NOFP16-NEXT: fcvt h1, s1
-; CHECK-GI-NOFP16-NEXT: fcvt s2, h2
; CHECK-GI-NOFP16-NEXT: fcvt s1, h1
-; CHECK-GI-NOFP16-NEXT: fadd s1, s1, s2
+; CHECK-GI-NOFP16-NEXT: fadd s1, s2, s1
; CHECK-GI-NOFP16-NEXT: mov h2, v0.h[2]
; CHECK-GI-NOFP16-NEXT: fcvt h1, s1
; CHECK-GI-NOFP16-NEXT: fcvt s2, h2
@@ -448,16 +424,10 @@ define half @add_2H(<16 x half> %bin.rdx) {
;
; CHECK-GI-NOFP16-LABEL: add_2H:
; CHECK-GI-NOFP16: // %bb.0:
-; CHECK-GI-NOFP16-NEXT: adrp x8, .LCPI7_0
+; CHECK-GI-NOFP16-NEXT: mov h2, v0.h[1]
; CHECK-GI-NOFP16-NEXT: fcvt s3, h0
-; CHECK-GI-NOFP16-NEXT: ldr h2, [x8, :lo12:.LCPI7_0]
-; CHECK-GI-NOFP16-NEXT: fcvt s2, h2
-; CHECK-GI-NOFP16-NEXT: fadd s2, s2, s3
-; CHECK-GI-NOFP16-NEXT: mov h3, v0.h[1]
-; CHECK-GI-NOFP16-NEXT: fcvt h2, s2
-; CHECK-GI-NOFP16-NEXT: fcvt s3, h3
; CHECK-GI-NOFP16-NEXT: fcvt s2, h2
-; CHECK-GI-NOFP16-NEXT: fadd s2, s2, s3
+; CHECK-GI-NOFP16-NEXT: fadd s2, s3, s2
; CHECK-GI-NOFP16-NEXT: mov h3, v0.h[2]
; CHECK-GI-NOFP16-NEXT: fcvt h2, s2
; CHECK-GI-NOFP16-NEXT: fcvt s3, h3
diff --git a/llvm/test/CodeGen/AArch64/vecreduce-fmul-strict.ll b/llvm/test/CodeGen/AArch64/vecreduce-fmul-strict.ll
index c10d6e94226f2..716401e2ebafe 100644
--- a/llvm/test/CodeGen/AArch64/vecreduce-fmul-strict.ll
+++ b/llvm/test/CodeGen/AArch64/vecreduce-fmul-strict.ll
@@ -52,17 +52,11 @@ define half @mul_HalfH(<4 x half> %bin.rdx) {
;
; CHECK-GI-NOFP16-LABEL: mul_HalfH:
; CHECK-GI-NOFP16: // %bb.0:
-; CHECK-GI-NOFP16-NEXT: adrp x8, .LCPI1_0
; CHECK-GI-NOFP16-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-GI-NOFP16-NEXT: mov h1, v0.h[1]
; CHECK-GI-NOFP16-NEXT: fcvt s2, h0
-; CHECK-GI-NOFP16-NEXT: ldr h1, [x8, :lo12:.LCPI1_0]
; CHECK-GI-NOFP16-NEXT: fcvt s1, h1
-; CHECK-GI-NOFP16-NEXT: fmul s1, s1, s2
-; CHECK-GI-NOFP16-NEXT: mov h2, v0.h[1]
-; CHECK-GI-NOFP16-NEXT: fcvt h1, s1
-; CHECK-GI-NOFP16-NEXT: fcvt s2, h2
-; CHECK-GI-NOFP16-NEXT: fcvt s1, h1
-; CHECK-GI-NOFP16-NEXT: fmul s1, s1, s2
+; CHECK-GI-NOFP16-NEXT: fmul s1, s2, s1
; CHECK-GI-NOFP16-NEXT: mov h2, v0.h[2]
; CHECK-GI-NOFP16-NEXT: mov h0, v0.h[3]
; CHECK-GI-NOFP16-NEXT: fcvt h1, s1
@@ -144,16 +138,10 @@ define half @mul_H(<8 x half> %bin.rdx) {
;
; CHECK-GI-NOFP16-LABEL: mul_H:
; CHECK-GI-NOFP16: // %bb.0:
-; CHECK-GI-NOFP16-NEXT: adrp x8, .LCPI2_0
+; CHECK-GI-NOFP16-NEXT: mov h1, v0.h[1]
; CHECK-GI-NOFP16-NEXT: fcvt s2, h0
-; CHECK-GI-NOFP16-NEXT: ldr h1, [x8, :lo12:.LCPI2_0]
-; CHECK-GI-NOFP16-NEXT: fcvt s1, h1
-; CHECK-GI-NOFP16-NEXT: fmul s1, s1, s2
-; CHECK-GI-NOFP16-NEXT: mov h2, v0.h[1]
-; CHECK-GI-NOFP16-NEXT: fcvt h1, s1
-; CHECK-GI-NOFP16-NEXT: fcvt s2, h2
; CHECK-GI-NOFP16-NEXT: fcvt s1, h1
-; CHECK-GI-NOFP16-NEXT: fmul s1, s1, s2
+; CHECK-GI-NOFP16-NEXT: fmul s1, s2, s1
; CHECK-GI-NOFP16-NEXT: mov h2, v0.h[2]
; CHECK-GI-NOFP16-NEXT: fcvt h1, s1
; CHECK-GI-NOFP16-NEXT: fcvt s2, h2
@@ -321,16 +309,10 @@ define half @mul_2H(<16 x half> %bin.rdx) {
;
; CHECK-GI-NOFP16-LABEL: mul_2H:
; CHECK-GI-NOFP16: // %bb.0:
-; CHECK-GI-NOFP16-NEXT: adrp x8, .LCPI5_0
+; CHECK-GI-NOFP16-NEXT: mov h2, v0.h[1]
; CHECK-GI-NOFP16-NEXT: fcvt s3, h0
-; CHECK-GI-NOFP16-NEXT: ldr h2, [x8, :lo12:.LCPI5_0]
; CHECK-GI-NOFP16-NEXT: fcvt s2, h2
-; CHECK-GI-NOFP16-NEXT: fmul s2, s2, s3
-; CHECK-GI-NOFP16-NEXT: mov h3, v0.h[1]
-; CHECK-GI-NOFP16-NEXT: fcvt h2, s2
-; CHECK-GI-NOFP16-NEXT: fcvt s3, h3
-; CHECK-GI-NOFP16-NEXT: fcvt s2, h2
-; CHECK-GI-NOFP16-NEXT: fmul s2, s2, s3
+; CHECK-GI-NOFP16-NEXT: fmul s2, s3, s2
; CHECK-GI-NOFP16-NEXT: mov h3, v0.h[2]
; CHECK-GI-NOFP16-NEXT: fcvt h2, s2
; CHECK-GI-NOFP16-NEXT: fcvt s3, h3
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f16.ll
index 1aee6ab24eea0..1b879a604d715 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f16.ll
@@ -403,40 +403,38 @@ define half @v_neg_rcp_f16(half %x) {
; GFX6-IEEE-LABEL: v_neg_rcp_f16:
; GFX6-IEEE: ; %bb.0:
; GFX6-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, -1.0
; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX6-IEEE-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, v1
-; GFX6-IEEE-NEXT: v_rcp_f32_e32 v3, v2
-; GFX6-IEEE-NEXT: v_div_scale_f32 v4, vcc, v1, v0, v1
-; GFX6-IEEE-NEXT: v_fma_f32 v5, -v2, v3, 1.0
-; GFX6-IEEE-NEXT: v_fma_f32 v3, v5, v3, v3
-; GFX6-IEEE-NEXT: v_mul_f32_e32 v5, v4, v3
-; GFX6-IEEE-NEXT: v_fma_f32 v6, -v2, v5, v4
-; GFX6-IEEE-NEXT: v_fma_f32 v5, v6, v3, v5
-; GFX6-IEEE-NEXT: v_fma_f32 v2, -v2, v5, v4
-; GFX6-IEEE-NEXT: v_div_fmas_f32 v2, v2, v3, v5
-; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v2, v0, v1
+; GFX6-IEEE-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, -1.0
+; GFX6-IEEE-NEXT: v_rcp_f32_e32 v2, v1
+; GFX6-IEEE-NEXT: v_div_scale_f32 v3, vcc, -1.0, v0, -1.0
+; GFX6-IEEE-NEXT: v_fma_f32 v4, -v1, v2, 1.0
+; GFX6-IEEE-NEXT: v_fma_f32 v2, v4, v2, v2
+; GFX6-IEEE-NEXT: v_mul_f32_e32 v4, v3, v2
+; GFX6-IEEE-NEXT: v_fma_f32 v5, -v1, v4, v3
+; GFX6-IEEE-NEXT: v_fma_f32 v4, v5, v2, v4
+; GFX6-IEEE-NEXT: v_fma_f32 v1, -v1, v4, v3
+; GFX6-IEEE-NEXT: v_div_fmas_f32 v1, v1, v2, v4
+; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v1, v0, -1.0
; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-IEEE-NEXT: s_setpc_b64 s[30:31]
;
; GFX6-FLUSH-LABEL: v_neg_rcp_f16:
; GFX6-FLUSH: ; %bb.0:
; GFX6-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, -1.0
; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, v1
-; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v3, v2
-; GFX6-FLUSH-NEXT: v_div_scale_f32 v4, vcc, v1, v0, v1
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, -1.0
+; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v2, v1
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v3, vcc, -1.0, v0, -1.0
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
-; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v2, v3, 1.0
-; GFX6-FLUSH-NEXT: v_fma_f32 v3, v5, v3, v3
-; GFX6-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v3
-; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v5, v4
-; GFX6-FLUSH-NEXT: v_fma_f32 v5, v6, v3, v5
-; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v5, v4
+; GFX6-FLUSH-NEXT: v_fma_f32 v4, -v1, v2, 1.0
+; GFX6-FLUSH-NEXT: v_fma_f32 v2, v4, v2, v2
+; GFX6-FLUSH-NEXT: v_mul_f32_e32 v4, v3, v2
+; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v1, v4, v3
+; GFX6-FLUSH-NEXT: v_fma_f32 v4, v5, v2, v4
+; GFX6-FLUSH-NEXT: v_fma_f32 v1, -v1, v4, v3
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
-; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v5
-; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v2, v0, v1
+; GFX6-FLUSH-NEXT: v_div_fmas_f32 v1, v1, v2, v4
+; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v1, v0, -1.0
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-FLUSH-NEXT: s_setpc_b64 s[30:31]
@@ -460,40 +458,38 @@ define half @v_rcp_f16(half %x) {
; GFX6-IEEE-LABEL: v_rcp_f16:
; GFX6-IEEE: ; %bb.0:
; GFX6-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, 1.0
; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX6-IEEE-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, v1
-; GFX6-IEEE-NEXT: v_rcp_f32_e32 v3, v2
-; GFX6-IEEE-NEXT: v_div_scale_f32 v4, vcc, v1, v0, v1
-; GFX6-IEEE-NEXT: v_fma_f32 v5, -v2, v3, 1.0
-; GFX6-IEEE-NEXT: v_fma_f32 v3, v5, v3, v3
-; GFX6-IEEE-NEXT: v_mul_f32_e32 v5, v4, v3
-; GFX6-IEEE-NEXT: v_fma_f32 v6, -v2, v5, v4
-; GFX6-IEEE-NEXT: v_fma_f32 v5, v6, v3, v5
-; GFX6-IEEE-NEXT: v_fma_f32 v2, -v2, v5, v4
-; GFX6-IEEE-NEXT: v_div_fmas_f32 v2, v2, v3, v5
-; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v2, v0, v1
+; GFX6-IEEE-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, 1.0
+; GFX6-IEEE-NEXT: v_rcp_f32_e32 v2, v1
+; GFX6-IEEE-NEXT: v_div_scale_f32 v3, vcc, 1.0, v0, 1.0
+; GFX6-IEEE-NEXT: v_fma_f32 v4, -v1, v2, 1.0
+; GFX6-IEEE-NEXT: v_fma_f32 v2, v4, v2, v2
+; GFX6-IEEE-NEXT: v_mul_f32_e32 v4, v3, v2
+; GFX6-IEEE-NEXT: v_fma_f32 v5, -v1, v4, v3
+; GFX6-IEEE-NEXT: v_fma_f32 v4, v5, v2, v4
+; GFX6-IEEE-NEXT: v_fma_f32 v1, -v1, v4, v3
+; GFX6-IEEE-NEXT: v_div_fmas_f32 v1, v1, v2, v4
+; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v1, v0, 1.0
; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-IEEE-NEXT: s_setpc_b64 s[30:31]
;
; GFX6-FLUSH-LABEL: v_rcp_f16:
; GFX6-FLUSH: ; %bb.0:
; GFX6-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, 1.0
; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, v1
-; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v3, v2
-; GFX6-FLUSH-NEXT: v_div_scale_f32 v4, vcc, v1, v0, v1
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, 1.0
+; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v2, v1
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v3, vcc, 1.0, v0, 1.0
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
-; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v2, v3, 1.0
-; GFX6-FLUSH-NEXT: v_fma_f32 v3, v5, v3, v3
-; GFX6-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v3
-; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v5, v4
-; GFX6-FLUSH-NEXT: v_fma_f32 v5, v6, v3, v5
-; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v5, v4
+; GFX6-FLUSH-NEXT: v_fma_f32 v4, -v1, v2, 1.0
+; GFX6-FLUSH-NEXT: v_fma_f32 v2, v4, v2, v2
+; GFX6-FLUSH-NEXT: v_mul_f32_e32 v4, v3, v2
+; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v1, v4, v3
+; GFX6-FLUSH-NEXT: v_fma_f32 v4, v5, v2, v4
+; GFX6-FLUSH-NEXT: v_fma_f32 v1, -v1, v4, v3
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
-; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v5
-; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v2, v0, v1
+; GFX6-FLUSH-NEXT: v_div_fmas_f32 v1, v1, v2, v4
+; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v1, v0, 1.0
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-FLUSH-NEXT: s_setpc_b64 s[30:31]
@@ -517,40 +513,38 @@ define half @v_rcp_f16_arcp(half %x) {
; GFX6-IEEE-LABEL: v_rcp_f16_arcp:
; GFX6-IEEE: ; %bb.0:
; GFX6-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, 1.0
; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX6-IEEE-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, v1
-; GFX6-IEEE-NEXT: v_rcp_f32_e32 v3, v2
-; GFX6-IEEE-NEXT: v_div_scale_f32 v4, vcc, v1, v0, v1
-; GFX6-IEEE-NEXT: v_fma_f32 v5, -v2, v3, 1.0
-; GFX6-IEEE-NEXT: v_fma_f32 v3, v5, v3, v3
-; GFX6-IEEE-NEXT: v_mul_f32_e32 v5, v4, v3
-; GFX6-IEEE-NEXT: v_fma_f32 v6, -v2, v5, v4
-; GFX6-IEEE-NEXT: v_fma_f32 v5, v6, v3, v5
-; GFX6-IEEE-NEXT: v_fma_f32 v2, -v2, v5, v4
-; GFX6-IEEE-NEXT: v_div_fmas_f32 v2, v2, v3, v5
-; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v2, v0, v1
+; GFX6-IEEE-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, 1.0
+; GFX6-IEEE-NEXT: v_rcp_f32_e32 v2, v1
+; GFX6-IEEE-NEXT: v_div_scale_f32 v3, vcc, 1.0, v0, 1.0
+; GFX6-IEEE-NEXT: v_fma_f32 v4, -v1, v2, 1.0
+; GFX6-IEEE-NEXT: v_fma_f32 v2, v4, v2, v2
+; GFX6-IEEE-NEXT: v_mul_f32_e32 v4, v3, v2
+; GFX6-IEEE-NEXT: v_fma_f32 v5, -v1, v4, v3
+; GFX6-IEEE-NEXT: v_fma_f32 v4, v5, v2, v4
+; GFX6-IEEE-NEXT: v_fma_f32 v1, -v1, v4, v3
+; GFX6-IEEE-NEXT: v_div_fmas_f32 v1, v1, v2, v4
+; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v1, v0, 1.0
; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-IEEE-NEXT: s_setpc_b64 s[30:31]
;
; GFX6-FLUSH-LABEL: v_rcp_f16_arcp:
; GFX6-FLUSH: ; %bb.0:
; GFX6-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, 1.0
; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, v1
-; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v3, v2
-; GFX6-FLUSH-NEXT: v_div_scale_f32 v4, vcc, v1, v0, v1
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, 1.0
+; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v2, v1
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v3, vcc, 1.0, v0, 1.0
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
-; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v2, v3, 1.0
-; GFX6-FLUSH-NEXT: v_fma_f32 v3, v5, v3, v3
-; GFX6-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v3
-; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v5, v4
-; GFX6-FLUSH-NEXT: v_fma_f32 v5, v6, v3, v5
-; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v5, v4
+; GFX6-FLUSH-NEXT: v_fma_f32 v4, -v1, v2, 1.0
+; GFX6-FLUSH-NEXT: v_fma_f32 v2, v4, v2, v2
+; GFX6-FLUSH-NEXT: v_mul_f32_e32 v4, v3, v2
+; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v1, v4, v3
+; GFX6-FLUSH-NEXT: v_fma_f32 v4, v5, v2, v4
+; GFX6-FLUSH-NEXT: v_fma_f32 v1, -v1, v4, v3
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
-; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v5
-; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v2, v0, v1
+; GFX6-FLUSH-NEXT: v_div_fmas_f32 v1, v1, v2, v4
+; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v1, v0, 1.0
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-FLUSH-NEXT: s_setpc_b64 s[30:31]
@@ -575,9 +569,7 @@ define half @v_rcp_f16_arcp_afn(half %x) {
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX6-NEXT: v_cvt_f32_f16_e32 v1, 1.0
; GFX6-NEXT: v_rcp_f32_e32 v0, v0
-; GFX6-NEXT: v_mul_f32_e32 v0, v1, v0
; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-NEXT: s_setpc_b64 s[30:31]
;
@@ -600,40 +592,38 @@ define half @v_rcp_f16_ulp25(half %x) {
; GFX6-IEEE-LABEL: v_rcp_f16_ulp25:
; GFX6-IEEE: ; %bb.0:
; GFX6-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, 1.0
; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX6-IEEE-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, v1
-; GFX6-IEEE-NEXT: v_rcp_f32_e32 v3, v2
-; GFX6-IEEE-NEXT: v_div_scale_f32 v4, vcc, v1, v0, v1
-; GFX6-IEEE-NEXT: v_fma_f32 v5, -v2, v3, 1.0
-; GFX6-IEEE-NEXT: v_fma_f32 v3, v5, v3, v3
-; GFX6-IEEE-NEXT: v_mul_f32_e32 v5, v4, v3
-; GFX6-IEEE-NEXT: v_fma_f32 v6, -v2, v5, v4
-; GFX6-IEEE-NEXT: v_fma_f32 v5, v6, v3, v5
-; GFX6-IEEE-NEXT: v_fma_f32 v2, -v2, v5, v4
-; GFX6-IEEE-NEXT: v_div_fmas_f32 v2, v2, v3, v5
-; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v2, v0, v1
+; GFX6-IEEE-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, 1.0
+; GFX6-IEEE-NEXT: v_rcp_f32_e32 v2, v1
+; GFX6-IEEE-NEXT: v_div_scale_f32 v3, vcc, 1.0, v0, 1.0
+; GFX6-IEEE-NEXT: v_fma_f32 v4, -v1, v2, 1.0
+; GFX6-IEEE-NEXT: v_fma_f32 v2, v4, v2, v2
+; GFX6-IEEE-NEXT: v_mul_f32_e32 v4, v3, v2
+; GFX6-IEEE-NEXT: v_fma_f32 v5, -v1, v4, v3
+; GFX6-IEEE-NEXT: v_fma_f32 v4, v5, v2, v4
+; GFX6-IEEE-NEXT: v_fma_f32 v1, -v1, v4, v3
+; GFX6-IEEE-NEXT: v_div_fmas_f32 v1, v1, v2, v4
+; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v1, v0, 1.0
; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-IEEE-NEXT: s_setpc_b64 s[30:31]
;
; GFX6-FLUSH-LABEL: v_rcp_f16_ulp25:
; GFX6-FLUSH: ; %bb.0:
; GFX6-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, 1.0
; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, v1
-; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v3, v2
-; GFX6-FLUSH-NEXT: v_div_scale_f32 v4, vcc, v1, v0, v1
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, 1.0
+; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v2, v1
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v3, vcc, 1.0, v0, 1.0
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
-; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v2, v3, 1.0
-; GFX6-FLUSH-NEXT: v_fma_f32 v3, v5, v3, v3
-; GFX6-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v3
-; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v5, v4
-; GFX6-FLUSH-NEXT: v_fma_f32 v5, v6, v3, v5
-; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v5, v4
+; GFX6-FLUSH-NEXT: v_fma_f32 v4, -v1, v2, 1.0
+; GFX6-FLUSH-NEXT: v_fma_f32 v2, v4, v2, v2
+; GFX6-FLUSH-NEXT: v_mul_f32_e32 v4, v3, v2
+; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v1, v4, v3
+; GFX6-FLUSH-NEXT: v_fma_f32 v4, v5, v2, v4
+; GFX6-FLUSH-NEXT: v_fma_f32 v1, -v1, v4, v3
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
-; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v5
-; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v2, v0, v1
+; GFX6-FLUSH-NEXT: v_div_fmas_f32 v1, v1, v2, v4
+; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v1, v0, 1.0
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-FLUSH-NEXT: s_setpc_b64 s[30:31]
@@ -1454,70 +1444,67 @@ define <2 x half> @v_rcp_v2f16(<2 x half> %x) {
; GFX6-IEEE-LABEL: v_rcp_v2f16:
; GFX6-IEEE: ; %bb.0:
; GFX6-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v2, 1.0
; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, v1
-; GFX6-IEEE-NEXT: v_div_scale_f32 v3, s[4:5], v0, v0, v2
-; GFX6-IEEE-NEXT: v_rcp_f32_e32 v4, v3
-; GFX6-IEEE-NEXT: v_div_scale_f32 v5, vcc, v2, v0, v2
-; GFX6-IEEE-NEXT: v_fma_f32 v6, -v3, v4, 1.0
-; GFX6-IEEE-NEXT: v_fma_f32 v4, v6, v4, v4
-; GFX6-IEEE-NEXT: v_mul_f32_e32 v6, v5, v4
-; GFX6-IEEE-NEXT: v_fma_f32 v7, -v3, v6, v5
-; GFX6-IEEE-NEXT: v_fma_f32 v6, v7, v4, v6
-; GFX6-IEEE-NEXT: v_fma_f32 v3, -v3, v6, v5
-; GFX6-IEEE-NEXT: v_div_fmas_f32 v3, v3, v4, v6
-; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v3, v0, v2
-; GFX6-IEEE-NEXT: v_div_scale_f32 v3, s[4:5], v1, v1, v2
-; GFX6-IEEE-NEXT: v_rcp_f32_e32 v4, v3
-; GFX6-IEEE-NEXT: v_div_scale_f32 v5, vcc, v2, v1, v2
+; GFX6-IEEE-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, 1.0
+; GFX6-IEEE-NEXT: v_rcp_f32_e32 v3, v2
+; GFX6-IEEE-NEXT: v_div_scale_f32 v4, vcc, 1.0, v0, 1.0
+; GFX6-IEEE-NEXT: v_fma_f32 v5, -v2, v3, 1.0
+; GFX6-IEEE-NEXT: v_fma_f32 v3, v5, v3, v3
+; GFX6-IEEE-NEXT: v_mul_f32_e32 v5, v4, v3
+; GFX6-IEEE-NEXT: v_fma_f32 v6, -v2, v5, v4
+; GFX6-IEEE-NEXT: v_fma_f32 v5, v6, v3, v5
+; GFX6-IEEE-NEXT: v_fma_f32 v2, -v2, v5, v4
+; GFX6-IEEE-NEXT: v_div_fmas_f32 v2, v2, v3, v5
+; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v2, v0, 1.0
+; GFX6-IEEE-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, 1.0
+; GFX6-IEEE-NEXT: v_rcp_f32_e32 v3, v2
+; GFX6-IEEE-NEXT: v_div_scale_f32 v4, vcc, 1.0, v1, 1.0
; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX6-IEEE-NEXT: v_fma_f32 v6, -v3, v4, 1.0
-; GFX6-IEEE-NEXT: v_fma_f32 v4, v6, v4, v4
-; GFX6-IEEE-NEXT: v_mul_f32_e32 v6, v5, v4
-; GFX6-IEEE-NEXT: v_fma_f32 v7, -v3, v6, v5
-; GFX6-IEEE-NEXT: v_fma_f32 v6, v7, v4, v6
-; GFX6-IEEE-NEXT: v_fma_f32 v3, -v3, v6, v5
-; GFX6-IEEE-NEXT: v_div_fmas_f32 v3, v3, v4, v6
-; GFX6-IEEE-NEXT: v_div_fixup_f32 v1, v3, v1, v2
+; GFX6-IEEE-NEXT: v_fma_f32 v5, -v2, v3, 1.0
+; GFX6-IEEE-NEXT: v_fma_f32 v3, v5, v3, v3
+; GFX6-IEEE-NEXT: v_mul_f32_e32 v5, v4, v3
+; GFX6-IEEE-NEXT: v_fma_f32 v6, -v2, v5, v4
+; GFX6-IEEE-NEXT: v_fma_f32 v5, v6, v3, v5
+; GFX6-IEEE-NEXT: v_fma_f32 v2, -v2, v5, v4
+; GFX6-IEEE-NEXT: v_div_fmas_f32 v2, v2, v3, v5
+; GFX6-IEEE-NEXT: v_div_fixup_f32 v1, v2, v1, 1.0
; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX6-IEEE-NEXT: s_setpc_b64 s[30:31]
;
; GFX6-FLUSH-LABEL: v_rcp_v2f16:
; GFX6-FLUSH: ; %bb.0:
; GFX6-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v2, 1.0
; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX6-FLUSH-NEXT: v_div_scale_f32 v3, s[4:5], v0, v0, v2
-; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v4, v3
-; GFX6-FLUSH-NEXT: v_div_scale_f32 v5, vcc, v2, v0, v2
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, 1.0
+; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v3, v2
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v4, vcc, 1.0, v0, 1.0
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
-; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v3, v4, 1.0
-; GFX6-FLUSH-NEXT: v_fma_f32 v4, v6, v4, v4
-; GFX6-FLUSH-NEXT: v_mul_f32_e32 v6, v5, v4
-; GFX6-FLUSH-NEXT: v_fma_f32 v7, -v3, v6, v5
-; GFX6-FLUSH-NEXT: v_fma_f32 v6, v7, v4, v6
-; GFX6-FLUSH-NEXT: v_fma_f32 v3, -v3, v6, v5
+; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v2, v3, 1.0
+; GFX6-FLUSH-NEXT: v_fma_f32 v3, v5, v3, v3
+; GFX6-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v3
+; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v5, v4
+; GFX6-FLUSH-NEXT: v_fma_f32 v5, v6, v3, v5
+; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v5, v4
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
-; GFX6-FLUSH-NEXT: v_div_fmas_f32 v3, v3, v4, v6
-; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v4, 1.0
; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, v1
-; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v3, v0, v2
+; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v5
+; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v2, v0, 1.0
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, v4
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, 1.0
; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v3, v2
-; GFX6-FLUSH-NEXT: v_div_scale_f32 v5, vcc, v4, v1, v4
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v4, vcc, 1.0, v1, 1.0
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
-; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v3, 1.0
-; GFX6-FLUSH-NEXT: v_fma_f32 v3, v6, v3, v3
-; GFX6-FLUSH-NEXT: v_mul_f32_e32 v6, v5, v3
-; GFX6-FLUSH-NEXT: v_fma_f32 v7, -v2, v6, v5
-; GFX6-FLUSH-NEXT: v_fma_f32 v6, v7, v3, v6
-; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v6, v5
+; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v2, v3, 1.0
+; GFX6-FLUSH-NEXT: v_fma_f32 v3, v5, v3, v3
+; GFX6-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v3
+; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v5, v4
+; GFX6-FLUSH-NEXT: v_fma_f32 v5, v6, v3, v5
+; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v5, v4
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
-; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v6
-; GFX6-FLUSH-NEXT: v_div_fixup_f32 v1, v2, v1, v4
+; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v5
+; GFX6-FLUSH-NEXT: v_div_fixup_f32 v1, v2, v1, 1.0
; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX6-FLUSH-NEXT: s_setpc_b64 s[30:31]
;
@@ -1526,30 +1513,27 @@ define <2 x half> @v_rcp_v2f16(<2 x half> %x) {
; GFX8-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-IEEE-NEXT: v_cvt_f32_f16_e32 v1, v0
; GFX8-IEEE-NEXT: v_lshrrev_b32_e32 v2, 16, v0
-; GFX8-IEEE-NEXT: v_cvt_f32_f16_e32 v4, 1.0
; GFX8-IEEE-NEXT: v_cvt_f32_f16_e32 v3, v2
-; GFX8-IEEE-NEXT: v_rcp_f32_e32 v5, v1
-; GFX8-IEEE-NEXT: v_rcp_f32_e32 v6, v3
-; GFX8-IEEE-NEXT: v_mul_f32_e32 v7, v4, v5
-; GFX8-IEEE-NEXT: v_mul_f32_e64 v8, -v1, v7
-; GFX8-IEEE-NEXT: v_add_f32_e32 v8, v8, v4
-; GFX8-IEEE-NEXT: v_mul_f32_e32 v8, v8, v5
-; GFX8-IEEE-NEXT: v_add_f32_e32 v7, v8, v7
-; GFX8-IEEE-NEXT: v_mul_f32_e32 v9, v4, v6
-; GFX8-IEEE-NEXT: v_mul_f32_e64 v1, -v1, v7
-; GFX8-IEEE-NEXT: v_mul_f32_e64 v10, -v3, v9
-; GFX8-IEEE-NEXT: v_add_f32_e32 v1, v1, v4
-; GFX8-IEEE-NEXT: v_mul_f32_e32 v1, v1, v5
-; GFX8-IEEE-NEXT: v_add_f32_e32 v5, v10, v4
-; GFX8-IEEE-NEXT: v_mul_f32_e32 v5, v5, v6
-; GFX8-IEEE-NEXT: v_add_f32_e32 v5, v5, v9
-; GFX8-IEEE-NEXT: v_mul_f32_e64 v3, -v3, v5
-; GFX8-IEEE-NEXT: v_add_f32_e32 v3, v3, v4
-; GFX8-IEEE-NEXT: v_mul_f32_e32 v3, v3, v6
+; GFX8-IEEE-NEXT: v_rcp_f32_e32 v4, v1
+; GFX8-IEEE-NEXT: v_rcp_f32_e32 v5, v3
+; GFX8-IEEE-NEXT: v_mul_f32_e64 v6, -v1, v4
+; GFX8-IEEE-NEXT: v_add_f32_e32 v6, 1.0, v6
+; GFX8-IEEE-NEXT: v_mul_f32_e32 v6, v6, v4
+; GFX8-IEEE-NEXT: v_mul_f32_e64 v7, -v3, v5
+; GFX8-IEEE-NEXT: v_add_f32_e32 v6, v6, v4
+; GFX8-IEEE-NEXT: v_add_f32_e32 v7, 1.0, v7
+; GFX8-IEEE-NEXT: v_mul_f32_e64 v1, -v1, v6
+; GFX8-IEEE-NEXT: v_mul_f32_e32 v7, v7, v5
+; GFX8-IEEE-NEXT: v_add_f32_e32 v1, 1.0, v1
+; GFX8-IEEE-NEXT: v_mul_f32_e32 v1, v1, v4
+; GFX8-IEEE-NEXT: v_add_f32_e32 v4, v7, v5
+; GFX8-IEEE-NEXT: v_mul_f32_e64 v3, -v3, v4
+; GFX8-IEEE-NEXT: v_add_f32_e32 v3, 1.0, v3
+; GFX8-IEEE-NEXT: v_mul_f32_e32 v3, v3, v5
; GFX8-IEEE-NEXT: v_and_b32_e32 v1, 0xff800000, v1
; GFX8-IEEE-NEXT: v_and_b32_e32 v3, 0xff800000, v3
-; GFX8-IEEE-NEXT: v_add_f32_e32 v1, v1, v7
-; GFX8-IEEE-NEXT: v_add_f32_e32 v3, v3, v5
+; GFX8-IEEE-NEXT: v_add_f32_e32 v1, v1, v6
+; GFX8-IEEE-NEXT: v_add_f32_e32 v3, v3, v4
; GFX8-IEEE-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX8-IEEE-NEXT: v_cvt_f16_f32_e32 v3, v3
; GFX8-IEEE-NEXT: v_div_fixup_f16 v0, v1, v0, 1.0
@@ -1561,26 +1545,23 @@ define <2 x half> @v_rcp_v2f16(<2 x half> %x) {
; GFX8-FLUSH-LABEL: v_rcp_v2f16:
; GFX8-FLUSH: ; %bb.0:
; GFX8-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, v0
; GFX8-FLUSH-NEXT: v_lshrrev_b32_e32 v2, 16, v0
+; GFX8-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, v0
; GFX8-FLUSH-NEXT: v_cvt_f32_f16_e32 v3, v2
-; GFX8-FLUSH-NEXT: v_cvt_f32_f16_e32 v4, 1.0
-; GFX8-FLUSH-NEXT: v_rcp_f32_e32 v5, v1
-; GFX8-FLUSH-NEXT: v_rcp_f32_e32 v6, v3
-; GFX8-FLUSH-NEXT: v_mul_f32_e32 v7, v4, v5
-; GFX8-FLUSH-NEXT: v_mad_f32 v8, -v1, v7, v4
-; GFX8-FLUSH-NEXT: v_mul_f32_e32 v9, v4, v6
-; GFX8-FLUSH-NEXT: v_mac_f32_e32 v7, v8, v5
-; GFX8-FLUSH-NEXT: v_mad_f32 v8, -v3, v9, v4
-; GFX8-FLUSH-NEXT: v_mac_f32_e32 v9, v8, v6
-; GFX8-FLUSH-NEXT: v_mad_f32 v1, -v1, v7, v4
-; GFX8-FLUSH-NEXT: v_mad_f32 v3, -v3, v9, v4
-; GFX8-FLUSH-NEXT: v_mul_f32_e32 v1, v1, v5
-; GFX8-FLUSH-NEXT: v_mul_f32_e32 v3, v3, v6
+; GFX8-FLUSH-NEXT: v_rcp_f32_e32 v4, v1
+; GFX8-FLUSH-NEXT: v_rcp_f32_e32 v5, v3
+; GFX8-FLUSH-NEXT: v_mad_f32 v6, -v1, v4, 1.0
+; GFX8-FLUSH-NEXT: v_mad_f32 v7, -v3, v5, 1.0
+; GFX8-FLUSH-NEXT: v_mad_f32 v6, v6, v4, v4
+; GFX8-FLUSH-NEXT: v_mad_f32 v7, v7, v5, v5
+; GFX8-FLUSH-NEXT: v_mad_f32 v1, -v1, v6, 1.0
+; GFX8-FLUSH-NEXT: v_mad_f32 v3, -v3, v7, 1.0
+; GFX8-FLUSH-NEXT: v_mul_f32_e32 v1, v1, v4
+; GFX8-FLUSH-NEXT: v_mul_f32_e32 v3, v3, v5
; GFX8-FLUSH-NEXT: v_and_b32_e32 v1, 0xff800000, v1
; GFX8-FLUSH-NEXT: v_and_b32_e32 v3, 0xff800000, v3
-; GFX8-FLUSH-NEXT: v_add_f32_e32 v1, v1, v7
-; GFX8-FLUSH-NEXT: v_add_f32_e32 v3, v3, v9
+; GFX8-FLUSH-NEXT: v_add_f32_e32 v1, v1, v6
+; GFX8-FLUSH-NEXT: v_add_f32_e32 v3, v3, v7
; GFX8-FLUSH-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX8-FLUSH-NEXT: v_cvt_f16_f32_e32 v3, v3
; GFX8-FLUSH-NEXT: v_div_fixup_f16 v0, v1, v0, 1.0
@@ -1594,30 +1575,27 @@ define <2 x half> @v_rcp_v2f16(<2 x half> %x) {
; GFX9-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v1, v0
; GFX9-IEEE-NEXT: v_lshrrev_b32_e32 v2, 16, v0
-; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v4, 1.0
; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v3, v2
-; GFX9-IEEE-NEXT: v_rcp_f32_e32 v5, v1
-; GFX9-IEEE-NEXT: v_rcp_f32_e32 v6, v3
-; GFX9-IEEE-NEXT: v_mul_f32_e32 v7, v4, v5
-; GFX9-IEEE-NEXT: v_mul_f32_e64 v8, -v1, v7
-; GFX9-IEEE-NEXT: v_add_f32_e32 v8, v8, v4
-; GFX9-IEEE-NEXT: v_mul_f32_e32 v8, v8, v5
-; GFX9-IEEE-NEXT: v_add_f32_e32 v7, v8, v7
-; GFX9-IEEE-NEXT: v_mul_f32_e32 v9, v4, v6
-; GFX9-IEEE-NEXT: v_mul_f32_e64 v1, -v1, v7
-; GFX9-IEEE-NEXT: v_mul_f32_e64 v10, -v3, v9
-; GFX9-IEEE-NEXT: v_add_f32_e32 v1, v1, v4
-; GFX9-IEEE-NEXT: v_mul_f32_e32 v1, v1, v5
-; GFX9-IEEE-NEXT: v_add_f32_e32 v5, v10, v4
-; GFX9-IEEE-NEXT: v_mul_f32_e32 v5, v5, v6
-; GFX9-IEEE-NEXT: v_add_f32_e32 v5, v5, v9
-; GFX9-IEEE-NEXT: v_mul_f32_e64 v3, -v3, v5
-; GFX9-IEEE-NEXT: v_add_f32_e32 v3, v3, v4
-; GFX9-IEEE-NEXT: v_mul_f32_e32 v3, v3, v6
+; GFX9-IEEE-NEXT: v_rcp_f32_e32 v4, v1
+; GFX9-IEEE-NEXT: v_rcp_f32_e32 v5, v3
+; GFX9-IEEE-NEXT: v_mul_f32_e64 v6, -v1, v4
+; GFX9-IEEE-NEXT: v_add_f32_e32 v6, 1.0, v6
+; GFX9-IEEE-NEXT: v_mul_f32_e32 v6, v6, v4
+; GFX9-IEEE-NEXT: v_mul_f32_e64 v7, -v3, v5
+; GFX9-IEEE-NEXT: v_add_f32_e32 v6, v6, v4
+; GFX9-IEEE-NEXT: v_add_f32_e32 v7, 1.0, v7
+; GFX9-IEEE-NEXT: v_mul_f32_e64 v1, -v1, v6
+; GFX9-IEEE-NEXT: v_mul_f32_e32 v7, v7, v5
+; GFX9-IEEE-NEXT: v_add_f32_e32 v1, 1.0, v1
+; GFX9-IEEE-NEXT: v_mul_f32_e32 v1, v1, v4
+; GFX9-IEEE-NEXT: v_add_f32_e32 v4, v7, v5
+; GFX9-IEEE-NEXT: v_mul_f32_e64 v3, -v3, v4
+; GFX9-IEEE-NEXT: v_add_f32_e32 v3, 1.0, v3
+; GFX9-IEEE-NEXT: v_mul_f32_e32 v3, v3, v5
; GFX9-IEEE-NEXT: v_and_b32_e32 v1, 0xff800000, v1
; GFX9-IEEE-NEXT: v_and_b32_e32 v3, 0xff800000, v3
-; GFX9-IEEE-NEXT: v_add_f32_e32 v1, v1, v7
-; GFX9-IEEE-NEXT: v_add_f32_e32 v3, v3, v5
+; GFX9-IEEE-NEXT: v_add_f32_e32 v1, v1, v6
+; GFX9-IEEE-NEXT: v_add_f32_e32 v3, v3, v4
; GFX9-IEEE-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX9-IEEE-NEXT: v_cvt_f16_f32_e32 v3, v3
; GFX9-IEEE-NEXT: v_div_fixup_f16 v0, v1, v0, 1.0
@@ -1628,26 +1606,24 @@ define <2 x half> @v_rcp_v2f16(<2 x half> %x) {
; GFX9-FLUSH-LABEL: v_rcp_v2f16:
; GFX9-FLUSH: ; %bb.0:
; GFX9-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, v0
; GFX9-FLUSH-NEXT: v_lshrrev_b32_e32 v2, 16, v0
+; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, v0
; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v3, v2
-; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v4, 1.0
+; GFX9-FLUSH-NEXT: v_mov_b32_e32 v4, 1.0
; GFX9-FLUSH-NEXT: v_rcp_f32_e32 v1, v1
; GFX9-FLUSH-NEXT: v_rcp_f32_e32 v3, v3
-; GFX9-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v1
-; GFX9-FLUSH-NEXT: v_mad_mix_f32 v6, -v0, v5, 1.0 op_sel_hi:[1,0,1]
-; GFX9-FLUSH-NEXT: v_mac_f32_e32 v5, v6, v1
-; GFX9-FLUSH-NEXT: v_mul_f32_e32 v4, v4, v3
-; GFX9-FLUSH-NEXT: v_mad_mix_f32 v7, -v0, v5, 1.0 op_sel_hi:[1,0,1]
-; GFX9-FLUSH-NEXT: v_mad_mix_f32 v6, -v0, v4, 1.0 op_sel:[1,0,0] op_sel_hi:[1,0,1]
+; GFX9-FLUSH-NEXT: v_mad_mix_f32 v5, -v0, v1, v4 op_sel_hi:[1,0,0]
+; GFX9-FLUSH-NEXT: v_mad_mix_f32 v6, -v0, v3, v4 op_sel:[1,0,0] op_sel_hi:[1,0,0]
+; GFX9-FLUSH-NEXT: v_mad_f32 v5, v5, v1, v1
+; GFX9-FLUSH-NEXT: v_mad_f32 v6, v6, v3, v3
+; GFX9-FLUSH-NEXT: v_mad_mix_f32 v7, -v0, v5, v4 op_sel_hi:[1,0,0]
+; GFX9-FLUSH-NEXT: v_mad_mix_f32 v4, -v0, v6, v4 op_sel:[1,0,0] op_sel_hi:[1,0,0]
; GFX9-FLUSH-NEXT: v_mul_f32_e32 v1, v7, v1
+; GFX9-FLUSH-NEXT: v_mul_f32_e32 v3, v4, v3
; GFX9-FLUSH-NEXT: v_and_b32_e32 v1, 0xff800000, v1
-; GFX9-FLUSH-NEXT: v_mac_f32_e32 v4, v6, v3
-; GFX9-FLUSH-NEXT: v_add_f32_e32 v1, v1, v5
-; GFX9-FLUSH-NEXT: v_mad_mix_f32 v5, -v0, v4, 1.0 op_sel:[1,0,0] op_sel_hi:[1,0,1]
-; GFX9-FLUSH-NEXT: v_mul_f32_e32 v3, v5, v3
; GFX9-FLUSH-NEXT: v_and_b32_e32 v3, 0xff800000, v3
-; GFX9-FLUSH-NEXT: v_add_f32_e32 v3, v3, v4
+; GFX9-FLUSH-NEXT: v_add_f32_e32 v1, v1, v5
+; GFX9-FLUSH-NEXT: v_add_f32_e32 v3, v3, v6
; GFX9-FLUSH-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX9-FLUSH-NEXT: v_cvt_f16_f32_e32 v3, v3
; GFX9-FLUSH-NEXT: v_div_fixup_f16 v0, v1, v0, 1.0
@@ -1660,30 +1636,27 @@ define <2 x half> @v_rcp_v2f16(<2 x half> %x) {
; GFX10-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-IEEE-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GFX10-IEEE-NEXT: v_cvt_f32_f16_e32 v2, v0
-; GFX10-IEEE-NEXT: v_cvt_f32_f16_e32 v6, 1.0
; GFX10-IEEE-NEXT: v_cvt_f32_f16_e32 v3, v1
; GFX10-IEEE-NEXT: v_rcp_f32_e32 v4, v2
; GFX10-IEEE-NEXT: v_rcp_f32_e32 v5, v3
-; GFX10-IEEE-NEXT: v_mul_f32_e32 v7, v6, v4
-; GFX10-IEEE-NEXT: v_mul_f32_e32 v8, v6, v5
-; GFX10-IEEE-NEXT: v_mul_f32_e64 v9, -v2, v7
-; GFX10-IEEE-NEXT: v_mul_f32_e64 v10, -v3, v8
-; GFX10-IEEE-NEXT: v_add_f32_e32 v9, v9, v6
-; GFX10-IEEE-NEXT: v_add_f32_e32 v10, v10, v6
-; GFX10-IEEE-NEXT: v_mul_f32_e32 v9, v9, v4
-; GFX10-IEEE-NEXT: v_mul_f32_e32 v10, v10, v5
-; GFX10-IEEE-NEXT: v_add_f32_e32 v7, v9, v7
-; GFX10-IEEE-NEXT: v_add_f32_e32 v8, v10, v8
-; GFX10-IEEE-NEXT: v_mul_f32_e64 v2, -v2, v7
-; GFX10-IEEE-NEXT: v_mul_f32_e64 v3, -v3, v8
-; GFX10-IEEE-NEXT: v_add_f32_e32 v2, v2, v6
-; GFX10-IEEE-NEXT: v_add_f32_e32 v3, v3, v6
+; GFX10-IEEE-NEXT: v_mul_f32_e64 v6, -v2, v4
+; GFX10-IEEE-NEXT: v_mul_f32_e64 v7, -v3, v5
+; GFX10-IEEE-NEXT: v_add_f32_e32 v6, 1.0, v6
+; GFX10-IEEE-NEXT: v_add_f32_e32 v7, 1.0, v7
+; GFX10-IEEE-NEXT: v_mul_f32_e32 v6, v6, v4
+; GFX10-IEEE-NEXT: v_mul_f32_e32 v7, v7, v5
+; GFX10-IEEE-NEXT: v_add_f32_e32 v6, v6, v4
+; GFX10-IEEE-NEXT: v_add_f32_e32 v7, v7, v5
+; GFX10-IEEE-NEXT: v_mul_f32_e64 v2, -v2, v6
+; GFX10-IEEE-NEXT: v_mul_f32_e64 v3, -v3, v7
+; GFX10-IEEE-NEXT: v_add_f32_e32 v2, 1.0, v2
+; GFX10-IEEE-NEXT: v_add_f32_e32 v3, 1.0, v3
; GFX10-IEEE-NEXT: v_mul_f32_e32 v2, v2, v4
; GFX10-IEEE-NEXT: v_mul_f32_e32 v3, v3, v5
; GFX10-IEEE-NEXT: v_and_b32_e32 v2, 0xff800000, v2
; GFX10-IEEE-NEXT: v_and_b32_e32 v3, 0xff800000, v3
-; GFX10-IEEE-NEXT: v_add_f32_e32 v2, v2, v7
-; GFX10-IEEE-NEXT: v_add_f32_e32 v3, v3, v8
+; GFX10-IEEE-NEXT: v_add_f32_e32 v2, v2, v6
+; GFX10-IEEE-NEXT: v_add_f32_e32 v3, v3, v7
; GFX10-IEEE-NEXT: v_cvt_f16_f32_e32 v2, v2
; GFX10-IEEE-NEXT: v_cvt_f16_f32_e32 v3, v3
; GFX10-IEEE-NEXT: v_div_fixup_f16 v0, v2, v0, 1.0
@@ -1696,24 +1669,21 @@ define <2 x half> @v_rcp_v2f16(<2 x half> %x) {
; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-FLUSH-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GFX10-FLUSH-NEXT: v_cvt_f32_f16_e32 v2, v0
-; GFX10-FLUSH-NEXT: v_cvt_f32_f16_e32 v6, 1.0
; GFX10-FLUSH-NEXT: v_cvt_f32_f16_e32 v3, v1
; GFX10-FLUSH-NEXT: v_rcp_f32_e32 v4, v2
; GFX10-FLUSH-NEXT: v_rcp_f32_e32 v5, v3
-; GFX10-FLUSH-NEXT: v_mul_f32_e32 v7, v6, v4
-; GFX10-FLUSH-NEXT: v_mul_f32_e32 v8, v6, v5
-; GFX10-FLUSH-NEXT: v_mad_f32 v9, -v2, v7, v6
-; GFX10-FLUSH-NEXT: v_mad_f32 v10, -v3, v8, v6
-; GFX10-FLUSH-NEXT: v_mac_f32_e32 v7, v9, v4
-; GFX10-FLUSH-NEXT: v_mac_f32_e32 v8, v10, v5
-; GFX10-FLUSH-NEXT: v_mad_f32 v2, -v2, v7, v6
-; GFX10-FLUSH-NEXT: v_mad_f32 v3, -v3, v8, v6
+; GFX10-FLUSH-NEXT: v_mad_f32 v6, -v2, v4, 1.0
+; GFX10-FLUSH-NEXT: v_mad_f32 v7, -v3, v5, 1.0
+; GFX10-FLUSH-NEXT: v_mad_f32 v6, v6, v4, v4
+; GFX10-FLUSH-NEXT: v_mad_f32 v7, v7, v5, v5
+; GFX10-FLUSH-NEXT: v_mad_f32 v2, -v2, v6, 1.0
+; GFX10-FLUSH-NEXT: v_mad_f32 v3, -v3, v7, 1.0
; GFX10-FLUSH-NEXT: v_mul_f32_e32 v2, v2, v4
; GFX10-FLUSH-NEXT: v_mul_f32_e32 v3, v3, v5
; GFX10-FLUSH-NEXT: v_and_b32_e32 v2, 0xff800000, v2
; GFX10-FLUSH-NEXT: v_and_b32_e32 v3, 0xff800000, v3
-; GFX10-FLUSH-NEXT: v_add_f32_e32 v2, v2, v7
-; GFX10-FLUSH-NEXT: v_add_f32_e32 v3, v3, v8
+; GFX10-FLUSH-NEXT: v_add_f32_e32 v2, v2, v6
+; GFX10-FLUSH-NEXT: v_add_f32_e32 v3, v3, v7
; GFX10-FLUSH-NEXT: v_cvt_f16_f32_e32 v2, v2
; GFX10-FLUSH-NEXT: v_cvt_f16_f32_e32 v3, v3
; GFX10-FLUSH-NEXT: v_div_fixup_f16 v0, v2, v0, 1.0
@@ -1726,27 +1696,25 @@ define <2 x half> @v_rcp_v2f16(<2 x half> %x) {
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GFX11-NEXT: v_cvt_f32_f16_e32 v2, v0
-; GFX11-NEXT: v_cvt_f32_f16_e32 v4, 1.0
+; GFX11-NEXT: v_mov_b32_e32 v4, 1.0
; GFX11-NEXT: v_cvt_f32_f16_e32 v3, v1
; GFX11-NEXT: v_rcp_f32_e32 v2, v2
; GFX11-NEXT: v_rcp_f32_e32 v3, v3
; GFX11-NEXT: s_waitcnt_depctr 0xfff
-; GFX11-NEXT: v_mul_f32_e32 v5, v4, v2
-; GFX11-NEXT: v_mul_f32_e32 v4, v4, v3
-; GFX11-NEXT: v_fma_mix_f32 v7, -v0, v4, 1.0 op_sel:[1,0,0] op_sel_hi:[1,0,1]
-; GFX11-NEXT: v_fmac_f32_e32 v4, v7, v3
-; GFX11-NEXT: v_fma_mix_f32 v6, -v0, v5, 1.0 op_sel_hi:[1,0,1]
-; GFX11-NEXT: v_fma_mix_f32 v7, -v0, v4, 1.0 op_sel:[1,0,0] op_sel_hi:[1,0,1]
-; GFX11-NEXT: v_fmac_f32_e32 v5, v6, v2
-; GFX11-NEXT: v_mul_f32_e32 v3, v7, v3
-; GFX11-NEXT: v_fma_mix_f32 v6, -v0, v5, 1.0 op_sel_hi:[1,0,1]
-; GFX11-NEXT: v_dual_mul_f32 v2, v6, v2 :: v_dual_and_b32 v3, 0xff800000, v3
-; GFX11-NEXT: v_dual_add_f32 v3, v3, v4 :: v_dual_and_b32 v2, 0xff800000, v2
-; GFX11-NEXT: v_cvt_f16_f32_e32 v3, v3
-; GFX11-NEXT: v_add_f32_e32 v2, v2, v5
-; GFX11-NEXT: v_div_fixup_f16 v1, v3, v1, 1.0
+; GFX11-NEXT: v_fma_mix_f32 v5, -v0, v2, v4 op_sel_hi:[1,0,0]
+; GFX11-NEXT: v_fma_mix_f32 v6, -v0, v3, v4 op_sel:[1,0,0] op_sel_hi:[1,0,0]
+; GFX11-NEXT: v_fma_f32 v5, v5, v2, v2
+; GFX11-NEXT: v_fma_f32 v6, v6, v3, v3
+; GFX11-NEXT: v_fma_mix_f32 v7, -v0, v5, v4 op_sel_hi:[1,0,0]
+; GFX11-NEXT: v_fma_mix_f32 v4, -v0, v6, v4 op_sel:[1,0,0] op_sel_hi:[1,0,0]
+; GFX11-NEXT: v_dual_mul_f32 v2, v7, v2 :: v_dual_mul_f32 v3, v4, v3
+; GFX11-NEXT: v_and_b32_e32 v2, 0xff800000, v2
+; GFX11-NEXT: v_dual_add_f32 v2, v2, v5 :: v_dual_and_b32 v3, 0xff800000, v3
+; GFX11-NEXT: v_add_f32_e32 v3, v3, v6
; GFX11-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GFX11-NEXT: v_cvt_f16_f32_e32 v3, v3
; GFX11-NEXT: v_div_fixup_f16 v0, v2, v0, 1.0
+; GFX11-NEXT: v_div_fixup_f16 v1, v3, v1, 1.0
; GFX11-NEXT: v_pack_b32_f16 v0, v0, v1
; GFX11-NEXT: s_setpc_b64 s[30:31]
%fdiv = fdiv <2 x half> <half 1.0, half 1.0>, %x
@@ -1757,70 +1725,67 @@ define <2 x half> @v_neg_rcp_v2f16(<2 x half> %x) {
; GFX6-IEEE-LABEL: v_neg_rcp_v2f16:
; GFX6-IEEE: ; %bb.0:
; GFX6-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v2, -1.0
; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, v1
-; GFX6-IEEE-NEXT: v_div_scale_f32 v3, s[4:5], v0, v0, v2
-; GFX6-IEEE-NEXT: v_rcp_f32_e32 v4, v3
-; GFX6-IEEE-NEXT: v_div_scale_f32 v5, vcc, v2, v0, v2
-; GFX6-IEEE-NEXT: v_fma_f32 v6, -v3, v4, 1.0
-; GFX6-IEEE-NEXT: v_fma_f32 v4, v6, v4, v4
-; GFX6-IEEE-NEXT: v_mul_f32_e32 v6, v5, v4
-; GFX6-IEEE-NEXT: v_fma_f32 v7, -v3, v6, v5
-; GFX6-IEEE-NEXT: v_fma_f32 v6, v7, v4, v6
-; GFX6-IEEE-NEXT: v_fma_f32 v3, -v3, v6, v5
-; GFX6-IEEE-NEXT: v_div_fmas_f32 v3, v3, v4, v6
-; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v3, v0, v2
-; GFX6-IEEE-NEXT: v_div_scale_f32 v3, s[4:5], v1, v1, v2
-; GFX6-IEEE-NEXT: v_rcp_f32_e32 v4, v3
-; GFX6-IEEE-NEXT: v_div_scale_f32 v5, vcc, v2, v1, v2
+; GFX6-IEEE-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, -1.0
+; GFX6-IEEE-NEXT: v_rcp_f32_e32 v3, v2
+; GFX6-IEEE-NEXT: v_div_scale_f32 v4, vcc, -1.0, v0, -1.0
+; GFX6-IEEE-NEXT: v_fma_f32 v5, -v2, v3, 1.0
+; GFX6-IEEE-NEXT: v_fma_f32 v3, v5, v3, v3
+; GFX6-IEEE-NEXT: v_mul_f32_e32 v5, v4, v3
+; GFX6-IEEE-NEXT: v_fma_f32 v6, -v2, v5, v4
+; GFX6-IEEE-NEXT: v_fma_f32 v5, v6, v3, v5
+; GFX6-IEEE-NEXT: v_fma_f32 v2, -v2, v5, v4
+; GFX6-IEEE-NEXT: v_div_fmas_f32 v2, v2, v3, v5
+; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v2, v0, -1.0
+; GFX6-IEEE-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, -1.0
+; GFX6-IEEE-NEXT: v_rcp_f32_e32 v3, v2
+; GFX6-IEEE-NEXT: v_div_scale_f32 v4, vcc, -1.0, v1, -1.0
; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX6-IEEE-NEXT: v_fma_f32 v6, -v3, v4, 1.0
-; GFX6-IEEE-NEXT: v_fma_f32 v4, v6, v4, v4
-; GFX6-IEEE-NEXT: v_mul_f32_e32 v6, v5, v4
-; GFX6-IEEE-NEXT: v_fma_f32 v7, -v3, v6, v5
-; GFX6-IEEE-NEXT: v_fma_f32 v6, v7, v4, v6
-; GFX6-IEEE-NEXT: v_fma_f32 v3, -v3, v6, v5
-; GFX6-IEEE-NEXT: v_div_fmas_f32 v3, v3, v4, v6
-; GFX6-IEEE-NEXT: v_div_fixup_f32 v1, v3, v1, v2
+; GFX6-IEEE-NEXT: v_fma_f32 v5, -v2, v3, 1.0
+; GFX6-IEEE-NEXT: v_fma_f32 v3, v5, v3, v3
+; GFX6-IEEE-NEXT: v_mul_f32_e32 v5, v4, v3
+; GFX6-IEEE-NEXT: v_fma_f32 v6, -v2, v5, v4
+; GFX6-IEEE-NEXT: v_fma_f32 v5, v6, v3, v5
+; GFX6-IEEE-NEXT: v_fma_f32 v2, -v2, v5, v4
+; GFX6-IEEE-NEXT: v_div_fmas_f32 v2, v2, v3, v5
+; GFX6-IEEE-NEXT: v_div_fixup_f32 v1, v2, v1, -1.0
; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX6-IEEE-NEXT: s_setpc_b64 s[30:31]
;
; GFX6-FLUSH-LABEL: v_neg_rcp_v2f16:
; GFX6-FLUSH: ; %bb.0:
; GFX6-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v2, -1.0
; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX6-FLUSH-NEXT: v_div_scale_f32 v3, s[4:5], v0, v0, v2
-; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v4, v3
-; GFX6-FLUSH-NEXT: v_div_scale_f32 v5, vcc, v2, v0, v2
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, -1.0
+; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v3, v2
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v4, vcc, -1.0, v0, -1.0
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
-; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v3, v4, 1.0
-; GFX6-FLUSH-NEXT: v_fma_f32 v4, v6, v4, v4
-; GFX6-FLUSH-NEXT: v_mul_f32_e32 v6, v5, v4
-; GFX6-FLUSH-NEXT: v_fma_f32 v7, -v3, v6, v5
-; GFX6-FLUSH-NEXT: v_fma_f32 v6, v7, v4, v6
-; GFX6-FLUSH-NEXT: v_fma_f32 v3, -v3, v6, v5
+; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v2, v3, 1.0
+; GFX6-FLUSH-NEXT: v_fma_f32 v3, v5, v3, v3
+; GFX6-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v3
+; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v5, v4
+; GFX6-FLUSH-NEXT: v_fma_f32 v5, v6, v3, v5
+; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v5, v4
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
-; GFX6-FLUSH-NEXT: v_div_fmas_f32 v3, v3, v4, v6
-; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v4, -1.0
; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, v1
-; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v3, v0, v2
+; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v5
+; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v2, v0, -1.0
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, v4
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, -1.0
; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v3, v2
-; GFX6-FLUSH-NEXT: v_div_scale_f32 v5, vcc, v4, v1, v4
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v4, vcc, -1.0, v1, -1.0
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
-; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v3, 1.0
-; GFX6-FLUSH-NEXT: v_fma_f32 v3, v6, v3, v3
-; GFX6-FLUSH-NEXT: v_mul_f32_e32 v6, v5, v3
-; GFX6-FLUSH-NEXT: v_fma_f32 v7, -v2, v6, v5
-; GFX6-FLUSH-NEXT: v_fma_f32 v6, v7, v3, v6
-; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v6, v5
+; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v2, v3, 1.0
+; GFX6-FLUSH-NEXT: v_fma_f32 v3, v5, v3, v3
+; GFX6-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v3
+; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v5, v4
+; GFX6-FLUSH-NEXT: v_fma_f32 v5, v6, v3, v5
+; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v5, v4
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
-; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v6
-; GFX6-FLUSH-NEXT: v_div_fixup_f32 v1, v2, v1, v4
+; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v5
+; GFX6-FLUSH-NEXT: v_div_fixup_f32 v1, v2, v1, -1.0
; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX6-FLUSH-NEXT: s_setpc_b64 s[30:31]
;
@@ -1829,30 +1794,27 @@ define <2 x half> @v_neg_rcp_v2f16(<2 x half> %x) {
; GFX8-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-IEEE-NEXT: v_cvt_f32_f16_e32 v1, v0
; GFX8-IEEE-NEXT: v_lshrrev_b32_e32 v2, 16, v0
-; GFX8-IEEE-NEXT: v_cvt_f32_f16_e32 v4, -1.0
; GFX8-IEEE-NEXT: v_cvt_f32_f16_e32 v3, v2
-; GFX8-IEEE-NEXT: v_rcp_f32_e32 v5, v1
-; GFX8-IEEE-NEXT: v_rcp_f32_e32 v6, v3
-; GFX8-IEEE-NEXT: v_mul_f32_e32 v7, v4, v5
-; GFX8-IEEE-NEXT: v_mul_f32_e64 v8, -v1, v7
-; GFX8-IEEE-NEXT: v_add_f32_e32 v8, v8, v4
-; GFX8-IEEE-NEXT: v_mul_f32_e32 v8, v8, v5
-; GFX8-IEEE-NEXT: v_add_f32_e32 v7, v8, v7
-; GFX8-IEEE-NEXT: v_mul_f32_e32 v9, v4, v6
-; GFX8-IEEE-NEXT: v_mul_f32_e64 v1, -v1, v7
-; GFX8-IEEE-NEXT: v_mul_f32_e64 v10, -v3, v9
-; GFX8-IEEE-NEXT: v_add_f32_e32 v1, v1, v4
-; GFX8-IEEE-NEXT: v_mul_f32_e32 v1, v1, v5
-; GFX8-IEEE-NEXT: v_add_f32_e32 v5, v10, v4
-; GFX8-IEEE-NEXT: v_mul_f32_e32 v5, v5, v6
-; GFX8-IEEE-NEXT: v_add_f32_e32 v5, v5, v9
-; GFX8-IEEE-NEXT: v_mul_f32_e64 v3, -v3, v5
-; GFX8-IEEE-NEXT: v_add_f32_e32 v3, v3, v4
-; GFX8-IEEE-NEXT: v_mul_f32_e32 v3, v3, v6
+; GFX8-IEEE-NEXT: v_rcp_f32_e32 v4, v1
+; GFX8-IEEE-NEXT: v_rcp_f32_e32 v5, v3
+; GFX8-IEEE-NEXT: v_mul_f32_e32 v6, v1, v4
+; GFX8-IEEE-NEXT: v_add_f32_e32 v6, -1.0, v6
+; GFX8-IEEE-NEXT: v_mul_f32_e32 v6, v6, v4
+; GFX8-IEEE-NEXT: v_mul_f32_e32 v7, v3, v5
+; GFX8-IEEE-NEXT: v_sub_f32_e32 v6, v6, v4
+; GFX8-IEEE-NEXT: v_add_f32_e32 v7, -1.0, v7
+; GFX8-IEEE-NEXT: v_mul_f32_e64 v1, -v1, v6
+; GFX8-IEEE-NEXT: v_mul_f32_e32 v7, v7, v5
+; GFX8-IEEE-NEXT: v_add_f32_e32 v1, -1.0, v1
+; GFX8-IEEE-NEXT: v_mul_f32_e32 v1, v1, v4
+; GFX8-IEEE-NEXT: v_sub_f32_e32 v4, v7, v5
+; GFX8-IEEE-NEXT: v_mul_f32_e64 v3, -v3, v4
+; GFX8-IEEE-NEXT: v_add_f32_e32 v3, -1.0, v3
+; GFX8-IEEE-NEXT: v_mul_f32_e32 v3, v3, v5
; GFX8-IEEE-NEXT: v_and_b32_e32 v1, 0xff800000, v1
; GFX8-IEEE-NEXT: v_and_b32_e32 v3, 0xff800000, v3
-; GFX8-IEEE-NEXT: v_add_f32_e32 v1, v1, v7
-; GFX8-IEEE-NEXT: v_add_f32_e32 v3, v3, v5
+; GFX8-IEEE-NEXT: v_add_f32_e32 v1, v1, v6
+; GFX8-IEEE-NEXT: v_add_f32_e32 v3, v3, v4
; GFX8-IEEE-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX8-IEEE-NEXT: v_cvt_f16_f32_e32 v3, v3
; GFX8-IEEE-NEXT: v_div_fixup_f16 v0, v1, v0, -1.0
@@ -1864,26 +1826,23 @@ define <2 x half> @v_neg_rcp_v2f16(<2 x half> %x) {
; GFX8-FLUSH-LABEL: v_neg_rcp_v2f16:
; GFX8-FLUSH: ; %bb.0:
; GFX8-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, v0
; GFX8-FLUSH-NEXT: v_lshrrev_b32_e32 v2, 16, v0
+; GFX8-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, v0
; GFX8-FLUSH-NEXT: v_cvt_f32_f16_e32 v3, v2
-; GFX8-FLUSH-NEXT: v_cvt_f32_f16_e32 v4, -1.0
-; GFX8-FLUSH-NEXT: v_rcp_f32_e32 v5, v1
-; GFX8-FLUSH-NEXT: v_rcp_f32_e32 v6, v3
-; GFX8-FLUSH-NEXT: v_mul_f32_e32 v7, v4, v5
-; GFX8-FLUSH-NEXT: v_mad_f32 v8, -v1, v7, v4
-; GFX8-FLUSH-NEXT: v_mul_f32_e32 v9, v4, v6
-; GFX8-FLUSH-NEXT: v_mac_f32_e32 v7, v8, v5
-; GFX8-FLUSH-NEXT: v_mad_f32 v8, -v3, v9, v4
-; GFX8-FLUSH-NEXT: v_mac_f32_e32 v9, v8, v6
-; GFX8-FLUSH-NEXT: v_mad_f32 v1, -v1, v7, v4
-; GFX8-FLUSH-NEXT: v_mad_f32 v3, -v3, v9, v4
-; GFX8-FLUSH-NEXT: v_mul_f32_e32 v1, v1, v5
-; GFX8-FLUSH-NEXT: v_mul_f32_e32 v3, v3, v6
+; GFX8-FLUSH-NEXT: v_rcp_f32_e32 v4, v1
+; GFX8-FLUSH-NEXT: v_rcp_f32_e32 v5, v3
+; GFX8-FLUSH-NEXT: v_mad_f32 v6, v1, v4, -1.0
+; GFX8-FLUSH-NEXT: v_mad_f32 v7, v3, v5, -1.0
+; GFX8-FLUSH-NEXT: v_mad_f32 v6, v6, v4, -v4
+; GFX8-FLUSH-NEXT: v_mad_f32 v7, v7, v5, -v5
+; GFX8-FLUSH-NEXT: v_mad_f32 v1, -v1, v6, -1.0
+; GFX8-FLUSH-NEXT: v_mad_f32 v3, -v3, v7, -1.0
+; GFX8-FLUSH-NEXT: v_mul_f32_e32 v1, v1, v4
+; GFX8-FLUSH-NEXT: v_mul_f32_e32 v3, v3, v5
; GFX8-FLUSH-NEXT: v_and_b32_e32 v1, 0xff800000, v1
; GFX8-FLUSH-NEXT: v_and_b32_e32 v3, 0xff800000, v3
-; GFX8-FLUSH-NEXT: v_add_f32_e32 v1, v1, v7
-; GFX8-FLUSH-NEXT: v_add_f32_e32 v3, v3, v9
+; GFX8-FLUSH-NEXT: v_add_f32_e32 v1, v1, v6
+; GFX8-FLUSH-NEXT: v_add_f32_e32 v3, v3, v7
; GFX8-FLUSH-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX8-FLUSH-NEXT: v_cvt_f16_f32_e32 v3, v3
; GFX8-FLUSH-NEXT: v_div_fixup_f16 v0, v1, v0, -1.0
@@ -1897,30 +1856,27 @@ define <2 x half> @v_neg_rcp_v2f16(<2 x half> %x) {
; GFX9-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v1, v0
; GFX9-IEEE-NEXT: v_lshrrev_b32_e32 v2, 16, v0
-; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v4, -1.0
; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v3, v2
-; GFX9-IEEE-NEXT: v_rcp_f32_e32 v5, v1
-; GFX9-IEEE-NEXT: v_rcp_f32_e32 v6, v3
-; GFX9-IEEE-NEXT: v_mul_f32_e32 v7, v4, v5
-; GFX9-IEEE-NEXT: v_mul_f32_e64 v8, -v1, v7
-; GFX9-IEEE-NEXT: v_add_f32_e32 v8, v8, v4
-; GFX9-IEEE-NEXT: v_mul_f32_e32 v8, v8, v5
-; GFX9-IEEE-NEXT: v_add_f32_e32 v7, v8, v7
-; GFX9-IEEE-NEXT: v_mul_f32_e32 v9, v4, v6
-; GFX9-IEEE-NEXT: v_mul_f32_e64 v1, -v1, v7
-; GFX9-IEEE-NEXT: v_mul_f32_e64 v10, -v3, v9
-; GFX9-IEEE-NEXT: v_add_f32_e32 v1, v1, v4
-; GFX9-IEEE-NEXT: v_mul_f32_e32 v1, v1, v5
-; GFX9-IEEE-NEXT: v_add_f32_e32 v5, v10, v4
-; GFX9-IEEE-NEXT: v_mul_f32_e32 v5, v5, v6
-; GFX9-IEEE-NEXT: v_add_f32_e32 v5, v5, v9
-; GFX9-IEEE-NEXT: v_mul_f32_e64 v3, -v3, v5
-; GFX9-IEEE-NEXT: v_add_f32_e32 v3, v3, v4
-; GFX9-IEEE-NEXT: v_mul_f32_e32 v3, v3, v6
+; GFX9-IEEE-NEXT: v_rcp_f32_e32 v4, v1
+; GFX9-IEEE-NEXT: v_rcp_f32_e32 v5, v3
+; GFX9-IEEE-NEXT: v_mul_f32_e32 v6, v1, v4
+; GFX9-IEEE-NEXT: v_add_f32_e32 v6, -1.0, v6
+; GFX9-IEEE-NEXT: v_mul_f32_e32 v6, v6, v4
+; GFX9-IEEE-NEXT: v_mul_f32_e32 v7, v3, v5
+; GFX9-IEEE-NEXT: v_sub_f32_e32 v6, v6, v4
+; GFX9-IEEE-NEXT: v_add_f32_e32 v7, -1.0, v7
+; GFX9-IEEE-NEXT: v_mul_f32_e64 v1, -v1, v6
+; GFX9-IEEE-NEXT: v_mul_f32_e32 v7, v7, v5
+; GFX9-IEEE-NEXT: v_add_f32_e32 v1, -1.0, v1
+; GFX9-IEEE-NEXT: v_mul_f32_e32 v1, v1, v4
+; GFX9-IEEE-NEXT: v_sub_f32_e32 v4, v7, v5
+; GFX9-IEEE-NEXT: v_mul_f32_e64 v3, -v3, v4
+; GFX9-IEEE-NEXT: v_add_f32_e32 v3, -1.0, v3
+; GFX9-IEEE-NEXT: v_mul_f32_e32 v3, v3, v5
; GFX9-IEEE-NEXT: v_and_b32_e32 v1, 0xff800000, v1
; GFX9-IEEE-NEXT: v_and_b32_e32 v3, 0xff800000, v3
-; GFX9-IEEE-NEXT: v_add_f32_e32 v1, v1, v7
-; GFX9-IEEE-NEXT: v_add_f32_e32 v3, v3, v5
+; GFX9-IEEE-NEXT: v_add_f32_e32 v1, v1, v6
+; GFX9-IEEE-NEXT: v_add_f32_e32 v3, v3, v4
; GFX9-IEEE-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX9-IEEE-NEXT: v_cvt_f16_f32_e32 v3, v3
; GFX9-IEEE-NEXT: v_div_fixup_f16 v0, v1, v0, -1.0
@@ -1931,26 +1887,24 @@ define <2 x half> @v_neg_rcp_v2f16(<2 x half> %x) {
; GFX9-FLUSH-LABEL: v_neg_rcp_v2f16:
; GFX9-FLUSH: ; %bb.0:
; GFX9-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, v0
; GFX9-FLUSH-NEXT: v_lshrrev_b32_e32 v2, 16, v0
+; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, v0
; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v3, v2
-; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v4, -1.0
+; GFX9-FLUSH-NEXT: v_mov_b32_e32 v4, -1.0
; GFX9-FLUSH-NEXT: v_rcp_f32_e32 v1, v1
; GFX9-FLUSH-NEXT: v_rcp_f32_e32 v3, v3
-; GFX9-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v1
-; GFX9-FLUSH-NEXT: v_mad_mix_f32 v6, -v0, v5, -1.0 op_sel_hi:[1,0,1]
-; GFX9-FLUSH-NEXT: v_mac_f32_e32 v5, v6, v1
-; GFX9-FLUSH-NEXT: v_mul_f32_e32 v4, v4, v3
-; GFX9-FLUSH-NEXT: v_mad_mix_f32 v7, -v0, v5, -1.0 op_sel_hi:[1,0,1]
-; GFX9-FLUSH-NEXT: v_mad_mix_f32 v6, -v0, v4, -1.0 op_sel:[1,0,0] op_sel_hi:[1,0,1]
+; GFX9-FLUSH-NEXT: v_mad_mix_f32 v5, v0, v1, v4 op_sel_hi:[1,0,0]
+; GFX9-FLUSH-NEXT: v_mad_mix_f32 v6, v0, v3, v4 op_sel:[1,0,0] op_sel_hi:[1,0,0]
+; GFX9-FLUSH-NEXT: v_mad_f32 v5, v5, v1, -v1
+; GFX9-FLUSH-NEXT: v_mad_f32 v6, v6, v3, -v3
+; GFX9-FLUSH-NEXT: v_mad_mix_f32 v7, -v0, v5, v4 op_sel_hi:[1,0,0]
+; GFX9-FLUSH-NEXT: v_mad_mix_f32 v4, -v0, v6, v4 op_sel:[1,0,0] op_sel_hi:[1,0,0]
; GFX9-FLUSH-NEXT: v_mul_f32_e32 v1, v7, v1
+; GFX9-FLUSH-NEXT: v_mul_f32_e32 v3, v4, v3
; GFX9-FLUSH-NEXT: v_and_b32_e32 v1, 0xff800000, v1
-; GFX9-FLUSH-NEXT: v_mac_f32_e32 v4, v6, v3
-; GFX9-FLUSH-NEXT: v_add_f32_e32 v1, v1, v5
-; GFX9-FLUSH-NEXT: v_mad_mix_f32 v5, -v0, v4, -1.0 op_sel:[1,0,0] op_sel_hi:[1,0,1]
-; GFX9-FLUSH-NEXT: v_mul_f32_e32 v3, v5, v3
; GFX9-FLUSH-NEXT: v_and_b32_e32 v3, 0xff800000, v3
-; GFX9-FLUSH-NEXT: v_add_f32_e32 v3, v3, v4
+; GFX9-FLUSH-NEXT: v_add_f32_e32 v1, v1, v5
+; GFX9-FLUSH-NEXT: v_add_f32_e32 v3, v3, v6
; GFX9-FLUSH-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX9-FLUSH-NEXT: v_cvt_f16_f32_e32 v3, v3
; GFX9-FLUSH-NEXT: v_div_fixup_f16 v0, v1, v0, -1.0
@@ -1963,30 +1917,27 @@ define <2 x half> @v_neg_rcp_v2f16(<2 x half> %x) {
; GFX10-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-IEEE-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GFX10-IEEE-NEXT: v_cvt_f32_f16_e32 v2, v0
-; GFX10-IEEE-NEXT: v_cvt_f32_f16_e32 v6, -1.0
; GFX10-IEEE-NEXT: v_cvt_f32_f16_e32 v3, v1
; GFX10-IEEE-NEXT: v_rcp_f32_e32 v4, v2
; GFX10-IEEE-NEXT: v_rcp_f32_e32 v5, v3
-; GFX10-IEEE-NEXT: v_mul_f32_e32 v7, v6, v4
-; GFX10-IEEE-NEXT: v_mul_f32_e32 v8, v6, v5
-; GFX10-IEEE-NEXT: v_mul_f32_e64 v9, -v2, v7
-; GFX10-IEEE-NEXT: v_mul_f32_e64 v10, -v3, v8
-; GFX10-IEEE-NEXT: v_add_f32_e32 v9, v9, v6
-; GFX10-IEEE-NEXT: v_add_f32_e32 v10, v10, v6
-; GFX10-IEEE-NEXT: v_mul_f32_e32 v9, v9, v4
-; GFX10-IEEE-NEXT: v_mul_f32_e32 v10, v10, v5
-; GFX10-IEEE-NEXT: v_add_f32_e32 v7, v9, v7
-; GFX10-IEEE-NEXT: v_add_f32_e32 v8, v10, v8
-; GFX10-IEEE-NEXT: v_mul_f32_e64 v2, -v2, v7
-; GFX10-IEEE-NEXT: v_mul_f32_e64 v3, -v3, v8
-; GFX10-IEEE-NEXT: v_add_f32_e32 v2, v2, v6
-; GFX10-IEEE-NEXT: v_add_f32_e32 v3, v3, v6
+; GFX10-IEEE-NEXT: v_mul_f32_e32 v6, v2, v4
+; GFX10-IEEE-NEXT: v_mul_f32_e32 v7, v3, v5
+; GFX10-IEEE-NEXT: v_add_f32_e32 v6, -1.0, v6
+; GFX10-IEEE-NEXT: v_add_f32_e32 v7, -1.0, v7
+; GFX10-IEEE-NEXT: v_mul_f32_e32 v6, v6, v4
+; GFX10-IEEE-NEXT: v_mul_f32_e32 v7, v7, v5
+; GFX10-IEEE-NEXT: v_sub_f32_e32 v6, v6, v4
+; GFX10-IEEE-NEXT: v_sub_f32_e32 v7, v7, v5
+; GFX10-IEEE-NEXT: v_mul_f32_e64 v2, -v2, v6
+; GFX10-IEEE-NEXT: v_mul_f32_e64 v3, -v3, v7
+; GFX10-IEEE-NEXT: v_add_f32_e32 v2, -1.0, v2
+; GFX10-IEEE-NEXT: v_add_f32_e32 v3, -1.0, v3
; GFX10-IEEE-NEXT: v_mul_f32_e32 v2, v2, v4
; GFX10-IEEE-NEXT: v_mul_f32_e32 v3, v3, v5
; GFX10-IEEE-NEXT: v_and_b32_e32 v2, 0xff800000, v2
; GFX10-IEEE-NEXT: v_and_b32_e32 v3, 0xff800000, v3
-; GFX10-IEEE-NEXT: v_add_f32_e32 v2, v2, v7
-; GFX10-IEEE-NEXT: v_add_f32_e32 v3, v3, v8
+; GFX10-IEEE-NEXT: v_add_f32_e32 v2, v2, v6
+; GFX10-IEEE-NEXT: v_add_f32_e32 v3, v3, v7
; GFX10-IEEE-NEXT: v_cvt_f16_f32_e32 v2, v2
; GFX10-IEEE-NEXT: v_cvt_f16_f32_e32 v3, v3
; GFX10-IEEE-NEXT: v_div_fixup_f16 v0, v2, v0, -1.0
@@ -1999,24 +1950,21 @@ define <2 x half> @v_neg_rcp_v2f16(<2 x half> %x) {
; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-FLUSH-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GFX10-FLUSH-NEXT: v_cvt_f32_f16_e32 v2, v0
-; GFX10-FLUSH-NEXT: v_cvt_f32_f16_e32 v6, -1.0
; GFX10-FLUSH-NEXT: v_cvt_f32_f16_e32 v3, v1
; GFX10-FLUSH-NEXT: v_rcp_f32_e32 v4, v2
; GFX10-FLUSH-NEXT: v_rcp_f32_e32 v5, v3
-; GFX10-FLUSH-NEXT: v_mul_f32_e32 v7, v6, v4
-; GFX10-FLUSH-NEXT: v_mul_f32_e32 v8, v6, v5
-; GFX10-FLUSH-NEXT: v_mad_f32 v9, -v2, v7, v6
-; GFX10-FLUSH-NEXT: v_mad_f32 v10, -v3, v8, v6
-; GFX10-FLUSH-NEXT: v_mac_f32_e32 v7, v9, v4
-; GFX10-FLUSH-NEXT: v_mac_f32_e32 v8, v10, v5
-; GFX10-FLUSH-NEXT: v_mad_f32 v2, -v2, v7, v6
-; GFX10-FLUSH-NEXT: v_mad_f32 v3, -v3, v8, v6
+; GFX10-FLUSH-NEXT: v_mad_f32 v6, v2, v4, -1.0
+; GFX10-FLUSH-NEXT: v_mad_f32 v7, v3, v5, -1.0
+; GFX10-FLUSH-NEXT: v_mad_f32 v6, v6, v4, -v4
+; GFX10-FLUSH-NEXT: v_mad_f32 v7, v7, v5, -v5
+; GFX10-FLUSH-NEXT: v_mad_f32 v2, -v2, v6, -1.0
+; GFX10-FLUSH-NEXT: v_mad_f32 v3, -v3, v7, -1.0
; GFX10-FLUSH-NEXT: v_mul_f32_e32 v2, v2, v4
; GFX10-FLUSH-NEXT: v_mul_f32_e32 v3, v3, v5
; GFX10-FLUSH-NEXT: v_and_b32_e32 v2, 0xff800000, v2
; GFX10-FLUSH-NEXT: v_and_b32_e32 v3, 0xff800000, v3
-; GFX10-FLUSH-NEXT: v_add_f32_e32 v2, v2, v7
-; GFX10-FLUSH-NEXT: v_add_f32_e32 v3, v3, v8
+; GFX10-FLUSH-NEXT: v_add_f32_e32 v2, v2, v6
+; GFX10-FLUSH-NEXT: v_add_f32_e32 v3, v3, v7
; GFX10-FLUSH-NEXT: v_cvt_f16_f32_e32 v2, v2
; GFX10-FLUSH-NEXT: v_cvt_f16_f32_e32 v3, v3
; GFX10-FLUSH-NEXT: v_div_fixup_f16 v0, v2, v0, -1.0
@@ -2029,27 +1977,25 @@ define <2 x half> @v_neg_rcp_v2f16(<2 x half> %x) {
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GFX11-NEXT: v_cvt_f32_f16_e32 v2, v0
-; GFX11-NEXT: v_cvt_f32_f16_e32 v4, -1.0
+; GFX11-NEXT: v_mov_b32_e32 v4, -1.0
; GFX11-NEXT: v_cvt_f32_f16_e32 v3, v1
; GFX11-NEXT: v_rcp_f32_e32 v2, v2
; GFX11-NEXT: v_rcp_f32_e32 v3, v3
; GFX11-NEXT: s_waitcnt_depctr 0xfff
-; GFX11-NEXT: v_mul_f32_e32 v5, v4, v2
-; GFX11-NEXT: v_mul_f32_e32 v4, v4, v3
-; GFX11-NEXT: v_fma_mix_f32 v7, -v0, v4, -1.0 op_sel:[1,0,0] op_sel_hi:[1,0,1]
-; GFX11-NEXT: v_fmac_f32_e32 v4, v7, v3
-; GFX11-NEXT: v_fma_mix_f32 v6, -v0, v5, -1.0 op_sel_hi:[1,0,1]
-; GFX11-NEXT: v_fma_mix_f32 v7, -v0, v4, -1.0 op_sel:[1,0,0] op_sel_hi:[1,0,1]
-; GFX11-NEXT: v_fmac_f32_e32 v5, v6, v2
-; GFX11-NEXT: v_mul_f32_e32 v3, v7, v3
-; GFX11-NEXT: v_fma_mix_f32 v6, -v0, v5, -1.0 op_sel_hi:[1,0,1]
-; GFX11-NEXT: v_dual_mul_f32 v2, v6, v2 :: v_dual_and_b32 v3, 0xff800000, v3
-; GFX11-NEXT: v_dual_add_f32 v3, v3, v4 :: v_dual_and_b32 v2, 0xff800000, v2
-; GFX11-NEXT: v_cvt_f16_f32_e32 v3, v3
-; GFX11-NEXT: v_add_f32_e32 v2, v2, v5
-; GFX11-NEXT: v_div_fixup_f16 v1, v3, v1, -1.0
+; GFX11-NEXT: v_fma_mix_f32 v5, v0, v2, v4 op_sel_hi:[1,0,0]
+; GFX11-NEXT: v_fma_mix_f32 v6, v0, v3, v4 op_sel:[1,0,0] op_sel_hi:[1,0,0]
+; GFX11-NEXT: v_fma_f32 v5, v5, v2, -v2
+; GFX11-NEXT: v_fma_f32 v6, v6, v3, -v3
+; GFX11-NEXT: v_fma_mix_f32 v7, -v0, v5, v4 op_sel_hi:[1,0,0]
+; GFX11-NEXT: v_fma_mix_f32 v4, -v0, v6, v4 op_sel:[1,0,0] op_sel_hi:[1,0,0]
+; GFX11-NEXT: v_dual_mul_f32 v2, v7, v2 :: v_dual_mul_f32 v3, v4, v3
+; GFX11-NEXT: v_and_b32_e32 v2, 0xff800000, v2
+; GFX11-NEXT: v_dual_add_f32 v2, v2, v5 :: v_dual_and_b32 v3, 0xff800000, v3
+; GFX11-NEXT: v_add_f32_e32 v3, v3, v6
; GFX11-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GFX11-NEXT: v_cvt_f16_f32_e32 v3, v3
; GFX11-NEXT: v_div_fixup_f16 v0, v2, v0, -1.0
+; GFX11-NEXT: v_div_fixup_f16 v1, v3, v1, -1.0
; GFX11-NEXT: v_pack_b32_f16 v0, v0, v1
; GFX11-NEXT: s_setpc_b64 s[30:31]
%fdiv = fdiv <2 x half> <half -1.0, half -1.0>, %x
@@ -2064,33 +2010,32 @@ define <2 x half> @v_rcp_v2f16_fabs(<2 x half> %x) {
; GFX6-IEEE-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX6-IEEE-NEXT: v_or_b32_e32 v0, v1, v0
; GFX6-IEEE-NEXT: v_and_b32_e32 v0, 0x7fff7fff, v0
-; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, 1.0
-; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v2, v0
+; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, v0
; GFX6-IEEE-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX6-IEEE-NEXT: v_div_scale_f32 v3, s[4:5], v2, v2, v1
-; GFX6-IEEE-NEXT: v_rcp_f32_e32 v4, v3
-; GFX6-IEEE-NEXT: v_div_scale_f32 v5, vcc, v1, v2, v1
-; GFX6-IEEE-NEXT: v_fma_f32 v6, -v3, v4, 1.0
-; GFX6-IEEE-NEXT: v_fma_f32 v4, v6, v4, v4
-; GFX6-IEEE-NEXT: v_mul_f32_e32 v6, v5, v4
-; GFX6-IEEE-NEXT: v_fma_f32 v7, -v3, v6, v5
-; GFX6-IEEE-NEXT: v_fma_f32 v6, v7, v4, v6
-; GFX6-IEEE-NEXT: v_fma_f32 v3, -v3, v6, v5
-; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v5, v0
-; GFX6-IEEE-NEXT: v_div_fmas_f32 v0, v3, v4, v6
-; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v0, v2, v1
-; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX6-IEEE-NEXT: v_div_scale_f32 v2, s[4:5], v5, v5, v1
+; GFX6-IEEE-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, 1.0
; GFX6-IEEE-NEXT: v_rcp_f32_e32 v3, v2
-; GFX6-IEEE-NEXT: v_div_scale_f32 v4, vcc, v1, v5, v1
-; GFX6-IEEE-NEXT: v_fma_f32 v6, -v2, v3, 1.0
-; GFX6-IEEE-NEXT: v_fma_f32 v3, v6, v3, v3
-; GFX6-IEEE-NEXT: v_mul_f32_e32 v6, v4, v3
-; GFX6-IEEE-NEXT: v_fma_f32 v7, -v2, v6, v4
-; GFX6-IEEE-NEXT: v_fma_f32 v6, v7, v3, v6
-; GFX6-IEEE-NEXT: v_fma_f32 v2, -v2, v6, v4
-; GFX6-IEEE-NEXT: v_div_fmas_f32 v2, v2, v3, v6
-; GFX6-IEEE-NEXT: v_div_fixup_f32 v1, v2, v5, v1
+; GFX6-IEEE-NEXT: v_div_scale_f32 v4, vcc, 1.0, v1, 1.0
+; GFX6-IEEE-NEXT: v_fma_f32 v5, -v2, v3, 1.0
+; GFX6-IEEE-NEXT: v_fma_f32 v3, v5, v3, v3
+; GFX6-IEEE-NEXT: v_mul_f32_e32 v5, v4, v3
+; GFX6-IEEE-NEXT: v_fma_f32 v6, -v2, v5, v4
+; GFX6-IEEE-NEXT: v_fma_f32 v5, v6, v3, v5
+; GFX6-IEEE-NEXT: v_fma_f32 v2, -v2, v5, v4
+; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v4, v0
+; GFX6-IEEE-NEXT: v_div_fmas_f32 v0, v2, v3, v5
+; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v0, v1, 1.0
+; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX6-IEEE-NEXT: v_div_scale_f32 v1, s[4:5], v4, v4, 1.0
+; GFX6-IEEE-NEXT: v_rcp_f32_e32 v2, v1
+; GFX6-IEEE-NEXT: v_div_scale_f32 v3, vcc, 1.0, v4, 1.0
+; GFX6-IEEE-NEXT: v_fma_f32 v5, -v1, v2, 1.0
+; GFX6-IEEE-NEXT: v_fma_f32 v2, v5, v2, v2
+; GFX6-IEEE-NEXT: v_mul_f32_e32 v5, v3, v2
+; GFX6-IEEE-NEXT: v_fma_f32 v6, -v1, v5, v3
+; GFX6-IEEE-NEXT: v_fma_f32 v5, v6, v2, v5
+; GFX6-IEEE-NEXT: v_fma_f32 v1, -v1, v5, v3
+; GFX6-IEEE-NEXT: v_div_fmas_f32 v1, v1, v2, v5
+; GFX6-IEEE-NEXT: v_div_fixup_f32 v1, v1, v4, 1.0
; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX6-IEEE-NEXT: s_setpc_b64 s[30:31]
;
@@ -2101,39 +2046,37 @@ define <2 x half> @v_rcp_v2f16_fabs(<2 x half> %x) {
; GFX6-FLUSH-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX6-FLUSH-NEXT: v_or_b32_e32 v0, v1, v0
; GFX6-FLUSH-NEXT: v_and_b32_e32 v0, 0x7fff7fff, v0
-; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, 1.0
-; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v2, v0
+; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, v0
; GFX6-FLUSH-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX6-FLUSH-NEXT: v_div_scale_f32 v3, s[4:5], v2, v2, v1
-; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v4, v3
-; GFX6-FLUSH-NEXT: v_div_scale_f32 v5, vcc, v1, v2, v1
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, 1.0
+; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v3, v2
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v4, vcc, 1.0, v1, 1.0
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
-; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v3, v4, 1.0
-; GFX6-FLUSH-NEXT: v_fma_f32 v4, v6, v4, v4
-; GFX6-FLUSH-NEXT: v_mul_f32_e32 v6, v5, v4
-; GFX6-FLUSH-NEXT: v_fma_f32 v7, -v3, v6, v5
-; GFX6-FLUSH-NEXT: v_fma_f32 v6, v7, v4, v6
-; GFX6-FLUSH-NEXT: v_fma_f32 v3, -v3, v6, v5
+; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v2, v3, 1.0
+; GFX6-FLUSH-NEXT: v_fma_f32 v3, v5, v3, v3
+; GFX6-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v3
+; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v5, v4
+; GFX6-FLUSH-NEXT: v_fma_f32 v5, v6, v3, v5
+; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v5, v4
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
-; GFX6-FLUSH-NEXT: v_div_fmas_f32 v3, v3, v4, v6
-; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v4, 1.0
-; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v5, v0
-; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v3, v2, v1
+; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v5
+; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v3, v0
+; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v2, v1, 1.0
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX6-FLUSH-NEXT: v_div_scale_f32 v1, s[4:5], v5, v5, v4
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v1, s[4:5], v3, v3, 1.0
; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v2, v1
-; GFX6-FLUSH-NEXT: v_div_scale_f32 v3, vcc, v4, v5, v4
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v4, vcc, 1.0, v3, 1.0
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
-; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v1, v2, 1.0
-; GFX6-FLUSH-NEXT: v_fma_f32 v2, v6, v2, v2
-; GFX6-FLUSH-NEXT: v_mul_f32_e32 v6, v3, v2
-; GFX6-FLUSH-NEXT: v_fma_f32 v7, -v1, v6, v3
-; GFX6-FLUSH-NEXT: v_fma_f32 v6, v7, v2, v6
-; GFX6-FLUSH-NEXT: v_fma_f32 v1, -v1, v6, v3
+; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v1, v2, 1.0
+; GFX6-FLUSH-NEXT: v_fma_f32 v2, v5, v2, v2
+; GFX6-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v2
+; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v1, v5, v4
+; GFX6-FLUSH-NEXT: v_fma_f32 v5, v6, v2, v5
+; GFX6-FLUSH-NEXT: v_fma_f32 v1, -v1, v5, v4
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
-; GFX6-FLUSH-NEXT: v_div_fmas_f32 v1, v1, v2, v6
-; GFX6-FLUSH-NEXT: v_div_fixup_f32 v1, v1, v5, v4
+; GFX6-FLUSH-NEXT: v_div_fmas_f32 v1, v1, v2, v5
+; GFX6-FLUSH-NEXT: v_div_fixup_f32 v1, v1, v3, 1.0
; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX6-FLUSH-NEXT: s_setpc_b64 s[30:31]
;
@@ -2143,30 +2086,27 @@ define <2 x half> @v_rcp_v2f16_fabs(<2 x half> %x) {
; GFX8-IEEE-NEXT: v_and_b32_e32 v0, 0x7fff7fff, v0
; GFX8-IEEE-NEXT: v_cvt_f32_f16_e32 v1, v0
; GFX8-IEEE-NEXT: v_lshrrev_b32_e32 v2, 16, v0
-; GFX8-IEEE-NEXT: v_cvt_f32_f16_e32 v4, 1.0
; GFX8-IEEE-NEXT: v_cvt_f32_f16_e32 v3, v2
-; GFX8-IEEE-NEXT: v_rcp_f32_e32 v5, v1
-; GFX8-IEEE-NEXT: v_rcp_f32_e32 v6, v3
-; GFX8-IEEE-NEXT: v_mul_f32_e32 v7, v4, v5
-; GFX8-IEEE-NEXT: v_mul_f32_e64 v8, -v1, v7
-; GFX8-IEEE-NEXT: v_add_f32_e32 v8, v8, v4
-; GFX8-IEEE-NEXT: v_mul_f32_e32 v8, v8, v5
-; GFX8-IEEE-NEXT: v_add_f32_e32 v7, v8, v7
-; GFX8-IEEE-NEXT: v_mul_f32_e32 v9, v4, v6
-; GFX8-IEEE-NEXT: v_mul_f32_e64 v1, -v1, v7
-; GFX8-IEEE-NEXT: v_mul_f32_e64 v10, -v3, v9
-; GFX8-IEEE-NEXT: v_add_f32_e32 v1, v1, v4
-; GFX8-IEEE-NEXT: v_mul_f32_e32 v1, v1, v5
-; GFX8-IEEE-NEXT: v_add_f32_e32 v5, v10, v4
-; GFX8-IEEE-NEXT: v_mul_f32_e32 v5, v5, v6
-; GFX8-IEEE-NEXT: v_add_f32_e32 v5, v5, v9
-; GFX8-IEEE-NEXT: v_mul_f32_e64 v3, -v3, v5
-; GFX8-IEEE-NEXT: v_add_f32_e32 v3, v3, v4
-; GFX8-IEEE-NEXT: v_mul_f32_e32 v3, v3, v6
+; GFX8-IEEE-NEXT: v_rcp_f32_e32 v4, v1
+; GFX8-IEEE-NEXT: v_rcp_f32_e32 v5, v3
+; GFX8-IEEE-NEXT: v_mul_f32_e64 v6, -v1, v4
+; GFX8-IEEE-NEXT: v_add_f32_e32 v6, 1.0, v6
+; GFX8-IEEE-NEXT: v_mul_f32_e32 v6, v6, v4
+; GFX8-IEEE-NEXT: v_mul_f32_e64 v7, -v3, v5
+; GFX8-IEEE-NEXT: v_add_f32_e32 v6, v6, v4
+; GFX8-IEEE-NEXT: v_add_f32_e32 v7, 1.0, v7
+; GFX8-IEEE-NEXT: v_mul_f32_e64 v1, -v1, v6
+; GFX8-IEEE-NEXT: v_mul_f32_e32 v7, v7, v5
+; GFX8-IEEE-NEXT: v_add_f32_e32 v1, 1.0, v1
+; GFX8-IEEE-NEXT: v_mul_f32_e32 v1, v1, v4
+; GFX8-IEEE-NEXT: v_add_f32_e32 v4, v7, v5
+; GFX8-IEEE-NEXT: v_mul_f32_e64 v3, -v3, v4
+; GFX8-IEEE-NEXT: v_add_f32_e32 v3, 1.0, v3
+; GFX8-IEEE-NEXT: v_mul_f32_e32 v3, v3, v5
; GFX8-IEEE-NEXT: v_and_b32_e32 v1, 0xff800000, v1
; GFX8-IEEE-NEXT: v_and_b32_e32 v3, 0xff800000, v3
-; GFX8-IEEE-NEXT: v_add_f32_e32 v1, v1, v7
-; GFX8-IEEE-NEXT: v_add_f32_e32 v3, v3, v5
+; GFX8-IEEE-NEXT: v_add_f32_e32 v1, v1, v6
+; GFX8-IEEE-NEXT: v_add_f32_e32 v3, v3, v4
; GFX8-IEEE-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX8-IEEE-NEXT: v_cvt_f16_f32_e32 v3, v3
; GFX8-IEEE-NEXT: v_div_fixup_f16 v0, v1, v0, 1.0
@@ -2179,26 +2119,23 @@ define <2 x half> @v_rcp_v2f16_fabs(<2 x half> %x) {
; GFX8-FLUSH: ; %bb.0:
; GFX8-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-FLUSH-NEXT: v_and_b32_e32 v0, 0x7fff7fff, v0
-; GFX8-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, v0
; GFX8-FLUSH-NEXT: v_lshrrev_b32_e32 v2, 16, v0
+; GFX8-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, v0
; GFX8-FLUSH-NEXT: v_cvt_f32_f16_e32 v3, v2
-; GFX8-FLUSH-NEXT: v_cvt_f32_f16_e32 v4, 1.0
-; GFX8-FLUSH-NEXT: v_rcp_f32_e32 v5, v1
-; GFX8-FLUSH-NEXT: v_rcp_f32_e32 v6, v3
-; GFX8-FLUSH-NEXT: v_mul_f32_e32 v7, v4, v5
-; GFX8-FLUSH-NEXT: v_mad_f32 v8, -v1, v7, v4
-; GFX8-FLUSH-NEXT: v_mul_f32_e32 v9, v4, v6
-; GFX8-FLUSH-NEXT: v_mac_f32_e32 v7, v8, v5
-; GFX8-FLUSH-NEXT: v_mad_f32 v8, -v3, v9, v4
-; GFX8-FLUSH-NEXT: v_mac_f32_e32 v9, v8, v6
-; GFX8-FLUSH-NEXT: v_mad_f32 v1, -v1, v7, v4
-; GFX8-FLUSH-NEXT: v_mad_f32 v3, -v3, v9, v4
-; GFX8-FLUSH-NEXT: v_mul_f32_e32 v1, v1, v5
-; GFX8-FLUSH-NEXT: v_mul_f32_e32 v3, v3, v6
+; GFX8-FLUSH-NEXT: v_rcp_f32_e32 v4, v1
+; GFX8-FLUSH-NEXT: v_rcp_f32_e32 v5, v3
+; GFX8-FLUSH-NEXT: v_mad_f32 v6, -v1, v4, 1.0
+; GFX8-FLUSH-NEXT: v_mad_f32 v7, -v3, v5, 1.0
+; GFX8-FLUSH-NEXT: v_mad_f32 v6, v6, v4, v4
+; GFX8-FLUSH-NEXT: v_mad_f32 v7, v7, v5, v5
+; GFX8-FLUSH-NEXT: v_mad_f32 v1, -v1, v6, 1.0
+; GFX8-FLUSH-NEXT: v_mad_f32 v3, -v3, v7, 1.0
+; GFX8-FLUSH-NEXT: v_mul_f32_e32 v1, v1, v4
+; GFX8-FLUSH-NEXT: v_mul_f32_e32 v3, v3, v5
; GFX8-FLUSH-NEXT: v_and_b32_e32 v1, 0xff800000, v1
; GFX8-FLUSH-NEXT: v_and_b32_e32 v3, 0xff800000, v3
-; GFX8-FLUSH-NEXT: v_add_f32_e32 v1, v1, v7
-; GFX8-FLUSH-NEXT: v_add_f32_e32 v3, v3, v9
+; GFX8-FLUSH-NEXT: v_add_f32_e32 v1, v1, v6
+; GFX8-FLUSH-NEXT: v_add_f32_e32 v3, v3, v7
; GFX8-FLUSH-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX8-FLUSH-NEXT: v_cvt_f16_f32_e32 v3, v3
; GFX8-FLUSH-NEXT: v_div_fixup_f16 v0, v1, v0, 1.0
@@ -2213,30 +2150,27 @@ define <2 x half> @v_rcp_v2f16_fabs(<2 x half> %x) {
; GFX9-IEEE-NEXT: v_and_b32_e32 v0, 0x7fff7fff, v0
; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v1, v0
; GFX9-IEEE-NEXT: v_lshrrev_b32_e32 v2, 16, v0
-; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v4, 1.0
; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v3, v2
-; GFX9-IEEE-NEXT: v_rcp_f32_e32 v5, v1
-; GFX9-IEEE-NEXT: v_rcp_f32_e32 v6, v3
-; GFX9-IEEE-NEXT: v_mul_f32_e32 v7, v4, v5
-; GFX9-IEEE-NEXT: v_mul_f32_e64 v8, -v1, v7
-; GFX9-IEEE-NEXT: v_add_f32_e32 v8, v8, v4
-; GFX9-IEEE-NEXT: v_mul_f32_e32 v8, v8, v5
-; GFX9-IEEE-NEXT: v_add_f32_e32 v7, v8, v7
-; GFX9-IEEE-NEXT: v_mul_f32_e32 v9, v4, v6
-; GFX9-IEEE-NEXT: v_mul_f32_e64 v1, -v1, v7
-; GFX9-IEEE-NEXT: v_mul_f32_e64 v10, -v3, v9
-; GFX9-IEEE-NEXT: v_add_f32_e32 v1, v1, v4
-; GFX9-IEEE-NEXT: v_mul_f32_e32 v1, v1, v5
-; GFX9-IEEE-NEXT: v_add_f32_e32 v5, v10, v4
-; GFX9-IEEE-NEXT: v_mul_f32_e32 v5, v5, v6
-; GFX9-IEEE-NEXT: v_add_f32_e32 v5, v5, v9
-; GFX9-IEEE-NEXT: v_mul_f32_e64 v3, -v3, v5
-; GFX9-IEEE-NEXT: v_add_f32_e32 v3, v3, v4
-; GFX9-IEEE-NEXT: v_mul_f32_e32 v3, v3, v6
+; GFX9-IEEE-NEXT: v_rcp_f32_e32 v4, v1
+; GFX9-IEEE-NEXT: v_rcp_f32_e32 v5, v3
+; GFX9-IEEE-NEXT: v_mul_f32_e64 v6, -v1, v4
+; GFX9-IEEE-NEXT: v_add_f32_e32 v6, 1.0, v6
+; GFX9-IEEE-NEXT: v_mul_f32_e32 v6, v6, v4
+; GFX9-IEEE-NEXT: v_mul_f32_e64 v7, -v3, v5
+; GFX9-IEEE-NEXT: v_add_f32_e32 v6, v6, v4
+; GFX9-IEEE-NEXT: v_add_f32_e32 v7, 1.0, v7
+; GFX9-IEEE-NEXT: v_mul_f32_e64 v1, -v1, v6
+; GFX9-IEEE-NEXT: v_mul_f32_e32 v7, v7, v5
+; GFX9-IEEE-NEXT: v_add_f32_e32 v1, 1.0, v1
+; GFX9-IEEE-NEXT: v_mul_f32_e32 v1, v1, v4
+; GFX9-IEEE-NEXT: v_add_f32_e32 v4, v7, v5
+; GFX9-IEEE-NEXT: v_mul_f32_e64 v3, -v3, v4
+; GFX9-IEEE-NEXT: v_add_f32_e32 v3, 1.0, v3
+; GFX9-IEEE-NEXT: v_mul_f32_e32 v3, v3, v5
; GFX9-IEEE-NEXT: v_and_b32_e32 v1, 0xff800000, v1
; GFX9-IEEE-NEXT: v_and_b32_e32 v3, 0xff800000, v3
-; GFX9-IEEE-NEXT: v_add_f32_e32 v1, v1, v7
-; GFX9-IEEE-NEXT: v_add_f32_e32 v3, v3, v5
+; GFX9-IEEE-NEXT: v_add_f32_e32 v1, v1, v6
+; GFX9-IEEE-NEXT: v_add_f32_e32 v3, v3, v4
; GFX9-IEEE-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX9-IEEE-NEXT: v_cvt_f16_f32_e32 v3, v3
; GFX9-IEEE-NEXT: v_div_fixup_f16 v0, v1, v0, 1.0
@@ -2248,26 +2182,24 @@ define <2 x half> @v_rcp_v2f16_fabs(<2 x half> %x) {
; GFX9-FLUSH: ; %bb.0:
; GFX9-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-FLUSH-NEXT: v_and_b32_e32 v1, 0x7fff7fff, v0
-; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v2, v1
; GFX9-FLUSH-NEXT: v_lshrrev_b32_e32 v3, 16, v1
+; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v2, v1
; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v4, v3
-; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v5, 1.0
+; GFX9-FLUSH-NEXT: v_mov_b32_e32 v5, 1.0
; GFX9-FLUSH-NEXT: v_rcp_f32_e32 v2, v2
; GFX9-FLUSH-NEXT: v_rcp_f32_e32 v4, v4
-; GFX9-FLUSH-NEXT: v_mul_f32_e32 v6, v5, v2
-; GFX9-FLUSH-NEXT: v_mad_mix_f32 v7, -v1, v6, 1.0 op_sel_hi:[1,0,1]
-; GFX9-FLUSH-NEXT: v_mul_f32_e32 v5, v5, v4
-; GFX9-FLUSH-NEXT: v_mac_f32_e32 v6, v7, v2
-; GFX9-FLUSH-NEXT: v_mad_mix_f32 v7, -|v0|, v5, 1.0 op_sel:[1,0,0] op_sel_hi:[1,0,1]
-; GFX9-FLUSH-NEXT: v_mac_f32_e32 v5, v7, v4
-; GFX9-FLUSH-NEXT: v_mad_mix_f32 v8, -v1, v6, 1.0 op_sel_hi:[1,0,1]
-; GFX9-FLUSH-NEXT: v_mad_mix_f32 v0, -|v0|, v5, 1.0 op_sel:[1,0,0] op_sel_hi:[1,0,1]
+; GFX9-FLUSH-NEXT: v_mad_mix_f32 v6, -v1, v2, v5 op_sel_hi:[1,0,0]
+; GFX9-FLUSH-NEXT: v_mad_mix_f32 v7, -|v0|, v4, v5 op_sel:[1,0,0] op_sel_hi:[1,0,0]
+; GFX9-FLUSH-NEXT: v_mad_f32 v6, v6, v2, v2
+; GFX9-FLUSH-NEXT: v_mad_f32 v7, v7, v4, v4
+; GFX9-FLUSH-NEXT: v_mad_mix_f32 v8, -v1, v6, v5 op_sel_hi:[1,0,0]
+; GFX9-FLUSH-NEXT: v_mad_mix_f32 v0, -|v0|, v7, v5 op_sel:[1,0,0] op_sel_hi:[1,0,0]
; GFX9-FLUSH-NEXT: v_mul_f32_e32 v2, v8, v2
; GFX9-FLUSH-NEXT: v_mul_f32_e32 v0, v0, v4
; GFX9-FLUSH-NEXT: v_and_b32_e32 v2, 0xff800000, v2
; GFX9-FLUSH-NEXT: v_and_b32_e32 v0, 0xff800000, v0
; GFX9-FLUSH-NEXT: v_add_f32_e32 v2, v2, v6
-; GFX9-FLUSH-NEXT: v_add_f32_e32 v0, v0, v5
+; GFX9-FLUSH-NEXT: v_add_f32_e32 v0, v0, v7
; GFX9-FLUSH-NEXT: v_cvt_f16_f32_e32 v2, v2
; GFX9-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX9-FLUSH-NEXT: v_div_fixup_f16 v1, v2, v1, 1.0
@@ -2279,32 +2211,29 @@ define <2 x half> @v_rcp_v2f16_fabs(<2 x half> %x) {
; GFX10-IEEE: ; %bb.0:
; GFX10-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-IEEE-NEXT: v_and_b32_e32 v0, 0x7fff7fff, v0
-; GFX10-IEEE-NEXT: v_cvt_f32_f16_e32 v6, 1.0
; GFX10-IEEE-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GFX10-IEEE-NEXT: v_cvt_f32_f16_e32 v2, v0
; GFX10-IEEE-NEXT: v_cvt_f32_f16_e32 v3, v1
; GFX10-IEEE-NEXT: v_rcp_f32_e32 v4, v2
; GFX10-IEEE-NEXT: v_rcp_f32_e32 v5, v3
-; GFX10-IEEE-NEXT: v_mul_f32_e32 v7, v6, v4
-; GFX10-IEEE-NEXT: v_mul_f32_e32 v8, v6, v5
-; GFX10-IEEE-NEXT: v_mul_f32_e64 v9, -v2, v7
-; GFX10-IEEE-NEXT: v_mul_f32_e64 v10, -v3, v8
-; GFX10-IEEE-NEXT: v_add_f32_e32 v9, v9, v6
-; GFX10-IEEE-NEXT: v_add_f32_e32 v10, v10, v6
-; GFX10-IEEE-NEXT: v_mul_f32_e32 v9, v9, v4
-; GFX10-IEEE-NEXT: v_mul_f32_e32 v10, v10, v5
-; GFX10-IEEE-NEXT: v_add_f32_e32 v7, v9, v7
-; GFX10-IEEE-NEXT: v_add_f32_e32 v8, v10, v8
-; GFX10-IEEE-NEXT: v_mul_f32_e64 v2, -v2, v7
-; GFX10-IEEE-NEXT: v_mul_f32_e64 v3, -v3, v8
-; GFX10-IEEE-NEXT: v_add_f32_e32 v2, v2, v6
-; GFX10-IEEE-NEXT: v_add_f32_e32 v3, v3, v6
+; GFX10-IEEE-NEXT: v_mul_f32_e64 v6, -v2, v4
+; GFX10-IEEE-NEXT: v_mul_f32_e64 v7, -v3, v5
+; GFX10-IEEE-NEXT: v_add_f32_e32 v6, 1.0, v6
+; GFX10-IEEE-NEXT: v_add_f32_e32 v7, 1.0, v7
+; GFX10-IEEE-NEXT: v_mul_f32_e32 v6, v6, v4
+; GFX10-IEEE-NEXT: v_mul_f32_e32 v7, v7, v5
+; GFX10-IEEE-NEXT: v_add_f32_e32 v6, v6, v4
+; GFX10-IEEE-NEXT: v_add_f32_e32 v7, v7, v5
+; GFX10-IEEE-NEXT: v_mul_f32_e64 v2, -v2, v6
+; GFX10-IEEE-NEXT: v_mul_f32_e64 v3, -v3, v7
+; GFX10-IEEE-NEXT: v_add_f32_e32 v2, 1.0, v2
+; GFX10-IEEE-NEXT: v_add_f32_e32 v3, 1.0, v3
; GFX10-IEEE-NEXT: v_mul_f32_e32 v2, v2, v4
; GFX10-IEEE-NEXT: v_mul_f32_e32 v3, v3, v5
; GFX10-IEEE-NEXT: v_and_b32_e32 v2, 0xff800000, v2
; GFX10-IEEE-NEXT: v_and_b32_e32 v3, 0xff800000, v3
-; GFX10-IEEE-NEXT: v_add_f32_e32 v2, v2, v7
-; GFX10-IEEE-NEXT: v_add_f32_e32 v3, v3, v8
+; GFX10-IEEE-NEXT: v_add_f32_e32 v2, v2, v6
+; GFX10-IEEE-NEXT: v_add_f32_e32 v3, v3, v7
; GFX10-IEEE-NEXT: v_cvt_f16_f32_e32 v2, v2
; GFX10-IEEE-NEXT: v_cvt_f16_f32_e32 v3, v3
; GFX10-IEEE-NEXT: v_div_fixup_f16 v0, v2, v0, 1.0
@@ -2316,26 +2245,23 @@ define <2 x half> @v_rcp_v2f16_fabs(<2 x half> %x) {
; GFX10-FLUSH: ; %bb.0:
; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-FLUSH-NEXT: v_and_b32_e32 v0, 0x7fff7fff, v0
-; GFX10-FLUSH-NEXT: v_cvt_f32_f16_e32 v6, 1.0
; GFX10-FLUSH-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GFX10-FLUSH-NEXT: v_cvt_f32_f16_e32 v2, v0
; GFX10-FLUSH-NEXT: v_cvt_f32_f16_e32 v3, v1
; GFX10-FLUSH-NEXT: v_rcp_f32_e32 v4, v2
; GFX10-FLUSH-NEXT: v_rcp_f32_e32 v5, v3
-; GFX10-FLUSH-NEXT: v_mul_f32_e32 v7, v6, v4
-; GFX10-FLUSH-NEXT: v_mul_f32_e32 v8, v6, v5
-; GFX10-FLUSH-NEXT: v_mad_f32 v9, -v2, v7, v6
-; GFX10-FLUSH-NEXT: v_mad_f32 v10, -v3, v8, v6
-; GFX10-FLUSH-NEXT: v_mac_f32_e32 v7, v9, v4
-; GFX10-FLUSH-NEXT: v_mac_f32_e32 v8, v10, v5
-; GFX10-FLUSH-NEXT: v_mad_f32 v2, -v2, v7, v6
-; GFX10-FLUSH-NEXT: v_mad_f32 v3, -v3, v8, v6
+; GFX10-FLUSH-NEXT: v_mad_f32 v6, -v2, v4, 1.0
+; GFX10-FLUSH-NEXT: v_mad_f32 v7, -v3, v5, 1.0
+; GFX10-FLUSH-NEXT: v_mad_f32 v6, v6, v4, v4
+; GFX10-FLUSH-NEXT: v_mad_f32 v7, v7, v5, v5
+; GFX10-FLUSH-NEXT: v_mad_f32 v2, -v2, v6, 1.0
+; GFX10-FLUSH-NEXT: v_mad_f32 v3, -v3, v7, 1.0
; GFX10-FLUSH-NEXT: v_mul_f32_e32 v2, v2, v4
; GFX10-FLUSH-NEXT: v_mul_f32_e32 v3, v3, v5
; GFX10-FLUSH-NEXT: v_and_b32_e32 v2, 0xff800000, v2
; GFX10-FLUSH-NEXT: v_and_b32_e32 v3, 0xff800000, v3
-; GFX10-FLUSH-NEXT: v_add_f32_e32 v2, v2, v7
-; GFX10-FLUSH-NEXT: v_add_f32_e32 v3, v3, v8
+; GFX10-FLUSH-NEXT: v_add_f32_e32 v2, v2, v6
+; GFX10-FLUSH-NEXT: v_add_f32_e32 v3, v3, v7
; GFX10-FLUSH-NEXT: v_cvt_f16_f32_e32 v2, v2
; GFX10-FLUSH-NEXT: v_cvt_f16_f32_e32 v3, v3
; GFX10-FLUSH-NEXT: v_div_fixup_f16 v0, v2, v0, 1.0
@@ -2346,30 +2272,30 @@ define <2 x half> @v_rcp_v2f16_fabs(<2 x half> %x) {
; GFX11-LABEL: v_rcp_v2f16_fabs:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_cvt_f32_f16_e32 v5, 1.0
; GFX11-NEXT: v_and_b32_e32 v1, 0x7fff7fff, v0
-; GFX11-NEXT: v_cvt_f32_f16_e32 v3, v1
-; GFX11-NEXT: v_rcp_f32_e32 v3, v3
-; GFX11-NEXT: s_waitcnt_depctr 0xfff
-; GFX11-NEXT: v_mul_f32_e32 v6, v5, v3
; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v1
-; GFX11-NEXT: v_fma_mix_f32 v7, -v1, v6, 1.0 op_sel_hi:[1,0,1]
; GFX11-NEXT: v_cvt_f32_f16_e32 v4, v2
-; GFX11-NEXT: v_fmac_f32_e32 v6, v7, v3
; GFX11-NEXT: v_rcp_f32_e32 v4, v4
-; GFX11-NEXT: v_fma_mix_f32 v7, -v1, v6, 1.0 op_sel_hi:[1,0,1]
+; GFX11-NEXT: v_mov_b32_e32 v5, 1.0
; GFX11-NEXT: s_waitcnt_depctr 0xfff
-; GFX11-NEXT: v_mul_f32_e32 v5, v5, v4
-; GFX11-NEXT: v_fma_mix_f32 v8, -|v0|, v5, 1.0 op_sel:[1,0,0] op_sel_hi:[1,0,1]
-; GFX11-NEXT: v_fmac_f32_e32 v5, v8, v4
-; GFX11-NEXT: v_fma_mix_f32 v0, -|v0|, v5, 1.0 op_sel:[1,0,0] op_sel_hi:[1,0,1]
-; GFX11-NEXT: v_dual_mul_f32 v3, v7, v3 :: v_dual_mul_f32 v0, v0, v4
+; GFX11-NEXT: v_fma_mix_f32 v7, -|v0|, v4, v5 op_sel:[1,0,0] op_sel_hi:[1,0,0]
+; GFX11-NEXT: v_fma_f32 v7, v7, v4, v4
+; GFX11-NEXT: v_fma_mix_f32 v0, -|v0|, v7, v5 op_sel:[1,0,0] op_sel_hi:[1,0,0]
+; GFX11-NEXT: v_mul_f32_e32 v0, v0, v4
; GFX11-NEXT: v_and_b32_e32 v0, 0xff800000, v0
-; GFX11-NEXT: v_dual_add_f32 v0, v0, v5 :: v_dual_and_b32 v3, 0xff800000, v3
-; GFX11-NEXT: v_add_f32_e32 v3, v3, v6
+; GFX11-NEXT: v_add_f32_e32 v0, v0, v7
+; GFX11-NEXT: v_cvt_f32_f16_e32 v3, v1
; GFX11-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX11-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GFX11-NEXT: v_rcp_f32_e32 v3, v3
; GFX11-NEXT: v_div_fixup_f16 v0, v0, v2, 1.0
+; GFX11-NEXT: s_waitcnt_depctr 0xfff
+; GFX11-NEXT: v_fma_mix_f32 v6, -v1, v3, v5 op_sel_hi:[1,0,0]
+; GFX11-NEXT: v_fma_f32 v6, v6, v3, v3
+; GFX11-NEXT: v_fma_mix_f32 v8, -v1, v6, v5 op_sel_hi:[1,0,0]
+; GFX11-NEXT: v_mul_f32_e32 v3, v8, v3
+; GFX11-NEXT: v_and_b32_e32 v3, 0xff800000, v3
+; GFX11-NEXT: v_add_f32_e32 v3, v3, v6
+; GFX11-NEXT: v_cvt_f16_f32_e32 v3, v3
; GFX11-NEXT: v_div_fixup_f16 v1, v3, v1, 1.0
; GFX11-NEXT: v_pack_b32_f16 v0, v1, v0
; GFX11-NEXT: s_setpc_b64 s[30:31]
@@ -2386,33 +2312,32 @@ define <2 x half> @v_neg_rcp_v2f16_fabs(<2 x half> %x) {
; GFX6-IEEE-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX6-IEEE-NEXT: v_or_b32_e32 v0, v1, v0
; GFX6-IEEE-NEXT: v_and_b32_e32 v0, 0x7fff7fff, v0
-; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, -1.0
-; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v2, v0
+; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, v0
; GFX6-IEEE-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX6-IEEE-NEXT: v_div_scale_f32 v3, s[4:5], v2, v2, v1
-; GFX6-IEEE-NEXT: v_rcp_f32_e32 v4, v3
-; GFX6-IEEE-NEXT: v_div_scale_f32 v5, vcc, v1, v2, v1
-; GFX6-IEEE-NEXT: v_fma_f32 v6, -v3, v4, 1.0
-; GFX6-IEEE-NEXT: v_fma_f32 v4, v6, v4, v4
-; GFX6-IEEE-NEXT: v_mul_f32_e32 v6, v5, v4
-; GFX6-IEEE-NEXT: v_fma_f32 v7, -v3, v6, v5
-; GFX6-IEEE-NEXT: v_fma_f32 v6, v7, v4, v6
-; GFX6-IEEE-NEXT: v_fma_f32 v3, -v3, v6, v5
-; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v5, v0
-; GFX6-IEEE-NEXT: v_div_fmas_f32 v0, v3, v4, v6
-; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v0, v2, v1
-; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX6-IEEE-NEXT: v_div_scale_f32 v2, s[4:5], v5, v5, v1
+; GFX6-IEEE-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, -1.0
; GFX6-IEEE-NEXT: v_rcp_f32_e32 v3, v2
-; GFX6-IEEE-NEXT: v_div_scale_f32 v4, vcc, v1, v5, v1
-; GFX6-IEEE-NEXT: v_fma_f32 v6, -v2, v3, 1.0
-; GFX6-IEEE-NEXT: v_fma_f32 v3, v6, v3, v3
-; GFX6-IEEE-NEXT: v_mul_f32_e32 v6, v4, v3
-; GFX6-IEEE-NEXT: v_fma_f32 v7, -v2, v6, v4
-; GFX6-IEEE-NEXT: v_fma_f32 v6, v7, v3, v6
-; GFX6-IEEE-NEXT: v_fma_f32 v2, -v2, v6, v4
-; GFX6-IEEE-NEXT: v_div_fmas_f32 v2, v2, v3, v6
-; GFX6-IEEE-NEXT: v_div_fixup_f32 v1, v2, v5, v1
+; GFX6-IEEE-NEXT: v_div_scale_f32 v4, vcc, -1.0, v1, -1.0
+; GFX6-IEEE-NEXT: v_fma_f32 v5, -v2, v3, 1.0
+; GFX6-IEEE-NEXT: v_fma_f32 v3, v5, v3, v3
+; GFX6-IEEE-NEXT: v_mul_f32_e32 v5, v4, v3
+; GFX6-IEEE-NEXT: v_fma_f32 v6, -v2, v5, v4
+; GFX6-IEEE-NEXT: v_fma_f32 v5, v6, v3, v5
+; GFX6-IEEE-NEXT: v_fma_f32 v2, -v2, v5, v4
+; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v4, v0
+; GFX6-IEEE-NEXT: v_div_fmas_f32 v0, v2, v3, v5
+; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v0, v1, -1.0
+; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX6-IEEE-NEXT: v_div_scale_f32 v1, s[4:5], v4, v4, -1.0
+; GFX6-IEEE-NEXT: v_rcp_f32_e32 v2, v1
+; GFX6-IEEE-NEXT: v_div_scale_f32 v3, vcc, -1.0, v4, -1.0
+; GFX6-IEEE-NEXT: v_fma_f32 v5, -v1, v2, 1.0
+; GFX6-IEEE-NEXT: v_fma_f32 v2, v5, v2, v2
+; GFX6-IEEE-NEXT: v_mul_f32_e32 v5, v3, v2
+; GFX6-IEEE-NEXT: v_fma_f32 v6, -v1, v5, v3
+; GFX6-IEEE-NEXT: v_fma_f32 v5, v6, v2, v5
+; GFX6-IEEE-NEXT: v_fma_f32 v1, -v1, v5, v3
+; GFX6-IEEE-NEXT: v_div_fmas_f32 v1, v1, v2, v5
+; GFX6-IEEE-NEXT: v_div_fixup_f32 v1, v1, v4, -1.0
; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX6-IEEE-NEXT: s_setpc_b64 s[30:31]
;
@@ -2423,39 +2348,37 @@ define <2 x half> @v_neg_rcp_v2f16_fabs(<2 x half> %x) {
; GFX6-FLUSH-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX6-FLUSH-NEXT: v_or_b32_e32 v0, v1, v0
; GFX6-FLUSH-NEXT: v_and_b32_e32 v0, 0x7fff7fff, v0
-; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, -1.0
-; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v2, v0
+; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, v0
; GFX6-FLUSH-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX6-FLUSH-NEXT: v_div_scale_f32 v3, s[4:5], v2, v2, v1
-; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v4, v3
-; GFX6-FLUSH-NEXT: v_div_scale_f32 v5, vcc, v1, v2, v1
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, -1.0
+; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v3, v2
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v4, vcc, -1.0, v1, -1.0
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
-; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v3, v4, 1.0
-; GFX6-FLUSH-NEXT: v_fma_f32 v4, v6, v4, v4
-; GFX6-FLUSH-NEXT: v_mul_f32_e32 v6, v5, v4
-; GFX6-FLUSH-NEXT: v_fma_f32 v7, -v3, v6, v5
-; GFX6-FLUSH-NEXT: v_fma_f32 v6, v7, v4, v6
-; GFX6-FLUSH-NEXT: v_fma_f32 v3, -v3, v6, v5
+; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v2, v3, 1.0
+; GFX6-FLUSH-NEXT: v_fma_f32 v3, v5, v3, v3
+; GFX6-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v3
+; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v5, v4
+; GFX6-FLUSH-NEXT: v_fma_f32 v5, v6, v3, v5
+; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v5, v4
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
-; GFX6-FLUSH-NEXT: v_div_fmas_f32 v3, v3, v4, v6
-; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v4, -1.0
-; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v5, v0
-; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v3, v2, v1
+; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v5
+; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v3, v0
+; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v2, v1, -1.0
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX6-FLUSH-NEXT: v_div_scale_f32 v1, s[4:5], v5, v5, v4
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v1, s[4:5], v3, v3, -1.0
; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v2, v1
-; GFX6-FLUSH-NEXT: v_div_scale_f32 v3, vcc, v4, v5, v4
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v4, vcc, -1.0, v3, -1.0
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
-; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v1, v2, 1.0
-; GFX6-FLUSH-NEXT: v_fma_f32 v2, v6, v2, v2
-; GFX6-FLUSH-NEXT: v_mul_f32_e32 v6, v3, v2
-; GFX6-FLUSH-NEXT: v_fma_f32 v7, -v1, v6, v3
-; GFX6-FLUSH-NEXT: v_fma_f32 v6, v7, v2, v6
-; GFX6-FLUSH-NEXT: v_fma_f32 v1, -v1, v6, v3
+; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v1, v2, 1.0
+; GFX6-FLUSH-NEXT: v_fma_f32 v2, v5, v2, v2
+; GFX6-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v2
+; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v1, v5, v4
+; GFX6-FLUSH-NEXT: v_fma_f32 v5, v6, v2, v5
+; GFX6-FLUSH-NEXT: v_fma_f32 v1, -v1, v5, v4
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
-; GFX6-FLUSH-NEXT: v_div_fmas_f32 v1, v1, v2, v6
-; GFX6-FLUSH-NEXT: v_div_fixup_f32 v1, v1, v5, v4
+; GFX6-FLUSH-NEXT: v_div_fmas_f32 v1, v1, v2, v5
+; GFX6-FLUSH-NEXT: v_div_fixup_f32 v1, v1, v3, -1.0
; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX6-FLUSH-NEXT: s_setpc_b64 s[30:31]
;
@@ -2465,30 +2388,27 @@ define <2 x half> @v_neg_rcp_v2f16_fabs(<2 x half> %x) {
; GFX8-IEEE-NEXT: v_and_b32_e32 v0, 0x7fff7fff, v0
; GFX8-IEEE-NEXT: v_cvt_f32_f16_e32 v1, v0
; GFX8-IEEE-NEXT: v_lshrrev_b32_e32 v2, 16, v0
-; GFX8-IEEE-NEXT: v_cvt_f32_f16_e32 v4, -1.0
; GFX8-IEEE-NEXT: v_cvt_f32_f16_e32 v3, v2
-; GFX8-IEEE-NEXT: v_rcp_f32_e32 v5, v1
-; GFX8-IEEE-NEXT: v_rcp_f32_e32 v6, v3
-; GFX8-IEEE-NEXT: v_mul_f32_e32 v7, v4, v5
-; GFX8-IEEE-NEXT: v_mul_f32_e64 v8, -v1, v7
-; GFX8-IEEE-NEXT: v_add_f32_e32 v8, v8, v4
-; GFX8-IEEE-NEXT: v_mul_f32_e32 v8, v8, v5
-; GFX8-IEEE-NEXT: v_add_f32_e32 v7, v8, v7
-; GFX8-IEEE-NEXT: v_mul_f32_e32 v9, v4, v6
-; GFX8-IEEE-NEXT: v_mul_f32_e64 v1, -v1, v7
-; GFX8-IEEE-NEXT: v_mul_f32_e64 v10, -v3, v9
-; GFX8-IEEE-NEXT: v_add_f32_e32 v1, v1, v4
-; GFX8-IEEE-NEXT: v_mul_f32_e32 v1, v1, v5
-; GFX8-IEEE-NEXT: v_add_f32_e32 v5, v10, v4
-; GFX8-IEEE-NEXT: v_mul_f32_e32 v5, v5, v6
-; GFX8-IEEE-NEXT: v_add_f32_e32 v5, v5, v9
-; GFX8-IEEE-NEXT: v_mul_f32_e64 v3, -v3, v5
-; GFX8-IEEE-NEXT: v_add_f32_e32 v3, v3, v4
-; GFX8-IEEE-NEXT: v_mul_f32_e32 v3, v3, v6
+; GFX8-IEEE-NEXT: v_rcp_f32_e32 v4, v1
+; GFX8-IEEE-NEXT: v_rcp_f32_e32 v5, v3
+; GFX8-IEEE-NEXT: v_mul_f32_e32 v6, v1, v4
+; GFX8-IEEE-NEXT: v_add_f32_e32 v6, -1.0, v6
+; GFX8-IEEE-NEXT: v_mul_f32_e32 v6, v6, v4
+; GFX8-IEEE-NEXT: v_mul_f32_e32 v7, v3, v5
+; GFX8-IEEE-NEXT: v_sub_f32_e32 v6, v6, v4
+; GFX8-IEEE-NEXT: v_add_f32_e32 v7, -1.0, v7
+; GFX8-IEEE-NEXT: v_mul_f32_e64 v1, -v1, v6
+; GFX8-IEEE-NEXT: v_mul_f32_e32 v7, v7, v5
+; GFX8-IEEE-NEXT: v_add_f32_e32 v1, -1.0, v1
+; GFX8-IEEE-NEXT: v_mul_f32_e32 v1, v1, v4
+; GFX8-IEEE-NEXT: v_sub_f32_e32 v4, v7, v5
+; GFX8-IEEE-NEXT: v_mul_f32_e64 v3, -v3, v4
+; GFX8-IEEE-NEXT: v_add_f32_e32 v3, -1.0, v3
+; GFX8-IEEE-NEXT: v_mul_f32_e32 v3, v3, v5
; GFX8-IEEE-NEXT: v_and_b32_e32 v1, 0xff800000, v1
; GFX8-IEEE-NEXT: v_and_b32_e32 v3, 0xff800000, v3
-; GFX8-IEEE-NEXT: v_add_f32_e32 v1, v1, v7
-; GFX8-IEEE-NEXT: v_add_f32_e32 v3, v3, v5
+; GFX8-IEEE-NEXT: v_add_f32_e32 v1, v1, v6
+; GFX8-IEEE-NEXT: v_add_f32_e32 v3, v3, v4
; GFX8-IEEE-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX8-IEEE-NEXT: v_cvt_f16_f32_e32 v3, v3
; GFX8-IEEE-NEXT: v_div_fixup_f16 v0, v1, v0, -1.0
@@ -2501,26 +2421,23 @@ define <2 x half> @v_neg_rcp_v2f16_fabs(<2 x half> %x) {
; GFX8-FLUSH: ; %bb.0:
; GFX8-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-FLUSH-NEXT: v_and_b32_e32 v0, 0x7fff7fff, v0
-; GFX8-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, v0
; GFX8-FLUSH-NEXT: v_lshrrev_b32_e32 v2, 16, v0
+; GFX8-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, v0
; GFX8-FLUSH-NEXT: v_cvt_f32_f16_e32 v3, v2
-; GFX8-FLUSH-NEXT: v_cvt_f32_f16_e32 v4, -1.0
-; GFX8-FLUSH-NEXT: v_rcp_f32_e32 v5, v1
-; GFX8-FLUSH-NEXT: v_rcp_f32_e32 v6, v3
-; GFX8-FLUSH-NEXT: v_mul_f32_e32 v7, v4, v5
-; GFX8-FLUSH-NEXT: v_mad_f32 v8, -v1, v7, v4
-; GFX8-FLUSH-NEXT: v_mul_f32_e32 v9, v4, v6
-; GFX8-FLUSH-NEXT: v_mac_f32_e32 v7, v8, v5
-; GFX8-FLUSH-NEXT: v_mad_f32 v8, -v3, v9, v4
-; GFX8-FLUSH-NEXT: v_mac_f32_e32 v9, v8, v6
-; GFX8-FLUSH-NEXT: v_mad_f32 v1, -v1, v7, v4
-; GFX8-FLUSH-NEXT: v_mad_f32 v3, -v3, v9, v4
-; GFX8-FLUSH-NEXT: v_mul_f32_e32 v1, v1, v5
-; GFX8-FLUSH-NEXT: v_mul_f32_e32 v3, v3, v6
+; GFX8-FLUSH-NEXT: v_rcp_f32_e32 v4, v1
+; GFX8-FLUSH-NEXT: v_rcp_f32_e32 v5, v3
+; GFX8-FLUSH-NEXT: v_mad_f32 v6, v1, v4, -1.0
+; GFX8-FLUSH-NEXT: v_mad_f32 v7, v3, v5, -1.0
+; GFX8-FLUSH-NEXT: v_mad_f32 v6, v6, v4, -v4
+; GFX8-FLUSH-NEXT: v_mad_f32 v7, v7, v5, -v5
+; GFX8-FLUSH-NEXT: v_mad_f32 v1, -v1, v6, -1.0
+; GFX8-FLUSH-NEXT: v_mad_f32 v3, -v3, v7, -1.0
+; GFX8-FLUSH-NEXT: v_mul_f32_e32 v1, v1, v4
+; GFX8-FLUSH-NEXT: v_mul_f32_e32 v3, v3, v5
; GFX8-FLUSH-NEXT: v_and_b32_e32 v1, 0xff800000, v1
; GFX8-FLUSH-NEXT: v_and_b32_e32 v3, 0xff800000, v3
-; GFX8-FLUSH-NEXT: v_add_f32_e32 v1, v1, v7
-; GFX8-FLUSH-NEXT: v_add_f32_e32 v3, v3, v9
+; GFX8-FLUSH-NEXT: v_add_f32_e32 v1, v1, v6
+; GFX8-FLUSH-NEXT: v_add_f32_e32 v3, v3, v7
; GFX8-FLUSH-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX8-FLUSH-NEXT: v_cvt_f16_f32_e32 v3, v3
; GFX8-FLUSH-NEXT: v_div_fixup_f16 v0, v1, v0, -1.0
@@ -2535,30 +2452,27 @@ define <2 x half> @v_neg_rcp_v2f16_fabs(<2 x half> %x) {
; GFX9-IEEE-NEXT: v_and_b32_e32 v0, 0x7fff7fff, v0
; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v1, v0
; GFX9-IEEE-NEXT: v_lshrrev_b32_e32 v2, 16, v0
-; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v4, -1.0
; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v3, v2
-; GFX9-IEEE-NEXT: v_rcp_f32_e32 v5, v1
-; GFX9-IEEE-NEXT: v_rcp_f32_e32 v6, v3
-; GFX9-IEEE-NEXT: v_mul_f32_e32 v7, v4, v5
-; GFX9-IEEE-NEXT: v_mul_f32_e64 v8, -v1, v7
-; GFX9-IEEE-NEXT: v_add_f32_e32 v8, v8, v4
-; GFX9-IEEE-NEXT: v_mul_f32_e32 v8, v8, v5
-; GFX9-IEEE-NEXT: v_add_f32_e32 v7, v8, v7
-; GFX9-IEEE-NEXT: v_mul_f32_e32 v9, v4, v6
-; GFX9-IEEE-NEXT: v_mul_f32_e64 v1, -v1, v7
-; GFX9-IEEE-NEXT: v_mul_f32_e64 v10, -v3, v9
-; GFX9-IEEE-NEXT: v_add_f32_e32 v1, v1, v4
-; GFX9-IEEE-NEXT: v_mul_f32_e32 v1, v1, v5
-; GFX9-IEEE-NEXT: v_add_f32_e32 v5, v10, v4
-; GFX9-IEEE-NEXT: v_mul_f32_e32 v5, v5, v6
-; GFX9-IEEE-NEXT: v_add_f32_e32 v5, v5, v9
-; GFX9-IEEE-NEXT: v_mul_f32_e64 v3, -v3, v5
-; GFX9-IEEE-NEXT: v_add_f32_e32 v3, v3, v4
-; GFX9-IEEE-NEXT: v_mul_f32_e32 v3, v3, v6
+; GFX9-IEEE-NEXT: v_rcp_f32_e32 v4, v1
+; GFX9-IEEE-NEXT: v_rcp_f32_e32 v5, v3
+; GFX9-IEEE-NEXT: v_mul_f32_e32 v6, v1, v4
+; GFX9-IEEE-NEXT: v_add_f32_e32 v6, -1.0, v6
+; GFX9-IEEE-NEXT: v_mul_f32_e32 v6, v6, v4
+; GFX9-IEEE-NEXT: v_mul_f32_e32 v7, v3, v5
+; GFX9-IEEE-NEXT: v_sub_f32_e32 v6, v6, v4
+; GFX9-IEEE-NEXT: v_add_f32_e32 v7, -1.0, v7
+; GFX9-IEEE-NEXT: v_mul_f32_e64 v1, -v1, v6
+; GFX9-IEEE-NEXT: v_mul_f32_e32 v7, v7, v5
+; GFX9-IEEE-NEXT: v_add_f32_e32 v1, -1.0, v1
+; GFX9-IEEE-NEXT: v_mul_f32_e32 v1, v1, v4
+; GFX9-IEEE-NEXT: v_sub_f32_e32 v4, v7, v5
+; GFX9-IEEE-NEXT: v_mul_f32_e64 v3, -v3, v4
+; GFX9-IEEE-NEXT: v_add_f32_e32 v3, -1.0, v3
+; GFX9-IEEE-NEXT: v_mul_f32_e32 v3, v3, v5
; GFX9-IEEE-NEXT: v_and_b32_e32 v1, 0xff800000, v1
; GFX9-IEEE-NEXT: v_and_b32_e32 v3, 0xff800000, v3
-; GFX9-IEEE-NEXT: v_add_f32_e32 v1, v1, v7
-; GFX9-IEEE-NEXT: v_add_f32_e32 v3, v3, v5
+; GFX9-IEEE-NEXT: v_add_f32_e32 v1, v1, v6
+; GFX9-IEEE-NEXT: v_add_f32_e32 v3, v3, v4
; GFX9-IEEE-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX9-IEEE-NEXT: v_cvt_f16_f32_e32 v3, v3
; GFX9-IEEE-NEXT: v_div_fixup_f16 v0, v1, v0, -1.0
@@ -2570,26 +2484,24 @@ define <2 x half> @v_neg_rcp_v2f16_fabs(<2 x half> %x) {
; GFX9-FLUSH: ; %bb.0:
; GFX9-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-FLUSH-NEXT: v_and_b32_e32 v1, 0x7fff7fff, v0
-; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v2, v1
; GFX9-FLUSH-NEXT: v_lshrrev_b32_e32 v3, 16, v1
+; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v2, v1
; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v4, v3
-; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v5, -1.0
+; GFX9-FLUSH-NEXT: v_mov_b32_e32 v5, -1.0
; GFX9-FLUSH-NEXT: v_rcp_f32_e32 v2, v2
; GFX9-FLUSH-NEXT: v_rcp_f32_e32 v4, v4
-; GFX9-FLUSH-NEXT: v_mul_f32_e32 v6, v5, v2
-; GFX9-FLUSH-NEXT: v_mad_mix_f32 v7, -v1, v6, -1.0 op_sel_hi:[1,0,1]
-; GFX9-FLUSH-NEXT: v_mul_f32_e32 v5, v5, v4
-; GFX9-FLUSH-NEXT: v_mac_f32_e32 v6, v7, v2
-; GFX9-FLUSH-NEXT: v_mad_mix_f32 v7, -|v0|, v5, -1.0 op_sel:[1,0,0] op_sel_hi:[1,0,1]
-; GFX9-FLUSH-NEXT: v_mac_f32_e32 v5, v7, v4
-; GFX9-FLUSH-NEXT: v_mad_mix_f32 v8, -v1, v6, -1.0 op_sel_hi:[1,0,1]
-; GFX9-FLUSH-NEXT: v_mad_mix_f32 v0, -|v0|, v5, -1.0 op_sel:[1,0,0] op_sel_hi:[1,0,1]
+; GFX9-FLUSH-NEXT: v_mad_mix_f32 v6, v1, v2, v5 op_sel_hi:[1,0,0]
+; GFX9-FLUSH-NEXT: v_mad_mix_f32 v7, |v0|, v4, v5 op_sel:[1,0,0] op_sel_hi:[1,0,0]
+; GFX9-FLUSH-NEXT: v_mad_f32 v6, v6, v2, -v2
+; GFX9-FLUSH-NEXT: v_mad_f32 v7, v7, v4, -v4
+; GFX9-FLUSH-NEXT: v_mad_mix_f32 v8, -v1, v6, v5 op_sel_hi:[1,0,0]
+; GFX9-FLUSH-NEXT: v_mad_mix_f32 v0, -|v0|, v7, v5 op_sel:[1,0,0] op_sel_hi:[1,0,0]
; GFX9-FLUSH-NEXT: v_mul_f32_e32 v2, v8, v2
; GFX9-FLUSH-NEXT: v_mul_f32_e32 v0, v0, v4
; GFX9-FLUSH-NEXT: v_and_b32_e32 v2, 0xff800000, v2
; GFX9-FLUSH-NEXT: v_and_b32_e32 v0, 0xff800000, v0
; GFX9-FLUSH-NEXT: v_add_f32_e32 v2, v2, v6
-; GFX9-FLUSH-NEXT: v_add_f32_e32 v0, v0, v5
+; GFX9-FLUSH-NEXT: v_add_f32_e32 v0, v0, v7
; GFX9-FLUSH-NEXT: v_cvt_f16_f32_e32 v2, v2
; GFX9-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX9-FLUSH-NEXT: v_div_fixup_f16 v1, v2, v1, -1.0
@@ -2601,32 +2513,29 @@ define <2 x half> @v_neg_rcp_v2f16_fabs(<2 x half> %x) {
; GFX10-IEEE: ; %bb.0:
; GFX10-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-IEEE-NEXT: v_and_b32_e32 v0, 0x7fff7fff, v0
-; GFX10-IEEE-NEXT: v_cvt_f32_f16_e32 v6, -1.0
; GFX10-IEEE-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GFX10-IEEE-NEXT: v_cvt_f32_f16_e32 v2, v0
; GFX10-IEEE-NEXT: v_cvt_f32_f16_e32 v3, v1
; GFX10-IEEE-NEXT: v_rcp_f32_e32 v4, v2
; GFX10-IEEE-NEXT: v_rcp_f32_e32 v5, v3
-; GFX10-IEEE-NEXT: v_mul_f32_e32 v7, v6, v4
-; GFX10-IEEE-NEXT: v_mul_f32_e32 v8, v6, v5
-; GFX10-IEEE-NEXT: v_mul_f32_e64 v9, -v2, v7
-; GFX10-IEEE-NEXT: v_mul_f32_e64 v10, -v3, v8
-; GFX10-IEEE-NEXT: v_add_f32_e32 v9, v9, v6
-; GFX10-IEEE-NEXT: v_add_f32_e32 v10, v10, v6
-; GFX10-IEEE-NEXT: v_mul_f32_e32 v9, v9, v4
-; GFX10-IEEE-NEXT: v_mul_f32_e32 v10, v10, v5
-; GFX10-IEEE-NEXT: v_add_f32_e32 v7, v9, v7
-; GFX10-IEEE-NEXT: v_add_f32_e32 v8, v10, v8
-; GFX10-IEEE-NEXT: v_mul_f32_e64 v2, -v2, v7
-; GFX10-IEEE-NEXT: v_mul_f32_e64 v3, -v3, v8
-; GFX10-IEEE-NEXT: v_add_f32_e32 v2, v2, v6
-; GFX10-IEEE-NEXT: v_add_f32_e32 v3, v3, v6
+; GFX10-IEEE-NEXT: v_mul_f32_e32 v6, v2, v4
+; GFX10-IEEE-NEXT: v_mul_f32_e32 v7, v3, v5
+; GFX10-IEEE-NEXT: v_add_f32_e32 v6, -1.0, v6
+; GFX10-IEEE-NEXT: v_add_f32_e32 v7, -1.0, v7
+; GFX10-IEEE-NEXT: v_mul_f32_e32 v6, v6, v4
+; GFX10-IEEE-NEXT: v_mul_f32_e32 v7, v7, v5
+; GFX10-IEEE-NEXT: v_sub_f32_e32 v6, v6, v4
+; GFX10-IEEE-NEXT: v_sub_f32_e32 v7, v7, v5
+; GFX10-IEEE-NEXT: v_mul_f32_e64 v2, -v2, v6
+; GFX10-IEEE-NEXT: v_mul_f32_e64 v3, -v3, v7
+; GFX10-IEEE-NEXT: v_add_f32_e32 v2, -1.0, v2
+; GFX10-IEEE-NEXT: v_add_f32_e32 v3, -1.0, v3
; GFX10-IEEE-NEXT: v_mul_f32_e32 v2, v2, v4
; GFX10-IEEE-NEXT: v_mul_f32_e32 v3, v3, v5
; GFX10-IEEE-NEXT: v_and_b32_e32 v2, 0xff800000, v2
; GFX10-IEEE-NEXT: v_and_b32_e32 v3, 0xff800000, v3
-; GFX10-IEEE-NEXT: v_add_f32_e32 v2, v2, v7
-; GFX10-IEEE-NEXT: v_add_f32_e32 v3, v3, v8
+; GFX10-IEEE-NEXT: v_add_f32_e32 v2, v2, v6
+; GFX10-IEEE-NEXT: v_add_f32_e32 v3, v3, v7
; GFX10-IEEE-NEXT: v_cvt_f16_f32_e32 v2, v2
; GFX10-IEEE-NEXT: v_cvt_f16_f32_e32 v3, v3
; GFX10-IEEE-NEXT: v_div_fixup_f16 v0, v2, v0, -1.0
@@ -2638,26 +2547,23 @@ define <2 x half> @v_neg_rcp_v2f16_fabs(<2 x half> %x) {
; GFX10-FLUSH: ; %bb.0:
; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-FLUSH-NEXT: v_and_b32_e32 v0, 0x7fff7fff, v0
-; GFX10-FLUSH-NEXT: v_cvt_f32_f16_e32 v6, -1.0
; GFX10-FLUSH-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GFX10-FLUSH-NEXT: v_cvt_f32_f16_e32 v2, v0
; GFX10-FLUSH-NEXT: v_cvt_f32_f16_e32 v3, v1
; GFX10-FLUSH-NEXT: v_rcp_f32_e32 v4, v2
; GFX10-FLUSH-NEXT: v_rcp_f32_e32 v5, v3
-; GFX10-FLUSH-NEXT: v_mul_f32_e32 v7, v6, v4
-; GFX10-FLUSH-NEXT: v_mul_f32_e32 v8, v6, v5
-; GFX10-FLUSH-NEXT: v_mad_f32 v9, -v2, v7, v6
-; GFX10-FLUSH-NEXT: v_mad_f32 v10, -v3, v8, v6
-; GFX10-FLUSH-NEXT: v_mac_f32_e32 v7, v9, v4
-; GFX10-FLUSH-NEXT: v_mac_f32_e32 v8, v10, v5
-; GFX10-FLUSH-NEXT: v_mad_f32 v2, -v2, v7, v6
-; GFX10-FLUSH-NEXT: v_mad_f32 v3, -v3, v8, v6
+; GFX10-FLUSH-NEXT: v_mad_f32 v6, v2, v4, -1.0
+; GFX10-FLUSH-NEXT: v_mad_f32 v7, v3, v5, -1.0
+; GFX10-FLUSH-NEXT: v_mad_f32 v6, v6, v4, -v4
+; GFX10-FLUSH-NEXT: v_mad_f32 v7, v7, v5, -v5
+; GFX10-FLUSH-NEXT: v_mad_f32 v2, -v2, v6, -1.0
+; GFX10-FLUSH-NEXT: v_mad_f32 v3, -v3, v7, -1.0
; GFX10-FLUSH-NEXT: v_mul_f32_e32 v2, v2, v4
; GFX10-FLUSH-NEXT: v_mul_f32_e32 v3, v3, v5
; GFX10-FLUSH-NEXT: v_and_b32_e32 v2, 0xff800000, v2
; GFX10-FLUSH-NEXT: v_and_b32_e32 v3, 0xff800000, v3
-; GFX10-FLUSH-NEXT: v_add_f32_e32 v2, v2, v7
-; GFX10-FLUSH-NEXT: v_add_f32_e32 v3, v3, v8
+; GFX10-FLUSH-NEXT: v_add_f32_e32 v2, v2, v6
+; GFX10-FLUSH-NEXT: v_add_f32_e32 v3, v3, v7
; GFX10-FLUSH-NEXT: v_cvt_f16_f32_e32 v2, v2
; GFX10-FLUSH-NEXT: v_cvt_f16_f32_e32 v3, v3
; GFX10-FLUSH-NEXT: v_div_fixup_f16 v0, v2, v0, -1.0
@@ -2668,30 +2574,30 @@ define <2 x half> @v_neg_rcp_v2f16_fabs(<2 x half> %x) {
; GFX11-LABEL: v_neg_rcp_v2f16_fabs:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_cvt_f32_f16_e32 v5, -1.0
; GFX11-NEXT: v_and_b32_e32 v1, 0x7fff7fff, v0
-; GFX11-NEXT: v_cvt_f32_f16_e32 v3, v1
-; GFX11-NEXT: v_rcp_f32_e32 v3, v3
-; GFX11-NEXT: s_waitcnt_depctr 0xfff
-; GFX11-NEXT: v_mul_f32_e32 v6, v5, v3
; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v1
-; GFX11-NEXT: v_fma_mix_f32 v7, -v1, v6, -1.0 op_sel_hi:[1,0,1]
; GFX11-NEXT: v_cvt_f32_f16_e32 v4, v2
-; GFX11-NEXT: v_fmac_f32_e32 v6, v7, v3
; GFX11-NEXT: v_rcp_f32_e32 v4, v4
-; GFX11-NEXT: v_fma_mix_f32 v7, -v1, v6, -1.0 op_sel_hi:[1,0,1]
+; GFX11-NEXT: v_mov_b32_e32 v5, -1.0
; GFX11-NEXT: s_waitcnt_depctr 0xfff
-; GFX11-NEXT: v_mul_f32_e32 v5, v5, v4
-; GFX11-NEXT: v_fma_mix_f32 v8, -|v0|, v5, -1.0 op_sel:[1,0,0] op_sel_hi:[1,0,1]
-; GFX11-NEXT: v_fmac_f32_e32 v5, v8, v4
-; GFX11-NEXT: v_fma_mix_f32 v0, -|v0|, v5, -1.0 op_sel:[1,0,0] op_sel_hi:[1,0,1]
-; GFX11-NEXT: v_dual_mul_f32 v3, v7, v3 :: v_dual_mul_f32 v0, v0, v4
+; GFX11-NEXT: v_fma_mix_f32 v7, |v0|, v4, v5 op_sel:[1,0,0] op_sel_hi:[1,0,0]
+; GFX11-NEXT: v_fma_f32 v7, v7, v4, -v4
+; GFX11-NEXT: v_fma_mix_f32 v0, -|v0|, v7, v5 op_sel:[1,0,0] op_sel_hi:[1,0,0]
+; GFX11-NEXT: v_mul_f32_e32 v0, v0, v4
; GFX11-NEXT: v_and_b32_e32 v0, 0xff800000, v0
-; GFX11-NEXT: v_dual_add_f32 v0, v0, v5 :: v_dual_and_b32 v3, 0xff800000, v3
-; GFX11-NEXT: v_add_f32_e32 v3, v3, v6
+; GFX11-NEXT: v_add_f32_e32 v0, v0, v7
+; GFX11-NEXT: v_cvt_f32_f16_e32 v3, v1
; GFX11-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX11-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GFX11-NEXT: v_rcp_f32_e32 v3, v3
; GFX11-NEXT: v_div_fixup_f16 v0, v0, v2, -1.0
+; GFX11-NEXT: s_waitcnt_depctr 0xfff
+; GFX11-NEXT: v_fma_mix_f32 v6, v1, v3, v5 op_sel_hi:[1,0,0]
+; GFX11-NEXT: v_fma_f32 v6, v6, v3, -v3
+; GFX11-NEXT: v_fma_mix_f32 v8, -v1, v6, v5 op_sel_hi:[1,0,0]
+; GFX11-NEXT: v_mul_f32_e32 v3, v8, v3
+; GFX11-NEXT: v_and_b32_e32 v3, 0xff800000, v3
+; GFX11-NEXT: v_add_f32_e32 v3, v3, v6
+; GFX11-NEXT: v_cvt_f16_f32_e32 v3, v3
; GFX11-NEXT: v_div_fixup_f16 v1, v3, v1, -1.0
; GFX11-NEXT: v_pack_b32_f16 v0, v1, v0
; GFX11-NEXT: s_setpc_b64 s[30:31]
@@ -2704,70 +2610,67 @@ define <2 x half> @v_rcp_v2f16_arcp(<2 x half> %x) {
; GFX6-IEEE-LABEL: v_rcp_v2f16_arcp:
; GFX6-IEEE: ; %bb.0:
; GFX6-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v2, 1.0
; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, v1
-; GFX6-IEEE-NEXT: v_div_scale_f32 v3, s[4:5], v0, v0, v2
-; GFX6-IEEE-NEXT: v_rcp_f32_e32 v4, v3
-; GFX6-IEEE-NEXT: v_div_scale_f32 v5, vcc, v2, v0, v2
-; GFX6-IEEE-NEXT: v_fma_f32 v6, -v3, v4, 1.0
-; GFX6-IEEE-NEXT: v_fma_f32 v4, v6, v4, v4
-; GFX6-IEEE-NEXT: v_mul_f32_e32 v6, v5, v4
-; GFX6-IEEE-NEXT: v_fma_f32 v7, -v3, v6, v5
-; GFX6-IEEE-NEXT: v_fma_f32 v6, v7, v4, v6
-; GFX6-IEEE-NEXT: v_fma_f32 v3, -v3, v6, v5
-; GFX6-IEEE-NEXT: v_div_fmas_f32 v3, v3, v4, v6
-; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v3, v0, v2
-; GFX6-IEEE-NEXT: v_div_scale_f32 v3, s[4:5], v1, v1, v2
-; GFX6-IEEE-NEXT: v_rcp_f32_e32 v4, v3
-; GFX6-IEEE-NEXT: v_div_scale_f32 v5, vcc, v2, v1, v2
+; GFX6-IEEE-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, 1.0
+; GFX6-IEEE-NEXT: v_rcp_f32_e32 v3, v2
+; GFX6-IEEE-NEXT: v_div_scale_f32 v4, vcc, 1.0, v0, 1.0
+; GFX6-IEEE-NEXT: v_fma_f32 v5, -v2, v3, 1.0
+; GFX6-IEEE-NEXT: v_fma_f32 v3, v5, v3, v3
+; GFX6-IEEE-NEXT: v_mul_f32_e32 v5, v4, v3
+; GFX6-IEEE-NEXT: v_fma_f32 v6, -v2, v5, v4
+; GFX6-IEEE-NEXT: v_fma_f32 v5, v6, v3, v5
+; GFX6-IEEE-NEXT: v_fma_f32 v2, -v2, v5, v4
+; GFX6-IEEE-NEXT: v_div_fmas_f32 v2, v2, v3, v5
+; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v2, v0, 1.0
+; GFX6-IEEE-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, 1.0
+; GFX6-IEEE-NEXT: v_rcp_f32_e32 v3, v2
+; GFX6-IEEE-NEXT: v_div_scale_f32 v4, vcc, 1.0, v1, 1.0
; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX6-IEEE-NEXT: v_fma_f32 v6, -v3, v4, 1.0
-; GFX6-IEEE-NEXT: v_fma_f32 v4, v6, v4, v4
-; GFX6-IEEE-NEXT: v_mul_f32_e32 v6, v5, v4
-; GFX6-IEEE-NEXT: v_fma_f32 v7, -v3, v6, v5
-; GFX6-IEEE-NEXT: v_fma_f32 v6, v7, v4, v6
-; GFX6-IEEE-NEXT: v_fma_f32 v3, -v3, v6, v5
-; GFX6-IEEE-NEXT: v_div_fmas_f32 v3, v3, v4, v6
-; GFX6-IEEE-NEXT: v_div_fixup_f32 v1, v3, v1, v2
+; GFX6-IEEE-NEXT: v_fma_f32 v5, -v2, v3, 1.0
+; GFX6-IEEE-NEXT: v_fma_f32 v3, v5, v3, v3
+; GFX6-IEEE-NEXT: v_mul_f32_e32 v5, v4, v3
+; GFX6-IEEE-NEXT: v_fma_f32 v6, -v2, v5, v4
+; GFX6-IEEE-NEXT: v_fma_f32 v5, v6, v3, v5
+; GFX6-IEEE-NEXT: v_fma_f32 v2, -v2, v5, v4
+; GFX6-IEEE-NEXT: v_div_fmas_f32 v2, v2, v3, v5
+; GFX6-IEEE-NEXT: v_div_fixup_f32 v1, v2, v1, 1.0
; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX6-IEEE-NEXT: s_setpc_b64 s[30:31]
;
; GFX6-FLUSH-LABEL: v_rcp_v2f16_arcp:
; GFX6-FLUSH: ; %bb.0:
; GFX6-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v2, 1.0
; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX6-FLUSH-NEXT: v_div_scale_f32 v3, s[4:5], v0, v0, v2
-; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v4, v3
-; GFX6-FLUSH-NEXT: v_div_scale_f32 v5, vcc, v2, v0, v2
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, 1.0
+; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v3, v2
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v4, vcc, 1.0, v0, 1.0
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
-; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v3, v4, 1.0
-; GFX6-FLUSH-NEXT: v_fma_f32 v4, v6, v4, v4
-; GFX6-FLUSH-NEXT: v_mul_f32_e32 v6, v5, v4
-; GFX6-FLUSH-NEXT: v_fma_f32 v7, -v3, v6, v5
-; GFX6-FLUSH-NEXT: v_fma_f32 v6, v7, v4, v6
-; GFX6-FLUSH-NEXT: v_fma_f32 v3, -v3, v6, v5
+; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v2, v3, 1.0
+; GFX6-FLUSH-NEXT: v_fma_f32 v3, v5, v3, v3
+; GFX6-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v3
+; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v5, v4
+; GFX6-FLUSH-NEXT: v_fma_f32 v5, v6, v3, v5
+; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v5, v4
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
-; GFX6-FLUSH-NEXT: v_div_fmas_f32 v3, v3, v4, v6
-; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v4, 1.0
; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, v1
-; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v3, v0, v2
+; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v5
+; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v2, v0, 1.0
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, v4
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, 1.0
; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v3, v2
-; GFX6-FLUSH-NEXT: v_div_scale_f32 v5, vcc, v4, v1, v4
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v4, vcc, 1.0, v1, 1.0
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
-; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v3, 1.0
-; GFX6-FLUSH-NEXT: v_fma_f32 v3, v6, v3, v3
-; GFX6-FLUSH-NEXT: v_mul_f32_e32 v6, v5, v3
-; GFX6-FLUSH-NEXT: v_fma_f32 v7, -v2, v6, v5
-; GFX6-FLUSH-NEXT: v_fma_f32 v6, v7, v3, v6
-; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v6, v5
+; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v2, v3, 1.0
+; GFX6-FLUSH-NEXT: v_fma_f32 v3, v5, v3, v3
+; GFX6-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v3
+; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v5, v4
+; GFX6-FLUSH-NEXT: v_fma_f32 v5, v6, v3, v5
+; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v5, v4
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
-; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v6
-; GFX6-FLUSH-NEXT: v_div_fixup_f32 v1, v2, v1, v4
+; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v5
+; GFX6-FLUSH-NEXT: v_div_fixup_f32 v1, v2, v1, 1.0
; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX6-FLUSH-NEXT: s_setpc_b64 s[30:31]
;
@@ -2814,11 +2717,8 @@ define <2 x half> @v_rcp_v2f16_arcp_afn(<2 x half> %x) {
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1
-; GFX6-NEXT: v_cvt_f32_f16_e32 v2, 1.0
; GFX6-NEXT: v_rcp_f32_e32 v0, v0
; GFX6-NEXT: v_rcp_f32_e32 v1, v1
-; GFX6-NEXT: v_mul_f32_e32 v0, v2, v0
-; GFX6-NEXT: v_mul_f32_e32 v1, v2, v1
; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX6-NEXT: s_setpc_b64 s[30:31]
@@ -2864,70 +2764,67 @@ define <2 x half> @v_rcp_v2f16_ulp25(<2 x half> %x) {
; GFX6-IEEE-LABEL: v_rcp_v2f16_ulp25:
; GFX6-IEEE: ; %bb.0:
; GFX6-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v2, 1.0
; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, v1
-; GFX6-IEEE-NEXT: v_div_scale_f32 v3, s[4:5], v0, v0, v2
-; GFX6-IEEE-NEXT: v_rcp_f32_e32 v4, v3
-; GFX6-IEEE-NEXT: v_div_scale_f32 v5, vcc, v2, v0, v2
-; GFX6-IEEE-NEXT: v_fma_f32 v6, -v3, v4, 1.0
-; GFX6-IEEE-NEXT: v_fma_f32 v4, v6, v4, v4
-; GFX6-IEEE-NEXT: v_mul_f32_e32 v6, v5, v4
-; GFX6-IEEE-NEXT: v_fma_f32 v7, -v3, v6, v5
-; GFX6-IEEE-NEXT: v_fma_f32 v6, v7, v4, v6
-; GFX6-IEEE-NEXT: v_fma_f32 v3, -v3, v6, v5
-; GFX6-IEEE-NEXT: v_div_fmas_f32 v3, v3, v4, v6
-; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v3, v0, v2
-; GFX6-IEEE-NEXT: v_div_scale_f32 v3, s[4:5], v1, v1, v2
-; GFX6-IEEE-NEXT: v_rcp_f32_e32 v4, v3
-; GFX6-IEEE-NEXT: v_div_scale_f32 v5, vcc, v2, v1, v2
+; GFX6-IEEE-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, 1.0
+; GFX6-IEEE-NEXT: v_rcp_f32_e32 v3, v2
+; GFX6-IEEE-NEXT: v_div_scale_f32 v4, vcc, 1.0, v0, 1.0
+; GFX6-IEEE-NEXT: v_fma_f32 v5, -v2, v3, 1.0
+; GFX6-IEEE-NEXT: v_fma_f32 v3, v5, v3, v3
+; GFX6-IEEE-NEXT: v_mul_f32_e32 v5, v4, v3
+; GFX6-IEEE-NEXT: v_fma_f32 v6, -v2, v5, v4
+; GFX6-IEEE-NEXT: v_fma_f32 v5, v6, v3, v5
+; GFX6-IEEE-NEXT: v_fma_f32 v2, -v2, v5, v4
+; GFX6-IEEE-NEXT: v_div_fmas_f32 v2, v2, v3, v5
+; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v2, v0, 1.0
+; GFX6-IEEE-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, 1.0
+; GFX6-IEEE-NEXT: v_rcp_f32_e32 v3, v2
+; GFX6-IEEE-NEXT: v_div_scale_f32 v4, vcc, 1.0, v1, 1.0
; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX6-IEEE-NEXT: v_fma_f32 v6, -v3, v4, 1.0
-; GFX6-IEEE-NEXT: v_fma_f32 v4, v6, v4, v4
-; GFX6-IEEE-NEXT: v_mul_f32_e32 v6, v5, v4
-; GFX6-IEEE-NEXT: v_fma_f32 v7, -v3, v6, v5
-; GFX6-IEEE-NEXT: v_fma_f32 v6, v7, v4, v6
-; GFX6-IEEE-NEXT: v_fma_f32 v3, -v3, v6, v5
-; GFX6-IEEE-NEXT: v_div_fmas_f32 v3, v3, v4, v6
-; GFX6-IEEE-NEXT: v_div_fixup_f32 v1, v3, v1, v2
+; GFX6-IEEE-NEXT: v_fma_f32 v5, -v2, v3, 1.0
+; GFX6-IEEE-NEXT: v_fma_f32 v3, v5, v3, v3
+; GFX6-IEEE-NEXT: v_mul_f32_e32 v5, v4, v3
+; GFX6-IEEE-NEXT: v_fma_f32 v6, -v2, v5, v4
+; GFX6-IEEE-NEXT: v_fma_f32 v5, v6, v3, v5
+; GFX6-IEEE-NEXT: v_fma_f32 v2, -v2, v5, v4
+; GFX6-IEEE-NEXT: v_div_fmas_f32 v2, v2, v3, v5
+; GFX6-IEEE-NEXT: v_div_fixup_f32 v1, v2, v1, 1.0
; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX6-IEEE-NEXT: s_setpc_b64 s[30:31]
;
; GFX6-FLUSH-LABEL: v_rcp_v2f16_ulp25:
; GFX6-FLUSH: ; %bb.0:
; GFX6-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v2, 1.0
; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX6-FLUSH-NEXT: v_div_scale_f32 v3, s[4:5], v0, v0, v2
-; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v4, v3
-; GFX6-FLUSH-NEXT: v_div_scale_f32 v5, vcc, v2, v0, v2
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, 1.0
+; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v3, v2
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v4, vcc, 1.0, v0, 1.0
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
-; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v3, v4, 1.0
-; GFX6-FLUSH-NEXT: v_fma_f32 v4, v6, v4, v4
-; GFX6-FLUSH-NEXT: v_mul_f32_e32 v6, v5, v4
-; GFX6-FLUSH-NEXT: v_fma_f32 v7, -v3, v6, v5
-; GFX6-FLUSH-NEXT: v_fma_f32 v6, v7, v4, v6
-; GFX6-FLUSH-NEXT: v_fma_f32 v3, -v3, v6, v5
+; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v2, v3, 1.0
+; GFX6-FLUSH-NEXT: v_fma_f32 v3, v5, v3, v3
+; GFX6-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v3
+; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v5, v4
+; GFX6-FLUSH-NEXT: v_fma_f32 v5, v6, v3, v5
+; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v5, v4
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
-; GFX6-FLUSH-NEXT: v_div_fmas_f32 v3, v3, v4, v6
-; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v4, 1.0
; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, v1
-; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v3, v0, v2
+; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v5
+; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v2, v0, 1.0
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, v4
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, 1.0
; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v3, v2
-; GFX6-FLUSH-NEXT: v_div_scale_f32 v5, vcc, v4, v1, v4
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v4, vcc, 1.0, v1, 1.0
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
-; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v3, 1.0
-; GFX6-FLUSH-NEXT: v_fma_f32 v3, v6, v3, v3
-; GFX6-FLUSH-NEXT: v_mul_f32_e32 v6, v5, v3
-; GFX6-FLUSH-NEXT: v_fma_f32 v7, -v2, v6, v5
-; GFX6-FLUSH-NEXT: v_fma_f32 v6, v7, v3, v6
-; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v6, v5
+; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v2, v3, 1.0
+; GFX6-FLUSH-NEXT: v_fma_f32 v3, v5, v3, v3
+; GFX6-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v3
+; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v5, v4
+; GFX6-FLUSH-NEXT: v_fma_f32 v5, v6, v3, v5
+; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v5, v4
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
-; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v6
-; GFX6-FLUSH-NEXT: v_div_fixup_f32 v1, v2, v1, v4
+; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v5
+; GFX6-FLUSH-NEXT: v_div_fixup_f32 v1, v2, v1, 1.0
; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX6-FLUSH-NEXT: s_setpc_b64 s[30:31]
;
@@ -2936,30 +2833,27 @@ define <2 x half> @v_rcp_v2f16_ulp25(<2 x half> %x) {
; GFX8-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-IEEE-NEXT: v_cvt_f32_f16_e32 v1, v0
; GFX8-IEEE-NEXT: v_lshrrev_b32_e32 v2, 16, v0
-; GFX8-IEEE-NEXT: v_cvt_f32_f16_e32 v4, 1.0
; GFX8-IEEE-NEXT: v_cvt_f32_f16_e32 v3, v2
-; GFX8-IEEE-NEXT: v_rcp_f32_e32 v5, v1
-; GFX8-IEEE-NEXT: v_rcp_f32_e32 v6, v3
-; GFX8-IEEE-NEXT: v_mul_f32_e32 v7, v4, v5
-; GFX8-IEEE-NEXT: v_mul_f32_e64 v8, -v1, v7
-; GFX8-IEEE-NEXT: v_add_f32_e32 v8, v8, v4
-; GFX8-IEEE-NEXT: v_mul_f32_e32 v8, v8, v5
-; GFX8-IEEE-NEXT: v_add_f32_e32 v7, v8, v7
-; GFX8-IEEE-NEXT: v_mul_f32_e32 v9, v4, v6
-; GFX8-IEEE-NEXT: v_mul_f32_e64 v1, -v1, v7
-; GFX8-IEEE-NEXT: v_mul_f32_e64 v10, -v3, v9
-; GFX8-IEEE-NEXT: v_add_f32_e32 v1, v1, v4
-; GFX8-IEEE-NEXT: v_mul_f32_e32 v1, v1, v5
-; GFX8-IEEE-NEXT: v_add_f32_e32 v5, v10, v4
-; GFX8-IEEE-NEXT: v_mul_f32_e32 v5, v5, v6
-; GFX8-IEEE-NEXT: v_add_f32_e32 v5, v5, v9
-; GFX8-IEEE-NEXT: v_mul_f32_e64 v3, -v3, v5
-; GFX8-IEEE-NEXT: v_add_f32_e32 v3, v3, v4
-; GFX8-IEEE-NEXT: v_mul_f32_e32 v3, v3, v6
+; GFX8-IEEE-NEXT: v_rcp_f32_e32 v4, v1
+; GFX8-IEEE-NEXT: v_rcp_f32_e32 v5, v3
+; GFX8-IEEE-NEXT: v_mul_f32_e64 v6, -v1, v4
+; GFX8-IEEE-NEXT: v_add_f32_e32 v6, 1.0, v6
+; GFX8-IEEE-NEXT: v_mul_f32_e32 v6, v6, v4
+; GFX8-IEEE-NEXT: v_mul_f32_e64 v7, -v3, v5
+; GFX8-IEEE-NEXT: v_add_f32_e32 v6, v6, v4
+; GFX8-IEEE-NEXT: v_add_f32_e32 v7, 1.0, v7
+; GFX8-IEEE-NEXT: v_mul_f32_e64 v1, -v1, v6
+; GFX8-IEEE-NEXT: v_mul_f32_e32 v7, v7, v5
+; GFX8-IEEE-NEXT: v_add_f32_e32 v1, 1.0, v1
+; GFX8-IEEE-NEXT: v_mul_f32_e32 v1, v1, v4
+; GFX8-IEEE-NEXT: v_add_f32_e32 v4, v7, v5
+; GFX8-IEEE-NEXT: v_mul_f32_e64 v3, -v3, v4
+; GFX8-IEEE-NEXT: v_add_f32_e32 v3, 1.0, v3
+; GFX8-IEEE-NEXT: v_mul_f32_e32 v3, v3, v5
; GFX8-IEEE-NEXT: v_and_b32_e32 v1, 0xff800000, v1
; GFX8-IEEE-NEXT: v_and_b32_e32 v3, 0xff800000, v3
-; GFX8-IEEE-NEXT: v_add_f32_e32 v1, v1, v7
-; GFX8-IEEE-NEXT: v_add_f32_e32 v3, v3, v5
+; GFX8-IEEE-NEXT: v_add_f32_e32 v1, v1, v6
+; GFX8-IEEE-NEXT: v_add_f32_e32 v3, v3, v4
; GFX8-IEEE-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX8-IEEE-NEXT: v_cvt_f16_f32_e32 v3, v3
; GFX8-IEEE-NEXT: v_div_fixup_f16 v0, v1, v0, 1.0
@@ -2971,26 +2865,23 @@ define <2 x half> @v_rcp_v2f16_ulp25(<2 x half> %x) {
; GFX8-FLUSH-LABEL: v_rcp_v2f16_ulp25:
; GFX8-FLUSH: ; %bb.0:
; GFX8-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, v0
; GFX8-FLUSH-NEXT: v_lshrrev_b32_e32 v2, 16, v0
+; GFX8-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, v0
; GFX8-FLUSH-NEXT: v_cvt_f32_f16_e32 v3, v2
-; GFX8-FLUSH-NEXT: v_cvt_f32_f16_e32 v4, 1.0
-; GFX8-FLUSH-NEXT: v_rcp_f32_e32 v5, v1
-; GFX8-FLUSH-NEXT: v_rcp_f32_e32 v6, v3
-; GFX8-FLUSH-NEXT: v_mul_f32_e32 v7, v4, v5
-; GFX8-FLUSH-NEXT: v_mad_f32 v8, -v1, v7, v4
-; GFX8-FLUSH-NEXT: v_mul_f32_e32 v9, v4, v6
-; GFX8-FLUSH-NEXT: v_mac_f32_e32 v7, v8, v5
-; GFX8-FLUSH-NEXT: v_mad_f32 v8, -v3, v9, v4
-; GFX8-FLUSH-NEXT: v_mac_f32_e32 v9, v8, v6
-; GFX8-FLUSH-NEXT: v_mad_f32 v1, -v1, v7, v4
-; GFX8-FLUSH-NEXT: v_mad_f32 v3, -v3, v9, v4
-; GFX8-FLUSH-NEXT: v_mul_f32_e32 v1, v1, v5
-; GFX8-FLUSH-NEXT: v_mul_f32_e32 v3, v3, v6
+; GFX8-FLUSH-NEXT: v_rcp_f32_e32 v4, v1
+; GFX8-FLUSH-NEXT: v_rcp_f32_e32 v5, v3
+; GFX8-FLUSH-NEXT: v_mad_f32 v6, -v1, v4, 1.0
+; GFX8-FLUSH-NEXT: v_mad_f32 v7, -v3, v5, 1.0
+; GFX8-FLUSH-NEXT: v_mad_f32 v6, v6, v4, v4
+; GFX8-FLUSH-NEXT: v_mad_f32 v7, v7, v5, v5
+; GFX8-FLUSH-NEXT: v_mad_f32 v1, -v1, v6, 1.0
+; GFX8-FLUSH-NEXT: v_mad_f32 v3, -v3, v7, 1.0
+; GFX8-FLUSH-NEXT: v_mul_f32_e32 v1, v1, v4
+; GFX8-FLUSH-NEXT: v_mul_f32_e32 v3, v3, v5
; GFX8-FLUSH-NEXT: v_and_b32_e32 v1, 0xff800000, v1
; GFX8-FLUSH-NEXT: v_and_b32_e32 v3, 0xff800000, v3
-; GFX8-FLUSH-NEXT: v_add_f32_e32 v1, v1, v7
-; GFX8-FLUSH-NEXT: v_add_f32_e32 v3, v3, v9
+; GFX8-FLUSH-NEXT: v_add_f32_e32 v1, v1, v6
+; GFX8-FLUSH-NEXT: v_add_f32_e32 v3, v3, v7
; GFX8-FLUSH-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX8-FLUSH-NEXT: v_cvt_f16_f32_e32 v3, v3
; GFX8-FLUSH-NEXT: v_div_fixup_f16 v0, v1, v0, 1.0
@@ -3004,30 +2895,27 @@ define <2 x half> @v_rcp_v2f16_ulp25(<2 x half> %x) {
; GFX9-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v1, v0
; GFX9-IEEE-NEXT: v_lshrrev_b32_e32 v2, 16, v0
-; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v4, 1.0
; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v3, v2
-; GFX9-IEEE-NEXT: v_rcp_f32_e32 v5, v1
-; GFX9-IEEE-NEXT: v_rcp_f32_e32 v6, v3
-; GFX9-IEEE-NEXT: v_mul_f32_e32 v7, v4, v5
-; GFX9-IEEE-NEXT: v_mul_f32_e64 v8, -v1, v7
-; GFX9-IEEE-NEXT: v_add_f32_e32 v8, v8, v4
-; GFX9-IEEE-NEXT: v_mul_f32_e32 v8, v8, v5
-; GFX9-IEEE-NEXT: v_add_f32_e32 v7, v8, v7
-; GFX9-IEEE-NEXT: v_mul_f32_e32 v9, v4, v6
-; GFX9-IEEE-NEXT: v_mul_f32_e64 v1, -v1, v7
-; GFX9-IEEE-NEXT: v_mul_f32_e64 v10, -v3, v9
-; GFX9-IEEE-NEXT: v_add_f32_e32 v1, v1, v4
-; GFX9-IEEE-NEXT: v_mul_f32_e32 v1, v1, v5
-; GFX9-IEEE-NEXT: v_add_f32_e32 v5, v10, v4
-; GFX9-IEEE-NEXT: v_mul_f32_e32 v5, v5, v6
-; GFX9-IEEE-NEXT: v_add_f32_e32 v5, v5, v9
-; GFX9-IEEE-NEXT: v_mul_f32_e64 v3, -v3, v5
-; GFX9-IEEE-NEXT: v_add_f32_e32 v3, v3, v4
-; GFX9-IEEE-NEXT: v_mul_f32_e32 v3, v3, v6
+; GFX9-IEEE-NEXT: v_rcp_f32_e32 v4, v1
+; GFX9-IEEE-NEXT: v_rcp_f32_e32 v5, v3
+; GFX9-IEEE-NEXT: v_mul_f32_e64 v6, -v1, v4
+; GFX9-IEEE-NEXT: v_add_f32_e32 v6, 1.0, v6
+; GFX9-IEEE-NEXT: v_mul_f32_e32 v6, v6, v4
+; GFX9-IEEE-NEXT: v_mul_f32_e64 v7, -v3, v5
+; GFX9-IEEE-NEXT: v_add_f32_e32 v6, v6, v4
+; GFX9-IEEE-NEXT: v_add_f32_e32 v7, 1.0, v7
+; GFX9-IEEE-NEXT: v_mul_f32_e64 v1, -v1, v6
+; GFX9-IEEE-NEXT: v_mul_f32_e32 v7, v7, v5
+; GFX9-IEEE-NEXT: v_add_f32_e32 v1, 1.0, v1
+; GFX9-IEEE-NEXT: v_mul_f32_e32 v1, v1, v4
+; GFX9-IEEE-NEXT: v_add_f32_e32 v4, v7, v5
+; GFX9-IEEE-NEXT: v_mul_f32_e64 v3, -v3, v4
+; GFX9-IEEE-NEXT: v_add_f32_e32 v3, 1.0, v3
+; GFX9-IEEE-NEXT: v_mul_f32_e32 v3, v3, v5
; GFX9-IEEE-NEXT: v_and_b32_e32 v1, 0xff800000, v1
; GFX9-IEEE-NEXT: v_and_b32_e32 v3, 0xff800000, v3
-; GFX9-IEEE-NEXT: v_add_f32_e32 v1, v1, v7
-; GFX9-IEEE-NEXT: v_add_f32_e32 v3, v3, v5
+; GFX9-IEEE-NEXT: v_add_f32_e32 v1, v1, v6
+; GFX9-IEEE-NEXT: v_add_f32_e32 v3, v3, v4
; GFX9-IEEE-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX9-IEEE-NEXT: v_cvt_f16_f32_e32 v3, v3
; GFX9-IEEE-NEXT: v_div_fixup_f16 v0, v1, v0, 1.0
@@ -3038,26 +2926,24 @@ define <2 x half> @v_rcp_v2f16_ulp25(<2 x half> %x) {
; GFX9-FLUSH-LABEL: v_rcp_v2f16_ulp25:
; GFX9-FLUSH: ; %bb.0:
; GFX9-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, v0
; GFX9-FLUSH-NEXT: v_lshrrev_b32_e32 v2, 16, v0
+; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, v0
; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v3, v2
-; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v4, 1.0
+; GFX9-FLUSH-NEXT: v_mov_b32_e32 v4, 1.0
; GFX9-FLUSH-NEXT: v_rcp_f32_e32 v1, v1
; GFX9-FLUSH-NEXT: v_rcp_f32_e32 v3, v3
-; GFX9-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v1
-; GFX9-FLUSH-NEXT: v_mad_mix_f32 v6, -v0, v5, 1.0 op_sel_hi:[1,0,1]
-; GFX9-FLUSH-NEXT: v_mac_f32_e32 v5, v6, v1
-; GFX9-FLUSH-NEXT: v_mul_f32_e32 v4, v4, v3
-; GFX9-FLUSH-NEXT: v_mad_mix_f32 v7, -v0, v5, 1.0 op_sel_hi:[1,0,1]
-; GFX9-FLUSH-NEXT: v_mad_mix_f32 v6, -v0, v4, 1.0 op_sel:[1,0,0] op_sel_hi:[1,0,1]
+; GFX9-FLUSH-NEXT: v_mad_mix_f32 v5, -v0, v1, v4 op_sel_hi:[1,0,0]
+; GFX9-FLUSH-NEXT: v_mad_mix_f32 v6, -v0, v3, v4 op_sel:[1,0,0] op_sel_hi:[1,0,0]
+; GFX9-FLUSH-NEXT: v_mad_f32 v5, v5, v1, v1
+; GFX9-FLUSH-NEXT: v_mad_f32 v6, v6, v3, v3
+; GFX9-FLUSH-NEXT: v_mad_mix_f32 v7, -v0, v5, v4 op_sel_hi:[1,0,0]
+; GFX9-FLUSH-NEXT: v_mad_mix_f32 v4, -v0, v6, v4 op_sel:[1,0,0] op_sel_hi:[1,0,0]
; GFX9-FLUSH-NEXT: v_mul_f32_e32 v1, v7, v1
+; GFX9-FLUSH-NEXT: v_mul_f32_e32 v3, v4, v3
; GFX9-FLUSH-NEXT: v_and_b32_e32 v1, 0xff800000, v1
-; GFX9-FLUSH-NEXT: v_mac_f32_e32 v4, v6, v3
-; GFX9-FLUSH-NEXT: v_add_f32_e32 v1, v1, v5
-; GFX9-FLUSH-NEXT: v_mad_mix_f32 v5, -v0, v4, 1.0 op_sel:[1,0,0] op_sel_hi:[1,0,1]
-; GFX9-FLUSH-NEXT: v_mul_f32_e32 v3, v5, v3
; GFX9-FLUSH-NEXT: v_and_b32_e32 v3, 0xff800000, v3
-; GFX9-FLUSH-NEXT: v_add_f32_e32 v3, v3, v4
+; GFX9-FLUSH-NEXT: v_add_f32_e32 v1, v1, v5
+; GFX9-FLUSH-NEXT: v_add_f32_e32 v3, v3, v6
; GFX9-FLUSH-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX9-FLUSH-NEXT: v_cvt_f16_f32_e32 v3, v3
; GFX9-FLUSH-NEXT: v_div_fixup_f16 v0, v1, v0, 1.0
@@ -3070,30 +2956,27 @@ define <2 x half> @v_rcp_v2f16_ulp25(<2 x half> %x) {
; GFX10-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-IEEE-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GFX10-IEEE-NEXT: v_cvt_f32_f16_e32 v2, v0
-; GFX10-IEEE-NEXT: v_cvt_f32_f16_e32 v6, 1.0
; GFX10-IEEE-NEXT: v_cvt_f32_f16_e32 v3, v1
; GFX10-IEEE-NEXT: v_rcp_f32_e32 v4, v2
; GFX10-IEEE-NEXT: v_rcp_f32_e32 v5, v3
-; GFX10-IEEE-NEXT: v_mul_f32_e32 v7, v6, v4
-; GFX10-IEEE-NEXT: v_mul_f32_e32 v8, v6, v5
-; GFX10-IEEE-NEXT: v_mul_f32_e64 v9, -v2, v7
-; GFX10-IEEE-NEXT: v_mul_f32_e64 v10, -v3, v8
-; GFX10-IEEE-NEXT: v_add_f32_e32 v9, v9, v6
-; GFX10-IEEE-NEXT: v_add_f32_e32 v10, v10, v6
-; GFX10-IEEE-NEXT: v_mul_f32_e32 v9, v9, v4
-; GFX10-IEEE-NEXT: v_mul_f32_e32 v10, v10, v5
-; GFX10-IEEE-NEXT: v_add_f32_e32 v7, v9, v7
-; GFX10-IEEE-NEXT: v_add_f32_e32 v8, v10, v8
-; GFX10-IEEE-NEXT: v_mul_f32_e64 v2, -v2, v7
-; GFX10-IEEE-NEXT: v_mul_f32_e64 v3, -v3, v8
-; GFX10-IEEE-NEXT: v_add_f32_e32 v2, v2, v6
-; GFX10-IEEE-NEXT: v_add_f32_e32 v3, v3, v6
+; GFX10-IEEE-NEXT: v_mul_f32_e64 v6, -v2, v4
+; GFX10-IEEE-NEXT: v_mul_f32_e64 v7, -v3, v5
+; GFX10-IEEE-NEXT: v_add_f32_e32 v6, 1.0, v6
+; GFX10-IEEE-NEXT: v_add_f32_e32 v7, 1.0, v7
+; GFX10-IEEE-NEXT: v_mul_f32_e32 v6, v6, v4
+; GFX10-IEEE-NEXT: v_mul_f32_e32 v7, v7, v5
+; GFX10-IEEE-NEXT: v_add_f32_e32 v6, v6, v4
+; GFX10-IEEE-NEXT: v_add_f32_e32 v7, v7, v5
+; GFX10-IEEE-NEXT: v_mul_f32_e64 v2, -v2, v6
+; GFX10-IEEE-NEXT: v_mul_f32_e64 v3, -v3, v7
+; GFX10-IEEE-NEXT: v_add_f32_e32 v2, 1.0, v2
+; GFX10-IEEE-NEXT: v_add_f32_e32 v3, 1.0, v3
; GFX10-IEEE-NEXT: v_mul_f32_e32 v2, v2, v4
; GFX10-IEEE-NEXT: v_mul_f32_e32 v3, v3, v5
; GFX10-IEEE-NEXT: v_and_b32_e32 v2, 0xff800000, v2
; GFX10-IEEE-NEXT: v_and_b32_e32 v3, 0xff800000, v3
-; GFX10-IEEE-NEXT: v_add_f32_e32 v2, v2, v7
-; GFX10-IEEE-NEXT: v_add_f32_e32 v3, v3, v8
+; GFX10-IEEE-NEXT: v_add_f32_e32 v2, v2, v6
+; GFX10-IEEE-NEXT: v_add_f32_e32 v3, v3, v7
; GFX10-IEEE-NEXT: v_cvt_f16_f32_e32 v2, v2
; GFX10-IEEE-NEXT: v_cvt_f16_f32_e32 v3, v3
; GFX10-IEEE-NEXT: v_div_fixup_f16 v0, v2, v0, 1.0
@@ -3106,24 +2989,21 @@ define <2 x half> @v_rcp_v2f16_ulp25(<2 x half> %x) {
; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-FLUSH-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GFX10-FLUSH-NEXT: v_cvt_f32_f16_e32 v2, v0
-; GFX10-FLUSH-NEXT: v_cvt_f32_f16_e32 v6, 1.0
; GFX10-FLUSH-NEXT: v_cvt_f32_f16_e32 v3, v1
; GFX10-FLUSH-NEXT: v_rcp_f32_e32 v4, v2
; GFX10-FLUSH-NEXT: v_rcp_f32_e32 v5, v3
-; GFX10-FLUSH-NEXT: v_mul_f32_e32 v7, v6, v4
-; GFX10-FLUSH-NEXT: v_mul_f32_e32 v8, v6, v5
-; GFX10-FLUSH-NEXT: v_mad_f32 v9, -v2, v7, v6
-; GFX10-FLUSH-NEXT: v_mad_f32 v10, -v3, v8, v6
-; GFX10-FLUSH-NEXT: v_mac_f32_e32 v7, v9, v4
-; GFX10-FLUSH-NEXT: v_mac_f32_e32 v8, v10, v5
-; GFX10-FLUSH-NEXT: v_mad_f32 v2, -v2, v7, v6
-; GFX10-FLUSH-NEXT: v_mad_f32 v3, -v3, v8, v6
+; GFX10-FLUSH-NEXT: v_mad_f32 v6, -v2, v4, 1.0
+; GFX10-FLUSH-NEXT: v_mad_f32 v7, -v3, v5, 1.0
+; GFX10-FLUSH-NEXT: v_mad_f32 v6, v6, v4, v4
+; GFX10-FLUSH-NEXT: v_mad_f32 v7, v7, v5, v5
+; GFX10-FLUSH-NEXT: v_mad_f32 v2, -v2, v6, 1.0
+; GFX10-FLUSH-NEXT: v_mad_f32 v3, -v3, v7, 1.0
; GFX10-FLUSH-NEXT: v_mul_f32_e32 v2, v2, v4
; GFX10-FLUSH-NEXT: v_mul_f32_e32 v3, v3, v5
; GFX10-FLUSH-NEXT: v_and_b32_e32 v2, 0xff800000, v2
; GFX10-FLUSH-NEXT: v_and_b32_e32 v3, 0xff800000, v3
-; GFX10-FLUSH-NEXT: v_add_f32_e32 v2, v2, v7
-; GFX10-FLUSH-NEXT: v_add_f32_e32 v3, v3, v8
+; GFX10-FLUSH-NEXT: v_add_f32_e32 v2, v2, v6
+; GFX10-FLUSH-NEXT: v_add_f32_e32 v3, v3, v7
; GFX10-FLUSH-NEXT: v_cvt_f16_f32_e32 v2, v2
; GFX10-FLUSH-NEXT: v_cvt_f16_f32_e32 v3, v3
; GFX10-FLUSH-NEXT: v_div_fixup_f16 v0, v2, v0, 1.0
@@ -3136,27 +3016,25 @@ define <2 x half> @v_rcp_v2f16_ulp25(<2 x half> %x) {
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GFX11-NEXT: v_cvt_f32_f16_e32 v2, v0
-; GFX11-NEXT: v_cvt_f32_f16_e32 v4, 1.0
+; GFX11-NEXT: v_mov_b32_e32 v4, 1.0
; GFX11-NEXT: v_cvt_f32_f16_e32 v3, v1
; GFX11-NEXT: v_rcp_f32_e32 v2, v2
; GFX11-NEXT: v_rcp_f32_e32 v3, v3
; GFX11-NEXT: s_waitcnt_depctr 0xfff
-; GFX11-NEXT: v_mul_f32_e32 v5, v4, v2
-; GFX11-NEXT: v_mul_f32_e32 v4, v4, v3
-; GFX11-NEXT: v_fma_mix_f32 v7, -v0, v4, 1.0 op_sel:[1,0,0] op_sel_hi:[1,0,1]
-; GFX11-NEXT: v_fmac_f32_e32 v4, v7, v3
-; GFX11-NEXT: v_fma_mix_f32 v6, -v0, v5, 1.0 op_sel_hi:[1,0,1]
-; GFX11-NEXT: v_fma_mix_f32 v7, -v0, v4, 1.0 op_sel:[1,0,0] op_sel_hi:[1,0,1]
-; GFX11-NEXT: v_fmac_f32_e32 v5, v6, v2
-; GFX11-NEXT: v_mul_f32_e32 v3, v7, v3
-; GFX11-NEXT: v_fma_mix_f32 v6, -v0, v5, 1.0 op_sel_hi:[1,0,1]
-; GFX11-NEXT: v_dual_mul_f32 v2, v6, v2 :: v_dual_and_b32 v3, 0xff800000, v3
-; GFX11-NEXT: v_dual_add_f32 v3, v3, v4 :: v_dual_and_b32 v2, 0xff800000, v2
-; GFX11-NEXT: v_cvt_f16_f32_e32 v3, v3
-; GFX11-NEXT: v_add_f32_e32 v2, v2, v5
-; GFX11-NEXT: v_div_fixup_f16 v1, v3, v1, 1.0
+; GFX11-NEXT: v_fma_mix_f32 v5, -v0, v2, v4 op_sel_hi:[1,0,0]
+; GFX11-NEXT: v_fma_mix_f32 v6, -v0, v3, v4 op_sel:[1,0,0] op_sel_hi:[1,0,0]
+; GFX11-NEXT: v_fma_f32 v5, v5, v2, v2
+; GFX11-NEXT: v_fma_f32 v6, v6, v3, v3
+; GFX11-NEXT: v_fma_mix_f32 v7, -v0, v5, v4 op_sel_hi:[1,0,0]
+; GFX11-NEXT: v_fma_mix_f32 v4, -v0, v6, v4 op_sel:[1,0,0] op_sel_hi:[1,0,0]
+; GFX11-NEXT: v_dual_mul_f32 v2, v7, v2 :: v_dual_mul_f32 v3, v4, v3
+; GFX11-NEXT: v_and_b32_e32 v2, 0xff800000, v2
+; GFX11-NEXT: v_dual_add_f32 v2, v2, v5 :: v_dual_and_b32 v3, 0xff800000, v3
+; GFX11-NEXT: v_add_f32_e32 v3, v3, v6
; GFX11-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GFX11-NEXT: v_cvt_f16_f32_e32 v3, v3
; GFX11-NEXT: v_div_fixup_f16 v0, v2, v0, 1.0
+; GFX11-NEXT: v_div_fixup_f16 v1, v3, v1, 1.0
; GFX11-NEXT: v_pack_b32_f16 v0, v0, v1
; GFX11-NEXT: s_setpc_b64 s[30:31]
%fdiv = fdiv <2 x half> <half 1.0, half 1.0>, %x
@@ -4033,40 +3911,38 @@ define amdgpu_ps i32 @s_fdiv_v2f16(i32 inreg %a.arg, i32 inreg %b.arg) {
define amdgpu_ps i16 @s_rcp_f16(i16 inreg %a.arg) {
; GFX6-IEEE-LABEL: s_rcp_f16:
; GFX6-IEEE: ; %bb.0:
-; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, 1.0
-; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, s0
-; GFX6-IEEE-NEXT: v_div_scale_f32 v2, s[0:1], v1, v1, v0
-; GFX6-IEEE-NEXT: v_rcp_f32_e32 v3, v2
-; GFX6-IEEE-NEXT: v_div_scale_f32 v4, vcc, v0, v1, v0
-; GFX6-IEEE-NEXT: v_fma_f32 v5, -v2, v3, 1.0
-; GFX6-IEEE-NEXT: v_fma_f32 v3, v5, v3, v3
-; GFX6-IEEE-NEXT: v_mul_f32_e32 v5, v4, v3
-; GFX6-IEEE-NEXT: v_fma_f32 v6, -v2, v5, v4
-; GFX6-IEEE-NEXT: v_fma_f32 v5, v6, v3, v5
-; GFX6-IEEE-NEXT: v_fma_f32 v2, -v2, v5, v4
-; GFX6-IEEE-NEXT: v_div_fmas_f32 v2, v2, v3, v5
-; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v2, v1, v0
+; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, s0
+; GFX6-IEEE-NEXT: v_div_scale_f32 v1, s[0:1], v0, v0, 1.0
+; GFX6-IEEE-NEXT: v_rcp_f32_e32 v2, v1
+; GFX6-IEEE-NEXT: v_div_scale_f32 v3, vcc, 1.0, v0, 1.0
+; GFX6-IEEE-NEXT: v_fma_f32 v4, -v1, v2, 1.0
+; GFX6-IEEE-NEXT: v_fma_f32 v2, v4, v2, v2
+; GFX6-IEEE-NEXT: v_mul_f32_e32 v4, v3, v2
+; GFX6-IEEE-NEXT: v_fma_f32 v5, -v1, v4, v3
+; GFX6-IEEE-NEXT: v_fma_f32 v4, v5, v2, v4
+; GFX6-IEEE-NEXT: v_fma_f32 v1, -v1, v4, v3
+; GFX6-IEEE-NEXT: v_div_fmas_f32 v1, v1, v2, v4
+; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v1, v0, 1.0
; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-IEEE-NEXT: v_readfirstlane_b32 s0, v0
; GFX6-IEEE-NEXT: ; return to shader part epilog
;
; GFX6-FLUSH-LABEL: s_rcp_f16:
; GFX6-FLUSH: ; %bb.0:
-; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, 1.0
-; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, s0
-; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[0:1], v1, v1, v0
-; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v3, v2
-; GFX6-FLUSH-NEXT: v_div_scale_f32 v4, vcc, v0, v1, v0
+; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, s0
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v1, s[0:1], v0, v0, 1.0
+; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v2, v1
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v3, vcc, 1.0, v0, 1.0
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
-; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v2, v3, 1.0
-; GFX6-FLUSH-NEXT: v_fma_f32 v3, v5, v3, v3
-; GFX6-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v3
-; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v5, v4
-; GFX6-FLUSH-NEXT: v_fma_f32 v5, v6, v3, v5
-; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v5, v4
+; GFX6-FLUSH-NEXT: v_fma_f32 v4, -v1, v2, 1.0
+; GFX6-FLUSH-NEXT: v_fma_f32 v2, v4, v2, v2
+; GFX6-FLUSH-NEXT: v_mul_f32_e32 v4, v3, v2
+; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v1, v4, v3
+; GFX6-FLUSH-NEXT: v_fma_f32 v4, v5, v2, v4
+; GFX6-FLUSH-NEXT: v_fma_f32 v1, -v1, v4, v3
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
-; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v5
-; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v2, v1, v0
+; GFX6-FLUSH-NEXT: v_div_fmas_f32 v1, v1, v2, v4
+; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v1, v0, 1.0
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-FLUSH-NEXT: v_readfirstlane_b32 s0, v0
@@ -4099,40 +3975,38 @@ define amdgpu_ps i16 @s_rcp_f16(i16 inreg %a.arg) {
define amdgpu_ps i16 @s_neg_rcp_f16(i16 inreg %a.arg) {
; GFX6-IEEE-LABEL: s_neg_rcp_f16:
; GFX6-IEEE: ; %bb.0:
-; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, -1.0
-; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, s0
-; GFX6-IEEE-NEXT: v_div_scale_f32 v2, s[0:1], v1, v1, v0
-; GFX6-IEEE-NEXT: v_rcp_f32_e32 v3, v2
-; GFX6-IEEE-NEXT: v_div_scale_f32 v4, vcc, v0, v1, v0
-; GFX6-IEEE-NEXT: v_fma_f32 v5, -v2, v3, 1.0
-; GFX6-IEEE-NEXT: v_fma_f32 v3, v5, v3, v3
-; GFX6-IEEE-NEXT: v_mul_f32_e32 v5, v4, v3
-; GFX6-IEEE-NEXT: v_fma_f32 v6, -v2, v5, v4
-; GFX6-IEEE-NEXT: v_fma_f32 v5, v6, v3, v5
-; GFX6-IEEE-NEXT: v_fma_f32 v2, -v2, v5, v4
-; GFX6-IEEE-NEXT: v_div_fmas_f32 v2, v2, v3, v5
-; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v2, v1, v0
+; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, s0
+; GFX6-IEEE-NEXT: v_div_scale_f32 v1, s[0:1], v0, v0, -1.0
+; GFX6-IEEE-NEXT: v_rcp_f32_e32 v2, v1
+; GFX6-IEEE-NEXT: v_div_scale_f32 v3, vcc, -1.0, v0, -1.0
+; GFX6-IEEE-NEXT: v_fma_f32 v4, -v1, v2, 1.0
+; GFX6-IEEE-NEXT: v_fma_f32 v2, v4, v2, v2
+; GFX6-IEEE-NEXT: v_mul_f32_e32 v4, v3, v2
+; GFX6-IEEE-NEXT: v_fma_f32 v5, -v1, v4, v3
+; GFX6-IEEE-NEXT: v_fma_f32 v4, v5, v2, v4
+; GFX6-IEEE-NEXT: v_fma_f32 v1, -v1, v4, v3
+; GFX6-IEEE-NEXT: v_div_fmas_f32 v1, v1, v2, v4
+; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v1, v0, -1.0
; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-IEEE-NEXT: v_readfirstlane_b32 s0, v0
; GFX6-IEEE-NEXT: ; return to shader part epilog
;
; GFX6-FLUSH-LABEL: s_neg_rcp_f16:
; GFX6-FLUSH: ; %bb.0:
-; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, -1.0
-; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, s0
-; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[0:1], v1, v1, v0
-; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v3, v2
-; GFX6-FLUSH-NEXT: v_div_scale_f32 v4, vcc, v0, v1, v0
+; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, s0
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v1, s[0:1], v0, v0, -1.0
+; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v2, v1
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v3, vcc, -1.0, v0, -1.0
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
-; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v2, v3, 1.0
-; GFX6-FLUSH-NEXT: v_fma_f32 v3, v5, v3, v3
-; GFX6-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v3
-; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v5, v4
-; GFX6-FLUSH-NEXT: v_fma_f32 v5, v6, v3, v5
-; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v5, v4
+; GFX6-FLUSH-NEXT: v_fma_f32 v4, -v1, v2, 1.0
+; GFX6-FLUSH-NEXT: v_fma_f32 v2, v4, v2, v2
+; GFX6-FLUSH-NEXT: v_mul_f32_e32 v4, v3, v2
+; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v1, v4, v3
+; GFX6-FLUSH-NEXT: v_fma_f32 v4, v5, v2, v4
+; GFX6-FLUSH-NEXT: v_fma_f32 v1, -v1, v4, v3
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
-; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v5
-; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v2, v1, v0
+; GFX6-FLUSH-NEXT: v_div_fmas_f32 v1, v1, v2, v4
+; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v1, v0, -1.0
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-FLUSH-NEXT: v_readfirstlane_b32 s0, v0
@@ -4166,21 +4040,20 @@ define amdgpu_ps i16 @s_rsq_f16(i16 inreg %a.arg) {
; GFX6-IEEE-LABEL: s_rsq_f16:
; GFX6-IEEE: ; %bb.0:
; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, s0
-; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, 1.0
; GFX6-IEEE-NEXT: v_sqrt_f32_e32 v0, v0
; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX6-IEEE-NEXT: v_div_scale_f32 v2, s[0:1], v0, v0, v1
-; GFX6-IEEE-NEXT: v_rcp_f32_e32 v3, v2
-; GFX6-IEEE-NEXT: v_div_scale_f32 v4, vcc, v1, v0, v1
-; GFX6-IEEE-NEXT: v_fma_f32 v5, -v2, v3, 1.0
-; GFX6-IEEE-NEXT: v_fma_f32 v3, v5, v3, v3
-; GFX6-IEEE-NEXT: v_mul_f32_e32 v5, v4, v3
-; GFX6-IEEE-NEXT: v_fma_f32 v6, -v2, v5, v4
-; GFX6-IEEE-NEXT: v_fma_f32 v5, v6, v3, v5
-; GFX6-IEEE-NEXT: v_fma_f32 v2, -v2, v5, v4
-; GFX6-IEEE-NEXT: v_div_fmas_f32 v2, v2, v3, v5
-; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v2, v0, v1
+; GFX6-IEEE-NEXT: v_div_scale_f32 v1, s[0:1], v0, v0, 1.0
+; GFX6-IEEE-NEXT: v_rcp_f32_e32 v2, v1
+; GFX6-IEEE-NEXT: v_div_scale_f32 v3, vcc, 1.0, v0, 1.0
+; GFX6-IEEE-NEXT: v_fma_f32 v4, -v1, v2, 1.0
+; GFX6-IEEE-NEXT: v_fma_f32 v2, v4, v2, v2
+; GFX6-IEEE-NEXT: v_mul_f32_e32 v4, v3, v2
+; GFX6-IEEE-NEXT: v_fma_f32 v5, -v1, v4, v3
+; GFX6-IEEE-NEXT: v_fma_f32 v4, v5, v2, v4
+; GFX6-IEEE-NEXT: v_fma_f32 v1, -v1, v4, v3
+; GFX6-IEEE-NEXT: v_div_fmas_f32 v1, v1, v2, v4
+; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v1, v0, 1.0
; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-IEEE-NEXT: v_readfirstlane_b32 s0, v0
; GFX6-IEEE-NEXT: ; return to shader part epilog
@@ -4188,24 +4061,23 @@ define amdgpu_ps i16 @s_rsq_f16(i16 inreg %a.arg) {
; GFX6-FLUSH-LABEL: s_rsq_f16:
; GFX6-FLUSH: ; %bb.0:
; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, s0
-; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, 1.0
; GFX6-FLUSH-NEXT: v_sqrt_f32_e32 v0, v0
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[0:1], v0, v0, v1
-; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v3, v2
-; GFX6-FLUSH-NEXT: v_div_scale_f32 v4, vcc, v1, v0, v1
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v1, s[0:1], v0, v0, 1.0
+; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v2, v1
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v3, vcc, 1.0, v0, 1.0
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
-; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v2, v3, 1.0
-; GFX6-FLUSH-NEXT: v_fma_f32 v3, v5, v3, v3
-; GFX6-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v3
-; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v5, v4
-; GFX6-FLUSH-NEXT: v_fma_f32 v5, v6, v3, v5
-; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v5, v4
+; GFX6-FLUSH-NEXT: v_fma_f32 v4, -v1, v2, 1.0
+; GFX6-FLUSH-NEXT: v_fma_f32 v2, v4, v2, v2
+; GFX6-FLUSH-NEXT: v_mul_f32_e32 v4, v3, v2
+; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v1, v4, v3
+; GFX6-FLUSH-NEXT: v_fma_f32 v4, v5, v2, v4
+; GFX6-FLUSH-NEXT: v_fma_f32 v1, -v1, v4, v3
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
-; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v5
-; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v2, v0, v1
+; GFX6-FLUSH-NEXT: v_div_fmas_f32 v1, v1, v2, v4
+; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v1, v0, 1.0
; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-FLUSH-NEXT: v_readfirstlane_b32 s0, v0
; GFX6-FLUSH-NEXT: ; return to shader part epilog
@@ -4241,36 +4113,35 @@ define amdgpu_ps i32 @s_rsq_v2f16(i32 inreg %a.arg) {
; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, s0
; GFX6-IEEE-NEXT: s_lshr_b32 s0, s0, 16
; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, s0
-; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v2, -1.0
; GFX6-IEEE-NEXT: v_sqrt_f32_e32 v0, v0
; GFX6-IEEE-NEXT: v_sqrt_f32_e32 v1, v1
; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, v1
-; GFX6-IEEE-NEXT: v_div_scale_f32 v3, s[0:1], v0, v0, v2
-; GFX6-IEEE-NEXT: v_rcp_f32_e32 v5, v3
-; GFX6-IEEE-NEXT: v_div_scale_f32 v4, vcc, v2, v0, v2
-; GFX6-IEEE-NEXT: v_div_scale_f32 v6, s[0:1], v1, v1, v2
-; GFX6-IEEE-NEXT: v_fma_f32 v9, -v3, v5, 1.0
-; GFX6-IEEE-NEXT: v_fma_f32 v5, v9, v5, v5
-; GFX6-IEEE-NEXT: v_mul_f32_e32 v9, v4, v5
-; GFX6-IEEE-NEXT: v_rcp_f32_e32 v8, v6
-; GFX6-IEEE-NEXT: v_fma_f32 v10, -v3, v9, v4
-; GFX6-IEEE-NEXT: v_fma_f32 v9, v10, v5, v9
-; GFX6-IEEE-NEXT: v_fma_f32 v3, -v3, v9, v4
-; GFX6-IEEE-NEXT: v_div_fmas_f32 v3, v3, v5, v9
-; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v3, v0, v2
-; GFX6-IEEE-NEXT: v_fma_f32 v3, -v6, v8, 1.0
-; GFX6-IEEE-NEXT: v_div_scale_f32 v7, s[0:1], v2, v1, v2
-; GFX6-IEEE-NEXT: v_fma_f32 v3, v3, v8, v8
-; GFX6-IEEE-NEXT: v_mul_f32_e32 v4, v7, v3
-; GFX6-IEEE-NEXT: v_fma_f32 v5, -v6, v4, v7
-; GFX6-IEEE-NEXT: v_fma_f32 v4, v5, v3, v4
-; GFX6-IEEE-NEXT: v_fma_f32 v5, -v6, v4, v7
+; GFX6-IEEE-NEXT: v_div_scale_f32 v2, s[0:1], v0, v0, -1.0
+; GFX6-IEEE-NEXT: v_rcp_f32_e32 v4, v2
+; GFX6-IEEE-NEXT: v_div_scale_f32 v3, vcc, -1.0, v0, -1.0
+; GFX6-IEEE-NEXT: v_div_scale_f32 v5, s[0:1], v1, v1, -1.0
+; GFX6-IEEE-NEXT: v_fma_f32 v8, -v2, v4, 1.0
+; GFX6-IEEE-NEXT: v_fma_f32 v4, v8, v4, v4
+; GFX6-IEEE-NEXT: v_rcp_f32_e32 v6, v5
+; GFX6-IEEE-NEXT: v_mul_f32_e32 v9, v3, v4
+; GFX6-IEEE-NEXT: v_fma_f32 v10, -v2, v9, v3
+; GFX6-IEEE-NEXT: v_fma_f32 v9, v10, v4, v9
+; GFX6-IEEE-NEXT: v_fma_f32 v2, -v2, v9, v3
+; GFX6-IEEE-NEXT: v_fma_f32 v8, -v5, v6, 1.0
+; GFX6-IEEE-NEXT: v_div_fmas_f32 v2, v2, v4, v9
+; GFX6-IEEE-NEXT: v_div_scale_f32 v7, s[0:1], -1.0, v1, -1.0
+; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v2, v0, -1.0
+; GFX6-IEEE-NEXT: v_fma_f32 v2, v8, v6, v6
+; GFX6-IEEE-NEXT: v_mul_f32_e32 v3, v7, v2
+; GFX6-IEEE-NEXT: v_fma_f32 v4, -v5, v3, v7
+; GFX6-IEEE-NEXT: v_fma_f32 v3, v4, v2, v3
+; GFX6-IEEE-NEXT: v_fma_f32 v4, -v5, v3, v7
; GFX6-IEEE-NEXT: s_mov_b64 vcc, s[0:1]
-; GFX6-IEEE-NEXT: v_div_fmas_f32 v3, v5, v3, v4
-; GFX6-IEEE-NEXT: v_div_fixup_f32 v1, v3, v1, v2
+; GFX6-IEEE-NEXT: v_div_fmas_f32 v2, v4, v2, v3
+; GFX6-IEEE-NEXT: v_div_fixup_f32 v1, v2, v1, -1.0
; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-IEEE-NEXT: v_lshlrev_b32_e32 v1, 16, v1
@@ -4283,42 +4154,40 @@ define amdgpu_ps i32 @s_rsq_v2f16(i32 inreg %a.arg) {
; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, s0
; GFX6-FLUSH-NEXT: s_lshr_b32 s0, s0, 16
; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, s0
-; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v2, -1.0
; GFX6-FLUSH-NEXT: v_sqrt_f32_e32 v0, v0
; GFX6-FLUSH-NEXT: v_sqrt_f32_e32 v1, v1
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX6-FLUSH-NEXT: v_div_scale_f32 v3, s[0:1], v0, v0, v2
-; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v4, v3
-; GFX6-FLUSH-NEXT: v_div_scale_f32 v5, vcc, v2, v0, v2
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[0:1], v0, v0, -1.0
+; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v3, v2
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v4, vcc, -1.0, v0, -1.0
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
-; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v3, v4, 1.0
-; GFX6-FLUSH-NEXT: v_fma_f32 v4, v6, v4, v4
-; GFX6-FLUSH-NEXT: v_mul_f32_e32 v6, v5, v4
-; GFX6-FLUSH-NEXT: v_fma_f32 v7, -v3, v6, v5
-; GFX6-FLUSH-NEXT: v_fma_f32 v6, v7, v4, v6
-; GFX6-FLUSH-NEXT: v_fma_f32 v3, -v3, v6, v5
-; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
-; GFX6-FLUSH-NEXT: v_div_fmas_f32 v3, v3, v4, v6
-; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v4, -1.0
+; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v2, v3, 1.0
+; GFX6-FLUSH-NEXT: v_fma_f32 v3, v5, v3, v3
+; GFX6-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v3
+; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v5, v4
+; GFX6-FLUSH-NEXT: v_fma_f32 v5, v6, v3, v5
+; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v5, v4
+; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, v1
-; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v3, v0, v2
+; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v5
+; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v2, v0, -1.0
; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[0:1], v1, v1, v4
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[0:1], v1, v1, -1.0
; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v3, v2
-; GFX6-FLUSH-NEXT: v_div_scale_f32 v5, vcc, v4, v1, v4
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v4, vcc, -1.0, v1, -1.0
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
-; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v3, 1.0
-; GFX6-FLUSH-NEXT: v_fma_f32 v3, v6, v3, v3
-; GFX6-FLUSH-NEXT: v_mul_f32_e32 v6, v5, v3
-; GFX6-FLUSH-NEXT: v_fma_f32 v7, -v2, v6, v5
-; GFX6-FLUSH-NEXT: v_fma_f32 v6, v7, v3, v6
-; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v6, v5
+; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v2, v3, 1.0
+; GFX6-FLUSH-NEXT: v_fma_f32 v3, v5, v3, v3
+; GFX6-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v3
+; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v5, v4
+; GFX6-FLUSH-NEXT: v_fma_f32 v5, v6, v3, v5
+; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v5, v4
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
-; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v6
-; GFX6-FLUSH-NEXT: v_div_fixup_f32 v1, v2, v1, v4
+; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v5
+; GFX6-FLUSH-NEXT: v_div_fixup_f32 v1, v2, v1, -1.0
; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX6-FLUSH-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX6-FLUSH-NEXT: v_or_b32_e32 v0, v0, v1
@@ -4330,31 +4199,28 @@ define amdgpu_ps i32 @s_rsq_v2f16(i32 inreg %a.arg) {
; GFX8-IEEE-NEXT: v_sqrt_f16_e32 v0, s0
; GFX8-IEEE-NEXT: s_lshr_b32 s0, s0, 16
; GFX8-IEEE-NEXT: v_sqrt_f16_e32 v1, s0
-; GFX8-IEEE-NEXT: v_cvt_f32_f16_e32 v4, -1.0
; GFX8-IEEE-NEXT: v_cvt_f32_f16_e32 v2, v0
; GFX8-IEEE-NEXT: v_cvt_f32_f16_e32 v3, v1
-; GFX8-IEEE-NEXT: v_rcp_f32_e32 v5, v2
-; GFX8-IEEE-NEXT: v_rcp_f32_e32 v6, v3
-; GFX8-IEEE-NEXT: v_mul_f32_e32 v7, v4, v5
-; GFX8-IEEE-NEXT: v_mul_f32_e64 v8, -v2, v7
-; GFX8-IEEE-NEXT: v_add_f32_e32 v8, v8, v4
-; GFX8-IEEE-NEXT: v_mul_f32_e32 v9, v4, v6
-; GFX8-IEEE-NEXT: v_mul_f32_e32 v8, v8, v5
-; GFX8-IEEE-NEXT: v_mul_f32_e64 v10, -v3, v9
-; GFX8-IEEE-NEXT: v_add_f32_e32 v7, v8, v7
-; GFX8-IEEE-NEXT: v_add_f32_e32 v10, v10, v4
-; GFX8-IEEE-NEXT: v_mul_f32_e64 v2, -v2, v7
-; GFX8-IEEE-NEXT: v_mul_f32_e32 v8, v10, v6
-; GFX8-IEEE-NEXT: v_add_f32_e32 v2, v2, v4
-; GFX8-IEEE-NEXT: v_mul_f32_e32 v2, v2, v5
-; GFX8-IEEE-NEXT: v_add_f32_e32 v5, v8, v9
-; GFX8-IEEE-NEXT: v_mul_f32_e64 v3, -v3, v5
-; GFX8-IEEE-NEXT: v_add_f32_e32 v3, v3, v4
-; GFX8-IEEE-NEXT: v_mul_f32_e32 v3, v3, v6
+; GFX8-IEEE-NEXT: v_rcp_f32_e32 v4, v2
+; GFX8-IEEE-NEXT: v_rcp_f32_e32 v5, v3
+; GFX8-IEEE-NEXT: v_mul_f32_e32 v6, v2, v4
+; GFX8-IEEE-NEXT: v_add_f32_e32 v6, -1.0, v6
+; GFX8-IEEE-NEXT: v_mul_f32_e32 v7, v3, v5
+; GFX8-IEEE-NEXT: v_add_f32_e32 v7, -1.0, v7
+; GFX8-IEEE-NEXT: v_mul_f32_e32 v7, v7, v5
+; GFX8-IEEE-NEXT: v_mul_f32_e32 v6, v6, v4
+; GFX8-IEEE-NEXT: v_sub_f32_e32 v7, v7, v5
+; GFX8-IEEE-NEXT: v_sub_f32_e32 v6, v6, v4
+; GFX8-IEEE-NEXT: v_mul_f32_e64 v3, -v3, v7
+; GFX8-IEEE-NEXT: v_mul_f32_e64 v2, -v2, v6
+; GFX8-IEEE-NEXT: v_add_f32_e32 v3, -1.0, v3
+; GFX8-IEEE-NEXT: v_add_f32_e32 v2, -1.0, v2
+; GFX8-IEEE-NEXT: v_mul_f32_e32 v3, v3, v5
+; GFX8-IEEE-NEXT: v_mul_f32_e32 v2, v2, v4
; GFX8-IEEE-NEXT: v_and_b32_e32 v3, 0xff800000, v3
; GFX8-IEEE-NEXT: v_and_b32_e32 v2, 0xff800000, v2
-; GFX8-IEEE-NEXT: v_add_f32_e32 v3, v3, v5
-; GFX8-IEEE-NEXT: v_add_f32_e32 v2, v2, v7
+; GFX8-IEEE-NEXT: v_add_f32_e32 v3, v3, v7
+; GFX8-IEEE-NEXT: v_add_f32_e32 v2, v2, v6
; GFX8-IEEE-NEXT: v_cvt_f16_f32_e32 v3, v3
; GFX8-IEEE-NEXT: v_cvt_f16_f32_e32 v2, v2
; GFX8-IEEE-NEXT: v_div_fixup_f16 v1, v3, v1, -1.0
@@ -4369,25 +4235,22 @@ define amdgpu_ps i32 @s_rsq_v2f16(i32 inreg %a.arg) {
; GFX8-FLUSH-NEXT: v_sqrt_f16_e32 v0, s0
; GFX8-FLUSH-NEXT: s_lshr_b32 s0, s0, 16
; GFX8-FLUSH-NEXT: v_sqrt_f16_e32 v1, s0
-; GFX8-FLUSH-NEXT: v_cvt_f32_f16_e32 v4, -1.0
; GFX8-FLUSH-NEXT: v_cvt_f32_f16_e32 v2, v0
; GFX8-FLUSH-NEXT: v_cvt_f32_f16_e32 v3, v1
-; GFX8-FLUSH-NEXT: v_rcp_f32_e32 v5, v2
-; GFX8-FLUSH-NEXT: v_rcp_f32_e32 v6, v3
-; GFX8-FLUSH-NEXT: v_mul_f32_e32 v7, v4, v5
-; GFX8-FLUSH-NEXT: v_mad_f32 v8, -v2, v7, v4
-; GFX8-FLUSH-NEXT: v_mul_f32_e32 v9, v4, v6
-; GFX8-FLUSH-NEXT: v_mac_f32_e32 v7, v8, v5
-; GFX8-FLUSH-NEXT: v_mad_f32 v8, -v3, v9, v4
-; GFX8-FLUSH-NEXT: v_mac_f32_e32 v9, v8, v6
-; GFX8-FLUSH-NEXT: v_mad_f32 v3, -v3, v9, v4
-; GFX8-FLUSH-NEXT: v_mad_f32 v2, -v2, v7, v4
-; GFX8-FLUSH-NEXT: v_mul_f32_e32 v3, v3, v6
-; GFX8-FLUSH-NEXT: v_mul_f32_e32 v2, v2, v5
+; GFX8-FLUSH-NEXT: v_rcp_f32_e32 v4, v2
+; GFX8-FLUSH-NEXT: v_rcp_f32_e32 v5, v3
+; GFX8-FLUSH-NEXT: v_mad_f32 v6, v2, v4, -1.0
+; GFX8-FLUSH-NEXT: v_mad_f32 v6, v6, v4, -v4
+; GFX8-FLUSH-NEXT: v_mad_f32 v7, v3, v5, -1.0
+; GFX8-FLUSH-NEXT: v_mad_f32 v7, v7, v5, -v5
+; GFX8-FLUSH-NEXT: v_mad_f32 v3, -v3, v7, -1.0
+; GFX8-FLUSH-NEXT: v_mad_f32 v2, -v2, v6, -1.0
+; GFX8-FLUSH-NEXT: v_mul_f32_e32 v3, v3, v5
+; GFX8-FLUSH-NEXT: v_mul_f32_e32 v2, v2, v4
; GFX8-FLUSH-NEXT: v_and_b32_e32 v3, 0xff800000, v3
; GFX8-FLUSH-NEXT: v_and_b32_e32 v2, 0xff800000, v2
-; GFX8-FLUSH-NEXT: v_add_f32_e32 v3, v3, v9
-; GFX8-FLUSH-NEXT: v_add_f32_e32 v2, v2, v7
+; GFX8-FLUSH-NEXT: v_add_f32_e32 v3, v3, v7
+; GFX8-FLUSH-NEXT: v_add_f32_e32 v2, v2, v6
; GFX8-FLUSH-NEXT: v_cvt_f16_f32_e32 v3, v3
; GFX8-FLUSH-NEXT: v_cvt_f16_f32_e32 v2, v2
; GFX8-FLUSH-NEXT: v_div_fixup_f16 v1, v3, v1, -1.0
@@ -4402,25 +4265,22 @@ define amdgpu_ps i32 @s_rsq_v2f16(i32 inreg %a.arg) {
; GFX9-IEEE-NEXT: v_sqrt_f16_e32 v0, s0
; GFX9-IEEE-NEXT: s_lshr_b32 s0, s0, 16
; GFX9-IEEE-NEXT: v_sqrt_f16_e32 v1, s0
-; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v4, -1.0
; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v2, v0
; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v3, v1
-; GFX9-IEEE-NEXT: v_rcp_f32_e32 v5, v2
-; GFX9-IEEE-NEXT: v_rcp_f32_e32 v6, v3
-; GFX9-IEEE-NEXT: v_mul_f32_e32 v7, v4, v5
-; GFX9-IEEE-NEXT: v_fma_f32 v8, -v2, v7, v4
-; GFX9-IEEE-NEXT: v_mul_f32_e32 v9, v4, v6
-; GFX9-IEEE-NEXT: v_fma_f32 v7, v8, v5, v7
-; GFX9-IEEE-NEXT: v_fma_f32 v8, -v3, v9, v4
-; GFX9-IEEE-NEXT: v_fma_f32 v8, v8, v6, v9
-; GFX9-IEEE-NEXT: v_fma_f32 v2, -v2, v7, v4
-; GFX9-IEEE-NEXT: v_fma_f32 v3, -v3, v8, v4
-; GFX9-IEEE-NEXT: v_mul_f32_e32 v2, v2, v5
-; GFX9-IEEE-NEXT: v_mul_f32_e32 v3, v3, v6
+; GFX9-IEEE-NEXT: v_rcp_f32_e32 v4, v2
+; GFX9-IEEE-NEXT: v_rcp_f32_e32 v5, v3
+; GFX9-IEEE-NEXT: v_fma_f32 v6, v2, v4, -1.0
+; GFX9-IEEE-NEXT: v_fma_f32 v6, v6, v4, -v4
+; GFX9-IEEE-NEXT: v_fma_f32 v7, v3, v5, -1.0
+; GFX9-IEEE-NEXT: v_fma_f32 v7, v7, v5, -v5
+; GFX9-IEEE-NEXT: v_fma_f32 v2, -v2, v6, -1.0
+; GFX9-IEEE-NEXT: v_fma_f32 v3, -v3, v7, -1.0
+; GFX9-IEEE-NEXT: v_mul_f32_e32 v2, v2, v4
+; GFX9-IEEE-NEXT: v_mul_f32_e32 v3, v3, v5
; GFX9-IEEE-NEXT: v_and_b32_e32 v2, 0xff800000, v2
; GFX9-IEEE-NEXT: v_and_b32_e32 v3, 0xff800000, v3
-; GFX9-IEEE-NEXT: v_add_f32_e32 v2, v2, v7
-; GFX9-IEEE-NEXT: v_add_f32_e32 v3, v3, v8
+; GFX9-IEEE-NEXT: v_add_f32_e32 v2, v2, v6
+; GFX9-IEEE-NEXT: v_add_f32_e32 v3, v3, v7
; GFX9-IEEE-NEXT: v_cvt_f16_f32_e32 v2, v2
; GFX9-IEEE-NEXT: v_cvt_f16_f32_e32 v3, v3
; GFX9-IEEE-NEXT: v_div_fixup_f16 v0, v2, v0, -1.0
@@ -4434,25 +4294,23 @@ define amdgpu_ps i32 @s_rsq_v2f16(i32 inreg %a.arg) {
; GFX9-FLUSH-NEXT: v_sqrt_f16_e32 v0, s0
; GFX9-FLUSH-NEXT: s_lshr_b32 s0, s0, 16
; GFX9-FLUSH-NEXT: v_sqrt_f16_e32 v1, s0
-; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v4, -1.0
+; GFX9-FLUSH-NEXT: v_mov_b32_e32 v4, -1.0
; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v2, v0
; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v3, v1
; GFX9-FLUSH-NEXT: v_rcp_f32_e32 v2, v2
; GFX9-FLUSH-NEXT: v_rcp_f32_e32 v3, v3
-; GFX9-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v2
-; GFX9-FLUSH-NEXT: v_mad_mix_f32 v6, -v0, v5, -1.0 op_sel_hi:[1,0,1]
-; GFX9-FLUSH-NEXT: v_mul_f32_e32 v4, v4, v3
-; GFX9-FLUSH-NEXT: v_mac_f32_e32 v5, v6, v2
-; GFX9-FLUSH-NEXT: v_mad_mix_f32 v6, -v1, v4, -1.0 op_sel_hi:[1,0,1]
-; GFX9-FLUSH-NEXT: v_mac_f32_e32 v4, v6, v3
-; GFX9-FLUSH-NEXT: v_mad_mix_f32 v7, -v0, v5, -1.0 op_sel_hi:[1,0,1]
-; GFX9-FLUSH-NEXT: v_mad_mix_f32 v6, -v1, v4, -1.0 op_sel_hi:[1,0,1]
+; GFX9-FLUSH-NEXT: v_mad_mix_f32 v5, v0, v2, v4 op_sel_hi:[1,0,0]
+; GFX9-FLUSH-NEXT: v_mad_f32 v5, v5, v2, -v2
+; GFX9-FLUSH-NEXT: v_mad_mix_f32 v6, v1, v3, v4 op_sel_hi:[1,0,0]
+; GFX9-FLUSH-NEXT: v_mad_f32 v6, v6, v3, -v3
+; GFX9-FLUSH-NEXT: v_mad_mix_f32 v7, -v0, v5, v4 op_sel_hi:[1,0,0]
+; GFX9-FLUSH-NEXT: v_mad_mix_f32 v4, -v1, v6, v4 op_sel_hi:[1,0,0]
; GFX9-FLUSH-NEXT: v_mul_f32_e32 v2, v7, v2
-; GFX9-FLUSH-NEXT: v_mul_f32_e32 v3, v6, v3
+; GFX9-FLUSH-NEXT: v_mul_f32_e32 v3, v4, v3
; GFX9-FLUSH-NEXT: v_and_b32_e32 v2, 0xff800000, v2
; GFX9-FLUSH-NEXT: v_and_b32_e32 v3, 0xff800000, v3
; GFX9-FLUSH-NEXT: v_add_f32_e32 v2, v2, v5
-; GFX9-FLUSH-NEXT: v_add_f32_e32 v3, v3, v4
+; GFX9-FLUSH-NEXT: v_add_f32_e32 v3, v3, v6
; GFX9-FLUSH-NEXT: v_cvt_f16_f32_e32 v2, v2
; GFX9-FLUSH-NEXT: v_cvt_f16_f32_e32 v3, v3
; GFX9-FLUSH-NEXT: v_div_fixup_f16 v0, v2, v0, -1.0
@@ -4466,25 +4324,23 @@ define amdgpu_ps i32 @s_rsq_v2f16(i32 inreg %a.arg) {
; GFX10-IEEE-NEXT: s_lshr_b32 s1, s0, 16
; GFX10-IEEE-NEXT: v_sqrt_f16_e32 v0, s0
; GFX10-IEEE-NEXT: v_sqrt_f16_e32 v1, s1
-; GFX10-IEEE-NEXT: v_cvt_f32_f16_e32 v4, -1.0
+; GFX10-IEEE-NEXT: v_mov_b32_e32 v4, -1.0
; GFX10-IEEE-NEXT: v_cvt_f32_f16_e32 v2, v0
; GFX10-IEEE-NEXT: v_cvt_f32_f16_e32 v3, v1
; GFX10-IEEE-NEXT: v_rcp_f32_e32 v2, v2
; GFX10-IEEE-NEXT: v_rcp_f32_e32 v3, v3
-; GFX10-IEEE-NEXT: v_mul_f32_e32 v5, v4, v2
-; GFX10-IEEE-NEXT: v_mul_f32_e32 v4, v4, v3
-; GFX10-IEEE-NEXT: v_fma_mix_f32 v6, -v0, v5, -1.0 op_sel_hi:[1,0,1]
-; GFX10-IEEE-NEXT: v_fma_mix_f32 v7, -v1, v4, -1.0 op_sel_hi:[1,0,1]
-; GFX10-IEEE-NEXT: v_fmac_f32_e32 v5, v6, v2
-; GFX10-IEEE-NEXT: v_fmac_f32_e32 v4, v7, v3
-; GFX10-IEEE-NEXT: v_fma_mix_f32 v6, -v0, v5, -1.0 op_sel_hi:[1,0,1]
-; GFX10-IEEE-NEXT: v_fma_mix_f32 v7, -v1, v4, -1.0 op_sel_hi:[1,0,1]
-; GFX10-IEEE-NEXT: v_mul_f32_e32 v2, v6, v2
-; GFX10-IEEE-NEXT: v_mul_f32_e32 v3, v7, v3
+; GFX10-IEEE-NEXT: v_fma_mix_f32 v5, v0, v2, v4 op_sel_hi:[1,0,0]
+; GFX10-IEEE-NEXT: v_fma_mix_f32 v6, v1, v3, v4 op_sel_hi:[1,0,0]
+; GFX10-IEEE-NEXT: v_fma_f32 v5, v5, v2, -v2
+; GFX10-IEEE-NEXT: v_fma_f32 v6, v6, v3, -v3
+; GFX10-IEEE-NEXT: v_fma_mix_f32 v7, -v0, v5, v4 op_sel_hi:[1,0,0]
+; GFX10-IEEE-NEXT: v_fma_mix_f32 v4, -v1, v6, v4 op_sel_hi:[1,0,0]
+; GFX10-IEEE-NEXT: v_mul_f32_e32 v2, v7, v2
+; GFX10-IEEE-NEXT: v_mul_f32_e32 v3, v4, v3
; GFX10-IEEE-NEXT: v_and_b32_e32 v2, 0xff800000, v2
; GFX10-IEEE-NEXT: v_and_b32_e32 v3, 0xff800000, v3
; GFX10-IEEE-NEXT: v_add_f32_e32 v2, v2, v5
-; GFX10-IEEE-NEXT: v_add_f32_e32 v3, v3, v4
+; GFX10-IEEE-NEXT: v_add_f32_e32 v3, v3, v6
; GFX10-IEEE-NEXT: v_cvt_f16_f32_e32 v2, v2
; GFX10-IEEE-NEXT: v_cvt_f16_f32_e32 v3, v3
; GFX10-IEEE-NEXT: v_div_fixup_f16 v0, v2, v0, -1.0
@@ -4498,25 +4354,22 @@ define amdgpu_ps i32 @s_rsq_v2f16(i32 inreg %a.arg) {
; GFX10-FLUSH-NEXT: s_lshr_b32 s1, s0, 16
; GFX10-FLUSH-NEXT: v_sqrt_f16_e32 v0, s0
; GFX10-FLUSH-NEXT: v_sqrt_f16_e32 v1, s1
-; GFX10-FLUSH-NEXT: v_cvt_f32_f16_e32 v6, -1.0
; GFX10-FLUSH-NEXT: v_cvt_f32_f16_e32 v2, v0
; GFX10-FLUSH-NEXT: v_cvt_f32_f16_e32 v3, v1
; GFX10-FLUSH-NEXT: v_rcp_f32_e32 v4, v2
; GFX10-FLUSH-NEXT: v_rcp_f32_e32 v5, v3
-; GFX10-FLUSH-NEXT: v_mul_f32_e32 v7, v6, v4
-; GFX10-FLUSH-NEXT: v_mul_f32_e32 v8, v6, v5
-; GFX10-FLUSH-NEXT: v_mad_f32 v9, -v2, v7, v6
-; GFX10-FLUSH-NEXT: v_mad_f32 v10, -v3, v8, v6
-; GFX10-FLUSH-NEXT: v_mac_f32_e32 v7, v9, v4
-; GFX10-FLUSH-NEXT: v_mac_f32_e32 v8, v10, v5
-; GFX10-FLUSH-NEXT: v_mad_f32 v2, -v2, v7, v6
-; GFX10-FLUSH-NEXT: v_mad_f32 v3, -v3, v8, v6
+; GFX10-FLUSH-NEXT: v_mad_f32 v6, v2, v4, -1.0
+; GFX10-FLUSH-NEXT: v_mad_f32 v7, v3, v5, -1.0
+; GFX10-FLUSH-NEXT: v_mad_f32 v6, v6, v4, -v4
+; GFX10-FLUSH-NEXT: v_mad_f32 v7, v7, v5, -v5
+; GFX10-FLUSH-NEXT: v_mad_f32 v2, -v2, v6, -1.0
+; GFX10-FLUSH-NEXT: v_mad_f32 v3, -v3, v7, -1.0
; GFX10-FLUSH-NEXT: v_mul_f32_e32 v2, v2, v4
; GFX10-FLUSH-NEXT: v_mul_f32_e32 v3, v3, v5
; GFX10-FLUSH-NEXT: v_and_b32_e32 v2, 0xff800000, v2
; GFX10-FLUSH-NEXT: v_and_b32_e32 v3, 0xff800000, v3
-; GFX10-FLUSH-NEXT: v_add_f32_e32 v2, v2, v7
-; GFX10-FLUSH-NEXT: v_add_f32_e32 v3, v3, v8
+; GFX10-FLUSH-NEXT: v_add_f32_e32 v2, v2, v6
+; GFX10-FLUSH-NEXT: v_add_f32_e32 v3, v3, v7
; GFX10-FLUSH-NEXT: v_cvt_f16_f32_e32 v2, v2
; GFX10-FLUSH-NEXT: v_cvt_f16_f32_e32 v3, v3
; GFX10-FLUSH-NEXT: v_div_fixup_f16 v0, v2, v0, -1.0
@@ -4530,29 +4383,27 @@ define amdgpu_ps i32 @s_rsq_v2f16(i32 inreg %a.arg) {
; GFX11-NEXT: s_lshr_b32 s1, s0, 16
; GFX11-NEXT: v_sqrt_f16_e32 v0, s0
; GFX11-NEXT: v_sqrt_f16_e32 v1, s1
-; GFX11-NEXT: v_cvt_f32_f16_e32 v4, -1.0
+; GFX11-NEXT: v_mov_b32_e32 v4, -1.0
; GFX11-NEXT: s_waitcnt_depctr 0xfff
; GFX11-NEXT: v_cvt_f32_f16_e32 v2, v0
; GFX11-NEXT: v_cvt_f32_f16_e32 v3, v1
; GFX11-NEXT: v_rcp_f32_e32 v2, v2
; GFX11-NEXT: v_rcp_f32_e32 v3, v3
; GFX11-NEXT: s_waitcnt_depctr 0xfff
-; GFX11-NEXT: v_mul_f32_e32 v5, v4, v2
-; GFX11-NEXT: v_mul_f32_e32 v4, v4, v3
-; GFX11-NEXT: v_fma_mix_f32 v7, -v1, v4, -1.0 op_sel_hi:[1,0,1]
-; GFX11-NEXT: v_fmac_f32_e32 v4, v7, v3
-; GFX11-NEXT: v_fma_mix_f32 v6, -v0, v5, -1.0 op_sel_hi:[1,0,1]
-; GFX11-NEXT: v_fma_mix_f32 v7, -v1, v4, -1.0 op_sel_hi:[1,0,1]
-; GFX11-NEXT: v_fmac_f32_e32 v5, v6, v2
-; GFX11-NEXT: v_mul_f32_e32 v3, v7, v3
-; GFX11-NEXT: v_fma_mix_f32 v6, -v0, v5, -1.0 op_sel_hi:[1,0,1]
-; GFX11-NEXT: v_dual_mul_f32 v2, v6, v2 :: v_dual_and_b32 v3, 0xff800000, v3
-; GFX11-NEXT: v_dual_add_f32 v3, v3, v4 :: v_dual_and_b32 v2, 0xff800000, v2
-; GFX11-NEXT: v_cvt_f16_f32_e32 v3, v3
-; GFX11-NEXT: v_add_f32_e32 v2, v2, v5
-; GFX11-NEXT: v_div_fixup_f16 v1, v3, v1, -1.0
+; GFX11-NEXT: v_fma_mix_f32 v5, v0, v2, v4 op_sel_hi:[1,0,0]
+; GFX11-NEXT: v_fma_mix_f32 v6, v1, v3, v4 op_sel_hi:[1,0,0]
+; GFX11-NEXT: v_fma_f32 v5, v5, v2, -v2
+; GFX11-NEXT: v_fma_f32 v6, v6, v3, -v3
+; GFX11-NEXT: v_fma_mix_f32 v7, -v0, v5, v4 op_sel_hi:[1,0,0]
+; GFX11-NEXT: v_fma_mix_f32 v4, -v1, v6, v4 op_sel_hi:[1,0,0]
+; GFX11-NEXT: v_dual_mul_f32 v2, v7, v2 :: v_dual_mul_f32 v3, v4, v3
+; GFX11-NEXT: v_and_b32_e32 v2, 0xff800000, v2
+; GFX11-NEXT: v_dual_add_f32 v2, v2, v5 :: v_dual_and_b32 v3, 0xff800000, v3
+; GFX11-NEXT: v_add_f32_e32 v3, v3, v6
; GFX11-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GFX11-NEXT: v_cvt_f16_f32_e32 v3, v3
; GFX11-NEXT: v_div_fixup_f16 v0, v2, v0, -1.0
+; GFX11-NEXT: v_div_fixup_f16 v1, v3, v1, -1.0
; GFX11-NEXT: v_pack_b32_f16 v0, v0, v1
; GFX11-NEXT: v_readfirstlane_b32 s0, v0
; GFX11-NEXT: ; return to shader part epilog
@@ -4568,21 +4419,20 @@ define half @v_rsq_f16(half %a) {
; GFX6-IEEE: ; %bb.0:
; GFX6-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, 1.0
; GFX6-IEEE-NEXT: v_sqrt_f32_e32 v0, v0
; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX6-IEEE-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, v1
-; GFX6-IEEE-NEXT: v_rcp_f32_e32 v3, v2
-; GFX6-IEEE-NEXT: v_div_scale_f32 v4, vcc, v1, v0, v1
-; GFX6-IEEE-NEXT: v_fma_f32 v5, -v2, v3, 1.0
-; GFX6-IEEE-NEXT: v_fma_f32 v3, v5, v3, v3
-; GFX6-IEEE-NEXT: v_mul_f32_e32 v5, v4, v3
-; GFX6-IEEE-NEXT: v_fma_f32 v6, -v2, v5, v4
-; GFX6-IEEE-NEXT: v_fma_f32 v5, v6, v3, v5
-; GFX6-IEEE-NEXT: v_fma_f32 v2, -v2, v5, v4
-; GFX6-IEEE-NEXT: v_div_fmas_f32 v2, v2, v3, v5
-; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v2, v0, v1
+; GFX6-IEEE-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, 1.0
+; GFX6-IEEE-NEXT: v_rcp_f32_e32 v2, v1
+; GFX6-IEEE-NEXT: v_div_scale_f32 v3, vcc, 1.0, v0, 1.0
+; GFX6-IEEE-NEXT: v_fma_f32 v4, -v1, v2, 1.0
+; GFX6-IEEE-NEXT: v_fma_f32 v2, v4, v2, v2
+; GFX6-IEEE-NEXT: v_mul_f32_e32 v4, v3, v2
+; GFX6-IEEE-NEXT: v_fma_f32 v5, -v1, v4, v3
+; GFX6-IEEE-NEXT: v_fma_f32 v4, v5, v2, v4
+; GFX6-IEEE-NEXT: v_fma_f32 v1, -v1, v4, v3
+; GFX6-IEEE-NEXT: v_div_fmas_f32 v1, v1, v2, v4
+; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v1, v0, 1.0
; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-IEEE-NEXT: s_setpc_b64 s[30:31]
;
@@ -4590,24 +4440,23 @@ define half @v_rsq_f16(half %a) {
; GFX6-FLUSH: ; %bb.0:
; GFX6-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, 1.0
; GFX6-FLUSH-NEXT: v_sqrt_f32_e32 v0, v0
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, v1
-; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v3, v2
-; GFX6-FLUSH-NEXT: v_div_scale_f32 v4, vcc, v1, v0, v1
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, 1.0
+; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v2, v1
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v3, vcc, 1.0, v0, 1.0
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
-; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v2, v3, 1.0
-; GFX6-FLUSH-NEXT: v_fma_f32 v3, v5, v3, v3
-; GFX6-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v3
-; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v5, v4
-; GFX6-FLUSH-NEXT: v_fma_f32 v5, v6, v3, v5
-; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v5, v4
+; GFX6-FLUSH-NEXT: v_fma_f32 v4, -v1, v2, 1.0
+; GFX6-FLUSH-NEXT: v_fma_f32 v2, v4, v2, v2
+; GFX6-FLUSH-NEXT: v_mul_f32_e32 v4, v3, v2
+; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v1, v4, v3
+; GFX6-FLUSH-NEXT: v_fma_f32 v4, v5, v2, v4
+; GFX6-FLUSH-NEXT: v_fma_f32 v1, -v1, v4, v3
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
-; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v5
-; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v2, v0, v1
+; GFX6-FLUSH-NEXT: v_div_fmas_f32 v1, v1, v2, v4
+; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v1, v0, 1.0
; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-FLUSH-NEXT: s_setpc_b64 s[30:31]
;
@@ -4632,21 +4481,20 @@ define half @v_neg_rsq_f16(half %a) {
; GFX6-IEEE: ; %bb.0:
; GFX6-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, -1.0
; GFX6-IEEE-NEXT: v_sqrt_f32_e32 v0, v0
; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX6-IEEE-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, v1
-; GFX6-IEEE-NEXT: v_rcp_f32_e32 v3, v2
-; GFX6-IEEE-NEXT: v_div_scale_f32 v4, vcc, v1, v0, v1
-; GFX6-IEEE-NEXT: v_fma_f32 v5, -v2, v3, 1.0
-; GFX6-IEEE-NEXT: v_fma_f32 v3, v5, v3, v3
-; GFX6-IEEE-NEXT: v_mul_f32_e32 v5, v4, v3
-; GFX6-IEEE-NEXT: v_fma_f32 v6, -v2, v5, v4
-; GFX6-IEEE-NEXT: v_fma_f32 v5, v6, v3, v5
-; GFX6-IEEE-NEXT: v_fma_f32 v2, -v2, v5, v4
-; GFX6-IEEE-NEXT: v_div_fmas_f32 v2, v2, v3, v5
-; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v2, v0, v1
+; GFX6-IEEE-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, -1.0
+; GFX6-IEEE-NEXT: v_rcp_f32_e32 v2, v1
+; GFX6-IEEE-NEXT: v_div_scale_f32 v3, vcc, -1.0, v0, -1.0
+; GFX6-IEEE-NEXT: v_fma_f32 v4, -v1, v2, 1.0
+; GFX6-IEEE-NEXT: v_fma_f32 v2, v4, v2, v2
+; GFX6-IEEE-NEXT: v_mul_f32_e32 v4, v3, v2
+; GFX6-IEEE-NEXT: v_fma_f32 v5, -v1, v4, v3
+; GFX6-IEEE-NEXT: v_fma_f32 v4, v5, v2, v4
+; GFX6-IEEE-NEXT: v_fma_f32 v1, -v1, v4, v3
+; GFX6-IEEE-NEXT: v_div_fmas_f32 v1, v1, v2, v4
+; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v1, v0, -1.0
; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-IEEE-NEXT: s_setpc_b64 s[30:31]
;
@@ -4654,24 +4502,23 @@ define half @v_neg_rsq_f16(half %a) {
; GFX6-FLUSH: ; %bb.0:
; GFX6-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, -1.0
; GFX6-FLUSH-NEXT: v_sqrt_f32_e32 v0, v0
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, v1
-; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v3, v2
-; GFX6-FLUSH-NEXT: v_div_scale_f32 v4, vcc, v1, v0, v1
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, -1.0
+; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v2, v1
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v3, vcc, -1.0, v0, -1.0
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
-; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v2, v3, 1.0
-; GFX6-FLUSH-NEXT: v_fma_f32 v3, v5, v3, v3
-; GFX6-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v3
-; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v5, v4
-; GFX6-FLUSH-NEXT: v_fma_f32 v5, v6, v3, v5
-; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v5, v4
+; GFX6-FLUSH-NEXT: v_fma_f32 v4, -v1, v2, 1.0
+; GFX6-FLUSH-NEXT: v_fma_f32 v2, v4, v2, v2
+; GFX6-FLUSH-NEXT: v_mul_f32_e32 v4, v3, v2
+; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v1, v4, v3
+; GFX6-FLUSH-NEXT: v_fma_f32 v4, v5, v2, v4
+; GFX6-FLUSH-NEXT: v_fma_f32 v1, -v1, v4, v3
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
-; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v5
-; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v2, v0, v1
+; GFX6-FLUSH-NEXT: v_div_fmas_f32 v1, v1, v2, v4
+; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v1, v0, -1.0
; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-FLUSH-NEXT: s_setpc_b64 s[30:31]
;
@@ -4706,21 +4553,20 @@ define { half, half } @v_rsq_f16_multi_use(half %a) {
; GFX6-IEEE: ; %bb.0:
; GFX6-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, 1.0
; GFX6-IEEE-NEXT: v_sqrt_f32_e32 v0, v0
; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v2, v0
-; GFX6-IEEE-NEXT: v_div_scale_f32 v3, s[4:5], v2, v2, v1
-; GFX6-IEEE-NEXT: v_rcp_f32_e32 v4, v3
-; GFX6-IEEE-NEXT: v_div_scale_f32 v5, vcc, v1, v2, v1
-; GFX6-IEEE-NEXT: v_fma_f32 v6, -v3, v4, 1.0
-; GFX6-IEEE-NEXT: v_fma_f32 v4, v6, v4, v4
-; GFX6-IEEE-NEXT: v_mul_f32_e32 v6, v5, v4
-; GFX6-IEEE-NEXT: v_fma_f32 v7, -v3, v6, v5
-; GFX6-IEEE-NEXT: v_fma_f32 v6, v7, v4, v6
-; GFX6-IEEE-NEXT: v_fma_f32 v3, -v3, v6, v5
-; GFX6-IEEE-NEXT: v_div_fmas_f32 v3, v3, v4, v6
-; GFX6-IEEE-NEXT: v_div_fixup_f32 v1, v3, v2, v1
+; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, v0
+; GFX6-IEEE-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, 1.0
+; GFX6-IEEE-NEXT: v_rcp_f32_e32 v3, v2
+; GFX6-IEEE-NEXT: v_div_scale_f32 v4, vcc, 1.0, v1, 1.0
+; GFX6-IEEE-NEXT: v_fma_f32 v5, -v2, v3, 1.0
+; GFX6-IEEE-NEXT: v_fma_f32 v3, v5, v3, v3
+; GFX6-IEEE-NEXT: v_mul_f32_e32 v5, v4, v3
+; GFX6-IEEE-NEXT: v_fma_f32 v6, -v2, v5, v4
+; GFX6-IEEE-NEXT: v_fma_f32 v5, v6, v3, v5
+; GFX6-IEEE-NEXT: v_fma_f32 v2, -v2, v5, v4
+; GFX6-IEEE-NEXT: v_div_fmas_f32 v2, v2, v3, v5
+; GFX6-IEEE-NEXT: v_div_fixup_f32 v1, v2, v1, 1.0
; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX6-IEEE-NEXT: s_setpc_b64 s[30:31]
;
@@ -4728,24 +4574,23 @@ define { half, half } @v_rsq_f16_multi_use(half %a) {
; GFX6-FLUSH: ; %bb.0:
; GFX6-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, 1.0
; GFX6-FLUSH-NEXT: v_sqrt_f32_e32 v0, v0
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v2, v0
-; GFX6-FLUSH-NEXT: v_div_scale_f32 v3, s[4:5], v2, v2, v1
-; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v4, v3
-; GFX6-FLUSH-NEXT: v_div_scale_f32 v5, vcc, v1, v2, v1
+; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, v0
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, 1.0
+; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v3, v2
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v4, vcc, 1.0, v1, 1.0
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
-; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v3, v4, 1.0
-; GFX6-FLUSH-NEXT: v_fma_f32 v4, v6, v4, v4
-; GFX6-FLUSH-NEXT: v_mul_f32_e32 v6, v5, v4
-; GFX6-FLUSH-NEXT: v_fma_f32 v7, -v3, v6, v5
-; GFX6-FLUSH-NEXT: v_fma_f32 v6, v7, v4, v6
-; GFX6-FLUSH-NEXT: v_fma_f32 v3, -v3, v6, v5
+; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v2, v3, 1.0
+; GFX6-FLUSH-NEXT: v_fma_f32 v3, v5, v3, v3
+; GFX6-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v3
+; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v5, v4
+; GFX6-FLUSH-NEXT: v_fma_f32 v5, v6, v3, v5
+; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v5, v4
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
-; GFX6-FLUSH-NEXT: v_div_fmas_f32 v3, v3, v4, v6
-; GFX6-FLUSH-NEXT: v_div_fixup_f32 v1, v3, v2, v1
+; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v5
+; GFX6-FLUSH-NEXT: v_div_fixup_f32 v1, v2, v1, 1.0
; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX6-FLUSH-NEXT: s_setpc_b64 s[30:31]
;
@@ -4785,21 +4630,20 @@ define half @v_rsq_f16_missing_contract0(half %a) {
; GFX6-IEEE: ; %bb.0:
; GFX6-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, 1.0
; GFX6-IEEE-NEXT: v_sqrt_f32_e32 v0, v0
; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX6-IEEE-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, v1
-; GFX6-IEEE-NEXT: v_rcp_f32_e32 v3, v2
-; GFX6-IEEE-NEXT: v_div_scale_f32 v4, vcc, v1, v0, v1
-; GFX6-IEEE-NEXT: v_fma_f32 v5, -v2, v3, 1.0
-; GFX6-IEEE-NEXT: v_fma_f32 v3, v5, v3, v3
-; GFX6-IEEE-NEXT: v_mul_f32_e32 v5, v4, v3
-; GFX6-IEEE-NEXT: v_fma_f32 v6, -v2, v5, v4
-; GFX6-IEEE-NEXT: v_fma_f32 v5, v6, v3, v5
-; GFX6-IEEE-NEXT: v_fma_f32 v2, -v2, v5, v4
-; GFX6-IEEE-NEXT: v_div_fmas_f32 v2, v2, v3, v5
-; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v2, v0, v1
+; GFX6-IEEE-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, 1.0
+; GFX6-IEEE-NEXT: v_rcp_f32_e32 v2, v1
+; GFX6-IEEE-NEXT: v_div_scale_f32 v3, vcc, 1.0, v0, 1.0
+; GFX6-IEEE-NEXT: v_fma_f32 v4, -v1, v2, 1.0
+; GFX6-IEEE-NEXT: v_fma_f32 v2, v4, v2, v2
+; GFX6-IEEE-NEXT: v_mul_f32_e32 v4, v3, v2
+; GFX6-IEEE-NEXT: v_fma_f32 v5, -v1, v4, v3
+; GFX6-IEEE-NEXT: v_fma_f32 v4, v5, v2, v4
+; GFX6-IEEE-NEXT: v_fma_f32 v1, -v1, v4, v3
+; GFX6-IEEE-NEXT: v_div_fmas_f32 v1, v1, v2, v4
+; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v1, v0, 1.0
; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-IEEE-NEXT: s_setpc_b64 s[30:31]
;
@@ -4807,24 +4651,23 @@ define half @v_rsq_f16_missing_contract0(half %a) {
; GFX6-FLUSH: ; %bb.0:
; GFX6-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, 1.0
; GFX6-FLUSH-NEXT: v_sqrt_f32_e32 v0, v0
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, v1
-; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v3, v2
-; GFX6-FLUSH-NEXT: v_div_scale_f32 v4, vcc, v1, v0, v1
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, 1.0
+; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v2, v1
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v3, vcc, 1.0, v0, 1.0
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
-; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v2, v3, 1.0
-; GFX6-FLUSH-NEXT: v_fma_f32 v3, v5, v3, v3
-; GFX6-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v3
-; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v5, v4
-; GFX6-FLUSH-NEXT: v_fma_f32 v5, v6, v3, v5
-; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v5, v4
+; GFX6-FLUSH-NEXT: v_fma_f32 v4, -v1, v2, 1.0
+; GFX6-FLUSH-NEXT: v_fma_f32 v2, v4, v2, v2
+; GFX6-FLUSH-NEXT: v_mul_f32_e32 v4, v3, v2
+; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v1, v4, v3
+; GFX6-FLUSH-NEXT: v_fma_f32 v4, v5, v2, v4
+; GFX6-FLUSH-NEXT: v_fma_f32 v1, -v1, v4, v3
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
-; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v5
-; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v2, v0, v1
+; GFX6-FLUSH-NEXT: v_div_fmas_f32 v1, v1, v2, v4
+; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v1, v0, 1.0
; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-FLUSH-NEXT: s_setpc_b64 s[30:31]
;
@@ -4859,21 +4702,20 @@ define half @v_rsq_f16_missing_contract1(half %a) {
; GFX6-IEEE: ; %bb.0:
; GFX6-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, 1.0
; GFX6-IEEE-NEXT: v_sqrt_f32_e32 v0, v0
; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX6-IEEE-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, v1
-; GFX6-IEEE-NEXT: v_rcp_f32_e32 v3, v2
-; GFX6-IEEE-NEXT: v_div_scale_f32 v4, vcc, v1, v0, v1
-; GFX6-IEEE-NEXT: v_fma_f32 v5, -v2, v3, 1.0
-; GFX6-IEEE-NEXT: v_fma_f32 v3, v5, v3, v3
-; GFX6-IEEE-NEXT: v_mul_f32_e32 v5, v4, v3
-; GFX6-IEEE-NEXT: v_fma_f32 v6, -v2, v5, v4
-; GFX6-IEEE-NEXT: v_fma_f32 v5, v6, v3, v5
-; GFX6-IEEE-NEXT: v_fma_f32 v2, -v2, v5, v4
-; GFX6-IEEE-NEXT: v_div_fmas_f32 v2, v2, v3, v5
-; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v2, v0, v1
+; GFX6-IEEE-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, 1.0
+; GFX6-IEEE-NEXT: v_rcp_f32_e32 v2, v1
+; GFX6-IEEE-NEXT: v_div_scale_f32 v3, vcc, 1.0, v0, 1.0
+; GFX6-IEEE-NEXT: v_fma_f32 v4, -v1, v2, 1.0
+; GFX6-IEEE-NEXT: v_fma_f32 v2, v4, v2, v2
+; GFX6-IEEE-NEXT: v_mul_f32_e32 v4, v3, v2
+; GFX6-IEEE-NEXT: v_fma_f32 v5, -v1, v4, v3
+; GFX6-IEEE-NEXT: v_fma_f32 v4, v5, v2, v4
+; GFX6-IEEE-NEXT: v_fma_f32 v1, -v1, v4, v3
+; GFX6-IEEE-NEXT: v_div_fmas_f32 v1, v1, v2, v4
+; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v1, v0, 1.0
; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-IEEE-NEXT: s_setpc_b64 s[30:31]
;
@@ -4881,24 +4723,23 @@ define half @v_rsq_f16_missing_contract1(half %a) {
; GFX6-FLUSH: ; %bb.0:
; GFX6-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, 1.0
; GFX6-FLUSH-NEXT: v_sqrt_f32_e32 v0, v0
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, v1
-; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v3, v2
-; GFX6-FLUSH-NEXT: v_div_scale_f32 v4, vcc, v1, v0, v1
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, 1.0
+; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v2, v1
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v3, vcc, 1.0, v0, 1.0
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
-; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v2, v3, 1.0
-; GFX6-FLUSH-NEXT: v_fma_f32 v3, v5, v3, v3
-; GFX6-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v3
-; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v5, v4
-; GFX6-FLUSH-NEXT: v_fma_f32 v5, v6, v3, v5
-; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v5, v4
+; GFX6-FLUSH-NEXT: v_fma_f32 v4, -v1, v2, 1.0
+; GFX6-FLUSH-NEXT: v_fma_f32 v2, v4, v2, v2
+; GFX6-FLUSH-NEXT: v_mul_f32_e32 v4, v3, v2
+; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v1, v4, v3
+; GFX6-FLUSH-NEXT: v_fma_f32 v4, v5, v2, v4
+; GFX6-FLUSH-NEXT: v_fma_f32 v1, -v1, v4, v3
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
-; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v5
-; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v2, v0, v1
+; GFX6-FLUSH-NEXT: v_div_fmas_f32 v1, v1, v2, v4
+; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v1, v0, 1.0
; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-FLUSH-NEXT: s_setpc_b64 s[30:31]
;
@@ -4933,21 +4774,20 @@ define half @v_neg_rsq_f16_missing_contract0(half %a) {
; GFX6-IEEE: ; %bb.0:
; GFX6-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, -1.0
; GFX6-IEEE-NEXT: v_sqrt_f32_e32 v0, v0
; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX6-IEEE-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, v1
-; GFX6-IEEE-NEXT: v_rcp_f32_e32 v3, v2
-; GFX6-IEEE-NEXT: v_div_scale_f32 v4, vcc, v1, v0, v1
-; GFX6-IEEE-NEXT: v_fma_f32 v5, -v2, v3, 1.0
-; GFX6-IEEE-NEXT: v_fma_f32 v3, v5, v3, v3
-; GFX6-IEEE-NEXT: v_mul_f32_e32 v5, v4, v3
-; GFX6-IEEE-NEXT: v_fma_f32 v6, -v2, v5, v4
-; GFX6-IEEE-NEXT: v_fma_f32 v5, v6, v3, v5
-; GFX6-IEEE-NEXT: v_fma_f32 v2, -v2, v5, v4
-; GFX6-IEEE-NEXT: v_div_fmas_f32 v2, v2, v3, v5
-; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v2, v0, v1
+; GFX6-IEEE-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, -1.0
+; GFX6-IEEE-NEXT: v_rcp_f32_e32 v2, v1
+; GFX6-IEEE-NEXT: v_div_scale_f32 v3, vcc, -1.0, v0, -1.0
+; GFX6-IEEE-NEXT: v_fma_f32 v4, -v1, v2, 1.0
+; GFX6-IEEE-NEXT: v_fma_f32 v2, v4, v2, v2
+; GFX6-IEEE-NEXT: v_mul_f32_e32 v4, v3, v2
+; GFX6-IEEE-NEXT: v_fma_f32 v5, -v1, v4, v3
+; GFX6-IEEE-NEXT: v_fma_f32 v4, v5, v2, v4
+; GFX6-IEEE-NEXT: v_fma_f32 v1, -v1, v4, v3
+; GFX6-IEEE-NEXT: v_div_fmas_f32 v1, v1, v2, v4
+; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v1, v0, -1.0
; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-IEEE-NEXT: s_setpc_b64 s[30:31]
;
@@ -4955,24 +4795,23 @@ define half @v_neg_rsq_f16_missing_contract0(half %a) {
; GFX6-FLUSH: ; %bb.0:
; GFX6-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, -1.0
; GFX6-FLUSH-NEXT: v_sqrt_f32_e32 v0, v0
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, v1
-; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v3, v2
-; GFX6-FLUSH-NEXT: v_div_scale_f32 v4, vcc, v1, v0, v1
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, -1.0
+; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v2, v1
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v3, vcc, -1.0, v0, -1.0
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
-; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v2, v3, 1.0
-; GFX6-FLUSH-NEXT: v_fma_f32 v3, v5, v3, v3
-; GFX6-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v3
-; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v5, v4
-; GFX6-FLUSH-NEXT: v_fma_f32 v5, v6, v3, v5
-; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v5, v4
+; GFX6-FLUSH-NEXT: v_fma_f32 v4, -v1, v2, 1.0
+; GFX6-FLUSH-NEXT: v_fma_f32 v2, v4, v2, v2
+; GFX6-FLUSH-NEXT: v_mul_f32_e32 v4, v3, v2
+; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v1, v4, v3
+; GFX6-FLUSH-NEXT: v_fma_f32 v4, v5, v2, v4
+; GFX6-FLUSH-NEXT: v_fma_f32 v1, -v1, v4, v3
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
-; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v5
-; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v2, v0, v1
+; GFX6-FLUSH-NEXT: v_div_fmas_f32 v1, v1, v2, v4
+; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v1, v0, -1.0
; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-FLUSH-NEXT: s_setpc_b64 s[30:31]
;
@@ -5007,21 +4846,20 @@ define half @v_neg_rsq_f16_missing_contract1(half %a) {
; GFX6-IEEE: ; %bb.0:
; GFX6-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, -1.0
; GFX6-IEEE-NEXT: v_sqrt_f32_e32 v0, v0
; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX6-IEEE-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, v1
-; GFX6-IEEE-NEXT: v_rcp_f32_e32 v3, v2
-; GFX6-IEEE-NEXT: v_div_scale_f32 v4, vcc, v1, v0, v1
-; GFX6-IEEE-NEXT: v_fma_f32 v5, -v2, v3, 1.0
-; GFX6-IEEE-NEXT: v_fma_f32 v3, v5, v3, v3
-; GFX6-IEEE-NEXT: v_mul_f32_e32 v5, v4, v3
-; GFX6-IEEE-NEXT: v_fma_f32 v6, -v2, v5, v4
-; GFX6-IEEE-NEXT: v_fma_f32 v5, v6, v3, v5
-; GFX6-IEEE-NEXT: v_fma_f32 v2, -v2, v5, v4
-; GFX6-IEEE-NEXT: v_div_fmas_f32 v2, v2, v3, v5
-; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v2, v0, v1
+; GFX6-IEEE-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, -1.0
+; GFX6-IEEE-NEXT: v_rcp_f32_e32 v2, v1
+; GFX6-IEEE-NEXT: v_div_scale_f32 v3, vcc, -1.0, v0, -1.0
+; GFX6-IEEE-NEXT: v_fma_f32 v4, -v1, v2, 1.0
+; GFX6-IEEE-NEXT: v_fma_f32 v2, v4, v2, v2
+; GFX6-IEEE-NEXT: v_mul_f32_e32 v4, v3, v2
+; GFX6-IEEE-NEXT: v_fma_f32 v5, -v1, v4, v3
+; GFX6-IEEE-NEXT: v_fma_f32 v4, v5, v2, v4
+; GFX6-IEEE-NEXT: v_fma_f32 v1, -v1, v4, v3
+; GFX6-IEEE-NEXT: v_div_fmas_f32 v1, v1, v2, v4
+; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v1, v0, -1.0
; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-IEEE-NEXT: s_setpc_b64 s[30:31]
;
@@ -5029,24 +4867,23 @@ define half @v_neg_rsq_f16_missing_contract1(half %a) {
; GFX6-FLUSH: ; %bb.0:
; GFX6-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, -1.0
; GFX6-FLUSH-NEXT: v_sqrt_f32_e32 v0, v0
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, v1
-; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v3, v2
-; GFX6-FLUSH-NEXT: v_div_scale_f32 v4, vcc, v1, v0, v1
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, -1.0
+; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v2, v1
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v3, vcc, -1.0, v0, -1.0
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
-; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v2, v3, 1.0
-; GFX6-FLUSH-NEXT: v_fma_f32 v3, v5, v3, v3
-; GFX6-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v3
-; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v5, v4
-; GFX6-FLUSH-NEXT: v_fma_f32 v5, v6, v3, v5
-; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v5, v4
+; GFX6-FLUSH-NEXT: v_fma_f32 v4, -v1, v2, 1.0
+; GFX6-FLUSH-NEXT: v_fma_f32 v2, v4, v2, v2
+; GFX6-FLUSH-NEXT: v_mul_f32_e32 v4, v3, v2
+; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v1, v4, v3
+; GFX6-FLUSH-NEXT: v_fma_f32 v4, v5, v2, v4
+; GFX6-FLUSH-NEXT: v_fma_f32 v1, -v1, v4, v3
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
-; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v5
-; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v2, v0, v1
+; GFX6-FLUSH-NEXT: v_div_fmas_f32 v1, v1, v2, v4
+; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v1, v0, -1.0
; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-FLUSH-NEXT: s_setpc_b64 s[30:31]
;
@@ -5081,21 +4918,20 @@ define half @v_neg_rsq_f16_fabs(half %a) {
; GFX6-IEEE: ; %bb.0:
; GFX6-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-IEEE-NEXT: v_cvt_f32_f16_e64 v0, |v0|
-; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, -1.0
; GFX6-IEEE-NEXT: v_sqrt_f32_e32 v0, v0
; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX6-IEEE-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, v1
-; GFX6-IEEE-NEXT: v_rcp_f32_e32 v3, v2
-; GFX6-IEEE-NEXT: v_div_scale_f32 v4, vcc, v1, v0, v1
-; GFX6-IEEE-NEXT: v_fma_f32 v5, -v2, v3, 1.0
-; GFX6-IEEE-NEXT: v_fma_f32 v3, v5, v3, v3
-; GFX6-IEEE-NEXT: v_mul_f32_e32 v5, v4, v3
-; GFX6-IEEE-NEXT: v_fma_f32 v6, -v2, v5, v4
-; GFX6-IEEE-NEXT: v_fma_f32 v5, v6, v3, v5
-; GFX6-IEEE-NEXT: v_fma_f32 v2, -v2, v5, v4
-; GFX6-IEEE-NEXT: v_div_fmas_f32 v2, v2, v3, v5
-; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v2, v0, v1
+; GFX6-IEEE-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, -1.0
+; GFX6-IEEE-NEXT: v_rcp_f32_e32 v2, v1
+; GFX6-IEEE-NEXT: v_div_scale_f32 v3, vcc, -1.0, v0, -1.0
+; GFX6-IEEE-NEXT: v_fma_f32 v4, -v1, v2, 1.0
+; GFX6-IEEE-NEXT: v_fma_f32 v2, v4, v2, v2
+; GFX6-IEEE-NEXT: v_mul_f32_e32 v4, v3, v2
+; GFX6-IEEE-NEXT: v_fma_f32 v5, -v1, v4, v3
+; GFX6-IEEE-NEXT: v_fma_f32 v4, v5, v2, v4
+; GFX6-IEEE-NEXT: v_fma_f32 v1, -v1, v4, v3
+; GFX6-IEEE-NEXT: v_div_fmas_f32 v1, v1, v2, v4
+; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v1, v0, -1.0
; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-IEEE-NEXT: s_setpc_b64 s[30:31]
;
@@ -5103,24 +4939,23 @@ define half @v_neg_rsq_f16_fabs(half %a) {
; GFX6-FLUSH: ; %bb.0:
; GFX6-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e64 v0, |v0|
-; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, -1.0
; GFX6-FLUSH-NEXT: v_sqrt_f32_e32 v0, v0
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, v1
-; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v3, v2
-; GFX6-FLUSH-NEXT: v_div_scale_f32 v4, vcc, v1, v0, v1
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, -1.0
+; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v2, v1
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v3, vcc, -1.0, v0, -1.0
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
-; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v2, v3, 1.0
-; GFX6-FLUSH-NEXT: v_fma_f32 v3, v5, v3, v3
-; GFX6-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v3
-; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v5, v4
-; GFX6-FLUSH-NEXT: v_fma_f32 v5, v6, v3, v5
-; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v5, v4
+; GFX6-FLUSH-NEXT: v_fma_f32 v4, -v1, v2, 1.0
+; GFX6-FLUSH-NEXT: v_fma_f32 v2, v4, v2, v2
+; GFX6-FLUSH-NEXT: v_mul_f32_e32 v4, v3, v2
+; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v1, v4, v3
+; GFX6-FLUSH-NEXT: v_fma_f32 v4, v5, v2, v4
+; GFX6-FLUSH-NEXT: v_fma_f32 v1, -v1, v4, v3
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
-; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v5
-; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v2, v0, v1
+; GFX6-FLUSH-NEXT: v_div_fmas_f32 v1, v1, v2, v4
+; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v1, v0, -1.0
; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-FLUSH-NEXT: s_setpc_b64 s[30:31]
;
@@ -5156,21 +4991,20 @@ define half @v_rsq_f16_arcp(half %a) {
; GFX6-IEEE: ; %bb.0:
; GFX6-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, 1.0
; GFX6-IEEE-NEXT: v_sqrt_f32_e32 v0, v0
; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX6-IEEE-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, v1
-; GFX6-IEEE-NEXT: v_rcp_f32_e32 v3, v2
-; GFX6-IEEE-NEXT: v_div_scale_f32 v4, vcc, v1, v0, v1
-; GFX6-IEEE-NEXT: v_fma_f32 v5, -v2, v3, 1.0
-; GFX6-IEEE-NEXT: v_fma_f32 v3, v5, v3, v3
-; GFX6-IEEE-NEXT: v_mul_f32_e32 v5, v4, v3
-; GFX6-IEEE-NEXT: v_fma_f32 v6, -v2, v5, v4
-; GFX6-IEEE-NEXT: v_fma_f32 v5, v6, v3, v5
-; GFX6-IEEE-NEXT: v_fma_f32 v2, -v2, v5, v4
-; GFX6-IEEE-NEXT: v_div_fmas_f32 v2, v2, v3, v5
-; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v2, v0, v1
+; GFX6-IEEE-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, 1.0
+; GFX6-IEEE-NEXT: v_rcp_f32_e32 v2, v1
+; GFX6-IEEE-NEXT: v_div_scale_f32 v3, vcc, 1.0, v0, 1.0
+; GFX6-IEEE-NEXT: v_fma_f32 v4, -v1, v2, 1.0
+; GFX6-IEEE-NEXT: v_fma_f32 v2, v4, v2, v2
+; GFX6-IEEE-NEXT: v_mul_f32_e32 v4, v3, v2
+; GFX6-IEEE-NEXT: v_fma_f32 v5, -v1, v4, v3
+; GFX6-IEEE-NEXT: v_fma_f32 v4, v5, v2, v4
+; GFX6-IEEE-NEXT: v_fma_f32 v1, -v1, v4, v3
+; GFX6-IEEE-NEXT: v_div_fmas_f32 v1, v1, v2, v4
+; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v1, v0, 1.0
; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-IEEE-NEXT: s_setpc_b64 s[30:31]
;
@@ -5178,24 +5012,23 @@ define half @v_rsq_f16_arcp(half %a) {
; GFX6-FLUSH: ; %bb.0:
; GFX6-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, 1.0
; GFX6-FLUSH-NEXT: v_sqrt_f32_e32 v0, v0
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, v1
-; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v3, v2
-; GFX6-FLUSH-NEXT: v_div_scale_f32 v4, vcc, v1, v0, v1
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, 1.0
+; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v2, v1
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v3, vcc, 1.0, v0, 1.0
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
-; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v2, v3, 1.0
-; GFX6-FLUSH-NEXT: v_fma_f32 v3, v5, v3, v3
-; GFX6-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v3
-; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v5, v4
-; GFX6-FLUSH-NEXT: v_fma_f32 v5, v6, v3, v5
-; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v5, v4
+; GFX6-FLUSH-NEXT: v_fma_f32 v4, -v1, v2, 1.0
+; GFX6-FLUSH-NEXT: v_fma_f32 v2, v4, v2, v2
+; GFX6-FLUSH-NEXT: v_mul_f32_e32 v4, v3, v2
+; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v1, v4, v3
+; GFX6-FLUSH-NEXT: v_fma_f32 v4, v5, v2, v4
+; GFX6-FLUSH-NEXT: v_fma_f32 v1, -v1, v4, v3
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
-; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v5
-; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v2, v0, v1
+; GFX6-FLUSH-NEXT: v_div_fmas_f32 v1, v1, v2, v4
+; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v1, v0, 1.0
; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-FLUSH-NEXT: s_setpc_b64 s[30:31]
;
@@ -5220,21 +5053,20 @@ define half @v_neg_rsq_f16_arcp(half %a) {
; GFX6-IEEE: ; %bb.0:
; GFX6-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, -1.0
; GFX6-IEEE-NEXT: v_sqrt_f32_e32 v0, v0
; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX6-IEEE-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, v1
-; GFX6-IEEE-NEXT: v_rcp_f32_e32 v3, v2
-; GFX6-IEEE-NEXT: v_div_scale_f32 v4, vcc, v1, v0, v1
-; GFX6-IEEE-NEXT: v_fma_f32 v5, -v2, v3, 1.0
-; GFX6-IEEE-NEXT: v_fma_f32 v3, v5, v3, v3
-; GFX6-IEEE-NEXT: v_mul_f32_e32 v5, v4, v3
-; GFX6-IEEE-NEXT: v_fma_f32 v6, -v2, v5, v4
-; GFX6-IEEE-NEXT: v_fma_f32 v5, v6, v3, v5
-; GFX6-IEEE-NEXT: v_fma_f32 v2, -v2, v5, v4
-; GFX6-IEEE-NEXT: v_div_fmas_f32 v2, v2, v3, v5
-; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v2, v0, v1
+; GFX6-IEEE-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, -1.0
+; GFX6-IEEE-NEXT: v_rcp_f32_e32 v2, v1
+; GFX6-IEEE-NEXT: v_div_scale_f32 v3, vcc, -1.0, v0, -1.0
+; GFX6-IEEE-NEXT: v_fma_f32 v4, -v1, v2, 1.0
+; GFX6-IEEE-NEXT: v_fma_f32 v2, v4, v2, v2
+; GFX6-IEEE-NEXT: v_mul_f32_e32 v4, v3, v2
+; GFX6-IEEE-NEXT: v_fma_f32 v5, -v1, v4, v3
+; GFX6-IEEE-NEXT: v_fma_f32 v4, v5, v2, v4
+; GFX6-IEEE-NEXT: v_fma_f32 v1, -v1, v4, v3
+; GFX6-IEEE-NEXT: v_div_fmas_f32 v1, v1, v2, v4
+; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v1, v0, -1.0
; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-IEEE-NEXT: s_setpc_b64 s[30:31]
;
@@ -5242,24 +5074,23 @@ define half @v_neg_rsq_f16_arcp(half %a) {
; GFX6-FLUSH: ; %bb.0:
; GFX6-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, -1.0
; GFX6-FLUSH-NEXT: v_sqrt_f32_e32 v0, v0
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, v1
-; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v3, v2
-; GFX6-FLUSH-NEXT: v_div_scale_f32 v4, vcc, v1, v0, v1
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, -1.0
+; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v2, v1
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v3, vcc, -1.0, v0, -1.0
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
-; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v2, v3, 1.0
-; GFX6-FLUSH-NEXT: v_fma_f32 v3, v5, v3, v3
-; GFX6-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v3
-; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v5, v4
-; GFX6-FLUSH-NEXT: v_fma_f32 v5, v6, v3, v5
-; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v5, v4
+; GFX6-FLUSH-NEXT: v_fma_f32 v4, -v1, v2, 1.0
+; GFX6-FLUSH-NEXT: v_fma_f32 v2, v4, v2, v2
+; GFX6-FLUSH-NEXT: v_mul_f32_e32 v4, v3, v2
+; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v1, v4, v3
+; GFX6-FLUSH-NEXT: v_fma_f32 v4, v5, v2, v4
+; GFX6-FLUSH-NEXT: v_fma_f32 v1, -v1, v4, v3
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
-; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v5
-; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v2, v0, v1
+; GFX6-FLUSH-NEXT: v_div_fmas_f32 v1, v1, v2, v4
+; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v1, v0, -1.0
; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-FLUSH-NEXT: s_setpc_b64 s[30:31]
;
@@ -5294,12 +5125,10 @@ define half @v_rsq_f16_afn(half %a) {
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX6-NEXT: v_cvt_f32_f16_e32 v1, 1.0
; GFX6-NEXT: v_sqrt_f32_e32 v0, v0
; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX6-NEXT: v_rcp_f32_e32 v0, v0
-; GFX6-NEXT: v_mul_f32_e32 v0, v1, v0
; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-NEXT: s_setpc_b64 s[30:31]
;
@@ -5324,12 +5153,10 @@ define half @v_rsq_f16_afn_nocontract(half %a) {
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX6-NEXT: v_cvt_f32_f16_e32 v1, 1.0
; GFX6-NEXT: v_sqrt_f32_e32 v0, v0
; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX6-NEXT: v_rcp_f32_e32 v0, v0
-; GFX6-NEXT: v_mul_f32_e32 v0, v1, v0
; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-NEXT: s_setpc_b64 s[30:31]
;
@@ -5365,36 +5192,35 @@ define <2 x half> @v_rsq_v2f16(<2 x half> %a) {
; GFX6-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, v1
-; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v2, 1.0
; GFX6-IEEE-NEXT: v_sqrt_f32_e32 v0, v0
; GFX6-IEEE-NEXT: v_sqrt_f32_e32 v1, v1
; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, v1
-; GFX6-IEEE-NEXT: v_div_scale_f32 v3, s[4:5], v0, v0, v2
-; GFX6-IEEE-NEXT: v_rcp_f32_e32 v6, v3
-; GFX6-IEEE-NEXT: v_div_scale_f32 v4, vcc, v2, v0, v2
-; GFX6-IEEE-NEXT: v_div_scale_f32 v5, s[4:5], v1, v1, v2
-; GFX6-IEEE-NEXT: v_fma_f32 v9, -v3, v6, 1.0
-; GFX6-IEEE-NEXT: v_fma_f32 v6, v9, v6, v6
-; GFX6-IEEE-NEXT: v_mul_f32_e32 v9, v4, v6
-; GFX6-IEEE-NEXT: v_rcp_f32_e32 v8, v5
-; GFX6-IEEE-NEXT: v_fma_f32 v10, -v3, v9, v4
-; GFX6-IEEE-NEXT: v_fma_f32 v9, v10, v6, v9
-; GFX6-IEEE-NEXT: v_fma_f32 v3, -v3, v9, v4
-; GFX6-IEEE-NEXT: v_div_fmas_f32 v3, v3, v6, v9
-; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v3, v0, v2
-; GFX6-IEEE-NEXT: v_fma_f32 v3, -v5, v8, 1.0
-; GFX6-IEEE-NEXT: v_div_scale_f32 v7, s[4:5], v2, v1, v2
-; GFX6-IEEE-NEXT: v_fma_f32 v3, v3, v8, v8
-; GFX6-IEEE-NEXT: v_mul_f32_e32 v4, v7, v3
-; GFX6-IEEE-NEXT: v_fma_f32 v6, -v5, v4, v7
-; GFX6-IEEE-NEXT: v_fma_f32 v4, v6, v3, v4
-; GFX6-IEEE-NEXT: v_fma_f32 v5, -v5, v4, v7
+; GFX6-IEEE-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, 1.0
+; GFX6-IEEE-NEXT: v_rcp_f32_e32 v5, v2
+; GFX6-IEEE-NEXT: v_div_scale_f32 v3, vcc, 1.0, v0, 1.0
+; GFX6-IEEE-NEXT: v_div_scale_f32 v4, s[4:5], v1, v1, 1.0
+; GFX6-IEEE-NEXT: v_fma_f32 v8, -v2, v5, 1.0
+; GFX6-IEEE-NEXT: v_fma_f32 v5, v8, v5, v5
+; GFX6-IEEE-NEXT: v_rcp_f32_e32 v6, v4
+; GFX6-IEEE-NEXT: v_mul_f32_e32 v8, v3, v5
+; GFX6-IEEE-NEXT: v_fma_f32 v10, -v2, v8, v3
+; GFX6-IEEE-NEXT: v_fma_f32 v8, v10, v5, v8
+; GFX6-IEEE-NEXT: v_fma_f32 v2, -v2, v8, v3
+; GFX6-IEEE-NEXT: v_fma_f32 v9, -v4, v6, 1.0
+; GFX6-IEEE-NEXT: v_div_fmas_f32 v2, v2, v5, v8
+; GFX6-IEEE-NEXT: v_div_scale_f32 v7, s[4:5], 1.0, v1, 1.0
+; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v2, v0, 1.0
+; GFX6-IEEE-NEXT: v_fma_f32 v2, v9, v6, v6
+; GFX6-IEEE-NEXT: v_mul_f32_e32 v3, v7, v2
+; GFX6-IEEE-NEXT: v_fma_f32 v5, -v4, v3, v7
+; GFX6-IEEE-NEXT: v_fma_f32 v3, v5, v2, v3
+; GFX6-IEEE-NEXT: v_fma_f32 v4, -v4, v3, v7
; GFX6-IEEE-NEXT: s_mov_b64 vcc, s[4:5]
-; GFX6-IEEE-NEXT: v_div_fmas_f32 v3, v5, v3, v4
-; GFX6-IEEE-NEXT: v_div_fixup_f32 v1, v3, v1, v2
+; GFX6-IEEE-NEXT: v_div_fmas_f32 v2, v4, v2, v3
+; GFX6-IEEE-NEXT: v_div_fixup_f32 v1, v2, v1, 1.0
; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX6-IEEE-NEXT: s_setpc_b64 s[30:31]
@@ -5404,42 +5230,40 @@ define <2 x half> @v_rsq_v2f16(<2 x half> %a) {
; GFX6-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, v1
-; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v2, 1.0
; GFX6-FLUSH-NEXT: v_sqrt_f32_e32 v0, v0
; GFX6-FLUSH-NEXT: v_sqrt_f32_e32 v1, v1
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX6-FLUSH-NEXT: v_div_scale_f32 v3, s[4:5], v0, v0, v2
-; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v4, v3
-; GFX6-FLUSH-NEXT: v_div_scale_f32 v5, vcc, v2, v0, v2
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, 1.0
+; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v3, v2
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v4, vcc, 1.0, v0, 1.0
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
-; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v3, v4, 1.0
-; GFX6-FLUSH-NEXT: v_fma_f32 v4, v6, v4, v4
-; GFX6-FLUSH-NEXT: v_mul_f32_e32 v6, v5, v4
-; GFX6-FLUSH-NEXT: v_fma_f32 v7, -v3, v6, v5
-; GFX6-FLUSH-NEXT: v_fma_f32 v6, v7, v4, v6
-; GFX6-FLUSH-NEXT: v_fma_f32 v3, -v3, v6, v5
+; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v2, v3, 1.0
+; GFX6-FLUSH-NEXT: v_fma_f32 v3, v5, v3, v3
+; GFX6-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v3
+; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v5, v4
+; GFX6-FLUSH-NEXT: v_fma_f32 v5, v6, v3, v5
+; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v5, v4
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
-; GFX6-FLUSH-NEXT: v_div_fmas_f32 v3, v3, v4, v6
-; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v4, 1.0
; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, v1
-; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v3, v0, v2
+; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v5
+; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v2, v0, 1.0
; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, v4
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, 1.0
; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v3, v2
-; GFX6-FLUSH-NEXT: v_div_scale_f32 v5, vcc, v4, v1, v4
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v4, vcc, 1.0, v1, 1.0
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
-; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v3, 1.0
-; GFX6-FLUSH-NEXT: v_fma_f32 v3, v6, v3, v3
-; GFX6-FLUSH-NEXT: v_mul_f32_e32 v6, v5, v3
-; GFX6-FLUSH-NEXT: v_fma_f32 v7, -v2, v6, v5
-; GFX6-FLUSH-NEXT: v_fma_f32 v6, v7, v3, v6
-; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v6, v5
+; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v2, v3, 1.0
+; GFX6-FLUSH-NEXT: v_fma_f32 v3, v5, v3, v3
+; GFX6-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v3
+; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v5, v4
+; GFX6-FLUSH-NEXT: v_fma_f32 v5, v6, v3, v5
+; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v5, v4
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
-; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v6
-; GFX6-FLUSH-NEXT: v_div_fixup_f32 v1, v2, v1, v4
+; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v5
+; GFX6-FLUSH-NEXT: v_div_fixup_f32 v1, v2, v1, 1.0
; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX6-FLUSH-NEXT: s_setpc_b64 s[30:31]
;
@@ -5448,31 +5272,28 @@ define <2 x half> @v_rsq_v2f16(<2 x half> %a) {
; GFX8-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-IEEE-NEXT: v_sqrt_f16_e32 v1, v0
; GFX8-IEEE-NEXT: v_sqrt_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX8-IEEE-NEXT: v_cvt_f32_f16_e32 v4, 1.0
; GFX8-IEEE-NEXT: v_cvt_f32_f16_e32 v2, v1
; GFX8-IEEE-NEXT: v_cvt_f32_f16_e32 v3, v0
-; GFX8-IEEE-NEXT: v_rcp_f32_e32 v5, v2
-; GFX8-IEEE-NEXT: v_rcp_f32_e32 v6, v3
-; GFX8-IEEE-NEXT: v_mul_f32_e32 v7, v4, v5
-; GFX8-IEEE-NEXT: v_mul_f32_e32 v8, v4, v6
-; GFX8-IEEE-NEXT: v_mul_f32_e64 v10, -v3, v8
-; GFX8-IEEE-NEXT: v_mul_f32_e64 v9, -v2, v7
-; GFX8-IEEE-NEXT: v_add_f32_e32 v10, v10, v4
-; GFX8-IEEE-NEXT: v_add_f32_e32 v9, v9, v4
-; GFX8-IEEE-NEXT: v_mul_f32_e32 v10, v10, v6
-; GFX8-IEEE-NEXT: v_mul_f32_e32 v9, v9, v5
-; GFX8-IEEE-NEXT: v_add_f32_e32 v8, v10, v8
-; GFX8-IEEE-NEXT: v_add_f32_e32 v7, v9, v7
-; GFX8-IEEE-NEXT: v_mul_f32_e64 v3, -v3, v8
-; GFX8-IEEE-NEXT: v_mul_f32_e64 v2, -v2, v7
-; GFX8-IEEE-NEXT: v_add_f32_e32 v3, v3, v4
-; GFX8-IEEE-NEXT: v_add_f32_e32 v2, v2, v4
-; GFX8-IEEE-NEXT: v_mul_f32_e32 v3, v3, v6
-; GFX8-IEEE-NEXT: v_mul_f32_e32 v2, v2, v5
+; GFX8-IEEE-NEXT: v_rcp_f32_e32 v4, v2
+; GFX8-IEEE-NEXT: v_rcp_f32_e32 v5, v3
+; GFX8-IEEE-NEXT: v_mul_f32_e64 v6, -v2, v4
+; GFX8-IEEE-NEXT: v_mul_f32_e64 v7, -v3, v5
+; GFX8-IEEE-NEXT: v_add_f32_e32 v7, 1.0, v7
+; GFX8-IEEE-NEXT: v_add_f32_e32 v6, 1.0, v6
+; GFX8-IEEE-NEXT: v_mul_f32_e32 v7, v7, v5
+; GFX8-IEEE-NEXT: v_mul_f32_e32 v6, v6, v4
+; GFX8-IEEE-NEXT: v_add_f32_e32 v7, v7, v5
+; GFX8-IEEE-NEXT: v_add_f32_e32 v6, v6, v4
+; GFX8-IEEE-NEXT: v_mul_f32_e64 v3, -v3, v7
+; GFX8-IEEE-NEXT: v_mul_f32_e64 v2, -v2, v6
+; GFX8-IEEE-NEXT: v_add_f32_e32 v3, 1.0, v3
+; GFX8-IEEE-NEXT: v_add_f32_e32 v2, 1.0, v2
+; GFX8-IEEE-NEXT: v_mul_f32_e32 v3, v3, v5
+; GFX8-IEEE-NEXT: v_mul_f32_e32 v2, v2, v4
; GFX8-IEEE-NEXT: v_and_b32_e32 v3, 0xff800000, v3
; GFX8-IEEE-NEXT: v_and_b32_e32 v2, 0xff800000, v2
-; GFX8-IEEE-NEXT: v_add_f32_e32 v3, v3, v8
-; GFX8-IEEE-NEXT: v_add_f32_e32 v2, v2, v7
+; GFX8-IEEE-NEXT: v_add_f32_e32 v3, v3, v7
+; GFX8-IEEE-NEXT: v_add_f32_e32 v2, v2, v6
; GFX8-IEEE-NEXT: v_cvt_f16_f32_e32 v3, v3
; GFX8-IEEE-NEXT: v_cvt_f16_f32_e32 v2, v2
; GFX8-IEEE-NEXT: v_div_fixup_f16 v0, v3, v0, 1.0
@@ -5486,25 +5307,22 @@ define <2 x half> @v_rsq_v2f16(<2 x half> %a) {
; GFX8-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-FLUSH-NEXT: v_sqrt_f16_e32 v1, v0
; GFX8-FLUSH-NEXT: v_sqrt_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX8-FLUSH-NEXT: v_cvt_f32_f16_e32 v4, 1.0
; GFX8-FLUSH-NEXT: v_cvt_f32_f16_e32 v2, v1
; GFX8-FLUSH-NEXT: v_cvt_f32_f16_e32 v3, v0
-; GFX8-FLUSH-NEXT: v_rcp_f32_e32 v5, v2
-; GFX8-FLUSH-NEXT: v_rcp_f32_e32 v6, v3
-; GFX8-FLUSH-NEXT: v_mul_f32_e32 v7, v4, v5
-; GFX8-FLUSH-NEXT: v_mul_f32_e32 v8, v4, v6
-; GFX8-FLUSH-NEXT: v_mad_f32 v10, -v3, v8, v4
-; GFX8-FLUSH-NEXT: v_mad_f32 v9, -v2, v7, v4
-; GFX8-FLUSH-NEXT: v_mac_f32_e32 v8, v10, v6
-; GFX8-FLUSH-NEXT: v_mac_f32_e32 v7, v9, v5
-; GFX8-FLUSH-NEXT: v_mad_f32 v3, -v3, v8, v4
-; GFX8-FLUSH-NEXT: v_mad_f32 v2, -v2, v7, v4
-; GFX8-FLUSH-NEXT: v_mul_f32_e32 v3, v3, v6
-; GFX8-FLUSH-NEXT: v_mul_f32_e32 v2, v2, v5
+; GFX8-FLUSH-NEXT: v_rcp_f32_e32 v4, v2
+; GFX8-FLUSH-NEXT: v_rcp_f32_e32 v5, v3
+; GFX8-FLUSH-NEXT: v_mad_f32 v6, -v2, v4, 1.0
+; GFX8-FLUSH-NEXT: v_mad_f32 v7, -v3, v5, 1.0
+; GFX8-FLUSH-NEXT: v_mad_f32 v7, v7, v5, v5
+; GFX8-FLUSH-NEXT: v_mad_f32 v6, v6, v4, v4
+; GFX8-FLUSH-NEXT: v_mad_f32 v3, -v3, v7, 1.0
+; GFX8-FLUSH-NEXT: v_mad_f32 v2, -v2, v6, 1.0
+; GFX8-FLUSH-NEXT: v_mul_f32_e32 v3, v3, v5
+; GFX8-FLUSH-NEXT: v_mul_f32_e32 v2, v2, v4
; GFX8-FLUSH-NEXT: v_and_b32_e32 v3, 0xff800000, v3
; GFX8-FLUSH-NEXT: v_and_b32_e32 v2, 0xff800000, v2
-; GFX8-FLUSH-NEXT: v_add_f32_e32 v3, v3, v8
-; GFX8-FLUSH-NEXT: v_add_f32_e32 v2, v2, v7
+; GFX8-FLUSH-NEXT: v_add_f32_e32 v3, v3, v7
+; GFX8-FLUSH-NEXT: v_add_f32_e32 v2, v2, v6
; GFX8-FLUSH-NEXT: v_cvt_f16_f32_e32 v3, v3
; GFX8-FLUSH-NEXT: v_cvt_f16_f32_e32 v2, v2
; GFX8-FLUSH-NEXT: v_div_fixup_f16 v0, v3, v0, 1.0
@@ -5518,25 +5336,22 @@ define <2 x half> @v_rsq_v2f16(<2 x half> %a) {
; GFX9-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-IEEE-NEXT: v_sqrt_f16_e32 v1, v0
; GFX9-IEEE-NEXT: v_sqrt_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v4, 1.0
; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v2, v1
; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v3, v0
-; GFX9-IEEE-NEXT: v_rcp_f32_e32 v5, v2
-; GFX9-IEEE-NEXT: v_rcp_f32_e32 v6, v3
-; GFX9-IEEE-NEXT: v_mul_f32_e32 v7, v4, v5
-; GFX9-IEEE-NEXT: v_mul_f32_e32 v8, v4, v6
-; GFX9-IEEE-NEXT: v_fma_f32 v9, -v2, v7, v4
-; GFX9-IEEE-NEXT: v_fma_f32 v10, -v3, v8, v4
-; GFX9-IEEE-NEXT: v_fma_f32 v7, v9, v5, v7
-; GFX9-IEEE-NEXT: v_fma_f32 v8, v10, v6, v8
-; GFX9-IEEE-NEXT: v_fma_f32 v2, -v2, v7, v4
-; GFX9-IEEE-NEXT: v_fma_f32 v3, -v3, v8, v4
-; GFX9-IEEE-NEXT: v_mul_f32_e32 v2, v2, v5
-; GFX9-IEEE-NEXT: v_mul_f32_e32 v3, v3, v6
+; GFX9-IEEE-NEXT: v_rcp_f32_e32 v4, v2
+; GFX9-IEEE-NEXT: v_rcp_f32_e32 v5, v3
+; GFX9-IEEE-NEXT: v_fma_f32 v6, -v2, v4, 1.0
+; GFX9-IEEE-NEXT: v_fma_f32 v7, -v3, v5, 1.0
+; GFX9-IEEE-NEXT: v_fma_f32 v6, v6, v4, v4
+; GFX9-IEEE-NEXT: v_fma_f32 v7, v7, v5, v5
+; GFX9-IEEE-NEXT: v_fma_f32 v2, -v2, v6, 1.0
+; GFX9-IEEE-NEXT: v_fma_f32 v3, -v3, v7, 1.0
+; GFX9-IEEE-NEXT: v_mul_f32_e32 v2, v2, v4
+; GFX9-IEEE-NEXT: v_mul_f32_e32 v3, v3, v5
; GFX9-IEEE-NEXT: v_and_b32_e32 v2, 0xff800000, v2
; GFX9-IEEE-NEXT: v_and_b32_e32 v3, 0xff800000, v3
-; GFX9-IEEE-NEXT: v_add_f32_e32 v2, v2, v7
-; GFX9-IEEE-NEXT: v_add_f32_e32 v3, v3, v8
+; GFX9-IEEE-NEXT: v_add_f32_e32 v2, v2, v6
+; GFX9-IEEE-NEXT: v_add_f32_e32 v3, v3, v7
; GFX9-IEEE-NEXT: v_cvt_f16_f32_e32 v2, v2
; GFX9-IEEE-NEXT: v_cvt_f16_f32_e32 v3, v3
; GFX9-IEEE-NEXT: v_div_fixup_f16 v1, v2, v1, 1.0
@@ -5549,25 +5364,23 @@ define <2 x half> @v_rsq_v2f16(<2 x half> %a) {
; GFX9-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-FLUSH-NEXT: v_sqrt_f16_e32 v1, v0
; GFX9-FLUSH-NEXT: v_sqrt_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v4, 1.0
+; GFX9-FLUSH-NEXT: v_mov_b32_e32 v4, 1.0
; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v2, v1
; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v3, v0
; GFX9-FLUSH-NEXT: v_rcp_f32_e32 v2, v2
; GFX9-FLUSH-NEXT: v_rcp_f32_e32 v3, v3
-; GFX9-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v2
-; GFX9-FLUSH-NEXT: v_mul_f32_e32 v4, v4, v3
-; GFX9-FLUSH-NEXT: v_mad_mix_f32 v6, -v1, v5, 1.0 op_sel_hi:[1,0,1]
-; GFX9-FLUSH-NEXT: v_mad_mix_f32 v7, -v0, v4, 1.0 op_sel_hi:[1,0,1]
-; GFX9-FLUSH-NEXT: v_mac_f32_e32 v5, v6, v2
-; GFX9-FLUSH-NEXT: v_mac_f32_e32 v4, v7, v3
-; GFX9-FLUSH-NEXT: v_mad_mix_f32 v6, -v1, v5, 1.0 op_sel_hi:[1,0,1]
-; GFX9-FLUSH-NEXT: v_mad_mix_f32 v7, -v0, v4, 1.0 op_sel_hi:[1,0,1]
-; GFX9-FLUSH-NEXT: v_mul_f32_e32 v2, v6, v2
-; GFX9-FLUSH-NEXT: v_mul_f32_e32 v3, v7, v3
+; GFX9-FLUSH-NEXT: v_mad_mix_f32 v5, -v1, v2, v4 op_sel_hi:[1,0,0]
+; GFX9-FLUSH-NEXT: v_mad_mix_f32 v6, -v0, v3, v4 op_sel_hi:[1,0,0]
+; GFX9-FLUSH-NEXT: v_mad_f32 v5, v5, v2, v2
+; GFX9-FLUSH-NEXT: v_mad_f32 v6, v6, v3, v3
+; GFX9-FLUSH-NEXT: v_mad_mix_f32 v7, -v1, v5, v4 op_sel_hi:[1,0,0]
+; GFX9-FLUSH-NEXT: v_mad_mix_f32 v4, -v0, v6, v4 op_sel_hi:[1,0,0]
+; GFX9-FLUSH-NEXT: v_mul_f32_e32 v2, v7, v2
+; GFX9-FLUSH-NEXT: v_mul_f32_e32 v3, v4, v3
; GFX9-FLUSH-NEXT: v_and_b32_e32 v2, 0xff800000, v2
; GFX9-FLUSH-NEXT: v_and_b32_e32 v3, 0xff800000, v3
; GFX9-FLUSH-NEXT: v_add_f32_e32 v2, v2, v5
-; GFX9-FLUSH-NEXT: v_add_f32_e32 v3, v3, v4
+; GFX9-FLUSH-NEXT: v_add_f32_e32 v3, v3, v6
; GFX9-FLUSH-NEXT: v_cvt_f16_f32_e32 v2, v2
; GFX9-FLUSH-NEXT: v_cvt_f16_f32_e32 v3, v3
; GFX9-FLUSH-NEXT: v_div_fixup_f16 v1, v2, v1, 1.0
@@ -5580,25 +5393,23 @@ define <2 x half> @v_rsq_v2f16(<2 x half> %a) {
; GFX10-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-IEEE-NEXT: v_sqrt_f16_e32 v1, v0
; GFX10-IEEE-NEXT: v_sqrt_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX10-IEEE-NEXT: v_cvt_f32_f16_e32 v4, 1.0
+; GFX10-IEEE-NEXT: v_mov_b32_e32 v4, 1.0
; GFX10-IEEE-NEXT: v_cvt_f32_f16_e32 v2, v1
; GFX10-IEEE-NEXT: v_cvt_f32_f16_e32 v3, v0
; GFX10-IEEE-NEXT: v_rcp_f32_e32 v2, v2
; GFX10-IEEE-NEXT: v_rcp_f32_e32 v3, v3
-; GFX10-IEEE-NEXT: v_mul_f32_e32 v5, v4, v2
-; GFX10-IEEE-NEXT: v_mul_f32_e32 v4, v4, v3
-; GFX10-IEEE-NEXT: v_fma_mix_f32 v6, -v1, v5, 1.0 op_sel_hi:[1,0,1]
-; GFX10-IEEE-NEXT: v_fma_mix_f32 v7, -v0, v4, 1.0 op_sel_hi:[1,0,1]
-; GFX10-IEEE-NEXT: v_fmac_f32_e32 v5, v6, v2
-; GFX10-IEEE-NEXT: v_fmac_f32_e32 v4, v7, v3
-; GFX10-IEEE-NEXT: v_fma_mix_f32 v6, -v1, v5, 1.0 op_sel_hi:[1,0,1]
-; GFX10-IEEE-NEXT: v_fma_mix_f32 v7, -v0, v4, 1.0 op_sel_hi:[1,0,1]
-; GFX10-IEEE-NEXT: v_mul_f32_e32 v2, v6, v2
-; GFX10-IEEE-NEXT: v_mul_f32_e32 v3, v7, v3
+; GFX10-IEEE-NEXT: v_fma_mix_f32 v5, -v1, v2, v4 op_sel_hi:[1,0,0]
+; GFX10-IEEE-NEXT: v_fma_mix_f32 v6, -v0, v3, v4 op_sel_hi:[1,0,0]
+; GFX10-IEEE-NEXT: v_fma_f32 v5, v5, v2, v2
+; GFX10-IEEE-NEXT: v_fma_f32 v6, v6, v3, v3
+; GFX10-IEEE-NEXT: v_fma_mix_f32 v7, -v1, v5, v4 op_sel_hi:[1,0,0]
+; GFX10-IEEE-NEXT: v_fma_mix_f32 v4, -v0, v6, v4 op_sel_hi:[1,0,0]
+; GFX10-IEEE-NEXT: v_mul_f32_e32 v2, v7, v2
+; GFX10-IEEE-NEXT: v_mul_f32_e32 v3, v4, v3
; GFX10-IEEE-NEXT: v_and_b32_e32 v2, 0xff800000, v2
; GFX10-IEEE-NEXT: v_and_b32_e32 v3, 0xff800000, v3
; GFX10-IEEE-NEXT: v_add_f32_e32 v2, v2, v5
-; GFX10-IEEE-NEXT: v_add_f32_e32 v3, v3, v4
+; GFX10-IEEE-NEXT: v_add_f32_e32 v3, v3, v6
; GFX10-IEEE-NEXT: v_cvt_f16_f32_e32 v2, v2
; GFX10-IEEE-NEXT: v_cvt_f16_f32_e32 v3, v3
; GFX10-IEEE-NEXT: v_div_fixup_f16 v1, v2, v1, 1.0
@@ -5611,25 +5422,22 @@ define <2 x half> @v_rsq_v2f16(<2 x half> %a) {
; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-FLUSH-NEXT: v_sqrt_f16_e32 v1, v0
; GFX10-FLUSH-NEXT: v_sqrt_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX10-FLUSH-NEXT: v_cvt_f32_f16_e32 v6, 1.0
; GFX10-FLUSH-NEXT: v_cvt_f32_f16_e32 v2, v1
; GFX10-FLUSH-NEXT: v_cvt_f32_f16_e32 v3, v0
; GFX10-FLUSH-NEXT: v_rcp_f32_e32 v4, v2
; GFX10-FLUSH-NEXT: v_rcp_f32_e32 v5, v3
-; GFX10-FLUSH-NEXT: v_mul_f32_e32 v7, v6, v4
-; GFX10-FLUSH-NEXT: v_mul_f32_e32 v8, v6, v5
-; GFX10-FLUSH-NEXT: v_mad_f32 v9, -v2, v7, v6
-; GFX10-FLUSH-NEXT: v_mad_f32 v10, -v3, v8, v6
-; GFX10-FLUSH-NEXT: v_mac_f32_e32 v7, v9, v4
-; GFX10-FLUSH-NEXT: v_mac_f32_e32 v8, v10, v5
-; GFX10-FLUSH-NEXT: v_mad_f32 v2, -v2, v7, v6
-; GFX10-FLUSH-NEXT: v_mad_f32 v3, -v3, v8, v6
+; GFX10-FLUSH-NEXT: v_mad_f32 v6, -v2, v4, 1.0
+; GFX10-FLUSH-NEXT: v_mad_f32 v7, -v3, v5, 1.0
+; GFX10-FLUSH-NEXT: v_mad_f32 v6, v6, v4, v4
+; GFX10-FLUSH-NEXT: v_mad_f32 v7, v7, v5, v5
+; GFX10-FLUSH-NEXT: v_mad_f32 v2, -v2, v6, 1.0
+; GFX10-FLUSH-NEXT: v_mad_f32 v3, -v3, v7, 1.0
; GFX10-FLUSH-NEXT: v_mul_f32_e32 v2, v2, v4
; GFX10-FLUSH-NEXT: v_mul_f32_e32 v3, v3, v5
; GFX10-FLUSH-NEXT: v_and_b32_e32 v2, 0xff800000, v2
; GFX10-FLUSH-NEXT: v_and_b32_e32 v3, 0xff800000, v3
-; GFX10-FLUSH-NEXT: v_add_f32_e32 v2, v2, v7
-; GFX10-FLUSH-NEXT: v_add_f32_e32 v3, v3, v8
+; GFX10-FLUSH-NEXT: v_add_f32_e32 v2, v2, v6
+; GFX10-FLUSH-NEXT: v_add_f32_e32 v3, v3, v7
; GFX10-FLUSH-NEXT: v_cvt_f16_f32_e32 v2, v2
; GFX10-FLUSH-NEXT: v_cvt_f16_f32_e32 v3, v3
; GFX10-FLUSH-NEXT: v_div_fixup_f16 v1, v2, v1, 1.0
@@ -5642,7 +5450,7 @@ define <2 x half> @v_rsq_v2f16(<2 x half> %a) {
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GFX11-NEXT: v_sqrt_f16_e32 v0, v0
-; GFX11-NEXT: v_cvt_f32_f16_e32 v4, 1.0
+; GFX11-NEXT: v_mov_b32_e32 v4, 1.0
; GFX11-NEXT: v_sqrt_f16_e32 v1, v1
; GFX11-NEXT: s_waitcnt_depctr 0xfff
; GFX11-NEXT: v_cvt_f32_f16_e32 v2, v0
@@ -5650,22 +5458,20 @@ define <2 x half> @v_rsq_v2f16(<2 x half> %a) {
; GFX11-NEXT: v_rcp_f32_e32 v2, v2
; GFX11-NEXT: v_rcp_f32_e32 v3, v3
; GFX11-NEXT: s_waitcnt_depctr 0xfff
-; GFX11-NEXT: v_mul_f32_e32 v5, v4, v2
-; GFX11-NEXT: v_mul_f32_e32 v4, v4, v3
-; GFX11-NEXT: v_fma_mix_f32 v7, -v1, v4, 1.0 op_sel_hi:[1,0,1]
-; GFX11-NEXT: v_fmac_f32_e32 v4, v7, v3
-; GFX11-NEXT: v_fma_mix_f32 v6, -v0, v5, 1.0 op_sel_hi:[1,0,1]
-; GFX11-NEXT: v_fma_mix_f32 v7, -v1, v4, 1.0 op_sel_hi:[1,0,1]
-; GFX11-NEXT: v_fmac_f32_e32 v5, v6, v2
-; GFX11-NEXT: v_mul_f32_e32 v3, v7, v3
-; GFX11-NEXT: v_fma_mix_f32 v6, -v0, v5, 1.0 op_sel_hi:[1,0,1]
-; GFX11-NEXT: v_dual_mul_f32 v2, v6, v2 :: v_dual_and_b32 v3, 0xff800000, v3
-; GFX11-NEXT: v_dual_add_f32 v3, v3, v4 :: v_dual_and_b32 v2, 0xff800000, v2
-; GFX11-NEXT: v_cvt_f16_f32_e32 v3, v3
-; GFX11-NEXT: v_add_f32_e32 v2, v2, v5
-; GFX11-NEXT: v_div_fixup_f16 v1, v3, v1, 1.0
+; GFX11-NEXT: v_fma_mix_f32 v5, -v0, v2, v4 op_sel_hi:[1,0,0]
+; GFX11-NEXT: v_fma_mix_f32 v6, -v1, v3, v4 op_sel_hi:[1,0,0]
+; GFX11-NEXT: v_fma_f32 v5, v5, v2, v2
+; GFX11-NEXT: v_fma_f32 v6, v6, v3, v3
+; GFX11-NEXT: v_fma_mix_f32 v7, -v0, v5, v4 op_sel_hi:[1,0,0]
+; GFX11-NEXT: v_fma_mix_f32 v4, -v1, v6, v4 op_sel_hi:[1,0,0]
+; GFX11-NEXT: v_dual_mul_f32 v2, v7, v2 :: v_dual_mul_f32 v3, v4, v3
+; GFX11-NEXT: v_and_b32_e32 v2, 0xff800000, v2
+; GFX11-NEXT: v_dual_add_f32 v2, v2, v5 :: v_dual_and_b32 v3, 0xff800000, v3
+; GFX11-NEXT: v_add_f32_e32 v3, v3, v6
; GFX11-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GFX11-NEXT: v_cvt_f16_f32_e32 v3, v3
; GFX11-NEXT: v_div_fixup_f16 v0, v2, v0, 1.0
+; GFX11-NEXT: v_div_fixup_f16 v1, v3, v1, 1.0
; GFX11-NEXT: v_pack_b32_f16 v0, v0, v1
; GFX11-NEXT: s_setpc_b64 s[30:31]
%sqrt = call contract <2 x half> @llvm.sqrt.v2f16(<2 x half> %a)
@@ -5679,36 +5485,35 @@ define <2 x half> @v_neg_rsq_v2f16(<2 x half> %a) {
; GFX6-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, v1
-; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v2, -1.0
; GFX6-IEEE-NEXT: v_sqrt_f32_e32 v0, v0
; GFX6-IEEE-NEXT: v_sqrt_f32_e32 v1, v1
; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, v1
-; GFX6-IEEE-NEXT: v_div_scale_f32 v3, s[4:5], v0, v0, v2
-; GFX6-IEEE-NEXT: v_rcp_f32_e32 v6, v3
-; GFX6-IEEE-NEXT: v_div_scale_f32 v4, vcc, v2, v0, v2
-; GFX6-IEEE-NEXT: v_div_scale_f32 v5, s[4:5], v1, v1, v2
-; GFX6-IEEE-NEXT: v_fma_f32 v9, -v3, v6, 1.0
-; GFX6-IEEE-NEXT: v_fma_f32 v6, v9, v6, v6
-; GFX6-IEEE-NEXT: v_mul_f32_e32 v9, v4, v6
-; GFX6-IEEE-NEXT: v_rcp_f32_e32 v8, v5
-; GFX6-IEEE-NEXT: v_fma_f32 v10, -v3, v9, v4
-; GFX6-IEEE-NEXT: v_fma_f32 v9, v10, v6, v9
-; GFX6-IEEE-NEXT: v_fma_f32 v3, -v3, v9, v4
-; GFX6-IEEE-NEXT: v_div_fmas_f32 v3, v3, v6, v9
-; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v3, v0, v2
-; GFX6-IEEE-NEXT: v_fma_f32 v3, -v5, v8, 1.0
-; GFX6-IEEE-NEXT: v_div_scale_f32 v7, s[4:5], v2, v1, v2
-; GFX6-IEEE-NEXT: v_fma_f32 v3, v3, v8, v8
-; GFX6-IEEE-NEXT: v_mul_f32_e32 v4, v7, v3
-; GFX6-IEEE-NEXT: v_fma_f32 v6, -v5, v4, v7
-; GFX6-IEEE-NEXT: v_fma_f32 v4, v6, v3, v4
-; GFX6-IEEE-NEXT: v_fma_f32 v5, -v5, v4, v7
+; GFX6-IEEE-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, -1.0
+; GFX6-IEEE-NEXT: v_rcp_f32_e32 v5, v2
+; GFX6-IEEE-NEXT: v_div_scale_f32 v3, vcc, -1.0, v0, -1.0
+; GFX6-IEEE-NEXT: v_div_scale_f32 v4, s[4:5], v1, v1, -1.0
+; GFX6-IEEE-NEXT: v_fma_f32 v8, -v2, v5, 1.0
+; GFX6-IEEE-NEXT: v_fma_f32 v5, v8, v5, v5
+; GFX6-IEEE-NEXT: v_rcp_f32_e32 v6, v4
+; GFX6-IEEE-NEXT: v_mul_f32_e32 v8, v3, v5
+; GFX6-IEEE-NEXT: v_fma_f32 v10, -v2, v8, v3
+; GFX6-IEEE-NEXT: v_fma_f32 v8, v10, v5, v8
+; GFX6-IEEE-NEXT: v_fma_f32 v2, -v2, v8, v3
+; GFX6-IEEE-NEXT: v_fma_f32 v9, -v4, v6, 1.0
+; GFX6-IEEE-NEXT: v_div_fmas_f32 v2, v2, v5, v8
+; GFX6-IEEE-NEXT: v_div_scale_f32 v7, s[4:5], -1.0, v1, -1.0
+; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v2, v0, -1.0
+; GFX6-IEEE-NEXT: v_fma_f32 v2, v9, v6, v6
+; GFX6-IEEE-NEXT: v_mul_f32_e32 v3, v7, v2
+; GFX6-IEEE-NEXT: v_fma_f32 v5, -v4, v3, v7
+; GFX6-IEEE-NEXT: v_fma_f32 v3, v5, v2, v3
+; GFX6-IEEE-NEXT: v_fma_f32 v4, -v4, v3, v7
; GFX6-IEEE-NEXT: s_mov_b64 vcc, s[4:5]
-; GFX6-IEEE-NEXT: v_div_fmas_f32 v3, v5, v3, v4
-; GFX6-IEEE-NEXT: v_div_fixup_f32 v1, v3, v1, v2
+; GFX6-IEEE-NEXT: v_div_fmas_f32 v2, v4, v2, v3
+; GFX6-IEEE-NEXT: v_div_fixup_f32 v1, v2, v1, -1.0
; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX6-IEEE-NEXT: s_setpc_b64 s[30:31]
@@ -5718,42 +5523,40 @@ define <2 x half> @v_neg_rsq_v2f16(<2 x half> %a) {
; GFX6-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, v1
-; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v2, -1.0
; GFX6-FLUSH-NEXT: v_sqrt_f32_e32 v0, v0
; GFX6-FLUSH-NEXT: v_sqrt_f32_e32 v1, v1
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX6-FLUSH-NEXT: v_div_scale_f32 v3, s[4:5], v0, v0, v2
-; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v4, v3
-; GFX6-FLUSH-NEXT: v_div_scale_f32 v5, vcc, v2, v0, v2
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, -1.0
+; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v3, v2
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v4, vcc, -1.0, v0, -1.0
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
-; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v3, v4, 1.0
-; GFX6-FLUSH-NEXT: v_fma_f32 v4, v6, v4, v4
-; GFX6-FLUSH-NEXT: v_mul_f32_e32 v6, v5, v4
-; GFX6-FLUSH-NEXT: v_fma_f32 v7, -v3, v6, v5
-; GFX6-FLUSH-NEXT: v_fma_f32 v6, v7, v4, v6
-; GFX6-FLUSH-NEXT: v_fma_f32 v3, -v3, v6, v5
+; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v2, v3, 1.0
+; GFX6-FLUSH-NEXT: v_fma_f32 v3, v5, v3, v3
+; GFX6-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v3
+; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v5, v4
+; GFX6-FLUSH-NEXT: v_fma_f32 v5, v6, v3, v5
+; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v5, v4
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
-; GFX6-FLUSH-NEXT: v_div_fmas_f32 v3, v3, v4, v6
-; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v4, -1.0
; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, v1
-; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v3, v0, v2
+; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v5
+; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v2, v0, -1.0
; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, v4
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, -1.0
; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v3, v2
-; GFX6-FLUSH-NEXT: v_div_scale_f32 v5, vcc, v4, v1, v4
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v4, vcc, -1.0, v1, -1.0
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
-; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v3, 1.0
-; GFX6-FLUSH-NEXT: v_fma_f32 v3, v6, v3, v3
-; GFX6-FLUSH-NEXT: v_mul_f32_e32 v6, v5, v3
-; GFX6-FLUSH-NEXT: v_fma_f32 v7, -v2, v6, v5
-; GFX6-FLUSH-NEXT: v_fma_f32 v6, v7, v3, v6
-; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v6, v5
+; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v2, v3, 1.0
+; GFX6-FLUSH-NEXT: v_fma_f32 v3, v5, v3, v3
+; GFX6-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v3
+; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v5, v4
+; GFX6-FLUSH-NEXT: v_fma_f32 v5, v6, v3, v5
+; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v5, v4
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
-; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v6
-; GFX6-FLUSH-NEXT: v_div_fixup_f32 v1, v2, v1, v4
+; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v5
+; GFX6-FLUSH-NEXT: v_div_fixup_f32 v1, v2, v1, -1.0
; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX6-FLUSH-NEXT: s_setpc_b64 s[30:31]
;
@@ -5762,31 +5565,28 @@ define <2 x half> @v_neg_rsq_v2f16(<2 x half> %a) {
; GFX8-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-IEEE-NEXT: v_sqrt_f16_e32 v1, v0
; GFX8-IEEE-NEXT: v_sqrt_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX8-IEEE-NEXT: v_cvt_f32_f16_e32 v4, -1.0
; GFX8-IEEE-NEXT: v_cvt_f32_f16_e32 v2, v1
; GFX8-IEEE-NEXT: v_cvt_f32_f16_e32 v3, v0
-; GFX8-IEEE-NEXT: v_rcp_f32_e32 v5, v2
-; GFX8-IEEE-NEXT: v_rcp_f32_e32 v6, v3
-; GFX8-IEEE-NEXT: v_mul_f32_e32 v7, v4, v5
-; GFX8-IEEE-NEXT: v_mul_f32_e32 v8, v4, v6
-; GFX8-IEEE-NEXT: v_mul_f32_e64 v10, -v3, v8
-; GFX8-IEEE-NEXT: v_mul_f32_e64 v9, -v2, v7
-; GFX8-IEEE-NEXT: v_add_f32_e32 v10, v10, v4
-; GFX8-IEEE-NEXT: v_add_f32_e32 v9, v9, v4
-; GFX8-IEEE-NEXT: v_mul_f32_e32 v10, v10, v6
-; GFX8-IEEE-NEXT: v_mul_f32_e32 v9, v9, v5
-; GFX8-IEEE-NEXT: v_add_f32_e32 v8, v10, v8
-; GFX8-IEEE-NEXT: v_add_f32_e32 v7, v9, v7
-; GFX8-IEEE-NEXT: v_mul_f32_e64 v3, -v3, v8
-; GFX8-IEEE-NEXT: v_mul_f32_e64 v2, -v2, v7
-; GFX8-IEEE-NEXT: v_add_f32_e32 v3, v3, v4
-; GFX8-IEEE-NEXT: v_add_f32_e32 v2, v2, v4
-; GFX8-IEEE-NEXT: v_mul_f32_e32 v3, v3, v6
-; GFX8-IEEE-NEXT: v_mul_f32_e32 v2, v2, v5
+; GFX8-IEEE-NEXT: v_rcp_f32_e32 v4, v2
+; GFX8-IEEE-NEXT: v_rcp_f32_e32 v5, v3
+; GFX8-IEEE-NEXT: v_mul_f32_e32 v6, v2, v4
+; GFX8-IEEE-NEXT: v_mul_f32_e32 v7, v3, v5
+; GFX8-IEEE-NEXT: v_add_f32_e32 v7, -1.0, v7
+; GFX8-IEEE-NEXT: v_add_f32_e32 v6, -1.0, v6
+; GFX8-IEEE-NEXT: v_mul_f32_e32 v7, v7, v5
+; GFX8-IEEE-NEXT: v_mul_f32_e32 v6, v6, v4
+; GFX8-IEEE-NEXT: v_sub_f32_e32 v7, v7, v5
+; GFX8-IEEE-NEXT: v_sub_f32_e32 v6, v6, v4
+; GFX8-IEEE-NEXT: v_mul_f32_e64 v3, -v3, v7
+; GFX8-IEEE-NEXT: v_mul_f32_e64 v2, -v2, v6
+; GFX8-IEEE-NEXT: v_add_f32_e32 v3, -1.0, v3
+; GFX8-IEEE-NEXT: v_add_f32_e32 v2, -1.0, v2
+; GFX8-IEEE-NEXT: v_mul_f32_e32 v3, v3, v5
+; GFX8-IEEE-NEXT: v_mul_f32_e32 v2, v2, v4
; GFX8-IEEE-NEXT: v_and_b32_e32 v3, 0xff800000, v3
; GFX8-IEEE-NEXT: v_and_b32_e32 v2, 0xff800000, v2
-; GFX8-IEEE-NEXT: v_add_f32_e32 v3, v3, v8
-; GFX8-IEEE-NEXT: v_add_f32_e32 v2, v2, v7
+; GFX8-IEEE-NEXT: v_add_f32_e32 v3, v3, v7
+; GFX8-IEEE-NEXT: v_add_f32_e32 v2, v2, v6
; GFX8-IEEE-NEXT: v_cvt_f16_f32_e32 v3, v3
; GFX8-IEEE-NEXT: v_cvt_f16_f32_e32 v2, v2
; GFX8-IEEE-NEXT: v_div_fixup_f16 v0, v3, v0, -1.0
@@ -5800,25 +5600,22 @@ define <2 x half> @v_neg_rsq_v2f16(<2 x half> %a) {
; GFX8-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-FLUSH-NEXT: v_sqrt_f16_e32 v1, v0
; GFX8-FLUSH-NEXT: v_sqrt_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX8-FLUSH-NEXT: v_cvt_f32_f16_e32 v4, -1.0
; GFX8-FLUSH-NEXT: v_cvt_f32_f16_e32 v2, v1
; GFX8-FLUSH-NEXT: v_cvt_f32_f16_e32 v3, v0
-; GFX8-FLUSH-NEXT: v_rcp_f32_e32 v5, v2
-; GFX8-FLUSH-NEXT: v_rcp_f32_e32 v6, v3
-; GFX8-FLUSH-NEXT: v_mul_f32_e32 v7, v4, v5
-; GFX8-FLUSH-NEXT: v_mul_f32_e32 v8, v4, v6
-; GFX8-FLUSH-NEXT: v_mad_f32 v10, -v3, v8, v4
-; GFX8-FLUSH-NEXT: v_mad_f32 v9, -v2, v7, v4
-; GFX8-FLUSH-NEXT: v_mac_f32_e32 v8, v10, v6
-; GFX8-FLUSH-NEXT: v_mac_f32_e32 v7, v9, v5
-; GFX8-FLUSH-NEXT: v_mad_f32 v3, -v3, v8, v4
-; GFX8-FLUSH-NEXT: v_mad_f32 v2, -v2, v7, v4
-; GFX8-FLUSH-NEXT: v_mul_f32_e32 v3, v3, v6
-; GFX8-FLUSH-NEXT: v_mul_f32_e32 v2, v2, v5
+; GFX8-FLUSH-NEXT: v_rcp_f32_e32 v4, v2
+; GFX8-FLUSH-NEXT: v_rcp_f32_e32 v5, v3
+; GFX8-FLUSH-NEXT: v_mad_f32 v6, v2, v4, -1.0
+; GFX8-FLUSH-NEXT: v_mad_f32 v7, v3, v5, -1.0
+; GFX8-FLUSH-NEXT: v_mad_f32 v7, v7, v5, -v5
+; GFX8-FLUSH-NEXT: v_mad_f32 v6, v6, v4, -v4
+; GFX8-FLUSH-NEXT: v_mad_f32 v3, -v3, v7, -1.0
+; GFX8-FLUSH-NEXT: v_mad_f32 v2, -v2, v6, -1.0
+; GFX8-FLUSH-NEXT: v_mul_f32_e32 v3, v3, v5
+; GFX8-FLUSH-NEXT: v_mul_f32_e32 v2, v2, v4
; GFX8-FLUSH-NEXT: v_and_b32_e32 v3, 0xff800000, v3
; GFX8-FLUSH-NEXT: v_and_b32_e32 v2, 0xff800000, v2
-; GFX8-FLUSH-NEXT: v_add_f32_e32 v3, v3, v8
-; GFX8-FLUSH-NEXT: v_add_f32_e32 v2, v2, v7
+; GFX8-FLUSH-NEXT: v_add_f32_e32 v3, v3, v7
+; GFX8-FLUSH-NEXT: v_add_f32_e32 v2, v2, v6
; GFX8-FLUSH-NEXT: v_cvt_f16_f32_e32 v3, v3
; GFX8-FLUSH-NEXT: v_cvt_f16_f32_e32 v2, v2
; GFX8-FLUSH-NEXT: v_div_fixup_f16 v0, v3, v0, -1.0
@@ -5832,25 +5629,22 @@ define <2 x half> @v_neg_rsq_v2f16(<2 x half> %a) {
; GFX9-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-IEEE-NEXT: v_sqrt_f16_e32 v1, v0
; GFX9-IEEE-NEXT: v_sqrt_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v4, -1.0
; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v2, v1
; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v3, v0
-; GFX9-IEEE-NEXT: v_rcp_f32_e32 v5, v2
-; GFX9-IEEE-NEXT: v_rcp_f32_e32 v6, v3
-; GFX9-IEEE-NEXT: v_mul_f32_e32 v7, v4, v5
-; GFX9-IEEE-NEXT: v_mul_f32_e32 v8, v4, v6
-; GFX9-IEEE-NEXT: v_fma_f32 v9, -v2, v7, v4
-; GFX9-IEEE-NEXT: v_fma_f32 v10, -v3, v8, v4
-; GFX9-IEEE-NEXT: v_fma_f32 v7, v9, v5, v7
-; GFX9-IEEE-NEXT: v_fma_f32 v8, v10, v6, v8
-; GFX9-IEEE-NEXT: v_fma_f32 v2, -v2, v7, v4
-; GFX9-IEEE-NEXT: v_fma_f32 v3, -v3, v8, v4
-; GFX9-IEEE-NEXT: v_mul_f32_e32 v2, v2, v5
-; GFX9-IEEE-NEXT: v_mul_f32_e32 v3, v3, v6
+; GFX9-IEEE-NEXT: v_rcp_f32_e32 v4, v2
+; GFX9-IEEE-NEXT: v_rcp_f32_e32 v5, v3
+; GFX9-IEEE-NEXT: v_fma_f32 v6, v2, v4, -1.0
+; GFX9-IEEE-NEXT: v_fma_f32 v7, v3, v5, -1.0
+; GFX9-IEEE-NEXT: v_fma_f32 v6, v6, v4, -v4
+; GFX9-IEEE-NEXT: v_fma_f32 v7, v7, v5, -v5
+; GFX9-IEEE-NEXT: v_fma_f32 v2, -v2, v6, -1.0
+; GFX9-IEEE-NEXT: v_fma_f32 v3, -v3, v7, -1.0
+; GFX9-IEEE-NEXT: v_mul_f32_e32 v2, v2, v4
+; GFX9-IEEE-NEXT: v_mul_f32_e32 v3, v3, v5
; GFX9-IEEE-NEXT: v_and_b32_e32 v2, 0xff800000, v2
; GFX9-IEEE-NEXT: v_and_b32_e32 v3, 0xff800000, v3
-; GFX9-IEEE-NEXT: v_add_f32_e32 v2, v2, v7
-; GFX9-IEEE-NEXT: v_add_f32_e32 v3, v3, v8
+; GFX9-IEEE-NEXT: v_add_f32_e32 v2, v2, v6
+; GFX9-IEEE-NEXT: v_add_f32_e32 v3, v3, v7
; GFX9-IEEE-NEXT: v_cvt_f16_f32_e32 v2, v2
; GFX9-IEEE-NEXT: v_cvt_f16_f32_e32 v3, v3
; GFX9-IEEE-NEXT: v_div_fixup_f16 v1, v2, v1, -1.0
@@ -5863,25 +5657,23 @@ define <2 x half> @v_neg_rsq_v2f16(<2 x half> %a) {
; GFX9-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-FLUSH-NEXT: v_sqrt_f16_e32 v1, v0
; GFX9-FLUSH-NEXT: v_sqrt_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v4, -1.0
+; GFX9-FLUSH-NEXT: v_mov_b32_e32 v4, -1.0
; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v2, v1
; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v3, v0
; GFX9-FLUSH-NEXT: v_rcp_f32_e32 v2, v2
; GFX9-FLUSH-NEXT: v_rcp_f32_e32 v3, v3
-; GFX9-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v2
-; GFX9-FLUSH-NEXT: v_mul_f32_e32 v4, v4, v3
-; GFX9-FLUSH-NEXT: v_mad_mix_f32 v6, -v1, v5, -1.0 op_sel_hi:[1,0,1]
-; GFX9-FLUSH-NEXT: v_mad_mix_f32 v7, -v0, v4, -1.0 op_sel_hi:[1,0,1]
-; GFX9-FLUSH-NEXT: v_mac_f32_e32 v5, v6, v2
-; GFX9-FLUSH-NEXT: v_mac_f32_e32 v4, v7, v3
-; GFX9-FLUSH-NEXT: v_mad_mix_f32 v6, -v1, v5, -1.0 op_sel_hi:[1,0,1]
-; GFX9-FLUSH-NEXT: v_mad_mix_f32 v7, -v0, v4, -1.0 op_sel_hi:[1,0,1]
-; GFX9-FLUSH-NEXT: v_mul_f32_e32 v2, v6, v2
-; GFX9-FLUSH-NEXT: v_mul_f32_e32 v3, v7, v3
+; GFX9-FLUSH-NEXT: v_mad_mix_f32 v5, v1, v2, v4 op_sel_hi:[1,0,0]
+; GFX9-FLUSH-NEXT: v_mad_mix_f32 v6, v0, v3, v4 op_sel_hi:[1,0,0]
+; GFX9-FLUSH-NEXT: v_mad_f32 v5, v5, v2, -v2
+; GFX9-FLUSH-NEXT: v_mad_f32 v6, v6, v3, -v3
+; GFX9-FLUSH-NEXT: v_mad_mix_f32 v7, -v1, v5, v4 op_sel_hi:[1,0,0]
+; GFX9-FLUSH-NEXT: v_mad_mix_f32 v4, -v0, v6, v4 op_sel_hi:[1,0,0]
+; GFX9-FLUSH-NEXT: v_mul_f32_e32 v2, v7, v2
+; GFX9-FLUSH-NEXT: v_mul_f32_e32 v3, v4, v3
; GFX9-FLUSH-NEXT: v_and_b32_e32 v2, 0xff800000, v2
; GFX9-FLUSH-NEXT: v_and_b32_e32 v3, 0xff800000, v3
; GFX9-FLUSH-NEXT: v_add_f32_e32 v2, v2, v5
-; GFX9-FLUSH-NEXT: v_add_f32_e32 v3, v3, v4
+; GFX9-FLUSH-NEXT: v_add_f32_e32 v3, v3, v6
; GFX9-FLUSH-NEXT: v_cvt_f16_f32_e32 v2, v2
; GFX9-FLUSH-NEXT: v_cvt_f16_f32_e32 v3, v3
; GFX9-FLUSH-NEXT: v_div_fixup_f16 v1, v2, v1, -1.0
@@ -5894,25 +5686,23 @@ define <2 x half> @v_neg_rsq_v2f16(<2 x half> %a) {
; GFX10-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-IEEE-NEXT: v_sqrt_f16_e32 v1, v0
; GFX10-IEEE-NEXT: v_sqrt_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX10-IEEE-NEXT: v_cvt_f32_f16_e32 v4, -1.0
+; GFX10-IEEE-NEXT: v_mov_b32_e32 v4, -1.0
; GFX10-IEEE-NEXT: v_cvt_f32_f16_e32 v2, v1
; GFX10-IEEE-NEXT: v_cvt_f32_f16_e32 v3, v0
; GFX10-IEEE-NEXT: v_rcp_f32_e32 v2, v2
; GFX10-IEEE-NEXT: v_rcp_f32_e32 v3, v3
-; GFX10-IEEE-NEXT: v_mul_f32_e32 v5, v4, v2
-; GFX10-IEEE-NEXT: v_mul_f32_e32 v4, v4, v3
-; GFX10-IEEE-NEXT: v_fma_mix_f32 v6, -v1, v5, -1.0 op_sel_hi:[1,0,1]
-; GFX10-IEEE-NEXT: v_fma_mix_f32 v7, -v0, v4, -1.0 op_sel_hi:[1,0,1]
-; GFX10-IEEE-NEXT: v_fmac_f32_e32 v5, v6, v2
-; GFX10-IEEE-NEXT: v_fmac_f32_e32 v4, v7, v3
-; GFX10-IEEE-NEXT: v_fma_mix_f32 v6, -v1, v5, -1.0 op_sel_hi:[1,0,1]
-; GFX10-IEEE-NEXT: v_fma_mix_f32 v7, -v0, v4, -1.0 op_sel_hi:[1,0,1]
-; GFX10-IEEE-NEXT: v_mul_f32_e32 v2, v6, v2
-; GFX10-IEEE-NEXT: v_mul_f32_e32 v3, v7, v3
+; GFX10-IEEE-NEXT: v_fma_mix_f32 v5, v1, v2, v4 op_sel_hi:[1,0,0]
+; GFX10-IEEE-NEXT: v_fma_mix_f32 v6, v0, v3, v4 op_sel_hi:[1,0,0]
+; GFX10-IEEE-NEXT: v_fma_f32 v5, v5, v2, -v2
+; GFX10-IEEE-NEXT: v_fma_f32 v6, v6, v3, -v3
+; GFX10-IEEE-NEXT: v_fma_mix_f32 v7, -v1, v5, v4 op_sel_hi:[1,0,0]
+; GFX10-IEEE-NEXT: v_fma_mix_f32 v4, -v0, v6, v4 op_sel_hi:[1,0,0]
+; GFX10-IEEE-NEXT: v_mul_f32_e32 v2, v7, v2
+; GFX10-IEEE-NEXT: v_mul_f32_e32 v3, v4, v3
; GFX10-IEEE-NEXT: v_and_b32_e32 v2, 0xff800000, v2
; GFX10-IEEE-NEXT: v_and_b32_e32 v3, 0xff800000, v3
; GFX10-IEEE-NEXT: v_add_f32_e32 v2, v2, v5
-; GFX10-IEEE-NEXT: v_add_f32_e32 v3, v3, v4
+; GFX10-IEEE-NEXT: v_add_f32_e32 v3, v3, v6
; GFX10-IEEE-NEXT: v_cvt_f16_f32_e32 v2, v2
; GFX10-IEEE-NEXT: v_cvt_f16_f32_e32 v3, v3
; GFX10-IEEE-NEXT: v_div_fixup_f16 v1, v2, v1, -1.0
@@ -5925,25 +5715,22 @@ define <2 x half> @v_neg_rsq_v2f16(<2 x half> %a) {
; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-FLUSH-NEXT: v_sqrt_f16_e32 v1, v0
; GFX10-FLUSH-NEXT: v_sqrt_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX10-FLUSH-NEXT: v_cvt_f32_f16_e32 v6, -1.0
; GFX10-FLUSH-NEXT: v_cvt_f32_f16_e32 v2, v1
; GFX10-FLUSH-NEXT: v_cvt_f32_f16_e32 v3, v0
; GFX10-FLUSH-NEXT: v_rcp_f32_e32 v4, v2
; GFX10-FLUSH-NEXT: v_rcp_f32_e32 v5, v3
-; GFX10-FLUSH-NEXT: v_mul_f32_e32 v7, v6, v4
-; GFX10-FLUSH-NEXT: v_mul_f32_e32 v8, v6, v5
-; GFX10-FLUSH-NEXT: v_mad_f32 v9, -v2, v7, v6
-; GFX10-FLUSH-NEXT: v_mad_f32 v10, -v3, v8, v6
-; GFX10-FLUSH-NEXT: v_mac_f32_e32 v7, v9, v4
-; GFX10-FLUSH-NEXT: v_mac_f32_e32 v8, v10, v5
-; GFX10-FLUSH-NEXT: v_mad_f32 v2, -v2, v7, v6
-; GFX10-FLUSH-NEXT: v_mad_f32 v3, -v3, v8, v6
+; GFX10-FLUSH-NEXT: v_mad_f32 v6, v2, v4, -1.0
+; GFX10-FLUSH-NEXT: v_mad_f32 v7, v3, v5, -1.0
+; GFX10-FLUSH-NEXT: v_mad_f32 v6, v6, v4, -v4
+; GFX10-FLUSH-NEXT: v_mad_f32 v7, v7, v5, -v5
+; GFX10-FLUSH-NEXT: v_mad_f32 v2, -v2, v6, -1.0
+; GFX10-FLUSH-NEXT: v_mad_f32 v3, -v3, v7, -1.0
; GFX10-FLUSH-NEXT: v_mul_f32_e32 v2, v2, v4
; GFX10-FLUSH-NEXT: v_mul_f32_e32 v3, v3, v5
; GFX10-FLUSH-NEXT: v_and_b32_e32 v2, 0xff800000, v2
; GFX10-FLUSH-NEXT: v_and_b32_e32 v3, 0xff800000, v3
-; GFX10-FLUSH-NEXT: v_add_f32_e32 v2, v2, v7
-; GFX10-FLUSH-NEXT: v_add_f32_e32 v3, v3, v8
+; GFX10-FLUSH-NEXT: v_add_f32_e32 v2, v2, v6
+; GFX10-FLUSH-NEXT: v_add_f32_e32 v3, v3, v7
; GFX10-FLUSH-NEXT: v_cvt_f16_f32_e32 v2, v2
; GFX10-FLUSH-NEXT: v_cvt_f16_f32_e32 v3, v3
; GFX10-FLUSH-NEXT: v_div_fixup_f16 v1, v2, v1, -1.0
@@ -5956,7 +5743,7 @@ define <2 x half> @v_neg_rsq_v2f16(<2 x half> %a) {
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GFX11-NEXT: v_sqrt_f16_e32 v0, v0
-; GFX11-NEXT: v_cvt_f32_f16_e32 v4, -1.0
+; GFX11-NEXT: v_mov_b32_e32 v4, -1.0
; GFX11-NEXT: v_sqrt_f16_e32 v1, v1
; GFX11-NEXT: s_waitcnt_depctr 0xfff
; GFX11-NEXT: v_cvt_f32_f16_e32 v2, v0
@@ -5964,22 +5751,20 @@ define <2 x half> @v_neg_rsq_v2f16(<2 x half> %a) {
; GFX11-NEXT: v_rcp_f32_e32 v2, v2
; GFX11-NEXT: v_rcp_f32_e32 v3, v3
; GFX11-NEXT: s_waitcnt_depctr 0xfff
-; GFX11-NEXT: v_mul_f32_e32 v5, v4, v2
-; GFX11-NEXT: v_mul_f32_e32 v4, v4, v3
-; GFX11-NEXT: v_fma_mix_f32 v7, -v1, v4, -1.0 op_sel_hi:[1,0,1]
-; GFX11-NEXT: v_fmac_f32_e32 v4, v7, v3
-; GFX11-NEXT: v_fma_mix_f32 v6, -v0, v5, -1.0 op_sel_hi:[1,0,1]
-; GFX11-NEXT: v_fma_mix_f32 v7, -v1, v4, -1.0 op_sel_hi:[1,0,1]
-; GFX11-NEXT: v_fmac_f32_e32 v5, v6, v2
-; GFX11-NEXT: v_mul_f32_e32 v3, v7, v3
-; GFX11-NEXT: v_fma_mix_f32 v6, -v0, v5, -1.0 op_sel_hi:[1,0,1]
-; GFX11-NEXT: v_dual_mul_f32 v2, v6, v2 :: v_dual_and_b32 v3, 0xff800000, v3
-; GFX11-NEXT: v_dual_add_f32 v3, v3, v4 :: v_dual_and_b32 v2, 0xff800000, v2
-; GFX11-NEXT: v_cvt_f16_f32_e32 v3, v3
-; GFX11-NEXT: v_add_f32_e32 v2, v2, v5
-; GFX11-NEXT: v_div_fixup_f16 v1, v3, v1, -1.0
+; GFX11-NEXT: v_fma_mix_f32 v5, v0, v2, v4 op_sel_hi:[1,0,0]
+; GFX11-NEXT: v_fma_mix_f32 v6, v1, v3, v4 op_sel_hi:[1,0,0]
+; GFX11-NEXT: v_fma_f32 v5, v5, v2, -v2
+; GFX11-NEXT: v_fma_f32 v6, v6, v3, -v3
+; GFX11-NEXT: v_fma_mix_f32 v7, -v0, v5, v4 op_sel_hi:[1,0,0]
+; GFX11-NEXT: v_fma_mix_f32 v4, -v1, v6, v4 op_sel_hi:[1,0,0]
+; GFX11-NEXT: v_dual_mul_f32 v2, v7, v2 :: v_dual_mul_f32 v3, v4, v3
+; GFX11-NEXT: v_and_b32_e32 v2, 0xff800000, v2
+; GFX11-NEXT: v_dual_add_f32 v2, v2, v5 :: v_dual_and_b32 v3, 0xff800000, v3
+; GFX11-NEXT: v_add_f32_e32 v3, v3, v6
; GFX11-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GFX11-NEXT: v_cvt_f16_f32_e32 v3, v3
; GFX11-NEXT: v_div_fixup_f16 v0, v2, v0, -1.0
+; GFX11-NEXT: v_div_fixup_f16 v1, v3, v1, -1.0
; GFX11-NEXT: v_pack_b32_f16 v0, v0, v1
; GFX11-NEXT: s_setpc_b64 s[30:31]
%sqrt = call contract <2 x half> @llvm.sqrt.v2f16(<2 x half> %a)
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/frem.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/frem.ll
index 302b2395642d0..549af87c94949 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/frem.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/frem.ll
@@ -88,11 +88,10 @@ define amdgpu_kernel void @frem_f16(ptr addrspace(1) %out, ptr addrspace(1) %in1
; CI-NEXT: v_or_b32_e32 v1, s4, v0
; CI-NEXT: .LBB0_8: ; %Flow19
; CI-NEXT: v_cvt_f32_f16_e32 v0, s3
-; CI-NEXT: v_cvt_f32_f16_e32 v2, 0
; CI-NEXT: s_and_b32 s2, s2, 0x7fff
; CI-NEXT: s_cmpk_lg_i32 s2, 0x7c00
; CI-NEXT: s_cselect_b32 s2, 1, 0
-; CI-NEXT: v_cmp_nlg_f32_e32 vcc, v0, v2
+; CI-NEXT: v_cmp_nlg_f32_e32 vcc, 0, v0
; CI-NEXT: v_mov_b32_e32 v0, 0x7e00
; CI-NEXT: s_and_b32 s2, 1, s2
; CI-NEXT: v_cndmask_b32_e32 v1, v1, v0, vcc
@@ -1197,16 +1196,15 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; CI-NEXT: v_or_b32_e32 v1, s4, v1
; CI-NEXT: .LBB9_16: ; %Flow54
; CI-NEXT: v_cvt_f32_f16_e32 v2, s1
-; CI-NEXT: v_cvt_f32_f16_e32 v3, 0
; CI-NEXT: s_and_b32 s0, s0, 0x7fff
; CI-NEXT: s_cmpk_lg_i32 s0, 0x7c00
; CI-NEXT: s_cselect_b32 s4, 1, 0
-; CI-NEXT: v_cmp_nlg_f32_e32 vcc, v2, v3
+; CI-NEXT: v_cmp_nlg_f32_e32 vcc, 0, v2
; CI-NEXT: v_cvt_f32_f16_e32 v2, s3
; CI-NEXT: s_and_b32 s2, s2, 0x7fff
; CI-NEXT: s_cmpk_lg_i32 s2, 0x7c00
; CI-NEXT: s_cselect_b32 s2, 1, 0
-; CI-NEXT: v_cmp_nlg_f32_e64 s[0:1], v2, v3
+; CI-NEXT: v_cmp_nlg_f32_e64 s[0:1], 0, v2
; CI-NEXT: v_and_b32_e32 v0, 0xffff, v0
; CI-NEXT: v_mov_b32_e32 v2, 0x7e00
; CI-NEXT: s_and_b32 s3, 1, s4
@@ -1730,26 +1728,25 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; CI-NEXT: v_or_b32_e32 v3, s1, v3
; CI-NEXT: .LBB10_32: ; %Flow124
; CI-NEXT: v_cvt_f32_f16_e32 v4, s2
-; CI-NEXT: v_cvt_f32_f16_e32 v5, 0
; CI-NEXT: s_and_b32 s1, s4, 0x7fff
; CI-NEXT: s_cmpk_lg_i32 s1, 0x7c00
; CI-NEXT: s_cselect_b32 s11, 1, 0
-; CI-NEXT: v_cmp_nlg_f32_e32 vcc, v4, v5
+; CI-NEXT: v_cmp_nlg_f32_e32 vcc, 0, v4
; CI-NEXT: v_cvt_f32_f16_e32 v4, s0
; CI-NEXT: s_and_b32 s2, s6, 0x7fff
; CI-NEXT: s_cmpk_lg_i32 s2, 0x7c00
; CI-NEXT: s_cselect_b32 s6, 1, 0
-; CI-NEXT: v_cmp_nlg_f32_e64 s[0:1], v4, v5
+; CI-NEXT: v_cmp_nlg_f32_e64 s[0:1], 0, v4
; CI-NEXT: v_cvt_f32_f16_e32 v4, s3
; CI-NEXT: s_and_b32 s4, s5, 0x7fff
; CI-NEXT: s_cmpk_lg_i32 s4, 0x7c00
; CI-NEXT: s_cselect_b32 s12, 1, 0
-; CI-NEXT: v_cmp_nlg_f32_e64 s[2:3], v4, v5
+; CI-NEXT: v_cmp_nlg_f32_e64 s[2:3], 0, v4
; CI-NEXT: v_cvt_f32_f16_e32 v4, s10
; CI-NEXT: s_and_b32 s7, s7, 0x7fff
; CI-NEXT: s_cmpk_lg_i32 s7, 0x7c00
; CI-NEXT: s_cselect_b32 s7, 1, 0
-; CI-NEXT: v_cmp_nlg_f32_e64 s[4:5], v4, v5
+; CI-NEXT: v_cmp_nlg_f32_e64 s[4:5], 0, v4
; CI-NEXT: v_and_b32_e32 v0, 0xffff, v0
; CI-NEXT: v_mov_b32_e32 v4, 0x7e00
; CI-NEXT: s_and_b32 s10, 1, s11
diff --git a/llvm/test/CodeGen/AMDGPU/fmed3.ll b/llvm/test/CodeGen/AMDGPU/fmed3.ll
index 9233f8059a202..9e152253bb6ca 100644
--- a/llvm/test/CodeGen/AMDGPU/fmed3.ll
+++ b/llvm/test/CodeGen/AMDGPU/fmed3.ll
@@ -7464,18 +7464,15 @@ define amdgpu_kernel void @v_test_nnan_input_fmed3_r_i_i_f16(ptr addrspace(1) %o
; SI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; SI-GISEL-NEXT: s_mov_b64 s[4:5], s[2:3]
; SI-GISEL-NEXT: buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64
-; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v3, 1.0
-; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v4, 2.0
; SI-GISEL-NEXT: s_waitcnt vmcnt(0)
; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v2, v2
-; SI-GISEL-NEXT: v_add_f32_e32 v2, v2, v3
+; SI-GISEL-NEXT: v_add_f32_e32 v2, 1.0, v2
; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v2, v2
; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v2, v2
-; SI-GISEL-NEXT: v_max_f32_e32 v2, v2, v4
+; SI-GISEL-NEXT: v_max_f32_e32 v2, 2.0, v2
; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v2, v2
; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v2, v2
-; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v3, 4.0
-; SI-GISEL-NEXT: v_min_f32_e32 v2, v2, v3
+; SI-GISEL-NEXT: v_min_f32_e32 v2, 4.0, v2
; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v2, v2
; SI-GISEL-NEXT: s_mov_b64 s[2:3], s[6:7]
; SI-GISEL-NEXT: buffer_store_short v2, v[0:1], s[0:3], 0 addr64
@@ -7639,27 +7636,24 @@ define amdgpu_kernel void @v_nnan_inputs_med3_f16_pat0(ptr addrspace(1) %out, pt
; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0
; SI-GISEL-NEXT: s_mov_b32 s10, 0
; SI-GISEL-NEXT: s_mov_b32 s11, 0xf000
-; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v2, 1.0
-; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v3, 2.0
; SI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; SI-GISEL-NEXT: s_mov_b64 s[8:9], s[2:3]
-; SI-GISEL-NEXT: buffer_load_ushort v4, v[0:1], s[8:11], 0 addr64 glc
+; SI-GISEL-NEXT: buffer_load_ushort v2, v[0:1], s[8:11], 0 addr64 glc
; SI-GISEL-NEXT: s_waitcnt vmcnt(0)
-; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v5, 4.0
; SI-GISEL-NEXT: s_mov_b64 s[8:9], s[4:5]
-; SI-GISEL-NEXT: buffer_load_ushort v6, v[0:1], s[8:11], 0 addr64 glc
+; SI-GISEL-NEXT: buffer_load_ushort v3, v[0:1], s[8:11], 0 addr64 glc
; SI-GISEL-NEXT: s_waitcnt vmcnt(0)
; SI-GISEL-NEXT: s_mov_b64 s[8:9], s[6:7]
-; SI-GISEL-NEXT: buffer_load_ushort v7, v[0:1], s[8:11], 0 addr64 glc
+; SI-GISEL-NEXT: buffer_load_ushort v4, v[0:1], s[8:11], 0 addr64 glc
; SI-GISEL-NEXT: s_waitcnt vmcnt(0)
-; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v4, v4
-; SI-GISEL-NEXT: v_add_f32_e32 v2, v4, v2
-; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v4, v6
+; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v2, v2
+; SI-GISEL-NEXT: v_add_f32_e32 v2, 1.0, v2
+; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v3, v3
; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v2, v2
-; SI-GISEL-NEXT: v_add_f32_e32 v3, v4, v3
-; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v4, v7
+; SI-GISEL-NEXT: v_add_f32_e32 v3, 2.0, v3
+; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v4, v4
; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v3, v3
-; SI-GISEL-NEXT: v_add_f32_e32 v4, v4, v5
+; SI-GISEL-NEXT: v_add_f32_e32 v4, 4.0, v4
; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v2, v2
; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v4, v4
; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v3, v3
@@ -8712,12 +8706,10 @@ define half @v_test_fmed3_r_i_i_f16_minimumnum_maximumnum(half %a) #1 {
; SI-GISEL: ; %bb.0:
; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0
-; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v1, 2.0
-; SI-GISEL-NEXT: v_max_f32_e32 v0, v0, v1
+; SI-GISEL-NEXT: v_max_f32_e32 v0, 2.0, v0
; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0
; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0
-; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v1, 4.0
-; SI-GISEL-NEXT: v_min_f32_e32 v0, v0, v1
+; SI-GISEL-NEXT: v_min_f32_e32 v0, 4.0, v0
; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0
; SI-GISEL-NEXT: s_setpc_b64 s[30:31]
;
@@ -8796,17 +8788,15 @@ define <2 x half> @v_test_fmed3_r_i_i_v2f16_minimumnum_maximumnum(<2 x half> %a)
; SI-GISEL: ; %bb.0:
; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0
-; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v2, 2.0
; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v1
-; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v3, 4.0
-; SI-GISEL-NEXT: v_max_f32_e32 v0, v0, v2
-; SI-GISEL-NEXT: v_max_f32_e32 v1, v1, v2
+; SI-GISEL-NEXT: v_max_f32_e32 v0, 2.0, v0
+; SI-GISEL-NEXT: v_max_f32_e32 v1, 2.0, v1
; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0
; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v1, v1
; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0
; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v1
-; SI-GISEL-NEXT: v_min_f32_e32 v0, v0, v3
-; SI-GISEL-NEXT: v_min_f32_e32 v1, v1, v3
+; SI-GISEL-NEXT: v_min_f32_e32 v0, 4.0, v0
+; SI-GISEL-NEXT: v_min_f32_e32 v1, 4.0, v1
; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0
; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v1, v1
; SI-GISEL-NEXT: s_setpc_b64 s[30:31]
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.exp.ll b/llvm/test/CodeGen/AMDGPU/llvm.exp.ll
index af79c911f29f9..ac356fad5b2da 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.exp.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.exp.ll
@@ -6011,8 +6011,7 @@ define half @v_exp_f16_fast(half %in) {
; SI-GISEL: ; %bb.0:
; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0
-; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v1, 0x3dc5
-; SI-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1
+; SI-GISEL-NEXT: v_mul_f32_e32 v0, 0x3fb8a000, v0
; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0
; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0
; SI-GISEL-NEXT: v_exp_f32_e32 v0, v0
@@ -6512,10 +6511,9 @@ define <2 x half> @v_exp_v2f16_fast(<2 x half> %in) {
; SI-GISEL: ; %bb.0:
; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0
-; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v2, 0x3dc5
; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v1
-; SI-GISEL-NEXT: v_mul_f32_e32 v0, v0, v2
-; SI-GISEL-NEXT: v_mul_f32_e32 v1, v1, v2
+; SI-GISEL-NEXT: v_mul_f32_e32 v0, 0x3fb8a000, v0
+; SI-GISEL-NEXT: v_mul_f32_e32 v1, 0x3fb8a000, v1
; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0
; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v1, v1
; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0
@@ -6709,12 +6707,11 @@ define <3 x half> @v_exp_v3f16_afn(<3 x half> %in) {
; SI-GISEL: ; %bb.0:
; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0
-; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v3, 0x3dc5
; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v1
; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v2, v2
-; SI-GISEL-NEXT: v_mul_f32_e32 v0, v0, v3
-; SI-GISEL-NEXT: v_mul_f32_e32 v1, v1, v3
-; SI-GISEL-NEXT: v_mul_f32_e32 v2, v2, v3
+; SI-GISEL-NEXT: v_mul_f32_e32 v0, 0x3fb8a000, v0
+; SI-GISEL-NEXT: v_mul_f32_e32 v1, 0x3fb8a000, v1
+; SI-GISEL-NEXT: v_mul_f32_e32 v2, 0x3fb8a000, v2
; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0
; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v1, v1
; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v2, v2
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.exp10.ll b/llvm/test/CodeGen/AMDGPU/llvm.exp10.ll
index a99c1991a7909..d12ebe49814d8 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.exp10.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.exp10.ll
@@ -6092,8 +6092,7 @@ define half @v_exp10_f16_fast(half %in) {
; SI-GISEL: ; %bb.0:
; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0
-; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v1, 0x3dc5
-; SI-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1
+; SI-GISEL-NEXT: v_mul_f32_e32 v0, 0x3fb8a000, v0
; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0
; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0
; SI-GISEL-NEXT: v_exp_f32_e32 v0, v0
@@ -6594,10 +6593,9 @@ define <2 x half> @v_exp10_v2f16_fast(<2 x half> %in) {
; SI-GISEL: ; %bb.0:
; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0
-; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v2, 0x3dc5
; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v1
-; SI-GISEL-NEXT: v_mul_f32_e32 v0, v0, v2
-; SI-GISEL-NEXT: v_mul_f32_e32 v1, v1, v2
+; SI-GISEL-NEXT: v_mul_f32_e32 v0, 0x3fb8a000, v0
+; SI-GISEL-NEXT: v_mul_f32_e32 v1, 0x3fb8a000, v1
; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0
; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v1, v1
; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0
@@ -6791,12 +6789,11 @@ define <3 x half> @v_exp10_v3f16_afn(<3 x half> %in) {
; SI-GISEL: ; %bb.0:
; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0
-; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v3, 0x3dc5
; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v1
; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v2, v2
-; SI-GISEL-NEXT: v_mul_f32_e32 v0, v0, v3
-; SI-GISEL-NEXT: v_mul_f32_e32 v1, v1, v3
-; SI-GISEL-NEXT: v_mul_f32_e32 v2, v2, v3
+; SI-GISEL-NEXT: v_mul_f32_e32 v0, 0x3fb8a000, v0
+; SI-GISEL-NEXT: v_mul_f32_e32 v1, 0x3fb8a000, v1
+; SI-GISEL-NEXT: v_mul_f32_e32 v2, 0x3fb8a000, v2
; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0
; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v1, v1
; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v2, v2
diff --git a/llvm/test/CodeGen/AMDGPU/mad-mix-hi.ll b/llvm/test/CodeGen/AMDGPU/mad-mix-hi.ll
index 3f66c23e1a73b..259ee0b26d2d8 100644
--- a/llvm/test/CodeGen/AMDGPU/mad-mix-hi.ll
+++ b/llvm/test/CodeGen/AMDGPU/mad-mix-hi.ll
@@ -488,13 +488,11 @@ define <2 x half> @v_mad_mixhi_f16_f16lo_f16lo_f16lo_undeflo_clamp_postcvt(half
; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v2, v2
; GISEL-CI-NEXT: v_mac_f32_e32 v2, v0, v1
; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v0, v2
-; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v1, 0
; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GISEL-CI-NEXT: v_max_f32_e32 v0, v0, v1
+; GISEL-CI-NEXT: v_max_f32_e32 v0, 0, v0
; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v1, 1.0
; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GISEL-CI-NEXT: v_min_f32_e32 v0, v0, v1
+; GISEL-CI-NEXT: v_min_f32_e32 v0, 1.0, v0
; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v1, v0
; GISEL-CI-NEXT: s_setpc_b64 s[30:31]
%src0.ext = fpext half %src0 to float
@@ -582,15 +580,13 @@ define <2 x half> @v_mad_mixhi_f16_f16lo_f16lo_f16lo_undeflo_clamp_postcvt_multi
; GISEL-CI-NEXT: s_mov_b32 s7, 0xf000
; GISEL-CI-NEXT: v_mac_f32_e32 v2, v0, v1
; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v0, v2
-; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v1, 0
-; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v2, v0
+; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v1, v0
; GISEL-CI-NEXT: buffer_store_short v0, off, s[4:7], 0
; GISEL-CI-NEXT: s_waitcnt vmcnt(0)
-; GISEL-CI-NEXT: v_max_f32_e32 v1, v2, v1
+; GISEL-CI-NEXT: v_max_f32_e32 v1, 0, v1
; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v1, v1
-; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v2, 1.0
; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v1, v1
-; GISEL-CI-NEXT: v_min_f32_e32 v1, v1, v2
+; GISEL-CI-NEXT: v_min_f32_e32 v1, 1.0, v1
; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v1, v1
; GISEL-CI-NEXT: s_setpc_b64 s[30:31]
%src0.ext = fpext half %src0 to float
diff --git a/llvm/test/CodeGen/AMDGPU/mad-mix-lo.ll b/llvm/test/CodeGen/AMDGPU/mad-mix-lo.ll
index 21e6faf46f58d..ba77552e5809b 100644
--- a/llvm/test/CodeGen/AMDGPU/mad-mix-lo.ll
+++ b/llvm/test/CodeGen/AMDGPU/mad-mix-lo.ll
@@ -313,13 +313,11 @@ define half @v_mad_mixlo_f16_f16lo_f16lo_f32_clamp_post_cvt(half %src0, half %sr
; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v1, v1
; GISEL-CI-NEXT: v_mac_f32_e32 v2, v0, v1
; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v0, v2
-; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v1, 0
; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GISEL-CI-NEXT: v_max_f32_e32 v0, v0, v1
+; GISEL-CI-NEXT: v_max_f32_e32 v0, 0, v0
; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v1, 1.0
; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GISEL-CI-NEXT: v_min_f32_e32 v0, v0, v1
+; GISEL-CI-NEXT: v_min_f32_e32 v0, 1.0, v0
; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v0, v0
; GISEL-CI-NEXT: s_setpc_b64 s[30:31]
%src0.ext = fpext half %src0 to float
@@ -1009,28 +1007,26 @@ define <2 x half> @v_mad_mix_v2f32_clamp_postcvt(<2 x half> %src0, <2 x half> %s
; GISEL-CI: ; %bb.0:
; GISEL-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v1, v1
; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v2, v2
; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v4, v4
-; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v1, v1
; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v3, v3
; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v5, v5
; GISEL-CI-NEXT: v_mac_f32_e32 v4, v0, v2
; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v0, v4
-; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v2, 0
; GISEL-CI-NEXT: v_mac_f32_e32 v5, v1, v3
; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v1, v5
; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v0, v0
; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v1, v1
-; GISEL-CI-NEXT: v_max_f32_e32 v0, v0, v2
+; GISEL-CI-NEXT: v_max_f32_e32 v0, 0, v0
; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GISEL-CI-NEXT: v_max_f32_e32 v1, v1, v2
+; GISEL-CI-NEXT: v_max_f32_e32 v1, 0, v1
; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v1, v1
; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v2, 1.0
; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v1, v1
-; GISEL-CI-NEXT: v_min_f32_e32 v0, v0, v2
+; GISEL-CI-NEXT: v_min_f32_e32 v0, 1.0, v0
; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GISEL-CI-NEXT: v_min_f32_e32 v1, v1, v2
+; GISEL-CI-NEXT: v_min_f32_e32 v1, 1.0, v1
; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v1, v1
; GISEL-CI-NEXT: s_setpc_b64 s[30:31]
%src0.ext = fpext <2 x half> %src0 to <2 x float>
@@ -1225,25 +1221,23 @@ define <3 x half> @v_mad_mix_v3f32_clamp_postcvt(<3 x half> %src0, <3 x half> %s
; GISEL-CI-NEXT: v_mac_f32_e32 v7, v1, v4
; GISEL-CI-NEXT: v_mac_f32_e32 v8, v2, v5
; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v1, v7
-; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v3, v8
+; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v2, v8
; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v2, 0
; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v1, v1
-; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v3, v3
-; GISEL-CI-NEXT: v_max_f32_e32 v0, v0, v2
-; GISEL-CI-NEXT: v_max_f32_e32 v1, v1, v2
-; GISEL-CI-NEXT: v_max_f32_e32 v2, v3, v2
+; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v2, v2
+; GISEL-CI-NEXT: v_max_f32_e32 v0, 0, v0
; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GISEL-CI-NEXT: v_max_f32_e32 v1, 0, v1
+; GISEL-CI-NEXT: v_max_f32_e32 v2, 0, v2
; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v1, v1
; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v2, v2
-; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v3, 1.0
; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v0, v0
; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v1, v1
; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v2, v2
-; GISEL-CI-NEXT: v_min_f32_e32 v0, v0, v3
-; GISEL-CI-NEXT: v_min_f32_e32 v1, v1, v3
-; GISEL-CI-NEXT: v_min_f32_e32 v2, v2, v3
+; GISEL-CI-NEXT: v_min_f32_e32 v0, 1.0, v0
; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GISEL-CI-NEXT: v_min_f32_e32 v1, 1.0, v1
+; GISEL-CI-NEXT: v_min_f32_e32 v2, 1.0, v2
; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v1, v1
; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v2, v2
; GISEL-CI-NEXT: s_setpc_b64 s[30:31]
@@ -1441,30 +1435,28 @@ define <4 x half> @v_mad_mix_v4f32_clamp_postcvt(<4 x half> %src0, <4 x half> %s
; GISEL-CI-NEXT: v_mac_f32_e32 v11, v3, v7
; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v0, v8
; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v1, v9
-; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v3, v10
-; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v4, v11
+; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v2, v10
+; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v3, v11
; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v2, 0
; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v1, v1
+; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v2, v2
; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v3, v3
-; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v4, v4
-; GISEL-CI-NEXT: v_max_f32_e32 v0, v0, v2
-; GISEL-CI-NEXT: v_max_f32_e32 v1, v1, v2
-; GISEL-CI-NEXT: v_max_f32_e32 v3, v3, v2
-; GISEL-CI-NEXT: v_max_f32_e32 v2, v4, v2
+; GISEL-CI-NEXT: v_max_f32_e32 v0, 0, v0
+; GISEL-CI-NEXT: v_max_f32_e32 v1, 0, v1
+; GISEL-CI-NEXT: v_max_f32_e32 v2, 0, v2
+; GISEL-CI-NEXT: v_max_f32_e32 v3, 0, v3
; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v0, v0
; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v1, v1
-; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v3, v3
; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v3, v3
; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v5, 1.0
; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v1, v1
+; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v2, v2
; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v3, v3
-; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v4, v2
-; GISEL-CI-NEXT: v_min_f32_e32 v0, v0, v5
-; GISEL-CI-NEXT: v_min_f32_e32 v1, v1, v5
-; GISEL-CI-NEXT: v_min_f32_e32 v2, v3, v5
-; GISEL-CI-NEXT: v_min_f32_e32 v3, v4, v5
+; GISEL-CI-NEXT: v_min_f32_e32 v0, 1.0, v0
+; GISEL-CI-NEXT: v_min_f32_e32 v1, 1.0, v1
+; GISEL-CI-NEXT: v_min_f32_e32 v2, 1.0, v2
+; GISEL-CI-NEXT: v_min_f32_e32 v3, 1.0, v3
; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v0, v0
; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v1, v1
; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v2, v2
@@ -1622,16 +1614,14 @@ define <2 x half> @v_mad_mix_v2f32_clamp_postcvt_lo(<2 x half> %src0, <2 x half>
; GISEL-CI-NEXT: v_mac_f32_e32 v4, v0, v2
; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v0, v5
; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v1, v4
-; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v2, 0
; GISEL-CI-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GISEL-CI-NEXT: v_or_b32_e32 v0, v1, v0
; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v1, v0
; GISEL-CI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GISEL-CI-NEXT: v_max_f32_e32 v1, v1, v2
+; GISEL-CI-NEXT: v_max_f32_e32 v1, 0, v1
; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v1, v1
-; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v2, 1.0
; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v1, v1
-; GISEL-CI-NEXT: v_min_f32_e32 v1, v1, v2
+; GISEL-CI-NEXT: v_min_f32_e32 v1, 1.0, v1
; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v1, v1
; GISEL-CI-NEXT: v_or_b32_e32 v0, v0, v1
; GISEL-CI-NEXT: v_lshrrev_b32_e32 v1, 16, v0
@@ -1790,17 +1780,15 @@ define <2 x half> @v_mad_mix_v2f32_clamp_postcvt_hi(<2 x half> %src0, <2 x half>
; GISEL-CI-NEXT: v_mac_f32_e32 v4, v0, v2
; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v0, v5
; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v1, v4
-; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v2, 0
; GISEL-CI-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GISEL-CI-NEXT: v_or_b32_e32 v0, v1, v0
; GISEL-CI-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v1, v1
; GISEL-CI-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GISEL-CI-NEXT: v_max_f32_e32 v1, v1, v2
+; GISEL-CI-NEXT: v_max_f32_e32 v1, 0, v1
; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v1, v1
-; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v2, 1.0
; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v1, v1
-; GISEL-CI-NEXT: v_min_f32_e32 v1, v1, v2
+; GISEL-CI-NEXT: v_min_f32_e32 v1, 1.0, v1
; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v1, v1
; GISEL-CI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GISEL-CI-NEXT: v_or_b32_e32 v0, v0, v1
diff --git a/llvm/test/CodeGen/AMDGPU/maximumnum.ll b/llvm/test/CodeGen/AMDGPU/maximumnum.ll
index 4f73e8e9c1883..c90b2c9170414 100644
--- a/llvm/test/CodeGen/AMDGPU/maximumnum.ll
+++ b/llvm/test/CodeGen/AMDGPU/maximumnum.ll
@@ -271,8 +271,7 @@ define half @v_maximumnum_f16_1.0(half %x) {
; GFX7-GISEL: ; %bb.0:
; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v1, 1.0
-; GFX7-GISEL-NEXT: v_max_f32_e32 v0, v0, v1
+; GFX7-GISEL-NEXT: v_max_f32_e32 v0, 1.0, v0
; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX7-GISEL-NEXT: s_setpc_b64 s[30:31]
;
diff --git a/llvm/test/CodeGen/AMDGPU/minimumnum.ll b/llvm/test/CodeGen/AMDGPU/minimumnum.ll
index 558006d2b6957..64e8b7b50de08 100644
--- a/llvm/test/CodeGen/AMDGPU/minimumnum.ll
+++ b/llvm/test/CodeGen/AMDGPU/minimumnum.ll
@@ -271,8 +271,7 @@ define half @v_minimumnum_f16_1.0(half %x) {
; GFX7-GISEL: ; %bb.0:
; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v1, 1.0
-; GFX7-GISEL-NEXT: v_min_f32_e32 v0, v0, v1
+; GFX7-GISEL-NEXT: v_min_f32_e32 v0, 1.0, v0
; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX7-GISEL-NEXT: s_setpc_b64 s[30:31]
;
More information about the llvm-commits
mailing list