[llvm] [AArch64][GlobalISel] Add G_FPEXT(G_FCONSTANT) folding (PR #160902)
Ryan Cowan via llvm-commits
llvm-commits at lists.llvm.org
Wed Oct 8 06:35:21 PDT 2025
https://github.com/HolyMolyCowMan updated https://github.com/llvm/llvm-project/pull/160902
>From ab016f126847b72f4b3e9fa359cd98f832f69169 Mon Sep 17 00:00:00 2001
From: Ryan Cowan <ryan.cowan at arm.com>
Date: Fri, 26 Sep 2025 15:03:28 +0000
Subject: [PATCH 1/2] [AArch64][GlobalISel] Add G_FPEXT(G_FCONSTANT) folding
---
.../include/llvm/Target/GlobalISel/Combine.td | 2 +
.../lib/CodeGen/GlobalISel/CombinerHelper.cpp | 1 +
llvm/lib/Target/AArch64/AArch64Combine.td | 2 +-
.../AArch64/GISel/AArch64LegalizerInfo.cpp | 4 +-
.../AArch64/GlobalISel/legalize-constant.mir | 5 +-
.../GlobalISel/legalize-fp16-fconstant.mir | 6 +-
.../CodeGen/AArch64/arm64-indexed-memory.ll | 7 +-
llvm/test/CodeGen/AArch64/dup.ll | 30 +-
llvm/test/CodeGen/AArch64/f16-instructions.ll | 21 +-
llvm/test/CodeGen/AArch64/fcvt-fixed.ll | 561 +--
llvm/test/CodeGen/AArch64/frem-power2.ll | 3 +-
.../CodeGen/AArch64/vecreduce-fadd-strict.ll | 52 +-
llvm/test/CodeGen/AArch64/vecreduce-fadd.ll | 42 +-
.../CodeGen/AArch64/vecreduce-fmul-strict.ll | 30 +-
.../CodeGen/AMDGPU/GlobalISel/fdiv.f16.ll | 3217 ++++++++---------
llvm/test/CodeGen/AMDGPU/GlobalISel/frem.ll | 17 +-
llvm/test/CodeGen/AMDGPU/fmed3.ll | 46 +-
llvm/test/CodeGen/AMDGPU/llvm.exp.ll | 15 +-
llvm/test/CodeGen/AMDGPU/llvm.exp10.ll | 15 +-
llvm/test/CodeGen/AMDGPU/mad-mix-hi.ll | 14 +-
llvm/test/CodeGen/AMDGPU/mad-mix-lo.ll | 76 +-
llvm/test/CodeGen/AMDGPU/maximumnum.ll | 3 +-
llvm/test/CodeGen/AMDGPU/minimumnum.ll | 3 +-
23 files changed, 1833 insertions(+), 2339 deletions(-)
diff --git a/llvm/include/llvm/Target/GlobalISel/Combine.td b/llvm/include/llvm/Target/GlobalISel/Combine.td
index 204e1f6887fa2..57828a270ec00 100644
--- a/llvm/include/llvm/Target/GlobalISel/Combine.td
+++ b/llvm/include/llvm/Target/GlobalISel/Combine.td
@@ -694,6 +694,7 @@ def constant_fold_fabs : constant_fold_unary_fp_op_rule<G_FABS>;
def constant_fold_fsqrt : constant_fold_unary_fp_op_rule<G_FSQRT>;
def constant_fold_flog2 : constant_fold_unary_fp_op_rule<G_FLOG2>;
def constant_fold_fptrunc : constant_fold_unary_fp_op_rule<G_FPTRUNC>;
+def constant_fold_fpext : constant_fold_unary_fp_op_rule<G_FPEXT>;
// Fold constant zero int to fp conversions.
class itof_const_zero_fold_rule<Instruction opcode> : GICombineRule <
@@ -712,6 +713,7 @@ def constant_fold_fp_ops : GICombineGroup<[
constant_fold_fsqrt,
constant_fold_flog2,
constant_fold_fptrunc,
+ constant_fold_fpext,
itof_const_zero_fold_si,
itof_const_zero_fold_ui
]>;
diff --git a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
index 0ebee2cfd8688..2206a558f9f4c 100644
--- a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
@@ -1728,6 +1728,7 @@ static APFloat constantFoldFpUnary(const MachineInstr &MI,
Result.clearSign();
return Result;
}
+ case TargetOpcode::G_FPEXT:
case TargetOpcode::G_FPTRUNC: {
bool Unused;
LLT DstTy = MRI.getType(MI.getOperand(0).getReg());
diff --git a/llvm/lib/Target/AArch64/AArch64Combine.td b/llvm/lib/Target/AArch64/AArch64Combine.td
index 076a6235eef0a..121ed198a5958 100644
--- a/llvm/lib/Target/AArch64/AArch64Combine.td
+++ b/llvm/lib/Target/AArch64/AArch64Combine.td
@@ -351,7 +351,7 @@ def AArch64PostLegalizerLowering
// Post-legalization combines which are primarily optimizations.
def AArch64PostLegalizerCombiner
: GICombiner<"AArch64PostLegalizerCombinerImpl",
- [copy_prop, cast_of_cast_combines,
+ [copy_prop, cast_of_cast_combines, constant_fold_fp_ops,
buildvector_of_truncate, integer_of_truncate,
mutate_anyext_to_zext, combines_for_extload,
combine_indexed_load_store, sext_trunc_sextload,
diff --git a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
index ea2196a584127..5613364626692 100644
--- a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
+++ b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
@@ -678,8 +678,8 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST)
.widenScalarToNextPow2(0)
.clampScalar(0, s8, s64);
getActionDefinitionsBuilder(G_FCONSTANT)
- .legalFor({s32, s64, s128})
- .legalFor(HasFP16, {s16})
+ // Keep s16 always legal so G_FCONSTANT is not widened to G_CONSTANT
+ .legalFor({s16, s32, s64, s128})
.clampScalar(0, MinFPScalar, s128);
// FIXME: fix moreElementsToNextPow2
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/legalize-constant.mir b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-constant.mir
index c301e76852b54..c00ce2242a888 100644
--- a/llvm/test/CodeGen/AArch64/GlobalISel/legalize-constant.mir
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-constant.mir
@@ -48,8 +48,9 @@ body: |
; CHECK-NEXT: $w0 = COPY [[C]](s32)
; CHECK-NEXT: [[C1:%[0-9]+]]:_(s64) = G_FCONSTANT double 2.000000e+00
; CHECK-NEXT: $x0 = COPY [[C1]](s64)
- ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
- ; CHECK-NEXT: $w0 = COPY [[C2]](s32)
+ ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s16) = G_FCONSTANT half 0xH0000
+ ; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[C2]](s16)
+ ; CHECK-NEXT: $w0 = COPY [[ANYEXT]](s32)
%0:_(s32) = G_FCONSTANT float 1.0
$w0 = COPY %0
%1:_(s64) = G_FCONSTANT double 2.0
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/legalize-fp16-fconstant.mir b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-fp16-fconstant.mir
index ddf219dc4927e..c6df3456a8445 100644
--- a/llvm/test/CodeGen/AArch64/GlobalISel/legalize-fp16-fconstant.mir
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-fp16-fconstant.mir
@@ -8,7 +8,7 @@ tracksRegLiveness: true
body: |
bb.0:
; NO-FP16-LABEL: name: fp16
- ; NO-FP16: %cst:_(s16) = G_CONSTANT i16 0
+ ; NO-FP16: %cst:_(s16) = G_FCONSTANT half 0xH0000
; NO-FP16-NEXT: $h0 = COPY %cst(s16)
; NO-FP16-NEXT: RET_ReallyLR implicit $h0
;
@@ -26,7 +26,7 @@ tracksRegLiveness: true
body: |
bb.0:
; NO-FP16-LABEL: name: fp16_non_zero
- ; NO-FP16: %cst:_(s16) = G_CONSTANT i16 16384
+ ; NO-FP16: %cst:_(s16) = G_FCONSTANT half 0xH4000
; NO-FP16-NEXT: $h0 = COPY %cst(s16)
; NO-FP16-NEXT: RET_ReallyLR implicit $h0
;
@@ -44,7 +44,7 @@ tracksRegLiveness: true
body: |
bb.1.entry:
; NO-FP16-LABEL: name: nan
- ; NO-FP16: %cst:_(s16) = G_CONSTANT i16 31745
+ ; NO-FP16: %cst:_(s16) = G_FCONSTANT half 0xH7C01
; NO-FP16-NEXT: %ext:_(s32) = G_FPEXT %cst(s16)
; NO-FP16-NEXT: $w0 = COPY %ext(s32)
; NO-FP16-NEXT: RET_ReallyLR implicit $w0
diff --git a/llvm/test/CodeGen/AArch64/arm64-indexed-memory.ll b/llvm/test/CodeGen/AArch64/arm64-indexed-memory.ll
index cb5df07c7ede4..e8e563135acc5 100644
--- a/llvm/test/CodeGen/AArch64/arm64-indexed-memory.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-indexed-memory.ll
@@ -739,15 +739,12 @@ define ptr @postidx32_shalf(ptr %src, ptr %out, half %a) {
;
; GISEL-LABEL: postidx32_shalf:
; GISEL: ; %bb.0:
-; GISEL-NEXT: mov w8, #0 ; =0x0
; GISEL-NEXT: ldr h1, [x0], #4
-; GISEL-NEXT: fmov s2, w8
; GISEL-NEXT: ; kill: def $h0 killed $h0 def $s0
; GISEL-NEXT: fmov w9, s0
-; GISEL-NEXT: fcvt s3, h1
+; GISEL-NEXT: fcvt s2, h1
; GISEL-NEXT: fmov w8, s1
-; GISEL-NEXT: fcvt s2, h2
-; GISEL-NEXT: fcmp s3, s2
+; GISEL-NEXT: fcmp s2, #0.0
; GISEL-NEXT: csel w8, w8, w9, mi
; GISEL-NEXT: strh w8, [x1]
; GISEL-NEXT: ret
diff --git a/llvm/test/CodeGen/AArch64/dup.ll b/llvm/test/CodeGen/AArch64/dup.ll
index 079ff1076b110..1c4a6ab2217b0 100644
--- a/llvm/test/CodeGen/AArch64/dup.ll
+++ b/llvm/test/CodeGen/AArch64/dup.ll
@@ -1469,8 +1469,9 @@ define <2 x half> @loaddup_str_v2half(ptr %p) {
; CHECK-GI-LABEL: loaddup_str_v2half:
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: ldr h0, [x0]
-; CHECK-GI-NEXT: strh wzr, [x0]
+; CHECK-GI-NEXT: movi d1, #0000000000000000
; CHECK-GI-NEXT: dup v0.4h, v0.h[0]
+; CHECK-GI-NEXT: str h1, [x0]
; CHECK-GI-NEXT: ret
entry:
%a = load half, ptr %p
@@ -1526,8 +1527,9 @@ define <3 x half> @loaddup_str_v3half(ptr %p) {
; CHECK-GI-LABEL: loaddup_str_v3half:
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: ldr h0, [x0]
-; CHECK-GI-NEXT: strh wzr, [x0]
+; CHECK-GI-NEXT: movi d1, #0000000000000000
; CHECK-GI-NEXT: dup v0.4h, v0.h[0]
+; CHECK-GI-NEXT: str h1, [x0]
; CHECK-GI-NEXT: ret
entry:
%a = load half, ptr %p
@@ -1583,8 +1585,9 @@ define <4 x half> @loaddup_str_v4half(ptr %p) {
; CHECK-GI-LABEL: loaddup_str_v4half:
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: ldr h0, [x0]
-; CHECK-GI-NEXT: strh wzr, [x0]
+; CHECK-GI-NEXT: movi d1, #0000000000000000
; CHECK-GI-NEXT: dup v0.4h, v0.h[0]
+; CHECK-GI-NEXT: str h1, [x0]
; CHECK-GI-NEXT: ret
entry:
%a = load half, ptr %p
@@ -1639,8 +1642,9 @@ define <8 x half> @loaddup_str_v8half(ptr %p) {
; CHECK-GI-LABEL: loaddup_str_v8half:
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: ldr h0, [x0]
-; CHECK-GI-NEXT: strh wzr, [x0]
+; CHECK-GI-NEXT: movi d1, #0000000000000000
; CHECK-GI-NEXT: dup v0.8h, v0.h[0]
+; CHECK-GI-NEXT: str h1, [x0]
; CHECK-GI-NEXT: ret
entry:
%a = load half, ptr %p
@@ -1713,9 +1717,10 @@ define <16 x half> @loaddup_str_v16half(ptr %p) {
; CHECK-GI-LABEL: loaddup_str_v16half:
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: ldr h1, [x0]
-; CHECK-GI-NEXT: strh wzr, [x0]
+; CHECK-GI-NEXT: movi d2, #0000000000000000
; CHECK-GI-NEXT: dup v0.8h, v1.h[0]
; CHECK-GI-NEXT: dup v1.8h, v1.h[0]
+; CHECK-GI-NEXT: str h2, [x0]
; CHECK-GI-NEXT: ret
entry:
%a = load half, ptr %p
@@ -1771,8 +1776,9 @@ define <2 x bfloat> @loaddup_str_v2bfloat(ptr %p) {
; CHECK-GI-LABEL: loaddup_str_v2bfloat:
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: ldr h0, [x0]
-; CHECK-GI-NEXT: strh wzr, [x0]
+; CHECK-GI-NEXT: movi d1, #0000000000000000
; CHECK-GI-NEXT: dup v0.4h, v0.h[0]
+; CHECK-GI-NEXT: str h1, [x0]
; CHECK-GI-NEXT: ret
entry:
%a = load bfloat, ptr %p
@@ -1828,8 +1834,9 @@ define <3 x bfloat> @loaddup_str_v3bfloat(ptr %p) {
; CHECK-GI-LABEL: loaddup_str_v3bfloat:
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: ldr h0, [x0]
-; CHECK-GI-NEXT: strh wzr, [x0]
+; CHECK-GI-NEXT: movi d1, #0000000000000000
; CHECK-GI-NEXT: dup v0.4h, v0.h[0]
+; CHECK-GI-NEXT: str h1, [x0]
; CHECK-GI-NEXT: ret
entry:
%a = load bfloat, ptr %p
@@ -1885,8 +1892,9 @@ define <4 x bfloat> @loaddup_str_v4bfloat(ptr %p) {
; CHECK-GI-LABEL: loaddup_str_v4bfloat:
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: ldr h0, [x0]
-; CHECK-GI-NEXT: strh wzr, [x0]
+; CHECK-GI-NEXT: movi d1, #0000000000000000
; CHECK-GI-NEXT: dup v0.4h, v0.h[0]
+; CHECK-GI-NEXT: str h1, [x0]
; CHECK-GI-NEXT: ret
entry:
%a = load bfloat, ptr %p
@@ -1941,8 +1949,9 @@ define <8 x bfloat> @loaddup_str_v8bfloat(ptr %p) {
; CHECK-GI-LABEL: loaddup_str_v8bfloat:
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: ldr h0, [x0]
-; CHECK-GI-NEXT: strh wzr, [x0]
+; CHECK-GI-NEXT: movi d1, #0000000000000000
; CHECK-GI-NEXT: dup v0.8h, v0.h[0]
+; CHECK-GI-NEXT: str h1, [x0]
; CHECK-GI-NEXT: ret
entry:
%a = load bfloat, ptr %p
@@ -2015,9 +2024,10 @@ define <16 x bfloat> @loaddup_str_v16bfloat(ptr %p) {
; CHECK-GI-LABEL: loaddup_str_v16bfloat:
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: ldr h1, [x0]
-; CHECK-GI-NEXT: strh wzr, [x0]
+; CHECK-GI-NEXT: movi d2, #0000000000000000
; CHECK-GI-NEXT: dup v0.8h, v1.h[0]
; CHECK-GI-NEXT: dup v1.8h, v1.h[0]
+; CHECK-GI-NEXT: str h2, [x0]
; CHECK-GI-NEXT: ret
entry:
%a = load bfloat, ptr %p
diff --git a/llvm/test/CodeGen/AArch64/f16-instructions.ll b/llvm/test/CodeGen/AArch64/f16-instructions.ll
index adc536da26f26..085170c7ba381 100644
--- a/llvm/test/CodeGen/AArch64/f16-instructions.ll
+++ b/llvm/test/CodeGen/AArch64/f16-instructions.ll
@@ -782,18 +782,17 @@ define void @test_fccmp(half %in, ptr %out) {
;
; CHECK-CVT-GI-LABEL: test_fccmp:
; CHECK-CVT-GI: // %bb.0:
-; CHECK-CVT-GI-NEXT: mov w8, #17664 // =0x4500
-; CHECK-CVT-GI-NEXT: mov w9, #18432 // =0x4800
; CHECK-CVT-GI-NEXT: // kill: def $h0 killed $h0 def $s0
-; CHECK-CVT-GI-NEXT: fcvt s2, h0
-; CHECK-CVT-GI-NEXT: fmov s1, w8
-; CHECK-CVT-GI-NEXT: fmov s3, w9
-; CHECK-CVT-GI-NEXT: fmov w9, s0
-; CHECK-CVT-GI-NEXT: fcvt s1, h1
-; CHECK-CVT-GI-NEXT: fcvt s3, h3
-; CHECK-CVT-GI-NEXT: fcmp s2, s1
-; CHECK-CVT-GI-NEXT: fccmp s2, s3, #4, mi
-; CHECK-CVT-GI-NEXT: csel w8, w9, w8, gt
+; CHECK-CVT-GI-NEXT: fcvt s1, h0
+; CHECK-CVT-GI-NEXT: fmov s2, #5.00000000
+; CHECK-CVT-GI-NEXT: adrp x8, .LCPI29_0
+; CHECK-CVT-GI-NEXT: fmov s3, #8.00000000
+; CHECK-CVT-GI-NEXT: fcmp s1, s2
+; CHECK-CVT-GI-NEXT: ldr h2, [x8, :lo12:.LCPI29_0]
+; CHECK-CVT-GI-NEXT: fmov w8, s0
+; CHECK-CVT-GI-NEXT: fmov w9, s2
+; CHECK-CVT-GI-NEXT: fccmp s1, s3, #4, mi
+; CHECK-CVT-GI-NEXT: csel w8, w8, w9, gt
; CHECK-CVT-GI-NEXT: strh w8, [x0]
; CHECK-CVT-GI-NEXT: ret
;
diff --git a/llvm/test/CodeGen/AArch64/fcvt-fixed.ll b/llvm/test/CodeGen/AArch64/fcvt-fixed.ll
index 51aad4fe25d3b..743d1604388de 100644
--- a/llvm/test/CodeGen/AArch64/fcvt-fixed.ll
+++ b/llvm/test/CodeGen/AArch64/fcvt-fixed.ll
@@ -149,33 +149,21 @@ define i64 @fcvtzs_f64_i64_64(double %dbl) {
}
define i32 @fcvtzs_f16_i32_7(half %flt) {
-; CHECK-SD-NO16-LABEL: fcvtzs_f16_i32_7:
-; CHECK-SD-NO16: // %bb.0:
-; CHECK-SD-NO16-NEXT: movi v1.2s, #67, lsl #24
-; CHECK-SD-NO16-NEXT: fcvt s0, h0
-; CHECK-SD-NO16-NEXT: fmul s0, s0, s1
-; CHECK-SD-NO16-NEXT: fcvt h0, s0
-; CHECK-SD-NO16-NEXT: fcvt s0, h0
-; CHECK-SD-NO16-NEXT: fcvtzs w0, s0
-; CHECK-SD-NO16-NEXT: ret
+; CHECK-NO16-LABEL: fcvtzs_f16_i32_7:
+; CHECK-NO16: // %bb.0:
+; CHECK-NO16-NEXT: movi v1.2s, #67, lsl #24
+; CHECK-NO16-NEXT: fcvt s0, h0
+; CHECK-NO16-NEXT: fmul s0, s0, s1
+; CHECK-NO16-NEXT: fcvt h0, s0
+; CHECK-NO16-NEXT: fcvt s0, h0
+; CHECK-NO16-NEXT: fcvtzs w0, s0
+; CHECK-NO16-NEXT: ret
;
; CHECK-SD-FP16-LABEL: fcvtzs_f16_i32_7:
; CHECK-SD-FP16: // %bb.0:
; CHECK-SD-FP16-NEXT: fcvtzs w0, h0, #7
; CHECK-SD-FP16-NEXT: ret
;
-; CHECK-GI-NO16-LABEL: fcvtzs_f16_i32_7:
-; CHECK-GI-NO16: // %bb.0:
-; CHECK-GI-NO16-NEXT: mov w8, #22528 // =0x5800
-; CHECK-GI-NO16-NEXT: fcvt s0, h0
-; CHECK-GI-NO16-NEXT: fmov s1, w8
-; CHECK-GI-NO16-NEXT: fcvt s1, h1
-; CHECK-GI-NO16-NEXT: fmul s0, s0, s1
-; CHECK-GI-NO16-NEXT: fcvt h0, s0
-; CHECK-GI-NO16-NEXT: fcvt s0, h0
-; CHECK-GI-NO16-NEXT: fcvtzs w0, s0
-; CHECK-GI-NO16-NEXT: ret
-;
; CHECK-GI-FP16-LABEL: fcvtzs_f16_i32_7:
; CHECK-GI-FP16: // %bb.0:
; CHECK-GI-FP16-NEXT: adrp x8, .LCPI8_0
@@ -189,33 +177,21 @@ define i32 @fcvtzs_f16_i32_7(half %flt) {
}
define i32 @fcvtzs_f16_i32_15(half %flt) {
-; CHECK-SD-NO16-LABEL: fcvtzs_f16_i32_15:
-; CHECK-SD-NO16: // %bb.0:
-; CHECK-SD-NO16-NEXT: movi v1.2s, #71, lsl #24
-; CHECK-SD-NO16-NEXT: fcvt s0, h0
-; CHECK-SD-NO16-NEXT: fmul s0, s0, s1
-; CHECK-SD-NO16-NEXT: fcvt h0, s0
-; CHECK-SD-NO16-NEXT: fcvt s0, h0
-; CHECK-SD-NO16-NEXT: fcvtzs w0, s0
-; CHECK-SD-NO16-NEXT: ret
+; CHECK-NO16-LABEL: fcvtzs_f16_i32_15:
+; CHECK-NO16: // %bb.0:
+; CHECK-NO16-NEXT: movi v1.2s, #71, lsl #24
+; CHECK-NO16-NEXT: fcvt s0, h0
+; CHECK-NO16-NEXT: fmul s0, s0, s1
+; CHECK-NO16-NEXT: fcvt h0, s0
+; CHECK-NO16-NEXT: fcvt s0, h0
+; CHECK-NO16-NEXT: fcvtzs w0, s0
+; CHECK-NO16-NEXT: ret
;
; CHECK-SD-FP16-LABEL: fcvtzs_f16_i32_15:
; CHECK-SD-FP16: // %bb.0:
; CHECK-SD-FP16-NEXT: fcvtzs w0, h0, #15
; CHECK-SD-FP16-NEXT: ret
;
-; CHECK-GI-NO16-LABEL: fcvtzs_f16_i32_15:
-; CHECK-GI-NO16: // %bb.0:
-; CHECK-GI-NO16-NEXT: mov w8, #30720 // =0x7800
-; CHECK-GI-NO16-NEXT: fcvt s0, h0
-; CHECK-GI-NO16-NEXT: fmov s1, w8
-; CHECK-GI-NO16-NEXT: fcvt s1, h1
-; CHECK-GI-NO16-NEXT: fmul s0, s0, s1
-; CHECK-GI-NO16-NEXT: fcvt h0, s0
-; CHECK-GI-NO16-NEXT: fcvt s0, h0
-; CHECK-GI-NO16-NEXT: fcvtzs w0, s0
-; CHECK-GI-NO16-NEXT: ret
-;
; CHECK-GI-FP16-LABEL: fcvtzs_f16_i32_15:
; CHECK-GI-FP16: // %bb.0:
; CHECK-GI-FP16-NEXT: adrp x8, .LCPI9_0
@@ -229,33 +205,21 @@ define i32 @fcvtzs_f16_i32_15(half %flt) {
}
define i64 @fcvtzs_f16_i64_7(half %flt) {
-; CHECK-SD-NO16-LABEL: fcvtzs_f16_i64_7:
-; CHECK-SD-NO16: // %bb.0:
-; CHECK-SD-NO16-NEXT: movi v1.2s, #67, lsl #24
-; CHECK-SD-NO16-NEXT: fcvt s0, h0
-; CHECK-SD-NO16-NEXT: fmul s0, s0, s1
-; CHECK-SD-NO16-NEXT: fcvt h0, s0
-; CHECK-SD-NO16-NEXT: fcvt s0, h0
-; CHECK-SD-NO16-NEXT: fcvtzs x0, s0
-; CHECK-SD-NO16-NEXT: ret
+; CHECK-NO16-LABEL: fcvtzs_f16_i64_7:
+; CHECK-NO16: // %bb.0:
+; CHECK-NO16-NEXT: movi v1.2s, #67, lsl #24
+; CHECK-NO16-NEXT: fcvt s0, h0
+; CHECK-NO16-NEXT: fmul s0, s0, s1
+; CHECK-NO16-NEXT: fcvt h0, s0
+; CHECK-NO16-NEXT: fcvt s0, h0
+; CHECK-NO16-NEXT: fcvtzs x0, s0
+; CHECK-NO16-NEXT: ret
;
; CHECK-SD-FP16-LABEL: fcvtzs_f16_i64_7:
; CHECK-SD-FP16: // %bb.0:
; CHECK-SD-FP16-NEXT: fcvtzs x0, h0, #7
; CHECK-SD-FP16-NEXT: ret
;
-; CHECK-GI-NO16-LABEL: fcvtzs_f16_i64_7:
-; CHECK-GI-NO16: // %bb.0:
-; CHECK-GI-NO16-NEXT: mov w8, #22528 // =0x5800
-; CHECK-GI-NO16-NEXT: fcvt s0, h0
-; CHECK-GI-NO16-NEXT: fmov s1, w8
-; CHECK-GI-NO16-NEXT: fcvt s1, h1
-; CHECK-GI-NO16-NEXT: fmul s0, s0, s1
-; CHECK-GI-NO16-NEXT: fcvt h0, s0
-; CHECK-GI-NO16-NEXT: fcvt s0, h0
-; CHECK-GI-NO16-NEXT: fcvtzs x0, s0
-; CHECK-GI-NO16-NEXT: ret
-;
; CHECK-GI-FP16-LABEL: fcvtzs_f16_i64_7:
; CHECK-GI-FP16: // %bb.0:
; CHECK-GI-FP16-NEXT: adrp x8, .LCPI10_0
@@ -269,33 +233,21 @@ define i64 @fcvtzs_f16_i64_7(half %flt) {
}
define i64 @fcvtzs_f16_i64_15(half %flt) {
-; CHECK-SD-NO16-LABEL: fcvtzs_f16_i64_15:
-; CHECK-SD-NO16: // %bb.0:
-; CHECK-SD-NO16-NEXT: movi v1.2s, #71, lsl #24
-; CHECK-SD-NO16-NEXT: fcvt s0, h0
-; CHECK-SD-NO16-NEXT: fmul s0, s0, s1
-; CHECK-SD-NO16-NEXT: fcvt h0, s0
-; CHECK-SD-NO16-NEXT: fcvt s0, h0
-; CHECK-SD-NO16-NEXT: fcvtzs x0, s0
-; CHECK-SD-NO16-NEXT: ret
+; CHECK-NO16-LABEL: fcvtzs_f16_i64_15:
+; CHECK-NO16: // %bb.0:
+; CHECK-NO16-NEXT: movi v1.2s, #71, lsl #24
+; CHECK-NO16-NEXT: fcvt s0, h0
+; CHECK-NO16-NEXT: fmul s0, s0, s1
+; CHECK-NO16-NEXT: fcvt h0, s0
+; CHECK-NO16-NEXT: fcvt s0, h0
+; CHECK-NO16-NEXT: fcvtzs x0, s0
+; CHECK-NO16-NEXT: ret
;
; CHECK-SD-FP16-LABEL: fcvtzs_f16_i64_15:
; CHECK-SD-FP16: // %bb.0:
; CHECK-SD-FP16-NEXT: fcvtzs x0, h0, #15
; CHECK-SD-FP16-NEXT: ret
;
-; CHECK-GI-NO16-LABEL: fcvtzs_f16_i64_15:
-; CHECK-GI-NO16: // %bb.0:
-; CHECK-GI-NO16-NEXT: mov w8, #30720 // =0x7800
-; CHECK-GI-NO16-NEXT: fcvt s0, h0
-; CHECK-GI-NO16-NEXT: fmov s1, w8
-; CHECK-GI-NO16-NEXT: fcvt s1, h1
-; CHECK-GI-NO16-NEXT: fmul s0, s0, s1
-; CHECK-GI-NO16-NEXT: fcvt h0, s0
-; CHECK-GI-NO16-NEXT: fcvt s0, h0
-; CHECK-GI-NO16-NEXT: fcvtzs x0, s0
-; CHECK-GI-NO16-NEXT: ret
-;
; CHECK-GI-FP16-LABEL: fcvtzs_f16_i64_15:
; CHECK-GI-FP16: // %bb.0:
; CHECK-GI-FP16-NEXT: adrp x8, .LCPI11_0
@@ -453,33 +405,21 @@ define i64 @fcvtzu_f64_i64_64(double %dbl) {
}
define i32 @fcvtzu_f16_i32_7(half %flt) {
-; CHECK-SD-NO16-LABEL: fcvtzu_f16_i32_7:
-; CHECK-SD-NO16: // %bb.0:
-; CHECK-SD-NO16-NEXT: movi v1.2s, #67, lsl #24
-; CHECK-SD-NO16-NEXT: fcvt s0, h0
-; CHECK-SD-NO16-NEXT: fmul s0, s0, s1
-; CHECK-SD-NO16-NEXT: fcvt h0, s0
-; CHECK-SD-NO16-NEXT: fcvt s0, h0
-; CHECK-SD-NO16-NEXT: fcvtzu w0, s0
-; CHECK-SD-NO16-NEXT: ret
+; CHECK-NO16-LABEL: fcvtzu_f16_i32_7:
+; CHECK-NO16: // %bb.0:
+; CHECK-NO16-NEXT: movi v1.2s, #67, lsl #24
+; CHECK-NO16-NEXT: fcvt s0, h0
+; CHECK-NO16-NEXT: fmul s0, s0, s1
+; CHECK-NO16-NEXT: fcvt h0, s0
+; CHECK-NO16-NEXT: fcvt s0, h0
+; CHECK-NO16-NEXT: fcvtzu w0, s0
+; CHECK-NO16-NEXT: ret
;
; CHECK-SD-FP16-LABEL: fcvtzu_f16_i32_7:
; CHECK-SD-FP16: // %bb.0:
; CHECK-SD-FP16-NEXT: fcvtzu w0, h0, #7
; CHECK-SD-FP16-NEXT: ret
;
-; CHECK-GI-NO16-LABEL: fcvtzu_f16_i32_7:
-; CHECK-GI-NO16: // %bb.0:
-; CHECK-GI-NO16-NEXT: mov w8, #22528 // =0x5800
-; CHECK-GI-NO16-NEXT: fcvt s0, h0
-; CHECK-GI-NO16-NEXT: fmov s1, w8
-; CHECK-GI-NO16-NEXT: fcvt s1, h1
-; CHECK-GI-NO16-NEXT: fmul s0, s0, s1
-; CHECK-GI-NO16-NEXT: fcvt h0, s0
-; CHECK-GI-NO16-NEXT: fcvt s0, h0
-; CHECK-GI-NO16-NEXT: fcvtzu w0, s0
-; CHECK-GI-NO16-NEXT: ret
-;
; CHECK-GI-FP16-LABEL: fcvtzu_f16_i32_7:
; CHECK-GI-FP16: // %bb.0:
; CHECK-GI-FP16-NEXT: adrp x8, .LCPI20_0
@@ -493,33 +433,21 @@ define i32 @fcvtzu_f16_i32_7(half %flt) {
}
define i32 @fcvtzu_f16_i32_15(half %flt) {
-; CHECK-SD-NO16-LABEL: fcvtzu_f16_i32_15:
-; CHECK-SD-NO16: // %bb.0:
-; CHECK-SD-NO16-NEXT: movi v1.2s, #71, lsl #24
-; CHECK-SD-NO16-NEXT: fcvt s0, h0
-; CHECK-SD-NO16-NEXT: fmul s0, s0, s1
-; CHECK-SD-NO16-NEXT: fcvt h0, s0
-; CHECK-SD-NO16-NEXT: fcvt s0, h0
-; CHECK-SD-NO16-NEXT: fcvtzu w0, s0
-; CHECK-SD-NO16-NEXT: ret
+; CHECK-NO16-LABEL: fcvtzu_f16_i32_15:
+; CHECK-NO16: // %bb.0:
+; CHECK-NO16-NEXT: movi v1.2s, #71, lsl #24
+; CHECK-NO16-NEXT: fcvt s0, h0
+; CHECK-NO16-NEXT: fmul s0, s0, s1
+; CHECK-NO16-NEXT: fcvt h0, s0
+; CHECK-NO16-NEXT: fcvt s0, h0
+; CHECK-NO16-NEXT: fcvtzu w0, s0
+; CHECK-NO16-NEXT: ret
;
; CHECK-SD-FP16-LABEL: fcvtzu_f16_i32_15:
; CHECK-SD-FP16: // %bb.0:
; CHECK-SD-FP16-NEXT: fcvtzu w0, h0, #15
; CHECK-SD-FP16-NEXT: ret
;
-; CHECK-GI-NO16-LABEL: fcvtzu_f16_i32_15:
-; CHECK-GI-NO16: // %bb.0:
-; CHECK-GI-NO16-NEXT: mov w8, #30720 // =0x7800
-; CHECK-GI-NO16-NEXT: fcvt s0, h0
-; CHECK-GI-NO16-NEXT: fmov s1, w8
-; CHECK-GI-NO16-NEXT: fcvt s1, h1
-; CHECK-GI-NO16-NEXT: fmul s0, s0, s1
-; CHECK-GI-NO16-NEXT: fcvt h0, s0
-; CHECK-GI-NO16-NEXT: fcvt s0, h0
-; CHECK-GI-NO16-NEXT: fcvtzu w0, s0
-; CHECK-GI-NO16-NEXT: ret
-;
; CHECK-GI-FP16-LABEL: fcvtzu_f16_i32_15:
; CHECK-GI-FP16: // %bb.0:
; CHECK-GI-FP16-NEXT: adrp x8, .LCPI21_0
@@ -533,33 +461,21 @@ define i32 @fcvtzu_f16_i32_15(half %flt) {
}
define i64 @fcvtzu_f16_i64_7(half %flt) {
-; CHECK-SD-NO16-LABEL: fcvtzu_f16_i64_7:
-; CHECK-SD-NO16: // %bb.0:
-; CHECK-SD-NO16-NEXT: movi v1.2s, #67, lsl #24
-; CHECK-SD-NO16-NEXT: fcvt s0, h0
-; CHECK-SD-NO16-NEXT: fmul s0, s0, s1
-; CHECK-SD-NO16-NEXT: fcvt h0, s0
-; CHECK-SD-NO16-NEXT: fcvt s0, h0
-; CHECK-SD-NO16-NEXT: fcvtzu x0, s0
-; CHECK-SD-NO16-NEXT: ret
+; CHECK-NO16-LABEL: fcvtzu_f16_i64_7:
+; CHECK-NO16: // %bb.0:
+; CHECK-NO16-NEXT: movi v1.2s, #67, lsl #24
+; CHECK-NO16-NEXT: fcvt s0, h0
+; CHECK-NO16-NEXT: fmul s0, s0, s1
+; CHECK-NO16-NEXT: fcvt h0, s0
+; CHECK-NO16-NEXT: fcvt s0, h0
+; CHECK-NO16-NEXT: fcvtzu x0, s0
+; CHECK-NO16-NEXT: ret
;
; CHECK-SD-FP16-LABEL: fcvtzu_f16_i64_7:
; CHECK-SD-FP16: // %bb.0:
; CHECK-SD-FP16-NEXT: fcvtzu x0, h0, #7
; CHECK-SD-FP16-NEXT: ret
;
-; CHECK-GI-NO16-LABEL: fcvtzu_f16_i64_7:
-; CHECK-GI-NO16: // %bb.0:
-; CHECK-GI-NO16-NEXT: mov w8, #22528 // =0x5800
-; CHECK-GI-NO16-NEXT: fcvt s0, h0
-; CHECK-GI-NO16-NEXT: fmov s1, w8
-; CHECK-GI-NO16-NEXT: fcvt s1, h1
-; CHECK-GI-NO16-NEXT: fmul s0, s0, s1
-; CHECK-GI-NO16-NEXT: fcvt h0, s0
-; CHECK-GI-NO16-NEXT: fcvt s0, h0
-; CHECK-GI-NO16-NEXT: fcvtzu x0, s0
-; CHECK-GI-NO16-NEXT: ret
-;
; CHECK-GI-FP16-LABEL: fcvtzu_f16_i64_7:
; CHECK-GI-FP16: // %bb.0:
; CHECK-GI-FP16-NEXT: adrp x8, .LCPI22_0
@@ -573,33 +489,21 @@ define i64 @fcvtzu_f16_i64_7(half %flt) {
}
define i64 @fcvtzu_f16_i64_15(half %flt) {
-; CHECK-SD-NO16-LABEL: fcvtzu_f16_i64_15:
-; CHECK-SD-NO16: // %bb.0:
-; CHECK-SD-NO16-NEXT: movi v1.2s, #71, lsl #24
-; CHECK-SD-NO16-NEXT: fcvt s0, h0
-; CHECK-SD-NO16-NEXT: fmul s0, s0, s1
-; CHECK-SD-NO16-NEXT: fcvt h0, s0
-; CHECK-SD-NO16-NEXT: fcvt s0, h0
-; CHECK-SD-NO16-NEXT: fcvtzu x0, s0
-; CHECK-SD-NO16-NEXT: ret
+; CHECK-NO16-LABEL: fcvtzu_f16_i64_15:
+; CHECK-NO16: // %bb.0:
+; CHECK-NO16-NEXT: movi v1.2s, #71, lsl #24
+; CHECK-NO16-NEXT: fcvt s0, h0
+; CHECK-NO16-NEXT: fmul s0, s0, s1
+; CHECK-NO16-NEXT: fcvt h0, s0
+; CHECK-NO16-NEXT: fcvt s0, h0
+; CHECK-NO16-NEXT: fcvtzu x0, s0
+; CHECK-NO16-NEXT: ret
;
; CHECK-SD-FP16-LABEL: fcvtzu_f16_i64_15:
; CHECK-SD-FP16: // %bb.0:
; CHECK-SD-FP16-NEXT: fcvtzu x0, h0, #15
; CHECK-SD-FP16-NEXT: ret
;
-; CHECK-GI-NO16-LABEL: fcvtzu_f16_i64_15:
-; CHECK-GI-NO16: // %bb.0:
-; CHECK-GI-NO16-NEXT: mov w8, #30720 // =0x7800
-; CHECK-GI-NO16-NEXT: fcvt s0, h0
-; CHECK-GI-NO16-NEXT: fmov s1, w8
-; CHECK-GI-NO16-NEXT: fcvt s1, h1
-; CHECK-GI-NO16-NEXT: fmul s0, s0, s1
-; CHECK-GI-NO16-NEXT: fcvt h0, s0
-; CHECK-GI-NO16-NEXT: fcvt s0, h0
-; CHECK-GI-NO16-NEXT: fcvtzu x0, s0
-; CHECK-GI-NO16-NEXT: ret
-;
; CHECK-GI-FP16-LABEL: fcvtzu_f16_i64_15:
; CHECK-GI-FP16: // %bb.0:
; CHECK-GI-FP16-NEXT: adrp x8, .LCPI23_0
@@ -774,13 +678,11 @@ define half @scvtf_f16_i32_7(i32 %int) {
;
; CHECK-GI-NO16-LABEL: scvtf_f16_i32_7:
; CHECK-GI-NO16: // %bb.0:
-; CHECK-GI-NO16-NEXT: scvtf s0, w0
-; CHECK-GI-NO16-NEXT: mov w8, #22528 // =0x5800
-; CHECK-GI-NO16-NEXT: fmov s1, w8
-; CHECK-GI-NO16-NEXT: fcvt h0, s0
+; CHECK-GI-NO16-NEXT: scvtf s1, w0
+; CHECK-GI-NO16-NEXT: movi v0.2s, #67, lsl #24
+; CHECK-GI-NO16-NEXT: fcvt h1, s1
; CHECK-GI-NO16-NEXT: fcvt s1, h1
-; CHECK-GI-NO16-NEXT: fcvt s0, h0
-; CHECK-GI-NO16-NEXT: fdiv s0, s0, s1
+; CHECK-GI-NO16-NEXT: fdiv s0, s1, s0
; CHECK-GI-NO16-NEXT: fcvt h0, s0
; CHECK-GI-NO16-NEXT: ret
;
@@ -814,13 +716,11 @@ define half @scvtf_f16_i32_15(i32 %int) {
;
; CHECK-GI-NO16-LABEL: scvtf_f16_i32_15:
; CHECK-GI-NO16: // %bb.0:
-; CHECK-GI-NO16-NEXT: scvtf s0, w0
-; CHECK-GI-NO16-NEXT: mov w8, #30720 // =0x7800
-; CHECK-GI-NO16-NEXT: fmov s1, w8
-; CHECK-GI-NO16-NEXT: fcvt h0, s0
+; CHECK-GI-NO16-NEXT: scvtf s1, w0
+; CHECK-GI-NO16-NEXT: movi v0.2s, #71, lsl #24
+; CHECK-GI-NO16-NEXT: fcvt h1, s1
; CHECK-GI-NO16-NEXT: fcvt s1, h1
-; CHECK-GI-NO16-NEXT: fcvt s0, h0
-; CHECK-GI-NO16-NEXT: fdiv s0, s0, s1
+; CHECK-GI-NO16-NEXT: fdiv s0, s1, s0
; CHECK-GI-NO16-NEXT: fcvt h0, s0
; CHECK-GI-NO16-NEXT: ret
;
@@ -854,13 +754,11 @@ define half @scvtf_f16_i64_7(i64 %long) {
;
; CHECK-GI-NO16-LABEL: scvtf_f16_i64_7:
; CHECK-GI-NO16: // %bb.0:
-; CHECK-GI-NO16-NEXT: scvtf s0, x0
-; CHECK-GI-NO16-NEXT: mov w8, #22528 // =0x5800
-; CHECK-GI-NO16-NEXT: fmov s1, w8
-; CHECK-GI-NO16-NEXT: fcvt h0, s0
+; CHECK-GI-NO16-NEXT: scvtf s1, x0
+; CHECK-GI-NO16-NEXT: movi v0.2s, #67, lsl #24
+; CHECK-GI-NO16-NEXT: fcvt h1, s1
; CHECK-GI-NO16-NEXT: fcvt s1, h1
-; CHECK-GI-NO16-NEXT: fcvt s0, h0
-; CHECK-GI-NO16-NEXT: fdiv s0, s0, s1
+; CHECK-GI-NO16-NEXT: fdiv s0, s1, s0
; CHECK-GI-NO16-NEXT: fcvt h0, s0
; CHECK-GI-NO16-NEXT: ret
;
@@ -894,13 +792,11 @@ define half @scvtf_f16_i64_15(i64 %long) {
;
; CHECK-GI-NO16-LABEL: scvtf_f16_i64_15:
; CHECK-GI-NO16: // %bb.0:
-; CHECK-GI-NO16-NEXT: scvtf s0, x0
-; CHECK-GI-NO16-NEXT: mov w8, #30720 // =0x7800
-; CHECK-GI-NO16-NEXT: fmov s1, w8
-; CHECK-GI-NO16-NEXT: fcvt h0, s0
+; CHECK-GI-NO16-NEXT: scvtf s1, x0
+; CHECK-GI-NO16-NEXT: movi v0.2s, #71, lsl #24
+; CHECK-GI-NO16-NEXT: fcvt h1, s1
; CHECK-GI-NO16-NEXT: fcvt s1, h1
-; CHECK-GI-NO16-NEXT: fcvt s0, h0
-; CHECK-GI-NO16-NEXT: fdiv s0, s0, s1
+; CHECK-GI-NO16-NEXT: fdiv s0, s1, s0
; CHECK-GI-NO16-NEXT: fcvt h0, s0
; CHECK-GI-NO16-NEXT: ret
;
@@ -1078,13 +974,11 @@ define half @ucvtf_f16_i32_7(i32 %int) {
;
; CHECK-GI-NO16-LABEL: ucvtf_f16_i32_7:
; CHECK-GI-NO16: // %bb.0:
-; CHECK-GI-NO16-NEXT: ucvtf s0, w0
-; CHECK-GI-NO16-NEXT: mov w8, #22528 // =0x5800
-; CHECK-GI-NO16-NEXT: fmov s1, w8
-; CHECK-GI-NO16-NEXT: fcvt h0, s0
+; CHECK-GI-NO16-NEXT: ucvtf s1, w0
+; CHECK-GI-NO16-NEXT: movi v0.2s, #67, lsl #24
+; CHECK-GI-NO16-NEXT: fcvt h1, s1
; CHECK-GI-NO16-NEXT: fcvt s1, h1
-; CHECK-GI-NO16-NEXT: fcvt s0, h0
-; CHECK-GI-NO16-NEXT: fdiv s0, s0, s1
+; CHECK-GI-NO16-NEXT: fdiv s0, s1, s0
; CHECK-GI-NO16-NEXT: fcvt h0, s0
; CHECK-GI-NO16-NEXT: ret
;
@@ -1118,13 +1012,11 @@ define half @ucvtf_f16_i32_15(i32 %int) {
;
; CHECK-GI-NO16-LABEL: ucvtf_f16_i32_15:
; CHECK-GI-NO16: // %bb.0:
-; CHECK-GI-NO16-NEXT: ucvtf s0, w0
-; CHECK-GI-NO16-NEXT: mov w8, #30720 // =0x7800
-; CHECK-GI-NO16-NEXT: fmov s1, w8
-; CHECK-GI-NO16-NEXT: fcvt h0, s0
+; CHECK-GI-NO16-NEXT: ucvtf s1, w0
+; CHECK-GI-NO16-NEXT: movi v0.2s, #71, lsl #24
+; CHECK-GI-NO16-NEXT: fcvt h1, s1
; CHECK-GI-NO16-NEXT: fcvt s1, h1
-; CHECK-GI-NO16-NEXT: fcvt s0, h0
-; CHECK-GI-NO16-NEXT: fdiv s0, s0, s1
+; CHECK-GI-NO16-NEXT: fdiv s0, s1, s0
; CHECK-GI-NO16-NEXT: fcvt h0, s0
; CHECK-GI-NO16-NEXT: ret
;
@@ -1158,13 +1050,11 @@ define half @ucvtf_f16_i64_7(i64 %long) {
;
; CHECK-GI-NO16-LABEL: ucvtf_f16_i64_7:
; CHECK-GI-NO16: // %bb.0:
-; CHECK-GI-NO16-NEXT: ucvtf s0, x0
-; CHECK-GI-NO16-NEXT: mov w8, #22528 // =0x5800
-; CHECK-GI-NO16-NEXT: fmov s1, w8
-; CHECK-GI-NO16-NEXT: fcvt h0, s0
+; CHECK-GI-NO16-NEXT: ucvtf s1, x0
+; CHECK-GI-NO16-NEXT: movi v0.2s, #67, lsl #24
+; CHECK-GI-NO16-NEXT: fcvt h1, s1
; CHECK-GI-NO16-NEXT: fcvt s1, h1
-; CHECK-GI-NO16-NEXT: fcvt s0, h0
-; CHECK-GI-NO16-NEXT: fdiv s0, s0, s1
+; CHECK-GI-NO16-NEXT: fdiv s0, s1, s0
; CHECK-GI-NO16-NEXT: fcvt h0, s0
; CHECK-GI-NO16-NEXT: ret
;
@@ -1198,13 +1088,11 @@ define half @ucvtf_f16_i64_15(i64 %long) {
;
; CHECK-GI-NO16-LABEL: ucvtf_f16_i64_15:
; CHECK-GI-NO16: // %bb.0:
-; CHECK-GI-NO16-NEXT: ucvtf s0, x0
-; CHECK-GI-NO16-NEXT: mov w8, #30720 // =0x7800
-; CHECK-GI-NO16-NEXT: fmov s1, w8
-; CHECK-GI-NO16-NEXT: fcvt h0, s0
+; CHECK-GI-NO16-NEXT: ucvtf s1, x0
+; CHECK-GI-NO16-NEXT: movi v0.2s, #71, lsl #24
+; CHECK-GI-NO16-NEXT: fcvt h1, s1
; CHECK-GI-NO16-NEXT: fcvt s1, h1
-; CHECK-GI-NO16-NEXT: fcvt s0, h0
-; CHECK-GI-NO16-NEXT: fdiv s0, s0, s1
+; CHECK-GI-NO16-NEXT: fdiv s0, s1, s0
; CHECK-GI-NO16-NEXT: fcvt h0, s0
; CHECK-GI-NO16-NEXT: ret
;
@@ -1356,33 +1244,21 @@ define i64 @fcvtzs_sat_f64_i64_64(double %dbl) {
}
define i32 @fcvtzs_sat_f16_i32_7(half %dbl) {
-; CHECK-SD-NO16-LABEL: fcvtzs_sat_f16_i32_7:
-; CHECK-SD-NO16: // %bb.0:
-; CHECK-SD-NO16-NEXT: movi v1.2s, #67, lsl #24
-; CHECK-SD-NO16-NEXT: fcvt s0, h0
-; CHECK-SD-NO16-NEXT: fmul s0, s0, s1
-; CHECK-SD-NO16-NEXT: fcvt h0, s0
-; CHECK-SD-NO16-NEXT: fcvt s0, h0
-; CHECK-SD-NO16-NEXT: fcvtzs w0, s0
-; CHECK-SD-NO16-NEXT: ret
+; CHECK-NO16-LABEL: fcvtzs_sat_f16_i32_7:
+; CHECK-NO16: // %bb.0:
+; CHECK-NO16-NEXT: movi v1.2s, #67, lsl #24
+; CHECK-NO16-NEXT: fcvt s0, h0
+; CHECK-NO16-NEXT: fmul s0, s0, s1
+; CHECK-NO16-NEXT: fcvt h0, s0
+; CHECK-NO16-NEXT: fcvt s0, h0
+; CHECK-NO16-NEXT: fcvtzs w0, s0
+; CHECK-NO16-NEXT: ret
;
; CHECK-SD-FP16-LABEL: fcvtzs_sat_f16_i32_7:
; CHECK-SD-FP16: // %bb.0:
; CHECK-SD-FP16-NEXT: fcvtzs w0, h0, #7
; CHECK-SD-FP16-NEXT: ret
;
-; CHECK-GI-NO16-LABEL: fcvtzs_sat_f16_i32_7:
-; CHECK-GI-NO16: // %bb.0:
-; CHECK-GI-NO16-NEXT: mov w8, #22528 // =0x5800
-; CHECK-GI-NO16-NEXT: fcvt s0, h0
-; CHECK-GI-NO16-NEXT: fmov s1, w8
-; CHECK-GI-NO16-NEXT: fcvt s1, h1
-; CHECK-GI-NO16-NEXT: fmul s0, s0, s1
-; CHECK-GI-NO16-NEXT: fcvt h0, s0
-; CHECK-GI-NO16-NEXT: fcvt s0, h0
-; CHECK-GI-NO16-NEXT: fcvtzs w0, s0
-; CHECK-GI-NO16-NEXT: ret
-;
; CHECK-GI-FP16-LABEL: fcvtzs_sat_f16_i32_7:
; CHECK-GI-FP16: // %bb.0:
; CHECK-GI-FP16-NEXT: adrp x8, .LCPI55_0
@@ -1396,33 +1272,21 @@ define i32 @fcvtzs_sat_f16_i32_7(half %dbl) {
}
define i32 @fcvtzs_sat_f16_i32_15(half %dbl) {
-; CHECK-SD-NO16-LABEL: fcvtzs_sat_f16_i32_15:
-; CHECK-SD-NO16: // %bb.0:
-; CHECK-SD-NO16-NEXT: movi v1.2s, #71, lsl #24
-; CHECK-SD-NO16-NEXT: fcvt s0, h0
-; CHECK-SD-NO16-NEXT: fmul s0, s0, s1
-; CHECK-SD-NO16-NEXT: fcvt h0, s0
-; CHECK-SD-NO16-NEXT: fcvt s0, h0
-; CHECK-SD-NO16-NEXT: fcvtzs w0, s0
-; CHECK-SD-NO16-NEXT: ret
+; CHECK-NO16-LABEL: fcvtzs_sat_f16_i32_15:
+; CHECK-NO16: // %bb.0:
+; CHECK-NO16-NEXT: movi v1.2s, #71, lsl #24
+; CHECK-NO16-NEXT: fcvt s0, h0
+; CHECK-NO16-NEXT: fmul s0, s0, s1
+; CHECK-NO16-NEXT: fcvt h0, s0
+; CHECK-NO16-NEXT: fcvt s0, h0
+; CHECK-NO16-NEXT: fcvtzs w0, s0
+; CHECK-NO16-NEXT: ret
;
; CHECK-SD-FP16-LABEL: fcvtzs_sat_f16_i32_15:
; CHECK-SD-FP16: // %bb.0:
; CHECK-SD-FP16-NEXT: fcvtzs w0, h0, #15
; CHECK-SD-FP16-NEXT: ret
;
-; CHECK-GI-NO16-LABEL: fcvtzs_sat_f16_i32_15:
-; CHECK-GI-NO16: // %bb.0:
-; CHECK-GI-NO16-NEXT: mov w8, #30720 // =0x7800
-; CHECK-GI-NO16-NEXT: fcvt s0, h0
-; CHECK-GI-NO16-NEXT: fmov s1, w8
-; CHECK-GI-NO16-NEXT: fcvt s1, h1
-; CHECK-GI-NO16-NEXT: fmul s0, s0, s1
-; CHECK-GI-NO16-NEXT: fcvt h0, s0
-; CHECK-GI-NO16-NEXT: fcvt s0, h0
-; CHECK-GI-NO16-NEXT: fcvtzs w0, s0
-; CHECK-GI-NO16-NEXT: ret
-;
; CHECK-GI-FP16-LABEL: fcvtzs_sat_f16_i32_15:
; CHECK-GI-FP16: // %bb.0:
; CHECK-GI-FP16-NEXT: adrp x8, .LCPI56_0
@@ -1436,33 +1300,21 @@ define i32 @fcvtzs_sat_f16_i32_15(half %dbl) {
}
define i64 @fcvtzs_sat_f16_i64_7(half %dbl) {
-; CHECK-SD-NO16-LABEL: fcvtzs_sat_f16_i64_7:
-; CHECK-SD-NO16: // %bb.0:
-; CHECK-SD-NO16-NEXT: movi v1.2s, #67, lsl #24
-; CHECK-SD-NO16-NEXT: fcvt s0, h0
-; CHECK-SD-NO16-NEXT: fmul s0, s0, s1
-; CHECK-SD-NO16-NEXT: fcvt h0, s0
-; CHECK-SD-NO16-NEXT: fcvt s0, h0
-; CHECK-SD-NO16-NEXT: fcvtzs x0, s0
-; CHECK-SD-NO16-NEXT: ret
+; CHECK-NO16-LABEL: fcvtzs_sat_f16_i64_7:
+; CHECK-NO16: // %bb.0:
+; CHECK-NO16-NEXT: movi v1.2s, #67, lsl #24
+; CHECK-NO16-NEXT: fcvt s0, h0
+; CHECK-NO16-NEXT: fmul s0, s0, s1
+; CHECK-NO16-NEXT: fcvt h0, s0
+; CHECK-NO16-NEXT: fcvt s0, h0
+; CHECK-NO16-NEXT: fcvtzs x0, s0
+; CHECK-NO16-NEXT: ret
;
; CHECK-SD-FP16-LABEL: fcvtzs_sat_f16_i64_7:
; CHECK-SD-FP16: // %bb.0:
; CHECK-SD-FP16-NEXT: fcvtzs x0, h0, #7
; CHECK-SD-FP16-NEXT: ret
;
-; CHECK-GI-NO16-LABEL: fcvtzs_sat_f16_i64_7:
-; CHECK-GI-NO16: // %bb.0:
-; CHECK-GI-NO16-NEXT: mov w8, #22528 // =0x5800
-; CHECK-GI-NO16-NEXT: fcvt s0, h0
-; CHECK-GI-NO16-NEXT: fmov s1, w8
-; CHECK-GI-NO16-NEXT: fcvt s1, h1
-; CHECK-GI-NO16-NEXT: fmul s0, s0, s1
-; CHECK-GI-NO16-NEXT: fcvt h0, s0
-; CHECK-GI-NO16-NEXT: fcvt s0, h0
-; CHECK-GI-NO16-NEXT: fcvtzs x0, s0
-; CHECK-GI-NO16-NEXT: ret
-;
; CHECK-GI-FP16-LABEL: fcvtzs_sat_f16_i64_7:
; CHECK-GI-FP16: // %bb.0:
; CHECK-GI-FP16-NEXT: adrp x8, .LCPI57_0
@@ -1476,33 +1328,21 @@ define i64 @fcvtzs_sat_f16_i64_7(half %dbl) {
}
define i64 @fcvtzs_sat_f16_i64_15(half %dbl) {
-; CHECK-SD-NO16-LABEL: fcvtzs_sat_f16_i64_15:
-; CHECK-SD-NO16: // %bb.0:
-; CHECK-SD-NO16-NEXT: movi v1.2s, #71, lsl #24
-; CHECK-SD-NO16-NEXT: fcvt s0, h0
-; CHECK-SD-NO16-NEXT: fmul s0, s0, s1
-; CHECK-SD-NO16-NEXT: fcvt h0, s0
-; CHECK-SD-NO16-NEXT: fcvt s0, h0
-; CHECK-SD-NO16-NEXT: fcvtzs x0, s0
-; CHECK-SD-NO16-NEXT: ret
+; CHECK-NO16-LABEL: fcvtzs_sat_f16_i64_15:
+; CHECK-NO16: // %bb.0:
+; CHECK-NO16-NEXT: movi v1.2s, #71, lsl #24
+; CHECK-NO16-NEXT: fcvt s0, h0
+; CHECK-NO16-NEXT: fmul s0, s0, s1
+; CHECK-NO16-NEXT: fcvt h0, s0
+; CHECK-NO16-NEXT: fcvt s0, h0
+; CHECK-NO16-NEXT: fcvtzs x0, s0
+; CHECK-NO16-NEXT: ret
;
; CHECK-SD-FP16-LABEL: fcvtzs_sat_f16_i64_15:
; CHECK-SD-FP16: // %bb.0:
; CHECK-SD-FP16-NEXT: fcvtzs x0, h0, #15
; CHECK-SD-FP16-NEXT: ret
;
-; CHECK-GI-NO16-LABEL: fcvtzs_sat_f16_i64_15:
-; CHECK-GI-NO16: // %bb.0:
-; CHECK-GI-NO16-NEXT: mov w8, #30720 // =0x7800
-; CHECK-GI-NO16-NEXT: fcvt s0, h0
-; CHECK-GI-NO16-NEXT: fmov s1, w8
-; CHECK-GI-NO16-NEXT: fcvt s1, h1
-; CHECK-GI-NO16-NEXT: fmul s0, s0, s1
-; CHECK-GI-NO16-NEXT: fcvt h0, s0
-; CHECK-GI-NO16-NEXT: fcvt s0, h0
-; CHECK-GI-NO16-NEXT: fcvtzs x0, s0
-; CHECK-GI-NO16-NEXT: ret
-;
; CHECK-GI-FP16-LABEL: fcvtzs_sat_f16_i64_15:
; CHECK-GI-FP16: // %bb.0:
; CHECK-GI-FP16-NEXT: adrp x8, .LCPI58_0
@@ -1650,33 +1490,21 @@ define i64 @fcvtzu_sat_f64_i64_64(double %dbl) {
}
define i32 @fcvtzu_sat_f16_i32_7(half %dbl) {
-; CHECK-SD-NO16-LABEL: fcvtzu_sat_f16_i32_7:
-; CHECK-SD-NO16: // %bb.0:
-; CHECK-SD-NO16-NEXT: movi v1.2s, #67, lsl #24
-; CHECK-SD-NO16-NEXT: fcvt s0, h0
-; CHECK-SD-NO16-NEXT: fmul s0, s0, s1
-; CHECK-SD-NO16-NEXT: fcvt h0, s0
-; CHECK-SD-NO16-NEXT: fcvt s0, h0
-; CHECK-SD-NO16-NEXT: fcvtzu w0, s0
-; CHECK-SD-NO16-NEXT: ret
+; CHECK-NO16-LABEL: fcvtzu_sat_f16_i32_7:
+; CHECK-NO16: // %bb.0:
+; CHECK-NO16-NEXT: movi v1.2s, #67, lsl #24
+; CHECK-NO16-NEXT: fcvt s0, h0
+; CHECK-NO16-NEXT: fmul s0, s0, s1
+; CHECK-NO16-NEXT: fcvt h0, s0
+; CHECK-NO16-NEXT: fcvt s0, h0
+; CHECK-NO16-NEXT: fcvtzu w0, s0
+; CHECK-NO16-NEXT: ret
;
; CHECK-SD-FP16-LABEL: fcvtzu_sat_f16_i32_7:
; CHECK-SD-FP16: // %bb.0:
; CHECK-SD-FP16-NEXT: fcvtzu w0, h0, #7
; CHECK-SD-FP16-NEXT: ret
;
-; CHECK-GI-NO16-LABEL: fcvtzu_sat_f16_i32_7:
-; CHECK-GI-NO16: // %bb.0:
-; CHECK-GI-NO16-NEXT: mov w8, #22528 // =0x5800
-; CHECK-GI-NO16-NEXT: fcvt s0, h0
-; CHECK-GI-NO16-NEXT: fmov s1, w8
-; CHECK-GI-NO16-NEXT: fcvt s1, h1
-; CHECK-GI-NO16-NEXT: fmul s0, s0, s1
-; CHECK-GI-NO16-NEXT: fcvt h0, s0
-; CHECK-GI-NO16-NEXT: fcvt s0, h0
-; CHECK-GI-NO16-NEXT: fcvtzu w0, s0
-; CHECK-GI-NO16-NEXT: ret
-;
; CHECK-GI-FP16-LABEL: fcvtzu_sat_f16_i32_7:
; CHECK-GI-FP16: // %bb.0:
; CHECK-GI-FP16-NEXT: adrp x8, .LCPI66_0
@@ -1690,33 +1518,21 @@ define i32 @fcvtzu_sat_f16_i32_7(half %dbl) {
}
define i32 @fcvtzu_sat_f16_i32_15(half %dbl) {
-; CHECK-SD-NO16-LABEL: fcvtzu_sat_f16_i32_15:
-; CHECK-SD-NO16: // %bb.0:
-; CHECK-SD-NO16-NEXT: movi v1.2s, #71, lsl #24
-; CHECK-SD-NO16-NEXT: fcvt s0, h0
-; CHECK-SD-NO16-NEXT: fmul s0, s0, s1
-; CHECK-SD-NO16-NEXT: fcvt h0, s0
-; CHECK-SD-NO16-NEXT: fcvt s0, h0
-; CHECK-SD-NO16-NEXT: fcvtzu w0, s0
-; CHECK-SD-NO16-NEXT: ret
+; CHECK-NO16-LABEL: fcvtzu_sat_f16_i32_15:
+; CHECK-NO16: // %bb.0:
+; CHECK-NO16-NEXT: movi v1.2s, #71, lsl #24
+; CHECK-NO16-NEXT: fcvt s0, h0
+; CHECK-NO16-NEXT: fmul s0, s0, s1
+; CHECK-NO16-NEXT: fcvt h0, s0
+; CHECK-NO16-NEXT: fcvt s0, h0
+; CHECK-NO16-NEXT: fcvtzu w0, s0
+; CHECK-NO16-NEXT: ret
;
; CHECK-SD-FP16-LABEL: fcvtzu_sat_f16_i32_15:
; CHECK-SD-FP16: // %bb.0:
; CHECK-SD-FP16-NEXT: fcvtzu w0, h0, #15
; CHECK-SD-FP16-NEXT: ret
;
-; CHECK-GI-NO16-LABEL: fcvtzu_sat_f16_i32_15:
-; CHECK-GI-NO16: // %bb.0:
-; CHECK-GI-NO16-NEXT: mov w8, #30720 // =0x7800
-; CHECK-GI-NO16-NEXT: fcvt s0, h0
-; CHECK-GI-NO16-NEXT: fmov s1, w8
-; CHECK-GI-NO16-NEXT: fcvt s1, h1
-; CHECK-GI-NO16-NEXT: fmul s0, s0, s1
-; CHECK-GI-NO16-NEXT: fcvt h0, s0
-; CHECK-GI-NO16-NEXT: fcvt s0, h0
-; CHECK-GI-NO16-NEXT: fcvtzu w0, s0
-; CHECK-GI-NO16-NEXT: ret
-;
; CHECK-GI-FP16-LABEL: fcvtzu_sat_f16_i32_15:
; CHECK-GI-FP16: // %bb.0:
; CHECK-GI-FP16-NEXT: adrp x8, .LCPI67_0
@@ -1730,33 +1546,21 @@ define i32 @fcvtzu_sat_f16_i32_15(half %dbl) {
}
define i64 @fcvtzu_sat_f16_i64_7(half %dbl) {
-; CHECK-SD-NO16-LABEL: fcvtzu_sat_f16_i64_7:
-; CHECK-SD-NO16: // %bb.0:
-; CHECK-SD-NO16-NEXT: movi v1.2s, #67, lsl #24
-; CHECK-SD-NO16-NEXT: fcvt s0, h0
-; CHECK-SD-NO16-NEXT: fmul s0, s0, s1
-; CHECK-SD-NO16-NEXT: fcvt h0, s0
-; CHECK-SD-NO16-NEXT: fcvt s0, h0
-; CHECK-SD-NO16-NEXT: fcvtzu x0, s0
-; CHECK-SD-NO16-NEXT: ret
+; CHECK-NO16-LABEL: fcvtzu_sat_f16_i64_7:
+; CHECK-NO16: // %bb.0:
+; CHECK-NO16-NEXT: movi v1.2s, #67, lsl #24
+; CHECK-NO16-NEXT: fcvt s0, h0
+; CHECK-NO16-NEXT: fmul s0, s0, s1
+; CHECK-NO16-NEXT: fcvt h0, s0
+; CHECK-NO16-NEXT: fcvt s0, h0
+; CHECK-NO16-NEXT: fcvtzu x0, s0
+; CHECK-NO16-NEXT: ret
;
; CHECK-SD-FP16-LABEL: fcvtzu_sat_f16_i64_7:
; CHECK-SD-FP16: // %bb.0:
; CHECK-SD-FP16-NEXT: fcvtzu x0, h0, #7
; CHECK-SD-FP16-NEXT: ret
;
-; CHECK-GI-NO16-LABEL: fcvtzu_sat_f16_i64_7:
-; CHECK-GI-NO16: // %bb.0:
-; CHECK-GI-NO16-NEXT: mov w8, #22528 // =0x5800
-; CHECK-GI-NO16-NEXT: fcvt s0, h0
-; CHECK-GI-NO16-NEXT: fmov s1, w8
-; CHECK-GI-NO16-NEXT: fcvt s1, h1
-; CHECK-GI-NO16-NEXT: fmul s0, s0, s1
-; CHECK-GI-NO16-NEXT: fcvt h0, s0
-; CHECK-GI-NO16-NEXT: fcvt s0, h0
-; CHECK-GI-NO16-NEXT: fcvtzu x0, s0
-; CHECK-GI-NO16-NEXT: ret
-;
; CHECK-GI-FP16-LABEL: fcvtzu_sat_f16_i64_7:
; CHECK-GI-FP16: // %bb.0:
; CHECK-GI-FP16-NEXT: adrp x8, .LCPI68_0
@@ -1770,33 +1574,21 @@ define i64 @fcvtzu_sat_f16_i64_7(half %dbl) {
}
define i64 @fcvtzu_sat_f16_i64_15(half %dbl) {
-; CHECK-SD-NO16-LABEL: fcvtzu_sat_f16_i64_15:
-; CHECK-SD-NO16: // %bb.0:
-; CHECK-SD-NO16-NEXT: movi v1.2s, #71, lsl #24
-; CHECK-SD-NO16-NEXT: fcvt s0, h0
-; CHECK-SD-NO16-NEXT: fmul s0, s0, s1
-; CHECK-SD-NO16-NEXT: fcvt h0, s0
-; CHECK-SD-NO16-NEXT: fcvt s0, h0
-; CHECK-SD-NO16-NEXT: fcvtzu x0, s0
-; CHECK-SD-NO16-NEXT: ret
+; CHECK-NO16-LABEL: fcvtzu_sat_f16_i64_15:
+; CHECK-NO16: // %bb.0:
+; CHECK-NO16-NEXT: movi v1.2s, #71, lsl #24
+; CHECK-NO16-NEXT: fcvt s0, h0
+; CHECK-NO16-NEXT: fmul s0, s0, s1
+; CHECK-NO16-NEXT: fcvt h0, s0
+; CHECK-NO16-NEXT: fcvt s0, h0
+; CHECK-NO16-NEXT: fcvtzu x0, s0
+; CHECK-NO16-NEXT: ret
;
; CHECK-SD-FP16-LABEL: fcvtzu_sat_f16_i64_15:
; CHECK-SD-FP16: // %bb.0:
; CHECK-SD-FP16-NEXT: fcvtzu x0, h0, #15
; CHECK-SD-FP16-NEXT: ret
;
-; CHECK-GI-NO16-LABEL: fcvtzu_sat_f16_i64_15:
-; CHECK-GI-NO16: // %bb.0:
-; CHECK-GI-NO16-NEXT: mov w8, #30720 // =0x7800
-; CHECK-GI-NO16-NEXT: fcvt s0, h0
-; CHECK-GI-NO16-NEXT: fmov s1, w8
-; CHECK-GI-NO16-NEXT: fcvt s1, h1
-; CHECK-GI-NO16-NEXT: fmul s0, s0, s1
-; CHECK-GI-NO16-NEXT: fcvt h0, s0
-; CHECK-GI-NO16-NEXT: fcvt s0, h0
-; CHECK-GI-NO16-NEXT: fcvtzu x0, s0
-; CHECK-GI-NO16-NEXT: ret
-;
; CHECK-GI-FP16-LABEL: fcvtzu_sat_f16_i64_15:
; CHECK-GI-FP16: // %bb.0:
; CHECK-GI-FP16-NEXT: adrp x8, .LCPI69_0
@@ -1811,4 +1603,3 @@ define i64 @fcvtzu_sat_f16_i64_15(half %dbl) {
;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
; CHECK: {{.*}}
; CHECK-FP16: {{.*}}
-; CHECK-NO16: {{.*}}
diff --git a/llvm/test/CodeGen/AArch64/frem-power2.ll b/llvm/test/CodeGen/AArch64/frem-power2.ll
index 98276b68481a1..e1bc7426ad63e 100644
--- a/llvm/test/CodeGen/AArch64/frem-power2.ll
+++ b/llvm/test/CodeGen/AArch64/frem-power2.ll
@@ -100,9 +100,8 @@ define half @hrem2_nsz(half %x) {
; CHECK-GI-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
; CHECK-GI-NEXT: .cfi_def_cfa_offset 16
; CHECK-GI-NEXT: .cfi_offset w30, -16
-; CHECK-GI-NEXT: fmov h1, #2.00000000
; CHECK-GI-NEXT: fcvt s0, h0
-; CHECK-GI-NEXT: fcvt s1, h1
+; CHECK-GI-NEXT: fmov s1, #2.00000000
; CHECK-GI-NEXT: bl fmodf
; CHECK-GI-NEXT: fcvt h0, s0
; CHECK-GI-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
diff --git a/llvm/test/CodeGen/AArch64/vecreduce-fadd-strict.ll b/llvm/test/CodeGen/AArch64/vecreduce-fadd-strict.ll
index 594a3ab79d73b..8e0328eaa2658 100644
--- a/llvm/test/CodeGen/AArch64/vecreduce-fadd-strict.ll
+++ b/llvm/test/CodeGen/AArch64/vecreduce-fadd-strict.ll
@@ -38,17 +38,11 @@ define half @add_v2HalfH(<2 x half> %bin.rdx) {
;
; CHECK-GI-NOFP16-LABEL: add_v2HalfH:
; CHECK-GI-NOFP16: // %bb.0:
-; CHECK-GI-NOFP16-NEXT: mov w8, #32768 // =0x8000
; CHECK-GI-NOFP16-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-GI-NOFP16-NEXT: fcvt s2, h0
-; CHECK-GI-NOFP16-NEXT: fmov s1, w8
-; CHECK-GI-NOFP16-NEXT: mov h0, v0.h[1]
-; CHECK-GI-NOFP16-NEXT: fcvt s1, h1
+; CHECK-GI-NOFP16-NEXT: mov h1, v0.h[1]
; CHECK-GI-NOFP16-NEXT: fcvt s0, h0
-; CHECK-GI-NOFP16-NEXT: fadd s1, s1, s2
-; CHECK-GI-NOFP16-NEXT: fcvt h1, s1
; CHECK-GI-NOFP16-NEXT: fcvt s1, h1
-; CHECK-GI-NOFP16-NEXT: fadd s0, s1, s0
+; CHECK-GI-NOFP16-NEXT: fadd s0, s0, s1
; CHECK-GI-NOFP16-NEXT: fcvt h0, s0
; CHECK-GI-NOFP16-NEXT: ret
;
@@ -88,19 +82,13 @@ define half @add_v3HalfH(<3 x half> %bin.rdx) {
;
; CHECK-GI-NOFP16-LABEL: add_v3HalfH:
; CHECK-GI-NOFP16: // %bb.0:
-; CHECK-GI-NOFP16-NEXT: mov w8, #32768 // =0x8000
; CHECK-GI-NOFP16-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-GI-NOFP16-NEXT: mov h1, v0.h[1]
; CHECK-GI-NOFP16-NEXT: fcvt s2, h0
-; CHECK-GI-NOFP16-NEXT: fmov s1, w8
-; CHECK-GI-NOFP16-NEXT: fcvt s1, h1
-; CHECK-GI-NOFP16-NEXT: fadd s1, s1, s2
-; CHECK-GI-NOFP16-NEXT: mov h2, v0.h[1]
; CHECK-GI-NOFP16-NEXT: mov h0, v0.h[2]
-; CHECK-GI-NOFP16-NEXT: fcvt h1, s1
-; CHECK-GI-NOFP16-NEXT: fcvt s2, h2
-; CHECK-GI-NOFP16-NEXT: fcvt s0, h0
; CHECK-GI-NOFP16-NEXT: fcvt s1, h1
-; CHECK-GI-NOFP16-NEXT: fadd s1, s1, s2
+; CHECK-GI-NOFP16-NEXT: fcvt s0, h0
+; CHECK-GI-NOFP16-NEXT: fadd s1, s2, s1
; CHECK-GI-NOFP16-NEXT: fcvt h1, s1
; CHECK-GI-NOFP16-NEXT: fcvt s1, h1
; CHECK-GI-NOFP16-NEXT: fadd s0, s1, s0
@@ -152,17 +140,11 @@ define half @add_HalfH(<4 x half> %bin.rdx) {
;
; CHECK-GI-NOFP16-LABEL: add_HalfH:
; CHECK-GI-NOFP16: // %bb.0:
-; CHECK-GI-NOFP16-NEXT: mov w8, #32768 // =0x8000
; CHECK-GI-NOFP16-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-GI-NOFP16-NEXT: mov h1, v0.h[1]
; CHECK-GI-NOFP16-NEXT: fcvt s2, h0
-; CHECK-GI-NOFP16-NEXT: fmov s1, w8
-; CHECK-GI-NOFP16-NEXT: fcvt s1, h1
-; CHECK-GI-NOFP16-NEXT: fadd s1, s1, s2
-; CHECK-GI-NOFP16-NEXT: mov h2, v0.h[1]
-; CHECK-GI-NOFP16-NEXT: fcvt h1, s1
-; CHECK-GI-NOFP16-NEXT: fcvt s2, h2
; CHECK-GI-NOFP16-NEXT: fcvt s1, h1
-; CHECK-GI-NOFP16-NEXT: fadd s1, s1, s2
+; CHECK-GI-NOFP16-NEXT: fadd s1, s2, s1
; CHECK-GI-NOFP16-NEXT: mov h2, v0.h[2]
; CHECK-GI-NOFP16-NEXT: mov h0, v0.h[3]
; CHECK-GI-NOFP16-NEXT: fcvt h1, s1
@@ -250,16 +232,10 @@ define half @add_H(<8 x half> %bin.rdx) {
;
; CHECK-GI-NOFP16-LABEL: add_H:
; CHECK-GI-NOFP16: // %bb.0:
-; CHECK-GI-NOFP16-NEXT: mov w8, #32768 // =0x8000
+; CHECK-GI-NOFP16-NEXT: mov h1, v0.h[1]
; CHECK-GI-NOFP16-NEXT: fcvt s2, h0
-; CHECK-GI-NOFP16-NEXT: fmov s1, w8
-; CHECK-GI-NOFP16-NEXT: fcvt s1, h1
-; CHECK-GI-NOFP16-NEXT: fadd s1, s1, s2
-; CHECK-GI-NOFP16-NEXT: mov h2, v0.h[1]
-; CHECK-GI-NOFP16-NEXT: fcvt h1, s1
-; CHECK-GI-NOFP16-NEXT: fcvt s2, h2
; CHECK-GI-NOFP16-NEXT: fcvt s1, h1
-; CHECK-GI-NOFP16-NEXT: fadd s1, s1, s2
+; CHECK-GI-NOFP16-NEXT: fadd s1, s2, s1
; CHECK-GI-NOFP16-NEXT: mov h2, v0.h[2]
; CHECK-GI-NOFP16-NEXT: fcvt h1, s1
; CHECK-GI-NOFP16-NEXT: fcvt s2, h2
@@ -448,16 +424,10 @@ define half @add_2H(<16 x half> %bin.rdx) {
;
; CHECK-GI-NOFP16-LABEL: add_2H:
; CHECK-GI-NOFP16: // %bb.0:
-; CHECK-GI-NOFP16-NEXT: mov w8, #32768 // =0x8000
+; CHECK-GI-NOFP16-NEXT: mov h2, v0.h[1]
; CHECK-GI-NOFP16-NEXT: fcvt s3, h0
-; CHECK-GI-NOFP16-NEXT: fmov s2, w8
-; CHECK-GI-NOFP16-NEXT: fcvt s2, h2
-; CHECK-GI-NOFP16-NEXT: fadd s2, s2, s3
-; CHECK-GI-NOFP16-NEXT: mov h3, v0.h[1]
-; CHECK-GI-NOFP16-NEXT: fcvt h2, s2
-; CHECK-GI-NOFP16-NEXT: fcvt s3, h3
; CHECK-GI-NOFP16-NEXT: fcvt s2, h2
-; CHECK-GI-NOFP16-NEXT: fadd s2, s2, s3
+; CHECK-GI-NOFP16-NEXT: fadd s2, s3, s2
; CHECK-GI-NOFP16-NEXT: mov h3, v0.h[2]
; CHECK-GI-NOFP16-NEXT: fcvt h2, s2
; CHECK-GI-NOFP16-NEXT: fcvt s3, h3
diff --git a/llvm/test/CodeGen/AArch64/vecreduce-fadd.ll b/llvm/test/CodeGen/AArch64/vecreduce-fadd.ll
index 18f463cfcf7c9..40925da0557ec 100644
--- a/llvm/test/CodeGen/AArch64/vecreduce-fadd.ll
+++ b/llvm/test/CodeGen/AArch64/vecreduce-fadd.ll
@@ -405,26 +405,23 @@ define half @fadd_reduction_v4f16_in_loop(ptr %ptr.start) {
;
; CHECK-GI-NOFP16-LABEL: fadd_reduction_v4f16_in_loop:
; CHECK-GI-NOFP16: // %bb.0: // %entry
+; CHECK-GI-NOFP16-NEXT: movi d0, #0000000000000000
; CHECK-GI-NOFP16-NEXT: mov x8, xzr
-; CHECK-GI-NOFP16-NEXT: mov w9, #0 // =0x0
; CHECK-GI-NOFP16-NEXT: .LBB13_1: // %loop
; CHECK-GI-NOFP16-NEXT: // =>This Inner Loop Header: Depth=1
-; CHECK-GI-NOFP16-NEXT: ldr d0, [x0, x8]
-; CHECK-GI-NOFP16-NEXT: fmov s1, w9
+; CHECK-GI-NOFP16-NEXT: ldr d1, [x0, x8]
+; CHECK-GI-NOFP16-NEXT: fcvt s0, h0
; CHECK-GI-NOFP16-NEXT: add x8, x8, #8
; CHECK-GI-NOFP16-NEXT: cmp w8, #56
-; CHECK-GI-NOFP16-NEXT: fcvtl v0.4s, v0.4h
+; CHECK-GI-NOFP16-NEXT: fcvtl v1.4s, v1.4h
+; CHECK-GI-NOFP16-NEXT: faddp v1.4s, v1.4s, v1.4s
+; CHECK-GI-NOFP16-NEXT: faddp s1, v1.2s
+; CHECK-GI-NOFP16-NEXT: fcvt h1, s1
; CHECK-GI-NOFP16-NEXT: fcvt s1, h1
-; CHECK-GI-NOFP16-NEXT: faddp v0.4s, v0.4s, v0.4s
-; CHECK-GI-NOFP16-NEXT: faddp s0, v0.2s
-; CHECK-GI-NOFP16-NEXT: fcvt h0, s0
-; CHECK-GI-NOFP16-NEXT: fcvt s0, h0
-; CHECK-GI-NOFP16-NEXT: fadd s0, s0, s1
+; CHECK-GI-NOFP16-NEXT: fadd s0, s1, s0
; CHECK-GI-NOFP16-NEXT: fcvt h0, s0
-; CHECK-GI-NOFP16-NEXT: fmov w9, s0
; CHECK-GI-NOFP16-NEXT: b.ne .LBB13_1
; CHECK-GI-NOFP16-NEXT: // %bb.2: // %exit
-; CHECK-GI-NOFP16-NEXT: // kill: def $h0 killed $h0 killed $s0
; CHECK-GI-NOFP16-NEXT: ret
;
; CHECK-GI-FP16-LABEL: fadd_reduction_v4f16_in_loop:
@@ -521,28 +518,25 @@ define half @fadd_reduction_v8f16_in_loop(ptr %ptr.start) {
;
; CHECK-GI-NOFP16-LABEL: fadd_reduction_v8f16_in_loop:
; CHECK-GI-NOFP16: // %bb.0: // %entry
+; CHECK-GI-NOFP16-NEXT: movi d0, #0000000000000000
; CHECK-GI-NOFP16-NEXT: mov x8, xzr
-; CHECK-GI-NOFP16-NEXT: mov w9, #0 // =0x0
; CHECK-GI-NOFP16-NEXT: .LBB14_1: // %loop
; CHECK-GI-NOFP16-NEXT: // =>This Inner Loop Header: Depth=1
-; CHECK-GI-NOFP16-NEXT: ldr q0, [x0, x8]
+; CHECK-GI-NOFP16-NEXT: ldr q1, [x0, x8]
+; CHECK-GI-NOFP16-NEXT: fcvt s0, h0
; CHECK-GI-NOFP16-NEXT: add x8, x8, #8
; CHECK-GI-NOFP16-NEXT: cmp w8, #56
-; CHECK-GI-NOFP16-NEXT: fcvtl v1.4s, v0.4h
-; CHECK-GI-NOFP16-NEXT: fcvtl2 v0.4s, v0.8h
-; CHECK-GI-NOFP16-NEXT: fadd v0.4s, v1.4s, v0.4s
-; CHECK-GI-NOFP16-NEXT: fmov s1, w9
+; CHECK-GI-NOFP16-NEXT: fcvtl v2.4s, v1.4h
+; CHECK-GI-NOFP16-NEXT: fcvtl2 v1.4s, v1.8h
+; CHECK-GI-NOFP16-NEXT: fadd v1.4s, v2.4s, v1.4s
+; CHECK-GI-NOFP16-NEXT: faddp v1.4s, v1.4s, v1.4s
+; CHECK-GI-NOFP16-NEXT: faddp s1, v1.2s
+; CHECK-GI-NOFP16-NEXT: fcvt h1, s1
; CHECK-GI-NOFP16-NEXT: fcvt s1, h1
-; CHECK-GI-NOFP16-NEXT: faddp v0.4s, v0.4s, v0.4s
-; CHECK-GI-NOFP16-NEXT: faddp s0, v0.2s
-; CHECK-GI-NOFP16-NEXT: fcvt h0, s0
-; CHECK-GI-NOFP16-NEXT: fcvt s0, h0
-; CHECK-GI-NOFP16-NEXT: fadd s0, s0, s1
+; CHECK-GI-NOFP16-NEXT: fadd s0, s1, s0
; CHECK-GI-NOFP16-NEXT: fcvt h0, s0
-; CHECK-GI-NOFP16-NEXT: fmov w9, s0
; CHECK-GI-NOFP16-NEXT: b.ne .LBB14_1
; CHECK-GI-NOFP16-NEXT: // %bb.2: // %exit
-; CHECK-GI-NOFP16-NEXT: // kill: def $h0 killed $h0 killed $s0
; CHECK-GI-NOFP16-NEXT: ret
;
; CHECK-GI-FP16-LABEL: fadd_reduction_v8f16_in_loop:
diff --git a/llvm/test/CodeGen/AArch64/vecreduce-fmul-strict.ll b/llvm/test/CodeGen/AArch64/vecreduce-fmul-strict.ll
index e1b21705c95f3..716401e2ebafe 100644
--- a/llvm/test/CodeGen/AArch64/vecreduce-fmul-strict.ll
+++ b/llvm/test/CodeGen/AArch64/vecreduce-fmul-strict.ll
@@ -52,17 +52,11 @@ define half @mul_HalfH(<4 x half> %bin.rdx) {
;
; CHECK-GI-NOFP16-LABEL: mul_HalfH:
; CHECK-GI-NOFP16: // %bb.0:
-; CHECK-GI-NOFP16-NEXT: mov w8, #15360 // =0x3c00
; CHECK-GI-NOFP16-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-GI-NOFP16-NEXT: mov h1, v0.h[1]
; CHECK-GI-NOFP16-NEXT: fcvt s2, h0
-; CHECK-GI-NOFP16-NEXT: fmov s1, w8
; CHECK-GI-NOFP16-NEXT: fcvt s1, h1
-; CHECK-GI-NOFP16-NEXT: fmul s1, s1, s2
-; CHECK-GI-NOFP16-NEXT: mov h2, v0.h[1]
-; CHECK-GI-NOFP16-NEXT: fcvt h1, s1
-; CHECK-GI-NOFP16-NEXT: fcvt s2, h2
-; CHECK-GI-NOFP16-NEXT: fcvt s1, h1
-; CHECK-GI-NOFP16-NEXT: fmul s1, s1, s2
+; CHECK-GI-NOFP16-NEXT: fmul s1, s2, s1
; CHECK-GI-NOFP16-NEXT: mov h2, v0.h[2]
; CHECK-GI-NOFP16-NEXT: mov h0, v0.h[3]
; CHECK-GI-NOFP16-NEXT: fcvt h1, s1
@@ -144,16 +138,10 @@ define half @mul_H(<8 x half> %bin.rdx) {
;
; CHECK-GI-NOFP16-LABEL: mul_H:
; CHECK-GI-NOFP16: // %bb.0:
-; CHECK-GI-NOFP16-NEXT: mov w8, #15360 // =0x3c00
+; CHECK-GI-NOFP16-NEXT: mov h1, v0.h[1]
; CHECK-GI-NOFP16-NEXT: fcvt s2, h0
-; CHECK-GI-NOFP16-NEXT: fmov s1, w8
-; CHECK-GI-NOFP16-NEXT: fcvt s1, h1
-; CHECK-GI-NOFP16-NEXT: fmul s1, s1, s2
-; CHECK-GI-NOFP16-NEXT: mov h2, v0.h[1]
-; CHECK-GI-NOFP16-NEXT: fcvt h1, s1
-; CHECK-GI-NOFP16-NEXT: fcvt s2, h2
; CHECK-GI-NOFP16-NEXT: fcvt s1, h1
-; CHECK-GI-NOFP16-NEXT: fmul s1, s1, s2
+; CHECK-GI-NOFP16-NEXT: fmul s1, s2, s1
; CHECK-GI-NOFP16-NEXT: mov h2, v0.h[2]
; CHECK-GI-NOFP16-NEXT: fcvt h1, s1
; CHECK-GI-NOFP16-NEXT: fcvt s2, h2
@@ -321,16 +309,10 @@ define half @mul_2H(<16 x half> %bin.rdx) {
;
; CHECK-GI-NOFP16-LABEL: mul_2H:
; CHECK-GI-NOFP16: // %bb.0:
-; CHECK-GI-NOFP16-NEXT: mov w8, #15360 // =0x3c00
+; CHECK-GI-NOFP16-NEXT: mov h2, v0.h[1]
; CHECK-GI-NOFP16-NEXT: fcvt s3, h0
-; CHECK-GI-NOFP16-NEXT: fmov s2, w8
; CHECK-GI-NOFP16-NEXT: fcvt s2, h2
-; CHECK-GI-NOFP16-NEXT: fmul s2, s2, s3
-; CHECK-GI-NOFP16-NEXT: mov h3, v0.h[1]
-; CHECK-GI-NOFP16-NEXT: fcvt h2, s2
-; CHECK-GI-NOFP16-NEXT: fcvt s3, h3
-; CHECK-GI-NOFP16-NEXT: fcvt s2, h2
-; CHECK-GI-NOFP16-NEXT: fmul s2, s2, s3
+; CHECK-GI-NOFP16-NEXT: fmul s2, s3, s2
; CHECK-GI-NOFP16-NEXT: mov h3, v0.h[2]
; CHECK-GI-NOFP16-NEXT: fcvt h2, s2
; CHECK-GI-NOFP16-NEXT: fcvt s3, h3
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f16.ll
index 1aee6ab24eea0..1b879a604d715 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f16.ll
@@ -403,40 +403,38 @@ define half @v_neg_rcp_f16(half %x) {
; GFX6-IEEE-LABEL: v_neg_rcp_f16:
; GFX6-IEEE: ; %bb.0:
; GFX6-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, -1.0
; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX6-IEEE-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, v1
-; GFX6-IEEE-NEXT: v_rcp_f32_e32 v3, v2
-; GFX6-IEEE-NEXT: v_div_scale_f32 v4, vcc, v1, v0, v1
-; GFX6-IEEE-NEXT: v_fma_f32 v5, -v2, v3, 1.0
-; GFX6-IEEE-NEXT: v_fma_f32 v3, v5, v3, v3
-; GFX6-IEEE-NEXT: v_mul_f32_e32 v5, v4, v3
-; GFX6-IEEE-NEXT: v_fma_f32 v6, -v2, v5, v4
-; GFX6-IEEE-NEXT: v_fma_f32 v5, v6, v3, v5
-; GFX6-IEEE-NEXT: v_fma_f32 v2, -v2, v5, v4
-; GFX6-IEEE-NEXT: v_div_fmas_f32 v2, v2, v3, v5
-; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v2, v0, v1
+; GFX6-IEEE-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, -1.0
+; GFX6-IEEE-NEXT: v_rcp_f32_e32 v2, v1
+; GFX6-IEEE-NEXT: v_div_scale_f32 v3, vcc, -1.0, v0, -1.0
+; GFX6-IEEE-NEXT: v_fma_f32 v4, -v1, v2, 1.0
+; GFX6-IEEE-NEXT: v_fma_f32 v2, v4, v2, v2
+; GFX6-IEEE-NEXT: v_mul_f32_e32 v4, v3, v2
+; GFX6-IEEE-NEXT: v_fma_f32 v5, -v1, v4, v3
+; GFX6-IEEE-NEXT: v_fma_f32 v4, v5, v2, v4
+; GFX6-IEEE-NEXT: v_fma_f32 v1, -v1, v4, v3
+; GFX6-IEEE-NEXT: v_div_fmas_f32 v1, v1, v2, v4
+; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v1, v0, -1.0
; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-IEEE-NEXT: s_setpc_b64 s[30:31]
;
; GFX6-FLUSH-LABEL: v_neg_rcp_f16:
; GFX6-FLUSH: ; %bb.0:
; GFX6-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, -1.0
; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, v1
-; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v3, v2
-; GFX6-FLUSH-NEXT: v_div_scale_f32 v4, vcc, v1, v0, v1
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, -1.0
+; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v2, v1
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v3, vcc, -1.0, v0, -1.0
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
-; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v2, v3, 1.0
-; GFX6-FLUSH-NEXT: v_fma_f32 v3, v5, v3, v3
-; GFX6-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v3
-; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v5, v4
-; GFX6-FLUSH-NEXT: v_fma_f32 v5, v6, v3, v5
-; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v5, v4
+; GFX6-FLUSH-NEXT: v_fma_f32 v4, -v1, v2, 1.0
+; GFX6-FLUSH-NEXT: v_fma_f32 v2, v4, v2, v2
+; GFX6-FLUSH-NEXT: v_mul_f32_e32 v4, v3, v2
+; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v1, v4, v3
+; GFX6-FLUSH-NEXT: v_fma_f32 v4, v5, v2, v4
+; GFX6-FLUSH-NEXT: v_fma_f32 v1, -v1, v4, v3
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
-; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v5
-; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v2, v0, v1
+; GFX6-FLUSH-NEXT: v_div_fmas_f32 v1, v1, v2, v4
+; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v1, v0, -1.0
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-FLUSH-NEXT: s_setpc_b64 s[30:31]
@@ -460,40 +458,38 @@ define half @v_rcp_f16(half %x) {
; GFX6-IEEE-LABEL: v_rcp_f16:
; GFX6-IEEE: ; %bb.0:
; GFX6-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, 1.0
; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX6-IEEE-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, v1
-; GFX6-IEEE-NEXT: v_rcp_f32_e32 v3, v2
-; GFX6-IEEE-NEXT: v_div_scale_f32 v4, vcc, v1, v0, v1
-; GFX6-IEEE-NEXT: v_fma_f32 v5, -v2, v3, 1.0
-; GFX6-IEEE-NEXT: v_fma_f32 v3, v5, v3, v3
-; GFX6-IEEE-NEXT: v_mul_f32_e32 v5, v4, v3
-; GFX6-IEEE-NEXT: v_fma_f32 v6, -v2, v5, v4
-; GFX6-IEEE-NEXT: v_fma_f32 v5, v6, v3, v5
-; GFX6-IEEE-NEXT: v_fma_f32 v2, -v2, v5, v4
-; GFX6-IEEE-NEXT: v_div_fmas_f32 v2, v2, v3, v5
-; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v2, v0, v1
+; GFX6-IEEE-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, 1.0
+; GFX6-IEEE-NEXT: v_rcp_f32_e32 v2, v1
+; GFX6-IEEE-NEXT: v_div_scale_f32 v3, vcc, 1.0, v0, 1.0
+; GFX6-IEEE-NEXT: v_fma_f32 v4, -v1, v2, 1.0
+; GFX6-IEEE-NEXT: v_fma_f32 v2, v4, v2, v2
+; GFX6-IEEE-NEXT: v_mul_f32_e32 v4, v3, v2
+; GFX6-IEEE-NEXT: v_fma_f32 v5, -v1, v4, v3
+; GFX6-IEEE-NEXT: v_fma_f32 v4, v5, v2, v4
+; GFX6-IEEE-NEXT: v_fma_f32 v1, -v1, v4, v3
+; GFX6-IEEE-NEXT: v_div_fmas_f32 v1, v1, v2, v4
+; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v1, v0, 1.0
; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-IEEE-NEXT: s_setpc_b64 s[30:31]
;
; GFX6-FLUSH-LABEL: v_rcp_f16:
; GFX6-FLUSH: ; %bb.0:
; GFX6-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, 1.0
; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, v1
-; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v3, v2
-; GFX6-FLUSH-NEXT: v_div_scale_f32 v4, vcc, v1, v0, v1
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, 1.0
+; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v2, v1
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v3, vcc, 1.0, v0, 1.0
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
-; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v2, v3, 1.0
-; GFX6-FLUSH-NEXT: v_fma_f32 v3, v5, v3, v3
-; GFX6-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v3
-; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v5, v4
-; GFX6-FLUSH-NEXT: v_fma_f32 v5, v6, v3, v5
-; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v5, v4
+; GFX6-FLUSH-NEXT: v_fma_f32 v4, -v1, v2, 1.0
+; GFX6-FLUSH-NEXT: v_fma_f32 v2, v4, v2, v2
+; GFX6-FLUSH-NEXT: v_mul_f32_e32 v4, v3, v2
+; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v1, v4, v3
+; GFX6-FLUSH-NEXT: v_fma_f32 v4, v5, v2, v4
+; GFX6-FLUSH-NEXT: v_fma_f32 v1, -v1, v4, v3
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
-; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v5
-; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v2, v0, v1
+; GFX6-FLUSH-NEXT: v_div_fmas_f32 v1, v1, v2, v4
+; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v1, v0, 1.0
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-FLUSH-NEXT: s_setpc_b64 s[30:31]
@@ -517,40 +513,38 @@ define half @v_rcp_f16_arcp(half %x) {
; GFX6-IEEE-LABEL: v_rcp_f16_arcp:
; GFX6-IEEE: ; %bb.0:
; GFX6-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, 1.0
; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX6-IEEE-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, v1
-; GFX6-IEEE-NEXT: v_rcp_f32_e32 v3, v2
-; GFX6-IEEE-NEXT: v_div_scale_f32 v4, vcc, v1, v0, v1
-; GFX6-IEEE-NEXT: v_fma_f32 v5, -v2, v3, 1.0
-; GFX6-IEEE-NEXT: v_fma_f32 v3, v5, v3, v3
-; GFX6-IEEE-NEXT: v_mul_f32_e32 v5, v4, v3
-; GFX6-IEEE-NEXT: v_fma_f32 v6, -v2, v5, v4
-; GFX6-IEEE-NEXT: v_fma_f32 v5, v6, v3, v5
-; GFX6-IEEE-NEXT: v_fma_f32 v2, -v2, v5, v4
-; GFX6-IEEE-NEXT: v_div_fmas_f32 v2, v2, v3, v5
-; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v2, v0, v1
+; GFX6-IEEE-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, 1.0
+; GFX6-IEEE-NEXT: v_rcp_f32_e32 v2, v1
+; GFX6-IEEE-NEXT: v_div_scale_f32 v3, vcc, 1.0, v0, 1.0
+; GFX6-IEEE-NEXT: v_fma_f32 v4, -v1, v2, 1.0
+; GFX6-IEEE-NEXT: v_fma_f32 v2, v4, v2, v2
+; GFX6-IEEE-NEXT: v_mul_f32_e32 v4, v3, v2
+; GFX6-IEEE-NEXT: v_fma_f32 v5, -v1, v4, v3
+; GFX6-IEEE-NEXT: v_fma_f32 v4, v5, v2, v4
+; GFX6-IEEE-NEXT: v_fma_f32 v1, -v1, v4, v3
+; GFX6-IEEE-NEXT: v_div_fmas_f32 v1, v1, v2, v4
+; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v1, v0, 1.0
; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-IEEE-NEXT: s_setpc_b64 s[30:31]
;
; GFX6-FLUSH-LABEL: v_rcp_f16_arcp:
; GFX6-FLUSH: ; %bb.0:
; GFX6-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, 1.0
; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, v1
-; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v3, v2
-; GFX6-FLUSH-NEXT: v_div_scale_f32 v4, vcc, v1, v0, v1
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, 1.0
+; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v2, v1
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v3, vcc, 1.0, v0, 1.0
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
-; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v2, v3, 1.0
-; GFX6-FLUSH-NEXT: v_fma_f32 v3, v5, v3, v3
-; GFX6-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v3
-; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v5, v4
-; GFX6-FLUSH-NEXT: v_fma_f32 v5, v6, v3, v5
-; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v5, v4
+; GFX6-FLUSH-NEXT: v_fma_f32 v4, -v1, v2, 1.0
+; GFX6-FLUSH-NEXT: v_fma_f32 v2, v4, v2, v2
+; GFX6-FLUSH-NEXT: v_mul_f32_e32 v4, v3, v2
+; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v1, v4, v3
+; GFX6-FLUSH-NEXT: v_fma_f32 v4, v5, v2, v4
+; GFX6-FLUSH-NEXT: v_fma_f32 v1, -v1, v4, v3
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
-; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v5
-; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v2, v0, v1
+; GFX6-FLUSH-NEXT: v_div_fmas_f32 v1, v1, v2, v4
+; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v1, v0, 1.0
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-FLUSH-NEXT: s_setpc_b64 s[30:31]
@@ -575,9 +569,7 @@ define half @v_rcp_f16_arcp_afn(half %x) {
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX6-NEXT: v_cvt_f32_f16_e32 v1, 1.0
; GFX6-NEXT: v_rcp_f32_e32 v0, v0
-; GFX6-NEXT: v_mul_f32_e32 v0, v1, v0
; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-NEXT: s_setpc_b64 s[30:31]
;
@@ -600,40 +592,38 @@ define half @v_rcp_f16_ulp25(half %x) {
; GFX6-IEEE-LABEL: v_rcp_f16_ulp25:
; GFX6-IEEE: ; %bb.0:
; GFX6-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, 1.0
; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX6-IEEE-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, v1
-; GFX6-IEEE-NEXT: v_rcp_f32_e32 v3, v2
-; GFX6-IEEE-NEXT: v_div_scale_f32 v4, vcc, v1, v0, v1
-; GFX6-IEEE-NEXT: v_fma_f32 v5, -v2, v3, 1.0
-; GFX6-IEEE-NEXT: v_fma_f32 v3, v5, v3, v3
-; GFX6-IEEE-NEXT: v_mul_f32_e32 v5, v4, v3
-; GFX6-IEEE-NEXT: v_fma_f32 v6, -v2, v5, v4
-; GFX6-IEEE-NEXT: v_fma_f32 v5, v6, v3, v5
-; GFX6-IEEE-NEXT: v_fma_f32 v2, -v2, v5, v4
-; GFX6-IEEE-NEXT: v_div_fmas_f32 v2, v2, v3, v5
-; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v2, v0, v1
+; GFX6-IEEE-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, 1.0
+; GFX6-IEEE-NEXT: v_rcp_f32_e32 v2, v1
+; GFX6-IEEE-NEXT: v_div_scale_f32 v3, vcc, 1.0, v0, 1.0
+; GFX6-IEEE-NEXT: v_fma_f32 v4, -v1, v2, 1.0
+; GFX6-IEEE-NEXT: v_fma_f32 v2, v4, v2, v2
+; GFX6-IEEE-NEXT: v_mul_f32_e32 v4, v3, v2
+; GFX6-IEEE-NEXT: v_fma_f32 v5, -v1, v4, v3
+; GFX6-IEEE-NEXT: v_fma_f32 v4, v5, v2, v4
+; GFX6-IEEE-NEXT: v_fma_f32 v1, -v1, v4, v3
+; GFX6-IEEE-NEXT: v_div_fmas_f32 v1, v1, v2, v4
+; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v1, v0, 1.0
; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-IEEE-NEXT: s_setpc_b64 s[30:31]
;
; GFX6-FLUSH-LABEL: v_rcp_f16_ulp25:
; GFX6-FLUSH: ; %bb.0:
; GFX6-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, 1.0
; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, v1
-; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v3, v2
-; GFX6-FLUSH-NEXT: v_div_scale_f32 v4, vcc, v1, v0, v1
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, 1.0
+; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v2, v1
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v3, vcc, 1.0, v0, 1.0
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
-; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v2, v3, 1.0
-; GFX6-FLUSH-NEXT: v_fma_f32 v3, v5, v3, v3
-; GFX6-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v3
-; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v5, v4
-; GFX6-FLUSH-NEXT: v_fma_f32 v5, v6, v3, v5
-; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v5, v4
+; GFX6-FLUSH-NEXT: v_fma_f32 v4, -v1, v2, 1.0
+; GFX6-FLUSH-NEXT: v_fma_f32 v2, v4, v2, v2
+; GFX6-FLUSH-NEXT: v_mul_f32_e32 v4, v3, v2
+; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v1, v4, v3
+; GFX6-FLUSH-NEXT: v_fma_f32 v4, v5, v2, v4
+; GFX6-FLUSH-NEXT: v_fma_f32 v1, -v1, v4, v3
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
-; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v5
-; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v2, v0, v1
+; GFX6-FLUSH-NEXT: v_div_fmas_f32 v1, v1, v2, v4
+; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v1, v0, 1.0
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-FLUSH-NEXT: s_setpc_b64 s[30:31]
@@ -1454,70 +1444,67 @@ define <2 x half> @v_rcp_v2f16(<2 x half> %x) {
; GFX6-IEEE-LABEL: v_rcp_v2f16:
; GFX6-IEEE: ; %bb.0:
; GFX6-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v2, 1.0
; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, v1
-; GFX6-IEEE-NEXT: v_div_scale_f32 v3, s[4:5], v0, v0, v2
-; GFX6-IEEE-NEXT: v_rcp_f32_e32 v4, v3
-; GFX6-IEEE-NEXT: v_div_scale_f32 v5, vcc, v2, v0, v2
-; GFX6-IEEE-NEXT: v_fma_f32 v6, -v3, v4, 1.0
-; GFX6-IEEE-NEXT: v_fma_f32 v4, v6, v4, v4
-; GFX6-IEEE-NEXT: v_mul_f32_e32 v6, v5, v4
-; GFX6-IEEE-NEXT: v_fma_f32 v7, -v3, v6, v5
-; GFX6-IEEE-NEXT: v_fma_f32 v6, v7, v4, v6
-; GFX6-IEEE-NEXT: v_fma_f32 v3, -v3, v6, v5
-; GFX6-IEEE-NEXT: v_div_fmas_f32 v3, v3, v4, v6
-; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v3, v0, v2
-; GFX6-IEEE-NEXT: v_div_scale_f32 v3, s[4:5], v1, v1, v2
-; GFX6-IEEE-NEXT: v_rcp_f32_e32 v4, v3
-; GFX6-IEEE-NEXT: v_div_scale_f32 v5, vcc, v2, v1, v2
+; GFX6-IEEE-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, 1.0
+; GFX6-IEEE-NEXT: v_rcp_f32_e32 v3, v2
+; GFX6-IEEE-NEXT: v_div_scale_f32 v4, vcc, 1.0, v0, 1.0
+; GFX6-IEEE-NEXT: v_fma_f32 v5, -v2, v3, 1.0
+; GFX6-IEEE-NEXT: v_fma_f32 v3, v5, v3, v3
+; GFX6-IEEE-NEXT: v_mul_f32_e32 v5, v4, v3
+; GFX6-IEEE-NEXT: v_fma_f32 v6, -v2, v5, v4
+; GFX6-IEEE-NEXT: v_fma_f32 v5, v6, v3, v5
+; GFX6-IEEE-NEXT: v_fma_f32 v2, -v2, v5, v4
+; GFX6-IEEE-NEXT: v_div_fmas_f32 v2, v2, v3, v5
+; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v2, v0, 1.0
+; GFX6-IEEE-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, 1.0
+; GFX6-IEEE-NEXT: v_rcp_f32_e32 v3, v2
+; GFX6-IEEE-NEXT: v_div_scale_f32 v4, vcc, 1.0, v1, 1.0
; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX6-IEEE-NEXT: v_fma_f32 v6, -v3, v4, 1.0
-; GFX6-IEEE-NEXT: v_fma_f32 v4, v6, v4, v4
-; GFX6-IEEE-NEXT: v_mul_f32_e32 v6, v5, v4
-; GFX6-IEEE-NEXT: v_fma_f32 v7, -v3, v6, v5
-; GFX6-IEEE-NEXT: v_fma_f32 v6, v7, v4, v6
-; GFX6-IEEE-NEXT: v_fma_f32 v3, -v3, v6, v5
-; GFX6-IEEE-NEXT: v_div_fmas_f32 v3, v3, v4, v6
-; GFX6-IEEE-NEXT: v_div_fixup_f32 v1, v3, v1, v2
+; GFX6-IEEE-NEXT: v_fma_f32 v5, -v2, v3, 1.0
+; GFX6-IEEE-NEXT: v_fma_f32 v3, v5, v3, v3
+; GFX6-IEEE-NEXT: v_mul_f32_e32 v5, v4, v3
+; GFX6-IEEE-NEXT: v_fma_f32 v6, -v2, v5, v4
+; GFX6-IEEE-NEXT: v_fma_f32 v5, v6, v3, v5
+; GFX6-IEEE-NEXT: v_fma_f32 v2, -v2, v5, v4
+; GFX6-IEEE-NEXT: v_div_fmas_f32 v2, v2, v3, v5
+; GFX6-IEEE-NEXT: v_div_fixup_f32 v1, v2, v1, 1.0
; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX6-IEEE-NEXT: s_setpc_b64 s[30:31]
;
; GFX6-FLUSH-LABEL: v_rcp_v2f16:
; GFX6-FLUSH: ; %bb.0:
; GFX6-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v2, 1.0
; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX6-FLUSH-NEXT: v_div_scale_f32 v3, s[4:5], v0, v0, v2
-; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v4, v3
-; GFX6-FLUSH-NEXT: v_div_scale_f32 v5, vcc, v2, v0, v2
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, 1.0
+; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v3, v2
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v4, vcc, 1.0, v0, 1.0
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
-; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v3, v4, 1.0
-; GFX6-FLUSH-NEXT: v_fma_f32 v4, v6, v4, v4
-; GFX6-FLUSH-NEXT: v_mul_f32_e32 v6, v5, v4
-; GFX6-FLUSH-NEXT: v_fma_f32 v7, -v3, v6, v5
-; GFX6-FLUSH-NEXT: v_fma_f32 v6, v7, v4, v6
-; GFX6-FLUSH-NEXT: v_fma_f32 v3, -v3, v6, v5
+; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v2, v3, 1.0
+; GFX6-FLUSH-NEXT: v_fma_f32 v3, v5, v3, v3
+; GFX6-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v3
+; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v5, v4
+; GFX6-FLUSH-NEXT: v_fma_f32 v5, v6, v3, v5
+; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v5, v4
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
-; GFX6-FLUSH-NEXT: v_div_fmas_f32 v3, v3, v4, v6
-; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v4, 1.0
; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, v1
-; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v3, v0, v2
+; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v5
+; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v2, v0, 1.0
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, v4
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, 1.0
; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v3, v2
-; GFX6-FLUSH-NEXT: v_div_scale_f32 v5, vcc, v4, v1, v4
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v4, vcc, 1.0, v1, 1.0
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
-; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v3, 1.0
-; GFX6-FLUSH-NEXT: v_fma_f32 v3, v6, v3, v3
-; GFX6-FLUSH-NEXT: v_mul_f32_e32 v6, v5, v3
-; GFX6-FLUSH-NEXT: v_fma_f32 v7, -v2, v6, v5
-; GFX6-FLUSH-NEXT: v_fma_f32 v6, v7, v3, v6
-; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v6, v5
+; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v2, v3, 1.0
+; GFX6-FLUSH-NEXT: v_fma_f32 v3, v5, v3, v3
+; GFX6-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v3
+; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v5, v4
+; GFX6-FLUSH-NEXT: v_fma_f32 v5, v6, v3, v5
+; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v5, v4
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
-; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v6
-; GFX6-FLUSH-NEXT: v_div_fixup_f32 v1, v2, v1, v4
+; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v5
+; GFX6-FLUSH-NEXT: v_div_fixup_f32 v1, v2, v1, 1.0
; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX6-FLUSH-NEXT: s_setpc_b64 s[30:31]
;
@@ -1526,30 +1513,27 @@ define <2 x half> @v_rcp_v2f16(<2 x half> %x) {
; GFX8-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-IEEE-NEXT: v_cvt_f32_f16_e32 v1, v0
; GFX8-IEEE-NEXT: v_lshrrev_b32_e32 v2, 16, v0
-; GFX8-IEEE-NEXT: v_cvt_f32_f16_e32 v4, 1.0
; GFX8-IEEE-NEXT: v_cvt_f32_f16_e32 v3, v2
-; GFX8-IEEE-NEXT: v_rcp_f32_e32 v5, v1
-; GFX8-IEEE-NEXT: v_rcp_f32_e32 v6, v3
-; GFX8-IEEE-NEXT: v_mul_f32_e32 v7, v4, v5
-; GFX8-IEEE-NEXT: v_mul_f32_e64 v8, -v1, v7
-; GFX8-IEEE-NEXT: v_add_f32_e32 v8, v8, v4
-; GFX8-IEEE-NEXT: v_mul_f32_e32 v8, v8, v5
-; GFX8-IEEE-NEXT: v_add_f32_e32 v7, v8, v7
-; GFX8-IEEE-NEXT: v_mul_f32_e32 v9, v4, v6
-; GFX8-IEEE-NEXT: v_mul_f32_e64 v1, -v1, v7
-; GFX8-IEEE-NEXT: v_mul_f32_e64 v10, -v3, v9
-; GFX8-IEEE-NEXT: v_add_f32_e32 v1, v1, v4
-; GFX8-IEEE-NEXT: v_mul_f32_e32 v1, v1, v5
-; GFX8-IEEE-NEXT: v_add_f32_e32 v5, v10, v4
-; GFX8-IEEE-NEXT: v_mul_f32_e32 v5, v5, v6
-; GFX8-IEEE-NEXT: v_add_f32_e32 v5, v5, v9
-; GFX8-IEEE-NEXT: v_mul_f32_e64 v3, -v3, v5
-; GFX8-IEEE-NEXT: v_add_f32_e32 v3, v3, v4
-; GFX8-IEEE-NEXT: v_mul_f32_e32 v3, v3, v6
+; GFX8-IEEE-NEXT: v_rcp_f32_e32 v4, v1
+; GFX8-IEEE-NEXT: v_rcp_f32_e32 v5, v3
+; GFX8-IEEE-NEXT: v_mul_f32_e64 v6, -v1, v4
+; GFX8-IEEE-NEXT: v_add_f32_e32 v6, 1.0, v6
+; GFX8-IEEE-NEXT: v_mul_f32_e32 v6, v6, v4
+; GFX8-IEEE-NEXT: v_mul_f32_e64 v7, -v3, v5
+; GFX8-IEEE-NEXT: v_add_f32_e32 v6, v6, v4
+; GFX8-IEEE-NEXT: v_add_f32_e32 v7, 1.0, v7
+; GFX8-IEEE-NEXT: v_mul_f32_e64 v1, -v1, v6
+; GFX8-IEEE-NEXT: v_mul_f32_e32 v7, v7, v5
+; GFX8-IEEE-NEXT: v_add_f32_e32 v1, 1.0, v1
+; GFX8-IEEE-NEXT: v_mul_f32_e32 v1, v1, v4
+; GFX8-IEEE-NEXT: v_add_f32_e32 v4, v7, v5
+; GFX8-IEEE-NEXT: v_mul_f32_e64 v3, -v3, v4
+; GFX8-IEEE-NEXT: v_add_f32_e32 v3, 1.0, v3
+; GFX8-IEEE-NEXT: v_mul_f32_e32 v3, v3, v5
; GFX8-IEEE-NEXT: v_and_b32_e32 v1, 0xff800000, v1
; GFX8-IEEE-NEXT: v_and_b32_e32 v3, 0xff800000, v3
-; GFX8-IEEE-NEXT: v_add_f32_e32 v1, v1, v7
-; GFX8-IEEE-NEXT: v_add_f32_e32 v3, v3, v5
+; GFX8-IEEE-NEXT: v_add_f32_e32 v1, v1, v6
+; GFX8-IEEE-NEXT: v_add_f32_e32 v3, v3, v4
; GFX8-IEEE-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX8-IEEE-NEXT: v_cvt_f16_f32_e32 v3, v3
; GFX8-IEEE-NEXT: v_div_fixup_f16 v0, v1, v0, 1.0
@@ -1561,26 +1545,23 @@ define <2 x half> @v_rcp_v2f16(<2 x half> %x) {
; GFX8-FLUSH-LABEL: v_rcp_v2f16:
; GFX8-FLUSH: ; %bb.0:
; GFX8-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, v0
; GFX8-FLUSH-NEXT: v_lshrrev_b32_e32 v2, 16, v0
+; GFX8-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, v0
; GFX8-FLUSH-NEXT: v_cvt_f32_f16_e32 v3, v2
-; GFX8-FLUSH-NEXT: v_cvt_f32_f16_e32 v4, 1.0
-; GFX8-FLUSH-NEXT: v_rcp_f32_e32 v5, v1
-; GFX8-FLUSH-NEXT: v_rcp_f32_e32 v6, v3
-; GFX8-FLUSH-NEXT: v_mul_f32_e32 v7, v4, v5
-; GFX8-FLUSH-NEXT: v_mad_f32 v8, -v1, v7, v4
-; GFX8-FLUSH-NEXT: v_mul_f32_e32 v9, v4, v6
-; GFX8-FLUSH-NEXT: v_mac_f32_e32 v7, v8, v5
-; GFX8-FLUSH-NEXT: v_mad_f32 v8, -v3, v9, v4
-; GFX8-FLUSH-NEXT: v_mac_f32_e32 v9, v8, v6
-; GFX8-FLUSH-NEXT: v_mad_f32 v1, -v1, v7, v4
-; GFX8-FLUSH-NEXT: v_mad_f32 v3, -v3, v9, v4
-; GFX8-FLUSH-NEXT: v_mul_f32_e32 v1, v1, v5
-; GFX8-FLUSH-NEXT: v_mul_f32_e32 v3, v3, v6
+; GFX8-FLUSH-NEXT: v_rcp_f32_e32 v4, v1
+; GFX8-FLUSH-NEXT: v_rcp_f32_e32 v5, v3
+; GFX8-FLUSH-NEXT: v_mad_f32 v6, -v1, v4, 1.0
+; GFX8-FLUSH-NEXT: v_mad_f32 v7, -v3, v5, 1.0
+; GFX8-FLUSH-NEXT: v_mad_f32 v6, v6, v4, v4
+; GFX8-FLUSH-NEXT: v_mad_f32 v7, v7, v5, v5
+; GFX8-FLUSH-NEXT: v_mad_f32 v1, -v1, v6, 1.0
+; GFX8-FLUSH-NEXT: v_mad_f32 v3, -v3, v7, 1.0
+; GFX8-FLUSH-NEXT: v_mul_f32_e32 v1, v1, v4
+; GFX8-FLUSH-NEXT: v_mul_f32_e32 v3, v3, v5
; GFX8-FLUSH-NEXT: v_and_b32_e32 v1, 0xff800000, v1
; GFX8-FLUSH-NEXT: v_and_b32_e32 v3, 0xff800000, v3
-; GFX8-FLUSH-NEXT: v_add_f32_e32 v1, v1, v7
-; GFX8-FLUSH-NEXT: v_add_f32_e32 v3, v3, v9
+; GFX8-FLUSH-NEXT: v_add_f32_e32 v1, v1, v6
+; GFX8-FLUSH-NEXT: v_add_f32_e32 v3, v3, v7
; GFX8-FLUSH-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX8-FLUSH-NEXT: v_cvt_f16_f32_e32 v3, v3
; GFX8-FLUSH-NEXT: v_div_fixup_f16 v0, v1, v0, 1.0
@@ -1594,30 +1575,27 @@ define <2 x half> @v_rcp_v2f16(<2 x half> %x) {
; GFX9-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v1, v0
; GFX9-IEEE-NEXT: v_lshrrev_b32_e32 v2, 16, v0
-; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v4, 1.0
; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v3, v2
-; GFX9-IEEE-NEXT: v_rcp_f32_e32 v5, v1
-; GFX9-IEEE-NEXT: v_rcp_f32_e32 v6, v3
-; GFX9-IEEE-NEXT: v_mul_f32_e32 v7, v4, v5
-; GFX9-IEEE-NEXT: v_mul_f32_e64 v8, -v1, v7
-; GFX9-IEEE-NEXT: v_add_f32_e32 v8, v8, v4
-; GFX9-IEEE-NEXT: v_mul_f32_e32 v8, v8, v5
-; GFX9-IEEE-NEXT: v_add_f32_e32 v7, v8, v7
-; GFX9-IEEE-NEXT: v_mul_f32_e32 v9, v4, v6
-; GFX9-IEEE-NEXT: v_mul_f32_e64 v1, -v1, v7
-; GFX9-IEEE-NEXT: v_mul_f32_e64 v10, -v3, v9
-; GFX9-IEEE-NEXT: v_add_f32_e32 v1, v1, v4
-; GFX9-IEEE-NEXT: v_mul_f32_e32 v1, v1, v5
-; GFX9-IEEE-NEXT: v_add_f32_e32 v5, v10, v4
-; GFX9-IEEE-NEXT: v_mul_f32_e32 v5, v5, v6
-; GFX9-IEEE-NEXT: v_add_f32_e32 v5, v5, v9
-; GFX9-IEEE-NEXT: v_mul_f32_e64 v3, -v3, v5
-; GFX9-IEEE-NEXT: v_add_f32_e32 v3, v3, v4
-; GFX9-IEEE-NEXT: v_mul_f32_e32 v3, v3, v6
+; GFX9-IEEE-NEXT: v_rcp_f32_e32 v4, v1
+; GFX9-IEEE-NEXT: v_rcp_f32_e32 v5, v3
+; GFX9-IEEE-NEXT: v_mul_f32_e64 v6, -v1, v4
+; GFX9-IEEE-NEXT: v_add_f32_e32 v6, 1.0, v6
+; GFX9-IEEE-NEXT: v_mul_f32_e32 v6, v6, v4
+; GFX9-IEEE-NEXT: v_mul_f32_e64 v7, -v3, v5
+; GFX9-IEEE-NEXT: v_add_f32_e32 v6, v6, v4
+; GFX9-IEEE-NEXT: v_add_f32_e32 v7, 1.0, v7
+; GFX9-IEEE-NEXT: v_mul_f32_e64 v1, -v1, v6
+; GFX9-IEEE-NEXT: v_mul_f32_e32 v7, v7, v5
+; GFX9-IEEE-NEXT: v_add_f32_e32 v1, 1.0, v1
+; GFX9-IEEE-NEXT: v_mul_f32_e32 v1, v1, v4
+; GFX9-IEEE-NEXT: v_add_f32_e32 v4, v7, v5
+; GFX9-IEEE-NEXT: v_mul_f32_e64 v3, -v3, v4
+; GFX9-IEEE-NEXT: v_add_f32_e32 v3, 1.0, v3
+; GFX9-IEEE-NEXT: v_mul_f32_e32 v3, v3, v5
; GFX9-IEEE-NEXT: v_and_b32_e32 v1, 0xff800000, v1
; GFX9-IEEE-NEXT: v_and_b32_e32 v3, 0xff800000, v3
-; GFX9-IEEE-NEXT: v_add_f32_e32 v1, v1, v7
-; GFX9-IEEE-NEXT: v_add_f32_e32 v3, v3, v5
+; GFX9-IEEE-NEXT: v_add_f32_e32 v1, v1, v6
+; GFX9-IEEE-NEXT: v_add_f32_e32 v3, v3, v4
; GFX9-IEEE-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX9-IEEE-NEXT: v_cvt_f16_f32_e32 v3, v3
; GFX9-IEEE-NEXT: v_div_fixup_f16 v0, v1, v0, 1.0
@@ -1628,26 +1606,24 @@ define <2 x half> @v_rcp_v2f16(<2 x half> %x) {
; GFX9-FLUSH-LABEL: v_rcp_v2f16:
; GFX9-FLUSH: ; %bb.0:
; GFX9-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, v0
; GFX9-FLUSH-NEXT: v_lshrrev_b32_e32 v2, 16, v0
+; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, v0
; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v3, v2
-; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v4, 1.0
+; GFX9-FLUSH-NEXT: v_mov_b32_e32 v4, 1.0
; GFX9-FLUSH-NEXT: v_rcp_f32_e32 v1, v1
; GFX9-FLUSH-NEXT: v_rcp_f32_e32 v3, v3
-; GFX9-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v1
-; GFX9-FLUSH-NEXT: v_mad_mix_f32 v6, -v0, v5, 1.0 op_sel_hi:[1,0,1]
-; GFX9-FLUSH-NEXT: v_mac_f32_e32 v5, v6, v1
-; GFX9-FLUSH-NEXT: v_mul_f32_e32 v4, v4, v3
-; GFX9-FLUSH-NEXT: v_mad_mix_f32 v7, -v0, v5, 1.0 op_sel_hi:[1,0,1]
-; GFX9-FLUSH-NEXT: v_mad_mix_f32 v6, -v0, v4, 1.0 op_sel:[1,0,0] op_sel_hi:[1,0,1]
+; GFX9-FLUSH-NEXT: v_mad_mix_f32 v5, -v0, v1, v4 op_sel_hi:[1,0,0]
+; GFX9-FLUSH-NEXT: v_mad_mix_f32 v6, -v0, v3, v4 op_sel:[1,0,0] op_sel_hi:[1,0,0]
+; GFX9-FLUSH-NEXT: v_mad_f32 v5, v5, v1, v1
+; GFX9-FLUSH-NEXT: v_mad_f32 v6, v6, v3, v3
+; GFX9-FLUSH-NEXT: v_mad_mix_f32 v7, -v0, v5, v4 op_sel_hi:[1,0,0]
+; GFX9-FLUSH-NEXT: v_mad_mix_f32 v4, -v0, v6, v4 op_sel:[1,0,0] op_sel_hi:[1,0,0]
; GFX9-FLUSH-NEXT: v_mul_f32_e32 v1, v7, v1
+; GFX9-FLUSH-NEXT: v_mul_f32_e32 v3, v4, v3
; GFX9-FLUSH-NEXT: v_and_b32_e32 v1, 0xff800000, v1
-; GFX9-FLUSH-NEXT: v_mac_f32_e32 v4, v6, v3
-; GFX9-FLUSH-NEXT: v_add_f32_e32 v1, v1, v5
-; GFX9-FLUSH-NEXT: v_mad_mix_f32 v5, -v0, v4, 1.0 op_sel:[1,0,0] op_sel_hi:[1,0,1]
-; GFX9-FLUSH-NEXT: v_mul_f32_e32 v3, v5, v3
; GFX9-FLUSH-NEXT: v_and_b32_e32 v3, 0xff800000, v3
-; GFX9-FLUSH-NEXT: v_add_f32_e32 v3, v3, v4
+; GFX9-FLUSH-NEXT: v_add_f32_e32 v1, v1, v5
+; GFX9-FLUSH-NEXT: v_add_f32_e32 v3, v3, v6
; GFX9-FLUSH-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX9-FLUSH-NEXT: v_cvt_f16_f32_e32 v3, v3
; GFX9-FLUSH-NEXT: v_div_fixup_f16 v0, v1, v0, 1.0
@@ -1660,30 +1636,27 @@ define <2 x half> @v_rcp_v2f16(<2 x half> %x) {
; GFX10-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-IEEE-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GFX10-IEEE-NEXT: v_cvt_f32_f16_e32 v2, v0
-; GFX10-IEEE-NEXT: v_cvt_f32_f16_e32 v6, 1.0
; GFX10-IEEE-NEXT: v_cvt_f32_f16_e32 v3, v1
; GFX10-IEEE-NEXT: v_rcp_f32_e32 v4, v2
; GFX10-IEEE-NEXT: v_rcp_f32_e32 v5, v3
-; GFX10-IEEE-NEXT: v_mul_f32_e32 v7, v6, v4
-; GFX10-IEEE-NEXT: v_mul_f32_e32 v8, v6, v5
-; GFX10-IEEE-NEXT: v_mul_f32_e64 v9, -v2, v7
-; GFX10-IEEE-NEXT: v_mul_f32_e64 v10, -v3, v8
-; GFX10-IEEE-NEXT: v_add_f32_e32 v9, v9, v6
-; GFX10-IEEE-NEXT: v_add_f32_e32 v10, v10, v6
-; GFX10-IEEE-NEXT: v_mul_f32_e32 v9, v9, v4
-; GFX10-IEEE-NEXT: v_mul_f32_e32 v10, v10, v5
-; GFX10-IEEE-NEXT: v_add_f32_e32 v7, v9, v7
-; GFX10-IEEE-NEXT: v_add_f32_e32 v8, v10, v8
-; GFX10-IEEE-NEXT: v_mul_f32_e64 v2, -v2, v7
-; GFX10-IEEE-NEXT: v_mul_f32_e64 v3, -v3, v8
-; GFX10-IEEE-NEXT: v_add_f32_e32 v2, v2, v6
-; GFX10-IEEE-NEXT: v_add_f32_e32 v3, v3, v6
+; GFX10-IEEE-NEXT: v_mul_f32_e64 v6, -v2, v4
+; GFX10-IEEE-NEXT: v_mul_f32_e64 v7, -v3, v5
+; GFX10-IEEE-NEXT: v_add_f32_e32 v6, 1.0, v6
+; GFX10-IEEE-NEXT: v_add_f32_e32 v7, 1.0, v7
+; GFX10-IEEE-NEXT: v_mul_f32_e32 v6, v6, v4
+; GFX10-IEEE-NEXT: v_mul_f32_e32 v7, v7, v5
+; GFX10-IEEE-NEXT: v_add_f32_e32 v6, v6, v4
+; GFX10-IEEE-NEXT: v_add_f32_e32 v7, v7, v5
+; GFX10-IEEE-NEXT: v_mul_f32_e64 v2, -v2, v6
+; GFX10-IEEE-NEXT: v_mul_f32_e64 v3, -v3, v7
+; GFX10-IEEE-NEXT: v_add_f32_e32 v2, 1.0, v2
+; GFX10-IEEE-NEXT: v_add_f32_e32 v3, 1.0, v3
; GFX10-IEEE-NEXT: v_mul_f32_e32 v2, v2, v4
; GFX10-IEEE-NEXT: v_mul_f32_e32 v3, v3, v5
; GFX10-IEEE-NEXT: v_and_b32_e32 v2, 0xff800000, v2
; GFX10-IEEE-NEXT: v_and_b32_e32 v3, 0xff800000, v3
-; GFX10-IEEE-NEXT: v_add_f32_e32 v2, v2, v7
-; GFX10-IEEE-NEXT: v_add_f32_e32 v3, v3, v8
+; GFX10-IEEE-NEXT: v_add_f32_e32 v2, v2, v6
+; GFX10-IEEE-NEXT: v_add_f32_e32 v3, v3, v7
; GFX10-IEEE-NEXT: v_cvt_f16_f32_e32 v2, v2
; GFX10-IEEE-NEXT: v_cvt_f16_f32_e32 v3, v3
; GFX10-IEEE-NEXT: v_div_fixup_f16 v0, v2, v0, 1.0
@@ -1696,24 +1669,21 @@ define <2 x half> @v_rcp_v2f16(<2 x half> %x) {
; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-FLUSH-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GFX10-FLUSH-NEXT: v_cvt_f32_f16_e32 v2, v0
-; GFX10-FLUSH-NEXT: v_cvt_f32_f16_e32 v6, 1.0
; GFX10-FLUSH-NEXT: v_cvt_f32_f16_e32 v3, v1
; GFX10-FLUSH-NEXT: v_rcp_f32_e32 v4, v2
; GFX10-FLUSH-NEXT: v_rcp_f32_e32 v5, v3
-; GFX10-FLUSH-NEXT: v_mul_f32_e32 v7, v6, v4
-; GFX10-FLUSH-NEXT: v_mul_f32_e32 v8, v6, v5
-; GFX10-FLUSH-NEXT: v_mad_f32 v9, -v2, v7, v6
-; GFX10-FLUSH-NEXT: v_mad_f32 v10, -v3, v8, v6
-; GFX10-FLUSH-NEXT: v_mac_f32_e32 v7, v9, v4
-; GFX10-FLUSH-NEXT: v_mac_f32_e32 v8, v10, v5
-; GFX10-FLUSH-NEXT: v_mad_f32 v2, -v2, v7, v6
-; GFX10-FLUSH-NEXT: v_mad_f32 v3, -v3, v8, v6
+; GFX10-FLUSH-NEXT: v_mad_f32 v6, -v2, v4, 1.0
+; GFX10-FLUSH-NEXT: v_mad_f32 v7, -v3, v5, 1.0
+; GFX10-FLUSH-NEXT: v_mad_f32 v6, v6, v4, v4
+; GFX10-FLUSH-NEXT: v_mad_f32 v7, v7, v5, v5
+; GFX10-FLUSH-NEXT: v_mad_f32 v2, -v2, v6, 1.0
+; GFX10-FLUSH-NEXT: v_mad_f32 v3, -v3, v7, 1.0
; GFX10-FLUSH-NEXT: v_mul_f32_e32 v2, v2, v4
; GFX10-FLUSH-NEXT: v_mul_f32_e32 v3, v3, v5
; GFX10-FLUSH-NEXT: v_and_b32_e32 v2, 0xff800000, v2
; GFX10-FLUSH-NEXT: v_and_b32_e32 v3, 0xff800000, v3
-; GFX10-FLUSH-NEXT: v_add_f32_e32 v2, v2, v7
-; GFX10-FLUSH-NEXT: v_add_f32_e32 v3, v3, v8
+; GFX10-FLUSH-NEXT: v_add_f32_e32 v2, v2, v6
+; GFX10-FLUSH-NEXT: v_add_f32_e32 v3, v3, v7
; GFX10-FLUSH-NEXT: v_cvt_f16_f32_e32 v2, v2
; GFX10-FLUSH-NEXT: v_cvt_f16_f32_e32 v3, v3
; GFX10-FLUSH-NEXT: v_div_fixup_f16 v0, v2, v0, 1.0
@@ -1726,27 +1696,25 @@ define <2 x half> @v_rcp_v2f16(<2 x half> %x) {
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GFX11-NEXT: v_cvt_f32_f16_e32 v2, v0
-; GFX11-NEXT: v_cvt_f32_f16_e32 v4, 1.0
+; GFX11-NEXT: v_mov_b32_e32 v4, 1.0
; GFX11-NEXT: v_cvt_f32_f16_e32 v3, v1
; GFX11-NEXT: v_rcp_f32_e32 v2, v2
; GFX11-NEXT: v_rcp_f32_e32 v3, v3
; GFX11-NEXT: s_waitcnt_depctr 0xfff
-; GFX11-NEXT: v_mul_f32_e32 v5, v4, v2
-; GFX11-NEXT: v_mul_f32_e32 v4, v4, v3
-; GFX11-NEXT: v_fma_mix_f32 v7, -v0, v4, 1.0 op_sel:[1,0,0] op_sel_hi:[1,0,1]
-; GFX11-NEXT: v_fmac_f32_e32 v4, v7, v3
-; GFX11-NEXT: v_fma_mix_f32 v6, -v0, v5, 1.0 op_sel_hi:[1,0,1]
-; GFX11-NEXT: v_fma_mix_f32 v7, -v0, v4, 1.0 op_sel:[1,0,0] op_sel_hi:[1,0,1]
-; GFX11-NEXT: v_fmac_f32_e32 v5, v6, v2
-; GFX11-NEXT: v_mul_f32_e32 v3, v7, v3
-; GFX11-NEXT: v_fma_mix_f32 v6, -v0, v5, 1.0 op_sel_hi:[1,0,1]
-; GFX11-NEXT: v_dual_mul_f32 v2, v6, v2 :: v_dual_and_b32 v3, 0xff800000, v3
-; GFX11-NEXT: v_dual_add_f32 v3, v3, v4 :: v_dual_and_b32 v2, 0xff800000, v2
-; GFX11-NEXT: v_cvt_f16_f32_e32 v3, v3
-; GFX11-NEXT: v_add_f32_e32 v2, v2, v5
-; GFX11-NEXT: v_div_fixup_f16 v1, v3, v1, 1.0
+; GFX11-NEXT: v_fma_mix_f32 v5, -v0, v2, v4 op_sel_hi:[1,0,0]
+; GFX11-NEXT: v_fma_mix_f32 v6, -v0, v3, v4 op_sel:[1,0,0] op_sel_hi:[1,0,0]
+; GFX11-NEXT: v_fma_f32 v5, v5, v2, v2
+; GFX11-NEXT: v_fma_f32 v6, v6, v3, v3
+; GFX11-NEXT: v_fma_mix_f32 v7, -v0, v5, v4 op_sel_hi:[1,0,0]
+; GFX11-NEXT: v_fma_mix_f32 v4, -v0, v6, v4 op_sel:[1,0,0] op_sel_hi:[1,0,0]
+; GFX11-NEXT: v_dual_mul_f32 v2, v7, v2 :: v_dual_mul_f32 v3, v4, v3
+; GFX11-NEXT: v_and_b32_e32 v2, 0xff800000, v2
+; GFX11-NEXT: v_dual_add_f32 v2, v2, v5 :: v_dual_and_b32 v3, 0xff800000, v3
+; GFX11-NEXT: v_add_f32_e32 v3, v3, v6
; GFX11-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GFX11-NEXT: v_cvt_f16_f32_e32 v3, v3
; GFX11-NEXT: v_div_fixup_f16 v0, v2, v0, 1.0
+; GFX11-NEXT: v_div_fixup_f16 v1, v3, v1, 1.0
; GFX11-NEXT: v_pack_b32_f16 v0, v0, v1
; GFX11-NEXT: s_setpc_b64 s[30:31]
%fdiv = fdiv <2 x half> <half 1.0, half 1.0>, %x
@@ -1757,70 +1725,67 @@ define <2 x half> @v_neg_rcp_v2f16(<2 x half> %x) {
; GFX6-IEEE-LABEL: v_neg_rcp_v2f16:
; GFX6-IEEE: ; %bb.0:
; GFX6-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v2, -1.0
; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, v1
-; GFX6-IEEE-NEXT: v_div_scale_f32 v3, s[4:5], v0, v0, v2
-; GFX6-IEEE-NEXT: v_rcp_f32_e32 v4, v3
-; GFX6-IEEE-NEXT: v_div_scale_f32 v5, vcc, v2, v0, v2
-; GFX6-IEEE-NEXT: v_fma_f32 v6, -v3, v4, 1.0
-; GFX6-IEEE-NEXT: v_fma_f32 v4, v6, v4, v4
-; GFX6-IEEE-NEXT: v_mul_f32_e32 v6, v5, v4
-; GFX6-IEEE-NEXT: v_fma_f32 v7, -v3, v6, v5
-; GFX6-IEEE-NEXT: v_fma_f32 v6, v7, v4, v6
-; GFX6-IEEE-NEXT: v_fma_f32 v3, -v3, v6, v5
-; GFX6-IEEE-NEXT: v_div_fmas_f32 v3, v3, v4, v6
-; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v3, v0, v2
-; GFX6-IEEE-NEXT: v_div_scale_f32 v3, s[4:5], v1, v1, v2
-; GFX6-IEEE-NEXT: v_rcp_f32_e32 v4, v3
-; GFX6-IEEE-NEXT: v_div_scale_f32 v5, vcc, v2, v1, v2
+; GFX6-IEEE-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, -1.0
+; GFX6-IEEE-NEXT: v_rcp_f32_e32 v3, v2
+; GFX6-IEEE-NEXT: v_div_scale_f32 v4, vcc, -1.0, v0, -1.0
+; GFX6-IEEE-NEXT: v_fma_f32 v5, -v2, v3, 1.0
+; GFX6-IEEE-NEXT: v_fma_f32 v3, v5, v3, v3
+; GFX6-IEEE-NEXT: v_mul_f32_e32 v5, v4, v3
+; GFX6-IEEE-NEXT: v_fma_f32 v6, -v2, v5, v4
+; GFX6-IEEE-NEXT: v_fma_f32 v5, v6, v3, v5
+; GFX6-IEEE-NEXT: v_fma_f32 v2, -v2, v5, v4
+; GFX6-IEEE-NEXT: v_div_fmas_f32 v2, v2, v3, v5
+; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v2, v0, -1.0
+; GFX6-IEEE-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, -1.0
+; GFX6-IEEE-NEXT: v_rcp_f32_e32 v3, v2
+; GFX6-IEEE-NEXT: v_div_scale_f32 v4, vcc, -1.0, v1, -1.0
; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX6-IEEE-NEXT: v_fma_f32 v6, -v3, v4, 1.0
-; GFX6-IEEE-NEXT: v_fma_f32 v4, v6, v4, v4
-; GFX6-IEEE-NEXT: v_mul_f32_e32 v6, v5, v4
-; GFX6-IEEE-NEXT: v_fma_f32 v7, -v3, v6, v5
-; GFX6-IEEE-NEXT: v_fma_f32 v6, v7, v4, v6
-; GFX6-IEEE-NEXT: v_fma_f32 v3, -v3, v6, v5
-; GFX6-IEEE-NEXT: v_div_fmas_f32 v3, v3, v4, v6
-; GFX6-IEEE-NEXT: v_div_fixup_f32 v1, v3, v1, v2
+; GFX6-IEEE-NEXT: v_fma_f32 v5, -v2, v3, 1.0
+; GFX6-IEEE-NEXT: v_fma_f32 v3, v5, v3, v3
+; GFX6-IEEE-NEXT: v_mul_f32_e32 v5, v4, v3
+; GFX6-IEEE-NEXT: v_fma_f32 v6, -v2, v5, v4
+; GFX6-IEEE-NEXT: v_fma_f32 v5, v6, v3, v5
+; GFX6-IEEE-NEXT: v_fma_f32 v2, -v2, v5, v4
+; GFX6-IEEE-NEXT: v_div_fmas_f32 v2, v2, v3, v5
+; GFX6-IEEE-NEXT: v_div_fixup_f32 v1, v2, v1, -1.0
; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX6-IEEE-NEXT: s_setpc_b64 s[30:31]
;
; GFX6-FLUSH-LABEL: v_neg_rcp_v2f16:
; GFX6-FLUSH: ; %bb.0:
; GFX6-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v2, -1.0
; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX6-FLUSH-NEXT: v_div_scale_f32 v3, s[4:5], v0, v0, v2
-; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v4, v3
-; GFX6-FLUSH-NEXT: v_div_scale_f32 v5, vcc, v2, v0, v2
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, -1.0
+; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v3, v2
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v4, vcc, -1.0, v0, -1.0
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
-; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v3, v4, 1.0
-; GFX6-FLUSH-NEXT: v_fma_f32 v4, v6, v4, v4
-; GFX6-FLUSH-NEXT: v_mul_f32_e32 v6, v5, v4
-; GFX6-FLUSH-NEXT: v_fma_f32 v7, -v3, v6, v5
-; GFX6-FLUSH-NEXT: v_fma_f32 v6, v7, v4, v6
-; GFX6-FLUSH-NEXT: v_fma_f32 v3, -v3, v6, v5
+; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v2, v3, 1.0
+; GFX6-FLUSH-NEXT: v_fma_f32 v3, v5, v3, v3
+; GFX6-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v3
+; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v5, v4
+; GFX6-FLUSH-NEXT: v_fma_f32 v5, v6, v3, v5
+; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v5, v4
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
-; GFX6-FLUSH-NEXT: v_div_fmas_f32 v3, v3, v4, v6
-; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v4, -1.0
; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, v1
-; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v3, v0, v2
+; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v5
+; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v2, v0, -1.0
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, v4
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, -1.0
; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v3, v2
-; GFX6-FLUSH-NEXT: v_div_scale_f32 v5, vcc, v4, v1, v4
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v4, vcc, -1.0, v1, -1.0
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
-; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v3, 1.0
-; GFX6-FLUSH-NEXT: v_fma_f32 v3, v6, v3, v3
-; GFX6-FLUSH-NEXT: v_mul_f32_e32 v6, v5, v3
-; GFX6-FLUSH-NEXT: v_fma_f32 v7, -v2, v6, v5
-; GFX6-FLUSH-NEXT: v_fma_f32 v6, v7, v3, v6
-; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v6, v5
+; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v2, v3, 1.0
+; GFX6-FLUSH-NEXT: v_fma_f32 v3, v5, v3, v3
+; GFX6-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v3
+; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v5, v4
+; GFX6-FLUSH-NEXT: v_fma_f32 v5, v6, v3, v5
+; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v5, v4
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
-; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v6
-; GFX6-FLUSH-NEXT: v_div_fixup_f32 v1, v2, v1, v4
+; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v5
+; GFX6-FLUSH-NEXT: v_div_fixup_f32 v1, v2, v1, -1.0
; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX6-FLUSH-NEXT: s_setpc_b64 s[30:31]
;
@@ -1829,30 +1794,27 @@ define <2 x half> @v_neg_rcp_v2f16(<2 x half> %x) {
; GFX8-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-IEEE-NEXT: v_cvt_f32_f16_e32 v1, v0
; GFX8-IEEE-NEXT: v_lshrrev_b32_e32 v2, 16, v0
-; GFX8-IEEE-NEXT: v_cvt_f32_f16_e32 v4, -1.0
; GFX8-IEEE-NEXT: v_cvt_f32_f16_e32 v3, v2
-; GFX8-IEEE-NEXT: v_rcp_f32_e32 v5, v1
-; GFX8-IEEE-NEXT: v_rcp_f32_e32 v6, v3
-; GFX8-IEEE-NEXT: v_mul_f32_e32 v7, v4, v5
-; GFX8-IEEE-NEXT: v_mul_f32_e64 v8, -v1, v7
-; GFX8-IEEE-NEXT: v_add_f32_e32 v8, v8, v4
-; GFX8-IEEE-NEXT: v_mul_f32_e32 v8, v8, v5
-; GFX8-IEEE-NEXT: v_add_f32_e32 v7, v8, v7
-; GFX8-IEEE-NEXT: v_mul_f32_e32 v9, v4, v6
-; GFX8-IEEE-NEXT: v_mul_f32_e64 v1, -v1, v7
-; GFX8-IEEE-NEXT: v_mul_f32_e64 v10, -v3, v9
-; GFX8-IEEE-NEXT: v_add_f32_e32 v1, v1, v4
-; GFX8-IEEE-NEXT: v_mul_f32_e32 v1, v1, v5
-; GFX8-IEEE-NEXT: v_add_f32_e32 v5, v10, v4
-; GFX8-IEEE-NEXT: v_mul_f32_e32 v5, v5, v6
-; GFX8-IEEE-NEXT: v_add_f32_e32 v5, v5, v9
-; GFX8-IEEE-NEXT: v_mul_f32_e64 v3, -v3, v5
-; GFX8-IEEE-NEXT: v_add_f32_e32 v3, v3, v4
-; GFX8-IEEE-NEXT: v_mul_f32_e32 v3, v3, v6
+; GFX8-IEEE-NEXT: v_rcp_f32_e32 v4, v1
+; GFX8-IEEE-NEXT: v_rcp_f32_e32 v5, v3
+; GFX8-IEEE-NEXT: v_mul_f32_e32 v6, v1, v4
+; GFX8-IEEE-NEXT: v_add_f32_e32 v6, -1.0, v6
+; GFX8-IEEE-NEXT: v_mul_f32_e32 v6, v6, v4
+; GFX8-IEEE-NEXT: v_mul_f32_e32 v7, v3, v5
+; GFX8-IEEE-NEXT: v_sub_f32_e32 v6, v6, v4
+; GFX8-IEEE-NEXT: v_add_f32_e32 v7, -1.0, v7
+; GFX8-IEEE-NEXT: v_mul_f32_e64 v1, -v1, v6
+; GFX8-IEEE-NEXT: v_mul_f32_e32 v7, v7, v5
+; GFX8-IEEE-NEXT: v_add_f32_e32 v1, -1.0, v1
+; GFX8-IEEE-NEXT: v_mul_f32_e32 v1, v1, v4
+; GFX8-IEEE-NEXT: v_sub_f32_e32 v4, v7, v5
+; GFX8-IEEE-NEXT: v_mul_f32_e64 v3, -v3, v4
+; GFX8-IEEE-NEXT: v_add_f32_e32 v3, -1.0, v3
+; GFX8-IEEE-NEXT: v_mul_f32_e32 v3, v3, v5
; GFX8-IEEE-NEXT: v_and_b32_e32 v1, 0xff800000, v1
; GFX8-IEEE-NEXT: v_and_b32_e32 v3, 0xff800000, v3
-; GFX8-IEEE-NEXT: v_add_f32_e32 v1, v1, v7
-; GFX8-IEEE-NEXT: v_add_f32_e32 v3, v3, v5
+; GFX8-IEEE-NEXT: v_add_f32_e32 v1, v1, v6
+; GFX8-IEEE-NEXT: v_add_f32_e32 v3, v3, v4
; GFX8-IEEE-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX8-IEEE-NEXT: v_cvt_f16_f32_e32 v3, v3
; GFX8-IEEE-NEXT: v_div_fixup_f16 v0, v1, v0, -1.0
@@ -1864,26 +1826,23 @@ define <2 x half> @v_neg_rcp_v2f16(<2 x half> %x) {
; GFX8-FLUSH-LABEL: v_neg_rcp_v2f16:
; GFX8-FLUSH: ; %bb.0:
; GFX8-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, v0
; GFX8-FLUSH-NEXT: v_lshrrev_b32_e32 v2, 16, v0
+; GFX8-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, v0
; GFX8-FLUSH-NEXT: v_cvt_f32_f16_e32 v3, v2
-; GFX8-FLUSH-NEXT: v_cvt_f32_f16_e32 v4, -1.0
-; GFX8-FLUSH-NEXT: v_rcp_f32_e32 v5, v1
-; GFX8-FLUSH-NEXT: v_rcp_f32_e32 v6, v3
-; GFX8-FLUSH-NEXT: v_mul_f32_e32 v7, v4, v5
-; GFX8-FLUSH-NEXT: v_mad_f32 v8, -v1, v7, v4
-; GFX8-FLUSH-NEXT: v_mul_f32_e32 v9, v4, v6
-; GFX8-FLUSH-NEXT: v_mac_f32_e32 v7, v8, v5
-; GFX8-FLUSH-NEXT: v_mad_f32 v8, -v3, v9, v4
-; GFX8-FLUSH-NEXT: v_mac_f32_e32 v9, v8, v6
-; GFX8-FLUSH-NEXT: v_mad_f32 v1, -v1, v7, v4
-; GFX8-FLUSH-NEXT: v_mad_f32 v3, -v3, v9, v4
-; GFX8-FLUSH-NEXT: v_mul_f32_e32 v1, v1, v5
-; GFX8-FLUSH-NEXT: v_mul_f32_e32 v3, v3, v6
+; GFX8-FLUSH-NEXT: v_rcp_f32_e32 v4, v1
+; GFX8-FLUSH-NEXT: v_rcp_f32_e32 v5, v3
+; GFX8-FLUSH-NEXT: v_mad_f32 v6, v1, v4, -1.0
+; GFX8-FLUSH-NEXT: v_mad_f32 v7, v3, v5, -1.0
+; GFX8-FLUSH-NEXT: v_mad_f32 v6, v6, v4, -v4
+; GFX8-FLUSH-NEXT: v_mad_f32 v7, v7, v5, -v5
+; GFX8-FLUSH-NEXT: v_mad_f32 v1, -v1, v6, -1.0
+; GFX8-FLUSH-NEXT: v_mad_f32 v3, -v3, v7, -1.0
+; GFX8-FLUSH-NEXT: v_mul_f32_e32 v1, v1, v4
+; GFX8-FLUSH-NEXT: v_mul_f32_e32 v3, v3, v5
; GFX8-FLUSH-NEXT: v_and_b32_e32 v1, 0xff800000, v1
; GFX8-FLUSH-NEXT: v_and_b32_e32 v3, 0xff800000, v3
-; GFX8-FLUSH-NEXT: v_add_f32_e32 v1, v1, v7
-; GFX8-FLUSH-NEXT: v_add_f32_e32 v3, v3, v9
+; GFX8-FLUSH-NEXT: v_add_f32_e32 v1, v1, v6
+; GFX8-FLUSH-NEXT: v_add_f32_e32 v3, v3, v7
; GFX8-FLUSH-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX8-FLUSH-NEXT: v_cvt_f16_f32_e32 v3, v3
; GFX8-FLUSH-NEXT: v_div_fixup_f16 v0, v1, v0, -1.0
@@ -1897,30 +1856,27 @@ define <2 x half> @v_neg_rcp_v2f16(<2 x half> %x) {
; GFX9-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v1, v0
; GFX9-IEEE-NEXT: v_lshrrev_b32_e32 v2, 16, v0
-; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v4, -1.0
; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v3, v2
-; GFX9-IEEE-NEXT: v_rcp_f32_e32 v5, v1
-; GFX9-IEEE-NEXT: v_rcp_f32_e32 v6, v3
-; GFX9-IEEE-NEXT: v_mul_f32_e32 v7, v4, v5
-; GFX9-IEEE-NEXT: v_mul_f32_e64 v8, -v1, v7
-; GFX9-IEEE-NEXT: v_add_f32_e32 v8, v8, v4
-; GFX9-IEEE-NEXT: v_mul_f32_e32 v8, v8, v5
-; GFX9-IEEE-NEXT: v_add_f32_e32 v7, v8, v7
-; GFX9-IEEE-NEXT: v_mul_f32_e32 v9, v4, v6
-; GFX9-IEEE-NEXT: v_mul_f32_e64 v1, -v1, v7
-; GFX9-IEEE-NEXT: v_mul_f32_e64 v10, -v3, v9
-; GFX9-IEEE-NEXT: v_add_f32_e32 v1, v1, v4
-; GFX9-IEEE-NEXT: v_mul_f32_e32 v1, v1, v5
-; GFX9-IEEE-NEXT: v_add_f32_e32 v5, v10, v4
-; GFX9-IEEE-NEXT: v_mul_f32_e32 v5, v5, v6
-; GFX9-IEEE-NEXT: v_add_f32_e32 v5, v5, v9
-; GFX9-IEEE-NEXT: v_mul_f32_e64 v3, -v3, v5
-; GFX9-IEEE-NEXT: v_add_f32_e32 v3, v3, v4
-; GFX9-IEEE-NEXT: v_mul_f32_e32 v3, v3, v6
+; GFX9-IEEE-NEXT: v_rcp_f32_e32 v4, v1
+; GFX9-IEEE-NEXT: v_rcp_f32_e32 v5, v3
+; GFX9-IEEE-NEXT: v_mul_f32_e32 v6, v1, v4
+; GFX9-IEEE-NEXT: v_add_f32_e32 v6, -1.0, v6
+; GFX9-IEEE-NEXT: v_mul_f32_e32 v6, v6, v4
+; GFX9-IEEE-NEXT: v_mul_f32_e32 v7, v3, v5
+; GFX9-IEEE-NEXT: v_sub_f32_e32 v6, v6, v4
+; GFX9-IEEE-NEXT: v_add_f32_e32 v7, -1.0, v7
+; GFX9-IEEE-NEXT: v_mul_f32_e64 v1, -v1, v6
+; GFX9-IEEE-NEXT: v_mul_f32_e32 v7, v7, v5
+; GFX9-IEEE-NEXT: v_add_f32_e32 v1, -1.0, v1
+; GFX9-IEEE-NEXT: v_mul_f32_e32 v1, v1, v4
+; GFX9-IEEE-NEXT: v_sub_f32_e32 v4, v7, v5
+; GFX9-IEEE-NEXT: v_mul_f32_e64 v3, -v3, v4
+; GFX9-IEEE-NEXT: v_add_f32_e32 v3, -1.0, v3
+; GFX9-IEEE-NEXT: v_mul_f32_e32 v3, v3, v5
; GFX9-IEEE-NEXT: v_and_b32_e32 v1, 0xff800000, v1
; GFX9-IEEE-NEXT: v_and_b32_e32 v3, 0xff800000, v3
-; GFX9-IEEE-NEXT: v_add_f32_e32 v1, v1, v7
-; GFX9-IEEE-NEXT: v_add_f32_e32 v3, v3, v5
+; GFX9-IEEE-NEXT: v_add_f32_e32 v1, v1, v6
+; GFX9-IEEE-NEXT: v_add_f32_e32 v3, v3, v4
; GFX9-IEEE-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX9-IEEE-NEXT: v_cvt_f16_f32_e32 v3, v3
; GFX9-IEEE-NEXT: v_div_fixup_f16 v0, v1, v0, -1.0
@@ -1931,26 +1887,24 @@ define <2 x half> @v_neg_rcp_v2f16(<2 x half> %x) {
; GFX9-FLUSH-LABEL: v_neg_rcp_v2f16:
; GFX9-FLUSH: ; %bb.0:
; GFX9-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, v0
; GFX9-FLUSH-NEXT: v_lshrrev_b32_e32 v2, 16, v0
+; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, v0
; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v3, v2
-; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v4, -1.0
+; GFX9-FLUSH-NEXT: v_mov_b32_e32 v4, -1.0
; GFX9-FLUSH-NEXT: v_rcp_f32_e32 v1, v1
; GFX9-FLUSH-NEXT: v_rcp_f32_e32 v3, v3
-; GFX9-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v1
-; GFX9-FLUSH-NEXT: v_mad_mix_f32 v6, -v0, v5, -1.0 op_sel_hi:[1,0,1]
-; GFX9-FLUSH-NEXT: v_mac_f32_e32 v5, v6, v1
-; GFX9-FLUSH-NEXT: v_mul_f32_e32 v4, v4, v3
-; GFX9-FLUSH-NEXT: v_mad_mix_f32 v7, -v0, v5, -1.0 op_sel_hi:[1,0,1]
-; GFX9-FLUSH-NEXT: v_mad_mix_f32 v6, -v0, v4, -1.0 op_sel:[1,0,0] op_sel_hi:[1,0,1]
+; GFX9-FLUSH-NEXT: v_mad_mix_f32 v5, v0, v1, v4 op_sel_hi:[1,0,0]
+; GFX9-FLUSH-NEXT: v_mad_mix_f32 v6, v0, v3, v4 op_sel:[1,0,0] op_sel_hi:[1,0,0]
+; GFX9-FLUSH-NEXT: v_mad_f32 v5, v5, v1, -v1
+; GFX9-FLUSH-NEXT: v_mad_f32 v6, v6, v3, -v3
+; GFX9-FLUSH-NEXT: v_mad_mix_f32 v7, -v0, v5, v4 op_sel_hi:[1,0,0]
+; GFX9-FLUSH-NEXT: v_mad_mix_f32 v4, -v0, v6, v4 op_sel:[1,0,0] op_sel_hi:[1,0,0]
; GFX9-FLUSH-NEXT: v_mul_f32_e32 v1, v7, v1
+; GFX9-FLUSH-NEXT: v_mul_f32_e32 v3, v4, v3
; GFX9-FLUSH-NEXT: v_and_b32_e32 v1, 0xff800000, v1
-; GFX9-FLUSH-NEXT: v_mac_f32_e32 v4, v6, v3
-; GFX9-FLUSH-NEXT: v_add_f32_e32 v1, v1, v5
-; GFX9-FLUSH-NEXT: v_mad_mix_f32 v5, -v0, v4, -1.0 op_sel:[1,0,0] op_sel_hi:[1,0,1]
-; GFX9-FLUSH-NEXT: v_mul_f32_e32 v3, v5, v3
; GFX9-FLUSH-NEXT: v_and_b32_e32 v3, 0xff800000, v3
-; GFX9-FLUSH-NEXT: v_add_f32_e32 v3, v3, v4
+; GFX9-FLUSH-NEXT: v_add_f32_e32 v1, v1, v5
+; GFX9-FLUSH-NEXT: v_add_f32_e32 v3, v3, v6
; GFX9-FLUSH-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX9-FLUSH-NEXT: v_cvt_f16_f32_e32 v3, v3
; GFX9-FLUSH-NEXT: v_div_fixup_f16 v0, v1, v0, -1.0
@@ -1963,30 +1917,27 @@ define <2 x half> @v_neg_rcp_v2f16(<2 x half> %x) {
; GFX10-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-IEEE-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GFX10-IEEE-NEXT: v_cvt_f32_f16_e32 v2, v0
-; GFX10-IEEE-NEXT: v_cvt_f32_f16_e32 v6, -1.0
; GFX10-IEEE-NEXT: v_cvt_f32_f16_e32 v3, v1
; GFX10-IEEE-NEXT: v_rcp_f32_e32 v4, v2
; GFX10-IEEE-NEXT: v_rcp_f32_e32 v5, v3
-; GFX10-IEEE-NEXT: v_mul_f32_e32 v7, v6, v4
-; GFX10-IEEE-NEXT: v_mul_f32_e32 v8, v6, v5
-; GFX10-IEEE-NEXT: v_mul_f32_e64 v9, -v2, v7
-; GFX10-IEEE-NEXT: v_mul_f32_e64 v10, -v3, v8
-; GFX10-IEEE-NEXT: v_add_f32_e32 v9, v9, v6
-; GFX10-IEEE-NEXT: v_add_f32_e32 v10, v10, v6
-; GFX10-IEEE-NEXT: v_mul_f32_e32 v9, v9, v4
-; GFX10-IEEE-NEXT: v_mul_f32_e32 v10, v10, v5
-; GFX10-IEEE-NEXT: v_add_f32_e32 v7, v9, v7
-; GFX10-IEEE-NEXT: v_add_f32_e32 v8, v10, v8
-; GFX10-IEEE-NEXT: v_mul_f32_e64 v2, -v2, v7
-; GFX10-IEEE-NEXT: v_mul_f32_e64 v3, -v3, v8
-; GFX10-IEEE-NEXT: v_add_f32_e32 v2, v2, v6
-; GFX10-IEEE-NEXT: v_add_f32_e32 v3, v3, v6
+; GFX10-IEEE-NEXT: v_mul_f32_e32 v6, v2, v4
+; GFX10-IEEE-NEXT: v_mul_f32_e32 v7, v3, v5
+; GFX10-IEEE-NEXT: v_add_f32_e32 v6, -1.0, v6
+; GFX10-IEEE-NEXT: v_add_f32_e32 v7, -1.0, v7
+; GFX10-IEEE-NEXT: v_mul_f32_e32 v6, v6, v4
+; GFX10-IEEE-NEXT: v_mul_f32_e32 v7, v7, v5
+; GFX10-IEEE-NEXT: v_sub_f32_e32 v6, v6, v4
+; GFX10-IEEE-NEXT: v_sub_f32_e32 v7, v7, v5
+; GFX10-IEEE-NEXT: v_mul_f32_e64 v2, -v2, v6
+; GFX10-IEEE-NEXT: v_mul_f32_e64 v3, -v3, v7
+; GFX10-IEEE-NEXT: v_add_f32_e32 v2, -1.0, v2
+; GFX10-IEEE-NEXT: v_add_f32_e32 v3, -1.0, v3
; GFX10-IEEE-NEXT: v_mul_f32_e32 v2, v2, v4
; GFX10-IEEE-NEXT: v_mul_f32_e32 v3, v3, v5
; GFX10-IEEE-NEXT: v_and_b32_e32 v2, 0xff800000, v2
; GFX10-IEEE-NEXT: v_and_b32_e32 v3, 0xff800000, v3
-; GFX10-IEEE-NEXT: v_add_f32_e32 v2, v2, v7
-; GFX10-IEEE-NEXT: v_add_f32_e32 v3, v3, v8
+; GFX10-IEEE-NEXT: v_add_f32_e32 v2, v2, v6
+; GFX10-IEEE-NEXT: v_add_f32_e32 v3, v3, v7
; GFX10-IEEE-NEXT: v_cvt_f16_f32_e32 v2, v2
; GFX10-IEEE-NEXT: v_cvt_f16_f32_e32 v3, v3
; GFX10-IEEE-NEXT: v_div_fixup_f16 v0, v2, v0, -1.0
@@ -1999,24 +1950,21 @@ define <2 x half> @v_neg_rcp_v2f16(<2 x half> %x) {
; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-FLUSH-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GFX10-FLUSH-NEXT: v_cvt_f32_f16_e32 v2, v0
-; GFX10-FLUSH-NEXT: v_cvt_f32_f16_e32 v6, -1.0
; GFX10-FLUSH-NEXT: v_cvt_f32_f16_e32 v3, v1
; GFX10-FLUSH-NEXT: v_rcp_f32_e32 v4, v2
; GFX10-FLUSH-NEXT: v_rcp_f32_e32 v5, v3
-; GFX10-FLUSH-NEXT: v_mul_f32_e32 v7, v6, v4
-; GFX10-FLUSH-NEXT: v_mul_f32_e32 v8, v6, v5
-; GFX10-FLUSH-NEXT: v_mad_f32 v9, -v2, v7, v6
-; GFX10-FLUSH-NEXT: v_mad_f32 v10, -v3, v8, v6
-; GFX10-FLUSH-NEXT: v_mac_f32_e32 v7, v9, v4
-; GFX10-FLUSH-NEXT: v_mac_f32_e32 v8, v10, v5
-; GFX10-FLUSH-NEXT: v_mad_f32 v2, -v2, v7, v6
-; GFX10-FLUSH-NEXT: v_mad_f32 v3, -v3, v8, v6
+; GFX10-FLUSH-NEXT: v_mad_f32 v6, v2, v4, -1.0
+; GFX10-FLUSH-NEXT: v_mad_f32 v7, v3, v5, -1.0
+; GFX10-FLUSH-NEXT: v_mad_f32 v6, v6, v4, -v4
+; GFX10-FLUSH-NEXT: v_mad_f32 v7, v7, v5, -v5
+; GFX10-FLUSH-NEXT: v_mad_f32 v2, -v2, v6, -1.0
+; GFX10-FLUSH-NEXT: v_mad_f32 v3, -v3, v7, -1.0
; GFX10-FLUSH-NEXT: v_mul_f32_e32 v2, v2, v4
; GFX10-FLUSH-NEXT: v_mul_f32_e32 v3, v3, v5
; GFX10-FLUSH-NEXT: v_and_b32_e32 v2, 0xff800000, v2
; GFX10-FLUSH-NEXT: v_and_b32_e32 v3, 0xff800000, v3
-; GFX10-FLUSH-NEXT: v_add_f32_e32 v2, v2, v7
-; GFX10-FLUSH-NEXT: v_add_f32_e32 v3, v3, v8
+; GFX10-FLUSH-NEXT: v_add_f32_e32 v2, v2, v6
+; GFX10-FLUSH-NEXT: v_add_f32_e32 v3, v3, v7
; GFX10-FLUSH-NEXT: v_cvt_f16_f32_e32 v2, v2
; GFX10-FLUSH-NEXT: v_cvt_f16_f32_e32 v3, v3
; GFX10-FLUSH-NEXT: v_div_fixup_f16 v0, v2, v0, -1.0
@@ -2029,27 +1977,25 @@ define <2 x half> @v_neg_rcp_v2f16(<2 x half> %x) {
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GFX11-NEXT: v_cvt_f32_f16_e32 v2, v0
-; GFX11-NEXT: v_cvt_f32_f16_e32 v4, -1.0
+; GFX11-NEXT: v_mov_b32_e32 v4, -1.0
; GFX11-NEXT: v_cvt_f32_f16_e32 v3, v1
; GFX11-NEXT: v_rcp_f32_e32 v2, v2
; GFX11-NEXT: v_rcp_f32_e32 v3, v3
; GFX11-NEXT: s_waitcnt_depctr 0xfff
-; GFX11-NEXT: v_mul_f32_e32 v5, v4, v2
-; GFX11-NEXT: v_mul_f32_e32 v4, v4, v3
-; GFX11-NEXT: v_fma_mix_f32 v7, -v0, v4, -1.0 op_sel:[1,0,0] op_sel_hi:[1,0,1]
-; GFX11-NEXT: v_fmac_f32_e32 v4, v7, v3
-; GFX11-NEXT: v_fma_mix_f32 v6, -v0, v5, -1.0 op_sel_hi:[1,0,1]
-; GFX11-NEXT: v_fma_mix_f32 v7, -v0, v4, -1.0 op_sel:[1,0,0] op_sel_hi:[1,0,1]
-; GFX11-NEXT: v_fmac_f32_e32 v5, v6, v2
-; GFX11-NEXT: v_mul_f32_e32 v3, v7, v3
-; GFX11-NEXT: v_fma_mix_f32 v6, -v0, v5, -1.0 op_sel_hi:[1,0,1]
-; GFX11-NEXT: v_dual_mul_f32 v2, v6, v2 :: v_dual_and_b32 v3, 0xff800000, v3
-; GFX11-NEXT: v_dual_add_f32 v3, v3, v4 :: v_dual_and_b32 v2, 0xff800000, v2
-; GFX11-NEXT: v_cvt_f16_f32_e32 v3, v3
-; GFX11-NEXT: v_add_f32_e32 v2, v2, v5
-; GFX11-NEXT: v_div_fixup_f16 v1, v3, v1, -1.0
+; GFX11-NEXT: v_fma_mix_f32 v5, v0, v2, v4 op_sel_hi:[1,0,0]
+; GFX11-NEXT: v_fma_mix_f32 v6, v0, v3, v4 op_sel:[1,0,0] op_sel_hi:[1,0,0]
+; GFX11-NEXT: v_fma_f32 v5, v5, v2, -v2
+; GFX11-NEXT: v_fma_f32 v6, v6, v3, -v3
+; GFX11-NEXT: v_fma_mix_f32 v7, -v0, v5, v4 op_sel_hi:[1,0,0]
+; GFX11-NEXT: v_fma_mix_f32 v4, -v0, v6, v4 op_sel:[1,0,0] op_sel_hi:[1,0,0]
+; GFX11-NEXT: v_dual_mul_f32 v2, v7, v2 :: v_dual_mul_f32 v3, v4, v3
+; GFX11-NEXT: v_and_b32_e32 v2, 0xff800000, v2
+; GFX11-NEXT: v_dual_add_f32 v2, v2, v5 :: v_dual_and_b32 v3, 0xff800000, v3
+; GFX11-NEXT: v_add_f32_e32 v3, v3, v6
; GFX11-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GFX11-NEXT: v_cvt_f16_f32_e32 v3, v3
; GFX11-NEXT: v_div_fixup_f16 v0, v2, v0, -1.0
+; GFX11-NEXT: v_div_fixup_f16 v1, v3, v1, -1.0
; GFX11-NEXT: v_pack_b32_f16 v0, v0, v1
; GFX11-NEXT: s_setpc_b64 s[30:31]
%fdiv = fdiv <2 x half> <half -1.0, half -1.0>, %x
@@ -2064,33 +2010,32 @@ define <2 x half> @v_rcp_v2f16_fabs(<2 x half> %x) {
; GFX6-IEEE-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX6-IEEE-NEXT: v_or_b32_e32 v0, v1, v0
; GFX6-IEEE-NEXT: v_and_b32_e32 v0, 0x7fff7fff, v0
-; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, 1.0
-; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v2, v0
+; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, v0
; GFX6-IEEE-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX6-IEEE-NEXT: v_div_scale_f32 v3, s[4:5], v2, v2, v1
-; GFX6-IEEE-NEXT: v_rcp_f32_e32 v4, v3
-; GFX6-IEEE-NEXT: v_div_scale_f32 v5, vcc, v1, v2, v1
-; GFX6-IEEE-NEXT: v_fma_f32 v6, -v3, v4, 1.0
-; GFX6-IEEE-NEXT: v_fma_f32 v4, v6, v4, v4
-; GFX6-IEEE-NEXT: v_mul_f32_e32 v6, v5, v4
-; GFX6-IEEE-NEXT: v_fma_f32 v7, -v3, v6, v5
-; GFX6-IEEE-NEXT: v_fma_f32 v6, v7, v4, v6
-; GFX6-IEEE-NEXT: v_fma_f32 v3, -v3, v6, v5
-; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v5, v0
-; GFX6-IEEE-NEXT: v_div_fmas_f32 v0, v3, v4, v6
-; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v0, v2, v1
-; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX6-IEEE-NEXT: v_div_scale_f32 v2, s[4:5], v5, v5, v1
+; GFX6-IEEE-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, 1.0
; GFX6-IEEE-NEXT: v_rcp_f32_e32 v3, v2
-; GFX6-IEEE-NEXT: v_div_scale_f32 v4, vcc, v1, v5, v1
-; GFX6-IEEE-NEXT: v_fma_f32 v6, -v2, v3, 1.0
-; GFX6-IEEE-NEXT: v_fma_f32 v3, v6, v3, v3
-; GFX6-IEEE-NEXT: v_mul_f32_e32 v6, v4, v3
-; GFX6-IEEE-NEXT: v_fma_f32 v7, -v2, v6, v4
-; GFX6-IEEE-NEXT: v_fma_f32 v6, v7, v3, v6
-; GFX6-IEEE-NEXT: v_fma_f32 v2, -v2, v6, v4
-; GFX6-IEEE-NEXT: v_div_fmas_f32 v2, v2, v3, v6
-; GFX6-IEEE-NEXT: v_div_fixup_f32 v1, v2, v5, v1
+; GFX6-IEEE-NEXT: v_div_scale_f32 v4, vcc, 1.0, v1, 1.0
+; GFX6-IEEE-NEXT: v_fma_f32 v5, -v2, v3, 1.0
+; GFX6-IEEE-NEXT: v_fma_f32 v3, v5, v3, v3
+; GFX6-IEEE-NEXT: v_mul_f32_e32 v5, v4, v3
+; GFX6-IEEE-NEXT: v_fma_f32 v6, -v2, v5, v4
+; GFX6-IEEE-NEXT: v_fma_f32 v5, v6, v3, v5
+; GFX6-IEEE-NEXT: v_fma_f32 v2, -v2, v5, v4
+; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v4, v0
+; GFX6-IEEE-NEXT: v_div_fmas_f32 v0, v2, v3, v5
+; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v0, v1, 1.0
+; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX6-IEEE-NEXT: v_div_scale_f32 v1, s[4:5], v4, v4, 1.0
+; GFX6-IEEE-NEXT: v_rcp_f32_e32 v2, v1
+; GFX6-IEEE-NEXT: v_div_scale_f32 v3, vcc, 1.0, v4, 1.0
+; GFX6-IEEE-NEXT: v_fma_f32 v5, -v1, v2, 1.0
+; GFX6-IEEE-NEXT: v_fma_f32 v2, v5, v2, v2
+; GFX6-IEEE-NEXT: v_mul_f32_e32 v5, v3, v2
+; GFX6-IEEE-NEXT: v_fma_f32 v6, -v1, v5, v3
+; GFX6-IEEE-NEXT: v_fma_f32 v5, v6, v2, v5
+; GFX6-IEEE-NEXT: v_fma_f32 v1, -v1, v5, v3
+; GFX6-IEEE-NEXT: v_div_fmas_f32 v1, v1, v2, v5
+; GFX6-IEEE-NEXT: v_div_fixup_f32 v1, v1, v4, 1.0
; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX6-IEEE-NEXT: s_setpc_b64 s[30:31]
;
@@ -2101,39 +2046,37 @@ define <2 x half> @v_rcp_v2f16_fabs(<2 x half> %x) {
; GFX6-FLUSH-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX6-FLUSH-NEXT: v_or_b32_e32 v0, v1, v0
; GFX6-FLUSH-NEXT: v_and_b32_e32 v0, 0x7fff7fff, v0
-; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, 1.0
-; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v2, v0
+; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, v0
; GFX6-FLUSH-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX6-FLUSH-NEXT: v_div_scale_f32 v3, s[4:5], v2, v2, v1
-; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v4, v3
-; GFX6-FLUSH-NEXT: v_div_scale_f32 v5, vcc, v1, v2, v1
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, 1.0
+; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v3, v2
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v4, vcc, 1.0, v1, 1.0
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
-; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v3, v4, 1.0
-; GFX6-FLUSH-NEXT: v_fma_f32 v4, v6, v4, v4
-; GFX6-FLUSH-NEXT: v_mul_f32_e32 v6, v5, v4
-; GFX6-FLUSH-NEXT: v_fma_f32 v7, -v3, v6, v5
-; GFX6-FLUSH-NEXT: v_fma_f32 v6, v7, v4, v6
-; GFX6-FLUSH-NEXT: v_fma_f32 v3, -v3, v6, v5
+; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v2, v3, 1.0
+; GFX6-FLUSH-NEXT: v_fma_f32 v3, v5, v3, v3
+; GFX6-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v3
+; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v5, v4
+; GFX6-FLUSH-NEXT: v_fma_f32 v5, v6, v3, v5
+; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v5, v4
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
-; GFX6-FLUSH-NEXT: v_div_fmas_f32 v3, v3, v4, v6
-; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v4, 1.0
-; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v5, v0
-; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v3, v2, v1
+; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v5
+; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v3, v0
+; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v2, v1, 1.0
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX6-FLUSH-NEXT: v_div_scale_f32 v1, s[4:5], v5, v5, v4
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v1, s[4:5], v3, v3, 1.0
; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v2, v1
-; GFX6-FLUSH-NEXT: v_div_scale_f32 v3, vcc, v4, v5, v4
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v4, vcc, 1.0, v3, 1.0
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
-; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v1, v2, 1.0
-; GFX6-FLUSH-NEXT: v_fma_f32 v2, v6, v2, v2
-; GFX6-FLUSH-NEXT: v_mul_f32_e32 v6, v3, v2
-; GFX6-FLUSH-NEXT: v_fma_f32 v7, -v1, v6, v3
-; GFX6-FLUSH-NEXT: v_fma_f32 v6, v7, v2, v6
-; GFX6-FLUSH-NEXT: v_fma_f32 v1, -v1, v6, v3
+; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v1, v2, 1.0
+; GFX6-FLUSH-NEXT: v_fma_f32 v2, v5, v2, v2
+; GFX6-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v2
+; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v1, v5, v4
+; GFX6-FLUSH-NEXT: v_fma_f32 v5, v6, v2, v5
+; GFX6-FLUSH-NEXT: v_fma_f32 v1, -v1, v5, v4
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
-; GFX6-FLUSH-NEXT: v_div_fmas_f32 v1, v1, v2, v6
-; GFX6-FLUSH-NEXT: v_div_fixup_f32 v1, v1, v5, v4
+; GFX6-FLUSH-NEXT: v_div_fmas_f32 v1, v1, v2, v5
+; GFX6-FLUSH-NEXT: v_div_fixup_f32 v1, v1, v3, 1.0
; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX6-FLUSH-NEXT: s_setpc_b64 s[30:31]
;
@@ -2143,30 +2086,27 @@ define <2 x half> @v_rcp_v2f16_fabs(<2 x half> %x) {
; GFX8-IEEE-NEXT: v_and_b32_e32 v0, 0x7fff7fff, v0
; GFX8-IEEE-NEXT: v_cvt_f32_f16_e32 v1, v0
; GFX8-IEEE-NEXT: v_lshrrev_b32_e32 v2, 16, v0
-; GFX8-IEEE-NEXT: v_cvt_f32_f16_e32 v4, 1.0
; GFX8-IEEE-NEXT: v_cvt_f32_f16_e32 v3, v2
-; GFX8-IEEE-NEXT: v_rcp_f32_e32 v5, v1
-; GFX8-IEEE-NEXT: v_rcp_f32_e32 v6, v3
-; GFX8-IEEE-NEXT: v_mul_f32_e32 v7, v4, v5
-; GFX8-IEEE-NEXT: v_mul_f32_e64 v8, -v1, v7
-; GFX8-IEEE-NEXT: v_add_f32_e32 v8, v8, v4
-; GFX8-IEEE-NEXT: v_mul_f32_e32 v8, v8, v5
-; GFX8-IEEE-NEXT: v_add_f32_e32 v7, v8, v7
-; GFX8-IEEE-NEXT: v_mul_f32_e32 v9, v4, v6
-; GFX8-IEEE-NEXT: v_mul_f32_e64 v1, -v1, v7
-; GFX8-IEEE-NEXT: v_mul_f32_e64 v10, -v3, v9
-; GFX8-IEEE-NEXT: v_add_f32_e32 v1, v1, v4
-; GFX8-IEEE-NEXT: v_mul_f32_e32 v1, v1, v5
-; GFX8-IEEE-NEXT: v_add_f32_e32 v5, v10, v4
-; GFX8-IEEE-NEXT: v_mul_f32_e32 v5, v5, v6
-; GFX8-IEEE-NEXT: v_add_f32_e32 v5, v5, v9
-; GFX8-IEEE-NEXT: v_mul_f32_e64 v3, -v3, v5
-; GFX8-IEEE-NEXT: v_add_f32_e32 v3, v3, v4
-; GFX8-IEEE-NEXT: v_mul_f32_e32 v3, v3, v6
+; GFX8-IEEE-NEXT: v_rcp_f32_e32 v4, v1
+; GFX8-IEEE-NEXT: v_rcp_f32_e32 v5, v3
+; GFX8-IEEE-NEXT: v_mul_f32_e64 v6, -v1, v4
+; GFX8-IEEE-NEXT: v_add_f32_e32 v6, 1.0, v6
+; GFX8-IEEE-NEXT: v_mul_f32_e32 v6, v6, v4
+; GFX8-IEEE-NEXT: v_mul_f32_e64 v7, -v3, v5
+; GFX8-IEEE-NEXT: v_add_f32_e32 v6, v6, v4
+; GFX8-IEEE-NEXT: v_add_f32_e32 v7, 1.0, v7
+; GFX8-IEEE-NEXT: v_mul_f32_e64 v1, -v1, v6
+; GFX8-IEEE-NEXT: v_mul_f32_e32 v7, v7, v5
+; GFX8-IEEE-NEXT: v_add_f32_e32 v1, 1.0, v1
+; GFX8-IEEE-NEXT: v_mul_f32_e32 v1, v1, v4
+; GFX8-IEEE-NEXT: v_add_f32_e32 v4, v7, v5
+; GFX8-IEEE-NEXT: v_mul_f32_e64 v3, -v3, v4
+; GFX8-IEEE-NEXT: v_add_f32_e32 v3, 1.0, v3
+; GFX8-IEEE-NEXT: v_mul_f32_e32 v3, v3, v5
; GFX8-IEEE-NEXT: v_and_b32_e32 v1, 0xff800000, v1
; GFX8-IEEE-NEXT: v_and_b32_e32 v3, 0xff800000, v3
-; GFX8-IEEE-NEXT: v_add_f32_e32 v1, v1, v7
-; GFX8-IEEE-NEXT: v_add_f32_e32 v3, v3, v5
+; GFX8-IEEE-NEXT: v_add_f32_e32 v1, v1, v6
+; GFX8-IEEE-NEXT: v_add_f32_e32 v3, v3, v4
; GFX8-IEEE-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX8-IEEE-NEXT: v_cvt_f16_f32_e32 v3, v3
; GFX8-IEEE-NEXT: v_div_fixup_f16 v0, v1, v0, 1.0
@@ -2179,26 +2119,23 @@ define <2 x half> @v_rcp_v2f16_fabs(<2 x half> %x) {
; GFX8-FLUSH: ; %bb.0:
; GFX8-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-FLUSH-NEXT: v_and_b32_e32 v0, 0x7fff7fff, v0
-; GFX8-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, v0
; GFX8-FLUSH-NEXT: v_lshrrev_b32_e32 v2, 16, v0
+; GFX8-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, v0
; GFX8-FLUSH-NEXT: v_cvt_f32_f16_e32 v3, v2
-; GFX8-FLUSH-NEXT: v_cvt_f32_f16_e32 v4, 1.0
-; GFX8-FLUSH-NEXT: v_rcp_f32_e32 v5, v1
-; GFX8-FLUSH-NEXT: v_rcp_f32_e32 v6, v3
-; GFX8-FLUSH-NEXT: v_mul_f32_e32 v7, v4, v5
-; GFX8-FLUSH-NEXT: v_mad_f32 v8, -v1, v7, v4
-; GFX8-FLUSH-NEXT: v_mul_f32_e32 v9, v4, v6
-; GFX8-FLUSH-NEXT: v_mac_f32_e32 v7, v8, v5
-; GFX8-FLUSH-NEXT: v_mad_f32 v8, -v3, v9, v4
-; GFX8-FLUSH-NEXT: v_mac_f32_e32 v9, v8, v6
-; GFX8-FLUSH-NEXT: v_mad_f32 v1, -v1, v7, v4
-; GFX8-FLUSH-NEXT: v_mad_f32 v3, -v3, v9, v4
-; GFX8-FLUSH-NEXT: v_mul_f32_e32 v1, v1, v5
-; GFX8-FLUSH-NEXT: v_mul_f32_e32 v3, v3, v6
+; GFX8-FLUSH-NEXT: v_rcp_f32_e32 v4, v1
+; GFX8-FLUSH-NEXT: v_rcp_f32_e32 v5, v3
+; GFX8-FLUSH-NEXT: v_mad_f32 v6, -v1, v4, 1.0
+; GFX8-FLUSH-NEXT: v_mad_f32 v7, -v3, v5, 1.0
+; GFX8-FLUSH-NEXT: v_mad_f32 v6, v6, v4, v4
+; GFX8-FLUSH-NEXT: v_mad_f32 v7, v7, v5, v5
+; GFX8-FLUSH-NEXT: v_mad_f32 v1, -v1, v6, 1.0
+; GFX8-FLUSH-NEXT: v_mad_f32 v3, -v3, v7, 1.0
+; GFX8-FLUSH-NEXT: v_mul_f32_e32 v1, v1, v4
+; GFX8-FLUSH-NEXT: v_mul_f32_e32 v3, v3, v5
; GFX8-FLUSH-NEXT: v_and_b32_e32 v1, 0xff800000, v1
; GFX8-FLUSH-NEXT: v_and_b32_e32 v3, 0xff800000, v3
-; GFX8-FLUSH-NEXT: v_add_f32_e32 v1, v1, v7
-; GFX8-FLUSH-NEXT: v_add_f32_e32 v3, v3, v9
+; GFX8-FLUSH-NEXT: v_add_f32_e32 v1, v1, v6
+; GFX8-FLUSH-NEXT: v_add_f32_e32 v3, v3, v7
; GFX8-FLUSH-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX8-FLUSH-NEXT: v_cvt_f16_f32_e32 v3, v3
; GFX8-FLUSH-NEXT: v_div_fixup_f16 v0, v1, v0, 1.0
@@ -2213,30 +2150,27 @@ define <2 x half> @v_rcp_v2f16_fabs(<2 x half> %x) {
; GFX9-IEEE-NEXT: v_and_b32_e32 v0, 0x7fff7fff, v0
; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v1, v0
; GFX9-IEEE-NEXT: v_lshrrev_b32_e32 v2, 16, v0
-; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v4, 1.0
; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v3, v2
-; GFX9-IEEE-NEXT: v_rcp_f32_e32 v5, v1
-; GFX9-IEEE-NEXT: v_rcp_f32_e32 v6, v3
-; GFX9-IEEE-NEXT: v_mul_f32_e32 v7, v4, v5
-; GFX9-IEEE-NEXT: v_mul_f32_e64 v8, -v1, v7
-; GFX9-IEEE-NEXT: v_add_f32_e32 v8, v8, v4
-; GFX9-IEEE-NEXT: v_mul_f32_e32 v8, v8, v5
-; GFX9-IEEE-NEXT: v_add_f32_e32 v7, v8, v7
-; GFX9-IEEE-NEXT: v_mul_f32_e32 v9, v4, v6
-; GFX9-IEEE-NEXT: v_mul_f32_e64 v1, -v1, v7
-; GFX9-IEEE-NEXT: v_mul_f32_e64 v10, -v3, v9
-; GFX9-IEEE-NEXT: v_add_f32_e32 v1, v1, v4
-; GFX9-IEEE-NEXT: v_mul_f32_e32 v1, v1, v5
-; GFX9-IEEE-NEXT: v_add_f32_e32 v5, v10, v4
-; GFX9-IEEE-NEXT: v_mul_f32_e32 v5, v5, v6
-; GFX9-IEEE-NEXT: v_add_f32_e32 v5, v5, v9
-; GFX9-IEEE-NEXT: v_mul_f32_e64 v3, -v3, v5
-; GFX9-IEEE-NEXT: v_add_f32_e32 v3, v3, v4
-; GFX9-IEEE-NEXT: v_mul_f32_e32 v3, v3, v6
+; GFX9-IEEE-NEXT: v_rcp_f32_e32 v4, v1
+; GFX9-IEEE-NEXT: v_rcp_f32_e32 v5, v3
+; GFX9-IEEE-NEXT: v_mul_f32_e64 v6, -v1, v4
+; GFX9-IEEE-NEXT: v_add_f32_e32 v6, 1.0, v6
+; GFX9-IEEE-NEXT: v_mul_f32_e32 v6, v6, v4
+; GFX9-IEEE-NEXT: v_mul_f32_e64 v7, -v3, v5
+; GFX9-IEEE-NEXT: v_add_f32_e32 v6, v6, v4
+; GFX9-IEEE-NEXT: v_add_f32_e32 v7, 1.0, v7
+; GFX9-IEEE-NEXT: v_mul_f32_e64 v1, -v1, v6
+; GFX9-IEEE-NEXT: v_mul_f32_e32 v7, v7, v5
+; GFX9-IEEE-NEXT: v_add_f32_e32 v1, 1.0, v1
+; GFX9-IEEE-NEXT: v_mul_f32_e32 v1, v1, v4
+; GFX9-IEEE-NEXT: v_add_f32_e32 v4, v7, v5
+; GFX9-IEEE-NEXT: v_mul_f32_e64 v3, -v3, v4
+; GFX9-IEEE-NEXT: v_add_f32_e32 v3, 1.0, v3
+; GFX9-IEEE-NEXT: v_mul_f32_e32 v3, v3, v5
; GFX9-IEEE-NEXT: v_and_b32_e32 v1, 0xff800000, v1
; GFX9-IEEE-NEXT: v_and_b32_e32 v3, 0xff800000, v3
-; GFX9-IEEE-NEXT: v_add_f32_e32 v1, v1, v7
-; GFX9-IEEE-NEXT: v_add_f32_e32 v3, v3, v5
+; GFX9-IEEE-NEXT: v_add_f32_e32 v1, v1, v6
+; GFX9-IEEE-NEXT: v_add_f32_e32 v3, v3, v4
; GFX9-IEEE-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX9-IEEE-NEXT: v_cvt_f16_f32_e32 v3, v3
; GFX9-IEEE-NEXT: v_div_fixup_f16 v0, v1, v0, 1.0
@@ -2248,26 +2182,24 @@ define <2 x half> @v_rcp_v2f16_fabs(<2 x half> %x) {
; GFX9-FLUSH: ; %bb.0:
; GFX9-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-FLUSH-NEXT: v_and_b32_e32 v1, 0x7fff7fff, v0
-; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v2, v1
; GFX9-FLUSH-NEXT: v_lshrrev_b32_e32 v3, 16, v1
+; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v2, v1
; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v4, v3
-; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v5, 1.0
+; GFX9-FLUSH-NEXT: v_mov_b32_e32 v5, 1.0
; GFX9-FLUSH-NEXT: v_rcp_f32_e32 v2, v2
; GFX9-FLUSH-NEXT: v_rcp_f32_e32 v4, v4
-; GFX9-FLUSH-NEXT: v_mul_f32_e32 v6, v5, v2
-; GFX9-FLUSH-NEXT: v_mad_mix_f32 v7, -v1, v6, 1.0 op_sel_hi:[1,0,1]
-; GFX9-FLUSH-NEXT: v_mul_f32_e32 v5, v5, v4
-; GFX9-FLUSH-NEXT: v_mac_f32_e32 v6, v7, v2
-; GFX9-FLUSH-NEXT: v_mad_mix_f32 v7, -|v0|, v5, 1.0 op_sel:[1,0,0] op_sel_hi:[1,0,1]
-; GFX9-FLUSH-NEXT: v_mac_f32_e32 v5, v7, v4
-; GFX9-FLUSH-NEXT: v_mad_mix_f32 v8, -v1, v6, 1.0 op_sel_hi:[1,0,1]
-; GFX9-FLUSH-NEXT: v_mad_mix_f32 v0, -|v0|, v5, 1.0 op_sel:[1,0,0] op_sel_hi:[1,0,1]
+; GFX9-FLUSH-NEXT: v_mad_mix_f32 v6, -v1, v2, v5 op_sel_hi:[1,0,0]
+; GFX9-FLUSH-NEXT: v_mad_mix_f32 v7, -|v0|, v4, v5 op_sel:[1,0,0] op_sel_hi:[1,0,0]
+; GFX9-FLUSH-NEXT: v_mad_f32 v6, v6, v2, v2
+; GFX9-FLUSH-NEXT: v_mad_f32 v7, v7, v4, v4
+; GFX9-FLUSH-NEXT: v_mad_mix_f32 v8, -v1, v6, v5 op_sel_hi:[1,0,0]
+; GFX9-FLUSH-NEXT: v_mad_mix_f32 v0, -|v0|, v7, v5 op_sel:[1,0,0] op_sel_hi:[1,0,0]
; GFX9-FLUSH-NEXT: v_mul_f32_e32 v2, v8, v2
; GFX9-FLUSH-NEXT: v_mul_f32_e32 v0, v0, v4
; GFX9-FLUSH-NEXT: v_and_b32_e32 v2, 0xff800000, v2
; GFX9-FLUSH-NEXT: v_and_b32_e32 v0, 0xff800000, v0
; GFX9-FLUSH-NEXT: v_add_f32_e32 v2, v2, v6
-; GFX9-FLUSH-NEXT: v_add_f32_e32 v0, v0, v5
+; GFX9-FLUSH-NEXT: v_add_f32_e32 v0, v0, v7
; GFX9-FLUSH-NEXT: v_cvt_f16_f32_e32 v2, v2
; GFX9-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX9-FLUSH-NEXT: v_div_fixup_f16 v1, v2, v1, 1.0
@@ -2279,32 +2211,29 @@ define <2 x half> @v_rcp_v2f16_fabs(<2 x half> %x) {
; GFX10-IEEE: ; %bb.0:
; GFX10-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-IEEE-NEXT: v_and_b32_e32 v0, 0x7fff7fff, v0
-; GFX10-IEEE-NEXT: v_cvt_f32_f16_e32 v6, 1.0
; GFX10-IEEE-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GFX10-IEEE-NEXT: v_cvt_f32_f16_e32 v2, v0
; GFX10-IEEE-NEXT: v_cvt_f32_f16_e32 v3, v1
; GFX10-IEEE-NEXT: v_rcp_f32_e32 v4, v2
; GFX10-IEEE-NEXT: v_rcp_f32_e32 v5, v3
-; GFX10-IEEE-NEXT: v_mul_f32_e32 v7, v6, v4
-; GFX10-IEEE-NEXT: v_mul_f32_e32 v8, v6, v5
-; GFX10-IEEE-NEXT: v_mul_f32_e64 v9, -v2, v7
-; GFX10-IEEE-NEXT: v_mul_f32_e64 v10, -v3, v8
-; GFX10-IEEE-NEXT: v_add_f32_e32 v9, v9, v6
-; GFX10-IEEE-NEXT: v_add_f32_e32 v10, v10, v6
-; GFX10-IEEE-NEXT: v_mul_f32_e32 v9, v9, v4
-; GFX10-IEEE-NEXT: v_mul_f32_e32 v10, v10, v5
-; GFX10-IEEE-NEXT: v_add_f32_e32 v7, v9, v7
-; GFX10-IEEE-NEXT: v_add_f32_e32 v8, v10, v8
-; GFX10-IEEE-NEXT: v_mul_f32_e64 v2, -v2, v7
-; GFX10-IEEE-NEXT: v_mul_f32_e64 v3, -v3, v8
-; GFX10-IEEE-NEXT: v_add_f32_e32 v2, v2, v6
-; GFX10-IEEE-NEXT: v_add_f32_e32 v3, v3, v6
+; GFX10-IEEE-NEXT: v_mul_f32_e64 v6, -v2, v4
+; GFX10-IEEE-NEXT: v_mul_f32_e64 v7, -v3, v5
+; GFX10-IEEE-NEXT: v_add_f32_e32 v6, 1.0, v6
+; GFX10-IEEE-NEXT: v_add_f32_e32 v7, 1.0, v7
+; GFX10-IEEE-NEXT: v_mul_f32_e32 v6, v6, v4
+; GFX10-IEEE-NEXT: v_mul_f32_e32 v7, v7, v5
+; GFX10-IEEE-NEXT: v_add_f32_e32 v6, v6, v4
+; GFX10-IEEE-NEXT: v_add_f32_e32 v7, v7, v5
+; GFX10-IEEE-NEXT: v_mul_f32_e64 v2, -v2, v6
+; GFX10-IEEE-NEXT: v_mul_f32_e64 v3, -v3, v7
+; GFX10-IEEE-NEXT: v_add_f32_e32 v2, 1.0, v2
+; GFX10-IEEE-NEXT: v_add_f32_e32 v3, 1.0, v3
; GFX10-IEEE-NEXT: v_mul_f32_e32 v2, v2, v4
; GFX10-IEEE-NEXT: v_mul_f32_e32 v3, v3, v5
; GFX10-IEEE-NEXT: v_and_b32_e32 v2, 0xff800000, v2
; GFX10-IEEE-NEXT: v_and_b32_e32 v3, 0xff800000, v3
-; GFX10-IEEE-NEXT: v_add_f32_e32 v2, v2, v7
-; GFX10-IEEE-NEXT: v_add_f32_e32 v3, v3, v8
+; GFX10-IEEE-NEXT: v_add_f32_e32 v2, v2, v6
+; GFX10-IEEE-NEXT: v_add_f32_e32 v3, v3, v7
; GFX10-IEEE-NEXT: v_cvt_f16_f32_e32 v2, v2
; GFX10-IEEE-NEXT: v_cvt_f16_f32_e32 v3, v3
; GFX10-IEEE-NEXT: v_div_fixup_f16 v0, v2, v0, 1.0
@@ -2316,26 +2245,23 @@ define <2 x half> @v_rcp_v2f16_fabs(<2 x half> %x) {
; GFX10-FLUSH: ; %bb.0:
; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-FLUSH-NEXT: v_and_b32_e32 v0, 0x7fff7fff, v0
-; GFX10-FLUSH-NEXT: v_cvt_f32_f16_e32 v6, 1.0
; GFX10-FLUSH-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GFX10-FLUSH-NEXT: v_cvt_f32_f16_e32 v2, v0
; GFX10-FLUSH-NEXT: v_cvt_f32_f16_e32 v3, v1
; GFX10-FLUSH-NEXT: v_rcp_f32_e32 v4, v2
; GFX10-FLUSH-NEXT: v_rcp_f32_e32 v5, v3
-; GFX10-FLUSH-NEXT: v_mul_f32_e32 v7, v6, v4
-; GFX10-FLUSH-NEXT: v_mul_f32_e32 v8, v6, v5
-; GFX10-FLUSH-NEXT: v_mad_f32 v9, -v2, v7, v6
-; GFX10-FLUSH-NEXT: v_mad_f32 v10, -v3, v8, v6
-; GFX10-FLUSH-NEXT: v_mac_f32_e32 v7, v9, v4
-; GFX10-FLUSH-NEXT: v_mac_f32_e32 v8, v10, v5
-; GFX10-FLUSH-NEXT: v_mad_f32 v2, -v2, v7, v6
-; GFX10-FLUSH-NEXT: v_mad_f32 v3, -v3, v8, v6
+; GFX10-FLUSH-NEXT: v_mad_f32 v6, -v2, v4, 1.0
+; GFX10-FLUSH-NEXT: v_mad_f32 v7, -v3, v5, 1.0
+; GFX10-FLUSH-NEXT: v_mad_f32 v6, v6, v4, v4
+; GFX10-FLUSH-NEXT: v_mad_f32 v7, v7, v5, v5
+; GFX10-FLUSH-NEXT: v_mad_f32 v2, -v2, v6, 1.0
+; GFX10-FLUSH-NEXT: v_mad_f32 v3, -v3, v7, 1.0
; GFX10-FLUSH-NEXT: v_mul_f32_e32 v2, v2, v4
; GFX10-FLUSH-NEXT: v_mul_f32_e32 v3, v3, v5
; GFX10-FLUSH-NEXT: v_and_b32_e32 v2, 0xff800000, v2
; GFX10-FLUSH-NEXT: v_and_b32_e32 v3, 0xff800000, v3
-; GFX10-FLUSH-NEXT: v_add_f32_e32 v2, v2, v7
-; GFX10-FLUSH-NEXT: v_add_f32_e32 v3, v3, v8
+; GFX10-FLUSH-NEXT: v_add_f32_e32 v2, v2, v6
+; GFX10-FLUSH-NEXT: v_add_f32_e32 v3, v3, v7
; GFX10-FLUSH-NEXT: v_cvt_f16_f32_e32 v2, v2
; GFX10-FLUSH-NEXT: v_cvt_f16_f32_e32 v3, v3
; GFX10-FLUSH-NEXT: v_div_fixup_f16 v0, v2, v0, 1.0
@@ -2346,30 +2272,30 @@ define <2 x half> @v_rcp_v2f16_fabs(<2 x half> %x) {
; GFX11-LABEL: v_rcp_v2f16_fabs:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_cvt_f32_f16_e32 v5, 1.0
; GFX11-NEXT: v_and_b32_e32 v1, 0x7fff7fff, v0
-; GFX11-NEXT: v_cvt_f32_f16_e32 v3, v1
-; GFX11-NEXT: v_rcp_f32_e32 v3, v3
-; GFX11-NEXT: s_waitcnt_depctr 0xfff
-; GFX11-NEXT: v_mul_f32_e32 v6, v5, v3
; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v1
-; GFX11-NEXT: v_fma_mix_f32 v7, -v1, v6, 1.0 op_sel_hi:[1,0,1]
; GFX11-NEXT: v_cvt_f32_f16_e32 v4, v2
-; GFX11-NEXT: v_fmac_f32_e32 v6, v7, v3
; GFX11-NEXT: v_rcp_f32_e32 v4, v4
-; GFX11-NEXT: v_fma_mix_f32 v7, -v1, v6, 1.0 op_sel_hi:[1,0,1]
+; GFX11-NEXT: v_mov_b32_e32 v5, 1.0
; GFX11-NEXT: s_waitcnt_depctr 0xfff
-; GFX11-NEXT: v_mul_f32_e32 v5, v5, v4
-; GFX11-NEXT: v_fma_mix_f32 v8, -|v0|, v5, 1.0 op_sel:[1,0,0] op_sel_hi:[1,0,1]
-; GFX11-NEXT: v_fmac_f32_e32 v5, v8, v4
-; GFX11-NEXT: v_fma_mix_f32 v0, -|v0|, v5, 1.0 op_sel:[1,0,0] op_sel_hi:[1,0,1]
-; GFX11-NEXT: v_dual_mul_f32 v3, v7, v3 :: v_dual_mul_f32 v0, v0, v4
+; GFX11-NEXT: v_fma_mix_f32 v7, -|v0|, v4, v5 op_sel:[1,0,0] op_sel_hi:[1,0,0]
+; GFX11-NEXT: v_fma_f32 v7, v7, v4, v4
+; GFX11-NEXT: v_fma_mix_f32 v0, -|v0|, v7, v5 op_sel:[1,0,0] op_sel_hi:[1,0,0]
+; GFX11-NEXT: v_mul_f32_e32 v0, v0, v4
; GFX11-NEXT: v_and_b32_e32 v0, 0xff800000, v0
-; GFX11-NEXT: v_dual_add_f32 v0, v0, v5 :: v_dual_and_b32 v3, 0xff800000, v3
-; GFX11-NEXT: v_add_f32_e32 v3, v3, v6
+; GFX11-NEXT: v_add_f32_e32 v0, v0, v7
+; GFX11-NEXT: v_cvt_f32_f16_e32 v3, v1
; GFX11-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX11-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GFX11-NEXT: v_rcp_f32_e32 v3, v3
; GFX11-NEXT: v_div_fixup_f16 v0, v0, v2, 1.0
+; GFX11-NEXT: s_waitcnt_depctr 0xfff
+; GFX11-NEXT: v_fma_mix_f32 v6, -v1, v3, v5 op_sel_hi:[1,0,0]
+; GFX11-NEXT: v_fma_f32 v6, v6, v3, v3
+; GFX11-NEXT: v_fma_mix_f32 v8, -v1, v6, v5 op_sel_hi:[1,0,0]
+; GFX11-NEXT: v_mul_f32_e32 v3, v8, v3
+; GFX11-NEXT: v_and_b32_e32 v3, 0xff800000, v3
+; GFX11-NEXT: v_add_f32_e32 v3, v3, v6
+; GFX11-NEXT: v_cvt_f16_f32_e32 v3, v3
; GFX11-NEXT: v_div_fixup_f16 v1, v3, v1, 1.0
; GFX11-NEXT: v_pack_b32_f16 v0, v1, v0
; GFX11-NEXT: s_setpc_b64 s[30:31]
@@ -2386,33 +2312,32 @@ define <2 x half> @v_neg_rcp_v2f16_fabs(<2 x half> %x) {
; GFX6-IEEE-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX6-IEEE-NEXT: v_or_b32_e32 v0, v1, v0
; GFX6-IEEE-NEXT: v_and_b32_e32 v0, 0x7fff7fff, v0
-; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, -1.0
-; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v2, v0
+; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, v0
; GFX6-IEEE-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX6-IEEE-NEXT: v_div_scale_f32 v3, s[4:5], v2, v2, v1
-; GFX6-IEEE-NEXT: v_rcp_f32_e32 v4, v3
-; GFX6-IEEE-NEXT: v_div_scale_f32 v5, vcc, v1, v2, v1
-; GFX6-IEEE-NEXT: v_fma_f32 v6, -v3, v4, 1.0
-; GFX6-IEEE-NEXT: v_fma_f32 v4, v6, v4, v4
-; GFX6-IEEE-NEXT: v_mul_f32_e32 v6, v5, v4
-; GFX6-IEEE-NEXT: v_fma_f32 v7, -v3, v6, v5
-; GFX6-IEEE-NEXT: v_fma_f32 v6, v7, v4, v6
-; GFX6-IEEE-NEXT: v_fma_f32 v3, -v3, v6, v5
-; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v5, v0
-; GFX6-IEEE-NEXT: v_div_fmas_f32 v0, v3, v4, v6
-; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v0, v2, v1
-; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX6-IEEE-NEXT: v_div_scale_f32 v2, s[4:5], v5, v5, v1
+; GFX6-IEEE-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, -1.0
; GFX6-IEEE-NEXT: v_rcp_f32_e32 v3, v2
-; GFX6-IEEE-NEXT: v_div_scale_f32 v4, vcc, v1, v5, v1
-; GFX6-IEEE-NEXT: v_fma_f32 v6, -v2, v3, 1.0
-; GFX6-IEEE-NEXT: v_fma_f32 v3, v6, v3, v3
-; GFX6-IEEE-NEXT: v_mul_f32_e32 v6, v4, v3
-; GFX6-IEEE-NEXT: v_fma_f32 v7, -v2, v6, v4
-; GFX6-IEEE-NEXT: v_fma_f32 v6, v7, v3, v6
-; GFX6-IEEE-NEXT: v_fma_f32 v2, -v2, v6, v4
-; GFX6-IEEE-NEXT: v_div_fmas_f32 v2, v2, v3, v6
-; GFX6-IEEE-NEXT: v_div_fixup_f32 v1, v2, v5, v1
+; GFX6-IEEE-NEXT: v_div_scale_f32 v4, vcc, -1.0, v1, -1.0
+; GFX6-IEEE-NEXT: v_fma_f32 v5, -v2, v3, 1.0
+; GFX6-IEEE-NEXT: v_fma_f32 v3, v5, v3, v3
+; GFX6-IEEE-NEXT: v_mul_f32_e32 v5, v4, v3
+; GFX6-IEEE-NEXT: v_fma_f32 v6, -v2, v5, v4
+; GFX6-IEEE-NEXT: v_fma_f32 v5, v6, v3, v5
+; GFX6-IEEE-NEXT: v_fma_f32 v2, -v2, v5, v4
+; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v4, v0
+; GFX6-IEEE-NEXT: v_div_fmas_f32 v0, v2, v3, v5
+; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v0, v1, -1.0
+; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX6-IEEE-NEXT: v_div_scale_f32 v1, s[4:5], v4, v4, -1.0
+; GFX6-IEEE-NEXT: v_rcp_f32_e32 v2, v1
+; GFX6-IEEE-NEXT: v_div_scale_f32 v3, vcc, -1.0, v4, -1.0
+; GFX6-IEEE-NEXT: v_fma_f32 v5, -v1, v2, 1.0
+; GFX6-IEEE-NEXT: v_fma_f32 v2, v5, v2, v2
+; GFX6-IEEE-NEXT: v_mul_f32_e32 v5, v3, v2
+; GFX6-IEEE-NEXT: v_fma_f32 v6, -v1, v5, v3
+; GFX6-IEEE-NEXT: v_fma_f32 v5, v6, v2, v5
+; GFX6-IEEE-NEXT: v_fma_f32 v1, -v1, v5, v3
+; GFX6-IEEE-NEXT: v_div_fmas_f32 v1, v1, v2, v5
+; GFX6-IEEE-NEXT: v_div_fixup_f32 v1, v1, v4, -1.0
; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX6-IEEE-NEXT: s_setpc_b64 s[30:31]
;
@@ -2423,39 +2348,37 @@ define <2 x half> @v_neg_rcp_v2f16_fabs(<2 x half> %x) {
; GFX6-FLUSH-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX6-FLUSH-NEXT: v_or_b32_e32 v0, v1, v0
; GFX6-FLUSH-NEXT: v_and_b32_e32 v0, 0x7fff7fff, v0
-; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, -1.0
-; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v2, v0
+; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, v0
; GFX6-FLUSH-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX6-FLUSH-NEXT: v_div_scale_f32 v3, s[4:5], v2, v2, v1
-; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v4, v3
-; GFX6-FLUSH-NEXT: v_div_scale_f32 v5, vcc, v1, v2, v1
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, -1.0
+; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v3, v2
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v4, vcc, -1.0, v1, -1.0
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
-; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v3, v4, 1.0
-; GFX6-FLUSH-NEXT: v_fma_f32 v4, v6, v4, v4
-; GFX6-FLUSH-NEXT: v_mul_f32_e32 v6, v5, v4
-; GFX6-FLUSH-NEXT: v_fma_f32 v7, -v3, v6, v5
-; GFX6-FLUSH-NEXT: v_fma_f32 v6, v7, v4, v6
-; GFX6-FLUSH-NEXT: v_fma_f32 v3, -v3, v6, v5
+; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v2, v3, 1.0
+; GFX6-FLUSH-NEXT: v_fma_f32 v3, v5, v3, v3
+; GFX6-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v3
+; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v5, v4
+; GFX6-FLUSH-NEXT: v_fma_f32 v5, v6, v3, v5
+; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v5, v4
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
-; GFX6-FLUSH-NEXT: v_div_fmas_f32 v3, v3, v4, v6
-; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v4, -1.0
-; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v5, v0
-; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v3, v2, v1
+; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v5
+; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v3, v0
+; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v2, v1, -1.0
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX6-FLUSH-NEXT: v_div_scale_f32 v1, s[4:5], v5, v5, v4
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v1, s[4:5], v3, v3, -1.0
; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v2, v1
-; GFX6-FLUSH-NEXT: v_div_scale_f32 v3, vcc, v4, v5, v4
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v4, vcc, -1.0, v3, -1.0
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
-; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v1, v2, 1.0
-; GFX6-FLUSH-NEXT: v_fma_f32 v2, v6, v2, v2
-; GFX6-FLUSH-NEXT: v_mul_f32_e32 v6, v3, v2
-; GFX6-FLUSH-NEXT: v_fma_f32 v7, -v1, v6, v3
-; GFX6-FLUSH-NEXT: v_fma_f32 v6, v7, v2, v6
-; GFX6-FLUSH-NEXT: v_fma_f32 v1, -v1, v6, v3
+; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v1, v2, 1.0
+; GFX6-FLUSH-NEXT: v_fma_f32 v2, v5, v2, v2
+; GFX6-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v2
+; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v1, v5, v4
+; GFX6-FLUSH-NEXT: v_fma_f32 v5, v6, v2, v5
+; GFX6-FLUSH-NEXT: v_fma_f32 v1, -v1, v5, v4
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
-; GFX6-FLUSH-NEXT: v_div_fmas_f32 v1, v1, v2, v6
-; GFX6-FLUSH-NEXT: v_div_fixup_f32 v1, v1, v5, v4
+; GFX6-FLUSH-NEXT: v_div_fmas_f32 v1, v1, v2, v5
+; GFX6-FLUSH-NEXT: v_div_fixup_f32 v1, v1, v3, -1.0
; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX6-FLUSH-NEXT: s_setpc_b64 s[30:31]
;
@@ -2465,30 +2388,27 @@ define <2 x half> @v_neg_rcp_v2f16_fabs(<2 x half> %x) {
; GFX8-IEEE-NEXT: v_and_b32_e32 v0, 0x7fff7fff, v0
; GFX8-IEEE-NEXT: v_cvt_f32_f16_e32 v1, v0
; GFX8-IEEE-NEXT: v_lshrrev_b32_e32 v2, 16, v0
-; GFX8-IEEE-NEXT: v_cvt_f32_f16_e32 v4, -1.0
; GFX8-IEEE-NEXT: v_cvt_f32_f16_e32 v3, v2
-; GFX8-IEEE-NEXT: v_rcp_f32_e32 v5, v1
-; GFX8-IEEE-NEXT: v_rcp_f32_e32 v6, v3
-; GFX8-IEEE-NEXT: v_mul_f32_e32 v7, v4, v5
-; GFX8-IEEE-NEXT: v_mul_f32_e64 v8, -v1, v7
-; GFX8-IEEE-NEXT: v_add_f32_e32 v8, v8, v4
-; GFX8-IEEE-NEXT: v_mul_f32_e32 v8, v8, v5
-; GFX8-IEEE-NEXT: v_add_f32_e32 v7, v8, v7
-; GFX8-IEEE-NEXT: v_mul_f32_e32 v9, v4, v6
-; GFX8-IEEE-NEXT: v_mul_f32_e64 v1, -v1, v7
-; GFX8-IEEE-NEXT: v_mul_f32_e64 v10, -v3, v9
-; GFX8-IEEE-NEXT: v_add_f32_e32 v1, v1, v4
-; GFX8-IEEE-NEXT: v_mul_f32_e32 v1, v1, v5
-; GFX8-IEEE-NEXT: v_add_f32_e32 v5, v10, v4
-; GFX8-IEEE-NEXT: v_mul_f32_e32 v5, v5, v6
-; GFX8-IEEE-NEXT: v_add_f32_e32 v5, v5, v9
-; GFX8-IEEE-NEXT: v_mul_f32_e64 v3, -v3, v5
-; GFX8-IEEE-NEXT: v_add_f32_e32 v3, v3, v4
-; GFX8-IEEE-NEXT: v_mul_f32_e32 v3, v3, v6
+; GFX8-IEEE-NEXT: v_rcp_f32_e32 v4, v1
+; GFX8-IEEE-NEXT: v_rcp_f32_e32 v5, v3
+; GFX8-IEEE-NEXT: v_mul_f32_e32 v6, v1, v4
+; GFX8-IEEE-NEXT: v_add_f32_e32 v6, -1.0, v6
+; GFX8-IEEE-NEXT: v_mul_f32_e32 v6, v6, v4
+; GFX8-IEEE-NEXT: v_mul_f32_e32 v7, v3, v5
+; GFX8-IEEE-NEXT: v_sub_f32_e32 v6, v6, v4
+; GFX8-IEEE-NEXT: v_add_f32_e32 v7, -1.0, v7
+; GFX8-IEEE-NEXT: v_mul_f32_e64 v1, -v1, v6
+; GFX8-IEEE-NEXT: v_mul_f32_e32 v7, v7, v5
+; GFX8-IEEE-NEXT: v_add_f32_e32 v1, -1.0, v1
+; GFX8-IEEE-NEXT: v_mul_f32_e32 v1, v1, v4
+; GFX8-IEEE-NEXT: v_sub_f32_e32 v4, v7, v5
+; GFX8-IEEE-NEXT: v_mul_f32_e64 v3, -v3, v4
+; GFX8-IEEE-NEXT: v_add_f32_e32 v3, -1.0, v3
+; GFX8-IEEE-NEXT: v_mul_f32_e32 v3, v3, v5
; GFX8-IEEE-NEXT: v_and_b32_e32 v1, 0xff800000, v1
; GFX8-IEEE-NEXT: v_and_b32_e32 v3, 0xff800000, v3
-; GFX8-IEEE-NEXT: v_add_f32_e32 v1, v1, v7
-; GFX8-IEEE-NEXT: v_add_f32_e32 v3, v3, v5
+; GFX8-IEEE-NEXT: v_add_f32_e32 v1, v1, v6
+; GFX8-IEEE-NEXT: v_add_f32_e32 v3, v3, v4
; GFX8-IEEE-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX8-IEEE-NEXT: v_cvt_f16_f32_e32 v3, v3
; GFX8-IEEE-NEXT: v_div_fixup_f16 v0, v1, v0, -1.0
@@ -2501,26 +2421,23 @@ define <2 x half> @v_neg_rcp_v2f16_fabs(<2 x half> %x) {
; GFX8-FLUSH: ; %bb.0:
; GFX8-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-FLUSH-NEXT: v_and_b32_e32 v0, 0x7fff7fff, v0
-; GFX8-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, v0
; GFX8-FLUSH-NEXT: v_lshrrev_b32_e32 v2, 16, v0
+; GFX8-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, v0
; GFX8-FLUSH-NEXT: v_cvt_f32_f16_e32 v3, v2
-; GFX8-FLUSH-NEXT: v_cvt_f32_f16_e32 v4, -1.0
-; GFX8-FLUSH-NEXT: v_rcp_f32_e32 v5, v1
-; GFX8-FLUSH-NEXT: v_rcp_f32_e32 v6, v3
-; GFX8-FLUSH-NEXT: v_mul_f32_e32 v7, v4, v5
-; GFX8-FLUSH-NEXT: v_mad_f32 v8, -v1, v7, v4
-; GFX8-FLUSH-NEXT: v_mul_f32_e32 v9, v4, v6
-; GFX8-FLUSH-NEXT: v_mac_f32_e32 v7, v8, v5
-; GFX8-FLUSH-NEXT: v_mad_f32 v8, -v3, v9, v4
-; GFX8-FLUSH-NEXT: v_mac_f32_e32 v9, v8, v6
-; GFX8-FLUSH-NEXT: v_mad_f32 v1, -v1, v7, v4
-; GFX8-FLUSH-NEXT: v_mad_f32 v3, -v3, v9, v4
-; GFX8-FLUSH-NEXT: v_mul_f32_e32 v1, v1, v5
-; GFX8-FLUSH-NEXT: v_mul_f32_e32 v3, v3, v6
+; GFX8-FLUSH-NEXT: v_rcp_f32_e32 v4, v1
+; GFX8-FLUSH-NEXT: v_rcp_f32_e32 v5, v3
+; GFX8-FLUSH-NEXT: v_mad_f32 v6, v1, v4, -1.0
+; GFX8-FLUSH-NEXT: v_mad_f32 v7, v3, v5, -1.0
+; GFX8-FLUSH-NEXT: v_mad_f32 v6, v6, v4, -v4
+; GFX8-FLUSH-NEXT: v_mad_f32 v7, v7, v5, -v5
+; GFX8-FLUSH-NEXT: v_mad_f32 v1, -v1, v6, -1.0
+; GFX8-FLUSH-NEXT: v_mad_f32 v3, -v3, v7, -1.0
+; GFX8-FLUSH-NEXT: v_mul_f32_e32 v1, v1, v4
+; GFX8-FLUSH-NEXT: v_mul_f32_e32 v3, v3, v5
; GFX8-FLUSH-NEXT: v_and_b32_e32 v1, 0xff800000, v1
; GFX8-FLUSH-NEXT: v_and_b32_e32 v3, 0xff800000, v3
-; GFX8-FLUSH-NEXT: v_add_f32_e32 v1, v1, v7
-; GFX8-FLUSH-NEXT: v_add_f32_e32 v3, v3, v9
+; GFX8-FLUSH-NEXT: v_add_f32_e32 v1, v1, v6
+; GFX8-FLUSH-NEXT: v_add_f32_e32 v3, v3, v7
; GFX8-FLUSH-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX8-FLUSH-NEXT: v_cvt_f16_f32_e32 v3, v3
; GFX8-FLUSH-NEXT: v_div_fixup_f16 v0, v1, v0, -1.0
@@ -2535,30 +2452,27 @@ define <2 x half> @v_neg_rcp_v2f16_fabs(<2 x half> %x) {
; GFX9-IEEE-NEXT: v_and_b32_e32 v0, 0x7fff7fff, v0
; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v1, v0
; GFX9-IEEE-NEXT: v_lshrrev_b32_e32 v2, 16, v0
-; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v4, -1.0
; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v3, v2
-; GFX9-IEEE-NEXT: v_rcp_f32_e32 v5, v1
-; GFX9-IEEE-NEXT: v_rcp_f32_e32 v6, v3
-; GFX9-IEEE-NEXT: v_mul_f32_e32 v7, v4, v5
-; GFX9-IEEE-NEXT: v_mul_f32_e64 v8, -v1, v7
-; GFX9-IEEE-NEXT: v_add_f32_e32 v8, v8, v4
-; GFX9-IEEE-NEXT: v_mul_f32_e32 v8, v8, v5
-; GFX9-IEEE-NEXT: v_add_f32_e32 v7, v8, v7
-; GFX9-IEEE-NEXT: v_mul_f32_e32 v9, v4, v6
-; GFX9-IEEE-NEXT: v_mul_f32_e64 v1, -v1, v7
-; GFX9-IEEE-NEXT: v_mul_f32_e64 v10, -v3, v9
-; GFX9-IEEE-NEXT: v_add_f32_e32 v1, v1, v4
-; GFX9-IEEE-NEXT: v_mul_f32_e32 v1, v1, v5
-; GFX9-IEEE-NEXT: v_add_f32_e32 v5, v10, v4
-; GFX9-IEEE-NEXT: v_mul_f32_e32 v5, v5, v6
-; GFX9-IEEE-NEXT: v_add_f32_e32 v5, v5, v9
-; GFX9-IEEE-NEXT: v_mul_f32_e64 v3, -v3, v5
-; GFX9-IEEE-NEXT: v_add_f32_e32 v3, v3, v4
-; GFX9-IEEE-NEXT: v_mul_f32_e32 v3, v3, v6
+; GFX9-IEEE-NEXT: v_rcp_f32_e32 v4, v1
+; GFX9-IEEE-NEXT: v_rcp_f32_e32 v5, v3
+; GFX9-IEEE-NEXT: v_mul_f32_e32 v6, v1, v4
+; GFX9-IEEE-NEXT: v_add_f32_e32 v6, -1.0, v6
+; GFX9-IEEE-NEXT: v_mul_f32_e32 v6, v6, v4
+; GFX9-IEEE-NEXT: v_mul_f32_e32 v7, v3, v5
+; GFX9-IEEE-NEXT: v_sub_f32_e32 v6, v6, v4
+; GFX9-IEEE-NEXT: v_add_f32_e32 v7, -1.0, v7
+; GFX9-IEEE-NEXT: v_mul_f32_e64 v1, -v1, v6
+; GFX9-IEEE-NEXT: v_mul_f32_e32 v7, v7, v5
+; GFX9-IEEE-NEXT: v_add_f32_e32 v1, -1.0, v1
+; GFX9-IEEE-NEXT: v_mul_f32_e32 v1, v1, v4
+; GFX9-IEEE-NEXT: v_sub_f32_e32 v4, v7, v5
+; GFX9-IEEE-NEXT: v_mul_f32_e64 v3, -v3, v4
+; GFX9-IEEE-NEXT: v_add_f32_e32 v3, -1.0, v3
+; GFX9-IEEE-NEXT: v_mul_f32_e32 v3, v3, v5
; GFX9-IEEE-NEXT: v_and_b32_e32 v1, 0xff800000, v1
; GFX9-IEEE-NEXT: v_and_b32_e32 v3, 0xff800000, v3
-; GFX9-IEEE-NEXT: v_add_f32_e32 v1, v1, v7
-; GFX9-IEEE-NEXT: v_add_f32_e32 v3, v3, v5
+; GFX9-IEEE-NEXT: v_add_f32_e32 v1, v1, v6
+; GFX9-IEEE-NEXT: v_add_f32_e32 v3, v3, v4
; GFX9-IEEE-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX9-IEEE-NEXT: v_cvt_f16_f32_e32 v3, v3
; GFX9-IEEE-NEXT: v_div_fixup_f16 v0, v1, v0, -1.0
@@ -2570,26 +2484,24 @@ define <2 x half> @v_neg_rcp_v2f16_fabs(<2 x half> %x) {
; GFX9-FLUSH: ; %bb.0:
; GFX9-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-FLUSH-NEXT: v_and_b32_e32 v1, 0x7fff7fff, v0
-; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v2, v1
; GFX9-FLUSH-NEXT: v_lshrrev_b32_e32 v3, 16, v1
+; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v2, v1
; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v4, v3
-; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v5, -1.0
+; GFX9-FLUSH-NEXT: v_mov_b32_e32 v5, -1.0
; GFX9-FLUSH-NEXT: v_rcp_f32_e32 v2, v2
; GFX9-FLUSH-NEXT: v_rcp_f32_e32 v4, v4
-; GFX9-FLUSH-NEXT: v_mul_f32_e32 v6, v5, v2
-; GFX9-FLUSH-NEXT: v_mad_mix_f32 v7, -v1, v6, -1.0 op_sel_hi:[1,0,1]
-; GFX9-FLUSH-NEXT: v_mul_f32_e32 v5, v5, v4
-; GFX9-FLUSH-NEXT: v_mac_f32_e32 v6, v7, v2
-; GFX9-FLUSH-NEXT: v_mad_mix_f32 v7, -|v0|, v5, -1.0 op_sel:[1,0,0] op_sel_hi:[1,0,1]
-; GFX9-FLUSH-NEXT: v_mac_f32_e32 v5, v7, v4
-; GFX9-FLUSH-NEXT: v_mad_mix_f32 v8, -v1, v6, -1.0 op_sel_hi:[1,0,1]
-; GFX9-FLUSH-NEXT: v_mad_mix_f32 v0, -|v0|, v5, -1.0 op_sel:[1,0,0] op_sel_hi:[1,0,1]
+; GFX9-FLUSH-NEXT: v_mad_mix_f32 v6, v1, v2, v5 op_sel_hi:[1,0,0]
+; GFX9-FLUSH-NEXT: v_mad_mix_f32 v7, |v0|, v4, v5 op_sel:[1,0,0] op_sel_hi:[1,0,0]
+; GFX9-FLUSH-NEXT: v_mad_f32 v6, v6, v2, -v2
+; GFX9-FLUSH-NEXT: v_mad_f32 v7, v7, v4, -v4
+; GFX9-FLUSH-NEXT: v_mad_mix_f32 v8, -v1, v6, v5 op_sel_hi:[1,0,0]
+; GFX9-FLUSH-NEXT: v_mad_mix_f32 v0, -|v0|, v7, v5 op_sel:[1,0,0] op_sel_hi:[1,0,0]
; GFX9-FLUSH-NEXT: v_mul_f32_e32 v2, v8, v2
; GFX9-FLUSH-NEXT: v_mul_f32_e32 v0, v0, v4
; GFX9-FLUSH-NEXT: v_and_b32_e32 v2, 0xff800000, v2
; GFX9-FLUSH-NEXT: v_and_b32_e32 v0, 0xff800000, v0
; GFX9-FLUSH-NEXT: v_add_f32_e32 v2, v2, v6
-; GFX9-FLUSH-NEXT: v_add_f32_e32 v0, v0, v5
+; GFX9-FLUSH-NEXT: v_add_f32_e32 v0, v0, v7
; GFX9-FLUSH-NEXT: v_cvt_f16_f32_e32 v2, v2
; GFX9-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX9-FLUSH-NEXT: v_div_fixup_f16 v1, v2, v1, -1.0
@@ -2601,32 +2513,29 @@ define <2 x half> @v_neg_rcp_v2f16_fabs(<2 x half> %x) {
; GFX10-IEEE: ; %bb.0:
; GFX10-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-IEEE-NEXT: v_and_b32_e32 v0, 0x7fff7fff, v0
-; GFX10-IEEE-NEXT: v_cvt_f32_f16_e32 v6, -1.0
; GFX10-IEEE-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GFX10-IEEE-NEXT: v_cvt_f32_f16_e32 v2, v0
; GFX10-IEEE-NEXT: v_cvt_f32_f16_e32 v3, v1
; GFX10-IEEE-NEXT: v_rcp_f32_e32 v4, v2
; GFX10-IEEE-NEXT: v_rcp_f32_e32 v5, v3
-; GFX10-IEEE-NEXT: v_mul_f32_e32 v7, v6, v4
-; GFX10-IEEE-NEXT: v_mul_f32_e32 v8, v6, v5
-; GFX10-IEEE-NEXT: v_mul_f32_e64 v9, -v2, v7
-; GFX10-IEEE-NEXT: v_mul_f32_e64 v10, -v3, v8
-; GFX10-IEEE-NEXT: v_add_f32_e32 v9, v9, v6
-; GFX10-IEEE-NEXT: v_add_f32_e32 v10, v10, v6
-; GFX10-IEEE-NEXT: v_mul_f32_e32 v9, v9, v4
-; GFX10-IEEE-NEXT: v_mul_f32_e32 v10, v10, v5
-; GFX10-IEEE-NEXT: v_add_f32_e32 v7, v9, v7
-; GFX10-IEEE-NEXT: v_add_f32_e32 v8, v10, v8
-; GFX10-IEEE-NEXT: v_mul_f32_e64 v2, -v2, v7
-; GFX10-IEEE-NEXT: v_mul_f32_e64 v3, -v3, v8
-; GFX10-IEEE-NEXT: v_add_f32_e32 v2, v2, v6
-; GFX10-IEEE-NEXT: v_add_f32_e32 v3, v3, v6
+; GFX10-IEEE-NEXT: v_mul_f32_e32 v6, v2, v4
+; GFX10-IEEE-NEXT: v_mul_f32_e32 v7, v3, v5
+; GFX10-IEEE-NEXT: v_add_f32_e32 v6, -1.0, v6
+; GFX10-IEEE-NEXT: v_add_f32_e32 v7, -1.0, v7
+; GFX10-IEEE-NEXT: v_mul_f32_e32 v6, v6, v4
+; GFX10-IEEE-NEXT: v_mul_f32_e32 v7, v7, v5
+; GFX10-IEEE-NEXT: v_sub_f32_e32 v6, v6, v4
+; GFX10-IEEE-NEXT: v_sub_f32_e32 v7, v7, v5
+; GFX10-IEEE-NEXT: v_mul_f32_e64 v2, -v2, v6
+; GFX10-IEEE-NEXT: v_mul_f32_e64 v3, -v3, v7
+; GFX10-IEEE-NEXT: v_add_f32_e32 v2, -1.0, v2
+; GFX10-IEEE-NEXT: v_add_f32_e32 v3, -1.0, v3
; GFX10-IEEE-NEXT: v_mul_f32_e32 v2, v2, v4
; GFX10-IEEE-NEXT: v_mul_f32_e32 v3, v3, v5
; GFX10-IEEE-NEXT: v_and_b32_e32 v2, 0xff800000, v2
; GFX10-IEEE-NEXT: v_and_b32_e32 v3, 0xff800000, v3
-; GFX10-IEEE-NEXT: v_add_f32_e32 v2, v2, v7
-; GFX10-IEEE-NEXT: v_add_f32_e32 v3, v3, v8
+; GFX10-IEEE-NEXT: v_add_f32_e32 v2, v2, v6
+; GFX10-IEEE-NEXT: v_add_f32_e32 v3, v3, v7
; GFX10-IEEE-NEXT: v_cvt_f16_f32_e32 v2, v2
; GFX10-IEEE-NEXT: v_cvt_f16_f32_e32 v3, v3
; GFX10-IEEE-NEXT: v_div_fixup_f16 v0, v2, v0, -1.0
@@ -2638,26 +2547,23 @@ define <2 x half> @v_neg_rcp_v2f16_fabs(<2 x half> %x) {
; GFX10-FLUSH: ; %bb.0:
; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-FLUSH-NEXT: v_and_b32_e32 v0, 0x7fff7fff, v0
-; GFX10-FLUSH-NEXT: v_cvt_f32_f16_e32 v6, -1.0
; GFX10-FLUSH-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GFX10-FLUSH-NEXT: v_cvt_f32_f16_e32 v2, v0
; GFX10-FLUSH-NEXT: v_cvt_f32_f16_e32 v3, v1
; GFX10-FLUSH-NEXT: v_rcp_f32_e32 v4, v2
; GFX10-FLUSH-NEXT: v_rcp_f32_e32 v5, v3
-; GFX10-FLUSH-NEXT: v_mul_f32_e32 v7, v6, v4
-; GFX10-FLUSH-NEXT: v_mul_f32_e32 v8, v6, v5
-; GFX10-FLUSH-NEXT: v_mad_f32 v9, -v2, v7, v6
-; GFX10-FLUSH-NEXT: v_mad_f32 v10, -v3, v8, v6
-; GFX10-FLUSH-NEXT: v_mac_f32_e32 v7, v9, v4
-; GFX10-FLUSH-NEXT: v_mac_f32_e32 v8, v10, v5
-; GFX10-FLUSH-NEXT: v_mad_f32 v2, -v2, v7, v6
-; GFX10-FLUSH-NEXT: v_mad_f32 v3, -v3, v8, v6
+; GFX10-FLUSH-NEXT: v_mad_f32 v6, v2, v4, -1.0
+; GFX10-FLUSH-NEXT: v_mad_f32 v7, v3, v5, -1.0
+; GFX10-FLUSH-NEXT: v_mad_f32 v6, v6, v4, -v4
+; GFX10-FLUSH-NEXT: v_mad_f32 v7, v7, v5, -v5
+; GFX10-FLUSH-NEXT: v_mad_f32 v2, -v2, v6, -1.0
+; GFX10-FLUSH-NEXT: v_mad_f32 v3, -v3, v7, -1.0
; GFX10-FLUSH-NEXT: v_mul_f32_e32 v2, v2, v4
; GFX10-FLUSH-NEXT: v_mul_f32_e32 v3, v3, v5
; GFX10-FLUSH-NEXT: v_and_b32_e32 v2, 0xff800000, v2
; GFX10-FLUSH-NEXT: v_and_b32_e32 v3, 0xff800000, v3
-; GFX10-FLUSH-NEXT: v_add_f32_e32 v2, v2, v7
-; GFX10-FLUSH-NEXT: v_add_f32_e32 v3, v3, v8
+; GFX10-FLUSH-NEXT: v_add_f32_e32 v2, v2, v6
+; GFX10-FLUSH-NEXT: v_add_f32_e32 v3, v3, v7
; GFX10-FLUSH-NEXT: v_cvt_f16_f32_e32 v2, v2
; GFX10-FLUSH-NEXT: v_cvt_f16_f32_e32 v3, v3
; GFX10-FLUSH-NEXT: v_div_fixup_f16 v0, v2, v0, -1.0
@@ -2668,30 +2574,30 @@ define <2 x half> @v_neg_rcp_v2f16_fabs(<2 x half> %x) {
; GFX11-LABEL: v_neg_rcp_v2f16_fabs:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_cvt_f32_f16_e32 v5, -1.0
; GFX11-NEXT: v_and_b32_e32 v1, 0x7fff7fff, v0
-; GFX11-NEXT: v_cvt_f32_f16_e32 v3, v1
-; GFX11-NEXT: v_rcp_f32_e32 v3, v3
-; GFX11-NEXT: s_waitcnt_depctr 0xfff
-; GFX11-NEXT: v_mul_f32_e32 v6, v5, v3
; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v1
-; GFX11-NEXT: v_fma_mix_f32 v7, -v1, v6, -1.0 op_sel_hi:[1,0,1]
; GFX11-NEXT: v_cvt_f32_f16_e32 v4, v2
-; GFX11-NEXT: v_fmac_f32_e32 v6, v7, v3
; GFX11-NEXT: v_rcp_f32_e32 v4, v4
-; GFX11-NEXT: v_fma_mix_f32 v7, -v1, v6, -1.0 op_sel_hi:[1,0,1]
+; GFX11-NEXT: v_mov_b32_e32 v5, -1.0
; GFX11-NEXT: s_waitcnt_depctr 0xfff
-; GFX11-NEXT: v_mul_f32_e32 v5, v5, v4
-; GFX11-NEXT: v_fma_mix_f32 v8, -|v0|, v5, -1.0 op_sel:[1,0,0] op_sel_hi:[1,0,1]
-; GFX11-NEXT: v_fmac_f32_e32 v5, v8, v4
-; GFX11-NEXT: v_fma_mix_f32 v0, -|v0|, v5, -1.0 op_sel:[1,0,0] op_sel_hi:[1,0,1]
-; GFX11-NEXT: v_dual_mul_f32 v3, v7, v3 :: v_dual_mul_f32 v0, v0, v4
+; GFX11-NEXT: v_fma_mix_f32 v7, |v0|, v4, v5 op_sel:[1,0,0] op_sel_hi:[1,0,0]
+; GFX11-NEXT: v_fma_f32 v7, v7, v4, -v4
+; GFX11-NEXT: v_fma_mix_f32 v0, -|v0|, v7, v5 op_sel:[1,0,0] op_sel_hi:[1,0,0]
+; GFX11-NEXT: v_mul_f32_e32 v0, v0, v4
; GFX11-NEXT: v_and_b32_e32 v0, 0xff800000, v0
-; GFX11-NEXT: v_dual_add_f32 v0, v0, v5 :: v_dual_and_b32 v3, 0xff800000, v3
-; GFX11-NEXT: v_add_f32_e32 v3, v3, v6
+; GFX11-NEXT: v_add_f32_e32 v0, v0, v7
+; GFX11-NEXT: v_cvt_f32_f16_e32 v3, v1
; GFX11-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX11-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GFX11-NEXT: v_rcp_f32_e32 v3, v3
; GFX11-NEXT: v_div_fixup_f16 v0, v0, v2, -1.0
+; GFX11-NEXT: s_waitcnt_depctr 0xfff
+; GFX11-NEXT: v_fma_mix_f32 v6, v1, v3, v5 op_sel_hi:[1,0,0]
+; GFX11-NEXT: v_fma_f32 v6, v6, v3, -v3
+; GFX11-NEXT: v_fma_mix_f32 v8, -v1, v6, v5 op_sel_hi:[1,0,0]
+; GFX11-NEXT: v_mul_f32_e32 v3, v8, v3
+; GFX11-NEXT: v_and_b32_e32 v3, 0xff800000, v3
+; GFX11-NEXT: v_add_f32_e32 v3, v3, v6
+; GFX11-NEXT: v_cvt_f16_f32_e32 v3, v3
; GFX11-NEXT: v_div_fixup_f16 v1, v3, v1, -1.0
; GFX11-NEXT: v_pack_b32_f16 v0, v1, v0
; GFX11-NEXT: s_setpc_b64 s[30:31]
@@ -2704,70 +2610,67 @@ define <2 x half> @v_rcp_v2f16_arcp(<2 x half> %x) {
; GFX6-IEEE-LABEL: v_rcp_v2f16_arcp:
; GFX6-IEEE: ; %bb.0:
; GFX6-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v2, 1.0
; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, v1
-; GFX6-IEEE-NEXT: v_div_scale_f32 v3, s[4:5], v0, v0, v2
-; GFX6-IEEE-NEXT: v_rcp_f32_e32 v4, v3
-; GFX6-IEEE-NEXT: v_div_scale_f32 v5, vcc, v2, v0, v2
-; GFX6-IEEE-NEXT: v_fma_f32 v6, -v3, v4, 1.0
-; GFX6-IEEE-NEXT: v_fma_f32 v4, v6, v4, v4
-; GFX6-IEEE-NEXT: v_mul_f32_e32 v6, v5, v4
-; GFX6-IEEE-NEXT: v_fma_f32 v7, -v3, v6, v5
-; GFX6-IEEE-NEXT: v_fma_f32 v6, v7, v4, v6
-; GFX6-IEEE-NEXT: v_fma_f32 v3, -v3, v6, v5
-; GFX6-IEEE-NEXT: v_div_fmas_f32 v3, v3, v4, v6
-; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v3, v0, v2
-; GFX6-IEEE-NEXT: v_div_scale_f32 v3, s[4:5], v1, v1, v2
-; GFX6-IEEE-NEXT: v_rcp_f32_e32 v4, v3
-; GFX6-IEEE-NEXT: v_div_scale_f32 v5, vcc, v2, v1, v2
+; GFX6-IEEE-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, 1.0
+; GFX6-IEEE-NEXT: v_rcp_f32_e32 v3, v2
+; GFX6-IEEE-NEXT: v_div_scale_f32 v4, vcc, 1.0, v0, 1.0
+; GFX6-IEEE-NEXT: v_fma_f32 v5, -v2, v3, 1.0
+; GFX6-IEEE-NEXT: v_fma_f32 v3, v5, v3, v3
+; GFX6-IEEE-NEXT: v_mul_f32_e32 v5, v4, v3
+; GFX6-IEEE-NEXT: v_fma_f32 v6, -v2, v5, v4
+; GFX6-IEEE-NEXT: v_fma_f32 v5, v6, v3, v5
+; GFX6-IEEE-NEXT: v_fma_f32 v2, -v2, v5, v4
+; GFX6-IEEE-NEXT: v_div_fmas_f32 v2, v2, v3, v5
+; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v2, v0, 1.0
+; GFX6-IEEE-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, 1.0
+; GFX6-IEEE-NEXT: v_rcp_f32_e32 v3, v2
+; GFX6-IEEE-NEXT: v_div_scale_f32 v4, vcc, 1.0, v1, 1.0
; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX6-IEEE-NEXT: v_fma_f32 v6, -v3, v4, 1.0
-; GFX6-IEEE-NEXT: v_fma_f32 v4, v6, v4, v4
-; GFX6-IEEE-NEXT: v_mul_f32_e32 v6, v5, v4
-; GFX6-IEEE-NEXT: v_fma_f32 v7, -v3, v6, v5
-; GFX6-IEEE-NEXT: v_fma_f32 v6, v7, v4, v6
-; GFX6-IEEE-NEXT: v_fma_f32 v3, -v3, v6, v5
-; GFX6-IEEE-NEXT: v_div_fmas_f32 v3, v3, v4, v6
-; GFX6-IEEE-NEXT: v_div_fixup_f32 v1, v3, v1, v2
+; GFX6-IEEE-NEXT: v_fma_f32 v5, -v2, v3, 1.0
+; GFX6-IEEE-NEXT: v_fma_f32 v3, v5, v3, v3
+; GFX6-IEEE-NEXT: v_mul_f32_e32 v5, v4, v3
+; GFX6-IEEE-NEXT: v_fma_f32 v6, -v2, v5, v4
+; GFX6-IEEE-NEXT: v_fma_f32 v5, v6, v3, v5
+; GFX6-IEEE-NEXT: v_fma_f32 v2, -v2, v5, v4
+; GFX6-IEEE-NEXT: v_div_fmas_f32 v2, v2, v3, v5
+; GFX6-IEEE-NEXT: v_div_fixup_f32 v1, v2, v1, 1.0
; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX6-IEEE-NEXT: s_setpc_b64 s[30:31]
;
; GFX6-FLUSH-LABEL: v_rcp_v2f16_arcp:
; GFX6-FLUSH: ; %bb.0:
; GFX6-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v2, 1.0
; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX6-FLUSH-NEXT: v_div_scale_f32 v3, s[4:5], v0, v0, v2
-; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v4, v3
-; GFX6-FLUSH-NEXT: v_div_scale_f32 v5, vcc, v2, v0, v2
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, 1.0
+; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v3, v2
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v4, vcc, 1.0, v0, 1.0
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
-; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v3, v4, 1.0
-; GFX6-FLUSH-NEXT: v_fma_f32 v4, v6, v4, v4
-; GFX6-FLUSH-NEXT: v_mul_f32_e32 v6, v5, v4
-; GFX6-FLUSH-NEXT: v_fma_f32 v7, -v3, v6, v5
-; GFX6-FLUSH-NEXT: v_fma_f32 v6, v7, v4, v6
-; GFX6-FLUSH-NEXT: v_fma_f32 v3, -v3, v6, v5
+; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v2, v3, 1.0
+; GFX6-FLUSH-NEXT: v_fma_f32 v3, v5, v3, v3
+; GFX6-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v3
+; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v5, v4
+; GFX6-FLUSH-NEXT: v_fma_f32 v5, v6, v3, v5
+; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v5, v4
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
-; GFX6-FLUSH-NEXT: v_div_fmas_f32 v3, v3, v4, v6
-; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v4, 1.0
; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, v1
-; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v3, v0, v2
+; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v5
+; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v2, v0, 1.0
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, v4
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, 1.0
; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v3, v2
-; GFX6-FLUSH-NEXT: v_div_scale_f32 v5, vcc, v4, v1, v4
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v4, vcc, 1.0, v1, 1.0
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
-; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v3, 1.0
-; GFX6-FLUSH-NEXT: v_fma_f32 v3, v6, v3, v3
-; GFX6-FLUSH-NEXT: v_mul_f32_e32 v6, v5, v3
-; GFX6-FLUSH-NEXT: v_fma_f32 v7, -v2, v6, v5
-; GFX6-FLUSH-NEXT: v_fma_f32 v6, v7, v3, v6
-; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v6, v5
+; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v2, v3, 1.0
+; GFX6-FLUSH-NEXT: v_fma_f32 v3, v5, v3, v3
+; GFX6-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v3
+; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v5, v4
+; GFX6-FLUSH-NEXT: v_fma_f32 v5, v6, v3, v5
+; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v5, v4
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
-; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v6
-; GFX6-FLUSH-NEXT: v_div_fixup_f32 v1, v2, v1, v4
+; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v5
+; GFX6-FLUSH-NEXT: v_div_fixup_f32 v1, v2, v1, 1.0
; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX6-FLUSH-NEXT: s_setpc_b64 s[30:31]
;
@@ -2814,11 +2717,8 @@ define <2 x half> @v_rcp_v2f16_arcp_afn(<2 x half> %x) {
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1
-; GFX6-NEXT: v_cvt_f32_f16_e32 v2, 1.0
; GFX6-NEXT: v_rcp_f32_e32 v0, v0
; GFX6-NEXT: v_rcp_f32_e32 v1, v1
-; GFX6-NEXT: v_mul_f32_e32 v0, v2, v0
-; GFX6-NEXT: v_mul_f32_e32 v1, v2, v1
; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX6-NEXT: s_setpc_b64 s[30:31]
@@ -2864,70 +2764,67 @@ define <2 x half> @v_rcp_v2f16_ulp25(<2 x half> %x) {
; GFX6-IEEE-LABEL: v_rcp_v2f16_ulp25:
; GFX6-IEEE: ; %bb.0:
; GFX6-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v2, 1.0
; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, v1
-; GFX6-IEEE-NEXT: v_div_scale_f32 v3, s[4:5], v0, v0, v2
-; GFX6-IEEE-NEXT: v_rcp_f32_e32 v4, v3
-; GFX6-IEEE-NEXT: v_div_scale_f32 v5, vcc, v2, v0, v2
-; GFX6-IEEE-NEXT: v_fma_f32 v6, -v3, v4, 1.0
-; GFX6-IEEE-NEXT: v_fma_f32 v4, v6, v4, v4
-; GFX6-IEEE-NEXT: v_mul_f32_e32 v6, v5, v4
-; GFX6-IEEE-NEXT: v_fma_f32 v7, -v3, v6, v5
-; GFX6-IEEE-NEXT: v_fma_f32 v6, v7, v4, v6
-; GFX6-IEEE-NEXT: v_fma_f32 v3, -v3, v6, v5
-; GFX6-IEEE-NEXT: v_div_fmas_f32 v3, v3, v4, v6
-; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v3, v0, v2
-; GFX6-IEEE-NEXT: v_div_scale_f32 v3, s[4:5], v1, v1, v2
-; GFX6-IEEE-NEXT: v_rcp_f32_e32 v4, v3
-; GFX6-IEEE-NEXT: v_div_scale_f32 v5, vcc, v2, v1, v2
+; GFX6-IEEE-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, 1.0
+; GFX6-IEEE-NEXT: v_rcp_f32_e32 v3, v2
+; GFX6-IEEE-NEXT: v_div_scale_f32 v4, vcc, 1.0, v0, 1.0
+; GFX6-IEEE-NEXT: v_fma_f32 v5, -v2, v3, 1.0
+; GFX6-IEEE-NEXT: v_fma_f32 v3, v5, v3, v3
+; GFX6-IEEE-NEXT: v_mul_f32_e32 v5, v4, v3
+; GFX6-IEEE-NEXT: v_fma_f32 v6, -v2, v5, v4
+; GFX6-IEEE-NEXT: v_fma_f32 v5, v6, v3, v5
+; GFX6-IEEE-NEXT: v_fma_f32 v2, -v2, v5, v4
+; GFX6-IEEE-NEXT: v_div_fmas_f32 v2, v2, v3, v5
+; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v2, v0, 1.0
+; GFX6-IEEE-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, 1.0
+; GFX6-IEEE-NEXT: v_rcp_f32_e32 v3, v2
+; GFX6-IEEE-NEXT: v_div_scale_f32 v4, vcc, 1.0, v1, 1.0
; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX6-IEEE-NEXT: v_fma_f32 v6, -v3, v4, 1.0
-; GFX6-IEEE-NEXT: v_fma_f32 v4, v6, v4, v4
-; GFX6-IEEE-NEXT: v_mul_f32_e32 v6, v5, v4
-; GFX6-IEEE-NEXT: v_fma_f32 v7, -v3, v6, v5
-; GFX6-IEEE-NEXT: v_fma_f32 v6, v7, v4, v6
-; GFX6-IEEE-NEXT: v_fma_f32 v3, -v3, v6, v5
-; GFX6-IEEE-NEXT: v_div_fmas_f32 v3, v3, v4, v6
-; GFX6-IEEE-NEXT: v_div_fixup_f32 v1, v3, v1, v2
+; GFX6-IEEE-NEXT: v_fma_f32 v5, -v2, v3, 1.0
+; GFX6-IEEE-NEXT: v_fma_f32 v3, v5, v3, v3
+; GFX6-IEEE-NEXT: v_mul_f32_e32 v5, v4, v3
+; GFX6-IEEE-NEXT: v_fma_f32 v6, -v2, v5, v4
+; GFX6-IEEE-NEXT: v_fma_f32 v5, v6, v3, v5
+; GFX6-IEEE-NEXT: v_fma_f32 v2, -v2, v5, v4
+; GFX6-IEEE-NEXT: v_div_fmas_f32 v2, v2, v3, v5
+; GFX6-IEEE-NEXT: v_div_fixup_f32 v1, v2, v1, 1.0
; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX6-IEEE-NEXT: s_setpc_b64 s[30:31]
;
; GFX6-FLUSH-LABEL: v_rcp_v2f16_ulp25:
; GFX6-FLUSH: ; %bb.0:
; GFX6-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v2, 1.0
; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX6-FLUSH-NEXT: v_div_scale_f32 v3, s[4:5], v0, v0, v2
-; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v4, v3
-; GFX6-FLUSH-NEXT: v_div_scale_f32 v5, vcc, v2, v0, v2
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, 1.0
+; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v3, v2
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v4, vcc, 1.0, v0, 1.0
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
-; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v3, v4, 1.0
-; GFX6-FLUSH-NEXT: v_fma_f32 v4, v6, v4, v4
-; GFX6-FLUSH-NEXT: v_mul_f32_e32 v6, v5, v4
-; GFX6-FLUSH-NEXT: v_fma_f32 v7, -v3, v6, v5
-; GFX6-FLUSH-NEXT: v_fma_f32 v6, v7, v4, v6
-; GFX6-FLUSH-NEXT: v_fma_f32 v3, -v3, v6, v5
+; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v2, v3, 1.0
+; GFX6-FLUSH-NEXT: v_fma_f32 v3, v5, v3, v3
+; GFX6-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v3
+; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v5, v4
+; GFX6-FLUSH-NEXT: v_fma_f32 v5, v6, v3, v5
+; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v5, v4
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
-; GFX6-FLUSH-NEXT: v_div_fmas_f32 v3, v3, v4, v6
-; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v4, 1.0
; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, v1
-; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v3, v0, v2
+; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v5
+; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v2, v0, 1.0
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, v4
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, 1.0
; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v3, v2
-; GFX6-FLUSH-NEXT: v_div_scale_f32 v5, vcc, v4, v1, v4
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v4, vcc, 1.0, v1, 1.0
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
-; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v3, 1.0
-; GFX6-FLUSH-NEXT: v_fma_f32 v3, v6, v3, v3
-; GFX6-FLUSH-NEXT: v_mul_f32_e32 v6, v5, v3
-; GFX6-FLUSH-NEXT: v_fma_f32 v7, -v2, v6, v5
-; GFX6-FLUSH-NEXT: v_fma_f32 v6, v7, v3, v6
-; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v6, v5
+; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v2, v3, 1.0
+; GFX6-FLUSH-NEXT: v_fma_f32 v3, v5, v3, v3
+; GFX6-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v3
+; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v5, v4
+; GFX6-FLUSH-NEXT: v_fma_f32 v5, v6, v3, v5
+; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v5, v4
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
-; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v6
-; GFX6-FLUSH-NEXT: v_div_fixup_f32 v1, v2, v1, v4
+; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v5
+; GFX6-FLUSH-NEXT: v_div_fixup_f32 v1, v2, v1, 1.0
; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX6-FLUSH-NEXT: s_setpc_b64 s[30:31]
;
@@ -2936,30 +2833,27 @@ define <2 x half> @v_rcp_v2f16_ulp25(<2 x half> %x) {
; GFX8-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-IEEE-NEXT: v_cvt_f32_f16_e32 v1, v0
; GFX8-IEEE-NEXT: v_lshrrev_b32_e32 v2, 16, v0
-; GFX8-IEEE-NEXT: v_cvt_f32_f16_e32 v4, 1.0
; GFX8-IEEE-NEXT: v_cvt_f32_f16_e32 v3, v2
-; GFX8-IEEE-NEXT: v_rcp_f32_e32 v5, v1
-; GFX8-IEEE-NEXT: v_rcp_f32_e32 v6, v3
-; GFX8-IEEE-NEXT: v_mul_f32_e32 v7, v4, v5
-; GFX8-IEEE-NEXT: v_mul_f32_e64 v8, -v1, v7
-; GFX8-IEEE-NEXT: v_add_f32_e32 v8, v8, v4
-; GFX8-IEEE-NEXT: v_mul_f32_e32 v8, v8, v5
-; GFX8-IEEE-NEXT: v_add_f32_e32 v7, v8, v7
-; GFX8-IEEE-NEXT: v_mul_f32_e32 v9, v4, v6
-; GFX8-IEEE-NEXT: v_mul_f32_e64 v1, -v1, v7
-; GFX8-IEEE-NEXT: v_mul_f32_e64 v10, -v3, v9
-; GFX8-IEEE-NEXT: v_add_f32_e32 v1, v1, v4
-; GFX8-IEEE-NEXT: v_mul_f32_e32 v1, v1, v5
-; GFX8-IEEE-NEXT: v_add_f32_e32 v5, v10, v4
-; GFX8-IEEE-NEXT: v_mul_f32_e32 v5, v5, v6
-; GFX8-IEEE-NEXT: v_add_f32_e32 v5, v5, v9
-; GFX8-IEEE-NEXT: v_mul_f32_e64 v3, -v3, v5
-; GFX8-IEEE-NEXT: v_add_f32_e32 v3, v3, v4
-; GFX8-IEEE-NEXT: v_mul_f32_e32 v3, v3, v6
+; GFX8-IEEE-NEXT: v_rcp_f32_e32 v4, v1
+; GFX8-IEEE-NEXT: v_rcp_f32_e32 v5, v3
+; GFX8-IEEE-NEXT: v_mul_f32_e64 v6, -v1, v4
+; GFX8-IEEE-NEXT: v_add_f32_e32 v6, 1.0, v6
+; GFX8-IEEE-NEXT: v_mul_f32_e32 v6, v6, v4
+; GFX8-IEEE-NEXT: v_mul_f32_e64 v7, -v3, v5
+; GFX8-IEEE-NEXT: v_add_f32_e32 v6, v6, v4
+; GFX8-IEEE-NEXT: v_add_f32_e32 v7, 1.0, v7
+; GFX8-IEEE-NEXT: v_mul_f32_e64 v1, -v1, v6
+; GFX8-IEEE-NEXT: v_mul_f32_e32 v7, v7, v5
+; GFX8-IEEE-NEXT: v_add_f32_e32 v1, 1.0, v1
+; GFX8-IEEE-NEXT: v_mul_f32_e32 v1, v1, v4
+; GFX8-IEEE-NEXT: v_add_f32_e32 v4, v7, v5
+; GFX8-IEEE-NEXT: v_mul_f32_e64 v3, -v3, v4
+; GFX8-IEEE-NEXT: v_add_f32_e32 v3, 1.0, v3
+; GFX8-IEEE-NEXT: v_mul_f32_e32 v3, v3, v5
; GFX8-IEEE-NEXT: v_and_b32_e32 v1, 0xff800000, v1
; GFX8-IEEE-NEXT: v_and_b32_e32 v3, 0xff800000, v3
-; GFX8-IEEE-NEXT: v_add_f32_e32 v1, v1, v7
-; GFX8-IEEE-NEXT: v_add_f32_e32 v3, v3, v5
+; GFX8-IEEE-NEXT: v_add_f32_e32 v1, v1, v6
+; GFX8-IEEE-NEXT: v_add_f32_e32 v3, v3, v4
; GFX8-IEEE-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX8-IEEE-NEXT: v_cvt_f16_f32_e32 v3, v3
; GFX8-IEEE-NEXT: v_div_fixup_f16 v0, v1, v0, 1.0
@@ -2971,26 +2865,23 @@ define <2 x half> @v_rcp_v2f16_ulp25(<2 x half> %x) {
; GFX8-FLUSH-LABEL: v_rcp_v2f16_ulp25:
; GFX8-FLUSH: ; %bb.0:
; GFX8-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, v0
; GFX8-FLUSH-NEXT: v_lshrrev_b32_e32 v2, 16, v0
+; GFX8-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, v0
; GFX8-FLUSH-NEXT: v_cvt_f32_f16_e32 v3, v2
-; GFX8-FLUSH-NEXT: v_cvt_f32_f16_e32 v4, 1.0
-; GFX8-FLUSH-NEXT: v_rcp_f32_e32 v5, v1
-; GFX8-FLUSH-NEXT: v_rcp_f32_e32 v6, v3
-; GFX8-FLUSH-NEXT: v_mul_f32_e32 v7, v4, v5
-; GFX8-FLUSH-NEXT: v_mad_f32 v8, -v1, v7, v4
-; GFX8-FLUSH-NEXT: v_mul_f32_e32 v9, v4, v6
-; GFX8-FLUSH-NEXT: v_mac_f32_e32 v7, v8, v5
-; GFX8-FLUSH-NEXT: v_mad_f32 v8, -v3, v9, v4
-; GFX8-FLUSH-NEXT: v_mac_f32_e32 v9, v8, v6
-; GFX8-FLUSH-NEXT: v_mad_f32 v1, -v1, v7, v4
-; GFX8-FLUSH-NEXT: v_mad_f32 v3, -v3, v9, v4
-; GFX8-FLUSH-NEXT: v_mul_f32_e32 v1, v1, v5
-; GFX8-FLUSH-NEXT: v_mul_f32_e32 v3, v3, v6
+; GFX8-FLUSH-NEXT: v_rcp_f32_e32 v4, v1
+; GFX8-FLUSH-NEXT: v_rcp_f32_e32 v5, v3
+; GFX8-FLUSH-NEXT: v_mad_f32 v6, -v1, v4, 1.0
+; GFX8-FLUSH-NEXT: v_mad_f32 v7, -v3, v5, 1.0
+; GFX8-FLUSH-NEXT: v_mad_f32 v6, v6, v4, v4
+; GFX8-FLUSH-NEXT: v_mad_f32 v7, v7, v5, v5
+; GFX8-FLUSH-NEXT: v_mad_f32 v1, -v1, v6, 1.0
+; GFX8-FLUSH-NEXT: v_mad_f32 v3, -v3, v7, 1.0
+; GFX8-FLUSH-NEXT: v_mul_f32_e32 v1, v1, v4
+; GFX8-FLUSH-NEXT: v_mul_f32_e32 v3, v3, v5
; GFX8-FLUSH-NEXT: v_and_b32_e32 v1, 0xff800000, v1
; GFX8-FLUSH-NEXT: v_and_b32_e32 v3, 0xff800000, v3
-; GFX8-FLUSH-NEXT: v_add_f32_e32 v1, v1, v7
-; GFX8-FLUSH-NEXT: v_add_f32_e32 v3, v3, v9
+; GFX8-FLUSH-NEXT: v_add_f32_e32 v1, v1, v6
+; GFX8-FLUSH-NEXT: v_add_f32_e32 v3, v3, v7
; GFX8-FLUSH-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX8-FLUSH-NEXT: v_cvt_f16_f32_e32 v3, v3
; GFX8-FLUSH-NEXT: v_div_fixup_f16 v0, v1, v0, 1.0
@@ -3004,30 +2895,27 @@ define <2 x half> @v_rcp_v2f16_ulp25(<2 x half> %x) {
; GFX9-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v1, v0
; GFX9-IEEE-NEXT: v_lshrrev_b32_e32 v2, 16, v0
-; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v4, 1.0
; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v3, v2
-; GFX9-IEEE-NEXT: v_rcp_f32_e32 v5, v1
-; GFX9-IEEE-NEXT: v_rcp_f32_e32 v6, v3
-; GFX9-IEEE-NEXT: v_mul_f32_e32 v7, v4, v5
-; GFX9-IEEE-NEXT: v_mul_f32_e64 v8, -v1, v7
-; GFX9-IEEE-NEXT: v_add_f32_e32 v8, v8, v4
-; GFX9-IEEE-NEXT: v_mul_f32_e32 v8, v8, v5
-; GFX9-IEEE-NEXT: v_add_f32_e32 v7, v8, v7
-; GFX9-IEEE-NEXT: v_mul_f32_e32 v9, v4, v6
-; GFX9-IEEE-NEXT: v_mul_f32_e64 v1, -v1, v7
-; GFX9-IEEE-NEXT: v_mul_f32_e64 v10, -v3, v9
-; GFX9-IEEE-NEXT: v_add_f32_e32 v1, v1, v4
-; GFX9-IEEE-NEXT: v_mul_f32_e32 v1, v1, v5
-; GFX9-IEEE-NEXT: v_add_f32_e32 v5, v10, v4
-; GFX9-IEEE-NEXT: v_mul_f32_e32 v5, v5, v6
-; GFX9-IEEE-NEXT: v_add_f32_e32 v5, v5, v9
-; GFX9-IEEE-NEXT: v_mul_f32_e64 v3, -v3, v5
-; GFX9-IEEE-NEXT: v_add_f32_e32 v3, v3, v4
-; GFX9-IEEE-NEXT: v_mul_f32_e32 v3, v3, v6
+; GFX9-IEEE-NEXT: v_rcp_f32_e32 v4, v1
+; GFX9-IEEE-NEXT: v_rcp_f32_e32 v5, v3
+; GFX9-IEEE-NEXT: v_mul_f32_e64 v6, -v1, v4
+; GFX9-IEEE-NEXT: v_add_f32_e32 v6, 1.0, v6
+; GFX9-IEEE-NEXT: v_mul_f32_e32 v6, v6, v4
+; GFX9-IEEE-NEXT: v_mul_f32_e64 v7, -v3, v5
+; GFX9-IEEE-NEXT: v_add_f32_e32 v6, v6, v4
+; GFX9-IEEE-NEXT: v_add_f32_e32 v7, 1.0, v7
+; GFX9-IEEE-NEXT: v_mul_f32_e64 v1, -v1, v6
+; GFX9-IEEE-NEXT: v_mul_f32_e32 v7, v7, v5
+; GFX9-IEEE-NEXT: v_add_f32_e32 v1, 1.0, v1
+; GFX9-IEEE-NEXT: v_mul_f32_e32 v1, v1, v4
+; GFX9-IEEE-NEXT: v_add_f32_e32 v4, v7, v5
+; GFX9-IEEE-NEXT: v_mul_f32_e64 v3, -v3, v4
+; GFX9-IEEE-NEXT: v_add_f32_e32 v3, 1.0, v3
+; GFX9-IEEE-NEXT: v_mul_f32_e32 v3, v3, v5
; GFX9-IEEE-NEXT: v_and_b32_e32 v1, 0xff800000, v1
; GFX9-IEEE-NEXT: v_and_b32_e32 v3, 0xff800000, v3
-; GFX9-IEEE-NEXT: v_add_f32_e32 v1, v1, v7
-; GFX9-IEEE-NEXT: v_add_f32_e32 v3, v3, v5
+; GFX9-IEEE-NEXT: v_add_f32_e32 v1, v1, v6
+; GFX9-IEEE-NEXT: v_add_f32_e32 v3, v3, v4
; GFX9-IEEE-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX9-IEEE-NEXT: v_cvt_f16_f32_e32 v3, v3
; GFX9-IEEE-NEXT: v_div_fixup_f16 v0, v1, v0, 1.0
@@ -3038,26 +2926,24 @@ define <2 x half> @v_rcp_v2f16_ulp25(<2 x half> %x) {
; GFX9-FLUSH-LABEL: v_rcp_v2f16_ulp25:
; GFX9-FLUSH: ; %bb.0:
; GFX9-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, v0
; GFX9-FLUSH-NEXT: v_lshrrev_b32_e32 v2, 16, v0
+; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, v0
; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v3, v2
-; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v4, 1.0
+; GFX9-FLUSH-NEXT: v_mov_b32_e32 v4, 1.0
; GFX9-FLUSH-NEXT: v_rcp_f32_e32 v1, v1
; GFX9-FLUSH-NEXT: v_rcp_f32_e32 v3, v3
-; GFX9-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v1
-; GFX9-FLUSH-NEXT: v_mad_mix_f32 v6, -v0, v5, 1.0 op_sel_hi:[1,0,1]
-; GFX9-FLUSH-NEXT: v_mac_f32_e32 v5, v6, v1
-; GFX9-FLUSH-NEXT: v_mul_f32_e32 v4, v4, v3
-; GFX9-FLUSH-NEXT: v_mad_mix_f32 v7, -v0, v5, 1.0 op_sel_hi:[1,0,1]
-; GFX9-FLUSH-NEXT: v_mad_mix_f32 v6, -v0, v4, 1.0 op_sel:[1,0,0] op_sel_hi:[1,0,1]
+; GFX9-FLUSH-NEXT: v_mad_mix_f32 v5, -v0, v1, v4 op_sel_hi:[1,0,0]
+; GFX9-FLUSH-NEXT: v_mad_mix_f32 v6, -v0, v3, v4 op_sel:[1,0,0] op_sel_hi:[1,0,0]
+; GFX9-FLUSH-NEXT: v_mad_f32 v5, v5, v1, v1
+; GFX9-FLUSH-NEXT: v_mad_f32 v6, v6, v3, v3
+; GFX9-FLUSH-NEXT: v_mad_mix_f32 v7, -v0, v5, v4 op_sel_hi:[1,0,0]
+; GFX9-FLUSH-NEXT: v_mad_mix_f32 v4, -v0, v6, v4 op_sel:[1,0,0] op_sel_hi:[1,0,0]
; GFX9-FLUSH-NEXT: v_mul_f32_e32 v1, v7, v1
+; GFX9-FLUSH-NEXT: v_mul_f32_e32 v3, v4, v3
; GFX9-FLUSH-NEXT: v_and_b32_e32 v1, 0xff800000, v1
-; GFX9-FLUSH-NEXT: v_mac_f32_e32 v4, v6, v3
-; GFX9-FLUSH-NEXT: v_add_f32_e32 v1, v1, v5
-; GFX9-FLUSH-NEXT: v_mad_mix_f32 v5, -v0, v4, 1.0 op_sel:[1,0,0] op_sel_hi:[1,0,1]
-; GFX9-FLUSH-NEXT: v_mul_f32_e32 v3, v5, v3
; GFX9-FLUSH-NEXT: v_and_b32_e32 v3, 0xff800000, v3
-; GFX9-FLUSH-NEXT: v_add_f32_e32 v3, v3, v4
+; GFX9-FLUSH-NEXT: v_add_f32_e32 v1, v1, v5
+; GFX9-FLUSH-NEXT: v_add_f32_e32 v3, v3, v6
; GFX9-FLUSH-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX9-FLUSH-NEXT: v_cvt_f16_f32_e32 v3, v3
; GFX9-FLUSH-NEXT: v_div_fixup_f16 v0, v1, v0, 1.0
@@ -3070,30 +2956,27 @@ define <2 x half> @v_rcp_v2f16_ulp25(<2 x half> %x) {
; GFX10-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-IEEE-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GFX10-IEEE-NEXT: v_cvt_f32_f16_e32 v2, v0
-; GFX10-IEEE-NEXT: v_cvt_f32_f16_e32 v6, 1.0
; GFX10-IEEE-NEXT: v_cvt_f32_f16_e32 v3, v1
; GFX10-IEEE-NEXT: v_rcp_f32_e32 v4, v2
; GFX10-IEEE-NEXT: v_rcp_f32_e32 v5, v3
-; GFX10-IEEE-NEXT: v_mul_f32_e32 v7, v6, v4
-; GFX10-IEEE-NEXT: v_mul_f32_e32 v8, v6, v5
-; GFX10-IEEE-NEXT: v_mul_f32_e64 v9, -v2, v7
-; GFX10-IEEE-NEXT: v_mul_f32_e64 v10, -v3, v8
-; GFX10-IEEE-NEXT: v_add_f32_e32 v9, v9, v6
-; GFX10-IEEE-NEXT: v_add_f32_e32 v10, v10, v6
-; GFX10-IEEE-NEXT: v_mul_f32_e32 v9, v9, v4
-; GFX10-IEEE-NEXT: v_mul_f32_e32 v10, v10, v5
-; GFX10-IEEE-NEXT: v_add_f32_e32 v7, v9, v7
-; GFX10-IEEE-NEXT: v_add_f32_e32 v8, v10, v8
-; GFX10-IEEE-NEXT: v_mul_f32_e64 v2, -v2, v7
-; GFX10-IEEE-NEXT: v_mul_f32_e64 v3, -v3, v8
-; GFX10-IEEE-NEXT: v_add_f32_e32 v2, v2, v6
-; GFX10-IEEE-NEXT: v_add_f32_e32 v3, v3, v6
+; GFX10-IEEE-NEXT: v_mul_f32_e64 v6, -v2, v4
+; GFX10-IEEE-NEXT: v_mul_f32_e64 v7, -v3, v5
+; GFX10-IEEE-NEXT: v_add_f32_e32 v6, 1.0, v6
+; GFX10-IEEE-NEXT: v_add_f32_e32 v7, 1.0, v7
+; GFX10-IEEE-NEXT: v_mul_f32_e32 v6, v6, v4
+; GFX10-IEEE-NEXT: v_mul_f32_e32 v7, v7, v5
+; GFX10-IEEE-NEXT: v_add_f32_e32 v6, v6, v4
+; GFX10-IEEE-NEXT: v_add_f32_e32 v7, v7, v5
+; GFX10-IEEE-NEXT: v_mul_f32_e64 v2, -v2, v6
+; GFX10-IEEE-NEXT: v_mul_f32_e64 v3, -v3, v7
+; GFX10-IEEE-NEXT: v_add_f32_e32 v2, 1.0, v2
+; GFX10-IEEE-NEXT: v_add_f32_e32 v3, 1.0, v3
; GFX10-IEEE-NEXT: v_mul_f32_e32 v2, v2, v4
; GFX10-IEEE-NEXT: v_mul_f32_e32 v3, v3, v5
; GFX10-IEEE-NEXT: v_and_b32_e32 v2, 0xff800000, v2
; GFX10-IEEE-NEXT: v_and_b32_e32 v3, 0xff800000, v3
-; GFX10-IEEE-NEXT: v_add_f32_e32 v2, v2, v7
-; GFX10-IEEE-NEXT: v_add_f32_e32 v3, v3, v8
+; GFX10-IEEE-NEXT: v_add_f32_e32 v2, v2, v6
+; GFX10-IEEE-NEXT: v_add_f32_e32 v3, v3, v7
; GFX10-IEEE-NEXT: v_cvt_f16_f32_e32 v2, v2
; GFX10-IEEE-NEXT: v_cvt_f16_f32_e32 v3, v3
; GFX10-IEEE-NEXT: v_div_fixup_f16 v0, v2, v0, 1.0
@@ -3106,24 +2989,21 @@ define <2 x half> @v_rcp_v2f16_ulp25(<2 x half> %x) {
; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-FLUSH-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GFX10-FLUSH-NEXT: v_cvt_f32_f16_e32 v2, v0
-; GFX10-FLUSH-NEXT: v_cvt_f32_f16_e32 v6, 1.0
; GFX10-FLUSH-NEXT: v_cvt_f32_f16_e32 v3, v1
; GFX10-FLUSH-NEXT: v_rcp_f32_e32 v4, v2
; GFX10-FLUSH-NEXT: v_rcp_f32_e32 v5, v3
-; GFX10-FLUSH-NEXT: v_mul_f32_e32 v7, v6, v4
-; GFX10-FLUSH-NEXT: v_mul_f32_e32 v8, v6, v5
-; GFX10-FLUSH-NEXT: v_mad_f32 v9, -v2, v7, v6
-; GFX10-FLUSH-NEXT: v_mad_f32 v10, -v3, v8, v6
-; GFX10-FLUSH-NEXT: v_mac_f32_e32 v7, v9, v4
-; GFX10-FLUSH-NEXT: v_mac_f32_e32 v8, v10, v5
-; GFX10-FLUSH-NEXT: v_mad_f32 v2, -v2, v7, v6
-; GFX10-FLUSH-NEXT: v_mad_f32 v3, -v3, v8, v6
+; GFX10-FLUSH-NEXT: v_mad_f32 v6, -v2, v4, 1.0
+; GFX10-FLUSH-NEXT: v_mad_f32 v7, -v3, v5, 1.0
+; GFX10-FLUSH-NEXT: v_mad_f32 v6, v6, v4, v4
+; GFX10-FLUSH-NEXT: v_mad_f32 v7, v7, v5, v5
+; GFX10-FLUSH-NEXT: v_mad_f32 v2, -v2, v6, 1.0
+; GFX10-FLUSH-NEXT: v_mad_f32 v3, -v3, v7, 1.0
; GFX10-FLUSH-NEXT: v_mul_f32_e32 v2, v2, v4
; GFX10-FLUSH-NEXT: v_mul_f32_e32 v3, v3, v5
; GFX10-FLUSH-NEXT: v_and_b32_e32 v2, 0xff800000, v2
; GFX10-FLUSH-NEXT: v_and_b32_e32 v3, 0xff800000, v3
-; GFX10-FLUSH-NEXT: v_add_f32_e32 v2, v2, v7
-; GFX10-FLUSH-NEXT: v_add_f32_e32 v3, v3, v8
+; GFX10-FLUSH-NEXT: v_add_f32_e32 v2, v2, v6
+; GFX10-FLUSH-NEXT: v_add_f32_e32 v3, v3, v7
; GFX10-FLUSH-NEXT: v_cvt_f16_f32_e32 v2, v2
; GFX10-FLUSH-NEXT: v_cvt_f16_f32_e32 v3, v3
; GFX10-FLUSH-NEXT: v_div_fixup_f16 v0, v2, v0, 1.0
@@ -3136,27 +3016,25 @@ define <2 x half> @v_rcp_v2f16_ulp25(<2 x half> %x) {
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GFX11-NEXT: v_cvt_f32_f16_e32 v2, v0
-; GFX11-NEXT: v_cvt_f32_f16_e32 v4, 1.0
+; GFX11-NEXT: v_mov_b32_e32 v4, 1.0
; GFX11-NEXT: v_cvt_f32_f16_e32 v3, v1
; GFX11-NEXT: v_rcp_f32_e32 v2, v2
; GFX11-NEXT: v_rcp_f32_e32 v3, v3
; GFX11-NEXT: s_waitcnt_depctr 0xfff
-; GFX11-NEXT: v_mul_f32_e32 v5, v4, v2
-; GFX11-NEXT: v_mul_f32_e32 v4, v4, v3
-; GFX11-NEXT: v_fma_mix_f32 v7, -v0, v4, 1.0 op_sel:[1,0,0] op_sel_hi:[1,0,1]
-; GFX11-NEXT: v_fmac_f32_e32 v4, v7, v3
-; GFX11-NEXT: v_fma_mix_f32 v6, -v0, v5, 1.0 op_sel_hi:[1,0,1]
-; GFX11-NEXT: v_fma_mix_f32 v7, -v0, v4, 1.0 op_sel:[1,0,0] op_sel_hi:[1,0,1]
-; GFX11-NEXT: v_fmac_f32_e32 v5, v6, v2
-; GFX11-NEXT: v_mul_f32_e32 v3, v7, v3
-; GFX11-NEXT: v_fma_mix_f32 v6, -v0, v5, 1.0 op_sel_hi:[1,0,1]
-; GFX11-NEXT: v_dual_mul_f32 v2, v6, v2 :: v_dual_and_b32 v3, 0xff800000, v3
-; GFX11-NEXT: v_dual_add_f32 v3, v3, v4 :: v_dual_and_b32 v2, 0xff800000, v2
-; GFX11-NEXT: v_cvt_f16_f32_e32 v3, v3
-; GFX11-NEXT: v_add_f32_e32 v2, v2, v5
-; GFX11-NEXT: v_div_fixup_f16 v1, v3, v1, 1.0
+; GFX11-NEXT: v_fma_mix_f32 v5, -v0, v2, v4 op_sel_hi:[1,0,0]
+; GFX11-NEXT: v_fma_mix_f32 v6, -v0, v3, v4 op_sel:[1,0,0] op_sel_hi:[1,0,0]
+; GFX11-NEXT: v_fma_f32 v5, v5, v2, v2
+; GFX11-NEXT: v_fma_f32 v6, v6, v3, v3
+; GFX11-NEXT: v_fma_mix_f32 v7, -v0, v5, v4 op_sel_hi:[1,0,0]
+; GFX11-NEXT: v_fma_mix_f32 v4, -v0, v6, v4 op_sel:[1,0,0] op_sel_hi:[1,0,0]
+; GFX11-NEXT: v_dual_mul_f32 v2, v7, v2 :: v_dual_mul_f32 v3, v4, v3
+; GFX11-NEXT: v_and_b32_e32 v2, 0xff800000, v2
+; GFX11-NEXT: v_dual_add_f32 v2, v2, v5 :: v_dual_and_b32 v3, 0xff800000, v3
+; GFX11-NEXT: v_add_f32_e32 v3, v3, v6
; GFX11-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GFX11-NEXT: v_cvt_f16_f32_e32 v3, v3
; GFX11-NEXT: v_div_fixup_f16 v0, v2, v0, 1.0
+; GFX11-NEXT: v_div_fixup_f16 v1, v3, v1, 1.0
; GFX11-NEXT: v_pack_b32_f16 v0, v0, v1
; GFX11-NEXT: s_setpc_b64 s[30:31]
%fdiv = fdiv <2 x half> <half 1.0, half 1.0>, %x
@@ -4033,40 +3911,38 @@ define amdgpu_ps i32 @s_fdiv_v2f16(i32 inreg %a.arg, i32 inreg %b.arg) {
define amdgpu_ps i16 @s_rcp_f16(i16 inreg %a.arg) {
; GFX6-IEEE-LABEL: s_rcp_f16:
; GFX6-IEEE: ; %bb.0:
-; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, 1.0
-; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, s0
-; GFX6-IEEE-NEXT: v_div_scale_f32 v2, s[0:1], v1, v1, v0
-; GFX6-IEEE-NEXT: v_rcp_f32_e32 v3, v2
-; GFX6-IEEE-NEXT: v_div_scale_f32 v4, vcc, v0, v1, v0
-; GFX6-IEEE-NEXT: v_fma_f32 v5, -v2, v3, 1.0
-; GFX6-IEEE-NEXT: v_fma_f32 v3, v5, v3, v3
-; GFX6-IEEE-NEXT: v_mul_f32_e32 v5, v4, v3
-; GFX6-IEEE-NEXT: v_fma_f32 v6, -v2, v5, v4
-; GFX6-IEEE-NEXT: v_fma_f32 v5, v6, v3, v5
-; GFX6-IEEE-NEXT: v_fma_f32 v2, -v2, v5, v4
-; GFX6-IEEE-NEXT: v_div_fmas_f32 v2, v2, v3, v5
-; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v2, v1, v0
+; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, s0
+; GFX6-IEEE-NEXT: v_div_scale_f32 v1, s[0:1], v0, v0, 1.0
+; GFX6-IEEE-NEXT: v_rcp_f32_e32 v2, v1
+; GFX6-IEEE-NEXT: v_div_scale_f32 v3, vcc, 1.0, v0, 1.0
+; GFX6-IEEE-NEXT: v_fma_f32 v4, -v1, v2, 1.0
+; GFX6-IEEE-NEXT: v_fma_f32 v2, v4, v2, v2
+; GFX6-IEEE-NEXT: v_mul_f32_e32 v4, v3, v2
+; GFX6-IEEE-NEXT: v_fma_f32 v5, -v1, v4, v3
+; GFX6-IEEE-NEXT: v_fma_f32 v4, v5, v2, v4
+; GFX6-IEEE-NEXT: v_fma_f32 v1, -v1, v4, v3
+; GFX6-IEEE-NEXT: v_div_fmas_f32 v1, v1, v2, v4
+; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v1, v0, 1.0
; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-IEEE-NEXT: v_readfirstlane_b32 s0, v0
; GFX6-IEEE-NEXT: ; return to shader part epilog
;
; GFX6-FLUSH-LABEL: s_rcp_f16:
; GFX6-FLUSH: ; %bb.0:
-; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, 1.0
-; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, s0
-; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[0:1], v1, v1, v0
-; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v3, v2
-; GFX6-FLUSH-NEXT: v_div_scale_f32 v4, vcc, v0, v1, v0
+; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, s0
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v1, s[0:1], v0, v0, 1.0
+; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v2, v1
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v3, vcc, 1.0, v0, 1.0
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
-; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v2, v3, 1.0
-; GFX6-FLUSH-NEXT: v_fma_f32 v3, v5, v3, v3
-; GFX6-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v3
-; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v5, v4
-; GFX6-FLUSH-NEXT: v_fma_f32 v5, v6, v3, v5
-; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v5, v4
+; GFX6-FLUSH-NEXT: v_fma_f32 v4, -v1, v2, 1.0
+; GFX6-FLUSH-NEXT: v_fma_f32 v2, v4, v2, v2
+; GFX6-FLUSH-NEXT: v_mul_f32_e32 v4, v3, v2
+; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v1, v4, v3
+; GFX6-FLUSH-NEXT: v_fma_f32 v4, v5, v2, v4
+; GFX6-FLUSH-NEXT: v_fma_f32 v1, -v1, v4, v3
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
-; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v5
-; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v2, v1, v0
+; GFX6-FLUSH-NEXT: v_div_fmas_f32 v1, v1, v2, v4
+; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v1, v0, 1.0
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-FLUSH-NEXT: v_readfirstlane_b32 s0, v0
@@ -4099,40 +3975,38 @@ define amdgpu_ps i16 @s_rcp_f16(i16 inreg %a.arg) {
define amdgpu_ps i16 @s_neg_rcp_f16(i16 inreg %a.arg) {
; GFX6-IEEE-LABEL: s_neg_rcp_f16:
; GFX6-IEEE: ; %bb.0:
-; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, -1.0
-; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, s0
-; GFX6-IEEE-NEXT: v_div_scale_f32 v2, s[0:1], v1, v1, v0
-; GFX6-IEEE-NEXT: v_rcp_f32_e32 v3, v2
-; GFX6-IEEE-NEXT: v_div_scale_f32 v4, vcc, v0, v1, v0
-; GFX6-IEEE-NEXT: v_fma_f32 v5, -v2, v3, 1.0
-; GFX6-IEEE-NEXT: v_fma_f32 v3, v5, v3, v3
-; GFX6-IEEE-NEXT: v_mul_f32_e32 v5, v4, v3
-; GFX6-IEEE-NEXT: v_fma_f32 v6, -v2, v5, v4
-; GFX6-IEEE-NEXT: v_fma_f32 v5, v6, v3, v5
-; GFX6-IEEE-NEXT: v_fma_f32 v2, -v2, v5, v4
-; GFX6-IEEE-NEXT: v_div_fmas_f32 v2, v2, v3, v5
-; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v2, v1, v0
+; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, s0
+; GFX6-IEEE-NEXT: v_div_scale_f32 v1, s[0:1], v0, v0, -1.0
+; GFX6-IEEE-NEXT: v_rcp_f32_e32 v2, v1
+; GFX6-IEEE-NEXT: v_div_scale_f32 v3, vcc, -1.0, v0, -1.0
+; GFX6-IEEE-NEXT: v_fma_f32 v4, -v1, v2, 1.0
+; GFX6-IEEE-NEXT: v_fma_f32 v2, v4, v2, v2
+; GFX6-IEEE-NEXT: v_mul_f32_e32 v4, v3, v2
+; GFX6-IEEE-NEXT: v_fma_f32 v5, -v1, v4, v3
+; GFX6-IEEE-NEXT: v_fma_f32 v4, v5, v2, v4
+; GFX6-IEEE-NEXT: v_fma_f32 v1, -v1, v4, v3
+; GFX6-IEEE-NEXT: v_div_fmas_f32 v1, v1, v2, v4
+; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v1, v0, -1.0
; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-IEEE-NEXT: v_readfirstlane_b32 s0, v0
; GFX6-IEEE-NEXT: ; return to shader part epilog
;
; GFX6-FLUSH-LABEL: s_neg_rcp_f16:
; GFX6-FLUSH: ; %bb.0:
-; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, -1.0
-; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, s0
-; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[0:1], v1, v1, v0
-; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v3, v2
-; GFX6-FLUSH-NEXT: v_div_scale_f32 v4, vcc, v0, v1, v0
+; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, s0
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v1, s[0:1], v0, v0, -1.0
+; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v2, v1
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v3, vcc, -1.0, v0, -1.0
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
-; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v2, v3, 1.0
-; GFX6-FLUSH-NEXT: v_fma_f32 v3, v5, v3, v3
-; GFX6-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v3
-; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v5, v4
-; GFX6-FLUSH-NEXT: v_fma_f32 v5, v6, v3, v5
-; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v5, v4
+; GFX6-FLUSH-NEXT: v_fma_f32 v4, -v1, v2, 1.0
+; GFX6-FLUSH-NEXT: v_fma_f32 v2, v4, v2, v2
+; GFX6-FLUSH-NEXT: v_mul_f32_e32 v4, v3, v2
+; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v1, v4, v3
+; GFX6-FLUSH-NEXT: v_fma_f32 v4, v5, v2, v4
+; GFX6-FLUSH-NEXT: v_fma_f32 v1, -v1, v4, v3
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
-; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v5
-; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v2, v1, v0
+; GFX6-FLUSH-NEXT: v_div_fmas_f32 v1, v1, v2, v4
+; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v1, v0, -1.0
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-FLUSH-NEXT: v_readfirstlane_b32 s0, v0
@@ -4166,21 +4040,20 @@ define amdgpu_ps i16 @s_rsq_f16(i16 inreg %a.arg) {
; GFX6-IEEE-LABEL: s_rsq_f16:
; GFX6-IEEE: ; %bb.0:
; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, s0
-; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, 1.0
; GFX6-IEEE-NEXT: v_sqrt_f32_e32 v0, v0
; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX6-IEEE-NEXT: v_div_scale_f32 v2, s[0:1], v0, v0, v1
-; GFX6-IEEE-NEXT: v_rcp_f32_e32 v3, v2
-; GFX6-IEEE-NEXT: v_div_scale_f32 v4, vcc, v1, v0, v1
-; GFX6-IEEE-NEXT: v_fma_f32 v5, -v2, v3, 1.0
-; GFX6-IEEE-NEXT: v_fma_f32 v3, v5, v3, v3
-; GFX6-IEEE-NEXT: v_mul_f32_e32 v5, v4, v3
-; GFX6-IEEE-NEXT: v_fma_f32 v6, -v2, v5, v4
-; GFX6-IEEE-NEXT: v_fma_f32 v5, v6, v3, v5
-; GFX6-IEEE-NEXT: v_fma_f32 v2, -v2, v5, v4
-; GFX6-IEEE-NEXT: v_div_fmas_f32 v2, v2, v3, v5
-; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v2, v0, v1
+; GFX6-IEEE-NEXT: v_div_scale_f32 v1, s[0:1], v0, v0, 1.0
+; GFX6-IEEE-NEXT: v_rcp_f32_e32 v2, v1
+; GFX6-IEEE-NEXT: v_div_scale_f32 v3, vcc, 1.0, v0, 1.0
+; GFX6-IEEE-NEXT: v_fma_f32 v4, -v1, v2, 1.0
+; GFX6-IEEE-NEXT: v_fma_f32 v2, v4, v2, v2
+; GFX6-IEEE-NEXT: v_mul_f32_e32 v4, v3, v2
+; GFX6-IEEE-NEXT: v_fma_f32 v5, -v1, v4, v3
+; GFX6-IEEE-NEXT: v_fma_f32 v4, v5, v2, v4
+; GFX6-IEEE-NEXT: v_fma_f32 v1, -v1, v4, v3
+; GFX6-IEEE-NEXT: v_div_fmas_f32 v1, v1, v2, v4
+; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v1, v0, 1.0
; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-IEEE-NEXT: v_readfirstlane_b32 s0, v0
; GFX6-IEEE-NEXT: ; return to shader part epilog
@@ -4188,24 +4061,23 @@ define amdgpu_ps i16 @s_rsq_f16(i16 inreg %a.arg) {
; GFX6-FLUSH-LABEL: s_rsq_f16:
; GFX6-FLUSH: ; %bb.0:
; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, s0
-; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, 1.0
; GFX6-FLUSH-NEXT: v_sqrt_f32_e32 v0, v0
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[0:1], v0, v0, v1
-; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v3, v2
-; GFX6-FLUSH-NEXT: v_div_scale_f32 v4, vcc, v1, v0, v1
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v1, s[0:1], v0, v0, 1.0
+; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v2, v1
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v3, vcc, 1.0, v0, 1.0
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
-; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v2, v3, 1.0
-; GFX6-FLUSH-NEXT: v_fma_f32 v3, v5, v3, v3
-; GFX6-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v3
-; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v5, v4
-; GFX6-FLUSH-NEXT: v_fma_f32 v5, v6, v3, v5
-; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v5, v4
+; GFX6-FLUSH-NEXT: v_fma_f32 v4, -v1, v2, 1.0
+; GFX6-FLUSH-NEXT: v_fma_f32 v2, v4, v2, v2
+; GFX6-FLUSH-NEXT: v_mul_f32_e32 v4, v3, v2
+; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v1, v4, v3
+; GFX6-FLUSH-NEXT: v_fma_f32 v4, v5, v2, v4
+; GFX6-FLUSH-NEXT: v_fma_f32 v1, -v1, v4, v3
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
-; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v5
-; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v2, v0, v1
+; GFX6-FLUSH-NEXT: v_div_fmas_f32 v1, v1, v2, v4
+; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v1, v0, 1.0
; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-FLUSH-NEXT: v_readfirstlane_b32 s0, v0
; GFX6-FLUSH-NEXT: ; return to shader part epilog
@@ -4241,36 +4113,35 @@ define amdgpu_ps i32 @s_rsq_v2f16(i32 inreg %a.arg) {
; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, s0
; GFX6-IEEE-NEXT: s_lshr_b32 s0, s0, 16
; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, s0
-; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v2, -1.0
; GFX6-IEEE-NEXT: v_sqrt_f32_e32 v0, v0
; GFX6-IEEE-NEXT: v_sqrt_f32_e32 v1, v1
; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, v1
-; GFX6-IEEE-NEXT: v_div_scale_f32 v3, s[0:1], v0, v0, v2
-; GFX6-IEEE-NEXT: v_rcp_f32_e32 v5, v3
-; GFX6-IEEE-NEXT: v_div_scale_f32 v4, vcc, v2, v0, v2
-; GFX6-IEEE-NEXT: v_div_scale_f32 v6, s[0:1], v1, v1, v2
-; GFX6-IEEE-NEXT: v_fma_f32 v9, -v3, v5, 1.0
-; GFX6-IEEE-NEXT: v_fma_f32 v5, v9, v5, v5
-; GFX6-IEEE-NEXT: v_mul_f32_e32 v9, v4, v5
-; GFX6-IEEE-NEXT: v_rcp_f32_e32 v8, v6
-; GFX6-IEEE-NEXT: v_fma_f32 v10, -v3, v9, v4
-; GFX6-IEEE-NEXT: v_fma_f32 v9, v10, v5, v9
-; GFX6-IEEE-NEXT: v_fma_f32 v3, -v3, v9, v4
-; GFX6-IEEE-NEXT: v_div_fmas_f32 v3, v3, v5, v9
-; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v3, v0, v2
-; GFX6-IEEE-NEXT: v_fma_f32 v3, -v6, v8, 1.0
-; GFX6-IEEE-NEXT: v_div_scale_f32 v7, s[0:1], v2, v1, v2
-; GFX6-IEEE-NEXT: v_fma_f32 v3, v3, v8, v8
-; GFX6-IEEE-NEXT: v_mul_f32_e32 v4, v7, v3
-; GFX6-IEEE-NEXT: v_fma_f32 v5, -v6, v4, v7
-; GFX6-IEEE-NEXT: v_fma_f32 v4, v5, v3, v4
-; GFX6-IEEE-NEXT: v_fma_f32 v5, -v6, v4, v7
+; GFX6-IEEE-NEXT: v_div_scale_f32 v2, s[0:1], v0, v0, -1.0
+; GFX6-IEEE-NEXT: v_rcp_f32_e32 v4, v2
+; GFX6-IEEE-NEXT: v_div_scale_f32 v3, vcc, -1.0, v0, -1.0
+; GFX6-IEEE-NEXT: v_div_scale_f32 v5, s[0:1], v1, v1, -1.0
+; GFX6-IEEE-NEXT: v_fma_f32 v8, -v2, v4, 1.0
+; GFX6-IEEE-NEXT: v_fma_f32 v4, v8, v4, v4
+; GFX6-IEEE-NEXT: v_rcp_f32_e32 v6, v5
+; GFX6-IEEE-NEXT: v_mul_f32_e32 v9, v3, v4
+; GFX6-IEEE-NEXT: v_fma_f32 v10, -v2, v9, v3
+; GFX6-IEEE-NEXT: v_fma_f32 v9, v10, v4, v9
+; GFX6-IEEE-NEXT: v_fma_f32 v2, -v2, v9, v3
+; GFX6-IEEE-NEXT: v_fma_f32 v8, -v5, v6, 1.0
+; GFX6-IEEE-NEXT: v_div_fmas_f32 v2, v2, v4, v9
+; GFX6-IEEE-NEXT: v_div_scale_f32 v7, s[0:1], -1.0, v1, -1.0
+; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v2, v0, -1.0
+; GFX6-IEEE-NEXT: v_fma_f32 v2, v8, v6, v6
+; GFX6-IEEE-NEXT: v_mul_f32_e32 v3, v7, v2
+; GFX6-IEEE-NEXT: v_fma_f32 v4, -v5, v3, v7
+; GFX6-IEEE-NEXT: v_fma_f32 v3, v4, v2, v3
+; GFX6-IEEE-NEXT: v_fma_f32 v4, -v5, v3, v7
; GFX6-IEEE-NEXT: s_mov_b64 vcc, s[0:1]
-; GFX6-IEEE-NEXT: v_div_fmas_f32 v3, v5, v3, v4
-; GFX6-IEEE-NEXT: v_div_fixup_f32 v1, v3, v1, v2
+; GFX6-IEEE-NEXT: v_div_fmas_f32 v2, v4, v2, v3
+; GFX6-IEEE-NEXT: v_div_fixup_f32 v1, v2, v1, -1.0
; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-IEEE-NEXT: v_lshlrev_b32_e32 v1, 16, v1
@@ -4283,42 +4154,40 @@ define amdgpu_ps i32 @s_rsq_v2f16(i32 inreg %a.arg) {
; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, s0
; GFX6-FLUSH-NEXT: s_lshr_b32 s0, s0, 16
; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, s0
-; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v2, -1.0
; GFX6-FLUSH-NEXT: v_sqrt_f32_e32 v0, v0
; GFX6-FLUSH-NEXT: v_sqrt_f32_e32 v1, v1
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX6-FLUSH-NEXT: v_div_scale_f32 v3, s[0:1], v0, v0, v2
-; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v4, v3
-; GFX6-FLUSH-NEXT: v_div_scale_f32 v5, vcc, v2, v0, v2
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[0:1], v0, v0, -1.0
+; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v3, v2
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v4, vcc, -1.0, v0, -1.0
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
-; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v3, v4, 1.0
-; GFX6-FLUSH-NEXT: v_fma_f32 v4, v6, v4, v4
-; GFX6-FLUSH-NEXT: v_mul_f32_e32 v6, v5, v4
-; GFX6-FLUSH-NEXT: v_fma_f32 v7, -v3, v6, v5
-; GFX6-FLUSH-NEXT: v_fma_f32 v6, v7, v4, v6
-; GFX6-FLUSH-NEXT: v_fma_f32 v3, -v3, v6, v5
-; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
-; GFX6-FLUSH-NEXT: v_div_fmas_f32 v3, v3, v4, v6
-; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v4, -1.0
+; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v2, v3, 1.0
+; GFX6-FLUSH-NEXT: v_fma_f32 v3, v5, v3, v3
+; GFX6-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v3
+; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v5, v4
+; GFX6-FLUSH-NEXT: v_fma_f32 v5, v6, v3, v5
+; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v5, v4
+; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, v1
-; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v3, v0, v2
+; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v5
+; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v2, v0, -1.0
; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[0:1], v1, v1, v4
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[0:1], v1, v1, -1.0
; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v3, v2
-; GFX6-FLUSH-NEXT: v_div_scale_f32 v5, vcc, v4, v1, v4
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v4, vcc, -1.0, v1, -1.0
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
-; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v3, 1.0
-; GFX6-FLUSH-NEXT: v_fma_f32 v3, v6, v3, v3
-; GFX6-FLUSH-NEXT: v_mul_f32_e32 v6, v5, v3
-; GFX6-FLUSH-NEXT: v_fma_f32 v7, -v2, v6, v5
-; GFX6-FLUSH-NEXT: v_fma_f32 v6, v7, v3, v6
-; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v6, v5
+; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v2, v3, 1.0
+; GFX6-FLUSH-NEXT: v_fma_f32 v3, v5, v3, v3
+; GFX6-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v3
+; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v5, v4
+; GFX6-FLUSH-NEXT: v_fma_f32 v5, v6, v3, v5
+; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v5, v4
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
-; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v6
-; GFX6-FLUSH-NEXT: v_div_fixup_f32 v1, v2, v1, v4
+; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v5
+; GFX6-FLUSH-NEXT: v_div_fixup_f32 v1, v2, v1, -1.0
; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX6-FLUSH-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX6-FLUSH-NEXT: v_or_b32_e32 v0, v0, v1
@@ -4330,31 +4199,28 @@ define amdgpu_ps i32 @s_rsq_v2f16(i32 inreg %a.arg) {
; GFX8-IEEE-NEXT: v_sqrt_f16_e32 v0, s0
; GFX8-IEEE-NEXT: s_lshr_b32 s0, s0, 16
; GFX8-IEEE-NEXT: v_sqrt_f16_e32 v1, s0
-; GFX8-IEEE-NEXT: v_cvt_f32_f16_e32 v4, -1.0
; GFX8-IEEE-NEXT: v_cvt_f32_f16_e32 v2, v0
; GFX8-IEEE-NEXT: v_cvt_f32_f16_e32 v3, v1
-; GFX8-IEEE-NEXT: v_rcp_f32_e32 v5, v2
-; GFX8-IEEE-NEXT: v_rcp_f32_e32 v6, v3
-; GFX8-IEEE-NEXT: v_mul_f32_e32 v7, v4, v5
-; GFX8-IEEE-NEXT: v_mul_f32_e64 v8, -v2, v7
-; GFX8-IEEE-NEXT: v_add_f32_e32 v8, v8, v4
-; GFX8-IEEE-NEXT: v_mul_f32_e32 v9, v4, v6
-; GFX8-IEEE-NEXT: v_mul_f32_e32 v8, v8, v5
-; GFX8-IEEE-NEXT: v_mul_f32_e64 v10, -v3, v9
-; GFX8-IEEE-NEXT: v_add_f32_e32 v7, v8, v7
-; GFX8-IEEE-NEXT: v_add_f32_e32 v10, v10, v4
-; GFX8-IEEE-NEXT: v_mul_f32_e64 v2, -v2, v7
-; GFX8-IEEE-NEXT: v_mul_f32_e32 v8, v10, v6
-; GFX8-IEEE-NEXT: v_add_f32_e32 v2, v2, v4
-; GFX8-IEEE-NEXT: v_mul_f32_e32 v2, v2, v5
-; GFX8-IEEE-NEXT: v_add_f32_e32 v5, v8, v9
-; GFX8-IEEE-NEXT: v_mul_f32_e64 v3, -v3, v5
-; GFX8-IEEE-NEXT: v_add_f32_e32 v3, v3, v4
-; GFX8-IEEE-NEXT: v_mul_f32_e32 v3, v3, v6
+; GFX8-IEEE-NEXT: v_rcp_f32_e32 v4, v2
+; GFX8-IEEE-NEXT: v_rcp_f32_e32 v5, v3
+; GFX8-IEEE-NEXT: v_mul_f32_e32 v6, v2, v4
+; GFX8-IEEE-NEXT: v_add_f32_e32 v6, -1.0, v6
+; GFX8-IEEE-NEXT: v_mul_f32_e32 v7, v3, v5
+; GFX8-IEEE-NEXT: v_add_f32_e32 v7, -1.0, v7
+; GFX8-IEEE-NEXT: v_mul_f32_e32 v7, v7, v5
+; GFX8-IEEE-NEXT: v_mul_f32_e32 v6, v6, v4
+; GFX8-IEEE-NEXT: v_sub_f32_e32 v7, v7, v5
+; GFX8-IEEE-NEXT: v_sub_f32_e32 v6, v6, v4
+; GFX8-IEEE-NEXT: v_mul_f32_e64 v3, -v3, v7
+; GFX8-IEEE-NEXT: v_mul_f32_e64 v2, -v2, v6
+; GFX8-IEEE-NEXT: v_add_f32_e32 v3, -1.0, v3
+; GFX8-IEEE-NEXT: v_add_f32_e32 v2, -1.0, v2
+; GFX8-IEEE-NEXT: v_mul_f32_e32 v3, v3, v5
+; GFX8-IEEE-NEXT: v_mul_f32_e32 v2, v2, v4
; GFX8-IEEE-NEXT: v_and_b32_e32 v3, 0xff800000, v3
; GFX8-IEEE-NEXT: v_and_b32_e32 v2, 0xff800000, v2
-; GFX8-IEEE-NEXT: v_add_f32_e32 v3, v3, v5
-; GFX8-IEEE-NEXT: v_add_f32_e32 v2, v2, v7
+; GFX8-IEEE-NEXT: v_add_f32_e32 v3, v3, v7
+; GFX8-IEEE-NEXT: v_add_f32_e32 v2, v2, v6
; GFX8-IEEE-NEXT: v_cvt_f16_f32_e32 v3, v3
; GFX8-IEEE-NEXT: v_cvt_f16_f32_e32 v2, v2
; GFX8-IEEE-NEXT: v_div_fixup_f16 v1, v3, v1, -1.0
@@ -4369,25 +4235,22 @@ define amdgpu_ps i32 @s_rsq_v2f16(i32 inreg %a.arg) {
; GFX8-FLUSH-NEXT: v_sqrt_f16_e32 v0, s0
; GFX8-FLUSH-NEXT: s_lshr_b32 s0, s0, 16
; GFX8-FLUSH-NEXT: v_sqrt_f16_e32 v1, s0
-; GFX8-FLUSH-NEXT: v_cvt_f32_f16_e32 v4, -1.0
; GFX8-FLUSH-NEXT: v_cvt_f32_f16_e32 v2, v0
; GFX8-FLUSH-NEXT: v_cvt_f32_f16_e32 v3, v1
-; GFX8-FLUSH-NEXT: v_rcp_f32_e32 v5, v2
-; GFX8-FLUSH-NEXT: v_rcp_f32_e32 v6, v3
-; GFX8-FLUSH-NEXT: v_mul_f32_e32 v7, v4, v5
-; GFX8-FLUSH-NEXT: v_mad_f32 v8, -v2, v7, v4
-; GFX8-FLUSH-NEXT: v_mul_f32_e32 v9, v4, v6
-; GFX8-FLUSH-NEXT: v_mac_f32_e32 v7, v8, v5
-; GFX8-FLUSH-NEXT: v_mad_f32 v8, -v3, v9, v4
-; GFX8-FLUSH-NEXT: v_mac_f32_e32 v9, v8, v6
-; GFX8-FLUSH-NEXT: v_mad_f32 v3, -v3, v9, v4
-; GFX8-FLUSH-NEXT: v_mad_f32 v2, -v2, v7, v4
-; GFX8-FLUSH-NEXT: v_mul_f32_e32 v3, v3, v6
-; GFX8-FLUSH-NEXT: v_mul_f32_e32 v2, v2, v5
+; GFX8-FLUSH-NEXT: v_rcp_f32_e32 v4, v2
+; GFX8-FLUSH-NEXT: v_rcp_f32_e32 v5, v3
+; GFX8-FLUSH-NEXT: v_mad_f32 v6, v2, v4, -1.0
+; GFX8-FLUSH-NEXT: v_mad_f32 v6, v6, v4, -v4
+; GFX8-FLUSH-NEXT: v_mad_f32 v7, v3, v5, -1.0
+; GFX8-FLUSH-NEXT: v_mad_f32 v7, v7, v5, -v5
+; GFX8-FLUSH-NEXT: v_mad_f32 v3, -v3, v7, -1.0
+; GFX8-FLUSH-NEXT: v_mad_f32 v2, -v2, v6, -1.0
+; GFX8-FLUSH-NEXT: v_mul_f32_e32 v3, v3, v5
+; GFX8-FLUSH-NEXT: v_mul_f32_e32 v2, v2, v4
; GFX8-FLUSH-NEXT: v_and_b32_e32 v3, 0xff800000, v3
; GFX8-FLUSH-NEXT: v_and_b32_e32 v2, 0xff800000, v2
-; GFX8-FLUSH-NEXT: v_add_f32_e32 v3, v3, v9
-; GFX8-FLUSH-NEXT: v_add_f32_e32 v2, v2, v7
+; GFX8-FLUSH-NEXT: v_add_f32_e32 v3, v3, v7
+; GFX8-FLUSH-NEXT: v_add_f32_e32 v2, v2, v6
; GFX8-FLUSH-NEXT: v_cvt_f16_f32_e32 v3, v3
; GFX8-FLUSH-NEXT: v_cvt_f16_f32_e32 v2, v2
; GFX8-FLUSH-NEXT: v_div_fixup_f16 v1, v3, v1, -1.0
@@ -4402,25 +4265,22 @@ define amdgpu_ps i32 @s_rsq_v2f16(i32 inreg %a.arg) {
; GFX9-IEEE-NEXT: v_sqrt_f16_e32 v0, s0
; GFX9-IEEE-NEXT: s_lshr_b32 s0, s0, 16
; GFX9-IEEE-NEXT: v_sqrt_f16_e32 v1, s0
-; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v4, -1.0
; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v2, v0
; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v3, v1
-; GFX9-IEEE-NEXT: v_rcp_f32_e32 v5, v2
-; GFX9-IEEE-NEXT: v_rcp_f32_e32 v6, v3
-; GFX9-IEEE-NEXT: v_mul_f32_e32 v7, v4, v5
-; GFX9-IEEE-NEXT: v_fma_f32 v8, -v2, v7, v4
-; GFX9-IEEE-NEXT: v_mul_f32_e32 v9, v4, v6
-; GFX9-IEEE-NEXT: v_fma_f32 v7, v8, v5, v7
-; GFX9-IEEE-NEXT: v_fma_f32 v8, -v3, v9, v4
-; GFX9-IEEE-NEXT: v_fma_f32 v8, v8, v6, v9
-; GFX9-IEEE-NEXT: v_fma_f32 v2, -v2, v7, v4
-; GFX9-IEEE-NEXT: v_fma_f32 v3, -v3, v8, v4
-; GFX9-IEEE-NEXT: v_mul_f32_e32 v2, v2, v5
-; GFX9-IEEE-NEXT: v_mul_f32_e32 v3, v3, v6
+; GFX9-IEEE-NEXT: v_rcp_f32_e32 v4, v2
+; GFX9-IEEE-NEXT: v_rcp_f32_e32 v5, v3
+; GFX9-IEEE-NEXT: v_fma_f32 v6, v2, v4, -1.0
+; GFX9-IEEE-NEXT: v_fma_f32 v6, v6, v4, -v4
+; GFX9-IEEE-NEXT: v_fma_f32 v7, v3, v5, -1.0
+; GFX9-IEEE-NEXT: v_fma_f32 v7, v7, v5, -v5
+; GFX9-IEEE-NEXT: v_fma_f32 v2, -v2, v6, -1.0
+; GFX9-IEEE-NEXT: v_fma_f32 v3, -v3, v7, -1.0
+; GFX9-IEEE-NEXT: v_mul_f32_e32 v2, v2, v4
+; GFX9-IEEE-NEXT: v_mul_f32_e32 v3, v3, v5
; GFX9-IEEE-NEXT: v_and_b32_e32 v2, 0xff800000, v2
; GFX9-IEEE-NEXT: v_and_b32_e32 v3, 0xff800000, v3
-; GFX9-IEEE-NEXT: v_add_f32_e32 v2, v2, v7
-; GFX9-IEEE-NEXT: v_add_f32_e32 v3, v3, v8
+; GFX9-IEEE-NEXT: v_add_f32_e32 v2, v2, v6
+; GFX9-IEEE-NEXT: v_add_f32_e32 v3, v3, v7
; GFX9-IEEE-NEXT: v_cvt_f16_f32_e32 v2, v2
; GFX9-IEEE-NEXT: v_cvt_f16_f32_e32 v3, v3
; GFX9-IEEE-NEXT: v_div_fixup_f16 v0, v2, v0, -1.0
@@ -4434,25 +4294,23 @@ define amdgpu_ps i32 @s_rsq_v2f16(i32 inreg %a.arg) {
; GFX9-FLUSH-NEXT: v_sqrt_f16_e32 v0, s0
; GFX9-FLUSH-NEXT: s_lshr_b32 s0, s0, 16
; GFX9-FLUSH-NEXT: v_sqrt_f16_e32 v1, s0
-; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v4, -1.0
+; GFX9-FLUSH-NEXT: v_mov_b32_e32 v4, -1.0
; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v2, v0
; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v3, v1
; GFX9-FLUSH-NEXT: v_rcp_f32_e32 v2, v2
; GFX9-FLUSH-NEXT: v_rcp_f32_e32 v3, v3
-; GFX9-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v2
-; GFX9-FLUSH-NEXT: v_mad_mix_f32 v6, -v0, v5, -1.0 op_sel_hi:[1,0,1]
-; GFX9-FLUSH-NEXT: v_mul_f32_e32 v4, v4, v3
-; GFX9-FLUSH-NEXT: v_mac_f32_e32 v5, v6, v2
-; GFX9-FLUSH-NEXT: v_mad_mix_f32 v6, -v1, v4, -1.0 op_sel_hi:[1,0,1]
-; GFX9-FLUSH-NEXT: v_mac_f32_e32 v4, v6, v3
-; GFX9-FLUSH-NEXT: v_mad_mix_f32 v7, -v0, v5, -1.0 op_sel_hi:[1,0,1]
-; GFX9-FLUSH-NEXT: v_mad_mix_f32 v6, -v1, v4, -1.0 op_sel_hi:[1,0,1]
+; GFX9-FLUSH-NEXT: v_mad_mix_f32 v5, v0, v2, v4 op_sel_hi:[1,0,0]
+; GFX9-FLUSH-NEXT: v_mad_f32 v5, v5, v2, -v2
+; GFX9-FLUSH-NEXT: v_mad_mix_f32 v6, v1, v3, v4 op_sel_hi:[1,0,0]
+; GFX9-FLUSH-NEXT: v_mad_f32 v6, v6, v3, -v3
+; GFX9-FLUSH-NEXT: v_mad_mix_f32 v7, -v0, v5, v4 op_sel_hi:[1,0,0]
+; GFX9-FLUSH-NEXT: v_mad_mix_f32 v4, -v1, v6, v4 op_sel_hi:[1,0,0]
; GFX9-FLUSH-NEXT: v_mul_f32_e32 v2, v7, v2
-; GFX9-FLUSH-NEXT: v_mul_f32_e32 v3, v6, v3
+; GFX9-FLUSH-NEXT: v_mul_f32_e32 v3, v4, v3
; GFX9-FLUSH-NEXT: v_and_b32_e32 v2, 0xff800000, v2
; GFX9-FLUSH-NEXT: v_and_b32_e32 v3, 0xff800000, v3
; GFX9-FLUSH-NEXT: v_add_f32_e32 v2, v2, v5
-; GFX9-FLUSH-NEXT: v_add_f32_e32 v3, v3, v4
+; GFX9-FLUSH-NEXT: v_add_f32_e32 v3, v3, v6
; GFX9-FLUSH-NEXT: v_cvt_f16_f32_e32 v2, v2
; GFX9-FLUSH-NEXT: v_cvt_f16_f32_e32 v3, v3
; GFX9-FLUSH-NEXT: v_div_fixup_f16 v0, v2, v0, -1.0
@@ -4466,25 +4324,23 @@ define amdgpu_ps i32 @s_rsq_v2f16(i32 inreg %a.arg) {
; GFX10-IEEE-NEXT: s_lshr_b32 s1, s0, 16
; GFX10-IEEE-NEXT: v_sqrt_f16_e32 v0, s0
; GFX10-IEEE-NEXT: v_sqrt_f16_e32 v1, s1
-; GFX10-IEEE-NEXT: v_cvt_f32_f16_e32 v4, -1.0
+; GFX10-IEEE-NEXT: v_mov_b32_e32 v4, -1.0
; GFX10-IEEE-NEXT: v_cvt_f32_f16_e32 v2, v0
; GFX10-IEEE-NEXT: v_cvt_f32_f16_e32 v3, v1
; GFX10-IEEE-NEXT: v_rcp_f32_e32 v2, v2
; GFX10-IEEE-NEXT: v_rcp_f32_e32 v3, v3
-; GFX10-IEEE-NEXT: v_mul_f32_e32 v5, v4, v2
-; GFX10-IEEE-NEXT: v_mul_f32_e32 v4, v4, v3
-; GFX10-IEEE-NEXT: v_fma_mix_f32 v6, -v0, v5, -1.0 op_sel_hi:[1,0,1]
-; GFX10-IEEE-NEXT: v_fma_mix_f32 v7, -v1, v4, -1.0 op_sel_hi:[1,0,1]
-; GFX10-IEEE-NEXT: v_fmac_f32_e32 v5, v6, v2
-; GFX10-IEEE-NEXT: v_fmac_f32_e32 v4, v7, v3
-; GFX10-IEEE-NEXT: v_fma_mix_f32 v6, -v0, v5, -1.0 op_sel_hi:[1,0,1]
-; GFX10-IEEE-NEXT: v_fma_mix_f32 v7, -v1, v4, -1.0 op_sel_hi:[1,0,1]
-; GFX10-IEEE-NEXT: v_mul_f32_e32 v2, v6, v2
-; GFX10-IEEE-NEXT: v_mul_f32_e32 v3, v7, v3
+; GFX10-IEEE-NEXT: v_fma_mix_f32 v5, v0, v2, v4 op_sel_hi:[1,0,0]
+; GFX10-IEEE-NEXT: v_fma_mix_f32 v6, v1, v3, v4 op_sel_hi:[1,0,0]
+; GFX10-IEEE-NEXT: v_fma_f32 v5, v5, v2, -v2
+; GFX10-IEEE-NEXT: v_fma_f32 v6, v6, v3, -v3
+; GFX10-IEEE-NEXT: v_fma_mix_f32 v7, -v0, v5, v4 op_sel_hi:[1,0,0]
+; GFX10-IEEE-NEXT: v_fma_mix_f32 v4, -v1, v6, v4 op_sel_hi:[1,0,0]
+; GFX10-IEEE-NEXT: v_mul_f32_e32 v2, v7, v2
+; GFX10-IEEE-NEXT: v_mul_f32_e32 v3, v4, v3
; GFX10-IEEE-NEXT: v_and_b32_e32 v2, 0xff800000, v2
; GFX10-IEEE-NEXT: v_and_b32_e32 v3, 0xff800000, v3
; GFX10-IEEE-NEXT: v_add_f32_e32 v2, v2, v5
-; GFX10-IEEE-NEXT: v_add_f32_e32 v3, v3, v4
+; GFX10-IEEE-NEXT: v_add_f32_e32 v3, v3, v6
; GFX10-IEEE-NEXT: v_cvt_f16_f32_e32 v2, v2
; GFX10-IEEE-NEXT: v_cvt_f16_f32_e32 v3, v3
; GFX10-IEEE-NEXT: v_div_fixup_f16 v0, v2, v0, -1.0
@@ -4498,25 +4354,22 @@ define amdgpu_ps i32 @s_rsq_v2f16(i32 inreg %a.arg) {
; GFX10-FLUSH-NEXT: s_lshr_b32 s1, s0, 16
; GFX10-FLUSH-NEXT: v_sqrt_f16_e32 v0, s0
; GFX10-FLUSH-NEXT: v_sqrt_f16_e32 v1, s1
-; GFX10-FLUSH-NEXT: v_cvt_f32_f16_e32 v6, -1.0
; GFX10-FLUSH-NEXT: v_cvt_f32_f16_e32 v2, v0
; GFX10-FLUSH-NEXT: v_cvt_f32_f16_e32 v3, v1
; GFX10-FLUSH-NEXT: v_rcp_f32_e32 v4, v2
; GFX10-FLUSH-NEXT: v_rcp_f32_e32 v5, v3
-; GFX10-FLUSH-NEXT: v_mul_f32_e32 v7, v6, v4
-; GFX10-FLUSH-NEXT: v_mul_f32_e32 v8, v6, v5
-; GFX10-FLUSH-NEXT: v_mad_f32 v9, -v2, v7, v6
-; GFX10-FLUSH-NEXT: v_mad_f32 v10, -v3, v8, v6
-; GFX10-FLUSH-NEXT: v_mac_f32_e32 v7, v9, v4
-; GFX10-FLUSH-NEXT: v_mac_f32_e32 v8, v10, v5
-; GFX10-FLUSH-NEXT: v_mad_f32 v2, -v2, v7, v6
-; GFX10-FLUSH-NEXT: v_mad_f32 v3, -v3, v8, v6
+; GFX10-FLUSH-NEXT: v_mad_f32 v6, v2, v4, -1.0
+; GFX10-FLUSH-NEXT: v_mad_f32 v7, v3, v5, -1.0
+; GFX10-FLUSH-NEXT: v_mad_f32 v6, v6, v4, -v4
+; GFX10-FLUSH-NEXT: v_mad_f32 v7, v7, v5, -v5
+; GFX10-FLUSH-NEXT: v_mad_f32 v2, -v2, v6, -1.0
+; GFX10-FLUSH-NEXT: v_mad_f32 v3, -v3, v7, -1.0
; GFX10-FLUSH-NEXT: v_mul_f32_e32 v2, v2, v4
; GFX10-FLUSH-NEXT: v_mul_f32_e32 v3, v3, v5
; GFX10-FLUSH-NEXT: v_and_b32_e32 v2, 0xff800000, v2
; GFX10-FLUSH-NEXT: v_and_b32_e32 v3, 0xff800000, v3
-; GFX10-FLUSH-NEXT: v_add_f32_e32 v2, v2, v7
-; GFX10-FLUSH-NEXT: v_add_f32_e32 v3, v3, v8
+; GFX10-FLUSH-NEXT: v_add_f32_e32 v2, v2, v6
+; GFX10-FLUSH-NEXT: v_add_f32_e32 v3, v3, v7
; GFX10-FLUSH-NEXT: v_cvt_f16_f32_e32 v2, v2
; GFX10-FLUSH-NEXT: v_cvt_f16_f32_e32 v3, v3
; GFX10-FLUSH-NEXT: v_div_fixup_f16 v0, v2, v0, -1.0
@@ -4530,29 +4383,27 @@ define amdgpu_ps i32 @s_rsq_v2f16(i32 inreg %a.arg) {
; GFX11-NEXT: s_lshr_b32 s1, s0, 16
; GFX11-NEXT: v_sqrt_f16_e32 v0, s0
; GFX11-NEXT: v_sqrt_f16_e32 v1, s1
-; GFX11-NEXT: v_cvt_f32_f16_e32 v4, -1.0
+; GFX11-NEXT: v_mov_b32_e32 v4, -1.0
; GFX11-NEXT: s_waitcnt_depctr 0xfff
; GFX11-NEXT: v_cvt_f32_f16_e32 v2, v0
; GFX11-NEXT: v_cvt_f32_f16_e32 v3, v1
; GFX11-NEXT: v_rcp_f32_e32 v2, v2
; GFX11-NEXT: v_rcp_f32_e32 v3, v3
; GFX11-NEXT: s_waitcnt_depctr 0xfff
-; GFX11-NEXT: v_mul_f32_e32 v5, v4, v2
-; GFX11-NEXT: v_mul_f32_e32 v4, v4, v3
-; GFX11-NEXT: v_fma_mix_f32 v7, -v1, v4, -1.0 op_sel_hi:[1,0,1]
-; GFX11-NEXT: v_fmac_f32_e32 v4, v7, v3
-; GFX11-NEXT: v_fma_mix_f32 v6, -v0, v5, -1.0 op_sel_hi:[1,0,1]
-; GFX11-NEXT: v_fma_mix_f32 v7, -v1, v4, -1.0 op_sel_hi:[1,0,1]
-; GFX11-NEXT: v_fmac_f32_e32 v5, v6, v2
-; GFX11-NEXT: v_mul_f32_e32 v3, v7, v3
-; GFX11-NEXT: v_fma_mix_f32 v6, -v0, v5, -1.0 op_sel_hi:[1,0,1]
-; GFX11-NEXT: v_dual_mul_f32 v2, v6, v2 :: v_dual_and_b32 v3, 0xff800000, v3
-; GFX11-NEXT: v_dual_add_f32 v3, v3, v4 :: v_dual_and_b32 v2, 0xff800000, v2
-; GFX11-NEXT: v_cvt_f16_f32_e32 v3, v3
-; GFX11-NEXT: v_add_f32_e32 v2, v2, v5
-; GFX11-NEXT: v_div_fixup_f16 v1, v3, v1, -1.0
+; GFX11-NEXT: v_fma_mix_f32 v5, v0, v2, v4 op_sel_hi:[1,0,0]
+; GFX11-NEXT: v_fma_mix_f32 v6, v1, v3, v4 op_sel_hi:[1,0,0]
+; GFX11-NEXT: v_fma_f32 v5, v5, v2, -v2
+; GFX11-NEXT: v_fma_f32 v6, v6, v3, -v3
+; GFX11-NEXT: v_fma_mix_f32 v7, -v0, v5, v4 op_sel_hi:[1,0,0]
+; GFX11-NEXT: v_fma_mix_f32 v4, -v1, v6, v4 op_sel_hi:[1,0,0]
+; GFX11-NEXT: v_dual_mul_f32 v2, v7, v2 :: v_dual_mul_f32 v3, v4, v3
+; GFX11-NEXT: v_and_b32_e32 v2, 0xff800000, v2
+; GFX11-NEXT: v_dual_add_f32 v2, v2, v5 :: v_dual_and_b32 v3, 0xff800000, v3
+; GFX11-NEXT: v_add_f32_e32 v3, v3, v6
; GFX11-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GFX11-NEXT: v_cvt_f16_f32_e32 v3, v3
; GFX11-NEXT: v_div_fixup_f16 v0, v2, v0, -1.0
+; GFX11-NEXT: v_div_fixup_f16 v1, v3, v1, -1.0
; GFX11-NEXT: v_pack_b32_f16 v0, v0, v1
; GFX11-NEXT: v_readfirstlane_b32 s0, v0
; GFX11-NEXT: ; return to shader part epilog
@@ -4568,21 +4419,20 @@ define half @v_rsq_f16(half %a) {
; GFX6-IEEE: ; %bb.0:
; GFX6-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, 1.0
; GFX6-IEEE-NEXT: v_sqrt_f32_e32 v0, v0
; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX6-IEEE-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, v1
-; GFX6-IEEE-NEXT: v_rcp_f32_e32 v3, v2
-; GFX6-IEEE-NEXT: v_div_scale_f32 v4, vcc, v1, v0, v1
-; GFX6-IEEE-NEXT: v_fma_f32 v5, -v2, v3, 1.0
-; GFX6-IEEE-NEXT: v_fma_f32 v3, v5, v3, v3
-; GFX6-IEEE-NEXT: v_mul_f32_e32 v5, v4, v3
-; GFX6-IEEE-NEXT: v_fma_f32 v6, -v2, v5, v4
-; GFX6-IEEE-NEXT: v_fma_f32 v5, v6, v3, v5
-; GFX6-IEEE-NEXT: v_fma_f32 v2, -v2, v5, v4
-; GFX6-IEEE-NEXT: v_div_fmas_f32 v2, v2, v3, v5
-; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v2, v0, v1
+; GFX6-IEEE-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, 1.0
+; GFX6-IEEE-NEXT: v_rcp_f32_e32 v2, v1
+; GFX6-IEEE-NEXT: v_div_scale_f32 v3, vcc, 1.0, v0, 1.0
+; GFX6-IEEE-NEXT: v_fma_f32 v4, -v1, v2, 1.0
+; GFX6-IEEE-NEXT: v_fma_f32 v2, v4, v2, v2
+; GFX6-IEEE-NEXT: v_mul_f32_e32 v4, v3, v2
+; GFX6-IEEE-NEXT: v_fma_f32 v5, -v1, v4, v3
+; GFX6-IEEE-NEXT: v_fma_f32 v4, v5, v2, v4
+; GFX6-IEEE-NEXT: v_fma_f32 v1, -v1, v4, v3
+; GFX6-IEEE-NEXT: v_div_fmas_f32 v1, v1, v2, v4
+; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v1, v0, 1.0
; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-IEEE-NEXT: s_setpc_b64 s[30:31]
;
@@ -4590,24 +4440,23 @@ define half @v_rsq_f16(half %a) {
; GFX6-FLUSH: ; %bb.0:
; GFX6-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, 1.0
; GFX6-FLUSH-NEXT: v_sqrt_f32_e32 v0, v0
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, v1
-; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v3, v2
-; GFX6-FLUSH-NEXT: v_div_scale_f32 v4, vcc, v1, v0, v1
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, 1.0
+; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v2, v1
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v3, vcc, 1.0, v0, 1.0
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
-; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v2, v3, 1.0
-; GFX6-FLUSH-NEXT: v_fma_f32 v3, v5, v3, v3
-; GFX6-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v3
-; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v5, v4
-; GFX6-FLUSH-NEXT: v_fma_f32 v5, v6, v3, v5
-; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v5, v4
+; GFX6-FLUSH-NEXT: v_fma_f32 v4, -v1, v2, 1.0
+; GFX6-FLUSH-NEXT: v_fma_f32 v2, v4, v2, v2
+; GFX6-FLUSH-NEXT: v_mul_f32_e32 v4, v3, v2
+; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v1, v4, v3
+; GFX6-FLUSH-NEXT: v_fma_f32 v4, v5, v2, v4
+; GFX6-FLUSH-NEXT: v_fma_f32 v1, -v1, v4, v3
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
-; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v5
-; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v2, v0, v1
+; GFX6-FLUSH-NEXT: v_div_fmas_f32 v1, v1, v2, v4
+; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v1, v0, 1.0
; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-FLUSH-NEXT: s_setpc_b64 s[30:31]
;
@@ -4632,21 +4481,20 @@ define half @v_neg_rsq_f16(half %a) {
; GFX6-IEEE: ; %bb.0:
; GFX6-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, -1.0
; GFX6-IEEE-NEXT: v_sqrt_f32_e32 v0, v0
; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX6-IEEE-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, v1
-; GFX6-IEEE-NEXT: v_rcp_f32_e32 v3, v2
-; GFX6-IEEE-NEXT: v_div_scale_f32 v4, vcc, v1, v0, v1
-; GFX6-IEEE-NEXT: v_fma_f32 v5, -v2, v3, 1.0
-; GFX6-IEEE-NEXT: v_fma_f32 v3, v5, v3, v3
-; GFX6-IEEE-NEXT: v_mul_f32_e32 v5, v4, v3
-; GFX6-IEEE-NEXT: v_fma_f32 v6, -v2, v5, v4
-; GFX6-IEEE-NEXT: v_fma_f32 v5, v6, v3, v5
-; GFX6-IEEE-NEXT: v_fma_f32 v2, -v2, v5, v4
-; GFX6-IEEE-NEXT: v_div_fmas_f32 v2, v2, v3, v5
-; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v2, v0, v1
+; GFX6-IEEE-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, -1.0
+; GFX6-IEEE-NEXT: v_rcp_f32_e32 v2, v1
+; GFX6-IEEE-NEXT: v_div_scale_f32 v3, vcc, -1.0, v0, -1.0
+; GFX6-IEEE-NEXT: v_fma_f32 v4, -v1, v2, 1.0
+; GFX6-IEEE-NEXT: v_fma_f32 v2, v4, v2, v2
+; GFX6-IEEE-NEXT: v_mul_f32_e32 v4, v3, v2
+; GFX6-IEEE-NEXT: v_fma_f32 v5, -v1, v4, v3
+; GFX6-IEEE-NEXT: v_fma_f32 v4, v5, v2, v4
+; GFX6-IEEE-NEXT: v_fma_f32 v1, -v1, v4, v3
+; GFX6-IEEE-NEXT: v_div_fmas_f32 v1, v1, v2, v4
+; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v1, v0, -1.0
; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-IEEE-NEXT: s_setpc_b64 s[30:31]
;
@@ -4654,24 +4502,23 @@ define half @v_neg_rsq_f16(half %a) {
; GFX6-FLUSH: ; %bb.0:
; GFX6-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, -1.0
; GFX6-FLUSH-NEXT: v_sqrt_f32_e32 v0, v0
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, v1
-; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v3, v2
-; GFX6-FLUSH-NEXT: v_div_scale_f32 v4, vcc, v1, v0, v1
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, -1.0
+; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v2, v1
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v3, vcc, -1.0, v0, -1.0
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
-; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v2, v3, 1.0
-; GFX6-FLUSH-NEXT: v_fma_f32 v3, v5, v3, v3
-; GFX6-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v3
-; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v5, v4
-; GFX6-FLUSH-NEXT: v_fma_f32 v5, v6, v3, v5
-; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v5, v4
+; GFX6-FLUSH-NEXT: v_fma_f32 v4, -v1, v2, 1.0
+; GFX6-FLUSH-NEXT: v_fma_f32 v2, v4, v2, v2
+; GFX6-FLUSH-NEXT: v_mul_f32_e32 v4, v3, v2
+; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v1, v4, v3
+; GFX6-FLUSH-NEXT: v_fma_f32 v4, v5, v2, v4
+; GFX6-FLUSH-NEXT: v_fma_f32 v1, -v1, v4, v3
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
-; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v5
-; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v2, v0, v1
+; GFX6-FLUSH-NEXT: v_div_fmas_f32 v1, v1, v2, v4
+; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v1, v0, -1.0
; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-FLUSH-NEXT: s_setpc_b64 s[30:31]
;
@@ -4706,21 +4553,20 @@ define { half, half } @v_rsq_f16_multi_use(half %a) {
; GFX6-IEEE: ; %bb.0:
; GFX6-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, 1.0
; GFX6-IEEE-NEXT: v_sqrt_f32_e32 v0, v0
; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v2, v0
-; GFX6-IEEE-NEXT: v_div_scale_f32 v3, s[4:5], v2, v2, v1
-; GFX6-IEEE-NEXT: v_rcp_f32_e32 v4, v3
-; GFX6-IEEE-NEXT: v_div_scale_f32 v5, vcc, v1, v2, v1
-; GFX6-IEEE-NEXT: v_fma_f32 v6, -v3, v4, 1.0
-; GFX6-IEEE-NEXT: v_fma_f32 v4, v6, v4, v4
-; GFX6-IEEE-NEXT: v_mul_f32_e32 v6, v5, v4
-; GFX6-IEEE-NEXT: v_fma_f32 v7, -v3, v6, v5
-; GFX6-IEEE-NEXT: v_fma_f32 v6, v7, v4, v6
-; GFX6-IEEE-NEXT: v_fma_f32 v3, -v3, v6, v5
-; GFX6-IEEE-NEXT: v_div_fmas_f32 v3, v3, v4, v6
-; GFX6-IEEE-NEXT: v_div_fixup_f32 v1, v3, v2, v1
+; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, v0
+; GFX6-IEEE-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, 1.0
+; GFX6-IEEE-NEXT: v_rcp_f32_e32 v3, v2
+; GFX6-IEEE-NEXT: v_div_scale_f32 v4, vcc, 1.0, v1, 1.0
+; GFX6-IEEE-NEXT: v_fma_f32 v5, -v2, v3, 1.0
+; GFX6-IEEE-NEXT: v_fma_f32 v3, v5, v3, v3
+; GFX6-IEEE-NEXT: v_mul_f32_e32 v5, v4, v3
+; GFX6-IEEE-NEXT: v_fma_f32 v6, -v2, v5, v4
+; GFX6-IEEE-NEXT: v_fma_f32 v5, v6, v3, v5
+; GFX6-IEEE-NEXT: v_fma_f32 v2, -v2, v5, v4
+; GFX6-IEEE-NEXT: v_div_fmas_f32 v2, v2, v3, v5
+; GFX6-IEEE-NEXT: v_div_fixup_f32 v1, v2, v1, 1.0
; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX6-IEEE-NEXT: s_setpc_b64 s[30:31]
;
@@ -4728,24 +4574,23 @@ define { half, half } @v_rsq_f16_multi_use(half %a) {
; GFX6-FLUSH: ; %bb.0:
; GFX6-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, 1.0
; GFX6-FLUSH-NEXT: v_sqrt_f32_e32 v0, v0
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v2, v0
-; GFX6-FLUSH-NEXT: v_div_scale_f32 v3, s[4:5], v2, v2, v1
-; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v4, v3
-; GFX6-FLUSH-NEXT: v_div_scale_f32 v5, vcc, v1, v2, v1
+; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, v0
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, 1.0
+; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v3, v2
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v4, vcc, 1.0, v1, 1.0
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
-; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v3, v4, 1.0
-; GFX6-FLUSH-NEXT: v_fma_f32 v4, v6, v4, v4
-; GFX6-FLUSH-NEXT: v_mul_f32_e32 v6, v5, v4
-; GFX6-FLUSH-NEXT: v_fma_f32 v7, -v3, v6, v5
-; GFX6-FLUSH-NEXT: v_fma_f32 v6, v7, v4, v6
-; GFX6-FLUSH-NEXT: v_fma_f32 v3, -v3, v6, v5
+; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v2, v3, 1.0
+; GFX6-FLUSH-NEXT: v_fma_f32 v3, v5, v3, v3
+; GFX6-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v3
+; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v5, v4
+; GFX6-FLUSH-NEXT: v_fma_f32 v5, v6, v3, v5
+; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v5, v4
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
-; GFX6-FLUSH-NEXT: v_div_fmas_f32 v3, v3, v4, v6
-; GFX6-FLUSH-NEXT: v_div_fixup_f32 v1, v3, v2, v1
+; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v5
+; GFX6-FLUSH-NEXT: v_div_fixup_f32 v1, v2, v1, 1.0
; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX6-FLUSH-NEXT: s_setpc_b64 s[30:31]
;
@@ -4785,21 +4630,20 @@ define half @v_rsq_f16_missing_contract0(half %a) {
; GFX6-IEEE: ; %bb.0:
; GFX6-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, 1.0
; GFX6-IEEE-NEXT: v_sqrt_f32_e32 v0, v0
; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX6-IEEE-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, v1
-; GFX6-IEEE-NEXT: v_rcp_f32_e32 v3, v2
-; GFX6-IEEE-NEXT: v_div_scale_f32 v4, vcc, v1, v0, v1
-; GFX6-IEEE-NEXT: v_fma_f32 v5, -v2, v3, 1.0
-; GFX6-IEEE-NEXT: v_fma_f32 v3, v5, v3, v3
-; GFX6-IEEE-NEXT: v_mul_f32_e32 v5, v4, v3
-; GFX6-IEEE-NEXT: v_fma_f32 v6, -v2, v5, v4
-; GFX6-IEEE-NEXT: v_fma_f32 v5, v6, v3, v5
-; GFX6-IEEE-NEXT: v_fma_f32 v2, -v2, v5, v4
-; GFX6-IEEE-NEXT: v_div_fmas_f32 v2, v2, v3, v5
-; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v2, v0, v1
+; GFX6-IEEE-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, 1.0
+; GFX6-IEEE-NEXT: v_rcp_f32_e32 v2, v1
+; GFX6-IEEE-NEXT: v_div_scale_f32 v3, vcc, 1.0, v0, 1.0
+; GFX6-IEEE-NEXT: v_fma_f32 v4, -v1, v2, 1.0
+; GFX6-IEEE-NEXT: v_fma_f32 v2, v4, v2, v2
+; GFX6-IEEE-NEXT: v_mul_f32_e32 v4, v3, v2
+; GFX6-IEEE-NEXT: v_fma_f32 v5, -v1, v4, v3
+; GFX6-IEEE-NEXT: v_fma_f32 v4, v5, v2, v4
+; GFX6-IEEE-NEXT: v_fma_f32 v1, -v1, v4, v3
+; GFX6-IEEE-NEXT: v_div_fmas_f32 v1, v1, v2, v4
+; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v1, v0, 1.0
; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-IEEE-NEXT: s_setpc_b64 s[30:31]
;
@@ -4807,24 +4651,23 @@ define half @v_rsq_f16_missing_contract0(half %a) {
; GFX6-FLUSH: ; %bb.0:
; GFX6-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, 1.0
; GFX6-FLUSH-NEXT: v_sqrt_f32_e32 v0, v0
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, v1
-; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v3, v2
-; GFX6-FLUSH-NEXT: v_div_scale_f32 v4, vcc, v1, v0, v1
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, 1.0
+; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v2, v1
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v3, vcc, 1.0, v0, 1.0
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
-; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v2, v3, 1.0
-; GFX6-FLUSH-NEXT: v_fma_f32 v3, v5, v3, v3
-; GFX6-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v3
-; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v5, v4
-; GFX6-FLUSH-NEXT: v_fma_f32 v5, v6, v3, v5
-; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v5, v4
+; GFX6-FLUSH-NEXT: v_fma_f32 v4, -v1, v2, 1.0
+; GFX6-FLUSH-NEXT: v_fma_f32 v2, v4, v2, v2
+; GFX6-FLUSH-NEXT: v_mul_f32_e32 v4, v3, v2
+; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v1, v4, v3
+; GFX6-FLUSH-NEXT: v_fma_f32 v4, v5, v2, v4
+; GFX6-FLUSH-NEXT: v_fma_f32 v1, -v1, v4, v3
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
-; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v5
-; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v2, v0, v1
+; GFX6-FLUSH-NEXT: v_div_fmas_f32 v1, v1, v2, v4
+; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v1, v0, 1.0
; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-FLUSH-NEXT: s_setpc_b64 s[30:31]
;
@@ -4859,21 +4702,20 @@ define half @v_rsq_f16_missing_contract1(half %a) {
; GFX6-IEEE: ; %bb.0:
; GFX6-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, 1.0
; GFX6-IEEE-NEXT: v_sqrt_f32_e32 v0, v0
; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX6-IEEE-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, v1
-; GFX6-IEEE-NEXT: v_rcp_f32_e32 v3, v2
-; GFX6-IEEE-NEXT: v_div_scale_f32 v4, vcc, v1, v0, v1
-; GFX6-IEEE-NEXT: v_fma_f32 v5, -v2, v3, 1.0
-; GFX6-IEEE-NEXT: v_fma_f32 v3, v5, v3, v3
-; GFX6-IEEE-NEXT: v_mul_f32_e32 v5, v4, v3
-; GFX6-IEEE-NEXT: v_fma_f32 v6, -v2, v5, v4
-; GFX6-IEEE-NEXT: v_fma_f32 v5, v6, v3, v5
-; GFX6-IEEE-NEXT: v_fma_f32 v2, -v2, v5, v4
-; GFX6-IEEE-NEXT: v_div_fmas_f32 v2, v2, v3, v5
-; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v2, v0, v1
+; GFX6-IEEE-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, 1.0
+; GFX6-IEEE-NEXT: v_rcp_f32_e32 v2, v1
+; GFX6-IEEE-NEXT: v_div_scale_f32 v3, vcc, 1.0, v0, 1.0
+; GFX6-IEEE-NEXT: v_fma_f32 v4, -v1, v2, 1.0
+; GFX6-IEEE-NEXT: v_fma_f32 v2, v4, v2, v2
+; GFX6-IEEE-NEXT: v_mul_f32_e32 v4, v3, v2
+; GFX6-IEEE-NEXT: v_fma_f32 v5, -v1, v4, v3
+; GFX6-IEEE-NEXT: v_fma_f32 v4, v5, v2, v4
+; GFX6-IEEE-NEXT: v_fma_f32 v1, -v1, v4, v3
+; GFX6-IEEE-NEXT: v_div_fmas_f32 v1, v1, v2, v4
+; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v1, v0, 1.0
; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-IEEE-NEXT: s_setpc_b64 s[30:31]
;
@@ -4881,24 +4723,23 @@ define half @v_rsq_f16_missing_contract1(half %a) {
; GFX6-FLUSH: ; %bb.0:
; GFX6-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, 1.0
; GFX6-FLUSH-NEXT: v_sqrt_f32_e32 v0, v0
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, v1
-; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v3, v2
-; GFX6-FLUSH-NEXT: v_div_scale_f32 v4, vcc, v1, v0, v1
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, 1.0
+; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v2, v1
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v3, vcc, 1.0, v0, 1.0
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
-; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v2, v3, 1.0
-; GFX6-FLUSH-NEXT: v_fma_f32 v3, v5, v3, v3
-; GFX6-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v3
-; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v5, v4
-; GFX6-FLUSH-NEXT: v_fma_f32 v5, v6, v3, v5
-; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v5, v4
+; GFX6-FLUSH-NEXT: v_fma_f32 v4, -v1, v2, 1.0
+; GFX6-FLUSH-NEXT: v_fma_f32 v2, v4, v2, v2
+; GFX6-FLUSH-NEXT: v_mul_f32_e32 v4, v3, v2
+; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v1, v4, v3
+; GFX6-FLUSH-NEXT: v_fma_f32 v4, v5, v2, v4
+; GFX6-FLUSH-NEXT: v_fma_f32 v1, -v1, v4, v3
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
-; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v5
-; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v2, v0, v1
+; GFX6-FLUSH-NEXT: v_div_fmas_f32 v1, v1, v2, v4
+; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v1, v0, 1.0
; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-FLUSH-NEXT: s_setpc_b64 s[30:31]
;
@@ -4933,21 +4774,20 @@ define half @v_neg_rsq_f16_missing_contract0(half %a) {
; GFX6-IEEE: ; %bb.0:
; GFX6-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, -1.0
; GFX6-IEEE-NEXT: v_sqrt_f32_e32 v0, v0
; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX6-IEEE-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, v1
-; GFX6-IEEE-NEXT: v_rcp_f32_e32 v3, v2
-; GFX6-IEEE-NEXT: v_div_scale_f32 v4, vcc, v1, v0, v1
-; GFX6-IEEE-NEXT: v_fma_f32 v5, -v2, v3, 1.0
-; GFX6-IEEE-NEXT: v_fma_f32 v3, v5, v3, v3
-; GFX6-IEEE-NEXT: v_mul_f32_e32 v5, v4, v3
-; GFX6-IEEE-NEXT: v_fma_f32 v6, -v2, v5, v4
-; GFX6-IEEE-NEXT: v_fma_f32 v5, v6, v3, v5
-; GFX6-IEEE-NEXT: v_fma_f32 v2, -v2, v5, v4
-; GFX6-IEEE-NEXT: v_div_fmas_f32 v2, v2, v3, v5
-; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v2, v0, v1
+; GFX6-IEEE-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, -1.0
+; GFX6-IEEE-NEXT: v_rcp_f32_e32 v2, v1
+; GFX6-IEEE-NEXT: v_div_scale_f32 v3, vcc, -1.0, v0, -1.0
+; GFX6-IEEE-NEXT: v_fma_f32 v4, -v1, v2, 1.0
+; GFX6-IEEE-NEXT: v_fma_f32 v2, v4, v2, v2
+; GFX6-IEEE-NEXT: v_mul_f32_e32 v4, v3, v2
+; GFX6-IEEE-NEXT: v_fma_f32 v5, -v1, v4, v3
+; GFX6-IEEE-NEXT: v_fma_f32 v4, v5, v2, v4
+; GFX6-IEEE-NEXT: v_fma_f32 v1, -v1, v4, v3
+; GFX6-IEEE-NEXT: v_div_fmas_f32 v1, v1, v2, v4
+; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v1, v0, -1.0
; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-IEEE-NEXT: s_setpc_b64 s[30:31]
;
@@ -4955,24 +4795,23 @@ define half @v_neg_rsq_f16_missing_contract0(half %a) {
; GFX6-FLUSH: ; %bb.0:
; GFX6-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, -1.0
; GFX6-FLUSH-NEXT: v_sqrt_f32_e32 v0, v0
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, v1
-; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v3, v2
-; GFX6-FLUSH-NEXT: v_div_scale_f32 v4, vcc, v1, v0, v1
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, -1.0
+; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v2, v1
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v3, vcc, -1.0, v0, -1.0
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
-; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v2, v3, 1.0
-; GFX6-FLUSH-NEXT: v_fma_f32 v3, v5, v3, v3
-; GFX6-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v3
-; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v5, v4
-; GFX6-FLUSH-NEXT: v_fma_f32 v5, v6, v3, v5
-; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v5, v4
+; GFX6-FLUSH-NEXT: v_fma_f32 v4, -v1, v2, 1.0
+; GFX6-FLUSH-NEXT: v_fma_f32 v2, v4, v2, v2
+; GFX6-FLUSH-NEXT: v_mul_f32_e32 v4, v3, v2
+; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v1, v4, v3
+; GFX6-FLUSH-NEXT: v_fma_f32 v4, v5, v2, v4
+; GFX6-FLUSH-NEXT: v_fma_f32 v1, -v1, v4, v3
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
-; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v5
-; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v2, v0, v1
+; GFX6-FLUSH-NEXT: v_div_fmas_f32 v1, v1, v2, v4
+; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v1, v0, -1.0
; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-FLUSH-NEXT: s_setpc_b64 s[30:31]
;
@@ -5007,21 +4846,20 @@ define half @v_neg_rsq_f16_missing_contract1(half %a) {
; GFX6-IEEE: ; %bb.0:
; GFX6-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, -1.0
; GFX6-IEEE-NEXT: v_sqrt_f32_e32 v0, v0
; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX6-IEEE-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, v1
-; GFX6-IEEE-NEXT: v_rcp_f32_e32 v3, v2
-; GFX6-IEEE-NEXT: v_div_scale_f32 v4, vcc, v1, v0, v1
-; GFX6-IEEE-NEXT: v_fma_f32 v5, -v2, v3, 1.0
-; GFX6-IEEE-NEXT: v_fma_f32 v3, v5, v3, v3
-; GFX6-IEEE-NEXT: v_mul_f32_e32 v5, v4, v3
-; GFX6-IEEE-NEXT: v_fma_f32 v6, -v2, v5, v4
-; GFX6-IEEE-NEXT: v_fma_f32 v5, v6, v3, v5
-; GFX6-IEEE-NEXT: v_fma_f32 v2, -v2, v5, v4
-; GFX6-IEEE-NEXT: v_div_fmas_f32 v2, v2, v3, v5
-; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v2, v0, v1
+; GFX6-IEEE-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, -1.0
+; GFX6-IEEE-NEXT: v_rcp_f32_e32 v2, v1
+; GFX6-IEEE-NEXT: v_div_scale_f32 v3, vcc, -1.0, v0, -1.0
+; GFX6-IEEE-NEXT: v_fma_f32 v4, -v1, v2, 1.0
+; GFX6-IEEE-NEXT: v_fma_f32 v2, v4, v2, v2
+; GFX6-IEEE-NEXT: v_mul_f32_e32 v4, v3, v2
+; GFX6-IEEE-NEXT: v_fma_f32 v5, -v1, v4, v3
+; GFX6-IEEE-NEXT: v_fma_f32 v4, v5, v2, v4
+; GFX6-IEEE-NEXT: v_fma_f32 v1, -v1, v4, v3
+; GFX6-IEEE-NEXT: v_div_fmas_f32 v1, v1, v2, v4
+; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v1, v0, -1.0
; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-IEEE-NEXT: s_setpc_b64 s[30:31]
;
@@ -5029,24 +4867,23 @@ define half @v_neg_rsq_f16_missing_contract1(half %a) {
; GFX6-FLUSH: ; %bb.0:
; GFX6-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, -1.0
; GFX6-FLUSH-NEXT: v_sqrt_f32_e32 v0, v0
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, v1
-; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v3, v2
-; GFX6-FLUSH-NEXT: v_div_scale_f32 v4, vcc, v1, v0, v1
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, -1.0
+; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v2, v1
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v3, vcc, -1.0, v0, -1.0
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
-; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v2, v3, 1.0
-; GFX6-FLUSH-NEXT: v_fma_f32 v3, v5, v3, v3
-; GFX6-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v3
-; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v5, v4
-; GFX6-FLUSH-NEXT: v_fma_f32 v5, v6, v3, v5
-; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v5, v4
+; GFX6-FLUSH-NEXT: v_fma_f32 v4, -v1, v2, 1.0
+; GFX6-FLUSH-NEXT: v_fma_f32 v2, v4, v2, v2
+; GFX6-FLUSH-NEXT: v_mul_f32_e32 v4, v3, v2
+; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v1, v4, v3
+; GFX6-FLUSH-NEXT: v_fma_f32 v4, v5, v2, v4
+; GFX6-FLUSH-NEXT: v_fma_f32 v1, -v1, v4, v3
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
-; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v5
-; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v2, v0, v1
+; GFX6-FLUSH-NEXT: v_div_fmas_f32 v1, v1, v2, v4
+; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v1, v0, -1.0
; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-FLUSH-NEXT: s_setpc_b64 s[30:31]
;
@@ -5081,21 +4918,20 @@ define half @v_neg_rsq_f16_fabs(half %a) {
; GFX6-IEEE: ; %bb.0:
; GFX6-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-IEEE-NEXT: v_cvt_f32_f16_e64 v0, |v0|
-; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, -1.0
; GFX6-IEEE-NEXT: v_sqrt_f32_e32 v0, v0
; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX6-IEEE-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, v1
-; GFX6-IEEE-NEXT: v_rcp_f32_e32 v3, v2
-; GFX6-IEEE-NEXT: v_div_scale_f32 v4, vcc, v1, v0, v1
-; GFX6-IEEE-NEXT: v_fma_f32 v5, -v2, v3, 1.0
-; GFX6-IEEE-NEXT: v_fma_f32 v3, v5, v3, v3
-; GFX6-IEEE-NEXT: v_mul_f32_e32 v5, v4, v3
-; GFX6-IEEE-NEXT: v_fma_f32 v6, -v2, v5, v4
-; GFX6-IEEE-NEXT: v_fma_f32 v5, v6, v3, v5
-; GFX6-IEEE-NEXT: v_fma_f32 v2, -v2, v5, v4
-; GFX6-IEEE-NEXT: v_div_fmas_f32 v2, v2, v3, v5
-; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v2, v0, v1
+; GFX6-IEEE-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, -1.0
+; GFX6-IEEE-NEXT: v_rcp_f32_e32 v2, v1
+; GFX6-IEEE-NEXT: v_div_scale_f32 v3, vcc, -1.0, v0, -1.0
+; GFX6-IEEE-NEXT: v_fma_f32 v4, -v1, v2, 1.0
+; GFX6-IEEE-NEXT: v_fma_f32 v2, v4, v2, v2
+; GFX6-IEEE-NEXT: v_mul_f32_e32 v4, v3, v2
+; GFX6-IEEE-NEXT: v_fma_f32 v5, -v1, v4, v3
+; GFX6-IEEE-NEXT: v_fma_f32 v4, v5, v2, v4
+; GFX6-IEEE-NEXT: v_fma_f32 v1, -v1, v4, v3
+; GFX6-IEEE-NEXT: v_div_fmas_f32 v1, v1, v2, v4
+; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v1, v0, -1.0
; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-IEEE-NEXT: s_setpc_b64 s[30:31]
;
@@ -5103,24 +4939,23 @@ define half @v_neg_rsq_f16_fabs(half %a) {
; GFX6-FLUSH: ; %bb.0:
; GFX6-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e64 v0, |v0|
-; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, -1.0
; GFX6-FLUSH-NEXT: v_sqrt_f32_e32 v0, v0
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, v1
-; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v3, v2
-; GFX6-FLUSH-NEXT: v_div_scale_f32 v4, vcc, v1, v0, v1
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, -1.0
+; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v2, v1
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v3, vcc, -1.0, v0, -1.0
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
-; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v2, v3, 1.0
-; GFX6-FLUSH-NEXT: v_fma_f32 v3, v5, v3, v3
-; GFX6-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v3
-; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v5, v4
-; GFX6-FLUSH-NEXT: v_fma_f32 v5, v6, v3, v5
-; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v5, v4
+; GFX6-FLUSH-NEXT: v_fma_f32 v4, -v1, v2, 1.0
+; GFX6-FLUSH-NEXT: v_fma_f32 v2, v4, v2, v2
+; GFX6-FLUSH-NEXT: v_mul_f32_e32 v4, v3, v2
+; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v1, v4, v3
+; GFX6-FLUSH-NEXT: v_fma_f32 v4, v5, v2, v4
+; GFX6-FLUSH-NEXT: v_fma_f32 v1, -v1, v4, v3
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
-; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v5
-; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v2, v0, v1
+; GFX6-FLUSH-NEXT: v_div_fmas_f32 v1, v1, v2, v4
+; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v1, v0, -1.0
; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-FLUSH-NEXT: s_setpc_b64 s[30:31]
;
@@ -5156,21 +4991,20 @@ define half @v_rsq_f16_arcp(half %a) {
; GFX6-IEEE: ; %bb.0:
; GFX6-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, 1.0
; GFX6-IEEE-NEXT: v_sqrt_f32_e32 v0, v0
; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX6-IEEE-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, v1
-; GFX6-IEEE-NEXT: v_rcp_f32_e32 v3, v2
-; GFX6-IEEE-NEXT: v_div_scale_f32 v4, vcc, v1, v0, v1
-; GFX6-IEEE-NEXT: v_fma_f32 v5, -v2, v3, 1.0
-; GFX6-IEEE-NEXT: v_fma_f32 v3, v5, v3, v3
-; GFX6-IEEE-NEXT: v_mul_f32_e32 v5, v4, v3
-; GFX6-IEEE-NEXT: v_fma_f32 v6, -v2, v5, v4
-; GFX6-IEEE-NEXT: v_fma_f32 v5, v6, v3, v5
-; GFX6-IEEE-NEXT: v_fma_f32 v2, -v2, v5, v4
-; GFX6-IEEE-NEXT: v_div_fmas_f32 v2, v2, v3, v5
-; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v2, v0, v1
+; GFX6-IEEE-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, 1.0
+; GFX6-IEEE-NEXT: v_rcp_f32_e32 v2, v1
+; GFX6-IEEE-NEXT: v_div_scale_f32 v3, vcc, 1.0, v0, 1.0
+; GFX6-IEEE-NEXT: v_fma_f32 v4, -v1, v2, 1.0
+; GFX6-IEEE-NEXT: v_fma_f32 v2, v4, v2, v2
+; GFX6-IEEE-NEXT: v_mul_f32_e32 v4, v3, v2
+; GFX6-IEEE-NEXT: v_fma_f32 v5, -v1, v4, v3
+; GFX6-IEEE-NEXT: v_fma_f32 v4, v5, v2, v4
+; GFX6-IEEE-NEXT: v_fma_f32 v1, -v1, v4, v3
+; GFX6-IEEE-NEXT: v_div_fmas_f32 v1, v1, v2, v4
+; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v1, v0, 1.0
; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-IEEE-NEXT: s_setpc_b64 s[30:31]
;
@@ -5178,24 +5012,23 @@ define half @v_rsq_f16_arcp(half %a) {
; GFX6-FLUSH: ; %bb.0:
; GFX6-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, 1.0
; GFX6-FLUSH-NEXT: v_sqrt_f32_e32 v0, v0
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, v1
-; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v3, v2
-; GFX6-FLUSH-NEXT: v_div_scale_f32 v4, vcc, v1, v0, v1
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, 1.0
+; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v2, v1
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v3, vcc, 1.0, v0, 1.0
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
-; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v2, v3, 1.0
-; GFX6-FLUSH-NEXT: v_fma_f32 v3, v5, v3, v3
-; GFX6-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v3
-; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v5, v4
-; GFX6-FLUSH-NEXT: v_fma_f32 v5, v6, v3, v5
-; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v5, v4
+; GFX6-FLUSH-NEXT: v_fma_f32 v4, -v1, v2, 1.0
+; GFX6-FLUSH-NEXT: v_fma_f32 v2, v4, v2, v2
+; GFX6-FLUSH-NEXT: v_mul_f32_e32 v4, v3, v2
+; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v1, v4, v3
+; GFX6-FLUSH-NEXT: v_fma_f32 v4, v5, v2, v4
+; GFX6-FLUSH-NEXT: v_fma_f32 v1, -v1, v4, v3
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
-; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v5
-; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v2, v0, v1
+; GFX6-FLUSH-NEXT: v_div_fmas_f32 v1, v1, v2, v4
+; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v1, v0, 1.0
; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-FLUSH-NEXT: s_setpc_b64 s[30:31]
;
@@ -5220,21 +5053,20 @@ define half @v_neg_rsq_f16_arcp(half %a) {
; GFX6-IEEE: ; %bb.0:
; GFX6-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, -1.0
; GFX6-IEEE-NEXT: v_sqrt_f32_e32 v0, v0
; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX6-IEEE-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, v1
-; GFX6-IEEE-NEXT: v_rcp_f32_e32 v3, v2
-; GFX6-IEEE-NEXT: v_div_scale_f32 v4, vcc, v1, v0, v1
-; GFX6-IEEE-NEXT: v_fma_f32 v5, -v2, v3, 1.0
-; GFX6-IEEE-NEXT: v_fma_f32 v3, v5, v3, v3
-; GFX6-IEEE-NEXT: v_mul_f32_e32 v5, v4, v3
-; GFX6-IEEE-NEXT: v_fma_f32 v6, -v2, v5, v4
-; GFX6-IEEE-NEXT: v_fma_f32 v5, v6, v3, v5
-; GFX6-IEEE-NEXT: v_fma_f32 v2, -v2, v5, v4
-; GFX6-IEEE-NEXT: v_div_fmas_f32 v2, v2, v3, v5
-; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v2, v0, v1
+; GFX6-IEEE-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, -1.0
+; GFX6-IEEE-NEXT: v_rcp_f32_e32 v2, v1
+; GFX6-IEEE-NEXT: v_div_scale_f32 v3, vcc, -1.0, v0, -1.0
+; GFX6-IEEE-NEXT: v_fma_f32 v4, -v1, v2, 1.0
+; GFX6-IEEE-NEXT: v_fma_f32 v2, v4, v2, v2
+; GFX6-IEEE-NEXT: v_mul_f32_e32 v4, v3, v2
+; GFX6-IEEE-NEXT: v_fma_f32 v5, -v1, v4, v3
+; GFX6-IEEE-NEXT: v_fma_f32 v4, v5, v2, v4
+; GFX6-IEEE-NEXT: v_fma_f32 v1, -v1, v4, v3
+; GFX6-IEEE-NEXT: v_div_fmas_f32 v1, v1, v2, v4
+; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v1, v0, -1.0
; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-IEEE-NEXT: s_setpc_b64 s[30:31]
;
@@ -5242,24 +5074,23 @@ define half @v_neg_rsq_f16_arcp(half %a) {
; GFX6-FLUSH: ; %bb.0:
; GFX6-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, -1.0
; GFX6-FLUSH-NEXT: v_sqrt_f32_e32 v0, v0
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, v1
-; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v3, v2
-; GFX6-FLUSH-NEXT: v_div_scale_f32 v4, vcc, v1, v0, v1
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v1, s[4:5], v0, v0, -1.0
+; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v2, v1
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v3, vcc, -1.0, v0, -1.0
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
-; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v2, v3, 1.0
-; GFX6-FLUSH-NEXT: v_fma_f32 v3, v5, v3, v3
-; GFX6-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v3
-; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v5, v4
-; GFX6-FLUSH-NEXT: v_fma_f32 v5, v6, v3, v5
-; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v5, v4
+; GFX6-FLUSH-NEXT: v_fma_f32 v4, -v1, v2, 1.0
+; GFX6-FLUSH-NEXT: v_fma_f32 v2, v4, v2, v2
+; GFX6-FLUSH-NEXT: v_mul_f32_e32 v4, v3, v2
+; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v1, v4, v3
+; GFX6-FLUSH-NEXT: v_fma_f32 v4, v5, v2, v4
+; GFX6-FLUSH-NEXT: v_fma_f32 v1, -v1, v4, v3
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
-; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v5
-; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v2, v0, v1
+; GFX6-FLUSH-NEXT: v_div_fmas_f32 v1, v1, v2, v4
+; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v1, v0, -1.0
; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-FLUSH-NEXT: s_setpc_b64 s[30:31]
;
@@ -5294,12 +5125,10 @@ define half @v_rsq_f16_afn(half %a) {
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX6-NEXT: v_cvt_f32_f16_e32 v1, 1.0
; GFX6-NEXT: v_sqrt_f32_e32 v0, v0
; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX6-NEXT: v_rcp_f32_e32 v0, v0
-; GFX6-NEXT: v_mul_f32_e32 v0, v1, v0
; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-NEXT: s_setpc_b64 s[30:31]
;
@@ -5324,12 +5153,10 @@ define half @v_rsq_f16_afn_nocontract(half %a) {
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX6-NEXT: v_cvt_f32_f16_e32 v1, 1.0
; GFX6-NEXT: v_sqrt_f32_e32 v0, v0
; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX6-NEXT: v_rcp_f32_e32 v0, v0
-; GFX6-NEXT: v_mul_f32_e32 v0, v1, v0
; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-NEXT: s_setpc_b64 s[30:31]
;
@@ -5365,36 +5192,35 @@ define <2 x half> @v_rsq_v2f16(<2 x half> %a) {
; GFX6-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, v1
-; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v2, 1.0
; GFX6-IEEE-NEXT: v_sqrt_f32_e32 v0, v0
; GFX6-IEEE-NEXT: v_sqrt_f32_e32 v1, v1
; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, v1
-; GFX6-IEEE-NEXT: v_div_scale_f32 v3, s[4:5], v0, v0, v2
-; GFX6-IEEE-NEXT: v_rcp_f32_e32 v6, v3
-; GFX6-IEEE-NEXT: v_div_scale_f32 v4, vcc, v2, v0, v2
-; GFX6-IEEE-NEXT: v_div_scale_f32 v5, s[4:5], v1, v1, v2
-; GFX6-IEEE-NEXT: v_fma_f32 v9, -v3, v6, 1.0
-; GFX6-IEEE-NEXT: v_fma_f32 v6, v9, v6, v6
-; GFX6-IEEE-NEXT: v_mul_f32_e32 v9, v4, v6
-; GFX6-IEEE-NEXT: v_rcp_f32_e32 v8, v5
-; GFX6-IEEE-NEXT: v_fma_f32 v10, -v3, v9, v4
-; GFX6-IEEE-NEXT: v_fma_f32 v9, v10, v6, v9
-; GFX6-IEEE-NEXT: v_fma_f32 v3, -v3, v9, v4
-; GFX6-IEEE-NEXT: v_div_fmas_f32 v3, v3, v6, v9
-; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v3, v0, v2
-; GFX6-IEEE-NEXT: v_fma_f32 v3, -v5, v8, 1.0
-; GFX6-IEEE-NEXT: v_div_scale_f32 v7, s[4:5], v2, v1, v2
-; GFX6-IEEE-NEXT: v_fma_f32 v3, v3, v8, v8
-; GFX6-IEEE-NEXT: v_mul_f32_e32 v4, v7, v3
-; GFX6-IEEE-NEXT: v_fma_f32 v6, -v5, v4, v7
-; GFX6-IEEE-NEXT: v_fma_f32 v4, v6, v3, v4
-; GFX6-IEEE-NEXT: v_fma_f32 v5, -v5, v4, v7
+; GFX6-IEEE-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, 1.0
+; GFX6-IEEE-NEXT: v_rcp_f32_e32 v5, v2
+; GFX6-IEEE-NEXT: v_div_scale_f32 v3, vcc, 1.0, v0, 1.0
+; GFX6-IEEE-NEXT: v_div_scale_f32 v4, s[4:5], v1, v1, 1.0
+; GFX6-IEEE-NEXT: v_fma_f32 v8, -v2, v5, 1.0
+; GFX6-IEEE-NEXT: v_fma_f32 v5, v8, v5, v5
+; GFX6-IEEE-NEXT: v_rcp_f32_e32 v6, v4
+; GFX6-IEEE-NEXT: v_mul_f32_e32 v8, v3, v5
+; GFX6-IEEE-NEXT: v_fma_f32 v10, -v2, v8, v3
+; GFX6-IEEE-NEXT: v_fma_f32 v8, v10, v5, v8
+; GFX6-IEEE-NEXT: v_fma_f32 v2, -v2, v8, v3
+; GFX6-IEEE-NEXT: v_fma_f32 v9, -v4, v6, 1.0
+; GFX6-IEEE-NEXT: v_div_fmas_f32 v2, v2, v5, v8
+; GFX6-IEEE-NEXT: v_div_scale_f32 v7, s[4:5], 1.0, v1, 1.0
+; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v2, v0, 1.0
+; GFX6-IEEE-NEXT: v_fma_f32 v2, v9, v6, v6
+; GFX6-IEEE-NEXT: v_mul_f32_e32 v3, v7, v2
+; GFX6-IEEE-NEXT: v_fma_f32 v5, -v4, v3, v7
+; GFX6-IEEE-NEXT: v_fma_f32 v3, v5, v2, v3
+; GFX6-IEEE-NEXT: v_fma_f32 v4, -v4, v3, v7
; GFX6-IEEE-NEXT: s_mov_b64 vcc, s[4:5]
-; GFX6-IEEE-NEXT: v_div_fmas_f32 v3, v5, v3, v4
-; GFX6-IEEE-NEXT: v_div_fixup_f32 v1, v3, v1, v2
+; GFX6-IEEE-NEXT: v_div_fmas_f32 v2, v4, v2, v3
+; GFX6-IEEE-NEXT: v_div_fixup_f32 v1, v2, v1, 1.0
; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX6-IEEE-NEXT: s_setpc_b64 s[30:31]
@@ -5404,42 +5230,40 @@ define <2 x half> @v_rsq_v2f16(<2 x half> %a) {
; GFX6-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, v1
-; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v2, 1.0
; GFX6-FLUSH-NEXT: v_sqrt_f32_e32 v0, v0
; GFX6-FLUSH-NEXT: v_sqrt_f32_e32 v1, v1
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX6-FLUSH-NEXT: v_div_scale_f32 v3, s[4:5], v0, v0, v2
-; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v4, v3
-; GFX6-FLUSH-NEXT: v_div_scale_f32 v5, vcc, v2, v0, v2
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, 1.0
+; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v3, v2
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v4, vcc, 1.0, v0, 1.0
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
-; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v3, v4, 1.0
-; GFX6-FLUSH-NEXT: v_fma_f32 v4, v6, v4, v4
-; GFX6-FLUSH-NEXT: v_mul_f32_e32 v6, v5, v4
-; GFX6-FLUSH-NEXT: v_fma_f32 v7, -v3, v6, v5
-; GFX6-FLUSH-NEXT: v_fma_f32 v6, v7, v4, v6
-; GFX6-FLUSH-NEXT: v_fma_f32 v3, -v3, v6, v5
+; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v2, v3, 1.0
+; GFX6-FLUSH-NEXT: v_fma_f32 v3, v5, v3, v3
+; GFX6-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v3
+; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v5, v4
+; GFX6-FLUSH-NEXT: v_fma_f32 v5, v6, v3, v5
+; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v5, v4
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
-; GFX6-FLUSH-NEXT: v_div_fmas_f32 v3, v3, v4, v6
-; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v4, 1.0
; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, v1
-; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v3, v0, v2
+; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v5
+; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v2, v0, 1.0
; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, v4
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, 1.0
; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v3, v2
-; GFX6-FLUSH-NEXT: v_div_scale_f32 v5, vcc, v4, v1, v4
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v4, vcc, 1.0, v1, 1.0
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
-; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v3, 1.0
-; GFX6-FLUSH-NEXT: v_fma_f32 v3, v6, v3, v3
-; GFX6-FLUSH-NEXT: v_mul_f32_e32 v6, v5, v3
-; GFX6-FLUSH-NEXT: v_fma_f32 v7, -v2, v6, v5
-; GFX6-FLUSH-NEXT: v_fma_f32 v6, v7, v3, v6
-; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v6, v5
+; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v2, v3, 1.0
+; GFX6-FLUSH-NEXT: v_fma_f32 v3, v5, v3, v3
+; GFX6-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v3
+; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v5, v4
+; GFX6-FLUSH-NEXT: v_fma_f32 v5, v6, v3, v5
+; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v5, v4
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
-; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v6
-; GFX6-FLUSH-NEXT: v_div_fixup_f32 v1, v2, v1, v4
+; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v5
+; GFX6-FLUSH-NEXT: v_div_fixup_f32 v1, v2, v1, 1.0
; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX6-FLUSH-NEXT: s_setpc_b64 s[30:31]
;
@@ -5448,31 +5272,28 @@ define <2 x half> @v_rsq_v2f16(<2 x half> %a) {
; GFX8-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-IEEE-NEXT: v_sqrt_f16_e32 v1, v0
; GFX8-IEEE-NEXT: v_sqrt_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX8-IEEE-NEXT: v_cvt_f32_f16_e32 v4, 1.0
; GFX8-IEEE-NEXT: v_cvt_f32_f16_e32 v2, v1
; GFX8-IEEE-NEXT: v_cvt_f32_f16_e32 v3, v0
-; GFX8-IEEE-NEXT: v_rcp_f32_e32 v5, v2
-; GFX8-IEEE-NEXT: v_rcp_f32_e32 v6, v3
-; GFX8-IEEE-NEXT: v_mul_f32_e32 v7, v4, v5
-; GFX8-IEEE-NEXT: v_mul_f32_e32 v8, v4, v6
-; GFX8-IEEE-NEXT: v_mul_f32_e64 v10, -v3, v8
-; GFX8-IEEE-NEXT: v_mul_f32_e64 v9, -v2, v7
-; GFX8-IEEE-NEXT: v_add_f32_e32 v10, v10, v4
-; GFX8-IEEE-NEXT: v_add_f32_e32 v9, v9, v4
-; GFX8-IEEE-NEXT: v_mul_f32_e32 v10, v10, v6
-; GFX8-IEEE-NEXT: v_mul_f32_e32 v9, v9, v5
-; GFX8-IEEE-NEXT: v_add_f32_e32 v8, v10, v8
-; GFX8-IEEE-NEXT: v_add_f32_e32 v7, v9, v7
-; GFX8-IEEE-NEXT: v_mul_f32_e64 v3, -v3, v8
-; GFX8-IEEE-NEXT: v_mul_f32_e64 v2, -v2, v7
-; GFX8-IEEE-NEXT: v_add_f32_e32 v3, v3, v4
-; GFX8-IEEE-NEXT: v_add_f32_e32 v2, v2, v4
-; GFX8-IEEE-NEXT: v_mul_f32_e32 v3, v3, v6
-; GFX8-IEEE-NEXT: v_mul_f32_e32 v2, v2, v5
+; GFX8-IEEE-NEXT: v_rcp_f32_e32 v4, v2
+; GFX8-IEEE-NEXT: v_rcp_f32_e32 v5, v3
+; GFX8-IEEE-NEXT: v_mul_f32_e64 v6, -v2, v4
+; GFX8-IEEE-NEXT: v_mul_f32_e64 v7, -v3, v5
+; GFX8-IEEE-NEXT: v_add_f32_e32 v7, 1.0, v7
+; GFX8-IEEE-NEXT: v_add_f32_e32 v6, 1.0, v6
+; GFX8-IEEE-NEXT: v_mul_f32_e32 v7, v7, v5
+; GFX8-IEEE-NEXT: v_mul_f32_e32 v6, v6, v4
+; GFX8-IEEE-NEXT: v_add_f32_e32 v7, v7, v5
+; GFX8-IEEE-NEXT: v_add_f32_e32 v6, v6, v4
+; GFX8-IEEE-NEXT: v_mul_f32_e64 v3, -v3, v7
+; GFX8-IEEE-NEXT: v_mul_f32_e64 v2, -v2, v6
+; GFX8-IEEE-NEXT: v_add_f32_e32 v3, 1.0, v3
+; GFX8-IEEE-NEXT: v_add_f32_e32 v2, 1.0, v2
+; GFX8-IEEE-NEXT: v_mul_f32_e32 v3, v3, v5
+; GFX8-IEEE-NEXT: v_mul_f32_e32 v2, v2, v4
; GFX8-IEEE-NEXT: v_and_b32_e32 v3, 0xff800000, v3
; GFX8-IEEE-NEXT: v_and_b32_e32 v2, 0xff800000, v2
-; GFX8-IEEE-NEXT: v_add_f32_e32 v3, v3, v8
-; GFX8-IEEE-NEXT: v_add_f32_e32 v2, v2, v7
+; GFX8-IEEE-NEXT: v_add_f32_e32 v3, v3, v7
+; GFX8-IEEE-NEXT: v_add_f32_e32 v2, v2, v6
; GFX8-IEEE-NEXT: v_cvt_f16_f32_e32 v3, v3
; GFX8-IEEE-NEXT: v_cvt_f16_f32_e32 v2, v2
; GFX8-IEEE-NEXT: v_div_fixup_f16 v0, v3, v0, 1.0
@@ -5486,25 +5307,22 @@ define <2 x half> @v_rsq_v2f16(<2 x half> %a) {
; GFX8-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-FLUSH-NEXT: v_sqrt_f16_e32 v1, v0
; GFX8-FLUSH-NEXT: v_sqrt_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX8-FLUSH-NEXT: v_cvt_f32_f16_e32 v4, 1.0
; GFX8-FLUSH-NEXT: v_cvt_f32_f16_e32 v2, v1
; GFX8-FLUSH-NEXT: v_cvt_f32_f16_e32 v3, v0
-; GFX8-FLUSH-NEXT: v_rcp_f32_e32 v5, v2
-; GFX8-FLUSH-NEXT: v_rcp_f32_e32 v6, v3
-; GFX8-FLUSH-NEXT: v_mul_f32_e32 v7, v4, v5
-; GFX8-FLUSH-NEXT: v_mul_f32_e32 v8, v4, v6
-; GFX8-FLUSH-NEXT: v_mad_f32 v10, -v3, v8, v4
-; GFX8-FLUSH-NEXT: v_mad_f32 v9, -v2, v7, v4
-; GFX8-FLUSH-NEXT: v_mac_f32_e32 v8, v10, v6
-; GFX8-FLUSH-NEXT: v_mac_f32_e32 v7, v9, v5
-; GFX8-FLUSH-NEXT: v_mad_f32 v3, -v3, v8, v4
-; GFX8-FLUSH-NEXT: v_mad_f32 v2, -v2, v7, v4
-; GFX8-FLUSH-NEXT: v_mul_f32_e32 v3, v3, v6
-; GFX8-FLUSH-NEXT: v_mul_f32_e32 v2, v2, v5
+; GFX8-FLUSH-NEXT: v_rcp_f32_e32 v4, v2
+; GFX8-FLUSH-NEXT: v_rcp_f32_e32 v5, v3
+; GFX8-FLUSH-NEXT: v_mad_f32 v6, -v2, v4, 1.0
+; GFX8-FLUSH-NEXT: v_mad_f32 v7, -v3, v5, 1.0
+; GFX8-FLUSH-NEXT: v_mad_f32 v7, v7, v5, v5
+; GFX8-FLUSH-NEXT: v_mad_f32 v6, v6, v4, v4
+; GFX8-FLUSH-NEXT: v_mad_f32 v3, -v3, v7, 1.0
+; GFX8-FLUSH-NEXT: v_mad_f32 v2, -v2, v6, 1.0
+; GFX8-FLUSH-NEXT: v_mul_f32_e32 v3, v3, v5
+; GFX8-FLUSH-NEXT: v_mul_f32_e32 v2, v2, v4
; GFX8-FLUSH-NEXT: v_and_b32_e32 v3, 0xff800000, v3
; GFX8-FLUSH-NEXT: v_and_b32_e32 v2, 0xff800000, v2
-; GFX8-FLUSH-NEXT: v_add_f32_e32 v3, v3, v8
-; GFX8-FLUSH-NEXT: v_add_f32_e32 v2, v2, v7
+; GFX8-FLUSH-NEXT: v_add_f32_e32 v3, v3, v7
+; GFX8-FLUSH-NEXT: v_add_f32_e32 v2, v2, v6
; GFX8-FLUSH-NEXT: v_cvt_f16_f32_e32 v3, v3
; GFX8-FLUSH-NEXT: v_cvt_f16_f32_e32 v2, v2
; GFX8-FLUSH-NEXT: v_div_fixup_f16 v0, v3, v0, 1.0
@@ -5518,25 +5336,22 @@ define <2 x half> @v_rsq_v2f16(<2 x half> %a) {
; GFX9-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-IEEE-NEXT: v_sqrt_f16_e32 v1, v0
; GFX9-IEEE-NEXT: v_sqrt_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v4, 1.0
; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v2, v1
; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v3, v0
-; GFX9-IEEE-NEXT: v_rcp_f32_e32 v5, v2
-; GFX9-IEEE-NEXT: v_rcp_f32_e32 v6, v3
-; GFX9-IEEE-NEXT: v_mul_f32_e32 v7, v4, v5
-; GFX9-IEEE-NEXT: v_mul_f32_e32 v8, v4, v6
-; GFX9-IEEE-NEXT: v_fma_f32 v9, -v2, v7, v4
-; GFX9-IEEE-NEXT: v_fma_f32 v10, -v3, v8, v4
-; GFX9-IEEE-NEXT: v_fma_f32 v7, v9, v5, v7
-; GFX9-IEEE-NEXT: v_fma_f32 v8, v10, v6, v8
-; GFX9-IEEE-NEXT: v_fma_f32 v2, -v2, v7, v4
-; GFX9-IEEE-NEXT: v_fma_f32 v3, -v3, v8, v4
-; GFX9-IEEE-NEXT: v_mul_f32_e32 v2, v2, v5
-; GFX9-IEEE-NEXT: v_mul_f32_e32 v3, v3, v6
+; GFX9-IEEE-NEXT: v_rcp_f32_e32 v4, v2
+; GFX9-IEEE-NEXT: v_rcp_f32_e32 v5, v3
+; GFX9-IEEE-NEXT: v_fma_f32 v6, -v2, v4, 1.0
+; GFX9-IEEE-NEXT: v_fma_f32 v7, -v3, v5, 1.0
+; GFX9-IEEE-NEXT: v_fma_f32 v6, v6, v4, v4
+; GFX9-IEEE-NEXT: v_fma_f32 v7, v7, v5, v5
+; GFX9-IEEE-NEXT: v_fma_f32 v2, -v2, v6, 1.0
+; GFX9-IEEE-NEXT: v_fma_f32 v3, -v3, v7, 1.0
+; GFX9-IEEE-NEXT: v_mul_f32_e32 v2, v2, v4
+; GFX9-IEEE-NEXT: v_mul_f32_e32 v3, v3, v5
; GFX9-IEEE-NEXT: v_and_b32_e32 v2, 0xff800000, v2
; GFX9-IEEE-NEXT: v_and_b32_e32 v3, 0xff800000, v3
-; GFX9-IEEE-NEXT: v_add_f32_e32 v2, v2, v7
-; GFX9-IEEE-NEXT: v_add_f32_e32 v3, v3, v8
+; GFX9-IEEE-NEXT: v_add_f32_e32 v2, v2, v6
+; GFX9-IEEE-NEXT: v_add_f32_e32 v3, v3, v7
; GFX9-IEEE-NEXT: v_cvt_f16_f32_e32 v2, v2
; GFX9-IEEE-NEXT: v_cvt_f16_f32_e32 v3, v3
; GFX9-IEEE-NEXT: v_div_fixup_f16 v1, v2, v1, 1.0
@@ -5549,25 +5364,23 @@ define <2 x half> @v_rsq_v2f16(<2 x half> %a) {
; GFX9-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-FLUSH-NEXT: v_sqrt_f16_e32 v1, v0
; GFX9-FLUSH-NEXT: v_sqrt_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v4, 1.0
+; GFX9-FLUSH-NEXT: v_mov_b32_e32 v4, 1.0
; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v2, v1
; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v3, v0
; GFX9-FLUSH-NEXT: v_rcp_f32_e32 v2, v2
; GFX9-FLUSH-NEXT: v_rcp_f32_e32 v3, v3
-; GFX9-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v2
-; GFX9-FLUSH-NEXT: v_mul_f32_e32 v4, v4, v3
-; GFX9-FLUSH-NEXT: v_mad_mix_f32 v6, -v1, v5, 1.0 op_sel_hi:[1,0,1]
-; GFX9-FLUSH-NEXT: v_mad_mix_f32 v7, -v0, v4, 1.0 op_sel_hi:[1,0,1]
-; GFX9-FLUSH-NEXT: v_mac_f32_e32 v5, v6, v2
-; GFX9-FLUSH-NEXT: v_mac_f32_e32 v4, v7, v3
-; GFX9-FLUSH-NEXT: v_mad_mix_f32 v6, -v1, v5, 1.0 op_sel_hi:[1,0,1]
-; GFX9-FLUSH-NEXT: v_mad_mix_f32 v7, -v0, v4, 1.0 op_sel_hi:[1,0,1]
-; GFX9-FLUSH-NEXT: v_mul_f32_e32 v2, v6, v2
-; GFX9-FLUSH-NEXT: v_mul_f32_e32 v3, v7, v3
+; GFX9-FLUSH-NEXT: v_mad_mix_f32 v5, -v1, v2, v4 op_sel_hi:[1,0,0]
+; GFX9-FLUSH-NEXT: v_mad_mix_f32 v6, -v0, v3, v4 op_sel_hi:[1,0,0]
+; GFX9-FLUSH-NEXT: v_mad_f32 v5, v5, v2, v2
+; GFX9-FLUSH-NEXT: v_mad_f32 v6, v6, v3, v3
+; GFX9-FLUSH-NEXT: v_mad_mix_f32 v7, -v1, v5, v4 op_sel_hi:[1,0,0]
+; GFX9-FLUSH-NEXT: v_mad_mix_f32 v4, -v0, v6, v4 op_sel_hi:[1,0,0]
+; GFX9-FLUSH-NEXT: v_mul_f32_e32 v2, v7, v2
+; GFX9-FLUSH-NEXT: v_mul_f32_e32 v3, v4, v3
; GFX9-FLUSH-NEXT: v_and_b32_e32 v2, 0xff800000, v2
; GFX9-FLUSH-NEXT: v_and_b32_e32 v3, 0xff800000, v3
; GFX9-FLUSH-NEXT: v_add_f32_e32 v2, v2, v5
-; GFX9-FLUSH-NEXT: v_add_f32_e32 v3, v3, v4
+; GFX9-FLUSH-NEXT: v_add_f32_e32 v3, v3, v6
; GFX9-FLUSH-NEXT: v_cvt_f16_f32_e32 v2, v2
; GFX9-FLUSH-NEXT: v_cvt_f16_f32_e32 v3, v3
; GFX9-FLUSH-NEXT: v_div_fixup_f16 v1, v2, v1, 1.0
@@ -5580,25 +5393,23 @@ define <2 x half> @v_rsq_v2f16(<2 x half> %a) {
; GFX10-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-IEEE-NEXT: v_sqrt_f16_e32 v1, v0
; GFX10-IEEE-NEXT: v_sqrt_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX10-IEEE-NEXT: v_cvt_f32_f16_e32 v4, 1.0
+; GFX10-IEEE-NEXT: v_mov_b32_e32 v4, 1.0
; GFX10-IEEE-NEXT: v_cvt_f32_f16_e32 v2, v1
; GFX10-IEEE-NEXT: v_cvt_f32_f16_e32 v3, v0
; GFX10-IEEE-NEXT: v_rcp_f32_e32 v2, v2
; GFX10-IEEE-NEXT: v_rcp_f32_e32 v3, v3
-; GFX10-IEEE-NEXT: v_mul_f32_e32 v5, v4, v2
-; GFX10-IEEE-NEXT: v_mul_f32_e32 v4, v4, v3
-; GFX10-IEEE-NEXT: v_fma_mix_f32 v6, -v1, v5, 1.0 op_sel_hi:[1,0,1]
-; GFX10-IEEE-NEXT: v_fma_mix_f32 v7, -v0, v4, 1.0 op_sel_hi:[1,0,1]
-; GFX10-IEEE-NEXT: v_fmac_f32_e32 v5, v6, v2
-; GFX10-IEEE-NEXT: v_fmac_f32_e32 v4, v7, v3
-; GFX10-IEEE-NEXT: v_fma_mix_f32 v6, -v1, v5, 1.0 op_sel_hi:[1,0,1]
-; GFX10-IEEE-NEXT: v_fma_mix_f32 v7, -v0, v4, 1.0 op_sel_hi:[1,0,1]
-; GFX10-IEEE-NEXT: v_mul_f32_e32 v2, v6, v2
-; GFX10-IEEE-NEXT: v_mul_f32_e32 v3, v7, v3
+; GFX10-IEEE-NEXT: v_fma_mix_f32 v5, -v1, v2, v4 op_sel_hi:[1,0,0]
+; GFX10-IEEE-NEXT: v_fma_mix_f32 v6, -v0, v3, v4 op_sel_hi:[1,0,0]
+; GFX10-IEEE-NEXT: v_fma_f32 v5, v5, v2, v2
+; GFX10-IEEE-NEXT: v_fma_f32 v6, v6, v3, v3
+; GFX10-IEEE-NEXT: v_fma_mix_f32 v7, -v1, v5, v4 op_sel_hi:[1,0,0]
+; GFX10-IEEE-NEXT: v_fma_mix_f32 v4, -v0, v6, v4 op_sel_hi:[1,0,0]
+; GFX10-IEEE-NEXT: v_mul_f32_e32 v2, v7, v2
+; GFX10-IEEE-NEXT: v_mul_f32_e32 v3, v4, v3
; GFX10-IEEE-NEXT: v_and_b32_e32 v2, 0xff800000, v2
; GFX10-IEEE-NEXT: v_and_b32_e32 v3, 0xff800000, v3
; GFX10-IEEE-NEXT: v_add_f32_e32 v2, v2, v5
-; GFX10-IEEE-NEXT: v_add_f32_e32 v3, v3, v4
+; GFX10-IEEE-NEXT: v_add_f32_e32 v3, v3, v6
; GFX10-IEEE-NEXT: v_cvt_f16_f32_e32 v2, v2
; GFX10-IEEE-NEXT: v_cvt_f16_f32_e32 v3, v3
; GFX10-IEEE-NEXT: v_div_fixup_f16 v1, v2, v1, 1.0
@@ -5611,25 +5422,22 @@ define <2 x half> @v_rsq_v2f16(<2 x half> %a) {
; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-FLUSH-NEXT: v_sqrt_f16_e32 v1, v0
; GFX10-FLUSH-NEXT: v_sqrt_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX10-FLUSH-NEXT: v_cvt_f32_f16_e32 v6, 1.0
; GFX10-FLUSH-NEXT: v_cvt_f32_f16_e32 v2, v1
; GFX10-FLUSH-NEXT: v_cvt_f32_f16_e32 v3, v0
; GFX10-FLUSH-NEXT: v_rcp_f32_e32 v4, v2
; GFX10-FLUSH-NEXT: v_rcp_f32_e32 v5, v3
-; GFX10-FLUSH-NEXT: v_mul_f32_e32 v7, v6, v4
-; GFX10-FLUSH-NEXT: v_mul_f32_e32 v8, v6, v5
-; GFX10-FLUSH-NEXT: v_mad_f32 v9, -v2, v7, v6
-; GFX10-FLUSH-NEXT: v_mad_f32 v10, -v3, v8, v6
-; GFX10-FLUSH-NEXT: v_mac_f32_e32 v7, v9, v4
-; GFX10-FLUSH-NEXT: v_mac_f32_e32 v8, v10, v5
-; GFX10-FLUSH-NEXT: v_mad_f32 v2, -v2, v7, v6
-; GFX10-FLUSH-NEXT: v_mad_f32 v3, -v3, v8, v6
+; GFX10-FLUSH-NEXT: v_mad_f32 v6, -v2, v4, 1.0
+; GFX10-FLUSH-NEXT: v_mad_f32 v7, -v3, v5, 1.0
+; GFX10-FLUSH-NEXT: v_mad_f32 v6, v6, v4, v4
+; GFX10-FLUSH-NEXT: v_mad_f32 v7, v7, v5, v5
+; GFX10-FLUSH-NEXT: v_mad_f32 v2, -v2, v6, 1.0
+; GFX10-FLUSH-NEXT: v_mad_f32 v3, -v3, v7, 1.0
; GFX10-FLUSH-NEXT: v_mul_f32_e32 v2, v2, v4
; GFX10-FLUSH-NEXT: v_mul_f32_e32 v3, v3, v5
; GFX10-FLUSH-NEXT: v_and_b32_e32 v2, 0xff800000, v2
; GFX10-FLUSH-NEXT: v_and_b32_e32 v3, 0xff800000, v3
-; GFX10-FLUSH-NEXT: v_add_f32_e32 v2, v2, v7
-; GFX10-FLUSH-NEXT: v_add_f32_e32 v3, v3, v8
+; GFX10-FLUSH-NEXT: v_add_f32_e32 v2, v2, v6
+; GFX10-FLUSH-NEXT: v_add_f32_e32 v3, v3, v7
; GFX10-FLUSH-NEXT: v_cvt_f16_f32_e32 v2, v2
; GFX10-FLUSH-NEXT: v_cvt_f16_f32_e32 v3, v3
; GFX10-FLUSH-NEXT: v_div_fixup_f16 v1, v2, v1, 1.0
@@ -5642,7 +5450,7 @@ define <2 x half> @v_rsq_v2f16(<2 x half> %a) {
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GFX11-NEXT: v_sqrt_f16_e32 v0, v0
-; GFX11-NEXT: v_cvt_f32_f16_e32 v4, 1.0
+; GFX11-NEXT: v_mov_b32_e32 v4, 1.0
; GFX11-NEXT: v_sqrt_f16_e32 v1, v1
; GFX11-NEXT: s_waitcnt_depctr 0xfff
; GFX11-NEXT: v_cvt_f32_f16_e32 v2, v0
@@ -5650,22 +5458,20 @@ define <2 x half> @v_rsq_v2f16(<2 x half> %a) {
; GFX11-NEXT: v_rcp_f32_e32 v2, v2
; GFX11-NEXT: v_rcp_f32_e32 v3, v3
; GFX11-NEXT: s_waitcnt_depctr 0xfff
-; GFX11-NEXT: v_mul_f32_e32 v5, v4, v2
-; GFX11-NEXT: v_mul_f32_e32 v4, v4, v3
-; GFX11-NEXT: v_fma_mix_f32 v7, -v1, v4, 1.0 op_sel_hi:[1,0,1]
-; GFX11-NEXT: v_fmac_f32_e32 v4, v7, v3
-; GFX11-NEXT: v_fma_mix_f32 v6, -v0, v5, 1.0 op_sel_hi:[1,0,1]
-; GFX11-NEXT: v_fma_mix_f32 v7, -v1, v4, 1.0 op_sel_hi:[1,0,1]
-; GFX11-NEXT: v_fmac_f32_e32 v5, v6, v2
-; GFX11-NEXT: v_mul_f32_e32 v3, v7, v3
-; GFX11-NEXT: v_fma_mix_f32 v6, -v0, v5, 1.0 op_sel_hi:[1,0,1]
-; GFX11-NEXT: v_dual_mul_f32 v2, v6, v2 :: v_dual_and_b32 v3, 0xff800000, v3
-; GFX11-NEXT: v_dual_add_f32 v3, v3, v4 :: v_dual_and_b32 v2, 0xff800000, v2
-; GFX11-NEXT: v_cvt_f16_f32_e32 v3, v3
-; GFX11-NEXT: v_add_f32_e32 v2, v2, v5
-; GFX11-NEXT: v_div_fixup_f16 v1, v3, v1, 1.0
+; GFX11-NEXT: v_fma_mix_f32 v5, -v0, v2, v4 op_sel_hi:[1,0,0]
+; GFX11-NEXT: v_fma_mix_f32 v6, -v1, v3, v4 op_sel_hi:[1,0,0]
+; GFX11-NEXT: v_fma_f32 v5, v5, v2, v2
+; GFX11-NEXT: v_fma_f32 v6, v6, v3, v3
+; GFX11-NEXT: v_fma_mix_f32 v7, -v0, v5, v4 op_sel_hi:[1,0,0]
+; GFX11-NEXT: v_fma_mix_f32 v4, -v1, v6, v4 op_sel_hi:[1,0,0]
+; GFX11-NEXT: v_dual_mul_f32 v2, v7, v2 :: v_dual_mul_f32 v3, v4, v3
+; GFX11-NEXT: v_and_b32_e32 v2, 0xff800000, v2
+; GFX11-NEXT: v_dual_add_f32 v2, v2, v5 :: v_dual_and_b32 v3, 0xff800000, v3
+; GFX11-NEXT: v_add_f32_e32 v3, v3, v6
; GFX11-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GFX11-NEXT: v_cvt_f16_f32_e32 v3, v3
; GFX11-NEXT: v_div_fixup_f16 v0, v2, v0, 1.0
+; GFX11-NEXT: v_div_fixup_f16 v1, v3, v1, 1.0
; GFX11-NEXT: v_pack_b32_f16 v0, v0, v1
; GFX11-NEXT: s_setpc_b64 s[30:31]
%sqrt = call contract <2 x half> @llvm.sqrt.v2f16(<2 x half> %a)
@@ -5679,36 +5485,35 @@ define <2 x half> @v_neg_rsq_v2f16(<2 x half> %a) {
; GFX6-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, v1
-; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v2, -1.0
; GFX6-IEEE-NEXT: v_sqrt_f32_e32 v0, v0
; GFX6-IEEE-NEXT: v_sqrt_f32_e32 v1, v1
; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX6-IEEE-NEXT: v_cvt_f32_f16_e32 v1, v1
-; GFX6-IEEE-NEXT: v_div_scale_f32 v3, s[4:5], v0, v0, v2
-; GFX6-IEEE-NEXT: v_rcp_f32_e32 v6, v3
-; GFX6-IEEE-NEXT: v_div_scale_f32 v4, vcc, v2, v0, v2
-; GFX6-IEEE-NEXT: v_div_scale_f32 v5, s[4:5], v1, v1, v2
-; GFX6-IEEE-NEXT: v_fma_f32 v9, -v3, v6, 1.0
-; GFX6-IEEE-NEXT: v_fma_f32 v6, v9, v6, v6
-; GFX6-IEEE-NEXT: v_mul_f32_e32 v9, v4, v6
-; GFX6-IEEE-NEXT: v_rcp_f32_e32 v8, v5
-; GFX6-IEEE-NEXT: v_fma_f32 v10, -v3, v9, v4
-; GFX6-IEEE-NEXT: v_fma_f32 v9, v10, v6, v9
-; GFX6-IEEE-NEXT: v_fma_f32 v3, -v3, v9, v4
-; GFX6-IEEE-NEXT: v_div_fmas_f32 v3, v3, v6, v9
-; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v3, v0, v2
-; GFX6-IEEE-NEXT: v_fma_f32 v3, -v5, v8, 1.0
-; GFX6-IEEE-NEXT: v_div_scale_f32 v7, s[4:5], v2, v1, v2
-; GFX6-IEEE-NEXT: v_fma_f32 v3, v3, v8, v8
-; GFX6-IEEE-NEXT: v_mul_f32_e32 v4, v7, v3
-; GFX6-IEEE-NEXT: v_fma_f32 v6, -v5, v4, v7
-; GFX6-IEEE-NEXT: v_fma_f32 v4, v6, v3, v4
-; GFX6-IEEE-NEXT: v_fma_f32 v5, -v5, v4, v7
+; GFX6-IEEE-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, -1.0
+; GFX6-IEEE-NEXT: v_rcp_f32_e32 v5, v2
+; GFX6-IEEE-NEXT: v_div_scale_f32 v3, vcc, -1.0, v0, -1.0
+; GFX6-IEEE-NEXT: v_div_scale_f32 v4, s[4:5], v1, v1, -1.0
+; GFX6-IEEE-NEXT: v_fma_f32 v8, -v2, v5, 1.0
+; GFX6-IEEE-NEXT: v_fma_f32 v5, v8, v5, v5
+; GFX6-IEEE-NEXT: v_rcp_f32_e32 v6, v4
+; GFX6-IEEE-NEXT: v_mul_f32_e32 v8, v3, v5
+; GFX6-IEEE-NEXT: v_fma_f32 v10, -v2, v8, v3
+; GFX6-IEEE-NEXT: v_fma_f32 v8, v10, v5, v8
+; GFX6-IEEE-NEXT: v_fma_f32 v2, -v2, v8, v3
+; GFX6-IEEE-NEXT: v_fma_f32 v9, -v4, v6, 1.0
+; GFX6-IEEE-NEXT: v_div_fmas_f32 v2, v2, v5, v8
+; GFX6-IEEE-NEXT: v_div_scale_f32 v7, s[4:5], -1.0, v1, -1.0
+; GFX6-IEEE-NEXT: v_div_fixup_f32 v0, v2, v0, -1.0
+; GFX6-IEEE-NEXT: v_fma_f32 v2, v9, v6, v6
+; GFX6-IEEE-NEXT: v_mul_f32_e32 v3, v7, v2
+; GFX6-IEEE-NEXT: v_fma_f32 v5, -v4, v3, v7
+; GFX6-IEEE-NEXT: v_fma_f32 v3, v5, v2, v3
+; GFX6-IEEE-NEXT: v_fma_f32 v4, -v4, v3, v7
; GFX6-IEEE-NEXT: s_mov_b64 vcc, s[4:5]
-; GFX6-IEEE-NEXT: v_div_fmas_f32 v3, v5, v3, v4
-; GFX6-IEEE-NEXT: v_div_fixup_f32 v1, v3, v1, v2
+; GFX6-IEEE-NEXT: v_div_fmas_f32 v2, v4, v2, v3
+; GFX6-IEEE-NEXT: v_div_fixup_f32 v1, v2, v1, -1.0
; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-IEEE-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX6-IEEE-NEXT: s_setpc_b64 s[30:31]
@@ -5718,42 +5523,40 @@ define <2 x half> @v_neg_rsq_v2f16(<2 x half> %a) {
; GFX6-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, v1
-; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v2, -1.0
; GFX6-FLUSH-NEXT: v_sqrt_f32_e32 v0, v0
; GFX6-FLUSH-NEXT: v_sqrt_f32_e32 v1, v1
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX6-FLUSH-NEXT: v_div_scale_f32 v3, s[4:5], v0, v0, v2
-; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v4, v3
-; GFX6-FLUSH-NEXT: v_div_scale_f32 v5, vcc, v2, v0, v2
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v0, v0, -1.0
+; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v3, v2
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v4, vcc, -1.0, v0, -1.0
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
-; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v3, v4, 1.0
-; GFX6-FLUSH-NEXT: v_fma_f32 v4, v6, v4, v4
-; GFX6-FLUSH-NEXT: v_mul_f32_e32 v6, v5, v4
-; GFX6-FLUSH-NEXT: v_fma_f32 v7, -v3, v6, v5
-; GFX6-FLUSH-NEXT: v_fma_f32 v6, v7, v4, v6
-; GFX6-FLUSH-NEXT: v_fma_f32 v3, -v3, v6, v5
+; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v2, v3, 1.0
+; GFX6-FLUSH-NEXT: v_fma_f32 v3, v5, v3, v3
+; GFX6-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v3
+; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v5, v4
+; GFX6-FLUSH-NEXT: v_fma_f32 v5, v6, v3, v5
+; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v5, v4
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
-; GFX6-FLUSH-NEXT: v_div_fmas_f32 v3, v3, v4, v6
-; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v4, -1.0
; GFX6-FLUSH-NEXT: v_cvt_f32_f16_e32 v1, v1
-; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v3, v0, v2
+; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v5
+; GFX6-FLUSH-NEXT: v_div_fixup_f32 v0, v2, v0, -1.0
; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, v4
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, -1.0
; GFX6-FLUSH-NEXT: v_rcp_f32_e32 v3, v2
-; GFX6-FLUSH-NEXT: v_div_scale_f32 v5, vcc, v4, v1, v4
+; GFX6-FLUSH-NEXT: v_div_scale_f32 v4, vcc, -1.0, v1, -1.0
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
-; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v3, 1.0
-; GFX6-FLUSH-NEXT: v_fma_f32 v3, v6, v3, v3
-; GFX6-FLUSH-NEXT: v_mul_f32_e32 v6, v5, v3
-; GFX6-FLUSH-NEXT: v_fma_f32 v7, -v2, v6, v5
-; GFX6-FLUSH-NEXT: v_fma_f32 v6, v7, v3, v6
-; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v6, v5
+; GFX6-FLUSH-NEXT: v_fma_f32 v5, -v2, v3, 1.0
+; GFX6-FLUSH-NEXT: v_fma_f32 v3, v5, v3, v3
+; GFX6-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v3
+; GFX6-FLUSH-NEXT: v_fma_f32 v6, -v2, v5, v4
+; GFX6-FLUSH-NEXT: v_fma_f32 v5, v6, v3, v5
+; GFX6-FLUSH-NEXT: v_fma_f32 v2, -v2, v5, v4
; GFX6-FLUSH-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
-; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v6
-; GFX6-FLUSH-NEXT: v_div_fixup_f32 v1, v2, v1, v4
+; GFX6-FLUSH-NEXT: v_div_fmas_f32 v2, v2, v3, v5
+; GFX6-FLUSH-NEXT: v_div_fixup_f32 v1, v2, v1, -1.0
; GFX6-FLUSH-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX6-FLUSH-NEXT: s_setpc_b64 s[30:31]
;
@@ -5762,31 +5565,28 @@ define <2 x half> @v_neg_rsq_v2f16(<2 x half> %a) {
; GFX8-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-IEEE-NEXT: v_sqrt_f16_e32 v1, v0
; GFX8-IEEE-NEXT: v_sqrt_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX8-IEEE-NEXT: v_cvt_f32_f16_e32 v4, -1.0
; GFX8-IEEE-NEXT: v_cvt_f32_f16_e32 v2, v1
; GFX8-IEEE-NEXT: v_cvt_f32_f16_e32 v3, v0
-; GFX8-IEEE-NEXT: v_rcp_f32_e32 v5, v2
-; GFX8-IEEE-NEXT: v_rcp_f32_e32 v6, v3
-; GFX8-IEEE-NEXT: v_mul_f32_e32 v7, v4, v5
-; GFX8-IEEE-NEXT: v_mul_f32_e32 v8, v4, v6
-; GFX8-IEEE-NEXT: v_mul_f32_e64 v10, -v3, v8
-; GFX8-IEEE-NEXT: v_mul_f32_e64 v9, -v2, v7
-; GFX8-IEEE-NEXT: v_add_f32_e32 v10, v10, v4
-; GFX8-IEEE-NEXT: v_add_f32_e32 v9, v9, v4
-; GFX8-IEEE-NEXT: v_mul_f32_e32 v10, v10, v6
-; GFX8-IEEE-NEXT: v_mul_f32_e32 v9, v9, v5
-; GFX8-IEEE-NEXT: v_add_f32_e32 v8, v10, v8
-; GFX8-IEEE-NEXT: v_add_f32_e32 v7, v9, v7
-; GFX8-IEEE-NEXT: v_mul_f32_e64 v3, -v3, v8
-; GFX8-IEEE-NEXT: v_mul_f32_e64 v2, -v2, v7
-; GFX8-IEEE-NEXT: v_add_f32_e32 v3, v3, v4
-; GFX8-IEEE-NEXT: v_add_f32_e32 v2, v2, v4
-; GFX8-IEEE-NEXT: v_mul_f32_e32 v3, v3, v6
-; GFX8-IEEE-NEXT: v_mul_f32_e32 v2, v2, v5
+; GFX8-IEEE-NEXT: v_rcp_f32_e32 v4, v2
+; GFX8-IEEE-NEXT: v_rcp_f32_e32 v5, v3
+; GFX8-IEEE-NEXT: v_mul_f32_e32 v6, v2, v4
+; GFX8-IEEE-NEXT: v_mul_f32_e32 v7, v3, v5
+; GFX8-IEEE-NEXT: v_add_f32_e32 v7, -1.0, v7
+; GFX8-IEEE-NEXT: v_add_f32_e32 v6, -1.0, v6
+; GFX8-IEEE-NEXT: v_mul_f32_e32 v7, v7, v5
+; GFX8-IEEE-NEXT: v_mul_f32_e32 v6, v6, v4
+; GFX8-IEEE-NEXT: v_sub_f32_e32 v7, v7, v5
+; GFX8-IEEE-NEXT: v_sub_f32_e32 v6, v6, v4
+; GFX8-IEEE-NEXT: v_mul_f32_e64 v3, -v3, v7
+; GFX8-IEEE-NEXT: v_mul_f32_e64 v2, -v2, v6
+; GFX8-IEEE-NEXT: v_add_f32_e32 v3, -1.0, v3
+; GFX8-IEEE-NEXT: v_add_f32_e32 v2, -1.0, v2
+; GFX8-IEEE-NEXT: v_mul_f32_e32 v3, v3, v5
+; GFX8-IEEE-NEXT: v_mul_f32_e32 v2, v2, v4
; GFX8-IEEE-NEXT: v_and_b32_e32 v3, 0xff800000, v3
; GFX8-IEEE-NEXT: v_and_b32_e32 v2, 0xff800000, v2
-; GFX8-IEEE-NEXT: v_add_f32_e32 v3, v3, v8
-; GFX8-IEEE-NEXT: v_add_f32_e32 v2, v2, v7
+; GFX8-IEEE-NEXT: v_add_f32_e32 v3, v3, v7
+; GFX8-IEEE-NEXT: v_add_f32_e32 v2, v2, v6
; GFX8-IEEE-NEXT: v_cvt_f16_f32_e32 v3, v3
; GFX8-IEEE-NEXT: v_cvt_f16_f32_e32 v2, v2
; GFX8-IEEE-NEXT: v_div_fixup_f16 v0, v3, v0, -1.0
@@ -5800,25 +5600,22 @@ define <2 x half> @v_neg_rsq_v2f16(<2 x half> %a) {
; GFX8-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-FLUSH-NEXT: v_sqrt_f16_e32 v1, v0
; GFX8-FLUSH-NEXT: v_sqrt_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX8-FLUSH-NEXT: v_cvt_f32_f16_e32 v4, -1.0
; GFX8-FLUSH-NEXT: v_cvt_f32_f16_e32 v2, v1
; GFX8-FLUSH-NEXT: v_cvt_f32_f16_e32 v3, v0
-; GFX8-FLUSH-NEXT: v_rcp_f32_e32 v5, v2
-; GFX8-FLUSH-NEXT: v_rcp_f32_e32 v6, v3
-; GFX8-FLUSH-NEXT: v_mul_f32_e32 v7, v4, v5
-; GFX8-FLUSH-NEXT: v_mul_f32_e32 v8, v4, v6
-; GFX8-FLUSH-NEXT: v_mad_f32 v10, -v3, v8, v4
-; GFX8-FLUSH-NEXT: v_mad_f32 v9, -v2, v7, v4
-; GFX8-FLUSH-NEXT: v_mac_f32_e32 v8, v10, v6
-; GFX8-FLUSH-NEXT: v_mac_f32_e32 v7, v9, v5
-; GFX8-FLUSH-NEXT: v_mad_f32 v3, -v3, v8, v4
-; GFX8-FLUSH-NEXT: v_mad_f32 v2, -v2, v7, v4
-; GFX8-FLUSH-NEXT: v_mul_f32_e32 v3, v3, v6
-; GFX8-FLUSH-NEXT: v_mul_f32_e32 v2, v2, v5
+; GFX8-FLUSH-NEXT: v_rcp_f32_e32 v4, v2
+; GFX8-FLUSH-NEXT: v_rcp_f32_e32 v5, v3
+; GFX8-FLUSH-NEXT: v_mad_f32 v6, v2, v4, -1.0
+; GFX8-FLUSH-NEXT: v_mad_f32 v7, v3, v5, -1.0
+; GFX8-FLUSH-NEXT: v_mad_f32 v7, v7, v5, -v5
+; GFX8-FLUSH-NEXT: v_mad_f32 v6, v6, v4, -v4
+; GFX8-FLUSH-NEXT: v_mad_f32 v3, -v3, v7, -1.0
+; GFX8-FLUSH-NEXT: v_mad_f32 v2, -v2, v6, -1.0
+; GFX8-FLUSH-NEXT: v_mul_f32_e32 v3, v3, v5
+; GFX8-FLUSH-NEXT: v_mul_f32_e32 v2, v2, v4
; GFX8-FLUSH-NEXT: v_and_b32_e32 v3, 0xff800000, v3
; GFX8-FLUSH-NEXT: v_and_b32_e32 v2, 0xff800000, v2
-; GFX8-FLUSH-NEXT: v_add_f32_e32 v3, v3, v8
-; GFX8-FLUSH-NEXT: v_add_f32_e32 v2, v2, v7
+; GFX8-FLUSH-NEXT: v_add_f32_e32 v3, v3, v7
+; GFX8-FLUSH-NEXT: v_add_f32_e32 v2, v2, v6
; GFX8-FLUSH-NEXT: v_cvt_f16_f32_e32 v3, v3
; GFX8-FLUSH-NEXT: v_cvt_f16_f32_e32 v2, v2
; GFX8-FLUSH-NEXT: v_div_fixup_f16 v0, v3, v0, -1.0
@@ -5832,25 +5629,22 @@ define <2 x half> @v_neg_rsq_v2f16(<2 x half> %a) {
; GFX9-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-IEEE-NEXT: v_sqrt_f16_e32 v1, v0
; GFX9-IEEE-NEXT: v_sqrt_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v4, -1.0
; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v2, v1
; GFX9-IEEE-NEXT: v_cvt_f32_f16_e32 v3, v0
-; GFX9-IEEE-NEXT: v_rcp_f32_e32 v5, v2
-; GFX9-IEEE-NEXT: v_rcp_f32_e32 v6, v3
-; GFX9-IEEE-NEXT: v_mul_f32_e32 v7, v4, v5
-; GFX9-IEEE-NEXT: v_mul_f32_e32 v8, v4, v6
-; GFX9-IEEE-NEXT: v_fma_f32 v9, -v2, v7, v4
-; GFX9-IEEE-NEXT: v_fma_f32 v10, -v3, v8, v4
-; GFX9-IEEE-NEXT: v_fma_f32 v7, v9, v5, v7
-; GFX9-IEEE-NEXT: v_fma_f32 v8, v10, v6, v8
-; GFX9-IEEE-NEXT: v_fma_f32 v2, -v2, v7, v4
-; GFX9-IEEE-NEXT: v_fma_f32 v3, -v3, v8, v4
-; GFX9-IEEE-NEXT: v_mul_f32_e32 v2, v2, v5
-; GFX9-IEEE-NEXT: v_mul_f32_e32 v3, v3, v6
+; GFX9-IEEE-NEXT: v_rcp_f32_e32 v4, v2
+; GFX9-IEEE-NEXT: v_rcp_f32_e32 v5, v3
+; GFX9-IEEE-NEXT: v_fma_f32 v6, v2, v4, -1.0
+; GFX9-IEEE-NEXT: v_fma_f32 v7, v3, v5, -1.0
+; GFX9-IEEE-NEXT: v_fma_f32 v6, v6, v4, -v4
+; GFX9-IEEE-NEXT: v_fma_f32 v7, v7, v5, -v5
+; GFX9-IEEE-NEXT: v_fma_f32 v2, -v2, v6, -1.0
+; GFX9-IEEE-NEXT: v_fma_f32 v3, -v3, v7, -1.0
+; GFX9-IEEE-NEXT: v_mul_f32_e32 v2, v2, v4
+; GFX9-IEEE-NEXT: v_mul_f32_e32 v3, v3, v5
; GFX9-IEEE-NEXT: v_and_b32_e32 v2, 0xff800000, v2
; GFX9-IEEE-NEXT: v_and_b32_e32 v3, 0xff800000, v3
-; GFX9-IEEE-NEXT: v_add_f32_e32 v2, v2, v7
-; GFX9-IEEE-NEXT: v_add_f32_e32 v3, v3, v8
+; GFX9-IEEE-NEXT: v_add_f32_e32 v2, v2, v6
+; GFX9-IEEE-NEXT: v_add_f32_e32 v3, v3, v7
; GFX9-IEEE-NEXT: v_cvt_f16_f32_e32 v2, v2
; GFX9-IEEE-NEXT: v_cvt_f16_f32_e32 v3, v3
; GFX9-IEEE-NEXT: v_div_fixup_f16 v1, v2, v1, -1.0
@@ -5863,25 +5657,23 @@ define <2 x half> @v_neg_rsq_v2f16(<2 x half> %a) {
; GFX9-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-FLUSH-NEXT: v_sqrt_f16_e32 v1, v0
; GFX9-FLUSH-NEXT: v_sqrt_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v4, -1.0
+; GFX9-FLUSH-NEXT: v_mov_b32_e32 v4, -1.0
; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v2, v1
; GFX9-FLUSH-NEXT: v_cvt_f32_f16_e32 v3, v0
; GFX9-FLUSH-NEXT: v_rcp_f32_e32 v2, v2
; GFX9-FLUSH-NEXT: v_rcp_f32_e32 v3, v3
-; GFX9-FLUSH-NEXT: v_mul_f32_e32 v5, v4, v2
-; GFX9-FLUSH-NEXT: v_mul_f32_e32 v4, v4, v3
-; GFX9-FLUSH-NEXT: v_mad_mix_f32 v6, -v1, v5, -1.0 op_sel_hi:[1,0,1]
-; GFX9-FLUSH-NEXT: v_mad_mix_f32 v7, -v0, v4, -1.0 op_sel_hi:[1,0,1]
-; GFX9-FLUSH-NEXT: v_mac_f32_e32 v5, v6, v2
-; GFX9-FLUSH-NEXT: v_mac_f32_e32 v4, v7, v3
-; GFX9-FLUSH-NEXT: v_mad_mix_f32 v6, -v1, v5, -1.0 op_sel_hi:[1,0,1]
-; GFX9-FLUSH-NEXT: v_mad_mix_f32 v7, -v0, v4, -1.0 op_sel_hi:[1,0,1]
-; GFX9-FLUSH-NEXT: v_mul_f32_e32 v2, v6, v2
-; GFX9-FLUSH-NEXT: v_mul_f32_e32 v3, v7, v3
+; GFX9-FLUSH-NEXT: v_mad_mix_f32 v5, v1, v2, v4 op_sel_hi:[1,0,0]
+; GFX9-FLUSH-NEXT: v_mad_mix_f32 v6, v0, v3, v4 op_sel_hi:[1,0,0]
+; GFX9-FLUSH-NEXT: v_mad_f32 v5, v5, v2, -v2
+; GFX9-FLUSH-NEXT: v_mad_f32 v6, v6, v3, -v3
+; GFX9-FLUSH-NEXT: v_mad_mix_f32 v7, -v1, v5, v4 op_sel_hi:[1,0,0]
+; GFX9-FLUSH-NEXT: v_mad_mix_f32 v4, -v0, v6, v4 op_sel_hi:[1,0,0]
+; GFX9-FLUSH-NEXT: v_mul_f32_e32 v2, v7, v2
+; GFX9-FLUSH-NEXT: v_mul_f32_e32 v3, v4, v3
; GFX9-FLUSH-NEXT: v_and_b32_e32 v2, 0xff800000, v2
; GFX9-FLUSH-NEXT: v_and_b32_e32 v3, 0xff800000, v3
; GFX9-FLUSH-NEXT: v_add_f32_e32 v2, v2, v5
-; GFX9-FLUSH-NEXT: v_add_f32_e32 v3, v3, v4
+; GFX9-FLUSH-NEXT: v_add_f32_e32 v3, v3, v6
; GFX9-FLUSH-NEXT: v_cvt_f16_f32_e32 v2, v2
; GFX9-FLUSH-NEXT: v_cvt_f16_f32_e32 v3, v3
; GFX9-FLUSH-NEXT: v_div_fixup_f16 v1, v2, v1, -1.0
@@ -5894,25 +5686,23 @@ define <2 x half> @v_neg_rsq_v2f16(<2 x half> %a) {
; GFX10-IEEE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-IEEE-NEXT: v_sqrt_f16_e32 v1, v0
; GFX10-IEEE-NEXT: v_sqrt_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX10-IEEE-NEXT: v_cvt_f32_f16_e32 v4, -1.0
+; GFX10-IEEE-NEXT: v_mov_b32_e32 v4, -1.0
; GFX10-IEEE-NEXT: v_cvt_f32_f16_e32 v2, v1
; GFX10-IEEE-NEXT: v_cvt_f32_f16_e32 v3, v0
; GFX10-IEEE-NEXT: v_rcp_f32_e32 v2, v2
; GFX10-IEEE-NEXT: v_rcp_f32_e32 v3, v3
-; GFX10-IEEE-NEXT: v_mul_f32_e32 v5, v4, v2
-; GFX10-IEEE-NEXT: v_mul_f32_e32 v4, v4, v3
-; GFX10-IEEE-NEXT: v_fma_mix_f32 v6, -v1, v5, -1.0 op_sel_hi:[1,0,1]
-; GFX10-IEEE-NEXT: v_fma_mix_f32 v7, -v0, v4, -1.0 op_sel_hi:[1,0,1]
-; GFX10-IEEE-NEXT: v_fmac_f32_e32 v5, v6, v2
-; GFX10-IEEE-NEXT: v_fmac_f32_e32 v4, v7, v3
-; GFX10-IEEE-NEXT: v_fma_mix_f32 v6, -v1, v5, -1.0 op_sel_hi:[1,0,1]
-; GFX10-IEEE-NEXT: v_fma_mix_f32 v7, -v0, v4, -1.0 op_sel_hi:[1,0,1]
-; GFX10-IEEE-NEXT: v_mul_f32_e32 v2, v6, v2
-; GFX10-IEEE-NEXT: v_mul_f32_e32 v3, v7, v3
+; GFX10-IEEE-NEXT: v_fma_mix_f32 v5, v1, v2, v4 op_sel_hi:[1,0,0]
+; GFX10-IEEE-NEXT: v_fma_mix_f32 v6, v0, v3, v4 op_sel_hi:[1,0,0]
+; GFX10-IEEE-NEXT: v_fma_f32 v5, v5, v2, -v2
+; GFX10-IEEE-NEXT: v_fma_f32 v6, v6, v3, -v3
+; GFX10-IEEE-NEXT: v_fma_mix_f32 v7, -v1, v5, v4 op_sel_hi:[1,0,0]
+; GFX10-IEEE-NEXT: v_fma_mix_f32 v4, -v0, v6, v4 op_sel_hi:[1,0,0]
+; GFX10-IEEE-NEXT: v_mul_f32_e32 v2, v7, v2
+; GFX10-IEEE-NEXT: v_mul_f32_e32 v3, v4, v3
; GFX10-IEEE-NEXT: v_and_b32_e32 v2, 0xff800000, v2
; GFX10-IEEE-NEXT: v_and_b32_e32 v3, 0xff800000, v3
; GFX10-IEEE-NEXT: v_add_f32_e32 v2, v2, v5
-; GFX10-IEEE-NEXT: v_add_f32_e32 v3, v3, v4
+; GFX10-IEEE-NEXT: v_add_f32_e32 v3, v3, v6
; GFX10-IEEE-NEXT: v_cvt_f16_f32_e32 v2, v2
; GFX10-IEEE-NEXT: v_cvt_f16_f32_e32 v3, v3
; GFX10-IEEE-NEXT: v_div_fixup_f16 v1, v2, v1, -1.0
@@ -5925,25 +5715,22 @@ define <2 x half> @v_neg_rsq_v2f16(<2 x half> %a) {
; GFX10-FLUSH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-FLUSH-NEXT: v_sqrt_f16_e32 v1, v0
; GFX10-FLUSH-NEXT: v_sqrt_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GFX10-FLUSH-NEXT: v_cvt_f32_f16_e32 v6, -1.0
; GFX10-FLUSH-NEXT: v_cvt_f32_f16_e32 v2, v1
; GFX10-FLUSH-NEXT: v_cvt_f32_f16_e32 v3, v0
; GFX10-FLUSH-NEXT: v_rcp_f32_e32 v4, v2
; GFX10-FLUSH-NEXT: v_rcp_f32_e32 v5, v3
-; GFX10-FLUSH-NEXT: v_mul_f32_e32 v7, v6, v4
-; GFX10-FLUSH-NEXT: v_mul_f32_e32 v8, v6, v5
-; GFX10-FLUSH-NEXT: v_mad_f32 v9, -v2, v7, v6
-; GFX10-FLUSH-NEXT: v_mad_f32 v10, -v3, v8, v6
-; GFX10-FLUSH-NEXT: v_mac_f32_e32 v7, v9, v4
-; GFX10-FLUSH-NEXT: v_mac_f32_e32 v8, v10, v5
-; GFX10-FLUSH-NEXT: v_mad_f32 v2, -v2, v7, v6
-; GFX10-FLUSH-NEXT: v_mad_f32 v3, -v3, v8, v6
+; GFX10-FLUSH-NEXT: v_mad_f32 v6, v2, v4, -1.0
+; GFX10-FLUSH-NEXT: v_mad_f32 v7, v3, v5, -1.0
+; GFX10-FLUSH-NEXT: v_mad_f32 v6, v6, v4, -v4
+; GFX10-FLUSH-NEXT: v_mad_f32 v7, v7, v5, -v5
+; GFX10-FLUSH-NEXT: v_mad_f32 v2, -v2, v6, -1.0
+; GFX10-FLUSH-NEXT: v_mad_f32 v3, -v3, v7, -1.0
; GFX10-FLUSH-NEXT: v_mul_f32_e32 v2, v2, v4
; GFX10-FLUSH-NEXT: v_mul_f32_e32 v3, v3, v5
; GFX10-FLUSH-NEXT: v_and_b32_e32 v2, 0xff800000, v2
; GFX10-FLUSH-NEXT: v_and_b32_e32 v3, 0xff800000, v3
-; GFX10-FLUSH-NEXT: v_add_f32_e32 v2, v2, v7
-; GFX10-FLUSH-NEXT: v_add_f32_e32 v3, v3, v8
+; GFX10-FLUSH-NEXT: v_add_f32_e32 v2, v2, v6
+; GFX10-FLUSH-NEXT: v_add_f32_e32 v3, v3, v7
; GFX10-FLUSH-NEXT: v_cvt_f16_f32_e32 v2, v2
; GFX10-FLUSH-NEXT: v_cvt_f16_f32_e32 v3, v3
; GFX10-FLUSH-NEXT: v_div_fixup_f16 v1, v2, v1, -1.0
@@ -5956,7 +5743,7 @@ define <2 x half> @v_neg_rsq_v2f16(<2 x half> %a) {
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GFX11-NEXT: v_sqrt_f16_e32 v0, v0
-; GFX11-NEXT: v_cvt_f32_f16_e32 v4, -1.0
+; GFX11-NEXT: v_mov_b32_e32 v4, -1.0
; GFX11-NEXT: v_sqrt_f16_e32 v1, v1
; GFX11-NEXT: s_waitcnt_depctr 0xfff
; GFX11-NEXT: v_cvt_f32_f16_e32 v2, v0
@@ -5964,22 +5751,20 @@ define <2 x half> @v_neg_rsq_v2f16(<2 x half> %a) {
; GFX11-NEXT: v_rcp_f32_e32 v2, v2
; GFX11-NEXT: v_rcp_f32_e32 v3, v3
; GFX11-NEXT: s_waitcnt_depctr 0xfff
-; GFX11-NEXT: v_mul_f32_e32 v5, v4, v2
-; GFX11-NEXT: v_mul_f32_e32 v4, v4, v3
-; GFX11-NEXT: v_fma_mix_f32 v7, -v1, v4, -1.0 op_sel_hi:[1,0,1]
-; GFX11-NEXT: v_fmac_f32_e32 v4, v7, v3
-; GFX11-NEXT: v_fma_mix_f32 v6, -v0, v5, -1.0 op_sel_hi:[1,0,1]
-; GFX11-NEXT: v_fma_mix_f32 v7, -v1, v4, -1.0 op_sel_hi:[1,0,1]
-; GFX11-NEXT: v_fmac_f32_e32 v5, v6, v2
-; GFX11-NEXT: v_mul_f32_e32 v3, v7, v3
-; GFX11-NEXT: v_fma_mix_f32 v6, -v0, v5, -1.0 op_sel_hi:[1,0,1]
-; GFX11-NEXT: v_dual_mul_f32 v2, v6, v2 :: v_dual_and_b32 v3, 0xff800000, v3
-; GFX11-NEXT: v_dual_add_f32 v3, v3, v4 :: v_dual_and_b32 v2, 0xff800000, v2
-; GFX11-NEXT: v_cvt_f16_f32_e32 v3, v3
-; GFX11-NEXT: v_add_f32_e32 v2, v2, v5
-; GFX11-NEXT: v_div_fixup_f16 v1, v3, v1, -1.0
+; GFX11-NEXT: v_fma_mix_f32 v5, v0, v2, v4 op_sel_hi:[1,0,0]
+; GFX11-NEXT: v_fma_mix_f32 v6, v1, v3, v4 op_sel_hi:[1,0,0]
+; GFX11-NEXT: v_fma_f32 v5, v5, v2, -v2
+; GFX11-NEXT: v_fma_f32 v6, v6, v3, -v3
+; GFX11-NEXT: v_fma_mix_f32 v7, -v0, v5, v4 op_sel_hi:[1,0,0]
+; GFX11-NEXT: v_fma_mix_f32 v4, -v1, v6, v4 op_sel_hi:[1,0,0]
+; GFX11-NEXT: v_dual_mul_f32 v2, v7, v2 :: v_dual_mul_f32 v3, v4, v3
+; GFX11-NEXT: v_and_b32_e32 v2, 0xff800000, v2
+; GFX11-NEXT: v_dual_add_f32 v2, v2, v5 :: v_dual_and_b32 v3, 0xff800000, v3
+; GFX11-NEXT: v_add_f32_e32 v3, v3, v6
; GFX11-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GFX11-NEXT: v_cvt_f16_f32_e32 v3, v3
; GFX11-NEXT: v_div_fixup_f16 v0, v2, v0, -1.0
+; GFX11-NEXT: v_div_fixup_f16 v1, v3, v1, -1.0
; GFX11-NEXT: v_pack_b32_f16 v0, v0, v1
; GFX11-NEXT: s_setpc_b64 s[30:31]
%sqrt = call contract <2 x half> @llvm.sqrt.v2f16(<2 x half> %a)
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/frem.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/frem.ll
index 302b2395642d0..549af87c94949 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/frem.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/frem.ll
@@ -88,11 +88,10 @@ define amdgpu_kernel void @frem_f16(ptr addrspace(1) %out, ptr addrspace(1) %in1
; CI-NEXT: v_or_b32_e32 v1, s4, v0
; CI-NEXT: .LBB0_8: ; %Flow19
; CI-NEXT: v_cvt_f32_f16_e32 v0, s3
-; CI-NEXT: v_cvt_f32_f16_e32 v2, 0
; CI-NEXT: s_and_b32 s2, s2, 0x7fff
; CI-NEXT: s_cmpk_lg_i32 s2, 0x7c00
; CI-NEXT: s_cselect_b32 s2, 1, 0
-; CI-NEXT: v_cmp_nlg_f32_e32 vcc, v0, v2
+; CI-NEXT: v_cmp_nlg_f32_e32 vcc, 0, v0
; CI-NEXT: v_mov_b32_e32 v0, 0x7e00
; CI-NEXT: s_and_b32 s2, 1, s2
; CI-NEXT: v_cndmask_b32_e32 v1, v1, v0, vcc
@@ -1197,16 +1196,15 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; CI-NEXT: v_or_b32_e32 v1, s4, v1
; CI-NEXT: .LBB9_16: ; %Flow54
; CI-NEXT: v_cvt_f32_f16_e32 v2, s1
-; CI-NEXT: v_cvt_f32_f16_e32 v3, 0
; CI-NEXT: s_and_b32 s0, s0, 0x7fff
; CI-NEXT: s_cmpk_lg_i32 s0, 0x7c00
; CI-NEXT: s_cselect_b32 s4, 1, 0
-; CI-NEXT: v_cmp_nlg_f32_e32 vcc, v2, v3
+; CI-NEXT: v_cmp_nlg_f32_e32 vcc, 0, v2
; CI-NEXT: v_cvt_f32_f16_e32 v2, s3
; CI-NEXT: s_and_b32 s2, s2, 0x7fff
; CI-NEXT: s_cmpk_lg_i32 s2, 0x7c00
; CI-NEXT: s_cselect_b32 s2, 1, 0
-; CI-NEXT: v_cmp_nlg_f32_e64 s[0:1], v2, v3
+; CI-NEXT: v_cmp_nlg_f32_e64 s[0:1], 0, v2
; CI-NEXT: v_and_b32_e32 v0, 0xffff, v0
; CI-NEXT: v_mov_b32_e32 v2, 0x7e00
; CI-NEXT: s_and_b32 s3, 1, s4
@@ -1730,26 +1728,25 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; CI-NEXT: v_or_b32_e32 v3, s1, v3
; CI-NEXT: .LBB10_32: ; %Flow124
; CI-NEXT: v_cvt_f32_f16_e32 v4, s2
-; CI-NEXT: v_cvt_f32_f16_e32 v5, 0
; CI-NEXT: s_and_b32 s1, s4, 0x7fff
; CI-NEXT: s_cmpk_lg_i32 s1, 0x7c00
; CI-NEXT: s_cselect_b32 s11, 1, 0
-; CI-NEXT: v_cmp_nlg_f32_e32 vcc, v4, v5
+; CI-NEXT: v_cmp_nlg_f32_e32 vcc, 0, v4
; CI-NEXT: v_cvt_f32_f16_e32 v4, s0
; CI-NEXT: s_and_b32 s2, s6, 0x7fff
; CI-NEXT: s_cmpk_lg_i32 s2, 0x7c00
; CI-NEXT: s_cselect_b32 s6, 1, 0
-; CI-NEXT: v_cmp_nlg_f32_e64 s[0:1], v4, v5
+; CI-NEXT: v_cmp_nlg_f32_e64 s[0:1], 0, v4
; CI-NEXT: v_cvt_f32_f16_e32 v4, s3
; CI-NEXT: s_and_b32 s4, s5, 0x7fff
; CI-NEXT: s_cmpk_lg_i32 s4, 0x7c00
; CI-NEXT: s_cselect_b32 s12, 1, 0
-; CI-NEXT: v_cmp_nlg_f32_e64 s[2:3], v4, v5
+; CI-NEXT: v_cmp_nlg_f32_e64 s[2:3], 0, v4
; CI-NEXT: v_cvt_f32_f16_e32 v4, s10
; CI-NEXT: s_and_b32 s7, s7, 0x7fff
; CI-NEXT: s_cmpk_lg_i32 s7, 0x7c00
; CI-NEXT: s_cselect_b32 s7, 1, 0
-; CI-NEXT: v_cmp_nlg_f32_e64 s[4:5], v4, v5
+; CI-NEXT: v_cmp_nlg_f32_e64 s[4:5], 0, v4
; CI-NEXT: v_and_b32_e32 v0, 0xffff, v0
; CI-NEXT: v_mov_b32_e32 v4, 0x7e00
; CI-NEXT: s_and_b32 s10, 1, s11
diff --git a/llvm/test/CodeGen/AMDGPU/fmed3.ll b/llvm/test/CodeGen/AMDGPU/fmed3.ll
index 9233f8059a202..9e152253bb6ca 100644
--- a/llvm/test/CodeGen/AMDGPU/fmed3.ll
+++ b/llvm/test/CodeGen/AMDGPU/fmed3.ll
@@ -7464,18 +7464,15 @@ define amdgpu_kernel void @v_test_nnan_input_fmed3_r_i_i_f16(ptr addrspace(1) %o
; SI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; SI-GISEL-NEXT: s_mov_b64 s[4:5], s[2:3]
; SI-GISEL-NEXT: buffer_load_ushort v2, v[0:1], s[4:7], 0 addr64
-; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v3, 1.0
-; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v4, 2.0
; SI-GISEL-NEXT: s_waitcnt vmcnt(0)
; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v2, v2
-; SI-GISEL-NEXT: v_add_f32_e32 v2, v2, v3
+; SI-GISEL-NEXT: v_add_f32_e32 v2, 1.0, v2
; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v2, v2
; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v2, v2
-; SI-GISEL-NEXT: v_max_f32_e32 v2, v2, v4
+; SI-GISEL-NEXT: v_max_f32_e32 v2, 2.0, v2
; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v2, v2
; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v2, v2
-; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v3, 4.0
-; SI-GISEL-NEXT: v_min_f32_e32 v2, v2, v3
+; SI-GISEL-NEXT: v_min_f32_e32 v2, 4.0, v2
; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v2, v2
; SI-GISEL-NEXT: s_mov_b64 s[2:3], s[6:7]
; SI-GISEL-NEXT: buffer_store_short v2, v[0:1], s[0:3], 0 addr64
@@ -7639,27 +7636,24 @@ define amdgpu_kernel void @v_nnan_inputs_med3_f16_pat0(ptr addrspace(1) %out, pt
; SI-GISEL-NEXT: v_mov_b32_e32 v1, 0
; SI-GISEL-NEXT: s_mov_b32 s10, 0
; SI-GISEL-NEXT: s_mov_b32 s11, 0xf000
-; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v2, 1.0
-; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v3, 2.0
; SI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; SI-GISEL-NEXT: s_mov_b64 s[8:9], s[2:3]
-; SI-GISEL-NEXT: buffer_load_ushort v4, v[0:1], s[8:11], 0 addr64 glc
+; SI-GISEL-NEXT: buffer_load_ushort v2, v[0:1], s[8:11], 0 addr64 glc
; SI-GISEL-NEXT: s_waitcnt vmcnt(0)
-; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v5, 4.0
; SI-GISEL-NEXT: s_mov_b64 s[8:9], s[4:5]
-; SI-GISEL-NEXT: buffer_load_ushort v6, v[0:1], s[8:11], 0 addr64 glc
+; SI-GISEL-NEXT: buffer_load_ushort v3, v[0:1], s[8:11], 0 addr64 glc
; SI-GISEL-NEXT: s_waitcnt vmcnt(0)
; SI-GISEL-NEXT: s_mov_b64 s[8:9], s[6:7]
-; SI-GISEL-NEXT: buffer_load_ushort v7, v[0:1], s[8:11], 0 addr64 glc
+; SI-GISEL-NEXT: buffer_load_ushort v4, v[0:1], s[8:11], 0 addr64 glc
; SI-GISEL-NEXT: s_waitcnt vmcnt(0)
-; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v4, v4
-; SI-GISEL-NEXT: v_add_f32_e32 v2, v4, v2
-; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v4, v6
+; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v2, v2
+; SI-GISEL-NEXT: v_add_f32_e32 v2, 1.0, v2
+; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v3, v3
; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v2, v2
-; SI-GISEL-NEXT: v_add_f32_e32 v3, v4, v3
-; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v4, v7
+; SI-GISEL-NEXT: v_add_f32_e32 v3, 2.0, v3
+; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v4, v4
; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v3, v3
-; SI-GISEL-NEXT: v_add_f32_e32 v4, v4, v5
+; SI-GISEL-NEXT: v_add_f32_e32 v4, 4.0, v4
; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v2, v2
; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v4, v4
; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v3, v3
@@ -8712,12 +8706,10 @@ define half @v_test_fmed3_r_i_i_f16_minimumnum_maximumnum(half %a) #1 {
; SI-GISEL: ; %bb.0:
; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0
-; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v1, 2.0
-; SI-GISEL-NEXT: v_max_f32_e32 v0, v0, v1
+; SI-GISEL-NEXT: v_max_f32_e32 v0, 2.0, v0
; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0
; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0
-; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v1, 4.0
-; SI-GISEL-NEXT: v_min_f32_e32 v0, v0, v1
+; SI-GISEL-NEXT: v_min_f32_e32 v0, 4.0, v0
; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0
; SI-GISEL-NEXT: s_setpc_b64 s[30:31]
;
@@ -8796,17 +8788,15 @@ define <2 x half> @v_test_fmed3_r_i_i_v2f16_minimumnum_maximumnum(<2 x half> %a)
; SI-GISEL: ; %bb.0:
; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0
-; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v2, 2.0
; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v1
-; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v3, 4.0
-; SI-GISEL-NEXT: v_max_f32_e32 v0, v0, v2
-; SI-GISEL-NEXT: v_max_f32_e32 v1, v1, v2
+; SI-GISEL-NEXT: v_max_f32_e32 v0, 2.0, v0
+; SI-GISEL-NEXT: v_max_f32_e32 v1, 2.0, v1
; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0
; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v1, v1
; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0
; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v1
-; SI-GISEL-NEXT: v_min_f32_e32 v0, v0, v3
-; SI-GISEL-NEXT: v_min_f32_e32 v1, v1, v3
+; SI-GISEL-NEXT: v_min_f32_e32 v0, 4.0, v0
+; SI-GISEL-NEXT: v_min_f32_e32 v1, 4.0, v1
; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0
; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v1, v1
; SI-GISEL-NEXT: s_setpc_b64 s[30:31]
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.exp.ll b/llvm/test/CodeGen/AMDGPU/llvm.exp.ll
index af79c911f29f9..ac356fad5b2da 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.exp.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.exp.ll
@@ -6011,8 +6011,7 @@ define half @v_exp_f16_fast(half %in) {
; SI-GISEL: ; %bb.0:
; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0
-; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v1, 0x3dc5
-; SI-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1
+; SI-GISEL-NEXT: v_mul_f32_e32 v0, 0x3fb8a000, v0
; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0
; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0
; SI-GISEL-NEXT: v_exp_f32_e32 v0, v0
@@ -6512,10 +6511,9 @@ define <2 x half> @v_exp_v2f16_fast(<2 x half> %in) {
; SI-GISEL: ; %bb.0:
; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0
-; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v2, 0x3dc5
; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v1
-; SI-GISEL-NEXT: v_mul_f32_e32 v0, v0, v2
-; SI-GISEL-NEXT: v_mul_f32_e32 v1, v1, v2
+; SI-GISEL-NEXT: v_mul_f32_e32 v0, 0x3fb8a000, v0
+; SI-GISEL-NEXT: v_mul_f32_e32 v1, 0x3fb8a000, v1
; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0
; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v1, v1
; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0
@@ -6709,12 +6707,11 @@ define <3 x half> @v_exp_v3f16_afn(<3 x half> %in) {
; SI-GISEL: ; %bb.0:
; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0
-; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v3, 0x3dc5
; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v1
; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v2, v2
-; SI-GISEL-NEXT: v_mul_f32_e32 v0, v0, v3
-; SI-GISEL-NEXT: v_mul_f32_e32 v1, v1, v3
-; SI-GISEL-NEXT: v_mul_f32_e32 v2, v2, v3
+; SI-GISEL-NEXT: v_mul_f32_e32 v0, 0x3fb8a000, v0
+; SI-GISEL-NEXT: v_mul_f32_e32 v1, 0x3fb8a000, v1
+; SI-GISEL-NEXT: v_mul_f32_e32 v2, 0x3fb8a000, v2
; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0
; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v1, v1
; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v2, v2
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.exp10.ll b/llvm/test/CodeGen/AMDGPU/llvm.exp10.ll
index a99c1991a7909..d12ebe49814d8 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.exp10.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.exp10.ll
@@ -6092,8 +6092,7 @@ define half @v_exp10_f16_fast(half %in) {
; SI-GISEL: ; %bb.0:
; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0
-; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v1, 0x3dc5
-; SI-GISEL-NEXT: v_mul_f32_e32 v0, v0, v1
+; SI-GISEL-NEXT: v_mul_f32_e32 v0, 0x3fb8a000, v0
; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0
; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0
; SI-GISEL-NEXT: v_exp_f32_e32 v0, v0
@@ -6594,10 +6593,9 @@ define <2 x half> @v_exp10_v2f16_fast(<2 x half> %in) {
; SI-GISEL: ; %bb.0:
; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0
-; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v2, 0x3dc5
; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v1
-; SI-GISEL-NEXT: v_mul_f32_e32 v0, v0, v2
-; SI-GISEL-NEXT: v_mul_f32_e32 v1, v1, v2
+; SI-GISEL-NEXT: v_mul_f32_e32 v0, 0x3fb8a000, v0
+; SI-GISEL-NEXT: v_mul_f32_e32 v1, 0x3fb8a000, v1
; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0
; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v1, v1
; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0
@@ -6791,12 +6789,11 @@ define <3 x half> @v_exp10_v3f16_afn(<3 x half> %in) {
; SI-GISEL: ; %bb.0:
; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0
-; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v3, 0x3dc5
; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v1
; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v2, v2
-; SI-GISEL-NEXT: v_mul_f32_e32 v0, v0, v3
-; SI-GISEL-NEXT: v_mul_f32_e32 v1, v1, v3
-; SI-GISEL-NEXT: v_mul_f32_e32 v2, v2, v3
+; SI-GISEL-NEXT: v_mul_f32_e32 v0, 0x3fb8a000, v0
+; SI-GISEL-NEXT: v_mul_f32_e32 v1, 0x3fb8a000, v1
+; SI-GISEL-NEXT: v_mul_f32_e32 v2, 0x3fb8a000, v2
; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0
; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v1, v1
; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v2, v2
diff --git a/llvm/test/CodeGen/AMDGPU/mad-mix-hi.ll b/llvm/test/CodeGen/AMDGPU/mad-mix-hi.ll
index 3f66c23e1a73b..259ee0b26d2d8 100644
--- a/llvm/test/CodeGen/AMDGPU/mad-mix-hi.ll
+++ b/llvm/test/CodeGen/AMDGPU/mad-mix-hi.ll
@@ -488,13 +488,11 @@ define <2 x half> @v_mad_mixhi_f16_f16lo_f16lo_f16lo_undeflo_clamp_postcvt(half
; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v2, v2
; GISEL-CI-NEXT: v_mac_f32_e32 v2, v0, v1
; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v0, v2
-; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v1, 0
; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GISEL-CI-NEXT: v_max_f32_e32 v0, v0, v1
+; GISEL-CI-NEXT: v_max_f32_e32 v0, 0, v0
; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v1, 1.0
; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GISEL-CI-NEXT: v_min_f32_e32 v0, v0, v1
+; GISEL-CI-NEXT: v_min_f32_e32 v0, 1.0, v0
; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v1, v0
; GISEL-CI-NEXT: s_setpc_b64 s[30:31]
%src0.ext = fpext half %src0 to float
@@ -582,15 +580,13 @@ define <2 x half> @v_mad_mixhi_f16_f16lo_f16lo_f16lo_undeflo_clamp_postcvt_multi
; GISEL-CI-NEXT: s_mov_b32 s7, 0xf000
; GISEL-CI-NEXT: v_mac_f32_e32 v2, v0, v1
; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v0, v2
-; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v1, 0
-; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v2, v0
+; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v1, v0
; GISEL-CI-NEXT: buffer_store_short v0, off, s[4:7], 0
; GISEL-CI-NEXT: s_waitcnt vmcnt(0)
-; GISEL-CI-NEXT: v_max_f32_e32 v1, v2, v1
+; GISEL-CI-NEXT: v_max_f32_e32 v1, 0, v1
; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v1, v1
-; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v2, 1.0
; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v1, v1
-; GISEL-CI-NEXT: v_min_f32_e32 v1, v1, v2
+; GISEL-CI-NEXT: v_min_f32_e32 v1, 1.0, v1
; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v1, v1
; GISEL-CI-NEXT: s_setpc_b64 s[30:31]
%src0.ext = fpext half %src0 to float
diff --git a/llvm/test/CodeGen/AMDGPU/mad-mix-lo.ll b/llvm/test/CodeGen/AMDGPU/mad-mix-lo.ll
index 21e6faf46f58d..ba77552e5809b 100644
--- a/llvm/test/CodeGen/AMDGPU/mad-mix-lo.ll
+++ b/llvm/test/CodeGen/AMDGPU/mad-mix-lo.ll
@@ -313,13 +313,11 @@ define half @v_mad_mixlo_f16_f16lo_f16lo_f32_clamp_post_cvt(half %src0, half %sr
; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v1, v1
; GISEL-CI-NEXT: v_mac_f32_e32 v2, v0, v1
; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v0, v2
-; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v1, 0
; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GISEL-CI-NEXT: v_max_f32_e32 v0, v0, v1
+; GISEL-CI-NEXT: v_max_f32_e32 v0, 0, v0
; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v1, 1.0
; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GISEL-CI-NEXT: v_min_f32_e32 v0, v0, v1
+; GISEL-CI-NEXT: v_min_f32_e32 v0, 1.0, v0
; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v0, v0
; GISEL-CI-NEXT: s_setpc_b64 s[30:31]
%src0.ext = fpext half %src0 to float
@@ -1009,28 +1007,26 @@ define <2 x half> @v_mad_mix_v2f32_clamp_postcvt(<2 x half> %src0, <2 x half> %s
; GISEL-CI: ; %bb.0:
; GISEL-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v1, v1
; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v2, v2
; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v4, v4
-; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v1, v1
; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v3, v3
; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v5, v5
; GISEL-CI-NEXT: v_mac_f32_e32 v4, v0, v2
; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v0, v4
-; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v2, 0
; GISEL-CI-NEXT: v_mac_f32_e32 v5, v1, v3
; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v1, v5
; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v0, v0
; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v1, v1
-; GISEL-CI-NEXT: v_max_f32_e32 v0, v0, v2
+; GISEL-CI-NEXT: v_max_f32_e32 v0, 0, v0
; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GISEL-CI-NEXT: v_max_f32_e32 v1, v1, v2
+; GISEL-CI-NEXT: v_max_f32_e32 v1, 0, v1
; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v1, v1
; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v2, 1.0
; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v1, v1
-; GISEL-CI-NEXT: v_min_f32_e32 v0, v0, v2
+; GISEL-CI-NEXT: v_min_f32_e32 v0, 1.0, v0
; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GISEL-CI-NEXT: v_min_f32_e32 v1, v1, v2
+; GISEL-CI-NEXT: v_min_f32_e32 v1, 1.0, v1
; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v1, v1
; GISEL-CI-NEXT: s_setpc_b64 s[30:31]
%src0.ext = fpext <2 x half> %src0 to <2 x float>
@@ -1225,25 +1221,23 @@ define <3 x half> @v_mad_mix_v3f32_clamp_postcvt(<3 x half> %src0, <3 x half> %s
; GISEL-CI-NEXT: v_mac_f32_e32 v7, v1, v4
; GISEL-CI-NEXT: v_mac_f32_e32 v8, v2, v5
; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v1, v7
-; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v3, v8
+; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v2, v8
; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v2, 0
; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v1, v1
-; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v3, v3
-; GISEL-CI-NEXT: v_max_f32_e32 v0, v0, v2
-; GISEL-CI-NEXT: v_max_f32_e32 v1, v1, v2
-; GISEL-CI-NEXT: v_max_f32_e32 v2, v3, v2
+; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v2, v2
+; GISEL-CI-NEXT: v_max_f32_e32 v0, 0, v0
; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GISEL-CI-NEXT: v_max_f32_e32 v1, 0, v1
+; GISEL-CI-NEXT: v_max_f32_e32 v2, 0, v2
; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v1, v1
; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v2, v2
-; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v3, 1.0
; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v0, v0
; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v1, v1
; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v2, v2
-; GISEL-CI-NEXT: v_min_f32_e32 v0, v0, v3
-; GISEL-CI-NEXT: v_min_f32_e32 v1, v1, v3
-; GISEL-CI-NEXT: v_min_f32_e32 v2, v2, v3
+; GISEL-CI-NEXT: v_min_f32_e32 v0, 1.0, v0
; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GISEL-CI-NEXT: v_min_f32_e32 v1, 1.0, v1
+; GISEL-CI-NEXT: v_min_f32_e32 v2, 1.0, v2
; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v1, v1
; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v2, v2
; GISEL-CI-NEXT: s_setpc_b64 s[30:31]
@@ -1441,30 +1435,28 @@ define <4 x half> @v_mad_mix_v4f32_clamp_postcvt(<4 x half> %src0, <4 x half> %s
; GISEL-CI-NEXT: v_mac_f32_e32 v11, v3, v7
; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v0, v8
; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v1, v9
-; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v3, v10
-; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v4, v11
+; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v2, v10
+; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v3, v11
; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v2, 0
; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v1, v1
+; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v2, v2
; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v3, v3
-; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v4, v4
-; GISEL-CI-NEXT: v_max_f32_e32 v0, v0, v2
-; GISEL-CI-NEXT: v_max_f32_e32 v1, v1, v2
-; GISEL-CI-NEXT: v_max_f32_e32 v3, v3, v2
-; GISEL-CI-NEXT: v_max_f32_e32 v2, v4, v2
+; GISEL-CI-NEXT: v_max_f32_e32 v0, 0, v0
+; GISEL-CI-NEXT: v_max_f32_e32 v1, 0, v1
+; GISEL-CI-NEXT: v_max_f32_e32 v2, 0, v2
+; GISEL-CI-NEXT: v_max_f32_e32 v3, 0, v3
; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v0, v0
; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v1, v1
-; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v3, v3
; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v3, v3
; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v5, 1.0
; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v1, v1
+; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v2, v2
; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v3, v3
-; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v4, v2
-; GISEL-CI-NEXT: v_min_f32_e32 v0, v0, v5
-; GISEL-CI-NEXT: v_min_f32_e32 v1, v1, v5
-; GISEL-CI-NEXT: v_min_f32_e32 v2, v3, v5
-; GISEL-CI-NEXT: v_min_f32_e32 v3, v4, v5
+; GISEL-CI-NEXT: v_min_f32_e32 v0, 1.0, v0
+; GISEL-CI-NEXT: v_min_f32_e32 v1, 1.0, v1
+; GISEL-CI-NEXT: v_min_f32_e32 v2, 1.0, v2
+; GISEL-CI-NEXT: v_min_f32_e32 v3, 1.0, v3
; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v0, v0
; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v1, v1
; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v2, v2
@@ -1622,16 +1614,14 @@ define <2 x half> @v_mad_mix_v2f32_clamp_postcvt_lo(<2 x half> %src0, <2 x half>
; GISEL-CI-NEXT: v_mac_f32_e32 v4, v0, v2
; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v0, v5
; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v1, v4
-; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v2, 0
; GISEL-CI-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GISEL-CI-NEXT: v_or_b32_e32 v0, v1, v0
; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v1, v0
; GISEL-CI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GISEL-CI-NEXT: v_max_f32_e32 v1, v1, v2
+; GISEL-CI-NEXT: v_max_f32_e32 v1, 0, v1
; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v1, v1
-; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v2, 1.0
; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v1, v1
-; GISEL-CI-NEXT: v_min_f32_e32 v1, v1, v2
+; GISEL-CI-NEXT: v_min_f32_e32 v1, 1.0, v1
; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v1, v1
; GISEL-CI-NEXT: v_or_b32_e32 v0, v0, v1
; GISEL-CI-NEXT: v_lshrrev_b32_e32 v1, 16, v0
@@ -1790,17 +1780,15 @@ define <2 x half> @v_mad_mix_v2f32_clamp_postcvt_hi(<2 x half> %src0, <2 x half>
; GISEL-CI-NEXT: v_mac_f32_e32 v4, v0, v2
; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v0, v5
; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v1, v4
-; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v2, 0
; GISEL-CI-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GISEL-CI-NEXT: v_or_b32_e32 v0, v1, v0
; GISEL-CI-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v1, v1
; GISEL-CI-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GISEL-CI-NEXT: v_max_f32_e32 v1, v1, v2
+; GISEL-CI-NEXT: v_max_f32_e32 v1, 0, v1
; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v1, v1
-; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v2, 1.0
; GISEL-CI-NEXT: v_cvt_f32_f16_e32 v1, v1
-; GISEL-CI-NEXT: v_min_f32_e32 v1, v1, v2
+; GISEL-CI-NEXT: v_min_f32_e32 v1, 1.0, v1
; GISEL-CI-NEXT: v_cvt_f16_f32_e32 v1, v1
; GISEL-CI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GISEL-CI-NEXT: v_or_b32_e32 v0, v0, v1
diff --git a/llvm/test/CodeGen/AMDGPU/maximumnum.ll b/llvm/test/CodeGen/AMDGPU/maximumnum.ll
index 4f73e8e9c1883..c90b2c9170414 100644
--- a/llvm/test/CodeGen/AMDGPU/maximumnum.ll
+++ b/llvm/test/CodeGen/AMDGPU/maximumnum.ll
@@ -271,8 +271,7 @@ define half @v_maximumnum_f16_1.0(half %x) {
; GFX7-GISEL: ; %bb.0:
; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v1, 1.0
-; GFX7-GISEL-NEXT: v_max_f32_e32 v0, v0, v1
+; GFX7-GISEL-NEXT: v_max_f32_e32 v0, 1.0, v0
; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX7-GISEL-NEXT: s_setpc_b64 s[30:31]
;
diff --git a/llvm/test/CodeGen/AMDGPU/minimumnum.ll b/llvm/test/CodeGen/AMDGPU/minimumnum.ll
index 558006d2b6957..64e8b7b50de08 100644
--- a/llvm/test/CodeGen/AMDGPU/minimumnum.ll
+++ b/llvm/test/CodeGen/AMDGPU/minimumnum.ll
@@ -271,8 +271,7 @@ define half @v_minimumnum_f16_1.0(half %x) {
; GFX7-GISEL: ; %bb.0:
; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v1, 1.0
-; GFX7-GISEL-NEXT: v_min_f32_e32 v0, v0, v1
+; GFX7-GISEL-NEXT: v_min_f32_e32 v0, 1.0, v0
; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX7-GISEL-NEXT: s_setpc_b64 s[30:31]
;
>From 24c35a94ff1782fd0c4390279795e1cd62b6cfc2 Mon Sep 17 00:00:00 2001
From: Ryan Cowan <ryan.cowan at arm.com>
Date: Mon, 29 Sep 2025 13:23:38 +0000
Subject: [PATCH 2/2] Undo legalizer changes and keep commit to fold
---
.../AArch64/GISel/AArch64LegalizerInfo.cpp | 4 +-
.../AArch64/GlobalISel/legalize-constant.mir | 5 +-
.../GlobalISel/legalize-fp16-fconstant.mir | 6 +-
.../CodeGen/AArch64/arm64-indexed-memory.ll | 7 +-
llvm/test/CodeGen/AArch64/dup.ll | 30 +-
llvm/test/CodeGen/AArch64/f16-instructions.ll | 21 +-
llvm/test/CodeGen/AArch64/fcvt-fixed.ll | 561 ++++++++++++------
.../CodeGen/AArch64/vecreduce-fadd-strict.ll | 52 +-
llvm/test/CodeGen/AArch64/vecreduce-fadd.ll | 42 +-
.../CodeGen/AArch64/vecreduce-fmul-strict.ll | 30 +-
10 files changed, 507 insertions(+), 251 deletions(-)
diff --git a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
index 5613364626692..ea2196a584127 100644
--- a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
+++ b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
@@ -678,8 +678,8 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST)
.widenScalarToNextPow2(0)
.clampScalar(0, s8, s64);
getActionDefinitionsBuilder(G_FCONSTANT)
- // Always legalize S16 to prevent G_FCONSTANT being widened to G_CONSTANT
- .legalFor({s16, s32, s64, s128})
+ .legalFor({s32, s64, s128})
+ .legalFor(HasFP16, {s16})
.clampScalar(0, MinFPScalar, s128);
// FIXME: fix moreElementsToNextPow2
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/legalize-constant.mir b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-constant.mir
index c00ce2242a888..c301e76852b54 100644
--- a/llvm/test/CodeGen/AArch64/GlobalISel/legalize-constant.mir
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-constant.mir
@@ -48,9 +48,8 @@ body: |
; CHECK-NEXT: $w0 = COPY [[C]](s32)
; CHECK-NEXT: [[C1:%[0-9]+]]:_(s64) = G_FCONSTANT double 2.000000e+00
; CHECK-NEXT: $x0 = COPY [[C1]](s64)
- ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s16) = G_FCONSTANT half 0xH0000
- ; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[C2]](s16)
- ; CHECK-NEXT: $w0 = COPY [[ANYEXT]](s32)
+ ; CHECK-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; CHECK-NEXT: $w0 = COPY [[C2]](s32)
%0:_(s32) = G_FCONSTANT float 1.0
$w0 = COPY %0
%1:_(s64) = G_FCONSTANT double 2.0
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/legalize-fp16-fconstant.mir b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-fp16-fconstant.mir
index c6df3456a8445..ddf219dc4927e 100644
--- a/llvm/test/CodeGen/AArch64/GlobalISel/legalize-fp16-fconstant.mir
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-fp16-fconstant.mir
@@ -8,7 +8,7 @@ tracksRegLiveness: true
body: |
bb.0:
; NO-FP16-LABEL: name: fp16
- ; NO-FP16: %cst:_(s16) = G_FCONSTANT half 0xH0000
+ ; NO-FP16: %cst:_(s16) = G_CONSTANT i16 0
; NO-FP16-NEXT: $h0 = COPY %cst(s16)
; NO-FP16-NEXT: RET_ReallyLR implicit $h0
;
@@ -26,7 +26,7 @@ tracksRegLiveness: true
body: |
bb.0:
; NO-FP16-LABEL: name: fp16_non_zero
- ; NO-FP16: %cst:_(s16) = G_FCONSTANT half 0xH4000
+ ; NO-FP16: %cst:_(s16) = G_CONSTANT i16 16384
; NO-FP16-NEXT: $h0 = COPY %cst(s16)
; NO-FP16-NEXT: RET_ReallyLR implicit $h0
;
@@ -44,7 +44,7 @@ tracksRegLiveness: true
body: |
bb.1.entry:
; NO-FP16-LABEL: name: nan
- ; NO-FP16: %cst:_(s16) = G_FCONSTANT half 0xH7C01
+ ; NO-FP16: %cst:_(s16) = G_CONSTANT i16 31745
; NO-FP16-NEXT: %ext:_(s32) = G_FPEXT %cst(s16)
; NO-FP16-NEXT: $w0 = COPY %ext(s32)
; NO-FP16-NEXT: RET_ReallyLR implicit $w0
diff --git a/llvm/test/CodeGen/AArch64/arm64-indexed-memory.ll b/llvm/test/CodeGen/AArch64/arm64-indexed-memory.ll
index e8e563135acc5..cb5df07c7ede4 100644
--- a/llvm/test/CodeGen/AArch64/arm64-indexed-memory.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-indexed-memory.ll
@@ -739,12 +739,15 @@ define ptr @postidx32_shalf(ptr %src, ptr %out, half %a) {
;
; GISEL-LABEL: postidx32_shalf:
; GISEL: ; %bb.0:
+; GISEL-NEXT: mov w8, #0 ; =0x0
; GISEL-NEXT: ldr h1, [x0], #4
+; GISEL-NEXT: fmov s2, w8
; GISEL-NEXT: ; kill: def $h0 killed $h0 def $s0
; GISEL-NEXT: fmov w9, s0
-; GISEL-NEXT: fcvt s2, h1
+; GISEL-NEXT: fcvt s3, h1
; GISEL-NEXT: fmov w8, s1
-; GISEL-NEXT: fcmp s2, #0.0
+; GISEL-NEXT: fcvt s2, h2
+; GISEL-NEXT: fcmp s3, s2
; GISEL-NEXT: csel w8, w8, w9, mi
; GISEL-NEXT: strh w8, [x1]
; GISEL-NEXT: ret
diff --git a/llvm/test/CodeGen/AArch64/dup.ll b/llvm/test/CodeGen/AArch64/dup.ll
index 1c4a6ab2217b0..079ff1076b110 100644
--- a/llvm/test/CodeGen/AArch64/dup.ll
+++ b/llvm/test/CodeGen/AArch64/dup.ll
@@ -1469,9 +1469,8 @@ define <2 x half> @loaddup_str_v2half(ptr %p) {
; CHECK-GI-LABEL: loaddup_str_v2half:
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: ldr h0, [x0]
-; CHECK-GI-NEXT: movi d1, #0000000000000000
+; CHECK-GI-NEXT: strh wzr, [x0]
; CHECK-GI-NEXT: dup v0.4h, v0.h[0]
-; CHECK-GI-NEXT: str h1, [x0]
; CHECK-GI-NEXT: ret
entry:
%a = load half, ptr %p
@@ -1527,9 +1526,8 @@ define <3 x half> @loaddup_str_v3half(ptr %p) {
; CHECK-GI-LABEL: loaddup_str_v3half:
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: ldr h0, [x0]
-; CHECK-GI-NEXT: movi d1, #0000000000000000
+; CHECK-GI-NEXT: strh wzr, [x0]
; CHECK-GI-NEXT: dup v0.4h, v0.h[0]
-; CHECK-GI-NEXT: str h1, [x0]
; CHECK-GI-NEXT: ret
entry:
%a = load half, ptr %p
@@ -1585,9 +1583,8 @@ define <4 x half> @loaddup_str_v4half(ptr %p) {
; CHECK-GI-LABEL: loaddup_str_v4half:
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: ldr h0, [x0]
-; CHECK-GI-NEXT: movi d1, #0000000000000000
+; CHECK-GI-NEXT: strh wzr, [x0]
; CHECK-GI-NEXT: dup v0.4h, v0.h[0]
-; CHECK-GI-NEXT: str h1, [x0]
; CHECK-GI-NEXT: ret
entry:
%a = load half, ptr %p
@@ -1642,9 +1639,8 @@ define <8 x half> @loaddup_str_v8half(ptr %p) {
; CHECK-GI-LABEL: loaddup_str_v8half:
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: ldr h0, [x0]
-; CHECK-GI-NEXT: movi d1, #0000000000000000
+; CHECK-GI-NEXT: strh wzr, [x0]
; CHECK-GI-NEXT: dup v0.8h, v0.h[0]
-; CHECK-GI-NEXT: str h1, [x0]
; CHECK-GI-NEXT: ret
entry:
%a = load half, ptr %p
@@ -1717,10 +1713,9 @@ define <16 x half> @loaddup_str_v16half(ptr %p) {
; CHECK-GI-LABEL: loaddup_str_v16half:
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: ldr h1, [x0]
-; CHECK-GI-NEXT: movi d2, #0000000000000000
+; CHECK-GI-NEXT: strh wzr, [x0]
; CHECK-GI-NEXT: dup v0.8h, v1.h[0]
; CHECK-GI-NEXT: dup v1.8h, v1.h[0]
-; CHECK-GI-NEXT: str h2, [x0]
; CHECK-GI-NEXT: ret
entry:
%a = load half, ptr %p
@@ -1776,9 +1771,8 @@ define <2 x bfloat> @loaddup_str_v2bfloat(ptr %p) {
; CHECK-GI-LABEL: loaddup_str_v2bfloat:
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: ldr h0, [x0]
-; CHECK-GI-NEXT: movi d1, #0000000000000000
+; CHECK-GI-NEXT: strh wzr, [x0]
; CHECK-GI-NEXT: dup v0.4h, v0.h[0]
-; CHECK-GI-NEXT: str h1, [x0]
; CHECK-GI-NEXT: ret
entry:
%a = load bfloat, ptr %p
@@ -1834,9 +1828,8 @@ define <3 x bfloat> @loaddup_str_v3bfloat(ptr %p) {
; CHECK-GI-LABEL: loaddup_str_v3bfloat:
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: ldr h0, [x0]
-; CHECK-GI-NEXT: movi d1, #0000000000000000
+; CHECK-GI-NEXT: strh wzr, [x0]
; CHECK-GI-NEXT: dup v0.4h, v0.h[0]
-; CHECK-GI-NEXT: str h1, [x0]
; CHECK-GI-NEXT: ret
entry:
%a = load bfloat, ptr %p
@@ -1892,9 +1885,8 @@ define <4 x bfloat> @loaddup_str_v4bfloat(ptr %p) {
; CHECK-GI-LABEL: loaddup_str_v4bfloat:
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: ldr h0, [x0]
-; CHECK-GI-NEXT: movi d1, #0000000000000000
+; CHECK-GI-NEXT: strh wzr, [x0]
; CHECK-GI-NEXT: dup v0.4h, v0.h[0]
-; CHECK-GI-NEXT: str h1, [x0]
; CHECK-GI-NEXT: ret
entry:
%a = load bfloat, ptr %p
@@ -1949,9 +1941,8 @@ define <8 x bfloat> @loaddup_str_v8bfloat(ptr %p) {
; CHECK-GI-LABEL: loaddup_str_v8bfloat:
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: ldr h0, [x0]
-; CHECK-GI-NEXT: movi d1, #0000000000000000
+; CHECK-GI-NEXT: strh wzr, [x0]
; CHECK-GI-NEXT: dup v0.8h, v0.h[0]
-; CHECK-GI-NEXT: str h1, [x0]
; CHECK-GI-NEXT: ret
entry:
%a = load bfloat, ptr %p
@@ -2024,10 +2015,9 @@ define <16 x bfloat> @loaddup_str_v16bfloat(ptr %p) {
; CHECK-GI-LABEL: loaddup_str_v16bfloat:
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: ldr h1, [x0]
-; CHECK-GI-NEXT: movi d2, #0000000000000000
+; CHECK-GI-NEXT: strh wzr, [x0]
; CHECK-GI-NEXT: dup v0.8h, v1.h[0]
; CHECK-GI-NEXT: dup v1.8h, v1.h[0]
-; CHECK-GI-NEXT: str h2, [x0]
; CHECK-GI-NEXT: ret
entry:
%a = load bfloat, ptr %p
diff --git a/llvm/test/CodeGen/AArch64/f16-instructions.ll b/llvm/test/CodeGen/AArch64/f16-instructions.ll
index 085170c7ba381..adc536da26f26 100644
--- a/llvm/test/CodeGen/AArch64/f16-instructions.ll
+++ b/llvm/test/CodeGen/AArch64/f16-instructions.ll
@@ -782,17 +782,18 @@ define void @test_fccmp(half %in, ptr %out) {
;
; CHECK-CVT-GI-LABEL: test_fccmp:
; CHECK-CVT-GI: // %bb.0:
+; CHECK-CVT-GI-NEXT: mov w8, #17664 // =0x4500
+; CHECK-CVT-GI-NEXT: mov w9, #18432 // =0x4800
; CHECK-CVT-GI-NEXT: // kill: def $h0 killed $h0 def $s0
-; CHECK-CVT-GI-NEXT: fcvt s1, h0
-; CHECK-CVT-GI-NEXT: fmov s2, #5.00000000
-; CHECK-CVT-GI-NEXT: adrp x8, .LCPI29_0
-; CHECK-CVT-GI-NEXT: fmov s3, #8.00000000
-; CHECK-CVT-GI-NEXT: fcmp s1, s2
-; CHECK-CVT-GI-NEXT: ldr h2, [x8, :lo12:.LCPI29_0]
-; CHECK-CVT-GI-NEXT: fmov w8, s0
-; CHECK-CVT-GI-NEXT: fmov w9, s2
-; CHECK-CVT-GI-NEXT: fccmp s1, s3, #4, mi
-; CHECK-CVT-GI-NEXT: csel w8, w8, w9, gt
+; CHECK-CVT-GI-NEXT: fcvt s2, h0
+; CHECK-CVT-GI-NEXT: fmov s1, w8
+; CHECK-CVT-GI-NEXT: fmov s3, w9
+; CHECK-CVT-GI-NEXT: fmov w9, s0
+; CHECK-CVT-GI-NEXT: fcvt s1, h1
+; CHECK-CVT-GI-NEXT: fcvt s3, h3
+; CHECK-CVT-GI-NEXT: fcmp s2, s1
+; CHECK-CVT-GI-NEXT: fccmp s2, s3, #4, mi
+; CHECK-CVT-GI-NEXT: csel w8, w9, w8, gt
; CHECK-CVT-GI-NEXT: strh w8, [x0]
; CHECK-CVT-GI-NEXT: ret
;
diff --git a/llvm/test/CodeGen/AArch64/fcvt-fixed.ll b/llvm/test/CodeGen/AArch64/fcvt-fixed.ll
index 743d1604388de..51aad4fe25d3b 100644
--- a/llvm/test/CodeGen/AArch64/fcvt-fixed.ll
+++ b/llvm/test/CodeGen/AArch64/fcvt-fixed.ll
@@ -149,21 +149,33 @@ define i64 @fcvtzs_f64_i64_64(double %dbl) {
}
define i32 @fcvtzs_f16_i32_7(half %flt) {
-; CHECK-NO16-LABEL: fcvtzs_f16_i32_7:
-; CHECK-NO16: // %bb.0:
-; CHECK-NO16-NEXT: movi v1.2s, #67, lsl #24
-; CHECK-NO16-NEXT: fcvt s0, h0
-; CHECK-NO16-NEXT: fmul s0, s0, s1
-; CHECK-NO16-NEXT: fcvt h0, s0
-; CHECK-NO16-NEXT: fcvt s0, h0
-; CHECK-NO16-NEXT: fcvtzs w0, s0
-; CHECK-NO16-NEXT: ret
+; CHECK-SD-NO16-LABEL: fcvtzs_f16_i32_7:
+; CHECK-SD-NO16: // %bb.0:
+; CHECK-SD-NO16-NEXT: movi v1.2s, #67, lsl #24
+; CHECK-SD-NO16-NEXT: fcvt s0, h0
+; CHECK-SD-NO16-NEXT: fmul s0, s0, s1
+; CHECK-SD-NO16-NEXT: fcvt h0, s0
+; CHECK-SD-NO16-NEXT: fcvt s0, h0
+; CHECK-SD-NO16-NEXT: fcvtzs w0, s0
+; CHECK-SD-NO16-NEXT: ret
;
; CHECK-SD-FP16-LABEL: fcvtzs_f16_i32_7:
; CHECK-SD-FP16: // %bb.0:
; CHECK-SD-FP16-NEXT: fcvtzs w0, h0, #7
; CHECK-SD-FP16-NEXT: ret
;
+; CHECK-GI-NO16-LABEL: fcvtzs_f16_i32_7:
+; CHECK-GI-NO16: // %bb.0:
+; CHECK-GI-NO16-NEXT: mov w8, #22528 // =0x5800
+; CHECK-GI-NO16-NEXT: fcvt s0, h0
+; CHECK-GI-NO16-NEXT: fmov s1, w8
+; CHECK-GI-NO16-NEXT: fcvt s1, h1
+; CHECK-GI-NO16-NEXT: fmul s0, s0, s1
+; CHECK-GI-NO16-NEXT: fcvt h0, s0
+; CHECK-GI-NO16-NEXT: fcvt s0, h0
+; CHECK-GI-NO16-NEXT: fcvtzs w0, s0
+; CHECK-GI-NO16-NEXT: ret
+;
; CHECK-GI-FP16-LABEL: fcvtzs_f16_i32_7:
; CHECK-GI-FP16: // %bb.0:
; CHECK-GI-FP16-NEXT: adrp x8, .LCPI8_0
@@ -177,21 +189,33 @@ define i32 @fcvtzs_f16_i32_7(half %flt) {
}
define i32 @fcvtzs_f16_i32_15(half %flt) {
-; CHECK-NO16-LABEL: fcvtzs_f16_i32_15:
-; CHECK-NO16: // %bb.0:
-; CHECK-NO16-NEXT: movi v1.2s, #71, lsl #24
-; CHECK-NO16-NEXT: fcvt s0, h0
-; CHECK-NO16-NEXT: fmul s0, s0, s1
-; CHECK-NO16-NEXT: fcvt h0, s0
-; CHECK-NO16-NEXT: fcvt s0, h0
-; CHECK-NO16-NEXT: fcvtzs w0, s0
-; CHECK-NO16-NEXT: ret
+; CHECK-SD-NO16-LABEL: fcvtzs_f16_i32_15:
+; CHECK-SD-NO16: // %bb.0:
+; CHECK-SD-NO16-NEXT: movi v1.2s, #71, lsl #24
+; CHECK-SD-NO16-NEXT: fcvt s0, h0
+; CHECK-SD-NO16-NEXT: fmul s0, s0, s1
+; CHECK-SD-NO16-NEXT: fcvt h0, s0
+; CHECK-SD-NO16-NEXT: fcvt s0, h0
+; CHECK-SD-NO16-NEXT: fcvtzs w0, s0
+; CHECK-SD-NO16-NEXT: ret
;
; CHECK-SD-FP16-LABEL: fcvtzs_f16_i32_15:
; CHECK-SD-FP16: // %bb.0:
; CHECK-SD-FP16-NEXT: fcvtzs w0, h0, #15
; CHECK-SD-FP16-NEXT: ret
;
+; CHECK-GI-NO16-LABEL: fcvtzs_f16_i32_15:
+; CHECK-GI-NO16: // %bb.0:
+; CHECK-GI-NO16-NEXT: mov w8, #30720 // =0x7800
+; CHECK-GI-NO16-NEXT: fcvt s0, h0
+; CHECK-GI-NO16-NEXT: fmov s1, w8
+; CHECK-GI-NO16-NEXT: fcvt s1, h1
+; CHECK-GI-NO16-NEXT: fmul s0, s0, s1
+; CHECK-GI-NO16-NEXT: fcvt h0, s0
+; CHECK-GI-NO16-NEXT: fcvt s0, h0
+; CHECK-GI-NO16-NEXT: fcvtzs w0, s0
+; CHECK-GI-NO16-NEXT: ret
+;
; CHECK-GI-FP16-LABEL: fcvtzs_f16_i32_15:
; CHECK-GI-FP16: // %bb.0:
; CHECK-GI-FP16-NEXT: adrp x8, .LCPI9_0
@@ -205,21 +229,33 @@ define i32 @fcvtzs_f16_i32_15(half %flt) {
}
define i64 @fcvtzs_f16_i64_7(half %flt) {
-; CHECK-NO16-LABEL: fcvtzs_f16_i64_7:
-; CHECK-NO16: // %bb.0:
-; CHECK-NO16-NEXT: movi v1.2s, #67, lsl #24
-; CHECK-NO16-NEXT: fcvt s0, h0
-; CHECK-NO16-NEXT: fmul s0, s0, s1
-; CHECK-NO16-NEXT: fcvt h0, s0
-; CHECK-NO16-NEXT: fcvt s0, h0
-; CHECK-NO16-NEXT: fcvtzs x0, s0
-; CHECK-NO16-NEXT: ret
+; CHECK-SD-NO16-LABEL: fcvtzs_f16_i64_7:
+; CHECK-SD-NO16: // %bb.0:
+; CHECK-SD-NO16-NEXT: movi v1.2s, #67, lsl #24
+; CHECK-SD-NO16-NEXT: fcvt s0, h0
+; CHECK-SD-NO16-NEXT: fmul s0, s0, s1
+; CHECK-SD-NO16-NEXT: fcvt h0, s0
+; CHECK-SD-NO16-NEXT: fcvt s0, h0
+; CHECK-SD-NO16-NEXT: fcvtzs x0, s0
+; CHECK-SD-NO16-NEXT: ret
;
; CHECK-SD-FP16-LABEL: fcvtzs_f16_i64_7:
; CHECK-SD-FP16: // %bb.0:
; CHECK-SD-FP16-NEXT: fcvtzs x0, h0, #7
; CHECK-SD-FP16-NEXT: ret
;
+; CHECK-GI-NO16-LABEL: fcvtzs_f16_i64_7:
+; CHECK-GI-NO16: // %bb.0:
+; CHECK-GI-NO16-NEXT: mov w8, #22528 // =0x5800
+; CHECK-GI-NO16-NEXT: fcvt s0, h0
+; CHECK-GI-NO16-NEXT: fmov s1, w8
+; CHECK-GI-NO16-NEXT: fcvt s1, h1
+; CHECK-GI-NO16-NEXT: fmul s0, s0, s1
+; CHECK-GI-NO16-NEXT: fcvt h0, s0
+; CHECK-GI-NO16-NEXT: fcvt s0, h0
+; CHECK-GI-NO16-NEXT: fcvtzs x0, s0
+; CHECK-GI-NO16-NEXT: ret
+;
; CHECK-GI-FP16-LABEL: fcvtzs_f16_i64_7:
; CHECK-GI-FP16: // %bb.0:
; CHECK-GI-FP16-NEXT: adrp x8, .LCPI10_0
@@ -233,21 +269,33 @@ define i64 @fcvtzs_f16_i64_7(half %flt) {
}
define i64 @fcvtzs_f16_i64_15(half %flt) {
-; CHECK-NO16-LABEL: fcvtzs_f16_i64_15:
-; CHECK-NO16: // %bb.0:
-; CHECK-NO16-NEXT: movi v1.2s, #71, lsl #24
-; CHECK-NO16-NEXT: fcvt s0, h0
-; CHECK-NO16-NEXT: fmul s0, s0, s1
-; CHECK-NO16-NEXT: fcvt h0, s0
-; CHECK-NO16-NEXT: fcvt s0, h0
-; CHECK-NO16-NEXT: fcvtzs x0, s0
-; CHECK-NO16-NEXT: ret
+; CHECK-SD-NO16-LABEL: fcvtzs_f16_i64_15:
+; CHECK-SD-NO16: // %bb.0:
+; CHECK-SD-NO16-NEXT: movi v1.2s, #71, lsl #24
+; CHECK-SD-NO16-NEXT: fcvt s0, h0
+; CHECK-SD-NO16-NEXT: fmul s0, s0, s1
+; CHECK-SD-NO16-NEXT: fcvt h0, s0
+; CHECK-SD-NO16-NEXT: fcvt s0, h0
+; CHECK-SD-NO16-NEXT: fcvtzs x0, s0
+; CHECK-SD-NO16-NEXT: ret
;
; CHECK-SD-FP16-LABEL: fcvtzs_f16_i64_15:
; CHECK-SD-FP16: // %bb.0:
; CHECK-SD-FP16-NEXT: fcvtzs x0, h0, #15
; CHECK-SD-FP16-NEXT: ret
;
+; CHECK-GI-NO16-LABEL: fcvtzs_f16_i64_15:
+; CHECK-GI-NO16: // %bb.0:
+; CHECK-GI-NO16-NEXT: mov w8, #30720 // =0x7800
+; CHECK-GI-NO16-NEXT: fcvt s0, h0
+; CHECK-GI-NO16-NEXT: fmov s1, w8
+; CHECK-GI-NO16-NEXT: fcvt s1, h1
+; CHECK-GI-NO16-NEXT: fmul s0, s0, s1
+; CHECK-GI-NO16-NEXT: fcvt h0, s0
+; CHECK-GI-NO16-NEXT: fcvt s0, h0
+; CHECK-GI-NO16-NEXT: fcvtzs x0, s0
+; CHECK-GI-NO16-NEXT: ret
+;
; CHECK-GI-FP16-LABEL: fcvtzs_f16_i64_15:
; CHECK-GI-FP16: // %bb.0:
; CHECK-GI-FP16-NEXT: adrp x8, .LCPI11_0
@@ -405,21 +453,33 @@ define i64 @fcvtzu_f64_i64_64(double %dbl) {
}
define i32 @fcvtzu_f16_i32_7(half %flt) {
-; CHECK-NO16-LABEL: fcvtzu_f16_i32_7:
-; CHECK-NO16: // %bb.0:
-; CHECK-NO16-NEXT: movi v1.2s, #67, lsl #24
-; CHECK-NO16-NEXT: fcvt s0, h0
-; CHECK-NO16-NEXT: fmul s0, s0, s1
-; CHECK-NO16-NEXT: fcvt h0, s0
-; CHECK-NO16-NEXT: fcvt s0, h0
-; CHECK-NO16-NEXT: fcvtzu w0, s0
-; CHECK-NO16-NEXT: ret
+; CHECK-SD-NO16-LABEL: fcvtzu_f16_i32_7:
+; CHECK-SD-NO16: // %bb.0:
+; CHECK-SD-NO16-NEXT: movi v1.2s, #67, lsl #24
+; CHECK-SD-NO16-NEXT: fcvt s0, h0
+; CHECK-SD-NO16-NEXT: fmul s0, s0, s1
+; CHECK-SD-NO16-NEXT: fcvt h0, s0
+; CHECK-SD-NO16-NEXT: fcvt s0, h0
+; CHECK-SD-NO16-NEXT: fcvtzu w0, s0
+; CHECK-SD-NO16-NEXT: ret
;
; CHECK-SD-FP16-LABEL: fcvtzu_f16_i32_7:
; CHECK-SD-FP16: // %bb.0:
; CHECK-SD-FP16-NEXT: fcvtzu w0, h0, #7
; CHECK-SD-FP16-NEXT: ret
;
+; CHECK-GI-NO16-LABEL: fcvtzu_f16_i32_7:
+; CHECK-GI-NO16: // %bb.0:
+; CHECK-GI-NO16-NEXT: mov w8, #22528 // =0x5800
+; CHECK-GI-NO16-NEXT: fcvt s0, h0
+; CHECK-GI-NO16-NEXT: fmov s1, w8
+; CHECK-GI-NO16-NEXT: fcvt s1, h1
+; CHECK-GI-NO16-NEXT: fmul s0, s0, s1
+; CHECK-GI-NO16-NEXT: fcvt h0, s0
+; CHECK-GI-NO16-NEXT: fcvt s0, h0
+; CHECK-GI-NO16-NEXT: fcvtzu w0, s0
+; CHECK-GI-NO16-NEXT: ret
+;
; CHECK-GI-FP16-LABEL: fcvtzu_f16_i32_7:
; CHECK-GI-FP16: // %bb.0:
; CHECK-GI-FP16-NEXT: adrp x8, .LCPI20_0
@@ -433,21 +493,33 @@ define i32 @fcvtzu_f16_i32_7(half %flt) {
}
define i32 @fcvtzu_f16_i32_15(half %flt) {
-; CHECK-NO16-LABEL: fcvtzu_f16_i32_15:
-; CHECK-NO16: // %bb.0:
-; CHECK-NO16-NEXT: movi v1.2s, #71, lsl #24
-; CHECK-NO16-NEXT: fcvt s0, h0
-; CHECK-NO16-NEXT: fmul s0, s0, s1
-; CHECK-NO16-NEXT: fcvt h0, s0
-; CHECK-NO16-NEXT: fcvt s0, h0
-; CHECK-NO16-NEXT: fcvtzu w0, s0
-; CHECK-NO16-NEXT: ret
+; CHECK-SD-NO16-LABEL: fcvtzu_f16_i32_15:
+; CHECK-SD-NO16: // %bb.0:
+; CHECK-SD-NO16-NEXT: movi v1.2s, #71, lsl #24
+; CHECK-SD-NO16-NEXT: fcvt s0, h0
+; CHECK-SD-NO16-NEXT: fmul s0, s0, s1
+; CHECK-SD-NO16-NEXT: fcvt h0, s0
+; CHECK-SD-NO16-NEXT: fcvt s0, h0
+; CHECK-SD-NO16-NEXT: fcvtzu w0, s0
+; CHECK-SD-NO16-NEXT: ret
;
; CHECK-SD-FP16-LABEL: fcvtzu_f16_i32_15:
; CHECK-SD-FP16: // %bb.0:
; CHECK-SD-FP16-NEXT: fcvtzu w0, h0, #15
; CHECK-SD-FP16-NEXT: ret
;
+; CHECK-GI-NO16-LABEL: fcvtzu_f16_i32_15:
+; CHECK-GI-NO16: // %bb.0:
+; CHECK-GI-NO16-NEXT: mov w8, #30720 // =0x7800
+; CHECK-GI-NO16-NEXT: fcvt s0, h0
+; CHECK-GI-NO16-NEXT: fmov s1, w8
+; CHECK-GI-NO16-NEXT: fcvt s1, h1
+; CHECK-GI-NO16-NEXT: fmul s0, s0, s1
+; CHECK-GI-NO16-NEXT: fcvt h0, s0
+; CHECK-GI-NO16-NEXT: fcvt s0, h0
+; CHECK-GI-NO16-NEXT: fcvtzu w0, s0
+; CHECK-GI-NO16-NEXT: ret
+;
; CHECK-GI-FP16-LABEL: fcvtzu_f16_i32_15:
; CHECK-GI-FP16: // %bb.0:
; CHECK-GI-FP16-NEXT: adrp x8, .LCPI21_0
@@ -461,21 +533,33 @@ define i32 @fcvtzu_f16_i32_15(half %flt) {
}
define i64 @fcvtzu_f16_i64_7(half %flt) {
-; CHECK-NO16-LABEL: fcvtzu_f16_i64_7:
-; CHECK-NO16: // %bb.0:
-; CHECK-NO16-NEXT: movi v1.2s, #67, lsl #24
-; CHECK-NO16-NEXT: fcvt s0, h0
-; CHECK-NO16-NEXT: fmul s0, s0, s1
-; CHECK-NO16-NEXT: fcvt h0, s0
-; CHECK-NO16-NEXT: fcvt s0, h0
-; CHECK-NO16-NEXT: fcvtzu x0, s0
-; CHECK-NO16-NEXT: ret
+; CHECK-SD-NO16-LABEL: fcvtzu_f16_i64_7:
+; CHECK-SD-NO16: // %bb.0:
+; CHECK-SD-NO16-NEXT: movi v1.2s, #67, lsl #24
+; CHECK-SD-NO16-NEXT: fcvt s0, h0
+; CHECK-SD-NO16-NEXT: fmul s0, s0, s1
+; CHECK-SD-NO16-NEXT: fcvt h0, s0
+; CHECK-SD-NO16-NEXT: fcvt s0, h0
+; CHECK-SD-NO16-NEXT: fcvtzu x0, s0
+; CHECK-SD-NO16-NEXT: ret
;
; CHECK-SD-FP16-LABEL: fcvtzu_f16_i64_7:
; CHECK-SD-FP16: // %bb.0:
; CHECK-SD-FP16-NEXT: fcvtzu x0, h0, #7
; CHECK-SD-FP16-NEXT: ret
;
+; CHECK-GI-NO16-LABEL: fcvtzu_f16_i64_7:
+; CHECK-GI-NO16: // %bb.0:
+; CHECK-GI-NO16-NEXT: mov w8, #22528 // =0x5800
+; CHECK-GI-NO16-NEXT: fcvt s0, h0
+; CHECK-GI-NO16-NEXT: fmov s1, w8
+; CHECK-GI-NO16-NEXT: fcvt s1, h1
+; CHECK-GI-NO16-NEXT: fmul s0, s0, s1
+; CHECK-GI-NO16-NEXT: fcvt h0, s0
+; CHECK-GI-NO16-NEXT: fcvt s0, h0
+; CHECK-GI-NO16-NEXT: fcvtzu x0, s0
+; CHECK-GI-NO16-NEXT: ret
+;
; CHECK-GI-FP16-LABEL: fcvtzu_f16_i64_7:
; CHECK-GI-FP16: // %bb.0:
; CHECK-GI-FP16-NEXT: adrp x8, .LCPI22_0
@@ -489,21 +573,33 @@ define i64 @fcvtzu_f16_i64_7(half %flt) {
}
define i64 @fcvtzu_f16_i64_15(half %flt) {
-; CHECK-NO16-LABEL: fcvtzu_f16_i64_15:
-; CHECK-NO16: // %bb.0:
-; CHECK-NO16-NEXT: movi v1.2s, #71, lsl #24
-; CHECK-NO16-NEXT: fcvt s0, h0
-; CHECK-NO16-NEXT: fmul s0, s0, s1
-; CHECK-NO16-NEXT: fcvt h0, s0
-; CHECK-NO16-NEXT: fcvt s0, h0
-; CHECK-NO16-NEXT: fcvtzu x0, s0
-; CHECK-NO16-NEXT: ret
+; CHECK-SD-NO16-LABEL: fcvtzu_f16_i64_15:
+; CHECK-SD-NO16: // %bb.0:
+; CHECK-SD-NO16-NEXT: movi v1.2s, #71, lsl #24
+; CHECK-SD-NO16-NEXT: fcvt s0, h0
+; CHECK-SD-NO16-NEXT: fmul s0, s0, s1
+; CHECK-SD-NO16-NEXT: fcvt h0, s0
+; CHECK-SD-NO16-NEXT: fcvt s0, h0
+; CHECK-SD-NO16-NEXT: fcvtzu x0, s0
+; CHECK-SD-NO16-NEXT: ret
;
; CHECK-SD-FP16-LABEL: fcvtzu_f16_i64_15:
; CHECK-SD-FP16: // %bb.0:
; CHECK-SD-FP16-NEXT: fcvtzu x0, h0, #15
; CHECK-SD-FP16-NEXT: ret
;
+; CHECK-GI-NO16-LABEL: fcvtzu_f16_i64_15:
+; CHECK-GI-NO16: // %bb.0:
+; CHECK-GI-NO16-NEXT: mov w8, #30720 // =0x7800
+; CHECK-GI-NO16-NEXT: fcvt s0, h0
+; CHECK-GI-NO16-NEXT: fmov s1, w8
+; CHECK-GI-NO16-NEXT: fcvt s1, h1
+; CHECK-GI-NO16-NEXT: fmul s0, s0, s1
+; CHECK-GI-NO16-NEXT: fcvt h0, s0
+; CHECK-GI-NO16-NEXT: fcvt s0, h0
+; CHECK-GI-NO16-NEXT: fcvtzu x0, s0
+; CHECK-GI-NO16-NEXT: ret
+;
; CHECK-GI-FP16-LABEL: fcvtzu_f16_i64_15:
; CHECK-GI-FP16: // %bb.0:
; CHECK-GI-FP16-NEXT: adrp x8, .LCPI23_0
@@ -678,11 +774,13 @@ define half @scvtf_f16_i32_7(i32 %int) {
;
; CHECK-GI-NO16-LABEL: scvtf_f16_i32_7:
; CHECK-GI-NO16: // %bb.0:
-; CHECK-GI-NO16-NEXT: scvtf s1, w0
-; CHECK-GI-NO16-NEXT: movi v0.2s, #67, lsl #24
-; CHECK-GI-NO16-NEXT: fcvt h1, s1
+; CHECK-GI-NO16-NEXT: scvtf s0, w0
+; CHECK-GI-NO16-NEXT: mov w8, #22528 // =0x5800
+; CHECK-GI-NO16-NEXT: fmov s1, w8
+; CHECK-GI-NO16-NEXT: fcvt h0, s0
; CHECK-GI-NO16-NEXT: fcvt s1, h1
-; CHECK-GI-NO16-NEXT: fdiv s0, s1, s0
+; CHECK-GI-NO16-NEXT: fcvt s0, h0
+; CHECK-GI-NO16-NEXT: fdiv s0, s0, s1
; CHECK-GI-NO16-NEXT: fcvt h0, s0
; CHECK-GI-NO16-NEXT: ret
;
@@ -716,11 +814,13 @@ define half @scvtf_f16_i32_15(i32 %int) {
;
; CHECK-GI-NO16-LABEL: scvtf_f16_i32_15:
; CHECK-GI-NO16: // %bb.0:
-; CHECK-GI-NO16-NEXT: scvtf s1, w0
-; CHECK-GI-NO16-NEXT: movi v0.2s, #71, lsl #24
-; CHECK-GI-NO16-NEXT: fcvt h1, s1
+; CHECK-GI-NO16-NEXT: scvtf s0, w0
+; CHECK-GI-NO16-NEXT: mov w8, #30720 // =0x7800
+; CHECK-GI-NO16-NEXT: fmov s1, w8
+; CHECK-GI-NO16-NEXT: fcvt h0, s0
; CHECK-GI-NO16-NEXT: fcvt s1, h1
-; CHECK-GI-NO16-NEXT: fdiv s0, s1, s0
+; CHECK-GI-NO16-NEXT: fcvt s0, h0
+; CHECK-GI-NO16-NEXT: fdiv s0, s0, s1
; CHECK-GI-NO16-NEXT: fcvt h0, s0
; CHECK-GI-NO16-NEXT: ret
;
@@ -754,11 +854,13 @@ define half @scvtf_f16_i64_7(i64 %long) {
;
; CHECK-GI-NO16-LABEL: scvtf_f16_i64_7:
; CHECK-GI-NO16: // %bb.0:
-; CHECK-GI-NO16-NEXT: scvtf s1, x0
-; CHECK-GI-NO16-NEXT: movi v0.2s, #67, lsl #24
-; CHECK-GI-NO16-NEXT: fcvt h1, s1
+; CHECK-GI-NO16-NEXT: scvtf s0, x0
+; CHECK-GI-NO16-NEXT: mov w8, #22528 // =0x5800
+; CHECK-GI-NO16-NEXT: fmov s1, w8
+; CHECK-GI-NO16-NEXT: fcvt h0, s0
; CHECK-GI-NO16-NEXT: fcvt s1, h1
-; CHECK-GI-NO16-NEXT: fdiv s0, s1, s0
+; CHECK-GI-NO16-NEXT: fcvt s0, h0
+; CHECK-GI-NO16-NEXT: fdiv s0, s0, s1
; CHECK-GI-NO16-NEXT: fcvt h0, s0
; CHECK-GI-NO16-NEXT: ret
;
@@ -792,11 +894,13 @@ define half @scvtf_f16_i64_15(i64 %long) {
;
; CHECK-GI-NO16-LABEL: scvtf_f16_i64_15:
; CHECK-GI-NO16: // %bb.0:
-; CHECK-GI-NO16-NEXT: scvtf s1, x0
-; CHECK-GI-NO16-NEXT: movi v0.2s, #71, lsl #24
-; CHECK-GI-NO16-NEXT: fcvt h1, s1
+; CHECK-GI-NO16-NEXT: scvtf s0, x0
+; CHECK-GI-NO16-NEXT: mov w8, #30720 // =0x7800
+; CHECK-GI-NO16-NEXT: fmov s1, w8
+; CHECK-GI-NO16-NEXT: fcvt h0, s0
; CHECK-GI-NO16-NEXT: fcvt s1, h1
-; CHECK-GI-NO16-NEXT: fdiv s0, s1, s0
+; CHECK-GI-NO16-NEXT: fcvt s0, h0
+; CHECK-GI-NO16-NEXT: fdiv s0, s0, s1
; CHECK-GI-NO16-NEXT: fcvt h0, s0
; CHECK-GI-NO16-NEXT: ret
;
@@ -974,11 +1078,13 @@ define half @ucvtf_f16_i32_7(i32 %int) {
;
; CHECK-GI-NO16-LABEL: ucvtf_f16_i32_7:
; CHECK-GI-NO16: // %bb.0:
-; CHECK-GI-NO16-NEXT: ucvtf s1, w0
-; CHECK-GI-NO16-NEXT: movi v0.2s, #67, lsl #24
-; CHECK-GI-NO16-NEXT: fcvt h1, s1
+; CHECK-GI-NO16-NEXT: ucvtf s0, w0
+; CHECK-GI-NO16-NEXT: mov w8, #22528 // =0x5800
+; CHECK-GI-NO16-NEXT: fmov s1, w8
+; CHECK-GI-NO16-NEXT: fcvt h0, s0
; CHECK-GI-NO16-NEXT: fcvt s1, h1
-; CHECK-GI-NO16-NEXT: fdiv s0, s1, s0
+; CHECK-GI-NO16-NEXT: fcvt s0, h0
+; CHECK-GI-NO16-NEXT: fdiv s0, s0, s1
; CHECK-GI-NO16-NEXT: fcvt h0, s0
; CHECK-GI-NO16-NEXT: ret
;
@@ -1012,11 +1118,13 @@ define half @ucvtf_f16_i32_15(i32 %int) {
;
; CHECK-GI-NO16-LABEL: ucvtf_f16_i32_15:
; CHECK-GI-NO16: // %bb.0:
-; CHECK-GI-NO16-NEXT: ucvtf s1, w0
-; CHECK-GI-NO16-NEXT: movi v0.2s, #71, lsl #24
-; CHECK-GI-NO16-NEXT: fcvt h1, s1
+; CHECK-GI-NO16-NEXT: ucvtf s0, w0
+; CHECK-GI-NO16-NEXT: mov w8, #30720 // =0x7800
+; CHECK-GI-NO16-NEXT: fmov s1, w8
+; CHECK-GI-NO16-NEXT: fcvt h0, s0
; CHECK-GI-NO16-NEXT: fcvt s1, h1
-; CHECK-GI-NO16-NEXT: fdiv s0, s1, s0
+; CHECK-GI-NO16-NEXT: fcvt s0, h0
+; CHECK-GI-NO16-NEXT: fdiv s0, s0, s1
; CHECK-GI-NO16-NEXT: fcvt h0, s0
; CHECK-GI-NO16-NEXT: ret
;
@@ -1050,11 +1158,13 @@ define half @ucvtf_f16_i64_7(i64 %long) {
;
; CHECK-GI-NO16-LABEL: ucvtf_f16_i64_7:
; CHECK-GI-NO16: // %bb.0:
-; CHECK-GI-NO16-NEXT: ucvtf s1, x0
-; CHECK-GI-NO16-NEXT: movi v0.2s, #67, lsl #24
-; CHECK-GI-NO16-NEXT: fcvt h1, s1
+; CHECK-GI-NO16-NEXT: ucvtf s0, x0
+; CHECK-GI-NO16-NEXT: mov w8, #22528 // =0x5800
+; CHECK-GI-NO16-NEXT: fmov s1, w8
+; CHECK-GI-NO16-NEXT: fcvt h0, s0
; CHECK-GI-NO16-NEXT: fcvt s1, h1
-; CHECK-GI-NO16-NEXT: fdiv s0, s1, s0
+; CHECK-GI-NO16-NEXT: fcvt s0, h0
+; CHECK-GI-NO16-NEXT: fdiv s0, s0, s1
; CHECK-GI-NO16-NEXT: fcvt h0, s0
; CHECK-GI-NO16-NEXT: ret
;
@@ -1088,11 +1198,13 @@ define half @ucvtf_f16_i64_15(i64 %long) {
;
; CHECK-GI-NO16-LABEL: ucvtf_f16_i64_15:
; CHECK-GI-NO16: // %bb.0:
-; CHECK-GI-NO16-NEXT: ucvtf s1, x0
-; CHECK-GI-NO16-NEXT: movi v0.2s, #71, lsl #24
-; CHECK-GI-NO16-NEXT: fcvt h1, s1
+; CHECK-GI-NO16-NEXT: ucvtf s0, x0
+; CHECK-GI-NO16-NEXT: mov w8, #30720 // =0x7800
+; CHECK-GI-NO16-NEXT: fmov s1, w8
+; CHECK-GI-NO16-NEXT: fcvt h0, s0
; CHECK-GI-NO16-NEXT: fcvt s1, h1
-; CHECK-GI-NO16-NEXT: fdiv s0, s1, s0
+; CHECK-GI-NO16-NEXT: fcvt s0, h0
+; CHECK-GI-NO16-NEXT: fdiv s0, s0, s1
; CHECK-GI-NO16-NEXT: fcvt h0, s0
; CHECK-GI-NO16-NEXT: ret
;
@@ -1244,21 +1356,33 @@ define i64 @fcvtzs_sat_f64_i64_64(double %dbl) {
}
define i32 @fcvtzs_sat_f16_i32_7(half %dbl) {
-; CHECK-NO16-LABEL: fcvtzs_sat_f16_i32_7:
-; CHECK-NO16: // %bb.0:
-; CHECK-NO16-NEXT: movi v1.2s, #67, lsl #24
-; CHECK-NO16-NEXT: fcvt s0, h0
-; CHECK-NO16-NEXT: fmul s0, s0, s1
-; CHECK-NO16-NEXT: fcvt h0, s0
-; CHECK-NO16-NEXT: fcvt s0, h0
-; CHECK-NO16-NEXT: fcvtzs w0, s0
-; CHECK-NO16-NEXT: ret
+; CHECK-SD-NO16-LABEL: fcvtzs_sat_f16_i32_7:
+; CHECK-SD-NO16: // %bb.0:
+; CHECK-SD-NO16-NEXT: movi v1.2s, #67, lsl #24
+; CHECK-SD-NO16-NEXT: fcvt s0, h0
+; CHECK-SD-NO16-NEXT: fmul s0, s0, s1
+; CHECK-SD-NO16-NEXT: fcvt h0, s0
+; CHECK-SD-NO16-NEXT: fcvt s0, h0
+; CHECK-SD-NO16-NEXT: fcvtzs w0, s0
+; CHECK-SD-NO16-NEXT: ret
;
; CHECK-SD-FP16-LABEL: fcvtzs_sat_f16_i32_7:
; CHECK-SD-FP16: // %bb.0:
; CHECK-SD-FP16-NEXT: fcvtzs w0, h0, #7
; CHECK-SD-FP16-NEXT: ret
;
+; CHECK-GI-NO16-LABEL: fcvtzs_sat_f16_i32_7:
+; CHECK-GI-NO16: // %bb.0:
+; CHECK-GI-NO16-NEXT: mov w8, #22528 // =0x5800
+; CHECK-GI-NO16-NEXT: fcvt s0, h0
+; CHECK-GI-NO16-NEXT: fmov s1, w8
+; CHECK-GI-NO16-NEXT: fcvt s1, h1
+; CHECK-GI-NO16-NEXT: fmul s0, s0, s1
+; CHECK-GI-NO16-NEXT: fcvt h0, s0
+; CHECK-GI-NO16-NEXT: fcvt s0, h0
+; CHECK-GI-NO16-NEXT: fcvtzs w0, s0
+; CHECK-GI-NO16-NEXT: ret
+;
; CHECK-GI-FP16-LABEL: fcvtzs_sat_f16_i32_7:
; CHECK-GI-FP16: // %bb.0:
; CHECK-GI-FP16-NEXT: adrp x8, .LCPI55_0
@@ -1272,21 +1396,33 @@ define i32 @fcvtzs_sat_f16_i32_7(half %dbl) {
}
define i32 @fcvtzs_sat_f16_i32_15(half %dbl) {
-; CHECK-NO16-LABEL: fcvtzs_sat_f16_i32_15:
-; CHECK-NO16: // %bb.0:
-; CHECK-NO16-NEXT: movi v1.2s, #71, lsl #24
-; CHECK-NO16-NEXT: fcvt s0, h0
-; CHECK-NO16-NEXT: fmul s0, s0, s1
-; CHECK-NO16-NEXT: fcvt h0, s0
-; CHECK-NO16-NEXT: fcvt s0, h0
-; CHECK-NO16-NEXT: fcvtzs w0, s0
-; CHECK-NO16-NEXT: ret
+; CHECK-SD-NO16-LABEL: fcvtzs_sat_f16_i32_15:
+; CHECK-SD-NO16: // %bb.0:
+; CHECK-SD-NO16-NEXT: movi v1.2s, #71, lsl #24
+; CHECK-SD-NO16-NEXT: fcvt s0, h0
+; CHECK-SD-NO16-NEXT: fmul s0, s0, s1
+; CHECK-SD-NO16-NEXT: fcvt h0, s0
+; CHECK-SD-NO16-NEXT: fcvt s0, h0
+; CHECK-SD-NO16-NEXT: fcvtzs w0, s0
+; CHECK-SD-NO16-NEXT: ret
;
; CHECK-SD-FP16-LABEL: fcvtzs_sat_f16_i32_15:
; CHECK-SD-FP16: // %bb.0:
; CHECK-SD-FP16-NEXT: fcvtzs w0, h0, #15
; CHECK-SD-FP16-NEXT: ret
;
+; CHECK-GI-NO16-LABEL: fcvtzs_sat_f16_i32_15:
+; CHECK-GI-NO16: // %bb.0:
+; CHECK-GI-NO16-NEXT: mov w8, #30720 // =0x7800
+; CHECK-GI-NO16-NEXT: fcvt s0, h0
+; CHECK-GI-NO16-NEXT: fmov s1, w8
+; CHECK-GI-NO16-NEXT: fcvt s1, h1
+; CHECK-GI-NO16-NEXT: fmul s0, s0, s1
+; CHECK-GI-NO16-NEXT: fcvt h0, s0
+; CHECK-GI-NO16-NEXT: fcvt s0, h0
+; CHECK-GI-NO16-NEXT: fcvtzs w0, s0
+; CHECK-GI-NO16-NEXT: ret
+;
; CHECK-GI-FP16-LABEL: fcvtzs_sat_f16_i32_15:
; CHECK-GI-FP16: // %bb.0:
; CHECK-GI-FP16-NEXT: adrp x8, .LCPI56_0
@@ -1300,21 +1436,33 @@ define i32 @fcvtzs_sat_f16_i32_15(half %dbl) {
}
define i64 @fcvtzs_sat_f16_i64_7(half %dbl) {
-; CHECK-NO16-LABEL: fcvtzs_sat_f16_i64_7:
-; CHECK-NO16: // %bb.0:
-; CHECK-NO16-NEXT: movi v1.2s, #67, lsl #24
-; CHECK-NO16-NEXT: fcvt s0, h0
-; CHECK-NO16-NEXT: fmul s0, s0, s1
-; CHECK-NO16-NEXT: fcvt h0, s0
-; CHECK-NO16-NEXT: fcvt s0, h0
-; CHECK-NO16-NEXT: fcvtzs x0, s0
-; CHECK-NO16-NEXT: ret
+; CHECK-SD-NO16-LABEL: fcvtzs_sat_f16_i64_7:
+; CHECK-SD-NO16: // %bb.0:
+; CHECK-SD-NO16-NEXT: movi v1.2s, #67, lsl #24
+; CHECK-SD-NO16-NEXT: fcvt s0, h0
+; CHECK-SD-NO16-NEXT: fmul s0, s0, s1
+; CHECK-SD-NO16-NEXT: fcvt h0, s0
+; CHECK-SD-NO16-NEXT: fcvt s0, h0
+; CHECK-SD-NO16-NEXT: fcvtzs x0, s0
+; CHECK-SD-NO16-NEXT: ret
;
; CHECK-SD-FP16-LABEL: fcvtzs_sat_f16_i64_7:
; CHECK-SD-FP16: // %bb.0:
; CHECK-SD-FP16-NEXT: fcvtzs x0, h0, #7
; CHECK-SD-FP16-NEXT: ret
;
+; CHECK-GI-NO16-LABEL: fcvtzs_sat_f16_i64_7:
+; CHECK-GI-NO16: // %bb.0:
+; CHECK-GI-NO16-NEXT: mov w8, #22528 // =0x5800
+; CHECK-GI-NO16-NEXT: fcvt s0, h0
+; CHECK-GI-NO16-NEXT: fmov s1, w8
+; CHECK-GI-NO16-NEXT: fcvt s1, h1
+; CHECK-GI-NO16-NEXT: fmul s0, s0, s1
+; CHECK-GI-NO16-NEXT: fcvt h0, s0
+; CHECK-GI-NO16-NEXT: fcvt s0, h0
+; CHECK-GI-NO16-NEXT: fcvtzs x0, s0
+; CHECK-GI-NO16-NEXT: ret
+;
; CHECK-GI-FP16-LABEL: fcvtzs_sat_f16_i64_7:
; CHECK-GI-FP16: // %bb.0:
; CHECK-GI-FP16-NEXT: adrp x8, .LCPI57_0
@@ -1328,21 +1476,33 @@ define i64 @fcvtzs_sat_f16_i64_7(half %dbl) {
}
define i64 @fcvtzs_sat_f16_i64_15(half %dbl) {
-; CHECK-NO16-LABEL: fcvtzs_sat_f16_i64_15:
-; CHECK-NO16: // %bb.0:
-; CHECK-NO16-NEXT: movi v1.2s, #71, lsl #24
-; CHECK-NO16-NEXT: fcvt s0, h0
-; CHECK-NO16-NEXT: fmul s0, s0, s1
-; CHECK-NO16-NEXT: fcvt h0, s0
-; CHECK-NO16-NEXT: fcvt s0, h0
-; CHECK-NO16-NEXT: fcvtzs x0, s0
-; CHECK-NO16-NEXT: ret
+; CHECK-SD-NO16-LABEL: fcvtzs_sat_f16_i64_15:
+; CHECK-SD-NO16: // %bb.0:
+; CHECK-SD-NO16-NEXT: movi v1.2s, #71, lsl #24
+; CHECK-SD-NO16-NEXT: fcvt s0, h0
+; CHECK-SD-NO16-NEXT: fmul s0, s0, s1
+; CHECK-SD-NO16-NEXT: fcvt h0, s0
+; CHECK-SD-NO16-NEXT: fcvt s0, h0
+; CHECK-SD-NO16-NEXT: fcvtzs x0, s0
+; CHECK-SD-NO16-NEXT: ret
;
; CHECK-SD-FP16-LABEL: fcvtzs_sat_f16_i64_15:
; CHECK-SD-FP16: // %bb.0:
; CHECK-SD-FP16-NEXT: fcvtzs x0, h0, #15
; CHECK-SD-FP16-NEXT: ret
;
+; CHECK-GI-NO16-LABEL: fcvtzs_sat_f16_i64_15:
+; CHECK-GI-NO16: // %bb.0:
+; CHECK-GI-NO16-NEXT: mov w8, #30720 // =0x7800
+; CHECK-GI-NO16-NEXT: fcvt s0, h0
+; CHECK-GI-NO16-NEXT: fmov s1, w8
+; CHECK-GI-NO16-NEXT: fcvt s1, h1
+; CHECK-GI-NO16-NEXT: fmul s0, s0, s1
+; CHECK-GI-NO16-NEXT: fcvt h0, s0
+; CHECK-GI-NO16-NEXT: fcvt s0, h0
+; CHECK-GI-NO16-NEXT: fcvtzs x0, s0
+; CHECK-GI-NO16-NEXT: ret
+;
; CHECK-GI-FP16-LABEL: fcvtzs_sat_f16_i64_15:
; CHECK-GI-FP16: // %bb.0:
; CHECK-GI-FP16-NEXT: adrp x8, .LCPI58_0
@@ -1490,21 +1650,33 @@ define i64 @fcvtzu_sat_f64_i64_64(double %dbl) {
}
define i32 @fcvtzu_sat_f16_i32_7(half %dbl) {
-; CHECK-NO16-LABEL: fcvtzu_sat_f16_i32_7:
-; CHECK-NO16: // %bb.0:
-; CHECK-NO16-NEXT: movi v1.2s, #67, lsl #24
-; CHECK-NO16-NEXT: fcvt s0, h0
-; CHECK-NO16-NEXT: fmul s0, s0, s1
-; CHECK-NO16-NEXT: fcvt h0, s0
-; CHECK-NO16-NEXT: fcvt s0, h0
-; CHECK-NO16-NEXT: fcvtzu w0, s0
-; CHECK-NO16-NEXT: ret
+; CHECK-SD-NO16-LABEL: fcvtzu_sat_f16_i32_7:
+; CHECK-SD-NO16: // %bb.0:
+; CHECK-SD-NO16-NEXT: movi v1.2s, #67, lsl #24
+; CHECK-SD-NO16-NEXT: fcvt s0, h0
+; CHECK-SD-NO16-NEXT: fmul s0, s0, s1
+; CHECK-SD-NO16-NEXT: fcvt h0, s0
+; CHECK-SD-NO16-NEXT: fcvt s0, h0
+; CHECK-SD-NO16-NEXT: fcvtzu w0, s0
+; CHECK-SD-NO16-NEXT: ret
;
; CHECK-SD-FP16-LABEL: fcvtzu_sat_f16_i32_7:
; CHECK-SD-FP16: // %bb.0:
; CHECK-SD-FP16-NEXT: fcvtzu w0, h0, #7
; CHECK-SD-FP16-NEXT: ret
;
+; CHECK-GI-NO16-LABEL: fcvtzu_sat_f16_i32_7:
+; CHECK-GI-NO16: // %bb.0:
+; CHECK-GI-NO16-NEXT: mov w8, #22528 // =0x5800
+; CHECK-GI-NO16-NEXT: fcvt s0, h0
+; CHECK-GI-NO16-NEXT: fmov s1, w8
+; CHECK-GI-NO16-NEXT: fcvt s1, h1
+; CHECK-GI-NO16-NEXT: fmul s0, s0, s1
+; CHECK-GI-NO16-NEXT: fcvt h0, s0
+; CHECK-GI-NO16-NEXT: fcvt s0, h0
+; CHECK-GI-NO16-NEXT: fcvtzu w0, s0
+; CHECK-GI-NO16-NEXT: ret
+;
; CHECK-GI-FP16-LABEL: fcvtzu_sat_f16_i32_7:
; CHECK-GI-FP16: // %bb.0:
; CHECK-GI-FP16-NEXT: adrp x8, .LCPI66_0
@@ -1518,21 +1690,33 @@ define i32 @fcvtzu_sat_f16_i32_7(half %dbl) {
}
define i32 @fcvtzu_sat_f16_i32_15(half %dbl) {
-; CHECK-NO16-LABEL: fcvtzu_sat_f16_i32_15:
-; CHECK-NO16: // %bb.0:
-; CHECK-NO16-NEXT: movi v1.2s, #71, lsl #24
-; CHECK-NO16-NEXT: fcvt s0, h0
-; CHECK-NO16-NEXT: fmul s0, s0, s1
-; CHECK-NO16-NEXT: fcvt h0, s0
-; CHECK-NO16-NEXT: fcvt s0, h0
-; CHECK-NO16-NEXT: fcvtzu w0, s0
-; CHECK-NO16-NEXT: ret
+; CHECK-SD-NO16-LABEL: fcvtzu_sat_f16_i32_15:
+; CHECK-SD-NO16: // %bb.0:
+; CHECK-SD-NO16-NEXT: movi v1.2s, #71, lsl #24
+; CHECK-SD-NO16-NEXT: fcvt s0, h0
+; CHECK-SD-NO16-NEXT: fmul s0, s0, s1
+; CHECK-SD-NO16-NEXT: fcvt h0, s0
+; CHECK-SD-NO16-NEXT: fcvt s0, h0
+; CHECK-SD-NO16-NEXT: fcvtzu w0, s0
+; CHECK-SD-NO16-NEXT: ret
;
; CHECK-SD-FP16-LABEL: fcvtzu_sat_f16_i32_15:
; CHECK-SD-FP16: // %bb.0:
; CHECK-SD-FP16-NEXT: fcvtzu w0, h0, #15
; CHECK-SD-FP16-NEXT: ret
;
+; CHECK-GI-NO16-LABEL: fcvtzu_sat_f16_i32_15:
+; CHECK-GI-NO16: // %bb.0:
+; CHECK-GI-NO16-NEXT: mov w8, #30720 // =0x7800
+; CHECK-GI-NO16-NEXT: fcvt s0, h0
+; CHECK-GI-NO16-NEXT: fmov s1, w8
+; CHECK-GI-NO16-NEXT: fcvt s1, h1
+; CHECK-GI-NO16-NEXT: fmul s0, s0, s1
+; CHECK-GI-NO16-NEXT: fcvt h0, s0
+; CHECK-GI-NO16-NEXT: fcvt s0, h0
+; CHECK-GI-NO16-NEXT: fcvtzu w0, s0
+; CHECK-GI-NO16-NEXT: ret
+;
; CHECK-GI-FP16-LABEL: fcvtzu_sat_f16_i32_15:
; CHECK-GI-FP16: // %bb.0:
; CHECK-GI-FP16-NEXT: adrp x8, .LCPI67_0
@@ -1546,21 +1730,33 @@ define i32 @fcvtzu_sat_f16_i32_15(half %dbl) {
}
define i64 @fcvtzu_sat_f16_i64_7(half %dbl) {
-; CHECK-NO16-LABEL: fcvtzu_sat_f16_i64_7:
-; CHECK-NO16: // %bb.0:
-; CHECK-NO16-NEXT: movi v1.2s, #67, lsl #24
-; CHECK-NO16-NEXT: fcvt s0, h0
-; CHECK-NO16-NEXT: fmul s0, s0, s1
-; CHECK-NO16-NEXT: fcvt h0, s0
-; CHECK-NO16-NEXT: fcvt s0, h0
-; CHECK-NO16-NEXT: fcvtzu x0, s0
-; CHECK-NO16-NEXT: ret
+; CHECK-SD-NO16-LABEL: fcvtzu_sat_f16_i64_7:
+; CHECK-SD-NO16: // %bb.0:
+; CHECK-SD-NO16-NEXT: movi v1.2s, #67, lsl #24
+; CHECK-SD-NO16-NEXT: fcvt s0, h0
+; CHECK-SD-NO16-NEXT: fmul s0, s0, s1
+; CHECK-SD-NO16-NEXT: fcvt h0, s0
+; CHECK-SD-NO16-NEXT: fcvt s0, h0
+; CHECK-SD-NO16-NEXT: fcvtzu x0, s0
+; CHECK-SD-NO16-NEXT: ret
;
; CHECK-SD-FP16-LABEL: fcvtzu_sat_f16_i64_7:
; CHECK-SD-FP16: // %bb.0:
; CHECK-SD-FP16-NEXT: fcvtzu x0, h0, #7
; CHECK-SD-FP16-NEXT: ret
;
+; CHECK-GI-NO16-LABEL: fcvtzu_sat_f16_i64_7:
+; CHECK-GI-NO16: // %bb.0:
+; CHECK-GI-NO16-NEXT: mov w8, #22528 // =0x5800
+; CHECK-GI-NO16-NEXT: fcvt s0, h0
+; CHECK-GI-NO16-NEXT: fmov s1, w8
+; CHECK-GI-NO16-NEXT: fcvt s1, h1
+; CHECK-GI-NO16-NEXT: fmul s0, s0, s1
+; CHECK-GI-NO16-NEXT: fcvt h0, s0
+; CHECK-GI-NO16-NEXT: fcvt s0, h0
+; CHECK-GI-NO16-NEXT: fcvtzu x0, s0
+; CHECK-GI-NO16-NEXT: ret
+;
; CHECK-GI-FP16-LABEL: fcvtzu_sat_f16_i64_7:
; CHECK-GI-FP16: // %bb.0:
; CHECK-GI-FP16-NEXT: adrp x8, .LCPI68_0
@@ -1574,21 +1770,33 @@ define i64 @fcvtzu_sat_f16_i64_7(half %dbl) {
}
define i64 @fcvtzu_sat_f16_i64_15(half %dbl) {
-; CHECK-NO16-LABEL: fcvtzu_sat_f16_i64_15:
-; CHECK-NO16: // %bb.0:
-; CHECK-NO16-NEXT: movi v1.2s, #71, lsl #24
-; CHECK-NO16-NEXT: fcvt s0, h0
-; CHECK-NO16-NEXT: fmul s0, s0, s1
-; CHECK-NO16-NEXT: fcvt h0, s0
-; CHECK-NO16-NEXT: fcvt s0, h0
-; CHECK-NO16-NEXT: fcvtzu x0, s0
-; CHECK-NO16-NEXT: ret
+; CHECK-SD-NO16-LABEL: fcvtzu_sat_f16_i64_15:
+; CHECK-SD-NO16: // %bb.0:
+; CHECK-SD-NO16-NEXT: movi v1.2s, #71, lsl #24
+; CHECK-SD-NO16-NEXT: fcvt s0, h0
+; CHECK-SD-NO16-NEXT: fmul s0, s0, s1
+; CHECK-SD-NO16-NEXT: fcvt h0, s0
+; CHECK-SD-NO16-NEXT: fcvt s0, h0
+; CHECK-SD-NO16-NEXT: fcvtzu x0, s0
+; CHECK-SD-NO16-NEXT: ret
;
; CHECK-SD-FP16-LABEL: fcvtzu_sat_f16_i64_15:
; CHECK-SD-FP16: // %bb.0:
; CHECK-SD-FP16-NEXT: fcvtzu x0, h0, #15
; CHECK-SD-FP16-NEXT: ret
;
+; CHECK-GI-NO16-LABEL: fcvtzu_sat_f16_i64_15:
+; CHECK-GI-NO16: // %bb.0:
+; CHECK-GI-NO16-NEXT: mov w8, #30720 // =0x7800
+; CHECK-GI-NO16-NEXT: fcvt s0, h0
+; CHECK-GI-NO16-NEXT: fmov s1, w8
+; CHECK-GI-NO16-NEXT: fcvt s1, h1
+; CHECK-GI-NO16-NEXT: fmul s0, s0, s1
+; CHECK-GI-NO16-NEXT: fcvt h0, s0
+; CHECK-GI-NO16-NEXT: fcvt s0, h0
+; CHECK-GI-NO16-NEXT: fcvtzu x0, s0
+; CHECK-GI-NO16-NEXT: ret
+;
; CHECK-GI-FP16-LABEL: fcvtzu_sat_f16_i64_15:
; CHECK-GI-FP16: // %bb.0:
; CHECK-GI-FP16-NEXT: adrp x8, .LCPI69_0
@@ -1603,3 +1811,4 @@ define i64 @fcvtzu_sat_f16_i64_15(half %dbl) {
;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
; CHECK: {{.*}}
; CHECK-FP16: {{.*}}
+; CHECK-NO16: {{.*}}
diff --git a/llvm/test/CodeGen/AArch64/vecreduce-fadd-strict.ll b/llvm/test/CodeGen/AArch64/vecreduce-fadd-strict.ll
index 8e0328eaa2658..594a3ab79d73b 100644
--- a/llvm/test/CodeGen/AArch64/vecreduce-fadd-strict.ll
+++ b/llvm/test/CodeGen/AArch64/vecreduce-fadd-strict.ll
@@ -38,11 +38,17 @@ define half @add_v2HalfH(<2 x half> %bin.rdx) {
;
; CHECK-GI-NOFP16-LABEL: add_v2HalfH:
; CHECK-GI-NOFP16: // %bb.0:
+; CHECK-GI-NOFP16-NEXT: mov w8, #32768 // =0x8000
; CHECK-GI-NOFP16-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-GI-NOFP16-NEXT: mov h1, v0.h[1]
+; CHECK-GI-NOFP16-NEXT: fcvt s2, h0
+; CHECK-GI-NOFP16-NEXT: fmov s1, w8
+; CHECK-GI-NOFP16-NEXT: mov h0, v0.h[1]
+; CHECK-GI-NOFP16-NEXT: fcvt s1, h1
; CHECK-GI-NOFP16-NEXT: fcvt s0, h0
+; CHECK-GI-NOFP16-NEXT: fadd s1, s1, s2
+; CHECK-GI-NOFP16-NEXT: fcvt h1, s1
; CHECK-GI-NOFP16-NEXT: fcvt s1, h1
-; CHECK-GI-NOFP16-NEXT: fadd s0, s0, s1
+; CHECK-GI-NOFP16-NEXT: fadd s0, s1, s0
; CHECK-GI-NOFP16-NEXT: fcvt h0, s0
; CHECK-GI-NOFP16-NEXT: ret
;
@@ -82,13 +88,19 @@ define half @add_v3HalfH(<3 x half> %bin.rdx) {
;
; CHECK-GI-NOFP16-LABEL: add_v3HalfH:
; CHECK-GI-NOFP16: // %bb.0:
+; CHECK-GI-NOFP16-NEXT: mov w8, #32768 // =0x8000
; CHECK-GI-NOFP16-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-GI-NOFP16-NEXT: mov h1, v0.h[1]
; CHECK-GI-NOFP16-NEXT: fcvt s2, h0
-; CHECK-GI-NOFP16-NEXT: mov h0, v0.h[2]
+; CHECK-GI-NOFP16-NEXT: fmov s1, w8
; CHECK-GI-NOFP16-NEXT: fcvt s1, h1
+; CHECK-GI-NOFP16-NEXT: fadd s1, s1, s2
+; CHECK-GI-NOFP16-NEXT: mov h2, v0.h[1]
+; CHECK-GI-NOFP16-NEXT: mov h0, v0.h[2]
+; CHECK-GI-NOFP16-NEXT: fcvt h1, s1
+; CHECK-GI-NOFP16-NEXT: fcvt s2, h2
; CHECK-GI-NOFP16-NEXT: fcvt s0, h0
-; CHECK-GI-NOFP16-NEXT: fadd s1, s2, s1
+; CHECK-GI-NOFP16-NEXT: fcvt s1, h1
+; CHECK-GI-NOFP16-NEXT: fadd s1, s1, s2
; CHECK-GI-NOFP16-NEXT: fcvt h1, s1
; CHECK-GI-NOFP16-NEXT: fcvt s1, h1
; CHECK-GI-NOFP16-NEXT: fadd s0, s1, s0
@@ -140,11 +152,17 @@ define half @add_HalfH(<4 x half> %bin.rdx) {
;
; CHECK-GI-NOFP16-LABEL: add_HalfH:
; CHECK-GI-NOFP16: // %bb.0:
+; CHECK-GI-NOFP16-NEXT: mov w8, #32768 // =0x8000
; CHECK-GI-NOFP16-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-GI-NOFP16-NEXT: mov h1, v0.h[1]
; CHECK-GI-NOFP16-NEXT: fcvt s2, h0
+; CHECK-GI-NOFP16-NEXT: fmov s1, w8
+; CHECK-GI-NOFP16-NEXT: fcvt s1, h1
+; CHECK-GI-NOFP16-NEXT: fadd s1, s1, s2
+; CHECK-GI-NOFP16-NEXT: mov h2, v0.h[1]
+; CHECK-GI-NOFP16-NEXT: fcvt h1, s1
+; CHECK-GI-NOFP16-NEXT: fcvt s2, h2
; CHECK-GI-NOFP16-NEXT: fcvt s1, h1
-; CHECK-GI-NOFP16-NEXT: fadd s1, s2, s1
+; CHECK-GI-NOFP16-NEXT: fadd s1, s1, s2
; CHECK-GI-NOFP16-NEXT: mov h2, v0.h[2]
; CHECK-GI-NOFP16-NEXT: mov h0, v0.h[3]
; CHECK-GI-NOFP16-NEXT: fcvt h1, s1
@@ -232,10 +250,16 @@ define half @add_H(<8 x half> %bin.rdx) {
;
; CHECK-GI-NOFP16-LABEL: add_H:
; CHECK-GI-NOFP16: // %bb.0:
-; CHECK-GI-NOFP16-NEXT: mov h1, v0.h[1]
+; CHECK-GI-NOFP16-NEXT: mov w8, #32768 // =0x8000
; CHECK-GI-NOFP16-NEXT: fcvt s2, h0
+; CHECK-GI-NOFP16-NEXT: fmov s1, w8
+; CHECK-GI-NOFP16-NEXT: fcvt s1, h1
+; CHECK-GI-NOFP16-NEXT: fadd s1, s1, s2
+; CHECK-GI-NOFP16-NEXT: mov h2, v0.h[1]
+; CHECK-GI-NOFP16-NEXT: fcvt h1, s1
+; CHECK-GI-NOFP16-NEXT: fcvt s2, h2
; CHECK-GI-NOFP16-NEXT: fcvt s1, h1
-; CHECK-GI-NOFP16-NEXT: fadd s1, s2, s1
+; CHECK-GI-NOFP16-NEXT: fadd s1, s1, s2
; CHECK-GI-NOFP16-NEXT: mov h2, v0.h[2]
; CHECK-GI-NOFP16-NEXT: fcvt h1, s1
; CHECK-GI-NOFP16-NEXT: fcvt s2, h2
@@ -424,10 +448,16 @@ define half @add_2H(<16 x half> %bin.rdx) {
;
; CHECK-GI-NOFP16-LABEL: add_2H:
; CHECK-GI-NOFP16: // %bb.0:
-; CHECK-GI-NOFP16-NEXT: mov h2, v0.h[1]
+; CHECK-GI-NOFP16-NEXT: mov w8, #32768 // =0x8000
; CHECK-GI-NOFP16-NEXT: fcvt s3, h0
+; CHECK-GI-NOFP16-NEXT: fmov s2, w8
+; CHECK-GI-NOFP16-NEXT: fcvt s2, h2
+; CHECK-GI-NOFP16-NEXT: fadd s2, s2, s3
+; CHECK-GI-NOFP16-NEXT: mov h3, v0.h[1]
+; CHECK-GI-NOFP16-NEXT: fcvt h2, s2
+; CHECK-GI-NOFP16-NEXT: fcvt s3, h3
; CHECK-GI-NOFP16-NEXT: fcvt s2, h2
-; CHECK-GI-NOFP16-NEXT: fadd s2, s3, s2
+; CHECK-GI-NOFP16-NEXT: fadd s2, s2, s3
; CHECK-GI-NOFP16-NEXT: mov h3, v0.h[2]
; CHECK-GI-NOFP16-NEXT: fcvt h2, s2
; CHECK-GI-NOFP16-NEXT: fcvt s3, h3
diff --git a/llvm/test/CodeGen/AArch64/vecreduce-fadd.ll b/llvm/test/CodeGen/AArch64/vecreduce-fadd.ll
index 40925da0557ec..18f463cfcf7c9 100644
--- a/llvm/test/CodeGen/AArch64/vecreduce-fadd.ll
+++ b/llvm/test/CodeGen/AArch64/vecreduce-fadd.ll
@@ -405,23 +405,26 @@ define half @fadd_reduction_v4f16_in_loop(ptr %ptr.start) {
;
; CHECK-GI-NOFP16-LABEL: fadd_reduction_v4f16_in_loop:
; CHECK-GI-NOFP16: // %bb.0: // %entry
-; CHECK-GI-NOFP16-NEXT: movi d0, #0000000000000000
; CHECK-GI-NOFP16-NEXT: mov x8, xzr
+; CHECK-GI-NOFP16-NEXT: mov w9, #0 // =0x0
; CHECK-GI-NOFP16-NEXT: .LBB13_1: // %loop
; CHECK-GI-NOFP16-NEXT: // =>This Inner Loop Header: Depth=1
-; CHECK-GI-NOFP16-NEXT: ldr d1, [x0, x8]
-; CHECK-GI-NOFP16-NEXT: fcvt s0, h0
+; CHECK-GI-NOFP16-NEXT: ldr d0, [x0, x8]
+; CHECK-GI-NOFP16-NEXT: fmov s1, w9
; CHECK-GI-NOFP16-NEXT: add x8, x8, #8
; CHECK-GI-NOFP16-NEXT: cmp w8, #56
-; CHECK-GI-NOFP16-NEXT: fcvtl v1.4s, v1.4h
-; CHECK-GI-NOFP16-NEXT: faddp v1.4s, v1.4s, v1.4s
-; CHECK-GI-NOFP16-NEXT: faddp s1, v1.2s
-; CHECK-GI-NOFP16-NEXT: fcvt h1, s1
+; CHECK-GI-NOFP16-NEXT: fcvtl v0.4s, v0.4h
; CHECK-GI-NOFP16-NEXT: fcvt s1, h1
-; CHECK-GI-NOFP16-NEXT: fadd s0, s1, s0
+; CHECK-GI-NOFP16-NEXT: faddp v0.4s, v0.4s, v0.4s
+; CHECK-GI-NOFP16-NEXT: faddp s0, v0.2s
+; CHECK-GI-NOFP16-NEXT: fcvt h0, s0
+; CHECK-GI-NOFP16-NEXT: fcvt s0, h0
+; CHECK-GI-NOFP16-NEXT: fadd s0, s0, s1
; CHECK-GI-NOFP16-NEXT: fcvt h0, s0
+; CHECK-GI-NOFP16-NEXT: fmov w9, s0
; CHECK-GI-NOFP16-NEXT: b.ne .LBB13_1
; CHECK-GI-NOFP16-NEXT: // %bb.2: // %exit
+; CHECK-GI-NOFP16-NEXT: // kill: def $h0 killed $h0 killed $s0
; CHECK-GI-NOFP16-NEXT: ret
;
; CHECK-GI-FP16-LABEL: fadd_reduction_v4f16_in_loop:
@@ -518,25 +521,28 @@ define half @fadd_reduction_v8f16_in_loop(ptr %ptr.start) {
;
; CHECK-GI-NOFP16-LABEL: fadd_reduction_v8f16_in_loop:
; CHECK-GI-NOFP16: // %bb.0: // %entry
-; CHECK-GI-NOFP16-NEXT: movi d0, #0000000000000000
; CHECK-GI-NOFP16-NEXT: mov x8, xzr
+; CHECK-GI-NOFP16-NEXT: mov w9, #0 // =0x0
; CHECK-GI-NOFP16-NEXT: .LBB14_1: // %loop
; CHECK-GI-NOFP16-NEXT: // =>This Inner Loop Header: Depth=1
-; CHECK-GI-NOFP16-NEXT: ldr q1, [x0, x8]
-; CHECK-GI-NOFP16-NEXT: fcvt s0, h0
+; CHECK-GI-NOFP16-NEXT: ldr q0, [x0, x8]
; CHECK-GI-NOFP16-NEXT: add x8, x8, #8
; CHECK-GI-NOFP16-NEXT: cmp w8, #56
-; CHECK-GI-NOFP16-NEXT: fcvtl v2.4s, v1.4h
-; CHECK-GI-NOFP16-NEXT: fcvtl2 v1.4s, v1.8h
-; CHECK-GI-NOFP16-NEXT: fadd v1.4s, v2.4s, v1.4s
-; CHECK-GI-NOFP16-NEXT: faddp v1.4s, v1.4s, v1.4s
-; CHECK-GI-NOFP16-NEXT: faddp s1, v1.2s
-; CHECK-GI-NOFP16-NEXT: fcvt h1, s1
+; CHECK-GI-NOFP16-NEXT: fcvtl v1.4s, v0.4h
+; CHECK-GI-NOFP16-NEXT: fcvtl2 v0.4s, v0.8h
+; CHECK-GI-NOFP16-NEXT: fadd v0.4s, v1.4s, v0.4s
+; CHECK-GI-NOFP16-NEXT: fmov s1, w9
; CHECK-GI-NOFP16-NEXT: fcvt s1, h1
-; CHECK-GI-NOFP16-NEXT: fadd s0, s1, s0
+; CHECK-GI-NOFP16-NEXT: faddp v0.4s, v0.4s, v0.4s
+; CHECK-GI-NOFP16-NEXT: faddp s0, v0.2s
+; CHECK-GI-NOFP16-NEXT: fcvt h0, s0
+; CHECK-GI-NOFP16-NEXT: fcvt s0, h0
+; CHECK-GI-NOFP16-NEXT: fadd s0, s0, s1
; CHECK-GI-NOFP16-NEXT: fcvt h0, s0
+; CHECK-GI-NOFP16-NEXT: fmov w9, s0
; CHECK-GI-NOFP16-NEXT: b.ne .LBB14_1
; CHECK-GI-NOFP16-NEXT: // %bb.2: // %exit
+; CHECK-GI-NOFP16-NEXT: // kill: def $h0 killed $h0 killed $s0
; CHECK-GI-NOFP16-NEXT: ret
;
; CHECK-GI-FP16-LABEL: fadd_reduction_v8f16_in_loop:
diff --git a/llvm/test/CodeGen/AArch64/vecreduce-fmul-strict.ll b/llvm/test/CodeGen/AArch64/vecreduce-fmul-strict.ll
index 716401e2ebafe..e1b21705c95f3 100644
--- a/llvm/test/CodeGen/AArch64/vecreduce-fmul-strict.ll
+++ b/llvm/test/CodeGen/AArch64/vecreduce-fmul-strict.ll
@@ -52,11 +52,17 @@ define half @mul_HalfH(<4 x half> %bin.rdx) {
;
; CHECK-GI-NOFP16-LABEL: mul_HalfH:
; CHECK-GI-NOFP16: // %bb.0:
+; CHECK-GI-NOFP16-NEXT: mov w8, #15360 // =0x3c00
; CHECK-GI-NOFP16-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-GI-NOFP16-NEXT: mov h1, v0.h[1]
; CHECK-GI-NOFP16-NEXT: fcvt s2, h0
+; CHECK-GI-NOFP16-NEXT: fmov s1, w8
; CHECK-GI-NOFP16-NEXT: fcvt s1, h1
-; CHECK-GI-NOFP16-NEXT: fmul s1, s2, s1
+; CHECK-GI-NOFP16-NEXT: fmul s1, s1, s2
+; CHECK-GI-NOFP16-NEXT: mov h2, v0.h[1]
+; CHECK-GI-NOFP16-NEXT: fcvt h1, s1
+; CHECK-GI-NOFP16-NEXT: fcvt s2, h2
+; CHECK-GI-NOFP16-NEXT: fcvt s1, h1
+; CHECK-GI-NOFP16-NEXT: fmul s1, s1, s2
; CHECK-GI-NOFP16-NEXT: mov h2, v0.h[2]
; CHECK-GI-NOFP16-NEXT: mov h0, v0.h[3]
; CHECK-GI-NOFP16-NEXT: fcvt h1, s1
@@ -138,10 +144,16 @@ define half @mul_H(<8 x half> %bin.rdx) {
;
; CHECK-GI-NOFP16-LABEL: mul_H:
; CHECK-GI-NOFP16: // %bb.0:
-; CHECK-GI-NOFP16-NEXT: mov h1, v0.h[1]
+; CHECK-GI-NOFP16-NEXT: mov w8, #15360 // =0x3c00
; CHECK-GI-NOFP16-NEXT: fcvt s2, h0
+; CHECK-GI-NOFP16-NEXT: fmov s1, w8
+; CHECK-GI-NOFP16-NEXT: fcvt s1, h1
+; CHECK-GI-NOFP16-NEXT: fmul s1, s1, s2
+; CHECK-GI-NOFP16-NEXT: mov h2, v0.h[1]
+; CHECK-GI-NOFP16-NEXT: fcvt h1, s1
+; CHECK-GI-NOFP16-NEXT: fcvt s2, h2
; CHECK-GI-NOFP16-NEXT: fcvt s1, h1
-; CHECK-GI-NOFP16-NEXT: fmul s1, s2, s1
+; CHECK-GI-NOFP16-NEXT: fmul s1, s1, s2
; CHECK-GI-NOFP16-NEXT: mov h2, v0.h[2]
; CHECK-GI-NOFP16-NEXT: fcvt h1, s1
; CHECK-GI-NOFP16-NEXT: fcvt s2, h2
@@ -309,10 +321,16 @@ define half @mul_2H(<16 x half> %bin.rdx) {
;
; CHECK-GI-NOFP16-LABEL: mul_2H:
; CHECK-GI-NOFP16: // %bb.0:
-; CHECK-GI-NOFP16-NEXT: mov h2, v0.h[1]
+; CHECK-GI-NOFP16-NEXT: mov w8, #15360 // =0x3c00
; CHECK-GI-NOFP16-NEXT: fcvt s3, h0
+; CHECK-GI-NOFP16-NEXT: fmov s2, w8
; CHECK-GI-NOFP16-NEXT: fcvt s2, h2
-; CHECK-GI-NOFP16-NEXT: fmul s2, s3, s2
+; CHECK-GI-NOFP16-NEXT: fmul s2, s2, s3
+; CHECK-GI-NOFP16-NEXT: mov h3, v0.h[1]
+; CHECK-GI-NOFP16-NEXT: fcvt h2, s2
+; CHECK-GI-NOFP16-NEXT: fcvt s3, h3
+; CHECK-GI-NOFP16-NEXT: fcvt s2, h2
+; CHECK-GI-NOFP16-NEXT: fmul s2, s2, s3
; CHECK-GI-NOFP16-NEXT: mov h3, v0.h[2]
; CHECK-GI-NOFP16-NEXT: fcvt h2, s2
; CHECK-GI-NOFP16-NEXT: fcvt s3, h3
More information about the llvm-commits
mailing list