[llvm-branch-commits] [llvm] AMDGPU: Change ABI of 16-bit scalar values for gfx6/gfx7 (PR #175795)
Matt Arsenault via llvm-branch-commits
llvm-branch-commits at lists.llvm.org
Tue Jan 13 12:09:43 PST 2026
https://github.com/arsenm updated https://github.com/llvm/llvm-project/pull/175795
>From e80256895aa2bff147512b185c89c672aef7b7df Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault at amd.com>
Date: Tue, 13 Jan 2026 14:57:43 +0100
Subject: [PATCH] AMDGPU: Change ABI of 16-bit scalar values for gfx6/gfx7
Keep bf16/f16 values encoded as the low half of a 32-bit register,
instead of promoting to float. This avoids unwanted FP effects
from the fpext/fptrunc, which should not be implied by just
passing an argument. This also fixes ABI divergence between
SelectionDAG and GlobalISel.
I've wanted to make this change for ages, but failed the last
few times I tried. The main complication was the hack to return
shader integer types in SGPRs, which now needs to inspect
the underlying IR type, since 16-bit FP values are now also
passed in 32-bit integer registers.
---
llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td | 11 +-
llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 3 +
llvm/test/CodeGen/AMDGPU/GlobalISel/fma.ll | 1 +
.../CodeGen/AMDGPU/amdgcn.bitcast.16bit.ll | 94 ++-
llvm/test/CodeGen/AMDGPU/bf16.ll | 677 ++++++---------
.../buffer-fat-pointer-atomicrmw-fadd.ll | 186 ++--
.../buffer-fat-pointer-atomicrmw-fmax.ll | 186 ++--
.../buffer-fat-pointer-atomicrmw-fmin.ll | 186 ++--
.../CodeGen/AMDGPU/call-argument-types.ll | 2 +-
.../CodeGen/AMDGPU/calling-conventions.ll | 22 +-
llvm/test/CodeGen/AMDGPU/clamp.ll | 10 +-
.../AMDGPU/constant-address-space-32bit.ll | 276 +++---
llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll | 101 +--
.../CodeGen/AMDGPU/dagcombine-fmul-sel.ll | 202 ++---
.../AMDGPU/divergence-driven-buildvector.ll | 5 +-
llvm/test/CodeGen/AMDGPU/fcanonicalize.f16.ll | 36 +-
llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll | 290 +++----
llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll | 254 +++---
llvm/test/CodeGen/AMDGPU/fcopysign.f32.ll | 18 +-
llvm/test/CodeGen/AMDGPU/fcopysign.f64.ll | 28 +-
llvm/test/CodeGen/AMDGPU/fdiv.f16.ll | 8 +-
.../CodeGen/AMDGPU/flat-atomicrmw-fadd.ll | 208 ++---
.../CodeGen/AMDGPU/flat-atomicrmw-fmax.ll | 210 ++---
.../CodeGen/AMDGPU/flat-atomicrmw-fmin.ll | 210 ++---
.../CodeGen/AMDGPU/flat-atomicrmw-fsub.ll | 208 ++---
llvm/test/CodeGen/AMDGPU/fmax3-maximumnum.ll | 32 +-
llvm/test/CodeGen/AMDGPU/fmax_legacy.f16.ll | 6 +-
.../test/CodeGen/AMDGPU/fmed3-cast-combine.ll | 185 ++--
llvm/test/CodeGen/AMDGPU/fmed3.bf16.ll | 4 +-
llvm/test/CodeGen/AMDGPU/fmed3.ll | 4 +-
llvm/test/CodeGen/AMDGPU/fmin3-minimumnum.ll | 32 +-
llvm/test/CodeGen/AMDGPU/fmin_legacy.f16.ll | 6 +-
llvm/test/CodeGen/AMDGPU/fneg-combines.f16.ll | 791 ++++++++----------
llvm/test/CodeGen/AMDGPU/fneg-combines.ll | 6 +-
llvm/test/CodeGen/AMDGPU/fneg-combines.new.ll | 131 ++-
.../CodeGen/AMDGPU/fneg-modifier-casting.ll | 28 +-
llvm/test/CodeGen/AMDGPU/fneg.ll | 1 +
llvm/test/CodeGen/AMDGPU/fpow.ll | 3 +-
llvm/test/CodeGen/AMDGPU/fract-match.ll | 56 +-
llvm/test/CodeGen/AMDGPU/function-args.ll | 76 +-
llvm/test/CodeGen/AMDGPU/function-returns.ll | 26 +-
.../CodeGen/AMDGPU/global-atomicrmw-fadd.ll | 228 ++---
.../CodeGen/AMDGPU/global-atomicrmw-fmax.ll | 232 ++---
.../CodeGen/AMDGPU/global-atomicrmw-fmin.ll | 232 ++---
.../CodeGen/AMDGPU/global-atomicrmw-fsub.ll | 228 ++---
llvm/test/CodeGen/AMDGPU/i1-to-bf16.ll | 4 +-
.../integer-canonicalizing-src-modifiers.ll | 4 -
.../llvm.amdgcn.raw.ptr.buffer.load.bf16.ll | 1 -
.../llvm.amdgcn.raw.ptr.buffer.store.bf16.ll | 2 -
llvm/test/CodeGen/AMDGPU/llvm.exp.ll | 77 +-
llvm/test/CodeGen/AMDGPU/llvm.exp10.ll | 77 +-
llvm/test/CodeGen/AMDGPU/llvm.exp2.ll | 85 +-
llvm/test/CodeGen/AMDGPU/llvm.frexp.ll | 38 +-
.../CodeGen/AMDGPU/llvm.is.fpclass.bf16.ll | 151 ++--
.../CodeGen/AMDGPU/llvm.is.fpclass.f16.ll | 181 ++--
llvm/test/CodeGen/AMDGPU/llvm.ldexp.ll | 94 +--
llvm/test/CodeGen/AMDGPU/llvm.log.ll | 88 +-
llvm/test/CodeGen/AMDGPU/llvm.log10.ll | 88 +-
llvm/test/CodeGen/AMDGPU/llvm.log2.ll | 83 +-
llvm/test/CodeGen/AMDGPU/llvm.maximum.f16.ll | 32 +-
llvm/test/CodeGen/AMDGPU/llvm.minnum.f16.ll | 4 +-
.../CodeGen/AMDGPU/local-atomicrmw-fadd.ll | 50 +-
.../CodeGen/AMDGPU/local-atomicrmw-fmax.ll | 52 +-
.../CodeGen/AMDGPU/local-atomicrmw-fmin.ll | 52 +-
.../CodeGen/AMDGPU/local-atomicrmw-fsub.ll | 50 +-
llvm/test/CodeGen/AMDGPU/mad-mix-hi.ll | 30 +-
llvm/test/CodeGen/AMDGPU/mad-mix-lo.ll | 23 +-
llvm/test/CodeGen/AMDGPU/mad-mix.ll | 96 ++-
llvm/test/CodeGen/AMDGPU/maximumnum.bf16.ll | 24 +-
llvm/test/CodeGen/AMDGPU/maximumnum.ll | 200 ++---
llvm/test/CodeGen/AMDGPU/minimumnum.bf16.ll | 24 +-
llvm/test/CodeGen/AMDGPU/minimumnum.ll | 174 ++--
llvm/test/CodeGen/AMDGPU/omod.ll | 9 +-
.../CodeGen/AMDGPU/private-memory-atomics.ll | 4 +-
llvm/test/CodeGen/AMDGPU/repeated-divisor.ll | 11 +-
llvm/test/CodeGen/AMDGPU/roundeven.ll | 4 +-
.../AMDGPU/select-fabs-fneg-extract.f16.ll | 265 +++---
.../AMDGPU/select-flags-to-fmin-fmax.ll | 24 +-
llvm/test/CodeGen/AMDGPU/strict_fpext.ll | 13 +-
llvm/test/CodeGen/AMDGPU/strict_fptrunc.ll | 4 +-
.../AMDGPU/strictfp_f16_abi_promote.ll | 16 +-
.../test/CodeGen/AMDGPU/vector-reduce-fadd.ll | 20 +-
.../test/CodeGen/AMDGPU/vector-reduce-fmax.ll | 5 +
.../CodeGen/AMDGPU/vector-reduce-fmaximum.ll | 5 +
.../test/CodeGen/AMDGPU/vector-reduce-fmin.ll | 5 +
.../CodeGen/AMDGPU/vector-reduce-fminimum.ll | 5 +
.../test/CodeGen/AMDGPU/vector-reduce-fmul.ll | 20 +-
87 files changed, 3576 insertions(+), 4357 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td b/llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td
index e891fdba4e03e..2932bbf0e7bbd 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td
@@ -14,6 +14,10 @@
class CCIfNotInReg<CCAction A> : CCIf<"!ArgFlags.isInReg()", A> {}
class CCIfExtend<CCAction A>
: CCIf<"ArgFlags.isSExt() || ArgFlags.isZExt()", A>;
+class CCIfOrigTypeShaderCCIsSGPR<CCAction A>
+ : CCIf<[{(!OrigTy->getScalarType()->isFloatTy() &&
+ !OrigTy->getScalarType()->isHalfTy()) }], A>;
+
// Calling convention for SI
def CC_SI_Gfx : CallingConv<[
@@ -56,14 +60,15 @@ def CC_SI_SHADER : CallingConv<[
>>>
]>;
+
def RetCC_SI_Shader : CallingConv<[
CCIfType<[i1, i16], CCIfExtend<CCPromoteToType<i32>>>,
- CCIfType<[i32, i16, v2i16] , CCAssignToReg<
+ CCIfType<[i32, i16, v2i16], CCIfOrigTypeShaderCCIsSGPR<CCAssignToReg<
!foreach(i, !range(0, 44), !cast<Register>("SGPR"#i)) // SGPR0-43
- >>,
+ >>>,
// 32*4 + 4 is the minimum for a fetch shader with 32 outputs.
- CCIfType<[f32, f16, v2f16, bf16, v2bf16] , CCAssignToReg<
+ CCIfType<[f32, f16, v2f16, bf16, v2bf16, i32, i16, v2i16] , CCAssignToReg<
!foreach(i, !range(0, 136), !cast<Register>("VGPR"#i)) // VGPR0-135
>>
]>;
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 49f5d514071e2..5dd3e929941eb 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -1121,6 +1121,9 @@ MVT SITargetLowering::getRegisterTypeForCallingConv(LLVMContext &Context,
return Size == 32 ? ScalarVT.getSimpleVT() : MVT::i32;
}
+ if (!Subtarget->has16BitInsts() && VT.getSizeInBits() == 16)
+ return MVT::i32;
+
if (VT.getSizeInBits() > 32)
return MVT::i32;
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fma.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fma.ll
index f48c72688533a..97fb83e0b6f45 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fma.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fma.ll
@@ -1664,6 +1664,7 @@ define amdgpu_ps <2 x half> @fma_v2s16_uniform(<2 x half> inreg %a, <2 x half> i
; GFX6-NEXT: s_and_b32 s1, 0xffff, s1
; GFX6-NEXT: s_lshl_b32 s1, s1, 16
; GFX6-NEXT: s_or_b32 s0, s0, s1
+; GFX6-NEXT: v_mov_b32_e32 v0, s0
; GFX6-NEXT: ; return to shader part epilog
;
; GFX8-LABEL: fma_v2s16_uniform:
diff --git a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.16bit.ll b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.16bit.ll
index 5344095e99217..ed44b1c0b294a 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.16bit.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgcn.bitcast.16bit.ll
@@ -15,22 +15,17 @@ define half @bitcast_i16_to_f16(i16 %a, i32 %b) {
; SI-NEXT: ; implicit-def: $vgpr0
; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc
; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
-; SI-NEXT: s_cbranch_execnz .LBB0_3
-; SI-NEXT: ; %bb.1: ; %Flow
-; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
-; SI-NEXT: s_cbranch_execnz .LBB0_4
-; SI-NEXT: .LBB0_2: ; %end
-; SI-NEXT: s_or_b64 exec, exec, s[4:5]
-; SI-NEXT: s_setpc_b64 s[30:31]
-; SI-NEXT: .LBB0_3: ; %cmp.false
+; SI-NEXT: ; %bb.1: ; %cmp.false
; SI-NEXT: v_cvt_f32_f16_e32 v0, v2
; SI-NEXT: ; implicit-def: $vgpr2
+; SI-NEXT: ; %bb.2: ; %Flow
; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
-; SI-NEXT: s_cbranch_execz .LBB0_2
-; SI-NEXT: .LBB0_4: ; %cmp.true
+; SI-NEXT: ; %bb.3: ; %cmp.true
; SI-NEXT: v_add_i32_e32 v0, vcc, 3, v2
; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
+; SI-NEXT: ; %bb.4: ; %end
; SI-NEXT: s_or_b64 exec, exec, s[4:5]
+; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
; SI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: bitcast_i16_to_f16:
@@ -125,6 +120,7 @@ define inreg half @bitcast_i16_to_f16_scalar(i16 inreg %a, i32 inreg %b) {
; SI-NEXT: s_add_i32 s6, s6, 3
; SI-NEXT: v_cvt_f32_f16_e32 v0, s6
; SI-NEXT: .LBB1_3: ; %end
+; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
; SI-NEXT: s_setpc_b64 s[30:31]
; SI-NEXT: .LBB1_4:
; SI-NEXT: ; implicit-def: $vgpr0
@@ -199,8 +195,9 @@ define i16 @bitcast_f16_to_i16(half %a, i32 %b) {
; SI-LABEL: bitcast_f16_to_i16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
+; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1
+; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc
; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
@@ -294,8 +291,9 @@ define inreg i16 @bitcast_f16_to_i16_scalar(half inreg %a, i32 inreg %b) {
; SI-LABEL: bitcast_f16_to_i16_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-NEXT: v_cvt_f16_f32_e32 v0, s16
+; SI-NEXT: v_cvt_f32_f16_e32 v0, s16
; SI-NEXT: s_cmp_lg_u32 s17, 0
+; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
; SI-NEXT: s_cbranch_scc0 .LBB3_4
; SI-NEXT: ; %bb.1: ; %cmp.false
; SI-NEXT: s_cbranch_execnz .LBB3_3
@@ -408,6 +406,8 @@ define bfloat @bitcast_i16_to_bf16(i16 %a, i32 %b) {
; SI-NEXT: v_add_i32_e32 v0, vcc, 0x30000, v0
; SI-NEXT: ; %bb.2: ; %end
; SI-NEXT: s_or_b64 exec, exec, s[4:5]
+; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; SI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: bitcast_i16_to_bf16:
@@ -502,7 +502,8 @@ define inreg bfloat @bitcast_i16_to_bf16_scalar(i16 inreg %a, i32 inreg %b) {
; SI-NEXT: s_lshl_b32 s4, s6, 16
; SI-NEXT: s_add_i32 s7, s4, 0x30000
; SI-NEXT: .LBB5_3: ; %end
-; SI-NEXT: v_mov_b32_e32 v0, s7
+; SI-NEXT: v_mul_f32_e64 v0, 1.0, s7
+; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; SI-NEXT: s_setpc_b64 s[30:31]
; SI-NEXT: .LBB5_4:
; SI-NEXT: ; implicit-def: $sgpr7
@@ -577,6 +578,7 @@ define i16 @bitcast_bf16_to_i16(bfloat %a, i32 %b) {
; SI-LABEL: bitcast_bf16_to_i16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1
; SI-NEXT: v_mul_f32_e32 v1, 1.0, v0
; SI-NEXT: ; implicit-def: $vgpr0
@@ -720,8 +722,9 @@ define inreg i16 @bitcast_bf16_to_i16_scalar(bfloat inreg %a, i32 inreg %b) {
; SI-LABEL: bitcast_bf16_to_i16_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT: s_lshl_b32 s4, s16, 16
; SI-NEXT: s_cmp_lg_u32 s17, 0
-; SI-NEXT: v_mul_f32_e64 v1, 1.0, s16
+; SI-NEXT: v_mul_f32_e64 v1, 1.0, s4
; SI-NEXT: s_cbranch_scc0 .LBB7_4
; SI-NEXT: ; %bb.1: ; %cmp.false
; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v1
@@ -835,29 +838,27 @@ define bfloat @bitcast_f16_to_bf16(half %a, i32 %b) {
; SI-LABEL: bitcast_f16_to_bf16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-NEXT: v_cvt_f16_f32_e32 v2, v0
+; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1
+; SI-NEXT: v_cvt_f16_f32_e32 v2, v0
; SI-NEXT: ; implicit-def: $vgpr0
; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc
; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
-; SI-NEXT: s_cbranch_execnz .LBB8_3
-; SI-NEXT: ; %bb.1: ; %Flow
-; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
-; SI-NEXT: s_cbranch_execnz .LBB8_4
-; SI-NEXT: .LBB8_2: ; %end
-; SI-NEXT: s_or_b64 exec, exec, s[4:5]
-; SI-NEXT: s_setpc_b64 s[30:31]
-; SI-NEXT: .LBB8_3: ; %cmp.false
+; SI-NEXT: ; %bb.1: ; %cmp.false
; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v2
; SI-NEXT: ; implicit-def: $vgpr2
+; SI-NEXT: ; %bb.2: ; %Flow
; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
-; SI-NEXT: s_cbranch_execz .LBB8_2
-; SI-NEXT: .LBB8_4: ; %cmp.true
+; SI-NEXT: s_cbranch_execz .LBB8_4
+; SI-NEXT: ; %bb.3: ; %cmp.true
; SI-NEXT: v_cvt_f32_f16_e32 v0, v2
; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0
; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; SI-NEXT: .LBB8_4: ; %end
; SI-NEXT: s_or_b64 exec, exec, s[4:5]
+; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; SI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: bitcast_f16_to_bf16:
@@ -942,21 +943,24 @@ define inreg bfloat @bitcast_f16_to_bf16_scalar(half inreg %a, i32 inreg %b) {
; SI-LABEL: bitcast_f16_to_bf16_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-NEXT: v_cvt_f16_f32_e32 v1, s16
+; SI-NEXT: v_cvt_f32_f16_e32 v0, s16
; SI-NEXT: s_cmp_lg_u32 s17, 0
+; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
; SI-NEXT: s_cbranch_scc0 .LBB9_4
; SI-NEXT: ; %bb.1: ; %cmp.false
-; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v1
+; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v0
; SI-NEXT: s_cbranch_execnz .LBB9_3
; SI-NEXT: .LBB9_2: ; %cmp.true
-; SI-NEXT: v_cvt_f32_f16_e32 v0, v1
+; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
; SI-NEXT: v_add_f32_e32 v0, 0x38000000, v0
; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
-; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v0
; SI-NEXT: .LBB9_3: ; %end
+; SI-NEXT: v_mul_f32_e32 v0, 1.0, v1
+; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; SI-NEXT: s_setpc_b64 s[30:31]
; SI-NEXT: .LBB9_4:
-; SI-NEXT: ; implicit-def: $vgpr0
+; SI-NEXT: ; implicit-def: $vgpr1
; SI-NEXT: s_branch .LBB9_2
;
; VI-LABEL: bitcast_f16_to_bf16_scalar:
@@ -1049,30 +1053,26 @@ define half @bitcast_bf16_to_f16(bfloat %a, i32 %b) {
; SI-LABEL: bitcast_bf16_to_f16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1
; SI-NEXT: v_mul_f32_e32 v1, 1.0, v0
; SI-NEXT: ; implicit-def: $vgpr0
; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc
; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
-; SI-NEXT: s_cbranch_execnz .LBB10_3
-; SI-NEXT: ; %bb.1: ; %Flow
-; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
-; SI-NEXT: s_cbranch_execnz .LBB10_4
-; SI-NEXT: .LBB10_2: ; %end
-; SI-NEXT: s_or_b64 exec, exec, s[4:5]
-; SI-NEXT: s_setpc_b64 s[30:31]
-; SI-NEXT: .LBB10_3: ; %cmp.false
+; SI-NEXT: ; %bb.1: ; %cmp.false
; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v1
; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
; SI-NEXT: ; implicit-def: $vgpr1
+; SI-NEXT: ; %bb.2: ; %Flow
; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
-; SI-NEXT: s_cbranch_execz .LBB10_2
-; SI-NEXT: .LBB10_4: ; %cmp.true
+; SI-NEXT: ; %bb.3: ; %cmp.true
; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v1
; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
+; SI-NEXT: ; %bb.4: ; %end
; SI-NEXT: s_or_b64 exec, exec, s[4:5]
+; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
; SI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: bitcast_bf16_to_f16:
@@ -1194,22 +1194,24 @@ define inreg half @bitcast_bf16_to_f16_scalar(bfloat inreg %a, i32 inreg %b) {
; SI-LABEL: bitcast_bf16_to_f16_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT: s_lshl_b32 s4, s16, 16
; SI-NEXT: s_cmp_lg_u32 s17, 0
-; SI-NEXT: v_mul_f32_e64 v1, 1.0, s16
+; SI-NEXT: v_mul_f32_e64 v0, 1.0, s4
; SI-NEXT: s_cbranch_scc0 .LBB11_4
; SI-NEXT: ; %bb.1: ; %cmp.false
-; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v1
-; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
; SI-NEXT: s_cbranch_execnz .LBB11_3
; SI-NEXT: .LBB11_2: ; %cmp.true
-; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v1
+; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; SI-NEXT: v_add_f32_e32 v0, 0x40c00000, v0
; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
+; SI-NEXT: v_cvt_f32_f16_e32 v1, v0
; SI-NEXT: .LBB11_3: ; %end
+; SI-NEXT: v_cvt_f16_f32_e32 v0, v1
; SI-NEXT: s_setpc_b64 s[30:31]
; SI-NEXT: .LBB11_4:
-; SI-NEXT: ; implicit-def: $vgpr0
+; SI-NEXT: ; implicit-def: $vgpr1
; SI-NEXT: s_branch .LBB11_2
;
; VI-LABEL: bitcast_bf16_to_f16_scalar:
diff --git a/llvm/test/CodeGen/AMDGPU/bf16.ll b/llvm/test/CodeGen/AMDGPU/bf16.ll
index daa771a843ee6..0394ed7f89633 100644
--- a/llvm/test/CodeGen/AMDGPU/bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU/bf16.ll
@@ -2280,12 +2280,10 @@ define void @test_arg_store(bfloat %in, ptr addrspace(1) %out) {
; GCN-LABEL: test_arg_store:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: s_mov_b32 s7, 0xf000
; GCN-NEXT: s_mov_b32 s6, 0
-; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GCN-NEXT: s_mov_b32 s7, 0xf000
; GCN-NEXT: s_mov_b32 s4, s6
; GCN-NEXT: s_mov_b32 s5, s6
-; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GCN-NEXT: buffer_store_short v0, v[1:2], s[4:7], 0 addr64
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; GCN-NEXT: s_setpc_b64 s[30:31]
@@ -2294,11 +2292,9 @@ define void @test_arg_store(bfloat %in, ptr addrspace(1) %out) {
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: s_mov_b32 s6, 0
-; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GFX7-NEXT: s_mov_b32 s7, 0xf000
; GFX7-NEXT: s_mov_b32 s4, s6
; GFX7-NEXT: s_mov_b32 s5, s6
-; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX7-NEXT: buffer_store_short v0, v[1:2], s[4:7], 0 addr64
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: s_setpc_b64 s[30:31]
@@ -2681,12 +2677,11 @@ define amdgpu_gfx void @test_inreg_arg_store(bfloat inreg %in, ptr addrspace(1)
; GCN-LABEL: test_inreg_arg_store:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: s_mov_b32 s39, 0xf000
; GCN-NEXT: s_mov_b32 s38, 0
-; GCN-NEXT: v_mul_f32_e64 v2, 1.0, s4
+; GCN-NEXT: s_mov_b32 s39, 0xf000
+; GCN-NEXT: v_mov_b32_e32 v2, s4
; GCN-NEXT: s_mov_b32 s36, s38
; GCN-NEXT: s_mov_b32 s37, s38
-; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2
; GCN-NEXT: buffer_store_short v2, v[0:1], s[36:39], 0 addr64
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; GCN-NEXT: s_setpc_b64 s[30:31]
@@ -2695,11 +2690,10 @@ define amdgpu_gfx void @test_inreg_arg_store(bfloat inreg %in, ptr addrspace(1)
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: s_mov_b32 s38, 0
-; GFX7-NEXT: v_mul_f32_e64 v2, 1.0, s4
; GFX7-NEXT: s_mov_b32 s39, 0xf000
; GFX7-NEXT: s_mov_b32 s36, s38
; GFX7-NEXT: s_mov_b32 s37, s38
-; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX7-NEXT: v_mov_b32_e32 v2, s4
; GFX7-NEXT: buffer_store_short v2, v[0:1], s[36:39], 0 addr64
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: s_setpc_b64 s[30:31]
@@ -2764,18 +2758,14 @@ define bfloat @test_byval(ptr addrspace(5) byval(bfloat) %bv, bfloat %val) {
; GCN-LABEL: test_byval:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v0
-; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GCN-NEXT: buffer_store_short v1, off, s[0:3], s32
+; GCN-NEXT: buffer_store_short v0, off, s[0:3], s32
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: test_byval:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v0
-; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX7-NEXT: buffer_store_short v1, off, s[0:3], s32
+; GFX7-NEXT: buffer_store_short v0, off, s[0:3], s32
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
@@ -2842,8 +2832,6 @@ define void @test_sret(ptr addrspace(5) sret(bfloat) %sret, bfloat %val) {
; GCN-LABEL: test_sret:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; GCN-NEXT: buffer_store_short v1, v0, s[0:3], 0 offen
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; GCN-NEXT: s_setpc_b64 s[30:31]
@@ -2851,8 +2839,6 @@ define void @test_sret(ptr addrspace(5) sret(bfloat) %sret, bfloat %val) {
; GFX7-LABEL: test_sret:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; GFX7-NEXT: buffer_store_short v1, v0, s[0:3], 0 offen
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: s_setpc_b64 s[30:31]
@@ -3327,8 +3313,6 @@ define void @test_call(bfloat %in, ptr addrspace(5) %out) {
; GCN-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GCN-NEXT: buffer_store_short v0, v1, s[0:3], 0 offen
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_readlane_b32 s31, v2, 1
@@ -3358,8 +3342,6 @@ define void @test_call(bfloat %in, ptr addrspace(5) %out) {
; GFX7-NEXT: v_writelane_b32 v2, s31, 1
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX7-NEXT: buffer_store_short v0, v1, s[0:3], 0 offen
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_readlane_b32 s31, v2, 1
@@ -5068,25 +5050,19 @@ define bfloat @test_alloca_load_store_ret(bfloat %in) {
; GCN-LABEL: test_alloca_load_store_ret:
; GCN: ; %bb.0: ; %entry
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GCN-NEXT: buffer_store_short v0, off, s[0:3], s32
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; GCN-NEXT: buffer_load_ushort v0, off, s[0:3], s32 glc
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: test_alloca_load_store_ret:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX7-NEXT: buffer_store_short v0, off, s[0:3], s32
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_load_ushort v0, off, s[0:3], s32 glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: test_alloca_load_store_ret:
@@ -5199,7 +5175,6 @@ define { <32 x i32>, bfloat } @test_overflow_stack(bfloat %a, <32 x i32> %b) {
; GCN-NEXT: buffer_store_dword v26, v29, s[0:3], 0 offen
; GCN-NEXT: s_waitcnt expcnt(0)
; GCN-NEXT: v_add_i32_e32 v26, vcc, 0x48, v0
-; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GCN-NEXT: v_add_i32_e32 v29, vcc, 0x44, v0
; GCN-NEXT: buffer_store_dword v25, v31, s[0:3], 0 offen
; GCN-NEXT: s_waitcnt expcnt(0)
@@ -5234,7 +5209,6 @@ define { <32 x i32>, bfloat } @test_overflow_stack(bfloat %a, <32 x i32> %b) {
; GCN-NEXT: v_add_i32_e32 v18, vcc, 8, v0
; GCN-NEXT: v_add_i32_e32 v25, vcc, 4, v0
; GCN-NEXT: v_add_i32_e32 v0, vcc, 0x80, v0
-; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; GCN-NEXT: buffer_store_dword v17, v31, s[0:3], 0 offen
; GCN-NEXT: buffer_store_dword v16, v2, s[0:3], 0 offen
; GCN-NEXT: buffer_store_dword v15, v24, s[0:3], 0 offen
@@ -5260,8 +5234,6 @@ define { <32 x i32>, bfloat } @test_overflow_stack(bfloat %a, <32 x i32> %b) {
; GFX7-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen
; GFX7-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:8
; GFX7-NEXT: v_add_i32_e32 v31, vcc, 0x7c, v0
-; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_store_dword v2, v31, s[0:3], 0 offen
; GFX7-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:4
@@ -9509,23 +9481,19 @@ define bfloat @v_fadd_bf16(bfloat %a, bfloat %b) {
; GCN-LABEL: v_fadd_bf16:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GCN-NEXT: v_add_f32_e32 v0, v0, v1
-; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: v_fadd_bf16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX7-NEXT: v_add_f32_e32 v0, v0, v1
-; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_fadd_bf16:
@@ -13684,19 +13652,17 @@ define bfloat @v_fadd_bf16_fpimm_0(bfloat %arg0) {
; GCN-LABEL: v_fadd_bf16_fpimm_0:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GCN-NEXT: v_add_f32_e32 v0, 1.0, v0
-; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: v_fadd_bf16_fpimm_0:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX7-NEXT: v_add_f32_e32 v0, 1.0, v0
-; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_fadd_bf16_fpimm_0:
@@ -13809,19 +13775,17 @@ define bfloat @v_fadd_bf16_fpimm_1(bfloat %arg0) {
; GCN-LABEL: v_fadd_bf16_fpimm_1:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GCN-NEXT: v_add_f32_e32 v0, 0x42280000, v0
-; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: v_fadd_bf16_fpimm_1:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX7-NEXT: v_add_f32_e32 v0, 0x42280000, v0
-; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_fadd_bf16_fpimm_1:
@@ -13934,23 +13898,19 @@ define bfloat @v_fsub_bf16(bfloat %a, bfloat %b) {
; GCN-LABEL: v_fsub_bf16:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GCN-NEXT: v_sub_f32_e32 v0, v0, v1
-; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: v_fsub_bf16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX7-NEXT: v_sub_f32_e32 v0, v0, v1
-; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_fsub_bf16:
@@ -14792,23 +14752,19 @@ define bfloat @v_fmul_bf16(bfloat %a, bfloat %b) {
; GCN-LABEL: v_fmul_bf16:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GCN-NEXT: v_mul_f32_e32 v0, v0, v1
-; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: v_fmul_bf16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX7-NEXT: v_mul_f32_e32 v0, v0, v1
-; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_fmul_bf16:
@@ -18964,10 +18920,8 @@ define bfloat @v_fdiv_bf16(bfloat %a, bfloat %b) {
; GCN-LABEL: v_fdiv_bf16:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GCN-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, v0
; GCN-NEXT: v_rcp_f32_e32 v3, v2
; GCN-NEXT: v_fma_f32 v4, -v2, v3, 1.0
@@ -18979,16 +18933,14 @@ define bfloat @v_fdiv_bf16(bfloat %a, bfloat %b) {
; GCN-NEXT: v_fma_f32 v2, -v2, v5, v4
; GCN-NEXT: v_div_fmas_f32 v2, v2, v3, v5
; GCN-NEXT: v_div_fixup_f32 v0, v2, v1, v0
-; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: v_fdiv_bf16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX7-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, v0
; GFX7-NEXT: v_rcp_f32_e32 v3, v2
; GFX7-NEXT: v_fma_f32 v4, -v2, v3, 1.0
@@ -19000,7 +18952,7 @@ define bfloat @v_fdiv_bf16(bfloat %a, bfloat %b) {
; GFX7-NEXT: v_fma_f32 v2, -v2, v5, v4
; GFX7-NEXT: v_div_fmas_f32 v2, v2, v3, v5
; GFX7-NEXT: v_div_fixup_f32 v0, v2, v1, v0
-; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_fdiv_bf16:
@@ -19227,15 +19179,13 @@ define bfloat @v_fabs_bf16(bfloat %a) {
; GCN-LABEL: v_fabs_bf16:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GCN-NEXT: v_and_b32_e32 v0, 0x7fff0000, v0
+; GCN-NEXT: v_and_b32_e32 v0, 0x7fff, v0
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: v_fabs_bf16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GFX7-NEXT: v_and_b32_e32 v0, 0x7fff0000, v0
+; GFX7-NEXT: v_and_b32_e32 v0, 0x7fff, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_fabs_bf16:
@@ -19288,16 +19238,12 @@ define bfloat @v_fabs_bf16(bfloat %a) {
define amdgpu_ps i32 @s_fabs_bf16(bfloat inreg %a) {
; GCN-LABEL: s_fabs_bf16:
; GCN: ; %bb.0:
-; GCN-NEXT: v_mul_f32_e64 v0, 1.0, s0
-; GCN-NEXT: v_bfe_u32 v0, v0, 16, 15
-; GCN-NEXT: v_readfirstlane_b32 s0, v0
+; GCN-NEXT: s_and_b32 s0, s0, 0x7fff
; GCN-NEXT: ; return to shader part epilog
;
; GFX7-LABEL: s_fabs_bf16:
; GFX7: ; %bb.0:
-; GFX7-NEXT: v_mul_f32_e64 v0, 1.0, s0
-; GFX7-NEXT: v_bfe_u32 v0, v0, 16, 15
-; GFX7-NEXT: v_readfirstlane_b32 s0, v0
+; GFX7-NEXT: s_and_b32 s0, s0, 0x7fff
; GFX7-NEXT: ; return to shader part epilog
;
; GFX8-LABEL: s_fabs_bf16:
@@ -19343,13 +19289,13 @@ define bfloat @v_fneg_bf16(bfloat %a) {
; GCN-LABEL: v_fneg_bf16:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_xor_b32_e32 v0, 0x80000000, v0
+; GCN-NEXT: v_xor_b32_e32 v0, 0xffff8000, v0
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: v_fneg_bf16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_xor_b32_e32 v0, 0x80000000, v0
+; GFX7-NEXT: v_xor_b32_e32 v0, 0xffff8000, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_fneg_bf16:
@@ -19405,16 +19351,14 @@ declare i32 @llvm.amdgcn.readfirstlane(i32)
define amdgpu_ps i32 @s_fneg_bf16(bfloat inreg %a) {
; GCN-LABEL: s_fneg_bf16:
; GCN: ; %bb.0:
-; GCN-NEXT: v_mul_f32_e64 v0, -1.0, s0
-; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GCN-NEXT: v_readfirstlane_b32 s0, v0
+; GCN-NEXT: s_xor_b32 s0, s0, 0x8000
+; GCN-NEXT: s_and_b32 s0, s0, 0xffff
; GCN-NEXT: ; return to shader part epilog
;
; GFX7-LABEL: s_fneg_bf16:
; GFX7: ; %bb.0:
-; GFX7-NEXT: v_mul_f32_e64 v0, -1.0, s0
-; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX7-NEXT: v_readfirstlane_b32 s0, v0
+; GFX7-NEXT: s_xor_b32 s0, s0, 0x8000
+; GFX7-NEXT: s_and_b32 s0, s0, 0xffff
; GFX7-NEXT: ; return to shader part epilog
;
; GFX8-LABEL: s_fneg_bf16:
@@ -19460,17 +19404,13 @@ define bfloat @v_fneg_fabs_bf16(bfloat %a) {
; GCN-LABEL: v_fneg_fabs_bf16:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GCN-NEXT: v_or_b32_e32 v0, 0x80000000, v0
+; GCN-NEXT: v_or_b32_e32 v0, 0xffff8000, v0
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: v_fneg_fabs_bf16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX7-NEXT: v_or_b32_e32 v0, 0x80000000, v0
+; GFX7-NEXT: v_or_b32_e32 v0, 0xffff8000, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_fneg_fabs_bf16:
@@ -19525,18 +19465,14 @@ define bfloat @v_fneg_fabs_bf16(bfloat %a) {
define amdgpu_ps i32 @s_fneg_fabs_bf16(bfloat inreg %a) {
; GCN-LABEL: s_fneg_fabs_bf16:
; GCN: ; %bb.0:
-; GCN-NEXT: v_mul_f32_e64 v0, 1.0, s0
-; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GCN-NEXT: v_or_b32_e32 v0, 0x8000, v0
-; GCN-NEXT: v_readfirstlane_b32 s0, v0
+; GCN-NEXT: s_bitset1_b32 s0, 15
+; GCN-NEXT: s_and_b32 s0, s0, 0xffff
; GCN-NEXT: ; return to shader part epilog
;
; GFX7-LABEL: s_fneg_fabs_bf16:
; GFX7: ; %bb.0:
-; GFX7-NEXT: v_mul_f32_e64 v0, 1.0, s0
-; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX7-NEXT: v_or_b32_e32 v0, 0x8000, v0
-; GFX7-NEXT: v_readfirstlane_b32 s0, v0
+; GFX7-NEXT: s_bitset1_b32 s0, 15
+; GFX7-NEXT: s_and_b32 s0, s0, 0xffff
; GFX7-NEXT: ; return to shader part epilog
;
; GFX8-LABEL: s_fneg_fabs_bf16:
@@ -19591,23 +19527,23 @@ define bfloat @v_minnum_bf16(bfloat %a, bfloat %b) {
; GCN-LABEL: v_minnum_bf16:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GCN-NEXT: v_min_f32_e32 v0, v0, v1
-; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: v_minnum_bf16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GFX7-NEXT: v_min_f32_e32 v0, v0, v1
-; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_minnum_bf16:
@@ -24035,23 +23971,23 @@ define bfloat @v_maxnum_bf16(bfloat %a, bfloat %b) {
; GCN-LABEL: v_maxnum_bf16:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GCN-NEXT: v_max_f32_e32 v0, v0, v1
-; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: v_maxnum_bf16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GFX7-NEXT: v_max_f32_e32 v0, v0, v1
-; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_maxnum_bf16:
@@ -28472,10 +28408,9 @@ define bfloat @v_sqrt_bf16(bfloat %a) {
; GCN-LABEL: v_sqrt_bf16:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GCN-NEXT: s_mov_b32 s4, 0xf800000
; GCN-NEXT: v_mov_b32_e32 v1, 0x260
-; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GCN-NEXT: v_mul_f32_e32 v2, 0x4f800000, v0
; GCN-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0
; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
@@ -28492,14 +28427,13 @@ define bfloat @v_sqrt_bf16(bfloat %a) {
; GCN-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc
; GCN-NEXT: v_cmp_class_f32_e32 vcc, v0, v1
; GCN-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
-; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: v_sqrt_bf16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX7-NEXT: s_mov_b32 s4, 0xf800000
; GFX7-NEXT: v_mul_f32_e32 v1, 0x4f800000, v0
; GFX7-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0
@@ -28518,7 +28452,7 @@ define bfloat @v_sqrt_bf16(bfloat %a) {
; GFX7-NEXT: v_mov_b32_e32 v2, 0x260
; GFX7-NEXT: v_cmp_class_f32_e32 vcc, v0, v2
; GFX7-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc
-; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_sqrt_bf16:
@@ -28737,10 +28671,9 @@ define bfloat @v_rsq_bf16(bfloat %x) {
; GCN-LABEL: v_rsq_bf16:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GCN-NEXT: s_mov_b32 s4, 0xf800000
; GCN-NEXT: v_mov_b32_e32 v1, 0x260
-; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GCN-NEXT: v_mul_f32_e32 v2, 0x4f800000, v0
; GCN-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0
; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
@@ -28769,14 +28702,13 @@ define bfloat @v_rsq_bf16(bfloat %x) {
; GCN-NEXT: v_fma_f32 v1, -v1, v4, v3
; GCN-NEXT: v_div_fmas_f32 v1, v1, v2, v4
; GCN-NEXT: v_div_fixup_f32 v0, v1, v0, 1.0
-; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: v_rsq_bf16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX7-NEXT: s_mov_b32 s4, 0xf800000
; GFX7-NEXT: v_mul_f32_e32 v1, 0x4f800000, v0
; GFX7-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0
@@ -28807,7 +28739,7 @@ define bfloat @v_rsq_bf16(bfloat %x) {
; GFX7-NEXT: v_fma_f32 v1, -v1, v4, v3
; GFX7-NEXT: v_div_fmas_f32 v1, v1, v2, v4
; GFX7-NEXT: v_div_fixup_f32 v0, v1, v0, 1.0
-; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_rsq_bf16:
@@ -29144,10 +29076,9 @@ define bfloat @v_neg_rsq_bf16(bfloat %x) {
; GCN-LABEL: v_neg_rsq_bf16:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GCN-NEXT: s_mov_b32 s4, 0xf800000
; GCN-NEXT: v_mov_b32_e32 v1, 0x260
-; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GCN-NEXT: v_mul_f32_e32 v2, 0x4f800000, v0
; GCN-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0
; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
@@ -29176,14 +29107,13 @@ define bfloat @v_neg_rsq_bf16(bfloat %x) {
; GCN-NEXT: v_fma_f32 v1, -v1, v4, v3
; GCN-NEXT: v_div_fmas_f32 v1, v1, v2, v4
; GCN-NEXT: v_div_fixup_f32 v0, v1, v0, -1.0
-; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: v_neg_rsq_bf16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX7-NEXT: s_mov_b32 s4, 0xf800000
; GFX7-NEXT: v_mul_f32_e32 v1, 0x4f800000, v0
; GFX7-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0
@@ -29214,7 +29144,7 @@ define bfloat @v_neg_rsq_bf16(bfloat %x) {
; GFX7-NEXT: v_fma_f32 v1, -v1, v4, v3
; GFX7-NEXT: v_div_fmas_f32 v1, v1, v2, v4
; GFX7-NEXT: v_div_fixup_f32 v0, v1, v0, -1.0
-; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_neg_rsq_bf16:
@@ -29559,19 +29489,17 @@ define bfloat @v_ldexp_bf16_i32(bfloat %a, i32 %b) {
; GCN-LABEL: v_ldexp_bf16_i32:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GCN-NEXT: v_ldexp_f32_e32 v0, v0, v1
-; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: v_ldexp_bf16_i32:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX7-NEXT: v_ldexp_f32_e32 v0, v0, v1
-; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_ldexp_bf16_i32:
@@ -29686,25 +29614,24 @@ define { bfloat, i16 } @v_frexp_bf16_i16(bfloat %a) {
; GCN-LABEL: v_frexp_bf16_i16:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GCN-NEXT: s_mov_b32 s4, 0x7f800000
-; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GCN-NEXT: v_frexp_mant_f32_e32 v1, v0
; GCN-NEXT: v_frexp_exp_i32_f32_e32 v2, v0
; GCN-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, s4
; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
+; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GCN-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc
-; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: v_frexp_bf16_i16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX7-NEXT: v_frexp_exp_i32_f32_e32 v1, v0
-; GFX7-NEXT: v_frexp_mant_f32_e32 v0, v0
-; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v0
+; GFX7-NEXT: v_frexp_mant_f32_e32 v0, v1
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX7-NEXT: v_frexp_exp_i32_f32_e32 v1, v1
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_frexp_bf16_i16:
@@ -29830,11 +29757,10 @@ define bfloat @v_log_bf16(bfloat %a) {
; GCN-LABEL: v_log_bf16:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GCN-NEXT: s_mov_b32 s4, 0x800000
; GCN-NEXT: s_mov_b32 s5, 0x7f800000
; GCN-NEXT: v_mov_b32_e32 v1, 0x41b17218
-; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GCN-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0
; GCN-NEXT: v_cndmask_b32_e64 v2, 0, 32, vcc
; GCN-NEXT: v_ldexp_f32_e32 v0, v0, v2
@@ -29852,14 +29778,13 @@ define bfloat @v_log_bf16(bfloat %a) {
; GCN-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[4:5]
; GCN-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
; GCN-NEXT: v_sub_f32_e32 v0, v0, v1
-; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: v_log_bf16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX7-NEXT: s_mov_b32 s4, 0x800000
; GFX7-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0
; GFX7-NEXT: v_cndmask_b32_e64 v1, 0, 32, vcc
@@ -29877,7 +29802,7 @@ define bfloat @v_log_bf16(bfloat %a) {
; GFX7-NEXT: v_mov_b32_e32 v1, 0x41b17218
; GFX7-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
; GFX7-NEXT: v_sub_f32_e32 v0, v0, v1
-; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_log_bf16:
@@ -30116,24 +30041,22 @@ define bfloat @v_log2_bf16(bfloat %a) {
; GCN-LABEL: v_log2_bf16:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GCN-NEXT: s_mov_b32 s4, 0x800000
; GCN-NEXT: v_mov_b32_e32 v1, 0x42000000
-; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GCN-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0
; GCN-NEXT: v_cndmask_b32_e64 v2, 0, 32, vcc
; GCN-NEXT: v_ldexp_f32_e32 v0, v0, v2
; GCN-NEXT: v_log_f32_e32 v0, v0
; GCN-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
; GCN-NEXT: v_sub_f32_e32 v0, v0, v1
-; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: v_log2_bf16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX7-NEXT: s_mov_b32 s4, 0x800000
; GFX7-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0
; GFX7-NEXT: v_cndmask_b32_e64 v1, 0, 32, vcc
@@ -30142,7 +30065,7 @@ define bfloat @v_log2_bf16(bfloat %a) {
; GFX7-NEXT: v_mov_b32_e32 v1, 0x42000000
; GFX7-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
; GFX7-NEXT: v_sub_f32_e32 v0, v0, v1
-; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_log2_bf16:
@@ -30289,11 +30212,10 @@ define bfloat @v_log10_bf16(bfloat %a) {
; GCN-LABEL: v_log10_bf16:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GCN-NEXT: s_mov_b32 s4, 0x800000
; GCN-NEXT: s_mov_b32 s5, 0x7f800000
; GCN-NEXT: v_mov_b32_e32 v1, 0x411a209b
-; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GCN-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0
; GCN-NEXT: v_cndmask_b32_e64 v2, 0, 32, vcc
; GCN-NEXT: v_ldexp_f32_e32 v0, v0, v2
@@ -30311,14 +30233,13 @@ define bfloat @v_log10_bf16(bfloat %a) {
; GCN-NEXT: v_cndmask_b32_e64 v0, v0, v2, s[4:5]
; GCN-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
; GCN-NEXT: v_sub_f32_e32 v0, v0, v1
-; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: v_log10_bf16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX7-NEXT: s_mov_b32 s4, 0x800000
; GFX7-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0
; GFX7-NEXT: v_cndmask_b32_e64 v1, 0, 32, vcc
@@ -30336,7 +30257,7 @@ define bfloat @v_log10_bf16(bfloat %a) {
; GFX7-NEXT: v_mov_b32_e32 v1, 0x411a209b
; GFX7-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
; GFX7-NEXT: v_sub_f32_e32 v0, v0, v1
-; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_log10_bf16:
@@ -30579,11 +30500,10 @@ define bfloat @v_exp_bf16(bfloat %a) {
; GCN-LABEL: v_exp_bf16:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GCN-NEXT: s_mov_b32 s4, 0xc2ce8ed0
; GCN-NEXT: s_mov_b32 s5, 0x42b17218
; GCN-NEXT: v_mov_b32_e32 v1, 0x7f800000
-; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GCN-NEXT: v_mul_f32_e32 v2, 0x3fb8a000, v0
; GCN-NEXT: v_sub_f32_e32 v3, v0, v0
; GCN-NEXT: v_mul_f32_e32 v4, 0x39a3b295, v0
@@ -30601,14 +30521,13 @@ define bfloat @v_exp_bf16(bfloat %a) {
; GCN-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
; GCN-NEXT: v_cmp_nlt_f32_e32 vcc, s5, v0
; GCN-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
-; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: v_exp_bf16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX7-NEXT: s_mov_b32 s4, 0x3fb8aa3b
; GFX7-NEXT: v_mul_f32_e32 v1, 0x3fb8aa3b, v0
; GFX7-NEXT: v_fma_f32 v2, v0, s4, -v1
@@ -30627,7 +30546,7 @@ define bfloat @v_exp_bf16(bfloat %a) {
; GFX7-NEXT: v_mov_b32_e32 v2, 0x7f800000
; GFX7-NEXT: v_cmp_nlt_f32_e32 vcc, s4, v0
; GFX7-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
-; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_exp_bf16:
@@ -30876,25 +30795,23 @@ define bfloat @v_exp2_bf16(bfloat %a) {
; GCN-LABEL: v_exp2_bf16:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GCN-NEXT: s_mov_b32 s4, 0xc2fc0000
; GCN-NEXT: v_mov_b32_e32 v1, 0x42800000
; GCN-NEXT: v_not_b32_e32 v2, 63
-; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GCN-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0
; GCN-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
; GCN-NEXT: v_add_f32_e32 v0, v0, v1
; GCN-NEXT: v_exp_f32_e32 v0, v0
; GCN-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc
; GCN-NEXT: v_ldexp_f32_e32 v0, v0, v1
-; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: v_exp2_bf16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX7-NEXT: s_mov_b32 s4, 0xc2fc0000
; GFX7-NEXT: v_mov_b32_e32 v1, 0x42800000
; GFX7-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0
@@ -30904,7 +30821,7 @@ define bfloat @v_exp2_bf16(bfloat %a) {
; GFX7-NEXT: v_not_b32_e32 v1, 63
; GFX7-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
; GFX7-NEXT: v_ldexp_f32_e32 v0, v0, v1
-; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_exp2_bf16:
@@ -31053,11 +30970,10 @@ define bfloat @v_exp10_bf16(bfloat %a) {
; GCN-LABEL: v_exp10_bf16:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GCN-NEXT: s_mov_b32 s4, 0xc23369f4
; GCN-NEXT: s_mov_b32 s5, 0x421a209b
; GCN-NEXT: v_mov_b32_e32 v1, 0x7f800000
-; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GCN-NEXT: v_mul_f32_e32 v2, 0x40549000, v0
; GCN-NEXT: v_sub_f32_e32 v3, v0, v0
; GCN-NEXT: v_mul_f32_e32 v4, 0x3a2784bc, v0
@@ -31075,14 +30991,13 @@ define bfloat @v_exp10_bf16(bfloat %a) {
; GCN-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
; GCN-NEXT: v_cmp_nlt_f32_e32 vcc, s5, v0
; GCN-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
-; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: v_exp10_bf16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX7-NEXT: s_mov_b32 s4, 0x40549a78
; GFX7-NEXT: v_mul_f32_e32 v1, 0x40549a78, v0
; GFX7-NEXT: v_fma_f32 v2, v0, s4, -v1
@@ -31101,7 +31016,7 @@ define bfloat @v_exp10_bf16(bfloat %a) {
; GFX7-NEXT: v_mov_b32_e32 v2, 0x7f800000
; GFX7-NEXT: v_cmp_nlt_f32_e32 vcc, s4, v0
; GFX7-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
-; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_exp10_bf16:
@@ -31352,19 +31267,17 @@ define bfloat @v_ceil_bf16(bfloat %a) {
; GCN-LABEL: v_ceil_bf16:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GCN-NEXT: v_ceil_f32_e32 v0, v0
-; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: v_ceil_bf16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX7-NEXT: v_ceil_f32_e32 v0, v0
-; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_ceil_bf16:
@@ -31479,19 +31392,17 @@ define bfloat @v_trunc_bf16(bfloat %a) {
; GCN-LABEL: v_trunc_bf16:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GCN-NEXT: v_trunc_f32_e32 v0, v0
-; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: v_trunc_bf16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX7-NEXT: v_trunc_f32_e32 v0, v0
-; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_trunc_bf16:
@@ -31606,19 +31517,17 @@ define bfloat @v_rint_bf16(bfloat %a) {
; GCN-LABEL: v_rint_bf16:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GCN-NEXT: v_rndne_f32_e32 v0, v0
-; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: v_rint_bf16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX7-NEXT: v_rndne_f32_e32 v0, v0
-; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_rint_bf16:
@@ -31733,19 +31642,17 @@ define bfloat @v_nearbyint_bf16(bfloat %a) {
; GCN-LABEL: v_nearbyint_bf16:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GCN-NEXT: v_rndne_f32_e32 v0, v0
-; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: v_nearbyint_bf16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX7-NEXT: v_rndne_f32_e32 v0, v0
-; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_nearbyint_bf16:
@@ -31860,8 +31767,7 @@ define bfloat @v_round_bf16(bfloat %a) {
; GCN-LABEL: v_round_bf16:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GCN-NEXT: v_trunc_f32_e32 v1, v0
; GCN-NEXT: v_sub_f32_e32 v2, v0, v1
; GCN-NEXT: v_cmp_ge_f32_e64 s[4:5], |v2|, 0.5
@@ -31869,14 +31775,13 @@ define bfloat @v_round_bf16(bfloat %a) {
; GCN-NEXT: s_brev_b32 s4, -2
; GCN-NEXT: v_bfi_b32 v0, s4, v2, v0
; GCN-NEXT: v_add_f32_e32 v0, v1, v0
-; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: v_round_bf16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX7-NEXT: v_trunc_f32_e32 v1, v0
; GFX7-NEXT: v_sub_f32_e32 v2, v0, v1
; GFX7-NEXT: v_cmp_ge_f32_e64 s[4:5], |v2|, 0.5
@@ -31884,7 +31789,7 @@ define bfloat @v_round_bf16(bfloat %a) {
; GFX7-NEXT: s_brev_b32 s4, -2
; GFX7-NEXT: v_bfi_b32 v0, s4, v2, v0
; GFX7-NEXT: v_add_f32_e32 v0, v1, v0
-; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_round_bf16:
@@ -32053,19 +31958,17 @@ define bfloat @v_roundeven_bf16(bfloat %a) {
; GCN-LABEL: v_roundeven_bf16:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GCN-NEXT: v_rndne_f32_e32 v0, v0
-; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: v_roundeven_bf16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX7-NEXT: v_rndne_f32_e32 v0, v0
-; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_roundeven_bf16:
@@ -32180,19 +32083,17 @@ define bfloat @v_floor_bf16(bfloat %a) {
; GCN-LABEL: v_floor_bf16:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GCN-NEXT: v_floor_f32_e32 v0, v0
-; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: v_floor_bf16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX7-NEXT: v_floor_f32_e32 v0, v0
-; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_floor_bf16:
@@ -32307,17 +32208,17 @@ define bfloat @v_canonicalize_bf16(bfloat %a) {
; GCN-LABEL: v_canonicalize_bf16:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: v_canonicalize_bf16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_canonicalize_bf16:
@@ -32485,10 +32386,8 @@ define i1 @v_fcmp_oeq_bf16(bfloat %a, bfloat %b) {
; GCN-LABEL: v_fcmp_oeq_bf16:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GCN-NEXT: v_cmp_eq_f32_e32 vcc, v0, v1
; GCN-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
; GCN-NEXT: s_setpc_b64 s[30:31]
@@ -32496,10 +32395,8 @@ define i1 @v_fcmp_oeq_bf16(bfloat %a, bfloat %b) {
; GFX7-LABEL: v_fcmp_oeq_bf16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX7-NEXT: v_cmp_eq_f32_e32 vcc, v0, v1
; GFX7-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
; GFX7-NEXT: s_setpc_b64 s[30:31]
@@ -32593,10 +32490,8 @@ define i1 @v_fcmp_ogt_bf16(bfloat %a, bfloat %b) {
; GCN-LABEL: v_fcmp_ogt_bf16:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GCN-NEXT: v_cmp_gt_f32_e32 vcc, v0, v1
; GCN-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
; GCN-NEXT: s_setpc_b64 s[30:31]
@@ -32604,10 +32499,8 @@ define i1 @v_fcmp_ogt_bf16(bfloat %a, bfloat %b) {
; GFX7-LABEL: v_fcmp_ogt_bf16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX7-NEXT: v_cmp_gt_f32_e32 vcc, v0, v1
; GFX7-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
; GFX7-NEXT: s_setpc_b64 s[30:31]
@@ -32701,10 +32594,8 @@ define i1 @v_fcmp_oge_bf16(bfloat %a, bfloat %b) {
; GCN-LABEL: v_fcmp_oge_bf16:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GCN-NEXT: v_cmp_ge_f32_e32 vcc, v0, v1
; GCN-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
; GCN-NEXT: s_setpc_b64 s[30:31]
@@ -32712,10 +32603,8 @@ define i1 @v_fcmp_oge_bf16(bfloat %a, bfloat %b) {
; GFX7-LABEL: v_fcmp_oge_bf16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX7-NEXT: v_cmp_ge_f32_e32 vcc, v0, v1
; GFX7-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
; GFX7-NEXT: s_setpc_b64 s[30:31]
@@ -32809,10 +32698,8 @@ define i1 @v_fcmp_olt_bf16(bfloat %a, bfloat %b) {
; GCN-LABEL: v_fcmp_olt_bf16:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GCN-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1
; GCN-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
; GCN-NEXT: s_setpc_b64 s[30:31]
@@ -32820,10 +32707,8 @@ define i1 @v_fcmp_olt_bf16(bfloat %a, bfloat %b) {
; GFX7-LABEL: v_fcmp_olt_bf16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX7-NEXT: v_cmp_lt_f32_e32 vcc, v0, v1
; GFX7-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
; GFX7-NEXT: s_setpc_b64 s[30:31]
@@ -32917,10 +32802,8 @@ define i1 @v_fcmp_ole_bf16(bfloat %a, bfloat %b) {
; GCN-LABEL: v_fcmp_ole_bf16:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GCN-NEXT: v_cmp_le_f32_e32 vcc, v0, v1
; GCN-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
; GCN-NEXT: s_setpc_b64 s[30:31]
@@ -32928,10 +32811,8 @@ define i1 @v_fcmp_ole_bf16(bfloat %a, bfloat %b) {
; GFX7-LABEL: v_fcmp_ole_bf16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX7-NEXT: v_cmp_le_f32_e32 vcc, v0, v1
; GFX7-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
; GFX7-NEXT: s_setpc_b64 s[30:31]
@@ -33025,10 +32906,8 @@ define i1 @v_fcmp_one_bf16(bfloat %a, bfloat %b) {
; GCN-LABEL: v_fcmp_one_bf16:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GCN-NEXT: v_cmp_lg_f32_e32 vcc, v0, v1
; GCN-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
; GCN-NEXT: s_setpc_b64 s[30:31]
@@ -33036,10 +32915,8 @@ define i1 @v_fcmp_one_bf16(bfloat %a, bfloat %b) {
; GFX7-LABEL: v_fcmp_one_bf16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX7-NEXT: v_cmp_lg_f32_e32 vcc, v0, v1
; GFX7-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
; GFX7-NEXT: s_setpc_b64 s[30:31]
@@ -33133,10 +33010,8 @@ define i1 @v_fcmp_uno_bf16(bfloat %a, bfloat %b) {
; GCN-LABEL: v_fcmp_uno_bf16:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GCN-NEXT: v_cmp_u_f32_e32 vcc, v0, v1
; GCN-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
; GCN-NEXT: s_setpc_b64 s[30:31]
@@ -33144,10 +33019,8 @@ define i1 @v_fcmp_uno_bf16(bfloat %a, bfloat %b) {
; GFX7-LABEL: v_fcmp_uno_bf16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX7-NEXT: v_cmp_u_f32_e32 vcc, v0, v1
; GFX7-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
; GFX7-NEXT: s_setpc_b64 s[30:31]
@@ -33241,10 +33114,8 @@ define i1 @v_fcmp_ueq_bf16(bfloat %a, bfloat %b) {
; GCN-LABEL: v_fcmp_ueq_bf16:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GCN-NEXT: v_cmp_nlg_f32_e32 vcc, v0, v1
; GCN-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
; GCN-NEXT: s_setpc_b64 s[30:31]
@@ -33252,10 +33123,8 @@ define i1 @v_fcmp_ueq_bf16(bfloat %a, bfloat %b) {
; GFX7-LABEL: v_fcmp_ueq_bf16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX7-NEXT: v_cmp_nlg_f32_e32 vcc, v0, v1
; GFX7-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
; GFX7-NEXT: s_setpc_b64 s[30:31]
@@ -33349,10 +33218,8 @@ define i1 @v_fcmp_ugt_bf16(bfloat %a, bfloat %b) {
; GCN-LABEL: v_fcmp_ugt_bf16:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GCN-NEXT: v_cmp_nle_f32_e32 vcc, v0, v1
; GCN-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
; GCN-NEXT: s_setpc_b64 s[30:31]
@@ -33360,10 +33227,8 @@ define i1 @v_fcmp_ugt_bf16(bfloat %a, bfloat %b) {
; GFX7-LABEL: v_fcmp_ugt_bf16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX7-NEXT: v_cmp_nle_f32_e32 vcc, v0, v1
; GFX7-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
; GFX7-NEXT: s_setpc_b64 s[30:31]
@@ -33457,10 +33322,8 @@ define i1 @v_fcmp_uge_bf16(bfloat %a, bfloat %b) {
; GCN-LABEL: v_fcmp_uge_bf16:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GCN-NEXT: v_cmp_nlt_f32_e32 vcc, v0, v1
; GCN-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
; GCN-NEXT: s_setpc_b64 s[30:31]
@@ -33468,10 +33331,8 @@ define i1 @v_fcmp_uge_bf16(bfloat %a, bfloat %b) {
; GFX7-LABEL: v_fcmp_uge_bf16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX7-NEXT: v_cmp_nlt_f32_e32 vcc, v0, v1
; GFX7-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
; GFX7-NEXT: s_setpc_b64 s[30:31]
@@ -33565,10 +33426,8 @@ define i1 @v_fcmp_ult_bf16(bfloat %a, bfloat %b) {
; GCN-LABEL: v_fcmp_ult_bf16:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GCN-NEXT: v_cmp_nge_f32_e32 vcc, v0, v1
; GCN-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
; GCN-NEXT: s_setpc_b64 s[30:31]
@@ -33576,10 +33435,8 @@ define i1 @v_fcmp_ult_bf16(bfloat %a, bfloat %b) {
; GFX7-LABEL: v_fcmp_ult_bf16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX7-NEXT: v_cmp_nge_f32_e32 vcc, v0, v1
; GFX7-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
; GFX7-NEXT: s_setpc_b64 s[30:31]
@@ -33673,10 +33530,8 @@ define i1 @v_fcmp_ule_bf16(bfloat %a, bfloat %b) {
; GCN-LABEL: v_fcmp_ule_bf16:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GCN-NEXT: v_cmp_ngt_f32_e32 vcc, v0, v1
; GCN-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
; GCN-NEXT: s_setpc_b64 s[30:31]
@@ -33684,10 +33539,8 @@ define i1 @v_fcmp_ule_bf16(bfloat %a, bfloat %b) {
; GFX7-LABEL: v_fcmp_ule_bf16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX7-NEXT: v_cmp_ngt_f32_e32 vcc, v0, v1
; GFX7-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
; GFX7-NEXT: s_setpc_b64 s[30:31]
@@ -33781,10 +33634,8 @@ define i1 @v_fcmp_une_bf16(bfloat %a, bfloat %b) {
; GCN-LABEL: v_fcmp_une_bf16:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GCN-NEXT: v_cmp_neq_f32_e32 vcc, v0, v1
; GCN-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
; GCN-NEXT: s_setpc_b64 s[30:31]
@@ -33792,10 +33643,8 @@ define i1 @v_fcmp_une_bf16(bfloat %a, bfloat %b) {
; GFX7-LABEL: v_fcmp_une_bf16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX7-NEXT: v_cmp_neq_f32_e32 vcc, v0, v1
; GFX7-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
; GFX7-NEXT: s_setpc_b64 s[30:31]
@@ -33936,16 +33785,14 @@ define i16 @v_fptosi_bf16_to_i16(bfloat %x) {
; GCN-LABEL: v_fptosi_bf16_to_i16:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GCN-NEXT: v_cvt_i32_f32_e32 v0, v0
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: v_fptosi_bf16_to_i16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX7-NEXT: v_cvt_i32_f32_e32 v0, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
@@ -34336,16 +34183,14 @@ define i32 @v_fptosi_bf16_to_i32(bfloat %x) {
; GCN-LABEL: v_fptosi_bf16_to_i32:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GCN-NEXT: v_cvt_i32_f32_e32 v0, v0
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: v_fptosi_bf16_to_i32:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX7-NEXT: v_cvt_i32_f32_e32 v0, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
@@ -34682,10 +34527,9 @@ define i64 @v_fptosi_bf16_to_i64(bfloat %x) {
; GCN-LABEL: v_fptosi_bf16_to_i64:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GCN-NEXT: s_mov_b32 s4, 0x2f800000
; GCN-NEXT: s_mov_b32 s5, 0xcf800000
-; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GCN-NEXT: v_trunc_f32_e32 v0, v0
; GCN-NEXT: v_mul_f32_e64 v1, |v0|, s4
; GCN-NEXT: v_ashrrev_i32_e32 v2, 31, v0
@@ -34702,8 +34546,7 @@ define i64 @v_fptosi_bf16_to_i64(bfloat %x) {
; GFX7-LABEL: v_fptosi_bf16_to_i64:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX7-NEXT: v_trunc_f32_e32 v0, v0
; GFX7-NEXT: s_mov_b32 s4, 0x2f800000
; GFX7-NEXT: v_mul_f32_e64 v1, |v0|, s4
@@ -35968,7 +35811,7 @@ define bfloat @v_sitofp_i16_to_bf16(i16 %x) {
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_bfe_i32 v0, v0, 0, 16
; GCN-NEXT: v_cvt_f32_i32_e32 v0, v0
-; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: v_sitofp_i16_to_bf16:
@@ -35976,7 +35819,7 @@ define bfloat @v_sitofp_i16_to_bf16(i16 %x) {
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_bfe_i32 v0, v0, 0, 16
; GFX7-NEXT: v_cvt_f32_i32_e32 v0, v0
-; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_sitofp_i16_to_bf16:
@@ -36699,14 +36542,14 @@ define bfloat @v_sitofp_i32_to_bf16(i32 %x) {
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_cvt_f32_i32_e32 v0, v0
-; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: v_sitofp_i32_to_bf16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_cvt_f32_i32_e32 v0, v0
-; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_sitofp_i32_to_bf16:
@@ -37372,7 +37215,7 @@ define bfloat @v_sitofp_i64_to_bf16(i64 %x) {
; GCN-NEXT: v_cvt_f32_i32_e32 v0, v0
; GCN-NEXT: v_sub_i32_e32 v1, vcc, 32, v2
; GCN-NEXT: v_ldexp_f32_e32 v0, v0, v1
-; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: v_sitofp_i64_to_bf16:
@@ -37390,7 +37233,7 @@ define bfloat @v_sitofp_i64_to_bf16(i64 %x) {
; GFX7-NEXT: v_cvt_f32_i32_e32 v0, v0
; GFX7-NEXT: v_sub_i32_e32 v1, vcc, 32, v2
; GFX7-NEXT: v_ldexp_f32_e32 v0, v0, v1
-; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_sitofp_i64_to_bf16:
@@ -39130,7 +38973,7 @@ define bfloat @v_uitofp_i16_to_bf16(i16 %x) {
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GCN-NEXT: v_cvt_f32_u32_e32 v0, v0
-; GCN-NEXT: v_and_b32_e32 v0, 0x7fff0000, v0
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: v_uitofp_i16_to_bf16:
@@ -39138,7 +38981,7 @@ define bfloat @v_uitofp_i16_to_bf16(i16 %x) {
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX7-NEXT: v_cvt_f32_u32_e32 v0, v0
-; GFX7-NEXT: v_and_b32_e32 v0, 0x7fff0000, v0
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_uitofp_i16_to_bf16:
@@ -39920,14 +39763,14 @@ define bfloat @v_uitofp_i32_to_bf16(i32 %x) {
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_cvt_f32_u32_e32 v0, v0
-; GCN-NEXT: v_and_b32_e32 v0, 0x7fff0000, v0
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: v_uitofp_i32_to_bf16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_cvt_f32_u32_e32 v0, v0
-; GFX7-NEXT: v_and_b32_e32 v0, 0x7fff0000, v0
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_uitofp_i32_to_bf16:
@@ -40589,7 +40432,7 @@ define bfloat @v_uitofp_i64_to_bf16(i64 %x) {
; GCN-NEXT: v_cvt_f32_u32_e32 v0, v0
; GCN-NEXT: v_sub_i32_e32 v1, vcc, 32, v2
; GCN-NEXT: v_ldexp_f32_e32 v0, v0, v1
-; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: v_uitofp_i64_to_bf16:
@@ -40603,7 +40446,7 @@ define bfloat @v_uitofp_i64_to_bf16(i64 %x) {
; GFX7-NEXT: v_cvt_f32_u32_e32 v0, v0
; GFX7-NEXT: v_sub_i32_e32 v1, vcc, 32, v2
; GFX7-NEXT: v_ldexp_f32_e32 v0, v0, v1
-; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_uitofp_i64_to_bf16:
@@ -41969,23 +41812,17 @@ define bfloat @v_select_bf16(i1 %cond, bfloat %a, bfloat %b) {
; GCN-LABEL: v_select_bf16:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GCN-NEXT: v_and_b32_e32 v0, 1, v0
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
; GCN-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
-; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: v_select_bf16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
; GFX7-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
-; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_select_bf16:
@@ -42066,23 +41903,19 @@ define bfloat @v_select_fneg_lhs_bf16(i1 %cond, bfloat %a, bfloat %b) {
; GCN-LABEL: v_select_fneg_lhs_bf16:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GCN-NEXT: v_and_b32_e32 v0, 1, v0
-; GCN-NEXT: v_mul_f32_e32 v1, -1.0, v1
+; GCN-NEXT: v_xor_b32_e32 v1, 0xffff8000, v1
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
; GCN-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
-; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: v_select_fneg_lhs_bf16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GFX7-NEXT: v_mul_f32_e32 v1, -1.0, v1
+; GFX7-NEXT: v_xor_b32_e32 v1, 0xffff8000, v1
; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
; GFX7-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
-; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_select_fneg_lhs_bf16:
@@ -42172,23 +42005,19 @@ define bfloat @v_select_fneg_rhs_bf16(i1 %cond, bfloat %a, bfloat %b) {
; GCN-LABEL: v_select_fneg_rhs_bf16:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GCN-NEXT: v_and_b32_e32 v0, 1, v0
-; GCN-NEXT: v_mul_f32_e32 v2, -1.0, v2
+; GCN-NEXT: v_xor_b32_e32 v2, 0xffff8000, v2
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
; GCN-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
-; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: v_select_fneg_rhs_bf16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; GFX7-NEXT: v_mul_f32_e32 v2, -1.0, v2
+; GFX7-NEXT: v_xor_b32_e32 v2, 0xffff8000, v2
; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
; GFX7-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
-; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_select_fneg_rhs_bf16:
@@ -42537,21 +42366,21 @@ define <2 x bfloat> @v_vselect_v2bf16(<2 x i1> %cond, <2 x bfloat> %a, <2 x bflo
define amdgpu_ps i32 @s_select_bf16(bfloat inreg %a, bfloat inreg %b, i32 %c) {
; GCN-LABEL: s_select_bf16:
; GCN: ; %bb.0:
-; GCN-NEXT: v_mul_f32_e64 v1, 1.0, s0
-; GCN-NEXT: v_mul_f32_e64 v2, 1.0, s1
+; GCN-NEXT: v_mov_b32_e32 v1, s1
+; GCN-NEXT: v_mov_b32_e32 v2, s0
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GCN-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
-; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
+; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GCN-NEXT: v_readfirstlane_b32 s0, v0
; GCN-NEXT: ; return to shader part epilog
;
; GFX7-LABEL: s_select_bf16:
; GFX7: ; %bb.0:
-; GFX7-NEXT: v_mul_f32_e64 v1, 1.0, s0
-; GFX7-NEXT: v_mul_f32_e64 v2, 1.0, s1
+; GFX7-NEXT: v_mov_b32_e32 v1, s1
+; GFX7-NEXT: v_mov_b32_e32 v2, s0
; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX7-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
-; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX7-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
+; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX7-NEXT: v_readfirstlane_b32 s0, v0
; GFX7-NEXT: ; return to shader part epilog
;
@@ -47346,27 +47175,21 @@ define bfloat @v_fma_bf16(bfloat %a, bfloat %b, bfloat %c) {
; GCN-LABEL: v_fma_bf16:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GCN-NEXT: v_fma_f32 v0, v0, v1, v2
-; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: v_fma_bf16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX7-NEXT: v_fma_f32 v0, v0, v1, v2
-; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_fma_bf16:
@@ -52328,31 +52151,25 @@ define bfloat @v_fmuladd_bf16(bfloat %a, bfloat %b, bfloat %c) {
; GCN-LABEL: v_fmuladd_bf16:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GCN-NEXT: v_mul_f32_e32 v0, v0, v1
; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v2
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v2
; GCN-NEXT: v_add_f32_e32 v0, v0, v1
-; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: v_fmuladd_bf16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
+; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX7-NEXT: v_mul_f32_e32 v0, v0, v1
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v2
+; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v2
; GFX7-NEXT: v_add_f32_e32 v0, v0, v1
-; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_fmuladd_bf16:
diff --git a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fadd.ll b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fadd.ll
index 3546141afe5bb..d8ef44361c40d 100644
--- a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fadd.ll
+++ b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fadd.ll
@@ -3672,13 +3672,14 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__amdgpu_no_fine_gr
; GFX7-NEXT: s_and_b32 s4, s20, -4
; GFX7-NEXT: v_mov_b32_e32 v4, s4
; GFX7-NEXT: buffer_load_dword v1, v4, s[16:19], 0 offen
-; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX7-NEXT: s_and_b32 s4, s20, 3
; GFX7-NEXT: s_lshl_b32 s6, s4, 3
; GFX7-NEXT: s_lshl_b32 s4, 0xffff, s6
-; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v0
+; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX7-NEXT: s_not_b32 s7, s4
; GFX7-NEXT: s_mov_b64 s[4:5], 0
+; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v0
; GFX7-NEXT: .LBB13_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0)
@@ -3702,7 +3703,6 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__amdgpu_no_fine_gr
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX7-NEXT: v_lshrrev_b32_e32 v0, s6, v2
-; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__amdgpu_no_fine_grained_memory:
@@ -3712,13 +3712,14 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__amdgpu_no_fine_gr
; GFX6-NEXT: s_and_b32 s4, s20, -4
; GFX6-NEXT: v_mov_b32_e32 v4, s4
; GFX6-NEXT: buffer_load_dword v1, v4, s[16:19], 0 offen
-; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX6-NEXT: s_and_b32 s4, s20, 3
; GFX6-NEXT: s_lshl_b32 s6, s4, 3
; GFX6-NEXT: s_lshl_b32 s4, 0xffff, s6
-; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v0
+; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-NEXT: s_not_b32 s7, s4
; GFX6-NEXT: s_mov_b64 s[4:5], 0
+; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v0
; GFX6-NEXT: .LBB13_1: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX6-NEXT: s_waitcnt vmcnt(0)
@@ -3743,7 +3744,6 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__amdgpu_no_fine_gr
; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX6-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX6-NEXT: v_lshrrev_b32_e32 v0, s6, v2
-; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr half, ptr addrspace(7) %ptr, i32 256
@@ -4100,13 +4100,14 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f16__offset__amdgpu_no_fine_
; GFX7-NEXT: s_and_b32 s4, s20, -4
; GFX7-NEXT: v_mov_b32_e32 v2, s4
; GFX7-NEXT: buffer_load_dword v1, v2, s[16:19], 0 offen
-; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX7-NEXT: s_and_b32 s4, s20, 3
; GFX7-NEXT: s_lshl_b32 s6, s4, 3
; GFX7-NEXT: s_lshl_b32 s4, 0xffff, s6
-; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v0
+; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX7-NEXT: s_not_b32 s7, s4
; GFX7-NEXT: s_mov_b64 s[4:5], 0
+; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v0
; GFX7-NEXT: .LBB14_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0)
@@ -4138,13 +4139,14 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_f16__offset__amdgpu_no_fine_
; GFX6-NEXT: s_and_b32 s4, s20, -4
; GFX6-NEXT: v_mov_b32_e32 v2, s4
; GFX6-NEXT: buffer_load_dword v1, v2, s[16:19], 0 offen
-; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX6-NEXT: s_and_b32 s4, s20, 3
; GFX6-NEXT: s_lshl_b32 s6, s4, 3
; GFX6-NEXT: s_lshl_b32 s4, 0xffff, s6
-; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v0
+; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-NEXT: s_not_b32 s7, s4
; GFX6-NEXT: s_mov_b64 s[4:5], 0
+; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v0
; GFX6-NEXT: .LBB14_1: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX6-NEXT: s_waitcnt vmcnt(0)
@@ -4830,11 +4832,12 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall__amdgpu
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0x200, v4
-; GFX7-NEXT: v_and_b32_e32 v8, -4, v4
+; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v5
+; GFX7-NEXT: v_and_b32_e32 v9, -4, v4
; GFX7-NEXT: v_and_b32_e32 v4, 3, v4
-; GFX7-NEXT: v_lshlrev_b32_e32 v7, 3, v4
-; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v7
-; GFX7-NEXT: v_not_b32_e32 v9, v4
+; GFX7-NEXT: v_lshlrev_b32_e32 v8, 3, v4
+; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v8
+; GFX7-NEXT: v_not_b32_e32 v10, v4
; GFX7-NEXT: s_mov_b64 s[6:7], exec
; GFX7-NEXT: .LBB15_1: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: v_readfirstlane_b32 s8, v0
@@ -4845,28 +4848,28 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall__amdgpu
; GFX7-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3]
; GFX7-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
; GFX7-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
-; GFX7-NEXT: buffer_load_dword v6, v8, s[8:11], 0 offen
+; GFX7-NEXT: buffer_load_dword v5, v9, s[8:11], 0 offen
; GFX7-NEXT: s_xor_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_cbranch_execnz .LBB15_1
; GFX7-NEXT: ; %bb.2:
; GFX7-NEXT: s_mov_b64 exec, s[6:7]
-; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v5
+; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v6
; GFX7-NEXT: s_mov_b64 s[6:7], 0
-; GFX7-NEXT: v_cvt_f32_f16_e32 v10, v4
+; GFX7-NEXT: v_cvt_f32_f16_e32 v11, v4
; GFX7-NEXT: .LBB15_3: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Loop Header: Depth=1
; GFX7-NEXT: ; Child Loop BB15_4 Depth 2
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_lshrrev_b32_e32 v4, v7, v6
+; GFX7-NEXT: v_lshrrev_b32_e32 v4, v8, v5
; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4
-; GFX7-NEXT: v_and_b32_e32 v5, v6, v9
+; GFX7-NEXT: v_and_b32_e32 v6, v5, v10
; GFX7-NEXT: s_mov_b64 s[12:13], exec
-; GFX7-NEXT: v_add_f32_e32 v4, v4, v10
+; GFX7-NEXT: v_add_f32_e32 v4, v4, v11
; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4
-; GFX7-NEXT: v_lshlrev_b32_e32 v4, v7, v4
-; GFX7-NEXT: v_or_b32_e32 v5, v5, v4
-; GFX7-NEXT: v_mov_b32_e32 v4, v5
-; GFX7-NEXT: v_mov_b32_e32 v5, v6
+; GFX7-NEXT: v_lshlrev_b32_e32 v4, v8, v4
+; GFX7-NEXT: v_or_b32_e32 v4, v6, v4
+; GFX7-NEXT: v_mov_b32_e32 v7, v5
+; GFX7-NEXT: v_mov_b32_e32 v6, v4
; GFX7-NEXT: .LBB15_4: ; Parent Loop BB15_3 Depth=1
; GFX7-NEXT: ; => This Inner Loop Header: Depth=2
; GFX7-NEXT: v_readfirstlane_b32 s8, v0
@@ -4878,33 +4881,33 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall__amdgpu
; GFX7-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
; GFX7-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc
+; GFX7-NEXT: buffer_atomic_cmpswap v[6:7], v9, s[8:11], 0 offen glc
; GFX7-NEXT: s_xor_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_cbranch_execnz .LBB15_4
; GFX7-NEXT: ; %bb.5: ; in Loop: Header=BB15_3 Depth=1
; GFX7-NEXT: s_mov_b64 exec, s[12:13]
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5
; GFX7-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GFX7-NEXT: v_mov_b32_e32 v6, v4
+; GFX7-NEXT: v_mov_b32_e32 v5, v6
; GFX7-NEXT: buffer_wbinvl1
; GFX7-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX7-NEXT: s_cbranch_execnz .LBB15_3
; GFX7-NEXT: ; %bb.6: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[6:7]
-; GFX7-NEXT: v_lshrrev_b32_e32 v0, v7, v4
-; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, v8, v6
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT: v_add_i32_e32 v4, vcc, 0x200, v4
-; GFX6-NEXT: v_and_b32_e32 v8, -4, v4
+; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v5
+; GFX6-NEXT: v_and_b32_e32 v9, -4, v4
; GFX6-NEXT: v_and_b32_e32 v4, 3, v4
-; GFX6-NEXT: v_lshlrev_b32_e32 v7, 3, v4
-; GFX6-NEXT: v_lshl_b32_e32 v4, 0xffff, v7
-; GFX6-NEXT: v_not_b32_e32 v9, v4
+; GFX6-NEXT: v_lshlrev_b32_e32 v8, 3, v4
+; GFX6-NEXT: v_lshl_b32_e32 v4, 0xffff, v8
+; GFX6-NEXT: v_not_b32_e32 v10, v4
; GFX6-NEXT: s_mov_b64 s[6:7], exec
; GFX6-NEXT: .LBB15_1: ; =>This Inner Loop Header: Depth=1
; GFX6-NEXT: v_readfirstlane_b32 s8, v0
@@ -4915,28 +4918,29 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall__amdgpu
; GFX6-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3]
; GFX6-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
; GFX6-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
-; GFX6-NEXT: buffer_load_dword v6, v8, s[8:11], 0 offen
+; GFX6-NEXT: buffer_load_dword v5, v9, s[8:11], 0 offen
; GFX6-NEXT: s_xor_b64 exec, exec, s[4:5]
; GFX6-NEXT: s_cbranch_execnz .LBB15_1
; GFX6-NEXT: ; %bb.2:
; GFX6-NEXT: s_mov_b64 exec, s[6:7]
-; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v5
+; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v6
; GFX6-NEXT: s_mov_b64 s[6:7], 0
-; GFX6-NEXT: v_cvt_f32_f16_e32 v10, v4
+; GFX6-NEXT: v_cvt_f32_f16_e32 v11, v4
; GFX6-NEXT: .LBB15_3: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Loop Header: Depth=1
; GFX6-NEXT: ; Child Loop BB15_4 Depth 2
-; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0)
-; GFX6-NEXT: v_lshrrev_b32_e32 v4, v7, v6
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_lshrrev_b32_e32 v4, v8, v5
; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4
-; GFX6-NEXT: v_and_b32_e32 v5, v6, v9
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: v_and_b32_e32 v6, v5, v10
; GFX6-NEXT: s_mov_b64 s[12:13], exec
-; GFX6-NEXT: v_add_f32_e32 v4, v4, v10
+; GFX6-NEXT: v_add_f32_e32 v4, v4, v11
; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4
-; GFX6-NEXT: v_lshlrev_b32_e32 v4, v7, v4
-; GFX6-NEXT: v_or_b32_e32 v5, v5, v4
-; GFX6-NEXT: v_mov_b32_e32 v4, v5
-; GFX6-NEXT: v_mov_b32_e32 v5, v6
+; GFX6-NEXT: v_lshlrev_b32_e32 v4, v8, v4
+; GFX6-NEXT: v_or_b32_e32 v4, v6, v4
+; GFX6-NEXT: v_mov_b32_e32 v7, v5
+; GFX6-NEXT: v_mov_b32_e32 v6, v4
; GFX6-NEXT: .LBB15_4: ; Parent Loop BB15_3 Depth=1
; GFX6-NEXT: ; => This Inner Loop Header: Depth=2
; GFX6-NEXT: v_readfirstlane_b32 s8, v0
@@ -4948,22 +4952,21 @@ define half @buffer_fat_ptr_agent_atomic_fadd_ret_f16__offset__waterfall__amdgpu
; GFX6-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
; GFX6-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0)
-; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc
+; GFX6-NEXT: buffer_atomic_cmpswap v[6:7], v9, s[8:11], 0 offen glc
; GFX6-NEXT: s_xor_b64 exec, exec, s[4:5]
; GFX6-NEXT: s_cbranch_execnz .LBB15_4
; GFX6-NEXT: ; %bb.5: ; in Loop: Header=BB15_3 Depth=1
; GFX6-NEXT: s_mov_b64 exec, s[12:13]
; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6
+; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5
; GFX6-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GFX6-NEXT: v_mov_b32_e32 v6, v4
+; GFX6-NEXT: v_mov_b32_e32 v5, v6
; GFX6-NEXT: buffer_wbinvl1
; GFX6-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX6-NEXT: s_cbranch_execnz .LBB15_3
; GFX6-NEXT: ; %bb.6: ; %atomicrmw.end
; GFX6-NEXT: s_or_b64 exec, exec, s[6:7]
-; GFX6-NEXT: v_lshrrev_b32_e32 v0, v7, v4
-; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX6-NEXT: v_lshrrev_b32_e32 v0, v8, v6
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr half, ptr addrspace(7) %ptr, i32 256
@@ -5416,6 +5419,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__amdgpu_no_fine
; GFX7-NEXT: v_mov_b32_e32 v4, s4
; GFX7-NEXT: buffer_load_dword v1, v4, s[16:19], 0 offen
; GFX7-NEXT: s_and_b32 s4, s20, 3
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX7-NEXT: s_lshl_b32 s6, s4, 3
; GFX7-NEXT: s_lshl_b32 s4, 0xffff, s6
; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
@@ -5445,7 +5449,6 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__amdgpu_no_fine
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX7-NEXT: v_lshrrev_b32_e32 v0, s6, v2
-; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__amdgpu_no_fine_grained_memory:
@@ -5456,6 +5459,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__amdgpu_no_fine
; GFX6-NEXT: v_mov_b32_e32 v4, s4
; GFX6-NEXT: buffer_load_dword v1, v4, s[16:19], 0 offen
; GFX6-NEXT: s_and_b32 s4, s20, 3
+; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX6-NEXT: s_lshl_b32 s6, s4, 3
; GFX6-NEXT: s_lshl_b32 s4, 0xffff, s6
; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0
@@ -5486,7 +5490,6 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__amdgpu_no_fine
; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX6-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX6-NEXT: v_lshrrev_b32_e32 v0, s6, v2
-; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr bfloat, ptr addrspace(7) %ptr, i32 256
@@ -5926,6 +5929,7 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_bf16__offset__amdgpu_no_fine
; GFX7-NEXT: v_mov_b32_e32 v2, s4
; GFX7-NEXT: buffer_load_dword v1, v2, s[16:19], 0 offen
; GFX7-NEXT: s_and_b32 s4, s20, 3
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX7-NEXT: s_lshl_b32 s6, s4, 3
; GFX7-NEXT: s_lshl_b32 s4, 0xffff, s6
; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
@@ -5964,6 +5968,7 @@ define void @buffer_fat_ptr_agent_atomic_fadd_noret_bf16__offset__amdgpu_no_fine
; GFX6-NEXT: v_mov_b32_e32 v2, s4
; GFX6-NEXT: buffer_load_dword v1, v2, s[16:19], 0 offen
; GFX6-NEXT: s_and_b32 s4, s20, 3
+; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX6-NEXT: s_lshl_b32 s6, s4, 3
; GFX6-NEXT: s_lshl_b32 s4, 0xffff, s6
; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0
@@ -6738,11 +6743,12 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__waterfall__amd
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0x200, v4
-; GFX7-NEXT: v_and_b32_e32 v8, -4, v4
+; GFX7-NEXT: v_and_b32_e32 v9, -4, v4
; GFX7-NEXT: v_and_b32_e32 v4, 3, v4
-; GFX7-NEXT: v_lshlrev_b32_e32 v7, 3, v4
-; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v7
-; GFX7-NEXT: v_not_b32_e32 v9, v4
+; GFX7-NEXT: v_lshlrev_b32_e32 v8, 3, v4
+; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v8
+; GFX7-NEXT: v_lshlrev_b32_e32 v6, 16, v5
+; GFX7-NEXT: v_not_b32_e32 v10, v4
; GFX7-NEXT: s_mov_b64 s[6:7], exec
; GFX7-NEXT: .LBB18_1: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: v_readfirstlane_b32 s8, v0
@@ -6753,28 +6759,28 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__waterfall__amd
; GFX7-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3]
; GFX7-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
; GFX7-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
-; GFX7-NEXT: buffer_load_dword v6, v8, s[8:11], 0 offen
+; GFX7-NEXT: buffer_load_dword v5, v9, s[8:11], 0 offen
; GFX7-NEXT: s_xor_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_cbranch_execnz .LBB18_1
; GFX7-NEXT: ; %bb.2:
; GFX7-NEXT: s_mov_b64 exec, s[6:7]
-; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v5
+; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v6
; GFX7-NEXT: s_mov_b64 s[6:7], 0
-; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v4
+; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v4
; GFX7-NEXT: .LBB18_3: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Loop Header: Depth=1
; GFX7-NEXT: ; Child Loop BB18_4 Depth 2
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_lshrrev_b32_e32 v4, v7, v6
+; GFX7-NEXT: v_lshrrev_b32_e32 v4, v8, v5
; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4
-; GFX7-NEXT: v_add_f32_e32 v4, v4, v10
+; GFX7-NEXT: v_add_f32_e32 v4, v4, v11
; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4
-; GFX7-NEXT: v_lshlrev_b32_e32 v4, v7, v4
-; GFX7-NEXT: v_and_b32_e32 v5, v6, v9
-; GFX7-NEXT: v_or_b32_e32 v5, v5, v4
-; GFX7-NEXT: v_mov_b32_e32 v4, v5
+; GFX7-NEXT: v_lshlrev_b32_e32 v4, v8, v4
+; GFX7-NEXT: v_and_b32_e32 v6, v5, v10
+; GFX7-NEXT: v_or_b32_e32 v4, v6, v4
+; GFX7-NEXT: v_mov_b32_e32 v7, v5
; GFX7-NEXT: s_mov_b64 s[12:13], exec
-; GFX7-NEXT: v_mov_b32_e32 v5, v6
+; GFX7-NEXT: v_mov_b32_e32 v6, v4
; GFX7-NEXT: .LBB18_4: ; Parent Loop BB18_3 Depth=1
; GFX7-NEXT: ; => This Inner Loop Header: Depth=2
; GFX7-NEXT: v_readfirstlane_b32 s8, v0
@@ -6786,33 +6792,33 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__waterfall__amd
; GFX7-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
; GFX7-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc
+; GFX7-NEXT: buffer_atomic_cmpswap v[6:7], v9, s[8:11], 0 offen glc
; GFX7-NEXT: s_xor_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_cbranch_execnz .LBB18_4
; GFX7-NEXT: ; %bb.5: ; in Loop: Header=BB18_3 Depth=1
; GFX7-NEXT: s_mov_b64 exec, s[12:13]
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5
; GFX7-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GFX7-NEXT: v_mov_b32_e32 v6, v4
+; GFX7-NEXT: v_mov_b32_e32 v5, v6
; GFX7-NEXT: buffer_wbinvl1
; GFX7-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX7-NEXT: s_cbranch_execnz .LBB18_3
; GFX7-NEXT: ; %bb.6: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[6:7]
-; GFX7-NEXT: v_lshrrev_b32_e32 v0, v7, v4
-; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, v8, v6
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__waterfall__amdgpu_no_fine_grained_memory:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT: v_add_i32_e32 v4, vcc, 0x200, v4
-; GFX6-NEXT: v_and_b32_e32 v8, -4, v4
+; GFX6-NEXT: v_and_b32_e32 v9, -4, v4
; GFX6-NEXT: v_and_b32_e32 v4, 3, v4
-; GFX6-NEXT: v_lshlrev_b32_e32 v7, 3, v4
-; GFX6-NEXT: v_lshl_b32_e32 v4, 0xffff, v7
-; GFX6-NEXT: v_not_b32_e32 v9, v4
+; GFX6-NEXT: v_lshlrev_b32_e32 v8, 3, v4
+; GFX6-NEXT: v_lshl_b32_e32 v4, 0xffff, v8
+; GFX6-NEXT: v_lshlrev_b32_e32 v6, 16, v5
+; GFX6-NEXT: v_not_b32_e32 v10, v4
; GFX6-NEXT: s_mov_b64 s[6:7], exec
; GFX6-NEXT: .LBB18_1: ; =>This Inner Loop Header: Depth=1
; GFX6-NEXT: v_readfirstlane_b32 s8, v0
@@ -6823,28 +6829,29 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__waterfall__amd
; GFX6-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3]
; GFX6-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
; GFX6-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
-; GFX6-NEXT: buffer_load_dword v6, v8, s[8:11], 0 offen
+; GFX6-NEXT: buffer_load_dword v5, v9, s[8:11], 0 offen
; GFX6-NEXT: s_xor_b64 exec, exec, s[4:5]
; GFX6-NEXT: s_cbranch_execnz .LBB18_1
; GFX6-NEXT: ; %bb.2:
; GFX6-NEXT: s_mov_b64 exec, s[6:7]
-; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v5
+; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v6
; GFX6-NEXT: s_mov_b64 s[6:7], 0
-; GFX6-NEXT: v_and_b32_e32 v10, 0xffff0000, v4
+; GFX6-NEXT: v_and_b32_e32 v11, 0xffff0000, v4
; GFX6-NEXT: .LBB18_3: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Loop Header: Depth=1
; GFX6-NEXT: ; Child Loop BB18_4 Depth 2
-; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0)
-; GFX6-NEXT: v_lshrrev_b32_e32 v4, v7, v6
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_lshrrev_b32_e32 v4, v8, v5
; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4
-; GFX6-NEXT: v_add_f32_e32 v4, v4, v10
+; GFX6-NEXT: v_add_f32_e32 v4, v4, v11
; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v4
-; GFX6-NEXT: v_lshlrev_b32_e32 v4, v7, v4
-; GFX6-NEXT: v_and_b32_e32 v5, v6, v9
-; GFX6-NEXT: v_or_b32_e32 v5, v5, v4
-; GFX6-NEXT: v_mov_b32_e32 v4, v5
+; GFX6-NEXT: v_lshlrev_b32_e32 v4, v8, v4
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: v_and_b32_e32 v6, v5, v10
+; GFX6-NEXT: v_or_b32_e32 v4, v6, v4
+; GFX6-NEXT: v_mov_b32_e32 v7, v5
; GFX6-NEXT: s_mov_b64 s[12:13], exec
-; GFX6-NEXT: v_mov_b32_e32 v5, v6
+; GFX6-NEXT: v_mov_b32_e32 v6, v4
; GFX6-NEXT: .LBB18_4: ; Parent Loop BB18_3 Depth=1
; GFX6-NEXT: ; => This Inner Loop Header: Depth=2
; GFX6-NEXT: v_readfirstlane_b32 s8, v0
@@ -6856,22 +6863,21 @@ define bfloat @buffer_fat_ptr_agent_atomic_fadd_ret_bf16__offset__waterfall__amd
; GFX6-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
; GFX6-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0)
-; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc
+; GFX6-NEXT: buffer_atomic_cmpswap v[6:7], v9, s[8:11], 0 offen glc
; GFX6-NEXT: s_xor_b64 exec, exec, s[4:5]
; GFX6-NEXT: s_cbranch_execnz .LBB18_4
; GFX6-NEXT: ; %bb.5: ; in Loop: Header=BB18_3 Depth=1
; GFX6-NEXT: s_mov_b64 exec, s[12:13]
; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6
+; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5
; GFX6-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GFX6-NEXT: v_mov_b32_e32 v6, v4
+; GFX6-NEXT: v_mov_b32_e32 v5, v6
; GFX6-NEXT: buffer_wbinvl1
; GFX6-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX6-NEXT: s_cbranch_execnz .LBB18_3
; GFX6-NEXT: ; %bb.6: ; %atomicrmw.end
; GFX6-NEXT: s_or_b64 exec, exec, s[6:7]
-; GFX6-NEXT: v_lshrrev_b32_e32 v0, v7, v4
-; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX6-NEXT: v_lshrrev_b32_e32 v0, v8, v6
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr bfloat, ptr addrspace(7) %ptr, i32 256
diff --git a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmax.ll b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmax.ll
index 7896edd5016f0..fc3ed6d332211 100644
--- a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmax.ll
+++ b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmax.ll
@@ -2773,13 +2773,14 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__amdgpu_no_fine_gr
; GFX7-NEXT: s_and_b32 s4, s20, -4
; GFX7-NEXT: v_mov_b32_e32 v4, s4
; GFX7-NEXT: buffer_load_dword v1, v4, s[16:19], 0 offen
-; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX7-NEXT: s_and_b32 s4, s20, 3
; GFX7-NEXT: s_lshl_b32 s6, s4, 3
; GFX7-NEXT: s_lshl_b32 s4, 0xffff, s6
-; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v0
+; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX7-NEXT: s_not_b32 s7, s4
; GFX7-NEXT: s_mov_b64 s[4:5], 0
+; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v0
; GFX7-NEXT: .LBB10_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0)
@@ -2803,7 +2804,6 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__amdgpu_no_fine_gr
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX7-NEXT: v_lshrrev_b32_e32 v0, s6, v2
-; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__amdgpu_no_fine_grained_memory:
@@ -2813,13 +2813,14 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__amdgpu_no_fine_gr
; GFX6-NEXT: s_and_b32 s4, s20, -4
; GFX6-NEXT: v_mov_b32_e32 v4, s4
; GFX6-NEXT: buffer_load_dword v1, v4, s[16:19], 0 offen
-; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX6-NEXT: s_and_b32 s4, s20, 3
; GFX6-NEXT: s_lshl_b32 s6, s4, 3
; GFX6-NEXT: s_lshl_b32 s4, 0xffff, s6
-; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v0
+; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-NEXT: s_not_b32 s7, s4
; GFX6-NEXT: s_mov_b64 s[4:5], 0
+; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v0
; GFX6-NEXT: .LBB10_1: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX6-NEXT: s_waitcnt vmcnt(0)
@@ -2844,7 +2845,6 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__amdgpu_no_fine_gr
; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX6-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX6-NEXT: v_lshrrev_b32_e32 v0, s6, v2
-; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr half, ptr addrspace(7) %ptr, i32 256
@@ -3222,13 +3222,14 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset__amdgpu_no_fine_
; GFX7-NEXT: s_and_b32 s4, s20, -4
; GFX7-NEXT: v_mov_b32_e32 v2, s4
; GFX7-NEXT: buffer_load_dword v1, v2, s[16:19], 0 offen
-; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX7-NEXT: s_and_b32 s4, s20, 3
; GFX7-NEXT: s_lshl_b32 s6, s4, 3
; GFX7-NEXT: s_lshl_b32 s4, 0xffff, s6
-; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v0
+; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX7-NEXT: s_not_b32 s7, s4
; GFX7-NEXT: s_mov_b64 s[4:5], 0
+; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v0
; GFX7-NEXT: .LBB11_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0)
@@ -3260,13 +3261,14 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_f16__offset__amdgpu_no_fine_
; GFX6-NEXT: s_and_b32 s4, s20, -4
; GFX6-NEXT: v_mov_b32_e32 v2, s4
; GFX6-NEXT: buffer_load_dword v1, v2, s[16:19], 0 offen
-; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX6-NEXT: s_and_b32 s4, s20, 3
; GFX6-NEXT: s_lshl_b32 s6, s4, 3
; GFX6-NEXT: s_lshl_b32 s4, 0xffff, s6
-; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v0
+; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-NEXT: s_not_b32 s7, s4
; GFX6-NEXT: s_mov_b64 s[4:5], 0
+; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v0
; GFX6-NEXT: .LBB11_1: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX6-NEXT: s_waitcnt vmcnt(0)
@@ -3974,11 +3976,12 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0x200, v4
-; GFX7-NEXT: v_and_b32_e32 v8, -4, v4
+; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v5
+; GFX7-NEXT: v_and_b32_e32 v9, -4, v4
; GFX7-NEXT: v_and_b32_e32 v4, 3, v4
-; GFX7-NEXT: v_lshlrev_b32_e32 v7, 3, v4
-; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v7
-; GFX7-NEXT: v_not_b32_e32 v9, v4
+; GFX7-NEXT: v_lshlrev_b32_e32 v8, 3, v4
+; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v8
+; GFX7-NEXT: v_not_b32_e32 v10, v4
; GFX7-NEXT: s_mov_b64 s[6:7], exec
; GFX7-NEXT: .LBB12_1: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: v_readfirstlane_b32 s8, v0
@@ -3989,28 +3992,28 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu
; GFX7-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3]
; GFX7-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
; GFX7-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
-; GFX7-NEXT: buffer_load_dword v6, v8, s[8:11], 0 offen
+; GFX7-NEXT: buffer_load_dword v5, v9, s[8:11], 0 offen
; GFX7-NEXT: s_xor_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_cbranch_execnz .LBB12_1
; GFX7-NEXT: ; %bb.2:
; GFX7-NEXT: s_mov_b64 exec, s[6:7]
-; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v5
+; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v6
; GFX7-NEXT: s_mov_b64 s[6:7], 0
-; GFX7-NEXT: v_cvt_f32_f16_e32 v10, v4
+; GFX7-NEXT: v_cvt_f32_f16_e32 v11, v4
; GFX7-NEXT: .LBB12_3: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Loop Header: Depth=1
; GFX7-NEXT: ; Child Loop BB12_4 Depth 2
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_lshrrev_b32_e32 v4, v7, v6
+; GFX7-NEXT: v_lshrrev_b32_e32 v4, v8, v5
; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4
-; GFX7-NEXT: v_and_b32_e32 v5, v6, v9
+; GFX7-NEXT: v_and_b32_e32 v6, v5, v10
; GFX7-NEXT: s_mov_b64 s[12:13], exec
-; GFX7-NEXT: v_max_f32_e32 v4, v4, v10
+; GFX7-NEXT: v_max_f32_e32 v4, v4, v11
; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4
-; GFX7-NEXT: v_lshlrev_b32_e32 v4, v7, v4
-; GFX7-NEXT: v_or_b32_e32 v5, v5, v4
-; GFX7-NEXT: v_mov_b32_e32 v4, v5
-; GFX7-NEXT: v_mov_b32_e32 v5, v6
+; GFX7-NEXT: v_lshlrev_b32_e32 v4, v8, v4
+; GFX7-NEXT: v_or_b32_e32 v4, v6, v4
+; GFX7-NEXT: v_mov_b32_e32 v7, v5
+; GFX7-NEXT: v_mov_b32_e32 v6, v4
; GFX7-NEXT: .LBB12_4: ; Parent Loop BB12_3 Depth=1
; GFX7-NEXT: ; => This Inner Loop Header: Depth=2
; GFX7-NEXT: v_readfirstlane_b32 s8, v0
@@ -4022,33 +4025,33 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu
; GFX7-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
; GFX7-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc
+; GFX7-NEXT: buffer_atomic_cmpswap v[6:7], v9, s[8:11], 0 offen glc
; GFX7-NEXT: s_xor_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_cbranch_execnz .LBB12_4
; GFX7-NEXT: ; %bb.5: ; in Loop: Header=BB12_3 Depth=1
; GFX7-NEXT: s_mov_b64 exec, s[12:13]
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5
; GFX7-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GFX7-NEXT: v_mov_b32_e32 v6, v4
+; GFX7-NEXT: v_mov_b32_e32 v5, v6
; GFX7-NEXT: buffer_wbinvl1
; GFX7-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX7-NEXT: s_cbranch_execnz .LBB12_3
; GFX7-NEXT: ; %bb.6: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[6:7]
-; GFX7-NEXT: v_lshrrev_b32_e32 v0, v7, v4
-; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, v8, v6
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT: v_add_i32_e32 v4, vcc, 0x200, v4
-; GFX6-NEXT: v_and_b32_e32 v8, -4, v4
+; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v5
+; GFX6-NEXT: v_and_b32_e32 v9, -4, v4
; GFX6-NEXT: v_and_b32_e32 v4, 3, v4
-; GFX6-NEXT: v_lshlrev_b32_e32 v7, 3, v4
-; GFX6-NEXT: v_lshl_b32_e32 v4, 0xffff, v7
-; GFX6-NEXT: v_not_b32_e32 v9, v4
+; GFX6-NEXT: v_lshlrev_b32_e32 v8, 3, v4
+; GFX6-NEXT: v_lshl_b32_e32 v4, 0xffff, v8
+; GFX6-NEXT: v_not_b32_e32 v10, v4
; GFX6-NEXT: s_mov_b64 s[6:7], exec
; GFX6-NEXT: .LBB12_1: ; =>This Inner Loop Header: Depth=1
; GFX6-NEXT: v_readfirstlane_b32 s8, v0
@@ -4059,28 +4062,29 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu
; GFX6-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3]
; GFX6-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
; GFX6-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
-; GFX6-NEXT: buffer_load_dword v6, v8, s[8:11], 0 offen
+; GFX6-NEXT: buffer_load_dword v5, v9, s[8:11], 0 offen
; GFX6-NEXT: s_xor_b64 exec, exec, s[4:5]
; GFX6-NEXT: s_cbranch_execnz .LBB12_1
; GFX6-NEXT: ; %bb.2:
; GFX6-NEXT: s_mov_b64 exec, s[6:7]
-; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v5
+; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v6
; GFX6-NEXT: s_mov_b64 s[6:7], 0
-; GFX6-NEXT: v_cvt_f32_f16_e32 v10, v4
+; GFX6-NEXT: v_cvt_f32_f16_e32 v11, v4
; GFX6-NEXT: .LBB12_3: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Loop Header: Depth=1
; GFX6-NEXT: ; Child Loop BB12_4 Depth 2
-; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0)
-; GFX6-NEXT: v_lshrrev_b32_e32 v4, v7, v6
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_lshrrev_b32_e32 v4, v8, v5
; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4
-; GFX6-NEXT: v_and_b32_e32 v5, v6, v9
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: v_and_b32_e32 v6, v5, v10
; GFX6-NEXT: s_mov_b64 s[12:13], exec
-; GFX6-NEXT: v_max_f32_e32 v4, v4, v10
+; GFX6-NEXT: v_max_f32_e32 v4, v4, v11
; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4
-; GFX6-NEXT: v_lshlrev_b32_e32 v4, v7, v4
-; GFX6-NEXT: v_or_b32_e32 v5, v5, v4
-; GFX6-NEXT: v_mov_b32_e32 v4, v5
-; GFX6-NEXT: v_mov_b32_e32 v5, v6
+; GFX6-NEXT: v_lshlrev_b32_e32 v4, v8, v4
+; GFX6-NEXT: v_or_b32_e32 v4, v6, v4
+; GFX6-NEXT: v_mov_b32_e32 v7, v5
+; GFX6-NEXT: v_mov_b32_e32 v6, v4
; GFX6-NEXT: .LBB12_4: ; Parent Loop BB12_3 Depth=1
; GFX6-NEXT: ; => This Inner Loop Header: Depth=2
; GFX6-NEXT: v_readfirstlane_b32 s8, v0
@@ -4092,22 +4096,21 @@ define half @buffer_fat_ptr_agent_atomic_fmax_ret_f16__offset__waterfall__amdgpu
; GFX6-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
; GFX6-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0)
-; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc
+; GFX6-NEXT: buffer_atomic_cmpswap v[6:7], v9, s[8:11], 0 offen glc
; GFX6-NEXT: s_xor_b64 exec, exec, s[4:5]
; GFX6-NEXT: s_cbranch_execnz .LBB12_4
; GFX6-NEXT: ; %bb.5: ; in Loop: Header=BB12_3 Depth=1
; GFX6-NEXT: s_mov_b64 exec, s[12:13]
; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6
+; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5
; GFX6-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GFX6-NEXT: v_mov_b32_e32 v6, v4
+; GFX6-NEXT: v_mov_b32_e32 v5, v6
; GFX6-NEXT: buffer_wbinvl1
; GFX6-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX6-NEXT: s_cbranch_execnz .LBB12_3
; GFX6-NEXT: ; %bb.6: ; %atomicrmw.end
; GFX6-NEXT: s_or_b64 exec, exec, s[6:7]
-; GFX6-NEXT: v_lshrrev_b32_e32 v0, v7, v4
-; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX6-NEXT: v_lshrrev_b32_e32 v0, v8, v6
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr half, ptr addrspace(7) %ptr, i32 256
@@ -4560,6 +4563,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__amdgpu_no_fine
; GFX7-NEXT: v_mov_b32_e32 v4, s4
; GFX7-NEXT: buffer_load_dword v1, v4, s[16:19], 0 offen
; GFX7-NEXT: s_and_b32 s4, s20, 3
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX7-NEXT: s_lshl_b32 s6, s4, 3
; GFX7-NEXT: s_lshl_b32 s4, 0xffff, s6
; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
@@ -4590,7 +4594,6 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__amdgpu_no_fine
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX7-NEXT: v_lshrrev_b32_e32 v0, s6, v2
-; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__amdgpu_no_fine_grained_memory:
@@ -4601,6 +4604,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__amdgpu_no_fine
; GFX6-NEXT: v_mov_b32_e32 v4, s4
; GFX6-NEXT: buffer_load_dword v1, v4, s[16:19], 0 offen
; GFX6-NEXT: s_and_b32 s4, s20, 3
+; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX6-NEXT: s_lshl_b32 s6, s4, 3
; GFX6-NEXT: s_lshl_b32 s4, 0xffff, s6
; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0
@@ -4632,7 +4636,6 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__amdgpu_no_fine
; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX6-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX6-NEXT: v_lshrrev_b32_e32 v0, s6, v2
-; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr bfloat, ptr addrspace(7) %ptr, i32 256
@@ -5072,6 +5075,7 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_bf16__offset__amdgpu_no_fine
; GFX7-NEXT: v_mov_b32_e32 v2, s4
; GFX7-NEXT: buffer_load_dword v1, v2, s[16:19], 0 offen
; GFX7-NEXT: s_and_b32 s4, s20, 3
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX7-NEXT: s_lshl_b32 s6, s4, 3
; GFX7-NEXT: s_lshl_b32 s4, 0xffff, s6
; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
@@ -5111,6 +5115,7 @@ define void @buffer_fat_ptr_agent_atomic_fmax_noret_bf16__offset__amdgpu_no_fine
; GFX6-NEXT: v_mov_b32_e32 v2, s4
; GFX6-NEXT: buffer_load_dword v1, v2, s[16:19], 0 offen
; GFX6-NEXT: s_and_b32 s4, s20, 3
+; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX6-NEXT: s_lshl_b32 s6, s4, 3
; GFX6-NEXT: s_lshl_b32 s4, 0xffff, s6
; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0
@@ -5886,11 +5891,12 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall__amd
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0x200, v4
-; GFX7-NEXT: v_and_b32_e32 v8, -4, v4
+; GFX7-NEXT: v_and_b32_e32 v9, -4, v4
; GFX7-NEXT: v_and_b32_e32 v4, 3, v4
-; GFX7-NEXT: v_lshlrev_b32_e32 v7, 3, v4
-; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v7
-; GFX7-NEXT: v_not_b32_e32 v9, v4
+; GFX7-NEXT: v_lshlrev_b32_e32 v8, 3, v4
+; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v8
+; GFX7-NEXT: v_lshlrev_b32_e32 v6, 16, v5
+; GFX7-NEXT: v_not_b32_e32 v10, v4
; GFX7-NEXT: s_mov_b64 s[6:7], exec
; GFX7-NEXT: .LBB15_1: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: v_readfirstlane_b32 s8, v0
@@ -5901,29 +5907,29 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall__amd
; GFX7-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3]
; GFX7-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
; GFX7-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
-; GFX7-NEXT: buffer_load_dword v6, v8, s[8:11], 0 offen
+; GFX7-NEXT: buffer_load_dword v5, v9, s[8:11], 0 offen
; GFX7-NEXT: s_xor_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_cbranch_execnz .LBB15_1
; GFX7-NEXT: ; %bb.2:
; GFX7-NEXT: s_mov_b64 exec, s[6:7]
-; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v5
+; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v6
; GFX7-NEXT: s_mov_b64 s[6:7], 0
-; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v4
+; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v4
; GFX7-NEXT: .LBB15_3: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Loop Header: Depth=1
; GFX7-NEXT: ; Child Loop BB15_4 Depth 2
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_lshrrev_b32_e32 v4, v7, v6
+; GFX7-NEXT: v_lshrrev_b32_e32 v4, v8, v5
; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4
; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
-; GFX7-NEXT: v_max_f32_e32 v4, v4, v10
+; GFX7-NEXT: v_max_f32_e32 v4, v4, v11
; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4
-; GFX7-NEXT: v_lshlrev_b32_e32 v4, v7, v4
-; GFX7-NEXT: v_and_b32_e32 v5, v6, v9
-; GFX7-NEXT: v_or_b32_e32 v5, v5, v4
-; GFX7-NEXT: v_mov_b32_e32 v4, v5
+; GFX7-NEXT: v_lshlrev_b32_e32 v4, v8, v4
+; GFX7-NEXT: v_and_b32_e32 v6, v5, v10
+; GFX7-NEXT: v_or_b32_e32 v4, v6, v4
+; GFX7-NEXT: v_mov_b32_e32 v7, v5
; GFX7-NEXT: s_mov_b64 s[12:13], exec
-; GFX7-NEXT: v_mov_b32_e32 v5, v6
+; GFX7-NEXT: v_mov_b32_e32 v6, v4
; GFX7-NEXT: .LBB15_4: ; Parent Loop BB15_3 Depth=1
; GFX7-NEXT: ; => This Inner Loop Header: Depth=2
; GFX7-NEXT: v_readfirstlane_b32 s8, v0
@@ -5935,33 +5941,33 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall__amd
; GFX7-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
; GFX7-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc
+; GFX7-NEXT: buffer_atomic_cmpswap v[6:7], v9, s[8:11], 0 offen glc
; GFX7-NEXT: s_xor_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_cbranch_execnz .LBB15_4
; GFX7-NEXT: ; %bb.5: ; in Loop: Header=BB15_3 Depth=1
; GFX7-NEXT: s_mov_b64 exec, s[12:13]
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5
; GFX7-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GFX7-NEXT: v_mov_b32_e32 v6, v4
+; GFX7-NEXT: v_mov_b32_e32 v5, v6
; GFX7-NEXT: buffer_wbinvl1
; GFX7-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX7-NEXT: s_cbranch_execnz .LBB15_3
; GFX7-NEXT: ; %bb.6: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[6:7]
-; GFX7-NEXT: v_lshrrev_b32_e32 v0, v7, v4
-; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, v8, v6
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall__amdgpu_no_fine_grained_memory:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT: v_add_i32_e32 v4, vcc, 0x200, v4
-; GFX6-NEXT: v_and_b32_e32 v8, -4, v4
+; GFX6-NEXT: v_and_b32_e32 v9, -4, v4
; GFX6-NEXT: v_and_b32_e32 v4, 3, v4
-; GFX6-NEXT: v_lshlrev_b32_e32 v7, 3, v4
-; GFX6-NEXT: v_lshl_b32_e32 v4, 0xffff, v7
-; GFX6-NEXT: v_not_b32_e32 v9, v4
+; GFX6-NEXT: v_lshlrev_b32_e32 v8, 3, v4
+; GFX6-NEXT: v_lshl_b32_e32 v4, 0xffff, v8
+; GFX6-NEXT: v_lshlrev_b32_e32 v6, 16, v5
+; GFX6-NEXT: v_not_b32_e32 v10, v4
; GFX6-NEXT: s_mov_b64 s[6:7], exec
; GFX6-NEXT: .LBB15_1: ; =>This Inner Loop Header: Depth=1
; GFX6-NEXT: v_readfirstlane_b32 s8, v0
@@ -5972,29 +5978,30 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall__amd
; GFX6-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3]
; GFX6-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
; GFX6-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
-; GFX6-NEXT: buffer_load_dword v6, v8, s[8:11], 0 offen
+; GFX6-NEXT: buffer_load_dword v5, v9, s[8:11], 0 offen
; GFX6-NEXT: s_xor_b64 exec, exec, s[4:5]
; GFX6-NEXT: s_cbranch_execnz .LBB15_1
; GFX6-NEXT: ; %bb.2:
; GFX6-NEXT: s_mov_b64 exec, s[6:7]
-; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v5
+; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v6
; GFX6-NEXT: s_mov_b64 s[6:7], 0
-; GFX6-NEXT: v_and_b32_e32 v10, 0xffff0000, v4
+; GFX6-NEXT: v_and_b32_e32 v11, 0xffff0000, v4
; GFX6-NEXT: .LBB15_3: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Loop Header: Depth=1
; GFX6-NEXT: ; Child Loop BB15_4 Depth 2
-; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0)
-; GFX6-NEXT: v_lshrrev_b32_e32 v4, v7, v6
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_lshrrev_b32_e32 v4, v8, v5
; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4
; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v4
-; GFX6-NEXT: v_max_f32_e32 v4, v4, v10
+; GFX6-NEXT: v_max_f32_e32 v4, v4, v11
; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v4
-; GFX6-NEXT: v_lshlrev_b32_e32 v4, v7, v4
-; GFX6-NEXT: v_and_b32_e32 v5, v6, v9
-; GFX6-NEXT: v_or_b32_e32 v5, v5, v4
-; GFX6-NEXT: v_mov_b32_e32 v4, v5
+; GFX6-NEXT: v_lshlrev_b32_e32 v4, v8, v4
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: v_and_b32_e32 v6, v5, v10
+; GFX6-NEXT: v_or_b32_e32 v4, v6, v4
+; GFX6-NEXT: v_mov_b32_e32 v7, v5
; GFX6-NEXT: s_mov_b64 s[12:13], exec
-; GFX6-NEXT: v_mov_b32_e32 v5, v6
+; GFX6-NEXT: v_mov_b32_e32 v6, v4
; GFX6-NEXT: .LBB15_4: ; Parent Loop BB15_3 Depth=1
; GFX6-NEXT: ; => This Inner Loop Header: Depth=2
; GFX6-NEXT: v_readfirstlane_b32 s8, v0
@@ -6006,22 +6013,21 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmax_ret_bf16__offset__waterfall__amd
; GFX6-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
; GFX6-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0)
-; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc
+; GFX6-NEXT: buffer_atomic_cmpswap v[6:7], v9, s[8:11], 0 offen glc
; GFX6-NEXT: s_xor_b64 exec, exec, s[4:5]
; GFX6-NEXT: s_cbranch_execnz .LBB15_4
; GFX6-NEXT: ; %bb.5: ; in Loop: Header=BB15_3 Depth=1
; GFX6-NEXT: s_mov_b64 exec, s[12:13]
; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6
+; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5
; GFX6-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GFX6-NEXT: v_mov_b32_e32 v6, v4
+; GFX6-NEXT: v_mov_b32_e32 v5, v6
; GFX6-NEXT: buffer_wbinvl1
; GFX6-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX6-NEXT: s_cbranch_execnz .LBB15_3
; GFX6-NEXT: ; %bb.6: ; %atomicrmw.end
; GFX6-NEXT: s_or_b64 exec, exec, s[6:7]
-; GFX6-NEXT: v_lshrrev_b32_e32 v0, v7, v4
-; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX6-NEXT: v_lshrrev_b32_e32 v0, v8, v6
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr bfloat, ptr addrspace(7) %ptr, i32 256
diff --git a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmin.ll b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmin.ll
index 2ade237eaa6da..8f270f9a466e2 100644
--- a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmin.ll
+++ b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-fmin.ll
@@ -2773,13 +2773,14 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__amdgpu_no_fine_gr
; GFX7-NEXT: s_and_b32 s4, s20, -4
; GFX7-NEXT: v_mov_b32_e32 v4, s4
; GFX7-NEXT: buffer_load_dword v1, v4, s[16:19], 0 offen
-; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX7-NEXT: s_and_b32 s4, s20, 3
; GFX7-NEXT: s_lshl_b32 s6, s4, 3
; GFX7-NEXT: s_lshl_b32 s4, 0xffff, s6
-; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v0
+; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX7-NEXT: s_not_b32 s7, s4
; GFX7-NEXT: s_mov_b64 s[4:5], 0
+; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v0
; GFX7-NEXT: .LBB10_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0)
@@ -2803,7 +2804,6 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__amdgpu_no_fine_gr
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX7-NEXT: v_lshrrev_b32_e32 v0, s6, v2
-; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__amdgpu_no_fine_grained_memory:
@@ -2813,13 +2813,14 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__amdgpu_no_fine_gr
; GFX6-NEXT: s_and_b32 s4, s20, -4
; GFX6-NEXT: v_mov_b32_e32 v4, s4
; GFX6-NEXT: buffer_load_dword v1, v4, s[16:19], 0 offen
-; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX6-NEXT: s_and_b32 s4, s20, 3
; GFX6-NEXT: s_lshl_b32 s6, s4, 3
; GFX6-NEXT: s_lshl_b32 s4, 0xffff, s6
-; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v0
+; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-NEXT: s_not_b32 s7, s4
; GFX6-NEXT: s_mov_b64 s[4:5], 0
+; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v0
; GFX6-NEXT: .LBB10_1: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX6-NEXT: s_waitcnt vmcnt(0)
@@ -2844,7 +2845,6 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__amdgpu_no_fine_gr
; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX6-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX6-NEXT: v_lshrrev_b32_e32 v0, s6, v2
-; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr half, ptr addrspace(7) %ptr, i32 256
@@ -3222,13 +3222,14 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset__amdgpu_no_fine_
; GFX7-NEXT: s_and_b32 s4, s20, -4
; GFX7-NEXT: v_mov_b32_e32 v2, s4
; GFX7-NEXT: buffer_load_dword v1, v2, s[16:19], 0 offen
-; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX7-NEXT: s_and_b32 s4, s20, 3
; GFX7-NEXT: s_lshl_b32 s6, s4, 3
; GFX7-NEXT: s_lshl_b32 s4, 0xffff, s6
-; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v0
+; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX7-NEXT: s_not_b32 s7, s4
; GFX7-NEXT: s_mov_b64 s[4:5], 0
+; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v0
; GFX7-NEXT: .LBB11_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0)
@@ -3260,13 +3261,14 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_f16__offset__amdgpu_no_fine_
; GFX6-NEXT: s_and_b32 s4, s20, -4
; GFX6-NEXT: v_mov_b32_e32 v2, s4
; GFX6-NEXT: buffer_load_dword v1, v2, s[16:19], 0 offen
-; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX6-NEXT: s_and_b32 s4, s20, 3
; GFX6-NEXT: s_lshl_b32 s6, s4, 3
; GFX6-NEXT: s_lshl_b32 s4, 0xffff, s6
-; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v0
+; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-NEXT: s_not_b32 s7, s4
; GFX6-NEXT: s_mov_b64 s[4:5], 0
+; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v0
; GFX6-NEXT: .LBB11_1: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX6-NEXT: s_waitcnt vmcnt(0)
@@ -3974,11 +3976,12 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0x200, v4
-; GFX7-NEXT: v_and_b32_e32 v8, -4, v4
+; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v5
+; GFX7-NEXT: v_and_b32_e32 v9, -4, v4
; GFX7-NEXT: v_and_b32_e32 v4, 3, v4
-; GFX7-NEXT: v_lshlrev_b32_e32 v7, 3, v4
-; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v7
-; GFX7-NEXT: v_not_b32_e32 v9, v4
+; GFX7-NEXT: v_lshlrev_b32_e32 v8, 3, v4
+; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v8
+; GFX7-NEXT: v_not_b32_e32 v10, v4
; GFX7-NEXT: s_mov_b64 s[6:7], exec
; GFX7-NEXT: .LBB12_1: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: v_readfirstlane_b32 s8, v0
@@ -3989,28 +3992,28 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu
; GFX7-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3]
; GFX7-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
; GFX7-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
-; GFX7-NEXT: buffer_load_dword v6, v8, s[8:11], 0 offen
+; GFX7-NEXT: buffer_load_dword v5, v9, s[8:11], 0 offen
; GFX7-NEXT: s_xor_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_cbranch_execnz .LBB12_1
; GFX7-NEXT: ; %bb.2:
; GFX7-NEXT: s_mov_b64 exec, s[6:7]
-; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v5
+; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v6
; GFX7-NEXT: s_mov_b64 s[6:7], 0
-; GFX7-NEXT: v_cvt_f32_f16_e32 v10, v4
+; GFX7-NEXT: v_cvt_f32_f16_e32 v11, v4
; GFX7-NEXT: .LBB12_3: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Loop Header: Depth=1
; GFX7-NEXT: ; Child Loop BB12_4 Depth 2
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_lshrrev_b32_e32 v4, v7, v6
+; GFX7-NEXT: v_lshrrev_b32_e32 v4, v8, v5
; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v4
-; GFX7-NEXT: v_and_b32_e32 v5, v6, v9
+; GFX7-NEXT: v_and_b32_e32 v6, v5, v10
; GFX7-NEXT: s_mov_b64 s[12:13], exec
-; GFX7-NEXT: v_min_f32_e32 v4, v4, v10
+; GFX7-NEXT: v_min_f32_e32 v4, v4, v11
; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4
-; GFX7-NEXT: v_lshlrev_b32_e32 v4, v7, v4
-; GFX7-NEXT: v_or_b32_e32 v5, v5, v4
-; GFX7-NEXT: v_mov_b32_e32 v4, v5
-; GFX7-NEXT: v_mov_b32_e32 v5, v6
+; GFX7-NEXT: v_lshlrev_b32_e32 v4, v8, v4
+; GFX7-NEXT: v_or_b32_e32 v4, v6, v4
+; GFX7-NEXT: v_mov_b32_e32 v7, v5
+; GFX7-NEXT: v_mov_b32_e32 v6, v4
; GFX7-NEXT: .LBB12_4: ; Parent Loop BB12_3 Depth=1
; GFX7-NEXT: ; => This Inner Loop Header: Depth=2
; GFX7-NEXT: v_readfirstlane_b32 s8, v0
@@ -4022,33 +4025,33 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu
; GFX7-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
; GFX7-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc
+; GFX7-NEXT: buffer_atomic_cmpswap v[6:7], v9, s[8:11], 0 offen glc
; GFX7-NEXT: s_xor_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_cbranch_execnz .LBB12_4
; GFX7-NEXT: ; %bb.5: ; in Loop: Header=BB12_3 Depth=1
; GFX7-NEXT: s_mov_b64 exec, s[12:13]
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5
; GFX7-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GFX7-NEXT: v_mov_b32_e32 v6, v4
+; GFX7-NEXT: v_mov_b32_e32 v5, v6
; GFX7-NEXT: buffer_wbinvl1
; GFX7-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX7-NEXT: s_cbranch_execnz .LBB12_3
; GFX7-NEXT: ; %bb.6: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[6:7]
-; GFX7-NEXT: v_lshrrev_b32_e32 v0, v7, v4
-; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, v8, v6
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu_no_fine_grained_memory:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT: v_add_i32_e32 v4, vcc, 0x200, v4
-; GFX6-NEXT: v_and_b32_e32 v8, -4, v4
+; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v5
+; GFX6-NEXT: v_and_b32_e32 v9, -4, v4
; GFX6-NEXT: v_and_b32_e32 v4, 3, v4
-; GFX6-NEXT: v_lshlrev_b32_e32 v7, 3, v4
-; GFX6-NEXT: v_lshl_b32_e32 v4, 0xffff, v7
-; GFX6-NEXT: v_not_b32_e32 v9, v4
+; GFX6-NEXT: v_lshlrev_b32_e32 v8, 3, v4
+; GFX6-NEXT: v_lshl_b32_e32 v4, 0xffff, v8
+; GFX6-NEXT: v_not_b32_e32 v10, v4
; GFX6-NEXT: s_mov_b64 s[6:7], exec
; GFX6-NEXT: .LBB12_1: ; =>This Inner Loop Header: Depth=1
; GFX6-NEXT: v_readfirstlane_b32 s8, v0
@@ -4059,28 +4062,29 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu
; GFX6-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3]
; GFX6-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
; GFX6-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
-; GFX6-NEXT: buffer_load_dword v6, v8, s[8:11], 0 offen
+; GFX6-NEXT: buffer_load_dword v5, v9, s[8:11], 0 offen
; GFX6-NEXT: s_xor_b64 exec, exec, s[4:5]
; GFX6-NEXT: s_cbranch_execnz .LBB12_1
; GFX6-NEXT: ; %bb.2:
; GFX6-NEXT: s_mov_b64 exec, s[6:7]
-; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v5
+; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v6
; GFX6-NEXT: s_mov_b64 s[6:7], 0
-; GFX6-NEXT: v_cvt_f32_f16_e32 v10, v4
+; GFX6-NEXT: v_cvt_f32_f16_e32 v11, v4
; GFX6-NEXT: .LBB12_3: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Loop Header: Depth=1
; GFX6-NEXT: ; Child Loop BB12_4 Depth 2
-; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0)
-; GFX6-NEXT: v_lshrrev_b32_e32 v4, v7, v6
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_lshrrev_b32_e32 v4, v8, v5
; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v4
-; GFX6-NEXT: v_and_b32_e32 v5, v6, v9
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: v_and_b32_e32 v6, v5, v10
; GFX6-NEXT: s_mov_b64 s[12:13], exec
-; GFX6-NEXT: v_min_f32_e32 v4, v4, v10
+; GFX6-NEXT: v_min_f32_e32 v4, v4, v11
; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4
-; GFX6-NEXT: v_lshlrev_b32_e32 v4, v7, v4
-; GFX6-NEXT: v_or_b32_e32 v5, v5, v4
-; GFX6-NEXT: v_mov_b32_e32 v4, v5
-; GFX6-NEXT: v_mov_b32_e32 v5, v6
+; GFX6-NEXT: v_lshlrev_b32_e32 v4, v8, v4
+; GFX6-NEXT: v_or_b32_e32 v4, v6, v4
+; GFX6-NEXT: v_mov_b32_e32 v7, v5
+; GFX6-NEXT: v_mov_b32_e32 v6, v4
; GFX6-NEXT: .LBB12_4: ; Parent Loop BB12_3 Depth=1
; GFX6-NEXT: ; => This Inner Loop Header: Depth=2
; GFX6-NEXT: v_readfirstlane_b32 s8, v0
@@ -4092,22 +4096,21 @@ define half @buffer_fat_ptr_agent_atomic_fmin_ret_f16__offset__waterfall__amdgpu
; GFX6-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
; GFX6-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0)
-; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc
+; GFX6-NEXT: buffer_atomic_cmpswap v[6:7], v9, s[8:11], 0 offen glc
; GFX6-NEXT: s_xor_b64 exec, exec, s[4:5]
; GFX6-NEXT: s_cbranch_execnz .LBB12_4
; GFX6-NEXT: ; %bb.5: ; in Loop: Header=BB12_3 Depth=1
; GFX6-NEXT: s_mov_b64 exec, s[12:13]
; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6
+; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5
; GFX6-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GFX6-NEXT: v_mov_b32_e32 v6, v4
+; GFX6-NEXT: v_mov_b32_e32 v5, v6
; GFX6-NEXT: buffer_wbinvl1
; GFX6-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX6-NEXT: s_cbranch_execnz .LBB12_3
; GFX6-NEXT: ; %bb.6: ; %atomicrmw.end
; GFX6-NEXT: s_or_b64 exec, exec, s[6:7]
-; GFX6-NEXT: v_lshrrev_b32_e32 v0, v7, v4
-; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX6-NEXT: v_lshrrev_b32_e32 v0, v8, v6
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr half, ptr addrspace(7) %ptr, i32 256
@@ -4560,6 +4563,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__amdgpu_no_fine
; GFX7-NEXT: v_mov_b32_e32 v4, s4
; GFX7-NEXT: buffer_load_dword v1, v4, s[16:19], 0 offen
; GFX7-NEXT: s_and_b32 s4, s20, 3
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX7-NEXT: s_lshl_b32 s6, s4, 3
; GFX7-NEXT: s_lshl_b32 s4, 0xffff, s6
; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
@@ -4590,7 +4594,6 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__amdgpu_no_fine
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX7-NEXT: v_lshrrev_b32_e32 v0, s6, v2
-; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__amdgpu_no_fine_grained_memory:
@@ -4601,6 +4604,7 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__amdgpu_no_fine
; GFX6-NEXT: v_mov_b32_e32 v4, s4
; GFX6-NEXT: buffer_load_dword v1, v4, s[16:19], 0 offen
; GFX6-NEXT: s_and_b32 s4, s20, 3
+; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX6-NEXT: s_lshl_b32 s6, s4, 3
; GFX6-NEXT: s_lshl_b32 s4, 0xffff, s6
; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0
@@ -4632,7 +4636,6 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__amdgpu_no_fine
; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX6-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX6-NEXT: v_lshrrev_b32_e32 v0, s6, v2
-; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr bfloat, ptr addrspace(7) %ptr, i32 256
@@ -5072,6 +5075,7 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset__amdgpu_no_fine
; GFX7-NEXT: v_mov_b32_e32 v2, s4
; GFX7-NEXT: buffer_load_dword v1, v2, s[16:19], 0 offen
; GFX7-NEXT: s_and_b32 s4, s20, 3
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX7-NEXT: s_lshl_b32 s6, s4, 3
; GFX7-NEXT: s_lshl_b32 s4, 0xffff, s6
; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
@@ -5111,6 +5115,7 @@ define void @buffer_fat_ptr_agent_atomic_fmin_noret_bf16__offset__amdgpu_no_fine
; GFX6-NEXT: v_mov_b32_e32 v2, s4
; GFX6-NEXT: buffer_load_dword v1, v2, s[16:19], 0 offen
; GFX6-NEXT: s_and_b32 s4, s20, 3
+; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX6-NEXT: s_lshl_b32 s6, s4, 3
; GFX6-NEXT: s_lshl_b32 s4, 0xffff, s6
; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0
@@ -5886,11 +5891,12 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall__amd
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_add_i32_e32 v4, vcc, 0x200, v4
-; GFX7-NEXT: v_and_b32_e32 v8, -4, v4
+; GFX7-NEXT: v_and_b32_e32 v9, -4, v4
; GFX7-NEXT: v_and_b32_e32 v4, 3, v4
-; GFX7-NEXT: v_lshlrev_b32_e32 v7, 3, v4
-; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v7
-; GFX7-NEXT: v_not_b32_e32 v9, v4
+; GFX7-NEXT: v_lshlrev_b32_e32 v8, 3, v4
+; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v8
+; GFX7-NEXT: v_lshlrev_b32_e32 v6, 16, v5
+; GFX7-NEXT: v_not_b32_e32 v10, v4
; GFX7-NEXT: s_mov_b64 s[6:7], exec
; GFX7-NEXT: .LBB15_1: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: v_readfirstlane_b32 s8, v0
@@ -5901,29 +5907,29 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall__amd
; GFX7-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3]
; GFX7-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
; GFX7-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
-; GFX7-NEXT: buffer_load_dword v6, v8, s[8:11], 0 offen
+; GFX7-NEXT: buffer_load_dword v5, v9, s[8:11], 0 offen
; GFX7-NEXT: s_xor_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_cbranch_execnz .LBB15_1
; GFX7-NEXT: ; %bb.2:
; GFX7-NEXT: s_mov_b64 exec, s[6:7]
-; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v5
+; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v6
; GFX7-NEXT: s_mov_b64 s[6:7], 0
-; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v4
+; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v4
; GFX7-NEXT: .LBB15_3: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Loop Header: Depth=1
; GFX7-NEXT: ; Child Loop BB15_4 Depth 2
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_lshrrev_b32_e32 v4, v7, v6
+; GFX7-NEXT: v_lshrrev_b32_e32 v4, v8, v5
; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4
; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
-; GFX7-NEXT: v_min_f32_e32 v4, v4, v10
+; GFX7-NEXT: v_min_f32_e32 v4, v4, v11
; GFX7-NEXT: v_lshrrev_b32_e32 v4, 16, v4
-; GFX7-NEXT: v_lshlrev_b32_e32 v4, v7, v4
-; GFX7-NEXT: v_and_b32_e32 v5, v6, v9
-; GFX7-NEXT: v_or_b32_e32 v5, v5, v4
-; GFX7-NEXT: v_mov_b32_e32 v4, v5
+; GFX7-NEXT: v_lshlrev_b32_e32 v4, v8, v4
+; GFX7-NEXT: v_and_b32_e32 v6, v5, v10
+; GFX7-NEXT: v_or_b32_e32 v4, v6, v4
+; GFX7-NEXT: v_mov_b32_e32 v7, v5
; GFX7-NEXT: s_mov_b64 s[12:13], exec
-; GFX7-NEXT: v_mov_b32_e32 v5, v6
+; GFX7-NEXT: v_mov_b32_e32 v6, v4
; GFX7-NEXT: .LBB15_4: ; Parent Loop BB15_3 Depth=1
; GFX7-NEXT: ; => This Inner Loop Header: Depth=2
; GFX7-NEXT: v_readfirstlane_b32 s8, v0
@@ -5935,33 +5941,33 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall__amd
; GFX7-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
; GFX7-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc
+; GFX7-NEXT: buffer_atomic_cmpswap v[6:7], v9, s[8:11], 0 offen glc
; GFX7-NEXT: s_xor_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_cbranch_execnz .LBB15_4
; GFX7-NEXT: ; %bb.5: ; in Loop: Header=BB15_3 Depth=1
; GFX7-NEXT: s_mov_b64 exec, s[12:13]
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5
; GFX7-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GFX7-NEXT: v_mov_b32_e32 v6, v4
+; GFX7-NEXT: v_mov_b32_e32 v5, v6
; GFX7-NEXT: buffer_wbinvl1
; GFX7-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX7-NEXT: s_cbranch_execnz .LBB15_3
; GFX7-NEXT: ; %bb.6: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[6:7]
-; GFX7-NEXT: v_lshrrev_b32_e32 v0, v7, v4
-; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, v8, v6
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX6-LABEL: buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall__amdgpu_no_fine_grained_memory:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT: v_add_i32_e32 v4, vcc, 0x200, v4
-; GFX6-NEXT: v_and_b32_e32 v8, -4, v4
+; GFX6-NEXT: v_and_b32_e32 v9, -4, v4
; GFX6-NEXT: v_and_b32_e32 v4, 3, v4
-; GFX6-NEXT: v_lshlrev_b32_e32 v7, 3, v4
-; GFX6-NEXT: v_lshl_b32_e32 v4, 0xffff, v7
-; GFX6-NEXT: v_not_b32_e32 v9, v4
+; GFX6-NEXT: v_lshlrev_b32_e32 v8, 3, v4
+; GFX6-NEXT: v_lshl_b32_e32 v4, 0xffff, v8
+; GFX6-NEXT: v_lshlrev_b32_e32 v6, 16, v5
+; GFX6-NEXT: v_not_b32_e32 v10, v4
; GFX6-NEXT: s_mov_b64 s[6:7], exec
; GFX6-NEXT: .LBB15_1: ; =>This Inner Loop Header: Depth=1
; GFX6-NEXT: v_readfirstlane_b32 s8, v0
@@ -5972,29 +5978,30 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall__amd
; GFX6-NEXT: v_cmp_eq_u64_e64 s[4:5], s[10:11], v[2:3]
; GFX6-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
; GFX6-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
-; GFX6-NEXT: buffer_load_dword v6, v8, s[8:11], 0 offen
+; GFX6-NEXT: buffer_load_dword v5, v9, s[8:11], 0 offen
; GFX6-NEXT: s_xor_b64 exec, exec, s[4:5]
; GFX6-NEXT: s_cbranch_execnz .LBB15_1
; GFX6-NEXT: ; %bb.2:
; GFX6-NEXT: s_mov_b64 exec, s[6:7]
-; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v5
+; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v6
; GFX6-NEXT: s_mov_b64 s[6:7], 0
-; GFX6-NEXT: v_and_b32_e32 v10, 0xffff0000, v4
+; GFX6-NEXT: v_and_b32_e32 v11, 0xffff0000, v4
; GFX6-NEXT: .LBB15_3: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Loop Header: Depth=1
; GFX6-NEXT: ; Child Loop BB15_4 Depth 2
-; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0)
-; GFX6-NEXT: v_lshrrev_b32_e32 v4, v7, v6
+; GFX6-NEXT: s_waitcnt vmcnt(0)
+; GFX6-NEXT: v_lshrrev_b32_e32 v4, v8, v5
; GFX6-NEXT: v_lshlrev_b32_e32 v4, 16, v4
; GFX6-NEXT: v_mul_f32_e32 v4, 1.0, v4
-; GFX6-NEXT: v_min_f32_e32 v4, v4, v10
+; GFX6-NEXT: v_min_f32_e32 v4, v4, v11
; GFX6-NEXT: v_lshrrev_b32_e32 v4, 16, v4
-; GFX6-NEXT: v_lshlrev_b32_e32 v4, v7, v4
-; GFX6-NEXT: v_and_b32_e32 v5, v6, v9
-; GFX6-NEXT: v_or_b32_e32 v5, v5, v4
-; GFX6-NEXT: v_mov_b32_e32 v4, v5
+; GFX6-NEXT: v_lshlrev_b32_e32 v4, v8, v4
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: v_and_b32_e32 v6, v5, v10
+; GFX6-NEXT: v_or_b32_e32 v4, v6, v4
+; GFX6-NEXT: v_mov_b32_e32 v7, v5
; GFX6-NEXT: s_mov_b64 s[12:13], exec
-; GFX6-NEXT: v_mov_b32_e32 v5, v6
+; GFX6-NEXT: v_mov_b32_e32 v6, v4
; GFX6-NEXT: .LBB15_4: ; Parent Loop BB15_3 Depth=1
; GFX6-NEXT: ; => This Inner Loop Header: Depth=2
; GFX6-NEXT: v_readfirstlane_b32 s8, v0
@@ -6006,22 +6013,21 @@ define bfloat @buffer_fat_ptr_agent_atomic_fmin_ret_bf16__offset__waterfall__amd
; GFX6-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
; GFX6-NEXT: s_and_saveexec_b64 s[4:5], s[4:5]
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0)
-; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v8, s[8:11], 0 offen glc
+; GFX6-NEXT: buffer_atomic_cmpswap v[6:7], v9, s[8:11], 0 offen glc
; GFX6-NEXT: s_xor_b64 exec, exec, s[4:5]
; GFX6-NEXT: s_cbranch_execnz .LBB15_4
; GFX6-NEXT: ; %bb.5: ; in Loop: Header=BB15_3 Depth=1
; GFX6-NEXT: s_mov_b64 exec, s[12:13]
; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v6
+; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v6, v5
; GFX6-NEXT: s_or_b64 s[6:7], vcc, s[6:7]
-; GFX6-NEXT: v_mov_b32_e32 v6, v4
+; GFX6-NEXT: v_mov_b32_e32 v5, v6
; GFX6-NEXT: buffer_wbinvl1
; GFX6-NEXT: s_andn2_b64 exec, exec, s[6:7]
; GFX6-NEXT: s_cbranch_execnz .LBB15_3
; GFX6-NEXT: ; %bb.6: ; %atomicrmw.end
; GFX6-NEXT: s_or_b64 exec, exec, s[6:7]
-; GFX6-NEXT: v_lshrrev_b32_e32 v0, v7, v4
-; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX6-NEXT: v_lshrrev_b32_e32 v0, v8, v6
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr bfloat, ptr addrspace(7) %ptr, i32 256
diff --git a/llvm/test/CodeGen/AMDGPU/call-argument-types.ll b/llvm/test/CodeGen/AMDGPU/call-argument-types.ll
index f67e5b86497ba..d3881660bb846 100644
--- a/llvm/test/CodeGen/AMDGPU/call-argument-types.ll
+++ b/llvm/test/CodeGen/AMDGPU/call-argument-types.ll
@@ -1974,7 +1974,7 @@ define amdgpu_kernel void @test_call_external_void_func_f16_imm() #0 {
; CI-NEXT: s_add_u32 s4, s4, external_void_func_f16@rel32@lo+4
; CI-NEXT: s_addc_u32 s5, s5, external_void_func_f16@rel32@hi+12
; CI-NEXT: s_mov_b64 s[2:3], s[38:39]
-; CI-NEXT: v_mov_b32_e32 v0, 4.0
+; CI-NEXT: v_mov_b32_e32 v0, 0x4400
; CI-NEXT: s_mov_b32 s32, 0
; CI-NEXT: s_swappc_b64 s[30:31], s[4:5]
; CI-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/calling-conventions.ll b/llvm/test/CodeGen/AMDGPU/calling-conventions.ll
index 30dc25388767d..689f9d7d59550 100644
--- a/llvm/test/CodeGen/AMDGPU/calling-conventions.ll
+++ b/llvm/test/CodeGen/AMDGPU/calling-conventions.ll
@@ -62,9 +62,9 @@ entry:
define amdgpu_ps half @ps_ret_cc_f16(half %arg0) {
; SI-LABEL: ps_ret_cc_f16:
; SI: ; %bb.0:
-; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
; SI-NEXT: v_add_f32_e32 v0, 1.0, v0
+; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
; SI-NEXT: ; return to shader part epilog
;
; VI-LABEL: ps_ret_cc_f16:
@@ -102,9 +102,9 @@ define amdgpu_ps half @ps_ret_cc_f16(half %arg0) {
define amdgpu_ps half @ps_ret_cc_inreg_f16(half inreg %arg0) {
; SI-LABEL: ps_ret_cc_inreg_f16:
; SI: ; %bb.0:
-; SI-NEXT: v_cvt_f16_f32_e32 v0, s0
-; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
+; SI-NEXT: v_cvt_f32_f16_e32 v0, s0
; SI-NEXT: v_add_f32_e32 v0, 1.0, v0
+; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
; SI-NEXT: ; return to shader part epilog
;
; VI-LABEL: ps_ret_cc_inreg_f16:
@@ -420,9 +420,9 @@ define amdgpu_kernel void @call_fastcc() #0 {
define amdgpu_cs half @cs_mesa(half %arg0) {
; SI-LABEL: cs_mesa:
; SI: ; %bb.0:
-; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
; SI-NEXT: v_add_f32_e32 v0, 1.0, v0
+; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
; SI-NEXT: ; return to shader part epilog
;
; VI-LABEL: cs_mesa:
@@ -461,9 +461,9 @@ define amdgpu_cs half @cs_mesa(half %arg0) {
define amdgpu_ps half @ps_mesa_f16(half %arg0) {
; SI-LABEL: ps_mesa_f16:
; SI: ; %bb.0:
-; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
; SI-NEXT: v_add_f32_e32 v0, 1.0, v0
+; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
; SI-NEXT: ; return to shader part epilog
;
; VI-LABEL: ps_mesa_f16:
@@ -502,9 +502,9 @@ define amdgpu_ps half @ps_mesa_f16(half %arg0) {
define amdgpu_vs half @vs_mesa(half %arg0) {
; SI-LABEL: vs_mesa:
; SI: ; %bb.0:
-; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
; SI-NEXT: v_add_f32_e32 v0, 1.0, v0
+; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
; SI-NEXT: ; return to shader part epilog
;
; VI-LABEL: vs_mesa:
@@ -543,9 +543,9 @@ define amdgpu_vs half @vs_mesa(half %arg0) {
define amdgpu_gs half @gs_mesa(half %arg0) {
; SI-LABEL: gs_mesa:
; SI: ; %bb.0:
-; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
; SI-NEXT: v_add_f32_e32 v0, 1.0, v0
+; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
; SI-NEXT: ; return to shader part epilog
;
; VI-LABEL: gs_mesa:
@@ -584,9 +584,9 @@ define amdgpu_gs half @gs_mesa(half %arg0) {
define amdgpu_hs half @hs_mesa(half %arg0) {
; SI-LABEL: hs_mesa:
; SI: ; %bb.0:
-; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
; SI-NEXT: v_add_f32_e32 v0, 1.0, v0
+; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
; SI-NEXT: ; return to shader part epilog
;
; VI-LABEL: hs_mesa:
@@ -635,7 +635,6 @@ define amdgpu_ps <2 x half> @ps_mesa_v2f16(<2 x half> %arg0) {
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
; SI-NEXT: v_or_b32_e32 v0, v0, v1
-; SI-NEXT: v_readfirstlane_b32 s0, v0
; SI-NEXT: ; return to shader part epilog
;
; VI-LABEL: ps_mesa_v2f16:
@@ -673,7 +672,6 @@ define amdgpu_ps <2 x half> @ps_mesa_inreg_v2f16(<2 x half> inreg %arg0) {
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
; SI-NEXT: v_or_b32_e32 v0, v0, v1
-; SI-NEXT: v_readfirstlane_b32 s0, v0
; SI-NEXT: ; return to shader part epilog
;
; VI-LABEL: ps_mesa_inreg_v2f16:
@@ -804,8 +802,6 @@ define amdgpu_ps <4 x half> @ps_mesa_v4f16(<4 x half> %arg0) {
; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; SI-NEXT: v_or_b32_e32 v0, v0, v2
; SI-NEXT: v_or_b32_e32 v1, v1, v3
-; SI-NEXT: v_readfirstlane_b32 s0, v0
-; SI-NEXT: v_readfirstlane_b32 s1, v1
; SI-NEXT: ; return to shader part epilog
;
; VI-LABEL: ps_mesa_v4f16:
@@ -857,8 +853,6 @@ define amdgpu_ps <4 x half> @ps_mesa_inreg_v4f16(<4 x half> inreg %arg0) {
; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; SI-NEXT: v_or_b32_e32 v0, v0, v2
; SI-NEXT: v_or_b32_e32 v1, v1, v3
-; SI-NEXT: v_readfirstlane_b32 s0, v0
-; SI-NEXT: v_readfirstlane_b32 s1, v1
; SI-NEXT: ; return to shader part epilog
;
; VI-LABEL: ps_mesa_inreg_v4f16:
diff --git a/llvm/test/CodeGen/AMDGPU/clamp.ll b/llvm/test/CodeGen/AMDGPU/clamp.ll
index 5eb6b2f58474d..711e2f2951fae 100644
--- a/llvm/test/CodeGen/AMDGPU/clamp.ll
+++ b/llvm/test/CodeGen/AMDGPU/clamp.ll
@@ -4352,8 +4352,8 @@ define half @v_clamp_f16_minimumnum_maximumnum(half %a) #1 {
; GFX6-LABEL: v_clamp_f16_minimumnum_maximumnum:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-NEXT: v_cvt_f32_f16_e64 v0, v0 clamp
+; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_clamp_f16_minimumnum_maximumnum:
@@ -4408,8 +4408,8 @@ define half @v_clamp_f16_minimumnum_maximumnum_no_ieee(half %a) #5 {
; GFX6-LABEL: v_clamp_f16_minimumnum_maximumnum_no_ieee:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-NEXT: v_cvt_f32_f16_e64 v0, v0 clamp
+; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_clamp_f16_minimumnum_maximumnum_no_ieee:
@@ -4464,11 +4464,10 @@ define half @v_clamp_f16_minimumnum_maximumnum_foldable_source(half %a, half %b)
; GFX6-LABEL: v_clamp_f16_minimumnum_maximumnum_foldable_source:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1
-; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1
; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX6-NEXT: v_add_f32_e64 v0, v0, v1 clamp
+; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_clamp_f16_minimumnum_maximumnum_foldable_source:
@@ -4524,11 +4523,10 @@ define half @v_clamp_f16_minimumnum_maximumnum_no_ieee_foldable_source(half %a,
; GFX6-LABEL: v_clamp_f16_minimumnum_maximumnum_no_ieee_foldable_source:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1
-; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1
; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX6-NEXT: v_add_f32_e64 v0, v0, v1 clamp
+; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_clamp_f16_minimumnum_maximumnum_no_ieee_foldable_source:
diff --git a/llvm/test/CodeGen/AMDGPU/constant-address-space-32bit.ll b/llvm/test/CodeGen/AMDGPU/constant-address-space-32bit.ll
index 40efd06c2bdfd..c48efc925ea8b 100644
--- a/llvm/test/CodeGen/AMDGPU/constant-address-space-32bit.ll
+++ b/llvm/test/CodeGen/AMDGPU/constant-address-space-32bit.ll
@@ -1057,6 +1057,7 @@ define amdgpu_vs <2 x half> @load_v2i16(ptr addrspace(6) inreg %p0, ptr addrspac
; GFX67-NEXT: s_and_b32 s0, s0, 0xffff
; GFX67-NEXT: s_lshl_b32 s1, s3, 16
; GFX67-NEXT: s_or_b32 s0, s0, s1
+; GFX67-NEXT: v_mov_b32_e32 v0, s0
; GFX67-NEXT: ; return to shader part epilog
;
; GFX8-LABEL: load_v2i16:
@@ -1114,6 +1115,8 @@ define amdgpu_vs <3 x half> @load_v3i16(ptr addrspace(6) inreg %p0, ptr addrspac
; GFX67-NEXT: s_lshl_b32 s2, s5, 16
; GFX67-NEXT: s_or_b32 s0, s0, s2
; GFX67-NEXT: s_and_b32 s1, s1, 0xffff
+; GFX67-NEXT: v_mov_b32_e32 v0, s0
+; GFX67-NEXT: v_mov_b32_e32 v1, s1
; GFX67-NEXT: ; return to shader part epilog
;
; GFX8-LABEL: load_v3i16:
@@ -1180,6 +1183,8 @@ define amdgpu_vs <4 x half> @load_v4i16(ptr addrspace(6) inreg %p0, ptr addrspac
; GFX67-NEXT: s_or_b32 s0, s0, s2
; GFX67-NEXT: s_lshl_b32 s2, s5, 16
; GFX67-NEXT: s_or_b32 s1, s1, s2
+; GFX67-NEXT: v_mov_b32_e32 v0, s0
+; GFX67-NEXT: v_mov_b32_e32 v1, s1
; GFX67-NEXT: ; return to shader part epilog
;
; GFX8-LABEL: load_v4i16:
@@ -1259,6 +1264,9 @@ define amdgpu_vs <6 x half> @load_v6i16(ptr addrspace(6) inreg %p0, ptr addrspac
; GFX67-NEXT: s_or_b32 s1, s1, s3
; GFX67-NEXT: s_lshl_b32 s3, s8, 16
; GFX67-NEXT: s_or_b32 s2, s2, s3
+; GFX67-NEXT: v_mov_b32_e32 v0, s0
+; GFX67-NEXT: v_mov_b32_e32 v1, s1
+; GFX67-NEXT: v_mov_b32_e32 v2, s2
; GFX67-NEXT: ; return to shader part epilog
;
; GFX8-LABEL: load_v6i16:
@@ -1355,6 +1363,10 @@ define amdgpu_vs <8 x half> @load_v8i16(ptr addrspace(6) inreg %p0, ptr addrspac
; GFX67-NEXT: s_or_b32 s2, s2, s4
; GFX67-NEXT: s_lshl_b32 s4, s11, 16
; GFX67-NEXT: s_or_b32 s3, s3, s4
+; GFX67-NEXT: v_mov_b32_e32 v0, s0
+; GFX67-NEXT: v_mov_b32_e32 v1, s1
+; GFX67-NEXT: v_mov_b32_e32 v2, s2
+; GFX67-NEXT: v_mov_b32_e32 v3, s3
; GFX67-NEXT: ; return to shader part epilog
;
; GFX8-LABEL: load_v8i16:
@@ -1489,6 +1501,14 @@ define amdgpu_vs <16 x half> @load_v16i16(ptr addrspace(6) inreg %p0, ptr addrsp
; GFX67-NEXT: s_or_b32 s6, s6, s8
; GFX67-NEXT: s_lshl_b32 s8, s23, 16
; GFX67-NEXT: s_or_b32 s7, s7, s8
+; GFX67-NEXT: v_mov_b32_e32 v0, s0
+; GFX67-NEXT: v_mov_b32_e32 v1, s1
+; GFX67-NEXT: v_mov_b32_e32 v2, s2
+; GFX67-NEXT: v_mov_b32_e32 v3, s3
+; GFX67-NEXT: v_mov_b32_e32 v4, s4
+; GFX67-NEXT: v_mov_b32_e32 v5, s5
+; GFX67-NEXT: v_mov_b32_e32 v6, s6
+; GFX67-NEXT: v_mov_b32_e32 v7, s7
; GFX67-NEXT: ; return to shader part epilog
;
; GFX8-LABEL: load_v16i16:
@@ -1810,7 +1830,6 @@ define amdgpu_vs <2 x half> @load_v2f16(ptr addrspace(6) inreg %p0, ptr addrspac
; GFX67-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX67-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX67-NEXT: v_or_b32_e32 v0, v1, v0
-; GFX67-NEXT: v_readfirstlane_b32 s0, v0
; GFX67-NEXT: ; return to shader part epilog
;
; GFX8-LABEL: load_v2f16:
@@ -1868,14 +1887,12 @@ define amdgpu_vs <3 x half> @load_v3f16(ptr addrspace(6) inreg %p0, ptr addrspac
; GFX67-NEXT: v_cvt_f32_f16_e32 v5, s3
; GFX67-NEXT: v_add_f32_e32 v1, v1, v3
; GFX67-NEXT: v_add_f32_e32 v0, v0, v2
-; GFX67-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GFX67-NEXT: v_cvt_f16_f32_e32 v2, v1
; GFX67-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX67-NEXT: v_add_f32_e32 v2, v4, v5
-; GFX67-NEXT: v_cvt_f16_f32_e32 v2, v2
-; GFX67-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX67-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX67-NEXT: v_readfirstlane_b32 s0, v0
-; GFX67-NEXT: v_readfirstlane_b32 s1, v2
+; GFX67-NEXT: v_add_f32_e32 v1, v4, v5
+; GFX67-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GFX67-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX67-NEXT: v_or_b32_e32 v0, v0, v2
; GFX67-NEXT: ; return to shader part epilog
;
; GFX8-LABEL: load_v3f16:
@@ -1933,26 +1950,24 @@ define amdgpu_vs <4 x half> @load_v4f16(ptr addrspace(6) inreg %p0, ptr addrspac
; GFX67-NEXT: v_cvt_f32_f16_e32 v2, s0
; GFX67-NEXT: s_lshr_b32 s0, s2, 16
; GFX67-NEXT: v_cvt_f32_f16_e32 v1, s4
-; GFX67-NEXT: v_cvt_f32_f16_e32 v4, s0
-; GFX67-NEXT: v_cvt_f32_f16_e32 v6, s2
-; GFX67-NEXT: s_lshr_b32 s0, s3, 16
-; GFX67-NEXT: v_cvt_f32_f16_e32 v7, s0
; GFX67-NEXT: v_cvt_f32_f16_e32 v3, s1
-; GFX67-NEXT: v_cvt_f32_f16_e32 v5, s3
-; GFX67-NEXT: v_add_f32_e32 v1, v1, v4
-; GFX67-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GFX67-NEXT: s_lshr_b32 s1, s3, 16
+; GFX67-NEXT: v_cvt_f32_f16_e32 v7, s0
+; GFX67-NEXT: v_cvt_f32_f16_e32 v5, s1
+; GFX67-NEXT: v_cvt_f32_f16_e32 v6, s2
+; GFX67-NEXT: v_cvt_f32_f16_e32 v4, s3
+; GFX67-NEXT: v_add_f32_e32 v1, v1, v7
+; GFX67-NEXT: v_add_f32_e32 v2, v2, v5
; GFX67-NEXT: v_add_f32_e32 v0, v0, v6
-; GFX67-NEXT: v_add_f32_e32 v2, v2, v7
+; GFX67-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GFX67-NEXT: v_add_f32_e32 v3, v3, v4
; GFX67-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX67-NEXT: v_add_f32_e32 v3, v3, v5
; GFX67-NEXT: v_cvt_f16_f32_e32 v2, v2
; GFX67-NEXT: v_cvt_f16_f32_e32 v3, v3
; GFX67-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX67-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX67-NEXT: v_readfirstlane_b32 s0, v0
-; GFX67-NEXT: v_lshlrev_b32_e32 v0, 16, v2
-; GFX67-NEXT: v_or_b32_e32 v0, v3, v0
-; GFX67-NEXT: v_readfirstlane_b32 s1, v0
+; GFX67-NEXT: v_lshlrev_b32_e32 v1, 16, v2
+; GFX67-NEXT: v_or_b32_e32 v1, v3, v1
; GFX67-NEXT: ; return to shader part epilog
;
; GFX8-LABEL: load_v4f16:
@@ -2021,34 +2036,31 @@ define amdgpu_vs <6 x half> @load_v6f16(ptr addrspace(6) inreg %p0, ptr addrspac
; GFX67-NEXT: v_cvt_f32_f16_e32 v6, s0
; GFX67-NEXT: v_cvt_f32_f16_e32 v7, s4
; GFX67-NEXT: s_lshr_b32 s0, s5, 16
-; GFX67-NEXT: v_cvt_f32_f16_e32 v11, s0
-; GFX67-NEXT: v_add_f32_e32 v1, v1, v6
-; GFX67-NEXT: v_cvt_f16_f32_e32 v1, v1
-; GFX67-NEXT: v_add_f32_e32 v0, v0, v7
; GFX67-NEXT: v_cvt_f32_f16_e32 v3, s1
+; GFX67-NEXT: v_add_f32_e32 v1, v1, v6
; GFX67-NEXT: s_lshr_b32 s1, s6, 16
-; GFX67-NEXT: v_cvt_f32_f16_e32 v10, s5
-; GFX67-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX67-NEXT: v_cvt_f32_f16_e32 v10, s0
+; GFX67-NEXT: v_add_f32_e32 v0, v0, v7
+; GFX67-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX67-NEXT: v_cvt_f32_f16_e32 v9, s1
-; GFX67-NEXT: v_add_f32_e32 v2, v2, v11
+; GFX67-NEXT: v_cvt_f32_f16_e32 v11, s5
+; GFX67-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX67-NEXT: v_cvt_f32_f16_e32 v5, s2
; GFX67-NEXT: v_cvt_f32_f16_e32 v8, s6
-; GFX67-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GFX67-NEXT: v_add_f32_e32 v2, v2, v10
; GFX67-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX67-NEXT: v_add_f32_e32 v3, v3, v10
-; GFX67-NEXT: v_or_b32_e32 v0, v0, v1
; GFX67-NEXT: v_add_f32_e32 v4, v4, v9
-; GFX67-NEXT: v_readfirstlane_b32 s0, v0
-; GFX67-NEXT: v_cvt_f16_f32_e32 v0, v3
-; GFX67-NEXT: v_add_f32_e32 v5, v5, v8
-; GFX67-NEXT: v_lshlrev_b32_e32 v1, 16, v2
-; GFX67-NEXT: v_cvt_f16_f32_e32 v2, v4
-; GFX67-NEXT: v_cvt_f16_f32_e32 v3, v5
+; GFX67-NEXT: v_add_f32_e32 v3, v3, v11
; GFX67-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX67-NEXT: v_readfirstlane_b32 s1, v0
-; GFX67-NEXT: v_lshlrev_b32_e32 v0, 16, v2
-; GFX67-NEXT: v_or_b32_e32 v0, v3, v0
-; GFX67-NEXT: v_readfirstlane_b32 s2, v0
+; GFX67-NEXT: v_cvt_f16_f32_e32 v1, v2
+; GFX67-NEXT: v_add_f32_e32 v5, v5, v8
+; GFX67-NEXT: v_cvt_f16_f32_e32 v2, v3
+; GFX67-NEXT: v_cvt_f16_f32_e32 v3, v4
+; GFX67-NEXT: v_cvt_f16_f32_e32 v4, v5
+; GFX67-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX67-NEXT: v_or_b32_e32 v1, v2, v1
+; GFX67-NEXT: v_lshlrev_b32_e32 v2, 16, v3
+; GFX67-NEXT: v_or_b32_e32 v2, v4, v2
; GFX67-NEXT: ; return to shader part epilog
;
; GFX8-LABEL: load_v6f16:
@@ -2128,47 +2140,43 @@ define amdgpu_vs <8 x half> @load_v8f16(ptr addrspace(6) inreg %p0, ptr addrspac
; GFX67-NEXT: s_waitcnt lgkmcnt(0)
; GFX67-NEXT: s_lshr_b32 s0, s4, 16
; GFX67-NEXT: v_cvt_f32_f16_e32 v8, s0
-; GFX67-NEXT: v_cvt_f32_f16_e32 v9, s4
; GFX67-NEXT: s_lshr_b32 s0, s5, 16
+; GFX67-NEXT: v_cvt_f32_f16_e32 v9, s4
; GFX67-NEXT: v_cvt_f32_f16_e32 v10, s0
-; GFX67-NEXT: v_add_f32_e32 v1, v1, v8
; GFX67-NEXT: v_cvt_f32_f16_e32 v3, s1
; GFX67-NEXT: v_cvt_f32_f16_e32 v11, s5
-; GFX67-NEXT: v_cvt_f16_f32_e32 v1, v1
-; GFX67-NEXT: v_add_f32_e32 v0, v0, v9
; GFX67-NEXT: s_lshr_b32 s0, s6, 16
-; GFX67-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX67-NEXT: v_add_f32_e32 v1, v1, v8
+; GFX67-NEXT: s_lshr_b32 s1, s7, 16
; GFX67-NEXT: v_cvt_f32_f16_e32 v15, s0
; GFX67-NEXT: v_add_f32_e32 v2, v2, v10
-; GFX67-NEXT: v_cvt_f16_f32_e32 v2, v2
-; GFX67-NEXT: v_add_f32_e32 v3, v3, v11
-; GFX67-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX67-NEXT: v_add_f32_e32 v0, v0, v9
+; GFX67-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX67-NEXT: v_cvt_f32_f16_e32 v5, s2
-; GFX67-NEXT: s_lshr_b32 s1, s7, 16
-; GFX67-NEXT: v_cvt_f32_f16_e32 v14, s6
-; GFX67-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX67-NEXT: v_cvt_f16_f32_e32 v1, v3
; GFX67-NEXT: v_cvt_f32_f16_e32 v13, s1
-; GFX67-NEXT: v_add_f32_e32 v4, v4, v15
+; GFX67-NEXT: v_cvt_f32_f16_e32 v14, s6
+; GFX67-NEXT: v_add_f32_e32 v3, v3, v11
+; GFX67-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX67-NEXT: v_cvt_f16_f32_e32 v2, v2
; GFX67-NEXT: v_cvt_f32_f16_e32 v7, s3
; GFX67-NEXT: v_cvt_f32_f16_e32 v12, s7
-; GFX67-NEXT: v_readfirstlane_b32 s0, v0
-; GFX67-NEXT: v_lshlrev_b32_e32 v0, 16, v2
-; GFX67-NEXT: v_cvt_f16_f32_e32 v2, v4
-; GFX67-NEXT: v_add_f32_e32 v5, v5, v14
-; GFX67-NEXT: v_or_b32_e32 v0, v1, v0
+; GFX67-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GFX67-NEXT: v_add_f32_e32 v4, v4, v15
+; GFX67-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX67-NEXT: v_add_f32_e32 v6, v6, v13
-; GFX67-NEXT: v_readfirstlane_b32 s1, v0
-; GFX67-NEXT: v_cvt_f16_f32_e32 v0, v5
-; GFX67-NEXT: v_add_f32_e32 v7, v7, v12
-; GFX67-NEXT: v_lshlrev_b32_e32 v1, 16, v2
-; GFX67-NEXT: v_cvt_f16_f32_e32 v2, v6
-; GFX67-NEXT: v_cvt_f16_f32_e32 v3, v7
+; GFX67-NEXT: v_add_f32_e32 v5, v5, v14
; GFX67-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX67-NEXT: v_readfirstlane_b32 s2, v0
-; GFX67-NEXT: v_lshlrev_b32_e32 v0, 16, v2
-; GFX67-NEXT: v_or_b32_e32 v0, v3, v0
-; GFX67-NEXT: v_readfirstlane_b32 s3, v0
+; GFX67-NEXT: v_lshlrev_b32_e32 v1, 16, v2
+; GFX67-NEXT: v_cvt_f16_f32_e32 v2, v4
+; GFX67-NEXT: v_add_f32_e32 v7, v7, v12
+; GFX67-NEXT: v_or_b32_e32 v1, v3, v1
+; GFX67-NEXT: v_cvt_f16_f32_e32 v3, v5
+; GFX67-NEXT: v_cvt_f16_f32_e32 v4, v6
+; GFX67-NEXT: v_cvt_f16_f32_e32 v5, v7
+; GFX67-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX67-NEXT: v_or_b32_e32 v2, v3, v2
+; GFX67-NEXT: v_lshlrev_b32_e32 v3, 16, v4
+; GFX67-NEXT: v_or_b32_e32 v3, v5, v3
; GFX67-NEXT: ; return to shader part epilog
;
; GFX8-LABEL: load_v8f16:
@@ -2265,92 +2273,84 @@ define amdgpu_vs <16 x half> @load_v16f16(ptr addrspace(6) inreg %p0, ptr addrsp
; GFX67-NEXT: v_cvt_f32_f16_e32 v12, s0
; GFX67-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x10
; GFX67-NEXT: v_cvt_f32_f16_e32 v9, s8
+; GFX67-NEXT: v_cvt_f32_f16_e32 v13, s10
; GFX67-NEXT: s_lshr_b32 s8, s11, 16
; GFX67-NEXT: v_cvt_f32_f16_e32 v14, s8
-; GFX67-NEXT: v_cvt_f32_f16_e32 v15, s11
; GFX67-NEXT: s_waitcnt lgkmcnt(0)
-; GFX67-NEXT: s_lshr_b32 s8, s0, 16
-; GFX67-NEXT: v_cvt_f32_f16_e32 v18, s0
-; GFX67-NEXT: s_lshr_b32 s0, s7, 16
-; GFX67-NEXT: v_cvt_f32_f16_e32 v19, s0
; GFX67-NEXT: v_cvt_f32_f16_e32 v17, s7
-; GFX67-NEXT: s_lshr_b32 s0, s6, 16
-; GFX67-NEXT: v_cvt_f32_f16_e32 v13, s10
-; GFX67-NEXT: v_add_f32_e32 v14, v14, v19
-; GFX67-NEXT: v_cvt_f32_f16_e32 v19, s0
-; GFX67-NEXT: v_add_f32_e32 v15, v15, v17
-; GFX67-NEXT: v_cvt_f32_f16_e32 v17, s6
-; GFX67-NEXT: s_lshr_b32 s0, s5, 16
-; GFX67-NEXT: v_add_f32_e32 v12, v12, v19
-; GFX67-NEXT: v_cvt_f32_f16_e32 v19, s0
+; GFX67-NEXT: s_lshr_b32 s7, s7, 16
+; GFX67-NEXT: v_cvt_f32_f16_e32 v19, s6
+; GFX67-NEXT: v_cvt_f32_f16_e32 v18, s7
+; GFX67-NEXT: v_cvt_f32_f16_e32 v15, s11
+; GFX67-NEXT: s_lshr_b32 s12, s5, 16
+; GFX67-NEXT: v_add_f32_e32 v13, v13, v19
+; GFX67-NEXT: v_cvt_f32_f16_e32 v19, s12
; GFX67-NEXT: v_cvt_f32_f16_e32 v11, s9
-; GFX67-NEXT: v_add_f32_e32 v13, v13, v17
-; GFX67-NEXT: v_cvt_f32_f16_e32 v17, s5
-; GFX67-NEXT: s_lshr_b32 s0, s4, 16
+; GFX67-NEXT: s_lshr_b32 s13, s6, 16
+; GFX67-NEXT: v_add_f32_e32 v14, v14, v18
+; GFX67-NEXT: v_cvt_f32_f16_e32 v18, s5
+; GFX67-NEXT: v_add_f32_e32 v15, v15, v17
+; GFX67-NEXT: v_cvt_f32_f16_e32 v17, s13
+; GFX67-NEXT: s_lshr_b32 s11, s4, 16
; GFX67-NEXT: v_add_f32_e32 v10, v10, v19
-; GFX67-NEXT: v_cvt_f32_f16_e32 v19, s0
-; GFX67-NEXT: v_add_f32_e32 v11, v11, v17
+; GFX67-NEXT: v_cvt_f32_f16_e32 v19, s3
+; GFX67-NEXT: v_add_f32_e32 v11, v11, v18
+; GFX67-NEXT: v_cvt_f32_f16_e32 v18, s11
+; GFX67-NEXT: v_add_f32_e32 v12, v12, v17
; GFX67-NEXT: v_cvt_f32_f16_e32 v17, s4
-; GFX67-NEXT: s_lshr_b32 s0, s3, 16
-; GFX67-NEXT: v_add_f32_e32 v8, v8, v19
-; GFX67-NEXT: v_cvt_f32_f16_e32 v19, s0
+; GFX67-NEXT: s_lshr_b32 s9, s2, 16
+; GFX67-NEXT: v_add_f32_e32 v7, v7, v19
+; GFX67-NEXT: v_cvt_f32_f16_e32 v19, s9
+; GFX67-NEXT: s_lshr_b32 s10, s3, 16
+; GFX67-NEXT: v_add_f32_e32 v8, v8, v18
+; GFX67-NEXT: v_cvt_f32_f16_e32 v18, s2
; GFX67-NEXT: v_add_f32_e32 v9, v9, v17
-; GFX67-NEXT: v_cvt_f32_f16_e32 v17, s3
-; GFX67-NEXT: s_lshr_b32 s0, s2, 16
-; GFX67-NEXT: v_add_f32_e32 v6, v6, v19
-; GFX67-NEXT: v_cvt_f32_f16_e32 v19, s0
-; GFX67-NEXT: v_cvt_f32_f16_e32 v16, s8
-; GFX67-NEXT: v_add_f32_e32 v7, v7, v17
-; GFX67-NEXT: v_cvt_f32_f16_e32 v17, s2
+; GFX67-NEXT: v_cvt_f32_f16_e32 v17, s10
+; GFX67-NEXT: s_lshr_b32 s8, s0, 16
+; GFX67-NEXT: v_cvt_f32_f16_e32 v16, s0
; GFX67-NEXT: s_lshr_b32 s0, s1, 16
; GFX67-NEXT: v_add_f32_e32 v4, v4, v19
-; GFX67-NEXT: v_cvt_f32_f16_e32 v19, s0
-; GFX67-NEXT: v_add_f32_e32 v1, v1, v16
-; GFX67-NEXT: v_add_f32_e32 v5, v5, v17
+; GFX67-NEXT: v_cvt_f32_f16_e32 v19, s8
+; GFX67-NEXT: v_add_f32_e32 v5, v5, v18
+; GFX67-NEXT: v_cvt_f32_f16_e32 v18, s0
+; GFX67-NEXT: v_add_f32_e32 v6, v6, v17
; GFX67-NEXT: v_cvt_f32_f16_e32 v17, s1
+; GFX67-NEXT: v_add_f32_e32 v1, v1, v19
+; GFX67-NEXT: v_add_f32_e32 v2, v2, v18
+; GFX67-NEXT: v_add_f32_e32 v0, v0, v16
; GFX67-NEXT: v_cvt_f16_f32_e32 v1, v1
-; GFX67-NEXT: v_add_f32_e32 v0, v0, v18
+; GFX67-NEXT: v_add_f32_e32 v3, v3, v17
; GFX67-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX67-NEXT: v_add_f32_e32 v2, v2, v19
; GFX67-NEXT: v_cvt_f16_f32_e32 v2, v2
-; GFX67-NEXT: v_add_f32_e32 v3, v3, v17
+; GFX67-NEXT: v_cvt_f16_f32_e32 v3, v3
; GFX67-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX67-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX67-NEXT: v_cvt_f16_f32_e32 v1, v3
-; GFX67-NEXT: v_readfirstlane_b32 s0, v0
-; GFX67-NEXT: v_lshlrev_b32_e32 v0, 16, v2
-; GFX67-NEXT: v_cvt_f16_f32_e32 v2, v4
-; GFX67-NEXT: v_or_b32_e32 v0, v1, v0
-; GFX67-NEXT: v_cvt_f16_f32_e32 v1, v5
-; GFX67-NEXT: v_readfirstlane_b32 s1, v0
-; GFX67-NEXT: v_lshlrev_b32_e32 v0, 16, v2
-; GFX67-NEXT: v_cvt_f16_f32_e32 v2, v6
-; GFX67-NEXT: v_or_b32_e32 v0, v1, v0
-; GFX67-NEXT: v_cvt_f16_f32_e32 v1, v7
-; GFX67-NEXT: v_readfirstlane_b32 s2, v0
-; GFX67-NEXT: v_lshlrev_b32_e32 v0, 16, v2
-; GFX67-NEXT: v_cvt_f16_f32_e32 v2, v8
-; GFX67-NEXT: v_or_b32_e32 v0, v1, v0
-; GFX67-NEXT: v_cvt_f16_f32_e32 v1, v9
-; GFX67-NEXT: v_readfirstlane_b32 s3, v0
-; GFX67-NEXT: v_lshlrev_b32_e32 v0, 16, v2
-; GFX67-NEXT: v_cvt_f16_f32_e32 v2, v10
-; GFX67-NEXT: v_or_b32_e32 v0, v1, v0
-; GFX67-NEXT: v_cvt_f16_f32_e32 v1, v11
-; GFX67-NEXT: v_readfirstlane_b32 s4, v0
-; GFX67-NEXT: v_lshlrev_b32_e32 v0, 16, v2
-; GFX67-NEXT: v_cvt_f16_f32_e32 v2, v12
-; GFX67-NEXT: v_or_b32_e32 v0, v1, v0
-; GFX67-NEXT: v_readfirstlane_b32 s5, v0
-; GFX67-NEXT: v_cvt_f16_f32_e32 v0, v13
; GFX67-NEXT: v_lshlrev_b32_e32 v1, 16, v2
-; GFX67-NEXT: v_cvt_f16_f32_e32 v2, v14
-; GFX67-NEXT: v_cvt_f16_f32_e32 v3, v15
-; GFX67-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX67-NEXT: v_readfirstlane_b32 s6, v0
-; GFX67-NEXT: v_lshlrev_b32_e32 v0, 16, v2
-; GFX67-NEXT: v_or_b32_e32 v0, v3, v0
-; GFX67-NEXT: v_readfirstlane_b32 s7, v0
+; GFX67-NEXT: v_cvt_f16_f32_e32 v2, v4
+; GFX67-NEXT: v_or_b32_e32 v1, v3, v1
+; GFX67-NEXT: v_cvt_f16_f32_e32 v3, v5
+; GFX67-NEXT: v_cvt_f16_f32_e32 v4, v6
+; GFX67-NEXT: v_cvt_f16_f32_e32 v5, v7
+; GFX67-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX67-NEXT: v_or_b32_e32 v2, v3, v2
+; GFX67-NEXT: v_lshlrev_b32_e32 v3, 16, v4
+; GFX67-NEXT: v_cvt_f16_f32_e32 v4, v8
+; GFX67-NEXT: v_or_b32_e32 v3, v5, v3
+; GFX67-NEXT: v_cvt_f16_f32_e32 v5, v9
+; GFX67-NEXT: v_cvt_f16_f32_e32 v6, v10
+; GFX67-NEXT: v_cvt_f16_f32_e32 v7, v11
+; GFX67-NEXT: v_lshlrev_b32_e32 v4, 16, v4
+; GFX67-NEXT: v_or_b32_e32 v4, v5, v4
+; GFX67-NEXT: v_lshlrev_b32_e32 v5, 16, v6
+; GFX67-NEXT: v_cvt_f16_f32_e32 v6, v12
+; GFX67-NEXT: v_or_b32_e32 v5, v7, v5
+; GFX67-NEXT: v_cvt_f16_f32_e32 v7, v13
+; GFX67-NEXT: v_cvt_f16_f32_e32 v8, v14
+; GFX67-NEXT: v_cvt_f16_f32_e32 v9, v15
+; GFX67-NEXT: v_lshlrev_b32_e32 v6, 16, v6
+; GFX67-NEXT: v_or_b32_e32 v6, v7, v6
+; GFX67-NEXT: v_lshlrev_b32_e32 v7, 16, v8
+; GFX67-NEXT: v_or_b32_e32 v7, v9, v7
; GFX67-NEXT: ; return to shader part epilog
;
; GFX8-LABEL: load_v16f16:
diff --git a/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll b/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll
index 795f0841cede2..ce7b09bf7ff27 100644
--- a/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll
+++ b/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll
@@ -474,20 +474,12 @@ define <4 x float> @v_uitofp_unpack_i32_to_v4f32(i32 %arg0) nounwind {
}
define half @v_uitofp_i32_to_f16_mask255(i32 %arg0) nounwind {
-; SI-LABEL: v_uitofp_i32_to_f16_mask255:
-; SI: ; %bb.0:
-; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
-; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
-; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
-; SI-NEXT: s_setpc_b64 s[30:31]
-;
-; VI-LABEL: v_uitofp_i32_to_f16_mask255:
-; VI: ; %bb.0:
-; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
-; VI-NEXT: v_cvt_f16_f32_e32 v0, v0
-; VI-NEXT: s_setpc_b64 s[30:31]
+; GCN-LABEL: v_uitofp_i32_to_f16_mask255:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
+; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_uitofp_i32_to_f16_mask255:
; GFX10: ; %bb.0:
@@ -524,20 +516,12 @@ define half @v_uitofp_i32_to_f16_mask255(i32 %arg0) nounwind {
}
define half @v_sitofp_i32_to_f16_mask255(i32 %arg0) nounwind {
-; SI-LABEL: v_sitofp_i32_to_f16_mask255:
-; SI: ; %bb.0:
-; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
-; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
-; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
-; SI-NEXT: s_setpc_b64 s[30:31]
-;
-; VI-LABEL: v_sitofp_i32_to_f16_mask255:
-; VI: ; %bb.0:
-; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
-; VI-NEXT: v_cvt_f16_f32_e32 v0, v0
-; VI-NEXT: s_setpc_b64 s[30:31]
+; GCN-LABEL: v_sitofp_i32_to_f16_mask255:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
+; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_sitofp_i32_to_f16_mask255:
; GFX10: ; %bb.0:
@@ -574,20 +558,12 @@ define half @v_sitofp_i32_to_f16_mask255(i32 %arg0) nounwind {
}
define half @v_uitofp_to_f16_lshr8_mask255(i32 %arg0) nounwind {
-; SI-LABEL: v_uitofp_to_f16_lshr8_mask255:
-; SI: ; %bb.0:
-; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-NEXT: v_cvt_f32_ubyte1_e32 v0, v0
-; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
-; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
-; SI-NEXT: s_setpc_b64 s[30:31]
-;
-; VI-LABEL: v_uitofp_to_f16_lshr8_mask255:
-; VI: ; %bb.0:
-; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-NEXT: v_cvt_f32_ubyte1_e32 v0, v0
-; VI-NEXT: v_cvt_f16_f32_e32 v0, v0
-; VI-NEXT: s_setpc_b64 s[30:31]
+; GCN-LABEL: v_uitofp_to_f16_lshr8_mask255:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_cvt_f32_ubyte1_e32 v0, v0
+; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_uitofp_to_f16_lshr8_mask255:
; GFX10: ; %bb.0:
@@ -625,20 +601,12 @@ define half @v_uitofp_to_f16_lshr8_mask255(i32 %arg0) nounwind {
}
define half @v_uitofp_to_f16_lshr16_mask255(i32 %arg0) nounwind {
-; SI-LABEL: v_uitofp_to_f16_lshr16_mask255:
-; SI: ; %bb.0:
-; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-NEXT: v_cvt_f32_ubyte2_e32 v0, v0
-; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
-; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
-; SI-NEXT: s_setpc_b64 s[30:31]
-;
-; VI-LABEL: v_uitofp_to_f16_lshr16_mask255:
-; VI: ; %bb.0:
-; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-NEXT: v_cvt_f32_ubyte2_e32 v0, v0
-; VI-NEXT: v_cvt_f16_f32_e32 v0, v0
-; VI-NEXT: s_setpc_b64 s[30:31]
+; GCN-LABEL: v_uitofp_to_f16_lshr16_mask255:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_cvt_f32_ubyte2_e32 v0, v0
+; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_uitofp_to_f16_lshr16_mask255:
; GFX10: ; %bb.0:
@@ -676,20 +644,12 @@ define half @v_uitofp_to_f16_lshr16_mask255(i32 %arg0) nounwind {
}
define half @v_uitofp_to_f16_lshr24_mask255(i32 %arg0) nounwind {
-; SI-LABEL: v_uitofp_to_f16_lshr24_mask255:
-; SI: ; %bb.0:
-; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-NEXT: v_cvt_f32_ubyte3_e32 v0, v0
-; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
-; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
-; SI-NEXT: s_setpc_b64 s[30:31]
-;
-; VI-LABEL: v_uitofp_to_f16_lshr24_mask255:
-; VI: ; %bb.0:
-; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-NEXT: v_cvt_f32_ubyte3_e32 v0, v0
-; VI-NEXT: v_cvt_f16_f32_e32 v0, v0
-; VI-NEXT: s_setpc_b64 s[30:31]
+; GCN-LABEL: v_uitofp_to_f16_lshr24_mask255:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_cvt_f32_ubyte3_e32 v0, v0
+; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX10-LABEL: v_uitofp_to_f16_lshr24_mask255:
; GFX10: ; %bb.0:
@@ -732,7 +692,6 @@ define half @v_uitofp_i8_to_f16(i8 %arg0) nounwind {
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: v_cvt_f32_ubyte0_e32 v0, v0
; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
-; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
; SI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: v_uitofp_i8_to_f16:
diff --git a/llvm/test/CodeGen/AMDGPU/dagcombine-fmul-sel.ll b/llvm/test/CodeGen/AMDGPU/dagcombine-fmul-sel.ll
index 777b703d5319d..21abcbd4f5edc 100644
--- a/llvm/test/CodeGen/AMDGPU/dagcombine-fmul-sel.ll
+++ b/llvm/test/CodeGen/AMDGPU/dagcombine-fmul-sel.ll
@@ -1539,25 +1539,15 @@ define double @fmul_select_f64_test15_sel_log2val_neg42_neg33(double %x, i32 %bo
define half @fmul_select_f16_test1(half %x, i32 %bool.arg1, i32 %bool.arg2) {
-; GFX7-SDAG-LABEL: fmul_select_f16_test1:
-; GFX7-SDAG: ; %bb.0:
-; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX7-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
-; GFX7-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
-; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX7-SDAG-NEXT: v_ldexp_f32_e32 v0, v0, v1
-; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX7-GISEL-LABEL: fmul_select_f16_test1:
-; GFX7-GISEL: ; %bb.0:
-; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX7-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
-; GFX7-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
-; GFX7-GISEL-NEXT: v_ldexp_f32_e32 v0, v0, v1
-; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX7-GISEL-NEXT: s_setpc_b64 s[30:31]
+; GFX7-LABEL: fmul_select_f16_test1:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX7-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
+; GFX7-NEXT: v_ldexp_f32_e32 v0, v0, v1
+; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-SDAG-LABEL: fmul_select_f16_test1:
; GFX9-SDAG: ; %bb.0:
@@ -1642,25 +1632,15 @@ define half @fmul_select_f16_test1(half %x, i32 %bool.arg1, i32 %bool.arg2) {
}
define half @fmul_select_f16_test2(half %x, i32 %bool.arg1, i32 %bool.arg2) {
-; GFX7-SDAG-LABEL: fmul_select_f16_test2:
-; GFX7-SDAG: ; %bb.0:
-; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX7-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
-; GFX7-SDAG-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc
-; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX7-SDAG-NEXT: v_ldexp_f32_e32 v0, v0, v1
-; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX7-GISEL-LABEL: fmul_select_f16_test2:
-; GFX7-GISEL: ; %bb.0:
-; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX7-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
-; GFX7-GISEL-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc
-; GFX7-GISEL-NEXT: v_ldexp_f32_e32 v0, v0, v1
-; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX7-GISEL-NEXT: s_setpc_b64 s[30:31]
+; GFX7-LABEL: fmul_select_f16_test2:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX7-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc
+; GFX7-NEXT: v_ldexp_f32_e32 v0, v0, v1
+; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-SDAG-LABEL: fmul_select_f16_test2:
; GFX9-SDAG: ; %bb.0:
@@ -2078,25 +2058,15 @@ define <2 x half> @fmul_select_v2f16_test4(<2 x half> %x, <2 x i32> %bool.arg1,
}
define half @fmul_select_f16_test5(half %x, i32 %bool.arg1, i32 %bool.arg2) {
-; GFX7-SDAG-LABEL: fmul_select_f16_test5:
-; GFX7-SDAG: ; %bb.0:
-; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX7-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
-; GFX7-SDAG-NEXT: v_cndmask_b32_e64 v1, 3, 1, vcc
-; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX7-SDAG-NEXT: v_ldexp_f32_e32 v0, v0, v1
-; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX7-GISEL-LABEL: fmul_select_f16_test5:
-; GFX7-GISEL: ; %bb.0:
-; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX7-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
-; GFX7-GISEL-NEXT: v_cndmask_b32_e64 v1, 3, 1, vcc
-; GFX7-GISEL-NEXT: v_ldexp_f32_e32 v0, v0, v1
-; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX7-GISEL-NEXT: s_setpc_b64 s[30:31]
+; GFX7-LABEL: fmul_select_f16_test5:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX7-NEXT: v_cndmask_b32_e64 v1, 3, 1, vcc
+; GFX7-NEXT: v_ldexp_f32_e32 v0, v0, v1
+; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: fmul_select_f16_test5:
; GFX9: ; %bb.0:
@@ -2159,13 +2129,13 @@ define half @fmul_select_f16_test6(half %x, i32 %bool.arg1, i32 %bool.arg2) {
; GFX7-SDAG-LABEL: fmul_select_f16_test6:
; GFX7-SDAG: ; %bb.0:
; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX7-SDAG-NEXT: v_mov_b32_e32 v3, 0x40400000
; GFX7-SDAG-NEXT: v_mov_b32_e32 v4, 0xc1000000
; GFX7-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
-; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc
; GFX7-SDAG-NEXT: v_mul_f32_e32 v0, v0, v1
+; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-GISEL-LABEL: fmul_select_f16_test6:
@@ -2268,12 +2238,12 @@ define half @fmul_select_f16_test7(half %x, i32 %bool.arg1, i32 %bool.arg2) {
; GFX7-SDAG-LABEL: fmul_select_f16_test7:
; GFX7-SDAG: ; %bb.0:
; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX7-SDAG-NEXT: v_mov_b32_e32 v3, 0x41000000
; GFX7-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v1, -4.0, v3, vcc
-; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX7-SDAG-NEXT: v_mul_f32_e32 v0, v0, v1
+; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-GISEL-LABEL: fmul_select_f16_test7:
@@ -2376,12 +2346,12 @@ define half @fmul_select_f16_test8(half %x, i32 %bool.arg1, i32 %bool.arg2) {
; GFX7-SDAG-LABEL: fmul_select_f16_test8:
; GFX7-SDAG: ; %bb.0:
; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX7-SDAG-NEXT: v_bfrev_b32_e32 v3, 1
; GFX7-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
; GFX7-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v3, vcc
-; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX7-SDAG-NEXT: v_mul_f32_e32 v0, v0, v1
+; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-GISEL-LABEL: fmul_select_f16_test8:
@@ -2458,11 +2428,12 @@ define half @fmul_select_f16_test9(half %x, i32 %bool.arg1, i32 %bool.arg2) {
; GFX7-SDAG-LABEL: fmul_select_f16_test9:
; GFX7-SDAG: ; %bb.0:
; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-SDAG-NEXT: v_cvt_f16_f32_e64 v0, -v0
+; GFX7-SDAG-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX7-SDAG-NEXT: v_cvt_f32_f16_e64 v0, -v0
; GFX7-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
; GFX7-SDAG-NEXT: v_cndmask_b32_e64 v1, 5, 4, vcc
-; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX7-SDAG-NEXT: v_ldexp_f32_e32 v0, v0, v1
+; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-GISEL-LABEL: fmul_select_f16_test9:
@@ -2565,25 +2536,15 @@ define half @fmul_select_f16_test9(half %x, i32 %bool.arg1, i32 %bool.arg2) {
}
define half @fmul_select_f16_test10_sel_log2val_neg11_pos11(half %x, i32 %bool.arg1, i32 %bool.arg2) {
-; GFX7-SDAG-LABEL: fmul_select_f16_test10_sel_log2val_neg11_pos11:
-; GFX7-SDAG: ; %bb.0:
-; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX7-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
-; GFX7-SDAG-NEXT: v_cndmask_b32_e64 v1, 11, -11, vcc
-; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX7-SDAG-NEXT: v_ldexp_f32_e32 v0, v0, v1
-; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX7-GISEL-LABEL: fmul_select_f16_test10_sel_log2val_neg11_pos11:
-; GFX7-GISEL: ; %bb.0:
-; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX7-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
-; GFX7-GISEL-NEXT: v_cndmask_b32_e64 v1, 11, -11, vcc
-; GFX7-GISEL-NEXT: v_ldexp_f32_e32 v0, v0, v1
-; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX7-GISEL-NEXT: s_setpc_b64 s[30:31]
+; GFX7-LABEL: fmul_select_f16_test10_sel_log2val_neg11_pos11:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX7-NEXT: v_cndmask_b32_e64 v1, 11, -11, vcc
+; GFX7-NEXT: v_ldexp_f32_e32 v0, v0, v1
+; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-SDAG-LABEL: fmul_select_f16_test10_sel_log2val_neg11_pos11:
; GFX9-SDAG: ; %bb.0:
@@ -2668,25 +2629,15 @@ define half @fmul_select_f16_test10_sel_log2val_neg11_pos11(half %x, i32 %bool.a
}
define half @fmul_select_f16_test11_sel_log2val_pos7_neg14(half %x, i32 %bool.arg1, i32 %bool.arg2) {
-; GFX7-SDAG-LABEL: fmul_select_f16_test11_sel_log2val_pos7_neg14:
-; GFX7-SDAG: ; %bb.0:
-; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX7-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
-; GFX7-SDAG-NEXT: v_cndmask_b32_e64 v1, -14, 7, vcc
-; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX7-SDAG-NEXT: v_ldexp_f32_e32 v0, v0, v1
-; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX7-GISEL-LABEL: fmul_select_f16_test11_sel_log2val_pos7_neg14:
-; GFX7-GISEL: ; %bb.0:
-; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX7-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
-; GFX7-GISEL-NEXT: v_cndmask_b32_e64 v1, -14, 7, vcc
-; GFX7-GISEL-NEXT: v_ldexp_f32_e32 v0, v0, v1
-; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX7-GISEL-NEXT: s_setpc_b64 s[30:31]
+; GFX7-LABEL: fmul_select_f16_test11_sel_log2val_pos7_neg14:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX7-NEXT: v_cndmask_b32_e64 v1, -14, 7, vcc
+; GFX7-NEXT: v_ldexp_f32_e32 v0, v0, v1
+; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-SDAG-LABEL: fmul_select_f16_test11_sel_log2val_pos7_neg14:
; GFX9-SDAG: ; %bb.0:
@@ -2774,12 +2725,11 @@ define bfloat @fmul_select_bf16_test1(bfloat %x, i32 %bool.arg1, i32 %bool.arg2)
; GFX7-LABEL: fmul_select_bf16_test1:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
; GFX7-NEXT: v_cndmask_b32_e64 v1, 1.0, 2.0, vcc
-; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX7-NEXT: v_mul_f32_e32 v0, v0, v1
-; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: fmul_select_bf16_test1:
@@ -2908,12 +2858,11 @@ define bfloat @fmul_select_bf16_test2(bfloat %x, i32 %bool.arg1, i32 %bool.arg2)
; GFX7-LABEL: fmul_select_bf16_test2:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
; GFX7-NEXT: v_cndmask_b32_e64 v1, 1.0, 0.5, vcc
-; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX7-NEXT: v_mul_f32_e32 v0, v0, v1
-; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: fmul_select_bf16_test2:
@@ -3448,13 +3397,12 @@ define bfloat @fmul_select_bf16_test5(bfloat %x, i32 %bool.arg1, i32 %bool.arg2)
; GFX7-LABEL: fmul_select_bf16_test5:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GFX7-NEXT: v_mov_b32_e32 v3, 0x41000000
; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
; GFX7-NEXT: v_cndmask_b32_e64 v1, v3, 2.0, vcc
-; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX7-NEXT: v_mul_f32_e32 v0, v0, v1
-; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: fmul_select_bf16_test5:
@@ -3583,14 +3531,13 @@ define bfloat @fmul_select_bf16_test6(bfloat %x, i32 %bool.arg1, i32 %bool.arg2)
; GFX7-LABEL: fmul_select_bf16_test6:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GFX7-NEXT: v_mov_b32_e32 v3, 0x40400000
; GFX7-NEXT: v_mov_b32_e32 v4, 0xc1000000
; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
; GFX7-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc
-; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX7-NEXT: v_mul_f32_e32 v0, v0, v1
-; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: fmul_select_bf16_test6:
@@ -3719,13 +3666,12 @@ define bfloat @fmul_select_bf16_test7(bfloat %x, i32 %bool.arg1, i32 %bool.arg2)
; GFX7-LABEL: fmul_select_bf16_test7:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GFX7-NEXT: v_mov_b32_e32 v3, 0x41000000
; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
; GFX7-NEXT: v_cndmask_b32_e32 v1, -4.0, v3, vcc
-; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX7-NEXT: v_mul_f32_e32 v0, v0, v1
-; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: fmul_select_bf16_test7:
@@ -3854,13 +3800,12 @@ define bfloat @fmul_select_bf16_test8(bfloat %x, i32 %bool.arg1, i32 %bool.arg2)
; GFX7-LABEL: fmul_select_bf16_test8:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GFX7-NEXT: v_bfrev_b32_e32 v3, 1
; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
; GFX7-NEXT: v_cndmask_b32_e32 v1, 0, v3, vcc
-; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX7-NEXT: v_mul_f32_e32 v0, v0, v1
-; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: fmul_select_bf16_test8:
@@ -3984,14 +3929,13 @@ define bfloat @fmul_select_bf16_test9(bfloat %x, i32 %bool.arg1, i32 %bool.arg2)
; GFX7-LABEL: fmul_select_bf16_test9:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GFX7-NEXT: v_mov_b32_e32 v3, 0xc2000000
; GFX7-NEXT: v_mov_b32_e32 v4, 0xc1800000
; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
; GFX7-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc
-; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX7-NEXT: v_mul_f32_e32 v0, v0, v1
-; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: fmul_select_bf16_test9:
@@ -4120,14 +4064,13 @@ define bfloat @fmul_select_bf16_test10_sel_log2val_pos65_pos56(bfloat %x, i32 %b
; GFX7-LABEL: fmul_select_bf16_test10_sel_log2val_pos65_pos56:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GFX7-NEXT: v_mov_b32_e32 v3, 0xdb800000
; GFX7-NEXT: v_bfrev_b32_e32 v4, 7
; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
; GFX7-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc
-; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX7-NEXT: v_mul_f32_e32 v0, v0, v1
-; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: fmul_select_bf16_test10_sel_log2val_pos65_pos56:
@@ -4256,14 +4199,13 @@ define bfloat @fmul_select_bf16_test11_sel_log2val_neg22_pos25(bfloat %x, i32 %b
; GFX7-LABEL: fmul_select_bf16_test11_sel_log2val_neg22_pos25:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GFX7-NEXT: v_bfrev_b32_e32 v3, 50
; GFX7-NEXT: v_mov_b32_e32 v4, 0x34800000
; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
; GFX7-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc
-; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX7-NEXT: v_mul_f32_e32 v0, v0, v1
-; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: fmul_select_bf16_test11_sel_log2val_neg22_pos25:
diff --git a/llvm/test/CodeGen/AMDGPU/divergence-driven-buildvector.ll b/llvm/test/CodeGen/AMDGPU/divergence-driven-buildvector.ll
index 8532a7f716ba7..d9b23d43d593d 100644
--- a/llvm/test/CodeGen/AMDGPU/divergence-driven-buildvector.ll
+++ b/llvm/test/CodeGen/AMDGPU/divergence-driven-buildvector.ll
@@ -246,7 +246,7 @@ define float @divergent_vec_f16_0(half %a) {
; GCN-LABEL: divergent_vec_f16_0:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: divergent_vec_f16_0:
@@ -654,8 +654,7 @@ define float @divergent_vec_f16_LL(half %a, half %b) {
; GCN-LABEL: divergent_vec_f16_LL:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GCN-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GCN-NEXT: v_or_b32_e32 v0, v0, v1
; GCN-NEXT: s_setpc_b64 s[30:31]
diff --git a/llvm/test/CodeGen/AMDGPU/fcanonicalize.f16.ll b/llvm/test/CodeGen/AMDGPU/fcanonicalize.f16.ll
index 65e2b26a79fbd..a723a67498d05 100644
--- a/llvm/test/CodeGen/AMDGPU/fcanonicalize.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fcanonicalize.f16.ll
@@ -215,8 +215,6 @@ define <2 x half> @v_test_canonicalize_build_vector_v2f16(half %lo, half %hi) #1
; CI-LABEL: v_test_canonicalize_build_vector_v2f16:
; CI: ; %bb.0:
; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CI-NEXT: v_cvt_f16_f32_e32 v1, v1
-; CI-NEXT: v_cvt_f16_f32_e32 v0, v0
; CI-NEXT: v_cvt_f32_f16_e32 v1, v1
; CI-NEXT: v_cvt_f32_f16_e32 v0, v0
; CI-NEXT: v_cvt_f16_f32_e32 v1, v1
@@ -2572,11 +2570,12 @@ define <2 x half> @v_test_canonicalize_reg_undef_v2f16(half %val) #1 {
; CI-LABEL: v_test_canonicalize_reg_undef_v2f16:
; CI: ; %bb.0:
; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CI-NEXT: v_cvt_f16_f32_e32 v0, v0
-; CI-NEXT: v_cvt_f16_f32_e32 v1, 0x7fc00000
+; CI-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; CI-NEXT: v_cvt_f32_f16_e32 v1, v1
; CI-NEXT: v_cvt_f32_f16_e32 v0, v0
-; CI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; CI-NEXT: v_cvt_f16_f32_e32 v1, v1
; CI-NEXT: v_cvt_f16_f32_e32 v0, v0
+; CI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; CI-NEXT: v_or_b32_e32 v0, v0, v1
; CI-NEXT: s_setpc_b64 s[30:31]
;
@@ -2615,9 +2614,8 @@ define <2 x half> @v_test_canonicalize_undef_reg_v2f16(half %val) #1 {
; CI-LABEL: v_test_canonicalize_undef_reg_v2f16:
; CI: ; %bb.0:
; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CI-NEXT: v_cvt_f16_f32_e32 v0, v0
-; CI-NEXT: v_cvt_f16_f32_e32 v1, 0x7fc00000
; CI-NEXT: v_cvt_f32_f16_e32 v0, v0
+; CI-NEXT: v_cvt_f16_f32_e32 v1, 0x7fc00000
; CI-NEXT: v_cvt_f16_f32_e32 v0, v0
; CI-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; CI-NEXT: v_or_b32_e32 v0, v1, v0
@@ -2775,11 +2773,10 @@ define <2 x half> @v_test_canonicalize_reg_k_v2f16(half %val) #1 {
; CI-LABEL: v_test_canonicalize_reg_k_v2f16:
; CI: ; %bb.0:
; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CI-NEXT: v_cvt_f16_f32_e32 v0, v0
-; CI-NEXT: v_cvt_f16_f32_e32 v1, 2.0
; CI-NEXT: v_cvt_f32_f16_e32 v0, v0
-; CI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; CI-NEXT: v_cvt_f16_f32_e32 v1, 2.0
; CI-NEXT: v_cvt_f16_f32_e32 v0, v0
+; CI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; CI-NEXT: v_or_b32_e32 v0, v0, v1
; CI-NEXT: s_setpc_b64 s[30:31]
;
@@ -2821,9 +2818,8 @@ define <2 x half> @v_test_canonicalize_k_reg_v2f16(half %val) #1 {
; CI-LABEL: v_test_canonicalize_k_reg_v2f16:
; CI: ; %bb.0:
; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CI-NEXT: v_cvt_f16_f32_e32 v0, v0
-; CI-NEXT: v_cvt_f16_f32_e32 v1, 2.0
; CI-NEXT: v_cvt_f32_f16_e32 v0, v0
+; CI-NEXT: v_cvt_f16_f32_e32 v1, 2.0
; CI-NEXT: v_cvt_f16_f32_e32 v0, v0
; CI-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; CI-NEXT: v_or_b32_e32 v0, v1, v0
@@ -2915,12 +2911,11 @@ define <4 x half> @v_test_canonicalize_reg_undef_undef_undef_v4f16(half %val) #1
; CI-LABEL: v_test_canonicalize_reg_undef_undef_undef_v4f16:
; CI: ; %bb.0:
; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CI-NEXT: v_cvt_f16_f32_e32 v0, v0
-; CI-NEXT: v_cvt_f16_f32_e32 v1, 0x7fc00000
; CI-NEXT: v_cvt_f32_f16_e32 v0, v0
+; CI-NEXT: v_cvt_f16_f32_e32 v1, 0x7fc00000
+; CI-NEXT: v_cvt_f16_f32_e32 v0, v0
; CI-NEXT: v_lshlrev_b32_e32 v2, 16, v1
; CI-NEXT: v_or_b32_e32 v1, v1, v2
-; CI-NEXT: v_cvt_f16_f32_e32 v0, v0
; CI-NEXT: v_or_b32_e32 v0, v0, v2
; CI-NEXT: s_setpc_b64 s[30:31]
;
@@ -2967,11 +2962,9 @@ define <4 x half> @v_test_canonicalize_reg_reg_undef_undef_v4f16(half %val0, hal
; CI-LABEL: v_test_canonicalize_reg_reg_undef_undef_v4f16:
; CI: ; %bb.0:
; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CI-NEXT: v_cvt_f16_f32_e32 v1, v1
-; CI-NEXT: v_cvt_f16_f32_e32 v0, v0
-; CI-NEXT: v_cvt_f16_f32_e32 v2, 0x7fc00000
; CI-NEXT: v_cvt_f32_f16_e32 v1, v1
; CI-NEXT: v_cvt_f32_f16_e32 v0, v0
+; CI-NEXT: v_cvt_f16_f32_e32 v2, 0x7fc00000
; CI-NEXT: v_cvt_f16_f32_e32 v3, v1
; CI-NEXT: v_cvt_f16_f32_e32 v0, v0
; CI-NEXT: v_lshlrev_b32_e32 v1, 16, v2
@@ -3027,17 +3020,14 @@ define <4 x half> @v_test_canonicalize_reg_undef_reg_reg_v4f16(half %val0, half
; CI-LABEL: v_test_canonicalize_reg_undef_reg_reg_v4f16:
; CI: ; %bb.0:
; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CI-NEXT: v_cvt_f16_f32_e32 v2, v2
-; CI-NEXT: v_cvt_f16_f32_e32 v0, v0
-; CI-NEXT: v_cvt_f16_f32_e32 v1, v1
-; CI-NEXT: v_cvt_f16_f32_e32 v3, 0x7fc00000
; CI-NEXT: v_cvt_f32_f16_e32 v2, v2
; CI-NEXT: v_cvt_f32_f16_e32 v0, v0
; CI-NEXT: v_cvt_f32_f16_e32 v1, v1
-; CI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; CI-NEXT: v_cvt_f16_f32_e32 v3, 0x7fc00000
; CI-NEXT: v_cvt_f16_f32_e32 v2, v2
; CI-NEXT: v_cvt_f16_f32_e32 v0, v0
; CI-NEXT: v_cvt_f16_f32_e32 v1, v1
+; CI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; CI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; CI-NEXT: v_or_b32_e32 v0, v0, v3
; CI-NEXT: v_or_b32_e32 v1, v1, v2
diff --git a/llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll b/llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll
index 50066711f2552..a8703d5d6e51d 100644
--- a/llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll
@@ -13,23 +13,15 @@ define bfloat @v_copysign_bf16_bf16(bfloat %mag, bfloat %sign) {
; GCN-LABEL: v_copysign_bf16_bf16:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GCN-NEXT: v_and_b32_e32 v1, 0x80000000, v1
-; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GCN-NEXT: v_bfe_u32 v0, v0, 16, 15
-; GCN-NEXT: v_or_b32_e32 v0, v0, v1
-; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GCN-NEXT: s_movk_i32 s4, 0x7fff
+; GCN-NEXT: v_bfi_b32 v0, s4, v0, v1
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: v_copysign_bf16_bf16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GFX7-NEXT: v_and_b32_e32 v1, 0x80000000, v1
-; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX7-NEXT: v_bfe_u32 v0, v0, 16, 15
-; GFX7-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX7-NEXT: s_movk_i32 s4, 0x7fff
+; GFX7-NEXT: v_bfi_b32 v0, s4, v0, v1
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_copysign_bf16_bf16:
@@ -65,23 +57,17 @@ define bfloat @v_copysign_bf16_s_bf16(bfloat %mag, bfloat inreg %sign) {
; GCN-LABEL: v_copysign_bf16_s_bf16:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GCN-NEXT: s_and_b32 s4, s16, 0x80000000
-; GCN-NEXT: s_lshr_b32 s4, s4, 16
-; GCN-NEXT: v_bfe_u32 v0, v0, 16, 15
-; GCN-NEXT: v_or_b32_e32 v0, s4, v0
-; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GCN-NEXT: s_movk_i32 s4, 0x7fff
+; GCN-NEXT: v_mov_b32_e32 v1, s16
+; GCN-NEXT: v_bfi_b32 v0, s4, v0, v1
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: v_copysign_bf16_s_bf16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GFX7-NEXT: s_and_b32 s4, s16, 0x80000000
-; GFX7-NEXT: s_lshr_b32 s4, s4, 16
-; GFX7-NEXT: v_bfe_u32 v0, v0, 16, 15
-; GFX7-NEXT: v_or_b32_e32 v0, s4, v0
-; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX7-NEXT: s_movk_i32 s4, 0x7fff
+; GFX7-NEXT: v_mov_b32_e32 v1, s16
+; GFX7-NEXT: v_bfi_b32 v0, s4, v0, v1
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_copysign_bf16_s_bf16:
@@ -127,23 +113,17 @@ define bfloat @v_copysign_s_bf16_bf16(bfloat inreg %mag, bfloat %sign) {
; GCN-LABEL: v_copysign_s_bf16_bf16:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_mul_f32_e64 v1, 1.0, s16
-; GCN-NEXT: v_and_b32_e32 v0, 0x80000000, v0
-; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GCN-NEXT: v_bfe_u32 v1, v1, 16, 15
-; GCN-NEXT: v_or_b32_e32 v0, v1, v0
-; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GCN-NEXT: s_movk_i32 s4, 0x7fff
+; GCN-NEXT: v_mov_b32_e32 v1, s16
+; GCN-NEXT: v_bfi_b32 v0, s4, v1, v0
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: v_copysign_s_bf16_bf16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_mul_f32_e64 v1, 1.0, s16
-; GFX7-NEXT: v_and_b32_e32 v0, 0x80000000, v0
-; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX7-NEXT: v_bfe_u32 v1, v1, 16, 15
-; GFX7-NEXT: v_or_b32_e32 v0, v1, v0
-; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX7-NEXT: s_movk_i32 s4, 0x7fff
+; GFX7-NEXT: v_mov_b32_e32 v1, s16
+; GFX7-NEXT: v_bfi_b32 v0, s4, v1, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_copysign_s_bf16_bf16:
@@ -189,23 +169,19 @@ define bfloat @v_copysign_bf16_f32(bfloat %mag, float %sign.f32) {
; GCN-LABEL: v_copysign_bf16_f32:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GCN-NEXT: v_and_b32_e32 v0, 0x7fff, v0
; GCN-NEXT: v_and_b32_e32 v1, 0x80000000, v1
; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GCN-NEXT: v_bfe_u32 v0, v0, 16, 15
; GCN-NEXT: v_or_b32_e32 v0, v0, v1
-; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: v_copysign_bf16_f32:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GFX7-NEXT: v_and_b32_e32 v1, 0x80000000, v1
+; GFX7-NEXT: v_and_b32_e32 v0, 0x7fff, v0
; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX7-NEXT: v_bfe_u32 v0, v0, 16, 15
; GFX7-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_copysign_bf16_f32:
@@ -247,23 +223,19 @@ define bfloat @v_copysign_bf16_f64(bfloat %mag, double %sign.f64) {
; GCN-LABEL: v_copysign_bf16_f64:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GCN-NEXT: v_and_b32_e32 v0, 0x7fff, v0
; GCN-NEXT: v_and_b32_e32 v1, 0x80000000, v2
; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GCN-NEXT: v_bfe_u32 v0, v0, 16, 15
; GCN-NEXT: v_or_b32_e32 v0, v0, v1
-; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: v_copysign_bf16_f64:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GFX7-NEXT: v_and_b32_e32 v1, 0x80000000, v2
+; GFX7-NEXT: v_and_b32_e32 v0, 0x7fff, v0
; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX7-NEXT: v_bfe_u32 v0, v0, 16, 15
; GFX7-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_copysign_bf16_f64:
@@ -305,23 +277,15 @@ define bfloat @v_copysign_bf16_f16(bfloat %mag, half %sign.f16) {
; GCN-LABEL: v_copysign_bf16_f16:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GCN-NEXT: v_cvt_f16_f32_e32 v1, v1
-; GCN-NEXT: v_and_b32_e32 v1, 0x8000, v1
-; GCN-NEXT: v_bfe_u32 v0, v0, 16, 15
-; GCN-NEXT: v_or_b32_e32 v0, v0, v1
-; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GCN-NEXT: s_movk_i32 s4, 0x7fff
+; GCN-NEXT: v_bfi_b32 v0, s4, v0, v1
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: v_copysign_bf16_f16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1
-; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GFX7-NEXT: v_bfe_u32 v0, v0, 16, 15
-; GFX7-NEXT: v_and_b32_e32 v1, 0x8000, v1
-; GFX7-NEXT: v_or_b32_e32 v0, v0, v1
-; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX7-NEXT: s_movk_i32 s4, 0x7fff
+; GFX7-NEXT: v_bfi_b32 v0, s4, v0, v1
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_copysign_bf16_f16:
@@ -357,22 +321,16 @@ define bfloat @v_copysign_bf16_f16(bfloat %mag, half %sign.f16) {
define amdgpu_ps i32 @s_copysign_bf16_bf16(bfloat inreg %mag, bfloat inreg %sign) {
; GCN-LABEL: s_copysign_bf16_bf16:
; GCN: ; %bb.0:
-; GCN-NEXT: v_mul_f32_e64 v0, 1.0, s0
-; GCN-NEXT: s_and_b32 s0, s1, 0x80000000
-; GCN-NEXT: s_lshr_b32 s0, s0, 16
-; GCN-NEXT: v_bfe_u32 v0, v0, 16, 15
-; GCN-NEXT: v_or_b32_e32 v0, s0, v0
-; GCN-NEXT: v_readfirstlane_b32 s0, v0
+; GCN-NEXT: s_and_b32 s1, s1, 0x8000
+; GCN-NEXT: s_and_b32 s0, s0, 0x7fff
+; GCN-NEXT: s_or_b32 s0, s0, s1
; GCN-NEXT: ; return to shader part epilog
;
; GFX7-LABEL: s_copysign_bf16_bf16:
; GFX7: ; %bb.0:
-; GFX7-NEXT: v_mul_f32_e64 v0, 1.0, s0
-; GFX7-NEXT: s_and_b32 s0, s1, 0x80000000
-; GFX7-NEXT: s_lshr_b32 s0, s0, 16
-; GFX7-NEXT: v_bfe_u32 v0, v0, 16, 15
-; GFX7-NEXT: v_or_b32_e32 v0, s0, v0
-; GFX7-NEXT: v_readfirstlane_b32 s0, v0
+; GFX7-NEXT: s_and_b32 s1, s1, 0x8000
+; GFX7-NEXT: s_and_b32 s0, s0, 0x7fff
+; GFX7-NEXT: s_or_b32 s0, s0, s1
; GFX7-NEXT: ; return to shader part epilog
;
; GFX8-LABEL: s_copysign_bf16_bf16:
@@ -433,22 +391,18 @@ define amdgpu_ps i32 @s_copysign_bf16_bf16(bfloat inreg %mag, bfloat inreg %sign
define amdgpu_ps i32 @s_copysign_bf16_f32(bfloat inreg %mag, float inreg %sign.f32) {
; GCN-LABEL: s_copysign_bf16_f32:
; GCN: ; %bb.0:
-; GCN-NEXT: v_mul_f32_e64 v0, 1.0, s0
-; GCN-NEXT: s_and_b32 s0, s1, 0x80000000
-; GCN-NEXT: s_lshr_b32 s0, s0, 16
-; GCN-NEXT: v_bfe_u32 v0, v0, 16, 15
-; GCN-NEXT: v_or_b32_e32 v0, s0, v0
-; GCN-NEXT: v_readfirstlane_b32 s0, v0
+; GCN-NEXT: s_and_b32 s0, s0, 0x7fff
+; GCN-NEXT: s_and_b32 s1, s1, 0x80000000
+; GCN-NEXT: s_lshr_b32 s1, s1, 16
+; GCN-NEXT: s_or_b32 s0, s0, s1
; GCN-NEXT: ; return to shader part epilog
;
; GFX7-LABEL: s_copysign_bf16_f32:
; GFX7: ; %bb.0:
-; GFX7-NEXT: v_mul_f32_e64 v0, 1.0, s0
-; GFX7-NEXT: s_and_b32 s0, s1, 0x80000000
-; GFX7-NEXT: s_lshr_b32 s0, s0, 16
-; GFX7-NEXT: v_bfe_u32 v0, v0, 16, 15
-; GFX7-NEXT: v_or_b32_e32 v0, s0, v0
-; GFX7-NEXT: v_readfirstlane_b32 s0, v0
+; GFX7-NEXT: s_and_b32 s1, s1, 0x80000000
+; GFX7-NEXT: s_and_b32 s0, s0, 0x7fff
+; GFX7-NEXT: s_lshr_b32 s1, s1, 16
+; GFX7-NEXT: s_or_b32 s0, s0, s1
; GFX7-NEXT: ; return to shader part epilog
;
; GFX8-LABEL: s_copysign_bf16_f32:
@@ -510,22 +464,18 @@ define amdgpu_ps i32 @s_copysign_bf16_f32(bfloat inreg %mag, float inreg %sign.f
define amdgpu_ps i32 @s_copysign_bf16_f64(bfloat inreg %mag, double inreg %sign.f64) {
; GCN-LABEL: s_copysign_bf16_f64:
; GCN: ; %bb.0:
-; GCN-NEXT: v_mul_f32_e64 v0, 1.0, s0
-; GCN-NEXT: s_and_b32 s0, s2, 0x80000000
-; GCN-NEXT: s_lshr_b32 s0, s0, 16
-; GCN-NEXT: v_bfe_u32 v0, v0, 16, 15
-; GCN-NEXT: v_or_b32_e32 v0, s0, v0
-; GCN-NEXT: v_readfirstlane_b32 s0, v0
+; GCN-NEXT: s_and_b32 s0, s0, 0x7fff
+; GCN-NEXT: s_and_b32 s1, s2, 0x80000000
+; GCN-NEXT: s_lshr_b32 s1, s1, 16
+; GCN-NEXT: s_or_b32 s0, s0, s1
; GCN-NEXT: ; return to shader part epilog
;
; GFX7-LABEL: s_copysign_bf16_f64:
; GFX7: ; %bb.0:
-; GFX7-NEXT: v_mul_f32_e64 v0, 1.0, s0
-; GFX7-NEXT: s_and_b32 s0, s2, 0x80000000
-; GFX7-NEXT: s_lshr_b32 s0, s0, 16
-; GFX7-NEXT: v_bfe_u32 v0, v0, 16, 15
-; GFX7-NEXT: v_or_b32_e32 v0, s0, v0
-; GFX7-NEXT: v_readfirstlane_b32 s0, v0
+; GFX7-NEXT: s_and_b32 s1, s2, 0x80000000
+; GFX7-NEXT: s_and_b32 s0, s0, 0x7fff
+; GFX7-NEXT: s_lshr_b32 s1, s1, 16
+; GFX7-NEXT: s_or_b32 s0, s0, s1
; GFX7-NEXT: ; return to shader part epilog
;
; GFX8-LABEL: s_copysign_bf16_f64:
@@ -587,22 +537,16 @@ define amdgpu_ps i32 @s_copysign_bf16_f64(bfloat inreg %mag, double inreg %sign.
define amdgpu_ps i32 @s_copysign_bf16_f16(bfloat inreg %mag, half inreg %sign.f16) {
; GCN-LABEL: s_copysign_bf16_f16:
; GCN: ; %bb.0:
-; GCN-NEXT: v_mul_f32_e64 v0, 1.0, s0
-; GCN-NEXT: v_cvt_f16_f32_e32 v1, s1
-; GCN-NEXT: v_and_b32_e32 v1, 0x8000, v1
-; GCN-NEXT: v_bfe_u32 v0, v0, 16, 15
-; GCN-NEXT: v_or_b32_e32 v0, v0, v1
-; GCN-NEXT: v_readfirstlane_b32 s0, v0
+; GCN-NEXT: s_and_b32 s1, s1, 0x8000
+; GCN-NEXT: s_and_b32 s0, s0, 0x7fff
+; GCN-NEXT: s_or_b32 s0, s0, s1
; GCN-NEXT: ; return to shader part epilog
;
; GFX7-LABEL: s_copysign_bf16_f16:
; GFX7: ; %bb.0:
-; GFX7-NEXT: v_cvt_f16_f32_e32 v0, s1
-; GFX7-NEXT: v_mul_f32_e64 v1, 1.0, s0
-; GFX7-NEXT: v_bfe_u32 v1, v1, 16, 15
-; GFX7-NEXT: v_and_b32_e32 v0, 0x8000, v0
-; GFX7-NEXT: v_or_b32_e32 v0, v1, v0
-; GFX7-NEXT: v_readfirstlane_b32 s0, v0
+; GFX7-NEXT: s_and_b32 s1, s1, 0x8000
+; GFX7-NEXT: s_and_b32 s0, s0, 0x7fff
+; GFX7-NEXT: s_or_b32 s0, s0, s1
; GFX7-NEXT: ; return to shader part epilog
;
; GFX8-LABEL: s_copysign_bf16_f16:
@@ -667,6 +611,7 @@ define float @v_copysign_f32_bf16(float %mag, bfloat %sign.bf16) {
; GCN-LABEL: v_copysign_f32_bf16:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GCN-NEXT: s_brev_b32 s4, -2
; GCN-NEXT: v_bfi_b32 v0, s4, v0, v1
; GCN-NEXT: s_setpc_b64 s[30:31]
@@ -674,6 +619,7 @@ define float @v_copysign_f32_bf16(float %mag, bfloat %sign.bf16) {
; GFX7-LABEL: v_copysign_f32_bf16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX7-NEXT: s_brev_b32 s4, -2
; GFX7-NEXT: v_bfi_b32 v0, s4, v0, v1
; GFX7-NEXT: s_setpc_b64 s[30:31]
@@ -724,6 +670,7 @@ define float @v_copysign_f32_bf16(float %mag, bfloat %sign.bf16) {
define amdgpu_ps i32 @s_copysign_f32_bf16(float inreg %mag, bfloat inreg %sign.bf16) {
; GCN-LABEL: s_copysign_f32_bf16:
; GCN: ; %bb.0:
+; GCN-NEXT: s_lshl_b32 s1, s1, 16
; GCN-NEXT: s_brev_b32 s2, -2
; GCN-NEXT: v_mov_b32_e32 v0, s0
; GCN-NEXT: v_mov_b32_e32 v1, s1
@@ -733,6 +680,7 @@ define amdgpu_ps i32 @s_copysign_f32_bf16(float inreg %mag, bfloat inreg %sign.b
;
; GFX7-LABEL: s_copysign_f32_bf16:
; GFX7: ; %bb.0:
+; GFX7-NEXT: s_lshl_b32 s1, s1, 16
; GFX7-NEXT: s_brev_b32 s2, -2
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: v_mov_b32_e32 v1, s1
@@ -793,25 +741,21 @@ define half @v_copysign_f16_bf16(half %mag, bfloat %sign.bf16) {
; GCN-LABEL: v_copysign_f16_bf16:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0
; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1
; GCN-NEXT: s_brev_b32 s4, -2
; GCN-NEXT: v_bfi_b32 v0, s4, v0, v1
+; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: v_copysign_f16_bf16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1
; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1
; GFX7-NEXT: s_brev_b32 s4, -2
; GFX7-NEXT: v_bfi_b32 v0, s4, v0, v1
+; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_copysign_f16_bf16:
@@ -847,24 +791,18 @@ define half @v_copysign_f16_bf16(half %mag, bfloat %sign.bf16) {
define amdgpu_ps i32 @s_copysign_f16_bf16(half inreg %mag, bfloat inreg %sign.bf16) {
; GCN-LABEL: s_copysign_f16_bf16:
; GCN: ; %bb.0:
-; GCN-NEXT: v_mul_f32_e64 v0, 1.0, s1
-; GCN-NEXT: v_cvt_f16_f32_e32 v1, s0
-; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GCN-NEXT: v_cvt_f32_f16_e32 v1, v1
-; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GCN-NEXT: v_cvt_f32_f16_e32 v0, s0
+; GCN-NEXT: v_cvt_f32_f16_e32 v1, s1
; GCN-NEXT: s_brev_b32 s0, -2
-; GCN-NEXT: v_bfi_b32 v0, s0, v1, v0
+; GCN-NEXT: v_bfi_b32 v0, s0, v0, v1
; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0
; GCN-NEXT: v_readfirstlane_b32 s0, v0
; GCN-NEXT: ; return to shader part epilog
;
; GFX7-LABEL: s_copysign_f16_bf16:
; GFX7: ; %bb.0:
-; GFX7-NEXT: v_cvt_f16_f32_e32 v0, s0
-; GFX7-NEXT: v_mul_f32_e64 v1, 1.0, s1
-; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1
-; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX7-NEXT: v_cvt_f32_f16_e32 v0, s0
+; GFX7-NEXT: v_cvt_f32_f16_e32 v1, s1
; GFX7-NEXT: s_brev_b32 s0, -2
; GFX7-NEXT: v_bfi_b32 v0, s0, v0, v1
; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0
@@ -933,6 +871,7 @@ define double @v_copysign_f64_bf16(double %mag, bfloat %sign.bf16) {
; GCN-LABEL: v_copysign_f64_bf16:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GCN-NEXT: s_brev_b32 s4, -2
; GCN-NEXT: v_bfi_b32 v1, s4, v1, v2
; GCN-NEXT: s_setpc_b64 s[30:31]
@@ -940,6 +879,7 @@ define double @v_copysign_f64_bf16(double %mag, bfloat %sign.bf16) {
; GFX7-LABEL: v_copysign_f64_bf16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX7-NEXT: s_brev_b32 s4, -2
; GFX7-NEXT: v_bfi_b32 v1, s4, v1, v2
; GFX7-NEXT: s_setpc_b64 s[30:31]
@@ -990,6 +930,7 @@ define double @v_copysign_f64_bf16(double %mag, bfloat %sign.bf16) {
define amdgpu_ps <2 x i32> @s_copysign_f64_bf16(double inreg %mag, bfloat inreg %sign.bf16) {
; GCN-LABEL: s_copysign_f64_bf16:
; GCN: ; %bb.0:
+; GCN-NEXT: s_lshl_b32 s2, s2, 16
; GCN-NEXT: s_brev_b32 s3, -2
; GCN-NEXT: v_mov_b32_e32 v0, s1
; GCN-NEXT: v_mov_b32_e32 v1, s2
@@ -999,6 +940,7 @@ define amdgpu_ps <2 x i32> @s_copysign_f64_bf16(double inreg %mag, bfloat inreg
;
; GFX7-LABEL: s_copysign_f64_bf16:
; GFX7: ; %bb.0:
+; GFX7-NEXT: s_lshl_b32 s2, s2, 16
; GFX7-NEXT: s_brev_b32 s3, -2
; GFX7-NEXT: v_mov_b32_e32 v0, s1
; GFX7-NEXT: v_mov_b32_e32 v1, s2
@@ -2716,15 +2658,17 @@ define <32 x bfloat> @v_copysign_v32bf16(<32 x bfloat> %mag, <32 x bfloat> %sign
define amdgpu_ps i32 @s_copysign_out_f32_mag_bf16_sign_f32(bfloat inreg %mag, float inreg %sign) {
; GCN-LABEL: s_copysign_out_f32_mag_bf16_sign_f32:
; GCN: ; %bb.0:
+; GCN-NEXT: s_lshl_b32 s0, s0, 16
; GCN-NEXT: s_brev_b32 s2, -2
-; GCN-NEXT: v_mov_b32_e32 v0, s0
-; GCN-NEXT: v_mov_b32_e32 v1, s1
-; GCN-NEXT: v_bfi_b32 v0, s2, v0, v1
+; GCN-NEXT: v_mov_b32_e32 v0, s1
+; GCN-NEXT: v_mov_b32_e32 v1, s0
+; GCN-NEXT: v_bfi_b32 v0, s2, v1, v0
; GCN-NEXT: v_readfirstlane_b32 s0, v0
; GCN-NEXT: ; return to shader part epilog
;
; GFX7-LABEL: s_copysign_out_f32_mag_bf16_sign_f32:
; GFX7: ; %bb.0:
+; GFX7-NEXT: s_lshl_b32 s0, s0, 16
; GFX7-NEXT: s_brev_b32 s2, -2
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: v_mov_b32_e32 v1, s1
@@ -2778,16 +2722,18 @@ define amdgpu_ps i32 @s_copysign_out_f32_mag_bf16_sign_f32(bfloat inreg %mag, fl
define amdgpu_ps <2 x i32> @s_copysign_out_f64_mag_bf16_sign_f64(bfloat inreg %mag, double inreg %sign) {
; GCN-LABEL: s_copysign_out_f64_mag_bf16_sign_f64:
; GCN: ; %bb.0:
-; GCN-NEXT: v_cvt_f64_f32_e32 v[0:1], s0
-; GCN-NEXT: s_brev_b32 s0, -2
+; GCN-NEXT: s_lshl_b32 s0, s0, 16
+; GCN-NEXT: s_brev_b32 s1, -2
; GCN-NEXT: v_mov_b32_e32 v2, s2
-; GCN-NEXT: v_bfi_b32 v1, s0, v1, v2
+; GCN-NEXT: v_cvt_f64_f32_e32 v[0:1], s0
+; GCN-NEXT: v_bfi_b32 v1, s1, v1, v2
; GCN-NEXT: v_readfirstlane_b32 s1, v1
; GCN-NEXT: v_readfirstlane_b32 s0, v0
; GCN-NEXT: ; return to shader part epilog
;
; GFX7-LABEL: s_copysign_out_f64_mag_bf16_sign_f64:
; GFX7: ; %bb.0:
+; GFX7-NEXT: s_lshl_b32 s0, s0, 16
; GFX7-NEXT: v_cvt_f64_f32_e32 v[0:1], s0
; GFX7-NEXT: s_brev_b32 s0, -2
; GFX7-NEXT: v_mov_b32_e32 v2, s2
@@ -2846,6 +2792,7 @@ define amdgpu_ps <2 x i32> @s_copysign_out_f64_mag_bf16_sign_f64(bfloat inreg %m
define amdgpu_ps i32 @s_copysign_out_f32_mag_f32_sign_bf16(float inreg %mag, bfloat inreg %sign) {
; GCN-LABEL: s_copysign_out_f32_mag_f32_sign_bf16:
; GCN: ; %bb.0:
+; GCN-NEXT: s_lshl_b32 s1, s1, 16
; GCN-NEXT: s_brev_b32 s2, -2
; GCN-NEXT: v_mov_b32_e32 v0, s0
; GCN-NEXT: v_mov_b32_e32 v1, s1
@@ -2855,6 +2802,7 @@ define amdgpu_ps i32 @s_copysign_out_f32_mag_f32_sign_bf16(float inreg %mag, bfl
;
; GFX7-LABEL: s_copysign_out_f32_mag_f32_sign_bf16:
; GFX7: ; %bb.0:
+; GFX7-NEXT: s_lshl_b32 s1, s1, 16
; GFX7-NEXT: s_brev_b32 s2, -2
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: v_mov_b32_e32 v1, s1
@@ -2911,6 +2859,7 @@ define amdgpu_ps i32 @s_copysign_out_f32_mag_f32_sign_bf16(float inreg %mag, bfl
define amdgpu_ps <2 x i32> @s_copysign_out_f64_mag_f64_sign_bf16(double inreg %mag, bfloat inreg %sign) {
; GCN-LABEL: s_copysign_out_f64_mag_f64_sign_bf16:
; GCN: ; %bb.0:
+; GCN-NEXT: s_lshl_b32 s2, s2, 16
; GCN-NEXT: s_brev_b32 s3, -2
; GCN-NEXT: v_mov_b32_e32 v0, s1
; GCN-NEXT: v_mov_b32_e32 v1, s2
@@ -2920,6 +2869,7 @@ define amdgpu_ps <2 x i32> @s_copysign_out_f64_mag_f64_sign_bf16(double inreg %m
;
; GFX7-LABEL: s_copysign_out_f64_mag_f64_sign_bf16:
; GFX7: ; %bb.0:
+; GFX7-NEXT: s_lshl_b32 s2, s2, 16
; GFX7-NEXT: s_brev_b32 s3, -2
; GFX7-NEXT: v_mov_b32_e32 v0, s1
; GFX7-NEXT: v_mov_b32_e32 v1, s2
@@ -2976,22 +2926,18 @@ define amdgpu_ps <2 x i32> @s_copysign_out_f64_mag_f64_sign_bf16(double inreg %m
define amdgpu_ps i16 @s_copysign_out_bf16_mag_bf16_sign_f32(bfloat inreg %mag, float inreg %sign) {
; GCN-LABEL: s_copysign_out_bf16_mag_bf16_sign_f32:
; GCN: ; %bb.0:
-; GCN-NEXT: v_mul_f32_e64 v0, 1.0, s0
-; GCN-NEXT: s_and_b32 s0, s1, 0x80000000
-; GCN-NEXT: s_lshr_b32 s0, s0, 16
-; GCN-NEXT: v_bfe_u32 v0, v0, 16, 15
-; GCN-NEXT: v_or_b32_e32 v0, s0, v0
-; GCN-NEXT: v_readfirstlane_b32 s0, v0
+; GCN-NEXT: s_and_b32 s0, s0, 0x7fff
+; GCN-NEXT: s_and_b32 s1, s1, 0x80000000
+; GCN-NEXT: s_lshr_b32 s1, s1, 16
+; GCN-NEXT: s_or_b32 s0, s0, s1
; GCN-NEXT: ; return to shader part epilog
;
; GFX7-LABEL: s_copysign_out_bf16_mag_bf16_sign_f32:
; GFX7: ; %bb.0:
-; GFX7-NEXT: v_mul_f32_e64 v0, 1.0, s0
-; GFX7-NEXT: s_and_b32 s0, s1, 0x80000000
-; GFX7-NEXT: s_lshr_b32 s0, s0, 16
-; GFX7-NEXT: v_bfe_u32 v0, v0, 16, 15
-; GFX7-NEXT: v_or_b32_e32 v0, s0, v0
-; GFX7-NEXT: v_readfirstlane_b32 s0, v0
+; GFX7-NEXT: s_and_b32 s1, s1, 0x80000000
+; GFX7-NEXT: s_and_b32 s0, s0, 0x7fff
+; GFX7-NEXT: s_lshr_b32 s1, s1, 16
+; GFX7-NEXT: s_or_b32 s0, s0, s1
; GFX7-NEXT: ; return to shader part epilog
;
; GFX8-LABEL: s_copysign_out_bf16_mag_bf16_sign_f32:
@@ -3044,22 +2990,18 @@ define amdgpu_ps i16 @s_copysign_out_bf16_mag_bf16_sign_f32(bfloat inreg %mag, f
define amdgpu_ps i16 @s_copysign_out_bf16_mag_bf16_sign_f64(bfloat inreg %mag, double inreg %sign) {
; GCN-LABEL: s_copysign_out_bf16_mag_bf16_sign_f64:
; GCN: ; %bb.0:
-; GCN-NEXT: v_mul_f32_e64 v0, 1.0, s0
-; GCN-NEXT: s_and_b32 s0, s2, 0x80000000
-; GCN-NEXT: s_lshr_b32 s0, s0, 16
-; GCN-NEXT: v_bfe_u32 v0, v0, 16, 15
-; GCN-NEXT: v_or_b32_e32 v0, s0, v0
-; GCN-NEXT: v_readfirstlane_b32 s0, v0
+; GCN-NEXT: s_and_b32 s0, s0, 0x7fff
+; GCN-NEXT: s_and_b32 s1, s2, 0x80000000
+; GCN-NEXT: s_lshr_b32 s1, s1, 16
+; GCN-NEXT: s_or_b32 s0, s0, s1
; GCN-NEXT: ; return to shader part epilog
;
; GFX7-LABEL: s_copysign_out_bf16_mag_bf16_sign_f64:
; GFX7: ; %bb.0:
-; GFX7-NEXT: v_mul_f32_e64 v0, 1.0, s0
-; GFX7-NEXT: s_and_b32 s0, s2, 0x80000000
-; GFX7-NEXT: s_lshr_b32 s0, s0, 16
-; GFX7-NEXT: v_bfe_u32 v0, v0, 16, 15
-; GFX7-NEXT: v_or_b32_e32 v0, s0, v0
-; GFX7-NEXT: v_readfirstlane_b32 s0, v0
+; GFX7-NEXT: s_and_b32 s1, s2, 0x80000000
+; GFX7-NEXT: s_and_b32 s0, s0, 0x7fff
+; GFX7-NEXT: s_lshr_b32 s1, s1, 16
+; GFX7-NEXT: s_or_b32 s0, s0, s1
; GFX7-NEXT: ; return to shader part epilog
;
; GFX8-LABEL: s_copysign_out_bf16_mag_bf16_sign_f64:
@@ -3113,8 +3055,7 @@ define amdgpu_ps i16 @s_copysign_out_bf16_mag_f32_sign_bf16(float inreg %mag, bf
; GCN-LABEL: s_copysign_out_bf16_mag_f32_sign_bf16:
; GCN: ; %bb.0:
; GCN-NEXT: v_mul_f32_e64 v0, 1.0, s0
-; GCN-NEXT: s_and_b32 s0, s1, 0x80000000
-; GCN-NEXT: s_lshr_b32 s0, s0, 16
+; GCN-NEXT: s_and_b32 s0, s1, 0xffff8000
; GCN-NEXT: v_bfe_u32 v0, v0, 16, 15
; GCN-NEXT: v_or_b32_e32 v0, s0, v0
; GCN-NEXT: v_readfirstlane_b32 s0, v0
@@ -3123,8 +3064,7 @@ define amdgpu_ps i16 @s_copysign_out_bf16_mag_f32_sign_bf16(float inreg %mag, bf
; GFX7-LABEL: s_copysign_out_bf16_mag_f32_sign_bf16:
; GFX7: ; %bb.0:
; GFX7-NEXT: v_mul_f32_e64 v0, 1.0, s0
-; GFX7-NEXT: s_and_b32 s0, s1, 0x80000000
-; GFX7-NEXT: s_lshr_b32 s0, s0, 16
+; GFX7-NEXT: s_and_b32 s0, s1, 0xffff8000
; GFX7-NEXT: v_bfe_u32 v0, v0, 16, 15
; GFX7-NEXT: v_or_b32_e32 v0, s0, v0
; GFX7-NEXT: v_readfirstlane_b32 s0, v0
@@ -7282,18 +7222,12 @@ define <4 x bfloat> @v_copysign_out_v4bf16_mag_v4bf16_sign_v4f64(<4 x bfloat> %m
define amdgpu_ps i32 @s_copysign_bf16_0_bf16(bfloat inreg %sign) {
; GCN-LABEL: s_copysign_bf16_0_bf16:
; GCN: ; %bb.0:
-; GCN-NEXT: v_mul_f32_e64 v0, 1.0, s0
-; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GCN-NEXT: v_and_b32_e32 v0, 0x8000, v0
-; GCN-NEXT: v_readfirstlane_b32 s0, v0
+; GCN-NEXT: s_and_b32 s0, s0, 0x8000
; GCN-NEXT: ; return to shader part epilog
;
; GFX7-LABEL: s_copysign_bf16_0_bf16:
; GFX7: ; %bb.0:
-; GFX7-NEXT: v_mul_f32_e64 v0, 1.0, s0
-; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; GFX7-NEXT: v_and_b32_e32 v0, 0x8000, v0
-; GFX7-NEXT: v_readfirstlane_b32 s0, v0
+; GFX7-NEXT: s_and_b32 s0, s0, 0x8000
; GFX7-NEXT: ; return to shader part epilog
;
; GFX8-LABEL: s_copysign_bf16_0_bf16:
@@ -7325,13 +7259,13 @@ define bfloat @v_copysign_bf16_0_bf16(bfloat %sign) {
; GCN-LABEL: v_copysign_bf16_0_bf16:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_and_b32_e32 v0, 0x80000000, v0
+; GCN-NEXT: v_and_b32_e32 v0, 0x8000, v0
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: v_copysign_bf16_0_bf16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_and_b32_e32 v0, 0x80000000, v0
+; GFX7-NEXT: v_and_b32_e32 v0, 0x8000, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_copysign_bf16_0_bf16:
@@ -7444,13 +7378,17 @@ define bfloat @v_copysign_bf16_0_f32(float %sign) {
; GCN-LABEL: v_copysign_bf16_0_f32:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_and_b32_e32 v0, 0x80000000, v0
+; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_and_b32_e32 v0, 0x8000, v0
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: v_copysign_bf16_0_f32:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_and_b32_e32 v0, 0x80000000, v0
+; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX7-NEXT: v_and_b32_e32 v0, 0x8000, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_copysign_bf16_0_f32:
@@ -7624,13 +7562,17 @@ define bfloat @v_copysign_bf16_0_f64(double %sign) {
; GCN-LABEL: v_copysign_bf16_0_f64:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: v_and_b32_e32 v0, 0x80000000, v1
+; GCN-NEXT: v_cvt_f32_f64_e32 v0, v[0:1]
+; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_and_b32_e32 v0, 0x8000, v0
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: v_copysign_bf16_0_f64:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_and_b32_e32 v0, 0x80000000, v1
+; GFX7-NEXT: v_cvt_f32_f64_e32 v0, v[0:1]
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX7-NEXT: v_and_b32_e32 v0, 0x8000, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_copysign_bf16_0_f64:
diff --git a/llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll b/llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll
index 7f38e5bb5bb61..b80204e70851e 100644
--- a/llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll
@@ -16,11 +16,10 @@ declare i32 @llvm.amdgcn.workitem.id.x() #0
define amdgpu_ps i16 @s_copysign_f16(half inreg %mag, half inreg %sign) {
; SI-LABEL: s_copysign_f16:
; SI: ; %bb.0:
-; SI-NEXT: v_cvt_f16_f32_e32 v0, s0
+; SI-NEXT: v_cvt_f32_f16_e32 v0, s1
+; SI-NEXT: v_cvt_f32_f16_e32 v1, s0
; SI-NEXT: s_brev_b32 s0, -2
-; SI-NEXT: v_mov_b32_e32 v1, s1
-; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
-; SI-NEXT: v_bfi_b32 v0, s0, v0, v1
+; SI-NEXT: v_bfi_b32 v0, s0, v1, v0
; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
; SI-NEXT: v_readfirstlane_b32 s0, v0
; SI-NEXT: ; return to shader part epilog
@@ -67,9 +66,7 @@ define amdgpu_ps i16 @s_copysign_f16(half inreg %mag, half inreg %sign) {
define amdgpu_ps i16 @s_test_copysign_f16_0(half inreg %mag) {
; SI-LABEL: s_test_copysign_f16_0:
; SI: ; %bb.0:
-; SI-NEXT: v_cvt_f16_f32_e32 v0, s0
-; SI-NEXT: v_and_b32_e32 v0, 0x7fff, v0
-; SI-NEXT: v_readfirstlane_b32 s0, v0
+; SI-NEXT: s_and_b32 s0, s0, 0x7fff
; SI-NEXT: ; return to shader part epilog
;
; VI-LABEL: s_test_copysign_f16_0:
@@ -94,9 +91,7 @@ define amdgpu_ps i16 @s_test_copysign_f16_0(half inreg %mag) {
define amdgpu_ps i16 @s_test_copysign_f16_1(half inreg %mag) {
; SI-LABEL: s_test_copysign_f16_1:
; SI: ; %bb.0:
-; SI-NEXT: v_cvt_f16_f32_e32 v0, s0
-; SI-NEXT: v_and_b32_e32 v0, 0x7fff, v0
-; SI-NEXT: v_readfirstlane_b32 s0, v0
+; SI-NEXT: s_and_b32 s0, s0, 0x7fff
; SI-NEXT: ; return to shader part epilog
;
; VI-LABEL: s_test_copysign_f16_1:
@@ -121,9 +116,7 @@ define amdgpu_ps i16 @s_test_copysign_f16_1(half inreg %mag) {
define amdgpu_ps i16 @s_test_copysign_f16_10.0(half inreg %mag) {
; SI-LABEL: s_test_copysign_f16_10.0:
; SI: ; %bb.0:
-; SI-NEXT: v_cvt_f16_f32_e32 v0, s0
-; SI-NEXT: v_and_b32_e32 v0, 0x7fff, v0
-; SI-NEXT: v_readfirstlane_b32 s0, v0
+; SI-NEXT: s_and_b32 s0, s0, 0x7fff
; SI-NEXT: ; return to shader part epilog
;
; VI-LABEL: s_test_copysign_f16_10.0:
@@ -148,9 +141,8 @@ define amdgpu_ps i16 @s_test_copysign_f16_10.0(half inreg %mag) {
define amdgpu_ps i16 @s_test_copysign_f16_neg1(half inreg %mag) {
; SI-LABEL: s_test_copysign_f16_neg1:
; SI: ; %bb.0:
-; SI-NEXT: v_cvt_f16_f32_e32 v0, s0
-; SI-NEXT: v_or_b32_e32 v0, 0x8000, v0
-; SI-NEXT: v_readfirstlane_b32 s0, v0
+; SI-NEXT: s_and_b32 s0, s0, 0x7fff
+; SI-NEXT: s_bitset1_b32 s0, 15
; SI-NEXT: ; return to shader part epilog
;
; VI-LABEL: s_test_copysign_f16_neg1:
@@ -175,9 +167,8 @@ define amdgpu_ps i16 @s_test_copysign_f16_neg1(half inreg %mag) {
define amdgpu_ps i16 @s_test_copysign_f16_neg10(half inreg %mag) {
; SI-LABEL: s_test_copysign_f16_neg10:
; SI: ; %bb.0:
-; SI-NEXT: v_cvt_f16_f32_e32 v0, s0
-; SI-NEXT: v_or_b32_e32 v0, 0x8000, v0
-; SI-NEXT: v_readfirstlane_b32 s0, v0
+; SI-NEXT: s_and_b32 s0, s0, 0x7fff
+; SI-NEXT: s_bitset1_b32 s0, 15
; SI-NEXT: ; return to shader part epilog
;
; VI-LABEL: s_test_copysign_f16_neg10:
@@ -202,9 +193,7 @@ define amdgpu_ps i16 @s_test_copysign_f16_neg10(half inreg %mag) {
define amdgpu_ps i16 @s_test_copysign_f16_0_mag(half inreg %sign) {
; SI-LABEL: s_test_copysign_f16_0_mag:
; SI: ; %bb.0:
-; SI-NEXT: v_cvt_f16_f32_e32 v0, s0
-; SI-NEXT: v_and_b32_e32 v0, 0x8000, v0
-; SI-NEXT: v_readfirstlane_b32 s0, v0
+; SI-NEXT: s_and_b32 s0, s0, 0x8000
; SI-NEXT: ; return to shader part epilog
;
; VI-LABEL: s_test_copysign_f16_0_mag:
@@ -230,10 +219,8 @@ define amdgpu_ps i16 @s_test_copysign_f16_0_mag(half inreg %sign) {
define amdgpu_ps i16 @s_test_copysign_f16_1_mag(half inreg %sign) {
; SI-LABEL: s_test_copysign_f16_1_mag:
; SI: ; %bb.0:
-; SI-NEXT: v_cvt_f16_f32_e32 v0, s0
-; SI-NEXT: v_and_b32_e32 v0, 0x8000, v0
-; SI-NEXT: v_or_b32_e32 v0, 0x3c00, v0
-; SI-NEXT: v_readfirstlane_b32 s0, v0
+; SI-NEXT: s_and_b32 s0, s0, 0xffff8000
+; SI-NEXT: s_or_b32 s0, s0, 0x3c00
; SI-NEXT: ; return to shader part epilog
;
; VI-LABEL: s_test_copysign_f16_1_mag:
@@ -263,9 +250,9 @@ define i16 @s_test_copysign_f16_10_mag(half inreg %sign) {
; SI-LABEL: s_test_copysign_f16_10_mag:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-NEXT: v_cvt_f16_f32_e32 v0, s16
-; SI-NEXT: v_and_b32_e32 v0, 0x8000, v0
-; SI-NEXT: v_or_b32_e32 v0, 0x4900, v0
+; SI-NEXT: s_and_b32 s4, s16, 0xffff8000
+; SI-NEXT: s_or_b32 s4, s4, 0x4900
+; SI-NEXT: v_mov_b32_e32 v0, s4
; SI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: s_test_copysign_f16_10_mag:
@@ -300,10 +287,8 @@ define i16 @s_test_copysign_f16_10_mag(half inreg %sign) {
define amdgpu_ps i16 @s_test_copysign_f16_neg1_mag(half inreg %sign) {
; SI-LABEL: s_test_copysign_f16_neg1_mag:
; SI: ; %bb.0:
-; SI-NEXT: v_cvt_f16_f32_e32 v0, s0
-; SI-NEXT: v_and_b32_e32 v0, 0x8000, v0
-; SI-NEXT: v_or_b32_e32 v0, 0x3c00, v0
-; SI-NEXT: v_readfirstlane_b32 s0, v0
+; SI-NEXT: s_and_b32 s0, s0, 0xffff8000
+; SI-NEXT: s_or_b32 s0, s0, 0x3c00
; SI-NEXT: ; return to shader part epilog
;
; VI-LABEL: s_test_copysign_f16_neg1_mag:
@@ -332,10 +317,8 @@ define amdgpu_ps i16 @s_test_copysign_f16_neg1_mag(half inreg %sign) {
define amdgpu_ps i16 @s_test_copysign_f16_neg10_mag(half inreg %sign) {
; SI-LABEL: s_test_copysign_f16_neg10_mag:
; SI: ; %bb.0:
-; SI-NEXT: v_cvt_f16_f32_e32 v0, s0
-; SI-NEXT: v_and_b32_e32 v0, 0x8000, v0
-; SI-NEXT: v_or_b32_e32 v0, 0x4900, v0
-; SI-NEXT: v_readfirstlane_b32 s0, v0
+; SI-NEXT: s_and_b32 s0, s0, 0xffff8000
+; SI-NEXT: s_or_b32 s0, s0, 0x4900
; SI-NEXT: ; return to shader part epilog
;
; VI-LABEL: s_test_copysign_f16_neg10_mag:
@@ -365,10 +348,11 @@ define half @v_copysign_f16(half %mag, half %sign) {
; SI-LABEL: v_copysign_f16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
-; SI-NEXT: s_brev_b32 s4, -2
+; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
+; SI-NEXT: s_brev_b32 s4, -2
; SI-NEXT: v_bfi_b32 v0, s4, v0, v1
+; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
; SI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: v_copysign_f16:
@@ -398,8 +382,7 @@ define half @v_test_copysign_f16_0(half %mag) {
; SI-LABEL: v_test_copysign_f16_0:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
-; SI-NEXT: v_cvt_f32_f16_e64 v0, |v0|
+; SI-NEXT: v_and_b32_e32 v0, 0x7fff, v0
; SI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: v_test_copysign_f16_0:
@@ -433,8 +416,7 @@ define half @v_test_copysign_f16_1(half %mag) {
; SI-LABEL: v_test_copysign_f16_1:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
-; SI-NEXT: v_cvt_f32_f16_e64 v0, |v0|
+; SI-NEXT: v_and_b32_e32 v0, 0x7fff, v0
; SI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: v_test_copysign_f16_1:
@@ -468,8 +450,7 @@ define half @v_test_copysign_f16_10(half %mag) {
; SI-LABEL: v_test_copysign_f16_10:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
-; SI-NEXT: v_cvt_f32_f16_e64 v0, |v0|
+; SI-NEXT: v_and_b32_e32 v0, 0x7fff, v0
; SI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: v_test_copysign_f16_10:
@@ -503,8 +484,8 @@ define half @v_test_copysign_f16_neg1(half %mag) {
; SI-LABEL: v_test_copysign_f16_neg1:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
-; SI-NEXT: v_cvt_f32_f16_e64 v0, -|v0|
+; SI-NEXT: v_and_b32_e32 v0, 0x7fff, v0
+; SI-NEXT: v_or_b32_e32 v0, 0x8000, v0
; SI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: v_test_copysign_f16_neg1:
@@ -538,8 +519,8 @@ define half @v_test_copysign_f16_neg10(half %mag) {
; SI-LABEL: v_test_copysign_f16_neg10:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
-; SI-NEXT: v_cvt_f32_f16_e64 v0, -|v0|
+; SI-NEXT: v_and_b32_e32 v0, 0x7fff, v0
+; SI-NEXT: v_or_b32_e32 v0, 0x8000, v0
; SI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: v_test_copysign_f16_neg10:
@@ -573,6 +554,7 @@ define float @v_copysign_out_f32_mag_f16_sign_f32(half %mag, float %sign) {
; SI-LABEL: v_copysign_out_f32_mag_f16_sign_f32:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
; SI-NEXT: s_brev_b32 s4, -2
; SI-NEXT: v_bfi_b32 v0, s4, v0, v1
; SI-NEXT: s_setpc_b64 s[30:31]
@@ -617,8 +599,9 @@ define double @v_copysign_out_f64_mag_f16_sign_f64(half %mag, double %sign) {
; SI-LABEL: v_copysign_out_f64_mag_f16_sign_f64:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-NEXT: v_cvt_f64_f32_e32 v[0:1], v0
+; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
; SI-NEXT: s_brev_b32 s4, -2
+; SI-NEXT: v_cvt_f64_f32_e32 v[0:1], v0
; SI-NEXT: v_bfi_b32 v1, s4, v1, v2
; SI-NEXT: s_setpc_b64 s[30:31]
;
@@ -666,6 +649,7 @@ define float @v_copysign_out_f32_mag_f32_sign_f16(float %mag, half %sign) {
; SI-LABEL: v_copysign_out_f32_mag_f32_sign_f16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
; SI-NEXT: s_brev_b32 s4, -2
; SI-NEXT: v_bfi_b32 v0, s4, v0, v1
; SI-NEXT: s_setpc_b64 s[30:31]
@@ -710,6 +694,7 @@ define double @v_copysign_out_f64_mag_f64_sign_f16(double %mag, half %sign) {
; SI-LABEL: v_copysign_out_f64_mag_f64_sign_f16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT: v_cvt_f32_f16_e32 v2, v2
; SI-NEXT: s_brev_b32 s4, -2
; SI-NEXT: v_bfi_b32 v1, s4, v1, v2
; SI-NEXT: s_setpc_b64 s[30:31]
@@ -754,10 +739,10 @@ define half @v_copysign_out_f16_mag_f16_sign_f32(half %mag, float %sign) {
; SI-LABEL: v_copysign_out_f16_mag_f16_sign_f32:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
-; SI-NEXT: s_brev_b32 s4, -2
; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
+; SI-NEXT: s_brev_b32 s4, -2
; SI-NEXT: v_bfi_b32 v0, s4, v0, v1
+; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
; SI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: v_copysign_out_f16_mag_f16_sign_f32:
@@ -792,10 +777,10 @@ define half @v_copysign_out_f16_mag_f16_sign_f64(half %mag, double %sign) {
; SI-LABEL: v_copysign_out_f16_mag_f16_sign_f64:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
-; SI-NEXT: s_brev_b32 s4, -2
; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
+; SI-NEXT: s_brev_b32 s4, -2
; SI-NEXT: v_bfi_b32 v0, s4, v0, v2
+; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
; SI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: v_copysign_out_f16_mag_f16_sign_f64:
@@ -831,9 +816,11 @@ define half @v_copysign_out_f16_mag_f32_sign_f16(float %mag, half %sign) {
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
+; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
; SI-NEXT: s_brev_b32 s4, -2
; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
; SI-NEXT: v_bfi_b32 v0, s4, v0, v1
+; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
; SI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: v_copysign_out_f16_mag_f32_sign_f16:
@@ -883,8 +870,9 @@ define half @v_copysign_out_f16_mag_f64_sign_f16(double %mag, half %sign) {
; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
; SI-NEXT: v_and_b32_e32 v3, 0xffe, v3
; SI-NEXT: v_bfe_u32 v4, v1, 20, 11
+; SI-NEXT: s_movk_i32 s4, 0x3f1
; SI-NEXT: v_or_b32_e32 v0, v3, v0
-; SI-NEXT: v_sub_i32_e32 v5, vcc, 0x3f1, v4
+; SI-NEXT: v_sub_i32_e32 v5, vcc, s4, v4
; SI-NEXT: v_or_b32_e32 v3, 0x1000, v0
; SI-NEXT: v_med3_i32 v5, v5, 0, 13
; SI-NEXT: v_lshrrev_b32_e32 v6, v5, v3
@@ -918,9 +906,11 @@ define half @v_copysign_out_f16_mag_f64_sign_f16(double %mag, half %sign) {
; SI-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc
; SI-NEXT: v_and_b32_e32 v1, 0x8000, v1
; SI-NEXT: v_or_b32_e32 v0, v1, v0
+; SI-NEXT: v_cvt_f32_f16_e32 v2, v2
; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
; SI-NEXT: s_brev_b32 s4, -2
; SI-NEXT: v_bfi_b32 v0, s4, v0, v2
+; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
; SI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: v_copysign_out_f16_mag_f64_sign_f16:
@@ -1115,51 +1105,51 @@ define half @v_copysign_out_f16_mag_f64_sign_f16(double %mag, half %sign) {
define amdgpu_ps i16 @s_copysign_out_f16_mag_f64_sign_f16(double inreg %mag, half inreg %sign) {
; SI-LABEL: s_copysign_out_f16_mag_f64_sign_f16:
; SI: ; %bb.0:
-; SI-NEXT: s_and_b32 s3, s1, 0x1ff
-; SI-NEXT: s_or_b32 s0, s3, s0
-; SI-NEXT: s_cselect_b64 s[4:5], -1, 0
-; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
+; SI-NEXT: v_cvt_f32_f16_e32 v0, s2
+; SI-NEXT: s_and_b32 s2, s1, 0x1ff
+; SI-NEXT: s_or_b32 s0, s2, s0
+; SI-NEXT: s_cselect_b64 s[2:3], -1, 0
+; SI-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[2:3]
; SI-NEXT: s_lshr_b32 s0, s1, 8
-; SI-NEXT: s_bfe_u32 s4, s1, 0xb0014
+; SI-NEXT: s_bfe_u32 s3, s1, 0xb0014
; SI-NEXT: s_and_b32 s0, s0, 0xffe
-; SI-NEXT: v_readfirstlane_b32 s3, v0
-; SI-NEXT: s_sub_i32 s5, 0x3f1, s4
-; SI-NEXT: s_or_b32 s0, s0, s3
-; SI-NEXT: v_med3_i32 v0, s5, 0, 13
-; SI-NEXT: s_or_b32 s3, s0, 0x1000
-; SI-NEXT: v_readfirstlane_b32 s5, v0
-; SI-NEXT: s_lshr_b32 s6, s3, s5
-; SI-NEXT: s_lshl_b32 s5, s6, s5
-; SI-NEXT: s_cmp_lg_u32 s5, s3
-; SI-NEXT: s_cselect_b32 s3, 1, 0
-; SI-NEXT: s_addk_i32 s4, 0xfc10
-; SI-NEXT: s_lshl_b32 s5, s4, 12
-; SI-NEXT: s_or_b32 s3, s6, s3
-; SI-NEXT: s_or_b32 s5, s0, s5
-; SI-NEXT: s_cmp_lt_i32 s4, 1
-; SI-NEXT: s_cselect_b32 s3, s3, s5
-; SI-NEXT: s_and_b32 s5, s3, 7
-; SI-NEXT: s_cmp_gt_i32 s5, 5
-; SI-NEXT: s_cselect_b32 s6, 1, 0
-; SI-NEXT: s_cmp_eq_u32 s5, 3
+; SI-NEXT: v_readfirstlane_b32 s2, v1
+; SI-NEXT: s_sub_i32 s4, 0x3f1, s3
+; SI-NEXT: s_or_b32 s0, s0, s2
+; SI-NEXT: v_med3_i32 v1, s4, 0, 13
+; SI-NEXT: s_or_b32 s2, s0, 0x1000
+; SI-NEXT: v_readfirstlane_b32 s4, v1
+; SI-NEXT: s_lshr_b32 s5, s2, s4
+; SI-NEXT: s_lshl_b32 s4, s5, s4
+; SI-NEXT: s_cmp_lg_u32 s4, s2
+; SI-NEXT: s_cselect_b32 s2, 1, 0
+; SI-NEXT: s_addk_i32 s3, 0xfc10
+; SI-NEXT: s_lshl_b32 s4, s3, 12
+; SI-NEXT: s_or_b32 s2, s5, s2
+; SI-NEXT: s_or_b32 s4, s0, s4
+; SI-NEXT: s_cmp_lt_i32 s3, 1
+; SI-NEXT: s_cselect_b32 s2, s2, s4
+; SI-NEXT: s_and_b32 s4, s2, 7
+; SI-NEXT: s_cmp_gt_i32 s4, 5
; SI-NEXT: s_cselect_b32 s5, 1, 0
-; SI-NEXT: s_or_b32 s5, s5, s6
-; SI-NEXT: s_lshr_b32 s3, s3, 2
-; SI-NEXT: s_add_i32 s3, s3, s5
-; SI-NEXT: s_cmp_lt_i32 s4, 31
-; SI-NEXT: s_cselect_b32 s3, s3, 0x7c00
+; SI-NEXT: s_cmp_eq_u32 s4, 3
+; SI-NEXT: s_cselect_b32 s4, 1, 0
+; SI-NEXT: s_or_b32 s4, s4, s5
+; SI-NEXT: s_lshr_b32 s2, s2, 2
+; SI-NEXT: s_add_i32 s2, s2, s4
+; SI-NEXT: s_cmp_lt_i32 s3, 31
+; SI-NEXT: s_cselect_b32 s2, s2, 0x7c00
; SI-NEXT: s_cmp_lg_u32 s0, 0
; SI-NEXT: s_movk_i32 s0, 0x7e00
; SI-NEXT: s_cselect_b32 s0, s0, 0x7c00
-; SI-NEXT: s_cmpk_eq_i32 s4, 0x40f
-; SI-NEXT: s_cselect_b32 s0, s0, s3
+; SI-NEXT: s_cmpk_eq_i32 s3, 0x40f
+; SI-NEXT: s_cselect_b32 s0, s0, s2
; SI-NEXT: s_lshr_b32 s1, s1, 16
; SI-NEXT: s_and_b32 s1, s1, 0x8000
; SI-NEXT: s_or_b32 s0, s1, s0
-; SI-NEXT: v_cvt_f32_f16_e32 v0, s0
+; SI-NEXT: v_cvt_f32_f16_e32 v1, s0
; SI-NEXT: s_brev_b32 s0, -2
-; SI-NEXT: v_mov_b32_e32 v1, s2
-; SI-NEXT: v_bfi_b32 v0, s0, v0, v1
+; SI-NEXT: v_bfi_b32 v0, s0, v1, v0
; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
; SI-NEXT: v_readfirstlane_b32 s0, v0
; SI-NEXT: ; return to shader part epilog
@@ -2551,10 +2541,10 @@ define <32 x half> @v_copysign_v32f32(<32 x half> %mag, <32 x half> %sign) {
define amdgpu_ps i32 @s_copysign_out_f32_mag_f16_sign_f32(half inreg %mag, float inreg %sign) {
; SI-LABEL: s_copysign_out_f32_mag_f16_sign_f32:
; SI: ; %bb.0:
-; SI-NEXT: s_brev_b32 s2, -2
-; SI-NEXT: v_mov_b32_e32 v0, s0
+; SI-NEXT: v_cvt_f32_f16_e32 v0, s0
+; SI-NEXT: s_brev_b32 s0, -2
; SI-NEXT: v_mov_b32_e32 v1, s1
-; SI-NEXT: v_bfi_b32 v0, s2, v0, v1
+; SI-NEXT: v_bfi_b32 v0, s0, v0, v1
; SI-NEXT: v_readfirstlane_b32 s0, v0
; SI-NEXT: ; return to shader part epilog
;
@@ -2592,12 +2582,13 @@ define amdgpu_ps i32 @s_copysign_out_f32_mag_f16_sign_f32(half inreg %mag, float
define amdgpu_ps <2 x i32> @s_copysign_out_f64_mag_f16_sign_f64(half inreg %mag, double inreg %sign) {
; SI-LABEL: s_copysign_out_f64_mag_f16_sign_f64:
; SI: ; %bb.0:
-; SI-NEXT: v_cvt_f64_f32_e32 v[0:1], s0
+; SI-NEXT: v_cvt_f32_f16_e32 v0, s0
; SI-NEXT: s_brev_b32 s0, -2
; SI-NEXT: v_mov_b32_e32 v2, s2
+; SI-NEXT: v_cvt_f64_f32_e32 v[0:1], v0
; SI-NEXT: v_bfi_b32 v1, s0, v1, v2
-; SI-NEXT: v_readfirstlane_b32 s1, v1
; SI-NEXT: v_readfirstlane_b32 s0, v0
+; SI-NEXT: v_readfirstlane_b32 s1, v1
; SI-NEXT: ; return to shader part epilog
;
; VI-LABEL: s_copysign_out_f64_mag_f16_sign_f64:
@@ -2641,10 +2632,10 @@ define amdgpu_ps <2 x i32> @s_copysign_out_f64_mag_f16_sign_f64(half inreg %mag,
define amdgpu_ps i32 @s_copysign_out_f32_mag_f32_sign_f16(float inreg %mag, half inreg %sign) {
; SI-LABEL: s_copysign_out_f32_mag_f32_sign_f16:
; SI: ; %bb.0:
-; SI-NEXT: s_brev_b32 s2, -2
-; SI-NEXT: v_mov_b32_e32 v0, s0
-; SI-NEXT: v_mov_b32_e32 v1, s1
-; SI-NEXT: v_bfi_b32 v0, s2, v0, v1
+; SI-NEXT: v_cvt_f32_f16_e32 v0, s1
+; SI-NEXT: s_brev_b32 s1, -2
+; SI-NEXT: v_mov_b32_e32 v1, s0
+; SI-NEXT: v_bfi_b32 v0, s1, v1, v0
; SI-NEXT: v_readfirstlane_b32 s0, v0
; SI-NEXT: ; return to shader part epilog
;
@@ -2690,10 +2681,10 @@ define amdgpu_ps i32 @s_copysign_out_f32_mag_f32_sign_f16(float inreg %mag, half
define amdgpu_ps <2 x i32> @s_copysign_out_f64_mag_f64_sign_f16(double inreg %mag, half inreg %sign) {
; SI-LABEL: s_copysign_out_f64_mag_f64_sign_f16:
; SI: ; %bb.0:
-; SI-NEXT: s_brev_b32 s3, -2
-; SI-NEXT: v_mov_b32_e32 v0, s1
-; SI-NEXT: v_mov_b32_e32 v1, s2
-; SI-NEXT: v_bfi_b32 v0, s3, v0, v1
+; SI-NEXT: v_cvt_f32_f16_e32 v0, s2
+; SI-NEXT: s_brev_b32 s2, -2
+; SI-NEXT: v_mov_b32_e32 v1, s1
+; SI-NEXT: v_bfi_b32 v0, s2, v1, v0
; SI-NEXT: v_readfirstlane_b32 s1, v0
; SI-NEXT: ; return to shader part epilog
;
@@ -2739,10 +2730,9 @@ define amdgpu_ps <2 x i32> @s_copysign_out_f64_mag_f64_sign_f16(double inreg %ma
define amdgpu_ps i16 @s_copysign_out_f16_mag_f16_sign_f32(half inreg %mag, float inreg %sign) {
; SI-LABEL: s_copysign_out_f16_mag_f16_sign_f32:
; SI: ; %bb.0:
-; SI-NEXT: v_cvt_f16_f32_e32 v0, s0
+; SI-NEXT: v_cvt_f32_f16_e32 v0, s0
; SI-NEXT: s_brev_b32 s0, -2
; SI-NEXT: v_mov_b32_e32 v1, s1
-; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
; SI-NEXT: v_bfi_b32 v0, s0, v0, v1
; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
; SI-NEXT: v_readfirstlane_b32 s0, v0
@@ -2791,10 +2781,9 @@ define amdgpu_ps i16 @s_copysign_out_f16_mag_f16_sign_f32(half inreg %mag, float
define amdgpu_ps i16 @s_copysign_out_f16_mag_f16_sign_f64(half inreg %mag, double inreg %sign) {
; SI-LABEL: s_copysign_out_f16_mag_f16_sign_f64:
; SI: ; %bb.0:
-; SI-NEXT: v_cvt_f16_f32_e32 v0, s0
+; SI-NEXT: v_cvt_f32_f16_e32 v0, s0
; SI-NEXT: s_brev_b32 s0, -2
; SI-NEXT: v_mov_b32_e32 v1, s2
-; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
; SI-NEXT: v_bfi_b32 v0, s0, v0, v1
; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
; SI-NEXT: v_readfirstlane_b32 s0, v0
@@ -2844,8 +2833,8 @@ define amdgpu_ps i16 @s_copysign_out_f16_mag_f32_sign_f16(float inreg %mag, half
; SI-LABEL: s_copysign_out_f16_mag_f32_sign_f16:
; SI: ; %bb.0:
; SI-NEXT: v_cvt_f16_f32_e32 v0, s0
+; SI-NEXT: v_cvt_f32_f16_e32 v1, s1
; SI-NEXT: s_brev_b32 s0, -2
-; SI-NEXT: v_mov_b32_e32 v1, s1
; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
; SI-NEXT: v_bfi_b32 v0, s0, v0, v1
; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
@@ -6900,9 +6889,7 @@ define <4 x half> @v_copysign_out_v4f16_mag_v4f16_sign_v4f64(<4 x half> %mag, <4
define amdgpu_ps i32 @s_copysign_f16_0_f16(half inreg %sign) {
; SI-LABEL: s_copysign_f16_0_f16:
; SI: ; %bb.0:
-; SI-NEXT: v_cvt_f16_f32_e32 v0, s0
-; SI-NEXT: v_and_b32_e32 v0, 0x8000, v0
-; SI-NEXT: v_readfirstlane_b32 s0, v0
+; SI-NEXT: s_and_b32 s0, s0, 0x8000
; SI-NEXT: ; return to shader part epilog
;
; VI-LABEL: s_copysign_f16_0_f16:
@@ -6929,7 +6916,7 @@ define half @v_copysign_f16_0_f16(half %sign) {
; SI-LABEL: v_copysign_f16_0_f16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-NEXT: v_and_b32_e32 v0, 0x80000000, v0
+; SI-NEXT: v_and_b32_e32 v0, 0x8000, v0
; SI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: v_copysign_f16_0_f16:
@@ -7007,7 +6994,8 @@ define half @v_copysign_f16_0_f32(float %sign) {
; SI-LABEL: v_copysign_f16_0_f32:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-NEXT: v_and_b32_e32 v0, 0x80000000, v0
+; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
+; SI-NEXT: v_and_b32_e32 v0, 0x8000, v0
; SI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: v_copysign_f16_0_f32:
@@ -7242,7 +7230,45 @@ define half @v_copysign_f16_0_f64(double %sign) {
; SI-LABEL: v_copysign_f16_0_f64:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-NEXT: v_and_b32_e32 v0, 0x80000000, v1
+; SI-NEXT: v_and_b32_e32 v2, 0x1ff, v1
+; SI-NEXT: v_or_b32_e32 v0, v2, v0
+; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
+; SI-NEXT: v_lshrrev_b32_e32 v2, 8, v1
+; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; SI-NEXT: v_and_b32_e32 v2, 0xffe, v2
+; SI-NEXT: v_bfe_u32 v3, v1, 20, 11
+; SI-NEXT: v_or_b32_e32 v0, v2, v0
+; SI-NEXT: v_sub_i32_e32 v4, vcc, 0x3f1, v3
+; SI-NEXT: v_or_b32_e32 v2, 0x1000, v0
+; SI-NEXT: v_med3_i32 v4, v4, 0, 13
+; SI-NEXT: v_lshrrev_b32_e32 v5, v4, v2
+; SI-NEXT: v_lshlrev_b32_e32 v4, v4, v5
+; SI-NEXT: v_cmp_ne_u32_e32 vcc, v4, v2
+; SI-NEXT: s_movk_i32 s4, 0xfc10
+; SI-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
+; SI-NEXT: v_add_i32_e32 v3, vcc, s4, v3
+; SI-NEXT: v_lshlrev_b32_e32 v4, 12, v3
+; SI-NEXT: v_or_b32_e32 v2, v5, v2
+; SI-NEXT: v_or_b32_e32 v0, v0, v4
+; SI-NEXT: v_cmp_gt_i32_e32 vcc, 1, v3
+; SI-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
+; SI-NEXT: v_and_b32_e32 v2, 7, v0
+; SI-NEXT: v_cmp_lt_i32_e32 vcc, 5, v2
+; SI-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
+; SI-NEXT: v_cmp_eq_u32_e32 vcc, 3, v2
+; SI-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
+; SI-NEXT: v_or_b32_e32 v2, v2, v4
+; SI-NEXT: v_lshrrev_b32_e32 v0, 2, v0
+; SI-NEXT: v_add_i32_e32 v0, vcc, v0, v2
+; SI-NEXT: s_movk_i32 s4, 0x40f
+; SI-NEXT: v_cmp_gt_i32_e32 vcc, 31, v3
+; SI-NEXT: v_cmp_ne_u32_e64 s[4:5], s4, v3
+; SI-NEXT: v_mov_b32_e32 v2, 0x7c00
+; SI-NEXT: s_and_b64 vcc, s[4:5], vcc
+; SI-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_or_b32_e32 v0, v1, v0
+; SI-NEXT: v_and_b32_e32 v0, 0x8000, v0
; SI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: v_copysign_f16_0_f64:
diff --git a/llvm/test/CodeGen/AMDGPU/fcopysign.f32.ll b/llvm/test/CodeGen/AMDGPU/fcopysign.f32.ll
index eea2e46f8e390..7b11922e75b4f 100644
--- a/llvm/test/CodeGen/AMDGPU/fcopysign.f32.ll
+++ b/llvm/test/CodeGen/AMDGPU/fcopysign.f32.ll
@@ -1177,17 +1177,12 @@ define float @v_copysign_f32_0_f64(double %sign) {
}
define amdgpu_ps i32 @s_copysign_f32_0_f16(half inreg %sign) {
-; SI-LABEL: s_copysign_f32_0_f16:
-; SI: ; %bb.0:
-; SI-NEXT: s_and_b32 s0, s0, 0x80000000
-; SI-NEXT: ; return to shader part epilog
-;
-; VI-LABEL: s_copysign_f32_0_f16:
-; VI: ; %bb.0:
-; VI-NEXT: v_cvt_f32_f16_e32 v0, s0
-; VI-NEXT: v_and_b32_e32 v0, 0x80000000, v0
-; VI-NEXT: v_readfirstlane_b32 s0, v0
-; VI-NEXT: ; return to shader part epilog
+; SIVI-LABEL: s_copysign_f32_0_f16:
+; SIVI: ; %bb.0:
+; SIVI-NEXT: v_cvt_f32_f16_e32 v0, s0
+; SIVI-NEXT: v_and_b32_e32 v0, 0x80000000, v0
+; SIVI-NEXT: v_readfirstlane_b32 s0, v0
+; SIVI-NEXT: ; return to shader part epilog
;
; GFX11-LABEL: s_copysign_f32_0_f16:
; GFX11: ; %bb.0:
@@ -1206,6 +1201,7 @@ define float @v_copysign_f32_0_f16(half %sign) {
; SI-LABEL: v_copysign_f32_0_f16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
; SI-NEXT: v_and_b32_e32 v0, 0x80000000, v0
; SI-NEXT: s_setpc_b64 s[30:31]
;
diff --git a/llvm/test/CodeGen/AMDGPU/fcopysign.f64.ll b/llvm/test/CodeGen/AMDGPU/fcopysign.f64.ll
index b72eb5c5cf588..6c37316d33d03 100644
--- a/llvm/test/CodeGen/AMDGPU/fcopysign.f64.ll
+++ b/llvm/test/CodeGen/AMDGPU/fcopysign.f64.ll
@@ -955,9 +955,10 @@ define double @v_test_copysign_f64_f16(ptr addrspace(1) %out, [8 x i32], double
; SI-LABEL: v_test_copysign_f64_f16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT: v_cvt_f32_f16_e32 v1, v20
; SI-NEXT: s_brev_b32 s4, -2
; SI-NEXT: v_mov_b32_e32 v0, v10
-; SI-NEXT: v_bfi_b32 v1, s4, v11, v20
+; SI-NEXT: v_bfi_b32 v1, s4, v11, v1
; SI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: v_test_copysign_f64_f16:
@@ -1139,6 +1140,7 @@ define double @v_copysign_f64_0_f32(float %sign) {
define amdgpu_ps <2 x i32> @s_copysign_f64_0_f16(half inreg %sign) {
; SI-LABEL: s_copysign_f64_0_f16:
; SI: ; %bb.0:
+; SI-NEXT: s_lshl_b32 s0, s0, 16
; SI-NEXT: s_and_b32 s1, s0, 0x80000000
; SI-NEXT: s_mov_b32 s0, 0
; SI-NEXT: ; return to shader part epilog
@@ -1164,22 +1166,14 @@ define amdgpu_ps <2 x i32> @s_copysign_f64_0_f16(half inreg %sign) {
}
define double @v_copysign_f64_0_f16(half %sign) {
-; SI-LABEL: v_copysign_f64_0_f16:
-; SI: ; %bb.0:
-; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-NEXT: v_cvt_f64_f32_e32 v[0:1], v0
-; SI-NEXT: v_mov_b32_e32 v0, 0
-; SI-NEXT: v_and_b32_e32 v1, 0x80000000, v1
-; SI-NEXT: s_setpc_b64 s[30:31]
-;
-; VI-LABEL: v_copysign_f64_0_f16:
-; VI: ; %bb.0:
-; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-NEXT: v_cvt_f32_f16_e32 v0, v0
-; VI-NEXT: v_cvt_f64_f32_e32 v[0:1], v0
-; VI-NEXT: v_mov_b32_e32 v0, 0
-; VI-NEXT: v_and_b32_e32 v1, 0x80000000, v1
-; VI-NEXT: s_setpc_b64 s[30:31]
+; SIVI-LABEL: v_copysign_f64_0_f16:
+; SIVI: ; %bb.0:
+; SIVI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SIVI-NEXT: v_cvt_f32_f16_e32 v0, v0
+; SIVI-NEXT: v_cvt_f64_f32_e32 v[0:1], v0
+; SIVI-NEXT: v_mov_b32_e32 v0, 0
+; SIVI-NEXT: v_and_b32_e32 v1, 0x80000000, v1
+; SIVI-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_copysign_f64_0_f16:
; GFX11: ; %bb.0:
diff --git a/llvm/test/CodeGen/AMDGPU/fdiv.f16.ll b/llvm/test/CodeGen/AMDGPU/fdiv.f16.ll
index 0bb3b8c6f3740..1779c45203f47 100644
--- a/llvm/test/CodeGen/AMDGPU/fdiv.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fdiv.f16.ll
@@ -1937,9 +1937,6 @@ define half @v_fdiv_f16_arcp(half %x, half %y) {
; SI-LABEL: v_fdiv_f16_arcp:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
-; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
-; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
; SI-NEXT: v_div_scale_f32 v2, s[4:5], v1, v1, v0
@@ -1955,6 +1952,8 @@ define half @v_fdiv_f16_arcp(half %x, half %y) {
; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
; SI-NEXT: v_div_fmas_f32 v2, v2, v3, v5
; SI-NEXT: v_div_fixup_f32 v0, v2, v1, v0
+; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
+; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
; SI-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_fdiv_f16_arcp:
@@ -2001,12 +2000,11 @@ define half @v_fdiv_f16_afn_nsz(half %x, half %y) {
; SI-LABEL: v_fdiv_f16_afn_nsz:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
-; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
; SI-NEXT: v_rcp_f32_e32 v1, v1
; SI-NEXT: v_mul_f32_e32 v0, v0, v1
+; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
; SI-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_fdiv_f16_afn_nsz:
diff --git a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fadd.ll b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fadd.ll
index e32842f8d6f57..dcf0519dee355 100644
--- a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fadd.ll
+++ b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fadd.ll
@@ -8690,10 +8690,11 @@ define half @flat_agent_atomic_fadd_ret_f16__amdgpu_no_fine_grained_memory(ptr %
; GFX7-NEXT: v_mov_b32_e32 v3, v0
; GFX7-NEXT: v_and_b32_e32 v0, -4, v3
; GFX7-NEXT: flat_load_dword v5, v[0:1]
+; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2
+; GFX7-NEXT: s_mov_b64 s[4:5], 0
; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v2
; GFX7-NEXT: v_and_b32_e32 v2, 3, v3
; GFX7-NEXT: v_lshlrev_b32_e32 v2, 3, v2
-; GFX7-NEXT: s_mov_b64 s[4:5], 0
; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v4
; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v2
; GFX7-NEXT: v_not_b32_e32 v4, v4
@@ -8718,7 +8719,6 @@ define half @flat_agent_atomic_fadd_ret_f16__amdgpu_no_fine_grained_memory(ptr %
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX7-NEXT: v_lshrrev_b32_e32 v0, v2, v5
-; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
%result = atomicrmw fadd ptr %ptr, half %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
ret half %result
@@ -9071,10 +9071,11 @@ define half @flat_agent_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_grain
; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX7-NEXT: v_and_b32_e32 v0, -4, v3
; GFX7-NEXT: flat_load_dword v5, v[0:1]
+; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2
+; GFX7-NEXT: s_mov_b64 s[4:5], 0
; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v2
; GFX7-NEXT: v_and_b32_e32 v2, 3, v3
; GFX7-NEXT: v_lshlrev_b32_e32 v2, 3, v2
-; GFX7-NEXT: s_mov_b64 s[4:5], 0
; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v4
; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v2
; GFX7-NEXT: v_not_b32_e32 v4, v4
@@ -9099,7 +9100,6 @@ define half @flat_agent_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_grain
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX7-NEXT: v_lshrrev_b32_e32 v0, v2, v5
-; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr inbounds half, ptr %ptr, i64 1023
%result = atomicrmw fadd ptr %gep, half %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
@@ -9454,10 +9454,11 @@ define half @flat_agent_atomic_fadd_ret_f16__offset12b_neg__amdgpu_no_fine_grain
; GFX7-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc
; GFX7-NEXT: v_and_b32_e32 v0, -4, v3
; GFX7-NEXT: flat_load_dword v5, v[0:1]
+; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2
+; GFX7-NEXT: s_mov_b64 s[4:5], 0
; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v2
; GFX7-NEXT: v_and_b32_e32 v2, 3, v3
; GFX7-NEXT: v_lshlrev_b32_e32 v2, 3, v2
-; GFX7-NEXT: s_mov_b64 s[4:5], 0
; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v4
; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v2
; GFX7-NEXT: v_not_b32_e32 v4, v4
@@ -9482,7 +9483,6 @@ define half @flat_agent_atomic_fadd_ret_f16__offset12b_neg__amdgpu_no_fine_grain
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX7-NEXT: v_lshrrev_b32_e32 v0, v2, v5
-; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr inbounds half, ptr %ptr, i64 -1024
%result = atomicrmw fadd ptr %gep, half %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
@@ -9814,13 +9814,14 @@ define void @flat_agent_atomic_fadd_noret_f16__amdgpu_no_fine_grained_memory(ptr
; GFX7-NEXT: v_mov_b32_e32 v3, v0
; GFX7-NEXT: v_and_b32_e32 v0, -4, v3
; GFX7-NEXT: flat_load_dword v4, v[0:1]
+; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2
+; GFX7-NEXT: s_mov_b64 s[4:5], 0
; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v2
; GFX7-NEXT: v_and_b32_e32 v2, 3, v3
; GFX7-NEXT: v_lshlrev_b32_e32 v2, 3, v2
; GFX7-NEXT: v_lshl_b32_e32 v3, 0xffff, v2
; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5
; GFX7-NEXT: v_not_b32_e32 v6, v3
-; GFX7-NEXT: s_mov_b64 s[4:5], 0
; GFX7-NEXT: .LBB39_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -10182,10 +10183,11 @@ define void @flat_agent_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_gra
; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX7-NEXT: v_and_b32_e32 v0, -4, v4
; GFX7-NEXT: flat_load_dword v3, v[0:1]
-; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2
; GFX7-NEXT: v_and_b32_e32 v4, 3, v4
; GFX7-NEXT: v_lshlrev_b32_e32 v4, 3, v4
; GFX7-NEXT: s_mov_b64 s[4:5], 0
+; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2
; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v2
; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v4
; GFX7-NEXT: v_not_b32_e32 v6, v2
@@ -10552,10 +10554,11 @@ define void @flat_agent_atomic_fadd_noret_f16__offset12b_neg__amdgpu_no_fine_gra
; GFX7-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc
; GFX7-NEXT: v_and_b32_e32 v0, -4, v4
; GFX7-NEXT: flat_load_dword v3, v[0:1]
-; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2
; GFX7-NEXT: v_and_b32_e32 v4, 3, v4
; GFX7-NEXT: v_lshlrev_b32_e32 v4, 3, v4
; GFX7-NEXT: s_mov_b64 s[4:5], 0
+; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2
; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v2
; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v4
; GFX7-NEXT: v_not_b32_e32 v6, v2
@@ -10830,8 +10833,9 @@ define void @flat_agent_atomic_fadd_noret_f16__offset12b__align4_pos__amdgpu_no_
; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0x7fe, v0
; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX7-NEXT: flat_load_dword v3, v[0:1]
-; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2
; GFX7-NEXT: s_mov_b64 s[4:5], 0
+; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2
; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v2
; GFX7-NEXT: .LBB42_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -11108,31 +11112,31 @@ define half @flat_agent_atomic_fadd_ret_f16__offset12b_pos__align4__amdgpu_no_fi
; GFX7-LABEL: flat_agent_atomic_fadd_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0x7fe, v0
-; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX7-NEXT: flat_load_dword v3, v[0:1]
-; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GFX7-NEXT: v_add_i32_e32 v3, vcc, 0x7fe, v0
+; GFX7-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc
+; GFX7-NEXT: flat_load_dword v0, v[3:4]
+; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v2
; GFX7-NEXT: s_mov_b64 s[4:5], 0
-; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2
+; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1
; GFX7-NEXT: .LBB43_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_mov_b32_e32 v4, v3
-; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v4
-; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v4
-; GFX7-NEXT: v_add_f32_e32 v3, v3, v2
-; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3
-; GFX7-NEXT: v_or_b32_e32 v3, v5, v3
-; GFX7-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
+; GFX7-NEXT: v_mov_b32_e32 v6, v0
+; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v6
+; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v6
+; GFX7-NEXT: v_add_f32_e32 v0, v0, v1
+; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX7-NEXT: v_or_b32_e32 v5, v2, v0
+; GFX7-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6
; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_cbranch_execnz .LBB43_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v3
; GFX7-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr inbounds half, ptr %ptr, i64 1023
%result = atomicrmw fadd ptr %gep, half %val syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !0
@@ -11490,10 +11494,11 @@ define half @flat_system_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_grai
; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX7-NEXT: v_and_b32_e32 v0, -4, v3
; GFX7-NEXT: flat_load_dword v5, v[0:1]
+; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2
+; GFX7-NEXT: s_mov_b64 s[4:5], 0
; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v2
; GFX7-NEXT: v_and_b32_e32 v2, 3, v3
; GFX7-NEXT: v_lshlrev_b32_e32 v2, 3, v2
-; GFX7-NEXT: s_mov_b64 s[4:5], 0
; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v4
; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v2
; GFX7-NEXT: v_not_b32_e32 v4, v4
@@ -11518,7 +11523,6 @@ define half @flat_system_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_grai
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX7-NEXT: v_lshrrev_b32_e32 v0, v2, v5
-; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr inbounds half, ptr %ptr, i64 1023
%result = atomicrmw fadd ptr %gep, half %val seq_cst, !amdgpu.no.fine.grained.memory !0
@@ -11865,10 +11869,11 @@ define void @flat_system_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_gr
; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX7-NEXT: v_and_b32_e32 v0, -4, v4
; GFX7-NEXT: flat_load_dword v3, v[0:1]
-; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2
; GFX7-NEXT: v_and_b32_e32 v4, 3, v4
; GFX7-NEXT: v_lshlrev_b32_e32 v4, 3, v4
; GFX7-NEXT: s_mov_b64 s[4:5], 0
+; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2
; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v2
; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v4
; GFX7-NEXT: v_not_b32_e32 v6, v2
@@ -12318,23 +12323,24 @@ define bfloat @flat_agent_atomic_fadd_ret_bf16__amdgpu_no_fine_grained_memory(pt
; GFX7-NEXT: v_mov_b32_e32 v3, v0
; GFX7-NEXT: v_and_b32_e32 v0, -4, v3
; GFX7-NEXT: flat_load_dword v5, v[0:1]
-; GFX7-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX7-NEXT: v_lshlrev_b32_e32 v3, 3, v3
-; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v3
-; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GFX7-NEXT: v_not_b32_e32 v4, v4
+; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX7-NEXT: v_and_b32_e32 v2, 3, v3
+; GFX7-NEXT: v_lshlrev_b32_e32 v2, 3, v2
+; GFX7-NEXT: v_lshl_b32_e32 v3, 0xffff, v2
+; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
+; GFX7-NEXT: v_not_b32_e32 v3, v3
; GFX7-NEXT: s_mov_b64 s[4:5], 0
-; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
; GFX7-NEXT: .LBB46_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v6, v5
-; GFX7-NEXT: v_lshrrev_b32_e32 v5, v3, v6
+; GFX7-NEXT: v_lshrrev_b32_e32 v5, v2, v6
; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5
-; GFX7-NEXT: v_add_f32_e32 v5, v5, v2
+; GFX7-NEXT: v_add_f32_e32 v5, v5, v4
; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; GFX7-NEXT: v_and_b32_e32 v7, v6, v4
-; GFX7-NEXT: v_lshlrev_b32_e32 v5, v3, v5
+; GFX7-NEXT: v_and_b32_e32 v7, v6, v3
+; GFX7-NEXT: v_lshlrev_b32_e32 v5, v2, v5
; GFX7-NEXT: v_or_b32_e32 v5, v7, v5
; GFX7-NEXT: flat_atomic_cmpswap v5, v[0:1], v[5:6] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -12345,8 +12351,7 @@ define bfloat @flat_agent_atomic_fadd_ret_bf16__amdgpu_no_fine_grained_memory(pt
; GFX7-NEXT: s_cbranch_execnz .LBB46_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX7-NEXT: v_lshrrev_b32_e32 v0, v3, v5
-; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, v2, v5
; GFX7-NEXT: s_setpc_b64 s[30:31]
%result = atomicrmw fadd ptr %ptr, bfloat %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
ret bfloat %result
@@ -12782,23 +12787,24 @@ define bfloat @flat_agent_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_gr
; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX7-NEXT: v_and_b32_e32 v0, -4, v3
; GFX7-NEXT: flat_load_dword v5, v[0:1]
-; GFX7-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX7-NEXT: v_lshlrev_b32_e32 v3, 3, v3
-; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v3
-; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GFX7-NEXT: v_not_b32_e32 v4, v4
+; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX7-NEXT: v_and_b32_e32 v2, 3, v3
+; GFX7-NEXT: v_lshlrev_b32_e32 v2, 3, v2
+; GFX7-NEXT: v_lshl_b32_e32 v3, 0xffff, v2
+; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
+; GFX7-NEXT: v_not_b32_e32 v3, v3
; GFX7-NEXT: s_mov_b64 s[4:5], 0
-; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
; GFX7-NEXT: .LBB47_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v6, v5
-; GFX7-NEXT: v_lshrrev_b32_e32 v5, v3, v6
+; GFX7-NEXT: v_lshrrev_b32_e32 v5, v2, v6
; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5
-; GFX7-NEXT: v_add_f32_e32 v5, v5, v2
+; GFX7-NEXT: v_add_f32_e32 v5, v5, v4
; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; GFX7-NEXT: v_and_b32_e32 v7, v6, v4
-; GFX7-NEXT: v_lshlrev_b32_e32 v5, v3, v5
+; GFX7-NEXT: v_and_b32_e32 v7, v6, v3
+; GFX7-NEXT: v_lshlrev_b32_e32 v5, v2, v5
; GFX7-NEXT: v_or_b32_e32 v5, v7, v5
; GFX7-NEXT: flat_atomic_cmpswap v5, v[0:1], v[5:6] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -12809,8 +12815,7 @@ define bfloat @flat_agent_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_gr
; GFX7-NEXT: s_cbranch_execnz .LBB47_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX7-NEXT: v_lshrrev_b32_e32 v0, v3, v5
-; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, v2, v5
; GFX7-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr inbounds bfloat, ptr %ptr, i64 1023
%result = atomicrmw fadd ptr %gep, bfloat %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
@@ -13248,23 +13253,24 @@ define bfloat @flat_agent_atomic_fadd_ret_bf16__offset12b_neg__amdgpu_no_fine_gr
; GFX7-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc
; GFX7-NEXT: v_and_b32_e32 v0, -4, v3
; GFX7-NEXT: flat_load_dword v5, v[0:1]
-; GFX7-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX7-NEXT: v_lshlrev_b32_e32 v3, 3, v3
-; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v3
-; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GFX7-NEXT: v_not_b32_e32 v4, v4
+; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX7-NEXT: v_and_b32_e32 v2, 3, v3
+; GFX7-NEXT: v_lshlrev_b32_e32 v2, 3, v2
+; GFX7-NEXT: v_lshl_b32_e32 v3, 0xffff, v2
+; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
+; GFX7-NEXT: v_not_b32_e32 v3, v3
; GFX7-NEXT: s_mov_b64 s[4:5], 0
-; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
; GFX7-NEXT: .LBB48_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v6, v5
-; GFX7-NEXT: v_lshrrev_b32_e32 v5, v3, v6
+; GFX7-NEXT: v_lshrrev_b32_e32 v5, v2, v6
; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5
-; GFX7-NEXT: v_add_f32_e32 v5, v5, v2
+; GFX7-NEXT: v_add_f32_e32 v5, v5, v4
; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; GFX7-NEXT: v_and_b32_e32 v7, v6, v4
-; GFX7-NEXT: v_lshlrev_b32_e32 v5, v3, v5
+; GFX7-NEXT: v_and_b32_e32 v7, v6, v3
+; GFX7-NEXT: v_lshlrev_b32_e32 v5, v2, v5
; GFX7-NEXT: v_or_b32_e32 v5, v7, v5
; GFX7-NEXT: flat_atomic_cmpswap v5, v[0:1], v[5:6] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -13275,8 +13281,7 @@ define bfloat @flat_agent_atomic_fadd_ret_bf16__offset12b_neg__amdgpu_no_fine_gr
; GFX7-NEXT: s_cbranch_execnz .LBB48_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX7-NEXT: v_lshrrev_b32_e32 v0, v3, v5
-; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, v2, v5
; GFX7-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr inbounds bfloat, ptr %ptr, i64 -1024
%result = atomicrmw fadd ptr %gep, bfloat %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
@@ -13701,6 +13706,7 @@ define void @flat_agent_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_gr
; GFX7-NEXT: v_and_b32_e32 v0, -4, v4
; GFX7-NEXT: flat_load_dword v3, v[0:1]
; GFX7-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX7-NEXT: v_lshlrev_b32_e32 v4, 3, v4
; GFX7-NEXT: v_lshl_b32_e32 v5, 0xffff, v4
; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
@@ -14152,6 +14158,7 @@ define void @flat_agent_atomic_fadd_noret_bf16__offset12b_neg__amdgpu_no_fine_gr
; GFX7-NEXT: v_and_b32_e32 v0, -4, v4
; GFX7-NEXT: flat_load_dword v3, v[0:1]
; GFX7-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX7-NEXT: v_lshlrev_b32_e32 v4, 3, v4
; GFX7-NEXT: v_lshl_b32_e32 v5, 0xffff, v4
; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
@@ -14524,31 +14531,31 @@ define bfloat @flat_agent_atomic_fadd_ret_bf16__offset12b_pos__align4__amdgpu_no
; GFX7-LABEL: flat_agent_atomic_fadd_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0x7fe, v0
-; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX7-NEXT: flat_load_dword v3, v[0:1]
-; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
+; GFX7-NEXT: v_add_i32_e32 v3, vcc, 0x7fe, v0
+; GFX7-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc
+; GFX7-NEXT: flat_load_dword v0, v[3:4]
+; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v2
+; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GFX7-NEXT: s_mov_b64 s[4:5], 0
-; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX7-NEXT: .LBB51_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_mov_b32_e32 v4, v3
-; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v4
-; GFX7-NEXT: v_add_f32_e32 v3, v3, v2
-; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v4
-; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; GFX7-NEXT: v_or_b32_e32 v3, v5, v3
-; GFX7-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
+; GFX7-NEXT: v_mov_b32_e32 v6, v0
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v6
+; GFX7-NEXT: v_add_f32_e32 v0, v0, v1
+; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v6
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX7-NEXT: v_or_b32_e32 v5, v2, v0
+; GFX7-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6
; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_cbranch_execnz .LBB51_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v3
; GFX7-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr inbounds bfloat, ptr %ptr, i64 1023
%result = atomicrmw fadd ptr %gep, bfloat %val syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !0
@@ -14887,6 +14894,7 @@ define void @flat_agent_atomic_fadd_noret_bf16__offset12b__align4_pos__amdgpu_no
; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0x7fe, v0
; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX7-NEXT: flat_load_dword v3, v[0:1]
+; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GFX7-NEXT: s_mov_b64 s[4:5], 0
; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v2
@@ -15317,22 +15325,23 @@ define void @flat_agent_atomic_fadd_noret_bf16__amdgpu_no_fine_grained_memory(pt
; GFX7-NEXT: v_mov_b32_e32 v3, v0
; GFX7-NEXT: v_and_b32_e32 v0, -4, v3
; GFX7-NEXT: flat_load_dword v4, v[0:1]
-; GFX7-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX7-NEXT: v_lshlrev_b32_e32 v5, 3, v3
-; GFX7-NEXT: v_lshl_b32_e32 v3, 0xffff, v5
-; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GFX7-NEXT: v_not_b32_e32 v6, v3
+; GFX7-NEXT: v_lshlrev_b32_e32 v6, 16, v2
+; GFX7-NEXT: v_and_b32_e32 v2, 3, v3
+; GFX7-NEXT: v_lshlrev_b32_e32 v2, 3, v2
+; GFX7-NEXT: v_lshl_b32_e32 v3, 0xffff, v2
+; GFX7-NEXT: v_not_b32_e32 v5, v3
+; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v6
; GFX7-NEXT: s_mov_b64 s[4:5], 0
-; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
; GFX7-NEXT: .LBB53_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_lshrrev_b32_e32 v3, v5, v4
+; GFX7-NEXT: v_lshrrev_b32_e32 v3, v2, v4
; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX7-NEXT: v_add_f32_e32 v3, v3, v2
+; GFX7-NEXT: v_add_f32_e32 v3, v3, v6
; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; GFX7-NEXT: v_and_b32_e32 v7, v4, v6
-; GFX7-NEXT: v_lshlrev_b32_e32 v3, v5, v3
+; GFX7-NEXT: v_and_b32_e32 v7, v4, v5
+; GFX7-NEXT: v_lshlrev_b32_e32 v3, v2, v3
; GFX7-NEXT: v_or_b32_e32 v3, v7, v3
; GFX7-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -15783,23 +15792,24 @@ define bfloat @flat_system_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_g
; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX7-NEXT: v_and_b32_e32 v0, -4, v3
; GFX7-NEXT: flat_load_dword v5, v[0:1]
-; GFX7-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX7-NEXT: v_lshlrev_b32_e32 v3, 3, v3
-; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v3
-; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GFX7-NEXT: v_not_b32_e32 v4, v4
+; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX7-NEXT: v_and_b32_e32 v2, 3, v3
+; GFX7-NEXT: v_lshlrev_b32_e32 v2, 3, v2
+; GFX7-NEXT: v_lshl_b32_e32 v3, 0xffff, v2
+; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
+; GFX7-NEXT: v_not_b32_e32 v3, v3
; GFX7-NEXT: s_mov_b64 s[4:5], 0
-; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
; GFX7-NEXT: .LBB54_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v6, v5
-; GFX7-NEXT: v_lshrrev_b32_e32 v5, v3, v6
+; GFX7-NEXT: v_lshrrev_b32_e32 v5, v2, v6
; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5
-; GFX7-NEXT: v_add_f32_e32 v5, v5, v2
+; GFX7-NEXT: v_add_f32_e32 v5, v5, v4
; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; GFX7-NEXT: v_and_b32_e32 v7, v6, v4
-; GFX7-NEXT: v_lshlrev_b32_e32 v5, v3, v5
+; GFX7-NEXT: v_and_b32_e32 v7, v6, v3
+; GFX7-NEXT: v_lshlrev_b32_e32 v5, v2, v5
; GFX7-NEXT: v_or_b32_e32 v5, v7, v5
; GFX7-NEXT: flat_atomic_cmpswap v5, v[0:1], v[5:6] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -15810,8 +15820,7 @@ define bfloat @flat_system_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_g
; GFX7-NEXT: s_cbranch_execnz .LBB54_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX7-NEXT: v_lshrrev_b32_e32 v0, v3, v5
-; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, v2, v5
; GFX7-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr inbounds bfloat, ptr %ptr, i64 1023
%result = atomicrmw fadd ptr %gep, bfloat %val seq_cst, !amdgpu.no.fine.grained.memory !0
@@ -16240,6 +16249,7 @@ define void @flat_system_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_g
; GFX7-NEXT: v_and_b32_e32 v0, -4, v4
; GFX7-NEXT: flat_load_dword v3, v[0:1]
; GFX7-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX7-NEXT: v_lshlrev_b32_e32 v4, 3, v4
; GFX7-NEXT: v_lshl_b32_e32 v5, 0xffff, v4
; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
diff --git a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmax.ll b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmax.ll
index 2b15147365777..a412a4eebe7ea 100644
--- a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmax.ll
+++ b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmax.ll
@@ -6350,10 +6350,11 @@ define half @flat_agent_atomic_fmax_ret_f16__amdgpu_no_fine_grained_memory(ptr %
; GFX7-NEXT: v_mov_b32_e32 v3, v0
; GFX7-NEXT: v_and_b32_e32 v0, -4, v3
; GFX7-NEXT: flat_load_dword v5, v[0:1]
+; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2
+; GFX7-NEXT: s_mov_b64 s[4:5], 0
; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v2
; GFX7-NEXT: v_and_b32_e32 v2, 3, v3
; GFX7-NEXT: v_lshlrev_b32_e32 v2, 3, v2
-; GFX7-NEXT: s_mov_b64 s[4:5], 0
; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v4
; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v2
; GFX7-NEXT: v_not_b32_e32 v4, v4
@@ -6378,7 +6379,6 @@ define half @flat_agent_atomic_fmax_ret_f16__amdgpu_no_fine_grained_memory(ptr %
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX7-NEXT: v_lshrrev_b32_e32 v0, v2, v5
-; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
%result = atomicrmw fmax ptr %ptr, half %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
ret half %result
@@ -6756,10 +6756,11 @@ define half @flat_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grain
; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX7-NEXT: v_and_b32_e32 v0, -4, v3
; GFX7-NEXT: flat_load_dword v5, v[0:1]
+; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2
+; GFX7-NEXT: s_mov_b64 s[4:5], 0
; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v2
; GFX7-NEXT: v_and_b32_e32 v2, 3, v3
; GFX7-NEXT: v_lshlrev_b32_e32 v2, 3, v2
-; GFX7-NEXT: s_mov_b64 s[4:5], 0
; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v4
; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v2
; GFX7-NEXT: v_not_b32_e32 v4, v4
@@ -6784,7 +6785,6 @@ define half @flat_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grain
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX7-NEXT: v_lshrrev_b32_e32 v0, v2, v5
-; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr inbounds half, ptr %ptr, i64 1023
%result = atomicrmw fmax ptr %gep, half %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
@@ -7164,10 +7164,11 @@ define half @flat_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_grain
; GFX7-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc
; GFX7-NEXT: v_and_b32_e32 v0, -4, v3
; GFX7-NEXT: flat_load_dword v5, v[0:1]
+; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2
+; GFX7-NEXT: s_mov_b64 s[4:5], 0
; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v2
; GFX7-NEXT: v_and_b32_e32 v2, 3, v3
; GFX7-NEXT: v_lshlrev_b32_e32 v2, 3, v2
-; GFX7-NEXT: s_mov_b64 s[4:5], 0
; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v4
; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v2
; GFX7-NEXT: v_not_b32_e32 v4, v4
@@ -7192,7 +7193,6 @@ define half @flat_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_grain
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX7-NEXT: v_lshrrev_b32_e32 v0, v2, v5
-; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr inbounds half, ptr %ptr, i64 -1024
%result = atomicrmw fmax ptr %gep, half %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
@@ -7544,13 +7544,14 @@ define void @flat_agent_atomic_fmax_noret_f16__amdgpu_no_fine_grained_memory(ptr
; GFX7-NEXT: v_mov_b32_e32 v3, v0
; GFX7-NEXT: v_and_b32_e32 v0, -4, v3
; GFX7-NEXT: flat_load_dword v4, v[0:1]
+; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2
+; GFX7-NEXT: s_mov_b64 s[4:5], 0
; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v2
; GFX7-NEXT: v_and_b32_e32 v2, 3, v3
; GFX7-NEXT: v_lshlrev_b32_e32 v2, 3, v2
; GFX7-NEXT: v_lshl_b32_e32 v3, 0xffff, v2
; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5
; GFX7-NEXT: v_not_b32_e32 v6, v3
-; GFX7-NEXT: s_mov_b64 s[4:5], 0
; GFX7-NEXT: .LBB29_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -7937,10 +7938,11 @@ define void @flat_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_gra
; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX7-NEXT: v_and_b32_e32 v0, -4, v4
; GFX7-NEXT: flat_load_dword v3, v[0:1]
-; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2
; GFX7-NEXT: v_and_b32_e32 v4, 3, v4
; GFX7-NEXT: v_lshlrev_b32_e32 v4, 3, v4
; GFX7-NEXT: s_mov_b64 s[4:5], 0
+; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2
; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v2
; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v4
; GFX7-NEXT: v_not_b32_e32 v6, v2
@@ -8332,10 +8334,11 @@ define void @flat_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_gra
; GFX7-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc
; GFX7-NEXT: v_and_b32_e32 v0, -4, v4
; GFX7-NEXT: flat_load_dword v3, v[0:1]
-; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2
; GFX7-NEXT: v_and_b32_e32 v4, 3, v4
; GFX7-NEXT: v_lshlrev_b32_e32 v4, 3, v4
; GFX7-NEXT: s_mov_b64 s[4:5], 0
+; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2
; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v2
; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v4
; GFX7-NEXT: v_not_b32_e32 v6, v2
@@ -8636,31 +8639,31 @@ define half @flat_agent_atomic_fmax_ret_f16__offset12b_pos__align4__amdgpu_no_fi
; GFX7-LABEL: flat_agent_atomic_fmax_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0x7fe, v0
-; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX7-NEXT: flat_load_dword v3, v[0:1]
-; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GFX7-NEXT: v_add_i32_e32 v3, vcc, 0x7fe, v0
+; GFX7-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc
+; GFX7-NEXT: flat_load_dword v0, v[3:4]
+; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v2
; GFX7-NEXT: s_mov_b64 s[4:5], 0
-; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2
+; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1
; GFX7-NEXT: .LBB32_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_mov_b32_e32 v4, v3
-; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v4
-; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v4
-; GFX7-NEXT: v_max_f32_e32 v3, v3, v2
-; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3
-; GFX7-NEXT: v_or_b32_e32 v3, v5, v3
-; GFX7-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
+; GFX7-NEXT: v_mov_b32_e32 v6, v0
+; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v6
+; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v6
+; GFX7-NEXT: v_max_f32_e32 v0, v0, v1
+; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX7-NEXT: v_or_b32_e32 v5, v2, v0
+; GFX7-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6
; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_cbranch_execnz .LBB32_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v3
; GFX7-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr inbounds half, ptr %ptr, i64 1023
%result = atomicrmw fmax ptr %gep, half %val syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !0
@@ -8932,8 +8935,9 @@ define void @flat_agent_atomic_fmax_noret_f16__offset12b__align4_pos__amdgpu_no_
; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0x7fe, v0
; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX7-NEXT: flat_load_dword v3, v[0:1]
-; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2
; GFX7-NEXT: s_mov_b64 s[4:5], 0
+; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2
; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v2
; GFX7-NEXT: .LBB33_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -9335,10 +9339,11 @@ define half @flat_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grai
; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX7-NEXT: v_and_b32_e32 v0, -4, v3
; GFX7-NEXT: flat_load_dword v5, v[0:1]
+; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2
+; GFX7-NEXT: s_mov_b64 s[4:5], 0
; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v2
; GFX7-NEXT: v_and_b32_e32 v2, 3, v3
; GFX7-NEXT: v_lshlrev_b32_e32 v2, 3, v2
-; GFX7-NEXT: s_mov_b64 s[4:5], 0
; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v4
; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v2
; GFX7-NEXT: v_not_b32_e32 v4, v4
@@ -9363,7 +9368,6 @@ define half @flat_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grai
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX7-NEXT: v_lshrrev_b32_e32 v0, v2, v5
-; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr inbounds half, ptr %ptr, i64 1023
%result = atomicrmw fmax ptr %gep, half %val seq_cst, !amdgpu.no.fine.grained.memory !0
@@ -9735,10 +9739,11 @@ define void @flat_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_gr
; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX7-NEXT: v_and_b32_e32 v0, -4, v4
; GFX7-NEXT: flat_load_dword v3, v[0:1]
-; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2
; GFX7-NEXT: v_and_b32_e32 v4, 3, v4
; GFX7-NEXT: v_lshlrev_b32_e32 v4, 3, v4
; GFX7-NEXT: s_mov_b64 s[4:5], 0
+; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2
; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v2
; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v4
; GFX7-NEXT: v_not_b32_e32 v6, v2
@@ -10188,24 +10193,25 @@ define bfloat @flat_agent_atomic_fmax_ret_bf16__amdgpu_no_fine_grained_memory(pt
; GFX7-NEXT: v_mov_b32_e32 v3, v0
; GFX7-NEXT: v_and_b32_e32 v0, -4, v3
; GFX7-NEXT: flat_load_dword v5, v[0:1]
-; GFX7-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX7-NEXT: v_lshlrev_b32_e32 v3, 3, v3
-; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v3
-; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GFX7-NEXT: v_not_b32_e32 v4, v4
+; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX7-NEXT: v_and_b32_e32 v2, 3, v3
+; GFX7-NEXT: v_lshlrev_b32_e32 v2, 3, v2
+; GFX7-NEXT: v_lshl_b32_e32 v3, 0xffff, v2
+; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
+; GFX7-NEXT: v_not_b32_e32 v3, v3
; GFX7-NEXT: s_mov_b64 s[4:5], 0
-; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
; GFX7-NEXT: .LBB36_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v6, v5
-; GFX7-NEXT: v_lshrrev_b32_e32 v5, v3, v6
+; GFX7-NEXT: v_lshrrev_b32_e32 v5, v2, v6
; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5
; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5
-; GFX7-NEXT: v_max_f32_e32 v5, v5, v2
+; GFX7-NEXT: v_max_f32_e32 v5, v5, v4
; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; GFX7-NEXT: v_and_b32_e32 v7, v6, v4
-; GFX7-NEXT: v_lshlrev_b32_e32 v5, v3, v5
+; GFX7-NEXT: v_and_b32_e32 v7, v6, v3
+; GFX7-NEXT: v_lshlrev_b32_e32 v5, v2, v5
; GFX7-NEXT: v_or_b32_e32 v5, v7, v5
; GFX7-NEXT: flat_atomic_cmpswap v5, v[0:1], v[5:6] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -10216,8 +10222,7 @@ define bfloat @flat_agent_atomic_fmax_ret_bf16__amdgpu_no_fine_grained_memory(pt
; GFX7-NEXT: s_cbranch_execnz .LBB36_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX7-NEXT: v_lshrrev_b32_e32 v0, v3, v5
-; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, v2, v5
; GFX7-NEXT: s_setpc_b64 s[30:31]
%result = atomicrmw fmax ptr %ptr, bfloat %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
ret bfloat %result
@@ -10653,24 +10658,25 @@ define bfloat @flat_agent_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_gr
; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX7-NEXT: v_and_b32_e32 v0, -4, v3
; GFX7-NEXT: flat_load_dword v5, v[0:1]
-; GFX7-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX7-NEXT: v_lshlrev_b32_e32 v3, 3, v3
-; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v3
-; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GFX7-NEXT: v_not_b32_e32 v4, v4
+; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX7-NEXT: v_and_b32_e32 v2, 3, v3
+; GFX7-NEXT: v_lshlrev_b32_e32 v2, 3, v2
+; GFX7-NEXT: v_lshl_b32_e32 v3, 0xffff, v2
+; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
+; GFX7-NEXT: v_not_b32_e32 v3, v3
; GFX7-NEXT: s_mov_b64 s[4:5], 0
-; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
; GFX7-NEXT: .LBB37_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v6, v5
-; GFX7-NEXT: v_lshrrev_b32_e32 v5, v3, v6
+; GFX7-NEXT: v_lshrrev_b32_e32 v5, v2, v6
; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5
; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5
-; GFX7-NEXT: v_max_f32_e32 v5, v5, v2
+; GFX7-NEXT: v_max_f32_e32 v5, v5, v4
; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; GFX7-NEXT: v_and_b32_e32 v7, v6, v4
-; GFX7-NEXT: v_lshlrev_b32_e32 v5, v3, v5
+; GFX7-NEXT: v_and_b32_e32 v7, v6, v3
+; GFX7-NEXT: v_lshlrev_b32_e32 v5, v2, v5
; GFX7-NEXT: v_or_b32_e32 v5, v7, v5
; GFX7-NEXT: flat_atomic_cmpswap v5, v[0:1], v[5:6] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -10681,8 +10687,7 @@ define bfloat @flat_agent_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_gr
; GFX7-NEXT: s_cbranch_execnz .LBB37_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX7-NEXT: v_lshrrev_b32_e32 v0, v3, v5
-; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, v2, v5
; GFX7-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr inbounds bfloat, ptr %ptr, i64 1023
%result = atomicrmw fmax ptr %gep, bfloat %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
@@ -11120,24 +11125,25 @@ define bfloat @flat_agent_atomic_fmax_ret_bf16__offset12b_neg__amdgpu_no_fine_gr
; GFX7-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc
; GFX7-NEXT: v_and_b32_e32 v0, -4, v3
; GFX7-NEXT: flat_load_dword v5, v[0:1]
-; GFX7-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX7-NEXT: v_lshlrev_b32_e32 v3, 3, v3
-; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v3
-; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GFX7-NEXT: v_not_b32_e32 v4, v4
+; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX7-NEXT: v_and_b32_e32 v2, 3, v3
+; GFX7-NEXT: v_lshlrev_b32_e32 v2, 3, v2
+; GFX7-NEXT: v_lshl_b32_e32 v3, 0xffff, v2
+; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
+; GFX7-NEXT: v_not_b32_e32 v3, v3
; GFX7-NEXT: s_mov_b64 s[4:5], 0
-; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
; GFX7-NEXT: .LBB38_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v6, v5
-; GFX7-NEXT: v_lshrrev_b32_e32 v5, v3, v6
+; GFX7-NEXT: v_lshrrev_b32_e32 v5, v2, v6
; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5
; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5
-; GFX7-NEXT: v_max_f32_e32 v5, v5, v2
+; GFX7-NEXT: v_max_f32_e32 v5, v5, v4
; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; GFX7-NEXT: v_and_b32_e32 v7, v6, v4
-; GFX7-NEXT: v_lshlrev_b32_e32 v5, v3, v5
+; GFX7-NEXT: v_and_b32_e32 v7, v6, v3
+; GFX7-NEXT: v_lshlrev_b32_e32 v5, v2, v5
; GFX7-NEXT: v_or_b32_e32 v5, v7, v5
; GFX7-NEXT: flat_atomic_cmpswap v5, v[0:1], v[5:6] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -11148,8 +11154,7 @@ define bfloat @flat_agent_atomic_fmax_ret_bf16__offset12b_neg__amdgpu_no_fine_gr
; GFX7-NEXT: s_cbranch_execnz .LBB38_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX7-NEXT: v_lshrrev_b32_e32 v0, v3, v5
-; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, v2, v5
; GFX7-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr inbounds bfloat, ptr %ptr, i64 -1024
%result = atomicrmw fmax ptr %gep, bfloat %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
@@ -11559,23 +11564,24 @@ define void @flat_agent_atomic_fmax_noret_bf16__amdgpu_no_fine_grained_memory(pt
; GFX7-NEXT: v_mov_b32_e32 v3, v0
; GFX7-NEXT: v_and_b32_e32 v0, -4, v3
; GFX7-NEXT: flat_load_dword v4, v[0:1]
-; GFX7-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX7-NEXT: v_lshlrev_b32_e32 v5, 3, v3
-; GFX7-NEXT: v_lshl_b32_e32 v3, 0xffff, v5
-; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GFX7-NEXT: v_not_b32_e32 v6, v3
+; GFX7-NEXT: v_lshlrev_b32_e32 v6, 16, v2
+; GFX7-NEXT: v_and_b32_e32 v2, 3, v3
+; GFX7-NEXT: v_lshlrev_b32_e32 v2, 3, v2
+; GFX7-NEXT: v_lshl_b32_e32 v3, 0xffff, v2
+; GFX7-NEXT: v_not_b32_e32 v5, v3
+; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v6
; GFX7-NEXT: s_mov_b64 s[4:5], 0
-; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
; GFX7-NEXT: .LBB39_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_lshrrev_b32_e32 v3, v5, v4
+; GFX7-NEXT: v_lshrrev_b32_e32 v3, v2, v4
; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
-; GFX7-NEXT: v_max_f32_e32 v3, v3, v2
+; GFX7-NEXT: v_max_f32_e32 v3, v3, v6
; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; GFX7-NEXT: v_and_b32_e32 v7, v4, v6
-; GFX7-NEXT: v_lshlrev_b32_e32 v3, v5, v3
+; GFX7-NEXT: v_and_b32_e32 v7, v4, v5
+; GFX7-NEXT: v_lshlrev_b32_e32 v3, v2, v3
; GFX7-NEXT: v_or_b32_e32 v3, v7, v3
; GFX7-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -12010,6 +12016,7 @@ define void @flat_agent_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_gr
; GFX7-NEXT: v_and_b32_e32 v0, -4, v4
; GFX7-NEXT: flat_load_dword v3, v[0:1]
; GFX7-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX7-NEXT: v_lshlrev_b32_e32 v4, 3, v4
; GFX7-NEXT: v_lshl_b32_e32 v5, 0xffff, v4
; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
@@ -12462,6 +12469,7 @@ define void @flat_agent_atomic_fmax_noret_bf16__offset12b_neg__amdgpu_no_fine_gr
; GFX7-NEXT: v_and_b32_e32 v0, -4, v4
; GFX7-NEXT: flat_load_dword v3, v[0:1]
; GFX7-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX7-NEXT: v_lshlrev_b32_e32 v4, 3, v4
; GFX7-NEXT: v_lshl_b32_e32 v5, 0xffff, v4
; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
@@ -12835,32 +12843,32 @@ define bfloat @flat_agent_atomic_fmax_ret_bf16__offset12b_pos__align4__amdgpu_no
; GFX7-LABEL: flat_agent_atomic_fmax_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0x7fe, v0
-; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX7-NEXT: flat_load_dword v3, v[0:1]
-; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
+; GFX7-NEXT: v_add_i32_e32 v3, vcc, 0x7fe, v0
+; GFX7-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc
+; GFX7-NEXT: flat_load_dword v0, v[3:4]
+; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v2
+; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GFX7-NEXT: s_mov_b64 s[4:5], 0
-; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX7-NEXT: .LBB42_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_mov_b32_e32 v4, v3
-; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v4
-; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
-; GFX7-NEXT: v_max_f32_e32 v3, v3, v2
-; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v4
-; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; GFX7-NEXT: v_or_b32_e32 v3, v5, v3
-; GFX7-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
+; GFX7-NEXT: v_mov_b32_e32 v6, v0
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v6
+; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GFX7-NEXT: v_max_f32_e32 v0, v0, v1
+; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v6
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX7-NEXT: v_or_b32_e32 v5, v2, v0
+; GFX7-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6
; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_cbranch_execnz .LBB42_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v3
; GFX7-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr inbounds bfloat, ptr %ptr, i64 1023
%result = atomicrmw fmax ptr %gep, bfloat %val syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !0
@@ -13199,6 +13207,7 @@ define void @flat_agent_atomic_fmax_noret_bf16__offset12b__align4_pos__amdgpu_no
; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0x7fe, v0
; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX7-NEXT: flat_load_dword v3, v[0:1]
+; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GFX7-NEXT: s_mov_b64 s[4:5], 0
; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v2
@@ -13661,24 +13670,25 @@ define bfloat @flat_system_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_g
; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX7-NEXT: v_and_b32_e32 v0, -4, v3
; GFX7-NEXT: flat_load_dword v5, v[0:1]
-; GFX7-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX7-NEXT: v_lshlrev_b32_e32 v3, 3, v3
-; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v3
-; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GFX7-NEXT: v_not_b32_e32 v4, v4
+; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX7-NEXT: v_and_b32_e32 v2, 3, v3
+; GFX7-NEXT: v_lshlrev_b32_e32 v2, 3, v2
+; GFX7-NEXT: v_lshl_b32_e32 v3, 0xffff, v2
+; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
+; GFX7-NEXT: v_not_b32_e32 v3, v3
; GFX7-NEXT: s_mov_b64 s[4:5], 0
-; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
; GFX7-NEXT: .LBB44_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v6, v5
-; GFX7-NEXT: v_lshrrev_b32_e32 v5, v3, v6
+; GFX7-NEXT: v_lshrrev_b32_e32 v5, v2, v6
; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5
; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5
-; GFX7-NEXT: v_max_f32_e32 v5, v5, v2
+; GFX7-NEXT: v_max_f32_e32 v5, v5, v4
; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; GFX7-NEXT: v_and_b32_e32 v7, v6, v4
-; GFX7-NEXT: v_lshlrev_b32_e32 v5, v3, v5
+; GFX7-NEXT: v_and_b32_e32 v7, v6, v3
+; GFX7-NEXT: v_lshlrev_b32_e32 v5, v2, v5
; GFX7-NEXT: v_or_b32_e32 v5, v7, v5
; GFX7-NEXT: flat_atomic_cmpswap v5, v[0:1], v[5:6] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -13689,8 +13699,7 @@ define bfloat @flat_system_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_g
; GFX7-NEXT: s_cbranch_execnz .LBB44_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX7-NEXT: v_lshrrev_b32_e32 v0, v3, v5
-; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, v2, v5
; GFX7-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr inbounds bfloat, ptr %ptr, i64 1023
%result = atomicrmw fmax ptr %gep, bfloat %val seq_cst, !amdgpu.no.fine.grained.memory !0
@@ -14119,6 +14128,7 @@ define void @flat_system_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_g
; GFX7-NEXT: v_and_b32_e32 v0, -4, v4
; GFX7-NEXT: flat_load_dword v3, v[0:1]
; GFX7-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX7-NEXT: v_lshlrev_b32_e32 v4, 3, v4
; GFX7-NEXT: v_lshl_b32_e32 v5, 0xffff, v4
; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
diff --git a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmin.ll b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmin.ll
index ad7ee22fdb76e..c05d76a63a1d4 100644
--- a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmin.ll
+++ b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmin.ll
@@ -6350,10 +6350,11 @@ define half @flat_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory(ptr %
; GFX7-NEXT: v_mov_b32_e32 v3, v0
; GFX7-NEXT: v_and_b32_e32 v0, -4, v3
; GFX7-NEXT: flat_load_dword v5, v[0:1]
+; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2
+; GFX7-NEXT: s_mov_b64 s[4:5], 0
; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v2
; GFX7-NEXT: v_and_b32_e32 v2, 3, v3
; GFX7-NEXT: v_lshlrev_b32_e32 v2, 3, v2
-; GFX7-NEXT: s_mov_b64 s[4:5], 0
; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v4
; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v2
; GFX7-NEXT: v_not_b32_e32 v4, v4
@@ -6378,7 +6379,6 @@ define half @flat_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory(ptr %
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX7-NEXT: v_lshrrev_b32_e32 v0, v2, v5
-; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
%result = atomicrmw fmin ptr %ptr, half %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
ret half %result
@@ -6756,10 +6756,11 @@ define half @flat_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grain
; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX7-NEXT: v_and_b32_e32 v0, -4, v3
; GFX7-NEXT: flat_load_dword v5, v[0:1]
+; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2
+; GFX7-NEXT: s_mov_b64 s[4:5], 0
; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v2
; GFX7-NEXT: v_and_b32_e32 v2, 3, v3
; GFX7-NEXT: v_lshlrev_b32_e32 v2, 3, v2
-; GFX7-NEXT: s_mov_b64 s[4:5], 0
; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v4
; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v2
; GFX7-NEXT: v_not_b32_e32 v4, v4
@@ -6784,7 +6785,6 @@ define half @flat_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grain
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX7-NEXT: v_lshrrev_b32_e32 v0, v2, v5
-; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr inbounds half, ptr %ptr, i64 1023
%result = atomicrmw fmin ptr %gep, half %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
@@ -7164,10 +7164,11 @@ define half @flat_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_grain
; GFX7-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc
; GFX7-NEXT: v_and_b32_e32 v0, -4, v3
; GFX7-NEXT: flat_load_dword v5, v[0:1]
+; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2
+; GFX7-NEXT: s_mov_b64 s[4:5], 0
; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v2
; GFX7-NEXT: v_and_b32_e32 v2, 3, v3
; GFX7-NEXT: v_lshlrev_b32_e32 v2, 3, v2
-; GFX7-NEXT: s_mov_b64 s[4:5], 0
; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v4
; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v2
; GFX7-NEXT: v_not_b32_e32 v4, v4
@@ -7192,7 +7193,6 @@ define half @flat_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_grain
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX7-NEXT: v_lshrrev_b32_e32 v0, v2, v5
-; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr inbounds half, ptr %ptr, i64 -1024
%result = atomicrmw fmin ptr %gep, half %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
@@ -7544,13 +7544,14 @@ define void @flat_agent_atomic_fmin_noret_f16__amdgpu_no_fine_grained_memory(ptr
; GFX7-NEXT: v_mov_b32_e32 v3, v0
; GFX7-NEXT: v_and_b32_e32 v0, -4, v3
; GFX7-NEXT: flat_load_dword v4, v[0:1]
+; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2
+; GFX7-NEXT: s_mov_b64 s[4:5], 0
; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v2
; GFX7-NEXT: v_and_b32_e32 v2, 3, v3
; GFX7-NEXT: v_lshlrev_b32_e32 v2, 3, v2
; GFX7-NEXT: v_lshl_b32_e32 v3, 0xffff, v2
; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5
; GFX7-NEXT: v_not_b32_e32 v6, v3
-; GFX7-NEXT: s_mov_b64 s[4:5], 0
; GFX7-NEXT: .LBB29_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -7937,10 +7938,11 @@ define void @flat_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_gra
; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX7-NEXT: v_and_b32_e32 v0, -4, v4
; GFX7-NEXT: flat_load_dword v3, v[0:1]
-; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2
; GFX7-NEXT: v_and_b32_e32 v4, 3, v4
; GFX7-NEXT: v_lshlrev_b32_e32 v4, 3, v4
; GFX7-NEXT: s_mov_b64 s[4:5], 0
+; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2
; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v2
; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v4
; GFX7-NEXT: v_not_b32_e32 v6, v2
@@ -8332,10 +8334,11 @@ define void @flat_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_gra
; GFX7-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc
; GFX7-NEXT: v_and_b32_e32 v0, -4, v4
; GFX7-NEXT: flat_load_dword v3, v[0:1]
-; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2
; GFX7-NEXT: v_and_b32_e32 v4, 3, v4
; GFX7-NEXT: v_lshlrev_b32_e32 v4, 3, v4
; GFX7-NEXT: s_mov_b64 s[4:5], 0
+; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2
; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v2
; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v4
; GFX7-NEXT: v_not_b32_e32 v6, v2
@@ -8636,31 +8639,31 @@ define half @flat_agent_atomic_fmin_ret_f16__offset12b_pos__align4__amdgpu_no_fi
; GFX7-LABEL: flat_agent_atomic_fmin_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0x7fe, v0
-; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX7-NEXT: flat_load_dword v3, v[0:1]
-; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GFX7-NEXT: v_add_i32_e32 v3, vcc, 0x7fe, v0
+; GFX7-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc
+; GFX7-NEXT: flat_load_dword v0, v[3:4]
+; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v2
; GFX7-NEXT: s_mov_b64 s[4:5], 0
-; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2
+; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1
; GFX7-NEXT: .LBB32_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_mov_b32_e32 v4, v3
-; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v4
-; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v4
-; GFX7-NEXT: v_min_f32_e32 v3, v3, v2
-; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3
-; GFX7-NEXT: v_or_b32_e32 v3, v5, v3
-; GFX7-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
+; GFX7-NEXT: v_mov_b32_e32 v6, v0
+; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v6
+; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v6
+; GFX7-NEXT: v_min_f32_e32 v0, v0, v1
+; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX7-NEXT: v_or_b32_e32 v5, v2, v0
+; GFX7-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6
; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_cbranch_execnz .LBB32_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v3
; GFX7-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr inbounds half, ptr %ptr, i64 1023
%result = atomicrmw fmin ptr %gep, half %val syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !0
@@ -8932,8 +8935,9 @@ define void @flat_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_no_
; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0x7fe, v0
; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX7-NEXT: flat_load_dword v3, v[0:1]
-; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2
; GFX7-NEXT: s_mov_b64 s[4:5], 0
+; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2
; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v2
; GFX7-NEXT: .LBB33_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -9335,10 +9339,11 @@ define half @flat_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grai
; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX7-NEXT: v_and_b32_e32 v0, -4, v3
; GFX7-NEXT: flat_load_dword v5, v[0:1]
+; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2
+; GFX7-NEXT: s_mov_b64 s[4:5], 0
; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v2
; GFX7-NEXT: v_and_b32_e32 v2, 3, v3
; GFX7-NEXT: v_lshlrev_b32_e32 v2, 3, v2
-; GFX7-NEXT: s_mov_b64 s[4:5], 0
; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v4
; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v2
; GFX7-NEXT: v_not_b32_e32 v4, v4
@@ -9363,7 +9368,6 @@ define half @flat_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grai
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX7-NEXT: v_lshrrev_b32_e32 v0, v2, v5
-; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr inbounds half, ptr %ptr, i64 1023
%result = atomicrmw fmin ptr %gep, half %val seq_cst, !amdgpu.no.fine.grained.memory !0
@@ -9735,10 +9739,11 @@ define void @flat_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_gr
; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX7-NEXT: v_and_b32_e32 v0, -4, v4
; GFX7-NEXT: flat_load_dword v3, v[0:1]
-; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2
; GFX7-NEXT: v_and_b32_e32 v4, 3, v4
; GFX7-NEXT: v_lshlrev_b32_e32 v4, 3, v4
; GFX7-NEXT: s_mov_b64 s[4:5], 0
+; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2
; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v2
; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v4
; GFX7-NEXT: v_not_b32_e32 v6, v2
@@ -10188,24 +10193,25 @@ define bfloat @flat_agent_atomic_fmin_ret_bf16__amdgpu_no_fine_grained_memory(pt
; GFX7-NEXT: v_mov_b32_e32 v3, v0
; GFX7-NEXT: v_and_b32_e32 v0, -4, v3
; GFX7-NEXT: flat_load_dword v5, v[0:1]
-; GFX7-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX7-NEXT: v_lshlrev_b32_e32 v3, 3, v3
-; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v3
-; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GFX7-NEXT: v_not_b32_e32 v4, v4
+; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX7-NEXT: v_and_b32_e32 v2, 3, v3
+; GFX7-NEXT: v_lshlrev_b32_e32 v2, 3, v2
+; GFX7-NEXT: v_lshl_b32_e32 v3, 0xffff, v2
+; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
+; GFX7-NEXT: v_not_b32_e32 v3, v3
; GFX7-NEXT: s_mov_b64 s[4:5], 0
-; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
; GFX7-NEXT: .LBB36_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v6, v5
-; GFX7-NEXT: v_lshrrev_b32_e32 v5, v3, v6
+; GFX7-NEXT: v_lshrrev_b32_e32 v5, v2, v6
; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5
; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5
-; GFX7-NEXT: v_min_f32_e32 v5, v5, v2
+; GFX7-NEXT: v_min_f32_e32 v5, v5, v4
; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; GFX7-NEXT: v_and_b32_e32 v7, v6, v4
-; GFX7-NEXT: v_lshlrev_b32_e32 v5, v3, v5
+; GFX7-NEXT: v_and_b32_e32 v7, v6, v3
+; GFX7-NEXT: v_lshlrev_b32_e32 v5, v2, v5
; GFX7-NEXT: v_or_b32_e32 v5, v7, v5
; GFX7-NEXT: flat_atomic_cmpswap v5, v[0:1], v[5:6] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -10216,8 +10222,7 @@ define bfloat @flat_agent_atomic_fmin_ret_bf16__amdgpu_no_fine_grained_memory(pt
; GFX7-NEXT: s_cbranch_execnz .LBB36_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX7-NEXT: v_lshrrev_b32_e32 v0, v3, v5
-; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, v2, v5
; GFX7-NEXT: s_setpc_b64 s[30:31]
%result = atomicrmw fmin ptr %ptr, bfloat %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
ret bfloat %result
@@ -10653,24 +10658,25 @@ define bfloat @flat_agent_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_gr
; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX7-NEXT: v_and_b32_e32 v0, -4, v3
; GFX7-NEXT: flat_load_dword v5, v[0:1]
-; GFX7-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX7-NEXT: v_lshlrev_b32_e32 v3, 3, v3
-; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v3
-; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GFX7-NEXT: v_not_b32_e32 v4, v4
+; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX7-NEXT: v_and_b32_e32 v2, 3, v3
+; GFX7-NEXT: v_lshlrev_b32_e32 v2, 3, v2
+; GFX7-NEXT: v_lshl_b32_e32 v3, 0xffff, v2
+; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
+; GFX7-NEXT: v_not_b32_e32 v3, v3
; GFX7-NEXT: s_mov_b64 s[4:5], 0
-; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
; GFX7-NEXT: .LBB37_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v6, v5
-; GFX7-NEXT: v_lshrrev_b32_e32 v5, v3, v6
+; GFX7-NEXT: v_lshrrev_b32_e32 v5, v2, v6
; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5
; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5
-; GFX7-NEXT: v_min_f32_e32 v5, v5, v2
+; GFX7-NEXT: v_min_f32_e32 v5, v5, v4
; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; GFX7-NEXT: v_and_b32_e32 v7, v6, v4
-; GFX7-NEXT: v_lshlrev_b32_e32 v5, v3, v5
+; GFX7-NEXT: v_and_b32_e32 v7, v6, v3
+; GFX7-NEXT: v_lshlrev_b32_e32 v5, v2, v5
; GFX7-NEXT: v_or_b32_e32 v5, v7, v5
; GFX7-NEXT: flat_atomic_cmpswap v5, v[0:1], v[5:6] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -10681,8 +10687,7 @@ define bfloat @flat_agent_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_gr
; GFX7-NEXT: s_cbranch_execnz .LBB37_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX7-NEXT: v_lshrrev_b32_e32 v0, v3, v5
-; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, v2, v5
; GFX7-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr inbounds bfloat, ptr %ptr, i64 1023
%result = atomicrmw fmin ptr %gep, bfloat %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
@@ -11120,24 +11125,25 @@ define bfloat @flat_agent_atomic_fmin_ret_bf16__offset12b_neg__amdgpu_no_fine_gr
; GFX7-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc
; GFX7-NEXT: v_and_b32_e32 v0, -4, v3
; GFX7-NEXT: flat_load_dword v5, v[0:1]
-; GFX7-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX7-NEXT: v_lshlrev_b32_e32 v3, 3, v3
-; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v3
-; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GFX7-NEXT: v_not_b32_e32 v4, v4
+; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX7-NEXT: v_and_b32_e32 v2, 3, v3
+; GFX7-NEXT: v_lshlrev_b32_e32 v2, 3, v2
+; GFX7-NEXT: v_lshl_b32_e32 v3, 0xffff, v2
+; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
+; GFX7-NEXT: v_not_b32_e32 v3, v3
; GFX7-NEXT: s_mov_b64 s[4:5], 0
-; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
; GFX7-NEXT: .LBB38_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v6, v5
-; GFX7-NEXT: v_lshrrev_b32_e32 v5, v3, v6
+; GFX7-NEXT: v_lshrrev_b32_e32 v5, v2, v6
; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5
; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5
-; GFX7-NEXT: v_min_f32_e32 v5, v5, v2
+; GFX7-NEXT: v_min_f32_e32 v5, v5, v4
; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; GFX7-NEXT: v_and_b32_e32 v7, v6, v4
-; GFX7-NEXT: v_lshlrev_b32_e32 v5, v3, v5
+; GFX7-NEXT: v_and_b32_e32 v7, v6, v3
+; GFX7-NEXT: v_lshlrev_b32_e32 v5, v2, v5
; GFX7-NEXT: v_or_b32_e32 v5, v7, v5
; GFX7-NEXT: flat_atomic_cmpswap v5, v[0:1], v[5:6] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -11148,8 +11154,7 @@ define bfloat @flat_agent_atomic_fmin_ret_bf16__offset12b_neg__amdgpu_no_fine_gr
; GFX7-NEXT: s_cbranch_execnz .LBB38_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX7-NEXT: v_lshrrev_b32_e32 v0, v3, v5
-; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, v2, v5
; GFX7-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr inbounds bfloat, ptr %ptr, i64 -1024
%result = atomicrmw fmin ptr %gep, bfloat %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
@@ -11559,23 +11564,24 @@ define void @flat_agent_atomic_fmin_noret_bf16__amdgpu_no_fine_grained_memory(pt
; GFX7-NEXT: v_mov_b32_e32 v3, v0
; GFX7-NEXT: v_and_b32_e32 v0, -4, v3
; GFX7-NEXT: flat_load_dword v4, v[0:1]
-; GFX7-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX7-NEXT: v_lshlrev_b32_e32 v5, 3, v3
-; GFX7-NEXT: v_lshl_b32_e32 v3, 0xffff, v5
-; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GFX7-NEXT: v_not_b32_e32 v6, v3
+; GFX7-NEXT: v_lshlrev_b32_e32 v6, 16, v2
+; GFX7-NEXT: v_and_b32_e32 v2, 3, v3
+; GFX7-NEXT: v_lshlrev_b32_e32 v2, 3, v2
+; GFX7-NEXT: v_lshl_b32_e32 v3, 0xffff, v2
+; GFX7-NEXT: v_not_b32_e32 v5, v3
+; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v6
; GFX7-NEXT: s_mov_b64 s[4:5], 0
-; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
; GFX7-NEXT: .LBB39_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_lshrrev_b32_e32 v3, v5, v4
+; GFX7-NEXT: v_lshrrev_b32_e32 v3, v2, v4
; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
-; GFX7-NEXT: v_min_f32_e32 v3, v3, v2
+; GFX7-NEXT: v_min_f32_e32 v3, v3, v6
; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; GFX7-NEXT: v_and_b32_e32 v7, v4, v6
-; GFX7-NEXT: v_lshlrev_b32_e32 v3, v5, v3
+; GFX7-NEXT: v_and_b32_e32 v7, v4, v5
+; GFX7-NEXT: v_lshlrev_b32_e32 v3, v2, v3
; GFX7-NEXT: v_or_b32_e32 v3, v7, v3
; GFX7-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -12010,6 +12016,7 @@ define void @flat_agent_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_gr
; GFX7-NEXT: v_and_b32_e32 v0, -4, v4
; GFX7-NEXT: flat_load_dword v3, v[0:1]
; GFX7-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX7-NEXT: v_lshlrev_b32_e32 v4, 3, v4
; GFX7-NEXT: v_lshl_b32_e32 v5, 0xffff, v4
; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
@@ -12462,6 +12469,7 @@ define void @flat_agent_atomic_fmin_noret_bf16__offset12b_neg__amdgpu_no_fine_gr
; GFX7-NEXT: v_and_b32_e32 v0, -4, v4
; GFX7-NEXT: flat_load_dword v3, v[0:1]
; GFX7-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX7-NEXT: v_lshlrev_b32_e32 v4, 3, v4
; GFX7-NEXT: v_lshl_b32_e32 v5, 0xffff, v4
; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
@@ -12835,32 +12843,32 @@ define bfloat @flat_agent_atomic_fmin_ret_bf16__offset12b_pos__align4__amdgpu_no
; GFX7-LABEL: flat_agent_atomic_fmin_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0x7fe, v0
-; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX7-NEXT: flat_load_dword v3, v[0:1]
-; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
+; GFX7-NEXT: v_add_i32_e32 v3, vcc, 0x7fe, v0
+; GFX7-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc
+; GFX7-NEXT: flat_load_dword v0, v[3:4]
+; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v2
+; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GFX7-NEXT: s_mov_b64 s[4:5], 0
-; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX7-NEXT: .LBB42_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_mov_b32_e32 v4, v3
-; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v4
-; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
-; GFX7-NEXT: v_min_f32_e32 v3, v3, v2
-; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v4
-; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; GFX7-NEXT: v_or_b32_e32 v3, v5, v3
-; GFX7-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
+; GFX7-NEXT: v_mov_b32_e32 v6, v0
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v6
+; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GFX7-NEXT: v_min_f32_e32 v0, v0, v1
+; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v6
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX7-NEXT: v_or_b32_e32 v5, v2, v0
+; GFX7-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6
; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_cbranch_execnz .LBB42_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v3
; GFX7-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr inbounds bfloat, ptr %ptr, i64 1023
%result = atomicrmw fmin ptr %gep, bfloat %val syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory !0
@@ -13199,6 +13207,7 @@ define void @flat_agent_atomic_fmin_noret_bf16__offset12b__align4_pos__amdgpu_no
; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0x7fe, v0
; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX7-NEXT: flat_load_dword v3, v[0:1]
+; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GFX7-NEXT: s_mov_b64 s[4:5], 0
; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v2
@@ -13661,24 +13670,25 @@ define bfloat @flat_system_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_g
; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX7-NEXT: v_and_b32_e32 v0, -4, v3
; GFX7-NEXT: flat_load_dword v5, v[0:1]
-; GFX7-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX7-NEXT: v_lshlrev_b32_e32 v3, 3, v3
-; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v3
-; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GFX7-NEXT: v_not_b32_e32 v4, v4
+; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX7-NEXT: v_and_b32_e32 v2, 3, v3
+; GFX7-NEXT: v_lshlrev_b32_e32 v2, 3, v2
+; GFX7-NEXT: v_lshl_b32_e32 v3, 0xffff, v2
+; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
+; GFX7-NEXT: v_not_b32_e32 v3, v3
; GFX7-NEXT: s_mov_b64 s[4:5], 0
-; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
; GFX7-NEXT: .LBB44_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v6, v5
-; GFX7-NEXT: v_lshrrev_b32_e32 v5, v3, v6
+; GFX7-NEXT: v_lshrrev_b32_e32 v5, v2, v6
; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5
; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5
-; GFX7-NEXT: v_min_f32_e32 v5, v5, v2
+; GFX7-NEXT: v_min_f32_e32 v5, v5, v4
; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; GFX7-NEXT: v_and_b32_e32 v7, v6, v4
-; GFX7-NEXT: v_lshlrev_b32_e32 v5, v3, v5
+; GFX7-NEXT: v_and_b32_e32 v7, v6, v3
+; GFX7-NEXT: v_lshlrev_b32_e32 v5, v2, v5
; GFX7-NEXT: v_or_b32_e32 v5, v7, v5
; GFX7-NEXT: flat_atomic_cmpswap v5, v[0:1], v[5:6] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -13689,8 +13699,7 @@ define bfloat @flat_system_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_g
; GFX7-NEXT: s_cbranch_execnz .LBB44_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX7-NEXT: v_lshrrev_b32_e32 v0, v3, v5
-; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, v2, v5
; GFX7-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr inbounds bfloat, ptr %ptr, i64 1023
%result = atomicrmw fmin ptr %gep, bfloat %val seq_cst, !amdgpu.no.fine.grained.memory !0
@@ -14119,6 +14128,7 @@ define void @flat_system_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_g
; GFX7-NEXT: v_and_b32_e32 v0, -4, v4
; GFX7-NEXT: flat_load_dword v3, v[0:1]
; GFX7-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX7-NEXT: v_lshlrev_b32_e32 v4, 3, v4
; GFX7-NEXT: v_lshl_b32_e32 v5, 0xffff, v4
; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
diff --git a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fsub.ll b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fsub.ll
index a278be61104cc..d7c913cafd7d9 100644
--- a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fsub.ll
+++ b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fsub.ll
@@ -6137,10 +6137,11 @@ define half @flat_agent_atomic_fsub_ret_f16(ptr %ptr, half %val) #0 {
; GFX7-NEXT: v_mov_b32_e32 v3, v0
; GFX7-NEXT: v_and_b32_e32 v0, -4, v3
; GFX7-NEXT: flat_load_dword v5, v[0:1]
+; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2
+; GFX7-NEXT: s_mov_b64 s[4:5], 0
; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v2
; GFX7-NEXT: v_and_b32_e32 v2, 3, v3
; GFX7-NEXT: v_lshlrev_b32_e32 v2, 3, v2
-; GFX7-NEXT: s_mov_b64 s[4:5], 0
; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v4
; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v2
; GFX7-NEXT: v_not_b32_e32 v4, v4
@@ -6165,7 +6166,6 @@ define half @flat_agent_atomic_fsub_ret_f16(ptr %ptr, half %val) #0 {
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX7-NEXT: v_lshrrev_b32_e32 v0, v2, v5
-; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
%result = atomicrmw fsub ptr %ptr, half %val syncscope("agent") seq_cst
ret half %result
@@ -6518,10 +6518,11 @@ define half @flat_agent_atomic_fsub_ret_f16__offset12b_pos(ptr %ptr, half %val)
; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX7-NEXT: v_and_b32_e32 v0, -4, v3
; GFX7-NEXT: flat_load_dword v5, v[0:1]
+; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2
+; GFX7-NEXT: s_mov_b64 s[4:5], 0
; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v2
; GFX7-NEXT: v_and_b32_e32 v2, 3, v3
; GFX7-NEXT: v_lshlrev_b32_e32 v2, 3, v2
-; GFX7-NEXT: s_mov_b64 s[4:5], 0
; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v4
; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v2
; GFX7-NEXT: v_not_b32_e32 v4, v4
@@ -6546,7 +6547,6 @@ define half @flat_agent_atomic_fsub_ret_f16__offset12b_pos(ptr %ptr, half %val)
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX7-NEXT: v_lshrrev_b32_e32 v0, v2, v5
-; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr inbounds half, ptr %ptr, i64 1023
%result = atomicrmw fsub ptr %gep, half %val syncscope("agent") seq_cst
@@ -6901,10 +6901,11 @@ define half @flat_agent_atomic_fsub_ret_f16__offset12b_neg(ptr %ptr, half %val)
; GFX7-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc
; GFX7-NEXT: v_and_b32_e32 v0, -4, v3
; GFX7-NEXT: flat_load_dword v5, v[0:1]
+; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2
+; GFX7-NEXT: s_mov_b64 s[4:5], 0
; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v2
; GFX7-NEXT: v_and_b32_e32 v2, 3, v3
; GFX7-NEXT: v_lshlrev_b32_e32 v2, 3, v2
-; GFX7-NEXT: s_mov_b64 s[4:5], 0
; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v4
; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v2
; GFX7-NEXT: v_not_b32_e32 v4, v4
@@ -6929,7 +6930,6 @@ define half @flat_agent_atomic_fsub_ret_f16__offset12b_neg(ptr %ptr, half %val)
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX7-NEXT: v_lshrrev_b32_e32 v0, v2, v5
-; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr inbounds half, ptr %ptr, i64 -1024
%result = atomicrmw fsub ptr %gep, half %val syncscope("agent") seq_cst
@@ -7261,13 +7261,14 @@ define void @flat_agent_atomic_fsub_noret_f16(ptr %ptr, half %val) #0 {
; GFX7-NEXT: v_mov_b32_e32 v3, v0
; GFX7-NEXT: v_and_b32_e32 v0, -4, v3
; GFX7-NEXT: flat_load_dword v4, v[0:1]
+; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2
+; GFX7-NEXT: s_mov_b64 s[4:5], 0
; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v2
; GFX7-NEXT: v_and_b32_e32 v2, 3, v3
; GFX7-NEXT: v_lshlrev_b32_e32 v2, 3, v2
; GFX7-NEXT: v_lshl_b32_e32 v3, 0xffff, v2
; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5
; GFX7-NEXT: v_not_b32_e32 v6, v3
-; GFX7-NEXT: s_mov_b64 s[4:5], 0
; GFX7-NEXT: .LBB25_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -7629,10 +7630,11 @@ define void @flat_agent_atomic_fsub_noret_f16__offset12b_pos(ptr %ptr, half %val
; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX7-NEXT: v_and_b32_e32 v0, -4, v4
; GFX7-NEXT: flat_load_dword v3, v[0:1]
-; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2
; GFX7-NEXT: v_and_b32_e32 v4, 3, v4
; GFX7-NEXT: v_lshlrev_b32_e32 v4, 3, v4
; GFX7-NEXT: s_mov_b64 s[4:5], 0
+; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2
; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v2
; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v4
; GFX7-NEXT: v_not_b32_e32 v6, v2
@@ -7999,10 +8001,11 @@ define void @flat_agent_atomic_fsub_noret_f16__offset12b_neg(ptr %ptr, half %val
; GFX7-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc
; GFX7-NEXT: v_and_b32_e32 v0, -4, v4
; GFX7-NEXT: flat_load_dword v3, v[0:1]
-; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2
; GFX7-NEXT: v_and_b32_e32 v4, 3, v4
; GFX7-NEXT: v_lshlrev_b32_e32 v4, 3, v4
; GFX7-NEXT: s_mov_b64 s[4:5], 0
+; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2
; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v2
; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v4
; GFX7-NEXT: v_not_b32_e32 v6, v2
@@ -8283,31 +8286,31 @@ define half @flat_agent_atomic_fsub_ret_f16__offset12b_pos__align4(ptr %ptr, hal
; GFX7-LABEL: flat_agent_atomic_fsub_ret_f16__offset12b_pos__align4:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0x7fe, v0
-; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX7-NEXT: flat_load_dword v3, v[0:1]
-; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GFX7-NEXT: v_add_i32_e32 v3, vcc, 0x7fe, v0
+; GFX7-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc
+; GFX7-NEXT: flat_load_dword v0, v[3:4]
+; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v2
; GFX7-NEXT: s_mov_b64 s[4:5], 0
-; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2
+; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1
; GFX7-NEXT: .LBB28_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_mov_b32_e32 v4, v3
-; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v4
-; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v4
-; GFX7-NEXT: v_sub_f32_e32 v3, v3, v2
-; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3
-; GFX7-NEXT: v_or_b32_e32 v3, v5, v3
-; GFX7-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
+; GFX7-NEXT: v_mov_b32_e32 v6, v0
+; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v6
+; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v6
+; GFX7-NEXT: v_sub_f32_e32 v0, v0, v1
+; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX7-NEXT: v_or_b32_e32 v5, v2, v0
+; GFX7-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6
; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_cbranch_execnz .LBB28_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v3
; GFX7-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr inbounds half, ptr %ptr, i64 1023
%result = atomicrmw fsub ptr %gep, half %val syncscope("agent") seq_cst, align 4
@@ -8559,8 +8562,9 @@ define void @flat_agent_atomic_fsub_noret_f16__offset12b__align4_pos(ptr %ptr, h
; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0x7fe, v0
; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX7-NEXT: flat_load_dword v3, v[0:1]
-; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2
; GFX7-NEXT: s_mov_b64 s[4:5], 0
+; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2
; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v2
; GFX7-NEXT: .LBB29_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -8937,10 +8941,11 @@ define half @flat_system_atomic_fsub_ret_f16__offset12b_pos(ptr %ptr, half %val)
; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX7-NEXT: v_and_b32_e32 v0, -4, v3
; GFX7-NEXT: flat_load_dword v5, v[0:1]
+; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2
+; GFX7-NEXT: s_mov_b64 s[4:5], 0
; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v2
; GFX7-NEXT: v_and_b32_e32 v2, 3, v3
; GFX7-NEXT: v_lshlrev_b32_e32 v2, 3, v2
-; GFX7-NEXT: s_mov_b64 s[4:5], 0
; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v4
; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v2
; GFX7-NEXT: v_not_b32_e32 v4, v4
@@ -8965,7 +8970,6 @@ define half @flat_system_atomic_fsub_ret_f16__offset12b_pos(ptr %ptr, half %val)
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX7-NEXT: v_lshrrev_b32_e32 v0, v2, v5
-; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr inbounds half, ptr %ptr, i64 1023
%result = atomicrmw fsub ptr %gep, half %val seq_cst
@@ -9312,10 +9316,11 @@ define void @flat_system_atomic_fsub_noret_f16__offset12b_pos(ptr %ptr, half %va
; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX7-NEXT: v_and_b32_e32 v0, -4, v4
; GFX7-NEXT: flat_load_dword v3, v[0:1]
-; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2
; GFX7-NEXT: v_and_b32_e32 v4, 3, v4
; GFX7-NEXT: v_lshlrev_b32_e32 v4, 3, v4
; GFX7-NEXT: s_mov_b64 s[4:5], 0
+; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2
; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v2
; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v4
; GFX7-NEXT: v_not_b32_e32 v6, v2
@@ -9765,23 +9770,24 @@ define bfloat @flat_agent_atomic_fsub_ret_bf16(ptr %ptr, bfloat %val) #0 {
; GFX7-NEXT: v_mov_b32_e32 v3, v0
; GFX7-NEXT: v_and_b32_e32 v0, -4, v3
; GFX7-NEXT: flat_load_dword v5, v[0:1]
-; GFX7-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX7-NEXT: v_lshlrev_b32_e32 v3, 3, v3
-; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v3
-; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GFX7-NEXT: v_not_b32_e32 v4, v4
+; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX7-NEXT: v_and_b32_e32 v2, 3, v3
+; GFX7-NEXT: v_lshlrev_b32_e32 v2, 3, v2
+; GFX7-NEXT: v_lshl_b32_e32 v3, 0xffff, v2
+; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
+; GFX7-NEXT: v_not_b32_e32 v3, v3
; GFX7-NEXT: s_mov_b64 s[4:5], 0
-; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
; GFX7-NEXT: .LBB32_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v6, v5
-; GFX7-NEXT: v_lshrrev_b32_e32 v5, v3, v6
+; GFX7-NEXT: v_lshrrev_b32_e32 v5, v2, v6
; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5
-; GFX7-NEXT: v_sub_f32_e32 v5, v5, v2
+; GFX7-NEXT: v_sub_f32_e32 v5, v5, v4
; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; GFX7-NEXT: v_and_b32_e32 v7, v6, v4
-; GFX7-NEXT: v_lshlrev_b32_e32 v5, v3, v5
+; GFX7-NEXT: v_and_b32_e32 v7, v6, v3
+; GFX7-NEXT: v_lshlrev_b32_e32 v5, v2, v5
; GFX7-NEXT: v_or_b32_e32 v5, v7, v5
; GFX7-NEXT: flat_atomic_cmpswap v5, v[0:1], v[5:6] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -9792,8 +9798,7 @@ define bfloat @flat_agent_atomic_fsub_ret_bf16(ptr %ptr, bfloat %val) #0 {
; GFX7-NEXT: s_cbranch_execnz .LBB32_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX7-NEXT: v_lshrrev_b32_e32 v0, v3, v5
-; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, v2, v5
; GFX7-NEXT: s_setpc_b64 s[30:31]
%result = atomicrmw fsub ptr %ptr, bfloat %val syncscope("agent") seq_cst
ret bfloat %result
@@ -10229,23 +10234,24 @@ define bfloat @flat_agent_atomic_fsub_ret_bf16__offset12b_pos(ptr %ptr, bfloat %
; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX7-NEXT: v_and_b32_e32 v0, -4, v3
; GFX7-NEXT: flat_load_dword v5, v[0:1]
-; GFX7-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX7-NEXT: v_lshlrev_b32_e32 v3, 3, v3
-; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v3
-; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GFX7-NEXT: v_not_b32_e32 v4, v4
+; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX7-NEXT: v_and_b32_e32 v2, 3, v3
+; GFX7-NEXT: v_lshlrev_b32_e32 v2, 3, v2
+; GFX7-NEXT: v_lshl_b32_e32 v3, 0xffff, v2
+; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
+; GFX7-NEXT: v_not_b32_e32 v3, v3
; GFX7-NEXT: s_mov_b64 s[4:5], 0
-; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
; GFX7-NEXT: .LBB33_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v6, v5
-; GFX7-NEXT: v_lshrrev_b32_e32 v5, v3, v6
+; GFX7-NEXT: v_lshrrev_b32_e32 v5, v2, v6
; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5
-; GFX7-NEXT: v_sub_f32_e32 v5, v5, v2
+; GFX7-NEXT: v_sub_f32_e32 v5, v5, v4
; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; GFX7-NEXT: v_and_b32_e32 v7, v6, v4
-; GFX7-NEXT: v_lshlrev_b32_e32 v5, v3, v5
+; GFX7-NEXT: v_and_b32_e32 v7, v6, v3
+; GFX7-NEXT: v_lshlrev_b32_e32 v5, v2, v5
; GFX7-NEXT: v_or_b32_e32 v5, v7, v5
; GFX7-NEXT: flat_atomic_cmpswap v5, v[0:1], v[5:6] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -10256,8 +10262,7 @@ define bfloat @flat_agent_atomic_fsub_ret_bf16__offset12b_pos(ptr %ptr, bfloat %
; GFX7-NEXT: s_cbranch_execnz .LBB33_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX7-NEXT: v_lshrrev_b32_e32 v0, v3, v5
-; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, v2, v5
; GFX7-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr inbounds bfloat, ptr %ptr, i64 1023
%result = atomicrmw fsub ptr %gep, bfloat %val syncscope("agent") seq_cst
@@ -10695,23 +10700,24 @@ define bfloat @flat_agent_atomic_fsub_ret_bf16__offset12b_neg(ptr %ptr, bfloat %
; GFX7-NEXT: v_addc_u32_e32 v1, vcc, -1, v1, vcc
; GFX7-NEXT: v_and_b32_e32 v0, -4, v3
; GFX7-NEXT: flat_load_dword v5, v[0:1]
-; GFX7-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX7-NEXT: v_lshlrev_b32_e32 v3, 3, v3
-; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v3
-; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GFX7-NEXT: v_not_b32_e32 v4, v4
+; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX7-NEXT: v_and_b32_e32 v2, 3, v3
+; GFX7-NEXT: v_lshlrev_b32_e32 v2, 3, v2
+; GFX7-NEXT: v_lshl_b32_e32 v3, 0xffff, v2
+; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
+; GFX7-NEXT: v_not_b32_e32 v3, v3
; GFX7-NEXT: s_mov_b64 s[4:5], 0
-; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
; GFX7-NEXT: .LBB34_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v6, v5
-; GFX7-NEXT: v_lshrrev_b32_e32 v5, v3, v6
+; GFX7-NEXT: v_lshrrev_b32_e32 v5, v2, v6
; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5
-; GFX7-NEXT: v_sub_f32_e32 v5, v5, v2
+; GFX7-NEXT: v_sub_f32_e32 v5, v5, v4
; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; GFX7-NEXT: v_and_b32_e32 v7, v6, v4
-; GFX7-NEXT: v_lshlrev_b32_e32 v5, v3, v5
+; GFX7-NEXT: v_and_b32_e32 v7, v6, v3
+; GFX7-NEXT: v_lshlrev_b32_e32 v5, v2, v5
; GFX7-NEXT: v_or_b32_e32 v5, v7, v5
; GFX7-NEXT: flat_atomic_cmpswap v5, v[0:1], v[5:6] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -10722,8 +10728,7 @@ define bfloat @flat_agent_atomic_fsub_ret_bf16__offset12b_neg(ptr %ptr, bfloat %
; GFX7-NEXT: s_cbranch_execnz .LBB34_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX7-NEXT: v_lshrrev_b32_e32 v0, v3, v5
-; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, v2, v5
; GFX7-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr inbounds bfloat, ptr %ptr, i64 -1024
%result = atomicrmw fsub ptr %gep, bfloat %val syncscope("agent") seq_cst
@@ -11133,22 +11138,23 @@ define void @flat_agent_atomic_fsub_noret_bf16(ptr %ptr, bfloat %val) #0 {
; GFX7-NEXT: v_mov_b32_e32 v3, v0
; GFX7-NEXT: v_and_b32_e32 v0, -4, v3
; GFX7-NEXT: flat_load_dword v4, v[0:1]
-; GFX7-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX7-NEXT: v_lshlrev_b32_e32 v5, 3, v3
-; GFX7-NEXT: v_lshl_b32_e32 v3, 0xffff, v5
-; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GFX7-NEXT: v_not_b32_e32 v6, v3
+; GFX7-NEXT: v_lshlrev_b32_e32 v6, 16, v2
+; GFX7-NEXT: v_and_b32_e32 v2, 3, v3
+; GFX7-NEXT: v_lshlrev_b32_e32 v2, 3, v2
+; GFX7-NEXT: v_lshl_b32_e32 v3, 0xffff, v2
+; GFX7-NEXT: v_not_b32_e32 v5, v3
+; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v6
; GFX7-NEXT: s_mov_b64 s[4:5], 0
-; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
; GFX7-NEXT: .LBB35_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_lshrrev_b32_e32 v3, v5, v4
+; GFX7-NEXT: v_lshrrev_b32_e32 v3, v2, v4
; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX7-NEXT: v_sub_f32_e32 v3, v3, v2
+; GFX7-NEXT: v_sub_f32_e32 v3, v3, v6
; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; GFX7-NEXT: v_and_b32_e32 v7, v4, v6
-; GFX7-NEXT: v_lshlrev_b32_e32 v3, v5, v3
+; GFX7-NEXT: v_and_b32_e32 v7, v4, v5
+; GFX7-NEXT: v_lshlrev_b32_e32 v3, v2, v3
; GFX7-NEXT: v_or_b32_e32 v3, v7, v3
; GFX7-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -11583,6 +11589,7 @@ define void @flat_agent_atomic_fsub_noret_bf16__offset12b_pos(ptr %ptr, bfloat %
; GFX7-NEXT: v_and_b32_e32 v0, -4, v4
; GFX7-NEXT: flat_load_dword v3, v[0:1]
; GFX7-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX7-NEXT: v_lshlrev_b32_e32 v4, 3, v4
; GFX7-NEXT: v_lshl_b32_e32 v5, 0xffff, v4
; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
@@ -12034,6 +12041,7 @@ define void @flat_agent_atomic_fsub_noret_bf16__offset12b_neg(ptr %ptr, bfloat %
; GFX7-NEXT: v_and_b32_e32 v0, -4, v4
; GFX7-NEXT: flat_load_dword v3, v[0:1]
; GFX7-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX7-NEXT: v_lshlrev_b32_e32 v4, 3, v4
; GFX7-NEXT: v_lshl_b32_e32 v5, 0xffff, v4
; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
@@ -12406,31 +12414,31 @@ define bfloat @flat_agent_atomic_fsub_ret_bf16__offset12b_pos__align4(ptr %ptr,
; GFX7-LABEL: flat_agent_atomic_fsub_ret_bf16__offset12b_pos__align4:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0x7fe, v0
-; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX7-NEXT: flat_load_dword v3, v[0:1]
-; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
+; GFX7-NEXT: v_add_i32_e32 v3, vcc, 0x7fe, v0
+; GFX7-NEXT: v_addc_u32_e32 v4, vcc, 0, v1, vcc
+; GFX7-NEXT: flat_load_dword v0, v[3:4]
+; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v2
+; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GFX7-NEXT: s_mov_b64 s[4:5], 0
-; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX7-NEXT: .LBB38_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_mov_b32_e32 v4, v3
-; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v4
-; GFX7-NEXT: v_sub_f32_e32 v3, v3, v2
-; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v4
-; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; GFX7-NEXT: v_or_b32_e32 v3, v5, v3
-; GFX7-NEXT: flat_atomic_cmpswap v3, v[0:1], v[3:4] glc
+; GFX7-NEXT: v_mov_b32_e32 v6, v0
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v6
+; GFX7-NEXT: v_sub_f32_e32 v0, v0, v1
+; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v6
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX7-NEXT: v_or_b32_e32 v5, v2, v0
+; GFX7-NEXT: flat_atomic_cmpswap v0, v[3:4], v[5:6] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v4
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v0, v6
; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX7-NEXT: s_cbranch_execnz .LBB38_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v3
; GFX7-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr inbounds bfloat, ptr %ptr, i64 1023
%result = atomicrmw fsub ptr %gep, bfloat %val syncscope("agent") seq_cst, align 4
@@ -12769,6 +12777,7 @@ define void @flat_agent_atomic_fsub_noret_bf16__offset12b__align4_pos(ptr %ptr,
; GFX7-NEXT: v_add_i32_e32 v0, vcc, 0x7fe, v0
; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX7-NEXT: flat_load_dword v3, v[0:1]
+; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GFX7-NEXT: s_mov_b64 s[4:5], 0
; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v2
@@ -13230,23 +13239,24 @@ define bfloat @flat_system_atomic_fsub_ret_bf16__offset12b_pos(ptr %ptr, bfloat
; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX7-NEXT: v_and_b32_e32 v0, -4, v3
; GFX7-NEXT: flat_load_dword v5, v[0:1]
-; GFX7-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX7-NEXT: v_lshlrev_b32_e32 v3, 3, v3
-; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v3
-; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GFX7-NEXT: v_not_b32_e32 v4, v4
+; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v2
+; GFX7-NEXT: v_and_b32_e32 v2, 3, v3
+; GFX7-NEXT: v_lshlrev_b32_e32 v2, 3, v2
+; GFX7-NEXT: v_lshl_b32_e32 v3, 0xffff, v2
+; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
+; GFX7-NEXT: v_not_b32_e32 v3, v3
; GFX7-NEXT: s_mov_b64 s[4:5], 0
-; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
; GFX7-NEXT: .LBB40_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_mov_b32_e32 v6, v5
-; GFX7-NEXT: v_lshrrev_b32_e32 v5, v3, v6
+; GFX7-NEXT: v_lshrrev_b32_e32 v5, v2, v6
; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5
-; GFX7-NEXT: v_sub_f32_e32 v5, v5, v2
+; GFX7-NEXT: v_sub_f32_e32 v5, v5, v4
; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v5
-; GFX7-NEXT: v_and_b32_e32 v7, v6, v4
-; GFX7-NEXT: v_lshlrev_b32_e32 v5, v3, v5
+; GFX7-NEXT: v_and_b32_e32 v7, v6, v3
+; GFX7-NEXT: v_lshlrev_b32_e32 v5, v2, v5
; GFX7-NEXT: v_or_b32_e32 v5, v7, v5
; GFX7-NEXT: flat_atomic_cmpswap v5, v[0:1], v[5:6] glc
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -13257,8 +13267,7 @@ define bfloat @flat_system_atomic_fsub_ret_bf16__offset12b_pos(ptr %ptr, bfloat
; GFX7-NEXT: s_cbranch_execnz .LBB40_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX7-NEXT: v_lshrrev_b32_e32 v0, v3, v5
-; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, v2, v5
; GFX7-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr inbounds bfloat, ptr %ptr, i64 1023
%result = atomicrmw fsub ptr %gep, bfloat %val seq_cst
@@ -13687,6 +13696,7 @@ define void @flat_system_atomic_fsub_noret_bf16__offset12b_pos(ptr %ptr, bfloat
; GFX7-NEXT: v_and_b32_e32 v0, -4, v4
; GFX7-NEXT: flat_load_dword v3, v[0:1]
; GFX7-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX7-NEXT: v_lshlrev_b32_e32 v4, 3, v4
; GFX7-NEXT: v_lshl_b32_e32 v5, 0xffff, v4
; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
diff --git a/llvm/test/CodeGen/AMDGPU/fmax3-maximumnum.ll b/llvm/test/CodeGen/AMDGPU/fmax3-maximumnum.ll
index e9a6854226e60..7afdf102f5295 100644
--- a/llvm/test/CodeGen/AMDGPU/fmax3-maximumnum.ll
+++ b/llvm/test/CodeGen/AMDGPU/fmax3-maximumnum.ll
@@ -1039,25 +1039,21 @@ define half @v_max3_f16_maximumnum_maximumnum__v_v_v_0(half %a, half %b, half %c
; GFX6-LABEL: v_max3_f16_maximumnum_maximumnum__v_v_v_0:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2
-; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1
-; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2
; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1
; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX6-NEXT: v_max3_f32 v0, v0, v1, v2
+; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: v_max3_f16_maximumnum_maximumnum__v_v_v_0:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2
-; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1
-; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2
; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1
; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX7-NEXT: v_max3_f32 v0, v0, v1, v2
+; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_max3_f16_maximumnum_maximumnum__v_v_v_0:
@@ -1678,31 +1674,31 @@ define bfloat @v_max3_bf16_maximumnum_maximumnum__v_v_v_0(bfloat %a, bfloat %b,
; GFX6-LABEL: v_max3_bf16_maximumnum_maximumnum__v_v_v_0:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GFX6-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2
+; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GFX6-NEXT: v_max_f32_e32 v0, v0, v1
+; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v2
; GFX6-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v2
+; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GFX6-NEXT: v_max_f32_e32 v0, v0, v1
-; GFX6-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX6-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: v_max3_bf16_maximumnum_maximumnum__v_v_v_0:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
+; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GFX7-NEXT: v_max_f32_e32 v0, v0, v1
+; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v2
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v2
+; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GFX7-NEXT: v_max_f32_e32 v0, v0, v1
-; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_max3_bf16_maximumnum_maximumnum__v_v_v_0:
diff --git a/llvm/test/CodeGen/AMDGPU/fmax_legacy.f16.ll b/llvm/test/CodeGen/AMDGPU/fmax_legacy.f16.ll
index dd2e9896cf882..b187f39c786aa 100644
--- a/llvm/test/CodeGen/AMDGPU/fmax_legacy.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fmax_legacy.f16.ll
@@ -26,11 +26,10 @@ define half @test_fmax_legacy_ugt_f16(half %a, half %b) #0 {
; SI-LABEL: test_fmax_legacy_ugt_f16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
-; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
; SI-NEXT: v_max_legacy_f32_e32 v0, v1, v0
+; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
; SI-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-TRUE16-LABEL: test_fmax_legacy_ugt_f16:
@@ -67,11 +66,10 @@ define half @test_fmax_legacy_ugt_f16_fast(half %a, half %b) #0 {
; SI-LABEL: test_fmax_legacy_ugt_f16_fast:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
-; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
; SI-NEXT: v_max_f32_e32 v0, v0, v1
+; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
; SI-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-TRUE16-LABEL: test_fmax_legacy_ugt_f16_fast:
diff --git a/llvm/test/CodeGen/AMDGPU/fmed3-cast-combine.ll b/llvm/test/CodeGen/AMDGPU/fmed3-cast-combine.ll
index 99b163dc9753b..c0ff9b5a041ef 100644
--- a/llvm/test/CodeGen/AMDGPU/fmed3-cast-combine.ll
+++ b/llvm/test/CodeGen/AMDGPU/fmed3-cast-combine.ll
@@ -20,9 +20,11 @@ define half @fmed3_f32_fpext_f16(half %arg0, half %arg1, half %arg2) #1 {
; GFX7-SDAG-LABEL: fmed3_f32_fpext_f16:
; GFX7-SDAG: ; %bb.0:
; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v2, v2
+; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1
+; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX7-SDAG-NEXT: v_med3_f32 v0, v0, v1, v2
; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-GISEL-LABEL: fmed3_f32_fpext_f16:
@@ -74,9 +76,11 @@ define half @fmed3_f32_fpext_f16_flags(half %arg0, half %arg1, half %arg2) #1 {
; GFX7-SDAG-LABEL: fmed3_f32_fpext_f16_flags:
; GFX7-SDAG: ; %bb.0:
; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v2, v2
+; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1
+; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX7-SDAG-NEXT: v_med3_f32 v0, v0, v1, v2
; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-GISEL-LABEL: fmed3_f32_fpext_f16_flags:
@@ -128,10 +132,12 @@ define half @fmed3_f32_fpext_f16_multi_use(half %arg0, half %arg1, half %arg2, p
; GFX7-SDAG-LABEL: fmed3_f32_fpext_f16_multi_use:
; GFX7-SDAG: ; %bb.0:
; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v2, v2
+; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1
+; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX7-SDAG-NEXT: v_med3_f32 v1, v0, v1, v2
; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v1
; GFX7-SDAG-NEXT: flat_store_dword v[3:4], v1
-; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31]
;
@@ -183,9 +189,10 @@ define half @fmed3_f32_fpext_f16_k0(half %arg1, half %arg2) #1 {
; GFX7-SDAG-LABEL: fmed3_f32_fpext_f16_k0:
; GFX7-SDAG: ; %bb.0:
; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1
+; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX7-SDAG-NEXT: v_med3_f32 v0, 2.0, v0, v1
; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-GISEL-LABEL: fmed3_f32_fpext_f16_k0:
@@ -234,9 +241,10 @@ define half @fmed3_f32_fpext_f16_k1(half %arg0, half %arg2) #1 {
; GFX7-SDAG-LABEL: fmed3_f32_fpext_f16_k1:
; GFX7-SDAG: ; %bb.0:
; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1
+; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX7-SDAG-NEXT: v_med3_f32 v0, v0, 2.0, v1
; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-GISEL-LABEL: fmed3_f32_fpext_f16_k1:
@@ -285,9 +293,10 @@ define half @fmed3_f32_fpext_f16_k2(half %arg0, half %arg1) #1 {
; GFX7-SDAG-LABEL: fmed3_f32_fpext_f16_k2:
; GFX7-SDAG: ; %bb.0:
; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1
+; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX7-SDAG-NEXT: v_med3_f32 v0, v0, v1, 2.0
; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-GISEL-LABEL: fmed3_f32_fpext_f16_k2:
@@ -333,23 +342,14 @@ define half @fmed3_f32_fpext_f16_k2(half %arg0, half %arg1) #1 {
}
define half @fmed3_f32_fpext_f16_k0_k1(half %arg2) #1 {
-; GFX7-SDAG-LABEL: fmed3_f32_fpext_f16_k0_k1:
-; GFX7-SDAG: ; %bb.0:
-; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-SDAG-NEXT: v_mov_b32_e32 v1, 0x41800000
-; GFX7-SDAG-NEXT: v_med3_f32 v0, 0, v1, v0
-; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX7-GISEL-LABEL: fmed3_f32_fpext_f16_k0_k1:
-; GFX7-GISEL: ; %bb.0:
-; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX7-GISEL-NEXT: v_mov_b32_e32 v1, 0x41800000
-; GFX7-GISEL-NEXT: v_med3_f32 v0, 0, v1, v0
-; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX7-GISEL-NEXT: s_setpc_b64 s[30:31]
+; GFX7-LABEL: fmed3_f32_fpext_f16_k0_k1:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX7-NEXT: v_mov_b32_e32 v1, 0x41800000
+; GFX7-NEXT: v_med3_f32 v0, 0, v1, v0
+; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-SDAG-LABEL: fmed3_f32_fpext_f16_k0_k1:
; GFX8-SDAG: ; %bb.0:
@@ -386,21 +386,13 @@ define half @fmed3_f32_fpext_f16_k0_k1(half %arg2) #1 {
}
define half @fmed3_f32_fpext_f16_k0_k2(half %arg1) #1 {
-; GFX7-SDAG-LABEL: fmed3_f32_fpext_f16_k0_k2:
-; GFX7-SDAG: ; %bb.0:
-; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-SDAG-NEXT: v_med3_f32 v0, 0, v0, 2.0
-; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX7-GISEL-LABEL: fmed3_f32_fpext_f16_k0_k2:
-; GFX7-GISEL: ; %bb.0:
-; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX7-GISEL-NEXT: v_med3_f32 v0, 0, v0, 2.0
-; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX7-GISEL-NEXT: s_setpc_b64 s[30:31]
+; GFX7-LABEL: fmed3_f32_fpext_f16_k0_k2:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX7-NEXT: v_med3_f32 v0, 0, v0, 2.0
+; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-SDAG-LABEL: fmed3_f32_fpext_f16_k0_k2:
; GFX8-SDAG: ; %bb.0:
@@ -437,15 +429,11 @@ define half @fmed3_f32_fpext_f16_fabs(half %arg0, half %arg1, half %arg2) #1 {
; GFX7-SDAG-LABEL: fmed3_f32_fpext_f16_fabs:
; GFX7-SDAG: ; %bb.0:
; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1
-; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v2, v2
-; GFX7-SDAG-NEXT: v_cvt_f32_f16_e64 v0, |v0|
-; GFX7-SDAG-NEXT: v_cvt_f32_f16_e64 v1, |v1|
; GFX7-SDAG-NEXT: v_cvt_f32_f16_e64 v2, |v2|
+; GFX7-SDAG-NEXT: v_cvt_f32_f16_e64 v1, |v1|
+; GFX7-SDAG-NEXT: v_cvt_f32_f16_e64 v0, |v0|
; GFX7-SDAG-NEXT: v_med3_f32 v0, v0, v1, v2
; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-GISEL-LABEL: fmed3_f32_fpext_f16_fabs:
@@ -500,9 +488,11 @@ define half @fmed3_fabs_f32_fpext_f16(half %arg0, half %arg1, half %arg2) #1 {
; GFX7-SDAG-LABEL: fmed3_fabs_f32_fpext_f16:
; GFX7-SDAG: ; %bb.0:
; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-SDAG-NEXT: v_med3_f32 v0, |v0|, |v1|, |v2|
+; GFX7-SDAG-NEXT: v_cvt_f32_f16_e64 v0, |v0|
+; GFX7-SDAG-NEXT: v_cvt_f32_f16_e64 v1, |v1|
+; GFX7-SDAG-NEXT: v_cvt_f32_f16_e64 v2, |v2|
+; GFX7-SDAG-NEXT: v_med3_f32 v0, v0, v1, v2
; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-GISEL-LABEL: fmed3_fabs_f32_fpext_f16:
@@ -549,9 +539,11 @@ define half @fmed3_f32_fpext_f16_fneg(half %arg0, half %arg1, half %arg2) #1 {
; GFX7-SDAG-LABEL: fmed3_f32_fpext_f16_fneg:
; GFX7-SDAG: ; %bb.0:
; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v2, v2
+; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1
+; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX7-SDAG-NEXT: v_med3_f32 v0, -v0, -v1, -v2
; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-GISEL-LABEL: fmed3_f32_fpext_f16_fneg:
@@ -606,9 +598,11 @@ define half @fmed3_fneg_f32_fpext_f16(half %arg0, half %arg1, half %arg2) #1 {
; GFX7-SDAG-LABEL: fmed3_fneg_f32_fpext_f16:
; GFX7-SDAG: ; %bb.0:
; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v2, v2
+; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1
+; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX7-SDAG-NEXT: v_med3_f32 v0, -v0, -v1, -v2
; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-GISEL-LABEL: fmed3_fneg_f32_fpext_f16:
@@ -655,15 +649,11 @@ define half @fmed3_f32_fpext_f16_fneg_fabs(half %arg0, half %arg1, half %arg2) #
; GFX7-SDAG-LABEL: fmed3_f32_fpext_f16_fneg_fabs:
; GFX7-SDAG: ; %bb.0:
; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1
-; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v2, v2
-; GFX7-SDAG-NEXT: v_cvt_f32_f16_e64 v0, |v0|
-; GFX7-SDAG-NEXT: v_cvt_f32_f16_e64 v1, |v1|
; GFX7-SDAG-NEXT: v_cvt_f32_f16_e64 v2, |v2|
+; GFX7-SDAG-NEXT: v_cvt_f32_f16_e64 v1, |v1|
+; GFX7-SDAG-NEXT: v_cvt_f32_f16_e64 v0, |v0|
; GFX7-SDAG-NEXT: v_med3_f32 v0, -v0, -v1, -v2
; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-GISEL-LABEL: fmed3_f32_fpext_f16_fneg_fabs:
@@ -721,9 +711,11 @@ define half @fmed3_fneg_fabs_f32_fpext_f16(half %arg0, half %arg1, half %arg2) #
; GFX7-SDAG-LABEL: fmed3_fneg_fabs_f32_fpext_f16:
; GFX7-SDAG: ; %bb.0:
; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-SDAG-NEXT: v_med3_f32 v0, -|v0|, -|v1|, -|v2|
+; GFX7-SDAG-NEXT: v_cvt_f32_f16_e64 v0, |v0|
+; GFX7-SDAG-NEXT: v_cvt_f32_f16_e64 v1, |v1|
+; GFX7-SDAG-NEXT: v_cvt_f32_f16_e64 v2, |v2|
+; GFX7-SDAG-NEXT: v_med3_f32 v0, -v0, -v1, -v2
; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-GISEL-LABEL: fmed3_fneg_fabs_f32_fpext_f16:
@@ -776,8 +768,11 @@ define bfloat @fmed3_f32_fpext_f16_fptrunc_bf16(half %arg0, half %arg1, half %ar
; GFX7-LABEL: fmed3_f32_fpext_f16_fptrunc_bf16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2
+; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1
+; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX7-NEXT: v_med3_f32 v0, v0, v1, v2
-; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: fmed3_f32_fpext_f16_fptrunc_bf16:
@@ -823,11 +818,12 @@ define half @fmed3_f32_fpext_f16_multi_use_0(half %arg0, half %arg1, half %arg2,
; GFX7-SDAG-LABEL: fmed3_f32_fpext_f16_multi_use_0:
; GFX7-SDAG: ; %bb.0:
; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-SDAG-NEXT: v_med3_f32 v1, v0, v1, v2
-; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1
-; GFX7-SDAG-NEXT: flat_store_dword v[3:4], v0
+; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v2, v2
; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1
-; GFX7-SDAG-NEXT: v_mov_b32_e32 v0, v1
+; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v5, v0
+; GFX7-SDAG-NEXT: v_med3_f32 v0, v5, v1, v2
+; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX7-SDAG-NEXT: flat_store_dword v[3:4], v5
; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31]
;
@@ -891,10 +887,12 @@ define half @fmed3_f32_fpext_f16_multi_use_1(half %arg0, half %arg1, half %arg2,
; GFX7-SDAG-LABEL: fmed3_f32_fpext_f16_multi_use_1:
; GFX7-SDAG: ; %bb.0:
; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v2, v2
+; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1
; GFX7-SDAG-NEXT: v_med3_f32 v0, v0, v1, v2
; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX7-SDAG-NEXT: flat_store_dword v[3:4], v1
-; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31]
;
@@ -958,10 +956,12 @@ define half @fmed3_f32_fpext_f16_multi_use_2(half %arg0, half %arg1, half %arg2,
; GFX7-SDAG-LABEL: fmed3_f32_fpext_f16_multi_use_2:
; GFX7-SDAG: ; %bb.0:
; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1
+; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v2, v2
; GFX7-SDAG-NEXT: v_med3_f32 v0, v0, v1, v2
; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX7-SDAG-NEXT: flat_store_dword v[3:4], v2
-; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0)
; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31]
;
@@ -1022,33 +1022,15 @@ define half @fmed3_f32_fpext_f16_multi_use_2(half %arg0, half %arg1, half %arg2,
}
define half @fmed3_f32_fpext_bf16(bfloat %arg0, bfloat %arg1, bfloat %arg2) #1 {
-; GFX7-LABEL: fmed3_f32_fpext_bf16:
-; GFX7: ; %bb.0:
-; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_med3_f32 v0, v0, v1, v2
-; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX7-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX8-LABEL: fmed3_f32_fpext_bf16:
-; GFX8: ; %bb.0:
-; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX8-NEXT: v_med3_f32 v0, v0, v1, v2
-; GFX8-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX8-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX9-LABEL: fmed3_f32_fpext_bf16:
-; GFX9: ; %bb.0:
-; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX9-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX9-NEXT: v_med3_f32 v0, v0, v1, v2
-; GFX9-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX9-NEXT: s_setpc_b64 s[30:31]
+; GCN-LABEL: fmed3_f32_fpext_bf16:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GCN-NEXT: v_med3_f32 v0, v0, v1, v2
+; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GCN-NEXT: s_setpc_b64 s[30:31]
%arg0.ext = fpext bfloat %arg0 to float
%arg1.ext = fpext bfloat %arg1 to float
%arg2.ext = fpext bfloat %arg2 to float
@@ -1061,9 +1043,11 @@ define half @fmed3_f32_fpext_f16_bf16_0(bfloat %arg0, half %arg1, half %arg2) #1
; GFX7-LABEL: fmed3_f32_fpext_f16_bf16_0:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2
+; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX7-NEXT: v_med3_f32 v0, v0, v1, v2
; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: fmed3_f32_fpext_f16_bf16_0:
@@ -1097,9 +1081,11 @@ define half @fmed3_f32_fpext_f16_bf16_1(half %arg0, bfloat %arg1, half %arg2) #1
; GFX7-LABEL: fmed3_f32_fpext_f16_bf16_1:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2
+; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX7-NEXT: v_med3_f32 v0, v0, v1, v2
; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: fmed3_f32_fpext_f16_bf16_1:
@@ -1133,9 +1119,11 @@ define half @fmed3_f32_fpext_f16_bf16_2(half %arg0, half %arg1, bfloat %arg2) #1
; GFX7-LABEL: fmed3_f32_fpext_f16_bf16_2:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1
+; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX7-NEXT: v_med3_f32 v0, v0, v1, v2
; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: fmed3_f32_fpext_f16_bf16_2:
@@ -1169,10 +1157,11 @@ define half @fmed3_f32_fpext_f16_unrepresentable_k0(half %arg1, half %arg2) #1 {
; GFX7-SDAG-LABEL: fmed3_f32_fpext_f16_unrepresentable_k0:
; GFX7-SDAG: ; %bb.0:
; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1
+; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX7-SDAG-NEXT: s_mov_b32 s4, 0x4f800000
; GFX7-SDAG-NEXT: v_med3_f32 v0, s4, v0, v1
; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-GISEL-LABEL: fmed3_f32_fpext_f16_unrepresentable_k0:
@@ -1235,10 +1224,11 @@ define half @fmed3_f32_fpext_f16_unrepresentable_k1(half %arg0, half %arg2) #1 {
; GFX7-SDAG-LABEL: fmed3_f32_fpext_f16_unrepresentable_k1:
; GFX7-SDAG: ; %bb.0:
; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1
+; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX7-SDAG-NEXT: s_mov_b32 s4, 0x4f800000
; GFX7-SDAG-NEXT: v_med3_f32 v0, v0, s4, v1
; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-GISEL-LABEL: fmed3_f32_fpext_f16_unrepresentable_k1:
@@ -1301,10 +1291,11 @@ define half @fmed3_f32_fpext_f16_unrepresentable_k2(half %arg0, half %arg1) #1 {
; GFX7-SDAG-LABEL: fmed3_f32_fpext_f16_unrepresentable_k2:
; GFX7-SDAG: ; %bb.0:
; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1
+; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX7-SDAG-NEXT: s_mov_b32 s4, 0x4f800000
; GFX7-SDAG-NEXT: v_med3_f32 v0, v0, v1, s4
; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-GISEL-LABEL: fmed3_f32_fpext_f16_unrepresentable_k2:
@@ -1365,5 +1356,3 @@ define half @fmed3_f32_fpext_f16_unrepresentable_k2(half %arg0, half %arg1) #1 {
attributes #0 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
attributes #1 = { nocallback nofree nosync nounwind speculatable willreturn }
-;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
-; GCN: {{.*}}
diff --git a/llvm/test/CodeGen/AMDGPU/fmed3.bf16.ll b/llvm/test/CodeGen/AMDGPU/fmed3.bf16.ll
index 65ced4f658692..a2de79079feed 100644
--- a/llvm/test/CodeGen/AMDGPU/fmed3.bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fmed3.bf16.ll
@@ -9,12 +9,12 @@ define bfloat @v_test_fmed3_r_i_i_bf16_minimumnum_maximumnum(bfloat %a) #1 {
; SI-LABEL: v_test_fmed3_r_i_i_bf16_minimumnum_maximumnum:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; SI-NEXT: v_max_f32_e32 v0, 2.0, v0
; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; SI-NEXT: v_min_f32_e32 v0, 4.0, v0
-; SI-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; SI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; SI-NEXT: s_setpc_b64 s[30:31]
;
; VI-SDAG-LABEL: v_test_fmed3_r_i_i_bf16_minimumnum_maximumnum:
diff --git a/llvm/test/CodeGen/AMDGPU/fmed3.ll b/llvm/test/CodeGen/AMDGPU/fmed3.ll
index d1b1a96fdeffc..b37ab370d0bbf 100644
--- a/llvm/test/CodeGen/AMDGPU/fmed3.ll
+++ b/llvm/test/CodeGen/AMDGPU/fmed3.ll
@@ -8697,9 +8697,9 @@ define half @v_test_fmed3_r_i_i_f16_minimumnum_maximumnum(half %a) {
; SI-SDAG-LABEL: v_test_fmed3_r_i_i_f16_minimumnum_maximumnum:
; SI-SDAG: ; %bb.0:
; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0
; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0
; SI-SDAG-NEXT: v_med3_f32 v0, v0, 2.0, 4.0
+; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0
; SI-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; SI-GISEL-LABEL: v_test_fmed3_r_i_i_f16_minimumnum_maximumnum:
@@ -9181,10 +9181,10 @@ define half @v_test_nnan_input_fmed3_r_i_i_f16_maximum_minimum(half %a) {
; SI-SDAG-LABEL: v_test_nnan_input_fmed3_r_i_i_f16_maximum_minimum:
; SI-SDAG: ; %bb.0:
; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0
; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0
; SI-SDAG-NEXT: v_add_f32_e32 v0, 1.0, v0
; SI-SDAG-NEXT: v_med3_f32 v0, v0, 2.0, 4.0
+; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0
; SI-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; SI-GISEL-LABEL: v_test_nnan_input_fmed3_r_i_i_f16_maximum_minimum:
diff --git a/llvm/test/CodeGen/AMDGPU/fmin3-minimumnum.ll b/llvm/test/CodeGen/AMDGPU/fmin3-minimumnum.ll
index 6c78f55d2da86..bb6b20df0c149 100644
--- a/llvm/test/CodeGen/AMDGPU/fmin3-minimumnum.ll
+++ b/llvm/test/CodeGen/AMDGPU/fmin3-minimumnum.ll
@@ -1039,25 +1039,21 @@ define half @v_min3_f16_minimumnum_minimumnum__v_v_v_0(half %a, half %b, half %c
; GFX6-LABEL: v_min3_f16_minimumnum_minimumnum__v_v_v_0:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2
-; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1
-; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2
; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1
; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX6-NEXT: v_min3_f32 v0, v0, v1, v2
+; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: v_min3_f16_minimumnum_minimumnum__v_v_v_0:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2
-; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1
-; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2
; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1
; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX7-NEXT: v_min3_f32 v0, v0, v1, v2
+; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_min3_f16_minimumnum_minimumnum__v_v_v_0:
@@ -1678,31 +1674,31 @@ define bfloat @v_min3_bf16_minimumnum_minimumnum__v_v_v_0(bfloat %a, bfloat %b,
; GFX6-LABEL: v_min3_bf16_minimumnum_minimumnum__v_v_v_0:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GFX6-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2
+; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GFX6-NEXT: v_min_f32_e32 v0, v0, v1
+; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v2
; GFX6-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX6-NEXT: v_and_b32_e32 v1, 0xffff0000, v2
+; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GFX6-NEXT: v_min_f32_e32 v0, v0, v1
-; GFX6-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX6-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: v_min3_bf16_minimumnum_minimumnum__v_v_v_0:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
+; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GFX7-NEXT: v_min_f32_e32 v0, v0, v1
+; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v2
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v2
+; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GFX7-NEXT: v_min_f32_e32 v0, v0, v1
-; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_min3_bf16_minimumnum_minimumnum__v_v_v_0:
diff --git a/llvm/test/CodeGen/AMDGPU/fmin_legacy.f16.ll b/llvm/test/CodeGen/AMDGPU/fmin_legacy.f16.ll
index 9e5a28d6c5041..dd77eb6f364a7 100644
--- a/llvm/test/CodeGen/AMDGPU/fmin_legacy.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fmin_legacy.f16.ll
@@ -27,11 +27,10 @@ define half @test_fmin_legacy_ule_f16(half %a, half %b) #0 {
; SI-LABEL: test_fmin_legacy_ule_f16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
-; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
; SI-NEXT: v_min_legacy_f32_e32 v0, v1, v0
+; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
; SI-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-TRUE16-LABEL: test_fmin_legacy_ule_f16:
@@ -68,11 +67,10 @@ define half @test_fmin_legacy_ule_f16_fast(half %a, half %b) #0 {
; SI-LABEL: test_fmin_legacy_ule_f16_fast:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
-; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
; SI-NEXT: v_min_f32_e32 v0, v0, v1
+; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
; SI-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-TRUE16-LABEL: test_fmin_legacy_ule_f16_fast:
diff --git a/llvm/test/CodeGen/AMDGPU/fneg-combines.f16.ll b/llvm/test/CodeGen/AMDGPU/fneg-combines.f16.ll
index b3f6de638a67d..2079ee54653ce 100644
--- a/llvm/test/CodeGen/AMDGPU/fneg-combines.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fneg-combines.f16.ll
@@ -19,22 +19,20 @@ define half @v_fneg_add_f16(half %a, half %b) #0 {
; SI-SAFE-LABEL: v_fneg_add_f16:
; SI-SAFE: ; %bb.0:
; SI-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v1, v1
-; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v0, v0
; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v1, v1
; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v0, v0
; SI-SAFE-NEXT: v_add_f32_e32 v0, v0, v1
-; SI-SAFE-NEXT: v_xor_b32_e32 v0, 0x80000000, v0
+; SI-SAFE-NEXT: v_cvt_f16_f32_e64 v0, -v0
; SI-SAFE-NEXT: s_setpc_b64 s[30:31]
;
; SI-NSZ-LABEL: v_fneg_add_f16:
; SI-NSZ: ; %bb.0:
; SI-NSZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-NSZ-NEXT: v_cvt_f16_f32_e32 v1, v1
-; SI-NSZ-NEXT: v_cvt_f16_f32_e64 v0, -v0
+; SI-NSZ-NEXT: v_and_b32_e32 v0, 0xffff, v0
; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v1, v1
-; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v0, v0
+; SI-NSZ-NEXT: v_cvt_f32_f16_e64 v0, -v0
; SI-NSZ-NEXT: v_sub_f32_e32 v0, v0, v1
+; SI-NSZ-NEXT: v_cvt_f16_f32_e32 v0, v0
; SI-NSZ-NEXT: s_setpc_b64 s[30:31]
;
; VI-SAFE-LABEL: v_fneg_add_f16:
@@ -84,12 +82,11 @@ define { half, half } @v_fneg_add_store_use_add_f16(half %a, half %b) #0 {
; SI-LABEL: v_fneg_add_store_use_add_f16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
-; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
; SI-NEXT: v_add_f32_e32 v1, v0, v1
-; SI-NEXT: v_xor_b32_e32 v0, 0x80000000, v1
+; SI-NEXT: v_cvt_f16_f32_e64 v0, -v1
+; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
; SI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: v_fneg_add_store_use_add_f16:
@@ -131,24 +128,24 @@ define { half, half } @v_fneg_add_multi_use_add_f16(half %a, half %b) #0 {
; SI-SAFE-LABEL: v_fneg_add_multi_use_add_f16:
; SI-SAFE: ; %bb.0:
; SI-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v1, v1
-; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v0, v0
; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v1, v1
; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v0, v0
-; SI-SAFE-NEXT: v_add_f32_e32 v1, v0, v1
-; SI-SAFE-NEXT: v_xor_b32_e32 v0, 0x80000000, v1
-; SI-SAFE-NEXT: v_mul_f32_e32 v1, 4.0, v1
+; SI-SAFE-NEXT: v_add_f32_e32 v0, v0, v1
+; SI-SAFE-NEXT: v_mul_f32_e32 v1, 4.0, v0
+; SI-SAFE-NEXT: v_cvt_f16_f32_e64 v0, -v0
+; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v1, v1
; SI-SAFE-NEXT: s_setpc_b64 s[30:31]
;
; SI-NSZ-LABEL: v_fneg_add_multi_use_add_f16:
; SI-NSZ: ; %bb.0:
; SI-NSZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-NSZ-NEXT: v_cvt_f16_f32_e32 v1, v1
-; SI-NSZ-NEXT: v_cvt_f16_f32_e32 v0, v0
+; SI-NSZ-NEXT: v_and_b32_e32 v0, 0xffff, v0
; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v1, v1
; SI-NSZ-NEXT: v_cvt_f32_f16_e64 v0, -v0
; SI-NSZ-NEXT: v_sub_f32_e32 v0, v0, v1
; SI-NSZ-NEXT: v_mul_f32_e32 v1, -4.0, v0
+; SI-NSZ-NEXT: v_cvt_f16_f32_e32 v0, v0
+; SI-NSZ-NEXT: v_cvt_f16_f32_e32 v1, v1
; SI-NSZ-NEXT: s_setpc_b64 s[30:31]
;
; VI-SAFE-LABEL: v_fneg_add_multi_use_add_f16:
@@ -210,22 +207,19 @@ define half @v_fneg_add_fneg_x_f16(half %a, half %b) #0 {
; SI-SAFE-LABEL: v_fneg_add_fneg_x_f16:
; SI-SAFE: ; %bb.0:
; SI-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v0, v0
-; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v1, v1
; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v0, v0
; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v1, v1
; SI-SAFE-NEXT: v_sub_f32_e32 v0, v1, v0
-; SI-SAFE-NEXT: v_xor_b32_e32 v0, 0x80000000, v0
+; SI-SAFE-NEXT: v_cvt_f16_f32_e64 v0, -v0
; SI-SAFE-NEXT: s_setpc_b64 s[30:31]
;
; SI-NSZ-LABEL: v_fneg_add_fneg_x_f16:
; SI-NSZ: ; %bb.0:
; SI-NSZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-NSZ-NEXT: v_cvt_f16_f32_e32 v1, v1
-; SI-NSZ-NEXT: v_cvt_f16_f32_e32 v0, v0
; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v1, v1
; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v0, v0
; SI-NSZ-NEXT: v_sub_f32_e32 v0, v0, v1
+; SI-NSZ-NEXT: v_cvt_f16_f32_e32 v0, v0
; SI-NSZ-NEXT: s_setpc_b64 s[30:31]
;
; VI-SAFE-LABEL: v_fneg_add_fneg_x_f16:
@@ -276,22 +270,19 @@ define half @v_fneg_add_x_fneg_f16(half %a, half %b) #0 {
; SI-SAFE-LABEL: v_fneg_add_x_fneg_f16:
; SI-SAFE: ; %bb.0:
; SI-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v1, v1
-; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v0, v0
; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v1, v1
; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v0, v0
; SI-SAFE-NEXT: v_sub_f32_e32 v0, v0, v1
-; SI-SAFE-NEXT: v_xor_b32_e32 v0, 0x80000000, v0
+; SI-SAFE-NEXT: v_cvt_f16_f32_e64 v0, -v0
; SI-SAFE-NEXT: s_setpc_b64 s[30:31]
;
; SI-NSZ-LABEL: v_fneg_add_x_fneg_f16:
; SI-NSZ: ; %bb.0:
; SI-NSZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-NSZ-NEXT: v_cvt_f16_f32_e32 v0, v0
-; SI-NSZ-NEXT: v_cvt_f16_f32_e32 v1, v1
; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v0, v0
; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v1, v1
; SI-NSZ-NEXT: v_sub_f32_e32 v0, v1, v0
+; SI-NSZ-NEXT: v_cvt_f16_f32_e32 v0, v0
; SI-NSZ-NEXT: s_setpc_b64 s[30:31]
;
; VI-SAFE-LABEL: v_fneg_add_x_fneg_f16:
@@ -342,22 +333,20 @@ define half @v_fneg_add_fneg_fneg_f16(half %a, half %b) #0 {
; SI-SAFE-LABEL: v_fneg_add_fneg_fneg_f16:
; SI-SAFE: ; %bb.0:
; SI-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v1, v1
-; SI-SAFE-NEXT: v_cvt_f16_f32_e64 v0, -v0
+; SI-SAFE-NEXT: v_and_b32_e32 v0, 0xffff, v0
; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v1, v1
-; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v0, v0
+; SI-SAFE-NEXT: v_cvt_f32_f16_e64 v0, -v0
; SI-SAFE-NEXT: v_sub_f32_e32 v0, v0, v1
-; SI-SAFE-NEXT: v_xor_b32_e32 v0, 0x80000000, v0
+; SI-SAFE-NEXT: v_cvt_f16_f32_e64 v0, -v0
; SI-SAFE-NEXT: s_setpc_b64 s[30:31]
;
; SI-NSZ-LABEL: v_fneg_add_fneg_fneg_f16:
; SI-NSZ: ; %bb.0:
; SI-NSZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-NSZ-NEXT: v_cvt_f16_f32_e32 v1, v1
-; SI-NSZ-NEXT: v_cvt_f16_f32_e32 v0, v0
; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v1, v1
; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v0, v0
; SI-NSZ-NEXT: v_add_f32_e32 v0, v0, v1
+; SI-NSZ-NEXT: v_cvt_f16_f32_e32 v0, v0
; SI-NSZ-NEXT: s_setpc_b64 s[30:31]
;
; VI-SAFE-LABEL: v_fneg_add_fneg_fneg_f16:
@@ -409,24 +398,25 @@ define { half, half } @v_fneg_add_store_use_fneg_x_f16(half %a, half %b) #0 {
; SI-SAFE-LABEL: v_fneg_add_store_use_fneg_x_f16:
; SI-SAFE: ; %bb.0:
; SI-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v0, v0
-; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v1, v1
; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v2, v0
-; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v3, v1
-; SI-SAFE-NEXT: v_cvt_f32_f16_e64 v1, -v0
-; SI-SAFE-NEXT: v_sub_f32_e32 v0, v3, v2
-; SI-SAFE-NEXT: v_xor_b32_e32 v0, 0x80000000, v0
+; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v1, v1
+; SI-SAFE-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; SI-SAFE-NEXT: v_sub_f32_e32 v1, v1, v2
+; SI-SAFE-NEXT: v_cvt_f16_f32_e64 v2, -v1
+; SI-SAFE-NEXT: v_xor_b32_e32 v1, 0x8000, v0
+; SI-SAFE-NEXT: v_mov_b32_e32 v0, v2
; SI-SAFE-NEXT: s_setpc_b64 s[30:31]
;
; SI-NSZ-LABEL: v_fneg_add_store_use_fneg_x_f16:
; SI-NSZ: ; %bb.0:
; SI-NSZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-NSZ-NEXT: v_cvt_f16_f32_e32 v1, v1
-; SI-NSZ-NEXT: v_cvt_f16_f32_e32 v0, v0
-; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v2, v1
-; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v3, v0
-; SI-NSZ-NEXT: v_cvt_f32_f16_e64 v1, -v0
-; SI-NSZ-NEXT: v_sub_f32_e32 v0, v3, v2
+; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v1, v1
+; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v2, v0
+; SI-NSZ-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; SI-NSZ-NEXT: v_sub_f32_e32 v1, v2, v1
+; SI-NSZ-NEXT: v_cvt_f16_f32_e32 v2, v1
+; SI-NSZ-NEXT: v_xor_b32_e32 v1, 0x8000, v0
+; SI-NSZ-NEXT: v_mov_b32_e32 v0, v2
; SI-NSZ-NEXT: s_setpc_b64 s[30:31]
;
; VI-SAFE-LABEL: v_fneg_add_store_use_fneg_x_f16:
@@ -493,30 +483,29 @@ define { half, half } @v_fneg_add_multi_use_fneg_x_f16(half %a, half %b, half %c
; SI-SAFE-LABEL: v_fneg_add_multi_use_fneg_x_f16:
; SI-SAFE: ; %bb.0:
; SI-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v0, v0
-; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v1, v1
-; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v2, v2
-; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v3, v0
-; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v1, v1
+; SI-SAFE-NEXT: v_and_b32_e32 v3, 0xffff, v0
; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v2, v2
-; SI-SAFE-NEXT: v_cvt_f32_f16_e64 v4, -v0
-; SI-SAFE-NEXT: v_sub_f32_e32 v0, v1, v3
-; SI-SAFE-NEXT: v_xor_b32_e32 v0, 0x80000000, v0
-; SI-SAFE-NEXT: v_mul_f32_e32 v1, v4, v2
+; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v0, v0
+; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v1, v1
+; SI-SAFE-NEXT: v_cvt_f32_f16_e64 v3, -v3
+; SI-SAFE-NEXT: v_sub_f32_e32 v0, v1, v0
+; SI-SAFE-NEXT: v_mul_f32_e32 v1, v3, v2
+; SI-SAFE-NEXT: v_cvt_f16_f32_e64 v0, -v0
+; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v1, v1
; SI-SAFE-NEXT: s_setpc_b64 s[30:31]
;
; SI-NSZ-LABEL: v_fneg_add_multi_use_fneg_x_f16:
; SI-NSZ: ; %bb.0:
; SI-NSZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-NSZ-NEXT: v_cvt_f16_f32_e32 v2, v2
-; SI-NSZ-NEXT: v_cvt_f16_f32_e32 v1, v1
-; SI-NSZ-NEXT: v_cvt_f16_f32_e32 v0, v0
+; SI-NSZ-NEXT: v_and_b32_e32 v3, 0xffff, v0
; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v2, v2
; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v1, v1
-; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v3, v0
-; SI-NSZ-NEXT: v_cvt_f32_f16_e64 v4, -v0
-; SI-NSZ-NEXT: v_sub_f32_e32 v0, v3, v1
-; SI-NSZ-NEXT: v_mul_f32_e32 v1, v4, v2
+; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v0, v0
+; SI-NSZ-NEXT: v_cvt_f32_f16_e64 v3, -v3
+; SI-NSZ-NEXT: v_sub_f32_e32 v0, v0, v1
+; SI-NSZ-NEXT: v_mul_f32_e32 v1, v3, v2
+; SI-NSZ-NEXT: v_cvt_f16_f32_e32 v0, v0
+; SI-NSZ-NEXT: v_cvt_f16_f32_e32 v1, v1
; SI-NSZ-NEXT: s_setpc_b64 s[30:31]
;
; VI-SAFE-LABEL: v_fneg_add_multi_use_fneg_x_f16:
@@ -584,11 +573,8 @@ define { half, half } @v_fneg_add_multi_use_fneg_x_f16(half %a, half %b, half %c
define amdgpu_ps half @fneg_fadd_0_safe_f16(half inreg %tmp2, half inreg %tmp6, <4 x i32> %arg) #0 {
; SI-LABEL: fneg_fadd_0_safe_f16:
; SI: ; %bb.0: ; %.entry
-; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
-; SI-NEXT: v_cvt_f16_f32_e32 v0, s1
-; SI-NEXT: v_cvt_f16_f32_e32 v1, s0
-; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
-; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
+; SI-NEXT: v_cvt_f32_f16_e32 v0, s1
+; SI-NEXT: v_cvt_f32_f16_e32 v1, s0
; SI-NEXT: v_div_scale_f32 v2, s[0:1], v0, v0, 1.0
; SI-NEXT: v_rcp_f32_e32 v3, v2
; SI-NEXT: v_div_scale_f32 v4, vcc, 1.0, v0, 1.0
@@ -608,6 +594,8 @@ define amdgpu_ps half @fneg_fadd_0_safe_f16(half inreg %tmp2, half inreg %tmp6,
; SI-NEXT: v_mov_b32_e32 v1, 0x7fc00000
; SI-NEXT: v_cmp_nlt_f32_e32 vcc, 0, v0
; SI-NEXT: v_cndmask_b32_e64 v0, v1, 0, vcc
+; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
+; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
; SI-NEXT: ; return to shader part epilog
;
; VI-LABEL: fneg_fadd_0_safe_f16:
@@ -654,17 +642,16 @@ define amdgpu_ps half @fneg_fadd_0_safe_f16(half inreg %tmp2, half inreg %tmp6,
define amdgpu_ps half @fneg_fadd_0_nsz_f16(half inreg %tmp2, half inreg %tmp6, <4 x i32> %arg) #2 {
; SI-LABEL: fneg_fadd_0_nsz_f16:
; SI: ; %bb.0: ; %.entry
-; SI-NEXT: v_cvt_f16_f32_e32 v0, s1
-; SI-NEXT: v_cvt_f16_f32_e32 v1, s0
+; SI-NEXT: v_cvt_f32_f16_e32 v0, s1
+; SI-NEXT: v_cvt_f32_f16_e32 v1, s0
; SI-NEXT: v_mov_b32_e32 v2, 0x7fc00000
-; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
-; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
; SI-NEXT: v_rcp_f32_e32 v0, v0
; SI-NEXT: v_mul_f32_e32 v0, 0x80000000, v0
; SI-NEXT: v_cmp_nlt_f32_e64 vcc, -v0, v1
; SI-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
; SI-NEXT: v_cmp_nlt_f32_e32 vcc, 0, v0
; SI-NEXT: v_cndmask_b32_e64 v0, v2, 0, vcc
+; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
; SI-NEXT: ; return to shader part epilog
;
; VI-LABEL: fneg_fadd_0_nsz_f16:
@@ -712,11 +699,11 @@ define half @v_fneg_mul_f16(half %a, half %b) #0 {
; SI-LABEL: v_fneg_mul_f16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
-; SI-NEXT: v_cvt_f16_f32_e64 v1, -v1
+; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1
; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
-; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
+; SI-NEXT: v_cvt_f32_f16_e64 v1, -v1
; SI-NEXT: v_mul_f32_e32 v0, v0, v1
+; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
; SI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: v_fneg_mul_f16:
@@ -749,12 +736,11 @@ define { half, half } @v_fneg_mul_store_use_mul_f16(half %a, half %b) #0 {
; SI-LABEL: v_fneg_mul_store_use_mul_f16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
-; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
; SI-NEXT: v_mul_f32_e32 v1, v0, v1
-; SI-NEXT: v_xor_b32_e32 v0, 0x80000000, v1
+; SI-NEXT: v_cvt_f16_f32_e64 v0, -v1
+; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
; SI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: v_fneg_mul_store_use_mul_f16:
@@ -796,12 +782,13 @@ define { half, half } @v_fneg_mul_multi_use_mul_f16(half %a, half %b) #0 {
; SI-LABEL: v_fneg_mul_multi_use_mul_f16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
-; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
+; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1
; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
; SI-NEXT: v_cvt_f32_f16_e64 v1, -v1
; SI-NEXT: v_mul_f32_e32 v0, v0, v1
; SI-NEXT: v_mul_f32_e32 v1, -4.0, v0
+; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
+; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
; SI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: v_fneg_mul_multi_use_mul_f16:
@@ -844,11 +831,10 @@ define half @v_fneg_mul_fneg_x_f16(half %a, half %b) #0 {
; SI-LABEL: v_fneg_mul_fneg_x_f16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
-; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
; SI-NEXT: v_mul_f32_e32 v0, v0, v1
+; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
; SI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: v_fneg_mul_fneg_x_f16:
@@ -882,11 +868,10 @@ define half @v_fneg_mul_x_fneg_f16(half %a, half %b) #0 {
; SI-LABEL: v_fneg_mul_x_fneg_f16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
-; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
; SI-NEXT: v_mul_f32_e32 v0, v0, v1
+; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
; SI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: v_fneg_mul_x_fneg_f16:
@@ -920,11 +905,11 @@ define half @v_fneg_mul_fneg_fneg_f16(half %a, half %b) #0 {
; SI-LABEL: v_fneg_mul_fneg_fneg_f16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
-; SI-NEXT: v_cvt_f16_f32_e64 v1, -v1
+; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1
; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
-; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
+; SI-NEXT: v_cvt_f32_f16_e64 v1, -v1
; SI-NEXT: v_mul_f32_e32 v0, v0, v1
+; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
; SI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: v_fneg_mul_fneg_fneg_f16:
@@ -959,12 +944,13 @@ define { half, half } @v_fneg_mul_store_use_fneg_x_f16(half %a, half %b) #0 {
; SI-LABEL: v_fneg_mul_store_use_fneg_x_f16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
-; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
-; SI-NEXT: v_cvt_f32_f16_e32 v2, v1
-; SI-NEXT: v_cvt_f32_f16_e32 v3, v0
-; SI-NEXT: v_cvt_f32_f16_e64 v1, -v0
-; SI-NEXT: v_mul_f32_e32 v0, v3, v2
+; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
+; SI-NEXT: v_cvt_f32_f16_e32 v2, v0
+; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; SI-NEXT: v_mul_f32_e32 v1, v2, v1
+; SI-NEXT: v_cvt_f16_f32_e32 v2, v1
+; SI-NEXT: v_xor_b32_e32 v1, 0x8000, v0
+; SI-NEXT: v_mov_b32_e32 v0, v2
; SI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: v_fneg_mul_store_use_fneg_x_f16:
@@ -1011,15 +997,15 @@ define { half, half } @v_fneg_mul_multi_use_fneg_x_f16(half %a, half %b, half %c
; SI-LABEL: v_fneg_mul_multi_use_fneg_x_f16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-NEXT: v_cvt_f16_f32_e32 v2, v2
-; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
-; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
+; SI-NEXT: v_and_b32_e32 v3, 0xffff, v0
; SI-NEXT: v_cvt_f32_f16_e32 v2, v2
; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
-; SI-NEXT: v_cvt_f32_f16_e32 v3, v0
-; SI-NEXT: v_cvt_f32_f16_e64 v4, -v0
-; SI-NEXT: v_mul_f32_e32 v0, v3, v1
-; SI-NEXT: v_mul_f32_e32 v1, v4, v2
+; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
+; SI-NEXT: v_cvt_f32_f16_e64 v3, -v3
+; SI-NEXT: v_mul_f32_e32 v0, v0, v1
+; SI-NEXT: v_mul_f32_e32 v1, v3, v2
+; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
+; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
; SI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: v_fneg_mul_multi_use_fneg_x_f16:
@@ -1071,11 +1057,12 @@ define half @v_fneg_minnum_f16_ieee(half %a, half %b) #0 {
; SI-LABEL: v_fneg_minnum_f16_ieee:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-NEXT: v_cvt_f16_f32_e64 v1, -v1
-; SI-NEXT: v_cvt_f16_f32_e64 v0, -v0
-; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
-; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
+; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; SI-NEXT: v_cvt_f32_f16_e64 v1, -v1
+; SI-NEXT: v_cvt_f32_f16_e64 v0, -v0
; SI-NEXT: v_max_f32_e32 v0, v0, v1
+; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
; SI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: v_fneg_minnum_f16_ieee:
@@ -1119,11 +1106,12 @@ define half @v_fneg_minnum_f16_no_ieee(half %a, half %b) #4 {
; SI-LABEL: v_fneg_minnum_f16_no_ieee:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-NEXT: v_cvt_f16_f32_e64 v1, -v1
-; SI-NEXT: v_cvt_f16_f32_e64 v0, -v0
-; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
-; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
+; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; SI-NEXT: v_cvt_f32_f16_e64 v1, -v1
+; SI-NEXT: v_cvt_f32_f16_e64 v0, -v0
; SI-NEXT: v_max_f32_e32 v0, v0, v1
+; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
; SI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: v_fneg_minnum_f16_no_ieee:
@@ -1156,7 +1144,8 @@ define half @v_fneg_self_minnum_f16_ieee(half %a) #0 {
; SI-LABEL: v_fneg_self_minnum_f16_ieee:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-NEXT: v_xor_b32_e32 v0, 0x80000000, v0
+; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; SI-NEXT: v_xor_b32_e32 v0, 0x8000, v0
; SI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: v_fneg_self_minnum_f16_ieee:
@@ -1189,7 +1178,8 @@ define half @v_fneg_self_minnum_f16_no_ieee(half %a) #4 {
; SI-LABEL: v_fneg_self_minnum_f16_no_ieee:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-NEXT: v_xor_b32_e32 v0, 0x80000000, v0
+; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; SI-NEXT: v_xor_b32_e32 v0, 0x8000, v0
; SI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: v_fneg_self_minnum_f16_no_ieee:
@@ -1222,9 +1212,10 @@ define half @v_fneg_posk_minnum_f16_ieee(half %a) #0 {
; SI-LABEL: v_fneg_posk_minnum_f16_ieee:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-NEXT: v_cvt_f16_f32_e64 v0, -v0
-; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
+; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; SI-NEXT: v_cvt_f32_f16_e64 v0, -v0
; SI-NEXT: v_max_f32_e32 v0, -4.0, v0
+; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
; SI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: v_fneg_posk_minnum_f16_ieee:
@@ -1264,9 +1255,10 @@ define half @v_fneg_posk_minnum_f16_no_ieee(half %a) #4 {
; SI-LABEL: v_fneg_posk_minnum_f16_no_ieee:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-NEXT: v_cvt_f16_f32_e64 v0, -v0
-; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
+; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; SI-NEXT: v_cvt_f32_f16_e64 v0, -v0
; SI-NEXT: v_max_f32_e32 v0, -4.0, v0
+; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
; SI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: v_fneg_posk_minnum_f16_no_ieee:
@@ -1299,9 +1291,10 @@ define half @v_fneg_negk_minnum_f16_ieee(half %a) #0 {
; SI-LABEL: v_fneg_negk_minnum_f16_ieee:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-NEXT: v_cvt_f16_f32_e64 v0, -v0
-; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
+; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; SI-NEXT: v_cvt_f32_f16_e64 v0, -v0
; SI-NEXT: v_max_f32_e32 v0, 4.0, v0
+; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
; SI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: v_fneg_negk_minnum_f16_ieee:
@@ -1341,9 +1334,10 @@ define half @v_fneg_negk_minnum_f16_no_ieee(half %a) #4 {
; SI-LABEL: v_fneg_negk_minnum_f16_no_ieee:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-NEXT: v_cvt_f16_f32_e64 v0, -v0
-; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
+; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; SI-NEXT: v_cvt_f32_f16_e64 v0, -v0
; SI-NEXT: v_max_f32_e32 v0, 4.0, v0
+; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
; SI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: v_fneg_negk_minnum_f16_no_ieee:
@@ -1376,10 +1370,9 @@ define half @v_fneg_0_minnum_f16(half %a) #0 {
; SI-LABEL: v_fneg_0_minnum_f16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
; SI-NEXT: v_min_f32_e32 v0, 0, v0
-; SI-NEXT: v_xor_b32_e32 v0, 0x80000000, v0
+; SI-NEXT: v_cvt_f16_f32_e64 v0, -v0
; SI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: v_fneg_0_minnum_f16:
@@ -1419,9 +1412,10 @@ define half @v_fneg_neg0_minnum_f16_ieee(half %a) #0 {
; SI-LABEL: v_fneg_neg0_minnum_f16_ieee:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-NEXT: v_cvt_f16_f32_e64 v0, -v0
-; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
+; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; SI-NEXT: v_cvt_f32_f16_e64 v0, -v0
; SI-NEXT: v_max_f32_e32 v0, 0, v0
+; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
; SI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: v_fneg_neg0_minnum_f16_ieee:
@@ -1461,9 +1455,10 @@ define half @v_fneg_inv2pi_minnum_f16(half %a) #0 {
; SI-LABEL: v_fneg_inv2pi_minnum_f16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-NEXT: v_cvt_f16_f32_e64 v0, -v0
-; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
+; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; SI-NEXT: v_cvt_f32_f16_e64 v0, -v0
; SI-NEXT: v_max_f32_e32 v0, 0xbe230000, v0
+; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
; SI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: v_fneg_inv2pi_minnum_f16:
@@ -1507,9 +1502,10 @@ define half @v_fneg_neg_inv2pi_minnum_f16(half %a) #0 {
; SI-LABEL: v_fneg_neg_inv2pi_minnum_f16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-NEXT: v_cvt_f16_f32_e64 v0, -v0
-; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
+; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; SI-NEXT: v_cvt_f32_f16_e64 v0, -v0
; SI-NEXT: v_max_f32_e32 v0, 0xbe230000, v0
+; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
; SI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: v_fneg_neg_inv2pi_minnum_f16:
@@ -1553,9 +1549,10 @@ define half @v_fneg_neg0_minnum_f16_no_ieee(half %a) #4 {
; SI-LABEL: v_fneg_neg0_minnum_f16_no_ieee:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-NEXT: v_cvt_f16_f32_e64 v0, -v0
-; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
+; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; SI-NEXT: v_cvt_f32_f16_e64 v0, -v0
; SI-NEXT: v_max_f32_e32 v0, 0, v0
+; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
; SI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: v_fneg_neg0_minnum_f16_no_ieee:
@@ -1588,12 +1585,11 @@ define half @v_fneg_0_minnum_foldable_use_f16_ieee(half %a, half %b) #0 {
; SI-LABEL: v_fneg_0_minnum_foldable_use_f16_ieee:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
-; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
; SI-NEXT: v_min_f32_e32 v0, 0, v0
; SI-NEXT: v_mul_f32_e64 v0, -v0, v1
+; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
; SI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: v_fneg_0_minnum_foldable_use_f16_ieee:
@@ -1638,12 +1634,12 @@ define half @v_fneg_inv2pi_minnum_foldable_use_f16(half %a, half %b) #0 {
; SI-LABEL: v_fneg_inv2pi_minnum_foldable_use_f16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-NEXT: v_cvt_f16_f32_e64 v0, -v0
-; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
-; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
+; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; SI-NEXT: v_cvt_f32_f16_e64 v0, -v0
; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
; SI-NEXT: v_max_f32_e32 v0, 0xbe230000, v0
; SI-NEXT: v_mul_f32_e32 v0, v0, v1
+; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
; SI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: v_fneg_inv2pi_minnum_foldable_use_f16:
@@ -1688,12 +1684,11 @@ define half @v_fneg_0_minnum_foldable_use_f16_no_ieee(half %a, half %b) #4 {
; SI-LABEL: v_fneg_0_minnum_foldable_use_f16_no_ieee:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
-; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
; SI-NEXT: v_min_f32_e32 v0, 0, v0
; SI-NEXT: v_mul_f32_e64 v0, -v0, v1
+; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
; SI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: v_fneg_0_minnum_foldable_use_f16_no_ieee:
@@ -1734,12 +1729,14 @@ define { half, half } @v_fneg_minnum_multi_use_minnum_f16_ieee(half %a, half %b)
; SI-LABEL: v_fneg_minnum_multi_use_minnum_f16_ieee:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
-; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
+; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1
; SI-NEXT: v_cvt_f32_f16_e64 v1, -v1
; SI-NEXT: v_cvt_f32_f16_e64 v0, -v0
; SI-NEXT: v_max_f32_e32 v0, v0, v1
; SI-NEXT: v_mul_f32_e32 v1, -4.0, v0
+; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
+; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
; SI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: v_fneg_minnum_multi_use_minnum_f16_ieee:
@@ -1790,8 +1787,6 @@ define <2 x half> @v_fneg_minnum_multi_use_minnum_f16_no_ieee(half %a, half %b)
; SI-LABEL: v_fneg_minnum_multi_use_minnum_f16_no_ieee:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
-; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
; SI-NEXT: v_min_f32_e32 v0, v0, v1
@@ -1851,11 +1846,12 @@ define half @v_fneg_maxnum_f16_ieee(half %a, half %b) #0 {
; SI-LABEL: v_fneg_maxnum_f16_ieee:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-NEXT: v_cvt_f16_f32_e64 v1, -v1
-; SI-NEXT: v_cvt_f16_f32_e64 v0, -v0
-; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
-; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
+; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; SI-NEXT: v_cvt_f32_f16_e64 v1, -v1
+; SI-NEXT: v_cvt_f32_f16_e64 v0, -v0
; SI-NEXT: v_min_f32_e32 v0, v0, v1
+; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
; SI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: v_fneg_maxnum_f16_ieee:
@@ -1899,11 +1895,12 @@ define half @v_fneg_maxnum_f16_no_ieee(half %a, half %b) #4 {
; SI-LABEL: v_fneg_maxnum_f16_no_ieee:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-NEXT: v_cvt_f16_f32_e64 v1, -v1
-; SI-NEXT: v_cvt_f16_f32_e64 v0, -v0
-; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
-; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
+; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; SI-NEXT: v_cvt_f32_f16_e64 v1, -v1
+; SI-NEXT: v_cvt_f32_f16_e64 v0, -v0
; SI-NEXT: v_min_f32_e32 v0, v0, v1
+; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
; SI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: v_fneg_maxnum_f16_no_ieee:
@@ -1936,7 +1933,8 @@ define half @v_fneg_self_maxnum_f16_ieee(half %a) #0 {
; SI-LABEL: v_fneg_self_maxnum_f16_ieee:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-NEXT: v_xor_b32_e32 v0, 0x80000000, v0
+; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; SI-NEXT: v_xor_b32_e32 v0, 0x8000, v0
; SI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: v_fneg_self_maxnum_f16_ieee:
@@ -1969,7 +1967,8 @@ define half @v_fneg_self_maxnum_f16_no_ieee(half %a) #4 {
; SI-LABEL: v_fneg_self_maxnum_f16_no_ieee:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-NEXT: v_xor_b32_e32 v0, 0x80000000, v0
+; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; SI-NEXT: v_xor_b32_e32 v0, 0x8000, v0
; SI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: v_fneg_self_maxnum_f16_no_ieee:
@@ -2002,9 +2001,10 @@ define half @v_fneg_posk_maxnum_f16_ieee(half %a) #0 {
; SI-LABEL: v_fneg_posk_maxnum_f16_ieee:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-NEXT: v_cvt_f16_f32_e64 v0, -v0
-; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
+; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; SI-NEXT: v_cvt_f32_f16_e64 v0, -v0
; SI-NEXT: v_min_f32_e32 v0, -4.0, v0
+; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
; SI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: v_fneg_posk_maxnum_f16_ieee:
@@ -2044,9 +2044,10 @@ define half @v_fneg_posk_maxnum_f16_no_ieee(half %a) #4 {
; SI-LABEL: v_fneg_posk_maxnum_f16_no_ieee:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-NEXT: v_cvt_f16_f32_e64 v0, -v0
-; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
+; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; SI-NEXT: v_cvt_f32_f16_e64 v0, -v0
; SI-NEXT: v_min_f32_e32 v0, -4.0, v0
+; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
; SI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: v_fneg_posk_maxnum_f16_no_ieee:
@@ -2079,9 +2080,10 @@ define half @v_fneg_negk_maxnum_f16_ieee(half %a) #0 {
; SI-LABEL: v_fneg_negk_maxnum_f16_ieee:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-NEXT: v_cvt_f16_f32_e64 v0, -v0
-; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
+; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; SI-NEXT: v_cvt_f32_f16_e64 v0, -v0
; SI-NEXT: v_min_f32_e32 v0, 4.0, v0
+; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
; SI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: v_fneg_negk_maxnum_f16_ieee:
@@ -2121,9 +2123,10 @@ define half @v_fneg_negk_maxnum_f16_no_ieee(half %a) #4 {
; SI-LABEL: v_fneg_negk_maxnum_f16_no_ieee:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-NEXT: v_cvt_f16_f32_e64 v0, -v0
-; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
+; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; SI-NEXT: v_cvt_f32_f16_e64 v0, -v0
; SI-NEXT: v_min_f32_e32 v0, 4.0, v0
+; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
; SI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: v_fneg_negk_maxnum_f16_no_ieee:
@@ -2156,10 +2159,9 @@ define half @v_fneg_0_maxnum_f16(half %a) #0 {
; SI-LABEL: v_fneg_0_maxnum_f16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
; SI-NEXT: v_max_f32_e32 v0, 0, v0
-; SI-NEXT: v_xor_b32_e32 v0, 0x80000000, v0
+; SI-NEXT: v_cvt_f16_f32_e64 v0, -v0
; SI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: v_fneg_0_maxnum_f16:
@@ -2199,9 +2201,10 @@ define half @v_fneg_neg0_maxnum_f16_ieee(half %a) #0 {
; SI-LABEL: v_fneg_neg0_maxnum_f16_ieee:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-NEXT: v_cvt_f16_f32_e64 v0, -v0
-; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
+; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; SI-NEXT: v_cvt_f32_f16_e64 v0, -v0
; SI-NEXT: v_min_f32_e32 v0, 0, v0
+; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
; SI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: v_fneg_neg0_maxnum_f16_ieee:
@@ -2241,9 +2244,10 @@ define half @v_fneg_neg0_maxnum_f16_no_ieee(half %a) #4 {
; SI-LABEL: v_fneg_neg0_maxnum_f16_no_ieee:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-NEXT: v_cvt_f16_f32_e64 v0, -v0
-; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
+; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; SI-NEXT: v_cvt_f32_f16_e64 v0, -v0
; SI-NEXT: v_min_f32_e32 v0, 0, v0
+; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
; SI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: v_fneg_neg0_maxnum_f16_no_ieee:
@@ -2276,12 +2280,11 @@ define half @v_fneg_0_maxnum_foldable_use_f16_ieee(half %a, half %b) #0 {
; SI-LABEL: v_fneg_0_maxnum_foldable_use_f16_ieee:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
-; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
; SI-NEXT: v_max_f32_e32 v0, 0, v0
; SI-NEXT: v_mul_f32_e64 v0, -v0, v1
+; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
; SI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: v_fneg_0_maxnum_foldable_use_f16_ieee:
@@ -2326,12 +2329,11 @@ define half @v_fneg_0_maxnum_foldable_use_f16_no_ieee(half %a, half %b) #4 {
; SI-LABEL: v_fneg_0_maxnum_foldable_use_f16_no_ieee:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
-; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
; SI-NEXT: v_max_f32_e32 v0, 0, v0
; SI-NEXT: v_mul_f32_e64 v0, -v0, v1
+; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
; SI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: v_fneg_0_maxnum_foldable_use_f16_no_ieee:
@@ -2372,12 +2374,14 @@ define { half, half } @v_fneg_maxnum_multi_use_maxnum_f16_ieee(half %a, half %b)
; SI-LABEL: v_fneg_maxnum_multi_use_maxnum_f16_ieee:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
-; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
+; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1
; SI-NEXT: v_cvt_f32_f16_e64 v1, -v1
; SI-NEXT: v_cvt_f32_f16_e64 v0, -v0
; SI-NEXT: v_min_f32_e32 v0, v0, v1
; SI-NEXT: v_mul_f32_e32 v1, -4.0, v0
+; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
+; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
; SI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: v_fneg_maxnum_multi_use_maxnum_f16_ieee:
@@ -2428,8 +2432,6 @@ define <2 x half> @v_fneg_maxnum_multi_use_maxnum_f16_no_ieee(half %a, half %b)
; SI-LABEL: v_fneg_maxnum_multi_use_maxnum_f16_no_ieee:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
-; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
; SI-NEXT: v_max_f32_e32 v0, v0, v1
@@ -2489,26 +2491,21 @@ define half @v_fneg_fma_f16(half %a, half %b, half %c) #0 {
; SI-SAFE-LABEL: v_fneg_fma_f16:
; SI-SAFE: ; %bb.0:
; SI-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v2, v2
-; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v1, v1
-; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v0, v0
; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v2, v2
; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v1, v1
; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v0, v0
; SI-SAFE-NEXT: v_fma_f32 v0, v0, v1, v2
-; SI-SAFE-NEXT: v_xor_b32_e32 v0, 0x80000000, v0
+; SI-SAFE-NEXT: v_cvt_f16_f32_e64 v0, -v0
; SI-SAFE-NEXT: s_setpc_b64 s[30:31]
;
; SI-NSZ-LABEL: v_fneg_fma_f16:
; SI-NSZ: ; %bb.0:
; SI-NSZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-NSZ-NEXT: v_cvt_f16_f32_e32 v2, v2
-; SI-NSZ-NEXT: v_cvt_f16_f32_e32 v1, v1
-; SI-NSZ-NEXT: v_cvt_f16_f32_e32 v0, v0
; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v2, v2
; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v1, v1
; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v0, v0
; SI-NSZ-NEXT: v_fma_f32 v0, v0, -v1, -v2
+; SI-NSZ-NEXT: v_cvt_f16_f32_e32 v0, v0
; SI-NSZ-NEXT: s_setpc_b64 s[30:31]
;
; VI-SAFE-LABEL: v_fneg_fma_f16:
@@ -2558,14 +2555,12 @@ define { half, half } @v_fneg_fma_store_use_fma_f16(half %a, half %b, half %c) #
; SI-LABEL: v_fneg_fma_store_use_fma_f16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-NEXT: v_cvt_f16_f32_e32 v2, v2
-; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
-; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
; SI-NEXT: v_cvt_f32_f16_e32 v2, v2
; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
; SI-NEXT: v_fma_f32 v1, v0, v1, v2
-; SI-NEXT: v_xor_b32_e32 v0, 0x80000000, v1
+; SI-NEXT: v_cvt_f16_f32_e64 v0, -v1
+; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
; SI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: v_fneg_fma_store_use_fma_f16:
@@ -2607,28 +2602,25 @@ define { half, half } @v_fneg_fma_multi_use_fma_f16(half %a, half %b, half %c) #
; SI-SAFE-LABEL: v_fneg_fma_multi_use_fma_f16:
; SI-SAFE: ; %bb.0:
; SI-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v2, v2
-; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v1, v1
-; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v0, v0
; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v2, v2
; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v1, v1
; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v0, v0
-; SI-SAFE-NEXT: v_fma_f32 v1, v0, v1, v2
-; SI-SAFE-NEXT: v_xor_b32_e32 v0, 0x80000000, v1
-; SI-SAFE-NEXT: v_mul_f32_e32 v1, 4.0, v1
+; SI-SAFE-NEXT: v_fma_f32 v0, v0, v1, v2
+; SI-SAFE-NEXT: v_mul_f32_e32 v1, 4.0, v0
+; SI-SAFE-NEXT: v_cvt_f16_f32_e64 v0, -v0
+; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v1, v1
; SI-SAFE-NEXT: s_setpc_b64 s[30:31]
;
; SI-NSZ-LABEL: v_fneg_fma_multi_use_fma_f16:
; SI-NSZ: ; %bb.0:
; SI-NSZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-NSZ-NEXT: v_cvt_f16_f32_e32 v2, v2
-; SI-NSZ-NEXT: v_cvt_f16_f32_e32 v1, v1
-; SI-NSZ-NEXT: v_cvt_f16_f32_e32 v0, v0
; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v2, v2
; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v1, v1
; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v0, v0
; SI-NSZ-NEXT: v_fma_f32 v0, v0, -v1, -v2
; SI-NSZ-NEXT: v_mul_f32_e32 v1, -4.0, v0
+; SI-NSZ-NEXT: v_cvt_f16_f32_e32 v0, v0
+; SI-NSZ-NEXT: v_cvt_f16_f32_e32 v1, v1
; SI-NSZ-NEXT: s_setpc_b64 s[30:31]
;
; VI-SAFE-LABEL: v_fneg_fma_multi_use_fma_f16:
@@ -2689,26 +2681,21 @@ define half @v_fneg_fma_fneg_x_y_f16(half %a, half %b, half %c) #0 {
; SI-SAFE-LABEL: v_fneg_fma_fneg_x_y_f16:
; SI-SAFE: ; %bb.0:
; SI-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v2, v2
-; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v1, v1
-; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v0, v0
; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v2, v2
; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v1, v1
; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v0, v0
; SI-SAFE-NEXT: v_fma_f32 v0, -v0, v1, v2
-; SI-SAFE-NEXT: v_xor_b32_e32 v0, 0x80000000, v0
+; SI-SAFE-NEXT: v_cvt_f16_f32_e64 v0, -v0
; SI-SAFE-NEXT: s_setpc_b64 s[30:31]
;
; SI-NSZ-LABEL: v_fneg_fma_fneg_x_y_f16:
; SI-NSZ: ; %bb.0:
; SI-NSZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-NSZ-NEXT: v_cvt_f16_f32_e32 v2, v2
-; SI-NSZ-NEXT: v_cvt_f16_f32_e32 v1, v1
-; SI-NSZ-NEXT: v_cvt_f16_f32_e32 v0, v0
; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v2, v2
; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v1, v1
; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v0, v0
; SI-NSZ-NEXT: v_fma_f32 v0, v0, v1, -v2
+; SI-NSZ-NEXT: v_cvt_f16_f32_e32 v0, v0
; SI-NSZ-NEXT: s_setpc_b64 s[30:31]
;
; VI-SAFE-LABEL: v_fneg_fma_fneg_x_y_f16:
@@ -2759,26 +2746,21 @@ define half @v_fneg_fma_x_fneg_y_f16(half %a, half %b, half %c) #0 {
; SI-SAFE-LABEL: v_fneg_fma_x_fneg_y_f16:
; SI-SAFE: ; %bb.0:
; SI-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v2, v2
-; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v1, v1
-; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v0, v0
; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v2, v2
; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v1, v1
; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v0, v0
; SI-SAFE-NEXT: v_fma_f32 v0, v0, -v1, v2
-; SI-SAFE-NEXT: v_xor_b32_e32 v0, 0x80000000, v0
+; SI-SAFE-NEXT: v_cvt_f16_f32_e64 v0, -v0
; SI-SAFE-NEXT: s_setpc_b64 s[30:31]
;
; SI-NSZ-LABEL: v_fneg_fma_x_fneg_y_f16:
; SI-NSZ: ; %bb.0:
; SI-NSZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-NSZ-NEXT: v_cvt_f16_f32_e32 v2, v2
-; SI-NSZ-NEXT: v_cvt_f16_f32_e32 v1, v1
-; SI-NSZ-NEXT: v_cvt_f16_f32_e32 v0, v0
; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v2, v2
; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v1, v1
; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v0, v0
; SI-NSZ-NEXT: v_fma_f32 v0, v0, v1, -v2
+; SI-NSZ-NEXT: v_cvt_f16_f32_e32 v0, v0
; SI-NSZ-NEXT: s_setpc_b64 s[30:31]
;
; VI-SAFE-LABEL: v_fneg_fma_x_fneg_y_f16:
@@ -2829,26 +2811,21 @@ define half @v_fneg_fma_fneg_fneg_y_f16(half %a, half %b, half %c) #0 {
; SI-SAFE-LABEL: v_fneg_fma_fneg_fneg_y_f16:
; SI-SAFE: ; %bb.0:
; SI-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v2, v2
-; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v1, v1
-; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v0, v0
; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v2, v2
; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v1, v1
; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v0, v0
; SI-SAFE-NEXT: v_fma_f32 v0, v0, v1, v2
-; SI-SAFE-NEXT: v_xor_b32_e32 v0, 0x80000000, v0
+; SI-SAFE-NEXT: v_cvt_f16_f32_e64 v0, -v0
; SI-SAFE-NEXT: s_setpc_b64 s[30:31]
;
; SI-NSZ-LABEL: v_fneg_fma_fneg_fneg_y_f16:
; SI-NSZ: ; %bb.0:
; SI-NSZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-NSZ-NEXT: v_cvt_f16_f32_e32 v2, v2
-; SI-NSZ-NEXT: v_cvt_f16_f32_e32 v1, v1
-; SI-NSZ-NEXT: v_cvt_f16_f32_e32 v0, v0
; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v2, v2
; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v1, v1
; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v0, v0
; SI-NSZ-NEXT: v_fma_f32 v0, v0, -v1, -v2
+; SI-NSZ-NEXT: v_cvt_f16_f32_e32 v0, v0
; SI-NSZ-NEXT: s_setpc_b64 s[30:31]
;
; VI-SAFE-LABEL: v_fneg_fma_fneg_fneg_y_f16:
@@ -2900,26 +2877,21 @@ define half @v_fneg_fma_fneg_x_fneg_f16(half %a, half %b, half %c) #0 {
; SI-SAFE-LABEL: v_fneg_fma_fneg_x_fneg_f16:
; SI-SAFE: ; %bb.0:
; SI-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v2, v2
-; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v1, v1
-; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v0, v0
; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v2, v2
; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v1, v1
; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v0, v0
; SI-SAFE-NEXT: v_fma_f32 v0, -v0, v1, -v2
-; SI-SAFE-NEXT: v_xor_b32_e32 v0, 0x80000000, v0
+; SI-SAFE-NEXT: v_cvt_f16_f32_e64 v0, -v0
; SI-SAFE-NEXT: s_setpc_b64 s[30:31]
;
; SI-NSZ-LABEL: v_fneg_fma_fneg_x_fneg_f16:
; SI-NSZ: ; %bb.0:
; SI-NSZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-NSZ-NEXT: v_cvt_f16_f32_e32 v2, v2
-; SI-NSZ-NEXT: v_cvt_f16_f32_e32 v1, v1
-; SI-NSZ-NEXT: v_cvt_f16_f32_e32 v0, v0
; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v2, v2
; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v1, v1
; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v0, v0
; SI-NSZ-NEXT: v_fma_f32 v0, v0, v1, v2
+; SI-NSZ-NEXT: v_cvt_f16_f32_e32 v0, v0
; SI-NSZ-NEXT: s_setpc_b64 s[30:31]
;
; VI-SAFE-LABEL: v_fneg_fma_fneg_x_fneg_f16:
@@ -2971,26 +2943,21 @@ define half @v_fneg_fma_x_y_fneg_f16(half %a, half %b, half %c) #0 {
; SI-SAFE-LABEL: v_fneg_fma_x_y_fneg_f16:
; SI-SAFE: ; %bb.0:
; SI-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v2, v2
-; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v1, v1
-; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v0, v0
; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v2, v2
; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v1, v1
; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v0, v0
; SI-SAFE-NEXT: v_fma_f32 v0, v0, v1, -v2
-; SI-SAFE-NEXT: v_xor_b32_e32 v0, 0x80000000, v0
+; SI-SAFE-NEXT: v_cvt_f16_f32_e64 v0, -v0
; SI-SAFE-NEXT: s_setpc_b64 s[30:31]
;
; SI-NSZ-LABEL: v_fneg_fma_x_y_fneg_f16:
; SI-NSZ: ; %bb.0:
; SI-NSZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-NSZ-NEXT: v_cvt_f16_f32_e32 v2, v2
-; SI-NSZ-NEXT: v_cvt_f16_f32_e32 v1, v1
-; SI-NSZ-NEXT: v_cvt_f16_f32_e32 v0, v0
; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v2, v2
; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v1, v1
; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v0, v0
; SI-NSZ-NEXT: v_fma_f32 v0, v0, -v1, v2
+; SI-NSZ-NEXT: v_cvt_f16_f32_e32 v0, v0
; SI-NSZ-NEXT: s_setpc_b64 s[30:31]
;
; VI-SAFE-LABEL: v_fneg_fma_x_y_fneg_f16:
@@ -3041,28 +3008,26 @@ define { half, half } @v_fneg_fma_store_use_fneg_x_y_f16(half %a, half %b, half
; SI-SAFE-LABEL: v_fneg_fma_store_use_fneg_x_y_f16:
; SI-SAFE: ; %bb.0:
; SI-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v2, v2
-; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v1, v1
-; SI-SAFE-NEXT: v_cvt_f16_f32_e64 v3, -v0
+; SI-SAFE-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v3, v1
+; SI-SAFE-NEXT: v_xor_b32_e32 v1, 0x8000, v0
; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v2, v2
-; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v4, v1
-; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v3, v3
-; SI-SAFE-NEXT: v_xor_b32_e32 v1, 0x80000000, v0
-; SI-SAFE-NEXT: v_fma_f32 v0, v3, v4, v2
-; SI-SAFE-NEXT: v_xor_b32_e32 v0, 0x80000000, v0
+; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v0, v1
+; SI-SAFE-NEXT: v_fma_f32 v0, v0, v3, v2
+; SI-SAFE-NEXT: v_cvt_f16_f32_e64 v0, -v0
; SI-SAFE-NEXT: s_setpc_b64 s[30:31]
;
; SI-NSZ-LABEL: v_fneg_fma_store_use_fneg_x_y_f16:
; SI-NSZ: ; %bb.0:
; SI-NSZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-NSZ-NEXT: v_cvt_f16_f32_e32 v2, v2
-; SI-NSZ-NEXT: v_cvt_f16_f32_e32 v1, v1
-; SI-NSZ-NEXT: v_cvt_f16_f32_e32 v0, v0
; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v2, v2
-; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v3, v1
-; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v4, v0
-; SI-NSZ-NEXT: v_cvt_f32_f16_e64 v1, -v0
-; SI-NSZ-NEXT: v_fma_f32 v0, v4, v3, -v2
+; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v1, v1
+; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v3, v0
+; SI-NSZ-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; SI-NSZ-NEXT: v_fma_f32 v1, v3, v1, -v2
+; SI-NSZ-NEXT: v_cvt_f16_f32_e32 v2, v1
+; SI-NSZ-NEXT: v_xor_b32_e32 v1, 0x8000, v0
+; SI-NSZ-NEXT: v_mov_b32_e32 v0, v2
; SI-NSZ-NEXT: s_setpc_b64 s[30:31]
;
; VI-SAFE-LABEL: v_fneg_fma_store_use_fneg_x_y_f16:
@@ -3129,33 +3094,30 @@ define { half, half } @v_fneg_fma_multi_use_fneg_x_y_f16(half %a, half %b, half
; SI-SAFE-LABEL: v_fneg_fma_multi_use_fneg_x_y_f16:
; SI-SAFE: ; %bb.0:
; SI-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v2, v2
-; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v1, v1
-; SI-SAFE-NEXT: v_cvt_f16_f32_e64 v0, -v0
-; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v3, v3
+; SI-SAFE-NEXT: v_and_b32_e32 v0, 0xffff, v0
; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v2, v2
; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v1, v1
-; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v4, v0
+; SI-SAFE-NEXT: v_cvt_f32_f16_e64 v0, -v0
; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v3, v3
-; SI-SAFE-NEXT: v_fma_f32 v0, v4, v1, v2
-; SI-SAFE-NEXT: v_xor_b32_e32 v0, 0x80000000, v0
-; SI-SAFE-NEXT: v_mul_f32_e32 v1, v4, v3
+; SI-SAFE-NEXT: v_fma_f32 v1, v0, v1, v2
+; SI-SAFE-NEXT: v_mul_f32_e32 v2, v0, v3
+; SI-SAFE-NEXT: v_cvt_f16_f32_e64 v0, -v1
+; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v1, v2
; SI-SAFE-NEXT: s_setpc_b64 s[30:31]
;
; SI-NSZ-LABEL: v_fneg_fma_multi_use_fneg_x_y_f16:
; SI-NSZ: ; %bb.0:
; SI-NSZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-NSZ-NEXT: v_cvt_f16_f32_e32 v3, v3
-; SI-NSZ-NEXT: v_cvt_f16_f32_e32 v2, v2
-; SI-NSZ-NEXT: v_cvt_f16_f32_e32 v1, v1
-; SI-NSZ-NEXT: v_cvt_f16_f32_e32 v0, v0
+; SI-NSZ-NEXT: v_and_b32_e32 v4, 0xffff, v0
; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v3, v3
; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v2, v2
; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v1, v1
-; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v4, v0
-; SI-NSZ-NEXT: v_cvt_f32_f16_e64 v5, -v0
-; SI-NSZ-NEXT: v_fma_f32 v0, v4, v1, -v2
-; SI-NSZ-NEXT: v_mul_f32_e32 v1, v5, v3
+; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v0, v0
+; SI-NSZ-NEXT: v_cvt_f32_f16_e64 v4, -v4
+; SI-NSZ-NEXT: v_fma_f32 v0, v0, v1, -v2
+; SI-NSZ-NEXT: v_mul_f32_e32 v1, v4, v3
+; SI-NSZ-NEXT: v_cvt_f16_f32_e32 v0, v0
+; SI-NSZ-NEXT: v_cvt_f16_f32_e32 v1, v1
; SI-NSZ-NEXT: s_setpc_b64 s[30:31]
;
; VI-SAFE-LABEL: v_fneg_fma_multi_use_fneg_x_y_f16:
@@ -3227,26 +3189,22 @@ define half @v_fneg_fmad_f16(half %a, half %b, half %c) #0 {
; SI-SAFE-LABEL: v_fneg_fmad_f16:
; SI-SAFE: ; %bb.0:
; SI-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v2, v2
-; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v1, v1
-; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v0, v0
; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v2, v2
; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v1, v1
; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v0, v0
; SI-SAFE-NEXT: v_mac_f32_e32 v2, v0, v1
-; SI-SAFE-NEXT: v_xor_b32_e32 v0, 0x80000000, v2
+; SI-SAFE-NEXT: v_cvt_f16_f32_e64 v0, -v2
; SI-SAFE-NEXT: s_setpc_b64 s[30:31]
;
; SI-NSZ-LABEL: v_fneg_fmad_f16:
; SI-NSZ: ; %bb.0:
; SI-NSZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-NSZ-NEXT: v_cvt_f16_f32_e32 v2, v2
-; SI-NSZ-NEXT: v_cvt_f16_f32_e32 v0, v0
-; SI-NSZ-NEXT: v_cvt_f16_f32_e64 v1, -v1
+; SI-NSZ-NEXT: v_and_b32_e32 v1, 0xffff, v1
; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v2, v2
; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v0, v0
-; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v1, v1
+; SI-NSZ-NEXT: v_cvt_f32_f16_e64 v1, -v1
; SI-NSZ-NEXT: v_mad_f32 v0, v0, v1, -v2
+; SI-NSZ-NEXT: v_cvt_f16_f32_e32 v0, v0
; SI-NSZ-NEXT: s_setpc_b64 s[30:31]
;
; VI-SAFE-LABEL: v_fneg_fmad_f16:
@@ -3432,28 +3390,26 @@ define { half, half } @v_fneg_fmad_multi_use_fmad_f16(half %a, half %b, half %c)
; SI-SAFE-LABEL: v_fneg_fmad_multi_use_fmad_f16:
; SI-SAFE: ; %bb.0:
; SI-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v2, v2
-; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v1, v1
-; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v0, v0
; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v2, v2
; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v1, v1
; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v0, v0
; SI-SAFE-NEXT: v_mac_f32_e32 v2, v0, v1
-; SI-SAFE-NEXT: v_xor_b32_e32 v0, 0x80000000, v2
; SI-SAFE-NEXT: v_mul_f32_e32 v1, 4.0, v2
+; SI-SAFE-NEXT: v_cvt_f16_f32_e64 v0, -v2
+; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v1, v1
; SI-SAFE-NEXT: s_setpc_b64 s[30:31]
;
; SI-NSZ-LABEL: v_fneg_fmad_multi_use_fmad_f16:
; SI-NSZ: ; %bb.0:
; SI-NSZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-NSZ-NEXT: v_cvt_f16_f32_e32 v2, v2
-; SI-NSZ-NEXT: v_cvt_f16_f32_e32 v0, v0
-; SI-NSZ-NEXT: v_cvt_f16_f32_e32 v1, v1
+; SI-NSZ-NEXT: v_and_b32_e32 v1, 0xffff, v1
; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v2, v2
; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v0, v0
; SI-NSZ-NEXT: v_cvt_f32_f16_e64 v1, -v1
; SI-NSZ-NEXT: v_mad_f32 v0, v0, v1, -v2
; SI-NSZ-NEXT: v_mul_f32_e32 v1, -4.0, v0
+; SI-NSZ-NEXT: v_cvt_f16_f32_e32 v0, v0
+; SI-NSZ-NEXT: v_cvt_f16_f32_e32 v1, v1
; SI-NSZ-NEXT: s_setpc_b64 s[30:31]
;
; VI-SAFE-LABEL: v_fneg_fmad_multi_use_fmad_f16:
@@ -3518,7 +3474,9 @@ define double @v_fneg_fp_extend_f16_to_f64(half %a) #0 {
; SI-LABEL: v_fneg_fp_extend_f16_to_f64:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-NEXT: v_cvt_f64_f32_e64 v[0:1], -v0
+; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; SI-NEXT: v_cvt_f32_f16_e64 v0, -v0
+; SI-NEXT: v_cvt_f64_f32_e32 v[0:1], v0
; SI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: v_fneg_fp_extend_f16_to_f64:
@@ -3562,6 +3520,7 @@ define double @v_fneg_fp_extend_fneg_f16_to_f64(half %a) #0 {
; SI-LABEL: v_fneg_fp_extend_fneg_f16_to_f64:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
; SI-NEXT: v_cvt_f64_f32_e32 v[0:1], v0
; SI-NEXT: s_setpc_b64 s[30:31]
;
@@ -3604,8 +3563,10 @@ define { double, half } @v_fneg_fp_extend_store_use_fneg_f16_to_f64(half %a) #0
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v2, v0
-; SI-NEXT: v_cvt_f64_f32_e32 v[0:1], v2
-; SI-NEXT: v_xor_b32_e32 v2, 0x80000000, v2
+; SI-NEXT: v_cvt_f32_f16_e32 v0, v2
+; SI-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; SI-NEXT: v_xor_b32_e32 v2, 0x8000, v2
+; SI-NEXT: v_cvt_f64_f32_e32 v[0:1], v0
; SI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: v_fneg_fp_extend_store_use_fneg_f16_to_f64:
@@ -3656,6 +3617,7 @@ define { double, double } @v_fneg_multi_use_fp_extend_fneg_f16_to_f64(half %a) #
; SI-LABEL: v_fneg_multi_use_fp_extend_fneg_f16_to_f64:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
; SI-NEXT: v_cvt_f64_f32_e32 v[2:3], v0
; SI-NEXT: v_xor_b32_e32 v1, 0x80000000, v3
; SI-NEXT: v_mov_b32_e32 v0, v2
@@ -3711,6 +3673,7 @@ define { double, double } @v_fneg_multi_foldable_use_fp_extend_fneg_f16_to_f64(h
; SI-LABEL: v_fneg_multi_foldable_use_fp_extend_fneg_f16_to_f64:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
; SI-NEXT: v_cvt_f64_f32_e32 v[0:1], v0
; SI-NEXT: v_xor_b32_e32 v4, 0x80000000, v1
; SI-NEXT: v_mul_f64 v[2:3], v[0:1], 4.0
@@ -3765,8 +3728,9 @@ define { float, float } @v_fneg_multi_use_fp_extend_fneg_f16_to_f32(half %a) #0
; SI-LABEL: v_fneg_multi_use_fp_extend_fneg_f16_to_f32:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-NEXT: v_mov_b32_e32 v1, v0
-; SI-NEXT: v_xor_b32_e32 v0, 0x80000000, v1
+; SI-NEXT: v_cvt_f32_f16_e64 v2, -v0
+; SI-NEXT: v_cvt_f32_f16_e32 v1, v0
+; SI-NEXT: v_mov_b32_e32 v0, v2
; SI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: v_fneg_multi_use_fp_extend_fneg_f16_to_f32:
@@ -3856,7 +3820,7 @@ define half @v_fneg_fp_round_f64_to_f16(double %a) #0 {
; SI-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
; SI-NEXT: v_and_b32_e32 v1, 0x8000, v1
; SI-NEXT: v_or_b32_e32 v0, v1, v0
-; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
+; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0
; SI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: v_fneg_fp_round_f64_to_f16:
@@ -4008,7 +3972,7 @@ define half @v_fneg_fp_round_fneg_f64_to_f16(double %a) #0 {
; SI-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
; SI-NEXT: v_and_b32_e32 v1, 0x8000, v1
; SI-NEXT: v_or_b32_e32 v0, v1, v0
-; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
+; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0
; SI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: v_fneg_fp_round_fneg_f64_to_f16:
@@ -4120,48 +4084,48 @@ define { half, double } @v_fneg_fp_round_store_use_fneg_f64_to_f16(double %a) #0
; SI-NEXT: v_and_b32_e32 v0, 0x1ff, v1
; SI-NEXT: v_or_b32_e32 v0, v0, v3
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
-; SI-NEXT: v_lshrrev_b32_e32 v2, 8, v1
+; SI-NEXT: v_lshrrev_b32_e32 v4, 8, v1
; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
-; SI-NEXT: v_and_b32_e32 v2, 0xffe, v2
-; SI-NEXT: v_bfe_u32 v4, v1, 20, 11
+; SI-NEXT: v_and_b32_e32 v4, 0xffe, v4
+; SI-NEXT: v_bfe_u32 v5, v1, 20, 11
; SI-NEXT: s_movk_i32 s4, 0x3f1
-; SI-NEXT: v_or_b32_e32 v0, v2, v0
-; SI-NEXT: v_sub_i32_e32 v5, vcc, s4, v4
-; SI-NEXT: v_or_b32_e32 v2, 0x1000, v0
-; SI-NEXT: v_med3_i32 v5, v5, 0, 13
-; SI-NEXT: v_lshrrev_b32_e32 v6, v5, v2
-; SI-NEXT: v_lshlrev_b32_e32 v5, v5, v6
-; SI-NEXT: v_cmp_ne_u32_e32 vcc, v5, v2
+; SI-NEXT: v_or_b32_e32 v0, v4, v0
+; SI-NEXT: v_sub_i32_e32 v6, vcc, s4, v5
+; SI-NEXT: v_or_b32_e32 v4, 0x1000, v0
+; SI-NEXT: v_med3_i32 v6, v6, 0, 13
+; SI-NEXT: v_lshrrev_b32_e32 v7, v6, v4
+; SI-NEXT: v_lshlrev_b32_e32 v6, v6, v7
+; SI-NEXT: v_cmp_ne_u32_e32 vcc, v6, v4
; SI-NEXT: s_movk_i32 s4, 0xfc10
-; SI-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; SI-NEXT: v_add_i32_e32 v4, vcc, s4, v4
-; SI-NEXT: v_lshlrev_b32_e32 v5, 12, v4
-; SI-NEXT: v_or_b32_e32 v2, v6, v2
-; SI-NEXT: v_or_b32_e32 v5, v0, v5
-; SI-NEXT: v_cmp_gt_i32_e32 vcc, 1, v4
-; SI-NEXT: v_cndmask_b32_e32 v2, v5, v2, vcc
-; SI-NEXT: v_and_b32_e32 v5, 7, v2
-; SI-NEXT: v_cmp_lt_i32_e32 vcc, 5, v5
+; SI-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
+; SI-NEXT: v_add_i32_e32 v5, vcc, s4, v5
+; SI-NEXT: v_lshlrev_b32_e32 v6, 12, v5
+; SI-NEXT: v_or_b32_e32 v4, v7, v4
+; SI-NEXT: v_or_b32_e32 v6, v0, v6
+; SI-NEXT: v_cmp_gt_i32_e32 vcc, 1, v5
+; SI-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
+; SI-NEXT: v_and_b32_e32 v6, 7, v4
+; SI-NEXT: v_cmp_lt_i32_e32 vcc, 5, v6
+; SI-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
+; SI-NEXT: v_cmp_eq_u32_e32 vcc, 3, v6
; SI-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
-; SI-NEXT: v_cmp_eq_u32_e32 vcc, 3, v5
-; SI-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
-; SI-NEXT: v_or_b32_e32 v5, v5, v6
-; SI-NEXT: v_lshrrev_b32_e32 v2, 2, v2
-; SI-NEXT: v_add_i32_e32 v2, vcc, v2, v5
-; SI-NEXT: v_mov_b32_e32 v5, 0x7c00
-; SI-NEXT: v_cmp_gt_i32_e32 vcc, 31, v4
-; SI-NEXT: v_cndmask_b32_e32 v2, v5, v2, vcc
-; SI-NEXT: v_mov_b32_e32 v6, 0x7e00
+; SI-NEXT: v_or_b32_e32 v6, v6, v7
+; SI-NEXT: v_lshrrev_b32_e32 v4, 2, v4
+; SI-NEXT: v_add_i32_e32 v4, vcc, v4, v6
+; SI-NEXT: v_mov_b32_e32 v6, 0x7c00
+; SI-NEXT: v_cmp_gt_i32_e32 vcc, 31, v5
+; SI-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
+; SI-NEXT: v_mov_b32_e32 v7, 0x7e00
; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
; SI-NEXT: s_movk_i32 s4, 0x40f
-; SI-NEXT: v_cndmask_b32_e32 v0, v5, v6, vcc
-; SI-NEXT: v_cmp_eq_u32_e32 vcc, s4, v4
-; SI-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
-; SI-NEXT: v_lshrrev_b32_e32 v2, 16, v1
-; SI-NEXT: v_and_b32_e32 v2, 0x8000, v2
-; SI-NEXT: v_or_b32_e32 v0, v2, v0
-; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
; SI-NEXT: v_xor_b32_e32 v2, 0x80000000, v1
+; SI-NEXT: v_cndmask_b32_e32 v0, v6, v7, vcc
+; SI-NEXT: v_cmp_eq_u32_e32 vcc, s4, v5
+; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc
+; SI-NEXT: v_and_b32_e32 v1, 0x8000, v1
+; SI-NEXT: v_or_b32_e32 v0, v1, v0
+; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0
; SI-NEXT: v_mov_b32_e32 v1, v3
; SI-NEXT: s_setpc_b64 s[30:31]
;
@@ -4423,7 +4387,7 @@ define { half, double } @v_fneg_fp_round_multi_use_fneg_f64_to_f16(double %a, do
; SI-NEXT: v_lshrrev_b32_e32 v5, 16, v1
; SI-NEXT: v_and_b32_e32 v5, 0x8000, v5
; SI-NEXT: v_or_b32_e32 v4, v5, v4
-; SI-NEXT: v_cvt_f32_f16_e32 v4, v4
+; SI-NEXT: v_and_b32_e32 v4, 0xffff, v4
; SI-NEXT: v_mul_f64 v[1:2], -v[0:1], v[2:3]
; SI-NEXT: v_mov_b32_e32 v0, v4
; SI-NEXT: s_setpc_b64 s[30:31]
@@ -4582,10 +4546,9 @@ define { half, half } @v_fneg_multi_use_fp_round_fneg_f64_to_f16(double %a) #0 {
; SI-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; SI-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
; SI-NEXT: v_and_b32_e32 v1, 0x8000, v1
-; SI-NEXT: v_or_b32_e32 v1, v1, v0
-; SI-NEXT: v_and_b32_e32 v0, 0xffff, v1
-; SI-NEXT: v_cvt_f32_f16_e64 v0, -v0
-; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
+; SI-NEXT: v_or_b32_e32 v0, v1, v0
+; SI-NEXT: v_and_b32_e32 v1, 0xffff, v0
+; SI-NEXT: v_xor_b32_e32 v0, 0x8000, v1
; SI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: v_fneg_multi_use_fp_round_fneg_f64_to_f16:
@@ -4798,9 +4761,10 @@ define half @v_fneg_trunc_f16(half %a) #0 {
; SI-LABEL: v_fneg_trunc_f16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-NEXT: v_cvt_f16_f32_e64 v0, -v0
-; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
+; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; SI-NEXT: v_cvt_f32_f16_e64 v0, -v0
; SI-NEXT: v_trunc_f32_e32 v0, v0
+; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
; SI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: v_fneg_trunc_f16:
@@ -4837,7 +4801,6 @@ define half @v_fneg_round_f16(half %a) #0 {
; SI-SAFE-LABEL: v_fneg_round_f16:
; SI-SAFE: ; %bb.0:
; SI-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v0, v0
; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v0, v0
; SI-SAFE-NEXT: v_trunc_f32_e32 v1, v0
; SI-SAFE-NEXT: v_sub_f32_e32 v2, v0, v1
@@ -4846,13 +4809,12 @@ define half @v_fneg_round_f16(half %a) #0 {
; SI-SAFE-NEXT: s_brev_b32 s4, -2
; SI-SAFE-NEXT: v_bfi_b32 v0, s4, v2, v0
; SI-SAFE-NEXT: v_add_f32_e32 v0, v1, v0
-; SI-SAFE-NEXT: v_xor_b32_e32 v0, 0x80000000, v0
+; SI-SAFE-NEXT: v_cvt_f16_f32_e64 v0, -v0
; SI-SAFE-NEXT: s_setpc_b64 s[30:31]
;
; SI-NSZ-LABEL: v_fneg_round_f16:
; SI-NSZ: ; %bb.0:
; SI-NSZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-NSZ-NEXT: v_cvt_f16_f32_e32 v0, v0
; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v0, v0
; SI-NSZ-NEXT: v_trunc_f32_e32 v1, v0
; SI-NSZ-NEXT: v_sub_f32_e32 v2, v0, v1
@@ -4861,6 +4823,7 @@ define half @v_fneg_round_f16(half %a) #0 {
; SI-NSZ-NEXT: s_brev_b32 s4, -2
; SI-NSZ-NEXT: v_bfi_b32 v0, s4, v2, v0
; SI-NSZ-NEXT: v_sub_f32_e64 v0, -v1, v0
+; SI-NSZ-NEXT: v_cvt_f16_f32_e32 v0, v0
; SI-NSZ-NEXT: s_setpc_b64 s[30:31]
;
; VI-SAFE-LABEL: v_fneg_round_f16:
@@ -4964,9 +4927,10 @@ define half @v_fneg_rint_f16(half %a) #0 {
; SI-LABEL: v_fneg_rint_f16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-NEXT: v_cvt_f16_f32_e64 v0, -v0
-; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
+; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; SI-NEXT: v_cvt_f32_f16_e64 v0, -v0
; SI-NEXT: v_rndne_f32_e32 v0, v0
+; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
; SI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: v_fneg_rint_f16:
@@ -5003,9 +4967,10 @@ define half @v_fneg_nearbyint_f16(half %a) #0 {
; SI-LABEL: v_fneg_nearbyint_f16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-NEXT: v_cvt_f16_f32_e64 v0, -v0
-; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
+; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; SI-NEXT: v_cvt_f32_f16_e64 v0, -v0
; SI-NEXT: v_rndne_f32_e32 v0, v0
+; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
; SI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: v_fneg_nearbyint_f16:
@@ -5042,11 +5007,12 @@ define half @v_fneg_sin_f16(half %a) #0 {
; SI-LABEL: v_fneg_sin_f16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-NEXT: v_cvt_f16_f32_e64 v0, -v0
-; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
+; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; SI-NEXT: v_cvt_f32_f16_e64 v0, -v0
; SI-NEXT: v_mul_f32_e32 v0, 0x3e22f983, v0
; SI-NEXT: v_fract_f32_e32 v0, v0
; SI-NEXT: v_sin_f32_e32 v0, v0
+; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
; SI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: v_fneg_sin_f16:
@@ -5091,8 +5057,9 @@ define half @v_fneg_canonicalize_f16(half %a) #0 {
; SI-LABEL: v_fneg_canonicalize_f16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-NEXT: v_cvt_f16_f32_e64 v0, -v0
-; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
+; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; SI-NEXT: v_cvt_f32_f16_e64 v0, -v0
+; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
; SI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: v_fneg_canonicalize_f16:
@@ -5129,12 +5096,10 @@ define void @v_fneg_copytoreg_f16(ptr addrspace(1) %out, half %a, half %b, half
; SI-LABEL: v_fneg_copytoreg_f16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-NEXT: v_cvt_f16_f32_e32 v3, v3
-; SI-NEXT: v_cvt_f16_f32_e32 v2, v2
-; SI-NEXT: v_and_b32_e32 v6, 0x3ff, v31
-; SI-NEXT: v_lshlrev_b32_e32 v6, 1, v6
; SI-NEXT: v_cvt_f32_f16_e32 v3, v3
; SI-NEXT: v_cvt_f32_f16_e32 v2, v2
+; SI-NEXT: v_and_b32_e32 v6, 0x3ff, v31
+; SI-NEXT: v_lshlrev_b32_e32 v6, 1, v6
; SI-NEXT: v_add_i32_e32 v0, vcc, v0, v6
; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; SI-NEXT: v_mul_f32_e32 v2, v2, v3
@@ -5142,11 +5107,12 @@ define void @v_fneg_copytoreg_f16(ptr addrspace(1) %out, half %a, half %b, half
; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc
; SI-NEXT: s_cbranch_execz .LBB81_2
; SI-NEXT: ; %bb.1: ; %if
-; SI-NEXT: v_cvt_f16_f32_e64 v3, -v2
-; SI-NEXT: v_cvt_f16_f32_e32 v4, v4
-; SI-NEXT: v_cvt_f32_f16_e32 v3, v3
+; SI-NEXT: v_cvt_f32_f16_e32 v3, v4
+; SI-NEXT: v_cvt_f16_f32_e64 v4, -v2
+; SI-NEXT: v_cvt_f16_f32_e32 v3, v3
; SI-NEXT: v_cvt_f32_f16_e32 v4, v4
-; SI-NEXT: v_mul_f32_e32 v3, v3, v4
+; SI-NEXT: v_cvt_f32_f16_e32 v3, v3
+; SI-NEXT: v_mul_f32_e32 v3, v4, v3
; SI-NEXT: v_cvt_f16_f32_e32 v3, v3
; SI-NEXT: flat_store_short v[0:1], v3
; SI-NEXT: s_waitcnt vmcnt(0)
@@ -5269,14 +5235,13 @@ define half @v_fneg_inlineasm_f16(half %a, half %b, half %c, i32 %d) #0 {
; SI-LABEL: v_fneg_inlineasm_f16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
-; SI-NEXT: v_cvt_f16_f32_e64 v1, -v1
+; SI-NEXT: v_and_b32_e32 v1, 0xffff, v1
; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
-; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
+; SI-NEXT: v_cvt_f32_f16_e64 v1, -v1
; SI-NEXT: v_mul_f32_e32 v0, v0, v1
-; SI-NEXT: v_cvt_f16_f32_e32 v1, v0
+; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
; SI-NEXT: ;;#ASMSTART
-; SI-NEXT: ; use v1
+; SI-NEXT: ; use v0
; SI-NEXT: ;;#ASMEND
; SI-NEXT: s_setpc_b64 s[30:31]
;
@@ -5328,12 +5293,11 @@ define half @v_fneg_inlineasm_multi_use_src_f16(ptr addrspace(1) %out, half %a,
; SI-LABEL: v_fneg_inlineasm_multi_use_src_f16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-NEXT: v_cvt_f16_f32_e32 v0, v2
-; SI-NEXT: v_cvt_f16_f32_e64 v1, -v3
-; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
-; SI-NEXT: v_cvt_f32_f16_e64 v1, -v1
-; SI-NEXT: v_mul_f32_e32 v0, v0, v1
+; SI-NEXT: v_cvt_f32_f16_e32 v0, v3
+; SI-NEXT: v_cvt_f32_f16_e32 v1, v2
+; SI-NEXT: v_mul_f32_e32 v0, v1, v0
; SI-NEXT: v_cvt_f16_f32_e64 v1, -v0
+; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
; SI-NEXT: ;;#ASMSTART
; SI-NEXT: ; use v1
; SI-NEXT: ;;#ASMEND
@@ -5398,14 +5362,13 @@ define { half, half } @multiuse_fneg_2_vop3_users_f16(half %a, half %b, half %c)
; SI-LABEL: multiuse_fneg_2_vop3_users_f16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-NEXT: v_cvt_f16_f32_e32 v2, v2
-; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
-; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
; SI-NEXT: v_cvt_f32_f16_e32 v2, v2
; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
-; SI-NEXT: v_cvt_f32_f16_e32 v3, v0
-; SI-NEXT: v_fma_f32 v0, -v3, v1, v2
-; SI-NEXT: v_fma_f32 v1, -v3, v2, 2.0
+; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
+; SI-NEXT: v_fma_f32 v1, -v0, v1, v2
+; SI-NEXT: v_fma_f32 v2, -v0, v2, 2.0
+; SI-NEXT: v_cvt_f16_f32_e32 v0, v1
+; SI-NEXT: v_cvt_f16_f32_e32 v1, v2
; SI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: multiuse_fneg_2_vop3_users_f16:
@@ -5454,14 +5417,14 @@ define { half, half } @multiuse_fneg_2_vop2_users_f16(half %a, half %b, half %c)
; SI-LABEL: multiuse_fneg_2_vop2_users_f16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
-; SI-NEXT: v_cvt_f16_f32_e64 v0, -v0
-; SI-NEXT: v_cvt_f16_f32_e32 v2, v2
+; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0
; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
-; SI-NEXT: v_cvt_f32_f16_e32 v3, v0
+; SI-NEXT: v_cvt_f32_f16_e64 v0, -v0
; SI-NEXT: v_cvt_f32_f16_e32 v2, v2
-; SI-NEXT: v_mul_f32_e32 v0, v3, v1
-; SI-NEXT: v_mul_f32_e32 v1, v3, v2
+; SI-NEXT: v_mul_f32_e32 v1, v0, v1
+; SI-NEXT: v_mul_f32_e32 v2, v0, v2
+; SI-NEXT: v_cvt_f16_f32_e32 v0, v1
+; SI-NEXT: v_cvt_f16_f32_e32 v1, v2
; SI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: multiuse_fneg_2_vop2_users_f16:
@@ -5509,14 +5472,14 @@ define { half, half } @multiuse_fneg_vop2_vop3_users_f16(ptr addrspace(1) %out,
; SI-LABEL: multiuse_fneg_vop2_vop3_users_f16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-NEXT: v_cvt_f16_f32_e32 v0, v3
-; SI-NEXT: v_cvt_f16_f32_e64 v1, -v2
-; SI-NEXT: v_cvt_f16_f32_e32 v2, v4
-; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
-; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
-; SI-NEXT: v_cvt_f32_f16_e32 v2, v2
-; SI-NEXT: v_fma_f32 v0, v1, v0, 2.0
-; SI-NEXT: v_mul_f32_e32 v1, v1, v2
+; SI-NEXT: v_and_b32_e32 v0, 0xffff, v2
+; SI-NEXT: v_cvt_f32_f16_e32 v1, v3
+; SI-NEXT: v_cvt_f32_f16_e64 v0, -v0
+; SI-NEXT: v_cvt_f32_f16_e32 v2, v4
+; SI-NEXT: v_fma_f32 v1, v0, v1, 2.0
+; SI-NEXT: v_mul_f32_e32 v2, v0, v2
+; SI-NEXT: v_cvt_f16_f32_e32 v0, v1
+; SI-NEXT: v_cvt_f16_f32_e32 v1, v2
; SI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: multiuse_fneg_vop2_vop3_users_f16:
@@ -5563,33 +5526,29 @@ define { half, half } @free_fold_src_code_size_cost_use_f16(ptr addrspace(1) %ou
; SI-SAFE-LABEL: free_fold_src_code_size_cost_use_f16:
; SI-SAFE: ; %bb.0:
; SI-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v0, v3
+; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v0, v3
+; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v1, v2
+; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v2, v4
+; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v3, v5
+; SI-SAFE-NEXT: v_fma_f32 v0, v1, v0, 2.0
+; SI-SAFE-NEXT: v_mul_f32_e64 v1, -v0, v2
+; SI-SAFE-NEXT: v_mul_f32_e64 v2, -v0, v3
+; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v0, v1
; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v1, v2
-; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v2, v4
-; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v3, v5
-; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v0, v0
-; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v1, v1
-; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v2, v2
-; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v3, v3
-; SI-SAFE-NEXT: v_fma_f32 v1, v1, v0, 2.0
-; SI-SAFE-NEXT: v_mul_f32_e64 v0, -v1, v2
-; SI-SAFE-NEXT: v_mul_f32_e64 v1, -v1, v3
; SI-SAFE-NEXT: s_setpc_b64 s[30:31]
;
; SI-NSZ-LABEL: free_fold_src_code_size_cost_use_f16:
; SI-NSZ: ; %bb.0:
; SI-NSZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-NSZ-NEXT: v_cvt_f16_f32_e32 v0, v3
+; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v0, v3
+; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v1, v2
+; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v2, v4
+; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v3, v5
+; SI-NSZ-NEXT: v_fma_f32 v0, v1, -v0, -2.0
+; SI-NSZ-NEXT: v_mul_f32_e32 v1, v0, v2
+; SI-NSZ-NEXT: v_mul_f32_e32 v2, v0, v3
+; SI-NSZ-NEXT: v_cvt_f16_f32_e32 v0, v1
; SI-NSZ-NEXT: v_cvt_f16_f32_e32 v1, v2
-; SI-NSZ-NEXT: v_cvt_f16_f32_e32 v2, v4
-; SI-NSZ-NEXT: v_cvt_f16_f32_e32 v3, v5
-; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v0, v0
-; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v1, v1
-; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v2, v2
-; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v3, v3
-; SI-NSZ-NEXT: v_fma_f32 v1, v1, -v0, -2.0
-; SI-NSZ-NEXT: v_mul_f32_e32 v0, v1, v2
-; SI-NSZ-NEXT: v_mul_f32_e32 v1, v1, v3
; SI-NSZ-NEXT: s_setpc_b64 s[30:31]
;
; VI-SAFE-LABEL: free_fold_src_code_size_cost_use_f16:
@@ -5661,14 +5620,12 @@ define half @one_use_cost_to_fold_into_src_f16(ptr addrspace(1) %out, half %a, h
; SI-LABEL: one_use_cost_to_fold_into_src_f16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-NEXT: v_cvt_f16_f32_e32 v1, v2
-; SI-NEXT: v_cvt_f16_f32_e32 v0, v4
-; SI-NEXT: v_cvt_f16_f32_e32 v2, v3
-; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
-; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
-; SI-NEXT: v_cvt_f32_f16_e32 v2, v2
+; SI-NEXT: v_cvt_f32_f16_e32 v1, v2
+; SI-NEXT: v_cvt_f32_f16_e32 v0, v4
+; SI-NEXT: v_cvt_f32_f16_e32 v2, v3
; SI-NEXT: v_trunc_f32_e32 v1, v1
; SI-NEXT: v_fma_f32 v0, -v1, v2, v0
+; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
; SI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: one_use_cost_to_fold_into_src_f16:
@@ -5713,17 +5670,15 @@ define { half, half } @multi_use_cost_to_fold_into_src(ptr addrspace(1) %out, ha
; SI-LABEL: multi_use_cost_to_fold_into_src:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-NEXT: v_cvt_f16_f32_e32 v1, v2
-; SI-NEXT: v_cvt_f16_f32_e32 v0, v4
-; SI-NEXT: v_cvt_f16_f32_e32 v2, v3
-; SI-NEXT: v_cvt_f16_f32_e32 v3, v5
-; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
-; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
-; SI-NEXT: v_cvt_f32_f16_e32 v2, v2
-; SI-NEXT: v_cvt_f32_f16_e32 v3, v3
+; SI-NEXT: v_cvt_f32_f16_e32 v1, v2
+; SI-NEXT: v_cvt_f32_f16_e32 v0, v4
+; SI-NEXT: v_cvt_f32_f16_e32 v2, v3
+; SI-NEXT: v_cvt_f32_f16_e32 v3, v5
; SI-NEXT: v_trunc_f32_e32 v1, v1
; SI-NEXT: v_fma_f32 v0, -v1, v2, v0
; SI-NEXT: v_mul_f32_e32 v1, v1, v3
+; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
+; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
; SI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: multi_use_cost_to_fold_into_src:
@@ -5841,11 +5796,11 @@ define half @nnan_fmul_neg1_to_fneg(half %x, half %y) #0 {
; SI-LABEL: nnan_fmul_neg1_to_fneg:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
-; SI-NEXT: v_cvt_f16_f32_e64 v0, -v0
+; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0
; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
-; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
+; SI-NEXT: v_cvt_f32_f16_e64 v0, -v0
; SI-NEXT: v_mul_f32_e32 v0, v0, v1
+; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
; SI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: nnan_fmul_neg1_to_fneg:
@@ -5880,11 +5835,11 @@ define half @denormal_fmul_neg1_to_fneg(half %x, half %y) {
; SI-LABEL: denormal_fmul_neg1_to_fneg:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
-; SI-NEXT: v_cvt_f16_f32_e64 v0, -v0
+; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0
; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
-; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
+; SI-NEXT: v_cvt_f32_f16_e64 v0, -v0
; SI-NEXT: v_mul_f32_e32 v0, v0, v1
+; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
; SI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: denormal_fmul_neg1_to_fneg:
@@ -5918,13 +5873,13 @@ define half @denorm_snan_fmul_neg1_to_fneg(half %x, half %y) {
; SI-LABEL: denorm_snan_fmul_neg1_to_fneg:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
-; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
-; SI-NEXT: v_cvt_f32_f16_e32 v2, v0
-; SI-NEXT: v_cvt_f32_f16_e64 v0, -v0
+; SI-NEXT: v_and_b32_e32 v2, 0xffff, v0
+; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
+; SI-NEXT: v_cvt_f32_f16_e64 v2, -v2
; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
-; SI-NEXT: v_mul_f32_e32 v0, v2, v0
+; SI-NEXT: v_mul_f32_e32 v0, v0, v2
; SI-NEXT: v_mul_f32_e32 v0, v0, v1
+; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
; SI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: denorm_snan_fmul_neg1_to_fneg:
@@ -5965,11 +5920,11 @@ define half @flush_snan_fmul_neg1_to_fneg(half %x, half %y) #0 {
; SI-LABEL: flush_snan_fmul_neg1_to_fneg:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
-; SI-NEXT: v_cvt_f16_f32_e64 v0, -v0
+; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0
; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
-; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
+; SI-NEXT: v_cvt_f32_f16_e64 v0, -v0
; SI-NEXT: v_mul_f32_e32 v0, v0, v1
+; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
; SI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: flush_snan_fmul_neg1_to_fneg:
@@ -6010,15 +5965,13 @@ define half @fadd_select_fneg_fneg_f16(i32 %arg0, half %x, half %y, half %z) {
; SI-LABEL: fadd_select_fneg_fneg_f16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
-; SI-NEXT: v_cvt_f16_f32_e32 v2, v2
-; SI-NEXT: v_cvt_f16_f32_e32 v3, v3
-; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
; SI-NEXT: v_cvt_f32_f16_e32 v2, v2
; SI-NEXT: v_cvt_f32_f16_e32 v3, v3
+; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; SI-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
; SI-NEXT: v_sub_f32_e32 v0, v3, v0
+; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
; SI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: fadd_select_fneg_fneg_f16:
diff --git a/llvm/test/CodeGen/AMDGPU/fneg-combines.ll b/llvm/test/CodeGen/AMDGPU/fneg-combines.ll
index bdea710725ace..5d23f648f707b 100644
--- a/llvm/test/CodeGen/AMDGPU/fneg-combines.ll
+++ b/llvm/test/CodeGen/AMDGPU/fneg-combines.ll
@@ -7777,15 +7777,13 @@ define half @fadd_select_fneg_fneg_f16(i32 %arg0, half %x, half %y, half %z) {
; SI-LABEL: fadd_select_fneg_fneg_f16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
-; SI-NEXT: v_cvt_f16_f32_e32 v2, v2
-; SI-NEXT: v_cvt_f16_f32_e32 v3, v3
-; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
; SI-NEXT: v_cvt_f32_f16_e32 v2, v2
; SI-NEXT: v_cvt_f32_f16_e32 v3, v3
+; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; SI-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
; SI-NEXT: v_sub_f32_e32 v0, v3, v0
+; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
; SI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: fadd_select_fneg_fneg_f16:
diff --git a/llvm/test/CodeGen/AMDGPU/fneg-combines.new.ll b/llvm/test/CodeGen/AMDGPU/fneg-combines.new.ll
index 58adbd4d0d250..afe0b8c3b392b 100644
--- a/llvm/test/CodeGen/AMDGPU/fneg-combines.new.ll
+++ b/llvm/test/CodeGen/AMDGPU/fneg-combines.new.ll
@@ -938,9 +938,10 @@ define half @v_fneg_inv2pi_minnum_f16(half %a) #0 {
; SI-LABEL: v_fneg_inv2pi_minnum_f16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-NEXT: v_cvt_f16_f32_e64 v0, -v0
-; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
+; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; SI-NEXT: v_cvt_f32_f16_e64 v0, -v0
; SI-NEXT: v_max_f32_e32 v0, 0xbe230000, v0
+; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
; SI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: v_fneg_inv2pi_minnum_f16:
@@ -959,9 +960,10 @@ define half @v_fneg_neg_inv2pi_minnum_f16(half %a) #0 {
; SI-LABEL: v_fneg_neg_inv2pi_minnum_f16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-NEXT: v_cvt_f16_f32_e64 v0, -v0
-; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
+; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; SI-NEXT: v_cvt_f32_f16_e64 v0, -v0
; SI-NEXT: v_max_f32_e32 v0, 0x3e230000, v0
+; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
; SI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: v_fneg_neg_inv2pi_minnum_f16:
@@ -1489,12 +1491,13 @@ define half @v_fneg_inv2pi_minimum_f16(half %a) #0 {
; SI-LABEL: v_fneg_inv2pi_minimum_f16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-NEXT: v_cvt_f16_f32_e64 v0, -v0
+; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; SI-NEXT: v_cvt_f32_f16_e64 v0, -v0
; SI-NEXT: v_mov_b32_e32 v1, 0x7fc00000
-; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
; SI-NEXT: v_max_f32_e32 v2, 0xbe230000, v0
; SI-NEXT: v_cmp_o_f32_e32 vcc, v0, v0
; SI-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
+; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
; SI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: v_fneg_inv2pi_minimum_f16:
@@ -1515,12 +1518,13 @@ define half @v_fneg_neg_inv2pi_minimum_f16(half %a) #0 {
; SI-LABEL: v_fneg_neg_inv2pi_minimum_f16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-NEXT: v_cvt_f16_f32_e64 v0, -v0
+; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; SI-NEXT: v_cvt_f32_f16_e64 v0, -v0
; SI-NEXT: v_mov_b32_e32 v1, 0x7fc00000
-; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
; SI-NEXT: v_max_f32_e32 v2, 0x3e230000, v0
; SI-NEXT: v_cmp_o_f32_e32 vcc, v0, v0
; SI-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
+; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
; SI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: v_fneg_neg_inv2pi_minimum_f16:
@@ -2080,9 +2084,10 @@ define half @v_fneg_inv2pi_minimumnum_f16(half %a) #0 {
; SI-LABEL: v_fneg_inv2pi_minimumnum_f16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-NEXT: v_cvt_f16_f32_e64 v0, -v0
-; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
+; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; SI-NEXT: v_cvt_f32_f16_e64 v0, -v0
; SI-NEXT: v_max_f32_e32 v0, 0xbe230000, v0
+; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
; SI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: v_fneg_inv2pi_minimumnum_f16:
@@ -2101,9 +2106,10 @@ define half @v_fneg_neg_inv2pi_minimumnum_f16(half %a) #0 {
; SI-LABEL: v_fneg_neg_inv2pi_minimumnum_f16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-NEXT: v_cvt_f16_f32_e64 v0, -v0
-; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
+; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; SI-NEXT: v_cvt_f32_f16_e64 v0, -v0
; SI-NEXT: v_max_f32_e32 v0, 0x3e230000, v0
+; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
; SI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: v_fneg_neg_inv2pi_minimumnum_f16:
@@ -2809,8 +2815,9 @@ define { float, float } @v_fneg_multi_use_fp_extend_fneg_f16_to_f32(half %a) #0
; SI-LABEL: v_fneg_multi_use_fp_extend_fneg_f16_to_f32:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-NEXT: v_mov_b32_e32 v1, v0
-; SI-NEXT: v_xor_b32_e32 v0, 0x80000000, v1
+; SI-NEXT: v_cvt_f32_f16_e64 v2, -v0
+; SI-NEXT: v_cvt_f32_f16_e32 v1, v0
+; SI-NEXT: v_mov_b32_e32 v0, v2
; SI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: v_fneg_multi_use_fp_extend_fneg_f16_to_f32:
@@ -2830,9 +2837,9 @@ define { float, float } @v_fneg_multi_foldable_use_fp_extend_fneg_f16_to_f32(hal
; SI-LABEL: v_fneg_multi_foldable_use_fp_extend_fneg_f16_to_f32:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-NEXT: v_xor_b32_e32 v2, 0x80000000, v0
-; SI-NEXT: v_mul_f32_e32 v1, 4.0, v0
-; SI-NEXT: v_mov_b32_e32 v0, v2
+; SI-NEXT: v_cvt_f32_f16_e32 v1, v0
+; SI-NEXT: v_cvt_f32_f16_e64 v0, -v0
+; SI-NEXT: v_mul_f32_e32 v1, 4.0, v1
; SI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: v_fneg_multi_foldable_use_fp_extend_fneg_f16_to_f32:
@@ -2921,36 +2928,22 @@ define { float, double } @v_fneg_fp_round_multi_use_fneg_f64_to_f32(double %a, d
}
define half @v_fneg_fp_round_f32_to_f16(float %a) #0 {
-; SI-LABEL: v_fneg_fp_round_f32_to_f16:
-; SI: ; %bb.0:
-; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-NEXT: v_cvt_f16_f32_e64 v0, -v0
-; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
-; SI-NEXT: s_setpc_b64 s[30:31]
-;
-; VI-LABEL: v_fneg_fp_round_f32_to_f16:
-; VI: ; %bb.0:
-; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-NEXT: v_cvt_f16_f32_e64 v0, -v0
-; VI-NEXT: s_setpc_b64 s[30:31]
+; GCN-LABEL: v_fneg_fp_round_f32_to_f16:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_cvt_f16_f32_e64 v0, -v0
+; GCN-NEXT: s_setpc_b64 s[30:31]
%fpround = fptrunc float %a to half
%fneg = fneg half %fpround
ret half %fneg
}
define half @v_fneg_fp_round_fneg_f32_to_f16(float %a) #0 {
-; SI-LABEL: v_fneg_fp_round_fneg_f32_to_f16:
-; SI: ; %bb.0:
-; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
-; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
-; SI-NEXT: s_setpc_b64 s[30:31]
-;
-; VI-LABEL: v_fneg_fp_round_fneg_f32_to_f16:
-; VI: ; %bb.0:
-; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-NEXT: v_cvt_f16_f32_e32 v0, v0
-; VI-NEXT: s_setpc_b64 s[30:31]
+; GCN-LABEL: v_fneg_fp_round_fneg_f32_to_f16:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GCN-NEXT: s_setpc_b64 s[30:31]
%fneg.a = fneg float %a
%fpround = fptrunc float %fneg.a to half
%fneg = fneg half %fpround
@@ -2972,22 +2965,13 @@ define { float, float } @v_fneg_multi_use_fp_round_fneg_f64_to_f32(double %a) #0
}
define { half, float } @v_fneg_fp_round_store_use_fneg_f32_to_f16(float %a) #0 {
-; SI-LABEL: v_fneg_fp_round_store_use_fneg_f32_to_f16:
-; SI: ; %bb.0:
-; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-NEXT: v_cvt_f16_f32_e32 v1, v0
-; SI-NEXT: v_cvt_f32_f16_e32 v2, v1
-; SI-NEXT: v_xor_b32_e32 v1, 0x80000000, v0
-; SI-NEXT: v_mov_b32_e32 v0, v2
-; SI-NEXT: s_setpc_b64 s[30:31]
-;
-; VI-LABEL: v_fneg_fp_round_store_use_fneg_f32_to_f16:
-; VI: ; %bb.0:
-; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-NEXT: v_cvt_f16_f32_e32 v2, v0
-; VI-NEXT: v_xor_b32_e32 v1, 0x80000000, v0
-; VI-NEXT: v_mov_b32_e32 v0, v2
-; VI-NEXT: s_setpc_b64 s[30:31]
+; GCN-LABEL: v_fneg_fp_round_store_use_fneg_f32_to_f16:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_cvt_f16_f32_e32 v2, v0
+; GCN-NEXT: v_xor_b32_e32 v1, 0x80000000, v0
+; GCN-NEXT: v_mov_b32_e32 v0, v2
+; GCN-NEXT: s_setpc_b64 s[30:31]
%fneg.a = fneg float %a
%fpround = fptrunc float %fneg.a to half
%fneg = fneg half %fpround
@@ -2997,22 +2981,13 @@ define { half, float } @v_fneg_fp_round_store_use_fneg_f32_to_f16(float %a) #0 {
}
define { half, float } @v_fneg_fp_round_multi_use_fneg_f32_to_f16(float %a, float %c) #0 {
-; SI-LABEL: v_fneg_fp_round_multi_use_fneg_f32_to_f16:
-; SI: ; %bb.0:
-; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-NEXT: v_cvt_f16_f32_e32 v2, v0
-; SI-NEXT: v_mul_f32_e64 v1, -v0, v1
-; SI-NEXT: v_cvt_f32_f16_e32 v2, v2
-; SI-NEXT: v_mov_b32_e32 v0, v2
-; SI-NEXT: s_setpc_b64 s[30:31]
-;
-; VI-LABEL: v_fneg_fp_round_multi_use_fneg_f32_to_f16:
-; VI: ; %bb.0:
-; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-NEXT: v_cvt_f16_f32_e32 v2, v0
-; VI-NEXT: v_mul_f32_e64 v1, -v0, v1
-; VI-NEXT: v_mov_b32_e32 v0, v2
-; VI-NEXT: s_setpc_b64 s[30:31]
+; GCN-LABEL: v_fneg_fp_round_multi_use_fneg_f32_to_f16:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_cvt_f16_f32_e32 v2, v0
+; GCN-NEXT: v_mul_f32_e64 v1, -v0, v1
+; GCN-NEXT: v_mov_b32_e32 v0, v2
+; GCN-NEXT: s_setpc_b64 s[30:31]
%fneg.a = fneg float %a
%fpround = fptrunc float %fneg.a to half
%fneg = fneg half %fpround
@@ -3784,15 +3759,13 @@ define half @fadd_select_fneg_fneg_f16(i32 %arg0, half %x, half %y, half %z) {
; SI-LABEL: fadd_select_fneg_fneg_f16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
-; SI-NEXT: v_cvt_f16_f32_e32 v2, v2
-; SI-NEXT: v_cvt_f16_f32_e32 v3, v3
-; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
; SI-NEXT: v_cvt_f32_f16_e32 v2, v2
; SI-NEXT: v_cvt_f32_f16_e32 v3, v3
+; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; SI-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
; SI-NEXT: v_sub_f32_e32 v0, v3, v0
+; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
; SI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: fadd_select_fneg_fneg_f16:
@@ -4176,12 +4149,12 @@ define half @v_fneg_select_infloop_regression_f16(half %arg, i1 %arg1) {
; SI-LABEL: v_fneg_select_infloop_regression_f16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
+; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
; SI-NEXT: v_and_b32_e32 v1, 1, v1
; SI-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1
-; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
; SI-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc
; SI-NEXT: v_cndmask_b32_e64 v0, -v0, 0, vcc
+; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
; SI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: v_fneg_select_infloop_regression_f16:
diff --git a/llvm/test/CodeGen/AMDGPU/fneg-modifier-casting.ll b/llvm/test/CodeGen/AMDGPU/fneg-modifier-casting.ll
index e5c34f695f9a7..d9dea4f1fd6e7 100644
--- a/llvm/test/CodeGen/AMDGPU/fneg-modifier-casting.ll
+++ b/llvm/test/CodeGen/AMDGPU/fneg-modifier-casting.ll
@@ -601,16 +601,16 @@ define half @select_fneg_select_f16(i1 %cond0, i1 %cond1, half %arg0, half %arg1
; GFX7-LABEL: select_fneg_select_f16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3
-; GFX7-NEXT: v_cvt_f16_f32_e64 v2, -v2
+; GFX7-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3
+; GFX7-NEXT: v_cvt_f32_f16_e64 v2, -v2
; GFX7-NEXT: v_and_b32_e32 v0, 1, v0
; GFX7-NEXT: v_and_b32_e32 v1, 1, v1
-; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v3
-; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2
; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
; GFX7-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc
; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1
; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, -v0, vcc
+; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: select_fneg_select_f16:
@@ -1320,11 +1320,9 @@ define double @fneg_f64_bitcast_build_vector_v4f16_to_f64(half %elt0, half %elt1
; GFX7-LABEL: fneg_f64_bitcast_build_vector_v4f16_to_f64:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3
-; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2
-; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1
-; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX7-NEXT: v_and_b32_e32 v2, 0xffff, v2
; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX7-NEXT: v_or_b32_e32 v2, v2, v3
; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; GFX7-NEXT: v_or_b32_e32 v0, v0, v1
@@ -1370,14 +1368,12 @@ define double @fneg_f64_bitcast_build_vector_v4bf16_to_f64(bfloat %elt0, bfloat
; GFX7-LABEL: fneg_f64_bitcast_build_vector_v4bf16_to_f64:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
-; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GFX7-NEXT: v_alignbit_b32 v2, v3, v2, 16
-; GFX7-NEXT: v_alignbit_b32 v0, v1, v0, 16
+; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX7-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; GFX7-NEXT: v_or_b32_e32 v2, v2, v3
+; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX7-NEXT: v_or_b32_e32 v0, v0, v1
; GFX7-NEXT: v_xor_b32_e32 v1, 0x80000000, v2
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
diff --git a/llvm/test/CodeGen/AMDGPU/fneg.ll b/llvm/test/CodeGen/AMDGPU/fneg.ll
index c277f3b546c6b..4fe0882a19f18 100644
--- a/llvm/test/CodeGen/AMDGPU/fneg.ll
+++ b/llvm/test/CodeGen/AMDGPU/fneg.ll
@@ -605,6 +605,7 @@ define half @v_fneg_i16_fp_use(i16 %in) {
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
; SI-NEXT: v_sub_f32_e32 v0, 2.0, v0
+; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
; SI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: v_fneg_i16_fp_use:
diff --git a/llvm/test/CodeGen/AMDGPU/fpow.ll b/llvm/test/CodeGen/AMDGPU/fpow.ll
index fd7816e7df1d9..3f1aea2e3773d 100644
--- a/llvm/test/CodeGen/AMDGPU/fpow.ll
+++ b/llvm/test/CodeGen/AMDGPU/fpow.ll
@@ -136,13 +136,12 @@ define half @v_pow_f16(half %x, half %y) {
; GFX6-LABEL: v_pow_f16:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1
; GFX6-NEXT: v_log_f32_e32 v0, v0
; GFX6-NEXT: v_mul_legacy_f32_e32 v0, v1, v0
; GFX6-NEXT: v_exp_f32_e32 v0, v0
+; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_pow_f16:
diff --git a/llvm/test/CodeGen/AMDGPU/fract-match.ll b/llvm/test/CodeGen/AMDGPU/fract-match.ll
index a2bd98d3d7b27..5ff03c8dd4543 100644
--- a/llvm/test/CodeGen/AMDGPU/fract-match.ll
+++ b/llvm/test/CodeGen/AMDGPU/fract-match.ll
@@ -1578,21 +1578,21 @@ define half @basic_fract_f16_nonan(half nofpclass(nan) %x) {
; GFX6-LABEL: basic_fract_f16_nonan:
; GFX6: ; %bb.0: ; %entry
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX6-NEXT: v_floor_f32_e32 v1, v0
; GFX6-NEXT: v_sub_f32_e32 v0, v0, v1
; GFX6-NEXT: v_min_f32_e32 v0, 0x3f7fe000, v0
+; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: basic_fract_f16_nonan:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX7-NEXT: v_floor_f32_e32 v1, v0
; GFX7-NEXT: v_sub_f32_e32 v0, v0, v1
; GFX7-NEXT: v_min_f32_e32 v0, 0x3f7fe000, v0
+; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: basic_fract_f16_nonan:
@@ -1859,18 +1859,18 @@ define half @safe_math_fract_f16_noinf_check(half %x, ptr addrspace(1) writeonly
; GFX6-LABEL: safe_math_fract_f16_noinf_check:
; GFX6: ; %bb.0: ; %entry
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX6-NEXT: s_mov_b32 s6, 0
; GFX6-NEXT: s_mov_b32 s7, 0xf000
; GFX6-NEXT: s_mov_b32 s4, s6
-; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX6-NEXT: s_mov_b32 s5, s6
; GFX6-NEXT: v_floor_f32_e32 v3, v0
; GFX6-NEXT: v_sub_f32_e32 v4, v0, v3
-; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3
; GFX6-NEXT: v_min_f32_e32 v4, 0x3f7fe000, v4
; GFX6-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX6-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc
+; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX6-NEXT: s_mov_b32 s5, s6
; GFX6-NEXT: buffer_store_short v3, v[1:2], s[4:7], 0 addr64
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; GFX6-NEXT: s_setpc_b64 s[30:31]
@@ -1878,18 +1878,18 @@ define half @safe_math_fract_f16_noinf_check(half %x, ptr addrspace(1) writeonly
; GFX7-LABEL: safe_math_fract_f16_noinf_check:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX7-NEXT: s_mov_b32 s6, 0
; GFX7-NEXT: s_mov_b32 s7, 0xf000
; GFX7-NEXT: s_mov_b32 s4, s6
-; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX7-NEXT: s_mov_b32 s5, s6
; GFX7-NEXT: v_floor_f32_e32 v3, v0
; GFX7-NEXT: v_sub_f32_e32 v4, v0, v3
-; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3
; GFX7-NEXT: v_min_f32_e32 v4, 0x3f7fe000, v4
; GFX7-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
; GFX7-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc
+; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX7-NEXT: s_mov_b32 s5, s6
; GFX7-NEXT: buffer_store_short v3, v[1:2], s[4:7], 0 addr64
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: s_setpc_b64 s[30:31]
@@ -2486,46 +2486,46 @@ define half @safe_math_fract_f16(half %x, ptr addrspace(1) writeonly captures(no
; GFX6-LABEL: safe_math_fract_f16:
; GFX6: ; %bb.0: ; %entry
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX6-NEXT: s_movk_i32 s8, 0x7c00
-; GFX6-NEXT: s_mov_b32 s6, 0
-; GFX6-NEXT: s_mov_b32 s7, 0xf000
; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v0
; GFX6-NEXT: v_and_b32_e32 v0, 0x7fff, v0
-; GFX6-NEXT: s_mov_b32 s4, s6
-; GFX6-NEXT: s_mov_b32 s5, s6
+; GFX6-NEXT: s_movk_i32 s4, 0x7c00
+; GFX6-NEXT: s_mov_b32 s6, 0
; GFX6-NEXT: v_floor_f32_e32 v4, v3
; GFX6-NEXT: v_sub_f32_e32 v5, v3, v4
-; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4
; GFX6-NEXT: v_min_f32_e32 v5, 0x3f7fe000, v5
; GFX6-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
; GFX6-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc
-; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, s8, v0
+; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, s4, v0
; GFX6-NEXT: v_cndmask_b32_e32 v0, 0, v3, vcc
-; GFX6-NEXT: buffer_store_short v4, v[1:2], s[4:7], 0 addr64
+; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v4
+; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX6-NEXT: s_mov_b32 s7, 0xf000
+; GFX6-NEXT: s_mov_b32 s4, s6
+; GFX6-NEXT: s_mov_b32 s5, s6
+; GFX6-NEXT: buffer_store_short v3, v[1:2], s[4:7], 0 addr64
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; GFX6-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: safe_math_fract_f16:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX7-NEXT: s_movk_i32 s8, 0x7c00
-; GFX7-NEXT: s_mov_b32 s6, 0
-; GFX7-NEXT: s_mov_b32 s7, 0xf000
; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v0
; GFX7-NEXT: v_and_b32_e32 v0, 0x7fff, v0
-; GFX7-NEXT: s_mov_b32 s4, s6
-; GFX7-NEXT: s_mov_b32 s5, s6
+; GFX7-NEXT: s_movk_i32 s4, 0x7c00
+; GFX7-NEXT: s_mov_b32 s6, 0
; GFX7-NEXT: v_floor_f32_e32 v4, v3
; GFX7-NEXT: v_sub_f32_e32 v5, v3, v4
-; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4
; GFX7-NEXT: v_min_f32_e32 v5, 0x3f7fe000, v5
; GFX7-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
; GFX7-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc
-; GFX7-NEXT: v_cmp_ne_u32_e32 vcc, s8, v0
+; GFX7-NEXT: v_cmp_ne_u32_e32 vcc, s4, v0
; GFX7-NEXT: v_cndmask_b32_e32 v0, 0, v3, vcc
-; GFX7-NEXT: buffer_store_short v4, v[1:2], s[4:7], 0 addr64
+; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v4
+; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX7-NEXT: s_mov_b32 s7, 0xf000
+; GFX7-NEXT: s_mov_b32 s4, s6
+; GFX7-NEXT: s_mov_b32 s5, s6
+; GFX7-NEXT: buffer_store_short v3, v[1:2], s[4:7], 0 addr64
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
diff --git a/llvm/test/CodeGen/AMDGPU/function-args.ll b/llvm/test/CodeGen/AMDGPU/function-args.ll
index 1117e7f74f11c..7ff700d2cd101 100644
--- a/llvm/test/CodeGen/AMDGPU/function-args.ll
+++ b/llvm/test/CodeGen/AMDGPU/function-args.ll
@@ -409,24 +409,14 @@ define void @void_func_i64(i64 %arg0) #0 {
}
define void @void_func_f16(half %arg0) #0 {
-; CI-LABEL: void_func_f16:
-; CI: ; %bb.0:
-; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CI-NEXT: v_cvt_f16_f32_e32 v0, v0
-; CI-NEXT: s_mov_b32 s7, 0xf000
-; CI-NEXT: s_mov_b32 s6, -1
-; CI-NEXT: buffer_store_short v0, off, s[4:7], 0
-; CI-NEXT: s_waitcnt vmcnt(0)
-; CI-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX89-LABEL: void_func_f16:
-; GFX89: ; %bb.0:
-; GFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX89-NEXT: s_mov_b32 s7, 0xf000
-; GFX89-NEXT: s_mov_b32 s6, -1
-; GFX89-NEXT: buffer_store_short v0, off, s[4:7], 0
-; GFX89-NEXT: s_waitcnt vmcnt(0)
-; GFX89-NEXT: s_setpc_b64 s[30:31]
+; CIGFX89-LABEL: void_func_f16:
+; CIGFX89: ; %bb.0:
+; CIGFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CIGFX89-NEXT: s_mov_b32 s7, 0xf000
+; CIGFX89-NEXT: s_mov_b32 s6, -1
+; CIGFX89-NEXT: buffer_store_short v0, off, s[4:7], 0
+; CIGFX89-NEXT: s_waitcnt vmcnt(0)
+; CIGFX89-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: void_func_f16:
; GFX11: ; %bb.0:
@@ -2686,11 +2676,11 @@ define void @void_func_v32i32_i1_i8_i16_bf16(<32 x i32> %arg0, i1 %arg1, i8 %arg
; CI: ; %bb.0:
; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CI-NEXT: buffer_load_dword v31, off, s[0:3], s32
-; CI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:20
-; CI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:16
-; CI-NEXT: buffer_load_ubyte v34, off, s[0:3], s32 offset:4
-; CI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:8
-; CI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:12
+; CI-NEXT: buffer_load_ubyte v32, off, s[0:3], s32 offset:4
+; CI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8
+; CI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:12
+; CI-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:16
+; CI-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:20
; CI-NEXT: s_mov_b32 s7, 0xf000
; CI-NEXT: s_mov_b32 s6, -1
; CI-NEXT: s_waitcnt vmcnt(5)
@@ -2704,25 +2694,22 @@ define void @void_func_v32i32_i1_i8_i16_bf16(<32 x i32> %arg0, i1 %arg1, i8 %arg
; CI-NEXT: s_waitcnt vmcnt(0)
; CI-NEXT: buffer_store_dwordx4 v[12:15], off, s[4:7], 0
; CI-NEXT: s_waitcnt vmcnt(0)
-; CI-NEXT: v_mul_f32_e32 v12, 1.0, v32
-; CI-NEXT: v_cvt_f16_f32_e32 v13, v33
; CI-NEXT: buffer_store_dwordx4 v[8:11], off, s[4:7], 0
; CI-NEXT: s_waitcnt vmcnt(0)
; CI-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0
; CI-NEXT: s_waitcnt vmcnt(0)
; CI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
; CI-NEXT: s_waitcnt vmcnt(0)
-; CI-NEXT: v_and_b32_e32 v0, 1, v34
-; CI-NEXT: v_lshrrev_b32_e32 v1, 16, v12
+; CI-NEXT: v_and_b32_e32 v0, 1, v32
; CI-NEXT: buffer_store_byte v0, off, s[4:7], 0
; CI-NEXT: s_waitcnt vmcnt(0)
-; CI-NEXT: buffer_store_byte v35, off, s[4:7], 0
+; CI-NEXT: buffer_store_byte v33, off, s[4:7], 0
; CI-NEXT: s_waitcnt vmcnt(0)
-; CI-NEXT: buffer_store_short v36, off, s[4:7], 0
+; CI-NEXT: buffer_store_short v34, off, s[4:7], 0
; CI-NEXT: s_waitcnt vmcnt(0)
-; CI-NEXT: buffer_store_short v13, off, s[4:7], 0
+; CI-NEXT: buffer_store_short v35, off, s[4:7], 0
; CI-NEXT: s_waitcnt vmcnt(0)
-; CI-NEXT: buffer_store_short v1, off, s[4:7], 0
+; CI-NEXT: buffer_store_short v36, off, s[4:7], 0
; CI-NEXT: s_waitcnt vmcnt(0)
; CI-NEXT: s_setpc_b64 s[30:31]
;
@@ -4434,25 +4421,14 @@ define void @void_func_v32i32_v16i8(<32 x i32> %arg0, <16 x i8> %arg1) #0 {
define void @void_func_bf16(bfloat %arg0) #0 {
-; CI-LABEL: void_func_bf16:
-; CI: ; %bb.0:
-; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CI-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; CI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; CI-NEXT: s_mov_b32 s7, 0xf000
-; CI-NEXT: s_mov_b32 s6, -1
-; CI-NEXT: buffer_store_short v0, off, s[4:7], 0
-; CI-NEXT: s_waitcnt vmcnt(0)
-; CI-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX89-LABEL: void_func_bf16:
-; GFX89: ; %bb.0:
-; GFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX89-NEXT: s_mov_b32 s7, 0xf000
-; GFX89-NEXT: s_mov_b32 s6, -1
-; GFX89-NEXT: buffer_store_short v0, off, s[4:7], 0
-; GFX89-NEXT: s_waitcnt vmcnt(0)
-; GFX89-NEXT: s_setpc_b64 s[30:31]
+; CIGFX89-LABEL: void_func_bf16:
+; CIGFX89: ; %bb.0:
+; CIGFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CIGFX89-NEXT: s_mov_b32 s7, 0xf000
+; CIGFX89-NEXT: s_mov_b32 s6, -1
+; CIGFX89-NEXT: buffer_store_short v0, off, s[4:7], 0
+; CIGFX89-NEXT: s_waitcnt vmcnt(0)
+; CIGFX89-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: void_func_bf16:
; GFX11: ; %bb.0:
diff --git a/llvm/test/CodeGen/AMDGPU/function-returns.ll b/llvm/test/CodeGen/AMDGPU/function-returns.ll
index c923431bb17c1..4add34dd7d956 100644
--- a/llvm/test/CodeGen/AMDGPU/function-returns.ll
+++ b/llvm/test/CodeGen/AMDGPU/function-returns.ll
@@ -2259,24 +2259,14 @@ define void @void_func_sret_max_known_zero_bits(ptr addrspace(5) sret(i8) %arg0)
}
define bfloat @bf16_func_void() #0 {
-; CI-LABEL: bf16_func_void:
-; CI: ; %bb.0:
-; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CI-NEXT: s_mov_b32 s7, 0xf000
-; CI-NEXT: s_mov_b32 s6, -1
-; CI-NEXT: buffer_load_ushort v0, off, s[4:7], 0
-; CI-NEXT: s_waitcnt vmcnt(0)
-; CI-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; CI-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX89-LABEL: bf16_func_void:
-; GFX89: ; %bb.0:
-; GFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX89-NEXT: s_mov_b32 s7, 0xf000
-; GFX89-NEXT: s_mov_b32 s6, -1
-; GFX89-NEXT: buffer_load_ushort v0, off, s[4:7], 0
-; GFX89-NEXT: s_waitcnt vmcnt(0)
-; GFX89-NEXT: s_setpc_b64 s[30:31]
+; GFX789-LABEL: bf16_func_void:
+; GFX789: ; %bb.0:
+; GFX789-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX789-NEXT: s_mov_b32 s7, 0xf000
+; GFX789-NEXT: s_mov_b32 s6, -1
+; GFX789-NEXT: buffer_load_ushort v0, off, s[4:7], 0
+; GFX789-NEXT: s_waitcnt vmcnt(0)
+; GFX789-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: bf16_func_void:
; GFX11: ; %bb.0:
diff --git a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fadd.ll b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fadd.ll
index 7405032441779..bd9b46f9e1710 100644
--- a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fadd.ll
+++ b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fadd.ll
@@ -9115,10 +9115,11 @@ define half @global_agent_atomic_fadd_ret_f16__amdgpu_no_fine_grained_memory(ptr
; GFX7-NEXT: s_mov_b32 s4, s6
; GFX7-NEXT: s_mov_b32 s5, s6
; GFX7-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64
-; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2
; GFX7-NEXT: v_and_b32_e32 v3, 3, v3
; GFX7-NEXT: v_lshlrev_b32_e32 v5, 3, v3
; GFX7-NEXT: s_mov_b64 s[8:9], 0
+; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2
; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v2
; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v5
; GFX7-NEXT: v_not_b32_e32 v7, v2
@@ -9145,7 +9146,6 @@ define half @global_agent_atomic_fadd_ret_f16__amdgpu_no_fine_grained_memory(ptr
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[8:9]
; GFX7-NEXT: v_lshrrev_b32_e32 v0, v5, v2
-; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX6-LABEL: global_agent_atomic_fadd_ret_f16__amdgpu_no_fine_grained_memory:
@@ -9158,10 +9158,11 @@ define half @global_agent_atomic_fadd_ret_f16__amdgpu_no_fine_grained_memory(ptr
; GFX6-NEXT: s_mov_b32 s4, s6
; GFX6-NEXT: s_mov_b32 s5, s6
; GFX6-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64
-; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2
; GFX6-NEXT: v_and_b32_e32 v3, 3, v3
; GFX6-NEXT: v_lshlrev_b32_e32 v5, 3, v3
; GFX6-NEXT: s_mov_b64 s[8:9], 0
+; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2
; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v2
; GFX6-NEXT: v_lshl_b32_e32 v2, 0xffff, v5
; GFX6-NEXT: v_not_b32_e32 v7, v2
@@ -9188,7 +9189,6 @@ define half @global_agent_atomic_fadd_ret_f16__amdgpu_no_fine_grained_memory(ptr
; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX6-NEXT: s_or_b64 exec, exec, s[8:9]
; GFX6-NEXT: v_lshrrev_b32_e32 v0, v5, v2
-; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: s_setpc_b64 s[30:31]
%result = atomicrmw fadd ptr addrspace(1) %ptr, half %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
@@ -9630,10 +9630,11 @@ define half @global_agent_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_gra
; GFX7-NEXT: s_mov_b32 s4, s6
; GFX7-NEXT: s_mov_b32 s5, s6
; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
-; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2
; GFX7-NEXT: v_and_b32_e32 v4, 3, v4
; GFX7-NEXT: v_lshlrev_b32_e32 v6, 3, v4
; GFX7-NEXT: s_mov_b64 s[8:9], 0
+; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2
; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v2
; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v6
; GFX7-NEXT: v_not_b32_e32 v8, v2
@@ -9660,7 +9661,6 @@ define half @global_agent_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_gra
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[8:9]
; GFX7-NEXT: v_lshrrev_b32_e32 v0, v6, v4
-; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX6-LABEL: global_agent_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
@@ -9674,10 +9674,11 @@ define half @global_agent_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_gra
; GFX6-NEXT: s_mov_b32 s4, s6
; GFX6-NEXT: s_mov_b32 s5, s6
; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
-; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2
; GFX6-NEXT: v_and_b32_e32 v4, 3, v4
; GFX6-NEXT: v_lshlrev_b32_e32 v6, 3, v4
; GFX6-NEXT: s_mov_b64 s[8:9], 0
+; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2
; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v2
; GFX6-NEXT: v_lshl_b32_e32 v2, 0xffff, v6
; GFX6-NEXT: v_not_b32_e32 v8, v2
@@ -9705,7 +9706,6 @@ define half @global_agent_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_gra
; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX6-NEXT: s_or_b64 exec, exec, s[8:9]
; GFX6-NEXT: v_lshrrev_b32_e32 v0, v6, v4
-; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr half, ptr addrspace(1) %ptr, i64 1023
@@ -10150,10 +10150,11 @@ define half @global_agent_atomic_fadd_ret_f16__offset12b_neg__amdgpu_no_fine_gra
; GFX7-NEXT: s_mov_b32 s4, s6
; GFX7-NEXT: s_mov_b32 s5, s6
; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
-; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2
; GFX7-NEXT: v_and_b32_e32 v4, 3, v4
; GFX7-NEXT: v_lshlrev_b32_e32 v6, 3, v4
; GFX7-NEXT: s_mov_b64 s[8:9], 0
+; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2
; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v2
; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v6
; GFX7-NEXT: v_not_b32_e32 v8, v2
@@ -10180,7 +10181,6 @@ define half @global_agent_atomic_fadd_ret_f16__offset12b_neg__amdgpu_no_fine_gra
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[8:9]
; GFX7-NEXT: v_lshrrev_b32_e32 v0, v6, v4
-; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX6-LABEL: global_agent_atomic_fadd_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory:
@@ -10194,10 +10194,11 @@ define half @global_agent_atomic_fadd_ret_f16__offset12b_neg__amdgpu_no_fine_gra
; GFX6-NEXT: s_mov_b32 s4, s6
; GFX6-NEXT: s_mov_b32 s5, s6
; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
-; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2
; GFX6-NEXT: v_and_b32_e32 v4, 3, v4
; GFX6-NEXT: v_lshlrev_b32_e32 v6, 3, v4
; GFX6-NEXT: s_mov_b64 s[8:9], 0
+; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2
; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v2
; GFX6-NEXT: v_lshl_b32_e32 v2, 0xffff, v6
; GFX6-NEXT: v_not_b32_e32 v8, v2
@@ -10225,7 +10226,6 @@ define half @global_agent_atomic_fadd_ret_f16__offset12b_neg__amdgpu_no_fine_gra
; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX6-NEXT: s_or_b64 exec, exec, s[8:9]
; GFX6-NEXT: v_lshrrev_b32_e32 v0, v6, v4
-; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr half, ptr addrspace(1) %ptr, i64 -1024
@@ -10642,13 +10642,14 @@ define void @global_agent_atomic_fadd_noret_f16__amdgpu_no_fine_grained_memory(p
; GFX7-NEXT: s_mov_b32 s4, s6
; GFX7-NEXT: s_mov_b32 s5, s6
; GFX7-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2
+; GFX7-NEXT: s_mov_b64 s[8:9], 0
; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v2
; GFX7-NEXT: v_and_b32_e32 v2, 3, v3
; GFX7-NEXT: v_lshlrev_b32_e32 v2, 3, v2
; GFX7-NEXT: v_lshl_b32_e32 v3, 0xffff, v2
; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5
; GFX7-NEXT: v_not_b32_e32 v6, v3
-; GFX7-NEXT: s_mov_b64 s[8:9], 0
; GFX7-NEXT: .LBB47_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0)
@@ -10683,13 +10684,14 @@ define void @global_agent_atomic_fadd_noret_f16__amdgpu_no_fine_grained_memory(p
; GFX6-NEXT: s_mov_b32 s4, s6
; GFX6-NEXT: s_mov_b32 s5, s6
; GFX6-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64
+; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2
+; GFX6-NEXT: s_mov_b64 s[8:9], 0
; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v2
; GFX6-NEXT: v_and_b32_e32 v2, 3, v3
; GFX6-NEXT: v_lshlrev_b32_e32 v2, 3, v2
; GFX6-NEXT: v_lshl_b32_e32 v3, 0xffff, v2
; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v5
; GFX6-NEXT: v_not_b32_e32 v6, v3
-; GFX6-NEXT: s_mov_b64 s[8:9], 0
; GFX6-NEXT: .LBB47_1: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX6-NEXT: s_waitcnt vmcnt(0)
@@ -11139,10 +11141,11 @@ define void @global_agent_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_g
; GFX7-NEXT: s_mov_b32 s4, s6
; GFX7-NEXT: s_mov_b32 s5, s6
; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
-; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2
; GFX7-NEXT: v_and_b32_e32 v4, 3, v4
; GFX7-NEXT: v_lshlrev_b32_e32 v4, 3, v4
; GFX7-NEXT: s_mov_b64 s[8:9], 0
+; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2
; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v2
; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v4
; GFX7-NEXT: v_not_b32_e32 v6, v2
@@ -11181,10 +11184,11 @@ define void @global_agent_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_g
; GFX6-NEXT: s_mov_b32 s4, s6
; GFX6-NEXT: s_mov_b32 s5, s6
; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
-; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2
; GFX6-NEXT: v_and_b32_e32 v4, 3, v4
; GFX6-NEXT: v_lshlrev_b32_e32 v4, 3, v4
; GFX6-NEXT: s_mov_b64 s[8:9], 0
+; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2
; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v2
; GFX6-NEXT: v_lshl_b32_e32 v2, 0xffff, v4
; GFX6-NEXT: v_not_b32_e32 v6, v2
@@ -11641,10 +11645,11 @@ define void @global_agent_atomic_fadd_noret_f16__offset12b_neg__amdgpu_no_fine_g
; GFX7-NEXT: s_mov_b32 s4, s6
; GFX7-NEXT: s_mov_b32 s5, s6
; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
-; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2
; GFX7-NEXT: v_and_b32_e32 v4, 3, v4
; GFX7-NEXT: v_lshlrev_b32_e32 v4, 3, v4
; GFX7-NEXT: s_mov_b64 s[8:9], 0
+; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2
; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v2
; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v4
; GFX7-NEXT: v_not_b32_e32 v6, v2
@@ -11683,10 +11688,11 @@ define void @global_agent_atomic_fadd_noret_f16__offset12b_neg__amdgpu_no_fine_g
; GFX6-NEXT: s_mov_b32 s4, s6
; GFX6-NEXT: s_mov_b32 s5, s6
; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
-; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2
; GFX6-NEXT: v_and_b32_e32 v4, 3, v4
; GFX6-NEXT: v_lshlrev_b32_e32 v4, 3, v4
; GFX6-NEXT: s_mov_b64 s[8:9], 0
+; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2
; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v2
; GFX6-NEXT: v_lshl_b32_e32 v2, 0xffff, v4
; GFX6-NEXT: v_not_b32_e32 v6, v2
@@ -12036,30 +12042,31 @@ define half @global_agent_atomic_fadd_ret_f16__offset12b_pos__align4__amdgpu_no_
; GFX7-NEXT: s_mov_b32 s4, s6
; GFX7-NEXT: s_mov_b32 s5, s6
; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2046
-; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2
; GFX7-NEXT: s_mov_b64 s[8:9], 0
-; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v2
+; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2
; GFX7-NEXT: .LBB50_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v3
-; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v3
-; GFX7-NEXT: v_add_f32_e32 v2, v2, v6
-; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2
-; GFX7-NEXT: v_or_b32_e32 v2, v4, v2
; GFX7-NEXT: v_mov_b32_e32 v5, v3
-; GFX7-NEXT: v_mov_b32_e32 v4, v2
-; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 offset:2046 glc
+; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v5
+; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v5
+; GFX7-NEXT: v_add_f32_e32 v3, v3, v2
+; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GFX7-NEXT: v_or_b32_e32 v4, v4, v3
+; GFX7-NEXT: v_mov_b32_e32 v3, v4
+; GFX7-NEXT: v_mov_b32_e32 v4, v5
+; GFX7-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:2046 glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX7-NEXT: v_mov_b32_e32 v3, v4
; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GFX7-NEXT: s_cbranch_execnz .LBB50_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[8:9]
-; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v4
+; GFX7-NEXT: v_mov_b32_e32 v0, v3
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX6-LABEL: global_agent_atomic_fadd_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory:
@@ -12070,31 +12077,32 @@ define half @global_agent_atomic_fadd_ret_f16__offset12b_pos__align4__amdgpu_no_
; GFX6-NEXT: s_mov_b32 s4, s6
; GFX6-NEXT: s_mov_b32 s5, s6
; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2046
-; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2
; GFX6-NEXT: s_mov_b64 s[8:9], 0
-; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v2
+; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2
; GFX6-NEXT: .LBB50_1: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v3
-; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v3
-; GFX6-NEXT: v_add_f32_e32 v2, v2, v6
-; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2
-; GFX6-NEXT: v_or_b32_e32 v2, v4, v2
; GFX6-NEXT: v_mov_b32_e32 v5, v3
-; GFX6-NEXT: v_mov_b32_e32 v4, v2
-; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 offset:2046 glc
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v5
+; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v5
+; GFX6-NEXT: v_add_f32_e32 v3, v3, v2
+; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GFX6-NEXT: v_or_b32_e32 v4, v4, v3
+; GFX6-NEXT: v_mov_b32_e32 v3, v4
+; GFX6-NEXT: v_mov_b32_e32 v4, v5
+; GFX6-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:2046 glc
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
-; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
+; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX6-NEXT: v_mov_b32_e32 v3, v4
; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GFX6-NEXT: s_cbranch_execnz .LBB50_1
; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX6-NEXT: s_or_b64 exec, exec, s[8:9]
-; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v4
+; GFX6-NEXT: v_mov_b32_e32 v0, v3
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr half, ptr addrspace(1) %ptr, i64 1023
@@ -12405,8 +12413,9 @@ define void @global_agent_atomic_fadd_noret_f16__offset12b__align4_pos__amdgpu_n
; GFX7-NEXT: s_mov_b32 s4, s6
; GFX7-NEXT: s_mov_b32 s5, s6
; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2046
-; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2
; GFX7-NEXT: s_mov_b64 s[8:9], 0
+; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2
; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v2
; GFX7-NEXT: .LBB51_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -12438,8 +12447,9 @@ define void @global_agent_atomic_fadd_noret_f16__offset12b__align4_pos__amdgpu_n
; GFX6-NEXT: s_mov_b32 s4, s6
; GFX6-NEXT: s_mov_b32 s5, s6
; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2046
-; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2
; GFX6-NEXT: s_mov_b64 s[8:9], 0
+; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2
; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v2
; GFX6-NEXT: .LBB51_1: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -12907,10 +12917,11 @@ define half @global_system_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_gr
; GFX7-NEXT: s_mov_b32 s4, s6
; GFX7-NEXT: s_mov_b32 s5, s6
; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
-; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2
; GFX7-NEXT: v_and_b32_e32 v4, 3, v4
; GFX7-NEXT: v_lshlrev_b32_e32 v6, 3, v4
; GFX7-NEXT: s_mov_b64 s[8:9], 0
+; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2
; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v2
; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v6
; GFX7-NEXT: v_not_b32_e32 v8, v2
@@ -12937,7 +12948,6 @@ define half @global_system_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_gr
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[8:9]
; GFX7-NEXT: v_lshrrev_b32_e32 v0, v6, v4
-; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX6-LABEL: global_system_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
@@ -12951,10 +12961,11 @@ define half @global_system_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_gr
; GFX6-NEXT: s_mov_b32 s4, s6
; GFX6-NEXT: s_mov_b32 s5, s6
; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
-; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2
; GFX6-NEXT: v_and_b32_e32 v4, 3, v4
; GFX6-NEXT: v_lshlrev_b32_e32 v6, 3, v4
; GFX6-NEXT: s_mov_b64 s[8:9], 0
+; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2
; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v2
; GFX6-NEXT: v_lshl_b32_e32 v2, 0xffff, v6
; GFX6-NEXT: v_not_b32_e32 v8, v2
@@ -12982,7 +12993,6 @@ define half @global_system_atomic_fadd_ret_f16__offset12b_pos__amdgpu_no_fine_gr
; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX6-NEXT: s_or_b64 exec, exec, s[8:9]
; GFX6-NEXT: v_lshrrev_b32_e32 v0, v6, v4
-; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr half, ptr addrspace(1) %ptr, i64 1023
@@ -13414,10 +13424,11 @@ define void @global_system_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_
; GFX7-NEXT: s_mov_b32 s4, s6
; GFX7-NEXT: s_mov_b32 s5, s6
; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
-; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2
; GFX7-NEXT: v_and_b32_e32 v4, 3, v4
; GFX7-NEXT: v_lshlrev_b32_e32 v4, 3, v4
; GFX7-NEXT: s_mov_b64 s[8:9], 0
+; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2
; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v2
; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v4
; GFX7-NEXT: v_not_b32_e32 v6, v2
@@ -13456,10 +13467,11 @@ define void @global_system_atomic_fadd_noret_f16__offset12b_pos__amdgpu_no_fine_
; GFX6-NEXT: s_mov_b32 s4, s6
; GFX6-NEXT: s_mov_b32 s5, s6
; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
-; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2
; GFX6-NEXT: v_and_b32_e32 v4, 3, v4
; GFX6-NEXT: v_lshlrev_b32_e32 v4, 3, v4
; GFX6-NEXT: s_mov_b64 s[8:9], 0
+; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2
; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v2
; GFX6-NEXT: v_lshl_b32_e32 v2, 0xffff, v4
; GFX6-NEXT: v_not_b32_e32 v6, v2
@@ -14011,6 +14023,7 @@ define bfloat @global_agent_atomic_fadd_ret_bf16__amdgpu_no_fine_grained_memory(
; GFX7-NEXT: s_mov_b32 s5, s6
; GFX7-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64
; GFX7-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX7-NEXT: v_lshlrev_b32_e32 v5, 3, v3
; GFX7-NEXT: v_lshl_b32_e32 v3, 0xffff, v5
; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
@@ -14040,7 +14053,6 @@ define bfloat @global_agent_atomic_fadd_ret_bf16__amdgpu_no_fine_grained_memory(
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[8:9]
; GFX7-NEXT: v_lshrrev_b32_e32 v0, v5, v2
-; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX6-LABEL: global_agent_atomic_fadd_ret_bf16__amdgpu_no_fine_grained_memory:
@@ -14054,6 +14066,7 @@ define bfloat @global_agent_atomic_fadd_ret_bf16__amdgpu_no_fine_grained_memory(
; GFX6-NEXT: s_mov_b32 s5, s6
; GFX6-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64
; GFX6-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX6-NEXT: v_lshlrev_b32_e32 v5, 3, v3
; GFX6-NEXT: v_lshl_b32_e32 v3, 0xffff, v5
; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2
@@ -14083,7 +14096,6 @@ define bfloat @global_agent_atomic_fadd_ret_bf16__amdgpu_no_fine_grained_memory(
; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX6-NEXT: s_or_b64 exec, exec, s[8:9]
; GFX6-NEXT: v_lshrrev_b32_e32 v0, v5, v2
-; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: s_setpc_b64 s[30:31]
%result = atomicrmw fadd ptr addrspace(1) %ptr, bfloat %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
@@ -14618,6 +14630,7 @@ define bfloat @global_agent_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_
; GFX7-NEXT: s_mov_b32 s5, s6
; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
; GFX7-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX7-NEXT: v_lshlrev_b32_e32 v6, 3, v4
; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v6
; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
@@ -14647,7 +14660,6 @@ define bfloat @global_agent_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[8:9]
; GFX7-NEXT: v_lshrrev_b32_e32 v0, v6, v4
-; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX6-LABEL: global_agent_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
@@ -14662,6 +14674,7 @@ define bfloat @global_agent_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_
; GFX6-NEXT: s_mov_b32 s5, s6
; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
; GFX6-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX6-NEXT: v_lshlrev_b32_e32 v6, 3, v4
; GFX6-NEXT: v_lshl_b32_e32 v4, 0xffff, v6
; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2
@@ -14692,7 +14705,6 @@ define bfloat @global_agent_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_
; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX6-NEXT: s_or_b64 exec, exec, s[8:9]
; GFX6-NEXT: v_lshrrev_b32_e32 v0, v6, v4
-; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr bfloat, ptr addrspace(1) %ptr, i64 1023
@@ -15232,6 +15244,7 @@ define bfloat @global_agent_atomic_fadd_ret_bf16__offset12b_neg__amdgpu_no_fine_
; GFX7-NEXT: s_mov_b32 s5, s6
; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
; GFX7-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX7-NEXT: v_lshlrev_b32_e32 v6, 3, v4
; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v6
; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
@@ -15261,7 +15274,6 @@ define bfloat @global_agent_atomic_fadd_ret_bf16__offset12b_neg__amdgpu_no_fine_
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[8:9]
; GFX7-NEXT: v_lshrrev_b32_e32 v0, v6, v4
-; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX6-LABEL: global_agent_atomic_fadd_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
@@ -15276,6 +15288,7 @@ define bfloat @global_agent_atomic_fadd_ret_bf16__offset12b_neg__amdgpu_no_fine_
; GFX6-NEXT: s_mov_b32 s5, s6
; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
; GFX6-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX6-NEXT: v_lshlrev_b32_e32 v6, 3, v4
; GFX6-NEXT: v_lshl_b32_e32 v4, 0xffff, v6
; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2
@@ -15306,7 +15319,6 @@ define bfloat @global_agent_atomic_fadd_ret_bf16__offset12b_neg__amdgpu_no_fine_
; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX6-NEXT: s_or_b64 exec, exec, s[8:9]
; GFX6-NEXT: v_lshrrev_b32_e32 v0, v6, v4
-; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr bfloat, ptr addrspace(1) %ptr, i64 -1024
@@ -15810,22 +15822,23 @@ define void @global_agent_atomic_fadd_noret_bf16__amdgpu_no_fine_grained_memory(
; GFX7-NEXT: s_mov_b32 s4, s6
; GFX7-NEXT: s_mov_b32 s5, s6
; GFX7-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64
-; GFX7-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX7-NEXT: v_lshlrev_b32_e32 v5, 3, v3
-; GFX7-NEXT: v_lshl_b32_e32 v3, 0xffff, v5
-; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GFX7-NEXT: v_not_b32_e32 v6, v3
+; GFX7-NEXT: v_lshlrev_b32_e32 v6, 16, v2
+; GFX7-NEXT: v_and_b32_e32 v2, 3, v3
+; GFX7-NEXT: v_lshlrev_b32_e32 v2, 3, v2
+; GFX7-NEXT: v_lshl_b32_e32 v3, 0xffff, v2
+; GFX7-NEXT: v_not_b32_e32 v5, v3
+; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v6
; GFX7-NEXT: s_mov_b64 s[8:9], 0
-; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
; GFX7-NEXT: .LBB57_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_lshrrev_b32_e32 v3, v5, v4
+; GFX7-NEXT: v_lshrrev_b32_e32 v3, v2, v4
; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX7-NEXT: v_add_f32_e32 v3, v3, v2
+; GFX7-NEXT: v_add_f32_e32 v3, v3, v6
; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; GFX7-NEXT: v_and_b32_e32 v7, v4, v6
-; GFX7-NEXT: v_lshlrev_b32_e32 v3, v5, v3
+; GFX7-NEXT: v_and_b32_e32 v7, v4, v5
+; GFX7-NEXT: v_lshlrev_b32_e32 v3, v2, v3
; GFX7-NEXT: v_or_b32_e32 v3, v7, v3
; GFX7-NEXT: v_mov_b32_e32 v8, v4
; GFX7-NEXT: v_mov_b32_e32 v7, v3
@@ -15851,23 +15864,24 @@ define void @global_agent_atomic_fadd_noret_bf16__amdgpu_no_fine_grained_memory(
; GFX6-NEXT: s_mov_b32 s4, s6
; GFX6-NEXT: s_mov_b32 s5, s6
; GFX6-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64
-; GFX6-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX6-NEXT: v_lshlrev_b32_e32 v5, 3, v3
-; GFX6-NEXT: v_lshl_b32_e32 v3, 0xffff, v5
-; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GFX6-NEXT: v_not_b32_e32 v6, v3
+; GFX6-NEXT: v_lshlrev_b32_e32 v6, 16, v2
+; GFX6-NEXT: v_and_b32_e32 v2, 3, v3
+; GFX6-NEXT: v_lshlrev_b32_e32 v2, 3, v2
+; GFX6-NEXT: v_lshl_b32_e32 v3, 0xffff, v2
+; GFX6-NEXT: v_not_b32_e32 v5, v3
+; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v6
; GFX6-NEXT: s_mov_b64 s[8:9], 0
-; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
; GFX6-NEXT: .LBB57_1: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_lshrrev_b32_e32 v3, v5, v4
+; GFX6-NEXT: v_lshrrev_b32_e32 v3, v2, v4
; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX6-NEXT: v_add_f32_e32 v3, v3, v2
+; GFX6-NEXT: v_add_f32_e32 v3, v3, v6
; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3
; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_and_b32_e32 v7, v4, v6
-; GFX6-NEXT: v_lshlrev_b32_e32 v3, v5, v3
+; GFX6-NEXT: v_and_b32_e32 v7, v4, v5
+; GFX6-NEXT: v_lshlrev_b32_e32 v3, v2, v3
; GFX6-NEXT: v_or_b32_e32 v3, v7, v3
; GFX6-NEXT: v_mov_b32_e32 v8, v4
; GFX6-NEXT: v_mov_b32_e32 v7, v3
@@ -16398,6 +16412,7 @@ define void @global_agent_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_
; GFX7-NEXT: s_mov_b32 s5, s6
; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
; GFX7-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX7-NEXT: v_lshlrev_b32_e32 v4, 3, v4
; GFX7-NEXT: v_lshl_b32_e32 v5, 0xffff, v4
; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
@@ -16440,6 +16455,7 @@ define void @global_agent_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine_
; GFX6-NEXT: s_mov_b32 s5, s6
; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
; GFX6-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX6-NEXT: v_lshlrev_b32_e32 v4, 3, v4
; GFX6-NEXT: v_lshl_b32_e32 v5, 0xffff, v4
; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2
@@ -16992,6 +17008,7 @@ define void @global_agent_atomic_fadd_noret_bf16__offset12b_neg__amdgpu_no_fine_
; GFX7-NEXT: s_mov_b32 s5, s6
; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
; GFX7-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX7-NEXT: v_lshlrev_b32_e32 v4, 3, v4
; GFX7-NEXT: v_lshl_b32_e32 v5, 0xffff, v4
; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
@@ -17034,6 +17051,7 @@ define void @global_agent_atomic_fadd_noret_bf16__offset12b_neg__amdgpu_no_fine_
; GFX6-NEXT: s_mov_b32 s5, s6
; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
; GFX6-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX6-NEXT: v_lshlrev_b32_e32 v4, 3, v4
; GFX6-NEXT: v_lshl_b32_e32 v5, 0xffff, v4
; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2
@@ -17487,30 +17505,31 @@ define bfloat @global_agent_atomic_fadd_ret_bf16__offset12b_pos__align4__amdgpu_
; GFX7-NEXT: s_mov_b32 s4, s6
; GFX7-NEXT: s_mov_b32 s5, s6
; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2046
+; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GFX7-NEXT: s_mov_b64 s[8:9], 0
-; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
+; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX7-NEXT: .LBB60_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v3
-; GFX7-NEXT: v_add_f32_e32 v2, v2, v6
-; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v3
-; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX7-NEXT: v_or_b32_e32 v2, v4, v2
; GFX7-NEXT: v_mov_b32_e32 v5, v3
-; GFX7-NEXT: v_mov_b32_e32 v4, v2
-; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 offset:2046 glc
+; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v5
+; GFX7-NEXT: v_add_f32_e32 v3, v3, v2
+; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v5
+; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX7-NEXT: v_or_b32_e32 v4, v4, v3
+; GFX7-NEXT: v_mov_b32_e32 v3, v4
+; GFX7-NEXT: v_mov_b32_e32 v4, v5
+; GFX7-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:2046 glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX7-NEXT: v_mov_b32_e32 v3, v4
; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GFX7-NEXT: s_cbranch_execnz .LBB60_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[8:9]
-; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v4
+; GFX7-NEXT: v_mov_b32_e32 v0, v3
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX6-LABEL: global_agent_atomic_fadd_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory:
@@ -17521,31 +17540,32 @@ define bfloat @global_agent_atomic_fadd_ret_bf16__offset12b_pos__align4__amdgpu_
; GFX6-NEXT: s_mov_b32 s4, s6
; GFX6-NEXT: s_mov_b32 s5, s6
; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2046
+; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GFX6-NEXT: s_mov_b64 s[8:9], 0
-; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
+; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX6-NEXT: .LBB60_1: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v3
-; GFX6-NEXT: v_add_f32_e32 v2, v2, v6
-; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v3
-; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX6-NEXT: v_or_b32_e32 v2, v4, v2
; GFX6-NEXT: v_mov_b32_e32 v5, v3
-; GFX6-NEXT: v_mov_b32_e32 v4, v2
-; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 offset:2046 glc
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v5
+; GFX6-NEXT: v_add_f32_e32 v3, v3, v2
+; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v5
+; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX6-NEXT: v_or_b32_e32 v4, v4, v3
+; GFX6-NEXT: v_mov_b32_e32 v3, v4
+; GFX6-NEXT: v_mov_b32_e32 v4, v5
+; GFX6-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:2046 glc
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
-; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
+; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX6-NEXT: v_mov_b32_e32 v3, v4
; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GFX6-NEXT: s_cbranch_execnz .LBB60_1
; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX6-NEXT: s_or_b64 exec, exec, s[8:9]
-; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v4
+; GFX6-NEXT: v_mov_b32_e32 v0, v3
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr bfloat, ptr addrspace(1) %ptr, i64 1023
@@ -17954,6 +17974,7 @@ define void @global_agent_atomic_fadd_noret_bf16__offset12b__align4_pos__amdgpu_
; GFX7-NEXT: s_mov_b32 s4, s6
; GFX7-NEXT: s_mov_b32 s5, s6
; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2046
+; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GFX7-NEXT: s_mov_b64 s[8:9], 0
; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v2
@@ -17987,6 +18008,7 @@ define void @global_agent_atomic_fadd_noret_bf16__offset12b__align4_pos__amdgpu_
; GFX6-NEXT: s_mov_b32 s4, s6
; GFX6-NEXT: s_mov_b32 s5, s6
; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2046
+; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GFX6-NEXT: s_mov_b64 s[8:9], 0
; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v2
@@ -18551,6 +18573,7 @@ define bfloat @global_system_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine
; GFX7-NEXT: s_mov_b32 s5, s6
; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
; GFX7-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX7-NEXT: v_lshlrev_b32_e32 v6, 3, v4
; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v6
; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
@@ -18580,7 +18603,6 @@ define bfloat @global_system_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[8:9]
; GFX7-NEXT: v_lshrrev_b32_e32 v0, v6, v4
-; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX6-LABEL: global_system_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
@@ -18595,6 +18617,7 @@ define bfloat @global_system_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine
; GFX6-NEXT: s_mov_b32 s5, s6
; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
; GFX6-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX6-NEXT: v_lshlrev_b32_e32 v6, 3, v4
; GFX6-NEXT: v_lshl_b32_e32 v4, 0xffff, v6
; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2
@@ -18625,7 +18648,6 @@ define bfloat @global_system_atomic_fadd_ret_bf16__offset12b_pos__amdgpu_no_fine
; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX6-NEXT: s_or_b64 exec, exec, s[8:9]
; GFX6-NEXT: v_lshrrev_b32_e32 v0, v6, v4
-; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr bfloat, ptr addrspace(1) %ptr, i64 1023
@@ -19149,6 +19171,7 @@ define void @global_system_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine
; GFX7-NEXT: s_mov_b32 s5, s6
; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
; GFX7-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX7-NEXT: v_lshlrev_b32_e32 v4, 3, v4
; GFX7-NEXT: v_lshl_b32_e32 v5, 0xffff, v4
; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
@@ -19191,6 +19214,7 @@ define void @global_system_atomic_fadd_noret_bf16__offset12b_pos__amdgpu_no_fine
; GFX6-NEXT: s_mov_b32 s5, s6
; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
; GFX6-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX6-NEXT: v_lshlrev_b32_e32 v4, 3, v4
; GFX6-NEXT: v_lshl_b32_e32 v5, 0xffff, v4
; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2
diff --git a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmax.ll b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmax.ll
index 279cff3f5d368..bcf51f89920c0 100644
--- a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmax.ll
+++ b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmax.ll
@@ -4777,10 +4777,11 @@ define half @global_agent_atomic_fmax_ret_f16__amdgpu_no_fine_grained_memory(ptr
; GFX7-NEXT: s_mov_b32 s4, s6
; GFX7-NEXT: s_mov_b32 s5, s6
; GFX7-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64
-; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2
; GFX7-NEXT: v_and_b32_e32 v3, 3, v3
; GFX7-NEXT: v_lshlrev_b32_e32 v5, 3, v3
; GFX7-NEXT: s_mov_b64 s[8:9], 0
+; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2
; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v2
; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v5
; GFX7-NEXT: v_not_b32_e32 v7, v2
@@ -4807,7 +4808,6 @@ define half @global_agent_atomic_fmax_ret_f16__amdgpu_no_fine_grained_memory(ptr
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[8:9]
; GFX7-NEXT: v_lshrrev_b32_e32 v0, v5, v2
-; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX6-LABEL: global_agent_atomic_fmax_ret_f16__amdgpu_no_fine_grained_memory:
@@ -4820,10 +4820,11 @@ define half @global_agent_atomic_fmax_ret_f16__amdgpu_no_fine_grained_memory(ptr
; GFX6-NEXT: s_mov_b32 s4, s6
; GFX6-NEXT: s_mov_b32 s5, s6
; GFX6-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64
-; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2
; GFX6-NEXT: v_and_b32_e32 v3, 3, v3
; GFX6-NEXT: v_lshlrev_b32_e32 v5, 3, v3
; GFX6-NEXT: s_mov_b64 s[8:9], 0
+; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2
; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v2
; GFX6-NEXT: v_lshl_b32_e32 v2, 0xffff, v5
; GFX6-NEXT: v_not_b32_e32 v7, v2
@@ -4850,7 +4851,6 @@ define half @global_agent_atomic_fmax_ret_f16__amdgpu_no_fine_grained_memory(ptr
; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX6-NEXT: s_or_b64 exec, exec, s[8:9]
; GFX6-NEXT: v_lshrrev_b32_e32 v0, v5, v2
-; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: s_setpc_b64 s[30:31]
%result = atomicrmw fmax ptr addrspace(1) %ptr, half %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
@@ -5233,10 +5233,11 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_gra
; GFX7-NEXT: s_mov_b32 s4, s6
; GFX7-NEXT: s_mov_b32 s5, s6
; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
-; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2
; GFX7-NEXT: v_and_b32_e32 v4, 3, v4
; GFX7-NEXT: v_lshlrev_b32_e32 v6, 3, v4
; GFX7-NEXT: s_mov_b64 s[8:9], 0
+; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2
; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v2
; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v6
; GFX7-NEXT: v_not_b32_e32 v8, v2
@@ -5263,7 +5264,6 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_gra
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[8:9]
; GFX7-NEXT: v_lshrrev_b32_e32 v0, v6, v4
-; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX6-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
@@ -5277,10 +5277,11 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_gra
; GFX6-NEXT: s_mov_b32 s4, s6
; GFX6-NEXT: s_mov_b32 s5, s6
; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
-; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2
; GFX6-NEXT: v_and_b32_e32 v4, 3, v4
; GFX6-NEXT: v_lshlrev_b32_e32 v6, 3, v4
; GFX6-NEXT: s_mov_b64 s[8:9], 0
+; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2
; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v2
; GFX6-NEXT: v_lshl_b32_e32 v2, 0xffff, v6
; GFX6-NEXT: v_not_b32_e32 v8, v2
@@ -5308,7 +5309,6 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_gra
; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX6-NEXT: s_or_b64 exec, exec, s[8:9]
; GFX6-NEXT: v_lshrrev_b32_e32 v0, v6, v4
-; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr half, ptr addrspace(1) %ptr, i64 1023
@@ -5693,10 +5693,11 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_gra
; GFX7-NEXT: s_mov_b32 s4, s6
; GFX7-NEXT: s_mov_b32 s5, s6
; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
-; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2
; GFX7-NEXT: v_and_b32_e32 v4, 3, v4
; GFX7-NEXT: v_lshlrev_b32_e32 v6, 3, v4
; GFX7-NEXT: s_mov_b64 s[8:9], 0
+; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2
; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v2
; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v6
; GFX7-NEXT: v_not_b32_e32 v8, v2
@@ -5723,7 +5724,6 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_gra
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[8:9]
; GFX7-NEXT: v_lshrrev_b32_e32 v0, v6, v4
-; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX6-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory:
@@ -5737,10 +5737,11 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_gra
; GFX6-NEXT: s_mov_b32 s4, s6
; GFX6-NEXT: s_mov_b32 s5, s6
; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
-; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2
; GFX6-NEXT: v_and_b32_e32 v4, 3, v4
; GFX6-NEXT: v_lshlrev_b32_e32 v6, 3, v4
; GFX6-NEXT: s_mov_b64 s[8:9], 0
+; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2
; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v2
; GFX6-NEXT: v_lshl_b32_e32 v2, 0xffff, v6
; GFX6-NEXT: v_not_b32_e32 v8, v2
@@ -5768,7 +5769,6 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_neg__amdgpu_no_fine_gra
; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX6-NEXT: s_or_b64 exec, exec, s[8:9]
; GFX6-NEXT: v_lshrrev_b32_e32 v0, v6, v4
-; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr half, ptr addrspace(1) %ptr, i64 -1024
@@ -6125,13 +6125,14 @@ define void @global_agent_atomic_fmax_noret_f16__amdgpu_no_fine_grained_memory(p
; GFX7-NEXT: s_mov_b32 s4, s6
; GFX7-NEXT: s_mov_b32 s5, s6
; GFX7-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2
+; GFX7-NEXT: s_mov_b64 s[8:9], 0
; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v2
; GFX7-NEXT: v_and_b32_e32 v2, 3, v3
; GFX7-NEXT: v_lshlrev_b32_e32 v2, 3, v2
; GFX7-NEXT: v_lshl_b32_e32 v3, 0xffff, v2
; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5
; GFX7-NEXT: v_not_b32_e32 v6, v3
-; GFX7-NEXT: s_mov_b64 s[8:9], 0
; GFX7-NEXT: .LBB29_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0)
@@ -6166,13 +6167,14 @@ define void @global_agent_atomic_fmax_noret_f16__amdgpu_no_fine_grained_memory(p
; GFX6-NEXT: s_mov_b32 s4, s6
; GFX6-NEXT: s_mov_b32 s5, s6
; GFX6-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64
+; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2
+; GFX6-NEXT: s_mov_b64 s[8:9], 0
; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v2
; GFX6-NEXT: v_and_b32_e32 v2, 3, v3
; GFX6-NEXT: v_lshlrev_b32_e32 v2, 3, v2
; GFX6-NEXT: v_lshl_b32_e32 v3, 0xffff, v2
; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v5
; GFX6-NEXT: v_not_b32_e32 v6, v3
-; GFX6-NEXT: s_mov_b64 s[8:9], 0
; GFX6-NEXT: .LBB29_1: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX6-NEXT: s_waitcnt vmcnt(0)
@@ -6567,10 +6569,11 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_g
; GFX7-NEXT: s_mov_b32 s4, s6
; GFX7-NEXT: s_mov_b32 s5, s6
; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
-; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2
; GFX7-NEXT: v_and_b32_e32 v4, 3, v4
; GFX7-NEXT: v_lshlrev_b32_e32 v4, 3, v4
; GFX7-NEXT: s_mov_b64 s[8:9], 0
+; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2
; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v2
; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v4
; GFX7-NEXT: v_not_b32_e32 v6, v2
@@ -6609,10 +6612,11 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_g
; GFX6-NEXT: s_mov_b32 s4, s6
; GFX6-NEXT: s_mov_b32 s5, s6
; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
-; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2
; GFX6-NEXT: v_and_b32_e32 v4, 3, v4
; GFX6-NEXT: v_lshlrev_b32_e32 v4, 3, v4
; GFX6-NEXT: s_mov_b64 s[8:9], 0
+; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2
; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v2
; GFX6-NEXT: v_lshl_b32_e32 v2, 0xffff, v4
; GFX6-NEXT: v_not_b32_e32 v6, v2
@@ -7012,10 +7016,11 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_g
; GFX7-NEXT: s_mov_b32 s4, s6
; GFX7-NEXT: s_mov_b32 s5, s6
; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
-; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2
; GFX7-NEXT: v_and_b32_e32 v4, 3, v4
; GFX7-NEXT: v_lshlrev_b32_e32 v4, 3, v4
; GFX7-NEXT: s_mov_b64 s[8:9], 0
+; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2
; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v2
; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v4
; GFX7-NEXT: v_not_b32_e32 v6, v2
@@ -7054,10 +7059,11 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b_neg__amdgpu_no_fine_g
; GFX6-NEXT: s_mov_b32 s4, s6
; GFX6-NEXT: s_mov_b32 s5, s6
; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
-; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2
; GFX6-NEXT: v_and_b32_e32 v4, 3, v4
; GFX6-NEXT: v_lshlrev_b32_e32 v4, 3, v4
; GFX6-NEXT: s_mov_b64 s[8:9], 0
+; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2
; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v2
; GFX6-NEXT: v_lshl_b32_e32 v2, 0xffff, v4
; GFX6-NEXT: v_not_b32_e32 v6, v2
@@ -7366,30 +7372,31 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos__align4__amdgpu_no_
; GFX7-NEXT: s_mov_b32 s4, s6
; GFX7-NEXT: s_mov_b32 s5, s6
; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2046
-; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2
; GFX7-NEXT: s_mov_b64 s[8:9], 0
-; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v2
+; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2
; GFX7-NEXT: .LBB32_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v3
-; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v3
-; GFX7-NEXT: v_max_f32_e32 v2, v2, v6
-; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2
-; GFX7-NEXT: v_or_b32_e32 v2, v4, v2
; GFX7-NEXT: v_mov_b32_e32 v5, v3
-; GFX7-NEXT: v_mov_b32_e32 v4, v2
-; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 offset:2046 glc
+; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v5
+; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v5
+; GFX7-NEXT: v_max_f32_e32 v3, v3, v2
+; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GFX7-NEXT: v_or_b32_e32 v4, v4, v3
+; GFX7-NEXT: v_mov_b32_e32 v3, v4
+; GFX7-NEXT: v_mov_b32_e32 v4, v5
+; GFX7-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:2046 glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX7-NEXT: v_mov_b32_e32 v3, v4
; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GFX7-NEXT: s_cbranch_execnz .LBB32_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[8:9]
-; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v4
+; GFX7-NEXT: v_mov_b32_e32 v0, v3
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX6-LABEL: global_agent_atomic_fmax_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory:
@@ -7400,31 +7407,32 @@ define half @global_agent_atomic_fmax_ret_f16__offset12b_pos__align4__amdgpu_no_
; GFX6-NEXT: s_mov_b32 s4, s6
; GFX6-NEXT: s_mov_b32 s5, s6
; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2046
-; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2
; GFX6-NEXT: s_mov_b64 s[8:9], 0
-; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v2
+; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2
; GFX6-NEXT: .LBB32_1: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v3
-; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v3
-; GFX6-NEXT: v_max_f32_e32 v2, v2, v6
-; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2
-; GFX6-NEXT: v_or_b32_e32 v2, v4, v2
; GFX6-NEXT: v_mov_b32_e32 v5, v3
-; GFX6-NEXT: v_mov_b32_e32 v4, v2
-; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 offset:2046 glc
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v5
+; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v5
+; GFX6-NEXT: v_max_f32_e32 v3, v3, v2
+; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GFX6-NEXT: v_or_b32_e32 v4, v4, v3
+; GFX6-NEXT: v_mov_b32_e32 v3, v4
+; GFX6-NEXT: v_mov_b32_e32 v4, v5
+; GFX6-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:2046 glc
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
-; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
+; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX6-NEXT: v_mov_b32_e32 v3, v4
; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GFX6-NEXT: s_cbranch_execnz .LBB32_1
; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX6-NEXT: s_or_b64 exec, exec, s[8:9]
-; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v4
+; GFX6-NEXT: v_mov_b32_e32 v0, v3
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr half, ptr addrspace(1) %ptr, i64 1023
@@ -7697,8 +7705,9 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b__align4_pos__amdgpu_n
; GFX7-NEXT: s_mov_b32 s4, s6
; GFX7-NEXT: s_mov_b32 s5, s6
; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2046
-; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2
; GFX7-NEXT: s_mov_b64 s[8:9], 0
+; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2
; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v2
; GFX7-NEXT: .LBB33_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -7730,8 +7739,9 @@ define void @global_agent_atomic_fmax_noret_f16__offset12b__align4_pos__amdgpu_n
; GFX6-NEXT: s_mov_b32 s4, s6
; GFX6-NEXT: s_mov_b32 s5, s6
; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2046
-; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2
; GFX6-NEXT: s_mov_b64 s[8:9], 0
+; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2
; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v2
; GFX6-NEXT: .LBB33_1: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -8141,10 +8151,11 @@ define half @global_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_gr
; GFX7-NEXT: s_mov_b32 s4, s6
; GFX7-NEXT: s_mov_b32 s5, s6
; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
-; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2
; GFX7-NEXT: v_and_b32_e32 v4, 3, v4
; GFX7-NEXT: v_lshlrev_b32_e32 v6, 3, v4
; GFX7-NEXT: s_mov_b64 s[8:9], 0
+; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2
; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v2
; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v6
; GFX7-NEXT: v_not_b32_e32 v8, v2
@@ -8171,7 +8182,6 @@ define half @global_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_gr
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[8:9]
; GFX7-NEXT: v_lshrrev_b32_e32 v0, v6, v4
-; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX6-LABEL: global_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
@@ -8185,10 +8195,11 @@ define half @global_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_gr
; GFX6-NEXT: s_mov_b32 s4, s6
; GFX6-NEXT: s_mov_b32 s5, s6
; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
-; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2
; GFX6-NEXT: v_and_b32_e32 v4, 3, v4
; GFX6-NEXT: v_lshlrev_b32_e32 v6, 3, v4
; GFX6-NEXT: s_mov_b64 s[8:9], 0
+; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2
; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v2
; GFX6-NEXT: v_lshl_b32_e32 v2, 0xffff, v6
; GFX6-NEXT: v_not_b32_e32 v8, v2
@@ -8216,7 +8227,6 @@ define half @global_system_atomic_fmax_ret_f16__offset12b_pos__amdgpu_no_fine_gr
; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX6-NEXT: s_or_b64 exec, exec, s[8:9]
; GFX6-NEXT: v_lshrrev_b32_e32 v0, v6, v4
-; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr half, ptr addrspace(1) %ptr, i64 1023
@@ -8593,10 +8603,11 @@ define void @global_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_
; GFX7-NEXT: s_mov_b32 s4, s6
; GFX7-NEXT: s_mov_b32 s5, s6
; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
-; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2
; GFX7-NEXT: v_and_b32_e32 v4, 3, v4
; GFX7-NEXT: v_lshlrev_b32_e32 v4, 3, v4
; GFX7-NEXT: s_mov_b64 s[8:9], 0
+; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2
; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v2
; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v4
; GFX7-NEXT: v_not_b32_e32 v6, v2
@@ -8635,10 +8646,11 @@ define void @global_system_atomic_fmax_noret_f16__offset12b_pos__amdgpu_no_fine_
; GFX6-NEXT: s_mov_b32 s4, s6
; GFX6-NEXT: s_mov_b32 s5, s6
; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
-; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2
; GFX6-NEXT: v_and_b32_e32 v4, 3, v4
; GFX6-NEXT: v_lshlrev_b32_e32 v4, 3, v4
; GFX6-NEXT: s_mov_b64 s[8:9], 0
+; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2
; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v2
; GFX6-NEXT: v_lshl_b32_e32 v2, 0xffff, v4
; GFX6-NEXT: v_not_b32_e32 v6, v2
@@ -9097,6 +9109,7 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__amdgpu_no_fine_grained_memory(
; GFX7-NEXT: s_mov_b32 s5, s6
; GFX7-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64
; GFX7-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX7-NEXT: v_lshlrev_b32_e32 v5, 3, v3
; GFX7-NEXT: v_lshl_b32_e32 v3, 0xffff, v5
; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
@@ -9127,7 +9140,6 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__amdgpu_no_fine_grained_memory(
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[8:9]
; GFX7-NEXT: v_lshrrev_b32_e32 v0, v5, v2
-; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX6-LABEL: global_agent_atomic_fmax_ret_bf16__amdgpu_no_fine_grained_memory:
@@ -9141,6 +9153,7 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__amdgpu_no_fine_grained_memory(
; GFX6-NEXT: s_mov_b32 s5, s6
; GFX6-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64
; GFX6-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX6-NEXT: v_lshlrev_b32_e32 v5, 3, v3
; GFX6-NEXT: v_lshl_b32_e32 v3, 0xffff, v5
; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2
@@ -9171,7 +9184,6 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__amdgpu_no_fine_grained_memory(
; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX6-NEXT: s_or_b64 exec, exec, s[8:9]
; GFX6-NEXT: v_lshrrev_b32_e32 v0, v5, v2
-; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: s_setpc_b64 s[30:31]
%result = atomicrmw fmax ptr addrspace(1) %ptr, bfloat %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
@@ -9613,6 +9625,7 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_
; GFX7-NEXT: s_mov_b32 s5, s6
; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
; GFX7-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX7-NEXT: v_lshlrev_b32_e32 v6, 3, v4
; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v6
; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
@@ -9643,7 +9656,6 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[8:9]
; GFX7-NEXT: v_lshrrev_b32_e32 v0, v6, v4
-; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX6-LABEL: global_agent_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
@@ -9658,6 +9670,7 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_
; GFX6-NEXT: s_mov_b32 s5, s6
; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
; GFX6-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX6-NEXT: v_lshlrev_b32_e32 v6, 3, v4
; GFX6-NEXT: v_lshl_b32_e32 v4, 0xffff, v6
; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2
@@ -9689,7 +9702,6 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_
; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX6-NEXT: s_or_b64 exec, exec, s[8:9]
; GFX6-NEXT: v_lshrrev_b32_e32 v0, v6, v4
-; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr bfloat, ptr addrspace(1) %ptr, i64 1023
@@ -10133,6 +10145,7 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_neg__amdgpu_no_fine_
; GFX7-NEXT: s_mov_b32 s5, s6
; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
; GFX7-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX7-NEXT: v_lshlrev_b32_e32 v6, 3, v4
; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v6
; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
@@ -10163,7 +10176,6 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_neg__amdgpu_no_fine_
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[8:9]
; GFX7-NEXT: v_lshrrev_b32_e32 v0, v6, v4
-; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX6-LABEL: global_agent_atomic_fmax_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
@@ -10178,6 +10190,7 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_neg__amdgpu_no_fine_
; GFX6-NEXT: s_mov_b32 s5, s6
; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
; GFX6-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX6-NEXT: v_lshlrev_b32_e32 v6, 3, v4
; GFX6-NEXT: v_lshl_b32_e32 v4, 0xffff, v6
; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2
@@ -10209,7 +10222,6 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_neg__amdgpu_no_fine_
; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX6-NEXT: s_or_b64 exec, exec, s[8:9]
; GFX6-NEXT: v_lshrrev_b32_e32 v0, v6, v4
-; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr bfloat, ptr addrspace(1) %ptr, i64 -1024
@@ -10624,23 +10636,24 @@ define void @global_agent_atomic_fmax_noret_bf16__amdgpu_no_fine_grained_memory(
; GFX7-NEXT: s_mov_b32 s4, s6
; GFX7-NEXT: s_mov_b32 s5, s6
; GFX7-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64
-; GFX7-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX7-NEXT: v_lshlrev_b32_e32 v5, 3, v3
-; GFX7-NEXT: v_lshl_b32_e32 v3, 0xffff, v5
-; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GFX7-NEXT: v_not_b32_e32 v6, v3
+; GFX7-NEXT: v_lshlrev_b32_e32 v6, 16, v2
+; GFX7-NEXT: v_and_b32_e32 v2, 3, v3
+; GFX7-NEXT: v_lshlrev_b32_e32 v2, 3, v2
+; GFX7-NEXT: v_lshl_b32_e32 v3, 0xffff, v2
+; GFX7-NEXT: v_not_b32_e32 v5, v3
+; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v6
; GFX7-NEXT: s_mov_b64 s[8:9], 0
-; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
; GFX7-NEXT: .LBB39_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_lshrrev_b32_e32 v3, v5, v4
+; GFX7-NEXT: v_lshrrev_b32_e32 v3, v2, v4
; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
-; GFX7-NEXT: v_max_f32_e32 v3, v3, v2
+; GFX7-NEXT: v_max_f32_e32 v3, v3, v6
; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; GFX7-NEXT: v_and_b32_e32 v7, v4, v6
-; GFX7-NEXT: v_lshlrev_b32_e32 v3, v5, v3
+; GFX7-NEXT: v_and_b32_e32 v7, v4, v5
+; GFX7-NEXT: v_lshlrev_b32_e32 v3, v2, v3
; GFX7-NEXT: v_or_b32_e32 v3, v7, v3
; GFX7-NEXT: v_mov_b32_e32 v8, v4
; GFX7-NEXT: v_mov_b32_e32 v7, v3
@@ -10666,24 +10679,25 @@ define void @global_agent_atomic_fmax_noret_bf16__amdgpu_no_fine_grained_memory(
; GFX6-NEXT: s_mov_b32 s4, s6
; GFX6-NEXT: s_mov_b32 s5, s6
; GFX6-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64
-; GFX6-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX6-NEXT: v_lshlrev_b32_e32 v5, 3, v3
-; GFX6-NEXT: v_lshl_b32_e32 v3, 0xffff, v5
-; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GFX6-NEXT: v_not_b32_e32 v6, v3
+; GFX6-NEXT: v_lshlrev_b32_e32 v6, 16, v2
+; GFX6-NEXT: v_and_b32_e32 v2, 3, v3
+; GFX6-NEXT: v_lshlrev_b32_e32 v2, 3, v2
+; GFX6-NEXT: v_lshl_b32_e32 v3, 0xffff, v2
+; GFX6-NEXT: v_not_b32_e32 v5, v3
+; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v6
; GFX6-NEXT: s_mov_b64 s[8:9], 0
-; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
; GFX6-NEXT: .LBB39_1: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_lshrrev_b32_e32 v3, v5, v4
+; GFX6-NEXT: v_lshrrev_b32_e32 v3, v2, v4
; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3
-; GFX6-NEXT: v_max_f32_e32 v3, v3, v2
+; GFX6-NEXT: v_max_f32_e32 v3, v3, v6
; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3
; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_and_b32_e32 v7, v4, v6
-; GFX6-NEXT: v_lshlrev_b32_e32 v3, v5, v3
+; GFX6-NEXT: v_and_b32_e32 v7, v4, v5
+; GFX6-NEXT: v_lshlrev_b32_e32 v3, v2, v3
; GFX6-NEXT: v_or_b32_e32 v3, v7, v3
; GFX6-NEXT: v_mov_b32_e32 v8, v4
; GFX6-NEXT: v_mov_b32_e32 v7, v3
@@ -11125,6 +11139,7 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_
; GFX7-NEXT: s_mov_b32 s5, s6
; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
; GFX7-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX7-NEXT: v_lshlrev_b32_e32 v4, 3, v4
; GFX7-NEXT: v_lshl_b32_e32 v5, 0xffff, v4
; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
@@ -11168,6 +11183,7 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine_
; GFX6-NEXT: s_mov_b32 s5, s6
; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
; GFX6-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX6-NEXT: v_lshlrev_b32_e32 v4, 3, v4
; GFX6-NEXT: v_lshl_b32_e32 v5, 0xffff, v4
; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2
@@ -11628,6 +11644,7 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b_neg__amdgpu_no_fine_
; GFX7-NEXT: s_mov_b32 s5, s6
; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
; GFX7-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX7-NEXT: v_lshlrev_b32_e32 v4, 3, v4
; GFX7-NEXT: v_lshl_b32_e32 v5, 0xffff, v4
; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
@@ -11671,6 +11688,7 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b_neg__amdgpu_no_fine_
; GFX6-NEXT: s_mov_b32 s5, s6
; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
; GFX6-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX6-NEXT: v_lshlrev_b32_e32 v4, 3, v4
; GFX6-NEXT: v_lshl_b32_e32 v5, 0xffff, v4
; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2
@@ -12052,31 +12070,32 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_pos__align4__amdgpu_
; GFX7-NEXT: s_mov_b32 s4, s6
; GFX7-NEXT: s_mov_b32 s5, s6
; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2046
+; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GFX7-NEXT: s_mov_b64 s[8:9], 0
-; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
+; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX7-NEXT: .LBB42_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v3
-; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GFX7-NEXT: v_max_f32_e32 v2, v2, v6
-; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v3
-; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX7-NEXT: v_or_b32_e32 v2, v4, v2
; GFX7-NEXT: v_mov_b32_e32 v5, v3
-; GFX7-NEXT: v_mov_b32_e32 v4, v2
-; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 offset:2046 glc
+; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v5
+; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
+; GFX7-NEXT: v_max_f32_e32 v3, v3, v2
+; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v5
+; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX7-NEXT: v_or_b32_e32 v4, v4, v3
+; GFX7-NEXT: v_mov_b32_e32 v3, v4
+; GFX7-NEXT: v_mov_b32_e32 v4, v5
+; GFX7-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:2046 glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX7-NEXT: v_mov_b32_e32 v3, v4
; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GFX7-NEXT: s_cbranch_execnz .LBB42_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[8:9]
-; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v4
+; GFX7-NEXT: v_mov_b32_e32 v0, v3
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX6-LABEL: global_agent_atomic_fmax_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory:
@@ -12087,32 +12106,33 @@ define bfloat @global_agent_atomic_fmax_ret_bf16__offset12b_pos__align4__amdgpu_
; GFX6-NEXT: s_mov_b32 s4, s6
; GFX6-NEXT: s_mov_b32 s5, s6
; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2046
+; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GFX6-NEXT: s_mov_b64 s[8:9], 0
-; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
+; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX6-NEXT: .LBB42_1: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v3
-; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GFX6-NEXT: v_max_f32_e32 v2, v2, v6
-; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v3
-; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX6-NEXT: v_or_b32_e32 v2, v4, v2
; GFX6-NEXT: v_mov_b32_e32 v5, v3
-; GFX6-NEXT: v_mov_b32_e32 v4, v2
-; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 offset:2046 glc
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v5
+; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3
+; GFX6-NEXT: v_max_f32_e32 v3, v3, v2
+; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v5
+; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX6-NEXT: v_or_b32_e32 v4, v4, v3
+; GFX6-NEXT: v_mov_b32_e32 v3, v4
+; GFX6-NEXT: v_mov_b32_e32 v4, v5
+; GFX6-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:2046 glc
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
-; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
+; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX6-NEXT: v_mov_b32_e32 v3, v4
; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GFX6-NEXT: s_cbranch_execnz .LBB42_1
; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX6-NEXT: s_or_b64 exec, exec, s[8:9]
-; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v4
+; GFX6-NEXT: v_mov_b32_e32 v0, v3
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr bfloat, ptr addrspace(1) %ptr, i64 1023
@@ -12452,6 +12472,7 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b__align4_pos__amdgpu_
; GFX7-NEXT: s_mov_b32 s4, s6
; GFX7-NEXT: s_mov_b32 s5, s6
; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2046
+; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GFX7-NEXT: s_mov_b64 s[8:9], 0
; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v2
@@ -12486,6 +12507,7 @@ define void @global_agent_atomic_fmax_noret_bf16__offset12b__align4_pos__amdgpu_
; GFX6-NEXT: s_mov_b32 s4, s6
; GFX6-NEXT: s_mov_b32 s5, s6
; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2046
+; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GFX6-NEXT: s_mov_b64 s[8:9], 0
; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v2
@@ -12957,6 +12979,7 @@ define bfloat @global_system_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine
; GFX7-NEXT: s_mov_b32 s5, s6
; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
; GFX7-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX7-NEXT: v_lshlrev_b32_e32 v6, 3, v4
; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v6
; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
@@ -12987,7 +13010,6 @@ define bfloat @global_system_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[8:9]
; GFX7-NEXT: v_lshrrev_b32_e32 v0, v6, v4
-; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX6-LABEL: global_system_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
@@ -13002,6 +13024,7 @@ define bfloat @global_system_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine
; GFX6-NEXT: s_mov_b32 s5, s6
; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
; GFX6-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX6-NEXT: v_lshlrev_b32_e32 v6, 3, v4
; GFX6-NEXT: v_lshl_b32_e32 v4, 0xffff, v6
; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2
@@ -13033,7 +13056,6 @@ define bfloat @global_system_atomic_fmax_ret_bf16__offset12b_pos__amdgpu_no_fine
; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX6-NEXT: s_or_b64 exec, exec, s[8:9]
; GFX6-NEXT: v_lshrrev_b32_e32 v0, v6, v4
-; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr bfloat, ptr addrspace(1) %ptr, i64 1023
@@ -13467,6 +13489,7 @@ define void @global_system_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine
; GFX7-NEXT: s_mov_b32 s5, s6
; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
; GFX7-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX7-NEXT: v_lshlrev_b32_e32 v4, 3, v4
; GFX7-NEXT: v_lshl_b32_e32 v5, 0xffff, v4
; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
@@ -13510,6 +13533,7 @@ define void @global_system_atomic_fmax_noret_bf16__offset12b_pos__amdgpu_no_fine
; GFX6-NEXT: s_mov_b32 s5, s6
; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
; GFX6-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX6-NEXT: v_lshlrev_b32_e32 v4, 3, v4
; GFX6-NEXT: v_lshl_b32_e32 v5, 0xffff, v4
; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2
diff --git a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmin.ll b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmin.ll
index e658cb658de78..9406e08e9e412 100644
--- a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmin.ll
+++ b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fmin.ll
@@ -4777,10 +4777,11 @@ define half @global_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory(ptr
; GFX7-NEXT: s_mov_b32 s4, s6
; GFX7-NEXT: s_mov_b32 s5, s6
; GFX7-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64
-; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2
; GFX7-NEXT: v_and_b32_e32 v3, 3, v3
; GFX7-NEXT: v_lshlrev_b32_e32 v5, 3, v3
; GFX7-NEXT: s_mov_b64 s[8:9], 0
+; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2
; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v2
; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v5
; GFX7-NEXT: v_not_b32_e32 v7, v2
@@ -4807,7 +4808,6 @@ define half @global_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory(ptr
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[8:9]
; GFX7-NEXT: v_lshrrev_b32_e32 v0, v5, v2
-; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX6-LABEL: global_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory:
@@ -4820,10 +4820,11 @@ define half @global_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory(ptr
; GFX6-NEXT: s_mov_b32 s4, s6
; GFX6-NEXT: s_mov_b32 s5, s6
; GFX6-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64
-; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2
; GFX6-NEXT: v_and_b32_e32 v3, 3, v3
; GFX6-NEXT: v_lshlrev_b32_e32 v5, 3, v3
; GFX6-NEXT: s_mov_b64 s[8:9], 0
+; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2
; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v2
; GFX6-NEXT: v_lshl_b32_e32 v2, 0xffff, v5
; GFX6-NEXT: v_not_b32_e32 v7, v2
@@ -4850,7 +4851,6 @@ define half @global_agent_atomic_fmin_ret_f16__amdgpu_no_fine_grained_memory(ptr
; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX6-NEXT: s_or_b64 exec, exec, s[8:9]
; GFX6-NEXT: v_lshrrev_b32_e32 v0, v5, v2
-; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: s_setpc_b64 s[30:31]
%result = atomicrmw fmin ptr addrspace(1) %ptr, half %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
@@ -5233,10 +5233,11 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_gra
; GFX7-NEXT: s_mov_b32 s4, s6
; GFX7-NEXT: s_mov_b32 s5, s6
; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
-; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2
; GFX7-NEXT: v_and_b32_e32 v4, 3, v4
; GFX7-NEXT: v_lshlrev_b32_e32 v6, 3, v4
; GFX7-NEXT: s_mov_b64 s[8:9], 0
+; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2
; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v2
; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v6
; GFX7-NEXT: v_not_b32_e32 v8, v2
@@ -5263,7 +5264,6 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_gra
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[8:9]
; GFX7-NEXT: v_lshrrev_b32_e32 v0, v6, v4
-; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX6-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
@@ -5277,10 +5277,11 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_gra
; GFX6-NEXT: s_mov_b32 s4, s6
; GFX6-NEXT: s_mov_b32 s5, s6
; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
-; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2
; GFX6-NEXT: v_and_b32_e32 v4, 3, v4
; GFX6-NEXT: v_lshlrev_b32_e32 v6, 3, v4
; GFX6-NEXT: s_mov_b64 s[8:9], 0
+; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2
; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v2
; GFX6-NEXT: v_lshl_b32_e32 v2, 0xffff, v6
; GFX6-NEXT: v_not_b32_e32 v8, v2
@@ -5308,7 +5309,6 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_gra
; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX6-NEXT: s_or_b64 exec, exec, s[8:9]
; GFX6-NEXT: v_lshrrev_b32_e32 v0, v6, v4
-; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr half, ptr addrspace(1) %ptr, i64 1023
@@ -5693,10 +5693,11 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_gra
; GFX7-NEXT: s_mov_b32 s4, s6
; GFX7-NEXT: s_mov_b32 s5, s6
; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
-; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2
; GFX7-NEXT: v_and_b32_e32 v4, 3, v4
; GFX7-NEXT: v_lshlrev_b32_e32 v6, 3, v4
; GFX7-NEXT: s_mov_b64 s[8:9], 0
+; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2
; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v2
; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v6
; GFX7-NEXT: v_not_b32_e32 v8, v2
@@ -5723,7 +5724,6 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_gra
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[8:9]
; GFX7-NEXT: v_lshrrev_b32_e32 v0, v6, v4
-; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX6-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_grained_memory:
@@ -5737,10 +5737,11 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_gra
; GFX6-NEXT: s_mov_b32 s4, s6
; GFX6-NEXT: s_mov_b32 s5, s6
; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
-; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2
; GFX6-NEXT: v_and_b32_e32 v4, 3, v4
; GFX6-NEXT: v_lshlrev_b32_e32 v6, 3, v4
; GFX6-NEXT: s_mov_b64 s[8:9], 0
+; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2
; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v2
; GFX6-NEXT: v_lshl_b32_e32 v2, 0xffff, v6
; GFX6-NEXT: v_not_b32_e32 v8, v2
@@ -5768,7 +5769,6 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_neg__amdgpu_no_fine_gra
; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX6-NEXT: s_or_b64 exec, exec, s[8:9]
; GFX6-NEXT: v_lshrrev_b32_e32 v0, v6, v4
-; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr half, ptr addrspace(1) %ptr, i64 -1024
@@ -6125,13 +6125,14 @@ define void @global_agent_atomic_fmin_noret_f16__amdgpu_no_fine_grained_memory(p
; GFX7-NEXT: s_mov_b32 s4, s6
; GFX7-NEXT: s_mov_b32 s5, s6
; GFX7-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2
+; GFX7-NEXT: s_mov_b64 s[8:9], 0
; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v2
; GFX7-NEXT: v_and_b32_e32 v2, 3, v3
; GFX7-NEXT: v_lshlrev_b32_e32 v2, 3, v2
; GFX7-NEXT: v_lshl_b32_e32 v3, 0xffff, v2
; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5
; GFX7-NEXT: v_not_b32_e32 v6, v3
-; GFX7-NEXT: s_mov_b64 s[8:9], 0
; GFX7-NEXT: .LBB29_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0)
@@ -6166,13 +6167,14 @@ define void @global_agent_atomic_fmin_noret_f16__amdgpu_no_fine_grained_memory(p
; GFX6-NEXT: s_mov_b32 s4, s6
; GFX6-NEXT: s_mov_b32 s5, s6
; GFX6-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64
+; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2
+; GFX6-NEXT: s_mov_b64 s[8:9], 0
; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v2
; GFX6-NEXT: v_and_b32_e32 v2, 3, v3
; GFX6-NEXT: v_lshlrev_b32_e32 v2, 3, v2
; GFX6-NEXT: v_lshl_b32_e32 v3, 0xffff, v2
; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v5
; GFX6-NEXT: v_not_b32_e32 v6, v3
-; GFX6-NEXT: s_mov_b64 s[8:9], 0
; GFX6-NEXT: .LBB29_1: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX6-NEXT: s_waitcnt vmcnt(0)
@@ -6567,10 +6569,11 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_g
; GFX7-NEXT: s_mov_b32 s4, s6
; GFX7-NEXT: s_mov_b32 s5, s6
; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
-; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2
; GFX7-NEXT: v_and_b32_e32 v4, 3, v4
; GFX7-NEXT: v_lshlrev_b32_e32 v4, 3, v4
; GFX7-NEXT: s_mov_b64 s[8:9], 0
+; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2
; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v2
; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v4
; GFX7-NEXT: v_not_b32_e32 v6, v2
@@ -6609,10 +6612,11 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_g
; GFX6-NEXT: s_mov_b32 s4, s6
; GFX6-NEXT: s_mov_b32 s5, s6
; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
-; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2
; GFX6-NEXT: v_and_b32_e32 v4, 3, v4
; GFX6-NEXT: v_lshlrev_b32_e32 v4, 3, v4
; GFX6-NEXT: s_mov_b64 s[8:9], 0
+; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2
; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v2
; GFX6-NEXT: v_lshl_b32_e32 v2, 0xffff, v4
; GFX6-NEXT: v_not_b32_e32 v6, v2
@@ -7012,10 +7016,11 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_g
; GFX7-NEXT: s_mov_b32 s4, s6
; GFX7-NEXT: s_mov_b32 s5, s6
; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
-; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2
; GFX7-NEXT: v_and_b32_e32 v4, 3, v4
; GFX7-NEXT: v_lshlrev_b32_e32 v4, 3, v4
; GFX7-NEXT: s_mov_b64 s[8:9], 0
+; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2
; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v2
; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v4
; GFX7-NEXT: v_not_b32_e32 v6, v2
@@ -7054,10 +7059,11 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b_neg__amdgpu_no_fine_g
; GFX6-NEXT: s_mov_b32 s4, s6
; GFX6-NEXT: s_mov_b32 s5, s6
; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
-; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2
; GFX6-NEXT: v_and_b32_e32 v4, 3, v4
; GFX6-NEXT: v_lshlrev_b32_e32 v4, 3, v4
; GFX6-NEXT: s_mov_b64 s[8:9], 0
+; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2
; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v2
; GFX6-NEXT: v_lshl_b32_e32 v2, 0xffff, v4
; GFX6-NEXT: v_not_b32_e32 v6, v2
@@ -7366,30 +7372,31 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos__align4__amdgpu_no_
; GFX7-NEXT: s_mov_b32 s4, s6
; GFX7-NEXT: s_mov_b32 s5, s6
; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2046
-; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2
; GFX7-NEXT: s_mov_b64 s[8:9], 0
-; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v2
+; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2
; GFX7-NEXT: .LBB32_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v3
-; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v3
-; GFX7-NEXT: v_min_f32_e32 v2, v2, v6
-; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2
-; GFX7-NEXT: v_or_b32_e32 v2, v4, v2
; GFX7-NEXT: v_mov_b32_e32 v5, v3
-; GFX7-NEXT: v_mov_b32_e32 v4, v2
-; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 offset:2046 glc
+; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v5
+; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v5
+; GFX7-NEXT: v_min_f32_e32 v3, v3, v2
+; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GFX7-NEXT: v_or_b32_e32 v4, v4, v3
+; GFX7-NEXT: v_mov_b32_e32 v3, v4
+; GFX7-NEXT: v_mov_b32_e32 v4, v5
+; GFX7-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:2046 glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX7-NEXT: v_mov_b32_e32 v3, v4
; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GFX7-NEXT: s_cbranch_execnz .LBB32_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[8:9]
-; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v4
+; GFX7-NEXT: v_mov_b32_e32 v0, v3
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX6-LABEL: global_agent_atomic_fmin_ret_f16__offset12b_pos__align4__amdgpu_no_fine_grained_memory:
@@ -7400,31 +7407,32 @@ define half @global_agent_atomic_fmin_ret_f16__offset12b_pos__align4__amdgpu_no_
; GFX6-NEXT: s_mov_b32 s4, s6
; GFX6-NEXT: s_mov_b32 s5, s6
; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2046
-; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2
; GFX6-NEXT: s_mov_b64 s[8:9], 0
-; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v2
+; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2
; GFX6-NEXT: .LBB32_1: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v3
-; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v3
-; GFX6-NEXT: v_min_f32_e32 v2, v2, v6
-; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2
-; GFX6-NEXT: v_or_b32_e32 v2, v4, v2
; GFX6-NEXT: v_mov_b32_e32 v5, v3
-; GFX6-NEXT: v_mov_b32_e32 v4, v2
-; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 offset:2046 glc
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v5
+; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v5
+; GFX6-NEXT: v_min_f32_e32 v3, v3, v2
+; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GFX6-NEXT: v_or_b32_e32 v4, v4, v3
+; GFX6-NEXT: v_mov_b32_e32 v3, v4
+; GFX6-NEXT: v_mov_b32_e32 v4, v5
+; GFX6-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:2046 glc
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
-; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
+; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX6-NEXT: v_mov_b32_e32 v3, v4
; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GFX6-NEXT: s_cbranch_execnz .LBB32_1
; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX6-NEXT: s_or_b64 exec, exec, s[8:9]
-; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v4
+; GFX6-NEXT: v_mov_b32_e32 v0, v3
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr half, ptr addrspace(1) %ptr, i64 1023
@@ -7697,8 +7705,9 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_n
; GFX7-NEXT: s_mov_b32 s4, s6
; GFX7-NEXT: s_mov_b32 s5, s6
; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2046
-; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2
; GFX7-NEXT: s_mov_b64 s[8:9], 0
+; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2
; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v2
; GFX7-NEXT: .LBB33_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -7730,8 +7739,9 @@ define void @global_agent_atomic_fmin_noret_f16__offset12b__align4_pos__amdgpu_n
; GFX6-NEXT: s_mov_b32 s4, s6
; GFX6-NEXT: s_mov_b32 s5, s6
; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2046
-; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2
; GFX6-NEXT: s_mov_b64 s[8:9], 0
+; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2
; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v2
; GFX6-NEXT: .LBB33_1: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -8141,10 +8151,11 @@ define half @global_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_gr
; GFX7-NEXT: s_mov_b32 s4, s6
; GFX7-NEXT: s_mov_b32 s5, s6
; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
-; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2
; GFX7-NEXT: v_and_b32_e32 v4, 3, v4
; GFX7-NEXT: v_lshlrev_b32_e32 v6, 3, v4
; GFX7-NEXT: s_mov_b64 s[8:9], 0
+; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2
; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v2
; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v6
; GFX7-NEXT: v_not_b32_e32 v8, v2
@@ -8171,7 +8182,6 @@ define half @global_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_gr
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[8:9]
; GFX7-NEXT: v_lshrrev_b32_e32 v0, v6, v4
-; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX6-LABEL: global_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_grained_memory:
@@ -8185,10 +8195,11 @@ define half @global_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_gr
; GFX6-NEXT: s_mov_b32 s4, s6
; GFX6-NEXT: s_mov_b32 s5, s6
; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
-; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2
; GFX6-NEXT: v_and_b32_e32 v4, 3, v4
; GFX6-NEXT: v_lshlrev_b32_e32 v6, 3, v4
; GFX6-NEXT: s_mov_b64 s[8:9], 0
+; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2
; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v2
; GFX6-NEXT: v_lshl_b32_e32 v2, 0xffff, v6
; GFX6-NEXT: v_not_b32_e32 v8, v2
@@ -8216,7 +8227,6 @@ define half @global_system_atomic_fmin_ret_f16__offset12b_pos__amdgpu_no_fine_gr
; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX6-NEXT: s_or_b64 exec, exec, s[8:9]
; GFX6-NEXT: v_lshrrev_b32_e32 v0, v6, v4
-; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr half, ptr addrspace(1) %ptr, i64 1023
@@ -8593,10 +8603,11 @@ define void @global_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_
; GFX7-NEXT: s_mov_b32 s4, s6
; GFX7-NEXT: s_mov_b32 s5, s6
; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
-; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2
; GFX7-NEXT: v_and_b32_e32 v4, 3, v4
; GFX7-NEXT: v_lshlrev_b32_e32 v4, 3, v4
; GFX7-NEXT: s_mov_b64 s[8:9], 0
+; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2
; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v2
; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v4
; GFX7-NEXT: v_not_b32_e32 v6, v2
@@ -8635,10 +8646,11 @@ define void @global_system_atomic_fmin_noret_f16__offset12b_pos__amdgpu_no_fine_
; GFX6-NEXT: s_mov_b32 s4, s6
; GFX6-NEXT: s_mov_b32 s5, s6
; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
-; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2
; GFX6-NEXT: v_and_b32_e32 v4, 3, v4
; GFX6-NEXT: v_lshlrev_b32_e32 v4, 3, v4
; GFX6-NEXT: s_mov_b64 s[8:9], 0
+; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2
; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v2
; GFX6-NEXT: v_lshl_b32_e32 v2, 0xffff, v4
; GFX6-NEXT: v_not_b32_e32 v6, v2
@@ -9097,6 +9109,7 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__amdgpu_no_fine_grained_memory(
; GFX7-NEXT: s_mov_b32 s5, s6
; GFX7-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64
; GFX7-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX7-NEXT: v_lshlrev_b32_e32 v5, 3, v3
; GFX7-NEXT: v_lshl_b32_e32 v3, 0xffff, v5
; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
@@ -9127,7 +9140,6 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__amdgpu_no_fine_grained_memory(
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[8:9]
; GFX7-NEXT: v_lshrrev_b32_e32 v0, v5, v2
-; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX6-LABEL: global_agent_atomic_fmin_ret_bf16__amdgpu_no_fine_grained_memory:
@@ -9141,6 +9153,7 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__amdgpu_no_fine_grained_memory(
; GFX6-NEXT: s_mov_b32 s5, s6
; GFX6-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64
; GFX6-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX6-NEXT: v_lshlrev_b32_e32 v5, 3, v3
; GFX6-NEXT: v_lshl_b32_e32 v3, 0xffff, v5
; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2
@@ -9171,7 +9184,6 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__amdgpu_no_fine_grained_memory(
; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX6-NEXT: s_or_b64 exec, exec, s[8:9]
; GFX6-NEXT: v_lshrrev_b32_e32 v0, v5, v2
-; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: s_setpc_b64 s[30:31]
%result = atomicrmw fmin ptr addrspace(1) %ptr, bfloat %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
@@ -9613,6 +9625,7 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_
; GFX7-NEXT: s_mov_b32 s5, s6
; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
; GFX7-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX7-NEXT: v_lshlrev_b32_e32 v6, 3, v4
; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v6
; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
@@ -9643,7 +9656,6 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[8:9]
; GFX7-NEXT: v_lshrrev_b32_e32 v0, v6, v4
-; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX6-LABEL: global_agent_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
@@ -9658,6 +9670,7 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_
; GFX6-NEXT: s_mov_b32 s5, s6
; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
; GFX6-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX6-NEXT: v_lshlrev_b32_e32 v6, 3, v4
; GFX6-NEXT: v_lshl_b32_e32 v4, 0xffff, v6
; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2
@@ -9689,7 +9702,6 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_
; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX6-NEXT: s_or_b64 exec, exec, s[8:9]
; GFX6-NEXT: v_lshrrev_b32_e32 v0, v6, v4
-; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr bfloat, ptr addrspace(1) %ptr, i64 1023
@@ -10133,6 +10145,7 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_neg__amdgpu_no_fine_
; GFX7-NEXT: s_mov_b32 s5, s6
; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
; GFX7-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX7-NEXT: v_lshlrev_b32_e32 v6, 3, v4
; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v6
; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
@@ -10163,7 +10176,6 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_neg__amdgpu_no_fine_
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[8:9]
; GFX7-NEXT: v_lshrrev_b32_e32 v0, v6, v4
-; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX6-LABEL: global_agent_atomic_fmin_ret_bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
@@ -10178,6 +10190,7 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_neg__amdgpu_no_fine_
; GFX6-NEXT: s_mov_b32 s5, s6
; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
; GFX6-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX6-NEXT: v_lshlrev_b32_e32 v6, 3, v4
; GFX6-NEXT: v_lshl_b32_e32 v4, 0xffff, v6
; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2
@@ -10209,7 +10222,6 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_neg__amdgpu_no_fine_
; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX6-NEXT: s_or_b64 exec, exec, s[8:9]
; GFX6-NEXT: v_lshrrev_b32_e32 v0, v6, v4
-; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr bfloat, ptr addrspace(1) %ptr, i64 -1024
@@ -10624,23 +10636,24 @@ define void @global_agent_atomic_fmin_noret_bf16__amdgpu_no_fine_grained_memory(
; GFX7-NEXT: s_mov_b32 s4, s6
; GFX7-NEXT: s_mov_b32 s5, s6
; GFX7-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64
-; GFX7-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX7-NEXT: v_lshlrev_b32_e32 v5, 3, v3
-; GFX7-NEXT: v_lshl_b32_e32 v3, 0xffff, v5
-; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GFX7-NEXT: v_not_b32_e32 v6, v3
+; GFX7-NEXT: v_lshlrev_b32_e32 v6, 16, v2
+; GFX7-NEXT: v_and_b32_e32 v2, 3, v3
+; GFX7-NEXT: v_lshlrev_b32_e32 v2, 3, v2
+; GFX7-NEXT: v_lshl_b32_e32 v3, 0xffff, v2
+; GFX7-NEXT: v_not_b32_e32 v5, v3
+; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v6
; GFX7-NEXT: s_mov_b64 s[8:9], 0
-; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
; GFX7-NEXT: .LBB39_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_lshrrev_b32_e32 v3, v5, v4
+; GFX7-NEXT: v_lshrrev_b32_e32 v3, v2, v4
; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
-; GFX7-NEXT: v_min_f32_e32 v3, v3, v2
+; GFX7-NEXT: v_min_f32_e32 v3, v3, v6
; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; GFX7-NEXT: v_and_b32_e32 v7, v4, v6
-; GFX7-NEXT: v_lshlrev_b32_e32 v3, v5, v3
+; GFX7-NEXT: v_and_b32_e32 v7, v4, v5
+; GFX7-NEXT: v_lshlrev_b32_e32 v3, v2, v3
; GFX7-NEXT: v_or_b32_e32 v3, v7, v3
; GFX7-NEXT: v_mov_b32_e32 v8, v4
; GFX7-NEXT: v_mov_b32_e32 v7, v3
@@ -10666,24 +10679,25 @@ define void @global_agent_atomic_fmin_noret_bf16__amdgpu_no_fine_grained_memory(
; GFX6-NEXT: s_mov_b32 s4, s6
; GFX6-NEXT: s_mov_b32 s5, s6
; GFX6-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64
-; GFX6-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX6-NEXT: v_lshlrev_b32_e32 v5, 3, v3
-; GFX6-NEXT: v_lshl_b32_e32 v3, 0xffff, v5
-; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GFX6-NEXT: v_not_b32_e32 v6, v3
+; GFX6-NEXT: v_lshlrev_b32_e32 v6, 16, v2
+; GFX6-NEXT: v_and_b32_e32 v2, 3, v3
+; GFX6-NEXT: v_lshlrev_b32_e32 v2, 3, v2
+; GFX6-NEXT: v_lshl_b32_e32 v3, 0xffff, v2
+; GFX6-NEXT: v_not_b32_e32 v5, v3
+; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v6
; GFX6-NEXT: s_mov_b64 s[8:9], 0
-; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
; GFX6-NEXT: .LBB39_1: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_lshrrev_b32_e32 v3, v5, v4
+; GFX6-NEXT: v_lshrrev_b32_e32 v3, v2, v4
; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3
-; GFX6-NEXT: v_min_f32_e32 v3, v3, v2
+; GFX6-NEXT: v_min_f32_e32 v3, v3, v6
; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3
; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_and_b32_e32 v7, v4, v6
-; GFX6-NEXT: v_lshlrev_b32_e32 v3, v5, v3
+; GFX6-NEXT: v_and_b32_e32 v7, v4, v5
+; GFX6-NEXT: v_lshlrev_b32_e32 v3, v2, v3
; GFX6-NEXT: v_or_b32_e32 v3, v7, v3
; GFX6-NEXT: v_mov_b32_e32 v8, v4
; GFX6-NEXT: v_mov_b32_e32 v7, v3
@@ -11125,6 +11139,7 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_
; GFX7-NEXT: s_mov_b32 s5, s6
; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
; GFX7-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX7-NEXT: v_lshlrev_b32_e32 v4, 3, v4
; GFX7-NEXT: v_lshl_b32_e32 v5, 0xffff, v4
; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
@@ -11168,6 +11183,7 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine_
; GFX6-NEXT: s_mov_b32 s5, s6
; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
; GFX6-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX6-NEXT: v_lshlrev_b32_e32 v4, 3, v4
; GFX6-NEXT: v_lshl_b32_e32 v5, 0xffff, v4
; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2
@@ -11628,6 +11644,7 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b_neg__amdgpu_no_fine_
; GFX7-NEXT: s_mov_b32 s5, s6
; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
; GFX7-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX7-NEXT: v_lshlrev_b32_e32 v4, 3, v4
; GFX7-NEXT: v_lshl_b32_e32 v5, 0xffff, v4
; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
@@ -11671,6 +11688,7 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b_neg__amdgpu_no_fine_
; GFX6-NEXT: s_mov_b32 s5, s6
; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
; GFX6-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX6-NEXT: v_lshlrev_b32_e32 v4, 3, v4
; GFX6-NEXT: v_lshl_b32_e32 v5, 0xffff, v4
; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2
@@ -12052,31 +12070,32 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_pos__align4__amdgpu_
; GFX7-NEXT: s_mov_b32 s4, s6
; GFX7-NEXT: s_mov_b32 s5, s6
; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2046
+; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GFX7-NEXT: s_mov_b64 s[8:9], 0
-; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
+; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX7-NEXT: .LBB42_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v3
-; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GFX7-NEXT: v_min_f32_e32 v2, v2, v6
-; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v3
-; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX7-NEXT: v_or_b32_e32 v2, v4, v2
; GFX7-NEXT: v_mov_b32_e32 v5, v3
-; GFX7-NEXT: v_mov_b32_e32 v4, v2
-; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 offset:2046 glc
+; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v5
+; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
+; GFX7-NEXT: v_min_f32_e32 v3, v3, v2
+; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v5
+; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX7-NEXT: v_or_b32_e32 v4, v4, v3
+; GFX7-NEXT: v_mov_b32_e32 v3, v4
+; GFX7-NEXT: v_mov_b32_e32 v4, v5
+; GFX7-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:2046 glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX7-NEXT: v_mov_b32_e32 v3, v4
; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GFX7-NEXT: s_cbranch_execnz .LBB42_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[8:9]
-; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v4
+; GFX7-NEXT: v_mov_b32_e32 v0, v3
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX6-LABEL: global_agent_atomic_fmin_ret_bf16__offset12b_pos__align4__amdgpu_no_fine_grained_memory:
@@ -12087,32 +12106,33 @@ define bfloat @global_agent_atomic_fmin_ret_bf16__offset12b_pos__align4__amdgpu_
; GFX6-NEXT: s_mov_b32 s4, s6
; GFX6-NEXT: s_mov_b32 s5, s6
; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2046
+; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GFX6-NEXT: s_mov_b64 s[8:9], 0
-; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
+; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX6-NEXT: .LBB42_1: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v3
-; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GFX6-NEXT: v_min_f32_e32 v2, v2, v6
-; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v3
-; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX6-NEXT: v_or_b32_e32 v2, v4, v2
; GFX6-NEXT: v_mov_b32_e32 v5, v3
-; GFX6-NEXT: v_mov_b32_e32 v4, v2
-; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 offset:2046 glc
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v5
+; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3
+; GFX6-NEXT: v_min_f32_e32 v3, v3, v2
+; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v5
+; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX6-NEXT: v_or_b32_e32 v4, v4, v3
+; GFX6-NEXT: v_mov_b32_e32 v3, v4
+; GFX6-NEXT: v_mov_b32_e32 v4, v5
+; GFX6-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:2046 glc
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
-; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
+; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX6-NEXT: v_mov_b32_e32 v3, v4
; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GFX6-NEXT: s_cbranch_execnz .LBB42_1
; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX6-NEXT: s_or_b64 exec, exec, s[8:9]
-; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v4
+; GFX6-NEXT: v_mov_b32_e32 v0, v3
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr bfloat, ptr addrspace(1) %ptr, i64 1023
@@ -12452,6 +12472,7 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b__align4_pos__amdgpu_
; GFX7-NEXT: s_mov_b32 s4, s6
; GFX7-NEXT: s_mov_b32 s5, s6
; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2046
+; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GFX7-NEXT: s_mov_b64 s[8:9], 0
; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v2
@@ -12486,6 +12507,7 @@ define void @global_agent_atomic_fmin_noret_bf16__offset12b__align4_pos__amdgpu_
; GFX6-NEXT: s_mov_b32 s4, s6
; GFX6-NEXT: s_mov_b32 s5, s6
; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2046
+; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GFX6-NEXT: s_mov_b64 s[8:9], 0
; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v2
@@ -12957,6 +12979,7 @@ define bfloat @global_system_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine
; GFX7-NEXT: s_mov_b32 s5, s6
; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
; GFX7-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX7-NEXT: v_lshlrev_b32_e32 v6, 3, v4
; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v6
; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
@@ -12987,7 +13010,6 @@ define bfloat @global_system_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[8:9]
; GFX7-NEXT: v_lshrrev_b32_e32 v0, v6, v4
-; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX6-LABEL: global_system_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine_grained_memory:
@@ -13002,6 +13024,7 @@ define bfloat @global_system_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine
; GFX6-NEXT: s_mov_b32 s5, s6
; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
; GFX6-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX6-NEXT: v_lshlrev_b32_e32 v6, 3, v4
; GFX6-NEXT: v_lshl_b32_e32 v4, 0xffff, v6
; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2
@@ -13033,7 +13056,6 @@ define bfloat @global_system_atomic_fmin_ret_bf16__offset12b_pos__amdgpu_no_fine
; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX6-NEXT: s_or_b64 exec, exec, s[8:9]
; GFX6-NEXT: v_lshrrev_b32_e32 v0, v6, v4
-; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr bfloat, ptr addrspace(1) %ptr, i64 1023
@@ -13467,6 +13489,7 @@ define void @global_system_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine
; GFX7-NEXT: s_mov_b32 s5, s6
; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
; GFX7-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX7-NEXT: v_lshlrev_b32_e32 v4, 3, v4
; GFX7-NEXT: v_lshl_b32_e32 v5, 0xffff, v4
; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
@@ -13510,6 +13533,7 @@ define void @global_system_atomic_fmin_noret_bf16__offset12b_pos__amdgpu_no_fine
; GFX6-NEXT: s_mov_b32 s5, s6
; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
; GFX6-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX6-NEXT: v_lshlrev_b32_e32 v4, 3, v4
; GFX6-NEXT: v_lshl_b32_e32 v5, 0xffff, v4
; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2
diff --git a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fsub.ll b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fsub.ll
index 965c10b2e9ff9..f4b7280062bb8 100644
--- a/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fsub.ll
+++ b/llvm/test/CodeGen/AMDGPU/global-atomicrmw-fsub.ll
@@ -5513,10 +5513,11 @@ define half @global_agent_atomic_fsub_ret_f16(ptr addrspace(1) %ptr, half %val)
; GFX7-NEXT: s_mov_b32 s4, s6
; GFX7-NEXT: s_mov_b32 s5, s6
; GFX7-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64
-; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2
; GFX7-NEXT: v_and_b32_e32 v3, 3, v3
; GFX7-NEXT: v_lshlrev_b32_e32 v5, 3, v3
; GFX7-NEXT: s_mov_b64 s[8:9], 0
+; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2
; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v2
; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v5
; GFX7-NEXT: v_not_b32_e32 v7, v2
@@ -5543,7 +5544,6 @@ define half @global_agent_atomic_fsub_ret_f16(ptr addrspace(1) %ptr, half %val)
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[8:9]
; GFX7-NEXT: v_lshrrev_b32_e32 v0, v5, v2
-; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX6-LABEL: global_agent_atomic_fsub_ret_f16:
@@ -5556,10 +5556,11 @@ define half @global_agent_atomic_fsub_ret_f16(ptr addrspace(1) %ptr, half %val)
; GFX6-NEXT: s_mov_b32 s4, s6
; GFX6-NEXT: s_mov_b32 s5, s6
; GFX6-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64
-; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2
; GFX6-NEXT: v_and_b32_e32 v3, 3, v3
; GFX6-NEXT: v_lshlrev_b32_e32 v5, 3, v3
; GFX6-NEXT: s_mov_b64 s[8:9], 0
+; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2
; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v2
; GFX6-NEXT: v_lshl_b32_e32 v2, 0xffff, v5
; GFX6-NEXT: v_not_b32_e32 v7, v2
@@ -5586,7 +5587,6 @@ define half @global_agent_atomic_fsub_ret_f16(ptr addrspace(1) %ptr, half %val)
; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX6-NEXT: s_or_b64 exec, exec, s[8:9]
; GFX6-NEXT: v_lshrrev_b32_e32 v0, v5, v2
-; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: s_setpc_b64 s[30:31]
%result = atomicrmw fsub ptr addrspace(1) %ptr, half %val syncscope("agent") seq_cst
@@ -5944,10 +5944,11 @@ define half @global_agent_atomic_fsub_ret_f16__offset12b_pos(ptr addrspace(1) %p
; GFX7-NEXT: s_mov_b32 s4, s6
; GFX7-NEXT: s_mov_b32 s5, s6
; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
-; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2
; GFX7-NEXT: v_and_b32_e32 v4, 3, v4
; GFX7-NEXT: v_lshlrev_b32_e32 v6, 3, v4
; GFX7-NEXT: s_mov_b64 s[8:9], 0
+; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2
; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v2
; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v6
; GFX7-NEXT: v_not_b32_e32 v8, v2
@@ -5974,7 +5975,6 @@ define half @global_agent_atomic_fsub_ret_f16__offset12b_pos(ptr addrspace(1) %p
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[8:9]
; GFX7-NEXT: v_lshrrev_b32_e32 v0, v6, v4
-; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX6-LABEL: global_agent_atomic_fsub_ret_f16__offset12b_pos:
@@ -5988,10 +5988,11 @@ define half @global_agent_atomic_fsub_ret_f16__offset12b_pos(ptr addrspace(1) %p
; GFX6-NEXT: s_mov_b32 s4, s6
; GFX6-NEXT: s_mov_b32 s5, s6
; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
-; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2
; GFX6-NEXT: v_and_b32_e32 v4, 3, v4
; GFX6-NEXT: v_lshlrev_b32_e32 v6, 3, v4
; GFX6-NEXT: s_mov_b64 s[8:9], 0
+; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2
; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v2
; GFX6-NEXT: v_lshl_b32_e32 v2, 0xffff, v6
; GFX6-NEXT: v_not_b32_e32 v8, v2
@@ -6019,7 +6020,6 @@ define half @global_agent_atomic_fsub_ret_f16__offset12b_pos(ptr addrspace(1) %p
; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX6-NEXT: s_or_b64 exec, exec, s[8:9]
; GFX6-NEXT: v_lshrrev_b32_e32 v0, v6, v4
-; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr half, ptr addrspace(1) %ptr, i64 1023
@@ -6379,10 +6379,11 @@ define half @global_agent_atomic_fsub_ret_f16__offset12b_neg(ptr addrspace(1) %p
; GFX7-NEXT: s_mov_b32 s4, s6
; GFX7-NEXT: s_mov_b32 s5, s6
; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
-; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2
; GFX7-NEXT: v_and_b32_e32 v4, 3, v4
; GFX7-NEXT: v_lshlrev_b32_e32 v6, 3, v4
; GFX7-NEXT: s_mov_b64 s[8:9], 0
+; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2
; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v2
; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v6
; GFX7-NEXT: v_not_b32_e32 v8, v2
@@ -6409,7 +6410,6 @@ define half @global_agent_atomic_fsub_ret_f16__offset12b_neg(ptr addrspace(1) %p
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[8:9]
; GFX7-NEXT: v_lshrrev_b32_e32 v0, v6, v4
-; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX6-LABEL: global_agent_atomic_fsub_ret_f16__offset12b_neg:
@@ -6423,10 +6423,11 @@ define half @global_agent_atomic_fsub_ret_f16__offset12b_neg(ptr addrspace(1) %p
; GFX6-NEXT: s_mov_b32 s4, s6
; GFX6-NEXT: s_mov_b32 s5, s6
; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
-; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2
; GFX6-NEXT: v_and_b32_e32 v4, 3, v4
; GFX6-NEXT: v_lshlrev_b32_e32 v6, 3, v4
; GFX6-NEXT: s_mov_b64 s[8:9], 0
+; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2
; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v2
; GFX6-NEXT: v_lshl_b32_e32 v2, 0xffff, v6
; GFX6-NEXT: v_not_b32_e32 v8, v2
@@ -6454,7 +6455,6 @@ define half @global_agent_atomic_fsub_ret_f16__offset12b_neg(ptr addrspace(1) %p
; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX6-NEXT: s_or_b64 exec, exec, s[8:9]
; GFX6-NEXT: v_lshrrev_b32_e32 v0, v6, v4
-; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr half, ptr addrspace(1) %ptr, i64 -1024
@@ -6791,13 +6791,14 @@ define void @global_agent_atomic_fsub_noret_f16(ptr addrspace(1) %ptr, half %val
; GFX7-NEXT: s_mov_b32 s4, s6
; GFX7-NEXT: s_mov_b32 s5, s6
; GFX7-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64
+; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2
+; GFX7-NEXT: s_mov_b64 s[8:9], 0
; GFX7-NEXT: v_cvt_f16_f32_e32 v5, v2
; GFX7-NEXT: v_and_b32_e32 v2, 3, v3
; GFX7-NEXT: v_lshlrev_b32_e32 v2, 3, v2
; GFX7-NEXT: v_lshl_b32_e32 v3, 0xffff, v2
; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v5
; GFX7-NEXT: v_not_b32_e32 v6, v3
-; GFX7-NEXT: s_mov_b64 s[8:9], 0
; GFX7-NEXT: .LBB25_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0)
@@ -6832,13 +6833,14 @@ define void @global_agent_atomic_fsub_noret_f16(ptr addrspace(1) %ptr, half %val
; GFX6-NEXT: s_mov_b32 s4, s6
; GFX6-NEXT: s_mov_b32 s5, s6
; GFX6-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64
+; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2
+; GFX6-NEXT: s_mov_b64 s[8:9], 0
; GFX6-NEXT: v_cvt_f16_f32_e32 v5, v2
; GFX6-NEXT: v_and_b32_e32 v2, 3, v3
; GFX6-NEXT: v_lshlrev_b32_e32 v2, 3, v2
; GFX6-NEXT: v_lshl_b32_e32 v3, 0xffff, v2
; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v5
; GFX6-NEXT: v_not_b32_e32 v6, v3
-; GFX6-NEXT: s_mov_b64 s[8:9], 0
; GFX6-NEXT: .LBB25_1: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX6-NEXT: s_waitcnt vmcnt(0)
@@ -7208,10 +7210,11 @@ define void @global_agent_atomic_fsub_noret_f16__offset12b_pos(ptr addrspace(1)
; GFX7-NEXT: s_mov_b32 s4, s6
; GFX7-NEXT: s_mov_b32 s5, s6
; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
-; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2
; GFX7-NEXT: v_and_b32_e32 v4, 3, v4
; GFX7-NEXT: v_lshlrev_b32_e32 v4, 3, v4
; GFX7-NEXT: s_mov_b64 s[8:9], 0
+; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2
; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v2
; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v4
; GFX7-NEXT: v_not_b32_e32 v6, v2
@@ -7250,10 +7253,11 @@ define void @global_agent_atomic_fsub_noret_f16__offset12b_pos(ptr addrspace(1)
; GFX6-NEXT: s_mov_b32 s4, s6
; GFX6-NEXT: s_mov_b32 s5, s6
; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
-; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2
; GFX6-NEXT: v_and_b32_e32 v4, 3, v4
; GFX6-NEXT: v_lshlrev_b32_e32 v4, 3, v4
; GFX6-NEXT: s_mov_b64 s[8:9], 0
+; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2
; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v2
; GFX6-NEXT: v_lshl_b32_e32 v2, 0xffff, v4
; GFX6-NEXT: v_not_b32_e32 v6, v2
@@ -7628,10 +7632,11 @@ define void @global_agent_atomic_fsub_noret_f16__offset12b_neg(ptr addrspace(1)
; GFX7-NEXT: s_mov_b32 s4, s6
; GFX7-NEXT: s_mov_b32 s5, s6
; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
-; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2
; GFX7-NEXT: v_and_b32_e32 v4, 3, v4
; GFX7-NEXT: v_lshlrev_b32_e32 v4, 3, v4
; GFX7-NEXT: s_mov_b64 s[8:9], 0
+; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2
; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v2
; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v4
; GFX7-NEXT: v_not_b32_e32 v6, v2
@@ -7670,10 +7675,11 @@ define void @global_agent_atomic_fsub_noret_f16__offset12b_neg(ptr addrspace(1)
; GFX6-NEXT: s_mov_b32 s4, s6
; GFX6-NEXT: s_mov_b32 s5, s6
; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
-; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2
; GFX6-NEXT: v_and_b32_e32 v4, 3, v4
; GFX6-NEXT: v_lshlrev_b32_e32 v4, 3, v4
; GFX6-NEXT: s_mov_b64 s[8:9], 0
+; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2
; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v2
; GFX6-NEXT: v_lshl_b32_e32 v2, 0xffff, v4
; GFX6-NEXT: v_not_b32_e32 v6, v2
@@ -7962,30 +7968,31 @@ define half @global_agent_atomic_fsub_ret_f16__offset12b_pos__align4(ptr addrspa
; GFX7-NEXT: s_mov_b32 s4, s6
; GFX7-NEXT: s_mov_b32 s5, s6
; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2046
-; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2
; GFX7-NEXT: s_mov_b64 s[8:9], 0
-; GFX7-NEXT: v_cvt_f32_f16_e32 v6, v2
+; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2
; GFX7-NEXT: .LBB28_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v3
-; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v3
-; GFX7-NEXT: v_sub_f32_e32 v2, v2, v6
-; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2
-; GFX7-NEXT: v_or_b32_e32 v2, v4, v2
; GFX7-NEXT: v_mov_b32_e32 v5, v3
-; GFX7-NEXT: v_mov_b32_e32 v4, v2
-; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 offset:2046 glc
+; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v5
+; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v5
+; GFX7-NEXT: v_sub_f32_e32 v3, v3, v2
+; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GFX7-NEXT: v_or_b32_e32 v4, v4, v3
+; GFX7-NEXT: v_mov_b32_e32 v3, v4
+; GFX7-NEXT: v_mov_b32_e32 v4, v5
+; GFX7-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:2046 glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX7-NEXT: v_mov_b32_e32 v3, v4
; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GFX7-NEXT: s_cbranch_execnz .LBB28_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[8:9]
-; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v4
+; GFX7-NEXT: v_mov_b32_e32 v0, v3
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX6-LABEL: global_agent_atomic_fsub_ret_f16__offset12b_pos__align4:
@@ -7996,31 +8003,32 @@ define half @global_agent_atomic_fsub_ret_f16__offset12b_pos__align4(ptr addrspa
; GFX6-NEXT: s_mov_b32 s4, s6
; GFX6-NEXT: s_mov_b32 s5, s6
; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2046
-; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2
; GFX6-NEXT: s_mov_b64 s[8:9], 0
-; GFX6-NEXT: v_cvt_f32_f16_e32 v6, v2
+; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2
; GFX6-NEXT: .LBB28_1: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v3
-; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v3
-; GFX6-NEXT: v_sub_f32_e32 v2, v2, v6
-; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2
-; GFX6-NEXT: v_or_b32_e32 v2, v4, v2
; GFX6-NEXT: v_mov_b32_e32 v5, v3
-; GFX6-NEXT: v_mov_b32_e32 v4, v2
-; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 offset:2046 glc
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v5
+; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v5
+; GFX6-NEXT: v_sub_f32_e32 v3, v3, v2
+; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3
+; GFX6-NEXT: v_or_b32_e32 v4, v4, v3
+; GFX6-NEXT: v_mov_b32_e32 v3, v4
+; GFX6-NEXT: v_mov_b32_e32 v4, v5
+; GFX6-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:2046 glc
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
-; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
+; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX6-NEXT: v_mov_b32_e32 v3, v4
; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GFX6-NEXT: s_cbranch_execnz .LBB28_1
; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX6-NEXT: s_or_b64 exec, exec, s[8:9]
-; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v4
+; GFX6-NEXT: v_mov_b32_e32 v0, v3
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr half, ptr addrspace(1) %ptr, i64 1023
@@ -8273,8 +8281,9 @@ define void @global_agent_atomic_fsub_noret_f16__offset12b__align4_pos(ptr addrs
; GFX7-NEXT: s_mov_b32 s4, s6
; GFX7-NEXT: s_mov_b32 s5, s6
; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2046
-; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2
; GFX7-NEXT: s_mov_b64 s[8:9], 0
+; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2
; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v2
; GFX7-NEXT: .LBB29_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -8306,8 +8315,9 @@ define void @global_agent_atomic_fsub_noret_f16__offset12b__align4_pos(ptr addrs
; GFX6-NEXT: s_mov_b32 s4, s6
; GFX6-NEXT: s_mov_b32 s5, s6
; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2046
-; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2
; GFX6-NEXT: s_mov_b64 s[8:9], 0
+; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2
; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v2
; GFX6-NEXT: .LBB29_1: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
@@ -8692,10 +8702,11 @@ define half @global_system_atomic_fsub_ret_f16__offset12b_pos(ptr addrspace(1) %
; GFX7-NEXT: s_mov_b32 s4, s6
; GFX7-NEXT: s_mov_b32 s5, s6
; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
-; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2
; GFX7-NEXT: v_and_b32_e32 v4, 3, v4
; GFX7-NEXT: v_lshlrev_b32_e32 v6, 3, v4
; GFX7-NEXT: s_mov_b64 s[8:9], 0
+; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2
; GFX7-NEXT: v_cvt_f32_f16_e32 v7, v2
; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v6
; GFX7-NEXT: v_not_b32_e32 v8, v2
@@ -8722,7 +8733,6 @@ define half @global_system_atomic_fsub_ret_f16__offset12b_pos(ptr addrspace(1) %
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[8:9]
; GFX7-NEXT: v_lshrrev_b32_e32 v0, v6, v4
-; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX6-LABEL: global_system_atomic_fsub_ret_f16__offset12b_pos:
@@ -8736,10 +8746,11 @@ define half @global_system_atomic_fsub_ret_f16__offset12b_pos(ptr addrspace(1) %
; GFX6-NEXT: s_mov_b32 s4, s6
; GFX6-NEXT: s_mov_b32 s5, s6
; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
-; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2
; GFX6-NEXT: v_and_b32_e32 v4, 3, v4
; GFX6-NEXT: v_lshlrev_b32_e32 v6, 3, v4
; GFX6-NEXT: s_mov_b64 s[8:9], 0
+; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2
; GFX6-NEXT: v_cvt_f32_f16_e32 v7, v2
; GFX6-NEXT: v_lshl_b32_e32 v2, 0xffff, v6
; GFX6-NEXT: v_not_b32_e32 v8, v2
@@ -8767,7 +8778,6 @@ define half @global_system_atomic_fsub_ret_f16__offset12b_pos(ptr addrspace(1) %
; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX6-NEXT: s_or_b64 exec, exec, s[8:9]
; GFX6-NEXT: v_lshrrev_b32_e32 v0, v6, v4
-; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr half, ptr addrspace(1) %ptr, i64 1023
@@ -9119,10 +9129,11 @@ define void @global_system_atomic_fsub_noret_f16__offset12b_pos(ptr addrspace(1)
; GFX7-NEXT: s_mov_b32 s4, s6
; GFX7-NEXT: s_mov_b32 s5, s6
; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
-; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GFX7-NEXT: v_cvt_f32_f16_e32 v2, v2
; GFX7-NEXT: v_and_b32_e32 v4, 3, v4
; GFX7-NEXT: v_lshlrev_b32_e32 v4, 3, v4
; GFX7-NEXT: s_mov_b64 s[8:9], 0
+; GFX7-NEXT: v_cvt_f16_f32_e32 v2, v2
; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v2
; GFX7-NEXT: v_lshl_b32_e32 v2, 0xffff, v4
; GFX7-NEXT: v_not_b32_e32 v6, v2
@@ -9161,10 +9172,11 @@ define void @global_system_atomic_fsub_noret_f16__offset12b_pos(ptr addrspace(1)
; GFX6-NEXT: s_mov_b32 s4, s6
; GFX6-NEXT: s_mov_b32 s5, s6
; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
-; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2
; GFX6-NEXT: v_and_b32_e32 v4, 3, v4
; GFX6-NEXT: v_lshlrev_b32_e32 v4, 3, v4
; GFX6-NEXT: s_mov_b64 s[8:9], 0
+; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2
; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v2
; GFX6-NEXT: v_lshl_b32_e32 v2, 0xffff, v4
; GFX6-NEXT: v_not_b32_e32 v6, v2
@@ -9623,6 +9635,7 @@ define bfloat @global_agent_atomic_fsub_ret_bf16(ptr addrspace(1) %ptr, bfloat %
; GFX7-NEXT: s_mov_b32 s5, s6
; GFX7-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64
; GFX7-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX7-NEXT: v_lshlrev_b32_e32 v5, 3, v3
; GFX7-NEXT: v_lshl_b32_e32 v3, 0xffff, v5
; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
@@ -9652,7 +9665,6 @@ define bfloat @global_agent_atomic_fsub_ret_bf16(ptr addrspace(1) %ptr, bfloat %
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[8:9]
; GFX7-NEXT: v_lshrrev_b32_e32 v0, v5, v2
-; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX6-LABEL: global_agent_atomic_fsub_ret_bf16:
@@ -9666,6 +9678,7 @@ define bfloat @global_agent_atomic_fsub_ret_bf16(ptr addrspace(1) %ptr, bfloat %
; GFX6-NEXT: s_mov_b32 s5, s6
; GFX6-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64
; GFX6-NEXT: v_and_b32_e32 v3, 3, v3
+; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX6-NEXT: v_lshlrev_b32_e32 v5, 3, v3
; GFX6-NEXT: v_lshl_b32_e32 v3, 0xffff, v5
; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2
@@ -9695,7 +9708,6 @@ define bfloat @global_agent_atomic_fsub_ret_bf16(ptr addrspace(1) %ptr, bfloat %
; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX6-NEXT: s_or_b64 exec, exec, s[8:9]
; GFX6-NEXT: v_lshrrev_b32_e32 v0, v5, v2
-; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: s_setpc_b64 s[30:31]
%result = atomicrmw fsub ptr addrspace(1) %ptr, bfloat %val syncscope("agent") seq_cst
@@ -10137,6 +10149,7 @@ define bfloat @global_agent_atomic_fsub_ret_bf16__offset12b_pos(ptr addrspace(1)
; GFX7-NEXT: s_mov_b32 s5, s6
; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
; GFX7-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX7-NEXT: v_lshlrev_b32_e32 v6, 3, v4
; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v6
; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
@@ -10166,7 +10179,6 @@ define bfloat @global_agent_atomic_fsub_ret_bf16__offset12b_pos(ptr addrspace(1)
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[8:9]
; GFX7-NEXT: v_lshrrev_b32_e32 v0, v6, v4
-; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX6-LABEL: global_agent_atomic_fsub_ret_bf16__offset12b_pos:
@@ -10181,6 +10193,7 @@ define bfloat @global_agent_atomic_fsub_ret_bf16__offset12b_pos(ptr addrspace(1)
; GFX6-NEXT: s_mov_b32 s5, s6
; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
; GFX6-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX6-NEXT: v_lshlrev_b32_e32 v6, 3, v4
; GFX6-NEXT: v_lshl_b32_e32 v4, 0xffff, v6
; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2
@@ -10211,7 +10224,6 @@ define bfloat @global_agent_atomic_fsub_ret_bf16__offset12b_pos(ptr addrspace(1)
; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX6-NEXT: s_or_b64 exec, exec, s[8:9]
; GFX6-NEXT: v_lshrrev_b32_e32 v0, v6, v4
-; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr bfloat, ptr addrspace(1) %ptr, i64 1023
@@ -10655,6 +10667,7 @@ define bfloat @global_agent_atomic_fsub_ret_bf16__offset12b_neg(ptr addrspace(1)
; GFX7-NEXT: s_mov_b32 s5, s6
; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
; GFX7-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX7-NEXT: v_lshlrev_b32_e32 v6, 3, v4
; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v6
; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
@@ -10684,7 +10697,6 @@ define bfloat @global_agent_atomic_fsub_ret_bf16__offset12b_neg(ptr addrspace(1)
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[8:9]
; GFX7-NEXT: v_lshrrev_b32_e32 v0, v6, v4
-; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX6-LABEL: global_agent_atomic_fsub_ret_bf16__offset12b_neg:
@@ -10699,6 +10711,7 @@ define bfloat @global_agent_atomic_fsub_ret_bf16__offset12b_neg(ptr addrspace(1)
; GFX6-NEXT: s_mov_b32 s5, s6
; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
; GFX6-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX6-NEXT: v_lshlrev_b32_e32 v6, 3, v4
; GFX6-NEXT: v_lshl_b32_e32 v4, 0xffff, v6
; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2
@@ -10729,7 +10742,6 @@ define bfloat @global_agent_atomic_fsub_ret_bf16__offset12b_neg(ptr addrspace(1)
; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX6-NEXT: s_or_b64 exec, exec, s[8:9]
; GFX6-NEXT: v_lshrrev_b32_e32 v0, v6, v4
-; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr bfloat, ptr addrspace(1) %ptr, i64 -1024
@@ -11144,22 +11156,23 @@ define void @global_agent_atomic_fsub_noret_bf16(ptr addrspace(1) %ptr, bfloat %
; GFX7-NEXT: s_mov_b32 s4, s6
; GFX7-NEXT: s_mov_b32 s5, s6
; GFX7-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64
-; GFX7-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX7-NEXT: v_lshlrev_b32_e32 v5, 3, v3
-; GFX7-NEXT: v_lshl_b32_e32 v3, 0xffff, v5
-; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GFX7-NEXT: v_not_b32_e32 v6, v3
+; GFX7-NEXT: v_lshlrev_b32_e32 v6, 16, v2
+; GFX7-NEXT: v_and_b32_e32 v2, 3, v3
+; GFX7-NEXT: v_lshlrev_b32_e32 v2, 3, v2
+; GFX7-NEXT: v_lshl_b32_e32 v3, 0xffff, v2
+; GFX7-NEXT: v_not_b32_e32 v5, v3
+; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v6
; GFX7-NEXT: s_mov_b64 s[8:9], 0
-; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
; GFX7-NEXT: .LBB35_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_lshrrev_b32_e32 v3, v5, v4
+; GFX7-NEXT: v_lshrrev_b32_e32 v3, v2, v4
; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX7-NEXT: v_sub_f32_e32 v3, v3, v2
+; GFX7-NEXT: v_sub_f32_e32 v3, v3, v6
; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3
-; GFX7-NEXT: v_and_b32_e32 v7, v4, v6
-; GFX7-NEXT: v_lshlrev_b32_e32 v3, v5, v3
+; GFX7-NEXT: v_and_b32_e32 v7, v4, v5
+; GFX7-NEXT: v_lshlrev_b32_e32 v3, v2, v3
; GFX7-NEXT: v_or_b32_e32 v3, v7, v3
; GFX7-NEXT: v_mov_b32_e32 v8, v4
; GFX7-NEXT: v_mov_b32_e32 v7, v3
@@ -11185,23 +11198,24 @@ define void @global_agent_atomic_fsub_noret_bf16(ptr addrspace(1) %ptr, bfloat %
; GFX6-NEXT: s_mov_b32 s4, s6
; GFX6-NEXT: s_mov_b32 s5, s6
; GFX6-NEXT: buffer_load_dword v4, v[0:1], s[4:7], 0 addr64
-; GFX6-NEXT: v_and_b32_e32 v3, 3, v3
-; GFX6-NEXT: v_lshlrev_b32_e32 v5, 3, v3
-; GFX6-NEXT: v_lshl_b32_e32 v3, 0xffff, v5
-; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GFX6-NEXT: v_not_b32_e32 v6, v3
+; GFX6-NEXT: v_lshlrev_b32_e32 v6, 16, v2
+; GFX6-NEXT: v_and_b32_e32 v2, 3, v3
+; GFX6-NEXT: v_lshlrev_b32_e32 v2, 3, v2
+; GFX6-NEXT: v_lshl_b32_e32 v3, 0xffff, v2
+; GFX6-NEXT: v_not_b32_e32 v5, v3
+; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v6
; GFX6-NEXT: s_mov_b64 s[8:9], 0
-; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v3
; GFX6-NEXT: .LBB35_1: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_lshrrev_b32_e32 v3, v5, v4
+; GFX6-NEXT: v_lshrrev_b32_e32 v3, v2, v4
; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3
-; GFX6-NEXT: v_sub_f32_e32 v3, v3, v2
+; GFX6-NEXT: v_sub_f32_e32 v3, v3, v6
; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3
; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_and_b32_e32 v7, v4, v6
-; GFX6-NEXT: v_lshlrev_b32_e32 v3, v5, v3
+; GFX6-NEXT: v_and_b32_e32 v7, v4, v5
+; GFX6-NEXT: v_lshlrev_b32_e32 v3, v2, v3
; GFX6-NEXT: v_or_b32_e32 v3, v7, v3
; GFX6-NEXT: v_mov_b32_e32 v8, v4
; GFX6-NEXT: v_mov_b32_e32 v7, v3
@@ -11643,6 +11657,7 @@ define void @global_agent_atomic_fsub_noret_bf16__offset12b_pos(ptr addrspace(1)
; GFX7-NEXT: s_mov_b32 s5, s6
; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
; GFX7-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX7-NEXT: v_lshlrev_b32_e32 v4, 3, v4
; GFX7-NEXT: v_lshl_b32_e32 v5, 0xffff, v4
; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
@@ -11685,6 +11700,7 @@ define void @global_agent_atomic_fsub_noret_bf16__offset12b_pos(ptr addrspace(1)
; GFX6-NEXT: s_mov_b32 s5, s6
; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
; GFX6-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX6-NEXT: v_lshlrev_b32_e32 v4, 3, v4
; GFX6-NEXT: v_lshl_b32_e32 v5, 0xffff, v4
; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2
@@ -12144,6 +12160,7 @@ define void @global_agent_atomic_fsub_noret_bf16__offset12b_neg(ptr addrspace(1)
; GFX7-NEXT: s_mov_b32 s5, s6
; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
; GFX7-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX7-NEXT: v_lshlrev_b32_e32 v4, 3, v4
; GFX7-NEXT: v_lshl_b32_e32 v5, 0xffff, v4
; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
@@ -12186,6 +12203,7 @@ define void @global_agent_atomic_fsub_noret_bf16__offset12b_neg(ptr addrspace(1)
; GFX6-NEXT: s_mov_b32 s5, s6
; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
; GFX6-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX6-NEXT: v_lshlrev_b32_e32 v4, 3, v4
; GFX6-NEXT: v_lshl_b32_e32 v5, 0xffff, v4
; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2
@@ -12566,30 +12584,31 @@ define bfloat @global_agent_atomic_fsub_ret_bf16__offset12b_pos__align4(ptr addr
; GFX7-NEXT: s_mov_b32 s4, s6
; GFX7-NEXT: s_mov_b32 s5, s6
; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2046
+; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GFX7-NEXT: s_mov_b64 s[8:9], 0
-; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
+; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX7-NEXT: .LBB38_1: ; %atomicrmw.start
; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v3
-; GFX7-NEXT: v_sub_f32_e32 v2, v2, v6
-; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v3
-; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX7-NEXT: v_or_b32_e32 v2, v4, v2
; GFX7-NEXT: v_mov_b32_e32 v5, v3
-; GFX7-NEXT: v_mov_b32_e32 v4, v2
-; GFX7-NEXT: buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 offset:2046 glc
+; GFX7-NEXT: v_lshlrev_b32_e32 v3, 16, v5
+; GFX7-NEXT: v_sub_f32_e32 v3, v3, v2
+; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v5
+; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX7-NEXT: v_or_b32_e32 v4, v4, v3
+; GFX7-NEXT: v_mov_b32_e32 v3, v4
+; GFX7-NEXT: v_mov_b32_e32 v4, v5
+; GFX7-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:2046 glc
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: buffer_wbinvl1
-; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
+; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
; GFX7-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX7-NEXT: v_mov_b32_e32 v3, v4
; GFX7-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GFX7-NEXT: s_cbranch_execnz .LBB38_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[8:9]
-; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v4
+; GFX7-NEXT: v_mov_b32_e32 v0, v3
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX6-LABEL: global_agent_atomic_fsub_ret_bf16__offset12b_pos__align4:
@@ -12600,31 +12619,32 @@ define bfloat @global_agent_atomic_fsub_ret_bf16__offset12b_pos__align4(ptr addr
; GFX6-NEXT: s_mov_b32 s4, s6
; GFX6-NEXT: s_mov_b32 s5, s6
; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2046
+; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GFX6-NEXT: s_mov_b64 s[8:9], 0
-; GFX6-NEXT: v_and_b32_e32 v6, 0xffff0000, v2
+; GFX6-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX6-NEXT: .LBB38_1: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v3
-; GFX6-NEXT: v_sub_f32_e32 v2, v2, v6
-; GFX6-NEXT: s_waitcnt expcnt(0)
-; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v3
-; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX6-NEXT: v_or_b32_e32 v2, v4, v2
; GFX6-NEXT: v_mov_b32_e32 v5, v3
-; GFX6-NEXT: v_mov_b32_e32 v4, v2
-; GFX6-NEXT: buffer_atomic_cmpswap v[4:5], v[0:1], s[4:7], 0 addr64 offset:2046 glc
+; GFX6-NEXT: s_waitcnt expcnt(0)
+; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v5
+; GFX6-NEXT: v_sub_f32_e32 v3, v3, v2
+; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v5
+; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v3
+; GFX6-NEXT: v_or_b32_e32 v4, v4, v3
+; GFX6-NEXT: v_mov_b32_e32 v3, v4
+; GFX6-NEXT: v_mov_b32_e32 v4, v5
+; GFX6-NEXT: buffer_atomic_cmpswap v[3:4], v[0:1], s[4:7], 0 addr64 offset:2046 glc
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: buffer_wbinvl1
-; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
+; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
; GFX6-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
-; GFX6-NEXT: v_mov_b32_e32 v3, v4
; GFX6-NEXT: s_andn2_b64 exec, exec, s[8:9]
; GFX6-NEXT: s_cbranch_execnz .LBB38_1
; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX6-NEXT: s_or_b64 exec, exec, s[8:9]
-; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v4
+; GFX6-NEXT: v_mov_b32_e32 v0, v3
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr bfloat, ptr addrspace(1) %ptr, i64 1023
@@ -12964,6 +12984,7 @@ define void @global_agent_atomic_fsub_noret_bf16__offset12b__align4_pos(ptr addr
; GFX7-NEXT: s_mov_b32 s4, s6
; GFX7-NEXT: s_mov_b32 s5, s6
; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2046
+; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GFX7-NEXT: s_mov_b64 s[8:9], 0
; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v2
@@ -12997,6 +13018,7 @@ define void @global_agent_atomic_fsub_noret_bf16__offset12b__align4_pos(ptr addr
; GFX6-NEXT: s_mov_b32 s4, s6
; GFX6-NEXT: s_mov_b32 s5, s6
; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 offset:2046
+; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GFX6-NEXT: s_mov_b64 s[8:9], 0
; GFX6-NEXT: v_and_b32_e32 v4, 0xffff0000, v2
@@ -13467,6 +13489,7 @@ define bfloat @global_system_atomic_fsub_ret_bf16__offset12b_pos(ptr addrspace(1
; GFX7-NEXT: s_mov_b32 s5, s6
; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
; GFX7-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX7-NEXT: v_lshlrev_b32_e32 v6, 3, v4
; GFX7-NEXT: v_lshl_b32_e32 v4, 0xffff, v6
; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
@@ -13496,7 +13519,6 @@ define bfloat @global_system_atomic_fsub_ret_bf16__offset12b_pos(ptr addrspace(1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[8:9]
; GFX7-NEXT: v_lshrrev_b32_e32 v0, v6, v4
-; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX6-LABEL: global_system_atomic_fsub_ret_bf16__offset12b_pos:
@@ -13511,6 +13533,7 @@ define bfloat @global_system_atomic_fsub_ret_bf16__offset12b_pos(ptr addrspace(1
; GFX6-NEXT: s_mov_b32 s5, s6
; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
; GFX6-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX6-NEXT: v_lshlrev_b32_e32 v6, 3, v4
; GFX6-NEXT: v_lshl_b32_e32 v4, 0xffff, v6
; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2
@@ -13541,7 +13564,6 @@ define bfloat @global_system_atomic_fsub_ret_bf16__offset12b_pos(ptr addrspace(1
; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX6-NEXT: s_or_b64 exec, exec, s[8:9]
; GFX6-NEXT: v_lshrrev_b32_e32 v0, v6, v4
-; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX6-NEXT: s_waitcnt expcnt(0)
; GFX6-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr bfloat, ptr addrspace(1) %ptr, i64 1023
@@ -13975,6 +13997,7 @@ define void @global_system_atomic_fsub_noret_bf16__offset12b_pos(ptr addrspace(1
; GFX7-NEXT: s_mov_b32 s5, s6
; GFX7-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
; GFX7-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX7-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX7-NEXT: v_lshlrev_b32_e32 v4, 3, v4
; GFX7-NEXT: v_lshl_b32_e32 v5, 0xffff, v4
; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
@@ -14017,6 +14040,7 @@ define void @global_system_atomic_fsub_noret_bf16__offset12b_pos(ptr addrspace(1
; GFX6-NEXT: s_mov_b32 s5, s6
; GFX6-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64
; GFX6-NEXT: v_and_b32_e32 v4, 3, v4
+; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; GFX6-NEXT: v_lshlrev_b32_e32 v4, 3, v4
; GFX6-NEXT: v_lshl_b32_e32 v5, 0xffff, v4
; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2
diff --git a/llvm/test/CodeGen/AMDGPU/i1-to-bf16.ll b/llvm/test/CodeGen/AMDGPU/i1-to-bf16.ll
index 53cfd12a953d3..fb5674310442d 100644
--- a/llvm/test/CodeGen/AMDGPU/i1-to-bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU/i1-to-bf16.ll
@@ -13,7 +13,7 @@ define bfloat @v_uitofp_i1_to_bf16(i1 %num) {
; GFX7-NEXT: v_and_b32_e32 v0, 1, v0
; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
; GFX7-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, vcc
-; GFX7-NEXT: v_and_b32_e32 v0, 0x7fff0000, v0
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_uitofp_i1_to_bf16:
@@ -1337,7 +1337,7 @@ define bfloat @v_sitofp_i1_to_bf16(i1 %num) {
; GFX7-NEXT: v_and_b32_e32 v0, 1, v0
; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
; GFX7-NEXT: v_cndmask_b32_e64 v0, 0, -1.0, vcc
-; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_sitofp_i1_to_bf16:
diff --git a/llvm/test/CodeGen/AMDGPU/integer-canonicalizing-src-modifiers.ll b/llvm/test/CodeGen/AMDGPU/integer-canonicalizing-src-modifiers.ll
index 7b356d26d608a..423ee839b06ba 100644
--- a/llvm/test/CodeGen/AMDGPU/integer-canonicalizing-src-modifiers.ll
+++ b/llvm/test/CodeGen/AMDGPU/integer-canonicalizing-src-modifiers.ll
@@ -94,7 +94,6 @@ define half @v_uitofp_i16_to_f16_abs(i16 %arg0) nounwind {
; GFX7-NEXT: v_and_b32_e32 v0, 0x7fff, v0
; GFX7-NEXT: v_cvt_f32_u32_e32 v0, v0
; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_uitofp_i16_to_f16_abs:
@@ -131,7 +130,6 @@ define half @v_uitofp_i16_to_f16_neg(i16 %arg0) nounwind {
; GFX7-NEXT: v_and_b32_e32 v0, 0x8000, v0
; GFX7-NEXT: v_cvt_f32_u32_e32 v0, v0
; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_uitofp_i16_to_f16_neg:
@@ -168,7 +166,6 @@ define half @s_uitofp_i16_to_f16_abs(i16 inreg %arg0) nounwind {
; GFX7-NEXT: s_and_b32 s4, s16, 0x7fff
; GFX7-NEXT: v_cvt_f32_u32_e32 v0, s4
; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: s_uitofp_i16_to_f16_abs:
@@ -205,7 +202,6 @@ define half @s_uitofp_i16_to_f16_neg(i16 inreg %arg0) nounwind {
; GFX7-NEXT: s_and_b32 s4, s16, 0x8000
; GFX7-NEXT: v_cvt_f32_u32_e32 v0, s4
; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: s_uitofp_i16_to_f16_neg:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.load.bf16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.load.bf16.ll
index 6e887f54de861..f2bb5a4aadee5 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.load.bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.load.bf16.ll
@@ -12,7 +12,6 @@ define bfloat @raw_ptr_buffer_load_bf16(ptr addrspace(8) inreg %rsrc) {
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: buffer_load_ushort v0, off, s[16:19], 0
; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: raw_ptr_buffer_load_bf16:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.store.bf16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.store.bf16.ll
index 395de3d4e2379..60c3d8d8734f6 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.store.bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.store.bf16.ll
@@ -9,8 +9,6 @@
define amdgpu_ps void @buffer_store_bf16(ptr addrspace(8) inreg %rsrc, bfloat %data, i32 %offset) {
; GFX7-LABEL: buffer_store_bf16:
; GFX7: ; %bb.0:
-; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX7-NEXT: buffer_store_short v0, v1, s[0:3], 0 offen
; GFX7-NEXT: s_endpgm
;
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.exp.ll b/llvm/test/CodeGen/AMDGPU/llvm.exp.ll
index 33b644181af52..ee01c9d0acdc7 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.exp.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.exp.ll
@@ -5527,16 +5527,17 @@ define float @v_exp_f32_from_fpext_bf16(bfloat %src) {
; SI-LABEL: v_exp_f32_from_fpext_bf16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-NEXT: v_mul_f32_e32 v1, 0x3fb8aa3b, v0
+; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; SI-NEXT: s_mov_b32 s4, 0x3fb8aa3b
-; SI-NEXT: v_rndne_f32_e32 v2, v1
-; SI-NEXT: v_sub_f32_e32 v3, v1, v2
-; SI-NEXT: v_fma_f32 v1, v0, s4, -v1
+; SI-NEXT: v_mul_f32_e32 v1, 0x3fb8aa3b, v0
+; SI-NEXT: v_fma_f32 v2, v0, s4, -v1
; SI-NEXT: s_mov_b32 s4, 0x32a5705f
-; SI-NEXT: v_fma_f32 v1, v0, s4, v1
-; SI-NEXT: v_add_f32_e32 v1, v3, v1
+; SI-NEXT: v_rndne_f32_e32 v3, v1
+; SI-NEXT: v_fma_f32 v2, v0, s4, v2
+; SI-NEXT: v_sub_f32_e32 v1, v1, v3
+; SI-NEXT: v_add_f32_e32 v1, v1, v2
; SI-NEXT: v_exp_f32_e32 v1, v1
-; SI-NEXT: v_cvt_i32_f32_e32 v2, v2
+; SI-NEXT: v_cvt_i32_f32_e32 v2, v3
; SI-NEXT: s_mov_b32 s4, 0xc2ce8ed0
; SI-NEXT: v_cmp_ngt_f32_e32 vcc, s4, v0
; SI-NEXT: s_mov_b32 s4, 0x42b17218
@@ -5805,25 +5806,14 @@ define half @v_exp_f16(half %in) {
; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0
; GCN-NEXT: s_setpc_b64 s[30:31]
;
-; SI-SDAG-LABEL: v_exp_f16:
-; SI-SDAG: ; %bb.0:
-; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0
-; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0
-; SI-SDAG-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v0
-; SI-SDAG-NEXT: v_exp_f32_e32 v0, v0
-; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0
-; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0
-; SI-SDAG-NEXT: s_setpc_b64 s[30:31]
-;
-; SI-GISEL-LABEL: v_exp_f16:
-; SI-GISEL: ; %bb.0:
-; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0
-; SI-GISEL-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v0
-; SI-GISEL-NEXT: v_exp_f32_e32 v0, v0
-; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0
-; SI-GISEL-NEXT: s_setpc_b64 s[30:31]
+; SI-LABEL: v_exp_f16:
+; SI: ; %bb.0:
+; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
+; SI-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v0
+; SI-NEXT: v_exp_f32_e32 v0, v0
+; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
+; SI-NEXT: s_setpc_b64 s[30:31]
;
; R600-LABEL: v_exp_f16:
; R600: ; %bb.0:
@@ -5848,25 +5838,14 @@ define half @v_exp_fabs_f16(half %in) {
; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0
; GCN-NEXT: s_setpc_b64 s[30:31]
;
-; SI-SDAG-LABEL: v_exp_fabs_f16:
-; SI-SDAG: ; %bb.0:
-; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0
-; SI-SDAG-NEXT: v_cvt_f32_f16_e64 v0, |v0|
-; SI-SDAG-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v0
-; SI-SDAG-NEXT: v_exp_f32_e32 v0, v0
-; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0
-; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0
-; SI-SDAG-NEXT: s_setpc_b64 s[30:31]
-;
-; SI-GISEL-LABEL: v_exp_fabs_f16:
-; SI-GISEL: ; %bb.0:
-; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-GISEL-NEXT: v_cvt_f32_f16_e64 v0, |v0|
-; SI-GISEL-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v0
-; SI-GISEL-NEXT: v_exp_f32_e32 v0, v0
-; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0
-; SI-GISEL-NEXT: s_setpc_b64 s[30:31]
+; SI-LABEL: v_exp_fabs_f16:
+; SI: ; %bb.0:
+; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT: v_cvt_f32_f16_e64 v0, |v0|
+; SI-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v0
+; SI-NEXT: v_exp_f32_e32 v0, v0
+; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
+; SI-NEXT: s_setpc_b64 s[30:31]
;
; R600-LABEL: v_exp_fabs_f16:
; R600: ; %bb.0:
@@ -5904,12 +5883,10 @@ define half @v_exp_fneg_fabs_f16(half %in) {
; SI-SDAG-LABEL: v_exp_fneg_fabs_f16:
; SI-SDAG: ; %bb.0:
; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0
; SI-SDAG-NEXT: v_cvt_f32_f16_e64 v0, |v0|
; SI-SDAG-NEXT: v_mul_f32_e32 v0, 0xbfb8aa3b, v0
; SI-SDAG-NEXT: v_exp_f32_e32 v0, v0
; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0
-; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0
; SI-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; SI-GISEL-LABEL: v_exp_fneg_fabs_f16:
@@ -5958,12 +5935,10 @@ define half @v_exp_fneg_f16(half %in) {
; SI-SDAG-LABEL: v_exp_fneg_f16:
; SI-SDAG: ; %bb.0:
; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-SDAG-NEXT: v_cvt_f16_f32_e64 v0, -v0
; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0
-; SI-SDAG-NEXT: v_mul_f32_e32 v0, 0x3fb8aa3b, v0
+; SI-SDAG-NEXT: v_mul_f32_e32 v0, 0xbfb8aa3b, v0
; SI-SDAG-NEXT: v_exp_f32_e32 v0, v0
; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0
-; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0
; SI-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; SI-GISEL-LABEL: v_exp_fneg_f16:
@@ -6000,10 +5975,10 @@ define half @v_exp_f16_fast(half %in) {
; SI-SDAG-LABEL: v_exp_f16_fast:
; SI-SDAG: ; %bb.0:
; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0
; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0
; SI-SDAG-NEXT: v_mul_f32_e32 v0, 0x3fb8a000, v0
; SI-SDAG-NEXT: v_exp_f32_e32 v0, v0
+; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0
; SI-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; SI-GISEL-LABEL: v_exp_f16_fast:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.exp10.ll b/llvm/test/CodeGen/AMDGPU/llvm.exp10.ll
index de1f2e900e326..7d830a9306293 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.exp10.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.exp10.ll
@@ -5589,16 +5589,17 @@ define float @v_exp10_f32_from_fpext_bf16(bfloat %src) {
; SI-LABEL: v_exp10_f32_from_fpext_bf16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-NEXT: v_mul_f32_e32 v1, 0x40549a78, v0
+; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; SI-NEXT: s_mov_b32 s4, 0x40549a78
-; SI-NEXT: v_rndne_f32_e32 v2, v1
-; SI-NEXT: v_sub_f32_e32 v3, v1, v2
-; SI-NEXT: v_fma_f32 v1, v0, s4, -v1
+; SI-NEXT: v_mul_f32_e32 v1, 0x40549a78, v0
+; SI-NEXT: v_fma_f32 v2, v0, s4, -v1
; SI-NEXT: s_mov_b32 s4, 0x33979a37
-; SI-NEXT: v_fma_f32 v1, v0, s4, v1
-; SI-NEXT: v_add_f32_e32 v1, v3, v1
+; SI-NEXT: v_rndne_f32_e32 v3, v1
+; SI-NEXT: v_fma_f32 v2, v0, s4, v2
+; SI-NEXT: v_sub_f32_e32 v1, v1, v3
+; SI-NEXT: v_add_f32_e32 v1, v1, v2
; SI-NEXT: v_exp_f32_e32 v1, v1
-; SI-NEXT: v_cvt_i32_f32_e32 v2, v2
+; SI-NEXT: v_cvt_i32_f32_e32 v2, v3
; SI-NEXT: s_mov_b32 s4, 0xc23369f4
; SI-NEXT: v_cmp_ngt_f32_e32 vcc, s4, v0
; SI-NEXT: s_mov_b32 s4, 0x421a209b
@@ -5876,25 +5877,14 @@ define half @v_exp10_f16(half %in) {
; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0
; GCN-NEXT: s_setpc_b64 s[30:31]
;
-; SI-SDAG-LABEL: v_exp10_f16:
-; SI-SDAG: ; %bb.0:
-; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0
-; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0
-; SI-SDAG-NEXT: v_mul_f32_e32 v0, 0x40549a78, v0
-; SI-SDAG-NEXT: v_exp_f32_e32 v0, v0
-; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0
-; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0
-; SI-SDAG-NEXT: s_setpc_b64 s[30:31]
-;
-; SI-GISEL-LABEL: v_exp10_f16:
-; SI-GISEL: ; %bb.0:
-; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0
-; SI-GISEL-NEXT: v_mul_f32_e32 v0, 0x40549a78, v0
-; SI-GISEL-NEXT: v_exp_f32_e32 v0, v0
-; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0
-; SI-GISEL-NEXT: s_setpc_b64 s[30:31]
+; SI-LABEL: v_exp10_f16:
+; SI: ; %bb.0:
+; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
+; SI-NEXT: v_mul_f32_e32 v0, 0x40549a78, v0
+; SI-NEXT: v_exp_f32_e32 v0, v0
+; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
+; SI-NEXT: s_setpc_b64 s[30:31]
;
; R600-LABEL: v_exp10_f16:
; R600: ; %bb.0:
@@ -5919,25 +5909,14 @@ define half @v_exp10_fabs_f16(half %in) {
; GCN-NEXT: v_cvt_f16_f32_e32 v0, v0
; GCN-NEXT: s_setpc_b64 s[30:31]
;
-; SI-SDAG-LABEL: v_exp10_fabs_f16:
-; SI-SDAG: ; %bb.0:
-; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0
-; SI-SDAG-NEXT: v_cvt_f32_f16_e64 v0, |v0|
-; SI-SDAG-NEXT: v_mul_f32_e32 v0, 0x40549a78, v0
-; SI-SDAG-NEXT: v_exp_f32_e32 v0, v0
-; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0
-; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0
-; SI-SDAG-NEXT: s_setpc_b64 s[30:31]
-;
-; SI-GISEL-LABEL: v_exp10_fabs_f16:
-; SI-GISEL: ; %bb.0:
-; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-GISEL-NEXT: v_cvt_f32_f16_e64 v0, |v0|
-; SI-GISEL-NEXT: v_mul_f32_e32 v0, 0x40549a78, v0
-; SI-GISEL-NEXT: v_exp_f32_e32 v0, v0
-; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0
-; SI-GISEL-NEXT: s_setpc_b64 s[30:31]
+; SI-LABEL: v_exp10_fabs_f16:
+; SI: ; %bb.0:
+; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT: v_cvt_f32_f16_e64 v0, |v0|
+; SI-NEXT: v_mul_f32_e32 v0, 0x40549a78, v0
+; SI-NEXT: v_exp_f32_e32 v0, v0
+; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
+; SI-NEXT: s_setpc_b64 s[30:31]
;
; R600-LABEL: v_exp10_fabs_f16:
; R600: ; %bb.0:
@@ -5975,12 +5954,10 @@ define half @v_exp10_fneg_fabs_f16(half %in) {
; SI-SDAG-LABEL: v_exp10_fneg_fabs_f16:
; SI-SDAG: ; %bb.0:
; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0
; SI-SDAG-NEXT: v_cvt_f32_f16_e64 v0, |v0|
; SI-SDAG-NEXT: v_mul_f32_e32 v0, 0xc0549a78, v0
; SI-SDAG-NEXT: v_exp_f32_e32 v0, v0
; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0
-; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0
; SI-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; SI-GISEL-LABEL: v_exp10_fneg_fabs_f16:
@@ -6029,12 +6006,10 @@ define half @v_exp10_fneg_f16(half %in) {
; SI-SDAG-LABEL: v_exp10_fneg_f16:
; SI-SDAG: ; %bb.0:
; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-SDAG-NEXT: v_cvt_f16_f32_e64 v0, -v0
; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0
-; SI-SDAG-NEXT: v_mul_f32_e32 v0, 0x40549a78, v0
+; SI-SDAG-NEXT: v_mul_f32_e32 v0, 0xc0549a78, v0
; SI-SDAG-NEXT: v_exp_f32_e32 v0, v0
; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0
-; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0
; SI-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; SI-GISEL-LABEL: v_exp10_fneg_f16:
@@ -6074,13 +6049,13 @@ define half @v_exp10_f16_fast(half %in) {
; SI-SDAG-LABEL: v_exp10_f16_fast:
; SI-SDAG: ; %bb.0:
; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0
; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0
; SI-SDAG-NEXT: v_mul_f32_e32 v1, 0x3a278000, v0
; SI-SDAG-NEXT: v_mul_f32_e32 v0, 0x40548000, v0
; SI-SDAG-NEXT: v_exp_f32_e32 v1, v1
; SI-SDAG-NEXT: v_exp_f32_e32 v0, v0
; SI-SDAG-NEXT: v_mul_f32_e32 v0, v0, v1
+; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0
; SI-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; SI-GISEL-LABEL: v_exp10_f16_fast:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.exp2.ll b/llvm/test/CodeGen/AMDGPU/llvm.exp2.ll
index 390fedb1d2ef3..97ecb5362a4bc 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.exp2.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.exp2.ll
@@ -2828,11 +2828,12 @@ define float @v_exp2_f32_from_fpext_bf16(bfloat %src) {
; SI-LABEL: v_exp2_f32_from_fpext_bf16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; SI-NEXT: s_mov_b32 s4, 0xc2fc0000
+; SI-NEXT: v_mov_b32_e32 v1, 0x42800000
; SI-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0
-; SI-NEXT: v_mov_b32_e32 v2, 0x42800000
-; SI-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
-; SI-NEXT: v_add_f32_e32 v0, v0, v2
+; SI-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
+; SI-NEXT: v_add_f32_e32 v0, v0, v1
; SI-NEXT: v_exp_f32_e32 v0, v0
; SI-NEXT: v_not_b32_e32 v1, 63
; SI-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
@@ -2896,23 +2897,13 @@ define float @v_exp2_f32_from_fpext_bf16(bfloat %src) {
; FIXME: Fold out fp16_to_fp (FP_TO_FP16) on no-f16 targets
define half @v_exp2_f16(half %in) {
-; SI-SDAG-LABEL: v_exp2_f16:
-; SI-SDAG: ; %bb.0:
-; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0
-; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0
-; SI-SDAG-NEXT: v_exp_f32_e32 v0, v0
-; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0
-; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0
-; SI-SDAG-NEXT: s_setpc_b64 s[30:31]
-;
-; SI-GISEL-LABEL: v_exp2_f16:
-; SI-GISEL: ; %bb.0:
-; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0
-; SI-GISEL-NEXT: v_exp_f32_e32 v0, v0
-; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0
-; SI-GISEL-NEXT: s_setpc_b64 s[30:31]
+; SI-LABEL: v_exp2_f16:
+; SI: ; %bb.0:
+; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
+; SI-NEXT: v_exp_f32_e32 v0, v0
+; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
+; SI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: v_exp2_f16:
; VI: ; %bb.0:
@@ -2945,23 +2936,13 @@ define half @v_exp2_f16(half %in) {
}
define half @v_exp2_fabs_f16(half %in) {
-; SI-SDAG-LABEL: v_exp2_fabs_f16:
-; SI-SDAG: ; %bb.0:
-; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0
-; SI-SDAG-NEXT: v_cvt_f32_f16_e64 v0, |v0|
-; SI-SDAG-NEXT: v_exp_f32_e32 v0, v0
-; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0
-; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0
-; SI-SDAG-NEXT: s_setpc_b64 s[30:31]
-;
-; SI-GISEL-LABEL: v_exp2_fabs_f16:
-; SI-GISEL: ; %bb.0:
-; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-GISEL-NEXT: v_cvt_f32_f16_e64 v0, |v0|
-; SI-GISEL-NEXT: v_exp_f32_e32 v0, v0
-; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0
-; SI-GISEL-NEXT: s_setpc_b64 s[30:31]
+; SI-LABEL: v_exp2_fabs_f16:
+; SI: ; %bb.0:
+; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT: v_cvt_f32_f16_e64 v0, |v0|
+; SI-NEXT: v_exp_f32_e32 v0, v0
+; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
+; SI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: v_exp2_fabs_f16:
; VI: ; %bb.0:
@@ -2998,11 +2979,10 @@ define half @v_exp2_fneg_fabs_f16(half %in) {
; SI-SDAG-LABEL: v_exp2_fneg_fabs_f16:
; SI-SDAG: ; %bb.0:
; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0
+; SI-SDAG-NEXT: v_and_b32_e32 v0, 0x7fff, v0
; SI-SDAG-NEXT: v_cvt_f32_f16_e64 v0, -|v0|
; SI-SDAG-NEXT: v_exp_f32_e32 v0, v0
; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0
-; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0
; SI-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; SI-GISEL-LABEL: v_exp2_fneg_fabs_f16:
@@ -3049,11 +3029,10 @@ define half @v_exp2_fneg_f16(half %in) {
; SI-SDAG-LABEL: v_exp2_fneg_f16:
; SI-SDAG: ; %bb.0:
; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-SDAG-NEXT: v_cvt_f16_f32_e64 v0, -v0
-; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0
+; SI-SDAG-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; SI-SDAG-NEXT: v_cvt_f32_f16_e64 v0, -v0
; SI-SDAG-NEXT: v_exp_f32_e32 v0, v0
; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0
-; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0
; SI-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; SI-GISEL-LABEL: v_exp2_fneg_f16:
@@ -3096,21 +3075,13 @@ define half @v_exp2_fneg_f16(half %in) {
}
define half @v_exp2_f16_fast(half %in) {
-; SI-SDAG-LABEL: v_exp2_f16_fast:
-; SI-SDAG: ; %bb.0:
-; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0
-; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0
-; SI-SDAG-NEXT: v_exp_f32_e32 v0, v0
-; SI-SDAG-NEXT: s_setpc_b64 s[30:31]
-;
-; SI-GISEL-LABEL: v_exp2_f16_fast:
-; SI-GISEL: ; %bb.0:
-; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0
-; SI-GISEL-NEXT: v_exp_f32_e32 v0, v0
-; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0
-; SI-GISEL-NEXT: s_setpc_b64 s[30:31]
+; SI-LABEL: v_exp2_f16_fast:
+; SI: ; %bb.0:
+; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
+; SI-NEXT: v_exp_f32_e32 v0, v0
+; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
+; SI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: v_exp2_f16_fast:
; VI: ; %bb.0:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.frexp.ll b/llvm/test/CodeGen/AMDGPU/llvm.frexp.ll
index c562eb168478f..7300e4227925f 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.frexp.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.frexp.ll
@@ -19,14 +19,14 @@ define { half, i32 } @test_frexp_f16_i32(half %a) {
; GFX6-SDAG-LABEL: test_frexp_f16_i32:
; GFX6-SDAG: ; %bb.0:
; GFX6-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX6-SDAG-NEXT: s_mov_b32 s4, 0x7f800000
; GFX6-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX6-SDAG-NEXT: v_frexp_mant_f32_e32 v1, v0
-; GFX6-SDAG-NEXT: v_frexp_exp_i32_f32_e32 v2, v0
+; GFX6-SDAG-NEXT: s_mov_b32 s4, 0x7f800000
+; GFX6-SDAG-NEXT: v_frexp_mant_f32_e32 v2, v0
; GFX6-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, s4
-; GFX6-SDAG-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
-; GFX6-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc
+; GFX6-SDAG-NEXT: v_frexp_exp_i32_f32_e32 v1, v0
+; GFX6-SDAG-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
+; GFX6-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX6-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
; GFX6-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: test_frexp_f16_i32:
@@ -155,12 +155,12 @@ define half @test_frexp_f16_i32_only_use_fract(half %a) {
; GFX6-SDAG-LABEL: test_frexp_f16_i32_only_use_fract:
; GFX6-SDAG: ; %bb.0:
; GFX6-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX6-SDAG-NEXT: s_mov_b32 s4, 0x7f800000
; GFX6-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX6-SDAG-NEXT: s_mov_b32 s4, 0x7f800000
; GFX6-SDAG-NEXT: v_frexp_mant_f32_e32 v1, v0
; GFX6-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, s4
; GFX6-SDAG-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
+; GFX6-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: test_frexp_f16_i32_only_use_fract:
@@ -258,9 +258,8 @@ define i32 @test_frexp_f16_i32_only_use_exp(half %a) {
; GFX6-SDAG-LABEL: test_frexp_f16_i32_only_use_exp:
; GFX6-SDAG: ; %bb.0:
; GFX6-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX6-SDAG-NEXT: s_mov_b32 s4, 0x7f800000
; GFX6-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX6-SDAG-NEXT: s_mov_b32 s4, 0x7f800000
; GFX6-SDAG-NEXT: v_frexp_exp_i32_f32_e32 v1, v0
; GFX6-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, s4
; GFX6-SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc
@@ -930,14 +929,14 @@ define { half, i16 } @test_frexp_f16_i16(half %a) {
; GFX6-SDAG-LABEL: test_frexp_f16_i16:
; GFX6-SDAG: ; %bb.0:
; GFX6-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX6-SDAG-NEXT: s_mov_b32 s4, 0x7f800000
; GFX6-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX6-SDAG-NEXT: v_frexp_mant_f32_e32 v1, v0
-; GFX6-SDAG-NEXT: v_frexp_exp_i32_f32_e32 v2, v0
+; GFX6-SDAG-NEXT: s_mov_b32 s4, 0x7f800000
+; GFX6-SDAG-NEXT: v_frexp_mant_f32_e32 v2, v0
; GFX6-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, s4
-; GFX6-SDAG-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
-; GFX6-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc
+; GFX6-SDAG-NEXT: v_frexp_exp_i32_f32_e32 v1, v0
+; GFX6-SDAG-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
+; GFX6-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX6-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
; GFX6-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: test_frexp_f16_i16:
@@ -1064,12 +1063,12 @@ define half @test_frexp_f16_i16_only_use_fract(half %a) {
; GFX6-SDAG-LABEL: test_frexp_f16_i16_only_use_fract:
; GFX6-SDAG: ; %bb.0:
; GFX6-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX6-SDAG-NEXT: s_mov_b32 s4, 0x7f800000
; GFX6-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX6-SDAG-NEXT: s_mov_b32 s4, 0x7f800000
; GFX6-SDAG-NEXT: v_frexp_mant_f32_e32 v1, v0
; GFX6-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, s4
; GFX6-SDAG-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
+; GFX6-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: test_frexp_f16_i16_only_use_fract:
@@ -1167,9 +1166,8 @@ define i16 @test_frexp_f16_i16_only_use_exp(half %a) {
; GFX6-SDAG-LABEL: test_frexp_f16_i16_only_use_exp:
; GFX6-SDAG: ; %bb.0:
; GFX6-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX6-SDAG-NEXT: s_mov_b32 s4, 0x7f800000
; GFX6-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX6-SDAG-NEXT: s_mov_b32 s4, 0x7f800000
; GFX6-SDAG-NEXT: v_frexp_exp_i32_f32_e32 v1, v0
; GFX6-SDAG-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, s4
; GFX6-SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v1, vcc
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.bf16.ll b/llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.bf16.ll
index e94a2813f2ecc..46cd8c07345ce 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.bf16.ll
@@ -161,8 +161,7 @@ define i1 @snan_bf16(bfloat %x) nounwind {
; GFX7CHECK-LABEL: snan_bf16:
; GFX7CHECK: ; %bb.0:
; GFX7CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7CHECK-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GFX7CHECK-NEXT: v_bfe_u32 v0, v0, 16, 15
+; GFX7CHECK-NEXT: v_and_b32_e32 v0, 0x7fff, v0
; GFX7CHECK-NEXT: s_movk_i32 s4, 0x7fc0
; GFX7CHECK-NEXT: v_cmp_gt_i32_e32 vcc, s4, v0
; GFX7CHECK-NEXT: s_movk_i32 s4, 0x7f80
@@ -232,8 +231,7 @@ define i1 @qnan_bf16(bfloat %x) nounwind {
; GFX7CHECK-LABEL: qnan_bf16:
; GFX7CHECK: ; %bb.0:
; GFX7CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7CHECK-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GFX7CHECK-NEXT: v_bfe_u32 v0, v0, 16, 15
+; GFX7CHECK-NEXT: v_and_b32_e32 v0, 0x7fff, v0
; GFX7CHECK-NEXT: s_movk_i32 s4, 0x7fbf
; GFX7CHECK-NEXT: v_cmp_lt_i32_e32 vcc, s4, v0
; GFX7CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
@@ -288,8 +286,7 @@ define i1 @posinf_bf16(bfloat %x) nounwind {
; GFX7CHECK-LABEL: posinf_bf16:
; GFX7CHECK: ; %bb.0:
; GFX7CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7CHECK-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GFX7CHECK-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX7CHECK-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX7CHECK-NEXT: s_movk_i32 s4, 0x7f80
; GFX7CHECK-NEXT: v_cmp_eq_u32_e32 vcc, s4, v0
; GFX7CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
@@ -339,8 +336,7 @@ define i1 @neginf_bf16(bfloat %x) nounwind {
; GFX7CHECK-LABEL: neginf_bf16:
; GFX7CHECK: ; %bb.0:
; GFX7CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7CHECK-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GFX7CHECK-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX7CHECK-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX7CHECK-NEXT: s_mov_b32 s4, 0xff80
; GFX7CHECK-NEXT: v_cmp_eq_u32_e32 vcc, s4, v0
; GFX7CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
@@ -390,9 +386,8 @@ define i1 @posnormal_bf16(bfloat %x) nounwind {
; GFX7CHECK-LABEL: posnormal_bf16:
; GFX7CHECK: ; %bb.0:
; GFX7CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7CHECK-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GFX7CHECK-NEXT: v_ashrrev_i32_e32 v1, 16, v0
-; GFX7CHECK-NEXT: v_bfe_u32 v0, v0, 16, 15
+; GFX7CHECK-NEXT: v_bfe_i32 v1, v0, 0, 16
+; GFX7CHECK-NEXT: v_and_b32_e32 v0, 0x7fff, v0
; GFX7CHECK-NEXT: v_add_i32_e32 v0, vcc, 0xffffff80, v0
; GFX7CHECK-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX7CHECK-NEXT: s_movk_i32 s6, 0x7f00
@@ -466,9 +461,8 @@ define i1 @negnormal_bf16(bfloat %x) nounwind {
; GFX7CHECK-LABEL: negnormal_bf16:
; GFX7CHECK: ; %bb.0:
; GFX7CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7CHECK-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GFX7CHECK-NEXT: v_ashrrev_i32_e32 v1, 16, v0
-; GFX7CHECK-NEXT: v_bfe_u32 v0, v0, 16, 15
+; GFX7CHECK-NEXT: v_bfe_i32 v1, v0, 0, 16
+; GFX7CHECK-NEXT: v_and_b32_e32 v0, 0x7fff, v0
; GFX7CHECK-NEXT: v_add_i32_e32 v0, vcc, 0xffffff80, v0
; GFX7CHECK-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX7CHECK-NEXT: s_movk_i32 s6, 0x7f00
@@ -542,8 +536,6 @@ define i1 @possubnormal_bf16(bfloat %x) nounwind {
; GFX7CHECK-LABEL: possubnormal_bf16:
; GFX7CHECK: ; %bb.0:
; GFX7CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7CHECK-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GFX7CHECK-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX7CHECK-NEXT: v_add_i32_e32 v0, vcc, -1, v0
; GFX7CHECK-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX7CHECK-NEXT: s_movk_i32 s4, 0x7f
@@ -600,9 +592,8 @@ define i1 @negsubnormal_bf16(bfloat %x) nounwind {
; GFX7CHECK-LABEL: negsubnormal_bf16:
; GFX7CHECK: ; %bb.0:
; GFX7CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7CHECK-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GFX7CHECK-NEXT: v_ashrrev_i32_e32 v1, 16, v0
-; GFX7CHECK-NEXT: v_bfe_u32 v0, v0, 16, 15
+; GFX7CHECK-NEXT: v_bfe_i32 v1, v0, 0, 16
+; GFX7CHECK-NEXT: v_and_b32_e32 v0, 0x7fff, v0
; GFX7CHECK-NEXT: v_add_i32_e64 v0, s[4:5], -1, v0
; GFX7CHECK-NEXT: s_movk_i32 s4, 0x7f
; GFX7CHECK-NEXT: v_cmp_gt_i32_e32 vcc, 0, v1
@@ -675,8 +666,7 @@ define i1 @poszero_bf16(bfloat %x) nounwind {
; GFX7CHECK-LABEL: poszero_bf16:
; GFX7CHECK: ; %bb.0:
; GFX7CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7CHECK-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GFX7CHECK-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX7CHECK-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX7CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX7CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
; GFX7CHECK-NEXT: s_setpc_b64 s[30:31]
@@ -723,8 +713,7 @@ define i1 @negzero_bf16(bfloat %x) nounwind {
; GFX7CHECK-LABEL: negzero_bf16:
; GFX7CHECK: ; %bb.0:
; GFX7CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7CHECK-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GFX7CHECK-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX7CHECK-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX7CHECK-NEXT: s_mov_b32 s4, 0x8000
; GFX7CHECK-NEXT: v_cmp_eq_u32_e32 vcc, s4, v0
; GFX7CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
@@ -774,8 +763,7 @@ define i1 @posfinite_bf16(bfloat %x) nounwind {
; GFX7CHECK-LABEL: posfinite_bf16:
; GFX7CHECK: ; %bb.0:
; GFX7CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7CHECK-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GFX7CHECK-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX7CHECK-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX7CHECK-NEXT: s_movk_i32 s4, 0x7f80
; GFX7CHECK-NEXT: v_cmp_gt_u32_e32 vcc, s4, v0
; GFX7CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
@@ -825,9 +813,8 @@ define i1 @negfinite_bf16(bfloat %x) nounwind {
; GFX7CHECK-LABEL: negfinite_bf16:
; GFX7CHECK: ; %bb.0:
; GFX7CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7CHECK-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GFX7CHECK-NEXT: v_ashrrev_i32_e32 v1, 16, v0
-; GFX7CHECK-NEXT: v_bfe_u32 v0, v0, 16, 15
+; GFX7CHECK-NEXT: v_bfe_i32 v1, v0, 0, 16
+; GFX7CHECK-NEXT: v_and_b32_e32 v0, 0x7fff, v0
; GFX7CHECK-NEXT: s_movk_i32 s4, 0x7f80
; GFX7CHECK-NEXT: v_cmp_gt_i32_e32 vcc, 0, v1
; GFX7CHECK-NEXT: v_cmp_gt_i32_e64 s[4:5], s4, v0
@@ -894,8 +881,7 @@ define i1 @isnan_bf16(bfloat %x) nounwind {
; GFX7CHECK-LABEL: isnan_bf16:
; GFX7CHECK: ; %bb.0:
; GFX7CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7CHECK-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GFX7CHECK-NEXT: v_bfe_u32 v0, v0, 16, 15
+; GFX7CHECK-NEXT: v_and_b32_e32 v0, 0x7fff, v0
; GFX7CHECK-NEXT: s_movk_i32 s4, 0x7f80
; GFX7CHECK-NEXT: v_cmp_lt_i32_e32 vcc, s4, v0
; GFX7CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
@@ -950,8 +936,7 @@ define i1 @not_isnan_bf16(bfloat %x) {
; GFX7CHECK-LABEL: not_isnan_bf16:
; GFX7CHECK: ; %bb.0:
; GFX7CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7CHECK-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GFX7CHECK-NEXT: v_bfe_u32 v0, v0, 16, 15
+; GFX7CHECK-NEXT: v_and_b32_e32 v0, 0x7fff, v0
; GFX7CHECK-NEXT: s_movk_i32 s4, 0x7f81
; GFX7CHECK-NEXT: v_cmp_gt_i32_e32 vcc, s4, v0
; GFX7CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
@@ -1276,8 +1261,7 @@ define i1 @isinf_bf16(bfloat %x) nounwind {
; GFX7CHECK-LABEL: isinf_bf16:
; GFX7CHECK: ; %bb.0:
; GFX7CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7CHECK-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GFX7CHECK-NEXT: v_bfe_u32 v0, v0, 16, 15
+; GFX7CHECK-NEXT: v_and_b32_e32 v0, 0x7fff, v0
; GFX7CHECK-NEXT: s_movk_i32 s4, 0x7f80
; GFX7CHECK-NEXT: v_cmp_eq_u32_e32 vcc, s4, v0
; GFX7CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
@@ -1332,8 +1316,7 @@ define i1 @isfinite_bf16(bfloat %x) nounwind {
; GFX7CHECK-LABEL: isfinite_bf16:
; GFX7CHECK: ; %bb.0:
; GFX7CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7CHECK-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GFX7CHECK-NEXT: v_bfe_u32 v0, v0, 16, 15
+; GFX7CHECK-NEXT: v_and_b32_e32 v0, 0x7fff, v0
; GFX7CHECK-NEXT: s_movk_i32 s4, 0x7f80
; GFX7CHECK-NEXT: v_cmp_gt_i32_e32 vcc, s4, v0
; GFX7CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
@@ -1388,8 +1371,6 @@ define i1 @issubnormal_or_zero_bf16(bfloat %x) {
; GFX7CHECK-LABEL: issubnormal_or_zero_bf16:
; GFX7CHECK: ; %bb.0: ; %entry
; GFX7CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7CHECK-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GFX7CHECK-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX7CHECK-NEXT: v_and_b32_e32 v0, 0x7f80, v0
; GFX7CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX7CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
@@ -1443,8 +1424,6 @@ define i1 @not_issubnormal_or_zero_bf16(bfloat %x) {
; GFX7CHECK-LABEL: not_issubnormal_or_zero_bf16:
; GFX7CHECK: ; %bb.0: ; %entry
; GFX7CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7CHECK-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GFX7CHECK-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX7CHECK-NEXT: v_and_b32_e32 v0, 0x7f80, v0
; GFX7CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
; GFX7CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
@@ -1498,8 +1477,7 @@ define i1 @isnormal_bf16(bfloat %x) {
; GFX7CHECK-LABEL: isnormal_bf16:
; GFX7CHECK: ; %bb.0:
; GFX7CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7CHECK-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GFX7CHECK-NEXT: v_bfe_u32 v0, v0, 16, 15
+; GFX7CHECK-NEXT: v_and_b32_e32 v0, 0x7fff, v0
; GFX7CHECK-NEXT: v_add_i32_e32 v0, vcc, 0xffffff80, v0
; GFX7CHECK-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX7CHECK-NEXT: s_movk_i32 s4, 0x7f00
@@ -1561,8 +1539,7 @@ define i1 @not_isnormal_bf16(bfloat %x) {
; GFX7CHECK-LABEL: not_isnormal_bf16:
; GFX7CHECK: ; %bb.0:
; GFX7CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7CHECK-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GFX7CHECK-NEXT: v_bfe_u32 v0, v0, 16, 15
+; GFX7CHECK-NEXT: v_and_b32_e32 v0, 0x7fff, v0
; GFX7CHECK-NEXT: v_add_i32_e32 v0, vcc, 0xffffff80, v0
; GFX7CHECK-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX7CHECK-NEXT: s_movk_i32 s4, 0x7eff
@@ -1624,9 +1601,8 @@ define i1 @not_is_plus_normal_bf16(bfloat %x) {
; GFX7CHECK-LABEL: not_is_plus_normal_bf16:
; GFX7CHECK: ; %bb.0:
; GFX7CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7CHECK-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GFX7CHECK-NEXT: v_ashrrev_i32_e32 v1, 16, v0
-; GFX7CHECK-NEXT: v_bfe_u32 v0, v0, 16, 15
+; GFX7CHECK-NEXT: v_bfe_i32 v1, v0, 0, 16
+; GFX7CHECK-NEXT: v_and_b32_e32 v0, 0x7fff, v0
; GFX7CHECK-NEXT: v_add_i32_e32 v0, vcc, 0xffffff80, v0
; GFX7CHECK-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX7CHECK-NEXT: s_movk_i32 s6, 0x7eff
@@ -1700,9 +1676,8 @@ define i1 @not_is_neg_normal_bf16(bfloat %x) {
; GFX7CHECK-LABEL: not_is_neg_normal_bf16:
; GFX7CHECK: ; %bb.0:
; GFX7CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7CHECK-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GFX7CHECK-NEXT: v_ashrrev_i32_e32 v1, 16, v0
-; GFX7CHECK-NEXT: v_bfe_u32 v0, v0, 16, 15
+; GFX7CHECK-NEXT: v_bfe_i32 v1, v0, 0, 16
+; GFX7CHECK-NEXT: v_and_b32_e32 v0, 0x7fff, v0
; GFX7CHECK-NEXT: v_add_i32_e32 v0, vcc, 0xffffff80, v0
; GFX7CHECK-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX7CHECK-NEXT: s_movk_i32 s6, 0x7eff
@@ -1776,8 +1751,7 @@ define i1 @issubnormal_bf16(bfloat %x) {
; GFX7CHECK-LABEL: issubnormal_bf16:
; GFX7CHECK: ; %bb.0:
; GFX7CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7CHECK-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GFX7CHECK-NEXT: v_bfe_u32 v0, v0, 16, 15
+; GFX7CHECK-NEXT: v_and_b32_e32 v0, 0x7fff, v0
; GFX7CHECK-NEXT: v_add_i32_e32 v0, vcc, -1, v0
; GFX7CHECK-NEXT: s_movk_i32 s4, 0x7f
; GFX7CHECK-NEXT: v_cmp_gt_u32_e32 vcc, s4, v0
@@ -1838,8 +1812,7 @@ define i1 @not_issubnormal_bf16(bfloat %x) {
; GFX7CHECK-LABEL: not_issubnormal_bf16:
; GFX7CHECK: ; %bb.0:
; GFX7CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7CHECK-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GFX7CHECK-NEXT: v_bfe_u32 v0, v0, 16, 15
+; GFX7CHECK-NEXT: v_and_b32_e32 v0, 0x7fff, v0
; GFX7CHECK-NEXT: v_add_i32_e32 v0, vcc, -1, v0
; GFX7CHECK-NEXT: s_movk_i32 s4, 0x7e
; GFX7CHECK-NEXT: v_cmp_lt_u32_e32 vcc, s4, v0
@@ -1900,8 +1873,7 @@ define i1 @iszero_bf16(bfloat %x) {
; GFX7CHECK-LABEL: iszero_bf16:
; GFX7CHECK: ; %bb.0:
; GFX7CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7CHECK-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GFX7CHECK-NEXT: v_bfe_u32 v0, v0, 16, 15
+; GFX7CHECK-NEXT: v_and_b32_e32 v0, 0x7fff, v0
; GFX7CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX7CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
; GFX7CHECK-NEXT: s_setpc_b64 s[30:31]
@@ -1953,8 +1925,7 @@ define i1 @not_iszero_bf16(bfloat %x) {
; GFX7CHECK-LABEL: not_iszero_bf16:
; GFX7CHECK: ; %bb.0:
; GFX7CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7CHECK-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GFX7CHECK-NEXT: v_bfe_u32 v0, v0, 16, 15
+; GFX7CHECK-NEXT: v_and_b32_e32 v0, 0x7fff, v0
; GFX7CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
; GFX7CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
; GFX7CHECK-NEXT: s_setpc_b64 s[30:31]
@@ -2006,8 +1977,7 @@ define i1 @ispositive_bf16(bfloat %x) {
; GFX7CHECK-LABEL: ispositive_bf16:
; GFX7CHECK: ; %bb.0:
; GFX7CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7CHECK-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GFX7CHECK-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX7CHECK-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX7CHECK-NEXT: s_movk_i32 s4, 0x7f81
; GFX7CHECK-NEXT: v_cmp_gt_u32_e32 vcc, s4, v0
; GFX7CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
@@ -2057,10 +2027,9 @@ define i1 @not_ispositive_bf16(bfloat %x) {
; GFX7CHECK-LABEL: not_ispositive_bf16:
; GFX7CHECK: ; %bb.0:
; GFX7CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7CHECK-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GFX7CHECK-NEXT: v_lshrrev_b32_e32 v1, 16, v0
-; GFX7CHECK-NEXT: v_ashrrev_i32_e32 v2, 16, v0
-; GFX7CHECK-NEXT: v_bfe_u32 v0, v0, 16, 15
+; GFX7CHECK-NEXT: v_and_b32_e32 v1, 0xffff, v0
+; GFX7CHECK-NEXT: v_bfe_i32 v2, v0, 0, 16
+; GFX7CHECK-NEXT: v_and_b32_e32 v0, 0x7fff, v0
; GFX7CHECK-NEXT: s_movk_i32 s6, 0x7f80
; GFX7CHECK-NEXT: v_cmp_gt_i32_e32 vcc, 0, v2
; GFX7CHECK-NEXT: v_cmp_gt_i32_e64 s[4:5], s6, v0
@@ -2154,10 +2123,9 @@ define i1 @isnegative_bf16(bfloat %x) {
; GFX7CHECK-LABEL: isnegative_bf16:
; GFX7CHECK: ; %bb.0:
; GFX7CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7CHECK-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GFX7CHECK-NEXT: v_lshrrev_b32_e32 v1, 16, v0
-; GFX7CHECK-NEXT: v_ashrrev_i32_e32 v2, 16, v0
-; GFX7CHECK-NEXT: v_bfe_u32 v0, v0, 16, 15
+; GFX7CHECK-NEXT: v_and_b32_e32 v1, 0xffff, v0
+; GFX7CHECK-NEXT: v_bfe_i32 v2, v0, 0, 16
+; GFX7CHECK-NEXT: v_and_b32_e32 v0, 0x7fff, v0
; GFX7CHECK-NEXT: s_movk_i32 s4, 0x7f80
; GFX7CHECK-NEXT: v_cmp_gt_i32_e32 vcc, 0, v2
; GFX7CHECK-NEXT: v_cmp_gt_i32_e64 s[4:5], s4, v0
@@ -2239,9 +2207,8 @@ define i1 @not_isnegative_bf16(bfloat %x) {
; GFX7CHECK-LABEL: not_isnegative_bf16:
; GFX7CHECK: ; %bb.0:
; GFX7CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7CHECK-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GFX7CHECK-NEXT: v_lshrrev_b32_e32 v1, 16, v0
-; GFX7CHECK-NEXT: v_bfe_u32 v0, v0, 16, 15
+; GFX7CHECK-NEXT: v_and_b32_e32 v1, 0xffff, v0
+; GFX7CHECK-NEXT: v_and_b32_e32 v0, 0x7fff, v0
; GFX7CHECK-NEXT: s_movk_i32 s4, 0x7f80
; GFX7CHECK-NEXT: v_cmp_lt_i32_e32 vcc, s4, v0
; GFX7CHECK-NEXT: s_movk_i32 s4, 0x7f81
@@ -2311,8 +2278,7 @@ define i1 @iszero_or_nan_bf16(bfloat %x) {
; GFX7CHECK-LABEL: iszero_or_nan_bf16:
; GFX7CHECK: ; %bb.0: ; %entry
; GFX7CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7CHECK-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GFX7CHECK-NEXT: v_bfe_u32 v0, v0, 16, 15
+; GFX7CHECK-NEXT: v_and_b32_e32 v0, 0x7fff, v0
; GFX7CHECK-NEXT: s_movk_i32 s4, 0x7f80
; GFX7CHECK-NEXT: v_cmp_lt_i32_e32 vcc, s4, v0
; GFX7CHECK-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v0
@@ -2380,8 +2346,7 @@ define i1 @iszero_or_nan_f_daz(bfloat %x) #0 {
; GFX7CHECK-LABEL: iszero_or_nan_f_daz:
; GFX7CHECK: ; %bb.0: ; %entry
; GFX7CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7CHECK-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GFX7CHECK-NEXT: v_bfe_u32 v0, v0, 16, 15
+; GFX7CHECK-NEXT: v_and_b32_e32 v0, 0x7fff, v0
; GFX7CHECK-NEXT: s_movk_i32 s4, 0x7f80
; GFX7CHECK-NEXT: v_cmp_lt_i32_e32 vcc, s4, v0
; GFX7CHECK-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v0
@@ -2449,8 +2414,7 @@ define i1 @iszero_or_nan_f_maybe_daz(bfloat %x) #1 {
; GFX7CHECK-LABEL: iszero_or_nan_f_maybe_daz:
; GFX7CHECK: ; %bb.0: ; %entry
; GFX7CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7CHECK-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GFX7CHECK-NEXT: v_bfe_u32 v0, v0, 16, 15
+; GFX7CHECK-NEXT: v_and_b32_e32 v0, 0x7fff, v0
; GFX7CHECK-NEXT: s_movk_i32 s4, 0x7f80
; GFX7CHECK-NEXT: v_cmp_lt_i32_e32 vcc, s4, v0
; GFX7CHECK-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v0
@@ -2518,8 +2482,7 @@ define i1 @not_iszero_or_nan_bf16(bfloat %x) {
; GFX7CHECK-LABEL: not_iszero_or_nan_bf16:
; GFX7CHECK: ; %bb.0: ; %entry
; GFX7CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7CHECK-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GFX7CHECK-NEXT: v_bfe_u32 v0, v0, 16, 15
+; GFX7CHECK-NEXT: v_and_b32_e32 v0, 0x7fff, v0
; GFX7CHECK-NEXT: s_movk_i32 s4, 0x7f81
; GFX7CHECK-NEXT: v_cmp_gt_i32_e32 vcc, s4, v0
; GFX7CHECK-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v0
@@ -2587,8 +2550,7 @@ define i1 @not_iszero_or_nan_f_daz(bfloat %x) #0 {
; GFX7CHECK-LABEL: not_iszero_or_nan_f_daz:
; GFX7CHECK: ; %bb.0: ; %entry
; GFX7CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7CHECK-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GFX7CHECK-NEXT: v_bfe_u32 v0, v0, 16, 15
+; GFX7CHECK-NEXT: v_and_b32_e32 v0, 0x7fff, v0
; GFX7CHECK-NEXT: s_movk_i32 s4, 0x7f81
; GFX7CHECK-NEXT: v_cmp_gt_i32_e32 vcc, s4, v0
; GFX7CHECK-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v0
@@ -2656,8 +2618,7 @@ define i1 @not_iszero_or_nan_f_maybe_daz(bfloat %x) #1 {
; GFX7CHECK-LABEL: not_iszero_or_nan_f_maybe_daz:
; GFX7CHECK: ; %bb.0: ; %entry
; GFX7CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7CHECK-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GFX7CHECK-NEXT: v_bfe_u32 v0, v0, 16, 15
+; GFX7CHECK-NEXT: v_and_b32_e32 v0, 0x7fff, v0
; GFX7CHECK-NEXT: s_movk_i32 s4, 0x7f81
; GFX7CHECK-NEXT: v_cmp_gt_i32_e32 vcc, s4, v0
; GFX7CHECK-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v0
@@ -2725,8 +2686,7 @@ define i1 @iszero_or_qnan_bf16(bfloat %x) {
; GFX7CHECK-LABEL: iszero_or_qnan_bf16:
; GFX7CHECK: ; %bb.0: ; %entry
; GFX7CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7CHECK-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GFX7CHECK-NEXT: v_bfe_u32 v0, v0, 16, 15
+; GFX7CHECK-NEXT: v_and_b32_e32 v0, 0x7fff, v0
; GFX7CHECK-NEXT: s_movk_i32 s4, 0x7fbf
; GFX7CHECK-NEXT: v_cmp_lt_i32_e32 vcc, s4, v0
; GFX7CHECK-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v0
@@ -2794,8 +2754,7 @@ define i1 @iszero_or_snan_bf16(bfloat %x) {
; GFX7CHECK-LABEL: iszero_or_snan_bf16:
; GFX7CHECK: ; %bb.0: ; %entry
; GFX7CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7CHECK-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GFX7CHECK-NEXT: v_bfe_u32 v0, v0, 16, 15
+; GFX7CHECK-NEXT: v_and_b32_e32 v0, 0x7fff, v0
; GFX7CHECK-NEXT: s_movk_i32 s4, 0x7fc0
; GFX7CHECK-NEXT: v_cmp_gt_i32_e32 vcc, s4, v0
; GFX7CHECK-NEXT: s_movk_i32 s4, 0x7f80
@@ -2878,8 +2837,7 @@ define i1 @not_iszero_or_qnan_bf16(bfloat %x) {
; GFX7CHECK-LABEL: not_iszero_or_qnan_bf16:
; GFX7CHECK: ; %bb.0: ; %entry
; GFX7CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7CHECK-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GFX7CHECK-NEXT: v_bfe_u32 v0, v0, 16, 15
+; GFX7CHECK-NEXT: v_and_b32_e32 v0, 0x7fff, v0
; GFX7CHECK-NEXT: s_movk_i32 s4, 0x7fc0
; GFX7CHECK-NEXT: s_movk_i32 s8, 0x7f80
; GFX7CHECK-NEXT: v_cmp_gt_i32_e32 vcc, s4, v0
@@ -3005,8 +2963,7 @@ define i1 @not_iszero_or_snan_bf16(bfloat %x) {
; GFX7CHECK-LABEL: not_iszero_or_snan_bf16:
; GFX7CHECK: ; %bb.0: ; %entry
; GFX7CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7CHECK-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GFX7CHECK-NEXT: v_bfe_u32 v0, v0, 16, 15
+; GFX7CHECK-NEXT: v_and_b32_e32 v0, 0x7fff, v0
; GFX7CHECK-NEXT: s_movk_i32 s4, 0x7f80
; GFX7CHECK-NEXT: v_cmp_eq_u32_e32 vcc, s4, v0
; GFX7CHECK-NEXT: v_add_i32_e64 v1, s[4:5], -1, v0
@@ -3120,8 +3077,7 @@ define i1 @isinf_or_nan_bf16(bfloat %x) {
; GFX7CHECK-LABEL: isinf_or_nan_bf16:
; GFX7CHECK: ; %bb.0: ; %entry
; GFX7CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7CHECK-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GFX7CHECK-NEXT: v_bfe_u32 v0, v0, 16, 15
+; GFX7CHECK-NEXT: v_and_b32_e32 v0, 0x7fff, v0
; GFX7CHECK-NEXT: s_movk_i32 s4, 0x7f7f
; GFX7CHECK-NEXT: v_cmp_lt_i32_e32 vcc, s4, v0
; GFX7CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
@@ -3177,8 +3133,7 @@ define i1 @not_isinf_or_nan_bf16(bfloat %x) {
; GFX7CHECK-LABEL: not_isinf_or_nan_bf16:
; GFX7CHECK: ; %bb.0: ; %entry
; GFX7CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7CHECK-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GFX7CHECK-NEXT: v_bfe_u32 v0, v0, 16, 15
+; GFX7CHECK-NEXT: v_and_b32_e32 v0, 0x7fff, v0
; GFX7CHECK-NEXT: s_movk_i32 s4, 0x7f80
; GFX7CHECK-NEXT: v_cmp_gt_i32_e32 vcc, s4, v0
; GFX7CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
@@ -3234,8 +3189,7 @@ define i1 @isfinite_or_nan_f(bfloat %x) {
; GFX7CHECK-LABEL: isfinite_or_nan_f:
; GFX7CHECK: ; %bb.0: ; %entry
; GFX7CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7CHECK-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GFX7CHECK-NEXT: v_bfe_u32 v0, v0, 16, 15
+; GFX7CHECK-NEXT: v_and_b32_e32 v0, 0x7fff, v0
; GFX7CHECK-NEXT: s_movk_i32 s4, 0x7f80
; GFX7CHECK-NEXT: v_cmp_ne_u32_e32 vcc, s4, v0
; GFX7CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
@@ -3291,8 +3245,7 @@ define i1 @not_isfinite_or_nan_f(bfloat %x) {
; GFX7CHECK-LABEL: not_isfinite_or_nan_f:
; GFX7CHECK: ; %bb.0: ; %entry
; GFX7CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7CHECK-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GFX7CHECK-NEXT: v_bfe_u32 v0, v0, 16, 15
+; GFX7CHECK-NEXT: v_and_b32_e32 v0, 0x7fff, v0
; GFX7CHECK-NEXT: s_movk_i32 s4, 0x7f80
; GFX7CHECK-NEXT: v_cmp_eq_u32_e32 vcc, s4, v0
; GFX7CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.f16.ll
index a7b6e5877adf4..3863da4fa6389 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.f16.ll
@@ -266,12 +266,11 @@ define i1 @snan_f16(half %x) nounwind {
; GFX7SELDAG-LABEL: snan_f16:
; GFX7SELDAG: ; %bb.0:
; GFX7SELDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7SELDAG-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX7SELDAG-NEXT: s_movk_i32 s4, 0x7e00
-; GFX7SELDAG-NEXT: s_movk_i32 s5, 0x7c00
; GFX7SELDAG-NEXT: v_and_b32_e32 v0, 0x7fff, v0
+; GFX7SELDAG-NEXT: s_movk_i32 s4, 0x7e00
; GFX7SELDAG-NEXT: v_cmp_gt_i32_e32 vcc, s4, v0
-; GFX7SELDAG-NEXT: v_cmp_lt_i32_e64 s[4:5], s5, v0
+; GFX7SELDAG-NEXT: s_movk_i32 s4, 0x7c00
+; GFX7SELDAG-NEXT: v_cmp_lt_i32_e64 s[4:5], s4, v0
; GFX7SELDAG-NEXT: s_and_b64 s[4:5], s[4:5], vcc
; GFX7SELDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
; GFX7SELDAG-NEXT: s_setpc_b64 s[30:31]
@@ -345,9 +344,8 @@ define i1 @qnan_f16(half %x) nounwind {
; GFX7SELDAG-LABEL: qnan_f16:
; GFX7SELDAG: ; %bb.0:
; GFX7SELDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7SELDAG-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX7SELDAG-NEXT: s_movk_i32 s4, 0x7dff
; GFX7SELDAG-NEXT: v_and_b32_e32 v0, 0x7fff, v0
+; GFX7SELDAG-NEXT: s_movk_i32 s4, 0x7dff
; GFX7SELDAG-NEXT: v_cmp_lt_i32_e32 vcc, s4, v0
; GFX7SELDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
; GFX7SELDAG-NEXT: s_setpc_b64 s[30:31]
@@ -420,7 +418,7 @@ define i1 @posinf_f16(half %x) nounwind {
; GFX7SELDAG-LABEL: posinf_f16:
; GFX7SELDAG: ; %bb.0:
; GFX7SELDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7SELDAG-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX7SELDAG-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX7SELDAG-NEXT: s_movk_i32 s4, 0x7c00
; GFX7SELDAG-NEXT: v_cmp_eq_u32_e32 vcc, s4, v0
; GFX7SELDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
@@ -495,7 +493,7 @@ define i1 @neginf_f16(half %x) nounwind {
; GFX7SELDAG-LABEL: neginf_f16:
; GFX7SELDAG: ; %bb.0:
; GFX7SELDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7SELDAG-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX7SELDAG-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX7SELDAG-NEXT: s_mov_b32 s4, 0xfc00
; GFX7SELDAG-NEXT: v_cmp_eq_u32_e32 vcc, s4, v0
; GFX7SELDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
@@ -568,12 +566,11 @@ define i1 @posnormal_f16(half %x) nounwind {
; GFX7SELDAG-LABEL: posnormal_f16:
; GFX7SELDAG: ; %bb.0:
; GFX7SELDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7SELDAG-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX7SELDAG-NEXT: s_movk_i32 s6, 0x7800
; GFX7SELDAG-NEXT: v_bfe_i32 v1, v0, 0, 16
; GFX7SELDAG-NEXT: v_and_b32_e32 v0, 0x7fff, v0
; GFX7SELDAG-NEXT: v_add_i32_e32 v0, vcc, 0xfffffc00, v0
; GFX7SELDAG-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX7SELDAG-NEXT: s_movk_i32 s6, 0x7800
; GFX7SELDAG-NEXT: v_cmp_lt_i32_e64 s[4:5], -1, v1
; GFX7SELDAG-NEXT: v_cmp_gt_u32_e32 vcc, s6, v0
; GFX7SELDAG-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
@@ -655,12 +652,11 @@ define i1 @negnormal_f16(half %x) nounwind {
; GFX7SELDAG-LABEL: negnormal_f16:
; GFX7SELDAG: ; %bb.0:
; GFX7SELDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7SELDAG-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX7SELDAG-NEXT: s_movk_i32 s6, 0x7800
; GFX7SELDAG-NEXT: v_bfe_i32 v1, v0, 0, 16
; GFX7SELDAG-NEXT: v_and_b32_e32 v0, 0x7fff, v0
; GFX7SELDAG-NEXT: v_add_i32_e32 v0, vcc, 0xfffffc00, v0
; GFX7SELDAG-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX7SELDAG-NEXT: s_movk_i32 s6, 0x7800
; GFX7SELDAG-NEXT: v_cmp_gt_i32_e64 s[4:5], 0, v1
; GFX7SELDAG-NEXT: v_cmp_gt_u32_e32 vcc, s6, v0
; GFX7SELDAG-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
@@ -740,10 +736,9 @@ define i1 @possubnormal_f16(half %x) nounwind {
; GFX7SELDAG-LABEL: possubnormal_f16:
; GFX7SELDAG: ; %bb.0:
; GFX7SELDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7SELDAG-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX7SELDAG-NEXT: s_movk_i32 s4, 0x3ff
; GFX7SELDAG-NEXT: v_add_i32_e32 v0, vcc, -1, v0
; GFX7SELDAG-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX7SELDAG-NEXT: s_movk_i32 s4, 0x3ff
; GFX7SELDAG-NEXT: v_cmp_gt_u32_e32 vcc, s4, v0
; GFX7SELDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
; GFX7SELDAG-NEXT: s_setpc_b64 s[30:31]
@@ -818,7 +813,6 @@ define i1 @negsubnormal_f16(half %x) nounwind {
; GFX7SELDAG-LABEL: negsubnormal_f16:
; GFX7SELDAG: ; %bb.0:
; GFX7SELDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7SELDAG-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX7SELDAG-NEXT: v_bfe_i32 v1, v0, 0, 16
; GFX7SELDAG-NEXT: v_and_b32_e32 v0, 0x7fff, v0
; GFX7SELDAG-NEXT: v_add_i32_e64 v0, s[4:5], -1, v0
@@ -899,21 +893,13 @@ define i1 @negsubnormal_f16(half %x) nounwind {
}
define i1 @poszero_f16(half %x) nounwind {
-; GFX7SELDAG-LABEL: poszero_f16:
-; GFX7SELDAG: ; %bb.0:
-; GFX7SELDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7SELDAG-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX7SELDAG-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX7SELDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
-; GFX7SELDAG-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX7GLISEL-LABEL: poszero_f16:
-; GFX7GLISEL: ; %bb.0:
-; GFX7GLISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7GLISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX7GLISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX7GLISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
-; GFX7GLISEL-NEXT: s_setpc_b64 s[30:31]
+; GFX7CHECK-LABEL: poszero_f16:
+; GFX7CHECK: ; %bb.0:
+; GFX7CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7CHECK-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX7CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX7CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX7CHECK-NEXT: s_setpc_b64 s[30:31]
;
; GFX8CHECK-LABEL: poszero_f16:
; GFX8CHECK: ; %bb.0:
@@ -973,7 +959,7 @@ define i1 @negzero_f16(half %x) nounwind {
; GFX7SELDAG-LABEL: negzero_f16:
; GFX7SELDAG: ; %bb.0:
; GFX7SELDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7SELDAG-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX7SELDAG-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX7SELDAG-NEXT: s_mov_b32 s4, 0x8000
; GFX7SELDAG-NEXT: v_cmp_eq_u32_e32 vcc, s4, v0
; GFX7SELDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
@@ -1046,7 +1032,7 @@ define i1 @posfinite_f16(half %x) nounwind {
; GFX7SELDAG-LABEL: posfinite_f16:
; GFX7SELDAG: ; %bb.0:
; GFX7SELDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7SELDAG-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX7SELDAG-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX7SELDAG-NEXT: s_movk_i32 s4, 0x7c00
; GFX7SELDAG-NEXT: v_cmp_gt_u32_e32 vcc, s4, v0
; GFX7SELDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
@@ -1121,10 +1107,9 @@ define i1 @negfinite_f16(half %x) nounwind {
; GFX7SELDAG-LABEL: negfinite_f16:
; GFX7SELDAG: ; %bb.0:
; GFX7SELDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7SELDAG-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX7SELDAG-NEXT: s_movk_i32 s4, 0x7c00
; GFX7SELDAG-NEXT: v_bfe_i32 v1, v0, 0, 16
; GFX7SELDAG-NEXT: v_and_b32_e32 v0, 0x7fff, v0
+; GFX7SELDAG-NEXT: s_movk_i32 s4, 0x7c00
; GFX7SELDAG-NEXT: v_cmp_gt_i32_e32 vcc, 0, v1
; GFX7SELDAG-NEXT: v_cmp_gt_i32_e64 s[4:5], s4, v0
; GFX7SELDAG-NEXT: s_and_b64 s[4:5], s[4:5], vcc
@@ -1202,9 +1187,8 @@ define i1 @isnan_f16(half %x) nounwind {
; GFX7SELDAG-LABEL: isnan_f16:
; GFX7SELDAG: ; %bb.0:
; GFX7SELDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7SELDAG-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX7SELDAG-NEXT: s_movk_i32 s4, 0x7c00
; GFX7SELDAG-NEXT: v_and_b32_e32 v0, 0x7fff, v0
+; GFX7SELDAG-NEXT: s_movk_i32 s4, 0x7c00
; GFX7SELDAG-NEXT: v_cmp_lt_i32_e32 vcc, s4, v0
; GFX7SELDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
; GFX7SELDAG-NEXT: s_setpc_b64 s[30:31]
@@ -1277,9 +1261,8 @@ define i1 @not_isnan_f16(half %x) {
; GFX7SELDAG-LABEL: not_isnan_f16:
; GFX7SELDAG: ; %bb.0:
; GFX7SELDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7SELDAG-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX7SELDAG-NEXT: s_movk_i32 s4, 0x7c01
; GFX7SELDAG-NEXT: v_and_b32_e32 v0, 0x7fff, v0
+; GFX7SELDAG-NEXT: s_movk_i32 s4, 0x7c01
; GFX7SELDAG-NEXT: v_cmp_gt_i32_e32 vcc, s4, v0
; GFX7SELDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
; GFX7SELDAG-NEXT: s_setpc_b64 s[30:31]
@@ -1859,12 +1842,8 @@ define i1 @isnan_f16_strictfp(half %x) strictfp nounwind {
; GFX7SELDAG-LABEL: isnan_f16_strictfp:
; GFX7SELDAG: ; %bb.0:
; GFX7SELDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7SELDAG-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX7SELDAG-NEXT: s_movk_i32 s4, 0x7c00
-; GFX7SELDAG-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX7SELDAG-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX7SELDAG-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX7SELDAG-NEXT: v_and_b32_e32 v0, 0x7fff, v0
+; GFX7SELDAG-NEXT: s_movk_i32 s4, 0x7c00
; GFX7SELDAG-NEXT: v_cmp_lt_i32_e32 vcc, s4, v0
; GFX7SELDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
; GFX7SELDAG-NEXT: s_setpc_b64 s[30:31]
@@ -1937,9 +1916,8 @@ define i1 @isinf_f16(half %x) nounwind {
; GFX7SELDAG-LABEL: isinf_f16:
; GFX7SELDAG: ; %bb.0:
; GFX7SELDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7SELDAG-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX7SELDAG-NEXT: s_movk_i32 s4, 0x7c00
; GFX7SELDAG-NEXT: v_and_b32_e32 v0, 0x7fff, v0
+; GFX7SELDAG-NEXT: s_movk_i32 s4, 0x7c00
; GFX7SELDAG-NEXT: v_cmp_eq_u32_e32 vcc, s4, v0
; GFX7SELDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
; GFX7SELDAG-NEXT: s_setpc_b64 s[30:31]
@@ -2014,9 +1992,8 @@ define i1 @isfinite_f16(half %x) nounwind {
; GFX7SELDAG-LABEL: isfinite_f16:
; GFX7SELDAG: ; %bb.0:
; GFX7SELDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7SELDAG-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX7SELDAG-NEXT: s_movk_i32 s4, 0x7c00
; GFX7SELDAG-NEXT: v_and_b32_e32 v0, 0x7fff, v0
+; GFX7SELDAG-NEXT: s_movk_i32 s4, 0x7c00
; GFX7SELDAG-NEXT: v_cmp_gt_i32_e32 vcc, s4, v0
; GFX7SELDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
; GFX7SELDAG-NEXT: s_setpc_b64 s[30:31]
@@ -2091,7 +2068,6 @@ define i1 @issubnormal_or_zero_f16(half %x) {
; GFX7SELDAG-LABEL: issubnormal_or_zero_f16:
; GFX7SELDAG: ; %bb.0: ; %entry
; GFX7SELDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7SELDAG-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX7SELDAG-NEXT: v_and_b32_e32 v0, 0x7c00, v0
; GFX7SELDAG-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX7SELDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
@@ -2167,7 +2143,6 @@ define i1 @not_issubnormal_or_zero_f16(half %x) {
; GFX7SELDAG-LABEL: not_issubnormal_or_zero_f16:
; GFX7SELDAG: ; %bb.0: ; %entry
; GFX7SELDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7SELDAG-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX7SELDAG-NEXT: v_and_b32_e32 v0, 0x7c00, v0
; GFX7SELDAG-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
; GFX7SELDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
@@ -2249,11 +2224,10 @@ define i1 @isnormal_f16(half %x) {
; GFX7SELDAG-LABEL: isnormal_f16:
; GFX7SELDAG: ; %bb.0:
; GFX7SELDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7SELDAG-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX7SELDAG-NEXT: s_movk_i32 s4, 0x7800
; GFX7SELDAG-NEXT: v_and_b32_e32 v0, 0x7fff, v0
; GFX7SELDAG-NEXT: v_add_i32_e32 v0, vcc, 0xfffffc00, v0
; GFX7SELDAG-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX7SELDAG-NEXT: s_movk_i32 s4, 0x7800
; GFX7SELDAG-NEXT: v_cmp_gt_u32_e32 vcc, s4, v0
; GFX7SELDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
; GFX7SELDAG-NEXT: s_setpc_b64 s[30:31]
@@ -2329,11 +2303,10 @@ define i1 @not_isnormal_f16(half %x) {
; GFX7SELDAG-LABEL: not_isnormal_f16:
; GFX7SELDAG: ; %bb.0:
; GFX7SELDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7SELDAG-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX7SELDAG-NEXT: s_movk_i32 s4, 0x77ff
; GFX7SELDAG-NEXT: v_and_b32_e32 v0, 0x7fff, v0
; GFX7SELDAG-NEXT: v_add_i32_e32 v0, vcc, 0xfffffc00, v0
; GFX7SELDAG-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX7SELDAG-NEXT: s_movk_i32 s4, 0x77ff
; GFX7SELDAG-NEXT: v_cmp_lt_u32_e32 vcc, s4, v0
; GFX7SELDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
; GFX7SELDAG-NEXT: s_setpc_b64 s[30:31]
@@ -2414,12 +2387,11 @@ define i1 @not_is_plus_normal_f16(half %x) {
; GFX7SELDAG-LABEL: not_is_plus_normal_f16:
; GFX7SELDAG: ; %bb.0:
; GFX7SELDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7SELDAG-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX7SELDAG-NEXT: s_movk_i32 s6, 0x77ff
; GFX7SELDAG-NEXT: v_bfe_i32 v1, v0, 0, 16
; GFX7SELDAG-NEXT: v_and_b32_e32 v0, 0x7fff, v0
; GFX7SELDAG-NEXT: v_add_i32_e32 v0, vcc, 0xfffffc00, v0
; GFX7SELDAG-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX7SELDAG-NEXT: s_movk_i32 s6, 0x77ff
; GFX7SELDAG-NEXT: v_cmp_gt_i32_e64 s[4:5], 0, v1
; GFX7SELDAG-NEXT: v_cmp_lt_u32_e32 vcc, s6, v0
; GFX7SELDAG-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
@@ -2510,12 +2482,11 @@ define i1 @not_is_neg_normal_f16(half %x) {
; GFX7SELDAG-LABEL: not_is_neg_normal_f16:
; GFX7SELDAG: ; %bb.0:
; GFX7SELDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7SELDAG-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX7SELDAG-NEXT: s_movk_i32 s6, 0x77ff
; GFX7SELDAG-NEXT: v_bfe_i32 v1, v0, 0, 16
; GFX7SELDAG-NEXT: v_and_b32_e32 v0, 0x7fff, v0
; GFX7SELDAG-NEXT: v_add_i32_e32 v0, vcc, 0xfffffc00, v0
; GFX7SELDAG-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX7SELDAG-NEXT: s_movk_i32 s6, 0x77ff
; GFX7SELDAG-NEXT: v_cmp_lt_i32_e64 s[4:5], -1, v1
; GFX7SELDAG-NEXT: v_cmp_lt_u32_e32 vcc, s6, v0
; GFX7SELDAG-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
@@ -2606,10 +2577,9 @@ define i1 @issubnormal_f16(half %x) {
; GFX7SELDAG-LABEL: issubnormal_f16:
; GFX7SELDAG: ; %bb.0:
; GFX7SELDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7SELDAG-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX7SELDAG-NEXT: s_movk_i32 s4, 0x3ff
; GFX7SELDAG-NEXT: v_and_b32_e32 v0, 0x7fff, v0
; GFX7SELDAG-NEXT: v_add_i32_e32 v0, vcc, -1, v0
+; GFX7SELDAG-NEXT: s_movk_i32 s4, 0x3ff
; GFX7SELDAG-NEXT: v_cmp_gt_u32_e32 vcc, s4, v0
; GFX7SELDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
; GFX7SELDAG-NEXT: s_setpc_b64 s[30:31]
@@ -2685,10 +2655,9 @@ define i1 @not_issubnormal_f16(half %x) {
; GFX7SELDAG-LABEL: not_issubnormal_f16:
; GFX7SELDAG: ; %bb.0:
; GFX7SELDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7SELDAG-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX7SELDAG-NEXT: s_movk_i32 s4, 0x3fe
; GFX7SELDAG-NEXT: v_and_b32_e32 v0, 0x7fff, v0
; GFX7SELDAG-NEXT: v_add_i32_e32 v0, vcc, -1, v0
+; GFX7SELDAG-NEXT: s_movk_i32 s4, 0x3fe
; GFX7SELDAG-NEXT: v_cmp_lt_u32_e32 vcc, s4, v0
; GFX7SELDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
; GFX7SELDAG-NEXT: s_setpc_b64 s[30:31]
@@ -2772,7 +2741,6 @@ define i1 @iszero_f16(half %x) {
; GFX7SELDAG-LABEL: iszero_f16:
; GFX7SELDAG: ; %bb.0:
; GFX7SELDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7SELDAG-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX7SELDAG-NEXT: v_and_b32_e32 v0, 0x7fff, v0
; GFX7SELDAG-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX7SELDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
@@ -2847,7 +2815,6 @@ define i1 @not_iszero_f16(half %x) {
; GFX7SELDAG-LABEL: not_iszero_f16:
; GFX7SELDAG: ; %bb.0:
; GFX7SELDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7SELDAG-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX7SELDAG-NEXT: v_and_b32_e32 v0, 0x7fff, v0
; GFX7SELDAG-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
; GFX7SELDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
@@ -2935,7 +2902,7 @@ define i1 @ispositive_f16(half %x) {
; GFX7SELDAG-LABEL: ispositive_f16:
; GFX7SELDAG: ; %bb.0:
; GFX7SELDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7SELDAG-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX7SELDAG-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX7SELDAG-NEXT: s_movk_i32 s4, 0x7c01
; GFX7SELDAG-NEXT: v_cmp_gt_u32_e32 vcc, s4, v0
; GFX7SELDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
@@ -3010,17 +2977,17 @@ define i1 @not_ispositive_f16(half %x) {
; GFX7SELDAG-LABEL: not_ispositive_f16:
; GFX7SELDAG: ; %bb.0:
; GFX7SELDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7SELDAG-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX7SELDAG-NEXT: v_and_b32_e32 v1, 0xffff, v0
+; GFX7SELDAG-NEXT: v_bfe_i32 v2, v0, 0, 16
+; GFX7SELDAG-NEXT: v_and_b32_e32 v0, 0x7fff, v0
; GFX7SELDAG-NEXT: s_movk_i32 s6, 0x7c00
+; GFX7SELDAG-NEXT: v_cmp_gt_i32_e32 vcc, 0, v2
+; GFX7SELDAG-NEXT: v_cmp_gt_i32_e64 s[4:5], s6, v0
; GFX7SELDAG-NEXT: s_mov_b32 s7, 0xfc00
-; GFX7SELDAG-NEXT: v_bfe_i32 v1, v0, 0, 16
-; GFX7SELDAG-NEXT: v_and_b32_e32 v2, 0x7fff, v0
-; GFX7SELDAG-NEXT: v_cmp_gt_i32_e32 vcc, 0, v1
-; GFX7SELDAG-NEXT: v_cmp_gt_i32_e64 s[4:5], s6, v2
; GFX7SELDAG-NEXT: s_and_b64 s[4:5], s[4:5], vcc
-; GFX7SELDAG-NEXT: v_cmp_eq_u32_e32 vcc, s7, v0
+; GFX7SELDAG-NEXT: v_cmp_eq_u32_e32 vcc, s7, v1
; GFX7SELDAG-NEXT: s_or_b64 s[4:5], s[4:5], vcc
-; GFX7SELDAG-NEXT: v_cmp_lt_i32_e32 vcc, s6, v2
+; GFX7SELDAG-NEXT: v_cmp_lt_i32_e32 vcc, s6, v0
; GFX7SELDAG-NEXT: s_or_b64 s[4:5], s[4:5], vcc
; GFX7SELDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
; GFX7SELDAG-NEXT: s_setpc_b64 s[30:31]
@@ -3101,15 +3068,15 @@ define i1 @isnegative_f16(half %x) {
; GFX7SELDAG-LABEL: isnegative_f16:
; GFX7SELDAG: ; %bb.0:
; GFX7SELDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7SELDAG-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX7SELDAG-NEXT: v_and_b32_e32 v1, 0xffff, v0
+; GFX7SELDAG-NEXT: v_bfe_i32 v2, v0, 0, 16
+; GFX7SELDAG-NEXT: v_and_b32_e32 v0, 0x7fff, v0
; GFX7SELDAG-NEXT: s_movk_i32 s4, 0x7c00
+; GFX7SELDAG-NEXT: v_cmp_gt_i32_e32 vcc, 0, v2
+; GFX7SELDAG-NEXT: v_cmp_gt_i32_e64 s[4:5], s4, v0
; GFX7SELDAG-NEXT: s_mov_b32 s6, 0xfc00
-; GFX7SELDAG-NEXT: v_bfe_i32 v1, v0, 0, 16
-; GFX7SELDAG-NEXT: v_and_b32_e32 v2, 0x7fff, v0
-; GFX7SELDAG-NEXT: v_cmp_gt_i32_e32 vcc, 0, v1
-; GFX7SELDAG-NEXT: v_cmp_gt_i32_e64 s[4:5], s4, v2
; GFX7SELDAG-NEXT: s_and_b64 s[4:5], s[4:5], vcc
-; GFX7SELDAG-NEXT: v_cmp_eq_u32_e32 vcc, s6, v0
+; GFX7SELDAG-NEXT: v_cmp_eq_u32_e32 vcc, s6, v1
; GFX7SELDAG-NEXT: s_or_b64 s[4:5], s[4:5], vcc
; GFX7SELDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
; GFX7SELDAG-NEXT: s_setpc_b64 s[30:31]
@@ -3188,13 +3155,13 @@ define i1 @not_isnegative_f16(half %x) {
; GFX7SELDAG-LABEL: not_isnegative_f16:
; GFX7SELDAG: ; %bb.0:
; GFX7SELDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7SELDAG-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX7SELDAG-NEXT: s_movk_i32 s4, 0x7c01
-; GFX7SELDAG-NEXT: s_movk_i32 s5, 0x7c00
-; GFX7SELDAG-NEXT: v_cmp_gt_u32_e32 vcc, s4, v0
+; GFX7SELDAG-NEXT: v_and_b32_e32 v1, 0xffff, v0
; GFX7SELDAG-NEXT: v_and_b32_e32 v0, 0x7fff, v0
-; GFX7SELDAG-NEXT: v_cmp_lt_i32_e64 s[4:5], s5, v0
-; GFX7SELDAG-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX7SELDAG-NEXT: s_movk_i32 s4, 0x7c00
+; GFX7SELDAG-NEXT: v_cmp_lt_i32_e32 vcc, s4, v0
+; GFX7SELDAG-NEXT: s_movk_i32 s4, 0x7c01
+; GFX7SELDAG-NEXT: v_cmp_gt_u32_e64 s[4:5], s4, v1
+; GFX7SELDAG-NEXT: s_or_b64 s[4:5], s[4:5], vcc
; GFX7SELDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
; GFX7SELDAG-NEXT: s_setpc_b64 s[30:31]
;
@@ -3272,9 +3239,8 @@ define i1 @iszero_or_nan_f16(half %x) {
; GFX7SELDAG-LABEL: iszero_or_nan_f16:
; GFX7SELDAG: ; %bb.0: ; %entry
; GFX7SELDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7SELDAG-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX7SELDAG-NEXT: s_movk_i32 s4, 0x7c00
; GFX7SELDAG-NEXT: v_and_b32_e32 v0, 0x7fff, v0
+; GFX7SELDAG-NEXT: s_movk_i32 s4, 0x7c00
; GFX7SELDAG-NEXT: v_cmp_lt_i32_e32 vcc, s4, v0
; GFX7SELDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v0
; GFX7SELDAG-NEXT: s_or_b64 s[4:5], s[4:5], vcc
@@ -3353,9 +3319,8 @@ define i1 @iszero_or_nan_f_daz(half %x) #0 {
; GFX7SELDAG-LABEL: iszero_or_nan_f_daz:
; GFX7SELDAG: ; %bb.0: ; %entry
; GFX7SELDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7SELDAG-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX7SELDAG-NEXT: s_movk_i32 s4, 0x7c00
; GFX7SELDAG-NEXT: v_and_b32_e32 v0, 0x7fff, v0
+; GFX7SELDAG-NEXT: s_movk_i32 s4, 0x7c00
; GFX7SELDAG-NEXT: v_cmp_lt_i32_e32 vcc, s4, v0
; GFX7SELDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v0
; GFX7SELDAG-NEXT: s_or_b64 s[4:5], s[4:5], vcc
@@ -3434,9 +3399,8 @@ define i1 @iszero_or_nan_f_maybe_daz(half %x) #1 {
; GFX7SELDAG-LABEL: iszero_or_nan_f_maybe_daz:
; GFX7SELDAG: ; %bb.0: ; %entry
; GFX7SELDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7SELDAG-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX7SELDAG-NEXT: s_movk_i32 s4, 0x7c00
; GFX7SELDAG-NEXT: v_and_b32_e32 v0, 0x7fff, v0
+; GFX7SELDAG-NEXT: s_movk_i32 s4, 0x7c00
; GFX7SELDAG-NEXT: v_cmp_lt_i32_e32 vcc, s4, v0
; GFX7SELDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v0
; GFX7SELDAG-NEXT: s_or_b64 s[4:5], s[4:5], vcc
@@ -3515,9 +3479,8 @@ define i1 @not_iszero_or_nan_f16(half %x) {
; GFX7SELDAG-LABEL: not_iszero_or_nan_f16:
; GFX7SELDAG: ; %bb.0: ; %entry
; GFX7SELDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7SELDAG-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX7SELDAG-NEXT: s_movk_i32 s4, 0x7c01
; GFX7SELDAG-NEXT: v_and_b32_e32 v0, 0x7fff, v0
+; GFX7SELDAG-NEXT: s_movk_i32 s4, 0x7c01
; GFX7SELDAG-NEXT: v_cmp_gt_i32_e32 vcc, s4, v0
; GFX7SELDAG-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v0
; GFX7SELDAG-NEXT: s_and_b64 s[4:5], s[4:5], vcc
@@ -3605,9 +3568,8 @@ define i1 @not_iszero_or_nan_f_daz(half %x) #0 {
; GFX7SELDAG-LABEL: not_iszero_or_nan_f_daz:
; GFX7SELDAG: ; %bb.0: ; %entry
; GFX7SELDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7SELDAG-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX7SELDAG-NEXT: s_movk_i32 s4, 0x7c01
; GFX7SELDAG-NEXT: v_and_b32_e32 v0, 0x7fff, v0
+; GFX7SELDAG-NEXT: s_movk_i32 s4, 0x7c01
; GFX7SELDAG-NEXT: v_cmp_gt_i32_e32 vcc, s4, v0
; GFX7SELDAG-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v0
; GFX7SELDAG-NEXT: s_and_b64 s[4:5], s[4:5], vcc
@@ -3695,9 +3657,8 @@ define i1 @not_iszero_or_nan_f_maybe_daz(half %x) #1 {
; GFX7SELDAG-LABEL: not_iszero_or_nan_f_maybe_daz:
; GFX7SELDAG: ; %bb.0: ; %entry
; GFX7SELDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7SELDAG-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX7SELDAG-NEXT: s_movk_i32 s4, 0x7c01
; GFX7SELDAG-NEXT: v_and_b32_e32 v0, 0x7fff, v0
+; GFX7SELDAG-NEXT: s_movk_i32 s4, 0x7c01
; GFX7SELDAG-NEXT: v_cmp_gt_i32_e32 vcc, s4, v0
; GFX7SELDAG-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v0
; GFX7SELDAG-NEXT: s_and_b64 s[4:5], s[4:5], vcc
@@ -3785,9 +3746,8 @@ define i1 @iszero_or_qnan_f16(half %x) {
; GFX7SELDAG-LABEL: iszero_or_qnan_f16:
; GFX7SELDAG: ; %bb.0: ; %entry
; GFX7SELDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7SELDAG-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX7SELDAG-NEXT: s_movk_i32 s4, 0x7dff
; GFX7SELDAG-NEXT: v_and_b32_e32 v0, 0x7fff, v0
+; GFX7SELDAG-NEXT: s_movk_i32 s4, 0x7dff
; GFX7SELDAG-NEXT: v_cmp_lt_i32_e32 vcc, s4, v0
; GFX7SELDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v0
; GFX7SELDAG-NEXT: s_or_b64 s[4:5], s[4:5], vcc
@@ -3866,12 +3826,11 @@ define i1 @iszero_or_snan_f16(half %x) {
; GFX7SELDAG-LABEL: iszero_or_snan_f16:
; GFX7SELDAG: ; %bb.0: ; %entry
; GFX7SELDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7SELDAG-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX7SELDAG-NEXT: s_movk_i32 s4, 0x7e00
-; GFX7SELDAG-NEXT: s_movk_i32 s5, 0x7c00
; GFX7SELDAG-NEXT: v_and_b32_e32 v0, 0x7fff, v0
+; GFX7SELDAG-NEXT: s_movk_i32 s4, 0x7e00
; GFX7SELDAG-NEXT: v_cmp_gt_i32_e32 vcc, s4, v0
-; GFX7SELDAG-NEXT: v_cmp_lt_i32_e64 s[4:5], s5, v0
+; GFX7SELDAG-NEXT: s_movk_i32 s4, 0x7c00
+; GFX7SELDAG-NEXT: v_cmp_lt_i32_e64 s[4:5], s4, v0
; GFX7SELDAG-NEXT: s_and_b64 s[4:5], s[4:5], vcc
; GFX7SELDAG-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX7SELDAG-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
@@ -3952,10 +3911,9 @@ define i1 @not_iszero_or_qnan_f16(half %x) {
; GFX7SELDAG-LABEL: not_iszero_or_qnan_f16:
; GFX7SELDAG: ; %bb.0: ; %entry
; GFX7SELDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7SELDAG-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX7SELDAG-NEXT: v_and_b32_e32 v0, 0x7fff, v0
; GFX7SELDAG-NEXT: s_movk_i32 s4, 0x7e00
; GFX7SELDAG-NEXT: s_movk_i32 s8, 0x7c00
-; GFX7SELDAG-NEXT: v_and_b32_e32 v0, 0x7fff, v0
; GFX7SELDAG-NEXT: v_cmp_gt_i32_e32 vcc, s4, v0
; GFX7SELDAG-NEXT: v_cmp_lt_i32_e64 s[4:5], s8, v0
; GFX7SELDAG-NEXT: s_and_b64 s[6:7], s[4:5], vcc
@@ -4058,14 +4016,13 @@ define i1 @not_iszero_or_snan_f16(half %x) {
; GFX7SELDAG-LABEL: not_iszero_or_snan_f16:
; GFX7SELDAG: ; %bb.0: ; %entry
; GFX7SELDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7SELDAG-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX7SELDAG-NEXT: s_movk_i32 s4, 0x7c00
-; GFX7SELDAG-NEXT: s_movk_i32 s6, 0x7dff
; GFX7SELDAG-NEXT: v_and_b32_e32 v0, 0x7fff, v0
+; GFX7SELDAG-NEXT: s_movk_i32 s4, 0x7c00
; GFX7SELDAG-NEXT: v_cmp_eq_u32_e32 vcc, s4, v0
; GFX7SELDAG-NEXT: v_add_i32_e64 v1, s[4:5], -1, v0
; GFX7SELDAG-NEXT: s_movk_i32 s4, 0x3ff
; GFX7SELDAG-NEXT: v_cmp_gt_u32_e64 s[4:5], s4, v1
+; GFX7SELDAG-NEXT: s_movk_i32 s6, 0x7dff
; GFX7SELDAG-NEXT: s_or_b64 s[4:5], s[4:5], vcc
; GFX7SELDAG-NEXT: v_cmp_lt_i32_e32 vcc, s6, v0
; GFX7SELDAG-NEXT: s_or_b64 s[4:5], s[4:5], vcc
@@ -4161,9 +4118,8 @@ define i1 @isinf_or_nan_f16(half %x) {
; GFX7SELDAG-LABEL: isinf_or_nan_f16:
; GFX7SELDAG: ; %bb.0: ; %entry
; GFX7SELDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7SELDAG-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX7SELDAG-NEXT: s_movk_i32 s4, 0x7bff
; GFX7SELDAG-NEXT: v_and_b32_e32 v0, 0x7fff, v0
+; GFX7SELDAG-NEXT: s_movk_i32 s4, 0x7bff
; GFX7SELDAG-NEXT: v_cmp_lt_i32_e32 vcc, s4, v0
; GFX7SELDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
; GFX7SELDAG-NEXT: s_setpc_b64 s[30:31]
@@ -4239,9 +4195,8 @@ define i1 @not_isinf_or_nan_f16(half %x) {
; GFX7SELDAG-LABEL: not_isinf_or_nan_f16:
; GFX7SELDAG: ; %bb.0: ; %entry
; GFX7SELDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7SELDAG-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX7SELDAG-NEXT: s_movk_i32 s4, 0x7c00
; GFX7SELDAG-NEXT: v_and_b32_e32 v0, 0x7fff, v0
+; GFX7SELDAG-NEXT: s_movk_i32 s4, 0x7c00
; GFX7SELDAG-NEXT: v_cmp_gt_i32_e32 vcc, s4, v0
; GFX7SELDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
; GFX7SELDAG-NEXT: s_setpc_b64 s[30:31]
@@ -4317,9 +4272,8 @@ define i1 @isfinite_or_nan_f(half %x) {
; GFX7SELDAG-LABEL: isfinite_or_nan_f:
; GFX7SELDAG: ; %bb.0: ; %entry
; GFX7SELDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7SELDAG-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX7SELDAG-NEXT: s_movk_i32 s4, 0x7c00
; GFX7SELDAG-NEXT: v_and_b32_e32 v0, 0x7fff, v0
+; GFX7SELDAG-NEXT: s_movk_i32 s4, 0x7c00
; GFX7SELDAG-NEXT: v_cmp_ne_u32_e32 vcc, s4, v0
; GFX7SELDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
; GFX7SELDAG-NEXT: s_setpc_b64 s[30:31]
@@ -4395,9 +4349,8 @@ define i1 @not_isfinite_or_nan_f(half %x) {
; GFX7SELDAG-LABEL: not_isfinite_or_nan_f:
; GFX7SELDAG: ; %bb.0: ; %entry
; GFX7SELDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7SELDAG-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX7SELDAG-NEXT: s_movk_i32 s4, 0x7c00
; GFX7SELDAG-NEXT: v_and_b32_e32 v0, 0x7fff, v0
+; GFX7SELDAG-NEXT: s_movk_i32 s4, 0x7c00
; GFX7SELDAG-NEXT: v_cmp_eq_u32_e32 vcc, s4, v0
; GFX7SELDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
; GFX7SELDAG-NEXT: s_setpc_b64 s[30:31]
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.ldexp.ll b/llvm/test/CodeGen/AMDGPU/llvm.ldexp.ll
index 749600b4a99f7..8c4d4788c4bdf 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.ldexp.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.ldexp.ll
@@ -192,14 +192,14 @@ define <2 x double> @test_ldexp_v2f64_v2i32(<2 x double> %a, <2 x i32> %b) {
; }
define half @test_ldexp_f16_i8(half %a, i8 %b) {
-; GFX6-SDAG-LABEL: test_ldexp_f16_i8:
-; GFX6-SDAG: ; %bb.0:
-; GFX6-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX6-SDAG-NEXT: v_bfe_i32 v1, v1, 0, 8
-; GFX6-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX6-SDAG-NEXT: v_ldexp_f32_e32 v0, v0, v1
-; GFX6-SDAG-NEXT: s_setpc_b64 s[30:31]
+; GFX6-LABEL: test_ldexp_f16_i8:
+; GFX6: ; %bb.0:
+; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX6-NEXT: v_bfe_i32 v1, v1, 0, 8
+; GFX6-NEXT: v_ldexp_f32_e32 v0, v0, v1
+; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX6-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-SDAG-LABEL: test_ldexp_f16_i8:
; GFX8-SDAG: ; %bb.0:
@@ -229,15 +229,6 @@ define half @test_ldexp_f16_i8(half %a, i8 %b) {
; GFX11-SDAG-FAKE16-NEXT: v_ldexp_f16_e32 v0, v0, v1
; GFX11-SDAG-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
-; GFX6-GISEL-LABEL: test_ldexp_f16_i8:
-; GFX6-GISEL: ; %bb.0:
-; GFX6-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX6-GISEL-NEXT: v_bfe_i32 v1, v1, 0, 8
-; GFX6-GISEL-NEXT: v_ldexp_f32_e32 v0, v0, v1
-; GFX6-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX6-GISEL-NEXT: s_setpc_b64 s[30:31]
-;
; GFX8-GISEL-LABEL: test_ldexp_f16_i8:
; GFX8-GISEL: ; %bb.0:
; GFX8-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -282,14 +273,14 @@ define half @test_ldexp_f16_i8(half %a, i8 %b) {
}
define half @test_ldexp_f16_i16(half %a, i16 %b) {
-; GFX6-SDAG-LABEL: test_ldexp_f16_i16:
-; GFX6-SDAG: ; %bb.0:
-; GFX6-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX6-SDAG-NEXT: v_bfe_i32 v1, v1, 0, 16
-; GFX6-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX6-SDAG-NEXT: v_ldexp_f32_e32 v0, v0, v1
-; GFX6-SDAG-NEXT: s_setpc_b64 s[30:31]
+; GFX6-LABEL: test_ldexp_f16_i16:
+; GFX6: ; %bb.0:
+; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX6-NEXT: v_bfe_i32 v1, v1, 0, 16
+; GFX6-NEXT: v_ldexp_f32_e32 v0, v0, v1
+; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX6-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: test_ldexp_f16_i16:
; GFX8: ; %bb.0:
@@ -315,15 +306,6 @@ define half @test_ldexp_f16_i16(half %a, i16 %b) {
; GFX11-SDAG-FAKE16-NEXT: v_ldexp_f16_e32 v0, v0, v1
; GFX11-SDAG-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
-; GFX6-GISEL-LABEL: test_ldexp_f16_i16:
-; GFX6-GISEL: ; %bb.0:
-; GFX6-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX6-GISEL-NEXT: v_bfe_i32 v1, v1, 0, 16
-; GFX6-GISEL-NEXT: v_ldexp_f32_e32 v0, v0, v1
-; GFX6-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX6-GISEL-NEXT: s_setpc_b64 s[30:31]
-;
; GFX11-GISEL-TRUE16-LABEL: test_ldexp_f16_i16:
; GFX11-GISEL-TRUE16: ; %bb.0:
; GFX11-GISEL-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -340,13 +322,13 @@ define half @test_ldexp_f16_i16(half %a, i16 %b) {
}
define half @test_ldexp_f16_i32(half %a, i32 %b) {
-; GFX6-SDAG-LABEL: test_ldexp_f16_i32:
-; GFX6-SDAG: ; %bb.0:
-; GFX6-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX6-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX6-SDAG-NEXT: v_ldexp_f32_e32 v0, v0, v1
-; GFX6-SDAG-NEXT: s_setpc_b64 s[30:31]
+; GFX6-LABEL: test_ldexp_f16_i32:
+; GFX6: ; %bb.0:
+; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX6-NEXT: v_ldexp_f32_e32 v0, v0, v1
+; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX6-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-SDAG-LABEL: test_ldexp_f16_i32:
; GFX8-SDAG: ; %bb.0:
@@ -384,14 +366,6 @@ define half @test_ldexp_f16_i32(half %a, i32 %b) {
; GFX11-SDAG-FAKE16-NEXT: v_ldexp_f16_e32 v0, v0, v1
; GFX11-SDAG-FAKE16-NEXT: s_setpc_b64 s[30:31]
;
-; GFX6-GISEL-LABEL: test_ldexp_f16_i32:
-; GFX6-GISEL: ; %bb.0:
-; GFX6-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX6-GISEL-NEXT: v_ldexp_f32_e32 v0, v0, v1
-; GFX6-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX6-GISEL-NEXT: s_setpc_b64 s[30:31]
-;
; GFX8-GISEL-LABEL: test_ldexp_f16_i32:
; GFX8-GISEL: ; %bb.0:
; GFX8-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1313,13 +1287,13 @@ define <4 x half> @test_ldexp_v4f16_v4i16(<4 x half> %a, <4 x i16> %b) {
}
define amdgpu_ps half @test_ldexp_f16_i16_uniform(half inreg %a, i16 inreg %b) {
-; GFX6-SDAG-LABEL: test_ldexp_f16_i16_uniform:
-; GFX6-SDAG: ; %bb.0:
-; GFX6-SDAG-NEXT: v_cvt_f16_f32_e32 v0, s0
-; GFX6-SDAG-NEXT: s_sext_i32_i16 s0, s1
-; GFX6-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX6-SDAG-NEXT: v_ldexp_f32_e64 v0, v0, s0
-; GFX6-SDAG-NEXT: ; return to shader part epilog
+; GFX6-LABEL: test_ldexp_f16_i16_uniform:
+; GFX6: ; %bb.0:
+; GFX6-NEXT: v_cvt_f32_f16_e32 v0, s0
+; GFX6-NEXT: s_sext_i32_i16 s0, s1
+; GFX6-NEXT: v_ldexp_f32_e64 v0, v0, s0
+; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX6-NEXT: ; return to shader part epilog
;
; GFX8-LABEL: test_ldexp_f16_i16_uniform:
; GFX8: ; %bb.0:
@@ -1343,14 +1317,6 @@ define amdgpu_ps half @test_ldexp_f16_i16_uniform(half inreg %a, i16 inreg %b) {
; GFX11-SDAG-FAKE16-NEXT: v_ldexp_f16_e64 v0, s0, s1
; GFX11-SDAG-FAKE16-NEXT: ; return to shader part epilog
;
-; GFX6-GISEL-LABEL: test_ldexp_f16_i16_uniform:
-; GFX6-GISEL: ; %bb.0:
-; GFX6-GISEL-NEXT: v_cvt_f32_f16_e32 v0, s0
-; GFX6-GISEL-NEXT: s_sext_i32_i16 s0, s1
-; GFX6-GISEL-NEXT: v_ldexp_f32_e64 v0, v0, s0
-; GFX6-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX6-GISEL-NEXT: ; return to shader part epilog
-;
; GFX11-GISEL-TRUE16-LABEL: test_ldexp_f16_i16_uniform:
; GFX11-GISEL-TRUE16: ; %bb.0:
; GFX11-GISEL-TRUE16-NEXT: v_ldexp_f16_e64 v0.l, s0, s1
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.log.ll b/llvm/test/CodeGen/AMDGPU/llvm.log.ll
index 59c1c2facb5c9..4e8ffdcb00310 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.log.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.log.ll
@@ -6110,6 +6110,7 @@ define float @v_log_f32_from_fpext_bf16(bfloat %src) {
; SI-LABEL: v_log_f32_from_fpext_bf16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; SI-NEXT: s_mov_b32 s4, 0x800000
; SI-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0
; SI-NEXT: v_cndmask_b32_e64 v1, 0, 32, vcc
@@ -6287,25 +6288,14 @@ define float @v_log_f32_from_fpext_bf16(bfloat %src) {
}
define half @v_log_f16(half %in) {
-; SI-SDAG-LABEL: v_log_f16:
-; SI-SDAG: ; %bb.0:
-; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0
-; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0
-; SI-SDAG-NEXT: v_log_f32_e32 v0, v0
-; SI-SDAG-NEXT: v_mul_f32_e32 v0, 0x3f317218, v0
-; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0
-; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0
-; SI-SDAG-NEXT: s_setpc_b64 s[30:31]
-;
-; SI-GISEL-LABEL: v_log_f16:
-; SI-GISEL: ; %bb.0:
-; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0
-; SI-GISEL-NEXT: v_log_f32_e32 v0, v0
-; SI-GISEL-NEXT: v_mul_f32_e32 v0, 0x3f317218, v0
-; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0
-; SI-GISEL-NEXT: s_setpc_b64 s[30:31]
+; SI-LABEL: v_log_f16:
+; SI: ; %bb.0:
+; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
+; SI-NEXT: v_log_f32_e32 v0, v0
+; SI-NEXT: v_mul_f32_e32 v0, 0x3f317218, v0
+; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
+; SI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: v_log_f16:
; VI: ; %bb.0:
@@ -6367,25 +6357,14 @@ define half @v_log_f16(half %in) {
}
define half @v_log_fabs_f16(half %in) {
-; SI-SDAG-LABEL: v_log_fabs_f16:
-; SI-SDAG: ; %bb.0:
-; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0
-; SI-SDAG-NEXT: v_cvt_f32_f16_e64 v0, |v0|
-; SI-SDAG-NEXT: v_log_f32_e32 v0, v0
-; SI-SDAG-NEXT: v_mul_f32_e32 v0, 0x3f317218, v0
-; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0
-; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0
-; SI-SDAG-NEXT: s_setpc_b64 s[30:31]
-;
-; SI-GISEL-LABEL: v_log_fabs_f16:
-; SI-GISEL: ; %bb.0:
-; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-GISEL-NEXT: v_cvt_f32_f16_e64 v0, |v0|
-; SI-GISEL-NEXT: v_log_f32_e32 v0, v0
-; SI-GISEL-NEXT: v_mul_f32_e32 v0, 0x3f317218, v0
-; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0
-; SI-GISEL-NEXT: s_setpc_b64 s[30:31]
+; SI-LABEL: v_log_fabs_f16:
+; SI: ; %bb.0:
+; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT: v_cvt_f32_f16_e64 v0, |v0|
+; SI-NEXT: v_log_f32_e32 v0, v0
+; SI-NEXT: v_mul_f32_e32 v0, 0x3f317218, v0
+; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
+; SI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: v_log_fabs_f16:
; VI: ; %bb.0:
@@ -6451,12 +6430,11 @@ define half @v_log_fneg_fabs_f16(half %in) {
; SI-SDAG-LABEL: v_log_fneg_fabs_f16:
; SI-SDAG: ; %bb.0:
; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0
+; SI-SDAG-NEXT: v_and_b32_e32 v0, 0x7fff, v0
; SI-SDAG-NEXT: v_cvt_f32_f16_e64 v0, -|v0|
; SI-SDAG-NEXT: v_log_f32_e32 v0, v0
; SI-SDAG-NEXT: v_mul_f32_e32 v0, 0x3f317218, v0
; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0
-; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0
; SI-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; SI-GISEL-LABEL: v_log_fneg_fabs_f16:
@@ -6533,12 +6511,11 @@ define half @v_log_fneg_f16(half %in) {
; SI-SDAG-LABEL: v_log_fneg_f16:
; SI-SDAG: ; %bb.0:
; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-SDAG-NEXT: v_cvt_f16_f32_e64 v0, -v0
-; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0
+; SI-SDAG-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; SI-SDAG-NEXT: v_cvt_f32_f16_e64 v0, -v0
; SI-SDAG-NEXT: v_log_f32_e32 v0, v0
; SI-SDAG-NEXT: v_mul_f32_e32 v0, 0x3f317218, v0
; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0
-; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0
; SI-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; SI-GISEL-LABEL: v_log_fneg_f16:
@@ -6611,23 +6588,14 @@ define half @v_log_fneg_f16(half %in) {
}
define half @v_log_f16_fast(half %in) {
-; SI-SDAG-LABEL: v_log_f16_fast:
-; SI-SDAG: ; %bb.0:
-; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0
-; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0
-; SI-SDAG-NEXT: v_log_f32_e32 v0, v0
-; SI-SDAG-NEXT: v_mul_f32_e32 v0, 0x3f317218, v0
-; SI-SDAG-NEXT: s_setpc_b64 s[30:31]
-;
-; SI-GISEL-LABEL: v_log_f16_fast:
-; SI-GISEL: ; %bb.0:
-; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0
-; SI-GISEL-NEXT: v_log_f32_e32 v0, v0
-; SI-GISEL-NEXT: v_mul_f32_e32 v0, 0x3f317218, v0
-; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0
-; SI-GISEL-NEXT: s_setpc_b64 s[30:31]
+; SI-LABEL: v_log_f16_fast:
+; SI: ; %bb.0:
+; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
+; SI-NEXT: v_log_f32_e32 v0, v0
+; SI-NEXT: v_mul_f32_e32 v0, 0x3f317218, v0
+; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
+; SI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: v_log_f16_fast:
; VI: ; %bb.0:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.log10.ll b/llvm/test/CodeGen/AMDGPU/llvm.log10.ll
index 2dc85d3c161a0..843b829f28742 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.log10.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.log10.ll
@@ -6110,6 +6110,7 @@ define float @v_log10_f32_from_fpext_bf16(bfloat %src) {
; SI-LABEL: v_log10_f32_from_fpext_bf16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; SI-NEXT: s_mov_b32 s4, 0x800000
; SI-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0
; SI-NEXT: v_cndmask_b32_e64 v1, 0, 32, vcc
@@ -6287,25 +6288,14 @@ define float @v_log10_f32_from_fpext_bf16(bfloat %src) {
}
define half @v_log10_f16(half %in) {
-; SI-SDAG-LABEL: v_log10_f16:
-; SI-SDAG: ; %bb.0:
-; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0
-; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0
-; SI-SDAG-NEXT: v_log_f32_e32 v0, v0
-; SI-SDAG-NEXT: v_mul_f32_e32 v0, 0x3e9a209b, v0
-; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0
-; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0
-; SI-SDAG-NEXT: s_setpc_b64 s[30:31]
-;
-; SI-GISEL-LABEL: v_log10_f16:
-; SI-GISEL: ; %bb.0:
-; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0
-; SI-GISEL-NEXT: v_log_f32_e32 v0, v0
-; SI-GISEL-NEXT: v_mul_f32_e32 v0, 0x3e9a209b, v0
-; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0
-; SI-GISEL-NEXT: s_setpc_b64 s[30:31]
+; SI-LABEL: v_log10_f16:
+; SI: ; %bb.0:
+; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
+; SI-NEXT: v_log_f32_e32 v0, v0
+; SI-NEXT: v_mul_f32_e32 v0, 0x3e9a209b, v0
+; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
+; SI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: v_log10_f16:
; VI: ; %bb.0:
@@ -6367,25 +6357,14 @@ define half @v_log10_f16(half %in) {
}
define half @v_log10_fabs_f16(half %in) {
-; SI-SDAG-LABEL: v_log10_fabs_f16:
-; SI-SDAG: ; %bb.0:
-; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0
-; SI-SDAG-NEXT: v_cvt_f32_f16_e64 v0, |v0|
-; SI-SDAG-NEXT: v_log_f32_e32 v0, v0
-; SI-SDAG-NEXT: v_mul_f32_e32 v0, 0x3e9a209b, v0
-; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0
-; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0
-; SI-SDAG-NEXT: s_setpc_b64 s[30:31]
-;
-; SI-GISEL-LABEL: v_log10_fabs_f16:
-; SI-GISEL: ; %bb.0:
-; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-GISEL-NEXT: v_cvt_f32_f16_e64 v0, |v0|
-; SI-GISEL-NEXT: v_log_f32_e32 v0, v0
-; SI-GISEL-NEXT: v_mul_f32_e32 v0, 0x3e9a209b, v0
-; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0
-; SI-GISEL-NEXT: s_setpc_b64 s[30:31]
+; SI-LABEL: v_log10_fabs_f16:
+; SI: ; %bb.0:
+; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT: v_cvt_f32_f16_e64 v0, |v0|
+; SI-NEXT: v_log_f32_e32 v0, v0
+; SI-NEXT: v_mul_f32_e32 v0, 0x3e9a209b, v0
+; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
+; SI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: v_log10_fabs_f16:
; VI: ; %bb.0:
@@ -6451,12 +6430,11 @@ define half @v_log10_fneg_fabs_f16(half %in) {
; SI-SDAG-LABEL: v_log10_fneg_fabs_f16:
; SI-SDAG: ; %bb.0:
; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0
+; SI-SDAG-NEXT: v_and_b32_e32 v0, 0x7fff, v0
; SI-SDAG-NEXT: v_cvt_f32_f16_e64 v0, -|v0|
; SI-SDAG-NEXT: v_log_f32_e32 v0, v0
; SI-SDAG-NEXT: v_mul_f32_e32 v0, 0x3e9a209b, v0
; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0
-; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0
; SI-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; SI-GISEL-LABEL: v_log10_fneg_fabs_f16:
@@ -6533,12 +6511,11 @@ define half @v_log10_fneg_f16(half %in) {
; SI-SDAG-LABEL: v_log10_fneg_f16:
; SI-SDAG: ; %bb.0:
; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-SDAG-NEXT: v_cvt_f16_f32_e64 v0, -v0
-; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0
+; SI-SDAG-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; SI-SDAG-NEXT: v_cvt_f32_f16_e64 v0, -v0
; SI-SDAG-NEXT: v_log_f32_e32 v0, v0
; SI-SDAG-NEXT: v_mul_f32_e32 v0, 0x3e9a209b, v0
; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0
-; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0
; SI-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; SI-GISEL-LABEL: v_log10_fneg_f16:
@@ -6611,23 +6588,14 @@ define half @v_log10_fneg_f16(half %in) {
}
define half @v_log10_f16_fast(half %in) {
-; SI-SDAG-LABEL: v_log10_f16_fast:
-; SI-SDAG: ; %bb.0:
-; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0
-; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0
-; SI-SDAG-NEXT: v_log_f32_e32 v0, v0
-; SI-SDAG-NEXT: v_mul_f32_e32 v0, 0x3e9a209b, v0
-; SI-SDAG-NEXT: s_setpc_b64 s[30:31]
-;
-; SI-GISEL-LABEL: v_log10_f16_fast:
-; SI-GISEL: ; %bb.0:
-; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0
-; SI-GISEL-NEXT: v_log_f32_e32 v0, v0
-; SI-GISEL-NEXT: v_mul_f32_e32 v0, 0x3e9a209b, v0
-; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0
-; SI-GISEL-NEXT: s_setpc_b64 s[30:31]
+; SI-LABEL: v_log10_f16_fast:
+; SI: ; %bb.0:
+; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
+; SI-NEXT: v_log_f32_e32 v0, v0
+; SI-NEXT: v_mul_f32_e32 v0, 0x3e9a209b, v0
+; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
+; SI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: v_log10_f16_fast:
; VI: ; %bb.0:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.log2.ll b/llvm/test/CodeGen/AMDGPU/llvm.log2.ll
index 047cc9addbcfc..35ae1337d8e76 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.log2.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.log2.ll
@@ -3622,10 +3622,11 @@ define float @v_log2_f32_from_fpext_bf16(bfloat %src) {
; SI-LABEL: v_log2_f32_from_fpext_bf16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; SI-NEXT: s_mov_b32 s4, 0x800000
; SI-NEXT: v_cmp_gt_f32_e32 vcc, s4, v0
-; SI-NEXT: v_cndmask_b32_e64 v2, 0, 32, vcc
-; SI-NEXT: v_ldexp_f32_e32 v0, v0, v2
+; SI-NEXT: v_cndmask_b32_e64 v1, 0, 32, vcc
+; SI-NEXT: v_ldexp_f32_e32 v0, v0, v1
; SI-NEXT: v_log_f32_e32 v0, v0
; SI-NEXT: v_mov_b32_e32 v1, 0x42000000
; SI-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
@@ -3737,23 +3738,13 @@ define float @v_log2_f32_from_fpext_bf16(bfloat %src) {
}
define half @v_log2_f16(half %in) {
-; SI-SDAG-LABEL: v_log2_f16:
-; SI-SDAG: ; %bb.0:
-; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0
-; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0
-; SI-SDAG-NEXT: v_log_f32_e32 v0, v0
-; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0
-; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0
-; SI-SDAG-NEXT: s_setpc_b64 s[30:31]
-;
-; SI-GISEL-LABEL: v_log2_f16:
-; SI-GISEL: ; %bb.0:
-; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0
-; SI-GISEL-NEXT: v_log_f32_e32 v0, v0
-; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0
-; SI-GISEL-NEXT: s_setpc_b64 s[30:31]
+; SI-LABEL: v_log2_f16:
+; SI: ; %bb.0:
+; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
+; SI-NEXT: v_log_f32_e32 v0, v0
+; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
+; SI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: v_log2_f16:
; VI: ; %bb.0:
@@ -3805,23 +3796,13 @@ define half @v_log2_f16(half %in) {
}
define half @v_log2_fabs_f16(half %in) {
-; SI-SDAG-LABEL: v_log2_fabs_f16:
-; SI-SDAG: ; %bb.0:
-; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0
-; SI-SDAG-NEXT: v_cvt_f32_f16_e64 v0, |v0|
-; SI-SDAG-NEXT: v_log_f32_e32 v0, v0
-; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0
-; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0
-; SI-SDAG-NEXT: s_setpc_b64 s[30:31]
-;
-; SI-GISEL-LABEL: v_log2_fabs_f16:
-; SI-GISEL: ; %bb.0:
-; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-GISEL-NEXT: v_cvt_f32_f16_e64 v0, |v0|
-; SI-GISEL-NEXT: v_log_f32_e32 v0, v0
-; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0
-; SI-GISEL-NEXT: s_setpc_b64 s[30:31]
+; SI-LABEL: v_log2_fabs_f16:
+; SI: ; %bb.0:
+; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT: v_cvt_f32_f16_e64 v0, |v0|
+; SI-NEXT: v_log_f32_e32 v0, v0
+; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
+; SI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: v_log2_fabs_f16:
; VI: ; %bb.0:
@@ -3877,11 +3858,10 @@ define half @v_log2_fneg_fabs_f16(half %in) {
; SI-SDAG-LABEL: v_log2_fneg_fabs_f16:
; SI-SDAG: ; %bb.0:
; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0
+; SI-SDAG-NEXT: v_and_b32_e32 v0, 0x7fff, v0
; SI-SDAG-NEXT: v_cvt_f32_f16_e64 v0, -|v0|
; SI-SDAG-NEXT: v_log_f32_e32 v0, v0
; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0
-; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0
; SI-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; SI-GISEL-LABEL: v_log2_fneg_fabs_f16:
@@ -3947,11 +3927,10 @@ define half @v_log2_fneg_f16(half %in) {
; SI-SDAG-LABEL: v_log2_fneg_f16:
; SI-SDAG: ; %bb.0:
; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-SDAG-NEXT: v_cvt_f16_f32_e64 v0, -v0
-; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0
+; SI-SDAG-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; SI-SDAG-NEXT: v_cvt_f32_f16_e64 v0, -v0
; SI-SDAG-NEXT: v_log_f32_e32 v0, v0
; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0
-; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0
; SI-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; SI-GISEL-LABEL: v_log2_fneg_f16:
@@ -4013,21 +3992,13 @@ define half @v_log2_fneg_f16(half %in) {
}
define half @v_log2_f16_fast(half %in) {
-; SI-SDAG-LABEL: v_log2_f16_fast:
-; SI-SDAG: ; %bb.0:
-; SI-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0
-; SI-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0
-; SI-SDAG-NEXT: v_log_f32_e32 v0, v0
-; SI-SDAG-NEXT: s_setpc_b64 s[30:31]
-;
-; SI-GISEL-LABEL: v_log2_f16_fast:
-; SI-GISEL: ; %bb.0:
-; SI-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0
-; SI-GISEL-NEXT: v_log_f32_e32 v0, v0
-; SI-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0
-; SI-GISEL-NEXT: s_setpc_b64 s[30:31]
+; SI-LABEL: v_log2_f16_fast:
+; SI: ; %bb.0:
+; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
+; SI-NEXT: v_log_f32_e32 v0, v0
+; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
+; SI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: v_log2_f16_fast:
; VI: ; %bb.0:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.maximum.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.maximum.f16.ll
index 6d371d4b76e0b..3c27adde10b78 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.maximum.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.maximum.f16.ll
@@ -13,14 +13,13 @@ define half @v_maximum_f16(half %src0, half %src1) {
; GFX7-LABEL: v_maximum_f16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1
-; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX7-NEXT: v_mov_b32_e32 v2, 0x7fc00000
; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1
; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX7-NEXT: v_mov_b32_e32 v2, 0x7fc00000
; GFX7-NEXT: v_max_f32_e32 v3, v0, v1
; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v0, v1
; GFX7-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc
+; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_maximum_f16:
@@ -100,11 +99,10 @@ define half @v_maximum_f16__nnan(half %src0, half %src1) {
; GFX7-LABEL: v_maximum_f16__nnan:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1
-; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1
; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX7-NEXT: v_max_f32_e32 v0, v0, v1
+; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_maximum_f16__nnan:
@@ -164,14 +162,13 @@ define half @v_maximum_f16__nsz(half %src0, half %src1) {
; GFX7-LABEL: v_maximum_f16__nsz:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1
-; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX7-NEXT: v_mov_b32_e32 v2, 0x7fc00000
; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1
; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX7-NEXT: v_mov_b32_e32 v2, 0x7fc00000
; GFX7-NEXT: v_max_f32_e32 v3, v0, v1
; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v0, v1
; GFX7-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc
+; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_maximum_f16__nsz:
@@ -251,11 +248,10 @@ define half @v_maximum_f16__nnan_nsz(half %src0, half %src1) {
; GFX7-LABEL: v_maximum_f16__nnan_nsz:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1
-; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1
; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX7-NEXT: v_max_f32_e32 v0, v0, v1
+; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_maximum_f16__nnan_nsz:
@@ -315,15 +311,14 @@ define half @v_maximum_f16__nnan_src0(half %arg0, half %src1) {
; GFX7-LABEL: v_maximum_f16__nnan_src0:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1
-; GFX7-NEXT: v_mov_b32_e32 v2, 0x7fc00000
; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1
+; GFX7-NEXT: v_mov_b32_e32 v2, 0x7fc00000
; GFX7-NEXT: v_add_f32_e32 v0, 1.0, v0
; GFX7-NEXT: v_max_f32_e32 v3, v0, v1
; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v0, v1
; GFX7-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc
+; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_maximum_f16__nnan_src0:
@@ -414,15 +409,14 @@ define half @v_maximum_f16__nnan_src1(half %src0, half %arg1) {
; GFX7-LABEL: v_maximum_f16__nnan_src1:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1
-; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX7-NEXT: v_mov_b32_e32 v2, 0x7fc00000
; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1
; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX7-NEXT: v_mov_b32_e32 v2, 0x7fc00000
; GFX7-NEXT: v_add_f32_e32 v1, 1.0, v1
; GFX7-NEXT: v_max_f32_e32 v3, v0, v1
; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v0, v1
; GFX7-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc
+; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_maximum_f16__nnan_src1:
@@ -513,11 +507,9 @@ define void @s_maximum_f16(half inreg %src0, half inreg %src1) {
; GFX7-LABEL: s_maximum_f16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_cvt_f16_f32_e32 v0, s17
-; GFX7-NEXT: v_cvt_f16_f32_e32 v1, s16
+; GFX7-NEXT: v_cvt_f32_f16_e32 v0, s17
+; GFX7-NEXT: v_cvt_f32_f16_e32 v1, s16
; GFX7-NEXT: v_mov_b32_e32 v2, 0x7fc00000
-; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1
; GFX7-NEXT: v_max_f32_e32 v3, v1, v0
; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v1, v0
; GFX7-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.minnum.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.minnum.f16.ll
index 24d6f4f84e816..e79324d7655fc 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.minnum.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.minnum.f16.ll
@@ -179,11 +179,10 @@ entry:
define amdgpu_ps half @minnum_f16_no_ieee(half %a, half %b) #0 {
; SI-LABEL: minnum_f16_no_ieee:
; SI: ; %bb.0:
-; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
-; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
; SI-NEXT: v_min_f32_e32 v0, v0, v1
+; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
; SI-NEXT: ; return to shader part epilog
;
; VI-LABEL: minnum_f16_no_ieee:
@@ -589,7 +588,6 @@ define amdgpu_ps <2 x half> @minnum_v2f16_no_ieee(<2 x half> %a, <2 x half> %b)
; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2
; SI-NEXT: v_or_b32_e32 v0, v0, v1
-; SI-NEXT: v_readfirstlane_b32 s0, v0
; SI-NEXT: ; return to shader part epilog
;
; VI-LABEL: minnum_v2f16_no_ieee:
diff --git a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fadd.ll b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fadd.ll
index 4233367b3d5bb..9778c61c44e6e 100644
--- a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fadd.ll
+++ b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fadd.ll
@@ -1572,7 +1572,6 @@ define half @local_atomic_fadd_ret_f16(ptr addrspace(3) %ptr) nounwind {
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX7-NEXT: v_lshrrev_b32_e32 v0, v0, v3
-; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX6-LABEL: local_atomic_fadd_ret_f16:
@@ -1606,7 +1605,6 @@ define half @local_atomic_fadd_ret_f16(ptr addrspace(3) %ptr) nounwind {
; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX6-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX6-NEXT: v_lshrrev_b32_e32 v0, v0, v3
-; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX6-NEXT: s_setpc_b64 s[30:31]
%result = atomicrmw fadd ptr addrspace(3) %ptr, half 4.0 seq_cst
ret half %result
@@ -1967,7 +1965,6 @@ define half @local_atomic_fadd_ret_f16__offset(ptr addrspace(3) %ptr) nounwind {
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX7-NEXT: v_lshrrev_b32_e32 v0, v1, v3
-; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX6-LABEL: local_atomic_fadd_ret_f16__offset:
@@ -2002,7 +1999,6 @@ define half @local_atomic_fadd_ret_f16__offset(ptr addrspace(3) %ptr) nounwind {
; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX6-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX6-NEXT: v_lshrrev_b32_e32 v0, v1, v3
-; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX6-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr half, ptr addrspace(3) %ptr, i32 32767
%result = atomicrmw fadd ptr addrspace(3) %gep, half 4.0 seq_cst
@@ -3021,34 +3017,33 @@ define half @local_atomic_fadd_ret_f16__offset__align4(ptr addrspace(3) %ptr) no
; GFX7-NEXT: s_cbranch_execnz .LBB12_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v1
+; GFX7-NEXT: v_mov_b32_e32 v0, v1
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX6-LABEL: local_atomic_fadd_ret_f16__offset__align4:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT: v_add_i32_e32 v0, vcc, 0xfffe, v0
+; GFX6-NEXT: v_add_i32_e32 v1, vcc, 0xfffe, v0
; GFX6-NEXT: s_mov_b32 m0, -1
-; GFX6-NEXT: ds_read_b32 v1, v0
+; GFX6-NEXT: ds_read_b32 v0, v1
; GFX6-NEXT: s_mov_b64 s[4:5], 0
; GFX6-NEXT: .LBB12_1: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
-; GFX6-NEXT: v_mov_b32_e32 v2, v1
-; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v2
+; GFX6-NEXT: v_mov_b32_e32 v2, v0
+; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v2
; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v2
-; GFX6-NEXT: v_add_f32_e32 v1, 4.0, v1
-; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1
-; GFX6-NEXT: v_or_b32_e32 v1, v3, v1
-; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1
+; GFX6-NEXT: v_add_f32_e32 v0, 4.0, v0
+; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX6-NEXT: v_or_b32_e32 v0, v3, v0
+; GFX6-NEXT: ds_cmpst_rtn_b32 v0, v1, v2, v0
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
-; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v0, v2
; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX6-NEXT: s_cbranch_execnz .LBB12_1
; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX6-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v1
; GFX6-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr half, ptr addrspace(3) %ptr, i32 32767
%result = atomicrmw fadd ptr addrspace(3) %gep, half 4.0 seq_cst, align 4
@@ -3754,7 +3749,6 @@ define bfloat @local_atomic_fadd_ret_bf16(ptr addrspace(3) %ptr) nounwind {
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX7-NEXT: v_lshrrev_b32_e32 v0, v0, v3
-; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX6-LABEL: local_atomic_fadd_ret_bf16:
@@ -3788,7 +3782,6 @@ define bfloat @local_atomic_fadd_ret_bf16(ptr addrspace(3) %ptr) nounwind {
; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX6-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX6-NEXT: v_lshrrev_b32_e32 v0, v0, v3
-; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX6-NEXT: s_setpc_b64 s[30:31]
%result = atomicrmw fadd ptr addrspace(3) %ptr, bfloat 4.0 seq_cst
ret bfloat %result
@@ -4220,7 +4213,6 @@ define bfloat @local_atomic_fadd_ret_bf16__offset(ptr addrspace(3) %ptr) nounwin
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX7-NEXT: v_lshrrev_b32_e32 v0, v1, v3
-; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX6-LABEL: local_atomic_fadd_ret_bf16__offset:
@@ -4255,7 +4247,6 @@ define bfloat @local_atomic_fadd_ret_bf16__offset(ptr addrspace(3) %ptr) nounwin
; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX6-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX6-NEXT: v_lshrrev_b32_e32 v0, v1, v3
-; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX6-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr bfloat, ptr addrspace(3) %ptr, i32 32767
%result = atomicrmw fadd ptr addrspace(3) %gep, bfloat 4.0 seq_cst
@@ -5492,34 +5483,33 @@ define bfloat @local_atomic_fadd_ret_bf16__offset__align4(ptr addrspace(3) %ptr)
; GFX7-NEXT: s_cbranch_execnz .LBB18_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v1
+; GFX7-NEXT: v_mov_b32_e32 v0, v1
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX6-LABEL: local_atomic_fadd_ret_bf16__offset__align4:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT: v_add_i32_e32 v0, vcc, 0xfffe, v0
+; GFX6-NEXT: v_add_i32_e32 v1, vcc, 0xfffe, v0
; GFX6-NEXT: s_mov_b32 m0, -1
-; GFX6-NEXT: ds_read_b32 v1, v0
+; GFX6-NEXT: ds_read_b32 v0, v1
; GFX6-NEXT: s_mov_b64 s[4:5], 0
; GFX6-NEXT: .LBB18_1: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
-; GFX6-NEXT: v_mov_b32_e32 v2, v1
-; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v2
-; GFX6-NEXT: v_add_f32_e32 v1, 4.0, v1
+; GFX6-NEXT: v_mov_b32_e32 v2, v0
+; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v2
+; GFX6-NEXT: v_add_f32_e32 v0, 4.0, v0
; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v2
-; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX6-NEXT: v_or_b32_e32 v1, v3, v1
-; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1
+; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX6-NEXT: v_or_b32_e32 v0, v3, v0
+; GFX6-NEXT: ds_cmpst_rtn_b32 v0, v1, v2, v0
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
-; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v0, v2
; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX6-NEXT: s_cbranch_execnz .LBB18_1
; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX6-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v1
; GFX6-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr bfloat, ptr addrspace(3) %ptr, i32 32767
%result = atomicrmw fadd ptr addrspace(3) %gep, bfloat 4.0 seq_cst, align 4
diff --git a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmax.ll b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmax.ll
index 4dd7f0e3c450a..91add012bdcfa 100644
--- a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmax.ll
+++ b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmax.ll
@@ -1127,7 +1127,6 @@ define half @local_atomic_fmax_ret_f16(ptr addrspace(3) %ptr) nounwind {
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX7-NEXT: v_lshrrev_b32_e32 v0, v0, v3
-; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX6-LABEL: local_atomic_fmax_ret_f16:
@@ -1161,7 +1160,6 @@ define half @local_atomic_fmax_ret_f16(ptr addrspace(3) %ptr) nounwind {
; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX6-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX6-NEXT: v_lshrrev_b32_e32 v0, v0, v3
-; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX6-NEXT: s_setpc_b64 s[30:31]
%result = atomicrmw fmax ptr addrspace(3) %ptr, half 4.0 seq_cst
ret half %result
@@ -1533,7 +1531,6 @@ define half @local_atomic_fmax_ret_f16__offset(ptr addrspace(3) %ptr) nounwind {
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX7-NEXT: v_lshrrev_b32_e32 v0, v1, v3
-; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX6-LABEL: local_atomic_fmax_ret_f16__offset:
@@ -1568,7 +1565,6 @@ define half @local_atomic_fmax_ret_f16__offset(ptr addrspace(3) %ptr) nounwind {
; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX6-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX6-NEXT: v_lshrrev_b32_e32 v0, v1, v3
-; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX6-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr half, ptr addrspace(3) %ptr, i32 32767
%result = atomicrmw fmax ptr addrspace(3) %gep, half 4.0 seq_cst
@@ -2620,34 +2616,33 @@ define half @local_atomic_fmax_ret_f16__offset__align4(ptr addrspace(3) %ptr) no
; GFX7-NEXT: s_cbranch_execnz .LBB12_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v1
+; GFX7-NEXT: v_mov_b32_e32 v0, v1
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX6-LABEL: local_atomic_fmax_ret_f16__offset__align4:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT: v_add_i32_e32 v0, vcc, 0xfffe, v0
+; GFX6-NEXT: v_add_i32_e32 v1, vcc, 0xfffe, v0
; GFX6-NEXT: s_mov_b32 m0, -1
-; GFX6-NEXT: ds_read_b32 v1, v0
+; GFX6-NEXT: ds_read_b32 v0, v1
; GFX6-NEXT: s_mov_b64 s[4:5], 0
; GFX6-NEXT: .LBB12_1: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
-; GFX6-NEXT: v_mov_b32_e32 v2, v1
-; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v2
+; GFX6-NEXT: v_mov_b32_e32 v2, v0
+; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v2
; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v2
-; GFX6-NEXT: v_max_f32_e32 v1, 4.0, v1
-; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1
-; GFX6-NEXT: v_or_b32_e32 v1, v3, v1
-; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1
+; GFX6-NEXT: v_max_f32_e32 v0, 4.0, v0
+; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX6-NEXT: v_or_b32_e32 v0, v3, v0
+; GFX6-NEXT: ds_cmpst_rtn_b32 v0, v1, v2, v0
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
-; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v0, v2
; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX6-NEXT: s_cbranch_execnz .LBB12_1
; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX6-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v1
; GFX6-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr half, ptr addrspace(3) %ptr, i32 32767
%result = atomicrmw fmax ptr addrspace(3) %gep, half 4.0 seq_cst, align 4
@@ -3365,7 +3360,6 @@ define bfloat @local_atomic_fmax_ret_bf16(ptr addrspace(3) %ptr) nounwind {
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX7-NEXT: v_lshrrev_b32_e32 v0, v0, v3
-; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX6-LABEL: local_atomic_fmax_ret_bf16:
@@ -3400,7 +3394,6 @@ define bfloat @local_atomic_fmax_ret_bf16(ptr addrspace(3) %ptr) nounwind {
; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX6-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX6-NEXT: v_lshrrev_b32_e32 v0, v0, v3
-; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX6-NEXT: s_setpc_b64 s[30:31]
%result = atomicrmw fmax ptr addrspace(3) %ptr, bfloat 4.0 seq_cst
ret bfloat %result
@@ -3833,7 +3826,6 @@ define bfloat @local_atomic_fmax_ret_bf16__offset(ptr addrspace(3) %ptr) nounwin
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX7-NEXT: v_lshrrev_b32_e32 v0, v1, v3
-; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX6-LABEL: local_atomic_fmax_ret_bf16__offset:
@@ -3869,7 +3861,6 @@ define bfloat @local_atomic_fmax_ret_bf16__offset(ptr addrspace(3) %ptr) nounwin
; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX6-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX6-NEXT: v_lshrrev_b32_e32 v0, v1, v3
-; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX6-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr bfloat, ptr addrspace(3) %ptr, i32 32767
%result = atomicrmw fmax ptr addrspace(3) %gep, bfloat 4.0 seq_cst
@@ -5111,35 +5102,34 @@ define bfloat @local_atomic_fmax_ret_bf16__offset__align4(ptr addrspace(3) %ptr)
; GFX7-NEXT: s_cbranch_execnz .LBB18_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v1
+; GFX7-NEXT: v_mov_b32_e32 v0, v1
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX6-LABEL: local_atomic_fmax_ret_bf16__offset__align4:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT: v_add_i32_e32 v0, vcc, 0xfffe, v0
+; GFX6-NEXT: v_add_i32_e32 v1, vcc, 0xfffe, v0
; GFX6-NEXT: s_mov_b32 m0, -1
-; GFX6-NEXT: ds_read_b32 v1, v0
+; GFX6-NEXT: ds_read_b32 v0, v1
; GFX6-NEXT: s_mov_b64 s[4:5], 0
; GFX6-NEXT: .LBB18_1: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
-; GFX6-NEXT: v_mov_b32_e32 v2, v1
-; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v2
-; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; GFX6-NEXT: v_max_f32_e32 v1, 4.0, v1
+; GFX6-NEXT: v_mov_b32_e32 v2, v0
+; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v2
+; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GFX6-NEXT: v_max_f32_e32 v0, 4.0, v0
; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v2
-; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX6-NEXT: v_or_b32_e32 v1, v3, v1
-; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1
+; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX6-NEXT: v_or_b32_e32 v0, v3, v0
+; GFX6-NEXT: ds_cmpst_rtn_b32 v0, v1, v2, v0
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
-; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v0, v2
; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX6-NEXT: s_cbranch_execnz .LBB18_1
; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX6-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v1
; GFX6-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr bfloat, ptr addrspace(3) %ptr, i32 32767
%result = atomicrmw fmax ptr addrspace(3) %gep, bfloat 4.0 seq_cst, align 4
diff --git a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmin.ll b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmin.ll
index 57fe5f708e216..8597c2e256584 100644
--- a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmin.ll
+++ b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fmin.ll
@@ -1127,7 +1127,6 @@ define half @local_atomic_fmin_ret_f16(ptr addrspace(3) %ptr) nounwind {
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX7-NEXT: v_lshrrev_b32_e32 v0, v0, v3
-; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX6-LABEL: local_atomic_fmin_ret_f16:
@@ -1161,7 +1160,6 @@ define half @local_atomic_fmin_ret_f16(ptr addrspace(3) %ptr) nounwind {
; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX6-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX6-NEXT: v_lshrrev_b32_e32 v0, v0, v3
-; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX6-NEXT: s_setpc_b64 s[30:31]
%result = atomicrmw fmin ptr addrspace(3) %ptr, half 4.0 seq_cst
ret half %result
@@ -1533,7 +1531,6 @@ define half @local_atomic_fmin_ret_f16__offset(ptr addrspace(3) %ptr) nounwind {
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX7-NEXT: v_lshrrev_b32_e32 v0, v1, v3
-; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX6-LABEL: local_atomic_fmin_ret_f16__offset:
@@ -1568,7 +1565,6 @@ define half @local_atomic_fmin_ret_f16__offset(ptr addrspace(3) %ptr) nounwind {
; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX6-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX6-NEXT: v_lshrrev_b32_e32 v0, v1, v3
-; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX6-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr half, ptr addrspace(3) %ptr, i32 32767
%result = atomicrmw fmin ptr addrspace(3) %gep, half 4.0 seq_cst
@@ -2620,34 +2616,33 @@ define half @local_atomic_fmin_ret_f16__offset__align4(ptr addrspace(3) %ptr) no
; GFX7-NEXT: s_cbranch_execnz .LBB12_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v1
+; GFX7-NEXT: v_mov_b32_e32 v0, v1
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX6-LABEL: local_atomic_fmin_ret_f16__offset__align4:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT: v_add_i32_e32 v0, vcc, 0xfffe, v0
+; GFX6-NEXT: v_add_i32_e32 v1, vcc, 0xfffe, v0
; GFX6-NEXT: s_mov_b32 m0, -1
-; GFX6-NEXT: ds_read_b32 v1, v0
+; GFX6-NEXT: ds_read_b32 v0, v1
; GFX6-NEXT: s_mov_b64 s[4:5], 0
; GFX6-NEXT: .LBB12_1: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
-; GFX6-NEXT: v_mov_b32_e32 v2, v1
-; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v2
+; GFX6-NEXT: v_mov_b32_e32 v2, v0
+; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v2
; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v2
-; GFX6-NEXT: v_min_f32_e32 v1, 4.0, v1
-; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1
-; GFX6-NEXT: v_or_b32_e32 v1, v3, v1
-; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1
+; GFX6-NEXT: v_min_f32_e32 v0, 4.0, v0
+; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX6-NEXT: v_or_b32_e32 v0, v3, v0
+; GFX6-NEXT: ds_cmpst_rtn_b32 v0, v1, v2, v0
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
-; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v0, v2
; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX6-NEXT: s_cbranch_execnz .LBB12_1
; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX6-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v1
; GFX6-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr half, ptr addrspace(3) %ptr, i32 32767
%result = atomicrmw fmin ptr addrspace(3) %gep, half 4.0 seq_cst, align 4
@@ -3365,7 +3360,6 @@ define bfloat @local_atomic_fmin_ret_bf16(ptr addrspace(3) %ptr) nounwind {
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX7-NEXT: v_lshrrev_b32_e32 v0, v0, v3
-; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX6-LABEL: local_atomic_fmin_ret_bf16:
@@ -3400,7 +3394,6 @@ define bfloat @local_atomic_fmin_ret_bf16(ptr addrspace(3) %ptr) nounwind {
; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX6-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX6-NEXT: v_lshrrev_b32_e32 v0, v0, v3
-; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX6-NEXT: s_setpc_b64 s[30:31]
%result = atomicrmw fmin ptr addrspace(3) %ptr, bfloat 4.0 seq_cst
ret bfloat %result
@@ -3833,7 +3826,6 @@ define bfloat @local_atomic_fmin_ret_bf16__offset(ptr addrspace(3) %ptr) nounwin
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX7-NEXT: v_lshrrev_b32_e32 v0, v1, v3
-; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX6-LABEL: local_atomic_fmin_ret_bf16__offset:
@@ -3869,7 +3861,6 @@ define bfloat @local_atomic_fmin_ret_bf16__offset(ptr addrspace(3) %ptr) nounwin
; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX6-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX6-NEXT: v_lshrrev_b32_e32 v0, v1, v3
-; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX6-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr bfloat, ptr addrspace(3) %ptr, i32 32767
%result = atomicrmw fmin ptr addrspace(3) %gep, bfloat 4.0 seq_cst
@@ -5111,35 +5102,34 @@ define bfloat @local_atomic_fmin_ret_bf16__offset__align4(ptr addrspace(3) %ptr)
; GFX7-NEXT: s_cbranch_execnz .LBB18_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v1
+; GFX7-NEXT: v_mov_b32_e32 v0, v1
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX6-LABEL: local_atomic_fmin_ret_bf16__offset__align4:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT: v_add_i32_e32 v0, vcc, 0xfffe, v0
+; GFX6-NEXT: v_add_i32_e32 v1, vcc, 0xfffe, v0
; GFX6-NEXT: s_mov_b32 m0, -1
-; GFX6-NEXT: ds_read_b32 v1, v0
+; GFX6-NEXT: ds_read_b32 v0, v1
; GFX6-NEXT: s_mov_b64 s[4:5], 0
; GFX6-NEXT: .LBB18_1: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
-; GFX6-NEXT: v_mov_b32_e32 v2, v1
-; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v2
-; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; GFX6-NEXT: v_min_f32_e32 v1, 4.0, v1
+; GFX6-NEXT: v_mov_b32_e32 v2, v0
+; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v2
+; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GFX6-NEXT: v_min_f32_e32 v0, 4.0, v0
; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v2
-; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX6-NEXT: v_or_b32_e32 v1, v3, v1
-; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1
+; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX6-NEXT: v_or_b32_e32 v0, v3, v0
+; GFX6-NEXT: ds_cmpst_rtn_b32 v0, v1, v2, v0
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
-; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v0, v2
; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX6-NEXT: s_cbranch_execnz .LBB18_1
; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX6-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v1
; GFX6-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr bfloat, ptr addrspace(3) %ptr, i32 32767
%result = atomicrmw fmin ptr addrspace(3) %gep, bfloat 4.0 seq_cst, align 4
diff --git a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fsub.ll b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fsub.ll
index 3a971a3b5a8d2..290d3117cac9a 100644
--- a/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fsub.ll
+++ b/llvm/test/CodeGen/AMDGPU/local-atomicrmw-fsub.ll
@@ -2034,7 +2034,6 @@ define half @local_atomic_fsub_ret_f16(ptr addrspace(3) %ptr) nounwind {
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX7-NEXT: v_lshrrev_b32_e32 v0, v0, v3
-; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX6-LABEL: local_atomic_fsub_ret_f16:
@@ -2068,7 +2067,6 @@ define half @local_atomic_fsub_ret_f16(ptr addrspace(3) %ptr) nounwind {
; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX6-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX6-NEXT: v_lshrrev_b32_e32 v0, v0, v3
-; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX6-NEXT: s_setpc_b64 s[30:31]
%result = atomicrmw fsub ptr addrspace(3) %ptr, half 4.0 seq_cst
ret half %result
@@ -2429,7 +2427,6 @@ define half @local_atomic_fsub_ret_f16__offset(ptr addrspace(3) %ptr) nounwind {
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX7-NEXT: v_lshrrev_b32_e32 v0, v1, v3
-; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX6-LABEL: local_atomic_fsub_ret_f16__offset:
@@ -2464,7 +2461,6 @@ define half @local_atomic_fsub_ret_f16__offset(ptr addrspace(3) %ptr) nounwind {
; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX6-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX6-NEXT: v_lshrrev_b32_e32 v0, v1, v3
-; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX6-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr half, ptr addrspace(3) %ptr, i32 32767
%result = atomicrmw fsub ptr addrspace(3) %gep, half 4.0 seq_cst
@@ -3483,34 +3479,33 @@ define half @local_atomic_fsub_ret_f16__offset__align4(ptr addrspace(3) %ptr) no
; GFX7-NEXT: s_cbranch_execnz .LBB12_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v1
+; GFX7-NEXT: v_mov_b32_e32 v0, v1
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX6-LABEL: local_atomic_fsub_ret_f16__offset__align4:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT: v_add_i32_e32 v0, vcc, 0xfffe, v0
+; GFX6-NEXT: v_add_i32_e32 v1, vcc, 0xfffe, v0
; GFX6-NEXT: s_mov_b32 m0, -1
-; GFX6-NEXT: ds_read_b32 v1, v0
+; GFX6-NEXT: ds_read_b32 v0, v1
; GFX6-NEXT: s_mov_b64 s[4:5], 0
; GFX6-NEXT: .LBB12_1: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
-; GFX6-NEXT: v_mov_b32_e32 v2, v1
-; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v2
+; GFX6-NEXT: v_mov_b32_e32 v2, v0
+; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v2
; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v2
-; GFX6-NEXT: v_add_f32_e32 v1, -4.0, v1
-; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1
-; GFX6-NEXT: v_or_b32_e32 v1, v3, v1
-; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1
+; GFX6-NEXT: v_add_f32_e32 v0, -4.0, v0
+; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX6-NEXT: v_or_b32_e32 v0, v3, v0
+; GFX6-NEXT: ds_cmpst_rtn_b32 v0, v1, v2, v0
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
-; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v0, v2
; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX6-NEXT: s_cbranch_execnz .LBB12_1
; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX6-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v1
; GFX6-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr half, ptr addrspace(3) %ptr, i32 32767
%result = atomicrmw fsub ptr addrspace(3) %gep, half 4.0 seq_cst, align 4
@@ -4216,7 +4211,6 @@ define bfloat @local_atomic_fsub_ret_bf16(ptr addrspace(3) %ptr) nounwind {
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX7-NEXT: v_lshrrev_b32_e32 v0, v0, v3
-; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX6-LABEL: local_atomic_fsub_ret_bf16:
@@ -4250,7 +4244,6 @@ define bfloat @local_atomic_fsub_ret_bf16(ptr addrspace(3) %ptr) nounwind {
; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX6-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX6-NEXT: v_lshrrev_b32_e32 v0, v0, v3
-; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX6-NEXT: s_setpc_b64 s[30:31]
%result = atomicrmw fsub ptr addrspace(3) %ptr, bfloat 4.0 seq_cst
ret bfloat %result
@@ -4682,7 +4675,6 @@ define bfloat @local_atomic_fsub_ret_bf16__offset(ptr addrspace(3) %ptr) nounwin
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX7-NEXT: v_lshrrev_b32_e32 v0, v1, v3
-; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX6-LABEL: local_atomic_fsub_ret_bf16__offset:
@@ -4717,7 +4709,6 @@ define bfloat @local_atomic_fsub_ret_bf16__offset(ptr addrspace(3) %ptr) nounwin
; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX6-NEXT: s_or_b64 exec, exec, s[4:5]
; GFX6-NEXT: v_lshrrev_b32_e32 v0, v1, v3
-; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX6-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr bfloat, ptr addrspace(3) %ptr, i32 32767
%result = atomicrmw fsub ptr addrspace(3) %gep, bfloat 4.0 seq_cst
@@ -5954,34 +5945,33 @@ define bfloat @local_atomic_fsub_ret_bf16__offset__align4(ptr addrspace(3) %ptr)
; GFX7-NEXT: s_cbranch_execnz .LBB18_1
; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v1
+; GFX7-NEXT: v_mov_b32_e32 v0, v1
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX6-LABEL: local_atomic_fsub_ret_bf16__offset__align4:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT: v_add_i32_e32 v0, vcc, 0xfffe, v0
+; GFX6-NEXT: v_add_i32_e32 v1, vcc, 0xfffe, v0
; GFX6-NEXT: s_mov_b32 m0, -1
-; GFX6-NEXT: ds_read_b32 v1, v0
+; GFX6-NEXT: ds_read_b32 v0, v1
; GFX6-NEXT: s_mov_b64 s[4:5], 0
; GFX6-NEXT: .LBB18_1: ; %atomicrmw.start
; GFX6-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
-; GFX6-NEXT: v_mov_b32_e32 v2, v1
-; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v2
-; GFX6-NEXT: v_add_f32_e32 v1, -4.0, v1
+; GFX6-NEXT: v_mov_b32_e32 v2, v0
+; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v2
+; GFX6-NEXT: v_add_f32_e32 v0, -4.0, v0
; GFX6-NEXT: v_and_b32_e32 v3, 0xffff0000, v2
-; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX6-NEXT: v_or_b32_e32 v1, v3, v1
-; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1
+; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX6-NEXT: v_or_b32_e32 v0, v3, v0
+; GFX6-NEXT: ds_cmpst_rtn_b32 v0, v1, v2, v0
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
-; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v0, v2
; GFX6-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
; GFX6-NEXT: s_andn2_b64 exec, exec, s[4:5]
; GFX6-NEXT: s_cbranch_execnz .LBB18_1
; GFX6-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX6-NEXT: s_or_b64 exec, exec, s[4:5]
-; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v1
; GFX6-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr bfloat, ptr addrspace(3) %ptr, i32 32767
%result = atomicrmw fsub ptr addrspace(3) %gep, bfloat 4.0 seq_cst, align 4
diff --git a/llvm/test/CodeGen/AMDGPU/mad-mix-hi.ll b/llvm/test/CodeGen/AMDGPU/mad-mix-hi.ll
index 3d48ff437e8ff..7dc9304d5715b 100644
--- a/llvm/test/CodeGen/AMDGPU/mad-mix-hi.ll
+++ b/llvm/test/CodeGen/AMDGPU/mad-mix-hi.ll
@@ -38,6 +38,9 @@ define <2 x half> @v_mad_mixhi_f16_f16lo_f16lo_f16lo_undeflo(half %src0, half %s
; SDAG-CI-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_undeflo:
; SDAG-CI: ; %bb.0:
; SDAG-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v2, v2
+; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v1, v1
+; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v0, v0
; SDAG-CI-NEXT: v_mac_f32_e32 v2, v0, v1
; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v0, v2
; SDAG-CI-NEXT: v_lshlrev_b32_e32 v0, 16, v0
@@ -101,6 +104,9 @@ define <2 x half> @v_mad_mixhi_f16_f16lo_f16lo_f16lo_constlo(half %src0, half %s
; SDAG-CI-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_constlo:
; SDAG-CI: ; %bb.0:
; SDAG-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v2, v2
+; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v1, v1
+; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v0, v0
; SDAG-CI-NEXT: v_mac_f32_e32 v2, v0, v1
; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v0, v2
; SDAG-CI-NEXT: v_lshlrev_b32_e32 v0, 16, v0
@@ -173,9 +179,12 @@ define <2 x half> @v_mad_mixhi_f16_f16lo_f16lo_f16lo_reglo(half %src0, half %src
; SDAG-CI-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_reglo:
; SDAG-CI: ; %bb.0:
; SDAG-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v2, v2
+; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v1, v1
+; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v0, v0
; SDAG-CI-NEXT: v_mac_f32_e32 v2, v0, v1
; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v0, v2
-; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v1, v3
+; SDAG-CI-NEXT: v_and_b32_e32 v1, 0xffff, v3
; SDAG-CI-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; SDAG-CI-NEXT: v_or_b32_e32 v0, v1, v0
; SDAG-CI-NEXT: s_setpc_b64 s[30:31]
@@ -246,6 +255,9 @@ define i32 @v_mad_mixhi_f16_f16lo_f16lo_f16lo_intpack(half %src0, half %src1, ha
; SDAG-CI-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_intpack:
; SDAG-CI: ; %bb.0:
; SDAG-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v2, v2
+; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v1, v1
+; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v0, v0
; SDAG-CI-NEXT: v_mac_f32_e32 v2, v0, v1
; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v0, v2
; SDAG-CI-NEXT: v_lshlrev_b32_e32 v0, 16, v0
@@ -325,6 +337,9 @@ define i32 @v_mad_mixhi_f16_f16lo_f16lo_f16lo_intpack_sext(half %src0, half %src
; SDAG-CI-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_intpack_sext:
; SDAG-CI: ; %bb.0:
; SDAG-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v2, v2
+; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v1, v1
+; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v0, v0
; SDAG-CI-NEXT: v_mac_f32_e32 v2, v0, v1
; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v0, v2
; SDAG-CI-NEXT: v_lshlrev_b32_e32 v0, 16, v0
@@ -419,6 +434,9 @@ define <2 x half> @v_mad_mixhi_f16_f16lo_f16lo_f16lo_undeflo_clamp_precvt(half %
; SDAG-CI-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_undeflo_clamp_precvt:
; SDAG-CI: ; %bb.0:
; SDAG-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v2, v2
+; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v1, v1
+; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v0, v0
; SDAG-CI-NEXT: v_mad_f32 v0, v0, v1, v2 clamp
; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v0, v0
; SDAG-CI-NEXT: v_lshlrev_b32_e32 v0, 16, v0
@@ -480,6 +498,9 @@ define <2 x half> @v_mad_mixhi_f16_f16lo_f16lo_f16lo_undeflo_clamp_postcvt(half
; SDAG-CI-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_undeflo_clamp_postcvt:
; SDAG-CI: ; %bb.0:
; SDAG-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v2, v2
+; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v1, v1
+; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v0, v0
; SDAG-CI-NEXT: v_mac_f32_e32 v2, v0, v1
; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v0, v2
; SDAG-CI-NEXT: v_cvt_f32_f16_e64 v0, v0 clamp
@@ -560,10 +581,13 @@ define <2 x half> @v_mad_mixhi_f16_f16lo_f16lo_f16lo_undeflo_clamp_postcvt_multi
; SDAG-CI-LABEL: v_mad_mixhi_f16_f16lo_f16lo_f16lo_undeflo_clamp_postcvt_multi_use:
; SDAG-CI: ; %bb.0:
; SDAG-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-CI-NEXT: v_mac_f32_e32 v2, v0, v1
-; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v0, v2
+; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v2, v2
+; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v1, v1
+; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v0, v0
; SDAG-CI-NEXT: s_mov_b32 s7, 0xf000
; SDAG-CI-NEXT: s_mov_b32 s6, -1
+; SDAG-CI-NEXT: v_mac_f32_e32 v2, v0, v1
+; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v0, v2
; SDAG-CI-NEXT: v_cvt_f32_f16_e64 v1, v0 clamp
; SDAG-CI-NEXT: buffer_store_short v0, off, s[4:7], 0
; SDAG-CI-NEXT: s_waitcnt vmcnt(0)
diff --git a/llvm/test/CodeGen/AMDGPU/mad-mix-lo.ll b/llvm/test/CodeGen/AMDGPU/mad-mix-lo.ll
index c452f9701ca00..87d33c1c063eb 100644
--- a/llvm/test/CodeGen/AMDGPU/mad-mix-lo.ll
+++ b/llvm/test/CodeGen/AMDGPU/mad-mix-lo.ll
@@ -45,7 +45,6 @@ define half @mixlo_simple(float %src0, float %src1, float %src2) #0 {
; SDAG-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SDAG-CI-NEXT: v_mac_f32_e32 v2, v0, v1
; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v0, v2
-; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v0, v0
; SDAG-CI-NEXT: s_setpc_b64 s[30:31]
;
; GISEL-CI-LABEL: mixlo_simple:
@@ -92,7 +91,6 @@ define half @mixlo_simpl_no_flush(float %src0, float %src1, float %src2) {
; SDAG-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SDAG-CI-NEXT: v_fma_f32 v0, v0, v1, v2
; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v0, v0
-; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v0, v0
; SDAG-CI-NEXT: s_setpc_b64 s[30:31]
;
; GISEL-CI-LABEL: mixlo_simpl_no_flush:
@@ -138,9 +136,11 @@ define half @v_mad_mixlo_f16_f16lo_f16lo_f16lo(half %src0, half %src1, half %src
; SDAG-CI-LABEL: v_mad_mixlo_f16_f16lo_f16lo_f16lo:
; SDAG-CI: ; %bb.0:
; SDAG-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v2, v2
+; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v1, v1
+; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v0, v0
; SDAG-CI-NEXT: v_mac_f32_e32 v2, v0, v1
; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v0, v2
-; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v0, v0
; SDAG-CI-NEXT: s_setpc_b64 s[30:31]
;
; GISEL-CI-LABEL: v_mad_mixlo_f16_f16lo_f16lo_f16lo:
@@ -197,9 +197,11 @@ define half @v_mad_mixlo_f16_f16lo_f16lo_f16lo_no_flush(half %src0, half %src1,
; SDAG-CI-LABEL: v_mad_mixlo_f16_f16lo_f16lo_f16lo_no_flush:
; SDAG-CI: ; %bb.0:
; SDAG-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v2, v2
+; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v1, v1
+; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v0, v0
; SDAG-CI-NEXT: v_fma_f32 v0, v0, v1, v2
; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v0, v0
-; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v0, v0
; SDAG-CI-NEXT: s_setpc_b64 s[30:31]
;
; GISEL-CI-LABEL: v_mad_mixlo_f16_f16lo_f16lo_f16lo_no_flush:
@@ -250,9 +252,10 @@ define half @v_mad_mixlo_f16_f16lo_f16lo_f32(half %src0, half %src1, float %src2
; SDAG-CI-LABEL: v_mad_mixlo_f16_f16lo_f16lo_f32:
; SDAG-CI: ; %bb.0:
; SDAG-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v1, v1
+; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v0, v0
; SDAG-CI-NEXT: v_mac_f32_e32 v2, v0, v1
; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v0, v2
-; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v0, v0
; SDAG-CI-NEXT: s_setpc_b64 s[30:31]
;
; GISEL-CI-LABEL: v_mad_mixlo_f16_f16lo_f16lo_f32:
@@ -301,9 +304,12 @@ define half @v_mad_mixlo_f16_f16lo_f16lo_f32_clamp_post_cvt(half %src0, half %sr
; SDAG-CI-LABEL: v_mad_mixlo_f16_f16lo_f16lo_f32_clamp_post_cvt:
; SDAG-CI: ; %bb.0:
; SDAG-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v1, v1
+; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v0, v0
; SDAG-CI-NEXT: v_mac_f32_e32 v2, v0, v1
; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v0, v2
; SDAG-CI-NEXT: v_cvt_f32_f16_e64 v0, v0 clamp
+; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v0, v0
; SDAG-CI-NEXT: s_setpc_b64 s[30:31]
;
; GISEL-CI-LABEL: v_mad_mixlo_f16_f16lo_f16lo_f32_clamp_post_cvt:
@@ -372,9 +378,10 @@ define half @v_mad_mixlo_f16_f16lo_f16lo_f32_clamp_pre_cvt(half %src0, half %src
; SDAG-CI-LABEL: v_mad_mixlo_f16_f16lo_f16lo_f32_clamp_pre_cvt:
; SDAG-CI: ; %bb.0:
; SDAG-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v1, v1
+; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v0, v0
; SDAG-CI-NEXT: v_mad_f32 v0, v0, v1, v2 clamp
; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v0, v0
-; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v0, v0
; SDAG-CI-NEXT: s_setpc_b64 s[30:31]
;
; GISEL-GFX1100-LABEL: v_mad_mixlo_f16_f16lo_f16lo_f32_clamp_pre_cvt:
@@ -2524,7 +2531,6 @@ define half @mixlo_fptrunc(float %a, float %b) #0 {
; SDAG-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SDAG-CI-NEXT: v_mul_f32_e32 v0, v0, v1
; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v0, v0
-; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v0, v0
; SDAG-CI-NEXT: s_setpc_b64 s[30:31]
;
; GISEL-CI-LABEL: mixlo_fptrunc:
@@ -2571,7 +2577,6 @@ define half @mixlo_fptrunc_no_flush(float %a, float %b) {
; SDAG-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SDAG-CI-NEXT: v_mul_f32_e32 v0, v0, v1
; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v0, v0
-; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v0, v0
; SDAG-CI-NEXT: s_setpc_b64 s[30:31]
;
; GISEL-CI-LABEL: mixlo_fptrunc_no_flush:
@@ -2617,7 +2622,6 @@ define half @mixlo_fptrunc_abs_src_mod(float %a, float %b) #0 {
; SDAG-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SDAG-CI-NEXT: v_mul_f32_e64 v0, |v0|, v1
; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v0, v0
-; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v0, v0
; SDAG-CI-NEXT: s_setpc_b64 s[30:31]
;
; GISEL-CI-LABEL: mixlo_fptrunc_abs_src_mod:
@@ -2664,7 +2668,6 @@ define half @mixlo_fptrunc_neg_src_mod(float %a, float %b) #0 {
; SDAG-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SDAG-CI-NEXT: v_mul_f32_e64 v0, -v0, v1
; SDAG-CI-NEXT: v_cvt_f16_f32_e32 v0, v0
-; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v0, v0
; SDAG-CI-NEXT: s_setpc_b64 s[30:31]
;
; GISEL-CI-LABEL: mixlo_fptrunc_neg_src_mod:
diff --git a/llvm/test/CodeGen/AMDGPU/mad-mix.ll b/llvm/test/CodeGen/AMDGPU/mad-mix.ll
index a252a63ca83e0..ee250fc74c7ae 100644
--- a/llvm/test/CodeGen/AMDGPU/mad-mix.ll
+++ b/llvm/test/CodeGen/AMDGPU/mad-mix.ll
@@ -56,7 +56,11 @@ define float @v_mad_mix_f32_f16lo_f16lo_f16lo(half %src0, half %src1, half %src2
; SDAG-CI-LABEL: v_mad_mix_f32_f16lo_f16lo_f16lo:
; SDAG-CI: ; %bb.0:
; SDAG-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-CI-NEXT: v_mad_f32 v0, v0, v1, v2
+; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v2, v2
+; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v1, v1
+; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v0, v0
+; SDAG-CI-NEXT: v_mac_f32_e32 v2, v0, v1
+; SDAG-CI-NEXT: v_mov_b32_e32 v0, v2
; SDAG-CI-NEXT: s_setpc_b64 s[30:31]
;
; GISEL-CI-LABEL: v_mad_mix_f32_f16lo_f16lo_f16lo:
@@ -459,6 +463,9 @@ define float @v_mad_mix_f32_negf16lo_f16lo_f16lo(half %src0, half %src1, half %s
; SDAG-CI-LABEL: v_mad_mix_f32_negf16lo_f16lo_f16lo:
; SDAG-CI: ; %bb.0:
; SDAG-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v2, v2
+; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v1, v1
+; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v0, v0
; SDAG-CI-NEXT: v_mad_f32 v0, -v0, v1, v2
; SDAG-CI-NEXT: s_setpc_b64 s[30:31]
;
@@ -536,7 +543,11 @@ define float @v_mad_mix_f32_absf16lo_f16lo_f16lo(half %src0, half %src1, half %s
; SDAG-CI-LABEL: v_mad_mix_f32_absf16lo_f16lo_f16lo:
; SDAG-CI: ; %bb.0:
; SDAG-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-CI-NEXT: v_mad_f32 v0, |v0|, v1, v2
+; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v2, v2
+; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v1, v1
+; SDAG-CI-NEXT: v_cvt_f32_f16_e64 v0, |v0|
+; SDAG-CI-NEXT: v_mac_f32_e32 v2, v0, v1
+; SDAG-CI-NEXT: v_mov_b32_e32 v0, v2
; SDAG-CI-NEXT: s_setpc_b64 s[30:31]
;
; GISEL-CI-LABEL: v_mad_mix_f32_absf16lo_f16lo_f16lo:
@@ -595,7 +606,10 @@ define float @v_mad_mix_f32_negabsf16lo_f16lo_f16lo(half %src0, half %src1, half
; SDAG-CI-LABEL: v_mad_mix_f32_negabsf16lo_f16lo_f16lo:
; SDAG-CI: ; %bb.0:
; SDAG-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-CI-NEXT: v_mad_f32 v0, -|v0|, v1, v2
+; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v2, v2
+; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v1, v1
+; SDAG-CI-NEXT: v_cvt_f32_f16_e64 v0, |v0|
+; SDAG-CI-NEXT: v_mad_f32 v0, -v0, v1, v2
; SDAG-CI-NEXT: s_setpc_b64 s[30:31]
;
; GISEL-CI-LABEL: v_mad_mix_f32_negabsf16lo_f16lo_f16lo:
@@ -653,6 +667,8 @@ define float @v_mad_mix_f32_f16lo_f16lo_f32(half %src0, half %src1, float %src2)
; SDAG-CI-LABEL: v_mad_mix_f32_f16lo_f16lo_f32:
; SDAG-CI: ; %bb.0:
; SDAG-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v1, v1
+; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v0, v0
; SDAG-CI-NEXT: v_mad_f32 v0, v0, v1, v2
; SDAG-CI-NEXT: s_setpc_b64 s[30:31]
;
@@ -707,6 +723,8 @@ define float @v_mad_mix_f32_f16lo_f16lo_negf32(half %src0, half %src1, float %sr
; SDAG-CI-LABEL: v_mad_mix_f32_f16lo_f16lo_negf32:
; SDAG-CI: ; %bb.0:
; SDAG-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v1, v1
+; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v0, v0
; SDAG-CI-NEXT: v_mad_f32 v0, v0, v1, -v2
; SDAG-CI-NEXT: s_setpc_b64 s[30:31]
;
@@ -762,6 +780,8 @@ define float @v_mad_mix_f32_f16lo_f16lo_absf32(half %src0, half %src1, float %sr
; SDAG-CI-LABEL: v_mad_mix_f32_f16lo_f16lo_absf32:
; SDAG-CI: ; %bb.0:
; SDAG-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v1, v1
+; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v0, v0
; SDAG-CI-NEXT: v_mad_f32 v0, v0, v1, |v2|
; SDAG-CI-NEXT: s_setpc_b64 s[30:31]
;
@@ -817,6 +837,8 @@ define float @v_mad_mix_f32_f16lo_f16lo_negabsf32(half %src0, half %src1, float
; SDAG-CI-LABEL: v_mad_mix_f32_f16lo_f16lo_negabsf32:
; SDAG-CI: ; %bb.0:
; SDAG-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v1, v1
+; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v0, v0
; SDAG-CI-NEXT: v_mad_f32 v0, v0, v1, -|v2|
; SDAG-CI-NEXT: s_setpc_b64 s[30:31]
;
@@ -881,6 +903,8 @@ define float @v_mad_mix_f32_f16lo_f16lo_f32imm1(half %src0, half %src1) #0 {
; SDAG-CI-LABEL: v_mad_mix_f32_f16lo_f16lo_f32imm1:
; SDAG-CI: ; %bb.0:
; SDAG-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v1, v1
+; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v0, v0
; SDAG-CI-NEXT: v_mad_f32 v0, v0, v1, 1.0
; SDAG-CI-NEXT: s_setpc_b64 s[30:31]
;
@@ -961,6 +985,8 @@ define float @v_mad_mix_f32_f16lo_f16lo_f32imminv2pi(half %src0, half %src1) #0
; SDAG-CI-LABEL: v_mad_mix_f32_f16lo_f16lo_f32imminv2pi:
; SDAG-CI: ; %bb.0:
; SDAG-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v1, v1
+; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v0, v0
; SDAG-CI-NEXT: v_madak_f32 v0, v0, v1, 0x3e22f983
; SDAG-CI-NEXT: s_setpc_b64 s[30:31]
;
@@ -1048,6 +1074,8 @@ define float @v_mad_mix_f32_f16lo_f16lo_cvtf16imminv2pi(half %src0, half %src1)
; SDAG-CI-LABEL: v_mad_mix_f32_f16lo_f16lo_cvtf16imminv2pi:
; SDAG-CI: ; %bb.0:
; SDAG-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v1, v1
+; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v0, v0
; SDAG-CI-NEXT: v_madak_f32 v0, v0, v1, 0x3e230000
; SDAG-CI-NEXT: s_setpc_b64 s[30:31]
;
@@ -1149,6 +1177,8 @@ define float @v_mad_mix_f32_f16lo_f16lo_cvtf16imm63(half %src0, half %src1) #0 {
; SDAG-CI-LABEL: v_mad_mix_f32_f16lo_f16lo_cvtf16imm63:
; SDAG-CI: ; %bb.0:
; SDAG-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v1, v1
+; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v0, v0
; SDAG-CI-NEXT: v_madak_f32 v0, v0, v1, 0x367c0000
; SDAG-CI-NEXT: s_setpc_b64 s[30:31]
;
@@ -1814,6 +1844,9 @@ define float @v_mad_mix_f32_f16lo_f16lo_f16lo_f32_denormals(half %src0, half %sr
; SDAG-CI-LABEL: v_mad_mix_f32_f16lo_f16lo_f16lo_f32_denormals:
; SDAG-CI: ; %bb.0:
; SDAG-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v2, v2
+; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v1, v1
+; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v0, v0
; SDAG-CI-NEXT: v_fma_f32 v0, v0, v1, v2
; SDAG-CI-NEXT: s_setpc_b64 s[30:31]
;
@@ -1873,6 +1906,8 @@ define float @v_mad_mix_f32_f16lo_f16lo_f32_denormals(half %src0, half %src1, fl
; SDAG-CI-LABEL: v_mad_mix_f32_f16lo_f16lo_f32_denormals:
; SDAG-CI: ; %bb.0:
; SDAG-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v1, v1
+; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v0, v0
; SDAG-CI-NEXT: v_fma_f32 v0, v0, v1, v2
; SDAG-CI-NEXT: s_setpc_b64 s[30:31]
;
@@ -1955,6 +1990,9 @@ define float @v_mad_mix_f32_f16lo_f16lo_f16lo_f32_denormals_fmulfadd(half %src0,
; SDAG-CI-LABEL: v_mad_mix_f32_f16lo_f16lo_f16lo_f32_denormals_fmulfadd:
; SDAG-CI: ; %bb.0:
; SDAG-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v1, v1
+; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v0, v0
+; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v2, v2
; SDAG-CI-NEXT: v_mul_f32_e32 v0, v0, v1
; SDAG-CI-NEXT: v_add_f32_e32 v0, v0, v2
; SDAG-CI-NEXT: s_setpc_b64 s[30:31]
@@ -2047,6 +2085,8 @@ define float @v_mad_mix_f32_f16lo_f16lo_f32_denormals_fmulfadd(half %src0, half
; SDAG-CI-LABEL: v_mad_mix_f32_f16lo_f16lo_f32_denormals_fmulfadd:
; SDAG-CI: ; %bb.0:
; SDAG-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v1, v1
+; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v0, v0
; SDAG-CI-NEXT: v_mul_f32_e32 v0, v0, v1
; SDAG-CI-NEXT: v_add_f32_e32 v0, v0, v2
; SDAG-CI-NEXT: s_setpc_b64 s[30:31]
@@ -2116,7 +2156,11 @@ define float @v_mad_mix_f32_f16lo_f16lo_f16lo_f32_flush_fmulfadd(half %src0, hal
; SDAG-CI-LABEL: v_mad_mix_f32_f16lo_f16lo_f16lo_f32_flush_fmulfadd:
; SDAG-CI: ; %bb.0:
; SDAG-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-CI-NEXT: v_mad_f32 v0, v0, v1, v2
+; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v2, v2
+; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v1, v1
+; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v0, v0
+; SDAG-CI-NEXT: v_mac_f32_e32 v2, v0, v1
+; SDAG-CI-NEXT: v_mov_b32_e32 v0, v2
; SDAG-CI-NEXT: s_setpc_b64 s[30:31]
;
; GISEL-CI-LABEL: v_mad_mix_f32_f16lo_f16lo_f16lo_f32_flush_fmulfadd:
@@ -2173,6 +2217,8 @@ define float @v_mad_mix_f32_f16lo_f16lo_f32_flush_fmulfadd(half %src0, half %src
; SDAG-CI-LABEL: v_mad_mix_f32_f16lo_f16lo_f32_flush_fmulfadd:
; SDAG-CI: ; %bb.0:
; SDAG-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v1, v1
+; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v0, v0
; SDAG-CI-NEXT: v_mad_f32 v0, v0, v1, v2
; SDAG-CI-NEXT: s_setpc_b64 s[30:31]
;
@@ -2230,6 +2276,8 @@ define float @v_mad_mix_f32_negprecvtf16lo_f16lo_f16lo(i32 %src0.arg, half %src1
; SDAG-CI-LABEL: v_mad_mix_f32_negprecvtf16lo_f16lo_f16lo:
; SDAG-CI: ; %bb.0:
; SDAG-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v2, v2
+; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v1, v1
; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v0, v0
; SDAG-CI-NEXT: v_mad_f32 v0, -v0, v1, v2
; SDAG-CI-NEXT: s_setpc_b64 s[30:31]
@@ -2329,9 +2377,11 @@ define float @v_mad_mix_f32_precvtnegf16hi_abs_f16lo_f16lo(i32 %src0.arg, half %
; SDAG-CI-LABEL: v_mad_mix_f32_precvtnegf16hi_abs_f16lo_f16lo:
; SDAG-CI: ; %bb.0:
; SDAG-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-CI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; SDAG-CI-NEXT: v_cvt_f32_f16_e64 v0, |v0|
-; SDAG-CI-NEXT: v_mad_f32 v0, v0, v1, v2
+; SDAG-CI-NEXT: v_lshrrev_b32_e32 v3, 16, v0
+; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v0, v2
+; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v1, v1
+; SDAG-CI-NEXT: v_cvt_f32_f16_e64 v2, |v3|
+; SDAG-CI-NEXT: v_mac_f32_e32 v0, v2, v1
; SDAG-CI-NEXT: s_setpc_b64 s[30:31]
;
; GISEL-GFX1100-LABEL: v_mad_mix_f32_precvtnegf16hi_abs_f16lo_f16lo:
@@ -2403,9 +2453,11 @@ define float @v_mad_mix_f32_precvtabsf16hi_f16lo_f16lo(i32 %src0.arg, half %src1
; SDAG-CI-LABEL: v_mad_mix_f32_precvtabsf16hi_f16lo_f16lo:
; SDAG-CI: ; %bb.0:
; SDAG-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-CI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; SDAG-CI-NEXT: v_cvt_f32_f16_e64 v0, |v0|
-; SDAG-CI-NEXT: v_mad_f32 v0, v0, v1, v2
+; SDAG-CI-NEXT: v_lshrrev_b32_e32 v3, 16, v0
+; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v0, v2
+; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v1, v1
+; SDAG-CI-NEXT: v_cvt_f32_f16_e64 v2, |v3|
+; SDAG-CI-NEXT: v_mac_f32_e32 v0, v2, v1
; SDAG-CI-NEXT: s_setpc_b64 s[30:31]
;
; GISEL-CI-LABEL: v_mad_mix_f32_precvtabsf16hi_f16lo_f16lo:
@@ -2467,9 +2519,11 @@ define float @v_mad_mix_f32_preextractfneg_f16hi_f16lo_f16lo(i32 %src0.arg, half
; SDAG-CI-LABEL: v_mad_mix_f32_preextractfneg_f16hi_f16lo_f16lo:
; SDAG-CI: ; %bb.0:
; SDAG-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-CI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; SDAG-CI-NEXT: v_cvt_f32_f16_e64 v0, -v0
-; SDAG-CI-NEXT: v_mad_f32 v0, v0, v1, v2
+; SDAG-CI-NEXT: v_lshrrev_b32_e32 v3, 16, v0
+; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v0, v2
+; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v1, v1
+; SDAG-CI-NEXT: v_cvt_f32_f16_e64 v2, -v3
+; SDAG-CI-NEXT: v_mac_f32_e32 v0, v2, v1
; SDAG-CI-NEXT: s_setpc_b64 s[30:31]
;
; GISEL-GFX9GEN-LABEL: v_mad_mix_f32_preextractfneg_f16hi_f16lo_f16lo:
@@ -2552,9 +2606,11 @@ define float @v_mad_mix_f32_preextractfabs_f16hi_f16lo_f16lo(i32 %src0.arg, half
; SDAG-CI-LABEL: v_mad_mix_f32_preextractfabs_f16hi_f16lo_f16lo:
; SDAG-CI: ; %bb.0:
; SDAG-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-CI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; SDAG-CI-NEXT: v_cvt_f32_f16_e64 v0, |v0|
-; SDAG-CI-NEXT: v_mad_f32 v0, v0, v1, v2
+; SDAG-CI-NEXT: v_lshrrev_b32_e32 v3, 16, v0
+; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v0, v2
+; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v1, v1
+; SDAG-CI-NEXT: v_cvt_f32_f16_e64 v2, |v3|
+; SDAG-CI-NEXT: v_mac_f32_e32 v0, v2, v1
; SDAG-CI-NEXT: s_setpc_b64 s[30:31]
;
; GISEL-GFX9GEN-LABEL: v_mad_mix_f32_preextractfabs_f16hi_f16lo_f16lo:
@@ -2637,9 +2693,11 @@ define float @v_mad_mix_f32_preextractfabsfneg_f16hi_f16lo_f16lo(i32 %src0.arg,
; SDAG-CI-LABEL: v_mad_mix_f32_preextractfabsfneg_f16hi_f16lo_f16lo:
; SDAG-CI: ; %bb.0:
; SDAG-CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-CI-NEXT: v_lshrrev_b32_e32 v0, 16, v0
-; SDAG-CI-NEXT: v_cvt_f32_f16_e64 v0, -|v0|
-; SDAG-CI-NEXT: v_mad_f32 v0, v0, v1, v2
+; SDAG-CI-NEXT: v_lshrrev_b32_e32 v3, 16, v0
+; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v0, v2
+; SDAG-CI-NEXT: v_cvt_f32_f16_e32 v1, v1
+; SDAG-CI-NEXT: v_cvt_f32_f16_e64 v2, -|v3|
+; SDAG-CI-NEXT: v_mac_f32_e32 v0, v2, v1
; SDAG-CI-NEXT: s_setpc_b64 s[30:31]
;
; GISEL-GFX9GEN-LABEL: v_mad_mix_f32_preextractfabsfneg_f16hi_f16lo_f16lo:
diff --git a/llvm/test/CodeGen/AMDGPU/maximumnum.bf16.ll b/llvm/test/CodeGen/AMDGPU/maximumnum.bf16.ll
index 65b2f016a6ba0..c7acbb0584904 100644
--- a/llvm/test/CodeGen/AMDGPU/maximumnum.bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU/maximumnum.bf16.ll
@@ -13,12 +13,12 @@ define bfloat @v_maximumnum_bf16(bfloat %x, bfloat %y) {
; GFX7-LABEL: v_maximumnum_bf16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GFX7-NEXT: v_max_f32_e32 v0, v0, v1
-; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_maximumnum_bf16:
@@ -222,12 +222,10 @@ define bfloat @v_maximumnum_bf16_nnan(bfloat %x, bfloat %y) {
; GFX7-LABEL: v_maximumnum_bf16_nnan:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX7-NEXT: v_max_f32_e32 v0, v0, v1
-; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_maximumnum_bf16_nnan:
@@ -12322,12 +12320,10 @@ define bfloat @v_maximumnum_bf16_no_ieee(bfloat %x, bfloat %y) #0 {
; GFX7-LABEL: v_maximumnum_bf16_no_ieee:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX7-NEXT: v_max_f32_e32 v0, v0, v1
-; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_maximumnum_bf16_no_ieee:
diff --git a/llvm/test/CodeGen/AMDGPU/maximumnum.ll b/llvm/test/CodeGen/AMDGPU/maximumnum.ll
index 7d52b2e1d70c6..086c78fd041fc 100644
--- a/llvm/test/CodeGen/AMDGPU/maximumnum.ll
+++ b/llvm/test/CodeGen/AMDGPU/maximumnum.ll
@@ -27,24 +27,14 @@
; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX12,GFX12-FAKE16,GFX12-GISEL,GFX12-FAKE16-GISEL %s
define half @v_maximumnum_f16(half %x, half %y) {
-; GFX7-SDAG-LABEL: v_maximumnum_f16:
-; GFX7-SDAG: ; %bb.0:
-; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1
-; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1
-; GFX7-SDAG-NEXT: v_max_f32_e32 v0, v0, v1
-; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX7-GISEL-LABEL: v_maximumnum_f16:
-; GFX7-GISEL: ; %bb.0:
-; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v1
-; GFX7-GISEL-NEXT: v_max_f32_e32 v0, v0, v1
-; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX7-GISEL-NEXT: s_setpc_b64 s[30:31]
+; GFX7-LABEL: v_maximumnum_f16:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1
+; GFX7-NEXT: v_max_f32_e32 v0, v0, v1
+; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-SDAG-LABEL: v_maximumnum_f16:
; GFX8-SDAG: ; %bb.0:
@@ -189,11 +179,10 @@ define half @v_maximumnum_f16_nnan(half %x, half %y) {
; GFX7-SDAG-LABEL: v_maximumnum_f16_nnan:
; GFX7-SDAG: ; %bb.0:
; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1
-; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1
; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX7-SDAG-NEXT: v_max_f32_e32 v0, v0, v1
+; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-GISEL-LABEL: v_maximumnum_f16_nnan:
@@ -259,21 +248,13 @@ define half @v_maximumnum_f16_nnan(half %x, half %y) {
}
define half @v_maximumnum_f16_1.0(half %x) {
-; GFX7-SDAG-LABEL: v_maximumnum_f16_1.0:
-; GFX7-SDAG: ; %bb.0:
-; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX7-SDAG-NEXT: v_max_f32_e32 v0, 1.0, v0
-; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX7-GISEL-LABEL: v_maximumnum_f16_1.0:
-; GFX7-GISEL: ; %bb.0:
-; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX7-GISEL-NEXT: v_max_f32_e32 v0, 1.0, v0
-; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX7-GISEL-NEXT: s_setpc_b64 s[30:31]
+; GFX7-LABEL: v_maximumnum_f16_1.0:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX7-NEXT: v_max_f32_e32 v0, 1.0, v0
+; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_maximumnum_f16_1.0:
; GFX8: ; %bb.0:
@@ -924,24 +905,14 @@ define double @v_maximumnum_f64_1.0(double %x) {
}
define half @v_maximumnum_f16_s_v(half inreg %x, half %y) {
-; GFX7-SDAG-LABEL: v_maximumnum_f16_s_v:
-; GFX7-SDAG: ; %bb.0:
-; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v1, s16
-; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1
-; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX7-SDAG-NEXT: v_max_f32_e32 v0, v1, v0
-; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX7-GISEL-LABEL: v_maximumnum_f16_s_v:
-; GFX7-GISEL: ; %bb.0:
-; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v1, s16
-; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX7-GISEL-NEXT: v_max_f32_e32 v0, v1, v0
-; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX7-GISEL-NEXT: s_setpc_b64 s[30:31]
+; GFX7-LABEL: v_maximumnum_f16_s_v:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_cvt_f32_f16_e32 v1, s16
+; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX7-NEXT: v_max_f32_e32 v0, v1, v0
+; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-SDAG-LABEL: v_maximumnum_f16_s_v:
; GFX8-SDAG: ; %bb.0:
@@ -1099,24 +1070,14 @@ define half @v_maximumnum_f16_s_v(half inreg %x, half %y) {
}
define half @v_maximumnum_f16_v_s(half %x, half inreg %y) {
-; GFX7-SDAG-LABEL: v_maximumnum_f16_v_s:
-; GFX7-SDAG: ; %bb.0:
-; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v1, s16
-; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1
-; GFX7-SDAG-NEXT: v_max_f32_e32 v0, v0, v1
-; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX7-GISEL-LABEL: v_maximumnum_f16_v_s:
-; GFX7-GISEL: ; %bb.0:
-; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v1, s16
-; GFX7-GISEL-NEXT: v_max_f32_e32 v0, v0, v1
-; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX7-GISEL-NEXT: s_setpc_b64 s[30:31]
+; GFX7-LABEL: v_maximumnum_f16_v_s:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX7-NEXT: v_cvt_f32_f16_e32 v1, s16
+; GFX7-NEXT: v_max_f32_e32 v0, v0, v1
+; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-SDAG-LABEL: v_maximumnum_f16_v_s:
; GFX8-SDAG: ; %bb.0:
@@ -1274,24 +1235,14 @@ define half @v_maximumnum_f16_v_s(half %x, half inreg %y) {
}
define half @v_maximumnum_f16_s_s(half inreg %x, half inreg %y) {
-; GFX7-SDAG-LABEL: v_maximumnum_f16_s_s:
-; GFX7-SDAG: ; %bb.0:
-; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, s16
-; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v1, s17
-; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1
-; GFX7-SDAG-NEXT: v_max_f32_e32 v0, v0, v1
-; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX7-GISEL-LABEL: v_maximumnum_f16_s_s:
-; GFX7-GISEL: ; %bb.0:
-; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v0, s16
-; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v1, s17
-; GFX7-GISEL-NEXT: v_max_f32_e32 v0, v0, v1
-; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX7-GISEL-NEXT: s_setpc_b64 s[30:31]
+; GFX7-LABEL: v_maximumnum_f16_s_s:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_cvt_f32_f16_e32 v0, s16
+; GFX7-NEXT: v_cvt_f32_f16_e32 v1, s17
+; GFX7-NEXT: v_max_f32_e32 v0, v0, v1
+; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-SDAG-LABEL: v_maximumnum_f16_s_s:
; GFX8-SDAG: ; %bb.0:
@@ -2612,24 +2563,14 @@ define float @v_maximumnum_f32_fneg(float %x, float %y) {
}
define half @v_maximumnum_f16_fabs_rhs(half %x, half %y) {
-; GFX7-SDAG-LABEL: v_maximumnum_f16_fabs_rhs:
-; GFX7-SDAG: ; %bb.0:
-; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1
-; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX7-SDAG-NEXT: v_cvt_f32_f16_e64 v1, |v1|
-; GFX7-SDAG-NEXT: v_max_f32_e32 v0, v0, v1
-; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX7-GISEL-LABEL: v_maximumnum_f16_fabs_rhs:
-; GFX7-GISEL: ; %bb.0:
-; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX7-GISEL-NEXT: v_cvt_f32_f16_e64 v1, |v1|
-; GFX7-GISEL-NEXT: v_max_f32_e32 v0, v0, v1
-; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX7-GISEL-NEXT: s_setpc_b64 s[30:31]
+; GFX7-LABEL: v_maximumnum_f16_fabs_rhs:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX7-NEXT: v_cvt_f32_f16_e64 v1, |v1|
+; GFX7-NEXT: v_max_f32_e32 v0, v0, v1
+; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-SDAG-LABEL: v_maximumnum_f16_fabs_rhs:
; GFX8-SDAG: ; %bb.0:
@@ -2775,11 +2716,11 @@ define half @v_maximumnum_f16_fneg_fabs_rhs(half %x, half %y) {
; GFX7-SDAG-LABEL: v_maximumnum_f16_fneg_fabs_rhs:
; GFX7-SDAG: ; %bb.0:
; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GFX7-SDAG-NEXT: v_and_b32_e32 v1, 0x7fff, v1
; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX7-SDAG-NEXT: v_cvt_f32_f16_e64 v1, -|v1|
; GFX7-SDAG-NEXT: v_max_f32_e32 v0, v0, v1
+; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-GISEL-LABEL: v_maximumnum_f16_fneg_fabs_rhs:
@@ -2933,24 +2874,14 @@ define half @v_maximumnum_f16_fneg_fabs_rhs(half %x, half %y) {
}
define half @v_maximumnum_f16_fabs(half %x, half %y) {
-; GFX7-SDAG-LABEL: v_maximumnum_f16_fabs:
-; GFX7-SDAG: ; %bb.0:
-; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1
-; GFX7-SDAG-NEXT: v_cvt_f32_f16_e64 v0, |v0|
-; GFX7-SDAG-NEXT: v_cvt_f32_f16_e64 v1, |v1|
-; GFX7-SDAG-NEXT: v_max_f32_e32 v0, v0, v1
-; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX7-GISEL-LABEL: v_maximumnum_f16_fabs:
-; GFX7-GISEL: ; %bb.0:
-; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-GISEL-NEXT: v_cvt_f32_f16_e64 v0, |v0|
-; GFX7-GISEL-NEXT: v_cvt_f32_f16_e64 v1, |v1|
-; GFX7-GISEL-NEXT: v_max_f32_e32 v0, v0, v1
-; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX7-GISEL-NEXT: s_setpc_b64 s[30:31]
+; GFX7-LABEL: v_maximumnum_f16_fabs:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_cvt_f32_f16_e64 v0, |v0|
+; GFX7-NEXT: v_cvt_f32_f16_e64 v1, |v1|
+; GFX7-NEXT: v_max_f32_e32 v0, v0, v1
+; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-SDAG-LABEL: v_maximumnum_f16_fabs:
; GFX8-SDAG: ; %bb.0:
@@ -3097,11 +3028,12 @@ define half @v_maximumnum_f16_fneg(half %x, half %y) {
; GFX7-SDAG-LABEL: v_maximumnum_f16_fneg:
; GFX7-SDAG: ; %bb.0:
; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-SDAG-NEXT: v_cvt_f16_f32_e64 v0, -v0
-; GFX7-SDAG-NEXT: v_cvt_f16_f32_e64 v1, -v1
-; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1
+; GFX7-SDAG-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX7-SDAG-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX7-SDAG-NEXT: v_cvt_f32_f16_e64 v0, -v0
+; GFX7-SDAG-NEXT: v_cvt_f32_f16_e64 v1, -v1
; GFX7-SDAG-NEXT: v_max_f32_e32 v0, v0, v1
+; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-GISEL-LABEL: v_maximumnum_f16_fneg:
@@ -8222,11 +8154,10 @@ define half @v_maximumnum_f16_no_ieee(half %x, half %y) #0 {
; GFX7-SDAG-LABEL: v_maximumnum_f16_no_ieee:
; GFX7-SDAG: ; %bb.0:
; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1
-; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1
; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX7-SDAG-NEXT: v_max_f32_e32 v0, v0, v1
+; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-GISEL-LABEL: v_maximumnum_f16_no_ieee:
@@ -8327,11 +8258,10 @@ define half @v_maximumnum_f16_nan_no_ieee(half %x, half %y) #0 {
; GFX7-SDAG-LABEL: v_maximumnum_f16_nan_no_ieee:
; GFX7-SDAG: ; %bb.0:
; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1
-; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1
; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX7-SDAG-NEXT: v_max_f32_e32 v0, v0, v1
+; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-GISEL-LABEL: v_maximumnum_f16_nan_no_ieee:
diff --git a/llvm/test/CodeGen/AMDGPU/minimumnum.bf16.ll b/llvm/test/CodeGen/AMDGPU/minimumnum.bf16.ll
index a3c9977fee488..0a794a3ac49b1 100644
--- a/llvm/test/CodeGen/AMDGPU/minimumnum.bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU/minimumnum.bf16.ll
@@ -13,12 +13,12 @@ define bfloat @v_minimumnum_bf16(bfloat %x, bfloat %y) {
; GFX7-LABEL: v_minimumnum_bf16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GFX7-NEXT: v_min_f32_e32 v0, v0, v1
-; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_minimumnum_bf16:
@@ -224,12 +224,10 @@ define bfloat @v_minimumnum_bf16_nnan(bfloat %x, bfloat %y) {
; GFX7-LABEL: v_minimumnum_bf16_nnan:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX7-NEXT: v_min_f32_e32 v0, v0, v1
-; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_minimumnum_bf16_nnan:
@@ -12355,12 +12353,10 @@ define bfloat @v_minimumnum_bf16_no_ieee(bfloat %x, bfloat %y) #0 {
; GFX7-LABEL: v_minimumnum_bf16_no_ieee:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; GFX7-NEXT: v_min_f32_e32 v0, v0, v1
-; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_minimumnum_bf16_no_ieee:
diff --git a/llvm/test/CodeGen/AMDGPU/minimumnum.ll b/llvm/test/CodeGen/AMDGPU/minimumnum.ll
index 329118e3dca01..0311caf93a14e 100644
--- a/llvm/test/CodeGen/AMDGPU/minimumnum.ll
+++ b/llvm/test/CodeGen/AMDGPU/minimumnum.ll
@@ -27,24 +27,14 @@
; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX12,GFX12-FAKE16,GFX12-GISEL,GFX12-FAKE16-GISEL %s
define half @v_minimumnum_f16(half %x, half %y) {
-; GFX7-SDAG-LABEL: v_minimumnum_f16:
-; GFX7-SDAG: ; %bb.0:
-; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1
-; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1
-; GFX7-SDAG-NEXT: v_min_f32_e32 v0, v0, v1
-; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX7-GISEL-LABEL: v_minimumnum_f16:
-; GFX7-GISEL: ; %bb.0:
-; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v1, v1
-; GFX7-GISEL-NEXT: v_min_f32_e32 v0, v0, v1
-; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX7-GISEL-NEXT: s_setpc_b64 s[30:31]
+; GFX7-LABEL: v_minimumnum_f16:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1
+; GFX7-NEXT: v_min_f32_e32 v0, v0, v1
+; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-SDAG-LABEL: v_minimumnum_f16:
; GFX8-SDAG: ; %bb.0:
@@ -189,11 +179,10 @@ define half @v_minimumnum_f16_nnan(half %x, half %y) {
; GFX7-SDAG-LABEL: v_minimumnum_f16_nnan:
; GFX7-SDAG: ; %bb.0:
; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1
-; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1
; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX7-SDAG-NEXT: v_min_f32_e32 v0, v0, v1
+; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-GISEL-LABEL: v_minimumnum_f16_nnan:
@@ -259,21 +248,13 @@ define half @v_minimumnum_f16_nnan(half %x, half %y) {
}
define half @v_minimumnum_f16_1.0(half %x) {
-; GFX7-SDAG-LABEL: v_minimumnum_f16_1.0:
-; GFX7-SDAG: ; %bb.0:
-; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX7-SDAG-NEXT: v_min_f32_e32 v0, 1.0, v0
-; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX7-GISEL-LABEL: v_minimumnum_f16_1.0:
-; GFX7-GISEL: ; %bb.0:
-; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX7-GISEL-NEXT: v_min_f32_e32 v0, 1.0, v0
-; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX7-GISEL-NEXT: s_setpc_b64 s[30:31]
+; GFX7-LABEL: v_minimumnum_f16_1.0:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX7-NEXT: v_min_f32_e32 v0, 1.0, v0
+; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_minimumnum_f16_1.0:
; GFX8: ; %bb.0:
@@ -924,24 +905,14 @@ define double @v_minimumnum_f64_1.0(double %x) {
}
define half @v_minimumnum_f16_v_s(half %x, half inreg %y) {
-; GFX7-SDAG-LABEL: v_minimumnum_f16_v_s:
-; GFX7-SDAG: ; %bb.0:
-; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v1, s16
-; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1
-; GFX7-SDAG-NEXT: v_min_f32_e32 v0, v0, v1
-; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX7-GISEL-LABEL: v_minimumnum_f16_v_s:
-; GFX7-GISEL: ; %bb.0:
-; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v1, s16
-; GFX7-GISEL-NEXT: v_min_f32_e32 v0, v0, v1
-; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX7-GISEL-NEXT: s_setpc_b64 s[30:31]
+; GFX7-LABEL: v_minimumnum_f16_v_s:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX7-NEXT: v_cvt_f32_f16_e32 v1, s16
+; GFX7-NEXT: v_min_f32_e32 v0, v0, v1
+; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-SDAG-LABEL: v_minimumnum_f16_v_s:
; GFX8-SDAG: ; %bb.0:
@@ -1099,24 +1070,14 @@ define half @v_minimumnum_f16_v_s(half %x, half inreg %y) {
}
define half @v_minimumnum_f16_s_s(half inreg %x, half inreg %y) {
-; GFX7-SDAG-LABEL: v_minimumnum_f16_s_s:
-; GFX7-SDAG: ; %bb.0:
-; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, s16
-; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v1, s17
-; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1
-; GFX7-SDAG-NEXT: v_min_f32_e32 v0, v0, v1
-; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX7-GISEL-LABEL: v_minimumnum_f16_s_s:
-; GFX7-GISEL: ; %bb.0:
-; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v0, s16
-; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v1, s17
-; GFX7-GISEL-NEXT: v_min_f32_e32 v0, v0, v1
-; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX7-GISEL-NEXT: s_setpc_b64 s[30:31]
+; GFX7-LABEL: v_minimumnum_f16_s_s:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_cvt_f32_f16_e32 v0, s16
+; GFX7-NEXT: v_cvt_f32_f16_e32 v1, s17
+; GFX7-NEXT: v_min_f32_e32 v0, v0, v1
+; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-SDAG-LABEL: v_minimumnum_f16_s_s:
; GFX8-SDAG: ; %bb.0:
@@ -2437,24 +2398,14 @@ define float @v_minimumnum_f32_fneg(float %x, float %y) {
}
define half @v_minimumnum_f16_fabs_rhs(half %x, half %y) {
-; GFX7-SDAG-LABEL: v_minimumnum_f16_fabs_rhs:
-; GFX7-SDAG: ; %bb.0:
-; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1
-; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX7-SDAG-NEXT: v_cvt_f32_f16_e64 v1, |v1|
-; GFX7-SDAG-NEXT: v_min_f32_e32 v0, v0, v1
-; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX7-GISEL-LABEL: v_minimumnum_f16_fabs_rhs:
-; GFX7-GISEL: ; %bb.0:
-; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-GISEL-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX7-GISEL-NEXT: v_cvt_f32_f16_e64 v1, |v1|
-; GFX7-GISEL-NEXT: v_min_f32_e32 v0, v0, v1
-; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX7-GISEL-NEXT: s_setpc_b64 s[30:31]
+; GFX7-LABEL: v_minimumnum_f16_fabs_rhs:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX7-NEXT: v_cvt_f32_f16_e64 v1, |v1|
+; GFX7-NEXT: v_min_f32_e32 v0, v0, v1
+; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-SDAG-LABEL: v_minimumnum_f16_fabs_rhs:
; GFX8-SDAG: ; %bb.0:
@@ -2600,11 +2551,11 @@ define half @v_minimumnum_f16_fneg_fabs_rhs(half %x, half %y) {
; GFX7-SDAG-LABEL: v_minimumnum_f16_fneg_fabs_rhs:
; GFX7-SDAG: ; %bb.0:
; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GFX7-SDAG-NEXT: v_and_b32_e32 v1, 0x7fff, v1
; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX7-SDAG-NEXT: v_cvt_f32_f16_e64 v1, -|v1|
; GFX7-SDAG-NEXT: v_min_f32_e32 v0, v0, v1
+; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-GISEL-LABEL: v_minimumnum_f16_fneg_fabs_rhs:
@@ -2758,24 +2709,14 @@ define half @v_minimumnum_f16_fneg_fabs_rhs(half %x, half %y) {
}
define half @v_minimumnum_f16_fabs(half %x, half %y) {
-; GFX7-SDAG-LABEL: v_minimumnum_f16_fabs:
-; GFX7-SDAG: ; %bb.0:
-; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1
-; GFX7-SDAG-NEXT: v_cvt_f32_f16_e64 v0, |v0|
-; GFX7-SDAG-NEXT: v_cvt_f32_f16_e64 v1, |v1|
-; GFX7-SDAG-NEXT: v_min_f32_e32 v0, v0, v1
-; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX7-GISEL-LABEL: v_minimumnum_f16_fabs:
-; GFX7-GISEL: ; %bb.0:
-; GFX7-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-GISEL-NEXT: v_cvt_f32_f16_e64 v0, |v0|
-; GFX7-GISEL-NEXT: v_cvt_f32_f16_e64 v1, |v1|
-; GFX7-GISEL-NEXT: v_min_f32_e32 v0, v0, v1
-; GFX7-GISEL-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX7-GISEL-NEXT: s_setpc_b64 s[30:31]
+; GFX7-LABEL: v_minimumnum_f16_fabs:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_cvt_f32_f16_e64 v0, |v0|
+; GFX7-NEXT: v_cvt_f32_f16_e64 v1, |v1|
+; GFX7-NEXT: v_min_f32_e32 v0, v0, v1
+; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-SDAG-LABEL: v_minimumnum_f16_fabs:
; GFX8-SDAG: ; %bb.0:
@@ -2922,11 +2863,12 @@ define half @v_minimumnum_f16_fneg(half %x, half %y) {
; GFX7-SDAG-LABEL: v_minimumnum_f16_fneg:
; GFX7-SDAG: ; %bb.0:
; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-SDAG-NEXT: v_cvt_f16_f32_e64 v0, -v0
-; GFX7-SDAG-NEXT: v_cvt_f16_f32_e64 v1, -v1
-; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1
+; GFX7-SDAG-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX7-SDAG-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX7-SDAG-NEXT: v_cvt_f32_f16_e64 v0, -v0
+; GFX7-SDAG-NEXT: v_cvt_f32_f16_e64 v1, -v1
; GFX7-SDAG-NEXT: v_min_f32_e32 v0, v0, v1
+; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-GISEL-LABEL: v_minimumnum_f16_fneg:
@@ -8047,11 +7989,10 @@ define half @v_minimumnum_f16_no_ieee(half %x, half %y) #0 {
; GFX7-SDAG-LABEL: v_minimumnum_f16_no_ieee:
; GFX7-SDAG: ; %bb.0:
; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1
-; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1
; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX7-SDAG-NEXT: v_min_f32_e32 v0, v0, v1
+; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-GISEL-LABEL: v_minimumnum_f16_no_ieee:
@@ -8152,11 +8093,10 @@ define half @v_minimumnum_f16_nan_no_ieee(half %x, half %y) #0 {
; GFX7-SDAG-LABEL: v_minimumnum_f16_nan_no_ieee:
; GFX7-SDAG: ; %bb.0:
; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v1, v1
-; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1
; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX7-SDAG-NEXT: v_min_f32_e32 v0, v0, v1
+; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-GISEL-LABEL: v_minimumnum_f16_nan_no_ieee:
diff --git a/llvm/test/CodeGen/AMDGPU/omod.ll b/llvm/test/CodeGen/AMDGPU/omod.ll
index 9371ce57dc0fe..90632c663bf4a 100644
--- a/llvm/test/CodeGen/AMDGPU/omod.ll
+++ b/llvm/test/CodeGen/AMDGPU/omod.ll
@@ -1083,10 +1083,9 @@ define amdgpu_ps void @v_omod_mul2_f64_denormals(double %a) #2 {
define amdgpu_ps void @v_omod_div2_f16_denormals(half %a) #0 {
; SI-LABEL: v_omod_div2_f16_denormals:
; SI: ; %bb.0:
-; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
+; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s2, -1
-; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
; SI-NEXT: v_add_f32_e64 v0, v0, 1.0 div:2
; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
; SI-NEXT: buffer_store_short v0, off, s[0:3], 0
@@ -1140,10 +1139,9 @@ define amdgpu_ps void @v_omod_div2_f16_denormals(half %a) #0 {
define amdgpu_ps void @v_omod_mul2_f16_denormals(half %a) #0 {
; SI-LABEL: v_omod_mul2_f16_denormals:
; SI: ; %bb.0:
-; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
+; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s2, -1
-; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
; SI-NEXT: v_add_f32_e64 v0, v0, 1.0 mul:2
; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
; SI-NEXT: buffer_store_short v0, off, s[0:3], 0
@@ -1196,10 +1194,9 @@ define amdgpu_ps void @v_omod_mul2_f16_denormals(half %a) #0 {
define amdgpu_ps void @v_omod_div2_f16_no_denormals(half %a) #3 {
; SI-LABEL: v_omod_div2_f16_no_denormals:
; SI: ; %bb.0:
-; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
+; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s2, -1
-; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
; SI-NEXT: v_add_f32_e64 v0, v0, 1.0 div:2
; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
; SI-NEXT: buffer_store_short v0, off, s[0:3], 0
diff --git a/llvm/test/CodeGen/AMDGPU/private-memory-atomics.ll b/llvm/test/CodeGen/AMDGPU/private-memory-atomics.ll
index b43454840ee16..a549a2ff0f9a3 100644
--- a/llvm/test/CodeGen/AMDGPU/private-memory-atomics.ll
+++ b/llvm/test/CodeGen/AMDGPU/private-memory-atomics.ll
@@ -462,8 +462,8 @@ define bfloat @atomicrmw_fadd_private_bf16(ptr addrspace(5) %ptr) {
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: buffer_load_ushort v1, v0, s[0:3], 0 offen
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GCN-NEXT: v_add_f32_e32 v2, 2.0, v1
+; GCN-NEXT: v_lshlrev_b32_e32 v2, 16, v1
+; GCN-NEXT: v_add_f32_e32 v2, 2.0, v2
; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2
; GCN-NEXT: buffer_store_short v2, v0, s[0:3], 0 offen
; GCN-NEXT: v_mov_b32_e32 v0, v1
diff --git a/llvm/test/CodeGen/AMDGPU/repeated-divisor.ll b/llvm/test/CodeGen/AMDGPU/repeated-divisor.ll
index 38d0a377a3ffb..ff3a735bd32b4 100644
--- a/llvm/test/CodeGen/AMDGPU/repeated-divisor.ll
+++ b/llvm/test/CodeGen/AMDGPU/repeated-divisor.ll
@@ -246,17 +246,14 @@ define <2 x half> @v_repeat_divisor_f16_x2_arcp(half %x, half %y, half %D) #0 {
; GFX6-LABEL: v_repeat_divisor_f16_x2_arcp:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2
-; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1
-; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2
; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1
; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX6-NEXT: v_div_scale_f32 v3, s[4:5], v2, v2, 1.0
; GFX6-NEXT: v_rcp_f32_e32 v4, v3
+; GFX6-NEXT: v_fma_f32 v5, -v3, v4, 1.0
+; GFX6-NEXT: v_fma_f32 v4, v5, v4, v4
; GFX6-NEXT: v_div_scale_f32 v5, vcc, 1.0, v2, 1.0
-; GFX6-NEXT: v_fma_f32 v6, -v3, v4, 1.0
-; GFX6-NEXT: v_fma_f32 v4, v6, v4, v4
; GFX6-NEXT: v_mul_f32_e32 v6, v5, v4
; GFX6-NEXT: v_fma_f32 v7, -v3, v6, v5
; GFX6-NEXT: v_fma_f32 v6, v7, v4, v6
@@ -530,10 +527,6 @@ define <3 x half> @v_repeat_divisor_f16_x3_arcp(half %x, half %y, half %z, half
; GFX6-LABEL: v_repeat_divisor_f16_x3_arcp:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3
-; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1
-; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2
; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3
; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1
; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0
diff --git a/llvm/test/CodeGen/AMDGPU/roundeven.ll b/llvm/test/CodeGen/AMDGPU/roundeven.ll
index 0b7e4e90dc317..99d494d4feaf4 100644
--- a/llvm/test/CodeGen/AMDGPU/roundeven.ll
+++ b/llvm/test/CodeGen/AMDGPU/roundeven.ll
@@ -380,17 +380,17 @@ define half @v_roundeven_f16(half %x) {
; SDAG_GFX6-LABEL: v_roundeven_f16:
; SDAG_GFX6: ; %bb.0:
; SDAG_GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG_GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0
; SDAG_GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0
; SDAG_GFX6-NEXT: v_rndne_f32_e32 v0, v0
+; SDAG_GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0
; SDAG_GFX6-NEXT: s_setpc_b64 s[30:31]
;
; SDAG_GFX7-LABEL: v_roundeven_f16:
; SDAG_GFX7: ; %bb.0:
; SDAG_GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG_GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0
; SDAG_GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
; SDAG_GFX7-NEXT: v_rndne_f32_e32 v0, v0
+; SDAG_GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0
; SDAG_GFX7-NEXT: s_setpc_b64 s[30:31]
;
; SDAG_GFX8-LABEL: v_roundeven_f16:
diff --git a/llvm/test/CodeGen/AMDGPU/select-fabs-fneg-extract.f16.ll b/llvm/test/CodeGen/AMDGPU/select-fabs-fneg-extract.f16.ll
index 1222d0efd62bb..9ae6e60385bbb 100644
--- a/llvm/test/CodeGen/AMDGPU/select-fabs-fneg-extract.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/select-fabs-fneg-extract.f16.ll
@@ -13,15 +13,13 @@ define half @add_select_fabs_fabs_f16(i32 %c, half %x, half %y, half %z) {
; CI-LABEL: add_select_fabs_fabs_f16:
; CI: ; %bb.0:
; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CI-NEXT: v_cvt_f16_f32_e32 v1, v1
-; CI-NEXT: v_cvt_f16_f32_e32 v2, v2
-; CI-NEXT: v_cvt_f16_f32_e32 v3, v3
-; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; CI-NEXT: v_cvt_f32_f16_e32 v1, v1
; CI-NEXT: v_cvt_f32_f16_e32 v2, v2
; CI-NEXT: v_cvt_f32_f16_e32 v3, v3
+; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; CI-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
; CI-NEXT: v_add_f32_e64 v0, |v0|, v3
+; CI-NEXT: v_cvt_f16_f32_e32 v0, v0
; CI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: add_select_fabs_fabs_f16:
@@ -79,10 +77,6 @@ define { half, half } @add_select_multi_use_lhs_fabs_fabs_f16(i32 %c, half %x, h
; CI-LABEL: add_select_multi_use_lhs_fabs_fabs_f16:
; CI: ; %bb.0:
; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CI-NEXT: v_cvt_f16_f32_e32 v1, v1
-; CI-NEXT: v_cvt_f16_f32_e32 v2, v2
-; CI-NEXT: v_cvt_f16_f32_e32 v4, v4
-; CI-NEXT: v_cvt_f16_f32_e32 v3, v3
; CI-NEXT: v_cvt_f32_f16_e32 v1, v1
; CI-NEXT: v_cvt_f32_f16_e32 v2, v2
; CI-NEXT: v_cvt_f32_f16_e32 v4, v4
@@ -91,6 +85,8 @@ define { half, half } @add_select_multi_use_lhs_fabs_fabs_f16(i32 %c, half %x, h
; CI-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
; CI-NEXT: v_add_f32_e64 v0, |v0|, v4
; CI-NEXT: v_add_f32_e64 v1, |v1|, v3
+; CI-NEXT: v_cvt_f16_f32_e32 v0, v0
+; CI-NEXT: v_cvt_f16_f32_e32 v1, v1
; CI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: add_select_multi_use_lhs_fabs_fabs_f16:
@@ -156,16 +152,14 @@ define { half, half } @add_select_multi_store_use_lhs_fabs_fabs_f16(i32 %c, half
; CI-LABEL: add_select_multi_store_use_lhs_fabs_fabs_f16:
; CI: ; %bb.0:
; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CI-NEXT: v_cvt_f16_f32_e32 v1, v1
-; CI-NEXT: v_cvt_f16_f32_e32 v2, v2
-; CI-NEXT: v_cvt_f16_f32_e32 v3, v3
-; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; CI-NEXT: v_cvt_f32_f16_e32 v4, v1
+; CI-NEXT: v_cvt_f32_f16_e32 v1, v1
; CI-NEXT: v_cvt_f32_f16_e32 v2, v2
; CI-NEXT: v_cvt_f32_f16_e32 v3, v3
-; CI-NEXT: v_and_b32_e32 v1, 0x7fffffff, v4
-; CI-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc
+; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; CI-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
; CI-NEXT: v_add_f32_e64 v0, |v0|, v3
+; CI-NEXT: v_cvt_f16_f32_e32 v0, v0
+; CI-NEXT: v_cvt_f16_f32_e64 v1, |v1|
; CI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: add_select_multi_store_use_lhs_fabs_fabs_f16:
@@ -231,10 +225,6 @@ define { half, half } @add_select_multi_use_rhs_fabs_fabs_f16(i32 %c, half %x, h
; CI-LABEL: add_select_multi_use_rhs_fabs_fabs_f16:
; CI: ; %bb.0:
; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CI-NEXT: v_cvt_f16_f32_e32 v1, v1
-; CI-NEXT: v_cvt_f16_f32_e32 v2, v2
-; CI-NEXT: v_cvt_f16_f32_e32 v3, v3
-; CI-NEXT: v_cvt_f16_f32_e32 v4, v4
; CI-NEXT: v_cvt_f32_f16_e32 v1, v1
; CI-NEXT: v_cvt_f32_f16_e32 v2, v2
; CI-NEXT: v_cvt_f32_f16_e32 v3, v3
@@ -243,6 +233,8 @@ define { half, half } @add_select_multi_use_rhs_fabs_fabs_f16(i32 %c, half %x, h
; CI-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
; CI-NEXT: v_add_f32_e64 v0, |v0|, v3
; CI-NEXT: v_add_f32_e64 v1, |v2|, v4
+; CI-NEXT: v_cvt_f16_f32_e32 v0, v0
+; CI-NEXT: v_cvt_f16_f32_e32 v1, v1
; CI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: add_select_multi_use_rhs_fabs_fabs_f16:
@@ -308,15 +300,13 @@ define half @add_select_fabs_var_f16(i32 %c, half %x, half %y, half %z) {
; CI-LABEL: add_select_fabs_var_f16:
; CI: ; %bb.0:
; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CI-NEXT: v_cvt_f16_f32_e32 v2, v2
-; CI-NEXT: v_cvt_f16_f32_e32 v1, v1
-; CI-NEXT: v_cvt_f16_f32_e32 v3, v3
-; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; CI-NEXT: v_cvt_f32_f16_e32 v2, v2
; CI-NEXT: v_cvt_f32_f16_e64 v1, |v1|
+; CI-NEXT: v_cvt_f32_f16_e32 v2, v2
; CI-NEXT: v_cvt_f32_f16_e32 v3, v3
+; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; CI-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
; CI-NEXT: v_add_f32_e32 v0, v0, v3
+; CI-NEXT: v_cvt_f16_f32_e32 v0, v0
; CI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: add_select_fabs_var_f16:
@@ -378,13 +368,12 @@ define half @add_select_fabs_negk_f16(i32 %c, half %x, half %y) {
; CI-LABEL: add_select_fabs_negk_f16:
; CI: ; %bb.0:
; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CI-NEXT: v_cvt_f16_f32_e32 v1, v1
-; CI-NEXT: v_cvt_f16_f32_e32 v2, v2
-; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; CI-NEXT: v_cvt_f32_f16_e64 v1, |v1|
; CI-NEXT: v_cvt_f32_f16_e32 v2, v2
+; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; CI-NEXT: v_cndmask_b32_e32 v0, -1.0, v1, vcc
; CI-NEXT: v_add_f32_e32 v0, v0, v2
+; CI-NEXT: v_cvt_f16_f32_e32 v0, v0
; CI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: add_select_fabs_negk_f16:
@@ -448,11 +437,11 @@ define half @add_select_fabs_negk_negk_f16(i32 %c, half %x) {
; CI-LABEL: add_select_fabs_negk_negk_f16:
; CI: ; %bb.0:
; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CI-NEXT: v_cvt_f16_f32_e32 v1, v1
+; CI-NEXT: v_cvt_f32_f16_e32 v1, v1
; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; CI-NEXT: v_cndmask_b32_e64 v0, -1.0, -2.0, vcc
-; CI-NEXT: v_cvt_f32_f16_e32 v1, v1
; CI-NEXT: v_sub_f32_e32 v0, v1, v0
+; CI-NEXT: v_cvt_f16_f32_e32 v0, v0
; CI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: add_select_fabs_negk_negk_f16:
@@ -515,11 +504,11 @@ define half @add_select_posk_posk_f16(i32 %c, half %x) {
; CI-LABEL: add_select_posk_posk_f16:
; CI: ; %bb.0:
; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CI-NEXT: v_cvt_f16_f32_e32 v1, v1
+; CI-NEXT: v_cvt_f32_f16_e32 v1, v1
; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; CI-NEXT: v_cndmask_b32_e64 v0, 1.0, 2.0, vcc
-; CI-NEXT: v_cvt_f32_f16_e32 v1, v1
; CI-NEXT: v_add_f32_e32 v0, v0, v1
+; CI-NEXT: v_cvt_f16_f32_e32 v0, v0
; CI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: add_select_posk_posk_f16:
@@ -581,13 +570,12 @@ define half @add_select_negk_fabs_f16(i32 %c, half %x, half %y) {
; CI-LABEL: add_select_negk_fabs_f16:
; CI: ; %bb.0:
; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CI-NEXT: v_cvt_f16_f32_e32 v1, v1
-; CI-NEXT: v_cvt_f16_f32_e32 v2, v2
-; CI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
; CI-NEXT: v_cvt_f32_f16_e64 v1, |v1|
; CI-NEXT: v_cvt_f32_f16_e32 v2, v2
+; CI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
; CI-NEXT: v_cndmask_b32_e32 v0, -1.0, v1, vcc
; CI-NEXT: v_add_f32_e32 v0, v0, v2
+; CI-NEXT: v_cvt_f16_f32_e32 v0, v0
; CI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: add_select_negk_fabs_f16:
@@ -650,14 +638,13 @@ define half @add_select_negliteralk_fabs_f16(i32 %c, half %x, half %y) {
; CI-LABEL: add_select_negliteralk_fabs_f16:
; CI: ; %bb.0:
; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CI-NEXT: v_cvt_f16_f32_e32 v1, v1
-; CI-NEXT: v_cvt_f16_f32_e32 v2, v2
-; CI-NEXT: v_mov_b32_e32 v3, 0xc4800000
-; CI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
; CI-NEXT: v_cvt_f32_f16_e64 v1, |v1|
; CI-NEXT: v_cvt_f32_f16_e32 v2, v2
+; CI-NEXT: v_mov_b32_e32 v3, 0xc4800000
+; CI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
; CI-NEXT: v_cndmask_b32_e32 v0, v3, v1, vcc
; CI-NEXT: v_add_f32_e32 v0, v0, v2
+; CI-NEXT: v_cvt_f16_f32_e32 v0, v0
; CI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: add_select_negliteralk_fabs_f16:
@@ -720,13 +707,12 @@ define half @add_select_fabs_posk_f16(i32 %c, half %x, half %y) {
; CI-LABEL: add_select_fabs_posk_f16:
; CI: ; %bb.0:
; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CI-NEXT: v_cvt_f16_f32_e32 v1, v1
-; CI-NEXT: v_cvt_f16_f32_e32 v2, v2
-; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; CI-NEXT: v_cvt_f32_f16_e32 v1, v1
; CI-NEXT: v_cvt_f32_f16_e32 v2, v2
+; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; CI-NEXT: v_cndmask_b32_e32 v0, 1.0, v1, vcc
; CI-NEXT: v_add_f32_e64 v0, |v0|, v2
+; CI-NEXT: v_cvt_f16_f32_e32 v0, v0
; CI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: add_select_fabs_posk_f16:
@@ -784,13 +770,12 @@ define half @add_select_posk_fabs_f16(i32 %c, half %x, half %y) {
; CI-LABEL: add_select_posk_fabs_f16:
; CI: ; %bb.0:
; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CI-NEXT: v_cvt_f16_f32_e32 v1, v1
-; CI-NEXT: v_cvt_f16_f32_e32 v2, v2
-; CI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
; CI-NEXT: v_cvt_f32_f16_e32 v1, v1
; CI-NEXT: v_cvt_f32_f16_e32 v2, v2
+; CI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
; CI-NEXT: v_cndmask_b32_e32 v0, 1.0, v1, vcc
; CI-NEXT: v_add_f32_e64 v0, |v0|, v2
+; CI-NEXT: v_cvt_f16_f32_e32 v0, v0
; CI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: add_select_posk_fabs_f16:
@@ -848,15 +833,13 @@ define half @add_select_fneg_fneg_f16(i32 %c, half %x, half %y, half %z) {
; CI-LABEL: add_select_fneg_fneg_f16:
; CI: ; %bb.0:
; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CI-NEXT: v_cvt_f16_f32_e32 v1, v1
-; CI-NEXT: v_cvt_f16_f32_e32 v2, v2
-; CI-NEXT: v_cvt_f16_f32_e32 v3, v3
-; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; CI-NEXT: v_cvt_f32_f16_e32 v1, v1
; CI-NEXT: v_cvt_f32_f16_e32 v2, v2
; CI-NEXT: v_cvt_f32_f16_e32 v3, v3
+; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; CI-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
; CI-NEXT: v_sub_f32_e32 v0, v3, v0
+; CI-NEXT: v_cvt_f16_f32_e32 v0, v0
; CI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: add_select_fneg_fneg_f16:
@@ -914,10 +897,6 @@ define { half, half } @add_select_multi_use_lhs_fneg_fneg_f16(i32 %c, half %x, h
; CI-LABEL: add_select_multi_use_lhs_fneg_fneg_f16:
; CI: ; %bb.0:
; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CI-NEXT: v_cvt_f16_f32_e32 v1, v1
-; CI-NEXT: v_cvt_f16_f32_e32 v2, v2
-; CI-NEXT: v_cvt_f16_f32_e32 v3, v3
-; CI-NEXT: v_cvt_f16_f32_e32 v4, v4
; CI-NEXT: v_cvt_f32_f16_e32 v1, v1
; CI-NEXT: v_cvt_f32_f16_e32 v2, v2
; CI-NEXT: v_cvt_f32_f16_e32 v3, v3
@@ -926,6 +905,8 @@ define { half, half } @add_select_multi_use_lhs_fneg_fneg_f16(i32 %c, half %x, h
; CI-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
; CI-NEXT: v_sub_f32_e32 v0, v3, v0
; CI-NEXT: v_sub_f32_e32 v1, v4, v1
+; CI-NEXT: v_cvt_f16_f32_e32 v0, v0
+; CI-NEXT: v_cvt_f16_f32_e32 v1, v1
; CI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: add_select_multi_use_lhs_fneg_fneg_f16:
@@ -991,16 +972,15 @@ define { half, half } @add_select_multi_store_use_lhs_fneg_fneg_f16(i32 %c, half
; CI-LABEL: add_select_multi_store_use_lhs_fneg_fneg_f16:
; CI: ; %bb.0:
; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CI-NEXT: v_cvt_f16_f32_e32 v1, v1
-; CI-NEXT: v_cvt_f16_f32_e32 v2, v2
-; CI-NEXT: v_cvt_f16_f32_e32 v3, v3
-; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; CI-NEXT: v_cvt_f32_f16_e32 v4, v1
; CI-NEXT: v_cvt_f32_f16_e32 v2, v2
; CI-NEXT: v_cvt_f32_f16_e32 v3, v3
-; CI-NEXT: v_cvt_f32_f16_e64 v1, -v1
+; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; CI-NEXT: v_and_b32_e32 v1, 0xffff, v1
; CI-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc
; CI-NEXT: v_sub_f32_e32 v0, v3, v0
+; CI-NEXT: v_cvt_f16_f32_e32 v0, v0
+; CI-NEXT: v_xor_b32_e32 v1, 0x8000, v1
; CI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: add_select_multi_store_use_lhs_fneg_fneg_f16:
@@ -1066,10 +1046,6 @@ define { half, half } @add_select_multi_use_rhs_fneg_fneg_f16(i32 %c, half %x, h
; CI-LABEL: add_select_multi_use_rhs_fneg_fneg_f16:
; CI: ; %bb.0:
; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CI-NEXT: v_cvt_f16_f32_e32 v1, v1
-; CI-NEXT: v_cvt_f16_f32_e32 v2, v2
-; CI-NEXT: v_cvt_f16_f32_e32 v3, v3
-; CI-NEXT: v_cvt_f16_f32_e32 v4, v4
; CI-NEXT: v_cvt_f32_f16_e32 v1, v1
; CI-NEXT: v_cvt_f32_f16_e32 v2, v2
; CI-NEXT: v_cvt_f32_f16_e32 v3, v3
@@ -1078,6 +1054,8 @@ define { half, half } @add_select_multi_use_rhs_fneg_fneg_f16(i32 %c, half %x, h
; CI-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
; CI-NEXT: v_sub_f32_e32 v0, v3, v0
; CI-NEXT: v_sub_f32_e32 v1, v4, v2
+; CI-NEXT: v_cvt_f16_f32_e32 v0, v0
+; CI-NEXT: v_cvt_f16_f32_e32 v1, v1
; CI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: add_select_multi_use_rhs_fneg_fneg_f16:
@@ -1143,15 +1121,14 @@ define half @add_select_fneg_var_f16(i32 %c, half %x, half %y, half %z) {
; CI-LABEL: add_select_fneg_var_f16:
; CI: ; %bb.0:
; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CI-NEXT: v_cvt_f16_f32_e32 v2, v2
-; CI-NEXT: v_cvt_f16_f32_e64 v1, -v1
-; CI-NEXT: v_cvt_f16_f32_e32 v3, v3
-; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; CI-NEXT: v_and_b32_e32 v1, 0xffff, v1
; CI-NEXT: v_cvt_f32_f16_e32 v2, v2
-; CI-NEXT: v_cvt_f32_f16_e32 v1, v1
+; CI-NEXT: v_cvt_f32_f16_e64 v1, -v1
; CI-NEXT: v_cvt_f32_f16_e32 v3, v3
+; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; CI-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
; CI-NEXT: v_add_f32_e32 v0, v0, v3
+; CI-NEXT: v_cvt_f16_f32_e32 v0, v0
; CI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: add_select_fneg_var_f16:
@@ -1213,13 +1190,12 @@ define half @add_select_fneg_negk_f16(i32 %c, half %x, half %y) {
; CI-LABEL: add_select_fneg_negk_f16:
; CI: ; %bb.0:
; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CI-NEXT: v_cvt_f16_f32_e32 v1, v1
-; CI-NEXT: v_cvt_f16_f32_e32 v2, v2
-; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; CI-NEXT: v_cvt_f32_f16_e32 v1, v1
; CI-NEXT: v_cvt_f32_f16_e32 v2, v2
+; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; CI-NEXT: v_cndmask_b32_e32 v0, 1.0, v1, vcc
; CI-NEXT: v_sub_f32_e32 v0, v2, v0
+; CI-NEXT: v_cvt_f16_f32_e32 v0, v0
; CI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: add_select_fneg_negk_f16:
@@ -1277,14 +1253,13 @@ define half @add_select_fneg_inv2pi_f16(i32 %c, half %x, half %y) {
; CI-LABEL: add_select_fneg_inv2pi_f16:
; CI: ; %bb.0:
; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CI-NEXT: v_cvt_f16_f32_e32 v1, v1
-; CI-NEXT: v_cvt_f16_f32_e32 v2, v2
-; CI-NEXT: v_mov_b32_e32 v3, 0xbe230000
-; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; CI-NEXT: v_cvt_f32_f16_e32 v1, v1
; CI-NEXT: v_cvt_f32_f16_e32 v2, v2
+; CI-NEXT: v_mov_b32_e32 v3, 0xbe230000
+; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; CI-NEXT: v_cndmask_b32_e32 v0, v3, v1, vcc
; CI-NEXT: v_sub_f32_e32 v0, v2, v0
+; CI-NEXT: v_cvt_f16_f32_e32 v0, v0
; CI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: add_select_fneg_inv2pi_f16:
@@ -1342,14 +1317,13 @@ define half @add_select_fneg_neginv2pi_f16(i32 %c, half %x, half %y) {
; CI-LABEL: add_select_fneg_neginv2pi_f16:
; CI: ; %bb.0:
; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CI-NEXT: v_cvt_f16_f32_e32 v1, v1
-; CI-NEXT: v_cvt_f16_f32_e32 v2, v2
-; CI-NEXT: v_mov_b32_e32 v3, 0x3e230000
-; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; CI-NEXT: v_cvt_f32_f16_e32 v1, v1
; CI-NEXT: v_cvt_f32_f16_e32 v2, v2
+; CI-NEXT: v_mov_b32_e32 v3, 0x3e230000
+; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; CI-NEXT: v_cndmask_b32_e32 v0, v3, v1, vcc
; CI-NEXT: v_sub_f32_e32 v0, v2, v0
+; CI-NEXT: v_cvt_f16_f32_e32 v0, v0
; CI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: add_select_fneg_neginv2pi_f16:
@@ -1407,11 +1381,11 @@ define half @add_select_negk_negk_f16(i32 %c, half %x) {
; CI-LABEL: add_select_negk_negk_f16:
; CI: ; %bb.0:
; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CI-NEXT: v_cvt_f16_f32_e32 v1, v1
+; CI-NEXT: v_cvt_f32_f16_e32 v1, v1
; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; CI-NEXT: v_cndmask_b32_e64 v0, -1.0, -2.0, vcc
-; CI-NEXT: v_cvt_f32_f16_e32 v1, v1
; CI-NEXT: v_add_f32_e32 v0, v0, v1
+; CI-NEXT: v_cvt_f16_f32_e32 v0, v0
; CI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: add_select_negk_negk_f16:
@@ -1473,13 +1447,13 @@ define half @add_select_negliteralk_negliteralk_f16(i32 %c, half %x) {
; CI-LABEL: add_select_negliteralk_negliteralk_f16:
; CI: ; %bb.0:
; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CI-NEXT: v_cvt_f16_f32_e32 v1, v1
+; CI-NEXT: v_cvt_f32_f16_e32 v1, v1
; CI-NEXT: v_mov_b32_e32 v2, 0xc5800000
; CI-NEXT: v_mov_b32_e32 v3, 0xc5000000
; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; CI-NEXT: v_cvt_f32_f16_e32 v1, v1
; CI-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc
; CI-NEXT: v_add_f32_e32 v0, v0, v1
+; CI-NEXT: v_cvt_f16_f32_e32 v0, v0
; CI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: add_select_negliteralk_negliteralk_f16:
@@ -1541,11 +1515,11 @@ define half @add_select_fneg_negk_negk_f16(i32 %c, half %x) {
; CI-LABEL: add_select_fneg_negk_negk_f16:
; CI: ; %bb.0:
; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CI-NEXT: v_cvt_f16_f32_e32 v1, v1
+; CI-NEXT: v_cvt_f32_f16_e32 v1, v1
; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; CI-NEXT: v_cndmask_b32_e64 v0, -1.0, -2.0, vcc
-; CI-NEXT: v_cvt_f32_f16_e32 v1, v1
; CI-NEXT: v_sub_f32_e32 v0, v1, v0
+; CI-NEXT: v_cvt_f16_f32_e32 v0, v0
; CI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: add_select_fneg_negk_negk_f16:
@@ -1608,13 +1582,12 @@ define half @add_select_negk_fneg_f16(i32 %c, half %x, half %y) {
; CI-LABEL: add_select_negk_fneg_f16:
; CI: ; %bb.0:
; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CI-NEXT: v_cvt_f16_f32_e32 v1, v1
-; CI-NEXT: v_cvt_f16_f32_e32 v2, v2
-; CI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
; CI-NEXT: v_cvt_f32_f16_e32 v1, v1
; CI-NEXT: v_cvt_f32_f16_e32 v2, v2
+; CI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
; CI-NEXT: v_cndmask_b32_e32 v0, 1.0, v1, vcc
; CI-NEXT: v_sub_f32_e32 v0, v2, v0
+; CI-NEXT: v_cvt_f16_f32_e32 v0, v0
; CI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: add_select_negk_fneg_f16:
@@ -1672,13 +1645,12 @@ define half @add_select_fneg_posk_f16(i32 %c, half %x, half %y) {
; CI-LABEL: add_select_fneg_posk_f16:
; CI: ; %bb.0:
; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CI-NEXT: v_cvt_f16_f32_e32 v1, v1
-; CI-NEXT: v_cvt_f16_f32_e32 v2, v2
-; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; CI-NEXT: v_cvt_f32_f16_e32 v1, v1
; CI-NEXT: v_cvt_f32_f16_e32 v2, v2
+; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; CI-NEXT: v_cndmask_b32_e32 v0, -1.0, v1, vcc
; CI-NEXT: v_sub_f32_e32 v0, v2, v0
+; CI-NEXT: v_cvt_f16_f32_e32 v0, v0
; CI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: add_select_fneg_posk_f16:
@@ -1736,13 +1708,12 @@ define half @add_select_posk_fneg_f16(i32 %c, half %x, half %y) {
; CI-LABEL: add_select_posk_fneg_f16:
; CI: ; %bb.0:
; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CI-NEXT: v_cvt_f16_f32_e32 v1, v1
-; CI-NEXT: v_cvt_f16_f32_e32 v2, v2
-; CI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
; CI-NEXT: v_cvt_f32_f16_e32 v1, v1
; CI-NEXT: v_cvt_f32_f16_e32 v2, v2
+; CI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
; CI-NEXT: v_cndmask_b32_e32 v0, -1.0, v1, vcc
; CI-NEXT: v_sub_f32_e32 v0, v2, v0
+; CI-NEXT: v_cvt_f16_f32_e32 v0, v0
; CI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: add_select_posk_fneg_f16:
@@ -1800,15 +1771,14 @@ define half @add_select_negfabs_fabs_f16(i32 %c, half %x, half %y, half %z) {
; CI-LABEL: add_select_negfabs_fabs_f16:
; CI: ; %bb.0:
; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CI-NEXT: v_cvt_f16_f32_e32 v1, v1
-; CI-NEXT: v_cvt_f16_f32_e32 v2, v2
-; CI-NEXT: v_cvt_f16_f32_e32 v3, v3
-; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; CI-NEXT: v_cvt_f32_f16_e64 v1, -|v1|
+; CI-NEXT: v_and_b32_e32 v1, 0x7fff, v1
; CI-NEXT: v_cvt_f32_f16_e64 v2, |v2|
+; CI-NEXT: v_cvt_f32_f16_e64 v1, -|v1|
; CI-NEXT: v_cvt_f32_f16_e32 v3, v3
+; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; CI-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
; CI-NEXT: v_add_f32_e32 v0, v0, v3
+; CI-NEXT: v_cvt_f16_f32_e32 v0, v0
; CI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: add_select_negfabs_fabs_f16:
@@ -1877,15 +1847,14 @@ define half @add_select_fabs_negfabs_f16(i32 %c, half %x, half %y, half %z) {
; CI-LABEL: add_select_fabs_negfabs_f16:
; CI: ; %bb.0:
; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CI-NEXT: v_cvt_f16_f32_e32 v1, v1
-; CI-NEXT: v_cvt_f16_f32_e32 v2, v2
-; CI-NEXT: v_cvt_f16_f32_e32 v3, v3
-; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; CI-NEXT: v_and_b32_e32 v2, 0x7fff, v2
; CI-NEXT: v_cvt_f32_f16_e64 v1, |v1|
; CI-NEXT: v_cvt_f32_f16_e64 v2, -|v2|
; CI-NEXT: v_cvt_f32_f16_e32 v3, v3
+; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; CI-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
; CI-NEXT: v_add_f32_e32 v0, v0, v3
+; CI-NEXT: v_cvt_f16_f32_e32 v0, v0
; CI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: add_select_fabs_negfabs_f16:
@@ -1954,15 +1923,14 @@ define half @add_select_neg_fabs_f16(i32 %c, half %x, half %y, half %z) {
; CI-LABEL: add_select_neg_fabs_f16:
; CI: ; %bb.0:
; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CI-NEXT: v_cvt_f16_f32_e64 v1, -v1
-; CI-NEXT: v_cvt_f16_f32_e32 v2, v2
-; CI-NEXT: v_cvt_f16_f32_e32 v3, v3
-; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
-; CI-NEXT: v_cvt_f32_f16_e32 v1, v1
+; CI-NEXT: v_and_b32_e32 v1, 0xffff, v1
; CI-NEXT: v_cvt_f32_f16_e64 v2, |v2|
+; CI-NEXT: v_cvt_f32_f16_e64 v1, -v1
; CI-NEXT: v_cvt_f32_f16_e32 v3, v3
+; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; CI-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
; CI-NEXT: v_add_f32_e32 v0, v0, v3
+; CI-NEXT: v_cvt_f16_f32_e32 v0, v0
; CI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: add_select_neg_fabs_f16:
@@ -2030,15 +1998,14 @@ define half @add_select_fabs_neg_f16(i32 %c, half %x, half %y, half %z) {
; CI-LABEL: add_select_fabs_neg_f16:
; CI: ; %bb.0:
; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CI-NEXT: v_cvt_f16_f32_e32 v1, v1
-; CI-NEXT: v_cvt_f16_f32_e64 v2, -v2
-; CI-NEXT: v_cvt_f16_f32_e32 v3, v3
-; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; CI-NEXT: v_and_b32_e32 v2, 0xffff, v2
; CI-NEXT: v_cvt_f32_f16_e64 v1, |v1|
-; CI-NEXT: v_cvt_f32_f16_e32 v2, v2
+; CI-NEXT: v_cvt_f32_f16_e64 v2, -v2
; CI-NEXT: v_cvt_f32_f16_e32 v3, v3
+; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; CI-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
; CI-NEXT: v_add_f32_e32 v0, v0, v3
+; CI-NEXT: v_cvt_f16_f32_e32 v0, v0
; CI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: add_select_fabs_neg_f16:
@@ -2106,15 +2073,13 @@ define half @add_select_neg_negfabs_f16(i32 %c, half %x, half %y, half %z) {
; CI-LABEL: add_select_neg_negfabs_f16:
; CI: ; %bb.0:
; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CI-NEXT: v_cvt_f16_f32_e32 v1, v1
-; CI-NEXT: v_cvt_f16_f32_e32 v2, v2
-; CI-NEXT: v_cvt_f16_f32_e32 v3, v3
-; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; CI-NEXT: v_cvt_f32_f16_e32 v1, v1
; CI-NEXT: v_cvt_f32_f16_e64 v2, |v2|
; CI-NEXT: v_cvt_f32_f16_e32 v3, v3
+; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; CI-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc
; CI-NEXT: v_sub_f32_e32 v0, v3, v0
+; CI-NEXT: v_cvt_f16_f32_e32 v0, v0
; CI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: add_select_neg_negfabs_f16:
@@ -2178,15 +2143,13 @@ define half @add_select_negfabs_neg_f16(i32 %c, half %x, half %y, half %z) {
; CI-LABEL: add_select_negfabs_neg_f16:
; CI: ; %bb.0:
; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CI-NEXT: v_cvt_f16_f32_e32 v2, v2
-; CI-NEXT: v_cvt_f16_f32_e32 v1, v1
-; CI-NEXT: v_cvt_f16_f32_e32 v3, v3
-; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; CI-NEXT: v_cvt_f32_f16_e32 v2, v2
; CI-NEXT: v_cvt_f32_f16_e64 v1, |v1|
; CI-NEXT: v_cvt_f32_f16_e32 v3, v3
+; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; CI-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
; CI-NEXT: v_sub_f32_e32 v0, v3, v0
+; CI-NEXT: v_cvt_f16_f32_e32 v0, v0
; CI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: add_select_negfabs_neg_f16:
@@ -2250,13 +2213,13 @@ define half @mul_select_negfabs_posk_f16(i32 %c, half %x, half %y) {
; CI-LABEL: mul_select_negfabs_posk_f16:
; CI: ; %bb.0:
; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CI-NEXT: v_cvt_f16_f32_e32 v1, v1
-; CI-NEXT: v_cvt_f16_f32_e32 v2, v2
-; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; CI-NEXT: v_and_b32_e32 v1, 0x7fff, v1
; CI-NEXT: v_cvt_f32_f16_e64 v1, -|v1|
; CI-NEXT: v_cvt_f32_f16_e32 v2, v2
+; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; CI-NEXT: v_cndmask_b32_e32 v0, 4.0, v1, vcc
; CI-NEXT: v_mul_f32_e32 v0, v0, v2
+; CI-NEXT: v_cvt_f16_f32_e32 v0, v0
; CI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: mul_select_negfabs_posk_f16:
@@ -2320,13 +2283,13 @@ define half @mul_select_posk_negfabs_f16(i32 %c, half %x, half %y) {
; CI-LABEL: mul_select_posk_negfabs_f16:
; CI: ; %bb.0:
; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CI-NEXT: v_cvt_f16_f32_e32 v1, v1
-; CI-NEXT: v_cvt_f16_f32_e32 v2, v2
-; CI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
+; CI-NEXT: v_and_b32_e32 v1, 0x7fff, v1
; CI-NEXT: v_cvt_f32_f16_e64 v1, -|v1|
; CI-NEXT: v_cvt_f32_f16_e32 v2, v2
+; CI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
; CI-NEXT: v_cndmask_b32_e32 v0, 4.0, v1, vcc
; CI-NEXT: v_mul_f32_e32 v0, v0, v2
+; CI-NEXT: v_cvt_f16_f32_e32 v0, v0
; CI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: mul_select_posk_negfabs_f16:
@@ -2390,13 +2353,13 @@ define half @mul_select_negfabs_negk_f16(i32 %c, half %x, half %y) {
; CI-LABEL: mul_select_negfabs_negk_f16:
; CI: ; %bb.0:
; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CI-NEXT: v_cvt_f16_f32_e32 v1, v1
-; CI-NEXT: v_cvt_f16_f32_e32 v2, v2
-; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; CI-NEXT: v_and_b32_e32 v1, 0x7fff, v1
; CI-NEXT: v_cvt_f32_f16_e64 v1, -|v1|
; CI-NEXT: v_cvt_f32_f16_e32 v2, v2
+; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; CI-NEXT: v_cndmask_b32_e32 v0, -4.0, v1, vcc
; CI-NEXT: v_mul_f32_e32 v0, v0, v2
+; CI-NEXT: v_cvt_f16_f32_e32 v0, v0
; CI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: mul_select_negfabs_negk_f16:
@@ -2460,13 +2423,13 @@ define half @mul_select_negk_negfabs_f16(i32 %c, half %x, half %y) {
; CI-LABEL: mul_select_negk_negfabs_f16:
; CI: ; %bb.0:
; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CI-NEXT: v_cvt_f16_f32_e32 v1, v1
-; CI-NEXT: v_cvt_f16_f32_e32 v2, v2
-; CI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
+; CI-NEXT: v_and_b32_e32 v1, 0x7fff, v1
; CI-NEXT: v_cvt_f32_f16_e64 v1, -|v1|
; CI-NEXT: v_cvt_f32_f16_e32 v2, v2
+; CI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
; CI-NEXT: v_cndmask_b32_e32 v0, -4.0, v1, vcc
; CI-NEXT: v_mul_f32_e32 v0, v0, v2
+; CI-NEXT: v_cvt_f16_f32_e32 v0, v0
; CI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: mul_select_negk_negfabs_f16:
@@ -2534,11 +2497,11 @@ define half @select_fneg_posk_src_add_f16(i32 %c, half %x, half %y) {
; CI-SAFE-LABEL: select_fneg_posk_src_add_f16:
; CI-SAFE: ; %bb.0:
; CI-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CI-SAFE-NEXT: v_cvt_f16_f32_e32 v1, v1
-; CI-SAFE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; CI-SAFE-NEXT: v_cvt_f32_f16_e32 v1, v1
+; CI-SAFE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; CI-SAFE-NEXT: v_add_f32_e32 v1, 4.0, v1
; CI-SAFE-NEXT: v_cndmask_b32_e64 v0, 2.0, -v1, vcc
+; CI-SAFE-NEXT: v_cvt_f16_f32_e32 v0, v0
; CI-SAFE-NEXT: s_setpc_b64 s[30:31]
;
; VI-SAFE-LABEL: select_fneg_posk_src_add_f16:
@@ -2574,11 +2537,11 @@ define half @select_fneg_posk_src_add_f16(i32 %c, half %x, half %y) {
; CI-NSZ-LABEL: select_fneg_posk_src_add_f16:
; CI-NSZ: ; %bb.0:
; CI-NSZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CI-NSZ-NEXT: v_cvt_f16_f32_e32 v1, v1
-; CI-NSZ-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; CI-NSZ-NEXT: v_cvt_f32_f16_e32 v1, v1
+; CI-NSZ-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; CI-NSZ-NEXT: v_sub_f32_e32 v1, -4.0, v1
; CI-NSZ-NEXT: v_cndmask_b32_e32 v0, 2.0, v1, vcc
+; CI-NSZ-NEXT: v_cvt_f16_f32_e32 v0, v0
; CI-NSZ-NEXT: s_setpc_b64 s[30:31]
;
; VI-NSZ-LABEL: select_fneg_posk_src_add_f16:
@@ -2618,11 +2581,11 @@ define half @select_fneg_posk_src_sub_f16(i32 %c, half %x) {
; CI-SAFE-LABEL: select_fneg_posk_src_sub_f16:
; CI-SAFE: ; %bb.0:
; CI-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CI-SAFE-NEXT: v_cvt_f16_f32_e32 v1, v1
-; CI-SAFE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; CI-SAFE-NEXT: v_cvt_f32_f16_e32 v1, v1
+; CI-SAFE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; CI-SAFE-NEXT: v_add_f32_e32 v1, -4.0, v1
; CI-SAFE-NEXT: v_cndmask_b32_e64 v0, 2.0, -v1, vcc
+; CI-SAFE-NEXT: v_cvt_f16_f32_e32 v0, v0
; CI-SAFE-NEXT: s_setpc_b64 s[30:31]
;
; VI-SAFE-LABEL: select_fneg_posk_src_sub_f16:
@@ -2658,11 +2621,11 @@ define half @select_fneg_posk_src_sub_f16(i32 %c, half %x) {
; CI-NSZ-LABEL: select_fneg_posk_src_sub_f16:
; CI-NSZ: ; %bb.0:
; CI-NSZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CI-NSZ-NEXT: v_cvt_f16_f32_e32 v1, v1
-; CI-NSZ-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; CI-NSZ-NEXT: v_cvt_f32_f16_e32 v1, v1
+; CI-NSZ-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; CI-NSZ-NEXT: v_sub_f32_e32 v1, 4.0, v1
; CI-NSZ-NEXT: v_cndmask_b32_e32 v0, 2.0, v1, vcc
+; CI-NSZ-NEXT: v_cvt_f16_f32_e32 v0, v0
; CI-NSZ-NEXT: s_setpc_b64 s[30:31]
;
; VI-NSZ-LABEL: select_fneg_posk_src_sub_f16:
@@ -2702,11 +2665,11 @@ define half @select_fneg_posk_src_mul_f16(i32 %c, half %x) {
; CI-LABEL: select_fneg_posk_src_mul_f16:
; CI: ; %bb.0:
; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CI-NEXT: v_cvt_f16_f32_e32 v1, v1
-; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; CI-NEXT: v_cvt_f32_f16_e32 v1, v1
+; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; CI-NEXT: v_mul_f32_e32 v1, -4.0, v1
; CI-NEXT: v_cndmask_b32_e32 v0, 2.0, v1, vcc
+; CI-NEXT: v_cvt_f16_f32_e32 v0, v0
; CI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: select_fneg_posk_src_mul_f16:
@@ -2764,13 +2727,12 @@ define half @select_fneg_posk_src_fma_f16(i32 %c, half %x, half %z) {
; CI-SAFE-LABEL: select_fneg_posk_src_fma_f16:
; CI-SAFE: ; %bb.0:
; CI-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CI-SAFE-NEXT: v_cvt_f16_f32_e32 v2, v2
-; CI-SAFE-NEXT: v_cvt_f16_f32_e32 v1, v1
-; CI-SAFE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; CI-SAFE-NEXT: v_cvt_f32_f16_e32 v2, v2
; CI-SAFE-NEXT: v_cvt_f32_f16_e32 v1, v1
+; CI-SAFE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; CI-SAFE-NEXT: v_fma_f32 v1, v1, 4.0, v2
; CI-SAFE-NEXT: v_cndmask_b32_e64 v0, 2.0, -v1, vcc
+; CI-SAFE-NEXT: v_cvt_f16_f32_e32 v0, v0
; CI-SAFE-NEXT: s_setpc_b64 s[30:31]
;
; VI-SAFE-LABEL: select_fneg_posk_src_fma_f16:
@@ -2806,13 +2768,12 @@ define half @select_fneg_posk_src_fma_f16(i32 %c, half %x, half %z) {
; CI-NSZ-LABEL: select_fneg_posk_src_fma_f16:
; CI-NSZ: ; %bb.0:
; CI-NSZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CI-NSZ-NEXT: v_cvt_f16_f32_e32 v2, v2
-; CI-NSZ-NEXT: v_cvt_f16_f32_e32 v1, v1
-; CI-NSZ-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; CI-NSZ-NEXT: v_cvt_f32_f16_e32 v2, v2
; CI-NSZ-NEXT: v_cvt_f32_f16_e32 v1, v1
+; CI-NSZ-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; CI-NSZ-NEXT: v_fma_f32 v1, v1, -4.0, -v2
; CI-NSZ-NEXT: v_cndmask_b32_e32 v0, 2.0, v1, vcc
+; CI-NSZ-NEXT: v_cvt_f16_f32_e32 v0, v0
; CI-NSZ-NEXT: s_setpc_b64 s[30:31]
;
; VI-NSZ-LABEL: select_fneg_posk_src_fma_f16:
@@ -2852,14 +2813,13 @@ define half @select_fneg_posk_src_fmad_f16(i32 %c, half %x, half %z) {
; CI-SAFE-LABEL: select_fneg_posk_src_fmad_f16:
; CI-SAFE: ; %bb.0:
; CI-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CI-SAFE-NEXT: v_cvt_f16_f32_e32 v1, v1
-; CI-SAFE-NEXT: v_cvt_f16_f32_e32 v2, v2
-; CI-SAFE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; CI-SAFE-NEXT: v_cvt_f32_f16_e32 v1, v1
; CI-SAFE-NEXT: v_cvt_f32_f16_e32 v2, v2
+; CI-SAFE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; CI-SAFE-NEXT: v_mul_f32_e32 v1, 4.0, v1
; CI-SAFE-NEXT: v_add_f32_e32 v1, v1, v2
; CI-SAFE-NEXT: v_cndmask_b32_e64 v0, 2.0, -v1, vcc
+; CI-SAFE-NEXT: v_cvt_f16_f32_e32 v0, v0
; CI-SAFE-NEXT: s_setpc_b64 s[30:31]
;
; VI-SAFE-LABEL: select_fneg_posk_src_fmad_f16:
@@ -2895,14 +2855,13 @@ define half @select_fneg_posk_src_fmad_f16(i32 %c, half %x, half %z) {
; CI-NSZ-LABEL: select_fneg_posk_src_fmad_f16:
; CI-NSZ: ; %bb.0:
; CI-NSZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CI-NSZ-NEXT: v_cvt_f16_f32_e32 v1, v1
-; CI-NSZ-NEXT: v_cvt_f16_f32_e32 v2, v2
-; CI-NSZ-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; CI-NSZ-NEXT: v_cvt_f32_f16_e32 v1, v1
; CI-NSZ-NEXT: v_cvt_f32_f16_e32 v2, v2
+; CI-NSZ-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; CI-NSZ-NEXT: v_mul_f32_e32 v1, -4.0, v1
; CI-NSZ-NEXT: v_sub_f32_e32 v1, v1, v2
; CI-NSZ-NEXT: v_cndmask_b32_e32 v0, 2.0, v1, vcc
+; CI-NSZ-NEXT: v_cvt_f16_f32_e32 v0, v0
; CI-NSZ-NEXT: s_setpc_b64 s[30:31]
;
; VI-NSZ-LABEL: select_fneg_posk_src_fmad_f16:
diff --git a/llvm/test/CodeGen/AMDGPU/select-flags-to-fmin-fmax.ll b/llvm/test/CodeGen/AMDGPU/select-flags-to-fmin-fmax.ll
index 0de366132e31e..c7422a25f71e7 100644
--- a/llvm/test/CodeGen/AMDGPU/select-flags-to-fmin-fmax.ll
+++ b/llvm/test/CodeGen/AMDGPU/select-flags-to-fmin-fmax.ll
@@ -521,11 +521,10 @@ define half @v_test_fmin_legacy_ule_f16_safe(half %a, half %b) {
; GFX7-LABEL: v_test_fmin_legacy_ule_f16_safe:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1
; GFX7-NEXT: v_min_legacy_f32_e32 v0, v1, v0
+; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_test_fmin_legacy_ule_f16_safe:
@@ -567,11 +566,10 @@ define half @v_test_fmin_legacy_ule_f16_nnan_flag(half %a, half %b) {
; GFX7-LABEL: v_test_fmin_legacy_ule_f16_nnan_flag:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1
; GFX7-NEXT: v_min_legacy_f32_e32 v0, v1, v0
+; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_test_fmin_legacy_ule_f16_nnan_flag:
@@ -613,11 +611,10 @@ define half @v_test_fmin_legacy_ule_f16_nsz_flag(half %a, half %b) {
; GFX7-LABEL: v_test_fmin_legacy_ule_f16_nsz_flag:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1
; GFX7-NEXT: v_min_legacy_f32_e32 v0, v1, v0
+; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_test_fmin_legacy_ule_f16_nsz_flag:
@@ -659,11 +656,10 @@ define half @v_test_fmin_legacy_ule_f16_nnan_nsz_flag(half %a, half %b) {
; GFX7-LABEL: v_test_fmin_legacy_ule_f16_nnan_nsz_flag:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1
-; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1
; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX7-NEXT: v_min_f32_e32 v0, v0, v1
+; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_test_fmin_legacy_ule_f16_nnan_nsz_flag:
@@ -700,11 +696,10 @@ define half @v_test_fmax_legacy_uge_f16_safe(half %a, half %b) {
; GFX7-LABEL: v_test_fmax_legacy_uge_f16_safe:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1
; GFX7-NEXT: v_max_legacy_f32_e32 v0, v1, v0
+; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_test_fmax_legacy_uge_f16_safe:
@@ -746,11 +741,10 @@ define half @v_test_fmax_legacy_uge_f16_nnan_flag(half %a, half %b) {
; GFX7-LABEL: v_test_fmax_legacy_uge_f16_nnan_flag:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1
; GFX7-NEXT: v_max_legacy_f32_e32 v0, v1, v0
+; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_test_fmax_legacy_uge_f16_nnan_flag:
@@ -792,11 +786,10 @@ define half @v_test_fmax_legacy_uge_f16_nsz_flag(half %a, half %b) {
; GFX7-LABEL: v_test_fmax_legacy_uge_f16_nsz_flag:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1
; GFX7-NEXT: v_max_legacy_f32_e32 v0, v1, v0
+; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_test_fmax_legacy_uge_f16_nsz_flag:
@@ -838,11 +831,10 @@ define half @v_test_fmax_legacy_uge_f16_nnan_nsz_flag(half %a, half %b) {
; GFX7-LABEL: v_test_fmax_legacy_uge_f16_nnan_nsz_flag:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1
-; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1
; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX7-NEXT: v_max_f32_e32 v0, v0, v1
+; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX9-LABEL: v_test_fmax_legacy_uge_f16_nnan_nsz_flag:
diff --git a/llvm/test/CodeGen/AMDGPU/strict_fpext.ll b/llvm/test/CodeGen/AMDGPU/strict_fpext.ll
index 5335787a820be..9a52b96bde709 100644
--- a/llvm/test/CodeGen/AMDGPU/strict_fpext.ll
+++ b/llvm/test/CodeGen/AMDGPU/strict_fpext.ll
@@ -10,8 +10,6 @@ define float @v_constrained_fpext_f16_to_f32_fpexcept_strict(half %arg) #0 {
; SI-LABEL: v_constrained_fpext_f16_to_f32_fpexcept_strict:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
-; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0
; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
; SI-NEXT: s_setpc_b64 s[30:31]
;
@@ -199,8 +197,6 @@ define double @v_constrained_fpext_f16_to_f64_fpexcept_strict(half %arg) #0 {
; SI-LABEL: v_constrained_fpext_f16_to_f64_fpexcept_strict:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
-; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0
; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
; SI-NEXT: v_cvt_f64_f32_e32 v[0:1], v0
; SI-NEXT: s_setpc_b64 s[30:31]
@@ -352,10 +348,7 @@ define float @v_constrained_fneg_fpext_f16_to_f32_fpexcept_strict(half %arg) #0
; SI-LABEL: v_constrained_fneg_fpext_f16_to_f32_fpexcept_strict:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
-; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
-; SI-NEXT: v_xor_b32_e32 v0, 0x80000000, v0
+; SI-NEXT: v_cvt_f32_f16_e64 v0, -v0
; SI-NEXT: s_setpc_b64 s[30:31]
;
; GFX89-LABEL: v_constrained_fneg_fpext_f16_to_f32_fpexcept_strict:
@@ -394,10 +387,8 @@ define float @v_constrained_fpext_fneg_f16_to_f32_fpexcept_strict(half %arg) #0
; SI-LABEL: v_constrained_fpext_fneg_f16_to_f32_fpexcept_strict:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
-; SI-NEXT: v_xor_b32_e32 v0, 0x80000000, v0
+; SI-NEXT: v_cvt_f32_f16_e64 v0, -v0
; SI-NEXT: s_setpc_b64 s[30:31]
;
; GFX89-LABEL: v_constrained_fpext_fneg_f16_to_f32_fpexcept_strict:
diff --git a/llvm/test/CodeGen/AMDGPU/strict_fptrunc.ll b/llvm/test/CodeGen/AMDGPU/strict_fptrunc.ll
index dc57c22f16a26..31c64046de11a 100644
--- a/llvm/test/CodeGen/AMDGPU/strict_fptrunc.ll
+++ b/llvm/test/CodeGen/AMDGPU/strict_fptrunc.ll
@@ -13,6 +13,7 @@ define half @v_constrained_fptrunc_f32_to_f16_fpexcept_strict(float %arg) #0 {
; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0
; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
+; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
; SI-NEXT: s_setpc_b64 s[30:31]
;
; GFX89-LABEL: v_constrained_fptrunc_f32_to_f16_fpexcept_strict:
@@ -227,7 +228,7 @@ define half @v_constrained_fneg_fptrunc_f32_to_f16_fpexcept_strict(float %arg) #
; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0
; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
-; SI-NEXT: v_xor_b32_e32 v0, 0x80000000, v0
+; SI-NEXT: v_cvt_f16_f32_e64 v0, -v0
; SI-NEXT: s_setpc_b64 s[30:31]
;
; GFX89-LABEL: v_constrained_fneg_fptrunc_f32_to_f16_fpexcept_strict:
@@ -269,6 +270,7 @@ define half @v_constrained_fptrunc_fneg_f32_to_f16_fpexcept_strict(float %arg) #
; SI-NEXT: v_cvt_f16_f32_e64 v0, -v0
; SI-NEXT: v_and_b32_e32 v0, 0xffff, v0
; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
+; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
; SI-NEXT: s_setpc_b64 s[30:31]
;
; GFX89-LABEL: v_constrained_fptrunc_fneg_f32_to_f16_fpexcept_strict:
diff --git a/llvm/test/CodeGen/AMDGPU/strictfp_f16_abi_promote.ll b/llvm/test/CodeGen/AMDGPU/strictfp_f16_abi_promote.ll
index 8e43f4e788bb0..9fe064c717972 100644
--- a/llvm/test/CodeGen/AMDGPU/strictfp_f16_abi_promote.ll
+++ b/llvm/test/CodeGen/AMDGPU/strictfp_f16_abi_promote.ll
@@ -17,8 +17,6 @@ define void @f16_arg(half %arg, ptr %ptr) #0 {
; GFX7-LABEL: f16_arg:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX7-NEXT: flat_store_dword v[1:2], v0
; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -102,6 +100,7 @@ define half @f16_return(float %arg) #0 {
; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
%fptrunc = call half @llvm.experimental.constrained.fptrunc.f16.f32(float %arg, metadata !"round.tonearest", metadata !"fpexcept.strict")
ret half %fptrunc
@@ -194,8 +193,6 @@ define void @outgoing_f16_arg(ptr %ptr) #0 {
; GFX7-NEXT: s_mov_b32 s16, f16_user at abs32@lo
; GFX7-NEXT: s_addk_i32 s32, 0x400
; GFX7-NEXT: v_writelane_b32 v40, s31, 1
-; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX7-NEXT: s_swappc_b64 s[30:31], s[16:17]
; GFX7-NEXT: v_readlane_b32 s31, v40, 1
; GFX7-NEXT: v_readlane_b32 s30, v40, 0
@@ -264,17 +261,13 @@ define void @outgoing_f16_return(ptr %ptr) #0 {
; GFX7-NEXT: v_mov_b32_e32 v41, v1
; GFX7-NEXT: v_mov_b32_e32 v40, v0
; GFX7-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX7-NEXT: flat_store_short v[40:41], v0
+; GFX7-NEXT: buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload
+; GFX7-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
; GFX7-NEXT: v_readlane_b32 s31, v42, 1
; GFX7-NEXT: v_readlane_b32 s30, v42, 0
; GFX7-NEXT: s_mov_b32 s32, s33
-; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0
-; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX7-NEXT: v_readlane_b32 s4, v42, 2
-; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX7-NEXT: flat_store_short v[40:41], v0
-; GFX7-NEXT: buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload
-; GFX7-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
; GFX7-NEXT: s_or_saveexec_b64 s[6:7], -1
; GFX7-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
; GFX7-NEXT: s_mov_b64 exec, s[6:7]
@@ -436,6 +429,7 @@ define half @call_split_type_used_outside_block_v8f16() #0 {
; GFX7-NEXT: v_readlane_b32 s4, v40, 2
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX7-NEXT: s_or_saveexec_b64 s[6:7], -1
; GFX7-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload
; GFX7-NEXT: s_mov_b64 exec, s[6:7]
diff --git a/llvm/test/CodeGen/AMDGPU/vector-reduce-fadd.ll b/llvm/test/CodeGen/AMDGPU/vector-reduce-fadd.ll
index 98e7df04be444..deb140fa7e941 100644
--- a/llvm/test/CodeGen/AMDGPU/vector-reduce-fadd.ll
+++ b/llvm/test/CodeGen/AMDGPU/vector-reduce-fadd.ll
@@ -20,13 +20,13 @@ define half @test_vector_reduce_fadd_v2half(half %sp, <2 x half> %v) {
; GFX7-SDAG-LABEL: test_vector_reduce_fadd_v2half:
; GFX7-SDAG: ; %bb.0: ; %entry
; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v2, 16, v1
; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1
-; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v2, v2
; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v2, v2
; GFX7-SDAG-NEXT: v_add_f32_e32 v0, v0, v1
; GFX7-SDAG-NEXT: v_add_f32_e32 v0, v0, v2
+; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-GISEL-LABEL: test_vector_reduce_fadd_v2half:
@@ -158,15 +158,15 @@ define half @test_vector_reduce_fadd_v3half(half %sp, <3 x half> %v) {
; GFX7-SDAG-LABEL: test_vector_reduce_fadd_v3half:
; GFX7-SDAG: ; %bb.0: ; %entry
; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v3, 16, v1
; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1
-; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v3, v3
; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v3, v3
; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v2, v2
; GFX7-SDAG-NEXT: v_add_f32_e32 v0, v0, v1
; GFX7-SDAG-NEXT: v_add_f32_e32 v0, v0, v3
; GFX7-SDAG-NEXT: v_add_f32_e32 v0, v0, v2
+; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-GISEL-LABEL: test_vector_reduce_fadd_v3half:
@@ -311,11 +311,10 @@ define half @test_vector_reduce_fadd_v4half(half %sp, <4 x half> %v) {
; GFX7-SDAG-LABEL: test_vector_reduce_fadd_v4half:
; GFX7-SDAG: ; %bb.0: ; %entry
; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v4, 16, v1
; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1
-; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v4, v4
; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v4, v4
; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v3, 16, v2
; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v2, v2
; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v3, v3
@@ -323,6 +322,7 @@ define half @test_vector_reduce_fadd_v4half(half %sp, <4 x half> %v) {
; GFX7-SDAG-NEXT: v_add_f32_e32 v0, v0, v4
; GFX7-SDAG-NEXT: v_add_f32_e32 v0, v0, v2
; GFX7-SDAG-NEXT: v_add_f32_e32 v0, v0, v3
+; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-GISEL-LABEL: test_vector_reduce_fadd_v4half:
@@ -499,11 +499,10 @@ define half @test_vector_reduce_fadd_v8half(half %sp, <8 x half> %v) {
; GFX7-SDAG-LABEL: test_vector_reduce_fadd_v8half:
; GFX7-SDAG: ; %bb.0: ; %entry
; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v8, 16, v1
; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1
-; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v8, v8
; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v8, v8
; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v7, 16, v2
; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v2, v2
; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v7, v7
@@ -521,6 +520,7 @@ define half @test_vector_reduce_fadd_v8half(half %sp, <8 x half> %v) {
; GFX7-SDAG-NEXT: v_add_f32_e32 v0, v0, v6
; GFX7-SDAG-NEXT: v_add_f32_e32 v0, v0, v4
; GFX7-SDAG-NEXT: v_add_f32_e32 v0, v0, v5
+; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-GISEL-LABEL: test_vector_reduce_fadd_v8half:
@@ -787,11 +787,10 @@ define half @test_vector_reduce_fadd_v16half(half %sp, <16 x half> %v) {
; GFX7-SDAG-LABEL: test_vector_reduce_fadd_v16half:
; GFX7-SDAG: ; %bb.0: ; %entry
; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v16, 16, v1
; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1
-; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v16, v16
; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v16, v16
; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v15, 16, v2
; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v2, v2
; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v15, v15
@@ -829,6 +828,7 @@ define half @test_vector_reduce_fadd_v16half(half %sp, <16 x half> %v) {
; GFX7-SDAG-NEXT: v_add_f32_e32 v0, v0, v10
; GFX7-SDAG-NEXT: v_add_f32_e32 v0, v0, v8
; GFX7-SDAG-NEXT: v_add_f32_e32 v0, v0, v9
+; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-GISEL-LABEL: test_vector_reduce_fadd_v16half:
diff --git a/llvm/test/CodeGen/AMDGPU/vector-reduce-fmax.ll b/llvm/test/CodeGen/AMDGPU/vector-reduce-fmax.ll
index b9dcb1b7295c2..4c212daab39ee 100644
--- a/llvm/test/CodeGen/AMDGPU/vector-reduce-fmax.ll
+++ b/llvm/test/CodeGen/AMDGPU/vector-reduce-fmax.ll
@@ -24,6 +24,7 @@ define half @test_vector_reduce_fmax_v2half(<2 x half> %v) {
; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1
; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX7-SDAG-NEXT: v_max_f32_e32 v0, v0, v1
+; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-GISEL-LABEL: test_vector_reduce_fmax_v2half:
@@ -189,6 +190,7 @@ define half @test_vector_reduce_fmax_v3half(<3 x half> %v) {
; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1
; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX7-SDAG-NEXT: v_max3_f32 v0, v0, v2, v1
+; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-GISEL-LABEL: test_vector_reduce_fmax_v3half:
@@ -396,6 +398,7 @@ define half @test_vector_reduce_fmax_v4half(<4 x half> %v) {
; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1
; GFX7-SDAG-NEXT: v_max_f32_e32 v0, v0, v3
; GFX7-SDAG-NEXT: v_max3_f32 v0, v0, v1, v2
+; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-GISEL-LABEL: test_vector_reduce_fmax_v4half:
@@ -631,6 +634,7 @@ define half @test_vector_reduce_fmax_v8half(<8 x half> %v) {
; GFX7-SDAG-NEXT: v_max3_f32 v0, v0, v1, v6
; GFX7-SDAG-NEXT: v_max3_f32 v0, v0, v2, v5
; GFX7-SDAG-NEXT: v_max3_f32 v0, v0, v3, v4
+; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-GISEL-LABEL: test_vector_reduce_fmax_v8half:
@@ -994,6 +998,7 @@ define half @test_vector_reduce_fmax_v16half(<16 x half> %v) {
; GFX7-SDAG-NEXT: v_max3_f32 v0, v0, v5, v10
; GFX7-SDAG-NEXT: v_max3_f32 v0, v0, v6, v9
; GFX7-SDAG-NEXT: v_max3_f32 v0, v0, v7, v8
+; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-GISEL-LABEL: test_vector_reduce_fmax_v16half:
diff --git a/llvm/test/CodeGen/AMDGPU/vector-reduce-fmaximum.ll b/llvm/test/CodeGen/AMDGPU/vector-reduce-fmaximum.ll
index 58da94d7c4683..d198bb45654da 100644
--- a/llvm/test/CodeGen/AMDGPU/vector-reduce-fmaximum.ll
+++ b/llvm/test/CodeGen/AMDGPU/vector-reduce-fmaximum.ll
@@ -19,6 +19,7 @@ define half @test_vector_reduce_fmaximum_v2half(<2 x half> %v) {
; GFX7-NEXT: v_max_f32_e32 v3, v0, v1
; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v0, v1
; GFX7-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc
+; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: test_vector_reduce_fmaximum_v2half:
@@ -109,6 +110,7 @@ define half @test_vector_reduce_fmaximum_v3half(<3 x half> %v) {
; GFX7-NEXT: v_max_f32_e32 v2, v0, v1
; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v0, v1
; GFX7-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc
+; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: test_vector_reduce_fmaximum_v3half:
@@ -230,6 +232,7 @@ define half @test_vector_reduce_fmaximum_v4half(<4 x half> %v) {
; GFX7-NEXT: v_max_f32_e32 v1, v0, v3
; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v0, v3
; GFX7-NEXT: v_cndmask_b32_e32 v0, v5, v1, vcc
+; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: test_vector_reduce_fmaximum_v4half:
@@ -384,6 +387,7 @@ define half @test_vector_reduce_fmaximum_v8half(<8 x half> %v) {
; GFX7-NEXT: v_max_f32_e32 v1, v0, v4
; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v0, v4
; GFX7-NEXT: v_cndmask_b32_e32 v0, v9, v1, vcc
+; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: test_vector_reduce_fmaximum_v8half:
@@ -646,6 +650,7 @@ define half @test_vector_reduce_fmaximum_v16half(<16 x half> %v) {
; GFX7-NEXT: v_max_f32_e32 v1, v0, v8
; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v0, v8
; GFX7-NEXT: v_cndmask_b32_e32 v0, v17, v1, vcc
+; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: test_vector_reduce_fmaximum_v16half:
diff --git a/llvm/test/CodeGen/AMDGPU/vector-reduce-fmin.ll b/llvm/test/CodeGen/AMDGPU/vector-reduce-fmin.ll
index 07524d6917740..479dc08a4f7aa 100644
--- a/llvm/test/CodeGen/AMDGPU/vector-reduce-fmin.ll
+++ b/llvm/test/CodeGen/AMDGPU/vector-reduce-fmin.ll
@@ -24,6 +24,7 @@ define half @test_vector_reduce_fmin_v2half(<2 x half> %v) {
; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1
; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX7-SDAG-NEXT: v_min_f32_e32 v0, v0, v1
+; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-GISEL-LABEL: test_vector_reduce_fmin_v2half:
@@ -189,6 +190,7 @@ define half @test_vector_reduce_fmin_v3half(<3 x half> %v) {
; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1
; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0
; GFX7-SDAG-NEXT: v_min3_f32 v0, v0, v2, v1
+; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-GISEL-LABEL: test_vector_reduce_fmin_v3half:
@@ -396,6 +398,7 @@ define half @test_vector_reduce_fmin_v4half(<4 x half> %v) {
; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1
; GFX7-SDAG-NEXT: v_min_f32_e32 v0, v0, v3
; GFX7-SDAG-NEXT: v_min3_f32 v0, v0, v1, v2
+; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-GISEL-LABEL: test_vector_reduce_fmin_v4half:
@@ -631,6 +634,7 @@ define half @test_vector_reduce_fmin_v8half(<8 x half> %v) {
; GFX7-SDAG-NEXT: v_min3_f32 v0, v0, v1, v6
; GFX7-SDAG-NEXT: v_min3_f32 v0, v0, v2, v5
; GFX7-SDAG-NEXT: v_min3_f32 v0, v0, v3, v4
+; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-GISEL-LABEL: test_vector_reduce_fmin_v8half:
@@ -994,6 +998,7 @@ define half @test_vector_reduce_fmin_v16half(<16 x half> %v) {
; GFX7-SDAG-NEXT: v_min3_f32 v0, v0, v5, v10
; GFX7-SDAG-NEXT: v_min3_f32 v0, v0, v6, v9
; GFX7-SDAG-NEXT: v_min3_f32 v0, v0, v7, v8
+; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-GISEL-LABEL: test_vector_reduce_fmin_v16half:
diff --git a/llvm/test/CodeGen/AMDGPU/vector-reduce-fminimum.ll b/llvm/test/CodeGen/AMDGPU/vector-reduce-fminimum.ll
index 16732a429e4b0..506d847c1144b 100644
--- a/llvm/test/CodeGen/AMDGPU/vector-reduce-fminimum.ll
+++ b/llvm/test/CodeGen/AMDGPU/vector-reduce-fminimum.ll
@@ -21,6 +21,7 @@ define half @test_vector_reduce_fminimum_v2half(<2 x half> %v) {
; GFX7-NEXT: v_min_f32_e32 v3, v0, v1
; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v0, v1
; GFX7-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc
+; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: test_vector_reduce_fminimum_v2half:
@@ -133,6 +134,7 @@ define half @test_vector_reduce_fminimum_v3half(<3 x half> %v) {
; GFX7-NEXT: v_min_f32_e32 v2, v0, v1
; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v0, v1
; GFX7-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc
+; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: test_vector_reduce_fminimum_v3half:
@@ -279,6 +281,7 @@ define half @test_vector_reduce_fminimum_v4half(<4 x half> %v) {
; GFX7-NEXT: v_min_f32_e32 v1, v0, v3
; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v0, v3
; GFX7-NEXT: v_cndmask_b32_e32 v0, v5, v1, vcc
+; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: test_vector_reduce_fminimum_v4half:
@@ -462,6 +465,7 @@ define half @test_vector_reduce_fminimum_v8half(<8 x half> %v) {
; GFX7-NEXT: v_min_f32_e32 v1, v0, v4
; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v0, v4
; GFX7-NEXT: v_cndmask_b32_e32 v0, v9, v1, vcc
+; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: test_vector_reduce_fminimum_v8half:
@@ -766,6 +770,7 @@ define half @test_vector_reduce_fminimum_v16half(<16 x half> %v) {
; GFX7-NEXT: v_min_f32_e32 v1, v0, v8
; GFX7-NEXT: v_cmp_o_f32_e32 vcc, v0, v8
; GFX7-NEXT: v_cndmask_b32_e32 v0, v17, v1, vcc
+; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: test_vector_reduce_fminimum_v16half:
diff --git a/llvm/test/CodeGen/AMDGPU/vector-reduce-fmul.ll b/llvm/test/CodeGen/AMDGPU/vector-reduce-fmul.ll
index 45fc82abb507e..7ea92e7b3582c 100644
--- a/llvm/test/CodeGen/AMDGPU/vector-reduce-fmul.ll
+++ b/llvm/test/CodeGen/AMDGPU/vector-reduce-fmul.ll
@@ -20,13 +20,13 @@ define half @test_vector_reduce_fmul_v2half(half %sp, <2 x half> %v) {
; GFX7-SDAG-LABEL: test_vector_reduce_fmul_v2half:
; GFX7-SDAG: ; %bb.0: ; %entry
; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v2, 16, v1
; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1
-; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v2, v2
; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v2, v2
; GFX7-SDAG-NEXT: v_mul_f32_e32 v0, v0, v1
; GFX7-SDAG-NEXT: v_mul_f32_e32 v0, v0, v2
+; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-GISEL-LABEL: test_vector_reduce_fmul_v2half:
@@ -158,15 +158,15 @@ define half @test_vector_reduce_fmul_v3half(half %sp, <3 x half> %v) {
; GFX7-SDAG-LABEL: test_vector_reduce_fmul_v3half:
; GFX7-SDAG: ; %bb.0: ; %entry
; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v3, 16, v1
; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1
-; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v3, v3
; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v3, v3
; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v2, v2
; GFX7-SDAG-NEXT: v_mul_f32_e32 v0, v0, v1
; GFX7-SDAG-NEXT: v_mul_f32_e32 v0, v0, v3
; GFX7-SDAG-NEXT: v_mul_f32_e32 v0, v0, v2
+; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-GISEL-LABEL: test_vector_reduce_fmul_v3half:
@@ -311,11 +311,10 @@ define half @test_vector_reduce_fmul_v4half(half %sp, <4 x half> %v) {
; GFX7-SDAG-LABEL: test_vector_reduce_fmul_v4half:
; GFX7-SDAG: ; %bb.0: ; %entry
; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v4, 16, v1
; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1
-; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v4, v4
; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v4, v4
; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v3, 16, v2
; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v2, v2
; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v3, v3
@@ -323,6 +322,7 @@ define half @test_vector_reduce_fmul_v4half(half %sp, <4 x half> %v) {
; GFX7-SDAG-NEXT: v_mul_f32_e32 v0, v0, v4
; GFX7-SDAG-NEXT: v_mul_f32_e32 v0, v0, v2
; GFX7-SDAG-NEXT: v_mul_f32_e32 v0, v0, v3
+; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-GISEL-LABEL: test_vector_reduce_fmul_v4half:
@@ -499,11 +499,10 @@ define half @test_vector_reduce_fmul_v8half(half %sp, <8 x half> %v) {
; GFX7-SDAG-LABEL: test_vector_reduce_fmul_v8half:
; GFX7-SDAG: ; %bb.0: ; %entry
; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v8, 16, v1
; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1
-; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v8, v8
; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v8, v8
; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v7, 16, v2
; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v2, v2
; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v7, v7
@@ -521,6 +520,7 @@ define half @test_vector_reduce_fmul_v8half(half %sp, <8 x half> %v) {
; GFX7-SDAG-NEXT: v_mul_f32_e32 v0, v0, v6
; GFX7-SDAG-NEXT: v_mul_f32_e32 v0, v0, v4
; GFX7-SDAG-NEXT: v_mul_f32_e32 v0, v0, v5
+; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-GISEL-LABEL: test_vector_reduce_fmul_v8half:
@@ -787,11 +787,10 @@ define half @test_vector_reduce_fmul_v16half(half %sp, <16 x half> %v) {
; GFX7-SDAG-LABEL: test_vector_reduce_fmul_v16half:
; GFX7-SDAG: ; %bb.0: ; %entry
; GFX7-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v16, 16, v1
; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v1, v1
-; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v16, v16
; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v16, v16
; GFX7-SDAG-NEXT: v_lshrrev_b32_e32 v15, 16, v2
; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v2, v2
; GFX7-SDAG-NEXT: v_cvt_f32_f16_e32 v15, v15
@@ -829,6 +828,7 @@ define half @test_vector_reduce_fmul_v16half(half %sp, <16 x half> %v) {
; GFX7-SDAG-NEXT: v_mul_f32_e32 v0, v0, v10
; GFX7-SDAG-NEXT: v_mul_f32_e32 v0, v0, v8
; GFX7-SDAG-NEXT: v_mul_f32_e32 v0, v0, v9
+; GFX7-SDAG-NEXT: v_cvt_f16_f32_e32 v0, v0
; GFX7-SDAG-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-GISEL-LABEL: test_vector_reduce_fmul_v16half:
More information about the llvm-branch-commits
mailing list