[llvm-branch-commits] [llvm] AMDGPU: Add pattern for copysign of 0 (PR #172699)

Matt Arsenault via llvm-branch-commits llvm-branch-commits at lists.llvm.org
Wed Dec 17 09:31:43 PST 2025


https://github.com/arsenm created https://github.com/llvm/llvm-project/pull/172699

Avoiding v_bfi_b32 is desirable since on gfx9 it
requires materializing the constant.

Similar could be done for infinity, with or 0x7fffffff

>From 65eefb7de6e1a3f29805fbc0b86039ffb3b9360c Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault at amd.com>
Date: Wed, 17 Dec 2025 11:54:03 +0100
Subject: [PATCH] AMDGPU: Add pattern for copysign of 0

Avoiding v_bfi_b32 is desirable since on gfx9 it
requires materializing the constant.

Similar could be done for infinity, with or 0x7fffffff
---
 llvm/lib/Target/AMDGPU/SIInstrInfo.td      |   8 +
 llvm/lib/Target/AMDGPU/SIInstructions.td   |  88 ++
 llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll | 119 +--
 llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll  | 136 ++-
 llvm/test/CodeGen/AMDGPU/fcopysign.f32.ll  |  21 +-
 llvm/test/CodeGen/AMDGPU/frem.ll           | 913 ++++++++++-----------
 llvm/test/CodeGen/AMDGPU/wave32.ll         |   8 +-
 7 files changed, 618 insertions(+), 675 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
index 628b972f97086..2ec3ec8674488 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
@@ -956,6 +956,9 @@ def InlineImmFP64 : FPImmLeaf<f64, [{
   return isInlineImmediate(Imm);
 }]>;
 
+def fpimm_pos_zero : FPImmLeaf<fAny, [{
+  return Imm.isZero() && !Imm.isNegative();
+}]>;
 
 class VGPRImm <dag frag> : PatLeaf<frag, [{
   return isVGPRImm(N);
@@ -984,6 +987,11 @@ def fp16_zeros_high_16bits : PatLeaf<(f16 VGPR_32:$src), [{
   return fp16SrcZerosHighBits(N->getOpcode());
 }]>;
 
+
+class build_vector_fpimm_pos_zero_v2<VTVec vec> : PatLeaf<
+  (vec (build_vector (vec.ElementType fpimm_pos_zero),
+                     (vec.ElementType fpimm_pos_zero)))>;
+
 def MFMALdScaleXForm : SDNodeXForm<timm, [{
   unsigned Val = N->getZExtValue();
   unsigned New = 0;
diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
index ac6f35a997264..22564b8347110 100644
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -2275,12 +2275,34 @@ def : GCNPat <
   (V_BFI_B32_e64 (S_MOV_B32 (i32 0x00007fff)), $src0, $src1)
 >;
 
+def : GCNPat <
+  (UniformBinFrag<fcopysign> (fp16vt fpimm_pos_zero), fp16vt:$src1),
+  (S_AND_B32 (S_MOV_B32 (i32 0x00008000)), SReg_32:$src1)
+>;
+
+def : GCNPat <
+  (fcopysign (fp16vt fpimm_pos_zero), fp16vt:$src1),
+  (V_AND_B32_e32 (S_MOV_B32 (i32 0x00008000)), VGPR_32:$src1)
+>;
+
 def : GCNPat <
   (fcopysign f32:$src0, fp16vt:$src1),
   (V_BFI_B32_e64 (S_MOV_B32 (i32 0x7fffffff)), $src0,
              (V_LSHLREV_B32_e64 (i32 16), $src1))
 >;
 
+def : GCNPat <
+  (UniformBinFrag<fcopysign> (f32 fpimm_pos_zero), fp16vt:$src1),
+  (S_AND_B32 (S_MOV_B32 (i32 0x80000000)),
+             (S_LSHL_B32 SReg_32:$src1, (i32 16)))
+>;
+
+def : GCNPat <
+  (fcopysign (f32 fpimm_pos_zero), fp16vt:$src1),
+  (V_AND_B32_e32 (S_MOV_B32 (i32 0x80000000)),
+                 (V_LSHLREV_B32_e64 (i32 16), VGPR_32:$src1))
+>;
+
 def : GCNPat <
   (fcopysign f64:$src0, fp16vt:$src1),
   (REG_SEQUENCE SReg_64,
@@ -2295,6 +2317,18 @@ def : GCNPat <
              (V_LSHRREV_B32_e64 (i32 16), $src1))
 >;
 
+def : GCNPat <
+  (UniformBinFrag<fcopysign> (fp16vt fpimm_pos_zero), f32:$src1),
+  (S_AND_B32 (S_MOV_B32 (i32 0x00008000)),
+             (S_LSHR_B32 SReg_32:$src1, (i32 16)))
+>;
+
+def : GCNPat <
+  (fcopysign (fp16vt fpimm_pos_zero), f32:$src1),
+  (V_AND_B32_e32 (S_MOV_B32 (i32 0x00008000)),
+                 (V_LSHRREV_B32_e64 (i32 16), VGPR_32:$src1))
+>;
+
 def : GCNPat <
   (fcopysign fp16vt:$src0, f64:$src1),
   (V_BFI_B32_e64 (S_MOV_B32 (i32 0x00007fff)), $src0,
@@ -2309,12 +2343,27 @@ def : GCNPat <
     (REG_SEQUENCE VGPR_32, $src1, lo16, (i16 (IMPLICIT_DEF)), hi16)), lo16)
 >;
 
+// TODO: Scalar case for 0 magnitude special case
+def : GCNPat <
+  (fcopysign (fp16vt fpimm_pos_zero), fp16vt:$src1),
+  (V_AND_B16_t16_e64 0, (S_MOV_B32 (i32 0x00008000)), 0, VGPR_16:$src1)
+>;
+
+// TODO: Handle 0 magnitude special case
 def : GCNPat <
   (fcopysign f32:$src0, fp16vt:$src1),
   (V_BFI_B32_e64 (S_MOV_B32 (i32 0x7fffffff)), $src0,
              (REG_SEQUENCE VGPR_32, (i16 (IMPLICIT_DEF)), lo16, $src1, hi16))
 >;
 
+// TODO: Scalar case for 0 magnitude special case
+def : GCNPat <
+  (fcopysign (f32 fpimm_pos_zero), fp16vt:$src1),
+  (REG_SEQUENCE VGPR_32,
+    (V_MOV_B16_t16_e64 0, (i16 0), 0), lo16,
+    (V_AND_B16_t16_e64 0, (S_MOV_B32 (i32 0x00008000)), 0, VGPR_16:$src1), hi16)
+>;
+
 def : GCNPat <
   (fcopysign f64:$src0, fp16vt:$src1),
   (REG_SEQUENCE VReg_64,
@@ -2330,6 +2379,13 @@ def : GCNPat <
              (V_LSHRREV_B32_e64 (i32 16), $src1)), lo16)
 >;
 
+// TODO: Scalar case for 0 magnitude special case
+def : GCNPat <
+  (fcopysign (fp16vt fpimm_pos_zero), f32:$src1),
+  (V_AND_B16_t16_e64 0, (S_MOV_B32 (i32 0x00008000)),
+                     0, (EXTRACT_SUBREG VGPR_32:$src1, hi16))
+>;
+
 def : GCNPat <
   (fcopysign fp16vt:$src0, f64:$src1),
   (V_BFI_B32_e64 (S_MOV_B32 (i32 0x00007fff)),
@@ -2347,6 +2403,16 @@ def : GCNPat <
   (V_BFI_B32_e64 (S_MOV_B32 (i32 0x7fff7fff)), $src0, $src1)
 >;
 
+def : GCNPat <
+  (UniformBinFrag<fcopysign> build_vector_fpimm_pos_zero_v2<fp16vt>, fp16vt:$src1),
+  (S_AND_B32 (S_MOV_B32 (i32 0x80008000)), SReg_32:$src1)
+>;
+
+def : GCNPat <
+  (fcopysign build_vector_fpimm_pos_zero_v2<fp16vt>, fp16vt:$src1),
+  (V_AND_B32_e32 (S_MOV_B32 (i32 0x80008000)), VGPR_32:$src1)
+>;
+
 }
 
 /********** ================== **********/
@@ -2675,12 +2741,34 @@ def : AMDGPUPat <
   (V_BFI_B32_e64 (S_MOV_B32 (i32 0x7fffffff)), $src0, $src1)
 >;
 
+def : AMDGPUPat <
+  (UniformBinFrag<fcopysign> (f32 fpimm_pos_zero), (f32 SReg_32:$src1)),
+  (S_AND_B32 (S_MOV_B32 (i32 0x80000000)), $src1)
+>;
+
+def : AMDGPUPat <
+  (fcopysign (f32 fpimm_pos_zero), (f32 VGPR_32:$src1)),
+  (V_AND_B32_e32 (S_MOV_B32 (i32 0x80000000)), $src1)
+>;
+
 def : AMDGPUPat <
   (fcopysign f32:$src0, f64:$src1),
   (V_BFI_B32_e64 (S_MOV_B32 (i32 0x7fffffff)), $src0,
              (i32 (EXTRACT_SUBREG SReg_64:$src1, sub1)))
 >;
 
+def : AMDGPUPat <
+  (UniformBinFrag<fcopysign> (f32 fpimm_pos_zero), SReg_64:$src1),
+  (S_AND_B32 (i32 (S_MOV_B32 (i32 0x80000000))),
+             (i32 (EXTRACT_SUBREG SReg_64:$src1, sub1)))
+>;
+
+def : AMDGPUPat <
+  (fcopysign (f32 fpimm_pos_zero), VReg_64:$src1),
+  (V_AND_B32_e32 (i32 (S_MOV_B32 (i32 0x80000000))),
+                 (i32 (EXTRACT_SUBREG SReg_64:$src1, sub1)))
+>;
+
 def : AMDGPUPat <
   (fcopysign f64:$src0, f64:$src1),
   (REG_SEQUENCE SReg_64,
diff --git a/llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll b/llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll
index 1fb87bf2c80a0..cdec7545ac411 100644
--- a/llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fcopysign.bf16.ll
@@ -8109,35 +8109,31 @@ define bfloat @v_copysign_bf16_0_bf16(bfloat %sign) {
 ; GFX8-LABEL: v_copysign_bf16_0_bf16:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    s_movk_i32 s4, 0x7fff
-; GFX8-NEXT:    v_bfi_b32 v0, s4, 0, v0
+; GFX8-NEXT:    v_and_b32_e32 v0, 0x8000, v0
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_copysign_bf16_0_bf16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    s_movk_i32 s4, 0x7fff
-; GFX9-NEXT:    v_bfi_b32 v0, s4, 0, v0
+; GFX9-NEXT:    v_and_b32_e32 v0, 0x8000, v0
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_copysign_bf16_0_bf16:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_bfi_b32 v0, 0x7fff, 0, v0
+; GFX10-NEXT:    v_and_b32_e32 v0, 0x8000, v0
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11TRUE16-LABEL: v_copysign_bf16_0_bf16:
 ; GFX11TRUE16:       ; %bb.0:
 ; GFX11TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11TRUE16-NEXT:    v_mov_b16_e32 v1.l, 0
-; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11TRUE16-NEXT:    v_bfi_b32 v0, 0x7fff, v1, v0
+; GFX11TRUE16-NEXT:    v_and_b16 v0.l, 0x8000, v0.l
 ; GFX11TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11FAKE16-LABEL: v_copysign_bf16_0_bf16:
 ; GFX11FAKE16:       ; %bb.0:
 ; GFX11FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11FAKE16-NEXT:    v_bfi_b32 v0, 0x7fff, 0, v0
+; GFX11FAKE16-NEXT:    v_and_b32_e32 v0, 0x8000, v0
 ; GFX11FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %op = call bfloat @llvm.copysign.bf16(bfloat 0.0, bfloat %sign)
   ret bfloat %op
@@ -8232,33 +8228,28 @@ define bfloat @v_copysign_bf16_0_f32(float %sign) {
 ; GFX8-LABEL: v_copysign_bf16_0_f32:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
-; GFX8-NEXT:    s_movk_i32 s4, 0x7fff
-; GFX8-NEXT:    v_bfi_b32 v0, s4, 0, v0
+; GFX8-NEXT:    v_mov_b32_e32 v1, 0x8000
+; GFX8-NEXT:    v_and_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_copysign_bf16_0_f32:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
-; GFX9-NEXT:    s_movk_i32 s4, 0x7fff
-; GFX9-NEXT:    v_bfi_b32 v0, s4, 0, v0
+; GFX9-NEXT:    v_mov_b32_e32 v1, 0x8000
+; GFX9-NEXT:    v_and_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_copysign_bf16_0_f32:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
-; GFX10-NEXT:    v_bfi_b32 v0, 0x7fff, 0, v0
+; GFX10-NEXT:    v_mov_b32_e32 v1, 0x8000
+; GFX10-NEXT:    v_and_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11TRUE16-LABEL: v_copysign_bf16_0_f32:
 ; GFX11TRUE16:       ; %bb.0:
 ; GFX11TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11TRUE16-NEXT:    v_mov_b16_e32 v1.l, 0
-; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
-; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11TRUE16-NEXT:    v_bfi_b32 v0, 0x7fff, v1, v0
+; GFX11TRUE16-NEXT:    v_and_b16 v0.l, 0x8000, v0.h
 ; GFX11TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11FAKE16-LABEL: v_copysign_bf16_0_f32:
@@ -8266,7 +8257,7 @@ define bfloat @v_copysign_bf16_0_f32(float %sign) {
 ; GFX11FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11FAKE16-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; GFX11FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11FAKE16-NEXT:    v_bfi_b32 v0, 0x7fff, 0, v0
+; GFX11FAKE16-NEXT:    v_and_b32_e32 v0, 0x8000, v0
 ; GFX11FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %sign.trunc = fptrunc float %sign to bfloat
   %op = call bfloat @llvm.copysign.bf16(bfloat 0.0, bfloat %sign.trunc)
@@ -8417,33 +8408,28 @@ define bfloat @v_copysign_bf16_0_f64(double %sign) {
 ; GFX8-LABEL: v_copysign_bf16_0_f64:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 16, v1
-; GFX8-NEXT:    s_movk_i32 s4, 0x7fff
-; GFX8-NEXT:    v_bfi_b32 v0, s4, 0, v0
+; GFX8-NEXT:    v_mov_b32_e32 v0, 0x8000
+; GFX8-NEXT:    v_and_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_copysign_bf16_0_f64:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 16, v1
-; GFX9-NEXT:    s_movk_i32 s4, 0x7fff
-; GFX9-NEXT:    v_bfi_b32 v0, s4, 0, v0
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0x8000
+; GFX9-NEXT:    v_and_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_copysign_bf16_0_f64:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_lshrrev_b32_e32 v0, 16, v1
-; GFX10-NEXT:    v_bfi_b32 v0, 0x7fff, 0, v0
+; GFX10-NEXT:    v_mov_b32_e32 v0, 0x8000
+; GFX10-NEXT:    v_and_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11TRUE16-LABEL: v_copysign_bf16_0_f64:
 ; GFX11TRUE16:       ; %bb.0:
 ; GFX11TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11TRUE16-NEXT:    v_mov_b16_e32 v0.l, 0
-; GFX11TRUE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
-; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11TRUE16-NEXT:    v_bfi_b32 v0, 0x7fff, v0, v1
+; GFX11TRUE16-NEXT:    v_and_b16 v0.l, 0x8000, v1.h
 ; GFX11TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11FAKE16-LABEL: v_copysign_bf16_0_f64:
@@ -8451,7 +8437,7 @@ define bfloat @v_copysign_bf16_0_f64(double %sign) {
 ; GFX11FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11FAKE16-NEXT:    v_lshrrev_b32_e32 v0, 16, v1
 ; GFX11FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11FAKE16-NEXT:    v_bfi_b32 v0, 0x7fff, 0, v0
+; GFX11FAKE16-NEXT:    v_and_b32_e32 v0, 0x8000, v0
 ; GFX11FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %sign.trunc = fptrunc double %sign to bfloat
   %op = call bfloat @llvm.copysign.bf16(bfloat 0.0, bfloat %sign.trunc)
@@ -8489,23 +8475,17 @@ define amdgpu_ps i32 @s_copysign_v2bf16_0_v2bf16(<2 x bfloat> inreg %sign) {
 ;
 ; GFX9-LABEL: s_copysign_v2bf16_0_v2bf16:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_mov_b32 s1, 0x7fff7fff
-; GFX9-NEXT:    v_mov_b32_e32 v0, s0
-; GFX9-NEXT:    v_bfi_b32 v0, s1, 0, v0
-; GFX9-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX9-NEXT:    s_and_b32 s0, 0x80008000, s0
 ; GFX9-NEXT:    ; return to shader part epilog
 ;
 ; GFX10-LABEL: s_copysign_v2bf16_0_v2bf16:
 ; GFX10:       ; %bb.0:
-; GFX10-NEXT:    v_bfi_b32 v0, 0x7fff7fff, 0, s0
-; GFX10-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX10-NEXT:    s_and_b32 s0, 0x80008000, s0
 ; GFX10-NEXT:    ; return to shader part epilog
 ;
 ; GFX11-LABEL: s_copysign_v2bf16_0_v2bf16:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    v_bfi_b32 v0, 0x7fff7fff, 0, s0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX11-NEXT:    s_and_b32 s0, 0x80008000, s0
 ; GFX11-NEXT:    ; return to shader part epilog
   %op = call <2 x bfloat> @llvm.copysign.v2bf16(<2 x bfloat> zeroinitializer, <2 x bfloat> %sign)
   %cast = bitcast <2 x bfloat> %op to i32
@@ -8541,20 +8521,19 @@ define <2 x bfloat> @v_copysign_v2bf16_0_v2bf16(<2 x bfloat> %sign) {
 ; GFX9-LABEL: v_copysign_v2bf16_0_v2bf16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    s_mov_b32 s4, 0x7fff7fff
-; GFX9-NEXT:    v_bfi_b32 v0, s4, 0, v0
+; GFX9-NEXT:    v_and_b32_e32 v0, 0x80008000, v0
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_copysign_v2bf16_0_v2bf16:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_bfi_b32 v0, 0x7fff7fff, 0, v0
+; GFX10-NEXT:    v_and_b32_e32 v0, 0x80008000, v0
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_copysign_v2bf16_0_v2bf16:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_bfi_b32 v0, 0x7fff7fff, 0, v0
+; GFX11-NEXT:    v_and_b32_e32 v0, 0x80008000, v0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %op = call <2 x bfloat> @llvm.copysign.v2bf16(<2 x bfloat> zeroinitializer, <2 x bfloat> %sign)
   ret <2 x bfloat> %op
@@ -8622,10 +8601,7 @@ define amdgpu_ps i32 @s_copysign_v2bf16_0_v2f32(<2 x float> inreg %sign) {
 ; GFX9-NEXT:    s_cselect_b32 s0, s3, s4
 ; GFX9-NEXT:    s_lshr_b32 s0, s0, 16
 ; GFX9-NEXT:    s_pack_ll_b32_b16 s0, s0, s2
-; GFX9-NEXT:    s_mov_b32 s1, 0x7fff7fff
-; GFX9-NEXT:    v_mov_b32_e32 v0, s0
-; GFX9-NEXT:    v_bfi_b32 v0, s1, 0, v0
-; GFX9-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX9-NEXT:    s_and_b32 s0, 0x80008000, s0
 ; GFX9-NEXT:    ; return to shader part epilog
 ;
 ; GFX10-LABEL: s_copysign_v2bf16_0_v2f32:
@@ -8647,8 +8623,7 @@ define amdgpu_ps i32 @s_copysign_v2bf16_0_v2f32(<2 x float> inreg %sign) {
 ; GFX10-NEXT:    s_cselect_b32 s0, s0, s2
 ; GFX10-NEXT:    s_lshr_b32 s0, s0, 16
 ; GFX10-NEXT:    s_pack_ll_b32_b16 s0, s0, s1
-; GFX10-NEXT:    v_bfi_b32 v0, 0x7fff7fff, 0, s0
-; GFX10-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX10-NEXT:    s_and_b32 s0, 0x80008000, s0
 ; GFX10-NEXT:    ; return to shader part epilog
 ;
 ; GFX11-LABEL: s_copysign_v2bf16_0_v2f32:
@@ -8671,9 +8646,8 @@ define amdgpu_ps i32 @s_copysign_v2bf16_0_v2f32(<2 x float> inreg %sign) {
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
 ; GFX11-NEXT:    s_lshr_b32 s0, s0, 16
 ; GFX11-NEXT:    s_pack_ll_b32_b16 s0, s0, s1
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_bfi_b32 v0, 0x7fff7fff, 0, s0
-; GFX11-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT:    s_and_b32 s0, 0x80008000, s0
 ; GFX11-NEXT:    ; return to shader part epilog
   %sign.trunc = fptrunc <2 x float> %sign to <2 x bfloat>
   %op = call <2 x bfloat> @llvm.copysign.v2bf16(<2 x bfloat> zeroinitializer, <2 x bfloat> %sign.trunc)
@@ -8735,8 +8709,7 @@ define <2 x bfloat> @v_copysign_v2bf16_0_v2bf32(<2 x float> %sign) {
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v2, v3, vcc
 ; GFX9-NEXT:    s_mov_b32 s4, 0x7060302
 ; GFX9-NEXT:    v_perm_b32 v0, v1, v0, s4
-; GFX9-NEXT:    s_mov_b32 s4, 0x7fff7fff
-; GFX9-NEXT:    v_bfi_b32 v0, s4, 0, v0
+; GFX9-NEXT:    v_and_b32_e32 v0, 0x80008000, v0
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_copysign_v2bf16_0_v2bf32:
@@ -8753,7 +8726,7 @@ define <2 x bfloat> @v_copysign_v2bf16_0_v2bf32(<2 x float> %sign) {
 ; GFX10-NEXT:    v_cmp_u_f32_e32 vcc_lo, v1, v1
 ; GFX10-NEXT:    v_cndmask_b32_e32 v1, v3, v5, vcc_lo
 ; GFX10-NEXT:    v_perm_b32 v0, v1, v0, 0x7060302
-; GFX10-NEXT:    v_bfi_b32 v0, 0x7fff7fff, 0, v0
+; GFX10-NEXT:    v_and_b32_e32 v0, 0x80008000, v0
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11TRUE16-LABEL: v_copysign_v2bf16_0_v2bf32:
@@ -8772,7 +8745,7 @@ define <2 x bfloat> @v_copysign_v2bf16_0_v2bf32(<2 x float> %sign) {
 ; GFX11TRUE16-NEXT:    v_cndmask_b32_e32 v1, v3, v5, vcc_lo
 ; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11TRUE16-NEXT:    v_mov_b16_e32 v1.l, v0.h
-; GFX11TRUE16-NEXT:    v_bfi_b32 v0, 0x7fff7fff, 0, v1
+; GFX11TRUE16-NEXT:    v_and_b32_e32 v0, 0x80008000, v1
 ; GFX11TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11FAKE16-LABEL: v_copysign_v2bf16_0_v2bf32:
@@ -8791,7 +8764,7 @@ define <2 x bfloat> @v_copysign_v2bf16_0_v2bf32(<2 x float> %sign) {
 ; GFX11FAKE16-NEXT:    v_cndmask_b32_e32 v1, v3, v5, vcc_lo
 ; GFX11FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11FAKE16-NEXT:    v_perm_b32 v0, v1, v0, 0x7060302
-; GFX11FAKE16-NEXT:    v_bfi_b32 v0, 0x7fff7fff, 0, v0
+; GFX11FAKE16-NEXT:    v_and_b32_e32 v0, 0x80008000, v0
 ; GFX11FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %sign.trunc = fptrunc <2 x float> %sign to <2 x bfloat>
   %op = call <2 x bfloat> @llvm.copysign.v2bf16(<2 x bfloat> zeroinitializer, <2 x bfloat> %sign.trunc)
@@ -8829,25 +8802,20 @@ define amdgpu_ps i32 @s_copysign_v2bf16_0_v2f64(<2 x double> inreg %sign) {
 ; GFX9-LABEL: s_copysign_v2bf16_0_v2f64:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_pack_ll_b32_b16 s0, s1, s3
-; GFX9-NEXT:    s_mov_b32 s1, 0x7fff7fff
-; GFX9-NEXT:    v_mov_b32_e32 v0, s0
-; GFX9-NEXT:    v_bfi_b32 v0, s1, 0, v0
-; GFX9-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX9-NEXT:    s_and_b32 s0, 0x80008000, s0
 ; GFX9-NEXT:    ; return to shader part epilog
 ;
 ; GFX10-LABEL: s_copysign_v2bf16_0_v2f64:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_pack_ll_b32_b16 s0, s1, s3
-; GFX10-NEXT:    v_bfi_b32 v0, 0x7fff7fff, 0, s0
-; GFX10-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX10-NEXT:    s_and_b32 s0, 0x80008000, s0
 ; GFX10-NEXT:    ; return to shader part epilog
 ;
 ; GFX11-LABEL: s_copysign_v2bf16_0_v2f64:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_pack_ll_b32_b16 s0, s1, s3
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_bfi_b32 v0, 0x7fff7fff, 0, s0
-; GFX11-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT:    s_and_b32 s0, 0x80008000, s0
 ; GFX11-NEXT:    ; return to shader part epilog
   %sign.trunc = fptrunc <2 x double> %sign to <2 x bfloat>
   %op = call <2 x bfloat> @llvm.copysign.v2bf16(<2 x bfloat> zeroinitializer, <2 x bfloat> %sign.trunc)
@@ -8893,15 +8861,14 @@ define <2 x bfloat> @v_copysign_v2bf16_0_v2bf64(<2 x double> %sign) {
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_mov_b32 s4, 0x5040100
 ; GFX9-NEXT:    v_perm_b32 v0, v3, v1, s4
-; GFX9-NEXT:    s_mov_b32 s4, 0x7fff7fff
-; GFX9-NEXT:    v_bfi_b32 v0, s4, 0, v0
+; GFX9-NEXT:    v_and_b32_e32 v0, 0x80008000, v0
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_copysign_v2bf16_0_v2bf64:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_perm_b32 v0, v3, v1, 0x5040100
-; GFX10-NEXT:    v_bfi_b32 v0, 0x7fff7fff, 0, v0
+; GFX10-NEXT:    v_and_b32_e32 v0, 0x80008000, v0
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11TRUE16-LABEL: v_copysign_v2bf16_0_v2bf64:
@@ -8909,7 +8876,7 @@ define <2 x bfloat> @v_copysign_v2bf16_0_v2bf64(<2 x double> %sign) {
 ; GFX11TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11TRUE16-NEXT:    v_mov_b16_e32 v1.h, v3.l
 ; GFX11TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11TRUE16-NEXT:    v_bfi_b32 v0, 0x7fff7fff, 0, v1
+; GFX11TRUE16-NEXT:    v_and_b32_e32 v0, 0x80008000, v1
 ; GFX11TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11FAKE16-LABEL: v_copysign_v2bf16_0_v2bf64:
@@ -8917,7 +8884,7 @@ define <2 x bfloat> @v_copysign_v2bf16_0_v2bf64(<2 x double> %sign) {
 ; GFX11FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11FAKE16-NEXT:    v_perm_b32 v0, v3, v1, 0x5040100
 ; GFX11FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11FAKE16-NEXT:    v_bfi_b32 v0, 0x7fff7fff, 0, v0
+; GFX11FAKE16-NEXT:    v_and_b32_e32 v0, 0x80008000, v0
 ; GFX11FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %sign.trunc = fptrunc <2 x double> %sign to <2 x bfloat>
   %op = call <2 x bfloat> @llvm.copysign.v2bf16(<2 x bfloat> zeroinitializer, <2 x bfloat> %sign.trunc)
diff --git a/llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll b/llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll
index 3e7b7f3878bd8..eec6bab67b6c2 100644
--- a/llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll
@@ -6922,36 +6922,31 @@ define half @v_copysign_f16_0_f16(half %sign) {
 ; SI-LABEL: v_copysign_f16_0_f16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-NEXT:    s_brev_b32 s4, -2
-; SI-NEXT:    v_bfi_b32 v0, s4, 0, v0
+; SI-NEXT:    v_and_b32_e32 v0, 0x80000000, v0
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: v_copysign_f16_0_f16:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-NEXT:    s_movk_i32 s4, 0x7fff
-; VI-NEXT:    v_bfi_b32 v0, s4, 0, v0
+; VI-NEXT:    v_and_b32_e32 v0, 0x8000, v0
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_copysign_f16_0_f16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    s_movk_i32 s4, 0x7fff
-; GFX9-NEXT:    v_bfi_b32 v0, s4, 0, v0
+; GFX9-NEXT:    v_and_b32_e32 v0, 0x8000, v0
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-TRUE16-LABEL: v_copysign_f16_0_f16:
 ; GFX11-TRUE16:       ; %bb.0:
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v1.l, 0
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_bfi_b32 v0, 0x7fff, v1, v0
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.l, 0x8000, v0.l
 ; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-FAKE16-LABEL: v_copysign_f16_0_f16:
 ; GFX11-FAKE16:       ; %bb.0:
 ; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-FAKE16-NEXT:    v_bfi_b32 v0, 0x7fff, 0, v0
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0x8000, v0
 ; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %op = call half @llvm.copysign.f16(half 0.0, half %sign)
   ret half %op
@@ -7005,33 +7000,27 @@ define half @v_copysign_f16_0_f32(float %sign) {
 ; SI-LABEL: v_copysign_f16_0_f32:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-NEXT:    s_brev_b32 s4, -2
-; SI-NEXT:    v_bfi_b32 v0, s4, 0, v0
+; SI-NEXT:    v_and_b32_e32 v0, 0x80000000, v0
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: v_copysign_f16_0_f32:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
-; VI-NEXT:    s_movk_i32 s4, 0x7fff
-; VI-NEXT:    v_bfi_b32 v0, s4, 0, v0
+; VI-NEXT:    v_mov_b32_e32 v1, 0x8000
+; VI-NEXT:    v_and_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_copysign_f16_0_f32:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
-; GFX9-NEXT:    s_movk_i32 s4, 0x7fff
-; GFX9-NEXT:    v_bfi_b32 v0, s4, 0, v0
+; GFX9-NEXT:    v_mov_b32_e32 v1, 0x8000
+; GFX9-NEXT:    v_and_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-TRUE16-LABEL: v_copysign_f16_0_f32:
 ; GFX11-TRUE16:       ; %bb.0:
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v1.l, 0
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_bfi_b32 v0, 0x7fff, v1, v0
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.l, 0x8000, v0.h
 ; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-FAKE16-LABEL: v_copysign_f16_0_f32:
@@ -7039,7 +7028,7 @@ define half @v_copysign_f16_0_f32(float %sign) {
 ; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-FAKE16-NEXT:    v_bfi_b32 v0, 0x7fff, 0, v0
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0x8000, v0
 ; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %sign.trunc = fptrunc float %sign to half
   %op = call half @llvm.copysign.f16(half 0.0, half %sign.trunc)
@@ -7246,33 +7235,27 @@ define half @v_copysign_f16_0_f64(double %sign) {
 ; SI-LABEL: v_copysign_f16_0_f64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-NEXT:    s_brev_b32 s4, -2
-; SI-NEXT:    v_bfi_b32 v0, s4, 0, v1
+; SI-NEXT:    v_and_b32_e32 v0, 0x80000000, v1
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: v_copysign_f16_0_f64:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-NEXT:    v_lshrrev_b32_e32 v0, 16, v1
-; VI-NEXT:    s_movk_i32 s4, 0x7fff
-; VI-NEXT:    v_bfi_b32 v0, s4, 0, v0
+; VI-NEXT:    v_mov_b32_e32 v0, 0x8000
+; VI-NEXT:    v_and_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_copysign_f16_0_f64:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 16, v1
-; GFX9-NEXT:    s_movk_i32 s4, 0x7fff
-; GFX9-NEXT:    v_bfi_b32 v0, s4, 0, v0
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0x8000
+; GFX9-NEXT:    v_and_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-TRUE16-LABEL: v_copysign_f16_0_f64:
 ; GFX11-TRUE16:       ; %bb.0:
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v0.l, 0
-; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_bfi_b32 v0, 0x7fff, v0, v1
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.l, 0x8000, v1.h
 ; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-FAKE16-LABEL: v_copysign_f16_0_f64:
@@ -7280,7 +7263,7 @@ define half @v_copysign_f16_0_f64(double %sign) {
 ; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v0, 16, v1
 ; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-FAKE16-NEXT:    v_bfi_b32 v0, 0x7fff, 0, v0
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0x8000, v0
 ; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %sign.trunc = fptrunc double %sign to half
   %op = call half @llvm.copysign.f16(half 0.0, half %sign.trunc)
@@ -7292,12 +7275,11 @@ define amdgpu_ps i32 @s_copysign_v2f16_0_v2f16(<2 x half> inreg %sign) {
 ; SI:       ; %bb.0:
 ; SI-NEXT:    v_cvt_f16_f32_e32 v0, s1
 ; SI-NEXT:    v_cvt_f16_f32_e32 v1, s0
-; SI-NEXT:    s_brev_b32 s0, -2
 ; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
-; SI-NEXT:    v_bfi_b32 v0, s0, 0, v0
+; SI-NEXT:    v_and_b32_e32 v0, 0x80000000, v0
 ; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; SI-NEXT:    v_bfi_b32 v1, s0, 0, v1
+; SI-NEXT:    v_and_b32_e32 v1, 0x80000000, v1
 ; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; SI-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; SI-NEXT:    v_or_b32_e32 v0, v1, v0
@@ -7314,17 +7296,12 @@ define amdgpu_ps i32 @s_copysign_v2f16_0_v2f16(<2 x half> inreg %sign) {
 ;
 ; GFX9-LABEL: s_copysign_v2f16_0_v2f16:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_mov_b32 s1, 0x7fff7fff
-; GFX9-NEXT:    v_mov_b32_e32 v0, s0
-; GFX9-NEXT:    v_bfi_b32 v0, s1, 0, v0
-; GFX9-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX9-NEXT:    s_and_b32 s0, 0x80008000, s0
 ; GFX9-NEXT:    ; return to shader part epilog
 ;
 ; GFX11-LABEL: s_copysign_v2f16_0_v2f16:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    v_bfi_b32 v0, 0x7fff7fff, 0, s0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX11-NEXT:    s_and_b32 s0, 0x80008000, s0
 ; GFX11-NEXT:    ; return to shader part epilog
   %op = call <2 x half> @llvm.copysign.v2f16(<2 x half> zeroinitializer, <2 x half> %sign)
   %cast = bitcast <2 x half> %op to i32
@@ -7337,11 +7314,10 @@ define <2 x half> @v_copysign_v2f16_0_v2f16(<2 x half> %sign) {
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
-; SI-NEXT:    s_brev_b32 s4, -2
 ; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
-; SI-NEXT:    v_bfi_b32 v0, s4, 0, v0
-; SI-NEXT:    v_bfi_b32 v1, s4, 0, v1
+; SI-NEXT:    v_and_b32_e32 v0, 0x80000000, v0
+; SI-NEXT:    v_and_b32_e32 v1, 0x80000000, v1
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: v_copysign_v2f16_0_v2f16:
@@ -7354,14 +7330,13 @@ define <2 x half> @v_copysign_v2f16_0_v2f16(<2 x half> %sign) {
 ; GFX9-LABEL: v_copysign_v2f16_0_v2f16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    s_mov_b32 s4, 0x7fff7fff
-; GFX9-NEXT:    v_bfi_b32 v0, s4, 0, v0
+; GFX9-NEXT:    v_and_b32_e32 v0, 0x80008000, v0
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_copysign_v2f16_0_v2f16:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_bfi_b32 v0, 0x7fff7fff, 0, v0
+; GFX11-NEXT:    v_and_b32_e32 v0, 0x80008000, v0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %op = call <2 x half> @llvm.copysign.v2f16(<2 x half> zeroinitializer, <2 x half> %sign)
   ret <2 x half> %op
@@ -7372,12 +7347,11 @@ define amdgpu_ps i32 @s_copysign_v2f16_0_v2f32(<2 x float> inreg %sign) {
 ; SI:       ; %bb.0:
 ; SI-NEXT:    v_cvt_f16_f32_e32 v0, s1
 ; SI-NEXT:    v_cvt_f16_f32_e32 v1, s0
-; SI-NEXT:    s_brev_b32 s0, -2
 ; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
-; SI-NEXT:    v_bfi_b32 v0, s0, 0, v0
+; SI-NEXT:    v_and_b32_e32 v0, 0x80000000, v0
 ; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; SI-NEXT:    v_bfi_b32 v1, s0, 0, v1
+; SI-NEXT:    v_and_b32_e32 v1, 0x80000000, v1
 ; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; SI-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; SI-NEXT:    v_or_b32_e32 v0, v1, v0
@@ -7399,9 +7373,8 @@ define amdgpu_ps i32 @s_copysign_v2f16_0_v2f32(<2 x float> inreg %sign) {
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    v_cvt_f16_f32_e32 v0, s1
 ; GFX9-NEXT:    v_cvt_f16_f32_e32 v1, s0
-; GFX9-NEXT:    s_mov_b32 s0, 0x7fff7fff
 ; GFX9-NEXT:    v_pack_b32_f16 v0, v1, v0
-; GFX9-NEXT:    v_bfi_b32 v0, s0, 0, v0
+; GFX9-NEXT:    v_and_b32_e32 v0, 0x80008000, v0
 ; GFX9-NEXT:    v_readfirstlane_b32 s0, v0
 ; GFX9-NEXT:    ; return to shader part epilog
 ;
@@ -7410,7 +7383,7 @@ define amdgpu_ps i32 @s_copysign_v2f16_0_v2f32(<2 x float> inreg %sign) {
 ; GFX11-TRUE16-NEXT:    v_cvt_f16_f32_e32 v0.h, s1
 ; GFX11-TRUE16-NEXT:    v_cvt_f16_f32_e32 v0.l, s0
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_bfi_b32 v0, 0x7fff7fff, 0, v0
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0x80008000, v0
 ; GFX11-TRUE16-NEXT:    v_readfirstlane_b32 s0, v0
 ; GFX11-TRUE16-NEXT:    ; return to shader part epilog
 ;
@@ -7420,7 +7393,7 @@ define amdgpu_ps i32 @s_copysign_v2f16_0_v2f32(<2 x float> inreg %sign) {
 ; GFX11-FAKE16-NEXT:    v_cvt_f16_f32_e32 v1, s0
 ; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-FAKE16-NEXT:    v_pack_b32_f16 v0, v1, v0
-; GFX11-FAKE16-NEXT:    v_bfi_b32 v0, 0x7fff7fff, 0, v0
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0x80008000, v0
 ; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-FAKE16-NEXT:    v_readfirstlane_b32 s0, v0
 ; GFX11-FAKE16-NEXT:    ; return to shader part epilog
@@ -7436,11 +7409,10 @@ define <2 x half> @v_copysign_v2f16_0_v2bf32(<2 x float> %sign) {
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
-; SI-NEXT:    s_brev_b32 s4, -2
 ; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
-; SI-NEXT:    v_bfi_b32 v0, s4, 0, v0
-; SI-NEXT:    v_bfi_b32 v1, s4, 0, v1
+; SI-NEXT:    v_and_b32_e32 v0, 0x80000000, v0
+; SI-NEXT:    v_and_b32_e32 v1, 0x80000000, v1
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: v_copysign_v2f16_0_v2bf32:
@@ -7458,9 +7430,8 @@ define <2 x half> @v_copysign_v2f16_0_v2bf32(<2 x float> %sign) {
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; GFX9-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; GFX9-NEXT:    s_mov_b32 s4, 0x7fff7fff
 ; GFX9-NEXT:    v_pack_b32_f16 v0, v0, v1
-; GFX9-NEXT:    v_bfi_b32 v0, s4, 0, v0
+; GFX9-NEXT:    v_and_b32_e32 v0, 0x80008000, v0
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-TRUE16-LABEL: v_copysign_v2f16_0_v2bf32:
@@ -7469,7 +7440,7 @@ define <2 x half> @v_copysign_v2f16_0_v2bf32(<2 x float> %sign) {
 ; GFX11-TRUE16-NEXT:    v_cvt_f16_f32_e32 v1.h, v1
 ; GFX11-TRUE16-NEXT:    v_cvt_f16_f32_e32 v1.l, v0
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_bfi_b32 v0, 0x7fff7fff, 0, v1
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0x80008000, v1
 ; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-FAKE16-LABEL: v_copysign_v2f16_0_v2bf32:
@@ -7479,7 +7450,7 @@ define <2 x half> @v_copysign_v2f16_0_v2bf32(<2 x float> %sign) {
 ; GFX11-FAKE16-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-FAKE16-NEXT:    v_pack_b32_f16 v0, v0, v1
-; GFX11-FAKE16-NEXT:    v_bfi_b32 v0, 0x7fff7fff, 0, v0
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0x80008000, v0
 ; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %sign.trunc = fptrunc <2 x float> %sign to <2 x half>
   %op = call <2 x half> @llvm.copysign.v2f16(<2 x half> zeroinitializer, <2 x half> %sign.trunc)
@@ -7489,13 +7460,10 @@ define <2 x half> @v_copysign_v2f16_0_v2bf32(<2 x float> %sign) {
 define amdgpu_ps i32 @s_copysign_v2f16_0_v2f64(<2 x double> inreg %sign) {
 ; SI-LABEL: s_copysign_v2f16_0_v2f64:
 ; SI:       ; %bb.0:
-; SI-NEXT:    s_brev_b32 s0, -2
-; SI-NEXT:    v_mov_b32_e32 v0, s3
-; SI-NEXT:    v_bfi_b32 v0, s0, 0, v0
-; SI-NEXT:    v_mov_b32_e32 v1, s1
-; SI-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; SI-NEXT:    v_bfi_b32 v1, s0, 0, v1
-; SI-NEXT:    v_cvt_f16_f32_e32 v1, v1
+; SI-NEXT:    s_and_b32 s0, 0x80000000, s3
+; SI-NEXT:    v_cvt_f16_f32_e32 v0, s0
+; SI-NEXT:    s_and_b32 s0, 0x80000000, s1
+; SI-NEXT:    v_cvt_f16_f32_e32 v1, s0
 ; SI-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; SI-NEXT:    v_or_b32_e32 v0, v1, v0
 ; SI-NEXT:    v_readfirstlane_b32 s0, v0
@@ -7515,18 +7483,14 @@ define amdgpu_ps i32 @s_copysign_v2f16_0_v2f64(<2 x double> inreg %sign) {
 ; GFX9-LABEL: s_copysign_v2f16_0_v2f64:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_pack_ll_b32_b16 s0, s1, s3
-; GFX9-NEXT:    s_mov_b32 s1, 0x7fff7fff
-; GFX9-NEXT:    v_mov_b32_e32 v0, s0
-; GFX9-NEXT:    v_bfi_b32 v0, s1, 0, v0
-; GFX9-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX9-NEXT:    s_and_b32 s0, 0x80008000, s0
 ; GFX9-NEXT:    ; return to shader part epilog
 ;
 ; GFX11-LABEL: s_copysign_v2f16_0_v2f64:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_pack_ll_b32_b16 s0, s1, s3
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_bfi_b32 v0, 0x7fff7fff, 0, s0
-; GFX11-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-NEXT:    s_and_b32 s0, 0x80008000, s0
 ; GFX11-NEXT:    ; return to shader part epilog
   %sign.trunc = fptrunc <2 x double> %sign to <2 x half>
   %op = call <2 x half> @llvm.copysign.v2f16(<2 x half> zeroinitializer, <2 x half> %sign.trunc)
@@ -7538,9 +7502,8 @@ define <2 x half> @v_copysign_v2f16_0_v2bf64(<2 x double> %sign) {
 ; SI-LABEL: v_copysign_v2f16_0_v2bf64:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-NEXT:    s_brev_b32 s4, -2
-; SI-NEXT:    v_bfi_b32 v0, s4, 0, v1
-; SI-NEXT:    v_bfi_b32 v1, s4, 0, v3
+; SI-NEXT:    v_and_b32_e32 v0, 0x80000000, v1
+; SI-NEXT:    v_and_b32_e32 v1, 0x80000000, v3
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: v_copysign_v2f16_0_v2bf64:
@@ -7558,8 +7521,7 @@ define <2 x half> @v_copysign_v2f16_0_v2bf64(<2 x double> %sign) {
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_mov_b32 s4, 0x5040100
 ; GFX9-NEXT:    v_perm_b32 v0, v3, v1, s4
-; GFX9-NEXT:    s_mov_b32 s4, 0x7fff7fff
-; GFX9-NEXT:    v_bfi_b32 v0, s4, 0, v0
+; GFX9-NEXT:    v_and_b32_e32 v0, 0x80008000, v0
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-TRUE16-LABEL: v_copysign_v2f16_0_v2bf64:
@@ -7567,7 +7529,7 @@ define <2 x half> @v_copysign_v2f16_0_v2bf64(<2 x double> %sign) {
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v1.h, v3.l
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_bfi_b32 v0, 0x7fff7fff, 0, v1
+; GFX11-TRUE16-NEXT:    v_and_b32_e32 v0, 0x80008000, v1
 ; GFX11-TRUE16-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-FAKE16-LABEL: v_copysign_v2f16_0_v2bf64:
@@ -7575,7 +7537,7 @@ define <2 x half> @v_copysign_v2f16_0_v2bf64(<2 x double> %sign) {
 ; GFX11-FAKE16-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-FAKE16-NEXT:    v_perm_b32 v0, v3, v1, 0x5040100
 ; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-FAKE16-NEXT:    v_bfi_b32 v0, 0x7fff7fff, 0, v0
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0x80008000, v0
 ; GFX11-FAKE16-NEXT:    s_setpc_b64 s[30:31]
   %sign.trunc = fptrunc <2 x double> %sign to <2 x half>
   %op = call <2 x half> @llvm.copysign.v2f16(<2 x half> zeroinitializer, <2 x half> %sign.trunc)
diff --git a/llvm/test/CodeGen/AMDGPU/fcopysign.f32.ll b/llvm/test/CodeGen/AMDGPU/fcopysign.f32.ll
index 83d57e4e6e48c..0efb7962ef708 100644
--- a/llvm/test/CodeGen/AMDGPU/fcopysign.f32.ll
+++ b/llvm/test/CodeGen/AMDGPU/fcopysign.f32.ll
@@ -1126,14 +1126,13 @@ define float @v_copysign_f32_0_f32(float %sign) {
 ; SIVI-LABEL: v_copysign_f32_0_f32:
 ; SIVI:       ; %bb.0:
 ; SIVI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SIVI-NEXT:    s_brev_b32 s4, -2
-; SIVI-NEXT:    v_bfi_b32 v0, s4, 0, v0
+; SIVI-NEXT:    v_and_b32_e32 v0, 0x80000000, v0
 ; SIVI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_copysign_f32_0_f32:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_bfi_b32 v0, 0x7fffffff, 0, v0
+; GFX11-NEXT:    v_and_b32_e32 v0, 0x80000000, v0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %result = call float @llvm.copysign.f32(float 0.0, float %sign)
   ret float %result
@@ -1164,14 +1163,13 @@ define float @v_copysign_f32_0_f64(double %sign) {
 ; SIVI-LABEL: v_copysign_f32_0_f64:
 ; SIVI:       ; %bb.0:
 ; SIVI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SIVI-NEXT:    s_brev_b32 s4, -2
-; SIVI-NEXT:    v_bfi_b32 v0, s4, 0, v1
+; SIVI-NEXT:    v_and_b32_e32 v0, 0x80000000, v1
 ; SIVI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_copysign_f32_0_f64:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_bfi_b32 v0, 0x7fffffff, 0, v1
+; GFX11-NEXT:    v_and_b32_e32 v0, 0x80000000, v1
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %sign.trunc = fptrunc double %sign to float
   %result = call float @llvm.copysign.f32(float 0.0, float %sign.trunc)
@@ -1208,24 +1206,21 @@ define float @v_copysign_f32_0_f16(half %sign) {
 ; SI-LABEL: v_copysign_f32_0_f16:
 ; SI:       ; %bb.0:
 ; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-NEXT:    s_brev_b32 s4, -2
-; SI-NEXT:    v_bfi_b32 v0, s4, 0, v0
+; SI-NEXT:    v_and_b32_e32 v0, 0x80000000, v0
 ; SI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: v_copysign_f32_0_f16:
 ; VI:       ; %bb.0:
 ; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; VI-NEXT:    s_brev_b32 s4, -2
-; VI-NEXT:    v_bfi_b32 v0, s4, 0, v0
+; VI-NEXT:    v_and_b32_e32 v0, 0x80000000, v0
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_copysign_f32_0_f16:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_mov_b16_e32 v0.h, v0.l
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_bfi_b32 v0, 0x7fffffff, 0, v0
+; GFX11-NEXT:    v_and_b16 v0.h, 0x8000, v0.l
+; GFX11-NEXT:    v_mov_b16_e32 v0.l, 0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %sign.ext = fpext half %sign to float
   %result = call float @llvm.copysign.f32(float 0.0, float %sign.ext)
diff --git a/llvm/test/CodeGen/AMDGPU/frem.ll b/llvm/test/CodeGen/AMDGPU/frem.ll
index 11954ab7e5a2c..c17a1d181015c 100644
--- a/llvm/test/CodeGen/AMDGPU/frem.ll
+++ b/llvm/test/CodeGen/AMDGPU/frem.ll
@@ -29,15 +29,14 @@ define amdgpu_kernel void @frem_f16(ptr addrspace(1) %out, ptr addrspace(1) %in1
 ; SI-NEXT:    buffer_load_ushort v1, off, s[0:3], 0 offset:8
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
-; SI-NEXT:    s_brev_b32 s0, -2
 ; SI-NEXT:    v_and_b32_e32 v3, 0x7fffffff, v0
 ; SI-NEXT:    v_and_b32_e32 v2, 0x7fffffff, v1
-; SI-NEXT:    v_cmp_ngt_f32_e64 s[2:3], |v0|, |v1|
-; SI-NEXT:    s_and_b64 vcc, exec, s[2:3]
+; SI-NEXT:    v_cmp_ngt_f32_e64 s[0:1], |v0|, |v1|
+; SI-NEXT:    s_and_b64 vcc, exec, s[0:1]
 ; SI-NEXT:    v_cvt_f16_f32_e32 v4, v0
 ; SI-NEXT:    s_cbranch_vccz .LBB0_2
 ; SI-NEXT:  ; %bb.1: ; %frem.else
-; SI-NEXT:    v_bfi_b32 v5, s0, 0, v0
+; SI-NEXT:    v_and_b32_e32 v5, 0x80000000, v0
 ; SI-NEXT:    v_cvt_f32_f16_e32 v4, v4
 ; SI-NEXT:    v_cmp_eq_f32_e32 vcc, v3, v2
 ; SI-NEXT:    v_cndmask_b32_e32 v4, v4, v5, vcc
@@ -147,20 +146,19 @@ define amdgpu_kernel void @frem_f16(ptr addrspace(1) %out, ptr addrspace(1) %in1
 ; CI-NEXT:    buffer_load_ushort v0, off, s[4:7], 0
 ; CI-NEXT:    s_mov_b32 s3, s7
 ; CI-NEXT:    buffer_load_ushort v1, off, s[0:3], 0 offset:8
-; CI-NEXT:    s_brev_b32 s0, -2
 ; CI-NEXT:    s_waitcnt vmcnt(1)
 ; CI-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; CI-NEXT:    s_waitcnt vmcnt(0)
 ; CI-NEXT:    v_cvt_f32_f16_e32 v1, v1
 ; CI-NEXT:    v_cvt_f16_f32_e32 v3, v0
 ; CI-NEXT:    v_and_b32_e32 v4, 0x7fffffff, v0
-; CI-NEXT:    v_cmp_ngt_f32_e64 s[2:3], |v0|, |v1|
+; CI-NEXT:    v_cmp_ngt_f32_e64 s[0:1], |v0|, |v1|
 ; CI-NEXT:    v_and_b32_e32 v2, 0x7fffffff, v1
-; CI-NEXT:    s_and_b64 vcc, exec, s[2:3]
+; CI-NEXT:    s_and_b64 vcc, exec, s[0:1]
 ; CI-NEXT:    s_cbranch_vccz .LBB0_2
 ; CI-NEXT:  ; %bb.1: ; %frem.else
 ; CI-NEXT:    v_cvt_f32_f16_e32 v3, v3
-; CI-NEXT:    v_bfi_b32 v5, s0, 0, v0
+; CI-NEXT:    v_and_b32_e32 v5, 0x80000000, v0
 ; CI-NEXT:    v_cmp_eq_f32_e32 vcc, v4, v2
 ; CI-NEXT:    v_cndmask_b32_e32 v3, v3, v5, vcc
 ; CI-NEXT:    s_cbranch_execz .LBB0_3
@@ -259,27 +257,26 @@ define amdgpu_kernel void @frem_f16(ptr addrspace(1) %out, ptr addrspace(1) %in1
 ; VI-NEXT:    s_waitcnt vmcnt(1)
 ; VI-NEXT:    v_cvt_f32_f16_e64 v4, |v0|
 ; VI-NEXT:    s_waitcnt vmcnt(0)
-; VI-NEXT:    v_cvt_f32_f16_e64 v2, |v1|
-; VI-NEXT:    v_cmp_ngt_f32_e32 vcc, v4, v2
+; VI-NEXT:    v_cvt_f32_f16_e64 v3, |v1|
+; VI-NEXT:    v_cmp_ngt_f32_e32 vcc, v4, v3
 ; VI-NEXT:    s_cbranch_vccz .LBB0_2
 ; VI-NEXT:  ; %bb.1: ; %frem.else
-; VI-NEXT:    s_movk_i32 s2, 0x7fff
-; VI-NEXT:    v_bfi_b32 v3, s2, 0, v0
-; VI-NEXT:    v_cmp_eq_f32_e32 vcc, v4, v2
-; VI-NEXT:    v_cndmask_b32_e32 v3, v0, v3, vcc
+; VI-NEXT:    v_and_b32_e32 v2, 0x8000, v0
+; VI-NEXT:    v_cmp_eq_f32_e32 vcc, v4, v3
+; VI-NEXT:    v_cndmask_b32_e32 v2, v0, v2, vcc
 ; VI-NEXT:    s_cbranch_execz .LBB0_3
 ; VI-NEXT:    s_branch .LBB0_8
 ; VI-NEXT:  .LBB0_2:
-; VI-NEXT:    ; implicit-def: $vgpr3
+; VI-NEXT:    ; implicit-def: $vgpr2
 ; VI-NEXT:  .LBB0_3: ; %frem.compute
-; VI-NEXT:    v_frexp_mant_f32_e32 v3, v4
-; VI-NEXT:    v_ldexp_f32 v5, v3, 11
-; VI-NEXT:    v_frexp_mant_f32_e32 v3, v2
-; VI-NEXT:    v_ldexp_f32 v3, v3, 1
+; VI-NEXT:    v_frexp_exp_i32_f32_e32 v7, v4
+; VI-NEXT:    v_frexp_mant_f32_e32 v2, v4
+; VI-NEXT:    v_frexp_mant_f32_e32 v4, v3
+; VI-NEXT:    v_frexp_exp_i32_f32_e32 v8, v3
+; VI-NEXT:    v_ldexp_f32 v3, v4, 1
 ; VI-NEXT:    v_div_scale_f32 v9, s[2:3], v3, v3, 1.0
-; VI-NEXT:    v_frexp_exp_i32_f32_e32 v8, v2
+; VI-NEXT:    v_ldexp_f32 v5, v2, 11
 ; VI-NEXT:    v_add_u32_e32 v2, vcc, -1, v8
-; VI-NEXT:    v_frexp_exp_i32_f32_e32 v7, v4
 ; VI-NEXT:    v_not_b32_e32 v4, v2
 ; VI-NEXT:    v_add_u32_e32 v4, vcc, v4, v7
 ; VI-NEXT:    v_div_scale_f32 v6, vcc, 1.0, v3, 1.0
@@ -326,17 +323,17 @@ define amdgpu_kernel void @frem_f16(ptr addrspace(1) %out, ptr addrspace(1) %in1
 ; VI-NEXT:    v_ldexp_f32 v2, v3, v2
 ; VI-NEXT:    v_cvt_f16_f32_e32 v2, v2
 ; VI-NEXT:    s_movk_i32 s2, 0x7fff
-; VI-NEXT:    v_bfi_b32 v3, s2, v2, v0
+; VI-NEXT:    v_bfi_b32 v2, s2, v2, v0
 ; VI-NEXT:  .LBB0_8: ; %Flow19
-; VI-NEXT:    v_mov_b32_e32 v4, s0
+; VI-NEXT:    v_mov_b32_e32 v3, s0
 ; VI-NEXT:    s_movk_i32 s0, 0x7c00
-; VI-NEXT:    v_mov_b32_e32 v5, s1
+; VI-NEXT:    v_mov_b32_e32 v4, s1
 ; VI-NEXT:    v_cmp_lg_f16_e32 vcc, 0, v1
 ; VI-NEXT:    v_cmp_nge_f16_e64 s[0:1], |v0|, s0
 ; VI-NEXT:    s_and_b64 vcc, s[0:1], vcc
 ; VI-NEXT:    v_mov_b32_e32 v0, 0x7e00
-; VI-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
-; VI-NEXT:    flat_store_short v[4:5], v0
+; VI-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
+; VI-NEXT:    flat_store_short v[3:4], v0
 ; VI-NEXT:    s_endpgm
 ;
 ; GFX9-LABEL: frem_f16:
@@ -354,8 +351,7 @@ define amdgpu_kernel void @frem_f16(ptr addrspace(1) %out, ptr addrspace(1) %in1
 ; GFX9-NEXT:    v_cmp_ngt_f32_e32 vcc, v4, v2
 ; GFX9-NEXT:    s_cbranch_vccz .LBB0_2
 ; GFX9-NEXT:  ; %bb.1: ; %frem.else
-; GFX9-NEXT:    s_movk_i32 s2, 0x7fff
-; GFX9-NEXT:    v_bfi_b32 v3, s2, 0, v0
+; GFX9-NEXT:    v_and_b32_e32 v3, 0x8000, v0
 ; GFX9-NEXT:    v_cmp_eq_f32_e32 vcc, v4, v2
 ; GFX9-NEXT:    v_cndmask_b32_e32 v3, v0, v3, vcc
 ; GFX9-NEXT:    s_cbranch_execz .LBB0_3
@@ -446,7 +442,7 @@ define amdgpu_kernel void @frem_f16(ptr addrspace(1) %out, ptr addrspace(1) %in1
 ; GFX10-NEXT:    v_cmp_ngt_f32_e32 vcc_lo, v3, v2
 ; GFX10-NEXT:    s_cbranch_vccz .LBB0_2
 ; GFX10-NEXT:  ; %bb.1: ; %frem.else
-; GFX10-NEXT:    v_bfi_b32 v4, 0x7fff, 0, v0
+; GFX10-NEXT:    v_and_b32_e32 v4, 0x8000, v0
 ; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, v3, v2
 ; GFX10-NEXT:    v_cndmask_b32_e32 v4, v0, v4, vcc_lo
 ; GFX10-NEXT:    s_cbranch_execz .LBB0_3
@@ -531,32 +527,30 @@ define amdgpu_kernel void @frem_f16(ptr addrspace(1) %out, ptr addrspace(1) %in1
 ; GFX11-TRUE16-NEXT:    global_load_d16_b16 v1, v0, s[2:3]
 ; GFX11-TRUE16-NEXT:    global_load_d16_b16 v0, v0, s[4:5] offset:8
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(1)
-; GFX11-TRUE16-NEXT:    v_cvt_f32_f16_e64 v3, |v1.l|
+; GFX11-TRUE16-NEXT:    v_cvt_f32_f16_e64 v4, |v1.l|
 ; GFX11-TRUE16-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-TRUE16-NEXT:    v_cvt_f32_f16_e64 v2, |v0.l|
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_cmp_ngt_f32_e32 vcc_lo, v3, v2
+; GFX11-TRUE16-NEXT:    v_cmp_ngt_f32_e32 vcc_lo, v4, v2
 ; GFX11-TRUE16-NEXT:    s_cbranch_vccz .LBB0_2
 ; GFX11-TRUE16-NEXT:  ; %bb.1: ; %frem.else
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v4.l, v1.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v5.l, 0
-; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, v3, v2
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_bfi_b32 v4, 0x7fff, v5, v4
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v4.l, v1.l, v4.l, vcc_lo
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.h, 0x8000, v1.l
+; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, v4, v2
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v3.l, v1.l, v0.h, vcc_lo
 ; GFX11-TRUE16-NEXT:    s_cbranch_execz .LBB0_3
 ; GFX11-TRUE16-NEXT:    s_branch .LBB0_8
 ; GFX11-TRUE16-NEXT:  .LBB0_2:
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr4
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr3
 ; GFX11-TRUE16-NEXT:  .LBB0_3: ; %frem.compute
-; GFX11-TRUE16-NEXT:    v_frexp_exp_i32_f32_e32 v5, v3
-; GFX11-TRUE16-NEXT:    v_frexp_mant_f32_e32 v3, v3
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_readfirstlane_b32 s2, v5
+; GFX11-TRUE16-NEXT:    v_frexp_mant_f32_e32 v3, v4
+; GFX11-TRUE16-NEXT:    v_frexp_exp_i32_f32_e32 v5, v4
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4)
 ; GFX11-TRUE16-NEXT:    v_ldexp_f32 v4, v3, 11
 ; GFX11-TRUE16-NEXT:    v_frexp_mant_f32_e32 v3, v2
 ; GFX11-TRUE16-NEXT:    v_frexp_exp_i32_f32_e32 v2, v2
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_readfirstlane_b32 s2, v5
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
 ; GFX11-TRUE16-NEXT:    v_ldexp_f32 v3, v3, 1
 ; GFX11-TRUE16-NEXT:    v_readfirstlane_b32 s3, v2
 ; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v2, -1, v2
@@ -625,14 +619,14 @@ define amdgpu_kernel void @frem_f16(ptr addrspace(1) %out, ptr addrspace(1) %in1
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v3.l, v1.l
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-TRUE16-NEXT:    v_cvt_f16_f32_e32 v2.l, v2
-; GFX11-TRUE16-NEXT:    v_bfi_b32 v4, 0x7fff, v2, v3
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v3, 0x7fff, v2, v3
 ; GFX11-TRUE16-NEXT:  .LBB0_8: ; %Flow19
 ; GFX11-TRUE16-NEXT:    v_cmp_lg_f16_e32 vcc_lo, 0, v0.l
 ; GFX11-TRUE16-NEXT:    v_cmp_nle_f16_e64 s2, 0x7c00, |v1.l|
 ; GFX11-TRUE16-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX11-TRUE16-NEXT:    s_and_b32 s2, s2, vcc_lo
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.l, 0x7e00, v4.l, s2
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.l, 0x7e00, v3.l, s2
 ; GFX11-TRUE16-NEXT:    global_store_b16 v2, v0, s[0:1]
 ; GFX11-TRUE16-NEXT:    s_endpgm
 ;
@@ -654,7 +648,7 @@ define amdgpu_kernel void @frem_f16(ptr addrspace(1) %out, ptr addrspace(1) %in1
 ; GFX11-FAKE16-NEXT:    v_cmp_ngt_f32_e32 vcc_lo, v3, v2
 ; GFX11-FAKE16-NEXT:    s_cbranch_vccz .LBB0_2
 ; GFX11-FAKE16-NEXT:  ; %bb.1: ; %frem.else
-; GFX11-FAKE16-NEXT:    v_bfi_b32 v4, 0x7fff, 0, v0
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v4, 0x8000, v0
 ; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, v3, v2
 ; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v4, v0, v4, vcc_lo
@@ -770,13 +764,11 @@ define amdgpu_kernel void @frem_f16(ptr addrspace(1) %out, ptr addrspace(1) %in1
 ; GFX1150-TRUE16-NEXT:    s_cmp_ngt_f32 s1, s0
 ; GFX1150-TRUE16-NEXT:    s_cbranch_scc0 .LBB0_2
 ; GFX1150-TRUE16-NEXT:  ; %bb.1: ; %frem.else
-; GFX1150-TRUE16-NEXT:    v_mov_b16_e32 v2.l, v0.l
-; GFX1150-TRUE16-NEXT:    v_mov_b16_e32 v3.l, 0
 ; GFX1150-TRUE16-NEXT:    s_cmp_eq_f32 s1, s0
+; GFX1150-TRUE16-NEXT:    v_and_b16 v0.h, 0x8000, v0.l
 ; GFX1150-TRUE16-NEXT:    s_cselect_b32 s2, -1, 0
-; GFX1150-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-TRUE16-NEXT:    v_bfi_b32 v2, 0x7fff, v3, v2
-; GFX1150-TRUE16-NEXT:    v_cndmask_b16 v2.l, v0.l, v2.l, s2
+; GFX1150-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1150-TRUE16-NEXT:    v_cndmask_b16 v2.l, v0.l, v0.h, s2
 ; GFX1150-TRUE16-NEXT:    s_cbranch_execz .LBB0_3
 ; GFX1150-TRUE16-NEXT:    s_branch .LBB0_8
 ; GFX1150-TRUE16-NEXT:  .LBB0_2:
@@ -895,7 +887,7 @@ define amdgpu_kernel void @frem_f16(ptr addrspace(1) %out, ptr addrspace(1) %in1
 ; GFX1150-FAKE16-NEXT:    s_cbranch_scc0 .LBB0_2
 ; GFX1150-FAKE16-NEXT:  ; %bb.1: ; %frem.else
 ; GFX1150-FAKE16-NEXT:    s_cmp_eq_f32 s1, s0
-; GFX1150-FAKE16-NEXT:    v_bfi_b32 v2, 0x7fff, 0, v0
+; GFX1150-FAKE16-NEXT:    v_and_b32_e32 v2, 0x8000, v0
 ; GFX1150-FAKE16-NEXT:    s_cselect_b32 vcc_lo, -1, 0
 ; GFX1150-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1150-FAKE16-NEXT:    v_cndmask_b32_e32 v2, v0, v2, vcc_lo
@@ -1015,13 +1007,11 @@ define amdgpu_kernel void @frem_f16(ptr addrspace(1) %out, ptr addrspace(1) %in1
 ; GFX1200-TRUE16-NEXT:    s_cmp_ngt_f32 s1, s0
 ; GFX1200-TRUE16-NEXT:    s_cbranch_scc0 .LBB0_2
 ; GFX1200-TRUE16-NEXT:  ; %bb.1: ; %frem.else
-; GFX1200-TRUE16-NEXT:    v_mov_b16_e32 v2.l, v0.l
-; GFX1200-TRUE16-NEXT:    v_mov_b16_e32 v3.l, 0
 ; GFX1200-TRUE16-NEXT:    s_cmp_eq_f32 s1, s0
+; GFX1200-TRUE16-NEXT:    v_and_b16 v0.h, 0x8000, v0.l
 ; GFX1200-TRUE16-NEXT:    s_cselect_b32 s2, -1, 0
-; GFX1200-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1200-TRUE16-NEXT:    v_bfi_b32 v2, 0x7fff, v3, v2
-; GFX1200-TRUE16-NEXT:    v_cndmask_b16 v2.l, v0.l, v2.l, s2
+; GFX1200-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1200-TRUE16-NEXT:    v_cndmask_b16 v2.l, v0.l, v0.h, s2
 ; GFX1200-TRUE16-NEXT:    s_cbranch_execz .LBB0_3
 ; GFX1200-TRUE16-NEXT:    s_branch .LBB0_8
 ; GFX1200-TRUE16-NEXT:  .LBB0_2:
@@ -1144,7 +1134,7 @@ define amdgpu_kernel void @frem_f16(ptr addrspace(1) %out, ptr addrspace(1) %in1
 ; GFX1200-FAKE16-NEXT:    s_cbranch_scc0 .LBB0_2
 ; GFX1200-FAKE16-NEXT:  ; %bb.1: ; %frem.else
 ; GFX1200-FAKE16-NEXT:    s_cmp_eq_f32 s1, s0
-; GFX1200-FAKE16-NEXT:    v_bfi_b32 v2, 0x7fff, 0, v0
+; GFX1200-FAKE16-NEXT:    v_and_b32_e32 v2, 0x8000, v0
 ; GFX1200-FAKE16-NEXT:    s_cselect_b32 vcc_lo, -1, 0
 ; GFX1200-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1200-FAKE16-NEXT:    v_cndmask_b32_e32 v2, v0, v2, vcc_lo
@@ -2076,8 +2066,7 @@ define amdgpu_kernel void @frem_f32(ptr addrspace(1) %out, ptr addrspace(1) %in1
 ; SI-NEXT:    s_and_b64 vcc, exec, s[2:3]
 ; SI-NEXT:    s_cbranch_vccz .LBB3_2
 ; SI-NEXT:  ; %bb.1: ; %frem.else
-; SI-NEXT:    s_brev_b32 s2, -2
-; SI-NEXT:    v_bfi_b32 v2, s2, 0, v0
+; SI-NEXT:    v_and_b32_e32 v2, 0x80000000, v0
 ; SI-NEXT:    v_cmp_eq_f32_e64 vcc, |v0|, |v1|
 ; SI-NEXT:    v_cndmask_b32_e32 v2, v0, v2, vcc
 ; SI-NEXT:    s_mov_b64 vcc, exec
@@ -2182,8 +2171,7 @@ define amdgpu_kernel void @frem_f32(ptr addrspace(1) %out, ptr addrspace(1) %in1
 ; CI-NEXT:    s_and_b64 vcc, exec, s[2:3]
 ; CI-NEXT:    s_cbranch_vccz .LBB3_2
 ; CI-NEXT:  ; %bb.1: ; %frem.else
-; CI-NEXT:    s_brev_b32 s2, -2
-; CI-NEXT:    v_bfi_b32 v2, s2, 0, v0
+; CI-NEXT:    v_and_b32_e32 v2, 0x80000000, v0
 ; CI-NEXT:    v_cmp_eq_f32_e64 vcc, |v0|, |v1|
 ; CI-NEXT:    v_cndmask_b32_e32 v2, v0, v2, vcc
 ; CI-NEXT:    s_cbranch_execz .LBB3_3
@@ -2275,8 +2263,7 @@ define amdgpu_kernel void @frem_f32(ptr addrspace(1) %out, ptr addrspace(1) %in1
 ; VI-NEXT:    s_and_b64 vcc, exec, s[2:3]
 ; VI-NEXT:    s_cbranch_vccz .LBB3_2
 ; VI-NEXT:  ; %bb.1: ; %frem.else
-; VI-NEXT:    s_brev_b32 s2, -2
-; VI-NEXT:    v_bfi_b32 v2, s2, 0, v0
+; VI-NEXT:    v_and_b32_e32 v2, 0x80000000, v0
 ; VI-NEXT:    v_cmp_eq_f32_e64 vcc, |v0|, |v1|
 ; VI-NEXT:    v_cndmask_b32_e32 v2, v0, v2, vcc
 ; VI-NEXT:    s_cbranch_execz .LBB3_3
@@ -2363,8 +2350,7 @@ define amdgpu_kernel void @frem_f32(ptr addrspace(1) %out, ptr addrspace(1) %in1
 ; GFX9-NEXT:    s_and_b64 vcc, exec, s[2:3]
 ; GFX9-NEXT:    s_cbranch_vccz .LBB3_2
 ; GFX9-NEXT:  ; %bb.1: ; %frem.else
-; GFX9-NEXT:    s_brev_b32 s2, -2
-; GFX9-NEXT:    v_bfi_b32 v2, s2, 0, v0
+; GFX9-NEXT:    v_and_b32_e32 v2, 0x80000000, v0
 ; GFX9-NEXT:    v_cmp_eq_f32_e64 vcc, |v0|, |v1|
 ; GFX9-NEXT:    v_cndmask_b32_e32 v2, v0, v2, vcc
 ; GFX9-NEXT:    s_cbranch_execz .LBB3_3
@@ -2452,7 +2438,7 @@ define amdgpu_kernel void @frem_f32(ptr addrspace(1) %out, ptr addrspace(1) %in1
 ; GFX10-NEXT:    s_and_b32 vcc_lo, exec_lo, s2
 ; GFX10-NEXT:    s_cbranch_vccz .LBB3_2
 ; GFX10-NEXT:  ; %bb.1: ; %frem.else
-; GFX10-NEXT:    v_bfi_b32 v2, 0x7fffffff, 0, v0
+; GFX10-NEXT:    v_and_b32_e32 v2, 0x80000000, v0
 ; GFX10-NEXT:    v_cmp_eq_f32_e64 vcc_lo, |v0|, |v1|
 ; GFX10-NEXT:    v_cndmask_b32_e32 v2, v0, v2, vcc_lo
 ; GFX10-NEXT:    s_cbranch_execz .LBB3_3
@@ -2540,7 +2526,7 @@ define amdgpu_kernel void @frem_f32(ptr addrspace(1) %out, ptr addrspace(1) %in1
 ; GFX11-NEXT:    s_and_b32 vcc_lo, exec_lo, s2
 ; GFX11-NEXT:    s_cbranch_vccz .LBB3_2
 ; GFX11-NEXT:  ; %bb.1: ; %frem.else
-; GFX11-NEXT:    v_bfi_b32 v2, 0x7fffffff, 0, v0
+; GFX11-NEXT:    v_and_b32_e32 v2, 0x80000000, v0
 ; GFX11-NEXT:    v_cmp_eq_f32_e64 vcc_lo, |v0|, |v1|
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX11-NEXT:    v_cndmask_b32_e32 v2, v0, v2, vcc_lo
@@ -2650,7 +2636,7 @@ define amdgpu_kernel void @frem_f32(ptr addrspace(1) %out, ptr addrspace(1) %in1
 ; GFX1150-NEXT:    v_cmp_ngt_f32_e32 vcc_lo, v1, v3
 ; GFX1150-NEXT:    s_cbranch_vccz .LBB3_2
 ; GFX1150-NEXT:  ; %bb.1: ; %frem.else
-; GFX1150-NEXT:    v_bfi_b32 v4, 0x7fffffff, 0, v2
+; GFX1150-NEXT:    v_and_b32_e32 v4, 0x80000000, v2
 ; GFX1150-NEXT:    v_cmp_eq_f32_e32 vcc_lo, v1, v3
 ; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX1150-NEXT:    v_cndmask_b32_e32 v3, v2, v4, vcc_lo
@@ -2763,7 +2749,7 @@ define amdgpu_kernel void @frem_f32(ptr addrspace(1) %out, ptr addrspace(1) %in1
 ; GFX1200-NEXT:    v_cmp_ngt_f32_e32 vcc_lo, v1, v3
 ; GFX1200-NEXT:    s_cbranch_vccz .LBB3_2
 ; GFX1200-NEXT:  ; %bb.1: ; %frem.else
-; GFX1200-NEXT:    v_bfi_b32 v4, 0x7fffffff, 0, v2
+; GFX1200-NEXT:    v_and_b32_e32 v4, 0x80000000, v2
 ; GFX1200-NEXT:    v_cmp_eq_f32_e32 vcc_lo, v1, v3
 ; GFX1200-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX1200-NEXT:    v_cndmask_b32_e32 v3, v2, v4, vcc_lo
@@ -4851,15 +4837,14 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v3
 ; SI-NEXT:    v_cvt_f32_f16_e32 v1, v1
 ; SI-NEXT:    v_cvt_f32_f16_e32 v3, v3
-; SI-NEXT:    s_brev_b32 s0, -2
 ; SI-NEXT:    v_and_b32_e32 v5, 0x7fffffff, v2
 ; SI-NEXT:    v_and_b32_e32 v6, 0x7fffffff, v3
-; SI-NEXT:    v_cmp_ngt_f32_e64 s[2:3], |v2|, |v3|
-; SI-NEXT:    s_and_b64 vcc, exec, s[2:3]
+; SI-NEXT:    v_cmp_ngt_f32_e64 s[0:1], |v2|, |v3|
+; SI-NEXT:    s_and_b64 vcc, exec, s[0:1]
 ; SI-NEXT:    v_cvt_f16_f32_e32 v4, v2
 ; SI-NEXT:    s_cbranch_vccz .LBB9_2
 ; SI-NEXT:  ; %bb.1: ; %frem.else20
-; SI-NEXT:    v_bfi_b32 v7, s0, 0, v2
+; SI-NEXT:    v_and_b32_e32 v7, 0x80000000, v2
 ; SI-NEXT:    v_cvt_f32_f16_e32 v4, v4
 ; SI-NEXT:    v_cmp_eq_f32_e32 vcc, v5, v6
 ; SI-NEXT:    v_cndmask_b32_e32 v4, v4, v7, vcc
@@ -4945,8 +4930,7 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; SI-NEXT:    v_cmp_ngt_f32_e32 vcc, v6, v7
 ; SI-NEXT:    s_cbranch_vccz .LBB9_10
 ; SI-NEXT:  ; %bb.9: ; %frem.else
-; SI-NEXT:    s_brev_b32 s0, -2
-; SI-NEXT:    v_bfi_b32 v8, s0, 0, v0
+; SI-NEXT:    v_and_b32_e32 v8, 0x80000000, v0
 ; SI-NEXT:    v_cvt_f32_f16_e32 v5, v5
 ; SI-NEXT:    v_cmp_eq_f32_e32 vcc, v6, v7
 ; SI-NEXT:    v_cndmask_b32_e32 v5, v5, v8, vcc
@@ -5069,7 +5053,6 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; CI-NEXT:    buffer_load_dword v1, off, s[4:7], 0
 ; CI-NEXT:    s_mov_b32 s3, s7
 ; CI-NEXT:    buffer_load_dword v3, off, s[0:3], 0 offset:16
-; CI-NEXT:    s_brev_b32 s0, -2
 ; CI-NEXT:    s_waitcnt vmcnt(1)
 ; CI-NEXT:    v_cvt_f32_f16_e32 v2, v1
 ; CI-NEXT:    v_lshrrev_b32_e32 v0, 16, v1
@@ -5079,14 +5062,14 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; CI-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; CI-NEXT:    v_cvt_f32_f16_e32 v1, v1
 ; CI-NEXT:    v_cvt_f16_f32_e32 v4, v2
-; CI-NEXT:    v_cmp_ngt_f32_e64 s[2:3], |v2|, |v3|
+; CI-NEXT:    v_cmp_ngt_f32_e64 s[0:1], |v2|, |v3|
 ; CI-NEXT:    v_and_b32_e32 v6, 0x7fffffff, v2
 ; CI-NEXT:    v_and_b32_e32 v5, 0x7fffffff, v3
-; CI-NEXT:    s_and_b64 vcc, exec, s[2:3]
+; CI-NEXT:    s_and_b64 vcc, exec, s[0:1]
 ; CI-NEXT:    s_cbranch_vccz .LBB9_2
 ; CI-NEXT:  ; %bb.1: ; %frem.else20
 ; CI-NEXT:    v_cvt_f32_f16_e32 v4, v4
-; CI-NEXT:    v_bfi_b32 v7, s0, 0, v2
+; CI-NEXT:    v_and_b32_e32 v7, 0x80000000, v2
 ; CI-NEXT:    v_cmp_eq_f32_e32 vcc, v6, v5
 ; CI-NEXT:    v_cndmask_b32_e32 v4, v4, v7, vcc
 ; CI-NEXT:    s_cbranch_execz .LBB9_3
@@ -5159,8 +5142,7 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; CI-NEXT:    s_cbranch_vccz .LBB9_10
 ; CI-NEXT:  ; %bb.9: ; %frem.else
 ; CI-NEXT:    v_cvt_f32_f16_e32 v5, v5
-; CI-NEXT:    s_brev_b32 s0, -2
-; CI-NEXT:    v_bfi_b32 v8, s0, 0, v0
+; CI-NEXT:    v_and_b32_e32 v8, 0x80000000, v0
 ; CI-NEXT:    v_cmp_eq_f32_e32 vcc, v7, v6
 ; CI-NEXT:    v_cndmask_b32_e32 v5, v5, v8, vcc
 ; CI-NEXT:    s_cbranch_execz .LBB9_11
@@ -5276,8 +5258,7 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; VI-NEXT:    v_cmp_ngt_f32_e32 vcc, v4, v3
 ; VI-NEXT:    s_cbranch_vccz .LBB9_2
 ; VI-NEXT:  ; %bb.1: ; %frem.else20
-; VI-NEXT:    s_movk_i32 s2, 0x7fff
-; VI-NEXT:    v_bfi_b32 v2, s2, 0, v0
+; VI-NEXT:    v_and_b32_e32 v2, 0x8000, v0
 ; VI-NEXT:    v_cmp_eq_f32_e32 vcc, v4, v3
 ; VI-NEXT:    v_cndmask_b32_e32 v2, v0, v2, vcc
 ; VI-NEXT:    s_cbranch_execz .LBB9_3
@@ -5348,8 +5329,7 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; VI-NEXT:    v_cmp_ngt_f32_e32 vcc, v7, v6
 ; VI-NEXT:    s_cbranch_vccz .LBB9_10
 ; VI-NEXT:  ; %bb.9: ; %frem.else
-; VI-NEXT:    s_movk_i32 s2, 0x7fff
-; VI-NEXT:    v_bfi_b32 v5, s2, 0, v3
+; VI-NEXT:    v_and_b32_e32 v5, 0x8000, v3
 ; VI-NEXT:    v_cmp_eq_f32_e32 vcc, v7, v6
 ; VI-NEXT:    v_cndmask_b32_e32 v5, v3, v5, vcc
 ; VI-NEXT:    s_cbranch_execz .LBB9_11
@@ -5444,8 +5424,7 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; GFX9-NEXT:    v_cmp_ngt_f32_e32 vcc, v4, v3
 ; GFX9-NEXT:    s_cbranch_vccz .LBB9_2
 ; GFX9-NEXT:  ; %bb.1: ; %frem.else20
-; GFX9-NEXT:    s_movk_i32 s2, 0x7fff
-; GFX9-NEXT:    v_bfi_b32 v2, s2, 0, v1
+; GFX9-NEXT:    v_and_b32_e32 v2, 0x8000, v1
 ; GFX9-NEXT:    v_cmp_eq_f32_e32 vcc, v4, v3
 ; GFX9-NEXT:    v_cndmask_b32_e32 v2, v1, v2, vcc
 ; GFX9-NEXT:    s_cbranch_execz .LBB9_3
@@ -5515,8 +5494,7 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; GFX9-NEXT:    v_cmp_ngt_f32_e32 vcc, v6, v5
 ; GFX9-NEXT:    s_cbranch_vccz .LBB9_10
 ; GFX9-NEXT:  ; %bb.9: ; %frem.else
-; GFX9-NEXT:    s_movk_i32 s2, 0x7fff
-; GFX9-NEXT:    v_bfi_b32 v4, s2, 0, v3
+; GFX9-NEXT:    v_and_b32_e32 v4, 0x8000, v3
 ; GFX9-NEXT:    v_cmp_eq_f32_e32 vcc, v6, v5
 ; GFX9-NEXT:    v_cndmask_b32_e32 v4, v3, v4, vcc
 ; GFX9-NEXT:    s_cbranch_execz .LBB9_11
@@ -5613,7 +5591,7 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; GFX10-NEXT:    v_cmp_ngt_f32_e32 vcc_lo, v4, v3
 ; GFX10-NEXT:    s_cbranch_vccz .LBB9_2
 ; GFX10-NEXT:  ; %bb.1: ; %frem.else20
-; GFX10-NEXT:    v_bfi_b32 v2, 0x7fff, 0, v1
+; GFX10-NEXT:    v_and_b32_e32 v2, 0x8000, v1
 ; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, v4, v3
 ; GFX10-NEXT:    v_cndmask_b32_e32 v2, v1, v2, vcc_lo
 ; GFX10-NEXT:    s_cbranch_execz .LBB9_3
@@ -5680,31 +5658,31 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; GFX10-NEXT:    v_bfi_b32 v2, 0x7fff, v2, v1
 ; GFX10-NEXT:  .LBB9_8:
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
-; GFX10-NEXT:    v_cvt_f32_f16_sdwa v4, |v0| dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GFX10-NEXT:    v_cvt_f32_f16_sdwa v5, |v0| dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; GFX10-NEXT:    v_cvt_f32_f16_e64 v6, |v3|
-; GFX10-NEXT:    v_cmp_ngt_f32_e32 vcc_lo, v6, v4
+; GFX10-NEXT:    v_cmp_ngt_f32_e32 vcc_lo, v6, v5
 ; GFX10-NEXT:    s_cbranch_vccz .LBB9_10
 ; GFX10-NEXT:  ; %bb.9: ; %frem.else
-; GFX10-NEXT:    v_bfi_b32 v5, 0x7fff, 0, v3
-; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, v6, v4
-; GFX10-NEXT:    v_cndmask_b32_e32 v5, v3, v5, vcc_lo
+; GFX10-NEXT:    v_and_b32_e32 v4, 0x8000, v3
+; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, v6, v5
+; GFX10-NEXT:    v_cndmask_b32_e32 v4, v3, v4, vcc_lo
 ; GFX10-NEXT:    s_cbranch_execz .LBB9_11
 ; GFX10-NEXT:    s_branch .LBB9_16
 ; GFX10-NEXT:  .LBB9_10:
-; GFX10-NEXT:    ; implicit-def: $vgpr5
+; GFX10-NEXT:    ; implicit-def: $vgpr4
 ; GFX10-NEXT:  .LBB9_11: ; %frem.compute
-; GFX10-NEXT:    v_frexp_mant_f32_e32 v5, v6
+; GFX10-NEXT:    v_frexp_mant_f32_e32 v4, v6
+; GFX10-NEXT:    v_frexp_mant_f32_e32 v8, v5
 ; GFX10-NEXT:    v_frexp_exp_i32_f32_e32 v7, v6
-; GFX10-NEXT:    v_ldexp_f32 v6, v5, 11
-; GFX10-NEXT:    v_frexp_mant_f32_e32 v5, v4
-; GFX10-NEXT:    v_frexp_exp_i32_f32_e32 v4, v4
+; GFX10-NEXT:    v_ldexp_f32 v6, v4, 11
+; GFX10-NEXT:    v_frexp_exp_i32_f32_e32 v4, v5
+; GFX10-NEXT:    v_ldexp_f32 v5, v8, 1
 ; GFX10-NEXT:    v_readfirstlane_b32 s2, v7
-; GFX10-NEXT:    v_ldexp_f32 v5, v5, 1
 ; GFX10-NEXT:    v_readfirstlane_b32 s3, v4
-; GFX10-NEXT:    v_add_nc_u32_e32 v4, -1, v4
 ; GFX10-NEXT:    v_div_scale_f32 v9, s4, v5, v5, 1.0
-; GFX10-NEXT:    v_not_b32_e32 v8, v4
+; GFX10-NEXT:    v_add_nc_u32_e32 v4, -1, v4
 ; GFX10-NEXT:    v_rcp_f32_e32 v10, v9
+; GFX10-NEXT:    v_not_b32_e32 v8, v4
 ; GFX10-NEXT:    v_add_nc_u32_e32 v8, v8, v7
 ; GFX10-NEXT:    v_div_scale_f32 v7, vcc_lo, 1.0, v5, 1.0
 ; GFX10-NEXT:    s_denorm_mode 15
@@ -5749,7 +5727,7 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; GFX10-NEXT:    v_cndmask_b32_e32 v5, v6, v5, vcc_lo
 ; GFX10-NEXT:    v_ldexp_f32 v4, v5, v4
 ; GFX10-NEXT:    v_cvt_f16_f32_e32 v4, v4
-; GFX10-NEXT:    v_bfi_b32 v5, 0x7fff, v4, v3
+; GFX10-NEXT:    v_bfi_b32 v4, 0x7fff, v4, v3
 ; GFX10-NEXT:  .LBB9_16: ; %Flow54
 ; GFX10-NEXT:    v_cmp_lg_f16_e32 vcc_lo, 0, v0
 ; GFX10-NEXT:    v_cmp_nle_f16_e64 s2, 0x7c00, |v1|
@@ -5760,7 +5738,7 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; GFX10-NEXT:    v_cmp_lg_f16_sdwa s2, v0, v2 src0_sel:WORD_1 src1_sel:DWORD
 ; GFX10-NEXT:    s_and_b32 vcc_lo, s3, s2
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, 0x7e00, v5, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, 0x7e00, v4, vcc_lo
 ; GFX10-NEXT:    v_lshl_or_b32 v0, v0, 16, v1
 ; GFX10-NEXT:    global_store_dword v2, v0, s[0:1]
 ; GFX10-NEXT:    s_endpgm
@@ -5783,11 +5761,9 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; GFX11-TRUE16-NEXT:    v_cmp_ngt_f32_e32 vcc_lo, v4, v3
 ; GFX11-TRUE16-NEXT:    s_cbranch_vccz .LBB9_2
 ; GFX11-TRUE16-NEXT:  ; %bb.1: ; %frem.else20
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v2.l, v0.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v5.l, 0
+; GFX11-TRUE16-NEXT:    v_and_b16 v2.l, 0x8000, v0.l
 ; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, v4, v3
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_bfi_b32 v2, 0x7fff, v5, v2
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX11-TRUE16-NEXT:    v_cndmask_b16 v2.l, v0.l, v2.l, vcc_lo
 ; GFX11-TRUE16-NEXT:    s_cbranch_execz .LBB9_3
 ; GFX11-TRUE16-NEXT:    s_branch .LBB9_8
@@ -5875,39 +5851,37 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
 ; GFX11-TRUE16-NEXT:    v_lshrrev_b32_e32 v4, 16, v1
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_cvt_f32_f16_e64 v6, |v3.l|
-; GFX11-TRUE16-NEXT:    v_cvt_f32_f16_e64 v5, |v4.l|
+; GFX11-TRUE16-NEXT:    v_cvt_f32_f16_e64 v7, |v3.l|
+; GFX11-TRUE16-NEXT:    v_cvt_f32_f16_e64 v6, |v4.l|
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_cmp_ngt_f32_e32 vcc_lo, v6, v5
+; GFX11-TRUE16-NEXT:    v_cmp_ngt_f32_e32 vcc_lo, v7, v6
 ; GFX11-TRUE16-NEXT:    s_cbranch_vccz .LBB9_10
 ; GFX11-TRUE16-NEXT:  ; %bb.9: ; %frem.else
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v7.l, v3.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v8.l, 0
-; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, v6, v5
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_bfi_b32 v7, 0x7fff, v8, v7
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v7.l, v3.l, v7.l, vcc_lo
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.h, 0x8000, v3.l
+; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, v7, v6
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v5.l, v3.l, v0.h, vcc_lo
 ; GFX11-TRUE16-NEXT:    s_cbranch_execz .LBB9_11
 ; GFX11-TRUE16-NEXT:    s_branch .LBB9_16
 ; GFX11-TRUE16-NEXT:  .LBB9_10:
-; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr7
+; GFX11-TRUE16-NEXT:    ; implicit-def: $vgpr5
 ; GFX11-TRUE16-NEXT:  .LBB9_11: ; %frem.compute
-; GFX11-TRUE16-NEXT:    v_frexp_exp_i32_f32_e32 v8, v6
-; GFX11-TRUE16-NEXT:    v_frexp_mant_f32_e32 v6, v6
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_frexp_mant_f32_e32 v5, v7
+; GFX11-TRUE16-NEXT:    v_frexp_mant_f32_e32 v9, v6
+; GFX11-TRUE16-NEXT:    v_frexp_exp_i32_f32_e32 v8, v7
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-TRUE16-NEXT:    v_ldexp_f32 v7, v5, 11
+; GFX11-TRUE16-NEXT:    v_frexp_exp_i32_f32_e32 v5, v6
+; GFX11-TRUE16-NEXT:    v_ldexp_f32 v6, v9, 1
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
 ; GFX11-TRUE16-NEXT:    v_readfirstlane_b32 s2, v8
-; GFX11-TRUE16-NEXT:    v_ldexp_f32 v7, v6, 11
-; GFX11-TRUE16-NEXT:    v_frexp_mant_f32_e32 v6, v5
-; GFX11-TRUE16-NEXT:    v_frexp_exp_i32_f32_e32 v5, v5
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-TRUE16-NEXT:    v_ldexp_f32 v6, v6, 1
 ; GFX11-TRUE16-NEXT:    v_readfirstlane_b32 s3, v5
-; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v5, -1, v5
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
 ; GFX11-TRUE16-NEXT:    v_div_scale_f32 v10, null, v6, v6, 1.0
-; GFX11-TRUE16-NEXT:    v_not_b32_e32 v9, v5
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v5, -1, v5
 ; GFX11-TRUE16-NEXT:    v_rcp_f32_e32 v11, v10
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-TRUE16-NEXT:    v_not_b32_e32 v9, v5
 ; GFX11-TRUE16-NEXT:    v_add_nc_u32_e32 v9, v9, v8
 ; GFX11-TRUE16-NEXT:    v_div_scale_f32 v8, vcc_lo, 1.0, v6, 1.0
 ; GFX11-TRUE16-NEXT:    s_denorm_mode 15
@@ -5968,7 +5942,7 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v6.l, v3.l
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-TRUE16-NEXT:    v_cvt_f16_f32_e32 v5.l, v5
-; GFX11-TRUE16-NEXT:    v_bfi_b32 v7, 0x7fff, v5, v6
+; GFX11-TRUE16-NEXT:    v_bfi_b32 v5, 0x7fff, v5, v6
 ; GFX11-TRUE16-NEXT:  .LBB9_16: ; %Flow54
 ; GFX11-TRUE16-NEXT:    v_cmp_lg_f16_e32 vcc_lo, 0, v1.l
 ; GFX11-TRUE16-NEXT:    v_cmp_nle_f16_e64 s2, 0x7c00, |v0.l|
@@ -5979,7 +5953,7 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; GFX11-TRUE16-NEXT:    v_cmp_nle_f16_e64 s2, 0x7c00, |v3.l|
 ; GFX11-TRUE16-NEXT:    s_and_b32 s2, s2, vcc_lo
 ; GFX11-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.h, 0x7e00, v7.l, s2
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v0.h, 0x7e00, v5.l, s2
 ; GFX11-TRUE16-NEXT:    global_store_b32 v1, v0, s[0:1]
 ; GFX11-TRUE16-NEXT:    s_endpgm
 ;
@@ -6001,7 +5975,7 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; GFX11-FAKE16-NEXT:    v_cmp_ngt_f32_e32 vcc_lo, v4, v3
 ; GFX11-FAKE16-NEXT:    s_cbranch_vccz .LBB9_2
 ; GFX11-FAKE16-NEXT:  ; %bb.1: ; %frem.else20
-; GFX11-FAKE16-NEXT:    v_bfi_b32 v2, 0x7fff, 0, v0
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v2, 0x8000, v0
 ; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, v4, v3
 ; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v2, v0, v2, vcc_lo
@@ -6091,36 +6065,36 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; GFX11-FAKE16-NEXT:    v_lshrrev_b32_e32 v4, 16, v1
 ; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX11-FAKE16-NEXT:    v_cvt_f32_f16_e64 v7, |v3|
-; GFX11-FAKE16-NEXT:    v_cvt_f32_f16_e64 v5, |v4|
+; GFX11-FAKE16-NEXT:    v_cvt_f32_f16_e64 v6, |v4|
 ; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-FAKE16-NEXT:    v_cmp_ngt_f32_e32 vcc_lo, v7, v5
+; GFX11-FAKE16-NEXT:    v_cmp_ngt_f32_e32 vcc_lo, v7, v6
 ; GFX11-FAKE16-NEXT:    s_cbranch_vccz .LBB9_10
 ; GFX11-FAKE16-NEXT:  ; %bb.9: ; %frem.else
-; GFX11-FAKE16-NEXT:    v_bfi_b32 v6, 0x7fff, 0, v3
-; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, v7, v5
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v5, 0x8000, v3
+; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, v7, v6
 ; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v6, v3, v6, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v5, v3, v5, vcc_lo
 ; GFX11-FAKE16-NEXT:    s_cbranch_execz .LBB9_11
 ; GFX11-FAKE16-NEXT:    s_branch .LBB9_16
 ; GFX11-FAKE16-NEXT:  .LBB9_10:
-; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr6
+; GFX11-FAKE16-NEXT:    ; implicit-def: $vgpr5
 ; GFX11-FAKE16-NEXT:  .LBB9_11: ; %frem.compute
-; GFX11-FAKE16-NEXT:    v_frexp_mant_f32_e32 v6, v7
+; GFX11-FAKE16-NEXT:    v_frexp_mant_f32_e32 v5, v7
+; GFX11-FAKE16-NEXT:    v_frexp_mant_f32_e32 v9, v6
 ; GFX11-FAKE16-NEXT:    v_frexp_exp_i32_f32_e32 v8, v7
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_4)
-; GFX11-FAKE16-NEXT:    v_ldexp_f32 v7, v6, 11
-; GFX11-FAKE16-NEXT:    v_frexp_mant_f32_e32 v6, v5
-; GFX11-FAKE16-NEXT:    v_frexp_exp_i32_f32_e32 v5, v5
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-FAKE16-NEXT:    v_ldexp_f32 v7, v5, 11
+; GFX11-FAKE16-NEXT:    v_frexp_exp_i32_f32_e32 v5, v6
+; GFX11-FAKE16-NEXT:    v_ldexp_f32 v6, v9, 1
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
 ; GFX11-FAKE16-NEXT:    v_readfirstlane_b32 s2, v8
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-FAKE16-NEXT:    v_ldexp_f32 v6, v6, 1
 ; GFX11-FAKE16-NEXT:    v_readfirstlane_b32 s3, v5
-; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v5, -1, v5
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
 ; GFX11-FAKE16-NEXT:    v_div_scale_f32 v10, null, v6, v6, 1.0
-; GFX11-FAKE16-NEXT:    v_not_b32_e32 v9, v5
-; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v5, -1, v5
 ; GFX11-FAKE16-NEXT:    v_rcp_f32_e32 v11, v10
+; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT:    v_not_b32_e32 v9, v5
 ; GFX11-FAKE16-NEXT:    v_add_nc_u32_e32 v9, v9, v8
 ; GFX11-FAKE16-NEXT:    v_div_scale_f32 v8, vcc_lo, 1.0, v6, 1.0
 ; GFX11-FAKE16-NEXT:    s_denorm_mode 15
@@ -6180,7 +6154,7 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; GFX11-FAKE16-NEXT:    v_ldexp_f32 v5, v6, v5
 ; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-FAKE16-NEXT:    v_cvt_f16_f32_e32 v5, v5
-; GFX11-FAKE16-NEXT:    v_bfi_b32 v6, 0x7fff, v5, v3
+; GFX11-FAKE16-NEXT:    v_bfi_b32 v5, 0x7fff, v5, v3
 ; GFX11-FAKE16-NEXT:  .LBB9_16: ; %Flow54
 ; GFX11-FAKE16-NEXT:    v_cmp_lg_f16_e32 vcc_lo, 0, v1
 ; GFX11-FAKE16-NEXT:    v_cmp_nle_f16_e64 s2, 0x7c00, |v0|
@@ -6192,7 +6166,7 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
 ; GFX11-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX11-FAKE16-NEXT:    s_and_b32 vcc_lo, s2, vcc_lo
-; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v2, 0x7e00, v6, vcc_lo
+; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v2, 0x7e00, v5, vcc_lo
 ; GFX11-FAKE16-NEXT:    v_lshl_or_b32 v0, v2, 16, v0
 ; GFX11-FAKE16-NEXT:    global_store_b32 v1, v0, s[0:1]
 ; GFX11-FAKE16-NEXT:    s_endpgm
@@ -6219,12 +6193,10 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; GFX1150-TRUE16-NEXT:    s_cmp_ngt_f32 s6, s5
 ; GFX1150-TRUE16-NEXT:    s_cbranch_scc0 .LBB9_2
 ; GFX1150-TRUE16-NEXT:  ; %bb.1: ; %frem.else20
-; GFX1150-TRUE16-NEXT:    v_mov_b16_e32 v0.l, s4
-; GFX1150-TRUE16-NEXT:    v_mov_b16_e32 v1.l, 0
 ; GFX1150-TRUE16-NEXT:    s_cmp_eq_f32 s6, s5
+; GFX1150-TRUE16-NEXT:    v_and_b16 v0.l, 0x8000, s4
 ; GFX1150-TRUE16-NEXT:    s_cselect_b32 s7, -1, 0
-; GFX1150-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-TRUE16-NEXT:    v_bfi_b32 v0, 0x7fff, v1, v0
+; GFX1150-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
 ; GFX1150-TRUE16-NEXT:    v_cndmask_b16 v0.l, s4, v0.l, s7
 ; GFX1150-TRUE16-NEXT:    s_cbranch_execz .LBB9_3
 ; GFX1150-TRUE16-NEXT:    s_branch .LBB9_8
@@ -6322,13 +6294,11 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; GFX1150-TRUE16-NEXT:    s_cmp_ngt_f32 s8, s7
 ; GFX1150-TRUE16-NEXT:    s_cbranch_scc0 .LBB9_10
 ; GFX1150-TRUE16-NEXT:  ; %bb.9: ; %frem.else
-; GFX1150-TRUE16-NEXT:    v_mov_b16_e32 v1.l, s6
-; GFX1150-TRUE16-NEXT:    v_mov_b16_e32 v2.l, 0
 ; GFX1150-TRUE16-NEXT:    s_cmp_eq_f32 s8, s7
+; GFX1150-TRUE16-NEXT:    v_and_b16 v0.h, 0x8000, s6
 ; GFX1150-TRUE16-NEXT:    s_cselect_b32 s9, -1, 0
-; GFX1150-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-TRUE16-NEXT:    v_bfi_b32 v1, 0x7fff, v2, v1
-; GFX1150-TRUE16-NEXT:    v_cndmask_b16 v1.l, s6, v1.l, s9
+; GFX1150-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1150-TRUE16-NEXT:    v_cndmask_b16 v1.l, s6, v0.h, s9
 ; GFX1150-TRUE16-NEXT:    s_cbranch_execz .LBB9_11
 ; GFX1150-TRUE16-NEXT:    s_branch .LBB9_16
 ; GFX1150-TRUE16-NEXT:  .LBB9_10:
@@ -6455,8 +6425,9 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; GFX1150-FAKE16-NEXT:    s_cmp_ngt_f32 s6, s5
 ; GFX1150-FAKE16-NEXT:    s_cbranch_scc0 .LBB9_2
 ; GFX1150-FAKE16-NEXT:  ; %bb.1: ; %frem.else20
+; GFX1150-FAKE16-NEXT:    s_and_b32 s7, 0x8000, s4
 ; GFX1150-FAKE16-NEXT:    s_cmp_eq_f32 s6, s5
-; GFX1150-FAKE16-NEXT:    v_bfi_b32 v0, 0x7fff, 0, s4
+; GFX1150-FAKE16-NEXT:    v_mov_b32_e32 v0, s7
 ; GFX1150-FAKE16-NEXT:    s_cselect_b32 vcc_lo, -1, 0
 ; GFX1150-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1150-FAKE16-NEXT:    v_cndmask_b32_e32 v0, s4, v0, vcc_lo
@@ -6555,8 +6526,9 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; GFX1150-FAKE16-NEXT:    s_cmp_ngt_f32 s8, s7
 ; GFX1150-FAKE16-NEXT:    s_cbranch_scc0 .LBB9_10
 ; GFX1150-FAKE16-NEXT:  ; %bb.9: ; %frem.else
+; GFX1150-FAKE16-NEXT:    s_and_b32 s9, 0x8000, s6
 ; GFX1150-FAKE16-NEXT:    s_cmp_eq_f32 s8, s7
-; GFX1150-FAKE16-NEXT:    v_bfi_b32 v1, 0x7fff, 0, s6
+; GFX1150-FAKE16-NEXT:    v_mov_b32_e32 v1, s9
 ; GFX1150-FAKE16-NEXT:    s_cselect_b32 vcc_lo, -1, 0
 ; GFX1150-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1150-FAKE16-NEXT:    v_cndmask_b32_e32 v1, s6, v1, vcc_lo
@@ -6686,12 +6658,10 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; GFX1200-TRUE16-NEXT:    s_cmp_ngt_f32 s6, s5
 ; GFX1200-TRUE16-NEXT:    s_cbranch_scc0 .LBB9_2
 ; GFX1200-TRUE16-NEXT:  ; %bb.1: ; %frem.else20
-; GFX1200-TRUE16-NEXT:    v_mov_b16_e32 v0.l, s4
-; GFX1200-TRUE16-NEXT:    v_mov_b16_e32 v1.l, 0
 ; GFX1200-TRUE16-NEXT:    s_cmp_eq_f32 s6, s5
+; GFX1200-TRUE16-NEXT:    v_and_b16 v0.l, 0x8000, s4
 ; GFX1200-TRUE16-NEXT:    s_cselect_b32 s7, -1, 0
-; GFX1200-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1200-TRUE16-NEXT:    v_bfi_b32 v0, 0x7fff, v1, v0
+; GFX1200-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
 ; GFX1200-TRUE16-NEXT:    v_cndmask_b16 v0.l, s4, v0.l, s7
 ; GFX1200-TRUE16-NEXT:    s_cbranch_execz .LBB9_3
 ; GFX1200-TRUE16-NEXT:    s_branch .LBB9_8
@@ -6795,13 +6765,11 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; GFX1200-TRUE16-NEXT:    s_cmp_ngt_f32 s8, s7
 ; GFX1200-TRUE16-NEXT:    s_cbranch_scc0 .LBB9_10
 ; GFX1200-TRUE16-NEXT:  ; %bb.9: ; %frem.else
-; GFX1200-TRUE16-NEXT:    v_mov_b16_e32 v1.l, s6
-; GFX1200-TRUE16-NEXT:    v_mov_b16_e32 v2.l, 0
 ; GFX1200-TRUE16-NEXT:    s_cmp_eq_f32 s8, s7
+; GFX1200-TRUE16-NEXT:    v_and_b16 v0.h, 0x8000, s6
 ; GFX1200-TRUE16-NEXT:    s_cselect_b32 s9, -1, 0
-; GFX1200-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1200-TRUE16-NEXT:    v_bfi_b32 v1, 0x7fff, v2, v1
-; GFX1200-TRUE16-NEXT:    v_cndmask_b16 v1.l, s6, v1.l, s9
+; GFX1200-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1200-TRUE16-NEXT:    v_cndmask_b16 v1.l, s6, v0.h, s9
 ; GFX1200-TRUE16-NEXT:    s_cbranch_execz .LBB9_11
 ; GFX1200-TRUE16-NEXT:    s_branch .LBB9_16
 ; GFX1200-TRUE16-NEXT:  .LBB9_10:
@@ -6933,8 +6901,9 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; GFX1200-FAKE16-NEXT:    s_cmp_ngt_f32 s6, s5
 ; GFX1200-FAKE16-NEXT:    s_cbranch_scc0 .LBB9_2
 ; GFX1200-FAKE16-NEXT:  ; %bb.1: ; %frem.else20
+; GFX1200-FAKE16-NEXT:    s_and_b32 s7, 0x8000, s4
 ; GFX1200-FAKE16-NEXT:    s_cmp_eq_f32 s6, s5
-; GFX1200-FAKE16-NEXT:    v_bfi_b32 v0, 0x7fff, 0, s4
+; GFX1200-FAKE16-NEXT:    v_mov_b32_e32 v0, s7
 ; GFX1200-FAKE16-NEXT:    s_cselect_b32 vcc_lo, -1, 0
 ; GFX1200-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1200-FAKE16-NEXT:    v_cndmask_b32_e32 v0, s4, v0, vcc_lo
@@ -7040,8 +7009,9 @@ define amdgpu_kernel void @frem_v2f16(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; GFX1200-FAKE16-NEXT:    s_cmp_ngt_f32 s8, s7
 ; GFX1200-FAKE16-NEXT:    s_cbranch_scc0 .LBB9_10
 ; GFX1200-FAKE16-NEXT:  ; %bb.9: ; %frem.else
+; GFX1200-FAKE16-NEXT:    s_and_b32 s9, 0x8000, s6
 ; GFX1200-FAKE16-NEXT:    s_cmp_eq_f32 s8, s7
-; GFX1200-FAKE16-NEXT:    v_bfi_b32 v1, 0x7fff, 0, s6
+; GFX1200-FAKE16-NEXT:    v_mov_b32_e32 v1, s9
 ; GFX1200-FAKE16-NEXT:    s_cselect_b32 vcc_lo, -1, 0
 ; GFX1200-FAKE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
 ; GFX1200-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
@@ -7193,15 +7163,14 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; SI-NEXT:    v_lshrrev_b32_e32 v5, 16, v7
 ; SI-NEXT:    v_cvt_f32_f16_e32 v5, v5
 ; SI-NEXT:    v_cvt_f32_f16_e32 v7, v7
-; SI-NEXT:    s_brev_b32 s0, -2
 ; SI-NEXT:    v_and_b32_e32 v9, 0x7fffffff, v6
 ; SI-NEXT:    v_and_b32_e32 v10, 0x7fffffff, v7
-; SI-NEXT:    v_cmp_ngt_f32_e64 s[2:3], |v6|, |v7|
-; SI-NEXT:    s_and_b64 vcc, exec, s[2:3]
+; SI-NEXT:    v_cmp_ngt_f32_e64 s[0:1], |v6|, |v7|
+; SI-NEXT:    s_and_b64 vcc, exec, s[0:1]
 ; SI-NEXT:    v_cvt_f16_f32_e32 v8, v6
 ; SI-NEXT:    s_cbranch_vccz .LBB10_2
 ; SI-NEXT:  ; %bb.1: ; %frem.else86
-; SI-NEXT:    v_bfi_b32 v11, s0, 0, v6
+; SI-NEXT:    v_and_b32_e32 v11, 0x80000000, v6
 ; SI-NEXT:    v_cvt_f32_f16_e32 v8, v8
 ; SI-NEXT:    v_cmp_eq_f32_e32 vcc, v9, v10
 ; SI-NEXT:    v_cndmask_b32_e32 v8, v8, v11, vcc
@@ -7287,8 +7256,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; SI-NEXT:    v_cmp_ngt_f32_e32 vcc, v10, v11
 ; SI-NEXT:    s_cbranch_vccz .LBB10_10
 ; SI-NEXT:  ; %bb.9: ; %frem.else53
-; SI-NEXT:    s_brev_b32 s0, -2
-; SI-NEXT:    v_bfi_b32 v12, s0, 0, v4
+; SI-NEXT:    v_and_b32_e32 v12, 0x80000000, v4
 ; SI-NEXT:    v_cvt_f32_f16_e32 v9, v9
 ; SI-NEXT:    v_cmp_eq_f32_e32 vcc, v10, v11
 ; SI-NEXT:    v_cndmask_b32_e32 v9, v9, v12, vcc
@@ -7374,8 +7342,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; SI-NEXT:    v_cmp_ngt_f32_e32 vcc, v11, v12
 ; SI-NEXT:    s_cbranch_vccz .LBB10_18
 ; SI-NEXT:  ; %bb.17: ; %frem.else20
-; SI-NEXT:    s_brev_b32 s0, -2
-; SI-NEXT:    v_bfi_b32 v13, s0, 0, v2
+; SI-NEXT:    v_and_b32_e32 v13, 0x80000000, v2
 ; SI-NEXT:    v_cvt_f32_f16_e32 v10, v10
 ; SI-NEXT:    v_cmp_eq_f32_e32 vcc, v11, v12
 ; SI-NEXT:    v_cndmask_b32_e32 v10, v10, v13, vcc
@@ -7461,8 +7428,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; SI-NEXT:    v_cmp_ngt_f32_e32 vcc, v12, v13
 ; SI-NEXT:    s_cbranch_vccz .LBB10_26
 ; SI-NEXT:  ; %bb.25: ; %frem.else
-; SI-NEXT:    s_brev_b32 s0, -2
-; SI-NEXT:    v_bfi_b32 v14, s0, 0, v0
+; SI-NEXT:    v_and_b32_e32 v14, 0x80000000, v0
 ; SI-NEXT:    v_cvt_f32_f16_e32 v11, v11
 ; SI-NEXT:    v_cmp_eq_f32_e32 vcc, v12, v13
 ; SI-NEXT:    v_cndmask_b32_e32 v11, v11, v14, vcc
@@ -7609,7 +7575,6 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; CI-NEXT:    buffer_load_dwordx2 v[3:4], off, s[4:7], 0
 ; CI-NEXT:    s_mov_b32 s3, s7
 ; CI-NEXT:    buffer_load_dwordx2 v[7:8], off, s[0:3], 0 offset:32
-; CI-NEXT:    s_brev_b32 s0, -2
 ; CI-NEXT:    s_waitcnt vmcnt(1)
 ; CI-NEXT:    v_cvt_f32_f16_e32 v6, v3
 ; CI-NEXT:    v_lshrrev_b32_e32 v1, 16, v3
@@ -7625,14 +7590,14 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; CI-NEXT:    v_cvt_f32_f16_e32 v1, v1
 ; CI-NEXT:    v_cvt_f32_f16_e32 v5, v5
 ; CI-NEXT:    v_cvt_f16_f32_e32 v8, v6
-; CI-NEXT:    v_cmp_ngt_f32_e64 s[2:3], |v6|, |v7|
+; CI-NEXT:    v_cmp_ngt_f32_e64 s[0:1], |v6|, |v7|
 ; CI-NEXT:    v_and_b32_e32 v10, 0x7fffffff, v6
 ; CI-NEXT:    v_and_b32_e32 v9, 0x7fffffff, v7
-; CI-NEXT:    s_and_b64 vcc, exec, s[2:3]
+; CI-NEXT:    s_and_b64 vcc, exec, s[0:1]
 ; CI-NEXT:    s_cbranch_vccz .LBB10_2
 ; CI-NEXT:  ; %bb.1: ; %frem.else86
 ; CI-NEXT:    v_cvt_f32_f16_e32 v8, v8
-; CI-NEXT:    v_bfi_b32 v11, s0, 0, v6
+; CI-NEXT:    v_and_b32_e32 v11, 0x80000000, v6
 ; CI-NEXT:    v_cmp_eq_f32_e32 vcc, v10, v9
 ; CI-NEXT:    v_cndmask_b32_e32 v8, v8, v11, vcc
 ; CI-NEXT:    s_cbranch_execz .LBB10_3
@@ -7705,8 +7670,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; CI-NEXT:    s_cbranch_vccz .LBB10_10
 ; CI-NEXT:  ; %bb.9: ; %frem.else53
 ; CI-NEXT:    v_cvt_f32_f16_e32 v9, v9
-; CI-NEXT:    s_brev_b32 s0, -2
-; CI-NEXT:    v_bfi_b32 v12, s0, 0, v4
+; CI-NEXT:    v_and_b32_e32 v12, 0x80000000, v4
 ; CI-NEXT:    v_cmp_eq_f32_e32 vcc, v11, v10
 ; CI-NEXT:    v_cndmask_b32_e32 v9, v9, v12, vcc
 ; CI-NEXT:    s_cbranch_execz .LBB10_11
@@ -7779,8 +7743,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; CI-NEXT:    s_cbranch_vccz .LBB10_18
 ; CI-NEXT:  ; %bb.17: ; %frem.else20
 ; CI-NEXT:    v_cvt_f32_f16_e32 v10, v10
-; CI-NEXT:    s_brev_b32 s0, -2
-; CI-NEXT:    v_bfi_b32 v13, s0, 0, v2
+; CI-NEXT:    v_and_b32_e32 v13, 0x80000000, v2
 ; CI-NEXT:    v_cmp_eq_f32_e32 vcc, v12, v11
 ; CI-NEXT:    v_cndmask_b32_e32 v10, v10, v13, vcc
 ; CI-NEXT:    s_cbranch_execz .LBB10_19
@@ -7853,8 +7816,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; CI-NEXT:    s_cbranch_vccz .LBB10_26
 ; CI-NEXT:  ; %bb.25: ; %frem.else
 ; CI-NEXT:    v_cvt_f32_f16_e32 v11, v11
-; CI-NEXT:    s_brev_b32 s0, -2
-; CI-NEXT:    v_bfi_b32 v14, s0, 0, v0
+; CI-NEXT:    v_and_b32_e32 v14, 0x80000000, v0
 ; CI-NEXT:    v_cmp_eq_f32_e32 vcc, v13, v12
 ; CI-NEXT:    v_cndmask_b32_e32 v11, v11, v14, vcc
 ; CI-NEXT:    s_cbranch_execz .LBB10_27
@@ -7994,8 +7956,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; VI-NEXT:    v_cmp_ngt_f32_e32 vcc, v6, v5
 ; VI-NEXT:    s_cbranch_vccz .LBB10_2
 ; VI-NEXT:  ; %bb.1: ; %frem.else86
-; VI-NEXT:    s_movk_i32 s2, 0x7fff
-; VI-NEXT:    v_bfi_b32 v4, s2, 0, v0
+; VI-NEXT:    v_and_b32_e32 v4, 0x8000, v0
 ; VI-NEXT:    v_cmp_eq_f32_e32 vcc, v6, v5
 ; VI-NEXT:    v_cndmask_b32_e32 v4, v0, v4, vcc
 ; VI-NEXT:    s_cbranch_execz .LBB10_3
@@ -8066,8 +8027,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; VI-NEXT:    v_cmp_ngt_f32_e32 vcc, v9, v8
 ; VI-NEXT:    s_cbranch_vccz .LBB10_10
 ; VI-NEXT:  ; %bb.9: ; %frem.else53
-; VI-NEXT:    s_movk_i32 s2, 0x7fff
-; VI-NEXT:    v_bfi_b32 v7, s2, 0, v5
+; VI-NEXT:    v_and_b32_e32 v7, 0x8000, v5
 ; VI-NEXT:    v_cmp_eq_f32_e32 vcc, v9, v8
 ; VI-NEXT:    v_cndmask_b32_e32 v7, v5, v7, vcc
 ; VI-NEXT:    s_cbranch_execz .LBB10_11
@@ -8136,8 +8096,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; VI-NEXT:    v_cmp_ngt_f32_e32 vcc, v10, v9
 ; VI-NEXT:    s_cbranch_vccz .LBB10_18
 ; VI-NEXT:  ; %bb.17: ; %frem.else20
-; VI-NEXT:    s_movk_i32 s2, 0x7fff
-; VI-NEXT:    v_bfi_b32 v8, s2, 0, v1
+; VI-NEXT:    v_and_b32_e32 v8, 0x8000, v1
 ; VI-NEXT:    v_cmp_eq_f32_e32 vcc, v10, v9
 ; VI-NEXT:    v_cndmask_b32_e32 v8, v1, v8, vcc
 ; VI-NEXT:    s_cbranch_execz .LBB10_19
@@ -8208,8 +8167,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; VI-NEXT:    v_cmp_ngt_f32_e32 vcc, v13, v12
 ; VI-NEXT:    s_cbranch_vccz .LBB10_26
 ; VI-NEXT:  ; %bb.25: ; %frem.else
-; VI-NEXT:    s_movk_i32 s2, 0x7fff
-; VI-NEXT:    v_bfi_b32 v11, s2, 0, v9
+; VI-NEXT:    v_and_b32_e32 v11, 0x8000, v9
 ; VI-NEXT:    v_cmp_eq_f32_e32 vcc, v13, v12
 ; VI-NEXT:    v_cndmask_b32_e32 v11, v9, v11, vcc
 ; VI-NEXT:    s_cbranch_execz .LBB10_27
@@ -8313,8 +8271,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; GFX9-NEXT:    v_cmp_ngt_f32_e32 vcc, v6, v5
 ; GFX9-NEXT:    s_cbranch_vccz .LBB10_2
 ; GFX9-NEXT:  ; %bb.1: ; %frem.else86
-; GFX9-NEXT:    s_movk_i32 s2, 0x7fff
-; GFX9-NEXT:    v_bfi_b32 v4, s2, 0, v2
+; GFX9-NEXT:    v_and_b32_e32 v4, 0x8000, v2
 ; GFX9-NEXT:    v_cmp_eq_f32_e32 vcc, v6, v5
 ; GFX9-NEXT:    v_cndmask_b32_e32 v4, v2, v4, vcc
 ; GFX9-NEXT:    s_cbranch_execz .LBB10_3
@@ -8384,8 +8341,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; GFX9-NEXT:    v_cmp_ngt_f32_e32 vcc, v8, v7
 ; GFX9-NEXT:    s_cbranch_vccz .LBB10_10
 ; GFX9-NEXT:  ; %bb.9: ; %frem.else53
-; GFX9-NEXT:    s_movk_i32 s2, 0x7fff
-; GFX9-NEXT:    v_bfi_b32 v6, s2, 0, v5
+; GFX9-NEXT:    v_and_b32_e32 v6, 0x8000, v5
 ; GFX9-NEXT:    v_cmp_eq_f32_e32 vcc, v8, v7
 ; GFX9-NEXT:    v_cndmask_b32_e32 v6, v5, v6, vcc
 ; GFX9-NEXT:    s_cbranch_execz .LBB10_11
@@ -8454,8 +8410,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; GFX9-NEXT:    v_cmp_ngt_f32_e32 vcc, v9, v8
 ; GFX9-NEXT:    s_cbranch_vccz .LBB10_18
 ; GFX9-NEXT:  ; %bb.17: ; %frem.else20
-; GFX9-NEXT:    s_movk_i32 s2, 0x7fff
-; GFX9-NEXT:    v_bfi_b32 v7, s2, 0, v3
+; GFX9-NEXT:    v_and_b32_e32 v7, 0x8000, v3
 ; GFX9-NEXT:    v_cmp_eq_f32_e32 vcc, v9, v8
 ; GFX9-NEXT:    v_cndmask_b32_e32 v7, v3, v7, vcc
 ; GFX9-NEXT:    s_cbranch_execz .LBB10_19
@@ -8525,8 +8480,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; GFX9-NEXT:    v_cmp_ngt_f32_e32 vcc, v11, v10
 ; GFX9-NEXT:    s_cbranch_vccz .LBB10_26
 ; GFX9-NEXT:  ; %bb.25: ; %frem.else
-; GFX9-NEXT:    s_movk_i32 s2, 0x7fff
-; GFX9-NEXT:    v_bfi_b32 v9, s2, 0, v8
+; GFX9-NEXT:    v_and_b32_e32 v9, 0x8000, v8
 ; GFX9-NEXT:    v_cmp_eq_f32_e32 vcc, v11, v10
 ; GFX9-NEXT:    v_cndmask_b32_e32 v9, v8, v9, vcc
 ; GFX9-NEXT:    s_cbranch_execz .LBB10_27
@@ -8633,7 +8587,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; GFX10-NEXT:    v_cmp_ngt_f32_e32 vcc_lo, v6, v5
 ; GFX10-NEXT:    s_cbranch_vccz .LBB10_2
 ; GFX10-NEXT:  ; %bb.1: ; %frem.else86
-; GFX10-NEXT:    v_bfi_b32 v4, 0x7fff, 0, v2
+; GFX10-NEXT:    v_and_b32_e32 v4, 0x8000, v2
 ; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, v6, v5
 ; GFX10-NEXT:    v_cndmask_b32_e32 v4, v2, v4, vcc_lo
 ; GFX10-NEXT:    s_cbranch_execz .LBB10_3
@@ -8705,7 +8659,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; GFX10-NEXT:    v_cmp_ngt_f32_e32 vcc_lo, v8, v7
 ; GFX10-NEXT:    s_cbranch_vccz .LBB10_10
 ; GFX10-NEXT:  ; %bb.9: ; %frem.else53
-; GFX10-NEXT:    v_bfi_b32 v6, 0x7fff, 0, v5
+; GFX10-NEXT:    v_and_b32_e32 v6, 0x8000, v5
 ; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, v8, v7
 ; GFX10-NEXT:    v_cndmask_b32_e32 v6, v5, v6, vcc_lo
 ; GFX10-NEXT:    s_cbranch_execz .LBB10_11
@@ -8776,7 +8730,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; GFX10-NEXT:    v_cmp_ngt_f32_e32 vcc_lo, v9, v8
 ; GFX10-NEXT:    s_cbranch_vccz .LBB10_18
 ; GFX10-NEXT:  ; %bb.17: ; %frem.else20
-; GFX10-NEXT:    v_bfi_b32 v7, 0x7fff, 0, v3
+; GFX10-NEXT:    v_and_b32_e32 v7, 0x8000, v3
 ; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, v9, v8
 ; GFX10-NEXT:    v_cndmask_b32_e32 v7, v3, v7, vcc_lo
 ; GFX10-NEXT:    s_cbranch_execz .LBB10_19
@@ -8848,7 +8802,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; GFX10-NEXT:    v_cmp_ngt_f32_e32 vcc_lo, v11, v10
 ; GFX10-NEXT:    s_cbranch_vccz .LBB10_26
 ; GFX10-NEXT:  ; %bb.25: ; %frem.else
-; GFX10-NEXT:    v_bfi_b32 v9, 0x7fff, 0, v8
+; GFX10-NEXT:    v_and_b32_e32 v9, 0x8000, v8
 ; GFX10-NEXT:    v_cmp_eq_f32_e32 vcc_lo, v11, v10
 ; GFX10-NEXT:    v_cndmask_b32_e32 v9, v8, v9, vcc_lo
 ; GFX10-NEXT:    s_cbranch_execz .LBB10_27
@@ -8956,11 +8910,9 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; GFX11-TRUE16-NEXT:    v_cmp_ngt_f32_e32 vcc_lo, v6, v5
 ; GFX11-TRUE16-NEXT:    s_cbranch_vccz .LBB10_2
 ; GFX11-TRUE16-NEXT:  ; %bb.1: ; %frem.else86
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v4.l, v0.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v7.l, 0
+; GFX11-TRUE16-NEXT:    v_and_b16 v4.l, 0x8000, v0.l
 ; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, v6, v5
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_bfi_b32 v4, 0x7fff, v7, v4
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX11-TRUE16-NEXT:    v_cndmask_b16 v4.l, v0.l, v4.l, vcc_lo
 ; GFX11-TRUE16-NEXT:    s_cbranch_execz .LBB10_3
 ; GFX11-TRUE16-NEXT:    s_branch .LBB10_8
@@ -9054,12 +9006,10 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; GFX11-TRUE16-NEXT:    v_cmp_ngt_f32_e32 vcc_lo, v9, v8
 ; GFX11-TRUE16-NEXT:    s_cbranch_vccz .LBB10_10
 ; GFX11-TRUE16-NEXT:  ; %bb.9: ; %frem.else53
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v7.l, v5.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v10.l, 0
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.h, 0x8000, v5.l
 ; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, v9, v8
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_bfi_b32 v7, 0x7fff, v10, v7
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v7.l, v5.l, v7.l, vcc_lo
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v7.l, v5.l, v0.h, vcc_lo
 ; GFX11-TRUE16-NEXT:    s_cbranch_execz .LBB10_11
 ; GFX11-TRUE16-NEXT:    s_branch .LBB10_16
 ; GFX11-TRUE16-NEXT:  .LBB10_10:
@@ -9149,12 +9099,10 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; GFX11-TRUE16-NEXT:    v_cmp_ngt_f32_e32 vcc_lo, v10, v9
 ; GFX11-TRUE16-NEXT:    s_cbranch_vccz .LBB10_18
 ; GFX11-TRUE16-NEXT:  ; %bb.17: ; %frem.else20
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v8.l, v1.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v11.l, 0
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.h, 0x8000, v1.l
 ; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, v10, v9
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_bfi_b32 v8, 0x7fff, v11, v8
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v8.l, v1.l, v8.l, vcc_lo
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v8.l, v1.l, v0.h, vcc_lo
 ; GFX11-TRUE16-NEXT:    s_cbranch_execz .LBB10_19
 ; GFX11-TRUE16-NEXT:    s_branch .LBB10_24
 ; GFX11-TRUE16-NEXT:  .LBB10_18:
@@ -9247,12 +9195,10 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; GFX11-TRUE16-NEXT:    v_cmp_ngt_f32_e32 vcc_lo, v13, v12
 ; GFX11-TRUE16-NEXT:    s_cbranch_vccz .LBB10_26
 ; GFX11-TRUE16-NEXT:  ; %bb.25: ; %frem.else
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v11.l, v9.l
-; GFX11-TRUE16-NEXT:    v_mov_b16_e32 v14.l, 0
+; GFX11-TRUE16-NEXT:    v_and_b16 v0.h, 0x8000, v9.l
 ; GFX11-TRUE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, v13, v12
-; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT:    v_bfi_b32 v11, 0x7fff, v14, v11
-; GFX11-TRUE16-NEXT:    v_cndmask_b16 v11.l, v9.l, v11.l, vcc_lo
+; GFX11-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-TRUE16-NEXT:    v_cndmask_b16 v11.l, v9.l, v0.h, vcc_lo
 ; GFX11-TRUE16-NEXT:    s_cbranch_execz .LBB10_27
 ; GFX11-TRUE16-NEXT:    s_branch .LBB10_32
 ; GFX11-TRUE16-NEXT:  .LBB10_26:
@@ -9375,7 +9321,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; GFX11-FAKE16-NEXT:    v_cmp_ngt_f32_e32 vcc_lo, v6, v5
 ; GFX11-FAKE16-NEXT:    s_cbranch_vccz .LBB10_2
 ; GFX11-FAKE16-NEXT:  ; %bb.1: ; %frem.else86
-; GFX11-FAKE16-NEXT:    v_bfi_b32 v4, 0x7fff, 0, v0
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v4, 0x8000, v0
 ; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, v6, v5
 ; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v4, v0, v4, vcc_lo
@@ -9470,7 +9416,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; GFX11-FAKE16-NEXT:    v_cmp_ngt_f32_e32 vcc_lo, v9, v8
 ; GFX11-FAKE16-NEXT:    s_cbranch_vccz .LBB10_10
 ; GFX11-FAKE16-NEXT:  ; %bb.9: ; %frem.else53
-; GFX11-FAKE16-NEXT:    v_bfi_b32 v7, 0x7fff, 0, v5
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v7, 0x8000, v5
 ; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, v9, v8
 ; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v7, v5, v7, vcc_lo
@@ -9562,7 +9508,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; GFX11-FAKE16-NEXT:    v_cmp_ngt_f32_e32 vcc_lo, v10, v9
 ; GFX11-FAKE16-NEXT:    s_cbranch_vccz .LBB10_18
 ; GFX11-FAKE16-NEXT:  ; %bb.17: ; %frem.else20
-; GFX11-FAKE16-NEXT:    v_bfi_b32 v8, 0x7fff, 0, v1
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v8, 0x8000, v1
 ; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, v10, v9
 ; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v8, v1, v8, vcc_lo
@@ -9657,7 +9603,7 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; GFX11-FAKE16-NEXT:    v_cmp_ngt_f32_e32 vcc_lo, v13, v12
 ; GFX11-FAKE16-NEXT:    s_cbranch_vccz .LBB10_26
 ; GFX11-FAKE16-NEXT:  ; %bb.25: ; %frem.else
-; GFX11-FAKE16-NEXT:    v_bfi_b32 v11, 0x7fff, 0, v9
+; GFX11-FAKE16-NEXT:    v_and_b32_e32 v11, 0x8000, v9
 ; GFX11-FAKE16-NEXT:    v_cmp_eq_f32_e32 vcc_lo, v13, v12
 ; GFX11-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX11-FAKE16-NEXT:    v_cndmask_b32_e32 v11, v9, v11, vcc_lo
@@ -9791,12 +9737,10 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; GFX1150-TRUE16-NEXT:    s_cmp_ngt_f32 s8, s6
 ; GFX1150-TRUE16-NEXT:    s_cbranch_scc0 .LBB10_2
 ; GFX1150-TRUE16-NEXT:  ; %bb.1: ; %frem.else86
-; GFX1150-TRUE16-NEXT:    v_mov_b16_e32 v0.l, s5
-; GFX1150-TRUE16-NEXT:    v_mov_b16_e32 v1.l, 0
 ; GFX1150-TRUE16-NEXT:    s_cmp_eq_f32 s8, s6
+; GFX1150-TRUE16-NEXT:    v_and_b16 v0.l, 0x8000, s5
 ; GFX1150-TRUE16-NEXT:    s_cselect_b32 s9, -1, 0
-; GFX1150-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-TRUE16-NEXT:    v_bfi_b32 v0, 0x7fff, v1, v0
+; GFX1150-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
 ; GFX1150-TRUE16-NEXT:    v_cndmask_b16 v0.l, s5, v0.l, s9
 ; GFX1150-TRUE16-NEXT:    s_cbranch_execz .LBB10_3
 ; GFX1150-TRUE16-NEXT:    s_branch .LBB10_8
@@ -9894,13 +9838,11 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; GFX1150-TRUE16-NEXT:    s_cmp_ngt_f32 s10, s9
 ; GFX1150-TRUE16-NEXT:    s_cbranch_scc0 .LBB10_10
 ; GFX1150-TRUE16-NEXT:  ; %bb.9: ; %frem.else53
-; GFX1150-TRUE16-NEXT:    v_mov_b16_e32 v1.l, s8
-; GFX1150-TRUE16-NEXT:    v_mov_b16_e32 v2.l, 0
 ; GFX1150-TRUE16-NEXT:    s_cmp_eq_f32 s10, s9
+; GFX1150-TRUE16-NEXT:    v_and_b16 v0.h, 0x8000, s8
 ; GFX1150-TRUE16-NEXT:    s_cselect_b32 s11, -1, 0
-; GFX1150-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-TRUE16-NEXT:    v_bfi_b32 v1, 0x7fff, v2, v1
-; GFX1150-TRUE16-NEXT:    v_cndmask_b16 v1.l, s8, v1.l, s11
+; GFX1150-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1150-TRUE16-NEXT:    v_cndmask_b16 v1.l, s8, v0.h, s11
 ; GFX1150-TRUE16-NEXT:    s_cbranch_execz .LBB10_11
 ; GFX1150-TRUE16-NEXT:    s_branch .LBB10_16
 ; GFX1150-TRUE16-NEXT:  .LBB10_10:
@@ -9995,13 +9937,11 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; GFX1150-TRUE16-NEXT:    s_cmp_ngt_f32 s10, s9
 ; GFX1150-TRUE16-NEXT:    s_cbranch_scc0 .LBB10_18
 ; GFX1150-TRUE16-NEXT:  ; %bb.17: ; %frem.else20
-; GFX1150-TRUE16-NEXT:    v_mov_b16_e32 v2.l, s7
-; GFX1150-TRUE16-NEXT:    v_mov_b16_e32 v3.l, 0
 ; GFX1150-TRUE16-NEXT:    s_cmp_eq_f32 s10, s9
+; GFX1150-TRUE16-NEXT:    v_and_b16 v0.h, 0x8000, s7
 ; GFX1150-TRUE16-NEXT:    s_cselect_b32 s11, -1, 0
-; GFX1150-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-TRUE16-NEXT:    v_bfi_b32 v2, 0x7fff, v3, v2
-; GFX1150-TRUE16-NEXT:    v_cndmask_b16 v2.l, s7, v2.l, s11
+; GFX1150-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1150-TRUE16-NEXT:    v_cndmask_b16 v2.l, s7, v0.h, s11
 ; GFX1150-TRUE16-NEXT:    s_cbranch_execz .LBB10_19
 ; GFX1150-TRUE16-NEXT:    s_branch .LBB10_24
 ; GFX1150-TRUE16-NEXT:  .LBB10_18:
@@ -10098,13 +10038,11 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; GFX1150-TRUE16-NEXT:    s_cmp_ngt_f32 s12, s11
 ; GFX1150-TRUE16-NEXT:    s_cbranch_scc0 .LBB10_26
 ; GFX1150-TRUE16-NEXT:  ; %bb.25: ; %frem.else
-; GFX1150-TRUE16-NEXT:    v_mov_b16_e32 v3.l, s10
-; GFX1150-TRUE16-NEXT:    v_mov_b16_e32 v4.l, 0
 ; GFX1150-TRUE16-NEXT:    s_cmp_eq_f32 s12, s11
+; GFX1150-TRUE16-NEXT:    v_and_b16 v0.h, 0x8000, s10
 ; GFX1150-TRUE16-NEXT:    s_cselect_b32 s13, -1, 0
-; GFX1150-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1150-TRUE16-NEXT:    v_bfi_b32 v3, 0x7fff, v4, v3
-; GFX1150-TRUE16-NEXT:    v_cndmask_b16 v3.l, s10, v3.l, s13
+; GFX1150-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1150-TRUE16-NEXT:    v_cndmask_b16 v3.l, s10, v0.h, s13
 ; GFX1150-TRUE16-NEXT:    s_cbranch_execz .LBB10_27
 ; GFX1150-TRUE16-NEXT:    s_branch .LBB10_32
 ; GFX1150-TRUE16-NEXT:  .LBB10_26:
@@ -10247,8 +10185,9 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; GFX1150-FAKE16-NEXT:    s_cmp_ngt_f32 s8, s6
 ; GFX1150-FAKE16-NEXT:    s_cbranch_scc0 .LBB10_2
 ; GFX1150-FAKE16-NEXT:  ; %bb.1: ; %frem.else86
+; GFX1150-FAKE16-NEXT:    s_and_b32 s9, 0x8000, s5
 ; GFX1150-FAKE16-NEXT:    s_cmp_eq_f32 s8, s6
-; GFX1150-FAKE16-NEXT:    v_bfi_b32 v0, 0x7fff, 0, s5
+; GFX1150-FAKE16-NEXT:    v_mov_b32_e32 v0, s9
 ; GFX1150-FAKE16-NEXT:    s_cselect_b32 vcc_lo, -1, 0
 ; GFX1150-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1150-FAKE16-NEXT:    v_cndmask_b32_e32 v0, s5, v0, vcc_lo
@@ -10347,8 +10286,9 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; GFX1150-FAKE16-NEXT:    s_cmp_ngt_f32 s10, s9
 ; GFX1150-FAKE16-NEXT:    s_cbranch_scc0 .LBB10_10
 ; GFX1150-FAKE16-NEXT:  ; %bb.9: ; %frem.else53
+; GFX1150-FAKE16-NEXT:    s_and_b32 s11, 0x8000, s8
 ; GFX1150-FAKE16-NEXT:    s_cmp_eq_f32 s10, s9
-; GFX1150-FAKE16-NEXT:    v_bfi_b32 v1, 0x7fff, 0, s8
+; GFX1150-FAKE16-NEXT:    v_mov_b32_e32 v1, s11
 ; GFX1150-FAKE16-NEXT:    s_cselect_b32 vcc_lo, -1, 0
 ; GFX1150-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1150-FAKE16-NEXT:    v_cndmask_b32_e32 v1, s8, v1, vcc_lo
@@ -10445,8 +10385,9 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; GFX1150-FAKE16-NEXT:    s_cmp_ngt_f32 s10, s9
 ; GFX1150-FAKE16-NEXT:    s_cbranch_scc0 .LBB10_18
 ; GFX1150-FAKE16-NEXT:  ; %bb.17: ; %frem.else20
+; GFX1150-FAKE16-NEXT:    s_and_b32 s11, 0x8000, s7
 ; GFX1150-FAKE16-NEXT:    s_cmp_eq_f32 s10, s9
-; GFX1150-FAKE16-NEXT:    v_bfi_b32 v2, 0x7fff, 0, s7
+; GFX1150-FAKE16-NEXT:    v_mov_b32_e32 v2, s11
 ; GFX1150-FAKE16-NEXT:    s_cselect_b32 vcc_lo, -1, 0
 ; GFX1150-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1150-FAKE16-NEXT:    v_cndmask_b32_e32 v2, s7, v2, vcc_lo
@@ -10545,8 +10486,9 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; GFX1150-FAKE16-NEXT:    s_cmp_ngt_f32 s12, s11
 ; GFX1150-FAKE16-NEXT:    s_cbranch_scc0 .LBB10_26
 ; GFX1150-FAKE16-NEXT:  ; %bb.25: ; %frem.else
+; GFX1150-FAKE16-NEXT:    s_and_b32 s13, 0x8000, s10
 ; GFX1150-FAKE16-NEXT:    s_cmp_eq_f32 s12, s11
-; GFX1150-FAKE16-NEXT:    v_bfi_b32 v3, 0x7fff, 0, s10
+; GFX1150-FAKE16-NEXT:    v_mov_b32_e32 v3, s13
 ; GFX1150-FAKE16-NEXT:    s_cselect_b32 vcc_lo, -1, 0
 ; GFX1150-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1150-FAKE16-NEXT:    v_cndmask_b32_e32 v3, s10, v3, vcc_lo
@@ -10694,12 +10636,10 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; GFX1200-TRUE16-NEXT:    s_cmp_ngt_f32 s8, s6
 ; GFX1200-TRUE16-NEXT:    s_cbranch_scc0 .LBB10_2
 ; GFX1200-TRUE16-NEXT:  ; %bb.1: ; %frem.else86
-; GFX1200-TRUE16-NEXT:    v_mov_b16_e32 v0.l, s5
-; GFX1200-TRUE16-NEXT:    v_mov_b16_e32 v1.l, 0
 ; GFX1200-TRUE16-NEXT:    s_cmp_eq_f32 s8, s6
+; GFX1200-TRUE16-NEXT:    v_and_b16 v0.l, 0x8000, s5
 ; GFX1200-TRUE16-NEXT:    s_cselect_b32 s9, -1, 0
-; GFX1200-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1200-TRUE16-NEXT:    v_bfi_b32 v0, 0x7fff, v1, v0
+; GFX1200-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
 ; GFX1200-TRUE16-NEXT:    v_cndmask_b16 v0.l, s5, v0.l, s9
 ; GFX1200-TRUE16-NEXT:    s_cbranch_execz .LBB10_3
 ; GFX1200-TRUE16-NEXT:    s_branch .LBB10_8
@@ -10803,13 +10743,11 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; GFX1200-TRUE16-NEXT:    s_cmp_ngt_f32 s10, s9
 ; GFX1200-TRUE16-NEXT:    s_cbranch_scc0 .LBB10_10
 ; GFX1200-TRUE16-NEXT:  ; %bb.9: ; %frem.else53
-; GFX1200-TRUE16-NEXT:    v_mov_b16_e32 v1.l, s8
-; GFX1200-TRUE16-NEXT:    v_mov_b16_e32 v2.l, 0
 ; GFX1200-TRUE16-NEXT:    s_cmp_eq_f32 s10, s9
+; GFX1200-TRUE16-NEXT:    v_and_b16 v0.h, 0x8000, s8
 ; GFX1200-TRUE16-NEXT:    s_cselect_b32 s11, -1, 0
-; GFX1200-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1200-TRUE16-NEXT:    v_bfi_b32 v1, 0x7fff, v2, v1
-; GFX1200-TRUE16-NEXT:    v_cndmask_b16 v1.l, s8, v1.l, s11
+; GFX1200-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1200-TRUE16-NEXT:    v_cndmask_b16 v1.l, s8, v0.h, s11
 ; GFX1200-TRUE16-NEXT:    s_cbranch_execz .LBB10_11
 ; GFX1200-TRUE16-NEXT:    s_branch .LBB10_16
 ; GFX1200-TRUE16-NEXT:  .LBB10_10:
@@ -10910,14 +10848,12 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; GFX1200-TRUE16-NEXT:    s_cmp_ngt_f32 s10, s9
 ; GFX1200-TRUE16-NEXT:    s_cbranch_scc0 .LBB10_18
 ; GFX1200-TRUE16-NEXT:  ; %bb.17: ; %frem.else20
-; GFX1200-TRUE16-NEXT:    v_mov_b16_e32 v2.l, s7
-; GFX1200-TRUE16-NEXT:    v_mov_b16_e32 v3.l, 0
 ; GFX1200-TRUE16-NEXT:    s_cmp_eq_f32 s10, s9
+; GFX1200-TRUE16-NEXT:    v_and_b16 v0.h, 0x8000, s7
 ; GFX1200-TRUE16-NEXT:    s_cselect_b32 s11, -1, 0
-; GFX1200-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX1200-TRUE16-NEXT:    v_bfi_b32 v2, 0x7fff, v3, v2
 ; GFX1200-TRUE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
-; GFX1200-TRUE16-NEXT:    v_cndmask_b16 v2.l, s7, v2.l, s11
+; GFX1200-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1200-TRUE16-NEXT:    v_cndmask_b16 v2.l, s7, v0.h, s11
 ; GFX1200-TRUE16-NEXT:    s_cbranch_execz .LBB10_19
 ; GFX1200-TRUE16-NEXT:    s_branch .LBB10_24
 ; GFX1200-TRUE16-NEXT:  .LBB10_18:
@@ -11021,13 +10957,11 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; GFX1200-TRUE16-NEXT:    s_cmp_ngt_f32 s12, s11
 ; GFX1200-TRUE16-NEXT:    s_cbranch_scc0 .LBB10_26
 ; GFX1200-TRUE16-NEXT:  ; %bb.25: ; %frem.else
-; GFX1200-TRUE16-NEXT:    v_mov_b16_e32 v3.l, s10
-; GFX1200-TRUE16-NEXT:    v_mov_b16_e32 v4.l, 0
 ; GFX1200-TRUE16-NEXT:    s_cmp_eq_f32 s12, s11
+; GFX1200-TRUE16-NEXT:    v_and_b16 v0.h, 0x8000, s10
 ; GFX1200-TRUE16-NEXT:    s_cselect_b32 s13, -1, 0
-; GFX1200-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1200-TRUE16-NEXT:    v_bfi_b32 v3, 0x7fff, v4, v3
-; GFX1200-TRUE16-NEXT:    v_cndmask_b16 v3.l, s10, v3.l, s13
+; GFX1200-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
+; GFX1200-TRUE16-NEXT:    v_cndmask_b16 v3.l, s10, v0.h, s13
 ; GFX1200-TRUE16-NEXT:    s_cbranch_execz .LBB10_27
 ; GFX1200-TRUE16-NEXT:    s_branch .LBB10_32
 ; GFX1200-TRUE16-NEXT:  .LBB10_26:
@@ -11177,8 +11111,9 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; GFX1200-FAKE16-NEXT:    s_cmp_ngt_f32 s8, s6
 ; GFX1200-FAKE16-NEXT:    s_cbranch_scc0 .LBB10_2
 ; GFX1200-FAKE16-NEXT:  ; %bb.1: ; %frem.else86
+; GFX1200-FAKE16-NEXT:    s_and_b32 s9, 0x8000, s5
 ; GFX1200-FAKE16-NEXT:    s_cmp_eq_f32 s8, s6
-; GFX1200-FAKE16-NEXT:    v_bfi_b32 v0, 0x7fff, 0, s5
+; GFX1200-FAKE16-NEXT:    v_mov_b32_e32 v0, s9
 ; GFX1200-FAKE16-NEXT:    s_cselect_b32 vcc_lo, -1, 0
 ; GFX1200-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1200-FAKE16-NEXT:    v_cndmask_b32_e32 v0, s5, v0, vcc_lo
@@ -11284,8 +11219,9 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; GFX1200-FAKE16-NEXT:    s_cmp_ngt_f32 s10, s9
 ; GFX1200-FAKE16-NEXT:    s_cbranch_scc0 .LBB10_10
 ; GFX1200-FAKE16-NEXT:  ; %bb.9: ; %frem.else53
+; GFX1200-FAKE16-NEXT:    s_and_b32 s11, 0x8000, s8
 ; GFX1200-FAKE16-NEXT:    s_cmp_eq_f32 s10, s9
-; GFX1200-FAKE16-NEXT:    v_bfi_b32 v1, 0x7fff, 0, s8
+; GFX1200-FAKE16-NEXT:    v_mov_b32_e32 v1, s11
 ; GFX1200-FAKE16-NEXT:    s_cselect_b32 vcc_lo, -1, 0
 ; GFX1200-FAKE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
 ; GFX1200-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
@@ -11389,8 +11325,10 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; GFX1200-FAKE16-NEXT:    s_cmp_ngt_f32 s10, s9
 ; GFX1200-FAKE16-NEXT:    s_cbranch_scc0 .LBB10_18
 ; GFX1200-FAKE16-NEXT:  ; %bb.17: ; %frem.else20
+; GFX1200-FAKE16-NEXT:    s_and_b32 s11, 0x8000, s7
 ; GFX1200-FAKE16-NEXT:    s_cmp_eq_f32 s10, s9
-; GFX1200-FAKE16-NEXT:    v_bfi_b32 v2, 0x7fff, 0, s7
+; GFX1200-FAKE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
+; GFX1200-FAKE16-NEXT:    v_mov_b32_e32 v2, s11
 ; GFX1200-FAKE16-NEXT:    s_cselect_b32 vcc_lo, -1, 0
 ; GFX1200-FAKE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
 ; GFX1200-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
@@ -11497,8 +11435,9 @@ define amdgpu_kernel void @frem_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; GFX1200-FAKE16-NEXT:    s_cmp_ngt_f32 s12, s11
 ; GFX1200-FAKE16-NEXT:    s_cbranch_scc0 .LBB10_26
 ; GFX1200-FAKE16-NEXT:  ; %bb.25: ; %frem.else
+; GFX1200-FAKE16-NEXT:    s_and_b32 s13, 0x8000, s10
 ; GFX1200-FAKE16-NEXT:    s_cmp_eq_f32 s12, s11
-; GFX1200-FAKE16-NEXT:    v_bfi_b32 v3, 0x7fff, 0, s10
+; GFX1200-FAKE16-NEXT:    v_mov_b32_e32 v3, s13
 ; GFX1200-FAKE16-NEXT:    s_cselect_b32 vcc_lo, -1, 0
 ; GFX1200-FAKE16-NEXT:    s_wait_alu depctr_sa_sdst(0)
 ; GFX1200-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
@@ -11660,8 +11599,7 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; SI-NEXT:    s_and_b64 vcc, exec, s[2:3]
 ; SI-NEXT:    s_cbranch_vccz .LBB11_2
 ; SI-NEXT:  ; %bb.1: ; %frem.else16
-; SI-NEXT:    s_brev_b32 s2, -2
-; SI-NEXT:    v_bfi_b32 v4, s2, 0, v0
+; SI-NEXT:    v_and_b32_e32 v4, 0x80000000, v0
 ; SI-NEXT:    v_cmp_eq_f32_e64 vcc, |v0|, |v2|
 ; SI-NEXT:    v_cndmask_b32_e32 v4, v0, v4, vcc
 ; SI-NEXT:    s_mov_b64 vcc, exec
@@ -11741,8 +11679,7 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; SI-NEXT:    s_and_b64 vcc, exec, s[2:3]
 ; SI-NEXT:    s_cbranch_vccz .LBB11_10
 ; SI-NEXT:  ; %bb.9: ; %frem.else
-; SI-NEXT:    s_brev_b32 s2, -2
-; SI-NEXT:    v_bfi_b32 v5, s2, 0, v1
+; SI-NEXT:    v_and_b32_e32 v5, 0x80000000, v1
 ; SI-NEXT:    v_cmp_eq_f32_e64 vcc, |v1|, |v3|
 ; SI-NEXT:    v_cndmask_b32_e32 v5, v1, v5, vcc
 ; SI-NEXT:    s_mov_b64 vcc, exec
@@ -11851,8 +11788,7 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; CI-NEXT:    s_and_b64 vcc, exec, s[2:3]
 ; CI-NEXT:    s_cbranch_vccz .LBB11_2
 ; CI-NEXT:  ; %bb.1: ; %frem.else16
-; CI-NEXT:    s_brev_b32 s2, -2
-; CI-NEXT:    v_bfi_b32 v4, s2, 0, v0
+; CI-NEXT:    v_and_b32_e32 v4, 0x80000000, v0
 ; CI-NEXT:    v_cmp_eq_f32_e64 vcc, |v0|, |v2|
 ; CI-NEXT:    v_cndmask_b32_e32 v4, v0, v4, vcc
 ; CI-NEXT:    s_cbranch_execz .LBB11_3
@@ -11919,8 +11855,7 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; CI-NEXT:    s_and_b64 vcc, exec, s[2:3]
 ; CI-NEXT:    s_cbranch_vccz .LBB11_10
 ; CI-NEXT:  ; %bb.9: ; %frem.else
-; CI-NEXT:    s_brev_b32 s2, -2
-; CI-NEXT:    v_bfi_b32 v5, s2, 0, v1
+; CI-NEXT:    v_and_b32_e32 v5, 0x80000000, v1
 ; CI-NEXT:    v_cmp_eq_f32_e64 vcc, |v1|, |v3|
 ; CI-NEXT:    v_cndmask_b32_e32 v5, v1, v5, vcc
 ; CI-NEXT:    s_cbranch_execz .LBB11_11
@@ -12016,8 +11951,7 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; VI-NEXT:    s_and_b64 vcc, exec, s[2:3]
 ; VI-NEXT:    s_cbranch_vccz .LBB11_2
 ; VI-NEXT:  ; %bb.1: ; %frem.else16
-; VI-NEXT:    s_brev_b32 s2, -2
-; VI-NEXT:    v_bfi_b32 v4, s2, 0, v0
+; VI-NEXT:    v_and_b32_e32 v4, 0x80000000, v0
 ; VI-NEXT:    v_cmp_eq_f32_e64 vcc, |v0|, |v2|
 ; VI-NEXT:    v_cndmask_b32_e32 v4, v0, v4, vcc
 ; VI-NEXT:    s_cbranch_execz .LBB11_3
@@ -12084,8 +12018,7 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; VI-NEXT:    s_and_b64 vcc, exec, s[2:3]
 ; VI-NEXT:    s_cbranch_vccz .LBB11_10
 ; VI-NEXT:  ; %bb.9: ; %frem.else
-; VI-NEXT:    s_brev_b32 s2, -2
-; VI-NEXT:    v_bfi_b32 v5, s2, 0, v1
+; VI-NEXT:    v_and_b32_e32 v5, 0x80000000, v1
 ; VI-NEXT:    v_cmp_eq_f32_e64 vcc, |v1|, |v3|
 ; VI-NEXT:    v_cndmask_b32_e32 v5, v1, v5, vcc
 ; VI-NEXT:    s_cbranch_execz .LBB11_11
@@ -12176,8 +12109,7 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; GFX9-NEXT:    s_and_b64 vcc, exec, s[2:3]
 ; GFX9-NEXT:    s_cbranch_vccz .LBB11_2
 ; GFX9-NEXT:  ; %bb.1: ; %frem.else16
-; GFX9-NEXT:    s_brev_b32 s2, -2
-; GFX9-NEXT:    v_bfi_b32 v4, s2, 0, v0
+; GFX9-NEXT:    v_and_b32_e32 v4, 0x80000000, v0
 ; GFX9-NEXT:    v_cmp_eq_f32_e64 vcc, |v0|, |v2|
 ; GFX9-NEXT:    v_cndmask_b32_e32 v4, v0, v4, vcc
 ; GFX9-NEXT:    s_cbranch_execz .LBB11_3
@@ -12244,8 +12176,7 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; GFX9-NEXT:    s_and_b64 vcc, exec, s[2:3]
 ; GFX9-NEXT:    s_cbranch_vccz .LBB11_10
 ; GFX9-NEXT:  ; %bb.9: ; %frem.else
-; GFX9-NEXT:    s_brev_b32 s2, -2
-; GFX9-NEXT:    v_bfi_b32 v5, s2, 0, v1
+; GFX9-NEXT:    v_and_b32_e32 v5, 0x80000000, v1
 ; GFX9-NEXT:    v_cmp_eq_f32_e64 vcc, |v1|, |v3|
 ; GFX9-NEXT:    v_cndmask_b32_e32 v5, v1, v5, vcc
 ; GFX9-NEXT:    s_cbranch_execz .LBB11_11
@@ -12337,7 +12268,7 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; GFX10-NEXT:    s_and_b32 vcc_lo, exec_lo, s2
 ; GFX10-NEXT:    s_cbranch_vccz .LBB11_2
 ; GFX10-NEXT:  ; %bb.1: ; %frem.else16
-; GFX10-NEXT:    v_bfi_b32 v4, 0x7fffffff, 0, v0
+; GFX10-NEXT:    v_and_b32_e32 v4, 0x80000000, v0
 ; GFX10-NEXT:    v_cmp_eq_f32_e64 vcc_lo, |v0|, |v2|
 ; GFX10-NEXT:    v_cndmask_b32_e32 v4, v0, v4, vcc_lo
 ; GFX10-NEXT:    s_cbranch_execz .LBB11_3
@@ -12406,7 +12337,7 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; GFX10-NEXT:    s_and_b32 vcc_lo, exec_lo, s2
 ; GFX10-NEXT:    s_cbranch_vccz .LBB11_10
 ; GFX10-NEXT:  ; %bb.9: ; %frem.else
-; GFX10-NEXT:    v_bfi_b32 v5, 0x7fffffff, 0, v1
+; GFX10-NEXT:    v_and_b32_e32 v5, 0x80000000, v1
 ; GFX10-NEXT:    v_cmp_eq_f32_e64 vcc_lo, |v1|, |v3|
 ; GFX10-NEXT:    v_cndmask_b32_e32 v5, v1, v5, vcc_lo
 ; GFX10-NEXT:    s_cbranch_execz .LBB11_11
@@ -12498,7 +12429,7 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; GFX11-NEXT:    s_and_b32 vcc_lo, exec_lo, s2
 ; GFX11-NEXT:    s_cbranch_vccz .LBB11_2
 ; GFX11-NEXT:  ; %bb.1: ; %frem.else16
-; GFX11-NEXT:    v_bfi_b32 v4, 0x7fffffff, 0, v0
+; GFX11-NEXT:    v_and_b32_e32 v4, 0x80000000, v0
 ; GFX11-NEXT:    v_cmp_eq_f32_e64 vcc_lo, |v0|, |v2|
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX11-NEXT:    v_cndmask_b32_e32 v4, v0, v4, vcc_lo
@@ -12587,7 +12518,7 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; GFX11-NEXT:    s_and_b32 vcc_lo, exec_lo, s2
 ; GFX11-NEXT:    s_cbranch_vccz .LBB11_10
 ; GFX11-NEXT:  ; %bb.9: ; %frem.else
-; GFX11-NEXT:    v_bfi_b32 v5, 0x7fffffff, 0, v1
+; GFX11-NEXT:    v_and_b32_e32 v5, 0x80000000, v1
 ; GFX11-NEXT:    v_cmp_eq_f32_e64 vcc_lo, |v1|, |v3|
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX11-NEXT:    v_cndmask_b32_e32 v5, v1, v5, vcc_lo
@@ -12699,13 +12630,14 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; GFX1150-NEXT:    s_waitcnt vmcnt(0)
 ; GFX1150-NEXT:    v_readfirstlane_b32 s4, v1
 ; GFX1150-NEXT:    v_readfirstlane_b32 s2, v2
-; GFX1150-NEXT:    s_and_b32 s8, s4, 0x7fffffff
+; GFX1150-NEXT:    s_and_b32 s7, s4, 0x7fffffff
 ; GFX1150-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1150-NEXT:    s_cmp_ngt_f32 s3, s8
+; GFX1150-NEXT:    s_cmp_ngt_f32 s3, s7
 ; GFX1150-NEXT:    s_cbranch_scc0 .LBB11_2
 ; GFX1150-NEXT:  ; %bb.1: ; %frem.else16
-; GFX1150-NEXT:    s_cmp_eq_f32 s3, s8
-; GFX1150-NEXT:    v_bfi_b32 v0, 0x7fffffff, 0, s6
+; GFX1150-NEXT:    s_and_b32 s8, 0x80000000, s6
+; GFX1150-NEXT:    s_cmp_eq_f32 s3, s7
+; GFX1150-NEXT:    v_mov_b32_e32 v0, s8
 ; GFX1150-NEXT:    s_cselect_b32 vcc_lo, -1, 0
 ; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1150-NEXT:    v_cndmask_b32_e32 v0, s6, v0, vcc_lo
@@ -12793,13 +12725,14 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; GFX1150-NEXT:    v_bfi_b32 v0, 0x7fffffff, v0, s6
 ; GFX1150-NEXT:  .LBB11_8:
 ; GFX1150-NEXT:    s_and_b32 s6, s5, 0x7fffffff
-; GFX1150-NEXT:    s_and_b32 s8, s2, 0x7fffffff
+; GFX1150-NEXT:    s_and_b32 s7, s2, 0x7fffffff
 ; GFX1150-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1150-NEXT:    s_cmp_ngt_f32 s6, s8
+; GFX1150-NEXT:    s_cmp_ngt_f32 s6, s7
 ; GFX1150-NEXT:    s_cbranch_scc0 .LBB11_10
 ; GFX1150-NEXT:  ; %bb.9: ; %frem.else
-; GFX1150-NEXT:    s_cmp_eq_f32 s6, s8
-; GFX1150-NEXT:    v_bfi_b32 v1, 0x7fffffff, 0, s5
+; GFX1150-NEXT:    s_and_b32 s8, 0x80000000, s5
+; GFX1150-NEXT:    s_cmp_eq_f32 s6, s7
+; GFX1150-NEXT:    v_mov_b32_e32 v1, s8
 ; GFX1150-NEXT:    s_cselect_b32 vcc_lo, -1, 0
 ; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1150-NEXT:    v_cndmask_b32_e32 v1, s5, v1, vcc_lo
@@ -12919,21 +12852,21 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; GFX1200-NEXT:    s_wait_loadcnt 0x0
 ; GFX1200-NEXT:    v_readfirstlane_b32 s4, v1
 ; GFX1200-NEXT:    v_readfirstlane_b32 s2, v2
-; GFX1200-NEXT:    s_and_b32 s8, s4, 0x7fffffff
+; GFX1200-NEXT:    s_and_b32 s7, s4, 0x7fffffff
 ; GFX1200-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1200-NEXT:    s_cmp_ngt_f32 s3, s8
-; GFX1200-NEXT:    s_cbranch_scc0 .LBB11_2
+; GFX1200-NEXT:    s_cmp_ngt_f32 s3, s7
+; GFX1200-NEXT:    s_cbranch_scc0 .LBB11_3
 ; GFX1200-NEXT:  ; %bb.1: ; %frem.else16
-; GFX1200-NEXT:    s_cmp_eq_f32 s3, s8
-; GFX1200-NEXT:    v_bfi_b32 v0, 0x7fffffff, 0, s6
-; GFX1200-NEXT:    s_cselect_b32 vcc_lo, -1, 0
-; GFX1200-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1200-NEXT:    v_cndmask_b32_e32 v0, s6, v0, vcc_lo
-; GFX1200-NEXT:    s_cbranch_execz .LBB11_3
-; GFX1200-NEXT:    s_branch .LBB11_8
-; GFX1200-NEXT:  .LBB11_2:
-; GFX1200-NEXT:    ; implicit-def: $vgpr0
-; GFX1200-NEXT:  .LBB11_3: ; %frem.compute15
+; GFX1200-NEXT:    s_and_b32 s8, 0x80000000, s6
+; GFX1200-NEXT:    s_cmp_eq_f32 s3, s7
+; GFX1200-NEXT:    s_cselect_b32 s7, s8, s6
+; GFX1200-NEXT:    s_cbranch_execz .LBB11_4
+; GFX1200-NEXT:  ; %bb.2:
+; GFX1200-NEXT:    v_mov_b32_e32 v0, s7
+; GFX1200-NEXT:    s_branch .LBB11_9
+; GFX1200-NEXT:  .LBB11_3:
+; GFX1200-NEXT:    ; implicit-def: $sgpr7
+; GFX1200-NEXT:  .LBB11_4: ; %frem.compute15
 ; GFX1200-NEXT:    v_frexp_mant_f32_e64 v1, |s4|
 ; GFX1200-NEXT:    v_frexp_mant_f32_e64 v0, |s6|
 ; GFX1200-NEXT:    v_frexp_exp_i32_f32_e32 v3, s6
@@ -12963,17 +12896,16 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; GFX1200-NEXT:    v_fmac_f32_e32 v7, v8, v6
 ; GFX1200-NEXT:    v_fma_f32 v3, -v5, v7, v3
 ; GFX1200-NEXT:    s_denorm_mode 12
-; GFX1200-NEXT:    s_wait_alu depctr_va_vcc(0)
 ; GFX1200-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
 ; GFX1200-NEXT:    v_div_fmas_f32 v3, v3, v6, v7
 ; GFX1200-NEXT:    v_cmp_gt_i32_e32 vcc_lo, 13, v4
 ; GFX1200-NEXT:    v_div_fixup_f32 v3, v3, v1, 1.0
-; GFX1200-NEXT:    s_cbranch_vccnz .LBB11_7
-; GFX1200-NEXT:  ; %bb.4: ; %frem.loop_body23.preheader
+; GFX1200-NEXT:    s_cbranch_vccnz .LBB11_8
+; GFX1200-NEXT:  ; %bb.5: ; %frem.loop_body23.preheader
 ; GFX1200-NEXT:    s_sub_co_i32 s7, s7, s8
 ; GFX1200-NEXT:    s_wait_alu depctr_sa_sdst(0)
 ; GFX1200-NEXT:    s_add_co_i32 s7, s7, 12
-; GFX1200-NEXT:  .LBB11_5: ; %frem.loop_body23
+; GFX1200-NEXT:  .LBB11_6: ; %frem.loop_body23
 ; GFX1200-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1200-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1200-NEXT:    v_mov_b32_e32 v5, v2
@@ -12993,11 +12925,11 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; GFX1200-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1200-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc_lo
 ; GFX1200-NEXT:    v_ldexp_f32 v2, v2, 12
-; GFX1200-NEXT:    s_cbranch_scc1 .LBB11_5
-; GFX1200-NEXT:  ; %bb.6: ; %Flow51
+; GFX1200-NEXT:    s_cbranch_scc1 .LBB11_6
+; GFX1200-NEXT:  ; %bb.7: ; %Flow51
 ; GFX1200-NEXT:    v_mov_b32_e32 v4, s7
 ; GFX1200-NEXT:    v_mov_b32_e32 v2, v5
-; GFX1200-NEXT:  .LBB11_7: ; %frem.loop_exit24
+; GFX1200-NEXT:  .LBB11_8: ; %frem.loop_exit24
 ; GFX1200-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1200-NEXT:    v_add_nc_u32_e32 v4, -11, v4
 ; GFX1200-NEXT:    v_ldexp_f32 v2, v2, v4
@@ -13015,24 +12947,24 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; GFX1200-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1200-NEXT:    v_ldexp_f32 v0, v1, v0
 ; GFX1200-NEXT:    v_bfi_b32 v0, 0x7fffffff, v0, s6
-; GFX1200-NEXT:  .LBB11_8:
+; GFX1200-NEXT:  .LBB11_9:
 ; GFX1200-NEXT:    s_and_b32 s6, s5, 0x7fffffff
-; GFX1200-NEXT:    s_and_b32 s8, s2, 0x7fffffff
+; GFX1200-NEXT:    s_and_b32 s7, s2, 0x7fffffff
 ; GFX1200-NEXT:    s_wait_alu depctr_sa_sdst(0)
-; GFX1200-NEXT:    s_cmp_ngt_f32 s6, s8
-; GFX1200-NEXT:    s_cbranch_scc0 .LBB11_10
-; GFX1200-NEXT:  ; %bb.9: ; %frem.else
-; GFX1200-NEXT:    s_cmp_eq_f32 s6, s8
-; GFX1200-NEXT:    v_bfi_b32 v1, 0x7fffffff, 0, s5
-; GFX1200-NEXT:    s_cselect_b32 vcc_lo, -1, 0
+; GFX1200-NEXT:    s_cmp_ngt_f32 s6, s7
+; GFX1200-NEXT:    s_cbranch_scc0 .LBB11_12
+; GFX1200-NEXT:  ; %bb.10: ; %frem.else
+; GFX1200-NEXT:    s_and_b32 s8, 0x80000000, s5
+; GFX1200-NEXT:    s_cmp_eq_f32 s6, s7
+; GFX1200-NEXT:    s_cselect_b32 s7, s8, s5
+; GFX1200-NEXT:    s_cbranch_execz .LBB11_13
+; GFX1200-NEXT:  ; %bb.11:
 ; GFX1200-NEXT:    s_wait_alu depctr_sa_sdst(0)
-; GFX1200-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1200-NEXT:    v_cndmask_b32_e32 v1, s5, v1, vcc_lo
-; GFX1200-NEXT:    s_cbranch_execz .LBB11_11
-; GFX1200-NEXT:    s_branch .LBB11_16
-; GFX1200-NEXT:  .LBB11_10:
-; GFX1200-NEXT:    ; implicit-def: $vgpr1
-; GFX1200-NEXT:  .LBB11_11: ; %frem.compute
+; GFX1200-NEXT:    v_mov_b32_e32 v1, s7
+; GFX1200-NEXT:    s_branch .LBB11_18
+; GFX1200-NEXT:  .LBB11_12:
+; GFX1200-NEXT:    ; implicit-def: $sgpr7
+; GFX1200-NEXT:  .LBB11_13: ; %frem.compute
 ; GFX1200-NEXT:    v_frexp_mant_f32_e64 v2, |s2|
 ; GFX1200-NEXT:    v_frexp_mant_f32_e64 v1, |s5|
 ; GFX1200-NEXT:    v_frexp_exp_i32_f32_e32 v4, s5
@@ -13067,12 +12999,13 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; GFX1200-NEXT:    v_div_fmas_f32 v4, v4, v7, v8
 ; GFX1200-NEXT:    v_cmp_gt_i32_e32 vcc_lo, 13, v5
 ; GFX1200-NEXT:    v_div_fixup_f32 v4, v4, v2, 1.0
-; GFX1200-NEXT:    s_cbranch_vccnz .LBB11_15
-; GFX1200-NEXT:  ; %bb.12: ; %frem.loop_body.preheader
+; GFX1200-NEXT:    s_cbranch_vccnz .LBB11_17
+; GFX1200-NEXT:  ; %bb.14: ; %frem.loop_body.preheader
+; GFX1200-NEXT:    s_wait_alu depctr_sa_sdst(0)
 ; GFX1200-NEXT:    s_sub_co_i32 s7, s7, s8
 ; GFX1200-NEXT:    s_wait_alu depctr_sa_sdst(0)
 ; GFX1200-NEXT:    s_add_co_i32 s7, s7, 12
-; GFX1200-NEXT:  .LBB11_13: ; %frem.loop_body
+; GFX1200-NEXT:  .LBB11_15: ; %frem.loop_body
 ; GFX1200-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1200-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1200-NEXT:    v_mov_b32_e32 v6, v3
@@ -13092,11 +13025,11 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; GFX1200-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1200-NEXT:    v_cndmask_b32_e32 v3, v3, v5, vcc_lo
 ; GFX1200-NEXT:    v_ldexp_f32 v3, v3, 12
-; GFX1200-NEXT:    s_cbranch_scc1 .LBB11_13
-; GFX1200-NEXT:  ; %bb.14: ; %Flow
+; GFX1200-NEXT:    s_cbranch_scc1 .LBB11_15
+; GFX1200-NEXT:  ; %bb.16: ; %Flow
 ; GFX1200-NEXT:    v_mov_b32_e32 v5, s7
 ; GFX1200-NEXT:    v_mov_b32_e32 v3, v6
-; GFX1200-NEXT:  .LBB11_15: ; %frem.loop_exit
+; GFX1200-NEXT:  .LBB11_17: ; %frem.loop_exit
 ; GFX1200-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1200-NEXT:    v_add_nc_u32_e32 v5, -11, v5
 ; GFX1200-NEXT:    v_ldexp_f32 v3, v3, v5
@@ -13114,7 +13047,7 @@ define amdgpu_kernel void @frem_v2f32(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; GFX1200-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1200-NEXT:    v_ldexp_f32 v1, v2, v1
 ; GFX1200-NEXT:    v_bfi_b32 v1, 0x7fffffff, v1, s5
-; GFX1200-NEXT:  .LBB11_16: ; %Flow50
+; GFX1200-NEXT:  .LBB11_18: ; %Flow50
 ; GFX1200-NEXT:    s_cmp_lg_f32 s4, 0
 ; GFX1200-NEXT:    s_cselect_b32 s4, -1, 0
 ; GFX1200-NEXT:    s_cmp_nge_f32 s3, 0x7f800000
@@ -13161,8 +13094,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; SI-NEXT:    s_and_b64 vcc, exec, s[2:3]
 ; SI-NEXT:    s_cbranch_vccz .LBB12_2
 ; SI-NEXT:  ; %bb.1: ; %frem.else78
-; SI-NEXT:    s_brev_b32 s2, -2
-; SI-NEXT:    v_bfi_b32 v8, s2, 0, v0
+; SI-NEXT:    v_and_b32_e32 v8, 0x80000000, v0
 ; SI-NEXT:    v_cmp_eq_f32_e64 vcc, |v0|, |v4|
 ; SI-NEXT:    v_cndmask_b32_e32 v8, v0, v8, vcc
 ; SI-NEXT:    s_mov_b64 vcc, exec
@@ -13242,8 +13174,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; SI-NEXT:    s_and_b64 vcc, exec, s[2:3]
 ; SI-NEXT:    s_cbranch_vccz .LBB12_10
 ; SI-NEXT:  ; %bb.9: ; %frem.else47
-; SI-NEXT:    s_brev_b32 s2, -2
-; SI-NEXT:    v_bfi_b32 v9, s2, 0, v1
+; SI-NEXT:    v_and_b32_e32 v9, 0x80000000, v1
 ; SI-NEXT:    v_cmp_eq_f32_e64 vcc, |v1|, |v5|
 ; SI-NEXT:    v_cndmask_b32_e32 v9, v1, v9, vcc
 ; SI-NEXT:    s_mov_b64 vcc, exec
@@ -13323,8 +13254,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; SI-NEXT:    s_and_b64 vcc, exec, s[2:3]
 ; SI-NEXT:    s_cbranch_vccz .LBB12_18
 ; SI-NEXT:  ; %bb.17: ; %frem.else16
-; SI-NEXT:    s_brev_b32 s2, -2
-; SI-NEXT:    v_bfi_b32 v10, s2, 0, v2
+; SI-NEXT:    v_and_b32_e32 v10, 0x80000000, v2
 ; SI-NEXT:    v_cmp_eq_f32_e64 vcc, |v2|, |v6|
 ; SI-NEXT:    v_cndmask_b32_e32 v10, v2, v10, vcc
 ; SI-NEXT:    s_mov_b64 vcc, exec
@@ -13404,8 +13334,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; SI-NEXT:    s_and_b64 vcc, exec, s[2:3]
 ; SI-NEXT:    s_cbranch_vccz .LBB12_26
 ; SI-NEXT:  ; %bb.25: ; %frem.else
-; SI-NEXT:    s_brev_b32 s2, -2
-; SI-NEXT:    v_bfi_b32 v11, s2, 0, v3
+; SI-NEXT:    v_and_b32_e32 v11, 0x80000000, v3
 ; SI-NEXT:    v_cmp_eq_f32_e64 vcc, |v3|, |v7|
 ; SI-NEXT:    v_cndmask_b32_e32 v11, v3, v11, vcc
 ; SI-NEXT:    s_mov_b64 vcc, exec
@@ -13522,8 +13451,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; CI-NEXT:    s_and_b64 vcc, exec, s[2:3]
 ; CI-NEXT:    s_cbranch_vccz .LBB12_2
 ; CI-NEXT:  ; %bb.1: ; %frem.else78
-; CI-NEXT:    s_brev_b32 s2, -2
-; CI-NEXT:    v_bfi_b32 v8, s2, 0, v0
+; CI-NEXT:    v_and_b32_e32 v8, 0x80000000, v0
 ; CI-NEXT:    v_cmp_eq_f32_e64 vcc, |v0|, |v4|
 ; CI-NEXT:    v_cndmask_b32_e32 v8, v0, v8, vcc
 ; CI-NEXT:    s_cbranch_execz .LBB12_3
@@ -13590,8 +13518,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; CI-NEXT:    s_and_b64 vcc, exec, s[2:3]
 ; CI-NEXT:    s_cbranch_vccz .LBB12_10
 ; CI-NEXT:  ; %bb.9: ; %frem.else47
-; CI-NEXT:    s_brev_b32 s2, -2
-; CI-NEXT:    v_bfi_b32 v9, s2, 0, v1
+; CI-NEXT:    v_and_b32_e32 v9, 0x80000000, v1
 ; CI-NEXT:    v_cmp_eq_f32_e64 vcc, |v1|, |v5|
 ; CI-NEXT:    v_cndmask_b32_e32 v9, v1, v9, vcc
 ; CI-NEXT:    s_cbranch_execz .LBB12_11
@@ -13658,8 +13585,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; CI-NEXT:    s_and_b64 vcc, exec, s[2:3]
 ; CI-NEXT:    s_cbranch_vccz .LBB12_18
 ; CI-NEXT:  ; %bb.17: ; %frem.else16
-; CI-NEXT:    s_brev_b32 s2, -2
-; CI-NEXT:    v_bfi_b32 v10, s2, 0, v2
+; CI-NEXT:    v_and_b32_e32 v10, 0x80000000, v2
 ; CI-NEXT:    v_cmp_eq_f32_e64 vcc, |v2|, |v6|
 ; CI-NEXT:    v_cndmask_b32_e32 v10, v2, v10, vcc
 ; CI-NEXT:    s_cbranch_execz .LBB12_19
@@ -13726,8 +13652,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; CI-NEXT:    s_and_b64 vcc, exec, s[2:3]
 ; CI-NEXT:    s_cbranch_vccz .LBB12_26
 ; CI-NEXT:  ; %bb.25: ; %frem.else
-; CI-NEXT:    s_brev_b32 s2, -2
-; CI-NEXT:    v_bfi_b32 v11, s2, 0, v3
+; CI-NEXT:    v_and_b32_e32 v11, 0x80000000, v3
 ; CI-NEXT:    v_cmp_eq_f32_e64 vcc, |v3|, |v7|
 ; CI-NEXT:    v_cndmask_b32_e32 v11, v3, v11, vcc
 ; CI-NEXT:    s_cbranch_execz .LBB12_27
@@ -13831,8 +13756,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; VI-NEXT:    s_and_b64 vcc, exec, s[2:3]
 ; VI-NEXT:    s_cbranch_vccz .LBB12_2
 ; VI-NEXT:  ; %bb.1: ; %frem.else78
-; VI-NEXT:    s_brev_b32 s2, -2
-; VI-NEXT:    v_bfi_b32 v8, s2, 0, v0
+; VI-NEXT:    v_and_b32_e32 v8, 0x80000000, v0
 ; VI-NEXT:    v_cmp_eq_f32_e64 vcc, |v0|, |v4|
 ; VI-NEXT:    v_cndmask_b32_e32 v8, v0, v8, vcc
 ; VI-NEXT:    s_cbranch_execz .LBB12_3
@@ -13899,8 +13823,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; VI-NEXT:    s_and_b64 vcc, exec, s[2:3]
 ; VI-NEXT:    s_cbranch_vccz .LBB12_10
 ; VI-NEXT:  ; %bb.9: ; %frem.else47
-; VI-NEXT:    s_brev_b32 s2, -2
-; VI-NEXT:    v_bfi_b32 v9, s2, 0, v1
+; VI-NEXT:    v_and_b32_e32 v9, 0x80000000, v1
 ; VI-NEXT:    v_cmp_eq_f32_e64 vcc, |v1|, |v5|
 ; VI-NEXT:    v_cndmask_b32_e32 v9, v1, v9, vcc
 ; VI-NEXT:    s_cbranch_execz .LBB12_11
@@ -13967,8 +13890,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; VI-NEXT:    s_and_b64 vcc, exec, s[2:3]
 ; VI-NEXT:    s_cbranch_vccz .LBB12_18
 ; VI-NEXT:  ; %bb.17: ; %frem.else16
-; VI-NEXT:    s_brev_b32 s2, -2
-; VI-NEXT:    v_bfi_b32 v10, s2, 0, v2
+; VI-NEXT:    v_and_b32_e32 v10, 0x80000000, v2
 ; VI-NEXT:    v_cmp_eq_f32_e64 vcc, |v2|, |v6|
 ; VI-NEXT:    v_cndmask_b32_e32 v10, v2, v10, vcc
 ; VI-NEXT:    s_cbranch_execz .LBB12_19
@@ -14035,8 +13957,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; VI-NEXT:    s_and_b64 vcc, exec, s[2:3]
 ; VI-NEXT:    s_cbranch_vccz .LBB12_26
 ; VI-NEXT:  ; %bb.25: ; %frem.else
-; VI-NEXT:    s_brev_b32 s2, -2
-; VI-NEXT:    v_bfi_b32 v11, s2, 0, v3
+; VI-NEXT:    v_and_b32_e32 v11, 0x80000000, v3
 ; VI-NEXT:    v_cmp_eq_f32_e64 vcc, |v3|, |v7|
 ; VI-NEXT:    v_cndmask_b32_e32 v11, v3, v11, vcc
 ; VI-NEXT:    s_cbranch_execz .LBB12_27
@@ -14135,8 +14056,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; GFX9-NEXT:    s_and_b64 vcc, exec, s[2:3]
 ; GFX9-NEXT:    s_cbranch_vccz .LBB12_2
 ; GFX9-NEXT:  ; %bb.1: ; %frem.else78
-; GFX9-NEXT:    s_brev_b32 s2, -2
-; GFX9-NEXT:    v_bfi_b32 v8, s2, 0, v0
+; GFX9-NEXT:    v_and_b32_e32 v8, 0x80000000, v0
 ; GFX9-NEXT:    v_cmp_eq_f32_e64 vcc, |v0|, |v4|
 ; GFX9-NEXT:    v_cndmask_b32_e32 v8, v0, v8, vcc
 ; GFX9-NEXT:    s_cbranch_execz .LBB12_3
@@ -14203,8 +14123,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; GFX9-NEXT:    s_and_b64 vcc, exec, s[2:3]
 ; GFX9-NEXT:    s_cbranch_vccz .LBB12_10
 ; GFX9-NEXT:  ; %bb.9: ; %frem.else47
-; GFX9-NEXT:    s_brev_b32 s2, -2
-; GFX9-NEXT:    v_bfi_b32 v9, s2, 0, v1
+; GFX9-NEXT:    v_and_b32_e32 v9, 0x80000000, v1
 ; GFX9-NEXT:    v_cmp_eq_f32_e64 vcc, |v1|, |v5|
 ; GFX9-NEXT:    v_cndmask_b32_e32 v9, v1, v9, vcc
 ; GFX9-NEXT:    s_cbranch_execz .LBB12_11
@@ -14271,8 +14190,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; GFX9-NEXT:    s_and_b64 vcc, exec, s[2:3]
 ; GFX9-NEXT:    s_cbranch_vccz .LBB12_18
 ; GFX9-NEXT:  ; %bb.17: ; %frem.else16
-; GFX9-NEXT:    s_brev_b32 s2, -2
-; GFX9-NEXT:    v_bfi_b32 v10, s2, 0, v2
+; GFX9-NEXT:    v_and_b32_e32 v10, 0x80000000, v2
 ; GFX9-NEXT:    v_cmp_eq_f32_e64 vcc, |v2|, |v6|
 ; GFX9-NEXT:    v_cndmask_b32_e32 v10, v2, v10, vcc
 ; GFX9-NEXT:    s_cbranch_execz .LBB12_19
@@ -14339,8 +14257,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; GFX9-NEXT:    s_and_b64 vcc, exec, s[2:3]
 ; GFX9-NEXT:    s_cbranch_vccz .LBB12_26
 ; GFX9-NEXT:  ; %bb.25: ; %frem.else
-; GFX9-NEXT:    s_brev_b32 s2, -2
-; GFX9-NEXT:    v_bfi_b32 v11, s2, 0, v3
+; GFX9-NEXT:    v_and_b32_e32 v11, 0x80000000, v3
 ; GFX9-NEXT:    v_cmp_eq_f32_e64 vcc, |v3|, |v7|
 ; GFX9-NEXT:    v_cndmask_b32_e32 v11, v3, v11, vcc
 ; GFX9-NEXT:    s_cbranch_execz .LBB12_27
@@ -14440,7 +14357,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; GFX10-NEXT:    s_and_b32 vcc_lo, exec_lo, s2
 ; GFX10-NEXT:    s_cbranch_vccz .LBB12_2
 ; GFX10-NEXT:  ; %bb.1: ; %frem.else78
-; GFX10-NEXT:    v_bfi_b32 v8, 0x7fffffff, 0, v0
+; GFX10-NEXT:    v_and_b32_e32 v8, 0x80000000, v0
 ; GFX10-NEXT:    v_cmp_eq_f32_e64 vcc_lo, |v0|, |v4|
 ; GFX10-NEXT:    v_cndmask_b32_e32 v8, v0, v8, vcc_lo
 ; GFX10-NEXT:    s_cbranch_execz .LBB12_3
@@ -14509,7 +14426,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; GFX10-NEXT:    s_and_b32 vcc_lo, exec_lo, s2
 ; GFX10-NEXT:    s_cbranch_vccz .LBB12_10
 ; GFX10-NEXT:  ; %bb.9: ; %frem.else47
-; GFX10-NEXT:    v_bfi_b32 v9, 0x7fffffff, 0, v1
+; GFX10-NEXT:    v_and_b32_e32 v9, 0x80000000, v1
 ; GFX10-NEXT:    v_cmp_eq_f32_e64 vcc_lo, |v1|, |v5|
 ; GFX10-NEXT:    v_cndmask_b32_e32 v9, v1, v9, vcc_lo
 ; GFX10-NEXT:    s_cbranch_execz .LBB12_11
@@ -14578,7 +14495,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; GFX10-NEXT:    s_and_b32 vcc_lo, exec_lo, s2
 ; GFX10-NEXT:    s_cbranch_vccz .LBB12_18
 ; GFX10-NEXT:  ; %bb.17: ; %frem.else16
-; GFX10-NEXT:    v_bfi_b32 v10, 0x7fffffff, 0, v2
+; GFX10-NEXT:    v_and_b32_e32 v10, 0x80000000, v2
 ; GFX10-NEXT:    v_cmp_eq_f32_e64 vcc_lo, |v2|, |v6|
 ; GFX10-NEXT:    v_cndmask_b32_e32 v10, v2, v10, vcc_lo
 ; GFX10-NEXT:    s_cbranch_execz .LBB12_19
@@ -14647,7 +14564,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; GFX10-NEXT:    s_and_b32 vcc_lo, exec_lo, s2
 ; GFX10-NEXT:    s_cbranch_vccz .LBB12_26
 ; GFX10-NEXT:  ; %bb.25: ; %frem.else
-; GFX10-NEXT:    v_bfi_b32 v11, 0x7fffffff, 0, v3
+; GFX10-NEXT:    v_and_b32_e32 v11, 0x80000000, v3
 ; GFX10-NEXT:    v_cmp_eq_f32_e64 vcc_lo, |v3|, |v7|
 ; GFX10-NEXT:    v_cndmask_b32_e32 v11, v3, v11, vcc_lo
 ; GFX10-NEXT:    s_cbranch_execz .LBB12_27
@@ -14747,7 +14664,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; GFX11-NEXT:    s_and_b32 vcc_lo, exec_lo, s2
 ; GFX11-NEXT:    s_cbranch_vccz .LBB12_2
 ; GFX11-NEXT:  ; %bb.1: ; %frem.else78
-; GFX11-NEXT:    v_bfi_b32 v8, 0x7fffffff, 0, v0
+; GFX11-NEXT:    v_and_b32_e32 v8, 0x80000000, v0
 ; GFX11-NEXT:    v_cmp_eq_f32_e64 vcc_lo, |v0|, |v4|
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX11-NEXT:    v_cndmask_b32_e32 v8, v0, v8, vcc_lo
@@ -14836,7 +14753,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; GFX11-NEXT:    s_and_b32 vcc_lo, exec_lo, s2
 ; GFX11-NEXT:    s_cbranch_vccz .LBB12_10
 ; GFX11-NEXT:  ; %bb.9: ; %frem.else47
-; GFX11-NEXT:    v_bfi_b32 v9, 0x7fffffff, 0, v1
+; GFX11-NEXT:    v_and_b32_e32 v9, 0x80000000, v1
 ; GFX11-NEXT:    v_cmp_eq_f32_e64 vcc_lo, |v1|, |v5|
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX11-NEXT:    v_cndmask_b32_e32 v9, v1, v9, vcc_lo
@@ -14925,7 +14842,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; GFX11-NEXT:    s_and_b32 vcc_lo, exec_lo, s2
 ; GFX11-NEXT:    s_cbranch_vccz .LBB12_18
 ; GFX11-NEXT:  ; %bb.17: ; %frem.else16
-; GFX11-NEXT:    v_bfi_b32 v10, 0x7fffffff, 0, v2
+; GFX11-NEXT:    v_and_b32_e32 v10, 0x80000000, v2
 ; GFX11-NEXT:    v_cmp_eq_f32_e64 vcc_lo, |v2|, |v6|
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX11-NEXT:    v_cndmask_b32_e32 v10, v2, v10, vcc_lo
@@ -15014,7 +14931,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; GFX11-NEXT:    s_and_b32 vcc_lo, exec_lo, s2
 ; GFX11-NEXT:    s_cbranch_vccz .LBB12_26
 ; GFX11-NEXT:  ; %bb.25: ; %frem.else
-; GFX11-NEXT:    v_bfi_b32 v11, 0x7fffffff, 0, v3
+; GFX11-NEXT:    v_and_b32_e32 v11, 0x80000000, v3
 ; GFX11-NEXT:    v_cmp_eq_f32_e64 vcc_lo, |v3|, |v7|
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX11-NEXT:    v_cndmask_b32_e32 v11, v3, v11, vcc_lo
@@ -15139,13 +15056,14 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; GFX1150-NEXT:    v_readfirstlane_b32 s4, v2
 ; GFX1150-NEXT:    v_readfirstlane_b32 s3, v3
 ; GFX1150-NEXT:    v_readfirstlane_b32 s2, v4
-; GFX1150-NEXT:    s_and_b32 s12, s6, 0x7fffffff
+; GFX1150-NEXT:    s_and_b32 s11, s6, 0x7fffffff
 ; GFX1150-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1150-NEXT:    s_cmp_ngt_f32 s5, s12
+; GFX1150-NEXT:    s_cmp_ngt_f32 s5, s11
 ; GFX1150-NEXT:    s_cbranch_scc0 .LBB12_2
 ; GFX1150-NEXT:  ; %bb.1: ; %frem.else78
-; GFX1150-NEXT:    s_cmp_eq_f32 s5, s12
-; GFX1150-NEXT:    v_bfi_b32 v0, 0x7fffffff, 0, s8
+; GFX1150-NEXT:    s_and_b32 s12, 0x80000000, s8
+; GFX1150-NEXT:    s_cmp_eq_f32 s5, s11
+; GFX1150-NEXT:    v_mov_b32_e32 v0, s12
 ; GFX1150-NEXT:    s_cselect_b32 vcc_lo, -1, 0
 ; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1150-NEXT:    v_cndmask_b32_e32 v0, s8, v0, vcc_lo
@@ -15233,13 +15151,14 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; GFX1150-NEXT:    v_bfi_b32 v0, 0x7fffffff, v0, s8
 ; GFX1150-NEXT:  .LBB12_8:
 ; GFX1150-NEXT:    s_and_b32 s8, s10, 0x7fffffff
-; GFX1150-NEXT:    s_and_b32 s12, s4, 0x7fffffff
+; GFX1150-NEXT:    s_and_b32 s11, s4, 0x7fffffff
 ; GFX1150-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1150-NEXT:    s_cmp_ngt_f32 s8, s12
+; GFX1150-NEXT:    s_cmp_ngt_f32 s8, s11
 ; GFX1150-NEXT:    s_cbranch_scc0 .LBB12_10
 ; GFX1150-NEXT:  ; %bb.9: ; %frem.else47
-; GFX1150-NEXT:    s_cmp_eq_f32 s8, s12
-; GFX1150-NEXT:    v_bfi_b32 v1, 0x7fffffff, 0, s10
+; GFX1150-NEXT:    s_and_b32 s12, 0x80000000, s10
+; GFX1150-NEXT:    s_cmp_eq_f32 s8, s11
+; GFX1150-NEXT:    v_mov_b32_e32 v1, s12
 ; GFX1150-NEXT:    s_cselect_b32 vcc_lo, -1, 0
 ; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1150-NEXT:    v_cndmask_b32_e32 v1, s10, v1, vcc_lo
@@ -15327,13 +15246,14 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; GFX1150-NEXT:    v_bfi_b32 v1, 0x7fffffff, v1, s10
 ; GFX1150-NEXT:  .LBB12_16:
 ; GFX1150-NEXT:    s_and_b32 s10, s9, 0x7fffffff
-; GFX1150-NEXT:    s_and_b32 s12, s3, 0x7fffffff
+; GFX1150-NEXT:    s_and_b32 s11, s3, 0x7fffffff
 ; GFX1150-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1150-NEXT:    s_cmp_ngt_f32 s10, s12
+; GFX1150-NEXT:    s_cmp_ngt_f32 s10, s11
 ; GFX1150-NEXT:    s_cbranch_scc0 .LBB12_18
 ; GFX1150-NEXT:  ; %bb.17: ; %frem.else16
-; GFX1150-NEXT:    s_cmp_eq_f32 s10, s12
-; GFX1150-NEXT:    v_bfi_b32 v2, 0x7fffffff, 0, s9
+; GFX1150-NEXT:    s_and_b32 s12, 0x80000000, s9
+; GFX1150-NEXT:    s_cmp_eq_f32 s10, s11
+; GFX1150-NEXT:    v_mov_b32_e32 v2, s12
 ; GFX1150-NEXT:    s_cselect_b32 vcc_lo, -1, 0
 ; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1150-NEXT:    v_cndmask_b32_e32 v2, s9, v2, vcc_lo
@@ -15421,13 +15341,14 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; GFX1150-NEXT:    v_bfi_b32 v2, 0x7fffffff, v2, s9
 ; GFX1150-NEXT:  .LBB12_24:
 ; GFX1150-NEXT:    s_and_b32 s9, s7, 0x7fffffff
-; GFX1150-NEXT:    s_and_b32 s12, s2, 0x7fffffff
+; GFX1150-NEXT:    s_and_b32 s11, s2, 0x7fffffff
 ; GFX1150-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1150-NEXT:    s_cmp_ngt_f32 s9, s12
+; GFX1150-NEXT:    s_cmp_ngt_f32 s9, s11
 ; GFX1150-NEXT:    s_cbranch_scc0 .LBB12_26
 ; GFX1150-NEXT:  ; %bb.25: ; %frem.else
-; GFX1150-NEXT:    s_cmp_eq_f32 s9, s12
-; GFX1150-NEXT:    v_bfi_b32 v3, 0x7fffffff, 0, s7
+; GFX1150-NEXT:    s_and_b32 s12, 0x80000000, s7
+; GFX1150-NEXT:    s_cmp_eq_f32 s9, s11
+; GFX1150-NEXT:    v_mov_b32_e32 v3, s12
 ; GFX1150-NEXT:    s_cselect_b32 vcc_lo, -1, 0
 ; GFX1150-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1150-NEXT:    v_cndmask_b32_e32 v3, s7, v3, vcc_lo
@@ -15566,21 +15487,21 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; GFX1200-NEXT:    v_readfirstlane_b32 s4, v2
 ; GFX1200-NEXT:    v_readfirstlane_b32 s3, v3
 ; GFX1200-NEXT:    v_readfirstlane_b32 s2, v4
-; GFX1200-NEXT:    s_and_b32 s12, s6, 0x7fffffff
+; GFX1200-NEXT:    s_and_b32 s11, s6, 0x7fffffff
 ; GFX1200-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1200-NEXT:    s_cmp_ngt_f32 s5, s12
-; GFX1200-NEXT:    s_cbranch_scc0 .LBB12_2
+; GFX1200-NEXT:    s_cmp_ngt_f32 s5, s11
+; GFX1200-NEXT:    s_cbranch_scc0 .LBB12_3
 ; GFX1200-NEXT:  ; %bb.1: ; %frem.else78
-; GFX1200-NEXT:    s_cmp_eq_f32 s5, s12
-; GFX1200-NEXT:    v_bfi_b32 v0, 0x7fffffff, 0, s8
-; GFX1200-NEXT:    s_cselect_b32 vcc_lo, -1, 0
-; GFX1200-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1200-NEXT:    v_cndmask_b32_e32 v0, s8, v0, vcc_lo
-; GFX1200-NEXT:    s_cbranch_execz .LBB12_3
-; GFX1200-NEXT:    s_branch .LBB12_8
-; GFX1200-NEXT:  .LBB12_2:
-; GFX1200-NEXT:    ; implicit-def: $vgpr0
-; GFX1200-NEXT:  .LBB12_3: ; %frem.compute77
+; GFX1200-NEXT:    s_and_b32 s12, 0x80000000, s8
+; GFX1200-NEXT:    s_cmp_eq_f32 s5, s11
+; GFX1200-NEXT:    s_cselect_b32 s11, s12, s8
+; GFX1200-NEXT:    s_cbranch_execz .LBB12_4
+; GFX1200-NEXT:  ; %bb.2:
+; GFX1200-NEXT:    v_mov_b32_e32 v0, s11
+; GFX1200-NEXT:    s_branch .LBB12_9
+; GFX1200-NEXT:  .LBB12_3:
+; GFX1200-NEXT:    ; implicit-def: $sgpr11
+; GFX1200-NEXT:  .LBB12_4: ; %frem.compute77
 ; GFX1200-NEXT:    v_frexp_mant_f32_e64 v1, |s6|
 ; GFX1200-NEXT:    v_frexp_mant_f32_e64 v0, |s8|
 ; GFX1200-NEXT:    v_frexp_exp_i32_f32_e32 v3, s8
@@ -15610,17 +15531,16 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; GFX1200-NEXT:    v_fmac_f32_e32 v7, v8, v6
 ; GFX1200-NEXT:    v_fma_f32 v3, -v5, v7, v3
 ; GFX1200-NEXT:    s_denorm_mode 12
-; GFX1200-NEXT:    s_wait_alu depctr_va_vcc(0)
 ; GFX1200-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
 ; GFX1200-NEXT:    v_div_fmas_f32 v3, v3, v6, v7
 ; GFX1200-NEXT:    v_cmp_gt_i32_e32 vcc_lo, 13, v4
 ; GFX1200-NEXT:    v_div_fixup_f32 v3, v3, v1, 1.0
-; GFX1200-NEXT:    s_cbranch_vccnz .LBB12_7
-; GFX1200-NEXT:  ; %bb.4: ; %frem.loop_body85.preheader
+; GFX1200-NEXT:    s_cbranch_vccnz .LBB12_8
+; GFX1200-NEXT:  ; %bb.5: ; %frem.loop_body85.preheader
 ; GFX1200-NEXT:    s_sub_co_i32 s11, s11, s12
 ; GFX1200-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX1200-NEXT:    s_add_co_i32 s11, s11, 12
-; GFX1200-NEXT:  .LBB12_5: ; %frem.loop_body85
+; GFX1200-NEXT:  .LBB12_6: ; %frem.loop_body85
 ; GFX1200-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1200-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
 ; GFX1200-NEXT:    v_mov_b32_e32 v5, v2
@@ -15639,11 +15559,11 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; GFX1200-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc_lo
 ; GFX1200-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1200-NEXT:    v_ldexp_f32 v2, v2, 12
-; GFX1200-NEXT:    s_cbranch_scc1 .LBB12_5
-; GFX1200-NEXT:  ; %bb.6: ; %Flow125
+; GFX1200-NEXT:    s_cbranch_scc1 .LBB12_6
+; GFX1200-NEXT:  ; %bb.7: ; %Flow125
 ; GFX1200-NEXT:    v_mov_b32_e32 v4, s11
 ; GFX1200-NEXT:    v_mov_b32_e32 v2, v5
-; GFX1200-NEXT:  .LBB12_7: ; %frem.loop_exit86
+; GFX1200-NEXT:  .LBB12_8: ; %frem.loop_exit86
 ; GFX1200-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1200-NEXT:    v_add_nc_u32_e32 v4, -11, v4
 ; GFX1200-NEXT:    v_ldexp_f32 v2, v2, v4
@@ -15661,24 +15581,24 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; GFX1200-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1200-NEXT:    v_ldexp_f32 v0, v1, v0
 ; GFX1200-NEXT:    v_bfi_b32 v0, 0x7fffffff, v0, s8
-; GFX1200-NEXT:  .LBB12_8:
+; GFX1200-NEXT:  .LBB12_9:
 ; GFX1200-NEXT:    s_and_b32 s8, s10, 0x7fffffff
-; GFX1200-NEXT:    s_and_b32 s12, s4, 0x7fffffff
+; GFX1200-NEXT:    s_and_b32 s11, s4, 0x7fffffff
 ; GFX1200-NEXT:    s_wait_alu depctr_sa_sdst(0)
-; GFX1200-NEXT:    s_cmp_ngt_f32 s8, s12
-; GFX1200-NEXT:    s_cbranch_scc0 .LBB12_10
-; GFX1200-NEXT:  ; %bb.9: ; %frem.else47
-; GFX1200-NEXT:    s_cmp_eq_f32 s8, s12
-; GFX1200-NEXT:    v_bfi_b32 v1, 0x7fffffff, 0, s10
-; GFX1200-NEXT:    s_cselect_b32 vcc_lo, -1, 0
+; GFX1200-NEXT:    s_cmp_ngt_f32 s8, s11
+; GFX1200-NEXT:    s_cbranch_scc0 .LBB12_12
+; GFX1200-NEXT:  ; %bb.10: ; %frem.else47
+; GFX1200-NEXT:    s_and_b32 s12, 0x80000000, s10
+; GFX1200-NEXT:    s_cmp_eq_f32 s8, s11
+; GFX1200-NEXT:    s_cselect_b32 s11, s12, s10
+; GFX1200-NEXT:    s_cbranch_execz .LBB12_13
+; GFX1200-NEXT:  ; %bb.11:
 ; GFX1200-NEXT:    s_wait_alu depctr_sa_sdst(0)
-; GFX1200-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1200-NEXT:    v_cndmask_b32_e32 v1, s10, v1, vcc_lo
-; GFX1200-NEXT:    s_cbranch_execz .LBB12_11
-; GFX1200-NEXT:    s_branch .LBB12_16
-; GFX1200-NEXT:  .LBB12_10:
-; GFX1200-NEXT:    ; implicit-def: $vgpr1
-; GFX1200-NEXT:  .LBB12_11: ; %frem.compute46
+; GFX1200-NEXT:    v_mov_b32_e32 v1, s11
+; GFX1200-NEXT:    s_branch .LBB12_18
+; GFX1200-NEXT:  .LBB12_12:
+; GFX1200-NEXT:    ; implicit-def: $sgpr11
+; GFX1200-NEXT:  .LBB12_13: ; %frem.compute46
 ; GFX1200-NEXT:    v_frexp_mant_f32_e64 v2, |s4|
 ; GFX1200-NEXT:    v_frexp_mant_f32_e64 v1, |s10|
 ; GFX1200-NEXT:    v_frexp_exp_i32_f32_e32 v4, s10
@@ -15713,12 +15633,13 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; GFX1200-NEXT:    v_div_fmas_f32 v4, v4, v7, v8
 ; GFX1200-NEXT:    v_cmp_gt_i32_e32 vcc_lo, 13, v5
 ; GFX1200-NEXT:    v_div_fixup_f32 v4, v4, v2, 1.0
-; GFX1200-NEXT:    s_cbranch_vccnz .LBB12_15
-; GFX1200-NEXT:  ; %bb.12: ; %frem.loop_body54.preheader
+; GFX1200-NEXT:    s_cbranch_vccnz .LBB12_17
+; GFX1200-NEXT:  ; %bb.14: ; %frem.loop_body54.preheader
+; GFX1200-NEXT:    s_wait_alu depctr_sa_sdst(0)
 ; GFX1200-NEXT:    s_sub_co_i32 s11, s11, s12
 ; GFX1200-NEXT:    s_wait_alu depctr_sa_sdst(0)
 ; GFX1200-NEXT:    s_add_co_i32 s11, s11, 12
-; GFX1200-NEXT:  .LBB12_13: ; %frem.loop_body54
+; GFX1200-NEXT:  .LBB12_15: ; %frem.loop_body54
 ; GFX1200-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1200-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1200-NEXT:    v_mov_b32_e32 v6, v3
@@ -15738,11 +15659,11 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; GFX1200-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1200-NEXT:    v_cndmask_b32_e32 v3, v3, v5, vcc_lo
 ; GFX1200-NEXT:    v_ldexp_f32 v3, v3, 12
-; GFX1200-NEXT:    s_cbranch_scc1 .LBB12_13
-; GFX1200-NEXT:  ; %bb.14: ; %Flow121
+; GFX1200-NEXT:    s_cbranch_scc1 .LBB12_15
+; GFX1200-NEXT:  ; %bb.16: ; %Flow121
 ; GFX1200-NEXT:    v_mov_b32_e32 v5, s11
 ; GFX1200-NEXT:    v_mov_b32_e32 v3, v6
-; GFX1200-NEXT:  .LBB12_15: ; %frem.loop_exit55
+; GFX1200-NEXT:  .LBB12_17: ; %frem.loop_exit55
 ; GFX1200-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1200-NEXT:    v_add_nc_u32_e32 v5, -11, v5
 ; GFX1200-NEXT:    v_ldexp_f32 v3, v3, v5
@@ -15760,24 +15681,24 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; GFX1200-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1200-NEXT:    v_ldexp_f32 v1, v2, v1
 ; GFX1200-NEXT:    v_bfi_b32 v1, 0x7fffffff, v1, s10
-; GFX1200-NEXT:  .LBB12_16:
+; GFX1200-NEXT:  .LBB12_18:
 ; GFX1200-NEXT:    s_and_b32 s10, s9, 0x7fffffff
-; GFX1200-NEXT:    s_and_b32 s12, s3, 0x7fffffff
+; GFX1200-NEXT:    s_and_b32 s11, s3, 0x7fffffff
 ; GFX1200-NEXT:    s_wait_alu depctr_sa_sdst(0)
-; GFX1200-NEXT:    s_cmp_ngt_f32 s10, s12
-; GFX1200-NEXT:    s_cbranch_scc0 .LBB12_18
-; GFX1200-NEXT:  ; %bb.17: ; %frem.else16
-; GFX1200-NEXT:    s_cmp_eq_f32 s10, s12
-; GFX1200-NEXT:    v_bfi_b32 v2, 0x7fffffff, 0, s9
-; GFX1200-NEXT:    s_cselect_b32 vcc_lo, -1, 0
+; GFX1200-NEXT:    s_cmp_ngt_f32 s10, s11
+; GFX1200-NEXT:    s_cbranch_scc0 .LBB12_21
+; GFX1200-NEXT:  ; %bb.19: ; %frem.else16
+; GFX1200-NEXT:    s_and_b32 s12, 0x80000000, s9
+; GFX1200-NEXT:    s_cmp_eq_f32 s10, s11
+; GFX1200-NEXT:    s_cselect_b32 s11, s12, s9
+; GFX1200-NEXT:    s_cbranch_execz .LBB12_22
+; GFX1200-NEXT:  ; %bb.20:
 ; GFX1200-NEXT:    s_wait_alu depctr_sa_sdst(0)
-; GFX1200-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1200-NEXT:    v_cndmask_b32_e32 v2, s9, v2, vcc_lo
-; GFX1200-NEXT:    s_cbranch_execz .LBB12_19
-; GFX1200-NEXT:    s_branch .LBB12_24
-; GFX1200-NEXT:  .LBB12_18:
-; GFX1200-NEXT:    ; implicit-def: $vgpr2
-; GFX1200-NEXT:  .LBB12_19: ; %frem.compute15
+; GFX1200-NEXT:    v_mov_b32_e32 v2, s11
+; GFX1200-NEXT:    s_branch .LBB12_27
+; GFX1200-NEXT:  .LBB12_21:
+; GFX1200-NEXT:    ; implicit-def: $sgpr11
+; GFX1200-NEXT:  .LBB12_22: ; %frem.compute15
 ; GFX1200-NEXT:    v_frexp_mant_f32_e64 v3, |s3|
 ; GFX1200-NEXT:    v_frexp_mant_f32_e64 v2, |s9|
 ; GFX1200-NEXT:    v_frexp_exp_i32_f32_e32 v5, s9
@@ -15812,12 +15733,13 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; GFX1200-NEXT:    v_div_fmas_f32 v5, v5, v8, v9
 ; GFX1200-NEXT:    v_cmp_gt_i32_e32 vcc_lo, 13, v6
 ; GFX1200-NEXT:    v_div_fixup_f32 v5, v5, v3, 1.0
-; GFX1200-NEXT:    s_cbranch_vccnz .LBB12_23
-; GFX1200-NEXT:  ; %bb.20: ; %frem.loop_body23.preheader
+; GFX1200-NEXT:    s_cbranch_vccnz .LBB12_26
+; GFX1200-NEXT:  ; %bb.23: ; %frem.loop_body23.preheader
+; GFX1200-NEXT:    s_wait_alu depctr_sa_sdst(0)
 ; GFX1200-NEXT:    s_sub_co_i32 s11, s11, s12
 ; GFX1200-NEXT:    s_wait_alu depctr_sa_sdst(0)
 ; GFX1200-NEXT:    s_add_co_i32 s11, s11, 12
-; GFX1200-NEXT:  .LBB12_21: ; %frem.loop_body23
+; GFX1200-NEXT:  .LBB12_24: ; %frem.loop_body23
 ; GFX1200-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1200-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1200-NEXT:    v_mov_b32_e32 v7, v4
@@ -15837,11 +15759,11 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; GFX1200-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1200-NEXT:    v_cndmask_b32_e32 v4, v4, v6, vcc_lo
 ; GFX1200-NEXT:    v_ldexp_f32 v4, v4, 12
-; GFX1200-NEXT:    s_cbranch_scc1 .LBB12_21
-; GFX1200-NEXT:  ; %bb.22: ; %Flow117
+; GFX1200-NEXT:    s_cbranch_scc1 .LBB12_24
+; GFX1200-NEXT:  ; %bb.25: ; %Flow117
 ; GFX1200-NEXT:    v_mov_b32_e32 v6, s11
 ; GFX1200-NEXT:    v_mov_b32_e32 v4, v7
-; GFX1200-NEXT:  .LBB12_23: ; %frem.loop_exit24
+; GFX1200-NEXT:  .LBB12_26: ; %frem.loop_exit24
 ; GFX1200-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1200-NEXT:    v_add_nc_u32_e32 v6, -11, v6
 ; GFX1200-NEXT:    v_ldexp_f32 v4, v4, v6
@@ -15859,24 +15781,24 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; GFX1200-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1200-NEXT:    v_ldexp_f32 v2, v3, v2
 ; GFX1200-NEXT:    v_bfi_b32 v2, 0x7fffffff, v2, s9
-; GFX1200-NEXT:  .LBB12_24:
+; GFX1200-NEXT:  .LBB12_27:
 ; GFX1200-NEXT:    s_and_b32 s9, s7, 0x7fffffff
-; GFX1200-NEXT:    s_and_b32 s12, s2, 0x7fffffff
+; GFX1200-NEXT:    s_and_b32 s11, s2, 0x7fffffff
 ; GFX1200-NEXT:    s_wait_alu depctr_sa_sdst(0)
-; GFX1200-NEXT:    s_cmp_ngt_f32 s9, s12
-; GFX1200-NEXT:    s_cbranch_scc0 .LBB12_26
-; GFX1200-NEXT:  ; %bb.25: ; %frem.else
-; GFX1200-NEXT:    s_cmp_eq_f32 s9, s12
-; GFX1200-NEXT:    v_bfi_b32 v3, 0x7fffffff, 0, s7
-; GFX1200-NEXT:    s_cselect_b32 vcc_lo, -1, 0
+; GFX1200-NEXT:    s_cmp_ngt_f32 s9, s11
+; GFX1200-NEXT:    s_cbranch_scc0 .LBB12_30
+; GFX1200-NEXT:  ; %bb.28: ; %frem.else
+; GFX1200-NEXT:    s_and_b32 s12, 0x80000000, s7
+; GFX1200-NEXT:    s_cmp_eq_f32 s9, s11
+; GFX1200-NEXT:    s_cselect_b32 s11, s12, s7
+; GFX1200-NEXT:    s_cbranch_execz .LBB12_31
+; GFX1200-NEXT:  ; %bb.29:
 ; GFX1200-NEXT:    s_wait_alu depctr_sa_sdst(0)
-; GFX1200-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX1200-NEXT:    v_cndmask_b32_e32 v3, s7, v3, vcc_lo
-; GFX1200-NEXT:    s_cbranch_execz .LBB12_27
-; GFX1200-NEXT:    s_branch .LBB12_32
-; GFX1200-NEXT:  .LBB12_26:
-; GFX1200-NEXT:    ; implicit-def: $vgpr3
-; GFX1200-NEXT:  .LBB12_27: ; %frem.compute
+; GFX1200-NEXT:    v_mov_b32_e32 v3, s11
+; GFX1200-NEXT:    s_branch .LBB12_36
+; GFX1200-NEXT:  .LBB12_30:
+; GFX1200-NEXT:    ; implicit-def: $sgpr11
+; GFX1200-NEXT:  .LBB12_31: ; %frem.compute
 ; GFX1200-NEXT:    v_frexp_mant_f32_e64 v4, |s2|
 ; GFX1200-NEXT:    v_frexp_mant_f32_e64 v3, |s7|
 ; GFX1200-NEXT:    v_frexp_exp_i32_f32_e32 v6, s7
@@ -15911,12 +15833,13 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; GFX1200-NEXT:    v_div_fmas_f32 v6, v6, v9, v10
 ; GFX1200-NEXT:    v_cmp_gt_i32_e32 vcc_lo, 13, v7
 ; GFX1200-NEXT:    v_div_fixup_f32 v6, v6, v4, 1.0
-; GFX1200-NEXT:    s_cbranch_vccnz .LBB12_31
-; GFX1200-NEXT:  ; %bb.28: ; %frem.loop_body.preheader
+; GFX1200-NEXT:    s_cbranch_vccnz .LBB12_35
+; GFX1200-NEXT:  ; %bb.32: ; %frem.loop_body.preheader
+; GFX1200-NEXT:    s_wait_alu depctr_sa_sdst(0)
 ; GFX1200-NEXT:    s_sub_co_i32 s11, s11, s12
 ; GFX1200-NEXT:    s_wait_alu depctr_sa_sdst(0)
 ; GFX1200-NEXT:    s_add_co_i32 s11, s11, 12
-; GFX1200-NEXT:  .LBB12_29: ; %frem.loop_body
+; GFX1200-NEXT:  .LBB12_33: ; %frem.loop_body
 ; GFX1200-NEXT:    ; =>This Inner Loop Header: Depth=1
 ; GFX1200-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1200-NEXT:    v_mov_b32_e32 v8, v5
@@ -15936,11 +15859,11 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; GFX1200-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1200-NEXT:    v_cndmask_b32_e32 v5, v5, v7, vcc_lo
 ; GFX1200-NEXT:    v_ldexp_f32 v5, v5, 12
-; GFX1200-NEXT:    s_cbranch_scc1 .LBB12_29
-; GFX1200-NEXT:  ; %bb.30: ; %Flow
+; GFX1200-NEXT:    s_cbranch_scc1 .LBB12_33
+; GFX1200-NEXT:  ; %bb.34: ; %Flow
 ; GFX1200-NEXT:    v_mov_b32_e32 v7, s11
 ; GFX1200-NEXT:    v_mov_b32_e32 v5, v8
-; GFX1200-NEXT:  .LBB12_31: ; %frem.loop_exit
+; GFX1200-NEXT:  .LBB12_35: ; %frem.loop_exit
 ; GFX1200-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1200-NEXT:    v_add_nc_u32_e32 v7, -11, v7
 ; GFX1200-NEXT:    v_ldexp_f32 v5, v5, v7
@@ -15958,7 +15881,7 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; GFX1200-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1200-NEXT:    v_ldexp_f32 v3, v4, v3
 ; GFX1200-NEXT:    v_bfi_b32 v3, 0x7fffffff, v3, s7
-; GFX1200-NEXT:  .LBB12_32: ; %Flow116
+; GFX1200-NEXT:  .LBB12_36: ; %Flow116
 ; GFX1200-NEXT:    s_cmp_lg_f32 s6, 0
 ; GFX1200-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX1200-NEXT:    s_cselect_b32 s6, -1, 0
diff --git a/llvm/test/CodeGen/AMDGPU/wave32.ll b/llvm/test/CodeGen/AMDGPU/wave32.ll
index 0ae31be32ed51..8e7dc4072c012 100644
--- a/llvm/test/CodeGen/AMDGPU/wave32.ll
+++ b/llvm/test/CodeGen/AMDGPU/wave32.ll
@@ -2558,7 +2558,7 @@ define amdgpu_kernel void @fcmp64(float %n, float %s) {
 ; GFX1032-NEXT:    s_and_saveexec_b32 s2, s1
 ; GFX1032-NEXT:    s_xor_b32 s1, exec_lo, s2
 ; GFX1032-NEXT:  ; %bb.1: ; %frem.else
-; GFX1032-NEXT:    v_bfi_b32 v1, 0x7fffffff, 0, v0
+; GFX1032-NEXT:    v_and_b32_e32 v1, 0x80000000, v0
 ; GFX1032-NEXT:    v_cmp_eq_f32_e64 vcc_lo, v0, |s0|
 ; GFX1032-NEXT:    v_cndmask_b32_e32 v1, v0, v1, vcc_lo
 ; GFX1032-NEXT:    ; implicit-def: $vgpr0
@@ -2650,7 +2650,7 @@ define amdgpu_kernel void @fcmp64(float %n, float %s) {
 ; GFX1064-NEXT:    s_and_saveexec_b64 s[2:3], s[0:1]
 ; GFX1064-NEXT:    s_xor_b64 s[0:1], exec, s[2:3]
 ; GFX1064-NEXT:  ; %bb.1: ; %frem.else
-; GFX1064-NEXT:    v_bfi_b32 v1, 0x7fffffff, 0, v0
+; GFX1064-NEXT:    v_and_b32_e32 v1, 0x80000000, v0
 ; GFX1064-NEXT:    v_cmp_eq_f32_e64 vcc, v0, |s6|
 ; GFX1064-NEXT:    v_cndmask_b32_e32 v1, v0, v1, vcc
 ; GFX1064-NEXT:    ; implicit-def: $vgpr0
@@ -2864,7 +2864,7 @@ define amdgpu_kernel void @fcmp32(float %n, float %s) {
 ; GFX1032-NEXT:    s_and_saveexec_b32 s2, s1
 ; GFX1032-NEXT:    s_xor_b32 s1, exec_lo, s2
 ; GFX1032-NEXT:  ; %bb.1: ; %frem.else
-; GFX1032-NEXT:    v_bfi_b32 v1, 0x7fffffff, 0, v0
+; GFX1032-NEXT:    v_and_b32_e32 v1, 0x80000000, v0
 ; GFX1032-NEXT:    v_cmp_eq_f32_e64 vcc_lo, v0, |s0|
 ; GFX1032-NEXT:    v_cndmask_b32_e32 v1, v0, v1, vcc_lo
 ; GFX1032-NEXT:    ; implicit-def: $vgpr0
@@ -2956,7 +2956,7 @@ define amdgpu_kernel void @fcmp32(float %n, float %s) {
 ; GFX1064-NEXT:    s_and_saveexec_b64 s[2:3], s[0:1]
 ; GFX1064-NEXT:    s_xor_b64 s[0:1], exec, s[2:3]
 ; GFX1064-NEXT:  ; %bb.1: ; %frem.else
-; GFX1064-NEXT:    v_bfi_b32 v1, 0x7fffffff, 0, v0
+; GFX1064-NEXT:    v_and_b32_e32 v1, 0x80000000, v0
 ; GFX1064-NEXT:    v_cmp_eq_f32_e64 vcc, v0, |s6|
 ; GFX1064-NEXT:    v_cndmask_b32_e32 v1, v0, v1, vcc
 ; GFX1064-NEXT:    ; implicit-def: $vgpr0



More information about the llvm-branch-commits mailing list