[clang-tools-extra] [clang] [llvm] [AMDGPU] Revert "Preliminary patch for divergence driven instruction selection. Operands Folding 1." (PR #71710)

Jay Foad via cfe-commits cfe-commits at lists.llvm.org
Mon Nov 13 05:39:58 PST 2023


https://github.com/jayfoad updated https://github.com/llvm/llvm-project/pull/71710

>From a40781b9a50d87647483929f62d4d9b34c3ac299 Mon Sep 17 00:00:00 2001
From: Jay Foad <jay.foad at amd.com>
Date: Thu, 9 Nov 2023 14:03:30 +0000
Subject: [PATCH] Revert "[AMDGPU] Preliminary patch for divergence driven
 instruction selection. Operands Folding 1."

This reverts commit 201f892b3b597f24287ab6a712a286e25a45a7d9.
---
 llvm/lib/Target/AMDGPU/SIFoldOperands.cpp     |   18 -
 .../CodeGen/AMDGPU/GlobalISel/add.v2i16.ll    |   13 +-
 .../CodeGen/AMDGPU/GlobalISel/fdiv.f32.ll     |  462 ++--
 llvm/test/CodeGen/AMDGPU/GlobalISel/fpow.ll   |  169 +-
 llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll   |  597 +++--
 llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll   |  827 ++++---
 .../AMDGPU/GlobalISel/insertelement.i16.ll    |  492 ++---
 .../AMDGPU/GlobalISel/insertelement.i8.ll     |  364 ++--
 .../GlobalISel/llvm.amdgcn.intersect_ray.ll   |   24 +-
 .../AMDGPU/GlobalISel/llvm.amdgcn.sdot4.ll    |   22 +-
 .../AMDGPU/GlobalISel/llvm.amdgcn.udot4.ll    |   26 +-
 .../test/CodeGen/AMDGPU/GlobalISel/saddsat.ll |  666 +++---
 .../CodeGen/AMDGPU/GlobalISel/sdiv.i32.ll     |  211 +-
 .../CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll     | 1900 ++++++++--------
 .../test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll |  586 +++--
 .../AMDGPU/GlobalISel/select-to-fmin-fmax.ll  |   24 +-
 .../CodeGen/AMDGPU/GlobalISel/sext_inreg.ll   |    1 -
 .../CodeGen/AMDGPU/GlobalISel/srem.i32.ll     |  224 +-
 .../CodeGen/AMDGPU/GlobalISel/srem.i64.ll     | 1932 ++++++++---------
 .../test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll |  638 +++---
 .../CodeGen/AMDGPU/GlobalISel/sub.v2i16.ll    |    8 +-
 .../test/CodeGen/AMDGPU/GlobalISel/uaddsat.ll |  100 +-
 .../CodeGen/AMDGPU/GlobalISel/udiv.i32.ll     |   14 +-
 .../CodeGen/AMDGPU/GlobalISel/udiv.i64.ll     |  130 +-
 .../test/CodeGen/AMDGPU/GlobalISel/udivrem.ll |  348 ++-
 .../CodeGen/AMDGPU/GlobalISel/urem.i32.ll     |   47 +-
 .../CodeGen/AMDGPU/GlobalISel/urem.i64.ll     |  663 +++---
 .../test/CodeGen/AMDGPU/GlobalISel/usubsat.ll |  100 +-
 llvm/test/CodeGen/AMDGPU/clamp.ll             |   15 +-
 llvm/test/CodeGen/AMDGPU/ds-alignment.ll      |   47 +-
 llvm/test/CodeGen/AMDGPU/fma.f16.ll           |   84 +-
 llvm/test/CodeGen/AMDGPU/fmed3.ll             |  143 +-
 llvm/test/CodeGen/AMDGPU/fmul-to-ldexp.ll     |   44 +-
 .../CodeGen/AMDGPU/fold-cndmask-wave32.mir    |    1 -
 llvm/test/CodeGen/AMDGPU/fold-imm-copy.mir    |    3 +-
 .../AMDGPU/fold-int-pow2-with-fmul-or-fdiv.ll |   27 +-
 llvm/test/CodeGen/AMDGPU/fold-readlane.mir    |    3 +-
 llvm/test/CodeGen/AMDGPU/fptoui.f16.ll        |    8 +-
 llvm/test/CodeGen/AMDGPU/fsqrt.f32.ll         |  786 +++----
 .../CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx90a.ll |    2 +-
 llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.ll  |    6 +-
 llvm/test/CodeGen/AMDGPU/llvm.exp.ll          | 1190 +++++-----
 llvm/test/CodeGen/AMDGPU/llvm.exp2.ll         |  198 +-
 llvm/test/CodeGen/AMDGPU/llvm.frexp.ll        |  120 +-
 .../CodeGen/AMDGPU/llvm.is.fpclass.f16.ll     |  118 +-
 llvm/test/CodeGen/AMDGPU/llvm.log.ll          |  917 ++++----
 llvm/test/CodeGen/AMDGPU/llvm.log10.ll        |  917 ++++----
 llvm/test/CodeGen/AMDGPU/llvm.log2.ll         |  198 +-
 llvm/test/CodeGen/AMDGPU/load-constant-i1.ll  |  798 ++++---
 llvm/test/CodeGen/AMDGPU/load-global-i16.ll   |  902 ++++----
 llvm/test/CodeGen/AMDGPU/mad-mix.ll           |  144 +-
 .../AMDGPU/promote-constOffset-to-imm.ll      |    3 +-
 .../CodeGen/AMDGPU/remat-fp64-constants.ll    |    2 +-
 .../CodeGen/AMDGPU/shrink-add-sub-constant.ll |   33 +-
 llvm/test/CodeGen/AMDGPU/v_sat_pk_u8_i16.ll   |   12 +-
 55 files changed, 8424 insertions(+), 8903 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
index 1ebfa297f4fc339..c8413673b655c33 100644
--- a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
@@ -736,24 +736,6 @@ void SIFoldOperands::foldOperand(
 
     const TargetRegisterClass *DestRC = TRI->getRegClassForReg(*MRI, DestReg);
     if (!DestReg.isPhysical()) {
-      if (TRI->isSGPRClass(SrcRC) && TRI->hasVectorRegisters(DestRC)) {
-        SmallVector<FoldCandidate, 4> CopyUses;
-        for (auto &Use : MRI->use_nodbg_operands(DestReg)) {
-          // There's no point trying to fold into an implicit operand.
-          if (Use.isImplicit())
-            continue;
-
-          CopyUses.emplace_back(Use.getParent(),
-                                Use.getParent()->getOperandNo(&Use),
-                                &UseMI->getOperand(1));
-        }
-
-        for (auto &F : CopyUses) {
-          foldOperand(*F.OpToFold, F.UseMI, F.UseOpNo, FoldList,
-                      CopiesToReplace);
-        }
-      }
-
       if (DestRC == &AMDGPU::AGPR_32RegClass &&
           TII->isInlineConstant(OpToFold, AMDGPU::OPERAND_REG_INLINE_C_INT32)) {
         UseMI->setDesc(TII->get(AMDGPU::V_ACCVGPR_WRITE_B32_e64));
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/add.v2i16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/add.v2i16.ll
index 26d1fbb09210c64..e4cababfe1c919a 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/add.v2i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/add.v2i16.ll
@@ -165,9 +165,8 @@ define <2 x i16> @v_add_v2i16_neg_inline_imm_splat(<2 x i16> %a) {
 ; GFX7-LABEL: v_add_v2i16_neg_inline_imm_splat:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    s_movk_i32 s4, 0xffc0
-; GFX7-NEXT:    v_add_i32_e32 v0, vcc, s4, v0
-; GFX7-NEXT:    v_add_i32_e32 v1, vcc, s4, v1
+; GFX7-NEXT:    v_add_i32_e32 v0, vcc, 0xffffffc0, v0
+; GFX7-NEXT:    v_add_i32_e32 v1, vcc, 0xffffffc0, v1
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_add_v2i16_neg_inline_imm_splat:
@@ -180,10 +179,10 @@ define <2 x i16> @v_add_v2i16_neg_inline_imm_splat(<2 x i16> %a) {
 ; GFX8-LABEL: v_add_v2i16_neg_inline_imm_splat:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_mov_b32_e32 v2, 0xffffffc0
-; GFX8-NEXT:    v_add_u16_e32 v1, 0xffc0, v0
-; GFX8-NEXT:    v_add_u16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX8-NEXT:    v_or_b32_e32 v0, v1, v0
+; GFX8-NEXT:    v_mov_b32_e32 v1, 0xffffffc0
+; GFX8-NEXT:    v_add_u16_e32 v2, 0xffc0, v0
+; GFX8-NEXT:    v_add_u16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT:    v_or_b32_e32 v0, v2, v0
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_add_v2i16_neg_inline_imm_splat:
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f32.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f32.ll
index 9da75b093fc9cb5..1a8529b9101c3fb 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f32.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fdiv.f32.ll
@@ -436,15 +436,15 @@ define float @v_fdiv_f32_ulp25(float %a, float %b) {
 ; GFX6-IEEE-LABEL: v_fdiv_f32_ulp25:
 ; GFX6-IEEE:       ; %bb.0:
 ; GFX6-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-IEEE-NEXT:    s_mov_b32 s4, 0x7f800000
+; GFX6-IEEE-NEXT:    v_mov_b32_e32 v3, 0x7f800000
 ; GFX6-IEEE-NEXT:    v_frexp_mant_f32_e32 v2, v1
-; GFX6-IEEE-NEXT:    v_cmp_lt_f32_e64 vcc, |v1|, s4
+; GFX6-IEEE-NEXT:    v_cmp_lt_f32_e64 vcc, |v1|, v3
 ; GFX6-IEEE-NEXT:    v_cndmask_b32_e32 v2, v1, v2, vcc
 ; GFX6-IEEE-NEXT:    v_rcp_f32_e32 v2, v2
-; GFX6-IEEE-NEXT:    v_frexp_mant_f32_e32 v3, v0
-; GFX6-IEEE-NEXT:    v_cmp_lt_f32_e64 vcc, |v0|, s4
+; GFX6-IEEE-NEXT:    v_frexp_mant_f32_e32 v4, v0
+; GFX6-IEEE-NEXT:    v_cmp_lt_f32_e64 vcc, |v0|, v3
 ; GFX6-IEEE-NEXT:    v_frexp_exp_i32_f32_e32 v1, v1
-; GFX6-IEEE-NEXT:    v_cndmask_b32_e32 v3, v0, v3, vcc
+; GFX6-IEEE-NEXT:    v_cndmask_b32_e32 v3, v0, v4, vcc
 ; GFX6-IEEE-NEXT:    v_frexp_exp_i32_f32_e32 v0, v0
 ; GFX6-IEEE-NEXT:    v_mul_f32_e32 v2, v3, v2
 ; GFX6-IEEE-NEXT:    v_sub_i32_e32 v0, vcc, v0, v1
@@ -569,15 +569,15 @@ define float @v_fdiv_f32_dynamic_25ulp(float %x, float %y) #0 {
 ; GFX6-IEEE-LABEL: v_fdiv_f32_dynamic_25ulp:
 ; GFX6-IEEE:       ; %bb.0:
 ; GFX6-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-IEEE-NEXT:    s_mov_b32 s4, 0x7f800000
+; GFX6-IEEE-NEXT:    v_mov_b32_e32 v3, 0x7f800000
 ; GFX6-IEEE-NEXT:    v_frexp_mant_f32_e32 v2, v1
-; GFX6-IEEE-NEXT:    v_cmp_lt_f32_e64 vcc, |v1|, s4
+; GFX6-IEEE-NEXT:    v_cmp_lt_f32_e64 vcc, |v1|, v3
 ; GFX6-IEEE-NEXT:    v_cndmask_b32_e32 v2, v1, v2, vcc
 ; GFX6-IEEE-NEXT:    v_rcp_f32_e32 v2, v2
-; GFX6-IEEE-NEXT:    v_frexp_mant_f32_e32 v3, v0
-; GFX6-IEEE-NEXT:    v_cmp_lt_f32_e64 vcc, |v0|, s4
+; GFX6-IEEE-NEXT:    v_frexp_mant_f32_e32 v4, v0
+; GFX6-IEEE-NEXT:    v_cmp_lt_f32_e64 vcc, |v0|, v3
 ; GFX6-IEEE-NEXT:    v_frexp_exp_i32_f32_e32 v1, v1
-; GFX6-IEEE-NEXT:    v_cndmask_b32_e32 v3, v0, v3, vcc
+; GFX6-IEEE-NEXT:    v_cndmask_b32_e32 v3, v0, v4, vcc
 ; GFX6-IEEE-NEXT:    v_frexp_exp_i32_f32_e32 v0, v0
 ; GFX6-IEEE-NEXT:    v_mul_f32_e32 v2, v3, v2
 ; GFX6-IEEE-NEXT:    v_sub_i32_e32 v0, vcc, v0, v1
@@ -587,15 +587,15 @@ define float @v_fdiv_f32_dynamic_25ulp(float %x, float %y) #0 {
 ; GFX6-FLUSH-LABEL: v_fdiv_f32_dynamic_25ulp:
 ; GFX6-FLUSH:       ; %bb.0:
 ; GFX6-FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-FLUSH-NEXT:    s_mov_b32 s4, 0x7f800000
+; GFX6-FLUSH-NEXT:    v_mov_b32_e32 v3, 0x7f800000
 ; GFX6-FLUSH-NEXT:    v_frexp_mant_f32_e32 v2, v1
-; GFX6-FLUSH-NEXT:    v_cmp_lt_f32_e64 vcc, |v1|, s4
+; GFX6-FLUSH-NEXT:    v_cmp_lt_f32_e64 vcc, |v1|, v3
 ; GFX6-FLUSH-NEXT:    v_cndmask_b32_e32 v2, v1, v2, vcc
 ; GFX6-FLUSH-NEXT:    v_rcp_f32_e32 v2, v2
-; GFX6-FLUSH-NEXT:    v_frexp_mant_f32_e32 v3, v0
-; GFX6-FLUSH-NEXT:    v_cmp_lt_f32_e64 vcc, |v0|, s4
+; GFX6-FLUSH-NEXT:    v_frexp_mant_f32_e32 v4, v0
+; GFX6-FLUSH-NEXT:    v_cmp_lt_f32_e64 vcc, |v0|, v3
 ; GFX6-FLUSH-NEXT:    v_frexp_exp_i32_f32_e32 v1, v1
-; GFX6-FLUSH-NEXT:    v_cndmask_b32_e32 v3, v0, v3, vcc
+; GFX6-FLUSH-NEXT:    v_cndmask_b32_e32 v3, v0, v4, vcc
 ; GFX6-FLUSH-NEXT:    v_frexp_exp_i32_f32_e32 v0, v0
 ; GFX6-FLUSH-NEXT:    v_mul_f32_e32 v2, v3, v2
 ; GFX6-FLUSH-NEXT:    v_sub_i32_e32 v0, vcc, v0, v1
@@ -1527,25 +1527,25 @@ define <2 x float> @v_fdiv_v2f32_ulp25(<2 x float> %a, <2 x float> %b) {
 ; GFX6-IEEE-LABEL: v_fdiv_v2f32_ulp25:
 ; GFX6-IEEE:       ; %bb.0:
 ; GFX6-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-IEEE-NEXT:    s_mov_b32 s4, 0x7f800000
+; GFX6-IEEE-NEXT:    v_mov_b32_e32 v5, 0x7f800000
 ; GFX6-IEEE-NEXT:    v_frexp_mant_f32_e32 v4, v2
-; GFX6-IEEE-NEXT:    v_cmp_lt_f32_e64 vcc, |v2|, s4
+; GFX6-IEEE-NEXT:    v_cmp_lt_f32_e64 vcc, |v2|, v5
 ; GFX6-IEEE-NEXT:    v_cndmask_b32_e32 v4, v2, v4, vcc
-; GFX6-IEEE-NEXT:    v_frexp_mant_f32_e32 v5, v0
-; GFX6-IEEE-NEXT:    v_cmp_lt_f32_e64 vcc, |v0|, s4
+; GFX6-IEEE-NEXT:    v_frexp_mant_f32_e32 v6, v0
+; GFX6-IEEE-NEXT:    v_cmp_lt_f32_e64 vcc, |v0|, v5
 ; GFX6-IEEE-NEXT:    v_frexp_exp_i32_f32_e32 v2, v2
-; GFX6-IEEE-NEXT:    v_cndmask_b32_e32 v5, v0, v5, vcc
+; GFX6-IEEE-NEXT:    v_cndmask_b32_e32 v6, v0, v6, vcc
 ; GFX6-IEEE-NEXT:    v_frexp_exp_i32_f32_e32 v0, v0
 ; GFX6-IEEE-NEXT:    v_rcp_f32_e32 v4, v4
 ; GFX6-IEEE-NEXT:    v_sub_i32_e32 v0, vcc, v0, v2
 ; GFX6-IEEE-NEXT:    v_frexp_mant_f32_e32 v2, v3
-; GFX6-IEEE-NEXT:    v_cmp_lt_f32_e64 vcc, |v3|, s4
+; GFX6-IEEE-NEXT:    v_cmp_lt_f32_e64 vcc, |v3|, v5
 ; GFX6-IEEE-NEXT:    v_cndmask_b32_e32 v2, v3, v2, vcc
 ; GFX6-IEEE-NEXT:    v_rcp_f32_e32 v2, v2
-; GFX6-IEEE-NEXT:    v_mul_f32_e32 v4, v5, v4
+; GFX6-IEEE-NEXT:    v_mul_f32_e32 v4, v6, v4
 ; GFX6-IEEE-NEXT:    v_ldexp_f32_e32 v0, v4, v0
 ; GFX6-IEEE-NEXT:    v_frexp_mant_f32_e32 v4, v1
-; GFX6-IEEE-NEXT:    v_cmp_lt_f32_e64 vcc, |v1|, s4
+; GFX6-IEEE-NEXT:    v_cmp_lt_f32_e64 vcc, |v1|, v5
 ; GFX6-IEEE-NEXT:    v_frexp_exp_i32_f32_e32 v3, v3
 ; GFX6-IEEE-NEXT:    v_cndmask_b32_e32 v4, v1, v4, vcc
 ; GFX6-IEEE-NEXT:    v_frexp_exp_i32_f32_e32 v1, v1
@@ -1557,19 +1557,19 @@ define <2 x float> @v_fdiv_v2f32_ulp25(<2 x float> %a, <2 x float> %b) {
 ; GCN-FLUSH-LABEL: v_fdiv_v2f32_ulp25:
 ; GCN-FLUSH:       ; %bb.0:
 ; GCN-FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-FLUSH-NEXT:    s_mov_b32 s4, 0x6f800000
-; GCN-FLUSH-NEXT:    v_mov_b32_e32 v4, 0x2f800000
-; GCN-FLUSH-NEXT:    v_cmp_gt_f32_e64 vcc, |v2|, s4
-; GCN-FLUSH-NEXT:    v_cndmask_b32_e32 v5, 1.0, v4, vcc
-; GCN-FLUSH-NEXT:    v_cmp_gt_f32_e64 vcc, |v3|, s4
-; GCN-FLUSH-NEXT:    v_cndmask_b32_e32 v4, 1.0, v4, vcc
-; GCN-FLUSH-NEXT:    v_mul_f32_e32 v2, v2, v5
+; GCN-FLUSH-NEXT:    v_mov_b32_e32 v4, 0x6f800000
+; GCN-FLUSH-NEXT:    v_mov_b32_e32 v5, 0x2f800000
+; GCN-FLUSH-NEXT:    v_cmp_gt_f32_e64 vcc, |v2|, v4
+; GCN-FLUSH-NEXT:    v_cndmask_b32_e32 v6, 1.0, v5, vcc
+; GCN-FLUSH-NEXT:    v_cmp_gt_f32_e64 vcc, |v3|, v4
+; GCN-FLUSH-NEXT:    v_cndmask_b32_e32 v4, 1.0, v5, vcc
+; GCN-FLUSH-NEXT:    v_mul_f32_e32 v2, v2, v6
 ; GCN-FLUSH-NEXT:    v_mul_f32_e32 v3, v3, v4
 ; GCN-FLUSH-NEXT:    v_rcp_f32_e32 v2, v2
 ; GCN-FLUSH-NEXT:    v_rcp_f32_e32 v3, v3
 ; GCN-FLUSH-NEXT:    v_mul_f32_e32 v0, v0, v2
 ; GCN-FLUSH-NEXT:    v_mul_f32_e32 v1, v1, v3
-; GCN-FLUSH-NEXT:    v_mul_f32_e32 v0, v5, v0
+; GCN-FLUSH-NEXT:    v_mul_f32_e32 v0, v6, v0
 ; GCN-FLUSH-NEXT:    v_mul_f32_e32 v1, v4, v1
 ; GCN-FLUSH-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -2316,16 +2316,16 @@ define <2 x float> @v_rcp_v2f32_ulp25(<2 x float> %x) {
 ; GFX6-IEEE-LABEL: v_rcp_v2f32_ulp25:
 ; GFX6-IEEE:       ; %bb.0:
 ; GFX6-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-IEEE-NEXT:    s_mov_b32 s4, 0x7f800000
+; GFX6-IEEE-NEXT:    v_mov_b32_e32 v3, 0x7f800000
 ; GFX6-IEEE-NEXT:    v_frexp_mant_f32_e32 v2, v0
-; GFX6-IEEE-NEXT:    v_cmp_lt_f32_e64 vcc, |v0|, s4
+; GFX6-IEEE-NEXT:    v_cmp_lt_f32_e64 vcc, |v0|, v3
 ; GFX6-IEEE-NEXT:    v_cndmask_b32_e32 v2, v0, v2, vcc
 ; GFX6-IEEE-NEXT:    v_rcp_f32_e32 v2, v2
 ; GFX6-IEEE-NEXT:    v_frexp_exp_i32_f32_e32 v0, v0
 ; GFX6-IEEE-NEXT:    v_sub_i32_e32 v0, vcc, 0, v0
 ; GFX6-IEEE-NEXT:    v_ldexp_f32_e32 v0, v2, v0
 ; GFX6-IEEE-NEXT:    v_frexp_mant_f32_e32 v2, v1
-; GFX6-IEEE-NEXT:    v_cmp_lt_f32_e64 vcc, |v1|, s4
+; GFX6-IEEE-NEXT:    v_cmp_lt_f32_e64 vcc, |v1|, v3
 ; GFX6-IEEE-NEXT:    v_cndmask_b32_e32 v2, v1, v2, vcc
 ; GFX6-IEEE-NEXT:    v_rcp_f32_e32 v2, v2
 ; GFX6-IEEE-NEXT:    v_frexp_exp_i32_f32_e32 v1, v1
@@ -2425,9 +2425,9 @@ define <2 x float> @v_fdiv_v2f32_arcp_ulp25(<2 x float> %a, <2 x float> %b) {
 ; GFX6-IEEE-LABEL: v_fdiv_v2f32_arcp_ulp25:
 ; GFX6-IEEE:       ; %bb.0:
 ; GFX6-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-IEEE-NEXT:    s_mov_b32 s4, 0x7f800000
+; GFX6-IEEE-NEXT:    v_mov_b32_e32 v5, 0x7f800000
 ; GFX6-IEEE-NEXT:    v_frexp_mant_f32_e32 v4, v2
-; GFX6-IEEE-NEXT:    v_cmp_lt_f32_e64 vcc, |v2|, s4
+; GFX6-IEEE-NEXT:    v_cmp_lt_f32_e64 vcc, |v2|, v5
 ; GFX6-IEEE-NEXT:    v_cndmask_b32_e32 v4, v2, v4, vcc
 ; GFX6-IEEE-NEXT:    v_rcp_f32_e32 v4, v4
 ; GFX6-IEEE-NEXT:    v_frexp_exp_i32_f32_e32 v2, v2
@@ -2435,7 +2435,7 @@ define <2 x float> @v_fdiv_v2f32_arcp_ulp25(<2 x float> %a, <2 x float> %b) {
 ; GFX6-IEEE-NEXT:    v_ldexp_f32_e32 v2, v4, v2
 ; GFX6-IEEE-NEXT:    v_mul_f32_e32 v0, v0, v2
 ; GFX6-IEEE-NEXT:    v_frexp_mant_f32_e32 v2, v3
-; GFX6-IEEE-NEXT:    v_cmp_lt_f32_e64 vcc, |v3|, s4
+; GFX6-IEEE-NEXT:    v_cmp_lt_f32_e64 vcc, |v3|, v5
 ; GFX6-IEEE-NEXT:    v_cndmask_b32_e32 v2, v3, v2, vcc
 ; GFX6-IEEE-NEXT:    v_rcp_f32_e32 v2, v2
 ; GFX6-IEEE-NEXT:    v_frexp_exp_i32_f32_e32 v3, v3
@@ -2863,15 +2863,15 @@ define float @v_fdiv_f32_dynamic_25ulp__nnan_ninf(float %x, float %y, float %z)
 ; GFX6-IEEE-LABEL: v_fdiv_f32_dynamic_25ulp__nnan_ninf:
 ; GFX6-IEEE:       ; %bb.0:
 ; GFX6-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-IEEE-NEXT:    s_mov_b32 s4, 0x7f800000
+; GFX6-IEEE-NEXT:    v_mov_b32_e32 v3, 0x7f800000
 ; GFX6-IEEE-NEXT:    v_frexp_mant_f32_e32 v2, v1
-; GFX6-IEEE-NEXT:    v_cmp_lt_f32_e64 vcc, |v1|, s4
+; GFX6-IEEE-NEXT:    v_cmp_lt_f32_e64 vcc, |v1|, v3
 ; GFX6-IEEE-NEXT:    v_cndmask_b32_e32 v2, v1, v2, vcc
 ; GFX6-IEEE-NEXT:    v_rcp_f32_e32 v2, v2
-; GFX6-IEEE-NEXT:    v_frexp_mant_f32_e32 v3, v0
-; GFX6-IEEE-NEXT:    v_cmp_lt_f32_e64 vcc, |v0|, s4
+; GFX6-IEEE-NEXT:    v_frexp_mant_f32_e32 v4, v0
+; GFX6-IEEE-NEXT:    v_cmp_lt_f32_e64 vcc, |v0|, v3
 ; GFX6-IEEE-NEXT:    v_frexp_exp_i32_f32_e32 v1, v1
-; GFX6-IEEE-NEXT:    v_cndmask_b32_e32 v3, v0, v3, vcc
+; GFX6-IEEE-NEXT:    v_cndmask_b32_e32 v3, v0, v4, vcc
 ; GFX6-IEEE-NEXT:    v_frexp_exp_i32_f32_e32 v0, v0
 ; GFX6-IEEE-NEXT:    v_mul_f32_e32 v2, v3, v2
 ; GFX6-IEEE-NEXT:    v_sub_i32_e32 v0, vcc, v0, v1
@@ -2881,15 +2881,15 @@ define float @v_fdiv_f32_dynamic_25ulp__nnan_ninf(float %x, float %y, float %z)
 ; GFX6-FLUSH-LABEL: v_fdiv_f32_dynamic_25ulp__nnan_ninf:
 ; GFX6-FLUSH:       ; %bb.0:
 ; GFX6-FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-FLUSH-NEXT:    s_mov_b32 s4, 0x7f800000
+; GFX6-FLUSH-NEXT:    v_mov_b32_e32 v3, 0x7f800000
 ; GFX6-FLUSH-NEXT:    v_frexp_mant_f32_e32 v2, v1
-; GFX6-FLUSH-NEXT:    v_cmp_lt_f32_e64 vcc, |v1|, s4
+; GFX6-FLUSH-NEXT:    v_cmp_lt_f32_e64 vcc, |v1|, v3
 ; GFX6-FLUSH-NEXT:    v_cndmask_b32_e32 v2, v1, v2, vcc
 ; GFX6-FLUSH-NEXT:    v_rcp_f32_e32 v2, v2
-; GFX6-FLUSH-NEXT:    v_frexp_mant_f32_e32 v3, v0
-; GFX6-FLUSH-NEXT:    v_cmp_lt_f32_e64 vcc, |v0|, s4
+; GFX6-FLUSH-NEXT:    v_frexp_mant_f32_e32 v4, v0
+; GFX6-FLUSH-NEXT:    v_cmp_lt_f32_e64 vcc, |v0|, v3
 ; GFX6-FLUSH-NEXT:    v_frexp_exp_i32_f32_e32 v1, v1
-; GFX6-FLUSH-NEXT:    v_cndmask_b32_e32 v3, v0, v3, vcc
+; GFX6-FLUSH-NEXT:    v_cndmask_b32_e32 v3, v0, v4, vcc
 ; GFX6-FLUSH-NEXT:    v_frexp_exp_i32_f32_e32 v0, v0
 ; GFX6-FLUSH-NEXT:    v_mul_f32_e32 v2, v3, v2
 ; GFX6-FLUSH-NEXT:    v_sub_i32_e32 v0, vcc, v0, v1
@@ -2983,15 +2983,15 @@ define float @v_fdiv_f32_dynamic_25ulp__nnan_ninf_contractable_user(float %x, fl
 ; GFX6-IEEE-LABEL: v_fdiv_f32_dynamic_25ulp__nnan_ninf_contractable_user:
 ; GFX6-IEEE:       ; %bb.0:
 ; GFX6-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-IEEE-NEXT:    s_mov_b32 s4, 0x7f800000
+; GFX6-IEEE-NEXT:    v_mov_b32_e32 v4, 0x7f800000
 ; GFX6-IEEE-NEXT:    v_frexp_mant_f32_e32 v3, v1
-; GFX6-IEEE-NEXT:    v_cmp_lt_f32_e64 vcc, |v1|, s4
+; GFX6-IEEE-NEXT:    v_cmp_lt_f32_e64 vcc, |v1|, v4
 ; GFX6-IEEE-NEXT:    v_cndmask_b32_e32 v3, v1, v3, vcc
 ; GFX6-IEEE-NEXT:    v_rcp_f32_e32 v3, v3
-; GFX6-IEEE-NEXT:    v_frexp_mant_f32_e32 v4, v0
-; GFX6-IEEE-NEXT:    v_cmp_lt_f32_e64 vcc, |v0|, s4
+; GFX6-IEEE-NEXT:    v_frexp_mant_f32_e32 v5, v0
+; GFX6-IEEE-NEXT:    v_cmp_lt_f32_e64 vcc, |v0|, v4
 ; GFX6-IEEE-NEXT:    v_frexp_exp_i32_f32_e32 v1, v1
-; GFX6-IEEE-NEXT:    v_cndmask_b32_e32 v4, v0, v4, vcc
+; GFX6-IEEE-NEXT:    v_cndmask_b32_e32 v4, v0, v5, vcc
 ; GFX6-IEEE-NEXT:    v_frexp_exp_i32_f32_e32 v0, v0
 ; GFX6-IEEE-NEXT:    v_mul_f32_e32 v3, v4, v3
 ; GFX6-IEEE-NEXT:    v_sub_i32_e32 v0, vcc, v0, v1
@@ -3002,15 +3002,15 @@ define float @v_fdiv_f32_dynamic_25ulp__nnan_ninf_contractable_user(float %x, fl
 ; GFX6-FLUSH-LABEL: v_fdiv_f32_dynamic_25ulp__nnan_ninf_contractable_user:
 ; GFX6-FLUSH:       ; %bb.0:
 ; GFX6-FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-FLUSH-NEXT:    s_mov_b32 s4, 0x7f800000
+; GFX6-FLUSH-NEXT:    v_mov_b32_e32 v4, 0x7f800000
 ; GFX6-FLUSH-NEXT:    v_frexp_mant_f32_e32 v3, v1
-; GFX6-FLUSH-NEXT:    v_cmp_lt_f32_e64 vcc, |v1|, s4
+; GFX6-FLUSH-NEXT:    v_cmp_lt_f32_e64 vcc, |v1|, v4
 ; GFX6-FLUSH-NEXT:    v_cndmask_b32_e32 v3, v1, v3, vcc
 ; GFX6-FLUSH-NEXT:    v_rcp_f32_e32 v3, v3
-; GFX6-FLUSH-NEXT:    v_frexp_mant_f32_e32 v4, v0
-; GFX6-FLUSH-NEXT:    v_cmp_lt_f32_e64 vcc, |v0|, s4
+; GFX6-FLUSH-NEXT:    v_frexp_mant_f32_e32 v5, v0
+; GFX6-FLUSH-NEXT:    v_cmp_lt_f32_e64 vcc, |v0|, v4
 ; GFX6-FLUSH-NEXT:    v_frexp_exp_i32_f32_e32 v1, v1
-; GFX6-FLUSH-NEXT:    v_cndmask_b32_e32 v4, v0, v4, vcc
+; GFX6-FLUSH-NEXT:    v_cndmask_b32_e32 v4, v0, v5, vcc
 ; GFX6-FLUSH-NEXT:    v_frexp_exp_i32_f32_e32 v0, v0
 ; GFX6-FLUSH-NEXT:    v_mul_f32_e32 v3, v4, v3
 ; GFX6-FLUSH-NEXT:    v_sub_i32_e32 v0, vcc, v0, v1
@@ -3391,15 +3391,15 @@ define float @v_fdiv_neglhs_f32_dynamic_25ulp(float %x, float %y) #0 {
 ; GFX6-IEEE-LABEL: v_fdiv_neglhs_f32_dynamic_25ulp:
 ; GFX6-IEEE:       ; %bb.0:
 ; GFX6-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-IEEE-NEXT:    s_mov_b32 s4, 0x7f800000
+; GFX6-IEEE-NEXT:    v_mov_b32_e32 v3, 0x7f800000
 ; GFX6-IEEE-NEXT:    v_frexp_mant_f32_e32 v2, v1
-; GFX6-IEEE-NEXT:    v_cmp_lt_f32_e64 vcc, |v1|, s4
+; GFX6-IEEE-NEXT:    v_cmp_lt_f32_e64 vcc, |v1|, v3
 ; GFX6-IEEE-NEXT:    v_cndmask_b32_e32 v2, v1, v2, vcc
 ; GFX6-IEEE-NEXT:    v_rcp_f32_e32 v2, v2
-; GFX6-IEEE-NEXT:    v_frexp_mant_f32_e64 v3, -v0
-; GFX6-IEEE-NEXT:    v_cmp_lt_f32_e64 s[4:5], |v0|, s4
+; GFX6-IEEE-NEXT:    v_frexp_mant_f32_e64 v4, -v0
+; GFX6-IEEE-NEXT:    v_cmp_lt_f32_e64 s[4:5], |v0|, v3
 ; GFX6-IEEE-NEXT:    v_frexp_exp_i32_f32_e32 v1, v1
-; GFX6-IEEE-NEXT:    v_cndmask_b32_e64 v3, -v0, v3, s[4:5]
+; GFX6-IEEE-NEXT:    v_cndmask_b32_e64 v3, -v0, v4, s[4:5]
 ; GFX6-IEEE-NEXT:    v_frexp_exp_i32_f32_e64 v0, -v0
 ; GFX6-IEEE-NEXT:    v_mul_f32_e32 v2, v3, v2
 ; GFX6-IEEE-NEXT:    v_sub_i32_e32 v0, vcc, v0, v1
@@ -3409,15 +3409,15 @@ define float @v_fdiv_neglhs_f32_dynamic_25ulp(float %x, float %y) #0 {
 ; GFX6-FLUSH-LABEL: v_fdiv_neglhs_f32_dynamic_25ulp:
 ; GFX6-FLUSH:       ; %bb.0:
 ; GFX6-FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-FLUSH-NEXT:    s_mov_b32 s4, 0x7f800000
+; GFX6-FLUSH-NEXT:    v_mov_b32_e32 v3, 0x7f800000
 ; GFX6-FLUSH-NEXT:    v_frexp_mant_f32_e32 v2, v1
-; GFX6-FLUSH-NEXT:    v_cmp_lt_f32_e64 vcc, |v1|, s4
+; GFX6-FLUSH-NEXT:    v_cmp_lt_f32_e64 vcc, |v1|, v3
 ; GFX6-FLUSH-NEXT:    v_cndmask_b32_e32 v2, v1, v2, vcc
 ; GFX6-FLUSH-NEXT:    v_rcp_f32_e32 v2, v2
-; GFX6-FLUSH-NEXT:    v_frexp_mant_f32_e64 v3, -v0
-; GFX6-FLUSH-NEXT:    v_cmp_lt_f32_e64 s[4:5], |v0|, s4
+; GFX6-FLUSH-NEXT:    v_frexp_mant_f32_e64 v4, -v0
+; GFX6-FLUSH-NEXT:    v_cmp_lt_f32_e64 s[4:5], |v0|, v3
 ; GFX6-FLUSH-NEXT:    v_frexp_exp_i32_f32_e32 v1, v1
-; GFX6-FLUSH-NEXT:    v_cndmask_b32_e64 v3, -v0, v3, s[4:5]
+; GFX6-FLUSH-NEXT:    v_cndmask_b32_e64 v3, -v0, v4, s[4:5]
 ; GFX6-FLUSH-NEXT:    v_frexp_exp_i32_f32_e64 v0, -v0
 ; GFX6-FLUSH-NEXT:    v_mul_f32_e32 v2, v3, v2
 ; GFX6-FLUSH-NEXT:    v_sub_i32_e32 v0, vcc, v0, v1
@@ -3820,121 +3820,121 @@ define float @v_fdiv_f32_constrhs0_dynamic(float %x) #0 {
 ; GFX6-IEEE-FASTFMA-LABEL: v_fdiv_f32_constrhs0_dynamic:
 ; GFX6-IEEE-FASTFMA:       ; %bb.0:
 ; GFX6-IEEE-FASTFMA-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-IEEE-FASTFMA-NEXT:    s_mov_b32 s6, 0x4640e400
-; GFX6-IEEE-FASTFMA-NEXT:    v_div_scale_f32 v1, s[4:5], s6, s6, v0
-; GFX6-IEEE-FASTFMA-NEXT:    v_rcp_f32_e32 v2, v1
-; GFX6-IEEE-FASTFMA-NEXT:    v_div_scale_f32 v3, vcc, v0, s6, v0
+; GFX6-IEEE-FASTFMA-NEXT:    v_mov_b32_e32 v1, 0x4640e400
+; GFX6-IEEE-FASTFMA-NEXT:    v_div_scale_f32 v2, s[4:5], v1, v1, v0
+; GFX6-IEEE-FASTFMA-NEXT:    v_rcp_f32_e32 v3, v2
+; GFX6-IEEE-FASTFMA-NEXT:    v_div_scale_f32 v4, vcc, v0, v1, v0
 ; GFX6-IEEE-FASTFMA-NEXT:    s_getreg_b32 s4, hwreg(HW_REG_MODE, 4, 2)
 ; GFX6-IEEE-FASTFMA-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
-; GFX6-IEEE-FASTFMA-NEXT:    v_fma_f32 v4, -v1, v2, 1.0
-; GFX6-IEEE-FASTFMA-NEXT:    v_fma_f32 v2, v4, v2, v2
-; GFX6-IEEE-FASTFMA-NEXT:    v_mul_f32_e32 v4, v3, v2
-; GFX6-IEEE-FASTFMA-NEXT:    v_fma_f32 v5, -v1, v4, v3
-; GFX6-IEEE-FASTFMA-NEXT:    v_fma_f32 v4, v5, v2, v4
-; GFX6-IEEE-FASTFMA-NEXT:    v_fma_f32 v1, -v1, v4, v3
+; GFX6-IEEE-FASTFMA-NEXT:    v_fma_f32 v5, -v2, v3, 1.0
+; GFX6-IEEE-FASTFMA-NEXT:    v_fma_f32 v3, v5, v3, v3
+; GFX6-IEEE-FASTFMA-NEXT:    v_mul_f32_e32 v5, v4, v3
+; GFX6-IEEE-FASTFMA-NEXT:    v_fma_f32 v6, -v2, v5, v4
+; GFX6-IEEE-FASTFMA-NEXT:    v_fma_f32 v5, v6, v3, v5
+; GFX6-IEEE-FASTFMA-NEXT:    v_fma_f32 v2, -v2, v5, v4
 ; GFX6-IEEE-FASTFMA-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s4
-; GFX6-IEEE-FASTFMA-NEXT:    v_div_fmas_f32 v1, v1, v2, v4
-; GFX6-IEEE-FASTFMA-NEXT:    v_div_fixup_f32 v0, v1, s6, v0
+; GFX6-IEEE-FASTFMA-NEXT:    v_div_fmas_f32 v2, v2, v3, v5
+; GFX6-IEEE-FASTFMA-NEXT:    v_div_fixup_f32 v0, v2, v1, v0
 ; GFX6-IEEE-FASTFMA-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX6-FLUSH-FASTFMA-LABEL: v_fdiv_f32_constrhs0_dynamic:
 ; GFX6-FLUSH-FASTFMA:       ; %bb.0:
 ; GFX6-FLUSH-FASTFMA-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-FLUSH-FASTFMA-NEXT:    s_mov_b32 s6, 0x4640e400
-; GFX6-FLUSH-FASTFMA-NEXT:    v_div_scale_f32 v1, s[4:5], s6, s6, v0
-; GFX6-FLUSH-FASTFMA-NEXT:    v_rcp_f32_e32 v2, v1
-; GFX6-FLUSH-FASTFMA-NEXT:    v_div_scale_f32 v3, vcc, v0, s6, v0
+; GFX6-FLUSH-FASTFMA-NEXT:    v_mov_b32_e32 v1, 0x4640e400
+; GFX6-FLUSH-FASTFMA-NEXT:    v_div_scale_f32 v2, s[4:5], v1, v1, v0
+; GFX6-FLUSH-FASTFMA-NEXT:    v_rcp_f32_e32 v3, v2
+; GFX6-FLUSH-FASTFMA-NEXT:    v_div_scale_f32 v4, vcc, v0, v1, v0
 ; GFX6-FLUSH-FASTFMA-NEXT:    s_getreg_b32 s4, hwreg(HW_REG_MODE, 4, 2)
 ; GFX6-FLUSH-FASTFMA-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
-; GFX6-FLUSH-FASTFMA-NEXT:    v_fma_f32 v4, -v1, v2, 1.0
-; GFX6-FLUSH-FASTFMA-NEXT:    v_fma_f32 v2, v4, v2, v2
-; GFX6-FLUSH-FASTFMA-NEXT:    v_mul_f32_e32 v4, v3, v2
-; GFX6-FLUSH-FASTFMA-NEXT:    v_fma_f32 v5, -v1, v4, v3
-; GFX6-FLUSH-FASTFMA-NEXT:    v_fma_f32 v4, v5, v2, v4
-; GFX6-FLUSH-FASTFMA-NEXT:    v_fma_f32 v1, -v1, v4, v3
+; GFX6-FLUSH-FASTFMA-NEXT:    v_fma_f32 v5, -v2, v3, 1.0
+; GFX6-FLUSH-FASTFMA-NEXT:    v_fma_f32 v3, v5, v3, v3
+; GFX6-FLUSH-FASTFMA-NEXT:    v_mul_f32_e32 v5, v4, v3
+; GFX6-FLUSH-FASTFMA-NEXT:    v_fma_f32 v6, -v2, v5, v4
+; GFX6-FLUSH-FASTFMA-NEXT:    v_fma_f32 v5, v6, v3, v5
+; GFX6-FLUSH-FASTFMA-NEXT:    v_fma_f32 v2, -v2, v5, v4
 ; GFX6-FLUSH-FASTFMA-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s4
-; GFX6-FLUSH-FASTFMA-NEXT:    v_div_fmas_f32 v1, v1, v2, v4
-; GFX6-FLUSH-FASTFMA-NEXT:    v_div_fixup_f32 v0, v1, s6, v0
+; GFX6-FLUSH-FASTFMA-NEXT:    v_div_fmas_f32 v2, v2, v3, v5
+; GFX6-FLUSH-FASTFMA-NEXT:    v_div_fixup_f32 v0, v2, v1, v0
 ; GFX6-FLUSH-FASTFMA-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX6-IEEE-SLOWFMA-LABEL: v_fdiv_f32_constrhs0_dynamic:
 ; GFX6-IEEE-SLOWFMA:       ; %bb.0:
 ; GFX6-IEEE-SLOWFMA-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-IEEE-SLOWFMA-NEXT:    s_mov_b32 s6, 0x4640e400
-; GFX6-IEEE-SLOWFMA-NEXT:    v_div_scale_f32 v1, s[4:5], s6, s6, v0
-; GFX6-IEEE-SLOWFMA-NEXT:    v_div_scale_f32 v2, vcc, v0, s6, v0
+; GFX6-IEEE-SLOWFMA-NEXT:    v_mov_b32_e32 v1, 0x4640e400
+; GFX6-IEEE-SLOWFMA-NEXT:    v_div_scale_f32 v2, s[4:5], v1, v1, v0
+; GFX6-IEEE-SLOWFMA-NEXT:    v_div_scale_f32 v3, vcc, v0, v1, v0
 ; GFX6-IEEE-SLOWFMA-NEXT:    s_getreg_b32 s4, hwreg(HW_REG_MODE, 4, 2)
-; GFX6-IEEE-SLOWFMA-NEXT:    v_rcp_f32_e32 v3, v1
+; GFX6-IEEE-SLOWFMA-NEXT:    v_rcp_f32_e32 v4, v2
 ; GFX6-IEEE-SLOWFMA-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
-; GFX6-IEEE-SLOWFMA-NEXT:    v_fma_f32 v4, -v1, v3, 1.0
-; GFX6-IEEE-SLOWFMA-NEXT:    v_fma_f32 v3, v4, v3, v3
-; GFX6-IEEE-SLOWFMA-NEXT:    v_mul_f32_e32 v4, v2, v3
-; GFX6-IEEE-SLOWFMA-NEXT:    v_fma_f32 v5, -v1, v4, v2
-; GFX6-IEEE-SLOWFMA-NEXT:    v_fma_f32 v4, v5, v3, v4
-; GFX6-IEEE-SLOWFMA-NEXT:    v_fma_f32 v1, -v1, v4, v2
+; GFX6-IEEE-SLOWFMA-NEXT:    v_fma_f32 v5, -v2, v4, 1.0
+; GFX6-IEEE-SLOWFMA-NEXT:    v_fma_f32 v4, v5, v4, v4
+; GFX6-IEEE-SLOWFMA-NEXT:    v_mul_f32_e32 v5, v3, v4
+; GFX6-IEEE-SLOWFMA-NEXT:    v_fma_f32 v6, -v2, v5, v3
+; GFX6-IEEE-SLOWFMA-NEXT:    v_fma_f32 v5, v6, v4, v5
+; GFX6-IEEE-SLOWFMA-NEXT:    v_fma_f32 v2, -v2, v5, v3
 ; GFX6-IEEE-SLOWFMA-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s4
-; GFX6-IEEE-SLOWFMA-NEXT:    v_div_fmas_f32 v1, v1, v3, v4
-; GFX6-IEEE-SLOWFMA-NEXT:    v_div_fixup_f32 v0, v1, s6, v0
+; GFX6-IEEE-SLOWFMA-NEXT:    v_div_fmas_f32 v2, v2, v4, v5
+; GFX6-IEEE-SLOWFMA-NEXT:    v_div_fixup_f32 v0, v2, v1, v0
 ; GFX6-IEEE-SLOWFMA-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX6-FLUSH-SLOWFMA-LABEL: v_fdiv_f32_constrhs0_dynamic:
 ; GFX6-FLUSH-SLOWFMA:       ; %bb.0:
 ; GFX6-FLUSH-SLOWFMA-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-FLUSH-SLOWFMA-NEXT:    s_mov_b32 s6, 0x4640e400
-; GFX6-FLUSH-SLOWFMA-NEXT:    v_div_scale_f32 v1, s[4:5], s6, s6, v0
-; GFX6-FLUSH-SLOWFMA-NEXT:    v_div_scale_f32 v2, vcc, v0, s6, v0
+; GFX6-FLUSH-SLOWFMA-NEXT:    v_mov_b32_e32 v1, 0x4640e400
+; GFX6-FLUSH-SLOWFMA-NEXT:    v_div_scale_f32 v2, s[4:5], v1, v1, v0
+; GFX6-FLUSH-SLOWFMA-NEXT:    v_div_scale_f32 v3, vcc, v0, v1, v0
 ; GFX6-FLUSH-SLOWFMA-NEXT:    s_getreg_b32 s4, hwreg(HW_REG_MODE, 4, 2)
-; GFX6-FLUSH-SLOWFMA-NEXT:    v_rcp_f32_e32 v3, v1
+; GFX6-FLUSH-SLOWFMA-NEXT:    v_rcp_f32_e32 v4, v2
 ; GFX6-FLUSH-SLOWFMA-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
-; GFX6-FLUSH-SLOWFMA-NEXT:    v_fma_f32 v4, -v1, v3, 1.0
-; GFX6-FLUSH-SLOWFMA-NEXT:    v_fma_f32 v3, v4, v3, v3
-; GFX6-FLUSH-SLOWFMA-NEXT:    v_mul_f32_e32 v4, v2, v3
-; GFX6-FLUSH-SLOWFMA-NEXT:    v_fma_f32 v5, -v1, v4, v2
-; GFX6-FLUSH-SLOWFMA-NEXT:    v_fma_f32 v4, v5, v3, v4
-; GFX6-FLUSH-SLOWFMA-NEXT:    v_fma_f32 v1, -v1, v4, v2
+; GFX6-FLUSH-SLOWFMA-NEXT:    v_fma_f32 v5, -v2, v4, 1.0
+; GFX6-FLUSH-SLOWFMA-NEXT:    v_fma_f32 v4, v5, v4, v4
+; GFX6-FLUSH-SLOWFMA-NEXT:    v_mul_f32_e32 v5, v3, v4
+; GFX6-FLUSH-SLOWFMA-NEXT:    v_fma_f32 v6, -v2, v5, v3
+; GFX6-FLUSH-SLOWFMA-NEXT:    v_fma_f32 v5, v6, v4, v5
+; GFX6-FLUSH-SLOWFMA-NEXT:    v_fma_f32 v2, -v2, v5, v3
 ; GFX6-FLUSH-SLOWFMA-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s4
-; GFX6-FLUSH-SLOWFMA-NEXT:    v_div_fmas_f32 v1, v1, v3, v4
-; GFX6-FLUSH-SLOWFMA-NEXT:    v_div_fixup_f32 v0, v1, s6, v0
+; GFX6-FLUSH-SLOWFMA-NEXT:    v_div_fmas_f32 v2, v2, v4, v5
+; GFX6-FLUSH-SLOWFMA-NEXT:    v_div_fixup_f32 v0, v2, v1, v0
 ; GFX6-FLUSH-SLOWFMA-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX89-IEEE-LABEL: v_fdiv_f32_constrhs0_dynamic:
 ; GFX89-IEEE:       ; %bb.0:
 ; GFX89-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX89-IEEE-NEXT:    s_mov_b32 s6, 0x4640e400
-; GFX89-IEEE-NEXT:    v_div_scale_f32 v1, s[4:5], s6, s6, v0
-; GFX89-IEEE-NEXT:    v_div_scale_f32 v2, vcc, v0, s6, v0
+; GFX89-IEEE-NEXT:    v_mov_b32_e32 v1, 0x4640e400
+; GFX89-IEEE-NEXT:    v_div_scale_f32 v2, s[4:5], v1, v1, v0
+; GFX89-IEEE-NEXT:    v_div_scale_f32 v3, vcc, v0, v1, v0
 ; GFX89-IEEE-NEXT:    s_getreg_b32 s4, hwreg(HW_REG_MODE, 4, 2)
-; GFX89-IEEE-NEXT:    v_rcp_f32_e32 v3, v1
+; GFX89-IEEE-NEXT:    v_rcp_f32_e32 v4, v2
 ; GFX89-IEEE-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
-; GFX89-IEEE-NEXT:    v_fma_f32 v4, -v1, v3, 1.0
-; GFX89-IEEE-NEXT:    v_fma_f32 v3, v4, v3, v3
-; GFX89-IEEE-NEXT:    v_mul_f32_e32 v4, v2, v3
-; GFX89-IEEE-NEXT:    v_fma_f32 v5, -v1, v4, v2
-; GFX89-IEEE-NEXT:    v_fma_f32 v4, v5, v3, v4
-; GFX89-IEEE-NEXT:    v_fma_f32 v1, -v1, v4, v2
+; GFX89-IEEE-NEXT:    v_fma_f32 v5, -v2, v4, 1.0
+; GFX89-IEEE-NEXT:    v_fma_f32 v4, v5, v4, v4
+; GFX89-IEEE-NEXT:    v_mul_f32_e32 v5, v3, v4
+; GFX89-IEEE-NEXT:    v_fma_f32 v6, -v2, v5, v3
+; GFX89-IEEE-NEXT:    v_fma_f32 v5, v6, v4, v5
+; GFX89-IEEE-NEXT:    v_fma_f32 v2, -v2, v5, v3
 ; GFX89-IEEE-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s4
-; GFX89-IEEE-NEXT:    v_div_fmas_f32 v1, v1, v3, v4
-; GFX89-IEEE-NEXT:    v_div_fixup_f32 v0, v1, s6, v0
+; GFX89-IEEE-NEXT:    v_div_fmas_f32 v2, v2, v4, v5
+; GFX89-IEEE-NEXT:    v_div_fixup_f32 v0, v2, v1, v0
 ; GFX89-IEEE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX89-FLUSH-LABEL: v_fdiv_f32_constrhs0_dynamic:
 ; GFX89-FLUSH:       ; %bb.0:
 ; GFX89-FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX89-FLUSH-NEXT:    s_mov_b32 s6, 0x4640e400
-; GFX89-FLUSH-NEXT:    v_div_scale_f32 v1, s[4:5], s6, s6, v0
-; GFX89-FLUSH-NEXT:    v_div_scale_f32 v2, vcc, v0, s6, v0
+; GFX89-FLUSH-NEXT:    v_mov_b32_e32 v1, 0x4640e400
+; GFX89-FLUSH-NEXT:    v_div_scale_f32 v2, s[4:5], v1, v1, v0
+; GFX89-FLUSH-NEXT:    v_div_scale_f32 v3, vcc, v0, v1, v0
 ; GFX89-FLUSH-NEXT:    s_getreg_b32 s4, hwreg(HW_REG_MODE, 4, 2)
-; GFX89-FLUSH-NEXT:    v_rcp_f32_e32 v3, v1
+; GFX89-FLUSH-NEXT:    v_rcp_f32_e32 v4, v2
 ; GFX89-FLUSH-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
-; GFX89-FLUSH-NEXT:    v_fma_f32 v4, -v1, v3, 1.0
-; GFX89-FLUSH-NEXT:    v_fma_f32 v3, v4, v3, v3
-; GFX89-FLUSH-NEXT:    v_mul_f32_e32 v4, v2, v3
-; GFX89-FLUSH-NEXT:    v_fma_f32 v5, -v1, v4, v2
-; GFX89-FLUSH-NEXT:    v_fma_f32 v4, v5, v3, v4
-; GFX89-FLUSH-NEXT:    v_fma_f32 v1, -v1, v4, v2
+; GFX89-FLUSH-NEXT:    v_fma_f32 v5, -v2, v4, 1.0
+; GFX89-FLUSH-NEXT:    v_fma_f32 v4, v5, v4, v4
+; GFX89-FLUSH-NEXT:    v_mul_f32_e32 v5, v3, v4
+; GFX89-FLUSH-NEXT:    v_fma_f32 v6, -v2, v5, v3
+; GFX89-FLUSH-NEXT:    v_fma_f32 v5, v6, v4, v5
+; GFX89-FLUSH-NEXT:    v_fma_f32 v2, -v2, v5, v3
 ; GFX89-FLUSH-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s4
-; GFX89-FLUSH-NEXT:    v_div_fmas_f32 v1, v1, v3, v4
-; GFX89-FLUSH-NEXT:    v_div_fixup_f32 v0, v1, s6, v0
+; GFX89-FLUSH-NEXT:    v_div_fmas_f32 v2, v2, v4, v5
+; GFX89-FLUSH-NEXT:    v_div_fixup_f32 v0, v2, v1, v0
 ; GFX89-FLUSH-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-IEEE-LABEL: v_fdiv_f32_constrhs0_dynamic:
@@ -4199,121 +4199,121 @@ define float @v_fdiv_f32_constlhs0_dynamic(float %x) #0 {
 ; GFX6-IEEE-FASTFMA-LABEL: v_fdiv_f32_constlhs0_dynamic:
 ; GFX6-IEEE-FASTFMA:       ; %bb.0:
 ; GFX6-IEEE-FASTFMA-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-IEEE-FASTFMA-NEXT:    s_mov_b32 s6, 0x4640e400
-; GFX6-IEEE-FASTFMA-NEXT:    v_div_scale_f32 v1, s[4:5], v0, v0, s6
-; GFX6-IEEE-FASTFMA-NEXT:    v_rcp_f32_e32 v2, v1
-; GFX6-IEEE-FASTFMA-NEXT:    v_div_scale_f32 v3, vcc, s6, v0, s6
+; GFX6-IEEE-FASTFMA-NEXT:    v_mov_b32_e32 v1, 0x4640e400
+; GFX6-IEEE-FASTFMA-NEXT:    v_div_scale_f32 v2, s[4:5], v0, v0, v1
+; GFX6-IEEE-FASTFMA-NEXT:    v_rcp_f32_e32 v3, v2
+; GFX6-IEEE-FASTFMA-NEXT:    v_div_scale_f32 v4, vcc, v1, v0, v1
 ; GFX6-IEEE-FASTFMA-NEXT:    s_getreg_b32 s4, hwreg(HW_REG_MODE, 4, 2)
 ; GFX6-IEEE-FASTFMA-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
-; GFX6-IEEE-FASTFMA-NEXT:    v_fma_f32 v4, -v1, v2, 1.0
-; GFX6-IEEE-FASTFMA-NEXT:    v_fma_f32 v2, v4, v2, v2
-; GFX6-IEEE-FASTFMA-NEXT:    v_mul_f32_e32 v4, v3, v2
-; GFX6-IEEE-FASTFMA-NEXT:    v_fma_f32 v5, -v1, v4, v3
-; GFX6-IEEE-FASTFMA-NEXT:    v_fma_f32 v4, v5, v2, v4
-; GFX6-IEEE-FASTFMA-NEXT:    v_fma_f32 v1, -v1, v4, v3
+; GFX6-IEEE-FASTFMA-NEXT:    v_fma_f32 v5, -v2, v3, 1.0
+; GFX6-IEEE-FASTFMA-NEXT:    v_fma_f32 v3, v5, v3, v3
+; GFX6-IEEE-FASTFMA-NEXT:    v_mul_f32_e32 v5, v4, v3
+; GFX6-IEEE-FASTFMA-NEXT:    v_fma_f32 v6, -v2, v5, v4
+; GFX6-IEEE-FASTFMA-NEXT:    v_fma_f32 v5, v6, v3, v5
+; GFX6-IEEE-FASTFMA-NEXT:    v_fma_f32 v2, -v2, v5, v4
 ; GFX6-IEEE-FASTFMA-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s4
-; GFX6-IEEE-FASTFMA-NEXT:    v_div_fmas_f32 v1, v1, v2, v4
-; GFX6-IEEE-FASTFMA-NEXT:    v_div_fixup_f32 v0, v1, v0, s6
+; GFX6-IEEE-FASTFMA-NEXT:    v_div_fmas_f32 v2, v2, v3, v5
+; GFX6-IEEE-FASTFMA-NEXT:    v_div_fixup_f32 v0, v2, v0, v1
 ; GFX6-IEEE-FASTFMA-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX6-FLUSH-FASTFMA-LABEL: v_fdiv_f32_constlhs0_dynamic:
 ; GFX6-FLUSH-FASTFMA:       ; %bb.0:
 ; GFX6-FLUSH-FASTFMA-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-FLUSH-FASTFMA-NEXT:    s_mov_b32 s6, 0x4640e400
-; GFX6-FLUSH-FASTFMA-NEXT:    v_div_scale_f32 v1, s[4:5], v0, v0, s6
-; GFX6-FLUSH-FASTFMA-NEXT:    v_rcp_f32_e32 v2, v1
-; GFX6-FLUSH-FASTFMA-NEXT:    v_div_scale_f32 v3, vcc, s6, v0, s6
+; GFX6-FLUSH-FASTFMA-NEXT:    v_mov_b32_e32 v1, 0x4640e400
+; GFX6-FLUSH-FASTFMA-NEXT:    v_div_scale_f32 v2, s[4:5], v0, v0, v1
+; GFX6-FLUSH-FASTFMA-NEXT:    v_rcp_f32_e32 v3, v2
+; GFX6-FLUSH-FASTFMA-NEXT:    v_div_scale_f32 v4, vcc, v1, v0, v1
 ; GFX6-FLUSH-FASTFMA-NEXT:    s_getreg_b32 s4, hwreg(HW_REG_MODE, 4, 2)
 ; GFX6-FLUSH-FASTFMA-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
-; GFX6-FLUSH-FASTFMA-NEXT:    v_fma_f32 v4, -v1, v2, 1.0
-; GFX6-FLUSH-FASTFMA-NEXT:    v_fma_f32 v2, v4, v2, v2
-; GFX6-FLUSH-FASTFMA-NEXT:    v_mul_f32_e32 v4, v3, v2
-; GFX6-FLUSH-FASTFMA-NEXT:    v_fma_f32 v5, -v1, v4, v3
-; GFX6-FLUSH-FASTFMA-NEXT:    v_fma_f32 v4, v5, v2, v4
-; GFX6-FLUSH-FASTFMA-NEXT:    v_fma_f32 v1, -v1, v4, v3
+; GFX6-FLUSH-FASTFMA-NEXT:    v_fma_f32 v5, -v2, v3, 1.0
+; GFX6-FLUSH-FASTFMA-NEXT:    v_fma_f32 v3, v5, v3, v3
+; GFX6-FLUSH-FASTFMA-NEXT:    v_mul_f32_e32 v5, v4, v3
+; GFX6-FLUSH-FASTFMA-NEXT:    v_fma_f32 v6, -v2, v5, v4
+; GFX6-FLUSH-FASTFMA-NEXT:    v_fma_f32 v5, v6, v3, v5
+; GFX6-FLUSH-FASTFMA-NEXT:    v_fma_f32 v2, -v2, v5, v4
 ; GFX6-FLUSH-FASTFMA-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s4
-; GFX6-FLUSH-FASTFMA-NEXT:    v_div_fmas_f32 v1, v1, v2, v4
-; GFX6-FLUSH-FASTFMA-NEXT:    v_div_fixup_f32 v0, v1, v0, s6
+; GFX6-FLUSH-FASTFMA-NEXT:    v_div_fmas_f32 v2, v2, v3, v5
+; GFX6-FLUSH-FASTFMA-NEXT:    v_div_fixup_f32 v0, v2, v0, v1
 ; GFX6-FLUSH-FASTFMA-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX6-IEEE-SLOWFMA-LABEL: v_fdiv_f32_constlhs0_dynamic:
 ; GFX6-IEEE-SLOWFMA:       ; %bb.0:
 ; GFX6-IEEE-SLOWFMA-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-IEEE-SLOWFMA-NEXT:    s_mov_b32 s6, 0x4640e400
-; GFX6-IEEE-SLOWFMA-NEXT:    v_div_scale_f32 v1, s[4:5], v0, v0, s6
-; GFX6-IEEE-SLOWFMA-NEXT:    v_div_scale_f32 v2, vcc, s6, v0, s6
+; GFX6-IEEE-SLOWFMA-NEXT:    v_mov_b32_e32 v1, 0x4640e400
+; GFX6-IEEE-SLOWFMA-NEXT:    v_div_scale_f32 v2, s[4:5], v0, v0, v1
+; GFX6-IEEE-SLOWFMA-NEXT:    v_div_scale_f32 v3, vcc, v1, v0, v1
 ; GFX6-IEEE-SLOWFMA-NEXT:    s_getreg_b32 s4, hwreg(HW_REG_MODE, 4, 2)
-; GFX6-IEEE-SLOWFMA-NEXT:    v_rcp_f32_e32 v3, v1
+; GFX6-IEEE-SLOWFMA-NEXT:    v_rcp_f32_e32 v4, v2
 ; GFX6-IEEE-SLOWFMA-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
-; GFX6-IEEE-SLOWFMA-NEXT:    v_fma_f32 v4, -v1, v3, 1.0
-; GFX6-IEEE-SLOWFMA-NEXT:    v_fma_f32 v3, v4, v3, v3
-; GFX6-IEEE-SLOWFMA-NEXT:    v_mul_f32_e32 v4, v2, v3
-; GFX6-IEEE-SLOWFMA-NEXT:    v_fma_f32 v5, -v1, v4, v2
-; GFX6-IEEE-SLOWFMA-NEXT:    v_fma_f32 v4, v5, v3, v4
-; GFX6-IEEE-SLOWFMA-NEXT:    v_fma_f32 v1, -v1, v4, v2
+; GFX6-IEEE-SLOWFMA-NEXT:    v_fma_f32 v5, -v2, v4, 1.0
+; GFX6-IEEE-SLOWFMA-NEXT:    v_fma_f32 v4, v5, v4, v4
+; GFX6-IEEE-SLOWFMA-NEXT:    v_mul_f32_e32 v5, v3, v4
+; GFX6-IEEE-SLOWFMA-NEXT:    v_fma_f32 v6, -v2, v5, v3
+; GFX6-IEEE-SLOWFMA-NEXT:    v_fma_f32 v5, v6, v4, v5
+; GFX6-IEEE-SLOWFMA-NEXT:    v_fma_f32 v2, -v2, v5, v3
 ; GFX6-IEEE-SLOWFMA-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s4
-; GFX6-IEEE-SLOWFMA-NEXT:    v_div_fmas_f32 v1, v1, v3, v4
-; GFX6-IEEE-SLOWFMA-NEXT:    v_div_fixup_f32 v0, v1, v0, s6
+; GFX6-IEEE-SLOWFMA-NEXT:    v_div_fmas_f32 v2, v2, v4, v5
+; GFX6-IEEE-SLOWFMA-NEXT:    v_div_fixup_f32 v0, v2, v0, v1
 ; GFX6-IEEE-SLOWFMA-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX6-FLUSH-SLOWFMA-LABEL: v_fdiv_f32_constlhs0_dynamic:
 ; GFX6-FLUSH-SLOWFMA:       ; %bb.0:
 ; GFX6-FLUSH-SLOWFMA-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-FLUSH-SLOWFMA-NEXT:    s_mov_b32 s6, 0x4640e400
-; GFX6-FLUSH-SLOWFMA-NEXT:    v_div_scale_f32 v1, s[4:5], v0, v0, s6
-; GFX6-FLUSH-SLOWFMA-NEXT:    v_div_scale_f32 v2, vcc, s6, v0, s6
+; GFX6-FLUSH-SLOWFMA-NEXT:    v_mov_b32_e32 v1, 0x4640e400
+; GFX6-FLUSH-SLOWFMA-NEXT:    v_div_scale_f32 v2, s[4:5], v0, v0, v1
+; GFX6-FLUSH-SLOWFMA-NEXT:    v_div_scale_f32 v3, vcc, v1, v0, v1
 ; GFX6-FLUSH-SLOWFMA-NEXT:    s_getreg_b32 s4, hwreg(HW_REG_MODE, 4, 2)
-; GFX6-FLUSH-SLOWFMA-NEXT:    v_rcp_f32_e32 v3, v1
+; GFX6-FLUSH-SLOWFMA-NEXT:    v_rcp_f32_e32 v4, v2
 ; GFX6-FLUSH-SLOWFMA-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
-; GFX6-FLUSH-SLOWFMA-NEXT:    v_fma_f32 v4, -v1, v3, 1.0
-; GFX6-FLUSH-SLOWFMA-NEXT:    v_fma_f32 v3, v4, v3, v3
-; GFX6-FLUSH-SLOWFMA-NEXT:    v_mul_f32_e32 v4, v2, v3
-; GFX6-FLUSH-SLOWFMA-NEXT:    v_fma_f32 v5, -v1, v4, v2
-; GFX6-FLUSH-SLOWFMA-NEXT:    v_fma_f32 v4, v5, v3, v4
-; GFX6-FLUSH-SLOWFMA-NEXT:    v_fma_f32 v1, -v1, v4, v2
+; GFX6-FLUSH-SLOWFMA-NEXT:    v_fma_f32 v5, -v2, v4, 1.0
+; GFX6-FLUSH-SLOWFMA-NEXT:    v_fma_f32 v4, v5, v4, v4
+; GFX6-FLUSH-SLOWFMA-NEXT:    v_mul_f32_e32 v5, v3, v4
+; GFX6-FLUSH-SLOWFMA-NEXT:    v_fma_f32 v6, -v2, v5, v3
+; GFX6-FLUSH-SLOWFMA-NEXT:    v_fma_f32 v5, v6, v4, v5
+; GFX6-FLUSH-SLOWFMA-NEXT:    v_fma_f32 v2, -v2, v5, v3
 ; GFX6-FLUSH-SLOWFMA-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s4
-; GFX6-FLUSH-SLOWFMA-NEXT:    v_div_fmas_f32 v1, v1, v3, v4
-; GFX6-FLUSH-SLOWFMA-NEXT:    v_div_fixup_f32 v0, v1, v0, s6
+; GFX6-FLUSH-SLOWFMA-NEXT:    v_div_fmas_f32 v2, v2, v4, v5
+; GFX6-FLUSH-SLOWFMA-NEXT:    v_div_fixup_f32 v0, v2, v0, v1
 ; GFX6-FLUSH-SLOWFMA-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX89-IEEE-LABEL: v_fdiv_f32_constlhs0_dynamic:
 ; GFX89-IEEE:       ; %bb.0:
 ; GFX89-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX89-IEEE-NEXT:    s_mov_b32 s6, 0x4640e400
-; GFX89-IEEE-NEXT:    v_div_scale_f32 v1, s[4:5], v0, v0, s6
-; GFX89-IEEE-NEXT:    v_div_scale_f32 v2, vcc, s6, v0, s6
+; GFX89-IEEE-NEXT:    v_mov_b32_e32 v1, 0x4640e400
+; GFX89-IEEE-NEXT:    v_div_scale_f32 v2, s[4:5], v0, v0, v1
+; GFX89-IEEE-NEXT:    v_div_scale_f32 v3, vcc, v1, v0, v1
 ; GFX89-IEEE-NEXT:    s_getreg_b32 s4, hwreg(HW_REG_MODE, 4, 2)
-; GFX89-IEEE-NEXT:    v_rcp_f32_e32 v3, v1
+; GFX89-IEEE-NEXT:    v_rcp_f32_e32 v4, v2
 ; GFX89-IEEE-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
-; GFX89-IEEE-NEXT:    v_fma_f32 v4, -v1, v3, 1.0
-; GFX89-IEEE-NEXT:    v_fma_f32 v3, v4, v3, v3
-; GFX89-IEEE-NEXT:    v_mul_f32_e32 v4, v2, v3
-; GFX89-IEEE-NEXT:    v_fma_f32 v5, -v1, v4, v2
-; GFX89-IEEE-NEXT:    v_fma_f32 v4, v5, v3, v4
-; GFX89-IEEE-NEXT:    v_fma_f32 v1, -v1, v4, v2
+; GFX89-IEEE-NEXT:    v_fma_f32 v5, -v2, v4, 1.0
+; GFX89-IEEE-NEXT:    v_fma_f32 v4, v5, v4, v4
+; GFX89-IEEE-NEXT:    v_mul_f32_e32 v5, v3, v4
+; GFX89-IEEE-NEXT:    v_fma_f32 v6, -v2, v5, v3
+; GFX89-IEEE-NEXT:    v_fma_f32 v5, v6, v4, v5
+; GFX89-IEEE-NEXT:    v_fma_f32 v2, -v2, v5, v3
 ; GFX89-IEEE-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s4
-; GFX89-IEEE-NEXT:    v_div_fmas_f32 v1, v1, v3, v4
-; GFX89-IEEE-NEXT:    v_div_fixup_f32 v0, v1, v0, s6
+; GFX89-IEEE-NEXT:    v_div_fmas_f32 v2, v2, v4, v5
+; GFX89-IEEE-NEXT:    v_div_fixup_f32 v0, v2, v0, v1
 ; GFX89-IEEE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX89-FLUSH-LABEL: v_fdiv_f32_constlhs0_dynamic:
 ; GFX89-FLUSH:       ; %bb.0:
 ; GFX89-FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX89-FLUSH-NEXT:    s_mov_b32 s6, 0x4640e400
-; GFX89-FLUSH-NEXT:    v_div_scale_f32 v1, s[4:5], v0, v0, s6
-; GFX89-FLUSH-NEXT:    v_div_scale_f32 v2, vcc, s6, v0, s6
+; GFX89-FLUSH-NEXT:    v_mov_b32_e32 v1, 0x4640e400
+; GFX89-FLUSH-NEXT:    v_div_scale_f32 v2, s[4:5], v0, v0, v1
+; GFX89-FLUSH-NEXT:    v_div_scale_f32 v3, vcc, v1, v0, v1
 ; GFX89-FLUSH-NEXT:    s_getreg_b32 s4, hwreg(HW_REG_MODE, 4, 2)
-; GFX89-FLUSH-NEXT:    v_rcp_f32_e32 v3, v1
+; GFX89-FLUSH-NEXT:    v_rcp_f32_e32 v4, v2
 ; GFX89-FLUSH-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
-; GFX89-FLUSH-NEXT:    v_fma_f32 v4, -v1, v3, 1.0
-; GFX89-FLUSH-NEXT:    v_fma_f32 v3, v4, v3, v3
-; GFX89-FLUSH-NEXT:    v_mul_f32_e32 v4, v2, v3
-; GFX89-FLUSH-NEXT:    v_fma_f32 v5, -v1, v4, v2
-; GFX89-FLUSH-NEXT:    v_fma_f32 v4, v5, v3, v4
-; GFX89-FLUSH-NEXT:    v_fma_f32 v1, -v1, v4, v2
+; GFX89-FLUSH-NEXT:    v_fma_f32 v5, -v2, v4, 1.0
+; GFX89-FLUSH-NEXT:    v_fma_f32 v4, v5, v4, v4
+; GFX89-FLUSH-NEXT:    v_mul_f32_e32 v5, v3, v4
+; GFX89-FLUSH-NEXT:    v_fma_f32 v6, -v2, v5, v3
+; GFX89-FLUSH-NEXT:    v_fma_f32 v5, v6, v4, v5
+; GFX89-FLUSH-NEXT:    v_fma_f32 v2, -v2, v5, v3
 ; GFX89-FLUSH-NEXT:    s_setreg_b32 hwreg(HW_REG_MODE, 4, 2), s4
-; GFX89-FLUSH-NEXT:    v_div_fmas_f32 v1, v1, v3, v4
-; GFX89-FLUSH-NEXT:    v_div_fixup_f32 v0, v1, v0, s6
+; GFX89-FLUSH-NEXT:    v_div_fmas_f32 v2, v2, v4, v5
+; GFX89-FLUSH-NEXT:    v_div_fixup_f32 v0, v2, v0, v1
 ; GFX89-FLUSH-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-IEEE-LABEL: v_fdiv_f32_constlhs0_dynamic:
@@ -4830,15 +4830,15 @@ define float @v_fdiv_f32_dynamic_25ulp_nodenorm_x(float nofpclass(sub) %x, float
 ; GFX6-IEEE-LABEL: v_fdiv_f32_dynamic_25ulp_nodenorm_x:
 ; GFX6-IEEE:       ; %bb.0:
 ; GFX6-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-IEEE-NEXT:    s_mov_b32 s4, 0x7f800000
+; GFX6-IEEE-NEXT:    v_mov_b32_e32 v3, 0x7f800000
 ; GFX6-IEEE-NEXT:    v_frexp_mant_f32_e32 v2, v1
-; GFX6-IEEE-NEXT:    v_cmp_lt_f32_e64 vcc, |v1|, s4
+; GFX6-IEEE-NEXT:    v_cmp_lt_f32_e64 vcc, |v1|, v3
 ; GFX6-IEEE-NEXT:    v_cndmask_b32_e32 v2, v1, v2, vcc
 ; GFX6-IEEE-NEXT:    v_rcp_f32_e32 v2, v2
-; GFX6-IEEE-NEXT:    v_frexp_mant_f32_e32 v3, v0
-; GFX6-IEEE-NEXT:    v_cmp_lt_f32_e64 vcc, |v0|, s4
+; GFX6-IEEE-NEXT:    v_frexp_mant_f32_e32 v4, v0
+; GFX6-IEEE-NEXT:    v_cmp_lt_f32_e64 vcc, |v0|, v3
 ; GFX6-IEEE-NEXT:    v_frexp_exp_i32_f32_e32 v1, v1
-; GFX6-IEEE-NEXT:    v_cndmask_b32_e32 v3, v0, v3, vcc
+; GFX6-IEEE-NEXT:    v_cndmask_b32_e32 v3, v0, v4, vcc
 ; GFX6-IEEE-NEXT:    v_frexp_exp_i32_f32_e32 v0, v0
 ; GFX6-IEEE-NEXT:    v_mul_f32_e32 v2, v3, v2
 ; GFX6-IEEE-NEXT:    v_sub_i32_e32 v0, vcc, v0, v1
@@ -4848,15 +4848,15 @@ define float @v_fdiv_f32_dynamic_25ulp_nodenorm_x(float nofpclass(sub) %x, float
 ; GFX6-FLUSH-LABEL: v_fdiv_f32_dynamic_25ulp_nodenorm_x:
 ; GFX6-FLUSH:       ; %bb.0:
 ; GFX6-FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-FLUSH-NEXT:    s_mov_b32 s4, 0x7f800000
+; GFX6-FLUSH-NEXT:    v_mov_b32_e32 v3, 0x7f800000
 ; GFX6-FLUSH-NEXT:    v_frexp_mant_f32_e32 v2, v1
-; GFX6-FLUSH-NEXT:    v_cmp_lt_f32_e64 vcc, |v1|, s4
+; GFX6-FLUSH-NEXT:    v_cmp_lt_f32_e64 vcc, |v1|, v3
 ; GFX6-FLUSH-NEXT:    v_cndmask_b32_e32 v2, v1, v2, vcc
 ; GFX6-FLUSH-NEXT:    v_rcp_f32_e32 v2, v2
-; GFX6-FLUSH-NEXT:    v_frexp_mant_f32_e32 v3, v0
-; GFX6-FLUSH-NEXT:    v_cmp_lt_f32_e64 vcc, |v0|, s4
+; GFX6-FLUSH-NEXT:    v_frexp_mant_f32_e32 v4, v0
+; GFX6-FLUSH-NEXT:    v_cmp_lt_f32_e64 vcc, |v0|, v3
 ; GFX6-FLUSH-NEXT:    v_frexp_exp_i32_f32_e32 v1, v1
-; GFX6-FLUSH-NEXT:    v_cndmask_b32_e32 v3, v0, v3, vcc
+; GFX6-FLUSH-NEXT:    v_cndmask_b32_e32 v3, v0, v4, vcc
 ; GFX6-FLUSH-NEXT:    v_frexp_exp_i32_f32_e32 v0, v0
 ; GFX6-FLUSH-NEXT:    v_mul_f32_e32 v2, v3, v2
 ; GFX6-FLUSH-NEXT:    v_sub_i32_e32 v0, vcc, v0, v1
@@ -5219,15 +5219,15 @@ define float @v_fdiv_f32_dynamic_25ulp_nodenorm_y(float %x, float nofpclass(sub)
 ; GFX6-IEEE-LABEL: v_fdiv_f32_dynamic_25ulp_nodenorm_y:
 ; GFX6-IEEE:       ; %bb.0:
 ; GFX6-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-IEEE-NEXT:    s_mov_b32 s4, 0x7f800000
+; GFX6-IEEE-NEXT:    v_mov_b32_e32 v3, 0x7f800000
 ; GFX6-IEEE-NEXT:    v_frexp_mant_f32_e32 v2, v1
-; GFX6-IEEE-NEXT:    v_cmp_lt_f32_e64 vcc, |v1|, s4
+; GFX6-IEEE-NEXT:    v_cmp_lt_f32_e64 vcc, |v1|, v3
 ; GFX6-IEEE-NEXT:    v_cndmask_b32_e32 v2, v1, v2, vcc
 ; GFX6-IEEE-NEXT:    v_rcp_f32_e32 v2, v2
-; GFX6-IEEE-NEXT:    v_frexp_mant_f32_e32 v3, v0
-; GFX6-IEEE-NEXT:    v_cmp_lt_f32_e64 vcc, |v0|, s4
+; GFX6-IEEE-NEXT:    v_frexp_mant_f32_e32 v4, v0
+; GFX6-IEEE-NEXT:    v_cmp_lt_f32_e64 vcc, |v0|, v3
 ; GFX6-IEEE-NEXT:    v_frexp_exp_i32_f32_e32 v1, v1
-; GFX6-IEEE-NEXT:    v_cndmask_b32_e32 v3, v0, v3, vcc
+; GFX6-IEEE-NEXT:    v_cndmask_b32_e32 v3, v0, v4, vcc
 ; GFX6-IEEE-NEXT:    v_frexp_exp_i32_f32_e32 v0, v0
 ; GFX6-IEEE-NEXT:    v_mul_f32_e32 v2, v3, v2
 ; GFX6-IEEE-NEXT:    v_sub_i32_e32 v0, vcc, v0, v1
@@ -5237,15 +5237,15 @@ define float @v_fdiv_f32_dynamic_25ulp_nodenorm_y(float %x, float nofpclass(sub)
 ; GFX6-FLUSH-LABEL: v_fdiv_f32_dynamic_25ulp_nodenorm_y:
 ; GFX6-FLUSH:       ; %bb.0:
 ; GFX6-FLUSH-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-FLUSH-NEXT:    s_mov_b32 s4, 0x7f800000
+; GFX6-FLUSH-NEXT:    v_mov_b32_e32 v3, 0x7f800000
 ; GFX6-FLUSH-NEXT:    v_frexp_mant_f32_e32 v2, v1
-; GFX6-FLUSH-NEXT:    v_cmp_lt_f32_e64 vcc, |v1|, s4
+; GFX6-FLUSH-NEXT:    v_cmp_lt_f32_e64 vcc, |v1|, v3
 ; GFX6-FLUSH-NEXT:    v_cndmask_b32_e32 v2, v1, v2, vcc
 ; GFX6-FLUSH-NEXT:    v_rcp_f32_e32 v2, v2
-; GFX6-FLUSH-NEXT:    v_frexp_mant_f32_e32 v3, v0
-; GFX6-FLUSH-NEXT:    v_cmp_lt_f32_e64 vcc, |v0|, s4
+; GFX6-FLUSH-NEXT:    v_frexp_mant_f32_e32 v4, v0
+; GFX6-FLUSH-NEXT:    v_cmp_lt_f32_e64 vcc, |v0|, v3
 ; GFX6-FLUSH-NEXT:    v_frexp_exp_i32_f32_e32 v1, v1
-; GFX6-FLUSH-NEXT:    v_cndmask_b32_e32 v3, v0, v3, vcc
+; GFX6-FLUSH-NEXT:    v_cndmask_b32_e32 v3, v0, v4, vcc
 ; GFX6-FLUSH-NEXT:    v_frexp_exp_i32_f32_e32 v0, v0
 ; GFX6-FLUSH-NEXT:    v_mul_f32_e32 v2, v3, v2
 ; GFX6-FLUSH-NEXT:    v_sub_i32_e32 v0, vcc, v0, v1
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fpow.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fpow.ll
index 73c0a0c92b2e9f4..1c40f7992bfe14c 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fpow.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fpow.ll
@@ -126,14 +126,13 @@ define <2 x float> @v_pow_v2f32(<2 x float> %x, <2 x float> %y) {
 ; GFX6-LABEL: v_pow_v2f32:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT:    s_mov_b32 s4, 0x800000
-; GFX6-NEXT:    v_mov_b32_e32 v4, 0x4f800000
-; GFX6-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v0
-; GFX6-NEXT:    v_cndmask_b32_e32 v5, 1.0, v4, vcc
-; GFX6-NEXT:    v_mul_f32_e32 v0, v0, v5
-; GFX6-NEXT:    v_mov_b32_e32 v5, 0x800000
-; GFX6-NEXT:    v_cmp_lt_f32_e64 s[4:5], v1, v5
-; GFX6-NEXT:    v_cndmask_b32_e64 v4, 1.0, v4, s[4:5]
+; GFX6-NEXT:    v_mov_b32_e32 v4, 0x800000
+; GFX6-NEXT:    v_mov_b32_e32 v5, 0x4f800000
+; GFX6-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v4
+; GFX6-NEXT:    v_cndmask_b32_e32 v6, 1.0, v5, vcc
+; GFX6-NEXT:    v_cmp_lt_f32_e64 s[4:5], v1, v4
+; GFX6-NEXT:    v_mul_f32_e32 v0, v0, v6
+; GFX6-NEXT:    v_cndmask_b32_e64 v4, 1.0, v5, s[4:5]
 ; GFX6-NEXT:    v_log_f32_e32 v0, v0
 ; GFX6-NEXT:    v_mul_f32_e32 v1, v1, v4
 ; GFX6-NEXT:    v_log_f32_e32 v1, v1
@@ -142,15 +141,15 @@ define <2 x float> @v_pow_v2f32(<2 x float> %x, <2 x float> %y) {
 ; GFX6-NEXT:    v_sub_f32_e32 v0, v0, v7
 ; GFX6-NEXT:    v_cndmask_b32_e64 v5, 0, v6, s[4:5]
 ; GFX6-NEXT:    v_mul_legacy_f32_e32 v0, v0, v2
-; GFX6-NEXT:    s_mov_b32 s6, 0xc2fc0000
+; GFX6-NEXT:    v_mov_b32_e32 v2, 0xc2fc0000
 ; GFX6-NEXT:    v_sub_f32_e32 v1, v1, v5
-; GFX6-NEXT:    v_mov_b32_e32 v2, 0x42800000
-; GFX6-NEXT:    v_cmp_gt_f32_e32 vcc, s6, v0
+; GFX6-NEXT:    v_mov_b32_e32 v7, 0x42800000
+; GFX6-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v2
 ; GFX6-NEXT:    v_mul_legacy_f32_e32 v1, v1, v3
-; GFX6-NEXT:    v_cndmask_b32_e32 v7, 0, v2, vcc
-; GFX6-NEXT:    v_cmp_gt_f32_e64 s[4:5], s6, v1
-; GFX6-NEXT:    v_add_f32_e32 v0, v0, v7
-; GFX6-NEXT:    v_cndmask_b32_e64 v2, 0, v2, s[4:5]
+; GFX6-NEXT:    v_cndmask_b32_e32 v8, 0, v7, vcc
+; GFX6-NEXT:    v_cmp_lt_f32_e64 s[4:5], v1, v2
+; GFX6-NEXT:    v_add_f32_e32 v0, v0, v8
+; GFX6-NEXT:    v_cndmask_b32_e64 v2, 0, v7, s[4:5]
 ; GFX6-NEXT:    v_exp_f32_e32 v0, v0
 ; GFX6-NEXT:    v_add_f32_e32 v1, v1, v2
 ; GFX6-NEXT:    v_exp_f32_e32 v1, v1
@@ -164,14 +163,13 @@ define <2 x float> @v_pow_v2f32(<2 x float> %x, <2 x float> %y) {
 ; GFX8-LABEL: v_pow_v2f32:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    s_mov_b32 s4, 0x800000
-; GFX8-NEXT:    v_mov_b32_e32 v4, 0x4f800000
-; GFX8-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v0
-; GFX8-NEXT:    v_cndmask_b32_e32 v5, 1.0, v4, vcc
-; GFX8-NEXT:    v_mul_f32_e32 v0, v0, v5
-; GFX8-NEXT:    v_mov_b32_e32 v5, 0x800000
-; GFX8-NEXT:    v_cmp_lt_f32_e64 s[4:5], v1, v5
-; GFX8-NEXT:    v_cndmask_b32_e64 v4, 1.0, v4, s[4:5]
+; GFX8-NEXT:    v_mov_b32_e32 v4, 0x800000
+; GFX8-NEXT:    v_mov_b32_e32 v5, 0x4f800000
+; GFX8-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v4
+; GFX8-NEXT:    v_cndmask_b32_e32 v6, 1.0, v5, vcc
+; GFX8-NEXT:    v_cmp_lt_f32_e64 s[4:5], v1, v4
+; GFX8-NEXT:    v_mul_f32_e32 v0, v0, v6
+; GFX8-NEXT:    v_cndmask_b32_e64 v4, 1.0, v5, s[4:5]
 ; GFX8-NEXT:    v_log_f32_e32 v0, v0
 ; GFX8-NEXT:    v_mul_f32_e32 v1, v1, v4
 ; GFX8-NEXT:    v_log_f32_e32 v1, v1
@@ -180,15 +178,15 @@ define <2 x float> @v_pow_v2f32(<2 x float> %x, <2 x float> %y) {
 ; GFX8-NEXT:    v_sub_f32_e32 v0, v0, v7
 ; GFX8-NEXT:    v_cndmask_b32_e64 v5, 0, v6, s[4:5]
 ; GFX8-NEXT:    v_mul_legacy_f32_e32 v0, v0, v2
-; GFX8-NEXT:    s_mov_b32 s6, 0xc2fc0000
+; GFX8-NEXT:    v_mov_b32_e32 v2, 0xc2fc0000
 ; GFX8-NEXT:    v_sub_f32_e32 v1, v1, v5
-; GFX8-NEXT:    v_mov_b32_e32 v2, 0x42800000
-; GFX8-NEXT:    v_cmp_gt_f32_e32 vcc, s6, v0
+; GFX8-NEXT:    v_mov_b32_e32 v7, 0x42800000
+; GFX8-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v2
 ; GFX8-NEXT:    v_mul_legacy_f32_e32 v1, v1, v3
-; GFX8-NEXT:    v_cndmask_b32_e32 v7, 0, v2, vcc
-; GFX8-NEXT:    v_cmp_gt_f32_e64 s[4:5], s6, v1
-; GFX8-NEXT:    v_add_f32_e32 v0, v0, v7
-; GFX8-NEXT:    v_cndmask_b32_e64 v2, 0, v2, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e32 v8, 0, v7, vcc
+; GFX8-NEXT:    v_cmp_lt_f32_e64 s[4:5], v1, v2
+; GFX8-NEXT:    v_add_f32_e32 v0, v0, v8
+; GFX8-NEXT:    v_cndmask_b32_e64 v2, 0, v7, s[4:5]
 ; GFX8-NEXT:    v_exp_f32_e32 v0, v0
 ; GFX8-NEXT:    v_add_f32_e32 v1, v1, v2
 ; GFX8-NEXT:    v_exp_f32_e32 v1, v1
@@ -202,14 +200,13 @@ define <2 x float> @v_pow_v2f32(<2 x float> %x, <2 x float> %y) {
 ; GFX9-LABEL: v_pow_v2f32:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    s_mov_b32 s4, 0x800000
-; GFX9-NEXT:    v_mov_b32_e32 v4, 0x4f800000
-; GFX9-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v0
-; GFX9-NEXT:    v_cndmask_b32_e32 v5, 1.0, v4, vcc
-; GFX9-NEXT:    v_mul_f32_e32 v0, v0, v5
-; GFX9-NEXT:    v_mov_b32_e32 v5, 0x800000
-; GFX9-NEXT:    v_cmp_lt_f32_e64 s[4:5], v1, v5
-; GFX9-NEXT:    v_cndmask_b32_e64 v4, 1.0, v4, s[4:5]
+; GFX9-NEXT:    v_mov_b32_e32 v4, 0x800000
+; GFX9-NEXT:    v_mov_b32_e32 v5, 0x4f800000
+; GFX9-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v4
+; GFX9-NEXT:    v_cndmask_b32_e32 v6, 1.0, v5, vcc
+; GFX9-NEXT:    v_cmp_lt_f32_e64 s[4:5], v1, v4
+; GFX9-NEXT:    v_mul_f32_e32 v0, v0, v6
+; GFX9-NEXT:    v_cndmask_b32_e64 v4, 1.0, v5, s[4:5]
 ; GFX9-NEXT:    v_log_f32_e32 v0, v0
 ; GFX9-NEXT:    v_mul_f32_e32 v1, v1, v4
 ; GFX9-NEXT:    v_log_f32_e32 v1, v1
@@ -218,15 +215,15 @@ define <2 x float> @v_pow_v2f32(<2 x float> %x, <2 x float> %y) {
 ; GFX9-NEXT:    v_sub_f32_e32 v0, v0, v7
 ; GFX9-NEXT:    v_cndmask_b32_e64 v5, 0, v6, s[4:5]
 ; GFX9-NEXT:    v_mul_legacy_f32_e32 v0, v0, v2
-; GFX9-NEXT:    s_mov_b32 s6, 0xc2fc0000
+; GFX9-NEXT:    v_mov_b32_e32 v2, 0xc2fc0000
 ; GFX9-NEXT:    v_sub_f32_e32 v1, v1, v5
-; GFX9-NEXT:    v_mov_b32_e32 v2, 0x42800000
-; GFX9-NEXT:    v_cmp_gt_f32_e32 vcc, s6, v0
+; GFX9-NEXT:    v_mov_b32_e32 v7, 0x42800000
+; GFX9-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v2
 ; GFX9-NEXT:    v_mul_legacy_f32_e32 v1, v1, v3
-; GFX9-NEXT:    v_cndmask_b32_e32 v7, 0, v2, vcc
-; GFX9-NEXT:    v_cmp_gt_f32_e64 s[4:5], s6, v1
-; GFX9-NEXT:    v_add_f32_e32 v0, v0, v7
-; GFX9-NEXT:    v_cndmask_b32_e64 v2, 0, v2, s[4:5]
+; GFX9-NEXT:    v_cndmask_b32_e32 v8, 0, v7, vcc
+; GFX9-NEXT:    v_cmp_lt_f32_e64 s[4:5], v1, v2
+; GFX9-NEXT:    v_add_f32_e32 v0, v0, v8
+; GFX9-NEXT:    v_cndmask_b32_e64 v2, 0, v7, s[4:5]
 ; GFX9-NEXT:    v_exp_f32_e32 v0, v0
 ; GFX9-NEXT:    v_add_f32_e32 v1, v1, v2
 ; GFX9-NEXT:    v_exp_f32_e32 v1, v1
@@ -382,25 +379,25 @@ define <2 x half> @v_pow_v2f16(<2 x half> %x, <2 x half> %y) {
 ; GFX6-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GFX6-NEXT:    v_cvt_f32_f16_e32 v2, v2
 ; GFX6-NEXT:    v_cvt_f32_f16_e32 v1, v1
-; GFX6-NEXT:    s_mov_b32 s4, 0xc2fc0000
+; GFX6-NEXT:    v_mov_b32_e32 v4, 0xc2fc0000
 ; GFX6-NEXT:    v_log_f32_e32 v0, v0
-; GFX6-NEXT:    v_mov_b32_e32 v4, 0x42800000
+; GFX6-NEXT:    v_mov_b32_e32 v5, 0x42800000
 ; GFX6-NEXT:    v_log_f32_e32 v1, v1
 ; GFX6-NEXT:    v_mul_legacy_f32_e32 v0, v0, v2
-; GFX6-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v0
-; GFX6-NEXT:    v_cndmask_b32_e32 v2, 0, v4, vcc
+; GFX6-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v4
+; GFX6-NEXT:    v_cndmask_b32_e32 v2, 0, v5, vcc
 ; GFX6-NEXT:    v_add_f32_e32 v0, v0, v2
 ; GFX6-NEXT:    v_cvt_f32_f16_e32 v2, v3
 ; GFX6-NEXT:    v_mov_b32_e32 v3, 0x1f800000
-; GFX6-NEXT:    v_cndmask_b32_e32 v5, 1.0, v3, vcc
+; GFX6-NEXT:    v_cndmask_b32_e32 v6, 1.0, v3, vcc
 ; GFX6-NEXT:    v_exp_f32_e32 v0, v0
 ; GFX6-NEXT:    v_mul_legacy_f32_e32 v1, v1, v2
-; GFX6-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v1
-; GFX6-NEXT:    v_cndmask_b32_e32 v2, 0, v4, vcc
+; GFX6-NEXT:    v_cmp_lt_f32_e32 vcc, v1, v4
+; GFX6-NEXT:    v_cndmask_b32_e32 v2, 0, v5, vcc
 ; GFX6-NEXT:    v_add_f32_e32 v1, v1, v2
 ; GFX6-NEXT:    v_exp_f32_e32 v1, v1
 ; GFX6-NEXT:    v_cndmask_b32_e32 v2, 1.0, v3, vcc
-; GFX6-NEXT:    v_mul_f32_e32 v0, v0, v5
+; GFX6-NEXT:    v_mul_f32_e32 v0, v0, v6
 ; GFX6-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; GFX6-NEXT:    v_mul_f32_e32 v1, v1, v2
 ; GFX6-NEXT:    v_cvt_f16_f32_e32 v1, v1
@@ -505,22 +502,22 @@ define <2 x half> @v_pow_v2f16_fneg_lhs(<2 x half> %x, <2 x half> %y) {
 ; GFX6-NEXT:    v_log_f32_e32 v1, v1
 ; GFX6-NEXT:    v_cvt_f32_f16_e32 v3, v3
 ; GFX6-NEXT:    v_log_f32_e32 v0, v0
-; GFX6-NEXT:    s_mov_b32 s4, 0xc2fc0000
+; GFX6-NEXT:    v_mov_b32_e32 v4, 0x42800000
 ; GFX6-NEXT:    v_mul_legacy_f32_e32 v1, v1, v2
-; GFX6-NEXT:    v_mov_b32_e32 v2, 0x42800000
-; GFX6-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v1
-; GFX6-NEXT:    v_cndmask_b32_e32 v4, 0, v2, vcc
-; GFX6-NEXT:    v_add_f32_e32 v1, v1, v4
-; GFX6-NEXT:    v_mov_b32_e32 v4, 0x1f800000
+; GFX6-NEXT:    v_mov_b32_e32 v2, 0xc2fc0000
+; GFX6-NEXT:    v_cmp_lt_f32_e32 vcc, v1, v2
+; GFX6-NEXT:    v_cndmask_b32_e32 v5, 0, v4, vcc
+; GFX6-NEXT:    v_add_f32_e32 v1, v1, v5
+; GFX6-NEXT:    v_mov_b32_e32 v5, 0x1f800000
 ; GFX6-NEXT:    v_mul_legacy_f32_e32 v0, v0, v3
-; GFX6-NEXT:    v_cndmask_b32_e32 v5, 1.0, v4, vcc
-; GFX6-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v0
-; GFX6-NEXT:    v_cndmask_b32_e32 v2, 0, v2, vcc
+; GFX6-NEXT:    v_cndmask_b32_e32 v6, 1.0, v5, vcc
+; GFX6-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v2
+; GFX6-NEXT:    v_cndmask_b32_e32 v2, 0, v4, vcc
 ; GFX6-NEXT:    v_exp_f32_e32 v1, v1
 ; GFX6-NEXT:    v_add_f32_e32 v0, v0, v2
 ; GFX6-NEXT:    v_exp_f32_e32 v2, v0
-; GFX6-NEXT:    v_mul_f32_e32 v0, v1, v5
-; GFX6-NEXT:    v_cndmask_b32_e32 v1, 1.0, v4, vcc
+; GFX6-NEXT:    v_mul_f32_e32 v0, v1, v6
+; GFX6-NEXT:    v_cndmask_b32_e32 v1, 1.0, v5, vcc
 ; GFX6-NEXT:    v_mul_f32_e32 v1, v2, v1
 ; GFX6-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; GFX6-NEXT:    v_cvt_f16_f32_e32 v1, v1
@@ -632,21 +629,21 @@ define <2 x half> @v_pow_v2f16_fneg_rhs(<2 x half> %x, <2 x half> %y) {
 ; GFX6-NEXT:    v_cvt_f32_f16_e32 v2, v2
 ; GFX6-NEXT:    v_log_f32_e32 v1, v1
 ; GFX6-NEXT:    v_mul_legacy_f32_e32 v0, v0, v3
-; GFX6-NEXT:    s_mov_b32 s4, 0xc2fc0000
-; GFX6-NEXT:    v_mov_b32_e32 v3, 0x42800000
-; GFX6-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v0
-; GFX6-NEXT:    v_cndmask_b32_e32 v4, 0, v3, vcc
-; GFX6-NEXT:    v_add_f32_e32 v0, v0, v4
-; GFX6-NEXT:    v_mov_b32_e32 v4, 0x1f800000
+; GFX6-NEXT:    v_mov_b32_e32 v3, 0xc2fc0000
+; GFX6-NEXT:    v_mov_b32_e32 v4, 0x42800000
+; GFX6-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v3
+; GFX6-NEXT:    v_cndmask_b32_e32 v5, 0, v4, vcc
+; GFX6-NEXT:    v_add_f32_e32 v0, v0, v5
+; GFX6-NEXT:    v_mov_b32_e32 v5, 0x1f800000
 ; GFX6-NEXT:    v_mul_legacy_f32_e32 v1, v1, v2
-; GFX6-NEXT:    v_cndmask_b32_e32 v5, 1.0, v4, vcc
-; GFX6-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v1
-; GFX6-NEXT:    v_cndmask_b32_e32 v2, 0, v3, vcc
+; GFX6-NEXT:    v_cndmask_b32_e32 v6, 1.0, v5, vcc
+; GFX6-NEXT:    v_cmp_lt_f32_e32 vcc, v1, v3
+; GFX6-NEXT:    v_cndmask_b32_e32 v2, 0, v4, vcc
 ; GFX6-NEXT:    v_add_f32_e32 v1, v1, v2
 ; GFX6-NEXT:    v_exp_f32_e32 v0, v0
 ; GFX6-NEXT:    v_exp_f32_e32 v1, v1
-; GFX6-NEXT:    v_cndmask_b32_e32 v2, 1.0, v4, vcc
-; GFX6-NEXT:    v_mul_f32_e32 v0, v0, v5
+; GFX6-NEXT:    v_cndmask_b32_e32 v2, 1.0, v5, vcc
+; GFX6-NEXT:    v_mul_f32_e32 v0, v0, v6
 ; GFX6-NEXT:    v_mul_f32_e32 v1, v1, v2
 ; GFX6-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; GFX6-NEXT:    v_cvt_f16_f32_e32 v1, v1
@@ -762,21 +759,21 @@ define <2 x half> @v_pow_v2f16_fneg_lhs_rhs(<2 x half> %x, <2 x half> %y) {
 ; GFX6-NEXT:    v_cvt_f32_f16_e32 v1, v1
 ; GFX6-NEXT:    v_log_f32_e32 v0, v0
 ; GFX6-NEXT:    v_mul_legacy_f32_e32 v2, v3, v2
-; GFX6-NEXT:    s_mov_b32 s4, 0xc2fc0000
-; GFX6-NEXT:    v_mov_b32_e32 v3, 0x42800000
-; GFX6-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v2
-; GFX6-NEXT:    v_cndmask_b32_e32 v4, 0, v3, vcc
-; GFX6-NEXT:    v_add_f32_e32 v2, v2, v4
-; GFX6-NEXT:    v_mov_b32_e32 v4, 0x1f800000
+; GFX6-NEXT:    v_mov_b32_e32 v3, 0xc2fc0000
+; GFX6-NEXT:    v_mov_b32_e32 v4, 0x42800000
+; GFX6-NEXT:    v_cmp_lt_f32_e32 vcc, v2, v3
+; GFX6-NEXT:    v_cndmask_b32_e32 v5, 0, v4, vcc
+; GFX6-NEXT:    v_add_f32_e32 v2, v2, v5
+; GFX6-NEXT:    v_mov_b32_e32 v5, 0x1f800000
 ; GFX6-NEXT:    v_mul_legacy_f32_e32 v0, v0, v1
-; GFX6-NEXT:    v_cndmask_b32_e32 v5, 1.0, v4, vcc
-; GFX6-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v0
-; GFX6-NEXT:    v_cndmask_b32_e32 v1, 0, v3, vcc
+; GFX6-NEXT:    v_cndmask_b32_e32 v6, 1.0, v5, vcc
+; GFX6-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v3
+; GFX6-NEXT:    v_cndmask_b32_e32 v1, 0, v4, vcc
 ; GFX6-NEXT:    v_exp_f32_e32 v2, v2
 ; GFX6-NEXT:    v_add_f32_e32 v0, v0, v1
 ; GFX6-NEXT:    v_exp_f32_e32 v1, v0
-; GFX6-NEXT:    v_mul_f32_e32 v0, v2, v5
-; GFX6-NEXT:    v_cndmask_b32_e32 v2, 1.0, v4, vcc
+; GFX6-NEXT:    v_mul_f32_e32 v0, v2, v6
+; GFX6-NEXT:    v_cndmask_b32_e32 v2, 1.0, v5, vcc
 ; GFX6-NEXT:    v_mul_f32_e32 v1, v1, v2
 ; GFX6-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; GFX6-NEXT:    v_cvt_f16_f32_e32 v1, v1
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll
index 525075e516d21bc..f9b98059be0b3a4 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll
@@ -539,9 +539,9 @@ define i8 @v_fshl_i8_4(i8 %lhs, i8 %rhs) {
 ; GFX9-LABEL: v_fshl_i8_4:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    s_mov_b32 s4, 4
+; GFX9-NEXT:    v_mov_b32_e32 v2, 4
 ; GFX9-NEXT:    v_lshlrev_b16_e32 v0, 4, v0
-; GFX9-NEXT:    v_lshrrev_b16_sdwa v1, s4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT:    v_lshrrev_b16_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; GFX9-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -860,22 +860,22 @@ define i16 @v_fshl_v2i8(i16 %lhs.arg, i16 %rhs.arg, i16 %amt.arg) {
 ; GFX9-LABEL: v_fshl_v2i8:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_lshrrev_b32_e32 v5, 8, v2
 ; GFX9-NEXT:    v_and_b32_e32 v6, 7, v2
+; GFX9-NEXT:    v_lshrrev_b32_e32 v3, 8, v0
+; GFX9-NEXT:    v_lshrrev_b32_e32 v5, 8, v2
 ; GFX9-NEXT:    v_not_b32_e32 v2, v2
-; GFX9-NEXT:    s_mov_b32 s4, 1
+; GFX9-NEXT:    v_lshlrev_b16_e32 v0, v6, v0
+; GFX9-NEXT:    v_mov_b32_e32 v6, 1
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v4, 8, v1
 ; GFX9-NEXT:    v_and_b32_e32 v2, 7, v2
-; GFX9-NEXT:    v_lshrrev_b16_sdwa v1, s4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX9-NEXT:    v_lshrrev_b32_e32 v3, 8, v0
-; GFX9-NEXT:    v_lshlrev_b16_e32 v0, v6, v0
+; GFX9-NEXT:    v_lshrrev_b16_sdwa v1, v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; GFX9-NEXT:    v_lshrrev_b16_e32 v1, v2, v1
 ; GFX9-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX9-NEXT:    v_and_b32_e32 v1, 7, v5
 ; GFX9-NEXT:    v_not_b32_e32 v2, v5
 ; GFX9-NEXT:    v_and_b32_e32 v2, 7, v2
 ; GFX9-NEXT:    v_lshlrev_b16_e32 v1, v1, v3
-; GFX9-NEXT:    v_lshrrev_b16_sdwa v3, s4, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT:    v_lshrrev_b16_sdwa v3, v6, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; GFX9-NEXT:    v_lshrrev_b16_e32 v2, v2, v3
 ; GFX9-NEXT:    v_or_b32_e32 v1, v1, v2
 ; GFX9-NEXT:    v_and_b32_e32 v1, 0xff, v1
@@ -901,12 +901,12 @@ define i16 @v_fshl_v2i8(i16 %lhs.arg, i16 %rhs.arg, i16 %amt.arg) {
 ; GFX10-NEXT:    v_lshrrev_b16 v1, 1, v1
 ; GFX10-NEXT:    v_lshlrev_b16 v3, v3, v5
 ; GFX10-NEXT:    v_lshlrev_b16 v0, v2, v0
-; GFX10-NEXT:    s_movk_i32 s4, 0xff
 ; GFX10-NEXT:    v_lshrrev_b16 v4, v6, v4
 ; GFX10-NEXT:    v_lshrrev_b16 v1, v7, v1
 ; GFX10-NEXT:    v_or_b32_e32 v2, v3, v4
+; GFX10-NEXT:    v_mov_b32_e32 v3, 0xff
 ; GFX10-NEXT:    v_or_b32_e32 v0, v0, v1
-; GFX10-NEXT:    v_and_b32_sdwa v1, v2, s4 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX10-NEXT:    v_and_b32_sdwa v1, v2, v3 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; GFX10-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -1299,13 +1299,12 @@ define i32 @v_fshl_v4i8(i32 %lhs.arg, i32 %rhs.arg, i32 %amt.arg) {
 ; GFX8-NEXT:    v_lshrrev_b16_e32 v6, 1, v6
 ; GFX8-NEXT:    v_lshlrev_b16_sdwa v4, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; GFX8-NEXT:    v_lshrrev_b16_e32 v5, v5, v6
+; GFX8-NEXT:    v_not_b32_e32 v6, v7
 ; GFX8-NEXT:    v_or_b32_e32 v4, v4, v5
 ; GFX8-NEXT:    v_and_b32_e32 v5, 7, v7
-; GFX8-NEXT:    v_not_b32_e32 v6, v7
-; GFX8-NEXT:    v_lshlrev_b16_sdwa v0, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
-; GFX8-NEXT:    v_mov_b32_e32 v5, 1
 ; GFX8-NEXT:    v_and_b32_e32 v6, 7, v6
-; GFX8-NEXT:    v_lshrrev_b16_sdwa v1, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
+; GFX8-NEXT:    v_lshrrev_b16_sdwa v1, v10, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
+; GFX8-NEXT:    v_lshlrev_b16_sdwa v0, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
 ; GFX8-NEXT:    v_lshrrev_b16_e32 v1, v6, v1
 ; GFX8-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX8-NEXT:    v_mov_b32_e32 v1, 8
@@ -1327,18 +1326,18 @@ define i32 @v_fshl_v4i8(i32 %lhs.arg, i32 %rhs.arg, i32 %amt.arg) {
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v7, 24, v2
 ; GFX9-NEXT:    v_and_b32_e32 v8, 7, v2
 ; GFX9-NEXT:    v_not_b32_e32 v2, v2
-; GFX9-NEXT:    s_mov_b32 s4, 1
+; GFX9-NEXT:    v_mov_b32_e32 v10, 1
 ; GFX9-NEXT:    v_and_b32_e32 v2, 7, v2
-; GFX9-NEXT:    v_lshrrev_b16_sdwa v10, s4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT:    v_lshrrev_b16_sdwa v11, v10, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; GFX9-NEXT:    v_lshlrev_b16_e32 v8, v8, v0
-; GFX9-NEXT:    v_lshrrev_b16_e32 v2, v2, v10
+; GFX9-NEXT:    v_lshrrev_b16_e32 v2, v2, v11
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v4, 8, v1
 ; GFX9-NEXT:    v_or_b32_e32 v2, v8, v2
 ; GFX9-NEXT:    v_and_b32_e32 v8, 7, v5
 ; GFX9-NEXT:    v_not_b32_e32 v5, v5
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v3, 8, v0
 ; GFX9-NEXT:    v_and_b32_e32 v5, 7, v5
-; GFX9-NEXT:    v_lshrrev_b16_sdwa v4, s4, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT:    v_lshrrev_b16_sdwa v4, v10, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; GFX9-NEXT:    v_mov_b32_e32 v9, 0xff
 ; GFX9-NEXT:    v_lshlrev_b16_e32 v3, v8, v3
 ; GFX9-NEXT:    v_lshrrev_b16_e32 v4, v5, v4
@@ -1350,19 +1349,17 @@ define i32 @v_fshl_v4i8(i32 %lhs.arg, i32 %rhs.arg, i32 %amt.arg) {
 ; GFX9-NEXT:    v_lshrrev_b16_e32 v6, 1, v6
 ; GFX9-NEXT:    v_lshlrev_b16_sdwa v4, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; GFX9-NEXT:    v_lshrrev_b16_e32 v5, v5, v6
+; GFX9-NEXT:    v_not_b32_e32 v6, v7
 ; GFX9-NEXT:    v_or_b32_e32 v4, v4, v5
 ; GFX9-NEXT:    v_and_b32_e32 v5, 7, v7
-; GFX9-NEXT:    v_not_b32_e32 v6, v7
-; GFX9-NEXT:    v_lshlrev_b16_sdwa v0, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
-; GFX9-NEXT:    v_mov_b32_e32 v5, 1
 ; GFX9-NEXT:    v_and_b32_e32 v6, 7, v6
-; GFX9-NEXT:    v_lshrrev_b16_sdwa v1, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
+; GFX9-NEXT:    v_lshrrev_b16_sdwa v1, v10, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
+; GFX9-NEXT:    v_lshlrev_b16_sdwa v0, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
 ; GFX9-NEXT:    v_lshrrev_b16_e32 v1, v6, v1
 ; GFX9-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 8
-; GFX9-NEXT:    s_movk_i32 s5, 0xff
 ; GFX9-NEXT:    v_lshlrev_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX9-NEXT:    v_and_or_b32 v1, v2, s5, v1
+; GFX9-NEXT:    v_and_or_b32 v1, v2, v9, v1
 ; GFX9-NEXT:    v_and_b32_e32 v2, 0xff, v4
 ; GFX9-NEXT:    v_and_b32_e32 v0, 0xff, v0
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
@@ -1417,7 +1414,7 @@ define i32 @v_fshl_v4i8(i32 %lhs.arg, i32 %rhs.arg, i32 %amt.arg) {
 ; GFX10-NEXT:    v_lshlrev_b32_sdwa v3, v6, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; GFX10-NEXT:    v_and_b32_e32 v1, 0xff, v1
 ; GFX10-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX10-NEXT:    v_and_or_b32 v0, v0, 0xff, v3
+; GFX10-NEXT:    v_and_or_b32 v0, 0xff, v0, v3
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 24, v2
 ; GFX10-NEXT:    v_or3_b32 v0, v0, v1, v2
@@ -1473,7 +1470,7 @@ define i32 @v_fshl_v4i8(i32 %lhs.arg, i32 %rhs.arg, i32 %amt.arg) {
 ; GFX11-NEXT:    v_and_b32_e32 v2, 0xff, v3
 ; GFX11-NEXT:    v_and_b32_e32 v3, 0xff, v4
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_and_or_b32 v0, v0, 0xff, v1
+; GFX11-NEXT:    v_and_or_b32 v0, 0xff, v0, v1
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v2
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 24, v3
@@ -1826,10 +1823,10 @@ define amdgpu_ps i48 @s_fshl_v2i24(i48 inreg %lhs.arg, i48 inreg %rhs.arg, i48 i
 ; GFX6-NEXT:    s_and_b32 s6, s8, 0xff
 ; GFX6-NEXT:    s_or_b32 s1, s7, s1
 ; GFX6-NEXT:    s_and_b32 s6, 0xffff, s6
-; GFX6-NEXT:    v_mul_f32_e32 v1, 0x4f7ffffe, v0
+; GFX6-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
 ; GFX6-NEXT:    s_and_b32 s1, 0xffff, s1
 ; GFX6-NEXT:    s_lshl_b32 s6, s6, 16
-; GFX6-NEXT:    v_cvt_u32_f32_e32 v1, v1
+; GFX6-NEXT:    v_cvt_u32_f32_e32 v0, v0
 ; GFX6-NEXT:    s_or_b32 s1, s1, s6
 ; GFX6-NEXT:    s_lshr_b32 s6, s2, 16
 ; GFX6-NEXT:    s_lshr_b32 s7, s2, 24
@@ -1839,12 +1836,12 @@ define amdgpu_ps i48 @s_fshl_v2i24(i48 inreg %lhs.arg, i48 inreg %rhs.arg, i48 i
 ; GFX6-NEXT:    s_and_b32 s6, s6, 0xff
 ; GFX6-NEXT:    s_or_b32 s2, s9, s2
 ; GFX6-NEXT:    s_and_b32 s6, 0xffff, s6
-; GFX6-NEXT:    v_mov_b32_e32 v2, 0xffffffe8
+; GFX6-NEXT:    v_mov_b32_e32 v1, 0xffffffe8
 ; GFX6-NEXT:    s_lshr_b32 s8, s3, 8
 ; GFX6-NEXT:    s_and_b32 s2, 0xffff, s2
 ; GFX6-NEXT:    s_lshl_b32 s6, s6, 16
 ; GFX6-NEXT:    s_and_b32 s3, s3, 0xff
-; GFX6-NEXT:    v_mul_lo_u32 v3, v1, v2
+; GFX6-NEXT:    v_mul_lo_u32 v1, v0, v1
 ; GFX6-NEXT:    s_or_b32 s2, s2, s6
 ; GFX6-NEXT:    s_lshl_b32 s3, s3, 8
 ; GFX6-NEXT:    s_and_b32 s6, s8, 0xff
@@ -1857,7 +1854,7 @@ define amdgpu_ps i48 @s_fshl_v2i24(i48 inreg %lhs.arg, i48 inreg %rhs.arg, i48 i
 ; GFX6-NEXT:    s_lshr_b32 s7, s4, 24
 ; GFX6-NEXT:    s_and_b32 s9, s4, 0xff
 ; GFX6-NEXT:    s_bfe_u32 s4, s4, 0x80008
-; GFX6-NEXT:    v_mul_hi_u32 v3, v1, v3
+; GFX6-NEXT:    v_mul_hi_u32 v1, v0, v1
 ; GFX6-NEXT:    s_lshl_b32 s4, s4, 8
 ; GFX6-NEXT:    s_and_b32 s6, s6, 0xff
 ; GFX6-NEXT:    s_or_b32 s4, s9, s4
@@ -1865,37 +1862,32 @@ define amdgpu_ps i48 @s_fshl_v2i24(i48 inreg %lhs.arg, i48 inreg %rhs.arg, i48 i
 ; GFX6-NEXT:    s_and_b32 s4, 0xffff, s4
 ; GFX6-NEXT:    s_lshl_b32 s6, s6, 16
 ; GFX6-NEXT:    s_or_b32 s4, s4, s6
-; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v1, v3
-; GFX6-NEXT:    v_mul_hi_u32 v1, s4, v1
-; GFX6-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
-; GFX6-NEXT:    v_cvt_u32_f32_e32 v0, v0
+; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
+; GFX6-NEXT:    v_mul_hi_u32 v1, s4, v0
 ; GFX6-NEXT:    s_lshr_b32 s8, s5, 8
-; GFX6-NEXT:    v_mul_lo_u32 v1, v1, 24
 ; GFX6-NEXT:    s_and_b32 s5, s5, 0xff
-; GFX6-NEXT:    v_mul_lo_u32 v2, v0, v2
 ; GFX6-NEXT:    s_lshl_b32 s5, s5, 8
-; GFX6-NEXT:    v_sub_i32_e32 v1, vcc, s4, v1
-; GFX6-NEXT:    v_subrev_i32_e32 v3, vcc, 24, v1
-; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, 24, v1
-; GFX6-NEXT:    v_mul_hi_u32 v2, v0, v2
+; GFX6-NEXT:    v_mul_lo_u32 v1, v1, 24
 ; GFX6-NEXT:    s_and_b32 s6, s8, 0xff
-; GFX6-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
 ; GFX6-NEXT:    s_or_b32 s5, s7, s5
 ; GFX6-NEXT:    s_and_b32 s6, 0xffff, s6
-; GFX6-NEXT:    v_subrev_i32_e32 v3, vcc, 24, v1
 ; GFX6-NEXT:    s_and_b32 s5, 0xffff, s5
 ; GFX6-NEXT:    s_lshl_b32 s6, s6, 16
-; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, 24, v1
 ; GFX6-NEXT:    s_or_b32 s5, s5, s6
-; GFX6-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
-; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
+; GFX6-NEXT:    v_sub_i32_e32 v1, vcc, s4, v1
+; GFX6-NEXT:    v_subrev_i32_e32 v2, vcc, 24, v1
 ; GFX6-NEXT:    v_mul_hi_u32 v0, s5, v0
-; GFX6-NEXT:    v_sub_i32_e32 v3, vcc, 23, v1
-; GFX6-NEXT:    v_and_b32_e32 v1, 0xffffff, v1
+; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, 24, v1
+; GFX6-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
+; GFX6-NEXT:    v_subrev_i32_e32 v2, vcc, 24, v1
+; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, 24, v1
 ; GFX6-NEXT:    v_mul_lo_u32 v0, v0, 24
+; GFX6-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
+; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, 23, v1
+; GFX6-NEXT:    v_and_b32_e32 v1, 0xffffff, v1
 ; GFX6-NEXT:    v_lshl_b32_e32 v1, s0, v1
 ; GFX6-NEXT:    s_lshr_b32 s0, s2, 1
-; GFX6-NEXT:    v_and_b32_e32 v2, 0xffffff, v3
+; GFX6-NEXT:    v_and_b32_e32 v2, 0xffffff, v2
 ; GFX6-NEXT:    v_lshr_b32_e32 v2, s0, v2
 ; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, s5, v0
 ; GFX6-NEXT:    v_or_b32_e32 v1, v1, v2
@@ -1961,10 +1953,10 @@ define amdgpu_ps i48 @s_fshl_v2i24(i48 inreg %lhs.arg, i48 inreg %rhs.arg, i48 i
 ; GFX8-NEXT:    s_lshr_b32 s8, s2, 24
 ; GFX8-NEXT:    s_and_b32 s2, s2, 0xff
 ; GFX8-NEXT:    s_lshl_b32 s6, s6, 8
-; GFX8-NEXT:    v_mul_f32_e32 v1, 0x4f7ffffe, v0
+; GFX8-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
 ; GFX8-NEXT:    s_or_b32 s2, s2, s6
 ; GFX8-NEXT:    s_and_b32 s6, s7, 0xff
-; GFX8-NEXT:    v_cvt_u32_f32_e32 v1, v1
+; GFX8-NEXT:    v_cvt_u32_f32_e32 v0, v0
 ; GFX8-NEXT:    s_and_b32 s6, 0xffff, s6
 ; GFX8-NEXT:    s_lshr_b32 s9, s3, 8
 ; GFX8-NEXT:    s_and_b32 s2, 0xffff, s2
@@ -1973,10 +1965,10 @@ define amdgpu_ps i48 @s_fshl_v2i24(i48 inreg %lhs.arg, i48 inreg %rhs.arg, i48 i
 ; GFX8-NEXT:    s_or_b32 s2, s2, s6
 ; GFX8-NEXT:    s_lshl_b32 s3, s3, 8
 ; GFX8-NEXT:    s_and_b32 s6, s9, 0xff
-; GFX8-NEXT:    v_mov_b32_e32 v2, 0xffffffe8
+; GFX8-NEXT:    v_mov_b32_e32 v1, 0xffffffe8
 ; GFX8-NEXT:    s_or_b32 s3, s8, s3
 ; GFX8-NEXT:    s_and_b32 s6, 0xffff, s6
-; GFX8-NEXT:    v_mul_lo_u32 v3, v1, v2
+; GFX8-NEXT:    v_mul_lo_u32 v1, v0, v1
 ; GFX8-NEXT:    s_and_b32 s3, 0xffff, s3
 ; GFX8-NEXT:    s_lshl_b32 s6, s6, 16
 ; GFX8-NEXT:    s_or_b32 s3, s3, s6
@@ -1986,44 +1978,39 @@ define amdgpu_ps i48 @s_fshl_v2i24(i48 inreg %lhs.arg, i48 inreg %rhs.arg, i48 i
 ; GFX8-NEXT:    s_lshr_b32 s8, s4, 24
 ; GFX8-NEXT:    s_and_b32 s4, s4, 0xff
 ; GFX8-NEXT:    s_lshl_b32 s6, s6, 8
-; GFX8-NEXT:    v_mul_hi_u32 v3, v1, v3
+; GFX8-NEXT:    v_mul_hi_u32 v1, v0, v1
 ; GFX8-NEXT:    s_or_b32 s4, s4, s6
 ; GFX8-NEXT:    s_and_b32 s6, s7, 0xff
 ; GFX8-NEXT:    s_and_b32 s6, 0xffff, s6
 ; GFX8-NEXT:    s_and_b32 s4, 0xffff, s4
 ; GFX8-NEXT:    s_lshl_b32 s6, s6, 16
 ; GFX8-NEXT:    s_or_b32 s4, s4, s6
-; GFX8-NEXT:    v_add_u32_e32 v1, vcc, v1, v3
-; GFX8-NEXT:    v_mul_hi_u32 v1, s4, v1
-; GFX8-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
-; GFX8-NEXT:    v_cvt_u32_f32_e32 v0, v0
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v0, v1
+; GFX8-NEXT:    v_mul_hi_u32 v1, s4, v0
 ; GFX8-NEXT:    s_lshr_b32 s9, s5, 8
-; GFX8-NEXT:    v_mul_lo_u32 v1, v1, 24
 ; GFX8-NEXT:    s_and_b32 s5, s5, 0xff
-; GFX8-NEXT:    v_mul_lo_u32 v2, v0, v2
 ; GFX8-NEXT:    s_lshl_b32 s5, s5, 8
-; GFX8-NEXT:    v_sub_u32_e32 v1, vcc, s4, v1
-; GFX8-NEXT:    v_subrev_u32_e32 v3, vcc, 24, v1
-; GFX8-NEXT:    v_cmp_le_u32_e32 vcc, 24, v1
-; GFX8-NEXT:    v_mul_hi_u32 v2, v0, v2
+; GFX8-NEXT:    v_mul_lo_u32 v1, v1, 24
 ; GFX8-NEXT:    s_and_b32 s6, s9, 0xff
-; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
 ; GFX8-NEXT:    s_or_b32 s5, s8, s5
 ; GFX8-NEXT:    s_and_b32 s6, 0xffff, s6
-; GFX8-NEXT:    v_subrev_u32_e32 v3, vcc, 24, v1
 ; GFX8-NEXT:    s_and_b32 s5, 0xffff, s5
 ; GFX8-NEXT:    s_lshl_b32 s6, s6, 16
-; GFX8-NEXT:    v_cmp_le_u32_e32 vcc, 24, v1
 ; GFX8-NEXT:    s_or_b32 s5, s5, s6
-; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
-; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
+; GFX8-NEXT:    v_sub_u32_e32 v1, vcc, s4, v1
+; GFX8-NEXT:    v_subrev_u32_e32 v2, vcc, 24, v1
 ; GFX8-NEXT:    v_mul_hi_u32 v0, s5, v0
-; GFX8-NEXT:    v_sub_u32_e32 v3, vcc, 23, v1
-; GFX8-NEXT:    v_and_b32_e32 v1, 0xffffff, v1
+; GFX8-NEXT:    v_cmp_le_u32_e32 vcc, 24, v1
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
+; GFX8-NEXT:    v_subrev_u32_e32 v2, vcc, 24, v1
+; GFX8-NEXT:    v_cmp_le_u32_e32 vcc, 24, v1
 ; GFX8-NEXT:    v_mul_lo_u32 v0, v0, 24
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
+; GFX8-NEXT:    v_sub_u32_e32 v2, vcc, 23, v1
+; GFX8-NEXT:    v_and_b32_e32 v1, 0xffffff, v1
 ; GFX8-NEXT:    v_lshlrev_b32_e64 v1, v1, s0
 ; GFX8-NEXT:    s_lshr_b32 s0, s2, 1
-; GFX8-NEXT:    v_and_b32_e32 v2, 0xffffff, v3
+; GFX8-NEXT:    v_and_b32_e32 v2, 0xffffff, v2
 ; GFX8-NEXT:    v_lshrrev_b32_e64 v2, v2, s0
 ; GFX8-NEXT:    v_sub_u32_e32 v0, vcc, s5, v0
 ; GFX8-NEXT:    v_or_b32_e32 v1, v1, v2
@@ -2057,93 +2044,88 @@ define amdgpu_ps i48 @s_fshl_v2i24(i48 inreg %lhs.arg, i48 inreg %rhs.arg, i48 i
 ;
 ; GFX9-LABEL: s_fshl_v2i24:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_lshr_b32 s7, s0, 8
-; GFX9-NEXT:    s_and_b32 s7, s7, 0xff
-; GFX9-NEXT:    s_lshr_b32 s9, s0, 16
-; GFX9-NEXT:    s_lshr_b32 s10, s0, 24
+; GFX9-NEXT:    s_lshr_b32 s6, s0, 8
+; GFX9-NEXT:    s_and_b32 s6, s6, 0xff
+; GFX9-NEXT:    s_lshr_b32 s7, s0, 16
+; GFX9-NEXT:    s_lshr_b32 s8, s0, 24
 ; GFX9-NEXT:    s_and_b32 s0, s0, 0xff
-; GFX9-NEXT:    s_lshl_b32 s7, s7, 8
-; GFX9-NEXT:    s_or_b32 s0, s0, s7
-; GFX9-NEXT:    s_and_b32 s7, s9, 0xff
-; GFX9-NEXT:    s_and_b32 s7, 0xffff, s7
-; GFX9-NEXT:    s_lshr_b32 s11, s1, 8
+; GFX9-NEXT:    s_lshl_b32 s6, s6, 8
+; GFX9-NEXT:    s_or_b32 s0, s0, s6
+; GFX9-NEXT:    s_and_b32 s6, s7, 0xff
+; GFX9-NEXT:    s_and_b32 s6, 0xffff, s6
+; GFX9-NEXT:    s_lshr_b32 s9, s1, 8
 ; GFX9-NEXT:    s_and_b32 s0, 0xffff, s0
-; GFX9-NEXT:    s_lshl_b32 s7, s7, 16
+; GFX9-NEXT:    s_lshl_b32 s6, s6, 16
 ; GFX9-NEXT:    s_and_b32 s1, s1, 0xff
-; GFX9-NEXT:    s_or_b32 s0, s0, s7
+; GFX9-NEXT:    s_or_b32 s0, s0, s6
 ; GFX9-NEXT:    s_lshl_b32 s1, s1, 8
-; GFX9-NEXT:    s_and_b32 s7, s11, 0xff
+; GFX9-NEXT:    s_and_b32 s6, s9, 0xff
 ; GFX9-NEXT:    v_cvt_f32_ubyte0_e32 v0, 24
-; GFX9-NEXT:    s_or_b32 s1, s10, s1
-; GFX9-NEXT:    s_and_b32 s7, 0xffff, s7
+; GFX9-NEXT:    s_or_b32 s1, s8, s1
+; GFX9-NEXT:    s_and_b32 s6, 0xffff, s6
 ; GFX9-NEXT:    v_rcp_iflag_f32_e32 v0, v0
 ; GFX9-NEXT:    s_and_b32 s1, 0xffff, s1
-; GFX9-NEXT:    s_lshl_b32 s7, s7, 16
-; GFX9-NEXT:    s_or_b32 s1, s1, s7
-; GFX9-NEXT:    s_lshr_b32 s7, s2, 8
-; GFX9-NEXT:    s_and_b32 s7, s7, 0xff
-; GFX9-NEXT:    s_lshr_b32 s9, s2, 16
-; GFX9-NEXT:    s_lshr_b32 s10, s2, 24
+; GFX9-NEXT:    s_lshl_b32 s6, s6, 16
+; GFX9-NEXT:    s_or_b32 s1, s1, s6
+; GFX9-NEXT:    s_lshr_b32 s6, s2, 8
+; GFX9-NEXT:    s_and_b32 s6, s6, 0xff
+; GFX9-NEXT:    s_lshr_b32 s7, s2, 16
+; GFX9-NEXT:    s_lshr_b32 s8, s2, 24
 ; GFX9-NEXT:    s_and_b32 s2, s2, 0xff
-; GFX9-NEXT:    s_lshl_b32 s7, s7, 8
-; GFX9-NEXT:    v_mul_f32_e32 v1, 0x4f7ffffe, v0
-; GFX9-NEXT:    s_or_b32 s2, s2, s7
-; GFX9-NEXT:    s_and_b32 s7, s9, 0xff
-; GFX9-NEXT:    v_cvt_u32_f32_e32 v1, v1
-; GFX9-NEXT:    s_and_b32 s7, 0xffff, s7
-; GFX9-NEXT:    s_lshr_b32 s11, s3, 8
+; GFX9-NEXT:    s_lshl_b32 s6, s6, 8
+; GFX9-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
+; GFX9-NEXT:    s_or_b32 s2, s2, s6
+; GFX9-NEXT:    s_and_b32 s6, s7, 0xff
+; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
+; GFX9-NEXT:    s_and_b32 s6, 0xffff, s6
+; GFX9-NEXT:    s_lshr_b32 s9, s3, 8
 ; GFX9-NEXT:    s_and_b32 s2, 0xffff, s2
-; GFX9-NEXT:    s_lshl_b32 s7, s7, 16
+; GFX9-NEXT:    s_lshl_b32 s6, s6, 16
 ; GFX9-NEXT:    s_and_b32 s3, s3, 0xff
-; GFX9-NEXT:    s_or_b32 s2, s2, s7
+; GFX9-NEXT:    s_or_b32 s2, s2, s6
 ; GFX9-NEXT:    s_lshl_b32 s3, s3, 8
-; GFX9-NEXT:    s_and_b32 s7, s11, 0xff
-; GFX9-NEXT:    v_mov_b32_e32 v2, 0xffffffe8
-; GFX9-NEXT:    s_or_b32 s3, s10, s3
-; GFX9-NEXT:    s_and_b32 s7, 0xffff, s7
-; GFX9-NEXT:    v_mul_lo_u32 v3, v1, v2
+; GFX9-NEXT:    s_and_b32 s6, s9, 0xff
+; GFX9-NEXT:    v_mov_b32_e32 v1, 0xffffffe8
+; GFX9-NEXT:    s_or_b32 s3, s8, s3
+; GFX9-NEXT:    s_and_b32 s6, 0xffff, s6
+; GFX9-NEXT:    v_mul_lo_u32 v1, v0, v1
 ; GFX9-NEXT:    s_and_b32 s3, 0xffff, s3
-; GFX9-NEXT:    s_lshl_b32 s7, s7, 16
-; GFX9-NEXT:    s_or_b32 s3, s3, s7
-; GFX9-NEXT:    s_lshr_b32 s7, s4, 8
-; GFX9-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
-; GFX9-NEXT:    s_and_b32 s7, s7, 0xff
-; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
-; GFX9-NEXT:    s_lshr_b32 s9, s4, 16
-; GFX9-NEXT:    s_lshr_b32 s10, s4, 24
+; GFX9-NEXT:    s_lshl_b32 s6, s6, 16
+; GFX9-NEXT:    s_or_b32 s3, s3, s6
+; GFX9-NEXT:    s_lshr_b32 s6, s4, 8
+; GFX9-NEXT:    s_and_b32 s6, s6, 0xff
+; GFX9-NEXT:    s_lshr_b32 s7, s4, 16
+; GFX9-NEXT:    s_lshr_b32 s8, s4, 24
 ; GFX9-NEXT:    s_and_b32 s4, s4, 0xff
-; GFX9-NEXT:    s_lshl_b32 s7, s7, 8
-; GFX9-NEXT:    v_mul_hi_u32 v3, v1, v3
-; GFX9-NEXT:    s_or_b32 s4, s4, s7
-; GFX9-NEXT:    s_and_b32 s7, s9, 0xff
-; GFX9-NEXT:    s_and_b32 s7, 0xffff, s7
+; GFX9-NEXT:    s_lshl_b32 s6, s6, 8
+; GFX9-NEXT:    v_mul_hi_u32 v1, v0, v1
+; GFX9-NEXT:    s_or_b32 s4, s4, s6
+; GFX9-NEXT:    s_and_b32 s6, s7, 0xff
+; GFX9-NEXT:    s_and_b32 s6, 0xffff, s6
 ; GFX9-NEXT:    s_and_b32 s4, 0xffff, s4
-; GFX9-NEXT:    s_lshl_b32 s7, s7, 16
-; GFX9-NEXT:    v_mul_lo_u32 v2, v0, v2
-; GFX9-NEXT:    s_or_b32 s4, s4, s7
-; GFX9-NEXT:    v_add_u32_e32 v1, v1, v3
-; GFX9-NEXT:    v_mul_hi_u32 v1, s4, v1
-; GFX9-NEXT:    s_lshr_b32 s11, s5, 8
+; GFX9-NEXT:    s_lshl_b32 s6, s6, 16
+; GFX9-NEXT:    s_or_b32 s4, s4, s6
+; GFX9-NEXT:    v_add_u32_e32 v0, v0, v1
+; GFX9-NEXT:    v_mul_hi_u32 v1, s4, v0
+; GFX9-NEXT:    s_lshr_b32 s9, s5, 8
 ; GFX9-NEXT:    s_and_b32 s5, s5, 0xff
-; GFX9-NEXT:    v_mul_hi_u32 v2, v0, v2
 ; GFX9-NEXT:    s_lshl_b32 s5, s5, 8
-; GFX9-NEXT:    s_and_b32 s7, s11, 0xff
-; GFX9-NEXT:    s_or_b32 s5, s10, s5
-; GFX9-NEXT:    s_and_b32 s7, 0xffff, s7
+; GFX9-NEXT:    s_and_b32 s6, s9, 0xff
+; GFX9-NEXT:    s_or_b32 s5, s8, s5
 ; GFX9-NEXT:    v_mul_lo_u32 v1, v1, 24
+; GFX9-NEXT:    s_and_b32 s6, 0xffff, s6
 ; GFX9-NEXT:    s_and_b32 s5, 0xffff, s5
-; GFX9-NEXT:    s_lshl_b32 s7, s7, 16
-; GFX9-NEXT:    s_or_b32 s5, s5, s7
-; GFX9-NEXT:    v_add_u32_e32 v0, v0, v2
+; GFX9-NEXT:    s_lshl_b32 s6, s6, 16
+; GFX9-NEXT:    s_or_b32 s5, s5, s6
 ; GFX9-NEXT:    v_mul_hi_u32 v0, s5, v0
 ; GFX9-NEXT:    v_sub_u32_e32 v1, s4, v1
-; GFX9-NEXT:    v_subrev_u32_e32 v3, 24, v1
+; GFX9-NEXT:    v_subrev_u32_e32 v2, 24, v1
 ; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, 24, v1
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
-; GFX9-NEXT:    v_subrev_u32_e32 v3, 24, v1
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
+; GFX9-NEXT:    v_subrev_u32_e32 v2, 24, v1
 ; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, 24, v1
 ; GFX9-NEXT:    v_mul_lo_u32 v0, v0, 24
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
 ; GFX9-NEXT:    v_sub_u32_e32 v2, 23, v1
 ; GFX9-NEXT:    s_lshr_b32 s2, s2, 1
 ; GFX9-NEXT:    v_and_b32_e32 v2, 0xffffff, v2
@@ -2162,14 +2144,14 @@ define amdgpu_ps i48 @s_fshl_v2i24(i48 inreg %lhs.arg, i48 inreg %rhs.arg, i48 i
 ; GFX9-NEXT:    v_and_b32_e32 v2, 0xffffff, v2
 ; GFX9-NEXT:    v_and_b32_e32 v0, 0xffffff, v0
 ; GFX9-NEXT:    v_lshrrev_b32_e64 v2, v2, s0
-; GFX9-NEXT:    s_mov_b32 s6, 8
+; GFX9-NEXT:    v_mov_b32_e32 v3, 8
 ; GFX9-NEXT:    v_lshl_or_b32 v0, s1, v0, v2
-; GFX9-NEXT:    s_mov_b32 s8, 16
-; GFX9-NEXT:    s_movk_i32 s0, 0xff
-; GFX9-NEXT:    v_lshlrev_b32_sdwa v2, s6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
+; GFX9-NEXT:    v_mov_b32_e32 v2, 0xff
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v3, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
+; GFX9-NEXT:    v_and_or_b32 v2, v1, v2, v3
+; GFX9-NEXT:    v_mov_b32_e32 v3, 16
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
 ; GFX9-NEXT:    v_and_b32_e32 v3, 0xff, v0
-; GFX9-NEXT:    v_and_or_b32 v2, v1, s0, v2
-; GFX9-NEXT:    v_lshlrev_b32_sdwa v1, s8, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v3, 24, v3
 ; GFX9-NEXT:    v_or3_b32 v1, v2, v1, v3
 ; GFX9-NEXT:    v_bfe_u32 v2, v0, 8, 8
@@ -2189,60 +2171,59 @@ define amdgpu_ps i48 @s_fshl_v2i24(i48 inreg %lhs.arg, i48 inreg %rhs.arg, i48 i
 ; GFX10-NEXT:    v_rcp_iflag_f32_e32 v0, v0
 ; GFX10-NEXT:    s_and_b32 s0, s0, 0xff
 ; GFX10-NEXT:    s_lshl_b32 s6, s6, 8
-; GFX10-NEXT:    s_and_b32 s7, s7, 0xff
+; GFX10-NEXT:    s_lshr_b32 s10, s4, 16
 ; GFX10-NEXT:    s_or_b32 s0, s0, s6
-; GFX10-NEXT:    s_and_b32 s6, 0xffff, s7
+; GFX10-NEXT:    s_and_b32 s6, s7, 0xff
 ; GFX10-NEXT:    s_lshr_b32 s7, s4, 8
-; GFX10-NEXT:    s_lshr_b32 s10, s4, 16
-; GFX10-NEXT:    s_and_b32 s7, s7, 0xff
 ; GFX10-NEXT:    s_lshr_b32 s11, s4, 24
-; GFX10-NEXT:    v_mul_f32_e32 v1, 0x4f7ffffe, v0
-; GFX10-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
+; GFX10-NEXT:    s_and_b32 s7, s7, 0xff
 ; GFX10-NEXT:    s_and_b32 s4, s4, 0xff
+; GFX10-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
 ; GFX10-NEXT:    s_lshl_b32 s7, s7, 8
 ; GFX10-NEXT:    s_lshr_b32 s12, s5, 8
-; GFX10-NEXT:    v_cvt_u32_f32_e32 v1, v1
-; GFX10-NEXT:    v_cvt_u32_f32_e32 v0, v0
 ; GFX10-NEXT:    s_or_b32 s4, s4, s7
 ; GFX10-NEXT:    s_and_b32 s7, s10, 0xff
-; GFX10-NEXT:    s_and_b32 s4, 0xffff, s4
-; GFX10-NEXT:    v_mul_lo_u32 v2, 0xffffffe8, v1
-; GFX10-NEXT:    v_mul_lo_u32 v3, 0xffffffe8, v0
+; GFX10-NEXT:    v_cvt_u32_f32_e32 v0, v0
 ; GFX10-NEXT:    s_and_b32 s7, 0xffff, s7
-; GFX10-NEXT:    s_and_b32 s5, s5, 0xff
+; GFX10-NEXT:    s_and_b32 s4, 0xffff, s4
 ; GFX10-NEXT:    s_lshl_b32 s7, s7, 16
-; GFX10-NEXT:    s_lshl_b32 s5, s5, 8
+; GFX10-NEXT:    s_and_b32 s5, s5, 0xff
+; GFX10-NEXT:    v_mul_lo_u32 v1, 0xffffffe8, v0
 ; GFX10-NEXT:    s_or_b32 s4, s4, s7
+; GFX10-NEXT:    s_lshl_b32 s5, s5, 8
 ; GFX10-NEXT:    s_and_b32 s7, s12, 0xff
-; GFX10-NEXT:    v_mul_hi_u32 v2, v1, v2
-; GFX10-NEXT:    v_mul_hi_u32 v3, v0, v3
 ; GFX10-NEXT:    s_or_b32 s5, s11, s5
 ; GFX10-NEXT:    s_and_b32 s7, 0xffff, s7
 ; GFX10-NEXT:    s_and_b32 s5, 0xffff, s5
 ; GFX10-NEXT:    s_lshl_b32 s7, s7, 16
-; GFX10-NEXT:    s_lshr_b32 s9, s1, 8
+; GFX10-NEXT:    v_mul_hi_u32 v1, v0, v1
 ; GFX10-NEXT:    s_or_b32 s5, s5, s7
-; GFX10-NEXT:    v_add_nc_u32_e32 v1, v1, v2
-; GFX10-NEXT:    v_add_nc_u32_e32 v0, v0, v3
+; GFX10-NEXT:    s_lshr_b32 s9, s1, 8
 ; GFX10-NEXT:    s_and_b32 s1, s1, 0xff
 ; GFX10-NEXT:    s_and_b32 s7, s9, 0xff
 ; GFX10-NEXT:    s_lshl_b32 s1, s1, 8
-; GFX10-NEXT:    v_mul_hi_u32 v1, s4, v1
-; GFX10-NEXT:    v_mul_hi_u32 v0, s5, v0
+; GFX10-NEXT:    s_lshr_b32 s9, s2, 16
 ; GFX10-NEXT:    s_or_b32 s1, s8, s1
+; GFX10-NEXT:    v_add_nc_u32_e32 v0, v0, v1
 ; GFX10-NEXT:    s_lshr_b32 s8, s2, 8
-; GFX10-NEXT:    s_lshr_b32 s9, s2, 16
-; GFX10-NEXT:    s_and_b32 s8, s8, 0xff
 ; GFX10-NEXT:    s_lshr_b32 s10, s2, 24
+; GFX10-NEXT:    s_and_b32 s8, s8, 0xff
 ; GFX10-NEXT:    s_and_b32 s2, s2, 0xff
-; GFX10-NEXT:    v_mul_lo_u32 v1, v1, 24
-; GFX10-NEXT:    v_mul_lo_u32 v0, v0, 24
+; GFX10-NEXT:    v_mul_hi_u32 v1, s4, v0
+; GFX10-NEXT:    v_mul_hi_u32 v0, s5, v0
 ; GFX10-NEXT:    s_lshl_b32 s8, s8, 8
-; GFX10-NEXT:    s_and_b32 s7, 0xffff, s7
+; GFX10-NEXT:    s_and_b32 s6, 0xffff, s6
 ; GFX10-NEXT:    s_or_b32 s2, s2, s8
-; GFX10-NEXT:    s_and_b32 s0, 0xffff, s0
+; GFX10-NEXT:    s_and_b32 s7, 0xffff, s7
 ; GFX10-NEXT:    s_and_b32 s2, 0xffff, s2
+; GFX10-NEXT:    s_and_b32 s0, 0xffff, s0
+; GFX10-NEXT:    v_mul_lo_u32 v1, v1, 24
+; GFX10-NEXT:    v_mul_lo_u32 v0, v0, 24
 ; GFX10-NEXT:    s_lshl_b32 s6, s6, 16
+; GFX10-NEXT:    s_and_b32 s1, 0xffff, s1
+; GFX10-NEXT:    s_lshl_b32 s7, s7, 16
+; GFX10-NEXT:    s_or_b32 s0, s0, s6
+; GFX10-NEXT:    s_or_b32 s1, s1, s7
 ; GFX10-NEXT:    v_sub_nc_u32_e32 v1, s4, v1
 ; GFX10-NEXT:    v_sub_nc_u32_e32 v0, s5, v0
 ; GFX10-NEXT:    s_lshr_b32 s4, s3, 8
@@ -2270,35 +2251,31 @@ define amdgpu_ps i48 @s_fshl_v2i24(i48 inreg %lhs.arg, i48 inreg %rhs.arg, i48 i
 ; GFX10-NEXT:    v_cmp_le_u32_e32 vcc_lo, 24, v0
 ; GFX10-NEXT:    s_or_b32 s3, s3, s4
 ; GFX10-NEXT:    s_lshr_b32 s2, s2, 1
-; GFX10-NEXT:    s_and_b32 s1, 0xffff, s1
 ; GFX10-NEXT:    v_sub_nc_u32_e32 v2, 23, v1
 ; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc_lo
 ; GFX10-NEXT:    v_and_b32_e32 v1, 0xffffff, v1
-; GFX10-NEXT:    s_lshl_b32 s7, s7, 16
-; GFX10-NEXT:    s_or_b32 s0, s0, s6
 ; GFX10-NEXT:    v_and_b32_e32 v2, 0xffffff, v2
 ; GFX10-NEXT:    v_sub_nc_u32_e32 v3, 23, v0
 ; GFX10-NEXT:    v_and_b32_e32 v0, 0xffffff, v0
-; GFX10-NEXT:    s_or_b32 s1, s1, s7
 ; GFX10-NEXT:    v_lshrrev_b32_e64 v2, v2, s2
 ; GFX10-NEXT:    v_and_b32_e32 v3, 0xffffff, v3
 ; GFX10-NEXT:    s_lshr_b32 s2, s3, 1
 ; GFX10-NEXT:    v_lshl_or_b32 v1, s0, v1, v2
 ; GFX10-NEXT:    v_lshrrev_b32_e64 v3, v3, s2
-; GFX10-NEXT:    s_mov_b32 s0, 8
-; GFX10-NEXT:    v_lshlrev_b32_sdwa v2, s0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
+; GFX10-NEXT:    v_mov_b32_e32 v2, 8
 ; GFX10-NEXT:    v_lshl_or_b32 v0, s1, v0, v3
-; GFX10-NEXT:    s_mov_b32 s0, 16
-; GFX10-NEXT:    v_and_or_b32 v2, v1, 0xff, v2
-; GFX10-NEXT:    v_and_b32_e32 v3, 0xff, v0
-; GFX10-NEXT:    v_lshlrev_b32_sdwa v1, s0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+; GFX10-NEXT:    v_mov_b32_e32 v3, 16
+; GFX10-NEXT:    v_lshlrev_b32_sdwa v2, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
+; GFX10-NEXT:    v_and_b32_e32 v4, 0xff, v0
+; GFX10-NEXT:    v_lshlrev_b32_sdwa v3, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+; GFX10-NEXT:    v_and_or_b32 v1, 0xff, v1, v2
+; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 24, v4
 ; GFX10-NEXT:    v_bfe_u32 v4, v0, 8, 8
 ; GFX10-NEXT:    v_bfe_u32 v0, v0, 16, 8
-; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 24, v3
+; GFX10-NEXT:    v_or3_b32 v1, v1, v3, v2
 ; GFX10-NEXT:    v_lshl_or_b32 v0, v0, 8, v4
-; GFX10-NEXT:    v_or3_b32 v1, v2, v1, v3
-; GFX10-NEXT:    v_readfirstlane_b32 s1, v0
 ; GFX10-NEXT:    v_readfirstlane_b32 s0, v1
+; GFX10-NEXT:    v_readfirstlane_b32 s1, v0
 ; GFX10-NEXT:    ; return to shader part epilog
 ;
 ; GFX11-LABEL: s_fshl_v2i24:
@@ -2316,10 +2293,9 @@ define amdgpu_ps i48 @s_fshl_v2i24(i48 inreg %lhs.arg, i48 inreg %rhs.arg, i48 i
 ; GFX11-NEXT:    s_and_b32 s6, 0xffff, s7
 ; GFX11-NEXT:    s_and_b32 s0, 0xffff, s0
 ; GFX11-NEXT:    s_lshl_b32 s6, s6, 16
-; GFX11-NEXT:    s_lshr_b32 s9, s4, 16
+; GFX11-NEXT:    s_lshr_b32 s7, s4, 16
 ; GFX11-NEXT:    s_or_b32 s0, s0, s6
 ; GFX11-NEXT:    s_waitcnt_depctr 0xfff
-; GFX11-NEXT:    v_mul_f32_e32 v1, 0x4f7ffffe, v0
 ; GFX11-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
 ; GFX11-NEXT:    s_lshr_b32 s6, s4, 8
 ; GFX11-NEXT:    s_lshr_b32 s10, s4, 24
@@ -2329,8 +2305,8 @@ define amdgpu_ps i48 @s_fshl_v2i24(i48 inreg %lhs.arg, i48 inreg %rhs.arg, i48 i
 ; GFX11-NEXT:    s_lshl_b32 s6, s6, 8
 ; GFX11-NEXT:    s_lshr_b32 s11, s5, 8
 ; GFX11-NEXT:    s_or_b32 s4, s4, s6
-; GFX11-NEXT:    s_and_b32 s6, s9, 0xff
-; GFX11-NEXT:    v_mul_lo_u32 v3, 0xffffffe8, v0
+; GFX11-NEXT:    s_and_b32 s6, s7, 0xff
+; GFX11-NEXT:    v_mul_lo_u32 v1, 0xffffffe8, v0
 ; GFX11-NEXT:    s_and_b32 s6, 0xffff, s6
 ; GFX11-NEXT:    s_and_b32 s4, 0xffff, s4
 ; GFX11-NEXT:    s_lshl_b32 s6, s6, 16
@@ -2338,107 +2314,97 @@ define amdgpu_ps i48 @s_fshl_v2i24(i48 inreg %lhs.arg, i48 inreg %rhs.arg, i48 i
 ; GFX11-NEXT:    s_or_b32 s4, s4, s6
 ; GFX11-NEXT:    s_lshl_b32 s5, s5, 8
 ; GFX11-NEXT:    s_and_b32 s6, s11, 0xff
-; GFX11-NEXT:    v_mul_hi_u32 v3, v0, v3
+; GFX11-NEXT:    v_mul_hi_u32 v1, v0, v1
 ; GFX11-NEXT:    s_or_b32 s5, s10, s5
 ; GFX11-NEXT:    s_and_b32 s6, 0xffff, s6
 ; GFX11-NEXT:    s_and_b32 s5, 0xffff, s5
 ; GFX11-NEXT:    s_lshl_b32 s6, s6, 16
-; GFX11-NEXT:    s_lshr_b32 s7, s1, 8
+; GFX11-NEXT:    s_lshr_b32 s9, s1, 8
 ; GFX11-NEXT:    s_or_b32 s5, s5, s6
 ; GFX11-NEXT:    s_and_b32 s1, s1, 0xff
-; GFX11-NEXT:    v_add_nc_u32_e32 v0, v0, v3
-; GFX11-NEXT:    v_cvt_u32_f32_e32 v1, v1
+; GFX11-NEXT:    v_add_nc_u32_e32 v0, v0, v1
 ; GFX11-NEXT:    s_lshl_b32 s1, s1, 8
-; GFX11-NEXT:    s_and_b32 s6, s7, 0xff
 ; GFX11-NEXT:    s_lshr_b32 s7, s2, 8
-; GFX11-NEXT:    v_mul_hi_u32 v0, s5, v0
-; GFX11-NEXT:    v_mul_lo_u32 v2, 0xffffffe8, v1
 ; GFX11-NEXT:    s_or_b32 s1, s8, s1
 ; GFX11-NEXT:    s_lshr_b32 s8, s2, 16
+; GFX11-NEXT:    v_mul_hi_u32 v1, s4, v0
+; GFX11-NEXT:    v_mul_hi_u32 v0, s5, v0
 ; GFX11-NEXT:    s_and_b32 s7, s7, 0xff
+; GFX11-NEXT:    s_and_b32 s6, s9, 0xff
 ; GFX11-NEXT:    s_lshr_b32 s9, s2, 24
 ; GFX11-NEXT:    s_and_b32 s2, s2, 0xff
 ; GFX11-NEXT:    s_lshl_b32 s7, s7, 8
+; GFX11-NEXT:    s_and_b32 s6, 0xffff, s6
+; GFX11-NEXT:    v_mul_lo_u32 v1, v1, 24
 ; GFX11-NEXT:    v_mul_lo_u32 v0, v0, 24
-; GFX11-NEXT:    v_mul_hi_u32 v2, v1, v2
 ; GFX11-NEXT:    s_or_b32 s2, s2, s7
-; GFX11-NEXT:    s_and_b32 s6, 0xffff, s6
-; GFX11-NEXT:    s_and_b32 s2, 0xffff, s2
 ; GFX11-NEXT:    s_and_b32 s1, 0xffff, s1
+; GFX11-NEXT:    s_and_b32 s2, 0xffff, s2
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_sub_nc_u32_e32 v1, s4, v1
 ; GFX11-NEXT:    v_sub_nc_u32_e32 v0, s5, v0
-; GFX11-NEXT:    v_add_nc_u32_e32 v1, v1, v2
 ; GFX11-NEXT:    s_and_b32 s5, s8, 0xff
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    s_lshr_b32 s4, s3, 8
 ; GFX11-NEXT:    s_and_b32 s5, 0xffff, s5
+; GFX11-NEXT:    v_subrev_nc_u32_e32 v2, 24, v1
+; GFX11-NEXT:    v_cmp_le_u32_e32 vcc_lo, 24, v1
 ; GFX11-NEXT:    v_subrev_nc_u32_e32 v3, 24, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT:    v_mul_hi_u32 v1, s4, v1
-; GFX11-NEXT:    s_lshl_b32 s5, s5, 16
-; GFX11-NEXT:    s_or_b32 s2, s2, s5
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    s_lshr_b32 s2, s2, 1
-; GFX11-NEXT:    v_mul_lo_u32 v1, v1, 24
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_sub_nc_u32_e32 v1, s4, v1
-; GFX11-NEXT:    s_lshr_b32 s4, s3, 8
 ; GFX11-NEXT:    s_and_b32 s3, s3, 0xff
-; GFX11-NEXT:    s_and_b32 s4, s4, 0xff
+; GFX11-NEXT:    s_lshl_b32 s5, s5, 16
 ; GFX11-NEXT:    s_lshl_b32 s3, s3, 8
+; GFX11-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc_lo
+; GFX11-NEXT:    v_cmp_le_u32_e32 vcc_lo, 24, v0
+; GFX11-NEXT:    s_and_b32 s4, s4, 0xff
+; GFX11-NEXT:    s_or_b32 s2, s2, s5
+; GFX11-NEXT:    s_or_b32 s3, s9, s3
 ; GFX11-NEXT:    v_subrev_nc_u32_e32 v2, 24, v1
+; GFX11-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc_lo
 ; GFX11-NEXT:    v_cmp_le_u32_e32 vcc_lo, 24, v1
-; GFX11-NEXT:    s_or_b32 s3, s9, s3
 ; GFX11-NEXT:    s_and_b32 s4, 0xffff, s4
+; GFX11-NEXT:    s_lshr_b32 s2, s2, 1
 ; GFX11-NEXT:    s_and_b32 s3, 0xffff, s3
 ; GFX11-NEXT:    s_lshl_b32 s4, s4, 16
 ; GFX11-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc_lo
-; GFX11-NEXT:    v_cmp_le_u32_e32 vcc_lo, 24, v0
 ; GFX11-NEXT:    s_or_b32 s3, s3, s4
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    s_lshr_b32 s3, s3, 1
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc_lo
-; GFX11-NEXT:    v_subrev_nc_u32_e32 v2, 24, v1
-; GFX11-NEXT:    v_cmp_le_u32_e32 vcc_lo, 24, v1
-; GFX11-NEXT:    v_subrev_nc_u32_e32 v3, 24, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc_lo
-; GFX11-NEXT:    v_cmp_le_u32_e32 vcc_lo, 24, v0
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc_lo
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
 ; GFX11-NEXT:    v_sub_nc_u32_e32 v2, 23, v1
 ; GFX11-NEXT:    v_and_b32_e32 v1, 0xffffff, v1
-; GFX11-NEXT:    v_sub_nc_u32_e32 v3, 23, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_subrev_nc_u32_e32 v3, 24, v0
+; GFX11-NEXT:    v_cmp_le_u32_e32 vcc_lo, 24, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
 ; GFX11-NEXT:    v_and_b32_e32 v2, 0xffffff, v2
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xffffff, v0
-; GFX11-NEXT:    v_and_b32_e32 v3, 0xffffff, v3
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc_lo
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
 ; GFX11-NEXT:    v_lshrrev_b32_e64 v2, v2, s2
 ; GFX11-NEXT:    s_lshl_b32 s2, s6, 16
-; GFX11-NEXT:    v_lshrrev_b32_e64 v3, v3, s3
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NEXT:    v_sub_nc_u32_e32 v3, 23, v0
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xffffff, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
 ; GFX11-NEXT:    v_lshl_or_b32 v1, s0, v1, v2
 ; GFX11-NEXT:    s_or_b32 s0, s1, s2
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT:    v_lshl_or_b32 v0, s0, v0, v3
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_and_b32_e32 v3, 0xffffff, v3
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX11-NEXT:    v_bfe_u32 v2, v1, 8, 8
+; GFX11-NEXT:    v_lshrrev_b32_e64 v3, v3, s3
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
+; GFX11-NEXT:    v_lshl_or_b32 v0, s0, v0, v3
 ; GFX11-NEXT:    v_bfe_u32 v3, v1, 16, 8
-; GFX11-NEXT:    v_and_b32_e32 v4, 0xff, v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
+; GFX11-NEXT:    v_and_or_b32 v1, 0xff, v1, v2
+; GFX11-NEXT:    v_and_b32_e32 v4, 0xff, v0
+; GFX11-NEXT:    v_bfe_u32 v5, v0, 8, 8
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_lshlrev_b32_e32 v4, 24, v4
-; GFX11-NEXT:    v_and_or_b32 v1, v1, 0xff, v2
-; GFX11-NEXT:    v_bfe_u32 v2, v0, 8, 8
 ; GFX11-NEXT:    v_bfe_u32 v0, v0, 16, 8
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v4, 24, v4
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_lshl_or_b32 v0, v0, 8, v5
 ; GFX11-NEXT:    v_or3_b32 v1, v1, v3, v4
-; GFX11-NEXT:    v_lshl_or_b32 v0, v0, 8, v2
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_readfirstlane_b32 s0, v1
 ; GFX11-NEXT:    v_readfirstlane_b32 s1, v0
+; GFX11-NEXT:    v_readfirstlane_b32 s0, v1
 ; GFX11-NEXT:    ; return to shader part epilog
   %lhs = bitcast i48 %lhs.arg to <2 x i24>
   %rhs = bitcast i48 %rhs.arg to <2 x i24>
@@ -2454,21 +2420,19 @@ define <2 x i24> @v_fshl_v2i24(<2 x i24> %lhs, <2 x i24> %rhs, <2 x i24> %amt) {
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_cvt_f32_ubyte0_e32 v6, 24
 ; GFX6-NEXT:    v_rcp_iflag_f32_e32 v6, v6
-; GFX6-NEXT:    v_mov_b32_e32 v8, 0xffffffe8
+; GFX6-NEXT:    v_mov_b32_e32 v7, 0xffffffe8
 ; GFX6-NEXT:    v_and_b32_e32 v4, 0xffffff, v4
 ; GFX6-NEXT:    v_and_b32_e32 v5, 0xffffff, v5
-; GFX6-NEXT:    v_mul_f32_e32 v7, 0x4f7ffffe, v6
-; GFX6-NEXT:    v_cvt_u32_f32_e32 v7, v7
 ; GFX6-NEXT:    v_mul_f32_e32 v6, 0x4f7ffffe, v6
 ; GFX6-NEXT:    v_cvt_u32_f32_e32 v6, v6
 ; GFX6-NEXT:    v_bfe_u32 v2, v2, 1, 23
-; GFX6-NEXT:    v_mul_lo_u32 v9, v7, v8
-; GFX6-NEXT:    v_mul_lo_u32 v8, v6, v8
-; GFX6-NEXT:    v_mul_hi_u32 v9, v7, v9
-; GFX6-NEXT:    v_mul_hi_u32 v8, v6, v8
-; GFX6-NEXT:    v_add_i32_e32 v7, vcc, v7, v9
-; GFX6-NEXT:    v_mul_hi_u32 v7, v4, v7
+; GFX6-NEXT:    v_mul_lo_u32 v7, v6, v7
+; GFX6-NEXT:    v_mul_hi_u32 v7, v6, v7
+; GFX6-NEXT:    v_add_i32_e32 v6, vcc, v6, v7
+; GFX6-NEXT:    v_mul_hi_u32 v7, v4, v6
+; GFX6-NEXT:    v_mul_hi_u32 v6, v5, v6
 ; GFX6-NEXT:    v_mul_lo_u32 v7, v7, 24
+; GFX6-NEXT:    v_mul_lo_u32 v6, v6, 24
 ; GFX6-NEXT:    v_sub_i32_e32 v4, vcc, v4, v7
 ; GFX6-NEXT:    v_subrev_i32_e32 v7, vcc, 24, v4
 ; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, 24, v4
@@ -2476,11 +2440,8 @@ define <2 x i24> @v_fshl_v2i24(<2 x i24> %lhs, <2 x i24> %rhs, <2 x i24> %amt) {
 ; GFX6-NEXT:    v_subrev_i32_e32 v7, vcc, 24, v4
 ; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, 24, v4
 ; GFX6-NEXT:    v_cndmask_b32_e32 v4, v4, v7, vcc
-; GFX6-NEXT:    v_add_i32_e32 v6, vcc, v6, v8
-; GFX6-NEXT:    v_mul_hi_u32 v6, v5, v6
 ; GFX6-NEXT:    v_sub_i32_e32 v7, vcc, 23, v4
 ; GFX6-NEXT:    v_and_b32_e32 v4, 0xffffff, v4
-; GFX6-NEXT:    v_mul_lo_u32 v6, v6, 24
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v0, v4, v0
 ; GFX6-NEXT:    v_and_b32_e32 v4, 0xffffff, v7
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v2, v4, v2
@@ -2506,21 +2467,19 @@ define <2 x i24> @v_fshl_v2i24(<2 x i24> %lhs, <2 x i24> %rhs, <2 x i24> %amt) {
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_cvt_f32_ubyte0_e32 v6, 24
 ; GFX8-NEXT:    v_rcp_iflag_f32_e32 v6, v6
-; GFX8-NEXT:    v_mov_b32_e32 v8, 0xffffffe8
+; GFX8-NEXT:    v_mov_b32_e32 v7, 0xffffffe8
 ; GFX8-NEXT:    v_and_b32_e32 v4, 0xffffff, v4
 ; GFX8-NEXT:    v_and_b32_e32 v5, 0xffffff, v5
-; GFX8-NEXT:    v_mul_f32_e32 v7, 0x4f7ffffe, v6
-; GFX8-NEXT:    v_cvt_u32_f32_e32 v7, v7
 ; GFX8-NEXT:    v_mul_f32_e32 v6, 0x4f7ffffe, v6
 ; GFX8-NEXT:    v_cvt_u32_f32_e32 v6, v6
 ; GFX8-NEXT:    v_bfe_u32 v2, v2, 1, 23
-; GFX8-NEXT:    v_mul_lo_u32 v9, v7, v8
-; GFX8-NEXT:    v_mul_lo_u32 v8, v6, v8
-; GFX8-NEXT:    v_mul_hi_u32 v9, v7, v9
-; GFX8-NEXT:    v_mul_hi_u32 v8, v6, v8
-; GFX8-NEXT:    v_add_u32_e32 v7, vcc, v7, v9
-; GFX8-NEXT:    v_mul_hi_u32 v7, v4, v7
+; GFX8-NEXT:    v_mul_lo_u32 v7, v6, v7
+; GFX8-NEXT:    v_mul_hi_u32 v7, v6, v7
+; GFX8-NEXT:    v_add_u32_e32 v6, vcc, v6, v7
+; GFX8-NEXT:    v_mul_hi_u32 v7, v4, v6
+; GFX8-NEXT:    v_mul_hi_u32 v6, v5, v6
 ; GFX8-NEXT:    v_mul_lo_u32 v7, v7, 24
+; GFX8-NEXT:    v_mul_lo_u32 v6, v6, 24
 ; GFX8-NEXT:    v_sub_u32_e32 v4, vcc, v4, v7
 ; GFX8-NEXT:    v_subrev_u32_e32 v7, vcc, 24, v4
 ; GFX8-NEXT:    v_cmp_le_u32_e32 vcc, 24, v4
@@ -2528,11 +2487,8 @@ define <2 x i24> @v_fshl_v2i24(<2 x i24> %lhs, <2 x i24> %rhs, <2 x i24> %amt) {
 ; GFX8-NEXT:    v_subrev_u32_e32 v7, vcc, 24, v4
 ; GFX8-NEXT:    v_cmp_le_u32_e32 vcc, 24, v4
 ; GFX8-NEXT:    v_cndmask_b32_e32 v4, v4, v7, vcc
-; GFX8-NEXT:    v_add_u32_e32 v6, vcc, v6, v8
-; GFX8-NEXT:    v_mul_hi_u32 v6, v5, v6
 ; GFX8-NEXT:    v_sub_u32_e32 v7, vcc, 23, v4
 ; GFX8-NEXT:    v_and_b32_e32 v4, 0xffffff, v4
-; GFX8-NEXT:    v_mul_lo_u32 v6, v6, 24
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v0, v4, v0
 ; GFX8-NEXT:    v_and_b32_e32 v4, 0xffffff, v7
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v2, v4, v2
@@ -2558,41 +2514,36 @@ define <2 x i24> @v_fshl_v2i24(<2 x i24> %lhs, <2 x i24> %rhs, <2 x i24> %amt) {
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_cvt_f32_ubyte0_e32 v6, 24
 ; GFX9-NEXT:    v_rcp_iflag_f32_e32 v6, v6
-; GFX9-NEXT:    v_mov_b32_e32 v8, 0xffffffe8
+; GFX9-NEXT:    v_mov_b32_e32 v7, 0xffffffe8
 ; GFX9-NEXT:    v_and_b32_e32 v4, 0xffffff, v4
 ; GFX9-NEXT:    v_and_b32_e32 v5, 0xffffff, v5
-; GFX9-NEXT:    v_mul_f32_e32 v7, 0x4f7ffffe, v6
-; GFX9-NEXT:    v_cvt_u32_f32_e32 v7, v7
 ; GFX9-NEXT:    v_mul_f32_e32 v6, 0x4f7ffffe, v6
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v6, v6
 ; GFX9-NEXT:    v_bfe_u32 v2, v2, 1, 23
-; GFX9-NEXT:    v_mul_lo_u32 v9, v7, v8
 ; GFX9-NEXT:    v_bfe_u32 v3, v3, 1, 23
-; GFX9-NEXT:    v_mul_lo_u32 v8, v6, v8
-; GFX9-NEXT:    v_mul_hi_u32 v9, v7, v9
-; GFX9-NEXT:    v_mul_hi_u32 v8, v6, v8
-; GFX9-NEXT:    v_add_u32_e32 v7, v7, v9
-; GFX9-NEXT:    v_mul_hi_u32 v7, v4, v7
-; GFX9-NEXT:    v_add_u32_e32 v6, v6, v8
+; GFX9-NEXT:    v_mul_lo_u32 v7, v6, v7
+; GFX9-NEXT:    v_mul_hi_u32 v7, v6, v7
+; GFX9-NEXT:    v_add_u32_e32 v6, v6, v7
+; GFX9-NEXT:    v_mul_hi_u32 v7, v4, v6
 ; GFX9-NEXT:    v_mul_hi_u32 v6, v5, v6
 ; GFX9-NEXT:    v_mul_lo_u32 v7, v7, 24
 ; GFX9-NEXT:    v_mul_lo_u32 v6, v6, 24
 ; GFX9-NEXT:    v_sub_u32_e32 v4, v4, v7
-; GFX9-NEXT:    v_subrev_u32_e32 v7, 24, v4
+; GFX9-NEXT:    v_sub_u32_e32 v5, v5, v6
+; GFX9-NEXT:    v_subrev_u32_e32 v6, 24, v4
 ; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, 24, v4
-; GFX9-NEXT:    v_cndmask_b32_e32 v4, v4, v7, vcc
-; GFX9-NEXT:    v_subrev_u32_e32 v7, 24, v4
+; GFX9-NEXT:    v_cndmask_b32_e32 v4, v4, v6, vcc
+; GFX9-NEXT:    v_subrev_u32_e32 v6, 24, v4
 ; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, 24, v4
-; GFX9-NEXT:    v_cndmask_b32_e32 v4, v4, v7, vcc
-; GFX9-NEXT:    v_sub_u32_e32 v7, 23, v4
-; GFX9-NEXT:    v_and_b32_e32 v7, 0xffffff, v7
+; GFX9-NEXT:    v_cndmask_b32_e32 v4, v4, v6, vcc
+; GFX9-NEXT:    v_sub_u32_e32 v6, 23, v4
+; GFX9-NEXT:    v_and_b32_e32 v6, 0xffffff, v6
+; GFX9-NEXT:    v_subrev_u32_e32 v7, 24, v5
 ; GFX9-NEXT:    v_and_b32_e32 v4, 0xffffff, v4
-; GFX9-NEXT:    v_lshrrev_b32_e32 v2, v7, v2
+; GFX9-NEXT:    v_lshrrev_b32_e32 v2, v6, v2
+; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, 24, v5
 ; GFX9-NEXT:    v_lshl_or_b32 v0, v0, v4, v2
-; GFX9-NEXT:    v_sub_u32_e32 v2, v5, v6
-; GFX9-NEXT:    v_subrev_u32_e32 v4, 24, v2
-; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, 24, v2
-; GFX9-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, v5, v7, vcc
 ; GFX9-NEXT:    v_subrev_u32_e32 v4, 24, v2
 ; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, 24, v2
 ; GFX9-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
@@ -2612,17 +2563,12 @@ define <2 x i24> @v_fshl_v2i24(<2 x i24> %lhs, <2 x i24> %rhs, <2 x i24> %amt) {
 ; GFX10-NEXT:    v_bfe_u32 v2, v2, 1, 23
 ; GFX10-NEXT:    v_bfe_u32 v3, v3, 1, 23
 ; GFX10-NEXT:    v_rcp_iflag_f32_e32 v6, v6
-; GFX10-NEXT:    v_mul_f32_e32 v7, 0x4f7ffffe, v6
 ; GFX10-NEXT:    v_mul_f32_e32 v6, 0x4f7ffffe, v6
-; GFX10-NEXT:    v_cvt_u32_f32_e32 v7, v7
 ; GFX10-NEXT:    v_cvt_u32_f32_e32 v6, v6
-; GFX10-NEXT:    v_mul_lo_u32 v8, 0xffffffe8, v7
-; GFX10-NEXT:    v_mul_lo_u32 v9, 0xffffffe8, v6
-; GFX10-NEXT:    v_mul_hi_u32 v8, v7, v8
-; GFX10-NEXT:    v_mul_hi_u32 v9, v6, v9
-; GFX10-NEXT:    v_add_nc_u32_e32 v7, v7, v8
-; GFX10-NEXT:    v_add_nc_u32_e32 v6, v6, v9
-; GFX10-NEXT:    v_mul_hi_u32 v7, v4, v7
+; GFX10-NEXT:    v_mul_lo_u32 v7, 0xffffffe8, v6
+; GFX10-NEXT:    v_mul_hi_u32 v7, v6, v7
+; GFX10-NEXT:    v_add_nc_u32_e32 v6, v6, v7
+; GFX10-NEXT:    v_mul_hi_u32 v7, v4, v6
 ; GFX10-NEXT:    v_mul_hi_u32 v6, v5, v6
 ; GFX10-NEXT:    v_mul_lo_u32 v7, v7, 24
 ; GFX10-NEXT:    v_mul_lo_u32 v6, v6, 24
@@ -2656,64 +2602,58 @@ define <2 x i24> @v_fshl_v2i24(<2 x i24> %lhs, <2 x i24> %rhs, <2 x i24> %amt) {
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_cvt_f32_ubyte0_e32 v6, 24
-; GFX11-NEXT:    v_and_b32_e32 v5, 0xffffff, v5
+; GFX11-NEXT:    v_and_b32_e32 v4, 0xffffff, v4
 ; GFX11-NEXT:    v_bfe_u32 v2, v2, 1, 23
 ; GFX11-NEXT:    v_bfe_u32 v3, v3, 1, 23
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_rcp_iflag_f32_e32 v6, v6
 ; GFX11-NEXT:    s_waitcnt_depctr 0xfff
-; GFX11-NEXT:    v_mul_f32_e32 v7, 0x4f7ffffe, v6
 ; GFX11-NEXT:    v_mul_f32_e32 v6, 0x4f7ffffe, v6
 ; GFX11-NEXT:    v_cvt_u32_f32_e32 v6, v6
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_mul_lo_u32 v9, 0xffffffe8, v6
-; GFX11-NEXT:    v_mul_hi_u32 v9, v6, v9
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_add_nc_u32_e32 v6, v6, v9
-; GFX11-NEXT:    v_cvt_u32_f32_e32 v7, v7
-; GFX11-NEXT:    v_mul_hi_u32 v6, v5, v6
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_mul_lo_u32 v8, 0xffffffe8, v7
-; GFX11-NEXT:    v_mul_lo_u32 v6, v6, 24
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_mul_hi_u32 v8, v7, v8
-; GFX11-NEXT:    v_sub_nc_u32_e32 v5, v5, v6
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_add_nc_u32_e32 v7, v7, v8
-; GFX11-NEXT:    v_and_b32_e32 v4, 0xffffff, v4
-; GFX11-NEXT:    v_mul_hi_u32 v7, v4, v7
+; GFX11-NEXT:    v_mul_lo_u32 v7, 0xffffffe8, v6
+; GFX11-NEXT:    v_mul_hi_u32 v7, v6, v7
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_add_nc_u32_e32 v6, v6, v7
+; GFX11-NEXT:    v_mul_hi_u32 v7, v4, v6
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_mul_lo_u32 v7, v7, 24
 ; GFX11-NEXT:    v_sub_nc_u32_e32 v4, v4, v7
-; GFX11-NEXT:    v_subrev_nc_u32_e32 v7, 24, v5
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_subrev_nc_u32_e32 v6, 24, v4
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_cmp_le_u32_e32 vcc_lo, 24, v4
+; GFX11-NEXT:    v_and_b32_e32 v5, 0xffffff, v5
+; GFX11-NEXT:    v_mul_hi_u32 v6, v5, v6
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_mul_lo_u32 v6, v6, 24
+; GFX11-NEXT:    v_sub_nc_u32_e32 v5, v5, v6
+; GFX11-NEXT:    v_subrev_nc_u32_e32 v6, 24, v4
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_subrev_nc_u32_e32 v7, 24, v5
 ; GFX11-NEXT:    v_cndmask_b32_e32 v4, v4, v6, vcc_lo
 ; GFX11-NEXT:    v_cmp_le_u32_e32 vcc_lo, 24, v5
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4)
 ; GFX11-NEXT:    v_subrev_nc_u32_e32 v6, 24, v4
 ; GFX11-NEXT:    v_cndmask_b32_e32 v5, v5, v7, vcc_lo
 ; GFX11-NEXT:    v_cmp_le_u32_e32 vcc_lo, 24, v4
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4)
 ; GFX11-NEXT:    v_subrev_nc_u32_e32 v7, 24, v5
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
 ; GFX11-NEXT:    v_cndmask_b32_e32 v4, v4, v6, vcc_lo
 ; GFX11-NEXT:    v_cmp_le_u32_e32 vcc_lo, 24, v5
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_cndmask_b32_e32 v5, v5, v7, vcc_lo
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_sub_nc_u32_e32 v6, 23, v4
-; GFX11-NEXT:    v_and_b32_e32 v4, 0xffffff, v4
 ; GFX11-NEXT:    v_sub_nc_u32_e32 v7, 23, v5
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_and_b32_e32 v6, 0xffffff, v6
 ; GFX11-NEXT:    v_and_b32_e32 v5, 0xffffff, v5
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
 ; GFX11-NEXT:    v_and_b32_e32 v7, 0xffffff, v7
+; GFX11-NEXT:    v_sub_nc_u32_e32 v6, 23, v4
+; GFX11-NEXT:    v_and_b32_e32 v4, 0xffffff, v4
+; GFX11-NEXT:    v_lshrrev_b32_e32 v3, v7, v3
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_and_b32_e32 v6, 0xffffff, v6
+; GFX11-NEXT:    v_lshl_or_b32 v1, v1, v5, v3
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v2, v6, v2
-; GFX11-NEXT:    v_lshrrev_b32_e32 v3, v7, v3
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX11-NEXT:    v_lshl_or_b32 v0, v0, v4, v2
-; GFX11-NEXT:    v_lshl_or_b32 v1, v1, v5, v3
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %result = call <2 x i24> @llvm.fshl.v2i24(<2 x i24> %lhs, <2 x i24> %rhs, <2 x i24> %amt)
   ret <2 x i24> %result
@@ -5070,13 +5010,12 @@ define <4 x half> @v_fshl_v4i16(<4 x i16> %lhs, <4 x i16> %rhs, <4 x i16> %amt)
 ; GFX8-NEXT:    v_lshrrev_b16_e32 v6, 1, v3
 ; GFX8-NEXT:    v_lshlrev_b16_e32 v2, v2, v1
 ; GFX8-NEXT:    v_lshrrev_b16_e32 v5, v5, v6
+; GFX8-NEXT:    v_xor_b32_e32 v6, -1, v7
 ; GFX8-NEXT:    v_or_b32_e32 v2, v2, v5
 ; GFX8-NEXT:    v_and_b32_e32 v5, 15, v7
-; GFX8-NEXT:    v_xor_b32_e32 v6, -1, v7
-; GFX8-NEXT:    v_lshlrev_b16_sdwa v1, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX8-NEXT:    v_mov_b32_e32 v5, 1
 ; GFX8-NEXT:    v_and_b32_e32 v6, 15, v6
-; GFX8-NEXT:    v_lshrrev_b16_sdwa v3, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX8-NEXT:    v_lshrrev_b16_sdwa v3, v8, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX8-NEXT:    v_lshlrev_b16_sdwa v1, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; GFX8-NEXT:    v_lshrrev_b16_e32 v3, v6, v3
 ; GFX8-NEXT:    v_or_b32_e32 v1, v1, v3
 ; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff, v0
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll
index 3098ce672bd1cf1..c8455665e7b40f1 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll
@@ -535,9 +535,9 @@ define i8 @v_fshr_i8_4(i8 %lhs, i8 %rhs) {
 ; GFX9-LABEL: v_fshr_i8_4:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    s_mov_b32 s4, 4
+; GFX9-NEXT:    v_mov_b32_e32 v2, 4
 ; GFX9-NEXT:    v_lshlrev_b16_e32 v0, 4, v0
-; GFX9-NEXT:    v_lshrrev_b16_sdwa v1, s4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT:    v_lshrrev_b16_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; GFX9-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -885,24 +885,24 @@ define i16 @v_fshr_v2i8(i16 %lhs.arg, i16 %rhs.arg, i16 %amt.arg) {
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v3, 8, v2
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v4, 8, v0
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v5, 8, v1
-; GFX10-NEXT:    v_and_b32_e32 v7, 7, v2
+; GFX10-NEXT:    v_and_b32_e32 v6, 7, v2
 ; GFX10-NEXT:    v_not_b32_e32 v2, v2
-; GFX10-NEXT:    v_not_b32_e32 v6, v3
+; GFX10-NEXT:    v_not_b32_e32 v7, v3
 ; GFX10-NEXT:    v_and_b32_e32 v3, 7, v3
 ; GFX10-NEXT:    v_lshlrev_b16 v4, 1, v4
 ; GFX10-NEXT:    v_and_b32_e32 v5, 0xff, v5
 ; GFX10-NEXT:    v_lshlrev_b16 v0, 1, v0
-; GFX10-NEXT:    v_and_b32_e32 v6, 7, v6
+; GFX10-NEXT:    v_and_b32_e32 v7, 7, v7
 ; GFX10-NEXT:    v_and_b32_e32 v1, 0xff, v1
 ; GFX10-NEXT:    v_and_b32_e32 v2, 7, v2
 ; GFX10-NEXT:    v_lshrrev_b16 v3, v3, v5
-; GFX10-NEXT:    s_movk_i32 s4, 0xff
-; GFX10-NEXT:    v_lshlrev_b16 v4, v6, v4
-; GFX10-NEXT:    v_lshrrev_b16 v1, v7, v1
+; GFX10-NEXT:    v_lshlrev_b16 v4, v7, v4
+; GFX10-NEXT:    v_lshrrev_b16 v1, v6, v1
 ; GFX10-NEXT:    v_lshlrev_b16 v0, v2, v0
 ; GFX10-NEXT:    v_or_b32_e32 v2, v4, v3
+; GFX10-NEXT:    v_mov_b32_e32 v3, 0xff
 ; GFX10-NEXT:    v_or_b32_e32 v0, v0, v1
-; GFX10-NEXT:    v_and_b32_sdwa v1, v2, s4 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX10-NEXT:    v_and_b32_sdwa v1, v2, v3 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; GFX10-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -1294,11 +1294,11 @@ define i32 @v_fshr_v4i8(i32 %lhs.arg, i32 %rhs.arg, i32 %amt.arg) {
 ; GFX8-NEXT:    v_and_b32_e32 v4, 7, v6
 ; GFX8-NEXT:    v_not_b32_e32 v5, v6
 ; GFX8-NEXT:    v_mov_b32_e32 v6, 1
+; GFX8-NEXT:    v_mov_b32_e32 v9, 0xff
 ; GFX8-NEXT:    v_and_b32_e32 v5, 7, v5
 ; GFX8-NEXT:    v_lshlrev_b16_sdwa v8, v6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; GFX8-NEXT:    v_lshlrev_b16_e32 v5, v5, v8
-; GFX8-NEXT:    v_mov_b32_e32 v8, 0xff
-; GFX8-NEXT:    v_and_b32_sdwa v8, v1, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT:    v_and_b32_sdwa v8, v1, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; GFX8-NEXT:    v_lshrrev_b16_e32 v4, v4, v8
 ; GFX8-NEXT:    v_or_b32_e32 v4, v5, v4
 ; GFX8-NEXT:    v_and_b32_e32 v5, 7, v7
@@ -1344,11 +1344,11 @@ define i32 @v_fshr_v4i8(i32 %lhs.arg, i32 %rhs.arg, i32 %amt.arg) {
 ; GFX9-NEXT:    v_and_b32_e32 v4, 7, v6
 ; GFX9-NEXT:    v_not_b32_e32 v5, v6
 ; GFX9-NEXT:    v_mov_b32_e32 v6, 1
-; GFX9-NEXT:    s_movk_i32 s4, 0xff
+; GFX9-NEXT:    v_mov_b32_e32 v9, 0xff
 ; GFX9-NEXT:    v_and_b32_e32 v5, 7, v5
 ; GFX9-NEXT:    v_lshlrev_b16_sdwa v8, v6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; GFX9-NEXT:    v_lshlrev_b16_e32 v5, v5, v8
-; GFX9-NEXT:    v_and_b32_sdwa v8, v1, s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT:    v_and_b32_sdwa v8, v1, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; GFX9-NEXT:    v_lshrrev_b16_e32 v4, v4, v8
 ; GFX9-NEXT:    v_or_b32_e32 v4, v5, v4
 ; GFX9-NEXT:    v_and_b32_e32 v5, 7, v7
@@ -1360,7 +1360,7 @@ define i32 @v_fshr_v4i8(i32 %lhs.arg, i32 %rhs.arg, i32 %amt.arg) {
 ; GFX9-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 8
 ; GFX9-NEXT:    v_lshlrev_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX9-NEXT:    v_and_or_b32 v1, v2, s4, v1
+; GFX9-NEXT:    v_and_or_b32 v1, v2, v9, v1
 ; GFX9-NEXT:    v_and_b32_e32 v2, 0xff, v4
 ; GFX9-NEXT:    v_and_b32_e32 v0, 0xff, v0
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
@@ -1371,42 +1371,42 @@ define i32 @v_fshr_v4i8(i32 %lhs.arg, i32 %rhs.arg, i32 %amt.arg) {
 ; GFX10-LABEL: v_fshr_v4i8:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_not_b32_e32 v5, v2
-; GFX10-NEXT:    v_lshrrev_b32_e32 v7, 8, v2
+; GFX10-NEXT:    v_lshrrev_b32_e32 v5, 8, v2
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v3, 8, v0
-; GFX10-NEXT:    v_lshrrev_b32_e32 v4, 16, v0
-; GFX10-NEXT:    v_lshrrev_b32_e32 v6, 24, v0
-; GFX10-NEXT:    v_and_b32_e32 v5, 7, v5
-; GFX10-NEXT:    v_lshlrev_b16 v0, 1, v0
-; GFX10-NEXT:    v_not_b32_e32 v10, v7
-; GFX10-NEXT:    v_lshrrev_b32_e32 v8, 8, v1
+; GFX10-NEXT:    v_not_b32_e32 v8, v2
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v11, 16, v2
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v12, 24, v2
-; GFX10-NEXT:    v_lshlrev_b16 v0, v5, v0
-; GFX10-NEXT:    v_and_b32_e32 v5, 7, v10
+; GFX10-NEXT:    v_not_b32_e32 v10, v5
 ; GFX10-NEXT:    v_lshlrev_b16 v3, 1, v3
-; GFX10-NEXT:    v_not_b32_e32 v13, v11
-; GFX10-NEXT:    s_movk_i32 s4, 0xff
+; GFX10-NEXT:    v_lshrrev_b32_e32 v4, 16, v0
+; GFX10-NEXT:    v_lshrrev_b32_e32 v6, 24, v0
+; GFX10-NEXT:    v_lshrrev_b32_e32 v7, 8, v1
+; GFX10-NEXT:    v_and_b32_e32 v10, 7, v10
+; GFX10-NEXT:    v_lshlrev_b16 v0, 1, v0
+; GFX10-NEXT:    v_and_b32_e32 v8, 7, v8
+; GFX10-NEXT:    v_mov_b32_e32 v13, 0xff
+; GFX10-NEXT:    v_not_b32_e32 v14, v12
+; GFX10-NEXT:    v_lshlrev_b16 v3, v10, v3
+; GFX10-NEXT:    v_not_b32_e32 v10, v11
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v9, 24, v1
-; GFX10-NEXT:    v_and_b32_e32 v10, 0xff, v1
-; GFX10-NEXT:    v_lshlrev_b16 v3, v5, v3
-; GFX10-NEXT:    v_and_b32_e32 v5, 0xff, v8
-; GFX10-NEXT:    v_not_b32_e32 v8, v12
-; GFX10-NEXT:    v_and_b32_e32 v7, 7, v7
+; GFX10-NEXT:    v_lshlrev_b16 v0, v8, v0
+; GFX10-NEXT:    v_and_b32_e32 v8, 0xff, v1
+; GFX10-NEXT:    v_and_b32_e32 v5, 7, v5
+; GFX10-NEXT:    v_and_b32_e32 v7, 0xff, v7
 ; GFX10-NEXT:    v_and_b32_e32 v11, 7, v11
-; GFX10-NEXT:    v_and_b32_e32 v13, 7, v13
+; GFX10-NEXT:    v_and_b32_e32 v10, 7, v10
 ; GFX10-NEXT:    v_lshlrev_b16 v4, 1, v4
-; GFX10-NEXT:    v_and_b32_sdwa v1, v1, s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX10-NEXT:    v_and_b32_e32 v8, 7, v8
+; GFX10-NEXT:    v_and_b32_sdwa v1, v1, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX10-NEXT:    v_and_b32_e32 v13, 7, v14
 ; GFX10-NEXT:    v_lshlrev_b16 v6, 1, v6
 ; GFX10-NEXT:    v_and_b32_e32 v12, 7, v12
 ; GFX10-NEXT:    v_and_b32_e32 v2, 7, v2
-; GFX10-NEXT:    v_lshrrev_b16 v5, v7, v5
-; GFX10-NEXT:    v_lshlrev_b16 v4, v13, v4
+; GFX10-NEXT:    v_lshrrev_b16 v5, v5, v7
+; GFX10-NEXT:    v_lshlrev_b16 v4, v10, v4
 ; GFX10-NEXT:    v_lshrrev_b16 v1, v11, v1
-; GFX10-NEXT:    v_lshlrev_b16 v6, v8, v6
+; GFX10-NEXT:    v_lshlrev_b16 v6, v13, v6
 ; GFX10-NEXT:    v_lshrrev_b16 v7, v12, v9
-; GFX10-NEXT:    v_lshrrev_b16 v2, v2, v10
+; GFX10-NEXT:    v_lshrrev_b16 v2, v2, v8
 ; GFX10-NEXT:    v_or_b32_e32 v3, v3, v5
 ; GFX10-NEXT:    v_mov_b32_e32 v5, 8
 ; GFX10-NEXT:    v_or_b32_e32 v1, v4, v1
@@ -1415,7 +1415,7 @@ define i32 @v_fshr_v4i8(i32 %lhs.arg, i32 %rhs.arg, i32 %amt.arg) {
 ; GFX10-NEXT:    v_lshlrev_b32_sdwa v2, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; GFX10-NEXT:    v_and_b32_e32 v1, 0xff, v1
 ; GFX10-NEXT:    v_and_b32_e32 v3, 0xff, v4
-; GFX10-NEXT:    v_and_or_b32 v0, v0, 0xff, v2
+; GFX10-NEXT:    v_and_or_b32 v0, 0xff, v0, v2
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 24, v3
 ; GFX10-NEXT:    v_or3_b32 v0, v0, v1, v2
@@ -1471,7 +1471,7 @@ define i32 @v_fshr_v4i8(i32 %lhs.arg, i32 %rhs.arg, i32 %amt.arg) {
 ; GFX11-NEXT:    v_and_b32_e32 v2, 0xff, v3
 ; GFX11-NEXT:    v_and_b32_e32 v3, 0xff, v4
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_and_or_b32 v0, v0, 0xff, v1
+; GFX11-NEXT:    v_and_or_b32 v0, 0xff, v0, v1
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v2
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 24, v3
@@ -1823,10 +1823,10 @@ define amdgpu_ps i48 @s_fshr_v2i24(i48 inreg %lhs.arg, i48 inreg %rhs.arg, i48 i
 ; GFX6-NEXT:    s_and_b32 s9, s0, 0xff
 ; GFX6-NEXT:    s_bfe_u32 s0, s0, 0x80008
 ; GFX6-NEXT:    s_and_b32 s1, s1, 0xff
-; GFX6-NEXT:    v_mul_f32_e32 v1, 0x4f7ffffe, v0
+; GFX6-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
 ; GFX6-NEXT:    s_lshl_b32 s0, s0, 8
 ; GFX6-NEXT:    s_lshl_b32 s1, s1, 8
-; GFX6-NEXT:    v_cvt_u32_f32_e32 v1, v1
+; GFX6-NEXT:    v_cvt_u32_f32_e32 v0, v0
 ; GFX6-NEXT:    s_or_b32 s0, s9, s0
 ; GFX6-NEXT:    s_or_b32 s1, s7, s1
 ; GFX6-NEXT:    s_and_b32 s7, s8, 0xff
@@ -1838,12 +1838,12 @@ define amdgpu_ps i48 @s_fshr_v2i24(i48 inreg %lhs.arg, i48 inreg %rhs.arg, i48 i
 ; GFX6-NEXT:    s_and_b32 s8, s8, 0xff
 ; GFX6-NEXT:    s_or_b32 s2, s11, s2
 ; GFX6-NEXT:    s_and_b32 s8, 0xffff, s8
-; GFX6-NEXT:    v_mov_b32_e32 v2, 0xffffffe8
+; GFX6-NEXT:    v_mov_b32_e32 v1, 0xffffffe8
 ; GFX6-NEXT:    s_lshr_b32 s10, s3, 8
 ; GFX6-NEXT:    s_and_b32 s2, 0xffff, s2
 ; GFX6-NEXT:    s_lshl_b32 s8, s8, 16
 ; GFX6-NEXT:    s_and_b32 s3, s3, 0xff
-; GFX6-NEXT:    v_mul_lo_u32 v3, v1, v2
+; GFX6-NEXT:    v_mul_lo_u32 v1, v0, v1
 ; GFX6-NEXT:    s_or_b32 s2, s2, s8
 ; GFX6-NEXT:    s_lshl_b32 s3, s3, 8
 ; GFX6-NEXT:    s_and_b32 s8, s10, 0xff
@@ -1856,7 +1856,7 @@ define amdgpu_ps i48 @s_fshr_v2i24(i48 inreg %lhs.arg, i48 inreg %rhs.arg, i48 i
 ; GFX6-NEXT:    s_lshr_b32 s9, s4, 24
 ; GFX6-NEXT:    s_and_b32 s11, s4, 0xff
 ; GFX6-NEXT:    s_bfe_u32 s4, s4, 0x80008
-; GFX6-NEXT:    v_mul_hi_u32 v3, v1, v3
+; GFX6-NEXT:    v_mul_hi_u32 v1, v0, v1
 ; GFX6-NEXT:    s_lshl_b32 s4, s4, 8
 ; GFX6-NEXT:    s_and_b32 s8, s8, 0xff
 ; GFX6-NEXT:    s_or_b32 s4, s11, s4
@@ -1864,40 +1864,35 @@ define amdgpu_ps i48 @s_fshr_v2i24(i48 inreg %lhs.arg, i48 inreg %rhs.arg, i48 i
 ; GFX6-NEXT:    s_and_b32 s4, 0xffff, s4
 ; GFX6-NEXT:    s_lshl_b32 s8, s8, 16
 ; GFX6-NEXT:    s_or_b32 s4, s4, s8
-; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v1, v3
-; GFX6-NEXT:    v_mul_hi_u32 v1, s4, v1
-; GFX6-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
-; GFX6-NEXT:    v_cvt_u32_f32_e32 v0, v0
+; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
+; GFX6-NEXT:    v_mul_hi_u32 v1, s4, v0
 ; GFX6-NEXT:    s_lshr_b32 s10, s5, 8
-; GFX6-NEXT:    v_mul_lo_u32 v1, v1, 24
 ; GFX6-NEXT:    s_and_b32 s5, s5, 0xff
-; GFX6-NEXT:    v_mul_lo_u32 v2, v0, v2
 ; GFX6-NEXT:    s_lshl_b32 s5, s5, 8
-; GFX6-NEXT:    v_sub_i32_e32 v1, vcc, s4, v1
-; GFX6-NEXT:    v_subrev_i32_e32 v3, vcc, 24, v1
-; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, 24, v1
-; GFX6-NEXT:    v_mul_hi_u32 v2, v0, v2
+; GFX6-NEXT:    v_mul_lo_u32 v1, v1, 24
 ; GFX6-NEXT:    s_and_b32 s8, s10, 0xff
-; GFX6-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
 ; GFX6-NEXT:    s_or_b32 s5, s9, s5
 ; GFX6-NEXT:    s_and_b32 s8, 0xffff, s8
-; GFX6-NEXT:    v_subrev_i32_e32 v3, vcc, 24, v1
 ; GFX6-NEXT:    s_and_b32 s5, 0xffff, s5
 ; GFX6-NEXT:    s_lshl_b32 s8, s8, 16
-; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, 24, v1
 ; GFX6-NEXT:    s_or_b32 s5, s5, s8
-; GFX6-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
-; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
+; GFX6-NEXT:    v_sub_i32_e32 v1, vcc, s4, v1
+; GFX6-NEXT:    v_subrev_i32_e32 v2, vcc, 24, v1
 ; GFX6-NEXT:    v_mul_hi_u32 v0, s5, v0
+; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, 24, v1
+; GFX6-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
+; GFX6-NEXT:    v_subrev_i32_e32 v2, vcc, 24, v1
 ; GFX6-NEXT:    s_and_b32 s6, s6, 0xff
+; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, 24, v1
+; GFX6-NEXT:    v_mul_lo_u32 v0, v0, 24
 ; GFX6-NEXT:    s_and_b32 s0, 0xffff, s0
 ; GFX6-NEXT:    s_and_b32 s6, 0xffff, s6
-; GFX6-NEXT:    v_mul_lo_u32 v0, v0, 24
-; GFX6-NEXT:    v_sub_i32_e32 v3, vcc, 23, v1
+; GFX6-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
+; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, 23, v1
 ; GFX6-NEXT:    s_lshl_b32 s4, s6, 17
 ; GFX6-NEXT:    s_lshl_b32 s0, s0, 1
 ; GFX6-NEXT:    s_or_b32 s0, s4, s0
-; GFX6-NEXT:    v_and_b32_e32 v2, 0xffffff, v3
+; GFX6-NEXT:    v_and_b32_e32 v2, 0xffffff, v2
 ; GFX6-NEXT:    v_and_b32_e32 v1, 0xffffff, v1
 ; GFX6-NEXT:    v_lshl_b32_e32 v2, s0, v2
 ; GFX6-NEXT:    v_lshr_b32_e32 v1, s2, v1
@@ -1961,10 +1956,10 @@ define amdgpu_ps i48 @s_fshr_v2i24(i48 inreg %lhs.arg, i48 inreg %rhs.arg, i48 i
 ; GFX8-NEXT:    s_lshr_b32 s10, s2, 24
 ; GFX8-NEXT:    s_and_b32 s2, s2, 0xff
 ; GFX8-NEXT:    s_lshl_b32 s8, s8, 8
-; GFX8-NEXT:    v_mul_f32_e32 v1, 0x4f7ffffe, v0
+; GFX8-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
 ; GFX8-NEXT:    s_or_b32 s2, s2, s8
 ; GFX8-NEXT:    s_and_b32 s8, s9, 0xff
-; GFX8-NEXT:    v_cvt_u32_f32_e32 v1, v1
+; GFX8-NEXT:    v_cvt_u32_f32_e32 v0, v0
 ; GFX8-NEXT:    s_and_b32 s8, 0xffff, s8
 ; GFX8-NEXT:    s_lshr_b32 s11, s3, 8
 ; GFX8-NEXT:    s_and_b32 s2, 0xffff, s2
@@ -1973,10 +1968,10 @@ define amdgpu_ps i48 @s_fshr_v2i24(i48 inreg %lhs.arg, i48 inreg %rhs.arg, i48 i
 ; GFX8-NEXT:    s_or_b32 s2, s2, s8
 ; GFX8-NEXT:    s_lshl_b32 s3, s3, 8
 ; GFX8-NEXT:    s_and_b32 s8, s11, 0xff
-; GFX8-NEXT:    v_mov_b32_e32 v2, 0xffffffe8
+; GFX8-NEXT:    v_mov_b32_e32 v1, 0xffffffe8
 ; GFX8-NEXT:    s_or_b32 s3, s10, s3
 ; GFX8-NEXT:    s_and_b32 s8, 0xffff, s8
-; GFX8-NEXT:    v_mul_lo_u32 v3, v1, v2
+; GFX8-NEXT:    v_mul_lo_u32 v1, v0, v1
 ; GFX8-NEXT:    s_and_b32 s3, 0xffff, s3
 ; GFX8-NEXT:    s_lshl_b32 s8, s8, 16
 ; GFX8-NEXT:    s_or_b32 s3, s3, s8
@@ -1986,46 +1981,41 @@ define amdgpu_ps i48 @s_fshr_v2i24(i48 inreg %lhs.arg, i48 inreg %rhs.arg, i48 i
 ; GFX8-NEXT:    s_lshr_b32 s10, s4, 24
 ; GFX8-NEXT:    s_and_b32 s4, s4, 0xff
 ; GFX8-NEXT:    s_lshl_b32 s8, s8, 8
-; GFX8-NEXT:    v_mul_hi_u32 v3, v1, v3
+; GFX8-NEXT:    v_mul_hi_u32 v1, v0, v1
 ; GFX8-NEXT:    s_or_b32 s4, s4, s8
 ; GFX8-NEXT:    s_and_b32 s8, s9, 0xff
 ; GFX8-NEXT:    s_and_b32 s8, 0xffff, s8
 ; GFX8-NEXT:    s_and_b32 s4, 0xffff, s4
 ; GFX8-NEXT:    s_lshl_b32 s8, s8, 16
 ; GFX8-NEXT:    s_or_b32 s4, s4, s8
-; GFX8-NEXT:    v_add_u32_e32 v1, vcc, v1, v3
-; GFX8-NEXT:    v_mul_hi_u32 v1, s4, v1
-; GFX8-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
-; GFX8-NEXT:    v_cvt_u32_f32_e32 v0, v0
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v0, v1
+; GFX8-NEXT:    v_mul_hi_u32 v1, s4, v0
 ; GFX8-NEXT:    s_lshr_b32 s11, s5, 8
-; GFX8-NEXT:    v_mul_lo_u32 v1, v1, 24
 ; GFX8-NEXT:    s_and_b32 s5, s5, 0xff
-; GFX8-NEXT:    v_mul_lo_u32 v2, v0, v2
 ; GFX8-NEXT:    s_lshl_b32 s5, s5, 8
-; GFX8-NEXT:    v_sub_u32_e32 v1, vcc, s4, v1
-; GFX8-NEXT:    v_subrev_u32_e32 v3, vcc, 24, v1
-; GFX8-NEXT:    v_cmp_le_u32_e32 vcc, 24, v1
-; GFX8-NEXT:    v_mul_hi_u32 v2, v0, v2
+; GFX8-NEXT:    v_mul_lo_u32 v1, v1, 24
 ; GFX8-NEXT:    s_and_b32 s8, s11, 0xff
-; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
 ; GFX8-NEXT:    s_or_b32 s5, s10, s5
 ; GFX8-NEXT:    s_and_b32 s8, 0xffff, s8
-; GFX8-NEXT:    v_subrev_u32_e32 v3, vcc, 24, v1
 ; GFX8-NEXT:    s_and_b32 s5, 0xffff, s5
 ; GFX8-NEXT:    s_lshl_b32 s8, s8, 16
-; GFX8-NEXT:    v_cmp_le_u32_e32 vcc, 24, v1
 ; GFX8-NEXT:    s_or_b32 s5, s5, s8
-; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
-; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
+; GFX8-NEXT:    v_sub_u32_e32 v1, vcc, s4, v1
+; GFX8-NEXT:    v_subrev_u32_e32 v2, vcc, 24, v1
 ; GFX8-NEXT:    v_mul_hi_u32 v0, s5, v0
+; GFX8-NEXT:    v_cmp_le_u32_e32 vcc, 24, v1
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
+; GFX8-NEXT:    v_subrev_u32_e32 v2, vcc, 24, v1
+; GFX8-NEXT:    v_cmp_le_u32_e32 vcc, 24, v1
+; GFX8-NEXT:    v_mul_lo_u32 v0, v0, 24
 ; GFX8-NEXT:    s_and_b32 s0, 0xffff, s0
 ; GFX8-NEXT:    s_and_b32 s6, 0xffff, s6
-; GFX8-NEXT:    v_sub_u32_e32 v3, vcc, 23, v1
-; GFX8-NEXT:    v_mul_lo_u32 v0, v0, 24
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
+; GFX8-NEXT:    v_sub_u32_e32 v2, vcc, 23, v1
 ; GFX8-NEXT:    s_lshl_b32 s4, s6, 17
 ; GFX8-NEXT:    s_lshl_b32 s0, s0, 1
 ; GFX8-NEXT:    s_or_b32 s0, s4, s0
-; GFX8-NEXT:    v_and_b32_e32 v2, 0xffffff, v3
+; GFX8-NEXT:    v_and_b32_e32 v2, 0xffffff, v2
 ; GFX8-NEXT:    v_and_b32_e32 v1, 0xffffff, v1
 ; GFX8-NEXT:    v_lshlrev_b32_e64 v2, v2, s0
 ; GFX8-NEXT:    v_lshrrev_b32_e64 v1, v1, s2
@@ -2066,92 +2056,87 @@ define amdgpu_ps i48 @s_fshr_v2i24(i48 inreg %lhs.arg, i48 inreg %rhs.arg, i48 i
 ; GFX9-LABEL: s_fshr_v2i24:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    v_cvt_f32_ubyte0_e32 v0, 24
-; GFX9-NEXT:    s_lshr_b32 s11, s1, 8
+; GFX9-NEXT:    s_lshr_b32 s9, s1, 8
 ; GFX9-NEXT:    s_and_b32 s1, s1, 0xff
 ; GFX9-NEXT:    v_rcp_iflag_f32_e32 v0, v0
-; GFX9-NEXT:    s_lshr_b32 s7, s0, 8
-; GFX9-NEXT:    s_lshr_b32 s10, s0, 24
+; GFX9-NEXT:    s_lshr_b32 s6, s0, 8
+; GFX9-NEXT:    s_lshr_b32 s8, s0, 24
 ; GFX9-NEXT:    s_lshl_b32 s1, s1, 8
-; GFX9-NEXT:    s_and_b32 s7, s7, 0xff
-; GFX9-NEXT:    s_or_b32 s1, s10, s1
-; GFX9-NEXT:    s_lshr_b32 s10, s2, 8
-; GFX9-NEXT:    s_lshr_b32 s9, s0, 16
+; GFX9-NEXT:    s_and_b32 s6, s6, 0xff
+; GFX9-NEXT:    s_or_b32 s1, s8, s1
+; GFX9-NEXT:    s_lshr_b32 s8, s2, 8
+; GFX9-NEXT:    s_lshr_b32 s7, s0, 16
 ; GFX9-NEXT:    s_and_b32 s0, s0, 0xff
-; GFX9-NEXT:    s_lshl_b32 s7, s7, 8
-; GFX9-NEXT:    s_and_b32 s10, s10, 0xff
-; GFX9-NEXT:    s_or_b32 s0, s0, s7
+; GFX9-NEXT:    s_lshl_b32 s6, s6, 8
+; GFX9-NEXT:    s_and_b32 s8, s8, 0xff
+; GFX9-NEXT:    s_or_b32 s0, s0, s6
+; GFX9-NEXT:    s_and_b32 s6, s7, 0xff
 ; GFX9-NEXT:    s_and_b32 s7, s9, 0xff
-; GFX9-NEXT:    s_and_b32 s9, s11, 0xff
-; GFX9-NEXT:    s_lshr_b32 s11, s2, 16
-; GFX9-NEXT:    s_lshr_b32 s12, s2, 24
+; GFX9-NEXT:    s_lshr_b32 s9, s2, 16
+; GFX9-NEXT:    s_lshr_b32 s10, s2, 24
 ; GFX9-NEXT:    s_and_b32 s2, s2, 0xff
-; GFX9-NEXT:    s_lshl_b32 s10, s10, 8
-; GFX9-NEXT:    v_mul_f32_e32 v1, 0x4f7ffffe, v0
-; GFX9-NEXT:    s_or_b32 s2, s2, s10
-; GFX9-NEXT:    s_and_b32 s10, s11, 0xff
-; GFX9-NEXT:    v_cvt_u32_f32_e32 v1, v1
-; GFX9-NEXT:    s_and_b32 s10, 0xffff, s10
-; GFX9-NEXT:    s_lshr_b32 s13, s3, 8
+; GFX9-NEXT:    s_lshl_b32 s8, s8, 8
+; GFX9-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
+; GFX9-NEXT:    s_or_b32 s2, s2, s8
+; GFX9-NEXT:    s_and_b32 s8, s9, 0xff
+; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
+; GFX9-NEXT:    s_and_b32 s8, 0xffff, s8
+; GFX9-NEXT:    s_lshr_b32 s11, s3, 8
 ; GFX9-NEXT:    s_and_b32 s2, 0xffff, s2
-; GFX9-NEXT:    s_lshl_b32 s10, s10, 16
+; GFX9-NEXT:    s_lshl_b32 s8, s8, 16
 ; GFX9-NEXT:    s_and_b32 s3, s3, 0xff
-; GFX9-NEXT:    s_or_b32 s2, s2, s10
+; GFX9-NEXT:    s_or_b32 s2, s2, s8
 ; GFX9-NEXT:    s_lshl_b32 s3, s3, 8
-; GFX9-NEXT:    s_and_b32 s10, s13, 0xff
-; GFX9-NEXT:    v_mov_b32_e32 v2, 0xffffffe8
-; GFX9-NEXT:    s_or_b32 s3, s12, s3
-; GFX9-NEXT:    s_and_b32 s10, 0xffff, s10
-; GFX9-NEXT:    v_mul_lo_u32 v3, v1, v2
+; GFX9-NEXT:    s_and_b32 s8, s11, 0xff
+; GFX9-NEXT:    v_mov_b32_e32 v1, 0xffffffe8
+; GFX9-NEXT:    s_or_b32 s3, s10, s3
+; GFX9-NEXT:    s_and_b32 s8, 0xffff, s8
+; GFX9-NEXT:    v_mul_lo_u32 v1, v0, v1
 ; GFX9-NEXT:    s_and_b32 s3, 0xffff, s3
-; GFX9-NEXT:    s_lshl_b32 s10, s10, 16
-; GFX9-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
-; GFX9-NEXT:    s_or_b32 s3, s3, s10
-; GFX9-NEXT:    s_lshr_b32 s10, s4, 8
-; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
-; GFX9-NEXT:    s_and_b32 s10, s10, 0xff
-; GFX9-NEXT:    s_lshr_b32 s11, s4, 16
-; GFX9-NEXT:    s_lshr_b32 s12, s4, 24
+; GFX9-NEXT:    s_lshl_b32 s8, s8, 16
+; GFX9-NEXT:    s_or_b32 s3, s3, s8
+; GFX9-NEXT:    s_lshr_b32 s8, s4, 8
+; GFX9-NEXT:    s_and_b32 s8, s8, 0xff
+; GFX9-NEXT:    s_lshr_b32 s9, s4, 16
+; GFX9-NEXT:    s_lshr_b32 s10, s4, 24
 ; GFX9-NEXT:    s_and_b32 s4, s4, 0xff
-; GFX9-NEXT:    s_lshl_b32 s10, s10, 8
-; GFX9-NEXT:    v_mul_hi_u32 v3, v1, v3
-; GFX9-NEXT:    s_or_b32 s4, s4, s10
-; GFX9-NEXT:    s_and_b32 s10, s11, 0xff
-; GFX9-NEXT:    s_and_b32 s10, 0xffff, s10
-; GFX9-NEXT:    v_mul_lo_u32 v2, v0, v2
+; GFX9-NEXT:    s_lshl_b32 s8, s8, 8
+; GFX9-NEXT:    v_mul_hi_u32 v1, v0, v1
+; GFX9-NEXT:    s_or_b32 s4, s4, s8
+; GFX9-NEXT:    s_and_b32 s8, s9, 0xff
+; GFX9-NEXT:    s_and_b32 s8, 0xffff, s8
 ; GFX9-NEXT:    s_and_b32 s4, 0xffff, s4
-; GFX9-NEXT:    s_lshl_b32 s10, s10, 16
-; GFX9-NEXT:    s_or_b32 s4, s4, s10
-; GFX9-NEXT:    v_add_u32_e32 v1, v1, v3
-; GFX9-NEXT:    v_mul_hi_u32 v1, s4, v1
-; GFX9-NEXT:    s_lshr_b32 s13, s5, 8
+; GFX9-NEXT:    s_lshl_b32 s8, s8, 16
+; GFX9-NEXT:    s_or_b32 s4, s4, s8
+; GFX9-NEXT:    v_add_u32_e32 v0, v0, v1
+; GFX9-NEXT:    v_mul_hi_u32 v1, s4, v0
+; GFX9-NEXT:    s_lshr_b32 s11, s5, 8
 ; GFX9-NEXT:    s_and_b32 s5, s5, 0xff
-; GFX9-NEXT:    v_mul_hi_u32 v2, v0, v2
 ; GFX9-NEXT:    s_lshl_b32 s5, s5, 8
-; GFX9-NEXT:    s_and_b32 s10, s13, 0xff
-; GFX9-NEXT:    s_or_b32 s5, s12, s5
-; GFX9-NEXT:    s_and_b32 s10, 0xffff, s10
+; GFX9-NEXT:    s_and_b32 s8, s11, 0xff
+; GFX9-NEXT:    s_or_b32 s5, s10, s5
+; GFX9-NEXT:    s_and_b32 s8, 0xffff, s8
 ; GFX9-NEXT:    s_and_b32 s5, 0xffff, s5
-; GFX9-NEXT:    s_lshl_b32 s10, s10, 16
 ; GFX9-NEXT:    v_mul_lo_u32 v1, v1, 24
-; GFX9-NEXT:    s_or_b32 s5, s5, s10
-; GFX9-NEXT:    v_add_u32_e32 v0, v0, v2
+; GFX9-NEXT:    s_lshl_b32 s8, s8, 16
+; GFX9-NEXT:    s_or_b32 s5, s5, s8
 ; GFX9-NEXT:    v_mul_hi_u32 v0, s5, v0
 ; GFX9-NEXT:    v_sub_u32_e32 v1, s4, v1
-; GFX9-NEXT:    v_subrev_u32_e32 v3, 24, v1
+; GFX9-NEXT:    v_subrev_u32_e32 v2, 24, v1
 ; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, 24, v1
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
 ; GFX9-NEXT:    v_mul_lo_u32 v0, v0, 24
-; GFX9-NEXT:    v_subrev_u32_e32 v3, 24, v1
+; GFX9-NEXT:    v_subrev_u32_e32 v2, 24, v1
 ; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, 24, v1
 ; GFX9-NEXT:    s_and_b32 s0, 0xffff, s0
-; GFX9-NEXT:    s_and_b32 s7, 0xffff, s7
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
-; GFX9-NEXT:    v_sub_u32_e32 v3, 23, v1
-; GFX9-NEXT:    s_lshl_b32 s4, s7, 17
+; GFX9-NEXT:    s_and_b32 s6, 0xffff, s6
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
+; GFX9-NEXT:    v_sub_u32_e32 v2, 23, v1
+; GFX9-NEXT:    s_lshl_b32 s4, s6, 17
 ; GFX9-NEXT:    s_lshl_b32 s0, s0, 1
 ; GFX9-NEXT:    v_and_b32_e32 v1, 0xffffff, v1
 ; GFX9-NEXT:    s_or_b32 s0, s4, s0
-; GFX9-NEXT:    v_and_b32_e32 v2, 0xffffff, v3
+; GFX9-NEXT:    v_and_b32_e32 v2, 0xffffff, v2
 ; GFX9-NEXT:    v_lshrrev_b32_e64 v1, v1, s2
 ; GFX9-NEXT:    v_sub_u32_e32 v0, s5, v0
 ; GFX9-NEXT:    v_lshl_or_b32 v1, s0, v2, v1
@@ -2161,23 +2146,23 @@ define amdgpu_ps i48 @s_fshr_v2i24(i48 inreg %lhs.arg, i48 inreg %rhs.arg, i48 i
 ; GFX9-NEXT:    v_subrev_u32_e32 v2, 24, v0
 ; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, 24, v0
 ; GFX9-NEXT:    s_and_b32 s1, 0xffff, s1
-; GFX9-NEXT:    s_and_b32 s9, 0xffff, s9
+; GFX9-NEXT:    s_and_b32 s7, 0xffff, s7
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
 ; GFX9-NEXT:    v_sub_u32_e32 v2, 23, v0
-; GFX9-NEXT:    s_lshl_b32 s0, s9, 17
+; GFX9-NEXT:    s_lshl_b32 s0, s7, 17
 ; GFX9-NEXT:    s_lshl_b32 s1, s1, 1
 ; GFX9-NEXT:    v_and_b32_e32 v0, 0xffffff, v0
 ; GFX9-NEXT:    s_or_b32 s0, s0, s1
 ; GFX9-NEXT:    v_and_b32_e32 v2, 0xffffff, v2
 ; GFX9-NEXT:    v_lshrrev_b32_e64 v0, v0, s3
-; GFX9-NEXT:    s_mov_b32 s6, 8
+; GFX9-NEXT:    v_mov_b32_e32 v3, 8
 ; GFX9-NEXT:    v_lshl_or_b32 v0, s0, v2, v0
-; GFX9-NEXT:    s_mov_b32 s8, 16
-; GFX9-NEXT:    s_movk_i32 s0, 0xff
-; GFX9-NEXT:    v_lshlrev_b32_sdwa v2, s6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
+; GFX9-NEXT:    v_mov_b32_e32 v2, 0xff
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v3, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
+; GFX9-NEXT:    v_and_or_b32 v2, v1, v2, v3
+; GFX9-NEXT:    v_mov_b32_e32 v3, 16
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
 ; GFX9-NEXT:    v_and_b32_e32 v3, 0xff, v0
-; GFX9-NEXT:    v_and_or_b32 v2, v1, s0, v2
-; GFX9-NEXT:    v_lshlrev_b32_sdwa v1, s8, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v3, 24, v3
 ; GFX9-NEXT:    v_or3_b32 v1, v2, v1, v3
 ; GFX9-NEXT:    v_bfe_u32 v2, v0, 8, 8
@@ -2190,255 +2175,243 @@ define amdgpu_ps i48 @s_fshr_v2i24(i48 inreg %lhs.arg, i48 inreg %rhs.arg, i48 i
 ; GFX10-LABEL: s_fshr_v2i24:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    v_cvt_f32_ubyte0_e32 v0, 24
+; GFX10-NEXT:    s_lshr_b32 s14, s4, 8
+; GFX10-NEXT:    s_lshr_b32 s15, s4, 16
+; GFX10-NEXT:    s_and_b32 s14, s14, 0xff
+; GFX10-NEXT:    s_lshr_b32 s16, s4, 24
+; GFX10-NEXT:    v_rcp_iflag_f32_e32 v0, v0
+; GFX10-NEXT:    s_and_b32 s4, s4, 0xff
+; GFX10-NEXT:    s_and_b32 s15, s15, 0xff
+; GFX10-NEXT:    s_lshl_b32 s14, s14, 8
+; GFX10-NEXT:    s_and_b32 s15, 0xffff, s15
+; GFX10-NEXT:    s_or_b32 s4, s4, s14
+; GFX10-NEXT:    s_lshr_b32 s17, s5, 8
+; GFX10-NEXT:    s_and_b32 s5, s5, 0xff
+; GFX10-NEXT:    s_lshl_b32 s14, s15, 16
+; GFX10-NEXT:    s_and_b32 s4, 0xffff, s4
+; GFX10-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
+; GFX10-NEXT:    s_lshl_b32 s5, s5, 8
+; GFX10-NEXT:    s_and_b32 s15, s17, 0xff
+; GFX10-NEXT:    s_or_b32 s4, s4, s14
+; GFX10-NEXT:    s_or_b32 s5, s16, s5
+; GFX10-NEXT:    v_cvt_u32_f32_e32 v0, v0
+; GFX10-NEXT:    s_and_b32 s14, 0xffff, s15
+; GFX10-NEXT:    s_and_b32 s5, 0xffff, s5
+; GFX10-NEXT:    s_lshl_b32 s14, s14, 16
 ; GFX10-NEXT:    s_lshr_b32 s9, s1, 8
+; GFX10-NEXT:    v_mul_lo_u32 v1, 0xffffffe8, v0
+; GFX10-NEXT:    s_or_b32 s5, s5, s14
 ; GFX10-NEXT:    s_and_b32 s1, s1, 0xff
-; GFX10-NEXT:    s_lshr_b32 s6, s0, 8
+; GFX10-NEXT:    s_lshr_b32 s10, s2, 8
 ; GFX10-NEXT:    s_lshr_b32 s8, s0, 24
-; GFX10-NEXT:    v_rcp_iflag_f32_e32 v0, v0
+; GFX10-NEXT:    s_lshr_b32 s11, s2, 16
 ; GFX10-NEXT:    s_lshl_b32 s1, s1, 8
-; GFX10-NEXT:    s_and_b32 s6, s6, 0xff
+; GFX10-NEXT:    s_and_b32 s9, s9, 0xff
+; GFX10-NEXT:    v_mul_hi_u32 v1, v0, v1
+; GFX10-NEXT:    s_and_b32 s10, s10, 0xff
+; GFX10-NEXT:    s_lshr_b32 s12, s2, 24
+; GFX10-NEXT:    s_and_b32 s2, s2, 0xff
 ; GFX10-NEXT:    s_or_b32 s1, s8, s1
-; GFX10-NEXT:    s_lshr_b32 s8, s4, 8
+; GFX10-NEXT:    s_and_b32 s8, 0xffff, s9
+; GFX10-NEXT:    s_lshl_b32 s9, s10, 8
+; GFX10-NEXT:    s_lshr_b32 s6, s0, 8
+; GFX10-NEXT:    v_add_nc_u32_e32 v0, v0, v1
+; GFX10-NEXT:    s_or_b32 s2, s2, s9
+; GFX10-NEXT:    s_lshr_b32 s13, s3, 8
+; GFX10-NEXT:    s_and_b32 s2, 0xffff, s2
+; GFX10-NEXT:    s_and_b32 s3, s3, 0xff
+; GFX10-NEXT:    v_mul_hi_u32 v1, s4, v0
+; GFX10-NEXT:    v_mul_hi_u32 v0, s5, v0
+; GFX10-NEXT:    s_and_b32 s6, s6, 0xff
+; GFX10-NEXT:    s_lshl_b32 s3, s3, 8
 ; GFX10-NEXT:    s_lshr_b32 s7, s0, 16
 ; GFX10-NEXT:    s_and_b32 s0, s0, 0xff
 ; GFX10-NEXT:    s_lshl_b32 s6, s6, 8
-; GFX10-NEXT:    s_and_b32 s8, s8, 0xff
-; GFX10-NEXT:    s_or_b32 s0, s0, s6
-; GFX10-NEXT:    v_mul_f32_e32 v1, 0x4f7ffffe, v0
-; GFX10-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
-; GFX10-NEXT:    s_and_b32 s6, s7, 0xff
-; GFX10-NEXT:    s_and_b32 s7, s9, 0xff
-; GFX10-NEXT:    s_lshr_b32 s9, s4, 16
-; GFX10-NEXT:    v_cvt_u32_f32_e32 v1, v1
-; GFX10-NEXT:    v_cvt_u32_f32_e32 v0, v0
-; GFX10-NEXT:    s_lshr_b32 s10, s4, 24
-; GFX10-NEXT:    s_and_b32 s4, s4, 0xff
-; GFX10-NEXT:    s_lshl_b32 s8, s8, 8
-; GFX10-NEXT:    v_mul_lo_u32 v2, 0xffffffe8, v1
-; GFX10-NEXT:    v_mul_lo_u32 v3, 0xffffffe8, v0
-; GFX10-NEXT:    s_or_b32 s4, s4, s8
-; GFX10-NEXT:    s_and_b32 s8, s9, 0xff
-; GFX10-NEXT:    s_lshr_b32 s11, s5, 8
-; GFX10-NEXT:    s_and_b32 s8, 0xffff, s8
-; GFX10-NEXT:    s_and_b32 s4, 0xffff, s4
-; GFX10-NEXT:    s_and_b32 s5, s5, 0xff
-; GFX10-NEXT:    v_mul_hi_u32 v2, v1, v2
-; GFX10-NEXT:    v_mul_hi_u32 v3, v0, v3
-; GFX10-NEXT:    s_lshl_b32 s8, s8, 16
-; GFX10-NEXT:    s_lshl_b32 s5, s5, 8
-; GFX10-NEXT:    s_or_b32 s4, s4, s8
-; GFX10-NEXT:    s_and_b32 s8, s11, 0xff
-; GFX10-NEXT:    s_or_b32 s5, s10, s5
-; GFX10-NEXT:    s_and_b32 s8, 0xffff, s8
-; GFX10-NEXT:    v_add_nc_u32_e32 v1, v1, v2
-; GFX10-NEXT:    v_add_nc_u32_e32 v0, v0, v3
-; GFX10-NEXT:    s_and_b32 s5, 0xffff, s5
-; GFX10-NEXT:    s_lshl_b32 s8, s8, 16
-; GFX10-NEXT:    s_lshr_b32 s9, s2, 8
-; GFX10-NEXT:    v_mul_hi_u32 v1, s4, v1
-; GFX10-NEXT:    s_or_b32 s5, s5, s8
-; GFX10-NEXT:    s_lshr_b32 s8, s2, 16
-; GFX10-NEXT:    v_mul_hi_u32 v0, s5, v0
-; GFX10-NEXT:    s_and_b32 s9, s9, 0xff
-; GFX10-NEXT:    s_lshr_b32 s10, s2, 24
-; GFX10-NEXT:    s_lshr_b32 s11, s3, 8
-; GFX10-NEXT:    s_and_b32 s2, s2, 0xff
+; GFX10-NEXT:    s_or_b32 s3, s12, s3
 ; GFX10-NEXT:    v_mul_lo_u32 v1, v1, 24
-; GFX10-NEXT:    s_lshl_b32 s9, s9, 8
-; GFX10-NEXT:    s_and_b32 s8, s8, 0xff
 ; GFX10-NEXT:    v_mul_lo_u32 v0, v0, 24
-; GFX10-NEXT:    s_and_b32 s3, s3, 0xff
-; GFX10-NEXT:    s_or_b32 s2, s2, s9
-; GFX10-NEXT:    s_lshl_b32 s3, s3, 8
-; GFX10-NEXT:    s_and_b32 s2, 0xffff, s2
+; GFX10-NEXT:    s_and_b32 s7, s7, 0xff
+; GFX10-NEXT:    s_or_b32 s0, s0, s6
+; GFX10-NEXT:    s_and_b32 s3, 0xffff, s3
+; GFX10-NEXT:    s_and_b32 s7, 0xffff, s7
+; GFX10-NEXT:    s_and_b32 s0, 0xffff, s0
+; GFX10-NEXT:    s_and_b32 s1, 0xffff, s1
 ; GFX10-NEXT:    v_sub_nc_u32_e32 v1, s4, v1
-; GFX10-NEXT:    s_and_b32 s4, 0xffff, s8
-; GFX10-NEXT:    s_or_b32 s3, s10, s3
 ; GFX10-NEXT:    v_sub_nc_u32_e32 v0, s5, v0
-; GFX10-NEXT:    s_and_b32 s5, s11, 0xff
+; GFX10-NEXT:    s_and_b32 s4, s11, 0xff
+; GFX10-NEXT:    s_lshl_b32 s0, s0, 1
+; GFX10-NEXT:    s_and_b32 s4, 0xffff, s4
 ; GFX10-NEXT:    v_subrev_nc_u32_e32 v2, 24, v1
 ; GFX10-NEXT:    v_cmp_le_u32_e32 vcc_lo, 24, v1
 ; GFX10-NEXT:    s_lshl_b32 s4, s4, 16
-; GFX10-NEXT:    s_and_b32 s5, 0xffff, s5
-; GFX10-NEXT:    s_and_b32 s3, 0xffff, s3
-; GFX10-NEXT:    s_lshl_b32 s5, s5, 16
+; GFX10-NEXT:    s_lshl_b32 s1, s1, 1
+; GFX10-NEXT:    s_or_b32 s2, s2, s4
+; GFX10-NEXT:    s_and_b32 s4, s13, 0xff
 ; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc_lo
 ; GFX10-NEXT:    v_subrev_nc_u32_e32 v2, 24, v0
 ; GFX10-NEXT:    v_cmp_le_u32_e32 vcc_lo, 24, v0
-; GFX10-NEXT:    s_or_b32 s2, s2, s4
-; GFX10-NEXT:    s_and_b32 s0, 0xffff, s0
+; GFX10-NEXT:    s_and_b32 s4, 0xffff, s4
 ; GFX10-NEXT:    v_subrev_nc_u32_e32 v3, 24, v1
-; GFX10-NEXT:    s_and_b32 s6, 0xffff, s6
+; GFX10-NEXT:    s_lshl_b32 s4, s4, 16
 ; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc_lo
 ; GFX10-NEXT:    v_cmp_le_u32_e32 vcc_lo, 24, v1
-; GFX10-NEXT:    s_or_b32 s3, s3, s5
-; GFX10-NEXT:    s_and_b32 s1, 0xffff, s1
-; GFX10-NEXT:    s_and_b32 s7, 0xffff, s7
+; GFX10-NEXT:    s_or_b32 s3, s3, s4
+; GFX10-NEXT:    s_lshl_b32 s4, s7, 17
 ; GFX10-NEXT:    v_subrev_nc_u32_e32 v2, 24, v0
 ; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc_lo
 ; GFX10-NEXT:    v_cmp_le_u32_e32 vcc_lo, 24, v0
-; GFX10-NEXT:    s_lshl_b32 s4, s6, 17
-; GFX10-NEXT:    s_lshl_b32 s0, s0, 1
-; GFX10-NEXT:    s_lshl_b32 s1, s1, 1
+; GFX10-NEXT:    s_or_b32 s0, s4, s0
 ; GFX10-NEXT:    v_sub_nc_u32_e32 v3, 23, v1
 ; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc_lo
 ; GFX10-NEXT:    v_and_b32_e32 v1, 0xffffff, v1
-; GFX10-NEXT:    s_or_b32 s0, s4, s0
-; GFX10-NEXT:    v_and_b32_e32 v3, 0xffffff, v3
-; GFX10-NEXT:    v_sub_nc_u32_e32 v2, 23, v0
+; GFX10-NEXT:    v_and_b32_e32 v2, 0xffffff, v3
+; GFX10-NEXT:    v_sub_nc_u32_e32 v3, 23, v0
 ; GFX10-NEXT:    v_and_b32_e32 v0, 0xffffff, v0
 ; GFX10-NEXT:    v_lshrrev_b32_e64 v1, v1, s2
-; GFX10-NEXT:    s_lshl_b32 s2, s7, 17
-; GFX10-NEXT:    v_and_b32_e32 v2, 0xffffff, v2
+; GFX10-NEXT:    s_lshl_b32 s2, s8, 17
+; GFX10-NEXT:    v_and_b32_e32 v3, 0xffffff, v3
 ; GFX10-NEXT:    v_lshrrev_b32_e64 v0, v0, s3
-; GFX10-NEXT:    v_lshl_or_b32 v1, s0, v3, v1
+; GFX10-NEXT:    v_lshl_or_b32 v1, s0, v2, v1
 ; GFX10-NEXT:    s_or_b32 s0, s2, s1
-; GFX10-NEXT:    v_lshl_or_b32 v0, s0, v2, v0
-; GFX10-NEXT:    s_mov_b32 s0, 8
-; GFX10-NEXT:    v_lshlrev_b32_sdwa v2, s0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
-; GFX10-NEXT:    s_mov_b32 s0, 16
-; GFX10-NEXT:    v_and_b32_e32 v3, 0xff, v0
+; GFX10-NEXT:    v_mov_b32_e32 v2, 8
+; GFX10-NEXT:    v_lshl_or_b32 v0, s0, v3, v0
+; GFX10-NEXT:    v_mov_b32_e32 v3, 16
+; GFX10-NEXT:    v_lshlrev_b32_sdwa v2, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
+; GFX10-NEXT:    v_and_b32_e32 v4, 0xff, v0
+; GFX10-NEXT:    v_and_or_b32 v2, 0xff, v1, v2
+; GFX10-NEXT:    v_lshlrev_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 24, v4
 ; GFX10-NEXT:    v_bfe_u32 v4, v0, 8, 8
 ; GFX10-NEXT:    v_bfe_u32 v0, v0, 16, 8
-; GFX10-NEXT:    v_and_or_b32 v2, v1, 0xff, v2
-; GFX10-NEXT:    v_lshlrev_b32_sdwa v1, s0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 24, v3
-; GFX10-NEXT:    v_lshl_or_b32 v0, v0, 8, v4
 ; GFX10-NEXT:    v_or3_b32 v1, v2, v1, v3
-; GFX10-NEXT:    v_readfirstlane_b32 s1, v0
+; GFX10-NEXT:    v_lshl_or_b32 v0, v0, 8, v4
 ; GFX10-NEXT:    v_readfirstlane_b32 s0, v1
+; GFX10-NEXT:    v_readfirstlane_b32 s1, v0
 ; GFX10-NEXT:    ; return to shader part epilog
 ;
 ; GFX11-LABEL: s_fshr_v2i24:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    v_cvt_f32_ubyte0_e32 v0, 24
-; GFX11-NEXT:    s_lshr_b32 s6, s0, 8
-; GFX11-NEXT:    s_lshr_b32 s7, s0, 16
-; GFX11-NEXT:    s_and_b32 s6, s6, 0xff
-; GFX11-NEXT:    s_lshr_b32 s8, s0, 24
+; GFX11-NEXT:    s_lshr_b32 s14, s4, 8
+; GFX11-NEXT:    s_lshr_b32 s15, s4, 16
+; GFX11-NEXT:    s_and_b32 s14, s14, 0xff
+; GFX11-NEXT:    s_lshr_b32 s16, s4, 24
 ; GFX11-NEXT:    v_rcp_iflag_f32_e32 v0, v0
-; GFX11-NEXT:    s_and_b32 s0, s0, 0xff
-; GFX11-NEXT:    s_lshl_b32 s6, s6, 8
-; GFX11-NEXT:    s_lshr_b32 s9, s1, 8
-; GFX11-NEXT:    s_or_b32 s0, s0, s6
-; GFX11-NEXT:    s_and_b32 s6, s7, 0xff
-; GFX11-NEXT:    s_and_b32 s7, s9, 0xff
-; GFX11-NEXT:    s_lshr_b32 s9, s4, 8
-; GFX11-NEXT:    s_lshr_b32 s10, s4, 16
-; GFX11-NEXT:    s_and_b32 s9, s9, 0xff
-; GFX11-NEXT:    s_waitcnt_depctr 0xfff
-; GFX11-NEXT:    v_mul_f32_e32 v1, 0x4f7ffffe, v0
-; GFX11-NEXT:    s_and_b32 s11, s4, 0xff
-; GFX11-NEXT:    s_lshl_b32 s9, s9, 8
-; GFX11-NEXT:    s_and_b32 s10, s10, 0xff
-; GFX11-NEXT:    s_or_b32 s9, s11, s9
-; GFX11-NEXT:    v_cvt_u32_f32_e32 v1, v1
-; GFX11-NEXT:    s_and_b32 s10, 0xffff, s10
-; GFX11-NEXT:    s_and_b32 s9, 0xffff, s9
-; GFX11-NEXT:    s_lshl_b32 s10, s10, 16
-; GFX11-NEXT:    s_lshr_b32 s11, s5, 8
-; GFX11-NEXT:    v_mul_lo_u32 v2, 0xffffffe8, v1
+; GFX11-NEXT:    s_and_b32 s4, s4, 0xff
+; GFX11-NEXT:    s_and_b32 s15, s15, 0xff
+; GFX11-NEXT:    s_lshl_b32 s14, s14, 8
+; GFX11-NEXT:    s_and_b32 s15, 0xffff, s15
+; GFX11-NEXT:    s_or_b32 s4, s4, s14
+; GFX11-NEXT:    s_lshr_b32 s17, s5, 8
 ; GFX11-NEXT:    s_and_b32 s5, s5, 0xff
-; GFX11-NEXT:    s_lshr_b32 s4, s4, 24
-; GFX11-NEXT:    s_or_b32 s9, s9, s10
-; GFX11-NEXT:    s_lshl_b32 s5, s5, 8
-; GFX11-NEXT:    s_and_b32 s10, s11, 0xff
-; GFX11-NEXT:    s_or_b32 s4, s4, s5
-; GFX11-NEXT:    s_and_b32 s5, 0xffff, s10
-; GFX11-NEXT:    v_mul_hi_u32 v2, v1, v2
-; GFX11-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
+; GFX11-NEXT:    s_lshl_b32 s14, s15, 16
 ; GFX11-NEXT:    s_and_b32 s4, 0xffff, s4
-; GFX11-NEXT:    s_lshl_b32 s5, s5, 16
+; GFX11-NEXT:    s_waitcnt_depctr 0xfff
+; GFX11-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
+; GFX11-NEXT:    s_lshl_b32 s5, s5, 8
+; GFX11-NEXT:    s_and_b32 s15, s17, 0xff
+; GFX11-NEXT:    s_or_b32 s4, s4, s14
+; GFX11-NEXT:    s_or_b32 s5, s16, s5
+; GFX11-NEXT:    v_cvt_u32_f32_e32 v0, v0
+; GFX11-NEXT:    s_and_b32 s14, 0xffff, s15
+; GFX11-NEXT:    s_and_b32 s5, 0xffff, s5
+; GFX11-NEXT:    s_lshl_b32 s14, s14, 16
+; GFX11-NEXT:    s_lshr_b32 s10, s2, 8
+; GFX11-NEXT:    v_mul_lo_u32 v1, 0xffffffe8, v0
+; GFX11-NEXT:    s_or_b32 s5, s5, s14
+; GFX11-NEXT:    s_lshr_b32 s9, s1, 8
 ; GFX11-NEXT:    s_and_b32 s1, s1, 0xff
-; GFX11-NEXT:    s_or_b32 s4, s4, s5
+; GFX11-NEXT:    s_lshr_b32 s11, s2, 16
+; GFX11-NEXT:    s_and_b32 s10, s10, 0xff
+; GFX11-NEXT:    s_lshr_b32 s6, s0, 8
+; GFX11-NEXT:    s_lshr_b32 s8, s0, 24
+; GFX11-NEXT:    v_mul_hi_u32 v1, v0, v1
+; GFX11-NEXT:    s_lshr_b32 s12, s2, 24
+; GFX11-NEXT:    s_and_b32 s2, s2, 0xff
 ; GFX11-NEXT:    s_lshl_b32 s1, s1, 8
-; GFX11-NEXT:    s_lshr_b32 s10, s2, 16
-; GFX11-NEXT:    v_add_nc_u32_e32 v1, v1, v2
-; GFX11-NEXT:    v_cvt_u32_f32_e32 v0, v0
+; GFX11-NEXT:    s_and_b32 s9, s9, 0xff
+; GFX11-NEXT:    s_and_b32 s11, s11, 0xff
+; GFX11-NEXT:    s_and_b32 s6, s6, 0xff
 ; GFX11-NEXT:    s_or_b32 s1, s8, s1
-; GFX11-NEXT:    s_lshr_b32 s8, s2, 8
-; GFX11-NEXT:    s_lshr_b32 s5, s2, 24
-; GFX11-NEXT:    v_mul_hi_u32 v1, s9, v1
-; GFX11-NEXT:    v_mul_lo_u32 v3, 0xffffffe8, v0
-; GFX11-NEXT:    s_and_b32 s8, s8, 0xff
-; GFX11-NEXT:    s_and_b32 s2, s2, 0xff
-; GFX11-NEXT:    s_lshl_b32 s8, s8, 8
+; GFX11-NEXT:    v_add_nc_u32_e32 v0, v0, v1
+; GFX11-NEXT:    s_and_b32 s8, 0xffff, s9
+; GFX11-NEXT:    s_and_b32 s9, 0xffff, s11
+; GFX11-NEXT:    s_lshr_b32 s7, s0, 16
+; GFX11-NEXT:    s_and_b32 s0, s0, 0xff
+; GFX11-NEXT:    v_mul_hi_u32 v1, s4, v0
+; GFX11-NEXT:    v_mul_hi_u32 v0, s5, v0
+; GFX11-NEXT:    s_lshl_b32 s6, s6, 8
+; GFX11-NEXT:    s_and_b32 s7, s7, 0xff
+; GFX11-NEXT:    s_or_b32 s0, s0, s6
+; GFX11-NEXT:    s_and_b32 s7, 0xffff, s7
 ; GFX11-NEXT:    s_and_b32 s0, 0xffff, s0
-; GFX11-NEXT:    s_or_b32 s2, s2, s8
-; GFX11-NEXT:    s_and_b32 s8, s10, 0xff
+; GFX11-NEXT:    s_lshr_b32 s13, s3, 8
 ; GFX11-NEXT:    v_mul_lo_u32 v1, v1, 24
-; GFX11-NEXT:    v_mul_hi_u32 v2, v0, v3
-; GFX11-NEXT:    s_and_b32 s8, 0xffff, s8
-; GFX11-NEXT:    s_and_b32 s2, 0xffff, s2
-; GFX11-NEXT:    s_lshl_b32 s8, s8, 16
-; GFX11-NEXT:    s_and_b32 s6, 0xffff, s6
-; GFX11-NEXT:    s_or_b32 s2, s2, s8
+; GFX11-NEXT:    v_mul_lo_u32 v0, v0, 24
 ; GFX11-NEXT:    s_lshl_b32 s0, s0, 1
-; GFX11-NEXT:    v_sub_nc_u32_e32 v1, s9, v1
-; GFX11-NEXT:    v_add_nc_u32_e32 v0, v0, v2
-; GFX11-NEXT:    s_lshr_b32 s9, s3, 8
 ; GFX11-NEXT:    s_and_b32 s3, s3, 0xff
+; GFX11-NEXT:    s_and_b32 s13, s13, 0xff
+; GFX11-NEXT:    s_lshl_b32 s3, s3, 8
 ; GFX11-NEXT:    s_and_b32 s1, 0xffff, s1
+; GFX11-NEXT:    s_or_b32 s3, s12, s3
+; GFX11-NEXT:    v_sub_nc_u32_e32 v1, s4, v1
+; GFX11-NEXT:    v_sub_nc_u32_e32 v0, s5, v0
+; GFX11-NEXT:    s_lshl_b32 s4, s10, 8
+; GFX11-NEXT:    s_and_b32 s10, 0xffff, s13
+; GFX11-NEXT:    s_or_b32 s2, s2, s4
 ; GFX11-NEXT:    v_subrev_nc_u32_e32 v2, 24, v1
-; GFX11-NEXT:    v_mul_hi_u32 v0, s4, v0
 ; GFX11-NEXT:    v_cmp_le_u32_e32 vcc_lo, 24, v1
-; GFX11-NEXT:    s_lshl_b32 s3, s3, 8
-; GFX11-NEXT:    s_and_b32 s7, 0xffff, s7
-; GFX11-NEXT:    s_or_b32 s3, s5, s3
-; GFX11-NEXT:    s_lshl_b32 s5, s6, 17
+; GFX11-NEXT:    s_lshl_b32 s4, s9, 16
+; GFX11-NEXT:    s_and_b32 s2, 0xffff, s2
 ; GFX11-NEXT:    s_and_b32 s3, 0xffff, s3
-; GFX11-NEXT:    s_or_b32 s0, s5, s0
-; GFX11-NEXT:    v_mul_lo_u32 v0, v0, 24
-; GFX11-NEXT:    s_lshl_b32 s1, s1, 1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT:    v_sub_nc_u32_e32 v0, s4, v0
-; GFX11-NEXT:    s_and_b32 s4, s9, 0xff
-; GFX11-NEXT:    s_and_b32 s4, 0xffff, s4
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_subrev_nc_u32_e32 v3, 24, v0
-; GFX11-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc_lo
-; GFX11-NEXT:    s_lshl_b32 s4, s4, 16
-; GFX11-NEXT:    v_subrev_nc_u32_e32 v2, 24, v1
-; GFX11-NEXT:    v_cmp_le_u32_e32 vcc_lo, 24, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    s_or_b32 s2, s2, s4
 ; GFX11-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc_lo
+; GFX11-NEXT:    v_subrev_nc_u32_e32 v2, 24, v0
 ; GFX11-NEXT:    v_cmp_le_u32_e32 vcc_lo, 24, v0
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc_lo
-; GFX11-NEXT:    v_subrev_nc_u32_e32 v3, 24, v0
-; GFX11-NEXT:    v_cmp_le_u32_e32 vcc_lo, 24, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc_lo
-; GFX11-NEXT:    v_sub_nc_u32_e32 v2, 23, v1
+; GFX11-NEXT:    s_lshl_b32 s4, s7, 17
+; GFX11-NEXT:    s_lshl_b32 s5, s10, 16
+; GFX11-NEXT:    v_subrev_nc_u32_e32 v3, 24, v1
+; GFX11-NEXT:    s_or_b32 s0, s4, s0
+; GFX11-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc_lo
+; GFX11-NEXT:    v_cmp_le_u32_e32 vcc_lo, 24, v1
+; GFX11-NEXT:    s_lshl_b32 s1, s1, 1
+; GFX11-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc_lo
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_sub_nc_u32_e32 v3, 23, v1
 ; GFX11-NEXT:    v_and_b32_e32 v1, 0xffffff, v1
-; GFX11-NEXT:    v_sub_nc_u32_e32 v3, 23, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_and_b32_e32 v2, 0xffffff, v2
+; GFX11-NEXT:    v_subrev_nc_u32_e32 v2, 24, v0
+; GFX11-NEXT:    v_cmp_le_u32_e32 vcc_lo, 24, v0
 ; GFX11-NEXT:    v_lshrrev_b32_e64 v1, v1, s2
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xffffff, v0
-; GFX11-NEXT:    s_or_b32 s2, s3, s4
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    s_or_b32 s2, s3, s5
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc_lo
+; GFX11-NEXT:    v_and_b32_e32 v2, 0xffffff, v3
+; GFX11-NEXT:    v_sub_nc_u32_e32 v3, 23, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
 ; GFX11-NEXT:    v_lshl_or_b32 v1, s0, v2, v1
+; GFX11-NEXT:    s_lshl_b32 s0, s8, 17
+; GFX11-NEXT:    s_or_b32 s0, s0, s1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX11-NEXT:    v_and_b32_e32 v2, 0xffffff, v3
+; GFX11-NEXT:    v_bfe_u32 v3, v1, 16, 8
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xffffff, v0
 ; GFX11-NEXT:    v_lshrrev_b32_e64 v0, v0, s2
-; GFX11-NEXT:    s_lshl_b32 s0, s7, 17
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    s_or_b32 s0, s0, s1
-; GFX11-NEXT:    v_bfe_u32 v3, v1, 8, 8
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
 ; GFX11-NEXT:    v_lshl_or_b32 v0, s0, v2, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 8, v3
-; GFX11-NEXT:    v_bfe_u32 v3, v1, 16, 8
+; GFX11-NEXT:    v_bfe_u32 v2, v1, 8, 8
 ; GFX11-NEXT:    v_and_b32_e32 v4, 0xff, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_and_or_b32 v1, v1, 0xff, v2
-; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_lshlrev_b32_e32 v3, 24, v4
-; GFX11-NEXT:    v_bfe_u32 v4, v0, 8, 8
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
+; GFX11-NEXT:    v_lshlrev_b32_e32 v4, 24, v4
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_and_or_b32 v1, 0xff, v1, v2
+; GFX11-NEXT:    v_bfe_u32 v2, v0, 8, 8
 ; GFX11-NEXT:    v_bfe_u32 v0, v0, 16, 8
-; GFX11-NEXT:    v_or3_b32 v1, v1, v2, v3
+; GFX11-NEXT:    v_or3_b32 v1, v1, v3, v4
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_lshl_or_b32 v0, v0, 8, v4
+; GFX11-NEXT:    v_lshl_or_b32 v0, v0, 8, v2
 ; GFX11-NEXT:    v_readfirstlane_b32 s0, v1
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX11-NEXT:    v_readfirstlane_b32 s1, v0
@@ -2457,24 +2430,22 @@ define <2 x i24> @v_fshr_v2i24(<2 x i24> %lhs, <2 x i24> %rhs, <2 x i24> %amt) {
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_cvt_f32_ubyte0_e32 v6, 24
 ; GFX6-NEXT:    v_rcp_iflag_f32_e32 v6, v6
-; GFX6-NEXT:    v_mov_b32_e32 v8, 0xffffffe8
+; GFX6-NEXT:    v_mov_b32_e32 v7, 0xffffffe8
 ; GFX6-NEXT:    v_and_b32_e32 v4, 0xffffff, v4
 ; GFX6-NEXT:    v_and_b32_e32 v5, 0xffffff, v5
-; GFX6-NEXT:    v_mul_f32_e32 v7, 0x4f7ffffe, v6
-; GFX6-NEXT:    v_cvt_u32_f32_e32 v7, v7
 ; GFX6-NEXT:    v_mul_f32_e32 v6, 0x4f7ffffe, v6
 ; GFX6-NEXT:    v_cvt_u32_f32_e32 v6, v6
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
-; GFX6-NEXT:    v_mul_lo_u32 v9, v7, v8
 ; GFX6-NEXT:    v_and_b32_e32 v2, 0xffffff, v2
-; GFX6-NEXT:    v_mul_lo_u32 v8, v6, v8
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 1, v1
-; GFX6-NEXT:    v_mul_hi_u32 v9, v7, v9
+; GFX6-NEXT:    v_mul_lo_u32 v7, v6, v7
 ; GFX6-NEXT:    v_and_b32_e32 v3, 0xffffff, v3
-; GFX6-NEXT:    v_mul_hi_u32 v8, v6, v8
-; GFX6-NEXT:    v_add_i32_e32 v7, vcc, v7, v9
-; GFX6-NEXT:    v_mul_hi_u32 v7, v4, v7
+; GFX6-NEXT:    v_mul_hi_u32 v7, v6, v7
+; GFX6-NEXT:    v_add_i32_e32 v6, vcc, v6, v7
+; GFX6-NEXT:    v_mul_hi_u32 v7, v4, v6
+; GFX6-NEXT:    v_mul_hi_u32 v6, v5, v6
 ; GFX6-NEXT:    v_mul_lo_u32 v7, v7, 24
+; GFX6-NEXT:    v_mul_lo_u32 v6, v6, 24
 ; GFX6-NEXT:    v_sub_i32_e32 v4, vcc, v4, v7
 ; GFX6-NEXT:    v_subrev_i32_e32 v7, vcc, 24, v4
 ; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, 24, v4
@@ -2482,11 +2453,8 @@ define <2 x i24> @v_fshr_v2i24(<2 x i24> %lhs, <2 x i24> %rhs, <2 x i24> %amt) {
 ; GFX6-NEXT:    v_subrev_i32_e32 v7, vcc, 24, v4
 ; GFX6-NEXT:    v_cmp_le_u32_e32 vcc, 24, v4
 ; GFX6-NEXT:    v_cndmask_b32_e32 v4, v4, v7, vcc
-; GFX6-NEXT:    v_add_i32_e32 v6, vcc, v6, v8
-; GFX6-NEXT:    v_mul_hi_u32 v6, v5, v6
 ; GFX6-NEXT:    v_sub_i32_e32 v7, vcc, 23, v4
 ; GFX6-NEXT:    v_and_b32_e32 v7, 0xffffff, v7
-; GFX6-NEXT:    v_mul_lo_u32 v6, v6, 24
 ; GFX6-NEXT:    v_and_b32_e32 v4, 0xffffff, v4
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v0, v7, v0
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v2, v4, v2
@@ -2511,24 +2479,22 @@ define <2 x i24> @v_fshr_v2i24(<2 x i24> %lhs, <2 x i24> %rhs, <2 x i24> %amt) {
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX8-NEXT:    v_cvt_f32_ubyte0_e32 v6, 24
 ; GFX8-NEXT:    v_rcp_iflag_f32_e32 v6, v6
-; GFX8-NEXT:    v_mov_b32_e32 v8, 0xffffffe8
+; GFX8-NEXT:    v_mov_b32_e32 v7, 0xffffffe8
 ; GFX8-NEXT:    v_and_b32_e32 v4, 0xffffff, v4
 ; GFX8-NEXT:    v_and_b32_e32 v5, 0xffffff, v5
-; GFX8-NEXT:    v_mul_f32_e32 v7, 0x4f7ffffe, v6
-; GFX8-NEXT:    v_cvt_u32_f32_e32 v7, v7
 ; GFX8-NEXT:    v_mul_f32_e32 v6, 0x4f7ffffe, v6
 ; GFX8-NEXT:    v_cvt_u32_f32_e32 v6, v6
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
-; GFX8-NEXT:    v_mul_lo_u32 v9, v7, v8
 ; GFX8-NEXT:    v_and_b32_e32 v2, 0xffffff, v2
-; GFX8-NEXT:    v_mul_lo_u32 v8, v6, v8
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 1, v1
-; GFX8-NEXT:    v_mul_hi_u32 v9, v7, v9
+; GFX8-NEXT:    v_mul_lo_u32 v7, v6, v7
 ; GFX8-NEXT:    v_and_b32_e32 v3, 0xffffff, v3
-; GFX8-NEXT:    v_mul_hi_u32 v8, v6, v8
-; GFX8-NEXT:    v_add_u32_e32 v7, vcc, v7, v9
-; GFX8-NEXT:    v_mul_hi_u32 v7, v4, v7
+; GFX8-NEXT:    v_mul_hi_u32 v7, v6, v7
+; GFX8-NEXT:    v_add_u32_e32 v6, vcc, v6, v7
+; GFX8-NEXT:    v_mul_hi_u32 v7, v4, v6
+; GFX8-NEXT:    v_mul_hi_u32 v6, v5, v6
 ; GFX8-NEXT:    v_mul_lo_u32 v7, v7, 24
+; GFX8-NEXT:    v_mul_lo_u32 v6, v6, 24
 ; GFX8-NEXT:    v_sub_u32_e32 v4, vcc, v4, v7
 ; GFX8-NEXT:    v_subrev_u32_e32 v7, vcc, 24, v4
 ; GFX8-NEXT:    v_cmp_le_u32_e32 vcc, 24, v4
@@ -2536,11 +2502,8 @@ define <2 x i24> @v_fshr_v2i24(<2 x i24> %lhs, <2 x i24> %rhs, <2 x i24> %amt) {
 ; GFX8-NEXT:    v_subrev_u32_e32 v7, vcc, 24, v4
 ; GFX8-NEXT:    v_cmp_le_u32_e32 vcc, 24, v4
 ; GFX8-NEXT:    v_cndmask_b32_e32 v4, v4, v7, vcc
-; GFX8-NEXT:    v_add_u32_e32 v6, vcc, v6, v8
-; GFX8-NEXT:    v_mul_hi_u32 v6, v5, v6
 ; GFX8-NEXT:    v_sub_u32_e32 v7, vcc, 23, v4
 ; GFX8-NEXT:    v_and_b32_e32 v7, 0xffffff, v7
-; GFX8-NEXT:    v_mul_lo_u32 v6, v6, 24
 ; GFX8-NEXT:    v_and_b32_e32 v4, 0xffffff, v4
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v0, v7, v0
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v2, v4, v2
@@ -2565,43 +2528,38 @@ define <2 x i24> @v_fshr_v2i24(<2 x i24> %lhs, <2 x i24> %rhs, <2 x i24> %amt) {
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_cvt_f32_ubyte0_e32 v6, 24
 ; GFX9-NEXT:    v_rcp_iflag_f32_e32 v6, v6
-; GFX9-NEXT:    v_mov_b32_e32 v8, 0xffffffe8
+; GFX9-NEXT:    v_mov_b32_e32 v7, 0xffffffe8
 ; GFX9-NEXT:    v_and_b32_e32 v4, 0xffffff, v4
 ; GFX9-NEXT:    v_and_b32_e32 v5, 0xffffff, v5
-; GFX9-NEXT:    v_mul_f32_e32 v7, 0x4f7ffffe, v6
-; GFX9-NEXT:    v_cvt_u32_f32_e32 v7, v7
 ; GFX9-NEXT:    v_mul_f32_e32 v6, 0x4f7ffffe, v6
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v6, v6
 ; GFX9-NEXT:    v_and_b32_e32 v2, 0xffffff, v2
-; GFX9-NEXT:    v_mul_lo_u32 v9, v7, v8
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
-; GFX9-NEXT:    v_mul_lo_u32 v8, v6, v8
 ; GFX9-NEXT:    v_and_b32_e32 v3, 0xffffff, v3
-; GFX9-NEXT:    v_mul_hi_u32 v9, v7, v9
+; GFX9-NEXT:    v_mul_lo_u32 v7, v6, v7
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 1, v1
-; GFX9-NEXT:    v_mul_hi_u32 v8, v6, v8
-; GFX9-NEXT:    v_add_u32_e32 v7, v7, v9
-; GFX9-NEXT:    v_mul_hi_u32 v7, v4, v7
-; GFX9-NEXT:    v_add_u32_e32 v6, v6, v8
+; GFX9-NEXT:    v_mul_hi_u32 v7, v6, v7
+; GFX9-NEXT:    v_add_u32_e32 v6, v6, v7
+; GFX9-NEXT:    v_mul_hi_u32 v7, v4, v6
 ; GFX9-NEXT:    v_mul_hi_u32 v6, v5, v6
 ; GFX9-NEXT:    v_mul_lo_u32 v7, v7, 24
 ; GFX9-NEXT:    v_mul_lo_u32 v6, v6, 24
 ; GFX9-NEXT:    v_sub_u32_e32 v4, v4, v7
-; GFX9-NEXT:    v_subrev_u32_e32 v7, 24, v4
+; GFX9-NEXT:    v_sub_u32_e32 v5, v5, v6
+; GFX9-NEXT:    v_subrev_u32_e32 v6, 24, v4
 ; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, 24, v4
-; GFX9-NEXT:    v_cndmask_b32_e32 v4, v4, v7, vcc
-; GFX9-NEXT:    v_subrev_u32_e32 v7, 24, v4
+; GFX9-NEXT:    v_cndmask_b32_e32 v4, v4, v6, vcc
+; GFX9-NEXT:    v_subrev_u32_e32 v6, 24, v4
 ; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, 24, v4
-; GFX9-NEXT:    v_cndmask_b32_e32 v4, v4, v7, vcc
-; GFX9-NEXT:    v_sub_u32_e32 v7, 23, v4
+; GFX9-NEXT:    v_cndmask_b32_e32 v4, v4, v6, vcc
+; GFX9-NEXT:    v_sub_u32_e32 v6, 23, v4
 ; GFX9-NEXT:    v_and_b32_e32 v4, 0xffffff, v4
-; GFX9-NEXT:    v_and_b32_e32 v7, 0xffffff, v7
+; GFX9-NEXT:    v_and_b32_e32 v6, 0xffffff, v6
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v2, v4, v2
-; GFX9-NEXT:    v_lshl_or_b32 v0, v0, v7, v2
-; GFX9-NEXT:    v_sub_u32_e32 v2, v5, v6
-; GFX9-NEXT:    v_subrev_u32_e32 v4, 24, v2
-; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, 24, v2
-; GFX9-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
+; GFX9-NEXT:    v_lshl_or_b32 v0, v0, v6, v2
+; GFX9-NEXT:    v_subrev_u32_e32 v2, 24, v5
+; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, 24, v5
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, v5, v2, vcc
 ; GFX9-NEXT:    v_subrev_u32_e32 v4, 24, v2
 ; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, 24, v2
 ; GFX9-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
@@ -2623,17 +2581,12 @@ define <2 x i24> @v_fshr_v2i24(<2 x i24> %lhs, <2 x i24> %rhs, <2 x i24> %amt) {
 ; GFX10-NEXT:    v_rcp_iflag_f32_e32 v6, v6
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 1, v1
-; GFX10-NEXT:    v_mul_f32_e32 v7, 0x4f7ffffe, v6
 ; GFX10-NEXT:    v_mul_f32_e32 v6, 0x4f7ffffe, v6
-; GFX10-NEXT:    v_cvt_u32_f32_e32 v7, v7
 ; GFX10-NEXT:    v_cvt_u32_f32_e32 v6, v6
-; GFX10-NEXT:    v_mul_lo_u32 v8, 0xffffffe8, v7
-; GFX10-NEXT:    v_mul_lo_u32 v9, 0xffffffe8, v6
-; GFX10-NEXT:    v_mul_hi_u32 v8, v7, v8
-; GFX10-NEXT:    v_mul_hi_u32 v9, v6, v9
-; GFX10-NEXT:    v_add_nc_u32_e32 v7, v7, v8
-; GFX10-NEXT:    v_add_nc_u32_e32 v6, v6, v9
-; GFX10-NEXT:    v_mul_hi_u32 v7, v4, v7
+; GFX10-NEXT:    v_mul_lo_u32 v7, 0xffffffe8, v6
+; GFX10-NEXT:    v_mul_hi_u32 v7, v6, v7
+; GFX10-NEXT:    v_add_nc_u32_e32 v6, v6, v7
+; GFX10-NEXT:    v_mul_hi_u32 v7, v4, v6
 ; GFX10-NEXT:    v_mul_hi_u32 v6, v5, v6
 ; GFX10-NEXT:    v_mul_lo_u32 v7, v7, 24
 ; GFX10-NEXT:    v_mul_lo_u32 v6, v6, 24
@@ -2667,52 +2620,47 @@ define <2 x i24> @v_fshr_v2i24(<2 x i24> %lhs, <2 x i24> %rhs, <2 x i24> %amt) {
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_cvt_f32_ubyte0_e32 v6, 24
-; GFX11-NEXT:    v_and_b32_e32 v5, 0xffffff, v5
+; GFX11-NEXT:    v_and_b32_e32 v4, 0xffffff, v4
 ; GFX11-NEXT:    v_and_b32_e32 v2, 0xffffff, v2
+; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 1, v1
-; GFX11-NEXT:    v_and_b32_e32 v3, 0xffffff, v3
 ; GFX11-NEXT:    v_rcp_iflag_f32_e32 v6, v6
+; GFX11-NEXT:    v_and_b32_e32 v3, 0xffffff, v3
 ; GFX11-NEXT:    s_waitcnt_depctr 0xfff
-; GFX11-NEXT:    v_mul_f32_e32 v7, 0x4f7ffffe, v6
 ; GFX11-NEXT:    v_mul_f32_e32 v6, 0x4f7ffffe, v6
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_cvt_u32_f32_e32 v6, v6
-; GFX11-NEXT:    v_mul_lo_u32 v9, 0xffffffe8, v6
+; GFX11-NEXT:    v_mul_lo_u32 v7, 0xffffffe8, v6
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_mul_hi_u32 v9, v6, v9
-; GFX11-NEXT:    v_add_nc_u32_e32 v6, v6, v9
-; GFX11-NEXT:    v_cvt_u32_f32_e32 v7, v7
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_mul_hi_u32 v6, v5, v6
-; GFX11-NEXT:    v_mul_lo_u32 v8, 0xffffffe8, v7
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_mul_lo_u32 v6, v6, 24
-; GFX11-NEXT:    v_mul_hi_u32 v8, v7, v8
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_sub_nc_u32_e32 v5, v5, v6
-; GFX11-NEXT:    v_add_nc_u32_e32 v7, v7, v8
-; GFX11-NEXT:    v_and_b32_e32 v4, 0xffffff, v4
+; GFX11-NEXT:    v_mul_hi_u32 v7, v6, v7
+; GFX11-NEXT:    v_add_nc_u32_e32 v6, v6, v7
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_mul_hi_u32 v7, v4, v7
+; GFX11-NEXT:    v_mul_hi_u32 v7, v4, v6
 ; GFX11-NEXT:    v_mul_lo_u32 v7, v7, 24
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_sub_nc_u32_e32 v4, v4, v7
-; GFX11-NEXT:    v_subrev_nc_u32_e32 v7, 24, v5
-; GFX11-NEXT:    v_subrev_nc_u32_e32 v6, 24, v4
 ; GFX11-NEXT:    v_cmp_le_u32_e32 vcc_lo, 24, v4
+; GFX11-NEXT:    v_and_b32_e32 v5, 0xffffff, v5
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_mul_hi_u32 v6, v5, v6
+; GFX11-NEXT:    v_mul_lo_u32 v6, v6, 24
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_sub_nc_u32_e32 v5, v5, v6
+; GFX11-NEXT:    v_subrev_nc_u32_e32 v6, 24, v4
+; GFX11-NEXT:    v_subrev_nc_u32_e32 v7, 24, v5
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
 ; GFX11-NEXT:    v_cndmask_b32_e32 v4, v4, v6, vcc_lo
 ; GFX11-NEXT:    v_cmp_le_u32_e32 vcc_lo, 24, v5
 ; GFX11-NEXT:    v_subrev_nc_u32_e32 v6, 24, v4
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2)
 ; GFX11-NEXT:    v_cndmask_b32_e32 v5, v5, v7, vcc_lo
 ; GFX11-NEXT:    v_cmp_le_u32_e32 vcc_lo, 24, v4
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4)
 ; GFX11-NEXT:    v_subrev_nc_u32_e32 v7, 24, v5
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2)
 ; GFX11-NEXT:    v_cndmask_b32_e32 v4, v4, v6, vcc_lo
 ; GFX11-NEXT:    v_cmp_le_u32_e32 vcc_lo, 24, v5
-; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
 ; GFX11-NEXT:    v_sub_nc_u32_e32 v6, 23, v4
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX11-NEXT:    v_dual_cndmask_b32 v5, v5, v7 :: v_dual_and_b32 v4, 0xffffff, v4
 ; GFX11-NEXT:    v_and_b32_e32 v6, 0xffffff, v6
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_4)
@@ -5056,21 +5004,21 @@ define <4 x half> @v_fshr_v4i16(<4 x i16> %lhs, <4 x i16> %rhs, <4 x i16> %amt)
 ; GFX8-NEXT:    v_xor_b32_e32 v4, -1, v4
 ; GFX8-NEXT:    v_or_b32_e32 v0, v0, v9
 ; GFX8-NEXT:    v_lshlrev_b16_e32 v9, 1, v2
-; GFX8-NEXT:    v_lshlrev_b16_sdwa v2, v7, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX8-NEXT:    v_lshrrev_b32_e32 v7, 16, v4
-; GFX8-NEXT:    v_and_b32_e32 v10, 15, v4
+; GFX8-NEXT:    v_lshrrev_b32_e32 v10, 16, v4
+; GFX8-NEXT:    v_and_b32_e32 v11, 15, v4
 ; GFX8-NEXT:    v_xor_b32_e32 v4, -1, v4
 ; GFX8-NEXT:    v_and_b32_e32 v4, 15, v4
 ; GFX8-NEXT:    v_lshrrev_b16_e32 v9, 1, v9
-; GFX8-NEXT:    v_lshlrev_b16_e32 v6, v10, v6
+; GFX8-NEXT:    v_lshlrev_b16_sdwa v2, v7, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX8-NEXT:    v_lshlrev_b16_e32 v6, v11, v6
 ; GFX8-NEXT:    v_lshrrev_b16_e32 v4, v4, v9
+; GFX8-NEXT:    v_xor_b32_e32 v9, -1, v10
 ; GFX8-NEXT:    v_or_b32_e32 v4, v6, v4
-; GFX8-NEXT:    v_and_b32_e32 v6, 15, v7
-; GFX8-NEXT:    v_xor_b32_e32 v7, -1, v7
-; GFX8-NEXT:    v_and_b32_e32 v7, 15, v7
+; GFX8-NEXT:    v_and_b32_e32 v6, 15, v10
+; GFX8-NEXT:    v_and_b32_e32 v9, 15, v9
 ; GFX8-NEXT:    v_lshrrev_b16_e32 v2, 1, v2
 ; GFX8-NEXT:    v_lshlrev_b16_e32 v0, v6, v0
-; GFX8-NEXT:    v_lshrrev_b16_e32 v2, v7, v2
+; GFX8-NEXT:    v_lshrrev_b16_e32 v2, v9, v2
 ; GFX8-NEXT:    v_or_b32_e32 v0, v0, v2
 ; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
@@ -5078,23 +5026,22 @@ define <4 x half> @v_fshr_v4i16(<4 x i16> %lhs, <4 x i16> %rhs, <4 x i16> %amt)
 ; GFX8-NEXT:    v_lshlrev_b16_e32 v2, 1, v1
 ; GFX8-NEXT:    v_lshrrev_b16_e32 v4, 15, v3
 ; GFX8-NEXT:    v_or_b32_e32 v2, v2, v4
-; GFX8-NEXT:    v_mov_b32_e32 v4, 1
-; GFX8-NEXT:    v_lshlrev_b16_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX8-NEXT:    v_lshrrev_b16_sdwa v6, v8, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX8-NEXT:    v_or_b32_e32 v1, v1, v6
-; GFX8-NEXT:    v_lshlrev_b16_e32 v6, 1, v3
-; GFX8-NEXT:    v_lshlrev_b16_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX8-NEXT:    v_xor_b32_e32 v4, -1, v5
-; GFX8-NEXT:    v_lshrrev_b32_e32 v5, 16, v4
-; GFX8-NEXT:    v_and_b32_e32 v7, 15, v4
-; GFX8-NEXT:    v_xor_b32_e32 v4, -1, v4
-; GFX8-NEXT:    v_and_b32_e32 v4, 15, v4
-; GFX8-NEXT:    v_lshrrev_b16_e32 v6, 1, v6
+; GFX8-NEXT:    v_lshlrev_b16_sdwa v1, v7, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX8-NEXT:    v_lshrrev_b16_sdwa v4, v8, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX8-NEXT:    v_xor_b32_e32 v5, -1, v5
+; GFX8-NEXT:    v_or_b32_e32 v1, v1, v4
+; GFX8-NEXT:    v_lshlrev_b16_e32 v4, 1, v3
+; GFX8-NEXT:    v_lshlrev_b16_sdwa v3, v7, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX8-NEXT:    v_lshrrev_b32_e32 v6, 16, v5
+; GFX8-NEXT:    v_and_b32_e32 v7, 15, v5
+; GFX8-NEXT:    v_xor_b32_e32 v5, -1, v5
+; GFX8-NEXT:    v_and_b32_e32 v5, 15, v5
+; GFX8-NEXT:    v_lshrrev_b16_e32 v4, 1, v4
 ; GFX8-NEXT:    v_lshlrev_b16_e32 v2, v7, v2
-; GFX8-NEXT:    v_lshrrev_b16_e32 v4, v4, v6
+; GFX8-NEXT:    v_lshrrev_b16_e32 v4, v5, v4
+; GFX8-NEXT:    v_xor_b32_e32 v5, -1, v6
 ; GFX8-NEXT:    v_or_b32_e32 v2, v2, v4
-; GFX8-NEXT:    v_and_b32_e32 v4, 15, v5
-; GFX8-NEXT:    v_xor_b32_e32 v5, -1, v5
+; GFX8-NEXT:    v_and_b32_e32 v4, 15, v6
 ; GFX8-NEXT:    v_and_b32_e32 v5, 15, v5
 ; GFX8-NEXT:    v_lshrrev_b16_e32 v3, 1, v3
 ; GFX8-NEXT:    v_lshlrev_b16_e32 v1, v4, v1
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i16.ll
index 64c3cd4e8c067f5..30b66e1fdd3405f 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i16.ll
@@ -287,10 +287,10 @@ define amdgpu_ps void @insertelement_s_v2i16_s_v(ptr addrspace(4) inreg %ptr, i1
 ; GFX9-NEXT:    s_load_dword s0, s[2:3], 0x0
 ; GFX9-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
-; GFX9-NEXT:    s_mov_b32 s1, 0xffff
-; GFX9-NEXT:    s_and_b32 s2, s4, 0xffff
-; GFX9-NEXT:    v_lshlrev_b32_e64 v2, v0, s2
-; GFX9-NEXT:    v_lshlrev_b32_e64 v0, v0, s1
+; GFX9-NEXT:    s_and_b32 s1, s4, 0xffff
+; GFX9-NEXT:    v_mov_b32_e32 v1, 0xffff
+; GFX9-NEXT:    v_lshlrev_b32_e64 v2, v0, s1
+; GFX9-NEXT:    v_lshlrev_b32_e32 v0, v0, v1
 ; GFX9-NEXT:    v_not_b32_e32 v3, v0
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
@@ -304,10 +304,10 @@ define amdgpu_ps void @insertelement_s_v2i16_s_v(ptr addrspace(4) inreg %ptr, i1
 ; GFX8-NEXT:    s_load_dword s0, s[2:3], 0x0
 ; GFX8-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
-; GFX8-NEXT:    s_mov_b32 s1, 0xffff
-; GFX8-NEXT:    s_and_b32 s2, s4, 0xffff
-; GFX8-NEXT:    v_lshlrev_b32_e64 v2, v0, s2
-; GFX8-NEXT:    v_lshlrev_b32_e64 v0, v0, s1
+; GFX8-NEXT:    s_and_b32 s1, s4, 0xffff
+; GFX8-NEXT:    v_mov_b32_e32 v1, 0xffff
+; GFX8-NEXT:    v_lshlrev_b32_e64 v2, v0, s1
+; GFX8-NEXT:    v_lshlrev_b32_e32 v0, v0, v1
 ; GFX8-NEXT:    v_not_b32_e32 v0, v0
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    v_and_b32_e32 v3, s0, v0
@@ -382,14 +382,14 @@ define amdgpu_ps void @insertelement_s_v2i16_v_v(ptr addrspace(4) inreg %ptr, i1
 ; GFX9-NEXT:    s_load_dword s0, s[2:3], 0x0
 ; GFX9-NEXT:    v_and_b32_e32 v1, 1, v1
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 4, v1
-; GFX9-NEXT:    s_mov_b32 s1, 0xffff
-; GFX9-NEXT:    v_lshlrev_b32_sdwa v2, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
-; GFX9-NEXT:    v_lshlrev_b32_e64 v0, v1, s1
-; GFX9-NEXT:    v_not_b32_e32 v3, v0
+; GFX9-NEXT:    v_mov_b32_e32 v2, 0xffff
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v3, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; GFX9-NEXT:    v_lshlrev_b32_e32 v0, v1, v2
+; GFX9-NEXT:    v_not_b32_e32 v2, v0
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_and_or_b32 v2, s0, v3, v2
+; GFX9-NEXT:    v_and_or_b32 v2, s0, v2, v3
 ; GFX9-NEXT:    global_store_dword v[0:1], v2, off
 ; GFX9-NEXT:    s_endpgm
 ;
@@ -398,15 +398,15 @@ define amdgpu_ps void @insertelement_s_v2i16_v_v(ptr addrspace(4) inreg %ptr, i1
 ; GFX8-NEXT:    s_load_dword s0, s[2:3], 0x0
 ; GFX8-NEXT:    v_and_b32_e32 v1, 1, v1
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 4, v1
-; GFX8-NEXT:    s_mov_b32 s1, 0xffff
-; GFX8-NEXT:    v_lshlrev_b32_sdwa v2, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
-; GFX8-NEXT:    v_lshlrev_b32_e64 v0, v1, s1
+; GFX8-NEXT:    v_mov_b32_e32 v2, 0xffff
+; GFX8-NEXT:    v_lshlrev_b32_sdwa v3, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; GFX8-NEXT:    v_lshlrev_b32_e32 v0, v1, v2
 ; GFX8-NEXT:    v_not_b32_e32 v0, v0
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    v_and_b32_e32 v3, s0, v0
+; GFX8-NEXT:    v_and_b32_e32 v2, s0, v0
 ; GFX8-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX8-NEXT:    v_mov_b32_e32 v1, 0
-; GFX8-NEXT:    v_or_b32_e32 v2, v3, v2
+; GFX8-NEXT:    v_or_b32_e32 v2, v2, v3
 ; GFX8-NEXT:    flat_store_dword v[0:1], v2
 ; GFX8-NEXT:    s_endpgm
 ;
@@ -474,10 +474,10 @@ define amdgpu_ps void @insertelement_v_v2i16_s_v(ptr addrspace(1) %ptr, i16 inre
 ; GFX9-NEXT:    global_load_dword v3, v[0:1], off
 ; GFX9-NEXT:    v_and_b32_e32 v0, 1, v2
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
-; GFX9-NEXT:    s_mov_b32 s0, 0xffff
-; GFX9-NEXT:    s_and_b32 s1, s2, 0xffff
-; GFX9-NEXT:    v_lshlrev_b32_e64 v2, v0, s1
-; GFX9-NEXT:    v_lshlrev_b32_e64 v0, v0, s0
+; GFX9-NEXT:    s_and_b32 s0, s2, 0xffff
+; GFX9-NEXT:    v_mov_b32_e32 v1, 0xffff
+; GFX9-NEXT:    v_lshlrev_b32_e64 v2, v0, s0
+; GFX9-NEXT:    v_lshlrev_b32_e32 v0, v0, v1
 ; GFX9-NEXT:    v_not_b32_e32 v4, v0
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
@@ -489,18 +489,18 @@ define amdgpu_ps void @insertelement_v_v2i16_s_v(ptr addrspace(1) %ptr, i16 inre
 ; GFX8-LABEL: insertelement_v_v2i16_s_v:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    flat_load_dword v0, v[0:1]
-; GFX8-NEXT:    v_and_b32_e32 v1, 1, v2
-; GFX8-NEXT:    s_mov_b32 s0, 0xffff
-; GFX8-NEXT:    s_and_b32 s1, s2, 0xffff
-; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 4, v1
-; GFX8-NEXT:    v_lshlrev_b32_e64 v2, v1, s1
-; GFX8-NEXT:    v_lshlrev_b32_e64 v1, v1, s0
+; GFX8-NEXT:    v_and_b32_e32 v2, 1, v2
+; GFX8-NEXT:    v_mov_b32_e32 v1, 0xffff
+; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 4, v2
+; GFX8-NEXT:    v_lshlrev_b32_e32 v1, v2, v1
+; GFX8-NEXT:    s_and_b32 s0, s2, 0xffff
 ; GFX8-NEXT:    v_not_b32_e32 v1, v1
+; GFX8-NEXT:    v_lshlrev_b32_e64 v3, v2, s0
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    v_and_b32_e32 v3, v0, v1
+; GFX8-NEXT:    v_and_b32_e32 v2, v0, v1
 ; GFX8-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX8-NEXT:    v_mov_b32_e32 v1, 0
-; GFX8-NEXT:    v_or_b32_e32 v2, v3, v2
+; GFX8-NEXT:    v_or_b32_e32 v2, v2, v3
 ; GFX8-NEXT:    flat_store_dword v[0:1], v2
 ; GFX8-NEXT:    s_endpgm
 ;
@@ -664,9 +664,9 @@ define amdgpu_ps void @insertelement_v_v2i16_v_v(ptr addrspace(1) %ptr, i16 %val
 ; GFX9-NEXT:    global_load_dword v4, v[0:1], off
 ; GFX9-NEXT:    v_and_b32_e32 v0, 1, v3
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
-; GFX9-NEXT:    s_mov_b32 s0, 0xffff
+; GFX9-NEXT:    v_mov_b32_e32 v1, 0xffff
 ; GFX9-NEXT:    v_lshlrev_b32_sdwa v2, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
-; GFX9-NEXT:    v_lshlrev_b32_e64 v0, v0, s0
+; GFX9-NEXT:    v_lshlrev_b32_e32 v0, v0, v1
 ; GFX9-NEXT:    v_not_b32_e32 v3, v0
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
@@ -678,12 +678,12 @@ define amdgpu_ps void @insertelement_v_v2i16_v_v(ptr addrspace(1) %ptr, i16 %val
 ; GFX8-LABEL: insertelement_v_v2i16_v_v:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    flat_load_dword v0, v[0:1]
-; GFX8-NEXT:    v_and_b32_e32 v1, 1, v3
-; GFX8-NEXT:    s_mov_b32 s0, 0xffff
-; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 4, v1
-; GFX8-NEXT:    v_lshlrev_b32_sdwa v2, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
-; GFX8-NEXT:    v_lshlrev_b32_e64 v1, v1, s0
+; GFX8-NEXT:    v_and_b32_e32 v3, 1, v3
+; GFX8-NEXT:    v_mov_b32_e32 v1, 0xffff
+; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 4, v3
+; GFX8-NEXT:    v_lshlrev_b32_e32 v1, v3, v1
 ; GFX8-NEXT:    v_not_b32_e32 v1, v1
+; GFX8-NEXT:    v_lshlrev_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    v_and_b32_e32 v3, v0, v1
 ; GFX8-NEXT:    v_mov_b32_e32 v0, 0
@@ -1078,11 +1078,11 @@ define amdgpu_ps void @insertelement_s_v4i16_s_v(ptr addrspace(4) inreg %ptr, i1
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX9-NEXT:    v_mov_b32_e32 v3, s1
-; GFX9-NEXT:    s_mov_b32 s2, 0xffff
-; GFX9-NEXT:    s_and_b32 s3, s4, 0xffff
+; GFX9-NEXT:    s_and_b32 s2, s4, 0xffff
+; GFX9-NEXT:    v_mov_b32_e32 v4, 0xffff
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
-; GFX9-NEXT:    v_lshlrev_b32_e64 v3, v0, s3
-; GFX9-NEXT:    v_lshlrev_b32_e64 v0, v0, s2
+; GFX9-NEXT:    v_lshlrev_b32_e64 v3, v0, s2
+; GFX9-NEXT:    v_lshlrev_b32_e32 v0, v0, v4
 ; GFX9-NEXT:    v_not_b32_e32 v0, v0
 ; GFX9-NEXT:    v_and_or_b32 v4, v1, v0, v3
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s0
@@ -1105,11 +1105,11 @@ define amdgpu_ps void @insertelement_s_v4i16_s_v(ptr addrspace(4) inreg %ptr, i1
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX8-NEXT:    v_mov_b32_e32 v3, s1
-; GFX8-NEXT:    s_mov_b32 s2, 0xffff
-; GFX8-NEXT:    s_and_b32 s3, s4, 0xffff
+; GFX8-NEXT:    s_and_b32 s2, s4, 0xffff
+; GFX8-NEXT:    v_mov_b32_e32 v4, 0xffff
 ; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e64 v3, v0, s3
-; GFX8-NEXT:    v_lshlrev_b32_e64 v0, v0, s2
+; GFX8-NEXT:    v_lshlrev_b32_e64 v3, v0, s2
+; GFX8-NEXT:    v_lshlrev_b32_e32 v0, v0, v4
 ; GFX8-NEXT:    v_not_b32_e32 v0, v0
 ; GFX8-NEXT:    v_and_b32_e32 v0, v1, v0
 ; GFX8-NEXT:    v_or_b32_e32 v4, v0, v3
@@ -1214,16 +1214,16 @@ define amdgpu_ps void @insertelement_s_v4i16_v_v(ptr addrspace(4) inreg %ptr, i1
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 1, v1
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v2
 ; GFX9-NEXT:    v_and_b32_e32 v1, 1, v1
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 4, v1
-; GFX9-NEXT:    s_mov_b32 s2, 0xffff
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v3, s0
 ; GFX9-NEXT:    v_mov_b32_e32 v4, s1
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v2
-; GFX9-NEXT:    v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
-; GFX9-NEXT:    v_lshlrev_b32_e64 v1, v1, s2
 ; GFX9-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
+; GFX9-NEXT:    v_mov_b32_e32 v4, 0xffff
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; GFX9-NEXT:    v_lshlrev_b32_e32 v1, v1, v4
 ; GFX9-NEXT:    v_not_b32_e32 v1, v1
 ; GFX9-NEXT:    v_and_or_b32 v4, v3, v1, v0
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s0
@@ -1240,16 +1240,16 @@ define amdgpu_ps void @insertelement_s_v4i16_v_v(ptr addrspace(4) inreg %ptr, i1
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v2, 1, v1
+; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v2
 ; GFX8-NEXT:    v_and_b32_e32 v1, 1, v1
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 4, v1
-; GFX8-NEXT:    s_mov_b32 s2, 0xffff
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    v_mov_b32_e32 v3, s0
 ; GFX8-NEXT:    v_mov_b32_e32 v4, s1
-; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v2
-; GFX8-NEXT:    v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
-; GFX8-NEXT:    v_lshlrev_b32_e64 v1, v1, s2
 ; GFX8-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
+; GFX8-NEXT:    v_mov_b32_e32 v4, 0xffff
+; GFX8-NEXT:    v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; GFX8-NEXT:    v_lshlrev_b32_e32 v1, v1, v4
 ; GFX8-NEXT:    v_not_b32_e32 v1, v1
 ; GFX8-NEXT:    v_and_b32_e32 v1, v3, v1
 ; GFX8-NEXT:    v_or_b32_e32 v4, v1, v0
@@ -1352,21 +1352,21 @@ define amdgpu_ps void @insertelement_v_v4i16_s_v(ptr addrspace(1) %ptr, i16 inre
 ; GFX9-LABEL: insertelement_v_v4i16_s_v:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
-; GFX9-NEXT:    v_lshrrev_b32_e32 v5, 1, v2
+; GFX9-NEXT:    v_lshrrev_b32_e32 v6, 1, v2
 ; GFX9-NEXT:    v_and_b32_e32 v2, 1, v2
-; GFX9-NEXT:    s_mov_b32 s0, 0xffff
-; GFX9-NEXT:    s_and_b32 s1, s2, 0xffff
+; GFX9-NEXT:    v_mov_b32_e32 v5, 0xffff
+; GFX9-NEXT:    s_and_b32 s0, s2, 0xffff
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 4, v2
-; GFX9-NEXT:    v_lshlrev_b32_e64 v6, v2, s1
-; GFX9-NEXT:    v_lshlrev_b32_e64 v2, v2, s0
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v5
+; GFX9-NEXT:    v_lshlrev_b32_e64 v7, v2, s0
+; GFX9-NEXT:    v_lshlrev_b32_e32 v2, v2, v5
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v6
 ; GFX9-NEXT:    v_not_b32_e32 v2, v2
 ; GFX9-NEXT:    v_mov_b32_e32 v3, 0
-; GFX9-NEXT:    v_cmp_eq_u32_e64 s[0:1], 0, v5
+; GFX9-NEXT:    v_cmp_eq_u32_e64 s[0:1], 0, v6
 ; GFX9-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_cndmask_b32_e32 v7, v0, v1, vcc
-; GFX9-NEXT:    v_and_or_b32 v2, v7, v2, v6
+; GFX9-NEXT:    v_cndmask_b32_e32 v5, v0, v1, vcc
+; GFX9-NEXT:    v_and_or_b32 v2, v5, v2, v7
 ; GFX9-NEXT:    v_cndmask_b32_e64 v0, v0, v2, s[0:1]
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
 ; GFX9-NEXT:    global_store_dwordx2 v[3:4], v[0:1], off
@@ -1375,22 +1375,22 @@ define amdgpu_ps void @insertelement_v_v4i16_s_v(ptr addrspace(1) %ptr, i16 inre
 ; GFX8-LABEL: insertelement_v_v4i16_s_v:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
-; GFX8-NEXT:    v_lshrrev_b32_e32 v5, 1, v2
+; GFX8-NEXT:    v_lshrrev_b32_e32 v6, 1, v2
 ; GFX8-NEXT:    v_and_b32_e32 v2, 1, v2
-; GFX8-NEXT:    s_mov_b32 s0, 0xffff
-; GFX8-NEXT:    s_and_b32 s1, s2, 0xffff
+; GFX8-NEXT:    v_mov_b32_e32 v5, 0xffff
+; GFX8-NEXT:    s_and_b32 s0, s2, 0xffff
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 4, v2
-; GFX8-NEXT:    v_lshlrev_b32_e64 v6, v2, s1
-; GFX8-NEXT:    v_lshlrev_b32_e64 v2, v2, s0
-; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v5
+; GFX8-NEXT:    v_lshlrev_b32_e64 v7, v2, s0
+; GFX8-NEXT:    v_lshlrev_b32_e32 v2, v2, v5
+; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v6
 ; GFX8-NEXT:    v_not_b32_e32 v2, v2
 ; GFX8-NEXT:    v_mov_b32_e32 v3, 0
-; GFX8-NEXT:    v_cmp_eq_u32_e64 s[0:1], 0, v5
+; GFX8-NEXT:    v_cmp_eq_u32_e64 s[0:1], 0, v6
 ; GFX8-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    v_cndmask_b32_e32 v7, v0, v1, vcc
-; GFX8-NEXT:    v_and_b32_e32 v2, v7, v2
-; GFX8-NEXT:    v_or_b32_e32 v2, v2, v6
+; GFX8-NEXT:    v_cndmask_b32_e32 v5, v0, v1, vcc
+; GFX8-NEXT:    v_and_b32_e32 v2, v5, v2
+; GFX8-NEXT:    v_or_b32_e32 v2, v2, v7
 ; GFX8-NEXT:    v_cndmask_b32_e64 v0, v0, v2, s[0:1]
 ; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
 ; GFX8-NEXT:    flat_store_dwordx2 v[3:4], v[0:1]
@@ -1603,20 +1603,20 @@ define amdgpu_ps void @insertelement_v_v4i16_v_v(ptr addrspace(1) %ptr, i16 %val
 ; GFX9-LABEL: insertelement_v_v4i16_v_v:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
-; GFX9-NEXT:    v_lshrrev_b32_e32 v6, 1, v3
+; GFX9-NEXT:    v_lshrrev_b32_e32 v7, 1, v3
 ; GFX9-NEXT:    v_and_b32_e32 v3, 1, v3
-; GFX9-NEXT:    s_mov_b32 s0, 0xffff
+; GFX9-NEXT:    v_mov_b32_e32 v6, 0xffff
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v3, 4, v3
 ; GFX9-NEXT:    v_lshlrev_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
-; GFX9-NEXT:    v_lshlrev_b32_e64 v3, v3, s0
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v6
+; GFX9-NEXT:    v_lshlrev_b32_e32 v3, v3, v6
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v7
 ; GFX9-NEXT:    v_not_b32_e32 v3, v3
 ; GFX9-NEXT:    v_mov_b32_e32 v4, 0
-; GFX9-NEXT:    v_cmp_eq_u32_e64 s[0:1], 0, v6
+; GFX9-NEXT:    v_cmp_eq_u32_e64 s[0:1], 0, v7
 ; GFX9-NEXT:    v_mov_b32_e32 v5, 0
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_cndmask_b32_e32 v7, v0, v1, vcc
-; GFX9-NEXT:    v_and_or_b32 v2, v7, v3, v2
+; GFX9-NEXT:    v_cndmask_b32_e32 v6, v0, v1, vcc
+; GFX9-NEXT:    v_and_or_b32 v2, v6, v3, v2
 ; GFX9-NEXT:    v_cndmask_b32_e64 v0, v0, v2, s[0:1]
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
 ; GFX9-NEXT:    global_store_dwordx2 v[4:5], v[0:1], off
@@ -1625,20 +1625,20 @@ define amdgpu_ps void @insertelement_v_v4i16_v_v(ptr addrspace(1) %ptr, i16 %val
 ; GFX8-LABEL: insertelement_v_v4i16_v_v:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
-; GFX8-NEXT:    v_lshrrev_b32_e32 v6, 1, v3
+; GFX8-NEXT:    v_lshrrev_b32_e32 v7, 1, v3
 ; GFX8-NEXT:    v_and_b32_e32 v3, 1, v3
-; GFX8-NEXT:    s_mov_b32 s0, 0xffff
+; GFX8-NEXT:    v_mov_b32_e32 v6, 0xffff
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 4, v3
 ; GFX8-NEXT:    v_lshlrev_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
-; GFX8-NEXT:    v_lshlrev_b32_e64 v3, v3, s0
-; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v6
+; GFX8-NEXT:    v_lshlrev_b32_e32 v3, v3, v6
+; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v7
 ; GFX8-NEXT:    v_not_b32_e32 v3, v3
 ; GFX8-NEXT:    v_mov_b32_e32 v4, 0
-; GFX8-NEXT:    v_cmp_eq_u32_e64 s[0:1], 0, v6
+; GFX8-NEXT:    v_cmp_eq_u32_e64 s[0:1], 0, v7
 ; GFX8-NEXT:    v_mov_b32_e32 v5, 0
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    v_cndmask_b32_e32 v7, v0, v1, vcc
-; GFX8-NEXT:    v_and_b32_e32 v3, v7, v3
+; GFX8-NEXT:    v_cndmask_b32_e32 v6, v0, v1, vcc
+; GFX8-NEXT:    v_and_b32_e32 v3, v6, v3
 ; GFX8-NEXT:    v_or_b32_e32 v2, v3, v2
 ; GFX8-NEXT:    v_cndmask_b32_e64 v0, v0, v2, s[0:1]
 ; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
@@ -2255,21 +2255,21 @@ define amdgpu_ps void @insertelement_s_v8i16_s_v(ptr addrspace(4) inreg %ptr, i1
 ; GFX9-NEXT:    s_load_dwordx4 s[8:11], s[2:3], 0x0
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v4, 1, v0
 ; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v4
-; GFX9-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX9-NEXT:    v_cmp_eq_u32_e64 s[0:1], 2, v4
+; GFX9-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s8
 ; GFX9-NEXT:    v_mov_b32_e32 v2, s9
 ; GFX9-NEXT:    v_mov_b32_e32 v3, s10
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
+; GFX9-NEXT:    v_cndmask_b32_e64 v1, v1, v3, s[0:1]
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
-; GFX9-NEXT:    s_mov_b32 s5, 0xffff
 ; GFX9-NEXT:    s_and_b32 s4, s4, 0xffff
+; GFX9-NEXT:    v_mov_b32_e32 v3, 0xffff
 ; GFX9-NEXT:    v_mov_b32_e32 v5, s11
-; GFX9-NEXT:    v_cndmask_b32_e64 v1, v1, v3, s[0:1]
 ; GFX9-NEXT:    v_cmp_eq_u32_e64 s[2:3], 3, v4
 ; GFX9-NEXT:    v_lshlrev_b32_e64 v2, v0, s4
-; GFX9-NEXT:    v_lshlrev_b32_e64 v0, v0, s5
+; GFX9-NEXT:    v_lshlrev_b32_e32 v0, v0, v3
 ; GFX9-NEXT:    v_cndmask_b32_e64 v1, v1, v5, s[2:3]
 ; GFX9-NEXT:    v_not_b32_e32 v0, v0
 ; GFX9-NEXT:    v_and_or_b32 v6, v1, v0, v2
@@ -2292,21 +2292,21 @@ define amdgpu_ps void @insertelement_s_v8i16_s_v(ptr addrspace(4) inreg %ptr, i1
 ; GFX8-NEXT:    s_load_dwordx4 s[8:11], s[2:3], 0x0
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v4, 1, v0
 ; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v4
-; GFX8-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX8-NEXT:    v_cmp_eq_u32_e64 s[0:1], 2, v4
+; GFX8-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s8
 ; GFX8-NEXT:    v_mov_b32_e32 v2, s9
 ; GFX8-NEXT:    v_mov_b32_e32 v3, s10
 ; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
+; GFX8-NEXT:    v_cndmask_b32_e64 v1, v1, v3, s[0:1]
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
-; GFX8-NEXT:    s_mov_b32 s5, 0xffff
 ; GFX8-NEXT:    s_and_b32 s4, s4, 0xffff
+; GFX8-NEXT:    v_mov_b32_e32 v3, 0xffff
 ; GFX8-NEXT:    v_mov_b32_e32 v5, s11
-; GFX8-NEXT:    v_cndmask_b32_e64 v1, v1, v3, s[0:1]
 ; GFX8-NEXT:    v_cmp_eq_u32_e64 s[2:3], 3, v4
 ; GFX8-NEXT:    v_lshlrev_b32_e64 v2, v0, s4
-; GFX8-NEXT:    v_lshlrev_b32_e64 v0, v0, s5
+; GFX8-NEXT:    v_lshlrev_b32_e32 v0, v0, v3
 ; GFX8-NEXT:    v_cndmask_b32_e64 v1, v1, v5, s[2:3]
 ; GFX8-NEXT:    v_not_b32_e32 v0, v0
 ; GFX8-NEXT:    v_and_b32_e32 v0, v1, v0
@@ -2451,12 +2451,12 @@ define amdgpu_ps void @insertelement_s_v8i16_v_v(ptr addrspace(4) inreg %ptr, i1
 ; GFX9-NEXT:    v_mov_b32_e32 v5, s6
 ; GFX9-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 4, v1
-; GFX9-NEXT:    s_mov_b32 s8, 0xffff
+; GFX9-NEXT:    v_mov_b32_e32 v3, 0xffff
 ; GFX9-NEXT:    v_mov_b32_e32 v6, s7
 ; GFX9-NEXT:    v_cndmask_b32_e64 v2, v2, v5, s[0:1]
 ; GFX9-NEXT:    v_cmp_eq_u32_e64 s[2:3], 3, v4
 ; GFX9-NEXT:    v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
-; GFX9-NEXT:    v_lshlrev_b32_e64 v1, v1, s8
+; GFX9-NEXT:    v_lshlrev_b32_e32 v1, v1, v3
 ; GFX9-NEXT:    v_cndmask_b32_e64 v2, v2, v6, s[2:3]
 ; GFX9-NEXT:    v_not_b32_e32 v1, v1
 ; GFX9-NEXT:    v_and_or_b32 v6, v2, v1, v0
@@ -2487,12 +2487,12 @@ define amdgpu_ps void @insertelement_s_v8i16_v_v(ptr addrspace(4) inreg %ptr, i1
 ; GFX8-NEXT:    v_mov_b32_e32 v5, s6
 ; GFX8-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 4, v1
-; GFX8-NEXT:    s_mov_b32 s8, 0xffff
+; GFX8-NEXT:    v_mov_b32_e32 v3, 0xffff
 ; GFX8-NEXT:    v_mov_b32_e32 v6, s7
 ; GFX8-NEXT:    v_cndmask_b32_e64 v2, v2, v5, s[0:1]
 ; GFX8-NEXT:    v_cmp_eq_u32_e64 s[2:3], 3, v4
 ; GFX8-NEXT:    v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
-; GFX8-NEXT:    v_lshlrev_b32_e64 v1, v1, s8
+; GFX8-NEXT:    v_lshlrev_b32_e32 v1, v1, v3
 ; GFX8-NEXT:    v_cndmask_b32_e64 v2, v2, v6, s[2:3]
 ; GFX8-NEXT:    v_not_b32_e32 v1, v1
 ; GFX8-NEXT:    v_and_b32_e32 v1, v2, v1
@@ -2629,25 +2629,25 @@ define amdgpu_ps void @insertelement_v_v8i16_s_v(ptr addrspace(1) %ptr, i16 inre
 ; GFX9-LABEL: insertelement_v_v8i16_s_v:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    global_load_dwordx4 v[3:6], v[0:1], off
-; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 1, v2
-; GFX9-NEXT:    v_and_b32_e32 v1, 1, v2
-; GFX9-NEXT:    s_mov_b32 s0, 0xffff
-; GFX9-NEXT:    s_and_b32 s1, s2, 0xffff
-; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 4, v1
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX9-NEXT:    v_lshlrev_b32_e64 v2, v1, s1
-; GFX9-NEXT:    v_lshlrev_b32_e64 v1, v1, s0
-; GFX9-NEXT:    v_cmp_eq_u32_e64 s[0:1], 2, v0
-; GFX9-NEXT:    v_cmp_eq_u32_e64 s[2:3], 3, v0
-; GFX9-NEXT:    v_not_b32_e32 v1, v1
+; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 1, v2
+; GFX9-NEXT:    v_and_b32_e32 v2, 1, v2
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0xffff
+; GFX9-NEXT:    s_and_b32 s0, s2, 0xffff
+; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 4, v2
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v1
+; GFX9-NEXT:    v_lshlrev_b32_e64 v9, v2, s0
+; GFX9-NEXT:    v_lshlrev_b32_e32 v0, v2, v0
+; GFX9-NEXT:    v_cmp_eq_u32_e64 s[0:1], 2, v1
+; GFX9-NEXT:    v_cmp_eq_u32_e64 s[2:3], 3, v1
+; GFX9-NEXT:    v_not_b32_e32 v0, v0
 ; GFX9-NEXT:    v_mov_b32_e32 v7, 0
-; GFX9-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v0
+; GFX9-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v1
 ; GFX9-NEXT:    v_mov_b32_e32 v8, 0
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_cndmask_b32_e32 v9, v3, v4, vcc
-; GFX9-NEXT:    v_cndmask_b32_e64 v9, v9, v5, s[0:1]
-; GFX9-NEXT:    v_cndmask_b32_e64 v9, v9, v6, s[2:3]
-; GFX9-NEXT:    v_and_or_b32 v9, v9, v1, v2
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, v3, v4, vcc
+; GFX9-NEXT:    v_cndmask_b32_e64 v2, v2, v5, s[0:1]
+; GFX9-NEXT:    v_cndmask_b32_e64 v2, v2, v6, s[2:3]
+; GFX9-NEXT:    v_and_or_b32 v9, v2, v0, v9
 ; GFX9-NEXT:    v_cndmask_b32_e64 v0, v3, v9, s[4:5]
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v4, v9, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e64 v2, v5, v9, s[0:1]
@@ -2658,26 +2658,26 @@ define amdgpu_ps void @insertelement_v_v8i16_s_v(ptr addrspace(1) %ptr, i16 inre
 ; GFX8-LABEL: insertelement_v_v8i16_s_v:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    flat_load_dwordx4 v[3:6], v[0:1]
-; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 1, v2
-; GFX8-NEXT:    v_and_b32_e32 v1, 1, v2
-; GFX8-NEXT:    s_mov_b32 s0, 0xffff
-; GFX8-NEXT:    s_and_b32 s1, s2, 0xffff
-; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 4, v1
-; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX8-NEXT:    v_lshlrev_b32_e64 v2, v1, s1
-; GFX8-NEXT:    v_lshlrev_b32_e64 v1, v1, s0
-; GFX8-NEXT:    v_cmp_eq_u32_e64 s[0:1], 2, v0
-; GFX8-NEXT:    v_cmp_eq_u32_e64 s[2:3], 3, v0
-; GFX8-NEXT:    v_not_b32_e32 v1, v1
+; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 1, v2
+; GFX8-NEXT:    v_and_b32_e32 v2, 1, v2
+; GFX8-NEXT:    v_mov_b32_e32 v0, 0xffff
+; GFX8-NEXT:    s_and_b32 s0, s2, 0xffff
+; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 4, v2
+; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v1
+; GFX8-NEXT:    v_lshlrev_b32_e64 v9, v2, s0
+; GFX8-NEXT:    v_lshlrev_b32_e32 v0, v2, v0
+; GFX8-NEXT:    v_cmp_eq_u32_e64 s[0:1], 2, v1
+; GFX8-NEXT:    v_cmp_eq_u32_e64 s[2:3], 3, v1
+; GFX8-NEXT:    v_not_b32_e32 v0, v0
 ; GFX8-NEXT:    v_mov_b32_e32 v7, 0
-; GFX8-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v0
+; GFX8-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v1
 ; GFX8-NEXT:    v_mov_b32_e32 v8, 0
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    v_cndmask_b32_e32 v9, v3, v4, vcc
-; GFX8-NEXT:    v_cndmask_b32_e64 v9, v9, v5, s[0:1]
-; GFX8-NEXT:    v_cndmask_b32_e64 v9, v9, v6, s[2:3]
-; GFX8-NEXT:    v_and_b32_e32 v1, v9, v1
-; GFX8-NEXT:    v_or_b32_e32 v9, v1, v2
+; GFX8-NEXT:    v_cndmask_b32_e32 v2, v3, v4, vcc
+; GFX8-NEXT:    v_cndmask_b32_e64 v2, v2, v5, s[0:1]
+; GFX8-NEXT:    v_cndmask_b32_e64 v2, v2, v6, s[2:3]
+; GFX8-NEXT:    v_and_b32_e32 v0, v2, v0
+; GFX8-NEXT:    v_or_b32_e32 v9, v0, v9
 ; GFX8-NEXT:    v_cndmask_b32_e64 v0, v3, v9, s[4:5]
 ; GFX8-NEXT:    v_cndmask_b32_e32 v1, v4, v9, vcc
 ; GFX8-NEXT:    v_cndmask_b32_e64 v2, v5, v9, s[0:1]
@@ -2941,24 +2941,24 @@ define amdgpu_ps void @insertelement_v_v8i16_v_v(ptr addrspace(1) %ptr, i16 %val
 ; GFX9-LABEL: insertelement_v_v8i16_v_v:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    global_load_dwordx4 v[4:7], v[0:1], off
-; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 1, v3
-; GFX9-NEXT:    v_and_b32_e32 v1, 1, v3
-; GFX9-NEXT:    s_mov_b32 s0, 0xffff
-; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 4, v1
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX9-NEXT:    v_lshlrev_b32_sdwa v2, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
-; GFX9-NEXT:    v_lshlrev_b32_e64 v1, v1, s0
-; GFX9-NEXT:    v_cmp_eq_u32_e64 s[0:1], 2, v0
-; GFX9-NEXT:    v_cmp_eq_u32_e64 s[2:3], 3, v0
-; GFX9-NEXT:    v_not_b32_e32 v1, v1
+; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 1, v3
+; GFX9-NEXT:    v_and_b32_e32 v3, 1, v3
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0xffff
+; GFX9-NEXT:    v_lshlrev_b32_e32 v3, 4, v3
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v1
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; GFX9-NEXT:    v_lshlrev_b32_e32 v0, v3, v0
+; GFX9-NEXT:    v_cmp_eq_u32_e64 s[0:1], 2, v1
+; GFX9-NEXT:    v_cmp_eq_u32_e64 s[2:3], 3, v1
+; GFX9-NEXT:    v_not_b32_e32 v0, v0
 ; GFX9-NEXT:    v_mov_b32_e32 v8, 0
-; GFX9-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v0
+; GFX9-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v1
 ; GFX9-NEXT:    v_mov_b32_e32 v9, 0
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_cndmask_b32_e32 v3, v4, v5, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e64 v3, v3, v6, s[0:1]
 ; GFX9-NEXT:    v_cndmask_b32_e64 v3, v3, v7, s[2:3]
-; GFX9-NEXT:    v_and_or_b32 v3, v3, v1, v2
+; GFX9-NEXT:    v_and_or_b32 v3, v3, v0, v2
 ; GFX9-NEXT:    v_cndmask_b32_e64 v0, v4, v3, s[4:5]
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v5, v3, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e64 v2, v6, v3, s[0:1]
@@ -2969,25 +2969,25 @@ define amdgpu_ps void @insertelement_v_v8i16_v_v(ptr addrspace(1) %ptr, i16 %val
 ; GFX8-LABEL: insertelement_v_v8i16_v_v:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    flat_load_dwordx4 v[4:7], v[0:1]
-; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 1, v3
-; GFX8-NEXT:    v_and_b32_e32 v1, 1, v3
-; GFX8-NEXT:    s_mov_b32 s0, 0xffff
-; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 4, v1
-; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX8-NEXT:    v_lshlrev_b32_sdwa v2, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
-; GFX8-NEXT:    v_lshlrev_b32_e64 v1, v1, s0
-; GFX8-NEXT:    v_cmp_eq_u32_e64 s[0:1], 2, v0
-; GFX8-NEXT:    v_cmp_eq_u32_e64 s[2:3], 3, v0
-; GFX8-NEXT:    v_not_b32_e32 v1, v1
+; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 1, v3
+; GFX8-NEXT:    v_and_b32_e32 v3, 1, v3
+; GFX8-NEXT:    v_mov_b32_e32 v0, 0xffff
+; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 4, v3
+; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v1
+; GFX8-NEXT:    v_lshlrev_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; GFX8-NEXT:    v_lshlrev_b32_e32 v0, v3, v0
+; GFX8-NEXT:    v_cmp_eq_u32_e64 s[0:1], 2, v1
+; GFX8-NEXT:    v_cmp_eq_u32_e64 s[2:3], 3, v1
+; GFX8-NEXT:    v_not_b32_e32 v0, v0
 ; GFX8-NEXT:    v_mov_b32_e32 v8, 0
-; GFX8-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v0
+; GFX8-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v1
 ; GFX8-NEXT:    v_mov_b32_e32 v9, 0
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    v_cndmask_b32_e32 v3, v4, v5, vcc
 ; GFX8-NEXT:    v_cndmask_b32_e64 v3, v3, v6, s[0:1]
 ; GFX8-NEXT:    v_cndmask_b32_e64 v3, v3, v7, s[2:3]
-; GFX8-NEXT:    v_and_b32_e32 v1, v3, v1
-; GFX8-NEXT:    v_or_b32_e32 v3, v1, v2
+; GFX8-NEXT:    v_and_b32_e32 v0, v3, v0
+; GFX8-NEXT:    v_or_b32_e32 v3, v0, v2
 ; GFX8-NEXT:    v_cndmask_b32_e64 v0, v4, v3, s[4:5]
 ; GFX8-NEXT:    v_cndmask_b32_e32 v1, v5, v3, vcc
 ; GFX8-NEXT:    v_cndmask_b32_e64 v2, v6, v3, s[0:1]
@@ -3640,13 +3640,13 @@ define amdgpu_ps void @insertelement_s_v16i16_s_v(ptr addrspace(4) inreg %ptr, i
 ; GFX9-NEXT:    v_cndmask_b32_e64 v1, v1, v6, s[6:7]
 ; GFX9-NEXT:    v_cmp_eq_u32_e64 s[8:9], 6, v8
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
-; GFX9-NEXT:    s_mov_b32 s5, 0xffff
 ; GFX9-NEXT:    s_and_b32 s4, s4, 0xffff
+; GFX9-NEXT:    v_mov_b32_e32 v3, 0xffff
 ; GFX9-NEXT:    v_mov_b32_e32 v9, s23
 ; GFX9-NEXT:    v_cndmask_b32_e64 v1, v1, v7, s[8:9]
 ; GFX9-NEXT:    v_cmp_eq_u32_e64 s[10:11], 7, v8
 ; GFX9-NEXT:    v_lshlrev_b32_e64 v2, v0, s4
-; GFX9-NEXT:    v_lshlrev_b32_e64 v0, v0, s5
+; GFX9-NEXT:    v_lshlrev_b32_e32 v0, v0, v3
 ; GFX9-NEXT:    v_cndmask_b32_e64 v1, v1, v9, s[10:11]
 ; GFX9-NEXT:    v_not_b32_e32 v0, v0
 ; GFX9-NEXT:    v_and_or_b32 v9, v1, v0, v2
@@ -3700,13 +3700,13 @@ define amdgpu_ps void @insertelement_s_v16i16_s_v(ptr addrspace(4) inreg %ptr, i
 ; GFX8-NEXT:    v_cndmask_b32_e64 v1, v1, v6, s[6:7]
 ; GFX8-NEXT:    v_cmp_eq_u32_e64 s[8:9], 6, v8
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
-; GFX8-NEXT:    s_mov_b32 s5, 0xffff
 ; GFX8-NEXT:    s_and_b32 s4, s4, 0xffff
+; GFX8-NEXT:    v_mov_b32_e32 v3, 0xffff
 ; GFX8-NEXT:    v_mov_b32_e32 v9, s23
 ; GFX8-NEXT:    v_cndmask_b32_e64 v1, v1, v7, s[8:9]
 ; GFX8-NEXT:    v_cmp_eq_u32_e64 s[10:11], 7, v8
 ; GFX8-NEXT:    v_lshlrev_b32_e64 v2, v0, s4
-; GFX8-NEXT:    v_lshlrev_b32_e64 v0, v0, s5
+; GFX8-NEXT:    v_lshlrev_b32_e32 v0, v0, v3
 ; GFX8-NEXT:    v_cndmask_b32_e64 v1, v1, v9, s[10:11]
 ; GFX8-NEXT:    v_not_b32_e32 v0, v0
 ; GFX8-NEXT:    v_and_b32_e32 v0, v1, v0
@@ -3935,12 +3935,12 @@ define amdgpu_ps void @insertelement_s_v16i16_v_v(ptr addrspace(4) inreg %ptr, i
 ; GFX9-NEXT:    v_cndmask_b32_e64 v2, v2, v7, s[6:7]
 ; GFX9-NEXT:    v_cmp_eq_u32_e64 s[8:9], 6, v8
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 4, v1
-; GFX9-NEXT:    s_mov_b32 s20, 0xffff
+; GFX9-NEXT:    v_mov_b32_e32 v3, 0xffff
 ; GFX9-NEXT:    v_mov_b32_e32 v10, s19
 ; GFX9-NEXT:    v_cndmask_b32_e64 v2, v2, v9, s[8:9]
 ; GFX9-NEXT:    v_cmp_eq_u32_e64 s[10:11], 7, v8
 ; GFX9-NEXT:    v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
-; GFX9-NEXT:    v_lshlrev_b32_e64 v1, v1, s20
+; GFX9-NEXT:    v_lshlrev_b32_e32 v1, v1, v3
 ; GFX9-NEXT:    v_cndmask_b32_e64 v2, v2, v10, s[10:11]
 ; GFX9-NEXT:    v_not_b32_e32 v1, v1
 ; GFX9-NEXT:    v_and_or_b32 v9, v2, v1, v0
@@ -3994,12 +3994,12 @@ define amdgpu_ps void @insertelement_s_v16i16_v_v(ptr addrspace(4) inreg %ptr, i
 ; GFX8-NEXT:    v_cndmask_b32_e64 v2, v2, v7, s[6:7]
 ; GFX8-NEXT:    v_cmp_eq_u32_e64 s[8:9], 6, v8
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 4, v1
-; GFX8-NEXT:    s_mov_b32 s20, 0xffff
+; GFX8-NEXT:    v_mov_b32_e32 v3, 0xffff
 ; GFX8-NEXT:    v_mov_b32_e32 v10, s19
 ; GFX8-NEXT:    v_cndmask_b32_e64 v2, v2, v9, s[8:9]
 ; GFX8-NEXT:    v_cmp_eq_u32_e64 s[10:11], 7, v8
 ; GFX8-NEXT:    v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
-; GFX8-NEXT:    v_lshlrev_b32_e64 v1, v1, s20
+; GFX8-NEXT:    v_lshlrev_b32_e32 v1, v1, v3
 ; GFX8-NEXT:    v_cndmask_b32_e64 v2, v2, v10, s[10:11]
 ; GFX8-NEXT:    v_not_b32_e32 v1, v1
 ; GFX8-NEXT:    v_and_b32_e32 v1, v2, v1
@@ -4209,32 +4209,32 @@ define amdgpu_ps void @insertelement_v_v16i16_s_v(ptr addrspace(1) %ptr, i16 inr
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    global_load_dwordx4 v[3:6], v[0:1], off
 ; GFX9-NEXT:    global_load_dwordx4 v[7:10], v[0:1], off offset:16
-; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 1, v2
-; GFX9-NEXT:    v_and_b32_e32 v1, 1, v2
-; GFX9-NEXT:    s_mov_b32 s0, 0xffff
-; GFX9-NEXT:    s_and_b32 s1, s2, 0xffff
-; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 4, v1
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX9-NEXT:    v_lshlrev_b32_e64 v2, v1, s1
-; GFX9-NEXT:    v_lshlrev_b32_e64 v1, v1, s0
-; GFX9-NEXT:    v_cmp_eq_u32_e64 s[0:1], 2, v0
-; GFX9-NEXT:    v_cmp_eq_u32_e64 s[2:3], 3, v0
-; GFX9-NEXT:    v_cmp_eq_u32_e64 s[4:5], 4, v0
-; GFX9-NEXT:    v_cmp_eq_u32_e64 s[6:7], 5, v0
-; GFX9-NEXT:    v_cmp_eq_u32_e64 s[8:9], 6, v0
-; GFX9-NEXT:    v_cmp_eq_u32_e64 s[10:11], 7, v0
-; GFX9-NEXT:    v_not_b32_e32 v1, v1
-; GFX9-NEXT:    v_cmp_eq_u32_e64 s[12:13], 0, v0
+; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 1, v2
+; GFX9-NEXT:    v_and_b32_e32 v2, 1, v2
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0xffff
+; GFX9-NEXT:    s_and_b32 s0, s2, 0xffff
+; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 4, v2
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v1
+; GFX9-NEXT:    v_lshlrev_b32_e64 v11, v2, s0
+; GFX9-NEXT:    v_lshlrev_b32_e32 v0, v2, v0
+; GFX9-NEXT:    v_cmp_eq_u32_e64 s[0:1], 2, v1
+; GFX9-NEXT:    v_cmp_eq_u32_e64 s[2:3], 3, v1
+; GFX9-NEXT:    v_cmp_eq_u32_e64 s[4:5], 4, v1
+; GFX9-NEXT:    v_cmp_eq_u32_e64 s[6:7], 5, v1
+; GFX9-NEXT:    v_cmp_eq_u32_e64 s[8:9], 6, v1
+; GFX9-NEXT:    v_cmp_eq_u32_e64 s[10:11], 7, v1
+; GFX9-NEXT:    v_not_b32_e32 v0, v0
+; GFX9-NEXT:    v_cmp_eq_u32_e64 s[12:13], 0, v1
 ; GFX9-NEXT:    s_waitcnt vmcnt(1)
-; GFX9-NEXT:    v_cndmask_b32_e32 v11, v3, v4, vcc
-; GFX9-NEXT:    v_cndmask_b32_e64 v11, v11, v5, s[0:1]
-; GFX9-NEXT:    v_cndmask_b32_e64 v11, v11, v6, s[2:3]
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, v3, v4, vcc
+; GFX9-NEXT:    v_cndmask_b32_e64 v2, v2, v5, s[0:1]
+; GFX9-NEXT:    v_cndmask_b32_e64 v2, v2, v6, s[2:3]
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_cndmask_b32_e64 v11, v11, v7, s[4:5]
-; GFX9-NEXT:    v_cndmask_b32_e64 v11, v11, v8, s[6:7]
-; GFX9-NEXT:    v_cndmask_b32_e64 v11, v11, v9, s[8:9]
-; GFX9-NEXT:    v_cndmask_b32_e64 v11, v11, v10, s[10:11]
-; GFX9-NEXT:    v_and_or_b32 v11, v11, v1, v2
+; GFX9-NEXT:    v_cndmask_b32_e64 v2, v2, v7, s[4:5]
+; GFX9-NEXT:    v_cndmask_b32_e64 v2, v2, v8, s[6:7]
+; GFX9-NEXT:    v_cndmask_b32_e64 v2, v2, v9, s[8:9]
+; GFX9-NEXT:    v_cndmask_b32_e64 v2, v2, v10, s[10:11]
+; GFX9-NEXT:    v_and_or_b32 v11, v2, v0, v11
 ; GFX9-NEXT:    v_cndmask_b32_e64 v0, v3, v11, s[12:13]
 ; GFX9-NEXT:    v_cndmask_b32_e64 v2, v5, v11, s[0:1]
 ; GFX9-NEXT:    v_cndmask_b32_e64 v3, v6, v11, s[2:3]
@@ -4257,33 +4257,33 @@ define amdgpu_ps void @insertelement_v_v16i16_s_v(ptr addrspace(1) %ptr, i16 inr
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 16, v0
 ; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; GFX8-NEXT:    flat_load_dwordx4 v[7:10], v[0:1]
-; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 1, v2
-; GFX8-NEXT:    v_and_b32_e32 v1, 1, v2
-; GFX8-NEXT:    s_mov_b32 s0, 0xffff
-; GFX8-NEXT:    s_and_b32 s1, s2, 0xffff
-; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 4, v1
-; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX8-NEXT:    v_lshlrev_b32_e64 v2, v1, s1
-; GFX8-NEXT:    v_lshlrev_b32_e64 v1, v1, s0
-; GFX8-NEXT:    v_cmp_eq_u32_e64 s[0:1], 2, v0
-; GFX8-NEXT:    v_cmp_eq_u32_e64 s[2:3], 3, v0
-; GFX8-NEXT:    v_cmp_eq_u32_e64 s[4:5], 4, v0
-; GFX8-NEXT:    v_cmp_eq_u32_e64 s[6:7], 5, v0
-; GFX8-NEXT:    v_cmp_eq_u32_e64 s[8:9], 6, v0
-; GFX8-NEXT:    v_cmp_eq_u32_e64 s[10:11], 7, v0
-; GFX8-NEXT:    v_not_b32_e32 v1, v1
-; GFX8-NEXT:    v_cmp_eq_u32_e64 s[12:13], 0, v0
+; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 1, v2
+; GFX8-NEXT:    v_and_b32_e32 v2, 1, v2
+; GFX8-NEXT:    v_mov_b32_e32 v0, 0xffff
+; GFX8-NEXT:    s_and_b32 s0, s2, 0xffff
+; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 4, v2
+; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v1
+; GFX8-NEXT:    v_lshlrev_b32_e64 v11, v2, s0
+; GFX8-NEXT:    v_lshlrev_b32_e32 v0, v2, v0
+; GFX8-NEXT:    v_cmp_eq_u32_e64 s[0:1], 2, v1
+; GFX8-NEXT:    v_cmp_eq_u32_e64 s[2:3], 3, v1
+; GFX8-NEXT:    v_cmp_eq_u32_e64 s[4:5], 4, v1
+; GFX8-NEXT:    v_cmp_eq_u32_e64 s[6:7], 5, v1
+; GFX8-NEXT:    v_cmp_eq_u32_e64 s[8:9], 6, v1
+; GFX8-NEXT:    v_cmp_eq_u32_e64 s[10:11], 7, v1
+; GFX8-NEXT:    v_not_b32_e32 v0, v0
+; GFX8-NEXT:    v_cmp_eq_u32_e64 s[12:13], 0, v1
 ; GFX8-NEXT:    s_waitcnt vmcnt(1)
-; GFX8-NEXT:    v_cndmask_b32_e32 v11, v3, v4, vcc
-; GFX8-NEXT:    v_cndmask_b32_e64 v11, v11, v5, s[0:1]
-; GFX8-NEXT:    v_cndmask_b32_e64 v11, v11, v6, s[2:3]
+; GFX8-NEXT:    v_cndmask_b32_e32 v2, v3, v4, vcc
+; GFX8-NEXT:    v_cndmask_b32_e64 v2, v2, v5, s[0:1]
+; GFX8-NEXT:    v_cndmask_b32_e64 v2, v2, v6, s[2:3]
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    v_cndmask_b32_e64 v11, v11, v7, s[4:5]
-; GFX8-NEXT:    v_cndmask_b32_e64 v11, v11, v8, s[6:7]
-; GFX8-NEXT:    v_cndmask_b32_e64 v11, v11, v9, s[8:9]
-; GFX8-NEXT:    v_cndmask_b32_e64 v11, v11, v10, s[10:11]
-; GFX8-NEXT:    v_and_b32_e32 v1, v11, v1
-; GFX8-NEXT:    v_or_b32_e32 v11, v1, v2
+; GFX8-NEXT:    v_cndmask_b32_e64 v2, v2, v7, s[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e64 v2, v2, v8, s[6:7]
+; GFX8-NEXT:    v_cndmask_b32_e64 v2, v2, v9, s[8:9]
+; GFX8-NEXT:    v_cndmask_b32_e64 v2, v2, v10, s[10:11]
+; GFX8-NEXT:    v_and_b32_e32 v0, v2, v0
+; GFX8-NEXT:    v_or_b32_e32 v11, v0, v11
 ; GFX8-NEXT:    v_cndmask_b32_e64 v0, v3, v11, s[12:13]
 ; GFX8-NEXT:    v_cndmask_b32_e64 v2, v5, v11, s[0:1]
 ; GFX8-NEXT:    v_cndmask_b32_e64 v3, v6, v11, s[2:3]
@@ -4610,21 +4610,21 @@ define amdgpu_ps void @insertelement_v_v16i16_v_v(ptr addrspace(1) %ptr, i16 %va
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    global_load_dwordx4 v[4:7], v[0:1], off
 ; GFX9-NEXT:    global_load_dwordx4 v[8:11], v[0:1], off offset:16
-; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 1, v3
-; GFX9-NEXT:    v_and_b32_e32 v1, 1, v3
-; GFX9-NEXT:    s_mov_b32 s0, 0xffff
-; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 4, v1
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX9-NEXT:    v_lshlrev_b32_sdwa v2, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
-; GFX9-NEXT:    v_lshlrev_b32_e64 v1, v1, s0
-; GFX9-NEXT:    v_cmp_eq_u32_e64 s[0:1], 2, v0
-; GFX9-NEXT:    v_cmp_eq_u32_e64 s[2:3], 3, v0
-; GFX9-NEXT:    v_cmp_eq_u32_e64 s[4:5], 4, v0
-; GFX9-NEXT:    v_cmp_eq_u32_e64 s[6:7], 5, v0
-; GFX9-NEXT:    v_cmp_eq_u32_e64 s[8:9], 6, v0
-; GFX9-NEXT:    v_cmp_eq_u32_e64 s[10:11], 7, v0
-; GFX9-NEXT:    v_not_b32_e32 v1, v1
-; GFX9-NEXT:    v_cmp_eq_u32_e64 s[12:13], 0, v0
+; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 1, v3
+; GFX9-NEXT:    v_and_b32_e32 v3, 1, v3
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0xffff
+; GFX9-NEXT:    v_lshlrev_b32_e32 v3, 4, v3
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v1
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; GFX9-NEXT:    v_lshlrev_b32_e32 v0, v3, v0
+; GFX9-NEXT:    v_cmp_eq_u32_e64 s[0:1], 2, v1
+; GFX9-NEXT:    v_cmp_eq_u32_e64 s[2:3], 3, v1
+; GFX9-NEXT:    v_cmp_eq_u32_e64 s[4:5], 4, v1
+; GFX9-NEXT:    v_cmp_eq_u32_e64 s[6:7], 5, v1
+; GFX9-NEXT:    v_cmp_eq_u32_e64 s[8:9], 6, v1
+; GFX9-NEXT:    v_cmp_eq_u32_e64 s[10:11], 7, v1
+; GFX9-NEXT:    v_not_b32_e32 v0, v0
+; GFX9-NEXT:    v_cmp_eq_u32_e64 s[12:13], 0, v1
 ; GFX9-NEXT:    s_waitcnt vmcnt(1)
 ; GFX9-NEXT:    v_cndmask_b32_e32 v3, v4, v5, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e64 v3, v3, v6, s[0:1]
@@ -4634,7 +4634,7 @@ define amdgpu_ps void @insertelement_v_v16i16_v_v(ptr addrspace(1) %ptr, i16 %va
 ; GFX9-NEXT:    v_cndmask_b32_e64 v3, v3, v9, s[6:7]
 ; GFX9-NEXT:    v_cndmask_b32_e64 v3, v3, v10, s[8:9]
 ; GFX9-NEXT:    v_cndmask_b32_e64 v3, v3, v11, s[10:11]
-; GFX9-NEXT:    v_and_or_b32 v12, v3, v1, v2
+; GFX9-NEXT:    v_and_or_b32 v12, v3, v0, v2
 ; GFX9-NEXT:    v_cndmask_b32_e64 v0, v4, v12, s[12:13]
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v5, v12, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e64 v4, v8, v12, s[4:5]
@@ -4657,21 +4657,21 @@ define amdgpu_ps void @insertelement_v_v16i16_v_v(ptr addrspace(1) %ptr, i16 %va
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, 16, v0
 ; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; GFX8-NEXT:    flat_load_dwordx4 v[8:11], v[0:1]
-; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 1, v3
-; GFX8-NEXT:    v_and_b32_e32 v1, 1, v3
-; GFX8-NEXT:    s_mov_b32 s0, 0xffff
-; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 4, v1
-; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX8-NEXT:    v_lshlrev_b32_sdwa v2, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
-; GFX8-NEXT:    v_lshlrev_b32_e64 v1, v1, s0
-; GFX8-NEXT:    v_cmp_eq_u32_e64 s[0:1], 2, v0
-; GFX8-NEXT:    v_cmp_eq_u32_e64 s[2:3], 3, v0
-; GFX8-NEXT:    v_cmp_eq_u32_e64 s[4:5], 4, v0
-; GFX8-NEXT:    v_cmp_eq_u32_e64 s[6:7], 5, v0
-; GFX8-NEXT:    v_cmp_eq_u32_e64 s[8:9], 6, v0
-; GFX8-NEXT:    v_cmp_eq_u32_e64 s[10:11], 7, v0
-; GFX8-NEXT:    v_not_b32_e32 v1, v1
-; GFX8-NEXT:    v_cmp_eq_u32_e64 s[12:13], 0, v0
+; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 1, v3
+; GFX8-NEXT:    v_and_b32_e32 v3, 1, v3
+; GFX8-NEXT:    v_mov_b32_e32 v0, 0xffff
+; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 4, v3
+; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v1
+; GFX8-NEXT:    v_lshlrev_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; GFX8-NEXT:    v_lshlrev_b32_e32 v0, v3, v0
+; GFX8-NEXT:    v_cmp_eq_u32_e64 s[0:1], 2, v1
+; GFX8-NEXT:    v_cmp_eq_u32_e64 s[2:3], 3, v1
+; GFX8-NEXT:    v_cmp_eq_u32_e64 s[4:5], 4, v1
+; GFX8-NEXT:    v_cmp_eq_u32_e64 s[6:7], 5, v1
+; GFX8-NEXT:    v_cmp_eq_u32_e64 s[8:9], 6, v1
+; GFX8-NEXT:    v_cmp_eq_u32_e64 s[10:11], 7, v1
+; GFX8-NEXT:    v_not_b32_e32 v0, v0
+; GFX8-NEXT:    v_cmp_eq_u32_e64 s[12:13], 0, v1
 ; GFX8-NEXT:    s_waitcnt vmcnt(1)
 ; GFX8-NEXT:    v_cndmask_b32_e32 v3, v4, v5, vcc
 ; GFX8-NEXT:    v_cndmask_b32_e64 v3, v3, v6, s[0:1]
@@ -4681,8 +4681,8 @@ define amdgpu_ps void @insertelement_v_v16i16_v_v(ptr addrspace(1) %ptr, i16 %va
 ; GFX8-NEXT:    v_cndmask_b32_e64 v3, v3, v9, s[6:7]
 ; GFX8-NEXT:    v_cndmask_b32_e64 v3, v3, v10, s[8:9]
 ; GFX8-NEXT:    v_cndmask_b32_e64 v3, v3, v11, s[10:11]
-; GFX8-NEXT:    v_and_b32_e32 v1, v3, v1
-; GFX8-NEXT:    v_or_b32_e32 v12, v1, v2
+; GFX8-NEXT:    v_and_b32_e32 v0, v3, v0
+; GFX8-NEXT:    v_or_b32_e32 v12, v0, v2
 ; GFX8-NEXT:    v_cndmask_b32_e64 v0, v4, v12, s[12:13]
 ; GFX8-NEXT:    v_cndmask_b32_e32 v1, v5, v12, vcc
 ; GFX8-NEXT:    v_cndmask_b32_e64 v4, v8, v12, s[4:5]
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i8.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i8.ll
index 16b702edff2db94..d5bfb7faf7fc8a0 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i8.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i8.ll
@@ -71,17 +71,17 @@ define amdgpu_ps void @insertelement_s_v2i8_s_s(ptr addrspace(4) inreg %ptr, i8
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-NEXT:    v_cmp_eq_u32_e64 s0, s5, 1
-; GFX10-NEXT:    v_cmp_eq_u32_e64 s1, s5, 0
+; GFX10-NEXT:    v_mov_b32_e32 v2, 0xff
 ; GFX10-NEXT:    global_load_ushort v0, v0, s[2:3]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v1, 8, v0
-; GFX10-NEXT:    v_cndmask_b32_e64 v2, v0, s4, s1
 ; GFX10-NEXT:    v_cndmask_b32_e64 v1, v1, s4, s0
-; GFX10-NEXT:    s_movk_i32 s0, 0xff
-; GFX10-NEXT:    v_and_b32_sdwa v3, v1, s0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX10-NEXT:    v_cmp_eq_u32_e64 s0, s5, 0
+; GFX10-NEXT:    v_and_b32_sdwa v2, v1, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX10-NEXT:    v_cndmask_b32_e64 v3, v0, s4, s0
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0
-; GFX10-NEXT:    v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX10-NEXT:    v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX10-NEXT:    global_store_short v[0:1], v2, off
 ; GFX10-NEXT:    s_endpgm
 ;
@@ -177,16 +177,16 @@ define amdgpu_ps void @insertelement_v_v2i8_s_s(ptr addrspace(1) %ptr, i8 inreg
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    global_load_ushort v0, v[0:1], off
 ; GFX10-NEXT:    v_cmp_eq_u32_e64 s0, s3, 1
-; GFX10-NEXT:    v_cmp_eq_u32_e64 s1, s3, 0
+; GFX10-NEXT:    v_mov_b32_e32 v2, 0xff
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v1, 8, v0
-; GFX10-NEXT:    v_cndmask_b32_e64 v2, v0, s2, s1
 ; GFX10-NEXT:    v_cndmask_b32_e64 v1, v1, s2, s0
-; GFX10-NEXT:    s_movk_i32 s0, 0xff
-; GFX10-NEXT:    v_and_b32_sdwa v3, v1, s0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX10-NEXT:    v_cmp_eq_u32_e64 s0, s3, 0
+; GFX10-NEXT:    v_and_b32_sdwa v2, v1, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX10-NEXT:    v_cndmask_b32_e64 v3, v0, s2, s0
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0
-; GFX10-NEXT:    v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX10-NEXT:    v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX10-NEXT:    global_store_short v[0:1], v2, off
 ; GFX10-NEXT:    s_endpgm
 ;
@@ -281,17 +281,17 @@ define amdgpu_ps void @insertelement_s_v2i8_v_s(ptr addrspace(4) inreg %ptr, i8
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-NEXT:    v_cmp_eq_u32_e64 vcc_lo, s4, 1
-; GFX10-NEXT:    s_movk_i32 s0, 0xff
+; GFX10-NEXT:    v_mov_b32_e32 v3, 0xff
 ; GFX10-NEXT:    global_load_ushort v1, v1, s[2:3]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v2, 8, v1
 ; GFX10-NEXT:    v_cndmask_b32_e32 v2, v2, v0, vcc_lo
 ; GFX10-NEXT:    v_cmp_eq_u32_e64 vcc_lo, s4, 0
-; GFX10-NEXT:    v_and_b32_sdwa v2, v2, s0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX10-NEXT:    v_cndmask_b32_e32 v3, v1, v0, vcc_lo
+; GFX10-NEXT:    v_and_b32_sdwa v2, v2, v3 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX10-NEXT:    v_cndmask_b32_e32 v4, v1, v0, vcc_lo
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0
-; GFX10-NEXT:    v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX10-NEXT:    v_or_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX10-NEXT:    global_store_short v[0:1], v2, off
 ; GFX10-NEXT:    s_endpgm
 ;
@@ -389,17 +389,17 @@ define amdgpu_ps void @insertelement_s_v2i8_s_v(ptr addrspace(4) inreg %ptr, i8
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX10-NEXT:    s_movk_i32 s0, 0xff
+; GFX10-NEXT:    v_mov_b32_e32 v3, 0xff
 ; GFX10-NEXT:    global_load_ushort v1, v1, s[2:3]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v2, 8, v1
 ; GFX10-NEXT:    v_cndmask_b32_e64 v2, v2, s4, vcc_lo
 ; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX10-NEXT:    v_and_b32_sdwa v2, v2, s0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX10-NEXT:    v_cndmask_b32_e64 v3, v1, s4, vcc_lo
+; GFX10-NEXT:    v_and_b32_sdwa v2, v2, v3 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX10-NEXT:    v_cndmask_b32_e64 v4, v1, s4, vcc_lo
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0
-; GFX10-NEXT:    v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX10-NEXT:    v_or_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX10-NEXT:    global_store_short v[0:1], v2, off
 ; GFX10-NEXT:    s_endpgm
 ;
@@ -495,13 +495,13 @@ define amdgpu_ps void @insertelement_s_v2i8_v_v(ptr addrspace(4) inreg %ptr, i8
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v1
-; GFX10-NEXT:    s_movk_i32 s0, 0xff
+; GFX10-NEXT:    v_mov_b32_e32 v4, 0xff
 ; GFX10-NEXT:    global_load_ushort v2, v2, s[2:3]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v3, 8, v2
 ; GFX10-NEXT:    v_cndmask_b32_e32 v3, v3, v0, vcc_lo
 ; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
-; GFX10-NEXT:    v_and_b32_sdwa v3, v3, s0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX10-NEXT:    v_and_b32_sdwa v3, v3, v4 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; GFX10-NEXT:    v_cndmask_b32_e32 v2, v2, v0, vcc_lo
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0
@@ -600,12 +600,12 @@ define amdgpu_ps void @insertelement_v_v2i8_s_v(ptr addrspace(1) %ptr, i8 inreg
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    global_load_ushort v0, v[0:1], off
 ; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v2
-; GFX10-NEXT:    s_movk_i32 s0, 0xff
+; GFX10-NEXT:    v_mov_b32_e32 v3, 0xff
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v1, 8, v0
 ; GFX10-NEXT:    v_cndmask_b32_e64 v1, v1, s2, vcc_lo
 ; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v2
-; GFX10-NEXT:    v_and_b32_sdwa v3, v1, s0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX10-NEXT:    v_and_b32_sdwa v3, v1, v3 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; GFX10-NEXT:    v_cndmask_b32_e64 v2, v0, s2, vcc_lo
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0
@@ -701,12 +701,12 @@ define amdgpu_ps void @insertelement_v_v2i8_v_s(ptr addrspace(1) %ptr, i8 %val,
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    global_load_ushort v0, v[0:1], off
 ; GFX10-NEXT:    v_cmp_eq_u32_e64 vcc_lo, s2, 1
-; GFX10-NEXT:    s_movk_i32 s0, 0xff
+; GFX10-NEXT:    v_mov_b32_e32 v3, 0xff
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v1, 8, v0
 ; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc_lo
 ; GFX10-NEXT:    v_cmp_eq_u32_e64 vcc_lo, s2, 0
-; GFX10-NEXT:    v_and_b32_sdwa v3, v1, s0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX10-NEXT:    v_and_b32_sdwa v3, v1, v3 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; GFX10-NEXT:    v_cndmask_b32_e32 v2, v0, v2, vcc_lo
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0
@@ -802,12 +802,12 @@ define amdgpu_ps void @insertelement_v_v2i8_v_v(ptr addrspace(1) %ptr, i8 %val,
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    global_load_ushort v0, v[0:1], off
 ; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v3
-; GFX10-NEXT:    s_movk_i32 s0, 0xff
+; GFX10-NEXT:    v_mov_b32_e32 v4, 0xff
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v1, 8, v0
 ; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc_lo
 ; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v3
-; GFX10-NEXT:    v_and_b32_sdwa v3, v1, s0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX10-NEXT:    v_and_b32_sdwa v3, v1, v4 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; GFX10-NEXT:    v_cndmask_b32_e32 v2, v0, v2, vcc_lo
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0
@@ -1088,10 +1088,10 @@ define amdgpu_ps void @insertelement_s_v4i8_s_v(ptr addrspace(4) inreg %ptr, i8
 ; GFX9-NEXT:    s_load_dword s0, s[2:3], 0x0
 ; GFX9-NEXT:    v_and_b32_e32 v0, 3, v0
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
-; GFX9-NEXT:    s_movk_i32 s1, 0xff
-; GFX9-NEXT:    s_and_b32 s2, s4, 0xff
-; GFX9-NEXT:    v_lshlrev_b32_e64 v2, v0, s2
-; GFX9-NEXT:    v_lshlrev_b32_e64 v0, v0, s1
+; GFX9-NEXT:    s_and_b32 s1, s4, 0xff
+; GFX9-NEXT:    v_mov_b32_e32 v1, 0xff
+; GFX9-NEXT:    v_lshlrev_b32_e64 v2, v0, s1
+; GFX9-NEXT:    v_lshlrev_b32_e32 v0, v0, v1
 ; GFX9-NEXT:    v_not_b32_e32 v3, v0
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
@@ -1105,10 +1105,10 @@ define amdgpu_ps void @insertelement_s_v4i8_s_v(ptr addrspace(4) inreg %ptr, i8
 ; GFX8-NEXT:    s_load_dword s0, s[2:3], 0x0
 ; GFX8-NEXT:    v_and_b32_e32 v0, 3, v0
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
-; GFX8-NEXT:    s_movk_i32 s1, 0xff
-; GFX8-NEXT:    s_and_b32 s2, s4, 0xff
-; GFX8-NEXT:    v_lshlrev_b32_e64 v2, v0, s2
-; GFX8-NEXT:    v_lshlrev_b32_e64 v0, v0, s1
+; GFX8-NEXT:    s_and_b32 s1, s4, 0xff
+; GFX8-NEXT:    v_mov_b32_e32 v1, 0xff
+; GFX8-NEXT:    v_lshlrev_b32_e64 v2, v0, s1
+; GFX8-NEXT:    v_lshlrev_b32_e32 v0, v0, v1
 ; GFX8-NEXT:    v_not_b32_e32 v0, v0
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    v_and_b32_e32 v3, s0, v0
@@ -1183,14 +1183,14 @@ define amdgpu_ps void @insertelement_s_v4i8_v_v(ptr addrspace(4) inreg %ptr, i8
 ; GFX9-NEXT:    s_load_dword s0, s[2:3], 0x0
 ; GFX9-NEXT:    v_and_b32_e32 v1, 3, v1
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 3, v1
-; GFX9-NEXT:    s_movk_i32 s1, 0xff
-; GFX9-NEXT:    v_lshlrev_b32_sdwa v2, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX9-NEXT:    v_lshlrev_b32_e64 v0, v1, s1
-; GFX9-NEXT:    v_not_b32_e32 v3, v0
+; GFX9-NEXT:    v_mov_b32_e32 v2, 0xff
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v3, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT:    v_lshlrev_b32_e32 v0, v1, v2
+; GFX9-NEXT:    v_not_b32_e32 v2, v0
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_and_or_b32 v2, s0, v3, v2
+; GFX9-NEXT:    v_and_or_b32 v2, s0, v2, v3
 ; GFX9-NEXT:    global_store_dword v[0:1], v2, off
 ; GFX9-NEXT:    s_endpgm
 ;
@@ -1199,15 +1199,15 @@ define amdgpu_ps void @insertelement_s_v4i8_v_v(ptr addrspace(4) inreg %ptr, i8
 ; GFX8-NEXT:    s_load_dword s0, s[2:3], 0x0
 ; GFX8-NEXT:    v_and_b32_e32 v1, 3, v1
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 3, v1
-; GFX8-NEXT:    s_movk_i32 s1, 0xff
-; GFX8-NEXT:    v_lshlrev_b32_sdwa v2, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX8-NEXT:    v_lshlrev_b32_e64 v0, v1, s1
+; GFX8-NEXT:    v_mov_b32_e32 v2, 0xff
+; GFX8-NEXT:    v_lshlrev_b32_sdwa v3, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX8-NEXT:    v_lshlrev_b32_e32 v0, v1, v2
 ; GFX8-NEXT:    v_not_b32_e32 v0, v0
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    v_and_b32_e32 v3, s0, v0
+; GFX8-NEXT:    v_and_b32_e32 v2, s0, v0
 ; GFX8-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX8-NEXT:    v_mov_b32_e32 v1, 0
-; GFX8-NEXT:    v_or_b32_e32 v2, v3, v2
+; GFX8-NEXT:    v_or_b32_e32 v2, v2, v3
 ; GFX8-NEXT:    flat_store_dword v[0:1], v2
 ; GFX8-NEXT:    s_endpgm
 ;
@@ -1275,10 +1275,10 @@ define amdgpu_ps void @insertelement_v_v4i8_s_v(ptr addrspace(1) %ptr, i8 inreg
 ; GFX9-NEXT:    global_load_dword v3, v[0:1], off
 ; GFX9-NEXT:    v_and_b32_e32 v0, 3, v2
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
-; GFX9-NEXT:    s_movk_i32 s0, 0xff
-; GFX9-NEXT:    s_and_b32 s1, s2, 0xff
-; GFX9-NEXT:    v_lshlrev_b32_e64 v2, v0, s1
-; GFX9-NEXT:    v_lshlrev_b32_e64 v0, v0, s0
+; GFX9-NEXT:    s_and_b32 s0, s2, 0xff
+; GFX9-NEXT:    v_mov_b32_e32 v1, 0xff
+; GFX9-NEXT:    v_lshlrev_b32_e64 v2, v0, s0
+; GFX9-NEXT:    v_lshlrev_b32_e32 v0, v0, v1
 ; GFX9-NEXT:    v_not_b32_e32 v4, v0
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
@@ -1290,18 +1290,18 @@ define amdgpu_ps void @insertelement_v_v4i8_s_v(ptr addrspace(1) %ptr, i8 inreg
 ; GFX8-LABEL: insertelement_v_v4i8_s_v:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    flat_load_dword v0, v[0:1]
-; GFX8-NEXT:    v_and_b32_e32 v1, 3, v2
-; GFX8-NEXT:    s_movk_i32 s0, 0xff
-; GFX8-NEXT:    s_and_b32 s1, s2, 0xff
-; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 3, v1
-; GFX8-NEXT:    v_lshlrev_b32_e64 v2, v1, s1
-; GFX8-NEXT:    v_lshlrev_b32_e64 v1, v1, s0
+; GFX8-NEXT:    v_and_b32_e32 v2, 3, v2
+; GFX8-NEXT:    v_mov_b32_e32 v1, 0xff
+; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 3, v2
+; GFX8-NEXT:    v_lshlrev_b32_e32 v1, v2, v1
+; GFX8-NEXT:    s_and_b32 s0, s2, 0xff
 ; GFX8-NEXT:    v_not_b32_e32 v1, v1
+; GFX8-NEXT:    v_lshlrev_b32_e64 v3, v2, s0
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    v_and_b32_e32 v3, v0, v1
+; GFX8-NEXT:    v_and_b32_e32 v2, v0, v1
 ; GFX8-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX8-NEXT:    v_mov_b32_e32 v1, 0
-; GFX8-NEXT:    v_or_b32_e32 v2, v3, v2
+; GFX8-NEXT:    v_or_b32_e32 v2, v2, v3
 ; GFX8-NEXT:    flat_store_dword v[0:1], v2
 ; GFX8-NEXT:    s_endpgm
 ;
@@ -1465,9 +1465,9 @@ define amdgpu_ps void @insertelement_v_v4i8_v_v(ptr addrspace(1) %ptr, i8 %val,
 ; GFX9-NEXT:    global_load_dword v4, v[0:1], off
 ; GFX9-NEXT:    v_and_b32_e32 v0, 3, v3
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
-; GFX9-NEXT:    s_movk_i32 s0, 0xff
+; GFX9-NEXT:    v_mov_b32_e32 v1, 0xff
 ; GFX9-NEXT:    v_lshlrev_b32_sdwa v2, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX9-NEXT:    v_lshlrev_b32_e64 v0, v0, s0
+; GFX9-NEXT:    v_lshlrev_b32_e32 v0, v0, v1
 ; GFX9-NEXT:    v_not_b32_e32 v3, v0
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
@@ -1479,12 +1479,12 @@ define amdgpu_ps void @insertelement_v_v4i8_v_v(ptr addrspace(1) %ptr, i8 %val,
 ; GFX8-LABEL: insertelement_v_v4i8_v_v:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    flat_load_dword v0, v[0:1]
-; GFX8-NEXT:    v_and_b32_e32 v1, 3, v3
-; GFX8-NEXT:    s_movk_i32 s0, 0xff
-; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 3, v1
-; GFX8-NEXT:    v_lshlrev_b32_sdwa v2, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX8-NEXT:    v_lshlrev_b32_e64 v1, v1, s0
+; GFX8-NEXT:    v_and_b32_e32 v3, 3, v3
+; GFX8-NEXT:    v_mov_b32_e32 v1, 0xff
+; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 3, v3
+; GFX8-NEXT:    v_lshlrev_b32_e32 v1, v3, v1
 ; GFX8-NEXT:    v_not_b32_e32 v1, v1
+; GFX8-NEXT:    v_lshlrev_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    v_and_b32_e32 v3, v0, v1
 ; GFX8-NEXT:    v_mov_b32_e32 v0, 0
@@ -1957,11 +1957,11 @@ define amdgpu_ps void @insertelement_s_v8i8_s_v(ptr addrspace(4) inreg %ptr, i8
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX9-NEXT:    v_mov_b32_e32 v3, s1
-; GFX9-NEXT:    s_movk_i32 s2, 0xff
-; GFX9-NEXT:    s_and_b32 s3, s4, 0xff
+; GFX9-NEXT:    s_and_b32 s2, s4, 0xff
+; GFX9-NEXT:    v_mov_b32_e32 v4, 0xff
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
-; GFX9-NEXT:    v_lshlrev_b32_e64 v3, v0, s3
-; GFX9-NEXT:    v_lshlrev_b32_e64 v0, v0, s2
+; GFX9-NEXT:    v_lshlrev_b32_e64 v3, v0, s2
+; GFX9-NEXT:    v_lshlrev_b32_e32 v0, v0, v4
 ; GFX9-NEXT:    v_not_b32_e32 v0, v0
 ; GFX9-NEXT:    v_and_or_b32 v4, v1, v0, v3
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s0
@@ -1984,11 +1984,11 @@ define amdgpu_ps void @insertelement_s_v8i8_s_v(ptr addrspace(4) inreg %ptr, i8
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX8-NEXT:    v_mov_b32_e32 v3, s1
-; GFX8-NEXT:    s_movk_i32 s2, 0xff
-; GFX8-NEXT:    s_and_b32 s3, s4, 0xff
+; GFX8-NEXT:    s_and_b32 s2, s4, 0xff
+; GFX8-NEXT:    v_mov_b32_e32 v4, 0xff
 ; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
-; GFX8-NEXT:    v_lshlrev_b32_e64 v3, v0, s3
-; GFX8-NEXT:    v_lshlrev_b32_e64 v0, v0, s2
+; GFX8-NEXT:    v_lshlrev_b32_e64 v3, v0, s2
+; GFX8-NEXT:    v_lshlrev_b32_e32 v0, v0, v4
 ; GFX8-NEXT:    v_not_b32_e32 v0, v0
 ; GFX8-NEXT:    v_and_b32_e32 v0, v1, v0
 ; GFX8-NEXT:    v_or_b32_e32 v4, v0, v3
@@ -2093,16 +2093,16 @@ define amdgpu_ps void @insertelement_s_v8i8_v_v(ptr addrspace(4) inreg %ptr, i8
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 2, v1
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v2
 ; GFX9-NEXT:    v_and_b32_e32 v1, 3, v1
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 3, v1
-; GFX9-NEXT:    s_movk_i32 s2, 0xff
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v3, s0
 ; GFX9-NEXT:    v_mov_b32_e32 v4, s1
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v2
-; GFX9-NEXT:    v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX9-NEXT:    v_lshlrev_b32_e64 v1, v1, s2
 ; GFX9-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
+; GFX9-NEXT:    v_mov_b32_e32 v4, 0xff
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT:    v_lshlrev_b32_e32 v1, v1, v4
 ; GFX9-NEXT:    v_not_b32_e32 v1, v1
 ; GFX9-NEXT:    v_and_or_b32 v4, v3, v1, v0
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s0
@@ -2119,16 +2119,16 @@ define amdgpu_ps void @insertelement_s_v8i8_v_v(ptr addrspace(4) inreg %ptr, i8
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x0
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v2, 2, v1
+; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v2
 ; GFX8-NEXT:    v_and_b32_e32 v1, 3, v1
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 3, v1
-; GFX8-NEXT:    s_movk_i32 s2, 0xff
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    v_mov_b32_e32 v3, s0
 ; GFX8-NEXT:    v_mov_b32_e32 v4, s1
-; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v2
-; GFX8-NEXT:    v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX8-NEXT:    v_lshlrev_b32_e64 v1, v1, s2
 ; GFX8-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
+; GFX8-NEXT:    v_mov_b32_e32 v4, 0xff
+; GFX8-NEXT:    v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX8-NEXT:    v_lshlrev_b32_e32 v1, v1, v4
 ; GFX8-NEXT:    v_not_b32_e32 v1, v1
 ; GFX8-NEXT:    v_and_b32_e32 v1, v3, v1
 ; GFX8-NEXT:    v_or_b32_e32 v4, v1, v0
@@ -2231,21 +2231,21 @@ define amdgpu_ps void @insertelement_v_v8i8_s_v(ptr addrspace(1) %ptr, i8 inreg
 ; GFX9-LABEL: insertelement_v_v8i8_s_v:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
-; GFX9-NEXT:    v_lshrrev_b32_e32 v5, 2, v2
+; GFX9-NEXT:    v_lshrrev_b32_e32 v6, 2, v2
 ; GFX9-NEXT:    v_and_b32_e32 v2, 3, v2
-; GFX9-NEXT:    s_movk_i32 s0, 0xff
-; GFX9-NEXT:    s_and_b32 s1, s2, 0xff
+; GFX9-NEXT:    v_mov_b32_e32 v5, 0xff
+; GFX9-NEXT:    s_and_b32 s0, s2, 0xff
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 3, v2
-; GFX9-NEXT:    v_lshlrev_b32_e64 v6, v2, s1
-; GFX9-NEXT:    v_lshlrev_b32_e64 v2, v2, s0
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v5
+; GFX9-NEXT:    v_lshlrev_b32_e64 v7, v2, s0
+; GFX9-NEXT:    v_lshlrev_b32_e32 v2, v2, v5
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v6
 ; GFX9-NEXT:    v_not_b32_e32 v2, v2
 ; GFX9-NEXT:    v_mov_b32_e32 v3, 0
-; GFX9-NEXT:    v_cmp_eq_u32_e64 s[0:1], 0, v5
+; GFX9-NEXT:    v_cmp_eq_u32_e64 s[0:1], 0, v6
 ; GFX9-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_cndmask_b32_e32 v7, v0, v1, vcc
-; GFX9-NEXT:    v_and_or_b32 v2, v7, v2, v6
+; GFX9-NEXT:    v_cndmask_b32_e32 v5, v0, v1, vcc
+; GFX9-NEXT:    v_and_or_b32 v2, v5, v2, v7
 ; GFX9-NEXT:    v_cndmask_b32_e64 v0, v0, v2, s[0:1]
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
 ; GFX9-NEXT:    global_store_dwordx2 v[3:4], v[0:1], off
@@ -2254,22 +2254,22 @@ define amdgpu_ps void @insertelement_v_v8i8_s_v(ptr addrspace(1) %ptr, i8 inreg
 ; GFX8-LABEL: insertelement_v_v8i8_s_v:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
-; GFX8-NEXT:    v_lshrrev_b32_e32 v5, 2, v2
+; GFX8-NEXT:    v_lshrrev_b32_e32 v6, 2, v2
 ; GFX8-NEXT:    v_and_b32_e32 v2, 3, v2
-; GFX8-NEXT:    s_movk_i32 s0, 0xff
-; GFX8-NEXT:    s_and_b32 s1, s2, 0xff
+; GFX8-NEXT:    v_mov_b32_e32 v5, 0xff
+; GFX8-NEXT:    s_and_b32 s0, s2, 0xff
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 3, v2
-; GFX8-NEXT:    v_lshlrev_b32_e64 v6, v2, s1
-; GFX8-NEXT:    v_lshlrev_b32_e64 v2, v2, s0
-; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v5
+; GFX8-NEXT:    v_lshlrev_b32_e64 v7, v2, s0
+; GFX8-NEXT:    v_lshlrev_b32_e32 v2, v2, v5
+; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v6
 ; GFX8-NEXT:    v_not_b32_e32 v2, v2
 ; GFX8-NEXT:    v_mov_b32_e32 v3, 0
-; GFX8-NEXT:    v_cmp_eq_u32_e64 s[0:1], 0, v5
+; GFX8-NEXT:    v_cmp_eq_u32_e64 s[0:1], 0, v6
 ; GFX8-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    v_cndmask_b32_e32 v7, v0, v1, vcc
-; GFX8-NEXT:    v_and_b32_e32 v2, v7, v2
-; GFX8-NEXT:    v_or_b32_e32 v2, v2, v6
+; GFX8-NEXT:    v_cndmask_b32_e32 v5, v0, v1, vcc
+; GFX8-NEXT:    v_and_b32_e32 v2, v5, v2
+; GFX8-NEXT:    v_or_b32_e32 v2, v2, v7
 ; GFX8-NEXT:    v_cndmask_b32_e64 v0, v0, v2, s[0:1]
 ; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
 ; GFX8-NEXT:    flat_store_dwordx2 v[3:4], v[0:1]
@@ -2482,20 +2482,20 @@ define amdgpu_ps void @insertelement_v_v8i8_v_v(ptr addrspace(1) %ptr, i8 %val,
 ; GFX9-LABEL: insertelement_v_v8i8_v_v:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    global_load_dwordx2 v[0:1], v[0:1], off
-; GFX9-NEXT:    v_lshrrev_b32_e32 v6, 2, v3
+; GFX9-NEXT:    v_lshrrev_b32_e32 v7, 2, v3
 ; GFX9-NEXT:    v_and_b32_e32 v3, 3, v3
-; GFX9-NEXT:    s_movk_i32 s0, 0xff
+; GFX9-NEXT:    v_mov_b32_e32 v6, 0xff
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v3, 3, v3
 ; GFX9-NEXT:    v_lshlrev_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX9-NEXT:    v_lshlrev_b32_e64 v3, v3, s0
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v6
+; GFX9-NEXT:    v_lshlrev_b32_e32 v3, v3, v6
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v7
 ; GFX9-NEXT:    v_not_b32_e32 v3, v3
 ; GFX9-NEXT:    v_mov_b32_e32 v4, 0
-; GFX9-NEXT:    v_cmp_eq_u32_e64 s[0:1], 0, v6
+; GFX9-NEXT:    v_cmp_eq_u32_e64 s[0:1], 0, v7
 ; GFX9-NEXT:    v_mov_b32_e32 v5, 0
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_cndmask_b32_e32 v7, v0, v1, vcc
-; GFX9-NEXT:    v_and_or_b32 v2, v7, v3, v2
+; GFX9-NEXT:    v_cndmask_b32_e32 v6, v0, v1, vcc
+; GFX9-NEXT:    v_and_or_b32 v2, v6, v3, v2
 ; GFX9-NEXT:    v_cndmask_b32_e64 v0, v0, v2, s[0:1]
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
 ; GFX9-NEXT:    global_store_dwordx2 v[4:5], v[0:1], off
@@ -2504,20 +2504,20 @@ define amdgpu_ps void @insertelement_v_v8i8_v_v(ptr addrspace(1) %ptr, i8 %val,
 ; GFX8-LABEL: insertelement_v_v8i8_v_v:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    flat_load_dwordx2 v[0:1], v[0:1]
-; GFX8-NEXT:    v_lshrrev_b32_e32 v6, 2, v3
+; GFX8-NEXT:    v_lshrrev_b32_e32 v7, 2, v3
 ; GFX8-NEXT:    v_and_b32_e32 v3, 3, v3
-; GFX8-NEXT:    s_movk_i32 s0, 0xff
+; GFX8-NEXT:    v_mov_b32_e32 v6, 0xff
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 3, v3
 ; GFX8-NEXT:    v_lshlrev_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX8-NEXT:    v_lshlrev_b32_e64 v3, v3, s0
-; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v6
+; GFX8-NEXT:    v_lshlrev_b32_e32 v3, v3, v6
+; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v7
 ; GFX8-NEXT:    v_not_b32_e32 v3, v3
 ; GFX8-NEXT:    v_mov_b32_e32 v4, 0
-; GFX8-NEXT:    v_cmp_eq_u32_e64 s[0:1], 0, v6
+; GFX8-NEXT:    v_cmp_eq_u32_e64 s[0:1], 0, v7
 ; GFX8-NEXT:    v_mov_b32_e32 v5, 0
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    v_cndmask_b32_e32 v7, v0, v1, vcc
-; GFX8-NEXT:    v_and_b32_e32 v3, v7, v3
+; GFX8-NEXT:    v_cndmask_b32_e32 v6, v0, v1, vcc
+; GFX8-NEXT:    v_and_b32_e32 v3, v6, v3
 ; GFX8-NEXT:    v_or_b32_e32 v2, v3, v2
 ; GFX8-NEXT:    v_cndmask_b32_e64 v0, v0, v2, s[0:1]
 ; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
@@ -3134,21 +3134,21 @@ define amdgpu_ps void @insertelement_s_v16i8_s_v(ptr addrspace(4) inreg %ptr, i8
 ; GFX9-NEXT:    s_load_dwordx4 s[8:11], s[2:3], 0x0
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v4, 2, v0
 ; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v4
-; GFX9-NEXT:    v_and_b32_e32 v0, 3, v0
 ; GFX9-NEXT:    v_cmp_eq_u32_e64 s[0:1], 2, v4
+; GFX9-NEXT:    v_and_b32_e32 v0, 3, v0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s8
 ; GFX9-NEXT:    v_mov_b32_e32 v2, s9
 ; GFX9-NEXT:    v_mov_b32_e32 v3, s10
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
+; GFX9-NEXT:    v_cndmask_b32_e64 v1, v1, v3, s[0:1]
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
-; GFX9-NEXT:    s_movk_i32 s5, 0xff
 ; GFX9-NEXT:    s_and_b32 s4, s4, 0xff
+; GFX9-NEXT:    v_mov_b32_e32 v3, 0xff
 ; GFX9-NEXT:    v_mov_b32_e32 v5, s11
-; GFX9-NEXT:    v_cndmask_b32_e64 v1, v1, v3, s[0:1]
 ; GFX9-NEXT:    v_cmp_eq_u32_e64 s[2:3], 3, v4
 ; GFX9-NEXT:    v_lshlrev_b32_e64 v2, v0, s4
-; GFX9-NEXT:    v_lshlrev_b32_e64 v0, v0, s5
+; GFX9-NEXT:    v_lshlrev_b32_e32 v0, v0, v3
 ; GFX9-NEXT:    v_cndmask_b32_e64 v1, v1, v5, s[2:3]
 ; GFX9-NEXT:    v_not_b32_e32 v0, v0
 ; GFX9-NEXT:    v_and_or_b32 v6, v1, v0, v2
@@ -3171,21 +3171,21 @@ define amdgpu_ps void @insertelement_s_v16i8_s_v(ptr addrspace(4) inreg %ptr, i8
 ; GFX8-NEXT:    s_load_dwordx4 s[8:11], s[2:3], 0x0
 ; GFX8-NEXT:    v_lshrrev_b32_e32 v4, 2, v0
 ; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v4
-; GFX8-NEXT:    v_and_b32_e32 v0, 3, v0
 ; GFX8-NEXT:    v_cmp_eq_u32_e64 s[0:1], 2, v4
+; GFX8-NEXT:    v_and_b32_e32 v0, 3, v0
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s8
 ; GFX8-NEXT:    v_mov_b32_e32 v2, s9
 ; GFX8-NEXT:    v_mov_b32_e32 v3, s10
 ; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
+; GFX8-NEXT:    v_cndmask_b32_e64 v1, v1, v3, s[0:1]
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
-; GFX8-NEXT:    s_movk_i32 s5, 0xff
 ; GFX8-NEXT:    s_and_b32 s4, s4, 0xff
+; GFX8-NEXT:    v_mov_b32_e32 v3, 0xff
 ; GFX8-NEXT:    v_mov_b32_e32 v5, s11
-; GFX8-NEXT:    v_cndmask_b32_e64 v1, v1, v3, s[0:1]
 ; GFX8-NEXT:    v_cmp_eq_u32_e64 s[2:3], 3, v4
 ; GFX8-NEXT:    v_lshlrev_b32_e64 v2, v0, s4
-; GFX8-NEXT:    v_lshlrev_b32_e64 v0, v0, s5
+; GFX8-NEXT:    v_lshlrev_b32_e32 v0, v0, v3
 ; GFX8-NEXT:    v_cndmask_b32_e64 v1, v1, v5, s[2:3]
 ; GFX8-NEXT:    v_not_b32_e32 v0, v0
 ; GFX8-NEXT:    v_and_b32_e32 v0, v1, v0
@@ -3330,12 +3330,12 @@ define amdgpu_ps void @insertelement_s_v16i8_v_v(ptr addrspace(4) inreg %ptr, i8
 ; GFX9-NEXT:    v_mov_b32_e32 v5, s6
 ; GFX9-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 3, v1
-; GFX9-NEXT:    s_movk_i32 s8, 0xff
+; GFX9-NEXT:    v_mov_b32_e32 v3, 0xff
 ; GFX9-NEXT:    v_mov_b32_e32 v6, s7
 ; GFX9-NEXT:    v_cndmask_b32_e64 v2, v2, v5, s[0:1]
 ; GFX9-NEXT:    v_cmp_eq_u32_e64 s[2:3], 3, v4
 ; GFX9-NEXT:    v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX9-NEXT:    v_lshlrev_b32_e64 v1, v1, s8
+; GFX9-NEXT:    v_lshlrev_b32_e32 v1, v1, v3
 ; GFX9-NEXT:    v_cndmask_b32_e64 v2, v2, v6, s[2:3]
 ; GFX9-NEXT:    v_not_b32_e32 v1, v1
 ; GFX9-NEXT:    v_and_or_b32 v6, v2, v1, v0
@@ -3366,12 +3366,12 @@ define amdgpu_ps void @insertelement_s_v16i8_v_v(ptr addrspace(4) inreg %ptr, i8
 ; GFX8-NEXT:    v_mov_b32_e32 v5, s6
 ; GFX8-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 3, v1
-; GFX8-NEXT:    s_movk_i32 s8, 0xff
+; GFX8-NEXT:    v_mov_b32_e32 v3, 0xff
 ; GFX8-NEXT:    v_mov_b32_e32 v6, s7
 ; GFX8-NEXT:    v_cndmask_b32_e64 v2, v2, v5, s[0:1]
 ; GFX8-NEXT:    v_cmp_eq_u32_e64 s[2:3], 3, v4
 ; GFX8-NEXT:    v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX8-NEXT:    v_lshlrev_b32_e64 v1, v1, s8
+; GFX8-NEXT:    v_lshlrev_b32_e32 v1, v1, v3
 ; GFX8-NEXT:    v_cndmask_b32_e64 v2, v2, v6, s[2:3]
 ; GFX8-NEXT:    v_not_b32_e32 v1, v1
 ; GFX8-NEXT:    v_and_b32_e32 v1, v2, v1
@@ -3508,25 +3508,25 @@ define amdgpu_ps void @insertelement_v_v16i8_s_v(ptr addrspace(1) %ptr, i8 inreg
 ; GFX9-LABEL: insertelement_v_v16i8_s_v:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    global_load_dwordx4 v[3:6], v[0:1], off
-; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 2, v2
-; GFX9-NEXT:    v_and_b32_e32 v1, 3, v2
-; GFX9-NEXT:    s_movk_i32 s0, 0xff
-; GFX9-NEXT:    s_and_b32 s1, s2, 0xff
-; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 3, v1
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX9-NEXT:    v_lshlrev_b32_e64 v2, v1, s1
-; GFX9-NEXT:    v_lshlrev_b32_e64 v1, v1, s0
-; GFX9-NEXT:    v_cmp_eq_u32_e64 s[0:1], 2, v0
-; GFX9-NEXT:    v_cmp_eq_u32_e64 s[2:3], 3, v0
-; GFX9-NEXT:    v_not_b32_e32 v1, v1
+; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 2, v2
+; GFX9-NEXT:    v_and_b32_e32 v2, 3, v2
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0xff
+; GFX9-NEXT:    s_and_b32 s0, s2, 0xff
+; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 3, v2
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v1
+; GFX9-NEXT:    v_lshlrev_b32_e64 v9, v2, s0
+; GFX9-NEXT:    v_lshlrev_b32_e32 v0, v2, v0
+; GFX9-NEXT:    v_cmp_eq_u32_e64 s[0:1], 2, v1
+; GFX9-NEXT:    v_cmp_eq_u32_e64 s[2:3], 3, v1
+; GFX9-NEXT:    v_not_b32_e32 v0, v0
 ; GFX9-NEXT:    v_mov_b32_e32 v7, 0
-; GFX9-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v0
+; GFX9-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v1
 ; GFX9-NEXT:    v_mov_b32_e32 v8, 0
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_cndmask_b32_e32 v9, v3, v4, vcc
-; GFX9-NEXT:    v_cndmask_b32_e64 v9, v9, v5, s[0:1]
-; GFX9-NEXT:    v_cndmask_b32_e64 v9, v9, v6, s[2:3]
-; GFX9-NEXT:    v_and_or_b32 v9, v9, v1, v2
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, v3, v4, vcc
+; GFX9-NEXT:    v_cndmask_b32_e64 v2, v2, v5, s[0:1]
+; GFX9-NEXT:    v_cndmask_b32_e64 v2, v2, v6, s[2:3]
+; GFX9-NEXT:    v_and_or_b32 v9, v2, v0, v9
 ; GFX9-NEXT:    v_cndmask_b32_e64 v0, v3, v9, s[4:5]
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v4, v9, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e64 v2, v5, v9, s[0:1]
@@ -3537,26 +3537,26 @@ define amdgpu_ps void @insertelement_v_v16i8_s_v(ptr addrspace(1) %ptr, i8 inreg
 ; GFX8-LABEL: insertelement_v_v16i8_s_v:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    flat_load_dwordx4 v[3:6], v[0:1]
-; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 2, v2
-; GFX8-NEXT:    v_and_b32_e32 v1, 3, v2
-; GFX8-NEXT:    s_movk_i32 s0, 0xff
-; GFX8-NEXT:    s_and_b32 s1, s2, 0xff
-; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 3, v1
-; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX8-NEXT:    v_lshlrev_b32_e64 v2, v1, s1
-; GFX8-NEXT:    v_lshlrev_b32_e64 v1, v1, s0
-; GFX8-NEXT:    v_cmp_eq_u32_e64 s[0:1], 2, v0
-; GFX8-NEXT:    v_cmp_eq_u32_e64 s[2:3], 3, v0
-; GFX8-NEXT:    v_not_b32_e32 v1, v1
+; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 2, v2
+; GFX8-NEXT:    v_and_b32_e32 v2, 3, v2
+; GFX8-NEXT:    v_mov_b32_e32 v0, 0xff
+; GFX8-NEXT:    s_and_b32 s0, s2, 0xff
+; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 3, v2
+; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v1
+; GFX8-NEXT:    v_lshlrev_b32_e64 v9, v2, s0
+; GFX8-NEXT:    v_lshlrev_b32_e32 v0, v2, v0
+; GFX8-NEXT:    v_cmp_eq_u32_e64 s[0:1], 2, v1
+; GFX8-NEXT:    v_cmp_eq_u32_e64 s[2:3], 3, v1
+; GFX8-NEXT:    v_not_b32_e32 v0, v0
 ; GFX8-NEXT:    v_mov_b32_e32 v7, 0
-; GFX8-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v0
+; GFX8-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v1
 ; GFX8-NEXT:    v_mov_b32_e32 v8, 0
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    v_cndmask_b32_e32 v9, v3, v4, vcc
-; GFX8-NEXT:    v_cndmask_b32_e64 v9, v9, v5, s[0:1]
-; GFX8-NEXT:    v_cndmask_b32_e64 v9, v9, v6, s[2:3]
-; GFX8-NEXT:    v_and_b32_e32 v1, v9, v1
-; GFX8-NEXT:    v_or_b32_e32 v9, v1, v2
+; GFX8-NEXT:    v_cndmask_b32_e32 v2, v3, v4, vcc
+; GFX8-NEXT:    v_cndmask_b32_e64 v2, v2, v5, s[0:1]
+; GFX8-NEXT:    v_cndmask_b32_e64 v2, v2, v6, s[2:3]
+; GFX8-NEXT:    v_and_b32_e32 v0, v2, v0
+; GFX8-NEXT:    v_or_b32_e32 v9, v0, v9
 ; GFX8-NEXT:    v_cndmask_b32_e64 v0, v3, v9, s[4:5]
 ; GFX8-NEXT:    v_cndmask_b32_e32 v1, v4, v9, vcc
 ; GFX8-NEXT:    v_cndmask_b32_e64 v2, v5, v9, s[0:1]
@@ -3820,24 +3820,24 @@ define amdgpu_ps void @insertelement_v_v16i8_v_v(ptr addrspace(1) %ptr, i8 %val,
 ; GFX9-LABEL: insertelement_v_v16i8_v_v:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    global_load_dwordx4 v[4:7], v[0:1], off
-; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 2, v3
-; GFX9-NEXT:    v_and_b32_e32 v1, 3, v3
-; GFX9-NEXT:    s_movk_i32 s0, 0xff
-; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 3, v1
-; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX9-NEXT:    v_lshlrev_b32_sdwa v2, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX9-NEXT:    v_lshlrev_b32_e64 v1, v1, s0
-; GFX9-NEXT:    v_cmp_eq_u32_e64 s[0:1], 2, v0
-; GFX9-NEXT:    v_cmp_eq_u32_e64 s[2:3], 3, v0
-; GFX9-NEXT:    v_not_b32_e32 v1, v1
+; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 2, v3
+; GFX9-NEXT:    v_and_b32_e32 v3, 3, v3
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0xff
+; GFX9-NEXT:    v_lshlrev_b32_e32 v3, 3, v3
+; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v1
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX9-NEXT:    v_lshlrev_b32_e32 v0, v3, v0
+; GFX9-NEXT:    v_cmp_eq_u32_e64 s[0:1], 2, v1
+; GFX9-NEXT:    v_cmp_eq_u32_e64 s[2:3], 3, v1
+; GFX9-NEXT:    v_not_b32_e32 v0, v0
 ; GFX9-NEXT:    v_mov_b32_e32 v8, 0
-; GFX9-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v0
+; GFX9-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v1
 ; GFX9-NEXT:    v_mov_b32_e32 v9, 0
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_cndmask_b32_e32 v3, v4, v5, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e64 v3, v3, v6, s[0:1]
 ; GFX9-NEXT:    v_cndmask_b32_e64 v3, v3, v7, s[2:3]
-; GFX9-NEXT:    v_and_or_b32 v3, v3, v1, v2
+; GFX9-NEXT:    v_and_or_b32 v3, v3, v0, v2
 ; GFX9-NEXT:    v_cndmask_b32_e64 v0, v4, v3, s[4:5]
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v5, v3, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e64 v2, v6, v3, s[0:1]
@@ -3848,25 +3848,25 @@ define amdgpu_ps void @insertelement_v_v16i8_v_v(ptr addrspace(1) %ptr, i8 %val,
 ; GFX8-LABEL: insertelement_v_v16i8_v_v:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    flat_load_dwordx4 v[4:7], v[0:1]
-; GFX8-NEXT:    v_lshrrev_b32_e32 v0, 2, v3
-; GFX8-NEXT:    v_and_b32_e32 v1, 3, v3
-; GFX8-NEXT:    s_movk_i32 s0, 0xff
-; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 3, v1
-; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; GFX8-NEXT:    v_lshlrev_b32_sdwa v2, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX8-NEXT:    v_lshlrev_b32_e64 v1, v1, s0
-; GFX8-NEXT:    v_cmp_eq_u32_e64 s[0:1], 2, v0
-; GFX8-NEXT:    v_cmp_eq_u32_e64 s[2:3], 3, v0
-; GFX8-NEXT:    v_not_b32_e32 v1, v1
+; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 2, v3
+; GFX8-NEXT:    v_and_b32_e32 v3, 3, v3
+; GFX8-NEXT:    v_mov_b32_e32 v0, 0xff
+; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 3, v3
+; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v1
+; GFX8-NEXT:    v_lshlrev_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX8-NEXT:    v_lshlrev_b32_e32 v0, v3, v0
+; GFX8-NEXT:    v_cmp_eq_u32_e64 s[0:1], 2, v1
+; GFX8-NEXT:    v_cmp_eq_u32_e64 s[2:3], 3, v1
+; GFX8-NEXT:    v_not_b32_e32 v0, v0
 ; GFX8-NEXT:    v_mov_b32_e32 v8, 0
-; GFX8-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v0
+; GFX8-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v1
 ; GFX8-NEXT:    v_mov_b32_e32 v9, 0
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    v_cndmask_b32_e32 v3, v4, v5, vcc
 ; GFX8-NEXT:    v_cndmask_b32_e64 v3, v3, v6, s[0:1]
 ; GFX8-NEXT:    v_cndmask_b32_e64 v3, v3, v7, s[2:3]
-; GFX8-NEXT:    v_and_b32_e32 v1, v3, v1
-; GFX8-NEXT:    v_or_b32_e32 v3, v1, v2
+; GFX8-NEXT:    v_and_b32_e32 v0, v3, v0
+; GFX8-NEXT:    v_or_b32_e32 v3, v0, v2
 ; GFX8-NEXT:    v_cndmask_b32_e64 v0, v4, v3, s[4:5]
 ; GFX8-NEXT:    v_cndmask_b32_e32 v1, v5, v3, vcc
 ; GFX8-NEXT:    v_cndmask_b32_e64 v2, v6, v3, s[0:1]
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.intersect_ray.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.intersect_ray.ll
index d62454c8bd0b605..400e9cac4e87155 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.intersect_ray.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.intersect_ray.ll
@@ -56,8 +56,8 @@ define amdgpu_ps <4 x float> @image_bvh_intersect_ray_a16(i32 %node_ptr, float %
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v10, 16, v10
 ; GFX10-NEXT:    v_alignbit_b32 v7, v8, v7, 16
-; GFX10-NEXT:    v_and_or_b32 v5, v5, 0xffff, v9
-; GFX10-NEXT:    v_and_or_b32 v6, v6, 0xffff, v10
+; GFX10-NEXT:    v_and_or_b32 v5, 0xffff, v5, v9
+; GFX10-NEXT:    v_and_or_b32 v6, 0xffff, v6, v10
 ; GFX10-NEXT:    image_bvh_intersect_ray v[0:3], v[0:7], s[0:3] a16
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    ; return to shader part epilog
@@ -119,8 +119,8 @@ define amdgpu_ps <4 x float> @image_bvh64_intersect_ray_a16(i64 %node_ptr, float
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v10, 16, v10
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v11, 16, v11
 ; GFX10-NEXT:    v_alignbit_b32 v8, v9, v8, 16
-; GFX10-NEXT:    v_and_or_b32 v6, v6, 0xffff, v10
-; GFX10-NEXT:    v_and_or_b32 v7, v7, 0xffff, v11
+; GFX10-NEXT:    v_and_or_b32 v6, 0xffff, v6, v10
+; GFX10-NEXT:    v_and_or_b32 v7, 0xffff, v7, v11
 ; GFX10-NEXT:    image_bvh64_intersect_ray v[0:3], v[0:8], s[0:3] a16
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    ; return to shader part epilog
@@ -265,8 +265,8 @@ define amdgpu_ps <4 x float> @image_bvh_intersect_ray_a16_vgpr_descr(i32 %node_p
 ; GFX1030-NEXT:    v_mov_b32_e32 v17, v4
 ; GFX1030-NEXT:    v_alignbit_b32 v20, v2, v7, 16
 ; GFX1030-NEXT:    s_mov_b32 s1, exec_lo
-; GFX1030-NEXT:    v_and_or_b32 v18, v5, 0xffff, v0
-; GFX1030-NEXT:    v_and_or_b32 v19, v6, 0xffff, v1
+; GFX1030-NEXT:    v_and_or_b32 v18, 0xffff, v5, v0
+; GFX1030-NEXT:    v_and_or_b32 v19, 0xffff, v6, v1
 ; GFX1030-NEXT:  .LBB7_1: ; =>This Inner Loop Header: Depth=1
 ; GFX1030-NEXT:    v_readfirstlane_b32 s4, v9
 ; GFX1030-NEXT:    v_readfirstlane_b32 s5, v10
@@ -303,8 +303,8 @@ define amdgpu_ps <4 x float> @image_bvh_intersect_ray_a16_vgpr_descr(i32 %node_p
 ; GFX1013-NEXT:    v_lshlrev_b32_e32 v13, 16, v13
 ; GFX1013-NEXT:    v_lshlrev_b32_e32 v14, 16, v14
 ; GFX1013-NEXT:    v_alignbit_b32 v7, v8, v7, 16
-; GFX1013-NEXT:    v_and_or_b32 v5, v5, 0xffff, v13
-; GFX1013-NEXT:    v_and_or_b32 v6, v6, 0xffff, v14
+; GFX1013-NEXT:    v_and_or_b32 v5, 0xffff, v5, v13
+; GFX1013-NEXT:    v_and_or_b32 v6, 0xffff, v6, v14
 ; GFX1013-NEXT:  .LBB7_1: ; =>This Inner Loop Header: Depth=1
 ; GFX1013-NEXT:    v_readfirstlane_b32 s4, v9
 ; GFX1013-NEXT:    v_readfirstlane_b32 s5, v10
@@ -496,8 +496,8 @@ define amdgpu_ps <4 x float> @image_bvh64_intersect_ray_a16_vgpr_descr(i64 %node
 ; GFX1030-NEXT:    v_mov_b32_e32 v18, v4
 ; GFX1030-NEXT:    v_mov_b32_e32 v19, v5
 ; GFX1030-NEXT:    v_alignbit_b32 v22, v2, v8, 16
-; GFX1030-NEXT:    v_and_or_b32 v20, v6, 0xffff, v0
-; GFX1030-NEXT:    v_and_or_b32 v21, v7, 0xffff, v1
+; GFX1030-NEXT:    v_and_or_b32 v20, 0xffff, v6, v0
+; GFX1030-NEXT:    v_and_or_b32 v21, 0xffff, v7, v1
 ; GFX1030-NEXT:    s_mov_b32 s1, exec_lo
 ; GFX1030-NEXT:  .LBB9_1: ; =>This Inner Loop Header: Depth=1
 ; GFX1030-NEXT:    v_readfirstlane_b32 s4, v10
@@ -536,8 +536,8 @@ define amdgpu_ps <4 x float> @image_bvh64_intersect_ray_a16_vgpr_descr(i64 %node
 ; GFX1013-NEXT:    v_lshlrev_b32_e32 v14, 16, v14
 ; GFX1013-NEXT:    v_lshlrev_b32_e32 v15, 16, v15
 ; GFX1013-NEXT:    v_alignbit_b32 v8, v9, v8, 16
-; GFX1013-NEXT:    v_and_or_b32 v6, v6, 0xffff, v14
-; GFX1013-NEXT:    v_and_or_b32 v7, v7, 0xffff, v15
+; GFX1013-NEXT:    v_and_or_b32 v6, 0xffff, v6, v14
+; GFX1013-NEXT:    v_and_or_b32 v7, 0xffff, v7, v15
 ; GFX1013-NEXT:  .LBB9_1: ; =>This Inner Loop Header: Depth=1
 ; GFX1013-NEXT:    v_readfirstlane_b32 s4, v10
 ; GFX1013-NEXT:    v_readfirstlane_b32 s5, v11
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sdot4.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sdot4.ll
index d84286905ecc3dd..6a79ad85a9a2876 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sdot4.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sdot4.ll
@@ -41,19 +41,19 @@ define i32 @v_sdot4_cast_v4i8(<4 x i8> %a, <4 x i8> %b, i32 %c) {
 ; GFX906-LABEL: v_sdot4_cast_v4i8:
 ; GFX906:       ; %bb.0:
 ; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX906-NEXT:    s_mov_b32 s5, 8
-; GFX906-NEXT:    s_movk_i32 s4, 0xff
-; GFX906-NEXT:    v_lshlrev_b32_sdwa v1, s5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX906-NEXT:    v_and_or_b32 v0, v0, s4, v1
+; GFX906-NEXT:    v_mov_b32_e32 v10, 8
+; GFX906-NEXT:    v_mov_b32_e32 v9, 0xff
+; GFX906-NEXT:    v_lshlrev_b32_sdwa v1, v10, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX906-NEXT:    v_and_or_b32 v0, v0, v9, v1
 ; GFX906-NEXT:    v_and_b32_e32 v1, 0xff, v2
 ; GFX906-NEXT:    v_and_b32_e32 v2, 0xff, v3
 ; GFX906-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX906-NEXT:    v_lshlrev_b32_e32 v2, 24, v2
 ; GFX906-NEXT:    v_or3_b32 v0, v0, v1, v2
-; GFX906-NEXT:    v_lshlrev_b32_sdwa v1, s5, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX906-NEXT:    v_lshlrev_b32_sdwa v1, v10, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; GFX906-NEXT:    v_and_b32_e32 v2, 0xff, v6
 ; GFX906-NEXT:    v_and_b32_e32 v3, 0xff, v7
-; GFX906-NEXT:    v_and_or_b32 v1, v4, s4, v1
+; GFX906-NEXT:    v_and_or_b32 v1, v4, v9, v1
 ; GFX906-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
 ; GFX906-NEXT:    v_lshlrev_b32_e32 v3, 24, v3
 ; GFX906-NEXT:    v_or3_b32 v1, v1, v2, v3
@@ -63,17 +63,17 @@ define i32 @v_sdot4_cast_v4i8(<4 x i8> %a, <4 x i8> %b, i32 %c) {
 ; GFX10-LABEL: v_sdot4_cast_v4i8:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    s_mov_b32 s4, 8
-; GFX10-NEXT:    v_lshlrev_b32_sdwa v1, s4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX10-NEXT:    v_and_or_b32 v0, v0, 0xff, v1
+; GFX10-NEXT:    v_mov_b32_e32 v9, 8
+; GFX10-NEXT:    v_lshlrev_b32_sdwa v1, v9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX10-NEXT:    v_and_or_b32 v0, 0xff, v0, v1
 ; GFX10-NEXT:    v_and_b32_e32 v1, 0xff, v2
 ; GFX10-NEXT:    v_and_b32_e32 v2, 0xff, v3
-; GFX10-NEXT:    v_lshlrev_b32_sdwa v3, s4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX10-NEXT:    v_lshlrev_b32_sdwa v3, v9, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; GFX10-NEXT:    v_and_b32_e32 v5, 0xff, v6
 ; GFX10-NEXT:    v_and_b32_e32 v6, 0xff, v7
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 24, v2
-; GFX10-NEXT:    v_and_or_b32 v3, v4, 0xff, v3
+; GFX10-NEXT:    v_and_or_b32 v3, 0xff, v4, v3
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 16, v5
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v5, 24, v6
 ; GFX10-NEXT:    v_or3_b32 v0, v0, v1, v2
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.udot4.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.udot4.ll
index a9d89db460cc559..3e2a78d8fef8184 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.udot4.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.udot4.ll
@@ -41,19 +41,19 @@ define i32 @v_udot4_cast_v4i8(<4 x i8> %a, <4 x i8> %b, i32 %c) {
 ; GFX906-LABEL: v_udot4_cast_v4i8:
 ; GFX906:       ; %bb.0:
 ; GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX906-NEXT:    s_mov_b32 s5, 8
-; GFX906-NEXT:    s_movk_i32 s4, 0xff
-; GFX906-NEXT:    v_lshlrev_b32_sdwa v1, s5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX906-NEXT:    v_and_or_b32 v0, v0, s4, v1
+; GFX906-NEXT:    v_mov_b32_e32 v10, 8
+; GFX906-NEXT:    v_mov_b32_e32 v9, 0xff
+; GFX906-NEXT:    v_lshlrev_b32_sdwa v1, v10, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX906-NEXT:    v_and_or_b32 v0, v0, v9, v1
 ; GFX906-NEXT:    v_and_b32_e32 v1, 0xff, v2
 ; GFX906-NEXT:    v_and_b32_e32 v2, 0xff, v3
 ; GFX906-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX906-NEXT:    v_lshlrev_b32_e32 v2, 24, v2
 ; GFX906-NEXT:    v_or3_b32 v0, v0, v1, v2
-; GFX906-NEXT:    v_lshlrev_b32_sdwa v1, s5, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX906-NEXT:    v_lshlrev_b32_sdwa v1, v10, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; GFX906-NEXT:    v_and_b32_e32 v2, 0xff, v6
 ; GFX906-NEXT:    v_and_b32_e32 v3, 0xff, v7
-; GFX906-NEXT:    v_and_or_b32 v1, v4, s4, v1
+; GFX906-NEXT:    v_and_or_b32 v1, v4, v9, v1
 ; GFX906-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
 ; GFX906-NEXT:    v_lshlrev_b32_e32 v3, 24, v3
 ; GFX906-NEXT:    v_or3_b32 v1, v1, v2, v3
@@ -63,17 +63,17 @@ define i32 @v_udot4_cast_v4i8(<4 x i8> %a, <4 x i8> %b, i32 %c) {
 ; GFX10-LABEL: v_udot4_cast_v4i8:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    s_mov_b32 s4, 8
-; GFX10-NEXT:    v_lshlrev_b32_sdwa v1, s4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX10-NEXT:    v_and_or_b32 v0, v0, 0xff, v1
+; GFX10-NEXT:    v_mov_b32_e32 v9, 8
+; GFX10-NEXT:    v_lshlrev_b32_sdwa v1, v9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX10-NEXT:    v_and_or_b32 v0, 0xff, v0, v1
 ; GFX10-NEXT:    v_and_b32_e32 v1, 0xff, v2
 ; GFX10-NEXT:    v_and_b32_e32 v2, 0xff, v3
-; GFX10-NEXT:    v_lshlrev_b32_sdwa v3, s4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX10-NEXT:    v_lshlrev_b32_sdwa v3, v9, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; GFX10-NEXT:    v_and_b32_e32 v5, 0xff, v6
 ; GFX10-NEXT:    v_and_b32_e32 v6, 0xff, v7
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 24, v2
-; GFX10-NEXT:    v_and_or_b32 v3, v4, 0xff, v3
+; GFX10-NEXT:    v_and_or_b32 v3, 0xff, v4, v3
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 16, v5
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v5, 24, v6
 ; GFX10-NEXT:    v_or3_b32 v0, v0, v1, v2
@@ -92,10 +92,10 @@ define i32 @v_udot4_cast_v4i8(<4 x i8> %a, <4 x i8> %b, i32 %c) {
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v5, 8, v5
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v3, 24, v3
-; GFX11-NEXT:    v_and_or_b32 v0, v0, 0xff, v1
+; GFX11-NEXT:    v_and_or_b32 v0, 0xff, v0, v1
 ; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v6
 ; GFX11-NEXT:    v_and_b32_e32 v6, 0xff, v7
-; GFX11-NEXT:    v_and_or_b32 v4, v4, 0xff, v5
+; GFX11-NEXT:    v_and_or_b32 v4, 0xff, v4, v5
 ; GFX11-NEXT:    v_or3_b32 v0, v0, v2, v3
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v5, 24, v6
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll
index cded5c94edf8cc3..d36f5c0ea89d98a 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll
@@ -231,14 +231,12 @@ define i16 @v_saddsat_v2i8(i16 %lhs.arg, i16 %rhs.arg) {
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v2, 8, v0
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 24, v0
-; GFX6-NEXT:    s_brev_b32 s5, 1
 ; GFX6-NEXT:    v_min_i32_e32 v5, 0, v0
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v3, 8, v1
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 24, v1
-; GFX6-NEXT:    s_brev_b32 s4, -2
 ; GFX6-NEXT:    v_max_i32_e32 v4, 0, v0
-; GFX6-NEXT:    v_sub_i32_e32 v5, vcc, s5, v5
-; GFX6-NEXT:    v_sub_i32_e32 v4, vcc, s4, v4
+; GFX6-NEXT:    v_sub_i32_e32 v5, vcc, 0x80000000, v5
+; GFX6-NEXT:    v_sub_i32_e32 v4, vcc, 0x7fffffff, v4
 ; GFX6-NEXT:    v_max_i32_e32 v1, v5, v1
 ; GFX6-NEXT:    v_min_i32_e32 v1, v1, v4
 ; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
@@ -246,8 +244,8 @@ define i16 @v_saddsat_v2i8(i16 %lhs.arg, i16 %rhs.arg) {
 ; GFX6-NEXT:    v_min_i32_e32 v4, 0, v1
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 24, v3
 ; GFX6-NEXT:    v_max_i32_e32 v3, 0, v1
-; GFX6-NEXT:    v_sub_i32_e32 v4, vcc, s5, v4
-; GFX6-NEXT:    v_sub_i32_e32 v3, vcc, s4, v3
+; GFX6-NEXT:    v_sub_i32_e32 v4, vcc, 0x80000000, v4
+; GFX6-NEXT:    v_sub_i32_e32 v3, vcc, 0x7fffffff, v3
 ; GFX6-NEXT:    v_max_i32_e32 v2, v4, v2
 ; GFX6-NEXT:    v_min_i32_e32 v2, v2, v3
 ; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v1, v2
@@ -300,8 +298,8 @@ define i16 @v_saddsat_v2i8(i16 %lhs.arg, i16 %rhs.arg) {
 ; GFX9-NEXT:    v_pk_lshlrev_b16 v1, 8, v1 op_sel_hi:[0,1]
 ; GFX9-NEXT:    v_pk_add_i16 v0, v0, v1 clamp
 ; GFX9-NEXT:    v_pk_ashrrev_i16 v0, 8, v0 op_sel_hi:[0,1]
-; GFX9-NEXT:    s_movk_i32 s4, 0xff
-; GFX9-NEXT:    v_and_b32_sdwa v1, v0, s4 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT:    v_mov_b32_e32 v1, 0xff
+; GFX9-NEXT:    v_and_b32_sdwa v1, v0, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; GFX9-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -312,14 +310,14 @@ define i16 @v_saddsat_v2i8(i16 %lhs.arg, i16 %rhs.arg) {
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v3, 8, v1
 ; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff, v1
-; GFX10-NEXT:    s_movk_i32 s4, 0xff
 ; GFX10-NEXT:    v_lshl_or_b32 v0, v2, 16, v0
 ; GFX10-NEXT:    v_lshl_or_b32 v1, v3, 16, v1
 ; GFX10-NEXT:    v_pk_lshlrev_b16 v0, 8, v0 op_sel_hi:[0,1]
 ; GFX10-NEXT:    v_pk_lshlrev_b16 v1, 8, v1 op_sel_hi:[0,1]
 ; GFX10-NEXT:    v_pk_add_i16 v0, v0, v1 clamp
+; GFX10-NEXT:    v_mov_b32_e32 v1, 0xff
 ; GFX10-NEXT:    v_pk_ashrrev_i16 v0, 8, v0 op_sel_hi:[0,1]
-; GFX10-NEXT:    v_and_b32_sdwa v1, v0, s4 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX10-NEXT:    v_and_b32_sdwa v1, v0, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; GFX10-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -440,8 +438,8 @@ define amdgpu_ps i16 @s_saddsat_v2i8(i16 inreg %lhs.arg, i16 inreg %rhs.arg) {
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s1
 ; GFX9-NEXT:    v_pk_add_i16 v0, s0, v0 clamp
 ; GFX9-NEXT:    v_pk_ashrrev_i16 v0, 8, v0 op_sel_hi:[0,1]
-; GFX9-NEXT:    s_movk_i32 s0, 0xff
-; GFX9-NEXT:    v_and_b32_sdwa v1, v0, s0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT:    v_mov_b32_e32 v1, 0xff
+; GFX9-NEXT:    v_and_b32_sdwa v1, v0, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; GFX9-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX9-NEXT:    v_readfirstlane_b32 s0, v0
 ; GFX9-NEXT:    ; return to shader part epilog
@@ -460,10 +458,10 @@ define amdgpu_ps i16 @s_saddsat_v2i8(i16 inreg %lhs.arg, i16 inreg %rhs.arg) {
 ; GFX10-NEXT:    s_lshl_b32 s3, s3, 8
 ; GFX10-NEXT:    s_pack_ll_b32_b16 s0, s0, s2
 ; GFX10-NEXT:    s_pack_ll_b32_b16 s1, s1, s3
+; GFX10-NEXT:    v_mov_b32_e32 v1, 0xff
 ; GFX10-NEXT:    v_pk_add_i16 v0, s0, s1 clamp
-; GFX10-NEXT:    s_movk_i32 s0, 0xff
 ; GFX10-NEXT:    v_pk_ashrrev_i16 v0, 8, v0 op_sel_hi:[0,1]
-; GFX10-NEXT:    v_and_b32_sdwa v1, v0, s0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX10-NEXT:    v_and_b32_sdwa v1, v0, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; GFX10-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX10-NEXT:    v_readfirstlane_b32 s0, v0
 ; GFX10-NEXT:    ; return to shader part epilog
@@ -506,40 +504,38 @@ define i32 @v_saddsat_v4i8(i32 %lhs.arg, i32 %rhs.arg) {
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v4, 24, v0
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 24, v0
-; GFX6-NEXT:    s_brev_b32 s5, 1
 ; GFX6-NEXT:    v_min_i32_e32 v10, 0, v0
+; GFX6-NEXT:    v_bfrev_b32_e32 v11, 1
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v5, 8, v1
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v6, 16, v1
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v7, 24, v1
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 24, v1
-; GFX6-NEXT:    s_brev_b32 s4, -2
 ; GFX6-NEXT:    v_max_i32_e32 v8, 0, v0
-; GFX6-NEXT:    v_sub_i32_e32 v10, vcc, s5, v10
-; GFX6-NEXT:    v_sub_i32_e32 v8, vcc, s4, v8
+; GFX6-NEXT:    v_sub_i32_e32 v10, vcc, v11, v10
+; GFX6-NEXT:    v_sub_i32_e32 v8, vcc, 0x7fffffff, v8
 ; GFX6-NEXT:    v_max_i32_e32 v1, v10, v1
 ; GFX6-NEXT:    v_min_i32_e32 v1, v1, v8
 ; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 24, v2
 ; GFX6-NEXT:    v_min_i32_e32 v8, 0, v1
+; GFX6-NEXT:    v_bfrev_b32_e32 v9, -2
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 24, v5
 ; GFX6-NEXT:    v_max_i32_e32 v5, 0, v1
-; GFX6-NEXT:    v_sub_i32_e32 v8, vcc, s5, v8
-; GFX6-NEXT:    v_sub_i32_e32 v5, vcc, s4, v5
+; GFX6-NEXT:    v_sub_i32_e32 v8, vcc, v11, v8
+; GFX6-NEXT:    v_sub_i32_e32 v5, vcc, v9, v5
 ; GFX6-NEXT:    v_max_i32_e32 v2, v8, v2
 ; GFX6-NEXT:    v_min_i32_e32 v2, v2, v5
 ; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v1, v2
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 24, v3
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v3, 24, v6
 ; GFX6-NEXT:    v_min_i32_e32 v6, 0, v2
-; GFX6-NEXT:    v_bfrev_b32_e32 v9, -2
 ; GFX6-NEXT:    v_max_i32_e32 v5, 0, v2
-; GFX6-NEXT:    v_sub_i32_e32 v6, vcc, s5, v6
+; GFX6-NEXT:    v_sub_i32_e32 v6, vcc, v11, v6
 ; GFX6-NEXT:    v_sub_i32_e32 v5, vcc, v9, v5
 ; GFX6-NEXT:    v_max_i32_e32 v3, v6, v3
 ; GFX6-NEXT:    v_min_i32_e32 v3, v3, v5
 ; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v3
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v3, 24, v4
-; GFX6-NEXT:    v_bfrev_b32_e32 v11, 1
 ; GFX6-NEXT:    v_min_i32_e32 v6, 0, v3
 ; GFX6-NEXT:    v_ashrrev_i32_e32 v1, 24, v1
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v4, 24, v7
@@ -639,11 +635,11 @@ define i32 @v_saddsat_v4i8(i32 %lhs.arg, i32 %rhs.arg) {
 ; GFX9-NEXT:    v_pk_add_i16 v2, v2, v3 clamp
 ; GFX9-NEXT:    v_pk_add_i16 v0, v0, v1 clamp
 ; GFX9-NEXT:    v_pk_ashrrev_i16 v1, 8, v2 op_sel_hi:[0,1]
-; GFX9-NEXT:    v_mov_b32_e32 v2, 8
+; GFX9-NEXT:    v_mov_b32_e32 v3, 8
 ; GFX9-NEXT:    v_pk_ashrrev_i16 v0, 8, v0 op_sel_hi:[0,1]
-; GFX9-NEXT:    s_movk_i32 s4, 0xff
-; GFX9-NEXT:    v_lshlrev_b32_sdwa v2, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX9-NEXT:    v_and_or_b32 v1, v1, s4, v2
+; GFX9-NEXT:    v_mov_b32_e32 v2, 0xff
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v3, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+; GFX9-NEXT:    v_and_or_b32 v1, v1, v2, v3
 ; GFX9-NEXT:    v_and_b32_e32 v2, 0xff, v0
 ; GFX9-NEXT:    v_mov_b32_e32 v3, 24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
@@ -677,7 +673,7 @@ define i32 @v_saddsat_v4i8(i32 %lhs.arg, i32 %rhs.arg) {
 ; GFX10-NEXT:    v_lshlrev_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
 ; GFX10-NEXT:    v_and_b32_e32 v3, 0xff, v0
 ; GFX10-NEXT:    v_lshlrev_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX10-NEXT:    v_and_or_b32 v1, v2, 0xff, v1
+; GFX10-NEXT:    v_and_or_b32 v1, 0xff, v2, v1
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
 ; GFX10-NEXT:    v_or3_b32 v0, v1, v2, v0
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
@@ -709,7 +705,7 @@ define i32 @v_saddsat_v4i8(i32 %lhs.arg, i32 %rhs.arg) {
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 24, v0
-; GFX11-NEXT:    v_and_or_b32 v1, v1, 0xff, v2
+; GFX11-NEXT:    v_and_or_b32 v1, 0xff, v1, v2
 ; GFX11-NEXT:    v_or3_b32 v0, v1, v3, v0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %lhs = bitcast i32 %lhs.arg to <4 x i8>
@@ -867,46 +863,46 @@ define amdgpu_ps i32 @s_saddsat_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg) {
 ;
 ; GFX9-LABEL: s_saddsat_v4i8:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_lshr_b32 s3, s0, 8
+; GFX9-NEXT:    s_lshr_b32 s2, s0, 8
+; GFX9-NEXT:    s_lshr_b32 s3, s0, 16
+; GFX9-NEXT:    s_lshr_b32 s4, s0, 24
+; GFX9-NEXT:    s_pack_ll_b32_b16 s0, s0, s2
+; GFX9-NEXT:    s_pack_ll_b32_b16 s2, s3, s4
 ; GFX9-NEXT:    s_lshr_b32 s4, s0, 16
-; GFX9-NEXT:    s_lshr_b32 s6, s0, 24
-; GFX9-NEXT:    s_pack_ll_b32_b16 s0, s0, s3
-; GFX9-NEXT:    s_pack_ll_b32_b16 s3, s4, s6
-; GFX9-NEXT:    s_lshr_b32 s6, s0, 16
 ; GFX9-NEXT:    s_lshl_b32 s0, s0, 0x80008
-; GFX9-NEXT:    s_lshl_b32 s6, s6, 8
-; GFX9-NEXT:    s_lshr_b32 s7, s1, 8
-; GFX9-NEXT:    s_pack_ll_b32_b16 s0, s0, s6
-; GFX9-NEXT:    s_lshr_b32 s6, s3, 16
-; GFX9-NEXT:    s_lshr_b32 s8, s1, 16
-; GFX9-NEXT:    s_lshr_b32 s9, s1, 24
-; GFX9-NEXT:    s_pack_ll_b32_b16 s1, s1, s7
-; GFX9-NEXT:    s_lshl_b32 s3, s3, 0x80008
-; GFX9-NEXT:    s_lshl_b32 s6, s6, 8
-; GFX9-NEXT:    s_pack_ll_b32_b16 s3, s3, s6
+; GFX9-NEXT:    s_lshl_b32 s4, s4, 8
+; GFX9-NEXT:    s_lshr_b32 s5, s1, 8
+; GFX9-NEXT:    s_pack_ll_b32_b16 s0, s0, s4
+; GFX9-NEXT:    s_lshr_b32 s4, s2, 16
 ; GFX9-NEXT:    s_lshr_b32 s6, s1, 16
-; GFX9-NEXT:    s_pack_ll_b32_b16 s4, s8, s9
+; GFX9-NEXT:    s_lshr_b32 s7, s1, 24
+; GFX9-NEXT:    s_pack_ll_b32_b16 s1, s1, s5
+; GFX9-NEXT:    s_lshl_b32 s2, s2, 0x80008
+; GFX9-NEXT:    s_lshl_b32 s4, s4, 8
+; GFX9-NEXT:    s_pack_ll_b32_b16 s2, s2, s4
+; GFX9-NEXT:    s_lshr_b32 s4, s1, 16
+; GFX9-NEXT:    s_pack_ll_b32_b16 s3, s6, s7
 ; GFX9-NEXT:    s_lshl_b32 s1, s1, 0x80008
-; GFX9-NEXT:    s_lshl_b32 s6, s6, 8
-; GFX9-NEXT:    s_pack_ll_b32_b16 s1, s1, s6
-; GFX9-NEXT:    s_lshr_b32 s6, s4, 16
-; GFX9-NEXT:    s_lshl_b32 s4, s4, 0x80008
-; GFX9-NEXT:    s_lshl_b32 s6, s6, 8
-; GFX9-NEXT:    s_pack_ll_b32_b16 s4, s4, s6
+; GFX9-NEXT:    s_lshl_b32 s4, s4, 8
+; GFX9-NEXT:    s_pack_ll_b32_b16 s1, s1, s4
+; GFX9-NEXT:    s_lshr_b32 s4, s3, 16
+; GFX9-NEXT:    s_lshl_b32 s3, s3, 0x80008
+; GFX9-NEXT:    s_lshl_b32 s4, s4, 8
+; GFX9-NEXT:    s_pack_ll_b32_b16 s3, s3, s4
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s1
 ; GFX9-NEXT:    v_pk_add_i16 v0, s0, v0 clamp
-; GFX9-NEXT:    v_mov_b32_e32 v1, s4
-; GFX9-NEXT:    s_mov_b32 s2, 8
-; GFX9-NEXT:    v_pk_add_i16 v1, s3, v1 clamp
+; GFX9-NEXT:    v_mov_b32_e32 v1, s3
+; GFX9-NEXT:    v_pk_add_i16 v1, s2, v1 clamp
 ; GFX9-NEXT:    v_pk_ashrrev_i16 v0, 8, v0 op_sel_hi:[0,1]
+; GFX9-NEXT:    v_mov_b32_e32 v3, 8
 ; GFX9-NEXT:    v_pk_ashrrev_i16 v1, 8, v1 op_sel_hi:[0,1]
-; GFX9-NEXT:    s_movk_i32 s0, 0xff
-; GFX9-NEXT:    v_lshlrev_b32_sdwa v2, s2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX9-NEXT:    s_mov_b32 s5, 24
-; GFX9-NEXT:    v_and_or_b32 v0, v0, s0, v2
+; GFX9-NEXT:    v_mov_b32_e32 v2, 0xff
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v3, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+; GFX9-NEXT:    v_and_or_b32 v0, v0, v2, v3
 ; GFX9-NEXT:    v_and_b32_e32 v2, 0xff, v1
+; GFX9-NEXT:    v_mov_b32_e32 v3, 24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX9-NEXT:    v_lshlrev_b32_sdwa v1, s5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
 ; GFX9-NEXT:    v_or3_b32 v0, v0, v2, v1
 ; GFX9-NEXT:    v_readfirstlane_b32 s0, v0
 ; GFX9-NEXT:    ; return to shader part epilog
@@ -941,14 +937,14 @@ define amdgpu_ps i32 @s_saddsat_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg) {
 ; GFX10-NEXT:    s_pack_ll_b32_b16 s3, s3, s5
 ; GFX10-NEXT:    v_pk_add_i16 v0, s0, s1 clamp
 ; GFX10-NEXT:    v_pk_add_i16 v1, s2, s3 clamp
-; GFX10-NEXT:    s_mov_b32 s0, 8
+; GFX10-NEXT:    v_mov_b32_e32 v2, 8
+; GFX10-NEXT:    v_mov_b32_e32 v4, 24
 ; GFX10-NEXT:    v_pk_ashrrev_i16 v0, 8, v0 op_sel_hi:[0,1]
 ; GFX10-NEXT:    v_pk_ashrrev_i16 v1, 8, v1 op_sel_hi:[0,1]
-; GFX10-NEXT:    v_lshlrev_b32_sdwa v2, s0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+; GFX10-NEXT:    v_lshlrev_b32_sdwa v2, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
 ; GFX10-NEXT:    v_and_b32_e32 v3, 0xff, v1
-; GFX10-NEXT:    s_mov_b32 s0, 24
-; GFX10-NEXT:    v_lshlrev_b32_sdwa v1, s0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX10-NEXT:    v_and_or_b32 v0, v0, 0xff, v2
+; GFX10-NEXT:    v_lshlrev_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+; GFX10-NEXT:    v_and_or_b32 v0, 0xff, v0, v2
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
 ; GFX10-NEXT:    v_or3_b32 v0, v0, v2, v1
 ; GFX10-NEXT:    v_readfirstlane_b32 s0, v0
@@ -988,7 +984,7 @@ define amdgpu_ps i32 @s_saddsat_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg) {
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
 ; GFX11-NEXT:    v_and_b32_e32 v3, 0xff, v1
 ; GFX11-NEXT:    v_bfe_u32 v1, v1, 16, 8
-; GFX11-NEXT:    v_and_or_b32 v0, v0, 0xff, v2
+; GFX11-NEXT:    v_and_or_b32 v0, 0xff, v0, v2
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 24, v1
 ; GFX11-NEXT:    v_or3_b32 v0, v0, v2, v1
@@ -1265,19 +1261,17 @@ define <2 x i32> @v_saddsat_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) {
 ; GFX6-LABEL: v_saddsat_v2i32:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT:    s_brev_b32 s5, 1
 ; GFX6-NEXT:    v_min_i32_e32 v5, 0, v0
-; GFX6-NEXT:    s_brev_b32 s4, -2
 ; GFX6-NEXT:    v_max_i32_e32 v4, 0, v0
-; GFX6-NEXT:    v_sub_i32_e32 v5, vcc, s5, v5
-; GFX6-NEXT:    v_sub_i32_e32 v4, vcc, s4, v4
+; GFX6-NEXT:    v_sub_i32_e32 v5, vcc, 0x80000000, v5
+; GFX6-NEXT:    v_sub_i32_e32 v4, vcc, 0x7fffffff, v4
 ; GFX6-NEXT:    v_max_i32_e32 v2, v5, v2
 ; GFX6-NEXT:    v_min_i32_e32 v2, v2, v4
 ; GFX6-NEXT:    v_min_i32_e32 v4, 0, v1
 ; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
 ; GFX6-NEXT:    v_max_i32_e32 v2, 0, v1
-; GFX6-NEXT:    v_sub_i32_e32 v4, vcc, s5, v4
-; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, s4, v2
+; GFX6-NEXT:    v_sub_i32_e32 v4, vcc, 0x80000000, v4
+; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, 0x7fffffff, v2
 ; GFX6-NEXT:    v_max_i32_e32 v3, v4, v3
 ; GFX6-NEXT:    v_min_i32_e32 v2, v3, v2
 ; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v1, v2
@@ -1286,19 +1280,17 @@ define <2 x i32> @v_saddsat_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) {
 ; GFX8-LABEL: v_saddsat_v2i32:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    s_brev_b32 s5, 1
 ; GFX8-NEXT:    v_min_i32_e32 v5, 0, v0
-; GFX8-NEXT:    s_brev_b32 s4, -2
 ; GFX8-NEXT:    v_max_i32_e32 v4, 0, v0
-; GFX8-NEXT:    v_sub_u32_e32 v5, vcc, s5, v5
-; GFX8-NEXT:    v_sub_u32_e32 v4, vcc, s4, v4
+; GFX8-NEXT:    v_sub_u32_e32 v5, vcc, 0x80000000, v5
+; GFX8-NEXT:    v_sub_u32_e32 v4, vcc, 0x7fffffff, v4
 ; GFX8-NEXT:    v_max_i32_e32 v2, v5, v2
 ; GFX8-NEXT:    v_min_i32_e32 v2, v2, v4
 ; GFX8-NEXT:    v_min_i32_e32 v4, 0, v1
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
 ; GFX8-NEXT:    v_max_i32_e32 v2, 0, v1
-; GFX8-NEXT:    v_sub_u32_e32 v4, vcc, s5, v4
-; GFX8-NEXT:    v_sub_u32_e32 v2, vcc, s4, v2
+; GFX8-NEXT:    v_sub_u32_e32 v4, vcc, 0x80000000, v4
+; GFX8-NEXT:    v_sub_u32_e32 v2, vcc, 0x7fffffff, v2
 ; GFX8-NEXT:    v_max_i32_e32 v3, v4, v3
 ; GFX8-NEXT:    v_min_i32_e32 v2, v3, v2
 ; GFX8-NEXT:    v_add_u32_e32 v1, vcc, v1, v2
@@ -1383,26 +1375,25 @@ define <3 x i32> @v_saddsat_v3i32(<3 x i32> %lhs, <3 x i32> %rhs) {
 ; GFX6-LABEL: v_saddsat_v3i32:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT:    s_brev_b32 s5, 1
-; GFX6-NEXT:    v_min_i32_e32 v7, 0, v0
-; GFX6-NEXT:    s_brev_b32 s4, -2
+; GFX6-NEXT:    v_min_i32_e32 v8, 0, v0
 ; GFX6-NEXT:    v_max_i32_e32 v6, 0, v0
-; GFX6-NEXT:    v_sub_i32_e32 v7, vcc, s5, v7
-; GFX6-NEXT:    v_sub_i32_e32 v6, vcc, s4, v6
-; GFX6-NEXT:    v_max_i32_e32 v3, v7, v3
+; GFX6-NEXT:    v_sub_i32_e32 v8, vcc, 0x80000000, v8
+; GFX6-NEXT:    v_sub_i32_e32 v6, vcc, 0x7fffffff, v6
+; GFX6-NEXT:    v_max_i32_e32 v3, v8, v3
 ; GFX6-NEXT:    v_min_i32_e32 v3, v3, v6
 ; GFX6-NEXT:    v_min_i32_e32 v6, 0, v1
+; GFX6-NEXT:    v_bfrev_b32_e32 v7, -2
 ; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v3
 ; GFX6-NEXT:    v_max_i32_e32 v3, 0, v1
-; GFX6-NEXT:    v_sub_i32_e32 v6, vcc, s5, v6
-; GFX6-NEXT:    v_sub_i32_e32 v3, vcc, s4, v3
+; GFX6-NEXT:    v_sub_i32_e32 v6, vcc, 0x80000000, v6
+; GFX6-NEXT:    v_sub_i32_e32 v3, vcc, v7, v3
 ; GFX6-NEXT:    v_max_i32_e32 v4, v6, v4
 ; GFX6-NEXT:    v_min_i32_e32 v3, v4, v3
 ; GFX6-NEXT:    v_min_i32_e32 v4, 0, v2
 ; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v1, v3
 ; GFX6-NEXT:    v_max_i32_e32 v3, 0, v2
-; GFX6-NEXT:    v_sub_i32_e32 v4, vcc, s5, v4
-; GFX6-NEXT:    v_sub_i32_e32 v3, vcc, s4, v3
+; GFX6-NEXT:    v_sub_i32_e32 v4, vcc, 0x80000000, v4
+; GFX6-NEXT:    v_sub_i32_e32 v3, vcc, 0x7fffffff, v3
 ; GFX6-NEXT:    v_max_i32_e32 v4, v4, v5
 ; GFX6-NEXT:    v_min_i32_e32 v3, v4, v3
 ; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v3
@@ -1411,26 +1402,25 @@ define <3 x i32> @v_saddsat_v3i32(<3 x i32> %lhs, <3 x i32> %rhs) {
 ; GFX8-LABEL: v_saddsat_v3i32:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    s_brev_b32 s5, 1
-; GFX8-NEXT:    v_min_i32_e32 v7, 0, v0
-; GFX8-NEXT:    s_brev_b32 s4, -2
+; GFX8-NEXT:    v_min_i32_e32 v8, 0, v0
 ; GFX8-NEXT:    v_max_i32_e32 v6, 0, v0
-; GFX8-NEXT:    v_sub_u32_e32 v7, vcc, s5, v7
-; GFX8-NEXT:    v_sub_u32_e32 v6, vcc, s4, v6
-; GFX8-NEXT:    v_max_i32_e32 v3, v7, v3
+; GFX8-NEXT:    v_sub_u32_e32 v8, vcc, 0x80000000, v8
+; GFX8-NEXT:    v_sub_u32_e32 v6, vcc, 0x7fffffff, v6
+; GFX8-NEXT:    v_max_i32_e32 v3, v8, v3
 ; GFX8-NEXT:    v_min_i32_e32 v3, v3, v6
 ; GFX8-NEXT:    v_min_i32_e32 v6, 0, v1
+; GFX8-NEXT:    v_bfrev_b32_e32 v7, -2
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v0, v3
 ; GFX8-NEXT:    v_max_i32_e32 v3, 0, v1
-; GFX8-NEXT:    v_sub_u32_e32 v6, vcc, s5, v6
-; GFX8-NEXT:    v_sub_u32_e32 v3, vcc, s4, v3
+; GFX8-NEXT:    v_sub_u32_e32 v6, vcc, 0x80000000, v6
+; GFX8-NEXT:    v_sub_u32_e32 v3, vcc, v7, v3
 ; GFX8-NEXT:    v_max_i32_e32 v4, v6, v4
 ; GFX8-NEXT:    v_min_i32_e32 v3, v4, v3
 ; GFX8-NEXT:    v_min_i32_e32 v4, 0, v2
 ; GFX8-NEXT:    v_add_u32_e32 v1, vcc, v1, v3
 ; GFX8-NEXT:    v_max_i32_e32 v3, 0, v2
-; GFX8-NEXT:    v_sub_u32_e32 v4, vcc, s5, v4
-; GFX8-NEXT:    v_sub_u32_e32 v3, vcc, s4, v3
+; GFX8-NEXT:    v_sub_u32_e32 v4, vcc, 0x80000000, v4
+; GFX8-NEXT:    v_sub_u32_e32 v3, vcc, 0x7fffffff, v3
 ; GFX8-NEXT:    v_max_i32_e32 v4, v4, v5
 ; GFX8-NEXT:    v_min_i32_e32 v3, v4, v3
 ; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v2, v3
@@ -1536,26 +1526,26 @@ define <4 x i32> @v_saddsat_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) {
 ; GFX6-LABEL: v_saddsat_v4i32:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT:    s_brev_b32 s5, 1
-; GFX6-NEXT:    v_min_i32_e32 v9, 0, v0
-; GFX6-NEXT:    s_brev_b32 s4, -2
+; GFX6-NEXT:    v_min_i32_e32 v10, 0, v0
+; GFX6-NEXT:    v_bfrev_b32_e32 v11, 1
 ; GFX6-NEXT:    v_max_i32_e32 v8, 0, v0
-; GFX6-NEXT:    v_sub_i32_e32 v9, vcc, s5, v9
-; GFX6-NEXT:    v_sub_i32_e32 v8, vcc, s4, v8
-; GFX6-NEXT:    v_max_i32_e32 v4, v9, v4
+; GFX6-NEXT:    v_sub_i32_e32 v10, vcc, v11, v10
+; GFX6-NEXT:    v_sub_i32_e32 v8, vcc, 0x7fffffff, v8
+; GFX6-NEXT:    v_max_i32_e32 v4, v10, v4
 ; GFX6-NEXT:    v_min_i32_e32 v4, v4, v8
 ; GFX6-NEXT:    v_min_i32_e32 v8, 0, v1
+; GFX6-NEXT:    v_bfrev_b32_e32 v9, -2
 ; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v4
 ; GFX6-NEXT:    v_max_i32_e32 v4, 0, v1
-; GFX6-NEXT:    v_sub_i32_e32 v8, vcc, s5, v8
-; GFX6-NEXT:    v_sub_i32_e32 v4, vcc, s4, v4
+; GFX6-NEXT:    v_sub_i32_e32 v8, vcc, 0x80000000, v8
+; GFX6-NEXT:    v_sub_i32_e32 v4, vcc, v9, v4
 ; GFX6-NEXT:    v_max_i32_e32 v5, v8, v5
 ; GFX6-NEXT:    v_min_i32_e32 v4, v5, v4
 ; GFX6-NEXT:    v_min_i32_e32 v5, 0, v2
 ; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v1, v4
 ; GFX6-NEXT:    v_max_i32_e32 v4, 0, v2
-; GFX6-NEXT:    v_sub_i32_e32 v5, vcc, s5, v5
-; GFX6-NEXT:    v_sub_i32_e32 v4, vcc, s4, v4
+; GFX6-NEXT:    v_sub_i32_e32 v5, vcc, 0x80000000, v5
+; GFX6-NEXT:    v_sub_i32_e32 v4, vcc, v9, v4
 ; GFX6-NEXT:    v_max_i32_e32 v5, v5, v6
 ; GFX6-NEXT:    v_min_i32_e32 v4, v5, v4
 ; GFX6-NEXT:    v_min_i32_e32 v5, 0, v3
@@ -1571,26 +1561,26 @@ define <4 x i32> @v_saddsat_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) {
 ; GFX8-LABEL: v_saddsat_v4i32:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    s_brev_b32 s5, 1
-; GFX8-NEXT:    v_min_i32_e32 v9, 0, v0
-; GFX8-NEXT:    s_brev_b32 s4, -2
+; GFX8-NEXT:    v_min_i32_e32 v10, 0, v0
+; GFX8-NEXT:    v_bfrev_b32_e32 v11, 1
 ; GFX8-NEXT:    v_max_i32_e32 v8, 0, v0
-; GFX8-NEXT:    v_sub_u32_e32 v9, vcc, s5, v9
-; GFX8-NEXT:    v_sub_u32_e32 v8, vcc, s4, v8
-; GFX8-NEXT:    v_max_i32_e32 v4, v9, v4
+; GFX8-NEXT:    v_sub_u32_e32 v10, vcc, v11, v10
+; GFX8-NEXT:    v_sub_u32_e32 v8, vcc, 0x7fffffff, v8
+; GFX8-NEXT:    v_max_i32_e32 v4, v10, v4
 ; GFX8-NEXT:    v_min_i32_e32 v4, v4, v8
 ; GFX8-NEXT:    v_min_i32_e32 v8, 0, v1
+; GFX8-NEXT:    v_bfrev_b32_e32 v9, -2
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v0, v4
 ; GFX8-NEXT:    v_max_i32_e32 v4, 0, v1
-; GFX8-NEXT:    v_sub_u32_e32 v8, vcc, s5, v8
-; GFX8-NEXT:    v_sub_u32_e32 v4, vcc, s4, v4
+; GFX8-NEXT:    v_sub_u32_e32 v8, vcc, 0x80000000, v8
+; GFX8-NEXT:    v_sub_u32_e32 v4, vcc, v9, v4
 ; GFX8-NEXT:    v_max_i32_e32 v5, v8, v5
 ; GFX8-NEXT:    v_min_i32_e32 v4, v5, v4
 ; GFX8-NEXT:    v_min_i32_e32 v5, 0, v2
 ; GFX8-NEXT:    v_add_u32_e32 v1, vcc, v1, v4
 ; GFX8-NEXT:    v_max_i32_e32 v4, 0, v2
-; GFX8-NEXT:    v_sub_u32_e32 v5, vcc, s5, v5
-; GFX8-NEXT:    v_sub_u32_e32 v4, vcc, s4, v4
+; GFX8-NEXT:    v_sub_u32_e32 v5, vcc, 0x80000000, v5
+; GFX8-NEXT:    v_sub_u32_e32 v4, vcc, v9, v4
 ; GFX8-NEXT:    v_max_i32_e32 v5, v5, v6
 ; GFX8-NEXT:    v_min_i32_e32 v4, v5, v4
 ; GFX8-NEXT:    v_min_i32_e32 v5, 0, v3
@@ -1724,30 +1714,29 @@ define <5 x i32> @v_saddsat_v5i32(<5 x i32> %lhs, <5 x i32> %rhs) {
 ; GFX6-LABEL: v_saddsat_v5i32:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT:    s_brev_b32 s5, 1
 ; GFX6-NEXT:    v_min_i32_e32 v12, 0, v0
-; GFX6-NEXT:    s_brev_b32 s4, -2
+; GFX6-NEXT:    v_bfrev_b32_e32 v13, 1
 ; GFX6-NEXT:    v_max_i32_e32 v10, 0, v0
-; GFX6-NEXT:    v_sub_i32_e32 v12, vcc, s5, v12
-; GFX6-NEXT:    v_sub_i32_e32 v10, vcc, s4, v10
+; GFX6-NEXT:    v_sub_i32_e32 v12, vcc, v13, v12
+; GFX6-NEXT:    v_sub_i32_e32 v10, vcc, 0x7fffffff, v10
 ; GFX6-NEXT:    v_max_i32_e32 v5, v12, v5
 ; GFX6-NEXT:    v_min_i32_e32 v5, v5, v10
 ; GFX6-NEXT:    v_min_i32_e32 v10, 0, v1
+; GFX6-NEXT:    v_bfrev_b32_e32 v11, -2
 ; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v5
 ; GFX6-NEXT:    v_max_i32_e32 v5, 0, v1
-; GFX6-NEXT:    v_sub_i32_e32 v10, vcc, s5, v10
-; GFX6-NEXT:    v_sub_i32_e32 v5, vcc, s4, v5
+; GFX6-NEXT:    v_sub_i32_e32 v10, vcc, v13, v10
+; GFX6-NEXT:    v_sub_i32_e32 v5, vcc, v11, v5
 ; GFX6-NEXT:    v_max_i32_e32 v6, v10, v6
 ; GFX6-NEXT:    v_min_i32_e32 v5, v6, v5
 ; GFX6-NEXT:    v_min_i32_e32 v6, 0, v2
 ; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v1, v5
 ; GFX6-NEXT:    v_max_i32_e32 v5, 0, v2
-; GFX6-NEXT:    v_sub_i32_e32 v6, vcc, s5, v6
-; GFX6-NEXT:    v_sub_i32_e32 v5, vcc, s4, v5
+; GFX6-NEXT:    v_sub_i32_e32 v6, vcc, 0x80000000, v6
+; GFX6-NEXT:    v_sub_i32_e32 v5, vcc, v11, v5
 ; GFX6-NEXT:    v_max_i32_e32 v6, v6, v7
 ; GFX6-NEXT:    v_min_i32_e32 v5, v6, v5
 ; GFX6-NEXT:    v_min_i32_e32 v6, 0, v3
-; GFX6-NEXT:    v_bfrev_b32_e32 v11, -2
 ; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v5
 ; GFX6-NEXT:    v_max_i32_e32 v5, 0, v3
 ; GFX6-NEXT:    v_sub_i32_e32 v6, vcc, 0x80000000, v6
@@ -1767,30 +1756,29 @@ define <5 x i32> @v_saddsat_v5i32(<5 x i32> %lhs, <5 x i32> %rhs) {
 ; GFX8-LABEL: v_saddsat_v5i32:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    s_brev_b32 s5, 1
 ; GFX8-NEXT:    v_min_i32_e32 v12, 0, v0
-; GFX8-NEXT:    s_brev_b32 s4, -2
+; GFX8-NEXT:    v_bfrev_b32_e32 v13, 1
 ; GFX8-NEXT:    v_max_i32_e32 v10, 0, v0
-; GFX8-NEXT:    v_sub_u32_e32 v12, vcc, s5, v12
-; GFX8-NEXT:    v_sub_u32_e32 v10, vcc, s4, v10
+; GFX8-NEXT:    v_sub_u32_e32 v12, vcc, v13, v12
+; GFX8-NEXT:    v_sub_u32_e32 v10, vcc, 0x7fffffff, v10
 ; GFX8-NEXT:    v_max_i32_e32 v5, v12, v5
 ; GFX8-NEXT:    v_min_i32_e32 v5, v5, v10
 ; GFX8-NEXT:    v_min_i32_e32 v10, 0, v1
+; GFX8-NEXT:    v_bfrev_b32_e32 v11, -2
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v0, v5
 ; GFX8-NEXT:    v_max_i32_e32 v5, 0, v1
-; GFX8-NEXT:    v_sub_u32_e32 v10, vcc, s5, v10
-; GFX8-NEXT:    v_sub_u32_e32 v5, vcc, s4, v5
+; GFX8-NEXT:    v_sub_u32_e32 v10, vcc, v13, v10
+; GFX8-NEXT:    v_sub_u32_e32 v5, vcc, v11, v5
 ; GFX8-NEXT:    v_max_i32_e32 v6, v10, v6
 ; GFX8-NEXT:    v_min_i32_e32 v5, v6, v5
 ; GFX8-NEXT:    v_min_i32_e32 v6, 0, v2
 ; GFX8-NEXT:    v_add_u32_e32 v1, vcc, v1, v5
 ; GFX8-NEXT:    v_max_i32_e32 v5, 0, v2
-; GFX8-NEXT:    v_sub_u32_e32 v6, vcc, s5, v6
-; GFX8-NEXT:    v_sub_u32_e32 v5, vcc, s4, v5
+; GFX8-NEXT:    v_sub_u32_e32 v6, vcc, 0x80000000, v6
+; GFX8-NEXT:    v_sub_u32_e32 v5, vcc, v11, v5
 ; GFX8-NEXT:    v_max_i32_e32 v6, v6, v7
 ; GFX8-NEXT:    v_min_i32_e32 v5, v6, v5
 ; GFX8-NEXT:    v_min_i32_e32 v6, 0, v3
-; GFX8-NEXT:    v_bfrev_b32_e32 v11, -2
 ; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v2, v5
 ; GFX8-NEXT:    v_max_i32_e32 v5, 0, v3
 ; GFX8-NEXT:    v_sub_u32_e32 v6, vcc, 0x80000000, v6
@@ -1949,246 +1937,242 @@ define <16 x i32> @v_saddsat_v16i32(<16 x i32> %lhs, <16 x i32> %rhs) {
 ; GFX6-LABEL: v_saddsat_v16i32:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT:    s_brev_b32 s4, 1
-; GFX6-NEXT:    v_min_i32_e32 v31, 0, v0
-; GFX6-NEXT:    v_sub_i32_e32 v31, vcc, s4, v31
-; GFX6-NEXT:    v_max_i32_e32 v16, v31, v16
-; GFX6-NEXT:    s_brev_b32 s5, -2
-; GFX6-NEXT:    v_max_i32_e32 v31, 0, v0
-; GFX6-NEXT:    v_sub_i32_e32 v31, vcc, s5, v31
-; GFX6-NEXT:    v_min_i32_e32 v16, v16, v31
-; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v16
-; GFX6-NEXT:    v_min_i32_e32 v16, 0, v1
-; GFX6-NEXT:    v_sub_i32_e32 v16, vcc, s4, v16
-; GFX6-NEXT:    v_max_i32_e32 v16, v16, v17
-; GFX6-NEXT:    v_max_i32_e32 v17, 0, v1
-; GFX6-NEXT:    v_sub_i32_e32 v17, vcc, s5, v17
-; GFX6-NEXT:    v_min_i32_e32 v16, v16, v17
-; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v1, v16
-; GFX6-NEXT:    v_min_i32_e32 v16, 0, v2
-; GFX6-NEXT:    v_sub_i32_e32 v16, vcc, s4, v16
-; GFX6-NEXT:    v_max_i32_e32 v17, 0, v2
-; GFX6-NEXT:    v_max_i32_e32 v16, v16, v18
-; GFX6-NEXT:    v_sub_i32_e32 v17, vcc, s5, v17
-; GFX6-NEXT:    v_min_i32_e32 v16, v16, v17
-; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v16
-; GFX6-NEXT:    v_bfrev_b32_e32 v16, 1
+; GFX6-NEXT:    v_min_i32_e32 v32, 0, v0
+; GFX6-NEXT:    v_bfrev_b32_e32 v31, 1
+; GFX6-NEXT:    v_sub_i32_e32 v32, vcc, v31, v32
+; GFX6-NEXT:    v_max_i32_e32 v32, v32, v16
+; GFX6-NEXT:    v_max_i32_e32 v33, 0, v0
+; GFX6-NEXT:    v_bfrev_b32_e32 v16, -2
+; GFX6-NEXT:    v_sub_i32_e32 v33, vcc, v16, v33
+; GFX6-NEXT:    v_min_i32_e32 v32, v32, v33
+; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v32
+; GFX6-NEXT:    v_min_i32_e32 v32, 0, v1
+; GFX6-NEXT:    v_sub_i32_e32 v32, vcc, v31, v32
+; GFX6-NEXT:    v_max_i32_e32 v17, v32, v17
+; GFX6-NEXT:    v_max_i32_e32 v32, 0, v1
+; GFX6-NEXT:    v_sub_i32_e32 v32, vcc, v16, v32
+; GFX6-NEXT:    v_min_i32_e32 v17, v17, v32
+; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v1, v17
+; GFX6-NEXT:    v_min_i32_e32 v17, 0, v2
+; GFX6-NEXT:    v_sub_i32_e32 v17, vcc, v31, v17
+; GFX6-NEXT:    v_max_i32_e32 v17, v17, v18
+; GFX6-NEXT:    v_max_i32_e32 v18, 0, v2
+; GFX6-NEXT:    v_sub_i32_e32 v18, vcc, v16, v18
+; GFX6-NEXT:    v_min_i32_e32 v17, v17, v18
+; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v17
 ; GFX6-NEXT:    v_min_i32_e32 v17, 0, v3
-; GFX6-NEXT:    v_sub_i32_e32 v17, vcc, v16, v17
+; GFX6-NEXT:    v_sub_i32_e32 v17, vcc, v31, v17
+; GFX6-NEXT:    v_max_i32_e32 v18, 0, v3
 ; GFX6-NEXT:    v_max_i32_e32 v17, v17, v19
-; GFX6-NEXT:    v_bfrev_b32_e32 v18, -2
-; GFX6-NEXT:    v_max_i32_e32 v19, 0, v3
-; GFX6-NEXT:    v_sub_i32_e32 v19, vcc, v18, v19
-; GFX6-NEXT:    v_min_i32_e32 v17, v17, v19
+; GFX6-NEXT:    v_sub_i32_e32 v18, vcc, v16, v18
+; GFX6-NEXT:    v_min_i32_e32 v17, v17, v18
 ; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v3, v17
 ; GFX6-NEXT:    v_min_i32_e32 v17, 0, v4
-; GFX6-NEXT:    v_sub_i32_e32 v17, vcc, v16, v17
-; GFX6-NEXT:    v_max_i32_e32 v19, 0, v4
+; GFX6-NEXT:    v_sub_i32_e32 v17, vcc, v31, v17
+; GFX6-NEXT:    v_max_i32_e32 v18, 0, v4
 ; GFX6-NEXT:    v_max_i32_e32 v17, v17, v20
-; GFX6-NEXT:    v_sub_i32_e32 v19, vcc, v18, v19
-; GFX6-NEXT:    v_min_i32_e32 v17, v17, v19
+; GFX6-NEXT:    v_sub_i32_e32 v18, vcc, v16, v18
+; GFX6-NEXT:    v_min_i32_e32 v17, v17, v18
 ; GFX6-NEXT:    v_add_i32_e32 v4, vcc, v4, v17
 ; GFX6-NEXT:    v_min_i32_e32 v17, 0, v5
-; GFX6-NEXT:    v_sub_i32_e32 v17, vcc, v16, v17
-; GFX6-NEXT:    v_max_i32_e32 v19, 0, v5
+; GFX6-NEXT:    v_sub_i32_e32 v17, vcc, v31, v17
+; GFX6-NEXT:    v_max_i32_e32 v18, 0, v5
 ; GFX6-NEXT:    v_max_i32_e32 v17, v17, v21
-; GFX6-NEXT:    v_sub_i32_e32 v19, vcc, v18, v19
-; GFX6-NEXT:    v_min_i32_e32 v17, v17, v19
+; GFX6-NEXT:    v_sub_i32_e32 v18, vcc, v16, v18
+; GFX6-NEXT:    v_min_i32_e32 v17, v17, v18
 ; GFX6-NEXT:    v_add_i32_e32 v5, vcc, v5, v17
 ; GFX6-NEXT:    v_min_i32_e32 v17, 0, v6
-; GFX6-NEXT:    v_sub_i32_e32 v17, vcc, v16, v17
-; GFX6-NEXT:    v_max_i32_e32 v19, 0, v6
+; GFX6-NEXT:    v_sub_i32_e32 v17, vcc, v31, v17
+; GFX6-NEXT:    v_max_i32_e32 v18, 0, v6
 ; GFX6-NEXT:    v_max_i32_e32 v17, v17, v22
-; GFX6-NEXT:    v_sub_i32_e32 v19, vcc, v18, v19
-; GFX6-NEXT:    v_min_i32_e32 v17, v17, v19
-; GFX6-NEXT:    buffer_load_dword v19, off, s[0:3], s32
+; GFX6-NEXT:    v_sub_i32_e32 v18, vcc, v16, v18
+; GFX6-NEXT:    v_min_i32_e32 v17, v17, v18
+; GFX6-NEXT:    buffer_load_dword v18, off, s[0:3], s32
 ; GFX6-NEXT:    v_add_i32_e32 v6, vcc, v6, v17
 ; GFX6-NEXT:    v_min_i32_e32 v17, 0, v7
-; GFX6-NEXT:    v_sub_i32_e32 v17, vcc, v16, v17
-; GFX6-NEXT:    v_max_i32_e32 v20, 0, v7
+; GFX6-NEXT:    v_sub_i32_e32 v17, vcc, v31, v17
+; GFX6-NEXT:    v_max_i32_e32 v19, 0, v7
 ; GFX6-NEXT:    v_max_i32_e32 v17, v17, v23
-; GFX6-NEXT:    v_sub_i32_e32 v20, vcc, v18, v20
-; GFX6-NEXT:    v_min_i32_e32 v17, v17, v20
-; GFX6-NEXT:    v_min_i32_e32 v20, 0, v8
+; GFX6-NEXT:    v_sub_i32_e32 v19, vcc, v16, v19
+; GFX6-NEXT:    v_min_i32_e32 v17, v17, v19
+; GFX6-NEXT:    v_min_i32_e32 v19, 0, v8
 ; GFX6-NEXT:    v_add_i32_e32 v7, vcc, v7, v17
 ; GFX6-NEXT:    v_max_i32_e32 v17, 0, v8
-; GFX6-NEXT:    v_sub_i32_e32 v20, vcc, v16, v20
-; GFX6-NEXT:    v_sub_i32_e32 v17, vcc, v18, v17
-; GFX6-NEXT:    v_max_i32_e32 v20, v20, v24
-; GFX6-NEXT:    v_min_i32_e32 v17, v20, v17
-; GFX6-NEXT:    v_min_i32_e32 v20, 0, v9
+; GFX6-NEXT:    v_sub_i32_e32 v19, vcc, v31, v19
+; GFX6-NEXT:    v_sub_i32_e32 v17, vcc, v16, v17
+; GFX6-NEXT:    v_max_i32_e32 v19, v19, v24
+; GFX6-NEXT:    v_min_i32_e32 v17, v19, v17
+; GFX6-NEXT:    v_min_i32_e32 v19, 0, v9
 ; GFX6-NEXT:    v_add_i32_e32 v8, vcc, v8, v17
 ; GFX6-NEXT:    v_max_i32_e32 v17, 0, v9
-; GFX6-NEXT:    v_sub_i32_e32 v20, vcc, v16, v20
-; GFX6-NEXT:    v_sub_i32_e32 v17, vcc, v18, v17
-; GFX6-NEXT:    v_max_i32_e32 v20, v20, v25
-; GFX6-NEXT:    v_min_i32_e32 v17, v20, v17
-; GFX6-NEXT:    v_min_i32_e32 v20, 0, v10
+; GFX6-NEXT:    v_sub_i32_e32 v19, vcc, v31, v19
+; GFX6-NEXT:    v_sub_i32_e32 v17, vcc, v16, v17
+; GFX6-NEXT:    v_max_i32_e32 v19, v19, v25
+; GFX6-NEXT:    v_min_i32_e32 v17, v19, v17
+; GFX6-NEXT:    v_min_i32_e32 v19, 0, v10
 ; GFX6-NEXT:    v_add_i32_e32 v9, vcc, v9, v17
 ; GFX6-NEXT:    v_max_i32_e32 v17, 0, v10
-; GFX6-NEXT:    v_sub_i32_e32 v20, vcc, v16, v20
-; GFX6-NEXT:    v_sub_i32_e32 v17, vcc, v18, v17
-; GFX6-NEXT:    v_max_i32_e32 v20, v20, v26
-; GFX6-NEXT:    v_min_i32_e32 v17, v20, v17
-; GFX6-NEXT:    v_min_i32_e32 v20, 0, v11
+; GFX6-NEXT:    v_sub_i32_e32 v19, vcc, v31, v19
+; GFX6-NEXT:    v_sub_i32_e32 v17, vcc, v16, v17
+; GFX6-NEXT:    v_max_i32_e32 v19, v19, v26
+; GFX6-NEXT:    v_min_i32_e32 v17, v19, v17
+; GFX6-NEXT:    v_min_i32_e32 v19, 0, v11
 ; GFX6-NEXT:    v_add_i32_e32 v10, vcc, v10, v17
 ; GFX6-NEXT:    v_max_i32_e32 v17, 0, v11
-; GFX6-NEXT:    v_sub_i32_e32 v20, vcc, v16, v20
-; GFX6-NEXT:    v_sub_i32_e32 v17, vcc, v18, v17
-; GFX6-NEXT:    v_max_i32_e32 v20, v20, v27
-; GFX6-NEXT:    v_min_i32_e32 v17, v20, v17
-; GFX6-NEXT:    v_min_i32_e32 v20, 0, v12
+; GFX6-NEXT:    v_sub_i32_e32 v19, vcc, v31, v19
+; GFX6-NEXT:    v_sub_i32_e32 v17, vcc, v16, v17
+; GFX6-NEXT:    v_max_i32_e32 v19, v19, v27
+; GFX6-NEXT:    v_min_i32_e32 v17, v19, v17
+; GFX6-NEXT:    v_min_i32_e32 v19, 0, v12
 ; GFX6-NEXT:    v_add_i32_e32 v11, vcc, v11, v17
 ; GFX6-NEXT:    v_max_i32_e32 v17, 0, v12
-; GFX6-NEXT:    v_sub_i32_e32 v20, vcc, v16, v20
-; GFX6-NEXT:    v_sub_i32_e32 v17, vcc, v18, v17
-; GFX6-NEXT:    v_max_i32_e32 v20, v20, v28
-; GFX6-NEXT:    v_min_i32_e32 v17, v20, v17
-; GFX6-NEXT:    v_min_i32_e32 v20, 0, v13
+; GFX6-NEXT:    v_sub_i32_e32 v19, vcc, v31, v19
+; GFX6-NEXT:    v_sub_i32_e32 v17, vcc, v16, v17
+; GFX6-NEXT:    v_max_i32_e32 v19, v19, v28
+; GFX6-NEXT:    v_min_i32_e32 v17, v19, v17
+; GFX6-NEXT:    v_min_i32_e32 v19, 0, v13
 ; GFX6-NEXT:    v_add_i32_e32 v12, vcc, v12, v17
 ; GFX6-NEXT:    v_max_i32_e32 v17, 0, v13
-; GFX6-NEXT:    v_sub_i32_e32 v20, vcc, v16, v20
-; GFX6-NEXT:    v_sub_i32_e32 v17, vcc, v18, v17
-; GFX6-NEXT:    v_max_i32_e32 v20, v20, v29
-; GFX6-NEXT:    v_min_i32_e32 v17, v20, v17
-; GFX6-NEXT:    v_min_i32_e32 v20, 0, v14
+; GFX6-NEXT:    v_sub_i32_e32 v19, vcc, v31, v19
+; GFX6-NEXT:    v_sub_i32_e32 v17, vcc, v16, v17
+; GFX6-NEXT:    v_max_i32_e32 v19, v19, v29
+; GFX6-NEXT:    v_min_i32_e32 v17, v19, v17
+; GFX6-NEXT:    v_min_i32_e32 v19, 0, v14
 ; GFX6-NEXT:    v_add_i32_e32 v13, vcc, v13, v17
 ; GFX6-NEXT:    v_max_i32_e32 v17, 0, v14
-; GFX6-NEXT:    v_sub_i32_e32 v20, vcc, v16, v20
-; GFX6-NEXT:    v_sub_i32_e32 v17, vcc, v18, v17
-; GFX6-NEXT:    v_max_i32_e32 v20, v20, v30
-; GFX6-NEXT:    v_min_i32_e32 v17, v20, v17
+; GFX6-NEXT:    v_sub_i32_e32 v19, vcc, v31, v19
+; GFX6-NEXT:    v_sub_i32_e32 v17, vcc, v16, v17
+; GFX6-NEXT:    v_max_i32_e32 v19, v19, v30
+; GFX6-NEXT:    v_min_i32_e32 v17, v19, v17
 ; GFX6-NEXT:    v_add_i32_e32 v14, vcc, v14, v17
 ; GFX6-NEXT:    v_max_i32_e32 v17, 0, v15
-; GFX6-NEXT:    v_sub_i32_e32 v17, vcc, v18, v17
-; GFX6-NEXT:    v_min_i32_e32 v18, 0, v15
-; GFX6-NEXT:    v_sub_i32_e32 v16, vcc, v16, v18
+; GFX6-NEXT:    v_sub_i32_e32 v16, vcc, v16, v17
+; GFX6-NEXT:    v_min_i32_e32 v17, 0, v15
+; GFX6-NEXT:    v_sub_i32_e32 v17, vcc, v31, v17
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    v_max_i32_e32 v16, v16, v19
-; GFX6-NEXT:    v_min_i32_e32 v16, v16, v17
+; GFX6-NEXT:    v_max_i32_e32 v17, v17, v18
+; GFX6-NEXT:    v_min_i32_e32 v16, v17, v16
 ; GFX6-NEXT:    v_add_i32_e32 v15, vcc, v15, v16
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_saddsat_v16i32:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    s_brev_b32 s4, 1
-; GFX8-NEXT:    v_min_i32_e32 v31, 0, v0
-; GFX8-NEXT:    v_sub_u32_e32 v31, vcc, s4, v31
-; GFX8-NEXT:    v_max_i32_e32 v16, v31, v16
-; GFX8-NEXT:    s_brev_b32 s5, -2
-; GFX8-NEXT:    v_max_i32_e32 v31, 0, v0
-; GFX8-NEXT:    v_sub_u32_e32 v31, vcc, s5, v31
-; GFX8-NEXT:    v_min_i32_e32 v16, v16, v31
-; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v0, v16
-; GFX8-NEXT:    v_min_i32_e32 v16, 0, v1
-; GFX8-NEXT:    v_sub_u32_e32 v16, vcc, s4, v16
-; GFX8-NEXT:    v_max_i32_e32 v16, v16, v17
-; GFX8-NEXT:    v_max_i32_e32 v17, 0, v1
-; GFX8-NEXT:    v_sub_u32_e32 v17, vcc, s5, v17
-; GFX8-NEXT:    v_min_i32_e32 v16, v16, v17
-; GFX8-NEXT:    v_add_u32_e32 v1, vcc, v1, v16
-; GFX8-NEXT:    v_min_i32_e32 v16, 0, v2
-; GFX8-NEXT:    v_sub_u32_e32 v16, vcc, s4, v16
-; GFX8-NEXT:    v_max_i32_e32 v17, 0, v2
-; GFX8-NEXT:    v_max_i32_e32 v16, v16, v18
-; GFX8-NEXT:    v_sub_u32_e32 v17, vcc, s5, v17
-; GFX8-NEXT:    v_min_i32_e32 v16, v16, v17
-; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v2, v16
-; GFX8-NEXT:    v_bfrev_b32_e32 v16, 1
+; GFX8-NEXT:    v_min_i32_e32 v32, 0, v0
+; GFX8-NEXT:    v_bfrev_b32_e32 v31, 1
+; GFX8-NEXT:    v_sub_u32_e32 v32, vcc, v31, v32
+; GFX8-NEXT:    v_max_i32_e32 v32, v32, v16
+; GFX8-NEXT:    v_max_i32_e32 v33, 0, v0
+; GFX8-NEXT:    v_bfrev_b32_e32 v16, -2
+; GFX8-NEXT:    v_sub_u32_e32 v33, vcc, v16, v33
+; GFX8-NEXT:    v_min_i32_e32 v32, v32, v33
+; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v0, v32
+; GFX8-NEXT:    v_min_i32_e32 v32, 0, v1
+; GFX8-NEXT:    v_sub_u32_e32 v32, vcc, v31, v32
+; GFX8-NEXT:    v_max_i32_e32 v17, v32, v17
+; GFX8-NEXT:    v_max_i32_e32 v32, 0, v1
+; GFX8-NEXT:    v_sub_u32_e32 v32, vcc, v16, v32
+; GFX8-NEXT:    v_min_i32_e32 v17, v17, v32
+; GFX8-NEXT:    v_add_u32_e32 v1, vcc, v1, v17
+; GFX8-NEXT:    v_min_i32_e32 v17, 0, v2
+; GFX8-NEXT:    v_sub_u32_e32 v17, vcc, v31, v17
+; GFX8-NEXT:    v_max_i32_e32 v17, v17, v18
+; GFX8-NEXT:    v_max_i32_e32 v18, 0, v2
+; GFX8-NEXT:    v_sub_u32_e32 v18, vcc, v16, v18
+; GFX8-NEXT:    v_min_i32_e32 v17, v17, v18
+; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v2, v17
 ; GFX8-NEXT:    v_min_i32_e32 v17, 0, v3
-; GFX8-NEXT:    v_sub_u32_e32 v17, vcc, v16, v17
+; GFX8-NEXT:    v_sub_u32_e32 v17, vcc, v31, v17
+; GFX8-NEXT:    v_max_i32_e32 v18, 0, v3
 ; GFX8-NEXT:    v_max_i32_e32 v17, v17, v19
-; GFX8-NEXT:    v_bfrev_b32_e32 v18, -2
-; GFX8-NEXT:    v_max_i32_e32 v19, 0, v3
-; GFX8-NEXT:    v_sub_u32_e32 v19, vcc, v18, v19
-; GFX8-NEXT:    v_min_i32_e32 v17, v17, v19
+; GFX8-NEXT:    v_sub_u32_e32 v18, vcc, v16, v18
+; GFX8-NEXT:    v_min_i32_e32 v17, v17, v18
 ; GFX8-NEXT:    v_add_u32_e32 v3, vcc, v3, v17
 ; GFX8-NEXT:    v_min_i32_e32 v17, 0, v4
-; GFX8-NEXT:    v_sub_u32_e32 v17, vcc, v16, v17
-; GFX8-NEXT:    v_max_i32_e32 v19, 0, v4
+; GFX8-NEXT:    v_sub_u32_e32 v17, vcc, v31, v17
+; GFX8-NEXT:    v_max_i32_e32 v18, 0, v4
 ; GFX8-NEXT:    v_max_i32_e32 v17, v17, v20
-; GFX8-NEXT:    v_sub_u32_e32 v19, vcc, v18, v19
-; GFX8-NEXT:    v_min_i32_e32 v17, v17, v19
+; GFX8-NEXT:    v_sub_u32_e32 v18, vcc, v16, v18
+; GFX8-NEXT:    v_min_i32_e32 v17, v17, v18
 ; GFX8-NEXT:    v_add_u32_e32 v4, vcc, v4, v17
 ; GFX8-NEXT:    v_min_i32_e32 v17, 0, v5
-; GFX8-NEXT:    v_sub_u32_e32 v17, vcc, v16, v17
-; GFX8-NEXT:    v_max_i32_e32 v19, 0, v5
+; GFX8-NEXT:    v_sub_u32_e32 v17, vcc, v31, v17
+; GFX8-NEXT:    v_max_i32_e32 v18, 0, v5
 ; GFX8-NEXT:    v_max_i32_e32 v17, v17, v21
-; GFX8-NEXT:    v_sub_u32_e32 v19, vcc, v18, v19
-; GFX8-NEXT:    v_min_i32_e32 v17, v17, v19
+; GFX8-NEXT:    v_sub_u32_e32 v18, vcc, v16, v18
+; GFX8-NEXT:    v_min_i32_e32 v17, v17, v18
 ; GFX8-NEXT:    v_add_u32_e32 v5, vcc, v5, v17
 ; GFX8-NEXT:    v_min_i32_e32 v17, 0, v6
-; GFX8-NEXT:    v_sub_u32_e32 v17, vcc, v16, v17
-; GFX8-NEXT:    v_max_i32_e32 v19, 0, v6
+; GFX8-NEXT:    v_sub_u32_e32 v17, vcc, v31, v17
+; GFX8-NEXT:    v_max_i32_e32 v18, 0, v6
 ; GFX8-NEXT:    v_max_i32_e32 v17, v17, v22
-; GFX8-NEXT:    v_sub_u32_e32 v19, vcc, v18, v19
-; GFX8-NEXT:    v_min_i32_e32 v17, v17, v19
-; GFX8-NEXT:    buffer_load_dword v19, off, s[0:3], s32
+; GFX8-NEXT:    v_sub_u32_e32 v18, vcc, v16, v18
+; GFX8-NEXT:    v_min_i32_e32 v17, v17, v18
+; GFX8-NEXT:    buffer_load_dword v18, off, s[0:3], s32
 ; GFX8-NEXT:    v_add_u32_e32 v6, vcc, v6, v17
 ; GFX8-NEXT:    v_min_i32_e32 v17, 0, v7
-; GFX8-NEXT:    v_sub_u32_e32 v17, vcc, v16, v17
-; GFX8-NEXT:    v_max_i32_e32 v20, 0, v7
+; GFX8-NEXT:    v_sub_u32_e32 v17, vcc, v31, v17
+; GFX8-NEXT:    v_max_i32_e32 v19, 0, v7
 ; GFX8-NEXT:    v_max_i32_e32 v17, v17, v23
-; GFX8-NEXT:    v_sub_u32_e32 v20, vcc, v18, v20
-; GFX8-NEXT:    v_min_i32_e32 v17, v17, v20
-; GFX8-NEXT:    v_min_i32_e32 v20, 0, v8
+; GFX8-NEXT:    v_sub_u32_e32 v19, vcc, v16, v19
+; GFX8-NEXT:    v_min_i32_e32 v17, v17, v19
+; GFX8-NEXT:    v_min_i32_e32 v19, 0, v8
 ; GFX8-NEXT:    v_add_u32_e32 v7, vcc, v7, v17
 ; GFX8-NEXT:    v_max_i32_e32 v17, 0, v8
-; GFX8-NEXT:    v_sub_u32_e32 v20, vcc, v16, v20
-; GFX8-NEXT:    v_sub_u32_e32 v17, vcc, v18, v17
-; GFX8-NEXT:    v_max_i32_e32 v20, v20, v24
-; GFX8-NEXT:    v_min_i32_e32 v17, v20, v17
-; GFX8-NEXT:    v_min_i32_e32 v20, 0, v9
+; GFX8-NEXT:    v_sub_u32_e32 v19, vcc, v31, v19
+; GFX8-NEXT:    v_sub_u32_e32 v17, vcc, v16, v17
+; GFX8-NEXT:    v_max_i32_e32 v19, v19, v24
+; GFX8-NEXT:    v_min_i32_e32 v17, v19, v17
+; GFX8-NEXT:    v_min_i32_e32 v19, 0, v9
 ; GFX8-NEXT:    v_add_u32_e32 v8, vcc, v8, v17
 ; GFX8-NEXT:    v_max_i32_e32 v17, 0, v9
-; GFX8-NEXT:    v_sub_u32_e32 v20, vcc, v16, v20
-; GFX8-NEXT:    v_sub_u32_e32 v17, vcc, v18, v17
-; GFX8-NEXT:    v_max_i32_e32 v20, v20, v25
-; GFX8-NEXT:    v_min_i32_e32 v17, v20, v17
-; GFX8-NEXT:    v_min_i32_e32 v20, 0, v10
+; GFX8-NEXT:    v_sub_u32_e32 v19, vcc, v31, v19
+; GFX8-NEXT:    v_sub_u32_e32 v17, vcc, v16, v17
+; GFX8-NEXT:    v_max_i32_e32 v19, v19, v25
+; GFX8-NEXT:    v_min_i32_e32 v17, v19, v17
+; GFX8-NEXT:    v_min_i32_e32 v19, 0, v10
 ; GFX8-NEXT:    v_add_u32_e32 v9, vcc, v9, v17
 ; GFX8-NEXT:    v_max_i32_e32 v17, 0, v10
-; GFX8-NEXT:    v_sub_u32_e32 v20, vcc, v16, v20
-; GFX8-NEXT:    v_sub_u32_e32 v17, vcc, v18, v17
-; GFX8-NEXT:    v_max_i32_e32 v20, v20, v26
-; GFX8-NEXT:    v_min_i32_e32 v17, v20, v17
-; GFX8-NEXT:    v_min_i32_e32 v20, 0, v11
+; GFX8-NEXT:    v_sub_u32_e32 v19, vcc, v31, v19
+; GFX8-NEXT:    v_sub_u32_e32 v17, vcc, v16, v17
+; GFX8-NEXT:    v_max_i32_e32 v19, v19, v26
+; GFX8-NEXT:    v_min_i32_e32 v17, v19, v17
+; GFX8-NEXT:    v_min_i32_e32 v19, 0, v11
 ; GFX8-NEXT:    v_add_u32_e32 v10, vcc, v10, v17
 ; GFX8-NEXT:    v_max_i32_e32 v17, 0, v11
-; GFX8-NEXT:    v_sub_u32_e32 v20, vcc, v16, v20
-; GFX8-NEXT:    v_sub_u32_e32 v17, vcc, v18, v17
-; GFX8-NEXT:    v_max_i32_e32 v20, v20, v27
-; GFX8-NEXT:    v_min_i32_e32 v17, v20, v17
-; GFX8-NEXT:    v_min_i32_e32 v20, 0, v12
+; GFX8-NEXT:    v_sub_u32_e32 v19, vcc, v31, v19
+; GFX8-NEXT:    v_sub_u32_e32 v17, vcc, v16, v17
+; GFX8-NEXT:    v_max_i32_e32 v19, v19, v27
+; GFX8-NEXT:    v_min_i32_e32 v17, v19, v17
+; GFX8-NEXT:    v_min_i32_e32 v19, 0, v12
 ; GFX8-NEXT:    v_add_u32_e32 v11, vcc, v11, v17
 ; GFX8-NEXT:    v_max_i32_e32 v17, 0, v12
-; GFX8-NEXT:    v_sub_u32_e32 v20, vcc, v16, v20
-; GFX8-NEXT:    v_sub_u32_e32 v17, vcc, v18, v17
-; GFX8-NEXT:    v_max_i32_e32 v20, v20, v28
-; GFX8-NEXT:    v_min_i32_e32 v17, v20, v17
-; GFX8-NEXT:    v_min_i32_e32 v20, 0, v13
+; GFX8-NEXT:    v_sub_u32_e32 v19, vcc, v31, v19
+; GFX8-NEXT:    v_sub_u32_e32 v17, vcc, v16, v17
+; GFX8-NEXT:    v_max_i32_e32 v19, v19, v28
+; GFX8-NEXT:    v_min_i32_e32 v17, v19, v17
+; GFX8-NEXT:    v_min_i32_e32 v19, 0, v13
 ; GFX8-NEXT:    v_add_u32_e32 v12, vcc, v12, v17
 ; GFX8-NEXT:    v_max_i32_e32 v17, 0, v13
-; GFX8-NEXT:    v_sub_u32_e32 v20, vcc, v16, v20
-; GFX8-NEXT:    v_sub_u32_e32 v17, vcc, v18, v17
-; GFX8-NEXT:    v_max_i32_e32 v20, v20, v29
-; GFX8-NEXT:    v_min_i32_e32 v17, v20, v17
-; GFX8-NEXT:    v_min_i32_e32 v20, 0, v14
+; GFX8-NEXT:    v_sub_u32_e32 v19, vcc, v31, v19
+; GFX8-NEXT:    v_sub_u32_e32 v17, vcc, v16, v17
+; GFX8-NEXT:    v_max_i32_e32 v19, v19, v29
+; GFX8-NEXT:    v_min_i32_e32 v17, v19, v17
+; GFX8-NEXT:    v_min_i32_e32 v19, 0, v14
 ; GFX8-NEXT:    v_add_u32_e32 v13, vcc, v13, v17
 ; GFX8-NEXT:    v_max_i32_e32 v17, 0, v14
-; GFX8-NEXT:    v_sub_u32_e32 v20, vcc, v16, v20
-; GFX8-NEXT:    v_sub_u32_e32 v17, vcc, v18, v17
-; GFX8-NEXT:    v_max_i32_e32 v20, v20, v30
-; GFX8-NEXT:    v_min_i32_e32 v17, v20, v17
+; GFX8-NEXT:    v_sub_u32_e32 v19, vcc, v31, v19
+; GFX8-NEXT:    v_sub_u32_e32 v17, vcc, v16, v17
+; GFX8-NEXT:    v_max_i32_e32 v19, v19, v30
+; GFX8-NEXT:    v_min_i32_e32 v17, v19, v17
 ; GFX8-NEXT:    v_add_u32_e32 v14, vcc, v14, v17
 ; GFX8-NEXT:    v_max_i32_e32 v17, 0, v15
-; GFX8-NEXT:    v_sub_u32_e32 v17, vcc, v18, v17
-; GFX8-NEXT:    v_min_i32_e32 v18, 0, v15
-; GFX8-NEXT:    v_sub_u32_e32 v16, vcc, v16, v18
+; GFX8-NEXT:    v_sub_u32_e32 v16, vcc, v16, v17
+; GFX8-NEXT:    v_min_i32_e32 v17, 0, v15
+; GFX8-NEXT:    v_sub_u32_e32 v17, vcc, v31, v17
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    v_max_i32_e32 v16, v16, v19
-; GFX8-NEXT:    v_min_i32_e32 v16, v16, v17
+; GFX8-NEXT:    v_max_i32_e32 v17, v17, v18
+; GFX8-NEXT:    v_min_i32_e32 v16, v17, v16
 ; GFX8-NEXT:    v_add_u32_e32 v15, vcc, v15, v16
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -2766,13 +2750,11 @@ define <2 x i16> @v_saddsat_v2i16(<2 x i16> %lhs, <2 x i16> %rhs) {
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX6-NEXT:    s_brev_b32 s5, 1
 ; GFX6-NEXT:    v_min_i32_e32 v5, 0, v0
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX6-NEXT:    s_brev_b32 s4, -2
 ; GFX6-NEXT:    v_max_i32_e32 v4, 0, v0
-; GFX6-NEXT:    v_sub_i32_e32 v5, vcc, s5, v5
-; GFX6-NEXT:    v_sub_i32_e32 v4, vcc, s4, v4
+; GFX6-NEXT:    v_sub_i32_e32 v5, vcc, 0x80000000, v5
+; GFX6-NEXT:    v_sub_i32_e32 v4, vcc, 0x7fffffff, v4
 ; GFX6-NEXT:    v_max_i32_e32 v2, v5, v2
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX6-NEXT:    v_min_i32_e32 v2, v2, v4
@@ -2780,8 +2762,8 @@ define <2 x i16> @v_saddsat_v2i16(<2 x i16> %lhs, <2 x i16> %rhs) {
 ; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
 ; GFX6-NEXT:    v_max_i32_e32 v3, 0, v1
-; GFX6-NEXT:    v_sub_i32_e32 v4, vcc, s5, v4
-; GFX6-NEXT:    v_sub_i32_e32 v3, vcc, s4, v3
+; GFX6-NEXT:    v_sub_i32_e32 v4, vcc, 0x80000000, v4
+; GFX6-NEXT:    v_sub_i32_e32 v3, vcc, 0x7fffffff, v3
 ; GFX6-NEXT:    v_max_i32_e32 v2, v4, v2
 ; GFX6-NEXT:    v_min_i32_e32 v2, v2, v3
 ; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v1, v2
@@ -2978,13 +2960,11 @@ define amdgpu_ps float @saddsat_v2i16_vs(<2 x i16> %lhs, <2 x i16> inreg %rhs) {
 ; GFX6-LABEL: saddsat_v2i16_vs:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX6-NEXT:    s_brev_b32 s3, 1
 ; GFX6-NEXT:    v_min_i32_e32 v3, 0, v0
 ; GFX6-NEXT:    s_lshl_b32 s0, s0, 16
-; GFX6-NEXT:    s_brev_b32 s2, -2
 ; GFX6-NEXT:    v_max_i32_e32 v2, 0, v0
-; GFX6-NEXT:    v_sub_i32_e32 v3, vcc, s3, v3
-; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, s2, v2
+; GFX6-NEXT:    v_sub_i32_e32 v3, vcc, 0x80000000, v3
+; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, 0x7fffffff, v2
 ; GFX6-NEXT:    v_max_i32_e32 v3, s0, v3
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX6-NEXT:    v_min_i32_e32 v2, v3, v2
@@ -2992,8 +2972,8 @@ define amdgpu_ps float @saddsat_v2i16_vs(<2 x i16> %lhs, <2 x i16> inreg %rhs) {
 ; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
 ; GFX6-NEXT:    s_lshl_b32 s0, s1, 16
 ; GFX6-NEXT:    v_max_i32_e32 v2, 0, v1
-; GFX6-NEXT:    v_sub_i32_e32 v3, vcc, s3, v3
-; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, s2, v2
+; GFX6-NEXT:    v_sub_i32_e32 v3, vcc, 0x80000000, v3
+; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, 0x7fffffff, v2
 ; GFX6-NEXT:    v_max_i32_e32 v3, s0, v3
 ; GFX6-NEXT:    v_min_i32_e32 v2, v3, v2
 ; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v1, v2
@@ -3056,35 +3036,33 @@ define <2 x float> @v_saddsat_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) {
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX6-NEXT:    s_brev_b32 s5, 1
 ; GFX6-NEXT:    v_min_i32_e32 v10, 0, v0
+; GFX6-NEXT:    v_bfrev_b32_e32 v11, 1
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
-; GFX6-NEXT:    s_brev_b32 s4, -2
 ; GFX6-NEXT:    v_max_i32_e32 v8, 0, v0
-; GFX6-NEXT:    v_sub_i32_e32 v10, vcc, s5, v10
-; GFX6-NEXT:    v_sub_i32_e32 v8, vcc, s4, v8
+; GFX6-NEXT:    v_sub_i32_e32 v10, vcc, v11, v10
+; GFX6-NEXT:    v_sub_i32_e32 v8, vcc, 0x7fffffff, v8
 ; GFX6-NEXT:    v_max_i32_e32 v4, v10, v4
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX6-NEXT:    v_min_i32_e32 v4, v4, v8
 ; GFX6-NEXT:    v_min_i32_e32 v8, 0, v1
+; GFX6-NEXT:    v_bfrev_b32_e32 v9, -2
 ; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v4
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v4, 16, v5
 ; GFX6-NEXT:    v_max_i32_e32 v5, 0, v1
-; GFX6-NEXT:    v_sub_i32_e32 v8, vcc, s5, v8
-; GFX6-NEXT:    v_sub_i32_e32 v5, vcc, s4, v5
+; GFX6-NEXT:    v_sub_i32_e32 v8, vcc, v11, v8
+; GFX6-NEXT:    v_sub_i32_e32 v5, vcc, v9, v5
 ; GFX6-NEXT:    v_max_i32_e32 v4, v8, v4
 ; GFX6-NEXT:    v_min_i32_e32 v4, v4, v5
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
 ; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v1, v4
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v4, 16, v6
 ; GFX6-NEXT:    v_min_i32_e32 v6, 0, v2
-; GFX6-NEXT:    v_bfrev_b32_e32 v9, -2
 ; GFX6-NEXT:    v_max_i32_e32 v5, 0, v2
-; GFX6-NEXT:    v_sub_i32_e32 v6, vcc, s5, v6
+; GFX6-NEXT:    v_sub_i32_e32 v6, vcc, v11, v6
 ; GFX6-NEXT:    v_sub_i32_e32 v5, vcc, v9, v5
 ; GFX6-NEXT:    v_max_i32_e32 v4, v6, v4
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX6-NEXT:    v_bfrev_b32_e32 v11, 1
 ; GFX6-NEXT:    v_min_i32_e32 v4, v4, v5
 ; GFX6-NEXT:    v_min_i32_e32 v6, 0, v3
 ; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v4
@@ -3320,35 +3298,33 @@ define <3 x float> @v_saddsat_v6i16(<6 x i16> %lhs, <6 x i16> %rhs) {
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX6-NEXT:    s_brev_b32 s5, 1
 ; GFX6-NEXT:    v_min_i32_e32 v14, 0, v0
+; GFX6-NEXT:    v_bfrev_b32_e32 v15, 1
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
-; GFX6-NEXT:    s_brev_b32 s4, -2
 ; GFX6-NEXT:    v_max_i32_e32 v12, 0, v0
-; GFX6-NEXT:    v_sub_i32_e32 v14, vcc, s5, v14
-; GFX6-NEXT:    v_sub_i32_e32 v12, vcc, s4, v12
+; GFX6-NEXT:    v_sub_i32_e32 v14, vcc, v15, v14
+; GFX6-NEXT:    v_sub_i32_e32 v12, vcc, 0x7fffffff, v12
 ; GFX6-NEXT:    v_max_i32_e32 v6, v14, v6
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX6-NEXT:    v_min_i32_e32 v6, v6, v12
 ; GFX6-NEXT:    v_min_i32_e32 v12, 0, v1
+; GFX6-NEXT:    v_bfrev_b32_e32 v13, -2
 ; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v6
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v6, 16, v7
 ; GFX6-NEXT:    v_max_i32_e32 v7, 0, v1
-; GFX6-NEXT:    v_sub_i32_e32 v12, vcc, s5, v12
-; GFX6-NEXT:    v_sub_i32_e32 v7, vcc, s4, v7
+; GFX6-NEXT:    v_sub_i32_e32 v12, vcc, v15, v12
+; GFX6-NEXT:    v_sub_i32_e32 v7, vcc, v13, v7
 ; GFX6-NEXT:    v_max_i32_e32 v6, v12, v6
 ; GFX6-NEXT:    v_min_i32_e32 v6, v6, v7
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
 ; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v1, v6
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v6, 16, v8
 ; GFX6-NEXT:    v_min_i32_e32 v8, 0, v2
-; GFX6-NEXT:    v_bfrev_b32_e32 v13, -2
 ; GFX6-NEXT:    v_max_i32_e32 v7, 0, v2
-; GFX6-NEXT:    v_sub_i32_e32 v8, vcc, s5, v8
+; GFX6-NEXT:    v_sub_i32_e32 v8, vcc, v15, v8
 ; GFX6-NEXT:    v_sub_i32_e32 v7, vcc, v13, v7
 ; GFX6-NEXT:    v_max_i32_e32 v6, v8, v6
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX6-NEXT:    v_bfrev_b32_e32 v15, 1
 ; GFX6-NEXT:    v_min_i32_e32 v6, v6, v7
 ; GFX6-NEXT:    v_min_i32_e32 v8, 0, v3
 ; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v6
@@ -3674,13 +3650,13 @@ define <4 x float> @v_saddsat_v8i16(<8 x i16> %lhs, <8 x i16> %rhs) {
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX6-NEXT:    s_brev_b32 s5, 1
 ; GFX6-NEXT:    v_min_i32_e32 v18, 0, v0
+; GFX6-NEXT:    v_bfrev_b32_e32 v19, 1
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v8, 16, v8
-; GFX6-NEXT:    s_brev_b32 s4, -2
 ; GFX6-NEXT:    v_max_i32_e32 v16, 0, v0
-; GFX6-NEXT:    v_sub_i32_e32 v18, vcc, s5, v18
-; GFX6-NEXT:    v_sub_i32_e32 v16, vcc, s4, v16
+; GFX6-NEXT:    v_bfrev_b32_e32 v17, -2
+; GFX6-NEXT:    v_sub_i32_e32 v18, vcc, v19, v18
+; GFX6-NEXT:    v_sub_i32_e32 v16, vcc, v17, v16
 ; GFX6-NEXT:    v_max_i32_e32 v8, v18, v8
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX6-NEXT:    v_min_i32_e32 v8, v8, v16
@@ -3688,21 +3664,19 @@ define <4 x float> @v_saddsat_v8i16(<8 x i16> %lhs, <8 x i16> %rhs) {
 ; GFX6-NEXT:    v_add_i32_e32 v0, vcc, v0, v8
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v8, 16, v9
 ; GFX6-NEXT:    v_max_i32_e32 v9, 0, v1
-; GFX6-NEXT:    v_sub_i32_e32 v16, vcc, s5, v16
-; GFX6-NEXT:    v_sub_i32_e32 v9, vcc, s4, v9
+; GFX6-NEXT:    v_sub_i32_e32 v16, vcc, v19, v16
+; GFX6-NEXT:    v_sub_i32_e32 v9, vcc, v17, v9
 ; GFX6-NEXT:    v_max_i32_e32 v8, v16, v8
 ; GFX6-NEXT:    v_min_i32_e32 v8, v8, v9
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
 ; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v1, v8
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v8, 16, v10
 ; GFX6-NEXT:    v_min_i32_e32 v10, 0, v2
-; GFX6-NEXT:    v_bfrev_b32_e32 v17, -2
 ; GFX6-NEXT:    v_max_i32_e32 v9, 0, v2
-; GFX6-NEXT:    v_sub_i32_e32 v10, vcc, s5, v10
+; GFX6-NEXT:    v_sub_i32_e32 v10, vcc, v19, v10
 ; GFX6-NEXT:    v_sub_i32_e32 v9, vcc, v17, v9
 ; GFX6-NEXT:    v_max_i32_e32 v8, v10, v8
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX6-NEXT:    v_bfrev_b32_e32 v19, 1
 ; GFX6-NEXT:    v_min_i32_e32 v8, v8, v9
 ; GFX6-NEXT:    v_min_i32_e32 v10, 0, v3
 ; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v8
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i32.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i32.ll
index ab000d91a3ef23d..1061f0003bd4896 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i32.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i32.ll
@@ -278,10 +278,10 @@ define i32 @v_sdiv_i32_pow2k_denom(i32 %num) {
 ; CHECK-LABEL: v_sdiv_i32_pow2k_denom:
 ; CHECK:       ; %bb.0:
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_movk_i32 s6, 0x1000
 ; CHECK-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
 ; CHECK-NEXT:    v_rcp_iflag_f32_e32 v2, 0x45800000
 ; CHECK-NEXT:    v_mov_b32_e32 v3, 0xfffff000
+; CHECK-NEXT:    v_mov_b32_e32 v4, 0x1000
 ; CHECK-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
 ; CHECK-NEXT:    v_mul_f32_e32 v2, 0x4f7ffffe, v2
 ; CHECK-NEXT:    v_xor_b32_e32 v0, v0, v1
@@ -291,14 +291,14 @@ define i32 @v_sdiv_i32_pow2k_denom(i32 %num) {
 ; CHECK-NEXT:    v_add_i32_e32 v2, vcc, v2, v3
 ; CHECK-NEXT:    v_mul_hi_u32 v2, v0, v2
 ; CHECK-NEXT:    v_lshlrev_b32_e32 v3, 12, v2
-; CHECK-NEXT:    v_add_i32_e32 v4, vcc, 1, v2
+; CHECK-NEXT:    v_add_i32_e32 v5, vcc, 1, v2
 ; CHECK-NEXT:    v_sub_i32_e32 v0, vcc, v0, v3
-; CHECK-NEXT:    v_cmp_le_u32_e64 s[4:5], s6, v0
-; CHECK-NEXT:    v_cndmask_b32_e64 v2, v2, v4, s[4:5]
-; CHECK-NEXT:    v_subrev_i32_e32 v3, vcc, s6, v0
+; CHECK-NEXT:    v_cmp_ge_u32_e64 s[4:5], v0, v4
+; CHECK-NEXT:    v_cndmask_b32_e64 v2, v2, v5, s[4:5]
+; CHECK-NEXT:    v_subrev_i32_e32 v3, vcc, 0x1000, v0
 ; CHECK-NEXT:    v_cndmask_b32_e64 v0, v0, v3, s[4:5]
 ; CHECK-NEXT:    v_add_i32_e32 v3, vcc, 1, v2
-; CHECK-NEXT:    v_cmp_le_u32_e32 vcc, s6, v0
+; CHECK-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v4
 ; CHECK-NEXT:    v_cndmask_b32_e32 v0, v2, v3, vcc
 ; CHECK-NEXT:    v_xor_b32_e32 v0, v0, v1
 ; CHECK-NEXT:    v_sub_i32_e32 v0, vcc, v0, v1
@@ -312,7 +312,6 @@ define <2 x i32> @v_sdiv_v2i32_pow2k_denom(<2 x i32> %num) {
 ; GISEL:       ; %bb.0:
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    v_ashrrev_i32_e32 v2, 31, v0
-; GISEL-NEXT:    s_movk_i32 s8, 0x1000
 ; GISEL-NEXT:    v_mov_b32_e32 v3, 0x1000
 ; GISEL-NEXT:    v_cvt_f32_u32_e32 v4, 0x1000
 ; GISEL-NEXT:    v_mov_b32_e32 v5, 0xfffff000
@@ -321,18 +320,13 @@ define <2 x i32> @v_sdiv_v2i32_pow2k_denom(<2 x i32> %num) {
 ; GISEL-NEXT:    v_rcp_iflag_f32_e32 v4, v4
 ; GISEL-NEXT:    v_add_i32_e32 v1, vcc, v1, v6
 ; GISEL-NEXT:    v_xor_b32_e32 v0, v0, v2
-; GISEL-NEXT:    v_mul_f32_e32 v7, 0x4f7ffffe, v4
-; GISEL-NEXT:    v_xor_b32_e32 v1, v1, v6
 ; GISEL-NEXT:    v_mul_f32_e32 v4, 0x4f7ffffe, v4
-; GISEL-NEXT:    v_cvt_u32_f32_e32 v7, v7
+; GISEL-NEXT:    v_xor_b32_e32 v1, v1, v6
 ; GISEL-NEXT:    v_cvt_u32_f32_e32 v4, v4
-; GISEL-NEXT:    v_mul_lo_u32 v8, v7, v5
 ; GISEL-NEXT:    v_mul_lo_u32 v5, v4, v5
-; GISEL-NEXT:    v_mul_hi_u32 v8, v7, v8
 ; GISEL-NEXT:    v_mul_hi_u32 v5, v4, v5
-; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v7, v8
 ; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v4, v5
-; GISEL-NEXT:    v_mul_hi_u32 v5, v0, v7
+; GISEL-NEXT:    v_mul_hi_u32 v5, v0, v4
 ; GISEL-NEXT:    v_mul_hi_u32 v4, v1, v4
 ; GISEL-NEXT:    v_lshlrev_b32_e32 v7, 12, v5
 ; GISEL-NEXT:    v_add_i32_e32 v8, vcc, 1, v5
@@ -340,19 +334,19 @@ define <2 x i32> @v_sdiv_v2i32_pow2k_denom(<2 x i32> %num) {
 ; GISEL-NEXT:    v_add_i32_e32 v10, vcc, 1, v4
 ; GISEL-NEXT:    v_sub_i32_e32 v0, vcc, v0, v7
 ; GISEL-NEXT:    v_sub_i32_e32 v1, vcc, v1, v9
-; GISEL-NEXT:    v_cmp_le_u32_e64 s[4:5], s8, v0
+; GISEL-NEXT:    v_cmp_ge_u32_e64 s[4:5], v0, v3
 ; GISEL-NEXT:    v_cndmask_b32_e64 v5, v5, v8, s[4:5]
-; GISEL-NEXT:    v_subrev_i32_e32 v7, vcc, s8, v0
-; GISEL-NEXT:    v_cmp_le_u32_e64 s[6:7], s8, v1
+; GISEL-NEXT:    v_sub_i32_e32 v7, vcc, v0, v3
+; GISEL-NEXT:    v_cmp_ge_u32_e64 s[6:7], v1, v3
 ; GISEL-NEXT:    v_cndmask_b32_e64 v4, v4, v10, s[6:7]
-; GISEL-NEXT:    v_subrev_i32_e32 v8, vcc, s8, v1
+; GISEL-NEXT:    v_subrev_i32_e32 v8, vcc, 0x1000, v1
 ; GISEL-NEXT:    v_cndmask_b32_e64 v0, v0, v7, s[4:5]
 ; GISEL-NEXT:    v_add_i32_e32 v7, vcc, 1, v5
 ; GISEL-NEXT:    v_cndmask_b32_e64 v1, v1, v8, s[6:7]
 ; GISEL-NEXT:    v_add_i32_e32 v8, vcc, 1, v4
 ; GISEL-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v3
 ; GISEL-NEXT:    v_cndmask_b32_e32 v0, v5, v7, vcc
-; GISEL-NEXT:    v_cmp_le_u32_e32 vcc, s8, v1
+; GISEL-NEXT:    v_cmp_ge_u32_e32 vcc, v1, v3
 ; GISEL-NEXT:    v_cndmask_b32_e32 v1, v4, v8, vcc
 ; GISEL-NEXT:    v_xor_b32_e32 v0, v0, v2
 ; GISEL-NEXT:    v_xor_b32_e32 v1, v1, v6
@@ -363,50 +357,42 @@ define <2 x i32> @v_sdiv_v2i32_pow2k_denom(<2 x i32> %num) {
 ; CGP-LABEL: v_sdiv_v2i32_pow2k_denom:
 ; CGP:       ; %bb.0:
 ; CGP-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CGP-NEXT:    s_movk_i32 s8, 0x1000
 ; CGP-NEXT:    v_ashrrev_i32_e32 v2, 31, v0
 ; CGP-NEXT:    v_rcp_iflag_f32_e32 v3, 0x45800000
-; CGP-NEXT:    s_movk_i32 s4, 0xf000
 ; CGP-NEXT:    v_mov_b32_e32 v4, 0xfffff000
 ; CGP-NEXT:    v_mov_b32_e32 v5, 0x1000
 ; CGP-NEXT:    v_ashrrev_i32_e32 v6, 31, v1
-; CGP-NEXT:    v_rcp_iflag_f32_e32 v7, 0x45800000
 ; CGP-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
 ; CGP-NEXT:    v_mul_f32_e32 v3, 0x4f7ffffe, v3
 ; CGP-NEXT:    v_add_i32_e32 v1, vcc, v1, v6
-; CGP-NEXT:    v_mul_f32_e32 v7, 0x4f7ffffe, v7
 ; CGP-NEXT:    v_xor_b32_e32 v0, v0, v2
 ; CGP-NEXT:    v_cvt_u32_f32_e32 v3, v3
 ; CGP-NEXT:    v_xor_b32_e32 v1, v1, v6
-; CGP-NEXT:    v_cvt_u32_f32_e32 v7, v7
-; CGP-NEXT:    v_mul_lo_u32 v8, v3, s4
-; CGP-NEXT:    v_mul_lo_u32 v4, v7, v4
-; CGP-NEXT:    v_mul_hi_u32 v8, v3, v8
-; CGP-NEXT:    v_mul_hi_u32 v4, v7, v4
-; CGP-NEXT:    v_add_i32_e32 v3, vcc, v3, v8
-; CGP-NEXT:    v_add_i32_e32 v4, vcc, v7, v4
-; CGP-NEXT:    v_mul_hi_u32 v3, v0, v3
-; CGP-NEXT:    v_mul_hi_u32 v4, v1, v4
-; CGP-NEXT:    v_lshlrev_b32_e32 v7, 12, v3
-; CGP-NEXT:    v_add_i32_e32 v8, vcc, 1, v3
-; CGP-NEXT:    v_lshlrev_b32_e32 v9, 12, v4
-; CGP-NEXT:    v_add_i32_e32 v10, vcc, 1, v4
+; CGP-NEXT:    v_mul_lo_u32 v4, v3, v4
+; CGP-NEXT:    v_mul_hi_u32 v4, v3, v4
+; CGP-NEXT:    v_add_i32_e32 v3, vcc, v3, v4
+; CGP-NEXT:    v_mul_hi_u32 v4, v0, v3
+; CGP-NEXT:    v_mul_hi_u32 v3, v1, v3
+; CGP-NEXT:    v_lshlrev_b32_e32 v7, 12, v4
+; CGP-NEXT:    v_add_i32_e32 v8, vcc, 1, v4
+; CGP-NEXT:    v_lshlrev_b32_e32 v9, 12, v3
+; CGP-NEXT:    v_add_i32_e32 v10, vcc, 1, v3
 ; CGP-NEXT:    v_sub_i32_e32 v0, vcc, v0, v7
 ; CGP-NEXT:    v_sub_i32_e32 v1, vcc, v1, v9
-; CGP-NEXT:    v_cmp_le_u32_e64 s[4:5], s8, v0
-; CGP-NEXT:    v_cndmask_b32_e64 v3, v3, v8, s[4:5]
-; CGP-NEXT:    v_subrev_i32_e32 v7, vcc, s8, v0
+; CGP-NEXT:    v_cmp_ge_u32_e64 s[4:5], v0, v5
+; CGP-NEXT:    v_cndmask_b32_e64 v4, v4, v8, s[4:5]
+; CGP-NEXT:    v_sub_i32_e32 v7, vcc, v0, v5
 ; CGP-NEXT:    v_cmp_ge_u32_e64 s[6:7], v1, v5
-; CGP-NEXT:    v_cndmask_b32_e64 v4, v4, v10, s[6:7]
+; CGP-NEXT:    v_cndmask_b32_e64 v3, v3, v10, s[6:7]
 ; CGP-NEXT:    v_subrev_i32_e32 v8, vcc, 0x1000, v1
 ; CGP-NEXT:    v_cndmask_b32_e64 v0, v0, v7, s[4:5]
-; CGP-NEXT:    v_add_i32_e32 v7, vcc, 1, v3
+; CGP-NEXT:    v_add_i32_e32 v7, vcc, 1, v4
 ; CGP-NEXT:    v_cndmask_b32_e64 v1, v1, v8, s[6:7]
-; CGP-NEXT:    v_add_i32_e32 v8, vcc, 1, v4
-; CGP-NEXT:    v_cmp_le_u32_e32 vcc, s8, v0
-; CGP-NEXT:    v_cndmask_b32_e32 v0, v3, v7, vcc
+; CGP-NEXT:    v_add_i32_e32 v8, vcc, 1, v3
+; CGP-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v5
+; CGP-NEXT:    v_cndmask_b32_e32 v0, v4, v7, vcc
 ; CGP-NEXT:    v_cmp_ge_u32_e32 vcc, v1, v5
-; CGP-NEXT:    v_cndmask_b32_e32 v1, v4, v8, vcc
+; CGP-NEXT:    v_cndmask_b32_e32 v1, v3, v8, vcc
 ; CGP-NEXT:    v_xor_b32_e32 v0, v0, v2
 ; CGP-NEXT:    v_xor_b32_e32 v1, v1, v6
 ; CGP-NEXT:    v_sub_i32_e32 v0, vcc, v0, v2
@@ -420,10 +406,10 @@ define i32 @v_sdiv_i32_oddk_denom(i32 %num) {
 ; CHECK-LABEL: v_sdiv_i32_oddk_denom:
 ; CHECK:       ; %bb.0:
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_mov_b32 s6, 0x12d8fb
 ; CHECK-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
 ; CHECK-NEXT:    v_rcp_iflag_f32_e32 v2, 0x4996c7d8
 ; CHECK-NEXT:    v_mov_b32_e32 v3, 0xffed2705
+; CHECK-NEXT:    v_mov_b32_e32 v4, 0x12d8fb
 ; CHECK-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
 ; CHECK-NEXT:    v_mul_f32_e32 v2, 0x4f7ffffe, v2
 ; CHECK-NEXT:    v_xor_b32_e32 v0, v0, v1
@@ -432,15 +418,15 @@ define i32 @v_sdiv_i32_oddk_denom(i32 %num) {
 ; CHECK-NEXT:    v_mul_hi_u32 v3, v2, v3
 ; CHECK-NEXT:    v_add_i32_e32 v2, vcc, v2, v3
 ; CHECK-NEXT:    v_mul_hi_u32 v2, v0, v2
-; CHECK-NEXT:    v_mul_lo_u32 v3, v2, s6
-; CHECK-NEXT:    v_add_i32_e32 v4, vcc, 1, v2
+; CHECK-NEXT:    v_mul_lo_u32 v3, v2, v4
+; CHECK-NEXT:    v_add_i32_e32 v5, vcc, 1, v2
 ; CHECK-NEXT:    v_sub_i32_e32 v0, vcc, v0, v3
-; CHECK-NEXT:    v_cmp_le_u32_e64 s[4:5], s6, v0
-; CHECK-NEXT:    v_cndmask_b32_e64 v2, v2, v4, s[4:5]
-; CHECK-NEXT:    v_subrev_i32_e32 v3, vcc, s6, v0
+; CHECK-NEXT:    v_cmp_ge_u32_e64 s[4:5], v0, v4
+; CHECK-NEXT:    v_cndmask_b32_e64 v2, v2, v5, s[4:5]
+; CHECK-NEXT:    v_subrev_i32_e32 v3, vcc, 0x12d8fb, v0
 ; CHECK-NEXT:    v_cndmask_b32_e64 v0, v0, v3, s[4:5]
 ; CHECK-NEXT:    v_add_i32_e32 v3, vcc, 1, v2
-; CHECK-NEXT:    v_cmp_le_u32_e32 vcc, s6, v0
+; CHECK-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v4
 ; CHECK-NEXT:    v_cndmask_b32_e32 v0, v2, v3, vcc
 ; CHECK-NEXT:    v_xor_b32_e32 v0, v0, v1
 ; CHECK-NEXT:    v_sub_i32_e32 v0, vcc, v0, v1
@@ -454,100 +440,87 @@ define <2 x i32> @v_sdiv_v2i32_oddk_denom(<2 x i32> %num) {
 ; GISEL:       ; %bb.0:
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    v_ashrrev_i32_e32 v2, 31, v0
-; GISEL-NEXT:    s_mov_b32 s8, 0x12d8fb
-; GISEL-NEXT:    v_cvt_f32_u32_e32 v3, 0x12d8fb
-; GISEL-NEXT:    v_mov_b32_e32 v4, 0xffed2705
-; GISEL-NEXT:    v_ashrrev_i32_e32 v5, 31, v1
+; GISEL-NEXT:    v_mov_b32_e32 v3, 0x12d8fb
+; GISEL-NEXT:    v_cvt_f32_u32_e32 v4, 0x12d8fb
+; GISEL-NEXT:    v_mov_b32_e32 v5, 0xffed2705
+; GISEL-NEXT:    v_ashrrev_i32_e32 v6, 31, v1
 ; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
-; GISEL-NEXT:    v_rcp_iflag_f32_e32 v3, v3
-; GISEL-NEXT:    v_add_i32_e32 v1, vcc, v1, v5
+; GISEL-NEXT:    v_rcp_iflag_f32_e32 v4, v4
+; GISEL-NEXT:    v_add_i32_e32 v1, vcc, v1, v6
 ; GISEL-NEXT:    v_xor_b32_e32 v0, v0, v2
-; GISEL-NEXT:    v_mul_f32_e32 v6, 0x4f7ffffe, v3
-; GISEL-NEXT:    v_xor_b32_e32 v1, v1, v5
-; GISEL-NEXT:    v_mul_f32_e32 v3, 0x4f7ffffe, v3
-; GISEL-NEXT:    v_cvt_u32_f32_e32 v6, v6
-; GISEL-NEXT:    v_cvt_u32_f32_e32 v3, v3
-; GISEL-NEXT:    v_mul_lo_u32 v7, v6, v4
-; GISEL-NEXT:    v_mul_lo_u32 v4, v3, v4
-; GISEL-NEXT:    v_mul_hi_u32 v7, v6, v7
-; GISEL-NEXT:    v_mul_hi_u32 v4, v3, v4
-; GISEL-NEXT:    v_add_i32_e32 v6, vcc, v6, v7
-; GISEL-NEXT:    v_add_i32_e32 v3, vcc, v3, v4
-; GISEL-NEXT:    v_mul_hi_u32 v4, v0, v6
-; GISEL-NEXT:    v_mul_hi_u32 v3, v1, v3
-; GISEL-NEXT:    v_mul_lo_u32 v6, v4, s8
-; GISEL-NEXT:    v_add_i32_e32 v7, vcc, 1, v4
-; GISEL-NEXT:    v_mul_lo_u32 v8, v3, s8
-; GISEL-NEXT:    v_add_i32_e32 v9, vcc, 1, v3
-; GISEL-NEXT:    v_sub_i32_e32 v0, vcc, v0, v6
-; GISEL-NEXT:    v_sub_i32_e32 v1, vcc, v1, v8
-; GISEL-NEXT:    v_cmp_le_u32_e64 s[4:5], s8, v0
-; GISEL-NEXT:    v_cndmask_b32_e64 v4, v4, v7, s[4:5]
-; GISEL-NEXT:    v_subrev_i32_e32 v6, vcc, s8, v0
-; GISEL-NEXT:    v_cmp_le_u32_e64 s[6:7], s8, v1
-; GISEL-NEXT:    v_cndmask_b32_e64 v3, v3, v9, s[6:7]
-; GISEL-NEXT:    v_subrev_i32_e32 v7, vcc, s8, v1
-; GISEL-NEXT:    v_cndmask_b32_e64 v0, v0, v6, s[4:5]
-; GISEL-NEXT:    v_add_i32_e32 v6, vcc, 1, v4
-; GISEL-NEXT:    v_cndmask_b32_e64 v1, v1, v7, s[6:7]
-; GISEL-NEXT:    v_add_i32_e32 v7, vcc, 1, v3
-; GISEL-NEXT:    v_cmp_le_u32_e32 vcc, s8, v0
-; GISEL-NEXT:    v_cndmask_b32_e32 v0, v4, v6, vcc
-; GISEL-NEXT:    v_cmp_le_u32_e32 vcc, s8, v1
-; GISEL-NEXT:    v_cndmask_b32_e32 v1, v3, v7, vcc
+; GISEL-NEXT:    v_mul_f32_e32 v4, 0x4f7ffffe, v4
+; GISEL-NEXT:    v_xor_b32_e32 v1, v1, v6
+; GISEL-NEXT:    v_cvt_u32_f32_e32 v4, v4
+; GISEL-NEXT:    v_mul_lo_u32 v5, v4, v5
+; GISEL-NEXT:    v_mul_hi_u32 v5, v4, v5
+; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v4, v5
+; GISEL-NEXT:    v_mul_hi_u32 v5, v0, v4
+; GISEL-NEXT:    v_mul_hi_u32 v4, v1, v4
+; GISEL-NEXT:    v_mul_lo_u32 v7, v5, v3
+; GISEL-NEXT:    v_add_i32_e32 v8, vcc, 1, v5
+; GISEL-NEXT:    v_mul_lo_u32 v9, v4, v3
+; GISEL-NEXT:    v_add_i32_e32 v10, vcc, 1, v4
+; GISEL-NEXT:    v_sub_i32_e32 v0, vcc, v0, v7
+; GISEL-NEXT:    v_sub_i32_e32 v1, vcc, v1, v9
+; GISEL-NEXT:    v_cmp_ge_u32_e64 s[4:5], v0, v3
+; GISEL-NEXT:    v_cndmask_b32_e64 v5, v5, v8, s[4:5]
+; GISEL-NEXT:    v_sub_i32_e32 v7, vcc, v0, v3
+; GISEL-NEXT:    v_cmp_ge_u32_e64 s[6:7], v1, v3
+; GISEL-NEXT:    v_cndmask_b32_e64 v4, v4, v10, s[6:7]
+; GISEL-NEXT:    v_subrev_i32_e32 v8, vcc, 0x12d8fb, v1
+; GISEL-NEXT:    v_cndmask_b32_e64 v0, v0, v7, s[4:5]
+; GISEL-NEXT:    v_add_i32_e32 v7, vcc, 1, v5
+; GISEL-NEXT:    v_cndmask_b32_e64 v1, v1, v8, s[6:7]
+; GISEL-NEXT:    v_add_i32_e32 v8, vcc, 1, v4
+; GISEL-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v3
+; GISEL-NEXT:    v_cndmask_b32_e32 v0, v5, v7, vcc
+; GISEL-NEXT:    v_cmp_ge_u32_e32 vcc, v1, v3
+; GISEL-NEXT:    v_cndmask_b32_e32 v1, v4, v8, vcc
 ; GISEL-NEXT:    v_xor_b32_e32 v0, v0, v2
-; GISEL-NEXT:    v_xor_b32_e32 v1, v1, v5
+; GISEL-NEXT:    v_xor_b32_e32 v1, v1, v6
 ; GISEL-NEXT:    v_sub_i32_e32 v0, vcc, v0, v2
-; GISEL-NEXT:    v_sub_i32_e32 v1, vcc, v1, v5
+; GISEL-NEXT:    v_sub_i32_e32 v1, vcc, v1, v6
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; CGP-LABEL: v_sdiv_v2i32_oddk_denom:
 ; CGP:       ; %bb.0:
 ; CGP-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CGP-NEXT:    s_mov_b32 s8, 0x12d8fb
 ; CGP-NEXT:    v_ashrrev_i32_e32 v2, 31, v0
 ; CGP-NEXT:    v_rcp_iflag_f32_e32 v3, 0x4996c7d8
-; CGP-NEXT:    s_mov_b32 s4, 0xffed2705
 ; CGP-NEXT:    v_mov_b32_e32 v4, 0xffed2705
 ; CGP-NEXT:    v_mov_b32_e32 v5, 0x12d8fb
 ; CGP-NEXT:    v_ashrrev_i32_e32 v6, 31, v1
-; CGP-NEXT:    v_rcp_iflag_f32_e32 v7, 0x4996c7d8
 ; CGP-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
 ; CGP-NEXT:    v_mul_f32_e32 v3, 0x4f7ffffe, v3
 ; CGP-NEXT:    v_add_i32_e32 v1, vcc, v1, v6
-; CGP-NEXT:    v_mul_f32_e32 v7, 0x4f7ffffe, v7
 ; CGP-NEXT:    v_xor_b32_e32 v0, v0, v2
 ; CGP-NEXT:    v_cvt_u32_f32_e32 v3, v3
 ; CGP-NEXT:    v_xor_b32_e32 v1, v1, v6
-; CGP-NEXT:    v_cvt_u32_f32_e32 v7, v7
-; CGP-NEXT:    v_mul_lo_u32 v8, v3, s4
-; CGP-NEXT:    v_mul_lo_u32 v4, v7, v4
-; CGP-NEXT:    v_mul_hi_u32 v8, v3, v8
-; CGP-NEXT:    v_mul_hi_u32 v4, v7, v4
-; CGP-NEXT:    v_add_i32_e32 v3, vcc, v3, v8
-; CGP-NEXT:    v_add_i32_e32 v4, vcc, v7, v4
-; CGP-NEXT:    v_mul_hi_u32 v3, v0, v3
-; CGP-NEXT:    v_mul_hi_u32 v4, v1, v4
-; CGP-NEXT:    v_mul_lo_u32 v7, v3, s8
-; CGP-NEXT:    v_add_i32_e32 v8, vcc, 1, v3
-; CGP-NEXT:    v_mul_lo_u32 v9, v4, v5
-; CGP-NEXT:    v_add_i32_e32 v10, vcc, 1, v4
+; CGP-NEXT:    v_mul_lo_u32 v4, v3, v4
+; CGP-NEXT:    v_mul_hi_u32 v4, v3, v4
+; CGP-NEXT:    v_add_i32_e32 v3, vcc, v3, v4
+; CGP-NEXT:    v_mul_hi_u32 v4, v0, v3
+; CGP-NEXT:    v_mul_hi_u32 v3, v1, v3
+; CGP-NEXT:    v_mul_lo_u32 v7, v4, v5
+; CGP-NEXT:    v_add_i32_e32 v8, vcc, 1, v4
+; CGP-NEXT:    v_mul_lo_u32 v9, v3, v5
+; CGP-NEXT:    v_add_i32_e32 v10, vcc, 1, v3
 ; CGP-NEXT:    v_sub_i32_e32 v0, vcc, v0, v7
 ; CGP-NEXT:    v_sub_i32_e32 v1, vcc, v1, v9
-; CGP-NEXT:    v_cmp_le_u32_e64 s[4:5], s8, v0
-; CGP-NEXT:    v_cndmask_b32_e64 v3, v3, v8, s[4:5]
-; CGP-NEXT:    v_subrev_i32_e32 v7, vcc, s8, v0
+; CGP-NEXT:    v_cmp_ge_u32_e64 s[4:5], v0, v5
+; CGP-NEXT:    v_cndmask_b32_e64 v4, v4, v8, s[4:5]
+; CGP-NEXT:    v_sub_i32_e32 v7, vcc, v0, v5
 ; CGP-NEXT:    v_cmp_ge_u32_e64 s[6:7], v1, v5
-; CGP-NEXT:    v_cndmask_b32_e64 v4, v4, v10, s[6:7]
+; CGP-NEXT:    v_cndmask_b32_e64 v3, v3, v10, s[6:7]
 ; CGP-NEXT:    v_subrev_i32_e32 v8, vcc, 0x12d8fb, v1
 ; CGP-NEXT:    v_cndmask_b32_e64 v0, v0, v7, s[4:5]
-; CGP-NEXT:    v_add_i32_e32 v7, vcc, 1, v3
+; CGP-NEXT:    v_add_i32_e32 v7, vcc, 1, v4
 ; CGP-NEXT:    v_cndmask_b32_e64 v1, v1, v8, s[6:7]
-; CGP-NEXT:    v_add_i32_e32 v8, vcc, 1, v4
-; CGP-NEXT:    v_cmp_le_u32_e32 vcc, s8, v0
-; CGP-NEXT:    v_cndmask_b32_e32 v0, v3, v7, vcc
+; CGP-NEXT:    v_add_i32_e32 v8, vcc, 1, v3
+; CGP-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v5
+; CGP-NEXT:    v_cndmask_b32_e32 v0, v4, v7, vcc
 ; CGP-NEXT:    v_cmp_ge_u32_e32 vcc, v1, v5
-; CGP-NEXT:    v_cndmask_b32_e32 v1, v4, v8, vcc
+; CGP-NEXT:    v_cndmask_b32_e32 v1, v3, v8, vcc
 ; CGP-NEXT:    v_xor_b32_e32 v0, v0, v2
 ; CGP-NEXT:    v_xor_b32_e32 v1, v1, v6
 ; CGP-NEXT:    v_sub_i32_e32 v0, vcc, v0, v2
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll
index 9dacdbc46be1948..3eb6f1eced0957f 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll
@@ -1001,7 +1001,7 @@ define i64 @v_sdiv_i64_pow2k_denom(i64 %num) {
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    v_cvt_f32_u32_e32 v2, 0x1000
 ; CHECK-NEXT:    v_cvt_f32_ubyte0_e32 v3, 0
-; CHECK-NEXT:    s_movk_i32 s6, 0xf000
+; CHECK-NEXT:    v_mov_b32_e32 v6, 0xfffff000
 ; CHECK-NEXT:    v_mac_f32_e32 v2, 0x4f800000, v3
 ; CHECK-NEXT:    v_rcp_iflag_f32_e32 v2, v2
 ; CHECK-NEXT:    v_mul_f32_e32 v2, 0x5f7ffffc, v2
@@ -1009,118 +1009,116 @@ define i64 @v_sdiv_i64_pow2k_denom(i64 %num) {
 ; CHECK-NEXT:    v_trunc_f32_e32 v4, v3
 ; CHECK-NEXT:    v_mac_f32_e32 v2, 0xcf800000, v4
 ; CHECK-NEXT:    v_cvt_u32_f32_e32 v5, v2
-; CHECK-NEXT:    v_mov_b32_e32 v2, 0xfffff000
-; CHECK-NEXT:    v_cvt_u32_f32_e32 v6, v4
-; CHECK-NEXT:    v_mad_u64_u32 v[2:3], s[4:5], v2, v5, 0
-; CHECK-NEXT:    v_mad_u64_u32 v[3:4], s[4:5], s6, v6, v[3:4]
-; CHECK-NEXT:    v_mul_hi_u32 v7, v5, v2
+; CHECK-NEXT:    v_cvt_u32_f32_e32 v7, v4
+; CHECK-NEXT:    v_mad_u64_u32 v[2:3], s[4:5], v6, v5, 0
+; CHECK-NEXT:    v_mad_u64_u32 v[3:4], s[4:5], v6, v7, v[3:4]
+; CHECK-NEXT:    v_mul_hi_u32 v8, v5, v2
 ; CHECK-NEXT:    v_mad_u64_u32 v[3:4], s[4:5], -1, v5, v[3:4]
-; CHECK-NEXT:    v_mul_lo_u32 v4, v6, v2
-; CHECK-NEXT:    v_mul_hi_u32 v2, v6, v2
-; CHECK-NEXT:    v_mul_lo_u32 v8, v5, v3
-; CHECK-NEXT:    v_mul_lo_u32 v9, v6, v3
-; CHECK-NEXT:    v_mul_hi_u32 v10, v5, v3
-; CHECK-NEXT:    v_mul_hi_u32 v3, v6, v3
+; CHECK-NEXT:    v_mul_lo_u32 v4, v7, v2
+; CHECK-NEXT:    v_mul_hi_u32 v2, v7, v2
+; CHECK-NEXT:    v_mul_lo_u32 v9, v5, v3
+; CHECK-NEXT:    v_mul_lo_u32 v10, v7, v3
+; CHECK-NEXT:    v_mul_hi_u32 v11, v5, v3
+; CHECK-NEXT:    v_mul_hi_u32 v3, v7, v3
+; CHECK-NEXT:    v_add_i32_e32 v4, vcc, v4, v9
+; CHECK-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
+; CHECK-NEXT:    v_add_i32_e32 v2, vcc, v10, v2
+; CHECK-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
 ; CHECK-NEXT:    v_add_i32_e32 v4, vcc, v4, v8
-; CHECK-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
-; CHECK-NEXT:    v_add_i32_e32 v4, vcc, v4, v7
 ; CHECK-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc
-; CHECK-NEXT:    v_add_i32_e32 v4, vcc, v8, v4
-; CHECK-NEXT:    v_add_i32_e32 v2, vcc, v9, v2
-; CHECK-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
-; CHECK-NEXT:    v_add_i32_e32 v2, vcc, v2, v10
+; CHECK-NEXT:    v_add_i32_e32 v4, vcc, v9, v4
+; CHECK-NEXT:    v_add_i32_e32 v2, vcc, v2, v11
 ; CHECK-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
-; CHECK-NEXT:    v_add_i32_e32 v7, vcc, v7, v8
+; CHECK-NEXT:    v_add_i32_e32 v8, vcc, v10, v8
 ; CHECK-NEXT:    v_add_i32_e32 v2, vcc, v2, v4
 ; CHECK-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc
-; CHECK-NEXT:    v_add_i32_e32 v4, vcc, v7, v4
+; CHECK-NEXT:    v_add_i32_e32 v4, vcc, v8, v4
 ; CHECK-NEXT:    v_add_i32_e32 v3, vcc, v3, v4
 ; CHECK-NEXT:    v_add_i32_e32 v5, vcc, v5, v2
-; CHECK-NEXT:    v_addc_u32_e32 v6, vcc, v6, v3, vcc
-; CHECK-NEXT:    v_mad_u64_u32 v[2:3], s[4:5], s6, v5, 0
-; CHECK-NEXT:    v_ashrrev_i32_e32 v7, 31, v1
-; CHECK-NEXT:    v_add_i32_e32 v8, vcc, v0, v7
-; CHECK-NEXT:    v_mad_u64_u32 v[3:4], s[4:5], s6, v6, v[3:4]
-; CHECK-NEXT:    v_addc_u32_e32 v9, vcc, v1, v7, vcc
-; CHECK-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], -1, v5, v[3:4]
-; CHECK-NEXT:    v_xor_b32_e32 v3, v8, v7
-; CHECK-NEXT:    v_mul_lo_u32 v1, v6, v2
-; CHECK-NEXT:    v_mul_lo_u32 v8, v5, v0
-; CHECK-NEXT:    v_xor_b32_e32 v4, v9, v7
-; CHECK-NEXT:    v_mul_hi_u32 v9, v5, v2
-; CHECK-NEXT:    v_mul_hi_u32 v2, v6, v2
+; CHECK-NEXT:    v_addc_u32_e32 v7, vcc, v7, v3, vcc
+; CHECK-NEXT:    v_mad_u64_u32 v[2:3], s[4:5], v6, v5, 0
+; CHECK-NEXT:    v_mad_u64_u32 v[3:4], s[4:5], v6, v7, v[3:4]
+; CHECK-NEXT:    v_ashrrev_i32_e32 v6, 31, v1
+; CHECK-NEXT:    v_add_i32_e32 v0, vcc, v0, v6
+; CHECK-NEXT:    v_mad_u64_u32 v[3:4], s[4:5], -1, v5, v[3:4]
+; CHECK-NEXT:    v_addc_u32_e32 v1, vcc, v1, v6, vcc
+; CHECK-NEXT:    v_xor_b32_e32 v4, v0, v6
+; CHECK-NEXT:    v_mul_lo_u32 v0, v7, v2
+; CHECK-NEXT:    v_mul_lo_u32 v8, v5, v3
+; CHECK-NEXT:    v_xor_b32_e32 v9, v1, v6
+; CHECK-NEXT:    v_mul_hi_u32 v1, v5, v2
+; CHECK-NEXT:    v_mul_hi_u32 v2, v7, v2
+; CHECK-NEXT:    v_add_i32_e32 v0, vcc, v0, v8
+; CHECK-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
+; CHECK-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
+; CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; CHECK-NEXT:    v_mul_lo_u32 v1, v7, v3
+; CHECK-NEXT:    v_add_i32_e32 v0, vcc, v8, v0
+; CHECK-NEXT:    v_mul_hi_u32 v8, v5, v3
+; CHECK-NEXT:    v_add_i32_e32 v1, vcc, v1, v2
+; CHECK-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
 ; CHECK-NEXT:    v_add_i32_e32 v1, vcc, v1, v8
 ; CHECK-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
-; CHECK-NEXT:    v_add_i32_e32 v1, vcc, v1, v9
-; CHECK-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
-; CHECK-NEXT:    v_mul_lo_u32 v9, v6, v0
-; CHECK-NEXT:    v_add_i32_e32 v1, vcc, v8, v1
-; CHECK-NEXT:    v_mul_hi_u32 v8, v5, v0
-; CHECK-NEXT:    v_add_i32_e32 v2, vcc, v9, v2
-; CHECK-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
 ; CHECK-NEXT:    v_add_i32_e32 v2, vcc, v2, v8
-; CHECK-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
-; CHECK-NEXT:    v_add_i32_e32 v8, vcc, v9, v8
-; CHECK-NEXT:    v_mul_hi_u32 v0, v6, v0
+; CHECK-NEXT:    v_mul_hi_u32 v3, v7, v3
+; CHECK-NEXT:    v_add_i32_e32 v0, vcc, v1, v0
+; CHECK-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
 ; CHECK-NEXT:    v_add_i32_e32 v1, vcc, v2, v1
+; CHECK-NEXT:    v_add_i32_e32 v1, vcc, v3, v1
+; CHECK-NEXT:    v_add_i32_e32 v0, vcc, v5, v0
+; CHECK-NEXT:    v_addc_u32_e32 v1, vcc, v7, v1, vcc
+; CHECK-NEXT:    v_mul_lo_u32 v2, v9, v0
+; CHECK-NEXT:    v_mul_lo_u32 v3, v4, v1
+; CHECK-NEXT:    v_mul_hi_u32 v7, v4, v0
+; CHECK-NEXT:    v_mul_hi_u32 v0, v9, v0
+; CHECK-NEXT:    v_mov_b32_e32 v5, 0x1000
+; CHECK-NEXT:    v_add_i32_e32 v2, vcc, v2, v3
+; CHECK-NEXT:    v_cndmask_b32_e64 v3, 0, 1, vcc
+; CHECK-NEXT:    v_add_i32_e32 v2, vcc, v2, v7
 ; CHECK-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
-; CHECK-NEXT:    v_add_i32_e32 v2, vcc, v8, v2
-; CHECK-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
-; CHECK-NEXT:    v_add_i32_e32 v1, vcc, v5, v1
-; CHECK-NEXT:    v_addc_u32_e32 v0, vcc, v6, v0, vcc
-; CHECK-NEXT:    v_mul_lo_u32 v2, v4, v1
-; CHECK-NEXT:    v_mul_lo_u32 v5, v3, v0
-; CHECK-NEXT:    v_mul_hi_u32 v8, v3, v1
-; CHECK-NEXT:    v_mul_hi_u32 v1, v4, v1
-; CHECK-NEXT:    s_movk_i32 s6, 0x1000
-; CHECK-NEXT:    v_add_i32_e32 v2, vcc, v2, v5
-; CHECK-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
-; CHECK-NEXT:    v_add_i32_e32 v2, vcc, v2, v8
-; CHECK-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
-; CHECK-NEXT:    v_mul_lo_u32 v8, v4, v0
-; CHECK-NEXT:    v_add_i32_e32 v2, vcc, v5, v2
-; CHECK-NEXT:    v_mul_hi_u32 v5, v3, v0
-; CHECK-NEXT:    v_add_i32_e32 v1, vcc, v8, v1
-; CHECK-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
-; CHECK-NEXT:    v_add_i32_e32 v1, vcc, v1, v5
-; CHECK-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
-; CHECK-NEXT:    v_add_i32_e32 v5, vcc, v8, v5
-; CHECK-NEXT:    v_add_i32_e32 v8, vcc, v1, v2
-; CHECK-NEXT:    v_mul_hi_u32 v9, v4, v0
-; CHECK-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], s6, v8, 0
+; CHECK-NEXT:    v_mul_lo_u32 v7, v9, v1
+; CHECK-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
+; CHECK-NEXT:    v_mul_hi_u32 v3, v4, v1
+; CHECK-NEXT:    v_add_i32_e32 v0, vcc, v7, v0
+; CHECK-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
+; CHECK-NEXT:    v_add_i32_e32 v0, vcc, v0, v3
+; CHECK-NEXT:    v_cndmask_b32_e64 v3, 0, 1, vcc
+; CHECK-NEXT:    v_add_i32_e32 v3, vcc, v7, v3
+; CHECK-NEXT:    v_add_i32_e32 v7, vcc, v0, v2
+; CHECK-NEXT:    v_mul_hi_u32 v8, v9, v1
+; CHECK-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v5, v7, 0
 ; CHECK-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
-; CHECK-NEXT:    v_add_i32_e32 v2, vcc, v5, v2
-; CHECK-NEXT:    v_add_i32_e32 v5, vcc, v9, v2
-; CHECK-NEXT:    v_mad_u64_u32 v[1:2], s[4:5], s6, v5, v[1:2]
-; CHECK-NEXT:    v_sub_i32_e32 v0, vcc, v3, v0
-; CHECK-NEXT:    v_subb_u32_e64 v2, s[4:5], v4, v1, vcc
-; CHECK-NEXT:    v_sub_i32_e64 v1, s[4:5], v4, v1
-; CHECK-NEXT:    v_mov_b32_e32 v6, 0x1000
+; CHECK-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
+; CHECK-NEXT:    v_add_i32_e32 v3, vcc, v8, v2
+; CHECK-NEXT:    v_mad_u64_u32 v[1:2], s[4:5], v5, v3, v[1:2]
+; CHECK-NEXT:    v_sub_i32_e32 v0, vcc, v4, v0
+; CHECK-NEXT:    v_subb_u32_e64 v2, s[4:5], v9, v1, vcc
+; CHECK-NEXT:    v_sub_i32_e64 v1, s[4:5], v9, v1
 ; CHECK-NEXT:    v_subbrev_u32_e32 v1, vcc, 0, v1, vcc
-; CHECK-NEXT:    v_cmp_ge_u32_e64 s[4:5], v0, v6
-; CHECK-NEXT:    v_sub_i32_e32 v0, vcc, v0, v6
-; CHECK-NEXT:    v_cndmask_b32_e64 v3, 0, -1, s[4:5]
+; CHECK-NEXT:    v_cmp_ge_u32_e64 s[4:5], v0, v5
+; CHECK-NEXT:    v_sub_i32_e32 v0, vcc, v0, v5
+; CHECK-NEXT:    v_cndmask_b32_e64 v4, 0, -1, s[4:5]
 ; CHECK-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v2
 ; CHECK-NEXT:    v_subbrev_u32_e32 v1, vcc, 0, v1, vcc
-; CHECK-NEXT:    v_cndmask_b32_e64 v2, -1, v3, s[4:5]
-; CHECK-NEXT:    v_add_i32_e32 v3, vcc, 1, v8
-; CHECK-NEXT:    v_addc_u32_e32 v4, vcc, 0, v5, vcc
-; CHECK-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v6
+; CHECK-NEXT:    v_cndmask_b32_e64 v2, -1, v4, s[4:5]
+; CHECK-NEXT:    v_add_i32_e32 v4, vcc, 1, v7
+; CHECK-NEXT:    v_addc_u32_e32 v8, vcc, 0, v3, vcc
+; CHECK-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v5
 ; CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc
 ; CHECK-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
 ; CHECK-NEXT:    v_cndmask_b32_e32 v0, -1, v0, vcc
-; CHECK-NEXT:    v_add_i32_e32 v1, vcc, 1, v3
-; CHECK-NEXT:    v_addc_u32_e32 v6, vcc, 0, v4, vcc
+; CHECK-NEXT:    v_add_i32_e32 v1, vcc, 1, v4
+; CHECK-NEXT:    v_addc_u32_e32 v5, vcc, 0, v8, vcc
 ; CHECK-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
-; CHECK-NEXT:    v_cndmask_b32_e32 v0, v3, v1, vcc
-; CHECK-NEXT:    v_cndmask_b32_e32 v1, v4, v6, vcc
+; CHECK-NEXT:    v_cndmask_b32_e32 v0, v4, v1, vcc
+; CHECK-NEXT:    v_cndmask_b32_e32 v1, v8, v5, vcc
 ; CHECK-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v2
-; CHECK-NEXT:    v_cndmask_b32_e32 v0, v8, v0, vcc
-; CHECK-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
-; CHECK-NEXT:    v_xor_b32_e32 v0, v0, v7
-; CHECK-NEXT:    v_xor_b32_e32 v1, v1, v7
-; CHECK-NEXT:    v_sub_i32_e32 v0, vcc, v0, v7
-; CHECK-NEXT:    v_subb_u32_e32 v1, vcc, v1, v7, vcc
+; CHECK-NEXT:    v_cndmask_b32_e32 v0, v7, v0, vcc
+; CHECK-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; CHECK-NEXT:    v_xor_b32_e32 v0, v0, v6
+; CHECK-NEXT:    v_xor_b32_e32 v1, v1, v6
+; CHECK-NEXT:    v_sub_i32_e32 v0, vcc, v0, v6
+; CHECK-NEXT:    v_subb_u32_e32 v1, vcc, v1, v6, vcc
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
   %result = sdiv i64 %num, 4096
   ret i64 %result
@@ -1131,173 +1129,164 @@ define <2 x i64> @v_sdiv_v2i64_pow2k_denom(<2 x i64> %num) {
 ; GISEL:       ; %bb.0:
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    v_cvt_f32_u32_e32 v4, 0x1000
-; GISEL-NEXT:    v_cvt_f32_ubyte0_e32 v6, 0
+; GISEL-NEXT:    v_cvt_f32_ubyte0_e32 v5, 0
 ; GISEL-NEXT:    s_sub_u32 s6, 0, 0x1000
 ; GISEL-NEXT:    s_subb_u32 s7, 0, 0
-; GISEL-NEXT:    v_mac_f32_e32 v4, 0x4f800000, v6
+; GISEL-NEXT:    v_mac_f32_e32 v4, 0x4f800000, v5
 ; GISEL-NEXT:    v_rcp_iflag_f32_e32 v4, v4
 ; GISEL-NEXT:    v_mul_f32_e32 v4, 0x5f7ffffc, v4
 ; GISEL-NEXT:    v_mul_f32_e32 v5, 0x2f800000, v4
 ; GISEL-NEXT:    v_trunc_f32_e32 v7, v5
 ; GISEL-NEXT:    v_mac_f32_e32 v4, 0xcf800000, v7
-; GISEL-NEXT:    v_cvt_u32_f32_e32 v9, v4
-; GISEL-NEXT:    v_cvt_u32_f32_e32 v10, v7
-; GISEL-NEXT:    v_mad_u64_u32 v[4:5], s[4:5], s6, v9, 0
-; GISEL-NEXT:    v_mad_u64_u32 v[7:8], s[4:5], s6, v10, v[5:6]
-; GISEL-NEXT:    v_mul_lo_u32 v5, v10, v4
-; GISEL-NEXT:    v_mad_u64_u32 v[7:8], s[4:5], s7, v9, v[7:8]
-; GISEL-NEXT:    v_mul_hi_u32 v8, v9, v4
-; GISEL-NEXT:    v_mul_hi_u32 v4, v10, v4
-; GISEL-NEXT:    v_mul_lo_u32 v11, v9, v7
-; GISEL-NEXT:    v_mul_lo_u32 v12, v10, v7
-; GISEL-NEXT:    v_mul_hi_u32 v13, v9, v7
-; GISEL-NEXT:    v_mul_hi_u32 v7, v10, v7
-; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v5, v11
-; GISEL-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v5, v8
+; GISEL-NEXT:    v_cvt_u32_f32_e32 v6, v4
+; GISEL-NEXT:    v_cvt_u32_f32_e32 v7, v7
+; GISEL-NEXT:    v_mad_u64_u32 v[4:5], s[4:5], s6, v6, 0
+; GISEL-NEXT:    v_mad_u64_u32 v[8:9], s[4:5], s6, v7, v[5:6]
+; GISEL-NEXT:    v_mul_lo_u32 v5, v7, v4
+; GISEL-NEXT:    v_mad_u64_u32 v[8:9], s[4:5], s7, v6, v[8:9]
+; GISEL-NEXT:    v_mul_hi_u32 v9, v6, v4
+; GISEL-NEXT:    v_mul_hi_u32 v4, v7, v4
+; GISEL-NEXT:    v_mul_lo_u32 v10, v6, v8
+; GISEL-NEXT:    v_mul_lo_u32 v11, v7, v8
+; GISEL-NEXT:    v_mul_hi_u32 v12, v6, v8
+; GISEL-NEXT:    v_mul_hi_u32 v8, v7, v8
+; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v5, v10
+; GISEL-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v5, v9
 ; GISEL-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v11, v5
-; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v12, v4
-; GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v4, v13
-; GISEL-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v8, v11
+; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v10, v5
+; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v11, v4
+; GISEL-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v4, v12
+; GISEL-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v9, v10
 ; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v4, v5
 ; GISEL-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v9, v5
 ; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v8, v5
-; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v7, v5
-; GISEL-NEXT:    v_add_i32_e32 v11, vcc, v9, v4
-; GISEL-NEXT:    v_mad_u64_u32 v[7:8], s[4:5], s6, v11, 0
-; GISEL-NEXT:    v_addc_u32_e32 v5, vcc, v10, v5, vcc
-; GISEL-NEXT:    v_mov_b32_e32 v4, v8
-; GISEL-NEXT:    v_mad_u64_u32 v[8:9], s[4:5], s6, v5, v[4:5]
+; GISEL-NEXT:    v_add_i32_e32 v11, vcc, v6, v4
+; GISEL-NEXT:    v_mad_u64_u32 v[8:9], s[4:5], s6, v11, 0
+; GISEL-NEXT:    v_addc_u32_e32 v5, vcc, v7, v5, vcc
+; GISEL-NEXT:    v_mov_b32_e32 v4, v9
+; GISEL-NEXT:    v_mad_u64_u32 v[9:10], s[4:5], s6, v5, v[4:5]
 ; GISEL-NEXT:    v_ashrrev_i32_e32 v4, 31, v1
 ; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v0, v4
-; GISEL-NEXT:    v_mad_u64_u32 v[8:9], s[4:5], s7, v11, v[8:9]
+; GISEL-NEXT:    v_mad_u64_u32 v[9:10], s[4:5], s7, v11, v[9:10]
 ; GISEL-NEXT:    v_addc_u32_e32 v1, vcc, v1, v4, vcc
-; GISEL-NEXT:    v_xor_b32_e32 v9, v0, v4
-; GISEL-NEXT:    v_mul_lo_u32 v0, v5, v7
-; GISEL-NEXT:    v_mul_lo_u32 v10, v11, v8
-; GISEL-NEXT:    v_xor_b32_e32 v12, v1, v4
-; GISEL-NEXT:    v_mul_hi_u32 v1, v11, v7
-; GISEL-NEXT:    v_mul_hi_u32 v7, v5, v7
-; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v0, v10
-; GISEL-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
+; GISEL-NEXT:    v_xor_b32_e32 v10, v0, v4
+; GISEL-NEXT:    v_mul_lo_u32 v0, v5, v8
+; GISEL-NEXT:    v_mul_lo_u32 v12, v11, v9
+; GISEL-NEXT:    v_xor_b32_e32 v13, v1, v4
+; GISEL-NEXT:    v_mul_hi_u32 v1, v11, v8
+; GISEL-NEXT:    v_mul_hi_u32 v8, v5, v8
+; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v0, v12
+; GISEL-NEXT:    v_cndmask_b32_e64 v12, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
 ; GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
-; GISEL-NEXT:    v_mul_lo_u32 v1, v5, v8
-; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v10, v0
-; GISEL-NEXT:    v_mul_hi_u32 v10, v11, v8
-; GISEL-NEXT:    v_add_i32_e32 v1, vcc, v1, v7
-; GISEL-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v1, vcc, v1, v10
-; GISEL-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v7, v10
-; GISEL-NEXT:    v_mul_hi_u32 v8, v5, v8
+; GISEL-NEXT:    v_mul_lo_u32 v1, v5, v9
+; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v12, v0
+; GISEL-NEXT:    v_mul_hi_u32 v12, v11, v9
+; GISEL-NEXT:    v_add_i32_e32 v1, vcc, v1, v8
+; GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v1, vcc, v1, v12
+; GISEL-NEXT:    v_cndmask_b32_e64 v12, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v8, v12
+; GISEL-NEXT:    v_mul_hi_u32 v9, v5, v9
 ; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v1, v0
 ; GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v1, vcc, v7, v1
 ; GISEL-NEXT:    v_add_i32_e32 v1, vcc, v8, v1
+; GISEL-NEXT:    v_add_i32_e32 v1, vcc, v9, v1
 ; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v11, v0
 ; GISEL-NEXT:    v_addc_u32_e32 v1, vcc, v5, v1, vcc
-; GISEL-NEXT:    v_mul_lo_u32 v7, v12, v0
-; GISEL-NEXT:    v_mul_lo_u32 v8, v9, v1
-; GISEL-NEXT:    v_mul_hi_u32 v10, v9, v0
-; GISEL-NEXT:    v_mul_hi_u32 v0, v12, v0
+; GISEL-NEXT:    v_mul_lo_u32 v8, v13, v0
+; GISEL-NEXT:    v_mul_lo_u32 v9, v10, v1
+; GISEL-NEXT:    v_mul_hi_u32 v11, v10, v0
+; GISEL-NEXT:    v_mul_hi_u32 v0, v13, v0
 ; GISEL-NEXT:    v_mov_b32_e32 v5, 0x1000
-; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v7, v8
+; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v8, v9
+; GISEL-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v8, v11
 ; GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v7, v10
-; GISEL-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
-; GISEL-NEXT:    v_mul_lo_u32 v10, v12, v1
-; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v8, v7
-; GISEL-NEXT:    v_mul_hi_u32 v8, v9, v1
-; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v10, v0
-; GISEL-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v0, v8
+; GISEL-NEXT:    v_mul_lo_u32 v11, v13, v1
+; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v9, v8
+; GISEL-NEXT:    v_mul_hi_u32 v9, v10, v1
+; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v11, v0
+; GISEL-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v0, v9
+; GISEL-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v11, v9
+; GISEL-NEXT:    v_add_i32_e32 v11, vcc, v0, v8
+; GISEL-NEXT:    v_mul_hi_u32 v12, v13, v1
+; GISEL-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v5, v11, 0
 ; GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v10, v8
-; GISEL-NEXT:    v_add_i32_e32 v10, vcc, v0, v7
-; GISEL-NEXT:    v_mul_hi_u32 v11, v12, v1
-; GISEL-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v5, v10, 0
-; GISEL-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v8, v7
-; GISEL-NEXT:    v_add_i32_e32 v11, vcc, v11, v7
-; GISEL-NEXT:    v_mad_u64_u32 v[7:8], s[4:5], v5, v11, v[1:2]
-; GISEL-NEXT:    v_sub_i32_e32 v0, vcc, v9, v0
-; GISEL-NEXT:    v_mad_u64_u32 v[7:8], s[4:5], 0, v10, v[7:8]
+; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v9, v8
+; GISEL-NEXT:    v_add_i32_e32 v12, vcc, v12, v8
+; GISEL-NEXT:    v_mad_u64_u32 v[8:9], s[4:5], v5, v12, v[1:2]
+; GISEL-NEXT:    v_sub_i32_e32 v0, vcc, v10, v0
+; GISEL-NEXT:    v_mad_u64_u32 v[8:9], s[4:5], 0, v11, v[8:9]
 ; GISEL-NEXT:    s_sub_u32 s6, 0, 0x1000
 ; GISEL-NEXT:    s_subb_u32 s7, 0, 0
-; GISEL-NEXT:    v_subb_u32_e64 v1, s[4:5], v12, v7, vcc
-; GISEL-NEXT:    v_sub_i32_e64 v7, s[4:5], v12, v7
+; GISEL-NEXT:    v_subb_u32_e64 v1, s[4:5], v13, v8, vcc
+; GISEL-NEXT:    v_sub_i32_e64 v8, s[4:5], v13, v8
 ; GISEL-NEXT:    v_cmp_ge_u32_e64 s[4:5], v0, v5
-; GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, -1, s[4:5]
+; GISEL-NEXT:    v_cndmask_b32_e64 v9, 0, -1, s[4:5]
 ; GISEL-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v1
-; GISEL-NEXT:    v_subbrev_u32_e32 v1, vcc, 0, v7, vcc
-; GISEL-NEXT:    v_cvt_f32_u32_e32 v7, 0x1000
-; GISEL-NEXT:    v_sub_i32_e32 v0, vcc, v0, v5
+; GISEL-NEXT:    v_subbrev_u32_e32 v1, vcc, 0, v8, vcc
+; GISEL-NEXT:    v_sub_i32_e32 v8, vcc, v0, v5
+; GISEL-NEXT:    v_cndmask_b32_e64 v10, -1, v9, s[4:5]
 ; GISEL-NEXT:    v_subbrev_u32_e32 v9, vcc, 0, v1, vcc
-; GISEL-NEXT:    v_mac_f32_e32 v7, 0x4f800000, v6
-; GISEL-NEXT:    v_rcp_iflag_f32_e32 v1, v7
-; GISEL-NEXT:    v_add_i32_e32 v12, vcc, 1, v10
-; GISEL-NEXT:    v_addc_u32_e32 v13, vcc, 0, v11, vcc
-; GISEL-NEXT:    v_mul_f32_e32 v1, 0x5f7ffffc, v1
-; GISEL-NEXT:    v_mul_f32_e32 v6, 0x2f800000, v1
-; GISEL-NEXT:    v_trunc_f32_e32 v6, v6
-; GISEL-NEXT:    v_mac_f32_e32 v1, 0xcf800000, v6
-; GISEL-NEXT:    v_cvt_u32_f32_e32 v14, v1
-; GISEL-NEXT:    v_cndmask_b32_e64 v8, -1, v8, s[4:5]
-; GISEL-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v5
-; GISEL-NEXT:    v_cvt_u32_f32_e32 v15, v6
-; GISEL-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], s6, v14, 0
-; GISEL-NEXT:    v_cndmask_b32_e64 v7, 0, -1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v13, vcc, 1, v11
+; GISEL-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], s6, v6, 0
+; GISEL-NEXT:    v_addc_u32_e32 v14, vcc, 0, v12, vcc
+; GISEL-NEXT:    v_cmp_ge_u32_e32 vcc, v8, v5
+; GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, -1, vcc
 ; GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v9
-; GISEL-NEXT:    v_cndmask_b32_e32 v9, -1, v7, vcc
-; GISEL-NEXT:    v_mad_u64_u32 v[6:7], s[4:5], s6, v15, v[1:2]
-; GISEL-NEXT:    v_add_i32_e32 v1, vcc, 1, v12
-; GISEL-NEXT:    v_mad_u64_u32 v[6:7], s[4:5], s7, v14, v[6:7]
-; GISEL-NEXT:    v_addc_u32_e32 v16, vcc, 0, v13, vcc
-; GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v9
-; GISEL-NEXT:    v_cndmask_b32_e32 v7, v12, v1, vcc
-; GISEL-NEXT:    v_mul_lo_u32 v1, v15, v0
-; GISEL-NEXT:    v_mul_lo_u32 v9, v14, v6
-; GISEL-NEXT:    v_cndmask_b32_e32 v12, v13, v16, vcc
-; GISEL-NEXT:    v_mul_hi_u32 v13, v14, v0
-; GISEL-NEXT:    v_mul_hi_u32 v0, v15, v0
-; GISEL-NEXT:    v_add_i32_e32 v1, vcc, v1, v9
-; GISEL-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
+; GISEL-NEXT:    v_cndmask_b32_e32 v15, -1, v8, vcc
+; GISEL-NEXT:    v_mad_u64_u32 v[8:9], s[4:5], s6, v7, v[1:2]
+; GISEL-NEXT:    v_add_i32_e32 v1, vcc, 1, v13
+; GISEL-NEXT:    v_mad_u64_u32 v[8:9], s[4:5], s7, v6, v[8:9]
+; GISEL-NEXT:    v_addc_u32_e32 v16, vcc, 0, v14, vcc
+; GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v15
+; GISEL-NEXT:    v_cndmask_b32_e32 v9, v13, v1, vcc
+; GISEL-NEXT:    v_mul_lo_u32 v1, v7, v0
+; GISEL-NEXT:    v_mul_lo_u32 v13, v6, v8
+; GISEL-NEXT:    v_mul_hi_u32 v15, v6, v0
+; GISEL-NEXT:    v_cndmask_b32_e32 v14, v14, v16, vcc
+; GISEL-NEXT:    v_mul_hi_u32 v0, v7, v0
 ; GISEL-NEXT:    v_add_i32_e32 v1, vcc, v1, v13
+; GISEL-NEXT:    v_cndmask_b32_e64 v13, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v1, vcc, v1, v15
 ; GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
-; GISEL-NEXT:    v_mul_lo_u32 v13, v15, v6
-; GISEL-NEXT:    v_add_i32_e32 v1, vcc, v9, v1
-; GISEL-NEXT:    v_mul_hi_u32 v9, v14, v6
-; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v13, v0
+; GISEL-NEXT:    v_mul_lo_u32 v15, v7, v8
+; GISEL-NEXT:    v_add_i32_e32 v1, vcc, v13, v1
+; GISEL-NEXT:    v_mul_hi_u32 v13, v6, v8
+; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v15, v0
+; GISEL-NEXT:    v_cndmask_b32_e64 v15, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v0, v13
 ; GISEL-NEXT:    v_cndmask_b32_e64 v13, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v0, v9
-; GISEL-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v13, v9
-; GISEL-NEXT:    v_mul_hi_u32 v6, v15, v6
+; GISEL-NEXT:    v_add_i32_e32 v13, vcc, v15, v13
+; GISEL-NEXT:    v_mul_hi_u32 v8, v7, v8
 ; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
 ; GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v1, vcc, v9, v1
-; GISEL-NEXT:    v_add_i32_e32 v1, vcc, v6, v1
-; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v14, v0
-; GISEL-NEXT:    v_addc_u32_e32 v13, vcc, v15, v1, vcc
-; GISEL-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], s6, v9, 0
-; GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v8
-; GISEL-NEXT:    v_cndmask_b32_e32 v8, v10, v7, vcc
+; GISEL-NEXT:    v_add_i32_e32 v1, vcc, v13, v1
+; GISEL-NEXT:    v_add_i32_e32 v1, vcc, v8, v1
+; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v6, v0
+; GISEL-NEXT:    v_addc_u32_e32 v13, vcc, v7, v1, vcc
+; GISEL-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], s6, v8, 0
+; GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v10
+; GISEL-NEXT:    v_cndmask_b32_e32 v9, v11, v9, vcc
 ; GISEL-NEXT:    v_mad_u64_u32 v[6:7], s[4:5], s6, v13, v[1:2]
-; GISEL-NEXT:    v_xor_b32_e32 v1, v8, v4
-; GISEL-NEXT:    v_ashrrev_i32_e32 v8, 31, v3
-; GISEL-NEXT:    v_mad_u64_u32 v[6:7], s[4:5], s7, v9, v[6:7]
-; GISEL-NEXT:    v_cndmask_b32_e32 v10, v11, v12, vcc
-; GISEL-NEXT:    v_add_i32_e32 v2, vcc, v2, v8
-; GISEL-NEXT:    v_addc_u32_e32 v3, vcc, v3, v8, vcc
-; GISEL-NEXT:    v_xor_b32_e32 v11, v2, v8
+; GISEL-NEXT:    v_xor_b32_e32 v1, v9, v4
+; GISEL-NEXT:    v_ashrrev_i32_e32 v9, 31, v3
+; GISEL-NEXT:    v_mad_u64_u32 v[6:7], s[4:5], s7, v8, v[6:7]
+; GISEL-NEXT:    v_cndmask_b32_e32 v10, v12, v14, vcc
+; GISEL-NEXT:    v_add_i32_e32 v2, vcc, v2, v9
+; GISEL-NEXT:    v_addc_u32_e32 v3, vcc, v3, v9, vcc
+; GISEL-NEXT:    v_xor_b32_e32 v11, v2, v9
 ; GISEL-NEXT:    v_mul_lo_u32 v2, v13, v0
-; GISEL-NEXT:    v_mul_lo_u32 v7, v9, v6
-; GISEL-NEXT:    v_xor_b32_e32 v12, v3, v8
-; GISEL-NEXT:    v_mul_hi_u32 v3, v9, v0
+; GISEL-NEXT:    v_mul_lo_u32 v7, v8, v6
+; GISEL-NEXT:    v_xor_b32_e32 v12, v3, v9
+; GISEL-NEXT:    v_mul_hi_u32 v3, v8, v0
 ; GISEL-NEXT:    v_mul_hi_u32 v0, v13, v0
 ; GISEL-NEXT:    v_add_i32_e32 v2, vcc, v2, v7
 ; GISEL-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
@@ -1305,7 +1294,7 @@ define <2 x i64> @v_sdiv_v2i64_pow2k_denom(<2 x i64> %num) {
 ; GISEL-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
 ; GISEL-NEXT:    v_mul_lo_u32 v3, v13, v6
 ; GISEL-NEXT:    v_add_i32_e32 v2, vcc, v7, v2
-; GISEL-NEXT:    v_mul_hi_u32 v7, v9, v6
+; GISEL-NEXT:    v_mul_hi_u32 v7, v8, v6
 ; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v3, v0
 ; GISEL-NEXT:    v_cndmask_b32_e64 v3, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v0, v7
@@ -1316,13 +1305,13 @@ define <2 x i64> @v_sdiv_v2i64_pow2k_denom(<2 x i64> %num) {
 ; GISEL-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
 ; GISEL-NEXT:    v_add_i32_e32 v2, vcc, v6, v2
-; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v9, v0
+; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v8, v0
 ; GISEL-NEXT:    v_addc_u32_e32 v2, vcc, v13, v2, vcc
 ; GISEL-NEXT:    v_mul_lo_u32 v3, v12, v0
 ; GISEL-NEXT:    v_mul_lo_u32 v6, v11, v2
 ; GISEL-NEXT:    v_mul_hi_u32 v7, v11, v0
 ; GISEL-NEXT:    v_mul_hi_u32 v0, v12, v0
-; GISEL-NEXT:    v_xor_b32_e32 v9, v10, v4
+; GISEL-NEXT:    v_xor_b32_e32 v8, v10, v4
 ; GISEL-NEXT:    v_add_i32_e32 v3, vcc, v3, v6
 ; GISEL-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v3, vcc, v3, v7
@@ -1344,7 +1333,7 @@ define <2 x i64> @v_sdiv_v2i64_pow2k_denom(<2 x i64> %num) {
 ; GISEL-NEXT:    v_mov_b32_e32 v0, v3
 ; GISEL-NEXT:    v_mad_u64_u32 v[6:7], s[4:5], v5, v13, v[0:1]
 ; GISEL-NEXT:    v_sub_i32_e32 v0, vcc, v1, v4
-; GISEL-NEXT:    v_subb_u32_e32 v1, vcc, v9, v4, vcc
+; GISEL-NEXT:    v_subb_u32_e32 v1, vcc, v8, v4, vcc
 ; GISEL-NEXT:    v_mad_u64_u32 v[3:4], s[4:5], 0, v10, v[6:7]
 ; GISEL-NEXT:    v_sub_i32_e32 v2, vcc, v11, v2
 ; GISEL-NEXT:    v_subb_u32_e64 v4, s[4:5], v12, v3, vcc
@@ -1370,10 +1359,10 @@ define <2 x i64> @v_sdiv_v2i64_pow2k_denom(<2 x i64> %num) {
 ; GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v4
 ; GISEL-NEXT:    v_cndmask_b32_e32 v2, v10, v2, vcc
 ; GISEL-NEXT:    v_cndmask_b32_e32 v3, v13, v3, vcc
-; GISEL-NEXT:    v_xor_b32_e32 v2, v2, v8
-; GISEL-NEXT:    v_xor_b32_e32 v3, v3, v8
-; GISEL-NEXT:    v_sub_i32_e32 v2, vcc, v2, v8
-; GISEL-NEXT:    v_subb_u32_e32 v3, vcc, v3, v8, vcc
+; GISEL-NEXT:    v_xor_b32_e32 v2, v2, v9
+; GISEL-NEXT:    v_xor_b32_e32 v3, v3, v9
+; GISEL-NEXT:    v_sub_i32_e32 v2, vcc, v2, v9
+; GISEL-NEXT:    v_subb_u32_e32 v3, vcc, v3, v9, vcc
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; CGP-LABEL: v_sdiv_v2i64_pow2k_denom:
@@ -1381,220 +1370,206 @@ define <2 x i64> @v_sdiv_v2i64_pow2k_denom(<2 x i64> %num) {
 ; CGP-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CGP-NEXT:    v_cvt_f32_u32_e32 v4, 0x1000
 ; CGP-NEXT:    v_cvt_f32_ubyte0_e32 v5, 0
-; CGP-NEXT:    s_movk_i32 s7, 0xf000
-; CGP-NEXT:    s_movk_i32 s6, 0x1000
+; CGP-NEXT:    v_mov_b32_e32 v6, 0xfffff000
 ; CGP-NEXT:    v_mac_f32_e32 v4, 0x4f800000, v5
 ; CGP-NEXT:    v_rcp_iflag_f32_e32 v4, v4
 ; CGP-NEXT:    v_mul_f32_e32 v4, 0x5f7ffffc, v4
 ; CGP-NEXT:    v_mul_f32_e32 v5, 0x2f800000, v4
-; CGP-NEXT:    v_trunc_f32_e32 v6, v5
-; CGP-NEXT:    v_mac_f32_e32 v4, 0xcf800000, v6
-; CGP-NEXT:    v_cvt_u32_f32_e32 v7, v4
-; CGP-NEXT:    v_mov_b32_e32 v4, 0xfffff000
-; CGP-NEXT:    v_cvt_u32_f32_e32 v8, v6
-; CGP-NEXT:    v_mad_u64_u32 v[4:5], s[4:5], v4, v7, 0
-; CGP-NEXT:    v_mad_u64_u32 v[5:6], s[4:5], s7, v8, v[5:6]
-; CGP-NEXT:    v_mul_hi_u32 v9, v7, v4
-; CGP-NEXT:    v_mad_u64_u32 v[5:6], s[4:5], -1, v7, v[5:6]
-; CGP-NEXT:    v_mul_lo_u32 v6, v8, v4
-; CGP-NEXT:    v_mul_hi_u32 v4, v8, v4
-; CGP-NEXT:    v_mul_lo_u32 v10, v7, v5
-; CGP-NEXT:    v_mul_lo_u32 v11, v8, v5
-; CGP-NEXT:    v_mul_hi_u32 v12, v7, v5
-; CGP-NEXT:    v_mul_hi_u32 v5, v8, v5
-; CGP-NEXT:    v_add_i32_e32 v6, vcc, v6, v10
-; CGP-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v6, vcc, v6, v9
-; CGP-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v6, vcc, v10, v6
-; CGP-NEXT:    v_add_i32_e32 v4, vcc, v11, v4
-; CGP-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v4, vcc, v4, v12
-; CGP-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v9, vcc, v9, v10
-; CGP-NEXT:    v_add_i32_e32 v4, vcc, v4, v6
-; CGP-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v6, vcc, v9, v6
-; CGP-NEXT:    v_add_i32_e32 v5, vcc, v5, v6
-; CGP-NEXT:    v_add_i32_e32 v9, vcc, v7, v4
-; CGP-NEXT:    v_mad_u64_u32 v[6:7], s[4:5], s7, v9, 0
-; CGP-NEXT:    v_addc_u32_e32 v10, vcc, v8, v5, vcc
-; CGP-NEXT:    v_ashrrev_i32_e32 v5, 31, v1
-; CGP-NEXT:    v_mov_b32_e32 v4, v7
-; CGP-NEXT:    v_mad_u64_u32 v[7:8], s[4:5], s7, v10, v[4:5]
-; CGP-NEXT:    v_add_i32_e32 v4, vcc, v0, v5
-; CGP-NEXT:    v_addc_u32_e32 v11, vcc, v1, v5, vcc
-; CGP-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], -1, v9, v[7:8]
-; CGP-NEXT:    v_xor_b32_e32 v8, v4, v5
-; CGP-NEXT:    v_mul_lo_u32 v1, v10, v6
-; CGP-NEXT:    v_mul_lo_u32 v4, v9, v0
-; CGP-NEXT:    v_mul_hi_u32 v7, v9, v6
-; CGP-NEXT:    v_mul_hi_u32 v6, v10, v6
-; CGP-NEXT:    v_xor_b32_e32 v11, v11, v5
+; CGP-NEXT:    v_trunc_f32_e32 v7, v5
+; CGP-NEXT:    v_mac_f32_e32 v4, 0xcf800000, v7
+; CGP-NEXT:    v_cvt_u32_f32_e32 v8, v4
+; CGP-NEXT:    v_cvt_u32_f32_e32 v9, v7
+; CGP-NEXT:    v_mad_u64_u32 v[4:5], s[4:5], v6, v8, 0
+; CGP-NEXT:    v_mov_b32_e32 v7, v5
+; CGP-NEXT:    v_mad_u64_u32 v[10:11], s[4:5], v6, v9, v[7:8]
+; CGP-NEXT:    v_mul_hi_u32 v12, v9, v4
+; CGP-NEXT:    v_mad_u64_u32 v[13:14], s[4:5], -1, v8, v[10:11]
+; CGP-NEXT:    v_mul_lo_u32 v10, v9, v4
+; CGP-NEXT:    v_mul_hi_u32 v11, v8, v4
+; CGP-NEXT:    v_mul_lo_u32 v4, v8, v13
+; CGP-NEXT:    v_mul_lo_u32 v7, v9, v13
+; CGP-NEXT:    v_mul_hi_u32 v14, v8, v13
+; CGP-NEXT:    v_mul_hi_u32 v13, v9, v13
+; CGP-NEXT:    v_add_i32_e32 v4, vcc, v10, v4
+; CGP-NEXT:    v_cndmask_b32_e64 v15, 0, 1, vcc
+; CGP-NEXT:    v_add_i32_e32 v4, vcc, v4, v11
+; CGP-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc
+; CGP-NEXT:    v_add_i32_e32 v4, vcc, v15, v4
+; CGP-NEXT:    v_add_i32_e32 v7, vcc, v7, v12
+; CGP-NEXT:    v_cndmask_b32_e64 v15, 0, 1, vcc
+; CGP-NEXT:    v_add_i32_e32 v7, vcc, v7, v14
+; CGP-NEXT:    v_cndmask_b32_e64 v14, 0, 1, vcc
+; CGP-NEXT:    v_add_i32_e32 v14, vcc, v15, v14
+; CGP-NEXT:    v_add_i32_e32 v4, vcc, v7, v4
+; CGP-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
+; CGP-NEXT:    v_add_i32_e32 v7, vcc, v14, v7
+; CGP-NEXT:    v_add_i32_e32 v7, vcc, v13, v7
+; CGP-NEXT:    v_add_i32_e32 v16, vcc, v8, v4
+; CGP-NEXT:    v_mad_u64_u32 v[13:14], s[4:5], v6, v16, 0
+; CGP-NEXT:    v_addc_u32_e32 v17, vcc, v9, v7, vcc
+; CGP-NEXT:    v_mov_b32_e32 v4, v14
+; CGP-NEXT:    v_mad_u64_u32 v[14:15], s[4:5], v6, v17, v[4:5]
+; CGP-NEXT:    v_ashrrev_i32_e32 v7, 31, v1
+; CGP-NEXT:    v_add_i32_e32 v0, vcc, v0, v7
+; CGP-NEXT:    v_mad_u64_u32 v[14:15], s[4:5], -1, v16, v[14:15]
+; CGP-NEXT:    v_addc_u32_e32 v1, vcc, v1, v7, vcc
+; CGP-NEXT:    v_xor_b32_e32 v15, v0, v7
+; CGP-NEXT:    v_mul_lo_u32 v0, v17, v13
+; CGP-NEXT:    v_mul_lo_u32 v4, v16, v14
+; CGP-NEXT:    v_xor_b32_e32 v18, v1, v7
+; CGP-NEXT:    v_mul_hi_u32 v1, v16, v13
+; CGP-NEXT:    v_mul_hi_u32 v13, v17, v13
+; CGP-NEXT:    v_add_i32_e32 v0, vcc, v0, v4
+; CGP-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc
+; CGP-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
+; CGP-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; CGP-NEXT:    v_mul_lo_u32 v1, v17, v14
+; CGP-NEXT:    v_add_i32_e32 v0, vcc, v4, v0
+; CGP-NEXT:    v_mul_hi_u32 v4, v16, v14
+; CGP-NEXT:    v_add_i32_e32 v1, vcc, v1, v13
+; CGP-NEXT:    v_cndmask_b32_e64 v13, 0, 1, vcc
 ; CGP-NEXT:    v_add_i32_e32 v1, vcc, v1, v4
 ; CGP-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v1, vcc, v1, v7
+; CGP-NEXT:    v_add_i32_e32 v4, vcc, v13, v4
+; CGP-NEXT:    v_mul_hi_u32 v13, v17, v14
+; CGP-NEXT:    v_add_i32_e32 v0, vcc, v1, v0
 ; CGP-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
-; CGP-NEXT:    v_mul_lo_u32 v7, v10, v0
-; CGP-NEXT:    v_add_i32_e32 v1, vcc, v4, v1
-; CGP-NEXT:    v_mul_hi_u32 v4, v9, v0
-; CGP-NEXT:    v_add_i32_e32 v6, vcc, v7, v6
-; CGP-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v4, vcc, v6, v4
-; CGP-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v6, vcc, v7, v6
-; CGP-NEXT:    v_mul_hi_u32 v0, v10, v0
 ; CGP-NEXT:    v_add_i32_e32 v1, vcc, v4, v1
-; CGP-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v4, vcc, v6, v4
-; CGP-NEXT:    v_add_i32_e32 v0, vcc, v0, v4
-; CGP-NEXT:    v_add_i32_e32 v1, vcc, v9, v1
-; CGP-NEXT:    v_addc_u32_e32 v0, vcc, v10, v0, vcc
-; CGP-NEXT:    v_mul_lo_u32 v6, v11, v1
-; CGP-NEXT:    v_mul_lo_u32 v7, v8, v0
-; CGP-NEXT:    v_mul_hi_u32 v9, v8, v1
-; CGP-NEXT:    v_mul_hi_u32 v1, v11, v1
-; CGP-NEXT:    v_mul_hi_u32 v10, v11, v0
-; CGP-NEXT:    v_add_i32_e32 v6, vcc, v6, v7
-; CGP-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v6, vcc, v6, v9
-; CGP-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc
-; CGP-NEXT:    v_mul_lo_u32 v9, v11, v0
-; CGP-NEXT:    v_add_i32_e32 v6, vcc, v7, v6
-; CGP-NEXT:    v_mul_hi_u32 v7, v8, v0
-; CGP-NEXT:    v_add_i32_e32 v1, vcc, v9, v1
-; CGP-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v1, vcc, v1, v7
-; CGP-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v7, vcc, v9, v7
-; CGP-NEXT:    v_add_i32_e32 v9, vcc, v1, v6
-; CGP-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], s6, v9, 0
-; CGP-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v6, vcc, v7, v6
-; CGP-NEXT:    v_add_i32_e32 v10, vcc, v10, v6
-; CGP-NEXT:    v_mad_u64_u32 v[6:7], s[4:5], s6, v10, v[1:2]
-; CGP-NEXT:    v_sub_i32_e32 v0, vcc, v8, v0
+; CGP-NEXT:    v_add_i32_e32 v1, vcc, v13, v1
+; CGP-NEXT:    v_add_i32_e32 v0, vcc, v16, v0
+; CGP-NEXT:    v_addc_u32_e32 v1, vcc, v17, v1, vcc
+; CGP-NEXT:    v_mul_lo_u32 v13, v18, v0
+; CGP-NEXT:    v_mul_lo_u32 v14, v15, v1
+; CGP-NEXT:    v_mul_hi_u32 v16, v15, v0
+; CGP-NEXT:    v_mul_hi_u32 v0, v18, v0
 ; CGP-NEXT:    v_mov_b32_e32 v4, 0x1000
-; CGP-NEXT:    v_subb_u32_e64 v1, s[4:5], v11, v6, vcc
-; CGP-NEXT:    v_sub_i32_e64 v6, s[4:5], v11, v6
+; CGP-NEXT:    v_add_i32_e32 v13, vcc, v13, v14
+; CGP-NEXT:    v_cndmask_b32_e64 v14, 0, 1, vcc
+; CGP-NEXT:    v_add_i32_e32 v13, vcc, v13, v16
+; CGP-NEXT:    v_cndmask_b32_e64 v13, 0, 1, vcc
+; CGP-NEXT:    v_mul_lo_u32 v16, v18, v1
+; CGP-NEXT:    v_add_i32_e32 v13, vcc, v14, v13
+; CGP-NEXT:    v_mul_hi_u32 v14, v15, v1
+; CGP-NEXT:    v_add_i32_e32 v0, vcc, v16, v0
+; CGP-NEXT:    v_cndmask_b32_e64 v16, 0, 1, vcc
+; CGP-NEXT:    v_add_i32_e32 v0, vcc, v0, v14
+; CGP-NEXT:    v_cndmask_b32_e64 v14, 0, 1, vcc
+; CGP-NEXT:    v_add_i32_e32 v14, vcc, v16, v14
+; CGP-NEXT:    v_add_i32_e32 v16, vcc, v0, v13
+; CGP-NEXT:    v_mul_hi_u32 v17, v18, v1
+; CGP-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v4, v16, 0
+; CGP-NEXT:    v_cndmask_b32_e64 v13, 0, 1, vcc
+; CGP-NEXT:    v_add_i32_e32 v13, vcc, v14, v13
+; CGP-NEXT:    v_add_i32_e32 v17, vcc, v17, v13
+; CGP-NEXT:    v_mad_u64_u32 v[13:14], s[4:5], v4, v17, v[1:2]
+; CGP-NEXT:    v_sub_i32_e32 v0, vcc, v15, v0
+; CGP-NEXT:    v_subb_u32_e64 v1, s[4:5], v18, v13, vcc
+; CGP-NEXT:    v_sub_i32_e64 v13, s[4:5], v18, v13
 ; CGP-NEXT:    v_cmp_ge_u32_e64 s[4:5], v0, v4
-; CGP-NEXT:    v_cndmask_b32_e64 v7, 0, -1, s[4:5]
+; CGP-NEXT:    v_cndmask_b32_e64 v14, 0, -1, s[4:5]
 ; CGP-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v1
-; CGP-NEXT:    v_subbrev_u32_e32 v1, vcc, 0, v6, vcc
-; CGP-NEXT:    v_cvt_f32_u32_e32 v6, 0x1000
+; CGP-NEXT:    v_subbrev_u32_e32 v1, vcc, 0, v13, vcc
 ; CGP-NEXT:    v_sub_i32_e32 v0, vcc, v0, v4
-; CGP-NEXT:    v_cndmask_b32_e64 v8, -1, v7, s[4:5]
-; CGP-NEXT:    v_subbrev_u32_e32 v7, vcc, 0, v1, vcc
-; CGP-NEXT:    v_cvt_f32_ubyte0_e32 v1, 0
-; CGP-NEXT:    v_mac_f32_e32 v6, 0x4f800000, v1
-; CGP-NEXT:    v_rcp_iflag_f32_e32 v1, v6
-; CGP-NEXT:    v_add_i32_e32 v11, vcc, 1, v9
-; CGP-NEXT:    v_addc_u32_e32 v12, vcc, 0, v10, vcc
-; CGP-NEXT:    v_mul_f32_e32 v1, 0x5f7ffffc, v1
-; CGP-NEXT:    v_mul_f32_e32 v6, 0x2f800000, v1
-; CGP-NEXT:    v_trunc_f32_e32 v6, v6
-; CGP-NEXT:    v_mac_f32_e32 v1, 0xcf800000, v6
-; CGP-NEXT:    v_cvt_u32_f32_e32 v13, v1
+; CGP-NEXT:    v_subbrev_u32_e32 v13, vcc, 0, v1, vcc
+; CGP-NEXT:    v_add_i32_e32 v15, vcc, 1, v16
+; CGP-NEXT:    v_addc_u32_e32 v18, vcc, 0, v17, vcc
 ; CGP-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v4
-; CGP-NEXT:    v_cvt_u32_f32_e32 v15, v6
-; CGP-NEXT:    v_cndmask_b32_e64 v14, 0, -1, vcc
-; CGP-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], s7, v13, 0
-; CGP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v7
-; CGP-NEXT:    v_cndmask_b32_e32 v14, -1, v14, vcc
-; CGP-NEXT:    v_mad_u64_u32 v[6:7], s[4:5], s7, v15, v[1:2]
-; CGP-NEXT:    v_add_i32_e32 v1, vcc, 1, v11
-; CGP-NEXT:    v_mad_u64_u32 v[6:7], s[4:5], -1, v13, v[6:7]
-; CGP-NEXT:    v_addc_u32_e32 v16, vcc, 0, v12, vcc
-; CGP-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v14
-; CGP-NEXT:    v_cndmask_b32_e32 v7, v11, v1, vcc
-; CGP-NEXT:    v_mul_lo_u32 v1, v15, v0
-; CGP-NEXT:    v_mul_lo_u32 v11, v13, v6
-; CGP-NEXT:    v_mul_hi_u32 v14, v13, v0
-; CGP-NEXT:    v_cndmask_b32_e32 v12, v12, v16, vcc
-; CGP-NEXT:    v_mul_hi_u32 v0, v15, v0
+; CGP-NEXT:    v_mov_b32_e32 v0, v5
+; CGP-NEXT:    v_cndmask_b32_e64 v14, -1, v14, s[4:5]
+; CGP-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v6, v9, v[0:1]
+; CGP-NEXT:    v_cndmask_b32_e64 v19, 0, -1, vcc
+; CGP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v13
+; CGP-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], -1, v8, v[0:1]
+; CGP-NEXT:    v_cndmask_b32_e32 v5, -1, v19, vcc
+; CGP-NEXT:    v_add_i32_e32 v1, vcc, 1, v15
+; CGP-NEXT:    v_mul_lo_u32 v19, v8, v0
+; CGP-NEXT:    v_addc_u32_e32 v13, vcc, 0, v18, vcc
+; CGP-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v5
+; CGP-NEXT:    v_cndmask_b32_e32 v5, v15, v1, vcc
+; CGP-NEXT:    v_cndmask_b32_e32 v13, v18, v13, vcc
+; CGP-NEXT:    v_add_i32_e32 v1, vcc, v10, v19
+; CGP-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
 ; CGP-NEXT:    v_add_i32_e32 v1, vcc, v1, v11
-; CGP-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v1, vcc, v1, v14
 ; CGP-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
-; CGP-NEXT:    v_mul_lo_u32 v14, v15, v6
-; CGP-NEXT:    v_add_i32_e32 v1, vcc, v11, v1
-; CGP-NEXT:    v_mul_hi_u32 v11, v13, v6
-; CGP-NEXT:    v_add_i32_e32 v0, vcc, v14, v0
-; CGP-NEXT:    v_cndmask_b32_e64 v14, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v0, vcc, v0, v11
+; CGP-NEXT:    v_mul_lo_u32 v11, v9, v0
+; CGP-NEXT:    v_add_i32_e32 v1, vcc, v10, v1
+; CGP-NEXT:    v_mul_hi_u32 v10, v8, v0
+; CGP-NEXT:    v_add_i32_e32 v11, vcc, v11, v12
+; CGP-NEXT:    v_cndmask_b32_e64 v12, 0, 1, vcc
+; CGP-NEXT:    v_add_i32_e32 v10, vcc, v11, v10
 ; CGP-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v11, vcc, v14, v11
-; CGP-NEXT:    v_mul_hi_u32 v6, v15, v6
-; CGP-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
-; CGP-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v1, vcc, v11, v1
-; CGP-NEXT:    v_add_i32_e32 v1, vcc, v6, v1
-; CGP-NEXT:    v_add_i32_e32 v11, vcc, v13, v0
-; CGP-NEXT:    v_addc_u32_e32 v13, vcc, v15, v1, vcc
-; CGP-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], s7, v11, 0
-; CGP-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v8
-; CGP-NEXT:    v_cndmask_b32_e32 v6, v9, v7, vcc
-; CGP-NEXT:    v_xor_b32_e32 v9, v6, v5
-; CGP-NEXT:    v_mad_u64_u32 v[6:7], s[4:5], s7, v13, v[1:2]
-; CGP-NEXT:    v_cndmask_b32_e32 v8, v10, v12, vcc
-; CGP-NEXT:    v_xor_b32_e32 v1, v8, v5
-; CGP-NEXT:    v_mad_u64_u32 v[6:7], s[4:5], -1, v11, v[6:7]
-; CGP-NEXT:    v_ashrrev_i32_e32 v8, 31, v3
-; CGP-NEXT:    v_add_i32_e32 v2, vcc, v2, v8
-; CGP-NEXT:    v_addc_u32_e32 v3, vcc, v3, v8, vcc
-; CGP-NEXT:    v_xor_b32_e32 v7, v2, v8
-; CGP-NEXT:    v_mul_lo_u32 v2, v13, v0
-; CGP-NEXT:    v_mul_lo_u32 v10, v11, v6
-; CGP-NEXT:    v_xor_b32_e32 v12, v3, v8
-; CGP-NEXT:    v_mul_hi_u32 v3, v11, v0
-; CGP-NEXT:    v_mul_hi_u32 v0, v13, v0
-; CGP-NEXT:    v_add_i32_e32 v2, vcc, v2, v10
+; CGP-NEXT:    v_add_i32_e32 v11, vcc, v12, v11
+; CGP-NEXT:    v_mul_hi_u32 v0, v9, v0
+; CGP-NEXT:    v_add_i32_e32 v1, vcc, v10, v1
 ; CGP-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
+; CGP-NEXT:    v_add_i32_e32 v10, vcc, v11, v10
+; CGP-NEXT:    v_add_i32_e32 v0, vcc, v0, v10
+; CGP-NEXT:    v_add_i32_e32 v8, vcc, v8, v1
+; CGP-NEXT:    v_addc_u32_e32 v9, vcc, v9, v0, vcc
+; CGP-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v6, v8, 0
+; CGP-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v14
+; CGP-NEXT:    v_cndmask_b32_e32 v5, v16, v5, vcc
+; CGP-NEXT:    v_xor_b32_e32 v11, v5, v7
+; CGP-NEXT:    v_mad_u64_u32 v[5:6], s[4:5], v6, v9, v[1:2]
+; CGP-NEXT:    v_cndmask_b32_e32 v10, v17, v13, vcc
+; CGP-NEXT:    v_xor_b32_e32 v1, v10, v7
+; CGP-NEXT:    v_mad_u64_u32 v[5:6], s[4:5], -1, v8, v[5:6]
+; CGP-NEXT:    v_ashrrev_i32_e32 v10, 31, v3
+; CGP-NEXT:    v_add_i32_e32 v2, vcc, v2, v10
+; CGP-NEXT:    v_addc_u32_e32 v3, vcc, v3, v10, vcc
+; CGP-NEXT:    v_xor_b32_e32 v12, v2, v10
+; CGP-NEXT:    v_mul_lo_u32 v2, v9, v0
+; CGP-NEXT:    v_mul_lo_u32 v6, v8, v5
+; CGP-NEXT:    v_xor_b32_e32 v13, v3, v10
+; CGP-NEXT:    v_mul_hi_u32 v3, v8, v0
+; CGP-NEXT:    v_mul_hi_u32 v0, v9, v0
+; CGP-NEXT:    v_add_i32_e32 v2, vcc, v2, v6
+; CGP-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc
 ; CGP-NEXT:    v_add_i32_e32 v2, vcc, v2, v3
 ; CGP-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
-; CGP-NEXT:    v_mul_lo_u32 v3, v13, v6
-; CGP-NEXT:    v_add_i32_e32 v2, vcc, v10, v2
-; CGP-NEXT:    v_mul_hi_u32 v10, v11, v6
+; CGP-NEXT:    v_mul_lo_u32 v3, v9, v5
+; CGP-NEXT:    v_add_i32_e32 v2, vcc, v6, v2
+; CGP-NEXT:    v_mul_hi_u32 v6, v8, v5
 ; CGP-NEXT:    v_add_i32_e32 v0, vcc, v3, v0
 ; CGP-NEXT:    v_cndmask_b32_e64 v3, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v0, vcc, v0, v10
-; CGP-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v3, vcc, v3, v10
-; CGP-NEXT:    v_mul_hi_u32 v6, v13, v6
+; CGP-NEXT:    v_add_i32_e32 v0, vcc, v0, v6
+; CGP-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc
+; CGP-NEXT:    v_add_i32_e32 v3, vcc, v3, v6
+; CGP-NEXT:    v_mul_hi_u32 v5, v9, v5
 ; CGP-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
 ; CGP-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
 ; CGP-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
-; CGP-NEXT:    v_add_i32_e32 v2, vcc, v6, v2
-; CGP-NEXT:    v_add_i32_e32 v3, vcc, v11, v0
-; CGP-NEXT:    v_addc_u32_e32 v2, vcc, v13, v2, vcc
-; CGP-NEXT:    v_mul_lo_u32 v6, v12, v3
-; CGP-NEXT:    v_mul_lo_u32 v10, v7, v2
-; CGP-NEXT:    v_sub_i32_e32 v0, vcc, v9, v5
-; CGP-NEXT:    v_subb_u32_e32 v1, vcc, v1, v5, vcc
-; CGP-NEXT:    v_mul_hi_u32 v5, v7, v3
-; CGP-NEXT:    v_add_i32_e32 v6, vcc, v6, v10
-; CGP-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v5, vcc, v6, v5
-; CGP-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
+; CGP-NEXT:    v_add_i32_e32 v2, vcc, v5, v2
+; CGP-NEXT:    v_add_i32_e32 v3, vcc, v8, v0
+; CGP-NEXT:    v_addc_u32_e32 v2, vcc, v9, v2, vcc
+; CGP-NEXT:    v_mul_lo_u32 v5, v13, v3
 ; CGP-NEXT:    v_mul_lo_u32 v6, v12, v2
-; CGP-NEXT:    v_mul_hi_u32 v3, v12, v3
-; CGP-NEXT:    v_add_i32_e32 v5, vcc, v9, v5
-; CGP-NEXT:    v_mul_hi_u32 v9, v7, v2
-; CGP-NEXT:    v_add_i32_e32 v3, vcc, v6, v3
+; CGP-NEXT:    v_sub_i32_e32 v0, vcc, v11, v7
+; CGP-NEXT:    v_subb_u32_e32 v1, vcc, v1, v7, vcc
+; CGP-NEXT:    v_mul_hi_u32 v7, v12, v3
+; CGP-NEXT:    v_add_i32_e32 v5, vcc, v5, v6
 ; CGP-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v3, vcc, v3, v9
-; CGP-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v6, vcc, v6, v9
-; CGP-NEXT:    v_add_i32_e32 v9, vcc, v3, v5
-; CGP-NEXT:    v_mul_hi_u32 v10, v12, v2
-; CGP-NEXT:    v_mad_u64_u32 v[2:3], s[4:5], s6, v9, 0
+; CGP-NEXT:    v_add_i32_e32 v5, vcc, v5, v7
 ; CGP-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
+; CGP-NEXT:    v_mul_lo_u32 v7, v13, v2
+; CGP-NEXT:    v_mul_hi_u32 v3, v13, v3
 ; CGP-NEXT:    v_add_i32_e32 v5, vcc, v6, v5
-; CGP-NEXT:    v_add_i32_e32 v10, vcc, v10, v5
-; CGP-NEXT:    v_mad_u64_u32 v[5:6], s[4:5], s6, v10, v[3:4]
-; CGP-NEXT:    v_sub_i32_e32 v2, vcc, v7, v2
-; CGP-NEXT:    v_subb_u32_e64 v3, s[4:5], v12, v5, vcc
-; CGP-NEXT:    v_sub_i32_e64 v5, s[4:5], v12, v5
+; CGP-NEXT:    v_mul_hi_u32 v6, v12, v2
+; CGP-NEXT:    v_add_i32_e32 v3, vcc, v7, v3
+; CGP-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
+; CGP-NEXT:    v_add_i32_e32 v3, vcc, v3, v6
+; CGP-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc
+; CGP-NEXT:    v_add_i32_e32 v6, vcc, v7, v6
+; CGP-NEXT:    v_add_i32_e32 v7, vcc, v3, v5
+; CGP-NEXT:    v_mul_hi_u32 v8, v13, v2
+; CGP-NEXT:    v_mad_u64_u32 v[2:3], s[4:5], v4, v7, 0
+; CGP-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
+; CGP-NEXT:    v_add_i32_e32 v5, vcc, v6, v5
+; CGP-NEXT:    v_add_i32_e32 v8, vcc, v8, v5
+; CGP-NEXT:    v_mad_u64_u32 v[5:6], s[4:5], v4, v8, v[3:4]
+; CGP-NEXT:    v_sub_i32_e32 v2, vcc, v12, v2
+; CGP-NEXT:    v_subb_u32_e64 v3, s[4:5], v13, v5, vcc
+; CGP-NEXT:    v_sub_i32_e64 v5, s[4:5], v13, v5
 ; CGP-NEXT:    v_subbrev_u32_e32 v5, vcc, 0, v5, vcc
 ; CGP-NEXT:    v_cmp_ge_u32_e64 s[4:5], v2, v4
 ; CGP-NEXT:    v_sub_i32_e32 v2, vcc, v2, v4
@@ -1602,24 +1577,24 @@ define <2 x i64> @v_sdiv_v2i64_pow2k_denom(<2 x i64> %num) {
 ; CGP-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v3
 ; CGP-NEXT:    v_subbrev_u32_e32 v5, vcc, 0, v5, vcc
 ; CGP-NEXT:    v_cndmask_b32_e64 v3, -1, v6, s[4:5]
-; CGP-NEXT:    v_add_i32_e32 v6, vcc, 1, v9
-; CGP-NEXT:    v_addc_u32_e32 v7, vcc, 0, v10, vcc
+; CGP-NEXT:    v_add_i32_e32 v6, vcc, 1, v7
+; CGP-NEXT:    v_addc_u32_e32 v9, vcc, 0, v8, vcc
 ; CGP-NEXT:    v_cmp_ge_u32_e32 vcc, v2, v4
 ; CGP-NEXT:    v_cndmask_b32_e64 v2, 0, -1, vcc
 ; CGP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v5
 ; CGP-NEXT:    v_cndmask_b32_e32 v2, -1, v2, vcc
 ; CGP-NEXT:    v_add_i32_e32 v4, vcc, 1, v6
-; CGP-NEXT:    v_addc_u32_e32 v5, vcc, 0, v7, vcc
+; CGP-NEXT:    v_addc_u32_e32 v5, vcc, 0, v9, vcc
 ; CGP-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v2
 ; CGP-NEXT:    v_cndmask_b32_e32 v2, v6, v4, vcc
-; CGP-NEXT:    v_cndmask_b32_e32 v4, v7, v5, vcc
+; CGP-NEXT:    v_cndmask_b32_e32 v4, v9, v5, vcc
 ; CGP-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v3
-; CGP-NEXT:    v_cndmask_b32_e32 v2, v9, v2, vcc
-; CGP-NEXT:    v_cndmask_b32_e32 v3, v10, v4, vcc
-; CGP-NEXT:    v_xor_b32_e32 v2, v2, v8
-; CGP-NEXT:    v_xor_b32_e32 v3, v3, v8
-; CGP-NEXT:    v_sub_i32_e32 v2, vcc, v2, v8
-; CGP-NEXT:    v_subb_u32_e32 v3, vcc, v3, v8, vcc
+; CGP-NEXT:    v_cndmask_b32_e32 v2, v7, v2, vcc
+; CGP-NEXT:    v_cndmask_b32_e32 v3, v8, v4, vcc
+; CGP-NEXT:    v_xor_b32_e32 v2, v2, v10
+; CGP-NEXT:    v_xor_b32_e32 v3, v3, v10
+; CGP-NEXT:    v_sub_i32_e32 v2, vcc, v2, v10
+; CGP-NEXT:    v_subb_u32_e32 v3, vcc, v3, v10, vcc
 ; CGP-NEXT:    s_setpc_b64 s[30:31]
   %result = sdiv <2 x i64> %num, <i64 4096, i64 4096>
   ret <2 x i64> %result
@@ -1631,7 +1606,7 @@ define i64 @v_sdiv_i64_oddk_denom(i64 %num) {
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    v_cvt_f32_u32_e32 v2, 0x12d8fb
 ; CHECK-NEXT:    v_cvt_f32_ubyte0_e32 v3, 0
-; CHECK-NEXT:    s_mov_b32 s6, 0xffed2705
+; CHECK-NEXT:    v_mov_b32_e32 v6, 0xffed2705
 ; CHECK-NEXT:    v_mac_f32_e32 v2, 0x4f800000, v3
 ; CHECK-NEXT:    v_rcp_iflag_f32_e32 v2, v2
 ; CHECK-NEXT:    v_mul_f32_e32 v2, 0x5f7ffffc, v2
@@ -1639,118 +1614,116 @@ define i64 @v_sdiv_i64_oddk_denom(i64 %num) {
 ; CHECK-NEXT:    v_trunc_f32_e32 v4, v3
 ; CHECK-NEXT:    v_mac_f32_e32 v2, 0xcf800000, v4
 ; CHECK-NEXT:    v_cvt_u32_f32_e32 v5, v2
-; CHECK-NEXT:    v_mov_b32_e32 v2, 0xffed2705
-; CHECK-NEXT:    v_cvt_u32_f32_e32 v6, v4
-; CHECK-NEXT:    v_mad_u64_u32 v[2:3], s[4:5], v2, v5, 0
-; CHECK-NEXT:    v_mad_u64_u32 v[3:4], s[4:5], s6, v6, v[3:4]
-; CHECK-NEXT:    v_mul_hi_u32 v7, v5, v2
+; CHECK-NEXT:    v_cvt_u32_f32_e32 v7, v4
+; CHECK-NEXT:    v_mad_u64_u32 v[2:3], s[4:5], v6, v5, 0
+; CHECK-NEXT:    v_mad_u64_u32 v[3:4], s[4:5], v6, v7, v[3:4]
+; CHECK-NEXT:    v_mul_hi_u32 v8, v5, v2
 ; CHECK-NEXT:    v_mad_u64_u32 v[3:4], s[4:5], -1, v5, v[3:4]
-; CHECK-NEXT:    v_mul_lo_u32 v4, v6, v2
-; CHECK-NEXT:    v_mul_hi_u32 v2, v6, v2
-; CHECK-NEXT:    v_mul_lo_u32 v8, v5, v3
-; CHECK-NEXT:    v_mul_lo_u32 v9, v6, v3
-; CHECK-NEXT:    v_mul_hi_u32 v10, v5, v3
-; CHECK-NEXT:    v_mul_hi_u32 v3, v6, v3
+; CHECK-NEXT:    v_mul_lo_u32 v4, v7, v2
+; CHECK-NEXT:    v_mul_hi_u32 v2, v7, v2
+; CHECK-NEXT:    v_mul_lo_u32 v9, v5, v3
+; CHECK-NEXT:    v_mul_lo_u32 v10, v7, v3
+; CHECK-NEXT:    v_mul_hi_u32 v11, v5, v3
+; CHECK-NEXT:    v_mul_hi_u32 v3, v7, v3
+; CHECK-NEXT:    v_add_i32_e32 v4, vcc, v4, v9
+; CHECK-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
+; CHECK-NEXT:    v_add_i32_e32 v2, vcc, v10, v2
+; CHECK-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
 ; CHECK-NEXT:    v_add_i32_e32 v4, vcc, v4, v8
-; CHECK-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
-; CHECK-NEXT:    v_add_i32_e32 v4, vcc, v4, v7
 ; CHECK-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc
-; CHECK-NEXT:    v_add_i32_e32 v4, vcc, v8, v4
-; CHECK-NEXT:    v_add_i32_e32 v2, vcc, v9, v2
-; CHECK-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
-; CHECK-NEXT:    v_add_i32_e32 v2, vcc, v2, v10
+; CHECK-NEXT:    v_add_i32_e32 v4, vcc, v9, v4
+; CHECK-NEXT:    v_add_i32_e32 v2, vcc, v2, v11
 ; CHECK-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
-; CHECK-NEXT:    v_add_i32_e32 v7, vcc, v7, v8
+; CHECK-NEXT:    v_add_i32_e32 v8, vcc, v10, v8
 ; CHECK-NEXT:    v_add_i32_e32 v2, vcc, v2, v4
 ; CHECK-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc
-; CHECK-NEXT:    v_add_i32_e32 v4, vcc, v7, v4
+; CHECK-NEXT:    v_add_i32_e32 v4, vcc, v8, v4
 ; CHECK-NEXT:    v_add_i32_e32 v3, vcc, v3, v4
 ; CHECK-NEXT:    v_add_i32_e32 v5, vcc, v5, v2
-; CHECK-NEXT:    v_addc_u32_e32 v6, vcc, v6, v3, vcc
-; CHECK-NEXT:    v_mad_u64_u32 v[2:3], s[4:5], s6, v5, 0
-; CHECK-NEXT:    v_ashrrev_i32_e32 v7, 31, v1
-; CHECK-NEXT:    v_add_i32_e32 v8, vcc, v0, v7
-; CHECK-NEXT:    v_mad_u64_u32 v[3:4], s[4:5], s6, v6, v[3:4]
-; CHECK-NEXT:    v_addc_u32_e32 v9, vcc, v1, v7, vcc
-; CHECK-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], -1, v5, v[3:4]
-; CHECK-NEXT:    v_xor_b32_e32 v3, v8, v7
-; CHECK-NEXT:    v_mul_lo_u32 v1, v6, v2
-; CHECK-NEXT:    v_mul_lo_u32 v8, v5, v0
-; CHECK-NEXT:    v_xor_b32_e32 v4, v9, v7
-; CHECK-NEXT:    v_mul_hi_u32 v9, v5, v2
-; CHECK-NEXT:    v_mul_hi_u32 v2, v6, v2
+; CHECK-NEXT:    v_addc_u32_e32 v7, vcc, v7, v3, vcc
+; CHECK-NEXT:    v_mad_u64_u32 v[2:3], s[4:5], v6, v5, 0
+; CHECK-NEXT:    v_mad_u64_u32 v[3:4], s[4:5], v6, v7, v[3:4]
+; CHECK-NEXT:    v_ashrrev_i32_e32 v6, 31, v1
+; CHECK-NEXT:    v_add_i32_e32 v0, vcc, v0, v6
+; CHECK-NEXT:    v_mad_u64_u32 v[3:4], s[4:5], -1, v5, v[3:4]
+; CHECK-NEXT:    v_addc_u32_e32 v1, vcc, v1, v6, vcc
+; CHECK-NEXT:    v_xor_b32_e32 v4, v0, v6
+; CHECK-NEXT:    v_mul_lo_u32 v0, v7, v2
+; CHECK-NEXT:    v_mul_lo_u32 v8, v5, v3
+; CHECK-NEXT:    v_xor_b32_e32 v9, v1, v6
+; CHECK-NEXT:    v_mul_hi_u32 v1, v5, v2
+; CHECK-NEXT:    v_mul_hi_u32 v2, v7, v2
+; CHECK-NEXT:    v_add_i32_e32 v0, vcc, v0, v8
+; CHECK-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
+; CHECK-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
+; CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; CHECK-NEXT:    v_mul_lo_u32 v1, v7, v3
+; CHECK-NEXT:    v_add_i32_e32 v0, vcc, v8, v0
+; CHECK-NEXT:    v_mul_hi_u32 v8, v5, v3
+; CHECK-NEXT:    v_add_i32_e32 v1, vcc, v1, v2
+; CHECK-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
 ; CHECK-NEXT:    v_add_i32_e32 v1, vcc, v1, v8
 ; CHECK-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
-; CHECK-NEXT:    v_add_i32_e32 v1, vcc, v1, v9
-; CHECK-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
-; CHECK-NEXT:    v_mul_lo_u32 v9, v6, v0
-; CHECK-NEXT:    v_add_i32_e32 v1, vcc, v8, v1
-; CHECK-NEXT:    v_mul_hi_u32 v8, v5, v0
-; CHECK-NEXT:    v_add_i32_e32 v2, vcc, v9, v2
-; CHECK-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
 ; CHECK-NEXT:    v_add_i32_e32 v2, vcc, v2, v8
-; CHECK-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
-; CHECK-NEXT:    v_add_i32_e32 v8, vcc, v9, v8
-; CHECK-NEXT:    v_mul_hi_u32 v0, v6, v0
+; CHECK-NEXT:    v_mul_hi_u32 v3, v7, v3
+; CHECK-NEXT:    v_add_i32_e32 v0, vcc, v1, v0
+; CHECK-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
 ; CHECK-NEXT:    v_add_i32_e32 v1, vcc, v2, v1
+; CHECK-NEXT:    v_add_i32_e32 v1, vcc, v3, v1
+; CHECK-NEXT:    v_add_i32_e32 v0, vcc, v5, v0
+; CHECK-NEXT:    v_addc_u32_e32 v1, vcc, v7, v1, vcc
+; CHECK-NEXT:    v_mul_lo_u32 v2, v9, v0
+; CHECK-NEXT:    v_mul_lo_u32 v3, v4, v1
+; CHECK-NEXT:    v_mul_hi_u32 v7, v4, v0
+; CHECK-NEXT:    v_mul_hi_u32 v0, v9, v0
+; CHECK-NEXT:    v_mov_b32_e32 v5, 0x12d8fb
+; CHECK-NEXT:    v_add_i32_e32 v2, vcc, v2, v3
+; CHECK-NEXT:    v_cndmask_b32_e64 v3, 0, 1, vcc
+; CHECK-NEXT:    v_add_i32_e32 v2, vcc, v2, v7
 ; CHECK-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
-; CHECK-NEXT:    v_add_i32_e32 v2, vcc, v8, v2
-; CHECK-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
-; CHECK-NEXT:    v_add_i32_e32 v1, vcc, v5, v1
-; CHECK-NEXT:    v_addc_u32_e32 v0, vcc, v6, v0, vcc
-; CHECK-NEXT:    v_mul_lo_u32 v2, v4, v1
-; CHECK-NEXT:    v_mul_lo_u32 v5, v3, v0
-; CHECK-NEXT:    v_mul_hi_u32 v8, v3, v1
-; CHECK-NEXT:    v_mul_hi_u32 v1, v4, v1
-; CHECK-NEXT:    s_mov_b32 s6, 0x12d8fb
-; CHECK-NEXT:    v_add_i32_e32 v2, vcc, v2, v5
-; CHECK-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
-; CHECK-NEXT:    v_add_i32_e32 v2, vcc, v2, v8
-; CHECK-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
-; CHECK-NEXT:    v_mul_lo_u32 v8, v4, v0
-; CHECK-NEXT:    v_add_i32_e32 v2, vcc, v5, v2
-; CHECK-NEXT:    v_mul_hi_u32 v5, v3, v0
-; CHECK-NEXT:    v_add_i32_e32 v1, vcc, v8, v1
-; CHECK-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
-; CHECK-NEXT:    v_add_i32_e32 v1, vcc, v1, v5
-; CHECK-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
-; CHECK-NEXT:    v_add_i32_e32 v5, vcc, v8, v5
-; CHECK-NEXT:    v_add_i32_e32 v8, vcc, v1, v2
-; CHECK-NEXT:    v_mul_hi_u32 v9, v4, v0
-; CHECK-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], s6, v8, 0
+; CHECK-NEXT:    v_mul_lo_u32 v7, v9, v1
+; CHECK-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
+; CHECK-NEXT:    v_mul_hi_u32 v3, v4, v1
+; CHECK-NEXT:    v_add_i32_e32 v0, vcc, v7, v0
+; CHECK-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
+; CHECK-NEXT:    v_add_i32_e32 v0, vcc, v0, v3
+; CHECK-NEXT:    v_cndmask_b32_e64 v3, 0, 1, vcc
+; CHECK-NEXT:    v_add_i32_e32 v3, vcc, v7, v3
+; CHECK-NEXT:    v_add_i32_e32 v7, vcc, v0, v2
+; CHECK-NEXT:    v_mul_hi_u32 v8, v9, v1
+; CHECK-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v5, v7, 0
 ; CHECK-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
-; CHECK-NEXT:    v_add_i32_e32 v2, vcc, v5, v2
-; CHECK-NEXT:    v_add_i32_e32 v5, vcc, v9, v2
-; CHECK-NEXT:    v_mad_u64_u32 v[1:2], s[4:5], s6, v5, v[1:2]
-; CHECK-NEXT:    v_sub_i32_e32 v0, vcc, v3, v0
-; CHECK-NEXT:    v_subb_u32_e64 v2, s[4:5], v4, v1, vcc
-; CHECK-NEXT:    v_sub_i32_e64 v1, s[4:5], v4, v1
-; CHECK-NEXT:    v_mov_b32_e32 v6, 0x12d8fb
+; CHECK-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
+; CHECK-NEXT:    v_add_i32_e32 v3, vcc, v8, v2
+; CHECK-NEXT:    v_mad_u64_u32 v[1:2], s[4:5], v5, v3, v[1:2]
+; CHECK-NEXT:    v_sub_i32_e32 v0, vcc, v4, v0
+; CHECK-NEXT:    v_subb_u32_e64 v2, s[4:5], v9, v1, vcc
+; CHECK-NEXT:    v_sub_i32_e64 v1, s[4:5], v9, v1
 ; CHECK-NEXT:    v_subbrev_u32_e32 v1, vcc, 0, v1, vcc
-; CHECK-NEXT:    v_cmp_ge_u32_e64 s[4:5], v0, v6
-; CHECK-NEXT:    v_sub_i32_e32 v0, vcc, v0, v6
-; CHECK-NEXT:    v_cndmask_b32_e64 v3, 0, -1, s[4:5]
+; CHECK-NEXT:    v_cmp_ge_u32_e64 s[4:5], v0, v5
+; CHECK-NEXT:    v_sub_i32_e32 v0, vcc, v0, v5
+; CHECK-NEXT:    v_cndmask_b32_e64 v4, 0, -1, s[4:5]
 ; CHECK-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v2
 ; CHECK-NEXT:    v_subbrev_u32_e32 v1, vcc, 0, v1, vcc
-; CHECK-NEXT:    v_cndmask_b32_e64 v2, -1, v3, s[4:5]
-; CHECK-NEXT:    v_add_i32_e32 v3, vcc, 1, v8
-; CHECK-NEXT:    v_addc_u32_e32 v4, vcc, 0, v5, vcc
-; CHECK-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v6
+; CHECK-NEXT:    v_cndmask_b32_e64 v2, -1, v4, s[4:5]
+; CHECK-NEXT:    v_add_i32_e32 v4, vcc, 1, v7
+; CHECK-NEXT:    v_addc_u32_e32 v8, vcc, 0, v3, vcc
+; CHECK-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v5
 ; CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc
 ; CHECK-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
 ; CHECK-NEXT:    v_cndmask_b32_e32 v0, -1, v0, vcc
-; CHECK-NEXT:    v_add_i32_e32 v1, vcc, 1, v3
-; CHECK-NEXT:    v_addc_u32_e32 v6, vcc, 0, v4, vcc
+; CHECK-NEXT:    v_add_i32_e32 v1, vcc, 1, v4
+; CHECK-NEXT:    v_addc_u32_e32 v5, vcc, 0, v8, vcc
 ; CHECK-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
-; CHECK-NEXT:    v_cndmask_b32_e32 v0, v3, v1, vcc
-; CHECK-NEXT:    v_cndmask_b32_e32 v1, v4, v6, vcc
+; CHECK-NEXT:    v_cndmask_b32_e32 v0, v4, v1, vcc
+; CHECK-NEXT:    v_cndmask_b32_e32 v1, v8, v5, vcc
 ; CHECK-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v2
-; CHECK-NEXT:    v_cndmask_b32_e32 v0, v8, v0, vcc
-; CHECK-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
-; CHECK-NEXT:    v_xor_b32_e32 v0, v0, v7
-; CHECK-NEXT:    v_xor_b32_e32 v1, v1, v7
-; CHECK-NEXT:    v_sub_i32_e32 v0, vcc, v0, v7
-; CHECK-NEXT:    v_subb_u32_e32 v1, vcc, v1, v7, vcc
+; CHECK-NEXT:    v_cndmask_b32_e32 v0, v7, v0, vcc
+; CHECK-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; CHECK-NEXT:    v_xor_b32_e32 v0, v0, v6
+; CHECK-NEXT:    v_xor_b32_e32 v1, v1, v6
+; CHECK-NEXT:    v_sub_i32_e32 v0, vcc, v0, v6
+; CHECK-NEXT:    v_subb_u32_e32 v1, vcc, v1, v6, vcc
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
   %result = sdiv i64 %num, 1235195
   ret i64 %result
@@ -1761,173 +1734,164 @@ define <2 x i64> @v_sdiv_v2i64_oddk_denom(<2 x i64> %num) {
 ; GISEL:       ; %bb.0:
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    v_cvt_f32_u32_e32 v4, 0x12d8fb
-; GISEL-NEXT:    v_cvt_f32_ubyte0_e32 v6, 0
+; GISEL-NEXT:    v_cvt_f32_ubyte0_e32 v5, 0
 ; GISEL-NEXT:    s_sub_u32 s6, 0, 0x12d8fb
 ; GISEL-NEXT:    s_subb_u32 s7, 0, 0
-; GISEL-NEXT:    v_mac_f32_e32 v4, 0x4f800000, v6
+; GISEL-NEXT:    v_mac_f32_e32 v4, 0x4f800000, v5
 ; GISEL-NEXT:    v_rcp_iflag_f32_e32 v4, v4
 ; GISEL-NEXT:    v_mul_f32_e32 v4, 0x5f7ffffc, v4
 ; GISEL-NEXT:    v_mul_f32_e32 v5, 0x2f800000, v4
 ; GISEL-NEXT:    v_trunc_f32_e32 v7, v5
 ; GISEL-NEXT:    v_mac_f32_e32 v4, 0xcf800000, v7
-; GISEL-NEXT:    v_cvt_u32_f32_e32 v9, v4
-; GISEL-NEXT:    v_cvt_u32_f32_e32 v10, v7
-; GISEL-NEXT:    v_mad_u64_u32 v[4:5], s[4:5], s6, v9, 0
-; GISEL-NEXT:    v_mad_u64_u32 v[7:8], s[4:5], s6, v10, v[5:6]
-; GISEL-NEXT:    v_mul_lo_u32 v5, v10, v4
-; GISEL-NEXT:    v_mad_u64_u32 v[7:8], s[4:5], s7, v9, v[7:8]
-; GISEL-NEXT:    v_mul_hi_u32 v8, v9, v4
-; GISEL-NEXT:    v_mul_hi_u32 v4, v10, v4
-; GISEL-NEXT:    v_mul_lo_u32 v11, v9, v7
-; GISEL-NEXT:    v_mul_lo_u32 v12, v10, v7
-; GISEL-NEXT:    v_mul_hi_u32 v13, v9, v7
-; GISEL-NEXT:    v_mul_hi_u32 v7, v10, v7
-; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v5, v11
-; GISEL-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v5, v8
+; GISEL-NEXT:    v_cvt_u32_f32_e32 v6, v4
+; GISEL-NEXT:    v_cvt_u32_f32_e32 v7, v7
+; GISEL-NEXT:    v_mad_u64_u32 v[4:5], s[4:5], s6, v6, 0
+; GISEL-NEXT:    v_mad_u64_u32 v[8:9], s[4:5], s6, v7, v[5:6]
+; GISEL-NEXT:    v_mul_lo_u32 v5, v7, v4
+; GISEL-NEXT:    v_mad_u64_u32 v[8:9], s[4:5], s7, v6, v[8:9]
+; GISEL-NEXT:    v_mul_hi_u32 v9, v6, v4
+; GISEL-NEXT:    v_mul_hi_u32 v4, v7, v4
+; GISEL-NEXT:    v_mul_lo_u32 v10, v6, v8
+; GISEL-NEXT:    v_mul_lo_u32 v11, v7, v8
+; GISEL-NEXT:    v_mul_hi_u32 v12, v6, v8
+; GISEL-NEXT:    v_mul_hi_u32 v8, v7, v8
+; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v5, v10
+; GISEL-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v5, v9
 ; GISEL-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v11, v5
-; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v12, v4
-; GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v4, v13
-; GISEL-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v8, v11
+; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v10, v5
+; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v11, v4
+; GISEL-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v4, v12
+; GISEL-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v9, v10
 ; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v4, v5
 ; GISEL-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v9, v5
 ; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v8, v5
-; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v7, v5
-; GISEL-NEXT:    v_add_i32_e32 v11, vcc, v9, v4
-; GISEL-NEXT:    v_mad_u64_u32 v[7:8], s[4:5], s6, v11, 0
-; GISEL-NEXT:    v_addc_u32_e32 v5, vcc, v10, v5, vcc
-; GISEL-NEXT:    v_mov_b32_e32 v4, v8
-; GISEL-NEXT:    v_mad_u64_u32 v[8:9], s[4:5], s6, v5, v[4:5]
+; GISEL-NEXT:    v_add_i32_e32 v11, vcc, v6, v4
+; GISEL-NEXT:    v_mad_u64_u32 v[8:9], s[4:5], s6, v11, 0
+; GISEL-NEXT:    v_addc_u32_e32 v5, vcc, v7, v5, vcc
+; GISEL-NEXT:    v_mov_b32_e32 v4, v9
+; GISEL-NEXT:    v_mad_u64_u32 v[9:10], s[4:5], s6, v5, v[4:5]
 ; GISEL-NEXT:    v_ashrrev_i32_e32 v4, 31, v1
 ; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v0, v4
-; GISEL-NEXT:    v_mad_u64_u32 v[8:9], s[4:5], s7, v11, v[8:9]
+; GISEL-NEXT:    v_mad_u64_u32 v[9:10], s[4:5], s7, v11, v[9:10]
 ; GISEL-NEXT:    v_addc_u32_e32 v1, vcc, v1, v4, vcc
-; GISEL-NEXT:    v_xor_b32_e32 v9, v0, v4
-; GISEL-NEXT:    v_mul_lo_u32 v0, v5, v7
-; GISEL-NEXT:    v_mul_lo_u32 v10, v11, v8
-; GISEL-NEXT:    v_xor_b32_e32 v12, v1, v4
-; GISEL-NEXT:    v_mul_hi_u32 v1, v11, v7
-; GISEL-NEXT:    v_mul_hi_u32 v7, v5, v7
-; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v0, v10
-; GISEL-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
+; GISEL-NEXT:    v_xor_b32_e32 v10, v0, v4
+; GISEL-NEXT:    v_mul_lo_u32 v0, v5, v8
+; GISEL-NEXT:    v_mul_lo_u32 v12, v11, v9
+; GISEL-NEXT:    v_xor_b32_e32 v13, v1, v4
+; GISEL-NEXT:    v_mul_hi_u32 v1, v11, v8
+; GISEL-NEXT:    v_mul_hi_u32 v8, v5, v8
+; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v0, v12
+; GISEL-NEXT:    v_cndmask_b32_e64 v12, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
 ; GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
-; GISEL-NEXT:    v_mul_lo_u32 v1, v5, v8
-; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v10, v0
-; GISEL-NEXT:    v_mul_hi_u32 v10, v11, v8
-; GISEL-NEXT:    v_add_i32_e32 v1, vcc, v1, v7
-; GISEL-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v1, vcc, v1, v10
-; GISEL-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v7, v10
-; GISEL-NEXT:    v_mul_hi_u32 v8, v5, v8
+; GISEL-NEXT:    v_mul_lo_u32 v1, v5, v9
+; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v12, v0
+; GISEL-NEXT:    v_mul_hi_u32 v12, v11, v9
+; GISEL-NEXT:    v_add_i32_e32 v1, vcc, v1, v8
+; GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v1, vcc, v1, v12
+; GISEL-NEXT:    v_cndmask_b32_e64 v12, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v8, v12
+; GISEL-NEXT:    v_mul_hi_u32 v9, v5, v9
 ; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v1, v0
 ; GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v1, vcc, v7, v1
 ; GISEL-NEXT:    v_add_i32_e32 v1, vcc, v8, v1
+; GISEL-NEXT:    v_add_i32_e32 v1, vcc, v9, v1
 ; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v11, v0
 ; GISEL-NEXT:    v_addc_u32_e32 v1, vcc, v5, v1, vcc
-; GISEL-NEXT:    v_mul_lo_u32 v7, v12, v0
-; GISEL-NEXT:    v_mul_lo_u32 v8, v9, v1
-; GISEL-NEXT:    v_mul_hi_u32 v10, v9, v0
-; GISEL-NEXT:    v_mul_hi_u32 v0, v12, v0
+; GISEL-NEXT:    v_mul_lo_u32 v8, v13, v0
+; GISEL-NEXT:    v_mul_lo_u32 v9, v10, v1
+; GISEL-NEXT:    v_mul_hi_u32 v11, v10, v0
+; GISEL-NEXT:    v_mul_hi_u32 v0, v13, v0
 ; GISEL-NEXT:    v_mov_b32_e32 v5, 0x12d8fb
-; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v7, v8
+; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v8, v9
+; GISEL-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v8, v11
 ; GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v7, v10
-; GISEL-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
-; GISEL-NEXT:    v_mul_lo_u32 v10, v12, v1
-; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v8, v7
-; GISEL-NEXT:    v_mul_hi_u32 v8, v9, v1
-; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v10, v0
-; GISEL-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v0, v8
+; GISEL-NEXT:    v_mul_lo_u32 v11, v13, v1
+; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v9, v8
+; GISEL-NEXT:    v_mul_hi_u32 v9, v10, v1
+; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v11, v0
+; GISEL-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v0, v9
+; GISEL-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v11, v9
+; GISEL-NEXT:    v_add_i32_e32 v11, vcc, v0, v8
+; GISEL-NEXT:    v_mul_hi_u32 v12, v13, v1
+; GISEL-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v5, v11, 0
 ; GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v10, v8
-; GISEL-NEXT:    v_add_i32_e32 v10, vcc, v0, v7
-; GISEL-NEXT:    v_mul_hi_u32 v11, v12, v1
-; GISEL-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v5, v10, 0
-; GISEL-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v8, v7
-; GISEL-NEXT:    v_add_i32_e32 v11, vcc, v11, v7
-; GISEL-NEXT:    v_mad_u64_u32 v[7:8], s[4:5], v5, v11, v[1:2]
-; GISEL-NEXT:    v_sub_i32_e32 v0, vcc, v9, v0
-; GISEL-NEXT:    v_mad_u64_u32 v[7:8], s[4:5], 0, v10, v[7:8]
+; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v9, v8
+; GISEL-NEXT:    v_add_i32_e32 v12, vcc, v12, v8
+; GISEL-NEXT:    v_mad_u64_u32 v[8:9], s[4:5], v5, v12, v[1:2]
+; GISEL-NEXT:    v_sub_i32_e32 v0, vcc, v10, v0
+; GISEL-NEXT:    v_mad_u64_u32 v[8:9], s[4:5], 0, v11, v[8:9]
 ; GISEL-NEXT:    s_sub_u32 s6, 0, 0x12d8fb
 ; GISEL-NEXT:    s_subb_u32 s7, 0, 0
-; GISEL-NEXT:    v_subb_u32_e64 v1, s[4:5], v12, v7, vcc
-; GISEL-NEXT:    v_sub_i32_e64 v7, s[4:5], v12, v7
+; GISEL-NEXT:    v_subb_u32_e64 v1, s[4:5], v13, v8, vcc
+; GISEL-NEXT:    v_sub_i32_e64 v8, s[4:5], v13, v8
 ; GISEL-NEXT:    v_cmp_ge_u32_e64 s[4:5], v0, v5
-; GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, -1, s[4:5]
+; GISEL-NEXT:    v_cndmask_b32_e64 v9, 0, -1, s[4:5]
 ; GISEL-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v1
-; GISEL-NEXT:    v_subbrev_u32_e32 v1, vcc, 0, v7, vcc
-; GISEL-NEXT:    v_cvt_f32_u32_e32 v7, 0x12d8fb
-; GISEL-NEXT:    v_sub_i32_e32 v0, vcc, v0, v5
+; GISEL-NEXT:    v_subbrev_u32_e32 v1, vcc, 0, v8, vcc
+; GISEL-NEXT:    v_sub_i32_e32 v8, vcc, v0, v5
+; GISEL-NEXT:    v_cndmask_b32_e64 v10, -1, v9, s[4:5]
 ; GISEL-NEXT:    v_subbrev_u32_e32 v9, vcc, 0, v1, vcc
-; GISEL-NEXT:    v_mac_f32_e32 v7, 0x4f800000, v6
-; GISEL-NEXT:    v_rcp_iflag_f32_e32 v1, v7
-; GISEL-NEXT:    v_add_i32_e32 v12, vcc, 1, v10
-; GISEL-NEXT:    v_addc_u32_e32 v13, vcc, 0, v11, vcc
-; GISEL-NEXT:    v_mul_f32_e32 v1, 0x5f7ffffc, v1
-; GISEL-NEXT:    v_mul_f32_e32 v6, 0x2f800000, v1
-; GISEL-NEXT:    v_trunc_f32_e32 v6, v6
-; GISEL-NEXT:    v_mac_f32_e32 v1, 0xcf800000, v6
-; GISEL-NEXT:    v_cvt_u32_f32_e32 v14, v1
-; GISEL-NEXT:    v_cndmask_b32_e64 v8, -1, v8, s[4:5]
-; GISEL-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v5
-; GISEL-NEXT:    v_cvt_u32_f32_e32 v15, v6
-; GISEL-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], s6, v14, 0
-; GISEL-NEXT:    v_cndmask_b32_e64 v7, 0, -1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v13, vcc, 1, v11
+; GISEL-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], s6, v6, 0
+; GISEL-NEXT:    v_addc_u32_e32 v14, vcc, 0, v12, vcc
+; GISEL-NEXT:    v_cmp_ge_u32_e32 vcc, v8, v5
+; GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, -1, vcc
 ; GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v9
-; GISEL-NEXT:    v_cndmask_b32_e32 v9, -1, v7, vcc
-; GISEL-NEXT:    v_mad_u64_u32 v[6:7], s[4:5], s6, v15, v[1:2]
-; GISEL-NEXT:    v_add_i32_e32 v1, vcc, 1, v12
-; GISEL-NEXT:    v_mad_u64_u32 v[6:7], s[4:5], s7, v14, v[6:7]
-; GISEL-NEXT:    v_addc_u32_e32 v16, vcc, 0, v13, vcc
-; GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v9
-; GISEL-NEXT:    v_cndmask_b32_e32 v7, v12, v1, vcc
-; GISEL-NEXT:    v_mul_lo_u32 v1, v15, v0
-; GISEL-NEXT:    v_mul_lo_u32 v9, v14, v6
-; GISEL-NEXT:    v_cndmask_b32_e32 v12, v13, v16, vcc
-; GISEL-NEXT:    v_mul_hi_u32 v13, v14, v0
-; GISEL-NEXT:    v_mul_hi_u32 v0, v15, v0
-; GISEL-NEXT:    v_add_i32_e32 v1, vcc, v1, v9
-; GISEL-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
+; GISEL-NEXT:    v_cndmask_b32_e32 v15, -1, v8, vcc
+; GISEL-NEXT:    v_mad_u64_u32 v[8:9], s[4:5], s6, v7, v[1:2]
+; GISEL-NEXT:    v_add_i32_e32 v1, vcc, 1, v13
+; GISEL-NEXT:    v_mad_u64_u32 v[8:9], s[4:5], s7, v6, v[8:9]
+; GISEL-NEXT:    v_addc_u32_e32 v16, vcc, 0, v14, vcc
+; GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v15
+; GISEL-NEXT:    v_cndmask_b32_e32 v9, v13, v1, vcc
+; GISEL-NEXT:    v_mul_lo_u32 v1, v7, v0
+; GISEL-NEXT:    v_mul_lo_u32 v13, v6, v8
+; GISEL-NEXT:    v_mul_hi_u32 v15, v6, v0
+; GISEL-NEXT:    v_cndmask_b32_e32 v14, v14, v16, vcc
+; GISEL-NEXT:    v_mul_hi_u32 v0, v7, v0
 ; GISEL-NEXT:    v_add_i32_e32 v1, vcc, v1, v13
+; GISEL-NEXT:    v_cndmask_b32_e64 v13, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v1, vcc, v1, v15
 ; GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
-; GISEL-NEXT:    v_mul_lo_u32 v13, v15, v6
-; GISEL-NEXT:    v_add_i32_e32 v1, vcc, v9, v1
-; GISEL-NEXT:    v_mul_hi_u32 v9, v14, v6
-; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v13, v0
+; GISEL-NEXT:    v_mul_lo_u32 v15, v7, v8
+; GISEL-NEXT:    v_add_i32_e32 v1, vcc, v13, v1
+; GISEL-NEXT:    v_mul_hi_u32 v13, v6, v8
+; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v15, v0
+; GISEL-NEXT:    v_cndmask_b32_e64 v15, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v0, v13
 ; GISEL-NEXT:    v_cndmask_b32_e64 v13, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v0, v9
-; GISEL-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v13, v9
-; GISEL-NEXT:    v_mul_hi_u32 v6, v15, v6
+; GISEL-NEXT:    v_add_i32_e32 v13, vcc, v15, v13
+; GISEL-NEXT:    v_mul_hi_u32 v8, v7, v8
 ; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
 ; GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v1, vcc, v9, v1
-; GISEL-NEXT:    v_add_i32_e32 v1, vcc, v6, v1
-; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v14, v0
-; GISEL-NEXT:    v_addc_u32_e32 v13, vcc, v15, v1, vcc
-; GISEL-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], s6, v9, 0
-; GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v8
-; GISEL-NEXT:    v_cndmask_b32_e32 v8, v10, v7, vcc
+; GISEL-NEXT:    v_add_i32_e32 v1, vcc, v13, v1
+; GISEL-NEXT:    v_add_i32_e32 v1, vcc, v8, v1
+; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v6, v0
+; GISEL-NEXT:    v_addc_u32_e32 v13, vcc, v7, v1, vcc
+; GISEL-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], s6, v8, 0
+; GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v10
+; GISEL-NEXT:    v_cndmask_b32_e32 v9, v11, v9, vcc
 ; GISEL-NEXT:    v_mad_u64_u32 v[6:7], s[4:5], s6, v13, v[1:2]
-; GISEL-NEXT:    v_xor_b32_e32 v1, v8, v4
-; GISEL-NEXT:    v_ashrrev_i32_e32 v8, 31, v3
-; GISEL-NEXT:    v_mad_u64_u32 v[6:7], s[4:5], s7, v9, v[6:7]
-; GISEL-NEXT:    v_cndmask_b32_e32 v10, v11, v12, vcc
-; GISEL-NEXT:    v_add_i32_e32 v2, vcc, v2, v8
-; GISEL-NEXT:    v_addc_u32_e32 v3, vcc, v3, v8, vcc
-; GISEL-NEXT:    v_xor_b32_e32 v11, v2, v8
+; GISEL-NEXT:    v_xor_b32_e32 v1, v9, v4
+; GISEL-NEXT:    v_ashrrev_i32_e32 v9, 31, v3
+; GISEL-NEXT:    v_mad_u64_u32 v[6:7], s[4:5], s7, v8, v[6:7]
+; GISEL-NEXT:    v_cndmask_b32_e32 v10, v12, v14, vcc
+; GISEL-NEXT:    v_add_i32_e32 v2, vcc, v2, v9
+; GISEL-NEXT:    v_addc_u32_e32 v3, vcc, v3, v9, vcc
+; GISEL-NEXT:    v_xor_b32_e32 v11, v2, v9
 ; GISEL-NEXT:    v_mul_lo_u32 v2, v13, v0
-; GISEL-NEXT:    v_mul_lo_u32 v7, v9, v6
-; GISEL-NEXT:    v_xor_b32_e32 v12, v3, v8
-; GISEL-NEXT:    v_mul_hi_u32 v3, v9, v0
+; GISEL-NEXT:    v_mul_lo_u32 v7, v8, v6
+; GISEL-NEXT:    v_xor_b32_e32 v12, v3, v9
+; GISEL-NEXT:    v_mul_hi_u32 v3, v8, v0
 ; GISEL-NEXT:    v_mul_hi_u32 v0, v13, v0
 ; GISEL-NEXT:    v_add_i32_e32 v2, vcc, v2, v7
 ; GISEL-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
@@ -1935,7 +1899,7 @@ define <2 x i64> @v_sdiv_v2i64_oddk_denom(<2 x i64> %num) {
 ; GISEL-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
 ; GISEL-NEXT:    v_mul_lo_u32 v3, v13, v6
 ; GISEL-NEXT:    v_add_i32_e32 v2, vcc, v7, v2
-; GISEL-NEXT:    v_mul_hi_u32 v7, v9, v6
+; GISEL-NEXT:    v_mul_hi_u32 v7, v8, v6
 ; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v3, v0
 ; GISEL-NEXT:    v_cndmask_b32_e64 v3, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v0, v7
@@ -1946,13 +1910,13 @@ define <2 x i64> @v_sdiv_v2i64_oddk_denom(<2 x i64> %num) {
 ; GISEL-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
 ; GISEL-NEXT:    v_add_i32_e32 v2, vcc, v6, v2
-; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v9, v0
+; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v8, v0
 ; GISEL-NEXT:    v_addc_u32_e32 v2, vcc, v13, v2, vcc
 ; GISEL-NEXT:    v_mul_lo_u32 v3, v12, v0
 ; GISEL-NEXT:    v_mul_lo_u32 v6, v11, v2
 ; GISEL-NEXT:    v_mul_hi_u32 v7, v11, v0
 ; GISEL-NEXT:    v_mul_hi_u32 v0, v12, v0
-; GISEL-NEXT:    v_xor_b32_e32 v9, v10, v4
+; GISEL-NEXT:    v_xor_b32_e32 v8, v10, v4
 ; GISEL-NEXT:    v_add_i32_e32 v3, vcc, v3, v6
 ; GISEL-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v3, vcc, v3, v7
@@ -1974,7 +1938,7 @@ define <2 x i64> @v_sdiv_v2i64_oddk_denom(<2 x i64> %num) {
 ; GISEL-NEXT:    v_mov_b32_e32 v0, v3
 ; GISEL-NEXT:    v_mad_u64_u32 v[6:7], s[4:5], v5, v13, v[0:1]
 ; GISEL-NEXT:    v_sub_i32_e32 v0, vcc, v1, v4
-; GISEL-NEXT:    v_subb_u32_e32 v1, vcc, v9, v4, vcc
+; GISEL-NEXT:    v_subb_u32_e32 v1, vcc, v8, v4, vcc
 ; GISEL-NEXT:    v_mad_u64_u32 v[3:4], s[4:5], 0, v10, v[6:7]
 ; GISEL-NEXT:    v_sub_i32_e32 v2, vcc, v11, v2
 ; GISEL-NEXT:    v_subb_u32_e64 v4, s[4:5], v12, v3, vcc
@@ -2000,10 +1964,10 @@ define <2 x i64> @v_sdiv_v2i64_oddk_denom(<2 x i64> %num) {
 ; GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v4
 ; GISEL-NEXT:    v_cndmask_b32_e32 v2, v10, v2, vcc
 ; GISEL-NEXT:    v_cndmask_b32_e32 v3, v13, v3, vcc
-; GISEL-NEXT:    v_xor_b32_e32 v2, v2, v8
-; GISEL-NEXT:    v_xor_b32_e32 v3, v3, v8
-; GISEL-NEXT:    v_sub_i32_e32 v2, vcc, v2, v8
-; GISEL-NEXT:    v_subb_u32_e32 v3, vcc, v3, v8, vcc
+; GISEL-NEXT:    v_xor_b32_e32 v2, v2, v9
+; GISEL-NEXT:    v_xor_b32_e32 v3, v3, v9
+; GISEL-NEXT:    v_sub_i32_e32 v2, vcc, v2, v9
+; GISEL-NEXT:    v_subb_u32_e32 v3, vcc, v3, v9, vcc
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; CGP-LABEL: v_sdiv_v2i64_oddk_denom:
@@ -2011,220 +1975,206 @@ define <2 x i64> @v_sdiv_v2i64_oddk_denom(<2 x i64> %num) {
 ; CGP-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CGP-NEXT:    v_cvt_f32_u32_e32 v4, 0x12d8fb
 ; CGP-NEXT:    v_cvt_f32_ubyte0_e32 v5, 0
-; CGP-NEXT:    s_mov_b32 s7, 0xffed2705
-; CGP-NEXT:    s_mov_b32 s6, 0x12d8fb
+; CGP-NEXT:    v_mov_b32_e32 v6, 0xffed2705
 ; CGP-NEXT:    v_mac_f32_e32 v4, 0x4f800000, v5
 ; CGP-NEXT:    v_rcp_iflag_f32_e32 v4, v4
 ; CGP-NEXT:    v_mul_f32_e32 v4, 0x5f7ffffc, v4
 ; CGP-NEXT:    v_mul_f32_e32 v5, 0x2f800000, v4
-; CGP-NEXT:    v_trunc_f32_e32 v6, v5
-; CGP-NEXT:    v_mac_f32_e32 v4, 0xcf800000, v6
-; CGP-NEXT:    v_cvt_u32_f32_e32 v7, v4
-; CGP-NEXT:    v_mov_b32_e32 v4, 0xffed2705
-; CGP-NEXT:    v_cvt_u32_f32_e32 v8, v6
-; CGP-NEXT:    v_mad_u64_u32 v[4:5], s[4:5], v4, v7, 0
-; CGP-NEXT:    v_mad_u64_u32 v[5:6], s[4:5], s7, v8, v[5:6]
-; CGP-NEXT:    v_mul_hi_u32 v9, v7, v4
-; CGP-NEXT:    v_mad_u64_u32 v[5:6], s[4:5], -1, v7, v[5:6]
-; CGP-NEXT:    v_mul_lo_u32 v6, v8, v4
-; CGP-NEXT:    v_mul_hi_u32 v4, v8, v4
-; CGP-NEXT:    v_mul_lo_u32 v10, v7, v5
-; CGP-NEXT:    v_mul_lo_u32 v11, v8, v5
-; CGP-NEXT:    v_mul_hi_u32 v12, v7, v5
-; CGP-NEXT:    v_mul_hi_u32 v5, v8, v5
-; CGP-NEXT:    v_add_i32_e32 v6, vcc, v6, v10
-; CGP-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v6, vcc, v6, v9
-; CGP-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v6, vcc, v10, v6
-; CGP-NEXT:    v_add_i32_e32 v4, vcc, v11, v4
-; CGP-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v4, vcc, v4, v12
-; CGP-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v9, vcc, v9, v10
-; CGP-NEXT:    v_add_i32_e32 v4, vcc, v4, v6
-; CGP-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v6, vcc, v9, v6
-; CGP-NEXT:    v_add_i32_e32 v5, vcc, v5, v6
-; CGP-NEXT:    v_add_i32_e32 v9, vcc, v7, v4
-; CGP-NEXT:    v_mad_u64_u32 v[6:7], s[4:5], s7, v9, 0
-; CGP-NEXT:    v_addc_u32_e32 v10, vcc, v8, v5, vcc
-; CGP-NEXT:    v_ashrrev_i32_e32 v5, 31, v1
-; CGP-NEXT:    v_mov_b32_e32 v4, v7
-; CGP-NEXT:    v_mad_u64_u32 v[7:8], s[4:5], s7, v10, v[4:5]
-; CGP-NEXT:    v_add_i32_e32 v4, vcc, v0, v5
-; CGP-NEXT:    v_addc_u32_e32 v11, vcc, v1, v5, vcc
-; CGP-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], -1, v9, v[7:8]
-; CGP-NEXT:    v_xor_b32_e32 v8, v4, v5
-; CGP-NEXT:    v_mul_lo_u32 v1, v10, v6
-; CGP-NEXT:    v_mul_lo_u32 v4, v9, v0
-; CGP-NEXT:    v_mul_hi_u32 v7, v9, v6
-; CGP-NEXT:    v_mul_hi_u32 v6, v10, v6
-; CGP-NEXT:    v_xor_b32_e32 v11, v11, v5
+; CGP-NEXT:    v_trunc_f32_e32 v7, v5
+; CGP-NEXT:    v_mac_f32_e32 v4, 0xcf800000, v7
+; CGP-NEXT:    v_cvt_u32_f32_e32 v8, v4
+; CGP-NEXT:    v_cvt_u32_f32_e32 v9, v7
+; CGP-NEXT:    v_mad_u64_u32 v[4:5], s[4:5], v6, v8, 0
+; CGP-NEXT:    v_mov_b32_e32 v7, v5
+; CGP-NEXT:    v_mad_u64_u32 v[10:11], s[4:5], v6, v9, v[7:8]
+; CGP-NEXT:    v_mul_hi_u32 v12, v9, v4
+; CGP-NEXT:    v_mad_u64_u32 v[13:14], s[4:5], -1, v8, v[10:11]
+; CGP-NEXT:    v_mul_lo_u32 v10, v9, v4
+; CGP-NEXT:    v_mul_hi_u32 v11, v8, v4
+; CGP-NEXT:    v_mul_lo_u32 v4, v8, v13
+; CGP-NEXT:    v_mul_lo_u32 v7, v9, v13
+; CGP-NEXT:    v_mul_hi_u32 v14, v8, v13
+; CGP-NEXT:    v_mul_hi_u32 v13, v9, v13
+; CGP-NEXT:    v_add_i32_e32 v4, vcc, v10, v4
+; CGP-NEXT:    v_cndmask_b32_e64 v15, 0, 1, vcc
+; CGP-NEXT:    v_add_i32_e32 v4, vcc, v4, v11
+; CGP-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc
+; CGP-NEXT:    v_add_i32_e32 v4, vcc, v15, v4
+; CGP-NEXT:    v_add_i32_e32 v7, vcc, v7, v12
+; CGP-NEXT:    v_cndmask_b32_e64 v15, 0, 1, vcc
+; CGP-NEXT:    v_add_i32_e32 v7, vcc, v7, v14
+; CGP-NEXT:    v_cndmask_b32_e64 v14, 0, 1, vcc
+; CGP-NEXT:    v_add_i32_e32 v14, vcc, v15, v14
+; CGP-NEXT:    v_add_i32_e32 v4, vcc, v7, v4
+; CGP-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
+; CGP-NEXT:    v_add_i32_e32 v7, vcc, v14, v7
+; CGP-NEXT:    v_add_i32_e32 v7, vcc, v13, v7
+; CGP-NEXT:    v_add_i32_e32 v16, vcc, v8, v4
+; CGP-NEXT:    v_mad_u64_u32 v[13:14], s[4:5], v6, v16, 0
+; CGP-NEXT:    v_addc_u32_e32 v17, vcc, v9, v7, vcc
+; CGP-NEXT:    v_mov_b32_e32 v4, v14
+; CGP-NEXT:    v_mad_u64_u32 v[14:15], s[4:5], v6, v17, v[4:5]
+; CGP-NEXT:    v_ashrrev_i32_e32 v7, 31, v1
+; CGP-NEXT:    v_add_i32_e32 v0, vcc, v0, v7
+; CGP-NEXT:    v_mad_u64_u32 v[14:15], s[4:5], -1, v16, v[14:15]
+; CGP-NEXT:    v_addc_u32_e32 v1, vcc, v1, v7, vcc
+; CGP-NEXT:    v_xor_b32_e32 v15, v0, v7
+; CGP-NEXT:    v_mul_lo_u32 v0, v17, v13
+; CGP-NEXT:    v_mul_lo_u32 v4, v16, v14
+; CGP-NEXT:    v_xor_b32_e32 v18, v1, v7
+; CGP-NEXT:    v_mul_hi_u32 v1, v16, v13
+; CGP-NEXT:    v_mul_hi_u32 v13, v17, v13
+; CGP-NEXT:    v_add_i32_e32 v0, vcc, v0, v4
+; CGP-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc
+; CGP-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
+; CGP-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; CGP-NEXT:    v_mul_lo_u32 v1, v17, v14
+; CGP-NEXT:    v_add_i32_e32 v0, vcc, v4, v0
+; CGP-NEXT:    v_mul_hi_u32 v4, v16, v14
+; CGP-NEXT:    v_add_i32_e32 v1, vcc, v1, v13
+; CGP-NEXT:    v_cndmask_b32_e64 v13, 0, 1, vcc
 ; CGP-NEXT:    v_add_i32_e32 v1, vcc, v1, v4
 ; CGP-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v1, vcc, v1, v7
+; CGP-NEXT:    v_add_i32_e32 v4, vcc, v13, v4
+; CGP-NEXT:    v_mul_hi_u32 v13, v17, v14
+; CGP-NEXT:    v_add_i32_e32 v0, vcc, v1, v0
 ; CGP-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
-; CGP-NEXT:    v_mul_lo_u32 v7, v10, v0
-; CGP-NEXT:    v_add_i32_e32 v1, vcc, v4, v1
-; CGP-NEXT:    v_mul_hi_u32 v4, v9, v0
-; CGP-NEXT:    v_add_i32_e32 v6, vcc, v7, v6
-; CGP-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v4, vcc, v6, v4
-; CGP-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v6, vcc, v7, v6
-; CGP-NEXT:    v_mul_hi_u32 v0, v10, v0
 ; CGP-NEXT:    v_add_i32_e32 v1, vcc, v4, v1
-; CGP-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v4, vcc, v6, v4
-; CGP-NEXT:    v_add_i32_e32 v0, vcc, v0, v4
-; CGP-NEXT:    v_add_i32_e32 v1, vcc, v9, v1
-; CGP-NEXT:    v_addc_u32_e32 v0, vcc, v10, v0, vcc
-; CGP-NEXT:    v_mul_lo_u32 v6, v11, v1
-; CGP-NEXT:    v_mul_lo_u32 v7, v8, v0
-; CGP-NEXT:    v_mul_hi_u32 v9, v8, v1
-; CGP-NEXT:    v_mul_hi_u32 v1, v11, v1
-; CGP-NEXT:    v_mul_hi_u32 v10, v11, v0
-; CGP-NEXT:    v_add_i32_e32 v6, vcc, v6, v7
-; CGP-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v6, vcc, v6, v9
-; CGP-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc
-; CGP-NEXT:    v_mul_lo_u32 v9, v11, v0
-; CGP-NEXT:    v_add_i32_e32 v6, vcc, v7, v6
-; CGP-NEXT:    v_mul_hi_u32 v7, v8, v0
-; CGP-NEXT:    v_add_i32_e32 v1, vcc, v9, v1
-; CGP-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v1, vcc, v1, v7
-; CGP-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v7, vcc, v9, v7
-; CGP-NEXT:    v_add_i32_e32 v9, vcc, v1, v6
-; CGP-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], s6, v9, 0
-; CGP-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v6, vcc, v7, v6
-; CGP-NEXT:    v_add_i32_e32 v10, vcc, v10, v6
-; CGP-NEXT:    v_mad_u64_u32 v[6:7], s[4:5], s6, v10, v[1:2]
-; CGP-NEXT:    v_sub_i32_e32 v0, vcc, v8, v0
+; CGP-NEXT:    v_add_i32_e32 v1, vcc, v13, v1
+; CGP-NEXT:    v_add_i32_e32 v0, vcc, v16, v0
+; CGP-NEXT:    v_addc_u32_e32 v1, vcc, v17, v1, vcc
+; CGP-NEXT:    v_mul_lo_u32 v13, v18, v0
+; CGP-NEXT:    v_mul_lo_u32 v14, v15, v1
+; CGP-NEXT:    v_mul_hi_u32 v16, v15, v0
+; CGP-NEXT:    v_mul_hi_u32 v0, v18, v0
 ; CGP-NEXT:    v_mov_b32_e32 v4, 0x12d8fb
-; CGP-NEXT:    v_subb_u32_e64 v1, s[4:5], v11, v6, vcc
-; CGP-NEXT:    v_sub_i32_e64 v6, s[4:5], v11, v6
+; CGP-NEXT:    v_add_i32_e32 v13, vcc, v13, v14
+; CGP-NEXT:    v_cndmask_b32_e64 v14, 0, 1, vcc
+; CGP-NEXT:    v_add_i32_e32 v13, vcc, v13, v16
+; CGP-NEXT:    v_cndmask_b32_e64 v13, 0, 1, vcc
+; CGP-NEXT:    v_mul_lo_u32 v16, v18, v1
+; CGP-NEXT:    v_add_i32_e32 v13, vcc, v14, v13
+; CGP-NEXT:    v_mul_hi_u32 v14, v15, v1
+; CGP-NEXT:    v_add_i32_e32 v0, vcc, v16, v0
+; CGP-NEXT:    v_cndmask_b32_e64 v16, 0, 1, vcc
+; CGP-NEXT:    v_add_i32_e32 v0, vcc, v0, v14
+; CGP-NEXT:    v_cndmask_b32_e64 v14, 0, 1, vcc
+; CGP-NEXT:    v_add_i32_e32 v14, vcc, v16, v14
+; CGP-NEXT:    v_add_i32_e32 v16, vcc, v0, v13
+; CGP-NEXT:    v_mul_hi_u32 v17, v18, v1
+; CGP-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v4, v16, 0
+; CGP-NEXT:    v_cndmask_b32_e64 v13, 0, 1, vcc
+; CGP-NEXT:    v_add_i32_e32 v13, vcc, v14, v13
+; CGP-NEXT:    v_add_i32_e32 v17, vcc, v17, v13
+; CGP-NEXT:    v_mad_u64_u32 v[13:14], s[4:5], v4, v17, v[1:2]
+; CGP-NEXT:    v_sub_i32_e32 v0, vcc, v15, v0
+; CGP-NEXT:    v_subb_u32_e64 v1, s[4:5], v18, v13, vcc
+; CGP-NEXT:    v_sub_i32_e64 v13, s[4:5], v18, v13
 ; CGP-NEXT:    v_cmp_ge_u32_e64 s[4:5], v0, v4
-; CGP-NEXT:    v_cndmask_b32_e64 v7, 0, -1, s[4:5]
+; CGP-NEXT:    v_cndmask_b32_e64 v14, 0, -1, s[4:5]
 ; CGP-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v1
-; CGP-NEXT:    v_subbrev_u32_e32 v1, vcc, 0, v6, vcc
-; CGP-NEXT:    v_cvt_f32_u32_e32 v6, 0x12d8fb
+; CGP-NEXT:    v_subbrev_u32_e32 v1, vcc, 0, v13, vcc
 ; CGP-NEXT:    v_sub_i32_e32 v0, vcc, v0, v4
-; CGP-NEXT:    v_cndmask_b32_e64 v8, -1, v7, s[4:5]
-; CGP-NEXT:    v_subbrev_u32_e32 v7, vcc, 0, v1, vcc
-; CGP-NEXT:    v_cvt_f32_ubyte0_e32 v1, 0
-; CGP-NEXT:    v_mac_f32_e32 v6, 0x4f800000, v1
-; CGP-NEXT:    v_rcp_iflag_f32_e32 v1, v6
-; CGP-NEXT:    v_add_i32_e32 v11, vcc, 1, v9
-; CGP-NEXT:    v_addc_u32_e32 v12, vcc, 0, v10, vcc
-; CGP-NEXT:    v_mul_f32_e32 v1, 0x5f7ffffc, v1
-; CGP-NEXT:    v_mul_f32_e32 v6, 0x2f800000, v1
-; CGP-NEXT:    v_trunc_f32_e32 v6, v6
-; CGP-NEXT:    v_mac_f32_e32 v1, 0xcf800000, v6
-; CGP-NEXT:    v_cvt_u32_f32_e32 v13, v1
+; CGP-NEXT:    v_subbrev_u32_e32 v13, vcc, 0, v1, vcc
+; CGP-NEXT:    v_add_i32_e32 v15, vcc, 1, v16
+; CGP-NEXT:    v_addc_u32_e32 v18, vcc, 0, v17, vcc
 ; CGP-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v4
-; CGP-NEXT:    v_cvt_u32_f32_e32 v15, v6
-; CGP-NEXT:    v_cndmask_b32_e64 v14, 0, -1, vcc
-; CGP-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], s7, v13, 0
-; CGP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v7
-; CGP-NEXT:    v_cndmask_b32_e32 v14, -1, v14, vcc
-; CGP-NEXT:    v_mad_u64_u32 v[6:7], s[4:5], s7, v15, v[1:2]
-; CGP-NEXT:    v_add_i32_e32 v1, vcc, 1, v11
-; CGP-NEXT:    v_mad_u64_u32 v[6:7], s[4:5], -1, v13, v[6:7]
-; CGP-NEXT:    v_addc_u32_e32 v16, vcc, 0, v12, vcc
-; CGP-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v14
-; CGP-NEXT:    v_cndmask_b32_e32 v7, v11, v1, vcc
-; CGP-NEXT:    v_mul_lo_u32 v1, v15, v0
-; CGP-NEXT:    v_mul_lo_u32 v11, v13, v6
-; CGP-NEXT:    v_mul_hi_u32 v14, v13, v0
-; CGP-NEXT:    v_cndmask_b32_e32 v12, v12, v16, vcc
-; CGP-NEXT:    v_mul_hi_u32 v0, v15, v0
+; CGP-NEXT:    v_mov_b32_e32 v0, v5
+; CGP-NEXT:    v_cndmask_b32_e64 v14, -1, v14, s[4:5]
+; CGP-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v6, v9, v[0:1]
+; CGP-NEXT:    v_cndmask_b32_e64 v19, 0, -1, vcc
+; CGP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v13
+; CGP-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], -1, v8, v[0:1]
+; CGP-NEXT:    v_cndmask_b32_e32 v5, -1, v19, vcc
+; CGP-NEXT:    v_add_i32_e32 v1, vcc, 1, v15
+; CGP-NEXT:    v_mul_lo_u32 v19, v8, v0
+; CGP-NEXT:    v_addc_u32_e32 v13, vcc, 0, v18, vcc
+; CGP-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v5
+; CGP-NEXT:    v_cndmask_b32_e32 v5, v15, v1, vcc
+; CGP-NEXT:    v_cndmask_b32_e32 v13, v18, v13, vcc
+; CGP-NEXT:    v_add_i32_e32 v1, vcc, v10, v19
+; CGP-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
 ; CGP-NEXT:    v_add_i32_e32 v1, vcc, v1, v11
-; CGP-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v1, vcc, v1, v14
 ; CGP-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
-; CGP-NEXT:    v_mul_lo_u32 v14, v15, v6
-; CGP-NEXT:    v_add_i32_e32 v1, vcc, v11, v1
-; CGP-NEXT:    v_mul_hi_u32 v11, v13, v6
-; CGP-NEXT:    v_add_i32_e32 v0, vcc, v14, v0
-; CGP-NEXT:    v_cndmask_b32_e64 v14, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v0, vcc, v0, v11
+; CGP-NEXT:    v_mul_lo_u32 v11, v9, v0
+; CGP-NEXT:    v_add_i32_e32 v1, vcc, v10, v1
+; CGP-NEXT:    v_mul_hi_u32 v10, v8, v0
+; CGP-NEXT:    v_add_i32_e32 v11, vcc, v11, v12
+; CGP-NEXT:    v_cndmask_b32_e64 v12, 0, 1, vcc
+; CGP-NEXT:    v_add_i32_e32 v10, vcc, v11, v10
 ; CGP-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v11, vcc, v14, v11
-; CGP-NEXT:    v_mul_hi_u32 v6, v15, v6
-; CGP-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
-; CGP-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v1, vcc, v11, v1
-; CGP-NEXT:    v_add_i32_e32 v1, vcc, v6, v1
-; CGP-NEXT:    v_add_i32_e32 v11, vcc, v13, v0
-; CGP-NEXT:    v_addc_u32_e32 v13, vcc, v15, v1, vcc
-; CGP-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], s7, v11, 0
-; CGP-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v8
-; CGP-NEXT:    v_cndmask_b32_e32 v6, v9, v7, vcc
-; CGP-NEXT:    v_xor_b32_e32 v9, v6, v5
-; CGP-NEXT:    v_mad_u64_u32 v[6:7], s[4:5], s7, v13, v[1:2]
-; CGP-NEXT:    v_cndmask_b32_e32 v8, v10, v12, vcc
-; CGP-NEXT:    v_xor_b32_e32 v1, v8, v5
-; CGP-NEXT:    v_mad_u64_u32 v[6:7], s[4:5], -1, v11, v[6:7]
-; CGP-NEXT:    v_ashrrev_i32_e32 v8, 31, v3
-; CGP-NEXT:    v_add_i32_e32 v2, vcc, v2, v8
-; CGP-NEXT:    v_addc_u32_e32 v3, vcc, v3, v8, vcc
-; CGP-NEXT:    v_xor_b32_e32 v7, v2, v8
-; CGP-NEXT:    v_mul_lo_u32 v2, v13, v0
-; CGP-NEXT:    v_mul_lo_u32 v10, v11, v6
-; CGP-NEXT:    v_xor_b32_e32 v12, v3, v8
-; CGP-NEXT:    v_mul_hi_u32 v3, v11, v0
-; CGP-NEXT:    v_mul_hi_u32 v0, v13, v0
-; CGP-NEXT:    v_add_i32_e32 v2, vcc, v2, v10
+; CGP-NEXT:    v_add_i32_e32 v11, vcc, v12, v11
+; CGP-NEXT:    v_mul_hi_u32 v0, v9, v0
+; CGP-NEXT:    v_add_i32_e32 v1, vcc, v10, v1
 ; CGP-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
+; CGP-NEXT:    v_add_i32_e32 v10, vcc, v11, v10
+; CGP-NEXT:    v_add_i32_e32 v0, vcc, v0, v10
+; CGP-NEXT:    v_add_i32_e32 v8, vcc, v8, v1
+; CGP-NEXT:    v_addc_u32_e32 v9, vcc, v9, v0, vcc
+; CGP-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v6, v8, 0
+; CGP-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v14
+; CGP-NEXT:    v_cndmask_b32_e32 v5, v16, v5, vcc
+; CGP-NEXT:    v_xor_b32_e32 v11, v5, v7
+; CGP-NEXT:    v_mad_u64_u32 v[5:6], s[4:5], v6, v9, v[1:2]
+; CGP-NEXT:    v_cndmask_b32_e32 v10, v17, v13, vcc
+; CGP-NEXT:    v_xor_b32_e32 v1, v10, v7
+; CGP-NEXT:    v_mad_u64_u32 v[5:6], s[4:5], -1, v8, v[5:6]
+; CGP-NEXT:    v_ashrrev_i32_e32 v10, 31, v3
+; CGP-NEXT:    v_add_i32_e32 v2, vcc, v2, v10
+; CGP-NEXT:    v_addc_u32_e32 v3, vcc, v3, v10, vcc
+; CGP-NEXT:    v_xor_b32_e32 v12, v2, v10
+; CGP-NEXT:    v_mul_lo_u32 v2, v9, v0
+; CGP-NEXT:    v_mul_lo_u32 v6, v8, v5
+; CGP-NEXT:    v_xor_b32_e32 v13, v3, v10
+; CGP-NEXT:    v_mul_hi_u32 v3, v8, v0
+; CGP-NEXT:    v_mul_hi_u32 v0, v9, v0
+; CGP-NEXT:    v_add_i32_e32 v2, vcc, v2, v6
+; CGP-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc
 ; CGP-NEXT:    v_add_i32_e32 v2, vcc, v2, v3
 ; CGP-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
-; CGP-NEXT:    v_mul_lo_u32 v3, v13, v6
-; CGP-NEXT:    v_add_i32_e32 v2, vcc, v10, v2
-; CGP-NEXT:    v_mul_hi_u32 v10, v11, v6
+; CGP-NEXT:    v_mul_lo_u32 v3, v9, v5
+; CGP-NEXT:    v_add_i32_e32 v2, vcc, v6, v2
+; CGP-NEXT:    v_mul_hi_u32 v6, v8, v5
 ; CGP-NEXT:    v_add_i32_e32 v0, vcc, v3, v0
 ; CGP-NEXT:    v_cndmask_b32_e64 v3, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v0, vcc, v0, v10
-; CGP-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v3, vcc, v3, v10
-; CGP-NEXT:    v_mul_hi_u32 v6, v13, v6
+; CGP-NEXT:    v_add_i32_e32 v0, vcc, v0, v6
+; CGP-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc
+; CGP-NEXT:    v_add_i32_e32 v3, vcc, v3, v6
+; CGP-NEXT:    v_mul_hi_u32 v5, v9, v5
 ; CGP-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
 ; CGP-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
 ; CGP-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
-; CGP-NEXT:    v_add_i32_e32 v2, vcc, v6, v2
-; CGP-NEXT:    v_add_i32_e32 v3, vcc, v11, v0
-; CGP-NEXT:    v_addc_u32_e32 v2, vcc, v13, v2, vcc
-; CGP-NEXT:    v_mul_lo_u32 v6, v12, v3
-; CGP-NEXT:    v_mul_lo_u32 v10, v7, v2
-; CGP-NEXT:    v_sub_i32_e32 v0, vcc, v9, v5
-; CGP-NEXT:    v_subb_u32_e32 v1, vcc, v1, v5, vcc
-; CGP-NEXT:    v_mul_hi_u32 v5, v7, v3
-; CGP-NEXT:    v_add_i32_e32 v6, vcc, v6, v10
-; CGP-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v5, vcc, v6, v5
-; CGP-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
+; CGP-NEXT:    v_add_i32_e32 v2, vcc, v5, v2
+; CGP-NEXT:    v_add_i32_e32 v3, vcc, v8, v0
+; CGP-NEXT:    v_addc_u32_e32 v2, vcc, v9, v2, vcc
+; CGP-NEXT:    v_mul_lo_u32 v5, v13, v3
 ; CGP-NEXT:    v_mul_lo_u32 v6, v12, v2
-; CGP-NEXT:    v_mul_hi_u32 v3, v12, v3
-; CGP-NEXT:    v_add_i32_e32 v5, vcc, v9, v5
-; CGP-NEXT:    v_mul_hi_u32 v9, v7, v2
-; CGP-NEXT:    v_add_i32_e32 v3, vcc, v6, v3
+; CGP-NEXT:    v_sub_i32_e32 v0, vcc, v11, v7
+; CGP-NEXT:    v_subb_u32_e32 v1, vcc, v1, v7, vcc
+; CGP-NEXT:    v_mul_hi_u32 v7, v12, v3
+; CGP-NEXT:    v_add_i32_e32 v5, vcc, v5, v6
 ; CGP-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v3, vcc, v3, v9
-; CGP-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v6, vcc, v6, v9
-; CGP-NEXT:    v_add_i32_e32 v9, vcc, v3, v5
-; CGP-NEXT:    v_mul_hi_u32 v10, v12, v2
-; CGP-NEXT:    v_mad_u64_u32 v[2:3], s[4:5], s6, v9, 0
+; CGP-NEXT:    v_add_i32_e32 v5, vcc, v5, v7
 ; CGP-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
+; CGP-NEXT:    v_mul_lo_u32 v7, v13, v2
+; CGP-NEXT:    v_mul_hi_u32 v3, v13, v3
 ; CGP-NEXT:    v_add_i32_e32 v5, vcc, v6, v5
-; CGP-NEXT:    v_add_i32_e32 v10, vcc, v10, v5
-; CGP-NEXT:    v_mad_u64_u32 v[5:6], s[4:5], s6, v10, v[3:4]
-; CGP-NEXT:    v_sub_i32_e32 v2, vcc, v7, v2
-; CGP-NEXT:    v_subb_u32_e64 v3, s[4:5], v12, v5, vcc
-; CGP-NEXT:    v_sub_i32_e64 v5, s[4:5], v12, v5
+; CGP-NEXT:    v_mul_hi_u32 v6, v12, v2
+; CGP-NEXT:    v_add_i32_e32 v3, vcc, v7, v3
+; CGP-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
+; CGP-NEXT:    v_add_i32_e32 v3, vcc, v3, v6
+; CGP-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc
+; CGP-NEXT:    v_add_i32_e32 v6, vcc, v7, v6
+; CGP-NEXT:    v_add_i32_e32 v7, vcc, v3, v5
+; CGP-NEXT:    v_mul_hi_u32 v8, v13, v2
+; CGP-NEXT:    v_mad_u64_u32 v[2:3], s[4:5], v4, v7, 0
+; CGP-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
+; CGP-NEXT:    v_add_i32_e32 v5, vcc, v6, v5
+; CGP-NEXT:    v_add_i32_e32 v8, vcc, v8, v5
+; CGP-NEXT:    v_mad_u64_u32 v[5:6], s[4:5], v4, v8, v[3:4]
+; CGP-NEXT:    v_sub_i32_e32 v2, vcc, v12, v2
+; CGP-NEXT:    v_subb_u32_e64 v3, s[4:5], v13, v5, vcc
+; CGP-NEXT:    v_sub_i32_e64 v5, s[4:5], v13, v5
 ; CGP-NEXT:    v_subbrev_u32_e32 v5, vcc, 0, v5, vcc
 ; CGP-NEXT:    v_cmp_ge_u32_e64 s[4:5], v2, v4
 ; CGP-NEXT:    v_sub_i32_e32 v2, vcc, v2, v4
@@ -2232,24 +2182,24 @@ define <2 x i64> @v_sdiv_v2i64_oddk_denom(<2 x i64> %num) {
 ; CGP-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v3
 ; CGP-NEXT:    v_subbrev_u32_e32 v5, vcc, 0, v5, vcc
 ; CGP-NEXT:    v_cndmask_b32_e64 v3, -1, v6, s[4:5]
-; CGP-NEXT:    v_add_i32_e32 v6, vcc, 1, v9
-; CGP-NEXT:    v_addc_u32_e32 v7, vcc, 0, v10, vcc
+; CGP-NEXT:    v_add_i32_e32 v6, vcc, 1, v7
+; CGP-NEXT:    v_addc_u32_e32 v9, vcc, 0, v8, vcc
 ; CGP-NEXT:    v_cmp_ge_u32_e32 vcc, v2, v4
 ; CGP-NEXT:    v_cndmask_b32_e64 v2, 0, -1, vcc
 ; CGP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v5
 ; CGP-NEXT:    v_cndmask_b32_e32 v2, -1, v2, vcc
 ; CGP-NEXT:    v_add_i32_e32 v4, vcc, 1, v6
-; CGP-NEXT:    v_addc_u32_e32 v5, vcc, 0, v7, vcc
+; CGP-NEXT:    v_addc_u32_e32 v5, vcc, 0, v9, vcc
 ; CGP-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v2
 ; CGP-NEXT:    v_cndmask_b32_e32 v2, v6, v4, vcc
-; CGP-NEXT:    v_cndmask_b32_e32 v4, v7, v5, vcc
+; CGP-NEXT:    v_cndmask_b32_e32 v4, v9, v5, vcc
 ; CGP-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v3
-; CGP-NEXT:    v_cndmask_b32_e32 v2, v9, v2, vcc
-; CGP-NEXT:    v_cndmask_b32_e32 v3, v10, v4, vcc
-; CGP-NEXT:    v_xor_b32_e32 v2, v2, v8
-; CGP-NEXT:    v_xor_b32_e32 v3, v3, v8
-; CGP-NEXT:    v_sub_i32_e32 v2, vcc, v2, v8
-; CGP-NEXT:    v_subb_u32_e32 v3, vcc, v3, v8, vcc
+; CGP-NEXT:    v_cndmask_b32_e32 v2, v7, v2, vcc
+; CGP-NEXT:    v_cndmask_b32_e32 v3, v8, v4, vcc
+; CGP-NEXT:    v_xor_b32_e32 v2, v2, v10
+; CGP-NEXT:    v_xor_b32_e32 v3, v3, v10
+; CGP-NEXT:    v_sub_i32_e32 v2, vcc, v2, v10
+; CGP-NEXT:    v_subb_u32_e32 v3, vcc, v3, v10, vcc
 ; CGP-NEXT:    s_setpc_b64 s[30:31]
   %result = sdiv <2 x i64> %num, <i64 1235195, i64 1235195>
   ret <2 x i64> %result
@@ -2445,186 +2395,187 @@ define <2 x i64> @v_sdiv_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) {
 ; GISEL-LABEL: v_sdiv_v2i64_pow2_shl_denom:
 ; GISEL:       ; %bb.0:
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT:    s_mov_b64 s[4:5], 0x1000
-; GISEL-NEXT:    v_lshl_b64 v[7:8], s[4:5], v4
+; GISEL-NEXT:    v_mov_b32_e32 v9, 0x1000
+; GISEL-NEXT:    v_mov_b32_e32 v10, 0
+; GISEL-NEXT:    v_lshl_b64 v[7:8], v[9:10], v4
+; GISEL-NEXT:    v_lshl_b64 v[9:10], v[9:10], v6
 ; GISEL-NEXT:    v_ashrrev_i32_e32 v4, 31, v8
 ; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v7, v4
 ; GISEL-NEXT:    v_addc_u32_e32 v7, vcc, v8, v4, vcc
 ; GISEL-NEXT:    v_xor_b32_e32 v8, v5, v4
 ; GISEL-NEXT:    v_xor_b32_e32 v5, v7, v4
 ; GISEL-NEXT:    v_cvt_f32_u32_e32 v7, v8
-; GISEL-NEXT:    v_cvt_f32_u32_e32 v9, v5
-; GISEL-NEXT:    v_sub_i32_e32 v13, vcc, 0, v8
-; GISEL-NEXT:    v_subb_u32_e32 v14, vcc, 0, v5, vcc
-; GISEL-NEXT:    v_mac_f32_e32 v7, 0x4f800000, v9
+; GISEL-NEXT:    v_cvt_f32_u32_e32 v11, v5
+; GISEL-NEXT:    v_sub_i32_e32 v15, vcc, 0, v8
+; GISEL-NEXT:    v_subb_u32_e32 v16, vcc, 0, v5, vcc
+; GISEL-NEXT:    v_mac_f32_e32 v7, 0x4f800000, v11
 ; GISEL-NEXT:    v_rcp_iflag_f32_e32 v7, v7
 ; GISEL-NEXT:    v_mul_f32_e32 v7, 0x5f7ffffc, v7
-; GISEL-NEXT:    v_mul_f32_e32 v9, 0x2f800000, v7
-; GISEL-NEXT:    v_trunc_f32_e32 v11, v9
-; GISEL-NEXT:    v_mac_f32_e32 v7, 0xcf800000, v11
-; GISEL-NEXT:    v_cvt_u32_f32_e32 v12, v7
-; GISEL-NEXT:    v_cvt_u32_f32_e32 v15, v11
-; GISEL-NEXT:    v_mad_u64_u32 v[9:10], s[6:7], v13, v12, 0
-; GISEL-NEXT:    v_mov_b32_e32 v7, v10
-; GISEL-NEXT:    v_mad_u64_u32 v[10:11], s[6:7], v13, v15, v[7:8]
-; GISEL-NEXT:    v_mul_lo_u32 v7, v15, v9
-; GISEL-NEXT:    v_mul_hi_u32 v16, v12, v9
-; GISEL-NEXT:    v_mad_u64_u32 v[10:11], s[6:7], v14, v12, v[10:11]
-; GISEL-NEXT:    v_mul_hi_u32 v9, v15, v9
-; GISEL-NEXT:    v_mul_lo_u32 v11, v12, v10
-; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v7, v11
-; GISEL-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v7, v16
+; GISEL-NEXT:    v_mul_f32_e32 v11, 0x2f800000, v7
+; GISEL-NEXT:    v_trunc_f32_e32 v13, v11
+; GISEL-NEXT:    v_mac_f32_e32 v7, 0xcf800000, v13
+; GISEL-NEXT:    v_cvt_u32_f32_e32 v14, v7
+; GISEL-NEXT:    v_cvt_u32_f32_e32 v17, v13
+; GISEL-NEXT:    v_mad_u64_u32 v[11:12], s[4:5], v15, v14, 0
+; GISEL-NEXT:    v_mov_b32_e32 v7, v12
+; GISEL-NEXT:    v_mad_u64_u32 v[12:13], s[4:5], v15, v17, v[7:8]
+; GISEL-NEXT:    v_mul_lo_u32 v7, v17, v11
+; GISEL-NEXT:    v_mul_hi_u32 v18, v14, v11
+; GISEL-NEXT:    v_mad_u64_u32 v[12:13], s[4:5], v16, v14, v[12:13]
+; GISEL-NEXT:    v_mul_hi_u32 v11, v17, v11
+; GISEL-NEXT:    v_mul_lo_u32 v13, v14, v12
+; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v7, v13
+; GISEL-NEXT:    v_cndmask_b32_e64 v13, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v7, v18
 ; GISEL-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
-; GISEL-NEXT:    v_mul_lo_u32 v16, v15, v10
+; GISEL-NEXT:    v_mul_lo_u32 v18, v17, v12
+; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v13, v7
+; GISEL-NEXT:    v_mul_hi_u32 v13, v14, v12
+; GISEL-NEXT:    v_add_i32_e32 v11, vcc, v18, v11
+; GISEL-NEXT:    v_cndmask_b32_e64 v18, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v11, vcc, v11, v13
+; GISEL-NEXT:    v_cndmask_b32_e64 v13, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v13, vcc, v18, v13
+; GISEL-NEXT:    v_mul_hi_u32 v12, v17, v12
 ; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v11, v7
-; GISEL-NEXT:    v_mul_hi_u32 v11, v12, v10
-; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v16, v9
-; GISEL-NEXT:    v_cndmask_b32_e64 v16, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v9, v11
 ; GISEL-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v11, vcc, v16, v11
-; GISEL-NEXT:    v_mul_hi_u32 v10, v15, v10
-; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v9, v7
-; GISEL-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v11, v9
-; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v10, v9
-; GISEL-NEXT:    v_add_i32_e32 v12, vcc, v12, v7
-; GISEL-NEXT:    v_addc_u32_e32 v15, vcc, v15, v9, vcc
-; GISEL-NEXT:    v_mad_u64_u32 v[9:10], s[6:7], v13, v12, 0
-; GISEL-NEXT:    v_mov_b32_e32 v7, v10
-; GISEL-NEXT:    v_mad_u64_u32 v[10:11], s[6:7], v13, v15, v[7:8]
+; GISEL-NEXT:    v_add_i32_e32 v11, vcc, v13, v11
+; GISEL-NEXT:    v_add_i32_e32 v11, vcc, v12, v11
+; GISEL-NEXT:    v_add_i32_e32 v14, vcc, v14, v7
+; GISEL-NEXT:    v_addc_u32_e32 v17, vcc, v17, v11, vcc
+; GISEL-NEXT:    v_mad_u64_u32 v[11:12], s[4:5], v15, v14, 0
+; GISEL-NEXT:    v_mov_b32_e32 v7, v12
+; GISEL-NEXT:    v_mad_u64_u32 v[12:13], s[4:5], v15, v17, v[7:8]
 ; GISEL-NEXT:    v_ashrrev_i32_e32 v7, 31, v1
 ; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v0, v7
-; GISEL-NEXT:    v_mad_u64_u32 v[10:11], s[6:7], v14, v12, v[10:11]
+; GISEL-NEXT:    v_mad_u64_u32 v[12:13], s[4:5], v16, v14, v[12:13]
 ; GISEL-NEXT:    v_addc_u32_e32 v1, vcc, v1, v7, vcc
 ; GISEL-NEXT:    v_xor_b32_e32 v13, v0, v7
-; GISEL-NEXT:    v_mul_lo_u32 v0, v15, v9
-; GISEL-NEXT:    v_mul_lo_u32 v11, v12, v10
-; GISEL-NEXT:    v_xor_b32_e32 v14, v1, v7
-; GISEL-NEXT:    v_mul_hi_u32 v1, v12, v9
-; GISEL-NEXT:    v_mul_hi_u32 v9, v15, v9
-; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v0, v11
-; GISEL-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
+; GISEL-NEXT:    v_mul_lo_u32 v0, v17, v11
+; GISEL-NEXT:    v_mul_lo_u32 v15, v14, v12
+; GISEL-NEXT:    v_xor_b32_e32 v16, v1, v7
+; GISEL-NEXT:    v_mul_hi_u32 v1, v14, v11
+; GISEL-NEXT:    v_mul_hi_u32 v11, v17, v11
+; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v0, v15
+; GISEL-NEXT:    v_cndmask_b32_e64 v15, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
 ; GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
-; GISEL-NEXT:    v_mul_lo_u32 v1, v15, v10
-; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v11, v0
-; GISEL-NEXT:    v_mul_hi_u32 v11, v12, v10
-; GISEL-NEXT:    v_add_i32_e32 v1, vcc, v1, v9
-; GISEL-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
+; GISEL-NEXT:    v_mul_lo_u32 v1, v17, v12
+; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v15, v0
+; GISEL-NEXT:    v_mul_hi_u32 v15, v14, v12
 ; GISEL-NEXT:    v_add_i32_e32 v1, vcc, v1, v11
 ; GISEL-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v9, v11
-; GISEL-NEXT:    v_mul_hi_u32 v10, v15, v10
+; GISEL-NEXT:    v_add_i32_e32 v1, vcc, v1, v15
+; GISEL-NEXT:    v_cndmask_b32_e64 v15, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v11, vcc, v11, v15
+; GISEL-NEXT:    v_mul_hi_u32 v12, v17, v12
 ; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v1, v0
 ; GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v1, vcc, v9, v1
-; GISEL-NEXT:    v_add_i32_e32 v1, vcc, v10, v1
-; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v12, v0
-; GISEL-NEXT:    v_addc_u32_e32 v1, vcc, v15, v1, vcc
-; GISEL-NEXT:    v_mul_lo_u32 v9, v14, v0
-; GISEL-NEXT:    v_mul_lo_u32 v10, v13, v1
-; GISEL-NEXT:    v_mul_hi_u32 v11, v13, v0
-; GISEL-NEXT:    v_mul_hi_u32 v0, v14, v0
-; GISEL-NEXT:    v_xor_b32_e32 v7, v7, v4
-; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v9, v10
-; GISEL-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v9, v11
-; GISEL-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
-; GISEL-NEXT:    v_mul_lo_u32 v11, v14, v1
-; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v10, v9
-; GISEL-NEXT:    v_mul_hi_u32 v10, v13, v1
-; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v11, v0
+; GISEL-NEXT:    v_add_i32_e32 v1, vcc, v11, v1
+; GISEL-NEXT:    v_add_i32_e32 v1, vcc, v12, v1
+; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v14, v0
+; GISEL-NEXT:    v_addc_u32_e32 v1, vcc, v17, v1, vcc
+; GISEL-NEXT:    v_mul_lo_u32 v11, v16, v0
+; GISEL-NEXT:    v_mul_lo_u32 v12, v13, v1
+; GISEL-NEXT:    v_mul_hi_u32 v14, v13, v0
+; GISEL-NEXT:    v_mul_hi_u32 v0, v16, v0
+; GISEL-NEXT:    v_mul_hi_u32 v15, v16, v1
+; GISEL-NEXT:    v_add_i32_e32 v11, vcc, v11, v12
+; GISEL-NEXT:    v_cndmask_b32_e64 v12, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v11, vcc, v11, v14
 ; GISEL-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v0, v10
-; GISEL-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v10, vcc, v11, v10
-; GISEL-NEXT:    v_add_i32_e32 v15, vcc, v0, v9
-; GISEL-NEXT:    v_mul_hi_u32 v11, v14, v1
-; GISEL-NEXT:    v_mad_u64_u32 v[0:1], s[6:7], v8, v15, 0
-; GISEL-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v10, v9
-; GISEL-NEXT:    v_add_i32_e32 v16, vcc, v11, v9
-; GISEL-NEXT:    v_mad_u64_u32 v[9:10], s[6:7], v8, v16, v[1:2]
-; GISEL-NEXT:    v_lshl_b64 v[11:12], s[4:5], v6
+; GISEL-NEXT:    v_mul_lo_u32 v14, v16, v1
+; GISEL-NEXT:    v_add_i32_e32 v11, vcc, v12, v11
+; GISEL-NEXT:    v_mul_hi_u32 v12, v13, v1
+; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v14, v0
+; GISEL-NEXT:    v_cndmask_b32_e64 v14, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v0, v12
+; GISEL-NEXT:    v_cndmask_b32_e64 v12, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v12, vcc, v14, v12
+; GISEL-NEXT:    v_add_i32_e32 v14, vcc, v0, v11
+; GISEL-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v8, v14, 0
+; GISEL-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v11, vcc, v12, v11
+; GISEL-NEXT:    v_add_i32_e32 v15, vcc, v15, v11
+; GISEL-NEXT:    v_mad_u64_u32 v[11:12], s[4:5], v8, v15, v[1:2]
 ; GISEL-NEXT:    v_sub_i32_e32 v0, vcc, v13, v0
-; GISEL-NEXT:    v_mad_u64_u32 v[9:10], s[4:5], v5, v15, v[9:10]
-; GISEL-NEXT:    v_subb_u32_e64 v1, s[4:5], v14, v9, vcc
-; GISEL-NEXT:    v_sub_i32_e64 v6, s[4:5], v14, v9
+; GISEL-NEXT:    v_mad_u64_u32 v[11:12], s[4:5], v5, v14, v[11:12]
+; GISEL-NEXT:    v_xor_b32_e32 v7, v7, v4
+; GISEL-NEXT:    v_subb_u32_e64 v1, s[4:5], v16, v11, vcc
+; GISEL-NEXT:    v_sub_i32_e64 v6, s[4:5], v16, v11
 ; GISEL-NEXT:    v_cmp_ge_u32_e64 s[4:5], v1, v5
-; GISEL-NEXT:    v_cndmask_b32_e64 v9, 0, -1, s[4:5]
+; GISEL-NEXT:    v_cndmask_b32_e64 v11, 0, -1, s[4:5]
 ; GISEL-NEXT:    v_cmp_ge_u32_e64 s[4:5], v0, v8
-; GISEL-NEXT:    v_cndmask_b32_e64 v10, 0, -1, s[4:5]
+; GISEL-NEXT:    v_cndmask_b32_e64 v12, 0, -1, s[4:5]
 ; GISEL-NEXT:    v_cmp_eq_u32_e64 s[4:5], v1, v5
 ; GISEL-NEXT:    v_subb_u32_e32 v1, vcc, v6, v5, vcc
 ; GISEL-NEXT:    v_sub_i32_e32 v0, vcc, v0, v8
-; GISEL-NEXT:    v_subbrev_u32_e32 v14, vcc, 0, v1, vcc
-; GISEL-NEXT:    v_ashrrev_i32_e32 v6, 31, v12
-; GISEL-NEXT:    v_add_i32_e32 v1, vcc, v11, v6
-; GISEL-NEXT:    v_cndmask_b32_e64 v13, v9, v10, s[4:5]
-; GISEL-NEXT:    v_addc_u32_e32 v9, vcc, v12, v6, vcc
+; GISEL-NEXT:    v_cndmask_b32_e64 v13, v11, v12, s[4:5]
+; GISEL-NEXT:    v_subbrev_u32_e32 v11, vcc, 0, v1, vcc
+; GISEL-NEXT:    v_ashrrev_i32_e32 v6, 31, v10
+; GISEL-NEXT:    v_add_i32_e32 v1, vcc, v9, v6
+; GISEL-NEXT:    v_addc_u32_e32 v9, vcc, v10, v6, vcc
 ; GISEL-NEXT:    v_xor_b32_e32 v10, v1, v6
 ; GISEL-NEXT:    v_xor_b32_e32 v9, v9, v6
 ; GISEL-NEXT:    v_cvt_f32_u32_e32 v1, v10
-; GISEL-NEXT:    v_cvt_f32_u32_e32 v11, v9
-; GISEL-NEXT:    v_add_i32_e32 v17, vcc, 1, v15
-; GISEL-NEXT:    v_addc_u32_e32 v18, vcc, 0, v16, vcc
-; GISEL-NEXT:    v_mac_f32_e32 v1, 0x4f800000, v11
+; GISEL-NEXT:    v_cvt_f32_u32_e32 v12, v9
+; GISEL-NEXT:    v_add_i32_e32 v16, vcc, 1, v14
+; GISEL-NEXT:    v_addc_u32_e32 v17, vcc, 0, v15, vcc
+; GISEL-NEXT:    v_mac_f32_e32 v1, 0x4f800000, v12
 ; GISEL-NEXT:    v_rcp_iflag_f32_e32 v1, v1
-; GISEL-NEXT:    v_cmp_ge_u32_e32 vcc, v14, v5
-; GISEL-NEXT:    v_cndmask_b32_e64 v11, 0, -1, vcc
+; GISEL-NEXT:    v_cmp_ge_u32_e32 vcc, v11, v5
+; GISEL-NEXT:    v_cndmask_b32_e64 v12, 0, -1, vcc
 ; GISEL-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v8
 ; GISEL-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v1
 ; GISEL-NEXT:    v_mul_f32_e32 v1, 0x2f800000, v0
-; GISEL-NEXT:    v_trunc_f32_e32 v12, v1
-; GISEL-NEXT:    v_mac_f32_e32 v0, 0xcf800000, v12
+; GISEL-NEXT:    v_trunc_f32_e32 v18, v1
+; GISEL-NEXT:    v_mac_f32_e32 v0, 0xcf800000, v18
 ; GISEL-NEXT:    v_cvt_u32_f32_e32 v19, v0
 ; GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, -1, vcc
 ; GISEL-NEXT:    v_sub_i32_e32 v20, vcc, 0, v10
 ; GISEL-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v20, v19, 0
-; GISEL-NEXT:    v_cvt_u32_f32_e32 v22, v12
+; GISEL-NEXT:    v_cvt_u32_f32_e32 v18, v18
 ; GISEL-NEXT:    v_subb_u32_e32 v21, vcc, 0, v9, vcc
-; GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, v14, v5
-; GISEL-NEXT:    v_cndmask_b32_e32 v5, v11, v8, vcc
-; GISEL-NEXT:    v_mad_u64_u32 v[11:12], s[4:5], v20, v22, v[1:2]
-; GISEL-NEXT:    v_add_i32_e32 v1, vcc, 1, v17
+; GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, v11, v5
+; GISEL-NEXT:    v_cndmask_b32_e32 v5, v12, v8, vcc
+; GISEL-NEXT:    v_mad_u64_u32 v[11:12], s[4:5], v20, v18, v[1:2]
+; GISEL-NEXT:    v_add_i32_e32 v1, vcc, 1, v16
 ; GISEL-NEXT:    v_mad_u64_u32 v[11:12], s[4:5], v21, v19, v[11:12]
-; GISEL-NEXT:    v_addc_u32_e32 v8, vcc, 0, v18, vcc
+; GISEL-NEXT:    v_addc_u32_e32 v8, vcc, 0, v17, vcc
 ; GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v5
-; GISEL-NEXT:    v_cndmask_b32_e32 v5, v17, v1, vcc
-; GISEL-NEXT:    v_mul_lo_u32 v1, v22, v0
+; GISEL-NEXT:    v_cndmask_b32_e32 v5, v16, v1, vcc
+; GISEL-NEXT:    v_mul_lo_u32 v1, v18, v0
 ; GISEL-NEXT:    v_mul_lo_u32 v12, v19, v11
-; GISEL-NEXT:    v_mul_hi_u32 v14, v19, v0
-; GISEL-NEXT:    v_cndmask_b32_e32 v8, v18, v8, vcc
-; GISEL-NEXT:    v_mul_hi_u32 v0, v22, v0
+; GISEL-NEXT:    v_mul_hi_u32 v16, v19, v0
+; GISEL-NEXT:    v_cndmask_b32_e32 v8, v17, v8, vcc
+; GISEL-NEXT:    v_mul_hi_u32 v0, v18, v0
 ; GISEL-NEXT:    v_add_i32_e32 v1, vcc, v1, v12
 ; GISEL-NEXT:    v_cndmask_b32_e64 v12, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v1, vcc, v1, v14
+; GISEL-NEXT:    v_add_i32_e32 v1, vcc, v1, v16
 ; GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
-; GISEL-NEXT:    v_mul_lo_u32 v14, v22, v11
+; GISEL-NEXT:    v_mul_lo_u32 v16, v18, v11
 ; GISEL-NEXT:    v_add_i32_e32 v1, vcc, v12, v1
 ; GISEL-NEXT:    v_mul_hi_u32 v12, v19, v11
-; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v14, v0
-; GISEL-NEXT:    v_cndmask_b32_e64 v14, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v16, v0
+; GISEL-NEXT:    v_cndmask_b32_e64 v16, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v0, v12
 ; GISEL-NEXT:    v_cndmask_b32_e64 v12, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v12, vcc, v14, v12
-; GISEL-NEXT:    v_mul_hi_u32 v11, v22, v11
+; GISEL-NEXT:    v_add_i32_e32 v12, vcc, v16, v12
+; GISEL-NEXT:    v_mul_hi_u32 v11, v18, v11
 ; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
 ; GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v1, vcc, v12, v1
 ; GISEL-NEXT:    v_add_i32_e32 v1, vcc, v11, v1
 ; GISEL-NEXT:    v_add_i32_e32 v11, vcc, v19, v0
-; GISEL-NEXT:    v_addc_u32_e32 v12, vcc, v22, v1, vcc
+; GISEL-NEXT:    v_addc_u32_e32 v12, vcc, v18, v1, vcc
 ; GISEL-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v20, v11, 0
 ; GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v13
-; GISEL-NEXT:    v_cndmask_b32_e32 v13, v15, v5, vcc
+; GISEL-NEXT:    v_cndmask_b32_e32 v13, v14, v5, vcc
 ; GISEL-NEXT:    v_mad_u64_u32 v[4:5], s[4:5], v20, v12, v[1:2]
 ; GISEL-NEXT:    v_xor_b32_e32 v1, v13, v7
 ; GISEL-NEXT:    v_ashrrev_i32_e32 v13, 31, v3
 ; GISEL-NEXT:    v_mad_u64_u32 v[4:5], s[4:5], v21, v11, v[4:5]
-; GISEL-NEXT:    v_cndmask_b32_e32 v8, v16, v8, vcc
+; GISEL-NEXT:    v_cndmask_b32_e32 v8, v15, v8, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v2, vcc, v2, v13
 ; GISEL-NEXT:    v_addc_u32_e32 v3, vcc, v3, v13, vcc
 ; GISEL-NEXT:    v_xor_b32_e32 v5, v2, v13
@@ -2718,17 +2669,16 @@ define <2 x i64> @v_sdiv_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) {
 ; CGP-LABEL: v_sdiv_v2i64_pow2_shl_denom:
 ; CGP:       ; %bb.0:
 ; CGP-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CGP-NEXT:    s_mov_b64 s[4:5], 0x1000
-; CGP-NEXT:    v_lshl_b64 v[11:12], s[4:5], v4
+; CGP-NEXT:    v_mov_b32_e32 v5, v2
+; CGP-NEXT:    v_mov_b32_e32 v7, v3
+; CGP-NEXT:    v_mov_b32_e32 v2, 0x1000
+; CGP-NEXT:    v_mov_b32_e32 v3, 0
+; CGP-NEXT:    v_lshl_b64 v[11:12], v[2:3], v4
 ; CGP-NEXT:    v_mov_b32_e32 v9, v1
 ; CGP-NEXT:    v_mov_b32_e32 v8, v0
 ; CGP-NEXT:    v_or_b32_e32 v1, v9, v12
 ; CGP-NEXT:    v_mov_b32_e32 v0, 0
-; CGP-NEXT:    v_mov_b32_e32 v5, v2
-; CGP-NEXT:    v_mov_b32_e32 v7, v3
-; CGP-NEXT:    v_mov_b32_e32 v2, 0x1000
 ; CGP-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[0:1]
-; CGP-NEXT:    v_mov_b32_e32 v3, 0
 ; CGP-NEXT:    ; implicit-def: $vgpr0_vgpr1
 ; CGP-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; CGP-NEXT:    s_xor_b64 s[6:7], exec, s[4:5]
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll
index 4c444f46ff3dddd..5297df3bedf8f2e 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll
@@ -277,16 +277,15 @@ define amdgpu_kernel void @sdivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1)
 ; GFX8-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v11
 ; GFX8-NEXT:    v_cndmask_b32_e32 v9, v9, v12, vcc
 ; GFX8-NEXT:    v_cndmask_b32_e32 v10, v10, v13, vcc
-; GFX8-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v6
-; GFX8-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v11
-; GFX8-NEXT:    v_cndmask_b32_e32 v4, v4, v9, vcc
-; GFX8-NEXT:    v_cndmask_b32_e64 v5, v7, v5, s[0:1]
-; GFX8-NEXT:    v_cndmask_b32_e64 v1, v8, v1, s[0:1]
+; GFX8-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v6
+; GFX8-NEXT:    v_cndmask_b32_e32 v5, v7, v5, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v1, v8, v1, vcc
+; GFX8-NEXT:    v_cndmask_b32_e64 v4, v4, v9, s[0:1]
+; GFX8-NEXT:    v_cndmask_b32_e64 v3, v3, v10, s[0:1]
+; GFX8-NEXT:    v_cndmask_b32_e64 v5, v0, v5, s[0:1]
+; GFX8-NEXT:    v_cndmask_b32_e64 v2, v2, v1, s[0:1]
 ; GFX8-NEXT:    s_xor_b64 s[0:1], s[2:3], s[12:13]
-; GFX8-NEXT:    v_cndmask_b32_e32 v3, v3, v10, vcc
-; GFX8-NEXT:    v_cndmask_b32_e32 v5, v0, v5, vcc
 ; GFX8-NEXT:    v_xor_b32_e32 v0, s0, v4
-; GFX8-NEXT:    v_cndmask_b32_e32 v2, v2, v1, vcc
 ; GFX8-NEXT:    v_xor_b32_e32 v1, s1, v3
 ; GFX8-NEXT:    v_mov_b32_e32 v3, s1
 ; GFX8-NEXT:    v_subrev_u32_e32 v0, vcc, s0, v0
@@ -436,16 +435,15 @@ define amdgpu_kernel void @sdivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1)
 ; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v12
 ; GFX9-NEXT:    v_cndmask_b32_e32 v10, v10, v13, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e32 v11, v11, v14, vcc
-; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v6
-; GFX9-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v12
-; GFX9-NEXT:    v_cndmask_b32_e32 v5, v5, v10, vcc
-; GFX9-NEXT:    v_cndmask_b32_e64 v6, v8, v7, s[0:1]
-; GFX9-NEXT:    v_cndmask_b32_e64 v1, v9, v1, s[0:1]
+; GFX9-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v6
+; GFX9-NEXT:    v_cndmask_b32_e32 v6, v8, v7, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, v9, v1, vcc
+; GFX9-NEXT:    v_cndmask_b32_e64 v5, v5, v10, s[0:1]
+; GFX9-NEXT:    v_cndmask_b32_e64 v3, v3, v11, s[0:1]
+; GFX9-NEXT:    v_cndmask_b32_e64 v6, v0, v6, s[0:1]
+; GFX9-NEXT:    v_cndmask_b32_e64 v2, v2, v1, s[0:1]
 ; GFX9-NEXT:    s_xor_b64 s[0:1], s[2:3], s[12:13]
-; GFX9-NEXT:    v_cndmask_b32_e32 v3, v3, v11, vcc
-; GFX9-NEXT:    v_cndmask_b32_e32 v6, v0, v6, vcc
 ; GFX9-NEXT:    v_xor_b32_e32 v0, s0, v5
-; GFX9-NEXT:    v_cndmask_b32_e32 v2, v2, v1, vcc
 ; GFX9-NEXT:    v_xor_b32_e32 v1, s1, v3
 ; GFX9-NEXT:    v_mov_b32_e32 v3, s1
 ; GFX9-NEXT:    v_subrev_co_u32_e32 v0, vcc, s0, v0
@@ -559,47 +557,46 @@ define amdgpu_kernel void @sdivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1)
 ; GFX10-NEXT:    v_add_co_u32 v2, s10, v2, v0
 ; GFX10-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s10
 ; GFX10-NEXT:    v_mul_lo_u32 v4, s9, v2
-; GFX10-NEXT:    v_add_co_u32 v6, vcc_lo, v2, 1
 ; GFX10-NEXT:    v_add3_u32 v3, v3, v0, v1
 ; GFX10-NEXT:    v_mad_u64_u32 v[0:1], s10, s8, v2, 0
 ; GFX10-NEXT:    v_mul_lo_u32 v5, s8, v3
-; GFX10-NEXT:    v_add_co_ci_u32_e32 v7, vcc_lo, 0, v3, vcc_lo
 ; GFX10-NEXT:    v_add3_u32 v1, v1, v5, v4
-; GFX10-NEXT:    v_add_co_u32 v4, vcc_lo, v6, 1
-; GFX10-NEXT:    v_add_co_ci_u32_e32 v5, vcc_lo, 0, v7, vcc_lo
-; GFX10-NEXT:    v_sub_nc_u32_e32 v8, s1, v1
+; GFX10-NEXT:    v_add_co_u32 v4, vcc_lo, v2, 1
+; GFX10-NEXT:    v_add_co_ci_u32_e32 v5, vcc_lo, 0, v3, vcc_lo
+; GFX10-NEXT:    v_sub_nc_u32_e32 v6, s1, v1
 ; GFX10-NEXT:    v_sub_co_u32 v0, vcc_lo, s0, v0
 ; GFX10-NEXT:    v_sub_co_ci_u32_e64 v1, s0, s1, v1, vcc_lo
-; GFX10-NEXT:    v_subrev_co_ci_u32_e32 v8, vcc_lo, s9, v8, vcc_lo
+; GFX10-NEXT:    v_subrev_co_ci_u32_e32 v6, vcc_lo, s9, v6, vcc_lo
 ; GFX10-NEXT:    v_cmp_le_u32_e32 vcc_lo, s8, v0
-; GFX10-NEXT:    v_cndmask_b32_e64 v9, 0, -1, vcc_lo
-; GFX10-NEXT:    v_sub_co_u32 v10, vcc_lo, v0, s8
-; GFX10-NEXT:    v_subrev_co_ci_u32_e64 v11, s0, 0, v8, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e64 v7, 0, -1, vcc_lo
+; GFX10-NEXT:    v_sub_co_u32 v8, vcc_lo, v0, s8
+; GFX10-NEXT:    v_subrev_co_ci_u32_e64 v9, s0, 0, v6, vcc_lo
 ; GFX10-NEXT:    v_cmp_le_u32_e64 s0, s9, v1
-; GFX10-NEXT:    v_subrev_co_ci_u32_e32 v8, vcc_lo, s9, v8, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, s9, v11
+; GFX10-NEXT:    v_subrev_co_ci_u32_e32 v6, vcc_lo, s9, v6, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e64 v10, 0, -1, s0
+; GFX10-NEXT:    v_cmp_le_u32_e64 s0, s8, v8
+; GFX10-NEXT:    v_cndmask_b32_e64 v11, 0, -1, s0
+; GFX10-NEXT:    v_cmp_le_u32_e64 s0, s9, v9
 ; GFX10-NEXT:    v_cndmask_b32_e64 v12, 0, -1, s0
-; GFX10-NEXT:    v_cmp_le_u32_e64 s0, s8, v10
-; GFX10-NEXT:    v_cndmask_b32_e64 v13, 0, -1, s0
-; GFX10-NEXT:    v_cmp_le_u32_e64 s0, s9, v11
-; GFX10-NEXT:    v_cndmask_b32_e64 v14, 0, -1, s0
+; GFX10-NEXT:    v_add_co_u32 v13, s0, v4, 1
+; GFX10-NEXT:    v_add_co_ci_u32_e64 v14, s0, 0, v5, s0
+; GFX10-NEXT:    v_cmp_eq_u32_e64 s0, s9, v9
+; GFX10-NEXT:    v_cndmask_b32_e64 v11, v12, v11, s0
 ; GFX10-NEXT:    v_cmp_eq_u32_e64 s0, s9, v1
-; GFX10-NEXT:    v_cndmask_b32_e64 v9, v12, v9, s0
-; GFX10-NEXT:    v_cndmask_b32_e32 v12, v14, v13, vcc_lo
-; GFX10-NEXT:    v_sub_co_u32 v13, vcc_lo, v10, s8
-; GFX10-NEXT:    v_subrev_co_ci_u32_e32 v8, vcc_lo, 0, v8, vcc_lo
-; GFX10-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v12
-; GFX10-NEXT:    v_cmp_ne_u32_e64 s0, 0, v12
-; GFX10-NEXT:    v_cmp_ne_u32_e64 s1, 0, v9
+; GFX10-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v11
+; GFX10-NEXT:    v_cndmask_b32_e64 v7, v10, v7, s0
+; GFX10-NEXT:    v_sub_co_u32 v10, s0, v8, s8
+; GFX10-NEXT:    v_subrev_co_ci_u32_e64 v6, s0, 0, v6, s0
+; GFX10-NEXT:    v_cndmask_b32_e32 v4, v4, v13, vcc_lo
+; GFX10-NEXT:    v_cmp_ne_u32_e64 s0, 0, v7
+; GFX10-NEXT:    v_cndmask_b32_e32 v5, v5, v14, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v7, v8, v10, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v6, v9, v6, vcc_lo
 ; GFX10-NEXT:    s_xor_b64 s[8:9], s[2:3], s[12:13]
-; GFX10-NEXT:    v_cndmask_b32_e32 v4, v6, v4, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e32 v5, v7, v5, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e64 v6, v10, v13, s0
-; GFX10-NEXT:    v_cndmask_b32_e64 v7, v11, v8, s0
-; GFX10-NEXT:    v_cndmask_b32_e64 v2, v2, v4, s1
-; GFX10-NEXT:    v_cndmask_b32_e64 v3, v3, v5, s1
-; GFX10-NEXT:    v_cndmask_b32_e64 v0, v0, v6, s1
-; GFX10-NEXT:    v_cndmask_b32_e64 v1, v1, v7, s1
+; GFX10-NEXT:    v_cndmask_b32_e64 v2, v2, v4, s0
+; GFX10-NEXT:    v_cndmask_b32_e64 v3, v3, v5, s0
+; GFX10-NEXT:    v_cndmask_b32_e64 v0, v0, v7, s0
+; GFX10-NEXT:    v_cndmask_b32_e64 v1, v1, v6, s0
 ; GFX10-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX10-NEXT:    v_xor_b32_e32 v2, s8, v2
 ; GFX10-NEXT:    v_xor_b32_e32 v3, s9, v3
@@ -1424,39 +1421,39 @@ define amdgpu_kernel void @sdivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1
 ; GFX8-NEXT:    v_rcp_iflag_f32_e32 v0, v0
 ; GFX8-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v11
 ; GFX8-NEXT:    v_cndmask_b32_e32 v5, v1, v12, vcc
-; GFX8-NEXT:    v_cndmask_b32_e32 v10, v10, v13, vcc
+; GFX8-NEXT:    s_xor_b64 s[12:13], s[0:1], s[6:7]
 ; GFX8-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
 ; GFX8-NEXT:    v_mul_f32_e32 v1, 0x2f800000, v0
-; GFX8-NEXT:    v_trunc_f32_e32 v12, v1
-; GFX8-NEXT:    v_mul_f32_e32 v1, 0xcf800000, v12
+; GFX8-NEXT:    v_trunc_f32_e32 v11, v1
+; GFX8-NEXT:    v_mul_f32_e32 v1, 0xcf800000, v11
 ; GFX8-NEXT:    v_add_f32_e32 v0, v1, v0
-; GFX8-NEXT:    v_cvt_u32_f32_e32 v13, v0
-; GFX8-NEXT:    s_xor_b64 s[12:13], s[0:1], s[6:7]
+; GFX8-NEXT:    v_cvt_u32_f32_e32 v12, v0
 ; GFX8-NEXT:    s_sub_u32 s5, 0, s2
-; GFX8-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v2
-; GFX8-NEXT:    v_mad_u64_u32 v[0:1], s[0:1], s5, v13, 0
-; GFX8-NEXT:    v_cndmask_b32_e32 v4, v4, v5, vcc
-; GFX8-NEXT:    v_cvt_u32_f32_e32 v5, v12
 ; GFX8-NEXT:    s_subb_u32 s20, 0, s3
-; GFX8-NEXT:    v_cndmask_b32_e32 v10, v3, v10, vcc
-; GFX8-NEXT:    v_mad_u64_u32 v[1:2], s[0:1], s5, v5, v[1:2]
-; GFX8-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v11
-; GFX8-NEXT:    v_cndmask_b32_e64 v3, v8, v15, s[0:1]
-; GFX8-NEXT:    v_mad_u64_u32 v[1:2], s[14:15], s20, v13, v[1:2]
-; GFX8-NEXT:    v_cndmask_b32_e64 v2, v9, v16, s[0:1]
-; GFX8-NEXT:    v_cndmask_b32_e32 v7, v7, v3, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v10, v10, v13, vcc
+; GFX8-NEXT:    v_mad_u64_u32 v[0:1], s[0:1], s5, v12, 0
+; GFX8-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v2
+; GFX8-NEXT:    v_cndmask_b32_e64 v4, v4, v5, s[0:1]
+; GFX8-NEXT:    v_cvt_u32_f32_e32 v5, v11
+; GFX8-NEXT:    v_cndmask_b32_e64 v10, v3, v10, s[0:1]
+; GFX8-NEXT:    v_cndmask_b32_e32 v3, v8, v15, vcc
+; GFX8-NEXT:    v_cndmask_b32_e64 v7, v7, v3, s[0:1]
+; GFX8-NEXT:    v_mad_u64_u32 v[1:2], s[14:15], s5, v5, v[1:2]
 ; GFX8-NEXT:    v_mul_lo_u32 v3, v5, v0
-; GFX8-NEXT:    v_mul_lo_u32 v8, v13, v1
-; GFX8-NEXT:    v_cndmask_b32_e32 v6, v6, v2, vcc
-; GFX8-NEXT:    v_mul_hi_u32 v2, v13, v0
+; GFX8-NEXT:    v_mad_u64_u32 v[1:2], s[14:15], s20, v12, v[1:2]
+; GFX8-NEXT:    v_cndmask_b32_e32 v2, v9, v16, vcc
+; GFX8-NEXT:    v_cndmask_b32_e64 v6, v6, v2, s[0:1]
+; GFX8-NEXT:    v_mul_lo_u32 v8, v12, v1
+; GFX8-NEXT:    v_mul_hi_u32 v2, v12, v0
 ; GFX8-NEXT:    v_mul_hi_u32 v0, v5, v0
+; GFX8-NEXT:    v_xor_b32_e32 v9, s19, v10
 ; GFX8-NEXT:    v_add_u32_e32 v3, vcc, v3, v8
 ; GFX8-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
 ; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v3, v2
 ; GFX8-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
 ; GFX8-NEXT:    v_mul_lo_u32 v3, v5, v1
 ; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v8, v2
-; GFX8-NEXT:    v_mul_hi_u32 v8, v13, v1
+; GFX8-NEXT:    v_mul_hi_u32 v8, v12, v1
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v3, v0
 ; GFX8-NEXT:    v_cndmask_b32_e64 v3, 0, 1, vcc
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v0, v8
@@ -1467,16 +1464,15 @@ define amdgpu_kernel void @sdivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1
 ; GFX8-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
 ; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v3, v2
 ; GFX8-NEXT:    v_add_u32_e32 v1, vcc, v1, v2
-; GFX8-NEXT:    v_add_u32_e32 v8, vcc, v13, v0
+; GFX8-NEXT:    v_add_u32_e32 v8, vcc, v12, v0
 ; GFX8-NEXT:    v_mad_u64_u32 v[2:3], s[0:1], s5, v8, 0
 ; GFX8-NEXT:    v_addc_u32_e32 v5, vcc, v5, v1, vcc
 ; GFX8-NEXT:    v_xor_b32_e32 v1, s18, v4
 ; GFX8-NEXT:    v_mov_b32_e32 v0, v3
 ; GFX8-NEXT:    v_mad_u64_u32 v[3:4], s[0:1], s5, v5, v[0:1]
-; GFX8-NEXT:    v_xor_b32_e32 v9, s19, v10
 ; GFX8-NEXT:    v_mov_b32_e32 v10, s19
-; GFX8-NEXT:    v_mad_u64_u32 v[3:4], s[0:1], s20, v8, v[3:4]
 ; GFX8-NEXT:    v_subrev_u32_e32 v0, vcc, s18, v1
+; GFX8-NEXT:    v_mad_u64_u32 v[3:4], s[0:1], s20, v8, v[3:4]
 ; GFX8-NEXT:    v_subb_u32_e32 v1, vcc, v9, v10, vcc
 ; GFX8-NEXT:    v_xor_b32_e32 v4, s4, v7
 ; GFX8-NEXT:    v_mul_lo_u32 v7, v5, v2
@@ -1706,99 +1702,97 @@ define amdgpu_kernel void @sdivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1
 ; GFX9-NEXT:    v_subb_co_u32_e32 v1, vcc, v1, v7, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e64 v3, 0, -1, s[0:1]
 ; GFX9-NEXT:    v_cmp_eq_u32_e64 s[0:1], s13, v6
-; GFX9-NEXT:    v_subrev_co_u32_e32 v9, vcc, s12, v8
-; GFX9-NEXT:    v_cndmask_b32_e64 v3, v2, v3, s[0:1]
-; GFX9-NEXT:    v_subbrev_co_u32_e64 v10, s[0:1], 0, v1, vcc
-; GFX9-NEXT:    v_add_co_u32_e64 v2, s[0:1], 1, v5
-; GFX9-NEXT:    v_addc_co_u32_e64 v11, s[0:1], 0, v4, s[0:1]
-; GFX9-NEXT:    v_cmp_le_u32_e64 s[0:1], s13, v10
-; GFX9-NEXT:    v_cndmask_b32_e64 v12, 0, -1, s[0:1]
-; GFX9-NEXT:    v_cmp_le_u32_e64 s[0:1], s12, v9
+; GFX9-NEXT:    v_subrev_co_u32_e32 v10, vcc, s12, v8
+; GFX9-NEXT:    v_cndmask_b32_e64 v9, v2, v3, s[0:1]
+; GFX9-NEXT:    v_subbrev_co_u32_e64 v11, s[0:1], 0, v1, vcc
+; GFX9-NEXT:    v_add_co_u32_e64 v3, s[0:1], 1, v5
+; GFX9-NEXT:    v_addc_co_u32_e64 v12, s[0:1], 0, v4, s[0:1]
+; GFX9-NEXT:    v_cmp_le_u32_e64 s[0:1], s13, v11
+; GFX9-NEXT:    v_cndmask_b32_e64 v2, 0, -1, s[0:1]
+; GFX9-NEXT:    v_cmp_le_u32_e64 s[0:1], s12, v10
 ; GFX9-NEXT:    v_cndmask_b32_e64 v13, 0, -1, s[0:1]
-; GFX9-NEXT:    v_cmp_eq_u32_e64 s[0:1], s13, v10
-; GFX9-NEXT:    v_cndmask_b32_e64 v12, v12, v13, s[0:1]
-; GFX9-NEXT:    v_add_co_u32_e64 v13, s[0:1], 1, v2
-; GFX9-NEXT:    v_addc_co_u32_e64 v14, s[0:1], 0, v11, s[0:1]
+; GFX9-NEXT:    v_cmp_eq_u32_e64 s[0:1], s13, v11
+; GFX9-NEXT:    v_cndmask_b32_e64 v13, v2, v13, s[0:1]
+; GFX9-NEXT:    v_add_co_u32_e64 v14, s[0:1], 1, v3
+; GFX9-NEXT:    v_addc_co_u32_e64 v15, s[0:1], 0, v12, s[0:1]
 ; GFX9-NEXT:    s_add_u32 s0, s14, s6
 ; GFX9-NEXT:    s_addc_u32 s1, s15, s6
 ; GFX9-NEXT:    s_add_u32 s2, s2, s16
 ; GFX9-NEXT:    s_mov_b32 s17, s16
 ; GFX9-NEXT:    s_addc_u32 s3, s3, s16
 ; GFX9-NEXT:    s_xor_b64 s[2:3], s[2:3], s[16:17]
-; GFX9-NEXT:    v_cvt_f32_u32_e32 v15, s3
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v2, s3
+; GFX9-NEXT:    v_cvt_f32_u32_e32 v16, s2
 ; GFX9-NEXT:    v_subb_co_u32_e32 v1, vcc, v1, v7, vcc
-; GFX9-NEXT:    v_cvt_f32_u32_e32 v7, s2
-; GFX9-NEXT:    v_subrev_co_u32_e32 v16, vcc, s12, v9
-; GFX9-NEXT:    v_subbrev_co_u32_e32 v17, vcc, 0, v1, vcc
-; GFX9-NEXT:    v_mul_f32_e32 v1, 0x4f800000, v15
-; GFX9-NEXT:    v_add_f32_e32 v1, v1, v7
-; GFX9-NEXT:    v_rcp_iflag_f32_e32 v1, v1
-; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v12
-; GFX9-NEXT:    v_cndmask_b32_e32 v7, v2, v13, vcc
-; GFX9-NEXT:    v_cndmask_b32_e32 v11, v11, v14, vcc
-; GFX9-NEXT:    v_mul_f32_e32 v1, 0x5f7ffffc, v1
+; GFX9-NEXT:    v_mul_f32_e32 v2, 0x4f800000, v2
+; GFX9-NEXT:    v_add_f32_e32 v2, v2, v16
+; GFX9-NEXT:    v_rcp_iflag_f32_e32 v2, v2
+; GFX9-NEXT:    v_subrev_co_u32_e32 v7, vcc, s12, v10
+; GFX9-NEXT:    v_subbrev_co_u32_e32 v16, vcc, 0, v1, vcc
+; GFX9-NEXT:    v_mul_f32_e32 v1, 0x5f7ffffc, v2
 ; GFX9-NEXT:    v_mul_f32_e32 v2, 0x2f800000, v1
-; GFX9-NEXT:    v_trunc_f32_e32 v13, v2
-; GFX9-NEXT:    v_mul_f32_e32 v2, 0xcf800000, v13
+; GFX9-NEXT:    v_trunc_f32_e32 v17, v2
+; GFX9-NEXT:    v_mul_f32_e32 v2, 0xcf800000, v17
 ; GFX9-NEXT:    v_add_f32_e32 v1, v2, v1
-; GFX9-NEXT:    v_cvt_u32_f32_e32 v14, v1
+; GFX9-NEXT:    v_cvt_u32_f32_e32 v18, v1
 ; GFX9-NEXT:    s_xor_b64 s[12:13], s[0:1], s[6:7]
 ; GFX9-NEXT:    s_sub_u32 s5, 0, s2
-; GFX9-NEXT:    v_cvt_u32_f32_e32 v13, v13
-; GFX9-NEXT:    v_mad_u64_u32 v[1:2], s[0:1], s5, v14, 0
-; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v3
-; GFX9-NEXT:    s_subb_u32 s14, 0, s3
-; GFX9-NEXT:    v_mad_u64_u32 v[2:3], s[0:1], s5, v13, v[2:3]
-; GFX9-NEXT:    v_cndmask_b32_e32 v5, v5, v7, vcc
-; GFX9-NEXT:    v_cndmask_b32_e32 v7, v4, v11, vcc
-; GFX9-NEXT:    v_mad_u64_u32 v[2:3], s[0:1], s14, v14, v[2:3]
-; GFX9-NEXT:    v_mul_lo_u32 v3, v13, v1
-; GFX9-NEXT:    v_mul_hi_u32 v11, v14, v1
-; GFX9-NEXT:    v_mul_lo_u32 v4, v14, v2
-; GFX9-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v12
-; GFX9-NEXT:    v_cndmask_b32_e64 v9, v9, v16, s[0:1]
-; GFX9-NEXT:    v_cndmask_b32_e64 v10, v10, v17, s[0:1]
-; GFX9-NEXT:    v_add_co_u32_e64 v3, s[0:1], v3, v4
-; GFX9-NEXT:    v_cndmask_b32_e64 v4, 0, 1, s[0:1]
-; GFX9-NEXT:    v_add_co_u32_e64 v3, s[0:1], v3, v11
-; GFX9-NEXT:    v_cndmask_b32_e64 v3, 0, 1, s[0:1]
-; GFX9-NEXT:    v_mul_lo_u32 v11, v13, v2
-; GFX9-NEXT:    v_mul_hi_u32 v1, v13, v1
+; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v13
+; GFX9-NEXT:    v_mad_u64_u32 v[1:2], s[0:1], s5, v18, 0
+; GFX9-NEXT:    v_cndmask_b32_e32 v13, v3, v14, vcc
+; GFX9-NEXT:    v_cvt_u32_f32_e32 v14, v17
+; GFX9-NEXT:    s_subb_u32 s20, 0, s3
+; GFX9-NEXT:    v_cndmask_b32_e32 v12, v12, v15, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v7, v10, v7, vcc
+; GFX9-NEXT:    v_mad_u64_u32 v[2:3], s[0:1], s5, v14, v[2:3]
+; GFX9-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v9
+; GFX9-NEXT:    v_cndmask_b32_e64 v9, v4, v12, s[0:1]
+; GFX9-NEXT:    v_mad_u64_u32 v[2:3], s[14:15], s20, v18, v[2:3]
+; GFX9-NEXT:    v_mul_lo_u32 v3, v14, v1
+; GFX9-NEXT:    v_cndmask_b32_e32 v10, v11, v16, vcc
+; GFX9-NEXT:    v_mul_lo_u32 v4, v18, v2
+; GFX9-NEXT:    v_mul_hi_u32 v11, v18, v1
+; GFX9-NEXT:    v_mul_hi_u32 v1, v14, v1
+; GFX9-NEXT:    v_cndmask_b32_e64 v5, v5, v13, s[0:1]
+; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, v3, v4
+; GFX9-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc
+; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, v3, v11
+; GFX9-NEXT:    v_cndmask_b32_e64 v3, 0, 1, vcc
+; GFX9-NEXT:    v_mul_lo_u32 v11, v14, v2
 ; GFX9-NEXT:    v_add_u32_e32 v3, v4, v3
-; GFX9-NEXT:    v_mul_hi_u32 v4, v14, v2
-; GFX9-NEXT:    v_mul_hi_u32 v2, v13, v2
-; GFX9-NEXT:    v_add_co_u32_e64 v1, s[0:1], v11, v1
-; GFX9-NEXT:    v_cndmask_b32_e64 v11, 0, 1, s[0:1]
-; GFX9-NEXT:    v_add_co_u32_e64 v1, s[0:1], v1, v4
-; GFX9-NEXT:    v_cndmask_b32_e64 v4, 0, 1, s[0:1]
-; GFX9-NEXT:    v_add_co_u32_e64 v1, s[0:1], v1, v3
+; GFX9-NEXT:    v_mul_hi_u32 v4, v18, v2
+; GFX9-NEXT:    v_mul_hi_u32 v2, v14, v2
+; GFX9-NEXT:    v_add_co_u32_e32 v1, vcc, v11, v1
+; GFX9-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
+; GFX9-NEXT:    v_add_co_u32_e32 v1, vcc, v1, v4
+; GFX9-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc
+; GFX9-NEXT:    v_add_co_u32_e32 v1, vcc, v1, v3
 ; GFX9-NEXT:    v_add_u32_e32 v4, v11, v4
-; GFX9-NEXT:    v_cndmask_b32_e64 v3, 0, 1, s[0:1]
+; GFX9-NEXT:    v_cndmask_b32_e64 v3, 0, 1, vcc
+; GFX9-NEXT:    v_add_co_u32_e32 v11, vcc, v18, v1
 ; GFX9-NEXT:    v_add3_u32 v2, v4, v3, v2
-; GFX9-NEXT:    v_add_co_u32_e64 v11, s[0:1], v14, v1
-; GFX9-NEXT:    v_addc_co_u32_e64 v12, s[0:1], v13, v2, s[0:1]
-; GFX9-NEXT:    v_mad_u64_u32 v[3:4], s[0:1], s5, v11, 0
-; GFX9-NEXT:    v_cndmask_b32_e32 v8, v8, v9, vcc
-; GFX9-NEXT:    v_xor_b32_e32 v9, s18, v5
+; GFX9-NEXT:    v_mad_u64_u32 v[3:4], s[14:15], s5, v11, 0
+; GFX9-NEXT:    v_addc_co_u32_e32 v12, vcc, v14, v2, vcc
 ; GFX9-NEXT:    v_mov_b32_e32 v1, v4
+; GFX9-NEXT:    v_cndmask_b32_e64 v7, v8, v7, s[0:1]
+; GFX9-NEXT:    v_cndmask_b32_e64 v6, v6, v10, s[0:1]
 ; GFX9-NEXT:    v_mad_u64_u32 v[1:2], s[0:1], s5, v12, v[1:2]
-; GFX9-NEXT:    v_cndmask_b32_e32 v6, v6, v10, vcc
-; GFX9-NEXT:    v_xor_b32_e32 v7, s19, v7
-; GFX9-NEXT:    v_mad_u64_u32 v[4:5], s[0:1], s14, v11, v[1:2]
+; GFX9-NEXT:    v_xor_b32_e32 v8, s18, v5
+; GFX9-NEXT:    v_xor_b32_e32 v9, s19, v9
+; GFX9-NEXT:    v_mad_u64_u32 v[4:5], s[0:1], s20, v11, v[1:2]
 ; GFX9-NEXT:    v_mov_b32_e32 v10, s19
-; GFX9-NEXT:    v_subrev_co_u32_e32 v1, vcc, s18, v9
-; GFX9-NEXT:    v_subb_co_u32_e32 v2, vcc, v7, v10, vcc
-; GFX9-NEXT:    v_xor_b32_e32 v5, s4, v8
+; GFX9-NEXT:    v_subrev_co_u32_e32 v1, vcc, s18, v8
+; GFX9-NEXT:    v_xor_b32_e32 v5, s4, v7
 ; GFX9-NEXT:    v_mul_lo_u32 v7, v12, v3
 ; GFX9-NEXT:    v_mul_lo_u32 v8, v11, v4
+; GFX9-NEXT:    v_subb_co_u32_e32 v2, vcc, v9, v10, vcc
 ; GFX9-NEXT:    v_mul_hi_u32 v9, v11, v3
-; GFX9-NEXT:    v_mul_hi_u32 v3, v12, v3
-; GFX9-NEXT:    v_xor_b32_e32 v6, s4, v6
 ; GFX9-NEXT:    v_add_co_u32_e32 v7, vcc, v7, v8
 ; GFX9-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
 ; GFX9-NEXT:    v_add_co_u32_e32 v7, vcc, v7, v9
 ; GFX9-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
 ; GFX9-NEXT:    v_mul_lo_u32 v9, v12, v4
+; GFX9-NEXT:    v_mul_hi_u32 v3, v12, v3
 ; GFX9-NEXT:    v_add_u32_e32 v7, v8, v7
 ; GFX9-NEXT:    v_mul_hi_u32 v8, v11, v4
 ; GFX9-NEXT:    v_mul_hi_u32 v4, v12, v4
@@ -1824,13 +1818,14 @@ define amdgpu_kernel void @sdivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1
 ; GFX9-NEXT:    v_mul_lo_u32 v10, s13, v4
 ; GFX9-NEXT:    v_add_u32_e32 v7, v8, v7
 ; GFX9-NEXT:    v_mul_hi_u32 v8, s12, v4
-; GFX9-NEXT:    v_mov_b32_e32 v9, s4
+; GFX9-NEXT:    v_xor_b32_e32 v6, s4, v6
 ; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, v10, v3
 ; GFX9-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
 ; GFX9-NEXT:    v_add_co_u32_e32 v3, vcc, v3, v8
 ; GFX9-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
 ; GFX9-NEXT:    v_add_co_u32_e32 v11, vcc, v3, v7
 ; GFX9-NEXT:    v_mad_u64_u32 v[3:4], s[0:1], s2, v11, 0
+; GFX9-NEXT:    v_mov_b32_e32 v9, s4
 ; GFX9-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
 ; GFX9-NEXT:    v_subrev_co_u32_e32 v5, vcc, s4, v5
 ; GFX9-NEXT:    v_add_u32_e32 v8, v10, v8
@@ -1994,170 +1989,169 @@ define amdgpu_kernel void @sdivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1
 ; GFX10-NEXT:    v_add_nc_u32_e32 v10, v13, v1
 ; GFX10-NEXT:    v_mad_u64_u32 v[0:1], s23, s20, v6, 0
 ; GFX10-NEXT:    v_add_co_u32 v7, s23, v9, v8
+; GFX10-NEXT:    v_cndmask_b32_e64 v8, 0, 1, s23
 ; GFX10-NEXT:    v_mul_lo_u32 v9, s21, v6
 ; GFX10-NEXT:    v_mul_lo_u32 v11, s20, v5
-; GFX10-NEXT:    v_cndmask_b32_e64 v8, 0, 1, s23
 ; GFX10-NEXT:    v_add_co_u32 v4, vcc_lo, v4, v7
 ; GFX10-NEXT:    v_add3_u32 v2, v10, v8, v2
 ; GFX10-NEXT:    v_mul_lo_u32 v8, v5, v0
-; GFX10-NEXT:    v_add3_u32 v7, v1, v11, v9
 ; GFX10-NEXT:    v_mul_hi_u32 v10, v6, v0
 ; GFX10-NEXT:    v_mul_hi_u32 v0, v5, v0
+; GFX10-NEXT:    v_add3_u32 v7, v1, v11, v9
 ; GFX10-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, v3, v2, vcc_lo
-; GFX10-NEXT:    v_mul_lo_u32 v12, v6, v7
 ; GFX10-NEXT:    v_mad_u64_u32 v[1:2], s20, s5, v4, 0
+; GFX10-NEXT:    v_mul_lo_u32 v12, v6, v7
 ; GFX10-NEXT:    v_mul_lo_u32 v9, s22, v4
 ; GFX10-NEXT:    v_mul_lo_u32 v11, s5, v3
 ; GFX10-NEXT:    v_mul_lo_u32 v13, v5, v7
 ; GFX10-NEXT:    v_mul_hi_u32 v14, v6, v7
 ; GFX10-NEXT:    v_mul_hi_u32 v7, v5, v7
-; GFX10-NEXT:    v_add_co_u32 v8, s5, v8, v12
 ; GFX10-NEXT:    v_mul_lo_u32 v15, v3, v1
 ; GFX10-NEXT:    v_mul_hi_u32 v16, v4, v1
+; GFX10-NEXT:    v_add_co_u32 v8, s5, v8, v12
 ; GFX10-NEXT:    v_add3_u32 v2, v2, v11, v9
 ; GFX10-NEXT:    v_cndmask_b32_e64 v9, 0, 1, s5
 ; GFX10-NEXT:    v_add_co_u32 v0, s5, v13, v0
 ; GFX10-NEXT:    v_cndmask_b32_e64 v11, 0, 1, s5
 ; GFX10-NEXT:    v_add_co_u32 v8, s5, v8, v10
+; GFX10-NEXT:    v_mul_lo_u32 v12, v4, v2
 ; GFX10-NEXT:    v_cndmask_b32_e64 v8, 0, 1, s5
 ; GFX10-NEXT:    v_add_co_u32 v0, s5, v0, v14
 ; GFX10-NEXT:    v_cndmask_b32_e64 v10, 0, 1, s5
-; GFX10-NEXT:    v_mul_lo_u32 v12, v4, v2
-; GFX10-NEXT:    v_add_nc_u32_e32 v8, v9, v8
 ; GFX10-NEXT:    v_mul_hi_u32 v1, v3, v1
+; GFX10-NEXT:    v_add_nc_u32_e32 v8, v9, v8
 ; GFX10-NEXT:    v_mul_lo_u32 v13, v3, v2
-; GFX10-NEXT:    v_add_nc_u32_e32 v10, v11, v10
-; GFX10-NEXT:    v_mul_hi_u32 v9, v4, v2
+; GFX10-NEXT:    v_mul_hi_u32 v14, v4, v2
+; GFX10-NEXT:    v_add_nc_u32_e32 v9, v11, v10
+; GFX10-NEXT:    v_add_co_u32 v10, s5, v15, v12
+; GFX10-NEXT:    v_cndmask_b32_e64 v11, 0, 1, s5
 ; GFX10-NEXT:    v_add_co_u32 v0, s5, v0, v8
 ; GFX10-NEXT:    v_cndmask_b32_e64 v8, 0, 1, s5
-; GFX10-NEXT:    v_add_co_u32 v11, s5, v15, v12
-; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, v6, v0
-; GFX10-NEXT:    v_add3_u32 v7, v10, v8, v7
-; GFX10-NEXT:    v_cndmask_b32_e64 v12, 0, 1, s5
 ; GFX10-NEXT:    v_add_co_u32 v1, s5, v13, v1
-; GFX10-NEXT:    v_cndmask_b32_e64 v13, 0, 1, s5
-; GFX10-NEXT:    v_add_co_ci_u32_e32 v5, vcc_lo, v5, v7, vcc_lo
-; GFX10-NEXT:    v_add_co_u32 v8, s5, v11, v16
+; GFX10-NEXT:    v_cndmask_b32_e64 v12, 0, 1, s5
+; GFX10-NEXT:    v_add_co_u32 v10, s5, v10, v16
+; GFX10-NEXT:    v_add3_u32 v7, v9, v8, v7
+; GFX10-NEXT:    v_cndmask_b32_e64 v10, 0, 1, s5
+; GFX10-NEXT:    v_add_co_u32 v1, s5, v1, v14
+; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, v6, v0
 ; GFX10-NEXT:    v_cndmask_b32_e64 v8, 0, 1, s5
-; GFX10-NEXT:    v_add_co_u32 v1, s5, v1, v9
-; GFX10-NEXT:    v_mul_lo_u32 v7, s1, v0
-; GFX10-NEXT:    v_mul_lo_u32 v9, s0, v5
-; GFX10-NEXT:    v_mul_hi_u32 v10, s1, v0
-; GFX10-NEXT:    v_mul_hi_u32 v0, s0, v0
+; GFX10-NEXT:    v_add_co_ci_u32_e32 v5, vcc_lo, v5, v7, vcc_lo
+; GFX10-NEXT:    v_add_nc_u32_e32 v6, v11, v10
+; GFX10-NEXT:    v_mul_hi_u32 v9, s0, v0
+; GFX10-NEXT:    v_add_nc_u32_e32 v7, v12, v8
+; GFX10-NEXT:    v_mul_lo_u32 v8, s1, v0
+; GFX10-NEXT:    v_mul_lo_u32 v10, s0, v5
+; GFX10-NEXT:    v_mul_hi_u32 v0, s1, v0
 ; GFX10-NEXT:    v_mul_lo_u32 v11, s1, v5
+; GFX10-NEXT:    v_add_co_u32 v1, s5, v1, v6
 ; GFX10-NEXT:    v_cndmask_b32_e64 v6, 0, 1, s5
-; GFX10-NEXT:    v_add_nc_u32_e32 v8, v12, v8
 ; GFX10-NEXT:    v_mul_hi_u32 v12, s0, v5
-; GFX10-NEXT:    v_mul_hi_u32 v5, s1, v5
-; GFX10-NEXT:    v_add_co_u32 v7, s5, v7, v9
-; GFX10-NEXT:    v_cndmask_b32_e64 v9, 0, 1, s5
-; GFX10-NEXT:    v_add_co_u32 v10, s5, v11, v10
-; GFX10-NEXT:    v_add_co_u32 v0, s20, v7, v0
-; GFX10-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s20
-; GFX10-NEXT:    v_cndmask_b32_e64 v7, 0, 1, s5
-; GFX10-NEXT:    v_add_co_u32 v10, s5, v10, v12
+; GFX10-NEXT:    v_mul_hi_u32 v2, v3, v2
+; GFX10-NEXT:    v_add_co_u32 v8, s5, v8, v10
+; GFX10-NEXT:    v_cndmask_b32_e64 v10, 0, 1, s5
+; GFX10-NEXT:    v_add_co_u32 v0, s5, v11, v0
 ; GFX10-NEXT:    v_cndmask_b32_e64 v11, 0, 1, s5
-; GFX10-NEXT:    v_add_nc_u32_e32 v0, v9, v0
-; GFX10-NEXT:    v_add_co_u32 v8, s5, v1, v8
-; GFX10-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s5
-; GFX10-NEXT:    v_add_nc_u32_e32 v7, v7, v11
-; GFX10-NEXT:    v_add_co_u32 v9, s5, v10, v0
+; GFX10-NEXT:    v_add_co_u32 v8, s5, v8, v9
+; GFX10-NEXT:    v_cndmask_b32_e64 v8, 0, 1, s5
+; GFX10-NEXT:    v_add_co_u32 v0, s5, v0, v12
+; GFX10-NEXT:    v_cndmask_b32_e64 v9, 0, 1, s5
+; GFX10-NEXT:    v_mul_hi_u32 v5, s1, v5
+; GFX10-NEXT:    v_add_nc_u32_e32 v8, v10, v8
+; GFX10-NEXT:    v_add3_u32 v2, v7, v6, v2
+; GFX10-NEXT:    v_add_co_u32 v4, vcc_lo, v4, v1
+; GFX10-NEXT:    v_add_nc_u32_e32 v6, v11, v9
+; GFX10-NEXT:    v_add_co_u32 v7, s5, v0, v8
 ; GFX10-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s5
-; GFX10-NEXT:    v_mul_hi_u32 v2, v3, v2
-; GFX10-NEXT:    v_add_nc_u32_e32 v6, v13, v6
-; GFX10-NEXT:    v_add_co_u32 v4, vcc_lo, v4, v8
-; GFX10-NEXT:    v_add3_u32 v5, v7, v0, v5
-; GFX10-NEXT:    v_mul_hi_u32 v8, s14, v4
-; GFX10-NEXT:    v_add3_u32 v2, v6, v1, v2
-; GFX10-NEXT:    v_mad_u64_u32 v[0:1], s5, s6, v9, 0
-; GFX10-NEXT:    v_mul_lo_u32 v6, s7, v9
-; GFX10-NEXT:    v_mul_lo_u32 v7, s6, v5
 ; GFX10-NEXT:    v_add_co_ci_u32_e32 v2, vcc_lo, v3, v2, vcc_lo
+; GFX10-NEXT:    v_mul_lo_u32 v8, s7, v7
 ; GFX10-NEXT:    v_mul_lo_u32 v3, s15, v4
+; GFX10-NEXT:    v_add3_u32 v5, v6, v0, v5
+; GFX10-NEXT:    v_mad_u64_u32 v[0:1], s5, s6, v7, 0
+; GFX10-NEXT:    v_mul_lo_u32 v6, s14, v2
+; GFX10-NEXT:    v_mul_hi_u32 v10, s14, v4
+; GFX10-NEXT:    v_mul_lo_u32 v9, s6, v5
 ; GFX10-NEXT:    v_mul_hi_u32 v4, s15, v4
-; GFX10-NEXT:    v_mul_lo_u32 v10, s14, v2
 ; GFX10-NEXT:    v_mul_lo_u32 v11, s15, v2
-; GFX10-NEXT:    v_add3_u32 v1, v1, v7, v6
-; GFX10-NEXT:    v_add_co_u32 v6, vcc_lo, v9, 1
-; GFX10-NEXT:    v_add_co_ci_u32_e32 v7, vcc_lo, 0, v5, vcc_lo
-; GFX10-NEXT:    v_sub_nc_u32_e32 v12, s1, v1
-; GFX10-NEXT:    v_sub_co_u32 v13, vcc_lo, s0, v0
-; GFX10-NEXT:    v_sub_co_ci_u32_e64 v14, s0, s1, v1, vcc_lo
-; GFX10-NEXT:    v_subrev_co_ci_u32_e32 v0, vcc_lo, s7, v12, vcc_lo
-; GFX10-NEXT:    v_cmp_le_u32_e32 vcc_lo, s6, v13
-; GFX10-NEXT:    v_cndmask_b32_e64 v1, 0, -1, vcc_lo
-; GFX10-NEXT:    v_sub_co_u32 v12, vcc_lo, v13, s6
-; GFX10-NEXT:    v_subrev_co_ci_u32_e64 v15, s0, 0, v0, vcc_lo
-; GFX10-NEXT:    v_cmp_le_u32_e64 s0, s7, v14
-; GFX10-NEXT:    v_subrev_co_ci_u32_e32 v0, vcc_lo, s7, v0, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e64 v16, 0, -1, s0
-; GFX10-NEXT:    v_cmp_le_u32_e64 s0, s6, v12
-; GFX10-NEXT:    v_cndmask_b32_e64 v17, 0, -1, s0
+; GFX10-NEXT:    v_add_co_u32 v3, s5, v3, v6
+; GFX10-NEXT:    v_add3_u32 v1, v1, v9, v8
+; GFX10-NEXT:    v_sub_co_u32 v6, vcc_lo, s0, v0
+; GFX10-NEXT:    v_add_co_u32 v0, s0, v7, 1
+; GFX10-NEXT:    v_sub_nc_u32_e32 v9, s1, v1
+; GFX10-NEXT:    v_add_co_ci_u32_e64 v8, s0, 0, v5, s0
+; GFX10-NEXT:    v_sub_co_ci_u32_e64 v12, s0, s1, v1, vcc_lo
+; GFX10-NEXT:    v_subrev_co_ci_u32_e32 v9, vcc_lo, s7, v9, vcc_lo
+; GFX10-NEXT:    v_cmp_le_u32_e64 s0, s6, v6
+; GFX10-NEXT:    v_cmp_le_u32_e32 vcc_lo, s7, v12
+; GFX10-NEXT:    v_cndmask_b32_e64 v17, 0, 1, s5
+; GFX10-NEXT:    v_add_co_u32 v4, s1, v11, v4
+; GFX10-NEXT:    v_cndmask_b32_e64 v1, 0, -1, s0
+; GFX10-NEXT:    v_cndmask_b32_e64 v13, 0, -1, vcc_lo
+; GFX10-NEXT:    v_sub_co_u32 v14, vcc_lo, v6, s6
+; GFX10-NEXT:    v_subrev_co_ci_u32_e64 v15, s0, 0, v9, vcc_lo
+; GFX10-NEXT:    v_cmp_eq_u32_e64 s0, s7, v12
+; GFX10-NEXT:    v_subrev_co_ci_u32_e32 v9, vcc_lo, s7, v9, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e64 v1, v13, v1, s0
+; GFX10-NEXT:    v_cmp_le_u32_e64 s0, s6, v14
+; GFX10-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v1
+; GFX10-NEXT:    v_cndmask_b32_e64 v13, 0, -1, s0
 ; GFX10-NEXT:    v_cmp_le_u32_e64 s0, s7, v15
-; GFX10-NEXT:    v_cndmask_b32_e64 v18, 0, -1, s0
-; GFX10-NEXT:    v_add_co_u32 v19, s0, v6, 1
-; GFX10-NEXT:    v_add_co_ci_u32_e64 v20, s0, 0, v7, s0
-; GFX10-NEXT:    v_cmp_eq_u32_e64 s0, s7, v14
-; GFX10-NEXT:    v_cndmask_b32_e64 v16, v16, v1, s0
-; GFX10-NEXT:    v_cmp_eq_u32_e64 s0, s7, v15
-; GFX10-NEXT:    v_cndmask_b32_e64 v17, v18, v17, s0
-; GFX10-NEXT:    v_add_co_u32 v1, s0, v3, v10
+; GFX10-NEXT:    v_cndmask_b32_e64 v16, 0, -1, s0
+; GFX10-NEXT:    v_add_co_u32 v3, s0, v3, v10
 ; GFX10-NEXT:    v_mul_hi_u32 v10, s14, v2
 ; GFX10-NEXT:    v_cndmask_b32_e64 v3, 0, 1, s0
-; GFX10-NEXT:    v_add_co_u32 v4, s0, v11, v4
-; GFX10-NEXT:    v_add_co_u32 v1, s1, v1, v8
-; GFX10-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s1
-; GFX10-NEXT:    v_cndmask_b32_e64 v8, 0, 1, s0
+; GFX10-NEXT:    v_cmp_eq_u32_e64 s0, s7, v15
+; GFX10-NEXT:    v_mul_hi_u32 v2, s15, v2
+; GFX10-NEXT:    v_add_nc_u32_e32 v3, v17, v3
+; GFX10-NEXT:    v_cndmask_b32_e64 v11, v16, v13, s0
 ; GFX10-NEXT:    v_add_co_u32 v4, s0, v4, v10
+; GFX10-NEXT:    v_cndmask_b32_e64 v13, 0, 1, s1
 ; GFX10-NEXT:    v_cndmask_b32_e64 v10, 0, 1, s0
-; GFX10-NEXT:    v_add_nc_u32_e32 v1, v3, v1
-; GFX10-NEXT:    v_mul_hi_u32 v2, s15, v2
-; GFX10-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v17
-; GFX10-NEXT:    v_add_nc_u32_e32 v3, v8, v10
-; GFX10-NEXT:    v_add_co_u32 v4, s0, v4, v1
-; GFX10-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s0
-; GFX10-NEXT:    v_sub_co_u32 v8, s0, v12, s6
-; GFX10-NEXT:    v_subrev_co_ci_u32_e64 v10, s0, 0, v0, s0
-; GFX10-NEXT:    v_add3_u32 v2, v3, v1, v2
-; GFX10-NEXT:    v_cndmask_b32_e32 v3, v6, v19, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e32 v6, v7, v20, vcc_lo
-; GFX10-NEXT:    v_mad_u64_u32 v[0:1], s0, s2, v4, 0
-; GFX10-NEXT:    v_mul_lo_u32 v7, s2, v2
-; GFX10-NEXT:    v_mul_lo_u32 v11, s3, v4
-; GFX10-NEXT:    v_cmp_ne_u32_e64 s0, 0, v17
-; GFX10-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v16
-; GFX10-NEXT:    v_mov_b32_e32 v16, 0
-; GFX10-NEXT:    v_cndmask_b32_e64 v8, v12, v8, s0
-; GFX10-NEXT:    v_cndmask_b32_e32 v5, v5, v6, vcc_lo
-; GFX10-NEXT:    v_add3_u32 v1, v1, v7, v11
-; GFX10-NEXT:    v_cndmask_b32_e64 v6, v15, v10, s0
-; GFX10-NEXT:    v_cndmask_b32_e32 v3, v9, v3, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e32 v7, v13, v8, vcc_lo
+; GFX10-NEXT:    v_add_co_u32 v16, s0, v0, 1
+; GFX10-NEXT:    v_add_co_ci_u32_e64 v17, s0, 0, v8, s0
+; GFX10-NEXT:    v_add_co_u32 v3, s0, v4, v3
+; GFX10-NEXT:    v_add_nc_u32_e32 v10, v13, v10
+; GFX10-NEXT:    v_cndmask_b32_e64 v4, 0, 1, s0
+; GFX10-NEXT:    v_cmp_ne_u32_e64 s0, 0, v11
+; GFX10-NEXT:    v_mul_lo_u32 v13, s3, v3
+; GFX10-NEXT:    v_add3_u32 v2, v10, v4, v2
+; GFX10-NEXT:    v_cndmask_b32_e64 v11, v0, v16, s0
+; GFX10-NEXT:    v_cndmask_b32_e64 v4, v8, v17, s0
+; GFX10-NEXT:    v_mad_u64_u32 v[0:1], s1, s2, v3, 0
+; GFX10-NEXT:    v_mul_lo_u32 v8, s2, v2
+; GFX10-NEXT:    v_sub_co_u32 v10, s1, v14, s6
+; GFX10-NEXT:    v_subrev_co_ci_u32_e64 v9, s1, 0, v9, s1
+; GFX10-NEXT:    v_cndmask_b32_e32 v4, v5, v4, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e64 v10, v14, v10, s0
+; GFX10-NEXT:    v_cndmask_b32_e32 v7, v7, v11, vcc_lo
+; GFX10-NEXT:    v_add3_u32 v1, v1, v8, v13
+; GFX10-NEXT:    v_cndmask_b32_e64 v5, v15, v9, s0
 ; GFX10-NEXT:    v_sub_co_u32 v8, s0, s14, v0
+; GFX10-NEXT:    v_cndmask_b32_e32 v6, v6, v10, vcc_lo
 ; GFX10-NEXT:    v_sub_co_ci_u32_e64 v9, s1, s15, v1, s0
-; GFX10-NEXT:    v_cndmask_b32_e32 v6, v14, v6, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v5, v12, v5, vcc_lo
 ; GFX10-NEXT:    v_sub_nc_u32_e32 v1, s15, v1
-; GFX10-NEXT:    v_xor_b32_e32 v0, s18, v3
+; GFX10-NEXT:    v_xor_b32_e32 v0, s18, v7
 ; GFX10-NEXT:    v_cmp_le_u32_e32 vcc_lo, s3, v9
-; GFX10-NEXT:    v_xor_b32_e32 v3, s19, v5
-; GFX10-NEXT:    v_xor_b32_e32 v6, s4, v6
-; GFX10-NEXT:    v_cndmask_b32_e64 v5, 0, -1, vcc_lo
+; GFX10-NEXT:    v_xor_b32_e32 v4, s19, v4
+; GFX10-NEXT:    v_xor_b32_e32 v5, s4, v5
+; GFX10-NEXT:    v_mov_b32_e32 v16, 0
+; GFX10-NEXT:    v_cndmask_b32_e64 v7, 0, -1, vcc_lo
 ; GFX10-NEXT:    v_subrev_co_ci_u32_e64 v10, vcc_lo, s3, v1, s0
 ; GFX10-NEXT:    v_cmp_le_u32_e32 vcc_lo, s2, v8
 ; GFX10-NEXT:    v_cndmask_b32_e64 v11, 0, -1, vcc_lo
 ; GFX10-NEXT:    v_sub_co_u32 v12, vcc_lo, v8, s2
 ; GFX10-NEXT:    v_subrev_co_ci_u32_e64 v13, s0, 0, v10, vcc_lo
 ; GFX10-NEXT:    v_sub_co_u32 v0, s0, v0, s18
-; GFX10-NEXT:    v_subrev_co_ci_u32_e64 v1, s0, s19, v3, s0
+; GFX10-NEXT:    v_subrev_co_ci_u32_e64 v1, s0, s19, v4, s0
 ; GFX10-NEXT:    v_cmp_eq_u32_e64 s0, s3, v9
-; GFX10-NEXT:    v_xor_b32_e32 v3, s4, v7
+; GFX10-NEXT:    v_xor_b32_e32 v4, s4, v6
 ; GFX10-NEXT:    v_subrev_co_ci_u32_e32 v10, vcc_lo, s3, v10, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e64 v5, v5, v11, s0
+; GFX10-NEXT:    v_cndmask_b32_e64 v6, v7, v11, s0
 ; GFX10-NEXT:    v_cmp_le_u32_e64 s0, s3, v13
 ; GFX10-NEXT:    v_cndmask_b32_e64 v7, 0, -1, s0
 ; GFX10-NEXT:    v_cmp_le_u32_e64 s0, s2, v12
 ; GFX10-NEXT:    v_cndmask_b32_e64 v11, 0, -1, s0
-; GFX10-NEXT:    v_add_co_u32 v14, s0, v4, 1
+; GFX10-NEXT:    v_add_co_u32 v14, s0, v3, 1
 ; GFX10-NEXT:    v_add_co_ci_u32_e64 v15, s0, 0, v2, s0
 ; GFX10-NEXT:    v_cmp_eq_u32_e64 s0, s3, v13
 ; GFX10-NEXT:    v_cndmask_b32_e64 v7, v7, v11, s0
@@ -2167,24 +2161,24 @@ define amdgpu_kernel void @sdivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1
 ; GFX10-NEXT:    v_sub_co_u32 v7, s0, v12, s2
 ; GFX10-NEXT:    v_subrev_co_ci_u32_e64 v10, s0, 0, v10, s0
 ; GFX10-NEXT:    v_cndmask_b32_e32 v11, v14, v11, vcc_lo
-; GFX10-NEXT:    v_cmp_ne_u32_e64 s0, 0, v5
+; GFX10-NEXT:    v_cmp_ne_u32_e64 s0, 0, v6
 ; GFX10-NEXT:    v_cndmask_b32_e32 v14, v15, v17, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e32 v5, v12, v7, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v6, v12, v7, vcc_lo
 ; GFX10-NEXT:    v_cndmask_b32_e32 v7, v13, v10, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e64 v10, v4, v11, s0
+; GFX10-NEXT:    v_sub_co_u32 v4, vcc_lo, v4, s4
+; GFX10-NEXT:    v_cndmask_b32_e64 v3, v3, v11, s0
 ; GFX10-NEXT:    v_cndmask_b32_e64 v2, v2, v14, s0
-; GFX10-NEXT:    v_cndmask_b32_e64 v8, v8, v5, s0
+; GFX10-NEXT:    v_cndmask_b32_e64 v6, v8, v6, s0
 ; GFX10-NEXT:    v_cndmask_b32_e64 v7, v9, v7, s0
 ; GFX10-NEXT:    s_xor_b64 s[0:1], s[12:13], s[16:17]
-; GFX10-NEXT:    v_sub_co_u32 v4, vcc_lo, v3, s4
-; GFX10-NEXT:    v_xor_b32_e32 v3, s0, v10
-; GFX10-NEXT:    v_subrev_co_ci_u32_e32 v5, vcc_lo, s4, v6, vcc_lo
-; GFX10-NEXT:    v_xor_b32_e32 v6, s1, v2
-; GFX10-NEXT:    v_xor_b32_e32 v8, s12, v8
+; GFX10-NEXT:    v_subrev_co_ci_u32_e32 v5, vcc_lo, s4, v5, vcc_lo
+; GFX10-NEXT:    v_xor_b32_e32 v3, s0, v3
+; GFX10-NEXT:    v_xor_b32_e32 v8, s1, v2
+; GFX10-NEXT:    v_xor_b32_e32 v6, s12, v6
 ; GFX10-NEXT:    v_xor_b32_e32 v7, s12, v7
 ; GFX10-NEXT:    v_sub_co_u32 v2, vcc_lo, v3, s0
-; GFX10-NEXT:    v_subrev_co_ci_u32_e32 v3, vcc_lo, s1, v6, vcc_lo
-; GFX10-NEXT:    v_sub_co_u32 v6, vcc_lo, v8, s12
+; GFX10-NEXT:    v_subrev_co_ci_u32_e32 v3, vcc_lo, s1, v8, vcc_lo
+; GFX10-NEXT:    v_sub_co_u32 v6, vcc_lo, v6, s12
 ; GFX10-NEXT:    v_subrev_co_ci_u32_e32 v7, vcc_lo, s12, v7, vcc_lo
 ; GFX10-NEXT:    global_store_dwordx4 v16, v[0:3], s[8:9]
 ; GFX10-NEXT:    global_store_dwordx4 v16, v[4:7], s[10:11]
@@ -2520,17 +2514,16 @@ define amdgpu_kernel void @sdivrem_v2i8(ptr addrspace(1) %out0, ptr addrspace(1)
 ; GFX10-NEXT:    s_load_dword s0, s[4:5], 0x10
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT:    s_bfe_i32 s1, s0, 0x80018
-; GFX10-NEXT:    s_bfe_i32 s2, s0, 0x80010
-; GFX10-NEXT:    s_ashr_i32 s3, s1, 31
-; GFX10-NEXT:    s_ashr_i32 s8, s2, 31
-; GFX10-NEXT:    s_add_i32 s1, s1, s3
-; GFX10-NEXT:    s_add_i32 s2, s2, s8
-; GFX10-NEXT:    s_xor_b32 s1, s1, s3
-; GFX10-NEXT:    s_xor_b32 s2, s2, s8
+; GFX10-NEXT:    s_bfe_i32 s3, s0, 0x80010
+; GFX10-NEXT:    s_ashr_i32 s2, s1, 31
+; GFX10-NEXT:    s_ashr_i32 s8, s3, 31
+; GFX10-NEXT:    s_add_i32 s1, s1, s2
+; GFX10-NEXT:    s_add_i32 s3, s3, s8
+; GFX10-NEXT:    s_xor_b32 s1, s1, s2
+; GFX10-NEXT:    s_xor_b32 s3, s3, s8
 ; GFX10-NEXT:    v_cvt_f32_u32_e32 v0, s1
-; GFX10-NEXT:    v_cvt_f32_u32_e32 v1, s2
+; GFX10-NEXT:    v_cvt_f32_u32_e32 v1, s3
 ; GFX10-NEXT:    s_sub_i32 s6, 0, s1
-; GFX10-NEXT:    s_sub_i32 s7, 0, s2
 ; GFX10-NEXT:    v_rcp_iflag_f32_e32 v0, v0
 ; GFX10-NEXT:    v_rcp_iflag_f32_e32 v1, v1
 ; GFX10-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
@@ -2538,59 +2531,60 @@ define amdgpu_kernel void @sdivrem_v2i8(ptr addrspace(1) %out0, ptr addrspace(1)
 ; GFX10-NEXT:    v_cvt_u32_f32_e32 v0, v0
 ; GFX10-NEXT:    v_cvt_u32_f32_e32 v1, v1
 ; GFX10-NEXT:    v_mul_lo_u32 v2, s6, v0
-; GFX10-NEXT:    v_mul_lo_u32 v3, s7, v1
-; GFX10-NEXT:    s_sext_i32_i8 s6, s0
-; GFX10-NEXT:    s_bfe_i32 s0, s0, 0x80008
+; GFX10-NEXT:    s_sub_i32 s6, 0, s3
+; GFX10-NEXT:    v_mul_lo_u32 v3, s6, v1
+; GFX10-NEXT:    s_bfe_i32 s6, s0, 0x80008
+; GFX10-NEXT:    s_sext_i32_i8 s0, s0
 ; GFX10-NEXT:    s_ashr_i32 s9, s6, 31
 ; GFX10-NEXT:    s_ashr_i32 s10, s0, 31
+; GFX10-NEXT:    v_mul_hi_u32 v2, v0, v2
 ; GFX10-NEXT:    s_add_i32 s6, s6, s9
 ; GFX10-NEXT:    s_add_i32 s0, s0, s10
-; GFX10-NEXT:    v_mul_hi_u32 v2, v0, v2
 ; GFX10-NEXT:    v_mul_hi_u32 v3, v1, v3
-; GFX10-NEXT:    s_xor_b32 s0, s0, s10
 ; GFX10-NEXT:    s_xor_b32 s6, s6, s9
+; GFX10-NEXT:    s_xor_b32 s0, s0, s10
 ; GFX10-NEXT:    v_add_nc_u32_e32 v0, v0, v2
 ; GFX10-NEXT:    v_add_nc_u32_e32 v1, v1, v3
-; GFX10-NEXT:    v_mul_hi_u32 v0, s0, v0
-; GFX10-NEXT:    v_mul_hi_u32 v1, s6, v1
+; GFX10-NEXT:    v_mul_hi_u32 v0, s6, v0
+; GFX10-NEXT:    v_mul_hi_u32 v1, s0, v1
 ; GFX10-NEXT:    v_mul_lo_u32 v2, v0, s1
-; GFX10-NEXT:    v_mul_lo_u32 v3, v1, s2
-; GFX10-NEXT:    v_add_nc_u32_e32 v5, 1, v0
-; GFX10-NEXT:    v_add_nc_u32_e32 v4, 1, v1
-; GFX10-NEXT:    v_sub_nc_u32_e32 v2, s0, v2
-; GFX10-NEXT:    v_sub_nc_u32_e32 v3, s6, v3
+; GFX10-NEXT:    v_add_nc_u32_e32 v4, 1, v0
+; GFX10-NEXT:    v_mul_lo_u32 v3, v1, s3
+; GFX10-NEXT:    v_add_nc_u32_e32 v6, 1, v1
+; GFX10-NEXT:    v_sub_nc_u32_e32 v2, s6, v2
 ; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x0
+; GFX10-NEXT:    v_sub_nc_u32_e32 v3, s0, v3
+; GFX10-NEXT:    v_subrev_nc_u32_e32 v5, s1, v2
 ; GFX10-NEXT:    v_cmp_le_u32_e32 vcc_lo, s1, v2
-; GFX10-NEXT:    v_subrev_nc_u32_e32 v6, s1, v2
-; GFX10-NEXT:    v_subrev_nc_u32_e32 v7, s2, v3
-; GFX10-NEXT:    v_cmp_le_u32_e64 s0, s2, v3
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v5, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e32 v2, v2, v6, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e64 v1, v1, v4, s0
+; GFX10-NEXT:    v_cmp_le_u32_e64 s0, s3, v3
+; GFX10-NEXT:    v_subrev_nc_u32_e32 v7, s3, v3
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v2, v2, v5, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e64 v1, v1, v6, s0
 ; GFX10-NEXT:    v_cndmask_b32_e64 v3, v3, v7, s0
-; GFX10-NEXT:    v_add_nc_u32_e32 v5, 1, v0
+; GFX10-NEXT:    v_add_nc_u32_e32 v4, 1, v0
 ; GFX10-NEXT:    v_cmp_le_u32_e32 vcc_lo, s1, v2
-; GFX10-NEXT:    v_subrev_nc_u32_e32 v6, s1, v2
-; GFX10-NEXT:    v_add_nc_u32_e32 v4, 1, v1
-; GFX10-NEXT:    v_cmp_le_u32_e64 s0, s2, v3
-; GFX10-NEXT:    v_subrev_nc_u32_e32 v7, s2, v3
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v5, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e32 v2, v2, v6, vcc_lo
-; GFX10-NEXT:    s_xor_b32 s1, s10, s3
-; GFX10-NEXT:    v_cndmask_b32_e64 v1, v1, v4, s0
+; GFX10-NEXT:    v_subrev_nc_u32_e32 v5, s1, v2
+; GFX10-NEXT:    v_add_nc_u32_e32 v6, 1, v1
+; GFX10-NEXT:    v_cmp_le_u32_e64 s0, s3, v3
+; GFX10-NEXT:    v_subrev_nc_u32_e32 v7, s3, v3
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v2, v2, v5, vcc_lo
+; GFX10-NEXT:    s_xor_b32 s1, s9, s2
+; GFX10-NEXT:    v_cndmask_b32_e64 v1, v1, v6, s0
 ; GFX10-NEXT:    v_cndmask_b32_e64 v3, v3, v7, s0
 ; GFX10-NEXT:    v_xor_b32_e32 v0, s1, v0
-; GFX10-NEXT:    v_xor_b32_e32 v2, s10, v2
-; GFX10-NEXT:    s_xor_b32 s0, s9, s8
+; GFX10-NEXT:    v_xor_b32_e32 v2, s9, v2
+; GFX10-NEXT:    s_xor_b32 s0, s10, s8
+; GFX10-NEXT:    v_mov_b32_e32 v4, 0xff
 ; GFX10-NEXT:    v_xor_b32_e32 v1, s0, v1
 ; GFX10-NEXT:    v_subrev_nc_u32_e32 v0, s1, v0
-; GFX10-NEXT:    v_xor_b32_e32 v3, s9, v3
-; GFX10-NEXT:    v_subrev_nc_u32_e32 v2, s10, v2
-; GFX10-NEXT:    s_movk_i32 s1, 0xff
+; GFX10-NEXT:    v_xor_b32_e32 v3, s10, v3
+; GFX10-NEXT:    v_subrev_nc_u32_e32 v2, s9, v2
 ; GFX10-NEXT:    v_subrev_nc_u32_e32 v1, s0, v1
-; GFX10-NEXT:    v_and_b32_sdwa v0, v0, s1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX10-NEXT:    v_subrev_nc_u32_e32 v3, s9, v3
-; GFX10-NEXT:    v_and_b32_sdwa v2, v2, s1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX10-NEXT:    v_and_b32_sdwa v0, v0, v4 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX10-NEXT:    v_subrev_nc_u32_e32 v3, s10, v3
+; GFX10-NEXT:    v_and_b32_sdwa v2, v2, v4 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; GFX10-NEXT:    v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-NEXT:    v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/select-to-fmin-fmax.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/select-to-fmin-fmax.ll
index f2516cbc7b5df26..ee3bf9611199472 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/select-to-fmin-fmax.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/select-to-fmin-fmax.ll
@@ -45,18 +45,18 @@ define <4 x half> @test_v4s16(<4 x half> %a) #0 {
 ; GCN-LABEL: test_v4s16:
 ; GCN:       ; %bb.0: ; %entry
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    s_mov_b32 s6, 0
+; GCN-NEXT:    v_mov_b32_e32 v4, 0
 ; GCN-NEXT:    v_cmp_gt_f16_e32 vcc, 0, v0
 ; GCN-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
-; GCN-NEXT:    v_cndmask_b32_e64 v4, v0, 0, vcc
-; GCN-NEXT:    v_cmp_lt_f16_sdwa s[4:5], v0, s6 src0_sel:WORD_1 src1_sel:DWORD
+; GCN-NEXT:    v_cndmask_b32_e64 v5, v0, 0, vcc
+; GCN-NEXT:    v_cmp_lt_f16_sdwa s[4:5], v0, v4 src0_sel:WORD_1 src1_sel:DWORD
 ; GCN-NEXT:    v_cmp_gt_f16_e32 vcc, 0, v1
 ; GCN-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
 ; GCN-NEXT:    v_cndmask_b32_e64 v0, v2, 0, s[4:5]
 ; GCN-NEXT:    v_cndmask_b32_e64 v2, v1, 0, vcc
-; GCN-NEXT:    v_cmp_lt_f16_sdwa s[4:5], v1, s6 src0_sel:WORD_1 src1_sel:DWORD
+; GCN-NEXT:    v_cmp_lt_f16_sdwa s[4:5], v1, v4 src0_sel:WORD_1 src1_sel:DWORD
 ; GCN-NEXT:    v_cndmask_b32_e64 v1, v3, 0, s[4:5]
-; GCN-NEXT:    v_and_b32_e32 v3, 0xffff, v4
+; GCN-NEXT:    v_and_b32_e32 v3, 0xffff, v5
 ; GCN-NEXT:    v_and_b32_e32 v2, 0xffff, v2
 ; GCN-NEXT:    v_lshl_or_b32 v0, v0, 16, v3
 ; GCN-NEXT:    v_lshl_or_b32 v1, v1, 16, v2
@@ -71,31 +71,31 @@ define <8 x half> @test_v8s16(<8 x half> %a) #0 {
 ; GCN-LABEL: test_v8s16:
 ; GCN:       ; %bb.0: ; %entry
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    s_mov_b32 s6, 0
+; GCN-NEXT:    v_mov_b32_e32 v8, 0
 ; GCN-NEXT:    v_cmp_gt_f16_e32 vcc, 0, v0
 ; GCN-NEXT:    v_lshrrev_b32_e32 v4, 16, v0
-; GCN-NEXT:    v_cndmask_b32_e64 v8, v0, 0, vcc
-; GCN-NEXT:    v_cmp_lt_f16_sdwa s[4:5], v0, s6 src0_sel:WORD_1 src1_sel:DWORD
+; GCN-NEXT:    v_cndmask_b32_e64 v9, v0, 0, vcc
+; GCN-NEXT:    v_cmp_lt_f16_sdwa s[4:5], v0, v8 src0_sel:WORD_1 src1_sel:DWORD
 ; GCN-NEXT:    v_cmp_gt_f16_e32 vcc, 0, v1
 ; GCN-NEXT:    v_lshrrev_b32_e32 v5, 16, v1
 ; GCN-NEXT:    v_cndmask_b32_e64 v0, v4, 0, s[4:5]
 ; GCN-NEXT:    v_cndmask_b32_e64 v4, v1, 0, vcc
-; GCN-NEXT:    v_cmp_lt_f16_sdwa s[4:5], v1, s6 src0_sel:WORD_1 src1_sel:DWORD
+; GCN-NEXT:    v_cmp_lt_f16_sdwa s[4:5], v1, v8 src0_sel:WORD_1 src1_sel:DWORD
 ; GCN-NEXT:    v_cmp_gt_f16_e32 vcc, 0, v2
 ; GCN-NEXT:    v_lshrrev_b32_e32 v6, 16, v2
 ; GCN-NEXT:    v_cndmask_b32_e64 v1, v5, 0, s[4:5]
 ; GCN-NEXT:    v_cndmask_b32_e64 v5, v2, 0, vcc
-; GCN-NEXT:    v_cmp_lt_f16_sdwa s[4:5], v2, s6 src0_sel:WORD_1 src1_sel:DWORD
+; GCN-NEXT:    v_cmp_lt_f16_sdwa s[4:5], v2, v8 src0_sel:WORD_1 src1_sel:DWORD
 ; GCN-NEXT:    v_cmp_gt_f16_e32 vcc, 0, v3
 ; GCN-NEXT:    v_and_b32_e32 v4, 0xffff, v4
 ; GCN-NEXT:    v_lshrrev_b32_e32 v7, 16, v3
 ; GCN-NEXT:    v_cndmask_b32_e64 v2, v6, 0, s[4:5]
 ; GCN-NEXT:    v_cndmask_b32_e64 v6, v3, 0, vcc
-; GCN-NEXT:    v_cmp_lt_f16_sdwa s[4:5], v3, s6 src0_sel:WORD_1 src1_sel:DWORD
+; GCN-NEXT:    v_cmp_lt_f16_sdwa s[4:5], v3, v8 src0_sel:WORD_1 src1_sel:DWORD
 ; GCN-NEXT:    v_lshl_or_b32 v1, v1, 16, v4
 ; GCN-NEXT:    v_and_b32_e32 v4, 0xffff, v5
 ; GCN-NEXT:    v_cndmask_b32_e64 v3, v7, 0, s[4:5]
-; GCN-NEXT:    v_and_b32_e32 v7, 0xffff, v8
+; GCN-NEXT:    v_and_b32_e32 v7, 0xffff, v9
 ; GCN-NEXT:    v_lshl_or_b32 v2, v2, 16, v4
 ; GCN-NEXT:    v_and_b32_e32 v4, 0xffff, v6
 ; GCN-NEXT:    v_lshl_or_b32 v0, v0, 16, v7
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/sext_inreg.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/sext_inreg.ll
index 21f591a62c5f451..bac80f0777c0242 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/sext_inreg.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/sext_inreg.ll
@@ -999,7 +999,6 @@ define <4 x float> @v_sext_inreg_v8i16_11(<8 x i16> %value) {
 ; GFX8-NEXT:    v_or_b32_e32 v1, v4, v1
 ; GFX8-NEXT:    v_ashrrev_i16_e32 v4, 11, v7
 ; GFX8-NEXT:    v_ashrrev_i16_sdwa v2, v5, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX8-NEXT:    v_mov_b32_e32 v5, 11
 ; GFX8-NEXT:    v_or_b32_e32 v2, v4, v2
 ; GFX8-NEXT:    v_ashrrev_i16_e32 v4, 11, v8
 ; GFX8-NEXT:    v_ashrrev_i16_sdwa v3, v5, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i32.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i32.ll
index 88ace1c51f5b023..1bb606f36e48d2c 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i32.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i32.ll
@@ -254,10 +254,10 @@ define i32 @v_srem_i32_pow2k_denom(i32 %num) {
 ; CHECK-LABEL: v_srem_i32_pow2k_denom:
 ; CHECK:       ; %bb.0:
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_movk_i32 s4, 0x1000
 ; CHECK-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
 ; CHECK-NEXT:    v_rcp_iflag_f32_e32 v2, 0x45800000
 ; CHECK-NEXT:    v_mov_b32_e32 v3, 0xfffff000
+; CHECK-NEXT:    v_mov_b32_e32 v4, 0x1000
 ; CHECK-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
 ; CHECK-NEXT:    v_mul_f32_e32 v2, 0x4f7ffffe, v2
 ; CHECK-NEXT:    v_xor_b32_e32 v0, v0, v1
@@ -268,11 +268,11 @@ define i32 @v_srem_i32_pow2k_denom(i32 %num) {
 ; CHECK-NEXT:    v_mul_hi_u32 v2, v0, v2
 ; CHECK-NEXT:    v_lshlrev_b32_e32 v2, 12, v2
 ; CHECK-NEXT:    v_sub_i32_e32 v0, vcc, v0, v2
-; CHECK-NEXT:    v_subrev_i32_e32 v2, vcc, s4, v0
-; CHECK-NEXT:    v_cmp_le_u32_e32 vcc, s4, v0
+; CHECK-NEXT:    v_subrev_i32_e32 v2, vcc, 0x1000, v0
+; CHECK-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v4
 ; CHECK-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
-; CHECK-NEXT:    v_subrev_i32_e32 v2, vcc, s4, v0
-; CHECK-NEXT:    v_cmp_le_u32_e32 vcc, s4, v0
+; CHECK-NEXT:    v_subrev_i32_e32 v2, vcc, 0x1000, v0
+; CHECK-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v4
 ; CHECK-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
 ; CHECK-NEXT:    v_xor_b32_e32 v0, v0, v1
 ; CHECK-NEXT:    v_sub_i32_e32 v0, vcc, v0, v1
@@ -286,89 +286,76 @@ define <2 x i32> @v_srem_v2i32_pow2k_denom(<2 x i32> %num) {
 ; GISEL:       ; %bb.0:
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    v_ashrrev_i32_e32 v2, 31, v0
-; GISEL-NEXT:    s_movk_i32 s4, 0x1000
-; GISEL-NEXT:    v_cvt_f32_u32_e32 v3, 0x1000
-; GISEL-NEXT:    v_mov_b32_e32 v4, 0xfffff000
-; GISEL-NEXT:    v_ashrrev_i32_e32 v5, 31, v1
+; GISEL-NEXT:    v_mov_b32_e32 v3, 0x1000
+; GISEL-NEXT:    v_cvt_f32_u32_e32 v4, 0x1000
+; GISEL-NEXT:    v_mov_b32_e32 v5, 0xfffff000
+; GISEL-NEXT:    v_ashrrev_i32_e32 v6, 31, v1
 ; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
-; GISEL-NEXT:    v_rcp_iflag_f32_e32 v3, v3
-; GISEL-NEXT:    v_add_i32_e32 v1, vcc, v1, v5
+; GISEL-NEXT:    v_rcp_iflag_f32_e32 v4, v4
+; GISEL-NEXT:    v_add_i32_e32 v1, vcc, v1, v6
 ; GISEL-NEXT:    v_xor_b32_e32 v0, v0, v2
-; GISEL-NEXT:    v_mul_f32_e32 v6, 0x4f7ffffe, v3
-; GISEL-NEXT:    v_xor_b32_e32 v1, v1, v5
-; GISEL-NEXT:    v_mul_f32_e32 v3, 0x4f7ffffe, v3
-; GISEL-NEXT:    v_cvt_u32_f32_e32 v6, v6
-; GISEL-NEXT:    v_cvt_u32_f32_e32 v3, v3
-; GISEL-NEXT:    v_mul_lo_u32 v7, v6, v4
-; GISEL-NEXT:    v_mul_lo_u32 v4, v3, v4
-; GISEL-NEXT:    v_mul_hi_u32 v7, v6, v7
-; GISEL-NEXT:    v_mul_hi_u32 v4, v3, v4
-; GISEL-NEXT:    v_add_i32_e32 v6, vcc, v6, v7
-; GISEL-NEXT:    v_add_i32_e32 v3, vcc, v3, v4
-; GISEL-NEXT:    v_mul_hi_u32 v4, v0, v6
-; GISEL-NEXT:    v_mul_hi_u32 v3, v1, v3
+; GISEL-NEXT:    v_mul_f32_e32 v4, 0x4f7ffffe, v4
+; GISEL-NEXT:    v_xor_b32_e32 v1, v1, v6
+; GISEL-NEXT:    v_cvt_u32_f32_e32 v4, v4
+; GISEL-NEXT:    v_mul_lo_u32 v5, v4, v5
+; GISEL-NEXT:    v_mul_hi_u32 v5, v4, v5
+; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v4, v5
+; GISEL-NEXT:    v_mul_hi_u32 v5, v0, v4
+; GISEL-NEXT:    v_mul_hi_u32 v4, v1, v4
+; GISEL-NEXT:    v_lshlrev_b32_e32 v5, 12, v5
 ; GISEL-NEXT:    v_lshlrev_b32_e32 v4, 12, v4
-; GISEL-NEXT:    v_lshlrev_b32_e32 v3, 12, v3
-; GISEL-NEXT:    v_sub_i32_e32 v0, vcc, v0, v4
-; GISEL-NEXT:    v_sub_i32_e32 v1, vcc, v1, v3
-; GISEL-NEXT:    v_subrev_i32_e32 v3, vcc, s4, v0
-; GISEL-NEXT:    v_subrev_i32_e32 v4, vcc, s4, v1
-; GISEL-NEXT:    v_cmp_le_u32_e32 vcc, s4, v0
-; GISEL-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
-; GISEL-NEXT:    v_cmp_le_u32_e32 vcc, s4, v1
-; GISEL-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
-; GISEL-NEXT:    v_subrev_i32_e32 v3, vcc, s4, v0
-; GISEL-NEXT:    v_subrev_i32_e32 v4, vcc, s4, v1
-; GISEL-NEXT:    v_cmp_le_u32_e32 vcc, s4, v0
-; GISEL-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
-; GISEL-NEXT:    v_cmp_le_u32_e32 vcc, s4, v1
-; GISEL-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
+; GISEL-NEXT:    v_sub_i32_e32 v0, vcc, v0, v5
+; GISEL-NEXT:    v_sub_i32_e32 v1, vcc, v1, v4
+; GISEL-NEXT:    v_sub_i32_e32 v4, vcc, v0, v3
+; GISEL-NEXT:    v_subrev_i32_e32 v5, vcc, 0x1000, v1
+; GISEL-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v3
+; GISEL-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
+; GISEL-NEXT:    v_cmp_ge_u32_e32 vcc, v1, v3
+; GISEL-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc
+; GISEL-NEXT:    v_sub_i32_e32 v4, vcc, v0, v3
+; GISEL-NEXT:    v_subrev_i32_e32 v5, vcc, 0x1000, v1
+; GISEL-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v3
+; GISEL-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
+; GISEL-NEXT:    v_cmp_ge_u32_e32 vcc, v1, v3
+; GISEL-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc
 ; GISEL-NEXT:    v_xor_b32_e32 v0, v0, v2
-; GISEL-NEXT:    v_xor_b32_e32 v1, v1, v5
+; GISEL-NEXT:    v_xor_b32_e32 v1, v1, v6
 ; GISEL-NEXT:    v_sub_i32_e32 v0, vcc, v0, v2
-; GISEL-NEXT:    v_sub_i32_e32 v1, vcc, v1, v5
+; GISEL-NEXT:    v_sub_i32_e32 v1, vcc, v1, v6
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; CGP-LABEL: v_srem_v2i32_pow2k_denom:
 ; CGP:       ; %bb.0:
 ; CGP-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CGP-NEXT:    s_movk_i32 s4, 0x1000
 ; CGP-NEXT:    v_ashrrev_i32_e32 v2, 31, v0
 ; CGP-NEXT:    v_rcp_iflag_f32_e32 v3, 0x45800000
-; CGP-NEXT:    s_movk_i32 s5, 0xf000
 ; CGP-NEXT:    v_mov_b32_e32 v4, 0xfffff000
 ; CGP-NEXT:    v_mov_b32_e32 v5, 0x1000
 ; CGP-NEXT:    v_ashrrev_i32_e32 v6, 31, v1
-; CGP-NEXT:    v_rcp_iflag_f32_e32 v7, 0x45800000
 ; CGP-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
 ; CGP-NEXT:    v_mul_f32_e32 v3, 0x4f7ffffe, v3
 ; CGP-NEXT:    v_add_i32_e32 v1, vcc, v1, v6
-; CGP-NEXT:    v_mul_f32_e32 v7, 0x4f7ffffe, v7
 ; CGP-NEXT:    v_xor_b32_e32 v0, v0, v2
 ; CGP-NEXT:    v_cvt_u32_f32_e32 v3, v3
 ; CGP-NEXT:    v_xor_b32_e32 v1, v1, v6
-; CGP-NEXT:    v_cvt_u32_f32_e32 v7, v7
-; CGP-NEXT:    v_mul_lo_u32 v8, v3, s5
-; CGP-NEXT:    v_mul_lo_u32 v4, v7, v4
-; CGP-NEXT:    v_mul_hi_u32 v8, v3, v8
-; CGP-NEXT:    v_mul_hi_u32 v4, v7, v4
-; CGP-NEXT:    v_add_i32_e32 v3, vcc, v3, v8
-; CGP-NEXT:    v_add_i32_e32 v4, vcc, v7, v4
-; CGP-NEXT:    v_mul_hi_u32 v3, v0, v3
-; CGP-NEXT:    v_mul_hi_u32 v4, v1, v4
-; CGP-NEXT:    v_lshlrev_b32_e32 v3, 12, v3
+; CGP-NEXT:    v_mul_lo_u32 v4, v3, v4
+; CGP-NEXT:    v_mul_hi_u32 v4, v3, v4
+; CGP-NEXT:    v_add_i32_e32 v3, vcc, v3, v4
+; CGP-NEXT:    v_mul_hi_u32 v4, v0, v3
+; CGP-NEXT:    v_mul_hi_u32 v3, v1, v3
 ; CGP-NEXT:    v_lshlrev_b32_e32 v4, 12, v4
-; CGP-NEXT:    v_sub_i32_e32 v0, vcc, v0, v3
-; CGP-NEXT:    v_sub_i32_e32 v1, vcc, v1, v4
-; CGP-NEXT:    v_subrev_i32_e32 v3, vcc, s4, v0
+; CGP-NEXT:    v_lshlrev_b32_e32 v3, 12, v3
+; CGP-NEXT:    v_sub_i32_e32 v0, vcc, v0, v4
+; CGP-NEXT:    v_sub_i32_e32 v1, vcc, v1, v3
+; CGP-NEXT:    v_sub_i32_e32 v3, vcc, v0, v5
 ; CGP-NEXT:    v_subrev_i32_e32 v4, vcc, 0x1000, v1
-; CGP-NEXT:    v_cmp_le_u32_e32 vcc, s4, v0
+; CGP-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v5
 ; CGP-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
 ; CGP-NEXT:    v_cmp_ge_u32_e32 vcc, v1, v5
 ; CGP-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
-; CGP-NEXT:    v_subrev_i32_e32 v3, vcc, s4, v0
+; CGP-NEXT:    v_sub_i32_e32 v3, vcc, v0, v5
 ; CGP-NEXT:    v_subrev_i32_e32 v4, vcc, 0x1000, v1
-; CGP-NEXT:    v_cmp_le_u32_e32 vcc, s4, v0
+; CGP-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v5
 ; CGP-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
 ; CGP-NEXT:    v_cmp_ge_u32_e32 vcc, v1, v5
 ; CGP-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
@@ -385,10 +372,10 @@ define i32 @v_srem_i32_oddk_denom(i32 %num) {
 ; CHECK-LABEL: v_srem_i32_oddk_denom:
 ; CHECK:       ; %bb.0:
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_mov_b32 s4, 0x12d8fb
 ; CHECK-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
 ; CHECK-NEXT:    v_rcp_iflag_f32_e32 v2, 0x4996c7d8
 ; CHECK-NEXT:    v_mov_b32_e32 v3, 0xffed2705
+; CHECK-NEXT:    v_mov_b32_e32 v4, 0x12d8fb
 ; CHECK-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
 ; CHECK-NEXT:    v_mul_f32_e32 v2, 0x4f7ffffe, v2
 ; CHECK-NEXT:    v_xor_b32_e32 v0, v0, v1
@@ -397,13 +384,13 @@ define i32 @v_srem_i32_oddk_denom(i32 %num) {
 ; CHECK-NEXT:    v_mul_hi_u32 v3, v2, v3
 ; CHECK-NEXT:    v_add_i32_e32 v2, vcc, v2, v3
 ; CHECK-NEXT:    v_mul_hi_u32 v2, v0, v2
-; CHECK-NEXT:    v_mul_lo_u32 v2, v2, s4
+; CHECK-NEXT:    v_mul_lo_u32 v2, v2, v4
 ; CHECK-NEXT:    v_sub_i32_e32 v0, vcc, v0, v2
-; CHECK-NEXT:    v_subrev_i32_e32 v2, vcc, s4, v0
-; CHECK-NEXT:    v_cmp_le_u32_e32 vcc, s4, v0
+; CHECK-NEXT:    v_subrev_i32_e32 v2, vcc, 0x12d8fb, v0
+; CHECK-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v4
 ; CHECK-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
-; CHECK-NEXT:    v_subrev_i32_e32 v2, vcc, s4, v0
-; CHECK-NEXT:    v_cmp_le_u32_e32 vcc, s4, v0
+; CHECK-NEXT:    v_subrev_i32_e32 v2, vcc, 0x12d8fb, v0
+; CHECK-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v4
 ; CHECK-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
 ; CHECK-NEXT:    v_xor_b32_e32 v0, v0, v1
 ; CHECK-NEXT:    v_sub_i32_e32 v0, vcc, v0, v1
@@ -417,89 +404,76 @@ define <2 x i32> @v_srem_v2i32_oddk_denom(<2 x i32> %num) {
 ; GISEL:       ; %bb.0:
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-NEXT:    v_ashrrev_i32_e32 v2, 31, v0
-; GISEL-NEXT:    s_mov_b32 s4, 0x12d8fb
-; GISEL-NEXT:    v_cvt_f32_u32_e32 v3, 0x12d8fb
-; GISEL-NEXT:    v_mov_b32_e32 v4, 0xffed2705
-; GISEL-NEXT:    v_ashrrev_i32_e32 v5, 31, v1
+; GISEL-NEXT:    v_mov_b32_e32 v3, 0x12d8fb
+; GISEL-NEXT:    v_cvt_f32_u32_e32 v4, 0x12d8fb
+; GISEL-NEXT:    v_mov_b32_e32 v5, 0xffed2705
+; GISEL-NEXT:    v_ashrrev_i32_e32 v6, 31, v1
 ; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
-; GISEL-NEXT:    v_rcp_iflag_f32_e32 v3, v3
-; GISEL-NEXT:    v_add_i32_e32 v1, vcc, v1, v5
+; GISEL-NEXT:    v_rcp_iflag_f32_e32 v4, v4
+; GISEL-NEXT:    v_add_i32_e32 v1, vcc, v1, v6
 ; GISEL-NEXT:    v_xor_b32_e32 v0, v0, v2
-; GISEL-NEXT:    v_mul_f32_e32 v6, 0x4f7ffffe, v3
-; GISEL-NEXT:    v_xor_b32_e32 v1, v1, v5
-; GISEL-NEXT:    v_mul_f32_e32 v3, 0x4f7ffffe, v3
-; GISEL-NEXT:    v_cvt_u32_f32_e32 v6, v6
-; GISEL-NEXT:    v_cvt_u32_f32_e32 v3, v3
-; GISEL-NEXT:    v_mul_lo_u32 v7, v6, v4
-; GISEL-NEXT:    v_mul_lo_u32 v4, v3, v4
-; GISEL-NEXT:    v_mul_hi_u32 v7, v6, v7
-; GISEL-NEXT:    v_mul_hi_u32 v4, v3, v4
-; GISEL-NEXT:    v_add_i32_e32 v6, vcc, v6, v7
-; GISEL-NEXT:    v_add_i32_e32 v3, vcc, v3, v4
-; GISEL-NEXT:    v_mul_hi_u32 v4, v0, v6
-; GISEL-NEXT:    v_mul_hi_u32 v3, v1, v3
-; GISEL-NEXT:    v_mul_lo_u32 v4, v4, s4
-; GISEL-NEXT:    v_mul_lo_u32 v3, v3, s4
-; GISEL-NEXT:    v_sub_i32_e32 v0, vcc, v0, v4
-; GISEL-NEXT:    v_sub_i32_e32 v1, vcc, v1, v3
-; GISEL-NEXT:    v_subrev_i32_e32 v3, vcc, s4, v0
-; GISEL-NEXT:    v_subrev_i32_e32 v4, vcc, s4, v1
-; GISEL-NEXT:    v_cmp_le_u32_e32 vcc, s4, v0
-; GISEL-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
-; GISEL-NEXT:    v_cmp_le_u32_e32 vcc, s4, v1
-; GISEL-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
-; GISEL-NEXT:    v_subrev_i32_e32 v3, vcc, s4, v0
-; GISEL-NEXT:    v_subrev_i32_e32 v4, vcc, s4, v1
-; GISEL-NEXT:    v_cmp_le_u32_e32 vcc, s4, v0
-; GISEL-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
-; GISEL-NEXT:    v_cmp_le_u32_e32 vcc, s4, v1
-; GISEL-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
+; GISEL-NEXT:    v_mul_f32_e32 v4, 0x4f7ffffe, v4
+; GISEL-NEXT:    v_xor_b32_e32 v1, v1, v6
+; GISEL-NEXT:    v_cvt_u32_f32_e32 v4, v4
+; GISEL-NEXT:    v_mul_lo_u32 v5, v4, v5
+; GISEL-NEXT:    v_mul_hi_u32 v5, v4, v5
+; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v4, v5
+; GISEL-NEXT:    v_mul_hi_u32 v5, v0, v4
+; GISEL-NEXT:    v_mul_hi_u32 v4, v1, v4
+; GISEL-NEXT:    v_mul_lo_u32 v5, v5, v3
+; GISEL-NEXT:    v_mul_lo_u32 v4, v4, v3
+; GISEL-NEXT:    v_sub_i32_e32 v0, vcc, v0, v5
+; GISEL-NEXT:    v_sub_i32_e32 v1, vcc, v1, v4
+; GISEL-NEXT:    v_sub_i32_e32 v4, vcc, v0, v3
+; GISEL-NEXT:    v_subrev_i32_e32 v5, vcc, 0x12d8fb, v1
+; GISEL-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v3
+; GISEL-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
+; GISEL-NEXT:    v_cmp_ge_u32_e32 vcc, v1, v3
+; GISEL-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc
+; GISEL-NEXT:    v_sub_i32_e32 v4, vcc, v0, v3
+; GISEL-NEXT:    v_subrev_i32_e32 v5, vcc, 0x12d8fb, v1
+; GISEL-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v3
+; GISEL-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
+; GISEL-NEXT:    v_cmp_ge_u32_e32 vcc, v1, v3
+; GISEL-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc
 ; GISEL-NEXT:    v_xor_b32_e32 v0, v0, v2
-; GISEL-NEXT:    v_xor_b32_e32 v1, v1, v5
+; GISEL-NEXT:    v_xor_b32_e32 v1, v1, v6
 ; GISEL-NEXT:    v_sub_i32_e32 v0, vcc, v0, v2
-; GISEL-NEXT:    v_sub_i32_e32 v1, vcc, v1, v5
+; GISEL-NEXT:    v_sub_i32_e32 v1, vcc, v1, v6
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; CGP-LABEL: v_srem_v2i32_oddk_denom:
 ; CGP:       ; %bb.0:
 ; CGP-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CGP-NEXT:    s_mov_b32 s4, 0x12d8fb
 ; CGP-NEXT:    v_ashrrev_i32_e32 v2, 31, v0
 ; CGP-NEXT:    v_rcp_iflag_f32_e32 v3, 0x4996c7d8
-; CGP-NEXT:    s_mov_b32 s5, 0xffed2705
 ; CGP-NEXT:    v_mov_b32_e32 v4, 0xffed2705
 ; CGP-NEXT:    v_mov_b32_e32 v5, 0x12d8fb
 ; CGP-NEXT:    v_ashrrev_i32_e32 v6, 31, v1
-; CGP-NEXT:    v_rcp_iflag_f32_e32 v7, 0x4996c7d8
 ; CGP-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
 ; CGP-NEXT:    v_mul_f32_e32 v3, 0x4f7ffffe, v3
 ; CGP-NEXT:    v_add_i32_e32 v1, vcc, v1, v6
-; CGP-NEXT:    v_mul_f32_e32 v7, 0x4f7ffffe, v7
 ; CGP-NEXT:    v_xor_b32_e32 v0, v0, v2
 ; CGP-NEXT:    v_cvt_u32_f32_e32 v3, v3
 ; CGP-NEXT:    v_xor_b32_e32 v1, v1, v6
-; CGP-NEXT:    v_cvt_u32_f32_e32 v7, v7
-; CGP-NEXT:    v_mul_lo_u32 v8, v3, s5
-; CGP-NEXT:    v_mul_lo_u32 v4, v7, v4
-; CGP-NEXT:    v_mul_hi_u32 v8, v3, v8
-; CGP-NEXT:    v_mul_hi_u32 v4, v7, v4
-; CGP-NEXT:    v_add_i32_e32 v3, vcc, v3, v8
-; CGP-NEXT:    v_add_i32_e32 v4, vcc, v7, v4
-; CGP-NEXT:    v_mul_hi_u32 v3, v0, v3
-; CGP-NEXT:    v_mul_hi_u32 v4, v1, v4
-; CGP-NEXT:    v_mul_lo_u32 v3, v3, s4
-; CGP-NEXT:    v_mul_lo_u32 v4, v4, s4
-; CGP-NEXT:    v_sub_i32_e32 v0, vcc, v0, v3
-; CGP-NEXT:    v_sub_i32_e32 v1, vcc, v1, v4
-; CGP-NEXT:    v_subrev_i32_e32 v3, vcc, s4, v0
+; CGP-NEXT:    v_mul_lo_u32 v4, v3, v4
+; CGP-NEXT:    v_mul_hi_u32 v4, v3, v4
+; CGP-NEXT:    v_add_i32_e32 v3, vcc, v3, v4
+; CGP-NEXT:    v_mul_hi_u32 v4, v0, v3
+; CGP-NEXT:    v_mul_hi_u32 v3, v1, v3
+; CGP-NEXT:    v_mul_lo_u32 v4, v4, v5
+; CGP-NEXT:    v_mul_lo_u32 v3, v3, v5
+; CGP-NEXT:    v_sub_i32_e32 v0, vcc, v0, v4
+; CGP-NEXT:    v_sub_i32_e32 v1, vcc, v1, v3
+; CGP-NEXT:    v_sub_i32_e32 v3, vcc, v0, v5
 ; CGP-NEXT:    v_subrev_i32_e32 v4, vcc, 0x12d8fb, v1
-; CGP-NEXT:    v_cmp_le_u32_e32 vcc, s4, v0
+; CGP-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v5
 ; CGP-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
 ; CGP-NEXT:    v_cmp_ge_u32_e32 vcc, v1, v5
 ; CGP-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
-; CGP-NEXT:    v_subrev_i32_e32 v3, vcc, s4, v0
+; CGP-NEXT:    v_sub_i32_e32 v3, vcc, v0, v5
 ; CGP-NEXT:    v_subrev_i32_e32 v4, vcc, 0x12d8fb, v1
-; CGP-NEXT:    v_cmp_le_u32_e32 vcc, s4, v0
+; CGP-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v5
 ; CGP-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
 ; CGP-NEXT:    v_cmp_ge_u32_e32 vcc, v1, v5
 ; CGP-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll
index d1599ac489a5f51..0b22b3b3a4ba7c6 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll
@@ -979,7 +979,7 @@ define i64 @v_srem_i64_pow2k_denom(i64 %num) {
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    v_cvt_f32_u32_e32 v2, 0x1000
 ; CHECK-NEXT:    v_cvt_f32_ubyte0_e32 v3, 0
-; CHECK-NEXT:    s_movk_i32 s6, 0xf000
+; CHECK-NEXT:    v_mov_b32_e32 v6, 0xfffff000
 ; CHECK-NEXT:    v_mac_f32_e32 v2, 0x4f800000, v3
 ; CHECK-NEXT:    v_rcp_iflag_f32_e32 v2, v2
 ; CHECK-NEXT:    v_mul_f32_e32 v2, 0x5f7ffffc, v2
@@ -987,116 +987,114 @@ define i64 @v_srem_i64_pow2k_denom(i64 %num) {
 ; CHECK-NEXT:    v_trunc_f32_e32 v4, v3
 ; CHECK-NEXT:    v_mac_f32_e32 v2, 0xcf800000, v4
 ; CHECK-NEXT:    v_cvt_u32_f32_e32 v5, v2
-; CHECK-NEXT:    v_mov_b32_e32 v2, 0xfffff000
-; CHECK-NEXT:    v_cvt_u32_f32_e32 v6, v4
-; CHECK-NEXT:    v_mad_u64_u32 v[2:3], s[4:5], v2, v5, 0
-; CHECK-NEXT:    v_mad_u64_u32 v[3:4], s[4:5], s6, v6, v[3:4]
-; CHECK-NEXT:    v_mul_hi_u32 v7, v5, v2
+; CHECK-NEXT:    v_cvt_u32_f32_e32 v7, v4
+; CHECK-NEXT:    v_mad_u64_u32 v[2:3], s[4:5], v6, v5, 0
+; CHECK-NEXT:    v_mad_u64_u32 v[3:4], s[4:5], v6, v7, v[3:4]
+; CHECK-NEXT:    v_mul_hi_u32 v8, v5, v2
 ; CHECK-NEXT:    v_mad_u64_u32 v[3:4], s[4:5], -1, v5, v[3:4]
-; CHECK-NEXT:    v_mul_lo_u32 v4, v6, v2
-; CHECK-NEXT:    v_mul_hi_u32 v2, v6, v2
-; CHECK-NEXT:    v_mul_lo_u32 v8, v5, v3
-; CHECK-NEXT:    v_mul_lo_u32 v9, v6, v3
-; CHECK-NEXT:    v_mul_hi_u32 v10, v5, v3
-; CHECK-NEXT:    v_mul_hi_u32 v3, v6, v3
+; CHECK-NEXT:    v_mul_lo_u32 v4, v7, v2
+; CHECK-NEXT:    v_mul_hi_u32 v2, v7, v2
+; CHECK-NEXT:    v_mul_lo_u32 v9, v5, v3
+; CHECK-NEXT:    v_mul_lo_u32 v10, v7, v3
+; CHECK-NEXT:    v_mul_hi_u32 v11, v5, v3
+; CHECK-NEXT:    v_mul_hi_u32 v3, v7, v3
+; CHECK-NEXT:    v_add_i32_e32 v4, vcc, v4, v9
+; CHECK-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
+; CHECK-NEXT:    v_add_i32_e32 v2, vcc, v10, v2
+; CHECK-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
 ; CHECK-NEXT:    v_add_i32_e32 v4, vcc, v4, v8
-; CHECK-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
-; CHECK-NEXT:    v_add_i32_e32 v4, vcc, v4, v7
 ; CHECK-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc
-; CHECK-NEXT:    v_add_i32_e32 v4, vcc, v8, v4
-; CHECK-NEXT:    v_add_i32_e32 v2, vcc, v9, v2
-; CHECK-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
-; CHECK-NEXT:    v_add_i32_e32 v2, vcc, v2, v10
+; CHECK-NEXT:    v_add_i32_e32 v4, vcc, v9, v4
+; CHECK-NEXT:    v_add_i32_e32 v2, vcc, v2, v11
 ; CHECK-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
-; CHECK-NEXT:    v_add_i32_e32 v7, vcc, v7, v8
+; CHECK-NEXT:    v_add_i32_e32 v8, vcc, v10, v8
 ; CHECK-NEXT:    v_add_i32_e32 v2, vcc, v2, v4
 ; CHECK-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc
-; CHECK-NEXT:    v_add_i32_e32 v4, vcc, v7, v4
+; CHECK-NEXT:    v_add_i32_e32 v4, vcc, v8, v4
 ; CHECK-NEXT:    v_add_i32_e32 v3, vcc, v3, v4
 ; CHECK-NEXT:    v_add_i32_e32 v5, vcc, v5, v2
-; CHECK-NEXT:    v_addc_u32_e32 v6, vcc, v6, v3, vcc
-; CHECK-NEXT:    v_mad_u64_u32 v[2:3], s[4:5], s6, v5, 0
-; CHECK-NEXT:    v_ashrrev_i32_e32 v7, 31, v1
-; CHECK-NEXT:    v_add_i32_e32 v8, vcc, v0, v7
-; CHECK-NEXT:    v_mad_u64_u32 v[3:4], s[4:5], s6, v6, v[3:4]
-; CHECK-NEXT:    v_addc_u32_e32 v9, vcc, v1, v7, vcc
-; CHECK-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], -1, v5, v[3:4]
-; CHECK-NEXT:    v_xor_b32_e32 v3, v8, v7
-; CHECK-NEXT:    v_mul_lo_u32 v1, v6, v2
-; CHECK-NEXT:    v_mul_lo_u32 v8, v5, v0
-; CHECK-NEXT:    v_xor_b32_e32 v4, v9, v7
-; CHECK-NEXT:    v_mul_hi_u32 v9, v5, v2
-; CHECK-NEXT:    v_mul_hi_u32 v2, v6, v2
+; CHECK-NEXT:    v_addc_u32_e32 v7, vcc, v7, v3, vcc
+; CHECK-NEXT:    v_mad_u64_u32 v[2:3], s[4:5], v6, v5, 0
+; CHECK-NEXT:    v_mad_u64_u32 v[3:4], s[4:5], v6, v7, v[3:4]
+; CHECK-NEXT:    v_ashrrev_i32_e32 v6, 31, v1
+; CHECK-NEXT:    v_add_i32_e32 v0, vcc, v0, v6
+; CHECK-NEXT:    v_mad_u64_u32 v[3:4], s[4:5], -1, v5, v[3:4]
+; CHECK-NEXT:    v_addc_u32_e32 v1, vcc, v1, v6, vcc
+; CHECK-NEXT:    v_xor_b32_e32 v4, v0, v6
+; CHECK-NEXT:    v_mul_lo_u32 v0, v7, v2
+; CHECK-NEXT:    v_mul_lo_u32 v8, v5, v3
+; CHECK-NEXT:    v_xor_b32_e32 v9, v1, v6
+; CHECK-NEXT:    v_mul_hi_u32 v1, v5, v2
+; CHECK-NEXT:    v_mul_hi_u32 v2, v7, v2
+; CHECK-NEXT:    v_add_i32_e32 v0, vcc, v0, v8
+; CHECK-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
+; CHECK-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
+; CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; CHECK-NEXT:    v_mul_lo_u32 v1, v7, v3
+; CHECK-NEXT:    v_add_i32_e32 v0, vcc, v8, v0
+; CHECK-NEXT:    v_mul_hi_u32 v8, v5, v3
+; CHECK-NEXT:    v_add_i32_e32 v1, vcc, v1, v2
+; CHECK-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
 ; CHECK-NEXT:    v_add_i32_e32 v1, vcc, v1, v8
 ; CHECK-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
-; CHECK-NEXT:    v_add_i32_e32 v1, vcc, v1, v9
-; CHECK-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
-; CHECK-NEXT:    v_mul_lo_u32 v9, v6, v0
-; CHECK-NEXT:    v_add_i32_e32 v1, vcc, v8, v1
-; CHECK-NEXT:    v_mul_hi_u32 v8, v5, v0
-; CHECK-NEXT:    v_add_i32_e32 v2, vcc, v9, v2
-; CHECK-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
 ; CHECK-NEXT:    v_add_i32_e32 v2, vcc, v2, v8
-; CHECK-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
-; CHECK-NEXT:    v_add_i32_e32 v8, vcc, v9, v8
-; CHECK-NEXT:    v_mul_hi_u32 v0, v6, v0
+; CHECK-NEXT:    v_mul_hi_u32 v3, v7, v3
+; CHECK-NEXT:    v_add_i32_e32 v0, vcc, v1, v0
+; CHECK-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
 ; CHECK-NEXT:    v_add_i32_e32 v1, vcc, v2, v1
+; CHECK-NEXT:    v_add_i32_e32 v1, vcc, v3, v1
+; CHECK-NEXT:    v_add_i32_e32 v0, vcc, v5, v0
+; CHECK-NEXT:    v_addc_u32_e32 v1, vcc, v7, v1, vcc
+; CHECK-NEXT:    v_mul_lo_u32 v2, v9, v0
+; CHECK-NEXT:    v_mul_lo_u32 v3, v4, v1
+; CHECK-NEXT:    v_mul_hi_u32 v7, v4, v0
+; CHECK-NEXT:    v_mul_hi_u32 v0, v9, v0
+; CHECK-NEXT:    v_mov_b32_e32 v5, 0x1000
+; CHECK-NEXT:    v_add_i32_e32 v2, vcc, v2, v3
+; CHECK-NEXT:    v_cndmask_b32_e64 v3, 0, 1, vcc
+; CHECK-NEXT:    v_add_i32_e32 v2, vcc, v2, v7
 ; CHECK-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
-; CHECK-NEXT:    v_add_i32_e32 v2, vcc, v8, v2
+; CHECK-NEXT:    v_mul_lo_u32 v7, v9, v1
+; CHECK-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
+; CHECK-NEXT:    v_mul_hi_u32 v3, v4, v1
+; CHECK-NEXT:    v_add_i32_e32 v0, vcc, v7, v0
+; CHECK-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
+; CHECK-NEXT:    v_add_i32_e32 v0, vcc, v0, v3
+; CHECK-NEXT:    v_cndmask_b32_e64 v3, 0, 1, vcc
+; CHECK-NEXT:    v_add_i32_e32 v3, vcc, v7, v3
 ; CHECK-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
-; CHECK-NEXT:    v_add_i32_e32 v1, vcc, v5, v1
-; CHECK-NEXT:    v_addc_u32_e32 v0, vcc, v6, v0, vcc
-; CHECK-NEXT:    v_mul_lo_u32 v2, v4, v1
-; CHECK-NEXT:    v_mul_lo_u32 v5, v3, v0
-; CHECK-NEXT:    v_mul_hi_u32 v8, v3, v1
-; CHECK-NEXT:    v_mul_hi_u32 v1, v4, v1
-; CHECK-NEXT:    s_movk_i32 s6, 0x1000
-; CHECK-NEXT:    v_add_i32_e32 v2, vcc, v2, v5
-; CHECK-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
-; CHECK-NEXT:    v_add_i32_e32 v2, vcc, v2, v8
+; CHECK-NEXT:    v_mul_hi_u32 v7, v9, v1
+; CHECK-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v5, v0, 0
 ; CHECK-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
-; CHECK-NEXT:    v_mul_lo_u32 v8, v4, v0
-; CHECK-NEXT:    v_add_i32_e32 v2, vcc, v5, v2
-; CHECK-NEXT:    v_mul_hi_u32 v5, v3, v0
-; CHECK-NEXT:    v_add_i32_e32 v1, vcc, v8, v1
-; CHECK-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
-; CHECK-NEXT:    v_add_i32_e32 v1, vcc, v1, v5
-; CHECK-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
-; CHECK-NEXT:    v_add_i32_e32 v5, vcc, v8, v5
-; CHECK-NEXT:    v_add_i32_e32 v1, vcc, v1, v2
-; CHECK-NEXT:    v_mul_hi_u32 v8, v4, v0
-; CHECK-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], s6, v1, 0
-; CHECK-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
-; CHECK-NEXT:    v_add_i32_e32 v2, vcc, v5, v2
-; CHECK-NEXT:    v_add_i32_e32 v2, vcc, v8, v2
-; CHECK-NEXT:    v_mad_u64_u32 v[1:2], s[4:5], s6, v2, v[1:2]
-; CHECK-NEXT:    v_sub_i32_e64 v0, s[4:5], v3, v0
-; CHECK-NEXT:    v_mov_b32_e32 v6, 0x1000
-; CHECK-NEXT:    v_subb_u32_e64 v2, vcc, v4, v1, s[4:5]
-; CHECK-NEXT:    v_sub_i32_e32 v1, vcc, v4, v1
-; CHECK-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v6
+; CHECK-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
+; CHECK-NEXT:    v_add_i32_e32 v2, vcc, v7, v2
+; CHECK-NEXT:    v_mad_u64_u32 v[1:2], s[4:5], v5, v2, v[1:2]
+; CHECK-NEXT:    v_sub_i32_e64 v0, s[4:5], v4, v0
+; CHECK-NEXT:    v_subb_u32_e64 v2, vcc, v9, v1, s[4:5]
+; CHECK-NEXT:    v_sub_i32_e32 v1, vcc, v9, v1
+; CHECK-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v5
 ; CHECK-NEXT:    v_cndmask_b32_e64 v3, 0, -1, vcc
 ; CHECK-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
 ; CHECK-NEXT:    v_cndmask_b32_e32 v3, -1, v3, vcc
-; CHECK-NEXT:    v_sub_i32_e32 v4, vcc, v0, v6
+; CHECK-NEXT:    v_sub_i32_e32 v4, vcc, v0, v5
 ; CHECK-NEXT:    v_subbrev_u32_e64 v1, s[4:5], 0, v1, s[4:5]
 ; CHECK-NEXT:    v_subbrev_u32_e32 v1, vcc, 0, v1, vcc
-; CHECK-NEXT:    v_cmp_ge_u32_e32 vcc, v4, v6
+; CHECK-NEXT:    v_cmp_ge_u32_e32 vcc, v4, v5
 ; CHECK-NEXT:    v_cndmask_b32_e64 v5, 0, -1, vcc
 ; CHECK-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
 ; CHECK-NEXT:    v_cndmask_b32_e32 v5, -1, v5, vcc
-; CHECK-NEXT:    v_subrev_i32_e32 v6, vcc, 0x1000, v4
+; CHECK-NEXT:    v_subrev_i32_e32 v7, vcc, 0x1000, v4
 ; CHECK-NEXT:    v_subbrev_u32_e32 v8, vcc, 0, v1, vcc
 ; CHECK-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v5
-; CHECK-NEXT:    v_cndmask_b32_e32 v4, v4, v6, vcc
+; CHECK-NEXT:    v_cndmask_b32_e32 v4, v4, v7, vcc
 ; CHECK-NEXT:    v_cndmask_b32_e32 v1, v1, v8, vcc
 ; CHECK-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v3
 ; CHECK-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
 ; CHECK-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
-; CHECK-NEXT:    v_xor_b32_e32 v0, v0, v7
-; CHECK-NEXT:    v_xor_b32_e32 v1, v1, v7
-; CHECK-NEXT:    v_sub_i32_e32 v0, vcc, v0, v7
-; CHECK-NEXT:    v_subb_u32_e32 v1, vcc, v1, v7, vcc
+; CHECK-NEXT:    v_xor_b32_e32 v0, v0, v6
+; CHECK-NEXT:    v_xor_b32_e32 v1, v1, v6
+; CHECK-NEXT:    v_sub_i32_e32 v0, vcc, v0, v6
+; CHECK-NEXT:    v_subb_u32_e32 v1, vcc, v1, v6, vcc
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
   %result = srem i64 %num, 4096
   ret i64 %result
@@ -1114,165 +1112,155 @@ define <2 x i64> @v_srem_v2i64_pow2k_denom(<2 x i64> %num) {
 ; GISEL-NEXT:    v_rcp_iflag_f32_e32 v4, v4
 ; GISEL-NEXT:    v_mul_f32_e32 v4, 0x5f7ffffc, v4
 ; GISEL-NEXT:    v_mul_f32_e32 v5, 0x2f800000, v4
-; GISEL-NEXT:    v_trunc_f32_e32 v6, v5
-; GISEL-NEXT:    v_mac_f32_e32 v4, 0xcf800000, v6
-; GISEL-NEXT:    v_cvt_u32_f32_e32 v7, v4
-; GISEL-NEXT:    v_cvt_u32_f32_e32 v8, v6
-; GISEL-NEXT:    v_mad_u64_u32 v[4:5], s[4:5], s6, v7, 0
-; GISEL-NEXT:    v_mad_u64_u32 v[5:6], s[4:5], s6, v8, v[5:6]
-; GISEL-NEXT:    v_mul_hi_u32 v9, v7, v4
-; GISEL-NEXT:    v_mad_u64_u32 v[5:6], s[4:5], s7, v7, v[5:6]
-; GISEL-NEXT:    v_mul_lo_u32 v6, v8, v4
-; GISEL-NEXT:    v_mul_hi_u32 v4, v8, v4
-; GISEL-NEXT:    v_mul_lo_u32 v10, v7, v5
-; GISEL-NEXT:    v_mul_lo_u32 v11, v8, v5
-; GISEL-NEXT:    v_mul_hi_u32 v12, v7, v5
-; GISEL-NEXT:    v_mul_hi_u32 v5, v8, v5
-; GISEL-NEXT:    v_add_i32_e32 v6, vcc, v6, v10
+; GISEL-NEXT:    v_trunc_f32_e32 v7, v5
+; GISEL-NEXT:    v_mac_f32_e32 v4, 0xcf800000, v7
+; GISEL-NEXT:    v_cvt_u32_f32_e32 v6, v4
+; GISEL-NEXT:    v_cvt_u32_f32_e32 v7, v7
+; GISEL-NEXT:    v_mad_u64_u32 v[4:5], s[4:5], s6, v6, 0
+; GISEL-NEXT:    v_mad_u64_u32 v[8:9], s[4:5], s6, v7, v[5:6]
+; GISEL-NEXT:    v_mul_lo_u32 v5, v7, v4
+; GISEL-NEXT:    v_mad_u64_u32 v[8:9], s[4:5], s7, v6, v[8:9]
+; GISEL-NEXT:    v_mul_hi_u32 v9, v6, v4
+; GISEL-NEXT:    v_mul_hi_u32 v4, v7, v4
+; GISEL-NEXT:    v_mul_lo_u32 v10, v6, v8
+; GISEL-NEXT:    v_mul_lo_u32 v11, v7, v8
+; GISEL-NEXT:    v_mul_hi_u32 v12, v6, v8
+; GISEL-NEXT:    v_mul_hi_u32 v8, v7, v8
+; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v5, v10
 ; GISEL-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v6, vcc, v6, v9
-; GISEL-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v6, vcc, v10, v6
+; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v5, v9
+; GISEL-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v10, v5
 ; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v11, v4
 ; GISEL-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v4, v12
 ; GISEL-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v9, v10
-; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v4, v6
-; GISEL-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v6, vcc, v9, v6
-; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v5, v6
-; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v7, v4
-; GISEL-NEXT:    v_addc_u32_e32 v8, vcc, v8, v5, vcc
-; GISEL-NEXT:    v_mad_u64_u32 v[5:6], s[4:5], s6, v9, 0
-; GISEL-NEXT:    v_mov_b32_e32 v4, v6
-; GISEL-NEXT:    v_mad_u64_u32 v[6:7], s[4:5], s6, v8, v[4:5]
+; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v4, v5
+; GISEL-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v9, v5
+; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v8, v5
+; GISEL-NEXT:    v_add_i32_e32 v11, vcc, v6, v4
+; GISEL-NEXT:    v_mad_u64_u32 v[8:9], s[4:5], s6, v11, 0
+; GISEL-NEXT:    v_addc_u32_e32 v5, vcc, v7, v5, vcc
+; GISEL-NEXT:    v_mov_b32_e32 v4, v9
+; GISEL-NEXT:    v_mad_u64_u32 v[9:10], s[4:5], s6, v5, v[4:5]
 ; GISEL-NEXT:    v_ashrrev_i32_e32 v4, 31, v1
 ; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v0, v4
-; GISEL-NEXT:    v_mad_u64_u32 v[6:7], s[4:5], s7, v9, v[6:7]
+; GISEL-NEXT:    v_mad_u64_u32 v[9:10], s[4:5], s7, v11, v[9:10]
 ; GISEL-NEXT:    v_addc_u32_e32 v1, vcc, v1, v4, vcc
 ; GISEL-NEXT:    v_xor_b32_e32 v10, v0, v4
-; GISEL-NEXT:    v_mul_lo_u32 v0, v8, v5
-; GISEL-NEXT:    v_mul_lo_u32 v7, v9, v6
-; GISEL-NEXT:    v_xor_b32_e32 v11, v1, v4
-; GISEL-NEXT:    v_mul_hi_u32 v1, v9, v5
-; GISEL-NEXT:    v_mul_hi_u32 v5, v8, v5
-; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v0, v7
-; GISEL-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
+; GISEL-NEXT:    v_mul_lo_u32 v0, v5, v8
+; GISEL-NEXT:    v_mul_lo_u32 v12, v11, v9
+; GISEL-NEXT:    v_xor_b32_e32 v13, v1, v4
+; GISEL-NEXT:    v_mul_hi_u32 v1, v11, v8
+; GISEL-NEXT:    v_mul_hi_u32 v8, v5, v8
+; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v0, v12
+; GISEL-NEXT:    v_cndmask_b32_e64 v12, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
 ; GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
-; GISEL-NEXT:    v_mul_lo_u32 v1, v8, v6
-; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v7, v0
-; GISEL-NEXT:    v_mul_hi_u32 v7, v9, v6
-; GISEL-NEXT:    v_add_i32_e32 v1, vcc, v1, v5
-; GISEL-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v1, vcc, v1, v7
-; GISEL-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v5, v7
-; GISEL-NEXT:    v_mul_hi_u32 v6, v8, v6
+; GISEL-NEXT:    v_mul_lo_u32 v1, v5, v9
+; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v12, v0
+; GISEL-NEXT:    v_mul_hi_u32 v12, v11, v9
+; GISEL-NEXT:    v_add_i32_e32 v1, vcc, v1, v8
+; GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v1, vcc, v1, v12
+; GISEL-NEXT:    v_cndmask_b32_e64 v12, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v8, v12
+; GISEL-NEXT:    v_mul_hi_u32 v9, v5, v9
 ; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v1, v0
 ; GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v1, vcc, v5, v1
-; GISEL-NEXT:    v_add_i32_e32 v1, vcc, v6, v1
-; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v9, v0
-; GISEL-NEXT:    v_addc_u32_e32 v1, vcc, v8, v1, vcc
-; GISEL-NEXT:    v_mul_lo_u32 v6, v11, v0
-; GISEL-NEXT:    v_mul_lo_u32 v7, v10, v1
-; GISEL-NEXT:    v_mul_hi_u32 v8, v10, v0
-; GISEL-NEXT:    v_mul_hi_u32 v0, v11, v0
+; GISEL-NEXT:    v_add_i32_e32 v1, vcc, v8, v1
+; GISEL-NEXT:    v_add_i32_e32 v1, vcc, v9, v1
+; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v11, v0
+; GISEL-NEXT:    v_addc_u32_e32 v1, vcc, v5, v1, vcc
+; GISEL-NEXT:    v_mul_lo_u32 v8, v13, v0
+; GISEL-NEXT:    v_mul_lo_u32 v9, v10, v1
+; GISEL-NEXT:    v_mul_hi_u32 v11, v10, v0
+; GISEL-NEXT:    v_mul_hi_u32 v0, v13, v0
 ; GISEL-NEXT:    v_mov_b32_e32 v5, 0x1000
-; GISEL-NEXT:    v_add_i32_e32 v6, vcc, v6, v7
-; GISEL-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v6, vcc, v6, v8
-; GISEL-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc
-; GISEL-NEXT:    v_mul_lo_u32 v8, v11, v1
-; GISEL-NEXT:    v_add_i32_e32 v6, vcc, v7, v6
-; GISEL-NEXT:    v_mul_hi_u32 v7, v10, v1
-; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v8, v0
+; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v8, v9
+; GISEL-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v8, v11
 ; GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v0, v7
-; GISEL-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v8, v7
-; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v0, v6
-; GISEL-NEXT:    v_mul_hi_u32 v9, v11, v1
-; GISEL-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v5, v8, 0
-; GISEL-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v6, vcc, v7, v6
-; GISEL-NEXT:    v_add_i32_e32 v6, vcc, v9, v6
-; GISEL-NEXT:    v_mad_u64_u32 v[6:7], s[4:5], v5, v6, v[1:2]
+; GISEL-NEXT:    v_mul_lo_u32 v11, v13, v1
+; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v9, v8
+; GISEL-NEXT:    v_mul_hi_u32 v9, v10, v1
+; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v11, v0
+; GISEL-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v0, v9
+; GISEL-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v11, v9
+; GISEL-NEXT:    v_add_i32_e32 v11, vcc, v0, v8
+; GISEL-NEXT:    v_mul_hi_u32 v12, v13, v1
+; GISEL-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v5, v11, 0
+; GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v9, v8
+; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v12, v8
+; GISEL-NEXT:    v_mad_u64_u32 v[8:9], s[4:5], v5, v8, v[1:2]
+; GISEL-NEXT:    v_sub_i32_e32 v10, vcc, v10, v0
+; GISEL-NEXT:    v_mad_u64_u32 v[8:9], s[4:5], 0, v11, v[8:9]
 ; GISEL-NEXT:    s_sub_u32 s6, 0, 0x1000
 ; GISEL-NEXT:    s_subb_u32 s7, 0, 0
-; GISEL-NEXT:    v_mad_u64_u32 v[6:7], s[4:5], 0, v8, v[6:7]
-; GISEL-NEXT:    v_sub_i32_e32 v8, vcc, v10, v0
-; GISEL-NEXT:    v_subb_u32_e64 v9, s[4:5], v11, v6, vcc
-; GISEL-NEXT:    v_sub_i32_e64 v0, s[4:5], v11, v6
-; GISEL-NEXT:    v_cmp_ge_u32_e64 s[4:5], v8, v5
-; GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, -1, s[4:5]
-; GISEL-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v9
-; GISEL-NEXT:    v_cndmask_b32_e64 v10, -1, v1, s[4:5]
-; GISEL-NEXT:    v_cvt_f32_u32_e32 v1, 0x1000
-; GISEL-NEXT:    v_cvt_f32_ubyte0_e32 v6, 0
+; GISEL-NEXT:    v_subb_u32_e64 v11, s[4:5], v13, v8, vcc
+; GISEL-NEXT:    v_sub_i32_e64 v0, s[4:5], v13, v8
+; GISEL-NEXT:    v_cmp_ge_u32_e64 s[4:5], v10, v5
 ; GISEL-NEXT:    v_subbrev_u32_e32 v0, vcc, 0, v0, vcc
-; GISEL-NEXT:    v_mac_f32_e32 v1, 0x4f800000, v6
-; GISEL-NEXT:    v_rcp_iflag_f32_e32 v1, v1
-; GISEL-NEXT:    v_sub_i32_e32 v11, vcc, v8, v5
-; GISEL-NEXT:    v_subbrev_u32_e32 v12, vcc, 0, v0, vcc
-; GISEL-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v1
-; GISEL-NEXT:    v_mul_f32_e32 v1, 0x2f800000, v0
-; GISEL-NEXT:    v_trunc_f32_e32 v6, v1
-; GISEL-NEXT:    v_mac_f32_e32 v0, 0xcf800000, v6
-; GISEL-NEXT:    v_cvt_u32_f32_e32 v13, v0
-; GISEL-NEXT:    v_cvt_u32_f32_e32 v14, v6
-; GISEL-NEXT:    v_cmp_ge_u32_e32 vcc, v11, v5
-; GISEL-NEXT:    v_cndmask_b32_e64 v7, 0, -1, vcc
-; GISEL-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], s6, v13, 0
-; GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v12
-; GISEL-NEXT:    v_cndmask_b32_e32 v15, -1, v7, vcc
-; GISEL-NEXT:    v_mad_u64_u32 v[6:7], s[4:5], s6, v14, v[1:2]
-; GISEL-NEXT:    v_sub_i32_e32 v1, vcc, v11, v5
-; GISEL-NEXT:    v_mad_u64_u32 v[6:7], s[4:5], s7, v13, v[6:7]
-; GISEL-NEXT:    v_subbrev_u32_e32 v16, vcc, 0, v12, vcc
+; GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, -1, s[4:5]
+; GISEL-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v11
+; GISEL-NEXT:    v_sub_i32_e32 v13, vcc, v10, v5
+; GISEL-NEXT:    v_cndmask_b32_e64 v12, -1, v1, s[4:5]
+; GISEL-NEXT:    v_subbrev_u32_e32 v14, vcc, 0, v0, vcc
+; GISEL-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], s6, v6, 0
+; GISEL-NEXT:    v_cmp_ge_u32_e32 vcc, v13, v5
+; GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, -1, vcc
+; GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v14
+; GISEL-NEXT:    v_cndmask_b32_e32 v15, -1, v8, vcc
+; GISEL-NEXT:    v_mad_u64_u32 v[8:9], s[4:5], s6, v7, v[1:2]
+; GISEL-NEXT:    v_sub_i32_e32 v1, vcc, v13, v5
+; GISEL-NEXT:    v_mad_u64_u32 v[8:9], s[4:5], s7, v6, v[8:9]
+; GISEL-NEXT:    v_subbrev_u32_e32 v16, vcc, 0, v14, vcc
 ; GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v15
-; GISEL-NEXT:    v_cndmask_b32_e32 v7, v11, v1, vcc
-; GISEL-NEXT:    v_mul_lo_u32 v1, v14, v0
-; GISEL-NEXT:    v_mul_lo_u32 v11, v13, v6
-; GISEL-NEXT:    v_mul_hi_u32 v15, v13, v0
-; GISEL-NEXT:    v_cndmask_b32_e32 v12, v12, v16, vcc
-; GISEL-NEXT:    v_mul_hi_u32 v0, v14, v0
-; GISEL-NEXT:    v_add_i32_e32 v1, vcc, v1, v11
-; GISEL-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
+; GISEL-NEXT:    v_cndmask_b32_e32 v9, v13, v1, vcc
+; GISEL-NEXT:    v_mul_lo_u32 v1, v7, v0
+; GISEL-NEXT:    v_mul_lo_u32 v13, v6, v8
+; GISEL-NEXT:    v_mul_hi_u32 v15, v6, v0
+; GISEL-NEXT:    v_cndmask_b32_e32 v14, v14, v16, vcc
+; GISEL-NEXT:    v_mul_hi_u32 v0, v7, v0
+; GISEL-NEXT:    v_add_i32_e32 v1, vcc, v1, v13
+; GISEL-NEXT:    v_cndmask_b32_e64 v13, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v1, vcc, v1, v15
 ; GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
-; GISEL-NEXT:    v_mul_lo_u32 v15, v14, v6
-; GISEL-NEXT:    v_add_i32_e32 v1, vcc, v11, v1
-; GISEL-NEXT:    v_mul_hi_u32 v11, v13, v6
+; GISEL-NEXT:    v_mul_lo_u32 v15, v7, v8
+; GISEL-NEXT:    v_add_i32_e32 v1, vcc, v13, v1
+; GISEL-NEXT:    v_mul_hi_u32 v13, v6, v8
 ; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v15, v0
 ; GISEL-NEXT:    v_cndmask_b32_e64 v15, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v0, v11
-; GISEL-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v11, vcc, v15, v11
-; GISEL-NEXT:    v_mul_hi_u32 v6, v14, v6
+; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v0, v13
+; GISEL-NEXT:    v_cndmask_b32_e64 v13, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v13, vcc, v15, v13
+; GISEL-NEXT:    v_mul_hi_u32 v8, v7, v8
 ; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
 ; GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v1, vcc, v11, v1
-; GISEL-NEXT:    v_add_i32_e32 v1, vcc, v6, v1
-; GISEL-NEXT:    v_add_i32_e32 v11, vcc, v13, v0
-; GISEL-NEXT:    v_addc_u32_e32 v13, vcc, v14, v1, vcc
-; GISEL-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], s6, v11, 0
-; GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v10
-; GISEL-NEXT:    v_cndmask_b32_e32 v8, v8, v7, vcc
+; GISEL-NEXT:    v_add_i32_e32 v1, vcc, v13, v1
+; GISEL-NEXT:    v_add_i32_e32 v1, vcc, v8, v1
+; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v6, v0
+; GISEL-NEXT:    v_addc_u32_e32 v13, vcc, v7, v1, vcc
+; GISEL-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], s6, v8, 0
+; GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v12
+; GISEL-NEXT:    v_cndmask_b32_e32 v9, v10, v9, vcc
 ; GISEL-NEXT:    v_mad_u64_u32 v[6:7], s[4:5], s6, v13, v[1:2]
-; GISEL-NEXT:    v_xor_b32_e32 v1, v8, v4
-; GISEL-NEXT:    v_ashrrev_i32_e32 v8, 31, v3
-; GISEL-NEXT:    v_mad_u64_u32 v[6:7], s[4:5], s7, v11, v[6:7]
-; GISEL-NEXT:    v_cndmask_b32_e32 v9, v9, v12, vcc
-; GISEL-NEXT:    v_add_i32_e32 v2, vcc, v2, v8
-; GISEL-NEXT:    v_addc_u32_e32 v3, vcc, v3, v8, vcc
-; GISEL-NEXT:    v_xor_b32_e32 v10, v2, v8
+; GISEL-NEXT:    v_xor_b32_e32 v1, v9, v4
+; GISEL-NEXT:    v_ashrrev_i32_e32 v9, 31, v3
+; GISEL-NEXT:    v_mad_u64_u32 v[6:7], s[4:5], s7, v8, v[6:7]
+; GISEL-NEXT:    v_cndmask_b32_e32 v10, v11, v14, vcc
+; GISEL-NEXT:    v_add_i32_e32 v2, vcc, v2, v9
+; GISEL-NEXT:    v_addc_u32_e32 v3, vcc, v3, v9, vcc
+; GISEL-NEXT:    v_xor_b32_e32 v11, v2, v9
 ; GISEL-NEXT:    v_mul_lo_u32 v2, v13, v0
-; GISEL-NEXT:    v_mul_lo_u32 v7, v11, v6
-; GISEL-NEXT:    v_xor_b32_e32 v12, v3, v8
-; GISEL-NEXT:    v_mul_hi_u32 v3, v11, v0
+; GISEL-NEXT:    v_mul_lo_u32 v7, v8, v6
+; GISEL-NEXT:    v_xor_b32_e32 v12, v3, v9
+; GISEL-NEXT:    v_mul_hi_u32 v3, v8, v0
 ; GISEL-NEXT:    v_mul_hi_u32 v0, v13, v0
 ; GISEL-NEXT:    v_add_i32_e32 v2, vcc, v2, v7
 ; GISEL-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
@@ -1280,7 +1268,7 @@ define <2 x i64> @v_srem_v2i64_pow2k_denom(<2 x i64> %num) {
 ; GISEL-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
 ; GISEL-NEXT:    v_mul_lo_u32 v3, v13, v6
 ; GISEL-NEXT:    v_add_i32_e32 v2, vcc, v7, v2
-; GISEL-NEXT:    v_mul_hi_u32 v7, v11, v6
+; GISEL-NEXT:    v_mul_hi_u32 v7, v8, v6
 ; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v3, v0
 ; GISEL-NEXT:    v_cndmask_b32_e64 v3, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v0, v7
@@ -1291,62 +1279,62 @@ define <2 x i64> @v_srem_v2i64_pow2k_denom(<2 x i64> %num) {
 ; GISEL-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
 ; GISEL-NEXT:    v_add_i32_e32 v2, vcc, v6, v2
-; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v11, v0
+; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v8, v0
 ; GISEL-NEXT:    v_addc_u32_e32 v2, vcc, v13, v2, vcc
 ; GISEL-NEXT:    v_mul_lo_u32 v3, v12, v0
-; GISEL-NEXT:    v_mul_lo_u32 v6, v10, v2
-; GISEL-NEXT:    v_mul_hi_u32 v7, v10, v0
+; GISEL-NEXT:    v_mul_lo_u32 v6, v11, v2
+; GISEL-NEXT:    v_mul_hi_u32 v7, v11, v0
 ; GISEL-NEXT:    v_mul_hi_u32 v0, v12, v0
-; GISEL-NEXT:    v_xor_b32_e32 v9, v9, v4
+; GISEL-NEXT:    v_xor_b32_e32 v8, v10, v4
 ; GISEL-NEXT:    v_add_i32_e32 v3, vcc, v3, v6
 ; GISEL-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v3, vcc, v3, v7
 ; GISEL-NEXT:    v_cndmask_b32_e64 v3, 0, 1, vcc
 ; GISEL-NEXT:    v_mul_lo_u32 v7, v12, v2
 ; GISEL-NEXT:    v_add_i32_e32 v3, vcc, v6, v3
-; GISEL-NEXT:    v_mul_hi_u32 v6, v10, v2
+; GISEL-NEXT:    v_mul_hi_u32 v6, v11, v2
 ; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v7, v0
 ; GISEL-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v0, v6
 ; GISEL-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v6, vcc, v7, v6
-; GISEL-NEXT:    v_add_i32_e32 v11, vcc, v0, v3
+; GISEL-NEXT:    v_add_i32_e32 v10, vcc, v0, v3
 ; GISEL-NEXT:    v_mul_hi_u32 v7, v12, v2
-; GISEL-NEXT:    v_mad_u64_u32 v[2:3], s[4:5], v5, v11, 0
+; GISEL-NEXT:    v_mad_u64_u32 v[2:3], s[4:5], v5, v10, 0
 ; GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v6, v0
 ; GISEL-NEXT:    v_add_i32_e32 v6, vcc, v7, v0
 ; GISEL-NEXT:    v_mov_b32_e32 v0, v3
 ; GISEL-NEXT:    v_mad_u64_u32 v[6:7], s[4:5], v5, v6, v[0:1]
 ; GISEL-NEXT:    v_sub_i32_e32 v0, vcc, v1, v4
-; GISEL-NEXT:    v_subb_u32_e32 v1, vcc, v9, v4, vcc
-; GISEL-NEXT:    v_mad_u64_u32 v[3:4], s[4:5], 0, v11, v[6:7]
-; GISEL-NEXT:    v_sub_i32_e32 v2, vcc, v10, v2
+; GISEL-NEXT:    v_subb_u32_e32 v1, vcc, v8, v4, vcc
+; GISEL-NEXT:    v_mad_u64_u32 v[3:4], s[4:5], 0, v10, v[6:7]
+; GISEL-NEXT:    v_sub_i32_e32 v2, vcc, v11, v2
 ; GISEL-NEXT:    v_subb_u32_e64 v4, s[4:5], v12, v3, vcc
 ; GISEL-NEXT:    v_sub_i32_e64 v3, s[4:5], v12, v3
 ; GISEL-NEXT:    v_subbrev_u32_e32 v3, vcc, 0, v3, vcc
 ; GISEL-NEXT:    v_sub_i32_e32 v7, vcc, v2, v5
 ; GISEL-NEXT:    v_subbrev_u32_e32 v3, vcc, 0, v3, vcc
 ; GISEL-NEXT:    v_cmp_ge_u32_e32 vcc, v7, v5
-; GISEL-NEXT:    v_cndmask_b32_e64 v9, 0, -1, vcc
+; GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, -1, vcc
 ; GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
 ; GISEL-NEXT:    v_cmp_ge_u32_e64 s[4:5], v2, v5
-; GISEL-NEXT:    v_cndmask_b32_e32 v9, -1, v9, vcc
+; GISEL-NEXT:    v_cndmask_b32_e32 v8, -1, v8, vcc
 ; GISEL-NEXT:    v_sub_i32_e32 v5, vcc, v7, v5
 ; GISEL-NEXT:    v_cndmask_b32_e64 v6, 0, -1, s[4:5]
 ; GISEL-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v4
 ; GISEL-NEXT:    v_subbrev_u32_e32 v10, vcc, 0, v3, vcc
 ; GISEL-NEXT:    v_cndmask_b32_e64 v6, -1, v6, s[4:5]
-; GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v9
+; GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v8
 ; GISEL-NEXT:    v_cndmask_b32_e32 v5, v7, v5, vcc
 ; GISEL-NEXT:    v_cndmask_b32_e32 v3, v3, v10, vcc
 ; GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v6
 ; GISEL-NEXT:    v_cndmask_b32_e32 v2, v2, v5, vcc
 ; GISEL-NEXT:    v_cndmask_b32_e32 v3, v4, v3, vcc
-; GISEL-NEXT:    v_xor_b32_e32 v2, v2, v8
-; GISEL-NEXT:    v_xor_b32_e32 v3, v3, v8
-; GISEL-NEXT:    v_sub_i32_e32 v2, vcc, v2, v8
-; GISEL-NEXT:    v_subb_u32_e32 v3, vcc, v3, v8, vcc
+; GISEL-NEXT:    v_xor_b32_e32 v2, v2, v9
+; GISEL-NEXT:    v_xor_b32_e32 v3, v3, v9
+; GISEL-NEXT:    v_sub_i32_e32 v2, vcc, v2, v9
+; GISEL-NEXT:    v_subb_u32_e32 v3, vcc, v3, v9, vcc
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; CGP-LABEL: v_srem_v2i64_pow2k_denom:
@@ -1354,241 +1342,227 @@ define <2 x i64> @v_srem_v2i64_pow2k_denom(<2 x i64> %num) {
 ; CGP-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CGP-NEXT:    v_cvt_f32_u32_e32 v4, 0x1000
 ; CGP-NEXT:    v_cvt_f32_ubyte0_e32 v5, 0
-; CGP-NEXT:    s_movk_i32 s7, 0xf000
-; CGP-NEXT:    s_movk_i32 s6, 0x1000
+; CGP-NEXT:    v_mov_b32_e32 v6, 0xfffff000
 ; CGP-NEXT:    v_mac_f32_e32 v4, 0x4f800000, v5
 ; CGP-NEXT:    v_rcp_iflag_f32_e32 v4, v4
 ; CGP-NEXT:    v_mul_f32_e32 v4, 0x5f7ffffc, v4
 ; CGP-NEXT:    v_mul_f32_e32 v5, 0x2f800000, v4
-; CGP-NEXT:    v_trunc_f32_e32 v6, v5
-; CGP-NEXT:    v_mac_f32_e32 v4, 0xcf800000, v6
-; CGP-NEXT:    v_cvt_u32_f32_e32 v7, v4
-; CGP-NEXT:    v_mov_b32_e32 v4, 0xfffff000
-; CGP-NEXT:    v_cvt_u32_f32_e32 v8, v6
-; CGP-NEXT:    v_mad_u64_u32 v[4:5], s[4:5], v4, v7, 0
-; CGP-NEXT:    v_mad_u64_u32 v[5:6], s[4:5], s7, v8, v[5:6]
-; CGP-NEXT:    v_mul_hi_u32 v9, v7, v4
-; CGP-NEXT:    v_mad_u64_u32 v[5:6], s[4:5], -1, v7, v[5:6]
-; CGP-NEXT:    v_mul_lo_u32 v6, v8, v4
-; CGP-NEXT:    v_mul_hi_u32 v4, v8, v4
-; CGP-NEXT:    v_mul_lo_u32 v10, v7, v5
-; CGP-NEXT:    v_mul_lo_u32 v11, v8, v5
-; CGP-NEXT:    v_mul_hi_u32 v12, v7, v5
-; CGP-NEXT:    v_mul_hi_u32 v5, v8, v5
-; CGP-NEXT:    v_add_i32_e32 v6, vcc, v6, v10
-; CGP-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v6, vcc, v6, v9
-; CGP-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v6, vcc, v10, v6
-; CGP-NEXT:    v_add_i32_e32 v4, vcc, v11, v4
-; CGP-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v4, vcc, v4, v12
-; CGP-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v9, vcc, v9, v10
-; CGP-NEXT:    v_add_i32_e32 v4, vcc, v4, v6
-; CGP-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v6, vcc, v9, v6
-; CGP-NEXT:    v_add_i32_e32 v5, vcc, v5, v6
-; CGP-NEXT:    v_add_i32_e32 v9, vcc, v7, v4
-; CGP-NEXT:    v_mad_u64_u32 v[6:7], s[4:5], s7, v9, 0
-; CGP-NEXT:    v_addc_u32_e32 v10, vcc, v8, v5, vcc
-; CGP-NEXT:    v_ashrrev_i32_e32 v5, 31, v1
-; CGP-NEXT:    v_mov_b32_e32 v4, v7
-; CGP-NEXT:    v_mad_u64_u32 v[7:8], s[4:5], s7, v10, v[4:5]
-; CGP-NEXT:    v_add_i32_e32 v4, vcc, v0, v5
-; CGP-NEXT:    v_addc_u32_e32 v11, vcc, v1, v5, vcc
-; CGP-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], -1, v9, v[7:8]
-; CGP-NEXT:    v_xor_b32_e32 v8, v4, v5
-; CGP-NEXT:    v_mul_lo_u32 v1, v10, v6
-; CGP-NEXT:    v_mul_lo_u32 v4, v9, v0
-; CGP-NEXT:    v_mul_hi_u32 v7, v9, v6
-; CGP-NEXT:    v_mul_hi_u32 v6, v10, v6
-; CGP-NEXT:    v_xor_b32_e32 v11, v11, v5
+; CGP-NEXT:    v_trunc_f32_e32 v7, v5
+; CGP-NEXT:    v_mac_f32_e32 v4, 0xcf800000, v7
+; CGP-NEXT:    v_cvt_u32_f32_e32 v8, v4
+; CGP-NEXT:    v_cvt_u32_f32_e32 v9, v7
+; CGP-NEXT:    v_mad_u64_u32 v[4:5], s[4:5], v6, v8, 0
+; CGP-NEXT:    v_mov_b32_e32 v7, v5
+; CGP-NEXT:    v_mad_u64_u32 v[10:11], s[4:5], v6, v9, v[7:8]
+; CGP-NEXT:    v_mul_hi_u32 v12, v9, v4
+; CGP-NEXT:    v_mad_u64_u32 v[13:14], s[4:5], -1, v8, v[10:11]
+; CGP-NEXT:    v_mul_lo_u32 v10, v9, v4
+; CGP-NEXT:    v_mul_hi_u32 v11, v8, v4
+; CGP-NEXT:    v_mul_lo_u32 v4, v8, v13
+; CGP-NEXT:    v_mul_lo_u32 v7, v9, v13
+; CGP-NEXT:    v_mul_hi_u32 v14, v8, v13
+; CGP-NEXT:    v_mul_hi_u32 v13, v9, v13
+; CGP-NEXT:    v_add_i32_e32 v4, vcc, v10, v4
+; CGP-NEXT:    v_cndmask_b32_e64 v15, 0, 1, vcc
+; CGP-NEXT:    v_add_i32_e32 v4, vcc, v4, v11
+; CGP-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc
+; CGP-NEXT:    v_add_i32_e32 v4, vcc, v15, v4
+; CGP-NEXT:    v_add_i32_e32 v7, vcc, v7, v12
+; CGP-NEXT:    v_cndmask_b32_e64 v15, 0, 1, vcc
+; CGP-NEXT:    v_add_i32_e32 v7, vcc, v7, v14
+; CGP-NEXT:    v_cndmask_b32_e64 v14, 0, 1, vcc
+; CGP-NEXT:    v_add_i32_e32 v14, vcc, v15, v14
+; CGP-NEXT:    v_add_i32_e32 v4, vcc, v7, v4
+; CGP-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
+; CGP-NEXT:    v_add_i32_e32 v7, vcc, v14, v7
+; CGP-NEXT:    v_add_i32_e32 v7, vcc, v13, v7
+; CGP-NEXT:    v_add_i32_e32 v16, vcc, v8, v4
+; CGP-NEXT:    v_mad_u64_u32 v[13:14], s[4:5], v6, v16, 0
+; CGP-NEXT:    v_addc_u32_e32 v17, vcc, v9, v7, vcc
+; CGP-NEXT:    v_mov_b32_e32 v4, v14
+; CGP-NEXT:    v_mad_u64_u32 v[14:15], s[4:5], v6, v17, v[4:5]
+; CGP-NEXT:    v_ashrrev_i32_e32 v7, 31, v1
+; CGP-NEXT:    v_add_i32_e32 v0, vcc, v0, v7
+; CGP-NEXT:    v_mad_u64_u32 v[14:15], s[4:5], -1, v16, v[14:15]
+; CGP-NEXT:    v_addc_u32_e32 v1, vcc, v1, v7, vcc
+; CGP-NEXT:    v_xor_b32_e32 v15, v0, v7
+; CGP-NEXT:    v_mul_lo_u32 v0, v17, v13
+; CGP-NEXT:    v_mul_lo_u32 v4, v16, v14
+; CGP-NEXT:    v_xor_b32_e32 v18, v1, v7
+; CGP-NEXT:    v_mul_hi_u32 v1, v16, v13
+; CGP-NEXT:    v_mul_hi_u32 v13, v17, v13
+; CGP-NEXT:    v_add_i32_e32 v0, vcc, v0, v4
+; CGP-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc
+; CGP-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
+; CGP-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; CGP-NEXT:    v_mul_lo_u32 v1, v17, v14
+; CGP-NEXT:    v_add_i32_e32 v0, vcc, v4, v0
+; CGP-NEXT:    v_mul_hi_u32 v4, v16, v14
+; CGP-NEXT:    v_add_i32_e32 v1, vcc, v1, v13
+; CGP-NEXT:    v_cndmask_b32_e64 v13, 0, 1, vcc
 ; CGP-NEXT:    v_add_i32_e32 v1, vcc, v1, v4
 ; CGP-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v1, vcc, v1, v7
+; CGP-NEXT:    v_add_i32_e32 v4, vcc, v13, v4
+; CGP-NEXT:    v_mul_hi_u32 v13, v17, v14
+; CGP-NEXT:    v_add_i32_e32 v0, vcc, v1, v0
 ; CGP-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
-; CGP-NEXT:    v_mul_lo_u32 v7, v10, v0
-; CGP-NEXT:    v_add_i32_e32 v1, vcc, v4, v1
-; CGP-NEXT:    v_mul_hi_u32 v4, v9, v0
-; CGP-NEXT:    v_add_i32_e32 v6, vcc, v7, v6
-; CGP-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v4, vcc, v6, v4
-; CGP-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v6, vcc, v7, v6
-; CGP-NEXT:    v_mul_hi_u32 v0, v10, v0
 ; CGP-NEXT:    v_add_i32_e32 v1, vcc, v4, v1
-; CGP-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v4, vcc, v6, v4
-; CGP-NEXT:    v_add_i32_e32 v0, vcc, v0, v4
-; CGP-NEXT:    v_add_i32_e32 v1, vcc, v9, v1
-; CGP-NEXT:    v_addc_u32_e32 v0, vcc, v10, v0, vcc
-; CGP-NEXT:    v_mul_lo_u32 v6, v11, v1
-; CGP-NEXT:    v_mul_lo_u32 v7, v8, v0
-; CGP-NEXT:    v_mul_hi_u32 v9, v8, v1
-; CGP-NEXT:    v_mul_hi_u32 v1, v11, v1
+; CGP-NEXT:    v_add_i32_e32 v1, vcc, v13, v1
+; CGP-NEXT:    v_add_i32_e32 v0, vcc, v16, v0
+; CGP-NEXT:    v_addc_u32_e32 v1, vcc, v17, v1, vcc
+; CGP-NEXT:    v_mul_lo_u32 v13, v18, v0
+; CGP-NEXT:    v_mul_lo_u32 v14, v15, v1
+; CGP-NEXT:    v_mul_hi_u32 v16, v15, v0
+; CGP-NEXT:    v_mul_hi_u32 v0, v18, v0
 ; CGP-NEXT:    v_mov_b32_e32 v4, 0x1000
-; CGP-NEXT:    v_add_i32_e32 v6, vcc, v6, v7
-; CGP-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v6, vcc, v6, v9
-; CGP-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc
-; CGP-NEXT:    v_mul_lo_u32 v9, v11, v0
-; CGP-NEXT:    v_add_i32_e32 v6, vcc, v7, v6
-; CGP-NEXT:    v_mul_hi_u32 v7, v8, v0
-; CGP-NEXT:    v_add_i32_e32 v1, vcc, v9, v1
-; CGP-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v1, vcc, v1, v7
-; CGP-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v7, vcc, v9, v7
-; CGP-NEXT:    v_add_i32_e32 v1, vcc, v1, v6
-; CGP-NEXT:    v_mul_hi_u32 v9, v11, v0
-; CGP-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], s6, v1, 0
-; CGP-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v6, vcc, v7, v6
-; CGP-NEXT:    v_add_i32_e32 v6, vcc, v9, v6
-; CGP-NEXT:    v_mad_u64_u32 v[6:7], s[4:5], s6, v6, v[1:2]
-; CGP-NEXT:    v_sub_i32_e32 v8, vcc, v8, v0
-; CGP-NEXT:    v_subb_u32_e64 v9, s[4:5], v11, v6, vcc
-; CGP-NEXT:    v_sub_i32_e64 v0, s[4:5], v11, v6
-; CGP-NEXT:    v_cmp_ge_u32_e64 s[4:5], v8, v4
-; CGP-NEXT:    v_cndmask_b32_e64 v1, 0, -1, s[4:5]
-; CGP-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v9
-; CGP-NEXT:    v_cndmask_b32_e64 v10, -1, v1, s[4:5]
-; CGP-NEXT:    v_cvt_f32_u32_e32 v1, 0x1000
-; CGP-NEXT:    v_cvt_f32_ubyte0_e32 v6, 0
+; CGP-NEXT:    v_add_i32_e32 v13, vcc, v13, v14
+; CGP-NEXT:    v_cndmask_b32_e64 v14, 0, 1, vcc
+; CGP-NEXT:    v_add_i32_e32 v13, vcc, v13, v16
+; CGP-NEXT:    v_cndmask_b32_e64 v13, 0, 1, vcc
+; CGP-NEXT:    v_mul_lo_u32 v16, v18, v1
+; CGP-NEXT:    v_add_i32_e32 v13, vcc, v14, v13
+; CGP-NEXT:    v_mul_hi_u32 v14, v15, v1
+; CGP-NEXT:    v_add_i32_e32 v0, vcc, v16, v0
+; CGP-NEXT:    v_cndmask_b32_e64 v16, 0, 1, vcc
+; CGP-NEXT:    v_add_i32_e32 v0, vcc, v0, v14
+; CGP-NEXT:    v_cndmask_b32_e64 v14, 0, 1, vcc
+; CGP-NEXT:    v_add_i32_e32 v14, vcc, v16, v14
+; CGP-NEXT:    v_add_i32_e32 v0, vcc, v0, v13
+; CGP-NEXT:    v_mul_hi_u32 v16, v18, v1
+; CGP-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v4, v0, 0
+; CGP-NEXT:    v_cndmask_b32_e64 v13, 0, 1, vcc
+; CGP-NEXT:    v_add_i32_e32 v13, vcc, v14, v13
+; CGP-NEXT:    v_add_i32_e32 v13, vcc, v16, v13
+; CGP-NEXT:    v_mad_u64_u32 v[13:14], s[4:5], v4, v13, v[1:2]
+; CGP-NEXT:    v_sub_i32_e32 v14, vcc, v15, v0
+; CGP-NEXT:    v_sub_i32_e64 v0, s[4:5], v18, v13
+; CGP-NEXT:    v_subb_u32_e64 v15, s[4:5], v18, v13, vcc
 ; CGP-NEXT:    v_subbrev_u32_e32 v0, vcc, 0, v0, vcc
-; CGP-NEXT:    v_mac_f32_e32 v1, 0x4f800000, v6
-; CGP-NEXT:    v_rcp_iflag_f32_e32 v1, v1
-; CGP-NEXT:    v_sub_i32_e32 v11, vcc, v8, v4
-; CGP-NEXT:    v_subbrev_u32_e32 v12, vcc, 0, v0, vcc
-; CGP-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v1
-; CGP-NEXT:    v_mul_f32_e32 v1, 0x2f800000, v0
-; CGP-NEXT:    v_trunc_f32_e32 v6, v1
-; CGP-NEXT:    v_mac_f32_e32 v0, 0xcf800000, v6
-; CGP-NEXT:    v_cvt_u32_f32_e32 v13, v0
-; CGP-NEXT:    v_cvt_u32_f32_e32 v14, v6
-; CGP-NEXT:    v_cmp_ge_u32_e32 vcc, v11, v4
-; CGP-NEXT:    v_cndmask_b32_e64 v7, 0, -1, vcc
-; CGP-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], s7, v13, 0
-; CGP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v12
-; CGP-NEXT:    v_cndmask_b32_e32 v15, -1, v7, vcc
-; CGP-NEXT:    v_mad_u64_u32 v[6:7], s[4:5], s7, v14, v[1:2]
-; CGP-NEXT:    v_sub_i32_e32 v1, vcc, v11, v4
-; CGP-NEXT:    v_mad_u64_u32 v[6:7], s[4:5], -1, v13, v[6:7]
-; CGP-NEXT:    v_subbrev_u32_e32 v16, vcc, 0, v12, vcc
-; CGP-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v15
-; CGP-NEXT:    v_cndmask_b32_e32 v7, v11, v1, vcc
-; CGP-NEXT:    v_mul_lo_u32 v1, v14, v0
-; CGP-NEXT:    v_mul_lo_u32 v11, v13, v6
-; CGP-NEXT:    v_mul_hi_u32 v15, v13, v0
-; CGP-NEXT:    v_cndmask_b32_e32 v12, v12, v16, vcc
-; CGP-NEXT:    v_mul_hi_u32 v0, v14, v0
+; CGP-NEXT:    v_cmp_ge_u32_e64 s[4:5], v14, v4
+; CGP-NEXT:    v_sub_i32_e32 v16, vcc, v14, v4
+; CGP-NEXT:    v_cndmask_b32_e64 v1, 0, -1, s[4:5]
+; CGP-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v15
+; CGP-NEXT:    v_subbrev_u32_e32 v17, vcc, 0, v0, vcc
+; CGP-NEXT:    v_mov_b32_e32 v0, v5
+; CGP-NEXT:    v_cndmask_b32_e64 v13, -1, v1, s[4:5]
+; CGP-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v6, v9, v[0:1]
+; CGP-NEXT:    v_cmp_ge_u32_e32 vcc, v16, v4
+; CGP-NEXT:    v_cndmask_b32_e64 v18, 0, -1, vcc
+; CGP-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], -1, v8, v[0:1]
+; CGP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v17
+; CGP-NEXT:    v_cndmask_b32_e32 v5, -1, v18, vcc
+; CGP-NEXT:    v_mul_lo_u32 v19, v8, v0
+; CGP-NEXT:    v_sub_i32_e32 v1, vcc, v16, v4
+; CGP-NEXT:    v_subbrev_u32_e32 v18, vcc, 0, v17, vcc
+; CGP-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v5
+; CGP-NEXT:    v_cndmask_b32_e32 v5, v16, v1, vcc
+; CGP-NEXT:    v_cndmask_b32_e32 v16, v17, v18, vcc
+; CGP-NEXT:    v_add_i32_e32 v1, vcc, v10, v19
+; CGP-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
 ; CGP-NEXT:    v_add_i32_e32 v1, vcc, v1, v11
-; CGP-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v1, vcc, v1, v15
 ; CGP-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
-; CGP-NEXT:    v_mul_lo_u32 v15, v14, v6
-; CGP-NEXT:    v_add_i32_e32 v1, vcc, v11, v1
-; CGP-NEXT:    v_mul_hi_u32 v11, v13, v6
-; CGP-NEXT:    v_add_i32_e32 v0, vcc, v15, v0
-; CGP-NEXT:    v_cndmask_b32_e64 v15, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v0, vcc, v0, v11
+; CGP-NEXT:    v_mul_lo_u32 v11, v9, v0
+; CGP-NEXT:    v_add_i32_e32 v1, vcc, v10, v1
+; CGP-NEXT:    v_mul_hi_u32 v10, v8, v0
+; CGP-NEXT:    v_add_i32_e32 v11, vcc, v11, v12
+; CGP-NEXT:    v_cndmask_b32_e64 v12, 0, 1, vcc
+; CGP-NEXT:    v_add_i32_e32 v10, vcc, v11, v10
 ; CGP-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v11, vcc, v15, v11
-; CGP-NEXT:    v_mul_hi_u32 v6, v14, v6
-; CGP-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
-; CGP-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v1, vcc, v11, v1
-; CGP-NEXT:    v_add_i32_e32 v1, vcc, v6, v1
-; CGP-NEXT:    v_add_i32_e32 v11, vcc, v13, v0
-; CGP-NEXT:    v_addc_u32_e32 v13, vcc, v14, v1, vcc
-; CGP-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], s7, v11, 0
-; CGP-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v10
-; CGP-NEXT:    v_cndmask_b32_e32 v6, v8, v7, vcc
-; CGP-NEXT:    v_cndmask_b32_e32 v8, v9, v12, vcc
-; CGP-NEXT:    v_xor_b32_e32 v9, v6, v5
-; CGP-NEXT:    v_mad_u64_u32 v[6:7], s[4:5], s7, v13, v[1:2]
-; CGP-NEXT:    v_xor_b32_e32 v1, v8, v5
-; CGP-NEXT:    v_ashrrev_i32_e32 v8, 31, v3
-; CGP-NEXT:    v_mad_u64_u32 v[6:7], s[4:5], -1, v11, v[6:7]
-; CGP-NEXT:    v_add_i32_e32 v2, vcc, v2, v8
-; CGP-NEXT:    v_addc_u32_e32 v3, vcc, v3, v8, vcc
-; CGP-NEXT:    v_xor_b32_e32 v7, v2, v8
-; CGP-NEXT:    v_mul_lo_u32 v2, v13, v0
-; CGP-NEXT:    v_mul_lo_u32 v10, v11, v6
-; CGP-NEXT:    v_xor_b32_e32 v12, v3, v8
-; CGP-NEXT:    v_mul_hi_u32 v3, v11, v0
-; CGP-NEXT:    v_mul_hi_u32 v0, v13, v0
-; CGP-NEXT:    v_add_i32_e32 v2, vcc, v2, v10
+; CGP-NEXT:    v_add_i32_e32 v11, vcc, v12, v11
+; CGP-NEXT:    v_mul_hi_u32 v0, v9, v0
+; CGP-NEXT:    v_add_i32_e32 v1, vcc, v10, v1
 ; CGP-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
+; CGP-NEXT:    v_add_i32_e32 v10, vcc, v11, v10
+; CGP-NEXT:    v_add_i32_e32 v0, vcc, v0, v10
+; CGP-NEXT:    v_add_i32_e32 v8, vcc, v8, v1
+; CGP-NEXT:    v_addc_u32_e32 v9, vcc, v9, v0, vcc
+; CGP-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v6, v8, 0
+; CGP-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v13
+; CGP-NEXT:    v_cndmask_b32_e32 v5, v14, v5, vcc
+; CGP-NEXT:    v_xor_b32_e32 v11, v5, v7
+; CGP-NEXT:    v_mad_u64_u32 v[5:6], s[4:5], v6, v9, v[1:2]
+; CGP-NEXT:    v_cndmask_b32_e32 v10, v15, v16, vcc
+; CGP-NEXT:    v_xor_b32_e32 v1, v10, v7
+; CGP-NEXT:    v_mad_u64_u32 v[5:6], s[4:5], -1, v8, v[5:6]
+; CGP-NEXT:    v_ashrrev_i32_e32 v10, 31, v3
+; CGP-NEXT:    v_add_i32_e32 v2, vcc, v2, v10
+; CGP-NEXT:    v_addc_u32_e32 v3, vcc, v3, v10, vcc
+; CGP-NEXT:    v_xor_b32_e32 v12, v2, v10
+; CGP-NEXT:    v_mul_lo_u32 v2, v9, v0
+; CGP-NEXT:    v_mul_lo_u32 v6, v8, v5
+; CGP-NEXT:    v_xor_b32_e32 v13, v3, v10
+; CGP-NEXT:    v_mul_hi_u32 v3, v8, v0
+; CGP-NEXT:    v_mul_hi_u32 v0, v9, v0
+; CGP-NEXT:    v_add_i32_e32 v2, vcc, v2, v6
+; CGP-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc
 ; CGP-NEXT:    v_add_i32_e32 v2, vcc, v2, v3
 ; CGP-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
-; CGP-NEXT:    v_mul_lo_u32 v3, v13, v6
-; CGP-NEXT:    v_add_i32_e32 v2, vcc, v10, v2
-; CGP-NEXT:    v_mul_hi_u32 v10, v11, v6
+; CGP-NEXT:    v_mul_lo_u32 v3, v9, v5
+; CGP-NEXT:    v_add_i32_e32 v2, vcc, v6, v2
+; CGP-NEXT:    v_mul_hi_u32 v6, v8, v5
 ; CGP-NEXT:    v_add_i32_e32 v0, vcc, v3, v0
 ; CGP-NEXT:    v_cndmask_b32_e64 v3, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v0, vcc, v0, v10
-; CGP-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v3, vcc, v3, v10
-; CGP-NEXT:    v_mul_hi_u32 v6, v13, v6
+; CGP-NEXT:    v_add_i32_e32 v0, vcc, v0, v6
+; CGP-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc
+; CGP-NEXT:    v_add_i32_e32 v3, vcc, v3, v6
+; CGP-NEXT:    v_mul_hi_u32 v5, v9, v5
 ; CGP-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
 ; CGP-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
 ; CGP-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
-; CGP-NEXT:    v_add_i32_e32 v2, vcc, v6, v2
-; CGP-NEXT:    v_add_i32_e32 v3, vcc, v11, v0
-; CGP-NEXT:    v_addc_u32_e32 v2, vcc, v13, v2, vcc
-; CGP-NEXT:    v_mul_lo_u32 v6, v12, v3
-; CGP-NEXT:    v_mul_lo_u32 v10, v7, v2
-; CGP-NEXT:    v_sub_i32_e32 v0, vcc, v9, v5
-; CGP-NEXT:    v_subb_u32_e32 v1, vcc, v1, v5, vcc
-; CGP-NEXT:    v_mul_hi_u32 v5, v7, v3
-; CGP-NEXT:    v_add_i32_e32 v6, vcc, v6, v10
-; CGP-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v5, vcc, v6, v5
-; CGP-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
+; CGP-NEXT:    v_add_i32_e32 v2, vcc, v5, v2
+; CGP-NEXT:    v_add_i32_e32 v3, vcc, v8, v0
+; CGP-NEXT:    v_addc_u32_e32 v2, vcc, v9, v2, vcc
+; CGP-NEXT:    v_mul_lo_u32 v5, v13, v3
 ; CGP-NEXT:    v_mul_lo_u32 v6, v12, v2
-; CGP-NEXT:    v_mul_hi_u32 v3, v12, v3
-; CGP-NEXT:    v_add_i32_e32 v5, vcc, v9, v5
-; CGP-NEXT:    v_mul_hi_u32 v9, v7, v2
-; CGP-NEXT:    v_add_i32_e32 v3, vcc, v6, v3
+; CGP-NEXT:    v_sub_i32_e32 v0, vcc, v11, v7
+; CGP-NEXT:    v_subb_u32_e32 v1, vcc, v1, v7, vcc
+; CGP-NEXT:    v_mul_hi_u32 v7, v12, v3
+; CGP-NEXT:    v_add_i32_e32 v5, vcc, v5, v6
 ; CGP-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v3, vcc, v3, v9
-; CGP-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v6, vcc, v6, v9
+; CGP-NEXT:    v_add_i32_e32 v5, vcc, v5, v7
+; CGP-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
+; CGP-NEXT:    v_mul_lo_u32 v7, v13, v2
+; CGP-NEXT:    v_mul_hi_u32 v3, v13, v3
+; CGP-NEXT:    v_add_i32_e32 v5, vcc, v6, v5
+; CGP-NEXT:    v_mul_hi_u32 v6, v12, v2
+; CGP-NEXT:    v_add_i32_e32 v3, vcc, v7, v3
+; CGP-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
+; CGP-NEXT:    v_add_i32_e32 v3, vcc, v3, v6
+; CGP-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc
+; CGP-NEXT:    v_add_i32_e32 v6, vcc, v7, v6
 ; CGP-NEXT:    v_add_i32_e32 v3, vcc, v3, v5
-; CGP-NEXT:    v_mul_hi_u32 v9, v12, v2
-; CGP-NEXT:    v_mad_u64_u32 v[2:3], s[4:5], s6, v3, 0
+; CGP-NEXT:    v_mul_hi_u32 v7, v13, v2
+; CGP-NEXT:    v_mad_u64_u32 v[2:3], s[4:5], v4, v3, 0
 ; CGP-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
 ; CGP-NEXT:    v_add_i32_e32 v5, vcc, v6, v5
-; CGP-NEXT:    v_add_i32_e32 v5, vcc, v9, v5
-; CGP-NEXT:    v_mad_u64_u32 v[5:6], s[4:5], s6, v5, v[3:4]
-; CGP-NEXT:    v_sub_i32_e32 v2, vcc, v7, v2
-; CGP-NEXT:    v_subb_u32_e64 v3, s[4:5], v12, v5, vcc
-; CGP-NEXT:    v_sub_i32_e64 v5, s[4:5], v12, v5
+; CGP-NEXT:    v_add_i32_e32 v5, vcc, v7, v5
+; CGP-NEXT:    v_mad_u64_u32 v[5:6], s[4:5], v4, v5, v[3:4]
+; CGP-NEXT:    v_sub_i32_e32 v2, vcc, v12, v2
+; CGP-NEXT:    v_subb_u32_e64 v3, s[4:5], v13, v5, vcc
+; CGP-NEXT:    v_sub_i32_e64 v5, s[4:5], v13, v5
 ; CGP-NEXT:    v_subbrev_u32_e32 v5, vcc, 0, v5, vcc
 ; CGP-NEXT:    v_sub_i32_e32 v7, vcc, v2, v4
 ; CGP-NEXT:    v_subbrev_u32_e32 v5, vcc, 0, v5, vcc
 ; CGP-NEXT:    v_cmp_ge_u32_e32 vcc, v7, v4
-; CGP-NEXT:    v_cndmask_b32_e64 v9, 0, -1, vcc
+; CGP-NEXT:    v_cndmask_b32_e64 v8, 0, -1, vcc
 ; CGP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v5
 ; CGP-NEXT:    v_cmp_ge_u32_e64 s[4:5], v2, v4
-; CGP-NEXT:    v_cndmask_b32_e32 v9, -1, v9, vcc
+; CGP-NEXT:    v_cndmask_b32_e32 v8, -1, v8, vcc
 ; CGP-NEXT:    v_sub_i32_e32 v4, vcc, v7, v4
 ; CGP-NEXT:    v_cndmask_b32_e64 v6, 0, -1, s[4:5]
 ; CGP-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v3
-; CGP-NEXT:    v_subbrev_u32_e32 v10, vcc, 0, v5, vcc
+; CGP-NEXT:    v_subbrev_u32_e32 v9, vcc, 0, v5, vcc
 ; CGP-NEXT:    v_cndmask_b32_e64 v6, -1, v6, s[4:5]
-; CGP-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v9
+; CGP-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v8
 ; CGP-NEXT:    v_cndmask_b32_e32 v4, v7, v4, vcc
-; CGP-NEXT:    v_cndmask_b32_e32 v5, v5, v10, vcc
+; CGP-NEXT:    v_cndmask_b32_e32 v5, v5, v9, vcc
 ; CGP-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v6
 ; CGP-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
 ; CGP-NEXT:    v_cndmask_b32_e32 v3, v3, v5, vcc
-; CGP-NEXT:    v_xor_b32_e32 v2, v2, v8
-; CGP-NEXT:    v_xor_b32_e32 v3, v3, v8
-; CGP-NEXT:    v_sub_i32_e32 v2, vcc, v2, v8
-; CGP-NEXT:    v_subb_u32_e32 v3, vcc, v3, v8, vcc
+; CGP-NEXT:    v_xor_b32_e32 v2, v2, v10
+; CGP-NEXT:    v_xor_b32_e32 v3, v3, v10
+; CGP-NEXT:    v_sub_i32_e32 v2, vcc, v2, v10
+; CGP-NEXT:    v_subb_u32_e32 v3, vcc, v3, v10, vcc
 ; CGP-NEXT:    s_setpc_b64 s[30:31]
   %result = srem <2 x i64> %num, <i64 4096, i64 4096>
   ret <2 x i64> %result
@@ -1600,7 +1574,7 @@ define i64 @v_srem_i64_oddk_denom(i64 %num) {
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    v_cvt_f32_u32_e32 v2, 0x12d8fb
 ; CHECK-NEXT:    v_cvt_f32_ubyte0_e32 v3, 0
-; CHECK-NEXT:    s_mov_b32 s6, 0xffed2705
+; CHECK-NEXT:    v_mov_b32_e32 v6, 0xffed2705
 ; CHECK-NEXT:    v_mac_f32_e32 v2, 0x4f800000, v3
 ; CHECK-NEXT:    v_rcp_iflag_f32_e32 v2, v2
 ; CHECK-NEXT:    v_mul_f32_e32 v2, 0x5f7ffffc, v2
@@ -1608,116 +1582,114 @@ define i64 @v_srem_i64_oddk_denom(i64 %num) {
 ; CHECK-NEXT:    v_trunc_f32_e32 v4, v3
 ; CHECK-NEXT:    v_mac_f32_e32 v2, 0xcf800000, v4
 ; CHECK-NEXT:    v_cvt_u32_f32_e32 v5, v2
-; CHECK-NEXT:    v_mov_b32_e32 v2, 0xffed2705
-; CHECK-NEXT:    v_cvt_u32_f32_e32 v6, v4
-; CHECK-NEXT:    v_mad_u64_u32 v[2:3], s[4:5], v2, v5, 0
-; CHECK-NEXT:    v_mad_u64_u32 v[3:4], s[4:5], s6, v6, v[3:4]
-; CHECK-NEXT:    v_mul_hi_u32 v7, v5, v2
+; CHECK-NEXT:    v_cvt_u32_f32_e32 v7, v4
+; CHECK-NEXT:    v_mad_u64_u32 v[2:3], s[4:5], v6, v5, 0
+; CHECK-NEXT:    v_mad_u64_u32 v[3:4], s[4:5], v6, v7, v[3:4]
+; CHECK-NEXT:    v_mul_hi_u32 v8, v5, v2
 ; CHECK-NEXT:    v_mad_u64_u32 v[3:4], s[4:5], -1, v5, v[3:4]
-; CHECK-NEXT:    v_mul_lo_u32 v4, v6, v2
-; CHECK-NEXT:    v_mul_hi_u32 v2, v6, v2
-; CHECK-NEXT:    v_mul_lo_u32 v8, v5, v3
-; CHECK-NEXT:    v_mul_lo_u32 v9, v6, v3
-; CHECK-NEXT:    v_mul_hi_u32 v10, v5, v3
-; CHECK-NEXT:    v_mul_hi_u32 v3, v6, v3
+; CHECK-NEXT:    v_mul_lo_u32 v4, v7, v2
+; CHECK-NEXT:    v_mul_hi_u32 v2, v7, v2
+; CHECK-NEXT:    v_mul_lo_u32 v9, v5, v3
+; CHECK-NEXT:    v_mul_lo_u32 v10, v7, v3
+; CHECK-NEXT:    v_mul_hi_u32 v11, v5, v3
+; CHECK-NEXT:    v_mul_hi_u32 v3, v7, v3
+; CHECK-NEXT:    v_add_i32_e32 v4, vcc, v4, v9
+; CHECK-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
+; CHECK-NEXT:    v_add_i32_e32 v2, vcc, v10, v2
+; CHECK-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
 ; CHECK-NEXT:    v_add_i32_e32 v4, vcc, v4, v8
-; CHECK-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
-; CHECK-NEXT:    v_add_i32_e32 v4, vcc, v4, v7
 ; CHECK-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc
-; CHECK-NEXT:    v_add_i32_e32 v4, vcc, v8, v4
-; CHECK-NEXT:    v_add_i32_e32 v2, vcc, v9, v2
-; CHECK-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
-; CHECK-NEXT:    v_add_i32_e32 v2, vcc, v2, v10
+; CHECK-NEXT:    v_add_i32_e32 v4, vcc, v9, v4
+; CHECK-NEXT:    v_add_i32_e32 v2, vcc, v2, v11
 ; CHECK-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
-; CHECK-NEXT:    v_add_i32_e32 v7, vcc, v7, v8
+; CHECK-NEXT:    v_add_i32_e32 v8, vcc, v10, v8
 ; CHECK-NEXT:    v_add_i32_e32 v2, vcc, v2, v4
 ; CHECK-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc
-; CHECK-NEXT:    v_add_i32_e32 v4, vcc, v7, v4
+; CHECK-NEXT:    v_add_i32_e32 v4, vcc, v8, v4
 ; CHECK-NEXT:    v_add_i32_e32 v3, vcc, v3, v4
 ; CHECK-NEXT:    v_add_i32_e32 v5, vcc, v5, v2
-; CHECK-NEXT:    v_addc_u32_e32 v6, vcc, v6, v3, vcc
-; CHECK-NEXT:    v_mad_u64_u32 v[2:3], s[4:5], s6, v5, 0
-; CHECK-NEXT:    v_ashrrev_i32_e32 v7, 31, v1
-; CHECK-NEXT:    v_add_i32_e32 v8, vcc, v0, v7
-; CHECK-NEXT:    v_mad_u64_u32 v[3:4], s[4:5], s6, v6, v[3:4]
-; CHECK-NEXT:    v_addc_u32_e32 v9, vcc, v1, v7, vcc
-; CHECK-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], -1, v5, v[3:4]
-; CHECK-NEXT:    v_xor_b32_e32 v3, v8, v7
-; CHECK-NEXT:    v_mul_lo_u32 v1, v6, v2
-; CHECK-NEXT:    v_mul_lo_u32 v8, v5, v0
-; CHECK-NEXT:    v_xor_b32_e32 v4, v9, v7
-; CHECK-NEXT:    v_mul_hi_u32 v9, v5, v2
-; CHECK-NEXT:    v_mul_hi_u32 v2, v6, v2
+; CHECK-NEXT:    v_addc_u32_e32 v7, vcc, v7, v3, vcc
+; CHECK-NEXT:    v_mad_u64_u32 v[2:3], s[4:5], v6, v5, 0
+; CHECK-NEXT:    v_mad_u64_u32 v[3:4], s[4:5], v6, v7, v[3:4]
+; CHECK-NEXT:    v_ashrrev_i32_e32 v6, 31, v1
+; CHECK-NEXT:    v_add_i32_e32 v0, vcc, v0, v6
+; CHECK-NEXT:    v_mad_u64_u32 v[3:4], s[4:5], -1, v5, v[3:4]
+; CHECK-NEXT:    v_addc_u32_e32 v1, vcc, v1, v6, vcc
+; CHECK-NEXT:    v_xor_b32_e32 v4, v0, v6
+; CHECK-NEXT:    v_mul_lo_u32 v0, v7, v2
+; CHECK-NEXT:    v_mul_lo_u32 v8, v5, v3
+; CHECK-NEXT:    v_xor_b32_e32 v9, v1, v6
+; CHECK-NEXT:    v_mul_hi_u32 v1, v5, v2
+; CHECK-NEXT:    v_mul_hi_u32 v2, v7, v2
+; CHECK-NEXT:    v_add_i32_e32 v0, vcc, v0, v8
+; CHECK-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
+; CHECK-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
+; CHECK-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; CHECK-NEXT:    v_mul_lo_u32 v1, v7, v3
+; CHECK-NEXT:    v_add_i32_e32 v0, vcc, v8, v0
+; CHECK-NEXT:    v_mul_hi_u32 v8, v5, v3
+; CHECK-NEXT:    v_add_i32_e32 v1, vcc, v1, v2
+; CHECK-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
 ; CHECK-NEXT:    v_add_i32_e32 v1, vcc, v1, v8
 ; CHECK-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
-; CHECK-NEXT:    v_add_i32_e32 v1, vcc, v1, v9
-; CHECK-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
-; CHECK-NEXT:    v_mul_lo_u32 v9, v6, v0
-; CHECK-NEXT:    v_add_i32_e32 v1, vcc, v8, v1
-; CHECK-NEXT:    v_mul_hi_u32 v8, v5, v0
-; CHECK-NEXT:    v_add_i32_e32 v2, vcc, v9, v2
-; CHECK-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
 ; CHECK-NEXT:    v_add_i32_e32 v2, vcc, v2, v8
-; CHECK-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
-; CHECK-NEXT:    v_add_i32_e32 v8, vcc, v9, v8
-; CHECK-NEXT:    v_mul_hi_u32 v0, v6, v0
+; CHECK-NEXT:    v_mul_hi_u32 v3, v7, v3
+; CHECK-NEXT:    v_add_i32_e32 v0, vcc, v1, v0
+; CHECK-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
 ; CHECK-NEXT:    v_add_i32_e32 v1, vcc, v2, v1
+; CHECK-NEXT:    v_add_i32_e32 v1, vcc, v3, v1
+; CHECK-NEXT:    v_add_i32_e32 v0, vcc, v5, v0
+; CHECK-NEXT:    v_addc_u32_e32 v1, vcc, v7, v1, vcc
+; CHECK-NEXT:    v_mul_lo_u32 v2, v9, v0
+; CHECK-NEXT:    v_mul_lo_u32 v3, v4, v1
+; CHECK-NEXT:    v_mul_hi_u32 v7, v4, v0
+; CHECK-NEXT:    v_mul_hi_u32 v0, v9, v0
+; CHECK-NEXT:    v_mov_b32_e32 v5, 0x12d8fb
+; CHECK-NEXT:    v_add_i32_e32 v2, vcc, v2, v3
+; CHECK-NEXT:    v_cndmask_b32_e64 v3, 0, 1, vcc
+; CHECK-NEXT:    v_add_i32_e32 v2, vcc, v2, v7
 ; CHECK-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
-; CHECK-NEXT:    v_add_i32_e32 v2, vcc, v8, v2
+; CHECK-NEXT:    v_mul_lo_u32 v7, v9, v1
+; CHECK-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
+; CHECK-NEXT:    v_mul_hi_u32 v3, v4, v1
+; CHECK-NEXT:    v_add_i32_e32 v0, vcc, v7, v0
+; CHECK-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
+; CHECK-NEXT:    v_add_i32_e32 v0, vcc, v0, v3
+; CHECK-NEXT:    v_cndmask_b32_e64 v3, 0, 1, vcc
+; CHECK-NEXT:    v_add_i32_e32 v3, vcc, v7, v3
 ; CHECK-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
-; CHECK-NEXT:    v_add_i32_e32 v1, vcc, v5, v1
-; CHECK-NEXT:    v_addc_u32_e32 v0, vcc, v6, v0, vcc
-; CHECK-NEXT:    v_mul_lo_u32 v2, v4, v1
-; CHECK-NEXT:    v_mul_lo_u32 v5, v3, v0
-; CHECK-NEXT:    v_mul_hi_u32 v8, v3, v1
-; CHECK-NEXT:    v_mul_hi_u32 v1, v4, v1
-; CHECK-NEXT:    s_mov_b32 s6, 0x12d8fb
-; CHECK-NEXT:    v_add_i32_e32 v2, vcc, v2, v5
-; CHECK-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
-; CHECK-NEXT:    v_add_i32_e32 v2, vcc, v2, v8
+; CHECK-NEXT:    v_mul_hi_u32 v7, v9, v1
+; CHECK-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v5, v0, 0
 ; CHECK-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
-; CHECK-NEXT:    v_mul_lo_u32 v8, v4, v0
-; CHECK-NEXT:    v_add_i32_e32 v2, vcc, v5, v2
-; CHECK-NEXT:    v_mul_hi_u32 v5, v3, v0
-; CHECK-NEXT:    v_add_i32_e32 v1, vcc, v8, v1
-; CHECK-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
-; CHECK-NEXT:    v_add_i32_e32 v1, vcc, v1, v5
-; CHECK-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
-; CHECK-NEXT:    v_add_i32_e32 v5, vcc, v8, v5
-; CHECK-NEXT:    v_add_i32_e32 v1, vcc, v1, v2
-; CHECK-NEXT:    v_mul_hi_u32 v8, v4, v0
-; CHECK-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], s6, v1, 0
-; CHECK-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
-; CHECK-NEXT:    v_add_i32_e32 v2, vcc, v5, v2
-; CHECK-NEXT:    v_add_i32_e32 v2, vcc, v8, v2
-; CHECK-NEXT:    v_mad_u64_u32 v[1:2], s[4:5], s6, v2, v[1:2]
-; CHECK-NEXT:    v_sub_i32_e64 v0, s[4:5], v3, v0
-; CHECK-NEXT:    v_mov_b32_e32 v6, 0x12d8fb
-; CHECK-NEXT:    v_subb_u32_e64 v2, vcc, v4, v1, s[4:5]
-; CHECK-NEXT:    v_sub_i32_e32 v1, vcc, v4, v1
-; CHECK-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v6
+; CHECK-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
+; CHECK-NEXT:    v_add_i32_e32 v2, vcc, v7, v2
+; CHECK-NEXT:    v_mad_u64_u32 v[1:2], s[4:5], v5, v2, v[1:2]
+; CHECK-NEXT:    v_sub_i32_e64 v0, s[4:5], v4, v0
+; CHECK-NEXT:    v_subb_u32_e64 v2, vcc, v9, v1, s[4:5]
+; CHECK-NEXT:    v_sub_i32_e32 v1, vcc, v9, v1
+; CHECK-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v5
 ; CHECK-NEXT:    v_cndmask_b32_e64 v3, 0, -1, vcc
 ; CHECK-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
 ; CHECK-NEXT:    v_cndmask_b32_e32 v3, -1, v3, vcc
-; CHECK-NEXT:    v_sub_i32_e32 v4, vcc, v0, v6
+; CHECK-NEXT:    v_sub_i32_e32 v4, vcc, v0, v5
 ; CHECK-NEXT:    v_subbrev_u32_e64 v1, s[4:5], 0, v1, s[4:5]
 ; CHECK-NEXT:    v_subbrev_u32_e32 v1, vcc, 0, v1, vcc
-; CHECK-NEXT:    v_cmp_ge_u32_e32 vcc, v4, v6
+; CHECK-NEXT:    v_cmp_ge_u32_e32 vcc, v4, v5
 ; CHECK-NEXT:    v_cndmask_b32_e64 v5, 0, -1, vcc
 ; CHECK-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
 ; CHECK-NEXT:    v_cndmask_b32_e32 v5, -1, v5, vcc
-; CHECK-NEXT:    v_subrev_i32_e32 v6, vcc, 0x12d8fb, v4
+; CHECK-NEXT:    v_subrev_i32_e32 v7, vcc, 0x12d8fb, v4
 ; CHECK-NEXT:    v_subbrev_u32_e32 v8, vcc, 0, v1, vcc
 ; CHECK-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v5
-; CHECK-NEXT:    v_cndmask_b32_e32 v4, v4, v6, vcc
+; CHECK-NEXT:    v_cndmask_b32_e32 v4, v4, v7, vcc
 ; CHECK-NEXT:    v_cndmask_b32_e32 v1, v1, v8, vcc
 ; CHECK-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v3
 ; CHECK-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
 ; CHECK-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
-; CHECK-NEXT:    v_xor_b32_e32 v0, v0, v7
-; CHECK-NEXT:    v_xor_b32_e32 v1, v1, v7
-; CHECK-NEXT:    v_sub_i32_e32 v0, vcc, v0, v7
-; CHECK-NEXT:    v_subb_u32_e32 v1, vcc, v1, v7, vcc
+; CHECK-NEXT:    v_xor_b32_e32 v0, v0, v6
+; CHECK-NEXT:    v_xor_b32_e32 v1, v1, v6
+; CHECK-NEXT:    v_sub_i32_e32 v0, vcc, v0, v6
+; CHECK-NEXT:    v_subb_u32_e32 v1, vcc, v1, v6, vcc
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
   %result = srem i64 %num, 1235195
   ret i64 %result
@@ -1735,165 +1707,155 @@ define <2 x i64> @v_srem_v2i64_oddk_denom(<2 x i64> %num) {
 ; GISEL-NEXT:    v_rcp_iflag_f32_e32 v4, v4
 ; GISEL-NEXT:    v_mul_f32_e32 v4, 0x5f7ffffc, v4
 ; GISEL-NEXT:    v_mul_f32_e32 v5, 0x2f800000, v4
-; GISEL-NEXT:    v_trunc_f32_e32 v6, v5
-; GISEL-NEXT:    v_mac_f32_e32 v4, 0xcf800000, v6
-; GISEL-NEXT:    v_cvt_u32_f32_e32 v7, v4
-; GISEL-NEXT:    v_cvt_u32_f32_e32 v8, v6
-; GISEL-NEXT:    v_mad_u64_u32 v[4:5], s[4:5], s6, v7, 0
-; GISEL-NEXT:    v_mad_u64_u32 v[5:6], s[4:5], s6, v8, v[5:6]
-; GISEL-NEXT:    v_mul_hi_u32 v9, v7, v4
-; GISEL-NEXT:    v_mad_u64_u32 v[5:6], s[4:5], s7, v7, v[5:6]
-; GISEL-NEXT:    v_mul_lo_u32 v6, v8, v4
-; GISEL-NEXT:    v_mul_hi_u32 v4, v8, v4
-; GISEL-NEXT:    v_mul_lo_u32 v10, v7, v5
-; GISEL-NEXT:    v_mul_lo_u32 v11, v8, v5
-; GISEL-NEXT:    v_mul_hi_u32 v12, v7, v5
-; GISEL-NEXT:    v_mul_hi_u32 v5, v8, v5
-; GISEL-NEXT:    v_add_i32_e32 v6, vcc, v6, v10
+; GISEL-NEXT:    v_trunc_f32_e32 v7, v5
+; GISEL-NEXT:    v_mac_f32_e32 v4, 0xcf800000, v7
+; GISEL-NEXT:    v_cvt_u32_f32_e32 v6, v4
+; GISEL-NEXT:    v_cvt_u32_f32_e32 v7, v7
+; GISEL-NEXT:    v_mad_u64_u32 v[4:5], s[4:5], s6, v6, 0
+; GISEL-NEXT:    v_mad_u64_u32 v[8:9], s[4:5], s6, v7, v[5:6]
+; GISEL-NEXT:    v_mul_lo_u32 v5, v7, v4
+; GISEL-NEXT:    v_mad_u64_u32 v[8:9], s[4:5], s7, v6, v[8:9]
+; GISEL-NEXT:    v_mul_hi_u32 v9, v6, v4
+; GISEL-NEXT:    v_mul_hi_u32 v4, v7, v4
+; GISEL-NEXT:    v_mul_lo_u32 v10, v6, v8
+; GISEL-NEXT:    v_mul_lo_u32 v11, v7, v8
+; GISEL-NEXT:    v_mul_hi_u32 v12, v6, v8
+; GISEL-NEXT:    v_mul_hi_u32 v8, v7, v8
+; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v5, v10
 ; GISEL-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v6, vcc, v6, v9
-; GISEL-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v6, vcc, v10, v6
+; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v5, v9
+; GISEL-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v10, v5
 ; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v11, v4
 ; GISEL-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v4, v12
 ; GISEL-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v9, v10
-; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v4, v6
-; GISEL-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v6, vcc, v9, v6
-; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v5, v6
-; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v7, v4
-; GISEL-NEXT:    v_addc_u32_e32 v8, vcc, v8, v5, vcc
-; GISEL-NEXT:    v_mad_u64_u32 v[5:6], s[4:5], s6, v9, 0
-; GISEL-NEXT:    v_mov_b32_e32 v4, v6
-; GISEL-NEXT:    v_mad_u64_u32 v[6:7], s[4:5], s6, v8, v[4:5]
+; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v4, v5
+; GISEL-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v9, v5
+; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v8, v5
+; GISEL-NEXT:    v_add_i32_e32 v11, vcc, v6, v4
+; GISEL-NEXT:    v_mad_u64_u32 v[8:9], s[4:5], s6, v11, 0
+; GISEL-NEXT:    v_addc_u32_e32 v5, vcc, v7, v5, vcc
+; GISEL-NEXT:    v_mov_b32_e32 v4, v9
+; GISEL-NEXT:    v_mad_u64_u32 v[9:10], s[4:5], s6, v5, v[4:5]
 ; GISEL-NEXT:    v_ashrrev_i32_e32 v4, 31, v1
 ; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v0, v4
-; GISEL-NEXT:    v_mad_u64_u32 v[6:7], s[4:5], s7, v9, v[6:7]
+; GISEL-NEXT:    v_mad_u64_u32 v[9:10], s[4:5], s7, v11, v[9:10]
 ; GISEL-NEXT:    v_addc_u32_e32 v1, vcc, v1, v4, vcc
 ; GISEL-NEXT:    v_xor_b32_e32 v10, v0, v4
-; GISEL-NEXT:    v_mul_lo_u32 v0, v8, v5
-; GISEL-NEXT:    v_mul_lo_u32 v7, v9, v6
-; GISEL-NEXT:    v_xor_b32_e32 v11, v1, v4
-; GISEL-NEXT:    v_mul_hi_u32 v1, v9, v5
-; GISEL-NEXT:    v_mul_hi_u32 v5, v8, v5
-; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v0, v7
-; GISEL-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
+; GISEL-NEXT:    v_mul_lo_u32 v0, v5, v8
+; GISEL-NEXT:    v_mul_lo_u32 v12, v11, v9
+; GISEL-NEXT:    v_xor_b32_e32 v13, v1, v4
+; GISEL-NEXT:    v_mul_hi_u32 v1, v11, v8
+; GISEL-NEXT:    v_mul_hi_u32 v8, v5, v8
+; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v0, v12
+; GISEL-NEXT:    v_cndmask_b32_e64 v12, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
 ; GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
-; GISEL-NEXT:    v_mul_lo_u32 v1, v8, v6
-; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v7, v0
-; GISEL-NEXT:    v_mul_hi_u32 v7, v9, v6
-; GISEL-NEXT:    v_add_i32_e32 v1, vcc, v1, v5
-; GISEL-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v1, vcc, v1, v7
-; GISEL-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v5, v7
-; GISEL-NEXT:    v_mul_hi_u32 v6, v8, v6
+; GISEL-NEXT:    v_mul_lo_u32 v1, v5, v9
+; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v12, v0
+; GISEL-NEXT:    v_mul_hi_u32 v12, v11, v9
+; GISEL-NEXT:    v_add_i32_e32 v1, vcc, v1, v8
+; GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v1, vcc, v1, v12
+; GISEL-NEXT:    v_cndmask_b32_e64 v12, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v8, v12
+; GISEL-NEXT:    v_mul_hi_u32 v9, v5, v9
 ; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v1, v0
 ; GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v1, vcc, v5, v1
-; GISEL-NEXT:    v_add_i32_e32 v1, vcc, v6, v1
-; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v9, v0
-; GISEL-NEXT:    v_addc_u32_e32 v1, vcc, v8, v1, vcc
-; GISEL-NEXT:    v_mul_lo_u32 v6, v11, v0
-; GISEL-NEXT:    v_mul_lo_u32 v7, v10, v1
-; GISEL-NEXT:    v_mul_hi_u32 v8, v10, v0
-; GISEL-NEXT:    v_mul_hi_u32 v0, v11, v0
+; GISEL-NEXT:    v_add_i32_e32 v1, vcc, v8, v1
+; GISEL-NEXT:    v_add_i32_e32 v1, vcc, v9, v1
+; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v11, v0
+; GISEL-NEXT:    v_addc_u32_e32 v1, vcc, v5, v1, vcc
+; GISEL-NEXT:    v_mul_lo_u32 v8, v13, v0
+; GISEL-NEXT:    v_mul_lo_u32 v9, v10, v1
+; GISEL-NEXT:    v_mul_hi_u32 v11, v10, v0
+; GISEL-NEXT:    v_mul_hi_u32 v0, v13, v0
 ; GISEL-NEXT:    v_mov_b32_e32 v5, 0x12d8fb
-; GISEL-NEXT:    v_add_i32_e32 v6, vcc, v6, v7
-; GISEL-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v6, vcc, v6, v8
-; GISEL-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc
-; GISEL-NEXT:    v_mul_lo_u32 v8, v11, v1
-; GISEL-NEXT:    v_add_i32_e32 v6, vcc, v7, v6
-; GISEL-NEXT:    v_mul_hi_u32 v7, v10, v1
-; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v8, v0
+; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v8, v9
+; GISEL-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v8, v11
 ; GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v0, v7
-; GISEL-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v8, v7
-; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v0, v6
-; GISEL-NEXT:    v_mul_hi_u32 v9, v11, v1
-; GISEL-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v5, v8, 0
-; GISEL-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v6, vcc, v7, v6
-; GISEL-NEXT:    v_add_i32_e32 v6, vcc, v9, v6
-; GISEL-NEXT:    v_mad_u64_u32 v[6:7], s[4:5], v5, v6, v[1:2]
+; GISEL-NEXT:    v_mul_lo_u32 v11, v13, v1
+; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v9, v8
+; GISEL-NEXT:    v_mul_hi_u32 v9, v10, v1
+; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v11, v0
+; GISEL-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v0, v9
+; GISEL-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v11, v9
+; GISEL-NEXT:    v_add_i32_e32 v11, vcc, v0, v8
+; GISEL-NEXT:    v_mul_hi_u32 v12, v13, v1
+; GISEL-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v5, v11, 0
+; GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v9, v8
+; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v12, v8
+; GISEL-NEXT:    v_mad_u64_u32 v[8:9], s[4:5], v5, v8, v[1:2]
+; GISEL-NEXT:    v_sub_i32_e32 v10, vcc, v10, v0
+; GISEL-NEXT:    v_mad_u64_u32 v[8:9], s[4:5], 0, v11, v[8:9]
 ; GISEL-NEXT:    s_sub_u32 s6, 0, 0x12d8fb
 ; GISEL-NEXT:    s_subb_u32 s7, 0, 0
-; GISEL-NEXT:    v_mad_u64_u32 v[6:7], s[4:5], 0, v8, v[6:7]
-; GISEL-NEXT:    v_sub_i32_e32 v8, vcc, v10, v0
-; GISEL-NEXT:    v_subb_u32_e64 v9, s[4:5], v11, v6, vcc
-; GISEL-NEXT:    v_sub_i32_e64 v0, s[4:5], v11, v6
-; GISEL-NEXT:    v_cmp_ge_u32_e64 s[4:5], v8, v5
-; GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, -1, s[4:5]
-; GISEL-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v9
-; GISEL-NEXT:    v_cndmask_b32_e64 v10, -1, v1, s[4:5]
-; GISEL-NEXT:    v_cvt_f32_u32_e32 v1, 0x12d8fb
-; GISEL-NEXT:    v_cvt_f32_ubyte0_e32 v6, 0
+; GISEL-NEXT:    v_subb_u32_e64 v11, s[4:5], v13, v8, vcc
+; GISEL-NEXT:    v_sub_i32_e64 v0, s[4:5], v13, v8
+; GISEL-NEXT:    v_cmp_ge_u32_e64 s[4:5], v10, v5
 ; GISEL-NEXT:    v_subbrev_u32_e32 v0, vcc, 0, v0, vcc
-; GISEL-NEXT:    v_mac_f32_e32 v1, 0x4f800000, v6
-; GISEL-NEXT:    v_rcp_iflag_f32_e32 v1, v1
-; GISEL-NEXT:    v_sub_i32_e32 v11, vcc, v8, v5
-; GISEL-NEXT:    v_subbrev_u32_e32 v12, vcc, 0, v0, vcc
-; GISEL-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v1
-; GISEL-NEXT:    v_mul_f32_e32 v1, 0x2f800000, v0
-; GISEL-NEXT:    v_trunc_f32_e32 v6, v1
-; GISEL-NEXT:    v_mac_f32_e32 v0, 0xcf800000, v6
-; GISEL-NEXT:    v_cvt_u32_f32_e32 v13, v0
-; GISEL-NEXT:    v_cvt_u32_f32_e32 v14, v6
-; GISEL-NEXT:    v_cmp_ge_u32_e32 vcc, v11, v5
-; GISEL-NEXT:    v_cndmask_b32_e64 v7, 0, -1, vcc
-; GISEL-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], s6, v13, 0
-; GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v12
-; GISEL-NEXT:    v_cndmask_b32_e32 v15, -1, v7, vcc
-; GISEL-NEXT:    v_mad_u64_u32 v[6:7], s[4:5], s6, v14, v[1:2]
-; GISEL-NEXT:    v_sub_i32_e32 v1, vcc, v11, v5
-; GISEL-NEXT:    v_mad_u64_u32 v[6:7], s[4:5], s7, v13, v[6:7]
-; GISEL-NEXT:    v_subbrev_u32_e32 v16, vcc, 0, v12, vcc
+; GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, -1, s[4:5]
+; GISEL-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v11
+; GISEL-NEXT:    v_sub_i32_e32 v13, vcc, v10, v5
+; GISEL-NEXT:    v_cndmask_b32_e64 v12, -1, v1, s[4:5]
+; GISEL-NEXT:    v_subbrev_u32_e32 v14, vcc, 0, v0, vcc
+; GISEL-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], s6, v6, 0
+; GISEL-NEXT:    v_cmp_ge_u32_e32 vcc, v13, v5
+; GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, -1, vcc
+; GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v14
+; GISEL-NEXT:    v_cndmask_b32_e32 v15, -1, v8, vcc
+; GISEL-NEXT:    v_mad_u64_u32 v[8:9], s[4:5], s6, v7, v[1:2]
+; GISEL-NEXT:    v_sub_i32_e32 v1, vcc, v13, v5
+; GISEL-NEXT:    v_mad_u64_u32 v[8:9], s[4:5], s7, v6, v[8:9]
+; GISEL-NEXT:    v_subbrev_u32_e32 v16, vcc, 0, v14, vcc
 ; GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v15
-; GISEL-NEXT:    v_cndmask_b32_e32 v7, v11, v1, vcc
-; GISEL-NEXT:    v_mul_lo_u32 v1, v14, v0
-; GISEL-NEXT:    v_mul_lo_u32 v11, v13, v6
-; GISEL-NEXT:    v_mul_hi_u32 v15, v13, v0
-; GISEL-NEXT:    v_cndmask_b32_e32 v12, v12, v16, vcc
-; GISEL-NEXT:    v_mul_hi_u32 v0, v14, v0
-; GISEL-NEXT:    v_add_i32_e32 v1, vcc, v1, v11
-; GISEL-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
+; GISEL-NEXT:    v_cndmask_b32_e32 v9, v13, v1, vcc
+; GISEL-NEXT:    v_mul_lo_u32 v1, v7, v0
+; GISEL-NEXT:    v_mul_lo_u32 v13, v6, v8
+; GISEL-NEXT:    v_mul_hi_u32 v15, v6, v0
+; GISEL-NEXT:    v_cndmask_b32_e32 v14, v14, v16, vcc
+; GISEL-NEXT:    v_mul_hi_u32 v0, v7, v0
+; GISEL-NEXT:    v_add_i32_e32 v1, vcc, v1, v13
+; GISEL-NEXT:    v_cndmask_b32_e64 v13, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v1, vcc, v1, v15
 ; GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
-; GISEL-NEXT:    v_mul_lo_u32 v15, v14, v6
-; GISEL-NEXT:    v_add_i32_e32 v1, vcc, v11, v1
-; GISEL-NEXT:    v_mul_hi_u32 v11, v13, v6
+; GISEL-NEXT:    v_mul_lo_u32 v15, v7, v8
+; GISEL-NEXT:    v_add_i32_e32 v1, vcc, v13, v1
+; GISEL-NEXT:    v_mul_hi_u32 v13, v6, v8
 ; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v15, v0
 ; GISEL-NEXT:    v_cndmask_b32_e64 v15, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v0, v11
-; GISEL-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v11, vcc, v15, v11
-; GISEL-NEXT:    v_mul_hi_u32 v6, v14, v6
+; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v0, v13
+; GISEL-NEXT:    v_cndmask_b32_e64 v13, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v13, vcc, v15, v13
+; GISEL-NEXT:    v_mul_hi_u32 v8, v7, v8
 ; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
 ; GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v1, vcc, v11, v1
-; GISEL-NEXT:    v_add_i32_e32 v1, vcc, v6, v1
-; GISEL-NEXT:    v_add_i32_e32 v11, vcc, v13, v0
-; GISEL-NEXT:    v_addc_u32_e32 v13, vcc, v14, v1, vcc
-; GISEL-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], s6, v11, 0
-; GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v10
-; GISEL-NEXT:    v_cndmask_b32_e32 v8, v8, v7, vcc
+; GISEL-NEXT:    v_add_i32_e32 v1, vcc, v13, v1
+; GISEL-NEXT:    v_add_i32_e32 v1, vcc, v8, v1
+; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v6, v0
+; GISEL-NEXT:    v_addc_u32_e32 v13, vcc, v7, v1, vcc
+; GISEL-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], s6, v8, 0
+; GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v12
+; GISEL-NEXT:    v_cndmask_b32_e32 v9, v10, v9, vcc
 ; GISEL-NEXT:    v_mad_u64_u32 v[6:7], s[4:5], s6, v13, v[1:2]
-; GISEL-NEXT:    v_xor_b32_e32 v1, v8, v4
-; GISEL-NEXT:    v_ashrrev_i32_e32 v8, 31, v3
-; GISEL-NEXT:    v_mad_u64_u32 v[6:7], s[4:5], s7, v11, v[6:7]
-; GISEL-NEXT:    v_cndmask_b32_e32 v9, v9, v12, vcc
-; GISEL-NEXT:    v_add_i32_e32 v2, vcc, v2, v8
-; GISEL-NEXT:    v_addc_u32_e32 v3, vcc, v3, v8, vcc
-; GISEL-NEXT:    v_xor_b32_e32 v10, v2, v8
+; GISEL-NEXT:    v_xor_b32_e32 v1, v9, v4
+; GISEL-NEXT:    v_ashrrev_i32_e32 v9, 31, v3
+; GISEL-NEXT:    v_mad_u64_u32 v[6:7], s[4:5], s7, v8, v[6:7]
+; GISEL-NEXT:    v_cndmask_b32_e32 v10, v11, v14, vcc
+; GISEL-NEXT:    v_add_i32_e32 v2, vcc, v2, v9
+; GISEL-NEXT:    v_addc_u32_e32 v3, vcc, v3, v9, vcc
+; GISEL-NEXT:    v_xor_b32_e32 v11, v2, v9
 ; GISEL-NEXT:    v_mul_lo_u32 v2, v13, v0
-; GISEL-NEXT:    v_mul_lo_u32 v7, v11, v6
-; GISEL-NEXT:    v_xor_b32_e32 v12, v3, v8
-; GISEL-NEXT:    v_mul_hi_u32 v3, v11, v0
+; GISEL-NEXT:    v_mul_lo_u32 v7, v8, v6
+; GISEL-NEXT:    v_xor_b32_e32 v12, v3, v9
+; GISEL-NEXT:    v_mul_hi_u32 v3, v8, v0
 ; GISEL-NEXT:    v_mul_hi_u32 v0, v13, v0
 ; GISEL-NEXT:    v_add_i32_e32 v2, vcc, v2, v7
 ; GISEL-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
@@ -1901,7 +1863,7 @@ define <2 x i64> @v_srem_v2i64_oddk_denom(<2 x i64> %num) {
 ; GISEL-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
 ; GISEL-NEXT:    v_mul_lo_u32 v3, v13, v6
 ; GISEL-NEXT:    v_add_i32_e32 v2, vcc, v7, v2
-; GISEL-NEXT:    v_mul_hi_u32 v7, v11, v6
+; GISEL-NEXT:    v_mul_hi_u32 v7, v8, v6
 ; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v3, v0
 ; GISEL-NEXT:    v_cndmask_b32_e64 v3, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v0, v7
@@ -1912,62 +1874,62 @@ define <2 x i64> @v_srem_v2i64_oddk_denom(<2 x i64> %num) {
 ; GISEL-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
 ; GISEL-NEXT:    v_add_i32_e32 v2, vcc, v6, v2
-; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v11, v0
+; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v8, v0
 ; GISEL-NEXT:    v_addc_u32_e32 v2, vcc, v13, v2, vcc
 ; GISEL-NEXT:    v_mul_lo_u32 v3, v12, v0
-; GISEL-NEXT:    v_mul_lo_u32 v6, v10, v2
-; GISEL-NEXT:    v_mul_hi_u32 v7, v10, v0
+; GISEL-NEXT:    v_mul_lo_u32 v6, v11, v2
+; GISEL-NEXT:    v_mul_hi_u32 v7, v11, v0
 ; GISEL-NEXT:    v_mul_hi_u32 v0, v12, v0
-; GISEL-NEXT:    v_xor_b32_e32 v9, v9, v4
+; GISEL-NEXT:    v_xor_b32_e32 v8, v10, v4
 ; GISEL-NEXT:    v_add_i32_e32 v3, vcc, v3, v6
 ; GISEL-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v3, vcc, v3, v7
 ; GISEL-NEXT:    v_cndmask_b32_e64 v3, 0, 1, vcc
 ; GISEL-NEXT:    v_mul_lo_u32 v7, v12, v2
 ; GISEL-NEXT:    v_add_i32_e32 v3, vcc, v6, v3
-; GISEL-NEXT:    v_mul_hi_u32 v6, v10, v2
+; GISEL-NEXT:    v_mul_hi_u32 v6, v11, v2
 ; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v7, v0
 ; GISEL-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v0, v6
 ; GISEL-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v6, vcc, v7, v6
-; GISEL-NEXT:    v_add_i32_e32 v11, vcc, v0, v3
+; GISEL-NEXT:    v_add_i32_e32 v10, vcc, v0, v3
 ; GISEL-NEXT:    v_mul_hi_u32 v7, v12, v2
-; GISEL-NEXT:    v_mad_u64_u32 v[2:3], s[4:5], v5, v11, 0
+; GISEL-NEXT:    v_mad_u64_u32 v[2:3], s[4:5], v5, v10, 0
 ; GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v6, v0
 ; GISEL-NEXT:    v_add_i32_e32 v6, vcc, v7, v0
 ; GISEL-NEXT:    v_mov_b32_e32 v0, v3
 ; GISEL-NEXT:    v_mad_u64_u32 v[6:7], s[4:5], v5, v6, v[0:1]
 ; GISEL-NEXT:    v_sub_i32_e32 v0, vcc, v1, v4
-; GISEL-NEXT:    v_subb_u32_e32 v1, vcc, v9, v4, vcc
-; GISEL-NEXT:    v_mad_u64_u32 v[3:4], s[4:5], 0, v11, v[6:7]
-; GISEL-NEXT:    v_sub_i32_e32 v2, vcc, v10, v2
+; GISEL-NEXT:    v_subb_u32_e32 v1, vcc, v8, v4, vcc
+; GISEL-NEXT:    v_mad_u64_u32 v[3:4], s[4:5], 0, v10, v[6:7]
+; GISEL-NEXT:    v_sub_i32_e32 v2, vcc, v11, v2
 ; GISEL-NEXT:    v_subb_u32_e64 v4, s[4:5], v12, v3, vcc
 ; GISEL-NEXT:    v_sub_i32_e64 v3, s[4:5], v12, v3
 ; GISEL-NEXT:    v_subbrev_u32_e32 v3, vcc, 0, v3, vcc
 ; GISEL-NEXT:    v_sub_i32_e32 v7, vcc, v2, v5
 ; GISEL-NEXT:    v_subbrev_u32_e32 v3, vcc, 0, v3, vcc
 ; GISEL-NEXT:    v_cmp_ge_u32_e32 vcc, v7, v5
-; GISEL-NEXT:    v_cndmask_b32_e64 v9, 0, -1, vcc
+; GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, -1, vcc
 ; GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
 ; GISEL-NEXT:    v_cmp_ge_u32_e64 s[4:5], v2, v5
-; GISEL-NEXT:    v_cndmask_b32_e32 v9, -1, v9, vcc
+; GISEL-NEXT:    v_cndmask_b32_e32 v8, -1, v8, vcc
 ; GISEL-NEXT:    v_sub_i32_e32 v5, vcc, v7, v5
 ; GISEL-NEXT:    v_cndmask_b32_e64 v6, 0, -1, s[4:5]
 ; GISEL-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v4
 ; GISEL-NEXT:    v_subbrev_u32_e32 v10, vcc, 0, v3, vcc
 ; GISEL-NEXT:    v_cndmask_b32_e64 v6, -1, v6, s[4:5]
-; GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v9
+; GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v8
 ; GISEL-NEXT:    v_cndmask_b32_e32 v5, v7, v5, vcc
 ; GISEL-NEXT:    v_cndmask_b32_e32 v3, v3, v10, vcc
 ; GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v6
 ; GISEL-NEXT:    v_cndmask_b32_e32 v2, v2, v5, vcc
 ; GISEL-NEXT:    v_cndmask_b32_e32 v3, v4, v3, vcc
-; GISEL-NEXT:    v_xor_b32_e32 v2, v2, v8
-; GISEL-NEXT:    v_xor_b32_e32 v3, v3, v8
-; GISEL-NEXT:    v_sub_i32_e32 v2, vcc, v2, v8
-; GISEL-NEXT:    v_subb_u32_e32 v3, vcc, v3, v8, vcc
+; GISEL-NEXT:    v_xor_b32_e32 v2, v2, v9
+; GISEL-NEXT:    v_xor_b32_e32 v3, v3, v9
+; GISEL-NEXT:    v_sub_i32_e32 v2, vcc, v2, v9
+; GISEL-NEXT:    v_subb_u32_e32 v3, vcc, v3, v9, vcc
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; CGP-LABEL: v_srem_v2i64_oddk_denom:
@@ -1975,241 +1937,227 @@ define <2 x i64> @v_srem_v2i64_oddk_denom(<2 x i64> %num) {
 ; CGP-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CGP-NEXT:    v_cvt_f32_u32_e32 v4, 0x12d8fb
 ; CGP-NEXT:    v_cvt_f32_ubyte0_e32 v5, 0
-; CGP-NEXT:    s_mov_b32 s7, 0xffed2705
-; CGP-NEXT:    s_mov_b32 s6, 0x12d8fb
+; CGP-NEXT:    v_mov_b32_e32 v6, 0xffed2705
 ; CGP-NEXT:    v_mac_f32_e32 v4, 0x4f800000, v5
 ; CGP-NEXT:    v_rcp_iflag_f32_e32 v4, v4
 ; CGP-NEXT:    v_mul_f32_e32 v4, 0x5f7ffffc, v4
 ; CGP-NEXT:    v_mul_f32_e32 v5, 0x2f800000, v4
-; CGP-NEXT:    v_trunc_f32_e32 v6, v5
-; CGP-NEXT:    v_mac_f32_e32 v4, 0xcf800000, v6
-; CGP-NEXT:    v_cvt_u32_f32_e32 v7, v4
-; CGP-NEXT:    v_mov_b32_e32 v4, 0xffed2705
-; CGP-NEXT:    v_cvt_u32_f32_e32 v8, v6
-; CGP-NEXT:    v_mad_u64_u32 v[4:5], s[4:5], v4, v7, 0
-; CGP-NEXT:    v_mad_u64_u32 v[5:6], s[4:5], s7, v8, v[5:6]
-; CGP-NEXT:    v_mul_hi_u32 v9, v7, v4
-; CGP-NEXT:    v_mad_u64_u32 v[5:6], s[4:5], -1, v7, v[5:6]
-; CGP-NEXT:    v_mul_lo_u32 v6, v8, v4
-; CGP-NEXT:    v_mul_hi_u32 v4, v8, v4
-; CGP-NEXT:    v_mul_lo_u32 v10, v7, v5
-; CGP-NEXT:    v_mul_lo_u32 v11, v8, v5
-; CGP-NEXT:    v_mul_hi_u32 v12, v7, v5
-; CGP-NEXT:    v_mul_hi_u32 v5, v8, v5
-; CGP-NEXT:    v_add_i32_e32 v6, vcc, v6, v10
-; CGP-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v6, vcc, v6, v9
-; CGP-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v6, vcc, v10, v6
-; CGP-NEXT:    v_add_i32_e32 v4, vcc, v11, v4
-; CGP-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v4, vcc, v4, v12
-; CGP-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v9, vcc, v9, v10
-; CGP-NEXT:    v_add_i32_e32 v4, vcc, v4, v6
-; CGP-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v6, vcc, v9, v6
-; CGP-NEXT:    v_add_i32_e32 v5, vcc, v5, v6
-; CGP-NEXT:    v_add_i32_e32 v9, vcc, v7, v4
-; CGP-NEXT:    v_mad_u64_u32 v[6:7], s[4:5], s7, v9, 0
-; CGP-NEXT:    v_addc_u32_e32 v10, vcc, v8, v5, vcc
-; CGP-NEXT:    v_ashrrev_i32_e32 v5, 31, v1
-; CGP-NEXT:    v_mov_b32_e32 v4, v7
-; CGP-NEXT:    v_mad_u64_u32 v[7:8], s[4:5], s7, v10, v[4:5]
-; CGP-NEXT:    v_add_i32_e32 v4, vcc, v0, v5
-; CGP-NEXT:    v_addc_u32_e32 v11, vcc, v1, v5, vcc
-; CGP-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], -1, v9, v[7:8]
-; CGP-NEXT:    v_xor_b32_e32 v8, v4, v5
-; CGP-NEXT:    v_mul_lo_u32 v1, v10, v6
-; CGP-NEXT:    v_mul_lo_u32 v4, v9, v0
-; CGP-NEXT:    v_mul_hi_u32 v7, v9, v6
-; CGP-NEXT:    v_mul_hi_u32 v6, v10, v6
-; CGP-NEXT:    v_xor_b32_e32 v11, v11, v5
+; CGP-NEXT:    v_trunc_f32_e32 v7, v5
+; CGP-NEXT:    v_mac_f32_e32 v4, 0xcf800000, v7
+; CGP-NEXT:    v_cvt_u32_f32_e32 v8, v4
+; CGP-NEXT:    v_cvt_u32_f32_e32 v9, v7
+; CGP-NEXT:    v_mad_u64_u32 v[4:5], s[4:5], v6, v8, 0
+; CGP-NEXT:    v_mov_b32_e32 v7, v5
+; CGP-NEXT:    v_mad_u64_u32 v[10:11], s[4:5], v6, v9, v[7:8]
+; CGP-NEXT:    v_mul_hi_u32 v12, v9, v4
+; CGP-NEXT:    v_mad_u64_u32 v[13:14], s[4:5], -1, v8, v[10:11]
+; CGP-NEXT:    v_mul_lo_u32 v10, v9, v4
+; CGP-NEXT:    v_mul_hi_u32 v11, v8, v4
+; CGP-NEXT:    v_mul_lo_u32 v4, v8, v13
+; CGP-NEXT:    v_mul_lo_u32 v7, v9, v13
+; CGP-NEXT:    v_mul_hi_u32 v14, v8, v13
+; CGP-NEXT:    v_mul_hi_u32 v13, v9, v13
+; CGP-NEXT:    v_add_i32_e32 v4, vcc, v10, v4
+; CGP-NEXT:    v_cndmask_b32_e64 v15, 0, 1, vcc
+; CGP-NEXT:    v_add_i32_e32 v4, vcc, v4, v11
+; CGP-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc
+; CGP-NEXT:    v_add_i32_e32 v4, vcc, v15, v4
+; CGP-NEXT:    v_add_i32_e32 v7, vcc, v7, v12
+; CGP-NEXT:    v_cndmask_b32_e64 v15, 0, 1, vcc
+; CGP-NEXT:    v_add_i32_e32 v7, vcc, v7, v14
+; CGP-NEXT:    v_cndmask_b32_e64 v14, 0, 1, vcc
+; CGP-NEXT:    v_add_i32_e32 v14, vcc, v15, v14
+; CGP-NEXT:    v_add_i32_e32 v4, vcc, v7, v4
+; CGP-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
+; CGP-NEXT:    v_add_i32_e32 v7, vcc, v14, v7
+; CGP-NEXT:    v_add_i32_e32 v7, vcc, v13, v7
+; CGP-NEXT:    v_add_i32_e32 v16, vcc, v8, v4
+; CGP-NEXT:    v_mad_u64_u32 v[13:14], s[4:5], v6, v16, 0
+; CGP-NEXT:    v_addc_u32_e32 v17, vcc, v9, v7, vcc
+; CGP-NEXT:    v_mov_b32_e32 v4, v14
+; CGP-NEXT:    v_mad_u64_u32 v[14:15], s[4:5], v6, v17, v[4:5]
+; CGP-NEXT:    v_ashrrev_i32_e32 v7, 31, v1
+; CGP-NEXT:    v_add_i32_e32 v0, vcc, v0, v7
+; CGP-NEXT:    v_mad_u64_u32 v[14:15], s[4:5], -1, v16, v[14:15]
+; CGP-NEXT:    v_addc_u32_e32 v1, vcc, v1, v7, vcc
+; CGP-NEXT:    v_xor_b32_e32 v15, v0, v7
+; CGP-NEXT:    v_mul_lo_u32 v0, v17, v13
+; CGP-NEXT:    v_mul_lo_u32 v4, v16, v14
+; CGP-NEXT:    v_xor_b32_e32 v18, v1, v7
+; CGP-NEXT:    v_mul_hi_u32 v1, v16, v13
+; CGP-NEXT:    v_mul_hi_u32 v13, v17, v13
+; CGP-NEXT:    v_add_i32_e32 v0, vcc, v0, v4
+; CGP-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc
+; CGP-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
+; CGP-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; CGP-NEXT:    v_mul_lo_u32 v1, v17, v14
+; CGP-NEXT:    v_add_i32_e32 v0, vcc, v4, v0
+; CGP-NEXT:    v_mul_hi_u32 v4, v16, v14
+; CGP-NEXT:    v_add_i32_e32 v1, vcc, v1, v13
+; CGP-NEXT:    v_cndmask_b32_e64 v13, 0, 1, vcc
 ; CGP-NEXT:    v_add_i32_e32 v1, vcc, v1, v4
 ; CGP-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v1, vcc, v1, v7
+; CGP-NEXT:    v_add_i32_e32 v4, vcc, v13, v4
+; CGP-NEXT:    v_mul_hi_u32 v13, v17, v14
+; CGP-NEXT:    v_add_i32_e32 v0, vcc, v1, v0
 ; CGP-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
-; CGP-NEXT:    v_mul_lo_u32 v7, v10, v0
-; CGP-NEXT:    v_add_i32_e32 v1, vcc, v4, v1
-; CGP-NEXT:    v_mul_hi_u32 v4, v9, v0
-; CGP-NEXT:    v_add_i32_e32 v6, vcc, v7, v6
-; CGP-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v4, vcc, v6, v4
-; CGP-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v6, vcc, v7, v6
-; CGP-NEXT:    v_mul_hi_u32 v0, v10, v0
 ; CGP-NEXT:    v_add_i32_e32 v1, vcc, v4, v1
-; CGP-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v4, vcc, v6, v4
-; CGP-NEXT:    v_add_i32_e32 v0, vcc, v0, v4
-; CGP-NEXT:    v_add_i32_e32 v1, vcc, v9, v1
-; CGP-NEXT:    v_addc_u32_e32 v0, vcc, v10, v0, vcc
-; CGP-NEXT:    v_mul_lo_u32 v6, v11, v1
-; CGP-NEXT:    v_mul_lo_u32 v7, v8, v0
-; CGP-NEXT:    v_mul_hi_u32 v9, v8, v1
-; CGP-NEXT:    v_mul_hi_u32 v1, v11, v1
+; CGP-NEXT:    v_add_i32_e32 v1, vcc, v13, v1
+; CGP-NEXT:    v_add_i32_e32 v0, vcc, v16, v0
+; CGP-NEXT:    v_addc_u32_e32 v1, vcc, v17, v1, vcc
+; CGP-NEXT:    v_mul_lo_u32 v13, v18, v0
+; CGP-NEXT:    v_mul_lo_u32 v14, v15, v1
+; CGP-NEXT:    v_mul_hi_u32 v16, v15, v0
+; CGP-NEXT:    v_mul_hi_u32 v0, v18, v0
 ; CGP-NEXT:    v_mov_b32_e32 v4, 0x12d8fb
-; CGP-NEXT:    v_add_i32_e32 v6, vcc, v6, v7
-; CGP-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v6, vcc, v6, v9
-; CGP-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc
-; CGP-NEXT:    v_mul_lo_u32 v9, v11, v0
-; CGP-NEXT:    v_add_i32_e32 v6, vcc, v7, v6
-; CGP-NEXT:    v_mul_hi_u32 v7, v8, v0
-; CGP-NEXT:    v_add_i32_e32 v1, vcc, v9, v1
-; CGP-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v1, vcc, v1, v7
-; CGP-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v7, vcc, v9, v7
-; CGP-NEXT:    v_add_i32_e32 v1, vcc, v1, v6
-; CGP-NEXT:    v_mul_hi_u32 v9, v11, v0
-; CGP-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], s6, v1, 0
-; CGP-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v6, vcc, v7, v6
-; CGP-NEXT:    v_add_i32_e32 v6, vcc, v9, v6
-; CGP-NEXT:    v_mad_u64_u32 v[6:7], s[4:5], s6, v6, v[1:2]
-; CGP-NEXT:    v_sub_i32_e32 v8, vcc, v8, v0
-; CGP-NEXT:    v_subb_u32_e64 v9, s[4:5], v11, v6, vcc
-; CGP-NEXT:    v_sub_i32_e64 v0, s[4:5], v11, v6
-; CGP-NEXT:    v_cmp_ge_u32_e64 s[4:5], v8, v4
-; CGP-NEXT:    v_cndmask_b32_e64 v1, 0, -1, s[4:5]
-; CGP-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v9
-; CGP-NEXT:    v_cndmask_b32_e64 v10, -1, v1, s[4:5]
-; CGP-NEXT:    v_cvt_f32_u32_e32 v1, 0x12d8fb
-; CGP-NEXT:    v_cvt_f32_ubyte0_e32 v6, 0
+; CGP-NEXT:    v_add_i32_e32 v13, vcc, v13, v14
+; CGP-NEXT:    v_cndmask_b32_e64 v14, 0, 1, vcc
+; CGP-NEXT:    v_add_i32_e32 v13, vcc, v13, v16
+; CGP-NEXT:    v_cndmask_b32_e64 v13, 0, 1, vcc
+; CGP-NEXT:    v_mul_lo_u32 v16, v18, v1
+; CGP-NEXT:    v_add_i32_e32 v13, vcc, v14, v13
+; CGP-NEXT:    v_mul_hi_u32 v14, v15, v1
+; CGP-NEXT:    v_add_i32_e32 v0, vcc, v16, v0
+; CGP-NEXT:    v_cndmask_b32_e64 v16, 0, 1, vcc
+; CGP-NEXT:    v_add_i32_e32 v0, vcc, v0, v14
+; CGP-NEXT:    v_cndmask_b32_e64 v14, 0, 1, vcc
+; CGP-NEXT:    v_add_i32_e32 v14, vcc, v16, v14
+; CGP-NEXT:    v_add_i32_e32 v0, vcc, v0, v13
+; CGP-NEXT:    v_mul_hi_u32 v16, v18, v1
+; CGP-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v4, v0, 0
+; CGP-NEXT:    v_cndmask_b32_e64 v13, 0, 1, vcc
+; CGP-NEXT:    v_add_i32_e32 v13, vcc, v14, v13
+; CGP-NEXT:    v_add_i32_e32 v13, vcc, v16, v13
+; CGP-NEXT:    v_mad_u64_u32 v[13:14], s[4:5], v4, v13, v[1:2]
+; CGP-NEXT:    v_sub_i32_e32 v14, vcc, v15, v0
+; CGP-NEXT:    v_sub_i32_e64 v0, s[4:5], v18, v13
+; CGP-NEXT:    v_subb_u32_e64 v15, s[4:5], v18, v13, vcc
 ; CGP-NEXT:    v_subbrev_u32_e32 v0, vcc, 0, v0, vcc
-; CGP-NEXT:    v_mac_f32_e32 v1, 0x4f800000, v6
-; CGP-NEXT:    v_rcp_iflag_f32_e32 v1, v1
-; CGP-NEXT:    v_sub_i32_e32 v11, vcc, v8, v4
-; CGP-NEXT:    v_subbrev_u32_e32 v12, vcc, 0, v0, vcc
-; CGP-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v1
-; CGP-NEXT:    v_mul_f32_e32 v1, 0x2f800000, v0
-; CGP-NEXT:    v_trunc_f32_e32 v6, v1
-; CGP-NEXT:    v_mac_f32_e32 v0, 0xcf800000, v6
-; CGP-NEXT:    v_cvt_u32_f32_e32 v13, v0
-; CGP-NEXT:    v_cvt_u32_f32_e32 v14, v6
-; CGP-NEXT:    v_cmp_ge_u32_e32 vcc, v11, v4
-; CGP-NEXT:    v_cndmask_b32_e64 v7, 0, -1, vcc
-; CGP-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], s7, v13, 0
-; CGP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v12
-; CGP-NEXT:    v_cndmask_b32_e32 v15, -1, v7, vcc
-; CGP-NEXT:    v_mad_u64_u32 v[6:7], s[4:5], s7, v14, v[1:2]
-; CGP-NEXT:    v_sub_i32_e32 v1, vcc, v11, v4
-; CGP-NEXT:    v_mad_u64_u32 v[6:7], s[4:5], -1, v13, v[6:7]
-; CGP-NEXT:    v_subbrev_u32_e32 v16, vcc, 0, v12, vcc
-; CGP-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v15
-; CGP-NEXT:    v_cndmask_b32_e32 v7, v11, v1, vcc
-; CGP-NEXT:    v_mul_lo_u32 v1, v14, v0
-; CGP-NEXT:    v_mul_lo_u32 v11, v13, v6
-; CGP-NEXT:    v_mul_hi_u32 v15, v13, v0
-; CGP-NEXT:    v_cndmask_b32_e32 v12, v12, v16, vcc
-; CGP-NEXT:    v_mul_hi_u32 v0, v14, v0
+; CGP-NEXT:    v_cmp_ge_u32_e64 s[4:5], v14, v4
+; CGP-NEXT:    v_sub_i32_e32 v16, vcc, v14, v4
+; CGP-NEXT:    v_cndmask_b32_e64 v1, 0, -1, s[4:5]
+; CGP-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v15
+; CGP-NEXT:    v_subbrev_u32_e32 v17, vcc, 0, v0, vcc
+; CGP-NEXT:    v_mov_b32_e32 v0, v5
+; CGP-NEXT:    v_cndmask_b32_e64 v13, -1, v1, s[4:5]
+; CGP-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v6, v9, v[0:1]
+; CGP-NEXT:    v_cmp_ge_u32_e32 vcc, v16, v4
+; CGP-NEXT:    v_cndmask_b32_e64 v18, 0, -1, vcc
+; CGP-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], -1, v8, v[0:1]
+; CGP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v17
+; CGP-NEXT:    v_cndmask_b32_e32 v5, -1, v18, vcc
+; CGP-NEXT:    v_mul_lo_u32 v19, v8, v0
+; CGP-NEXT:    v_sub_i32_e32 v1, vcc, v16, v4
+; CGP-NEXT:    v_subbrev_u32_e32 v18, vcc, 0, v17, vcc
+; CGP-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v5
+; CGP-NEXT:    v_cndmask_b32_e32 v5, v16, v1, vcc
+; CGP-NEXT:    v_cndmask_b32_e32 v16, v17, v18, vcc
+; CGP-NEXT:    v_add_i32_e32 v1, vcc, v10, v19
+; CGP-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
 ; CGP-NEXT:    v_add_i32_e32 v1, vcc, v1, v11
-; CGP-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v1, vcc, v1, v15
 ; CGP-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
-; CGP-NEXT:    v_mul_lo_u32 v15, v14, v6
-; CGP-NEXT:    v_add_i32_e32 v1, vcc, v11, v1
-; CGP-NEXT:    v_mul_hi_u32 v11, v13, v6
-; CGP-NEXT:    v_add_i32_e32 v0, vcc, v15, v0
-; CGP-NEXT:    v_cndmask_b32_e64 v15, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v0, vcc, v0, v11
+; CGP-NEXT:    v_mul_lo_u32 v11, v9, v0
+; CGP-NEXT:    v_add_i32_e32 v1, vcc, v10, v1
+; CGP-NEXT:    v_mul_hi_u32 v10, v8, v0
+; CGP-NEXT:    v_add_i32_e32 v11, vcc, v11, v12
+; CGP-NEXT:    v_cndmask_b32_e64 v12, 0, 1, vcc
+; CGP-NEXT:    v_add_i32_e32 v10, vcc, v11, v10
 ; CGP-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v11, vcc, v15, v11
-; CGP-NEXT:    v_mul_hi_u32 v6, v14, v6
-; CGP-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
-; CGP-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v1, vcc, v11, v1
-; CGP-NEXT:    v_add_i32_e32 v1, vcc, v6, v1
-; CGP-NEXT:    v_add_i32_e32 v11, vcc, v13, v0
-; CGP-NEXT:    v_addc_u32_e32 v13, vcc, v14, v1, vcc
-; CGP-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], s7, v11, 0
-; CGP-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v10
-; CGP-NEXT:    v_cndmask_b32_e32 v6, v8, v7, vcc
-; CGP-NEXT:    v_cndmask_b32_e32 v8, v9, v12, vcc
-; CGP-NEXT:    v_xor_b32_e32 v9, v6, v5
-; CGP-NEXT:    v_mad_u64_u32 v[6:7], s[4:5], s7, v13, v[1:2]
-; CGP-NEXT:    v_xor_b32_e32 v1, v8, v5
-; CGP-NEXT:    v_ashrrev_i32_e32 v8, 31, v3
-; CGP-NEXT:    v_mad_u64_u32 v[6:7], s[4:5], -1, v11, v[6:7]
-; CGP-NEXT:    v_add_i32_e32 v2, vcc, v2, v8
-; CGP-NEXT:    v_addc_u32_e32 v3, vcc, v3, v8, vcc
-; CGP-NEXT:    v_xor_b32_e32 v7, v2, v8
-; CGP-NEXT:    v_mul_lo_u32 v2, v13, v0
-; CGP-NEXT:    v_mul_lo_u32 v10, v11, v6
-; CGP-NEXT:    v_xor_b32_e32 v12, v3, v8
-; CGP-NEXT:    v_mul_hi_u32 v3, v11, v0
-; CGP-NEXT:    v_mul_hi_u32 v0, v13, v0
-; CGP-NEXT:    v_add_i32_e32 v2, vcc, v2, v10
+; CGP-NEXT:    v_add_i32_e32 v11, vcc, v12, v11
+; CGP-NEXT:    v_mul_hi_u32 v0, v9, v0
+; CGP-NEXT:    v_add_i32_e32 v1, vcc, v10, v1
 ; CGP-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
+; CGP-NEXT:    v_add_i32_e32 v10, vcc, v11, v10
+; CGP-NEXT:    v_add_i32_e32 v0, vcc, v0, v10
+; CGP-NEXT:    v_add_i32_e32 v8, vcc, v8, v1
+; CGP-NEXT:    v_addc_u32_e32 v9, vcc, v9, v0, vcc
+; CGP-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v6, v8, 0
+; CGP-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v13
+; CGP-NEXT:    v_cndmask_b32_e32 v5, v14, v5, vcc
+; CGP-NEXT:    v_xor_b32_e32 v11, v5, v7
+; CGP-NEXT:    v_mad_u64_u32 v[5:6], s[4:5], v6, v9, v[1:2]
+; CGP-NEXT:    v_cndmask_b32_e32 v10, v15, v16, vcc
+; CGP-NEXT:    v_xor_b32_e32 v1, v10, v7
+; CGP-NEXT:    v_mad_u64_u32 v[5:6], s[4:5], -1, v8, v[5:6]
+; CGP-NEXT:    v_ashrrev_i32_e32 v10, 31, v3
+; CGP-NEXT:    v_add_i32_e32 v2, vcc, v2, v10
+; CGP-NEXT:    v_addc_u32_e32 v3, vcc, v3, v10, vcc
+; CGP-NEXT:    v_xor_b32_e32 v12, v2, v10
+; CGP-NEXT:    v_mul_lo_u32 v2, v9, v0
+; CGP-NEXT:    v_mul_lo_u32 v6, v8, v5
+; CGP-NEXT:    v_xor_b32_e32 v13, v3, v10
+; CGP-NEXT:    v_mul_hi_u32 v3, v8, v0
+; CGP-NEXT:    v_mul_hi_u32 v0, v9, v0
+; CGP-NEXT:    v_add_i32_e32 v2, vcc, v2, v6
+; CGP-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc
 ; CGP-NEXT:    v_add_i32_e32 v2, vcc, v2, v3
 ; CGP-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
-; CGP-NEXT:    v_mul_lo_u32 v3, v13, v6
-; CGP-NEXT:    v_add_i32_e32 v2, vcc, v10, v2
-; CGP-NEXT:    v_mul_hi_u32 v10, v11, v6
+; CGP-NEXT:    v_mul_lo_u32 v3, v9, v5
+; CGP-NEXT:    v_add_i32_e32 v2, vcc, v6, v2
+; CGP-NEXT:    v_mul_hi_u32 v6, v8, v5
 ; CGP-NEXT:    v_add_i32_e32 v0, vcc, v3, v0
 ; CGP-NEXT:    v_cndmask_b32_e64 v3, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v0, vcc, v0, v10
-; CGP-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v3, vcc, v3, v10
-; CGP-NEXT:    v_mul_hi_u32 v6, v13, v6
+; CGP-NEXT:    v_add_i32_e32 v0, vcc, v0, v6
+; CGP-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc
+; CGP-NEXT:    v_add_i32_e32 v3, vcc, v3, v6
+; CGP-NEXT:    v_mul_hi_u32 v5, v9, v5
 ; CGP-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
 ; CGP-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
 ; CGP-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
-; CGP-NEXT:    v_add_i32_e32 v2, vcc, v6, v2
-; CGP-NEXT:    v_add_i32_e32 v3, vcc, v11, v0
-; CGP-NEXT:    v_addc_u32_e32 v2, vcc, v13, v2, vcc
-; CGP-NEXT:    v_mul_lo_u32 v6, v12, v3
-; CGP-NEXT:    v_mul_lo_u32 v10, v7, v2
-; CGP-NEXT:    v_sub_i32_e32 v0, vcc, v9, v5
-; CGP-NEXT:    v_subb_u32_e32 v1, vcc, v1, v5, vcc
-; CGP-NEXT:    v_mul_hi_u32 v5, v7, v3
-; CGP-NEXT:    v_add_i32_e32 v6, vcc, v6, v10
-; CGP-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v5, vcc, v6, v5
-; CGP-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
+; CGP-NEXT:    v_add_i32_e32 v2, vcc, v5, v2
+; CGP-NEXT:    v_add_i32_e32 v3, vcc, v8, v0
+; CGP-NEXT:    v_addc_u32_e32 v2, vcc, v9, v2, vcc
+; CGP-NEXT:    v_mul_lo_u32 v5, v13, v3
 ; CGP-NEXT:    v_mul_lo_u32 v6, v12, v2
-; CGP-NEXT:    v_mul_hi_u32 v3, v12, v3
-; CGP-NEXT:    v_add_i32_e32 v5, vcc, v9, v5
-; CGP-NEXT:    v_mul_hi_u32 v9, v7, v2
-; CGP-NEXT:    v_add_i32_e32 v3, vcc, v6, v3
+; CGP-NEXT:    v_sub_i32_e32 v0, vcc, v11, v7
+; CGP-NEXT:    v_subb_u32_e32 v1, vcc, v1, v7, vcc
+; CGP-NEXT:    v_mul_hi_u32 v7, v12, v3
+; CGP-NEXT:    v_add_i32_e32 v5, vcc, v5, v6
 ; CGP-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v3, vcc, v3, v9
-; CGP-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v6, vcc, v6, v9
+; CGP-NEXT:    v_add_i32_e32 v5, vcc, v5, v7
+; CGP-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
+; CGP-NEXT:    v_mul_lo_u32 v7, v13, v2
+; CGP-NEXT:    v_mul_hi_u32 v3, v13, v3
+; CGP-NEXT:    v_add_i32_e32 v5, vcc, v6, v5
+; CGP-NEXT:    v_mul_hi_u32 v6, v12, v2
+; CGP-NEXT:    v_add_i32_e32 v3, vcc, v7, v3
+; CGP-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
+; CGP-NEXT:    v_add_i32_e32 v3, vcc, v3, v6
+; CGP-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc
+; CGP-NEXT:    v_add_i32_e32 v6, vcc, v7, v6
 ; CGP-NEXT:    v_add_i32_e32 v3, vcc, v3, v5
-; CGP-NEXT:    v_mul_hi_u32 v9, v12, v2
-; CGP-NEXT:    v_mad_u64_u32 v[2:3], s[4:5], s6, v3, 0
+; CGP-NEXT:    v_mul_hi_u32 v7, v13, v2
+; CGP-NEXT:    v_mad_u64_u32 v[2:3], s[4:5], v4, v3, 0
 ; CGP-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
 ; CGP-NEXT:    v_add_i32_e32 v5, vcc, v6, v5
-; CGP-NEXT:    v_add_i32_e32 v5, vcc, v9, v5
-; CGP-NEXT:    v_mad_u64_u32 v[5:6], s[4:5], s6, v5, v[3:4]
-; CGP-NEXT:    v_sub_i32_e32 v2, vcc, v7, v2
-; CGP-NEXT:    v_subb_u32_e64 v3, s[4:5], v12, v5, vcc
-; CGP-NEXT:    v_sub_i32_e64 v5, s[4:5], v12, v5
+; CGP-NEXT:    v_add_i32_e32 v5, vcc, v7, v5
+; CGP-NEXT:    v_mad_u64_u32 v[5:6], s[4:5], v4, v5, v[3:4]
+; CGP-NEXT:    v_sub_i32_e32 v2, vcc, v12, v2
+; CGP-NEXT:    v_subb_u32_e64 v3, s[4:5], v13, v5, vcc
+; CGP-NEXT:    v_sub_i32_e64 v5, s[4:5], v13, v5
 ; CGP-NEXT:    v_subbrev_u32_e32 v5, vcc, 0, v5, vcc
 ; CGP-NEXT:    v_sub_i32_e32 v7, vcc, v2, v4
 ; CGP-NEXT:    v_subbrev_u32_e32 v5, vcc, 0, v5, vcc
 ; CGP-NEXT:    v_cmp_ge_u32_e32 vcc, v7, v4
-; CGP-NEXT:    v_cndmask_b32_e64 v9, 0, -1, vcc
+; CGP-NEXT:    v_cndmask_b32_e64 v8, 0, -1, vcc
 ; CGP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v5
 ; CGP-NEXT:    v_cmp_ge_u32_e64 s[4:5], v2, v4
-; CGP-NEXT:    v_cndmask_b32_e32 v9, -1, v9, vcc
+; CGP-NEXT:    v_cndmask_b32_e32 v8, -1, v8, vcc
 ; CGP-NEXT:    v_sub_i32_e32 v4, vcc, v7, v4
 ; CGP-NEXT:    v_cndmask_b32_e64 v6, 0, -1, s[4:5]
 ; CGP-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v3
-; CGP-NEXT:    v_subbrev_u32_e32 v10, vcc, 0, v5, vcc
+; CGP-NEXT:    v_subbrev_u32_e32 v9, vcc, 0, v5, vcc
 ; CGP-NEXT:    v_cndmask_b32_e64 v6, -1, v6, s[4:5]
-; CGP-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v9
+; CGP-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v8
 ; CGP-NEXT:    v_cndmask_b32_e32 v4, v7, v4, vcc
-; CGP-NEXT:    v_cndmask_b32_e32 v5, v5, v10, vcc
+; CGP-NEXT:    v_cndmask_b32_e32 v5, v5, v9, vcc
 ; CGP-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v6
 ; CGP-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
 ; CGP-NEXT:    v_cndmask_b32_e32 v3, v3, v5, vcc
-; CGP-NEXT:    v_xor_b32_e32 v2, v2, v8
-; CGP-NEXT:    v_xor_b32_e32 v3, v3, v8
-; CGP-NEXT:    v_sub_i32_e32 v2, vcc, v2, v8
-; CGP-NEXT:    v_subb_u32_e32 v3, vcc, v3, v8, vcc
+; CGP-NEXT:    v_xor_b32_e32 v2, v2, v10
+; CGP-NEXT:    v_xor_b32_e32 v3, v3, v10
+; CGP-NEXT:    v_sub_i32_e32 v2, vcc, v2, v10
+; CGP-NEXT:    v_subb_u32_e32 v3, vcc, v3, v10, vcc
 ; CGP-NEXT:    s_setpc_b64 s[30:31]
   %result = srem <2 x i64> %num, <i64 1235195, i64 1235195>
   ret <2 x i64> %result
@@ -2403,191 +2351,192 @@ define <2 x i64> @v_srem_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) {
 ; GISEL-LABEL: v_srem_v2i64_pow2_shl_denom:
 ; GISEL:       ; %bb.0:
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT:    s_mov_b64 s[4:5], 0x1000
-; GISEL-NEXT:    v_lshl_b64 v[4:5], s[4:5], v4
+; GISEL-NEXT:    v_mov_b32_e32 v8, 0x1000
+; GISEL-NEXT:    v_mov_b32_e32 v9, 0
+; GISEL-NEXT:    v_lshl_b64 v[4:5], v[8:9], v4
+; GISEL-NEXT:    v_lshl_b64 v[8:9], v[8:9], v6
 ; GISEL-NEXT:    v_ashrrev_i32_e32 v7, 31, v5
 ; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v4, v7
-; GISEL-NEXT:    v_addc_u32_e32 v8, vcc, v5, v7, vcc
+; GISEL-NEXT:    v_addc_u32_e32 v10, vcc, v5, v7, vcc
 ; GISEL-NEXT:    v_xor_b32_e32 v5, v4, v7
-; GISEL-NEXT:    v_xor_b32_e32 v7, v8, v7
+; GISEL-NEXT:    v_xor_b32_e32 v7, v10, v7
 ; GISEL-NEXT:    v_cvt_f32_u32_e32 v4, v5
-; GISEL-NEXT:    v_cvt_f32_u32_e32 v8, v7
-; GISEL-NEXT:    v_sub_i32_e32 v12, vcc, 0, v5
-; GISEL-NEXT:    v_subb_u32_e32 v13, vcc, 0, v7, vcc
-; GISEL-NEXT:    v_mac_f32_e32 v4, 0x4f800000, v8
+; GISEL-NEXT:    v_cvt_f32_u32_e32 v10, v7
+; GISEL-NEXT:    v_sub_i32_e32 v14, vcc, 0, v5
+; GISEL-NEXT:    v_subb_u32_e32 v15, vcc, 0, v7, vcc
+; GISEL-NEXT:    v_mac_f32_e32 v4, 0x4f800000, v10
 ; GISEL-NEXT:    v_rcp_iflag_f32_e32 v4, v4
 ; GISEL-NEXT:    v_mul_f32_e32 v4, 0x5f7ffffc, v4
-; GISEL-NEXT:    v_mul_f32_e32 v8, 0x2f800000, v4
-; GISEL-NEXT:    v_trunc_f32_e32 v10, v8
-; GISEL-NEXT:    v_mac_f32_e32 v4, 0xcf800000, v10
-; GISEL-NEXT:    v_cvt_u32_f32_e32 v11, v4
-; GISEL-NEXT:    v_cvt_u32_f32_e32 v14, v10
-; GISEL-NEXT:    v_mad_u64_u32 v[8:9], s[6:7], v12, v11, 0
-; GISEL-NEXT:    v_mov_b32_e32 v4, v9
-; GISEL-NEXT:    v_mad_u64_u32 v[9:10], s[6:7], v12, v14, v[4:5]
-; GISEL-NEXT:    v_mul_lo_u32 v4, v14, v8
-; GISEL-NEXT:    v_mul_hi_u32 v15, v11, v8
-; GISEL-NEXT:    v_mad_u64_u32 v[9:10], s[6:7], v13, v11, v[9:10]
-; GISEL-NEXT:    v_mul_hi_u32 v8, v14, v8
-; GISEL-NEXT:    v_mul_lo_u32 v10, v11, v9
-; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v4, v10
-; GISEL-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v4, v15
+; GISEL-NEXT:    v_mul_f32_e32 v10, 0x2f800000, v4
+; GISEL-NEXT:    v_trunc_f32_e32 v12, v10
+; GISEL-NEXT:    v_mac_f32_e32 v4, 0xcf800000, v12
+; GISEL-NEXT:    v_cvt_u32_f32_e32 v13, v4
+; GISEL-NEXT:    v_cvt_u32_f32_e32 v16, v12
+; GISEL-NEXT:    v_mad_u64_u32 v[10:11], s[4:5], v14, v13, 0
+; GISEL-NEXT:    v_mov_b32_e32 v4, v11
+; GISEL-NEXT:    v_mad_u64_u32 v[11:12], s[4:5], v14, v16, v[4:5]
+; GISEL-NEXT:    v_mul_lo_u32 v4, v16, v10
+; GISEL-NEXT:    v_mul_hi_u32 v17, v13, v10
+; GISEL-NEXT:    v_mad_u64_u32 v[11:12], s[4:5], v15, v13, v[11:12]
+; GISEL-NEXT:    v_mul_hi_u32 v10, v16, v10
+; GISEL-NEXT:    v_mul_lo_u32 v12, v13, v11
+; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v4, v12
+; GISEL-NEXT:    v_cndmask_b32_e64 v12, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v4, v17
 ; GISEL-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc
-; GISEL-NEXT:    v_mul_lo_u32 v15, v14, v9
+; GISEL-NEXT:    v_mul_lo_u32 v17, v16, v11
+; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v12, v4
+; GISEL-NEXT:    v_mul_hi_u32 v12, v13, v11
+; GISEL-NEXT:    v_add_i32_e32 v10, vcc, v17, v10
+; GISEL-NEXT:    v_cndmask_b32_e64 v17, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v10, vcc, v10, v12
+; GISEL-NEXT:    v_cndmask_b32_e64 v12, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v12, vcc, v17, v12
+; GISEL-NEXT:    v_mul_hi_u32 v11, v16, v11
 ; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v10, v4
-; GISEL-NEXT:    v_mul_hi_u32 v10, v11, v9
-; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v15, v8
-; GISEL-NEXT:    v_cndmask_b32_e64 v15, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v8, v10
 ; GISEL-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v10, vcc, v15, v10
-; GISEL-NEXT:    v_mul_hi_u32 v9, v14, v9
-; GISEL-NEXT:    v_add_i32_e32 v4, vcc, v8, v4
-; GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v10, v8
-; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v9, v8
-; GISEL-NEXT:    v_add_i32_e32 v11, vcc, v11, v4
-; GISEL-NEXT:    v_addc_u32_e32 v14, vcc, v14, v8, vcc
-; GISEL-NEXT:    v_mad_u64_u32 v[8:9], s[6:7], v12, v11, 0
-; GISEL-NEXT:    v_mov_b32_e32 v4, v9
-; GISEL-NEXT:    v_mad_u64_u32 v[9:10], s[6:7], v12, v14, v[4:5]
+; GISEL-NEXT:    v_add_i32_e32 v10, vcc, v12, v10
+; GISEL-NEXT:    v_add_i32_e32 v10, vcc, v11, v10
+; GISEL-NEXT:    v_add_i32_e32 v13, vcc, v13, v4
+; GISEL-NEXT:    v_addc_u32_e32 v16, vcc, v16, v10, vcc
+; GISEL-NEXT:    v_mad_u64_u32 v[10:11], s[4:5], v14, v13, 0
+; GISEL-NEXT:    v_mov_b32_e32 v4, v11
+; GISEL-NEXT:    v_mad_u64_u32 v[11:12], s[4:5], v14, v16, v[4:5]
 ; GISEL-NEXT:    v_ashrrev_i32_e32 v4, 31, v1
 ; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v0, v4
-; GISEL-NEXT:    v_mad_u64_u32 v[9:10], s[6:7], v13, v11, v[9:10]
+; GISEL-NEXT:    v_mad_u64_u32 v[11:12], s[4:5], v15, v13, v[11:12]
 ; GISEL-NEXT:    v_addc_u32_e32 v1, vcc, v1, v4, vcc
 ; GISEL-NEXT:    v_xor_b32_e32 v12, v0, v4
-; GISEL-NEXT:    v_mul_lo_u32 v0, v14, v8
-; GISEL-NEXT:    v_mul_lo_u32 v10, v11, v9
-; GISEL-NEXT:    v_xor_b32_e32 v13, v1, v4
-; GISEL-NEXT:    v_mul_hi_u32 v1, v11, v8
-; GISEL-NEXT:    v_mul_hi_u32 v8, v14, v8
-; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v0, v10
-; GISEL-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
+; GISEL-NEXT:    v_mul_lo_u32 v0, v16, v10
+; GISEL-NEXT:    v_mul_lo_u32 v14, v13, v11
+; GISEL-NEXT:    v_xor_b32_e32 v15, v1, v4
+; GISEL-NEXT:    v_mul_hi_u32 v1, v13, v10
+; GISEL-NEXT:    v_mul_hi_u32 v10, v16, v10
+; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v0, v14
+; GISEL-NEXT:    v_cndmask_b32_e64 v14, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
 ; GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
-; GISEL-NEXT:    v_mul_lo_u32 v1, v14, v9
-; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v10, v0
-; GISEL-NEXT:    v_mul_hi_u32 v10, v11, v9
-; GISEL-NEXT:    v_add_i32_e32 v1, vcc, v1, v8
-; GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
+; GISEL-NEXT:    v_mul_lo_u32 v1, v16, v11
+; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v14, v0
+; GISEL-NEXT:    v_mul_hi_u32 v14, v13, v11
 ; GISEL-NEXT:    v_add_i32_e32 v1, vcc, v1, v10
 ; GISEL-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v8, v10
-; GISEL-NEXT:    v_mul_hi_u32 v9, v14, v9
+; GISEL-NEXT:    v_add_i32_e32 v1, vcc, v1, v14
+; GISEL-NEXT:    v_cndmask_b32_e64 v14, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v10, vcc, v10, v14
+; GISEL-NEXT:    v_mul_hi_u32 v11, v16, v11
 ; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v1, v0
 ; GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v1, vcc, v8, v1
-; GISEL-NEXT:    v_add_i32_e32 v1, vcc, v9, v1
-; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v11, v0
-; GISEL-NEXT:    v_addc_u32_e32 v1, vcc, v14, v1, vcc
-; GISEL-NEXT:    v_mul_lo_u32 v8, v13, v0
-; GISEL-NEXT:    v_mul_lo_u32 v9, v12, v1
-; GISEL-NEXT:    v_mul_hi_u32 v10, v12, v0
-; GISEL-NEXT:    v_mul_hi_u32 v0, v13, v0
-; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v8, v9
-; GISEL-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v8, v10
-; GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
-; GISEL-NEXT:    v_mul_lo_u32 v10, v13, v1
-; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v9, v8
-; GISEL-NEXT:    v_mul_hi_u32 v9, v12, v1
-; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v10, v0
+; GISEL-NEXT:    v_add_i32_e32 v1, vcc, v10, v1
+; GISEL-NEXT:    v_add_i32_e32 v1, vcc, v11, v1
+; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v13, v0
+; GISEL-NEXT:    v_addc_u32_e32 v1, vcc, v16, v1, vcc
+; GISEL-NEXT:    v_mul_lo_u32 v10, v15, v0
+; GISEL-NEXT:    v_mul_lo_u32 v11, v12, v1
+; GISEL-NEXT:    v_mul_hi_u32 v13, v12, v0
+; GISEL-NEXT:    v_mul_hi_u32 v0, v15, v0
+; GISEL-NEXT:    v_mul_hi_u32 v14, v15, v1
+; GISEL-NEXT:    v_add_i32_e32 v10, vcc, v10, v11
+; GISEL-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v10, vcc, v10, v13
 ; GISEL-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v0, v9
-; GISEL-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v10, v9
-; GISEL-NEXT:    v_add_i32_e32 v14, vcc, v0, v8
-; GISEL-NEXT:    v_mul_hi_u32 v10, v13, v1
-; GISEL-NEXT:    v_mad_u64_u32 v[0:1], s[6:7], v5, v14, 0
-; GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v9, v8
-; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v10, v8
-; GISEL-NEXT:    v_mad_u64_u32 v[8:9], s[6:7], v5, v8, v[1:2]
-; GISEL-NEXT:    v_lshl_b64 v[10:11], s[4:5], v6
-; GISEL-NEXT:    v_sub_i32_e32 v12, vcc, v12, v0
-; GISEL-NEXT:    v_mad_u64_u32 v[8:9], s[4:5], v7, v14, v[8:9]
-; GISEL-NEXT:    v_subb_u32_e64 v14, s[4:5], v13, v8, vcc
-; GISEL-NEXT:    v_sub_i32_e64 v0, s[4:5], v13, v8
-; GISEL-NEXT:    v_cmp_ge_u32_e64 s[4:5], v14, v7
+; GISEL-NEXT:    v_mul_lo_u32 v13, v15, v1
+; GISEL-NEXT:    v_add_i32_e32 v10, vcc, v11, v10
+; GISEL-NEXT:    v_mul_hi_u32 v11, v12, v1
+; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v13, v0
+; GISEL-NEXT:    v_cndmask_b32_e64 v13, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v0, v11
+; GISEL-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v11, vcc, v13, v11
+; GISEL-NEXT:    v_add_i32_e32 v13, vcc, v0, v10
+; GISEL-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v5, v13, 0
+; GISEL-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v10, vcc, v11, v10
+; GISEL-NEXT:    v_add_i32_e32 v10, vcc, v14, v10
+; GISEL-NEXT:    v_mad_u64_u32 v[10:11], s[4:5], v5, v10, v[1:2]
+; GISEL-NEXT:    v_mad_u64_u32 v[10:11], s[4:5], v7, v13, v[10:11]
+; GISEL-NEXT:    v_sub_i32_e32 v11, vcc, v12, v0
+; GISEL-NEXT:    v_subb_u32_e64 v12, s[4:5], v15, v10, vcc
+; GISEL-NEXT:    v_sub_i32_e64 v0, s[4:5], v15, v10
+; GISEL-NEXT:    v_cmp_ge_u32_e64 s[4:5], v12, v7
 ; GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, -1, s[4:5]
-; GISEL-NEXT:    v_cmp_ge_u32_e64 s[4:5], v12, v5
+; GISEL-NEXT:    v_cmp_ge_u32_e64 s[4:5], v11, v5
 ; GISEL-NEXT:    v_cndmask_b32_e64 v6, 0, -1, s[4:5]
-; GISEL-NEXT:    v_cmp_eq_u32_e64 s[4:5], v14, v7
-; GISEL-NEXT:    v_subb_u32_e32 v9, vcc, v0, v7, vcc
-; GISEL-NEXT:    v_ashrrev_i32_e32 v0, 31, v11
+; GISEL-NEXT:    v_cmp_eq_u32_e64 s[4:5], v12, v7
+; GISEL-NEXT:    v_subb_u32_e32 v10, vcc, v0, v7, vcc
+; GISEL-NEXT:    v_ashrrev_i32_e32 v0, 31, v9
 ; GISEL-NEXT:    v_cndmask_b32_e64 v13, v1, v6, s[4:5]
-; GISEL-NEXT:    v_add_i32_e32 v1, vcc, v10, v0
-; GISEL-NEXT:    v_addc_u32_e32 v8, vcc, v11, v0, vcc
+; GISEL-NEXT:    v_add_i32_e32 v1, vcc, v8, v0
+; GISEL-NEXT:    v_addc_u32_e32 v8, vcc, v9, v0, vcc
 ; GISEL-NEXT:    v_xor_b32_e32 v6, v1, v0
 ; GISEL-NEXT:    v_xor_b32_e32 v8, v8, v0
 ; GISEL-NEXT:    v_cvt_f32_u32_e32 v0, v6
 ; GISEL-NEXT:    v_cvt_f32_u32_e32 v1, v8
-; GISEL-NEXT:    v_sub_i32_e32 v11, vcc, v12, v5
-; GISEL-NEXT:    v_subbrev_u32_e64 v15, s[4:5], 0, v9, vcc
+; GISEL-NEXT:    v_sub_i32_e32 v14, vcc, v11, v5
+; GISEL-NEXT:    v_subbrev_u32_e64 v15, s[4:5], 0, v10, vcc
 ; GISEL-NEXT:    v_mac_f32_e32 v0, 0x4f800000, v1
 ; GISEL-NEXT:    v_rcp_iflag_f32_e32 v0, v0
 ; GISEL-NEXT:    v_cmp_ge_u32_e64 s[4:5], v15, v7
-; GISEL-NEXT:    v_cndmask_b32_e64 v10, 0, -1, s[4:5]
-; GISEL-NEXT:    v_cmp_ge_u32_e64 s[4:5], v11, v5
+; GISEL-NEXT:    v_cndmask_b32_e64 v9, 0, -1, s[4:5]
+; GISEL-NEXT:    v_cmp_ge_u32_e64 s[4:5], v14, v5
 ; GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, -1, s[4:5]
 ; GISEL-NEXT:    v_cmp_eq_u32_e64 s[4:5], v15, v7
 ; GISEL-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
-; GISEL-NEXT:    v_cndmask_b32_e64 v16, v10, v1, s[4:5]
+; GISEL-NEXT:    v_cndmask_b32_e64 v16, v9, v1, s[4:5]
 ; GISEL-NEXT:    v_mul_f32_e32 v1, 0x2f800000, v0
-; GISEL-NEXT:    v_trunc_f32_e32 v10, v1
-; GISEL-NEXT:    v_mac_f32_e32 v0, 0xcf800000, v10
+; GISEL-NEXT:    v_trunc_f32_e32 v9, v1
+; GISEL-NEXT:    v_mac_f32_e32 v0, 0xcf800000, v9
 ; GISEL-NEXT:    v_cvt_u32_f32_e32 v17, v0
 ; GISEL-NEXT:    v_sub_i32_e64 v18, s[4:5], 0, v6
 ; GISEL-NEXT:    v_subb_u32_e64 v19, s[4:5], 0, v8, s[4:5]
 ; GISEL-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v18, v17, 0
-; GISEL-NEXT:    v_cvt_u32_f32_e32 v20, v10
-; GISEL-NEXT:    v_subb_u32_e32 v7, vcc, v9, v7, vcc
+; GISEL-NEXT:    v_cvt_u32_f32_e32 v20, v9
+; GISEL-NEXT:    v_subb_u32_e32 v7, vcc, v10, v7, vcc
 ; GISEL-NEXT:    v_mad_u64_u32 v[9:10], s[4:5], v18, v20, v[1:2]
-; GISEL-NEXT:    v_sub_i32_e32 v1, vcc, v11, v5
+; GISEL-NEXT:    v_sub_i32_e32 v1, vcc, v14, v5
 ; GISEL-NEXT:    v_mad_u64_u32 v[9:10], s[4:5], v19, v17, v[9:10]
 ; GISEL-NEXT:    v_subbrev_u32_e32 v5, vcc, 0, v7, vcc
 ; GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v16
-; GISEL-NEXT:    v_cndmask_b32_e32 v7, v11, v1, vcc
+; GISEL-NEXT:    v_cndmask_b32_e32 v7, v14, v1, vcc
 ; GISEL-NEXT:    v_mul_lo_u32 v1, v20, v0
 ; GISEL-NEXT:    v_mul_lo_u32 v10, v17, v9
-; GISEL-NEXT:    v_mul_hi_u32 v11, v17, v0
+; GISEL-NEXT:    v_mul_hi_u32 v14, v17, v0
 ; GISEL-NEXT:    v_cndmask_b32_e32 v5, v15, v5, vcc
 ; GISEL-NEXT:    v_mul_hi_u32 v0, v20, v0
 ; GISEL-NEXT:    v_add_i32_e32 v1, vcc, v1, v10
 ; GISEL-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v1, vcc, v1, v11
+; GISEL-NEXT:    v_add_i32_e32 v1, vcc, v1, v14
 ; GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
-; GISEL-NEXT:    v_mul_lo_u32 v11, v20, v9
+; GISEL-NEXT:    v_mul_lo_u32 v14, v20, v9
 ; GISEL-NEXT:    v_add_i32_e32 v1, vcc, v10, v1
 ; GISEL-NEXT:    v_mul_hi_u32 v10, v17, v9
-; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v11, v0
-; GISEL-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v14, v0
+; GISEL-NEXT:    v_cndmask_b32_e64 v14, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v0, v10
 ; GISEL-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v10, vcc, v11, v10
+; GISEL-NEXT:    v_add_i32_e32 v10, vcc, v14, v10
 ; GISEL-NEXT:    v_mul_hi_u32 v9, v20, v9
 ; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v0, v1
 ; GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v1, vcc, v10, v1
 ; GISEL-NEXT:    v_add_i32_e32 v1, vcc, v9, v1
-; GISEL-NEXT:    v_add_i32_e32 v11, vcc, v17, v0
+; GISEL-NEXT:    v_add_i32_e32 v14, vcc, v17, v0
 ; GISEL-NEXT:    v_addc_u32_e32 v15, vcc, v20, v1, vcc
-; GISEL-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v18, v11, 0
+; GISEL-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v18, v14, 0
 ; GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v13
-; GISEL-NEXT:    v_cndmask_b32_e32 v7, v12, v7, vcc
+; GISEL-NEXT:    v_cndmask_b32_e32 v7, v11, v7, vcc
 ; GISEL-NEXT:    v_mad_u64_u32 v[9:10], s[4:5], v18, v15, v[1:2]
 ; GISEL-NEXT:    v_xor_b32_e32 v1, v7, v4
 ; GISEL-NEXT:    v_ashrrev_i32_e32 v7, 31, v3
-; GISEL-NEXT:    v_mad_u64_u32 v[9:10], s[4:5], v19, v11, v[9:10]
-; GISEL-NEXT:    v_cndmask_b32_e32 v5, v14, v5, vcc
+; GISEL-NEXT:    v_mad_u64_u32 v[9:10], s[4:5], v19, v14, v[9:10]
+; GISEL-NEXT:    v_cndmask_b32_e32 v5, v12, v5, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v2, vcc, v2, v7
 ; GISEL-NEXT:    v_addc_u32_e32 v3, vcc, v3, v7, vcc
-; GISEL-NEXT:    v_xor_b32_e32 v12, v2, v7
+; GISEL-NEXT:    v_xor_b32_e32 v11, v2, v7
 ; GISEL-NEXT:    v_mul_lo_u32 v2, v15, v0
-; GISEL-NEXT:    v_mul_lo_u32 v10, v11, v9
-; GISEL-NEXT:    v_xor_b32_e32 v13, v3, v7
-; GISEL-NEXT:    v_mul_hi_u32 v3, v11, v0
+; GISEL-NEXT:    v_mul_lo_u32 v10, v14, v9
+; GISEL-NEXT:    v_xor_b32_e32 v12, v3, v7
+; GISEL-NEXT:    v_mul_hi_u32 v3, v14, v0
 ; GISEL-NEXT:    v_mul_hi_u32 v0, v15, v0
 ; GISEL-NEXT:    v_add_i32_e32 v2, vcc, v2, v10
 ; GISEL-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
@@ -2595,7 +2544,7 @@ define <2 x i64> @v_srem_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) {
 ; GISEL-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
 ; GISEL-NEXT:    v_mul_lo_u32 v3, v15, v9
 ; GISEL-NEXT:    v_add_i32_e32 v2, vcc, v10, v2
-; GISEL-NEXT:    v_mul_hi_u32 v10, v11, v9
+; GISEL-NEXT:    v_mul_hi_u32 v10, v14, v9
 ; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v3, v0
 ; GISEL-NEXT:    v_cndmask_b32_e64 v3, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v0, v10
@@ -2606,28 +2555,28 @@ define <2 x i64> @v_srem_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) {
 ; GISEL-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
 ; GISEL-NEXT:    v_add_i32_e32 v2, vcc, v9, v2
-; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v11, v0
+; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v14, v0
 ; GISEL-NEXT:    v_addc_u32_e32 v2, vcc, v15, v2, vcc
-; GISEL-NEXT:    v_mul_lo_u32 v3, v13, v0
-; GISEL-NEXT:    v_mul_lo_u32 v9, v12, v2
-; GISEL-NEXT:    v_mul_hi_u32 v10, v12, v0
-; GISEL-NEXT:    v_mul_hi_u32 v0, v13, v0
+; GISEL-NEXT:    v_mul_lo_u32 v3, v12, v0
+; GISEL-NEXT:    v_mul_lo_u32 v9, v11, v2
+; GISEL-NEXT:    v_mul_hi_u32 v10, v11, v0
+; GISEL-NEXT:    v_mul_hi_u32 v0, v12, v0
 ; GISEL-NEXT:    v_xor_b32_e32 v5, v5, v4
 ; GISEL-NEXT:    v_add_i32_e32 v3, vcc, v3, v9
 ; GISEL-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v3, vcc, v3, v10
 ; GISEL-NEXT:    v_cndmask_b32_e64 v3, 0, 1, vcc
-; GISEL-NEXT:    v_mul_lo_u32 v10, v13, v2
+; GISEL-NEXT:    v_mul_lo_u32 v10, v12, v2
 ; GISEL-NEXT:    v_add_i32_e32 v3, vcc, v9, v3
-; GISEL-NEXT:    v_mul_hi_u32 v9, v12, v2
+; GISEL-NEXT:    v_mul_hi_u32 v9, v11, v2
 ; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v10, v0
 ; GISEL-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v0, v9
 ; GISEL-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v10, v9
-; GISEL-NEXT:    v_add_i32_e32 v11, vcc, v0, v3
-; GISEL-NEXT:    v_mul_hi_u32 v10, v13, v2
-; GISEL-NEXT:    v_mad_u64_u32 v[2:3], s[4:5], v6, v11, 0
+; GISEL-NEXT:    v_add_i32_e32 v13, vcc, v0, v3
+; GISEL-NEXT:    v_mul_hi_u32 v10, v12, v2
+; GISEL-NEXT:    v_mad_u64_u32 v[2:3], s[4:5], v6, v13, 0
 ; GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v0, vcc, v9, v0
 ; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v10, v0
@@ -2635,10 +2584,10 @@ define <2 x i64> @v_srem_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) {
 ; GISEL-NEXT:    v_mad_u64_u32 v[9:10], s[4:5], v6, v9, v[0:1]
 ; GISEL-NEXT:    v_sub_i32_e32 v0, vcc, v1, v4
 ; GISEL-NEXT:    v_subb_u32_e32 v1, vcc, v5, v4, vcc
-; GISEL-NEXT:    v_mad_u64_u32 v[3:4], s[4:5], v8, v11, v[9:10]
-; GISEL-NEXT:    v_sub_i32_e32 v2, vcc, v12, v2
-; GISEL-NEXT:    v_subb_u32_e64 v4, s[4:5], v13, v3, vcc
-; GISEL-NEXT:    v_sub_i32_e64 v3, s[4:5], v13, v3
+; GISEL-NEXT:    v_mad_u64_u32 v[3:4], s[4:5], v8, v13, v[9:10]
+; GISEL-NEXT:    v_sub_i32_e32 v2, vcc, v11, v2
+; GISEL-NEXT:    v_subb_u32_e64 v4, s[4:5], v12, v3, vcc
+; GISEL-NEXT:    v_sub_i32_e64 v3, s[4:5], v12, v3
 ; GISEL-NEXT:    v_cmp_ge_u32_e64 s[4:5], v4, v8
 ; GISEL-NEXT:    v_cndmask_b32_e64 v5, 0, -1, s[4:5]
 ; GISEL-NEXT:    v_cmp_ge_u32_e64 s[4:5], v2, v6
@@ -2672,17 +2621,16 @@ define <2 x i64> @v_srem_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) {
 ; CGP-LABEL: v_srem_v2i64_pow2_shl_denom:
 ; CGP:       ; %bb.0:
 ; CGP-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CGP-NEXT:    s_mov_b64 s[4:5], 0x1000
-; CGP-NEXT:    v_lshl_b64 v[11:12], s[4:5], v4
+; CGP-NEXT:    v_mov_b32_e32 v5, v2
+; CGP-NEXT:    v_mov_b32_e32 v7, v3
+; CGP-NEXT:    v_mov_b32_e32 v2, 0x1000
+; CGP-NEXT:    v_mov_b32_e32 v3, 0
+; CGP-NEXT:    v_lshl_b64 v[11:12], v[2:3], v4
 ; CGP-NEXT:    v_mov_b32_e32 v9, v1
 ; CGP-NEXT:    v_mov_b32_e32 v8, v0
 ; CGP-NEXT:    v_or_b32_e32 v1, v9, v12
 ; CGP-NEXT:    v_mov_b32_e32 v0, 0
-; CGP-NEXT:    v_mov_b32_e32 v5, v2
-; CGP-NEXT:    v_mov_b32_e32 v7, v3
-; CGP-NEXT:    v_mov_b32_e32 v2, 0x1000
 ; CGP-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[0:1]
-; CGP-NEXT:    v_mov_b32_e32 v3, 0
 ; CGP-NEXT:    ; implicit-def: $vgpr0_vgpr1
 ; CGP-NEXT:    s_and_saveexec_b64 s[4:5], vcc
 ; CGP-NEXT:    s_xor_b64 s[6:7], exec, s[4:5]
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll
index 65455d754be4f53..61e1e67b7ae360e 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll
@@ -231,23 +231,21 @@ define i16 @v_ssubsat_v2i8(i16 %lhs.arg, i16 %rhs.arg) {
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v2, 8, v0
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 24, v0
-; GFX6-NEXT:    s_brev_b32 s4, -2
 ; GFX6-NEXT:    v_max_i32_e32 v4, -1, v0
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v3, 8, v1
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 24, v1
-; GFX6-NEXT:    s_brev_b32 s5, 1
-; GFX6-NEXT:    v_subrev_i32_e32 v4, vcc, s4, v4
+; GFX6-NEXT:    v_subrev_i32_e32 v4, vcc, 0x7fffffff, v4
 ; GFX6-NEXT:    v_min_i32_e32 v5, -1, v0
-; GFX6-NEXT:    v_subrev_i32_e32 v5, vcc, s5, v5
+; GFX6-NEXT:    v_subrev_i32_e32 v5, vcc, 0x80000000, v5
 ; GFX6-NEXT:    v_max_i32_e32 v1, v4, v1
 ; GFX6-NEXT:    v_min_i32_e32 v1, v1, v5
 ; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, v0, v1
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 24, v2
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 24, v3
 ; GFX6-NEXT:    v_max_i32_e32 v3, -1, v1
-; GFX6-NEXT:    v_subrev_i32_e32 v3, vcc, s4, v3
+; GFX6-NEXT:    v_subrev_i32_e32 v3, vcc, 0x7fffffff, v3
 ; GFX6-NEXT:    v_min_i32_e32 v4, -1, v1
-; GFX6-NEXT:    v_subrev_i32_e32 v4, vcc, s5, v4
+; GFX6-NEXT:    v_subrev_i32_e32 v4, vcc, 0x80000000, v4
 ; GFX6-NEXT:    v_max_i32_e32 v2, v3, v2
 ; GFX6-NEXT:    v_min_i32_e32 v2, v2, v4
 ; GFX6-NEXT:    v_sub_i32_e32 v1, vcc, v1, v2
@@ -300,8 +298,8 @@ define i16 @v_ssubsat_v2i8(i16 %lhs.arg, i16 %rhs.arg) {
 ; GFX9-NEXT:    v_pk_lshlrev_b16 v1, 8, v1 op_sel_hi:[0,1]
 ; GFX9-NEXT:    v_pk_sub_i16 v0, v0, v1 clamp
 ; GFX9-NEXT:    v_pk_ashrrev_i16 v0, 8, v0 op_sel_hi:[0,1]
-; GFX9-NEXT:    s_movk_i32 s4, 0xff
-; GFX9-NEXT:    v_and_b32_sdwa v1, v0, s4 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT:    v_mov_b32_e32 v1, 0xff
+; GFX9-NEXT:    v_and_b32_sdwa v1, v0, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; GFX9-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -312,14 +310,14 @@ define i16 @v_ssubsat_v2i8(i16 %lhs.arg, i16 %rhs.arg) {
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v3, 8, v1
 ; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff, v1
-; GFX10-NEXT:    s_movk_i32 s4, 0xff
 ; GFX10-NEXT:    v_lshl_or_b32 v0, v2, 16, v0
 ; GFX10-NEXT:    v_lshl_or_b32 v1, v3, 16, v1
 ; GFX10-NEXT:    v_pk_lshlrev_b16 v0, 8, v0 op_sel_hi:[0,1]
 ; GFX10-NEXT:    v_pk_lshlrev_b16 v1, 8, v1 op_sel_hi:[0,1]
 ; GFX10-NEXT:    v_pk_sub_i16 v0, v0, v1 clamp
+; GFX10-NEXT:    v_mov_b32_e32 v1, 0xff
 ; GFX10-NEXT:    v_pk_ashrrev_i16 v0, 8, v0 op_sel_hi:[0,1]
-; GFX10-NEXT:    v_and_b32_sdwa v1, v0, s4 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX10-NEXT:    v_and_b32_sdwa v1, v0, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; GFX10-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -440,8 +438,8 @@ define amdgpu_ps i16 @s_ssubsat_v2i8(i16 inreg %lhs.arg, i16 inreg %rhs.arg) {
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s1
 ; GFX9-NEXT:    v_pk_sub_i16 v0, s0, v0 clamp
 ; GFX9-NEXT:    v_pk_ashrrev_i16 v0, 8, v0 op_sel_hi:[0,1]
-; GFX9-NEXT:    s_movk_i32 s0, 0xff
-; GFX9-NEXT:    v_and_b32_sdwa v1, v0, s0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT:    v_mov_b32_e32 v1, 0xff
+; GFX9-NEXT:    v_and_b32_sdwa v1, v0, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; GFX9-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX9-NEXT:    v_readfirstlane_b32 s0, v0
 ; GFX9-NEXT:    ; return to shader part epilog
@@ -460,10 +458,10 @@ define amdgpu_ps i16 @s_ssubsat_v2i8(i16 inreg %lhs.arg, i16 inreg %rhs.arg) {
 ; GFX10-NEXT:    s_lshl_b32 s3, s3, 8
 ; GFX10-NEXT:    s_pack_ll_b32_b16 s0, s0, s2
 ; GFX10-NEXT:    s_pack_ll_b32_b16 s1, s1, s3
+; GFX10-NEXT:    v_mov_b32_e32 v1, 0xff
 ; GFX10-NEXT:    v_pk_sub_i16 v0, s0, s1 clamp
-; GFX10-NEXT:    s_movk_i32 s0, 0xff
 ; GFX10-NEXT:    v_pk_ashrrev_i16 v0, 8, v0 op_sel_hi:[0,1]
-; GFX10-NEXT:    v_and_b32_sdwa v1, v0, s0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX10-NEXT:    v_and_b32_sdwa v1, v0, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; GFX10-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX10-NEXT:    v_readfirstlane_b32 s0, v0
 ; GFX10-NEXT:    ; return to shader part epilog
@@ -506,41 +504,39 @@ define i32 @v_ssubsat_v4i8(i32 %lhs.arg, i32 %rhs.arg) {
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v4, 24, v0
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 24, v0
-; GFX6-NEXT:    s_brev_b32 s4, -2
 ; GFX6-NEXT:    v_max_i32_e32 v8, -1, v0
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v5, 8, v1
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v6, 16, v1
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v7, 24, v1
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 24, v1
-; GFX6-NEXT:    s_brev_b32 s5, 1
-; GFX6-NEXT:    v_subrev_i32_e32 v8, vcc, s4, v8
+; GFX6-NEXT:    v_subrev_i32_e32 v8, vcc, 0x7fffffff, v8
 ; GFX6-NEXT:    v_min_i32_e32 v10, -1, v0
-; GFX6-NEXT:    v_subrev_i32_e32 v10, vcc, s5, v10
+; GFX6-NEXT:    v_bfrev_b32_e32 v11, 1
+; GFX6-NEXT:    v_sub_i32_e32 v10, vcc, v10, v11
 ; GFX6-NEXT:    v_max_i32_e32 v1, v8, v1
 ; GFX6-NEXT:    v_min_i32_e32 v1, v1, v10
 ; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, v0, v1
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 24, v2
+; GFX6-NEXT:    v_bfrev_b32_e32 v9, -2
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 24, v5
 ; GFX6-NEXT:    v_max_i32_e32 v5, -1, v1
-; GFX6-NEXT:    v_subrev_i32_e32 v5, vcc, s4, v5
+; GFX6-NEXT:    v_sub_i32_e32 v5, vcc, v5, v9
 ; GFX6-NEXT:    v_min_i32_e32 v8, -1, v1
-; GFX6-NEXT:    v_subrev_i32_e32 v8, vcc, s5, v8
+; GFX6-NEXT:    v_sub_i32_e32 v8, vcc, v8, v11
 ; GFX6-NEXT:    v_max_i32_e32 v2, v5, v2
 ; GFX6-NEXT:    v_min_i32_e32 v2, v2, v8
 ; GFX6-NEXT:    v_sub_i32_e32 v1, vcc, v1, v2
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 24, v3
-; GFX6-NEXT:    v_bfrev_b32_e32 v9, -2
 ; GFX6-NEXT:    v_max_i32_e32 v5, -1, v2
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v3, 24, v6
 ; GFX6-NEXT:    v_sub_i32_e32 v5, vcc, v5, v9
 ; GFX6-NEXT:    v_min_i32_e32 v6, -1, v2
-; GFX6-NEXT:    v_subrev_i32_e32 v6, vcc, s5, v6
+; GFX6-NEXT:    v_sub_i32_e32 v6, vcc, v6, v11
 ; GFX6-NEXT:    v_max_i32_e32 v3, v5, v3
 ; GFX6-NEXT:    v_min_i32_e32 v3, v3, v6
 ; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, v2, v3
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v3, 24, v4
 ; GFX6-NEXT:    v_max_i32_e32 v5, -1, v3
-; GFX6-NEXT:    v_bfrev_b32_e32 v11, 1
 ; GFX6-NEXT:    v_ashrrev_i32_e32 v1, 24, v1
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v4, 24, v7
 ; GFX6-NEXT:    v_sub_i32_e32 v5, vcc, v5, v9
@@ -639,11 +635,11 @@ define i32 @v_ssubsat_v4i8(i32 %lhs.arg, i32 %rhs.arg) {
 ; GFX9-NEXT:    v_pk_sub_i16 v2, v2, v3 clamp
 ; GFX9-NEXT:    v_pk_sub_i16 v0, v0, v1 clamp
 ; GFX9-NEXT:    v_pk_ashrrev_i16 v1, 8, v2 op_sel_hi:[0,1]
-; GFX9-NEXT:    v_mov_b32_e32 v2, 8
+; GFX9-NEXT:    v_mov_b32_e32 v3, 8
 ; GFX9-NEXT:    v_pk_ashrrev_i16 v0, 8, v0 op_sel_hi:[0,1]
-; GFX9-NEXT:    s_movk_i32 s4, 0xff
-; GFX9-NEXT:    v_lshlrev_b32_sdwa v2, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX9-NEXT:    v_and_or_b32 v1, v1, s4, v2
+; GFX9-NEXT:    v_mov_b32_e32 v2, 0xff
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v3, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+; GFX9-NEXT:    v_and_or_b32 v1, v1, v2, v3
 ; GFX9-NEXT:    v_and_b32_e32 v2, 0xff, v0
 ; GFX9-NEXT:    v_mov_b32_e32 v3, 24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
@@ -677,7 +673,7 @@ define i32 @v_ssubsat_v4i8(i32 %lhs.arg, i32 %rhs.arg) {
 ; GFX10-NEXT:    v_lshlrev_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
 ; GFX10-NEXT:    v_and_b32_e32 v3, 0xff, v0
 ; GFX10-NEXT:    v_lshlrev_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX10-NEXT:    v_and_or_b32 v1, v2, 0xff, v1
+; GFX10-NEXT:    v_and_or_b32 v1, 0xff, v2, v1
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
 ; GFX10-NEXT:    v_or3_b32 v0, v1, v2, v0
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
@@ -709,7 +705,7 @@ define i32 @v_ssubsat_v4i8(i32 %lhs.arg, i32 %rhs.arg) {
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 24, v0
-; GFX11-NEXT:    v_and_or_b32 v1, v1, 0xff, v2
+; GFX11-NEXT:    v_and_or_b32 v1, 0xff, v1, v2
 ; GFX11-NEXT:    v_or3_b32 v0, v1, v3, v0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %lhs = bitcast i32 %lhs.arg to <4 x i8>
@@ -867,46 +863,46 @@ define amdgpu_ps i32 @s_ssubsat_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg) {
 ;
 ; GFX9-LABEL: s_ssubsat_v4i8:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_lshr_b32 s3, s0, 8
+; GFX9-NEXT:    s_lshr_b32 s2, s0, 8
+; GFX9-NEXT:    s_lshr_b32 s3, s0, 16
+; GFX9-NEXT:    s_lshr_b32 s4, s0, 24
+; GFX9-NEXT:    s_pack_ll_b32_b16 s0, s0, s2
+; GFX9-NEXT:    s_pack_ll_b32_b16 s2, s3, s4
 ; GFX9-NEXT:    s_lshr_b32 s4, s0, 16
-; GFX9-NEXT:    s_lshr_b32 s6, s0, 24
-; GFX9-NEXT:    s_pack_ll_b32_b16 s0, s0, s3
-; GFX9-NEXT:    s_pack_ll_b32_b16 s3, s4, s6
-; GFX9-NEXT:    s_lshr_b32 s6, s0, 16
 ; GFX9-NEXT:    s_lshl_b32 s0, s0, 0x80008
-; GFX9-NEXT:    s_lshl_b32 s6, s6, 8
-; GFX9-NEXT:    s_lshr_b32 s7, s1, 8
-; GFX9-NEXT:    s_pack_ll_b32_b16 s0, s0, s6
-; GFX9-NEXT:    s_lshr_b32 s6, s3, 16
-; GFX9-NEXT:    s_lshr_b32 s8, s1, 16
-; GFX9-NEXT:    s_lshr_b32 s9, s1, 24
-; GFX9-NEXT:    s_pack_ll_b32_b16 s1, s1, s7
-; GFX9-NEXT:    s_lshl_b32 s3, s3, 0x80008
-; GFX9-NEXT:    s_lshl_b32 s6, s6, 8
-; GFX9-NEXT:    s_pack_ll_b32_b16 s3, s3, s6
+; GFX9-NEXT:    s_lshl_b32 s4, s4, 8
+; GFX9-NEXT:    s_lshr_b32 s5, s1, 8
+; GFX9-NEXT:    s_pack_ll_b32_b16 s0, s0, s4
+; GFX9-NEXT:    s_lshr_b32 s4, s2, 16
 ; GFX9-NEXT:    s_lshr_b32 s6, s1, 16
-; GFX9-NEXT:    s_pack_ll_b32_b16 s4, s8, s9
+; GFX9-NEXT:    s_lshr_b32 s7, s1, 24
+; GFX9-NEXT:    s_pack_ll_b32_b16 s1, s1, s5
+; GFX9-NEXT:    s_lshl_b32 s2, s2, 0x80008
+; GFX9-NEXT:    s_lshl_b32 s4, s4, 8
+; GFX9-NEXT:    s_pack_ll_b32_b16 s2, s2, s4
+; GFX9-NEXT:    s_lshr_b32 s4, s1, 16
+; GFX9-NEXT:    s_pack_ll_b32_b16 s3, s6, s7
 ; GFX9-NEXT:    s_lshl_b32 s1, s1, 0x80008
-; GFX9-NEXT:    s_lshl_b32 s6, s6, 8
-; GFX9-NEXT:    s_pack_ll_b32_b16 s1, s1, s6
-; GFX9-NEXT:    s_lshr_b32 s6, s4, 16
-; GFX9-NEXT:    s_lshl_b32 s4, s4, 0x80008
-; GFX9-NEXT:    s_lshl_b32 s6, s6, 8
-; GFX9-NEXT:    s_pack_ll_b32_b16 s4, s4, s6
+; GFX9-NEXT:    s_lshl_b32 s4, s4, 8
+; GFX9-NEXT:    s_pack_ll_b32_b16 s1, s1, s4
+; GFX9-NEXT:    s_lshr_b32 s4, s3, 16
+; GFX9-NEXT:    s_lshl_b32 s3, s3, 0x80008
+; GFX9-NEXT:    s_lshl_b32 s4, s4, 8
+; GFX9-NEXT:    s_pack_ll_b32_b16 s3, s3, s4
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s1
 ; GFX9-NEXT:    v_pk_sub_i16 v0, s0, v0 clamp
-; GFX9-NEXT:    v_mov_b32_e32 v1, s4
-; GFX9-NEXT:    s_mov_b32 s2, 8
-; GFX9-NEXT:    v_pk_sub_i16 v1, s3, v1 clamp
+; GFX9-NEXT:    v_mov_b32_e32 v1, s3
+; GFX9-NEXT:    v_pk_sub_i16 v1, s2, v1 clamp
 ; GFX9-NEXT:    v_pk_ashrrev_i16 v0, 8, v0 op_sel_hi:[0,1]
+; GFX9-NEXT:    v_mov_b32_e32 v3, 8
 ; GFX9-NEXT:    v_pk_ashrrev_i16 v1, 8, v1 op_sel_hi:[0,1]
-; GFX9-NEXT:    s_movk_i32 s0, 0xff
-; GFX9-NEXT:    v_lshlrev_b32_sdwa v2, s2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX9-NEXT:    s_mov_b32 s5, 24
-; GFX9-NEXT:    v_and_or_b32 v0, v0, s0, v2
+; GFX9-NEXT:    v_mov_b32_e32 v2, 0xff
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v3, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+; GFX9-NEXT:    v_and_or_b32 v0, v0, v2, v3
 ; GFX9-NEXT:    v_and_b32_e32 v2, 0xff, v1
+; GFX9-NEXT:    v_mov_b32_e32 v3, 24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX9-NEXT:    v_lshlrev_b32_sdwa v1, s5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
 ; GFX9-NEXT:    v_or3_b32 v0, v0, v2, v1
 ; GFX9-NEXT:    v_readfirstlane_b32 s0, v0
 ; GFX9-NEXT:    ; return to shader part epilog
@@ -941,14 +937,14 @@ define amdgpu_ps i32 @s_ssubsat_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg) {
 ; GFX10-NEXT:    s_pack_ll_b32_b16 s3, s3, s5
 ; GFX10-NEXT:    v_pk_sub_i16 v0, s0, s1 clamp
 ; GFX10-NEXT:    v_pk_sub_i16 v1, s2, s3 clamp
-; GFX10-NEXT:    s_mov_b32 s0, 8
+; GFX10-NEXT:    v_mov_b32_e32 v2, 8
+; GFX10-NEXT:    v_mov_b32_e32 v4, 24
 ; GFX10-NEXT:    v_pk_ashrrev_i16 v0, 8, v0 op_sel_hi:[0,1]
 ; GFX10-NEXT:    v_pk_ashrrev_i16 v1, 8, v1 op_sel_hi:[0,1]
-; GFX10-NEXT:    v_lshlrev_b32_sdwa v2, s0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+; GFX10-NEXT:    v_lshlrev_b32_sdwa v2, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
 ; GFX10-NEXT:    v_and_b32_e32 v3, 0xff, v1
-; GFX10-NEXT:    s_mov_b32 s0, 24
-; GFX10-NEXT:    v_lshlrev_b32_sdwa v1, s0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX10-NEXT:    v_and_or_b32 v0, v0, 0xff, v2
+; GFX10-NEXT:    v_lshlrev_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+; GFX10-NEXT:    v_and_or_b32 v0, 0xff, v0, v2
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
 ; GFX10-NEXT:    v_or3_b32 v0, v0, v2, v1
 ; GFX10-NEXT:    v_readfirstlane_b32 s0, v0
@@ -988,7 +984,7 @@ define amdgpu_ps i32 @s_ssubsat_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg) {
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
 ; GFX11-NEXT:    v_and_b32_e32 v3, 0xff, v1
 ; GFX11-NEXT:    v_bfe_u32 v1, v1, 16, 8
-; GFX11-NEXT:    v_and_or_b32 v0, v0, 0xff, v2
+; GFX11-NEXT:    v_and_or_b32 v0, 0xff, v0, v2
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 24, v1
 ; GFX11-NEXT:    v_or3_b32 v0, v0, v2, v1
@@ -1265,19 +1261,17 @@ define <2 x i32> @v_ssubsat_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) {
 ; GFX6-LABEL: v_ssubsat_v2i32:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT:    s_brev_b32 s4, -2
 ; GFX6-NEXT:    v_max_i32_e32 v4, -1, v0
-; GFX6-NEXT:    s_brev_b32 s5, 1
-; GFX6-NEXT:    v_subrev_i32_e32 v4, vcc, s4, v4
+; GFX6-NEXT:    v_subrev_i32_e32 v4, vcc, 0x7fffffff, v4
 ; GFX6-NEXT:    v_min_i32_e32 v5, -1, v0
-; GFX6-NEXT:    v_subrev_i32_e32 v5, vcc, s5, v5
+; GFX6-NEXT:    v_subrev_i32_e32 v5, vcc, 0x80000000, v5
 ; GFX6-NEXT:    v_max_i32_e32 v2, v4, v2
 ; GFX6-NEXT:    v_min_i32_e32 v2, v2, v5
 ; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, v0, v2
 ; GFX6-NEXT:    v_max_i32_e32 v2, -1, v1
-; GFX6-NEXT:    v_subrev_i32_e32 v2, vcc, s4, v2
+; GFX6-NEXT:    v_subrev_i32_e32 v2, vcc, 0x7fffffff, v2
 ; GFX6-NEXT:    v_min_i32_e32 v4, -1, v1
-; GFX6-NEXT:    v_subrev_i32_e32 v4, vcc, s5, v4
+; GFX6-NEXT:    v_subrev_i32_e32 v4, vcc, 0x80000000, v4
 ; GFX6-NEXT:    v_max_i32_e32 v2, v2, v3
 ; GFX6-NEXT:    v_min_i32_e32 v2, v2, v4
 ; GFX6-NEXT:    v_sub_i32_e32 v1, vcc, v1, v2
@@ -1286,19 +1280,17 @@ define <2 x i32> @v_ssubsat_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) {
 ; GFX8-LABEL: v_ssubsat_v2i32:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    s_brev_b32 s4, -2
 ; GFX8-NEXT:    v_max_i32_e32 v4, -1, v0
-; GFX8-NEXT:    s_brev_b32 s5, 1
-; GFX8-NEXT:    v_subrev_u32_e32 v4, vcc, s4, v4
+; GFX8-NEXT:    v_subrev_u32_e32 v4, vcc, 0x7fffffff, v4
 ; GFX8-NEXT:    v_min_i32_e32 v5, -1, v0
-; GFX8-NEXT:    v_subrev_u32_e32 v5, vcc, s5, v5
+; GFX8-NEXT:    v_subrev_u32_e32 v5, vcc, 0x80000000, v5
 ; GFX8-NEXT:    v_max_i32_e32 v2, v4, v2
 ; GFX8-NEXT:    v_min_i32_e32 v2, v2, v5
 ; GFX8-NEXT:    v_sub_u32_e32 v0, vcc, v0, v2
 ; GFX8-NEXT:    v_max_i32_e32 v2, -1, v1
-; GFX8-NEXT:    v_subrev_u32_e32 v2, vcc, s4, v2
+; GFX8-NEXT:    v_subrev_u32_e32 v2, vcc, 0x7fffffff, v2
 ; GFX8-NEXT:    v_min_i32_e32 v4, -1, v1
-; GFX8-NEXT:    v_subrev_u32_e32 v4, vcc, s5, v4
+; GFX8-NEXT:    v_subrev_u32_e32 v4, vcc, 0x80000000, v4
 ; GFX8-NEXT:    v_max_i32_e32 v2, v2, v3
 ; GFX8-NEXT:    v_min_i32_e32 v2, v2, v4
 ; GFX8-NEXT:    v_sub_u32_e32 v1, vcc, v1, v2
@@ -1383,26 +1375,25 @@ define <3 x i32> @v_ssubsat_v3i32(<3 x i32> %lhs, <3 x i32> %rhs) {
 ; GFX6-LABEL: v_ssubsat_v3i32:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT:    s_brev_b32 s4, -2
 ; GFX6-NEXT:    v_max_i32_e32 v6, -1, v0
-; GFX6-NEXT:    s_brev_b32 s5, 1
-; GFX6-NEXT:    v_subrev_i32_e32 v6, vcc, s4, v6
-; GFX6-NEXT:    v_min_i32_e32 v7, -1, v0
-; GFX6-NEXT:    v_subrev_i32_e32 v7, vcc, s5, v7
+; GFX6-NEXT:    v_subrev_i32_e32 v6, vcc, 0x7fffffff, v6
+; GFX6-NEXT:    v_min_i32_e32 v8, -1, v0
+; GFX6-NEXT:    v_subrev_i32_e32 v8, vcc, 0x80000000, v8
 ; GFX6-NEXT:    v_max_i32_e32 v3, v6, v3
-; GFX6-NEXT:    v_min_i32_e32 v3, v3, v7
+; GFX6-NEXT:    v_min_i32_e32 v3, v3, v8
+; GFX6-NEXT:    v_bfrev_b32_e32 v7, -2
 ; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, v0, v3
 ; GFX6-NEXT:    v_max_i32_e32 v3, -1, v1
-; GFX6-NEXT:    v_subrev_i32_e32 v3, vcc, s4, v3
+; GFX6-NEXT:    v_sub_i32_e32 v3, vcc, v3, v7
 ; GFX6-NEXT:    v_min_i32_e32 v6, -1, v1
-; GFX6-NEXT:    v_subrev_i32_e32 v6, vcc, s5, v6
+; GFX6-NEXT:    v_subrev_i32_e32 v6, vcc, 0x80000000, v6
 ; GFX6-NEXT:    v_max_i32_e32 v3, v3, v4
 ; GFX6-NEXT:    v_min_i32_e32 v3, v3, v6
 ; GFX6-NEXT:    v_sub_i32_e32 v1, vcc, v1, v3
 ; GFX6-NEXT:    v_max_i32_e32 v3, -1, v2
-; GFX6-NEXT:    v_subrev_i32_e32 v3, vcc, s4, v3
+; GFX6-NEXT:    v_subrev_i32_e32 v3, vcc, 0x7fffffff, v3
 ; GFX6-NEXT:    v_min_i32_e32 v4, -1, v2
-; GFX6-NEXT:    v_subrev_i32_e32 v4, vcc, s5, v4
+; GFX6-NEXT:    v_subrev_i32_e32 v4, vcc, 0x80000000, v4
 ; GFX6-NEXT:    v_max_i32_e32 v3, v3, v5
 ; GFX6-NEXT:    v_min_i32_e32 v3, v3, v4
 ; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, v2, v3
@@ -1411,26 +1402,25 @@ define <3 x i32> @v_ssubsat_v3i32(<3 x i32> %lhs, <3 x i32> %rhs) {
 ; GFX8-LABEL: v_ssubsat_v3i32:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    s_brev_b32 s4, -2
 ; GFX8-NEXT:    v_max_i32_e32 v6, -1, v0
-; GFX8-NEXT:    s_brev_b32 s5, 1
-; GFX8-NEXT:    v_subrev_u32_e32 v6, vcc, s4, v6
-; GFX8-NEXT:    v_min_i32_e32 v7, -1, v0
-; GFX8-NEXT:    v_subrev_u32_e32 v7, vcc, s5, v7
+; GFX8-NEXT:    v_subrev_u32_e32 v6, vcc, 0x7fffffff, v6
+; GFX8-NEXT:    v_min_i32_e32 v8, -1, v0
+; GFX8-NEXT:    v_subrev_u32_e32 v8, vcc, 0x80000000, v8
 ; GFX8-NEXT:    v_max_i32_e32 v3, v6, v3
-; GFX8-NEXT:    v_min_i32_e32 v3, v3, v7
+; GFX8-NEXT:    v_min_i32_e32 v3, v3, v8
+; GFX8-NEXT:    v_bfrev_b32_e32 v7, -2
 ; GFX8-NEXT:    v_sub_u32_e32 v0, vcc, v0, v3
 ; GFX8-NEXT:    v_max_i32_e32 v3, -1, v1
-; GFX8-NEXT:    v_subrev_u32_e32 v3, vcc, s4, v3
+; GFX8-NEXT:    v_sub_u32_e32 v3, vcc, v3, v7
 ; GFX8-NEXT:    v_min_i32_e32 v6, -1, v1
-; GFX8-NEXT:    v_subrev_u32_e32 v6, vcc, s5, v6
+; GFX8-NEXT:    v_subrev_u32_e32 v6, vcc, 0x80000000, v6
 ; GFX8-NEXT:    v_max_i32_e32 v3, v3, v4
 ; GFX8-NEXT:    v_min_i32_e32 v3, v3, v6
 ; GFX8-NEXT:    v_sub_u32_e32 v1, vcc, v1, v3
 ; GFX8-NEXT:    v_max_i32_e32 v3, -1, v2
-; GFX8-NEXT:    v_subrev_u32_e32 v3, vcc, s4, v3
+; GFX8-NEXT:    v_subrev_u32_e32 v3, vcc, 0x7fffffff, v3
 ; GFX8-NEXT:    v_min_i32_e32 v4, -1, v2
-; GFX8-NEXT:    v_subrev_u32_e32 v4, vcc, s5, v4
+; GFX8-NEXT:    v_subrev_u32_e32 v4, vcc, 0x80000000, v4
 ; GFX8-NEXT:    v_max_i32_e32 v3, v3, v5
 ; GFX8-NEXT:    v_min_i32_e32 v3, v3, v4
 ; GFX8-NEXT:    v_sub_u32_e32 v2, vcc, v2, v3
@@ -1536,26 +1526,26 @@ define <4 x i32> @v_ssubsat_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) {
 ; GFX6-LABEL: v_ssubsat_v4i32:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT:    s_brev_b32 s4, -2
 ; GFX6-NEXT:    v_max_i32_e32 v8, -1, v0
-; GFX6-NEXT:    s_brev_b32 s5, 1
-; GFX6-NEXT:    v_subrev_i32_e32 v8, vcc, s4, v8
-; GFX6-NEXT:    v_min_i32_e32 v9, -1, v0
-; GFX6-NEXT:    v_subrev_i32_e32 v9, vcc, s5, v9
+; GFX6-NEXT:    v_subrev_i32_e32 v8, vcc, 0x7fffffff, v8
+; GFX6-NEXT:    v_min_i32_e32 v10, -1, v0
+; GFX6-NEXT:    v_bfrev_b32_e32 v11, 1
+; GFX6-NEXT:    v_sub_i32_e32 v10, vcc, v10, v11
 ; GFX6-NEXT:    v_max_i32_e32 v4, v8, v4
-; GFX6-NEXT:    v_min_i32_e32 v4, v4, v9
+; GFX6-NEXT:    v_min_i32_e32 v4, v4, v10
+; GFX6-NEXT:    v_bfrev_b32_e32 v9, -2
 ; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, v0, v4
 ; GFX6-NEXT:    v_max_i32_e32 v4, -1, v1
-; GFX6-NEXT:    v_subrev_i32_e32 v4, vcc, s4, v4
+; GFX6-NEXT:    v_sub_i32_e32 v4, vcc, v4, v9
 ; GFX6-NEXT:    v_min_i32_e32 v8, -1, v1
-; GFX6-NEXT:    v_subrev_i32_e32 v8, vcc, s5, v8
+; GFX6-NEXT:    v_subrev_i32_e32 v8, vcc, 0x80000000, v8
 ; GFX6-NEXT:    v_max_i32_e32 v4, v4, v5
 ; GFX6-NEXT:    v_min_i32_e32 v4, v4, v8
 ; GFX6-NEXT:    v_sub_i32_e32 v1, vcc, v1, v4
 ; GFX6-NEXT:    v_max_i32_e32 v4, -1, v2
-; GFX6-NEXT:    v_subrev_i32_e32 v4, vcc, s4, v4
+; GFX6-NEXT:    v_sub_i32_e32 v4, vcc, v4, v9
 ; GFX6-NEXT:    v_min_i32_e32 v5, -1, v2
-; GFX6-NEXT:    v_subrev_i32_e32 v5, vcc, s5, v5
+; GFX6-NEXT:    v_subrev_i32_e32 v5, vcc, 0x80000000, v5
 ; GFX6-NEXT:    v_max_i32_e32 v4, v4, v6
 ; GFX6-NEXT:    v_min_i32_e32 v4, v4, v5
 ; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, v2, v4
@@ -1571,26 +1561,26 @@ define <4 x i32> @v_ssubsat_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) {
 ; GFX8-LABEL: v_ssubsat_v4i32:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    s_brev_b32 s4, -2
 ; GFX8-NEXT:    v_max_i32_e32 v8, -1, v0
-; GFX8-NEXT:    s_brev_b32 s5, 1
-; GFX8-NEXT:    v_subrev_u32_e32 v8, vcc, s4, v8
-; GFX8-NEXT:    v_min_i32_e32 v9, -1, v0
-; GFX8-NEXT:    v_subrev_u32_e32 v9, vcc, s5, v9
+; GFX8-NEXT:    v_subrev_u32_e32 v8, vcc, 0x7fffffff, v8
+; GFX8-NEXT:    v_min_i32_e32 v10, -1, v0
+; GFX8-NEXT:    v_bfrev_b32_e32 v11, 1
+; GFX8-NEXT:    v_sub_u32_e32 v10, vcc, v10, v11
 ; GFX8-NEXT:    v_max_i32_e32 v4, v8, v4
-; GFX8-NEXT:    v_min_i32_e32 v4, v4, v9
+; GFX8-NEXT:    v_min_i32_e32 v4, v4, v10
+; GFX8-NEXT:    v_bfrev_b32_e32 v9, -2
 ; GFX8-NEXT:    v_sub_u32_e32 v0, vcc, v0, v4
 ; GFX8-NEXT:    v_max_i32_e32 v4, -1, v1
-; GFX8-NEXT:    v_subrev_u32_e32 v4, vcc, s4, v4
+; GFX8-NEXT:    v_sub_u32_e32 v4, vcc, v4, v9
 ; GFX8-NEXT:    v_min_i32_e32 v8, -1, v1
-; GFX8-NEXT:    v_subrev_u32_e32 v8, vcc, s5, v8
+; GFX8-NEXT:    v_subrev_u32_e32 v8, vcc, 0x80000000, v8
 ; GFX8-NEXT:    v_max_i32_e32 v4, v4, v5
 ; GFX8-NEXT:    v_min_i32_e32 v4, v4, v8
 ; GFX8-NEXT:    v_sub_u32_e32 v1, vcc, v1, v4
 ; GFX8-NEXT:    v_max_i32_e32 v4, -1, v2
-; GFX8-NEXT:    v_subrev_u32_e32 v4, vcc, s4, v4
+; GFX8-NEXT:    v_sub_u32_e32 v4, vcc, v4, v9
 ; GFX8-NEXT:    v_min_i32_e32 v5, -1, v2
-; GFX8-NEXT:    v_subrev_u32_e32 v5, vcc, s5, v5
+; GFX8-NEXT:    v_subrev_u32_e32 v5, vcc, 0x80000000, v5
 ; GFX8-NEXT:    v_max_i32_e32 v4, v4, v6
 ; GFX8-NEXT:    v_min_i32_e32 v4, v4, v5
 ; GFX8-NEXT:    v_sub_u32_e32 v2, vcc, v2, v4
@@ -1724,29 +1714,28 @@ define <5 x i32> @v_ssubsat_v5i32(<5 x i32> %lhs, <5 x i32> %rhs) {
 ; GFX6-LABEL: v_ssubsat_v5i32:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT:    s_brev_b32 s4, -2
 ; GFX6-NEXT:    v_max_i32_e32 v10, -1, v0
-; GFX6-NEXT:    s_brev_b32 s5, 1
-; GFX6-NEXT:    v_subrev_i32_e32 v10, vcc, s4, v10
+; GFX6-NEXT:    v_subrev_i32_e32 v10, vcc, 0x7fffffff, v10
 ; GFX6-NEXT:    v_min_i32_e32 v12, -1, v0
-; GFX6-NEXT:    v_subrev_i32_e32 v12, vcc, s5, v12
+; GFX6-NEXT:    v_bfrev_b32_e32 v13, 1
+; GFX6-NEXT:    v_sub_i32_e32 v12, vcc, v12, v13
 ; GFX6-NEXT:    v_max_i32_e32 v5, v10, v5
 ; GFX6-NEXT:    v_min_i32_e32 v5, v5, v12
+; GFX6-NEXT:    v_bfrev_b32_e32 v11, -2
 ; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, v0, v5
 ; GFX6-NEXT:    v_max_i32_e32 v5, -1, v1
-; GFX6-NEXT:    v_subrev_i32_e32 v5, vcc, s4, v5
+; GFX6-NEXT:    v_sub_i32_e32 v5, vcc, v5, v11
 ; GFX6-NEXT:    v_min_i32_e32 v10, -1, v1
-; GFX6-NEXT:    v_subrev_i32_e32 v10, vcc, s5, v10
+; GFX6-NEXT:    v_sub_i32_e32 v10, vcc, v10, v13
 ; GFX6-NEXT:    v_max_i32_e32 v5, v5, v6
 ; GFX6-NEXT:    v_min_i32_e32 v5, v5, v10
 ; GFX6-NEXT:    v_sub_i32_e32 v1, vcc, v1, v5
 ; GFX6-NEXT:    v_max_i32_e32 v5, -1, v2
-; GFX6-NEXT:    v_subrev_i32_e32 v5, vcc, s4, v5
+; GFX6-NEXT:    v_sub_i32_e32 v5, vcc, v5, v11
 ; GFX6-NEXT:    v_min_i32_e32 v6, -1, v2
-; GFX6-NEXT:    v_subrev_i32_e32 v6, vcc, s5, v6
+; GFX6-NEXT:    v_subrev_i32_e32 v6, vcc, 0x80000000, v6
 ; GFX6-NEXT:    v_max_i32_e32 v5, v5, v7
 ; GFX6-NEXT:    v_min_i32_e32 v5, v5, v6
-; GFX6-NEXT:    v_bfrev_b32_e32 v11, -2
 ; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, v2, v5
 ; GFX6-NEXT:    v_max_i32_e32 v5, -1, v3
 ; GFX6-NEXT:    v_sub_i32_e32 v5, vcc, v5, v11
@@ -1767,29 +1756,28 @@ define <5 x i32> @v_ssubsat_v5i32(<5 x i32> %lhs, <5 x i32> %rhs) {
 ; GFX8-LABEL: v_ssubsat_v5i32:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    s_brev_b32 s4, -2
 ; GFX8-NEXT:    v_max_i32_e32 v10, -1, v0
-; GFX8-NEXT:    s_brev_b32 s5, 1
-; GFX8-NEXT:    v_subrev_u32_e32 v10, vcc, s4, v10
+; GFX8-NEXT:    v_subrev_u32_e32 v10, vcc, 0x7fffffff, v10
 ; GFX8-NEXT:    v_min_i32_e32 v12, -1, v0
-; GFX8-NEXT:    v_subrev_u32_e32 v12, vcc, s5, v12
+; GFX8-NEXT:    v_bfrev_b32_e32 v13, 1
+; GFX8-NEXT:    v_sub_u32_e32 v12, vcc, v12, v13
 ; GFX8-NEXT:    v_max_i32_e32 v5, v10, v5
 ; GFX8-NEXT:    v_min_i32_e32 v5, v5, v12
+; GFX8-NEXT:    v_bfrev_b32_e32 v11, -2
 ; GFX8-NEXT:    v_sub_u32_e32 v0, vcc, v0, v5
 ; GFX8-NEXT:    v_max_i32_e32 v5, -1, v1
-; GFX8-NEXT:    v_subrev_u32_e32 v5, vcc, s4, v5
+; GFX8-NEXT:    v_sub_u32_e32 v5, vcc, v5, v11
 ; GFX8-NEXT:    v_min_i32_e32 v10, -1, v1
-; GFX8-NEXT:    v_subrev_u32_e32 v10, vcc, s5, v10
+; GFX8-NEXT:    v_sub_u32_e32 v10, vcc, v10, v13
 ; GFX8-NEXT:    v_max_i32_e32 v5, v5, v6
 ; GFX8-NEXT:    v_min_i32_e32 v5, v5, v10
 ; GFX8-NEXT:    v_sub_u32_e32 v1, vcc, v1, v5
 ; GFX8-NEXT:    v_max_i32_e32 v5, -1, v2
-; GFX8-NEXT:    v_subrev_u32_e32 v5, vcc, s4, v5
+; GFX8-NEXT:    v_sub_u32_e32 v5, vcc, v5, v11
 ; GFX8-NEXT:    v_min_i32_e32 v6, -1, v2
-; GFX8-NEXT:    v_subrev_u32_e32 v6, vcc, s5, v6
+; GFX8-NEXT:    v_subrev_u32_e32 v6, vcc, 0x80000000, v6
 ; GFX8-NEXT:    v_max_i32_e32 v5, v5, v7
 ; GFX8-NEXT:    v_min_i32_e32 v5, v5, v6
-; GFX8-NEXT:    v_bfrev_b32_e32 v11, -2
 ; GFX8-NEXT:    v_sub_u32_e32 v2, vcc, v2, v5
 ; GFX8-NEXT:    v_max_i32_e32 v5, -1, v3
 ; GFX8-NEXT:    v_sub_u32_e32 v5, vcc, v5, v11
@@ -1949,246 +1937,242 @@ define <16 x i32> @v_ssubsat_v16i32(<16 x i32> %lhs, <16 x i32> %rhs) {
 ; GFX6-LABEL: v_ssubsat_v16i32:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT:    s_brev_b32 s4, -2
-; GFX6-NEXT:    v_max_i32_e32 v31, -1, v0
-; GFX6-NEXT:    v_subrev_i32_e32 v31, vcc, s4, v31
-; GFX6-NEXT:    v_max_i32_e32 v16, v31, v16
-; GFX6-NEXT:    s_brev_b32 s5, 1
-; GFX6-NEXT:    v_min_i32_e32 v31, -1, v0
-; GFX6-NEXT:    v_subrev_i32_e32 v31, vcc, s5, v31
-; GFX6-NEXT:    v_min_i32_e32 v16, v16, v31
-; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, v0, v16
-; GFX6-NEXT:    v_max_i32_e32 v16, -1, v1
-; GFX6-NEXT:    v_subrev_i32_e32 v16, vcc, s4, v16
-; GFX6-NEXT:    v_max_i32_e32 v16, v16, v17
-; GFX6-NEXT:    v_min_i32_e32 v17, -1, v1
-; GFX6-NEXT:    v_subrev_i32_e32 v17, vcc, s5, v17
-; GFX6-NEXT:    v_min_i32_e32 v16, v16, v17
-; GFX6-NEXT:    v_sub_i32_e32 v1, vcc, v1, v16
-; GFX6-NEXT:    v_max_i32_e32 v16, -1, v2
-; GFX6-NEXT:    v_subrev_i32_e32 v16, vcc, s4, v16
-; GFX6-NEXT:    v_min_i32_e32 v17, -1, v2
-; GFX6-NEXT:    v_max_i32_e32 v16, v16, v18
-; GFX6-NEXT:    v_subrev_i32_e32 v17, vcc, s5, v17
-; GFX6-NEXT:    v_min_i32_e32 v16, v16, v17
-; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, v2, v16
-; GFX6-NEXT:    v_bfrev_b32_e32 v16, -2
+; GFX6-NEXT:    v_max_i32_e32 v32, -1, v0
+; GFX6-NEXT:    v_bfrev_b32_e32 v31, -2
+; GFX6-NEXT:    v_sub_i32_e32 v32, vcc, v32, v31
+; GFX6-NEXT:    v_max_i32_e32 v32, v32, v16
+; GFX6-NEXT:    v_min_i32_e32 v33, -1, v0
+; GFX6-NEXT:    v_bfrev_b32_e32 v16, 1
+; GFX6-NEXT:    v_sub_i32_e32 v33, vcc, v33, v16
+; GFX6-NEXT:    v_min_i32_e32 v32, v32, v33
+; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, v0, v32
+; GFX6-NEXT:    v_max_i32_e32 v32, -1, v1
+; GFX6-NEXT:    v_sub_i32_e32 v32, vcc, v32, v31
+; GFX6-NEXT:    v_max_i32_e32 v17, v32, v17
+; GFX6-NEXT:    v_min_i32_e32 v32, -1, v1
+; GFX6-NEXT:    v_sub_i32_e32 v32, vcc, v32, v16
+; GFX6-NEXT:    v_min_i32_e32 v17, v17, v32
+; GFX6-NEXT:    v_sub_i32_e32 v1, vcc, v1, v17
+; GFX6-NEXT:    v_max_i32_e32 v17, -1, v2
+; GFX6-NEXT:    v_sub_i32_e32 v17, vcc, v17, v31
+; GFX6-NEXT:    v_max_i32_e32 v17, v17, v18
+; GFX6-NEXT:    v_min_i32_e32 v18, -1, v2
+; GFX6-NEXT:    v_sub_i32_e32 v18, vcc, v18, v16
+; GFX6-NEXT:    v_min_i32_e32 v17, v17, v18
+; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, v2, v17
 ; GFX6-NEXT:    v_max_i32_e32 v17, -1, v3
-; GFX6-NEXT:    v_sub_i32_e32 v17, vcc, v17, v16
+; GFX6-NEXT:    v_sub_i32_e32 v17, vcc, v17, v31
+; GFX6-NEXT:    v_min_i32_e32 v18, -1, v3
 ; GFX6-NEXT:    v_max_i32_e32 v17, v17, v19
-; GFX6-NEXT:    v_bfrev_b32_e32 v18, 1
-; GFX6-NEXT:    v_min_i32_e32 v19, -1, v3
-; GFX6-NEXT:    v_sub_i32_e32 v19, vcc, v19, v18
-; GFX6-NEXT:    v_min_i32_e32 v17, v17, v19
+; GFX6-NEXT:    v_sub_i32_e32 v18, vcc, v18, v16
+; GFX6-NEXT:    v_min_i32_e32 v17, v17, v18
 ; GFX6-NEXT:    v_sub_i32_e32 v3, vcc, v3, v17
 ; GFX6-NEXT:    v_max_i32_e32 v17, -1, v4
-; GFX6-NEXT:    v_sub_i32_e32 v17, vcc, v17, v16
-; GFX6-NEXT:    v_min_i32_e32 v19, -1, v4
+; GFX6-NEXT:    v_sub_i32_e32 v17, vcc, v17, v31
+; GFX6-NEXT:    v_min_i32_e32 v18, -1, v4
 ; GFX6-NEXT:    v_max_i32_e32 v17, v17, v20
-; GFX6-NEXT:    v_sub_i32_e32 v19, vcc, v19, v18
-; GFX6-NEXT:    v_min_i32_e32 v17, v17, v19
+; GFX6-NEXT:    v_sub_i32_e32 v18, vcc, v18, v16
+; GFX6-NEXT:    v_min_i32_e32 v17, v17, v18
 ; GFX6-NEXT:    v_sub_i32_e32 v4, vcc, v4, v17
 ; GFX6-NEXT:    v_max_i32_e32 v17, -1, v5
-; GFX6-NEXT:    v_sub_i32_e32 v17, vcc, v17, v16
-; GFX6-NEXT:    v_min_i32_e32 v19, -1, v5
+; GFX6-NEXT:    v_sub_i32_e32 v17, vcc, v17, v31
+; GFX6-NEXT:    v_min_i32_e32 v18, -1, v5
 ; GFX6-NEXT:    v_max_i32_e32 v17, v17, v21
-; GFX6-NEXT:    v_sub_i32_e32 v19, vcc, v19, v18
-; GFX6-NEXT:    v_min_i32_e32 v17, v17, v19
+; GFX6-NEXT:    v_sub_i32_e32 v18, vcc, v18, v16
+; GFX6-NEXT:    v_min_i32_e32 v17, v17, v18
 ; GFX6-NEXT:    v_sub_i32_e32 v5, vcc, v5, v17
 ; GFX6-NEXT:    v_max_i32_e32 v17, -1, v6
-; GFX6-NEXT:    v_sub_i32_e32 v17, vcc, v17, v16
-; GFX6-NEXT:    v_min_i32_e32 v19, -1, v6
+; GFX6-NEXT:    v_sub_i32_e32 v17, vcc, v17, v31
+; GFX6-NEXT:    v_min_i32_e32 v18, -1, v6
 ; GFX6-NEXT:    v_max_i32_e32 v17, v17, v22
-; GFX6-NEXT:    v_sub_i32_e32 v19, vcc, v19, v18
-; GFX6-NEXT:    v_min_i32_e32 v17, v17, v19
-; GFX6-NEXT:    buffer_load_dword v19, off, s[0:3], s32
+; GFX6-NEXT:    v_sub_i32_e32 v18, vcc, v18, v16
+; GFX6-NEXT:    v_min_i32_e32 v17, v17, v18
+; GFX6-NEXT:    buffer_load_dword v18, off, s[0:3], s32
 ; GFX6-NEXT:    v_sub_i32_e32 v6, vcc, v6, v17
 ; GFX6-NEXT:    v_max_i32_e32 v17, -1, v7
-; GFX6-NEXT:    v_sub_i32_e32 v17, vcc, v17, v16
-; GFX6-NEXT:    v_min_i32_e32 v20, -1, v7
+; GFX6-NEXT:    v_sub_i32_e32 v17, vcc, v17, v31
+; GFX6-NEXT:    v_min_i32_e32 v19, -1, v7
 ; GFX6-NEXT:    v_max_i32_e32 v17, v17, v23
-; GFX6-NEXT:    v_sub_i32_e32 v20, vcc, v20, v18
-; GFX6-NEXT:    v_min_i32_e32 v17, v17, v20
+; GFX6-NEXT:    v_sub_i32_e32 v19, vcc, v19, v16
+; GFX6-NEXT:    v_min_i32_e32 v17, v17, v19
 ; GFX6-NEXT:    v_sub_i32_e32 v7, vcc, v7, v17
 ; GFX6-NEXT:    v_max_i32_e32 v17, -1, v8
-; GFX6-NEXT:    v_sub_i32_e32 v17, vcc, v17, v16
-; GFX6-NEXT:    v_min_i32_e32 v20, -1, v8
-; GFX6-NEXT:    v_sub_i32_e32 v20, vcc, v20, v18
+; GFX6-NEXT:    v_sub_i32_e32 v17, vcc, v17, v31
+; GFX6-NEXT:    v_min_i32_e32 v19, -1, v8
+; GFX6-NEXT:    v_sub_i32_e32 v19, vcc, v19, v16
 ; GFX6-NEXT:    v_max_i32_e32 v17, v17, v24
-; GFX6-NEXT:    v_min_i32_e32 v17, v17, v20
+; GFX6-NEXT:    v_min_i32_e32 v17, v17, v19
 ; GFX6-NEXT:    v_sub_i32_e32 v8, vcc, v8, v17
 ; GFX6-NEXT:    v_max_i32_e32 v17, -1, v9
-; GFX6-NEXT:    v_sub_i32_e32 v17, vcc, v17, v16
-; GFX6-NEXT:    v_min_i32_e32 v20, -1, v9
-; GFX6-NEXT:    v_sub_i32_e32 v20, vcc, v20, v18
+; GFX6-NEXT:    v_sub_i32_e32 v17, vcc, v17, v31
+; GFX6-NEXT:    v_min_i32_e32 v19, -1, v9
+; GFX6-NEXT:    v_sub_i32_e32 v19, vcc, v19, v16
 ; GFX6-NEXT:    v_max_i32_e32 v17, v17, v25
-; GFX6-NEXT:    v_min_i32_e32 v17, v17, v20
+; GFX6-NEXT:    v_min_i32_e32 v17, v17, v19
 ; GFX6-NEXT:    v_sub_i32_e32 v9, vcc, v9, v17
 ; GFX6-NEXT:    v_max_i32_e32 v17, -1, v10
-; GFX6-NEXT:    v_sub_i32_e32 v17, vcc, v17, v16
-; GFX6-NEXT:    v_min_i32_e32 v20, -1, v10
-; GFX6-NEXT:    v_sub_i32_e32 v20, vcc, v20, v18
+; GFX6-NEXT:    v_sub_i32_e32 v17, vcc, v17, v31
+; GFX6-NEXT:    v_min_i32_e32 v19, -1, v10
+; GFX6-NEXT:    v_sub_i32_e32 v19, vcc, v19, v16
 ; GFX6-NEXT:    v_max_i32_e32 v17, v17, v26
-; GFX6-NEXT:    v_min_i32_e32 v17, v17, v20
+; GFX6-NEXT:    v_min_i32_e32 v17, v17, v19
 ; GFX6-NEXT:    v_sub_i32_e32 v10, vcc, v10, v17
 ; GFX6-NEXT:    v_max_i32_e32 v17, -1, v11
-; GFX6-NEXT:    v_sub_i32_e32 v17, vcc, v17, v16
-; GFX6-NEXT:    v_min_i32_e32 v20, -1, v11
-; GFX6-NEXT:    v_sub_i32_e32 v20, vcc, v20, v18
+; GFX6-NEXT:    v_sub_i32_e32 v17, vcc, v17, v31
+; GFX6-NEXT:    v_min_i32_e32 v19, -1, v11
+; GFX6-NEXT:    v_sub_i32_e32 v19, vcc, v19, v16
 ; GFX6-NEXT:    v_max_i32_e32 v17, v17, v27
-; GFX6-NEXT:    v_min_i32_e32 v17, v17, v20
+; GFX6-NEXT:    v_min_i32_e32 v17, v17, v19
 ; GFX6-NEXT:    v_sub_i32_e32 v11, vcc, v11, v17
 ; GFX6-NEXT:    v_max_i32_e32 v17, -1, v12
-; GFX6-NEXT:    v_sub_i32_e32 v17, vcc, v17, v16
-; GFX6-NEXT:    v_min_i32_e32 v20, -1, v12
-; GFX6-NEXT:    v_sub_i32_e32 v20, vcc, v20, v18
+; GFX6-NEXT:    v_sub_i32_e32 v17, vcc, v17, v31
+; GFX6-NEXT:    v_min_i32_e32 v19, -1, v12
+; GFX6-NEXT:    v_sub_i32_e32 v19, vcc, v19, v16
 ; GFX6-NEXT:    v_max_i32_e32 v17, v17, v28
-; GFX6-NEXT:    v_min_i32_e32 v17, v17, v20
+; GFX6-NEXT:    v_min_i32_e32 v17, v17, v19
 ; GFX6-NEXT:    v_sub_i32_e32 v12, vcc, v12, v17
 ; GFX6-NEXT:    v_max_i32_e32 v17, -1, v13
-; GFX6-NEXT:    v_sub_i32_e32 v17, vcc, v17, v16
-; GFX6-NEXT:    v_min_i32_e32 v20, -1, v13
-; GFX6-NEXT:    v_sub_i32_e32 v20, vcc, v20, v18
+; GFX6-NEXT:    v_sub_i32_e32 v17, vcc, v17, v31
+; GFX6-NEXT:    v_min_i32_e32 v19, -1, v13
+; GFX6-NEXT:    v_sub_i32_e32 v19, vcc, v19, v16
 ; GFX6-NEXT:    v_max_i32_e32 v17, v17, v29
-; GFX6-NEXT:    v_min_i32_e32 v17, v17, v20
+; GFX6-NEXT:    v_min_i32_e32 v17, v17, v19
 ; GFX6-NEXT:    v_sub_i32_e32 v13, vcc, v13, v17
 ; GFX6-NEXT:    v_max_i32_e32 v17, -1, v14
-; GFX6-NEXT:    v_sub_i32_e32 v17, vcc, v17, v16
-; GFX6-NEXT:    v_min_i32_e32 v20, -1, v14
-; GFX6-NEXT:    v_sub_i32_e32 v20, vcc, v20, v18
+; GFX6-NEXT:    v_sub_i32_e32 v17, vcc, v17, v31
+; GFX6-NEXT:    v_min_i32_e32 v19, -1, v14
+; GFX6-NEXT:    v_sub_i32_e32 v19, vcc, v19, v16
 ; GFX6-NEXT:    v_max_i32_e32 v17, v17, v30
-; GFX6-NEXT:    v_min_i32_e32 v17, v17, v20
+; GFX6-NEXT:    v_min_i32_e32 v17, v17, v19
 ; GFX6-NEXT:    v_sub_i32_e32 v14, vcc, v14, v17
 ; GFX6-NEXT:    v_max_i32_e32 v17, -1, v15
-; GFX6-NEXT:    v_sub_i32_e32 v16, vcc, v17, v16
-; GFX6-NEXT:    v_min_i32_e32 v17, -1, v15
-; GFX6-NEXT:    v_sub_i32_e32 v17, vcc, v17, v18
+; GFX6-NEXT:    v_sub_i32_e32 v17, vcc, v17, v31
+; GFX6-NEXT:    v_min_i32_e32 v19, -1, v15
+; GFX6-NEXT:    v_sub_i32_e32 v16, vcc, v19, v16
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
-; GFX6-NEXT:    v_max_i32_e32 v16, v16, v19
-; GFX6-NEXT:    v_min_i32_e32 v16, v16, v17
+; GFX6-NEXT:    v_max_i32_e32 v17, v17, v18
+; GFX6-NEXT:    v_min_i32_e32 v16, v17, v16
 ; GFX6-NEXT:    v_sub_i32_e32 v15, vcc, v15, v16
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: v_ssubsat_v16i32:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    s_brev_b32 s4, -2
-; GFX8-NEXT:    v_max_i32_e32 v31, -1, v0
-; GFX8-NEXT:    v_subrev_u32_e32 v31, vcc, s4, v31
-; GFX8-NEXT:    v_max_i32_e32 v16, v31, v16
-; GFX8-NEXT:    s_brev_b32 s5, 1
-; GFX8-NEXT:    v_min_i32_e32 v31, -1, v0
-; GFX8-NEXT:    v_subrev_u32_e32 v31, vcc, s5, v31
-; GFX8-NEXT:    v_min_i32_e32 v16, v16, v31
-; GFX8-NEXT:    v_sub_u32_e32 v0, vcc, v0, v16
-; GFX8-NEXT:    v_max_i32_e32 v16, -1, v1
-; GFX8-NEXT:    v_subrev_u32_e32 v16, vcc, s4, v16
-; GFX8-NEXT:    v_max_i32_e32 v16, v16, v17
-; GFX8-NEXT:    v_min_i32_e32 v17, -1, v1
-; GFX8-NEXT:    v_subrev_u32_e32 v17, vcc, s5, v17
-; GFX8-NEXT:    v_min_i32_e32 v16, v16, v17
-; GFX8-NEXT:    v_sub_u32_e32 v1, vcc, v1, v16
-; GFX8-NEXT:    v_max_i32_e32 v16, -1, v2
-; GFX8-NEXT:    v_subrev_u32_e32 v16, vcc, s4, v16
-; GFX8-NEXT:    v_min_i32_e32 v17, -1, v2
-; GFX8-NEXT:    v_max_i32_e32 v16, v16, v18
-; GFX8-NEXT:    v_subrev_u32_e32 v17, vcc, s5, v17
-; GFX8-NEXT:    v_min_i32_e32 v16, v16, v17
-; GFX8-NEXT:    v_sub_u32_e32 v2, vcc, v2, v16
-; GFX8-NEXT:    v_bfrev_b32_e32 v16, -2
+; GFX8-NEXT:    v_max_i32_e32 v32, -1, v0
+; GFX8-NEXT:    v_bfrev_b32_e32 v31, -2
+; GFX8-NEXT:    v_sub_u32_e32 v32, vcc, v32, v31
+; GFX8-NEXT:    v_max_i32_e32 v32, v32, v16
+; GFX8-NEXT:    v_min_i32_e32 v33, -1, v0
+; GFX8-NEXT:    v_bfrev_b32_e32 v16, 1
+; GFX8-NEXT:    v_sub_u32_e32 v33, vcc, v33, v16
+; GFX8-NEXT:    v_min_i32_e32 v32, v32, v33
+; GFX8-NEXT:    v_sub_u32_e32 v0, vcc, v0, v32
+; GFX8-NEXT:    v_max_i32_e32 v32, -1, v1
+; GFX8-NEXT:    v_sub_u32_e32 v32, vcc, v32, v31
+; GFX8-NEXT:    v_max_i32_e32 v17, v32, v17
+; GFX8-NEXT:    v_min_i32_e32 v32, -1, v1
+; GFX8-NEXT:    v_sub_u32_e32 v32, vcc, v32, v16
+; GFX8-NEXT:    v_min_i32_e32 v17, v17, v32
+; GFX8-NEXT:    v_sub_u32_e32 v1, vcc, v1, v17
+; GFX8-NEXT:    v_max_i32_e32 v17, -1, v2
+; GFX8-NEXT:    v_sub_u32_e32 v17, vcc, v17, v31
+; GFX8-NEXT:    v_max_i32_e32 v17, v17, v18
+; GFX8-NEXT:    v_min_i32_e32 v18, -1, v2
+; GFX8-NEXT:    v_sub_u32_e32 v18, vcc, v18, v16
+; GFX8-NEXT:    v_min_i32_e32 v17, v17, v18
+; GFX8-NEXT:    v_sub_u32_e32 v2, vcc, v2, v17
 ; GFX8-NEXT:    v_max_i32_e32 v17, -1, v3
-; GFX8-NEXT:    v_sub_u32_e32 v17, vcc, v17, v16
+; GFX8-NEXT:    v_sub_u32_e32 v17, vcc, v17, v31
+; GFX8-NEXT:    v_min_i32_e32 v18, -1, v3
 ; GFX8-NEXT:    v_max_i32_e32 v17, v17, v19
-; GFX8-NEXT:    v_bfrev_b32_e32 v18, 1
-; GFX8-NEXT:    v_min_i32_e32 v19, -1, v3
-; GFX8-NEXT:    v_sub_u32_e32 v19, vcc, v19, v18
-; GFX8-NEXT:    v_min_i32_e32 v17, v17, v19
+; GFX8-NEXT:    v_sub_u32_e32 v18, vcc, v18, v16
+; GFX8-NEXT:    v_min_i32_e32 v17, v17, v18
 ; GFX8-NEXT:    v_sub_u32_e32 v3, vcc, v3, v17
 ; GFX8-NEXT:    v_max_i32_e32 v17, -1, v4
-; GFX8-NEXT:    v_sub_u32_e32 v17, vcc, v17, v16
-; GFX8-NEXT:    v_min_i32_e32 v19, -1, v4
+; GFX8-NEXT:    v_sub_u32_e32 v17, vcc, v17, v31
+; GFX8-NEXT:    v_min_i32_e32 v18, -1, v4
 ; GFX8-NEXT:    v_max_i32_e32 v17, v17, v20
-; GFX8-NEXT:    v_sub_u32_e32 v19, vcc, v19, v18
-; GFX8-NEXT:    v_min_i32_e32 v17, v17, v19
+; GFX8-NEXT:    v_sub_u32_e32 v18, vcc, v18, v16
+; GFX8-NEXT:    v_min_i32_e32 v17, v17, v18
 ; GFX8-NEXT:    v_sub_u32_e32 v4, vcc, v4, v17
 ; GFX8-NEXT:    v_max_i32_e32 v17, -1, v5
-; GFX8-NEXT:    v_sub_u32_e32 v17, vcc, v17, v16
-; GFX8-NEXT:    v_min_i32_e32 v19, -1, v5
+; GFX8-NEXT:    v_sub_u32_e32 v17, vcc, v17, v31
+; GFX8-NEXT:    v_min_i32_e32 v18, -1, v5
 ; GFX8-NEXT:    v_max_i32_e32 v17, v17, v21
-; GFX8-NEXT:    v_sub_u32_e32 v19, vcc, v19, v18
-; GFX8-NEXT:    v_min_i32_e32 v17, v17, v19
+; GFX8-NEXT:    v_sub_u32_e32 v18, vcc, v18, v16
+; GFX8-NEXT:    v_min_i32_e32 v17, v17, v18
 ; GFX8-NEXT:    v_sub_u32_e32 v5, vcc, v5, v17
 ; GFX8-NEXT:    v_max_i32_e32 v17, -1, v6
-; GFX8-NEXT:    v_sub_u32_e32 v17, vcc, v17, v16
-; GFX8-NEXT:    v_min_i32_e32 v19, -1, v6
+; GFX8-NEXT:    v_sub_u32_e32 v17, vcc, v17, v31
+; GFX8-NEXT:    v_min_i32_e32 v18, -1, v6
 ; GFX8-NEXT:    v_max_i32_e32 v17, v17, v22
-; GFX8-NEXT:    v_sub_u32_e32 v19, vcc, v19, v18
-; GFX8-NEXT:    v_min_i32_e32 v17, v17, v19
-; GFX8-NEXT:    buffer_load_dword v19, off, s[0:3], s32
+; GFX8-NEXT:    v_sub_u32_e32 v18, vcc, v18, v16
+; GFX8-NEXT:    v_min_i32_e32 v17, v17, v18
+; GFX8-NEXT:    buffer_load_dword v18, off, s[0:3], s32
 ; GFX8-NEXT:    v_sub_u32_e32 v6, vcc, v6, v17
 ; GFX8-NEXT:    v_max_i32_e32 v17, -1, v7
-; GFX8-NEXT:    v_sub_u32_e32 v17, vcc, v17, v16
-; GFX8-NEXT:    v_min_i32_e32 v20, -1, v7
+; GFX8-NEXT:    v_sub_u32_e32 v17, vcc, v17, v31
+; GFX8-NEXT:    v_min_i32_e32 v19, -1, v7
 ; GFX8-NEXT:    v_max_i32_e32 v17, v17, v23
-; GFX8-NEXT:    v_sub_u32_e32 v20, vcc, v20, v18
-; GFX8-NEXT:    v_min_i32_e32 v17, v17, v20
+; GFX8-NEXT:    v_sub_u32_e32 v19, vcc, v19, v16
+; GFX8-NEXT:    v_min_i32_e32 v17, v17, v19
 ; GFX8-NEXT:    v_sub_u32_e32 v7, vcc, v7, v17
 ; GFX8-NEXT:    v_max_i32_e32 v17, -1, v8
-; GFX8-NEXT:    v_sub_u32_e32 v17, vcc, v17, v16
-; GFX8-NEXT:    v_min_i32_e32 v20, -1, v8
-; GFX8-NEXT:    v_sub_u32_e32 v20, vcc, v20, v18
+; GFX8-NEXT:    v_sub_u32_e32 v17, vcc, v17, v31
+; GFX8-NEXT:    v_min_i32_e32 v19, -1, v8
+; GFX8-NEXT:    v_sub_u32_e32 v19, vcc, v19, v16
 ; GFX8-NEXT:    v_max_i32_e32 v17, v17, v24
-; GFX8-NEXT:    v_min_i32_e32 v17, v17, v20
+; GFX8-NEXT:    v_min_i32_e32 v17, v17, v19
 ; GFX8-NEXT:    v_sub_u32_e32 v8, vcc, v8, v17
 ; GFX8-NEXT:    v_max_i32_e32 v17, -1, v9
-; GFX8-NEXT:    v_sub_u32_e32 v17, vcc, v17, v16
-; GFX8-NEXT:    v_min_i32_e32 v20, -1, v9
-; GFX8-NEXT:    v_sub_u32_e32 v20, vcc, v20, v18
+; GFX8-NEXT:    v_sub_u32_e32 v17, vcc, v17, v31
+; GFX8-NEXT:    v_min_i32_e32 v19, -1, v9
+; GFX8-NEXT:    v_sub_u32_e32 v19, vcc, v19, v16
 ; GFX8-NEXT:    v_max_i32_e32 v17, v17, v25
-; GFX8-NEXT:    v_min_i32_e32 v17, v17, v20
+; GFX8-NEXT:    v_min_i32_e32 v17, v17, v19
 ; GFX8-NEXT:    v_sub_u32_e32 v9, vcc, v9, v17
 ; GFX8-NEXT:    v_max_i32_e32 v17, -1, v10
-; GFX8-NEXT:    v_sub_u32_e32 v17, vcc, v17, v16
-; GFX8-NEXT:    v_min_i32_e32 v20, -1, v10
-; GFX8-NEXT:    v_sub_u32_e32 v20, vcc, v20, v18
+; GFX8-NEXT:    v_sub_u32_e32 v17, vcc, v17, v31
+; GFX8-NEXT:    v_min_i32_e32 v19, -1, v10
+; GFX8-NEXT:    v_sub_u32_e32 v19, vcc, v19, v16
 ; GFX8-NEXT:    v_max_i32_e32 v17, v17, v26
-; GFX8-NEXT:    v_min_i32_e32 v17, v17, v20
+; GFX8-NEXT:    v_min_i32_e32 v17, v17, v19
 ; GFX8-NEXT:    v_sub_u32_e32 v10, vcc, v10, v17
 ; GFX8-NEXT:    v_max_i32_e32 v17, -1, v11
-; GFX8-NEXT:    v_sub_u32_e32 v17, vcc, v17, v16
-; GFX8-NEXT:    v_min_i32_e32 v20, -1, v11
-; GFX8-NEXT:    v_sub_u32_e32 v20, vcc, v20, v18
+; GFX8-NEXT:    v_sub_u32_e32 v17, vcc, v17, v31
+; GFX8-NEXT:    v_min_i32_e32 v19, -1, v11
+; GFX8-NEXT:    v_sub_u32_e32 v19, vcc, v19, v16
 ; GFX8-NEXT:    v_max_i32_e32 v17, v17, v27
-; GFX8-NEXT:    v_min_i32_e32 v17, v17, v20
+; GFX8-NEXT:    v_min_i32_e32 v17, v17, v19
 ; GFX8-NEXT:    v_sub_u32_e32 v11, vcc, v11, v17
 ; GFX8-NEXT:    v_max_i32_e32 v17, -1, v12
-; GFX8-NEXT:    v_sub_u32_e32 v17, vcc, v17, v16
-; GFX8-NEXT:    v_min_i32_e32 v20, -1, v12
-; GFX8-NEXT:    v_sub_u32_e32 v20, vcc, v20, v18
+; GFX8-NEXT:    v_sub_u32_e32 v17, vcc, v17, v31
+; GFX8-NEXT:    v_min_i32_e32 v19, -1, v12
+; GFX8-NEXT:    v_sub_u32_e32 v19, vcc, v19, v16
 ; GFX8-NEXT:    v_max_i32_e32 v17, v17, v28
-; GFX8-NEXT:    v_min_i32_e32 v17, v17, v20
+; GFX8-NEXT:    v_min_i32_e32 v17, v17, v19
 ; GFX8-NEXT:    v_sub_u32_e32 v12, vcc, v12, v17
 ; GFX8-NEXT:    v_max_i32_e32 v17, -1, v13
-; GFX8-NEXT:    v_sub_u32_e32 v17, vcc, v17, v16
-; GFX8-NEXT:    v_min_i32_e32 v20, -1, v13
-; GFX8-NEXT:    v_sub_u32_e32 v20, vcc, v20, v18
+; GFX8-NEXT:    v_sub_u32_e32 v17, vcc, v17, v31
+; GFX8-NEXT:    v_min_i32_e32 v19, -1, v13
+; GFX8-NEXT:    v_sub_u32_e32 v19, vcc, v19, v16
 ; GFX8-NEXT:    v_max_i32_e32 v17, v17, v29
-; GFX8-NEXT:    v_min_i32_e32 v17, v17, v20
+; GFX8-NEXT:    v_min_i32_e32 v17, v17, v19
 ; GFX8-NEXT:    v_sub_u32_e32 v13, vcc, v13, v17
 ; GFX8-NEXT:    v_max_i32_e32 v17, -1, v14
-; GFX8-NEXT:    v_sub_u32_e32 v17, vcc, v17, v16
-; GFX8-NEXT:    v_min_i32_e32 v20, -1, v14
-; GFX8-NEXT:    v_sub_u32_e32 v20, vcc, v20, v18
+; GFX8-NEXT:    v_sub_u32_e32 v17, vcc, v17, v31
+; GFX8-NEXT:    v_min_i32_e32 v19, -1, v14
+; GFX8-NEXT:    v_sub_u32_e32 v19, vcc, v19, v16
 ; GFX8-NEXT:    v_max_i32_e32 v17, v17, v30
-; GFX8-NEXT:    v_min_i32_e32 v17, v17, v20
+; GFX8-NEXT:    v_min_i32_e32 v17, v17, v19
 ; GFX8-NEXT:    v_sub_u32_e32 v14, vcc, v14, v17
 ; GFX8-NEXT:    v_max_i32_e32 v17, -1, v15
-; GFX8-NEXT:    v_sub_u32_e32 v16, vcc, v17, v16
-; GFX8-NEXT:    v_min_i32_e32 v17, -1, v15
-; GFX8-NEXT:    v_sub_u32_e32 v17, vcc, v17, v18
+; GFX8-NEXT:    v_sub_u32_e32 v17, vcc, v17, v31
+; GFX8-NEXT:    v_min_i32_e32 v19, -1, v15
+; GFX8-NEXT:    v_sub_u32_e32 v16, vcc, v19, v16
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    v_max_i32_e32 v16, v16, v19
-; GFX8-NEXT:    v_min_i32_e32 v16, v16, v17
+; GFX8-NEXT:    v_max_i32_e32 v17, v17, v18
+; GFX8-NEXT:    v_min_i32_e32 v16, v17, v16
 ; GFX8-NEXT:    v_sub_u32_e32 v15, vcc, v15, v16
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -2766,22 +2750,20 @@ define <2 x i16> @v_ssubsat_v2i16(<2 x i16> %lhs, <2 x i16> %rhs) {
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX6-NEXT:    s_brev_b32 s4, -2
 ; GFX6-NEXT:    v_max_i32_e32 v4, -1, v0
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX6-NEXT:    s_brev_b32 s5, 1
-; GFX6-NEXT:    v_subrev_i32_e32 v4, vcc, s4, v4
+; GFX6-NEXT:    v_subrev_i32_e32 v4, vcc, 0x7fffffff, v4
 ; GFX6-NEXT:    v_min_i32_e32 v5, -1, v0
-; GFX6-NEXT:    v_subrev_i32_e32 v5, vcc, s5, v5
+; GFX6-NEXT:    v_subrev_i32_e32 v5, vcc, 0x80000000, v5
 ; GFX6-NEXT:    v_max_i32_e32 v2, v4, v2
 ; GFX6-NEXT:    v_min_i32_e32 v2, v2, v5
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, v0, v2
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
 ; GFX6-NEXT:    v_max_i32_e32 v3, -1, v1
-; GFX6-NEXT:    v_subrev_i32_e32 v3, vcc, s4, v3
+; GFX6-NEXT:    v_subrev_i32_e32 v3, vcc, 0x7fffffff, v3
 ; GFX6-NEXT:    v_min_i32_e32 v4, -1, v1
-; GFX6-NEXT:    v_subrev_i32_e32 v4, vcc, s5, v4
+; GFX6-NEXT:    v_subrev_i32_e32 v4, vcc, 0x80000000, v4
 ; GFX6-NEXT:    v_max_i32_e32 v2, v3, v2
 ; GFX6-NEXT:    v_min_i32_e32 v2, v2, v4
 ; GFX6-NEXT:    v_sub_i32_e32 v1, vcc, v1, v2
@@ -2978,22 +2960,20 @@ define amdgpu_ps float @ssubsat_v2i16_vs(<2 x i16> %lhs, <2 x i16> inreg %rhs) {
 ; GFX6-LABEL: ssubsat_v2i16_vs:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX6-NEXT:    s_brev_b32 s2, -2
 ; GFX6-NEXT:    v_max_i32_e32 v2, -1, v0
 ; GFX6-NEXT:    s_lshl_b32 s0, s0, 16
-; GFX6-NEXT:    s_brev_b32 s3, 1
-; GFX6-NEXT:    v_subrev_i32_e32 v2, vcc, s2, v2
+; GFX6-NEXT:    v_subrev_i32_e32 v2, vcc, 0x7fffffff, v2
 ; GFX6-NEXT:    v_min_i32_e32 v3, -1, v0
-; GFX6-NEXT:    v_subrev_i32_e32 v3, vcc, s3, v3
+; GFX6-NEXT:    v_subrev_i32_e32 v3, vcc, 0x80000000, v3
 ; GFX6-NEXT:    v_max_i32_e32 v2, s0, v2
 ; GFX6-NEXT:    v_min_i32_e32 v2, v2, v3
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, v0, v2
 ; GFX6-NEXT:    v_max_i32_e32 v2, -1, v1
 ; GFX6-NEXT:    s_lshl_b32 s0, s1, 16
-; GFX6-NEXT:    v_subrev_i32_e32 v2, vcc, s2, v2
+; GFX6-NEXT:    v_subrev_i32_e32 v2, vcc, 0x7fffffff, v2
 ; GFX6-NEXT:    v_min_i32_e32 v3, -1, v1
-; GFX6-NEXT:    v_subrev_i32_e32 v3, vcc, s3, v3
+; GFX6-NEXT:    v_subrev_i32_e32 v3, vcc, 0x80000000, v3
 ; GFX6-NEXT:    v_max_i32_e32 v2, s0, v2
 ; GFX6-NEXT:    v_min_i32_e32 v2, v2, v3
 ; GFX6-NEXT:    v_sub_i32_e32 v1, vcc, v1, v2
@@ -3056,37 +3036,35 @@ define <2 x float> @v_ssubsat_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) {
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX6-NEXT:    s_brev_b32 s4, -2
 ; GFX6-NEXT:    v_max_i32_e32 v8, -1, v0
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
-; GFX6-NEXT:    s_brev_b32 s5, 1
-; GFX6-NEXT:    v_subrev_i32_e32 v8, vcc, s4, v8
+; GFX6-NEXT:    v_subrev_i32_e32 v8, vcc, 0x7fffffff, v8
 ; GFX6-NEXT:    v_min_i32_e32 v10, -1, v0
-; GFX6-NEXT:    v_subrev_i32_e32 v10, vcc, s5, v10
+; GFX6-NEXT:    v_bfrev_b32_e32 v11, 1
+; GFX6-NEXT:    v_sub_i32_e32 v10, vcc, v10, v11
 ; GFX6-NEXT:    v_max_i32_e32 v4, v8, v4
 ; GFX6-NEXT:    v_min_i32_e32 v4, v4, v10
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX6-NEXT:    v_bfrev_b32_e32 v9, -2
 ; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, v0, v4
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v4, 16, v5
 ; GFX6-NEXT:    v_max_i32_e32 v5, -1, v1
-; GFX6-NEXT:    v_subrev_i32_e32 v5, vcc, s4, v5
+; GFX6-NEXT:    v_sub_i32_e32 v5, vcc, v5, v9
 ; GFX6-NEXT:    v_min_i32_e32 v8, -1, v1
-; GFX6-NEXT:    v_subrev_i32_e32 v8, vcc, s5, v8
+; GFX6-NEXT:    v_sub_i32_e32 v8, vcc, v8, v11
 ; GFX6-NEXT:    v_max_i32_e32 v4, v5, v4
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX6-NEXT:    v_bfrev_b32_e32 v9, -2
 ; GFX6-NEXT:    v_min_i32_e32 v4, v4, v8
 ; GFX6-NEXT:    v_max_i32_e32 v5, -1, v2
 ; GFX6-NEXT:    v_sub_i32_e32 v1, vcc, v1, v4
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v4, 16, v6
 ; GFX6-NEXT:    v_sub_i32_e32 v5, vcc, v5, v9
 ; GFX6-NEXT:    v_min_i32_e32 v6, -1, v2
-; GFX6-NEXT:    v_subrev_i32_e32 v6, vcc, s5, v6
+; GFX6-NEXT:    v_sub_i32_e32 v6, vcc, v6, v11
 ; GFX6-NEXT:    v_max_i32_e32 v4, v5, v4
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
 ; GFX6-NEXT:    v_min_i32_e32 v4, v4, v6
 ; GFX6-NEXT:    v_max_i32_e32 v5, -1, v3
-; GFX6-NEXT:    v_bfrev_b32_e32 v11, 1
 ; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, v2, v4
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v4, 16, v7
 ; GFX6-NEXT:    v_sub_i32_e32 v5, vcc, v5, v9
@@ -3320,37 +3298,35 @@ define <3 x float> @v_ssubsat_v6i16(<6 x i16> %lhs, <6 x i16> %rhs) {
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX6-NEXT:    s_brev_b32 s4, -2
 ; GFX6-NEXT:    v_max_i32_e32 v12, -1, v0
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
-; GFX6-NEXT:    s_brev_b32 s5, 1
-; GFX6-NEXT:    v_subrev_i32_e32 v12, vcc, s4, v12
+; GFX6-NEXT:    v_subrev_i32_e32 v12, vcc, 0x7fffffff, v12
 ; GFX6-NEXT:    v_min_i32_e32 v14, -1, v0
-; GFX6-NEXT:    v_subrev_i32_e32 v14, vcc, s5, v14
+; GFX6-NEXT:    v_bfrev_b32_e32 v15, 1
+; GFX6-NEXT:    v_sub_i32_e32 v14, vcc, v14, v15
 ; GFX6-NEXT:    v_max_i32_e32 v6, v12, v6
 ; GFX6-NEXT:    v_min_i32_e32 v6, v6, v14
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX6-NEXT:    v_bfrev_b32_e32 v13, -2
 ; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, v0, v6
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v6, 16, v7
 ; GFX6-NEXT:    v_max_i32_e32 v7, -1, v1
-; GFX6-NEXT:    v_subrev_i32_e32 v7, vcc, s4, v7
+; GFX6-NEXT:    v_sub_i32_e32 v7, vcc, v7, v13
 ; GFX6-NEXT:    v_min_i32_e32 v12, -1, v1
-; GFX6-NEXT:    v_subrev_i32_e32 v12, vcc, s5, v12
+; GFX6-NEXT:    v_sub_i32_e32 v12, vcc, v12, v15
 ; GFX6-NEXT:    v_max_i32_e32 v6, v7, v6
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX6-NEXT:    v_bfrev_b32_e32 v13, -2
 ; GFX6-NEXT:    v_min_i32_e32 v6, v6, v12
 ; GFX6-NEXT:    v_max_i32_e32 v7, -1, v2
 ; GFX6-NEXT:    v_sub_i32_e32 v1, vcc, v1, v6
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v6, 16, v8
 ; GFX6-NEXT:    v_sub_i32_e32 v7, vcc, v7, v13
 ; GFX6-NEXT:    v_min_i32_e32 v8, -1, v2
-; GFX6-NEXT:    v_subrev_i32_e32 v8, vcc, s5, v8
+; GFX6-NEXT:    v_sub_i32_e32 v8, vcc, v8, v15
 ; GFX6-NEXT:    v_max_i32_e32 v6, v7, v6
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
 ; GFX6-NEXT:    v_min_i32_e32 v6, v6, v8
 ; GFX6-NEXT:    v_max_i32_e32 v7, -1, v3
-; GFX6-NEXT:    v_bfrev_b32_e32 v15, 1
 ; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, v2, v6
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v6, 16, v9
 ; GFX6-NEXT:    v_sub_i32_e32 v7, vcc, v7, v13
@@ -3674,37 +3650,35 @@ define <4 x float> @v_ssubsat_v8i16(<8 x i16> %lhs, <8 x i16> %rhs) {
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX6-NEXT:    s_brev_b32 s4, -2
 ; GFX6-NEXT:    v_max_i32_e32 v16, -1, v0
+; GFX6-NEXT:    v_bfrev_b32_e32 v17, -2
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v8, 16, v8
-; GFX6-NEXT:    s_brev_b32 s5, 1
-; GFX6-NEXT:    v_subrev_i32_e32 v16, vcc, s4, v16
+; GFX6-NEXT:    v_sub_i32_e32 v16, vcc, v16, v17
 ; GFX6-NEXT:    v_min_i32_e32 v18, -1, v0
-; GFX6-NEXT:    v_subrev_i32_e32 v18, vcc, s5, v18
+; GFX6-NEXT:    v_bfrev_b32_e32 v19, 1
+; GFX6-NEXT:    v_sub_i32_e32 v18, vcc, v18, v19
 ; GFX6-NEXT:    v_max_i32_e32 v8, v16, v8
 ; GFX6-NEXT:    v_min_i32_e32 v8, v8, v18
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX6-NEXT:    v_sub_i32_e32 v0, vcc, v0, v8
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v8, 16, v9
 ; GFX6-NEXT:    v_max_i32_e32 v9, -1, v1
-; GFX6-NEXT:    v_subrev_i32_e32 v9, vcc, s4, v9
+; GFX6-NEXT:    v_sub_i32_e32 v9, vcc, v9, v17
 ; GFX6-NEXT:    v_min_i32_e32 v16, -1, v1
-; GFX6-NEXT:    v_subrev_i32_e32 v16, vcc, s5, v16
+; GFX6-NEXT:    v_sub_i32_e32 v16, vcc, v16, v19
 ; GFX6-NEXT:    v_max_i32_e32 v8, v9, v8
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX6-NEXT:    v_bfrev_b32_e32 v17, -2
 ; GFX6-NEXT:    v_min_i32_e32 v8, v8, v16
 ; GFX6-NEXT:    v_max_i32_e32 v9, -1, v2
 ; GFX6-NEXT:    v_sub_i32_e32 v1, vcc, v1, v8
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v8, 16, v10
 ; GFX6-NEXT:    v_sub_i32_e32 v9, vcc, v9, v17
 ; GFX6-NEXT:    v_min_i32_e32 v10, -1, v2
-; GFX6-NEXT:    v_subrev_i32_e32 v10, vcc, s5, v10
+; GFX6-NEXT:    v_sub_i32_e32 v10, vcc, v10, v19
 ; GFX6-NEXT:    v_max_i32_e32 v8, v9, v8
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
 ; GFX6-NEXT:    v_min_i32_e32 v8, v8, v10
 ; GFX6-NEXT:    v_max_i32_e32 v9, -1, v3
-; GFX6-NEXT:    v_bfrev_b32_e32 v19, 1
 ; GFX6-NEXT:    v_sub_i32_e32 v2, vcc, v2, v8
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v8, 16, v11
 ; GFX6-NEXT:    v_sub_i32_e32 v9, vcc, v9, v17
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/sub.v2i16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/sub.v2i16.ll
index e04b2b71277d0b7..aa7aa6b21a562f7 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/sub.v2i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/sub.v2i16.ll
@@ -147,10 +147,10 @@ define <2 x i16> @v_sub_v2i16_neg_inline_imm_splat(<2 x i16> %a) {
 ; GFX8-LABEL: v_sub_v2i16_neg_inline_imm_splat:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_mov_b32_e32 v2, 0xffffffc0
-; GFX8-NEXT:    v_subrev_u16_e32 v1, 0xffc0, v0
-; GFX8-NEXT:    v_sub_u16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX8-NEXT:    v_or_b32_e32 v0, v1, v0
+; GFX8-NEXT:    v_mov_b32_e32 v1, 0xffffffc0
+; GFX8-NEXT:    v_subrev_u16_e32 v2, 0xffc0, v0
+; GFX8-NEXT:    v_sub_u16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT:    v_or_b32_e32 v0, v2, v0
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_sub_v2i16_neg_inline_imm_splat:
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/uaddsat.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/uaddsat.ll
index 7744a229392ca91..1821d29d4b050b6 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/uaddsat.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/uaddsat.ll
@@ -225,8 +225,8 @@ define i16 @v_uaddsat_v2i8(i16 %lhs.arg, i16 %rhs.arg) {
 ; GFX9-NEXT:    v_pk_lshlrev_b16 v1, 8, v1 op_sel_hi:[0,1]
 ; GFX9-NEXT:    v_pk_add_u16 v0, v0, v1 clamp
 ; GFX9-NEXT:    v_pk_lshrrev_b16 v0, 8, v0 op_sel_hi:[0,1]
-; GFX9-NEXT:    s_movk_i32 s4, 0xff
-; GFX9-NEXT:    v_and_b32_sdwa v1, v0, s4 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT:    v_mov_b32_e32 v1, 0xff
+; GFX9-NEXT:    v_and_b32_sdwa v1, v0, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; GFX9-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -237,14 +237,14 @@ define i16 @v_uaddsat_v2i8(i16 %lhs.arg, i16 %rhs.arg) {
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v3, 8, v1
 ; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff, v1
-; GFX10-NEXT:    s_movk_i32 s4, 0xff
 ; GFX10-NEXT:    v_lshl_or_b32 v0, v2, 16, v0
 ; GFX10-NEXT:    v_lshl_or_b32 v1, v3, 16, v1
 ; GFX10-NEXT:    v_pk_lshlrev_b16 v0, 8, v0 op_sel_hi:[0,1]
 ; GFX10-NEXT:    v_pk_lshlrev_b16 v1, 8, v1 op_sel_hi:[0,1]
 ; GFX10-NEXT:    v_pk_add_u16 v0, v0, v1 clamp
+; GFX10-NEXT:    v_mov_b32_e32 v1, 0xff
 ; GFX10-NEXT:    v_pk_lshrrev_b16 v0, 8, v0 op_sel_hi:[0,1]
-; GFX10-NEXT:    v_and_b32_sdwa v1, v0, s4 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX10-NEXT:    v_and_b32_sdwa v1, v0, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; GFX10-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -330,8 +330,8 @@ define amdgpu_ps i16 @s_uaddsat_v2i8(i16 inreg %lhs.arg, i16 inreg %rhs.arg) {
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s1
 ; GFX9-NEXT:    v_pk_add_u16 v0, s0, v0 clamp
 ; GFX9-NEXT:    v_pk_lshrrev_b16 v0, 8, v0 op_sel_hi:[0,1]
-; GFX9-NEXT:    s_movk_i32 s0, 0xff
-; GFX9-NEXT:    v_and_b32_sdwa v1, v0, s0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT:    v_mov_b32_e32 v1, 0xff
+; GFX9-NEXT:    v_and_b32_sdwa v1, v0, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; GFX9-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX9-NEXT:    v_readfirstlane_b32 s0, v0
 ; GFX9-NEXT:    ; return to shader part epilog
@@ -350,10 +350,10 @@ define amdgpu_ps i16 @s_uaddsat_v2i8(i16 inreg %lhs.arg, i16 inreg %rhs.arg) {
 ; GFX10-NEXT:    s_lshl_b32 s3, s3, 8
 ; GFX10-NEXT:    s_pack_ll_b32_b16 s0, s0, s2
 ; GFX10-NEXT:    s_pack_ll_b32_b16 s1, s1, s3
+; GFX10-NEXT:    v_mov_b32_e32 v1, 0xff
 ; GFX10-NEXT:    v_pk_add_u16 v0, s0, s1 clamp
-; GFX10-NEXT:    s_movk_i32 s0, 0xff
 ; GFX10-NEXT:    v_pk_lshrrev_b16 v0, 8, v0 op_sel_hi:[0,1]
-; GFX10-NEXT:    v_and_b32_sdwa v1, v0, s0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX10-NEXT:    v_and_b32_sdwa v1, v0, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; GFX10-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX10-NEXT:    v_readfirstlane_b32 s0, v0
 ; GFX10-NEXT:    ; return to shader part epilog
@@ -479,11 +479,11 @@ define i32 @v_uaddsat_v4i8(i32 %lhs.arg, i32 %rhs.arg) {
 ; GFX9-NEXT:    v_pk_add_u16 v2, v2, v3 clamp
 ; GFX9-NEXT:    v_pk_add_u16 v0, v0, v1 clamp
 ; GFX9-NEXT:    v_pk_lshrrev_b16 v1, 8, v2 op_sel_hi:[0,1]
-; GFX9-NEXT:    v_mov_b32_e32 v2, 8
+; GFX9-NEXT:    v_mov_b32_e32 v3, 8
 ; GFX9-NEXT:    v_pk_lshrrev_b16 v0, 8, v0 op_sel_hi:[0,1]
-; GFX9-NEXT:    s_movk_i32 s4, 0xff
-; GFX9-NEXT:    v_lshlrev_b32_sdwa v2, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX9-NEXT:    v_and_or_b32 v1, v1, s4, v2
+; GFX9-NEXT:    v_mov_b32_e32 v2, 0xff
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v3, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+; GFX9-NEXT:    v_and_or_b32 v1, v1, v2, v3
 ; GFX9-NEXT:    v_and_b32_e32 v2, 0xff, v0
 ; GFX9-NEXT:    v_mov_b32_e32 v3, 24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
@@ -517,7 +517,7 @@ define i32 @v_uaddsat_v4i8(i32 %lhs.arg, i32 %rhs.arg) {
 ; GFX10-NEXT:    v_lshlrev_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
 ; GFX10-NEXT:    v_and_b32_e32 v3, 0xff, v0
 ; GFX10-NEXT:    v_lshlrev_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX10-NEXT:    v_and_or_b32 v1, v2, 0xff, v1
+; GFX10-NEXT:    v_and_or_b32 v1, 0xff, v2, v1
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
 ; GFX10-NEXT:    v_or3_b32 v0, v1, v2, v0
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
@@ -549,7 +549,7 @@ define i32 @v_uaddsat_v4i8(i32 %lhs.arg, i32 %rhs.arg) {
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 24, v0
-; GFX11-NEXT:    v_and_or_b32 v1, v1, 0xff, v2
+; GFX11-NEXT:    v_and_or_b32 v1, 0xff, v1, v2
 ; GFX11-NEXT:    v_or3_b32 v0, v1, v3, v0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %lhs = bitcast i32 %lhs.arg to <4 x i8>
@@ -638,46 +638,46 @@ define amdgpu_ps i32 @s_uaddsat_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg) {
 ;
 ; GFX9-LABEL: s_uaddsat_v4i8:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_lshr_b32 s3, s0, 8
+; GFX9-NEXT:    s_lshr_b32 s2, s0, 8
+; GFX9-NEXT:    s_lshr_b32 s3, s0, 16
+; GFX9-NEXT:    s_lshr_b32 s4, s0, 24
+; GFX9-NEXT:    s_pack_ll_b32_b16 s0, s0, s2
+; GFX9-NEXT:    s_pack_ll_b32_b16 s2, s3, s4
 ; GFX9-NEXT:    s_lshr_b32 s4, s0, 16
-; GFX9-NEXT:    s_lshr_b32 s6, s0, 24
-; GFX9-NEXT:    s_pack_ll_b32_b16 s0, s0, s3
-; GFX9-NEXT:    s_pack_ll_b32_b16 s3, s4, s6
-; GFX9-NEXT:    s_lshr_b32 s6, s0, 16
 ; GFX9-NEXT:    s_lshl_b32 s0, s0, 0x80008
-; GFX9-NEXT:    s_lshl_b32 s6, s6, 8
-; GFX9-NEXT:    s_lshr_b32 s7, s1, 8
-; GFX9-NEXT:    s_pack_ll_b32_b16 s0, s0, s6
-; GFX9-NEXT:    s_lshr_b32 s6, s3, 16
-; GFX9-NEXT:    s_lshr_b32 s8, s1, 16
-; GFX9-NEXT:    s_lshr_b32 s9, s1, 24
-; GFX9-NEXT:    s_pack_ll_b32_b16 s1, s1, s7
-; GFX9-NEXT:    s_lshl_b32 s3, s3, 0x80008
-; GFX9-NEXT:    s_lshl_b32 s6, s6, 8
-; GFX9-NEXT:    s_pack_ll_b32_b16 s3, s3, s6
+; GFX9-NEXT:    s_lshl_b32 s4, s4, 8
+; GFX9-NEXT:    s_lshr_b32 s5, s1, 8
+; GFX9-NEXT:    s_pack_ll_b32_b16 s0, s0, s4
+; GFX9-NEXT:    s_lshr_b32 s4, s2, 16
 ; GFX9-NEXT:    s_lshr_b32 s6, s1, 16
-; GFX9-NEXT:    s_pack_ll_b32_b16 s4, s8, s9
+; GFX9-NEXT:    s_lshr_b32 s7, s1, 24
+; GFX9-NEXT:    s_pack_ll_b32_b16 s1, s1, s5
+; GFX9-NEXT:    s_lshl_b32 s2, s2, 0x80008
+; GFX9-NEXT:    s_lshl_b32 s4, s4, 8
+; GFX9-NEXT:    s_pack_ll_b32_b16 s2, s2, s4
+; GFX9-NEXT:    s_lshr_b32 s4, s1, 16
+; GFX9-NEXT:    s_pack_ll_b32_b16 s3, s6, s7
 ; GFX9-NEXT:    s_lshl_b32 s1, s1, 0x80008
-; GFX9-NEXT:    s_lshl_b32 s6, s6, 8
-; GFX9-NEXT:    s_pack_ll_b32_b16 s1, s1, s6
-; GFX9-NEXT:    s_lshr_b32 s6, s4, 16
-; GFX9-NEXT:    s_lshl_b32 s4, s4, 0x80008
-; GFX9-NEXT:    s_lshl_b32 s6, s6, 8
-; GFX9-NEXT:    s_pack_ll_b32_b16 s4, s4, s6
+; GFX9-NEXT:    s_lshl_b32 s4, s4, 8
+; GFX9-NEXT:    s_pack_ll_b32_b16 s1, s1, s4
+; GFX9-NEXT:    s_lshr_b32 s4, s3, 16
+; GFX9-NEXT:    s_lshl_b32 s3, s3, 0x80008
+; GFX9-NEXT:    s_lshl_b32 s4, s4, 8
+; GFX9-NEXT:    s_pack_ll_b32_b16 s3, s3, s4
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s1
 ; GFX9-NEXT:    v_pk_add_u16 v0, s0, v0 clamp
-; GFX9-NEXT:    v_mov_b32_e32 v1, s4
-; GFX9-NEXT:    s_mov_b32 s2, 8
-; GFX9-NEXT:    v_pk_add_u16 v1, s3, v1 clamp
+; GFX9-NEXT:    v_mov_b32_e32 v1, s3
+; GFX9-NEXT:    v_pk_add_u16 v1, s2, v1 clamp
 ; GFX9-NEXT:    v_pk_lshrrev_b16 v0, 8, v0 op_sel_hi:[0,1]
+; GFX9-NEXT:    v_mov_b32_e32 v3, 8
 ; GFX9-NEXT:    v_pk_lshrrev_b16 v1, 8, v1 op_sel_hi:[0,1]
-; GFX9-NEXT:    s_movk_i32 s0, 0xff
-; GFX9-NEXT:    v_lshlrev_b32_sdwa v2, s2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX9-NEXT:    s_mov_b32 s5, 24
-; GFX9-NEXT:    v_and_or_b32 v0, v0, s0, v2
+; GFX9-NEXT:    v_mov_b32_e32 v2, 0xff
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v3, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+; GFX9-NEXT:    v_and_or_b32 v0, v0, v2, v3
 ; GFX9-NEXT:    v_and_b32_e32 v2, 0xff, v1
+; GFX9-NEXT:    v_mov_b32_e32 v3, 24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX9-NEXT:    v_lshlrev_b32_sdwa v1, s5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
 ; GFX9-NEXT:    v_or3_b32 v0, v0, v2, v1
 ; GFX9-NEXT:    v_readfirstlane_b32 s0, v0
 ; GFX9-NEXT:    ; return to shader part epilog
@@ -712,14 +712,14 @@ define amdgpu_ps i32 @s_uaddsat_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg) {
 ; GFX10-NEXT:    s_pack_ll_b32_b16 s3, s3, s5
 ; GFX10-NEXT:    v_pk_add_u16 v0, s0, s1 clamp
 ; GFX10-NEXT:    v_pk_add_u16 v1, s2, s3 clamp
-; GFX10-NEXT:    s_mov_b32 s0, 8
+; GFX10-NEXT:    v_mov_b32_e32 v2, 8
+; GFX10-NEXT:    v_mov_b32_e32 v4, 24
 ; GFX10-NEXT:    v_pk_lshrrev_b16 v0, 8, v0 op_sel_hi:[0,1]
 ; GFX10-NEXT:    v_pk_lshrrev_b16 v1, 8, v1 op_sel_hi:[0,1]
-; GFX10-NEXT:    v_lshlrev_b32_sdwa v2, s0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+; GFX10-NEXT:    v_lshlrev_b32_sdwa v2, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
 ; GFX10-NEXT:    v_and_b32_e32 v3, 0xff, v1
-; GFX10-NEXT:    s_mov_b32 s0, 24
-; GFX10-NEXT:    v_lshlrev_b32_sdwa v1, s0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX10-NEXT:    v_and_or_b32 v0, v0, 0xff, v2
+; GFX10-NEXT:    v_lshlrev_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+; GFX10-NEXT:    v_and_or_b32 v0, 0xff, v0, v2
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
 ; GFX10-NEXT:    v_or3_b32 v0, v0, v2, v1
 ; GFX10-NEXT:    v_readfirstlane_b32 s0, v0
@@ -759,7 +759,7 @@ define amdgpu_ps i32 @s_uaddsat_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg) {
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
 ; GFX11-NEXT:    v_and_b32_e32 v3, 0xff, v1
 ; GFX11-NEXT:    v_bfe_u32 v1, v1, 16, 8
-; GFX11-NEXT:    v_and_or_b32 v0, v0, 0xff, v2
+; GFX11-NEXT:    v_and_or_b32 v0, 0xff, v0, v2
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 24, v1
 ; GFX11-NEXT:    v_or3_b32 v0, v0, v2, v1
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i32.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i32.ll
index 926c3d59e2e463b..6588112973f4c94 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i32.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i32.ll
@@ -242,15 +242,15 @@ define <2 x i32> @v_udiv_v2i32_oddk_denom(<2 x i32> %num) {
 ; CHECK-LABEL: v_udiv_v2i32_oddk_denom:
 ; CHECK:       ; %bb.0:
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_mov_b32 s4, 0xb2a50881
-; CHECK-NEXT:    v_mul_hi_u32 v2, v0, s4
-; CHECK-NEXT:    v_mul_hi_u32 v3, v1, s4
-; CHECK-NEXT:    v_sub_i32_e32 v0, vcc, v0, v2
-; CHECK-NEXT:    v_sub_i32_e32 v1, vcc, v1, v3
+; CHECK-NEXT:    v_mov_b32_e32 v2, 0xb2a50881
+; CHECK-NEXT:    v_mul_hi_u32 v3, v0, v2
+; CHECK-NEXT:    v_mul_hi_u32 v2, v1, v2
+; CHECK-NEXT:    v_sub_i32_e32 v0, vcc, v0, v3
+; CHECK-NEXT:    v_sub_i32_e32 v1, vcc, v1, v2
 ; CHECK-NEXT:    v_lshrrev_b32_e32 v0, 1, v0
 ; CHECK-NEXT:    v_lshrrev_b32_e32 v1, 1, v1
-; CHECK-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
-; CHECK-NEXT:    v_add_i32_e32 v1, vcc, v1, v3
+; CHECK-NEXT:    v_add_i32_e32 v0, vcc, v0, v3
+; CHECK-NEXT:    v_add_i32_e32 v1, vcc, v1, v2
 ; CHECK-NEXT:    v_lshrrev_b32_e32 v0, 20, v0
 ; CHECK-NEXT:    v_lshrrev_b32_e32 v1, 20, v1
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i64.ll
index b3b57e14cb3fb55..3add708d1a6394d 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i64.ll
@@ -980,28 +980,28 @@ define i64 @v_udiv_i64_oddk_denom(i64 %num) {
 ; CHECK-LABEL: v_udiv_i64_oddk_denom:
 ; CHECK:       ; %bb.0:
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_mov_b32 s4, 0x1fb03c31
-; CHECK-NEXT:    s_mov_b32 s5, 0xd9528440
-; CHECK-NEXT:    v_mul_lo_u32 v2, v1, s4
-; CHECK-NEXT:    v_mul_lo_u32 v3, v0, s5
-; CHECK-NEXT:    v_mul_hi_u32 v4, v0, s4
-; CHECK-NEXT:    v_mul_lo_u32 v5, v1, s5
-; CHECK-NEXT:    v_mul_hi_u32 v6, v1, s4
-; CHECK-NEXT:    v_mul_hi_u32 v0, v0, s5
-; CHECK-NEXT:    v_add_i32_e32 v2, vcc, v2, v3
-; CHECK-NEXT:    v_cndmask_b32_e64 v3, 0, 1, vcc
-; CHECK-NEXT:    v_add_i32_e32 v5, vcc, v5, v6
-; CHECK-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc
-; CHECK-NEXT:    v_add_i32_e32 v2, vcc, v2, v4
-; CHECK-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
-; CHECK-NEXT:    v_add_i32_e32 v0, vcc, v5, v0
+; CHECK-NEXT:    v_mov_b32_e32 v2, 0x1fb03c31
+; CHECK-NEXT:    v_mov_b32_e32 v3, 0xd9528440
+; CHECK-NEXT:    v_mul_lo_u32 v4, v1, v2
+; CHECK-NEXT:    v_mul_lo_u32 v5, v0, v3
+; CHECK-NEXT:    v_mul_hi_u32 v6, v0, v2
+; CHECK-NEXT:    v_mul_lo_u32 v7, v1, v3
+; CHECK-NEXT:    v_mul_hi_u32 v2, v1, v2
+; CHECK-NEXT:    v_mul_hi_u32 v0, v0, v3
+; CHECK-NEXT:    v_add_i32_e32 v4, vcc, v4, v5
+; CHECK-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
+; CHECK-NEXT:    v_add_i32_e32 v2, vcc, v7, v2
+; CHECK-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
+; CHECK-NEXT:    v_add_i32_e32 v4, vcc, v4, v6
 ; CHECK-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc
-; CHECK-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
-; CHECK-NEXT:    v_add_i32_e32 v3, vcc, v6, v4
-; CHECK-NEXT:    v_add_i32_e32 v0, vcc, v0, v2
+; CHECK-NEXT:    v_add_i32_e32 v0, vcc, v2, v0
 ; CHECK-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
-; CHECK-NEXT:    v_add_i32_e32 v2, vcc, v3, v2
-; CHECK-NEXT:    v_mul_hi_u32 v1, v1, s5
+; CHECK-NEXT:    v_add_i32_e32 v4, vcc, v5, v4
+; CHECK-NEXT:    v_add_i32_e32 v2, vcc, v7, v2
+; CHECK-NEXT:    v_add_i32_e32 v0, vcc, v0, v4
+; CHECK-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc
+; CHECK-NEXT:    v_add_i32_e32 v2, vcc, v2, v4
+; CHECK-NEXT:    v_mul_hi_u32 v1, v1, v3
 ; CHECK-NEXT:    v_add_i32_e32 v1, vcc, v1, v2
 ; CHECK-NEXT:    v_lshr_b64 v[0:1], v[0:1], 20
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
@@ -1013,50 +1013,50 @@ define <2 x i64> @v_udiv_v2i64_oddk_denom(<2 x i64> %num) {
 ; CHECK-LABEL: v_udiv_v2i64_oddk_denom:
 ; CHECK:       ; %bb.0:
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_mov_b32 s4, 0x1fb03c31
-; CHECK-NEXT:    s_mov_b32 s5, 0xd9528440
-; CHECK-NEXT:    v_mul_lo_u32 v4, v1, s4
-; CHECK-NEXT:    v_mul_lo_u32 v5, v0, s5
-; CHECK-NEXT:    v_mul_hi_u32 v6, v0, s4
-; CHECK-NEXT:    v_mul_lo_u32 v7, v1, s5
-; CHECK-NEXT:    v_mul_hi_u32 v8, v1, s4
-; CHECK-NEXT:    v_mul_hi_u32 v0, v0, s5
-; CHECK-NEXT:    v_mul_hi_u32 v1, v1, s5
-; CHECK-NEXT:    v_mul_lo_u32 v9, v3, s4
-; CHECK-NEXT:    v_mul_lo_u32 v10, v2, s5
-; CHECK-NEXT:    v_mul_hi_u32 v11, v2, s4
-; CHECK-NEXT:    v_mul_lo_u32 v12, v3, s5
-; CHECK-NEXT:    v_mul_hi_u32 v13, v3, s4
-; CHECK-NEXT:    v_mul_hi_u32 v2, v2, s5
-; CHECK-NEXT:    v_mul_hi_u32 v3, v3, s5
-; CHECK-NEXT:    v_add_i32_e32 v4, vcc, v4, v5
+; CHECK-NEXT:    v_mov_b32_e32 v4, 0x1fb03c31
+; CHECK-NEXT:    v_mov_b32_e32 v5, 0xd9528440
+; CHECK-NEXT:    v_mul_lo_u32 v6, v1, v4
+; CHECK-NEXT:    v_mul_lo_u32 v7, v0, v5
+; CHECK-NEXT:    v_mul_hi_u32 v8, v0, v4
+; CHECK-NEXT:    v_mul_lo_u32 v9, v1, v5
+; CHECK-NEXT:    v_mul_hi_u32 v10, v1, v4
+; CHECK-NEXT:    v_mul_hi_u32 v0, v0, v5
+; CHECK-NEXT:    v_mul_hi_u32 v1, v1, v5
+; CHECK-NEXT:    v_mul_lo_u32 v11, v3, v4
+; CHECK-NEXT:    v_mul_lo_u32 v12, v2, v5
+; CHECK-NEXT:    v_mul_hi_u32 v13, v2, v4
+; CHECK-NEXT:    v_mul_lo_u32 v14, v3, v5
+; CHECK-NEXT:    v_mul_hi_u32 v4, v3, v4
+; CHECK-NEXT:    v_mul_hi_u32 v2, v2, v5
+; CHECK-NEXT:    v_mul_hi_u32 v3, v3, v5
+; CHECK-NEXT:    v_add_i32_e32 v5, vcc, v6, v7
+; CHECK-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc
+; CHECK-NEXT:    v_add_i32_e32 v7, vcc, v9, v10
+; CHECK-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
+; CHECK-NEXT:    v_add_i32_e32 v10, vcc, v11, v12
+; CHECK-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
+; CHECK-NEXT:    v_add_i32_e32 v4, vcc, v14, v4
+; CHECK-NEXT:    v_cndmask_b32_e64 v12, 0, 1, vcc
+; CHECK-NEXT:    v_add_i32_e32 v5, vcc, v5, v8
 ; CHECK-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
-; CHECK-NEXT:    v_add_i32_e32 v7, vcc, v7, v8
-; CHECK-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
-; CHECK-NEXT:    v_add_i32_e32 v9, vcc, v9, v10
-; CHECK-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
-; CHECK-NEXT:    v_add_i32_e32 v12, vcc, v12, v13
-; CHECK-NEXT:    v_cndmask_b32_e64 v13, 0, 1, vcc
-; CHECK-NEXT:    v_add_i32_e32 v4, vcc, v4, v6
-; CHECK-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc
 ; CHECK-NEXT:    v_add_i32_e32 v0, vcc, v7, v0
-; CHECK-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc
-; CHECK-NEXT:    v_add_i32_e32 v7, vcc, v9, v11
 ; CHECK-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
-; CHECK-NEXT:    v_add_i32_e32 v2, vcc, v12, v2
-; CHECK-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
-; CHECK-NEXT:    v_add_i32_e32 v4, vcc, v5, v4
-; CHECK-NEXT:    v_add_i32_e32 v5, vcc, v8, v6
-; CHECK-NEXT:    v_add_i32_e32 v6, vcc, v10, v7
-; CHECK-NEXT:    v_add_i32_e32 v7, vcc, v13, v9
-; CHECK-NEXT:    v_add_i32_e32 v0, vcc, v0, v4
+; CHECK-NEXT:    v_add_i32_e32 v8, vcc, v10, v13
+; CHECK-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
+; CHECK-NEXT:    v_add_i32_e32 v2, vcc, v4, v2
 ; CHECK-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc
-; CHECK-NEXT:    v_add_i32_e32 v2, vcc, v2, v6
-; CHECK-NEXT:    v_cndmask_b32_e64 v6, 0, 1, vcc
-; CHECK-NEXT:    v_add_i32_e32 v4, vcc, v5, v4
-; CHECK-NEXT:    v_add_i32_e32 v5, vcc, v7, v6
-; CHECK-NEXT:    v_add_i32_e32 v1, vcc, v1, v4
-; CHECK-NEXT:    v_add_i32_e32 v3, vcc, v3, v5
+; CHECK-NEXT:    v_add_i32_e32 v5, vcc, v6, v5
+; CHECK-NEXT:    v_add_i32_e32 v6, vcc, v9, v7
+; CHECK-NEXT:    v_add_i32_e32 v7, vcc, v11, v8
+; CHECK-NEXT:    v_add_i32_e32 v4, vcc, v12, v4
+; CHECK-NEXT:    v_add_i32_e32 v0, vcc, v0, v5
+; CHECK-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
+; CHECK-NEXT:    v_add_i32_e32 v2, vcc, v2, v7
+; CHECK-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
+; CHECK-NEXT:    v_add_i32_e32 v5, vcc, v6, v5
+; CHECK-NEXT:    v_add_i32_e32 v4, vcc, v4, v7
+; CHECK-NEXT:    v_add_i32_e32 v1, vcc, v1, v5
+; CHECK-NEXT:    v_add_i32_e32 v3, vcc, v3, v4
 ; CHECK-NEXT:    v_lshr_b64 v[0:1], v[0:1], 20
 ; CHECK-NEXT:    v_lshr_b64 v[2:3], v[2:3], 20
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
@@ -1248,9 +1248,10 @@ define <2 x i64> @v_udiv_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) {
 ; GISEL-LABEL: v_udiv_v2i64_pow2_shl_denom:
 ; GISEL:       ; %bb.0:
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT:    s_mov_b64 s[4:5], 0x1000
-; GISEL-NEXT:    v_lshl_b64 v[7:8], s[4:5], v4
-; GISEL-NEXT:    v_lshl_b64 v[4:5], s[4:5], v6
+; GISEL-NEXT:    v_mov_b32_e32 v9, 0x1000
+; GISEL-NEXT:    v_mov_b32_e32 v10, 0
+; GISEL-NEXT:    v_lshl_b64 v[7:8], v[9:10], v4
+; GISEL-NEXT:    v_lshl_b64 v[4:5], v[9:10], v6
 ; GISEL-NEXT:    v_cvt_f32_u32_e32 v10, v7
 ; GISEL-NEXT:    v_cvt_f32_u32_e32 v11, v8
 ; GISEL-NEXT:    v_sub_i32_e32 v6, vcc, 0, v7
@@ -1510,11 +1511,10 @@ define <2 x i64> @v_udiv_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) {
 ; CGP-NEXT:    v_mov_b32_e32 v9, v1
 ; CGP-NEXT:    v_mov_b32_e32 v5, v2
 ; CGP-NEXT:    v_mov_b32_e32 v7, v3
-; CGP-NEXT:    s_mov_b64 s[4:5], 0x1000
 ; CGP-NEXT:    v_mov_b32_e32 v10, 0x1000
 ; CGP-NEXT:    v_mov_b32_e32 v11, 0
 ; CGP-NEXT:    v_mov_b32_e32 v0, 0
-; CGP-NEXT:    v_lshl_b64 v[2:3], s[4:5], v4
+; CGP-NEXT:    v_lshl_b64 v[2:3], v[10:11], v4
 ; CGP-NEXT:    v_or_b32_e32 v1, v9, v3
 ; CGP-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[0:1]
 ; CGP-NEXT:    v_cvt_f32_u32_e32 v4, v2
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/udivrem.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/udivrem.ll
index fba8ef2948ade94..8b4e218f78948b2 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/udivrem.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/udivrem.ll
@@ -234,14 +234,13 @@ define amdgpu_kernel void @udivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1)
 ; GFX8-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v11
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v9, v12, vcc
 ; GFX8-NEXT:    v_cndmask_b32_e32 v9, v10, v13, vcc
-; GFX8-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v1
-; GFX8-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v11
-; GFX8-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc
-; GFX8-NEXT:    v_cndmask_b32_e32 v1, v3, v9, vcc
-; GFX8-NEXT:    v_cndmask_b32_e64 v3, v7, v5, s[0:1]
-; GFX8-NEXT:    v_cndmask_b32_e64 v4, v8, v14, s[0:1]
-; GFX8-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc
-; GFX8-NEXT:    v_cndmask_b32_e32 v3, v6, v4, vcc
+; GFX8-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v1
+; GFX8-NEXT:    v_cndmask_b32_e64 v0, v4, v0, s[0:1]
+; GFX8-NEXT:    v_cndmask_b32_e64 v1, v3, v9, s[0:1]
+; GFX8-NEXT:    v_cndmask_b32_e32 v3, v7, v5, vcc
+; GFX8-NEXT:    v_cndmask_b32_e32 v4, v8, v14, vcc
+; GFX8-NEXT:    v_cndmask_b32_e64 v2, v2, v3, s[0:1]
+; GFX8-NEXT:    v_cndmask_b32_e64 v3, v6, v4, s[0:1]
 ; GFX8-NEXT:    v_mov_b32_e32 v4, s4
 ; GFX8-NEXT:    v_mov_b32_e32 v5, s5
 ; GFX8-NEXT:    flat_store_dwordx2 v[4:5], v[0:1]
@@ -372,14 +371,13 @@ define amdgpu_kernel void @udivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1)
 ; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v12
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v10, v13, vcc
 ; GFX9-NEXT:    v_cndmask_b32_e32 v10, v11, v14, vcc
-; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v1
-; GFX9-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v12
-; GFX9-NEXT:    v_cndmask_b32_e32 v0, v5, v0, vcc
-; GFX9-NEXT:    v_cndmask_b32_e32 v1, v3, v10, vcc
-; GFX9-NEXT:    v_cndmask_b32_e64 v3, v8, v7, s[0:1]
-; GFX9-NEXT:    v_cndmask_b32_e64 v5, v9, v15, s[0:1]
-; GFX9-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc
-; GFX9-NEXT:    v_cndmask_b32_e32 v3, v6, v5, vcc
+; GFX9-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v1
+; GFX9-NEXT:    v_cndmask_b32_e64 v0, v5, v0, s[0:1]
+; GFX9-NEXT:    v_cndmask_b32_e64 v1, v3, v10, s[0:1]
+; GFX9-NEXT:    v_cndmask_b32_e32 v3, v8, v7, vcc
+; GFX9-NEXT:    v_cndmask_b32_e32 v5, v9, v15, vcc
+; GFX9-NEXT:    v_cndmask_b32_e64 v2, v2, v3, s[0:1]
+; GFX9-NEXT:    v_cndmask_b32_e64 v3, v6, v5, s[0:1]
 ; GFX9-NEXT:    global_store_dwordx2 v4, v[0:1], s[4:5]
 ; GFX9-NEXT:    global_store_dwordx2 v4, v[2:3], s[6:7]
 ; GFX9-NEXT:    s_endpgm
@@ -474,49 +472,48 @@ define amdgpu_kernel void @udivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1)
 ; GFX10-NEXT:    v_add_co_u32 v2, s0, v2, v0
 ; GFX10-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
 ; GFX10-NEXT:    v_mul_lo_u32 v4, s11, v2
-; GFX10-NEXT:    v_add_co_u32 v6, vcc_lo, v2, 1
 ; GFX10-NEXT:    v_add3_u32 v3, v3, v0, v1
 ; GFX10-NEXT:    v_mad_u64_u32 v[0:1], s0, s10, v2, 0
 ; GFX10-NEXT:    v_mul_lo_u32 v5, s10, v3
-; GFX10-NEXT:    v_add_co_ci_u32_e32 v7, vcc_lo, 0, v3, vcc_lo
 ; GFX10-NEXT:    v_add3_u32 v1, v1, v5, v4
-; GFX10-NEXT:    v_add_co_u32 v4, vcc_lo, v6, 1
-; GFX10-NEXT:    v_add_co_ci_u32_e32 v5, vcc_lo, 0, v7, vcc_lo
-; GFX10-NEXT:    v_sub_nc_u32_e32 v8, s9, v1
-; GFX10-NEXT:    v_sub_co_u32 v9, vcc_lo, s8, v0
-; GFX10-NEXT:    v_sub_co_ci_u32_e64 v10, s0, s9, v1, vcc_lo
-; GFX10-NEXT:    v_subrev_co_ci_u32_e32 v0, vcc_lo, s11, v8, vcc_lo
-; GFX10-NEXT:    v_cmp_le_u32_e32 vcc_lo, s10, v9
+; GFX10-NEXT:    v_add_co_u32 v4, vcc_lo, v2, 1
+; GFX10-NEXT:    v_add_co_ci_u32_e32 v5, vcc_lo, 0, v3, vcc_lo
+; GFX10-NEXT:    v_sub_nc_u32_e32 v6, s9, v1
+; GFX10-NEXT:    v_sub_co_u32 v7, vcc_lo, s8, v0
+; GFX10-NEXT:    v_sub_co_ci_u32_e64 v8, s0, s9, v1, vcc_lo
+; GFX10-NEXT:    v_subrev_co_ci_u32_e32 v0, vcc_lo, s11, v6, vcc_lo
+; GFX10-NEXT:    v_cmp_le_u32_e32 vcc_lo, s10, v7
 ; GFX10-NEXT:    v_cndmask_b32_e64 v1, 0, -1, vcc_lo
-; GFX10-NEXT:    v_sub_co_u32 v8, vcc_lo, v9, s10
-; GFX10-NEXT:    v_subrev_co_ci_u32_e64 v11, s0, 0, v0, vcc_lo
-; GFX10-NEXT:    v_cmp_le_u32_e64 s0, s11, v10
+; GFX10-NEXT:    v_sub_co_u32 v6, vcc_lo, v7, s10
+; GFX10-NEXT:    v_subrev_co_ci_u32_e64 v9, s0, 0, v0, vcc_lo
+; GFX10-NEXT:    v_cmp_le_u32_e64 s0, s11, v8
 ; GFX10-NEXT:    v_subrev_co_ci_u32_e32 v0, vcc_lo, s11, v0, vcc_lo
-; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, s11, v11
+; GFX10-NEXT:    v_cndmask_b32_e64 v10, 0, -1, s0
+; GFX10-NEXT:    v_cmp_le_u32_e64 s0, s10, v6
+; GFX10-NEXT:    v_cndmask_b32_e64 v11, 0, -1, s0
+; GFX10-NEXT:    v_cmp_le_u32_e64 s0, s11, v9
 ; GFX10-NEXT:    v_cndmask_b32_e64 v12, 0, -1, s0
-; GFX10-NEXT:    v_cmp_le_u32_e64 s0, s10, v8
-; GFX10-NEXT:    v_cndmask_b32_e64 v13, 0, -1, s0
-; GFX10-NEXT:    v_cmp_le_u32_e64 s0, s11, v11
-; GFX10-NEXT:    v_cndmask_b32_e64 v14, 0, -1, s0
-; GFX10-NEXT:    v_cmp_eq_u32_e64 s0, s11, v10
-; GFX10-NEXT:    v_cndmask_b32_e64 v1, v12, v1, s0
-; GFX10-NEXT:    v_cndmask_b32_e32 v12, v14, v13, vcc_lo
-; GFX10-NEXT:    v_sub_co_u32 v13, vcc_lo, v8, s10
-; GFX10-NEXT:    v_subrev_co_ci_u32_e32 v0, vcc_lo, 0, v0, vcc_lo
-; GFX10-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v12
-; GFX10-NEXT:    v_cmp_ne_u32_e64 s0, 0, v12
-; GFX10-NEXT:    v_cmp_ne_u32_e64 s1, 0, v1
-; GFX10-NEXT:    v_cndmask_b32_e32 v4, v6, v4, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e32 v1, v7, v5, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e64 v6, v8, v13, s0
-; GFX10-NEXT:    v_cndmask_b32_e64 v5, v11, v0, s0
-; GFX10-NEXT:    v_mov_b32_e32 v7, 0
-; GFX10-NEXT:    v_cndmask_b32_e64 v0, v2, v4, s1
-; GFX10-NEXT:    v_cndmask_b32_e64 v1, v3, v1, s1
-; GFX10-NEXT:    v_cndmask_b32_e64 v2, v9, v6, s1
-; GFX10-NEXT:    v_cndmask_b32_e64 v3, v10, v5, s1
-; GFX10-NEXT:    global_store_dwordx2 v7, v[0:1], s[4:5]
-; GFX10-NEXT:    global_store_dwordx2 v7, v[2:3], s[6:7]
+; GFX10-NEXT:    v_add_co_u32 v13, s0, v4, 1
+; GFX10-NEXT:    v_add_co_ci_u32_e64 v14, s0, 0, v5, s0
+; GFX10-NEXT:    v_cmp_eq_u32_e64 s0, s11, v9
+; GFX10-NEXT:    v_cndmask_b32_e64 v11, v12, v11, s0
+; GFX10-NEXT:    v_cmp_eq_u32_e64 s0, s11, v8
+; GFX10-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v11
+; GFX10-NEXT:    v_cndmask_b32_e64 v1, v10, v1, s0
+; GFX10-NEXT:    v_sub_co_u32 v10, s0, v6, s10
+; GFX10-NEXT:    v_subrev_co_ci_u32_e64 v0, s0, 0, v0, s0
+; GFX10-NEXT:    v_cndmask_b32_e32 v4, v4, v13, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v5, v5, v14, vcc_lo
+; GFX10-NEXT:    v_cmp_ne_u32_e64 s0, 0, v1
+; GFX10-NEXT:    v_cndmask_b32_e32 v6, v6, v10, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v9, v9, v0, vcc_lo
+; GFX10-NEXT:    v_mov_b32_e32 v10, 0
+; GFX10-NEXT:    v_cndmask_b32_e64 v0, v2, v4, s0
+; GFX10-NEXT:    v_cndmask_b32_e64 v1, v3, v5, s0
+; GFX10-NEXT:    v_cndmask_b32_e64 v2, v7, v6, s0
+; GFX10-NEXT:    v_cndmask_b32_e64 v3, v8, v9, s0
+; GFX10-NEXT:    global_store_dwordx2 v10, v[0:1], s[4:5]
+; GFX10-NEXT:    global_store_dwordx2 v10, v[2:3], s[6:7]
 ; GFX10-NEXT:    s_endpgm
   %div = udiv i64 %x, %y
   store i64 %div, ptr addrspace(1) %out0
@@ -1151,40 +1148,39 @@ define amdgpu_kernel void @udivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1
 ; GFX8-NEXT:    v_mov_b32_e32 v1, v4
 ; GFX8-NEXT:    v_mad_u64_u32 v[4:5], s[0:1], s2, v14, v[1:2]
 ; GFX8-NEXT:    v_cndmask_b32_e32 v12, v13, v18, vcc
-; GFX8-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v9
-; GFX8-NEXT:    v_mad_u64_u32 v[4:5], s[0:1], s3, v15, v[4:5]
-; GFX8-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v16
-; GFX8-NEXT:    v_cndmask_b32_e32 v1, v6, v2, vcc
-; GFX8-NEXT:    v_cndmask_b32_e32 v2, v7, v12, vcc
-; GFX8-NEXT:    v_cndmask_b32_e64 v5, v10, v19, s[0:1]
+; GFX8-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v9
+; GFX8-NEXT:    v_mad_u64_u32 v[4:5], s[2:3], s3, v15, v[4:5]
+; GFX8-NEXT:    v_cndmask_b32_e64 v1, v6, v2, s[0:1]
+; GFX8-NEXT:    v_cndmask_b32_e64 v2, v7, v12, s[0:1]
+; GFX8-NEXT:    v_cndmask_b32_e32 v5, v10, v19, vcc
 ; GFX8-NEXT:    v_mul_lo_u32 v7, v14, v3
 ; GFX8-NEXT:    v_mul_lo_u32 v9, v15, v4
-; GFX8-NEXT:    v_cndmask_b32_e32 v5, v8, v5, vcc
+; GFX8-NEXT:    v_cndmask_b32_e64 v5, v8, v5, s[0:1]
 ; GFX8-NEXT:    v_mul_hi_u32 v8, v15, v3
-; GFX8-NEXT:    v_cndmask_b32_e64 v6, v11, v20, s[0:1]
-; GFX8-NEXT:    v_add_u32_e64 v7, s[0:1], v7, v9
-; GFX8-NEXT:    v_cndmask_b32_e64 v9, 0, 1, s[0:1]
-; GFX8-NEXT:    v_add_u32_e64 v7, s[0:1], v7, v8
-; GFX8-NEXT:    v_cndmask_b32_e64 v7, 0, 1, s[0:1]
+; GFX8-NEXT:    v_cndmask_b32_e32 v6, v11, v20, vcc
+; GFX8-NEXT:    v_add_u32_e32 v7, vcc, v7, v9
+; GFX8-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
+; GFX8-NEXT:    v_add_u32_e32 v7, vcc, v7, v8
+; GFX8-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
 ; GFX8-NEXT:    v_mul_lo_u32 v8, v14, v4
 ; GFX8-NEXT:    v_mul_hi_u32 v3, v14, v3
-; GFX8-NEXT:    v_add_u32_e64 v7, s[0:1], v9, v7
+; GFX8-NEXT:    v_add_u32_e32 v7, vcc, v9, v7
 ; GFX8-NEXT:    v_mul_hi_u32 v9, v15, v4
-; GFX8-NEXT:    v_add_u32_e64 v3, s[0:1], v8, v3
-; GFX8-NEXT:    v_cndmask_b32_e64 v8, 0, 1, s[0:1]
-; GFX8-NEXT:    v_add_u32_e64 v3, s[0:1], v3, v9
-; GFX8-NEXT:    v_cndmask_b32_e64 v9, 0, 1, s[0:1]
-; GFX8-NEXT:    v_add_u32_e64 v8, s[0:1], v8, v9
+; GFX8-NEXT:    v_add_u32_e32 v3, vcc, v8, v3
+; GFX8-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
+; GFX8-NEXT:    v_add_u32_e32 v3, vcc, v3, v9
+; GFX8-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
+; GFX8-NEXT:    v_add_u32_e32 v8, vcc, v8, v9
 ; GFX8-NEXT:    v_mul_hi_u32 v4, v14, v4
-; GFX8-NEXT:    v_add_u32_e64 v3, s[0:1], v3, v7
-; GFX8-NEXT:    v_cndmask_b32_e64 v7, 0, 1, s[0:1]
-; GFX8-NEXT:    v_add_u32_e64 v7, s[0:1], v8, v7
-; GFX8-NEXT:    v_add_u32_e64 v4, s[0:1], v4, v7
-; GFX8-NEXT:    v_add_u32_e64 v3, s[0:1], v15, v3
-; GFX8-NEXT:    v_addc_u32_e64 v4, s[0:1], v14, v4, s[0:1]
+; GFX8-NEXT:    v_add_u32_e32 v3, vcc, v3, v7
+; GFX8-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
+; GFX8-NEXT:    v_add_u32_e32 v7, vcc, v8, v7
+; GFX8-NEXT:    v_add_u32_e32 v4, vcc, v4, v7
+; GFX8-NEXT:    v_add_u32_e32 v3, vcc, v15, v3
+; GFX8-NEXT:    v_addc_u32_e32 v4, vcc, v14, v4, vcc
 ; GFX8-NEXT:    v_mul_lo_u32 v7, s11, v3
 ; GFX8-NEXT:    v_mul_lo_u32 v8, s10, v4
-; GFX8-NEXT:    v_cndmask_b32_e32 v6, v0, v6, vcc
+; GFX8-NEXT:    v_cndmask_b32_e64 v6, v0, v6, s[0:1]
 ; GFX8-NEXT:    v_mul_hi_u32 v0, s10, v3
 ; GFX8-NEXT:    v_mul_hi_u32 v3, s11, v3
 ; GFX8-NEXT:    v_add_u32_e32 v7, vcc, v7, v8
@@ -1420,56 +1416,55 @@ define amdgpu_kernel void @udivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1
 ; GFX9-NEXT:    v_mov_b32_e32 v3, v6
 ; GFX9-NEXT:    v_mad_u64_u32 v[6:7], s[0:1], s2, v15, v[3:4]
 ; GFX9-NEXT:    v_cndmask_b32_e32 v13, v14, v19, vcc
-; GFX9-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v10
-; GFX9-NEXT:    v_mad_u64_u32 v[6:7], s[0:1], s3, v16, v[6:7]
-; GFX9-NEXT:    v_cndmask_b32_e32 v3, v8, v4, vcc
-; GFX9-NEXT:    v_cndmask_b32_e32 v4, v9, v13, vcc
-; GFX9-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v17
+; GFX9-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v10
+; GFX9-NEXT:    v_mad_u64_u32 v[6:7], s[2:3], s3, v16, v[6:7]
+; GFX9-NEXT:    v_cndmask_b32_e64 v3, v8, v4, s[0:1]
+; GFX9-NEXT:    v_cndmask_b32_e64 v4, v9, v13, s[0:1]
 ; GFX9-NEXT:    v_mul_lo_u32 v8, v15, v5
 ; GFX9-NEXT:    v_mul_lo_u32 v9, v16, v6
-; GFX9-NEXT:    v_cndmask_b32_e64 v7, v11, v20, s[0:1]
+; GFX9-NEXT:    v_cndmask_b32_e32 v7, v11, v20, vcc
 ; GFX9-NEXT:    v_mul_hi_u32 v11, v16, v5
-; GFX9-NEXT:    v_cndmask_b32_e64 v10, v12, v21, s[0:1]
-; GFX9-NEXT:    v_add_co_u32_e64 v8, s[0:1], v8, v9
-; GFX9-NEXT:    v_cndmask_b32_e64 v9, 0, 1, s[0:1]
-; GFX9-NEXT:    v_add_co_u32_e64 v8, s[0:1], v8, v11
-; GFX9-NEXT:    v_cndmask_b32_e64 v8, 0, 1, s[0:1]
+; GFX9-NEXT:    v_cndmask_b32_e32 v10, v12, v21, vcc
+; GFX9-NEXT:    v_add_co_u32_e32 v8, vcc, v8, v9
+; GFX9-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
+; GFX9-NEXT:    v_add_co_u32_e32 v8, vcc, v8, v11
+; GFX9-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
 ; GFX9-NEXT:    v_mul_lo_u32 v11, v15, v6
 ; GFX9-NEXT:    v_mul_hi_u32 v5, v15, v5
 ; GFX9-NEXT:    v_add_u32_e32 v8, v9, v8
 ; GFX9-NEXT:    v_mul_hi_u32 v9, v16, v6
 ; GFX9-NEXT:    v_mul_hi_u32 v6, v15, v6
-; GFX9-NEXT:    v_add_co_u32_e64 v5, s[0:1], v11, v5
-; GFX9-NEXT:    v_cndmask_b32_e64 v11, 0, 1, s[0:1]
-; GFX9-NEXT:    v_add_co_u32_e64 v5, s[0:1], v5, v9
-; GFX9-NEXT:    v_cndmask_b32_e64 v9, 0, 1, s[0:1]
-; GFX9-NEXT:    v_add_co_u32_e64 v5, s[0:1], v5, v8
+; GFX9-NEXT:    v_add_co_u32_e32 v5, vcc, v11, v5
+; GFX9-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
+; GFX9-NEXT:    v_add_co_u32_e32 v5, vcc, v5, v9
+; GFX9-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
+; GFX9-NEXT:    v_add_co_u32_e32 v5, vcc, v5, v8
 ; GFX9-NEXT:    v_add_u32_e32 v9, v11, v9
-; GFX9-NEXT:    v_cndmask_b32_e64 v8, 0, 1, s[0:1]
+; GFX9-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
 ; GFX9-NEXT:    v_add3_u32 v6, v9, v8, v6
-; GFX9-NEXT:    v_add_co_u32_e64 v5, s[0:1], v16, v5
-; GFX9-NEXT:    v_addc_co_u32_e64 v6, s[0:1], v15, v6, s[0:1]
+; GFX9-NEXT:    v_add_co_u32_e32 v5, vcc, v16, v5
+; GFX9-NEXT:    v_addc_co_u32_e32 v6, vcc, v15, v6, vcc
 ; GFX9-NEXT:    v_mul_lo_u32 v8, s11, v5
 ; GFX9-NEXT:    v_mul_lo_u32 v9, s10, v6
-; GFX9-NEXT:    v_cndmask_b32_e32 v7, v2, v7, vcc
+; GFX9-NEXT:    v_cndmask_b32_e64 v7, v2, v7, s[0:1]
 ; GFX9-NEXT:    v_mul_hi_u32 v2, s10, v5
 ; GFX9-NEXT:    v_mul_hi_u32 v5, s11, v5
-; GFX9-NEXT:    v_add_co_u32_e64 v8, s[0:1], v8, v9
-; GFX9-NEXT:    v_cndmask_b32_e64 v9, 0, 1, s[0:1]
-; GFX9-NEXT:    v_add_co_u32_e64 v2, s[0:1], v8, v2
-; GFX9-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s[0:1]
+; GFX9-NEXT:    v_add_co_u32_e32 v8, vcc, v8, v9
+; GFX9-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
+; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v8, v2
+; GFX9-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
 ; GFX9-NEXT:    v_mul_lo_u32 v8, s11, v6
 ; GFX9-NEXT:    v_add_u32_e32 v2, v9, v2
 ; GFX9-NEXT:    v_mul_hi_u32 v9, s10, v6
 ; GFX9-NEXT:    v_mul_hi_u32 v13, s11, v6
-; GFX9-NEXT:    v_add_co_u32_e64 v5, s[0:1], v8, v5
-; GFX9-NEXT:    v_cndmask_b32_e64 v11, 0, 1, s[0:1]
-; GFX9-NEXT:    v_add_co_u32_e64 v5, s[0:1], v5, v9
-; GFX9-NEXT:    v_cndmask_b32_e64 v9, 0, 1, s[0:1]
-; GFX9-NEXT:    v_add_co_u32_e64 v12, s[0:1], v5, v2
-; GFX9-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s[0:1]
-; GFX9-NEXT:    v_mad_u64_u32 v[5:6], s[0:1], s14, v12, 0
-; GFX9-NEXT:    v_cndmask_b32_e32 v8, v1, v10, vcc
+; GFX9-NEXT:    v_add_co_u32_e32 v5, vcc, v8, v5
+; GFX9-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
+; GFX9-NEXT:    v_add_co_u32_e32 v5, vcc, v5, v9
+; GFX9-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
+; GFX9-NEXT:    v_add_co_u32_e32 v12, vcc, v5, v2
+; GFX9-NEXT:    v_mad_u64_u32 v[5:6], s[2:3], s14, v12, 0
+; GFX9-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
+; GFX9-NEXT:    v_cndmask_b32_e64 v8, v1, v10, s[0:1]
 ; GFX9-NEXT:    v_add_u32_e32 v1, v11, v9
 ; GFX9-NEXT:    v_add3_u32 v9, v1, v2, v13
 ; GFX9-NEXT:    v_mov_b32_e32 v1, v6
@@ -1705,7 +1700,6 @@ define amdgpu_kernel void @udivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1
 ; GFX10-NEXT:    v_sub_co_u32 v8, vcc_lo, v9, s12
 ; GFX10-NEXT:    v_subrev_co_ci_u32_e64 v12, s0, 0, v0, vcc_lo
 ; GFX10-NEXT:    v_cmp_le_u32_e64 s0, s13, v11
-; GFX10-NEXT:    v_subrev_co_ci_u32_e32 v0, vcc_lo, s13, v0, vcc_lo
 ; GFX10-NEXT:    v_cndmask_b32_e64 v13, 0, -1, s0
 ; GFX10-NEXT:    v_cmp_le_u32_e64 s0, s12, v8
 ; GFX10-NEXT:    v_cndmask_b32_e64 v14, 0, -1, s0
@@ -1717,63 +1711,63 @@ define amdgpu_kernel void @udivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1
 ; GFX10-NEXT:    v_cndmask_b32_e64 v13, v13, v1, s0
 ; GFX10-NEXT:    v_cmp_eq_u32_e64 s0, s13, v12
 ; GFX10-NEXT:    v_cndmask_b32_e64 v14, v15, v14, s0
-; GFX10-NEXT:    v_add_co_u32 v6, s0, v2, v6
+; GFX10-NEXT:    v_add_co_u32 v2, s0, v2, v6
 ; GFX10-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s0
-; GFX10-NEXT:    v_add_co_u32 v15, s0, v16, 1
-; GFX10-NEXT:    v_add_co_ci_u32_e64 v18, s0, 0, v17, s0
+; GFX10-NEXT:    v_add_co_u32 v6, s0, v16, 1
+; GFX10-NEXT:    v_add_co_ci_u32_e64 v15, s0, 0, v17, s0
 ; GFX10-NEXT:    v_add3_u32 v3, v7, v1, v3
-; GFX10-NEXT:    v_mad_u64_u32 v[1:2], s0, s14, v6, 0
-; GFX10-NEXT:    v_mul_lo_u32 v19, s15, v6
+; GFX10-NEXT:    v_subrev_co_ci_u32_e32 v7, vcc_lo, s13, v0, vcc_lo
+; GFX10-NEXT:    v_mad_u64_u32 v[0:1], s0, s14, v2, 0
+; GFX10-NEXT:    v_mul_lo_u32 v18, s14, v3
+; GFX10-NEXT:    v_mul_lo_u32 v19, s15, v2
 ; GFX10-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v14
-; GFX10-NEXT:    v_mul_lo_u32 v7, s14, v3
-; GFX10-NEXT:    v_cndmask_b32_e32 v15, v16, v15, vcc_lo
-; GFX10-NEXT:    v_sub_co_u32 v16, s0, v8, s12
-; GFX10-NEXT:    v_subrev_co_ci_u32_e64 v20, s0, 0, v0, s0
-; GFX10-NEXT:    v_add3_u32 v2, v2, v7, v19
-; GFX10-NEXT:    v_sub_co_u32 v7, s0, s10, v1
-; GFX10-NEXT:    v_cndmask_b32_e32 v17, v17, v18, vcc_lo
-; GFX10-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v13
-; GFX10-NEXT:    v_sub_co_ci_u32_e64 v13, s1, s11, v2, s0
-; GFX10-NEXT:    v_sub_nc_u32_e32 v2, s11, v2
-; GFX10-NEXT:    v_cmp_ne_u32_e64 s1, 0, v14
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, v4, v15, vcc_lo
-; GFX10-NEXT:    v_cmp_le_u32_e64 s2, s15, v13
-; GFX10-NEXT:    v_cndmask_b32_e32 v1, v5, v17, vcc_lo
-; GFX10-NEXT:    v_subrev_co_ci_u32_e64 v2, s0, s15, v2, s0
-; GFX10-NEXT:    v_cmp_le_u32_e64 s0, s14, v7
-; GFX10-NEXT:    v_cndmask_b32_e64 v4, v8, v16, s1
-; GFX10-NEXT:    v_cndmask_b32_e64 v5, 0, -1, s2
-; GFX10-NEXT:    v_cndmask_b32_e64 v12, v12, v20, s1
-; GFX10-NEXT:    v_cmp_eq_u32_e64 s1, s15, v13
-; GFX10-NEXT:    v_cndmask_b32_e64 v8, 0, -1, s0
-; GFX10-NEXT:    v_sub_co_u32 v14, s0, v7, s14
-; GFX10-NEXT:    v_subrev_co_ci_u32_e64 v15, s2, 0, v2, s0
-; GFX10-NEXT:    v_cndmask_b32_e64 v5, v5, v8, s1
-; GFX10-NEXT:    v_cndmask_b32_e32 v4, v9, v4, vcc_lo
-; GFX10-NEXT:    v_subrev_co_ci_u32_e64 v2, s0, s15, v2, s0
-; GFX10-NEXT:    v_cmp_le_u32_e64 s1, s15, v15
-; GFX10-NEXT:    v_cndmask_b32_e64 v8, 0, -1, s1
-; GFX10-NEXT:    v_cmp_le_u32_e64 s1, s14, v14
-; GFX10-NEXT:    v_cndmask_b32_e64 v9, 0, -1, s1
-; GFX10-NEXT:    v_add_co_u32 v16, s1, v6, 1
-; GFX10-NEXT:    v_add_co_ci_u32_e64 v17, s1, 0, v3, s1
-; GFX10-NEXT:    v_cmp_eq_u32_e64 s1, s15, v15
-; GFX10-NEXT:    v_cndmask_b32_e64 v8, v8, v9, s1
-; GFX10-NEXT:    v_add_co_u32 v9, s1, v16, 1
-; GFX10-NEXT:    v_add_co_ci_u32_e64 v18, s1, 0, v17, s1
-; GFX10-NEXT:    v_cmp_ne_u32_e64 s0, 0, v8
-; GFX10-NEXT:    v_sub_co_u32 v8, s1, v14, s14
-; GFX10-NEXT:    v_subrev_co_ci_u32_e64 v2, s1, 0, v2, s1
-; GFX10-NEXT:    v_cndmask_b32_e64 v9, v16, v9, s0
-; GFX10-NEXT:    v_cndmask_b32_e64 v16, v17, v18, s0
+; GFX10-NEXT:    v_sub_co_u32 v14, s0, v8, s12
+; GFX10-NEXT:    v_subrev_co_ci_u32_e64 v7, s0, 0, v7, s0
+; GFX10-NEXT:    v_cndmask_b32_e32 v6, v16, v6, vcc_lo
+; GFX10-NEXT:    v_cmp_ne_u32_e64 s0, 0, v13
+; GFX10-NEXT:    v_add3_u32 v16, v1, v18, v19
+; GFX10-NEXT:    v_cndmask_b32_e32 v15, v17, v15, vcc_lo
+; GFX10-NEXT:    v_sub_co_u32 v13, s1, s10, v0
+; GFX10-NEXT:    v_cndmask_b32_e64 v0, v4, v6, s0
+; GFX10-NEXT:    v_sub_nc_u32_e32 v4, s11, v16
+; GFX10-NEXT:    v_sub_co_ci_u32_e64 v17, s2, s11, v16, s1
+; GFX10-NEXT:    v_cndmask_b32_e64 v1, v5, v15, s0
+; GFX10-NEXT:    v_cndmask_b32_e32 v5, v8, v14, vcc_lo
+; GFX10-NEXT:    v_subrev_co_ci_u32_e64 v8, s1, s15, v4, s1
+; GFX10-NEXT:    v_cmp_le_u32_e64 s1, s14, v13
+; GFX10-NEXT:    v_cmp_le_u32_e64 s2, s15, v17
+; GFX10-NEXT:    v_cndmask_b32_e32 v7, v12, v7, vcc_lo
+; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, s15, v17
+; GFX10-NEXT:    v_cndmask_b32_e64 v4, v9, v5, s0
+; GFX10-NEXT:    v_cndmask_b32_e64 v14, 0, -1, s1
+; GFX10-NEXT:    v_sub_co_u32 v15, s1, v13, s14
+; GFX10-NEXT:    v_cndmask_b32_e64 v6, 0, -1, s2
+; GFX10-NEXT:    v_subrev_co_ci_u32_e64 v16, s2, 0, v8, s1
+; GFX10-NEXT:    v_cndmask_b32_e32 v5, v6, v14, vcc_lo
+; GFX10-NEXT:    v_cmp_le_u32_e32 vcc_lo, s15, v16
+; GFX10-NEXT:    v_cndmask_b32_e64 v6, 0, -1, vcc_lo
+; GFX10-NEXT:    v_cmp_le_u32_e32 vcc_lo, s14, v15
+; GFX10-NEXT:    v_cndmask_b32_e64 v9, 0, -1, vcc_lo
+; GFX10-NEXT:    v_add_co_u32 v12, vcc_lo, v2, 1
+; GFX10-NEXT:    v_add_co_ci_u32_e32 v14, vcc_lo, 0, v3, vcc_lo
+; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, s15, v16
+; GFX10-NEXT:    v_cndmask_b32_e32 v6, v6, v9, vcc_lo
+; GFX10-NEXT:    v_add_co_u32 v9, vcc_lo, v12, 1
+; GFX10-NEXT:    v_add_co_ci_u32_e32 v18, vcc_lo, 0, v14, vcc_lo
+; GFX10-NEXT:    v_subrev_co_ci_u32_e64 v8, vcc_lo, s15, v8, s1
+; GFX10-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v6
+; GFX10-NEXT:    v_sub_co_u32 v6, s1, v15, s14
+; GFX10-NEXT:    v_subrev_co_ci_u32_e64 v8, s1, 0, v8, s1
+; GFX10-NEXT:    v_cndmask_b32_e32 v9, v12, v9, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v12, v14, v18, vcc_lo
 ; GFX10-NEXT:    v_cmp_ne_u32_e64 s1, 0, v5
-; GFX10-NEXT:    v_cndmask_b32_e64 v8, v14, v8, s0
-; GFX10-NEXT:    v_cndmask_b32_e64 v14, v15, v2, s0
-; GFX10-NEXT:    v_cndmask_b32_e32 v5, v11, v12, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e64 v2, v6, v9, s1
-; GFX10-NEXT:    v_cndmask_b32_e64 v3, v3, v16, s1
-; GFX10-NEXT:    v_cndmask_b32_e64 v6, v7, v8, s1
-; GFX10-NEXT:    v_cndmask_b32_e64 v7, v13, v14, s1
+; GFX10-NEXT:    v_cndmask_b32_e32 v6, v15, v6, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v8, v16, v8, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e64 v5, v11, v7, s0
+; GFX10-NEXT:    v_cndmask_b32_e64 v2, v2, v9, s1
+; GFX10-NEXT:    v_cndmask_b32_e64 v3, v3, v12, s1
+; GFX10-NEXT:    v_cndmask_b32_e64 v6, v13, v6, s1
+; GFX10-NEXT:    v_cndmask_b32_e64 v7, v17, v8, s1
 ; GFX10-NEXT:    global_store_dwordx4 v10, v[0:3], s[4:5]
 ; GFX10-NEXT:    global_store_dwordx4 v10, v[4:7], s[6:7]
 ; GFX10-NEXT:    s_endpgm
@@ -2035,17 +2029,17 @@ define amdgpu_kernel void @udivrem_v2i8(ptr addrspace(1) %out0, ptr addrspace(1)
 ; GFX10-NEXT:    v_cvt_f32_ubyte0_e32 v1, s1
 ; GFX10-NEXT:    s_sub_i32 s3, 0, s2
 ; GFX10-NEXT:    v_rcp_iflag_f32_e32 v0, v0
-; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x0
+; GFX10-NEXT:    s_sub_i32 s6, 0, s1
 ; GFX10-NEXT:    v_rcp_iflag_f32_e32 v1, v1
 ; GFX10-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
 ; GFX10-NEXT:    v_mul_f32_e32 v1, 0x4f7ffffe, v1
 ; GFX10-NEXT:    v_cvt_u32_f32_e32 v0, v0
 ; GFX10-NEXT:    v_cvt_u32_f32_e32 v1, v1
 ; GFX10-NEXT:    v_mul_lo_u32 v2, s3, v0
-; GFX10-NEXT:    s_sub_i32 s3, 0, s1
-; GFX10-NEXT:    v_mul_lo_u32 v3, s3, v1
 ; GFX10-NEXT:    s_bfe_u32 s3, s0, 0x80008
 ; GFX10-NEXT:    s_and_b32 s0, s0, 0xff
+; GFX10-NEXT:    v_mul_lo_u32 v3, s6, v1
+; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x0
 ; GFX10-NEXT:    v_mul_hi_u32 v2, v0, v2
 ; GFX10-NEXT:    v_mul_hi_u32 v3, v1, v3
 ; GFX10-NEXT:    v_add_nc_u32_e32 v0, v0, v2
@@ -2055,30 +2049,30 @@ define amdgpu_kernel void @udivrem_v2i8(ptr addrspace(1) %out0, ptr addrspace(1)
 ; GFX10-NEXT:    v_mul_lo_u32 v2, v0, s2
 ; GFX10-NEXT:    v_add_nc_u32_e32 v4, 1, v0
 ; GFX10-NEXT:    v_mul_lo_u32 v3, v1, s1
-; GFX10-NEXT:    v_add_nc_u32_e32 v6, 1, v1
+; GFX10-NEXT:    v_add_nc_u32_e32 v5, 1, v1
 ; GFX10-NEXT:    v_sub_nc_u32_e32 v2, s3, v2
 ; GFX10-NEXT:    v_sub_nc_u32_e32 v3, s0, v3
-; GFX10-NEXT:    v_subrev_nc_u32_e32 v5, s2, v2
+; GFX10-NEXT:    v_subrev_nc_u32_e32 v6, s2, v2
 ; GFX10-NEXT:    v_cmp_le_u32_e32 vcc_lo, s2, v2
 ; GFX10-NEXT:    v_cmp_le_u32_e64 s0, s1, v3
 ; GFX10-NEXT:    v_subrev_nc_u32_e32 v7, s1, v3
 ; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e32 v2, v2, v5, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e64 v1, v1, v6, s0
+; GFX10-NEXT:    v_cndmask_b32_e32 v2, v2, v6, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e64 v1, v1, v5, s0
 ; GFX10-NEXT:    v_cndmask_b32_e64 v3, v3, v7, s0
 ; GFX10-NEXT:    v_add_nc_u32_e32 v4, 1, v0
 ; GFX10-NEXT:    v_cmp_le_u32_e32 vcc_lo, s2, v2
-; GFX10-NEXT:    v_subrev_nc_u32_e32 v5, s2, v2
-; GFX10-NEXT:    v_add_nc_u32_e32 v6, 1, v1
+; GFX10-NEXT:    v_subrev_nc_u32_e32 v6, s2, v2
+; GFX10-NEXT:    v_add_nc_u32_e32 v5, 1, v1
 ; GFX10-NEXT:    v_cmp_le_u32_e64 s0, s1, v3
 ; GFX10-NEXT:    v_subrev_nc_u32_e32 v7, s1, v3
 ; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e32 v2, v2, v5, vcc_lo
-; GFX10-NEXT:    s_movk_i32 s1, 0xff
-; GFX10-NEXT:    v_cndmask_b32_e64 v1, v1, v6, s0
+; GFX10-NEXT:    v_mov_b32_e32 v4, 0xff
+; GFX10-NEXT:    v_cndmask_b32_e32 v2, v2, v6, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e64 v1, v1, v5, s0
 ; GFX10-NEXT:    v_cndmask_b32_e64 v3, v3, v7, s0
-; GFX10-NEXT:    v_and_b32_sdwa v0, v0, s1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX10-NEXT:    v_and_b32_sdwa v2, v2, s1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX10-NEXT:    v_and_b32_sdwa v0, v0, v4 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX10-NEXT:    v_and_b32_sdwa v2, v2, v4 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; GFX10-NEXT:    v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-NEXT:    v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i32.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i32.ll
index 48f05a33f03649c..158403644607abe 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i32.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i32.ll
@@ -211,22 +211,22 @@ define i32 @v_urem_i32_oddk_denom(i32 %num) {
 ; CHECK-LABEL: v_urem_i32_oddk_denom:
 ; CHECK:       ; %bb.0:
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_mov_b32 s4, 0x12d8fb
 ; CHECK-NEXT:    v_rcp_iflag_f32_e32 v1, 0x4996c7d8
 ; CHECK-NEXT:    v_mov_b32_e32 v2, 0xffed2705
+; CHECK-NEXT:    v_mov_b32_e32 v3, 0x12d8fb
 ; CHECK-NEXT:    v_mul_f32_e32 v1, 0x4f7ffffe, v1
 ; CHECK-NEXT:    v_cvt_u32_f32_e32 v1, v1
 ; CHECK-NEXT:    v_mul_lo_u32 v2, v1, v2
 ; CHECK-NEXT:    v_mul_hi_u32 v2, v1, v2
 ; CHECK-NEXT:    v_add_i32_e32 v1, vcc, v1, v2
 ; CHECK-NEXT:    v_mul_hi_u32 v1, v0, v1
-; CHECK-NEXT:    v_mul_lo_u32 v1, v1, s4
+; CHECK-NEXT:    v_mul_lo_u32 v1, v1, v3
 ; CHECK-NEXT:    v_sub_i32_e32 v0, vcc, v0, v1
-; CHECK-NEXT:    v_subrev_i32_e32 v1, vcc, s4, v0
-; CHECK-NEXT:    v_cmp_le_u32_e32 vcc, s4, v0
+; CHECK-NEXT:    v_subrev_i32_e32 v1, vcc, 0x12d8fb, v0
+; CHECK-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v3
 ; CHECK-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
-; CHECK-NEXT:    v_subrev_i32_e32 v1, vcc, s4, v0
-; CHECK-NEXT:    v_cmp_le_u32_e32 vcc, s4, v0
+; CHECK-NEXT:    v_subrev_i32_e32 v1, vcc, 0x12d8fb, v0
+; CHECK-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v3
 ; CHECK-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
   %result = urem i32 %num, 1235195
@@ -237,7 +237,6 @@ define <2 x i32> @v_urem_v2i32_oddk_denom(<2 x i32> %num) {
 ; GISEL-LABEL: v_urem_v2i32_oddk_denom:
 ; GISEL:       ; %bb.0:
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT:    s_mov_b32 s4, 0x12d8fb
 ; GISEL-NEXT:    v_mov_b32_e32 v2, 0x12d8fb
 ; GISEL-NEXT:    v_cvt_f32_u32_e32 v3, 0x12d8fb
 ; GISEL-NEXT:    v_mov_b32_e32 v4, 0xffed2705
@@ -249,19 +248,19 @@ define <2 x i32> @v_urem_v2i32_oddk_denom(<2 x i32> %num) {
 ; GISEL-NEXT:    v_add_i32_e32 v3, vcc, v3, v4
 ; GISEL-NEXT:    v_mul_hi_u32 v4, v0, v3
 ; GISEL-NEXT:    v_mul_hi_u32 v3, v1, v3
-; GISEL-NEXT:    v_mul_lo_u32 v4, v4, s4
+; GISEL-NEXT:    v_mul_lo_u32 v4, v4, v2
 ; GISEL-NEXT:    v_mul_lo_u32 v3, v3, v2
 ; GISEL-NEXT:    v_sub_i32_e32 v0, vcc, v0, v4
 ; GISEL-NEXT:    v_sub_i32_e32 v1, vcc, v1, v3
-; GISEL-NEXT:    v_subrev_i32_e32 v3, vcc, s4, v0
+; GISEL-NEXT:    v_sub_i32_e32 v3, vcc, v0, v2
 ; GISEL-NEXT:    v_subrev_i32_e32 v4, vcc, 0x12d8fb, v1
-; GISEL-NEXT:    v_cmp_le_u32_e32 vcc, s4, v0
+; GISEL-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v2
 ; GISEL-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
 ; GISEL-NEXT:    v_cmp_ge_u32_e32 vcc, v1, v2
 ; GISEL-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
-; GISEL-NEXT:    v_subrev_i32_e32 v3, vcc, s4, v0
+; GISEL-NEXT:    v_sub_i32_e32 v3, vcc, v0, v2
 ; GISEL-NEXT:    v_subrev_i32_e32 v4, vcc, 0x12d8fb, v1
-; GISEL-NEXT:    v_cmp_le_u32_e32 vcc, s4, v0
+; GISEL-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v2
 ; GISEL-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
 ; GISEL-NEXT:    v_cmp_ge_u32_e32 vcc, v1, v2
 ; GISEL-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
@@ -270,31 +269,31 @@ define <2 x i32> @v_urem_v2i32_oddk_denom(<2 x i32> %num) {
 ; CGP-LABEL: v_urem_v2i32_oddk_denom:
 ; CGP:       ; %bb.0:
 ; CGP-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CGP-NEXT:    s_mov_b32 s4, 0x12d8fb
 ; CGP-NEXT:    v_rcp_iflag_f32_e32 v2, 0x4996c7d8
-; CGP-NEXT:    s_mov_b32 s5, 0xffed2705
+; CGP-NEXT:    v_mov_b32_e32 v3, 0xffed2705
+; CGP-NEXT:    v_mov_b32_e32 v4, 0x12d8fb
 ; CGP-NEXT:    v_mul_f32_e32 v2, 0x4f7ffffe, v2
 ; CGP-NEXT:    v_cvt_u32_f32_e32 v2, v2
-; CGP-NEXT:    v_mul_lo_u32 v3, v2, s5
+; CGP-NEXT:    v_mul_lo_u32 v3, v2, v3
 ; CGP-NEXT:    v_mul_hi_u32 v3, v2, v3
 ; CGP-NEXT:    v_add_i32_e32 v2, vcc, v2, v3
 ; CGP-NEXT:    v_mul_hi_u32 v3, v0, v2
 ; CGP-NEXT:    v_mul_hi_u32 v2, v1, v2
-; CGP-NEXT:    v_mul_lo_u32 v3, v3, s4
-; CGP-NEXT:    v_mul_lo_u32 v2, v2, s4
+; CGP-NEXT:    v_mul_lo_u32 v3, v3, v4
+; CGP-NEXT:    v_mul_lo_u32 v2, v2, v4
 ; CGP-NEXT:    v_sub_i32_e32 v0, vcc, v0, v3
 ; CGP-NEXT:    v_sub_i32_e32 v1, vcc, v1, v2
-; CGP-NEXT:    v_subrev_i32_e32 v2, vcc, s4, v0
-; CGP-NEXT:    v_subrev_i32_e32 v3, vcc, s4, v1
-; CGP-NEXT:    v_cmp_le_u32_e32 vcc, s4, v0
+; CGP-NEXT:    v_subrev_i32_e32 v2, vcc, 0x12d8fb, v0
+; CGP-NEXT:    v_subrev_i32_e32 v3, vcc, 0x12d8fb, v1
+; CGP-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v4
 ; CGP-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
-; CGP-NEXT:    v_cmp_le_u32_e32 vcc, s4, v1
+; CGP-NEXT:    v_cmp_ge_u32_e32 vcc, v1, v4
 ; CGP-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
-; CGP-NEXT:    v_subrev_i32_e32 v2, vcc, s4, v0
+; CGP-NEXT:    v_sub_i32_e32 v2, vcc, v0, v4
 ; CGP-NEXT:    v_subrev_i32_e32 v3, vcc, 0x12d8fb, v1
-; CGP-NEXT:    v_cmp_le_u32_e32 vcc, s4, v0
+; CGP-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v4
 ; CGP-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
-; CGP-NEXT:    v_cmp_le_u32_e32 vcc, s4, v1
+; CGP-NEXT:    v_cmp_ge_u32_e32 vcc, v1, v4
 ; CGP-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
 ; CGP-NEXT:    s_setpc_b64 s[30:31]
   %result = urem <2 x i32> %num, <i32 1235195, i32 1235195>
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll
index ecf7cc921886c08..12df4b7c7fc33d7 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll
@@ -969,11 +969,10 @@ define i64 @v_urem_i64_oddk_denom(i64 %num) {
 ; CHECK-LABEL: v_urem_i64_oddk_denom:
 ; CHECK:       ; %bb.0:
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT:    s_mov_b32 s4, 0x12d8fb
 ; CHECK-NEXT:    v_mov_b32_e32 v2, 0x12d8fb
 ; CHECK-NEXT:    v_cvt_f32_u32_e32 v3, 0x12d8fb
 ; CHECK-NEXT:    v_cvt_f32_ubyte0_e32 v4, 0
-; CHECK-NEXT:    s_mov_b32 s5, 0xffed2705
+; CHECK-NEXT:    v_mov_b32_e32 v5, 0xffed2705
 ; CHECK-NEXT:    v_mac_f32_e32 v3, 0x4f800000, v4
 ; CHECK-NEXT:    v_rcp_iflag_f32_e32 v3, v3
 ; CHECK-NEXT:    v_mul_f32_e32 v3, 0x5f7ffffc, v3
@@ -982,62 +981,62 @@ define i64 @v_urem_i64_oddk_denom(i64 %num) {
 ; CHECK-NEXT:    v_mac_f32_e32 v3, 0xcf800000, v4
 ; CHECK-NEXT:    v_cvt_u32_f32_e32 v4, v4
 ; CHECK-NEXT:    v_cvt_u32_f32_e32 v3, v3
-; CHECK-NEXT:    v_mul_lo_u32 v5, v4, s5
-; CHECK-NEXT:    v_mul_lo_u32 v6, v3, s5
-; CHECK-NEXT:    v_mul_hi_u32 v7, s5, v3
-; CHECK-NEXT:    v_sub_i32_e32 v5, vcc, v5, v3
-; CHECK-NEXT:    v_add_i32_e32 v5, vcc, v5, v7
-; CHECK-NEXT:    v_mul_lo_u32 v7, v4, v6
-; CHECK-NEXT:    v_mul_hi_u32 v8, v3, v6
+; CHECK-NEXT:    v_mul_lo_u32 v6, v4, v5
+; CHECK-NEXT:    v_mul_lo_u32 v7, v3, v5
+; CHECK-NEXT:    v_mul_hi_u32 v8, v5, v3
+; CHECK-NEXT:    v_sub_i32_e32 v6, vcc, v6, v3
+; CHECK-NEXT:    v_add_i32_e32 v6, vcc, v6, v8
+; CHECK-NEXT:    v_mul_lo_u32 v8, v4, v7
+; CHECK-NEXT:    v_mul_hi_u32 v9, v3, v7
+; CHECK-NEXT:    v_mul_hi_u32 v7, v4, v7
+; CHECK-NEXT:    v_mul_lo_u32 v10, v3, v6
+; CHECK-NEXT:    v_mul_lo_u32 v11, v4, v6
+; CHECK-NEXT:    v_mul_hi_u32 v12, v3, v6
 ; CHECK-NEXT:    v_mul_hi_u32 v6, v4, v6
-; CHECK-NEXT:    v_mul_lo_u32 v9, v3, v5
-; CHECK-NEXT:    v_mul_lo_u32 v10, v4, v5
-; CHECK-NEXT:    v_mul_hi_u32 v11, v3, v5
-; CHECK-NEXT:    v_mul_hi_u32 v5, v4, v5
-; CHECK-NEXT:    v_add_i32_e32 v7, vcc, v7, v9
-; CHECK-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
-; CHECK-NEXT:    v_add_i32_e32 v6, vcc, v10, v6
+; CHECK-NEXT:    v_add_i32_e32 v8, vcc, v8, v10
 ; CHECK-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
-; CHECK-NEXT:    v_add_i32_e32 v7, vcc, v7, v8
-; CHECK-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
-; CHECK-NEXT:    v_add_i32_e32 v6, vcc, v6, v11
+; CHECK-NEXT:    v_add_i32_e32 v7, vcc, v11, v7
+; CHECK-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
+; CHECK-NEXT:    v_add_i32_e32 v8, vcc, v8, v9
 ; CHECK-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
-; CHECK-NEXT:    v_add_i32_e32 v7, vcc, v9, v7
+; CHECK-NEXT:    v_add_i32_e32 v7, vcc, v7, v12
+; CHECK-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
 ; CHECK-NEXT:    v_add_i32_e32 v8, vcc, v10, v8
-; CHECK-NEXT:    v_add_i32_e32 v6, vcc, v6, v7
-; CHECK-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
-; CHECK-NEXT:    v_add_i32_e32 v7, vcc, v8, v7
+; CHECK-NEXT:    v_add_i32_e32 v9, vcc, v11, v9
+; CHECK-NEXT:    v_add_i32_e32 v7, vcc, v7, v8
+; CHECK-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
+; CHECK-NEXT:    v_add_i32_e32 v8, vcc, v9, v8
+; CHECK-NEXT:    v_add_i32_e32 v6, vcc, v6, v8
+; CHECK-NEXT:    v_add_i32_e32 v3, vcc, v3, v7
+; CHECK-NEXT:    v_addc_u32_e32 v4, vcc, v4, v6, vcc
+; CHECK-NEXT:    v_mul_lo_u32 v6, v3, v5
+; CHECK-NEXT:    v_mul_hi_u32 v7, v5, v3
+; CHECK-NEXT:    v_mul_lo_u32 v5, v4, v5
+; CHECK-NEXT:    v_mul_lo_u32 v8, v4, v6
+; CHECK-NEXT:    v_mul_hi_u32 v9, v3, v6
+; CHECK-NEXT:    v_mul_hi_u32 v6, v4, v6
+; CHECK-NEXT:    v_sub_i32_e32 v5, vcc, v5, v3
 ; CHECK-NEXT:    v_add_i32_e32 v5, vcc, v5, v7
-; CHECK-NEXT:    v_add_i32_e32 v3, vcc, v3, v6
-; CHECK-NEXT:    v_addc_u32_e32 v4, vcc, v4, v5, vcc
-; CHECK-NEXT:    v_mul_lo_u32 v5, v3, s5
-; CHECK-NEXT:    v_mul_hi_u32 v6, s5, v3
-; CHECK-NEXT:    v_mul_lo_u32 v7, v4, s5
-; CHECK-NEXT:    v_mul_lo_u32 v8, v4, v5
-; CHECK-NEXT:    v_mul_hi_u32 v9, v3, v5
+; CHECK-NEXT:    v_mul_lo_u32 v7, v3, v5
+; CHECK-NEXT:    v_mul_lo_u32 v10, v4, v5
+; CHECK-NEXT:    v_mul_hi_u32 v11, v3, v5
 ; CHECK-NEXT:    v_mul_hi_u32 v5, v4, v5
-; CHECK-NEXT:    v_sub_i32_e32 v7, vcc, v7, v3
-; CHECK-NEXT:    v_add_i32_e32 v6, vcc, v7, v6
-; CHECK-NEXT:    v_mul_lo_u32 v7, v3, v6
-; CHECK-NEXT:    v_mul_lo_u32 v10, v4, v6
-; CHECK-NEXT:    v_mul_hi_u32 v11, v3, v6
-; CHECK-NEXT:    v_mul_hi_u32 v6, v4, v6
 ; CHECK-NEXT:    v_add_i32_e32 v7, vcc, v8, v7
 ; CHECK-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
-; CHECK-NEXT:    v_add_i32_e32 v5, vcc, v10, v5
+; CHECK-NEXT:    v_add_i32_e32 v6, vcc, v10, v6
 ; CHECK-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
 ; CHECK-NEXT:    v_add_i32_e32 v7, vcc, v7, v9
 ; CHECK-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
-; CHECK-NEXT:    v_add_i32_e32 v5, vcc, v5, v11
+; CHECK-NEXT:    v_add_i32_e32 v6, vcc, v6, v11
 ; CHECK-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
 ; CHECK-NEXT:    v_add_i32_e32 v7, vcc, v8, v7
 ; CHECK-NEXT:    v_add_i32_e32 v8, vcc, v10, v9
-; CHECK-NEXT:    v_add_i32_e32 v5, vcc, v5, v7
+; CHECK-NEXT:    v_add_i32_e32 v6, vcc, v6, v7
 ; CHECK-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
 ; CHECK-NEXT:    v_add_i32_e32 v7, vcc, v8, v7
-; CHECK-NEXT:    v_add_i32_e32 v6, vcc, v6, v7
-; CHECK-NEXT:    v_add_i32_e32 v3, vcc, v3, v5
-; CHECK-NEXT:    v_addc_u32_e32 v4, vcc, v4, v6, vcc
+; CHECK-NEXT:    v_add_i32_e32 v5, vcc, v5, v7
+; CHECK-NEXT:    v_add_i32_e32 v3, vcc, v3, v6
+; CHECK-NEXT:    v_addc_u32_e32 v4, vcc, v4, v5, vcc
 ; CHECK-NEXT:    v_mul_lo_u32 v5, v1, v3
 ; CHECK-NEXT:    v_mul_hi_u32 v6, v0, v3
 ; CHECK-NEXT:    v_mul_hi_u32 v3, v1, v3
@@ -1058,10 +1057,10 @@ define i64 @v_urem_i64_oddk_denom(i64 %num) {
 ; CHECK-NEXT:    v_add_i32_e32 v3, vcc, v3, v5
 ; CHECK-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
 ; CHECK-NEXT:    v_add_i32_e32 v5, vcc, v6, v5
-; CHECK-NEXT:    v_mul_lo_u32 v6, v3, s4
-; CHECK-NEXT:    v_mul_hi_u32 v3, s4, v3
+; CHECK-NEXT:    v_mul_lo_u32 v6, v3, v2
+; CHECK-NEXT:    v_mul_hi_u32 v3, v2, v3
 ; CHECK-NEXT:    v_add_i32_e32 v4, vcc, v4, v5
-; CHECK-NEXT:    v_mul_lo_u32 v4, v4, s4
+; CHECK-NEXT:    v_mul_lo_u32 v4, v4, v2
 ; CHECK-NEXT:    v_add_i32_e32 v3, vcc, v4, v3
 ; CHECK-NEXT:    v_sub_i32_e64 v0, s[4:5], v0, v6
 ; CHECK-NEXT:    v_subb_u32_e64 v4, vcc, v1, v3, s[4:5]
@@ -1095,209 +1094,200 @@ define <2 x i64> @v_urem_v2i64_oddk_denom(<2 x i64> %num) {
 ; GISEL-LABEL: v_urem_v2i64_oddk_denom:
 ; GISEL:       ; %bb.0:
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT:    s_mov_b32 s4, 0x12d8fb
 ; GISEL-NEXT:    v_mov_b32_e32 v4, 0x12d8fb
 ; GISEL-NEXT:    v_cvt_f32_u32_e32 v5, 0x12d8fb
 ; GISEL-NEXT:    v_cvt_f32_ubyte0_e32 v6, 0
-; GISEL-NEXT:    s_sub_u32 s5, 0, 0x12d8fb
-; GISEL-NEXT:    v_madmk_f32 v7, v6, 0x4f800000, v5
-; GISEL-NEXT:    s_subb_u32 s6, 0, 0
+; GISEL-NEXT:    s_sub_u32 s4, 0, 0x12d8fb
 ; GISEL-NEXT:    v_mac_f32_e32 v5, 0x4f800000, v6
-; GISEL-NEXT:    v_rcp_iflag_f32_e32 v6, v7
+; GISEL-NEXT:    s_subb_u32 s5, 0, 0
 ; GISEL-NEXT:    v_rcp_iflag_f32_e32 v5, v5
-; GISEL-NEXT:    s_sub_u32 s7, 0, 0x12d8fb
-; GISEL-NEXT:    v_mul_f32_e32 v6, 0x5f7ffffc, v6
+; GISEL-NEXT:    s_sub_u32 s6, 0, 0x12d8fb
 ; GISEL-NEXT:    v_mul_f32_e32 v5, 0x5f7ffffc, v5
-; GISEL-NEXT:    s_subb_u32 s8, 0, 0
-; GISEL-NEXT:    v_mul_f32_e32 v7, 0x2f800000, v6
-; GISEL-NEXT:    v_mul_f32_e32 v8, 0x2f800000, v5
-; GISEL-NEXT:    v_trunc_f32_e32 v7, v7
-; GISEL-NEXT:    v_trunc_f32_e32 v8, v8
-; GISEL-NEXT:    v_mac_f32_e32 v6, 0xcf800000, v7
-; GISEL-NEXT:    v_cvt_u32_f32_e32 v7, v7
-; GISEL-NEXT:    v_mac_f32_e32 v5, 0xcf800000, v8
-; GISEL-NEXT:    v_cvt_u32_f32_e32 v8, v8
+; GISEL-NEXT:    s_subb_u32 s7, 0, 0
+; GISEL-NEXT:    v_mul_f32_e32 v6, 0x2f800000, v5
+; GISEL-NEXT:    v_trunc_f32_e32 v6, v6
+; GISEL-NEXT:    v_mac_f32_e32 v5, 0xcf800000, v6
 ; GISEL-NEXT:    v_cvt_u32_f32_e32 v6, v6
-; GISEL-NEXT:    v_mul_lo_u32 v9, s5, v7
 ; GISEL-NEXT:    v_cvt_u32_f32_e32 v5, v5
-; GISEL-NEXT:    v_mul_lo_u32 v10, s7, v8
-; GISEL-NEXT:    v_mul_lo_u32 v11, s5, v6
-; GISEL-NEXT:    v_mul_lo_u32 v12, s6, v6
-; GISEL-NEXT:    v_mul_hi_u32 v13, s5, v6
-; GISEL-NEXT:    v_mul_lo_u32 v14, s7, v5
-; GISEL-NEXT:    v_mul_lo_u32 v15, s8, v5
-; GISEL-NEXT:    v_mul_hi_u32 v16, s7, v5
-; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v12, v9
-; GISEL-NEXT:    v_mul_lo_u32 v12, v7, v11
-; GISEL-NEXT:    v_mul_hi_u32 v17, v6, v11
-; GISEL-NEXT:    v_mul_hi_u32 v11, v7, v11
-; GISEL-NEXT:    v_add_i32_e32 v10, vcc, v15, v10
-; GISEL-NEXT:    v_mul_lo_u32 v15, v8, v14
-; GISEL-NEXT:    v_mul_hi_u32 v18, v5, v14
-; GISEL-NEXT:    v_mul_hi_u32 v14, v8, v14
-; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v9, v13
-; GISEL-NEXT:    v_add_i32_e32 v10, vcc, v10, v16
-; GISEL-NEXT:    v_mul_lo_u32 v13, v6, v9
-; GISEL-NEXT:    v_mul_lo_u32 v16, v7, v9
-; GISEL-NEXT:    v_mul_hi_u32 v19, v6, v9
-; GISEL-NEXT:    v_mul_hi_u32 v9, v7, v9
-; GISEL-NEXT:    v_mul_lo_u32 v20, v5, v10
-; GISEL-NEXT:    v_mul_lo_u32 v21, v8, v10
-; GISEL-NEXT:    v_mul_hi_u32 v22, v5, v10
-; GISEL-NEXT:    v_mul_hi_u32 v10, v8, v10
-; GISEL-NEXT:    v_add_i32_e32 v12, vcc, v12, v13
-; GISEL-NEXT:    v_cndmask_b32_e64 v13, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v11, vcc, v16, v11
-; GISEL-NEXT:    v_cndmask_b32_e64 v16, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v15, vcc, v15, v20
-; GISEL-NEXT:    v_cndmask_b32_e64 v20, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v14, vcc, v21, v14
-; GISEL-NEXT:    v_cndmask_b32_e64 v21, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v12, vcc, v12, v17
-; GISEL-NEXT:    v_cndmask_b32_e64 v12, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v11, vcc, v11, v19
-; GISEL-NEXT:    v_cndmask_b32_e64 v17, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v15, vcc, v15, v18
-; GISEL-NEXT:    v_cndmask_b32_e64 v15, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v14, vcc, v14, v22
+; GISEL-NEXT:    v_mul_lo_u32 v7, s4, v6
+; GISEL-NEXT:    v_mul_lo_u32 v8, s6, v6
+; GISEL-NEXT:    v_mul_lo_u32 v9, s4, v5
+; GISEL-NEXT:    v_mul_lo_u32 v10, s5, v5
+; GISEL-NEXT:    v_mul_hi_u32 v11, s4, v5
+; GISEL-NEXT:    v_mul_lo_u32 v12, s6, v5
+; GISEL-NEXT:    v_mul_lo_u32 v13, s7, v5
+; GISEL-NEXT:    v_mul_hi_u32 v14, s6, v5
+; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v10, v7
+; GISEL-NEXT:    v_mul_lo_u32 v10, v6, v9
+; GISEL-NEXT:    v_mul_hi_u32 v15, v5, v9
+; GISEL-NEXT:    v_mul_hi_u32 v9, v6, v9
+; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v13, v8
+; GISEL-NEXT:    v_mul_lo_u32 v13, v6, v12
+; GISEL-NEXT:    v_mul_hi_u32 v16, v5, v12
+; GISEL-NEXT:    v_mul_hi_u32 v12, v6, v12
+; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v7, v11
+; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v8, v14
+; GISEL-NEXT:    v_mul_lo_u32 v11, v5, v7
+; GISEL-NEXT:    v_mul_lo_u32 v14, v6, v7
+; GISEL-NEXT:    v_mul_hi_u32 v17, v5, v7
+; GISEL-NEXT:    v_mul_hi_u32 v7, v6, v7
+; GISEL-NEXT:    v_mul_lo_u32 v18, v5, v8
+; GISEL-NEXT:    v_mul_lo_u32 v19, v6, v8
+; GISEL-NEXT:    v_mul_hi_u32 v20, v5, v8
+; GISEL-NEXT:    v_mul_hi_u32 v8, v6, v8
+; GISEL-NEXT:    v_add_i32_e32 v10, vcc, v10, v11
+; GISEL-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v14, v9
+; GISEL-NEXT:    v_cndmask_b32_e64 v14, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v13, vcc, v13, v18
 ; GISEL-NEXT:    v_cndmask_b32_e64 v18, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v12, vcc, v13, v12
-; GISEL-NEXT:    v_add_i32_e32 v13, vcc, v16, v17
-; GISEL-NEXT:    v_add_i32_e32 v15, vcc, v20, v15
-; GISEL-NEXT:    v_add_i32_e32 v16, vcc, v21, v18
-; GISEL-NEXT:    v_add_i32_e32 v11, vcc, v11, v12
-; GISEL-NEXT:    v_cndmask_b32_e64 v12, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v14, vcc, v14, v15
+; GISEL-NEXT:    v_add_i32_e32 v12, vcc, v19, v12
+; GISEL-NEXT:    v_cndmask_b32_e64 v19, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v10, vcc, v10, v15
+; GISEL-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v9, v17
 ; GISEL-NEXT:    v_cndmask_b32_e64 v15, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v12, vcc, v13, v12
-; GISEL-NEXT:    v_add_i32_e32 v13, vcc, v16, v15
-; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v9, v12
-; GISEL-NEXT:    v_add_i32_e32 v10, vcc, v10, v13
-; GISEL-NEXT:    v_add_i32_e32 v6, vcc, v6, v11
-; GISEL-NEXT:    v_addc_u32_e32 v7, vcc, v7, v9, vcc
-; GISEL-NEXT:    v_mul_lo_u32 v9, s5, v6
-; GISEL-NEXT:    v_mul_lo_u32 v11, s6, v6
-; GISEL-NEXT:    v_mul_hi_u32 v12, s5, v6
-; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v5, v14
-; GISEL-NEXT:    v_addc_u32_e32 v8, vcc, v8, v10, vcc
-; GISEL-NEXT:    v_mul_lo_u32 v10, s7, v5
-; GISEL-NEXT:    v_mul_lo_u32 v13, s8, v5
-; GISEL-NEXT:    v_mul_hi_u32 v14, s7, v5
-; GISEL-NEXT:    v_mul_lo_u32 v15, s5, v7
-; GISEL-NEXT:    v_mul_lo_u32 v16, v7, v9
-; GISEL-NEXT:    v_mul_hi_u32 v17, v6, v9
-; GISEL-NEXT:    v_mul_hi_u32 v9, v7, v9
-; GISEL-NEXT:    v_mul_lo_u32 v18, s7, v8
-; GISEL-NEXT:    v_mul_lo_u32 v19, v8, v10
-; GISEL-NEXT:    v_mul_hi_u32 v20, v5, v10
-; GISEL-NEXT:    v_mul_hi_u32 v10, v8, v10
+; GISEL-NEXT:    v_add_i32_e32 v13, vcc, v13, v16
+; GISEL-NEXT:    v_cndmask_b32_e64 v13, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v12, vcc, v12, v20
+; GISEL-NEXT:    v_cndmask_b32_e64 v16, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v10, vcc, v11, v10
+; GISEL-NEXT:    v_add_i32_e32 v11, vcc, v14, v15
+; GISEL-NEXT:    v_add_i32_e32 v13, vcc, v18, v13
+; GISEL-NEXT:    v_add_i32_e32 v14, vcc, v19, v16
+; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v9, v10
+; GISEL-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v12, vcc, v12, v13
+; GISEL-NEXT:    v_cndmask_b32_e64 v13, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v10, vcc, v11, v10
+; GISEL-NEXT:    v_add_i32_e32 v11, vcc, v14, v13
+; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v7, v10
+; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v8, v11
+; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v5, v9
+; GISEL-NEXT:    v_addc_u32_e32 v7, vcc, v6, v7, vcc
+; GISEL-NEXT:    v_mul_lo_u32 v10, s4, v9
+; GISEL-NEXT:    v_mul_lo_u32 v11, s5, v9
+; GISEL-NEXT:    v_mul_hi_u32 v13, s4, v9
+; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v5, v12
+; GISEL-NEXT:    v_addc_u32_e32 v6, vcc, v6, v8, vcc
+; GISEL-NEXT:    v_mul_lo_u32 v8, s6, v5
+; GISEL-NEXT:    v_mul_lo_u32 v12, s7, v5
+; GISEL-NEXT:    v_mul_hi_u32 v14, s6, v5
+; GISEL-NEXT:    v_mul_lo_u32 v15, s4, v7
+; GISEL-NEXT:    v_mul_lo_u32 v16, v7, v10
+; GISEL-NEXT:    v_mul_hi_u32 v17, v9, v10
+; GISEL-NEXT:    v_mul_hi_u32 v10, v7, v10
+; GISEL-NEXT:    v_mul_lo_u32 v18, s6, v6
+; GISEL-NEXT:    v_mul_lo_u32 v19, v6, v8
+; GISEL-NEXT:    v_mul_hi_u32 v20, v5, v8
+; GISEL-NEXT:    v_mul_hi_u32 v8, v6, v8
 ; GISEL-NEXT:    v_add_i32_e32 v11, vcc, v11, v15
-; GISEL-NEXT:    v_add_i32_e32 v13, vcc, v13, v18
-; GISEL-NEXT:    v_add_i32_e32 v11, vcc, v11, v12
-; GISEL-NEXT:    v_add_i32_e32 v12, vcc, v13, v14
-; GISEL-NEXT:    v_mul_lo_u32 v13, v6, v11
+; GISEL-NEXT:    v_add_i32_e32 v12, vcc, v12, v18
+; GISEL-NEXT:    v_add_i32_e32 v11, vcc, v11, v13
+; GISEL-NEXT:    v_add_i32_e32 v12, vcc, v12, v14
+; GISEL-NEXT:    v_mul_lo_u32 v13, v9, v11
 ; GISEL-NEXT:    v_mul_lo_u32 v14, v7, v11
-; GISEL-NEXT:    v_mul_hi_u32 v15, v6, v11
+; GISEL-NEXT:    v_mul_hi_u32 v15, v9, v11
 ; GISEL-NEXT:    v_mul_hi_u32 v11, v7, v11
 ; GISEL-NEXT:    v_mul_lo_u32 v18, v5, v12
-; GISEL-NEXT:    v_mul_lo_u32 v21, v8, v12
+; GISEL-NEXT:    v_mul_lo_u32 v21, v6, v12
 ; GISEL-NEXT:    v_mul_hi_u32 v22, v5, v12
-; GISEL-NEXT:    v_mul_hi_u32 v12, v8, v12
+; GISEL-NEXT:    v_mul_hi_u32 v12, v6, v12
 ; GISEL-NEXT:    v_add_i32_e32 v13, vcc, v16, v13
 ; GISEL-NEXT:    v_cndmask_b32_e64 v16, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v14, v9
+; GISEL-NEXT:    v_add_i32_e32 v10, vcc, v14, v10
 ; GISEL-NEXT:    v_cndmask_b32_e64 v14, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v18, vcc, v19, v18
 ; GISEL-NEXT:    v_cndmask_b32_e64 v19, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v10, vcc, v21, v10
+; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v21, v8
 ; GISEL-NEXT:    v_cndmask_b32_e64 v21, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v13, vcc, v13, v17
 ; GISEL-NEXT:    v_cndmask_b32_e64 v13, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v9, v15
+; GISEL-NEXT:    v_add_i32_e32 v10, vcc, v10, v15
 ; GISEL-NEXT:    v_cndmask_b32_e64 v15, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v17, vcc, v18, v20
 ; GISEL-NEXT:    v_cndmask_b32_e64 v17, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v10, vcc, v10, v22
+; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v8, v22
 ; GISEL-NEXT:    v_cndmask_b32_e64 v18, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v13, vcc, v16, v13
 ; GISEL-NEXT:    v_add_i32_e32 v14, vcc, v14, v15
 ; GISEL-NEXT:    v_add_i32_e32 v15, vcc, v19, v17
 ; GISEL-NEXT:    v_add_i32_e32 v16, vcc, v21, v18
-; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v9, v13
+; GISEL-NEXT:    v_add_i32_e32 v10, vcc, v10, v13
 ; GISEL-NEXT:    v_cndmask_b32_e64 v13, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v10, vcc, v10, v15
+; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v8, v15
 ; GISEL-NEXT:    v_cndmask_b32_e64 v15, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v13, vcc, v14, v13
 ; GISEL-NEXT:    v_add_i32_e32 v14, vcc, v16, v15
 ; GISEL-NEXT:    v_add_i32_e32 v11, vcc, v11, v13
 ; GISEL-NEXT:    v_add_i32_e32 v12, vcc, v12, v14
-; GISEL-NEXT:    v_add_i32_e32 v6, vcc, v6, v9
+; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v9, v10
 ; GISEL-NEXT:    v_addc_u32_e32 v7, vcc, v7, v11, vcc
-; GISEL-NEXT:    v_mul_lo_u32 v9, v1, v6
-; GISEL-NEXT:    v_mul_hi_u32 v11, v0, v6
-; GISEL-NEXT:    v_mul_hi_u32 v6, v1, v6
-; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v5, v10
-; GISEL-NEXT:    v_addc_u32_e32 v8, vcc, v8, v12, vcc
-; GISEL-NEXT:    v_mul_lo_u32 v10, v3, v5
+; GISEL-NEXT:    v_mul_lo_u32 v10, v1, v9
+; GISEL-NEXT:    v_mul_hi_u32 v11, v0, v9
+; GISEL-NEXT:    v_mul_hi_u32 v9, v1, v9
+; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v5, v8
+; GISEL-NEXT:    v_addc_u32_e32 v6, vcc, v6, v12, vcc
+; GISEL-NEXT:    v_mul_lo_u32 v8, v3, v5
 ; GISEL-NEXT:    v_mul_hi_u32 v12, v2, v5
 ; GISEL-NEXT:    v_mul_hi_u32 v5, v3, v5
 ; GISEL-NEXT:    v_mul_lo_u32 v13, v0, v7
 ; GISEL-NEXT:    v_mul_lo_u32 v14, v1, v7
 ; GISEL-NEXT:    v_mul_hi_u32 v15, v0, v7
 ; GISEL-NEXT:    v_mul_hi_u32 v7, v1, v7
-; GISEL-NEXT:    v_mul_lo_u32 v16, v2, v8
-; GISEL-NEXT:    v_mul_lo_u32 v17, v3, v8
-; GISEL-NEXT:    v_mul_hi_u32 v18, v2, v8
-; GISEL-NEXT:    v_mul_hi_u32 v8, v3, v8
-; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v9, v13
+; GISEL-NEXT:    v_mul_lo_u32 v16, v2, v6
+; GISEL-NEXT:    v_mul_lo_u32 v17, v3, v6
+; GISEL-NEXT:    v_mul_hi_u32 v18, v2, v6
+; GISEL-NEXT:    v_mul_hi_u32 v6, v3, v6
+; GISEL-NEXT:    v_add_i32_e32 v10, vcc, v10, v13
 ; GISEL-NEXT:    v_cndmask_b32_e64 v13, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v6, vcc, v14, v6
+; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v14, v9
 ; GISEL-NEXT:    v_cndmask_b32_e64 v14, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v10, vcc, v10, v16
+; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v8, v16
 ; GISEL-NEXT:    v_cndmask_b32_e64 v16, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v17, v5
 ; GISEL-NEXT:    v_cndmask_b32_e64 v17, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v9, v11
-; GISEL-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v6, vcc, v6, v15
-; GISEL-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v10, vcc, v10, v12
+; GISEL-NEXT:    v_add_i32_e32 v10, vcc, v10, v11
 ; GISEL-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v9, v15
+; GISEL-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v8, v12
+; GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
 ; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v5, v18
 ; GISEL-NEXT:    v_cndmask_b32_e64 v12, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v13, v9
+; GISEL-NEXT:    v_add_i32_e32 v10, vcc, v13, v10
 ; GISEL-NEXT:    v_add_i32_e32 v11, vcc, v14, v11
-; GISEL-NEXT:    v_add_i32_e32 v10, vcc, v16, v10
+; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v16, v8
 ; GISEL-NEXT:    v_add_i32_e32 v12, vcc, v17, v12
-; GISEL-NEXT:    v_add_i32_e32 v6, vcc, v6, v9
-; GISEL-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v5, v10
+; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v9, v10
 ; GISEL-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
-; GISEL-NEXT:    v_add_i32_e32 v9, vcc, v11, v9
-; GISEL-NEXT:    v_mul_lo_u32 v11, v6, s4
-; GISEL-NEXT:    v_mul_hi_u32 v6, s4, v6
-; GISEL-NEXT:    v_add_i32_e32 v10, vcc, v12, v10
-; GISEL-NEXT:    v_mul_lo_u32 v12, v5, s4
-; GISEL-NEXT:    v_mul_hi_u32 v5, s4, v5
+; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v5, v8
+; GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
+; GISEL-NEXT:    v_add_i32_e32 v10, vcc, v11, v10
+; GISEL-NEXT:    v_mul_lo_u32 v11, v9, v4
+; GISEL-NEXT:    v_mul_hi_u32 v9, v4, v9
+; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v12, v8
+; GISEL-NEXT:    v_mul_lo_u32 v12, v5, v4
+; GISEL-NEXT:    v_mul_hi_u32 v5, v4, v5
+; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v7, v10
+; GISEL-NEXT:    v_add_i32_e32 v6, vcc, v6, v8
+; GISEL-NEXT:    v_mul_lo_u32 v7, v7, v4
+; GISEL-NEXT:    v_mul_lo_u32 v6, v6, v4
 ; GISEL-NEXT:    v_add_i32_e32 v7, vcc, v7, v9
-; GISEL-NEXT:    v_add_i32_e32 v8, vcc, v8, v10
-; GISEL-NEXT:    v_mul_lo_u32 v7, v7, s4
-; GISEL-NEXT:    v_mul_lo_u32 v8, v8, s4
-; GISEL-NEXT:    v_add_i32_e32 v6, vcc, v7, v6
-; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v8, v5
+; GISEL-NEXT:    v_add_i32_e32 v5, vcc, v6, v5
 ; GISEL-NEXT:    v_sub_i32_e64 v0, s[4:5], v0, v11
-; GISEL-NEXT:    v_subb_u32_e64 v7, vcc, v1, v6, s[4:5]
-; GISEL-NEXT:    v_sub_i32_e32 v1, vcc, v1, v6
+; GISEL-NEXT:    v_subb_u32_e64 v6, vcc, v1, v7, s[4:5]
+; GISEL-NEXT:    v_sub_i32_e32 v1, vcc, v1, v7
 ; GISEL-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v4
-; GISEL-NEXT:    v_cndmask_b32_e64 v6, 0, -1, vcc
+; GISEL-NEXT:    v_cndmask_b32_e64 v7, 0, -1, vcc
 ; GISEL-NEXT:    v_sub_i32_e64 v2, s[6:7], v2, v12
 ; GISEL-NEXT:    v_subb_u32_e64 v8, vcc, v3, v5, s[6:7]
 ; GISEL-NEXT:    v_sub_i32_e32 v3, vcc, v3, v5
 ; GISEL-NEXT:    v_cmp_ge_u32_e32 vcc, v2, v4
 ; GISEL-NEXT:    v_cndmask_b32_e64 v5, 0, -1, vcc
 ; GISEL-NEXT:    v_sub_i32_e32 v9, vcc, v2, v4
-; GISEL-NEXT:    v_cmp_eq_u32_e64 s[8:9], 0, v7
-; GISEL-NEXT:    v_cndmask_b32_e64 v6, -1, v6, s[8:9]
+; GISEL-NEXT:    v_cmp_eq_u32_e64 s[8:9], 0, v6
+; GISEL-NEXT:    v_cndmask_b32_e64 v7, -1, v7, s[8:9]
 ; GISEL-NEXT:    v_subbrev_u32_e64 v1, s[4:5], 0, v1, s[4:5]
 ; GISEL-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v8
 ; GISEL-NEXT:    v_cndmask_b32_e64 v5, -1, v5, s[4:5]
@@ -1323,219 +1313,152 @@ define <2 x i64> @v_urem_v2i64_oddk_denom(<2 x i64> %num) {
 ; GISEL-NEXT:    v_cmp_ne_u32_e64 s[4:5], 0, v10
 ; GISEL-NEXT:    v_cndmask_b32_e64 v9, v9, v11, s[4:5]
 ; GISEL-NEXT:    v_cndmask_b32_e32 v1, v1, v14, vcc
-; GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v6
+; GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v7
 ; GISEL-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
 ; GISEL-NEXT:    v_cndmask_b32_e64 v3, v3, v15, s[4:5]
 ; GISEL-NEXT:    v_cmp_ne_u32_e64 s[4:5], 0, v5
 ; GISEL-NEXT:    v_cndmask_b32_e64 v2, v2, v9, s[4:5]
-; GISEL-NEXT:    v_cndmask_b32_e32 v1, v7, v1, vcc
+; GISEL-NEXT:    v_cndmask_b32_e32 v1, v6, v1, vcc
 ; GISEL-NEXT:    v_cndmask_b32_e64 v3, v8, v3, s[4:5]
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; CGP-LABEL: v_urem_v2i64_oddk_denom:
 ; CGP:       ; %bb.0:
 ; CGP-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CGP-NEXT:    s_mov_b32 s4, 0x12d8fb
 ; CGP-NEXT:    v_mov_b32_e32 v4, 0x12d8fb
 ; CGP-NEXT:    v_cvt_f32_u32_e32 v5, 0x12d8fb
 ; CGP-NEXT:    v_cvt_f32_ubyte0_e32 v6, 0
-; CGP-NEXT:    s_mov_b32 s5, 0xffed2705
-; CGP-NEXT:    v_cvt_f32_u32_e32 v7, 0x12d8fb
-; CGP-NEXT:    v_cvt_f32_ubyte0_e32 v8, 0
+; CGP-NEXT:    v_mov_b32_e32 v7, 0xffed2705
 ; CGP-NEXT:    v_mac_f32_e32 v5, 0x4f800000, v6
-; CGP-NEXT:    v_mac_f32_e32 v7, 0x4f800000, v8
 ; CGP-NEXT:    v_rcp_iflag_f32_e32 v5, v5
-; CGP-NEXT:    v_rcp_iflag_f32_e32 v6, v7
 ; CGP-NEXT:    v_mul_f32_e32 v5, 0x5f7ffffc, v5
-; CGP-NEXT:    v_mul_f32_e32 v6, 0x5f7ffffc, v6
-; CGP-NEXT:    v_mul_f32_e32 v7, 0x2f800000, v5
-; CGP-NEXT:    v_mul_f32_e32 v8, 0x2f800000, v6
-; CGP-NEXT:    v_trunc_f32_e32 v7, v7
-; CGP-NEXT:    v_trunc_f32_e32 v8, v8
-; CGP-NEXT:    v_mac_f32_e32 v5, 0xcf800000, v7
-; CGP-NEXT:    v_cvt_u32_f32_e32 v7, v7
-; CGP-NEXT:    v_mac_f32_e32 v6, 0xcf800000, v8
-; CGP-NEXT:    v_cvt_u32_f32_e32 v8, v8
-; CGP-NEXT:    v_cvt_u32_f32_e32 v5, v5
-; CGP-NEXT:    v_mul_lo_u32 v9, v7, s5
+; CGP-NEXT:    v_mul_f32_e32 v6, 0x2f800000, v5
+; CGP-NEXT:    v_trunc_f32_e32 v6, v6
+; CGP-NEXT:    v_mac_f32_e32 v5, 0xcf800000, v6
 ; CGP-NEXT:    v_cvt_u32_f32_e32 v6, v6
-; CGP-NEXT:    v_mul_lo_u32 v10, v8, s5
-; CGP-NEXT:    v_mul_lo_u32 v11, v5, s5
-; CGP-NEXT:    v_mul_hi_u32 v12, s5, v5
-; CGP-NEXT:    v_sub_i32_e32 v9, vcc, v9, v5
-; CGP-NEXT:    v_mul_lo_u32 v13, v6, s5
-; CGP-NEXT:    v_mul_hi_u32 v14, s5, v6
-; CGP-NEXT:    v_sub_i32_e32 v10, vcc, v10, v6
-; CGP-NEXT:    v_add_i32_e32 v9, vcc, v9, v12
-; CGP-NEXT:    v_mul_lo_u32 v12, v7, v11
-; CGP-NEXT:    v_mul_hi_u32 v15, v5, v11
-; CGP-NEXT:    v_mul_hi_u32 v11, v7, v11
-; CGP-NEXT:    v_add_i32_e32 v10, vcc, v10, v14
-; CGP-NEXT:    v_mul_lo_u32 v14, v8, v13
-; CGP-NEXT:    v_mul_hi_u32 v16, v6, v13
-; CGP-NEXT:    v_mul_hi_u32 v13, v8, v13
-; CGP-NEXT:    v_mul_lo_u32 v17, v5, v9
-; CGP-NEXT:    v_mul_lo_u32 v18, v7, v9
-; CGP-NEXT:    v_mul_hi_u32 v19, v5, v9
-; CGP-NEXT:    v_mul_hi_u32 v9, v7, v9
-; CGP-NEXT:    v_mul_lo_u32 v20, v6, v10
-; CGP-NEXT:    v_mul_lo_u32 v21, v8, v10
-; CGP-NEXT:    v_mul_hi_u32 v22, v6, v10
-; CGP-NEXT:    v_mul_hi_u32 v10, v8, v10
-; CGP-NEXT:    v_add_i32_e32 v12, vcc, v12, v17
-; CGP-NEXT:    v_cndmask_b32_e64 v17, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v11, vcc, v18, v11
-; CGP-NEXT:    v_cndmask_b32_e64 v18, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v14, vcc, v14, v20
-; CGP-NEXT:    v_cndmask_b32_e64 v20, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v13, vcc, v21, v13
-; CGP-NEXT:    v_cndmask_b32_e64 v21, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v12, vcc, v12, v15
-; CGP-NEXT:    v_cndmask_b32_e64 v12, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v11, vcc, v11, v19
-; CGP-NEXT:    v_cndmask_b32_e64 v15, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v14, vcc, v14, v16
-; CGP-NEXT:    v_cndmask_b32_e64 v14, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v13, vcc, v13, v22
-; CGP-NEXT:    v_cndmask_b32_e64 v16, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v12, vcc, v17, v12
-; CGP-NEXT:    v_add_i32_e32 v15, vcc, v18, v15
-; CGP-NEXT:    v_add_i32_e32 v14, vcc, v20, v14
-; CGP-NEXT:    v_add_i32_e32 v16, vcc, v21, v16
-; CGP-NEXT:    v_add_i32_e32 v11, vcc, v11, v12
+; CGP-NEXT:    v_cvt_u32_f32_e32 v5, v5
+; CGP-NEXT:    v_mul_lo_u32 v8, v6, v7
+; CGP-NEXT:    v_mul_lo_u32 v9, v5, v7
+; CGP-NEXT:    v_mul_hi_u32 v10, v7, v5
+; CGP-NEXT:    v_sub_i32_e32 v8, vcc, v8, v5
+; CGP-NEXT:    v_add_i32_e32 v8, vcc, v8, v10
+; CGP-NEXT:    v_mul_lo_u32 v10, v6, v9
+; CGP-NEXT:    v_mul_hi_u32 v11, v5, v9
+; CGP-NEXT:    v_mul_hi_u32 v9, v6, v9
+; CGP-NEXT:    v_mul_lo_u32 v12, v5, v8
+; CGP-NEXT:    v_mul_lo_u32 v13, v6, v8
+; CGP-NEXT:    v_mul_hi_u32 v14, v5, v8
+; CGP-NEXT:    v_mul_hi_u32 v8, v6, v8
+; CGP-NEXT:    v_add_i32_e32 v10, vcc, v10, v12
 ; CGP-NEXT:    v_cndmask_b32_e64 v12, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v13, vcc, v13, v14
-; CGP-NEXT:    v_cndmask_b32_e64 v14, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v12, vcc, v15, v12
-; CGP-NEXT:    v_add_i32_e32 v14, vcc, v16, v14
-; CGP-NEXT:    v_add_i32_e32 v9, vcc, v9, v12
-; CGP-NEXT:    v_add_i32_e32 v10, vcc, v10, v14
-; CGP-NEXT:    v_add_i32_e32 v5, vcc, v5, v11
-; CGP-NEXT:    v_addc_u32_e32 v7, vcc, v7, v9, vcc
-; CGP-NEXT:    v_mul_lo_u32 v9, v5, s5
-; CGP-NEXT:    v_mul_hi_u32 v11, s5, v5
-; CGP-NEXT:    v_add_i32_e32 v6, vcc, v6, v13
-; CGP-NEXT:    v_addc_u32_e32 v8, vcc, v8, v10, vcc
-; CGP-NEXT:    v_mul_lo_u32 v10, v6, s5
-; CGP-NEXT:    v_mul_hi_u32 v12, s5, v6
-; CGP-NEXT:    v_mul_lo_u32 v13, v7, s5
-; CGP-NEXT:    v_mul_lo_u32 v14, v7, v9
-; CGP-NEXT:    v_mul_hi_u32 v15, v5, v9
-; CGP-NEXT:    v_mul_hi_u32 v9, v7, v9
-; CGP-NEXT:    v_mul_lo_u32 v16, v8, s5
-; CGP-NEXT:    v_mul_lo_u32 v17, v8, v10
-; CGP-NEXT:    v_mul_hi_u32 v18, v6, v10
-; CGP-NEXT:    v_mul_hi_u32 v10, v8, v10
-; CGP-NEXT:    v_sub_i32_e32 v13, vcc, v13, v5
-; CGP-NEXT:    v_sub_i32_e32 v16, vcc, v16, v6
-; CGP-NEXT:    v_add_i32_e32 v11, vcc, v13, v11
-; CGP-NEXT:    v_add_i32_e32 v12, vcc, v16, v12
-; CGP-NEXT:    v_mul_lo_u32 v13, v5, v11
-; CGP-NEXT:    v_mul_lo_u32 v16, v7, v11
-; CGP-NEXT:    v_mul_hi_u32 v19, v5, v11
-; CGP-NEXT:    v_mul_hi_u32 v11, v7, v11
-; CGP-NEXT:    v_mul_lo_u32 v20, v6, v12
-; CGP-NEXT:    v_mul_lo_u32 v21, v8, v12
-; CGP-NEXT:    v_mul_hi_u32 v22, v6, v12
-; CGP-NEXT:    v_mul_hi_u32 v12, v8, v12
-; CGP-NEXT:    v_add_i32_e32 v13, vcc, v14, v13
-; CGP-NEXT:    v_cndmask_b32_e64 v14, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v9, vcc, v16, v9
-; CGP-NEXT:    v_cndmask_b32_e64 v16, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v17, vcc, v17, v20
-; CGP-NEXT:    v_cndmask_b32_e64 v20, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v10, vcc, v21, v10
-; CGP-NEXT:    v_cndmask_b32_e64 v21, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v13, vcc, v13, v15
-; CGP-NEXT:    v_cndmask_b32_e64 v13, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v9, vcc, v9, v19
-; CGP-NEXT:    v_cndmask_b32_e64 v15, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v17, vcc, v17, v18
-; CGP-NEXT:    v_cndmask_b32_e64 v17, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v10, vcc, v10, v22
-; CGP-NEXT:    v_cndmask_b32_e64 v18, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v13, vcc, v14, v13
-; CGP-NEXT:    v_add_i32_e32 v14, vcc, v16, v15
-; CGP-NEXT:    v_add_i32_e32 v15, vcc, v20, v17
-; CGP-NEXT:    v_add_i32_e32 v16, vcc, v21, v18
-; CGP-NEXT:    v_add_i32_e32 v9, vcc, v9, v13
+; CGP-NEXT:    v_add_i32_e32 v9, vcc, v13, v9
 ; CGP-NEXT:    v_cndmask_b32_e64 v13, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v10, vcc, v10, v15
-; CGP-NEXT:    v_cndmask_b32_e64 v15, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v13, vcc, v14, v13
-; CGP-NEXT:    v_add_i32_e32 v14, vcc, v16, v15
-; CGP-NEXT:    v_add_i32_e32 v11, vcc, v11, v13
-; CGP-NEXT:    v_add_i32_e32 v12, vcc, v12, v14
+; CGP-NEXT:    v_add_i32_e32 v10, vcc, v10, v11
+; CGP-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
+; CGP-NEXT:    v_add_i32_e32 v9, vcc, v9, v14
+; CGP-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
+; CGP-NEXT:    v_add_i32_e32 v10, vcc, v12, v10
+; CGP-NEXT:    v_add_i32_e32 v11, vcc, v13, v11
+; CGP-NEXT:    v_add_i32_e32 v9, vcc, v9, v10
+; CGP-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
+; CGP-NEXT:    v_add_i32_e32 v10, vcc, v11, v10
+; CGP-NEXT:    v_add_i32_e32 v8, vcc, v8, v10
 ; CGP-NEXT:    v_add_i32_e32 v5, vcc, v5, v9
-; CGP-NEXT:    v_addc_u32_e32 v7, vcc, v7, v11, vcc
-; CGP-NEXT:    v_mul_lo_u32 v9, v1, v5
-; CGP-NEXT:    v_mul_hi_u32 v11, v0, v5
-; CGP-NEXT:    v_mul_hi_u32 v5, v1, v5
-; CGP-NEXT:    v_add_i32_e32 v6, vcc, v6, v10
-; CGP-NEXT:    v_addc_u32_e32 v8, vcc, v8, v12, vcc
-; CGP-NEXT:    v_mul_lo_u32 v10, v3, v6
-; CGP-NEXT:    v_mul_hi_u32 v12, v2, v6
+; CGP-NEXT:    v_addc_u32_e32 v6, vcc, v6, v8, vcc
+; CGP-NEXT:    v_mul_lo_u32 v8, v5, v7
+; CGP-NEXT:    v_mul_hi_u32 v9, v7, v5
+; CGP-NEXT:    v_mul_lo_u32 v7, v6, v7
+; CGP-NEXT:    v_mul_lo_u32 v10, v6, v8
+; CGP-NEXT:    v_mul_hi_u32 v11, v5, v8
+; CGP-NEXT:    v_mul_hi_u32 v8, v6, v8
+; CGP-NEXT:    v_sub_i32_e32 v7, vcc, v7, v5
+; CGP-NEXT:    v_add_i32_e32 v7, vcc, v7, v9
+; CGP-NEXT:    v_mul_lo_u32 v9, v5, v7
+; CGP-NEXT:    v_mul_lo_u32 v12, v6, v7
+; CGP-NEXT:    v_mul_hi_u32 v13, v5, v7
+; CGP-NEXT:    v_mul_hi_u32 v7, v6, v7
+; CGP-NEXT:    v_add_i32_e32 v9, vcc, v10, v9
+; CGP-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
+; CGP-NEXT:    v_add_i32_e32 v8, vcc, v12, v8
+; CGP-NEXT:    v_cndmask_b32_e64 v12, 0, 1, vcc
+; CGP-NEXT:    v_add_i32_e32 v9, vcc, v9, v11
+; CGP-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
+; CGP-NEXT:    v_add_i32_e32 v8, vcc, v8, v13
+; CGP-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
+; CGP-NEXT:    v_add_i32_e32 v9, vcc, v10, v9
+; CGP-NEXT:    v_add_i32_e32 v10, vcc, v12, v11
+; CGP-NEXT:    v_add_i32_e32 v8, vcc, v8, v9
+; CGP-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
+; CGP-NEXT:    v_add_i32_e32 v9, vcc, v10, v9
+; CGP-NEXT:    v_add_i32_e32 v7, vcc, v7, v9
+; CGP-NEXT:    v_add_i32_e32 v5, vcc, v5, v8
+; CGP-NEXT:    v_addc_u32_e32 v6, vcc, v6, v7, vcc
+; CGP-NEXT:    v_mul_lo_u32 v7, v1, v5
+; CGP-NEXT:    v_mul_hi_u32 v8, v0, v5
+; CGP-NEXT:    v_mul_hi_u32 v9, v1, v5
+; CGP-NEXT:    v_mul_lo_u32 v10, v3, v5
+; CGP-NEXT:    v_mul_hi_u32 v11, v2, v5
+; CGP-NEXT:    v_mul_hi_u32 v5, v3, v5
+; CGP-NEXT:    v_mul_lo_u32 v12, v0, v6
+; CGP-NEXT:    v_mul_lo_u32 v13, v1, v6
+; CGP-NEXT:    v_mul_hi_u32 v14, v0, v6
+; CGP-NEXT:    v_mul_hi_u32 v15, v1, v6
+; CGP-NEXT:    v_mul_lo_u32 v16, v2, v6
+; CGP-NEXT:    v_mul_lo_u32 v17, v3, v6
+; CGP-NEXT:    v_mul_hi_u32 v18, v2, v6
 ; CGP-NEXT:    v_mul_hi_u32 v6, v3, v6
-; CGP-NEXT:    v_mul_lo_u32 v13, v0, v7
-; CGP-NEXT:    v_mul_lo_u32 v14, v1, v7
-; CGP-NEXT:    v_mul_hi_u32 v15, v0, v7
-; CGP-NEXT:    v_mul_hi_u32 v7, v1, v7
-; CGP-NEXT:    v_mul_lo_u32 v16, v2, v8
-; CGP-NEXT:    v_mul_lo_u32 v17, v3, v8
-; CGP-NEXT:    v_mul_hi_u32 v18, v2, v8
-; CGP-NEXT:    v_mul_hi_u32 v8, v3, v8
-; CGP-NEXT:    v_add_i32_e32 v9, vcc, v9, v13
+; CGP-NEXT:    v_add_i32_e32 v7, vcc, v7, v12
+; CGP-NEXT:    v_cndmask_b32_e64 v12, 0, 1, vcc
+; CGP-NEXT:    v_add_i32_e32 v9, vcc, v13, v9
 ; CGP-NEXT:    v_cndmask_b32_e64 v13, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v5, vcc, v14, v5
-; CGP-NEXT:    v_cndmask_b32_e64 v14, 0, 1, vcc
 ; CGP-NEXT:    v_add_i32_e32 v10, vcc, v10, v16
 ; CGP-NEXT:    v_cndmask_b32_e64 v16, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v6, vcc, v17, v6
+; CGP-NEXT:    v_add_i32_e32 v5, vcc, v17, v5
 ; CGP-NEXT:    v_cndmask_b32_e64 v17, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v9, vcc, v9, v11
+; CGP-NEXT:    v_add_i32_e32 v7, vcc, v7, v8
+; CGP-NEXT:    v_cndmask_b32_e64 v7, 0, 1, vcc
+; CGP-NEXT:    v_add_i32_e32 v8, vcc, v9, v14
 ; CGP-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v5, vcc, v5, v15
-; CGP-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v10, vcc, v10, v12
+; CGP-NEXT:    v_add_i32_e32 v10, vcc, v10, v11
 ; CGP-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v6, vcc, v6, v18
-; CGP-NEXT:    v_cndmask_b32_e64 v12, 0, 1, vcc
+; CGP-NEXT:    v_add_i32_e32 v5, vcc, v5, v18
+; CGP-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
+; CGP-NEXT:    v_add_i32_e32 v7, vcc, v12, v7
 ; CGP-NEXT:    v_add_i32_e32 v9, vcc, v13, v9
-; CGP-NEXT:    v_add_i32_e32 v11, vcc, v14, v11
 ; CGP-NEXT:    v_add_i32_e32 v10, vcc, v16, v10
-; CGP-NEXT:    v_add_i32_e32 v12, vcc, v17, v12
-; CGP-NEXT:    v_add_i32_e32 v5, vcc, v5, v9
-; CGP-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v6, vcc, v6, v10
+; CGP-NEXT:    v_add_i32_e32 v11, vcc, v17, v11
+; CGP-NEXT:    v_add_i32_e32 v7, vcc, v8, v7
+; CGP-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
+; CGP-NEXT:    v_add_i32_e32 v5, vcc, v5, v10
 ; CGP-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
-; CGP-NEXT:    v_add_i32_e32 v9, vcc, v11, v9
-; CGP-NEXT:    v_mul_lo_u32 v11, v5, s4
-; CGP-NEXT:    v_mul_hi_u32 v5, s4, v5
-; CGP-NEXT:    v_add_i32_e32 v10, vcc, v12, v10
-; CGP-NEXT:    v_mul_lo_u32 v12, v6, s4
-; CGP-NEXT:    v_mul_hi_u32 v6, s4, v6
-; CGP-NEXT:    v_add_i32_e32 v7, vcc, v7, v9
-; CGP-NEXT:    v_add_i32_e32 v8, vcc, v8, v10
-; CGP-NEXT:    v_mul_lo_u32 v7, v7, s4
-; CGP-NEXT:    v_mul_lo_u32 v8, v8, s4
-; CGP-NEXT:    v_add_i32_e32 v5, vcc, v7, v5
-; CGP-NEXT:    v_add_i32_e32 v6, vcc, v8, v6
-; CGP-NEXT:    v_sub_i32_e64 v0, s[4:5], v0, v11
-; CGP-NEXT:    v_subb_u32_e64 v7, vcc, v1, v5, s[4:5]
-; CGP-NEXT:    v_sub_i32_e32 v1, vcc, v1, v5
+; CGP-NEXT:    v_add_i32_e32 v8, vcc, v9, v8
+; CGP-NEXT:    v_mul_lo_u32 v9, v7, v4
+; CGP-NEXT:    v_mul_hi_u32 v7, v4, v7
+; CGP-NEXT:    v_add_i32_e32 v10, vcc, v11, v10
+; CGP-NEXT:    v_mul_lo_u32 v11, v5, v4
+; CGP-NEXT:    v_mul_hi_u32 v5, v4, v5
+; CGP-NEXT:    v_add_i32_e32 v8, vcc, v15, v8
+; CGP-NEXT:    v_add_i32_e32 v6, vcc, v6, v10
+; CGP-NEXT:    v_mul_lo_u32 v8, v8, v4
+; CGP-NEXT:    v_mul_lo_u32 v6, v6, v4
+; CGP-NEXT:    v_add_i32_e32 v7, vcc, v8, v7
+; CGP-NEXT:    v_add_i32_e32 v5, vcc, v6, v5
+; CGP-NEXT:    v_sub_i32_e64 v0, s[4:5], v0, v9
+; CGP-NEXT:    v_subb_u32_e64 v6, vcc, v1, v7, s[4:5]
+; CGP-NEXT:    v_sub_i32_e32 v1, vcc, v1, v7
 ; CGP-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v4
-; CGP-NEXT:    v_cndmask_b32_e64 v5, 0, -1, vcc
-; CGP-NEXT:    v_sub_i32_e64 v2, s[6:7], v2, v12
-; CGP-NEXT:    v_subb_u32_e64 v8, vcc, v3, v6, s[6:7]
-; CGP-NEXT:    v_sub_i32_e32 v3, vcc, v3, v6
+; CGP-NEXT:    v_cndmask_b32_e64 v7, 0, -1, vcc
+; CGP-NEXT:    v_sub_i32_e64 v2, s[6:7], v2, v11
+; CGP-NEXT:    v_subb_u32_e64 v8, vcc, v3, v5, s[6:7]
+; CGP-NEXT:    v_sub_i32_e32 v3, vcc, v3, v5
 ; CGP-NEXT:    v_cmp_ge_u32_e32 vcc, v2, v4
-; CGP-NEXT:    v_cndmask_b32_e64 v6, 0, -1, vcc
+; CGP-NEXT:    v_cndmask_b32_e64 v5, 0, -1, vcc
 ; CGP-NEXT:    v_sub_i32_e32 v9, vcc, v2, v4
-; CGP-NEXT:    v_cmp_eq_u32_e64 s[8:9], 0, v7
-; CGP-NEXT:    v_cndmask_b32_e64 v5, -1, v5, s[8:9]
+; CGP-NEXT:    v_cmp_eq_u32_e64 s[8:9], 0, v6
+; CGP-NEXT:    v_cndmask_b32_e64 v7, -1, v7, s[8:9]
 ; CGP-NEXT:    v_subbrev_u32_e64 v1, s[4:5], 0, v1, s[4:5]
 ; CGP-NEXT:    v_cmp_eq_u32_e64 s[4:5], 0, v8
-; CGP-NEXT:    v_cndmask_b32_e64 v6, -1, v6, s[4:5]
+; CGP-NEXT:    v_cndmask_b32_e64 v5, -1, v5, s[4:5]
 ; CGP-NEXT:    v_subbrev_u32_e64 v3, s[4:5], 0, v3, s[6:7]
 ; CGP-NEXT:    v_cmp_ge_u32_e64 s[4:5], v9, v4
 ; CGP-NEXT:    v_cndmask_b32_e64 v10, 0, -1, s[4:5]
@@ -1558,12 +1481,12 @@ define <2 x i64> @v_urem_v2i64_oddk_denom(<2 x i64> %num) {
 ; CGP-NEXT:    v_cmp_ne_u32_e64 s[4:5], 0, v10
 ; CGP-NEXT:    v_cndmask_b32_e64 v9, v9, v11, s[4:5]
 ; CGP-NEXT:    v_cndmask_b32_e32 v1, v1, v14, vcc
-; CGP-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v5
+; CGP-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v7
 ; CGP-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
 ; CGP-NEXT:    v_cndmask_b32_e64 v3, v3, v15, s[4:5]
-; CGP-NEXT:    v_cmp_ne_u32_e64 s[4:5], 0, v6
+; CGP-NEXT:    v_cmp_ne_u32_e64 s[4:5], 0, v5
 ; CGP-NEXT:    v_cndmask_b32_e64 v2, v2, v9, s[4:5]
-; CGP-NEXT:    v_cndmask_b32_e32 v1, v7, v1, vcc
+; CGP-NEXT:    v_cndmask_b32_e32 v1, v6, v1, vcc
 ; CGP-NEXT:    v_cndmask_b32_e64 v3, v8, v3, s[4:5]
 ; CGP-NEXT:    s_setpc_b64 s[30:31]
   %result = urem <2 x i64> %num, <i64 1235195, i64 1235195>
@@ -1751,9 +1674,10 @@ define <2 x i64> @v_urem_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) {
 ; GISEL-LABEL: v_urem_v2i64_pow2_shl_denom:
 ; GISEL:       ; %bb.0:
 ; GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT:    s_mov_b64 s[4:5], 0x1000
-; GISEL-NEXT:    v_lshl_b64 v[7:8], s[4:5], v4
-; GISEL-NEXT:    v_lshl_b64 v[4:5], s[4:5], v6
+; GISEL-NEXT:    v_mov_b32_e32 v9, 0x1000
+; GISEL-NEXT:    v_mov_b32_e32 v10, 0
+; GISEL-NEXT:    v_lshl_b64 v[7:8], v[9:10], v4
+; GISEL-NEXT:    v_lshl_b64 v[4:5], v[9:10], v6
 ; GISEL-NEXT:    v_cvt_f32_u32_e32 v10, v7
 ; GISEL-NEXT:    v_cvt_f32_u32_e32 v11, v8
 ; GISEL-NEXT:    v_sub_i32_e32 v6, vcc, 0, v7
@@ -2011,11 +1935,10 @@ define <2 x i64> @v_urem_v2i64_pow2_shl_denom(<2 x i64> %x, <2 x i64> %y) {
 ; CGP-NEXT:    v_mov_b32_e32 v9, v1
 ; CGP-NEXT:    v_mov_b32_e32 v5, v2
 ; CGP-NEXT:    v_mov_b32_e32 v7, v3
-; CGP-NEXT:    s_mov_b64 s[4:5], 0x1000
 ; CGP-NEXT:    v_mov_b32_e32 v10, 0x1000
 ; CGP-NEXT:    v_mov_b32_e32 v11, 0
 ; CGP-NEXT:    v_mov_b32_e32 v0, 0
-; CGP-NEXT:    v_lshl_b64 v[2:3], s[4:5], v4
+; CGP-NEXT:    v_lshl_b64 v[2:3], v[10:11], v4
 ; CGP-NEXT:    v_or_b32_e32 v1, v9, v3
 ; CGP-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[0:1]
 ; CGP-NEXT:    v_cvt_f32_u32_e32 v4, v2
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/usubsat.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/usubsat.ll
index b97eba8e70b4988..a60370cd460f9ee 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/usubsat.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/usubsat.ll
@@ -219,8 +219,8 @@ define i16 @v_usubsat_v2i8(i16 %lhs.arg, i16 %rhs.arg) {
 ; GFX9-NEXT:    v_pk_lshlrev_b16 v1, 8, v1 op_sel_hi:[0,1]
 ; GFX9-NEXT:    v_pk_sub_u16 v0, v0, v1 clamp
 ; GFX9-NEXT:    v_pk_lshrrev_b16 v0, 8, v0 op_sel_hi:[0,1]
-; GFX9-NEXT:    s_movk_i32 s4, 0xff
-; GFX9-NEXT:    v_and_b32_sdwa v1, v0, s4 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT:    v_mov_b32_e32 v1, 0xff
+; GFX9-NEXT:    v_and_b32_sdwa v1, v0, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; GFX9-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -231,14 +231,14 @@ define i16 @v_usubsat_v2i8(i16 %lhs.arg, i16 %rhs.arg) {
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v3, 8, v1
 ; GFX10-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX10-NEXT:    v_and_b32_e32 v1, 0xffff, v1
-; GFX10-NEXT:    s_movk_i32 s4, 0xff
 ; GFX10-NEXT:    v_lshl_or_b32 v0, v2, 16, v0
 ; GFX10-NEXT:    v_lshl_or_b32 v1, v3, 16, v1
 ; GFX10-NEXT:    v_pk_lshlrev_b16 v0, 8, v0 op_sel_hi:[0,1]
 ; GFX10-NEXT:    v_pk_lshlrev_b16 v1, 8, v1 op_sel_hi:[0,1]
 ; GFX10-NEXT:    v_pk_sub_u16 v0, v0, v1 clamp
+; GFX10-NEXT:    v_mov_b32_e32 v1, 0xff
 ; GFX10-NEXT:    v_pk_lshrrev_b16 v0, 8, v0 op_sel_hi:[0,1]
-; GFX10-NEXT:    v_and_b32_sdwa v1, v0, s4 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX10-NEXT:    v_and_b32_sdwa v1, v0, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; GFX10-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -322,8 +322,8 @@ define amdgpu_ps i16 @s_usubsat_v2i8(i16 inreg %lhs.arg, i16 inreg %rhs.arg) {
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s1
 ; GFX9-NEXT:    v_pk_sub_u16 v0, s0, v0 clamp
 ; GFX9-NEXT:    v_pk_lshrrev_b16 v0, 8, v0 op_sel_hi:[0,1]
-; GFX9-NEXT:    s_movk_i32 s0, 0xff
-; GFX9-NEXT:    v_and_b32_sdwa v1, v0, s0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT:    v_mov_b32_e32 v1, 0xff
+; GFX9-NEXT:    v_and_b32_sdwa v1, v0, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; GFX9-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX9-NEXT:    v_readfirstlane_b32 s0, v0
 ; GFX9-NEXT:    ; return to shader part epilog
@@ -342,10 +342,10 @@ define amdgpu_ps i16 @s_usubsat_v2i8(i16 inreg %lhs.arg, i16 inreg %rhs.arg) {
 ; GFX10-NEXT:    s_lshl_b32 s3, s3, 8
 ; GFX10-NEXT:    s_pack_ll_b32_b16 s0, s0, s2
 ; GFX10-NEXT:    s_pack_ll_b32_b16 s1, s1, s3
+; GFX10-NEXT:    v_mov_b32_e32 v1, 0xff
 ; GFX10-NEXT:    v_pk_sub_u16 v0, s0, s1 clamp
-; GFX10-NEXT:    s_movk_i32 s0, 0xff
 ; GFX10-NEXT:    v_pk_lshrrev_b16 v0, 8, v0 op_sel_hi:[0,1]
-; GFX10-NEXT:    v_and_b32_sdwa v1, v0, s0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX10-NEXT:    v_and_b32_sdwa v1, v0, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; GFX10-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX10-NEXT:    v_readfirstlane_b32 s0, v0
 ; GFX10-NEXT:    ; return to shader part epilog
@@ -467,11 +467,11 @@ define i32 @v_usubsat_v4i8(i32 %lhs.arg, i32 %rhs.arg) {
 ; GFX9-NEXT:    v_pk_sub_u16 v2, v2, v3 clamp
 ; GFX9-NEXT:    v_pk_sub_u16 v0, v0, v1 clamp
 ; GFX9-NEXT:    v_pk_lshrrev_b16 v1, 8, v2 op_sel_hi:[0,1]
-; GFX9-NEXT:    v_mov_b32_e32 v2, 8
+; GFX9-NEXT:    v_mov_b32_e32 v3, 8
 ; GFX9-NEXT:    v_pk_lshrrev_b16 v0, 8, v0 op_sel_hi:[0,1]
-; GFX9-NEXT:    s_movk_i32 s4, 0xff
-; GFX9-NEXT:    v_lshlrev_b32_sdwa v2, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX9-NEXT:    v_and_or_b32 v1, v1, s4, v2
+; GFX9-NEXT:    v_mov_b32_e32 v2, 0xff
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v3, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+; GFX9-NEXT:    v_and_or_b32 v1, v1, v2, v3
 ; GFX9-NEXT:    v_and_b32_e32 v2, 0xff, v0
 ; GFX9-NEXT:    v_mov_b32_e32 v3, 24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
@@ -505,7 +505,7 @@ define i32 @v_usubsat_v4i8(i32 %lhs.arg, i32 %rhs.arg) {
 ; GFX10-NEXT:    v_lshlrev_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
 ; GFX10-NEXT:    v_and_b32_e32 v3, 0xff, v0
 ; GFX10-NEXT:    v_lshlrev_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX10-NEXT:    v_and_or_b32 v1, v2, 0xff, v1
+; GFX10-NEXT:    v_and_or_b32 v1, 0xff, v2, v1
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
 ; GFX10-NEXT:    v_or3_b32 v0, v1, v2, v0
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
@@ -537,7 +537,7 @@ define i32 @v_usubsat_v4i8(i32 %lhs.arg, i32 %rhs.arg) {
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 24, v0
-; GFX11-NEXT:    v_and_or_b32 v1, v1, 0xff, v2
+; GFX11-NEXT:    v_and_or_b32 v1, 0xff, v1, v2
 ; GFX11-NEXT:    v_or3_b32 v0, v1, v3, v0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %lhs = bitcast i32 %lhs.arg to <4 x i8>
@@ -622,46 +622,46 @@ define amdgpu_ps i32 @s_usubsat_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg) {
 ;
 ; GFX9-LABEL: s_usubsat_v4i8:
 ; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_lshr_b32 s3, s0, 8
+; GFX9-NEXT:    s_lshr_b32 s2, s0, 8
+; GFX9-NEXT:    s_lshr_b32 s3, s0, 16
+; GFX9-NEXT:    s_lshr_b32 s4, s0, 24
+; GFX9-NEXT:    s_pack_ll_b32_b16 s0, s0, s2
+; GFX9-NEXT:    s_pack_ll_b32_b16 s2, s3, s4
 ; GFX9-NEXT:    s_lshr_b32 s4, s0, 16
-; GFX9-NEXT:    s_lshr_b32 s6, s0, 24
-; GFX9-NEXT:    s_pack_ll_b32_b16 s0, s0, s3
-; GFX9-NEXT:    s_pack_ll_b32_b16 s3, s4, s6
-; GFX9-NEXT:    s_lshr_b32 s6, s0, 16
 ; GFX9-NEXT:    s_lshl_b32 s0, s0, 0x80008
-; GFX9-NEXT:    s_lshl_b32 s6, s6, 8
-; GFX9-NEXT:    s_lshr_b32 s7, s1, 8
-; GFX9-NEXT:    s_pack_ll_b32_b16 s0, s0, s6
-; GFX9-NEXT:    s_lshr_b32 s6, s3, 16
-; GFX9-NEXT:    s_lshr_b32 s8, s1, 16
-; GFX9-NEXT:    s_lshr_b32 s9, s1, 24
-; GFX9-NEXT:    s_pack_ll_b32_b16 s1, s1, s7
-; GFX9-NEXT:    s_lshl_b32 s3, s3, 0x80008
-; GFX9-NEXT:    s_lshl_b32 s6, s6, 8
-; GFX9-NEXT:    s_pack_ll_b32_b16 s3, s3, s6
+; GFX9-NEXT:    s_lshl_b32 s4, s4, 8
+; GFX9-NEXT:    s_lshr_b32 s5, s1, 8
+; GFX9-NEXT:    s_pack_ll_b32_b16 s0, s0, s4
+; GFX9-NEXT:    s_lshr_b32 s4, s2, 16
 ; GFX9-NEXT:    s_lshr_b32 s6, s1, 16
-; GFX9-NEXT:    s_pack_ll_b32_b16 s4, s8, s9
+; GFX9-NEXT:    s_lshr_b32 s7, s1, 24
+; GFX9-NEXT:    s_pack_ll_b32_b16 s1, s1, s5
+; GFX9-NEXT:    s_lshl_b32 s2, s2, 0x80008
+; GFX9-NEXT:    s_lshl_b32 s4, s4, 8
+; GFX9-NEXT:    s_pack_ll_b32_b16 s2, s2, s4
+; GFX9-NEXT:    s_lshr_b32 s4, s1, 16
+; GFX9-NEXT:    s_pack_ll_b32_b16 s3, s6, s7
 ; GFX9-NEXT:    s_lshl_b32 s1, s1, 0x80008
-; GFX9-NEXT:    s_lshl_b32 s6, s6, 8
-; GFX9-NEXT:    s_pack_ll_b32_b16 s1, s1, s6
-; GFX9-NEXT:    s_lshr_b32 s6, s4, 16
-; GFX9-NEXT:    s_lshl_b32 s4, s4, 0x80008
-; GFX9-NEXT:    s_lshl_b32 s6, s6, 8
-; GFX9-NEXT:    s_pack_ll_b32_b16 s4, s4, s6
+; GFX9-NEXT:    s_lshl_b32 s4, s4, 8
+; GFX9-NEXT:    s_pack_ll_b32_b16 s1, s1, s4
+; GFX9-NEXT:    s_lshr_b32 s4, s3, 16
+; GFX9-NEXT:    s_lshl_b32 s3, s3, 0x80008
+; GFX9-NEXT:    s_lshl_b32 s4, s4, 8
+; GFX9-NEXT:    s_pack_ll_b32_b16 s3, s3, s4
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s1
 ; GFX9-NEXT:    v_pk_sub_u16 v0, s0, v0 clamp
-; GFX9-NEXT:    v_mov_b32_e32 v1, s4
-; GFX9-NEXT:    s_mov_b32 s2, 8
-; GFX9-NEXT:    v_pk_sub_u16 v1, s3, v1 clamp
+; GFX9-NEXT:    v_mov_b32_e32 v1, s3
+; GFX9-NEXT:    v_pk_sub_u16 v1, s2, v1 clamp
 ; GFX9-NEXT:    v_pk_lshrrev_b16 v0, 8, v0 op_sel_hi:[0,1]
+; GFX9-NEXT:    v_mov_b32_e32 v3, 8
 ; GFX9-NEXT:    v_pk_lshrrev_b16 v1, 8, v1 op_sel_hi:[0,1]
-; GFX9-NEXT:    s_movk_i32 s0, 0xff
-; GFX9-NEXT:    v_lshlrev_b32_sdwa v2, s2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX9-NEXT:    s_mov_b32 s5, 24
-; GFX9-NEXT:    v_and_or_b32 v0, v0, s0, v2
+; GFX9-NEXT:    v_mov_b32_e32 v2, 0xff
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v3, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+; GFX9-NEXT:    v_and_or_b32 v0, v0, v2, v3
 ; GFX9-NEXT:    v_and_b32_e32 v2, 0xff, v1
+; GFX9-NEXT:    v_mov_b32_e32 v3, 24
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX9-NEXT:    v_lshlrev_b32_sdwa v1, s5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+; GFX9-NEXT:    v_lshlrev_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
 ; GFX9-NEXT:    v_or3_b32 v0, v0, v2, v1
 ; GFX9-NEXT:    v_readfirstlane_b32 s0, v0
 ; GFX9-NEXT:    ; return to shader part epilog
@@ -696,14 +696,14 @@ define amdgpu_ps i32 @s_usubsat_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg) {
 ; GFX10-NEXT:    s_pack_ll_b32_b16 s3, s3, s5
 ; GFX10-NEXT:    v_pk_sub_u16 v0, s0, s1 clamp
 ; GFX10-NEXT:    v_pk_sub_u16 v1, s2, s3 clamp
-; GFX10-NEXT:    s_mov_b32 s0, 8
+; GFX10-NEXT:    v_mov_b32_e32 v2, 8
+; GFX10-NEXT:    v_mov_b32_e32 v4, 24
 ; GFX10-NEXT:    v_pk_lshrrev_b16 v0, 8, v0 op_sel_hi:[0,1]
 ; GFX10-NEXT:    v_pk_lshrrev_b16 v1, 8, v1 op_sel_hi:[0,1]
-; GFX10-NEXT:    v_lshlrev_b32_sdwa v2, s0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+; GFX10-NEXT:    v_lshlrev_b32_sdwa v2, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
 ; GFX10-NEXT:    v_and_b32_e32 v3, 0xff, v1
-; GFX10-NEXT:    s_mov_b32 s0, 24
-; GFX10-NEXT:    v_lshlrev_b32_sdwa v1, s0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX10-NEXT:    v_and_or_b32 v0, v0, 0xff, v2
+; GFX10-NEXT:    v_lshlrev_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+; GFX10-NEXT:    v_and_or_b32 v0, 0xff, v0, v2
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
 ; GFX10-NEXT:    v_or3_b32 v0, v0, v2, v1
 ; GFX10-NEXT:    v_readfirstlane_b32 s0, v0
@@ -743,7 +743,7 @@ define amdgpu_ps i32 @s_usubsat_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg) {
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
 ; GFX11-NEXT:    v_and_b32_e32 v3, 0xff, v1
 ; GFX11-NEXT:    v_bfe_u32 v1, v1, 16, 8
-; GFX11-NEXT:    v_and_or_b32 v0, v0, 0xff, v2
+; GFX11-NEXT:    v_and_or_b32 v0, 0xff, v0, v2
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 24, v1
 ; GFX11-NEXT:    v_or3_b32 v0, v0, v2, v1
diff --git a/llvm/test/CodeGen/AMDGPU/clamp.ll b/llvm/test/CodeGen/AMDGPU/clamp.ll
index 069a3bda4333bb4..20b248875487ecc 100644
--- a/llvm/test/CodeGen/AMDGPU/clamp.ll
+++ b/llvm/test/CodeGen/AMDGPU/clamp.ll
@@ -2512,7 +2512,8 @@ define amdgpu_kernel void @v_clamp_v2f16_undef_elt(ptr addrspace(1) %out, ptr ad
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    s_mov_b64 s[4:5], s[2:3]
 ; GFX6-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
-; GFX6-NEXT:    s_mov_b32 s2, 0x7fc00000
+; GFX6-NEXT:    v_mov_b32_e32 v4, 0x7fc00000
+; GFX6-NEXT:    s_mov_b64 s[2:3], s[6:7]
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    v_cvt_f32_f16_e32 v3, v2
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
@@ -2520,11 +2521,10 @@ define amdgpu_kernel void @v_clamp_v2f16_undef_elt(ptr addrspace(1) %out, ptr ad
 ; GFX6-NEXT:    v_mul_f32_e32 v3, 1.0, v3
 ; GFX6-NEXT:    v_max_f32_e32 v3, 0x7fc00000, v3
 ; GFX6-NEXT:    v_mul_f32_e32 v2, 1.0, v2
-; GFX6-NEXT:    v_med3_f32 v2, v2, 0, s2
+; GFX6-NEXT:    v_med3_f32 v2, v2, 0, v4
 ; GFX6-NEXT:    v_cvt_f16_f32_e32 v2, v2
 ; GFX6-NEXT:    v_min_f32_e32 v3, 1.0, v3
 ; GFX6-NEXT:    v_cvt_f16_f32_e32 v3, v3
-; GFX6-NEXT:    s_mov_b64 s[2:3], s[6:7]
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
 ; GFX6-NEXT:    v_or_b32_e32 v2, v3, v2
 ; GFX6-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
@@ -3168,6 +3168,7 @@ define amdgpu_kernel void @v_clamp_v2f16_undef_limit_elts0(ptr addrspace(1) %out
 ; GFX6-NEXT:    s_mov_b64 s[4:5], s[2:3]
 ; GFX6-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
 ; GFX6-NEXT:    s_mov_b32 s2, 0x7fc00000
+; GFX6-NEXT:    v_mov_b32_e32 v4, 0x7fc00000
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
 ; GFX6-NEXT:    v_cvt_f32_f16_e32 v3, v3
@@ -3176,7 +3177,7 @@ define amdgpu_kernel void @v_clamp_v2f16_undef_limit_elts0(ptr addrspace(1) %out
 ; GFX6-NEXT:    v_mul_f32_e32 v2, 1.0, v2
 ; GFX6-NEXT:    v_med3_f32 v3, v3, s2, 1.0
 ; GFX6-NEXT:    v_cvt_f16_f32_e32 v3, v3
-; GFX6-NEXT:    v_med3_f32 v2, v2, 0, s2
+; GFX6-NEXT:    v_med3_f32 v2, v2, 0, v4
 ; GFX6-NEXT:    v_cvt_f16_f32_e32 v2, v2
 ; GFX6-NEXT:    s_mov_b64 s[2:3], s[6:7]
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
@@ -3253,7 +3254,8 @@ define amdgpu_kernel void @v_clamp_v2f16_undef_limit_elts1(ptr addrspace(1) %out
 ; GFX6-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX6-NEXT:    s_mov_b64 s[4:5], s[2:3]
 ; GFX6-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
-; GFX6-NEXT:    s_mov_b32 s2, 0x7fc00000
+; GFX6-NEXT:    v_mov_b32_e32 v4, 0x7fc00000
+; GFX6-NEXT:    s_mov_b64 s[2:3], s[6:7]
 ; GFX6-NEXT:    s_waitcnt vmcnt(0)
 ; GFX6-NEXT:    v_cvt_f32_f16_e32 v3, v2
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
@@ -3261,11 +3263,10 @@ define amdgpu_kernel void @v_clamp_v2f16_undef_limit_elts1(ptr addrspace(1) %out
 ; GFX6-NEXT:    v_mul_f32_e32 v3, 1.0, v3
 ; GFX6-NEXT:    v_max_f32_e32 v3, 0x7fc00000, v3
 ; GFX6-NEXT:    v_mul_f32_e32 v2, 1.0, v2
-; GFX6-NEXT:    v_med3_f32 v2, v2, 0, s2
+; GFX6-NEXT:    v_med3_f32 v2, v2, 0, v4
 ; GFX6-NEXT:    v_cvt_f16_f32_e32 v2, v2
 ; GFX6-NEXT:    v_min_f32_e32 v3, 1.0, v3
 ; GFX6-NEXT:    v_cvt_f16_f32_e32 v3, v3
-; GFX6-NEXT:    s_mov_b64 s[2:3], s[6:7]
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
 ; GFX6-NEXT:    v_or_b32_e32 v2, v3, v2
 ; GFX6-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
diff --git a/llvm/test/CodeGen/AMDGPU/ds-alignment.ll b/llvm/test/CodeGen/AMDGPU/ds-alignment.ll
index 439ff32bc4cc1cd..d262ed2b7018bbb 100644
--- a/llvm/test/CodeGen/AMDGPU/ds-alignment.ll
+++ b/llvm/test/CodeGen/AMDGPU/ds-alignment.ll
@@ -105,14 +105,14 @@ define amdgpu_kernel void @ds4align1(ptr addrspace(3) %in, ptr addrspace(3) %out
 ; ALIGNED-GISEL-LABEL: ds4align1:
 ; ALIGNED-GISEL:       ; %bb.0:
 ; ALIGNED-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
+; ALIGNED-GISEL-NEXT:    v_mov_b32_e32 v4, 8
 ; ALIGNED-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; ALIGNED-GISEL-NEXT:    v_mov_b32_e32 v0, s0
 ; ALIGNED-GISEL-NEXT:    ds_read_u8 v1, v0
 ; ALIGNED-GISEL-NEXT:    ds_read_u8 v2, v0 offset:1
 ; ALIGNED-GISEL-NEXT:    ds_read_u8 v3, v0 offset:3
 ; ALIGNED-GISEL-NEXT:    ds_read_u8 v0, v0 offset:2
-; ALIGNED-GISEL-NEXT:    s_mov_b32 s0, 8
-; ALIGNED-GISEL-NEXT:    v_mov_b32_e32 v4, s1
+; ALIGNED-GISEL-NEXT:    v_mov_b32_e32 v5, s1
 ; ALIGNED-GISEL-NEXT:    s_waitcnt lgkmcnt(2)
 ; ALIGNED-GISEL-NEXT:    v_lshl_or_b32 v1, v2, 8, v1
 ; ALIGNED-GISEL-NEXT:    s_waitcnt lgkmcnt(1)
@@ -121,11 +121,11 @@ define amdgpu_kernel void @ds4align1(ptr addrspace(3) %in, ptr addrspace(3) %out
 ; ALIGNED-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; ALIGNED-GISEL-NEXT:    v_or3_b32 v0, v2, v0, v1
 ; ALIGNED-GISEL-NEXT:    v_lshrrev_b16_e32 v1, 8, v0
-; ALIGNED-GISEL-NEXT:    ds_write_b8 v4, v0
-; ALIGNED-GISEL-NEXT:    ds_write_b8 v4, v1 offset:1
-; ALIGNED-GISEL-NEXT:    v_lshrrev_b16_sdwa v1, s0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; ALIGNED-GISEL-NEXT:    ds_write_b8_d16_hi v4, v0 offset:2
-; ALIGNED-GISEL-NEXT:    ds_write_b8 v4, v1 offset:3
+; ALIGNED-GISEL-NEXT:    ds_write_b8 v5, v0
+; ALIGNED-GISEL-NEXT:    ds_write_b8 v5, v1 offset:1
+; ALIGNED-GISEL-NEXT:    v_lshrrev_b16_sdwa v1, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; ALIGNED-GISEL-NEXT:    ds_write_b8_d16_hi v5, v0 offset:2
+; ALIGNED-GISEL-NEXT:    ds_write_b8 v5, v1 offset:3
 ; ALIGNED-GISEL-NEXT:    s_endpgm
 ;
 ; UNALIGNED-LABEL: ds4align1:
@@ -235,7 +235,6 @@ define amdgpu_kernel void @ds8align1(ptr addrspace(3) %in, ptr addrspace(3) %out
 ; ALIGNED-GISEL-LABEL: ds8align1:
 ; ALIGNED-GISEL:       ; %bb.0:
 ; ALIGNED-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
-; ALIGNED-GISEL-NEXT:    s_mov_b32 s2, 8
 ; ALIGNED-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; ALIGNED-GISEL-NEXT:    v_mov_b32_e32 v0, s0
 ; ALIGNED-GISEL-NEXT:    ds_read_u8 v1, v0
@@ -262,13 +261,14 @@ define amdgpu_kernel void @ds8align1(ptr addrspace(3) %in, ptr addrspace(3) %out
 ; ALIGNED-GISEL-NEXT:    v_mov_b32_e32 v3, s1
 ; ALIGNED-GISEL-NEXT:    ds_write_b8 v3, v1
 ; ALIGNED-GISEL-NEXT:    ds_write_b8 v3, v2 offset:1
-; ALIGNED-GISEL-NEXT:    v_lshrrev_b16_sdwa v2, s2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; ALIGNED-GISEL-NEXT:    v_mov_b32_e32 v2, 8
+; ALIGNED-GISEL-NEXT:    v_lshrrev_b16_sdwa v4, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; ALIGNED-GISEL-NEXT:    ds_write_b8_d16_hi v3, v1 offset:2
-; ALIGNED-GISEL-NEXT:    ds_write_b8 v3, v2 offset:3
+; ALIGNED-GISEL-NEXT:    ds_write_b8 v3, v4 offset:3
 ; ALIGNED-GISEL-NEXT:    v_lshrrev_b16_e32 v1, 8, v0
 ; ALIGNED-GISEL-NEXT:    ds_write_b8 v3, v0 offset:4
 ; ALIGNED-GISEL-NEXT:    ds_write_b8 v3, v1 offset:5
-; ALIGNED-GISEL-NEXT:    v_lshrrev_b16_sdwa v1, s2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; ALIGNED-GISEL-NEXT:    v_lshrrev_b16_sdwa v1, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; ALIGNED-GISEL-NEXT:    ds_write_b8_d16_hi v3, v0 offset:6
 ; ALIGNED-GISEL-NEXT:    ds_write_b8 v3, v1 offset:7
 ; ALIGNED-GISEL-NEXT:    s_endpgm
@@ -416,7 +416,6 @@ define amdgpu_kernel void @ds12align1(ptr addrspace(3) %in, ptr addrspace(3) %ou
 ; ALIGNED-GISEL-LABEL: ds12align1:
 ; ALIGNED-GISEL:       ; %bb.0:
 ; ALIGNED-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
-; ALIGNED-GISEL-NEXT:    s_mov_b32 s2, 8
 ; ALIGNED-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; ALIGNED-GISEL-NEXT:    v_mov_b32_e32 v0, s0
 ; ALIGNED-GISEL-NEXT:    ds_read_u8 v1, v0
@@ -448,25 +447,26 @@ define amdgpu_kernel void @ds12align1(ptr addrspace(3) %in, ptr addrspace(3) %ou
 ; ALIGNED-GISEL-NEXT:    v_lshlrev_b32_e32 v4, 16, v5
 ; ALIGNED-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; ALIGNED-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 24, v0
-; ALIGNED-GISEL-NEXT:    v_or3_b32 v2, v6, v7, v2
 ; ALIGNED-GISEL-NEXT:    v_or3_b32 v0, v0, v4, v3
 ; ALIGNED-GISEL-NEXT:    v_lshrrev_b16_e32 v3, 8, v1
 ; ALIGNED-GISEL-NEXT:    v_mov_b32_e32 v4, s1
+; ALIGNED-GISEL-NEXT:    v_or3_b32 v2, v6, v7, v2
 ; ALIGNED-GISEL-NEXT:    ds_write_b8 v4, v1
 ; ALIGNED-GISEL-NEXT:    ds_write_b8 v4, v3 offset:1
-; ALIGNED-GISEL-NEXT:    v_lshrrev_b16_sdwa v3, s2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; ALIGNED-GISEL-NEXT:    v_mov_b32_e32 v3, 8
+; ALIGNED-GISEL-NEXT:    v_lshrrev_b16_sdwa v5, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; ALIGNED-GISEL-NEXT:    ds_write_b8_d16_hi v4, v1 offset:2
-; ALIGNED-GISEL-NEXT:    ds_write_b8 v4, v3 offset:3
+; ALIGNED-GISEL-NEXT:    ds_write_b8 v4, v5 offset:3
 ; ALIGNED-GISEL-NEXT:    v_lshrrev_b16_e32 v1, 8, v2
 ; ALIGNED-GISEL-NEXT:    ds_write_b8 v4, v2 offset:4
 ; ALIGNED-GISEL-NEXT:    ds_write_b8 v4, v1 offset:5
-; ALIGNED-GISEL-NEXT:    v_lshrrev_b16_sdwa v1, s2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; ALIGNED-GISEL-NEXT:    v_lshrrev_b16_sdwa v1, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; ALIGNED-GISEL-NEXT:    ds_write_b8_d16_hi v4, v2 offset:6
 ; ALIGNED-GISEL-NEXT:    ds_write_b8 v4, v1 offset:7
 ; ALIGNED-GISEL-NEXT:    v_lshrrev_b16_e32 v1, 8, v0
 ; ALIGNED-GISEL-NEXT:    ds_write_b8 v4, v0 offset:8
 ; ALIGNED-GISEL-NEXT:    ds_write_b8 v4, v1 offset:9
-; ALIGNED-GISEL-NEXT:    v_lshrrev_b16_sdwa v1, s2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; ALIGNED-GISEL-NEXT:    v_lshrrev_b16_sdwa v1, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; ALIGNED-GISEL-NEXT:    ds_write_b8_d16_hi v4, v0 offset:10
 ; ALIGNED-GISEL-NEXT:    ds_write_b8 v4, v1 offset:11
 ; ALIGNED-GISEL-NEXT:    s_endpgm
@@ -717,7 +717,6 @@ define amdgpu_kernel void @ds16align1(ptr addrspace(3) %in, ptr addrspace(3) %ou
 ; ALIGNED-GISEL-LABEL: ds16align1:
 ; ALIGNED-GISEL:       ; %bb.0:
 ; ALIGNED-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
-; ALIGNED-GISEL-NEXT:    s_mov_b32 s2, 8
 ; ALIGNED-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; ALIGNED-GISEL-NEXT:    v_mov_b32_e32 v0, s0
 ; ALIGNED-GISEL-NEXT:    ds_read_u8 v1, v0
@@ -764,26 +763,26 @@ define amdgpu_kernel void @ds16align1(ptr addrspace(3) %in, ptr addrspace(3) %ou
 ; ALIGNED-GISEL-NEXT:    v_mov_b32_e32 v5, s1
 ; ALIGNED-GISEL-NEXT:    ds_write_b8 v5, v1
 ; ALIGNED-GISEL-NEXT:    ds_write_b8 v5, v4 offset:1
-; ALIGNED-GISEL-NEXT:    v_lshrrev_b16_sdwa v4, s2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; ALIGNED-GISEL-NEXT:    v_mov_b32_e32 v4, 8
+; ALIGNED-GISEL-NEXT:    v_lshrrev_b16_sdwa v6, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; ALIGNED-GISEL-NEXT:    ds_write_b8_d16_hi v5, v1 offset:2
-; ALIGNED-GISEL-NEXT:    ds_write_b8 v5, v4 offset:3
+; ALIGNED-GISEL-NEXT:    ds_write_b8 v5, v6 offset:3
 ; ALIGNED-GISEL-NEXT:    v_lshrrev_b16_e32 v1, 8, v2
 ; ALIGNED-GISEL-NEXT:    ds_write_b8 v5, v2 offset:4
 ; ALIGNED-GISEL-NEXT:    ds_write_b8 v5, v1 offset:5
-; ALIGNED-GISEL-NEXT:    v_lshrrev_b16_sdwa v1, s2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; ALIGNED-GISEL-NEXT:    v_lshrrev_b16_sdwa v1, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; ALIGNED-GISEL-NEXT:    ds_write_b8_d16_hi v5, v2 offset:6
 ; ALIGNED-GISEL-NEXT:    ds_write_b8 v5, v1 offset:7
 ; ALIGNED-GISEL-NEXT:    v_lshrrev_b16_e32 v1, 8, v3
 ; ALIGNED-GISEL-NEXT:    ds_write_b8 v5, v3 offset:8
 ; ALIGNED-GISEL-NEXT:    ds_write_b8 v5, v1 offset:9
-; ALIGNED-GISEL-NEXT:    v_lshrrev_b16_sdwa v1, s2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; ALIGNED-GISEL-NEXT:    v_lshrrev_b16_sdwa v1, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; ALIGNED-GISEL-NEXT:    ds_write_b8_d16_hi v5, v3 offset:10
 ; ALIGNED-GISEL-NEXT:    ds_write_b8 v5, v1 offset:11
 ; ALIGNED-GISEL-NEXT:    v_lshrrev_b16_e32 v1, 8, v0
 ; ALIGNED-GISEL-NEXT:    ds_write_b8 v5, v0 offset:12
 ; ALIGNED-GISEL-NEXT:    ds_write_b8 v5, v1 offset:13
-; ALIGNED-GISEL-NEXT:    v_mov_b32_e32 v1, 8
-; ALIGNED-GISEL-NEXT:    v_lshrrev_b16_sdwa v1, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; ALIGNED-GISEL-NEXT:    v_lshrrev_b16_sdwa v1, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; ALIGNED-GISEL-NEXT:    ds_write_b8_d16_hi v5, v0 offset:14
 ; ALIGNED-GISEL-NEXT:    ds_write_b8 v5, v1 offset:15
 ; ALIGNED-GISEL-NEXT:    s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/fma.f16.ll b/llvm/test/CodeGen/AMDGPU/fma.f16.ll
index e8423ce9fbc36a2..4ed3abff0ad8515 100644
--- a/llvm/test/CodeGen/AMDGPU/fma.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fma.f16.ll
@@ -133,11 +133,11 @@ define i32 @test_D139469_f16(half %arg) {
 ; GFX9-GISEL-LABEL: test_D139469_f16:
 ; GFX9-GISEL:       ; %bb.0: ; %bb
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT:    v_mul_f16_e32 v1, 0x291e, v0
-; GFX9-GISEL-NEXT:    s_movk_i32 s4, 0x291e
-; GFX9-GISEL-NEXT:    v_cmp_gt_f16_e32 vcc, 0, v1
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v1, 0x211e
-; GFX9-GISEL-NEXT:    v_fma_f16 v0, v0, s4, v1
+; GFX9-GISEL-NEXT:    v_mul_f16_e32 v2, 0x291e, v0
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v1, 0x291e
+; GFX9-GISEL-NEXT:    v_cmp_gt_f16_e32 vcc, 0, v2
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v2, 0x211e
+; GFX9-GISEL-NEXT:    v_fma_f16 v0, v0, v1, v2
 ; GFX9-GISEL-NEXT:    v_cmp_gt_f16_e64 s[4:5], 0, v0
 ; GFX9-GISEL-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX9-GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
@@ -157,11 +157,11 @@ define i32 @test_D139469_f16(half %arg) {
 ; GFX10-GISEL-LABEL: test_D139469_f16:
 ; GFX10-GISEL:       ; %bb.0: ; %bb
 ; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-GISEL-NEXT:    s_movk_i32 s4, 0x291e
-; GFX10-GISEL-NEXT:    v_mul_f16_e32 v1, 0x291e, v0
-; GFX10-GISEL-NEXT:    v_fmaak_f16 v0, s4, v0, 0x211e
-; GFX10-GISEL-NEXT:    v_cmp_gt_f16_e32 vcc_lo, 0, v1
-; GFX10-GISEL-NEXT:    v_cmp_gt_f16_e64 s4, 0, v0
+; GFX10-GISEL-NEXT:    v_mov_b32_e32 v1, 0x211e
+; GFX10-GISEL-NEXT:    v_mul_f16_e32 v2, 0x291e, v0
+; GFX10-GISEL-NEXT:    v_fmac_f16_e32 v1, 0x291e, v0
+; GFX10-GISEL-NEXT:    v_cmp_gt_f16_e32 vcc_lo, 0, v2
+; GFX10-GISEL-NEXT:    v_cmp_gt_f16_e64 s4, 0, v1
 ; GFX10-GISEL-NEXT:    s_or_b32 s4, vcc_lo, s4
 ; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s4
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
@@ -182,14 +182,15 @@ define i32 @test_D139469_f16(half %arg) {
 ; GFX11-GISEL-LABEL: test_D139469_f16:
 ; GFX11-GISEL:       ; %bb.0: ; %bb
 ; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT:    s_movk_i32 s0, 0x291e
-; GFX11-GISEL-NEXT:    v_mul_f16_e32 v1, 0x291e, v0
-; GFX11-GISEL-NEXT:    v_fmaak_f16 v0, s0, v0, 0x211e
+; GFX11-GISEL-NEXT:    v_mov_b32_e32 v1, 0x211e
+; GFX11-GISEL-NEXT:    v_mul_f16_e32 v2, 0x291e, v0
 ; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-GISEL-NEXT:    v_cmp_gt_f16_e32 vcc_lo, 0, v1
-; GFX11-GISEL-NEXT:    v_cmp_gt_f16_e64 s0, 0, v0
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX11-GISEL-NEXT:    v_fmac_f16_e32 v1, 0x291e, v0
+; GFX11-GISEL-NEXT:    v_cmp_gt_f16_e32 vcc_lo, 0, v2
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-GISEL-NEXT:    v_cmp_gt_f16_e64 s0, 0, v1
 ; GFX11-GISEL-NEXT:    s_or_b32 s0, vcc_lo, s0
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
 bb:
@@ -221,15 +222,15 @@ define <2 x i32> @test_D139469_v2f16(<2 x half> %arg) {
 ; GFX9-GISEL-LABEL: test_D139469_v2f16:
 ; GFX9-GISEL:       ; %bb.0: ; %bb
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-GISEL-NEXT:    s_mov_b32 s4, 0x291e291e
-; GFX9-GISEL-NEXT:    s_mov_b32 s8, 0
-; GFX9-GISEL-NEXT:    v_pk_mul_f16 v1, v0, s4
-; GFX9-GISEL-NEXT:    v_cmp_gt_f16_e32 vcc, 0, v1
-; GFX9-GISEL-NEXT:    v_cmp_lt_f16_sdwa s[6:7], v1, s8 src0_sel:WORD_1 src1_sel:DWORD
-; GFX9-GISEL-NEXT:    v_mov_b32_e32 v1, 0x211e211e
-; GFX9-GISEL-NEXT:    v_pk_fma_f16 v0, v0, s4, v1
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v1, 0x291e291e
+; GFX9-GISEL-NEXT:    v_pk_mul_f16 v2, v0, v1
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v3, 0
+; GFX9-GISEL-NEXT:    v_cmp_gt_f16_e32 vcc, 0, v2
+; GFX9-GISEL-NEXT:    v_cmp_lt_f16_sdwa s[6:7], v2, v3 src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v2, 0x211e211e
+; GFX9-GISEL-NEXT:    v_pk_fma_f16 v0, v0, v1, v2
 ; GFX9-GISEL-NEXT:    v_cmp_gt_f16_e64 s[4:5], 0, v0
-; GFX9-GISEL-NEXT:    v_cmp_lt_f16_sdwa s[8:9], v0, s8 src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-GISEL-NEXT:    v_cmp_lt_f16_sdwa s[8:9], v0, v3 src0_sel:WORD_1 src1_sel:DWORD
 ; GFX9-GISEL-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX9-GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
 ; GFX9-GISEL-NEXT:    s_or_b64 s[4:5], s[6:7], s[8:9]
@@ -253,17 +254,17 @@ define <2 x i32> @test_D139469_v2f16(<2 x half> %arg) {
 ; GFX10-GISEL-LABEL: test_D139469_v2f16:
 ; GFX10-GISEL:       ; %bb.0: ; %bb
 ; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-GISEL-NEXT:    s_mov_b32 s4, 0x291e291e
-; GFX10-GISEL-NEXT:    v_pk_mul_f16 v1, v0, 0x291e op_sel_hi:[1,0]
-; GFX10-GISEL-NEXT:    v_pk_fma_f16 v0, v0, s4, 0x211e op_sel_hi:[1,1,0]
-; GFX10-GISEL-NEXT:    s_mov_b32 s5, 0
-; GFX10-GISEL-NEXT:    v_cmp_gt_f16_e32 vcc_lo, 0, v1
+; GFX10-GISEL-NEXT:    v_mov_b32_e32 v1, 0x211e211e
+; GFX10-GISEL-NEXT:    v_pk_mul_f16 v2, 0x291e, v0 op_sel_hi:[0,1]
+; GFX10-GISEL-NEXT:    v_pk_fma_f16 v0, 0x291e, v0, v1 op_sel_hi:[0,1,1]
+; GFX10-GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; GFX10-GISEL-NEXT:    v_cmp_gt_f16_e32 vcc_lo, 0, v2
 ; GFX10-GISEL-NEXT:    v_cmp_gt_f16_e64 s4, 0, v0
-; GFX10-GISEL-NEXT:    v_cmp_lt_f16_sdwa s6, v1, s5 src0_sel:WORD_1 src1_sel:DWORD
-; GFX10-GISEL-NEXT:    v_cmp_lt_f16_sdwa s5, v0, s5 src0_sel:WORD_1 src1_sel:DWORD
+; GFX10-GISEL-NEXT:    v_cmp_lt_f16_sdwa s5, v2, v1 src0_sel:WORD_1 src1_sel:DWORD
+; GFX10-GISEL-NEXT:    v_cmp_lt_f16_sdwa s6, v0, v1 src0_sel:WORD_1 src1_sel:DWORD
 ; GFX10-GISEL-NEXT:    s_or_b32 s4, vcc_lo, s4
 ; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s4
-; GFX10-GISEL-NEXT:    s_or_b32 s4, s6, s5
+; GFX10-GISEL-NEXT:    s_or_b32 s4, s5, s6
 ; GFX10-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s4
 ; GFX10-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -286,22 +287,23 @@ define <2 x i32> @test_D139469_v2f16(<2 x half> %arg) {
 ; GFX11-GISEL-LABEL: test_D139469_v2f16:
 ; GFX11-GISEL:       ; %bb.0: ; %bb
 ; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT:    s_mov_b32 s0, 0x291e291e
-; GFX11-GISEL-NEXT:    v_pk_mul_f16 v1, v0, 0x291e op_sel_hi:[1,0]
-; GFX11-GISEL-NEXT:    v_pk_fma_f16 v0, v0, s0, 0x211e op_sel_hi:[1,1,0]
+; GFX11-GISEL-NEXT:    v_mov_b32_e32 v1, 0x211e211e
+; GFX11-GISEL-NEXT:    v_pk_mul_f16 v2, 0x291e, v0 op_sel_hi:[0,1]
 ; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-GISEL-NEXT:    v_lshrrev_b32_e32 v2, 16, v1
+; GFX11-GISEL-NEXT:    v_pk_fma_f16 v0, 0x291e, v0, v1 op_sel_hi:[0,1,1]
+; GFX11-GISEL-NEXT:    v_lshrrev_b32_e32 v1, 16, v2
+; GFX11-GISEL-NEXT:    v_cmp_gt_f16_e32 vcc_lo, 0, v2
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
 ; GFX11-GISEL-NEXT:    v_lshrrev_b32_e32 v3, 16, v0
-; GFX11-GISEL-NEXT:    v_cmp_gt_f16_e32 vcc_lo, 0, v1
 ; GFX11-GISEL-NEXT:    v_cmp_gt_f16_e64 s0, 0, v0
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-GISEL-NEXT:    v_cmp_gt_f16_e64 s1, 0, v2
+; GFX11-GISEL-NEXT:    v_cmp_gt_f16_e64 s1, 0, v1
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
 ; GFX11-GISEL-NEXT:    v_cmp_gt_f16_e64 s2, 0, v3
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(SALU_CYCLE_1)
 ; GFX11-GISEL-NEXT:    s_or_b32 s0, vcc_lo, s0
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s0
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(SALU_CYCLE_1)
 ; GFX11-GISEL-NEXT:    s_or_b32 s0, s1, s2
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX11-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s0
 ; GFX11-GISEL-NEXT:    s_setpc_b64 s[30:31]
 bb:
diff --git a/llvm/test/CodeGen/AMDGPU/fmed3.ll b/llvm/test/CodeGen/AMDGPU/fmed3.ll
index 70d915df7cb007f..5d9836a4de96a04 100644
--- a/llvm/test/CodeGen/AMDGPU/fmed3.ll
+++ b/llvm/test/CodeGen/AMDGPU/fmed3.ll
@@ -7938,13 +7938,13 @@ define amdgpu_kernel void @one_non_inline_constant(ptr addrspace(1) %out, ptr ad
 ; SI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-GISEL-NEXT:    s_mov_b64 s[4:5], s[2:3]
 ; SI-GISEL-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
-; SI-GISEL-NEXT:    s_mov_b32 s4, 0x41800000
+; SI-GISEL-NEXT:    v_mov_b32_e32 v3, 0x41800000
 ; SI-GISEL-NEXT:    s_mov_b64 s[2:3], s[6:7]
 ; SI-GISEL-NEXT:    s_mov_b32 s6, -1
 ; SI-GISEL-NEXT:    s_waitcnt vmcnt(0)
-; SI-GISEL-NEXT:    v_add_f32_e32 v3, 0.5, v2
+; SI-GISEL-NEXT:    v_add_f32_e32 v4, 0.5, v2
 ; SI-GISEL-NEXT:    v_add_f32_e32 v2, 0x41800000, v2
-; SI-GISEL-NEXT:    v_med3_f32 v3, v3, 1.0, s4
+; SI-GISEL-NEXT:    v_med3_f32 v3, v4, 1.0, v3
 ; SI-GISEL-NEXT:    buffer_store_dword v3, v[0:1], s[0:3], 0 addr64
 ; SI-GISEL-NEXT:    buffer_store_dword v2, off, s[4:7], 0
 ; SI-GISEL-NEXT:    s_waitcnt vmcnt(0)
@@ -7976,6 +7976,7 @@ define amdgpu_kernel void @one_non_inline_constant(ptr addrspace(1) %out, ptr ad
 ; VI-GISEL:       ; %bb.0:
 ; VI-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; VI-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
+; VI-GISEL-NEXT:    v_mov_b32_e32 v4, 0x41800000
 ; VI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v0, s2
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v1, s3
@@ -7983,50 +7984,33 @@ define amdgpu_kernel void @one_non_inline_constant(ptr addrspace(1) %out, ptr ad
 ; VI-GISEL-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-GISEL-NEXT:    flat_load_dword v3, v[0:1]
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v0, s0
-; VI-GISEL-NEXT:    s_mov_b32 s2, 0x41800000
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v1, s1
 ; VI-GISEL-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
 ; VI-GISEL-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; VI-GISEL-NEXT:    v_add_f32_e32 v2, 0.5, v3
-; VI-GISEL-NEXT:    v_med3_f32 v2, v2, 1.0, s2
+; VI-GISEL-NEXT:    v_med3_f32 v2, v2, 1.0, v4
 ; VI-GISEL-NEXT:    v_add_f32_e32 v3, 0x41800000, v3
 ; VI-GISEL-NEXT:    flat_store_dword v[0:1], v2
 ; VI-GISEL-NEXT:    flat_store_dword v[0:1], v3
 ; VI-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; VI-GISEL-NEXT:    s_endpgm
 ;
-; GFX9-SDAG-LABEL: one_non_inline_constant:
-; GFX9-SDAG:       ; %bb.0:
-; GFX9-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX9-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; GFX9-SDAG-NEXT:    v_mov_b32_e32 v2, 0x41800000
-; GFX9-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-SDAG-NEXT:    global_load_dword v1, v0, s[2:3]
-; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-SDAG-NEXT:    v_add_f32_e32 v3, 0.5, v1
-; GFX9-SDAG-NEXT:    v_add_f32_e32 v1, 0x41800000, v1
-; GFX9-SDAG-NEXT:    v_med3_f32 v2, v3, 1.0, v2
-; GFX9-SDAG-NEXT:    global_store_dword v0, v2, s[0:1]
-; GFX9-SDAG-NEXT:    global_store_dword v[0:1], v1, off
-; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-SDAG-NEXT:    s_endpgm
-;
-; GFX9-GISEL-LABEL: one_non_inline_constant:
-; GFX9-GISEL:       ; %bb.0:
-; GFX9-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-GISEL-NEXT:    global_load_dword v1, v0, s[2:3]
-; GFX9-GISEL-NEXT:    s_mov_b32 s2, 0x41800000
-; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-GISEL-NEXT:    v_add_f32_e32 v2, 0.5, v1
-; GFX9-GISEL-NEXT:    v_add_f32_e32 v1, 0x41800000, v1
-; GFX9-GISEL-NEXT:    v_med3_f32 v2, v2, 1.0, s2
-; GFX9-GISEL-NEXT:    global_store_dword v0, v2, s[0:1]
-; GFX9-GISEL-NEXT:    global_store_dword v[0:1], v1, off
-; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-GISEL-NEXT:    s_endpgm
+; GFX9-LABEL: one_non_inline_constant:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-NEXT:    v_mov_b32_e32 v2, 0x41800000
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_add_f32_e32 v3, 0.5, v1
+; GFX9-NEXT:    v_add_f32_e32 v1, 0x41800000, v1
+; GFX9-NEXT:    v_med3_f32 v2, v3, 1.0, v2
+; GFX9-NEXT:    global_store_dword v0, v2, s[0:1]
+; GFX9-NEXT:    global_store_dword v[0:1], v1, off
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: one_non_inline_constant:
 ; GFX11:       ; %bb.0:
@@ -8097,17 +8081,17 @@ define amdgpu_kernel void @two_non_inline_constant_multi_use(ptr addrspace(1) %o
 ; SI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-GISEL-NEXT:    s_mov_b64 s[4:5], s[2:3]
 ; SI-GISEL-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
-; SI-GISEL-NEXT:    s_mov_b32 s4, 0x41000000
-; SI-GISEL-NEXT:    v_mov_b32_e32 v3, 0x41800000
+; SI-GISEL-NEXT:    v_mov_b32_e32 v3, 0x41000000
+; SI-GISEL-NEXT:    v_mov_b32_e32 v4, 0x41800000
 ; SI-GISEL-NEXT:    s_mov_b64 s[2:3], s[6:7]
 ; SI-GISEL-NEXT:    s_mov_b32 s6, -1
 ; SI-GISEL-NEXT:    s_waitcnt vmcnt(0)
-; SI-GISEL-NEXT:    v_add_f32_e32 v4, 0.5, v2
-; SI-GISEL-NEXT:    v_add_f32_e32 v5, 0x41800000, v2
+; SI-GISEL-NEXT:    v_add_f32_e32 v5, 0.5, v2
+; SI-GISEL-NEXT:    v_add_f32_e32 v6, 0x41800000, v2
 ; SI-GISEL-NEXT:    v_add_f32_e32 v2, 0x41000000, v2
-; SI-GISEL-NEXT:    v_med3_f32 v3, v4, s4, v3
+; SI-GISEL-NEXT:    v_med3_f32 v3, v5, v3, v4
 ; SI-GISEL-NEXT:    buffer_store_dword v3, v[0:1], s[0:3], 0 addr64
-; SI-GISEL-NEXT:    buffer_store_dword v5, off, s[4:7], 0
+; SI-GISEL-NEXT:    buffer_store_dword v6, off, s[4:7], 0
 ; SI-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; SI-GISEL-NEXT:    buffer_store_dword v2, off, s[4:7], 0
 ; SI-GISEL-NEXT:    s_waitcnt vmcnt(0)
@@ -8143,7 +8127,8 @@ define amdgpu_kernel void @two_non_inline_constant_multi_use(ptr addrspace(1) %o
 ; VI-GISEL:       ; %bb.0:
 ; VI-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; VI-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
-; VI-GISEL-NEXT:    v_mov_b32_e32 v4, 0x41800000
+; VI-GISEL-NEXT:    v_mov_b32_e32 v4, 0x41000000
+; VI-GISEL-NEXT:    v_mov_b32_e32 v5, 0x41800000
 ; VI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v0, s2
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v1, s3
@@ -8151,41 +8136,60 @@ define amdgpu_kernel void @two_non_inline_constant_multi_use(ptr addrspace(1) %o
 ; VI-GISEL-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-GISEL-NEXT:    flat_load_dword v3, v[0:1]
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v0, s0
-; VI-GISEL-NEXT:    s_mov_b32 s2, 0x41000000
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v1, s1
 ; VI-GISEL-NEXT:    v_add_u32_e32 v0, vcc, v0, v2
 ; VI-GISEL-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
 ; VI-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; VI-GISEL-NEXT:    v_add_f32_e32 v2, 0.5, v3
-; VI-GISEL-NEXT:    v_med3_f32 v2, v2, s2, v4
-; VI-GISEL-NEXT:    v_add_f32_e32 v5, 0x41800000, v3
+; VI-GISEL-NEXT:    v_med3_f32 v2, v2, v4, v5
+; VI-GISEL-NEXT:    v_add_f32_e32 v6, 0x41800000, v3
 ; VI-GISEL-NEXT:    v_add_f32_e32 v3, 0x41000000, v3
 ; VI-GISEL-NEXT:    flat_store_dword v[0:1], v2
-; VI-GISEL-NEXT:    flat_store_dword v[0:1], v5
+; VI-GISEL-NEXT:    flat_store_dword v[0:1], v6
 ; VI-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; VI-GISEL-NEXT:    flat_store_dword v[0:1], v3
 ; VI-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; VI-GISEL-NEXT:    s_endpgm
 ;
-; GFX9-LABEL: two_non_inline_constant_multi_use:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; GFX9-NEXT:    v_mov_b32_e32 v2, 0x41800000
-; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_dword v1, v0, s[2:3]
-; GFX9-NEXT:    s_mov_b32 s2, 0x41000000
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_add_f32_e32 v3, 0.5, v1
-; GFX9-NEXT:    v_add_f32_e32 v4, 0x41800000, v1
-; GFX9-NEXT:    v_add_f32_e32 v1, 0x41000000, v1
-; GFX9-NEXT:    v_med3_f32 v2, v3, s2, v2
-; GFX9-NEXT:    global_store_dword v0, v2, s[0:1]
-; GFX9-NEXT:    global_store_dword v[0:1], v4, off
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    global_store_dword v[0:1], v1, off
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    s_endpgm
+; GFX9-SDAG-LABEL: two_non_inline_constant_multi_use:
+; GFX9-SDAG:       ; %bb.0:
+; GFX9-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-SDAG-NEXT:    v_mov_b32_e32 v2, 0x41800000
+; GFX9-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-SDAG-NEXT:    global_load_dword v1, v0, s[2:3]
+; GFX9-SDAG-NEXT:    s_mov_b32 s2, 0x41000000
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-SDAG-NEXT:    v_add_f32_e32 v3, 0.5, v1
+; GFX9-SDAG-NEXT:    v_add_f32_e32 v4, 0x41800000, v1
+; GFX9-SDAG-NEXT:    v_add_f32_e32 v1, 0x41000000, v1
+; GFX9-SDAG-NEXT:    v_med3_f32 v2, v3, s2, v2
+; GFX9-SDAG-NEXT:    global_store_dword v0, v2, s[0:1]
+; GFX9-SDAG-NEXT:    global_store_dword v[0:1], v4, off
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-SDAG-NEXT:    global_store_dword v[0:1], v1, off
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-SDAG-NEXT:    s_endpgm
+;
+; GFX9-GISEL-LABEL: two_non_inline_constant_multi_use:
+; GFX9-GISEL:       ; %bb.0:
+; GFX9-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
+; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v2, 0x41000000
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v3, 0x41800000
+; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-GISEL-NEXT:    global_load_dword v1, v0, s[2:3]
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-GISEL-NEXT:    v_add_f32_e32 v4, 0.5, v1
+; GFX9-GISEL-NEXT:    v_add_f32_e32 v5, 0x41800000, v1
+; GFX9-GISEL-NEXT:    v_add_f32_e32 v1, 0x41000000, v1
+; GFX9-GISEL-NEXT:    v_med3_f32 v2, v4, v2, v3
+; GFX9-GISEL-NEXT:    global_store_dword v0, v2, s[0:1]
+; GFX9-GISEL-NEXT:    global_store_dword v[0:1], v5, off
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-GISEL-NEXT:    global_store_dword v[0:1], v1, off
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-GISEL-NEXT:    s_endpgm
 ;
 ; GFX11-SDAG-LABEL: two_non_inline_constant_multi_use:
 ; GFX11-SDAG:       ; %bb.0:
@@ -8215,13 +8219,12 @@ define amdgpu_kernel void @two_non_inline_constant_multi_use(ptr addrspace(1) %o
 ; GFX11-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX11-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    global_load_b32 v1, v0, s[2:3]
-; GFX11-GISEL-NEXT:    s_mov_b32 s2, 0x41800000
 ; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX11-GISEL-NEXT:    v_dual_mov_b32 v2, 0x41800000 :: v_dual_add_f32 v3, 0.5, v1
+; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-GISEL-NEXT:    v_med3_f32 v2, v3, 0x41000000, v2
 ; GFX11-GISEL-NEXT:    v_add_f32_e32 v3, 0x41800000, v1
-; GFX11-GISEL-NEXT:    v_add_f32_e32 v2, 0.5, v1
 ; GFX11-GISEL-NEXT:    v_add_f32_e32 v1, 0x41000000, v1
-; GFX11-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2)
-; GFX11-GISEL-NEXT:    v_med3_f32 v2, v2, 0x41000000, s2
 ; GFX11-GISEL-NEXT:    global_store_b32 v0, v2, s[0:1]
 ; GFX11-GISEL-NEXT:    global_store_b32 v[0:1], v3, off dlc
 ; GFX11-GISEL-NEXT:    s_waitcnt_vscnt null, 0x0
diff --git a/llvm/test/CodeGen/AMDGPU/fmul-to-ldexp.ll b/llvm/test/CodeGen/AMDGPU/fmul-to-ldexp.ll
index d9e0ddd3b904486..5216f565c862097 100644
--- a/llvm/test/CodeGen/AMDGPU/fmul-to-ldexp.ll
+++ b/llvm/test/CodeGen/AMDGPU/fmul-to-ldexp.ll
@@ -607,13 +607,21 @@ define <2 x float> @v_mul_neg16_v2f32(<2 x float> %x) {
 }
 
 define <2 x float> @v_mul_fabs_16_v2f32(<2 x float> %x) {
-; GFX9-LABEL: v_mul_fabs_16_v2f32:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    s_mov_b32 s4, 0x41800000
-; GFX9-NEXT:    v_mul_f32_e64 v0, |v0|, s4
-; GFX9-NEXT:    v_mul_f32_e64 v1, |v1|, s4
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX9-SDAG-LABEL: v_mul_fabs_16_v2f32:
+; GFX9-SDAG:       ; %bb.0:
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-SDAG-NEXT:    s_mov_b32 s4, 0x41800000
+; GFX9-SDAG-NEXT:    v_mul_f32_e64 v0, |v0|, s4
+; GFX9-SDAG-NEXT:    v_mul_f32_e64 v1, |v1|, s4
+; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-GISEL-LABEL: v_mul_fabs_16_v2f32:
+; GFX9-GISEL:       ; %bb.0:
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v2, 0x41800000
+; GFX9-GISEL-NEXT:    v_mul_f32_e64 v0, |v0|, v2
+; GFX9-GISEL-NEXT:    v_mul_f32_e64 v1, |v1|, v2
+; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1011-LABEL: v_mul_fabs_16_v2f32:
 ; GFX1011:       ; %bb.0:
@@ -627,13 +635,21 @@ define <2 x float> @v_mul_fabs_16_v2f32(<2 x float> %x) {
 }
 
 define <2 x float> @v_fma_mul_add_32_v2f32(<2 x float> %x, <2 x float> %y) {
-; GFX9-LABEL: v_fma_mul_add_32_v2f32:
-; GFX9:       ; %bb.0:
-; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    s_mov_b32 s4, 0x42000000
-; GFX9-NEXT:    v_fma_f32 v0, v0, s4, v2
-; GFX9-NEXT:    v_fma_f32 v1, v1, s4, v3
-; GFX9-NEXT:    s_setpc_b64 s[30:31]
+; GFX9-SDAG-LABEL: v_fma_mul_add_32_v2f32:
+; GFX9-SDAG:       ; %bb.0:
+; GFX9-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-SDAG-NEXT:    s_mov_b32 s4, 0x42000000
+; GFX9-SDAG-NEXT:    v_fma_f32 v0, v0, s4, v2
+; GFX9-SDAG-NEXT:    v_fma_f32 v1, v1, s4, v3
+; GFX9-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX9-GISEL-LABEL: v_fma_mul_add_32_v2f32:
+; GFX9-GISEL:       ; %bb.0:
+; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v4, 0x42000000
+; GFX9-GISEL-NEXT:    v_fma_f32 v0, v0, v4, v2
+; GFX9-GISEL-NEXT:    v_fma_f32 v1, v1, v4, v3
+; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_fma_mul_add_32_v2f32:
 ; GFX10:       ; %bb.0:
diff --git a/llvm/test/CodeGen/AMDGPU/fold-cndmask-wave32.mir b/llvm/test/CodeGen/AMDGPU/fold-cndmask-wave32.mir
index 91165685d30a22e..473a37d33ed5d40 100644
--- a/llvm/test/CodeGen/AMDGPU/fold-cndmask-wave32.mir
+++ b/llvm/test/CodeGen/AMDGPU/fold-cndmask-wave32.mir
@@ -9,7 +9,6 @@ body:             |
   bb.0.entry:
     ; CHECK-LABEL: name: fold_cndmask
     ; CHECK: [[DEF:%[0-9]+]]:sreg_32_xm0_xexec = IMPLICIT_DEF
-    ; CHECK-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0
     ; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
     ; CHECK-NEXT: S_ENDPGM 0, implicit [[V_MOV_B32_e32_]]
     %0:sreg_32_xm0_xexec = IMPLICIT_DEF
diff --git a/llvm/test/CodeGen/AMDGPU/fold-imm-copy.mir b/llvm/test/CodeGen/AMDGPU/fold-imm-copy.mir
index 8f15402e516d1ba..a5ba78f018c064a 100644
--- a/llvm/test/CodeGen/AMDGPU/fold-imm-copy.mir
+++ b/llvm/test/CodeGen/AMDGPU/fold-imm-copy.mir
@@ -1,8 +1,7 @@
 # RUN: llc -march=amdgcn -run-pass si-fold-operands -verify-machineinstrs %s -o - | FileCheck -check-prefix=GCN %s
 
 # GCN-LABEL:       name: fold-imm-copy
-# GCN:             [[SREG:%[0-9]+]]:sreg_32_xm0 = S_MOV_B32 65535
-# GCN:             V_AND_B32_e32 [[SREG]]
+# GCN:             V_AND_B32_e32 65535
 
 ---
 name: fold-imm-copy
diff --git a/llvm/test/CodeGen/AMDGPU/fold-int-pow2-with-fmul-or-fdiv.ll b/llvm/test/CodeGen/AMDGPU/fold-int-pow2-with-fmul-or-fdiv.ll
index b60780db77378c7..0a74efe79bee7be 100644
--- a/llvm/test/CodeGen/AMDGPU/fold-int-pow2-with-fmul-or-fdiv.ll
+++ b/llvm/test/CodeGen/AMDGPU/fold-int-pow2-with-fmul-or-fdiv.ll
@@ -2030,34 +2030,33 @@ define <2 x double> @fdiv_pow_shl_cnt_vec(<2 x i64> %cnt) nounwind {
 ; VI-NEXT:    v_lshlrev_b32_e32 v1, 20, v0
 ; VI-NEXT:    v_mov_b32_e32 v3, 0x3ff00000
 ; VI-NEXT:    v_sub_u32_e64 v0, vcc, 0, 0
-; VI-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
-; VI-NEXT:    v_subb_u32_e64 v1, s[4:5], v3, v1, vcc
-; VI-NEXT:    v_subb_u32_e32 v3, vcc, v3, v2, vcc
-; VI-NEXT:    v_mov_b32_e32 v2, v0
+; VI-NEXT:    v_subb_u32_e32 v1, vcc, v3, v1, vcc
+; VI-NEXT:    v_lshlrev_b32_e32 v4, 20, v2
+; VI-NEXT:    v_sub_u32_e64 v2, vcc, 0, 0
+; VI-NEXT:    v_subb_u32_e32 v3, vcc, v3, v4, vcc
 ; VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: fdiv_pow_shl_cnt_vec:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 20, v0
-; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
+; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 20, v2
 ; GFX10-NEXT:    v_sub_co_u32 v0, vcc_lo, 0, 0
-; GFX10-NEXT:    v_sub_co_ci_u32_e64 v1, s4, 0x3ff00000, v1, vcc_lo
-; GFX10-NEXT:    v_sub_co_ci_u32_e32 v3, vcc_lo, 0x3ff00000, v2, vcc_lo
-; GFX10-NEXT:    v_mov_b32_e32 v2, v0
+; GFX10-NEXT:    v_sub_co_ci_u32_e32 v1, vcc_lo, 0x3ff00000, v1, vcc_lo
+; GFX10-NEXT:    v_sub_co_u32 v2, vcc_lo, 0, 0
+; GFX10-NEXT:    v_sub_co_ci_u32_e32 v3, vcc_lo, 0x3ff00000, v3, vcc_lo
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: fdiv_pow_shl_cnt_vec:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 20, v0
-; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 20, v2
+; GFX11-NEXT:    v_lshlrev_b32_e32 v3, 20, v2
 ; GFX11-NEXT:    v_sub_co_u32 v0, vcc_lo, 0, 0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_sub_co_ci_u32_e64 v1, s0, 0x3ff00000, v1, vcc_lo
-; GFX11-NEXT:    v_sub_co_ci_u32_e32 v3, vcc_lo, 0x3ff00000, v2, vcc_lo
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3)
-; GFX11-NEXT:    v_mov_b32_e32 v2, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_sub_co_ci_u32_e32 v1, vcc_lo, 0x3ff00000, v1, vcc_lo
+; GFX11-NEXT:    v_sub_co_u32 v2, vcc_lo, 0, 0
+; GFX11-NEXT:    v_sub_co_ci_u32_e32 v3, vcc_lo, 0x3ff00000, v3, vcc_lo
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %shl = shl nuw <2 x i64> <i64 1, i64 1>, %cnt
   %conv = uitofp <2 x i64> %shl to <2 x double>
diff --git a/llvm/test/CodeGen/AMDGPU/fold-readlane.mir b/llvm/test/CodeGen/AMDGPU/fold-readlane.mir
index b76cf45c785f41b..25502a5b39dea8f 100644
--- a/llvm/test/CodeGen/AMDGPU/fold-readlane.mir
+++ b/llvm/test/CodeGen/AMDGPU/fold-readlane.mir
@@ -27,8 +27,7 @@ body:             |
 ...
 
 # GCN-LABEL: name: fold-imm-readfirstlane-readfirstlane{{$}}
-# GCN: %1:sreg_32_xm0 = S_MOV_B32 123
-# GCN: %3:sreg_32_xm0 = COPY %1
+# GCN: %3:sreg_32_xm0 = S_MOV_B32 123
 
 ---
 name: fold-imm-readfirstlane-readfirstlane
diff --git a/llvm/test/CodeGen/AMDGPU/fptoui.f16.ll b/llvm/test/CodeGen/AMDGPU/fptoui.f16.ll
index ead59312b89d38a..77198ed90c3ffad 100644
--- a/llvm/test/CodeGen/AMDGPU/fptoui.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fptoui.f16.ll
@@ -387,7 +387,6 @@ define amdgpu_kernel void @fptoui_v2f16_to_v2i64(
 ; SI-NEXT:    buffer_load_dword v0, off, s[8:11], 0
 ; SI-NEXT:    s_mov_b32 s4, s0
 ; SI-NEXT:    s_mov_b32 s5, s1
-; SI-NEXT:    v_mov_b32_e32 v3, 0
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
 ; SI-NEXT:    v_cvt_f32_f16_e32 v0, v0
@@ -395,6 +394,7 @@ define amdgpu_kernel void @fptoui_v2f16_to_v2i64(
 ; SI-NEXT:    v_cvt_u32_f32_e32 v0, v0
 ; SI-NEXT:    v_cvt_u32_f32_e32 v2, v1
 ; SI-NEXT:    v_mov_b32_e32 v1, 0
+; SI-NEXT:    v_mov_b32_e32 v3, v1
 ; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
 ; SI-NEXT:    s_endpgm
 ;
@@ -411,13 +411,13 @@ define amdgpu_kernel void @fptoui_v2f16_to_v2i64(
 ; VI-NEXT:    buffer_load_dword v0, off, s[8:11], 0
 ; VI-NEXT:    s_mov_b32 s4, s0
 ; VI-NEXT:    s_mov_b32 s5, s1
-; VI-NEXT:    v_mov_b32_e32 v3, 0
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_cvt_f32_f16_e32 v1, v0
 ; VI-NEXT:    v_cvt_f32_f16_sdwa v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; VI-NEXT:    v_cvt_u32_f32_e32 v0, v1
 ; VI-NEXT:    v_cvt_u32_f32_e32 v2, v2
 ; VI-NEXT:    v_mov_b32_e32 v1, 0
+; VI-NEXT:    v_mov_b32_e32 v3, v1
 ; VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
 ; VI-NEXT:    s_endpgm
 ;
@@ -428,7 +428,6 @@ define amdgpu_kernel void @fptoui_v2f16_to_v2i64(
 ; GFX11-NEXT:    s_mov_b32 s7, 0x31016000
 ; GFX11-NEXT:    s_mov_b32 s10, s6
 ; GFX11-NEXT:    s_mov_b32 s11, s7
-; GFX11-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    s_mov_b32 s8, s2
 ; GFX11-NEXT:    s_mov_b32 s9, s3
@@ -442,8 +441,9 @@ define amdgpu_kernel void @fptoui_v2f16_to_v2i64(
 ; GFX11-NEXT:    v_cvt_f32_f16_e32 v2, v1
 ; GFX11-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX11-NEXT:    v_cvt_u32_f32_e32 v0, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3)
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
 ; GFX11-NEXT:    v_cvt_u32_f32_e32 v2, v2
+; GFX11-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX11-NEXT:    buffer_store_b128 v[0:3], off, s[4:7], 0
 ; GFX11-NEXT:    s_nop 0
 ; GFX11-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
diff --git a/llvm/test/CodeGen/AMDGPU/fsqrt.f32.ll b/llvm/test/CodeGen/AMDGPU/fsqrt.f32.ll
index 13e588dffaf5c18..046f26246969589 100644
--- a/llvm/test/CodeGen/AMDGPU/fsqrt.f32.ll
+++ b/llvm/test/CodeGen/AMDGPU/fsqrt.f32.ll
@@ -1347,35 +1347,34 @@ define <2 x float> @v_sqrt_v2f32(<2 x float> %x) {
 ; GISEL-IEEE-LABEL: v_sqrt_v2f32:
 ; GISEL-IEEE:       ; %bb.0:
 ; GISEL-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-IEEE-NEXT:    s_mov_b32 s4, 0xf800000
-; GISEL-IEEE-NEXT:    v_mul_f32_e32 v2, 0x4f800000, v0
-; GISEL-IEEE-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v0
-; GISEL-IEEE-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
-; GISEL-IEEE-NEXT:    v_sqrt_f32_e32 v2, v0
-; GISEL-IEEE-NEXT:    v_mov_b32_e32 v3, 0xf800000
-; GISEL-IEEE-NEXT:    v_add_i32_e64 v4, s[4:5], -1, v2
-; GISEL-IEEE-NEXT:    v_fma_f32 v5, -v4, v2, v0
-; GISEL-IEEE-NEXT:    v_add_i32_e64 v6, s[4:5], 1, v2
-; GISEL-IEEE-NEXT:    v_fma_f32 v7, -v6, v2, v0
+; GISEL-IEEE-NEXT:    v_mov_b32_e32 v2, 0xf800000
+; GISEL-IEEE-NEXT:    v_mul_f32_e32 v3, 0x4f800000, v0
+; GISEL-IEEE-NEXT:    v_cmp_gt_f32_e32 vcc, v2, v0
+; GISEL-IEEE-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
+; GISEL-IEEE-NEXT:    v_sqrt_f32_e32 v3, v0
+; GISEL-IEEE-NEXT:    v_add_i32_e64 v4, s[4:5], -1, v3
+; GISEL-IEEE-NEXT:    v_fma_f32 v5, -v4, v3, v0
+; GISEL-IEEE-NEXT:    v_add_i32_e64 v6, s[4:5], 1, v3
+; GISEL-IEEE-NEXT:    v_fma_f32 v7, -v6, v3, v0
 ; GISEL-IEEE-NEXT:    v_cmp_ge_f32_e64 s[4:5], 0, v5
-; GISEL-IEEE-NEXT:    v_cndmask_b32_e64 v2, v2, v4, s[4:5]
+; GISEL-IEEE-NEXT:    v_cndmask_b32_e64 v3, v3, v4, s[4:5]
 ; GISEL-IEEE-NEXT:    v_cmp_lt_f32_e64 s[4:5], 0, v7
-; GISEL-IEEE-NEXT:    v_cndmask_b32_e64 v2, v2, v6, s[4:5]
-; GISEL-IEEE-NEXT:    v_mul_f32_e32 v4, 0x37800000, v2
-; GISEL-IEEE-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
+; GISEL-IEEE-NEXT:    v_cndmask_b32_e64 v3, v3, v6, s[4:5]
+; GISEL-IEEE-NEXT:    v_mul_f32_e32 v4, 0x37800000, v3
+; GISEL-IEEE-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
 ; GISEL-IEEE-NEXT:    v_mul_f32_e32 v5, 0x4f800000, v1
-; GISEL-IEEE-NEXT:    v_cmp_gt_f32_e32 vcc, v3, v1
+; GISEL-IEEE-NEXT:    v_cmp_gt_f32_e32 vcc, v2, v1
 ; GISEL-IEEE-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc
-; GISEL-IEEE-NEXT:    v_sqrt_f32_e32 v3, v1
+; GISEL-IEEE-NEXT:    v_sqrt_f32_e32 v2, v1
 ; GISEL-IEEE-NEXT:    v_mov_b32_e32 v4, 0x260
 ; GISEL-IEEE-NEXT:    v_cmp_class_f32_e64 s[4:5], v0, v4
-; GISEL-IEEE-NEXT:    v_cndmask_b32_e64 v0, v2, v0, s[4:5]
-; GISEL-IEEE-NEXT:    v_add_i32_e64 v2, s[4:5], -1, v3
-; GISEL-IEEE-NEXT:    v_fma_f32 v5, -v2, v3, v1
-; GISEL-IEEE-NEXT:    v_add_i32_e64 v6, s[4:5], 1, v3
-; GISEL-IEEE-NEXT:    v_fma_f32 v7, -v6, v3, v1
+; GISEL-IEEE-NEXT:    v_cndmask_b32_e64 v0, v3, v0, s[4:5]
+; GISEL-IEEE-NEXT:    v_add_i32_e64 v3, s[4:5], -1, v2
+; GISEL-IEEE-NEXT:    v_fma_f32 v5, -v3, v2, v1
+; GISEL-IEEE-NEXT:    v_add_i32_e64 v6, s[4:5], 1, v2
+; GISEL-IEEE-NEXT:    v_fma_f32 v7, -v6, v2, v1
 ; GISEL-IEEE-NEXT:    v_cmp_ge_f32_e64 s[4:5], 0, v5
-; GISEL-IEEE-NEXT:    v_cndmask_b32_e64 v2, v3, v2, s[4:5]
+; GISEL-IEEE-NEXT:    v_cndmask_b32_e64 v2, v2, v3, s[4:5]
 ; GISEL-IEEE-NEXT:    v_cmp_lt_f32_e64 s[4:5], 0, v7
 ; GISEL-IEEE-NEXT:    v_cndmask_b32_e64 v2, v2, v6, s[4:5]
 ; GISEL-IEEE-NEXT:    v_mul_f32_e32 v3, 0x37800000, v2
@@ -1424,35 +1423,34 @@ define <2 x float> @v_sqrt_v2f32(<2 x float> %x) {
 ; GISEL-DAZ-LABEL: v_sqrt_v2f32:
 ; GISEL-DAZ:       ; %bb.0:
 ; GISEL-DAZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-DAZ-NEXT:    s_mov_b32 s4, 0xf800000
-; GISEL-DAZ-NEXT:    v_mul_f32_e32 v2, 0x4f800000, v0
-; GISEL-DAZ-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v0
-; GISEL-DAZ-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
-; GISEL-DAZ-NEXT:    v_rsq_f32_e32 v2, v0
-; GISEL-DAZ-NEXT:    v_mov_b32_e32 v3, 0xf800000
-; GISEL-DAZ-NEXT:    v_mul_f32_e32 v4, v0, v2
-; GISEL-DAZ-NEXT:    v_mul_f32_e32 v2, 0.5, v2
-; GISEL-DAZ-NEXT:    v_fma_f32 v5, -v2, v4, 0.5
+; GISEL-DAZ-NEXT:    v_mov_b32_e32 v2, 0xf800000
+; GISEL-DAZ-NEXT:    v_mul_f32_e32 v3, 0x4f800000, v0
+; GISEL-DAZ-NEXT:    v_cmp_gt_f32_e32 vcc, v2, v0
+; GISEL-DAZ-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
+; GISEL-DAZ-NEXT:    v_rsq_f32_e32 v3, v0
+; GISEL-DAZ-NEXT:    v_mul_f32_e32 v4, v0, v3
+; GISEL-DAZ-NEXT:    v_mul_f32_e32 v3, 0.5, v3
+; GISEL-DAZ-NEXT:    v_fma_f32 v5, -v3, v4, 0.5
 ; GISEL-DAZ-NEXT:    v_fma_f32 v4, v4, v5, v4
-; GISEL-DAZ-NEXT:    v_fma_f32 v2, v2, v5, v2
+; GISEL-DAZ-NEXT:    v_fma_f32 v3, v3, v5, v3
 ; GISEL-DAZ-NEXT:    v_fma_f32 v5, -v4, v4, v0
-; GISEL-DAZ-NEXT:    v_fma_f32 v2, v5, v2, v4
-; GISEL-DAZ-NEXT:    v_mul_f32_e32 v4, 0x37800000, v2
-; GISEL-DAZ-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
+; GISEL-DAZ-NEXT:    v_fma_f32 v3, v5, v3, v4
+; GISEL-DAZ-NEXT:    v_mul_f32_e32 v4, 0x37800000, v3
+; GISEL-DAZ-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
 ; GISEL-DAZ-NEXT:    v_mul_f32_e32 v4, 0x4f800000, v1
-; GISEL-DAZ-NEXT:    v_cmp_gt_f32_e32 vcc, v3, v1
+; GISEL-DAZ-NEXT:    v_cmp_gt_f32_e32 vcc, v2, v1
 ; GISEL-DAZ-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
-; GISEL-DAZ-NEXT:    v_rsq_f32_e32 v3, v1
+; GISEL-DAZ-NEXT:    v_rsq_f32_e32 v2, v1
 ; GISEL-DAZ-NEXT:    v_mov_b32_e32 v4, 0x260
 ; GISEL-DAZ-NEXT:    v_cmp_class_f32_e64 s[4:5], v0, v4
-; GISEL-DAZ-NEXT:    v_cndmask_b32_e64 v0, v2, v0, s[4:5]
-; GISEL-DAZ-NEXT:    v_mul_f32_e32 v2, v1, v3
-; GISEL-DAZ-NEXT:    v_mul_f32_e32 v3, 0.5, v3
-; GISEL-DAZ-NEXT:    v_fma_f32 v5, -v3, v2, 0.5
-; GISEL-DAZ-NEXT:    v_fma_f32 v2, v2, v5, v2
+; GISEL-DAZ-NEXT:    v_cndmask_b32_e64 v0, v3, v0, s[4:5]
+; GISEL-DAZ-NEXT:    v_mul_f32_e32 v3, v1, v2
+; GISEL-DAZ-NEXT:    v_mul_f32_e32 v2, 0.5, v2
+; GISEL-DAZ-NEXT:    v_fma_f32 v5, -v2, v3, 0.5
 ; GISEL-DAZ-NEXT:    v_fma_f32 v3, v3, v5, v3
-; GISEL-DAZ-NEXT:    v_fma_f32 v5, -v2, v2, v1
-; GISEL-DAZ-NEXT:    v_fma_f32 v2, v5, v3, v2
+; GISEL-DAZ-NEXT:    v_fma_f32 v2, v2, v5, v2
+; GISEL-DAZ-NEXT:    v_fma_f32 v5, -v3, v3, v1
+; GISEL-DAZ-NEXT:    v_fma_f32 v2, v5, v2, v3
 ; GISEL-DAZ-NEXT:    v_mul_f32_e32 v3, 0x37800000, v2
 ; GISEL-DAZ-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc
 ; GISEL-DAZ-NEXT:    v_cmp_class_f32_e32 vcc, v1, v4
@@ -1521,51 +1519,50 @@ define <3 x float> @v_sqrt_v3f32(<3 x float> %x) {
 ; GISEL-IEEE-LABEL: v_sqrt_v3f32:
 ; GISEL-IEEE:       ; %bb.0:
 ; GISEL-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-IEEE-NEXT:    s_mov_b32 s4, 0xf800000
-; GISEL-IEEE-NEXT:    v_mul_f32_e32 v3, 0x4f800000, v0
-; GISEL-IEEE-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v0
-; GISEL-IEEE-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
-; GISEL-IEEE-NEXT:    v_sqrt_f32_e32 v3, v0
-; GISEL-IEEE-NEXT:    v_mov_b32_e32 v4, 0xf800000
-; GISEL-IEEE-NEXT:    v_add_i32_e64 v5, s[4:5], -1, v3
-; GISEL-IEEE-NEXT:    v_fma_f32 v6, -v5, v3, v0
-; GISEL-IEEE-NEXT:    v_add_i32_e64 v7, s[4:5], 1, v3
-; GISEL-IEEE-NEXT:    v_fma_f32 v8, -v7, v3, v0
+; GISEL-IEEE-NEXT:    v_mov_b32_e32 v3, 0xf800000
+; GISEL-IEEE-NEXT:    v_mul_f32_e32 v4, 0x4f800000, v0
+; GISEL-IEEE-NEXT:    v_cmp_gt_f32_e32 vcc, v3, v0
+; GISEL-IEEE-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
+; GISEL-IEEE-NEXT:    v_sqrt_f32_e32 v4, v0
+; GISEL-IEEE-NEXT:    v_add_i32_e64 v5, s[4:5], -1, v4
+; GISEL-IEEE-NEXT:    v_fma_f32 v6, -v5, v4, v0
+; GISEL-IEEE-NEXT:    v_add_i32_e64 v7, s[4:5], 1, v4
+; GISEL-IEEE-NEXT:    v_fma_f32 v8, -v7, v4, v0
 ; GISEL-IEEE-NEXT:    v_cmp_ge_f32_e64 s[4:5], 0, v6
-; GISEL-IEEE-NEXT:    v_cndmask_b32_e64 v3, v3, v5, s[4:5]
+; GISEL-IEEE-NEXT:    v_cndmask_b32_e64 v4, v4, v5, s[4:5]
 ; GISEL-IEEE-NEXT:    v_cmp_lt_f32_e64 s[4:5], 0, v8
-; GISEL-IEEE-NEXT:    v_cndmask_b32_e64 v3, v3, v7, s[4:5]
-; GISEL-IEEE-NEXT:    v_mul_f32_e32 v5, 0x37800000, v3
-; GISEL-IEEE-NEXT:    v_cndmask_b32_e32 v3, v3, v5, vcc
+; GISEL-IEEE-NEXT:    v_cndmask_b32_e64 v4, v4, v7, s[4:5]
+; GISEL-IEEE-NEXT:    v_mul_f32_e32 v5, 0x37800000, v4
+; GISEL-IEEE-NEXT:    v_cndmask_b32_e32 v4, v4, v5, vcc
 ; GISEL-IEEE-NEXT:    v_mul_f32_e32 v6, 0x4f800000, v1
-; GISEL-IEEE-NEXT:    v_cmp_gt_f32_e32 vcc, v4, v1
+; GISEL-IEEE-NEXT:    v_cmp_gt_f32_e32 vcc, v3, v1
 ; GISEL-IEEE-NEXT:    v_cndmask_b32_e32 v1, v1, v6, vcc
 ; GISEL-IEEE-NEXT:    v_sqrt_f32_e32 v6, v1
 ; GISEL-IEEE-NEXT:    v_mov_b32_e32 v5, 0x260
 ; GISEL-IEEE-NEXT:    v_cmp_class_f32_e64 s[4:5], v0, v5
-; GISEL-IEEE-NEXT:    v_cndmask_b32_e64 v0, v3, v0, s[4:5]
-; GISEL-IEEE-NEXT:    v_add_i32_e64 v3, s[4:5], -1, v6
-; GISEL-IEEE-NEXT:    v_fma_f32 v7, -v3, v6, v1
+; GISEL-IEEE-NEXT:    v_cndmask_b32_e64 v0, v4, v0, s[4:5]
+; GISEL-IEEE-NEXT:    v_add_i32_e64 v4, s[4:5], -1, v6
+; GISEL-IEEE-NEXT:    v_fma_f32 v7, -v4, v6, v1
 ; GISEL-IEEE-NEXT:    v_add_i32_e64 v8, s[4:5], 1, v6
 ; GISEL-IEEE-NEXT:    v_fma_f32 v9, -v8, v6, v1
 ; GISEL-IEEE-NEXT:    v_cmp_ge_f32_e64 s[4:5], 0, v7
-; GISEL-IEEE-NEXT:    v_cndmask_b32_e64 v3, v6, v3, s[4:5]
+; GISEL-IEEE-NEXT:    v_cndmask_b32_e64 v4, v6, v4, s[4:5]
 ; GISEL-IEEE-NEXT:    v_cmp_lt_f32_e64 s[4:5], 0, v9
-; GISEL-IEEE-NEXT:    v_cndmask_b32_e64 v3, v3, v8, s[4:5]
-; GISEL-IEEE-NEXT:    v_mul_f32_e32 v6, 0x37800000, v3
-; GISEL-IEEE-NEXT:    v_cndmask_b32_e32 v3, v3, v6, vcc
+; GISEL-IEEE-NEXT:    v_cndmask_b32_e64 v4, v4, v8, s[4:5]
+; GISEL-IEEE-NEXT:    v_mul_f32_e32 v6, 0x37800000, v4
+; GISEL-IEEE-NEXT:    v_cndmask_b32_e32 v4, v4, v6, vcc
 ; GISEL-IEEE-NEXT:    v_mul_f32_e32 v6, 0x4f800000, v2
-; GISEL-IEEE-NEXT:    v_cmp_gt_f32_e32 vcc, v4, v2
+; GISEL-IEEE-NEXT:    v_cmp_gt_f32_e32 vcc, v3, v2
 ; GISEL-IEEE-NEXT:    v_cndmask_b32_e32 v2, v2, v6, vcc
-; GISEL-IEEE-NEXT:    v_sqrt_f32_e32 v4, v2
+; GISEL-IEEE-NEXT:    v_sqrt_f32_e32 v3, v2
 ; GISEL-IEEE-NEXT:    v_cmp_class_f32_e64 s[4:5], v1, v5
-; GISEL-IEEE-NEXT:    v_cndmask_b32_e64 v1, v3, v1, s[4:5]
-; GISEL-IEEE-NEXT:    v_add_i32_e64 v3, s[4:5], -1, v4
-; GISEL-IEEE-NEXT:    v_fma_f32 v6, -v3, v4, v2
-; GISEL-IEEE-NEXT:    v_add_i32_e64 v7, s[4:5], 1, v4
-; GISEL-IEEE-NEXT:    v_fma_f32 v8, -v7, v4, v2
+; GISEL-IEEE-NEXT:    v_cndmask_b32_e64 v1, v4, v1, s[4:5]
+; GISEL-IEEE-NEXT:    v_add_i32_e64 v4, s[4:5], -1, v3
+; GISEL-IEEE-NEXT:    v_fma_f32 v6, -v4, v3, v2
+; GISEL-IEEE-NEXT:    v_add_i32_e64 v7, s[4:5], 1, v3
+; GISEL-IEEE-NEXT:    v_fma_f32 v8, -v7, v3, v2
 ; GISEL-IEEE-NEXT:    v_cmp_ge_f32_e64 s[4:5], 0, v6
-; GISEL-IEEE-NEXT:    v_cndmask_b32_e64 v3, v4, v3, s[4:5]
+; GISEL-IEEE-NEXT:    v_cndmask_b32_e64 v3, v3, v4, s[4:5]
 ; GISEL-IEEE-NEXT:    v_cmp_lt_f32_e64 s[4:5], 0, v8
 ; GISEL-IEEE-NEXT:    v_cndmask_b32_e64 v3, v3, v7, s[4:5]
 ; GISEL-IEEE-NEXT:    v_mul_f32_e32 v4, 0x37800000, v3
@@ -1629,50 +1626,49 @@ define <3 x float> @v_sqrt_v3f32(<3 x float> %x) {
 ; GISEL-DAZ-LABEL: v_sqrt_v3f32:
 ; GISEL-DAZ:       ; %bb.0:
 ; GISEL-DAZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-DAZ-NEXT:    s_mov_b32 s4, 0xf800000
-; GISEL-DAZ-NEXT:    v_mul_f32_e32 v3, 0x4f800000, v0
-; GISEL-DAZ-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v0
-; GISEL-DAZ-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
-; GISEL-DAZ-NEXT:    v_rsq_f32_e32 v3, v0
-; GISEL-DAZ-NEXT:    v_mov_b32_e32 v4, 0xf800000
-; GISEL-DAZ-NEXT:    v_mul_f32_e32 v5, v0, v3
-; GISEL-DAZ-NEXT:    v_mul_f32_e32 v3, 0.5, v3
-; GISEL-DAZ-NEXT:    v_fma_f32 v6, -v3, v5, 0.5
+; GISEL-DAZ-NEXT:    v_mov_b32_e32 v3, 0xf800000
+; GISEL-DAZ-NEXT:    v_mul_f32_e32 v4, 0x4f800000, v0
+; GISEL-DAZ-NEXT:    v_cmp_gt_f32_e32 vcc, v3, v0
+; GISEL-DAZ-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
+; GISEL-DAZ-NEXT:    v_rsq_f32_e32 v4, v0
+; GISEL-DAZ-NEXT:    v_mul_f32_e32 v5, v0, v4
+; GISEL-DAZ-NEXT:    v_mul_f32_e32 v4, 0.5, v4
+; GISEL-DAZ-NEXT:    v_fma_f32 v6, -v4, v5, 0.5
 ; GISEL-DAZ-NEXT:    v_fma_f32 v5, v5, v6, v5
-; GISEL-DAZ-NEXT:    v_fma_f32 v3, v3, v6, v3
+; GISEL-DAZ-NEXT:    v_fma_f32 v4, v4, v6, v4
 ; GISEL-DAZ-NEXT:    v_fma_f32 v6, -v5, v5, v0
-; GISEL-DAZ-NEXT:    v_fma_f32 v3, v6, v3, v5
-; GISEL-DAZ-NEXT:    v_mul_f32_e32 v5, 0x37800000, v3
-; GISEL-DAZ-NEXT:    v_cndmask_b32_e32 v3, v3, v5, vcc
+; GISEL-DAZ-NEXT:    v_fma_f32 v4, v6, v4, v5
+; GISEL-DAZ-NEXT:    v_mul_f32_e32 v5, 0x37800000, v4
+; GISEL-DAZ-NEXT:    v_cndmask_b32_e32 v4, v4, v5, vcc
 ; GISEL-DAZ-NEXT:    v_mul_f32_e32 v5, 0x4f800000, v1
-; GISEL-DAZ-NEXT:    v_cmp_gt_f32_e32 vcc, v4, v1
+; GISEL-DAZ-NEXT:    v_cmp_gt_f32_e32 vcc, v3, v1
 ; GISEL-DAZ-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc
 ; GISEL-DAZ-NEXT:    v_rsq_f32_e32 v5, v1
 ; GISEL-DAZ-NEXT:    v_mov_b32_e32 v6, 0x260
 ; GISEL-DAZ-NEXT:    v_cmp_class_f32_e64 s[4:5], v0, v6
-; GISEL-DAZ-NEXT:    v_cndmask_b32_e64 v0, v3, v0, s[4:5]
-; GISEL-DAZ-NEXT:    v_mul_f32_e32 v3, v1, v5
+; GISEL-DAZ-NEXT:    v_cndmask_b32_e64 v0, v4, v0, s[4:5]
+; GISEL-DAZ-NEXT:    v_mul_f32_e32 v4, v1, v5
 ; GISEL-DAZ-NEXT:    v_mul_f32_e32 v5, 0.5, v5
-; GISEL-DAZ-NEXT:    v_fma_f32 v7, -v5, v3, 0.5
-; GISEL-DAZ-NEXT:    v_fma_f32 v3, v3, v7, v3
+; GISEL-DAZ-NEXT:    v_fma_f32 v7, -v5, v4, 0.5
+; GISEL-DAZ-NEXT:    v_fma_f32 v4, v4, v7, v4
 ; GISEL-DAZ-NEXT:    v_fma_f32 v5, v5, v7, v5
-; GISEL-DAZ-NEXT:    v_fma_f32 v7, -v3, v3, v1
-; GISEL-DAZ-NEXT:    v_fma_f32 v3, v7, v5, v3
+; GISEL-DAZ-NEXT:    v_fma_f32 v7, -v4, v4, v1
+; GISEL-DAZ-NEXT:    v_fma_f32 v4, v7, v5, v4
 ; GISEL-DAZ-NEXT:    v_mul_f32_e32 v7, 0x4f800000, v2
-; GISEL-DAZ-NEXT:    v_cmp_gt_f32_e64 s[4:5], v4, v2
+; GISEL-DAZ-NEXT:    v_cmp_gt_f32_e64 s[4:5], v3, v2
 ; GISEL-DAZ-NEXT:    v_cndmask_b32_e64 v2, v2, v7, s[4:5]
-; GISEL-DAZ-NEXT:    v_rsq_f32_e32 v4, v2
-; GISEL-DAZ-NEXT:    v_mul_f32_e32 v5, 0x37800000, v3
-; GISEL-DAZ-NEXT:    v_cndmask_b32_e32 v3, v3, v5, vcc
+; GISEL-DAZ-NEXT:    v_rsq_f32_e32 v3, v2
+; GISEL-DAZ-NEXT:    v_mul_f32_e32 v5, 0x37800000, v4
+; GISEL-DAZ-NEXT:    v_cndmask_b32_e32 v4, v4, v5, vcc
 ; GISEL-DAZ-NEXT:    v_cmp_class_f32_e32 vcc, v1, v6
-; GISEL-DAZ-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
-; GISEL-DAZ-NEXT:    v_mul_f32_e32 v3, v2, v4
-; GISEL-DAZ-NEXT:    v_mul_f32_e32 v4, 0.5, v4
-; GISEL-DAZ-NEXT:    v_fma_f32 v5, -v4, v3, 0.5
-; GISEL-DAZ-NEXT:    v_fma_f32 v3, v3, v5, v3
+; GISEL-DAZ-NEXT:    v_cndmask_b32_e32 v1, v4, v1, vcc
+; GISEL-DAZ-NEXT:    v_mul_f32_e32 v4, v2, v3
+; GISEL-DAZ-NEXT:    v_mul_f32_e32 v3, 0.5, v3
+; GISEL-DAZ-NEXT:    v_fma_f32 v5, -v3, v4, 0.5
 ; GISEL-DAZ-NEXT:    v_fma_f32 v4, v4, v5, v4
-; GISEL-DAZ-NEXT:    v_fma_f32 v5, -v3, v3, v2
-; GISEL-DAZ-NEXT:    v_fma_f32 v3, v5, v4, v3
+; GISEL-DAZ-NEXT:    v_fma_f32 v3, v3, v5, v3
+; GISEL-DAZ-NEXT:    v_fma_f32 v5, -v4, v4, v2
+; GISEL-DAZ-NEXT:    v_fma_f32 v3, v5, v3, v4
 ; GISEL-DAZ-NEXT:    v_mul_f32_e32 v4, 0x37800000, v3
 ; GISEL-DAZ-NEXT:    v_cndmask_b32_e64 v3, v3, v4, s[4:5]
 ; GISEL-DAZ-NEXT:    v_cmp_class_f32_e32 vcc, v2, v6
@@ -2021,35 +2017,34 @@ define <2 x float> @v_sqrt_v2f32_ulp1(<2 x float> %x) {
 ; GISEL-IEEE-LABEL: v_sqrt_v2f32_ulp1:
 ; GISEL-IEEE:       ; %bb.0:
 ; GISEL-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-IEEE-NEXT:    s_mov_b32 s4, 0xf800000
-; GISEL-IEEE-NEXT:    v_mul_f32_e32 v2, 0x4f800000, v0
-; GISEL-IEEE-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v0
-; GISEL-IEEE-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
-; GISEL-IEEE-NEXT:    v_sqrt_f32_e32 v2, v0
-; GISEL-IEEE-NEXT:    v_mov_b32_e32 v3, 0xf800000
-; GISEL-IEEE-NEXT:    v_add_i32_e64 v4, s[4:5], -1, v2
-; GISEL-IEEE-NEXT:    v_fma_f32 v5, -v4, v2, v0
-; GISEL-IEEE-NEXT:    v_add_i32_e64 v6, s[4:5], 1, v2
-; GISEL-IEEE-NEXT:    v_fma_f32 v7, -v6, v2, v0
+; GISEL-IEEE-NEXT:    v_mov_b32_e32 v2, 0xf800000
+; GISEL-IEEE-NEXT:    v_mul_f32_e32 v3, 0x4f800000, v0
+; GISEL-IEEE-NEXT:    v_cmp_gt_f32_e32 vcc, v2, v0
+; GISEL-IEEE-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
+; GISEL-IEEE-NEXT:    v_sqrt_f32_e32 v3, v0
+; GISEL-IEEE-NEXT:    v_add_i32_e64 v4, s[4:5], -1, v3
+; GISEL-IEEE-NEXT:    v_fma_f32 v5, -v4, v3, v0
+; GISEL-IEEE-NEXT:    v_add_i32_e64 v6, s[4:5], 1, v3
+; GISEL-IEEE-NEXT:    v_fma_f32 v7, -v6, v3, v0
 ; GISEL-IEEE-NEXT:    v_cmp_ge_f32_e64 s[4:5], 0, v5
-; GISEL-IEEE-NEXT:    v_cndmask_b32_e64 v2, v2, v4, s[4:5]
+; GISEL-IEEE-NEXT:    v_cndmask_b32_e64 v3, v3, v4, s[4:5]
 ; GISEL-IEEE-NEXT:    v_cmp_lt_f32_e64 s[4:5], 0, v7
-; GISEL-IEEE-NEXT:    v_cndmask_b32_e64 v2, v2, v6, s[4:5]
-; GISEL-IEEE-NEXT:    v_mul_f32_e32 v4, 0x37800000, v2
-; GISEL-IEEE-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
+; GISEL-IEEE-NEXT:    v_cndmask_b32_e64 v3, v3, v6, s[4:5]
+; GISEL-IEEE-NEXT:    v_mul_f32_e32 v4, 0x37800000, v3
+; GISEL-IEEE-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
 ; GISEL-IEEE-NEXT:    v_mul_f32_e32 v5, 0x4f800000, v1
-; GISEL-IEEE-NEXT:    v_cmp_gt_f32_e32 vcc, v3, v1
+; GISEL-IEEE-NEXT:    v_cmp_gt_f32_e32 vcc, v2, v1
 ; GISEL-IEEE-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc
-; GISEL-IEEE-NEXT:    v_sqrt_f32_e32 v3, v1
+; GISEL-IEEE-NEXT:    v_sqrt_f32_e32 v2, v1
 ; GISEL-IEEE-NEXT:    v_mov_b32_e32 v4, 0x260
 ; GISEL-IEEE-NEXT:    v_cmp_class_f32_e64 s[4:5], v0, v4
-; GISEL-IEEE-NEXT:    v_cndmask_b32_e64 v0, v2, v0, s[4:5]
-; GISEL-IEEE-NEXT:    v_add_i32_e64 v2, s[4:5], -1, v3
-; GISEL-IEEE-NEXT:    v_fma_f32 v5, -v2, v3, v1
-; GISEL-IEEE-NEXT:    v_add_i32_e64 v6, s[4:5], 1, v3
-; GISEL-IEEE-NEXT:    v_fma_f32 v7, -v6, v3, v1
+; GISEL-IEEE-NEXT:    v_cndmask_b32_e64 v0, v3, v0, s[4:5]
+; GISEL-IEEE-NEXT:    v_add_i32_e64 v3, s[4:5], -1, v2
+; GISEL-IEEE-NEXT:    v_fma_f32 v5, -v3, v2, v1
+; GISEL-IEEE-NEXT:    v_add_i32_e64 v6, s[4:5], 1, v2
+; GISEL-IEEE-NEXT:    v_fma_f32 v7, -v6, v2, v1
 ; GISEL-IEEE-NEXT:    v_cmp_ge_f32_e64 s[4:5], 0, v5
-; GISEL-IEEE-NEXT:    v_cndmask_b32_e64 v2, v3, v2, s[4:5]
+; GISEL-IEEE-NEXT:    v_cndmask_b32_e64 v2, v2, v3, s[4:5]
 ; GISEL-IEEE-NEXT:    v_cmp_lt_f32_e64 s[4:5], 0, v7
 ; GISEL-IEEE-NEXT:    v_cndmask_b32_e64 v2, v2, v6, s[4:5]
 ; GISEL-IEEE-NEXT:    v_mul_f32_e32 v3, 0x37800000, v2
@@ -2093,11 +2088,11 @@ define <2 x float> @v_sqrt_v2f32_ulp2(<2 x float> %x) {
 ; GISEL-IEEE-LABEL: v_sqrt_v2f32_ulp2:
 ; GISEL-IEEE:       ; %bb.0:
 ; GISEL-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-IEEE-NEXT:    s_mov_b32 s4, 0x800000
-; GISEL-IEEE-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v0
-; GISEL-IEEE-NEXT:    v_cndmask_b32_e64 v2, 0, 32, vcc
-; GISEL-IEEE-NEXT:    v_cmp_gt_f32_e64 s[4:5], s4, v1
-; GISEL-IEEE-NEXT:    v_ldexp_f32_e32 v0, v0, v2
+; GISEL-IEEE-NEXT:    v_mov_b32_e32 v2, 0x800000
+; GISEL-IEEE-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v2
+; GISEL-IEEE-NEXT:    v_cndmask_b32_e64 v3, 0, 32, vcc
+; GISEL-IEEE-NEXT:    v_cmp_lt_f32_e64 s[4:5], v1, v2
+; GISEL-IEEE-NEXT:    v_ldexp_f32_e32 v0, v0, v3
 ; GISEL-IEEE-NEXT:    v_cndmask_b32_e64 v2, 0, 32, s[4:5]
 ; GISEL-IEEE-NEXT:    v_sqrt_f32_e32 v0, v0
 ; GISEL-IEEE-NEXT:    v_ldexp_f32_e32 v1, v1, v2
@@ -2162,37 +2157,35 @@ define <2 x float> @v_sqrt_v2f32_ulp1_fabs(<2 x float> %x) {
 ; GISEL-IEEE-LABEL: v_sqrt_v2f32_ulp1_fabs:
 ; GISEL-IEEE:       ; %bb.0:
 ; GISEL-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-IEEE-NEXT:    s_mov_b32 s4, 0xf800000
-; GISEL-IEEE-NEXT:    s_mov_b32 s5, 0x4f800000
-; GISEL-IEEE-NEXT:    v_mul_f32_e64 v2, |v0|, s5
-; GISEL-IEEE-NEXT:    v_cmp_gt_f32_e64 vcc, s4, |v0|
-; GISEL-IEEE-NEXT:    v_cndmask_b32_e64 v0, |v0|, v2, vcc
-; GISEL-IEEE-NEXT:    v_sqrt_f32_e32 v2, v0
-; GISEL-IEEE-NEXT:    v_mov_b32_e32 v3, 0xf800000
-; GISEL-IEEE-NEXT:    v_mov_b32_e32 v4, 0x4f800000
-; GISEL-IEEE-NEXT:    v_mul_f32_e64 v4, |v1|, v4
-; GISEL-IEEE-NEXT:    v_add_i32_e64 v5, s[4:5], -1, v2
-; GISEL-IEEE-NEXT:    v_fma_f32 v6, -v5, v2, v0
-; GISEL-IEEE-NEXT:    v_add_i32_e64 v7, s[4:5], 1, v2
-; GISEL-IEEE-NEXT:    v_fma_f32 v8, -v7, v2, v0
+; GISEL-IEEE-NEXT:    v_mov_b32_e32 v2, 0xf800000
+; GISEL-IEEE-NEXT:    v_mov_b32_e32 v3, 0x4f800000
+; GISEL-IEEE-NEXT:    v_mul_f32_e64 v4, |v0|, v3
+; GISEL-IEEE-NEXT:    v_cmp_gt_f32_e64 vcc, v2, |v0|
+; GISEL-IEEE-NEXT:    v_cndmask_b32_e64 v0, |v0|, v4, vcc
+; GISEL-IEEE-NEXT:    v_sqrt_f32_e32 v4, v0
+; GISEL-IEEE-NEXT:    v_mul_f32_e64 v3, |v1|, v3
+; GISEL-IEEE-NEXT:    v_add_i32_e64 v5, s[4:5], -1, v4
+; GISEL-IEEE-NEXT:    v_fma_f32 v6, -v5, v4, v0
+; GISEL-IEEE-NEXT:    v_add_i32_e64 v7, s[4:5], 1, v4
+; GISEL-IEEE-NEXT:    v_fma_f32 v8, -v7, v4, v0
 ; GISEL-IEEE-NEXT:    v_cmp_ge_f32_e64 s[4:5], 0, v6
-; GISEL-IEEE-NEXT:    v_cndmask_b32_e64 v2, v2, v5, s[4:5]
+; GISEL-IEEE-NEXT:    v_cndmask_b32_e64 v4, v4, v5, s[4:5]
 ; GISEL-IEEE-NEXT:    v_cmp_lt_f32_e64 s[4:5], 0, v8
-; GISEL-IEEE-NEXT:    v_cndmask_b32_e64 v2, v2, v7, s[4:5]
-; GISEL-IEEE-NEXT:    v_mul_f32_e32 v5, 0x37800000, v2
-; GISEL-IEEE-NEXT:    v_cndmask_b32_e32 v2, v2, v5, vcc
-; GISEL-IEEE-NEXT:    v_cmp_gt_f32_e64 vcc, v3, |v1|
-; GISEL-IEEE-NEXT:    v_cndmask_b32_e64 v1, |v1|, v4, vcc
-; GISEL-IEEE-NEXT:    v_sqrt_f32_e32 v3, v1
+; GISEL-IEEE-NEXT:    v_cndmask_b32_e64 v4, v4, v7, s[4:5]
+; GISEL-IEEE-NEXT:    v_mul_f32_e32 v5, 0x37800000, v4
+; GISEL-IEEE-NEXT:    v_cndmask_b32_e32 v4, v4, v5, vcc
+; GISEL-IEEE-NEXT:    v_cmp_gt_f32_e64 vcc, v2, |v1|
+; GISEL-IEEE-NEXT:    v_cndmask_b32_e64 v1, |v1|, v3, vcc
+; GISEL-IEEE-NEXT:    v_sqrt_f32_e32 v2, v1
 ; GISEL-IEEE-NEXT:    v_mov_b32_e32 v5, 0x260
 ; GISEL-IEEE-NEXT:    v_cmp_class_f32_e64 s[4:5], v0, v5
-; GISEL-IEEE-NEXT:    v_cndmask_b32_e64 v0, v2, v0, s[4:5]
-; GISEL-IEEE-NEXT:    v_add_i32_e64 v2, s[4:5], -1, v3
-; GISEL-IEEE-NEXT:    v_fma_f32 v4, -v2, v3, v1
-; GISEL-IEEE-NEXT:    v_add_i32_e64 v6, s[4:5], 1, v3
-; GISEL-IEEE-NEXT:    v_fma_f32 v7, -v6, v3, v1
+; GISEL-IEEE-NEXT:    v_cndmask_b32_e64 v0, v4, v0, s[4:5]
+; GISEL-IEEE-NEXT:    v_add_i32_e64 v3, s[4:5], -1, v2
+; GISEL-IEEE-NEXT:    v_fma_f32 v4, -v3, v2, v1
+; GISEL-IEEE-NEXT:    v_add_i32_e64 v6, s[4:5], 1, v2
+; GISEL-IEEE-NEXT:    v_fma_f32 v7, -v6, v2, v1
 ; GISEL-IEEE-NEXT:    v_cmp_ge_f32_e64 s[4:5], 0, v4
-; GISEL-IEEE-NEXT:    v_cndmask_b32_e64 v2, v3, v2, s[4:5]
+; GISEL-IEEE-NEXT:    v_cndmask_b32_e64 v2, v2, v3, s[4:5]
 ; GISEL-IEEE-NEXT:    v_cmp_lt_f32_e64 s[4:5], 0, v7
 ; GISEL-IEEE-NEXT:    v_cndmask_b32_e64 v2, v2, v6, s[4:5]
 ; GISEL-IEEE-NEXT:    v_mul_f32_e32 v3, 0x37800000, v2
@@ -2237,11 +2230,11 @@ define <2 x float> @v_sqrt_v2f32_ulp2_fabs(<2 x float> %x) {
 ; GISEL-IEEE-LABEL: v_sqrt_v2f32_ulp2_fabs:
 ; GISEL-IEEE:       ; %bb.0:
 ; GISEL-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-IEEE-NEXT:    s_mov_b32 s6, 0x800000
-; GISEL-IEEE-NEXT:    v_cmp_lt_f32_e64 s[4:5], |v0|, s6
-; GISEL-IEEE-NEXT:    v_cndmask_b32_e64 v2, 0, 32, s[4:5]
-; GISEL-IEEE-NEXT:    v_cmp_lt_f32_e64 s[6:7], |v1|, s6
-; GISEL-IEEE-NEXT:    v_ldexp_f32_e64 v0, |v0|, v2
+; GISEL-IEEE-NEXT:    v_mov_b32_e32 v2, 0x800000
+; GISEL-IEEE-NEXT:    v_cmp_lt_f32_e64 s[4:5], |v0|, v2
+; GISEL-IEEE-NEXT:    v_cndmask_b32_e64 v3, 0, 32, s[4:5]
+; GISEL-IEEE-NEXT:    v_cmp_lt_f32_e64 s[6:7], |v1|, v2
+; GISEL-IEEE-NEXT:    v_ldexp_f32_e64 v0, |v0|, v3
 ; GISEL-IEEE-NEXT:    v_cndmask_b32_e64 v2, 0, 32, s[6:7]
 ; GISEL-IEEE-NEXT:    v_sqrt_f32_e32 v0, v0
 ; GISEL-IEEE-NEXT:    v_ldexp_f32_e64 v1, |v1|, v2
@@ -2436,16 +2429,16 @@ define float @v_sqrt_f32_ulp2_noncontractable_fdiv(float %x, float %y) {
 ; GISEL-IEEE-NEXT:    v_ldexp_f32_e32 v0, v0, v2
 ; GISEL-IEEE-NEXT:    v_sqrt_f32_e32 v0, v0
 ; GISEL-IEEE-NEXT:    v_cndmask_b32_e64 v2, 0, -16, vcc
-; GISEL-IEEE-NEXT:    s_mov_b32 s4, 0x7f800000
-; GISEL-IEEE-NEXT:    v_frexp_mant_f32_e32 v3, v1
+; GISEL-IEEE-NEXT:    v_mov_b32_e32 v3, 0x7f800000
+; GISEL-IEEE-NEXT:    v_frexp_mant_f32_e32 v4, v1
 ; GISEL-IEEE-NEXT:    v_ldexp_f32_e32 v0, v0, v2
 ; GISEL-IEEE-NEXT:    v_frexp_mant_f32_e32 v2, v0
-; GISEL-IEEE-NEXT:    v_cmp_lt_f32_e64 vcc, |v0|, s4
+; GISEL-IEEE-NEXT:    v_cmp_lt_f32_e64 vcc, |v0|, v3
 ; GISEL-IEEE-NEXT:    v_cndmask_b32_e32 v2, v0, v2, vcc
 ; GISEL-IEEE-NEXT:    v_rcp_f32_e32 v2, v2
-; GISEL-IEEE-NEXT:    v_cmp_lt_f32_e64 vcc, |v1|, s4
+; GISEL-IEEE-NEXT:    v_cmp_lt_f32_e64 vcc, |v1|, v3
 ; GISEL-IEEE-NEXT:    v_frexp_exp_i32_f32_e32 v0, v0
-; GISEL-IEEE-NEXT:    v_cndmask_b32_e32 v3, v1, v3, vcc
+; GISEL-IEEE-NEXT:    v_cndmask_b32_e32 v3, v1, v4, vcc
 ; GISEL-IEEE-NEXT:    v_frexp_exp_i32_f32_e32 v1, v1
 ; GISEL-IEEE-NEXT:    v_mul_f32_e32 v2, v3, v2
 ; GISEL-IEEE-NEXT:    v_sub_i32_e32 v0, vcc, v1, v0
@@ -2520,16 +2513,16 @@ define float @v_sqrt_f32_ulp2_contractable_fdiv(float %x, float %y) {
 ; GISEL-IEEE-NEXT:    v_ldexp_f32_e32 v0, v0, v2
 ; GISEL-IEEE-NEXT:    v_sqrt_f32_e32 v0, v0
 ; GISEL-IEEE-NEXT:    v_cndmask_b32_e64 v2, 0, -16, vcc
-; GISEL-IEEE-NEXT:    s_mov_b32 s4, 0x7f800000
-; GISEL-IEEE-NEXT:    v_frexp_mant_f32_e32 v3, v1
+; GISEL-IEEE-NEXT:    v_mov_b32_e32 v3, 0x7f800000
+; GISEL-IEEE-NEXT:    v_frexp_mant_f32_e32 v4, v1
 ; GISEL-IEEE-NEXT:    v_ldexp_f32_e32 v0, v0, v2
 ; GISEL-IEEE-NEXT:    v_frexp_mant_f32_e32 v2, v0
-; GISEL-IEEE-NEXT:    v_cmp_lt_f32_e64 vcc, |v0|, s4
+; GISEL-IEEE-NEXT:    v_cmp_lt_f32_e64 vcc, |v0|, v3
 ; GISEL-IEEE-NEXT:    v_cndmask_b32_e32 v2, v0, v2, vcc
 ; GISEL-IEEE-NEXT:    v_rcp_f32_e32 v2, v2
-; GISEL-IEEE-NEXT:    v_cmp_lt_f32_e64 vcc, |v1|, s4
+; GISEL-IEEE-NEXT:    v_cmp_lt_f32_e64 vcc, |v1|, v3
 ; GISEL-IEEE-NEXT:    v_frexp_exp_i32_f32_e32 v0, v0
-; GISEL-IEEE-NEXT:    v_cndmask_b32_e32 v3, v1, v3, vcc
+; GISEL-IEEE-NEXT:    v_cndmask_b32_e32 v3, v1, v4, vcc
 ; GISEL-IEEE-NEXT:    v_frexp_exp_i32_f32_e32 v1, v1
 ; GISEL-IEEE-NEXT:    v_mul_f32_e32 v2, v3, v2
 ; GISEL-IEEE-NEXT:    v_sub_i32_e32 v0, vcc, v1, v0
@@ -2663,11 +2656,11 @@ define <2 x float> @v_sqrt_v2f32_ulp2_noncontractable_rcp(<2 x float> %x) {
 ; GISEL-IEEE-LABEL: v_sqrt_v2f32_ulp2_noncontractable_rcp:
 ; GISEL-IEEE:       ; %bb.0:
 ; GISEL-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-IEEE-NEXT:    s_mov_b32 s4, 0x800000
-; GISEL-IEEE-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v0
-; GISEL-IEEE-NEXT:    v_cndmask_b32_e64 v2, 0, 32, vcc
-; GISEL-IEEE-NEXT:    v_cmp_gt_f32_e64 s[4:5], s4, v1
-; GISEL-IEEE-NEXT:    v_ldexp_f32_e32 v0, v0, v2
+; GISEL-IEEE-NEXT:    v_mov_b32_e32 v2, 0x800000
+; GISEL-IEEE-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v2
+; GISEL-IEEE-NEXT:    v_cndmask_b32_e64 v3, 0, 32, vcc
+; GISEL-IEEE-NEXT:    v_cmp_lt_f32_e64 s[4:5], v1, v2
+; GISEL-IEEE-NEXT:    v_ldexp_f32_e32 v0, v0, v3
 ; GISEL-IEEE-NEXT:    v_cndmask_b32_e64 v2, 0, 32, s[4:5]
 ; GISEL-IEEE-NEXT:    v_sqrt_f32_e32 v0, v0
 ; GISEL-IEEE-NEXT:    v_ldexp_f32_e32 v1, v1, v2
@@ -2675,17 +2668,17 @@ define <2 x float> @v_sqrt_v2f32_ulp2_noncontractable_rcp(<2 x float> %x) {
 ; GISEL-IEEE-NEXT:    v_cndmask_b32_e64 v2, 0, -16, vcc
 ; GISEL-IEEE-NEXT:    v_ldexp_f32_e32 v0, v0, v2
 ; GISEL-IEEE-NEXT:    v_cndmask_b32_e64 v2, 0, -16, s[4:5]
-; GISEL-IEEE-NEXT:    s_mov_b32 s4, 0x7f800000
+; GISEL-IEEE-NEXT:    v_mov_b32_e32 v3, 0x7f800000
 ; GISEL-IEEE-NEXT:    v_ldexp_f32_e32 v1, v1, v2
 ; GISEL-IEEE-NEXT:    v_frexp_mant_f32_e32 v2, v0
-; GISEL-IEEE-NEXT:    v_cmp_lt_f32_e64 vcc, |v0|, s4
+; GISEL-IEEE-NEXT:    v_cmp_lt_f32_e64 vcc, |v0|, v3
 ; GISEL-IEEE-NEXT:    v_cndmask_b32_e32 v2, v0, v2, vcc
 ; GISEL-IEEE-NEXT:    v_rcp_f32_e32 v2, v2
 ; GISEL-IEEE-NEXT:    v_frexp_exp_i32_f32_e32 v0, v0
 ; GISEL-IEEE-NEXT:    v_sub_i32_e32 v0, vcc, 0, v0
 ; GISEL-IEEE-NEXT:    v_ldexp_f32_e32 v0, v2, v0
 ; GISEL-IEEE-NEXT:    v_frexp_mant_f32_e32 v2, v1
-; GISEL-IEEE-NEXT:    v_cmp_lt_f32_e64 vcc, |v1|, s4
+; GISEL-IEEE-NEXT:    v_cmp_lt_f32_e64 vcc, |v1|, v3
 ; GISEL-IEEE-NEXT:    v_cndmask_b32_e32 v2, v1, v2, vcc
 ; GISEL-IEEE-NEXT:    v_rcp_f32_e32 v2, v2
 ; GISEL-IEEE-NEXT:    v_frexp_exp_i32_f32_e32 v1, v1
@@ -2707,25 +2700,45 @@ define <2 x float> @v_sqrt_v2f32_ulp2_noncontractable_rcp(<2 x float> %x) {
 }
 
 define <2 x float> @v_sqrt_v2f32_ulp2_contractable_rcp(<2 x float> %x) {
-; GCN-IEEE-LABEL: v_sqrt_v2f32_ulp2_contractable_rcp:
-; GCN-IEEE:       ; %bb.0:
-; GCN-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-IEEE-NEXT:    s_mov_b32 s4, 0x800000
-; GCN-IEEE-NEXT:    v_mov_b32_e32 v2, 0x4b800000
-; GCN-IEEE-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v0
-; GCN-IEEE-NEXT:    v_cndmask_b32_e32 v3, 1.0, v2, vcc
-; GCN-IEEE-NEXT:    v_cmp_gt_f32_e64 s[4:5], s4, v1
-; GCN-IEEE-NEXT:    v_mul_f32_e32 v0, v0, v3
-; GCN-IEEE-NEXT:    v_cndmask_b32_e64 v2, 1.0, v2, s[4:5]
-; GCN-IEEE-NEXT:    v_rsq_f32_e32 v0, v0
-; GCN-IEEE-NEXT:    v_mul_f32_e32 v1, v1, v2
-; GCN-IEEE-NEXT:    v_rsq_f32_e32 v1, v1
-; GCN-IEEE-NEXT:    v_mov_b32_e32 v3, 0x45800000
-; GCN-IEEE-NEXT:    v_cndmask_b32_e32 v2, 1.0, v3, vcc
-; GCN-IEEE-NEXT:    v_mul_f32_e32 v0, v0, v2
-; GCN-IEEE-NEXT:    v_cndmask_b32_e64 v2, 1.0, v3, s[4:5]
-; GCN-IEEE-NEXT:    v_mul_f32_e32 v1, v1, v2
-; GCN-IEEE-NEXT:    s_setpc_b64 s[30:31]
+; SDAG-IEEE-LABEL: v_sqrt_v2f32_ulp2_contractable_rcp:
+; SDAG-IEEE:       ; %bb.0:
+; SDAG-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-IEEE-NEXT:    s_mov_b32 s4, 0x800000
+; SDAG-IEEE-NEXT:    v_mov_b32_e32 v2, 0x4b800000
+; SDAG-IEEE-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v0
+; SDAG-IEEE-NEXT:    v_cndmask_b32_e32 v3, 1.0, v2, vcc
+; SDAG-IEEE-NEXT:    v_cmp_gt_f32_e64 s[4:5], s4, v1
+; SDAG-IEEE-NEXT:    v_mul_f32_e32 v0, v0, v3
+; SDAG-IEEE-NEXT:    v_cndmask_b32_e64 v2, 1.0, v2, s[4:5]
+; SDAG-IEEE-NEXT:    v_rsq_f32_e32 v0, v0
+; SDAG-IEEE-NEXT:    v_mul_f32_e32 v1, v1, v2
+; SDAG-IEEE-NEXT:    v_rsq_f32_e32 v1, v1
+; SDAG-IEEE-NEXT:    v_mov_b32_e32 v3, 0x45800000
+; SDAG-IEEE-NEXT:    v_cndmask_b32_e32 v2, 1.0, v3, vcc
+; SDAG-IEEE-NEXT:    v_mul_f32_e32 v0, v0, v2
+; SDAG-IEEE-NEXT:    v_cndmask_b32_e64 v2, 1.0, v3, s[4:5]
+; SDAG-IEEE-NEXT:    v_mul_f32_e32 v1, v1, v2
+; SDAG-IEEE-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-IEEE-LABEL: v_sqrt_v2f32_ulp2_contractable_rcp:
+; GISEL-IEEE:       ; %bb.0:
+; GISEL-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-IEEE-NEXT:    v_mov_b32_e32 v2, 0x800000
+; GISEL-IEEE-NEXT:    v_mov_b32_e32 v3, 0x4b800000
+; GISEL-IEEE-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v2
+; GISEL-IEEE-NEXT:    v_cndmask_b32_e32 v4, 1.0, v3, vcc
+; GISEL-IEEE-NEXT:    v_cmp_lt_f32_e64 s[4:5], v1, v2
+; GISEL-IEEE-NEXT:    v_mul_f32_e32 v0, v0, v4
+; GISEL-IEEE-NEXT:    v_cndmask_b32_e64 v2, 1.0, v3, s[4:5]
+; GISEL-IEEE-NEXT:    v_rsq_f32_e32 v0, v0
+; GISEL-IEEE-NEXT:    v_mul_f32_e32 v1, v1, v2
+; GISEL-IEEE-NEXT:    v_rsq_f32_e32 v1, v1
+; GISEL-IEEE-NEXT:    v_mov_b32_e32 v4, 0x45800000
+; GISEL-IEEE-NEXT:    v_cndmask_b32_e32 v2, 1.0, v4, vcc
+; GISEL-IEEE-NEXT:    v_mul_f32_e32 v0, v0, v2
+; GISEL-IEEE-NEXT:    v_cndmask_b32_e64 v2, 1.0, v4, s[4:5]
+; GISEL-IEEE-NEXT:    v_mul_f32_e32 v1, v1, v2
+; GISEL-IEEE-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GCN-DAZ-LABEL: v_sqrt_v2f32_ulp2_contractable_rcp:
 ; GCN-DAZ:       ; %bb.0:
@@ -2787,11 +2800,11 @@ define <2 x float> @v_sqrt_v2f32_ulp2_contractable_fdiv(<2 x float> %x, <2 x flo
 ; GISEL-IEEE-LABEL: v_sqrt_v2f32_ulp2_contractable_fdiv:
 ; GISEL-IEEE:       ; %bb.0:
 ; GISEL-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-IEEE-NEXT:    s_mov_b32 s4, 0x800000
-; GISEL-IEEE-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v0
-; GISEL-IEEE-NEXT:    v_cndmask_b32_e64 v4, 0, 32, vcc
-; GISEL-IEEE-NEXT:    v_cmp_gt_f32_e64 s[4:5], s4, v1
-; GISEL-IEEE-NEXT:    v_ldexp_f32_e32 v0, v0, v4
+; GISEL-IEEE-NEXT:    v_mov_b32_e32 v4, 0x800000
+; GISEL-IEEE-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v4
+; GISEL-IEEE-NEXT:    v_cndmask_b32_e64 v5, 0, 32, vcc
+; GISEL-IEEE-NEXT:    v_cmp_lt_f32_e64 s[4:5], v1, v4
+; GISEL-IEEE-NEXT:    v_ldexp_f32_e32 v0, v0, v5
 ; GISEL-IEEE-NEXT:    v_cndmask_b32_e64 v4, 0, 32, s[4:5]
 ; GISEL-IEEE-NEXT:    v_sqrt_f32_e32 v0, v0
 ; GISEL-IEEE-NEXT:    v_ldexp_f32_e32 v1, v1, v4
@@ -2799,26 +2812,26 @@ define <2 x float> @v_sqrt_v2f32_ulp2_contractable_fdiv(<2 x float> %x, <2 x flo
 ; GISEL-IEEE-NEXT:    v_cndmask_b32_e64 v4, 0, -16, vcc
 ; GISEL-IEEE-NEXT:    v_ldexp_f32_e32 v0, v0, v4
 ; GISEL-IEEE-NEXT:    v_cndmask_b32_e64 v4, 0, -16, s[4:5]
-; GISEL-IEEE-NEXT:    s_mov_b32 s4, 0x7f800000
+; GISEL-IEEE-NEXT:    v_mov_b32_e32 v5, 0x7f800000
 ; GISEL-IEEE-NEXT:    v_ldexp_f32_e32 v1, v1, v4
 ; GISEL-IEEE-NEXT:    v_frexp_mant_f32_e32 v4, v0
-; GISEL-IEEE-NEXT:    v_cmp_lt_f32_e64 vcc, |v0|, s4
+; GISEL-IEEE-NEXT:    v_cmp_lt_f32_e64 vcc, |v0|, v5
 ; GISEL-IEEE-NEXT:    v_cndmask_b32_e32 v4, v0, v4, vcc
-; GISEL-IEEE-NEXT:    v_frexp_mant_f32_e32 v5, v2
-; GISEL-IEEE-NEXT:    v_cmp_lt_f32_e64 vcc, |v2|, s4
+; GISEL-IEEE-NEXT:    v_frexp_mant_f32_e32 v6, v2
+; GISEL-IEEE-NEXT:    v_cmp_lt_f32_e64 vcc, |v2|, v5
 ; GISEL-IEEE-NEXT:    v_frexp_exp_i32_f32_e32 v0, v0
-; GISEL-IEEE-NEXT:    v_cndmask_b32_e32 v5, v2, v5, vcc
+; GISEL-IEEE-NEXT:    v_cndmask_b32_e32 v6, v2, v6, vcc
 ; GISEL-IEEE-NEXT:    v_frexp_exp_i32_f32_e32 v2, v2
 ; GISEL-IEEE-NEXT:    v_rcp_f32_e32 v4, v4
 ; GISEL-IEEE-NEXT:    v_sub_i32_e32 v0, vcc, v2, v0
 ; GISEL-IEEE-NEXT:    v_frexp_mant_f32_e32 v2, v1
-; GISEL-IEEE-NEXT:    v_cmp_lt_f32_e64 vcc, |v1|, s4
+; GISEL-IEEE-NEXT:    v_cmp_lt_f32_e64 vcc, |v1|, v5
 ; GISEL-IEEE-NEXT:    v_cndmask_b32_e32 v2, v1, v2, vcc
 ; GISEL-IEEE-NEXT:    v_rcp_f32_e32 v2, v2
-; GISEL-IEEE-NEXT:    v_mul_f32_e32 v4, v5, v4
+; GISEL-IEEE-NEXT:    v_mul_f32_e32 v4, v6, v4
 ; GISEL-IEEE-NEXT:    v_ldexp_f32_e32 v0, v4, v0
 ; GISEL-IEEE-NEXT:    v_frexp_mant_f32_e32 v4, v3
-; GISEL-IEEE-NEXT:    v_cmp_lt_f32_e64 vcc, |v3|, s4
+; GISEL-IEEE-NEXT:    v_cmp_lt_f32_e64 vcc, |v3|, v5
 ; GISEL-IEEE-NEXT:    v_frexp_exp_i32_f32_e32 v1, v1
 ; GISEL-IEEE-NEXT:    v_cndmask_b32_e32 v4, v3, v4, vcc
 ; GISEL-IEEE-NEXT:    v_frexp_exp_i32_f32_e32 v3, v3
@@ -2827,26 +2840,47 @@ define <2 x float> @v_sqrt_v2f32_ulp2_contractable_fdiv(<2 x float> %x, <2 x flo
 ; GISEL-IEEE-NEXT:    v_ldexp_f32_e32 v1, v2, v1
 ; GISEL-IEEE-NEXT:    s_setpc_b64 s[30:31]
 ;
-; GCN-DAZ-LABEL: v_sqrt_v2f32_ulp2_contractable_fdiv:
-; GCN-DAZ:       ; %bb.0:
-; GCN-DAZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-DAZ-NEXT:    v_sqrt_f32_e32 v0, v0
-; GCN-DAZ-NEXT:    v_sqrt_f32_e32 v1, v1
-; GCN-DAZ-NEXT:    s_mov_b32 s4, 0x6f800000
-; GCN-DAZ-NEXT:    v_mov_b32_e32 v4, 0x2f800000
-; GCN-DAZ-NEXT:    v_cmp_gt_f32_e64 vcc, |v0|, s4
-; GCN-DAZ-NEXT:    v_cndmask_b32_e32 v5, 1.0, v4, vcc
-; GCN-DAZ-NEXT:    v_cmp_gt_f32_e64 vcc, |v1|, s4
-; GCN-DAZ-NEXT:    v_cndmask_b32_e32 v4, 1.0, v4, vcc
-; GCN-DAZ-NEXT:    v_mul_f32_e32 v0, v0, v5
-; GCN-DAZ-NEXT:    v_mul_f32_e32 v1, v1, v4
-; GCN-DAZ-NEXT:    v_rcp_f32_e32 v0, v0
-; GCN-DAZ-NEXT:    v_rcp_f32_e32 v1, v1
-; GCN-DAZ-NEXT:    v_mul_f32_e32 v0, v2, v0
-; GCN-DAZ-NEXT:    v_mul_f32_e32 v1, v3, v1
-; GCN-DAZ-NEXT:    v_mul_f32_e32 v0, v5, v0
-; GCN-DAZ-NEXT:    v_mul_f32_e32 v1, v4, v1
-; GCN-DAZ-NEXT:    s_setpc_b64 s[30:31]
+; SDAG-DAZ-LABEL: v_sqrt_v2f32_ulp2_contractable_fdiv:
+; SDAG-DAZ:       ; %bb.0:
+; SDAG-DAZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-DAZ-NEXT:    v_sqrt_f32_e32 v0, v0
+; SDAG-DAZ-NEXT:    v_sqrt_f32_e32 v1, v1
+; SDAG-DAZ-NEXT:    s_mov_b32 s4, 0x6f800000
+; SDAG-DAZ-NEXT:    v_mov_b32_e32 v4, 0x2f800000
+; SDAG-DAZ-NEXT:    v_cmp_gt_f32_e64 vcc, |v0|, s4
+; SDAG-DAZ-NEXT:    v_cndmask_b32_e32 v5, 1.0, v4, vcc
+; SDAG-DAZ-NEXT:    v_cmp_gt_f32_e64 vcc, |v1|, s4
+; SDAG-DAZ-NEXT:    v_cndmask_b32_e32 v4, 1.0, v4, vcc
+; SDAG-DAZ-NEXT:    v_mul_f32_e32 v0, v0, v5
+; SDAG-DAZ-NEXT:    v_mul_f32_e32 v1, v1, v4
+; SDAG-DAZ-NEXT:    v_rcp_f32_e32 v0, v0
+; SDAG-DAZ-NEXT:    v_rcp_f32_e32 v1, v1
+; SDAG-DAZ-NEXT:    v_mul_f32_e32 v0, v2, v0
+; SDAG-DAZ-NEXT:    v_mul_f32_e32 v1, v3, v1
+; SDAG-DAZ-NEXT:    v_mul_f32_e32 v0, v5, v0
+; SDAG-DAZ-NEXT:    v_mul_f32_e32 v1, v4, v1
+; SDAG-DAZ-NEXT:    s_setpc_b64 s[30:31]
+;
+; GISEL-DAZ-LABEL: v_sqrt_v2f32_ulp2_contractable_fdiv:
+; GISEL-DAZ:       ; %bb.0:
+; GISEL-DAZ-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-DAZ-NEXT:    v_sqrt_f32_e32 v0, v0
+; GISEL-DAZ-NEXT:    v_sqrt_f32_e32 v1, v1
+; GISEL-DAZ-NEXT:    v_mov_b32_e32 v4, 0x6f800000
+; GISEL-DAZ-NEXT:    v_mov_b32_e32 v5, 0x2f800000
+; GISEL-DAZ-NEXT:    v_cmp_gt_f32_e64 vcc, |v0|, v4
+; GISEL-DAZ-NEXT:    v_cndmask_b32_e32 v6, 1.0, v5, vcc
+; GISEL-DAZ-NEXT:    v_cmp_gt_f32_e64 vcc, |v1|, v4
+; GISEL-DAZ-NEXT:    v_cndmask_b32_e32 v4, 1.0, v5, vcc
+; GISEL-DAZ-NEXT:    v_mul_f32_e32 v0, v0, v6
+; GISEL-DAZ-NEXT:    v_mul_f32_e32 v1, v1, v4
+; GISEL-DAZ-NEXT:    v_rcp_f32_e32 v0, v0
+; GISEL-DAZ-NEXT:    v_rcp_f32_e32 v1, v1
+; GISEL-DAZ-NEXT:    v_mul_f32_e32 v0, v2, v0
+; GISEL-DAZ-NEXT:    v_mul_f32_e32 v1, v3, v1
+; GISEL-DAZ-NEXT:    v_mul_f32_e32 v0, v6, v0
+; GISEL-DAZ-NEXT:    v_mul_f32_e32 v1, v4, v1
+; GISEL-DAZ-NEXT:    s_setpc_b64 s[30:31]
   %sqrt = call contract <2 x float> @llvm.sqrt.v2f32(<2 x float> %x), !fpmath !4
   %result = fdiv contract <2 x float> %y, %sqrt, !fpmath !3
   ret <2 x float> %result
@@ -2893,11 +2927,11 @@ define <2 x float> @v_sqrt_v2f32_ulp2_contractable_fdiv_arcp(<2 x float> %x, <2
 ; GISEL-IEEE-LABEL: v_sqrt_v2f32_ulp2_contractable_fdiv_arcp:
 ; GISEL-IEEE:       ; %bb.0:
 ; GISEL-IEEE-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-IEEE-NEXT:    s_mov_b32 s4, 0x800000
-; GISEL-IEEE-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v0
-; GISEL-IEEE-NEXT:    v_cndmask_b32_e64 v4, 0, 32, vcc
-; GISEL-IEEE-NEXT:    v_cmp_gt_f32_e64 s[4:5], s4, v1
-; GISEL-IEEE-NEXT:    v_ldexp_f32_e32 v0, v0, v4
+; GISEL-IEEE-NEXT:    v_mov_b32_e32 v4, 0x800000
+; GISEL-IEEE-NEXT:    v_cmp_lt_f32_e32 vcc, v0, v4
+; GISEL-IEEE-NEXT:    v_cndmask_b32_e64 v5, 0, 32, vcc
+; GISEL-IEEE-NEXT:    v_cmp_lt_f32_e64 s[4:5], v1, v4
+; GISEL-IEEE-NEXT:    v_ldexp_f32_e32 v0, v0, v5
 ; GISEL-IEEE-NEXT:    v_cndmask_b32_e64 v4, 0, 32, s[4:5]
 ; GISEL-IEEE-NEXT:    v_sqrt_f32_e32 v0, v0
 ; GISEL-IEEE-NEXT:    v_ldexp_f32_e32 v1, v1, v4
@@ -2905,10 +2939,10 @@ define <2 x float> @v_sqrt_v2f32_ulp2_contractable_fdiv_arcp(<2 x float> %x, <2
 ; GISEL-IEEE-NEXT:    v_cndmask_b32_e64 v4, 0, -16, vcc
 ; GISEL-IEEE-NEXT:    v_ldexp_f32_e32 v0, v0, v4
 ; GISEL-IEEE-NEXT:    v_cndmask_b32_e64 v4, 0, -16, s[4:5]
-; GISEL-IEEE-NEXT:    s_mov_b32 s4, 0x7f800000
+; GISEL-IEEE-NEXT:    v_mov_b32_e32 v5, 0x7f800000
 ; GISEL-IEEE-NEXT:    v_ldexp_f32_e32 v1, v1, v4
 ; GISEL-IEEE-NEXT:    v_frexp_mant_f32_e32 v4, v0
-; GISEL-IEEE-NEXT:    v_cmp_lt_f32_e64 vcc, |v0|, s4
+; GISEL-IEEE-NEXT:    v_cmp_lt_f32_e64 vcc, |v0|, v5
 ; GISEL-IEEE-NEXT:    v_cndmask_b32_e32 v4, v0, v4, vcc
 ; GISEL-IEEE-NEXT:    v_rcp_f32_e32 v4, v4
 ; GISEL-IEEE-NEXT:    v_frexp_exp_i32_f32_e32 v0, v0
@@ -2916,7 +2950,7 @@ define <2 x float> @v_sqrt_v2f32_ulp2_contractable_fdiv_arcp(<2 x float> %x, <2
 ; GISEL-IEEE-NEXT:    v_ldexp_f32_e32 v0, v4, v0
 ; GISEL-IEEE-NEXT:    v_mul_f32_e32 v0, v2, v0
 ; GISEL-IEEE-NEXT:    v_frexp_mant_f32_e32 v2, v1
-; GISEL-IEEE-NEXT:    v_cmp_lt_f32_e64 vcc, |v1|, s4
+; GISEL-IEEE-NEXT:    v_cmp_lt_f32_e64 vcc, |v1|, v5
 ; GISEL-IEEE-NEXT:    v_cndmask_b32_e32 v2, v1, v2, vcc
 ; GISEL-IEEE-NEXT:    v_rcp_f32_e32 v2, v2
 ; GISEL-IEEE-NEXT:    v_frexp_exp_i32_f32_e32 v1, v1
@@ -4365,52 +4399,51 @@ define amdgpu_kernel void @elim_redun_check_v2(ptr addrspace(1) %out, <2 x float
 ; GISEL-IEEE-LABEL: elim_redun_check_v2:
 ; GISEL-IEEE:       ; %bb.0: ; %entry
 ; GISEL-IEEE-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
-; GISEL-IEEE-NEXT:    s_mov_b32 s0, 0xf800000
-; GISEL-IEEE-NEXT:    v_mov_b32_e32 v0, 0x4f800000
-; GISEL-IEEE-NEXT:    v_mov_b32_e32 v4, 0xf800000
-; GISEL-IEEE-NEXT:    s_mov_b32 s2, 0x80000000
+; GISEL-IEEE-NEXT:    v_mov_b32_e32 v0, 0xf800000
+; GISEL-IEEE-NEXT:    v_mov_b32_e32 v1, 0x4f800000
 ; GISEL-IEEE-NEXT:    s_waitcnt lgkmcnt(0)
-; GISEL-IEEE-NEXT:    v_mov_b32_e32 v1, s6
-; GISEL-IEEE-NEXT:    v_mul_f32_e32 v2, s6, v0
-; GISEL-IEEE-NEXT:    v_cmp_gt_f32_e32 vcc, s0, v1
-; GISEL-IEEE-NEXT:    v_cndmask_b32_e32 v2, v1, v2, vcc
+; GISEL-IEEE-NEXT:    v_mov_b32_e32 v2, s6
+; GISEL-IEEE-NEXT:    v_mul_f32_e32 v3, s6, v1
+; GISEL-IEEE-NEXT:    v_cmp_lt_f32_e32 vcc, s6, v0
+; GISEL-IEEE-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc
 ; GISEL-IEEE-NEXT:    v_sqrt_f32_e32 v3, v2
-; GISEL-IEEE-NEXT:    v_mul_f32_e32 v0, s7, v0
-; GISEL-IEEE-NEXT:    s_mov_b32 s6, -1
-; GISEL-IEEE-NEXT:    v_add_i32_e64 v5, s[0:1], -1, v3
-; GISEL-IEEE-NEXT:    v_fma_f32 v6, -v5, v3, v2
-; GISEL-IEEE-NEXT:    v_add_i32_e64 v7, s[0:1], 1, v3
-; GISEL-IEEE-NEXT:    v_fma_f32 v8, -v7, v3, v2
+; GISEL-IEEE-NEXT:    v_mul_f32_e32 v1, s7, v1
+; GISEL-IEEE-NEXT:    v_add_i32_e64 v4, s[0:1], -1, v3
+; GISEL-IEEE-NEXT:    v_add_i32_e64 v5, s[0:1], 1, v3
+; GISEL-IEEE-NEXT:    v_fma_f32 v6, -v4, v3, v2
+; GISEL-IEEE-NEXT:    v_fma_f32 v7, -v5, v3, v2
 ; GISEL-IEEE-NEXT:    v_cmp_ge_f32_e64 s[0:1], 0, v6
+; GISEL-IEEE-NEXT:    v_cndmask_b32_e64 v3, v3, v4, s[0:1]
+; GISEL-IEEE-NEXT:    v_cmp_lt_f32_e64 s[0:1], 0, v7
 ; GISEL-IEEE-NEXT:    v_cndmask_b32_e64 v3, v3, v5, s[0:1]
-; GISEL-IEEE-NEXT:    v_cmp_lt_f32_e64 s[0:1], 0, v8
-; GISEL-IEEE-NEXT:    v_cndmask_b32_e64 v3, v3, v7, s[0:1]
-; GISEL-IEEE-NEXT:    v_mul_f32_e32 v5, 0x37800000, v3
-; GISEL-IEEE-NEXT:    v_cndmask_b32_e32 v3, v3, v5, vcc
-; GISEL-IEEE-NEXT:    v_mov_b32_e32 v6, s7
-; GISEL-IEEE-NEXT:    v_cmp_lt_f32_e32 vcc, s7, v4
-; GISEL-IEEE-NEXT:    v_cndmask_b32_e32 v0, v6, v0, vcc
-; GISEL-IEEE-NEXT:    v_sqrt_f32_e32 v4, v0
-; GISEL-IEEE-NEXT:    v_mov_b32_e32 v5, 0x260
-; GISEL-IEEE-NEXT:    v_cmp_class_f32_e64 s[0:1], v2, v5
-; GISEL-IEEE-NEXT:    v_cndmask_b32_e64 v2, v3, v2, s[0:1]
-; GISEL-IEEE-NEXT:    v_add_i32_e64 v3, s[0:1], -1, v4
-; GISEL-IEEE-NEXT:    v_fma_f32 v7, -v3, v4, v0
-; GISEL-IEEE-NEXT:    v_add_i32_e64 v8, s[0:1], 1, v4
-; GISEL-IEEE-NEXT:    v_fma_f32 v9, -v8, v4, v0
-; GISEL-IEEE-NEXT:    v_cmp_ge_f32_e64 s[0:1], 0, v7
-; GISEL-IEEE-NEXT:    v_cndmask_b32_e64 v3, v4, v3, s[0:1]
-; GISEL-IEEE-NEXT:    v_cmp_lt_f32_e64 s[0:1], 0, v9
-; GISEL-IEEE-NEXT:    v_cndmask_b32_e64 v3, v3, v8, s[0:1]
 ; GISEL-IEEE-NEXT:    v_mul_f32_e32 v4, 0x37800000, v3
 ; GISEL-IEEE-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
-; GISEL-IEEE-NEXT:    v_cmp_class_f32_e32 vcc, v0, v5
-; GISEL-IEEE-NEXT:    v_cndmask_b32_e32 v3, v3, v0, vcc
+; GISEL-IEEE-NEXT:    v_mov_b32_e32 v5, s7
+; GISEL-IEEE-NEXT:    v_cmp_lt_f32_e32 vcc, s7, v0
+; GISEL-IEEE-NEXT:    v_cndmask_b32_e32 v0, v5, v1, vcc
+; GISEL-IEEE-NEXT:    v_sqrt_f32_e32 v1, v0
+; GISEL-IEEE-NEXT:    v_mov_b32_e32 v4, 0x260
+; GISEL-IEEE-NEXT:    v_cmp_class_f32_e64 s[0:1], v2, v4
+; GISEL-IEEE-NEXT:    v_cndmask_b32_e64 v2, v3, v2, s[0:1]
+; GISEL-IEEE-NEXT:    v_add_i32_e64 v3, s[0:1], -1, v1
+; GISEL-IEEE-NEXT:    v_fma_f32 v5, -v3, v1, v0
+; GISEL-IEEE-NEXT:    v_add_i32_e64 v6, s[0:1], 1, v1
+; GISEL-IEEE-NEXT:    v_fma_f32 v7, -v6, v1, v0
+; GISEL-IEEE-NEXT:    v_cmp_ge_f32_e64 s[0:1], 0, v5
+; GISEL-IEEE-NEXT:    v_cndmask_b32_e64 v1, v1, v3, s[0:1]
+; GISEL-IEEE-NEXT:    v_cmp_lt_f32_e64 s[0:1], 0, v7
+; GISEL-IEEE-NEXT:    v_cndmask_b32_e64 v1, v1, v6, s[0:1]
+; GISEL-IEEE-NEXT:    v_mul_f32_e32 v3, 0x37800000, v1
+; GISEL-IEEE-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
+; GISEL-IEEE-NEXT:    v_cmp_class_f32_e32 vcc, v0, v4
+; GISEL-IEEE-NEXT:    v_mov_b32_e32 v3, 0x80000000
+; GISEL-IEEE-NEXT:    v_cndmask_b32_e32 v1, v1, v0, vcc
 ; GISEL-IEEE-NEXT:    v_mov_b32_e32 v4, 0x7fc00000
-; GISEL-IEEE-NEXT:    v_cmp_gt_f32_e32 vcc, s2, v1
+; GISEL-IEEE-NEXT:    v_cmp_lt_f32_e32 vcc, s6, v3
 ; GISEL-IEEE-NEXT:    v_cndmask_b32_e32 v0, v2, v4, vcc
-; GISEL-IEEE-NEXT:    v_cmp_gt_f32_e32 vcc, s2, v6
-; GISEL-IEEE-NEXT:    v_cndmask_b32_e32 v1, v3, v4, vcc
+; GISEL-IEEE-NEXT:    v_cmp_lt_f32_e32 vcc, s7, v3
+; GISEL-IEEE-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
+; GISEL-IEEE-NEXT:    s_mov_b32 s6, -1
 ; GISEL-IEEE-NEXT:    s_mov_b32 s7, 0xf000
 ; GISEL-IEEE-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; GISEL-IEEE-NEXT:    s_endpgm
@@ -4464,18 +4497,16 @@ define amdgpu_kernel void @elim_redun_check_v2(ptr addrspace(1) %out, <2 x float
 ; GISEL-DAZ-LABEL: elim_redun_check_v2:
 ; GISEL-DAZ:       ; %bb.0: ; %entry
 ; GISEL-DAZ-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
-; GISEL-DAZ-NEXT:    s_mov_b32 s0, 0xf800000
-; GISEL-DAZ-NEXT:    v_mov_b32_e32 v0, 0x4f800000
-; GISEL-DAZ-NEXT:    v_mov_b32_e32 v4, 0xf800000
-; GISEL-DAZ-NEXT:    s_mov_b32 s2, 0x80000000
+; GISEL-DAZ-NEXT:    v_mov_b32_e32 v0, 0xf800000
+; GISEL-DAZ-NEXT:    v_mov_b32_e32 v1, 0x4f800000
 ; GISEL-DAZ-NEXT:    s_waitcnt lgkmcnt(0)
-; GISEL-DAZ-NEXT:    v_mov_b32_e32 v1, s6
-; GISEL-DAZ-NEXT:    v_mul_f32_e32 v2, s6, v0
-; GISEL-DAZ-NEXT:    v_cmp_gt_f32_e32 vcc, s0, v1
-; GISEL-DAZ-NEXT:    v_cndmask_b32_e32 v2, v1, v2, vcc
+; GISEL-DAZ-NEXT:    v_mov_b32_e32 v2, s6
+; GISEL-DAZ-NEXT:    v_mul_f32_e32 v3, s6, v1
+; GISEL-DAZ-NEXT:    v_cmp_lt_f32_e32 vcc, s6, v0
+; GISEL-DAZ-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc
 ; GISEL-DAZ-NEXT:    v_rsq_f32_e32 v3, v2
-; GISEL-DAZ-NEXT:    v_mul_f32_e32 v0, s7, v0
-; GISEL-DAZ-NEXT:    s_mov_b32 s6, -1
+; GISEL-DAZ-NEXT:    v_mov_b32_e32 v4, s7
+; GISEL-DAZ-NEXT:    v_mul_f32_e32 v1, s7, v1
 ; GISEL-DAZ-NEXT:    v_mul_f32_e32 v5, v2, v3
 ; GISEL-DAZ-NEXT:    v_mul_f32_e32 v3, 0.5, v3
 ; GISEL-DAZ-NEXT:    v_fma_f32 v6, -v3, v5, 0.5
@@ -4485,29 +4516,30 @@ define amdgpu_kernel void @elim_redun_check_v2(ptr addrspace(1) %out, <2 x float
 ; GISEL-DAZ-NEXT:    v_fma_f32 v3, v6, v3, v5
 ; GISEL-DAZ-NEXT:    v_mul_f32_e32 v5, 0x37800000, v3
 ; GISEL-DAZ-NEXT:    v_cndmask_b32_e32 v3, v3, v5, vcc
-; GISEL-DAZ-NEXT:    v_mov_b32_e32 v5, s7
-; GISEL-DAZ-NEXT:    v_cmp_lt_f32_e32 vcc, s7, v4
-; GISEL-DAZ-NEXT:    v_cndmask_b32_e32 v0, v5, v0, vcc
-; GISEL-DAZ-NEXT:    v_rsq_f32_e32 v4, v0
-; GISEL-DAZ-NEXT:    v_mov_b32_e32 v6, 0x260
-; GISEL-DAZ-NEXT:    v_cmp_class_f32_e64 s[0:1], v2, v6
+; GISEL-DAZ-NEXT:    v_cmp_lt_f32_e32 vcc, s7, v0
+; GISEL-DAZ-NEXT:    v_cndmask_b32_e32 v0, v4, v1, vcc
+; GISEL-DAZ-NEXT:    v_rsq_f32_e32 v1, v0
+; GISEL-DAZ-NEXT:    v_mov_b32_e32 v4, 0x260
+; GISEL-DAZ-NEXT:    v_cmp_class_f32_e64 s[0:1], v2, v4
 ; GISEL-DAZ-NEXT:    v_cndmask_b32_e64 v2, v3, v2, s[0:1]
-; GISEL-DAZ-NEXT:    v_mul_f32_e32 v3, v0, v4
-; GISEL-DAZ-NEXT:    v_mul_f32_e32 v4, 0.5, v4
-; GISEL-DAZ-NEXT:    v_fma_f32 v7, -v4, v3, 0.5
-; GISEL-DAZ-NEXT:    v_fma_f32 v3, v3, v7, v3
-; GISEL-DAZ-NEXT:    v_fma_f32 v4, v4, v7, v4
-; GISEL-DAZ-NEXT:    v_fma_f32 v7, -v3, v3, v0
-; GISEL-DAZ-NEXT:    v_fma_f32 v3, v7, v4, v3
-; GISEL-DAZ-NEXT:    v_mul_f32_e32 v4, 0x37800000, v3
-; GISEL-DAZ-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
-; GISEL-DAZ-NEXT:    v_cmp_class_f32_e32 vcc, v0, v6
-; GISEL-DAZ-NEXT:    v_cndmask_b32_e32 v3, v3, v0, vcc
+; GISEL-DAZ-NEXT:    v_mul_f32_e32 v3, v0, v1
+; GISEL-DAZ-NEXT:    v_mul_f32_e32 v1, 0.5, v1
+; GISEL-DAZ-NEXT:    v_fma_f32 v5, -v1, v3, 0.5
+; GISEL-DAZ-NEXT:    v_fma_f32 v3, v3, v5, v3
+; GISEL-DAZ-NEXT:    v_fma_f32 v1, v1, v5, v1
+; GISEL-DAZ-NEXT:    v_fma_f32 v5, -v3, v3, v0
+; GISEL-DAZ-NEXT:    v_fma_f32 v1, v5, v1, v3
+; GISEL-DAZ-NEXT:    v_mul_f32_e32 v3, 0x37800000, v1
+; GISEL-DAZ-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
+; GISEL-DAZ-NEXT:    v_cmp_class_f32_e32 vcc, v0, v4
+; GISEL-DAZ-NEXT:    v_mov_b32_e32 v3, 0x80000000
+; GISEL-DAZ-NEXT:    v_cndmask_b32_e32 v1, v1, v0, vcc
 ; GISEL-DAZ-NEXT:    v_mov_b32_e32 v4, 0x7fc00000
-; GISEL-DAZ-NEXT:    v_cmp_gt_f32_e32 vcc, s2, v1
+; GISEL-DAZ-NEXT:    v_cmp_lt_f32_e32 vcc, s6, v3
 ; GISEL-DAZ-NEXT:    v_cndmask_b32_e32 v0, v2, v4, vcc
-; GISEL-DAZ-NEXT:    v_cmp_gt_f32_e32 vcc, s2, v5
-; GISEL-DAZ-NEXT:    v_cndmask_b32_e32 v1, v3, v4, vcc
+; GISEL-DAZ-NEXT:    v_cmp_lt_f32_e32 vcc, s7, v3
+; GISEL-DAZ-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
+; GISEL-DAZ-NEXT:    s_mov_b32 s6, -1
 ; GISEL-DAZ-NEXT:    s_mov_b32 s7, 0xf000
 ; GISEL-DAZ-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; GISEL-DAZ-NEXT:    s_endpgm
@@ -4571,52 +4603,51 @@ define amdgpu_kernel void @elim_redun_check_v2_ult(ptr addrspace(1) %out, <2 x f
 ; GISEL-IEEE-LABEL: elim_redun_check_v2_ult:
 ; GISEL-IEEE:       ; %bb.0: ; %entry
 ; GISEL-IEEE-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
-; GISEL-IEEE-NEXT:    s_mov_b32 s0, 0xf800000
-; GISEL-IEEE-NEXT:    v_mov_b32_e32 v0, 0x4f800000
-; GISEL-IEEE-NEXT:    v_mov_b32_e32 v4, 0xf800000
-; GISEL-IEEE-NEXT:    s_mov_b32 s2, 0x80000000
+; GISEL-IEEE-NEXT:    v_mov_b32_e32 v0, 0xf800000
+; GISEL-IEEE-NEXT:    v_mov_b32_e32 v1, 0x4f800000
 ; GISEL-IEEE-NEXT:    s_waitcnt lgkmcnt(0)
-; GISEL-IEEE-NEXT:    v_mov_b32_e32 v1, s6
-; GISEL-IEEE-NEXT:    v_mul_f32_e32 v2, s6, v0
-; GISEL-IEEE-NEXT:    v_cmp_gt_f32_e32 vcc, s0, v1
-; GISEL-IEEE-NEXT:    v_cndmask_b32_e32 v2, v1, v2, vcc
+; GISEL-IEEE-NEXT:    v_mov_b32_e32 v2, s6
+; GISEL-IEEE-NEXT:    v_mul_f32_e32 v3, s6, v1
+; GISEL-IEEE-NEXT:    v_cmp_lt_f32_e32 vcc, s6, v0
+; GISEL-IEEE-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc
 ; GISEL-IEEE-NEXT:    v_sqrt_f32_e32 v3, v2
-; GISEL-IEEE-NEXT:    v_mul_f32_e32 v0, s7, v0
-; GISEL-IEEE-NEXT:    s_mov_b32 s6, -1
-; GISEL-IEEE-NEXT:    v_add_i32_e64 v5, s[0:1], -1, v3
-; GISEL-IEEE-NEXT:    v_fma_f32 v6, -v5, v3, v2
-; GISEL-IEEE-NEXT:    v_add_i32_e64 v7, s[0:1], 1, v3
-; GISEL-IEEE-NEXT:    v_fma_f32 v8, -v7, v3, v2
+; GISEL-IEEE-NEXT:    v_mul_f32_e32 v1, s7, v1
+; GISEL-IEEE-NEXT:    v_add_i32_e64 v4, s[0:1], -1, v3
+; GISEL-IEEE-NEXT:    v_add_i32_e64 v5, s[0:1], 1, v3
+; GISEL-IEEE-NEXT:    v_fma_f32 v6, -v4, v3, v2
+; GISEL-IEEE-NEXT:    v_fma_f32 v7, -v5, v3, v2
 ; GISEL-IEEE-NEXT:    v_cmp_ge_f32_e64 s[0:1], 0, v6
+; GISEL-IEEE-NEXT:    v_cndmask_b32_e64 v3, v3, v4, s[0:1]
+; GISEL-IEEE-NEXT:    v_cmp_lt_f32_e64 s[0:1], 0, v7
 ; GISEL-IEEE-NEXT:    v_cndmask_b32_e64 v3, v3, v5, s[0:1]
-; GISEL-IEEE-NEXT:    v_cmp_lt_f32_e64 s[0:1], 0, v8
-; GISEL-IEEE-NEXT:    v_cndmask_b32_e64 v3, v3, v7, s[0:1]
-; GISEL-IEEE-NEXT:    v_mul_f32_e32 v5, 0x37800000, v3
-; GISEL-IEEE-NEXT:    v_cndmask_b32_e32 v3, v3, v5, vcc
-; GISEL-IEEE-NEXT:    v_mov_b32_e32 v6, s7
-; GISEL-IEEE-NEXT:    v_cmp_lt_f32_e32 vcc, s7, v4
-; GISEL-IEEE-NEXT:    v_cndmask_b32_e32 v0, v6, v0, vcc
-; GISEL-IEEE-NEXT:    v_sqrt_f32_e32 v4, v0
-; GISEL-IEEE-NEXT:    v_mov_b32_e32 v5, 0x260
-; GISEL-IEEE-NEXT:    v_cmp_class_f32_e64 s[0:1], v2, v5
-; GISEL-IEEE-NEXT:    v_cndmask_b32_e64 v2, v3, v2, s[0:1]
-; GISEL-IEEE-NEXT:    v_add_i32_e64 v3, s[0:1], -1, v4
-; GISEL-IEEE-NEXT:    v_fma_f32 v7, -v3, v4, v0
-; GISEL-IEEE-NEXT:    v_add_i32_e64 v8, s[0:1], 1, v4
-; GISEL-IEEE-NEXT:    v_fma_f32 v9, -v8, v4, v0
-; GISEL-IEEE-NEXT:    v_cmp_ge_f32_e64 s[0:1], 0, v7
-; GISEL-IEEE-NEXT:    v_cndmask_b32_e64 v3, v4, v3, s[0:1]
-; GISEL-IEEE-NEXT:    v_cmp_lt_f32_e64 s[0:1], 0, v9
-; GISEL-IEEE-NEXT:    v_cndmask_b32_e64 v3, v3, v8, s[0:1]
 ; GISEL-IEEE-NEXT:    v_mul_f32_e32 v4, 0x37800000, v3
 ; GISEL-IEEE-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
-; GISEL-IEEE-NEXT:    v_cmp_class_f32_e32 vcc, v0, v5
-; GISEL-IEEE-NEXT:    v_cndmask_b32_e32 v3, v3, v0, vcc
+; GISEL-IEEE-NEXT:    v_mov_b32_e32 v5, s7
+; GISEL-IEEE-NEXT:    v_cmp_lt_f32_e32 vcc, s7, v0
+; GISEL-IEEE-NEXT:    v_cndmask_b32_e32 v0, v5, v1, vcc
+; GISEL-IEEE-NEXT:    v_sqrt_f32_e32 v1, v0
+; GISEL-IEEE-NEXT:    v_mov_b32_e32 v4, 0x260
+; GISEL-IEEE-NEXT:    v_cmp_class_f32_e64 s[0:1], v2, v4
+; GISEL-IEEE-NEXT:    v_cndmask_b32_e64 v2, v3, v2, s[0:1]
+; GISEL-IEEE-NEXT:    v_add_i32_e64 v3, s[0:1], -1, v1
+; GISEL-IEEE-NEXT:    v_fma_f32 v5, -v3, v1, v0
+; GISEL-IEEE-NEXT:    v_add_i32_e64 v6, s[0:1], 1, v1
+; GISEL-IEEE-NEXT:    v_fma_f32 v7, -v6, v1, v0
+; GISEL-IEEE-NEXT:    v_cmp_ge_f32_e64 s[0:1], 0, v5
+; GISEL-IEEE-NEXT:    v_cndmask_b32_e64 v1, v1, v3, s[0:1]
+; GISEL-IEEE-NEXT:    v_cmp_lt_f32_e64 s[0:1], 0, v7
+; GISEL-IEEE-NEXT:    v_cndmask_b32_e64 v1, v1, v6, s[0:1]
+; GISEL-IEEE-NEXT:    v_mul_f32_e32 v3, 0x37800000, v1
+; GISEL-IEEE-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
+; GISEL-IEEE-NEXT:    v_cmp_class_f32_e32 vcc, v0, v4
+; GISEL-IEEE-NEXT:    v_mov_b32_e32 v3, 0x80000000
+; GISEL-IEEE-NEXT:    v_cndmask_b32_e32 v1, v1, v0, vcc
 ; GISEL-IEEE-NEXT:    v_mov_b32_e32 v4, 0x7fc00000
-; GISEL-IEEE-NEXT:    v_cmp_nle_f32_e32 vcc, s2, v1
+; GISEL-IEEE-NEXT:    v_cmp_nge_f32_e32 vcc, s6, v3
 ; GISEL-IEEE-NEXT:    v_cndmask_b32_e32 v0, v2, v4, vcc
-; GISEL-IEEE-NEXT:    v_cmp_nle_f32_e32 vcc, s2, v6
-; GISEL-IEEE-NEXT:    v_cndmask_b32_e32 v1, v3, v4, vcc
+; GISEL-IEEE-NEXT:    v_cmp_nge_f32_e32 vcc, s7, v3
+; GISEL-IEEE-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
+; GISEL-IEEE-NEXT:    s_mov_b32 s6, -1
 ; GISEL-IEEE-NEXT:    s_mov_b32 s7, 0xf000
 ; GISEL-IEEE-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; GISEL-IEEE-NEXT:    s_endpgm
@@ -4670,18 +4701,16 @@ define amdgpu_kernel void @elim_redun_check_v2_ult(ptr addrspace(1) %out, <2 x f
 ; GISEL-DAZ-LABEL: elim_redun_check_v2_ult:
 ; GISEL-DAZ:       ; %bb.0: ; %entry
 ; GISEL-DAZ-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
-; GISEL-DAZ-NEXT:    s_mov_b32 s0, 0xf800000
-; GISEL-DAZ-NEXT:    v_mov_b32_e32 v0, 0x4f800000
-; GISEL-DAZ-NEXT:    v_mov_b32_e32 v4, 0xf800000
-; GISEL-DAZ-NEXT:    s_mov_b32 s2, 0x80000000
+; GISEL-DAZ-NEXT:    v_mov_b32_e32 v0, 0xf800000
+; GISEL-DAZ-NEXT:    v_mov_b32_e32 v1, 0x4f800000
 ; GISEL-DAZ-NEXT:    s_waitcnt lgkmcnt(0)
-; GISEL-DAZ-NEXT:    v_mov_b32_e32 v1, s6
-; GISEL-DAZ-NEXT:    v_mul_f32_e32 v2, s6, v0
-; GISEL-DAZ-NEXT:    v_cmp_gt_f32_e32 vcc, s0, v1
-; GISEL-DAZ-NEXT:    v_cndmask_b32_e32 v2, v1, v2, vcc
+; GISEL-DAZ-NEXT:    v_mov_b32_e32 v2, s6
+; GISEL-DAZ-NEXT:    v_mul_f32_e32 v3, s6, v1
+; GISEL-DAZ-NEXT:    v_cmp_lt_f32_e32 vcc, s6, v0
+; GISEL-DAZ-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc
 ; GISEL-DAZ-NEXT:    v_rsq_f32_e32 v3, v2
-; GISEL-DAZ-NEXT:    v_mul_f32_e32 v0, s7, v0
-; GISEL-DAZ-NEXT:    s_mov_b32 s6, -1
+; GISEL-DAZ-NEXT:    v_mov_b32_e32 v4, s7
+; GISEL-DAZ-NEXT:    v_mul_f32_e32 v1, s7, v1
 ; GISEL-DAZ-NEXT:    v_mul_f32_e32 v5, v2, v3
 ; GISEL-DAZ-NEXT:    v_mul_f32_e32 v3, 0.5, v3
 ; GISEL-DAZ-NEXT:    v_fma_f32 v6, -v3, v5, 0.5
@@ -4691,29 +4720,30 @@ define amdgpu_kernel void @elim_redun_check_v2_ult(ptr addrspace(1) %out, <2 x f
 ; GISEL-DAZ-NEXT:    v_fma_f32 v3, v6, v3, v5
 ; GISEL-DAZ-NEXT:    v_mul_f32_e32 v5, 0x37800000, v3
 ; GISEL-DAZ-NEXT:    v_cndmask_b32_e32 v3, v3, v5, vcc
-; GISEL-DAZ-NEXT:    v_mov_b32_e32 v5, s7
-; GISEL-DAZ-NEXT:    v_cmp_lt_f32_e32 vcc, s7, v4
-; GISEL-DAZ-NEXT:    v_cndmask_b32_e32 v0, v5, v0, vcc
-; GISEL-DAZ-NEXT:    v_rsq_f32_e32 v4, v0
-; GISEL-DAZ-NEXT:    v_mov_b32_e32 v6, 0x260
-; GISEL-DAZ-NEXT:    v_cmp_class_f32_e64 s[0:1], v2, v6
+; GISEL-DAZ-NEXT:    v_cmp_lt_f32_e32 vcc, s7, v0
+; GISEL-DAZ-NEXT:    v_cndmask_b32_e32 v0, v4, v1, vcc
+; GISEL-DAZ-NEXT:    v_rsq_f32_e32 v1, v0
+; GISEL-DAZ-NEXT:    v_mov_b32_e32 v4, 0x260
+; GISEL-DAZ-NEXT:    v_cmp_class_f32_e64 s[0:1], v2, v4
 ; GISEL-DAZ-NEXT:    v_cndmask_b32_e64 v2, v3, v2, s[0:1]
-; GISEL-DAZ-NEXT:    v_mul_f32_e32 v3, v0, v4
-; GISEL-DAZ-NEXT:    v_mul_f32_e32 v4, 0.5, v4
-; GISEL-DAZ-NEXT:    v_fma_f32 v7, -v4, v3, 0.5
-; GISEL-DAZ-NEXT:    v_fma_f32 v3, v3, v7, v3
-; GISEL-DAZ-NEXT:    v_fma_f32 v4, v4, v7, v4
-; GISEL-DAZ-NEXT:    v_fma_f32 v7, -v3, v3, v0
-; GISEL-DAZ-NEXT:    v_fma_f32 v3, v7, v4, v3
-; GISEL-DAZ-NEXT:    v_mul_f32_e32 v4, 0x37800000, v3
-; GISEL-DAZ-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
-; GISEL-DAZ-NEXT:    v_cmp_class_f32_e32 vcc, v0, v6
-; GISEL-DAZ-NEXT:    v_cndmask_b32_e32 v3, v3, v0, vcc
+; GISEL-DAZ-NEXT:    v_mul_f32_e32 v3, v0, v1
+; GISEL-DAZ-NEXT:    v_mul_f32_e32 v1, 0.5, v1
+; GISEL-DAZ-NEXT:    v_fma_f32 v5, -v1, v3, 0.5
+; GISEL-DAZ-NEXT:    v_fma_f32 v3, v3, v5, v3
+; GISEL-DAZ-NEXT:    v_fma_f32 v1, v1, v5, v1
+; GISEL-DAZ-NEXT:    v_fma_f32 v5, -v3, v3, v0
+; GISEL-DAZ-NEXT:    v_fma_f32 v1, v5, v1, v3
+; GISEL-DAZ-NEXT:    v_mul_f32_e32 v3, 0x37800000, v1
+; GISEL-DAZ-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
+; GISEL-DAZ-NEXT:    v_cmp_class_f32_e32 vcc, v0, v4
+; GISEL-DAZ-NEXT:    v_mov_b32_e32 v3, 0x80000000
+; GISEL-DAZ-NEXT:    v_cndmask_b32_e32 v1, v1, v0, vcc
 ; GISEL-DAZ-NEXT:    v_mov_b32_e32 v4, 0x7fc00000
-; GISEL-DAZ-NEXT:    v_cmp_nle_f32_e32 vcc, s2, v1
+; GISEL-DAZ-NEXT:    v_cmp_nge_f32_e32 vcc, s6, v3
 ; GISEL-DAZ-NEXT:    v_cndmask_b32_e32 v0, v2, v4, vcc
-; GISEL-DAZ-NEXT:    v_cmp_nle_f32_e32 vcc, s2, v5
-; GISEL-DAZ-NEXT:    v_cndmask_b32_e32 v1, v3, v4, vcc
+; GISEL-DAZ-NEXT:    v_cmp_nge_f32_e32 vcc, s7, v3
+; GISEL-DAZ-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
+; GISEL-DAZ-NEXT:    s_mov_b32 s6, -1
 ; GISEL-DAZ-NEXT:    s_mov_b32 s7, 0xf000
 ; GISEL-DAZ-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; GISEL-DAZ-NEXT:    s_endpgm
@@ -4746,3 +4776,5 @@ attributes #5 = { "no-infs-fp-math"="true" }
 !2 = !{float 2.0}
 !3 = !{float 2.5}
 !4 = !{float 3.0}
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; GCN-IEEE: {{.*}}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx90a.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx90a.ll
index e28fba285cb7cb3..e254d9c46dd1817 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx90a.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.gfx90a.ll
@@ -163,7 +163,7 @@ bb:
 
 ; GCN-LABEL: {{^}}test_mfma_f64_16x16x4f64_splat_lit:
 ; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
-; GCN-DAG: s_mov_b32 s{{[0-9]+}}, 0x405ec000
+; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0x405ec000
 ; GFX90A:  v_mfma_f64_16x16x4f64 a[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], a[{{[0-9]+:[0-9]+}}]{{$}}
 ; GFX940:  v_mfma_f64_16x16x4_f64 a[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], a[{{[0-9]+:[0-9]+}}]{{$}}
 ; GCN:     global_store_dwordx4
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.ll
index 20ad0db1de066d8..2dab8782ac1c47d 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mfma.ll
@@ -595,8 +595,7 @@ bb:
 }
 
 ; GCN-LABEL: {{^}}test_mfma_f32_4x4x1f32_lit_splat:
-; GFX908:         v_mov_b32_e32 [[TMP:v[0-9]+]], 0x42f60000
-; GFX90A_40:      s_mov_b32 [[TMP:s[0-9]+]], 0x42f60000
+; GCN:            v_mov_b32_e32 [[TMP:v[0-9]+]], 0x42f60000
 ; GCN:            v_accvgpr_write_b32 [[TTMPA:a[0-9]+]], [[TMP]]
 ; GFX908:         v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP]]
 ; GFX908:         v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP]]
@@ -621,8 +620,7 @@ bb:
 }
 
 ; GCN-LABEL: {{^}}test_mfma_f32_4x4x1f32_lit_splat_bad_code:
-; GFX908:   v_mov_b32_e32 [[TMP0:v[0-9]+]], 0x42f60000
-; GFX90A_40:s_mov_b32 [[TMP0:s[0-9]+]], 0x42f60000
+; GCN:      v_mov_b32_e32 [[TMP0:v[0-9]+]], 0x42f60000
 ; GCN:      v_accvgpr_write_b32 [[AGPR:a[0-9]+]], [[TMP0]]
 ; GFX90A_40-COUNT-3: v_accvgpr_mov_b32 a{{[0-9]+}}, [[AGPR]]
 ; GFX908-NEXT:   v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP0]]
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.exp.ll b/llvm/test/CodeGen/AMDGPU/llvm.exp.ll
index b8df8b3d05727ac..c8570d6f279a6b2 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.exp.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.exp.ll
@@ -108,15 +108,14 @@ define amdgpu_kernel void @s_exp_f32(ptr addrspace(1) %out, float %in) {
 ; GFX900-GISEL-LABEL: s_exp_f32:
 ; GFX900-GISEL:       ; %bb.0:
 ; GFX900-GISEL-NEXT:    s_load_dword s2, s[0:1], 0x2c
-; GFX900-GISEL-NEXT:    s_mov_b32 s3, 0x3fb8aa3b
-; GFX900-GISEL-NEXT:    v_mov_b32_e32 v0, 0x32a5705f
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v0, 0x3fb8aa3b
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x32a5705f
 ; GFX900-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
 ; GFX900-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, s2
-; GFX900-GISEL-NEXT:    v_mul_f32_e32 v2, 0x3fb8aa3b, v1
-; GFX900-GISEL-NEXT:    v_fma_f32 v1, v1, s3, -v2
+; GFX900-GISEL-NEXT:    v_mul_f32_e32 v2, s2, v0
+; GFX900-GISEL-NEXT:    v_fma_f32 v0, s2, v0, -v2
 ; GFX900-GISEL-NEXT:    v_rndne_f32_e32 v3, v2
-; GFX900-GISEL-NEXT:    v_fma_f32 v0, s2, v0, v1
+; GFX900-GISEL-NEXT:    v_fma_f32 v0, s2, v1, v0
 ; GFX900-GISEL-NEXT:    v_sub_f32_e32 v1, v2, v3
 ; GFX900-GISEL-NEXT:    v_add_f32_e32 v0, v1, v0
 ; GFX900-GISEL-NEXT:    v_cvt_i32_f32_e32 v1, v3
@@ -164,15 +163,15 @@ define amdgpu_kernel void @s_exp_f32(ptr addrspace(1) %out, float %in) {
 ; SI-GISEL-LABEL: s_exp_f32:
 ; SI-GISEL:       ; %bb.0:
 ; SI-GISEL-NEXT:    s_load_dword s2, s[0:1], 0xb
-; SI-GISEL-NEXT:    s_mov_b32 s3, 0x3fb8aa3b
-; SI-GISEL-NEXT:    v_mov_b32_e32 v0, 0x32a5705f
+; SI-GISEL-NEXT:    v_mov_b32_e32 v0, 0x3fb8aa3b
+; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x32a5705f
 ; SI-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
+; SI-GISEL-NEXT:    s_mov_b32 s3, 0xf000
 ; SI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-GISEL-NEXT:    v_mov_b32_e32 v1, s2
-; SI-GISEL-NEXT:    v_mul_f32_e32 v2, 0x3fb8aa3b, v1
-; SI-GISEL-NEXT:    v_fma_f32 v1, v1, s3, -v2
+; SI-GISEL-NEXT:    v_mul_f32_e32 v2, s2, v0
+; SI-GISEL-NEXT:    v_fma_f32 v0, s2, v0, -v2
 ; SI-GISEL-NEXT:    v_rndne_f32_e32 v3, v2
-; SI-GISEL-NEXT:    v_fma_f32 v0, s2, v0, v1
+; SI-GISEL-NEXT:    v_fma_f32 v0, s2, v1, v0
 ; SI-GISEL-NEXT:    v_sub_f32_e32 v1, v2, v3
 ; SI-GISEL-NEXT:    v_add_f32_e32 v0, v1, v0
 ; SI-GISEL-NEXT:    v_cvt_i32_f32_e32 v1, v3
@@ -186,7 +185,6 @@ define amdgpu_kernel void @s_exp_f32(ptr addrspace(1) %out, float %in) {
 ; SI-GISEL-NEXT:    v_cmp_gt_f32_e32 vcc, s2, v1
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
 ; SI-GISEL-NEXT:    s_mov_b32 s2, -1
-; SI-GISEL-NEXT:    s_mov_b32 s3, 0xf000
 ; SI-GISEL-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; SI-GISEL-NEXT:    s_endpgm
 ;
@@ -484,43 +482,38 @@ define amdgpu_kernel void @s_exp_v2f32(ptr addrspace(1) %out, <2 x float> %in) {
 ; GFX900-GISEL-LABEL: s_exp_v2f32:
 ; GFX900-GISEL:       ; %bb.0:
 ; GFX900-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX900-GISEL-NEXT:    s_mov_b32 s4, 0x3fb8aa3b
-; GFX900-GISEL-NEXT:    v_mov_b32_e32 v0, 0x32a5705f
-; GFX900-GISEL-NEXT:    v_mov_b32_e32 v4, 0x7f800000
-; GFX900-GISEL-NEXT:    s_mov_b32 s5, 0x42b17218
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v0, 0x3fb8aa3b
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x32a5705f
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v6, 0x7f800000
 ; GFX900-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, s2
-; GFX900-GISEL-NEXT:    v_mul_f32_e32 v2, 0x3fb8aa3b, v1
-; GFX900-GISEL-NEXT:    v_fma_f32 v1, v1, s4, -v2
-; GFX900-GISEL-NEXT:    v_rndne_f32_e32 v3, v2
-; GFX900-GISEL-NEXT:    v_fma_f32 v1, s2, v0, v1
-; GFX900-GISEL-NEXT:    v_sub_f32_e32 v2, v2, v3
-; GFX900-GISEL-NEXT:    v_add_f32_e32 v1, v2, v1
-; GFX900-GISEL-NEXT:    v_cvt_i32_f32_e32 v2, v3
-; GFX900-GISEL-NEXT:    v_mov_b32_e32 v3, 0x3fb8aa3b
-; GFX900-GISEL-NEXT:    v_mul_f32_e32 v5, s3, v3
-; GFX900-GISEL-NEXT:    v_exp_f32_e32 v1, v1
-; GFX900-GISEL-NEXT:    v_fma_f32 v3, s3, v3, -v5
-; GFX900-GISEL-NEXT:    v_fma_f32 v0, s3, v0, v3
-; GFX900-GISEL-NEXT:    v_rndne_f32_e32 v3, v5
-; GFX900-GISEL-NEXT:    v_sub_f32_e32 v5, v5, v3
+; GFX900-GISEL-NEXT:    v_mul_f32_e32 v2, s2, v0
+; GFX900-GISEL-NEXT:    v_fma_f32 v3, s2, v0, -v2
+; GFX900-GISEL-NEXT:    v_rndne_f32_e32 v4, v2
+; GFX900-GISEL-NEXT:    v_mul_f32_e32 v5, s3, v0
+; GFX900-GISEL-NEXT:    v_fma_f32 v3, s2, v1, v3
+; GFX900-GISEL-NEXT:    v_sub_f32_e32 v2, v2, v4
+; GFX900-GISEL-NEXT:    v_fma_f32 v0, s3, v0, -v5
+; GFX900-GISEL-NEXT:    v_add_f32_e32 v2, v2, v3
+; GFX900-GISEL-NEXT:    v_fma_f32 v0, s3, v1, v0
+; GFX900-GISEL-NEXT:    v_rndne_f32_e32 v1, v5
+; GFX900-GISEL-NEXT:    v_cvt_i32_f32_e32 v3, v4
+; GFX900-GISEL-NEXT:    v_exp_f32_e32 v2, v2
+; GFX900-GISEL-NEXT:    v_sub_f32_e32 v5, v5, v1
 ; GFX900-GISEL-NEXT:    v_add_f32_e32 v0, v5, v0
-; GFX900-GISEL-NEXT:    v_ldexp_f32 v1, v1, v2
-; GFX900-GISEL-NEXT:    v_mov_b32_e32 v2, 0xc2ce8ed0
-; GFX900-GISEL-NEXT:    v_cvt_i32_f32_e32 v3, v3
+; GFX900-GISEL-NEXT:    v_cvt_i32_f32_e32 v1, v1
 ; GFX900-GISEL-NEXT:    v_exp_f32_e32 v5, v0
-; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, s2, v2
-; GFX900-GISEL-NEXT:    v_mov_b32_e32 v2, 0x42b17218
-; GFX900-GISEL-NEXT:    v_cndmask_b32_e64 v1, v1, 0, vcc
-; GFX900-GISEL-NEXT:    v_cmp_gt_f32_e32 vcc, s2, v2
-; GFX900-GISEL-NEXT:    s_mov_b32 s4, 0xc2ce8ed0
-; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v0, v1, v4, vcc
-; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, s3
-; GFX900-GISEL-NEXT:    v_ldexp_f32 v2, v5, v3
-; GFX900-GISEL-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v1
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v4, 0xc2ce8ed0
+; GFX900-GISEL-NEXT:    v_ldexp_f32 v2, v2, v3
+; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, s2, v4
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v3, 0x42b17218
 ; GFX900-GISEL-NEXT:    v_cndmask_b32_e64 v2, v2, 0, vcc
-; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, s5, v1
-; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v1, v2, v4, vcc
+; GFX900-GISEL-NEXT:    v_cmp_gt_f32_e32 vcc, s2, v3
+; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v0, v2, v6, vcc
+; GFX900-GISEL-NEXT:    v_ldexp_f32 v1, v5, v1
+; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, s3, v4
+; GFX900-GISEL-NEXT:    v_cndmask_b32_e64 v1, v1, 0, vcc
+; GFX900-GISEL-NEXT:    v_cmp_gt_f32_e32 vcc, s3, v3
+; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v1, v1, v6, vcc
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX900-GISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
 ; GFX900-GISEL-NEXT:    s_endpgm
@@ -570,43 +563,38 @@ define amdgpu_kernel void @s_exp_v2f32(ptr addrspace(1) %out, <2 x float> %in) {
 ; SI-GISEL-LABEL: s_exp_v2f32:
 ; SI-GISEL:       ; %bb.0:
 ; SI-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x9
-; SI-GISEL-NEXT:    s_mov_b32 s4, 0x3fb8aa3b
-; SI-GISEL-NEXT:    v_mov_b32_e32 v0, 0x32a5705f
-; SI-GISEL-NEXT:    v_mov_b32_e32 v4, 0x7f800000
-; SI-GISEL-NEXT:    s_mov_b32 s5, 0x42b17218
+; SI-GISEL-NEXT:    v_mov_b32_e32 v0, 0x3fb8aa3b
+; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x32a5705f
+; SI-GISEL-NEXT:    v_mov_b32_e32 v6, 0x7f800000
 ; SI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-GISEL-NEXT:    v_mov_b32_e32 v1, s2
-; SI-GISEL-NEXT:    v_mul_f32_e32 v2, 0x3fb8aa3b, v1
-; SI-GISEL-NEXT:    v_fma_f32 v1, v1, s4, -v2
-; SI-GISEL-NEXT:    v_rndne_f32_e32 v3, v2
-; SI-GISEL-NEXT:    v_fma_f32 v1, s2, v0, v1
-; SI-GISEL-NEXT:    v_sub_f32_e32 v2, v2, v3
-; SI-GISEL-NEXT:    v_add_f32_e32 v1, v2, v1
-; SI-GISEL-NEXT:    v_cvt_i32_f32_e32 v2, v3
-; SI-GISEL-NEXT:    v_mov_b32_e32 v3, 0x3fb8aa3b
-; SI-GISEL-NEXT:    v_mul_f32_e32 v5, s3, v3
-; SI-GISEL-NEXT:    v_exp_f32_e32 v1, v1
-; SI-GISEL-NEXT:    v_fma_f32 v3, s3, v3, -v5
-; SI-GISEL-NEXT:    v_fma_f32 v0, s3, v0, v3
-; SI-GISEL-NEXT:    v_rndne_f32_e32 v3, v5
-; SI-GISEL-NEXT:    v_sub_f32_e32 v5, v5, v3
+; SI-GISEL-NEXT:    v_mul_f32_e32 v2, s2, v0
+; SI-GISEL-NEXT:    v_fma_f32 v3, s2, v0, -v2
+; SI-GISEL-NEXT:    v_rndne_f32_e32 v4, v2
+; SI-GISEL-NEXT:    v_mul_f32_e32 v5, s3, v0
+; SI-GISEL-NEXT:    v_fma_f32 v3, s2, v1, v3
+; SI-GISEL-NEXT:    v_sub_f32_e32 v2, v2, v4
+; SI-GISEL-NEXT:    v_fma_f32 v0, s3, v0, -v5
+; SI-GISEL-NEXT:    v_add_f32_e32 v2, v2, v3
+; SI-GISEL-NEXT:    v_fma_f32 v0, s3, v1, v0
+; SI-GISEL-NEXT:    v_rndne_f32_e32 v1, v5
+; SI-GISEL-NEXT:    v_cvt_i32_f32_e32 v3, v4
+; SI-GISEL-NEXT:    v_exp_f32_e32 v2, v2
+; SI-GISEL-NEXT:    v_sub_f32_e32 v5, v5, v1
 ; SI-GISEL-NEXT:    v_add_f32_e32 v0, v5, v0
-; SI-GISEL-NEXT:    v_ldexp_f32_e32 v1, v1, v2
-; SI-GISEL-NEXT:    v_mov_b32_e32 v2, 0xc2ce8ed0
-; SI-GISEL-NEXT:    v_cvt_i32_f32_e32 v3, v3
+; SI-GISEL-NEXT:    v_cvt_i32_f32_e32 v1, v1
 ; SI-GISEL-NEXT:    v_exp_f32_e32 v5, v0
-; SI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, s2, v2
-; SI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x42b17218
-; SI-GISEL-NEXT:    v_cndmask_b32_e64 v1, v1, 0, vcc
-; SI-GISEL-NEXT:    v_cmp_gt_f32_e32 vcc, s2, v2
-; SI-GISEL-NEXT:    s_mov_b32 s4, 0xc2ce8ed0
-; SI-GISEL-NEXT:    v_cndmask_b32_e32 v0, v1, v4, vcc
-; SI-GISEL-NEXT:    v_mov_b32_e32 v1, s3
-; SI-GISEL-NEXT:    v_ldexp_f32_e32 v2, v5, v3
-; SI-GISEL-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v1
+; SI-GISEL-NEXT:    v_mov_b32_e32 v4, 0xc2ce8ed0
+; SI-GISEL-NEXT:    v_ldexp_f32_e32 v2, v2, v3
+; SI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, s2, v4
+; SI-GISEL-NEXT:    v_mov_b32_e32 v3, 0x42b17218
 ; SI-GISEL-NEXT:    v_cndmask_b32_e64 v2, v2, 0, vcc
-; SI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, s5, v1
-; SI-GISEL-NEXT:    v_cndmask_b32_e32 v1, v2, v4, vcc
+; SI-GISEL-NEXT:    v_cmp_gt_f32_e32 vcc, s2, v3
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v0, v2, v6, vcc
+; SI-GISEL-NEXT:    v_ldexp_f32_e32 v1, v5, v1
+; SI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, s3, v4
+; SI-GISEL-NEXT:    v_cndmask_b32_e64 v1, v1, 0, vcc
+; SI-GISEL-NEXT:    v_cmp_gt_f32_e32 vcc, s3, v3
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v1, v1, v6, vcc
 ; SI-GISEL-NEXT:    s_mov_b32 s2, -1
 ; SI-GISEL-NEXT:    s_mov_b32 s3, 0xf000
 ; SI-GISEL-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
@@ -1065,59 +1053,53 @@ define amdgpu_kernel void @s_exp_v3f32(ptr addrspace(1) %out, <3 x float> %in) {
 ; GFX900-GISEL-LABEL: s_exp_v3f32:
 ; GFX900-GISEL:       ; %bb.0:
 ; GFX900-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x34
-; GFX900-GISEL-NEXT:    s_mov_b32 s2, 0x3fb8aa3b
-; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x32a5705f
-; GFX900-GISEL-NEXT:    v_mov_b32_e32 v4, 0x42b17218
-; GFX900-GISEL-NEXT:    v_mov_b32_e32 v5, 0x7f800000
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x3fb8aa3b
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v2, 0x32a5705f
+; GFX900-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
 ; GFX900-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX900-GISEL-NEXT:    v_mov_b32_e32 v0, s4
-; GFX900-GISEL-NEXT:    v_mul_f32_e32 v2, 0x3fb8aa3b, v0
-; GFX900-GISEL-NEXT:    v_fma_f32 v0, v0, s2, -v2
-; GFX900-GISEL-NEXT:    v_rndne_f32_e32 v3, v2
-; GFX900-GISEL-NEXT:    v_fma_f32 v0, s4, v1, v0
-; GFX900-GISEL-NEXT:    v_sub_f32_e32 v2, v2, v3
-; GFX900-GISEL-NEXT:    v_add_f32_e32 v0, v2, v0
-; GFX900-GISEL-NEXT:    v_cvt_i32_f32_e32 v2, v3
-; GFX900-GISEL-NEXT:    v_mov_b32_e32 v3, 0x3fb8aa3b
-; GFX900-GISEL-NEXT:    v_mul_f32_e32 v6, s5, v3
-; GFX900-GISEL-NEXT:    v_fma_f32 v7, s5, v3, -v6
-; GFX900-GISEL-NEXT:    v_rndne_f32_e32 v8, v6
-; GFX900-GISEL-NEXT:    v_fma_f32 v7, s5, v1, v7
-; GFX900-GISEL-NEXT:    v_sub_f32_e32 v6, v6, v8
-; GFX900-GISEL-NEXT:    v_add_f32_e32 v6, v6, v7
-; GFX900-GISEL-NEXT:    v_cvt_i32_f32_e32 v7, v8
-; GFX900-GISEL-NEXT:    v_exp_f32_e32 v6, v6
+; GFX900-GISEL-NEXT:    v_mul_f32_e32 v5, s5, v1
+; GFX900-GISEL-NEXT:    v_fma_f32 v6, s5, v1, -v5
+; GFX900-GISEL-NEXT:    v_rndne_f32_e32 v7, v5
+; GFX900-GISEL-NEXT:    v_fma_f32 v6, s5, v2, v6
+; GFX900-GISEL-NEXT:    v_sub_f32_e32 v5, v5, v7
+; GFX900-GISEL-NEXT:    v_add_f32_e32 v5, v5, v6
+; GFX900-GISEL-NEXT:    v_cvt_i32_f32_e32 v6, v7
+; GFX900-GISEL-NEXT:    v_exp_f32_e32 v5, v5
+; GFX900-GISEL-NEXT:    v_mul_f32_e32 v0, s4, v1
+; GFX900-GISEL-NEXT:    v_fma_f32 v3, s4, v1, -v0
+; GFX900-GISEL-NEXT:    v_rndne_f32_e32 v4, v0
+; GFX900-GISEL-NEXT:    v_fma_f32 v3, s4, v2, v3
+; GFX900-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v4
+; GFX900-GISEL-NEXT:    v_add_f32_e32 v0, v0, v3
+; GFX900-GISEL-NEXT:    v_ldexp_f32 v5, v5, v6
+; GFX900-GISEL-NEXT:    v_mul_f32_e32 v6, s6, v1
+; GFX900-GISEL-NEXT:    v_cvt_i32_f32_e32 v3, v4
 ; GFX900-GISEL-NEXT:    v_exp_f32_e32 v0, v0
-; GFX900-GISEL-NEXT:    s_mov_b32 s2, 0xc2ce8ed0
-; GFX900-GISEL-NEXT:    v_mov_b32_e32 v8, s5
-; GFX900-GISEL-NEXT:    v_ldexp_f32 v6, v6, v7
-; GFX900-GISEL-NEXT:    v_mul_f32_e32 v7, s6, v3
-; GFX900-GISEL-NEXT:    v_fma_f32 v3, s6, v3, -v7
-; GFX900-GISEL-NEXT:    v_fma_f32 v1, s6, v1, v3
-; GFX900-GISEL-NEXT:    v_rndne_f32_e32 v3, v7
-; GFX900-GISEL-NEXT:    v_sub_f32_e32 v7, v7, v3
-; GFX900-GISEL-NEXT:    v_ldexp_f32 v0, v0, v2
-; GFX900-GISEL-NEXT:    v_mov_b32_e32 v2, 0xc2ce8ed0
-; GFX900-GISEL-NEXT:    v_add_f32_e32 v1, v7, v1
-; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, s4, v2
-; GFX900-GISEL-NEXT:    v_cvt_i32_f32_e32 v3, v3
-; GFX900-GISEL-NEXT:    v_exp_f32_e32 v7, v1
+; GFX900-GISEL-NEXT:    v_fma_f32 v1, s6, v1, -v6
+; GFX900-GISEL-NEXT:    v_fma_f32 v1, s6, v2, v1
+; GFX900-GISEL-NEXT:    v_rndne_f32_e32 v2, v6
+; GFX900-GISEL-NEXT:    v_sub_f32_e32 v6, v6, v2
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v4, 0xc2ce8ed0
+; GFX900-GISEL-NEXT:    v_add_f32_e32 v1, v6, v1
+; GFX900-GISEL-NEXT:    v_ldexp_f32 v0, v0, v3
+; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, s4, v4
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v3, 0x42b17218
+; GFX900-GISEL-NEXT:    v_cvt_i32_f32_e32 v2, v2
+; GFX900-GISEL-NEXT:    v_exp_f32_e32 v6, v1
 ; GFX900-GISEL-NEXT:    v_cndmask_b32_e64 v0, v0, 0, vcc
-; GFX900-GISEL-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v4
-; GFX900-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX900-GISEL-NEXT:    s_mov_b32 s3, 0x42b17218
-; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v0, v0, v5, vcc
-; GFX900-GISEL-NEXT:    v_cmp_gt_f32_e32 vcc, s2, v8
-; GFX900-GISEL-NEXT:    v_cndmask_b32_e64 v1, v6, 0, vcc
-; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, s3, v8
-; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc
-; GFX900-GISEL-NEXT:    v_ldexp_f32 v3, v7, v3
-; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, s6, v2
-; GFX900-GISEL-NEXT:    v_cndmask_b32_e64 v2, v3, 0, vcc
-; GFX900-GISEL-NEXT:    v_cmp_gt_f32_e32 vcc, s6, v4
-; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v2, v2, v5, vcc
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v7, 0x7f800000
+; GFX900-GISEL-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v3
+; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v0, v0, v7, vcc
+; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, s5, v4
+; GFX900-GISEL-NEXT:    v_cndmask_b32_e64 v1, v5, 0, vcc
+; GFX900-GISEL-NEXT:    v_cmp_gt_f32_e32 vcc, s5, v3
+; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v1, v1, v7, vcc
+; GFX900-GISEL-NEXT:    v_ldexp_f32 v2, v6, v2
+; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, s6, v4
+; GFX900-GISEL-NEXT:    v_cndmask_b32_e64 v2, v2, 0, vcc
+; GFX900-GISEL-NEXT:    v_cmp_gt_f32_e32 vcc, s6, v3
+; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v2, v2, v7, vcc
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v3, 0
-; GFX900-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX900-GISEL-NEXT:    global_store_dwordx3 v3, v[0:2], s[0:1]
 ; GFX900-GISEL-NEXT:    s_endpgm
 ;
@@ -1179,59 +1161,54 @@ define amdgpu_kernel void @s_exp_v3f32(ptr addrspace(1) %out, <3 x float> %in) {
 ; SI-GISEL-LABEL: s_exp_v3f32:
 ; SI-GISEL:       ; %bb.0:
 ; SI-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0xd
-; SI-GISEL-NEXT:    s_mov_b32 s2, 0x3fb8aa3b
-; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x32a5705f
-; SI-GISEL-NEXT:    v_mov_b32_e32 v4, 0x42b17218
+; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x3fb8aa3b
+; SI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x32a5705f
 ; SI-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
+; SI-GISEL-NEXT:    s_mov_b32 s2, -1
 ; SI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-GISEL-NEXT:    v_mov_b32_e32 v0, s4
-; SI-GISEL-NEXT:    v_mul_f32_e32 v2, 0x3fb8aa3b, v0
-; SI-GISEL-NEXT:    v_fma_f32 v0, v0, s2, -v2
-; SI-GISEL-NEXT:    v_rndne_f32_e32 v3, v2
-; SI-GISEL-NEXT:    v_fma_f32 v0, s4, v1, v0
-; SI-GISEL-NEXT:    v_sub_f32_e32 v2, v2, v3
-; SI-GISEL-NEXT:    v_add_f32_e32 v0, v2, v0
-; SI-GISEL-NEXT:    v_cvt_i32_f32_e32 v2, v3
-; SI-GISEL-NEXT:    v_mov_b32_e32 v3, 0x3fb8aa3b
-; SI-GISEL-NEXT:    v_mul_f32_e32 v6, s5, v3
-; SI-GISEL-NEXT:    v_fma_f32 v7, s5, v3, -v6
-; SI-GISEL-NEXT:    v_rndne_f32_e32 v8, v6
-; SI-GISEL-NEXT:    v_fma_f32 v7, s5, v1, v7
-; SI-GISEL-NEXT:    v_sub_f32_e32 v6, v6, v8
-; SI-GISEL-NEXT:    v_add_f32_e32 v6, v6, v7
-; SI-GISEL-NEXT:    v_cvt_i32_f32_e32 v7, v8
-; SI-GISEL-NEXT:    v_exp_f32_e32 v6, v6
+; SI-GISEL-NEXT:    v_mul_f32_e32 v5, s5, v1
+; SI-GISEL-NEXT:    v_fma_f32 v6, s5, v1, -v5
+; SI-GISEL-NEXT:    v_rndne_f32_e32 v7, v5
+; SI-GISEL-NEXT:    v_fma_f32 v6, s5, v2, v6
+; SI-GISEL-NEXT:    v_sub_f32_e32 v5, v5, v7
+; SI-GISEL-NEXT:    v_add_f32_e32 v5, v5, v6
+; SI-GISEL-NEXT:    v_cvt_i32_f32_e32 v6, v7
+; SI-GISEL-NEXT:    v_exp_f32_e32 v5, v5
+; SI-GISEL-NEXT:    v_mul_f32_e32 v0, s4, v1
+; SI-GISEL-NEXT:    v_fma_f32 v3, s4, v1, -v0
+; SI-GISEL-NEXT:    v_rndne_f32_e32 v4, v0
+; SI-GISEL-NEXT:    v_fma_f32 v3, s4, v2, v3
+; SI-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v4
+; SI-GISEL-NEXT:    v_add_f32_e32 v0, v0, v3
+; SI-GISEL-NEXT:    v_ldexp_f32_e32 v5, v5, v6
+; SI-GISEL-NEXT:    v_mul_f32_e32 v6, s6, v1
+; SI-GISEL-NEXT:    v_cvt_i32_f32_e32 v3, v4
 ; SI-GISEL-NEXT:    v_exp_f32_e32 v0, v0
-; SI-GISEL-NEXT:    s_mov_b32 s2, 0xc2ce8ed0
-; SI-GISEL-NEXT:    v_mov_b32_e32 v5, 0x7f800000
-; SI-GISEL-NEXT:    v_ldexp_f32_e32 v6, v6, v7
-; SI-GISEL-NEXT:    v_mul_f32_e32 v7, s6, v3
-; SI-GISEL-NEXT:    v_fma_f32 v3, s6, v3, -v7
-; SI-GISEL-NEXT:    v_fma_f32 v1, s6, v1, v3
-; SI-GISEL-NEXT:    v_rndne_f32_e32 v3, v7
-; SI-GISEL-NEXT:    v_sub_f32_e32 v7, v7, v3
-; SI-GISEL-NEXT:    v_ldexp_f32_e32 v0, v0, v2
-; SI-GISEL-NEXT:    v_mov_b32_e32 v2, 0xc2ce8ed0
-; SI-GISEL-NEXT:    v_add_f32_e32 v1, v7, v1
-; SI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, s4, v2
-; SI-GISEL-NEXT:    v_cvt_i32_f32_e32 v3, v3
-; SI-GISEL-NEXT:    v_exp_f32_e32 v7, v1
+; SI-GISEL-NEXT:    v_fma_f32 v1, s6, v1, -v6
+; SI-GISEL-NEXT:    v_fma_f32 v1, s6, v2, v1
+; SI-GISEL-NEXT:    v_rndne_f32_e32 v2, v6
+; SI-GISEL-NEXT:    v_sub_f32_e32 v6, v6, v2
+; SI-GISEL-NEXT:    v_mov_b32_e32 v4, 0xc2ce8ed0
+; SI-GISEL-NEXT:    v_add_f32_e32 v1, v6, v1
+; SI-GISEL-NEXT:    v_ldexp_f32_e32 v0, v0, v3
+; SI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, s4, v4
+; SI-GISEL-NEXT:    v_mov_b32_e32 v3, 0x42b17218
+; SI-GISEL-NEXT:    v_cvt_i32_f32_e32 v2, v2
+; SI-GISEL-NEXT:    v_exp_f32_e32 v6, v1
 ; SI-GISEL-NEXT:    v_cndmask_b32_e64 v0, v0, 0, vcc
-; SI-GISEL-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v4
-; SI-GISEL-NEXT:    v_mov_b32_e32 v8, s5
-; SI-GISEL-NEXT:    s_mov_b32 s3, 0x42b17218
-; SI-GISEL-NEXT:    v_cndmask_b32_e32 v0, v0, v5, vcc
-; SI-GISEL-NEXT:    v_cmp_gt_f32_e32 vcc, s2, v8
-; SI-GISEL-NEXT:    v_cndmask_b32_e64 v1, v6, 0, vcc
-; SI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, s3, v8
-; SI-GISEL-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc
-; SI-GISEL-NEXT:    v_ldexp_f32_e32 v3, v7, v3
-; SI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, s6, v2
-; SI-GISEL-NEXT:    v_cndmask_b32_e64 v2, v3, 0, vcc
-; SI-GISEL-NEXT:    v_cmp_gt_f32_e32 vcc, s6, v4
-; SI-GISEL-NEXT:    s_mov_b32 s2, -1
+; SI-GISEL-NEXT:    v_mov_b32_e32 v7, 0x7f800000
+; SI-GISEL-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v3
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v0, v0, v7, vcc
+; SI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, s5, v4
+; SI-GISEL-NEXT:    v_cndmask_b32_e64 v1, v5, 0, vcc
+; SI-GISEL-NEXT:    v_cmp_gt_f32_e32 vcc, s5, v3
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v1, v1, v7, vcc
+; SI-GISEL-NEXT:    v_ldexp_f32_e32 v2, v6, v2
+; SI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, s6, v4
+; SI-GISEL-NEXT:    v_cndmask_b32_e64 v2, v2, 0, vcc
+; SI-GISEL-NEXT:    v_cmp_gt_f32_e32 vcc, s6, v3
 ; SI-GISEL-NEXT:    s_mov_b32 s3, 0xf000
-; SI-GISEL-NEXT:    v_cndmask_b32_e32 v2, v2, v5, vcc
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v2, v2, v7, vcc
 ; SI-GISEL-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
 ; SI-GISEL-NEXT:    buffer_store_dword v2, off, s[0:3], 0 offset:8
 ; SI-GISEL-NEXT:    s_endpgm
@@ -1872,72 +1849,66 @@ define amdgpu_kernel void @s_exp_v4f32(ptr addrspace(1) %out, <4 x float> %in) {
 ; GFX900-GISEL-LABEL: s_exp_v4f32:
 ; GFX900-GISEL:       ; %bb.0:
 ; GFX900-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x34
-; GFX900-GISEL-NEXT:    s_mov_b32 s2, 0x3fb8aa3b
-; GFX900-GISEL-NEXT:    v_mov_b32_e32 v2, 0x32a5705f
-; GFX900-GISEL-NEXT:    v_mov_b32_e32 v4, 0xc2ce8ed0
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v2, 0x3fb8aa3b
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v3, 0x32a5705f
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v5, 0x42b17218
+; GFX900-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
 ; GFX900-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX900-GISEL-NEXT:    v_mov_b32_e32 v0, s4
-; GFX900-GISEL-NEXT:    v_mul_f32_e32 v1, 0x3fb8aa3b, v0
-; GFX900-GISEL-NEXT:    v_fma_f32 v0, v0, s2, -v1
-; GFX900-GISEL-NEXT:    v_rndne_f32_e32 v3, v1
-; GFX900-GISEL-NEXT:    v_fma_f32 v0, s4, v2, v0
-; GFX900-GISEL-NEXT:    v_sub_f32_e32 v1, v1, v3
-; GFX900-GISEL-NEXT:    v_add_f32_e32 v0, v1, v0
-; GFX900-GISEL-NEXT:    v_cvt_i32_f32_e32 v1, v3
+; GFX900-GISEL-NEXT:    v_mul_f32_e32 v0, s4, v2
+; GFX900-GISEL-NEXT:    v_fma_f32 v1, s4, v2, -v0
+; GFX900-GISEL-NEXT:    v_rndne_f32_e32 v4, v0
+; GFX900-GISEL-NEXT:    v_fma_f32 v1, s4, v3, v1
+; GFX900-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v4
+; GFX900-GISEL-NEXT:    v_add_f32_e32 v0, v0, v1
+; GFX900-GISEL-NEXT:    v_cvt_i32_f32_e32 v1, v4
 ; GFX900-GISEL-NEXT:    v_exp_f32_e32 v0, v0
-; GFX900-GISEL-NEXT:    v_mov_b32_e32 v3, 0x3fb8aa3b
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v4, 0xc2ce8ed0
 ; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, s4, v4
-; GFX900-GISEL-NEXT:    s_mov_b32 s2, 0xc2ce8ed0
 ; GFX900-GISEL-NEXT:    v_ldexp_f32 v0, v0, v1
-; GFX900-GISEL-NEXT:    v_mul_f32_e32 v1, s5, v3
-; GFX900-GISEL-NEXT:    v_fma_f32 v7, s5, v3, -v1
-; GFX900-GISEL-NEXT:    v_rndne_f32_e32 v8, v1
-; GFX900-GISEL-NEXT:    v_fma_f32 v7, s5, v2, v7
-; GFX900-GISEL-NEXT:    v_sub_f32_e32 v1, v1, v8
-; GFX900-GISEL-NEXT:    v_add_f32_e32 v1, v1, v7
-; GFX900-GISEL-NEXT:    v_cvt_i32_f32_e32 v7, v8
+; GFX900-GISEL-NEXT:    v_mul_f32_e32 v1, s5, v2
+; GFX900-GISEL-NEXT:    v_fma_f32 v6, s5, v2, -v1
+; GFX900-GISEL-NEXT:    v_rndne_f32_e32 v7, v1
+; GFX900-GISEL-NEXT:    v_fma_f32 v6, s5, v3, v6
+; GFX900-GISEL-NEXT:    v_sub_f32_e32 v1, v1, v7
+; GFX900-GISEL-NEXT:    v_add_f32_e32 v1, v1, v6
+; GFX900-GISEL-NEXT:    v_cvt_i32_f32_e32 v6, v7
 ; GFX900-GISEL-NEXT:    v_exp_f32_e32 v1, v1
 ; GFX900-GISEL-NEXT:    v_cndmask_b32_e64 v0, v0, 0, vcc
-; GFX900-GISEL-NEXT:    v_mov_b32_e32 v6, 0x7f800000
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v7, 0x7f800000
 ; GFX900-GISEL-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v5
-; GFX900-GISEL-NEXT:    v_mov_b32_e32 v8, s5
-; GFX900-GISEL-NEXT:    s_mov_b32 s3, 0x42b17218
-; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v0, v0, v6, vcc
-; GFX900-GISEL-NEXT:    v_ldexp_f32 v1, v1, v7
-; GFX900-GISEL-NEXT:    v_mul_f32_e32 v7, s6, v3
-; GFX900-GISEL-NEXT:    v_cmp_gt_f32_e32 vcc, s2, v8
-; GFX900-GISEL-NEXT:    v_fma_f32 v9, s6, v3, -v7
-; GFX900-GISEL-NEXT:    v_rndne_f32_e32 v10, v7
+; GFX900-GISEL-NEXT:    v_ldexp_f32 v1, v1, v6
+; GFX900-GISEL-NEXT:    v_mul_f32_e32 v6, s6, v2
+; GFX900-GISEL-NEXT:    v_fma_f32 v8, s6, v2, -v6
+; GFX900-GISEL-NEXT:    v_rndne_f32_e32 v9, v6
+; GFX900-GISEL-NEXT:    v_fma_f32 v8, s6, v3, v8
+; GFX900-GISEL-NEXT:    v_sub_f32_e32 v6, v6, v9
+; GFX900-GISEL-NEXT:    v_add_f32_e32 v6, v6, v8
+; GFX900-GISEL-NEXT:    v_cvt_i32_f32_e32 v8, v9
+; GFX900-GISEL-NEXT:    v_exp_f32_e32 v6, v6
+; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v0, v0, v7, vcc
+; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, s5, v4
 ; GFX900-GISEL-NEXT:    v_cndmask_b32_e64 v1, v1, 0, vcc
-; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, s3, v8
-; GFX900-GISEL-NEXT:    v_mul_f32_e32 v8, s7, v3
-; GFX900-GISEL-NEXT:    v_fma_f32 v9, s6, v2, v9
-; GFX900-GISEL-NEXT:    v_sub_f32_e32 v7, v7, v10
-; GFX900-GISEL-NEXT:    v_fma_f32 v3, s7, v3, -v8
-; GFX900-GISEL-NEXT:    v_add_f32_e32 v7, v7, v9
-; GFX900-GISEL-NEXT:    v_fma_f32 v2, s7, v2, v3
+; GFX900-GISEL-NEXT:    v_ldexp_f32 v6, v6, v8
+; GFX900-GISEL-NEXT:    v_mul_f32_e32 v8, s7, v2
+; GFX900-GISEL-NEXT:    v_fma_f32 v2, s7, v2, -v8
+; GFX900-GISEL-NEXT:    v_fma_f32 v2, s7, v3, v2
 ; GFX900-GISEL-NEXT:    v_rndne_f32_e32 v3, v8
-; GFX900-GISEL-NEXT:    v_cvt_i32_f32_e32 v9, v10
-; GFX900-GISEL-NEXT:    v_exp_f32_e32 v7, v7
 ; GFX900-GISEL-NEXT:    v_sub_f32_e32 v8, v8, v3
 ; GFX900-GISEL-NEXT:    v_add_f32_e32 v2, v8, v2
 ; GFX900-GISEL-NEXT:    v_cvt_i32_f32_e32 v3, v3
 ; GFX900-GISEL-NEXT:    v_exp_f32_e32 v8, v2
-; GFX900-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v1, v1, v6, vcc
-; GFX900-GISEL-NEXT:    v_ldexp_f32 v7, v7, v9
+; GFX900-GISEL-NEXT:    v_cmp_gt_f32_e32 vcc, s5, v5
+; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v1, v1, v7, vcc
 ; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, s6, v4
-; GFX900-GISEL-NEXT:    v_cndmask_b32_e64 v2, v7, 0, vcc
+; GFX900-GISEL-NEXT:    v_cndmask_b32_e64 v2, v6, 0, vcc
 ; GFX900-GISEL-NEXT:    v_cmp_gt_f32_e32 vcc, s6, v5
-; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v2, v2, v6, vcc
+; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v2, v2, v7, vcc
 ; GFX900-GISEL-NEXT:    v_ldexp_f32 v3, v8, v3
 ; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, s7, v4
 ; GFX900-GISEL-NEXT:    v_cndmask_b32_e64 v3, v3, 0, vcc
 ; GFX900-GISEL-NEXT:    v_cmp_gt_f32_e32 vcc, s7, v5
-; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v3, v3, v6, vcc
+; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v3, v3, v7, vcc
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v4, 0
-; GFX900-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX900-GISEL-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1]
 ; GFX900-GISEL-NEXT:    s_endpgm
 ;
@@ -2012,73 +1983,67 @@ define amdgpu_kernel void @s_exp_v4f32(ptr addrspace(1) %out, <4 x float> %in) {
 ; SI-GISEL-LABEL: s_exp_v4f32:
 ; SI-GISEL:       ; %bb.0:
 ; SI-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0xd
-; SI-GISEL-NEXT:    s_mov_b32 s2, 0x3fb8aa3b
-; SI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x32a5705f
-; SI-GISEL-NEXT:    v_mov_b32_e32 v4, 0xc2ce8ed0
+; SI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x3fb8aa3b
+; SI-GISEL-NEXT:    v_mov_b32_e32 v3, 0x32a5705f
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v5, 0x42b17218
+; SI-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
 ; SI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-GISEL-NEXT:    v_mov_b32_e32 v0, s4
-; SI-GISEL-NEXT:    v_mul_f32_e32 v1, 0x3fb8aa3b, v0
-; SI-GISEL-NEXT:    v_fma_f32 v0, v0, s2, -v1
-; SI-GISEL-NEXT:    v_rndne_f32_e32 v3, v1
-; SI-GISEL-NEXT:    v_fma_f32 v0, s4, v2, v0
-; SI-GISEL-NEXT:    v_sub_f32_e32 v1, v1, v3
-; SI-GISEL-NEXT:    v_add_f32_e32 v0, v1, v0
-; SI-GISEL-NEXT:    v_cvt_i32_f32_e32 v1, v3
+; SI-GISEL-NEXT:    v_mul_f32_e32 v0, s4, v2
+; SI-GISEL-NEXT:    v_fma_f32 v1, s4, v2, -v0
+; SI-GISEL-NEXT:    v_rndne_f32_e32 v4, v0
+; SI-GISEL-NEXT:    v_fma_f32 v1, s4, v3, v1
+; SI-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v4
+; SI-GISEL-NEXT:    v_add_f32_e32 v0, v0, v1
+; SI-GISEL-NEXT:    v_cvt_i32_f32_e32 v1, v4
 ; SI-GISEL-NEXT:    v_exp_f32_e32 v0, v0
-; SI-GISEL-NEXT:    v_mov_b32_e32 v3, 0x3fb8aa3b
+; SI-GISEL-NEXT:    v_mov_b32_e32 v4, 0xc2ce8ed0
 ; SI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, s4, v4
-; SI-GISEL-NEXT:    s_mov_b32 s2, 0xc2ce8ed0
+; SI-GISEL-NEXT:    s_mov_b32 s2, -1
 ; SI-GISEL-NEXT:    v_ldexp_f32_e32 v0, v0, v1
-; SI-GISEL-NEXT:    v_mul_f32_e32 v1, s5, v3
-; SI-GISEL-NEXT:    v_fma_f32 v7, s5, v3, -v1
-; SI-GISEL-NEXT:    v_rndne_f32_e32 v8, v1
-; SI-GISEL-NEXT:    v_fma_f32 v7, s5, v2, v7
-; SI-GISEL-NEXT:    v_sub_f32_e32 v1, v1, v8
-; SI-GISEL-NEXT:    v_add_f32_e32 v1, v1, v7
-; SI-GISEL-NEXT:    v_cvt_i32_f32_e32 v7, v8
+; SI-GISEL-NEXT:    v_mul_f32_e32 v1, s5, v2
+; SI-GISEL-NEXT:    v_fma_f32 v6, s5, v2, -v1
+; SI-GISEL-NEXT:    v_rndne_f32_e32 v7, v1
+; SI-GISEL-NEXT:    v_fma_f32 v6, s5, v3, v6
+; SI-GISEL-NEXT:    v_sub_f32_e32 v1, v1, v7
+; SI-GISEL-NEXT:    v_add_f32_e32 v1, v1, v6
+; SI-GISEL-NEXT:    v_cvt_i32_f32_e32 v6, v7
 ; SI-GISEL-NEXT:    v_exp_f32_e32 v1, v1
 ; SI-GISEL-NEXT:    v_cndmask_b32_e64 v0, v0, 0, vcc
-; SI-GISEL-NEXT:    v_mov_b32_e32 v6, 0x7f800000
+; SI-GISEL-NEXT:    v_mov_b32_e32 v7, 0x7f800000
 ; SI-GISEL-NEXT:    v_cmp_gt_f32_e32 vcc, s4, v5
-; SI-GISEL-NEXT:    v_mov_b32_e32 v8, s5
-; SI-GISEL-NEXT:    s_mov_b32 s3, 0x42b17218
-; SI-GISEL-NEXT:    v_cndmask_b32_e32 v0, v0, v6, vcc
-; SI-GISEL-NEXT:    v_ldexp_f32_e32 v1, v1, v7
-; SI-GISEL-NEXT:    v_mul_f32_e32 v7, s6, v3
-; SI-GISEL-NEXT:    v_cmp_gt_f32_e32 vcc, s2, v8
-; SI-GISEL-NEXT:    v_fma_f32 v9, s6, v3, -v7
-; SI-GISEL-NEXT:    v_rndne_f32_e32 v10, v7
+; SI-GISEL-NEXT:    v_ldexp_f32_e32 v1, v1, v6
+; SI-GISEL-NEXT:    v_mul_f32_e32 v6, s6, v2
+; SI-GISEL-NEXT:    v_fma_f32 v8, s6, v2, -v6
+; SI-GISEL-NEXT:    v_rndne_f32_e32 v9, v6
+; SI-GISEL-NEXT:    v_fma_f32 v8, s6, v3, v8
+; SI-GISEL-NEXT:    v_sub_f32_e32 v6, v6, v9
+; SI-GISEL-NEXT:    v_add_f32_e32 v6, v6, v8
+; SI-GISEL-NEXT:    v_cvt_i32_f32_e32 v8, v9
+; SI-GISEL-NEXT:    v_exp_f32_e32 v6, v6
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v0, v0, v7, vcc
+; SI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, s5, v4
 ; SI-GISEL-NEXT:    v_cndmask_b32_e64 v1, v1, 0, vcc
-; SI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, s3, v8
-; SI-GISEL-NEXT:    v_mul_f32_e32 v8, s7, v3
-; SI-GISEL-NEXT:    v_fma_f32 v9, s6, v2, v9
-; SI-GISEL-NEXT:    v_sub_f32_e32 v7, v7, v10
-; SI-GISEL-NEXT:    v_fma_f32 v3, s7, v3, -v8
-; SI-GISEL-NEXT:    v_add_f32_e32 v7, v7, v9
-; SI-GISEL-NEXT:    v_fma_f32 v2, s7, v2, v3
+; SI-GISEL-NEXT:    v_ldexp_f32_e32 v6, v6, v8
+; SI-GISEL-NEXT:    v_mul_f32_e32 v8, s7, v2
+; SI-GISEL-NEXT:    v_fma_f32 v2, s7, v2, -v8
+; SI-GISEL-NEXT:    v_fma_f32 v2, s7, v3, v2
 ; SI-GISEL-NEXT:    v_rndne_f32_e32 v3, v8
-; SI-GISEL-NEXT:    v_cvt_i32_f32_e32 v9, v10
-; SI-GISEL-NEXT:    v_exp_f32_e32 v7, v7
 ; SI-GISEL-NEXT:    v_sub_f32_e32 v8, v8, v3
 ; SI-GISEL-NEXT:    v_add_f32_e32 v2, v8, v2
 ; SI-GISEL-NEXT:    v_cvt_i32_f32_e32 v3, v3
 ; SI-GISEL-NEXT:    v_exp_f32_e32 v8, v2
-; SI-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x9
-; SI-GISEL-NEXT:    v_cndmask_b32_e32 v1, v1, v6, vcc
-; SI-GISEL-NEXT:    v_ldexp_f32_e32 v7, v7, v9
+; SI-GISEL-NEXT:    v_cmp_gt_f32_e32 vcc, s5, v5
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v1, v1, v7, vcc
 ; SI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, s6, v4
-; SI-GISEL-NEXT:    v_cndmask_b32_e64 v2, v7, 0, vcc
+; SI-GISEL-NEXT:    v_cndmask_b32_e64 v2, v6, 0, vcc
 ; SI-GISEL-NEXT:    v_cmp_gt_f32_e32 vcc, s6, v5
-; SI-GISEL-NEXT:    v_cndmask_b32_e32 v2, v2, v6, vcc
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v2, v2, v7, vcc
 ; SI-GISEL-NEXT:    v_ldexp_f32_e32 v3, v8, v3
 ; SI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, s7, v4
 ; SI-GISEL-NEXT:    v_cndmask_b32_e64 v3, v3, 0, vcc
 ; SI-GISEL-NEXT:    v_cmp_gt_f32_e32 vcc, s7, v5
-; SI-GISEL-NEXT:    v_cndmask_b32_e32 v3, v3, v6, vcc
-; SI-GISEL-NEXT:    s_mov_b32 s2, -1
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v3, v3, v7, vcc
 ; SI-GISEL-NEXT:    s_mov_b32 s3, 0xf000
-; SI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-GISEL-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
 ; SI-GISEL-NEXT:    s_endpgm
 ;
@@ -2648,14 +2613,14 @@ define float @v_exp_f32(float %in) {
 ; GFX900-GISEL-LABEL: v_exp_f32:
 ; GFX900-GISEL:       ; %bb.0:
 ; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-GISEL-NEXT:    s_mov_b32 s4, 0x3fb8aa3b
-; GFX900-GISEL-NEXT:    v_mul_f32_e32 v1, 0x3fb8aa3b, v0
-; GFX900-GISEL-NEXT:    v_fma_f32 v2, v0, s4, -v1
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x3fb8aa3b
+; GFX900-GISEL-NEXT:    v_mul_f32_e32 v2, 0x3fb8aa3b, v0
+; GFX900-GISEL-NEXT:    v_fma_f32 v1, v0, v1, -v2
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v3, 0x32a5705f
-; GFX900-GISEL-NEXT:    v_fma_f32 v2, v0, v3, v2
-; GFX900-GISEL-NEXT:    v_rndne_f32_e32 v3, v1
-; GFX900-GISEL-NEXT:    v_sub_f32_e32 v1, v1, v3
-; GFX900-GISEL-NEXT:    v_add_f32_e32 v1, v1, v2
+; GFX900-GISEL-NEXT:    v_fma_f32 v1, v0, v3, v1
+; GFX900-GISEL-NEXT:    v_rndne_f32_e32 v3, v2
+; GFX900-GISEL-NEXT:    v_sub_f32_e32 v2, v2, v3
+; GFX900-GISEL-NEXT:    v_add_f32_e32 v1, v2, v1
 ; GFX900-GISEL-NEXT:    v_cvt_i32_f32_e32 v2, v3
 ; GFX900-GISEL-NEXT:    v_exp_f32_e32 v1, v1
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v3, 0x7f800000
@@ -2694,14 +2659,14 @@ define float @v_exp_f32(float %in) {
 ; SI-GISEL-LABEL: v_exp_f32:
 ; SI-GISEL:       ; %bb.0:
 ; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-GISEL-NEXT:    s_mov_b32 s4, 0x3fb8aa3b
-; SI-GISEL-NEXT:    v_mul_f32_e32 v1, 0x3fb8aa3b, v0
-; SI-GISEL-NEXT:    v_fma_f32 v2, v0, s4, -v1
+; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x3fb8aa3b
+; SI-GISEL-NEXT:    v_mul_f32_e32 v2, 0x3fb8aa3b, v0
+; SI-GISEL-NEXT:    v_fma_f32 v1, v0, v1, -v2
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v3, 0x32a5705f
-; SI-GISEL-NEXT:    v_fma_f32 v2, v0, v3, v2
-; SI-GISEL-NEXT:    v_rndne_f32_e32 v3, v1
-; SI-GISEL-NEXT:    v_sub_f32_e32 v1, v1, v3
-; SI-GISEL-NEXT:    v_add_f32_e32 v1, v1, v2
+; SI-GISEL-NEXT:    v_fma_f32 v1, v0, v3, v1
+; SI-GISEL-NEXT:    v_rndne_f32_e32 v3, v2
+; SI-GISEL-NEXT:    v_sub_f32_e32 v2, v2, v3
+; SI-GISEL-NEXT:    v_add_f32_e32 v1, v2, v1
 ; SI-GISEL-NEXT:    v_cvt_i32_f32_e32 v2, v3
 ; SI-GISEL-NEXT:    v_exp_f32_e32 v1, v1
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v3, 0x7f800000
@@ -2808,14 +2773,14 @@ define float @v_exp_fabs_f32(float %in) {
 ; GFX900-GISEL-LABEL: v_exp_fabs_f32:
 ; GFX900-GISEL:       ; %bb.0:
 ; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-GISEL-NEXT:    s_mov_b32 s4, 0x3fb8aa3b
-; GFX900-GISEL-NEXT:    v_mul_f32_e64 v1, |v0|, s4
-; GFX900-GISEL-NEXT:    v_fma_f32 v2, |v0|, s4, -v1
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x3fb8aa3b
+; GFX900-GISEL-NEXT:    v_mul_f32_e64 v2, |v0|, v1
+; GFX900-GISEL-NEXT:    v_fma_f32 v1, |v0|, v1, -v2
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v3, 0x32a5705f
-; GFX900-GISEL-NEXT:    v_fma_f32 v2, |v0|, v3, v2
-; GFX900-GISEL-NEXT:    v_rndne_f32_e32 v3, v1
-; GFX900-GISEL-NEXT:    v_sub_f32_e32 v1, v1, v3
-; GFX900-GISEL-NEXT:    v_add_f32_e32 v1, v1, v2
+; GFX900-GISEL-NEXT:    v_fma_f32 v1, |v0|, v3, v1
+; GFX900-GISEL-NEXT:    v_rndne_f32_e32 v3, v2
+; GFX900-GISEL-NEXT:    v_sub_f32_e32 v2, v2, v3
+; GFX900-GISEL-NEXT:    v_add_f32_e32 v1, v2, v1
 ; GFX900-GISEL-NEXT:    v_cvt_i32_f32_e32 v2, v3
 ; GFX900-GISEL-NEXT:    v_exp_f32_e32 v1, v1
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v3, 0x7f800000
@@ -2854,14 +2819,14 @@ define float @v_exp_fabs_f32(float %in) {
 ; SI-GISEL-LABEL: v_exp_fabs_f32:
 ; SI-GISEL:       ; %bb.0:
 ; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-GISEL-NEXT:    s_mov_b32 s4, 0x3fb8aa3b
-; SI-GISEL-NEXT:    v_mul_f32_e64 v1, |v0|, s4
-; SI-GISEL-NEXT:    v_fma_f32 v2, |v0|, s4, -v1
+; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x3fb8aa3b
+; SI-GISEL-NEXT:    v_mul_f32_e64 v2, |v0|, v1
+; SI-GISEL-NEXT:    v_fma_f32 v1, |v0|, v1, -v2
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v3, 0x32a5705f
-; SI-GISEL-NEXT:    v_fma_f32 v2, |v0|, v3, v2
-; SI-GISEL-NEXT:    v_rndne_f32_e32 v3, v1
-; SI-GISEL-NEXT:    v_sub_f32_e32 v1, v1, v3
-; SI-GISEL-NEXT:    v_add_f32_e32 v1, v1, v2
+; SI-GISEL-NEXT:    v_fma_f32 v1, |v0|, v3, v1
+; SI-GISEL-NEXT:    v_rndne_f32_e32 v3, v2
+; SI-GISEL-NEXT:    v_sub_f32_e32 v2, v2, v3
+; SI-GISEL-NEXT:    v_add_f32_e32 v1, v2, v1
 ; SI-GISEL-NEXT:    v_cvt_i32_f32_e32 v2, v3
 ; SI-GISEL-NEXT:    v_exp_f32_e32 v1, v1
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v3, 0x7f800000
@@ -2969,14 +2934,14 @@ define float @v_exp_fneg_fabs_f32(float %in) {
 ; GFX900-GISEL-LABEL: v_exp_fneg_fabs_f32:
 ; GFX900-GISEL:       ; %bb.0:
 ; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-GISEL-NEXT:    s_mov_b32 s4, 0x3fb8aa3b
-; GFX900-GISEL-NEXT:    v_mul_f32_e64 v1, -|v0|, s4
-; GFX900-GISEL-NEXT:    v_fma_f32 v2, -|v0|, s4, -v1
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x3fb8aa3b
+; GFX900-GISEL-NEXT:    v_mul_f32_e64 v2, -|v0|, v1
+; GFX900-GISEL-NEXT:    v_fma_f32 v1, -|v0|, v1, -v2
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v3, 0x32a5705f
-; GFX900-GISEL-NEXT:    v_fma_f32 v2, -|v0|, v3, v2
-; GFX900-GISEL-NEXT:    v_rndne_f32_e32 v3, v1
-; GFX900-GISEL-NEXT:    v_sub_f32_e32 v1, v1, v3
-; GFX900-GISEL-NEXT:    v_add_f32_e32 v1, v1, v2
+; GFX900-GISEL-NEXT:    v_fma_f32 v1, -|v0|, v3, v1
+; GFX900-GISEL-NEXT:    v_rndne_f32_e32 v3, v2
+; GFX900-GISEL-NEXT:    v_sub_f32_e32 v2, v2, v3
+; GFX900-GISEL-NEXT:    v_add_f32_e32 v1, v2, v1
 ; GFX900-GISEL-NEXT:    v_cvt_i32_f32_e32 v2, v3
 ; GFX900-GISEL-NEXT:    v_exp_f32_e32 v1, v1
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v3, 0x7f800000
@@ -3015,14 +2980,14 @@ define float @v_exp_fneg_fabs_f32(float %in) {
 ; SI-GISEL-LABEL: v_exp_fneg_fabs_f32:
 ; SI-GISEL:       ; %bb.0:
 ; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-GISEL-NEXT:    s_mov_b32 s4, 0x3fb8aa3b
-; SI-GISEL-NEXT:    v_mul_f32_e64 v1, -|v0|, s4
-; SI-GISEL-NEXT:    v_fma_f32 v2, -|v0|, s4, -v1
+; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x3fb8aa3b
+; SI-GISEL-NEXT:    v_mul_f32_e64 v2, -|v0|, v1
+; SI-GISEL-NEXT:    v_fma_f32 v1, -|v0|, v1, -v2
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v3, 0x32a5705f
-; SI-GISEL-NEXT:    v_fma_f32 v2, -|v0|, v3, v2
-; SI-GISEL-NEXT:    v_rndne_f32_e32 v3, v1
-; SI-GISEL-NEXT:    v_sub_f32_e32 v1, v1, v3
-; SI-GISEL-NEXT:    v_add_f32_e32 v1, v1, v2
+; SI-GISEL-NEXT:    v_fma_f32 v1, -|v0|, v3, v1
+; SI-GISEL-NEXT:    v_rndne_f32_e32 v3, v2
+; SI-GISEL-NEXT:    v_sub_f32_e32 v2, v2, v3
+; SI-GISEL-NEXT:    v_add_f32_e32 v1, v2, v1
 ; SI-GISEL-NEXT:    v_cvt_i32_f32_e32 v2, v3
 ; SI-GISEL-NEXT:    v_exp_f32_e32 v1, v1
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v3, 0x7f800000
@@ -3131,14 +3096,14 @@ define float @v_exp_fneg_f32(float %in) {
 ; GFX900-GISEL-LABEL: v_exp_fneg_f32:
 ; GFX900-GISEL:       ; %bb.0:
 ; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-GISEL-NEXT:    s_mov_b32 s4, 0x3fb8aa3b
-; GFX900-GISEL-NEXT:    v_mul_f32_e64 v1, -v0, s4
-; GFX900-GISEL-NEXT:    v_fma_f32 v2, -v0, s4, -v1
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x3fb8aa3b
+; GFX900-GISEL-NEXT:    v_mul_f32_e64 v2, -v0, v1
+; GFX900-GISEL-NEXT:    v_fma_f32 v1, -v0, v1, -v2
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v3, 0x32a5705f
-; GFX900-GISEL-NEXT:    v_fma_f32 v2, -v0, v3, v2
-; GFX900-GISEL-NEXT:    v_rndne_f32_e32 v3, v1
-; GFX900-GISEL-NEXT:    v_sub_f32_e32 v1, v1, v3
-; GFX900-GISEL-NEXT:    v_add_f32_e32 v1, v1, v2
+; GFX900-GISEL-NEXT:    v_fma_f32 v1, -v0, v3, v1
+; GFX900-GISEL-NEXT:    v_rndne_f32_e32 v3, v2
+; GFX900-GISEL-NEXT:    v_sub_f32_e32 v2, v2, v3
+; GFX900-GISEL-NEXT:    v_add_f32_e32 v1, v2, v1
 ; GFX900-GISEL-NEXT:    v_cvt_i32_f32_e32 v2, v3
 ; GFX900-GISEL-NEXT:    v_exp_f32_e32 v1, v1
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v3, 0x7f800000
@@ -3177,14 +3142,14 @@ define float @v_exp_fneg_f32(float %in) {
 ; SI-GISEL-LABEL: v_exp_fneg_f32:
 ; SI-GISEL:       ; %bb.0:
 ; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-GISEL-NEXT:    s_mov_b32 s4, 0x3fb8aa3b
-; SI-GISEL-NEXT:    v_mul_f32_e64 v1, -v0, s4
-; SI-GISEL-NEXT:    v_fma_f32 v2, -v0, s4, -v1
+; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x3fb8aa3b
+; SI-GISEL-NEXT:    v_mul_f32_e64 v2, -v0, v1
+; SI-GISEL-NEXT:    v_fma_f32 v1, -v0, v1, -v2
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v3, 0x32a5705f
-; SI-GISEL-NEXT:    v_fma_f32 v2, -v0, v3, v2
-; SI-GISEL-NEXT:    v_rndne_f32_e32 v3, v1
-; SI-GISEL-NEXT:    v_sub_f32_e32 v1, v1, v3
-; SI-GISEL-NEXT:    v_add_f32_e32 v1, v1, v2
+; SI-GISEL-NEXT:    v_fma_f32 v1, -v0, v3, v1
+; SI-GISEL-NEXT:    v_rndne_f32_e32 v3, v2
+; SI-GISEL-NEXT:    v_sub_f32_e32 v2, v2, v3
+; SI-GISEL-NEXT:    v_add_f32_e32 v1, v2, v1
 ; SI-GISEL-NEXT:    v_cvt_i32_f32_e32 v2, v3
 ; SI-GISEL-NEXT:    v_exp_f32_e32 v1, v1
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v3, 0x7f800000
@@ -3476,14 +3441,14 @@ define float @v_exp_f32_ninf(float %in) {
 ; GFX900-GISEL-LABEL: v_exp_f32_ninf:
 ; GFX900-GISEL:       ; %bb.0:
 ; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-GISEL-NEXT:    s_mov_b32 s4, 0x3fb8aa3b
-; GFX900-GISEL-NEXT:    v_mul_f32_e32 v1, 0x3fb8aa3b, v0
-; GFX900-GISEL-NEXT:    v_fma_f32 v2, v0, s4, -v1
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x3fb8aa3b
+; GFX900-GISEL-NEXT:    v_mul_f32_e32 v2, 0x3fb8aa3b, v0
+; GFX900-GISEL-NEXT:    v_fma_f32 v1, v0, v1, -v2
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v3, 0x32a5705f
-; GFX900-GISEL-NEXT:    v_fma_f32 v2, v0, v3, v2
-; GFX900-GISEL-NEXT:    v_rndne_f32_e32 v3, v1
-; GFX900-GISEL-NEXT:    v_sub_f32_e32 v1, v1, v3
-; GFX900-GISEL-NEXT:    v_add_f32_e32 v1, v1, v2
+; GFX900-GISEL-NEXT:    v_fma_f32 v1, v0, v3, v1
+; GFX900-GISEL-NEXT:    v_rndne_f32_e32 v3, v2
+; GFX900-GISEL-NEXT:    v_sub_f32_e32 v2, v2, v3
+; GFX900-GISEL-NEXT:    v_add_f32_e32 v1, v2, v1
 ; GFX900-GISEL-NEXT:    v_cvt_i32_f32_e32 v2, v3
 ; GFX900-GISEL-NEXT:    v_exp_f32_e32 v1, v1
 ; GFX900-GISEL-NEXT:    v_ldexp_f32 v1, v1, v2
@@ -3514,14 +3479,14 @@ define float @v_exp_f32_ninf(float %in) {
 ; SI-GISEL-LABEL: v_exp_f32_ninf:
 ; SI-GISEL:       ; %bb.0:
 ; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-GISEL-NEXT:    s_mov_b32 s4, 0x3fb8aa3b
-; SI-GISEL-NEXT:    v_mul_f32_e32 v1, 0x3fb8aa3b, v0
-; SI-GISEL-NEXT:    v_fma_f32 v2, v0, s4, -v1
+; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x3fb8aa3b
+; SI-GISEL-NEXT:    v_mul_f32_e32 v2, 0x3fb8aa3b, v0
+; SI-GISEL-NEXT:    v_fma_f32 v1, v0, v1, -v2
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v3, 0x32a5705f
-; SI-GISEL-NEXT:    v_fma_f32 v2, v0, v3, v2
-; SI-GISEL-NEXT:    v_rndne_f32_e32 v3, v1
-; SI-GISEL-NEXT:    v_sub_f32_e32 v1, v1, v3
-; SI-GISEL-NEXT:    v_add_f32_e32 v1, v1, v2
+; SI-GISEL-NEXT:    v_fma_f32 v1, v0, v3, v1
+; SI-GISEL-NEXT:    v_rndne_f32_e32 v3, v2
+; SI-GISEL-NEXT:    v_sub_f32_e32 v2, v2, v3
+; SI-GISEL-NEXT:    v_add_f32_e32 v1, v2, v1
 ; SI-GISEL-NEXT:    v_cvt_i32_f32_e32 v2, v3
 ; SI-GISEL-NEXT:    v_exp_f32_e32 v1, v1
 ; SI-GISEL-NEXT:    v_ldexp_f32_e32 v1, v1, v2
@@ -3853,14 +3818,14 @@ define float @v_exp_f32_daz(float %in) #0 {
 ; GFX900-GISEL-LABEL: v_exp_f32_daz:
 ; GFX900-GISEL:       ; %bb.0:
 ; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-GISEL-NEXT:    s_mov_b32 s4, 0x3fb8aa3b
-; GFX900-GISEL-NEXT:    v_mul_f32_e32 v1, 0x3fb8aa3b, v0
-; GFX900-GISEL-NEXT:    v_fma_f32 v2, v0, s4, -v1
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x3fb8aa3b
+; GFX900-GISEL-NEXT:    v_mul_f32_e32 v2, 0x3fb8aa3b, v0
+; GFX900-GISEL-NEXT:    v_fma_f32 v1, v0, v1, -v2
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v3, 0x32a5705f
-; GFX900-GISEL-NEXT:    v_fma_f32 v2, v0, v3, v2
-; GFX900-GISEL-NEXT:    v_rndne_f32_e32 v3, v1
-; GFX900-GISEL-NEXT:    v_sub_f32_e32 v1, v1, v3
-; GFX900-GISEL-NEXT:    v_add_f32_e32 v1, v1, v2
+; GFX900-GISEL-NEXT:    v_fma_f32 v1, v0, v3, v1
+; GFX900-GISEL-NEXT:    v_rndne_f32_e32 v3, v2
+; GFX900-GISEL-NEXT:    v_sub_f32_e32 v2, v2, v3
+; GFX900-GISEL-NEXT:    v_add_f32_e32 v1, v2, v1
 ; GFX900-GISEL-NEXT:    v_cvt_i32_f32_e32 v2, v3
 ; GFX900-GISEL-NEXT:    v_exp_f32_e32 v1, v1
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v3, 0x7f800000
@@ -3899,14 +3864,14 @@ define float @v_exp_f32_daz(float %in) #0 {
 ; SI-GISEL-LABEL: v_exp_f32_daz:
 ; SI-GISEL:       ; %bb.0:
 ; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-GISEL-NEXT:    s_mov_b32 s4, 0x3fb8aa3b
-; SI-GISEL-NEXT:    v_mul_f32_e32 v1, 0x3fb8aa3b, v0
-; SI-GISEL-NEXT:    v_fma_f32 v2, v0, s4, -v1
+; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x3fb8aa3b
+; SI-GISEL-NEXT:    v_mul_f32_e32 v2, 0x3fb8aa3b, v0
+; SI-GISEL-NEXT:    v_fma_f32 v1, v0, v1, -v2
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v3, 0x32a5705f
-; SI-GISEL-NEXT:    v_fma_f32 v2, v0, v3, v2
-; SI-GISEL-NEXT:    v_rndne_f32_e32 v3, v1
-; SI-GISEL-NEXT:    v_sub_f32_e32 v1, v1, v3
-; SI-GISEL-NEXT:    v_add_f32_e32 v1, v1, v2
+; SI-GISEL-NEXT:    v_fma_f32 v1, v0, v3, v1
+; SI-GISEL-NEXT:    v_rndne_f32_e32 v3, v2
+; SI-GISEL-NEXT:    v_sub_f32_e32 v2, v2, v3
+; SI-GISEL-NEXT:    v_add_f32_e32 v1, v2, v1
 ; SI-GISEL-NEXT:    v_cvt_i32_f32_e32 v2, v3
 ; SI-GISEL-NEXT:    v_exp_f32_e32 v1, v1
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v3, 0x7f800000
@@ -4011,14 +3976,14 @@ define float @v_exp_f32_nnan(float %in) {
 ; GFX900-GISEL-LABEL: v_exp_f32_nnan:
 ; GFX900-GISEL:       ; %bb.0:
 ; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-GISEL-NEXT:    s_mov_b32 s4, 0x3fb8aa3b
-; GFX900-GISEL-NEXT:    v_mul_f32_e32 v1, 0x3fb8aa3b, v0
-; GFX900-GISEL-NEXT:    v_fma_f32 v2, v0, s4, -v1
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x3fb8aa3b
+; GFX900-GISEL-NEXT:    v_mul_f32_e32 v2, 0x3fb8aa3b, v0
+; GFX900-GISEL-NEXT:    v_fma_f32 v1, v0, v1, -v2
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v3, 0x32a5705f
-; GFX900-GISEL-NEXT:    v_fma_f32 v2, v0, v3, v2
-; GFX900-GISEL-NEXT:    v_rndne_f32_e32 v3, v1
-; GFX900-GISEL-NEXT:    v_sub_f32_e32 v1, v1, v3
-; GFX900-GISEL-NEXT:    v_add_f32_e32 v1, v1, v2
+; GFX900-GISEL-NEXT:    v_fma_f32 v1, v0, v3, v1
+; GFX900-GISEL-NEXT:    v_rndne_f32_e32 v3, v2
+; GFX900-GISEL-NEXT:    v_sub_f32_e32 v2, v2, v3
+; GFX900-GISEL-NEXT:    v_add_f32_e32 v1, v2, v1
 ; GFX900-GISEL-NEXT:    v_cvt_i32_f32_e32 v2, v3
 ; GFX900-GISEL-NEXT:    v_exp_f32_e32 v1, v1
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v3, 0x7f800000
@@ -4057,14 +4022,14 @@ define float @v_exp_f32_nnan(float %in) {
 ; SI-GISEL-LABEL: v_exp_f32_nnan:
 ; SI-GISEL:       ; %bb.0:
 ; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-GISEL-NEXT:    s_mov_b32 s4, 0x3fb8aa3b
-; SI-GISEL-NEXT:    v_mul_f32_e32 v1, 0x3fb8aa3b, v0
-; SI-GISEL-NEXT:    v_fma_f32 v2, v0, s4, -v1
+; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x3fb8aa3b
+; SI-GISEL-NEXT:    v_mul_f32_e32 v2, 0x3fb8aa3b, v0
+; SI-GISEL-NEXT:    v_fma_f32 v1, v0, v1, -v2
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v3, 0x32a5705f
-; SI-GISEL-NEXT:    v_fma_f32 v2, v0, v3, v2
-; SI-GISEL-NEXT:    v_rndne_f32_e32 v3, v1
-; SI-GISEL-NEXT:    v_sub_f32_e32 v1, v1, v3
-; SI-GISEL-NEXT:    v_add_f32_e32 v1, v1, v2
+; SI-GISEL-NEXT:    v_fma_f32 v1, v0, v3, v1
+; SI-GISEL-NEXT:    v_rndne_f32_e32 v3, v2
+; SI-GISEL-NEXT:    v_sub_f32_e32 v2, v2, v3
+; SI-GISEL-NEXT:    v_add_f32_e32 v1, v2, v1
 ; SI-GISEL-NEXT:    v_cvt_i32_f32_e32 v2, v3
 ; SI-GISEL-NEXT:    v_exp_f32_e32 v1, v1
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v3, 0x7f800000
@@ -4169,14 +4134,14 @@ define float @v_exp_f32_nnan_daz(float %in) #0 {
 ; GFX900-GISEL-LABEL: v_exp_f32_nnan_daz:
 ; GFX900-GISEL:       ; %bb.0:
 ; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-GISEL-NEXT:    s_mov_b32 s4, 0x3fb8aa3b
-; GFX900-GISEL-NEXT:    v_mul_f32_e32 v1, 0x3fb8aa3b, v0
-; GFX900-GISEL-NEXT:    v_fma_f32 v2, v0, s4, -v1
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x3fb8aa3b
+; GFX900-GISEL-NEXT:    v_mul_f32_e32 v2, 0x3fb8aa3b, v0
+; GFX900-GISEL-NEXT:    v_fma_f32 v1, v0, v1, -v2
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v3, 0x32a5705f
-; GFX900-GISEL-NEXT:    v_fma_f32 v2, v0, v3, v2
-; GFX900-GISEL-NEXT:    v_rndne_f32_e32 v3, v1
-; GFX900-GISEL-NEXT:    v_sub_f32_e32 v1, v1, v3
-; GFX900-GISEL-NEXT:    v_add_f32_e32 v1, v1, v2
+; GFX900-GISEL-NEXT:    v_fma_f32 v1, v0, v3, v1
+; GFX900-GISEL-NEXT:    v_rndne_f32_e32 v3, v2
+; GFX900-GISEL-NEXT:    v_sub_f32_e32 v2, v2, v3
+; GFX900-GISEL-NEXT:    v_add_f32_e32 v1, v2, v1
 ; GFX900-GISEL-NEXT:    v_cvt_i32_f32_e32 v2, v3
 ; GFX900-GISEL-NEXT:    v_exp_f32_e32 v1, v1
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v3, 0x7f800000
@@ -4215,14 +4180,14 @@ define float @v_exp_f32_nnan_daz(float %in) #0 {
 ; SI-GISEL-LABEL: v_exp_f32_nnan_daz:
 ; SI-GISEL:       ; %bb.0:
 ; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-GISEL-NEXT:    s_mov_b32 s4, 0x3fb8aa3b
-; SI-GISEL-NEXT:    v_mul_f32_e32 v1, 0x3fb8aa3b, v0
-; SI-GISEL-NEXT:    v_fma_f32 v2, v0, s4, -v1
+; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x3fb8aa3b
+; SI-GISEL-NEXT:    v_mul_f32_e32 v2, 0x3fb8aa3b, v0
+; SI-GISEL-NEXT:    v_fma_f32 v1, v0, v1, -v2
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v3, 0x32a5705f
-; SI-GISEL-NEXT:    v_fma_f32 v2, v0, v3, v2
-; SI-GISEL-NEXT:    v_rndne_f32_e32 v3, v1
-; SI-GISEL-NEXT:    v_sub_f32_e32 v1, v1, v3
-; SI-GISEL-NEXT:    v_add_f32_e32 v1, v1, v2
+; SI-GISEL-NEXT:    v_fma_f32 v1, v0, v3, v1
+; SI-GISEL-NEXT:    v_rndne_f32_e32 v3, v2
+; SI-GISEL-NEXT:    v_sub_f32_e32 v2, v2, v3
+; SI-GISEL-NEXT:    v_add_f32_e32 v1, v2, v1
 ; SI-GISEL-NEXT:    v_cvt_i32_f32_e32 v2, v3
 ; SI-GISEL-NEXT:    v_exp_f32_e32 v1, v1
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v3, 0x7f800000
@@ -4327,14 +4292,14 @@ define float @v_exp_f32_nnan_dynamic(float %in) #1 {
 ; GFX900-GISEL-LABEL: v_exp_f32_nnan_dynamic:
 ; GFX900-GISEL:       ; %bb.0:
 ; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-GISEL-NEXT:    s_mov_b32 s4, 0x3fb8aa3b
-; GFX900-GISEL-NEXT:    v_mul_f32_e32 v1, 0x3fb8aa3b, v0
-; GFX900-GISEL-NEXT:    v_fma_f32 v2, v0, s4, -v1
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x3fb8aa3b
+; GFX900-GISEL-NEXT:    v_mul_f32_e32 v2, 0x3fb8aa3b, v0
+; GFX900-GISEL-NEXT:    v_fma_f32 v1, v0, v1, -v2
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v3, 0x32a5705f
-; GFX900-GISEL-NEXT:    v_fma_f32 v2, v0, v3, v2
-; GFX900-GISEL-NEXT:    v_rndne_f32_e32 v3, v1
-; GFX900-GISEL-NEXT:    v_sub_f32_e32 v1, v1, v3
-; GFX900-GISEL-NEXT:    v_add_f32_e32 v1, v1, v2
+; GFX900-GISEL-NEXT:    v_fma_f32 v1, v0, v3, v1
+; GFX900-GISEL-NEXT:    v_rndne_f32_e32 v3, v2
+; GFX900-GISEL-NEXT:    v_sub_f32_e32 v2, v2, v3
+; GFX900-GISEL-NEXT:    v_add_f32_e32 v1, v2, v1
 ; GFX900-GISEL-NEXT:    v_cvt_i32_f32_e32 v2, v3
 ; GFX900-GISEL-NEXT:    v_exp_f32_e32 v1, v1
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v3, 0x7f800000
@@ -4373,14 +4338,14 @@ define float @v_exp_f32_nnan_dynamic(float %in) #1 {
 ; SI-GISEL-LABEL: v_exp_f32_nnan_dynamic:
 ; SI-GISEL:       ; %bb.0:
 ; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-GISEL-NEXT:    s_mov_b32 s4, 0x3fb8aa3b
-; SI-GISEL-NEXT:    v_mul_f32_e32 v1, 0x3fb8aa3b, v0
-; SI-GISEL-NEXT:    v_fma_f32 v2, v0, s4, -v1
+; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x3fb8aa3b
+; SI-GISEL-NEXT:    v_mul_f32_e32 v2, 0x3fb8aa3b, v0
+; SI-GISEL-NEXT:    v_fma_f32 v1, v0, v1, -v2
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v3, 0x32a5705f
-; SI-GISEL-NEXT:    v_fma_f32 v2, v0, v3, v2
-; SI-GISEL-NEXT:    v_rndne_f32_e32 v3, v1
-; SI-GISEL-NEXT:    v_sub_f32_e32 v1, v1, v3
-; SI-GISEL-NEXT:    v_add_f32_e32 v1, v1, v2
+; SI-GISEL-NEXT:    v_fma_f32 v1, v0, v3, v1
+; SI-GISEL-NEXT:    v_rndne_f32_e32 v3, v2
+; SI-GISEL-NEXT:    v_sub_f32_e32 v2, v2, v3
+; SI-GISEL-NEXT:    v_add_f32_e32 v1, v2, v1
 ; SI-GISEL-NEXT:    v_cvt_i32_f32_e32 v2, v3
 ; SI-GISEL-NEXT:    v_exp_f32_e32 v1, v1
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v3, 0x7f800000
@@ -4473,14 +4438,14 @@ define float @v_exp_f32_ninf_daz(float %in) #0 {
 ; GFX900-GISEL-LABEL: v_exp_f32_ninf_daz:
 ; GFX900-GISEL:       ; %bb.0:
 ; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-GISEL-NEXT:    s_mov_b32 s4, 0x3fb8aa3b
-; GFX900-GISEL-NEXT:    v_mul_f32_e32 v1, 0x3fb8aa3b, v0
-; GFX900-GISEL-NEXT:    v_fma_f32 v2, v0, s4, -v1
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x3fb8aa3b
+; GFX900-GISEL-NEXT:    v_mul_f32_e32 v2, 0x3fb8aa3b, v0
+; GFX900-GISEL-NEXT:    v_fma_f32 v1, v0, v1, -v2
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v3, 0x32a5705f
-; GFX900-GISEL-NEXT:    v_fma_f32 v2, v0, v3, v2
-; GFX900-GISEL-NEXT:    v_rndne_f32_e32 v3, v1
-; GFX900-GISEL-NEXT:    v_sub_f32_e32 v1, v1, v3
-; GFX900-GISEL-NEXT:    v_add_f32_e32 v1, v1, v2
+; GFX900-GISEL-NEXT:    v_fma_f32 v1, v0, v3, v1
+; GFX900-GISEL-NEXT:    v_rndne_f32_e32 v3, v2
+; GFX900-GISEL-NEXT:    v_sub_f32_e32 v2, v2, v3
+; GFX900-GISEL-NEXT:    v_add_f32_e32 v1, v2, v1
 ; GFX900-GISEL-NEXT:    v_cvt_i32_f32_e32 v2, v3
 ; GFX900-GISEL-NEXT:    v_exp_f32_e32 v1, v1
 ; GFX900-GISEL-NEXT:    v_ldexp_f32 v1, v1, v2
@@ -4511,14 +4476,14 @@ define float @v_exp_f32_ninf_daz(float %in) #0 {
 ; SI-GISEL-LABEL: v_exp_f32_ninf_daz:
 ; SI-GISEL:       ; %bb.0:
 ; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-GISEL-NEXT:    s_mov_b32 s4, 0x3fb8aa3b
-; SI-GISEL-NEXT:    v_mul_f32_e32 v1, 0x3fb8aa3b, v0
-; SI-GISEL-NEXT:    v_fma_f32 v2, v0, s4, -v1
+; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x3fb8aa3b
+; SI-GISEL-NEXT:    v_mul_f32_e32 v2, 0x3fb8aa3b, v0
+; SI-GISEL-NEXT:    v_fma_f32 v1, v0, v1, -v2
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v3, 0x32a5705f
-; SI-GISEL-NEXT:    v_fma_f32 v2, v0, v3, v2
-; SI-GISEL-NEXT:    v_rndne_f32_e32 v3, v1
-; SI-GISEL-NEXT:    v_sub_f32_e32 v1, v1, v3
-; SI-GISEL-NEXT:    v_add_f32_e32 v1, v1, v2
+; SI-GISEL-NEXT:    v_fma_f32 v1, v0, v3, v1
+; SI-GISEL-NEXT:    v_rndne_f32_e32 v3, v2
+; SI-GISEL-NEXT:    v_sub_f32_e32 v2, v2, v3
+; SI-GISEL-NEXT:    v_add_f32_e32 v1, v2, v1
 ; SI-GISEL-NEXT:    v_cvt_i32_f32_e32 v2, v3
 ; SI-GISEL-NEXT:    v_exp_f32_e32 v1, v1
 ; SI-GISEL-NEXT:    v_ldexp_f32_e32 v1, v1, v2
@@ -4607,14 +4572,14 @@ define float @v_exp_f32_ninf_dynamic(float %in) #1 {
 ; GFX900-GISEL-LABEL: v_exp_f32_ninf_dynamic:
 ; GFX900-GISEL:       ; %bb.0:
 ; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-GISEL-NEXT:    s_mov_b32 s4, 0x3fb8aa3b
-; GFX900-GISEL-NEXT:    v_mul_f32_e32 v1, 0x3fb8aa3b, v0
-; GFX900-GISEL-NEXT:    v_fma_f32 v2, v0, s4, -v1
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x3fb8aa3b
+; GFX900-GISEL-NEXT:    v_mul_f32_e32 v2, 0x3fb8aa3b, v0
+; GFX900-GISEL-NEXT:    v_fma_f32 v1, v0, v1, -v2
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v3, 0x32a5705f
-; GFX900-GISEL-NEXT:    v_fma_f32 v2, v0, v3, v2
-; GFX900-GISEL-NEXT:    v_rndne_f32_e32 v3, v1
-; GFX900-GISEL-NEXT:    v_sub_f32_e32 v1, v1, v3
-; GFX900-GISEL-NEXT:    v_add_f32_e32 v1, v1, v2
+; GFX900-GISEL-NEXT:    v_fma_f32 v1, v0, v3, v1
+; GFX900-GISEL-NEXT:    v_rndne_f32_e32 v3, v2
+; GFX900-GISEL-NEXT:    v_sub_f32_e32 v2, v2, v3
+; GFX900-GISEL-NEXT:    v_add_f32_e32 v1, v2, v1
 ; GFX900-GISEL-NEXT:    v_cvt_i32_f32_e32 v2, v3
 ; GFX900-GISEL-NEXT:    v_exp_f32_e32 v1, v1
 ; GFX900-GISEL-NEXT:    v_ldexp_f32 v1, v1, v2
@@ -4645,14 +4610,14 @@ define float @v_exp_f32_ninf_dynamic(float %in) #1 {
 ; SI-GISEL-LABEL: v_exp_f32_ninf_dynamic:
 ; SI-GISEL:       ; %bb.0:
 ; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-GISEL-NEXT:    s_mov_b32 s4, 0x3fb8aa3b
-; SI-GISEL-NEXT:    v_mul_f32_e32 v1, 0x3fb8aa3b, v0
-; SI-GISEL-NEXT:    v_fma_f32 v2, v0, s4, -v1
+; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x3fb8aa3b
+; SI-GISEL-NEXT:    v_mul_f32_e32 v2, 0x3fb8aa3b, v0
+; SI-GISEL-NEXT:    v_fma_f32 v1, v0, v1, -v2
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v3, 0x32a5705f
-; SI-GISEL-NEXT:    v_fma_f32 v2, v0, v3, v2
-; SI-GISEL-NEXT:    v_rndne_f32_e32 v3, v1
-; SI-GISEL-NEXT:    v_sub_f32_e32 v1, v1, v3
-; SI-GISEL-NEXT:    v_add_f32_e32 v1, v1, v2
+; SI-GISEL-NEXT:    v_fma_f32 v1, v0, v3, v1
+; SI-GISEL-NEXT:    v_rndne_f32_e32 v3, v2
+; SI-GISEL-NEXT:    v_sub_f32_e32 v2, v2, v3
+; SI-GISEL-NEXT:    v_add_f32_e32 v1, v2, v1
 ; SI-GISEL-NEXT:    v_cvt_i32_f32_e32 v2, v3
 ; SI-GISEL-NEXT:    v_exp_f32_e32 v1, v1
 ; SI-GISEL-NEXT:    v_ldexp_f32_e32 v1, v1, v2
@@ -4741,14 +4706,14 @@ define float @v_exp_f32_nnan_ninf(float %in) {
 ; GFX900-GISEL-LABEL: v_exp_f32_nnan_ninf:
 ; GFX900-GISEL:       ; %bb.0:
 ; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-GISEL-NEXT:    s_mov_b32 s4, 0x3fb8aa3b
-; GFX900-GISEL-NEXT:    v_mul_f32_e32 v1, 0x3fb8aa3b, v0
-; GFX900-GISEL-NEXT:    v_fma_f32 v2, v0, s4, -v1
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x3fb8aa3b
+; GFX900-GISEL-NEXT:    v_mul_f32_e32 v2, 0x3fb8aa3b, v0
+; GFX900-GISEL-NEXT:    v_fma_f32 v1, v0, v1, -v2
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v3, 0x32a5705f
-; GFX900-GISEL-NEXT:    v_fma_f32 v2, v0, v3, v2
-; GFX900-GISEL-NEXT:    v_rndne_f32_e32 v3, v1
-; GFX900-GISEL-NEXT:    v_sub_f32_e32 v1, v1, v3
-; GFX900-GISEL-NEXT:    v_add_f32_e32 v1, v1, v2
+; GFX900-GISEL-NEXT:    v_fma_f32 v1, v0, v3, v1
+; GFX900-GISEL-NEXT:    v_rndne_f32_e32 v3, v2
+; GFX900-GISEL-NEXT:    v_sub_f32_e32 v2, v2, v3
+; GFX900-GISEL-NEXT:    v_add_f32_e32 v1, v2, v1
 ; GFX900-GISEL-NEXT:    v_cvt_i32_f32_e32 v2, v3
 ; GFX900-GISEL-NEXT:    v_exp_f32_e32 v1, v1
 ; GFX900-GISEL-NEXT:    v_ldexp_f32 v1, v1, v2
@@ -4779,14 +4744,14 @@ define float @v_exp_f32_nnan_ninf(float %in) {
 ; SI-GISEL-LABEL: v_exp_f32_nnan_ninf:
 ; SI-GISEL:       ; %bb.0:
 ; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-GISEL-NEXT:    s_mov_b32 s4, 0x3fb8aa3b
-; SI-GISEL-NEXT:    v_mul_f32_e32 v1, 0x3fb8aa3b, v0
-; SI-GISEL-NEXT:    v_fma_f32 v2, v0, s4, -v1
+; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x3fb8aa3b
+; SI-GISEL-NEXT:    v_mul_f32_e32 v2, 0x3fb8aa3b, v0
+; SI-GISEL-NEXT:    v_fma_f32 v1, v0, v1, -v2
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v3, 0x32a5705f
-; SI-GISEL-NEXT:    v_fma_f32 v2, v0, v3, v2
-; SI-GISEL-NEXT:    v_rndne_f32_e32 v3, v1
-; SI-GISEL-NEXT:    v_sub_f32_e32 v1, v1, v3
-; SI-GISEL-NEXT:    v_add_f32_e32 v1, v1, v2
+; SI-GISEL-NEXT:    v_fma_f32 v1, v0, v3, v1
+; SI-GISEL-NEXT:    v_rndne_f32_e32 v3, v2
+; SI-GISEL-NEXT:    v_sub_f32_e32 v2, v2, v3
+; SI-GISEL-NEXT:    v_add_f32_e32 v1, v2, v1
 ; SI-GISEL-NEXT:    v_cvt_i32_f32_e32 v2, v3
 ; SI-GISEL-NEXT:    v_exp_f32_e32 v1, v1
 ; SI-GISEL-NEXT:    v_ldexp_f32_e32 v1, v1, v2
@@ -4875,14 +4840,14 @@ define float @v_exp_f32_nnan_ninf_daz(float %in) #0 {
 ; GFX900-GISEL-LABEL: v_exp_f32_nnan_ninf_daz:
 ; GFX900-GISEL:       ; %bb.0:
 ; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-GISEL-NEXT:    s_mov_b32 s4, 0x3fb8aa3b
-; GFX900-GISEL-NEXT:    v_mul_f32_e32 v1, 0x3fb8aa3b, v0
-; GFX900-GISEL-NEXT:    v_fma_f32 v2, v0, s4, -v1
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x3fb8aa3b
+; GFX900-GISEL-NEXT:    v_mul_f32_e32 v2, 0x3fb8aa3b, v0
+; GFX900-GISEL-NEXT:    v_fma_f32 v1, v0, v1, -v2
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v3, 0x32a5705f
-; GFX900-GISEL-NEXT:    v_fma_f32 v2, v0, v3, v2
-; GFX900-GISEL-NEXT:    v_rndne_f32_e32 v3, v1
-; GFX900-GISEL-NEXT:    v_sub_f32_e32 v1, v1, v3
-; GFX900-GISEL-NEXT:    v_add_f32_e32 v1, v1, v2
+; GFX900-GISEL-NEXT:    v_fma_f32 v1, v0, v3, v1
+; GFX900-GISEL-NEXT:    v_rndne_f32_e32 v3, v2
+; GFX900-GISEL-NEXT:    v_sub_f32_e32 v2, v2, v3
+; GFX900-GISEL-NEXT:    v_add_f32_e32 v1, v2, v1
 ; GFX900-GISEL-NEXT:    v_cvt_i32_f32_e32 v2, v3
 ; GFX900-GISEL-NEXT:    v_exp_f32_e32 v1, v1
 ; GFX900-GISEL-NEXT:    v_ldexp_f32 v1, v1, v2
@@ -4913,14 +4878,14 @@ define float @v_exp_f32_nnan_ninf_daz(float %in) #0 {
 ; SI-GISEL-LABEL: v_exp_f32_nnan_ninf_daz:
 ; SI-GISEL:       ; %bb.0:
 ; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-GISEL-NEXT:    s_mov_b32 s4, 0x3fb8aa3b
-; SI-GISEL-NEXT:    v_mul_f32_e32 v1, 0x3fb8aa3b, v0
-; SI-GISEL-NEXT:    v_fma_f32 v2, v0, s4, -v1
+; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x3fb8aa3b
+; SI-GISEL-NEXT:    v_mul_f32_e32 v2, 0x3fb8aa3b, v0
+; SI-GISEL-NEXT:    v_fma_f32 v1, v0, v1, -v2
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v3, 0x32a5705f
-; SI-GISEL-NEXT:    v_fma_f32 v2, v0, v3, v2
-; SI-GISEL-NEXT:    v_rndne_f32_e32 v3, v1
-; SI-GISEL-NEXT:    v_sub_f32_e32 v1, v1, v3
-; SI-GISEL-NEXT:    v_add_f32_e32 v1, v1, v2
+; SI-GISEL-NEXT:    v_fma_f32 v1, v0, v3, v1
+; SI-GISEL-NEXT:    v_rndne_f32_e32 v3, v2
+; SI-GISEL-NEXT:    v_sub_f32_e32 v2, v2, v3
+; SI-GISEL-NEXT:    v_add_f32_e32 v1, v2, v1
 ; SI-GISEL-NEXT:    v_cvt_i32_f32_e32 v2, v3
 ; SI-GISEL-NEXT:    v_exp_f32_e32 v1, v1
 ; SI-GISEL-NEXT:    v_ldexp_f32_e32 v1, v1, v2
@@ -5009,14 +4974,14 @@ define float @v_exp_f32_nnan_ninf_dynamic(float %in) #1 {
 ; GFX900-GISEL-LABEL: v_exp_f32_nnan_ninf_dynamic:
 ; GFX900-GISEL:       ; %bb.0:
 ; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-GISEL-NEXT:    s_mov_b32 s4, 0x3fb8aa3b
-; GFX900-GISEL-NEXT:    v_mul_f32_e32 v1, 0x3fb8aa3b, v0
-; GFX900-GISEL-NEXT:    v_fma_f32 v2, v0, s4, -v1
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x3fb8aa3b
+; GFX900-GISEL-NEXT:    v_mul_f32_e32 v2, 0x3fb8aa3b, v0
+; GFX900-GISEL-NEXT:    v_fma_f32 v1, v0, v1, -v2
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v3, 0x32a5705f
-; GFX900-GISEL-NEXT:    v_fma_f32 v2, v0, v3, v2
-; GFX900-GISEL-NEXT:    v_rndne_f32_e32 v3, v1
-; GFX900-GISEL-NEXT:    v_sub_f32_e32 v1, v1, v3
-; GFX900-GISEL-NEXT:    v_add_f32_e32 v1, v1, v2
+; GFX900-GISEL-NEXT:    v_fma_f32 v1, v0, v3, v1
+; GFX900-GISEL-NEXT:    v_rndne_f32_e32 v3, v2
+; GFX900-GISEL-NEXT:    v_sub_f32_e32 v2, v2, v3
+; GFX900-GISEL-NEXT:    v_add_f32_e32 v1, v2, v1
 ; GFX900-GISEL-NEXT:    v_cvt_i32_f32_e32 v2, v3
 ; GFX900-GISEL-NEXT:    v_exp_f32_e32 v1, v1
 ; GFX900-GISEL-NEXT:    v_ldexp_f32 v1, v1, v2
@@ -5047,14 +5012,14 @@ define float @v_exp_f32_nnan_ninf_dynamic(float %in) #1 {
 ; SI-GISEL-LABEL: v_exp_f32_nnan_ninf_dynamic:
 ; SI-GISEL:       ; %bb.0:
 ; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-GISEL-NEXT:    s_mov_b32 s4, 0x3fb8aa3b
-; SI-GISEL-NEXT:    v_mul_f32_e32 v1, 0x3fb8aa3b, v0
-; SI-GISEL-NEXT:    v_fma_f32 v2, v0, s4, -v1
+; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x3fb8aa3b
+; SI-GISEL-NEXT:    v_mul_f32_e32 v2, 0x3fb8aa3b, v0
+; SI-GISEL-NEXT:    v_fma_f32 v1, v0, v1, -v2
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v3, 0x32a5705f
-; SI-GISEL-NEXT:    v_fma_f32 v2, v0, v3, v2
-; SI-GISEL-NEXT:    v_rndne_f32_e32 v3, v1
-; SI-GISEL-NEXT:    v_sub_f32_e32 v1, v1, v3
-; SI-GISEL-NEXT:    v_add_f32_e32 v1, v1, v2
+; SI-GISEL-NEXT:    v_fma_f32 v1, v0, v3, v1
+; SI-GISEL-NEXT:    v_rndne_f32_e32 v3, v2
+; SI-GISEL-NEXT:    v_sub_f32_e32 v2, v2, v3
+; SI-GISEL-NEXT:    v_add_f32_e32 v1, v2, v1
 ; SI-GISEL-NEXT:    v_cvt_i32_f32_e32 v2, v3
 ; SI-GISEL-NEXT:    v_exp_f32_e32 v1, v1
 ; SI-GISEL-NEXT:    v_ldexp_f32_e32 v1, v1, v2
@@ -5183,14 +5148,14 @@ define float @v_exp_f32_dynamic_mode(float %in) #1 {
 ; GFX900-GISEL-LABEL: v_exp_f32_dynamic_mode:
 ; GFX900-GISEL:       ; %bb.0:
 ; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-GISEL-NEXT:    s_mov_b32 s4, 0x3fb8aa3b
-; GFX900-GISEL-NEXT:    v_mul_f32_e32 v1, 0x3fb8aa3b, v0
-; GFX900-GISEL-NEXT:    v_fma_f32 v2, v0, s4, -v1
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x3fb8aa3b
+; GFX900-GISEL-NEXT:    v_mul_f32_e32 v2, 0x3fb8aa3b, v0
+; GFX900-GISEL-NEXT:    v_fma_f32 v1, v0, v1, -v2
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v3, 0x32a5705f
-; GFX900-GISEL-NEXT:    v_fma_f32 v2, v0, v3, v2
-; GFX900-GISEL-NEXT:    v_rndne_f32_e32 v3, v1
-; GFX900-GISEL-NEXT:    v_sub_f32_e32 v1, v1, v3
-; GFX900-GISEL-NEXT:    v_add_f32_e32 v1, v1, v2
+; GFX900-GISEL-NEXT:    v_fma_f32 v1, v0, v3, v1
+; GFX900-GISEL-NEXT:    v_rndne_f32_e32 v3, v2
+; GFX900-GISEL-NEXT:    v_sub_f32_e32 v2, v2, v3
+; GFX900-GISEL-NEXT:    v_add_f32_e32 v1, v2, v1
 ; GFX900-GISEL-NEXT:    v_cvt_i32_f32_e32 v2, v3
 ; GFX900-GISEL-NEXT:    v_exp_f32_e32 v1, v1
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v3, 0x7f800000
@@ -5229,14 +5194,14 @@ define float @v_exp_f32_dynamic_mode(float %in) #1 {
 ; SI-GISEL-LABEL: v_exp_f32_dynamic_mode:
 ; SI-GISEL:       ; %bb.0:
 ; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-GISEL-NEXT:    s_mov_b32 s4, 0x3fb8aa3b
-; SI-GISEL-NEXT:    v_mul_f32_e32 v1, 0x3fb8aa3b, v0
-; SI-GISEL-NEXT:    v_fma_f32 v2, v0, s4, -v1
+; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x3fb8aa3b
+; SI-GISEL-NEXT:    v_mul_f32_e32 v2, 0x3fb8aa3b, v0
+; SI-GISEL-NEXT:    v_fma_f32 v1, v0, v1, -v2
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v3, 0x32a5705f
-; SI-GISEL-NEXT:    v_fma_f32 v2, v0, v3, v2
-; SI-GISEL-NEXT:    v_rndne_f32_e32 v3, v1
-; SI-GISEL-NEXT:    v_sub_f32_e32 v1, v1, v3
-; SI-GISEL-NEXT:    v_add_f32_e32 v1, v1, v2
+; SI-GISEL-NEXT:    v_fma_f32 v1, v0, v3, v1
+; SI-GISEL-NEXT:    v_rndne_f32_e32 v3, v2
+; SI-GISEL-NEXT:    v_sub_f32_e32 v2, v2, v3
+; SI-GISEL-NEXT:    v_add_f32_e32 v1, v2, v1
 ; SI-GISEL-NEXT:    v_cvt_i32_f32_e32 v2, v3
 ; SI-GISEL-NEXT:    v_exp_f32_e32 v1, v1
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v3, 0x7f800000
@@ -5320,14 +5285,14 @@ define float @v_exp_f32_undef() {
 ; GFX900-GISEL-LABEL: v_exp_f32_undef:
 ; GFX900-GISEL:       ; %bb.0:
 ; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-GISEL-NEXT:    s_mov_b32 s4, 0x3fb8aa3b
-; GFX900-GISEL-NEXT:    v_mul_f32_e32 v0, 0x3fb8aa3b, v0
-; GFX900-GISEL-NEXT:    v_fma_f32 v1, v0, s4, -v0
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v0, 0x3fb8aa3b
+; GFX900-GISEL-NEXT:    v_mul_f32_e32 v1, s4, v0
+; GFX900-GISEL-NEXT:    v_fma_f32 v0, s4, v0, -v1
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v2, 0x32a5705f
-; GFX900-GISEL-NEXT:    v_fma_f32 v1, s4, v2, v1
-; GFX900-GISEL-NEXT:    v_rndne_f32_e32 v2, v0
-; GFX900-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v2
-; GFX900-GISEL-NEXT:    v_add_f32_e32 v0, v0, v1
+; GFX900-GISEL-NEXT:    v_fma_f32 v0, s4, v2, v0
+; GFX900-GISEL-NEXT:    v_rndne_f32_e32 v2, v1
+; GFX900-GISEL-NEXT:    v_sub_f32_e32 v1, v1, v2
+; GFX900-GISEL-NEXT:    v_add_f32_e32 v0, v1, v0
 ; GFX900-GISEL-NEXT:    v_cvt_i32_f32_e32 v1, v2
 ; GFX900-GISEL-NEXT:    v_exp_f32_e32 v0, v0
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v2, 0x7f800000
@@ -5359,14 +5324,14 @@ define float @v_exp_f32_undef() {
 ; SI-GISEL-LABEL: v_exp_f32_undef:
 ; SI-GISEL:       ; %bb.0:
 ; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-GISEL-NEXT:    s_mov_b32 s4, 0x3fb8aa3b
-; SI-GISEL-NEXT:    v_mul_f32_e32 v0, 0x3fb8aa3b, v0
-; SI-GISEL-NEXT:    v_fma_f32 v1, v0, s4, -v0
+; SI-GISEL-NEXT:    v_mov_b32_e32 v0, 0x3fb8aa3b
+; SI-GISEL-NEXT:    v_mul_f32_e32 v1, s4, v0
+; SI-GISEL-NEXT:    v_fma_f32 v0, s4, v0, -v1
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x32a5705f
-; SI-GISEL-NEXT:    v_fma_f32 v1, s4, v2, v1
-; SI-GISEL-NEXT:    v_rndne_f32_e32 v2, v0
-; SI-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v2
-; SI-GISEL-NEXT:    v_add_f32_e32 v0, v0, v1
+; SI-GISEL-NEXT:    v_fma_f32 v0, s4, v2, v0
+; SI-GISEL-NEXT:    v_rndne_f32_e32 v2, v1
+; SI-GISEL-NEXT:    v_sub_f32_e32 v1, v1, v2
+; SI-GISEL-NEXT:    v_add_f32_e32 v0, v1, v0
 ; SI-GISEL-NEXT:    v_cvt_i32_f32_e32 v1, v2
 ; SI-GISEL-NEXT:    v_exp_f32_e32 v0, v0
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x7f800000
@@ -5501,13 +5466,13 @@ define float @v_exp_f32_from_fpext_f16(i16 %src.i) {
 ; GFX900-GISEL:       ; %bb.0:
 ; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-GISEL-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; GFX900-GISEL-NEXT:    s_mov_b32 s4, 0x3fb8aa3b
-; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x32a5705f
-; GFX900-GISEL-NEXT:    v_mul_f32_e32 v2, 0x3fb8aa3b, v0
-; GFX900-GISEL-NEXT:    v_fma_f32 v3, v0, s4, -v2
-; GFX900-GISEL-NEXT:    v_rndne_f32_e32 v4, v2
-; GFX900-GISEL-NEXT:    v_fma_f32 v1, v0, v1, v3
-; GFX900-GISEL-NEXT:    v_sub_f32_e32 v2, v2, v4
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x3fb8aa3b
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v2, 0x32a5705f
+; GFX900-GISEL-NEXT:    v_mul_f32_e32 v3, 0x3fb8aa3b, v0
+; GFX900-GISEL-NEXT:    v_fma_f32 v1, v0, v1, -v3
+; GFX900-GISEL-NEXT:    v_rndne_f32_e32 v4, v3
+; GFX900-GISEL-NEXT:    v_fma_f32 v1, v0, v2, v1
+; GFX900-GISEL-NEXT:    v_sub_f32_e32 v2, v3, v4
 ; GFX900-GISEL-NEXT:    v_add_f32_e32 v1, v2, v1
 ; GFX900-GISEL-NEXT:    v_cvt_i32_f32_e32 v2, v4
 ; GFX900-GISEL-NEXT:    v_exp_f32_e32 v1, v1
@@ -5549,13 +5514,13 @@ define float @v_exp_f32_from_fpext_f16(i16 %src.i) {
 ; SI-GISEL:       ; %bb.0:
 ; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; SI-GISEL-NEXT:    s_mov_b32 s4, 0x3fb8aa3b
-; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x32a5705f
-; SI-GISEL-NEXT:    v_mul_f32_e32 v2, 0x3fb8aa3b, v0
-; SI-GISEL-NEXT:    v_fma_f32 v3, v0, s4, -v2
-; SI-GISEL-NEXT:    v_rndne_f32_e32 v4, v2
-; SI-GISEL-NEXT:    v_fma_f32 v1, v0, v1, v3
-; SI-GISEL-NEXT:    v_sub_f32_e32 v2, v2, v4
+; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x3fb8aa3b
+; SI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x32a5705f
+; SI-GISEL-NEXT:    v_mul_f32_e32 v3, 0x3fb8aa3b, v0
+; SI-GISEL-NEXT:    v_fma_f32 v1, v0, v1, -v3
+; SI-GISEL-NEXT:    v_rndne_f32_e32 v4, v3
+; SI-GISEL-NEXT:    v_fma_f32 v1, v0, v2, v1
+; SI-GISEL-NEXT:    v_sub_f32_e32 v2, v3, v4
 ; SI-GISEL-NEXT:    v_add_f32_e32 v1, v2, v1
 ; SI-GISEL-NEXT:    v_cvt_i32_f32_e32 v2, v4
 ; SI-GISEL-NEXT:    v_exp_f32_e32 v1, v1
@@ -5671,15 +5636,15 @@ define float @v_exp_f32_from_fpext_math_f16(i16 %src0.i, i16 %src1.i) {
 ; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-GISEL-NEXT:    v_add_f16_e32 v0, v0, v1
 ; GFX900-GISEL-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; GFX900-GISEL-NEXT:    s_mov_b32 s4, 0x3fb8aa3b
-; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x32a5705f
-; GFX900-GISEL-NEXT:    v_mul_f32_e32 v2, 0x3fb8aa3b, v0
-; GFX900-GISEL-NEXT:    v_fma_f32 v3, v0, s4, -v2
-; GFX900-GISEL-NEXT:    v_fma_f32 v1, v0, v1, v3
-; GFX900-GISEL-NEXT:    v_rndne_f32_e32 v3, v2
-; GFX900-GISEL-NEXT:    v_sub_f32_e32 v2, v2, v3
-; GFX900-GISEL-NEXT:    v_add_f32_e32 v1, v2, v1
-; GFX900-GISEL-NEXT:    v_cvt_i32_f32_e32 v2, v3
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x3fb8aa3b
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v2, 0x32a5705f
+; GFX900-GISEL-NEXT:    v_mul_f32_e32 v3, 0x3fb8aa3b, v0
+; GFX900-GISEL-NEXT:    v_fma_f32 v1, v0, v1, -v3
+; GFX900-GISEL-NEXT:    v_fma_f32 v1, v0, v2, v1
+; GFX900-GISEL-NEXT:    v_rndne_f32_e32 v2, v3
+; GFX900-GISEL-NEXT:    v_sub_f32_e32 v3, v3, v2
+; GFX900-GISEL-NEXT:    v_add_f32_e32 v1, v3, v1
+; GFX900-GISEL-NEXT:    v_cvt_i32_f32_e32 v2, v2
 ; GFX900-GISEL-NEXT:    v_exp_f32_e32 v1, v1
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v3, 0x7f800000
 ; GFX900-GISEL-NEXT:    v_ldexp_f32 v1, v1, v2
@@ -5722,17 +5687,17 @@ define float @v_exp_f32_from_fpext_math_f16(i16 %src0.i, i16 %src1.i) {
 ; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; SI-GISEL-NEXT:    v_cvt_f32_f16_e32 v1, v1
-; SI-GISEL-NEXT:    s_mov_b32 s4, 0x3fb8aa3b
+; SI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x32a5705f
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v5, 0x7f800000
 ; SI-GISEL-NEXT:    v_add_f32_e32 v0, v0, v1
 ; SI-GISEL-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x32a5705f
+; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x3fb8aa3b
 ; SI-GISEL-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; SI-GISEL-NEXT:    v_mul_f32_e32 v2, 0x3fb8aa3b, v0
-; SI-GISEL-NEXT:    v_fma_f32 v3, v0, s4, -v2
-; SI-GISEL-NEXT:    v_rndne_f32_e32 v4, v2
-; SI-GISEL-NEXT:    v_fma_f32 v1, v0, v1, v3
-; SI-GISEL-NEXT:    v_sub_f32_e32 v2, v2, v4
+; SI-GISEL-NEXT:    v_mul_f32_e32 v3, 0x3fb8aa3b, v0
+; SI-GISEL-NEXT:    v_fma_f32 v1, v0, v1, -v3
+; SI-GISEL-NEXT:    v_rndne_f32_e32 v4, v3
+; SI-GISEL-NEXT:    v_fma_f32 v1, v0, v2, v1
+; SI-GISEL-NEXT:    v_sub_f32_e32 v2, v3, v4
 ; SI-GISEL-NEXT:    v_add_f32_e32 v1, v2, v1
 ; SI-GISEL-NEXT:    v_cvt_i32_f32_e32 v3, v4
 ; SI-GISEL-NEXT:    v_exp_f32_e32 v1, v1
@@ -5991,15 +5956,15 @@ define float @v_exp_f32_from_fpext_math_f16_daz(i16 %src0.i, i16 %src1.i) #0 {
 ; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-GISEL-NEXT:    v_add_f16_e32 v0, v0, v1
 ; GFX900-GISEL-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; GFX900-GISEL-NEXT:    s_mov_b32 s4, 0x3fb8aa3b
-; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x32a5705f
-; GFX900-GISEL-NEXT:    v_mul_f32_e32 v2, 0x3fb8aa3b, v0
-; GFX900-GISEL-NEXT:    v_fma_f32 v3, v0, s4, -v2
-; GFX900-GISEL-NEXT:    v_fma_f32 v1, v0, v1, v3
-; GFX900-GISEL-NEXT:    v_rndne_f32_e32 v3, v2
-; GFX900-GISEL-NEXT:    v_sub_f32_e32 v2, v2, v3
-; GFX900-GISEL-NEXT:    v_add_f32_e32 v1, v2, v1
-; GFX900-GISEL-NEXT:    v_cvt_i32_f32_e32 v2, v3
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x3fb8aa3b
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v2, 0x32a5705f
+; GFX900-GISEL-NEXT:    v_mul_f32_e32 v3, 0x3fb8aa3b, v0
+; GFX900-GISEL-NEXT:    v_fma_f32 v1, v0, v1, -v3
+; GFX900-GISEL-NEXT:    v_fma_f32 v1, v0, v2, v1
+; GFX900-GISEL-NEXT:    v_rndne_f32_e32 v2, v3
+; GFX900-GISEL-NEXT:    v_sub_f32_e32 v3, v3, v2
+; GFX900-GISEL-NEXT:    v_add_f32_e32 v1, v3, v1
+; GFX900-GISEL-NEXT:    v_cvt_i32_f32_e32 v2, v2
 ; GFX900-GISEL-NEXT:    v_exp_f32_e32 v1, v1
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v3, 0x7f800000
 ; GFX900-GISEL-NEXT:    v_ldexp_f32 v1, v1, v2
@@ -6042,17 +6007,17 @@ define float @v_exp_f32_from_fpext_math_f16_daz(i16 %src0.i, i16 %src1.i) #0 {
 ; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; SI-GISEL-NEXT:    v_cvt_f32_f16_e32 v1, v1
-; SI-GISEL-NEXT:    s_mov_b32 s4, 0x3fb8aa3b
+; SI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x32a5705f
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v5, 0x7f800000
 ; SI-GISEL-NEXT:    v_add_f32_e32 v0, v0, v1
 ; SI-GISEL-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x32a5705f
+; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x3fb8aa3b
 ; SI-GISEL-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; SI-GISEL-NEXT:    v_mul_f32_e32 v2, 0x3fb8aa3b, v0
-; SI-GISEL-NEXT:    v_fma_f32 v3, v0, s4, -v2
-; SI-GISEL-NEXT:    v_rndne_f32_e32 v4, v2
-; SI-GISEL-NEXT:    v_fma_f32 v1, v0, v1, v3
-; SI-GISEL-NEXT:    v_sub_f32_e32 v2, v2, v4
+; SI-GISEL-NEXT:    v_mul_f32_e32 v3, 0x3fb8aa3b, v0
+; SI-GISEL-NEXT:    v_fma_f32 v1, v0, v1, -v3
+; SI-GISEL-NEXT:    v_rndne_f32_e32 v4, v3
+; SI-GISEL-NEXT:    v_fma_f32 v1, v0, v2, v1
+; SI-GISEL-NEXT:    v_sub_f32_e32 v2, v3, v4
 ; SI-GISEL-NEXT:    v_add_f32_e32 v1, v2, v1
 ; SI-GISEL-NEXT:    v_cvt_i32_f32_e32 v3, v4
 ; SI-GISEL-NEXT:    v_exp_f32_e32 v1, v1
@@ -6755,12 +6720,12 @@ define <2 x half> @v_exp_v2f16_fast(<2 x half> %in) {
 ; VI-GISEL-LABEL: v_exp_v2f16_fast:
 ; VI-GISEL:       ; %bb.0:
 ; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x3dc5
-; VI-GISEL-NEXT:    v_mul_f16_e32 v1, 0x3dc5, v0
-; VI-GISEL-NEXT:    v_mul_f16_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-GISEL-NEXT:    v_exp_f16_e32 v1, v1
+; VI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x3dc5
+; VI-GISEL-NEXT:    v_mul_f16_e32 v2, 0x3dc5, v0
+; VI-GISEL-NEXT:    v_mul_f16_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-GISEL-NEXT:    v_exp_f16_e32 v2, v2
 ; VI-GISEL-NEXT:    v_exp_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
-; VI-GISEL-NEXT:    v_or_b32_e32 v0, v1, v0
+; VI-GISEL-NEXT:    v_or_b32_e32 v0, v2, v0
 ; VI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-SDAG-LABEL: v_exp_v2f16_fast:
@@ -6776,12 +6741,12 @@ define <2 x half> @v_exp_v2f16_fast(<2 x half> %in) {
 ; GFX900-GISEL-LABEL: v_exp_v2f16_fast:
 ; GFX900-GISEL:       ; %bb.0:
 ; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-GISEL-NEXT:    s_movk_i32 s4, 0x3dc5
-; GFX900-GISEL-NEXT:    v_mul_f16_e32 v1, 0x3dc5, v0
-; GFX900-GISEL-NEXT:    v_mul_f16_sdwa v0, v0, s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX900-GISEL-NEXT:    v_exp_f16_e32 v1, v1
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x3dc5
+; GFX900-GISEL-NEXT:    v_mul_f16_e32 v2, 0x3dc5, v0
+; GFX900-GISEL-NEXT:    v_mul_f16_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX900-GISEL-NEXT:    v_exp_f16_e32 v2, v2
 ; GFX900-GISEL-NEXT:    v_exp_f16_e32 v0, v0
-; GFX900-GISEL-NEXT:    v_lshl_or_b32 v0, v0, 16, v1
+; GFX900-GISEL-NEXT:    v_lshl_or_b32 v0, v0, 16, v2
 ; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SI-SDAG-LABEL: v_exp_v2f16_fast:
@@ -6923,18 +6888,31 @@ define <3 x half> @v_exp_v3f16(<3 x half> %in) {
 }
 
 define <3 x half> @v_exp_v3f16_afn(<3 x half> %in) {
-; VI-LABEL: v_exp_v3f16_afn:
-; VI:       ; %bb.0:
-; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; VI-NEXT:    v_mov_b32_e32 v3, 0x3dc5
-; VI-NEXT:    v_mul_f16_e32 v2, 0x3dc5, v0
-; VI-NEXT:    v_mul_f16_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI-NEXT:    v_exp_f16_e32 v2, v2
-; VI-NEXT:    v_exp_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
-; VI-NEXT:    v_mul_f16_e32 v1, 0x3dc5, v1
-; VI-NEXT:    v_exp_f16_e32 v1, v1
-; VI-NEXT:    v_or_b32_e32 v0, v2, v0
-; VI-NEXT:    s_setpc_b64 s[30:31]
+; VI-SDAG-LABEL: v_exp_v3f16_afn:
+; VI-SDAG:       ; %bb.0:
+; VI-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-SDAG-NEXT:    v_mov_b32_e32 v3, 0x3dc5
+; VI-SDAG-NEXT:    v_mul_f16_e32 v2, 0x3dc5, v0
+; VI-SDAG-NEXT:    v_mul_f16_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-SDAG-NEXT:    v_exp_f16_e32 v2, v2
+; VI-SDAG-NEXT:    v_exp_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
+; VI-SDAG-NEXT:    v_mul_f16_e32 v1, 0x3dc5, v1
+; VI-SDAG-NEXT:    v_exp_f16_e32 v1, v1
+; VI-SDAG-NEXT:    v_or_b32_e32 v0, v2, v0
+; VI-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; VI-GISEL-LABEL: v_exp_v3f16_afn:
+; VI-GISEL:       ; %bb.0:
+; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x3dc5
+; VI-GISEL-NEXT:    v_mul_f16_e32 v3, 0x3dc5, v0
+; VI-GISEL-NEXT:    v_mul_f16_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-GISEL-NEXT:    v_exp_f16_e32 v3, v3
+; VI-GISEL-NEXT:    v_exp_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
+; VI-GISEL-NEXT:    v_mul_f16_e32 v1, 0x3dc5, v1
+; VI-GISEL-NEXT:    v_exp_f16_e32 v1, v1
+; VI-GISEL-NEXT:    v_or_b32_e32 v0, v3, v0
+; VI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-SDAG-LABEL: v_exp_v3f16_afn:
 ; GFX900-SDAG:       ; %bb.0:
@@ -6952,14 +6930,14 @@ define <3 x half> @v_exp_v3f16_afn(<3 x half> %in) {
 ; GFX900-GISEL-LABEL: v_exp_v3f16_afn:
 ; GFX900-GISEL:       ; %bb.0:
 ; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-GISEL-NEXT:    s_movk_i32 s4, 0x3dc5
-; GFX900-GISEL-NEXT:    v_mul_f16_e32 v2, 0x3dc5, v0
-; GFX900-GISEL-NEXT:    v_mul_f16_sdwa v0, v0, s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX900-GISEL-NEXT:    v_exp_f16_e32 v2, v2
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v2, 0x3dc5
+; GFX900-GISEL-NEXT:    v_mul_f16_e32 v3, 0x3dc5, v0
+; GFX900-GISEL-NEXT:    v_mul_f16_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX900-GISEL-NEXT:    v_exp_f16_e32 v3, v3
 ; GFX900-GISEL-NEXT:    v_exp_f16_e32 v0, v0
 ; GFX900-GISEL-NEXT:    v_mul_f16_e32 v1, 0x3dc5, v1
 ; GFX900-GISEL-NEXT:    v_exp_f16_e32 v1, v1
-; GFX900-GISEL-NEXT:    v_lshl_or_b32 v0, v0, 16, v2
+; GFX900-GISEL-NEXT:    v_lshl_or_b32 v0, v0, 16, v3
 ; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SI-SDAG-LABEL: v_exp_v3f16_afn:
@@ -7101,14 +7079,14 @@ define float @v_exp_f32_contract(float %in) {
 ; GFX900-GISEL-LABEL: v_exp_f32_contract:
 ; GFX900-GISEL:       ; %bb.0:
 ; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-GISEL-NEXT:    s_mov_b32 s4, 0x3fb8aa3b
-; GFX900-GISEL-NEXT:    v_mul_f32_e32 v1, 0x3fb8aa3b, v0
-; GFX900-GISEL-NEXT:    v_fma_f32 v2, v0, s4, -v1
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x3fb8aa3b
+; GFX900-GISEL-NEXT:    v_mul_f32_e32 v2, 0x3fb8aa3b, v0
+; GFX900-GISEL-NEXT:    v_fma_f32 v1, v0, v1, -v2
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v3, 0x32a5705f
-; GFX900-GISEL-NEXT:    v_fma_f32 v2, v0, v3, v2
-; GFX900-GISEL-NEXT:    v_rndne_f32_e32 v3, v1
-; GFX900-GISEL-NEXT:    v_sub_f32_e32 v1, v1, v3
-; GFX900-GISEL-NEXT:    v_add_f32_e32 v1, v1, v2
+; GFX900-GISEL-NEXT:    v_fma_f32 v1, v0, v3, v1
+; GFX900-GISEL-NEXT:    v_rndne_f32_e32 v3, v2
+; GFX900-GISEL-NEXT:    v_sub_f32_e32 v2, v2, v3
+; GFX900-GISEL-NEXT:    v_add_f32_e32 v1, v2, v1
 ; GFX900-GISEL-NEXT:    v_cvt_i32_f32_e32 v2, v3
 ; GFX900-GISEL-NEXT:    v_exp_f32_e32 v1, v1
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v3, 0x7f800000
@@ -7147,14 +7125,14 @@ define float @v_exp_f32_contract(float %in) {
 ; SI-GISEL-LABEL: v_exp_f32_contract:
 ; SI-GISEL:       ; %bb.0:
 ; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-GISEL-NEXT:    s_mov_b32 s4, 0x3fb8aa3b
-; SI-GISEL-NEXT:    v_mul_f32_e32 v1, 0x3fb8aa3b, v0
-; SI-GISEL-NEXT:    v_fma_f32 v2, v0, s4, -v1
+; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x3fb8aa3b
+; SI-GISEL-NEXT:    v_mul_f32_e32 v2, 0x3fb8aa3b, v0
+; SI-GISEL-NEXT:    v_fma_f32 v1, v0, v1, -v2
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v3, 0x32a5705f
-; SI-GISEL-NEXT:    v_fma_f32 v2, v0, v3, v2
-; SI-GISEL-NEXT:    v_rndne_f32_e32 v3, v1
-; SI-GISEL-NEXT:    v_sub_f32_e32 v1, v1, v3
-; SI-GISEL-NEXT:    v_add_f32_e32 v1, v1, v2
+; SI-GISEL-NEXT:    v_fma_f32 v1, v0, v3, v1
+; SI-GISEL-NEXT:    v_rndne_f32_e32 v3, v2
+; SI-GISEL-NEXT:    v_sub_f32_e32 v2, v2, v3
+; SI-GISEL-NEXT:    v_add_f32_e32 v1, v2, v1
 ; SI-GISEL-NEXT:    v_cvt_i32_f32_e32 v2, v3
 ; SI-GISEL-NEXT:    v_exp_f32_e32 v1, v1
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v3, 0x7f800000
@@ -7259,14 +7237,14 @@ define float @v_exp_f32_contract_daz(float %in) #0 {
 ; GFX900-GISEL-LABEL: v_exp_f32_contract_daz:
 ; GFX900-GISEL:       ; %bb.0:
 ; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-GISEL-NEXT:    s_mov_b32 s4, 0x3fb8aa3b
-; GFX900-GISEL-NEXT:    v_mul_f32_e32 v1, 0x3fb8aa3b, v0
-; GFX900-GISEL-NEXT:    v_fma_f32 v2, v0, s4, -v1
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x3fb8aa3b
+; GFX900-GISEL-NEXT:    v_mul_f32_e32 v2, 0x3fb8aa3b, v0
+; GFX900-GISEL-NEXT:    v_fma_f32 v1, v0, v1, -v2
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v3, 0x32a5705f
-; GFX900-GISEL-NEXT:    v_fma_f32 v2, v0, v3, v2
-; GFX900-GISEL-NEXT:    v_rndne_f32_e32 v3, v1
-; GFX900-GISEL-NEXT:    v_sub_f32_e32 v1, v1, v3
-; GFX900-GISEL-NEXT:    v_add_f32_e32 v1, v1, v2
+; GFX900-GISEL-NEXT:    v_fma_f32 v1, v0, v3, v1
+; GFX900-GISEL-NEXT:    v_rndne_f32_e32 v3, v2
+; GFX900-GISEL-NEXT:    v_sub_f32_e32 v2, v2, v3
+; GFX900-GISEL-NEXT:    v_add_f32_e32 v1, v2, v1
 ; GFX900-GISEL-NEXT:    v_cvt_i32_f32_e32 v2, v3
 ; GFX900-GISEL-NEXT:    v_exp_f32_e32 v1, v1
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v3, 0x7f800000
@@ -7305,14 +7283,14 @@ define float @v_exp_f32_contract_daz(float %in) #0 {
 ; SI-GISEL-LABEL: v_exp_f32_contract_daz:
 ; SI-GISEL:       ; %bb.0:
 ; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-GISEL-NEXT:    s_mov_b32 s4, 0x3fb8aa3b
-; SI-GISEL-NEXT:    v_mul_f32_e32 v1, 0x3fb8aa3b, v0
-; SI-GISEL-NEXT:    v_fma_f32 v2, v0, s4, -v1
+; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x3fb8aa3b
+; SI-GISEL-NEXT:    v_mul_f32_e32 v2, 0x3fb8aa3b, v0
+; SI-GISEL-NEXT:    v_fma_f32 v1, v0, v1, -v2
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v3, 0x32a5705f
-; SI-GISEL-NEXT:    v_fma_f32 v2, v0, v3, v2
-; SI-GISEL-NEXT:    v_rndne_f32_e32 v3, v1
-; SI-GISEL-NEXT:    v_sub_f32_e32 v1, v1, v3
-; SI-GISEL-NEXT:    v_add_f32_e32 v1, v1, v2
+; SI-GISEL-NEXT:    v_fma_f32 v1, v0, v3, v1
+; SI-GISEL-NEXT:    v_rndne_f32_e32 v3, v2
+; SI-GISEL-NEXT:    v_sub_f32_e32 v2, v2, v3
+; SI-GISEL-NEXT:    v_add_f32_e32 v1, v2, v1
 ; SI-GISEL-NEXT:    v_cvt_i32_f32_e32 v2, v3
 ; SI-GISEL-NEXT:    v_exp_f32_e32 v1, v1
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v3, 0x7f800000
@@ -7405,14 +7383,14 @@ define float @v_exp_f32_contract_nnan_ninf(float %in) {
 ; GFX900-GISEL-LABEL: v_exp_f32_contract_nnan_ninf:
 ; GFX900-GISEL:       ; %bb.0:
 ; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-GISEL-NEXT:    s_mov_b32 s4, 0x3fb8aa3b
-; GFX900-GISEL-NEXT:    v_mul_f32_e32 v1, 0x3fb8aa3b, v0
-; GFX900-GISEL-NEXT:    v_fma_f32 v2, v0, s4, -v1
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x3fb8aa3b
+; GFX900-GISEL-NEXT:    v_mul_f32_e32 v2, 0x3fb8aa3b, v0
+; GFX900-GISEL-NEXT:    v_fma_f32 v1, v0, v1, -v2
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v3, 0x32a5705f
-; GFX900-GISEL-NEXT:    v_fma_f32 v2, v0, v3, v2
-; GFX900-GISEL-NEXT:    v_rndne_f32_e32 v3, v1
-; GFX900-GISEL-NEXT:    v_sub_f32_e32 v1, v1, v3
-; GFX900-GISEL-NEXT:    v_add_f32_e32 v1, v1, v2
+; GFX900-GISEL-NEXT:    v_fma_f32 v1, v0, v3, v1
+; GFX900-GISEL-NEXT:    v_rndne_f32_e32 v3, v2
+; GFX900-GISEL-NEXT:    v_sub_f32_e32 v2, v2, v3
+; GFX900-GISEL-NEXT:    v_add_f32_e32 v1, v2, v1
 ; GFX900-GISEL-NEXT:    v_cvt_i32_f32_e32 v2, v3
 ; GFX900-GISEL-NEXT:    v_exp_f32_e32 v1, v1
 ; GFX900-GISEL-NEXT:    v_ldexp_f32 v1, v1, v2
@@ -7443,14 +7421,14 @@ define float @v_exp_f32_contract_nnan_ninf(float %in) {
 ; SI-GISEL-LABEL: v_exp_f32_contract_nnan_ninf:
 ; SI-GISEL:       ; %bb.0:
 ; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-GISEL-NEXT:    s_mov_b32 s4, 0x3fb8aa3b
-; SI-GISEL-NEXT:    v_mul_f32_e32 v1, 0x3fb8aa3b, v0
-; SI-GISEL-NEXT:    v_fma_f32 v2, v0, s4, -v1
+; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x3fb8aa3b
+; SI-GISEL-NEXT:    v_mul_f32_e32 v2, 0x3fb8aa3b, v0
+; SI-GISEL-NEXT:    v_fma_f32 v1, v0, v1, -v2
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v3, 0x32a5705f
-; SI-GISEL-NEXT:    v_fma_f32 v2, v0, v3, v2
-; SI-GISEL-NEXT:    v_rndne_f32_e32 v3, v1
-; SI-GISEL-NEXT:    v_sub_f32_e32 v1, v1, v3
-; SI-GISEL-NEXT:    v_add_f32_e32 v1, v1, v2
+; SI-GISEL-NEXT:    v_fma_f32 v1, v0, v3, v1
+; SI-GISEL-NEXT:    v_rndne_f32_e32 v3, v2
+; SI-GISEL-NEXT:    v_sub_f32_e32 v2, v2, v3
+; SI-GISEL-NEXT:    v_add_f32_e32 v1, v2, v1
 ; SI-GISEL-NEXT:    v_cvt_i32_f32_e32 v2, v3
 ; SI-GISEL-NEXT:    v_exp_f32_e32 v1, v1
 ; SI-GISEL-NEXT:    v_ldexp_f32_e32 v1, v1, v2
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.exp2.ll b/llvm/test/CodeGen/AMDGPU/llvm.exp2.ll
index 942b742451dfebb..587109b9431a1a5 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.exp2.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.exp2.ll
@@ -199,24 +199,22 @@ define amdgpu_kernel void @s_exp2_v2f32(ptr addrspace(1) %out, <2 x float> %in)
 ; SI-GISEL-LABEL: s_exp2_v2f32:
 ; SI-GISEL:       ; %bb.0:
 ; SI-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
-; SI-GISEL-NEXT:    s_mov_b32 s0, 0xc2fc0000
-; SI-GISEL-NEXT:    v_mov_b32_e32 v0, 0x42800000
-; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x1f800000
+; SI-GISEL-NEXT:    v_mov_b32_e32 v0, 0xc2fc0000
+; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x42800000
+; SI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x1f800000
 ; SI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-GISEL-NEXT:    v_mov_b32_e32 v2, s6
-; SI-GISEL-NEXT:    v_mov_b32_e32 v3, s7
-; SI-GISEL-NEXT:    v_cmp_gt_f32_e32 vcc, s0, v2
-; SI-GISEL-NEXT:    v_cmp_gt_f32_e64 s[0:1], s0, v3
-; SI-GISEL-NEXT:    v_cndmask_b32_e32 v2, 0, v0, vcc
-; SI-GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, v0, s[0:1]
-; SI-GISEL-NEXT:    v_add_f32_e32 v2, s6, v2
+; SI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, s6, v0
+; SI-GISEL-NEXT:    v_cmp_lt_f32_e64 s[0:1], s7, v0
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v3, 0, v1, vcc
+; SI-GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, v1, s[0:1]
+; SI-GISEL-NEXT:    v_add_f32_e32 v3, s6, v3
 ; SI-GISEL-NEXT:    v_add_f32_e32 v0, s7, v0
-; SI-GISEL-NEXT:    v_exp_f32_e32 v2, v2
-; SI-GISEL-NEXT:    v_exp_f32_e32 v3, v0
-; SI-GISEL-NEXT:    v_cndmask_b32_e32 v0, 1.0, v1, vcc
-; SI-GISEL-NEXT:    v_cndmask_b32_e64 v1, 1.0, v1, s[0:1]
-; SI-GISEL-NEXT:    v_mul_f32_e32 v0, v2, v0
-; SI-GISEL-NEXT:    v_mul_f32_e32 v1, v3, v1
+; SI-GISEL-NEXT:    v_exp_f32_e32 v3, v3
+; SI-GISEL-NEXT:    v_exp_f32_e32 v1, v0
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v0, 1.0, v2, vcc
+; SI-GISEL-NEXT:    v_cndmask_b32_e64 v2, 1.0, v2, s[0:1]
+; SI-GISEL-NEXT:    v_mul_f32_e32 v0, v3, v0
+; SI-GISEL-NEXT:    v_mul_f32_e32 v1, v1, v2
 ; SI-GISEL-NEXT:    s_mov_b32 s6, -1
 ; SI-GISEL-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-GISEL-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
@@ -249,24 +247,22 @@ define amdgpu_kernel void @s_exp2_v2f32(ptr addrspace(1) %out, <2 x float> %in)
 ; VI-GISEL-LABEL: s_exp2_v2f32:
 ; VI-GISEL:       ; %bb.0:
 ; VI-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-GISEL-NEXT:    s_mov_b32 s0, 0xc2fc0000
-; VI-GISEL-NEXT:    v_mov_b32_e32 v0, 0x42800000
-; VI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x1f800000
+; VI-GISEL-NEXT:    v_mov_b32_e32 v0, 0xc2fc0000
+; VI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x42800000
+; VI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x1f800000
 ; VI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT:    v_mov_b32_e32 v2, s6
-; VI-GISEL-NEXT:    v_mov_b32_e32 v3, s7
-; VI-GISEL-NEXT:    v_cmp_gt_f32_e32 vcc, s0, v2
-; VI-GISEL-NEXT:    v_cmp_gt_f32_e64 s[0:1], s0, v3
-; VI-GISEL-NEXT:    v_cndmask_b32_e32 v2, 0, v0, vcc
-; VI-GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, v0, s[0:1]
-; VI-GISEL-NEXT:    v_add_f32_e32 v2, s6, v2
+; VI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, s6, v0
+; VI-GISEL-NEXT:    v_cmp_lt_f32_e64 s[0:1], s7, v0
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v3, 0, v1, vcc
+; VI-GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, v1, s[0:1]
+; VI-GISEL-NEXT:    v_add_f32_e32 v3, s6, v3
 ; VI-GISEL-NEXT:    v_add_f32_e32 v0, s7, v0
-; VI-GISEL-NEXT:    v_exp_f32_e32 v2, v2
-; VI-GISEL-NEXT:    v_exp_f32_e32 v3, v0
-; VI-GISEL-NEXT:    v_cndmask_b32_e32 v0, 1.0, v1, vcc
-; VI-GISEL-NEXT:    v_cndmask_b32_e64 v1, 1.0, v1, s[0:1]
-; VI-GISEL-NEXT:    v_mul_f32_e32 v0, v2, v0
-; VI-GISEL-NEXT:    v_mul_f32_e32 v1, v3, v1
+; VI-GISEL-NEXT:    v_exp_f32_e32 v3, v3
+; VI-GISEL-NEXT:    v_exp_f32_e32 v1, v0
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v0, 1.0, v2, vcc
+; VI-GISEL-NEXT:    v_cndmask_b32_e64 v2, 1.0, v2, s[0:1]
+; VI-GISEL-NEXT:    v_mul_f32_e32 v0, v3, v0
+; VI-GISEL-NEXT:    v_mul_f32_e32 v1, v1, v2
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v2, s4
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v3, s5
 ; VI-GISEL-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
@@ -298,24 +294,22 @@ define amdgpu_kernel void @s_exp2_v2f32(ptr addrspace(1) %out, <2 x float> %in)
 ; GFX900-GISEL-LABEL: s_exp2_v2f32:
 ; GFX900-GISEL:       ; %bb.0:
 ; GFX900-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX900-GISEL-NEXT:    s_mov_b32 s0, 0xc2fc0000
-; GFX900-GISEL-NEXT:    v_mov_b32_e32 v0, 0x42800000
-; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x1f800000
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v0, 0xc2fc0000
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x42800000
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v2, 0x1f800000
 ; GFX900-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX900-GISEL-NEXT:    v_mov_b32_e32 v2, s6
-; GFX900-GISEL-NEXT:    v_mov_b32_e32 v3, s7
-; GFX900-GISEL-NEXT:    v_cmp_gt_f32_e32 vcc, s0, v2
-; GFX900-GISEL-NEXT:    v_cmp_gt_f32_e64 s[0:1], s0, v3
-; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v2, 0, v0, vcc
-; GFX900-GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, v0, s[0:1]
-; GFX900-GISEL-NEXT:    v_add_f32_e32 v2, s6, v2
+; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, s6, v0
+; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e64 s[0:1], s7, v0
+; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v3, 0, v1, vcc
+; GFX900-GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, v1, s[0:1]
+; GFX900-GISEL-NEXT:    v_add_f32_e32 v3, s6, v3
 ; GFX900-GISEL-NEXT:    v_add_f32_e32 v0, s7, v0
-; GFX900-GISEL-NEXT:    v_exp_f32_e32 v2, v2
-; GFX900-GISEL-NEXT:    v_exp_f32_e32 v3, v0
-; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v0, 1.0, v1, vcc
-; GFX900-GISEL-NEXT:    v_cndmask_b32_e64 v1, 1.0, v1, s[0:1]
-; GFX900-GISEL-NEXT:    v_mul_f32_e32 v0, v2, v0
-; GFX900-GISEL-NEXT:    v_mul_f32_e32 v1, v3, v1
+; GFX900-GISEL-NEXT:    v_exp_f32_e32 v3, v3
+; GFX900-GISEL-NEXT:    v_exp_f32_e32 v1, v0
+; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v0, 1.0, v2, vcc
+; GFX900-GISEL-NEXT:    v_cndmask_b32_e64 v2, 1.0, v2, s[0:1]
+; GFX900-GISEL-NEXT:    v_mul_f32_e32 v0, v3, v0
+; GFX900-GISEL-NEXT:    v_mul_f32_e32 v1, v1, v2
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX900-GISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
 ; GFX900-GISEL-NEXT:    s_endpgm
@@ -420,31 +414,28 @@ define amdgpu_kernel void @s_exp2_v3f32(ptr addrspace(1) %out, <3 x float> %in)
 ; SI-GISEL:       ; %bb.0:
 ; SI-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0xd
 ; SI-GISEL-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x9
-; SI-GISEL-NEXT:    s_mov_b32 s0, 0xc2fc0000
-; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x42800000
+; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0xc2fc0000
+; SI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x42800000
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v3, 0x1f800000
 ; SI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-GISEL-NEXT:    v_mov_b32_e32 v0, s4
-; SI-GISEL-NEXT:    v_cmp_gt_f32_e32 vcc, s0, v0
-; SI-GISEL-NEXT:    v_cndmask_b32_e32 v0, 0, v1, vcc
+; SI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, s4, v1
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v0, 0, v2, vcc
 ; SI-GISEL-NEXT:    v_add_f32_e32 v0, s4, v0
 ; SI-GISEL-NEXT:    v_exp_f32_e32 v0, v0
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v4, 1.0, v3, vcc
-; SI-GISEL-NEXT:    v_mov_b32_e32 v2, 0xc2fc0000
-; SI-GISEL-NEXT:    s_mov_b32 s10, -1
+; SI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, s5, v1
+; SI-GISEL-NEXT:    v_cmp_lt_f32_e64 s[0:1], s6, v1
 ; SI-GISEL-NEXT:    v_mul_f32_e32 v0, v0, v4
-; SI-GISEL-NEXT:    v_mov_b32_e32 v4, s5
-; SI-GISEL-NEXT:    v_cmp_gt_f32_e32 vcc, s0, v4
-; SI-GISEL-NEXT:    v_cndmask_b32_e32 v4, 0, v1, vcc
-; SI-GISEL-NEXT:    v_cmp_lt_f32_e64 s[0:1], s6, v2
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v4, 0, v2, vcc
 ; SI-GISEL-NEXT:    v_add_f32_e32 v4, s5, v4
-; SI-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, v1, s[0:1]
+; SI-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, v2, s[0:1]
 ; SI-GISEL-NEXT:    v_exp_f32_e32 v4, v4
 ; SI-GISEL-NEXT:    v_add_f32_e32 v1, s6, v1
 ; SI-GISEL-NEXT:    v_exp_f32_e32 v2, v1
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 1.0, v3, vcc
 ; SI-GISEL-NEXT:    v_mul_f32_e32 v1, v4, v1
 ; SI-GISEL-NEXT:    v_cndmask_b32_e64 v3, 1.0, v3, s[0:1]
+; SI-GISEL-NEXT:    s_mov_b32 s10, -1
 ; SI-GISEL-NEXT:    s_mov_b32 s11, 0xf000
 ; SI-GISEL-NEXT:    v_mul_f32_e32 v2, v2, v3
 ; SI-GISEL-NEXT:    buffer_store_dwordx2 v[0:1], off, s[8:11], 0
@@ -486,23 +477,20 @@ define amdgpu_kernel void @s_exp2_v3f32(ptr addrspace(1) %out, <3 x float> %in)
 ; VI-GISEL:       ; %bb.0:
 ; VI-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x34
 ; VI-GISEL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
-; VI-GISEL-NEXT:    s_mov_b32 s0, 0xc2fc0000
-; VI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x42800000
+; VI-GISEL-NEXT:    v_mov_b32_e32 v1, 0xc2fc0000
+; VI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x42800000
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v3, 0x1f800000
 ; VI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT:    v_mov_b32_e32 v0, s4
-; VI-GISEL-NEXT:    v_cmp_gt_f32_e32 vcc, s0, v0
-; VI-GISEL-NEXT:    v_cndmask_b32_e32 v0, 0, v1, vcc
+; VI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, s4, v1
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v0, 0, v2, vcc
 ; VI-GISEL-NEXT:    v_add_f32_e32 v0, s4, v0
 ; VI-GISEL-NEXT:    v_exp_f32_e32 v0, v0
 ; VI-GISEL-NEXT:    v_cndmask_b32_e32 v4, 1.0, v3, vcc
-; VI-GISEL-NEXT:    v_mov_b32_e32 v2, 0xc2fc0000
+; VI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, s5, v1
+; VI-GISEL-NEXT:    v_cmp_lt_f32_e64 s[0:1], s6, v1
 ; VI-GISEL-NEXT:    v_mul_f32_e32 v0, v0, v4
-; VI-GISEL-NEXT:    v_mov_b32_e32 v4, s5
-; VI-GISEL-NEXT:    v_cmp_gt_f32_e32 vcc, s0, v4
-; VI-GISEL-NEXT:    v_cmp_lt_f32_e64 s[0:1], s6, v2
-; VI-GISEL-NEXT:    v_cndmask_b32_e32 v4, 0, v1, vcc
-; VI-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, v1, s[0:1]
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v4, 0, v2, vcc
+; VI-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, v2, s[0:1]
 ; VI-GISEL-NEXT:    v_add_f32_e32 v4, s5, v4
 ; VI-GISEL-NEXT:    v_add_f32_e32 v1, s6, v1
 ; VI-GISEL-NEXT:    v_exp_f32_e32 v4, v4
@@ -550,23 +538,20 @@ define amdgpu_kernel void @s_exp2_v3f32(ptr addrspace(1) %out, <3 x float> %in)
 ; GFX900-GISEL:       ; %bb.0:
 ; GFX900-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x34
 ; GFX900-GISEL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
-; GFX900-GISEL-NEXT:    s_mov_b32 s0, 0xc2fc0000
-; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x42800000
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0xc2fc0000
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v2, 0x42800000
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v3, 0x1f800000
 ; GFX900-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX900-GISEL-NEXT:    v_mov_b32_e32 v0, s4
-; GFX900-GISEL-NEXT:    v_cmp_gt_f32_e32 vcc, s0, v0
-; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v0, 0, v1, vcc
+; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, s4, v1
+; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v0, 0, v2, vcc
 ; GFX900-GISEL-NEXT:    v_add_f32_e32 v0, s4, v0
 ; GFX900-GISEL-NEXT:    v_exp_f32_e32 v0, v0
 ; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v4, 1.0, v3, vcc
-; GFX900-GISEL-NEXT:    v_mov_b32_e32 v2, 0xc2fc0000
+; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, s5, v1
+; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e64 s[0:1], s6, v1
 ; GFX900-GISEL-NEXT:    v_mul_f32_e32 v0, v0, v4
-; GFX900-GISEL-NEXT:    v_mov_b32_e32 v4, s5
-; GFX900-GISEL-NEXT:    v_cmp_gt_f32_e32 vcc, s0, v4
-; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e64 s[0:1], s6, v2
-; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v4, 0, v1, vcc
-; GFX900-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, v1, s[0:1]
+; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v4, 0, v2, vcc
+; GFX900-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, v2, s[0:1]
 ; GFX900-GISEL-NEXT:    v_add_f32_e32 v4, s5, v4
 ; GFX900-GISEL-NEXT:    v_add_f32_e32 v1, s6, v1
 ; GFX900-GISEL-NEXT:    v_exp_f32_e32 v4, v4
@@ -707,42 +692,39 @@ define amdgpu_kernel void @s_exp2_v4f32(ptr addrspace(1) %out, <4 x float> %in)
 ;
 ; SI-GISEL-LABEL: s_exp2_v4f32:
 ; SI-GISEL:       ; %bb.0:
-; SI-GISEL-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0xd
-; SI-GISEL-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
-; SI-GISEL-NEXT:    s_mov_b32 s0, 0xc2fc0000
+; SI-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0xd
+; SI-GISEL-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x9
+; SI-GISEL-NEXT:    v_mov_b32_e32 v2, 0xc2fc0000
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v3, 0x42800000
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v4, 0x1f800000
 ; SI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-GISEL-NEXT:    v_mov_b32_e32 v0, s8
-; SI-GISEL-NEXT:    v_cmp_gt_f32_e32 vcc, s0, v0
-; SI-GISEL-NEXT:    v_mov_b32_e32 v1, s9
+; SI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, s4, v2
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v0, 0, v3, vcc
-; SI-GISEL-NEXT:    v_cmp_gt_f32_e64 s[0:1], s0, v1
-; SI-GISEL-NEXT:    v_add_f32_e32 v0, s8, v0
+; SI-GISEL-NEXT:    v_cmp_lt_f32_e64 s[0:1], s5, v2
+; SI-GISEL-NEXT:    v_add_f32_e32 v0, s4, v0
 ; SI-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, v3, s[0:1]
 ; SI-GISEL-NEXT:    v_exp_f32_e32 v0, v0
-; SI-GISEL-NEXT:    v_add_f32_e32 v1, s9, v1
+; SI-GISEL-NEXT:    v_add_f32_e32 v1, s5, v1
 ; SI-GISEL-NEXT:    v_exp_f32_e32 v1, v1
-; SI-GISEL-NEXT:    v_mov_b32_e32 v2, 0xc2fc0000
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v5, 1.0, v4, vcc
 ; SI-GISEL-NEXT:    v_mul_f32_e32 v0, v0, v5
 ; SI-GISEL-NEXT:    v_cndmask_b32_e64 v5, 1.0, v4, s[0:1]
-; SI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, s10, v2
-; SI-GISEL-NEXT:    v_cmp_lt_f32_e64 s[0:1], s11, v2
+; SI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, s6, v2
+; SI-GISEL-NEXT:    v_cmp_lt_f32_e64 s[0:1], s7, v2
 ; SI-GISEL-NEXT:    v_mul_f32_e32 v1, v1, v5
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v5, 0, v3, vcc
 ; SI-GISEL-NEXT:    v_cndmask_b32_e64 v2, 0, v3, s[0:1]
-; SI-GISEL-NEXT:    v_add_f32_e32 v5, s10, v5
-; SI-GISEL-NEXT:    v_add_f32_e32 v2, s11, v2
+; SI-GISEL-NEXT:    v_add_f32_e32 v5, s6, v5
+; SI-GISEL-NEXT:    v_add_f32_e32 v2, s7, v2
 ; SI-GISEL-NEXT:    v_exp_f32_e32 v5, v5
 ; SI-GISEL-NEXT:    v_exp_f32_e32 v3, v2
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v2, 1.0, v4, vcc
 ; SI-GISEL-NEXT:    v_cndmask_b32_e64 v4, 1.0, v4, s[0:1]
 ; SI-GISEL-NEXT:    v_mul_f32_e32 v2, v5, v2
 ; SI-GISEL-NEXT:    v_mul_f32_e32 v3, v3, v4
-; SI-GISEL-NEXT:    s_mov_b32 s6, -1
-; SI-GISEL-NEXT:    s_mov_b32 s7, 0xf000
-; SI-GISEL-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
+; SI-GISEL-NEXT:    s_mov_b32 s10, -1
+; SI-GISEL-NEXT:    s_mov_b32 s11, 0xf000
+; SI-GISEL-NEXT:    buffer_store_dwordx4 v[0:3], off, s[8:11], 0
 ; SI-GISEL-NEXT:    s_endpgm
 ;
 ; VI-SDAG-LABEL: s_exp2_v4f32:
@@ -786,21 +768,18 @@ define amdgpu_kernel void @s_exp2_v4f32(ptr addrspace(1) %out, <4 x float> %in)
 ; VI-GISEL:       ; %bb.0:
 ; VI-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x34
 ; VI-GISEL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
-; VI-GISEL-NEXT:    s_mov_b32 s0, 0xc2fc0000
+; VI-GISEL-NEXT:    v_mov_b32_e32 v2, 0xc2fc0000
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v3, 0x42800000
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v4, 0x1f800000
 ; VI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT:    v_mov_b32_e32 v0, s4
-; VI-GISEL-NEXT:    v_cmp_gt_f32_e32 vcc, s0, v0
-; VI-GISEL-NEXT:    v_mov_b32_e32 v1, s5
+; VI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, s4, v2
 ; VI-GISEL-NEXT:    v_cndmask_b32_e32 v0, 0, v3, vcc
-; VI-GISEL-NEXT:    v_cmp_gt_f32_e64 s[0:1], s0, v1
+; VI-GISEL-NEXT:    v_cmp_lt_f32_e64 s[0:1], s5, v2
 ; VI-GISEL-NEXT:    v_add_f32_e32 v0, s4, v0
 ; VI-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, v3, s[0:1]
 ; VI-GISEL-NEXT:    v_exp_f32_e32 v0, v0
 ; VI-GISEL-NEXT:    v_add_f32_e32 v1, s5, v1
 ; VI-GISEL-NEXT:    v_exp_f32_e32 v1, v1
-; VI-GISEL-NEXT:    v_mov_b32_e32 v2, 0xc2fc0000
 ; VI-GISEL-NEXT:    v_cndmask_b32_e32 v5, 1.0, v4, vcc
 ; VI-GISEL-NEXT:    v_mul_f32_e32 v0, v0, v5
 ; VI-GISEL-NEXT:    v_cndmask_b32_e64 v5, 1.0, v4, s[0:1]
@@ -862,21 +841,18 @@ define amdgpu_kernel void @s_exp2_v4f32(ptr addrspace(1) %out, <4 x float> %in)
 ; GFX900-GISEL:       ; %bb.0:
 ; GFX900-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x34
 ; GFX900-GISEL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
-; GFX900-GISEL-NEXT:    s_mov_b32 s0, 0xc2fc0000
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v2, 0xc2fc0000
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v3, 0x42800000
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v4, 0x1f800000
 ; GFX900-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX900-GISEL-NEXT:    v_mov_b32_e32 v0, s4
-; GFX900-GISEL-NEXT:    v_cmp_gt_f32_e32 vcc, s0, v0
-; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, s5
+; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, s4, v2
 ; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v0, 0, v3, vcc
-; GFX900-GISEL-NEXT:    v_cmp_gt_f32_e64 s[0:1], s0, v1
+; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e64 s[0:1], s5, v2
 ; GFX900-GISEL-NEXT:    v_add_f32_e32 v0, s4, v0
 ; GFX900-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, v3, s[0:1]
 ; GFX900-GISEL-NEXT:    v_exp_f32_e32 v0, v0
 ; GFX900-GISEL-NEXT:    v_add_f32_e32 v1, s5, v1
 ; GFX900-GISEL-NEXT:    v_exp_f32_e32 v1, v1
-; GFX900-GISEL-NEXT:    v_mov_b32_e32 v2, 0xc2fc0000
 ; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v5, 1.0, v4, vcc
 ; GFX900-GISEL-NEXT:    v_mul_f32_e32 v0, v0, v5
 ; GFX900-GISEL-NEXT:    v_cndmask_b32_e64 v5, 1.0, v4, s[0:1]
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.frexp.ll b/llvm/test/CodeGen/AMDGPU/llvm.frexp.ll
index 16b1ccf58cf6a51..2c07b47bf1ed591 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.frexp.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.frexp.ll
@@ -228,19 +228,19 @@ define { <2 x half>, <2 x i32> } @test_frexp_v2f16_v2i32(<2 x half> %a) {
 ; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-GISEL-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GFX6-GISEL-NEXT:    v_cvt_f32_f16_e32 v1, v1
-; GFX6-GISEL-NEXT:    s_mov_b32 s4, 0x7f800000
-; GFX6-GISEL-NEXT:    v_frexp_mant_f32_e32 v3, v0
+; GFX6-GISEL-NEXT:    v_mov_b32_e32 v3, 0x7f800000
+; GFX6-GISEL-NEXT:    v_frexp_mant_f32_e32 v4, v0
 ; GFX6-GISEL-NEXT:    v_frexp_exp_i32_f32_e32 v2, v0
-; GFX6-GISEL-NEXT:    v_cmp_lt_f32_e64 vcc, |v0|, s4
+; GFX6-GISEL-NEXT:    v_cmp_lt_f32_e64 vcc, |v0|, v3
 ; GFX6-GISEL-NEXT:    v_cndmask_b32_e32 v2, 0, v2, vcc
-; GFX6-GISEL-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
+; GFX6-GISEL-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
 ; GFX6-GISEL-NEXT:    v_frexp_mant_f32_e32 v4, v1
-; GFX6-GISEL-NEXT:    v_cmp_lt_f32_e64 vcc, |v1|, s4
-; GFX6-GISEL-NEXT:    v_frexp_exp_i32_f32_e32 v3, v1
+; GFX6-GISEL-NEXT:    v_cmp_lt_f32_e64 vcc, |v1|, v3
+; GFX6-GISEL-NEXT:    v_frexp_exp_i32_f32_e32 v5, v1
 ; GFX6-GISEL-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
 ; GFX6-GISEL-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; GFX6-GISEL-NEXT:    v_cvt_f16_f32_e32 v1, v1
-; GFX6-GISEL-NEXT:    v_cndmask_b32_e32 v3, 0, v3, vcc
+; GFX6-GISEL-NEXT:    v_cndmask_b32_e32 v3, 0, v5, vcc
 ; GFX6-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-GISEL-LABEL: test_frexp_v2f16_v2i32:
@@ -320,13 +320,13 @@ define <2 x half> @test_frexp_v2f16_v2i32_only_use_fract(<2 x half> %a) {
 ; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-GISEL-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GFX6-GISEL-NEXT:    v_cvt_f32_f16_e32 v1, v1
-; GFX6-GISEL-NEXT:    s_mov_b32 s4, 0x7f800000
-; GFX6-GISEL-NEXT:    v_frexp_mant_f32_e32 v2, v0
-; GFX6-GISEL-NEXT:    v_cmp_lt_f32_e64 vcc, |v0|, s4
-; GFX6-GISEL-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
-; GFX6-GISEL-NEXT:    v_frexp_mant_f32_e32 v2, v1
-; GFX6-GISEL-NEXT:    v_cmp_lt_f32_e64 vcc, |v1|, s4
-; GFX6-GISEL-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
+; GFX6-GISEL-NEXT:    v_mov_b32_e32 v2, 0x7f800000
+; GFX6-GISEL-NEXT:    v_frexp_mant_f32_e32 v3, v0
+; GFX6-GISEL-NEXT:    v_cmp_lt_f32_e64 vcc, |v0|, v2
+; GFX6-GISEL-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
+; GFX6-GISEL-NEXT:    v_frexp_mant_f32_e32 v3, v1
+; GFX6-GISEL-NEXT:    v_cmp_lt_f32_e64 vcc, |v1|, v2
+; GFX6-GISEL-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
 ; GFX6-GISEL-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; GFX6-GISEL-NEXT:    v_cvt_f16_f32_e32 v1, v1
 ; GFX6-GISEL-NEXT:    s_setpc_b64 s[30:31]
@@ -395,13 +395,13 @@ define <2 x i32> @test_frexp_v2f16_v2i32_only_use_exp(<2 x half> %a) {
 ; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX6-GISEL-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; GFX6-GISEL-NEXT:    v_cvt_f32_f16_e32 v1, v1
-; GFX6-GISEL-NEXT:    s_mov_b32 s4, 0x7f800000
-; GFX6-GISEL-NEXT:    v_frexp_exp_i32_f32_e32 v2, v0
-; GFX6-GISEL-NEXT:    v_cmp_lt_f32_e64 vcc, |v0|, s4
-; GFX6-GISEL-NEXT:    v_cndmask_b32_e32 v0, 0, v2, vcc
-; GFX6-GISEL-NEXT:    v_frexp_exp_i32_f32_e32 v2, v1
-; GFX6-GISEL-NEXT:    v_cmp_lt_f32_e64 vcc, |v1|, s4
-; GFX6-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v2, vcc
+; GFX6-GISEL-NEXT:    v_mov_b32_e32 v2, 0x7f800000
+; GFX6-GISEL-NEXT:    v_frexp_exp_i32_f32_e32 v3, v0
+; GFX6-GISEL-NEXT:    v_cmp_lt_f32_e64 vcc, |v0|, v2
+; GFX6-GISEL-NEXT:    v_cndmask_b32_e32 v0, 0, v3, vcc
+; GFX6-GISEL-NEXT:    v_frexp_exp_i32_f32_e32 v3, v1
+; GFX6-GISEL-NEXT:    v_cmp_lt_f32_e64 vcc, |v1|, v2
+; GFX6-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v3, vcc
 ; GFX6-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-GISEL-LABEL: test_frexp_v2f16_v2i32_only_use_exp:
@@ -778,17 +778,17 @@ define { <2 x float>, <2 x i32> } @test_frexp_v2f32_v2i32(<2 x float> %a) {
 ; GFX6-GISEL-LABEL: test_frexp_v2f32_v2i32:
 ; GFX6-GISEL:       ; %bb.0:
 ; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-GISEL-NEXT:    s_mov_b32 s4, 0x7f800000
+; GFX6-GISEL-NEXT:    v_mov_b32_e32 v4, 0x7f800000
 ; GFX6-GISEL-NEXT:    v_frexp_mant_f32_e32 v3, v0
 ; GFX6-GISEL-NEXT:    v_frexp_exp_i32_f32_e32 v2, v0
-; GFX6-GISEL-NEXT:    v_cmp_lt_f32_e64 vcc, |v0|, s4
+; GFX6-GISEL-NEXT:    v_cmp_lt_f32_e64 vcc, |v0|, v4
 ; GFX6-GISEL-NEXT:    v_cndmask_b32_e32 v2, 0, v2, vcc
 ; GFX6-GISEL-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
-; GFX6-GISEL-NEXT:    v_frexp_mant_f32_e32 v4, v1
+; GFX6-GISEL-NEXT:    v_frexp_mant_f32_e32 v5, v1
 ; GFX6-GISEL-NEXT:    v_frexp_exp_i32_f32_e32 v3, v1
-; GFX6-GISEL-NEXT:    v_cmp_lt_f32_e64 vcc, |v1|, s4
+; GFX6-GISEL-NEXT:    v_cmp_lt_f32_e64 vcc, |v1|, v4
 ; GFX6-GISEL-NEXT:    v_cndmask_b32_e32 v3, 0, v3, vcc
-; GFX6-GISEL-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
+; GFX6-GISEL-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc
 ; GFX6-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-GISEL-LABEL: test_frexp_v2f32_v2i32:
@@ -817,17 +817,17 @@ define { <2 x float>, <2 x i32> } @test_frexp_v2f32_v2i32(<2 x float> %a) {
 }
 
 define <2 x float> @test_frexp_v2f32_v2i32_only_use_fract(<2 x float> %a) {
-; GFX6-LABEL: test_frexp_v2f32_v2i32_only_use_fract:
-; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT:    s_mov_b32 s4, 0x7f800000
-; GFX6-NEXT:    v_frexp_mant_f32_e32 v2, v0
-; GFX6-NEXT:    v_cmp_lt_f32_e64 vcc, |v0|, s4
-; GFX6-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
-; GFX6-NEXT:    v_frexp_mant_f32_e32 v2, v1
-; GFX6-NEXT:    v_cmp_lt_f32_e64 vcc, |v1|, s4
-; GFX6-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
-; GFX6-NEXT:    s_setpc_b64 s[30:31]
+; GFX6-SDAG-LABEL: test_frexp_v2f32_v2i32_only_use_fract:
+; GFX6-SDAG:       ; %bb.0:
+; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-SDAG-NEXT:    s_mov_b32 s4, 0x7f800000
+; GFX6-SDAG-NEXT:    v_frexp_mant_f32_e32 v2, v0
+; GFX6-SDAG-NEXT:    v_cmp_lt_f32_e64 vcc, |v0|, s4
+; GFX6-SDAG-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
+; GFX6-SDAG-NEXT:    v_frexp_mant_f32_e32 v2, v1
+; GFX6-SDAG-NEXT:    v_cmp_lt_f32_e64 vcc, |v1|, s4
+; GFX6-SDAG-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
+; GFX6-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: test_frexp_v2f32_v2i32_only_use_fract:
 ; GFX8:       ; %bb.0:
@@ -849,23 +849,35 @@ define <2 x float> @test_frexp_v2f32_v2i32_only_use_fract(<2 x float> %a) {
 ; GFX11-NEXT:    v_frexp_mant_f32_e32 v0, v0
 ; GFX11-NEXT:    v_frexp_mant_f32_e32 v1, v1
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX6-GISEL-LABEL: test_frexp_v2f32_v2i32_only_use_fract:
+; GFX6-GISEL:       ; %bb.0:
+; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-GISEL-NEXT:    v_mov_b32_e32 v3, 0x7f800000
+; GFX6-GISEL-NEXT:    v_frexp_mant_f32_e32 v2, v0
+; GFX6-GISEL-NEXT:    v_cmp_lt_f32_e64 vcc, |v0|, v3
+; GFX6-GISEL-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
+; GFX6-GISEL-NEXT:    v_frexp_mant_f32_e32 v2, v1
+; GFX6-GISEL-NEXT:    v_cmp_lt_f32_e64 vcc, |v1|, v3
+; GFX6-GISEL-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
+; GFX6-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %result = call { <2 x float>, <2 x i32> } @llvm.frexp.v2f32.v2i32(<2 x float> %a)
   %result.0 = extractvalue { <2 x float>, <2 x i32> } %result, 0
   ret <2 x float> %result.0
 }
 
 define <2 x i32> @test_frexp_v2f32_v2i32_only_use_exp(<2 x float> %a) {
-; GFX6-LABEL: test_frexp_v2f32_v2i32_only_use_exp:
-; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT:    s_mov_b32 s4, 0x7f800000
-; GFX6-NEXT:    v_frexp_exp_i32_f32_e32 v2, v0
-; GFX6-NEXT:    v_cmp_lt_f32_e64 vcc, |v0|, s4
-; GFX6-NEXT:    v_cndmask_b32_e32 v0, 0, v2, vcc
-; GFX6-NEXT:    v_frexp_exp_i32_f32_e32 v2, v1
-; GFX6-NEXT:    v_cmp_lt_f32_e64 vcc, |v1|, s4
-; GFX6-NEXT:    v_cndmask_b32_e32 v1, 0, v2, vcc
-; GFX6-NEXT:    s_setpc_b64 s[30:31]
+; GFX6-SDAG-LABEL: test_frexp_v2f32_v2i32_only_use_exp:
+; GFX6-SDAG:       ; %bb.0:
+; GFX6-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-SDAG-NEXT:    s_mov_b32 s4, 0x7f800000
+; GFX6-SDAG-NEXT:    v_frexp_exp_i32_f32_e32 v2, v0
+; GFX6-SDAG-NEXT:    v_cmp_lt_f32_e64 vcc, |v0|, s4
+; GFX6-SDAG-NEXT:    v_cndmask_b32_e32 v0, 0, v2, vcc
+; GFX6-SDAG-NEXT:    v_frexp_exp_i32_f32_e32 v2, v1
+; GFX6-SDAG-NEXT:    v_cmp_lt_f32_e64 vcc, |v1|, s4
+; GFX6-SDAG-NEXT:    v_cndmask_b32_e32 v1, 0, v2, vcc
+; GFX6-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX8-LABEL: test_frexp_v2f32_v2i32_only_use_exp:
 ; GFX8:       ; %bb.0:
@@ -887,6 +899,18 @@ define <2 x i32> @test_frexp_v2f32_v2i32_only_use_exp(<2 x float> %a) {
 ; GFX11-NEXT:    v_frexp_exp_i32_f32_e32 v0, v0
 ; GFX11-NEXT:    v_frexp_exp_i32_f32_e32 v1, v1
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX6-GISEL-LABEL: test_frexp_v2f32_v2i32_only_use_exp:
+; GFX6-GISEL:       ; %bb.0:
+; GFX6-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-GISEL-NEXT:    v_mov_b32_e32 v3, 0x7f800000
+; GFX6-GISEL-NEXT:    v_frexp_exp_i32_f32_e32 v2, v0
+; GFX6-GISEL-NEXT:    v_cmp_lt_f32_e64 vcc, |v0|, v3
+; GFX6-GISEL-NEXT:    v_cndmask_b32_e32 v0, 0, v2, vcc
+; GFX6-GISEL-NEXT:    v_frexp_exp_i32_f32_e32 v2, v1
+; GFX6-GISEL-NEXT:    v_cmp_lt_f32_e64 vcc, |v1|, v3
+; GFX6-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v2, vcc
+; GFX6-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %result = call { <2 x float>, <2 x i32> } @llvm.frexp.v2f32.v2i32(<2 x float> %a)
   %result.1 = extractvalue { <2 x float>, <2 x i32> } %result, 1
   ret <2 x i32> %result.1
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.f16.ll
index 671ead6127308dd..d74948a460c98b7 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.f16.ll
@@ -894,9 +894,9 @@ define i1 @not_isnan_f16(half %x) {
 ; GFX7GLISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7GLISEL-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
 ; GFX7GLISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX7GLISEL-NEXT:    s_movk_i32 s4, 0x7c00
-; GFX7GLISEL-NEXT:    v_cmp_gt_u32_e32 vcc, s4, v0
-; GFX7GLISEL-NEXT:    v_cmp_eq_u32_e64 s[4:5], s4, v0
+; GFX7GLISEL-NEXT:    v_mov_b32_e32 v1, 0x7c00
+; GFX7GLISEL-NEXT:    v_cmp_lt_u32_e32 vcc, v0, v1
+; GFX7GLISEL-NEXT:    v_cmp_eq_u32_e64 s[4:5], v0, v1
 ; GFX7GLISEL-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX7GLISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
 ; GFX7GLISEL-NEXT:    s_setpc_b64 s[30:31]
@@ -954,12 +954,12 @@ define <2 x i1> @isnan_v2f16(<2 x half> %x) nounwind {
 ; GFX7GLISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7GLISEL-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
 ; GFX7GLISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX7GLISEL-NEXT:    s_movk_i32 s4, 0x7c00
+; GFX7GLISEL-NEXT:    v_mov_b32_e32 v2, 0x7c00
 ; GFX7GLISEL-NEXT:    v_and_b32_e32 v1, 0x7fff, v1
 ; GFX7GLISEL-NEXT:    v_and_b32_e32 v1, 0xffff, v1
-; GFX7GLISEL-NEXT:    v_cmp_lt_u32_e32 vcc, s4, v0
+; GFX7GLISEL-NEXT:    v_cmp_gt_u32_e32 vcc, v0, v2
 ; GFX7GLISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
-; GFX7GLISEL-NEXT:    v_cmp_lt_u32_e32 vcc, s4, v1
+; GFX7GLISEL-NEXT:    v_cmp_gt_u32_e32 vcc, v1, v2
 ; GFX7GLISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
 ; GFX7GLISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -1032,16 +1032,16 @@ define <3 x i1> @isnan_v3f16(<3 x half> %x) nounwind {
 ; GFX7GLISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7GLISEL-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
 ; GFX7GLISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX7GLISEL-NEXT:    s_movk_i32 s4, 0x7c00
+; GFX7GLISEL-NEXT:    v_mov_b32_e32 v3, 0x7c00
 ; GFX7GLISEL-NEXT:    v_and_b32_e32 v1, 0x7fff, v1
 ; GFX7GLISEL-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; GFX7GLISEL-NEXT:    v_and_b32_e32 v2, 0x7fff, v2
-; GFX7GLISEL-NEXT:    v_cmp_lt_u32_e32 vcc, s4, v0
+; GFX7GLISEL-NEXT:    v_cmp_gt_u32_e32 vcc, v0, v3
 ; GFX7GLISEL-NEXT:    v_and_b32_e32 v2, 0xffff, v2
 ; GFX7GLISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
-; GFX7GLISEL-NEXT:    v_cmp_lt_u32_e32 vcc, s4, v1
+; GFX7GLISEL-NEXT:    v_cmp_gt_u32_e32 vcc, v1, v3
 ; GFX7GLISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
-; GFX7GLISEL-NEXT:    v_cmp_lt_u32_e32 vcc, s4, v2
+; GFX7GLISEL-NEXT:    v_cmp_gt_u32_e32 vcc, v2, v3
 ; GFX7GLISEL-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
 ; GFX7GLISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -1180,20 +1180,20 @@ define <4 x i1> @isnan_v4f16(<4 x half> %x) nounwind {
 ; GFX7GLISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7GLISEL-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
 ; GFX7GLISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX7GLISEL-NEXT:    s_movk_i32 s4, 0x7c00
+; GFX7GLISEL-NEXT:    v_mov_b32_e32 v4, 0x7c00
 ; GFX7GLISEL-NEXT:    v_and_b32_e32 v1, 0x7fff, v1
 ; GFX7GLISEL-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; GFX7GLISEL-NEXT:    v_and_b32_e32 v2, 0x7fff, v2
-; GFX7GLISEL-NEXT:    v_cmp_lt_u32_e32 vcc, s4, v0
+; GFX7GLISEL-NEXT:    v_cmp_gt_u32_e32 vcc, v0, v4
 ; GFX7GLISEL-NEXT:    v_and_b32_e32 v2, 0xffff, v2
 ; GFX7GLISEL-NEXT:    v_and_b32_e32 v3, 0x7fff, v3
 ; GFX7GLISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
-; GFX7GLISEL-NEXT:    v_cmp_lt_u32_e32 vcc, s4, v1
+; GFX7GLISEL-NEXT:    v_cmp_gt_u32_e32 vcc, v1, v4
 ; GFX7GLISEL-NEXT:    v_and_b32_e32 v3, 0xffff, v3
 ; GFX7GLISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
-; GFX7GLISEL-NEXT:    v_cmp_lt_u32_e32 vcc, s4, v2
+; GFX7GLISEL-NEXT:    v_cmp_gt_u32_e32 vcc, v2, v4
 ; GFX7GLISEL-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
-; GFX7GLISEL-NEXT:    v_cmp_lt_u32_e32 vcc, s4, v3
+; GFX7GLISEL-NEXT:    v_cmp_gt_u32_e32 vcc, v3, v4
 ; GFX7GLISEL-NEXT:    v_cndmask_b32_e64 v3, 0, 1, vcc
 ; GFX7GLISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -1538,9 +1538,9 @@ define i1 @not_issubnormal_or_zero_f16(half %x) {
 ; GFX7GLISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7GLISEL-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
 ; GFX7GLISEL-NEXT:    v_and_b32_e32 v1, 0xffff, v0
-; GFX7GLISEL-NEXT:    s_movk_i32 s4, 0x7c00
-; GFX7GLISEL-NEXT:    v_cmp_eq_u32_e32 vcc, s4, v1
-; GFX7GLISEL-NEXT:    v_cmp_lt_u32_e64 s[4:5], s4, v1
+; GFX7GLISEL-NEXT:    v_mov_b32_e32 v2, 0x7c00
+; GFX7GLISEL-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX7GLISEL-NEXT:    v_cmp_gt_u32_e64 s[4:5], v1, v2
 ; GFX7GLISEL-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX7GLISEL-NEXT:    v_subrev_i32_e32 v0, vcc, 0x400, v0
 ; GFX7GLISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
@@ -1660,12 +1660,12 @@ define i1 @not_isnormal_f16(half %x) {
 ; GFX7GLISEL-NEXT:    v_and_b32_e32 v1, 0x7fff, v0
 ; GFX7GLISEL-NEXT:    v_and_b32_e32 v0, 0x7c00, v0
 ; GFX7GLISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX7GLISEL-NEXT:    v_mov_b32_e32 v2, 0x7c00
 ; GFX7GLISEL-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
 ; GFX7GLISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v1
-; GFX7GLISEL-NEXT:    s_movk_i32 s6, 0x7c00
-; GFX7GLISEL-NEXT:    v_cmp_eq_u32_e64 s[4:5], s6, v0
+; GFX7GLISEL-NEXT:    v_cmp_eq_u32_e64 s[4:5], v0, v2
 ; GFX7GLISEL-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
-; GFX7GLISEL-NEXT:    v_cmp_lt_u32_e32 vcc, s6, v0
+; GFX7GLISEL-NEXT:    v_cmp_gt_u32_e32 vcc, v0, v2
 ; GFX7GLISEL-NEXT:    s_or_b64 s[4:5], s[4:5], vcc
 ; GFX7GLISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
 ; GFX7GLISEL-NEXT:    s_setpc_b64 s[30:31]
@@ -1724,19 +1724,19 @@ define i1 @not_is_plus_normal_f16(half %x) {
 ; GFX7GLISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7GLISEL-NEXT:    v_and_b32_e32 v1, 0x7fff, v0
 ; GFX7GLISEL-NEXT:    v_and_b32_e32 v2, 0xffff, v0
-; GFX7GLISEL-NEXT:    v_and_b32_e32 v0, 0x7c00, v0
 ; GFX7GLISEL-NEXT:    v_and_b32_e32 v3, 0xffff, v1
+; GFX7GLISEL-NEXT:    v_and_b32_e32 v0, 0x7c00, v0
+; GFX7GLISEL-NEXT:    v_cmp_ne_u32_e64 s[4:5], v2, v3
+; GFX7GLISEL-NEXT:    v_mov_b32_e32 v2, 0x7c00
 ; GFX7GLISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX7GLISEL-NEXT:    s_movk_i32 s8, 0x7c00
 ; GFX7GLISEL-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX7GLISEL-NEXT:    v_cmp_eq_u32_e64 s[6:7], s8, v3
+; GFX7GLISEL-NEXT:    v_cmp_eq_u32_e64 s[6:7], v3, v2
 ; GFX7GLISEL-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
-; GFX7GLISEL-NEXT:    v_cmp_lt_u32_e32 vcc, s8, v3
+; GFX7GLISEL-NEXT:    v_cmp_gt_u32_e32 vcc, v3, v2
 ; GFX7GLISEL-NEXT:    s_or_b64 s[6:7], s[6:7], vcc
 ; GFX7GLISEL-NEXT:    v_subrev_i32_e32 v0, vcc, 0x400, v1
 ; GFX7GLISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX7GLISEL-NEXT:    v_mov_b32_e32 v1, 0x7800
-; GFX7GLISEL-NEXT:    v_cmp_ne_u32_e64 s[4:5], v2, v3
 ; GFX7GLISEL-NEXT:    v_cmp_lt_u32_e32 vcc, v0, v1
 ; GFX7GLISEL-NEXT:    s_and_b64 s[4:5], vcc, s[4:5]
 ; GFX7GLISEL-NEXT:    s_or_b64 s[4:5], s[6:7], s[4:5]
@@ -1797,19 +1797,19 @@ define i1 @not_is_neg_normal_f16(half %x) {
 ; GFX7GLISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7GLISEL-NEXT:    v_and_b32_e32 v1, 0x7fff, v0
 ; GFX7GLISEL-NEXT:    v_and_b32_e32 v2, 0xffff, v0
-; GFX7GLISEL-NEXT:    v_and_b32_e32 v0, 0x7c00, v0
 ; GFX7GLISEL-NEXT:    v_and_b32_e32 v3, 0xffff, v1
+; GFX7GLISEL-NEXT:    v_and_b32_e32 v0, 0x7c00, v0
+; GFX7GLISEL-NEXT:    v_cmp_eq_u32_e64 s[4:5], v2, v3
+; GFX7GLISEL-NEXT:    v_mov_b32_e32 v2, 0x7c00
 ; GFX7GLISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX7GLISEL-NEXT:    s_movk_i32 s8, 0x7c00
 ; GFX7GLISEL-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
-; GFX7GLISEL-NEXT:    v_cmp_eq_u32_e64 s[6:7], s8, v3
+; GFX7GLISEL-NEXT:    v_cmp_eq_u32_e64 s[6:7], v3, v2
 ; GFX7GLISEL-NEXT:    s_or_b64 s[6:7], vcc, s[6:7]
-; GFX7GLISEL-NEXT:    v_cmp_lt_u32_e32 vcc, s8, v3
+; GFX7GLISEL-NEXT:    v_cmp_gt_u32_e32 vcc, v3, v2
 ; GFX7GLISEL-NEXT:    s_or_b64 s[6:7], s[6:7], vcc
 ; GFX7GLISEL-NEXT:    v_subrev_i32_e32 v0, vcc, 0x400, v1
 ; GFX7GLISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX7GLISEL-NEXT:    v_mov_b32_e32 v1, 0x7800
-; GFX7GLISEL-NEXT:    v_cmp_eq_u32_e64 s[4:5], v2, v3
 ; GFX7GLISEL-NEXT:    v_cmp_lt_u32_e32 vcc, v0, v1
 ; GFX7GLISEL-NEXT:    s_and_b64 s[4:5], vcc, s[4:5]
 ; GFX7GLISEL-NEXT:    s_or_b64 s[4:5], s[6:7], s[4:5]
@@ -1922,11 +1922,11 @@ define i1 @not_issubnormal_f16(half %x) {
 ; GFX7GLISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7GLISEL-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
 ; GFX7GLISEL-NEXT:    v_and_b32_e32 v1, 0xffff, v0
-; GFX7GLISEL-NEXT:    s_movk_i32 s6, 0x7c00
+; GFX7GLISEL-NEXT:    v_mov_b32_e32 v2, 0x7c00
 ; GFX7GLISEL-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
-; GFX7GLISEL-NEXT:    v_cmp_eq_u32_e64 s[4:5], s6, v1
+; GFX7GLISEL-NEXT:    v_cmp_eq_u32_e64 s[4:5], v1, v2
 ; GFX7GLISEL-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
-; GFX7GLISEL-NEXT:    v_cmp_lt_u32_e32 vcc, s6, v1
+; GFX7GLISEL-NEXT:    v_cmp_gt_u32_e32 vcc, v1, v2
 ; GFX7GLISEL-NEXT:    s_or_b64 s[4:5], s[4:5], vcc
 ; GFX7GLISEL-NEXT:    v_subrev_i32_e32 v0, vcc, 0x400, v0
 ; GFX7GLISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
@@ -2040,10 +2040,10 @@ define i1 @not_iszero_f16(half %x) {
 ; GFX7GLISEL-NEXT:    v_mov_b32_e32 v2, 0x3ff
 ; GFX7GLISEL-NEXT:    v_cmp_lt_u32_e32 vcc, v1, v2
 ; GFX7GLISEL-NEXT:    v_and_b32_e32 v1, 0xffff, v0
-; GFX7GLISEL-NEXT:    s_movk_i32 s6, 0x7c00
-; GFX7GLISEL-NEXT:    v_cmp_eq_u32_e64 s[4:5], s6, v1
+; GFX7GLISEL-NEXT:    v_mov_b32_e32 v2, 0x7c00
+; GFX7GLISEL-NEXT:    v_cmp_eq_u32_e64 s[4:5], v1, v2
 ; GFX7GLISEL-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
-; GFX7GLISEL-NEXT:    v_cmp_lt_u32_e32 vcc, s6, v1
+; GFX7GLISEL-NEXT:    v_cmp_gt_u32_e32 vcc, v1, v2
 ; GFX7GLISEL-NEXT:    s_or_b64 s[4:5], s[4:5], vcc
 ; GFX7GLISEL-NEXT:    v_subrev_i32_e32 v0, vcc, 0x400, v0
 ; GFX7GLISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
@@ -2100,9 +2100,9 @@ define i1 @ispositive_f16(half %x) {
 ; GFX7GLISEL:       ; %bb.0:
 ; GFX7GLISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7GLISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX7GLISEL-NEXT:    s_movk_i32 s4, 0x7c00
-; GFX7GLISEL-NEXT:    v_cmp_gt_u32_e32 vcc, s4, v0
-; GFX7GLISEL-NEXT:    v_cmp_eq_u32_e64 s[4:5], s4, v0
+; GFX7GLISEL-NEXT:    v_mov_b32_e32 v1, 0x7c00
+; GFX7GLISEL-NEXT:    v_cmp_lt_u32_e32 vcc, v0, v1
+; GFX7GLISEL-NEXT:    v_cmp_eq_u32_e64 s[4:5], v0, v1
 ; GFX7GLISEL-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX7GLISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
 ; GFX7GLISEL-NEXT:    s_setpc_b64 s[30:31]
@@ -2165,14 +2165,14 @@ define i1 @not_ispositive_f16(half %x) {
 ; GFX7GLISEL-NEXT:    v_and_b32_e32 v1, 0x7fff, v0
 ; GFX7GLISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX7GLISEL-NEXT:    v_and_b32_e32 v1, 0xffff, v1
-; GFX7GLISEL-NEXT:    s_movk_i32 s6, 0x7c00
+; GFX7GLISEL-NEXT:    v_mov_b32_e32 v2, 0x7c00
 ; GFX7GLISEL-NEXT:    v_cmp_ne_u32_e32 vcc, v0, v1
-; GFX7GLISEL-NEXT:    v_cmp_gt_u32_e64 s[4:5], s6, v1
-; GFX7GLISEL-NEXT:    v_mov_b32_e32 v2, 0xfc00
+; GFX7GLISEL-NEXT:    v_cmp_lt_u32_e64 s[4:5], v1, v2
+; GFX7GLISEL-NEXT:    v_mov_b32_e32 v3, 0xfc00
 ; GFX7GLISEL-NEXT:    s_and_b64 s[4:5], s[4:5], vcc
-; GFX7GLISEL-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v2
+; GFX7GLISEL-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v3
 ; GFX7GLISEL-NEXT:    s_or_b64 s[4:5], s[4:5], vcc
-; GFX7GLISEL-NEXT:    v_cmp_lt_u32_e32 vcc, s6, v1
+; GFX7GLISEL-NEXT:    v_cmp_gt_u32_e32 vcc, v1, v2
 ; GFX7GLISEL-NEXT:    s_or_b64 s[4:5], s[4:5], vcc
 ; GFX7GLISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
 ; GFX7GLISEL-NEXT:    s_setpc_b64 s[30:31]
@@ -2291,12 +2291,12 @@ define i1 @not_isnegative_f16(half %x) {
 ; GFX7GLISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7GLISEL-NEXT:    v_and_b32_e32 v1, 0x7fff, v0
 ; GFX7GLISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX7GLISEL-NEXT:    s_movk_i32 s6, 0x7c00
-; GFX7GLISEL-NEXT:    v_cmp_gt_u32_e32 vcc, s6, v0
-; GFX7GLISEL-NEXT:    v_cmp_eq_u32_e64 s[4:5], s6, v0
+; GFX7GLISEL-NEXT:    v_mov_b32_e32 v2, 0x7c00
+; GFX7GLISEL-NEXT:    v_cmp_lt_u32_e32 vcc, v0, v2
+; GFX7GLISEL-NEXT:    v_cmp_eq_u32_e64 s[4:5], v0, v2
 ; GFX7GLISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v1
 ; GFX7GLISEL-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
-; GFX7GLISEL-NEXT:    v_cmp_lt_u32_e32 vcc, s6, v0
+; GFX7GLISEL-NEXT:    v_cmp_gt_u32_e32 vcc, v0, v2
 ; GFX7GLISEL-NEXT:    s_or_b64 s[4:5], s[4:5], vcc
 ; GFX7GLISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
 ; GFX7GLISEL-NEXT:    s_setpc_b64 s[30:31]
@@ -2870,11 +2870,11 @@ define i1 @not_iszero_or_qnan_f16(half %x) {
 ; GFX7GLISEL-NEXT:    v_mov_b32_e32 v2, 0x3ff
 ; GFX7GLISEL-NEXT:    v_cmp_lt_u32_e32 vcc, v1, v2
 ; GFX7GLISEL-NEXT:    v_and_b32_e32 v1, 0xffff, v0
-; GFX7GLISEL-NEXT:    s_movk_i32 s8, 0x7c00
-; GFX7GLISEL-NEXT:    v_cmp_eq_u32_e64 s[4:5], s8, v1
-; GFX7GLISEL-NEXT:    v_mov_b32_e32 v2, 0x7e00
+; GFX7GLISEL-NEXT:    v_mov_b32_e32 v2, 0x7c00
+; GFX7GLISEL-NEXT:    v_cmp_eq_u32_e64 s[4:5], v1, v2
 ; GFX7GLISEL-NEXT:    s_or_b64 s[6:7], vcc, s[4:5]
-; GFX7GLISEL-NEXT:    v_cmp_lt_u32_e32 vcc, s8, v1
+; GFX7GLISEL-NEXT:    v_cmp_gt_u32_e32 vcc, v1, v2
+; GFX7GLISEL-NEXT:    v_mov_b32_e32 v2, 0x7e00
 ; GFX7GLISEL-NEXT:    v_cmp_lt_u32_e64 s[4:5], v1, v2
 ; GFX7GLISEL-NEXT:    s_and_b64 s[4:5], vcc, s[4:5]
 ; GFX7GLISEL-NEXT:    v_subrev_i32_e32 v0, vcc, 0x400, v0
@@ -3016,9 +3016,9 @@ define i1 @isinf_or_nan_f16(half %x) {
 ; GFX7GLISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7GLISEL-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
 ; GFX7GLISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX7GLISEL-NEXT:    s_movk_i32 s4, 0x7c00
-; GFX7GLISEL-NEXT:    v_cmp_eq_u32_e32 vcc, s4, v0
-; GFX7GLISEL-NEXT:    v_cmp_lt_u32_e64 s[4:5], s4, v0
+; GFX7GLISEL-NEXT:    v_mov_b32_e32 v1, 0x7c00
+; GFX7GLISEL-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX7GLISEL-NEXT:    v_cmp_gt_u32_e64 s[4:5], v0, v1
 ; GFX7GLISEL-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX7GLISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
 ; GFX7GLISEL-NEXT:    s_setpc_b64 s[30:31]
@@ -3128,9 +3128,9 @@ define i1 @isfinite_or_nan_f(half %x) {
 ; GFX7GLISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX7GLISEL-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
 ; GFX7GLISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX7GLISEL-NEXT:    s_movk_i32 s4, 0x7c00
-; GFX7GLISEL-NEXT:    v_cmp_gt_u32_e32 vcc, s4, v0
-; GFX7GLISEL-NEXT:    v_cmp_lt_u32_e64 s[4:5], s4, v0
+; GFX7GLISEL-NEXT:    v_mov_b32_e32 v1, 0x7c00
+; GFX7GLISEL-NEXT:    v_cmp_lt_u32_e32 vcc, v0, v1
+; GFX7GLISEL-NEXT:    v_cmp_gt_u32_e64 s[4:5], v0, v1
 ; GFX7GLISEL-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
 ; GFX7GLISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[4:5]
 ; GFX7GLISEL-NEXT:    s_setpc_b64 s[30:31]
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.log.ll b/llvm/test/CodeGen/AMDGPU/llvm.log.ll
index ab2618934c4d99e..a0b2d3b32b7957f 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.log.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.log.ll
@@ -46,20 +46,20 @@ define amdgpu_kernel void @s_log_f32(ptr addrspace(1) %out, float %in) {
 ; SI-GISEL-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v0, 0x800000
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x4f800000
-; SI-GISEL-NEXT:    s_mov_b32 s0, 0x3f317217
+; SI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x3377d1cf
 ; SI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, s2, v0
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v0, 1.0, v1, vcc
 ; SI-GISEL-NEXT:    v_mul_f32_e32 v0, s2, v0
 ; SI-GISEL-NEXT:    v_log_f32_e32 v0, v0
-; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x3377d1cf
-; SI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x7f800000
+; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x3f317217
+; SI-GISEL-NEXT:    v_mov_b32_e32 v3, 0x7f800000
 ; SI-GISEL-NEXT:    s_mov_b32 s6, -1
-; SI-GISEL-NEXT:    v_mul_f32_e32 v3, 0x3f317217, v0
-; SI-GISEL-NEXT:    v_fma_f32 v4, v0, s0, -v3
-; SI-GISEL-NEXT:    v_fma_f32 v1, v0, v1, v4
-; SI-GISEL-NEXT:    v_add_f32_e32 v1, v3, v1
-; SI-GISEL-NEXT:    v_cmp_lt_f32_e64 s[0:1], |v0|, v2
+; SI-GISEL-NEXT:    v_mul_f32_e32 v4, 0x3f317217, v0
+; SI-GISEL-NEXT:    v_fma_f32 v1, v0, v1, -v4
+; SI-GISEL-NEXT:    v_fma_f32 v1, v0, v2, v1
+; SI-GISEL-NEXT:    v_add_f32_e32 v1, v4, v1
+; SI-GISEL-NEXT:    v_cmp_lt_f32_e64 s[0:1], |v0|, v3
 ; SI-GISEL-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s[0:1]
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x41b17218
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
@@ -165,19 +165,19 @@ define amdgpu_kernel void @s_log_f32(ptr addrspace(1) %out, float %in) {
 ; GFX900-GISEL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v0, 0x800000
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x4f800000
-; GFX900-GISEL-NEXT:    s_mov_b32 s0, 0x3f317217
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v2, 0x3377d1cf
 ; GFX900-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, s4, v0
 ; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v0, 1.0, v1, vcc
 ; GFX900-GISEL-NEXT:    v_mul_f32_e32 v0, s4, v0
 ; GFX900-GISEL-NEXT:    v_log_f32_e32 v0, v0
-; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x3377d1cf
-; GFX900-GISEL-NEXT:    v_mov_b32_e32 v2, 0x7f800000
-; GFX900-GISEL-NEXT:    v_mul_f32_e32 v3, 0x3f317217, v0
-; GFX900-GISEL-NEXT:    v_fma_f32 v4, v0, s0, -v3
-; GFX900-GISEL-NEXT:    v_fma_f32 v1, v0, v1, v4
-; GFX900-GISEL-NEXT:    v_add_f32_e32 v1, v3, v1
-; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e64 s[0:1], |v0|, v2
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x3f317217
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v3, 0x7f800000
+; GFX900-GISEL-NEXT:    v_mul_f32_e32 v4, 0x3f317217, v0
+; GFX900-GISEL-NEXT:    v_fma_f32 v1, v0, v1, -v4
+; GFX900-GISEL-NEXT:    v_fma_f32 v1, v0, v2, v1
+; GFX900-GISEL-NEXT:    v_add_f32_e32 v1, v4, v1
+; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e64 s[0:1], |v0|, v3
 ; GFX900-GISEL-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s[0:1]
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x41b17218
 ; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
@@ -229,7 +229,7 @@ define amdgpu_kernel void @s_log_f32(ptr addrspace(1) %out, float %in) {
 ; GFX1100-GISEL-NEXT:    s_waitcnt_depctr 0xfff
 ; GFX1100-GISEL-NEXT:    v_mul_f32_e32 v1, 0x3f317217, v0
 ; GFX1100-GISEL-NEXT:    v_cmp_gt_f32_e64 vcc_lo, 0x7f800000, |v0|
-; GFX1100-GISEL-NEXT:    v_fma_f32 v2, v0, 0x3f317217, -v1
+; GFX1100-GISEL-NEXT:    v_fma_f32 v2, 0x3f317217, v0, -v1
 ; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1100-GISEL-NEXT:    v_fmac_f32_e32 v2, 0x3377d1cf, v0
 ; GFX1100-GISEL-NEXT:    v_add_f32_e32 v1, v1, v2
@@ -360,41 +360,38 @@ define amdgpu_kernel void @s_log_v2f32(ptr addrspace(1) %out, <2 x float> %in) {
 ; SI-GISEL-LABEL: s_log_v2f32:
 ; SI-GISEL:       ; %bb.0:
 ; SI-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
-; SI-GISEL-NEXT:    s_mov_b32 s0, 0x800000
+; SI-GISEL-NEXT:    v_mov_b32_e32 v0, 0x800000
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x4f800000
-; SI-GISEL-NEXT:    s_mov_b32 s2, 0x3377d1cf
-; SI-GISEL-NEXT:    s_mov_b32 s3, 0x7f800000
+; SI-GISEL-NEXT:    v_mov_b32_e32 v3, 0x3f317217
+; SI-GISEL-NEXT:    v_mov_b32_e32 v4, 0x3377d1cf
 ; SI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-GISEL-NEXT:    v_mov_b32_e32 v2, s6
-; SI-GISEL-NEXT:    v_cmp_gt_f32_e32 vcc, s0, v2
+; SI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, s6, v0
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v2, 1.0, v1, vcc
 ; SI-GISEL-NEXT:    v_mul_f32_e32 v2, s6, v2
 ; SI-GISEL-NEXT:    v_log_f32_e32 v2, v2
-; SI-GISEL-NEXT:    s_mov_b32 s0, 0x3f317217
-; SI-GISEL-NEXT:    v_mov_b32_e32 v0, 0x800000
-; SI-GISEL-NEXT:    v_mov_b32_e32 v3, 0x3f317217
-; SI-GISEL-NEXT:    v_mul_f32_e32 v4, 0x3f317217, v2
-; SI-GISEL-NEXT:    v_fma_f32 v5, v2, s0, -v4
-; SI-GISEL-NEXT:    v_fma_f32 v5, v2, s2, v5
-; SI-GISEL-NEXT:    v_add_f32_e32 v4, v4, v5
-; SI-GISEL-NEXT:    v_cmp_lt_f32_e64 s[0:1], |v2|, s3
-; SI-GISEL-NEXT:    v_cndmask_b32_e64 v2, v2, v4, s[0:1]
+; SI-GISEL-NEXT:    v_mov_b32_e32 v5, 0x7f800000
+; SI-GISEL-NEXT:    s_mov_b32 s6, -1
+; SI-GISEL-NEXT:    v_mul_f32_e32 v6, 0x3f317217, v2
+; SI-GISEL-NEXT:    v_fma_f32 v7, v2, v3, -v6
+; SI-GISEL-NEXT:    v_fma_f32 v7, v2, v4, v7
+; SI-GISEL-NEXT:    v_add_f32_e32 v6, v6, v7
+; SI-GISEL-NEXT:    v_cmp_lt_f32_e64 s[0:1], |v2|, v5
+; SI-GISEL-NEXT:    v_cndmask_b32_e64 v2, v2, v6, s[0:1]
 ; SI-GISEL-NEXT:    v_cmp_lt_f32_e64 s[0:1], s7, v0
 ; SI-GISEL-NEXT:    v_cndmask_b32_e64 v0, 1.0, v1, s[0:1]
 ; SI-GISEL-NEXT:    v_mul_f32_e32 v0, s7, v0
 ; SI-GISEL-NEXT:    v_log_f32_e32 v1, v0
-; SI-GISEL-NEXT:    v_mov_b32_e32 v4, 0x41b17218
-; SI-GISEL-NEXT:    v_cndmask_b32_e32 v0, 0, v4, vcc
+; SI-GISEL-NEXT:    v_mov_b32_e32 v6, 0x41b17218
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v0, 0, v6, vcc
 ; SI-GISEL-NEXT:    v_sub_f32_e32 v0, v2, v0
 ; SI-GISEL-NEXT:    v_mul_f32_e32 v2, 0x3f317217, v1
 ; SI-GISEL-NEXT:    v_fma_f32 v3, v1, v3, -v2
-; SI-GISEL-NEXT:    v_fma_f32 v3, v1, s2, v3
+; SI-GISEL-NEXT:    v_fma_f32 v3, v1, v4, v3
 ; SI-GISEL-NEXT:    v_add_f32_e32 v2, v2, v3
-; SI-GISEL-NEXT:    v_cmp_lt_f32_e64 vcc, |v1|, s3
+; SI-GISEL-NEXT:    v_cmp_lt_f32_e64 vcc, |v1|, v5
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
-; SI-GISEL-NEXT:    v_cndmask_b32_e64 v2, 0, v4, s[0:1]
+; SI-GISEL-NEXT:    v_cndmask_b32_e64 v2, 0, v6, s[0:1]
 ; SI-GISEL-NEXT:    v_sub_f32_e32 v1, v1, v2
-; SI-GISEL-NEXT:    s_mov_b32 s6, -1
 ; SI-GISEL-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-GISEL-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; SI-GISEL-NEXT:    s_endpgm
@@ -449,47 +446,44 @@ define amdgpu_kernel void @s_log_v2f32(ptr addrspace(1) %out, <2 x float> %in) {
 ; VI-GISEL-LABEL: s_log_v2f32:
 ; VI-GISEL:       ; %bb.0:
 ; VI-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-GISEL-NEXT:    s_mov_b32 s0, 0x800000
-; VI-GISEL-NEXT:    v_mov_b32_e32 v0, 0x4f800000
-; VI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x800000
+; VI-GISEL-NEXT:    v_mov_b32_e32 v0, 0x800000
+; VI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x4f800000
+; VI-GISEL-NEXT:    v_mov_b32_e32 v3, 0x7f800000
 ; VI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT:    v_mov_b32_e32 v1, s6
-; VI-GISEL-NEXT:    v_cmp_gt_f32_e32 vcc, s0, v1
-; VI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 1.0, v0, vcc
-; VI-GISEL-NEXT:    v_mul_f32_e32 v1, s6, v1
-; VI-GISEL-NEXT:    v_log_f32_e32 v1, v1
-; VI-GISEL-NEXT:    s_mov_b32 s0, 0x7f800000
-; VI-GISEL-NEXT:    v_and_b32_e32 v3, 0xfffff000, v1
-; VI-GISEL-NEXT:    v_sub_f32_e32 v4, v1, v3
-; VI-GISEL-NEXT:    v_mul_f32_e32 v5, 0x3805fdf4, v3
+; VI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, s6, v0
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v2, 1.0, v1, vcc
+; VI-GISEL-NEXT:    v_mul_f32_e32 v2, s6, v2
+; VI-GISEL-NEXT:    v_log_f32_e32 v2, v2
+; VI-GISEL-NEXT:    v_and_b32_e32 v4, 0xfffff000, v2
+; VI-GISEL-NEXT:    v_sub_f32_e32 v5, v2, v4
 ; VI-GISEL-NEXT:    v_mul_f32_e32 v6, 0x3805fdf4, v4
+; VI-GISEL-NEXT:    v_mul_f32_e32 v7, 0x3805fdf4, v5
+; VI-GISEL-NEXT:    v_mul_f32_e32 v5, 0x3f317000, v5
+; VI-GISEL-NEXT:    v_add_f32_e32 v6, v6, v7
 ; VI-GISEL-NEXT:    v_mul_f32_e32 v4, 0x3f317000, v4
 ; VI-GISEL-NEXT:    v_add_f32_e32 v5, v5, v6
-; VI-GISEL-NEXT:    v_mul_f32_e32 v3, 0x3f317000, v3
 ; VI-GISEL-NEXT:    v_add_f32_e32 v4, v4, v5
-; VI-GISEL-NEXT:    v_add_f32_e32 v3, v3, v4
-; VI-GISEL-NEXT:    v_cmp_lt_f32_e64 s[0:1], |v1|, s0
-; VI-GISEL-NEXT:    v_cndmask_b32_e64 v1, v1, v3, s[0:1]
-; VI-GISEL-NEXT:    v_cmp_lt_f32_e64 s[0:1], s7, v2
-; VI-GISEL-NEXT:    v_cndmask_b32_e64 v0, 1.0, v0, s[0:1]
+; VI-GISEL-NEXT:    v_cmp_lt_f32_e64 s[0:1], |v2|, v3
+; VI-GISEL-NEXT:    v_cndmask_b32_e64 v2, v2, v4, s[0:1]
+; VI-GISEL-NEXT:    v_cmp_lt_f32_e64 s[0:1], s7, v0
+; VI-GISEL-NEXT:    v_cndmask_b32_e64 v0, 1.0, v1, s[0:1]
 ; VI-GISEL-NEXT:    v_mul_f32_e32 v0, s7, v0
-; VI-GISEL-NEXT:    v_log_f32_e32 v2, v0
-; VI-GISEL-NEXT:    v_mov_b32_e32 v3, 0x41b17218
-; VI-GISEL-NEXT:    v_cndmask_b32_e32 v0, 0, v3, vcc
-; VI-GISEL-NEXT:    v_sub_f32_e32 v0, v1, v0
-; VI-GISEL-NEXT:    v_and_b32_e32 v1, 0xfffff000, v2
-; VI-GISEL-NEXT:    v_sub_f32_e32 v5, v2, v1
+; VI-GISEL-NEXT:    v_log_f32_e32 v1, v0
+; VI-GISEL-NEXT:    v_mov_b32_e32 v4, 0x41b17218
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v0, 0, v4, vcc
+; VI-GISEL-NEXT:    v_sub_f32_e32 v0, v2, v0
+; VI-GISEL-NEXT:    v_and_b32_e32 v2, 0xfffff000, v1
+; VI-GISEL-NEXT:    v_sub_f32_e32 v5, v1, v2
 ; VI-GISEL-NEXT:    v_mul_f32_e32 v6, 0x3805fdf4, v5
-; VI-GISEL-NEXT:    v_mul_f32_e32 v7, 0x3805fdf4, v1
+; VI-GISEL-NEXT:    v_mul_f32_e32 v7, 0x3805fdf4, v2
 ; VI-GISEL-NEXT:    v_add_f32_e32 v6, v7, v6
 ; VI-GISEL-NEXT:    v_mul_f32_e32 v5, 0x3f317000, v5
-; VI-GISEL-NEXT:    v_mov_b32_e32 v4, 0x7f800000
 ; VI-GISEL-NEXT:    v_add_f32_e32 v5, v5, v6
-; VI-GISEL-NEXT:    v_mul_f32_e32 v1, 0x3f317000, v1
-; VI-GISEL-NEXT:    v_add_f32_e32 v1, v1, v5
-; VI-GISEL-NEXT:    v_cmp_lt_f32_e64 vcc, |v2|, v4
-; VI-GISEL-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
-; VI-GISEL-NEXT:    v_cndmask_b32_e64 v2, 0, v3, s[0:1]
+; VI-GISEL-NEXT:    v_mul_f32_e32 v2, 0x3f317000, v2
+; VI-GISEL-NEXT:    v_add_f32_e32 v2, v2, v5
+; VI-GISEL-NEXT:    v_cmp_lt_f32_e64 vcc, |v1|, v3
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
+; VI-GISEL-NEXT:    v_cndmask_b32_e64 v2, 0, v4, s[0:1]
 ; VI-GISEL-NEXT:    v_sub_f32_e32 v1, v1, v2
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v2, s4
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v3, s5
@@ -537,39 +531,36 @@ define amdgpu_kernel void @s_log_v2f32(ptr addrspace(1) %out, <2 x float> %in) {
 ; GFX900-GISEL-LABEL: s_log_v2f32:
 ; GFX900-GISEL:       ; %bb.0:
 ; GFX900-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX900-GISEL-NEXT:    s_mov_b32 s0, 0x800000
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v0, 0x800000
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x4f800000
-; GFX900-GISEL-NEXT:    s_mov_b32 s2, 0x3377d1cf
-; GFX900-GISEL-NEXT:    s_mov_b32 s3, 0x7f800000
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v3, 0x3f317217
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v4, 0x3377d1cf
 ; GFX900-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX900-GISEL-NEXT:    v_mov_b32_e32 v2, s6
-; GFX900-GISEL-NEXT:    v_cmp_gt_f32_e32 vcc, s0, v2
+; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, s6, v0
 ; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v2, 1.0, v1, vcc
 ; GFX900-GISEL-NEXT:    v_mul_f32_e32 v2, s6, v2
 ; GFX900-GISEL-NEXT:    v_log_f32_e32 v2, v2
-; GFX900-GISEL-NEXT:    s_mov_b32 s0, 0x3f317217
-; GFX900-GISEL-NEXT:    v_mov_b32_e32 v0, 0x800000
-; GFX900-GISEL-NEXT:    v_mov_b32_e32 v3, 0x3f317217
-; GFX900-GISEL-NEXT:    v_mul_f32_e32 v4, 0x3f317217, v2
-; GFX900-GISEL-NEXT:    v_fma_f32 v5, v2, s0, -v4
-; GFX900-GISEL-NEXT:    v_fma_f32 v5, v2, s2, v5
-; GFX900-GISEL-NEXT:    v_add_f32_e32 v4, v4, v5
-; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e64 s[0:1], |v2|, s3
-; GFX900-GISEL-NEXT:    v_cndmask_b32_e64 v2, v2, v4, s[0:1]
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v5, 0x7f800000
+; GFX900-GISEL-NEXT:    v_mul_f32_e32 v6, 0x3f317217, v2
+; GFX900-GISEL-NEXT:    v_fma_f32 v7, v2, v3, -v6
+; GFX900-GISEL-NEXT:    v_fma_f32 v7, v2, v4, v7
+; GFX900-GISEL-NEXT:    v_add_f32_e32 v6, v6, v7
+; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e64 s[0:1], |v2|, v5
+; GFX900-GISEL-NEXT:    v_cndmask_b32_e64 v2, v2, v6, s[0:1]
 ; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e64 s[0:1], s7, v0
 ; GFX900-GISEL-NEXT:    v_cndmask_b32_e64 v0, 1.0, v1, s[0:1]
 ; GFX900-GISEL-NEXT:    v_mul_f32_e32 v0, s7, v0
 ; GFX900-GISEL-NEXT:    v_log_f32_e32 v1, v0
-; GFX900-GISEL-NEXT:    v_mov_b32_e32 v4, 0x41b17218
-; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v0, 0, v4, vcc
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v6, 0x41b17218
+; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v0, 0, v6, vcc
 ; GFX900-GISEL-NEXT:    v_sub_f32_e32 v0, v2, v0
 ; GFX900-GISEL-NEXT:    v_mul_f32_e32 v2, 0x3f317217, v1
 ; GFX900-GISEL-NEXT:    v_fma_f32 v3, v1, v3, -v2
-; GFX900-GISEL-NEXT:    v_fma_f32 v3, v1, s2, v3
+; GFX900-GISEL-NEXT:    v_fma_f32 v3, v1, v4, v3
 ; GFX900-GISEL-NEXT:    v_add_f32_e32 v2, v2, v3
-; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e64 vcc, |v1|, s3
+; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e64 vcc, |v1|, v5
 ; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
-; GFX900-GISEL-NEXT:    v_cndmask_b32_e64 v2, 0, v4, s[0:1]
+; GFX900-GISEL-NEXT:    v_cndmask_b32_e64 v2, 0, v6, s[0:1]
 ; GFX900-GISEL-NEXT:    v_sub_f32_e32 v1, v1, v2
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX900-GISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
@@ -627,7 +618,7 @@ define amdgpu_kernel void @s_log_v2f32(ptr addrspace(1) %out, <2 x float> %in) {
 ; GFX1100-GISEL-NEXT:    s_waitcnt_depctr 0xfff
 ; GFX1100-GISEL-NEXT:    v_dual_mul_f32 v2, 0x3f317217, v0 :: v_dual_mul_f32 v3, 0x3f317217, v1
 ; GFX1100-GISEL-NEXT:    v_cmp_gt_f32_e64 vcc_lo, 0x7f800000, |v0|
-; GFX1100-GISEL-NEXT:    v_fma_f32 v4, v0, 0x3f317217, -v2
+; GFX1100-GISEL-NEXT:    v_fma_f32 v4, 0x3f317217, v0, -v2
 ; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1100-GISEL-NEXT:    v_fma_f32 v5, 0x3f317217, v1, -v3
 ; GFX1100-GISEL-NEXT:    v_dual_fmac_f32 v4, 0x3377d1cf, v0 :: v_dual_fmac_f32 v5, 0x3377d1cf, v1
@@ -813,54 +804,49 @@ define amdgpu_kernel void @s_log_v3f32(ptr addrspace(1) %out, <3 x float> %in) {
 ; SI-GISEL:       ; %bb.0:
 ; SI-GISEL-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0xd
 ; SI-GISEL-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
-; SI-GISEL-NEXT:    s_mov_b32 s0, 0x800000
+; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x4f800000
-; SI-GISEL-NEXT:    s_mov_b32 s2, 0x3377d1cf
+; SI-GISEL-NEXT:    v_mov_b32_e32 v3, 0x3f317217
 ; SI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-GISEL-NEXT:    v_mov_b32_e32 v0, s8
-; SI-GISEL-NEXT:    v_cmp_gt_f32_e32 vcc, s0, v0
+; SI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, s8, v1
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v0, 1.0, v2, vcc
 ; SI-GISEL-NEXT:    v_mul_f32_e32 v0, s8, v0
 ; SI-GISEL-NEXT:    v_log_f32_e32 v0, v0
-; SI-GISEL-NEXT:    s_mov_b32 s0, 0x3f317217
-; SI-GISEL-NEXT:    s_mov_b32 s3, 0x7f800000
-; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
-; SI-GISEL-NEXT:    v_mul_f32_e32 v4, 0x3f317217, v0
-; SI-GISEL-NEXT:    v_fma_f32 v5, v0, s0, -v4
-; SI-GISEL-NEXT:    v_fma_f32 v5, v0, s2, v5
-; SI-GISEL-NEXT:    v_add_f32_e32 v4, v4, v5
-; SI-GISEL-NEXT:    v_cmp_lt_f32_e64 s[0:1], |v0|, s3
-; SI-GISEL-NEXT:    v_cndmask_b32_e64 v0, v0, v4, s[0:1]
+; SI-GISEL-NEXT:    v_mov_b32_e32 v4, 0x3377d1cf
+; SI-GISEL-NEXT:    v_mov_b32_e32 v5, 0x7f800000
+; SI-GISEL-NEXT:    s_mov_b32 s6, -1
+; SI-GISEL-NEXT:    v_mul_f32_e32 v6, 0x3f317217, v0
+; SI-GISEL-NEXT:    v_fma_f32 v7, v0, v3, -v6
+; SI-GISEL-NEXT:    v_fma_f32 v7, v0, v4, v7
+; SI-GISEL-NEXT:    v_add_f32_e32 v6, v6, v7
+; SI-GISEL-NEXT:    v_cmp_lt_f32_e64 s[0:1], |v0|, v5
+; SI-GISEL-NEXT:    v_cndmask_b32_e64 v0, v0, v6, s[0:1]
 ; SI-GISEL-NEXT:    v_cmp_lt_f32_e64 s[0:1], s9, v1
-; SI-GISEL-NEXT:    v_cndmask_b32_e64 v4, 1.0, v2, s[0:1]
-; SI-GISEL-NEXT:    v_mul_f32_e32 v4, s9, v4
-; SI-GISEL-NEXT:    v_log_f32_e32 v4, v4
+; SI-GISEL-NEXT:    v_cndmask_b32_e64 v6, 1.0, v2, s[0:1]
+; SI-GISEL-NEXT:    v_mul_f32_e32 v6, s9, v6
+; SI-GISEL-NEXT:    v_log_f32_e32 v6, v6
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v7, 0x41b17218
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v8, 0, v7, vcc
 ; SI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, s10, v1
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 1.0, v2, vcc
-; SI-GISEL-NEXT:    v_mov_b32_e32 v3, 0x3f317217
 ; SI-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v8
-; SI-GISEL-NEXT:    v_mul_f32_e32 v8, 0x3f317217, v4
+; SI-GISEL-NEXT:    v_mul_f32_e32 v8, 0x3f317217, v6
 ; SI-GISEL-NEXT:    v_mul_f32_e32 v1, s10, v1
-; SI-GISEL-NEXT:    v_fma_f32 v9, v4, v3, -v8
+; SI-GISEL-NEXT:    v_fma_f32 v9, v6, v3, -v8
 ; SI-GISEL-NEXT:    v_log_f32_e32 v2, v1
-; SI-GISEL-NEXT:    v_fma_f32 v9, v4, s2, v9
+; SI-GISEL-NEXT:    v_fma_f32 v9, v6, v4, v9
 ; SI-GISEL-NEXT:    v_add_f32_e32 v8, v8, v9
-; SI-GISEL-NEXT:    v_cmp_lt_f32_e64 s[2:3], |v4|, s3
-; SI-GISEL-NEXT:    v_cndmask_b32_e64 v1, v4, v8, s[2:3]
-; SI-GISEL-NEXT:    v_cndmask_b32_e64 v4, 0, v7, s[0:1]
-; SI-GISEL-NEXT:    v_sub_f32_e32 v1, v1, v4
-; SI-GISEL-NEXT:    v_mul_f32_e32 v4, 0x3f317217, v2
-; SI-GISEL-NEXT:    v_mov_b32_e32 v6, 0x3377d1cf
-; SI-GISEL-NEXT:    v_fma_f32 v3, v2, v3, -v4
-; SI-GISEL-NEXT:    v_mov_b32_e32 v5, 0x7f800000
-; SI-GISEL-NEXT:    v_fma_f32 v3, v2, v6, v3
-; SI-GISEL-NEXT:    v_add_f32_e32 v3, v4, v3
+; SI-GISEL-NEXT:    v_cmp_lt_f32_e64 s[2:3], |v6|, v5
+; SI-GISEL-NEXT:    v_cndmask_b32_e64 v1, v6, v8, s[2:3]
+; SI-GISEL-NEXT:    v_cndmask_b32_e64 v6, 0, v7, s[0:1]
+; SI-GISEL-NEXT:    v_sub_f32_e32 v1, v1, v6
+; SI-GISEL-NEXT:    v_mul_f32_e32 v6, 0x3f317217, v2
+; SI-GISEL-NEXT:    v_fma_f32 v3, v2, v3, -v6
+; SI-GISEL-NEXT:    v_fma_f32 v3, v2, v4, v3
+; SI-GISEL-NEXT:    v_add_f32_e32 v3, v6, v3
 ; SI-GISEL-NEXT:    v_cmp_lt_f32_e64 s[0:1], |v2|, v5
 ; SI-GISEL-NEXT:    v_cndmask_b32_e64 v2, v2, v3, s[0:1]
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v3, 0, v7, vcc
-; SI-GISEL-NEXT:    s_mov_b32 s6, -1
 ; SI-GISEL-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-GISEL-NEXT:    v_sub_f32_e32 v2, v2, v3
 ; SI-GISEL-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
@@ -937,45 +923,42 @@ define amdgpu_kernel void @s_log_v3f32(ptr addrspace(1) %out, <3 x float> %in) {
 ; VI-GISEL:       ; %bb.0:
 ; VI-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x34
 ; VI-GISEL-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x24
-; VI-GISEL-NEXT:    s_mov_b32 s2, 0x800000
-; VI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x4f800000
-; VI-GISEL-NEXT:    s_mov_b32 s0, 0x7f800000
+; VI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
+; VI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x4f800000
 ; VI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT:    v_mov_b32_e32 v0, s4
-; VI-GISEL-NEXT:    v_cmp_gt_f32_e32 vcc, s2, v0
-; VI-GISEL-NEXT:    v_cndmask_b32_e32 v0, 1.0, v1, vcc
+; VI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, s4, v1
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v0, 1.0, v2, vcc
 ; VI-GISEL-NEXT:    v_mul_f32_e32 v0, s4, v0
 ; VI-GISEL-NEXT:    v_log_f32_e32 v0, v0
-; VI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x800000
 ; VI-GISEL-NEXT:    v_and_b32_e32 v3, 0xfffff000, v0
 ; VI-GISEL-NEXT:    v_sub_f32_e32 v4, v0, v3
 ; VI-GISEL-NEXT:    v_mul_f32_e32 v5, 0x3805fdf4, v3
 ; VI-GISEL-NEXT:    v_mul_f32_e32 v6, 0x3805fdf4, v4
 ; VI-GISEL-NEXT:    v_mul_f32_e32 v4, 0x3f317000, v4
 ; VI-GISEL-NEXT:    v_add_f32_e32 v5, v5, v6
-; VI-GISEL-NEXT:    v_add_f32_e32 v4, v4, v5
 ; VI-GISEL-NEXT:    v_mul_f32_e32 v3, 0x3f317000, v3
+; VI-GISEL-NEXT:    v_add_f32_e32 v4, v4, v5
 ; VI-GISEL-NEXT:    v_add_f32_e32 v3, v3, v4
-; VI-GISEL-NEXT:    v_cmp_lt_f32_e64 s[0:1], |v0|, s0
+; VI-GISEL-NEXT:    v_mov_b32_e32 v4, 0x7f800000
+; VI-GISEL-NEXT:    v_cmp_lt_f32_e64 s[0:1], |v0|, v4
 ; VI-GISEL-NEXT:    v_cndmask_b32_e64 v0, v0, v3, s[0:1]
-; VI-GISEL-NEXT:    v_cmp_lt_f32_e64 s[0:1], s5, v2
-; VI-GISEL-NEXT:    v_cndmask_b32_e64 v3, 1.0, v1, s[0:1]
+; VI-GISEL-NEXT:    v_cmp_lt_f32_e64 s[0:1], s5, v1
+; VI-GISEL-NEXT:    v_cndmask_b32_e64 v3, 1.0, v2, s[0:1]
 ; VI-GISEL-NEXT:    v_mul_f32_e32 v3, s5, v3
 ; VI-GISEL-NEXT:    v_log_f32_e32 v3, v3
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v5, 0x41b17218
 ; VI-GISEL-NEXT:    v_cndmask_b32_e32 v6, 0, v5, vcc
 ; VI-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v6
 ; VI-GISEL-NEXT:    v_and_b32_e32 v6, 0xfffff000, v3
-; VI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, s6, v2
+; VI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, s6, v1
 ; VI-GISEL-NEXT:    v_sub_f32_e32 v7, v3, v6
-; VI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 1.0, v1, vcc
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 1.0, v2, vcc
 ; VI-GISEL-NEXT:    v_mul_f32_e32 v8, 0x3805fdf4, v7
 ; VI-GISEL-NEXT:    v_mul_f32_e32 v9, 0x3805fdf4, v6
 ; VI-GISEL-NEXT:    v_mul_f32_e32 v1, s6, v1
 ; VI-GISEL-NEXT:    v_add_f32_e32 v8, v9, v8
 ; VI-GISEL-NEXT:    v_mul_f32_e32 v7, 0x3f317000, v7
 ; VI-GISEL-NEXT:    v_log_f32_e32 v2, v1
-; VI-GISEL-NEXT:    v_mov_b32_e32 v4, 0x7f800000
 ; VI-GISEL-NEXT:    v_add_f32_e32 v7, v7, v8
 ; VI-GISEL-NEXT:    v_mul_f32_e32 v6, 0x3f317000, v6
 ; VI-GISEL-NEXT:    v_add_f32_e32 v6, v6, v7
@@ -1056,50 +1039,45 @@ define amdgpu_kernel void @s_log_v3f32(ptr addrspace(1) %out, <3 x float> %in) {
 ; GFX900-GISEL:       ; %bb.0:
 ; GFX900-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x34
 ; GFX900-GISEL-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x24
-; GFX900-GISEL-NEXT:    s_mov_b32 s0, 0x800000
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v2, 0x4f800000
-; GFX900-GISEL-NEXT:    s_mov_b32 s2, 0x3377d1cf
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v3, 0x3f317217
 ; GFX900-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX900-GISEL-NEXT:    v_mov_b32_e32 v0, s4
-; GFX900-GISEL-NEXT:    v_cmp_gt_f32_e32 vcc, s0, v0
+; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, s4, v1
 ; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v0, 1.0, v2, vcc
 ; GFX900-GISEL-NEXT:    v_mul_f32_e32 v0, s4, v0
 ; GFX900-GISEL-NEXT:    v_log_f32_e32 v0, v0
-; GFX900-GISEL-NEXT:    s_mov_b32 s0, 0x3f317217
-; GFX900-GISEL-NEXT:    s_mov_b32 s3, 0x7f800000
-; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
-; GFX900-GISEL-NEXT:    v_mul_f32_e32 v4, 0x3f317217, v0
-; GFX900-GISEL-NEXT:    v_fma_f32 v5, v0, s0, -v4
-; GFX900-GISEL-NEXT:    v_fma_f32 v5, v0, s2, v5
-; GFX900-GISEL-NEXT:    v_add_f32_e32 v4, v4, v5
-; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e64 s[0:1], |v0|, s3
-; GFX900-GISEL-NEXT:    v_cndmask_b32_e64 v0, v0, v4, s[0:1]
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v4, 0x3377d1cf
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v5, 0x7f800000
+; GFX900-GISEL-NEXT:    v_mul_f32_e32 v6, 0x3f317217, v0
+; GFX900-GISEL-NEXT:    v_fma_f32 v7, v0, v3, -v6
+; GFX900-GISEL-NEXT:    v_fma_f32 v7, v0, v4, v7
+; GFX900-GISEL-NEXT:    v_add_f32_e32 v6, v6, v7
+; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e64 s[0:1], |v0|, v5
+; GFX900-GISEL-NEXT:    v_cndmask_b32_e64 v0, v0, v6, s[0:1]
 ; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e64 s[0:1], s5, v1
-; GFX900-GISEL-NEXT:    v_cndmask_b32_e64 v4, 1.0, v2, s[0:1]
-; GFX900-GISEL-NEXT:    v_mul_f32_e32 v4, s5, v4
-; GFX900-GISEL-NEXT:    v_log_f32_e32 v4, v4
+; GFX900-GISEL-NEXT:    v_cndmask_b32_e64 v6, 1.0, v2, s[0:1]
+; GFX900-GISEL-NEXT:    v_mul_f32_e32 v6, s5, v6
+; GFX900-GISEL-NEXT:    v_log_f32_e32 v6, v6
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v7, 0x41b17218
 ; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v8, 0, v7, vcc
 ; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, s6, v1
 ; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v1, 1.0, v2, vcc
-; GFX900-GISEL-NEXT:    v_mov_b32_e32 v3, 0x3f317217
 ; GFX900-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v8
-; GFX900-GISEL-NEXT:    v_mul_f32_e32 v8, 0x3f317217, v4
+; GFX900-GISEL-NEXT:    v_mul_f32_e32 v8, 0x3f317217, v6
 ; GFX900-GISEL-NEXT:    v_mul_f32_e32 v1, s6, v1
-; GFX900-GISEL-NEXT:    v_fma_f32 v9, v4, v3, -v8
+; GFX900-GISEL-NEXT:    v_fma_f32 v9, v6, v3, -v8
 ; GFX900-GISEL-NEXT:    v_log_f32_e32 v2, v1
-; GFX900-GISEL-NEXT:    v_fma_f32 v9, v4, s2, v9
+; GFX900-GISEL-NEXT:    v_fma_f32 v9, v6, v4, v9
 ; GFX900-GISEL-NEXT:    v_add_f32_e32 v8, v8, v9
-; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e64 s[2:3], |v4|, s3
-; GFX900-GISEL-NEXT:    v_cndmask_b32_e64 v1, v4, v8, s[2:3]
-; GFX900-GISEL-NEXT:    v_cndmask_b32_e64 v4, 0, v7, s[0:1]
-; GFX900-GISEL-NEXT:    v_sub_f32_e32 v1, v1, v4
-; GFX900-GISEL-NEXT:    v_mul_f32_e32 v4, 0x3f317217, v2
-; GFX900-GISEL-NEXT:    v_mov_b32_e32 v6, 0x3377d1cf
-; GFX900-GISEL-NEXT:    v_fma_f32 v3, v2, v3, -v4
-; GFX900-GISEL-NEXT:    v_mov_b32_e32 v5, 0x7f800000
-; GFX900-GISEL-NEXT:    v_fma_f32 v3, v2, v6, v3
-; GFX900-GISEL-NEXT:    v_add_f32_e32 v3, v4, v3
+; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e64 s[2:3], |v6|, v5
+; GFX900-GISEL-NEXT:    v_cndmask_b32_e64 v1, v6, v8, s[2:3]
+; GFX900-GISEL-NEXT:    v_cndmask_b32_e64 v6, 0, v7, s[0:1]
+; GFX900-GISEL-NEXT:    v_sub_f32_e32 v1, v1, v6
+; GFX900-GISEL-NEXT:    v_mul_f32_e32 v6, 0x3f317217, v2
+; GFX900-GISEL-NEXT:    v_fma_f32 v3, v2, v3, -v6
+; GFX900-GISEL-NEXT:    v_fma_f32 v3, v2, v4, v3
+; GFX900-GISEL-NEXT:    v_add_f32_e32 v3, v6, v3
 ; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e64 s[0:1], |v2|, v5
 ; GFX900-GISEL-NEXT:    v_cndmask_b32_e64 v2, v2, v3, s[0:1]
 ; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v3, 0, v7, vcc
@@ -1191,7 +1169,7 @@ define amdgpu_kernel void @s_log_v3f32(ptr addrspace(1) %out, <3 x float> %in) {
 ; GFX1100-GISEL-NEXT:    v_mul_f32_e32 v4, 0x3f317217, v1
 ; GFX1100-GISEL-NEXT:    v_cmp_gt_f32_e64 vcc_lo, 0x7f800000, |v0|
 ; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_3)
-; GFX1100-GISEL-NEXT:    v_fma_f32 v6, v0, 0x3f317217, -v3
+; GFX1100-GISEL-NEXT:    v_fma_f32 v6, 0x3f317217, v0, -v3
 ; GFX1100-GISEL-NEXT:    s_waitcnt_depctr 0xfff
 ; GFX1100-GISEL-NEXT:    v_mul_f32_e32 v5, 0x3f317217, v2
 ; GFX1100-GISEL-NEXT:    v_fma_f32 v7, 0x3f317217, v1, -v4
@@ -1443,23 +1421,22 @@ define amdgpu_kernel void @s_log_v4f32(ptr addrspace(1) %out, <4 x float> %in) {
 ; SI-GISEL:       ; %bb.0:
 ; SI-GISEL-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0xd
 ; SI-GISEL-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
-; SI-GISEL-NEXT:    s_mov_b32 s0, 0x800000
+; SI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x800000
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v3, 0x4f800000
-; SI-GISEL-NEXT:    s_mov_b32 s2, 0x3377d1cf
+; SI-GISEL-NEXT:    v_mov_b32_e32 v4, 0x3f317217
 ; SI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-GISEL-NEXT:    v_mov_b32_e32 v0, s8
-; SI-GISEL-NEXT:    v_cmp_gt_f32_e32 vcc, s0, v0
+; SI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, s8, v2
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v0, 1.0, v3, vcc
 ; SI-GISEL-NEXT:    v_mul_f32_e32 v0, s8, v0
 ; SI-GISEL-NEXT:    v_log_f32_e32 v0, v0
-; SI-GISEL-NEXT:    s_mov_b32 s0, 0x3f317217
-; SI-GISEL-NEXT:    s_mov_b32 s3, 0x7f800000
-; SI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x800000
+; SI-GISEL-NEXT:    v_mov_b32_e32 v5, 0x3377d1cf
+; SI-GISEL-NEXT:    v_mov_b32_e32 v6, 0x7f800000
+; SI-GISEL-NEXT:    s_mov_b32 s6, -1
 ; SI-GISEL-NEXT:    v_mul_f32_e32 v1, 0x3f317217, v0
-; SI-GISEL-NEXT:    v_fma_f32 v5, v0, s0, -v1
-; SI-GISEL-NEXT:    v_fma_f32 v5, v0, s2, v5
-; SI-GISEL-NEXT:    v_add_f32_e32 v1, v1, v5
-; SI-GISEL-NEXT:    v_cmp_lt_f32_e64 s[0:1], |v0|, s3
+; SI-GISEL-NEXT:    v_fma_f32 v7, v0, v4, -v1
+; SI-GISEL-NEXT:    v_fma_f32 v7, v0, v5, v7
+; SI-GISEL-NEXT:    v_add_f32_e32 v1, v1, v7
+; SI-GISEL-NEXT:    v_cmp_lt_f32_e64 s[0:1], |v0|, v6
 ; SI-GISEL-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s[0:1]
 ; SI-GISEL-NEXT:    v_cmp_lt_f32_e64 s[0:1], s9, v2
 ; SI-GISEL-NEXT:    v_cndmask_b32_e64 v1, 1.0, v3, s[0:1]
@@ -1467,17 +1444,16 @@ define amdgpu_kernel void @s_log_v4f32(ptr addrspace(1) %out, <4 x float> %in) {
 ; SI-GISEL-NEXT:    v_log_f32_e32 v1, v1
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v7, 0x41b17218
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v8, 0, v7, vcc
-; SI-GISEL-NEXT:    v_mov_b32_e32 v4, 0x3f317217
 ; SI-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v8
 ; SI-GISEL-NEXT:    v_mul_f32_e32 v8, 0x3f317217, v1
 ; SI-GISEL-NEXT:    v_fma_f32 v9, v1, v4, -v8
-; SI-GISEL-NEXT:    v_fma_f32 v9, v1, s2, v9
+; SI-GISEL-NEXT:    v_fma_f32 v9, v1, v5, v9
 ; SI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, s10, v2
 ; SI-GISEL-NEXT:    v_add_f32_e32 v8, v8, v9
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v9, 1.0, v3, vcc
 ; SI-GISEL-NEXT:    v_mul_f32_e32 v9, s10, v9
 ; SI-GISEL-NEXT:    v_log_f32_e32 v9, v9
-; SI-GISEL-NEXT:    v_cmp_lt_f32_e64 s[2:3], |v1|, s3
+; SI-GISEL-NEXT:    v_cmp_lt_f32_e64 s[2:3], |v1|, v6
 ; SI-GISEL-NEXT:    v_cndmask_b32_e64 v1, v1, v8, s[2:3]
 ; SI-GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, v7, s[0:1]
 ; SI-GISEL-NEXT:    v_cmp_lt_f32_e64 s[0:1], s11, v2
@@ -1485,25 +1461,22 @@ define amdgpu_kernel void @s_log_v4f32(ptr addrspace(1) %out, <4 x float> %in) {
 ; SI-GISEL-NEXT:    v_sub_f32_e32 v1, v1, v8
 ; SI-GISEL-NEXT:    v_mul_f32_e32 v8, 0x3f317217, v9
 ; SI-GISEL-NEXT:    v_mul_f32_e32 v2, s11, v2
-; SI-GISEL-NEXT:    v_mov_b32_e32 v6, 0x3377d1cf
 ; SI-GISEL-NEXT:    v_fma_f32 v10, v9, v4, -v8
 ; SI-GISEL-NEXT:    v_log_f32_e32 v3, v2
-; SI-GISEL-NEXT:    v_mov_b32_e32 v5, 0x7f800000
-; SI-GISEL-NEXT:    v_fma_f32 v10, v9, v6, v10
+; SI-GISEL-NEXT:    v_fma_f32 v10, v9, v5, v10
 ; SI-GISEL-NEXT:    v_add_f32_e32 v8, v8, v10
-; SI-GISEL-NEXT:    v_cmp_lt_f32_e64 s[2:3], |v9|, v5
+; SI-GISEL-NEXT:    v_cmp_lt_f32_e64 s[2:3], |v9|, v6
 ; SI-GISEL-NEXT:    v_cndmask_b32_e64 v2, v9, v8, s[2:3]
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v8, 0, v7, vcc
 ; SI-GISEL-NEXT:    v_sub_f32_e32 v2, v2, v8
 ; SI-GISEL-NEXT:    v_mul_f32_e32 v8, 0x3f317217, v3
 ; SI-GISEL-NEXT:    v_fma_f32 v4, v3, v4, -v8
-; SI-GISEL-NEXT:    v_fma_f32 v4, v3, v6, v4
+; SI-GISEL-NEXT:    v_fma_f32 v4, v3, v5, v4
 ; SI-GISEL-NEXT:    v_add_f32_e32 v4, v8, v4
-; SI-GISEL-NEXT:    v_cmp_lt_f32_e64 vcc, |v3|, v5
+; SI-GISEL-NEXT:    v_cmp_lt_f32_e64 vcc, |v3|, v6
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
 ; SI-GISEL-NEXT:    v_cndmask_b32_e64 v4, 0, v7, s[0:1]
 ; SI-GISEL-NEXT:    v_sub_f32_e32 v3, v3, v4
-; SI-GISEL-NEXT:    s_mov_b32 s6, -1
 ; SI-GISEL-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-GISEL-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
 ; SI-GISEL-NEXT:    s_endpgm
@@ -1594,29 +1567,27 @@ define amdgpu_kernel void @s_log_v4f32(ptr addrspace(1) %out, <4 x float> %in) {
 ; VI-GISEL:       ; %bb.0:
 ; VI-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x34
 ; VI-GISEL-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x24
-; VI-GISEL-NEXT:    s_mov_b32 s2, 0x800000
-; VI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x4f800000
-; VI-GISEL-NEXT:    s_mov_b32 s0, 0x7f800000
+; VI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x800000
+; VI-GISEL-NEXT:    v_mov_b32_e32 v3, 0x4f800000
 ; VI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT:    v_mov_b32_e32 v0, s4
-; VI-GISEL-NEXT:    v_cmp_gt_f32_e32 vcc, s2, v0
-; VI-GISEL-NEXT:    v_cndmask_b32_e32 v0, 1.0, v2, vcc
+; VI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, s4, v2
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v0, 1.0, v3, vcc
 ; VI-GISEL-NEXT:    v_mul_f32_e32 v0, s4, v0
 ; VI-GISEL-NEXT:    v_log_f32_e32 v0, v0
-; VI-GISEL-NEXT:    v_mov_b32_e32 v3, 0x800000
 ; VI-GISEL-NEXT:    v_and_b32_e32 v1, 0xfffff000, v0
 ; VI-GISEL-NEXT:    v_sub_f32_e32 v4, v0, v1
 ; VI-GISEL-NEXT:    v_mul_f32_e32 v5, 0x3805fdf4, v1
 ; VI-GISEL-NEXT:    v_mul_f32_e32 v6, 0x3805fdf4, v4
 ; VI-GISEL-NEXT:    v_mul_f32_e32 v4, 0x3f317000, v4
 ; VI-GISEL-NEXT:    v_add_f32_e32 v5, v5, v6
-; VI-GISEL-NEXT:    v_add_f32_e32 v4, v4, v5
 ; VI-GISEL-NEXT:    v_mul_f32_e32 v1, 0x3f317000, v1
+; VI-GISEL-NEXT:    v_add_f32_e32 v4, v4, v5
 ; VI-GISEL-NEXT:    v_add_f32_e32 v1, v1, v4
-; VI-GISEL-NEXT:    v_cmp_lt_f32_e64 s[0:1], |v0|, s0
+; VI-GISEL-NEXT:    v_mov_b32_e32 v4, 0x7f800000
+; VI-GISEL-NEXT:    v_cmp_lt_f32_e64 s[0:1], |v0|, v4
 ; VI-GISEL-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s[0:1]
-; VI-GISEL-NEXT:    v_cmp_lt_f32_e64 s[0:1], s5, v3
-; VI-GISEL-NEXT:    v_cndmask_b32_e64 v1, 1.0, v2, s[0:1]
+; VI-GISEL-NEXT:    v_cmp_lt_f32_e64 s[0:1], s5, v2
+; VI-GISEL-NEXT:    v_cndmask_b32_e64 v1, 1.0, v3, s[0:1]
 ; VI-GISEL-NEXT:    v_mul_f32_e32 v1, s5, v1
 ; VI-GISEL-NEXT:    v_log_f32_e32 v1, v1
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v5, 0x41b17218
@@ -1630,20 +1601,19 @@ define amdgpu_kernel void @s_log_v4f32(ptr addrspace(1) %out, <4 x float> %in) {
 ; VI-GISEL-NEXT:    v_mul_f32_e32 v7, 0x3f317000, v7
 ; VI-GISEL-NEXT:    v_add_f32_e32 v7, v7, v8
 ; VI-GISEL-NEXT:    v_mul_f32_e32 v6, 0x3f317000, v6
-; VI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, s6, v3
+; VI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, s6, v2
 ; VI-GISEL-NEXT:    v_add_f32_e32 v6, v6, v7
-; VI-GISEL-NEXT:    v_cndmask_b32_e32 v7, 1.0, v2, vcc
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v7, 1.0, v3, vcc
 ; VI-GISEL-NEXT:    v_mul_f32_e32 v7, s6, v7
 ; VI-GISEL-NEXT:    v_log_f32_e32 v7, v7
-; VI-GISEL-NEXT:    v_mov_b32_e32 v4, 0x7f800000
 ; VI-GISEL-NEXT:    v_cmp_lt_f32_e64 s[2:3], |v1|, v4
 ; VI-GISEL-NEXT:    v_cndmask_b32_e64 v1, v1, v6, s[2:3]
 ; VI-GISEL-NEXT:    v_cndmask_b32_e64 v6, 0, v5, s[0:1]
 ; VI-GISEL-NEXT:    v_sub_f32_e32 v1, v1, v6
 ; VI-GISEL-NEXT:    v_and_b32_e32 v6, 0xfffff000, v7
-; VI-GISEL-NEXT:    v_cmp_lt_f32_e64 s[0:1], s7, v3
+; VI-GISEL-NEXT:    v_cmp_lt_f32_e64 s[0:1], s7, v2
 ; VI-GISEL-NEXT:    v_sub_f32_e32 v8, v7, v6
-; VI-GISEL-NEXT:    v_cndmask_b32_e64 v2, 1.0, v2, s[0:1]
+; VI-GISEL-NEXT:    v_cndmask_b32_e64 v2, 1.0, v3, s[0:1]
 ; VI-GISEL-NEXT:    v_mul_f32_e32 v9, 0x3805fdf4, v8
 ; VI-GISEL-NEXT:    v_mul_f32_e32 v10, 0x3805fdf4, v6
 ; VI-GISEL-NEXT:    v_mul_f32_e32 v2, s7, v2
@@ -1742,23 +1712,21 @@ define amdgpu_kernel void @s_log_v4f32(ptr addrspace(1) %out, <4 x float> %in) {
 ; GFX900-GISEL:       ; %bb.0:
 ; GFX900-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x34
 ; GFX900-GISEL-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x24
-; GFX900-GISEL-NEXT:    s_mov_b32 s0, 0x800000
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v2, 0x800000
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v3, 0x4f800000
-; GFX900-GISEL-NEXT:    s_mov_b32 s2, 0x3377d1cf
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v4, 0x3f317217
 ; GFX900-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX900-GISEL-NEXT:    v_mov_b32_e32 v0, s4
-; GFX900-GISEL-NEXT:    v_cmp_gt_f32_e32 vcc, s0, v0
+; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, s4, v2
 ; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v0, 1.0, v3, vcc
 ; GFX900-GISEL-NEXT:    v_mul_f32_e32 v0, s4, v0
 ; GFX900-GISEL-NEXT:    v_log_f32_e32 v0, v0
-; GFX900-GISEL-NEXT:    s_mov_b32 s0, 0x3f317217
-; GFX900-GISEL-NEXT:    s_mov_b32 s3, 0x7f800000
-; GFX900-GISEL-NEXT:    v_mov_b32_e32 v2, 0x800000
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v5, 0x3377d1cf
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v6, 0x7f800000
 ; GFX900-GISEL-NEXT:    v_mul_f32_e32 v1, 0x3f317217, v0
-; GFX900-GISEL-NEXT:    v_fma_f32 v5, v0, s0, -v1
-; GFX900-GISEL-NEXT:    v_fma_f32 v5, v0, s2, v5
-; GFX900-GISEL-NEXT:    v_add_f32_e32 v1, v1, v5
-; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e64 s[0:1], |v0|, s3
+; GFX900-GISEL-NEXT:    v_fma_f32 v7, v0, v4, -v1
+; GFX900-GISEL-NEXT:    v_fma_f32 v7, v0, v5, v7
+; GFX900-GISEL-NEXT:    v_add_f32_e32 v1, v1, v7
+; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e64 s[0:1], |v0|, v6
 ; GFX900-GISEL-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s[0:1]
 ; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e64 s[0:1], s5, v2
 ; GFX900-GISEL-NEXT:    v_cndmask_b32_e64 v1, 1.0, v3, s[0:1]
@@ -1766,17 +1734,16 @@ define amdgpu_kernel void @s_log_v4f32(ptr addrspace(1) %out, <4 x float> %in) {
 ; GFX900-GISEL-NEXT:    v_log_f32_e32 v1, v1
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v7, 0x41b17218
 ; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v8, 0, v7, vcc
-; GFX900-GISEL-NEXT:    v_mov_b32_e32 v4, 0x3f317217
 ; GFX900-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v8
 ; GFX900-GISEL-NEXT:    v_mul_f32_e32 v8, 0x3f317217, v1
 ; GFX900-GISEL-NEXT:    v_fma_f32 v9, v1, v4, -v8
-; GFX900-GISEL-NEXT:    v_fma_f32 v9, v1, s2, v9
+; GFX900-GISEL-NEXT:    v_fma_f32 v9, v1, v5, v9
 ; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, s6, v2
 ; GFX900-GISEL-NEXT:    v_add_f32_e32 v8, v8, v9
 ; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v9, 1.0, v3, vcc
 ; GFX900-GISEL-NEXT:    v_mul_f32_e32 v9, s6, v9
 ; GFX900-GISEL-NEXT:    v_log_f32_e32 v9, v9
-; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e64 s[2:3], |v1|, s3
+; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e64 s[2:3], |v1|, v6
 ; GFX900-GISEL-NEXT:    v_cndmask_b32_e64 v1, v1, v8, s[2:3]
 ; GFX900-GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, v7, s[0:1]
 ; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e64 s[0:1], s7, v2
@@ -1784,21 +1751,19 @@ define amdgpu_kernel void @s_log_v4f32(ptr addrspace(1) %out, <4 x float> %in) {
 ; GFX900-GISEL-NEXT:    v_sub_f32_e32 v1, v1, v8
 ; GFX900-GISEL-NEXT:    v_mul_f32_e32 v8, 0x3f317217, v9
 ; GFX900-GISEL-NEXT:    v_mul_f32_e32 v2, s7, v2
-; GFX900-GISEL-NEXT:    v_mov_b32_e32 v6, 0x3377d1cf
 ; GFX900-GISEL-NEXT:    v_fma_f32 v10, v9, v4, -v8
 ; GFX900-GISEL-NEXT:    v_log_f32_e32 v3, v2
-; GFX900-GISEL-NEXT:    v_mov_b32_e32 v5, 0x7f800000
-; GFX900-GISEL-NEXT:    v_fma_f32 v10, v9, v6, v10
+; GFX900-GISEL-NEXT:    v_fma_f32 v10, v9, v5, v10
 ; GFX900-GISEL-NEXT:    v_add_f32_e32 v8, v8, v10
-; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e64 s[2:3], |v9|, v5
+; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e64 s[2:3], |v9|, v6
 ; GFX900-GISEL-NEXT:    v_cndmask_b32_e64 v2, v9, v8, s[2:3]
 ; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v8, 0, v7, vcc
 ; GFX900-GISEL-NEXT:    v_sub_f32_e32 v2, v2, v8
 ; GFX900-GISEL-NEXT:    v_mul_f32_e32 v8, 0x3f317217, v3
 ; GFX900-GISEL-NEXT:    v_fma_f32 v4, v3, v4, -v8
-; GFX900-GISEL-NEXT:    v_fma_f32 v4, v3, v6, v4
+; GFX900-GISEL-NEXT:    v_fma_f32 v4, v3, v5, v4
 ; GFX900-GISEL-NEXT:    v_add_f32_e32 v4, v8, v4
-; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e64 vcc, |v3|, v5
+; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e64 vcc, |v3|, v6
 ; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
 ; GFX900-GISEL-NEXT:    v_cndmask_b32_e64 v4, 0, v7, s[0:1]
 ; GFX900-GISEL-NEXT:    v_sub_f32_e32 v3, v3, v4
@@ -1900,7 +1865,7 @@ define amdgpu_kernel void @s_log_v4f32(ptr addrspace(1) %out, <4 x float> %in) {
 ; GFX1100-GISEL-NEXT:    s_waitcnt_depctr 0xfff
 ; GFX1100-GISEL-NEXT:    v_dual_mul_f32 v7, 0x3f317217, v2 :: v_dual_mul_f32 v8, 0x3f317217, v3
 ; GFX1100-GISEL-NEXT:    v_cmp_gt_f32_e64 vcc_lo, 0x7f800000, |v0|
-; GFX1100-GISEL-NEXT:    v_fma_f32 v10, v0, 0x3f317217, -v5
+; GFX1100-GISEL-NEXT:    v_fma_f32 v10, 0x3f317217, v0, -v5
 ; GFX1100-GISEL-NEXT:    v_fma_f32 v11, 0x3f317217, v1, -v6
 ; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
 ; GFX1100-GISEL-NEXT:    v_fma_f32 v12, 0x3f317217, v2, -v7
@@ -2144,12 +2109,12 @@ define float @v_log_f32(float %in) {
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 1.0, v2, vcc
 ; SI-GISEL-NEXT:    v_mul_f32_e32 v0, v0, v1
 ; SI-GISEL-NEXT:    v_log_f32_e32 v0, v0
-; SI-GISEL-NEXT:    s_mov_b32 s4, 0x3f317217
+; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x3f317217
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v3, 0x3377d1cf
-; SI-GISEL-NEXT:    v_mul_f32_e32 v1, 0x3f317217, v0
-; SI-GISEL-NEXT:    v_fma_f32 v2, v0, s4, -v1
-; SI-GISEL-NEXT:    v_fma_f32 v2, v0, v3, v2
-; SI-GISEL-NEXT:    v_add_f32_e32 v1, v1, v2
+; SI-GISEL-NEXT:    v_mul_f32_e32 v2, 0x3f317217, v0
+; SI-GISEL-NEXT:    v_fma_f32 v1, v0, v1, -v2
+; SI-GISEL-NEXT:    v_fma_f32 v1, v0, v3, v1
+; SI-GISEL-NEXT:    v_add_f32_e32 v1, v2, v1
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x7f800000
 ; SI-GISEL-NEXT:    v_cmp_lt_f32_e64 s[4:5], |v0|, v2
 ; SI-GISEL-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s[4:5]
@@ -2242,12 +2207,12 @@ define float @v_log_f32(float %in) {
 ; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v1, 1.0, v2, vcc
 ; GFX900-GISEL-NEXT:    v_mul_f32_e32 v0, v0, v1
 ; GFX900-GISEL-NEXT:    v_log_f32_e32 v0, v0
-; GFX900-GISEL-NEXT:    s_mov_b32 s4, 0x3f317217
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x3f317217
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v3, 0x3377d1cf
-; GFX900-GISEL-NEXT:    v_mul_f32_e32 v1, 0x3f317217, v0
-; GFX900-GISEL-NEXT:    v_fma_f32 v2, v0, s4, -v1
-; GFX900-GISEL-NEXT:    v_fma_f32 v2, v0, v3, v2
-; GFX900-GISEL-NEXT:    v_add_f32_e32 v1, v1, v2
+; GFX900-GISEL-NEXT:    v_mul_f32_e32 v2, 0x3f317217, v0
+; GFX900-GISEL-NEXT:    v_fma_f32 v1, v0, v1, -v2
+; GFX900-GISEL-NEXT:    v_fma_f32 v1, v0, v3, v1
+; GFX900-GISEL-NEXT:    v_add_f32_e32 v1, v2, v1
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v2, 0x7f800000
 ; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e64 s[4:5], |v0|, v2
 ; GFX900-GISEL-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s[4:5]
@@ -2290,7 +2255,7 @@ define float @v_log_f32(float %in) {
 ; GFX1100-GISEL-NEXT:    v_mul_f32_e32 v1, 0x3f317217, v0
 ; GFX1100-GISEL-NEXT:    v_cmp_gt_f32_e64 s0, 0x7f800000, |v0|
 ; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-GISEL-NEXT:    v_fma_f32 v2, v0, 0x3f317217, -v1
+; GFX1100-GISEL-NEXT:    v_fma_f32 v2, 0x3f317217, v0, -v1
 ; GFX1100-GISEL-NEXT:    v_fmac_f32_e32 v2, 0x3377d1cf, v0
 ; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1100-GISEL-NEXT:    v_add_f32_e32 v1, v1, v2
@@ -2346,12 +2311,12 @@ define float @v_log_fabs_f32(float %in) {
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 1.0, v2, vcc
 ; SI-GISEL-NEXT:    v_mul_f32_e64 v0, |v0|, v1
 ; SI-GISEL-NEXT:    v_log_f32_e32 v0, v0
-; SI-GISEL-NEXT:    s_mov_b32 s4, 0x3f317217
+; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x3f317217
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v3, 0x3377d1cf
-; SI-GISEL-NEXT:    v_mul_f32_e32 v1, 0x3f317217, v0
-; SI-GISEL-NEXT:    v_fma_f32 v2, v0, s4, -v1
-; SI-GISEL-NEXT:    v_fma_f32 v2, v0, v3, v2
-; SI-GISEL-NEXT:    v_add_f32_e32 v1, v1, v2
+; SI-GISEL-NEXT:    v_mul_f32_e32 v2, 0x3f317217, v0
+; SI-GISEL-NEXT:    v_fma_f32 v1, v0, v1, -v2
+; SI-GISEL-NEXT:    v_fma_f32 v1, v0, v3, v1
+; SI-GISEL-NEXT:    v_add_f32_e32 v1, v2, v1
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x7f800000
 ; SI-GISEL-NEXT:    v_cmp_lt_f32_e64 s[4:5], |v0|, v2
 ; SI-GISEL-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s[4:5]
@@ -2444,12 +2409,12 @@ define float @v_log_fabs_f32(float %in) {
 ; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v1, 1.0, v2, vcc
 ; GFX900-GISEL-NEXT:    v_mul_f32_e64 v0, |v0|, v1
 ; GFX900-GISEL-NEXT:    v_log_f32_e32 v0, v0
-; GFX900-GISEL-NEXT:    s_mov_b32 s4, 0x3f317217
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x3f317217
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v3, 0x3377d1cf
-; GFX900-GISEL-NEXT:    v_mul_f32_e32 v1, 0x3f317217, v0
-; GFX900-GISEL-NEXT:    v_fma_f32 v2, v0, s4, -v1
-; GFX900-GISEL-NEXT:    v_fma_f32 v2, v0, v3, v2
-; GFX900-GISEL-NEXT:    v_add_f32_e32 v1, v1, v2
+; GFX900-GISEL-NEXT:    v_mul_f32_e32 v2, 0x3f317217, v0
+; GFX900-GISEL-NEXT:    v_fma_f32 v1, v0, v1, -v2
+; GFX900-GISEL-NEXT:    v_fma_f32 v1, v0, v3, v1
+; GFX900-GISEL-NEXT:    v_add_f32_e32 v1, v2, v1
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v2, 0x7f800000
 ; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e64 s[4:5], |v0|, v2
 ; GFX900-GISEL-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s[4:5]
@@ -2492,7 +2457,7 @@ define float @v_log_fabs_f32(float %in) {
 ; GFX1100-GISEL-NEXT:    s_waitcnt_depctr 0xfff
 ; GFX1100-GISEL-NEXT:    v_mul_f32_e32 v1, 0x3f317217, v0
 ; GFX1100-GISEL-NEXT:    v_cmp_gt_f32_e64 vcc_lo, 0x7f800000, |v0|
-; GFX1100-GISEL-NEXT:    v_fma_f32 v2, v0, 0x3f317217, -v1
+; GFX1100-GISEL-NEXT:    v_fma_f32 v2, 0x3f317217, v0, -v1
 ; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1100-GISEL-NEXT:    v_fmac_f32_e32 v2, 0x3377d1cf, v0
 ; GFX1100-GISEL-NEXT:    v_add_f32_e32 v1, v1, v2
@@ -2549,12 +2514,12 @@ define float @v_log_fneg_fabs_f32(float %in) {
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 1.0, v2, vcc
 ; SI-GISEL-NEXT:    v_mul_f32_e64 v0, -|v0|, v1
 ; SI-GISEL-NEXT:    v_log_f32_e32 v0, v0
-; SI-GISEL-NEXT:    s_mov_b32 s4, 0x3f317217
+; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x3f317217
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v3, 0x3377d1cf
-; SI-GISEL-NEXT:    v_mul_f32_e32 v1, 0x3f317217, v0
-; SI-GISEL-NEXT:    v_fma_f32 v2, v0, s4, -v1
-; SI-GISEL-NEXT:    v_fma_f32 v2, v0, v3, v2
-; SI-GISEL-NEXT:    v_add_f32_e32 v1, v1, v2
+; SI-GISEL-NEXT:    v_mul_f32_e32 v2, 0x3f317217, v0
+; SI-GISEL-NEXT:    v_fma_f32 v1, v0, v1, -v2
+; SI-GISEL-NEXT:    v_fma_f32 v1, v0, v3, v1
+; SI-GISEL-NEXT:    v_add_f32_e32 v1, v2, v1
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x7f800000
 ; SI-GISEL-NEXT:    v_cmp_lt_f32_e64 s[4:5], |v0|, v2
 ; SI-GISEL-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s[4:5]
@@ -2647,12 +2612,12 @@ define float @v_log_fneg_fabs_f32(float %in) {
 ; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v1, 1.0, v2, vcc
 ; GFX900-GISEL-NEXT:    v_mul_f32_e64 v0, -|v0|, v1
 ; GFX900-GISEL-NEXT:    v_log_f32_e32 v0, v0
-; GFX900-GISEL-NEXT:    s_mov_b32 s4, 0x3f317217
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x3f317217
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v3, 0x3377d1cf
-; GFX900-GISEL-NEXT:    v_mul_f32_e32 v1, 0x3f317217, v0
-; GFX900-GISEL-NEXT:    v_fma_f32 v2, v0, s4, -v1
-; GFX900-GISEL-NEXT:    v_fma_f32 v2, v0, v3, v2
-; GFX900-GISEL-NEXT:    v_add_f32_e32 v1, v1, v2
+; GFX900-GISEL-NEXT:    v_mul_f32_e32 v2, 0x3f317217, v0
+; GFX900-GISEL-NEXT:    v_fma_f32 v1, v0, v1, -v2
+; GFX900-GISEL-NEXT:    v_fma_f32 v1, v0, v3, v1
+; GFX900-GISEL-NEXT:    v_add_f32_e32 v1, v2, v1
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v2, 0x7f800000
 ; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e64 s[4:5], |v0|, v2
 ; GFX900-GISEL-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s[4:5]
@@ -2695,7 +2660,7 @@ define float @v_log_fneg_fabs_f32(float %in) {
 ; GFX1100-GISEL-NEXT:    s_waitcnt_depctr 0xfff
 ; GFX1100-GISEL-NEXT:    v_mul_f32_e32 v1, 0x3f317217, v0
 ; GFX1100-GISEL-NEXT:    v_cmp_gt_f32_e64 vcc_lo, 0x7f800000, |v0|
-; GFX1100-GISEL-NEXT:    v_fma_f32 v2, v0, 0x3f317217, -v1
+; GFX1100-GISEL-NEXT:    v_fma_f32 v2, 0x3f317217, v0, -v1
 ; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1100-GISEL-NEXT:    v_fmac_f32_e32 v2, 0x3377d1cf, v0
 ; GFX1100-GISEL-NEXT:    v_add_f32_e32 v1, v1, v2
@@ -2753,12 +2718,12 @@ define float @v_log_fneg_f32(float %in) {
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 1.0, v2, vcc
 ; SI-GISEL-NEXT:    v_mul_f32_e64 v0, -v0, v1
 ; SI-GISEL-NEXT:    v_log_f32_e32 v0, v0
-; SI-GISEL-NEXT:    s_mov_b32 s4, 0x3f317217
+; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x3f317217
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v3, 0x3377d1cf
-; SI-GISEL-NEXT:    v_mul_f32_e32 v1, 0x3f317217, v0
-; SI-GISEL-NEXT:    v_fma_f32 v2, v0, s4, -v1
-; SI-GISEL-NEXT:    v_fma_f32 v2, v0, v3, v2
-; SI-GISEL-NEXT:    v_add_f32_e32 v1, v1, v2
+; SI-GISEL-NEXT:    v_mul_f32_e32 v2, 0x3f317217, v0
+; SI-GISEL-NEXT:    v_fma_f32 v1, v0, v1, -v2
+; SI-GISEL-NEXT:    v_fma_f32 v1, v0, v3, v1
+; SI-GISEL-NEXT:    v_add_f32_e32 v1, v2, v1
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x7f800000
 ; SI-GISEL-NEXT:    v_cmp_lt_f32_e64 s[4:5], |v0|, v2
 ; SI-GISEL-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s[4:5]
@@ -2851,12 +2816,12 @@ define float @v_log_fneg_f32(float %in) {
 ; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v1, 1.0, v2, vcc
 ; GFX900-GISEL-NEXT:    v_mul_f32_e64 v0, -v0, v1
 ; GFX900-GISEL-NEXT:    v_log_f32_e32 v0, v0
-; GFX900-GISEL-NEXT:    s_mov_b32 s4, 0x3f317217
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x3f317217
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v3, 0x3377d1cf
-; GFX900-GISEL-NEXT:    v_mul_f32_e32 v1, 0x3f317217, v0
-; GFX900-GISEL-NEXT:    v_fma_f32 v2, v0, s4, -v1
-; GFX900-GISEL-NEXT:    v_fma_f32 v2, v0, v3, v2
-; GFX900-GISEL-NEXT:    v_add_f32_e32 v1, v1, v2
+; GFX900-GISEL-NEXT:    v_mul_f32_e32 v2, 0x3f317217, v0
+; GFX900-GISEL-NEXT:    v_fma_f32 v1, v0, v1, -v2
+; GFX900-GISEL-NEXT:    v_fma_f32 v1, v0, v3, v1
+; GFX900-GISEL-NEXT:    v_add_f32_e32 v1, v2, v1
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v2, 0x7f800000
 ; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e64 s[4:5], |v0|, v2
 ; GFX900-GISEL-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s[4:5]
@@ -2899,7 +2864,7 @@ define float @v_log_fneg_f32(float %in) {
 ; GFX1100-GISEL-NEXT:    s_waitcnt_depctr 0xfff
 ; GFX1100-GISEL-NEXT:    v_mul_f32_e32 v1, 0x3f317217, v0
 ; GFX1100-GISEL-NEXT:    v_cmp_gt_f32_e64 vcc_lo, 0x7f800000, |v0|
-; GFX1100-GISEL-NEXT:    v_fma_f32 v2, v0, 0x3f317217, -v1
+; GFX1100-GISEL-NEXT:    v_fma_f32 v2, 0x3f317217, v0, -v1
 ; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1100-GISEL-NEXT:    v_fmac_f32_e32 v2, 0x3377d1cf, v0
 ; GFX1100-GISEL-NEXT:    v_add_f32_e32 v1, v1, v2
@@ -3310,12 +3275,12 @@ define float @v_log_f32_ninf(float %in) {
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 1.0, v2, vcc
 ; SI-GISEL-NEXT:    v_mul_f32_e32 v0, v0, v1
 ; SI-GISEL-NEXT:    v_log_f32_e32 v0, v0
-; SI-GISEL-NEXT:    s_mov_b32 s4, 0x3f317217
+; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x3f317217
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v3, 0x3377d1cf
-; SI-GISEL-NEXT:    v_mul_f32_e32 v1, 0x3f317217, v0
-; SI-GISEL-NEXT:    v_fma_f32 v2, v0, s4, -v1
-; SI-GISEL-NEXT:    v_fma_f32 v2, v0, v3, v2
-; SI-GISEL-NEXT:    v_add_f32_e32 v1, v1, v2
+; SI-GISEL-NEXT:    v_mul_f32_e32 v2, 0x3f317217, v0
+; SI-GISEL-NEXT:    v_fma_f32 v1, v0, v1, -v2
+; SI-GISEL-NEXT:    v_fma_f32 v1, v0, v3, v1
+; SI-GISEL-NEXT:    v_add_f32_e32 v1, v2, v1
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x7f800000
 ; SI-GISEL-NEXT:    v_cmp_lt_f32_e64 s[4:5], |v0|, v2
 ; SI-GISEL-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s[4:5]
@@ -3408,12 +3373,12 @@ define float @v_log_f32_ninf(float %in) {
 ; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v1, 1.0, v2, vcc
 ; GFX900-GISEL-NEXT:    v_mul_f32_e32 v0, v0, v1
 ; GFX900-GISEL-NEXT:    v_log_f32_e32 v0, v0
-; GFX900-GISEL-NEXT:    s_mov_b32 s4, 0x3f317217
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x3f317217
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v3, 0x3377d1cf
-; GFX900-GISEL-NEXT:    v_mul_f32_e32 v1, 0x3f317217, v0
-; GFX900-GISEL-NEXT:    v_fma_f32 v2, v0, s4, -v1
-; GFX900-GISEL-NEXT:    v_fma_f32 v2, v0, v3, v2
-; GFX900-GISEL-NEXT:    v_add_f32_e32 v1, v1, v2
+; GFX900-GISEL-NEXT:    v_mul_f32_e32 v2, 0x3f317217, v0
+; GFX900-GISEL-NEXT:    v_fma_f32 v1, v0, v1, -v2
+; GFX900-GISEL-NEXT:    v_fma_f32 v1, v0, v3, v1
+; GFX900-GISEL-NEXT:    v_add_f32_e32 v1, v2, v1
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v2, 0x7f800000
 ; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e64 s[4:5], |v0|, v2
 ; GFX900-GISEL-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s[4:5]
@@ -3456,7 +3421,7 @@ define float @v_log_f32_ninf(float %in) {
 ; GFX1100-GISEL-NEXT:    v_mul_f32_e32 v1, 0x3f317217, v0
 ; GFX1100-GISEL-NEXT:    v_cmp_gt_f32_e64 s0, 0x7f800000, |v0|
 ; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-GISEL-NEXT:    v_fma_f32 v2, v0, 0x3f317217, -v1
+; GFX1100-GISEL-NEXT:    v_fma_f32 v2, 0x3f317217, v0, -v1
 ; GFX1100-GISEL-NEXT:    v_fmac_f32_e32 v2, 0x3377d1cf, v0
 ; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1100-GISEL-NEXT:    v_add_f32_e32 v1, v1, v2
@@ -3885,14 +3850,14 @@ define float @v_log_f32_daz(float %in) #0 {
 ; SI-GISEL:       ; %bb.0:
 ; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    v_log_f32_e32 v0, v0
-; SI-GISEL-NEXT:    s_mov_b32 s4, 0x3f317217
-; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x3377d1cf
-; SI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x7f800000
-; SI-GISEL-NEXT:    v_mul_f32_e32 v3, 0x3f317217, v0
-; SI-GISEL-NEXT:    v_fma_f32 v4, v0, s4, -v3
-; SI-GISEL-NEXT:    v_fma_f32 v1, v0, v1, v4
-; SI-GISEL-NEXT:    v_add_f32_e32 v1, v3, v1
-; SI-GISEL-NEXT:    v_cmp_lt_f32_e64 vcc, |v0|, v2
+; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x3f317217
+; SI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x3377d1cf
+; SI-GISEL-NEXT:    v_mov_b32_e32 v3, 0x7f800000
+; SI-GISEL-NEXT:    v_mul_f32_e32 v4, 0x3f317217, v0
+; SI-GISEL-NEXT:    v_fma_f32 v1, v0, v1, -v4
+; SI-GISEL-NEXT:    v_fma_f32 v1, v0, v2, v1
+; SI-GISEL-NEXT:    v_add_f32_e32 v1, v4, v1
+; SI-GISEL-NEXT:    v_cmp_lt_f32_e64 vcc, |v0|, v3
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
 ; SI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -3951,14 +3916,14 @@ define float @v_log_f32_daz(float %in) #0 {
 ; GFX900-GISEL:       ; %bb.0:
 ; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-GISEL-NEXT:    v_log_f32_e32 v0, v0
-; GFX900-GISEL-NEXT:    s_mov_b32 s4, 0x3f317217
-; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x3377d1cf
-; GFX900-GISEL-NEXT:    v_mov_b32_e32 v2, 0x7f800000
-; GFX900-GISEL-NEXT:    v_mul_f32_e32 v3, 0x3f317217, v0
-; GFX900-GISEL-NEXT:    v_fma_f32 v4, v0, s4, -v3
-; GFX900-GISEL-NEXT:    v_fma_f32 v1, v0, v1, v4
-; GFX900-GISEL-NEXT:    v_add_f32_e32 v1, v3, v1
-; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e64 vcc, |v0|, v2
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x3f317217
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v2, 0x3377d1cf
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v3, 0x7f800000
+; GFX900-GISEL-NEXT:    v_mul_f32_e32 v4, 0x3f317217, v0
+; GFX900-GISEL-NEXT:    v_fma_f32 v1, v0, v1, -v4
+; GFX900-GISEL-NEXT:    v_fma_f32 v1, v0, v2, v1
+; GFX900-GISEL-NEXT:    v_add_f32_e32 v1, v4, v1
+; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e64 vcc, |v0|, v3
 ; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
 ; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -3985,7 +3950,7 @@ define float @v_log_f32_daz(float %in) #0 {
 ; GFX1100-GISEL-NEXT:    v_mul_f32_e32 v1, 0x3f317217, v0
 ; GFX1100-GISEL-NEXT:    v_cmp_gt_f32_e64 vcc_lo, 0x7f800000, |v0|
 ; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-GISEL-NEXT:    v_fma_f32 v2, v0, 0x3f317217, -v1
+; GFX1100-GISEL-NEXT:    v_fma_f32 v2, 0x3f317217, v0, -v1
 ; GFX1100-GISEL-NEXT:    v_fmac_f32_e32 v2, 0x3377d1cf, v0
 ; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1100-GISEL-NEXT:    v_add_f32_e32 v1, v1, v2
@@ -4038,12 +4003,12 @@ define float @v_log_f32_nnan(float %in) {
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 1.0, v2, vcc
 ; SI-GISEL-NEXT:    v_mul_f32_e32 v0, v0, v1
 ; SI-GISEL-NEXT:    v_log_f32_e32 v0, v0
-; SI-GISEL-NEXT:    s_mov_b32 s4, 0x3f317217
+; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x3f317217
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v3, 0x3377d1cf
-; SI-GISEL-NEXT:    v_mul_f32_e32 v1, 0x3f317217, v0
-; SI-GISEL-NEXT:    v_fma_f32 v2, v0, s4, -v1
-; SI-GISEL-NEXT:    v_fma_f32 v2, v0, v3, v2
-; SI-GISEL-NEXT:    v_add_f32_e32 v1, v1, v2
+; SI-GISEL-NEXT:    v_mul_f32_e32 v2, 0x3f317217, v0
+; SI-GISEL-NEXT:    v_fma_f32 v1, v0, v1, -v2
+; SI-GISEL-NEXT:    v_fma_f32 v1, v0, v3, v1
+; SI-GISEL-NEXT:    v_add_f32_e32 v1, v2, v1
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x7f800000
 ; SI-GISEL-NEXT:    v_cmp_lt_f32_e64 s[4:5], |v0|, v2
 ; SI-GISEL-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s[4:5]
@@ -4136,12 +4101,12 @@ define float @v_log_f32_nnan(float %in) {
 ; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v1, 1.0, v2, vcc
 ; GFX900-GISEL-NEXT:    v_mul_f32_e32 v0, v0, v1
 ; GFX900-GISEL-NEXT:    v_log_f32_e32 v0, v0
-; GFX900-GISEL-NEXT:    s_mov_b32 s4, 0x3f317217
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x3f317217
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v3, 0x3377d1cf
-; GFX900-GISEL-NEXT:    v_mul_f32_e32 v1, 0x3f317217, v0
-; GFX900-GISEL-NEXT:    v_fma_f32 v2, v0, s4, -v1
-; GFX900-GISEL-NEXT:    v_fma_f32 v2, v0, v3, v2
-; GFX900-GISEL-NEXT:    v_add_f32_e32 v1, v1, v2
+; GFX900-GISEL-NEXT:    v_mul_f32_e32 v2, 0x3f317217, v0
+; GFX900-GISEL-NEXT:    v_fma_f32 v1, v0, v1, -v2
+; GFX900-GISEL-NEXT:    v_fma_f32 v1, v0, v3, v1
+; GFX900-GISEL-NEXT:    v_add_f32_e32 v1, v2, v1
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v2, 0x7f800000
 ; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e64 s[4:5], |v0|, v2
 ; GFX900-GISEL-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s[4:5]
@@ -4184,7 +4149,7 @@ define float @v_log_f32_nnan(float %in) {
 ; GFX1100-GISEL-NEXT:    v_mul_f32_e32 v1, 0x3f317217, v0
 ; GFX1100-GISEL-NEXT:    v_cmp_gt_f32_e64 s0, 0x7f800000, |v0|
 ; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-GISEL-NEXT:    v_fma_f32 v2, v0, 0x3f317217, -v1
+; GFX1100-GISEL-NEXT:    v_fma_f32 v2, 0x3f317217, v0, -v1
 ; GFX1100-GISEL-NEXT:    v_fmac_f32_e32 v2, 0x3377d1cf, v0
 ; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1100-GISEL-NEXT:    v_add_f32_e32 v1, v1, v2
@@ -4227,14 +4192,14 @@ define float @v_log_f32_nnan_daz(float %in) #0 {
 ; SI-GISEL:       ; %bb.0:
 ; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    v_log_f32_e32 v0, v0
-; SI-GISEL-NEXT:    s_mov_b32 s4, 0x3f317217
-; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x3377d1cf
-; SI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x7f800000
-; SI-GISEL-NEXT:    v_mul_f32_e32 v3, 0x3f317217, v0
-; SI-GISEL-NEXT:    v_fma_f32 v4, v0, s4, -v3
-; SI-GISEL-NEXT:    v_fma_f32 v1, v0, v1, v4
-; SI-GISEL-NEXT:    v_add_f32_e32 v1, v3, v1
-; SI-GISEL-NEXT:    v_cmp_lt_f32_e64 vcc, |v0|, v2
+; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x3f317217
+; SI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x3377d1cf
+; SI-GISEL-NEXT:    v_mov_b32_e32 v3, 0x7f800000
+; SI-GISEL-NEXT:    v_mul_f32_e32 v4, 0x3f317217, v0
+; SI-GISEL-NEXT:    v_fma_f32 v1, v0, v1, -v4
+; SI-GISEL-NEXT:    v_fma_f32 v1, v0, v2, v1
+; SI-GISEL-NEXT:    v_add_f32_e32 v1, v4, v1
+; SI-GISEL-NEXT:    v_cmp_lt_f32_e64 vcc, |v0|, v3
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
 ; SI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -4293,14 +4258,14 @@ define float @v_log_f32_nnan_daz(float %in) #0 {
 ; GFX900-GISEL:       ; %bb.0:
 ; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-GISEL-NEXT:    v_log_f32_e32 v0, v0
-; GFX900-GISEL-NEXT:    s_mov_b32 s4, 0x3f317217
-; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x3377d1cf
-; GFX900-GISEL-NEXT:    v_mov_b32_e32 v2, 0x7f800000
-; GFX900-GISEL-NEXT:    v_mul_f32_e32 v3, 0x3f317217, v0
-; GFX900-GISEL-NEXT:    v_fma_f32 v4, v0, s4, -v3
-; GFX900-GISEL-NEXT:    v_fma_f32 v1, v0, v1, v4
-; GFX900-GISEL-NEXT:    v_add_f32_e32 v1, v3, v1
-; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e64 vcc, |v0|, v2
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x3f317217
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v2, 0x3377d1cf
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v3, 0x7f800000
+; GFX900-GISEL-NEXT:    v_mul_f32_e32 v4, 0x3f317217, v0
+; GFX900-GISEL-NEXT:    v_fma_f32 v1, v0, v1, -v4
+; GFX900-GISEL-NEXT:    v_fma_f32 v1, v0, v2, v1
+; GFX900-GISEL-NEXT:    v_add_f32_e32 v1, v4, v1
+; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e64 vcc, |v0|, v3
 ; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
 ; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -4327,7 +4292,7 @@ define float @v_log_f32_nnan_daz(float %in) #0 {
 ; GFX1100-GISEL-NEXT:    v_mul_f32_e32 v1, 0x3f317217, v0
 ; GFX1100-GISEL-NEXT:    v_cmp_gt_f32_e64 vcc_lo, 0x7f800000, |v0|
 ; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-GISEL-NEXT:    v_fma_f32 v2, v0, 0x3f317217, -v1
+; GFX1100-GISEL-NEXT:    v_fma_f32 v2, 0x3f317217, v0, -v1
 ; GFX1100-GISEL-NEXT:    v_fmac_f32_e32 v2, 0x3377d1cf, v0
 ; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1100-GISEL-NEXT:    v_add_f32_e32 v1, v1, v2
@@ -4380,12 +4345,12 @@ define float @v_log_f32_nnan_dynamic(float %in) #1 {
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 1.0, v2, vcc
 ; SI-GISEL-NEXT:    v_mul_f32_e32 v0, v0, v1
 ; SI-GISEL-NEXT:    v_log_f32_e32 v0, v0
-; SI-GISEL-NEXT:    s_mov_b32 s4, 0x3f317217
+; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x3f317217
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v3, 0x3377d1cf
-; SI-GISEL-NEXT:    v_mul_f32_e32 v1, 0x3f317217, v0
-; SI-GISEL-NEXT:    v_fma_f32 v2, v0, s4, -v1
-; SI-GISEL-NEXT:    v_fma_f32 v2, v0, v3, v2
-; SI-GISEL-NEXT:    v_add_f32_e32 v1, v1, v2
+; SI-GISEL-NEXT:    v_mul_f32_e32 v2, 0x3f317217, v0
+; SI-GISEL-NEXT:    v_fma_f32 v1, v0, v1, -v2
+; SI-GISEL-NEXT:    v_fma_f32 v1, v0, v3, v1
+; SI-GISEL-NEXT:    v_add_f32_e32 v1, v2, v1
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x7f800000
 ; SI-GISEL-NEXT:    v_cmp_lt_f32_e64 s[4:5], |v0|, v2
 ; SI-GISEL-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s[4:5]
@@ -4478,12 +4443,12 @@ define float @v_log_f32_nnan_dynamic(float %in) #1 {
 ; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v1, 1.0, v2, vcc
 ; GFX900-GISEL-NEXT:    v_mul_f32_e32 v0, v0, v1
 ; GFX900-GISEL-NEXT:    v_log_f32_e32 v0, v0
-; GFX900-GISEL-NEXT:    s_mov_b32 s4, 0x3f317217
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x3f317217
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v3, 0x3377d1cf
-; GFX900-GISEL-NEXT:    v_mul_f32_e32 v1, 0x3f317217, v0
-; GFX900-GISEL-NEXT:    v_fma_f32 v2, v0, s4, -v1
-; GFX900-GISEL-NEXT:    v_fma_f32 v2, v0, v3, v2
-; GFX900-GISEL-NEXT:    v_add_f32_e32 v1, v1, v2
+; GFX900-GISEL-NEXT:    v_mul_f32_e32 v2, 0x3f317217, v0
+; GFX900-GISEL-NEXT:    v_fma_f32 v1, v0, v1, -v2
+; GFX900-GISEL-NEXT:    v_fma_f32 v1, v0, v3, v1
+; GFX900-GISEL-NEXT:    v_add_f32_e32 v1, v2, v1
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v2, 0x7f800000
 ; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e64 s[4:5], |v0|, v2
 ; GFX900-GISEL-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s[4:5]
@@ -4526,7 +4491,7 @@ define float @v_log_f32_nnan_dynamic(float %in) #1 {
 ; GFX1100-GISEL-NEXT:    v_mul_f32_e32 v1, 0x3f317217, v0
 ; GFX1100-GISEL-NEXT:    v_cmp_gt_f32_e64 s0, 0x7f800000, |v0|
 ; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-GISEL-NEXT:    v_fma_f32 v2, v0, 0x3f317217, -v1
+; GFX1100-GISEL-NEXT:    v_fma_f32 v2, 0x3f317217, v0, -v1
 ; GFX1100-GISEL-NEXT:    v_fmac_f32_e32 v2, 0x3377d1cf, v0
 ; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1100-GISEL-NEXT:    v_add_f32_e32 v1, v1, v2
@@ -4569,14 +4534,14 @@ define float @v_log_f32_ninf_daz(float %in) #0 {
 ; SI-GISEL:       ; %bb.0:
 ; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    v_log_f32_e32 v0, v0
-; SI-GISEL-NEXT:    s_mov_b32 s4, 0x3f317217
-; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x3377d1cf
-; SI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x7f800000
-; SI-GISEL-NEXT:    v_mul_f32_e32 v3, 0x3f317217, v0
-; SI-GISEL-NEXT:    v_fma_f32 v4, v0, s4, -v3
-; SI-GISEL-NEXT:    v_fma_f32 v1, v0, v1, v4
-; SI-GISEL-NEXT:    v_add_f32_e32 v1, v3, v1
-; SI-GISEL-NEXT:    v_cmp_lt_f32_e64 vcc, |v0|, v2
+; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x3f317217
+; SI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x3377d1cf
+; SI-GISEL-NEXT:    v_mov_b32_e32 v3, 0x7f800000
+; SI-GISEL-NEXT:    v_mul_f32_e32 v4, 0x3f317217, v0
+; SI-GISEL-NEXT:    v_fma_f32 v1, v0, v1, -v4
+; SI-GISEL-NEXT:    v_fma_f32 v1, v0, v2, v1
+; SI-GISEL-NEXT:    v_add_f32_e32 v1, v4, v1
+; SI-GISEL-NEXT:    v_cmp_lt_f32_e64 vcc, |v0|, v3
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
 ; SI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -4635,14 +4600,14 @@ define float @v_log_f32_ninf_daz(float %in) #0 {
 ; GFX900-GISEL:       ; %bb.0:
 ; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-GISEL-NEXT:    v_log_f32_e32 v0, v0
-; GFX900-GISEL-NEXT:    s_mov_b32 s4, 0x3f317217
-; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x3377d1cf
-; GFX900-GISEL-NEXT:    v_mov_b32_e32 v2, 0x7f800000
-; GFX900-GISEL-NEXT:    v_mul_f32_e32 v3, 0x3f317217, v0
-; GFX900-GISEL-NEXT:    v_fma_f32 v4, v0, s4, -v3
-; GFX900-GISEL-NEXT:    v_fma_f32 v1, v0, v1, v4
-; GFX900-GISEL-NEXT:    v_add_f32_e32 v1, v3, v1
-; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e64 vcc, |v0|, v2
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x3f317217
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v2, 0x3377d1cf
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v3, 0x7f800000
+; GFX900-GISEL-NEXT:    v_mul_f32_e32 v4, 0x3f317217, v0
+; GFX900-GISEL-NEXT:    v_fma_f32 v1, v0, v1, -v4
+; GFX900-GISEL-NEXT:    v_fma_f32 v1, v0, v2, v1
+; GFX900-GISEL-NEXT:    v_add_f32_e32 v1, v4, v1
+; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e64 vcc, |v0|, v3
 ; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
 ; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -4669,7 +4634,7 @@ define float @v_log_f32_ninf_daz(float %in) #0 {
 ; GFX1100-GISEL-NEXT:    v_mul_f32_e32 v1, 0x3f317217, v0
 ; GFX1100-GISEL-NEXT:    v_cmp_gt_f32_e64 vcc_lo, 0x7f800000, |v0|
 ; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-GISEL-NEXT:    v_fma_f32 v2, v0, 0x3f317217, -v1
+; GFX1100-GISEL-NEXT:    v_fma_f32 v2, 0x3f317217, v0, -v1
 ; GFX1100-GISEL-NEXT:    v_fmac_f32_e32 v2, 0x3377d1cf, v0
 ; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1100-GISEL-NEXT:    v_add_f32_e32 v1, v1, v2
@@ -4722,12 +4687,12 @@ define float @v_log_f32_ninf_dynamic(float %in) #1 {
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 1.0, v2, vcc
 ; SI-GISEL-NEXT:    v_mul_f32_e32 v0, v0, v1
 ; SI-GISEL-NEXT:    v_log_f32_e32 v0, v0
-; SI-GISEL-NEXT:    s_mov_b32 s4, 0x3f317217
+; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x3f317217
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v3, 0x3377d1cf
-; SI-GISEL-NEXT:    v_mul_f32_e32 v1, 0x3f317217, v0
-; SI-GISEL-NEXT:    v_fma_f32 v2, v0, s4, -v1
-; SI-GISEL-NEXT:    v_fma_f32 v2, v0, v3, v2
-; SI-GISEL-NEXT:    v_add_f32_e32 v1, v1, v2
+; SI-GISEL-NEXT:    v_mul_f32_e32 v2, 0x3f317217, v0
+; SI-GISEL-NEXT:    v_fma_f32 v1, v0, v1, -v2
+; SI-GISEL-NEXT:    v_fma_f32 v1, v0, v3, v1
+; SI-GISEL-NEXT:    v_add_f32_e32 v1, v2, v1
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x7f800000
 ; SI-GISEL-NEXT:    v_cmp_lt_f32_e64 s[4:5], |v0|, v2
 ; SI-GISEL-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s[4:5]
@@ -4820,12 +4785,12 @@ define float @v_log_f32_ninf_dynamic(float %in) #1 {
 ; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v1, 1.0, v2, vcc
 ; GFX900-GISEL-NEXT:    v_mul_f32_e32 v0, v0, v1
 ; GFX900-GISEL-NEXT:    v_log_f32_e32 v0, v0
-; GFX900-GISEL-NEXT:    s_mov_b32 s4, 0x3f317217
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x3f317217
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v3, 0x3377d1cf
-; GFX900-GISEL-NEXT:    v_mul_f32_e32 v1, 0x3f317217, v0
-; GFX900-GISEL-NEXT:    v_fma_f32 v2, v0, s4, -v1
-; GFX900-GISEL-NEXT:    v_fma_f32 v2, v0, v3, v2
-; GFX900-GISEL-NEXT:    v_add_f32_e32 v1, v1, v2
+; GFX900-GISEL-NEXT:    v_mul_f32_e32 v2, 0x3f317217, v0
+; GFX900-GISEL-NEXT:    v_fma_f32 v1, v0, v1, -v2
+; GFX900-GISEL-NEXT:    v_fma_f32 v1, v0, v3, v1
+; GFX900-GISEL-NEXT:    v_add_f32_e32 v1, v2, v1
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v2, 0x7f800000
 ; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e64 s[4:5], |v0|, v2
 ; GFX900-GISEL-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s[4:5]
@@ -4868,7 +4833,7 @@ define float @v_log_f32_ninf_dynamic(float %in) #1 {
 ; GFX1100-GISEL-NEXT:    v_mul_f32_e32 v1, 0x3f317217, v0
 ; GFX1100-GISEL-NEXT:    v_cmp_gt_f32_e64 s0, 0x7f800000, |v0|
 ; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-GISEL-NEXT:    v_fma_f32 v2, v0, 0x3f317217, -v1
+; GFX1100-GISEL-NEXT:    v_fma_f32 v2, 0x3f317217, v0, -v1
 ; GFX1100-GISEL-NEXT:    v_fmac_f32_e32 v2, 0x3377d1cf, v0
 ; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1100-GISEL-NEXT:    v_add_f32_e32 v1, v1, v2
@@ -4921,13 +4886,13 @@ define float @v_log_f32_nnan_ninf(float %in) {
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 1.0, v2, vcc
 ; SI-GISEL-NEXT:    v_mul_f32_e32 v0, v0, v1
 ; SI-GISEL-NEXT:    v_log_f32_e32 v0, v0
-; SI-GISEL-NEXT:    s_mov_b32 s4, 0x3f317217
+; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x3f317217
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v3, 0x3377d1cf
-; SI-GISEL-NEXT:    v_mul_f32_e32 v1, 0x3f317217, v0
-; SI-GISEL-NEXT:    v_fma_f32 v2, v0, s4, -v1
-; SI-GISEL-NEXT:    v_fma_f32 v0, v0, v3, v2
-; SI-GISEL-NEXT:    v_add_f32_e32 v0, v1, v0
+; SI-GISEL-NEXT:    v_mul_f32_e32 v2, 0x3f317217, v0
+; SI-GISEL-NEXT:    v_fma_f32 v1, v0, v1, -v2
+; SI-GISEL-NEXT:    v_fma_f32 v0, v0, v3, v1
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x41b17218
+; SI-GISEL-NEXT:    v_add_f32_e32 v0, v2, v0
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
 ; SI-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v1
 ; SI-GISEL-NEXT:    s_setpc_b64 s[30:31]
@@ -5007,13 +4972,13 @@ define float @v_log_f32_nnan_ninf(float %in) {
 ; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v1, 1.0, v2, vcc
 ; GFX900-GISEL-NEXT:    v_mul_f32_e32 v0, v0, v1
 ; GFX900-GISEL-NEXT:    v_log_f32_e32 v0, v0
-; GFX900-GISEL-NEXT:    s_mov_b32 s4, 0x3f317217
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x3f317217
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v3, 0x3377d1cf
-; GFX900-GISEL-NEXT:    v_mul_f32_e32 v1, 0x3f317217, v0
-; GFX900-GISEL-NEXT:    v_fma_f32 v2, v0, s4, -v1
-; GFX900-GISEL-NEXT:    v_fma_f32 v0, v0, v3, v2
-; GFX900-GISEL-NEXT:    v_add_f32_e32 v0, v1, v0
+; GFX900-GISEL-NEXT:    v_mul_f32_e32 v2, 0x3f317217, v0
+; GFX900-GISEL-NEXT:    v_fma_f32 v1, v0, v1, -v2
+; GFX900-GISEL-NEXT:    v_fma_f32 v0, v0, v3, v1
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x41b17218
+; GFX900-GISEL-NEXT:    v_add_f32_e32 v0, v2, v0
 ; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
 ; GFX900-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v1
 ; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
@@ -5048,7 +5013,7 @@ define float @v_log_f32_nnan_ninf(float %in) {
 ; GFX1100-GISEL-NEXT:    s_waitcnt_depctr 0xfff
 ; GFX1100-GISEL-NEXT:    v_mul_f32_e32 v1, 0x3f317217, v0
 ; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-GISEL-NEXT:    v_fma_f32 v2, v0, 0x3f317217, -v1
+; GFX1100-GISEL-NEXT:    v_fma_f32 v2, 0x3f317217, v0, -v1
 ; GFX1100-GISEL-NEXT:    v_fmac_f32_e32 v2, 0x3377d1cf, v0
 ; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
 ; GFX1100-GISEL-NEXT:    v_add_f32_e32 v0, v1, v2
@@ -5086,12 +5051,12 @@ define float @v_log_f32_nnan_ninf_daz(float %in) #0 {
 ; SI-GISEL:       ; %bb.0:
 ; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    v_log_f32_e32 v0, v0
-; SI-GISEL-NEXT:    s_mov_b32 s4, 0x3f317217
+; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x3f317217
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v3, 0x3377d1cf
-; SI-GISEL-NEXT:    v_mul_f32_e32 v1, 0x3f317217, v0
-; SI-GISEL-NEXT:    v_fma_f32 v2, v0, s4, -v1
-; SI-GISEL-NEXT:    v_fma_f32 v0, v0, v3, v2
-; SI-GISEL-NEXT:    v_add_f32_e32 v0, v1, v0
+; SI-GISEL-NEXT:    v_mul_f32_e32 v2, 0x3f317217, v0
+; SI-GISEL-NEXT:    v_fma_f32 v1, v0, v1, -v2
+; SI-GISEL-NEXT:    v_fma_f32 v0, v0, v3, v1
+; SI-GISEL-NEXT:    v_add_f32_e32 v0, v2, v0
 ; SI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: v_log_f32_nnan_ninf_daz:
@@ -5125,12 +5090,12 @@ define float @v_log_f32_nnan_ninf_daz(float %in) #0 {
 ; GFX900-GISEL:       ; %bb.0:
 ; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-GISEL-NEXT:    v_log_f32_e32 v0, v0
-; GFX900-GISEL-NEXT:    s_mov_b32 s4, 0x3f317217
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x3f317217
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v3, 0x3377d1cf
-; GFX900-GISEL-NEXT:    v_mul_f32_e32 v1, 0x3f317217, v0
-; GFX900-GISEL-NEXT:    v_fma_f32 v2, v0, s4, -v1
-; GFX900-GISEL-NEXT:    v_fma_f32 v0, v0, v3, v2
-; GFX900-GISEL-NEXT:    v_add_f32_e32 v0, v1, v0
+; GFX900-GISEL-NEXT:    v_mul_f32_e32 v2, 0x3f317217, v0
+; GFX900-GISEL-NEXT:    v_fma_f32 v1, v0, v1, -v2
+; GFX900-GISEL-NEXT:    v_fma_f32 v0, v0, v3, v1
+; GFX900-GISEL-NEXT:    v_add_f32_e32 v0, v2, v0
 ; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1100-SDAG-LABEL: v_log_f32_nnan_ninf_daz:
@@ -5153,7 +5118,7 @@ define float @v_log_f32_nnan_ninf_daz(float %in) #0 {
 ; GFX1100-GISEL-NEXT:    s_waitcnt_depctr 0xfff
 ; GFX1100-GISEL-NEXT:    v_mul_f32_e32 v1, 0x3f317217, v0
 ; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-GISEL-NEXT:    v_fma_f32 v2, v0, 0x3f317217, -v1
+; GFX1100-GISEL-NEXT:    v_fma_f32 v2, 0x3f317217, v0, -v1
 ; GFX1100-GISEL-NEXT:    v_fmac_f32_e32 v2, 0x3377d1cf, v0
 ; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1100-GISEL-NEXT:    v_add_f32_e32 v0, v1, v2
@@ -5202,13 +5167,13 @@ define float @v_log_f32_nnan_ninf_dynamic(float %in) #1 {
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 1.0, v2, vcc
 ; SI-GISEL-NEXT:    v_mul_f32_e32 v0, v0, v1
 ; SI-GISEL-NEXT:    v_log_f32_e32 v0, v0
-; SI-GISEL-NEXT:    s_mov_b32 s4, 0x3f317217
+; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x3f317217
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v3, 0x3377d1cf
-; SI-GISEL-NEXT:    v_mul_f32_e32 v1, 0x3f317217, v0
-; SI-GISEL-NEXT:    v_fma_f32 v2, v0, s4, -v1
-; SI-GISEL-NEXT:    v_fma_f32 v0, v0, v3, v2
-; SI-GISEL-NEXT:    v_add_f32_e32 v0, v1, v0
+; SI-GISEL-NEXT:    v_mul_f32_e32 v2, 0x3f317217, v0
+; SI-GISEL-NEXT:    v_fma_f32 v1, v0, v1, -v2
+; SI-GISEL-NEXT:    v_fma_f32 v0, v0, v3, v1
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x41b17218
+; SI-GISEL-NEXT:    v_add_f32_e32 v0, v2, v0
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
 ; SI-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v1
 ; SI-GISEL-NEXT:    s_setpc_b64 s[30:31]
@@ -5288,13 +5253,13 @@ define float @v_log_f32_nnan_ninf_dynamic(float %in) #1 {
 ; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v1, 1.0, v2, vcc
 ; GFX900-GISEL-NEXT:    v_mul_f32_e32 v0, v0, v1
 ; GFX900-GISEL-NEXT:    v_log_f32_e32 v0, v0
-; GFX900-GISEL-NEXT:    s_mov_b32 s4, 0x3f317217
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x3f317217
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v3, 0x3377d1cf
-; GFX900-GISEL-NEXT:    v_mul_f32_e32 v1, 0x3f317217, v0
-; GFX900-GISEL-NEXT:    v_fma_f32 v2, v0, s4, -v1
-; GFX900-GISEL-NEXT:    v_fma_f32 v0, v0, v3, v2
-; GFX900-GISEL-NEXT:    v_add_f32_e32 v0, v1, v0
+; GFX900-GISEL-NEXT:    v_mul_f32_e32 v2, 0x3f317217, v0
+; GFX900-GISEL-NEXT:    v_fma_f32 v1, v0, v1, -v2
+; GFX900-GISEL-NEXT:    v_fma_f32 v0, v0, v3, v1
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x41b17218
+; GFX900-GISEL-NEXT:    v_add_f32_e32 v0, v2, v0
 ; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
 ; GFX900-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v1
 ; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
@@ -5329,7 +5294,7 @@ define float @v_log_f32_nnan_ninf_dynamic(float %in) #1 {
 ; GFX1100-GISEL-NEXT:    s_waitcnt_depctr 0xfff
 ; GFX1100-GISEL-NEXT:    v_mul_f32_e32 v1, 0x3f317217, v0
 ; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-GISEL-NEXT:    v_fma_f32 v2, v0, 0x3f317217, -v1
+; GFX1100-GISEL-NEXT:    v_fma_f32 v2, 0x3f317217, v0, -v1
 ; GFX1100-GISEL-NEXT:    v_fmac_f32_e32 v2, 0x3377d1cf, v0
 ; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
 ; GFX1100-GISEL-NEXT:    v_add_f32_e32 v0, v1, v2
@@ -5412,12 +5377,12 @@ define float @v_log_f32_dynamic_mode(float %in) #1 {
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 1.0, v2, vcc
 ; SI-GISEL-NEXT:    v_mul_f32_e32 v0, v0, v1
 ; SI-GISEL-NEXT:    v_log_f32_e32 v0, v0
-; SI-GISEL-NEXT:    s_mov_b32 s4, 0x3f317217
+; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x3f317217
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v3, 0x3377d1cf
-; SI-GISEL-NEXT:    v_mul_f32_e32 v1, 0x3f317217, v0
-; SI-GISEL-NEXT:    v_fma_f32 v2, v0, s4, -v1
-; SI-GISEL-NEXT:    v_fma_f32 v2, v0, v3, v2
-; SI-GISEL-NEXT:    v_add_f32_e32 v1, v1, v2
+; SI-GISEL-NEXT:    v_mul_f32_e32 v2, 0x3f317217, v0
+; SI-GISEL-NEXT:    v_fma_f32 v1, v0, v1, -v2
+; SI-GISEL-NEXT:    v_fma_f32 v1, v0, v3, v1
+; SI-GISEL-NEXT:    v_add_f32_e32 v1, v2, v1
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x7f800000
 ; SI-GISEL-NEXT:    v_cmp_lt_f32_e64 s[4:5], |v0|, v2
 ; SI-GISEL-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s[4:5]
@@ -5510,12 +5475,12 @@ define float @v_log_f32_dynamic_mode(float %in) #1 {
 ; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v1, 1.0, v2, vcc
 ; GFX900-GISEL-NEXT:    v_mul_f32_e32 v0, v0, v1
 ; GFX900-GISEL-NEXT:    v_log_f32_e32 v0, v0
-; GFX900-GISEL-NEXT:    s_mov_b32 s4, 0x3f317217
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x3f317217
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v3, 0x3377d1cf
-; GFX900-GISEL-NEXT:    v_mul_f32_e32 v1, 0x3f317217, v0
-; GFX900-GISEL-NEXT:    v_fma_f32 v2, v0, s4, -v1
-; GFX900-GISEL-NEXT:    v_fma_f32 v2, v0, v3, v2
-; GFX900-GISEL-NEXT:    v_add_f32_e32 v1, v1, v2
+; GFX900-GISEL-NEXT:    v_mul_f32_e32 v2, 0x3f317217, v0
+; GFX900-GISEL-NEXT:    v_fma_f32 v1, v0, v1, -v2
+; GFX900-GISEL-NEXT:    v_fma_f32 v1, v0, v3, v1
+; GFX900-GISEL-NEXT:    v_add_f32_e32 v1, v2, v1
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v2, 0x7f800000
 ; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e64 s[4:5], |v0|, v2
 ; GFX900-GISEL-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s[4:5]
@@ -5558,7 +5523,7 @@ define float @v_log_f32_dynamic_mode(float %in) #1 {
 ; GFX1100-GISEL-NEXT:    v_mul_f32_e32 v1, 0x3f317217, v0
 ; GFX1100-GISEL-NEXT:    v_cmp_gt_f32_e64 s0, 0x7f800000, |v0|
 ; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-GISEL-NEXT:    v_fma_f32 v2, v0, 0x3f317217, -v1
+; GFX1100-GISEL-NEXT:    v_fma_f32 v2, 0x3f317217, v0, -v1
 ; GFX1100-GISEL-NEXT:    v_fmac_f32_e32 v2, 0x3377d1cf, v0
 ; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1100-GISEL-NEXT:    v_add_f32_e32 v1, v1, v2
@@ -5606,12 +5571,12 @@ define float @v_log_f32_undef() {
 ; SI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, s4, v0
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
 ; SI-GISEL-NEXT:    v_log_f32_e32 v0, v0
-; SI-GISEL-NEXT:    s_mov_b32 s4, 0x3f317217
+; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x3f317217
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v3, 0x3377d1cf
-; SI-GISEL-NEXT:    v_mul_f32_e32 v1, 0x3f317217, v0
-; SI-GISEL-NEXT:    v_fma_f32 v2, v0, s4, -v1
-; SI-GISEL-NEXT:    v_fma_f32 v2, v0, v3, v2
-; SI-GISEL-NEXT:    v_add_f32_e32 v1, v1, v2
+; SI-GISEL-NEXT:    v_mul_f32_e32 v2, 0x3f317217, v0
+; SI-GISEL-NEXT:    v_fma_f32 v1, v0, v1, -v2
+; SI-GISEL-NEXT:    v_fma_f32 v1, v0, v3, v1
+; SI-GISEL-NEXT:    v_add_f32_e32 v1, v2, v1
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x7f800000
 ; SI-GISEL-NEXT:    v_cmp_lt_f32_e64 s[4:5], |v0|, v2
 ; SI-GISEL-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s[4:5]
@@ -5688,12 +5653,12 @@ define float @v_log_f32_undef() {
 ; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, s4, v0
 ; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
 ; GFX900-GISEL-NEXT:    v_log_f32_e32 v0, v0
-; GFX900-GISEL-NEXT:    s_mov_b32 s4, 0x3f317217
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x3f317217
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v3, 0x3377d1cf
-; GFX900-GISEL-NEXT:    v_mul_f32_e32 v1, 0x3f317217, v0
-; GFX900-GISEL-NEXT:    v_fma_f32 v2, v0, s4, -v1
-; GFX900-GISEL-NEXT:    v_fma_f32 v2, v0, v3, v2
-; GFX900-GISEL-NEXT:    v_add_f32_e32 v1, v1, v2
+; GFX900-GISEL-NEXT:    v_mul_f32_e32 v2, 0x3f317217, v0
+; GFX900-GISEL-NEXT:    v_fma_f32 v1, v0, v1, -v2
+; GFX900-GISEL-NEXT:    v_fma_f32 v1, v0, v3, v1
+; GFX900-GISEL-NEXT:    v_add_f32_e32 v1, v2, v1
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v2, 0x7f800000
 ; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e64 s[4:5], |v0|, v2
 ; GFX900-GISEL-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s[4:5]
@@ -5729,7 +5694,7 @@ define float @v_log_f32_undef() {
 ; GFX1100-GISEL-NEXT:    v_mul_f32_e32 v1, 0x3f317217, v0
 ; GFX1100-GISEL-NEXT:    v_cmp_gt_f32_e64 s0, 0x7f800000, |v0|
 ; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-GISEL-NEXT:    v_fma_f32 v2, v0, 0x3f317217, -v1
+; GFX1100-GISEL-NEXT:    v_fma_f32 v2, 0x3f317217, v0, -v1
 ; GFX1100-GISEL-NEXT:    v_fmac_f32_e32 v2, 0x3377d1cf, v0
 ; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1100-GISEL-NEXT:    v_add_f32_e32 v1, v1, v2
@@ -5773,14 +5738,14 @@ define float @v_log_f32_0() {
 ; SI-GISEL:       ; %bb.0:
 ; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    v_log_f32_e32 v0, 0
-; SI-GISEL-NEXT:    s_mov_b32 s4, 0x3f317217
-; SI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x3377d1cf
+; SI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x3f317217
+; SI-GISEL-NEXT:    v_mov_b32_e32 v3, 0x3377d1cf
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
-; SI-GISEL-NEXT:    v_mul_f32_e32 v3, 0x3f317217, v0
-; SI-GISEL-NEXT:    v_fma_f32 v4, v0, s4, -v3
-; SI-GISEL-NEXT:    v_fma_f32 v2, v0, v2, v4
-; SI-GISEL-NEXT:    v_add_f32_e32 v2, v3, v2
+; SI-GISEL-NEXT:    v_mul_f32_e32 v4, 0x3f317217, v0
+; SI-GISEL-NEXT:    v_fma_f32 v2, v0, v2, -v4
+; SI-GISEL-NEXT:    v_fma_f32 v2, v0, v3, v2
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v3, 0x7f800000
+; SI-GISEL-NEXT:    v_add_f32_e32 v2, v4, v2
 ; SI-GISEL-NEXT:    v_cmp_lt_f32_e64 vcc, |v0|, v3
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x41b17218
@@ -5851,14 +5816,14 @@ define float @v_log_f32_0() {
 ; GFX900-GISEL:       ; %bb.0:
 ; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-GISEL-NEXT:    v_log_f32_e32 v0, 0
-; GFX900-GISEL-NEXT:    s_mov_b32 s4, 0x3f317217
-; GFX900-GISEL-NEXT:    v_mov_b32_e32 v2, 0x3377d1cf
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v2, 0x3f317217
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v3, 0x3377d1cf
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
-; GFX900-GISEL-NEXT:    v_mul_f32_e32 v3, 0x3f317217, v0
-; GFX900-GISEL-NEXT:    v_fma_f32 v4, v0, s4, -v3
-; GFX900-GISEL-NEXT:    v_fma_f32 v2, v0, v2, v4
-; GFX900-GISEL-NEXT:    v_add_f32_e32 v2, v3, v2
+; GFX900-GISEL-NEXT:    v_mul_f32_e32 v4, 0x3f317217, v0
+; GFX900-GISEL-NEXT:    v_fma_f32 v2, v0, v2, -v4
+; GFX900-GISEL-NEXT:    v_fma_f32 v2, v0, v3, v2
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v3, 0x7f800000
+; GFX900-GISEL-NEXT:    v_add_f32_e32 v2, v4, v2
 ; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e64 vcc, |v0|, v3
 ; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v2, 0x41b17218
@@ -5888,12 +5853,12 @@ define float @v_log_f32_0() {
 ; GFX1100-GISEL:       ; %bb.0:
 ; GFX1100-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-GISEL-NEXT:    v_log_f32_e32 v0, 0
-; GFX1100-GISEL-NEXT:    v_cmp_gt_f32_e64 s0, 0x800000, 0
+; GFX1100-GISEL-NEXT:    v_cmp_lt_f32_e64 s0, 0, 0x800000
 ; GFX1100-GISEL-NEXT:    s_waitcnt_depctr 0xfff
 ; GFX1100-GISEL-NEXT:    v_mul_f32_e32 v1, 0x3f317217, v0
 ; GFX1100-GISEL-NEXT:    v_cmp_gt_f32_e64 vcc_lo, 0x7f800000, |v0|
 ; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-GISEL-NEXT:    v_fma_f32 v2, v0, 0x3f317217, -v1
+; GFX1100-GISEL-NEXT:    v_fma_f32 v2, 0x3f317217, v0, -v1
 ; GFX1100-GISEL-NEXT:    v_fmac_f32_e32 v2, 0x3377d1cf, v0
 ; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1100-GISEL-NEXT:    v_add_f32_e32 v1, v1, v2
@@ -5937,15 +5902,15 @@ define float @v_log_f32_from_fpext_f16(i16 %src.i) {
 ; SI-GISEL:       ; %bb.0:
 ; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; SI-GISEL-NEXT:    s_mov_b32 s4, 0x3f317217
-; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x3377d1cf
-; SI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x7f800000
+; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x3f317217
+; SI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x3377d1cf
+; SI-GISEL-NEXT:    v_mov_b32_e32 v3, 0x7f800000
 ; SI-GISEL-NEXT:    v_log_f32_e32 v0, v0
-; SI-GISEL-NEXT:    v_mul_f32_e32 v3, 0x3f317217, v0
-; SI-GISEL-NEXT:    v_fma_f32 v4, v0, s4, -v3
-; SI-GISEL-NEXT:    v_fma_f32 v1, v0, v1, v4
-; SI-GISEL-NEXT:    v_add_f32_e32 v1, v3, v1
-; SI-GISEL-NEXT:    v_cmp_lt_f32_e64 vcc, |v0|, v2
+; SI-GISEL-NEXT:    v_mul_f32_e32 v4, 0x3f317217, v0
+; SI-GISEL-NEXT:    v_fma_f32 v1, v0, v1, -v4
+; SI-GISEL-NEXT:    v_fma_f32 v1, v0, v2, v1
+; SI-GISEL-NEXT:    v_add_f32_e32 v1, v4, v1
+; SI-GISEL-NEXT:    v_cmp_lt_f32_e64 vcc, |v0|, v3
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
 ; SI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -6007,15 +5972,15 @@ define float @v_log_f32_from_fpext_f16(i16 %src.i) {
 ; GFX900-GISEL:       ; %bb.0:
 ; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-GISEL-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; GFX900-GISEL-NEXT:    s_mov_b32 s4, 0x3f317217
-; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x3377d1cf
-; GFX900-GISEL-NEXT:    v_mov_b32_e32 v2, 0x7f800000
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x3f317217
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v2, 0x3377d1cf
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v3, 0x7f800000
 ; GFX900-GISEL-NEXT:    v_log_f32_e32 v0, v0
-; GFX900-GISEL-NEXT:    v_mul_f32_e32 v3, 0x3f317217, v0
-; GFX900-GISEL-NEXT:    v_fma_f32 v4, v0, s4, -v3
-; GFX900-GISEL-NEXT:    v_fma_f32 v1, v0, v1, v4
-; GFX900-GISEL-NEXT:    v_add_f32_e32 v1, v3, v1
-; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e64 vcc, |v0|, v2
+; GFX900-GISEL-NEXT:    v_mul_f32_e32 v4, 0x3f317217, v0
+; GFX900-GISEL-NEXT:    v_fma_f32 v1, v0, v1, -v4
+; GFX900-GISEL-NEXT:    v_fma_f32 v1, v0, v2, v1
+; GFX900-GISEL-NEXT:    v_add_f32_e32 v1, v4, v1
+; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e64 vcc, |v0|, v3
 ; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
 ; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -6045,7 +6010,7 @@ define float @v_log_f32_from_fpext_f16(i16 %src.i) {
 ; GFX1100-GISEL-NEXT:    s_waitcnt_depctr 0xfff
 ; GFX1100-GISEL-NEXT:    v_mul_f32_e32 v1, 0x3f317217, v0
 ; GFX1100-GISEL-NEXT:    v_cmp_gt_f32_e64 vcc_lo, 0x7f800000, |v0|
-; GFX1100-GISEL-NEXT:    v_fma_f32 v2, v0, 0x3f317217, -v1
+; GFX1100-GISEL-NEXT:    v_fma_f32 v2, 0x3f317217, v0, -v1
 ; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1100-GISEL-NEXT:    v_fmac_f32_e32 v2, 0x3377d1cf, v0
 ; GFX1100-GISEL-NEXT:    v_add_f32_e32 v1, v1, v2
@@ -6100,18 +6065,18 @@ define float @v_log_f32_from_fpext_math_f16(i16 %src0.i, i16 %src1.i) {
 ; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; SI-GISEL-NEXT:    v_cvt_f32_f16_e32 v1, v1
-; SI-GISEL-NEXT:    s_mov_b32 s4, 0x3f317217
-; SI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x7f800000
+; SI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x3377d1cf
+; SI-GISEL-NEXT:    v_mov_b32_e32 v3, 0x7f800000
 ; SI-GISEL-NEXT:    v_add_f32_e32 v0, v0, v1
 ; SI-GISEL-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x3377d1cf
+; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x3f317217
 ; SI-GISEL-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; SI-GISEL-NEXT:    v_log_f32_e32 v0, v0
-; SI-GISEL-NEXT:    v_mul_f32_e32 v3, 0x3f317217, v0
-; SI-GISEL-NEXT:    v_fma_f32 v4, v0, s4, -v3
-; SI-GISEL-NEXT:    v_fma_f32 v1, v0, v1, v4
-; SI-GISEL-NEXT:    v_add_f32_e32 v1, v3, v1
-; SI-GISEL-NEXT:    v_cmp_lt_f32_e64 vcc, |v0|, v2
+; SI-GISEL-NEXT:    v_mul_f32_e32 v4, 0x3f317217, v0
+; SI-GISEL-NEXT:    v_fma_f32 v1, v0, v1, -v4
+; SI-GISEL-NEXT:    v_fma_f32 v1, v0, v2, v1
+; SI-GISEL-NEXT:    v_add_f32_e32 v1, v4, v1
+; SI-GISEL-NEXT:    v_cmp_lt_f32_e64 vcc, |v0|, v3
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
 ; SI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -6177,15 +6142,15 @@ define float @v_log_f32_from_fpext_math_f16(i16 %src0.i, i16 %src1.i) {
 ; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-GISEL-NEXT:    v_add_f16_e32 v0, v0, v1
 ; GFX900-GISEL-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; GFX900-GISEL-NEXT:    s_mov_b32 s4, 0x3f317217
-; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x3377d1cf
-; GFX900-GISEL-NEXT:    v_mov_b32_e32 v2, 0x7f800000
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x3f317217
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v2, 0x3377d1cf
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v3, 0x7f800000
 ; GFX900-GISEL-NEXT:    v_log_f32_e32 v0, v0
-; GFX900-GISEL-NEXT:    v_mul_f32_e32 v3, 0x3f317217, v0
-; GFX900-GISEL-NEXT:    v_fma_f32 v4, v0, s4, -v3
-; GFX900-GISEL-NEXT:    v_fma_f32 v1, v0, v1, v4
-; GFX900-GISEL-NEXT:    v_add_f32_e32 v1, v3, v1
-; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e64 vcc, |v0|, v2
+; GFX900-GISEL-NEXT:    v_mul_f32_e32 v4, 0x3f317217, v0
+; GFX900-GISEL-NEXT:    v_fma_f32 v1, v0, v1, -v4
+; GFX900-GISEL-NEXT:    v_fma_f32 v1, v0, v2, v1
+; GFX900-GISEL-NEXT:    v_add_f32_e32 v1, v4, v1
+; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e64 vcc, |v0|, v3
 ; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
 ; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -6218,7 +6183,7 @@ define float @v_log_f32_from_fpext_math_f16(i16 %src0.i, i16 %src1.i) {
 ; GFX1100-GISEL-NEXT:    v_mul_f32_e32 v1, 0x3f317217, v0
 ; GFX1100-GISEL-NEXT:    v_cmp_gt_f32_e64 vcc_lo, 0x7f800000, |v0|
 ; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-GISEL-NEXT:    v_fma_f32 v2, v0, 0x3f317217, -v1
+; GFX1100-GISEL-NEXT:    v_fma_f32 v2, 0x3f317217, v0, -v1
 ; GFX1100-GISEL-NEXT:    v_fmac_f32_e32 v2, 0x3377d1cf, v0
 ; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1100-GISEL-NEXT:    v_add_f32_e32 v1, v1, v2
@@ -7450,15 +7415,15 @@ define <4 x half> @v_log_v4f16(<4 x half> %in) {
 ; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-GISEL-NEXT:    v_log_f16_e32 v2, v0
 ; VI-GISEL-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; VI-GISEL-NEXT:    v_log_f16_e32 v3, v1
+; VI-GISEL-NEXT:    v_log_f16_e32 v4, v1
 ; VI-GISEL-NEXT:    v_log_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; VI-GISEL-NEXT:    v_mov_b32_e32 v4, 0x398c
+; VI-GISEL-NEXT:    v_mov_b32_e32 v3, 0x398c
 ; VI-GISEL-NEXT:    v_mul_f16_e32 v2, 0x398c, v2
-; VI-GISEL-NEXT:    v_mul_f16_sdwa v0, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; VI-GISEL-NEXT:    v_mul_f16_e32 v3, 0x398c, v3
-; VI-GISEL-NEXT:    v_mul_f16_sdwa v1, v1, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; VI-GISEL-NEXT:    v_mul_f16_sdwa v0, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; VI-GISEL-NEXT:    v_mul_f16_e32 v4, 0x398c, v4
+; VI-GISEL-NEXT:    v_mul_f16_sdwa v1, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; VI-GISEL-NEXT:    v_or_b32_e32 v0, v2, v0
-; VI-GISEL-NEXT:    v_or_b32_e32 v1, v3, v1
+; VI-GISEL-NEXT:    v_or_b32_e32 v1, v4, v1
 ; VI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-SDAG-LABEL: v_log_v4f16:
@@ -7618,15 +7583,15 @@ define <4 x half> @v_log_v4f16_fast(<4 x half> %in) {
 ; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-GISEL-NEXT:    v_log_f16_e32 v2, v0
 ; VI-GISEL-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; VI-GISEL-NEXT:    v_log_f16_e32 v3, v1
+; VI-GISEL-NEXT:    v_log_f16_e32 v4, v1
 ; VI-GISEL-NEXT:    v_log_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; VI-GISEL-NEXT:    v_mov_b32_e32 v4, 0x398c
+; VI-GISEL-NEXT:    v_mov_b32_e32 v3, 0x398c
 ; VI-GISEL-NEXT:    v_mul_f16_e32 v2, 0x398c, v2
-; VI-GISEL-NEXT:    v_mul_f16_sdwa v0, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; VI-GISEL-NEXT:    v_mul_f16_e32 v3, 0x398c, v3
-; VI-GISEL-NEXT:    v_mul_f16_sdwa v1, v1, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; VI-GISEL-NEXT:    v_mul_f16_sdwa v0, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; VI-GISEL-NEXT:    v_mul_f16_e32 v4, 0x398c, v4
+; VI-GISEL-NEXT:    v_mul_f16_sdwa v1, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; VI-GISEL-NEXT:    v_or_b32_e32 v0, v2, v0
-; VI-GISEL-NEXT:    v_or_b32_e32 v1, v3, v1
+; VI-GISEL-NEXT:    v_or_b32_e32 v1, v4, v1
 ; VI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-SDAG-LABEL: v_log_v4f16_fast:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.log10.ll b/llvm/test/CodeGen/AMDGPU/llvm.log10.ll
index c732483733f5a1c..5ba72612321a6ad 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.log10.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.log10.ll
@@ -46,20 +46,20 @@ define amdgpu_kernel void @s_log10_f32(ptr addrspace(1) %out, float %in) {
 ; SI-GISEL-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v0, 0x800000
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x4f800000
-; SI-GISEL-NEXT:    s_mov_b32 s0, 0x3e9a209a
+; SI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x3284fbcf
 ; SI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, s2, v0
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v0, 1.0, v1, vcc
 ; SI-GISEL-NEXT:    v_mul_f32_e32 v0, s2, v0
 ; SI-GISEL-NEXT:    v_log_f32_e32 v0, v0
-; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x3284fbcf
-; SI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x7f800000
+; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x3e9a209a
+; SI-GISEL-NEXT:    v_mov_b32_e32 v3, 0x7f800000
 ; SI-GISEL-NEXT:    s_mov_b32 s6, -1
-; SI-GISEL-NEXT:    v_mul_f32_e32 v3, 0x3e9a209a, v0
-; SI-GISEL-NEXT:    v_fma_f32 v4, v0, s0, -v3
-; SI-GISEL-NEXT:    v_fma_f32 v1, v0, v1, v4
-; SI-GISEL-NEXT:    v_add_f32_e32 v1, v3, v1
-; SI-GISEL-NEXT:    v_cmp_lt_f32_e64 s[0:1], |v0|, v2
+; SI-GISEL-NEXT:    v_mul_f32_e32 v4, 0x3e9a209a, v0
+; SI-GISEL-NEXT:    v_fma_f32 v1, v0, v1, -v4
+; SI-GISEL-NEXT:    v_fma_f32 v1, v0, v2, v1
+; SI-GISEL-NEXT:    v_add_f32_e32 v1, v4, v1
+; SI-GISEL-NEXT:    v_cmp_lt_f32_e64 s[0:1], |v0|, v3
 ; SI-GISEL-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s[0:1]
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x411a209b
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
@@ -165,19 +165,19 @@ define amdgpu_kernel void @s_log10_f32(ptr addrspace(1) %out, float %in) {
 ; GFX900-GISEL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v0, 0x800000
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x4f800000
-; GFX900-GISEL-NEXT:    s_mov_b32 s0, 0x3e9a209a
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v2, 0x3284fbcf
 ; GFX900-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, s4, v0
 ; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v0, 1.0, v1, vcc
 ; GFX900-GISEL-NEXT:    v_mul_f32_e32 v0, s4, v0
 ; GFX900-GISEL-NEXT:    v_log_f32_e32 v0, v0
-; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x3284fbcf
-; GFX900-GISEL-NEXT:    v_mov_b32_e32 v2, 0x7f800000
-; GFX900-GISEL-NEXT:    v_mul_f32_e32 v3, 0x3e9a209a, v0
-; GFX900-GISEL-NEXT:    v_fma_f32 v4, v0, s0, -v3
-; GFX900-GISEL-NEXT:    v_fma_f32 v1, v0, v1, v4
-; GFX900-GISEL-NEXT:    v_add_f32_e32 v1, v3, v1
-; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e64 s[0:1], |v0|, v2
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x3e9a209a
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v3, 0x7f800000
+; GFX900-GISEL-NEXT:    v_mul_f32_e32 v4, 0x3e9a209a, v0
+; GFX900-GISEL-NEXT:    v_fma_f32 v1, v0, v1, -v4
+; GFX900-GISEL-NEXT:    v_fma_f32 v1, v0, v2, v1
+; GFX900-GISEL-NEXT:    v_add_f32_e32 v1, v4, v1
+; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e64 s[0:1], |v0|, v3
 ; GFX900-GISEL-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s[0:1]
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x411a209b
 ; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
@@ -229,7 +229,7 @@ define amdgpu_kernel void @s_log10_f32(ptr addrspace(1) %out, float %in) {
 ; GFX1100-GISEL-NEXT:    s_waitcnt_depctr 0xfff
 ; GFX1100-GISEL-NEXT:    v_mul_f32_e32 v1, 0x3e9a209a, v0
 ; GFX1100-GISEL-NEXT:    v_cmp_gt_f32_e64 vcc_lo, 0x7f800000, |v0|
-; GFX1100-GISEL-NEXT:    v_fma_f32 v2, v0, 0x3e9a209a, -v1
+; GFX1100-GISEL-NEXT:    v_fma_f32 v2, 0x3e9a209a, v0, -v1
 ; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1100-GISEL-NEXT:    v_fmac_f32_e32 v2, 0x3284fbcf, v0
 ; GFX1100-GISEL-NEXT:    v_add_f32_e32 v1, v1, v2
@@ -360,41 +360,38 @@ define amdgpu_kernel void @s_log10_v2f32(ptr addrspace(1) %out, <2 x float> %in)
 ; SI-GISEL-LABEL: s_log10_v2f32:
 ; SI-GISEL:       ; %bb.0:
 ; SI-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
-; SI-GISEL-NEXT:    s_mov_b32 s0, 0x800000
+; SI-GISEL-NEXT:    v_mov_b32_e32 v0, 0x800000
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x4f800000
-; SI-GISEL-NEXT:    s_mov_b32 s2, 0x3284fbcf
-; SI-GISEL-NEXT:    s_mov_b32 s3, 0x7f800000
+; SI-GISEL-NEXT:    v_mov_b32_e32 v3, 0x3e9a209a
+; SI-GISEL-NEXT:    v_mov_b32_e32 v4, 0x3284fbcf
 ; SI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-GISEL-NEXT:    v_mov_b32_e32 v2, s6
-; SI-GISEL-NEXT:    v_cmp_gt_f32_e32 vcc, s0, v2
+; SI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, s6, v0
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v2, 1.0, v1, vcc
 ; SI-GISEL-NEXT:    v_mul_f32_e32 v2, s6, v2
 ; SI-GISEL-NEXT:    v_log_f32_e32 v2, v2
-; SI-GISEL-NEXT:    s_mov_b32 s0, 0x3e9a209a
-; SI-GISEL-NEXT:    v_mov_b32_e32 v0, 0x800000
-; SI-GISEL-NEXT:    v_mov_b32_e32 v3, 0x3e9a209a
-; SI-GISEL-NEXT:    v_mul_f32_e32 v4, 0x3e9a209a, v2
-; SI-GISEL-NEXT:    v_fma_f32 v5, v2, s0, -v4
-; SI-GISEL-NEXT:    v_fma_f32 v5, v2, s2, v5
-; SI-GISEL-NEXT:    v_add_f32_e32 v4, v4, v5
-; SI-GISEL-NEXT:    v_cmp_lt_f32_e64 s[0:1], |v2|, s3
-; SI-GISEL-NEXT:    v_cndmask_b32_e64 v2, v2, v4, s[0:1]
+; SI-GISEL-NEXT:    v_mov_b32_e32 v5, 0x7f800000
+; SI-GISEL-NEXT:    s_mov_b32 s6, -1
+; SI-GISEL-NEXT:    v_mul_f32_e32 v6, 0x3e9a209a, v2
+; SI-GISEL-NEXT:    v_fma_f32 v7, v2, v3, -v6
+; SI-GISEL-NEXT:    v_fma_f32 v7, v2, v4, v7
+; SI-GISEL-NEXT:    v_add_f32_e32 v6, v6, v7
+; SI-GISEL-NEXT:    v_cmp_lt_f32_e64 s[0:1], |v2|, v5
+; SI-GISEL-NEXT:    v_cndmask_b32_e64 v2, v2, v6, s[0:1]
 ; SI-GISEL-NEXT:    v_cmp_lt_f32_e64 s[0:1], s7, v0
 ; SI-GISEL-NEXT:    v_cndmask_b32_e64 v0, 1.0, v1, s[0:1]
 ; SI-GISEL-NEXT:    v_mul_f32_e32 v0, s7, v0
 ; SI-GISEL-NEXT:    v_log_f32_e32 v1, v0
-; SI-GISEL-NEXT:    v_mov_b32_e32 v4, 0x411a209b
-; SI-GISEL-NEXT:    v_cndmask_b32_e32 v0, 0, v4, vcc
+; SI-GISEL-NEXT:    v_mov_b32_e32 v6, 0x411a209b
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v0, 0, v6, vcc
 ; SI-GISEL-NEXT:    v_sub_f32_e32 v0, v2, v0
 ; SI-GISEL-NEXT:    v_mul_f32_e32 v2, 0x3e9a209a, v1
 ; SI-GISEL-NEXT:    v_fma_f32 v3, v1, v3, -v2
-; SI-GISEL-NEXT:    v_fma_f32 v3, v1, s2, v3
+; SI-GISEL-NEXT:    v_fma_f32 v3, v1, v4, v3
 ; SI-GISEL-NEXT:    v_add_f32_e32 v2, v2, v3
-; SI-GISEL-NEXT:    v_cmp_lt_f32_e64 vcc, |v1|, s3
+; SI-GISEL-NEXT:    v_cmp_lt_f32_e64 vcc, |v1|, v5
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
-; SI-GISEL-NEXT:    v_cndmask_b32_e64 v2, 0, v4, s[0:1]
+; SI-GISEL-NEXT:    v_cndmask_b32_e64 v2, 0, v6, s[0:1]
 ; SI-GISEL-NEXT:    v_sub_f32_e32 v1, v1, v2
-; SI-GISEL-NEXT:    s_mov_b32 s6, -1
 ; SI-GISEL-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-GISEL-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; SI-GISEL-NEXT:    s_endpgm
@@ -449,47 +446,44 @@ define amdgpu_kernel void @s_log10_v2f32(ptr addrspace(1) %out, <2 x float> %in)
 ; VI-GISEL-LABEL: s_log10_v2f32:
 ; VI-GISEL:       ; %bb.0:
 ; VI-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-GISEL-NEXT:    s_mov_b32 s0, 0x800000
-; VI-GISEL-NEXT:    v_mov_b32_e32 v0, 0x4f800000
-; VI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x800000
+; VI-GISEL-NEXT:    v_mov_b32_e32 v0, 0x800000
+; VI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x4f800000
+; VI-GISEL-NEXT:    v_mov_b32_e32 v3, 0x7f800000
 ; VI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT:    v_mov_b32_e32 v1, s6
-; VI-GISEL-NEXT:    v_cmp_gt_f32_e32 vcc, s0, v1
-; VI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 1.0, v0, vcc
-; VI-GISEL-NEXT:    v_mul_f32_e32 v1, s6, v1
-; VI-GISEL-NEXT:    v_log_f32_e32 v1, v1
-; VI-GISEL-NEXT:    s_mov_b32 s0, 0x7f800000
-; VI-GISEL-NEXT:    v_and_b32_e32 v3, 0xfffff000, v1
-; VI-GISEL-NEXT:    v_sub_f32_e32 v4, v1, v3
-; VI-GISEL-NEXT:    v_mul_f32_e32 v5, 0x369a84fb, v3
+; VI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, s6, v0
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v2, 1.0, v1, vcc
+; VI-GISEL-NEXT:    v_mul_f32_e32 v2, s6, v2
+; VI-GISEL-NEXT:    v_log_f32_e32 v2, v2
+; VI-GISEL-NEXT:    v_and_b32_e32 v4, 0xfffff000, v2
+; VI-GISEL-NEXT:    v_sub_f32_e32 v5, v2, v4
 ; VI-GISEL-NEXT:    v_mul_f32_e32 v6, 0x369a84fb, v4
+; VI-GISEL-NEXT:    v_mul_f32_e32 v7, 0x369a84fb, v5
+; VI-GISEL-NEXT:    v_mul_f32_e32 v5, 0x3e9a2000, v5
+; VI-GISEL-NEXT:    v_add_f32_e32 v6, v6, v7
 ; VI-GISEL-NEXT:    v_mul_f32_e32 v4, 0x3e9a2000, v4
 ; VI-GISEL-NEXT:    v_add_f32_e32 v5, v5, v6
-; VI-GISEL-NEXT:    v_mul_f32_e32 v3, 0x3e9a2000, v3
 ; VI-GISEL-NEXT:    v_add_f32_e32 v4, v4, v5
-; VI-GISEL-NEXT:    v_add_f32_e32 v3, v3, v4
-; VI-GISEL-NEXT:    v_cmp_lt_f32_e64 s[0:1], |v1|, s0
-; VI-GISEL-NEXT:    v_cndmask_b32_e64 v1, v1, v3, s[0:1]
-; VI-GISEL-NEXT:    v_cmp_lt_f32_e64 s[0:1], s7, v2
-; VI-GISEL-NEXT:    v_cndmask_b32_e64 v0, 1.0, v0, s[0:1]
+; VI-GISEL-NEXT:    v_cmp_lt_f32_e64 s[0:1], |v2|, v3
+; VI-GISEL-NEXT:    v_cndmask_b32_e64 v2, v2, v4, s[0:1]
+; VI-GISEL-NEXT:    v_cmp_lt_f32_e64 s[0:1], s7, v0
+; VI-GISEL-NEXT:    v_cndmask_b32_e64 v0, 1.0, v1, s[0:1]
 ; VI-GISEL-NEXT:    v_mul_f32_e32 v0, s7, v0
-; VI-GISEL-NEXT:    v_log_f32_e32 v2, v0
-; VI-GISEL-NEXT:    v_mov_b32_e32 v3, 0x411a209b
-; VI-GISEL-NEXT:    v_cndmask_b32_e32 v0, 0, v3, vcc
-; VI-GISEL-NEXT:    v_sub_f32_e32 v0, v1, v0
-; VI-GISEL-NEXT:    v_and_b32_e32 v1, 0xfffff000, v2
-; VI-GISEL-NEXT:    v_sub_f32_e32 v5, v2, v1
+; VI-GISEL-NEXT:    v_log_f32_e32 v1, v0
+; VI-GISEL-NEXT:    v_mov_b32_e32 v4, 0x411a209b
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v0, 0, v4, vcc
+; VI-GISEL-NEXT:    v_sub_f32_e32 v0, v2, v0
+; VI-GISEL-NEXT:    v_and_b32_e32 v2, 0xfffff000, v1
+; VI-GISEL-NEXT:    v_sub_f32_e32 v5, v1, v2
 ; VI-GISEL-NEXT:    v_mul_f32_e32 v6, 0x369a84fb, v5
-; VI-GISEL-NEXT:    v_mul_f32_e32 v7, 0x369a84fb, v1
+; VI-GISEL-NEXT:    v_mul_f32_e32 v7, 0x369a84fb, v2
 ; VI-GISEL-NEXT:    v_add_f32_e32 v6, v7, v6
 ; VI-GISEL-NEXT:    v_mul_f32_e32 v5, 0x3e9a2000, v5
-; VI-GISEL-NEXT:    v_mov_b32_e32 v4, 0x7f800000
 ; VI-GISEL-NEXT:    v_add_f32_e32 v5, v5, v6
-; VI-GISEL-NEXT:    v_mul_f32_e32 v1, 0x3e9a2000, v1
-; VI-GISEL-NEXT:    v_add_f32_e32 v1, v1, v5
-; VI-GISEL-NEXT:    v_cmp_lt_f32_e64 vcc, |v2|, v4
-; VI-GISEL-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
-; VI-GISEL-NEXT:    v_cndmask_b32_e64 v2, 0, v3, s[0:1]
+; VI-GISEL-NEXT:    v_mul_f32_e32 v2, 0x3e9a2000, v2
+; VI-GISEL-NEXT:    v_add_f32_e32 v2, v2, v5
+; VI-GISEL-NEXT:    v_cmp_lt_f32_e64 vcc, |v1|, v3
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
+; VI-GISEL-NEXT:    v_cndmask_b32_e64 v2, 0, v4, s[0:1]
 ; VI-GISEL-NEXT:    v_sub_f32_e32 v1, v1, v2
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v2, s4
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v3, s5
@@ -537,39 +531,36 @@ define amdgpu_kernel void @s_log10_v2f32(ptr addrspace(1) %out, <2 x float> %in)
 ; GFX900-GISEL-LABEL: s_log10_v2f32:
 ; GFX900-GISEL:       ; %bb.0:
 ; GFX900-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX900-GISEL-NEXT:    s_mov_b32 s0, 0x800000
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v0, 0x800000
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x4f800000
-; GFX900-GISEL-NEXT:    s_mov_b32 s2, 0x3284fbcf
-; GFX900-GISEL-NEXT:    s_mov_b32 s3, 0x7f800000
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v3, 0x3e9a209a
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v4, 0x3284fbcf
 ; GFX900-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX900-GISEL-NEXT:    v_mov_b32_e32 v2, s6
-; GFX900-GISEL-NEXT:    v_cmp_gt_f32_e32 vcc, s0, v2
+; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, s6, v0
 ; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v2, 1.0, v1, vcc
 ; GFX900-GISEL-NEXT:    v_mul_f32_e32 v2, s6, v2
 ; GFX900-GISEL-NEXT:    v_log_f32_e32 v2, v2
-; GFX900-GISEL-NEXT:    s_mov_b32 s0, 0x3e9a209a
-; GFX900-GISEL-NEXT:    v_mov_b32_e32 v0, 0x800000
-; GFX900-GISEL-NEXT:    v_mov_b32_e32 v3, 0x3e9a209a
-; GFX900-GISEL-NEXT:    v_mul_f32_e32 v4, 0x3e9a209a, v2
-; GFX900-GISEL-NEXT:    v_fma_f32 v5, v2, s0, -v4
-; GFX900-GISEL-NEXT:    v_fma_f32 v5, v2, s2, v5
-; GFX900-GISEL-NEXT:    v_add_f32_e32 v4, v4, v5
-; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e64 s[0:1], |v2|, s3
-; GFX900-GISEL-NEXT:    v_cndmask_b32_e64 v2, v2, v4, s[0:1]
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v5, 0x7f800000
+; GFX900-GISEL-NEXT:    v_mul_f32_e32 v6, 0x3e9a209a, v2
+; GFX900-GISEL-NEXT:    v_fma_f32 v7, v2, v3, -v6
+; GFX900-GISEL-NEXT:    v_fma_f32 v7, v2, v4, v7
+; GFX900-GISEL-NEXT:    v_add_f32_e32 v6, v6, v7
+; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e64 s[0:1], |v2|, v5
+; GFX900-GISEL-NEXT:    v_cndmask_b32_e64 v2, v2, v6, s[0:1]
 ; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e64 s[0:1], s7, v0
 ; GFX900-GISEL-NEXT:    v_cndmask_b32_e64 v0, 1.0, v1, s[0:1]
 ; GFX900-GISEL-NEXT:    v_mul_f32_e32 v0, s7, v0
 ; GFX900-GISEL-NEXT:    v_log_f32_e32 v1, v0
-; GFX900-GISEL-NEXT:    v_mov_b32_e32 v4, 0x411a209b
-; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v0, 0, v4, vcc
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v6, 0x411a209b
+; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v0, 0, v6, vcc
 ; GFX900-GISEL-NEXT:    v_sub_f32_e32 v0, v2, v0
 ; GFX900-GISEL-NEXT:    v_mul_f32_e32 v2, 0x3e9a209a, v1
 ; GFX900-GISEL-NEXT:    v_fma_f32 v3, v1, v3, -v2
-; GFX900-GISEL-NEXT:    v_fma_f32 v3, v1, s2, v3
+; GFX900-GISEL-NEXT:    v_fma_f32 v3, v1, v4, v3
 ; GFX900-GISEL-NEXT:    v_add_f32_e32 v2, v2, v3
-; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e64 vcc, |v1|, s3
+; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e64 vcc, |v1|, v5
 ; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
-; GFX900-GISEL-NEXT:    v_cndmask_b32_e64 v2, 0, v4, s[0:1]
+; GFX900-GISEL-NEXT:    v_cndmask_b32_e64 v2, 0, v6, s[0:1]
 ; GFX900-GISEL-NEXT:    v_sub_f32_e32 v1, v1, v2
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX900-GISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
@@ -627,7 +618,7 @@ define amdgpu_kernel void @s_log10_v2f32(ptr addrspace(1) %out, <2 x float> %in)
 ; GFX1100-GISEL-NEXT:    s_waitcnt_depctr 0xfff
 ; GFX1100-GISEL-NEXT:    v_dual_mul_f32 v2, 0x3e9a209a, v0 :: v_dual_mul_f32 v3, 0x3e9a209a, v1
 ; GFX1100-GISEL-NEXT:    v_cmp_gt_f32_e64 vcc_lo, 0x7f800000, |v0|
-; GFX1100-GISEL-NEXT:    v_fma_f32 v4, v0, 0x3e9a209a, -v2
+; GFX1100-GISEL-NEXT:    v_fma_f32 v4, 0x3e9a209a, v0, -v2
 ; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1100-GISEL-NEXT:    v_fma_f32 v5, 0x3e9a209a, v1, -v3
 ; GFX1100-GISEL-NEXT:    v_dual_fmac_f32 v4, 0x3284fbcf, v0 :: v_dual_fmac_f32 v5, 0x3284fbcf, v1
@@ -813,54 +804,49 @@ define amdgpu_kernel void @s_log10_v3f32(ptr addrspace(1) %out, <3 x float> %in)
 ; SI-GISEL:       ; %bb.0:
 ; SI-GISEL-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0xd
 ; SI-GISEL-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
-; SI-GISEL-NEXT:    s_mov_b32 s0, 0x800000
+; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x4f800000
-; SI-GISEL-NEXT:    s_mov_b32 s2, 0x3284fbcf
+; SI-GISEL-NEXT:    v_mov_b32_e32 v3, 0x3e9a209a
 ; SI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-GISEL-NEXT:    v_mov_b32_e32 v0, s8
-; SI-GISEL-NEXT:    v_cmp_gt_f32_e32 vcc, s0, v0
+; SI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, s8, v1
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v0, 1.0, v2, vcc
 ; SI-GISEL-NEXT:    v_mul_f32_e32 v0, s8, v0
 ; SI-GISEL-NEXT:    v_log_f32_e32 v0, v0
-; SI-GISEL-NEXT:    s_mov_b32 s0, 0x3e9a209a
-; SI-GISEL-NEXT:    s_mov_b32 s3, 0x7f800000
-; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
-; SI-GISEL-NEXT:    v_mul_f32_e32 v4, 0x3e9a209a, v0
-; SI-GISEL-NEXT:    v_fma_f32 v5, v0, s0, -v4
-; SI-GISEL-NEXT:    v_fma_f32 v5, v0, s2, v5
-; SI-GISEL-NEXT:    v_add_f32_e32 v4, v4, v5
-; SI-GISEL-NEXT:    v_cmp_lt_f32_e64 s[0:1], |v0|, s3
-; SI-GISEL-NEXT:    v_cndmask_b32_e64 v0, v0, v4, s[0:1]
+; SI-GISEL-NEXT:    v_mov_b32_e32 v4, 0x3284fbcf
+; SI-GISEL-NEXT:    v_mov_b32_e32 v5, 0x7f800000
+; SI-GISEL-NEXT:    s_mov_b32 s6, -1
+; SI-GISEL-NEXT:    v_mul_f32_e32 v6, 0x3e9a209a, v0
+; SI-GISEL-NEXT:    v_fma_f32 v7, v0, v3, -v6
+; SI-GISEL-NEXT:    v_fma_f32 v7, v0, v4, v7
+; SI-GISEL-NEXT:    v_add_f32_e32 v6, v6, v7
+; SI-GISEL-NEXT:    v_cmp_lt_f32_e64 s[0:1], |v0|, v5
+; SI-GISEL-NEXT:    v_cndmask_b32_e64 v0, v0, v6, s[0:1]
 ; SI-GISEL-NEXT:    v_cmp_lt_f32_e64 s[0:1], s9, v1
-; SI-GISEL-NEXT:    v_cndmask_b32_e64 v4, 1.0, v2, s[0:1]
-; SI-GISEL-NEXT:    v_mul_f32_e32 v4, s9, v4
-; SI-GISEL-NEXT:    v_log_f32_e32 v4, v4
+; SI-GISEL-NEXT:    v_cndmask_b32_e64 v6, 1.0, v2, s[0:1]
+; SI-GISEL-NEXT:    v_mul_f32_e32 v6, s9, v6
+; SI-GISEL-NEXT:    v_log_f32_e32 v6, v6
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v7, 0x411a209b
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v8, 0, v7, vcc
 ; SI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, s10, v1
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 1.0, v2, vcc
-; SI-GISEL-NEXT:    v_mov_b32_e32 v3, 0x3e9a209a
 ; SI-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v8
-; SI-GISEL-NEXT:    v_mul_f32_e32 v8, 0x3e9a209a, v4
+; SI-GISEL-NEXT:    v_mul_f32_e32 v8, 0x3e9a209a, v6
 ; SI-GISEL-NEXT:    v_mul_f32_e32 v1, s10, v1
-; SI-GISEL-NEXT:    v_fma_f32 v9, v4, v3, -v8
+; SI-GISEL-NEXT:    v_fma_f32 v9, v6, v3, -v8
 ; SI-GISEL-NEXT:    v_log_f32_e32 v2, v1
-; SI-GISEL-NEXT:    v_fma_f32 v9, v4, s2, v9
+; SI-GISEL-NEXT:    v_fma_f32 v9, v6, v4, v9
 ; SI-GISEL-NEXT:    v_add_f32_e32 v8, v8, v9
-; SI-GISEL-NEXT:    v_cmp_lt_f32_e64 s[2:3], |v4|, s3
-; SI-GISEL-NEXT:    v_cndmask_b32_e64 v1, v4, v8, s[2:3]
-; SI-GISEL-NEXT:    v_cndmask_b32_e64 v4, 0, v7, s[0:1]
-; SI-GISEL-NEXT:    v_sub_f32_e32 v1, v1, v4
-; SI-GISEL-NEXT:    v_mul_f32_e32 v4, 0x3e9a209a, v2
-; SI-GISEL-NEXT:    v_mov_b32_e32 v6, 0x3284fbcf
-; SI-GISEL-NEXT:    v_fma_f32 v3, v2, v3, -v4
-; SI-GISEL-NEXT:    v_mov_b32_e32 v5, 0x7f800000
-; SI-GISEL-NEXT:    v_fma_f32 v3, v2, v6, v3
-; SI-GISEL-NEXT:    v_add_f32_e32 v3, v4, v3
+; SI-GISEL-NEXT:    v_cmp_lt_f32_e64 s[2:3], |v6|, v5
+; SI-GISEL-NEXT:    v_cndmask_b32_e64 v1, v6, v8, s[2:3]
+; SI-GISEL-NEXT:    v_cndmask_b32_e64 v6, 0, v7, s[0:1]
+; SI-GISEL-NEXT:    v_sub_f32_e32 v1, v1, v6
+; SI-GISEL-NEXT:    v_mul_f32_e32 v6, 0x3e9a209a, v2
+; SI-GISEL-NEXT:    v_fma_f32 v3, v2, v3, -v6
+; SI-GISEL-NEXT:    v_fma_f32 v3, v2, v4, v3
+; SI-GISEL-NEXT:    v_add_f32_e32 v3, v6, v3
 ; SI-GISEL-NEXT:    v_cmp_lt_f32_e64 s[0:1], |v2|, v5
 ; SI-GISEL-NEXT:    v_cndmask_b32_e64 v2, v2, v3, s[0:1]
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v3, 0, v7, vcc
-; SI-GISEL-NEXT:    s_mov_b32 s6, -1
 ; SI-GISEL-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-GISEL-NEXT:    v_sub_f32_e32 v2, v2, v3
 ; SI-GISEL-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
@@ -937,45 +923,42 @@ define amdgpu_kernel void @s_log10_v3f32(ptr addrspace(1) %out, <3 x float> %in)
 ; VI-GISEL:       ; %bb.0:
 ; VI-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x34
 ; VI-GISEL-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x24
-; VI-GISEL-NEXT:    s_mov_b32 s2, 0x800000
-; VI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x4f800000
-; VI-GISEL-NEXT:    s_mov_b32 s0, 0x7f800000
+; VI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
+; VI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x4f800000
 ; VI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT:    v_mov_b32_e32 v0, s4
-; VI-GISEL-NEXT:    v_cmp_gt_f32_e32 vcc, s2, v0
-; VI-GISEL-NEXT:    v_cndmask_b32_e32 v0, 1.0, v1, vcc
+; VI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, s4, v1
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v0, 1.0, v2, vcc
 ; VI-GISEL-NEXT:    v_mul_f32_e32 v0, s4, v0
 ; VI-GISEL-NEXT:    v_log_f32_e32 v0, v0
-; VI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x800000
 ; VI-GISEL-NEXT:    v_and_b32_e32 v3, 0xfffff000, v0
 ; VI-GISEL-NEXT:    v_sub_f32_e32 v4, v0, v3
 ; VI-GISEL-NEXT:    v_mul_f32_e32 v5, 0x369a84fb, v3
 ; VI-GISEL-NEXT:    v_mul_f32_e32 v6, 0x369a84fb, v4
 ; VI-GISEL-NEXT:    v_mul_f32_e32 v4, 0x3e9a2000, v4
 ; VI-GISEL-NEXT:    v_add_f32_e32 v5, v5, v6
-; VI-GISEL-NEXT:    v_add_f32_e32 v4, v4, v5
 ; VI-GISEL-NEXT:    v_mul_f32_e32 v3, 0x3e9a2000, v3
+; VI-GISEL-NEXT:    v_add_f32_e32 v4, v4, v5
 ; VI-GISEL-NEXT:    v_add_f32_e32 v3, v3, v4
-; VI-GISEL-NEXT:    v_cmp_lt_f32_e64 s[0:1], |v0|, s0
+; VI-GISEL-NEXT:    v_mov_b32_e32 v4, 0x7f800000
+; VI-GISEL-NEXT:    v_cmp_lt_f32_e64 s[0:1], |v0|, v4
 ; VI-GISEL-NEXT:    v_cndmask_b32_e64 v0, v0, v3, s[0:1]
-; VI-GISEL-NEXT:    v_cmp_lt_f32_e64 s[0:1], s5, v2
-; VI-GISEL-NEXT:    v_cndmask_b32_e64 v3, 1.0, v1, s[0:1]
+; VI-GISEL-NEXT:    v_cmp_lt_f32_e64 s[0:1], s5, v1
+; VI-GISEL-NEXT:    v_cndmask_b32_e64 v3, 1.0, v2, s[0:1]
 ; VI-GISEL-NEXT:    v_mul_f32_e32 v3, s5, v3
 ; VI-GISEL-NEXT:    v_log_f32_e32 v3, v3
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v5, 0x411a209b
 ; VI-GISEL-NEXT:    v_cndmask_b32_e32 v6, 0, v5, vcc
 ; VI-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v6
 ; VI-GISEL-NEXT:    v_and_b32_e32 v6, 0xfffff000, v3
-; VI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, s6, v2
+; VI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, s6, v1
 ; VI-GISEL-NEXT:    v_sub_f32_e32 v7, v3, v6
-; VI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 1.0, v1, vcc
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 1.0, v2, vcc
 ; VI-GISEL-NEXT:    v_mul_f32_e32 v8, 0x369a84fb, v7
 ; VI-GISEL-NEXT:    v_mul_f32_e32 v9, 0x369a84fb, v6
 ; VI-GISEL-NEXT:    v_mul_f32_e32 v1, s6, v1
 ; VI-GISEL-NEXT:    v_add_f32_e32 v8, v9, v8
 ; VI-GISEL-NEXT:    v_mul_f32_e32 v7, 0x3e9a2000, v7
 ; VI-GISEL-NEXT:    v_log_f32_e32 v2, v1
-; VI-GISEL-NEXT:    v_mov_b32_e32 v4, 0x7f800000
 ; VI-GISEL-NEXT:    v_add_f32_e32 v7, v7, v8
 ; VI-GISEL-NEXT:    v_mul_f32_e32 v6, 0x3e9a2000, v6
 ; VI-GISEL-NEXT:    v_add_f32_e32 v6, v6, v7
@@ -1056,50 +1039,45 @@ define amdgpu_kernel void @s_log10_v3f32(ptr addrspace(1) %out, <3 x float> %in)
 ; GFX900-GISEL:       ; %bb.0:
 ; GFX900-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x34
 ; GFX900-GISEL-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x24
-; GFX900-GISEL-NEXT:    s_mov_b32 s0, 0x800000
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v2, 0x4f800000
-; GFX900-GISEL-NEXT:    s_mov_b32 s2, 0x3284fbcf
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v3, 0x3e9a209a
 ; GFX900-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX900-GISEL-NEXT:    v_mov_b32_e32 v0, s4
-; GFX900-GISEL-NEXT:    v_cmp_gt_f32_e32 vcc, s0, v0
+; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, s4, v1
 ; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v0, 1.0, v2, vcc
 ; GFX900-GISEL-NEXT:    v_mul_f32_e32 v0, s4, v0
 ; GFX900-GISEL-NEXT:    v_log_f32_e32 v0, v0
-; GFX900-GISEL-NEXT:    s_mov_b32 s0, 0x3e9a209a
-; GFX900-GISEL-NEXT:    s_mov_b32 s3, 0x7f800000
-; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
-; GFX900-GISEL-NEXT:    v_mul_f32_e32 v4, 0x3e9a209a, v0
-; GFX900-GISEL-NEXT:    v_fma_f32 v5, v0, s0, -v4
-; GFX900-GISEL-NEXT:    v_fma_f32 v5, v0, s2, v5
-; GFX900-GISEL-NEXT:    v_add_f32_e32 v4, v4, v5
-; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e64 s[0:1], |v0|, s3
-; GFX900-GISEL-NEXT:    v_cndmask_b32_e64 v0, v0, v4, s[0:1]
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v4, 0x3284fbcf
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v5, 0x7f800000
+; GFX900-GISEL-NEXT:    v_mul_f32_e32 v6, 0x3e9a209a, v0
+; GFX900-GISEL-NEXT:    v_fma_f32 v7, v0, v3, -v6
+; GFX900-GISEL-NEXT:    v_fma_f32 v7, v0, v4, v7
+; GFX900-GISEL-NEXT:    v_add_f32_e32 v6, v6, v7
+; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e64 s[0:1], |v0|, v5
+; GFX900-GISEL-NEXT:    v_cndmask_b32_e64 v0, v0, v6, s[0:1]
 ; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e64 s[0:1], s5, v1
-; GFX900-GISEL-NEXT:    v_cndmask_b32_e64 v4, 1.0, v2, s[0:1]
-; GFX900-GISEL-NEXT:    v_mul_f32_e32 v4, s5, v4
-; GFX900-GISEL-NEXT:    v_log_f32_e32 v4, v4
+; GFX900-GISEL-NEXT:    v_cndmask_b32_e64 v6, 1.0, v2, s[0:1]
+; GFX900-GISEL-NEXT:    v_mul_f32_e32 v6, s5, v6
+; GFX900-GISEL-NEXT:    v_log_f32_e32 v6, v6
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v7, 0x411a209b
 ; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v8, 0, v7, vcc
 ; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, s6, v1
 ; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v1, 1.0, v2, vcc
-; GFX900-GISEL-NEXT:    v_mov_b32_e32 v3, 0x3e9a209a
 ; GFX900-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v8
-; GFX900-GISEL-NEXT:    v_mul_f32_e32 v8, 0x3e9a209a, v4
+; GFX900-GISEL-NEXT:    v_mul_f32_e32 v8, 0x3e9a209a, v6
 ; GFX900-GISEL-NEXT:    v_mul_f32_e32 v1, s6, v1
-; GFX900-GISEL-NEXT:    v_fma_f32 v9, v4, v3, -v8
+; GFX900-GISEL-NEXT:    v_fma_f32 v9, v6, v3, -v8
 ; GFX900-GISEL-NEXT:    v_log_f32_e32 v2, v1
-; GFX900-GISEL-NEXT:    v_fma_f32 v9, v4, s2, v9
+; GFX900-GISEL-NEXT:    v_fma_f32 v9, v6, v4, v9
 ; GFX900-GISEL-NEXT:    v_add_f32_e32 v8, v8, v9
-; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e64 s[2:3], |v4|, s3
-; GFX900-GISEL-NEXT:    v_cndmask_b32_e64 v1, v4, v8, s[2:3]
-; GFX900-GISEL-NEXT:    v_cndmask_b32_e64 v4, 0, v7, s[0:1]
-; GFX900-GISEL-NEXT:    v_sub_f32_e32 v1, v1, v4
-; GFX900-GISEL-NEXT:    v_mul_f32_e32 v4, 0x3e9a209a, v2
-; GFX900-GISEL-NEXT:    v_mov_b32_e32 v6, 0x3284fbcf
-; GFX900-GISEL-NEXT:    v_fma_f32 v3, v2, v3, -v4
-; GFX900-GISEL-NEXT:    v_mov_b32_e32 v5, 0x7f800000
-; GFX900-GISEL-NEXT:    v_fma_f32 v3, v2, v6, v3
-; GFX900-GISEL-NEXT:    v_add_f32_e32 v3, v4, v3
+; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e64 s[2:3], |v6|, v5
+; GFX900-GISEL-NEXT:    v_cndmask_b32_e64 v1, v6, v8, s[2:3]
+; GFX900-GISEL-NEXT:    v_cndmask_b32_e64 v6, 0, v7, s[0:1]
+; GFX900-GISEL-NEXT:    v_sub_f32_e32 v1, v1, v6
+; GFX900-GISEL-NEXT:    v_mul_f32_e32 v6, 0x3e9a209a, v2
+; GFX900-GISEL-NEXT:    v_fma_f32 v3, v2, v3, -v6
+; GFX900-GISEL-NEXT:    v_fma_f32 v3, v2, v4, v3
+; GFX900-GISEL-NEXT:    v_add_f32_e32 v3, v6, v3
 ; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e64 s[0:1], |v2|, v5
 ; GFX900-GISEL-NEXT:    v_cndmask_b32_e64 v2, v2, v3, s[0:1]
 ; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v3, 0, v7, vcc
@@ -1191,7 +1169,7 @@ define amdgpu_kernel void @s_log10_v3f32(ptr addrspace(1) %out, <3 x float> %in)
 ; GFX1100-GISEL-NEXT:    v_mul_f32_e32 v4, 0x3e9a209a, v1
 ; GFX1100-GISEL-NEXT:    v_cmp_gt_f32_e64 vcc_lo, 0x7f800000, |v0|
 ; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_4) | instid1(VALU_DEP_3)
-; GFX1100-GISEL-NEXT:    v_fma_f32 v6, v0, 0x3e9a209a, -v3
+; GFX1100-GISEL-NEXT:    v_fma_f32 v6, 0x3e9a209a, v0, -v3
 ; GFX1100-GISEL-NEXT:    s_waitcnt_depctr 0xfff
 ; GFX1100-GISEL-NEXT:    v_mul_f32_e32 v5, 0x3e9a209a, v2
 ; GFX1100-GISEL-NEXT:    v_fma_f32 v7, 0x3e9a209a, v1, -v4
@@ -1443,23 +1421,22 @@ define amdgpu_kernel void @s_log10_v4f32(ptr addrspace(1) %out, <4 x float> %in)
 ; SI-GISEL:       ; %bb.0:
 ; SI-GISEL-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0xd
 ; SI-GISEL-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
-; SI-GISEL-NEXT:    s_mov_b32 s0, 0x800000
+; SI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x800000
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v3, 0x4f800000
-; SI-GISEL-NEXT:    s_mov_b32 s2, 0x3284fbcf
+; SI-GISEL-NEXT:    v_mov_b32_e32 v4, 0x3e9a209a
 ; SI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-GISEL-NEXT:    v_mov_b32_e32 v0, s8
-; SI-GISEL-NEXT:    v_cmp_gt_f32_e32 vcc, s0, v0
+; SI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, s8, v2
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v0, 1.0, v3, vcc
 ; SI-GISEL-NEXT:    v_mul_f32_e32 v0, s8, v0
 ; SI-GISEL-NEXT:    v_log_f32_e32 v0, v0
-; SI-GISEL-NEXT:    s_mov_b32 s0, 0x3e9a209a
-; SI-GISEL-NEXT:    s_mov_b32 s3, 0x7f800000
-; SI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x800000
+; SI-GISEL-NEXT:    v_mov_b32_e32 v5, 0x3284fbcf
+; SI-GISEL-NEXT:    v_mov_b32_e32 v6, 0x7f800000
+; SI-GISEL-NEXT:    s_mov_b32 s6, -1
 ; SI-GISEL-NEXT:    v_mul_f32_e32 v1, 0x3e9a209a, v0
-; SI-GISEL-NEXT:    v_fma_f32 v5, v0, s0, -v1
-; SI-GISEL-NEXT:    v_fma_f32 v5, v0, s2, v5
-; SI-GISEL-NEXT:    v_add_f32_e32 v1, v1, v5
-; SI-GISEL-NEXT:    v_cmp_lt_f32_e64 s[0:1], |v0|, s3
+; SI-GISEL-NEXT:    v_fma_f32 v7, v0, v4, -v1
+; SI-GISEL-NEXT:    v_fma_f32 v7, v0, v5, v7
+; SI-GISEL-NEXT:    v_add_f32_e32 v1, v1, v7
+; SI-GISEL-NEXT:    v_cmp_lt_f32_e64 s[0:1], |v0|, v6
 ; SI-GISEL-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s[0:1]
 ; SI-GISEL-NEXT:    v_cmp_lt_f32_e64 s[0:1], s9, v2
 ; SI-GISEL-NEXT:    v_cndmask_b32_e64 v1, 1.0, v3, s[0:1]
@@ -1467,17 +1444,16 @@ define amdgpu_kernel void @s_log10_v4f32(ptr addrspace(1) %out, <4 x float> %in)
 ; SI-GISEL-NEXT:    v_log_f32_e32 v1, v1
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v7, 0x411a209b
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v8, 0, v7, vcc
-; SI-GISEL-NEXT:    v_mov_b32_e32 v4, 0x3e9a209a
 ; SI-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v8
 ; SI-GISEL-NEXT:    v_mul_f32_e32 v8, 0x3e9a209a, v1
 ; SI-GISEL-NEXT:    v_fma_f32 v9, v1, v4, -v8
-; SI-GISEL-NEXT:    v_fma_f32 v9, v1, s2, v9
+; SI-GISEL-NEXT:    v_fma_f32 v9, v1, v5, v9
 ; SI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, s10, v2
 ; SI-GISEL-NEXT:    v_add_f32_e32 v8, v8, v9
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v9, 1.0, v3, vcc
 ; SI-GISEL-NEXT:    v_mul_f32_e32 v9, s10, v9
 ; SI-GISEL-NEXT:    v_log_f32_e32 v9, v9
-; SI-GISEL-NEXT:    v_cmp_lt_f32_e64 s[2:3], |v1|, s3
+; SI-GISEL-NEXT:    v_cmp_lt_f32_e64 s[2:3], |v1|, v6
 ; SI-GISEL-NEXT:    v_cndmask_b32_e64 v1, v1, v8, s[2:3]
 ; SI-GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, v7, s[0:1]
 ; SI-GISEL-NEXT:    v_cmp_lt_f32_e64 s[0:1], s11, v2
@@ -1485,25 +1461,22 @@ define amdgpu_kernel void @s_log10_v4f32(ptr addrspace(1) %out, <4 x float> %in)
 ; SI-GISEL-NEXT:    v_sub_f32_e32 v1, v1, v8
 ; SI-GISEL-NEXT:    v_mul_f32_e32 v8, 0x3e9a209a, v9
 ; SI-GISEL-NEXT:    v_mul_f32_e32 v2, s11, v2
-; SI-GISEL-NEXT:    v_mov_b32_e32 v6, 0x3284fbcf
 ; SI-GISEL-NEXT:    v_fma_f32 v10, v9, v4, -v8
 ; SI-GISEL-NEXT:    v_log_f32_e32 v3, v2
-; SI-GISEL-NEXT:    v_mov_b32_e32 v5, 0x7f800000
-; SI-GISEL-NEXT:    v_fma_f32 v10, v9, v6, v10
+; SI-GISEL-NEXT:    v_fma_f32 v10, v9, v5, v10
 ; SI-GISEL-NEXT:    v_add_f32_e32 v8, v8, v10
-; SI-GISEL-NEXT:    v_cmp_lt_f32_e64 s[2:3], |v9|, v5
+; SI-GISEL-NEXT:    v_cmp_lt_f32_e64 s[2:3], |v9|, v6
 ; SI-GISEL-NEXT:    v_cndmask_b32_e64 v2, v9, v8, s[2:3]
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v8, 0, v7, vcc
 ; SI-GISEL-NEXT:    v_sub_f32_e32 v2, v2, v8
 ; SI-GISEL-NEXT:    v_mul_f32_e32 v8, 0x3e9a209a, v3
 ; SI-GISEL-NEXT:    v_fma_f32 v4, v3, v4, -v8
-; SI-GISEL-NEXT:    v_fma_f32 v4, v3, v6, v4
+; SI-GISEL-NEXT:    v_fma_f32 v4, v3, v5, v4
 ; SI-GISEL-NEXT:    v_add_f32_e32 v4, v8, v4
-; SI-GISEL-NEXT:    v_cmp_lt_f32_e64 vcc, |v3|, v5
+; SI-GISEL-NEXT:    v_cmp_lt_f32_e64 vcc, |v3|, v6
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
 ; SI-GISEL-NEXT:    v_cndmask_b32_e64 v4, 0, v7, s[0:1]
 ; SI-GISEL-NEXT:    v_sub_f32_e32 v3, v3, v4
-; SI-GISEL-NEXT:    s_mov_b32 s6, -1
 ; SI-GISEL-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-GISEL-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
 ; SI-GISEL-NEXT:    s_endpgm
@@ -1594,29 +1567,27 @@ define amdgpu_kernel void @s_log10_v4f32(ptr addrspace(1) %out, <4 x float> %in)
 ; VI-GISEL:       ; %bb.0:
 ; VI-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x34
 ; VI-GISEL-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x24
-; VI-GISEL-NEXT:    s_mov_b32 s2, 0x800000
-; VI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x4f800000
-; VI-GISEL-NEXT:    s_mov_b32 s0, 0x7f800000
+; VI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x800000
+; VI-GISEL-NEXT:    v_mov_b32_e32 v3, 0x4f800000
 ; VI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT:    v_mov_b32_e32 v0, s4
-; VI-GISEL-NEXT:    v_cmp_gt_f32_e32 vcc, s2, v0
-; VI-GISEL-NEXT:    v_cndmask_b32_e32 v0, 1.0, v2, vcc
+; VI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, s4, v2
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v0, 1.0, v3, vcc
 ; VI-GISEL-NEXT:    v_mul_f32_e32 v0, s4, v0
 ; VI-GISEL-NEXT:    v_log_f32_e32 v0, v0
-; VI-GISEL-NEXT:    v_mov_b32_e32 v3, 0x800000
 ; VI-GISEL-NEXT:    v_and_b32_e32 v1, 0xfffff000, v0
 ; VI-GISEL-NEXT:    v_sub_f32_e32 v4, v0, v1
 ; VI-GISEL-NEXT:    v_mul_f32_e32 v5, 0x369a84fb, v1
 ; VI-GISEL-NEXT:    v_mul_f32_e32 v6, 0x369a84fb, v4
 ; VI-GISEL-NEXT:    v_mul_f32_e32 v4, 0x3e9a2000, v4
 ; VI-GISEL-NEXT:    v_add_f32_e32 v5, v5, v6
-; VI-GISEL-NEXT:    v_add_f32_e32 v4, v4, v5
 ; VI-GISEL-NEXT:    v_mul_f32_e32 v1, 0x3e9a2000, v1
+; VI-GISEL-NEXT:    v_add_f32_e32 v4, v4, v5
 ; VI-GISEL-NEXT:    v_add_f32_e32 v1, v1, v4
-; VI-GISEL-NEXT:    v_cmp_lt_f32_e64 s[0:1], |v0|, s0
+; VI-GISEL-NEXT:    v_mov_b32_e32 v4, 0x7f800000
+; VI-GISEL-NEXT:    v_cmp_lt_f32_e64 s[0:1], |v0|, v4
 ; VI-GISEL-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s[0:1]
-; VI-GISEL-NEXT:    v_cmp_lt_f32_e64 s[0:1], s5, v3
-; VI-GISEL-NEXT:    v_cndmask_b32_e64 v1, 1.0, v2, s[0:1]
+; VI-GISEL-NEXT:    v_cmp_lt_f32_e64 s[0:1], s5, v2
+; VI-GISEL-NEXT:    v_cndmask_b32_e64 v1, 1.0, v3, s[0:1]
 ; VI-GISEL-NEXT:    v_mul_f32_e32 v1, s5, v1
 ; VI-GISEL-NEXT:    v_log_f32_e32 v1, v1
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v5, 0x411a209b
@@ -1630,20 +1601,19 @@ define amdgpu_kernel void @s_log10_v4f32(ptr addrspace(1) %out, <4 x float> %in)
 ; VI-GISEL-NEXT:    v_mul_f32_e32 v7, 0x3e9a2000, v7
 ; VI-GISEL-NEXT:    v_add_f32_e32 v7, v7, v8
 ; VI-GISEL-NEXT:    v_mul_f32_e32 v6, 0x3e9a2000, v6
-; VI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, s6, v3
+; VI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, s6, v2
 ; VI-GISEL-NEXT:    v_add_f32_e32 v6, v6, v7
-; VI-GISEL-NEXT:    v_cndmask_b32_e32 v7, 1.0, v2, vcc
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v7, 1.0, v3, vcc
 ; VI-GISEL-NEXT:    v_mul_f32_e32 v7, s6, v7
 ; VI-GISEL-NEXT:    v_log_f32_e32 v7, v7
-; VI-GISEL-NEXT:    v_mov_b32_e32 v4, 0x7f800000
 ; VI-GISEL-NEXT:    v_cmp_lt_f32_e64 s[2:3], |v1|, v4
 ; VI-GISEL-NEXT:    v_cndmask_b32_e64 v1, v1, v6, s[2:3]
 ; VI-GISEL-NEXT:    v_cndmask_b32_e64 v6, 0, v5, s[0:1]
 ; VI-GISEL-NEXT:    v_sub_f32_e32 v1, v1, v6
 ; VI-GISEL-NEXT:    v_and_b32_e32 v6, 0xfffff000, v7
-; VI-GISEL-NEXT:    v_cmp_lt_f32_e64 s[0:1], s7, v3
+; VI-GISEL-NEXT:    v_cmp_lt_f32_e64 s[0:1], s7, v2
 ; VI-GISEL-NEXT:    v_sub_f32_e32 v8, v7, v6
-; VI-GISEL-NEXT:    v_cndmask_b32_e64 v2, 1.0, v2, s[0:1]
+; VI-GISEL-NEXT:    v_cndmask_b32_e64 v2, 1.0, v3, s[0:1]
 ; VI-GISEL-NEXT:    v_mul_f32_e32 v9, 0x369a84fb, v8
 ; VI-GISEL-NEXT:    v_mul_f32_e32 v10, 0x369a84fb, v6
 ; VI-GISEL-NEXT:    v_mul_f32_e32 v2, s7, v2
@@ -1742,23 +1712,21 @@ define amdgpu_kernel void @s_log10_v4f32(ptr addrspace(1) %out, <4 x float> %in)
 ; GFX900-GISEL:       ; %bb.0:
 ; GFX900-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x34
 ; GFX900-GISEL-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x24
-; GFX900-GISEL-NEXT:    s_mov_b32 s0, 0x800000
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v2, 0x800000
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v3, 0x4f800000
-; GFX900-GISEL-NEXT:    s_mov_b32 s2, 0x3284fbcf
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v4, 0x3e9a209a
 ; GFX900-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX900-GISEL-NEXT:    v_mov_b32_e32 v0, s4
-; GFX900-GISEL-NEXT:    v_cmp_gt_f32_e32 vcc, s0, v0
+; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, s4, v2
 ; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v0, 1.0, v3, vcc
 ; GFX900-GISEL-NEXT:    v_mul_f32_e32 v0, s4, v0
 ; GFX900-GISEL-NEXT:    v_log_f32_e32 v0, v0
-; GFX900-GISEL-NEXT:    s_mov_b32 s0, 0x3e9a209a
-; GFX900-GISEL-NEXT:    s_mov_b32 s3, 0x7f800000
-; GFX900-GISEL-NEXT:    v_mov_b32_e32 v2, 0x800000
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v5, 0x3284fbcf
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v6, 0x7f800000
 ; GFX900-GISEL-NEXT:    v_mul_f32_e32 v1, 0x3e9a209a, v0
-; GFX900-GISEL-NEXT:    v_fma_f32 v5, v0, s0, -v1
-; GFX900-GISEL-NEXT:    v_fma_f32 v5, v0, s2, v5
-; GFX900-GISEL-NEXT:    v_add_f32_e32 v1, v1, v5
-; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e64 s[0:1], |v0|, s3
+; GFX900-GISEL-NEXT:    v_fma_f32 v7, v0, v4, -v1
+; GFX900-GISEL-NEXT:    v_fma_f32 v7, v0, v5, v7
+; GFX900-GISEL-NEXT:    v_add_f32_e32 v1, v1, v7
+; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e64 s[0:1], |v0|, v6
 ; GFX900-GISEL-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s[0:1]
 ; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e64 s[0:1], s5, v2
 ; GFX900-GISEL-NEXT:    v_cndmask_b32_e64 v1, 1.0, v3, s[0:1]
@@ -1766,17 +1734,16 @@ define amdgpu_kernel void @s_log10_v4f32(ptr addrspace(1) %out, <4 x float> %in)
 ; GFX900-GISEL-NEXT:    v_log_f32_e32 v1, v1
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v7, 0x411a209b
 ; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v8, 0, v7, vcc
-; GFX900-GISEL-NEXT:    v_mov_b32_e32 v4, 0x3e9a209a
 ; GFX900-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v8
 ; GFX900-GISEL-NEXT:    v_mul_f32_e32 v8, 0x3e9a209a, v1
 ; GFX900-GISEL-NEXT:    v_fma_f32 v9, v1, v4, -v8
-; GFX900-GISEL-NEXT:    v_fma_f32 v9, v1, s2, v9
+; GFX900-GISEL-NEXT:    v_fma_f32 v9, v1, v5, v9
 ; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, s6, v2
 ; GFX900-GISEL-NEXT:    v_add_f32_e32 v8, v8, v9
 ; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v9, 1.0, v3, vcc
 ; GFX900-GISEL-NEXT:    v_mul_f32_e32 v9, s6, v9
 ; GFX900-GISEL-NEXT:    v_log_f32_e32 v9, v9
-; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e64 s[2:3], |v1|, s3
+; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e64 s[2:3], |v1|, v6
 ; GFX900-GISEL-NEXT:    v_cndmask_b32_e64 v1, v1, v8, s[2:3]
 ; GFX900-GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, v7, s[0:1]
 ; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e64 s[0:1], s7, v2
@@ -1784,21 +1751,19 @@ define amdgpu_kernel void @s_log10_v4f32(ptr addrspace(1) %out, <4 x float> %in)
 ; GFX900-GISEL-NEXT:    v_sub_f32_e32 v1, v1, v8
 ; GFX900-GISEL-NEXT:    v_mul_f32_e32 v8, 0x3e9a209a, v9
 ; GFX900-GISEL-NEXT:    v_mul_f32_e32 v2, s7, v2
-; GFX900-GISEL-NEXT:    v_mov_b32_e32 v6, 0x3284fbcf
 ; GFX900-GISEL-NEXT:    v_fma_f32 v10, v9, v4, -v8
 ; GFX900-GISEL-NEXT:    v_log_f32_e32 v3, v2
-; GFX900-GISEL-NEXT:    v_mov_b32_e32 v5, 0x7f800000
-; GFX900-GISEL-NEXT:    v_fma_f32 v10, v9, v6, v10
+; GFX900-GISEL-NEXT:    v_fma_f32 v10, v9, v5, v10
 ; GFX900-GISEL-NEXT:    v_add_f32_e32 v8, v8, v10
-; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e64 s[2:3], |v9|, v5
+; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e64 s[2:3], |v9|, v6
 ; GFX900-GISEL-NEXT:    v_cndmask_b32_e64 v2, v9, v8, s[2:3]
 ; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v8, 0, v7, vcc
 ; GFX900-GISEL-NEXT:    v_sub_f32_e32 v2, v2, v8
 ; GFX900-GISEL-NEXT:    v_mul_f32_e32 v8, 0x3e9a209a, v3
 ; GFX900-GISEL-NEXT:    v_fma_f32 v4, v3, v4, -v8
-; GFX900-GISEL-NEXT:    v_fma_f32 v4, v3, v6, v4
+; GFX900-GISEL-NEXT:    v_fma_f32 v4, v3, v5, v4
 ; GFX900-GISEL-NEXT:    v_add_f32_e32 v4, v8, v4
-; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e64 vcc, |v3|, v5
+; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e64 vcc, |v3|, v6
 ; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
 ; GFX900-GISEL-NEXT:    v_cndmask_b32_e64 v4, 0, v7, s[0:1]
 ; GFX900-GISEL-NEXT:    v_sub_f32_e32 v3, v3, v4
@@ -1900,7 +1865,7 @@ define amdgpu_kernel void @s_log10_v4f32(ptr addrspace(1) %out, <4 x float> %in)
 ; GFX1100-GISEL-NEXT:    s_waitcnt_depctr 0xfff
 ; GFX1100-GISEL-NEXT:    v_dual_mul_f32 v7, 0x3e9a209a, v2 :: v_dual_mul_f32 v8, 0x3e9a209a, v3
 ; GFX1100-GISEL-NEXT:    v_cmp_gt_f32_e64 vcc_lo, 0x7f800000, |v0|
-; GFX1100-GISEL-NEXT:    v_fma_f32 v10, v0, 0x3e9a209a, -v5
+; GFX1100-GISEL-NEXT:    v_fma_f32 v10, 0x3e9a209a, v0, -v5
 ; GFX1100-GISEL-NEXT:    v_fma_f32 v11, 0x3e9a209a, v1, -v6
 ; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3)
 ; GFX1100-GISEL-NEXT:    v_fma_f32 v12, 0x3e9a209a, v2, -v7
@@ -2144,12 +2109,12 @@ define float @v_log10_f32(float %in) {
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 1.0, v2, vcc
 ; SI-GISEL-NEXT:    v_mul_f32_e32 v0, v0, v1
 ; SI-GISEL-NEXT:    v_log_f32_e32 v0, v0
-; SI-GISEL-NEXT:    s_mov_b32 s4, 0x3e9a209a
+; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x3e9a209a
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v3, 0x3284fbcf
-; SI-GISEL-NEXT:    v_mul_f32_e32 v1, 0x3e9a209a, v0
-; SI-GISEL-NEXT:    v_fma_f32 v2, v0, s4, -v1
-; SI-GISEL-NEXT:    v_fma_f32 v2, v0, v3, v2
-; SI-GISEL-NEXT:    v_add_f32_e32 v1, v1, v2
+; SI-GISEL-NEXT:    v_mul_f32_e32 v2, 0x3e9a209a, v0
+; SI-GISEL-NEXT:    v_fma_f32 v1, v0, v1, -v2
+; SI-GISEL-NEXT:    v_fma_f32 v1, v0, v3, v1
+; SI-GISEL-NEXT:    v_add_f32_e32 v1, v2, v1
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x7f800000
 ; SI-GISEL-NEXT:    v_cmp_lt_f32_e64 s[4:5], |v0|, v2
 ; SI-GISEL-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s[4:5]
@@ -2242,12 +2207,12 @@ define float @v_log10_f32(float %in) {
 ; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v1, 1.0, v2, vcc
 ; GFX900-GISEL-NEXT:    v_mul_f32_e32 v0, v0, v1
 ; GFX900-GISEL-NEXT:    v_log_f32_e32 v0, v0
-; GFX900-GISEL-NEXT:    s_mov_b32 s4, 0x3e9a209a
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x3e9a209a
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v3, 0x3284fbcf
-; GFX900-GISEL-NEXT:    v_mul_f32_e32 v1, 0x3e9a209a, v0
-; GFX900-GISEL-NEXT:    v_fma_f32 v2, v0, s4, -v1
-; GFX900-GISEL-NEXT:    v_fma_f32 v2, v0, v3, v2
-; GFX900-GISEL-NEXT:    v_add_f32_e32 v1, v1, v2
+; GFX900-GISEL-NEXT:    v_mul_f32_e32 v2, 0x3e9a209a, v0
+; GFX900-GISEL-NEXT:    v_fma_f32 v1, v0, v1, -v2
+; GFX900-GISEL-NEXT:    v_fma_f32 v1, v0, v3, v1
+; GFX900-GISEL-NEXT:    v_add_f32_e32 v1, v2, v1
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v2, 0x7f800000
 ; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e64 s[4:5], |v0|, v2
 ; GFX900-GISEL-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s[4:5]
@@ -2290,7 +2255,7 @@ define float @v_log10_f32(float %in) {
 ; GFX1100-GISEL-NEXT:    v_mul_f32_e32 v1, 0x3e9a209a, v0
 ; GFX1100-GISEL-NEXT:    v_cmp_gt_f32_e64 s0, 0x7f800000, |v0|
 ; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-GISEL-NEXT:    v_fma_f32 v2, v0, 0x3e9a209a, -v1
+; GFX1100-GISEL-NEXT:    v_fma_f32 v2, 0x3e9a209a, v0, -v1
 ; GFX1100-GISEL-NEXT:    v_fmac_f32_e32 v2, 0x3284fbcf, v0
 ; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1100-GISEL-NEXT:    v_add_f32_e32 v1, v1, v2
@@ -2346,12 +2311,12 @@ define float @v_log10_fabs_f32(float %in) {
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 1.0, v2, vcc
 ; SI-GISEL-NEXT:    v_mul_f32_e64 v0, |v0|, v1
 ; SI-GISEL-NEXT:    v_log_f32_e32 v0, v0
-; SI-GISEL-NEXT:    s_mov_b32 s4, 0x3e9a209a
+; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x3e9a209a
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v3, 0x3284fbcf
-; SI-GISEL-NEXT:    v_mul_f32_e32 v1, 0x3e9a209a, v0
-; SI-GISEL-NEXT:    v_fma_f32 v2, v0, s4, -v1
-; SI-GISEL-NEXT:    v_fma_f32 v2, v0, v3, v2
-; SI-GISEL-NEXT:    v_add_f32_e32 v1, v1, v2
+; SI-GISEL-NEXT:    v_mul_f32_e32 v2, 0x3e9a209a, v0
+; SI-GISEL-NEXT:    v_fma_f32 v1, v0, v1, -v2
+; SI-GISEL-NEXT:    v_fma_f32 v1, v0, v3, v1
+; SI-GISEL-NEXT:    v_add_f32_e32 v1, v2, v1
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x7f800000
 ; SI-GISEL-NEXT:    v_cmp_lt_f32_e64 s[4:5], |v0|, v2
 ; SI-GISEL-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s[4:5]
@@ -2444,12 +2409,12 @@ define float @v_log10_fabs_f32(float %in) {
 ; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v1, 1.0, v2, vcc
 ; GFX900-GISEL-NEXT:    v_mul_f32_e64 v0, |v0|, v1
 ; GFX900-GISEL-NEXT:    v_log_f32_e32 v0, v0
-; GFX900-GISEL-NEXT:    s_mov_b32 s4, 0x3e9a209a
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x3e9a209a
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v3, 0x3284fbcf
-; GFX900-GISEL-NEXT:    v_mul_f32_e32 v1, 0x3e9a209a, v0
-; GFX900-GISEL-NEXT:    v_fma_f32 v2, v0, s4, -v1
-; GFX900-GISEL-NEXT:    v_fma_f32 v2, v0, v3, v2
-; GFX900-GISEL-NEXT:    v_add_f32_e32 v1, v1, v2
+; GFX900-GISEL-NEXT:    v_mul_f32_e32 v2, 0x3e9a209a, v0
+; GFX900-GISEL-NEXT:    v_fma_f32 v1, v0, v1, -v2
+; GFX900-GISEL-NEXT:    v_fma_f32 v1, v0, v3, v1
+; GFX900-GISEL-NEXT:    v_add_f32_e32 v1, v2, v1
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v2, 0x7f800000
 ; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e64 s[4:5], |v0|, v2
 ; GFX900-GISEL-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s[4:5]
@@ -2492,7 +2457,7 @@ define float @v_log10_fabs_f32(float %in) {
 ; GFX1100-GISEL-NEXT:    s_waitcnt_depctr 0xfff
 ; GFX1100-GISEL-NEXT:    v_mul_f32_e32 v1, 0x3e9a209a, v0
 ; GFX1100-GISEL-NEXT:    v_cmp_gt_f32_e64 vcc_lo, 0x7f800000, |v0|
-; GFX1100-GISEL-NEXT:    v_fma_f32 v2, v0, 0x3e9a209a, -v1
+; GFX1100-GISEL-NEXT:    v_fma_f32 v2, 0x3e9a209a, v0, -v1
 ; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1100-GISEL-NEXT:    v_fmac_f32_e32 v2, 0x3284fbcf, v0
 ; GFX1100-GISEL-NEXT:    v_add_f32_e32 v1, v1, v2
@@ -2549,12 +2514,12 @@ define float @v_log10_fneg_fabs_f32(float %in) {
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 1.0, v2, vcc
 ; SI-GISEL-NEXT:    v_mul_f32_e64 v0, -|v0|, v1
 ; SI-GISEL-NEXT:    v_log_f32_e32 v0, v0
-; SI-GISEL-NEXT:    s_mov_b32 s4, 0x3e9a209a
+; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x3e9a209a
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v3, 0x3284fbcf
-; SI-GISEL-NEXT:    v_mul_f32_e32 v1, 0x3e9a209a, v0
-; SI-GISEL-NEXT:    v_fma_f32 v2, v0, s4, -v1
-; SI-GISEL-NEXT:    v_fma_f32 v2, v0, v3, v2
-; SI-GISEL-NEXT:    v_add_f32_e32 v1, v1, v2
+; SI-GISEL-NEXT:    v_mul_f32_e32 v2, 0x3e9a209a, v0
+; SI-GISEL-NEXT:    v_fma_f32 v1, v0, v1, -v2
+; SI-GISEL-NEXT:    v_fma_f32 v1, v0, v3, v1
+; SI-GISEL-NEXT:    v_add_f32_e32 v1, v2, v1
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x7f800000
 ; SI-GISEL-NEXT:    v_cmp_lt_f32_e64 s[4:5], |v0|, v2
 ; SI-GISEL-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s[4:5]
@@ -2647,12 +2612,12 @@ define float @v_log10_fneg_fabs_f32(float %in) {
 ; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v1, 1.0, v2, vcc
 ; GFX900-GISEL-NEXT:    v_mul_f32_e64 v0, -|v0|, v1
 ; GFX900-GISEL-NEXT:    v_log_f32_e32 v0, v0
-; GFX900-GISEL-NEXT:    s_mov_b32 s4, 0x3e9a209a
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x3e9a209a
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v3, 0x3284fbcf
-; GFX900-GISEL-NEXT:    v_mul_f32_e32 v1, 0x3e9a209a, v0
-; GFX900-GISEL-NEXT:    v_fma_f32 v2, v0, s4, -v1
-; GFX900-GISEL-NEXT:    v_fma_f32 v2, v0, v3, v2
-; GFX900-GISEL-NEXT:    v_add_f32_e32 v1, v1, v2
+; GFX900-GISEL-NEXT:    v_mul_f32_e32 v2, 0x3e9a209a, v0
+; GFX900-GISEL-NEXT:    v_fma_f32 v1, v0, v1, -v2
+; GFX900-GISEL-NEXT:    v_fma_f32 v1, v0, v3, v1
+; GFX900-GISEL-NEXT:    v_add_f32_e32 v1, v2, v1
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v2, 0x7f800000
 ; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e64 s[4:5], |v0|, v2
 ; GFX900-GISEL-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s[4:5]
@@ -2695,7 +2660,7 @@ define float @v_log10_fneg_fabs_f32(float %in) {
 ; GFX1100-GISEL-NEXT:    s_waitcnt_depctr 0xfff
 ; GFX1100-GISEL-NEXT:    v_mul_f32_e32 v1, 0x3e9a209a, v0
 ; GFX1100-GISEL-NEXT:    v_cmp_gt_f32_e64 vcc_lo, 0x7f800000, |v0|
-; GFX1100-GISEL-NEXT:    v_fma_f32 v2, v0, 0x3e9a209a, -v1
+; GFX1100-GISEL-NEXT:    v_fma_f32 v2, 0x3e9a209a, v0, -v1
 ; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1100-GISEL-NEXT:    v_fmac_f32_e32 v2, 0x3284fbcf, v0
 ; GFX1100-GISEL-NEXT:    v_add_f32_e32 v1, v1, v2
@@ -2753,12 +2718,12 @@ define float @v_log10_fneg_f32(float %in) {
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 1.0, v2, vcc
 ; SI-GISEL-NEXT:    v_mul_f32_e64 v0, -v0, v1
 ; SI-GISEL-NEXT:    v_log_f32_e32 v0, v0
-; SI-GISEL-NEXT:    s_mov_b32 s4, 0x3e9a209a
+; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x3e9a209a
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v3, 0x3284fbcf
-; SI-GISEL-NEXT:    v_mul_f32_e32 v1, 0x3e9a209a, v0
-; SI-GISEL-NEXT:    v_fma_f32 v2, v0, s4, -v1
-; SI-GISEL-NEXT:    v_fma_f32 v2, v0, v3, v2
-; SI-GISEL-NEXT:    v_add_f32_e32 v1, v1, v2
+; SI-GISEL-NEXT:    v_mul_f32_e32 v2, 0x3e9a209a, v0
+; SI-GISEL-NEXT:    v_fma_f32 v1, v0, v1, -v2
+; SI-GISEL-NEXT:    v_fma_f32 v1, v0, v3, v1
+; SI-GISEL-NEXT:    v_add_f32_e32 v1, v2, v1
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x7f800000
 ; SI-GISEL-NEXT:    v_cmp_lt_f32_e64 s[4:5], |v0|, v2
 ; SI-GISEL-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s[4:5]
@@ -2851,12 +2816,12 @@ define float @v_log10_fneg_f32(float %in) {
 ; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v1, 1.0, v2, vcc
 ; GFX900-GISEL-NEXT:    v_mul_f32_e64 v0, -v0, v1
 ; GFX900-GISEL-NEXT:    v_log_f32_e32 v0, v0
-; GFX900-GISEL-NEXT:    s_mov_b32 s4, 0x3e9a209a
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x3e9a209a
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v3, 0x3284fbcf
-; GFX900-GISEL-NEXT:    v_mul_f32_e32 v1, 0x3e9a209a, v0
-; GFX900-GISEL-NEXT:    v_fma_f32 v2, v0, s4, -v1
-; GFX900-GISEL-NEXT:    v_fma_f32 v2, v0, v3, v2
-; GFX900-GISEL-NEXT:    v_add_f32_e32 v1, v1, v2
+; GFX900-GISEL-NEXT:    v_mul_f32_e32 v2, 0x3e9a209a, v0
+; GFX900-GISEL-NEXT:    v_fma_f32 v1, v0, v1, -v2
+; GFX900-GISEL-NEXT:    v_fma_f32 v1, v0, v3, v1
+; GFX900-GISEL-NEXT:    v_add_f32_e32 v1, v2, v1
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v2, 0x7f800000
 ; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e64 s[4:5], |v0|, v2
 ; GFX900-GISEL-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s[4:5]
@@ -2899,7 +2864,7 @@ define float @v_log10_fneg_f32(float %in) {
 ; GFX1100-GISEL-NEXT:    s_waitcnt_depctr 0xfff
 ; GFX1100-GISEL-NEXT:    v_mul_f32_e32 v1, 0x3e9a209a, v0
 ; GFX1100-GISEL-NEXT:    v_cmp_gt_f32_e64 vcc_lo, 0x7f800000, |v0|
-; GFX1100-GISEL-NEXT:    v_fma_f32 v2, v0, 0x3e9a209a, -v1
+; GFX1100-GISEL-NEXT:    v_fma_f32 v2, 0x3e9a209a, v0, -v1
 ; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1100-GISEL-NEXT:    v_fmac_f32_e32 v2, 0x3284fbcf, v0
 ; GFX1100-GISEL-NEXT:    v_add_f32_e32 v1, v1, v2
@@ -3310,12 +3275,12 @@ define float @v_log10_f32_ninf(float %in) {
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 1.0, v2, vcc
 ; SI-GISEL-NEXT:    v_mul_f32_e32 v0, v0, v1
 ; SI-GISEL-NEXT:    v_log_f32_e32 v0, v0
-; SI-GISEL-NEXT:    s_mov_b32 s4, 0x3e9a209a
+; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x3e9a209a
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v3, 0x3284fbcf
-; SI-GISEL-NEXT:    v_mul_f32_e32 v1, 0x3e9a209a, v0
-; SI-GISEL-NEXT:    v_fma_f32 v2, v0, s4, -v1
-; SI-GISEL-NEXT:    v_fma_f32 v2, v0, v3, v2
-; SI-GISEL-NEXT:    v_add_f32_e32 v1, v1, v2
+; SI-GISEL-NEXT:    v_mul_f32_e32 v2, 0x3e9a209a, v0
+; SI-GISEL-NEXT:    v_fma_f32 v1, v0, v1, -v2
+; SI-GISEL-NEXT:    v_fma_f32 v1, v0, v3, v1
+; SI-GISEL-NEXT:    v_add_f32_e32 v1, v2, v1
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x7f800000
 ; SI-GISEL-NEXT:    v_cmp_lt_f32_e64 s[4:5], |v0|, v2
 ; SI-GISEL-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s[4:5]
@@ -3408,12 +3373,12 @@ define float @v_log10_f32_ninf(float %in) {
 ; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v1, 1.0, v2, vcc
 ; GFX900-GISEL-NEXT:    v_mul_f32_e32 v0, v0, v1
 ; GFX900-GISEL-NEXT:    v_log_f32_e32 v0, v0
-; GFX900-GISEL-NEXT:    s_mov_b32 s4, 0x3e9a209a
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x3e9a209a
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v3, 0x3284fbcf
-; GFX900-GISEL-NEXT:    v_mul_f32_e32 v1, 0x3e9a209a, v0
-; GFX900-GISEL-NEXT:    v_fma_f32 v2, v0, s4, -v1
-; GFX900-GISEL-NEXT:    v_fma_f32 v2, v0, v3, v2
-; GFX900-GISEL-NEXT:    v_add_f32_e32 v1, v1, v2
+; GFX900-GISEL-NEXT:    v_mul_f32_e32 v2, 0x3e9a209a, v0
+; GFX900-GISEL-NEXT:    v_fma_f32 v1, v0, v1, -v2
+; GFX900-GISEL-NEXT:    v_fma_f32 v1, v0, v3, v1
+; GFX900-GISEL-NEXT:    v_add_f32_e32 v1, v2, v1
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v2, 0x7f800000
 ; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e64 s[4:5], |v0|, v2
 ; GFX900-GISEL-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s[4:5]
@@ -3456,7 +3421,7 @@ define float @v_log10_f32_ninf(float %in) {
 ; GFX1100-GISEL-NEXT:    v_mul_f32_e32 v1, 0x3e9a209a, v0
 ; GFX1100-GISEL-NEXT:    v_cmp_gt_f32_e64 s0, 0x7f800000, |v0|
 ; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-GISEL-NEXT:    v_fma_f32 v2, v0, 0x3e9a209a, -v1
+; GFX1100-GISEL-NEXT:    v_fma_f32 v2, 0x3e9a209a, v0, -v1
 ; GFX1100-GISEL-NEXT:    v_fmac_f32_e32 v2, 0x3284fbcf, v0
 ; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1100-GISEL-NEXT:    v_add_f32_e32 v1, v1, v2
@@ -3885,14 +3850,14 @@ define float @v_log10_f32_daz(float %in) #0 {
 ; SI-GISEL:       ; %bb.0:
 ; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    v_log_f32_e32 v0, v0
-; SI-GISEL-NEXT:    s_mov_b32 s4, 0x3e9a209a
-; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x3284fbcf
-; SI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x7f800000
-; SI-GISEL-NEXT:    v_mul_f32_e32 v3, 0x3e9a209a, v0
-; SI-GISEL-NEXT:    v_fma_f32 v4, v0, s4, -v3
-; SI-GISEL-NEXT:    v_fma_f32 v1, v0, v1, v4
-; SI-GISEL-NEXT:    v_add_f32_e32 v1, v3, v1
-; SI-GISEL-NEXT:    v_cmp_lt_f32_e64 vcc, |v0|, v2
+; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x3e9a209a
+; SI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x3284fbcf
+; SI-GISEL-NEXT:    v_mov_b32_e32 v3, 0x7f800000
+; SI-GISEL-NEXT:    v_mul_f32_e32 v4, 0x3e9a209a, v0
+; SI-GISEL-NEXT:    v_fma_f32 v1, v0, v1, -v4
+; SI-GISEL-NEXT:    v_fma_f32 v1, v0, v2, v1
+; SI-GISEL-NEXT:    v_add_f32_e32 v1, v4, v1
+; SI-GISEL-NEXT:    v_cmp_lt_f32_e64 vcc, |v0|, v3
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
 ; SI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -3951,14 +3916,14 @@ define float @v_log10_f32_daz(float %in) #0 {
 ; GFX900-GISEL:       ; %bb.0:
 ; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-GISEL-NEXT:    v_log_f32_e32 v0, v0
-; GFX900-GISEL-NEXT:    s_mov_b32 s4, 0x3e9a209a
-; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x3284fbcf
-; GFX900-GISEL-NEXT:    v_mov_b32_e32 v2, 0x7f800000
-; GFX900-GISEL-NEXT:    v_mul_f32_e32 v3, 0x3e9a209a, v0
-; GFX900-GISEL-NEXT:    v_fma_f32 v4, v0, s4, -v3
-; GFX900-GISEL-NEXT:    v_fma_f32 v1, v0, v1, v4
-; GFX900-GISEL-NEXT:    v_add_f32_e32 v1, v3, v1
-; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e64 vcc, |v0|, v2
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x3e9a209a
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v2, 0x3284fbcf
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v3, 0x7f800000
+; GFX900-GISEL-NEXT:    v_mul_f32_e32 v4, 0x3e9a209a, v0
+; GFX900-GISEL-NEXT:    v_fma_f32 v1, v0, v1, -v4
+; GFX900-GISEL-NEXT:    v_fma_f32 v1, v0, v2, v1
+; GFX900-GISEL-NEXT:    v_add_f32_e32 v1, v4, v1
+; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e64 vcc, |v0|, v3
 ; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
 ; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -3985,7 +3950,7 @@ define float @v_log10_f32_daz(float %in) #0 {
 ; GFX1100-GISEL-NEXT:    v_mul_f32_e32 v1, 0x3e9a209a, v0
 ; GFX1100-GISEL-NEXT:    v_cmp_gt_f32_e64 vcc_lo, 0x7f800000, |v0|
 ; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-GISEL-NEXT:    v_fma_f32 v2, v0, 0x3e9a209a, -v1
+; GFX1100-GISEL-NEXT:    v_fma_f32 v2, 0x3e9a209a, v0, -v1
 ; GFX1100-GISEL-NEXT:    v_fmac_f32_e32 v2, 0x3284fbcf, v0
 ; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1100-GISEL-NEXT:    v_add_f32_e32 v1, v1, v2
@@ -4038,12 +4003,12 @@ define float @v_log10_f32_nnan(float %in) {
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 1.0, v2, vcc
 ; SI-GISEL-NEXT:    v_mul_f32_e32 v0, v0, v1
 ; SI-GISEL-NEXT:    v_log_f32_e32 v0, v0
-; SI-GISEL-NEXT:    s_mov_b32 s4, 0x3e9a209a
+; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x3e9a209a
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v3, 0x3284fbcf
-; SI-GISEL-NEXT:    v_mul_f32_e32 v1, 0x3e9a209a, v0
-; SI-GISEL-NEXT:    v_fma_f32 v2, v0, s4, -v1
-; SI-GISEL-NEXT:    v_fma_f32 v2, v0, v3, v2
-; SI-GISEL-NEXT:    v_add_f32_e32 v1, v1, v2
+; SI-GISEL-NEXT:    v_mul_f32_e32 v2, 0x3e9a209a, v0
+; SI-GISEL-NEXT:    v_fma_f32 v1, v0, v1, -v2
+; SI-GISEL-NEXT:    v_fma_f32 v1, v0, v3, v1
+; SI-GISEL-NEXT:    v_add_f32_e32 v1, v2, v1
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x7f800000
 ; SI-GISEL-NEXT:    v_cmp_lt_f32_e64 s[4:5], |v0|, v2
 ; SI-GISEL-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s[4:5]
@@ -4136,12 +4101,12 @@ define float @v_log10_f32_nnan(float %in) {
 ; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v1, 1.0, v2, vcc
 ; GFX900-GISEL-NEXT:    v_mul_f32_e32 v0, v0, v1
 ; GFX900-GISEL-NEXT:    v_log_f32_e32 v0, v0
-; GFX900-GISEL-NEXT:    s_mov_b32 s4, 0x3e9a209a
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x3e9a209a
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v3, 0x3284fbcf
-; GFX900-GISEL-NEXT:    v_mul_f32_e32 v1, 0x3e9a209a, v0
-; GFX900-GISEL-NEXT:    v_fma_f32 v2, v0, s4, -v1
-; GFX900-GISEL-NEXT:    v_fma_f32 v2, v0, v3, v2
-; GFX900-GISEL-NEXT:    v_add_f32_e32 v1, v1, v2
+; GFX900-GISEL-NEXT:    v_mul_f32_e32 v2, 0x3e9a209a, v0
+; GFX900-GISEL-NEXT:    v_fma_f32 v1, v0, v1, -v2
+; GFX900-GISEL-NEXT:    v_fma_f32 v1, v0, v3, v1
+; GFX900-GISEL-NEXT:    v_add_f32_e32 v1, v2, v1
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v2, 0x7f800000
 ; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e64 s[4:5], |v0|, v2
 ; GFX900-GISEL-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s[4:5]
@@ -4184,7 +4149,7 @@ define float @v_log10_f32_nnan(float %in) {
 ; GFX1100-GISEL-NEXT:    v_mul_f32_e32 v1, 0x3e9a209a, v0
 ; GFX1100-GISEL-NEXT:    v_cmp_gt_f32_e64 s0, 0x7f800000, |v0|
 ; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-GISEL-NEXT:    v_fma_f32 v2, v0, 0x3e9a209a, -v1
+; GFX1100-GISEL-NEXT:    v_fma_f32 v2, 0x3e9a209a, v0, -v1
 ; GFX1100-GISEL-NEXT:    v_fmac_f32_e32 v2, 0x3284fbcf, v0
 ; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1100-GISEL-NEXT:    v_add_f32_e32 v1, v1, v2
@@ -4227,14 +4192,14 @@ define float @v_log10_f32_nnan_daz(float %in) #0 {
 ; SI-GISEL:       ; %bb.0:
 ; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    v_log_f32_e32 v0, v0
-; SI-GISEL-NEXT:    s_mov_b32 s4, 0x3e9a209a
-; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x3284fbcf
-; SI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x7f800000
-; SI-GISEL-NEXT:    v_mul_f32_e32 v3, 0x3e9a209a, v0
-; SI-GISEL-NEXT:    v_fma_f32 v4, v0, s4, -v3
-; SI-GISEL-NEXT:    v_fma_f32 v1, v0, v1, v4
-; SI-GISEL-NEXT:    v_add_f32_e32 v1, v3, v1
-; SI-GISEL-NEXT:    v_cmp_lt_f32_e64 vcc, |v0|, v2
+; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x3e9a209a
+; SI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x3284fbcf
+; SI-GISEL-NEXT:    v_mov_b32_e32 v3, 0x7f800000
+; SI-GISEL-NEXT:    v_mul_f32_e32 v4, 0x3e9a209a, v0
+; SI-GISEL-NEXT:    v_fma_f32 v1, v0, v1, -v4
+; SI-GISEL-NEXT:    v_fma_f32 v1, v0, v2, v1
+; SI-GISEL-NEXT:    v_add_f32_e32 v1, v4, v1
+; SI-GISEL-NEXT:    v_cmp_lt_f32_e64 vcc, |v0|, v3
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
 ; SI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -4293,14 +4258,14 @@ define float @v_log10_f32_nnan_daz(float %in) #0 {
 ; GFX900-GISEL:       ; %bb.0:
 ; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-GISEL-NEXT:    v_log_f32_e32 v0, v0
-; GFX900-GISEL-NEXT:    s_mov_b32 s4, 0x3e9a209a
-; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x3284fbcf
-; GFX900-GISEL-NEXT:    v_mov_b32_e32 v2, 0x7f800000
-; GFX900-GISEL-NEXT:    v_mul_f32_e32 v3, 0x3e9a209a, v0
-; GFX900-GISEL-NEXT:    v_fma_f32 v4, v0, s4, -v3
-; GFX900-GISEL-NEXT:    v_fma_f32 v1, v0, v1, v4
-; GFX900-GISEL-NEXT:    v_add_f32_e32 v1, v3, v1
-; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e64 vcc, |v0|, v2
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x3e9a209a
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v2, 0x3284fbcf
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v3, 0x7f800000
+; GFX900-GISEL-NEXT:    v_mul_f32_e32 v4, 0x3e9a209a, v0
+; GFX900-GISEL-NEXT:    v_fma_f32 v1, v0, v1, -v4
+; GFX900-GISEL-NEXT:    v_fma_f32 v1, v0, v2, v1
+; GFX900-GISEL-NEXT:    v_add_f32_e32 v1, v4, v1
+; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e64 vcc, |v0|, v3
 ; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
 ; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -4327,7 +4292,7 @@ define float @v_log10_f32_nnan_daz(float %in) #0 {
 ; GFX1100-GISEL-NEXT:    v_mul_f32_e32 v1, 0x3e9a209a, v0
 ; GFX1100-GISEL-NEXT:    v_cmp_gt_f32_e64 vcc_lo, 0x7f800000, |v0|
 ; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-GISEL-NEXT:    v_fma_f32 v2, v0, 0x3e9a209a, -v1
+; GFX1100-GISEL-NEXT:    v_fma_f32 v2, 0x3e9a209a, v0, -v1
 ; GFX1100-GISEL-NEXT:    v_fmac_f32_e32 v2, 0x3284fbcf, v0
 ; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1100-GISEL-NEXT:    v_add_f32_e32 v1, v1, v2
@@ -4380,12 +4345,12 @@ define float @v_log10_f32_nnan_dynamic(float %in) #1 {
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 1.0, v2, vcc
 ; SI-GISEL-NEXT:    v_mul_f32_e32 v0, v0, v1
 ; SI-GISEL-NEXT:    v_log_f32_e32 v0, v0
-; SI-GISEL-NEXT:    s_mov_b32 s4, 0x3e9a209a
+; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x3e9a209a
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v3, 0x3284fbcf
-; SI-GISEL-NEXT:    v_mul_f32_e32 v1, 0x3e9a209a, v0
-; SI-GISEL-NEXT:    v_fma_f32 v2, v0, s4, -v1
-; SI-GISEL-NEXT:    v_fma_f32 v2, v0, v3, v2
-; SI-GISEL-NEXT:    v_add_f32_e32 v1, v1, v2
+; SI-GISEL-NEXT:    v_mul_f32_e32 v2, 0x3e9a209a, v0
+; SI-GISEL-NEXT:    v_fma_f32 v1, v0, v1, -v2
+; SI-GISEL-NEXT:    v_fma_f32 v1, v0, v3, v1
+; SI-GISEL-NEXT:    v_add_f32_e32 v1, v2, v1
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x7f800000
 ; SI-GISEL-NEXT:    v_cmp_lt_f32_e64 s[4:5], |v0|, v2
 ; SI-GISEL-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s[4:5]
@@ -4478,12 +4443,12 @@ define float @v_log10_f32_nnan_dynamic(float %in) #1 {
 ; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v1, 1.0, v2, vcc
 ; GFX900-GISEL-NEXT:    v_mul_f32_e32 v0, v0, v1
 ; GFX900-GISEL-NEXT:    v_log_f32_e32 v0, v0
-; GFX900-GISEL-NEXT:    s_mov_b32 s4, 0x3e9a209a
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x3e9a209a
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v3, 0x3284fbcf
-; GFX900-GISEL-NEXT:    v_mul_f32_e32 v1, 0x3e9a209a, v0
-; GFX900-GISEL-NEXT:    v_fma_f32 v2, v0, s4, -v1
-; GFX900-GISEL-NEXT:    v_fma_f32 v2, v0, v3, v2
-; GFX900-GISEL-NEXT:    v_add_f32_e32 v1, v1, v2
+; GFX900-GISEL-NEXT:    v_mul_f32_e32 v2, 0x3e9a209a, v0
+; GFX900-GISEL-NEXT:    v_fma_f32 v1, v0, v1, -v2
+; GFX900-GISEL-NEXT:    v_fma_f32 v1, v0, v3, v1
+; GFX900-GISEL-NEXT:    v_add_f32_e32 v1, v2, v1
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v2, 0x7f800000
 ; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e64 s[4:5], |v0|, v2
 ; GFX900-GISEL-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s[4:5]
@@ -4526,7 +4491,7 @@ define float @v_log10_f32_nnan_dynamic(float %in) #1 {
 ; GFX1100-GISEL-NEXT:    v_mul_f32_e32 v1, 0x3e9a209a, v0
 ; GFX1100-GISEL-NEXT:    v_cmp_gt_f32_e64 s0, 0x7f800000, |v0|
 ; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-GISEL-NEXT:    v_fma_f32 v2, v0, 0x3e9a209a, -v1
+; GFX1100-GISEL-NEXT:    v_fma_f32 v2, 0x3e9a209a, v0, -v1
 ; GFX1100-GISEL-NEXT:    v_fmac_f32_e32 v2, 0x3284fbcf, v0
 ; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1100-GISEL-NEXT:    v_add_f32_e32 v1, v1, v2
@@ -4569,14 +4534,14 @@ define float @v_log10_f32_ninf_daz(float %in) #0 {
 ; SI-GISEL:       ; %bb.0:
 ; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    v_log_f32_e32 v0, v0
-; SI-GISEL-NEXT:    s_mov_b32 s4, 0x3e9a209a
-; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x3284fbcf
-; SI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x7f800000
-; SI-GISEL-NEXT:    v_mul_f32_e32 v3, 0x3e9a209a, v0
-; SI-GISEL-NEXT:    v_fma_f32 v4, v0, s4, -v3
-; SI-GISEL-NEXT:    v_fma_f32 v1, v0, v1, v4
-; SI-GISEL-NEXT:    v_add_f32_e32 v1, v3, v1
-; SI-GISEL-NEXT:    v_cmp_lt_f32_e64 vcc, |v0|, v2
+; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x3e9a209a
+; SI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x3284fbcf
+; SI-GISEL-NEXT:    v_mov_b32_e32 v3, 0x7f800000
+; SI-GISEL-NEXT:    v_mul_f32_e32 v4, 0x3e9a209a, v0
+; SI-GISEL-NEXT:    v_fma_f32 v1, v0, v1, -v4
+; SI-GISEL-NEXT:    v_fma_f32 v1, v0, v2, v1
+; SI-GISEL-NEXT:    v_add_f32_e32 v1, v4, v1
+; SI-GISEL-NEXT:    v_cmp_lt_f32_e64 vcc, |v0|, v3
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
 ; SI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -4635,14 +4600,14 @@ define float @v_log10_f32_ninf_daz(float %in) #0 {
 ; GFX900-GISEL:       ; %bb.0:
 ; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-GISEL-NEXT:    v_log_f32_e32 v0, v0
-; GFX900-GISEL-NEXT:    s_mov_b32 s4, 0x3e9a209a
-; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x3284fbcf
-; GFX900-GISEL-NEXT:    v_mov_b32_e32 v2, 0x7f800000
-; GFX900-GISEL-NEXT:    v_mul_f32_e32 v3, 0x3e9a209a, v0
-; GFX900-GISEL-NEXT:    v_fma_f32 v4, v0, s4, -v3
-; GFX900-GISEL-NEXT:    v_fma_f32 v1, v0, v1, v4
-; GFX900-GISEL-NEXT:    v_add_f32_e32 v1, v3, v1
-; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e64 vcc, |v0|, v2
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x3e9a209a
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v2, 0x3284fbcf
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v3, 0x7f800000
+; GFX900-GISEL-NEXT:    v_mul_f32_e32 v4, 0x3e9a209a, v0
+; GFX900-GISEL-NEXT:    v_fma_f32 v1, v0, v1, -v4
+; GFX900-GISEL-NEXT:    v_fma_f32 v1, v0, v2, v1
+; GFX900-GISEL-NEXT:    v_add_f32_e32 v1, v4, v1
+; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e64 vcc, |v0|, v3
 ; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
 ; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -4669,7 +4634,7 @@ define float @v_log10_f32_ninf_daz(float %in) #0 {
 ; GFX1100-GISEL-NEXT:    v_mul_f32_e32 v1, 0x3e9a209a, v0
 ; GFX1100-GISEL-NEXT:    v_cmp_gt_f32_e64 vcc_lo, 0x7f800000, |v0|
 ; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-GISEL-NEXT:    v_fma_f32 v2, v0, 0x3e9a209a, -v1
+; GFX1100-GISEL-NEXT:    v_fma_f32 v2, 0x3e9a209a, v0, -v1
 ; GFX1100-GISEL-NEXT:    v_fmac_f32_e32 v2, 0x3284fbcf, v0
 ; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1100-GISEL-NEXT:    v_add_f32_e32 v1, v1, v2
@@ -4722,12 +4687,12 @@ define float @v_log10_f32_ninf_dynamic(float %in) #1 {
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 1.0, v2, vcc
 ; SI-GISEL-NEXT:    v_mul_f32_e32 v0, v0, v1
 ; SI-GISEL-NEXT:    v_log_f32_e32 v0, v0
-; SI-GISEL-NEXT:    s_mov_b32 s4, 0x3e9a209a
+; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x3e9a209a
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v3, 0x3284fbcf
-; SI-GISEL-NEXT:    v_mul_f32_e32 v1, 0x3e9a209a, v0
-; SI-GISEL-NEXT:    v_fma_f32 v2, v0, s4, -v1
-; SI-GISEL-NEXT:    v_fma_f32 v2, v0, v3, v2
-; SI-GISEL-NEXT:    v_add_f32_e32 v1, v1, v2
+; SI-GISEL-NEXT:    v_mul_f32_e32 v2, 0x3e9a209a, v0
+; SI-GISEL-NEXT:    v_fma_f32 v1, v0, v1, -v2
+; SI-GISEL-NEXT:    v_fma_f32 v1, v0, v3, v1
+; SI-GISEL-NEXT:    v_add_f32_e32 v1, v2, v1
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x7f800000
 ; SI-GISEL-NEXT:    v_cmp_lt_f32_e64 s[4:5], |v0|, v2
 ; SI-GISEL-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s[4:5]
@@ -4820,12 +4785,12 @@ define float @v_log10_f32_ninf_dynamic(float %in) #1 {
 ; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v1, 1.0, v2, vcc
 ; GFX900-GISEL-NEXT:    v_mul_f32_e32 v0, v0, v1
 ; GFX900-GISEL-NEXT:    v_log_f32_e32 v0, v0
-; GFX900-GISEL-NEXT:    s_mov_b32 s4, 0x3e9a209a
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x3e9a209a
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v3, 0x3284fbcf
-; GFX900-GISEL-NEXT:    v_mul_f32_e32 v1, 0x3e9a209a, v0
-; GFX900-GISEL-NEXT:    v_fma_f32 v2, v0, s4, -v1
-; GFX900-GISEL-NEXT:    v_fma_f32 v2, v0, v3, v2
-; GFX900-GISEL-NEXT:    v_add_f32_e32 v1, v1, v2
+; GFX900-GISEL-NEXT:    v_mul_f32_e32 v2, 0x3e9a209a, v0
+; GFX900-GISEL-NEXT:    v_fma_f32 v1, v0, v1, -v2
+; GFX900-GISEL-NEXT:    v_fma_f32 v1, v0, v3, v1
+; GFX900-GISEL-NEXT:    v_add_f32_e32 v1, v2, v1
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v2, 0x7f800000
 ; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e64 s[4:5], |v0|, v2
 ; GFX900-GISEL-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s[4:5]
@@ -4868,7 +4833,7 @@ define float @v_log10_f32_ninf_dynamic(float %in) #1 {
 ; GFX1100-GISEL-NEXT:    v_mul_f32_e32 v1, 0x3e9a209a, v0
 ; GFX1100-GISEL-NEXT:    v_cmp_gt_f32_e64 s0, 0x7f800000, |v0|
 ; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-GISEL-NEXT:    v_fma_f32 v2, v0, 0x3e9a209a, -v1
+; GFX1100-GISEL-NEXT:    v_fma_f32 v2, 0x3e9a209a, v0, -v1
 ; GFX1100-GISEL-NEXT:    v_fmac_f32_e32 v2, 0x3284fbcf, v0
 ; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1100-GISEL-NEXT:    v_add_f32_e32 v1, v1, v2
@@ -4921,13 +4886,13 @@ define float @v_log10_f32_nnan_ninf(float %in) {
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 1.0, v2, vcc
 ; SI-GISEL-NEXT:    v_mul_f32_e32 v0, v0, v1
 ; SI-GISEL-NEXT:    v_log_f32_e32 v0, v0
-; SI-GISEL-NEXT:    s_mov_b32 s4, 0x3e9a209a
+; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x3e9a209a
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v3, 0x3284fbcf
-; SI-GISEL-NEXT:    v_mul_f32_e32 v1, 0x3e9a209a, v0
-; SI-GISEL-NEXT:    v_fma_f32 v2, v0, s4, -v1
-; SI-GISEL-NEXT:    v_fma_f32 v0, v0, v3, v2
-; SI-GISEL-NEXT:    v_add_f32_e32 v0, v1, v0
+; SI-GISEL-NEXT:    v_mul_f32_e32 v2, 0x3e9a209a, v0
+; SI-GISEL-NEXT:    v_fma_f32 v1, v0, v1, -v2
+; SI-GISEL-NEXT:    v_fma_f32 v0, v0, v3, v1
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x411a209b
+; SI-GISEL-NEXT:    v_add_f32_e32 v0, v2, v0
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
 ; SI-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v1
 ; SI-GISEL-NEXT:    s_setpc_b64 s[30:31]
@@ -5007,13 +4972,13 @@ define float @v_log10_f32_nnan_ninf(float %in) {
 ; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v1, 1.0, v2, vcc
 ; GFX900-GISEL-NEXT:    v_mul_f32_e32 v0, v0, v1
 ; GFX900-GISEL-NEXT:    v_log_f32_e32 v0, v0
-; GFX900-GISEL-NEXT:    s_mov_b32 s4, 0x3e9a209a
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x3e9a209a
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v3, 0x3284fbcf
-; GFX900-GISEL-NEXT:    v_mul_f32_e32 v1, 0x3e9a209a, v0
-; GFX900-GISEL-NEXT:    v_fma_f32 v2, v0, s4, -v1
-; GFX900-GISEL-NEXT:    v_fma_f32 v0, v0, v3, v2
-; GFX900-GISEL-NEXT:    v_add_f32_e32 v0, v1, v0
+; GFX900-GISEL-NEXT:    v_mul_f32_e32 v2, 0x3e9a209a, v0
+; GFX900-GISEL-NEXT:    v_fma_f32 v1, v0, v1, -v2
+; GFX900-GISEL-NEXT:    v_fma_f32 v0, v0, v3, v1
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x411a209b
+; GFX900-GISEL-NEXT:    v_add_f32_e32 v0, v2, v0
 ; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
 ; GFX900-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v1
 ; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
@@ -5048,7 +5013,7 @@ define float @v_log10_f32_nnan_ninf(float %in) {
 ; GFX1100-GISEL-NEXT:    s_waitcnt_depctr 0xfff
 ; GFX1100-GISEL-NEXT:    v_mul_f32_e32 v1, 0x3e9a209a, v0
 ; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-GISEL-NEXT:    v_fma_f32 v2, v0, 0x3e9a209a, -v1
+; GFX1100-GISEL-NEXT:    v_fma_f32 v2, 0x3e9a209a, v0, -v1
 ; GFX1100-GISEL-NEXT:    v_fmac_f32_e32 v2, 0x3284fbcf, v0
 ; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
 ; GFX1100-GISEL-NEXT:    v_add_f32_e32 v0, v1, v2
@@ -5086,12 +5051,12 @@ define float @v_log10_f32_nnan_ninf_daz(float %in) #0 {
 ; SI-GISEL:       ; %bb.0:
 ; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    v_log_f32_e32 v0, v0
-; SI-GISEL-NEXT:    s_mov_b32 s4, 0x3e9a209a
+; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x3e9a209a
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v3, 0x3284fbcf
-; SI-GISEL-NEXT:    v_mul_f32_e32 v1, 0x3e9a209a, v0
-; SI-GISEL-NEXT:    v_fma_f32 v2, v0, s4, -v1
-; SI-GISEL-NEXT:    v_fma_f32 v0, v0, v3, v2
-; SI-GISEL-NEXT:    v_add_f32_e32 v0, v1, v0
+; SI-GISEL-NEXT:    v_mul_f32_e32 v2, 0x3e9a209a, v0
+; SI-GISEL-NEXT:    v_fma_f32 v1, v0, v1, -v2
+; SI-GISEL-NEXT:    v_fma_f32 v0, v0, v3, v1
+; SI-GISEL-NEXT:    v_add_f32_e32 v0, v2, v0
 ; SI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; VI-LABEL: v_log10_f32_nnan_ninf_daz:
@@ -5125,12 +5090,12 @@ define float @v_log10_f32_nnan_ninf_daz(float %in) #0 {
 ; GFX900-GISEL:       ; %bb.0:
 ; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-GISEL-NEXT:    v_log_f32_e32 v0, v0
-; GFX900-GISEL-NEXT:    s_mov_b32 s4, 0x3e9a209a
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x3e9a209a
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v3, 0x3284fbcf
-; GFX900-GISEL-NEXT:    v_mul_f32_e32 v1, 0x3e9a209a, v0
-; GFX900-GISEL-NEXT:    v_fma_f32 v2, v0, s4, -v1
-; GFX900-GISEL-NEXT:    v_fma_f32 v0, v0, v3, v2
-; GFX900-GISEL-NEXT:    v_add_f32_e32 v0, v1, v0
+; GFX900-GISEL-NEXT:    v_mul_f32_e32 v2, 0x3e9a209a, v0
+; GFX900-GISEL-NEXT:    v_fma_f32 v1, v0, v1, -v2
+; GFX900-GISEL-NEXT:    v_fma_f32 v0, v0, v3, v1
+; GFX900-GISEL-NEXT:    v_add_f32_e32 v0, v2, v0
 ; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX1100-SDAG-LABEL: v_log10_f32_nnan_ninf_daz:
@@ -5153,7 +5118,7 @@ define float @v_log10_f32_nnan_ninf_daz(float %in) #0 {
 ; GFX1100-GISEL-NEXT:    s_waitcnt_depctr 0xfff
 ; GFX1100-GISEL-NEXT:    v_mul_f32_e32 v1, 0x3e9a209a, v0
 ; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-GISEL-NEXT:    v_fma_f32 v2, v0, 0x3e9a209a, -v1
+; GFX1100-GISEL-NEXT:    v_fma_f32 v2, 0x3e9a209a, v0, -v1
 ; GFX1100-GISEL-NEXT:    v_fmac_f32_e32 v2, 0x3284fbcf, v0
 ; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1100-GISEL-NEXT:    v_add_f32_e32 v0, v1, v2
@@ -5202,13 +5167,13 @@ define float @v_log10_f32_nnan_ninf_dynamic(float %in) #1 {
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 1.0, v2, vcc
 ; SI-GISEL-NEXT:    v_mul_f32_e32 v0, v0, v1
 ; SI-GISEL-NEXT:    v_log_f32_e32 v0, v0
-; SI-GISEL-NEXT:    s_mov_b32 s4, 0x3e9a209a
+; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x3e9a209a
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v3, 0x3284fbcf
-; SI-GISEL-NEXT:    v_mul_f32_e32 v1, 0x3e9a209a, v0
-; SI-GISEL-NEXT:    v_fma_f32 v2, v0, s4, -v1
-; SI-GISEL-NEXT:    v_fma_f32 v0, v0, v3, v2
-; SI-GISEL-NEXT:    v_add_f32_e32 v0, v1, v0
+; SI-GISEL-NEXT:    v_mul_f32_e32 v2, 0x3e9a209a, v0
+; SI-GISEL-NEXT:    v_fma_f32 v1, v0, v1, -v2
+; SI-GISEL-NEXT:    v_fma_f32 v0, v0, v3, v1
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x411a209b
+; SI-GISEL-NEXT:    v_add_f32_e32 v0, v2, v0
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
 ; SI-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v1
 ; SI-GISEL-NEXT:    s_setpc_b64 s[30:31]
@@ -5288,13 +5253,13 @@ define float @v_log10_f32_nnan_ninf_dynamic(float %in) #1 {
 ; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v1, 1.0, v2, vcc
 ; GFX900-GISEL-NEXT:    v_mul_f32_e32 v0, v0, v1
 ; GFX900-GISEL-NEXT:    v_log_f32_e32 v0, v0
-; GFX900-GISEL-NEXT:    s_mov_b32 s4, 0x3e9a209a
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x3e9a209a
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v3, 0x3284fbcf
-; GFX900-GISEL-NEXT:    v_mul_f32_e32 v1, 0x3e9a209a, v0
-; GFX900-GISEL-NEXT:    v_fma_f32 v2, v0, s4, -v1
-; GFX900-GISEL-NEXT:    v_fma_f32 v0, v0, v3, v2
-; GFX900-GISEL-NEXT:    v_add_f32_e32 v0, v1, v0
+; GFX900-GISEL-NEXT:    v_mul_f32_e32 v2, 0x3e9a209a, v0
+; GFX900-GISEL-NEXT:    v_fma_f32 v1, v0, v1, -v2
+; GFX900-GISEL-NEXT:    v_fma_f32 v0, v0, v3, v1
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x411a209b
+; GFX900-GISEL-NEXT:    v_add_f32_e32 v0, v2, v0
 ; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
 ; GFX900-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v1
 ; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
@@ -5329,7 +5294,7 @@ define float @v_log10_f32_nnan_ninf_dynamic(float %in) #1 {
 ; GFX1100-GISEL-NEXT:    s_waitcnt_depctr 0xfff
 ; GFX1100-GISEL-NEXT:    v_mul_f32_e32 v1, 0x3e9a209a, v0
 ; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-GISEL-NEXT:    v_fma_f32 v2, v0, 0x3e9a209a, -v1
+; GFX1100-GISEL-NEXT:    v_fma_f32 v2, 0x3e9a209a, v0, -v1
 ; GFX1100-GISEL-NEXT:    v_fmac_f32_e32 v2, 0x3284fbcf, v0
 ; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
 ; GFX1100-GISEL-NEXT:    v_add_f32_e32 v0, v1, v2
@@ -5412,12 +5377,12 @@ define float @v_log10_f32_dynamic_mode(float %in) #1 {
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 1.0, v2, vcc
 ; SI-GISEL-NEXT:    v_mul_f32_e32 v0, v0, v1
 ; SI-GISEL-NEXT:    v_log_f32_e32 v0, v0
-; SI-GISEL-NEXT:    s_mov_b32 s4, 0x3e9a209a
+; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x3e9a209a
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v3, 0x3284fbcf
-; SI-GISEL-NEXT:    v_mul_f32_e32 v1, 0x3e9a209a, v0
-; SI-GISEL-NEXT:    v_fma_f32 v2, v0, s4, -v1
-; SI-GISEL-NEXT:    v_fma_f32 v2, v0, v3, v2
-; SI-GISEL-NEXT:    v_add_f32_e32 v1, v1, v2
+; SI-GISEL-NEXT:    v_mul_f32_e32 v2, 0x3e9a209a, v0
+; SI-GISEL-NEXT:    v_fma_f32 v1, v0, v1, -v2
+; SI-GISEL-NEXT:    v_fma_f32 v1, v0, v3, v1
+; SI-GISEL-NEXT:    v_add_f32_e32 v1, v2, v1
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x7f800000
 ; SI-GISEL-NEXT:    v_cmp_lt_f32_e64 s[4:5], |v0|, v2
 ; SI-GISEL-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s[4:5]
@@ -5510,12 +5475,12 @@ define float @v_log10_f32_dynamic_mode(float %in) #1 {
 ; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v1, 1.0, v2, vcc
 ; GFX900-GISEL-NEXT:    v_mul_f32_e32 v0, v0, v1
 ; GFX900-GISEL-NEXT:    v_log_f32_e32 v0, v0
-; GFX900-GISEL-NEXT:    s_mov_b32 s4, 0x3e9a209a
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x3e9a209a
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v3, 0x3284fbcf
-; GFX900-GISEL-NEXT:    v_mul_f32_e32 v1, 0x3e9a209a, v0
-; GFX900-GISEL-NEXT:    v_fma_f32 v2, v0, s4, -v1
-; GFX900-GISEL-NEXT:    v_fma_f32 v2, v0, v3, v2
-; GFX900-GISEL-NEXT:    v_add_f32_e32 v1, v1, v2
+; GFX900-GISEL-NEXT:    v_mul_f32_e32 v2, 0x3e9a209a, v0
+; GFX900-GISEL-NEXT:    v_fma_f32 v1, v0, v1, -v2
+; GFX900-GISEL-NEXT:    v_fma_f32 v1, v0, v3, v1
+; GFX900-GISEL-NEXT:    v_add_f32_e32 v1, v2, v1
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v2, 0x7f800000
 ; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e64 s[4:5], |v0|, v2
 ; GFX900-GISEL-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s[4:5]
@@ -5558,7 +5523,7 @@ define float @v_log10_f32_dynamic_mode(float %in) #1 {
 ; GFX1100-GISEL-NEXT:    v_mul_f32_e32 v1, 0x3e9a209a, v0
 ; GFX1100-GISEL-NEXT:    v_cmp_gt_f32_e64 s0, 0x7f800000, |v0|
 ; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-GISEL-NEXT:    v_fma_f32 v2, v0, 0x3e9a209a, -v1
+; GFX1100-GISEL-NEXT:    v_fma_f32 v2, 0x3e9a209a, v0, -v1
 ; GFX1100-GISEL-NEXT:    v_fmac_f32_e32 v2, 0x3284fbcf, v0
 ; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1100-GISEL-NEXT:    v_add_f32_e32 v1, v1, v2
@@ -5606,12 +5571,12 @@ define float @v_log10_f32_undef() {
 ; SI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, s4, v0
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
 ; SI-GISEL-NEXT:    v_log_f32_e32 v0, v0
-; SI-GISEL-NEXT:    s_mov_b32 s4, 0x3e9a209a
+; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x3e9a209a
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v3, 0x3284fbcf
-; SI-GISEL-NEXT:    v_mul_f32_e32 v1, 0x3e9a209a, v0
-; SI-GISEL-NEXT:    v_fma_f32 v2, v0, s4, -v1
-; SI-GISEL-NEXT:    v_fma_f32 v2, v0, v3, v2
-; SI-GISEL-NEXT:    v_add_f32_e32 v1, v1, v2
+; SI-GISEL-NEXT:    v_mul_f32_e32 v2, 0x3e9a209a, v0
+; SI-GISEL-NEXT:    v_fma_f32 v1, v0, v1, -v2
+; SI-GISEL-NEXT:    v_fma_f32 v1, v0, v3, v1
+; SI-GISEL-NEXT:    v_add_f32_e32 v1, v2, v1
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x7f800000
 ; SI-GISEL-NEXT:    v_cmp_lt_f32_e64 s[4:5], |v0|, v2
 ; SI-GISEL-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s[4:5]
@@ -5688,12 +5653,12 @@ define float @v_log10_f32_undef() {
 ; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, s4, v0
 ; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
 ; GFX900-GISEL-NEXT:    v_log_f32_e32 v0, v0
-; GFX900-GISEL-NEXT:    s_mov_b32 s4, 0x3e9a209a
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x3e9a209a
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v3, 0x3284fbcf
-; GFX900-GISEL-NEXT:    v_mul_f32_e32 v1, 0x3e9a209a, v0
-; GFX900-GISEL-NEXT:    v_fma_f32 v2, v0, s4, -v1
-; GFX900-GISEL-NEXT:    v_fma_f32 v2, v0, v3, v2
-; GFX900-GISEL-NEXT:    v_add_f32_e32 v1, v1, v2
+; GFX900-GISEL-NEXT:    v_mul_f32_e32 v2, 0x3e9a209a, v0
+; GFX900-GISEL-NEXT:    v_fma_f32 v1, v0, v1, -v2
+; GFX900-GISEL-NEXT:    v_fma_f32 v1, v0, v3, v1
+; GFX900-GISEL-NEXT:    v_add_f32_e32 v1, v2, v1
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v2, 0x7f800000
 ; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e64 s[4:5], |v0|, v2
 ; GFX900-GISEL-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s[4:5]
@@ -5729,7 +5694,7 @@ define float @v_log10_f32_undef() {
 ; GFX1100-GISEL-NEXT:    v_mul_f32_e32 v1, 0x3e9a209a, v0
 ; GFX1100-GISEL-NEXT:    v_cmp_gt_f32_e64 s0, 0x7f800000, |v0|
 ; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-GISEL-NEXT:    v_fma_f32 v2, v0, 0x3e9a209a, -v1
+; GFX1100-GISEL-NEXT:    v_fma_f32 v2, 0x3e9a209a, v0, -v1
 ; GFX1100-GISEL-NEXT:    v_fmac_f32_e32 v2, 0x3284fbcf, v0
 ; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1100-GISEL-NEXT:    v_add_f32_e32 v1, v1, v2
@@ -5773,14 +5738,14 @@ define float @v_log10_f32_0() {
 ; SI-GISEL:       ; %bb.0:
 ; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    v_log_f32_e32 v0, 0
-; SI-GISEL-NEXT:    s_mov_b32 s4, 0x3e9a209a
-; SI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x3284fbcf
+; SI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x3e9a209a
+; SI-GISEL-NEXT:    v_mov_b32_e32 v3, 0x3284fbcf
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
-; SI-GISEL-NEXT:    v_mul_f32_e32 v3, 0x3e9a209a, v0
-; SI-GISEL-NEXT:    v_fma_f32 v4, v0, s4, -v3
-; SI-GISEL-NEXT:    v_fma_f32 v2, v0, v2, v4
-; SI-GISEL-NEXT:    v_add_f32_e32 v2, v3, v2
+; SI-GISEL-NEXT:    v_mul_f32_e32 v4, 0x3e9a209a, v0
+; SI-GISEL-NEXT:    v_fma_f32 v2, v0, v2, -v4
+; SI-GISEL-NEXT:    v_fma_f32 v2, v0, v3, v2
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v3, 0x7f800000
+; SI-GISEL-NEXT:    v_add_f32_e32 v2, v4, v2
 ; SI-GISEL-NEXT:    v_cmp_lt_f32_e64 vcc, |v0|, v3
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x411a209b
@@ -5851,14 +5816,14 @@ define float @v_log10_f32_0() {
 ; GFX900-GISEL:       ; %bb.0:
 ; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-GISEL-NEXT:    v_log_f32_e32 v0, 0
-; GFX900-GISEL-NEXT:    s_mov_b32 s4, 0x3e9a209a
-; GFX900-GISEL-NEXT:    v_mov_b32_e32 v2, 0x3284fbcf
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v2, 0x3e9a209a
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v3, 0x3284fbcf
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
-; GFX900-GISEL-NEXT:    v_mul_f32_e32 v3, 0x3e9a209a, v0
-; GFX900-GISEL-NEXT:    v_fma_f32 v4, v0, s4, -v3
-; GFX900-GISEL-NEXT:    v_fma_f32 v2, v0, v2, v4
-; GFX900-GISEL-NEXT:    v_add_f32_e32 v2, v3, v2
+; GFX900-GISEL-NEXT:    v_mul_f32_e32 v4, 0x3e9a209a, v0
+; GFX900-GISEL-NEXT:    v_fma_f32 v2, v0, v2, -v4
+; GFX900-GISEL-NEXT:    v_fma_f32 v2, v0, v3, v2
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v3, 0x7f800000
+; GFX900-GISEL-NEXT:    v_add_f32_e32 v2, v4, v2
 ; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e64 vcc, |v0|, v3
 ; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v2, 0x411a209b
@@ -5888,12 +5853,12 @@ define float @v_log10_f32_0() {
 ; GFX1100-GISEL:       ; %bb.0:
 ; GFX1100-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX1100-GISEL-NEXT:    v_log_f32_e32 v0, 0
-; GFX1100-GISEL-NEXT:    v_cmp_gt_f32_e64 s0, 0x800000, 0
+; GFX1100-GISEL-NEXT:    v_cmp_lt_f32_e64 s0, 0, 0x800000
 ; GFX1100-GISEL-NEXT:    s_waitcnt_depctr 0xfff
 ; GFX1100-GISEL-NEXT:    v_mul_f32_e32 v1, 0x3e9a209a, v0
 ; GFX1100-GISEL-NEXT:    v_cmp_gt_f32_e64 vcc_lo, 0x7f800000, |v0|
 ; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-GISEL-NEXT:    v_fma_f32 v2, v0, 0x3e9a209a, -v1
+; GFX1100-GISEL-NEXT:    v_fma_f32 v2, 0x3e9a209a, v0, -v1
 ; GFX1100-GISEL-NEXT:    v_fmac_f32_e32 v2, 0x3284fbcf, v0
 ; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1100-GISEL-NEXT:    v_add_f32_e32 v1, v1, v2
@@ -5937,15 +5902,15 @@ define float @v_log10_f32_from_fpext_f16(i16 %src.i) {
 ; SI-GISEL:       ; %bb.0:
 ; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; SI-GISEL-NEXT:    s_mov_b32 s4, 0x3e9a209a
-; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x3284fbcf
-; SI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x7f800000
+; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x3e9a209a
+; SI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x3284fbcf
+; SI-GISEL-NEXT:    v_mov_b32_e32 v3, 0x7f800000
 ; SI-GISEL-NEXT:    v_log_f32_e32 v0, v0
-; SI-GISEL-NEXT:    v_mul_f32_e32 v3, 0x3e9a209a, v0
-; SI-GISEL-NEXT:    v_fma_f32 v4, v0, s4, -v3
-; SI-GISEL-NEXT:    v_fma_f32 v1, v0, v1, v4
-; SI-GISEL-NEXT:    v_add_f32_e32 v1, v3, v1
-; SI-GISEL-NEXT:    v_cmp_lt_f32_e64 vcc, |v0|, v2
+; SI-GISEL-NEXT:    v_mul_f32_e32 v4, 0x3e9a209a, v0
+; SI-GISEL-NEXT:    v_fma_f32 v1, v0, v1, -v4
+; SI-GISEL-NEXT:    v_fma_f32 v1, v0, v2, v1
+; SI-GISEL-NEXT:    v_add_f32_e32 v1, v4, v1
+; SI-GISEL-NEXT:    v_cmp_lt_f32_e64 vcc, |v0|, v3
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
 ; SI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -6007,15 +5972,15 @@ define float @v_log10_f32_from_fpext_f16(i16 %src.i) {
 ; GFX900-GISEL:       ; %bb.0:
 ; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-GISEL-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; GFX900-GISEL-NEXT:    s_mov_b32 s4, 0x3e9a209a
-; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x3284fbcf
-; GFX900-GISEL-NEXT:    v_mov_b32_e32 v2, 0x7f800000
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x3e9a209a
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v2, 0x3284fbcf
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v3, 0x7f800000
 ; GFX900-GISEL-NEXT:    v_log_f32_e32 v0, v0
-; GFX900-GISEL-NEXT:    v_mul_f32_e32 v3, 0x3e9a209a, v0
-; GFX900-GISEL-NEXT:    v_fma_f32 v4, v0, s4, -v3
-; GFX900-GISEL-NEXT:    v_fma_f32 v1, v0, v1, v4
-; GFX900-GISEL-NEXT:    v_add_f32_e32 v1, v3, v1
-; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e64 vcc, |v0|, v2
+; GFX900-GISEL-NEXT:    v_mul_f32_e32 v4, 0x3e9a209a, v0
+; GFX900-GISEL-NEXT:    v_fma_f32 v1, v0, v1, -v4
+; GFX900-GISEL-NEXT:    v_fma_f32 v1, v0, v2, v1
+; GFX900-GISEL-NEXT:    v_add_f32_e32 v1, v4, v1
+; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e64 vcc, |v0|, v3
 ; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
 ; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -6045,7 +6010,7 @@ define float @v_log10_f32_from_fpext_f16(i16 %src.i) {
 ; GFX1100-GISEL-NEXT:    s_waitcnt_depctr 0xfff
 ; GFX1100-GISEL-NEXT:    v_mul_f32_e32 v1, 0x3e9a209a, v0
 ; GFX1100-GISEL-NEXT:    v_cmp_gt_f32_e64 vcc_lo, 0x7f800000, |v0|
-; GFX1100-GISEL-NEXT:    v_fma_f32 v2, v0, 0x3e9a209a, -v1
+; GFX1100-GISEL-NEXT:    v_fma_f32 v2, 0x3e9a209a, v0, -v1
 ; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1100-GISEL-NEXT:    v_fmac_f32_e32 v2, 0x3284fbcf, v0
 ; GFX1100-GISEL-NEXT:    v_add_f32_e32 v1, v1, v2
@@ -6100,18 +6065,18 @@ define float @v_log10_f32_from_fpext_math_f16(i16 %src0.i, i16 %src1.i) {
 ; SI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-GISEL-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; SI-GISEL-NEXT:    v_cvt_f32_f16_e32 v1, v1
-; SI-GISEL-NEXT:    s_mov_b32 s4, 0x3e9a209a
-; SI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x7f800000
+; SI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x3284fbcf
+; SI-GISEL-NEXT:    v_mov_b32_e32 v3, 0x7f800000
 ; SI-GISEL-NEXT:    v_add_f32_e32 v0, v0, v1
 ; SI-GISEL-NEXT:    v_cvt_f16_f32_e32 v0, v0
-; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x3284fbcf
+; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x3e9a209a
 ; SI-GISEL-NEXT:    v_cvt_f32_f16_e32 v0, v0
 ; SI-GISEL-NEXT:    v_log_f32_e32 v0, v0
-; SI-GISEL-NEXT:    v_mul_f32_e32 v3, 0x3e9a209a, v0
-; SI-GISEL-NEXT:    v_fma_f32 v4, v0, s4, -v3
-; SI-GISEL-NEXT:    v_fma_f32 v1, v0, v1, v4
-; SI-GISEL-NEXT:    v_add_f32_e32 v1, v3, v1
-; SI-GISEL-NEXT:    v_cmp_lt_f32_e64 vcc, |v0|, v2
+; SI-GISEL-NEXT:    v_mul_f32_e32 v4, 0x3e9a209a, v0
+; SI-GISEL-NEXT:    v_fma_f32 v1, v0, v1, -v4
+; SI-GISEL-NEXT:    v_fma_f32 v1, v0, v2, v1
+; SI-GISEL-NEXT:    v_add_f32_e32 v1, v4, v1
+; SI-GISEL-NEXT:    v_cmp_lt_f32_e64 vcc, |v0|, v3
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
 ; SI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -6177,15 +6142,15 @@ define float @v_log10_f32_from_fpext_math_f16(i16 %src0.i, i16 %src1.i) {
 ; GFX900-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX900-GISEL-NEXT:    v_add_f16_e32 v0, v0, v1
 ; GFX900-GISEL-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; GFX900-GISEL-NEXT:    s_mov_b32 s4, 0x3e9a209a
-; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x3284fbcf
-; GFX900-GISEL-NEXT:    v_mov_b32_e32 v2, 0x7f800000
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x3e9a209a
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v2, 0x3284fbcf
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v3, 0x7f800000
 ; GFX900-GISEL-NEXT:    v_log_f32_e32 v0, v0
-; GFX900-GISEL-NEXT:    v_mul_f32_e32 v3, 0x3e9a209a, v0
-; GFX900-GISEL-NEXT:    v_fma_f32 v4, v0, s4, -v3
-; GFX900-GISEL-NEXT:    v_fma_f32 v1, v0, v1, v4
-; GFX900-GISEL-NEXT:    v_add_f32_e32 v1, v3, v1
-; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e64 vcc, |v0|, v2
+; GFX900-GISEL-NEXT:    v_mul_f32_e32 v4, 0x3e9a209a, v0
+; GFX900-GISEL-NEXT:    v_fma_f32 v1, v0, v1, -v4
+; GFX900-GISEL-NEXT:    v_fma_f32 v1, v0, v2, v1
+; GFX900-GISEL-NEXT:    v_add_f32_e32 v1, v4, v1
+; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e64 vcc, |v0|, v3
 ; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
 ; GFX900-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -6218,7 +6183,7 @@ define float @v_log10_f32_from_fpext_math_f16(i16 %src0.i, i16 %src1.i) {
 ; GFX1100-GISEL-NEXT:    v_mul_f32_e32 v1, 0x3e9a209a, v0
 ; GFX1100-GISEL-NEXT:    v_cmp_gt_f32_e64 vcc_lo, 0x7f800000, |v0|
 ; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1100-GISEL-NEXT:    v_fma_f32 v2, v0, 0x3e9a209a, -v1
+; GFX1100-GISEL-NEXT:    v_fma_f32 v2, 0x3e9a209a, v0, -v1
 ; GFX1100-GISEL-NEXT:    v_fmac_f32_e32 v2, 0x3284fbcf, v0
 ; GFX1100-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1100-GISEL-NEXT:    v_add_f32_e32 v1, v1, v2
@@ -7450,15 +7415,15 @@ define <4 x half> @v_log10_v4f16(<4 x half> %in) {
 ; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-GISEL-NEXT:    v_log_f16_e32 v2, v0
 ; VI-GISEL-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; VI-GISEL-NEXT:    v_log_f16_e32 v3, v1
+; VI-GISEL-NEXT:    v_log_f16_e32 v4, v1
 ; VI-GISEL-NEXT:    v_log_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; VI-GISEL-NEXT:    v_mov_b32_e32 v4, 0x34d1
+; VI-GISEL-NEXT:    v_mov_b32_e32 v3, 0x34d1
 ; VI-GISEL-NEXT:    v_mul_f16_e32 v2, 0x34d1, v2
-; VI-GISEL-NEXT:    v_mul_f16_sdwa v0, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; VI-GISEL-NEXT:    v_mul_f16_e32 v3, 0x34d1, v3
-; VI-GISEL-NEXT:    v_mul_f16_sdwa v1, v1, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; VI-GISEL-NEXT:    v_mul_f16_sdwa v0, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; VI-GISEL-NEXT:    v_mul_f16_e32 v4, 0x34d1, v4
+; VI-GISEL-NEXT:    v_mul_f16_sdwa v1, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; VI-GISEL-NEXT:    v_or_b32_e32 v0, v2, v0
-; VI-GISEL-NEXT:    v_or_b32_e32 v1, v3, v1
+; VI-GISEL-NEXT:    v_or_b32_e32 v1, v4, v1
 ; VI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-SDAG-LABEL: v_log10_v4f16:
@@ -7618,15 +7583,15 @@ define <4 x half> @v_log10_v4f16_fast(<4 x half> %in) {
 ; VI-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; VI-GISEL-NEXT:    v_log_f16_e32 v2, v0
 ; VI-GISEL-NEXT:    v_log_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; VI-GISEL-NEXT:    v_log_f16_e32 v3, v1
+; VI-GISEL-NEXT:    v_log_f16_e32 v4, v1
 ; VI-GISEL-NEXT:    v_log_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; VI-GISEL-NEXT:    v_mov_b32_e32 v4, 0x34d1
+; VI-GISEL-NEXT:    v_mov_b32_e32 v3, 0x34d1
 ; VI-GISEL-NEXT:    v_mul_f16_e32 v2, 0x34d1, v2
-; VI-GISEL-NEXT:    v_mul_f16_sdwa v0, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; VI-GISEL-NEXT:    v_mul_f16_e32 v3, 0x34d1, v3
-; VI-GISEL-NEXT:    v_mul_f16_sdwa v1, v1, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; VI-GISEL-NEXT:    v_mul_f16_sdwa v0, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; VI-GISEL-NEXT:    v_mul_f16_e32 v4, 0x34d1, v4
+; VI-GISEL-NEXT:    v_mul_f16_sdwa v1, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; VI-GISEL-NEXT:    v_or_b32_e32 v0, v2, v0
-; VI-GISEL-NEXT:    v_or_b32_e32 v1, v3, v1
+; VI-GISEL-NEXT:    v_or_b32_e32 v1, v4, v1
 ; VI-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX900-SDAG-LABEL: v_log10_v4f16_fast:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.log2.ll b/llvm/test/CodeGen/AMDGPU/llvm.log2.ll
index 4c4b678010adf68..6ccef4c02ab3b10 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.log2.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.log2.ll
@@ -242,24 +242,22 @@ define amdgpu_kernel void @s_log2_v2f32(ptr addrspace(1) %out, <2 x float> %in)
 ; SI-GISEL-LABEL: s_log2_v2f32:
 ; SI-GISEL:       ; %bb.0:
 ; SI-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
-; SI-GISEL-NEXT:    s_mov_b32 s0, 0x800000
-; SI-GISEL-NEXT:    v_mov_b32_e32 v0, 0x4f800000
-; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x42000000
+; SI-GISEL-NEXT:    v_mov_b32_e32 v0, 0x800000
+; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x4f800000
+; SI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x42000000
 ; SI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-GISEL-NEXT:    v_mov_b32_e32 v2, s6
-; SI-GISEL-NEXT:    v_mov_b32_e32 v3, s7
-; SI-GISEL-NEXT:    v_cmp_gt_f32_e32 vcc, s0, v2
-; SI-GISEL-NEXT:    v_cmp_gt_f32_e64 s[0:1], s0, v3
-; SI-GISEL-NEXT:    v_cndmask_b32_e32 v2, 1.0, v0, vcc
-; SI-GISEL-NEXT:    v_cndmask_b32_e64 v0, 1.0, v0, s[0:1]
-; SI-GISEL-NEXT:    v_mul_f32_e32 v2, s6, v2
+; SI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, s6, v0
+; SI-GISEL-NEXT:    v_cmp_lt_f32_e64 s[0:1], s7, v0
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v3, 1.0, v1, vcc
+; SI-GISEL-NEXT:    v_cndmask_b32_e64 v0, 1.0, v1, s[0:1]
+; SI-GISEL-NEXT:    v_mul_f32_e32 v3, s6, v3
 ; SI-GISEL-NEXT:    v_mul_f32_e32 v0, s7, v0
-; SI-GISEL-NEXT:    v_log_f32_e32 v2, v2
-; SI-GISEL-NEXT:    v_log_f32_e32 v3, v0
-; SI-GISEL-NEXT:    v_cndmask_b32_e32 v0, 0, v1, vcc
-; SI-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, v1, s[0:1]
-; SI-GISEL-NEXT:    v_sub_f32_e32 v0, v2, v0
-; SI-GISEL-NEXT:    v_sub_f32_e32 v1, v3, v1
+; SI-GISEL-NEXT:    v_log_f32_e32 v3, v3
+; SI-GISEL-NEXT:    v_log_f32_e32 v1, v0
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v0, 0, v2, vcc
+; SI-GISEL-NEXT:    v_cndmask_b32_e64 v2, 0, v2, s[0:1]
+; SI-GISEL-NEXT:    v_sub_f32_e32 v0, v3, v0
+; SI-GISEL-NEXT:    v_sub_f32_e32 v1, v1, v2
 ; SI-GISEL-NEXT:    s_mov_b32 s6, -1
 ; SI-GISEL-NEXT:    s_mov_b32 s7, 0xf000
 ; SI-GISEL-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
@@ -292,24 +290,22 @@ define amdgpu_kernel void @s_log2_v2f32(ptr addrspace(1) %out, <2 x float> %in)
 ; VI-GISEL-LABEL: s_log2_v2f32:
 ; VI-GISEL:       ; %bb.0:
 ; VI-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
-; VI-GISEL-NEXT:    s_mov_b32 s0, 0x800000
-; VI-GISEL-NEXT:    v_mov_b32_e32 v0, 0x4f800000
-; VI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x42000000
+; VI-GISEL-NEXT:    v_mov_b32_e32 v0, 0x800000
+; VI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x4f800000
+; VI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x42000000
 ; VI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT:    v_mov_b32_e32 v2, s6
-; VI-GISEL-NEXT:    v_mov_b32_e32 v3, s7
-; VI-GISEL-NEXT:    v_cmp_gt_f32_e32 vcc, s0, v2
-; VI-GISEL-NEXT:    v_cmp_gt_f32_e64 s[0:1], s0, v3
-; VI-GISEL-NEXT:    v_cndmask_b32_e32 v2, 1.0, v0, vcc
-; VI-GISEL-NEXT:    v_cndmask_b32_e64 v0, 1.0, v0, s[0:1]
-; VI-GISEL-NEXT:    v_mul_f32_e32 v2, s6, v2
+; VI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, s6, v0
+; VI-GISEL-NEXT:    v_cmp_lt_f32_e64 s[0:1], s7, v0
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v3, 1.0, v1, vcc
+; VI-GISEL-NEXT:    v_cndmask_b32_e64 v0, 1.0, v1, s[0:1]
+; VI-GISEL-NEXT:    v_mul_f32_e32 v3, s6, v3
 ; VI-GISEL-NEXT:    v_mul_f32_e32 v0, s7, v0
-; VI-GISEL-NEXT:    v_log_f32_e32 v2, v2
-; VI-GISEL-NEXT:    v_log_f32_e32 v3, v0
-; VI-GISEL-NEXT:    v_cndmask_b32_e32 v0, 0, v1, vcc
-; VI-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, v1, s[0:1]
-; VI-GISEL-NEXT:    v_sub_f32_e32 v0, v2, v0
-; VI-GISEL-NEXT:    v_sub_f32_e32 v1, v3, v1
+; VI-GISEL-NEXT:    v_log_f32_e32 v3, v3
+; VI-GISEL-NEXT:    v_log_f32_e32 v1, v0
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v0, 0, v2, vcc
+; VI-GISEL-NEXT:    v_cndmask_b32_e64 v2, 0, v2, s[0:1]
+; VI-GISEL-NEXT:    v_sub_f32_e32 v0, v3, v0
+; VI-GISEL-NEXT:    v_sub_f32_e32 v1, v1, v2
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v2, s4
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v3, s5
 ; VI-GISEL-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
@@ -341,24 +337,22 @@ define amdgpu_kernel void @s_log2_v2f32(ptr addrspace(1) %out, <2 x float> %in)
 ; GFX900-GISEL-LABEL: s_log2_v2f32:
 ; GFX900-GISEL:       ; %bb.0:
 ; GFX900-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x24
-; GFX900-GISEL-NEXT:    s_mov_b32 s0, 0x800000
-; GFX900-GISEL-NEXT:    v_mov_b32_e32 v0, 0x4f800000
-; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x42000000
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v0, 0x800000
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x4f800000
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v2, 0x42000000
 ; GFX900-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX900-GISEL-NEXT:    v_mov_b32_e32 v2, s6
-; GFX900-GISEL-NEXT:    v_mov_b32_e32 v3, s7
-; GFX900-GISEL-NEXT:    v_cmp_gt_f32_e32 vcc, s0, v2
-; GFX900-GISEL-NEXT:    v_cmp_gt_f32_e64 s[0:1], s0, v3
-; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v2, 1.0, v0, vcc
-; GFX900-GISEL-NEXT:    v_cndmask_b32_e64 v0, 1.0, v0, s[0:1]
-; GFX900-GISEL-NEXT:    v_mul_f32_e32 v2, s6, v2
+; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, s6, v0
+; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e64 s[0:1], s7, v0
+; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v3, 1.0, v1, vcc
+; GFX900-GISEL-NEXT:    v_cndmask_b32_e64 v0, 1.0, v1, s[0:1]
+; GFX900-GISEL-NEXT:    v_mul_f32_e32 v3, s6, v3
 ; GFX900-GISEL-NEXT:    v_mul_f32_e32 v0, s7, v0
-; GFX900-GISEL-NEXT:    v_log_f32_e32 v2, v2
-; GFX900-GISEL-NEXT:    v_log_f32_e32 v3, v0
-; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v0, 0, v1, vcc
-; GFX900-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, v1, s[0:1]
-; GFX900-GISEL-NEXT:    v_sub_f32_e32 v0, v2, v0
-; GFX900-GISEL-NEXT:    v_sub_f32_e32 v1, v3, v1
+; GFX900-GISEL-NEXT:    v_log_f32_e32 v3, v3
+; GFX900-GISEL-NEXT:    v_log_f32_e32 v1, v0
+; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v0, 0, v2, vcc
+; GFX900-GISEL-NEXT:    v_cndmask_b32_e64 v2, 0, v2, s[0:1]
+; GFX900-GISEL-NEXT:    v_sub_f32_e32 v0, v3, v0
+; GFX900-GISEL-NEXT:    v_sub_f32_e32 v1, v1, v2
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX900-GISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[4:5]
 ; GFX900-GISEL-NEXT:    s_endpgm
@@ -512,31 +506,28 @@ define amdgpu_kernel void @s_log2_v3f32(ptr addrspace(1) %out, <3 x float> %in)
 ; SI-GISEL:       ; %bb.0:
 ; SI-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0xd
 ; SI-GISEL-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x9
-; SI-GISEL-NEXT:    s_mov_b32 s0, 0x800000
-; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x4f800000
+; SI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
+; SI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x4f800000
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v3, 0x42000000
 ; SI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-GISEL-NEXT:    v_mov_b32_e32 v0, s4
-; SI-GISEL-NEXT:    v_cmp_gt_f32_e32 vcc, s0, v0
-; SI-GISEL-NEXT:    v_cndmask_b32_e32 v0, 1.0, v1, vcc
+; SI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, s4, v1
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v0, 1.0, v2, vcc
 ; SI-GISEL-NEXT:    v_mul_f32_e32 v0, s4, v0
 ; SI-GISEL-NEXT:    v_log_f32_e32 v0, v0
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v4, 0, v3, vcc
-; SI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x800000
-; SI-GISEL-NEXT:    s_mov_b32 s10, -1
+; SI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, s5, v1
+; SI-GISEL-NEXT:    v_cmp_lt_f32_e64 s[0:1], s6, v1
 ; SI-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v4
-; SI-GISEL-NEXT:    v_mov_b32_e32 v4, s5
-; SI-GISEL-NEXT:    v_cmp_gt_f32_e32 vcc, s0, v4
-; SI-GISEL-NEXT:    v_cndmask_b32_e32 v4, 1.0, v1, vcc
-; SI-GISEL-NEXT:    v_cmp_lt_f32_e64 s[0:1], s6, v2
+; SI-GISEL-NEXT:    v_cndmask_b32_e32 v4, 1.0, v2, vcc
 ; SI-GISEL-NEXT:    v_mul_f32_e32 v4, s5, v4
-; SI-GISEL-NEXT:    v_cndmask_b32_e64 v1, 1.0, v1, s[0:1]
+; SI-GISEL-NEXT:    v_cndmask_b32_e64 v1, 1.0, v2, s[0:1]
 ; SI-GISEL-NEXT:    v_log_f32_e32 v4, v4
 ; SI-GISEL-NEXT:    v_mul_f32_e32 v1, s6, v1
 ; SI-GISEL-NEXT:    v_log_f32_e32 v2, v1
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v3, vcc
 ; SI-GISEL-NEXT:    v_sub_f32_e32 v1, v4, v1
 ; SI-GISEL-NEXT:    v_cndmask_b32_e64 v3, 0, v3, s[0:1]
+; SI-GISEL-NEXT:    s_mov_b32 s10, -1
 ; SI-GISEL-NEXT:    s_mov_b32 s11, 0xf000
 ; SI-GISEL-NEXT:    v_sub_f32_e32 v2, v2, v3
 ; SI-GISEL-NEXT:    buffer_store_dwordx2 v[0:1], off, s[8:11], 0
@@ -578,23 +569,20 @@ define amdgpu_kernel void @s_log2_v3f32(ptr addrspace(1) %out, <3 x float> %in)
 ; VI-GISEL:       ; %bb.0:
 ; VI-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x34
 ; VI-GISEL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
-; VI-GISEL-NEXT:    s_mov_b32 s0, 0x800000
-; VI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x4f800000
+; VI-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
+; VI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x4f800000
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v3, 0x42000000
 ; VI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT:    v_mov_b32_e32 v0, s4
-; VI-GISEL-NEXT:    v_cmp_gt_f32_e32 vcc, s0, v0
-; VI-GISEL-NEXT:    v_cndmask_b32_e32 v0, 1.0, v1, vcc
+; VI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, s4, v1
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v0, 1.0, v2, vcc
 ; VI-GISEL-NEXT:    v_mul_f32_e32 v0, s4, v0
 ; VI-GISEL-NEXT:    v_log_f32_e32 v0, v0
 ; VI-GISEL-NEXT:    v_cndmask_b32_e32 v4, 0, v3, vcc
-; VI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x800000
+; VI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, s5, v1
+; VI-GISEL-NEXT:    v_cmp_lt_f32_e64 s[0:1], s6, v1
 ; VI-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v4
-; VI-GISEL-NEXT:    v_mov_b32_e32 v4, s5
-; VI-GISEL-NEXT:    v_cmp_gt_f32_e32 vcc, s0, v4
-; VI-GISEL-NEXT:    v_cmp_lt_f32_e64 s[0:1], s6, v2
-; VI-GISEL-NEXT:    v_cndmask_b32_e32 v4, 1.0, v1, vcc
-; VI-GISEL-NEXT:    v_cndmask_b32_e64 v1, 1.0, v1, s[0:1]
+; VI-GISEL-NEXT:    v_cndmask_b32_e32 v4, 1.0, v2, vcc
+; VI-GISEL-NEXT:    v_cndmask_b32_e64 v1, 1.0, v2, s[0:1]
 ; VI-GISEL-NEXT:    v_mul_f32_e32 v4, s5, v4
 ; VI-GISEL-NEXT:    v_mul_f32_e32 v1, s6, v1
 ; VI-GISEL-NEXT:    v_log_f32_e32 v4, v4
@@ -642,23 +630,20 @@ define amdgpu_kernel void @s_log2_v3f32(ptr addrspace(1) %out, <3 x float> %in)
 ; GFX900-GISEL:       ; %bb.0:
 ; GFX900-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x34
 ; GFX900-GISEL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
-; GFX900-GISEL-NEXT:    s_mov_b32 s0, 0x800000
-; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x4f800000
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, 0x800000
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v2, 0x4f800000
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v3, 0x42000000
 ; GFX900-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX900-GISEL-NEXT:    v_mov_b32_e32 v0, s4
-; GFX900-GISEL-NEXT:    v_cmp_gt_f32_e32 vcc, s0, v0
-; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v0, 1.0, v1, vcc
+; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, s4, v1
+; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v0, 1.0, v2, vcc
 ; GFX900-GISEL-NEXT:    v_mul_f32_e32 v0, s4, v0
 ; GFX900-GISEL-NEXT:    v_log_f32_e32 v0, v0
 ; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v4, 0, v3, vcc
-; GFX900-GISEL-NEXT:    v_mov_b32_e32 v2, 0x800000
+; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, s5, v1
+; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e64 s[0:1], s6, v1
 ; GFX900-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v4
-; GFX900-GISEL-NEXT:    v_mov_b32_e32 v4, s5
-; GFX900-GISEL-NEXT:    v_cmp_gt_f32_e32 vcc, s0, v4
-; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e64 s[0:1], s6, v2
-; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v4, 1.0, v1, vcc
-; GFX900-GISEL-NEXT:    v_cndmask_b32_e64 v1, 1.0, v1, s[0:1]
+; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v4, 1.0, v2, vcc
+; GFX900-GISEL-NEXT:    v_cndmask_b32_e64 v1, 1.0, v2, s[0:1]
 ; GFX900-GISEL-NEXT:    v_mul_f32_e32 v4, s5, v4
 ; GFX900-GISEL-NEXT:    v_mul_f32_e32 v1, s6, v1
 ; GFX900-GISEL-NEXT:    v_log_f32_e32 v4, v4
@@ -865,42 +850,39 @@ define amdgpu_kernel void @s_log2_v4f32(ptr addrspace(1) %out, <4 x float> %in)
 ;
 ; SI-GISEL-LABEL: s_log2_v4f32:
 ; SI-GISEL:       ; %bb.0:
-; SI-GISEL-NEXT:    s_load_dwordx4 s[8:11], s[0:1], 0xd
-; SI-GISEL-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x9
-; SI-GISEL-NEXT:    s_mov_b32 s0, 0x800000
+; SI-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0xd
+; SI-GISEL-NEXT:    s_load_dwordx2 s[8:9], s[0:1], 0x9
+; SI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x800000
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v3, 0x4f800000
 ; SI-GISEL-NEXT:    v_mov_b32_e32 v4, 0x42000000
 ; SI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-GISEL-NEXT:    v_mov_b32_e32 v0, s8
-; SI-GISEL-NEXT:    v_cmp_gt_f32_e32 vcc, s0, v0
-; SI-GISEL-NEXT:    v_mov_b32_e32 v1, s9
+; SI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, s4, v2
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v0, 1.0, v3, vcc
-; SI-GISEL-NEXT:    v_cmp_gt_f32_e64 s[0:1], s0, v1
-; SI-GISEL-NEXT:    v_mul_f32_e32 v0, s8, v0
+; SI-GISEL-NEXT:    v_cmp_lt_f32_e64 s[0:1], s5, v2
+; SI-GISEL-NEXT:    v_mul_f32_e32 v0, s4, v0
 ; SI-GISEL-NEXT:    v_cndmask_b32_e64 v1, 1.0, v3, s[0:1]
 ; SI-GISEL-NEXT:    v_log_f32_e32 v0, v0
-; SI-GISEL-NEXT:    v_mul_f32_e32 v1, s9, v1
+; SI-GISEL-NEXT:    v_mul_f32_e32 v1, s5, v1
 ; SI-GISEL-NEXT:    v_log_f32_e32 v1, v1
-; SI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x800000
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v5, 0, v4, vcc
 ; SI-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v5
 ; SI-GISEL-NEXT:    v_cndmask_b32_e64 v5, 0, v4, s[0:1]
-; SI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, s10, v2
-; SI-GISEL-NEXT:    v_cmp_lt_f32_e64 s[0:1], s11, v2
+; SI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, s6, v2
+; SI-GISEL-NEXT:    v_cmp_lt_f32_e64 s[0:1], s7, v2
 ; SI-GISEL-NEXT:    v_sub_f32_e32 v1, v1, v5
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v5, 1.0, v3, vcc
 ; SI-GISEL-NEXT:    v_cndmask_b32_e64 v2, 1.0, v3, s[0:1]
-; SI-GISEL-NEXT:    v_mul_f32_e32 v5, s10, v5
-; SI-GISEL-NEXT:    v_mul_f32_e32 v2, s11, v2
+; SI-GISEL-NEXT:    v_mul_f32_e32 v5, s6, v5
+; SI-GISEL-NEXT:    v_mul_f32_e32 v2, s7, v2
 ; SI-GISEL-NEXT:    v_log_f32_e32 v5, v5
 ; SI-GISEL-NEXT:    v_log_f32_e32 v3, v2
 ; SI-GISEL-NEXT:    v_cndmask_b32_e32 v2, 0, v4, vcc
 ; SI-GISEL-NEXT:    v_cndmask_b32_e64 v4, 0, v4, s[0:1]
 ; SI-GISEL-NEXT:    v_sub_f32_e32 v2, v5, v2
 ; SI-GISEL-NEXT:    v_sub_f32_e32 v3, v3, v4
-; SI-GISEL-NEXT:    s_mov_b32 s6, -1
-; SI-GISEL-NEXT:    s_mov_b32 s7, 0xf000
-; SI-GISEL-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
+; SI-GISEL-NEXT:    s_mov_b32 s10, -1
+; SI-GISEL-NEXT:    s_mov_b32 s11, 0xf000
+; SI-GISEL-NEXT:    buffer_store_dwordx4 v[0:3], off, s[8:11], 0
 ; SI-GISEL-NEXT:    s_endpgm
 ;
 ; VI-SDAG-LABEL: s_log2_v4f32:
@@ -944,21 +926,18 @@ define amdgpu_kernel void @s_log2_v4f32(ptr addrspace(1) %out, <4 x float> %in)
 ; VI-GISEL:       ; %bb.0:
 ; VI-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x34
 ; VI-GISEL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
-; VI-GISEL-NEXT:    s_mov_b32 s0, 0x800000
+; VI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x800000
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v3, 0x4f800000
 ; VI-GISEL-NEXT:    v_mov_b32_e32 v4, 0x42000000
 ; VI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; VI-GISEL-NEXT:    v_mov_b32_e32 v0, s4
-; VI-GISEL-NEXT:    v_cmp_gt_f32_e32 vcc, s0, v0
-; VI-GISEL-NEXT:    v_mov_b32_e32 v1, s5
+; VI-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, s4, v2
 ; VI-GISEL-NEXT:    v_cndmask_b32_e32 v0, 1.0, v3, vcc
-; VI-GISEL-NEXT:    v_cmp_gt_f32_e64 s[0:1], s0, v1
+; VI-GISEL-NEXT:    v_cmp_lt_f32_e64 s[0:1], s5, v2
 ; VI-GISEL-NEXT:    v_mul_f32_e32 v0, s4, v0
 ; VI-GISEL-NEXT:    v_cndmask_b32_e64 v1, 1.0, v3, s[0:1]
 ; VI-GISEL-NEXT:    v_log_f32_e32 v0, v0
 ; VI-GISEL-NEXT:    v_mul_f32_e32 v1, s5, v1
 ; VI-GISEL-NEXT:    v_log_f32_e32 v1, v1
-; VI-GISEL-NEXT:    v_mov_b32_e32 v2, 0x800000
 ; VI-GISEL-NEXT:    v_cndmask_b32_e32 v5, 0, v4, vcc
 ; VI-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v5
 ; VI-GISEL-NEXT:    v_cndmask_b32_e64 v5, 0, v4, s[0:1]
@@ -1020,21 +999,18 @@ define amdgpu_kernel void @s_log2_v4f32(ptr addrspace(1) %out, <4 x float> %in)
 ; GFX900-GISEL:       ; %bb.0:
 ; GFX900-GISEL-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x34
 ; GFX900-GISEL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x24
-; GFX900-GISEL-NEXT:    s_mov_b32 s0, 0x800000
+; GFX900-GISEL-NEXT:    v_mov_b32_e32 v2, 0x800000
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v3, 0x4f800000
 ; GFX900-GISEL-NEXT:    v_mov_b32_e32 v4, 0x42000000
 ; GFX900-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX900-GISEL-NEXT:    v_mov_b32_e32 v0, s4
-; GFX900-GISEL-NEXT:    v_cmp_gt_f32_e32 vcc, s0, v0
-; GFX900-GISEL-NEXT:    v_mov_b32_e32 v1, s5
+; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e32 vcc, s4, v2
 ; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v0, 1.0, v3, vcc
-; GFX900-GISEL-NEXT:    v_cmp_gt_f32_e64 s[0:1], s0, v1
+; GFX900-GISEL-NEXT:    v_cmp_lt_f32_e64 s[0:1], s5, v2
 ; GFX900-GISEL-NEXT:    v_mul_f32_e32 v0, s4, v0
 ; GFX900-GISEL-NEXT:    v_cndmask_b32_e64 v1, 1.0, v3, s[0:1]
 ; GFX900-GISEL-NEXT:    v_log_f32_e32 v0, v0
 ; GFX900-GISEL-NEXT:    v_mul_f32_e32 v1, s5, v1
 ; GFX900-GISEL-NEXT:    v_log_f32_e32 v1, v1
-; GFX900-GISEL-NEXT:    v_mov_b32_e32 v2, 0x800000
 ; GFX900-GISEL-NEXT:    v_cndmask_b32_e32 v5, 0, v4, vcc
 ; GFX900-GISEL-NEXT:    v_sub_f32_e32 v0, v0, v5
 ; GFX900-GISEL-NEXT:    v_cndmask_b32_e64 v5, 0, v4, s[0:1]
diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-i1.ll b/llvm/test/CodeGen/AMDGPU/load-constant-i1.ll
index 2a1488652d887a4..2e7fa86e8ab8b6c 100644
--- a/llvm/test/CodeGen/AMDGPU/load-constant-i1.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-constant-i1.ll
@@ -4239,14 +4239,10 @@ define amdgpu_kernel void @constant_zextload_v8i1_to_v8i64(ptr addrspace(1) %out
 ; GFX8-LABEL: constant_zextload_v8i1_to_v8i64:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX8-NEXT:    v_mov_b32_e32 v13, 0
-; GFX8-NEXT:    v_mov_b32_e32 v15, v13
-; GFX8-NEXT:    v_mov_b32_e32 v3, 0
-; GFX8-NEXT:    v_mov_b32_e32 v7, 0
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s3
-; GFX8-NEXT:    flat_load_ubyte v2, v[0:1]
+; GFX8-NEXT:    flat_load_ubyte v0, v[0:1]
 ; GFX8-NEXT:    s_add_u32 s2, s0, 48
 ; GFX8-NEXT:    s_addc_u32 s3, s1, 0
 ; GFX8-NEXT:    v_mov_b32_e32 v19, s3
@@ -4256,37 +4252,41 @@ define amdgpu_kernel void @constant_zextload_v8i1_to_v8i64(ptr addrspace(1) %out
 ; GFX8-NEXT:    s_addc_u32 s3, s1, 0
 ; GFX8-NEXT:    v_mov_b32_e32 v16, s0
 ; GFX8-NEXT:    s_add_u32 s0, s0, 16
+; GFX8-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX8-NEXT:    s_addc_u32 s1, s1, 0
+; GFX8-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX8-NEXT:    v_mov_b32_e32 v21, s3
 ; GFX8-NEXT:    v_mov_b32_e32 v23, s1
-; GFX8-NEXT:    v_mov_b32_e32 v11, 0
-; GFX8-NEXT:    v_mov_b32_e32 v9, v13
-; GFX8-NEXT:    v_mov_b32_e32 v5, v13
-; GFX8-NEXT:    v_mov_b32_e32 v1, v13
+; GFX8-NEXT:    v_mov_b32_e32 v5, v1
+; GFX8-NEXT:    v_mov_b32_e32 v7, v1
+; GFX8-NEXT:    v_mov_b32_e32 v9, v1
+; GFX8-NEXT:    v_mov_b32_e32 v11, v1
+; GFX8-NEXT:    v_mov_b32_e32 v13, v1
+; GFX8-NEXT:    v_mov_b32_e32 v15, v1
 ; GFX8-NEXT:    v_mov_b32_e32 v20, s2
 ; GFX8-NEXT:    v_mov_b32_e32 v22, s0
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    v_lshrrev_b16_e32 v0, 2, v2
-; GFX8-NEXT:    v_and_b32_e32 v4, 1, v0
-; GFX8-NEXT:    v_lshrrev_b16_e32 v0, 4, v2
-; GFX8-NEXT:    v_lshrrev_b16_e32 v6, 6, v2
-; GFX8-NEXT:    v_lshrrev_b16_e32 v10, 5, v2
-; GFX8-NEXT:    v_and_b32_e32 v8, 1, v0
-; GFX8-NEXT:    v_lshrrev_b16_e32 v12, 3, v2
-; GFX8-NEXT:    v_and_b32_e32 v0, 1, v2
-; GFX8-NEXT:    v_lshrrev_b16_e32 v14, 7, v2
-; GFX8-NEXT:    v_lshrrev_b16_e32 v2, 1, v2
-; GFX8-NEXT:    v_and_b32_e32 v10, 1, v10
-; GFX8-NEXT:    v_and_b32_e32 v24, 1, v12
-; GFX8-NEXT:    v_and_b32_e32 v2, 1, v2
-; GFX8-NEXT:    v_and_b32_e32 v12, 1, v6
-; GFX8-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; GFX8-NEXT:    v_and_b32_e32 v6, 0xffff, v24
+; GFX8-NEXT:    v_lshrrev_b16_e32 v2, 2, v0
+; GFX8-NEXT:    v_and_b32_e32 v8, 1, v2
+; GFX8-NEXT:    v_lshrrev_b16_e32 v2, 4, v0
+; GFX8-NEXT:    v_lshrrev_b16_e32 v6, 6, v0
+; GFX8-NEXT:    v_lshrrev_b16_e32 v10, 5, v0
+; GFX8-NEXT:    v_and_b32_e32 v4, 1, v2
+; GFX8-NEXT:    v_lshrrev_b16_e32 v14, 3, v0
+; GFX8-NEXT:    v_and_b32_e32 v12, 1, v0
+; GFX8-NEXT:    v_lshrrev_b16_e32 v2, 7, v0
+; GFX8-NEXT:    v_lshrrev_b16_e32 v0, 1, v0
+; GFX8-NEXT:    v_and_b32_e32 v24, 1, v10
+; GFX8-NEXT:    v_and_b32_e32 v10, 1, v14
+; GFX8-NEXT:    v_and_b32_e32 v14, 1, v0
+; GFX8-NEXT:    v_and_b32_e32 v0, 1, v6
+; GFX8-NEXT:    v_and_b32_e32 v14, 0xffff, v14
 ; GFX8-NEXT:    v_and_b32_e32 v10, 0xffff, v10
-; GFX8-NEXT:    flat_store_dwordx4 v[18:19], v[12:15]
-; GFX8-NEXT:    flat_store_dwordx4 v[20:21], v[8:11]
-; GFX8-NEXT:    flat_store_dwordx4 v[22:23], v[4:7]
-; GFX8-NEXT:    flat_store_dwordx4 v[16:17], v[0:3]
+; GFX8-NEXT:    v_and_b32_e32 v6, 0xffff, v24
+; GFX8-NEXT:    flat_store_dwordx4 v[18:19], v[0:3]
+; GFX8-NEXT:    flat_store_dwordx4 v[20:21], v[4:7]
+; GFX8-NEXT:    flat_store_dwordx4 v[22:23], v[8:11]
+; GFX8-NEXT:    flat_store_dwordx4 v[16:17], v[12:15]
 ; GFX8-NEXT:    s_endpgm
 ;
 ; EG-LABEL: constant_zextload_v8i1_to_v8i64:
@@ -4558,101 +4558,102 @@ define amdgpu_kernel void @constant_zextload_v16i1_to_v16i64(ptr addrspace(1) %o
 ; GFX8-LABEL: constant_zextload_v16i1_to_v16i64:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX8-NEXT:    v_mov_b32_e32 v20, 0
-; GFX8-NEXT:    v_mov_b32_e32 v19, 0
-; GFX8-NEXT:    v_mov_b32_e32 v17, v20
-; GFX8-NEXT:    v_mov_b32_e32 v22, v20
+; GFX8-NEXT:    v_mov_b32_e32 v2, 0
+; GFX8-NEXT:    v_mov_b32_e32 v6, v2
+; GFX8-NEXT:    v_mov_b32_e32 v8, v2
+; GFX8-NEXT:    v_mov_b32_e32 v4, v2
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s3
-; GFX8-NEXT:    flat_load_ushort v2, v[0:1]
+; GFX8-NEXT:    flat_load_ushort v0, v[0:1]
 ; GFX8-NEXT:    s_add_u32 s2, s0, 0x70
 ; GFX8-NEXT:    s_addc_u32 s3, s1, 0
 ; GFX8-NEXT:    s_add_u32 s4, s0, 0x50
 ; GFX8-NEXT:    s_addc_u32 s5, s1, 0
-; GFX8-NEXT:    v_mov_b32_e32 v0, s4
-; GFX8-NEXT:    v_mov_b32_e32 v1, s5
-; GFX8-NEXT:    v_mov_b32_e32 v23, v20
-; GFX8-NEXT:    v_mov_b32_e32 v13, v20
-; GFX8-NEXT:    v_mov_b32_e32 v9, v20
-; GFX8-NEXT:    v_mov_b32_e32 v5, v20
-; GFX8-NEXT:    v_mov_b32_e32 v25, 0
-; GFX8-NEXT:    v_mov_b32_e32 v15, 0
-; GFX8-NEXT:    v_mov_b32_e32 v3, 0
-; GFX8-NEXT:    v_mov_b32_e32 v7, 0
-; GFX8-NEXT:    v_mov_b32_e32 v11, 0
+; GFX8-NEXT:    v_mov_b32_e32 v23, s5
+; GFX8-NEXT:    v_mov_b32_e32 v22, s4
+; GFX8-NEXT:    v_mov_b32_e32 v9, v2
+; GFX8-NEXT:    v_mov_b32_e32 v11, v2
+; GFX8-NEXT:    v_mov_b32_e32 v12, v2
+; GFX8-NEXT:    v_mov_b32_e32 v14, v2
+; GFX8-NEXT:    v_mov_b32_e32 v15, v2
+; GFX8-NEXT:    v_mov_b32_e32 v17, v2
+; GFX8-NEXT:    v_mov_b32_e32 v19, v2
+; GFX8-NEXT:    v_mov_b32_e32 v21, v2
+; GFX8-NEXT:    v_mov_b32_e32 v25, v2
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    v_lshrrev_b16_e32 v4, 10, v2
-; GFX8-NEXT:    v_and_b32_e32 v16, 1, v4
-; GFX8-NEXT:    v_lshrrev_b16_e32 v4, 11, v2
-; GFX8-NEXT:    v_and_b32_e32 v4, 1, v4
-; GFX8-NEXT:    v_and_b32_e32 v18, 0xffff, v4
-; GFX8-NEXT:    flat_store_dwordx4 v[0:1], v[16:19]
-; GFX8-NEXT:    v_lshrrev_b16_e32 v4, 14, v2
-; GFX8-NEXT:    v_mov_b32_e32 v17, s3
-; GFX8-NEXT:    v_mov_b32_e32 v16, s2
+; GFX8-NEXT:    v_lshrrev_b16_e32 v1, 10, v0
+; GFX8-NEXT:    v_and_b32_e32 v5, 1, v1
+; GFX8-NEXT:    v_lshrrev_b16_e32 v1, 11, v0
+; GFX8-NEXT:    v_and_b32_e32 v1, 1, v1
+; GFX8-NEXT:    v_and_b32_e32 v7, 0xffff, v1
+; GFX8-NEXT:    flat_store_dwordx4 v[22:23], v[5:8]
+; GFX8-NEXT:    v_mov_b32_e32 v23, s3
+; GFX8-NEXT:    v_lshrrev_b16_e32 v1, 14, v0
+; GFX8-NEXT:    v_mov_b32_e32 v22, s2
+; GFX8-NEXT:    v_lshrrev_b16_e32 v3, 15, v0
+; GFX8-NEXT:    v_and_b32_e32 v1, 1, v1
 ; GFX8-NEXT:    s_add_u32 s2, s0, 64
-; GFX8-NEXT:    v_mov_b32_e32 v0, 1
-; GFX8-NEXT:    v_and_b32_e32 v19, 1, v4
-; GFX8-NEXT:    v_lshrrev_b16_e32 v21, 15, v2
+; GFX8-NEXT:    v_mov_b32_e32 v5, v2
+; GFX8-NEXT:    v_mov_b32_e32 v7, v2
+; GFX8-NEXT:    flat_store_dwordx4 v[22:23], v[1:4]
+; GFX8-NEXT:    v_mov_b32_e32 v23, v2
+; GFX8-NEXT:    v_mov_b32_e32 v3, 1
 ; GFX8-NEXT:    s_addc_u32 s3, s1, 0
-; GFX8-NEXT:    flat_store_dwordx4 v[16:17], v[19:22]
-; GFX8-NEXT:    v_mov_b32_e32 v17, s3
-; GFX8-NEXT:    v_and_b32_sdwa v19, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
-; GFX8-NEXT:    v_lshrrev_b16_e32 v0, 9, v2
-; GFX8-NEXT:    v_mov_b32_e32 v16, s2
-; GFX8-NEXT:    v_and_b32_e32 v0, 1, v0
+; GFX8-NEXT:    v_mov_b32_e32 v1, s2
+; GFX8-NEXT:    v_mov_b32_e32 v2, s3
+; GFX8-NEXT:    v_and_b32_sdwa v8, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
+; GFX8-NEXT:    v_lshrrev_b16_e32 v3, 9, v0
 ; GFX8-NEXT:    s_add_u32 s2, s0, 0x60
-; GFX8-NEXT:    v_mov_b32_e32 v22, 0
-; GFX8-NEXT:    v_and_b32_e32 v21, 0xffff, v0
+; GFX8-NEXT:    v_and_b32_e32 v3, 1, v3
 ; GFX8-NEXT:    s_addc_u32 s3, s1, 0
-; GFX8-NEXT:    flat_store_dwordx4 v[16:17], v[19:22]
-; GFX8-NEXT:    v_mov_b32_e32 v1, v20
-; GFX8-NEXT:    v_mov_b32_e32 v19, s3
-; GFX8-NEXT:    v_mov_b32_e32 v18, s2
+; GFX8-NEXT:    v_and_b32_e32 v10, 0xffff, v3
+; GFX8-NEXT:    v_mov_b32_e32 v4, s3
+; GFX8-NEXT:    v_mov_b32_e32 v3, s2
 ; GFX8-NEXT:    s_add_u32 s2, s0, 48
 ; GFX8-NEXT:    s_addc_u32 s3, s1, 0
-; GFX8-NEXT:    v_mov_b32_e32 v21, s3
-; GFX8-NEXT:    v_lshrrev_b16_e32 v0, 12, v2
-; GFX8-NEXT:    v_mov_b32_e32 v20, s2
+; GFX8-NEXT:    v_lshrrev_b16_e32 v6, 12, v0
+; GFX8-NEXT:    flat_store_dwordx4 v[1:2], v[8:11]
+; GFX8-NEXT:    v_mov_b32_e32 v2, s1
+; GFX8-NEXT:    v_mov_b32_e32 v9, s3
+; GFX8-NEXT:    v_and_b32_e32 v11, 1, v6
+; GFX8-NEXT:    v_lshrrev_b16_e32 v6, 13, v0
+; GFX8-NEXT:    v_mov_b32_e32 v8, s2
 ; GFX8-NEXT:    s_add_u32 s2, s0, 32
-; GFX8-NEXT:    v_and_b32_e32 v22, 1, v0
-; GFX8-NEXT:    v_lshrrev_b16_e32 v0, 13, v2
-; GFX8-NEXT:    v_mov_b32_e32 v17, s1
+; GFX8-NEXT:    v_and_b32_e32 v6, 1, v6
 ; GFX8-NEXT:    s_addc_u32 s3, s1, 0
-; GFX8-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX8-NEXT:    v_mov_b32_e32 v16, s0
-; GFX8-NEXT:    v_and_b32_e32 v24, 0xffff, v0
+; GFX8-NEXT:    v_and_b32_e32 v13, 0xffff, v6
+; GFX8-NEXT:    v_mov_b32_e32 v1, s0
+; GFX8-NEXT:    flat_store_dwordx4 v[3:4], v[11:14]
 ; GFX8-NEXT:    s_add_u32 s0, s0, 16
-; GFX8-NEXT:    v_lshrrev_b16_e32 v6, 7, v2
-; GFX8-NEXT:    v_lshrrev_b16_e32 v0, 6, v2
-; GFX8-NEXT:    flat_store_dwordx4 v[18:19], v[22:25]
+; GFX8-NEXT:    v_lshrrev_b16_e32 v3, 7, v0
+; GFX8-NEXT:    v_lshrrev_b16_e32 v6, 6, v0
+; GFX8-NEXT:    v_lshrrev_b16_e32 v14, 4, v0
+; GFX8-NEXT:    v_lshrrev_b16_e32 v16, 5, v0
 ; GFX8-NEXT:    s_addc_u32 s1, s1, 0
-; GFX8-NEXT:    v_lshrrev_b16_e32 v4, 2, v2
-; GFX8-NEXT:    v_lshrrev_b16_e32 v8, 4, v2
-; GFX8-NEXT:    v_lshrrev_b16_e32 v10, 5, v2
-; GFX8-NEXT:    v_and_b32_e32 v12, 1, v0
-; GFX8-NEXT:    v_lshrrev_b16_e32 v14, 3, v2
-; GFX8-NEXT:    v_and_b32_e32 v0, 1, v2
-; GFX8-NEXT:    v_lshrrev_b16_e32 v2, 1, v2
-; GFX8-NEXT:    v_and_b32_e32 v24, 1, v6
-; GFX8-NEXT:    v_mov_b32_e32 v19, s3
-; GFX8-NEXT:    v_mov_b32_e32 v23, s1
-; GFX8-NEXT:    v_and_b32_e32 v10, 1, v10
-; GFX8-NEXT:    v_and_b32_e32 v6, 1, v14
-; GFX8-NEXT:    v_and_b32_e32 v2, 1, v2
-; GFX8-NEXT:    v_and_b32_e32 v14, 0xffff, v24
-; GFX8-NEXT:    v_mov_b32_e32 v18, s2
-; GFX8-NEXT:    v_mov_b32_e32 v22, s0
+; GFX8-NEXT:    v_lshrrev_b16_e32 v4, 2, v0
+; GFX8-NEXT:    v_and_b32_e32 v18, 1, v14
+; GFX8-NEXT:    v_and_b32_e32 v14, 1, v6
+; GFX8-NEXT:    v_lshrrev_b16_e32 v6, 3, v0
+; GFX8-NEXT:    v_and_b32_e32 v22, 1, v0
+; GFX8-NEXT:    v_lshrrev_b16_e32 v0, 1, v0
+; GFX8-NEXT:    v_and_b32_e32 v3, 1, v3
+; GFX8-NEXT:    v_and_b32_e32 v16, 1, v16
+; GFX8-NEXT:    v_mov_b32_e32 v11, s3
+; GFX8-NEXT:    v_mov_b32_e32 v13, s1
+; GFX8-NEXT:    v_and_b32_e32 v6, 1, v6
+; GFX8-NEXT:    v_and_b32_e32 v0, 1, v0
+; GFX8-NEXT:    v_and_b32_e32 v20, 0xffff, v16
+; GFX8-NEXT:    v_and_b32_e32 v16, 0xffff, v3
+; GFX8-NEXT:    v_mov_b32_e32 v10, s2
+; GFX8-NEXT:    v_mov_b32_e32 v12, s0
 ; GFX8-NEXT:    v_and_b32_e32 v4, 1, v4
-; GFX8-NEXT:    v_and_b32_e32 v8, 1, v8
-; GFX8-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; GFX8-NEXT:    v_and_b32_e32 v24, 0xffff, v0
 ; GFX8-NEXT:    v_and_b32_e32 v6, 0xffff, v6
-; GFX8-NEXT:    v_and_b32_e32 v10, 0xffff, v10
-; GFX8-NEXT:    flat_store_dwordx4 v[20:21], v[12:15]
-; GFX8-NEXT:    flat_store_dwordx4 v[18:19], v[8:11]
-; GFX8-NEXT:    flat_store_dwordx4 v[22:23], v[4:7]
-; GFX8-NEXT:    flat_store_dwordx4 v[16:17], v[0:3]
+; GFX8-NEXT:    flat_store_dwordx4 v[8:9], v[14:17]
+; GFX8-NEXT:    flat_store_dwordx4 v[10:11], v[18:21]
+; GFX8-NEXT:    flat_store_dwordx4 v[12:13], v[4:7]
+; GFX8-NEXT:    flat_store_dwordx4 v[1:2], v[22:25]
 ; GFX8-NEXT:    s_endpgm
 ;
 ; EG-LABEL: constant_zextload_v16i1_to_v16i64:
@@ -5126,18 +5127,20 @@ define amdgpu_kernel void @constant_zextload_v32i1_to_v32i64(ptr addrspace(1) %o
 ; GFX8-LABEL: constant_zextload_v32i1_to_v32i64:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX8-NEXT:    v_mov_b32_e32 v26, 0
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    s_load_dword s2, s[2:3], 0x0
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    v_lshrrev_b16_e64 v0, 13, s2
-; GFX8-NEXT:    v_lshrrev_b16_e64 v1, 11, s2
-; GFX8-NEXT:    v_and_b32_e32 v16, 1, v0
-; GFX8-NEXT:    v_lshrrev_b16_e64 v0, 9, s2
+; GFX8-NEXT:    v_lshrrev_b16_e64 v1, 13, s2
+; GFX8-NEXT:    v_and_b32_e32 v15, 1, v1
+; GFX8-NEXT:    v_lshrrev_b16_e64 v1, 9, s2
+; GFX8-NEXT:    v_lshrrev_b16_e64 v2, 11, s2
+; GFX8-NEXT:    v_and_b32_e32 v8, 1, v1
+; GFX8-NEXT:    v_lshrrev_b16_e64 v1, 7, s2
+; GFX8-NEXT:    v_and_b32_e32 v11, 1, v2
+; GFX8-NEXT:    v_and_b32_e32 v2, 1, v1
+; GFX8-NEXT:    v_lshrrev_b16_e64 v1, 5, s2
 ; GFX8-NEXT:    s_lshr_b32 s14, s2, 24
-; GFX8-NEXT:    v_and_b32_e32 v11, 1, v1
-; GFX8-NEXT:    v_and_b32_e32 v8, 1, v0
-; GFX8-NEXT:    v_lshrrev_b16_e64 v0, 7, s2
+; GFX8-NEXT:    v_and_b32_e32 v5, 1, v1
 ; GFX8-NEXT:    v_lshrrev_b16_e64 v1, 3, s2
 ; GFX8-NEXT:    s_bfe_u32 s10, s2, 0x10018
 ; GFX8-NEXT:    s_and_b32 s11, s2, 1
@@ -5149,18 +5152,16 @@ define amdgpu_kernel void @constant_zextload_v32i1_to_v32i64(ptr addrspace(1) %o
 ; GFX8-NEXT:    s_bfe_u32 s20, s2, 0x10015
 ; GFX8-NEXT:    s_bfe_u32 s21, s2, 0x10016
 ; GFX8-NEXT:    s_bfe_u32 s22, s2, 0x10017
-; GFX8-NEXT:    v_mov_b32_e32 v4, s2
-; GFX8-NEXT:    v_lshrrev_b16_e64 v14, 14, s2
-; GFX8-NEXT:    v_lshrrev_b16_e64 v15, 12, s2
+; GFX8-NEXT:    v_mov_b32_e32 v0, s2
+; GFX8-NEXT:    v_lshrrev_b16_e64 v4, 14, s2
+; GFX8-NEXT:    v_lshrrev_b16_e64 v14, 12, s2
 ; GFX8-NEXT:    v_lshrrev_b16_e64 v9, 10, s2
-; GFX8-NEXT:    v_and_b32_e32 v3, 1, v0
-; GFX8-NEXT:    v_lshrrev_b16_e64 v5, 6, s2
-; GFX8-NEXT:    v_lshrrev_b16_e64 v0, 5, s2
-; GFX8-NEXT:    v_lshrrev_b16_e64 v2, 4, s2
-; GFX8-NEXT:    v_and_b32_e32 v6, 1, v1
-; GFX8-NEXT:    v_lshrrev_b16_e64 v7, 2, s2
+; GFX8-NEXT:    v_lshrrev_b16_e64 v3, 6, s2
+; GFX8-NEXT:    v_lshrrev_b16_e64 v7, 4, s2
+; GFX8-NEXT:    v_and_b32_e32 v10, 1, v1
+; GFX8-NEXT:    v_lshrrev_b16_e64 v12, 2, s2
 ; GFX8-NEXT:    v_lshrrev_b16_e64 v1, 1, s2
-; GFX8-NEXT:    v_lshrrev_b16_e64 v22, 15, s2
+; GFX8-NEXT:    v_lshrrev_b16_e64 v20, 15, s2
 ; GFX8-NEXT:    s_add_u32 s2, s0, 0xb0
 ; GFX8-NEXT:    s_addc_u32 s3, s1, 0
 ; GFX8-NEXT:    s_add_u32 s4, s0, 0xa0
@@ -5170,137 +5171,132 @@ define amdgpu_kernel void @constant_zextload_v32i1_to_v32i64(ptr addrspace(1) %o
 ; GFX8-NEXT:    s_add_u32 s8, s0, 0x80
 ; GFX8-NEXT:    s_addc_u32 s9, s1, 0
 ; GFX8-NEXT:    s_add_u32 s12, s0, 0x70
-; GFX8-NEXT:    v_and_b32_e32 v10, 1, v1
+; GFX8-NEXT:    v_and_b32_e32 v16, 1, v1
 ; GFX8-NEXT:    v_lshrrev_b16_e64 v1, 5, s14
 ; GFX8-NEXT:    s_addc_u32 s13, s1, 0
-; GFX8-NEXT:    v_and_b32_e32 v12, 1, v1
+; GFX8-NEXT:    v_and_b32_e32 v17, 1, v1
 ; GFX8-NEXT:    v_lshrrev_b16_e64 v1, 3, s14
-; GFX8-NEXT:    v_mov_b32_e32 v25, s13
-; GFX8-NEXT:    v_and_b32_e32 v18, 1, v1
+; GFX8-NEXT:    v_mov_b32_e32 v23, s13
+; GFX8-NEXT:    v_and_b32_e32 v25, 1, v1
 ; GFX8-NEXT:    v_mov_b32_e32 v1, 0
-; GFX8-NEXT:    v_mov_b32_e32 v24, s12
+; GFX8-NEXT:    v_mov_b32_e32 v22, s12
 ; GFX8-NEXT:    s_add_u32 s12, s0, 0xf0
-; GFX8-NEXT:    v_and_b32_e32 v20, 1, v14
+; GFX8-NEXT:    v_and_b32_e32 v18, 1, v4
+; GFX8-NEXT:    v_mov_b32_e32 v19, v1
 ; GFX8-NEXT:    v_mov_b32_e32 v21, v1
-; GFX8-NEXT:    v_mov_b32_e32 v23, v1
 ; GFX8-NEXT:    s_addc_u32 s13, s1, 0
-; GFX8-NEXT:    flat_store_dwordx4 v[24:25], v[20:23]
-; GFX8-NEXT:    v_mov_b32_e32 v25, s13
-; GFX8-NEXT:    v_lshrrev_b16_e64 v17, 6, s14
-; GFX8-NEXT:    v_mov_b32_e32 v24, s12
+; GFX8-NEXT:    flat_store_dwordx4 v[22:23], v[18:21]
+; GFX8-NEXT:    v_mov_b32_e32 v23, s13
+; GFX8-NEXT:    v_lshrrev_b16_e64 v6, 6, s14
+; GFX8-NEXT:    v_mov_b32_e32 v22, s12
 ; GFX8-NEXT:    s_add_u32 s12, s0, 0x60
-; GFX8-NEXT:    v_and_b32_e32 v20, 1, v17
-; GFX8-NEXT:    v_lshrrev_b16_e64 v22, 7, s14
+; GFX8-NEXT:    v_and_b32_e32 v18, 1, v6
+; GFX8-NEXT:    v_lshrrev_b16_e64 v20, 7, s14
 ; GFX8-NEXT:    s_addc_u32 s13, s1, 0
-; GFX8-NEXT:    flat_store_dwordx4 v[24:25], v[20:23]
-; GFX8-NEXT:    v_and_b32_e32 v25, 0xffff, v11
-; GFX8-NEXT:    v_and_b32_e32 v20, 1, v15
-; GFX8-NEXT:    v_and_b32_e32 v22, 0xffff, v16
-; GFX8-NEXT:    v_mov_b32_e32 v16, s13
-; GFX8-NEXT:    v_mov_b32_e32 v15, s12
+; GFX8-NEXT:    flat_store_dwordx4 v[22:23], v[18:21]
+; GFX8-NEXT:    v_lshrrev_b16_e64 v24, 4, s14
+; GFX8-NEXT:    v_and_b32_e32 v18, 1, v14
+; GFX8-NEXT:    v_and_b32_e32 v20, 0xffff, v15
+; GFX8-NEXT:    v_mov_b32_e32 v15, s13
+; GFX8-NEXT:    v_mov_b32_e32 v14, s12
 ; GFX8-NEXT:    s_add_u32 s12, s0, 0x50
-; GFX8-NEXT:    v_mov_b32_e32 v23, 0
 ; GFX8-NEXT:    s_addc_u32 s13, s1, 0
-; GFX8-NEXT:    flat_store_dwordx4 v[15:16], v[20:23]
-; GFX8-NEXT:    v_mov_b32_e32 v16, s13
-; GFX8-NEXT:    v_and_b32_e32 v23, 1, v9
-; GFX8-NEXT:    v_mov_b32_e32 v24, v1
-; GFX8-NEXT:    v_mov_b32_e32 v15, s12
+; GFX8-NEXT:    v_mov_b32_e32 v23, s13
+; GFX8-NEXT:    v_mov_b32_e32 v22, s12
 ; GFX8-NEXT:    s_add_u32 s12, s0, 64
-; GFX8-NEXT:    flat_store_dwordx4 v[15:16], v[23:26]
-; GFX8-NEXT:    v_mov_b32_e32 v15, 1
+; GFX8-NEXT:    flat_store_dwordx4 v[14:15], v[18:21]
 ; GFX8-NEXT:    s_addc_u32 s13, s1, 0
-; GFX8-NEXT:    v_and_b32_sdwa v23, v4, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
-; GFX8-NEXT:    v_mov_b32_e32 v16, s13
-; GFX8-NEXT:    v_mov_b32_e32 v15, s12
-; GFX8-NEXT:    s_add_u32 s12, s0, 48
-; GFX8-NEXT:    v_and_b32_e32 v25, 0xffff, v8
-; GFX8-NEXT:    v_mov_b32_e32 v26, 0
-; GFX8-NEXT:    s_addc_u32 s13, s1, 0
-; GFX8-NEXT:    flat_store_dwordx4 v[15:16], v[23:26]
-; GFX8-NEXT:    v_mov_b32_e32 v16, s13
-; GFX8-NEXT:    v_and_b32_e32 v23, 1, v5
-; GFX8-NEXT:    v_and_b32_e32 v25, 0xffff, v3
-; GFX8-NEXT:    v_mov_b32_e32 v26, 0
-; GFX8-NEXT:    v_mov_b32_e32 v15, s12
-; GFX8-NEXT:    v_and_b32_e32 v0, 1, v0
-; GFX8-NEXT:    flat_store_dwordx4 v[15:16], v[23:26]
-; GFX8-NEXT:    v_mov_b32_e32 v16, s3
-; GFX8-NEXT:    v_and_b32_e32 v23, 1, v2
-; GFX8-NEXT:    v_and_b32_e32 v25, 0xffff, v0
+; GFX8-NEXT:    v_and_b32_e32 v18, 1, v9
+; GFX8-NEXT:    v_and_b32_e32 v20, 0xffff, v11
+; GFX8-NEXT:    flat_store_dwordx4 v[22:23], v[18:21]
+; GFX8-NEXT:    v_and_b32_e32 v11, 0xffff, v10
+; GFX8-NEXT:    v_mov_b32_e32 v10, 1
+; GFX8-NEXT:    v_mov_b32_e32 v23, s13
+; GFX8-NEXT:    v_and_b32_sdwa v18, v0, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
+; GFX8-NEXT:    v_and_b32_e32 v20, 0xffff, v8
+; GFX8-NEXT:    v_mov_b32_e32 v22, s12
+; GFX8-NEXT:    flat_store_dwordx4 v[22:23], v[18:21]
+; GFX8-NEXT:    v_and_b32_e32 v23, 0xffff, v2
+; GFX8-NEXT:    v_and_b32_e32 v18, 1, v7
+; GFX8-NEXT:    v_mov_b32_e32 v8, s3
+; GFX8-NEXT:    v_and_b32_e32 v21, 1, v3
 ; GFX8-NEXT:    v_mov_b32_e32 v0, s21
 ; GFX8-NEXT:    v_mov_b32_e32 v2, s22
 ; GFX8-NEXT:    v_mov_b32_e32 v3, v1
-; GFX8-NEXT:    v_mov_b32_e32 v15, s2
-; GFX8-NEXT:    flat_store_dwordx4 v[15:16], v[0:3]
-; GFX8-NEXT:    v_mov_b32_e32 v16, s5
+; GFX8-NEXT:    v_mov_b32_e32 v7, s2
+; GFX8-NEXT:    flat_store_dwordx4 v[7:8], v[0:3]
+; GFX8-NEXT:    v_mov_b32_e32 v8, s5
 ; GFX8-NEXT:    v_mov_b32_e32 v0, s19
 ; GFX8-NEXT:    v_mov_b32_e32 v2, s20
-; GFX8-NEXT:    v_mov_b32_e32 v15, s4
-; GFX8-NEXT:    flat_store_dwordx4 v[15:16], v[0:3]
-; GFX8-NEXT:    v_mov_b32_e32 v16, s7
+; GFX8-NEXT:    v_mov_b32_e32 v7, s4
+; GFX8-NEXT:    flat_store_dwordx4 v[7:8], v[0:3]
+; GFX8-NEXT:    v_mov_b32_e32 v8, s7
 ; GFX8-NEXT:    v_mov_b32_e32 v0, s17
 ; GFX8-NEXT:    v_mov_b32_e32 v2, s18
-; GFX8-NEXT:    v_mov_b32_e32 v15, s6
-; GFX8-NEXT:    flat_store_dwordx4 v[15:16], v[0:3]
-; GFX8-NEXT:    v_mov_b32_e32 v16, s9
+; GFX8-NEXT:    v_mov_b32_e32 v7, s6
+; GFX8-NEXT:    flat_store_dwordx4 v[7:8], v[0:3]
+; GFX8-NEXT:    v_mov_b32_e32 v7, s8
 ; GFX8-NEXT:    v_mov_b32_e32 v0, s16
 ; GFX8-NEXT:    v_mov_b32_e32 v2, s15
-; GFX8-NEXT:    v_mov_b32_e32 v15, s8
+; GFX8-NEXT:    v_mov_b32_e32 v8, s9
+; GFX8-NEXT:    s_add_u32 s2, s0, 48
+; GFX8-NEXT:    flat_store_dwordx4 v[7:8], v[0:3]
+; GFX8-NEXT:    s_addc_u32 s3, s1, 0
+; GFX8-NEXT:    v_mov_b32_e32 v2, s2
+; GFX8-NEXT:    v_and_b32_e32 v15, 1, v24
+; GFX8-NEXT:    v_mov_b32_e32 v22, v1
+; GFX8-NEXT:    v_mov_b32_e32 v24, v1
+; GFX8-NEXT:    v_mov_b32_e32 v3, s3
 ; GFX8-NEXT:    s_add_u32 s2, s0, 32
-; GFX8-NEXT:    flat_store_dwordx4 v[15:16], v[0:3]
+; GFX8-NEXT:    flat_store_dwordx4 v[2:3], v[21:24]
 ; GFX8-NEXT:    s_addc_u32 s3, s1, 0
 ; GFX8-NEXT:    v_mov_b32_e32 v2, s2
-; GFX8-NEXT:    v_mov_b32_e32 v26, 0
+; GFX8-NEXT:    v_and_b32_e32 v20, 0xffff, v5
+; GFX8-NEXT:    v_mov_b32_e32 v21, v1
 ; GFX8-NEXT:    v_mov_b32_e32 v3, s3
 ; GFX8-NEXT:    s_add_u32 s2, s0, 16
-; GFX8-NEXT:    flat_store_dwordx4 v[2:3], v[23:26]
+; GFX8-NEXT:    flat_store_dwordx4 v[2:3], v[18:21]
 ; GFX8-NEXT:    s_addc_u32 s3, s1, 0
 ; GFX8-NEXT:    v_mov_b32_e32 v2, s2
-; GFX8-NEXT:    v_lshrrev_b16_e64 v13, 4, s14
-; GFX8-NEXT:    v_and_b32_e32 v4, 1, v7
-; GFX8-NEXT:    v_and_b32_e32 v6, 0xffff, v6
-; GFX8-NEXT:    v_mov_b32_e32 v7, 0
-; GFX8-NEXT:    v_mov_b32_e32 v5, v1
+; GFX8-NEXT:    v_and_b32_e32 v14, 0xffff, v16
+; GFX8-NEXT:    v_and_b32_e32 v9, 1, v12
+; GFX8-NEXT:    v_mov_b32_e32 v10, v1
+; GFX8-NEXT:    v_mov_b32_e32 v12, v1
 ; GFX8-NEXT:    v_mov_b32_e32 v3, s3
-; GFX8-NEXT:    v_and_b32_e32 v9, 1, v13
-; GFX8-NEXT:    v_and_b32_e32 v10, 0xffff, v10
-; GFX8-NEXT:    v_mov_b32_e32 v13, 0
-; GFX8-NEXT:    flat_store_dwordx4 v[2:3], v[4:7]
+; GFX8-NEXT:    v_mov_b32_e32 v8, s1
+; GFX8-NEXT:    flat_store_dwordx4 v[2:3], v[9:12]
 ; GFX8-NEXT:    v_mov_b32_e32 v0, s11
-; GFX8-NEXT:    v_mov_b32_e32 v5, s1
-; GFX8-NEXT:    v_mov_b32_e32 v2, v10
-; GFX8-NEXT:    v_mov_b32_e32 v3, v13
-; GFX8-NEXT:    v_mov_b32_e32 v4, s0
+; GFX8-NEXT:    v_mov_b32_e32 v2, v14
+; GFX8-NEXT:    v_mov_b32_e32 v3, v1
+; GFX8-NEXT:    v_mov_b32_e32 v7, s0
 ; GFX8-NEXT:    s_add_u32 s2, s0, 0xe0
-; GFX8-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
+; GFX8-NEXT:    flat_store_dwordx4 v[7:8], v[0:3]
 ; GFX8-NEXT:    s_addc_u32 s3, s1, 0
 ; GFX8-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX8-NEXT:    v_mov_b32_e32 v3, s3
 ; GFX8-NEXT:    s_add_u32 s2, s0, 0xd0
+; GFX8-NEXT:    v_lshrrev_b16_e64 v4, 1, s14
+; GFX8-NEXT:    v_and_b32_e32 v17, 0xffff, v17
+; GFX8-NEXT:    v_mov_b32_e32 v16, v1
+; GFX8-NEXT:    v_mov_b32_e32 v18, v1
 ; GFX8-NEXT:    s_addc_u32 s3, s1, 0
-; GFX8-NEXT:    v_lshrrev_b16_e64 v14, 1, s14
-; GFX8-NEXT:    v_and_b32_e32 v11, 0xffff, v12
-; GFX8-NEXT:    v_mov_b32_e32 v12, 0
-; GFX8-NEXT:    v_mov_b32_e32 v10, v1
-; GFX8-NEXT:    s_add_u32 s0, s0, 0xc0
-; GFX8-NEXT:    v_lshrrev_b16_e64 v19, 2, s14
-; GFX8-NEXT:    v_and_b32_e32 v14, 1, v14
-; GFX8-NEXT:    flat_store_dwordx4 v[2:3], v[9:12]
+; GFX8-NEXT:    v_lshrrev_b16_e64 v26, 2, s14
+; GFX8-NEXT:    v_and_b32_e32 v4, 1, v4
+; GFX8-NEXT:    flat_store_dwordx4 v[2:3], v[15:18]
 ; GFX8-NEXT:    v_mov_b32_e32 v2, s2
-; GFX8-NEXT:    s_addc_u32 s1, s1, 0
-; GFX8-NEXT:    v_and_b32_e32 v14, 0xffff, v14
-; GFX8-NEXT:    v_mov_b32_e32 v17, 0
-; GFX8-NEXT:    v_and_b32_e32 v19, 1, v19
-; GFX8-NEXT:    v_and_b32_e32 v21, 0xffff, v18
-; GFX8-NEXT:    v_mov_b32_e32 v22, 0
-; GFX8-NEXT:    v_mov_b32_e32 v20, v1
+; GFX8-NEXT:    s_add_u32 s0, s0, 0xc0
+; GFX8-NEXT:    v_and_b32_e32 v13, 0xffff, v4
+; GFX8-NEXT:    v_and_b32_e32 v4, 1, v26
+; GFX8-NEXT:    v_and_b32_e32 v6, 0xffff, v25
+; GFX8-NEXT:    v_mov_b32_e32 v5, v1
+; GFX8-NEXT:    v_mov_b32_e32 v7, v1
 ; GFX8-NEXT:    v_mov_b32_e32 v3, s3
-; GFX8-NEXT:    v_mov_b32_e32 v5, s1
-; GFX8-NEXT:    flat_store_dwordx4 v[2:3], v[19:22]
+; GFX8-NEXT:    s_addc_u32 s1, s1, 0
+; GFX8-NEXT:    flat_store_dwordx4 v[2:3], v[4:7]
 ; GFX8-NEXT:    v_mov_b32_e32 v0, s10
-; GFX8-NEXT:    v_mov_b32_e32 v2, v14
-; GFX8-NEXT:    v_mov_b32_e32 v3, v17
+; GFX8-NEXT:    v_mov_b32_e32 v5, s1
+; GFX8-NEXT:    v_mov_b32_e32 v2, v13
+; GFX8-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX8-NEXT:    v_mov_b32_e32 v4, s0
 ; GFX8-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
 ; GFX8-NEXT:    s_endpgm
@@ -6228,32 +6224,31 @@ define amdgpu_kernel void @constant_zextload_v64i1_to_v64i64(ptr addrspace(1) %o
 ; GFX8-LABEL: constant_zextload_v64i1_to_v64i64:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
-; GFX8-NEXT:    v_mov_b32_e32 v28, 0
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    v_lshrrev_b16_e64 v1, 13, s2
-; GFX8-NEXT:    v_and_b32_e32 v18, 1, v1
-; GFX8-NEXT:    v_lshrrev_b16_e64 v1, 11, s2
-; GFX8-NEXT:    v_and_b32_e32 v16, 1, v1
-; GFX8-NEXT:    v_lshrrev_b16_e64 v1, 9, s2
-; GFX8-NEXT:    v_and_b32_e32 v15, 1, v1
-; GFX8-NEXT:    v_lshrrev_b16_e64 v1, 7, s2
-; GFX8-NEXT:    v_and_b32_e32 v13, 1, v1
-; GFX8-NEXT:    v_lshrrev_b16_e64 v1, 5, s2
-; GFX8-NEXT:    v_and_b32_e32 v10, 1, v1
-; GFX8-NEXT:    v_lshrrev_b16_e64 v1, 3, s2
+; GFX8-NEXT:    v_lshrrev_b16_e64 v0, 13, s2
+; GFX8-NEXT:    v_and_b32_e32 v18, 1, v0
+; GFX8-NEXT:    v_lshrrev_b16_e64 v0, 11, s2
+; GFX8-NEXT:    v_and_b32_e32 v16, 1, v0
+; GFX8-NEXT:    v_lshrrev_b16_e64 v0, 9, s2
+; GFX8-NEXT:    v_and_b32_e32 v15, 1, v0
+; GFX8-NEXT:    v_lshrrev_b16_e64 v0, 7, s2
+; GFX8-NEXT:    v_and_b32_e32 v13, 1, v0
+; GFX8-NEXT:    v_lshrrev_b16_e64 v0, 5, s2
+; GFX8-NEXT:    v_and_b32_e32 v10, 1, v0
+; GFX8-NEXT:    v_lshrrev_b16_e64 v0, 3, s2
 ; GFX8-NEXT:    v_mov_b32_e32 v12, s2
 ; GFX8-NEXT:    v_lshrrev_b16_e64 v2, 14, s2
-; GFX8-NEXT:    s_lshr_b32 s31, s3, 24
+; GFX8-NEXT:    s_lshr_b32 s33, s3, 24
 ; GFX8-NEXT:    s_lshr_b32 s24, s2, 24
 ; GFX8-NEXT:    v_lshrrev_b16_e64 v19, 12, s2
 ; GFX8-NEXT:    v_lshrrev_b16_e64 v17, 10, s2
 ; GFX8-NEXT:    v_lshrrev_b16_e64 v14, 6, s2
 ; GFX8-NEXT:    v_lshrrev_b16_e64 v11, 4, s2
-; GFX8-NEXT:    v_and_b32_e32 v7, 1, v1
-; GFX8-NEXT:    v_lshrrev_b16_e64 v8, 2, s2
-; GFX8-NEXT:    v_lshrrev_b16_e64 v1, 1, s2
+; GFX8-NEXT:    v_and_b32_e32 v8, 1, v0
+; GFX8-NEXT:    v_lshrrev_b16_e64 v9, 2, s2
+; GFX8-NEXT:    v_lshrrev_b16_e64 v0, 1, s2
 ; GFX8-NEXT:    s_bfe_u32 s20, s2, 0x10018
 ; GFX8-NEXT:    s_bfe_u32 s21, s3, 0x10018
 ; GFX8-NEXT:    s_and_b32 s22, s3, 1
@@ -6265,7 +6260,7 @@ define amdgpu_kernel void @constant_zextload_v64i1_to_v64i64(ptr addrspace(1) %o
 ; GFX8-NEXT:    s_bfe_u32 s28, s2, 0x10013
 ; GFX8-NEXT:    s_bfe_u32 s29, s2, 0x10014
 ; GFX8-NEXT:    s_bfe_u32 s30, s2, 0x10015
-; GFX8-NEXT:    s_bfe_u32 s33, s2, 0x10016
+; GFX8-NEXT:    s_bfe_u32 s31, s2, 0x10016
 ; GFX8-NEXT:    s_bfe_u32 s2, s2, 0x10017
 ; GFX8-NEXT:    s_bfe_u32 s34, s3, 0x10011
 ; GFX8-NEXT:    s_bfe_u32 s35, s3, 0x10010
@@ -6291,10 +6286,9 @@ define amdgpu_kernel void @constant_zextload_v64i1_to_v64i64(ptr addrspace(1) %o
 ; GFX8-NEXT:    s_addc_u32 s17, s1, 0
 ; GFX8-NEXT:    s_add_u32 s18, s0, 0x80
 ; GFX8-NEXT:    s_addc_u32 s19, s1, 0
-; GFX8-NEXT:    v_and_b32_e32 v9, 1, v1
 ; GFX8-NEXT:    v_lshrrev_b16_e64 v1, 13, s3
 ; GFX8-NEXT:    s_add_u32 s42, s0, 0x70
-; GFX8-NEXT:    v_and_b32_e32 v6, 1, v1
+; GFX8-NEXT:    v_and_b32_e32 v7, 1, v1
 ; GFX8-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX8-NEXT:    s_addc_u32 s43, s1, 0
 ; GFX8-NEXT:    v_mov_b32_e32 v23, s42
@@ -6306,277 +6300,269 @@ define amdgpu_kernel void @constant_zextload_v64i1_to_v64i64(ptr addrspace(1) %o
 ; GFX8-NEXT:    v_lshrrev_b16_e64 v22, 14, s3
 ; GFX8-NEXT:    flat_store_dwordx4 v[23:24], v[2:5]
 ; GFX8-NEXT:    s_addc_u32 s43, s1, 0
-; GFX8-NEXT:    v_mov_b32_e32 v24, s42
-; GFX8-NEXT:    v_and_b32_e32 v2, 1, v22
-; GFX8-NEXT:    v_lshrrev_b16_e64 v4, 15, s3
-; GFX8-NEXT:    v_mov_b32_e32 v25, s43
-; GFX8-NEXT:    flat_store_dwordx4 v[24:25], v[2:5]
-; GFX8-NEXT:    s_add_u32 s42, s0, 0x1f0
+; GFX8-NEXT:    v_mov_b32_e32 v2, s42
+; GFX8-NEXT:    v_and_b32_e32 v22, 1, v22
+; GFX8-NEXT:    v_lshrrev_b16_e64 v24, 15, s3
+; GFX8-NEXT:    v_mov_b32_e32 v23, v1
+; GFX8-NEXT:    v_mov_b32_e32 v25, v1
+; GFX8-NEXT:    v_mov_b32_e32 v3, s43
+; GFX8-NEXT:    flat_store_dwordx4 v[2:3], v[22:25]
 ; GFX8-NEXT:    v_lshrrev_b16_e64 v2, 11, s3
-; GFX8-NEXT:    v_lshrrev_b16_e64 v21, 6, s31
+; GFX8-NEXT:    s_add_u32 s42, s0, 0x1f0
+; GFX8-NEXT:    v_lshrrev_b16_e64 v21, 6, s33
 ; GFX8-NEXT:    v_and_b32_e32 v4, 1, v2
 ; GFX8-NEXT:    s_addc_u32 s43, s1, 0
 ; GFX8-NEXT:    v_mov_b32_e32 v2, s42
-; GFX8-NEXT:    v_and_b32_e32 v24, 1, v21
-; GFX8-NEXT:    v_lshrrev_b16_e64 v26, 7, s31
-; GFX8-NEXT:    v_mov_b32_e32 v25, v1
-; GFX8-NEXT:    v_mov_b32_e32 v27, v1
+; GFX8-NEXT:    v_and_b32_e32 v21, 1, v21
+; GFX8-NEXT:    v_lshrrev_b16_e64 v23, 7, s33
+; GFX8-NEXT:    v_mov_b32_e32 v22, v1
+; GFX8-NEXT:    v_mov_b32_e32 v24, v1
 ; GFX8-NEXT:    v_mov_b32_e32 v3, s43
 ; GFX8-NEXT:    s_add_u32 s42, s0, 0xf0
 ; GFX8-NEXT:    v_lshrrev_b16_e64 v20, 6, s24
-; GFX8-NEXT:    flat_store_dwordx4 v[2:3], v[24:27]
+; GFX8-NEXT:    flat_store_dwordx4 v[2:3], v[21:24]
 ; GFX8-NEXT:    s_addc_u32 s43, s1, 0
 ; GFX8-NEXT:    v_mov_b32_e32 v2, s42
-; GFX8-NEXT:    v_and_b32_e32 v24, 1, v20
-; GFX8-NEXT:    v_lshrrev_b16_e64 v26, 7, s24
-; GFX8-NEXT:    v_mov_b32_e32 v25, v1
+; GFX8-NEXT:    v_and_b32_e32 v22, 1, v20
+; GFX8-NEXT:    v_lshrrev_b16_e64 v24, 7, s24
+; GFX8-NEXT:    v_mov_b32_e32 v23, v1
 ; GFX8-NEXT:    v_mov_b32_e32 v3, s43
 ; GFX8-NEXT:    s_add_u32 s42, s0, 0x60
-; GFX8-NEXT:    flat_store_dwordx4 v[2:3], v[24:27]
+; GFX8-NEXT:    flat_store_dwordx4 v[2:3], v[22:25]
 ; GFX8-NEXT:    s_addc_u32 s43, s1, 0
-; GFX8-NEXT:    v_and_b32_e32 v24, 1, v19
-; GFX8-NEXT:    v_and_b32_e32 v26, 0xffff, v18
+; GFX8-NEXT:    v_and_b32_e32 v22, 1, v19
+; GFX8-NEXT:    v_and_b32_e32 v24, 0xffff, v18
 ; GFX8-NEXT:    v_mov_b32_e32 v18, s42
-; GFX8-NEXT:    v_mov_b32_e32 v27, 0
 ; GFX8-NEXT:    v_mov_b32_e32 v19, s43
 ; GFX8-NEXT:    s_add_u32 s42, s0, 0x50
-; GFX8-NEXT:    flat_store_dwordx4 v[18:19], v[24:27]
+; GFX8-NEXT:    flat_store_dwordx4 v[18:19], v[22:25]
 ; GFX8-NEXT:    s_addc_u32 s43, s1, 0
-; GFX8-NEXT:    v_and_b32_e32 v24, 1, v17
-; GFX8-NEXT:    v_and_b32_e32 v26, 0xffff, v16
+; GFX8-NEXT:    v_and_b32_e32 v22, 1, v17
+; GFX8-NEXT:    v_and_b32_e32 v24, 0xffff, v16
 ; GFX8-NEXT:    v_mov_b32_e32 v16, s42
-; GFX8-NEXT:    v_mov_b32_e32 v27, 0
 ; GFX8-NEXT:    v_mov_b32_e32 v17, s43
 ; GFX8-NEXT:    s_add_u32 s42, s0, 64
-; GFX8-NEXT:    flat_store_dwordx4 v[16:17], v[24:27]
-; GFX8-NEXT:    v_mov_b32_e32 v19, 1
-; GFX8-NEXT:    v_and_b32_e32 v26, 0xffff, v15
+; GFX8-NEXT:    flat_store_dwordx4 v[16:17], v[22:25]
+; GFX8-NEXT:    v_mov_b32_e32 v17, 1
 ; GFX8-NEXT:    s_addc_u32 s43, s1, 0
-; GFX8-NEXT:    v_mov_b32_e32 v15, s42
-; GFX8-NEXT:    v_and_b32_sdwa v24, v12, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
-; GFX8-NEXT:    v_mov_b32_e32 v27, 0
-; GFX8-NEXT:    v_mov_b32_e32 v16, s43
+; GFX8-NEXT:    v_mov_b32_e32 v26, s42
+; GFX8-NEXT:    v_and_b32_sdwa v22, v12, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
+; GFX8-NEXT:    v_and_b32_e32 v24, 0xffff, v15
+; GFX8-NEXT:    v_mov_b32_e32 v27, s43
 ; GFX8-NEXT:    s_add_u32 s42, s0, 48
-; GFX8-NEXT:    flat_store_dwordx4 v[15:16], v[24:27]
+; GFX8-NEXT:    flat_store_dwordx4 v[26:27], v[22:25]
 ; GFX8-NEXT:    s_addc_u32 s43, s1, 0
-; GFX8-NEXT:    v_and_b32_e32 v26, 0xffff, v13
-; GFX8-NEXT:    v_mov_b32_e32 v12, s42
-; GFX8-NEXT:    v_and_b32_e32 v24, 1, v14
-; GFX8-NEXT:    v_mov_b32_e32 v27, 0
-; GFX8-NEXT:    v_mov_b32_e32 v13, s43
+; GFX8-NEXT:    v_and_b32_e32 v22, 1, v14
+; GFX8-NEXT:    v_and_b32_e32 v24, 0xffff, v13
+; GFX8-NEXT:    v_mov_b32_e32 v13, s42
+; GFX8-NEXT:    v_mov_b32_e32 v14, s43
 ; GFX8-NEXT:    s_add_u32 s42, s0, 32
-; GFX8-NEXT:    flat_store_dwordx4 v[12:13], v[24:27]
+; GFX8-NEXT:    flat_store_dwordx4 v[13:14], v[22:25]
 ; GFX8-NEXT:    s_addc_u32 s43, s1, 0
-; GFX8-NEXT:    v_and_b32_e32 v24, 1, v11
-; GFX8-NEXT:    v_and_b32_e32 v26, 0xffff, v10
+; GFX8-NEXT:    v_and_b32_e32 v22, 1, v11
+; GFX8-NEXT:    v_and_b32_e32 v24, 0xffff, v10
 ; GFX8-NEXT:    v_mov_b32_e32 v10, s42
-; GFX8-NEXT:    v_mov_b32_e32 v27, 0
 ; GFX8-NEXT:    v_mov_b32_e32 v11, s43
-; GFX8-NEXT:    v_lshrrev_b16_e64 v12, 5, s31
-; GFX8-NEXT:    flat_store_dwordx4 v[10:11], v[24:27]
-; GFX8-NEXT:    v_lshrrev_b16_e64 v10, 3, s31
 ; GFX8-NEXT:    s_add_u32 s42, s0, 16
-; GFX8-NEXT:    v_and_b32_e32 v14, 1, v12
-; GFX8-NEXT:    v_and_b32_e32 v25, 1, v10
-; GFX8-NEXT:    v_and_b32_e32 v10, 1, v8
-; GFX8-NEXT:    v_and_b32_e32 v12, 0xffff, v7
+; GFX8-NEXT:    flat_store_dwordx4 v[10:11], v[22:25]
 ; GFX8-NEXT:    s_addc_u32 s43, s1, 0
-; GFX8-NEXT:    v_mov_b32_e32 v7, s42
-; GFX8-NEXT:    v_mov_b32_e32 v13, 0
-; GFX8-NEXT:    v_mov_b32_e32 v11, v1
-; GFX8-NEXT:    v_mov_b32_e32 v8, s43
-; GFX8-NEXT:    flat_store_dwordx4 v[7:8], v[10:13]
-; GFX8-NEXT:    v_lshrrev_b16_e64 v7, 1, s31
+; GFX8-NEXT:    v_and_b32_e32 v22, 1, v9
+; GFX8-NEXT:    v_and_b32_e32 v24, 0xffff, v8
+; GFX8-NEXT:    v_mov_b32_e32 v8, s42
+; GFX8-NEXT:    v_mov_b32_e32 v9, s43
 ; GFX8-NEXT:    s_add_u32 s42, s0, 0x160
-; GFX8-NEXT:    v_lshrrev_b16_e64 v23, 12, s3
-; GFX8-NEXT:    v_and_b32_e32 v27, 1, v7
-; GFX8-NEXT:    v_and_b32_e32 v12, 0xffff, v6
+; GFX8-NEXT:    v_lshrrev_b16_e64 v5, 12, s3
+; GFX8-NEXT:    v_lshrrev_b16_e64 v10, 3, s33
+; GFX8-NEXT:    flat_store_dwordx4 v[8:9], v[22:25]
+; GFX8-NEXT:    v_lshrrev_b16_e64 v8, 1, s33
 ; GFX8-NEXT:    s_addc_u32 s43, s1, 0
-; GFX8-NEXT:    v_mov_b32_e32 v6, s42
-; GFX8-NEXT:    v_and_b32_e32 v10, 1, v23
-; GFX8-NEXT:    v_mov_b32_e32 v13, 0
+; GFX8-NEXT:    v_mov_b32_e32 v22, s42
+; GFX8-NEXT:    v_and_b32_e32 v28, 1, v10
+; GFX8-NEXT:    v_and_b32_e32 v19, 1, v8
+; GFX8-NEXT:    v_and_b32_e32 v8, 1, v5
+; GFX8-NEXT:    v_and_b32_e32 v10, 0xffff, v7
+; GFX8-NEXT:    v_mov_b32_e32 v9, v1
 ; GFX8-NEXT:    v_mov_b32_e32 v11, v1
-; GFX8-NEXT:    v_mov_b32_e32 v7, s43
-; GFX8-NEXT:    flat_store_dwordx4 v[6:7], v[10:13]
-; GFX8-NEXT:    v_lshrrev_b16_e64 v6, 5, s24
+; GFX8-NEXT:    v_mov_b32_e32 v23, s43
+; GFX8-NEXT:    v_lshrrev_b16_e64 v5, 5, s24
 ; GFX8-NEXT:    s_add_u32 s42, s0, 0x150
 ; GFX8-NEXT:    v_lshrrev_b16_e64 v21, 10, s3
-; GFX8-NEXT:    v_and_b32_e32 v23, 1, v6
+; GFX8-NEXT:    flat_store_dwordx4 v[22:23], v[8:11]
+; GFX8-NEXT:    v_and_b32_e32 v22, 1, v5
+; GFX8-NEXT:    v_and_b32_e32 v9, 0xffff, v4
 ; GFX8-NEXT:    s_addc_u32 s43, s1, 0
-; GFX8-NEXT:    v_mov_b32_e32 v6, s42
-; GFX8-NEXT:    v_and_b32_e32 v10, 1, v21
-; GFX8-NEXT:    v_and_b32_e32 v12, 0xffff, v4
-; GFX8-NEXT:    v_mov_b32_e32 v13, 0
-; GFX8-NEXT:    v_mov_b32_e32 v7, s43
-; GFX8-NEXT:    flat_store_dwordx4 v[6:7], v[10:13]
-; GFX8-NEXT:    v_lshrrev_b16_e64 v6, 1, s24
+; GFX8-NEXT:    v_mov_b32_e32 v4, s42
+; GFX8-NEXT:    v_and_b32_e32 v7, 1, v21
+; GFX8-NEXT:    v_mov_b32_e32 v8, v1
+; GFX8-NEXT:    v_mov_b32_e32 v10, v1
+; GFX8-NEXT:    v_mov_b32_e32 v5, s43
+; GFX8-NEXT:    flat_store_dwordx4 v[4:5], v[7:10]
+; GFX8-NEXT:    v_lshrrev_b16_e64 v4, 3, s24
 ; GFX8-NEXT:    v_lshrrev_b16_e64 v2, 9, s3
-; GFX8-NEXT:    v_and_b32_e32 v6, 1, v6
+; GFX8-NEXT:    v_and_b32_e32 v10, 1, v4
+; GFX8-NEXT:    v_lshrrev_b16_e64 v4, 1, s24
 ; GFX8-NEXT:    s_add_u32 s42, s0, 0x140
-; GFX8-NEXT:    v_mov_b32_e32 v0, s3
+; GFX8-NEXT:    v_mov_b32_e32 v6, s3
 ; GFX8-NEXT:    v_and_b32_e32 v20, 1, v2
-; GFX8-NEXT:    v_and_b32_e32 v12, 0xffff, v6
+; GFX8-NEXT:    v_and_b32_e32 v4, 1, v4
 ; GFX8-NEXT:    s_addc_u32 s43, s1, 0
-; GFX8-NEXT:    v_mov_b32_e32 v6, s42
+; GFX8-NEXT:    v_mov_b32_e32 v8, s42
 ; GFX8-NEXT:    v_lshrrev_b16_e64 v2, 7, s3
-; GFX8-NEXT:    v_and_b32_sdwa v19, v0, v19 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
-; GFX8-NEXT:    v_and_b32_e32 v21, 0xffff, v20
-; GFX8-NEXT:    v_mov_b32_e32 v22, 0
-; GFX8-NEXT:    v_mov_b32_e32 v20, v1
-; GFX8-NEXT:    v_mov_b32_e32 v7, s43
+; GFX8-NEXT:    v_lshrrev_b16_e64 v18, 6, s3
+; GFX8-NEXT:    v_and_b32_e32 v11, 0xffff, v4
+; GFX8-NEXT:    v_and_b32_sdwa v4, v6, v17 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
+; GFX8-NEXT:    v_and_b32_e32 v6, 0xffff, v20
+; GFX8-NEXT:    v_mov_b32_e32 v5, v1
+; GFX8-NEXT:    v_mov_b32_e32 v7, v1
+; GFX8-NEXT:    v_mov_b32_e32 v9, s43
 ; GFX8-NEXT:    s_add_u32 s42, s0, 0x130
 ; GFX8-NEXT:    v_and_b32_e32 v2, 1, v2
-; GFX8-NEXT:    v_lshrrev_b16_e64 v18, 6, s3
 ; GFX8-NEXT:    v_lshrrev_b16_e64 v3, 5, s3
-; GFX8-NEXT:    flat_store_dwordx4 v[6:7], v[19:22]
+; GFX8-NEXT:    flat_store_dwordx4 v[8:9], v[4:7]
 ; GFX8-NEXT:    s_addc_u32 s43, s1, 0
-; GFX8-NEXT:    v_mov_b32_e32 v6, s42
+; GFX8-NEXT:    v_and_b32_e32 v7, 1, v18
+; GFX8-NEXT:    v_mov_b32_e32 v17, s42
 ; GFX8-NEXT:    v_and_b32_e32 v3, 1, v3
-; GFX8-NEXT:    v_and_b32_e32 v18, 1, v18
-; GFX8-NEXT:    v_and_b32_e32 v20, 0xffff, v2
-; GFX8-NEXT:    v_mov_b32_e32 v21, 0
-; GFX8-NEXT:    v_mov_b32_e32 v19, v1
-; GFX8-NEXT:    v_mov_b32_e32 v7, s43
+; GFX8-NEXT:    v_and_b32_e32 v6, 0xffff, v10
+; GFX8-NEXT:    v_and_b32_e32 v9, 0xffff, v2
+; GFX8-NEXT:    v_mov_b32_e32 v8, v1
+; GFX8-NEXT:    v_mov_b32_e32 v10, v1
+; GFX8-NEXT:    v_mov_b32_e32 v18, s43
 ; GFX8-NEXT:    s_add_u32 s42, s0, 0x120
-; GFX8-NEXT:    v_lshrrev_b16_e64 v17, 4, s3
-; GFX8-NEXT:    flat_store_dwordx4 v[6:7], v[18:21]
+; GFX8-NEXT:    v_lshrrev_b16_e64 v16, 4, s3
+; GFX8-NEXT:    flat_store_dwordx4 v[17:18], v[7:10]
+; GFX8-NEXT:    v_and_b32_e32 v18, 0xffff, v3
 ; GFX8-NEXT:    s_addc_u32 s43, s1, 0
-; GFX8-NEXT:    v_and_b32_e32 v19, 0xffff, v3
 ; GFX8-NEXT:    v_mov_b32_e32 v2, s42
-; GFX8-NEXT:    v_lshrrev_b16_e64 v5, 3, s3
-; GFX8-NEXT:    v_and_b32_e32 v17, 1, v17
-; GFX8-NEXT:    v_mov_b32_e32 v20, 0
-; GFX8-NEXT:    v_mov_b32_e32 v18, v1
+; GFX8-NEXT:    v_lshrrev_b16_e64 v12, 3, s3
+; GFX8-NEXT:    v_lshrrev_b16_e64 v13, 1, s3
+; GFX8-NEXT:    v_and_b32_e32 v5, 0xffff, v19
+; GFX8-NEXT:    v_and_b32_e32 v16, 1, v16
+; GFX8-NEXT:    v_mov_b32_e32 v17, v1
+; GFX8-NEXT:    v_mov_b32_e32 v19, v1
 ; GFX8-NEXT:    v_mov_b32_e32 v3, s43
 ; GFX8-NEXT:    s_add_u32 s42, s0, 0x110
-; GFX8-NEXT:    v_and_b32_e32 v15, 1, v5
-; GFX8-NEXT:    v_lshrrev_b16_e64 v16, 2, s3
-; GFX8-NEXT:    v_lshrrev_b16_e64 v4, 3, s24
-; GFX8-NEXT:    flat_store_dwordx4 v[2:3], v[17:20]
+; GFX8-NEXT:    v_and_b32_e32 v12, 1, v12
+; GFX8-NEXT:    v_lshrrev_b16_e64 v15, 2, s3
+; GFX8-NEXT:    v_and_b32_e32 v13, 1, v13
+; GFX8-NEXT:    flat_store_dwordx4 v[2:3], v[16:19]
 ; GFX8-NEXT:    s_addc_u32 s43, s1, 0
 ; GFX8-NEXT:    v_mov_b32_e32 v2, s42
-; GFX8-NEXT:    v_lshrrev_b16_e64 v24, 4, s31
-; GFX8-NEXT:    v_and_b32_e32 v10, 1, v4
-; GFX8-NEXT:    v_and_b32_e32 v16, 1, v16
-; GFX8-NEXT:    v_and_b32_e32 v18, 0xffff, v15
-; GFX8-NEXT:    v_mov_b32_e32 v19, 0
-; GFX8-NEXT:    v_mov_b32_e32 v17, v1
+; GFX8-NEXT:    v_and_b32_e32 v0, 1, v0
+; GFX8-NEXT:    v_and_b32_e32 v17, 1, v15
+; GFX8-NEXT:    v_and_b32_e32 v19, 0xffff, v12
+; GFX8-NEXT:    v_mov_b32_e32 v18, v1
+; GFX8-NEXT:    v_mov_b32_e32 v20, v1
 ; GFX8-NEXT:    v_mov_b32_e32 v3, s43
-; GFX8-NEXT:    v_and_b32_e32 v6, 0xffff, v10
-; GFX8-NEXT:    v_and_b32_e32 v10, 0xffff, v23
-; GFX8-NEXT:    flat_store_dwordx4 v[2:3], v[16:19]
+; GFX8-NEXT:    v_and_b32_e32 v8, 0xffff, v13
+; GFX8-NEXT:    v_mov_b32_e32 v13, s5
+; GFX8-NEXT:    flat_store_dwordx4 v[2:3], v[17:20]
+; GFX8-NEXT:    v_and_b32_e32 v10, 0xffff, v0
 ; GFX8-NEXT:    v_mov_b32_e32 v0, s41
-; GFX8-NEXT:    v_and_b32_e32 v19, 1, v24
-; GFX8-NEXT:    v_mov_b32_e32 v24, s5
 ; GFX8-NEXT:    v_mov_b32_e32 v2, s40
 ; GFX8-NEXT:    v_mov_b32_e32 v3, v1
-; GFX8-NEXT:    v_mov_b32_e32 v23, s4
-; GFX8-NEXT:    flat_store_dwordx4 v[23:24], v[0:3]
-; GFX8-NEXT:    v_mov_b32_e32 v24, s7
+; GFX8-NEXT:    v_mov_b32_e32 v12, s4
+; GFX8-NEXT:    flat_store_dwordx4 v[12:13], v[0:3]
+; GFX8-NEXT:    v_mov_b32_e32 v13, s7
 ; GFX8-NEXT:    v_mov_b32_e32 v0, s38
 ; GFX8-NEXT:    v_mov_b32_e32 v2, s39
-; GFX8-NEXT:    v_mov_b32_e32 v23, s6
-; GFX8-NEXT:    flat_store_dwordx4 v[23:24], v[0:3]
-; GFX8-NEXT:    v_mov_b32_e32 v24, s9
+; GFX8-NEXT:    v_mov_b32_e32 v12, s6
+; GFX8-NEXT:    flat_store_dwordx4 v[12:13], v[0:3]
+; GFX8-NEXT:    v_mov_b32_e32 v13, s9
 ; GFX8-NEXT:    v_mov_b32_e32 v0, s36
 ; GFX8-NEXT:    v_mov_b32_e32 v2, s37
-; GFX8-NEXT:    v_mov_b32_e32 v23, s8
-; GFX8-NEXT:    flat_store_dwordx4 v[23:24], v[0:3]
-; GFX8-NEXT:    v_mov_b32_e32 v24, s11
+; GFX8-NEXT:    v_mov_b32_e32 v12, s8
+; GFX8-NEXT:    flat_store_dwordx4 v[12:13], v[0:3]
+; GFX8-NEXT:    v_mov_b32_e32 v13, s11
 ; GFX8-NEXT:    v_mov_b32_e32 v0, s35
 ; GFX8-NEXT:    v_mov_b32_e32 v2, s34
-; GFX8-NEXT:    v_mov_b32_e32 v23, s10
-; GFX8-NEXT:    flat_store_dwordx4 v[23:24], v[0:3]
-; GFX8-NEXT:    v_mov_b32_e32 v24, s13
-; GFX8-NEXT:    v_mov_b32_e32 v0, s33
+; GFX8-NEXT:    v_mov_b32_e32 v12, s10
+; GFX8-NEXT:    flat_store_dwordx4 v[12:13], v[0:3]
+; GFX8-NEXT:    v_mov_b32_e32 v12, s12
+; GFX8-NEXT:    v_mov_b32_e32 v0, s31
 ; GFX8-NEXT:    v_mov_b32_e32 v2, s2
-; GFX8-NEXT:    v_mov_b32_e32 v23, s12
-; GFX8-NEXT:    flat_store_dwordx4 v[23:24], v[0:3]
-; GFX8-NEXT:    v_mov_b32_e32 v24, s15
+; GFX8-NEXT:    v_mov_b32_e32 v13, s13
+; GFX8-NEXT:    flat_store_dwordx4 v[12:13], v[0:3]
+; GFX8-NEXT:    v_mov_b32_e32 v12, s14
 ; GFX8-NEXT:    v_mov_b32_e32 v0, s29
 ; GFX8-NEXT:    v_mov_b32_e32 v2, s30
-; GFX8-NEXT:    v_mov_b32_e32 v23, s14
-; GFX8-NEXT:    flat_store_dwordx4 v[23:24], v[0:3]
-; GFX8-NEXT:    v_mov_b32_e32 v24, s17
+; GFX8-NEXT:    v_mov_b32_e32 v13, s15
+; GFX8-NEXT:    flat_store_dwordx4 v[12:13], v[0:3]
+; GFX8-NEXT:    v_mov_b32_e32 v12, s16
 ; GFX8-NEXT:    v_mov_b32_e32 v0, s27
 ; GFX8-NEXT:    v_mov_b32_e32 v2, s28
-; GFX8-NEXT:    v_mov_b32_e32 v23, s16
-; GFX8-NEXT:    flat_store_dwordx4 v[23:24], v[0:3]
-; GFX8-NEXT:    v_mov_b32_e32 v24, s19
+; GFX8-NEXT:    v_mov_b32_e32 v13, s17
+; GFX8-NEXT:    flat_store_dwordx4 v[12:13], v[0:3]
+; GFX8-NEXT:    v_mov_b32_e32 v12, s18
 ; GFX8-NEXT:    v_mov_b32_e32 v0, s26
 ; GFX8-NEXT:    v_mov_b32_e32 v2, s25
-; GFX8-NEXT:    v_mov_b32_e32 v23, s18
-; GFX8-NEXT:    v_lshrrev_b16_e64 v5, 1, s3
-; GFX8-NEXT:    flat_store_dwordx4 v[23:24], v[0:3]
-; GFX8-NEXT:    v_mov_b32_e32 v24, s1
+; GFX8-NEXT:    v_mov_b32_e32 v13, s19
+; GFX8-NEXT:    flat_store_dwordx4 v[12:13], v[0:3]
+; GFX8-NEXT:    v_mov_b32_e32 v13, s1
 ; GFX8-NEXT:    s_add_u32 s2, s0, 0x100
-; GFX8-NEXT:    v_and_b32_e32 v5, 1, v5
-; GFX8-NEXT:    v_and_b32_e32 v2, 0xffff, v9
-; GFX8-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX8-NEXT:    v_mov_b32_e32 v0, s23
-; GFX8-NEXT:    v_mov_b32_e32 v23, s0
+; GFX8-NEXT:    v_mov_b32_e32 v2, v10
+; GFX8-NEXT:    v_mov_b32_e32 v12, s0
 ; GFX8-NEXT:    s_addc_u32 s3, s1, 0
-; GFX8-NEXT:    v_and_b32_e32 v21, 0xffff, v14
-; GFX8-NEXT:    v_and_b32_e32 v5, 0xffff, v5
-; GFX8-NEXT:    v_mov_b32_e32 v14, 0
-; GFX8-NEXT:    flat_store_dwordx4 v[23:24], v[0:3]
-; GFX8-NEXT:    v_mov_b32_e32 v24, s3
+; GFX8-NEXT:    flat_store_dwordx4 v[12:13], v[0:3]
+; GFX8-NEXT:    v_mov_b32_e32 v13, s3
+; GFX8-NEXT:    v_lshrrev_b16_e64 v14, 5, s33
 ; GFX8-NEXT:    v_mov_b32_e32 v0, s22
-; GFX8-NEXT:    v_mov_b32_e32 v2, v5
-; GFX8-NEXT:    v_mov_b32_e32 v3, v14
-; GFX8-NEXT:    v_mov_b32_e32 v23, s2
+; GFX8-NEXT:    v_mov_b32_e32 v2, v8
+; GFX8-NEXT:    v_mov_b32_e32 v12, s2
 ; GFX8-NEXT:    s_add_u32 s2, s0, 0x1e0
-; GFX8-NEXT:    flat_store_dwordx4 v[23:24], v[0:3]
+; GFX8-NEXT:    v_and_b32_e32 v26, 1, v14
+; GFX8-NEXT:    v_lshrrev_b16_e64 v27, 4, s33
+; GFX8-NEXT:    flat_store_dwordx4 v[12:13], v[0:3]
 ; GFX8-NEXT:    s_addc_u32 s3, s1, 0
 ; GFX8-NEXT:    v_mov_b32_e32 v2, s2
-; GFX8-NEXT:    v_mov_b32_e32 v22, 0
-; GFX8-NEXT:    v_mov_b32_e32 v20, v1
+; GFX8-NEXT:    v_and_b32_e32 v17, 1, v27
+; GFX8-NEXT:    v_and_b32_e32 v19, 0xffff, v26
 ; GFX8-NEXT:    v_mov_b32_e32 v3, s3
 ; GFX8-NEXT:    s_add_u32 s2, s0, 0x1d0
-; GFX8-NEXT:    flat_store_dwordx4 v[2:3], v[19:22]
+; GFX8-NEXT:    flat_store_dwordx4 v[2:3], v[17:20]
 ; GFX8-NEXT:    s_addc_u32 s3, s1, 0
 ; GFX8-NEXT:    v_mov_b32_e32 v2, s2
-; GFX8-NEXT:    v_lshrrev_b16_e64 v26, 2, s31
 ; GFX8-NEXT:    v_mov_b32_e32 v3, s3
 ; GFX8-NEXT:    s_add_u32 s2, s0, 0x1c0
-; GFX8-NEXT:    v_and_b32_e32 v15, 1, v26
-; GFX8-NEXT:    v_and_b32_e32 v17, 0xffff, v25
-; GFX8-NEXT:    v_mov_b32_e32 v18, 0
-; GFX8-NEXT:    v_mov_b32_e32 v16, v1
+; GFX8-NEXT:    v_lshrrev_b16_e64 v14, 2, s33
 ; GFX8-NEXT:    s_addc_u32 s3, s1, 0
-; GFX8-NEXT:    v_and_b32_e32 v27, 0xffff, v27
-; GFX8-NEXT:    flat_store_dwordx4 v[2:3], v[15:18]
+; GFX8-NEXT:    v_and_b32_e32 v14, 1, v14
+; GFX8-NEXT:    v_and_b32_e32 v16, 0xffff, v28
+; GFX8-NEXT:    v_mov_b32_e32 v15, v1
+; GFX8-NEXT:    v_mov_b32_e32 v17, v1
+; GFX8-NEXT:    v_mov_b32_e32 v13, s3
+; GFX8-NEXT:    flat_store_dwordx4 v[2:3], v[14:17]
 ; GFX8-NEXT:    v_mov_b32_e32 v0, s21
-; GFX8-NEXT:    v_mov_b32_e32 v15, s3
-; GFX8-NEXT:    v_mov_b32_e32 v2, v27
-; GFX8-NEXT:    v_mov_b32_e32 v3, v28
-; GFX8-NEXT:    v_mov_b32_e32 v14, s2
+; GFX8-NEXT:    v_mov_b32_e32 v2, v5
+; GFX8-NEXT:    v_mov_b32_e32 v3, v1
+; GFX8-NEXT:    v_mov_b32_e32 v12, s2
 ; GFX8-NEXT:    s_add_u32 s2, s0, 0xe0
-; GFX8-NEXT:    flat_store_dwordx4 v[14:15], v[0:3]
+; GFX8-NEXT:    flat_store_dwordx4 v[12:13], v[0:3]
 ; GFX8-NEXT:    s_addc_u32 s3, s1, 0
 ; GFX8-NEXT:    v_mov_b32_e32 v2, s2
-; GFX8-NEXT:    v_lshrrev_b16_e64 v8, 4, s24
+; GFX8-NEXT:    v_lshrrev_b16_e64 v23, 4, s24
 ; GFX8-NEXT:    v_mov_b32_e32 v3, s3
 ; GFX8-NEXT:    s_add_u32 s2, s0, 0xd0
-; GFX8-NEXT:    v_and_b32_e32 v8, 1, v8
-; GFX8-NEXT:    v_mov_b32_e32 v11, 0
-; GFX8-NEXT:    v_mov_b32_e32 v9, v1
+; GFX8-NEXT:    v_and_b32_e32 v7, 1, v23
+; GFX8-NEXT:    v_and_b32_e32 v9, 0xffff, v22
+; GFX8-NEXT:    v_mov_b32_e32 v8, v1
+; GFX8-NEXT:    v_mov_b32_e32 v10, v1
 ; GFX8-NEXT:    s_addc_u32 s3, s1, 0
-; GFX8-NEXT:    v_lshrrev_b16_e64 v4, 2, s24
-; GFX8-NEXT:    flat_store_dwordx4 v[2:3], v[8:11]
+; GFX8-NEXT:    v_lshrrev_b16_e64 v21, 2, s24
+; GFX8-NEXT:    flat_store_dwordx4 v[2:3], v[7:10]
 ; GFX8-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX8-NEXT:    s_add_u32 s0, s0, 0xc0
-; GFX8-NEXT:    v_and_b32_e32 v4, 1, v4
-; GFX8-NEXT:    v_mov_b32_e32 v7, 0
+; GFX8-NEXT:    v_and_b32_e32 v4, 1, v21
 ; GFX8-NEXT:    v_mov_b32_e32 v5, v1
+; GFX8-NEXT:    v_mov_b32_e32 v7, v1
 ; GFX8-NEXT:    v_mov_b32_e32 v3, s3
 ; GFX8-NEXT:    s_addc_u32 s1, s1, 0
-; GFX8-NEXT:    v_mov_b32_e32 v13, 0
 ; GFX8-NEXT:    flat_store_dwordx4 v[2:3], v[4:7]
 ; GFX8-NEXT:    v_mov_b32_e32 v0, s20
 ; GFX8-NEXT:    v_mov_b32_e32 v5, s1
-; GFX8-NEXT:    v_mov_b32_e32 v2, v12
-; GFX8-NEXT:    v_mov_b32_e32 v3, v13
+; GFX8-NEXT:    v_mov_b32_e32 v2, v11
+; GFX8-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX8-NEXT:    v_mov_b32_e32 v4, s0
 ; GFX8-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
 ; GFX8-NEXT:    s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/load-global-i16.ll b/llvm/test/CodeGen/AMDGPU/load-global-i16.ll
index 2ca17b535cba009..7c80a220b72d7b0 100644
--- a/llvm/test/CodeGen/AMDGPU/load-global-i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-global-i16.ll
@@ -6031,73 +6031,73 @@ define amdgpu_kernel void @global_zextload_v8i16_to_v8i64(ptr addrspace(1) %out,
 ; GCN-NOHSA-SI-NEXT:    s_mov_b32 s8, s6
 ; GCN-NOHSA-SI-NEXT:    s_mov_b32 s9, s7
 ; GCN-NOHSA-SI-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v7, 0
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v9, 0
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v15, 0
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v11, v9
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v17, v9
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v19, v9
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v13, v9
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v5, v9
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v4, 0
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v6, v4
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v8, v4
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v10, v4
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v12, v4
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v14, v4
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v16, v4
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v18, v4
 ; GCN-NOHSA-SI-NEXT:    s_mov_b32 s0, s4
 ; GCN-NOHSA-SI-NEXT:    s_mov_b32 s1, s5
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v18, 16, v1
-; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v10, 16, v3
-; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v14, 16, v2
-; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v6, 16, v0
-; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v4, 0xffff, v0
-; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v12, 0xffff, v2
-; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v16, 0xffff, v1
-; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v8, 0xffff, v3
-; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:48
-; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:16
-; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:32
-; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[4:7], off, s[0:3], 0
+; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v9, 16, v1
+; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v5, 16, v3
+; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v13, 16, v2
+; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v17, 16, v0
+; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v15, 0xffff, v0
+; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v11, 0xffff, v2
+; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v7, 0xffff, v1
+; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v3, 0xffff, v3
+; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[3:6], off, s[0:3], 0 offset:48
+; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[7:10], off, s[0:3], 0 offset:16
+; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[11:14], off, s[0:3], 0 offset:32
+; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[15:18], off, s[0:3], 0
 ; GCN-NOHSA-SI-NEXT:    s_endpgm
 ;
 ; GCN-HSA-LABEL: global_zextload_v8i16_to_v8i64:
 ; GCN-HSA:       ; %bb.0:
 ; GCN-HSA-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GCN-HSA-NEXT:    v_mov_b32_e32 v13, 0
-; GCN-HSA-NEXT:    v_mov_b32_e32 v15, v13
-; GCN-HSA-NEXT:    v_mov_b32_e32 v7, 0
-; GCN-HSA-NEXT:    v_mov_b32_e32 v11, 0
+; GCN-HSA-NEXT:    v_mov_b32_e32 v4, 0
+; GCN-HSA-NEXT:    v_mov_b32_e32 v6, v4
+; GCN-HSA-NEXT:    v_mov_b32_e32 v8, v4
+; GCN-HSA-NEXT:    v_mov_b32_e32 v10, v4
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s2
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s3
 ; GCN-HSA-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
 ; GCN-HSA-NEXT:    s_add_u32 s2, s0, 48
 ; GCN-HSA-NEXT:    s_addc_u32 s3, s1, 0
-; GCN-HSA-NEXT:    v_mov_b32_e32 v23, s3
-; GCN-HSA-NEXT:    v_mov_b32_e32 v22, s2
+; GCN-HSA-NEXT:    v_mov_b32_e32 v22, s3
+; GCN-HSA-NEXT:    v_mov_b32_e32 v21, s2
 ; GCN-HSA-NEXT:    s_add_u32 s2, s0, 16
-; GCN-HSA-NEXT:    v_mov_b32_e32 v21, s1
+; GCN-HSA-NEXT:    v_mov_b32_e32 v20, s1
 ; GCN-HSA-NEXT:    s_addc_u32 s3, s1, 0
-; GCN-HSA-NEXT:    v_mov_b32_e32 v20, s0
+; GCN-HSA-NEXT:    v_mov_b32_e32 v19, s0
 ; GCN-HSA-NEXT:    s_add_u32 s0, s0, 32
 ; GCN-HSA-NEXT:    s_addc_u32 s1, s1, 0
-; GCN-HSA-NEXT:    v_mov_b32_e32 v25, s3
-; GCN-HSA-NEXT:    v_mov_b32_e32 v27, s1
-; GCN-HSA-NEXT:    v_mov_b32_e32 v17, v13
-; GCN-HSA-NEXT:    v_mov_b32_e32 v19, v13
-; GCN-HSA-NEXT:    v_mov_b32_e32 v9, v13
-; GCN-HSA-NEXT:    v_mov_b32_e32 v5, v13
-; GCN-HSA-NEXT:    v_mov_b32_e32 v24, s2
-; GCN-HSA-NEXT:    v_mov_b32_e32 v26, s0
+; GCN-HSA-NEXT:    v_mov_b32_e32 v24, s3
+; GCN-HSA-NEXT:    v_mov_b32_e32 v26, s1
+; GCN-HSA-NEXT:    v_mov_b32_e32 v12, v4
+; GCN-HSA-NEXT:    v_mov_b32_e32 v14, v4
+; GCN-HSA-NEXT:    v_mov_b32_e32 v16, v4
+; GCN-HSA-NEXT:    v_mov_b32_e32 v18, v4
+; GCN-HSA-NEXT:    v_mov_b32_e32 v23, s2
+; GCN-HSA-NEXT:    v_mov_b32_e32 v25, s0
 ; GCN-HSA-NEXT:    s_waitcnt vmcnt(0)
-; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v14, 16, v3
-; GCN-HSA-NEXT:    v_and_b32_e32 v12, 0xffff, v3
-; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v18, 16, v1
-; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v10, 16, v2
-; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v6, 16, v0
-; GCN-HSA-NEXT:    v_and_b32_e32 v4, 0xffff, v0
-; GCN-HSA-NEXT:    v_and_b32_e32 v8, 0xffff, v2
-; GCN-HSA-NEXT:    v_and_b32_e32 v16, 0xffff, v1
-; GCN-HSA-NEXT:    flat_store_dwordx4 v[22:23], v[12:15]
-; GCN-HSA-NEXT:    flat_store_dwordx4 v[24:25], v[16:19]
-; GCN-HSA-NEXT:    flat_store_dwordx4 v[26:27], v[8:11]
-; GCN-HSA-NEXT:    flat_store_dwordx4 v[20:21], v[4:7]
+; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v5, 16, v3
+; GCN-HSA-NEXT:    v_and_b32_e32 v3, 0xffff, v3
+; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v9, 16, v1
+; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v13, 16, v2
+; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v17, 16, v0
+; GCN-HSA-NEXT:    v_and_b32_e32 v15, 0xffff, v0
+; GCN-HSA-NEXT:    v_and_b32_e32 v11, 0xffff, v2
+; GCN-HSA-NEXT:    v_and_b32_e32 v7, 0xffff, v1
+; GCN-HSA-NEXT:    flat_store_dwordx4 v[21:22], v[3:6]
+; GCN-HSA-NEXT:    flat_store_dwordx4 v[23:24], v[7:10]
+; GCN-HSA-NEXT:    flat_store_dwordx4 v[25:26], v[11:14]
+; GCN-HSA-NEXT:    flat_store_dwordx4 v[19:20], v[15:18]
 ; GCN-HSA-NEXT:    s_endpgm
 ;
 ; GCN-NOHSA-VI-LABEL: global_zextload_v8i16_to_v8i64:
@@ -6479,66 +6479,65 @@ define amdgpu_kernel void @global_zextload_v16i16_to_v16i64(ptr addrspace(1) %ou
 ; GCN-NOHSA-SI-NEXT:    s_mov_b32 s9, s7
 ; GCN-NOHSA-SI-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0
 ; GCN-NOHSA-SI-NEXT:    buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v8, 0
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt vmcnt(1)
-; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v13, 16, v1
-; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v17, 16, v3
+; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v10, 16, v1
+; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v14, 16, v2
+; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v18, 16, v0
+; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v16, 0xffff, v0
+; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v12, 0xffff, v2
+; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v8, 0xffff, v1
+; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v2, 16, v3
+; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v0, 0xffff, v3
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v21, 0
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v9, 16, v5
-; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v21, 16, v2
-; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v25, 16, v0
-; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v23, 0xffff, v0
-; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v19, 0xffff, v2
-; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v11, 0xffff, v1
-; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v15, 0xffff, v3
-; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v2, 16, v6
-; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v0, 0xffff, v6
+; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v22, 16, v5
+; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v26, 16, v6
+; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v24, 0xffff, v6
 ; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v6, 16, v4
 ; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v4, 0xffff, v4
-; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v29, 16, v7
-; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v27, 0xffff, v7
-; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v7, 0xffff, v5
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v10, v8
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v28, v8
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v30, v8
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v16, v8
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v18, v8
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v12, v8
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v14, v8
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v5, v8
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v1, v8
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v20, v8
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v24, v8
+; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v30, 16, v7
+; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v28, 0xffff, v7
+; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v20, 0xffff, v5
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v23, v21
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v29, v21
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v31, v21
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v1, v21
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v3, v21
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v9, v21
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v11, v21
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v5, v21
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v7, v21
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v25, v21
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v27, v21
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v13, v21
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v15, v21
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v17, v21
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v19, v21
 ; GCN-NOHSA-SI-NEXT:    s_mov_b32 s0, s4
 ; GCN-NOHSA-SI-NEXT:    s_mov_b32 s1, s5
-; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[7:10], off, s[0:3], 0 offset:80
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v26, 0
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v22, 0
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v3, 0
-; GCN-NOHSA-SI-NEXT:    s_waitcnt expcnt(0)
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v7, 0
-; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[27:30], off, s[0:3], 0 offset:112
-; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[15:18], off, s[0:3], 0 offset:48
-; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[11:14], off, s[0:3], 0 offset:16
+; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[20:23], off, s[0:3], 0 offset:80
+; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[28:31], off, s[0:3], 0 offset:112
+; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:48
+; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:16
 ; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:64
-; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:96
-; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[19:22], off, s[0:3], 0 offset:32
-; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[23:26], off, s[0:3], 0
+; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[24:27], off, s[0:3], 0 offset:96
+; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:32
+; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[16:19], off, s[0:3], 0
 ; GCN-NOHSA-SI-NEXT:    s_endpgm
 ;
 ; GCN-HSA-LABEL: global_zextload_v16i16_to_v16i64:
 ; GCN-HSA:       ; %bb.0:
 ; GCN-HSA-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GCN-HSA-NEXT:    v_mov_b32_e32 v15, 0
-; GCN-HSA-NEXT:    v_mov_b32_e32 v17, v15
-; GCN-HSA-NEXT:    v_mov_b32_e32 v19, v15
-; GCN-HSA-NEXT:    v_mov_b32_e32 v9, 0
+; GCN-HSA-NEXT:    v_mov_b32_e32 v8, 0
+; GCN-HSA-NEXT:    v_mov_b32_e32 v12, v8
+; GCN-HSA-NEXT:    v_mov_b32_e32 v14, v8
+; GCN-HSA-NEXT:    v_mov_b32_e32 v15, v8
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s2
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s3
 ; GCN-HSA-NEXT:    s_add_u32 s2, s2, 16
-; GCN-HSA-NEXT:    s_addc_u32 s3, s3, 0
 ; GCN-HSA-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
+; GCN-HSA-NEXT:    s_addc_u32 s3, s3, 0
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v5, s3
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v4, s2
 ; GCN-HSA-NEXT:    flat_load_dwordx4 v[4:7], v[4:5]
@@ -6546,65 +6545,70 @@ define amdgpu_kernel void @global_zextload_v16i16_to_v16i64(ptr addrspace(1) %ou
 ; GCN-HSA-NEXT:    s_addc_u32 s3, s1, 0
 ; GCN-HSA-NEXT:    s_add_u32 s4, s0, 16
 ; GCN-HSA-NEXT:    s_addc_u32 s5, s1, 0
-; GCN-HSA-NEXT:    v_mov_b32_e32 v11, s5
-; GCN-HSA-NEXT:    v_mov_b32_e32 v10, s4
+; GCN-HSA-NEXT:    v_mov_b32_e32 v22, s5
+; GCN-HSA-NEXT:    v_mov_b32_e32 v21, s4
 ; GCN-HSA-NEXT:    s_add_u32 s4, s0, 0x50
 ; GCN-HSA-NEXT:    s_addc_u32 s5, s1, 0
-; GCN-HSA-NEXT:    v_mov_b32_e32 v13, 0
-; GCN-HSA-NEXT:    s_waitcnt vmcnt(1)
-; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v18, 16, v1
-; GCN-HSA-NEXT:    v_and_b32_e32 v16, 0xffff, v1
-; GCN-HSA-NEXT:    flat_store_dwordx4 v[10:11], v[16:19]
-; GCN-HSA-NEXT:    v_mov_b32_e32 v11, s5
-; GCN-HSA-NEXT:    v_mov_b32_e32 v10, s4
+; GCN-HSA-NEXT:    v_mov_b32_e32 v17, v8
+; GCN-HSA-NEXT:    v_mov_b32_e32 v18, v8
+; GCN-HSA-NEXT:    v_mov_b32_e32 v20, v8
+; GCN-HSA-NEXT:    v_mov_b32_e32 v10, v8
+; GCN-HSA-NEXT:    v_mov_b32_e32 v23, v8
 ; GCN-HSA-NEXT:    s_waitcnt vmcnt(1)
-; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v18, 16, v5
-; GCN-HSA-NEXT:    v_and_b32_e32 v16, 0xffff, v5
+; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v13, 16, v1
+; GCN-HSA-NEXT:    v_and_b32_e32 v11, 0xffff, v1
+; GCN-HSA-NEXT:    flat_store_dwordx4 v[21:22], v[11:14]
+; GCN-HSA-NEXT:    v_mov_b32_e32 v22, s5
+; GCN-HSA-NEXT:    v_mov_b32_e32 v21, s4
 ; GCN-HSA-NEXT:    s_add_u32 s4, s0, 0x70
-; GCN-HSA-NEXT:    flat_store_dwordx4 v[10:11], v[16:19]
+; GCN-HSA-NEXT:    s_waitcnt vmcnt(1)
+; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v16, 16, v5
+; GCN-HSA-NEXT:    v_and_b32_e32 v14, 0xffff, v5
 ; GCN-HSA-NEXT:    s_addc_u32 s5, s1, 0
-; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v18, 16, v7
-; GCN-HSA-NEXT:    v_and_b32_e32 v16, 0xffff, v7
-; GCN-HSA-NEXT:    v_mov_b32_e32 v8, s3
-; GCN-HSA-NEXT:    v_mov_b32_e32 v7, s2
+; GCN-HSA-NEXT:    flat_store_dwordx4 v[21:22], v[14:17]
+; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v19, 16, v7
+; GCN-HSA-NEXT:    v_mov_b32_e32 v15, s5
+; GCN-HSA-NEXT:    v_mov_b32_e32 v14, s4
+; GCN-HSA-NEXT:    v_and_b32_e32 v17, 0xffff, v7
+; GCN-HSA-NEXT:    flat_store_dwordx4 v[14:15], v[17:20]
+; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v9, 16, v3
+; GCN-HSA-NEXT:    v_mov_b32_e32 v19, s3
+; GCN-HSA-NEXT:    v_mov_b32_e32 v18, s2
 ; GCN-HSA-NEXT:    s_add_u32 s2, s0, 32
 ; GCN-HSA-NEXT:    s_addc_u32 s3, s1, 0
-; GCN-HSA-NEXT:    v_mov_b32_e32 v11, s5
-; GCN-HSA-NEXT:    v_mov_b32_e32 v22, s3
-; GCN-HSA-NEXT:    v_mov_b32_e32 v10, s4
-; GCN-HSA-NEXT:    v_mov_b32_e32 v21, s2
+; GCN-HSA-NEXT:    v_mov_b32_e32 v25, s3
+; GCN-HSA-NEXT:    v_and_b32_e32 v7, 0xffff, v3
+; GCN-HSA-NEXT:    v_mov_b32_e32 v24, s2
 ; GCN-HSA-NEXT:    s_add_u32 s2, s0, 64
-; GCN-HSA-NEXT:    flat_store_dwordx4 v[10:11], v[16:19]
-; GCN-HSA-NEXT:    v_mov_b32_e32 v20, s1
+; GCN-HSA-NEXT:    flat_store_dwordx4 v[18:19], v[7:10]
+; GCN-HSA-NEXT:    v_mov_b32_e32 v19, s1
 ; GCN-HSA-NEXT:    s_addc_u32 s3, s1, 0
-; GCN-HSA-NEXT:    v_mov_b32_e32 v19, s0
+; GCN-HSA-NEXT:    v_mov_b32_e32 v18, s0
 ; GCN-HSA-NEXT:    s_add_u32 s0, s0, 0x60
 ; GCN-HSA-NEXT:    s_addc_u32 s1, s1, 0
-; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v16, 16, v3
-; GCN-HSA-NEXT:    v_and_b32_e32 v14, 0xffff, v3
-; GCN-HSA-NEXT:    v_mov_b32_e32 v5, 0
-; GCN-HSA-NEXT:    v_mov_b32_e32 v3, v15
-; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v12, 16, v4
-; GCN-HSA-NEXT:    v_and_b32_e32 v10, 0xffff, v4
-; GCN-HSA-NEXT:    v_mov_b32_e32 v24, s3
-; GCN-HSA-NEXT:    v_mov_b32_e32 v26, s1
-; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v4, 16, v2
-; GCN-HSA-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; GCN-HSA-NEXT:    v_mov_b32_e32 v18, 0
-; GCN-HSA-NEXT:    flat_store_dwordx4 v[7:8], v[14:17]
-; GCN-HSA-NEXT:    v_mov_b32_e32 v11, v15
-; GCN-HSA-NEXT:    v_mov_b32_e32 v16, v15
-; GCN-HSA-NEXT:    v_mov_b32_e32 v7, v15
+; GCN-HSA-NEXT:    v_mov_b32_e32 v11, v8
+; GCN-HSA-NEXT:    v_mov_b32_e32 v13, v8
+; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v16, 16, v4
+; GCN-HSA-NEXT:    v_and_b32_e32 v14, 0xffff, v4
+; GCN-HSA-NEXT:    v_mov_b32_e32 v4, s3
+; GCN-HSA-NEXT:    v_mov_b32_e32 v27, s1
+; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v12, 16, v2
+; GCN-HSA-NEXT:    v_and_b32_e32 v10, 0xffff, v2
+; GCN-HSA-NEXT:    v_mov_b32_e32 v21, v8
+; GCN-HSA-NEXT:    v_mov_b32_e32 v15, v8
+; GCN-HSA-NEXT:    v_mov_b32_e32 v17, v8
+; GCN-HSA-NEXT:    v_mov_b32_e32 v7, v8
+; GCN-HSA-NEXT:    v_mov_b32_e32 v9, v8
 ; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v8, 16, v6
 ; GCN-HSA-NEXT:    v_and_b32_e32 v6, 0xffff, v6
-; GCN-HSA-NEXT:    v_mov_b32_e32 v23, s2
-; GCN-HSA-NEXT:    v_mov_b32_e32 v25, s0
-; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v17, 16, v0
-; GCN-HSA-NEXT:    v_and_b32_e32 v15, 0xffff, v0
-; GCN-HSA-NEXT:    flat_store_dwordx4 v[21:22], v[2:5]
-; GCN-HSA-NEXT:    flat_store_dwordx4 v[19:20], v[15:18]
-; GCN-HSA-NEXT:    flat_store_dwordx4 v[23:24], v[10:13]
-; GCN-HSA-NEXT:    flat_store_dwordx4 v[25:26], v[6:9]
+; GCN-HSA-NEXT:    v_mov_b32_e32 v3, s2
+; GCN-HSA-NEXT:    v_mov_b32_e32 v26, s0
+; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v22, 16, v0
+; GCN-HSA-NEXT:    v_and_b32_e32 v20, 0xffff, v0
+; GCN-HSA-NEXT:    flat_store_dwordx4 v[24:25], v[10:13]
+; GCN-HSA-NEXT:    flat_store_dwordx4 v[18:19], v[20:23]
+; GCN-HSA-NEXT:    flat_store_dwordx4 v[3:4], v[14:17]
+; GCN-HSA-NEXT:    flat_store_dwordx4 v[26:27], v[6:9]
 ; GCN-HSA-NEXT:    s_endpgm
 ;
 ; GCN-NOHSA-VI-LABEL: global_zextload_v16i16_to_v16i64:
@@ -6619,32 +6623,25 @@ define amdgpu_kernel void @global_zextload_v16i16_to_v16i64(ptr addrspace(1) %ou
 ; GCN-NOHSA-VI-NEXT:    s_mov_b32 s9, s7
 ; GCN-NOHSA-VI-NEXT:    buffer_load_dwordx4 v[0:3], off, s[8:11], 0
 ; GCN-NOHSA-VI-NEXT:    buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:16
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v30, 0
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v28, 0
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v29, 0
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v31, v29
 ; GCN-NOHSA-VI-NEXT:    s_mov_b32 s0, s4
 ; GCN-NOHSA-VI-NEXT:    s_mov_b32 s1, s5
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v11, 0
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v9, 0
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v15, 0
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v13, 0
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v19, 0
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v17, 0
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v23, 0
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v21, 0
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v25, 0
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v25, v29
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v27, v29
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v21, v29
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v23, v29
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v17, v29
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v19, v29
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v13, v29
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v15, v29
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v9, v29
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v11, v29
 ; GCN-NOHSA-VI-NEXT:    s_waitcnt vmcnt(1)
 ; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v10, 16, v0
 ; GCN-NOHSA-VI-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v22, 16, v6
-; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v20, 0xffff, v6
-; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v26, 16, v7
-; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v24, 0xffff, v7
-; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v6, 16, v4
-; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v4, 0xffff, v4
-; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v29, 16, v5
-; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v27, 0xffff, v5
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v7, 0
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v5, 0
+; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v30, 16, v5
+; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v28, 0xffff, v5
 ; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v8, 0xffff, v0
 ; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v14, 16, v1
 ; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v12, 0xffff, v1
@@ -6652,10 +6649,17 @@ define amdgpu_kernel void @global_zextload_v16i16_to_v16i64(ptr addrspace(1) %ou
 ; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v16, 0xffff, v2
 ; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v2, 16, v3
 ; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v0, 0xffff, v3
-; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[27:30], off, s[0:3], 0 offset:80
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v3, 0
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v1, 0
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v27, 0
+; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v22, 16, v6
+; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v20, 0xffff, v6
+; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v26, 16, v7
+; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v24, 0xffff, v7
+; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v6, 16, v4
+; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v4, 0xffff, v4
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v5, v29
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v7, v29
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v1, v29
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v3, v29
+; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[28:31], off, s[0:3], 0 offset:80
 ; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:64
 ; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[24:27], off, s[0:3], 0 offset:112
 ; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[20:23], off, s[0:3], 0 offset:96
@@ -7248,140 +7252,148 @@ define amdgpu_kernel void @global_zextload_v32i16_to_v32i64(ptr addrspace(1) %ou
 ; GCN-NOHSA-SI-NEXT:    s_load_dwordx4 s[4:7], s[0:1], 0x9
 ; GCN-NOHSA-SI-NEXT:    s_mov_b32 s3, 0xf000
 ; GCN-NOHSA-SI-NEXT:    s_mov_b32 s2, -1
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v24, 0
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v39, 0
 ; GCN-NOHSA-SI-NEXT:    s_mov_b32 s10, s2
 ; GCN-NOHSA-SI-NEXT:    s_mov_b32 s11, s3
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NOHSA-SI-NEXT:    s_mov_b32 s8, s6
 ; GCN-NOHSA-SI-NEXT:    s_mov_b32 s9, s7
-; GCN-NOHSA-SI-NEXT:    buffer_load_dwordx4 v[19:22], off, s[8:11], 0
-; GCN-NOHSA-SI-NEXT:    buffer_load_dwordx4 v[25:28], off, s[8:11], 0 offset:16
-; GCN-NOHSA-SI-NEXT:    buffer_load_dwordx4 v[29:32], off, s[8:11], 0 offset:32
-; GCN-NOHSA-SI-NEXT:    buffer_load_dwordx4 v[33:36], off, s[8:11], 0 offset:48
+; GCN-NOHSA-SI-NEXT:    buffer_load_dwordx4 v[14:17], off, s[8:11], 0
+; GCN-NOHSA-SI-NEXT:    buffer_load_dwordx4 v[18:21], off, s[8:11], 0 offset:16
+; GCN-NOHSA-SI-NEXT:    buffer_load_dwordx4 v[22:25], off, s[8:11], 0 offset:32
+; GCN-NOHSA-SI-NEXT:    buffer_load_dwordx4 v[26:29], off, s[8:11], 0 offset:48
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt vmcnt(3)
-; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v14, 16, v20
-; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v18, 16, v22
+; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v32, 16, v15
+; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v36, 16, v17
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt vmcnt(2)
-; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v10, 16, v27
-; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v6, 16, v21
-; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v2, 16, v19
-; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v60, 0xffff, v19
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v62, v2
-; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v4, 0xffff, v21
-; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v12, 0xffff, v20
-; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v16, 0xffff, v22
-; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v22, 16, v25
-; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v20, 0xffff, v25
-; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v8, 0xffff, v27
-; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v39, 16, v26
-; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v37, 0xffff, v26
-; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v43, 16, v28
-; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v41, 0xffff, v28
-; GCN-NOHSA-SI-NEXT:    s_waitcnt vmcnt(1)
-; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v47, 16, v29
-; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v45, 0xffff, v29
-; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v29, 16, v31
-; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v27, 0xffff, v31
-; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v51, 16, v30
-; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v49, 0xffff, v30
-; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v55, 16, v32
-; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v53, 0xffff, v32
+; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v5, 16, v20
+; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v3, 16, v16
+; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v2, 16, v14
+; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v0, 0xffff, v14
+; GCN-NOHSA-SI-NEXT:    buffer_store_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill
+; GCN-NOHSA-SI-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NOHSA-SI-NEXT:    buffer_store_dword v1, off, s[12:15], 0 offset:8 ; 4-byte Folded Spill
+; GCN-NOHSA-SI-NEXT:    buffer_store_dword v2, off, s[12:15], 0 offset:12 ; 4-byte Folded Spill
+; GCN-NOHSA-SI-NEXT:    buffer_store_dword v3, off, s[12:15], 0 offset:16 ; 4-byte Folded Spill
+; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v12, 0xffff, v16
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v14, v3
+; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v30, 0xffff, v15
+; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v34, 0xffff, v17
+; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v10, 16, v18
+; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v8, 0xffff, v18
+; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v4, 0xffff, v20
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v6, v5
+; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v18, 16, v19
+; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v16, 0xffff, v19
+; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v44, 16, v21
+; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v42, 0xffff, v21
+; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v48, 16, v22
+; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v46, 0xffff, v22
+; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v22, 16, v24
+; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v20, 0xffff, v24
+; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v52, 16, v23
+; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v50, 0xffff, v23
+; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v56, 16, v25
+; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v54, 0xffff, v25
+; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v40, 16, v29
+; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v60, 16, v26
+; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v58, 0xffff, v26
+; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v26, 16, v28
+; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v24, 0xffff, v28
+; GCN-NOHSA-SI-NEXT:    s_waitcnt expcnt(1)
+; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v2, 16, v27
+; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v0, 0xffff, v27
+; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v38, 0xffff, v29
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v41, v39
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v1, v39
+; GCN-NOHSA-SI-NEXT:    s_waitcnt expcnt(0)
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v3, v39
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v55, v39
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v57, v39
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v51, v39
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v53, v39
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v43, v39
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v45, v39
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v17, v39
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v19, v39
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v35, v39
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v37, v39
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v31, v39
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v33, v39
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v25, v39
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v27, v39
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v59, v39
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v61, v39
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v21, v39
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v23, v39
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v47, v39
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v49, v39
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v5, v39
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v7, v39
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v9, v39
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v11, v39
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v13, v39
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v15, v39
+; GCN-NOHSA-SI-NEXT:    buffer_store_dword v12, off, s[12:15], 0 offset:20 ; 4-byte Folded Spill
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v25, 16, v36
-; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v59, 16, v33
-; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v57, 0xffff, v33
-; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v33, 16, v35
-; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v31, 0xffff, v35
-; GCN-NOHSA-SI-NEXT:    v_lshrrev_b32_e32 v2, 16, v34
-; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v0, 0xffff, v34
-; GCN-NOHSA-SI-NEXT:    v_and_b32_e32 v23, 0xffff, v36
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v26, v24
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v1, v24
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v3, v24
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v54, v24
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v56, v24
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v50, v24
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v52, v24
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v42, v24
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v44, v24
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v38, v24
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v40, v24
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v17, v24
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v19, v24
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v13, v24
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v15, v24
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v32, v24
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v58, v24
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v28, v24
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v46, v24
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v9, v24
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v21, v24
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v5, v24
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v61, v24
+; GCN-NOHSA-SI-NEXT:    buffer_store_dword v13, off, s[12:15], 0 offset:24 ; 4-byte Folded Spill
+; GCN-NOHSA-SI-NEXT:    buffer_store_dword v14, off, s[12:15], 0 offset:28 ; 4-byte Folded Spill
+; GCN-NOHSA-SI-NEXT:    buffer_store_dword v15, off, s[12:15], 0 offset:32 ; 4-byte Folded Spill
+; GCN-NOHSA-SI-NEXT:    s_waitcnt expcnt(0)
+; GCN-NOHSA-SI-NEXT:    buffer_load_dword v12, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
+; GCN-NOHSA-SI-NEXT:    buffer_load_dword v13, off, s[12:15], 0 offset:8 ; 4-byte Folded Reload
+; GCN-NOHSA-SI-NEXT:    buffer_load_dword v14, off, s[12:15], 0 offset:12 ; 4-byte Folded Reload
+; GCN-NOHSA-SI-NEXT:    buffer_load_dword v15, off, s[12:15], 0 offset:16 ; 4-byte Folded Reload
+; GCN-NOHSA-SI-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v13, v39
+; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v15, v39
 ; GCN-NOHSA-SI-NEXT:    s_mov_b32 s0, s4
 ; GCN-NOHSA-SI-NEXT:    s_mov_b32 s1, s5
-; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[23:26], off, s[0:3], 0 offset:240
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v34, 0
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v63, 0
-; GCN-NOHSA-SI-NEXT:    buffer_store_dword v60, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill
-; GCN-NOHSA-SI-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NOHSA-SI-NEXT:    buffer_store_dword v61, off, s[12:15], 0 offset:8 ; 4-byte Folded Spill
-; GCN-NOHSA-SI-NEXT:    buffer_store_dword v62, off, s[12:15], 0 offset:12 ; 4-byte Folded Spill
-; GCN-NOHSA-SI-NEXT:    buffer_store_dword v63, off, s[12:15], 0 offset:16 ; 4-byte Folded Spill
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v7, 0
-; GCN-NOHSA-SI-NEXT:    s_waitcnt expcnt(4)
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v23, 0
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v11, 0
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v48, 0
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v30, 0
-; GCN-NOHSA-SI-NEXT:    s_waitcnt expcnt(3)
-; GCN-NOHSA-SI-NEXT:    v_mov_b32_e32 v60, 0
+; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[38:41], off, s[0:3], 0 offset:240
 ; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:208
-; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[53:56], off, s[0:3], 0 offset:176
-; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[49:52], off, s[0:3], 0 offset:144
-; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[41:44], off, s[0:3], 0 offset:112
-; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[37:40], off, s[0:3], 0 offset:80
-; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:48
-; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:16
-; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[31:34], off, s[0:3], 0 offset:224
-; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[57:60], off, s[0:3], 0 offset:192
-; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[27:30], off, s[0:3], 0 offset:160
-; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[45:48], off, s[0:3], 0 offset:128
-; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:96
-; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[20:23], off, s[0:3], 0 offset:64
-; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:32
-; GCN-NOHSA-SI-NEXT:    buffer_load_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload
-; GCN-NOHSA-SI-NEXT:    buffer_load_dword v1, off, s[12:15], 0 offset:8 ; 4-byte Folded Reload
-; GCN-NOHSA-SI-NEXT:    buffer_load_dword v2, off, s[12:15], 0 offset:12 ; 4-byte Folded Reload
-; GCN-NOHSA-SI-NEXT:    buffer_load_dword v3, off, s[12:15], 0 offset:16 ; 4-byte Folded Reload
+; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[54:57], off, s[0:3], 0 offset:176
+; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[50:53], off, s[0:3], 0 offset:144
+; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[42:45], off, s[0:3], 0 offset:112
+; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:80
+; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[34:37], off, s[0:3], 0 offset:48
+; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[30:33], off, s[0:3], 0 offset:16
+; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[24:27], off, s[0:3], 0 offset:224
+; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[58:61], off, s[0:3], 0 offset:192
+; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[20:23], off, s[0:3], 0 offset:160
+; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[46:49], off, s[0:3], 0 offset:128
+; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:96
+; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:64
+; GCN-NOHSA-SI-NEXT:    buffer_load_dword v0, off, s[12:15], 0 offset:20 ; 4-byte Folded Reload
+; GCN-NOHSA-SI-NEXT:    buffer_load_dword v1, off, s[12:15], 0 offset:24 ; 4-byte Folded Reload
+; GCN-NOHSA-SI-NEXT:    buffer_load_dword v2, off, s[12:15], 0 offset:28 ; 4-byte Folded Reload
+; GCN-NOHSA-SI-NEXT:    buffer_load_dword v3, off, s[12:15], 0 offset:32 ; 4-byte Folded Reload
 ; GCN-NOHSA-SI-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
+; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:32
+; GCN-NOHSA-SI-NEXT:    buffer_store_dwordx4 v[12:15], off, s[0:3], 0
 ; GCN-NOHSA-SI-NEXT:    s_endpgm
 ;
 ; GCN-HSA-LABEL: global_zextload_v32i16_to_v32i64:
 ; GCN-HSA:       ; %bb.0:
 ; GCN-HSA-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
-; GCN-HSA-NEXT:    v_mov_b32_e32 v29, 0
-; GCN-HSA-NEXT:    v_mov_b32_e32 v25, 0
 ; GCN-HSA-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-HSA-NEXT:    s_add_u32 s4, s2, 16
 ; GCN-HSA-NEXT:    s_addc_u32 s5, s3, 0
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s4
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s5
-; GCN-HSA-NEXT:    flat_load_dwordx4 v[0:3], v[0:1]
+; GCN-HSA-NEXT:    flat_load_dwordx4 v[2:5], v[0:1]
 ; GCN-HSA-NEXT:    s_add_u32 s4, s2, 32
 ; GCN-HSA-NEXT:    s_addc_u32 s5, s3, 0
-; GCN-HSA-NEXT:    v_mov_b32_e32 v4, s4
-; GCN-HSA-NEXT:    v_mov_b32_e32 v5, s5
-; GCN-HSA-NEXT:    flat_load_dwordx4 v[5:8], v[4:5]
-; GCN-HSA-NEXT:    v_mov_b32_e32 v10, s3
-; GCN-HSA-NEXT:    v_mov_b32_e32 v9, s2
+; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s4
+; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s5
+; GCN-HSA-NEXT:    flat_load_dwordx4 v[6:9], v[0:1]
+; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s2
+; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s3
 ; GCN-HSA-NEXT:    s_add_u32 s2, s2, 48
+; GCN-HSA-NEXT:    flat_load_dwordx4 v[10:13], v[0:1]
 ; GCN-HSA-NEXT:    s_addc_u32 s3, s3, 0
-; GCN-HSA-NEXT:    flat_load_dwordx4 v[9:12], v[9:10]
-; GCN-HSA-NEXT:    v_mov_b32_e32 v14, s3
-; GCN-HSA-NEXT:    v_mov_b32_e32 v13, s2
-; GCN-HSA-NEXT:    flat_load_dwordx4 v[13:16], v[13:14]
+; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s2
+; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s3
+; GCN-HSA-NEXT:    flat_load_dwordx4 v[14:17], v[0:1]
 ; GCN-HSA-NEXT:    s_add_u32 s2, s0, 48
 ; GCN-HSA-NEXT:    s_addc_u32 s3, s1, 0
 ; GCN-HSA-NEXT:    s_add_u32 s4, s0, 16
@@ -7396,123 +7408,123 @@ define amdgpu_kernel void @global_zextload_v32i16_to_v32i64(ptr addrspace(1) %ou
 ; GCN-HSA-NEXT:    s_addc_u32 s13, s1, 0
 ; GCN-HSA-NEXT:    s_add_u32 s14, s0, 0x70
 ; GCN-HSA-NEXT:    s_addc_u32 s15, s1, 0
-; GCN-HSA-NEXT:    v_mov_b32_e32 v22, s15
-; GCN-HSA-NEXT:    v_mov_b32_e32 v4, 0
-; GCN-HSA-NEXT:    v_mov_b32_e32 v21, s14
+; GCN-HSA-NEXT:    v_mov_b32_e32 v23, s15
+; GCN-HSA-NEXT:    v_mov_b32_e32 v1, 0
+; GCN-HSA-NEXT:    v_mov_b32_e32 v22, s14
 ; GCN-HSA-NEXT:    s_add_u32 s14, s0, 0x50
-; GCN-HSA-NEXT:    v_mov_b32_e32 v18, v4
-; GCN-HSA-NEXT:    v_mov_b32_e32 v20, v4
+; GCN-HSA-NEXT:    v_mov_b32_e32 v19, v1
+; GCN-HSA-NEXT:    v_mov_b32_e32 v21, v1
 ; GCN-HSA-NEXT:    s_addc_u32 s15, s1, 0
-; GCN-HSA-NEXT:    v_mov_b32_e32 v27, v4
-; GCN-HSA-NEXT:    v_mov_b32_e32 v23, v4
+; GCN-HSA-NEXT:    v_mov_b32_e32 v24, v1
 ; GCN-HSA-NEXT:    s_waitcnt vmcnt(3)
-; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v19, 16, v3
-; GCN-HSA-NEXT:    v_and_b32_e32 v17, 0xffff, v3
-; GCN-HSA-NEXT:    flat_store_dwordx4 v[21:22], v[17:20]
-; GCN-HSA-NEXT:    v_mov_b32_e32 v22, s15
-; GCN-HSA-NEXT:    v_mov_b32_e32 v21, s14
-; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v19, 16, v1
-; GCN-HSA-NEXT:    v_and_b32_e32 v17, 0xffff, v1
-; GCN-HSA-NEXT:    flat_store_dwordx4 v[21:22], v[17:20]
-; GCN-HSA-NEXT:    v_mov_b32_e32 v22, s11
-; GCN-HSA-NEXT:    v_mov_b32_e32 v21, s10
+; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v20, 16, v5
+; GCN-HSA-NEXT:    v_and_b32_e32 v18, 0xffff, v5
+; GCN-HSA-NEXT:    flat_store_dwordx4 v[22:23], v[18:21]
+; GCN-HSA-NEXT:    v_mov_b32_e32 v23, s15
+; GCN-HSA-NEXT:    v_mov_b32_e32 v22, s14
+; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v20, 16, v3
+; GCN-HSA-NEXT:    v_and_b32_e32 v18, 0xffff, v3
+; GCN-HSA-NEXT:    flat_store_dwordx4 v[22:23], v[18:21]
+; GCN-HSA-NEXT:    v_mov_b32_e32 v23, s11
+; GCN-HSA-NEXT:    v_mov_b32_e32 v22, s10
 ; GCN-HSA-NEXT:    s_waitcnt vmcnt(4)
-; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v19, 16, v8
-; GCN-HSA-NEXT:    v_and_b32_e32 v17, 0xffff, v8
+; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v20, 16, v9
+; GCN-HSA-NEXT:    v_and_b32_e32 v18, 0xffff, v9
+; GCN-HSA-NEXT:    flat_store_dwordx4 v[22:23], v[18:21]
+; GCN-HSA-NEXT:    v_mov_b32_e32 v23, s13
+; GCN-HSA-NEXT:    v_mov_b32_e32 v22, s12
+; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v20, 16, v7
+; GCN-HSA-NEXT:    v_and_b32_e32 v18, 0xffff, v7
+; GCN-HSA-NEXT:    flat_store_dwordx4 v[22:23], v[18:21]
+; GCN-HSA-NEXT:    v_mov_b32_e32 v23, s5
+; GCN-HSA-NEXT:    v_mov_b32_e32 v22, s4
+; GCN-HSA-NEXT:    s_waitcnt vmcnt(5)
+; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v20, 16, v11
+; GCN-HSA-NEXT:    v_and_b32_e32 v18, 0xffff, v11
+; GCN-HSA-NEXT:    flat_store_dwordx4 v[22:23], v[18:21]
+; GCN-HSA-NEXT:    v_mov_b32_e32 v22, s7
+; GCN-HSA-NEXT:    v_mov_b32_e32 v18, v1
+; GCN-HSA-NEXT:    v_mov_b32_e32 v20, v1
+; GCN-HSA-NEXT:    v_mov_b32_e32 v21, s6
+; GCN-HSA-NEXT:    s_waitcnt vmcnt(5)
+; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v19, 16, v17
+; GCN-HSA-NEXT:    v_and_b32_e32 v17, 0xffff, v17
 ; GCN-HSA-NEXT:    flat_store_dwordx4 v[21:22], v[17:20]
-; GCN-HSA-NEXT:    v_mov_b32_e32 v22, s13
-; GCN-HSA-NEXT:    v_mov_b32_e32 v21, s12
-; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v19, 16, v6
-; GCN-HSA-NEXT:    v_and_b32_e32 v17, 0xffff, v6
+; GCN-HSA-NEXT:    v_mov_b32_e32 v22, s9
+; GCN-HSA-NEXT:    s_add_u32 s4, s0, 32
+; GCN-HSA-NEXT:    v_mov_b32_e32 v21, s8
+; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v19, 16, v15
+; GCN-HSA-NEXT:    v_and_b32_e32 v17, 0xffff, v15
+; GCN-HSA-NEXT:    s_addc_u32 s5, s1, 0
 ; GCN-HSA-NEXT:    flat_store_dwordx4 v[21:22], v[17:20]
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v22, s5
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v21, s4
-; GCN-HSA-NEXT:    s_waitcnt vmcnt(5)
+; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v19, 16, v12
+; GCN-HSA-NEXT:    v_and_b32_e32 v17, 0xffff, v12
+; GCN-HSA-NEXT:    v_mov_b32_e32 v12, s1
+; GCN-HSA-NEXT:    s_add_u32 s4, s0, 0xe0
+; GCN-HSA-NEXT:    flat_store_dwordx4 v[21:22], v[17:20]
+; GCN-HSA-NEXT:    v_mov_b32_e32 v11, s0
 ; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v19, 16, v10
 ; GCN-HSA-NEXT:    v_and_b32_e32 v17, 0xffff, v10
-; GCN-HSA-NEXT:    flat_store_dwordx4 v[21:22], v[17:20]
-; GCN-HSA-NEXT:    v_mov_b32_e32 v21, s7
-; GCN-HSA-NEXT:    v_mov_b32_e32 v17, v4
-; GCN-HSA-NEXT:    v_mov_b32_e32 v19, v4
-; GCN-HSA-NEXT:    v_mov_b32_e32 v20, s6
-; GCN-HSA-NEXT:    s_waitcnt vmcnt(5)
-; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v18, 16, v16
-; GCN-HSA-NEXT:    v_and_b32_e32 v16, 0xffff, v16
-; GCN-HSA-NEXT:    flat_store_dwordx4 v[20:21], v[16:19]
-; GCN-HSA-NEXT:    v_mov_b32_e32 v21, s9
-; GCN-HSA-NEXT:    s_add_u32 s4, s0, 32
-; GCN-HSA-NEXT:    v_mov_b32_e32 v20, s8
-; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v18, 16, v14
-; GCN-HSA-NEXT:    v_and_b32_e32 v16, 0xffff, v14
-; GCN-HSA-NEXT:    s_addc_u32 s5, s1, 0
-; GCN-HSA-NEXT:    flat_store_dwordx4 v[20:21], v[16:19]
-; GCN-HSA-NEXT:    v_mov_b32_e32 v21, s5
-; GCN-HSA-NEXT:    v_mov_b32_e32 v19, 0
-; GCN-HSA-NEXT:    v_mov_b32_e32 v20, s4
-; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v18, 16, v11
-; GCN-HSA-NEXT:    v_and_b32_e32 v16, 0xffff, v11
-; GCN-HSA-NEXT:    v_mov_b32_e32 v11, s1
-; GCN-HSA-NEXT:    s_add_u32 s4, s0, 0xe0
-; GCN-HSA-NEXT:    flat_store_dwordx4 v[20:21], v[16:19]
-; GCN-HSA-NEXT:    v_mov_b32_e32 v10, s0
-; GCN-HSA-NEXT:    v_mov_b32_e32 v19, 0
-; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v18, 16, v9
-; GCN-HSA-NEXT:    v_and_b32_e32 v16, 0xffff, v9
 ; GCN-HSA-NEXT:    s_addc_u32 s5, s1, 0
-; GCN-HSA-NEXT:    flat_store_dwordx4 v[10:11], v[16:19]
-; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v10, 16, v15
-; GCN-HSA-NEXT:    v_and_b32_e32 v8, 0xffff, v15
-; GCN-HSA-NEXT:    v_mov_b32_e32 v15, s5
-; GCN-HSA-NEXT:    v_mov_b32_e32 v11, 0
-; GCN-HSA-NEXT:    v_mov_b32_e32 v9, v4
-; GCN-HSA-NEXT:    v_mov_b32_e32 v14, s4
-; GCN-HSA-NEXT:    flat_store_dwordx4 v[14:15], v[8:11]
-; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v16, 16, v0
-; GCN-HSA-NEXT:    v_and_b32_e32 v14, 0xffff, v0
-; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s2
-; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v20, 16, v5
-; GCN-HSA-NEXT:    v_and_b32_e32 v18, 0xffff, v5
-; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v5, 16, v12
-; GCN-HSA-NEXT:    v_and_b32_e32 v3, 0xffff, v12
-; GCN-HSA-NEXT:    v_mov_b32_e32 v6, v4
-; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s3
+; GCN-HSA-NEXT:    flat_store_dwordx4 v[11:12], v[17:20]
+; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v11, 16, v16
+; GCN-HSA-NEXT:    v_and_b32_e32 v9, 0xffff, v16
+; GCN-HSA-NEXT:    v_mov_b32_e32 v16, s5
+; GCN-HSA-NEXT:    v_mov_b32_e32 v10, v1
+; GCN-HSA-NEXT:    v_mov_b32_e32 v12, v1
+; GCN-HSA-NEXT:    v_mov_b32_e32 v15, s4
+; GCN-HSA-NEXT:    flat_store_dwordx4 v[15:16], v[9:12]
+; GCN-HSA-NEXT:    v_and_b32_e32 v0, 0xffff, v13
+; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v11, 16, v2
+; GCN-HSA-NEXT:    v_and_b32_e32 v9, 0xffff, v2
+; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v2, 16, v13
+; GCN-HSA-NEXT:    v_mov_b32_e32 v13, s3
+; GCN-HSA-NEXT:    v_mov_b32_e32 v3, v1
+; GCN-HSA-NEXT:    v_mov_b32_e32 v12, s2
 ; GCN-HSA-NEXT:    s_add_u32 s2, s0, 0xc0
-; GCN-HSA-NEXT:    flat_store_dwordx4 v[0:1], v[3:6]
+; GCN-HSA-NEXT:    flat_store_dwordx4 v[12:13], v[0:3]
 ; GCN-HSA-NEXT:    s_addc_u32 s3, s1, 0
-; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s2
-; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v28, 16, v13
-; GCN-HSA-NEXT:    v_and_b32_e32 v26, 0xffff, v13
-; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s3
+; GCN-HSA-NEXT:    v_mov_b32_e32 v2, s2
+; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v23, 16, v14
+; GCN-HSA-NEXT:    v_and_b32_e32 v21, 0xffff, v14
+; GCN-HSA-NEXT:    v_mov_b32_e32 v22, v1
+; GCN-HSA-NEXT:    v_mov_b32_e32 v3, s3
 ; GCN-HSA-NEXT:    s_add_u32 s2, s0, 0xa0
-; GCN-HSA-NEXT:    flat_store_dwordx4 v[0:1], v[26:29]
+; GCN-HSA-NEXT:    flat_store_dwordx4 v[2:3], v[21:24]
 ; GCN-HSA-NEXT:    s_addc_u32 s3, s1, 0
-; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s2
-; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v24, 16, v7
-; GCN-HSA-NEXT:    v_and_b32_e32 v22, 0xffff, v7
-; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s3
+; GCN-HSA-NEXT:    v_mov_b32_e32 v2, s2
+; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v20, 16, v8
+; GCN-HSA-NEXT:    v_and_b32_e32 v18, 0xffff, v8
+; GCN-HSA-NEXT:    v_mov_b32_e32 v19, v1
+; GCN-HSA-NEXT:    v_mov_b32_e32 v21, v1
+; GCN-HSA-NEXT:    v_mov_b32_e32 v3, s3
 ; GCN-HSA-NEXT:    s_add_u32 s2, s0, 0x80
-; GCN-HSA-NEXT:    flat_store_dwordx4 v[0:1], v[22:25]
+; GCN-HSA-NEXT:    flat_store_dwordx4 v[2:3], v[18:21]
 ; GCN-HSA-NEXT:    s_addc_u32 s3, s1, 0
-; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s2
-; GCN-HSA-NEXT:    v_mov_b32_e32 v21, 0
-; GCN-HSA-NEXT:    v_mov_b32_e32 v19, v4
-; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s3
+; GCN-HSA-NEXT:    v_mov_b32_e32 v2, s2
+; GCN-HSA-NEXT:    v_mov_b32_e32 v3, s3
 ; GCN-HSA-NEXT:    s_add_u32 s2, s0, 0x60
-; GCN-HSA-NEXT:    flat_store_dwordx4 v[0:1], v[18:21]
 ; GCN-HSA-NEXT:    s_addc_u32 s3, s1, 0
-; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s2
-; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v10, 16, v2
-; GCN-HSA-NEXT:    v_and_b32_e32 v8, 0xffff, v2
-; GCN-HSA-NEXT:    v_mov_b32_e32 v11, 0
-; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s3
+; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v17, 16, v6
+; GCN-HSA-NEXT:    v_and_b32_e32 v15, 0xffff, v6
+; GCN-HSA-NEXT:    v_mov_b32_e32 v16, v1
+; GCN-HSA-NEXT:    v_mov_b32_e32 v18, v1
 ; GCN-HSA-NEXT:    s_add_u32 s0, s0, 64
-; GCN-HSA-NEXT:    flat_store_dwordx4 v[0:1], v[8:11]
+; GCN-HSA-NEXT:    flat_store_dwordx4 v[2:3], v[15:18]
+; GCN-HSA-NEXT:    v_mov_b32_e32 v6, v1
+; GCN-HSA-NEXT:    v_mov_b32_e32 v8, v1
+; GCN-HSA-NEXT:    v_mov_b32_e32 v2, s2
+; GCN-HSA-NEXT:    v_mov_b32_e32 v12, v1
 ; GCN-HSA-NEXT:    s_addc_u32 s1, s1, 0
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v0, s0
-; GCN-HSA-NEXT:    v_mov_b32_e32 v17, 0
-; GCN-HSA-NEXT:    v_mov_b32_e32 v15, v4
+; GCN-HSA-NEXT:    v_lshrrev_b32_e32 v7, 16, v4
+; GCN-HSA-NEXT:    v_and_b32_e32 v5, 0xffff, v4
+; GCN-HSA-NEXT:    v_mov_b32_e32 v3, s3
 ; GCN-HSA-NEXT:    v_mov_b32_e32 v1, s1
-; GCN-HSA-NEXT:    flat_store_dwordx4 v[0:1], v[14:17]
+; GCN-HSA-NEXT:    flat_store_dwordx4 v[2:3], v[5:8]
+; GCN-HSA-NEXT:    flat_store_dwordx4 v[0:1], v[9:12]
 ; GCN-HSA-NEXT:    s_endpgm
 ;
 ; GCN-NOHSA-VI-LABEL: global_zextload_v32i16_to_v32i64:
@@ -7525,94 +7537,96 @@ define amdgpu_kernel void @global_zextload_v32i16_to_v32i64(ptr addrspace(1) %ou
 ; GCN-NOHSA-VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NOHSA-VI-NEXT:    s_mov_b32 s8, s6
 ; GCN-NOHSA-VI-NEXT:    s_mov_b32 s9, s7
-; GCN-NOHSA-VI-NEXT:    buffer_load_dwordx4 v[5:8], off, s[8:11], 0
-; GCN-NOHSA-VI-NEXT:    buffer_load_dwordx4 v[21:24], off, s[8:11], 0 offset:16
-; GCN-NOHSA-VI-NEXT:    buffer_load_dwordx4 v[31:34], off, s[8:11], 0 offset:32
-; GCN-NOHSA-VI-NEXT:    buffer_load_dwordx4 v[35:38], off, s[8:11], 0 offset:48
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v60, 0
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v58, 0
+; GCN-NOHSA-VI-NEXT:    buffer_load_dwordx4 v[7:10], off, s[8:11], 0
+; GCN-NOHSA-VI-NEXT:    buffer_load_dwordx4 v[13:16], off, s[8:11], 0 offset:16
 ; GCN-NOHSA-VI-NEXT:    s_mov_b32 s0, s4
 ; GCN-NOHSA-VI-NEXT:    s_mov_b32 s1, s5
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v55, 0
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v52, 0
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v49, 0
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v46, 0
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v43, 0
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v40, 0
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v29, 0
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v3, 0
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v1, 0
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v11, 0
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v9, 0
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v15, 0
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v13, 0
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v19, 0
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v17, 0
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v27, 0
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v25, 0
-; GCN-NOHSA-VI-NEXT:    s_waitcnt vmcnt(3)
-; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v2, 16, v6
 ; GCN-NOHSA-VI-NEXT:    s_waitcnt vmcnt(1)
-; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v47, 16, v34
+; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v6, 0xffff, v10
+; GCN-NOHSA-VI-NEXT:    s_waitcnt vmcnt(0)
+; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v19, 16, v14
+; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v17, 0xffff, v14
+; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v23, 16, v13
+; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v21, 0xffff, v13
+; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v27, 16, v16
+; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v25, 0xffff, v16
+; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v30, 16, v15
+; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v28, 0xffff, v15
+; GCN-NOHSA-VI-NEXT:    buffer_load_dwordx4 v[13:16], off, s[8:11], 0 offset:32
+; GCN-NOHSA-VI-NEXT:    buffer_load_dwordx4 v[31:34], off, s[8:11], 0 offset:48
+; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v2, 16, v8
+; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v0, 0xffff, v8
+; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v5, 16, v7
+; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v3, 0xffff, v7
+; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v8, 16, v10
+; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v12, 16, v9
+; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v10, 0xffff, v9
+; GCN-NOHSA-VI-NEXT:    s_waitcnt vmcnt(1)
+; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v40, 16, v13
+; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v38, 0xffff, v13
+; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v46, 16, v15
+; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v44, 0xffff, v15
 ; GCN-NOHSA-VI-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v59, 16, v37
-; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v57, 0xffff, v37
-; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v56, 16, v38
-; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v54, 0xffff, v38
-; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[57:60], off, s[0:3], 0 offset:224
-; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v53, 16, v35
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v57, 0
-; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v51, 0xffff, v35
-; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[54:57], off, s[0:3], 0 offset:240
-; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v50, 16, v36
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v54, 0
-; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v48, 0xffff, v36
-; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[51:54], off, s[0:3], 0 offset:192
-; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v45, 0xffff, v34
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v51, 0
-; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[48:51], off, s[0:3], 0 offset:208
-; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v44, 16, v31
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v48, 0
-; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v42, 0xffff, v31
-; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[45:48], off, s[0:3], 0 offset:176
-; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v30, 16, v23
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v45, 0
-; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v28, 0xffff, v23
-; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v41, 16, v32
-; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v39, 0xffff, v32
+; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v15, 16, v32
+; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v13, 0xffff, v32
+; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v52, 16, v34
+; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v50, 0xffff, v34
 ; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v34, 16, v33
 ; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v32, 0xffff, v33
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v35, 0
 ; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v33, 0
-; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[42:45], off, s[0:3], 0 offset:128
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v31, 0
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v42, 0
-; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v0, 0xffff, v6
-; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v6, 16, v5
-; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v4, 0xffff, v5
-; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v10, 16, v8
-; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v8, 0xffff, v8
-; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v14, 16, v7
-; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v12, 0xffff, v7
-; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v18, 16, v22
-; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v16, 0xffff, v22
-; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v22, 16, v21
-; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v20, 0xffff, v21
-; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v26, 16, v24
-; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v24, 0xffff, v24
-; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[32:35], off, s[0:3], 0 offset:160
-; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[39:42], off, s[0:3], 0 offset:144
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v7, 0
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v5, 0
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v23, 0
-; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v21, 0
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v51, v33
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v53, v33
+; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v49, 16, v31
+; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v47, 0xffff, v31
+; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[50:53], off, s[0:3], 0 offset:240
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v48, v33
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v50, v33
+; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[47:50], off, s[0:3], 0 offset:192
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v45, v33
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v47, v33
+; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v43, 16, v16
+; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v41, 0xffff, v16
+; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[44:47], off, s[0:3], 0 offset:160
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v42, v33
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v44, v33
+; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[41:44], off, s[0:3], 0 offset:176
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v39, v33
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v41, v33
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v29, v33
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v31, v33
+; GCN-NOHSA-VI-NEXT:    v_lshrrev_b32_e32 v37, 16, v14
+; GCN-NOHSA-VI-NEXT:    v_and_b32_e32 v35, 0xffff, v14
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v14, v33
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v16, v33
+; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[38:41], off, s[0:3], 0 offset:128
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v36, v33
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v38, v33
 ; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[28:31], off, s[0:3], 0 offset:96
-; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[24:27], off, s[0:3], 0 offset:112
-; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[20:23], off, s[0:3], 0 offset:64
-; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[16:19], off, s[0:3], 0 offset:80
-; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[12:15], off, s[0:3], 0 offset:32
-; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[8:11], off, s[0:3], 0 offset:48
-; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[4:7], off, s[0:3], 0
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v26, v33
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v28, v33
+; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[13:16], off, s[0:3], 0 offset:208
+; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[35:38], off, s[0:3], 0 offset:144
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v22, v33
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v35, v33
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v24, v33
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v18, v33
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v20, v33
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v11, v33
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v13, v33
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v7, v33
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v9, v33
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v4, v33
+; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[25:28], off, s[0:3], 0 offset:112
+; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[32:35], off, s[0:3], 0 offset:224
+; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[21:24], off, s[0:3], 0 offset:64
+; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[17:20], off, s[0:3], 0 offset:80
+; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[10:13], off, s[0:3], 0 offset:32
+; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[6:9], off, s[0:3], 0 offset:48
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v1, v33
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v6, v33
+; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[3:6], off, s[0:3], 0
+; GCN-NOHSA-VI-NEXT:    s_nop 0
+; GCN-NOHSA-VI-NEXT:    v_mov_b32_e32 v3, v33
 ; GCN-NOHSA-VI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16
 ; GCN-NOHSA-VI-NEXT:    s_endpgm
 ;
diff --git a/llvm/test/CodeGen/AMDGPU/mad-mix.ll b/llvm/test/CodeGen/AMDGPU/mad-mix.ll
index 9bdbac7f0fc9dc6..2f749900214ee52 100644
--- a/llvm/test/CodeGen/AMDGPU/mad-mix.ll
+++ b/llvm/test/CodeGen/AMDGPU/mad-mix.ll
@@ -1030,15 +1030,15 @@ define float @v_mad_mix_f32_f16lo_f16lo_cvtf16imm63(half %src0, half %src1) #0 {
 }
 
 define <2 x float> @v_mad_mix_v2f32_f32imm1(<2 x half> %src0, <2 x half> %src1) #0 {
-; GFX1100-LABEL: v_mad_mix_v2f32_f32imm1:
-; GFX1100:       ; %bb.0:
-; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1100-NEXT:    s_mov_b32 s0, 1.0
-; GFX1100-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1100-NEXT:    v_fma_mix_f32 v2, v0, v1, s0 op_sel_hi:[1,1,0]
-; GFX1100-NEXT:    v_fma_mix_f32 v1, v0, v1, s0 op_sel:[1,1,0] op_sel_hi:[1,1,0]
-; GFX1100-NEXT:    v_mov_b32_e32 v0, v2
-; GFX1100-NEXT:    s_setpc_b64 s[30:31]
+; SDAG-GFX1100-LABEL: v_mad_mix_v2f32_f32imm1:
+; SDAG-GFX1100:       ; %bb.0:
+; SDAG-GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-GFX1100-NEXT:    s_mov_b32 s0, 1.0
+; SDAG-GFX1100-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; SDAG-GFX1100-NEXT:    v_fma_mix_f32 v2, v0, v1, s0 op_sel_hi:[1,1,0]
+; SDAG-GFX1100-NEXT:    v_fma_mix_f32 v1, v0, v1, s0 op_sel:[1,1,0] op_sel_hi:[1,1,0]
+; SDAG-GFX1100-NEXT:    v_mov_b32_e32 v0, v2
+; SDAG-GFX1100-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SDAG-GFX900-LABEL: v_mad_mix_v2f32_f32imm1:
 ; SDAG-GFX900:       ; %bb.0:
@@ -1084,21 +1084,31 @@ define <2 x float> @v_mad_mix_v2f32_f32imm1(<2 x half> %src0, <2 x half> %src1)
 ; SDAG-CI-NEXT:    v_mad_f32 v1, v1, v3, 1.0
 ; SDAG-CI-NEXT:    s_setpc_b64 s[30:31]
 ;
+; GISEL-GFX1100-LABEL: v_mad_mix_v2f32_f32imm1:
+; GISEL-GFX1100:       ; %bb.0:
+; GISEL-GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-GFX1100-NEXT:    v_mov_b32_e32 v3, 1.0
+; GISEL-GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GISEL-GFX1100-NEXT:    v_fma_mix_f32 v2, v0, v1, v3 op_sel_hi:[1,1,0]
+; GISEL-GFX1100-NEXT:    v_fma_mix_f32 v1, v0, v1, v3 op_sel:[1,1,0] op_sel_hi:[1,1,0]
+; GISEL-GFX1100-NEXT:    v_mov_b32_e32 v0, v2
+; GISEL-GFX1100-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GISEL-GFX900-LABEL: v_mad_mix_v2f32_f32imm1:
 ; GISEL-GFX900:       ; %bb.0:
 ; GISEL-GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-GFX900-NEXT:    s_mov_b32 s4, 1.0
-; GISEL-GFX900-NEXT:    v_mad_mix_f32 v2, v0, v1, s4 op_sel_hi:[1,1,0]
-; GISEL-GFX900-NEXT:    v_mad_mix_f32 v1, v0, v1, s4 op_sel:[1,1,0] op_sel_hi:[1,1,0]
+; GISEL-GFX900-NEXT:    v_mov_b32_e32 v3, 1.0
+; GISEL-GFX900-NEXT:    v_mad_mix_f32 v2, v0, v1, v3 op_sel_hi:[1,1,0]
+; GISEL-GFX900-NEXT:    v_mad_mix_f32 v1, v0, v1, v3 op_sel:[1,1,0] op_sel_hi:[1,1,0]
 ; GISEL-GFX900-NEXT:    v_mov_b32_e32 v0, v2
 ; GISEL-GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-GFX906-LABEL: v_mad_mix_v2f32_f32imm1:
 ; GISEL-GFX906:       ; %bb.0:
 ; GISEL-GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-GFX906-NEXT:    s_mov_b32 s4, 1.0
-; GISEL-GFX906-NEXT:    v_fma_mix_f32 v2, v0, v1, s4 op_sel_hi:[1,1,0]
-; GISEL-GFX906-NEXT:    v_fma_mix_f32 v1, v0, v1, s4 op_sel:[1,1,0] op_sel_hi:[1,1,0]
+; GISEL-GFX906-NEXT:    v_mov_b32_e32 v3, 1.0
+; GISEL-GFX906-NEXT:    v_fma_mix_f32 v2, v0, v1, v3 op_sel_hi:[1,1,0]
+; GISEL-GFX906-NEXT:    v_fma_mix_f32 v1, v0, v1, v3 op_sel:[1,1,0] op_sel_hi:[1,1,0]
 ; GISEL-GFX906-NEXT:    v_mov_b32_e32 v0, v2
 ; GISEL-GFX906-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -1130,15 +1140,15 @@ define <2 x float> @v_mad_mix_v2f32_f32imm1(<2 x half> %src0, <2 x half> %src1)
 }
 
 define <2 x float> @v_mad_mix_v2f32_cvtf16imminv2pi(<2 x half> %src0, <2 x half> %src1) #0 {
-; GFX1100-LABEL: v_mad_mix_v2f32_cvtf16imminv2pi:
-; GFX1100:       ; %bb.0:
-; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1100-NEXT:    s_mov_b32 s0, 0x3e230000
-; GFX1100-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1100-NEXT:    v_fma_mix_f32 v2, v0, v1, s0 op_sel_hi:[1,1,0]
-; GFX1100-NEXT:    v_fma_mix_f32 v1, v0, v1, s0 op_sel:[1,1,0] op_sel_hi:[1,1,0]
-; GFX1100-NEXT:    v_mov_b32_e32 v0, v2
-; GFX1100-NEXT:    s_setpc_b64 s[30:31]
+; SDAG-GFX1100-LABEL: v_mad_mix_v2f32_cvtf16imminv2pi:
+; SDAG-GFX1100:       ; %bb.0:
+; SDAG-GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-GFX1100-NEXT:    s_mov_b32 s0, 0x3e230000
+; SDAG-GFX1100-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; SDAG-GFX1100-NEXT:    v_fma_mix_f32 v2, v0, v1, s0 op_sel_hi:[1,1,0]
+; SDAG-GFX1100-NEXT:    v_fma_mix_f32 v1, v0, v1, s0 op_sel:[1,1,0] op_sel_hi:[1,1,0]
+; SDAG-GFX1100-NEXT:    v_mov_b32_e32 v0, v2
+; SDAG-GFX1100-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SDAG-GFX900-LABEL: v_mad_mix_v2f32_cvtf16imminv2pi:
 ; SDAG-GFX900:       ; %bb.0:
@@ -1186,21 +1196,31 @@ define <2 x float> @v_mad_mix_v2f32_cvtf16imminv2pi(<2 x half> %src0, <2 x half>
 ; SDAG-CI-NEXT:    v_mac_f32_e32 v1, v4, v3
 ; SDAG-CI-NEXT:    s_setpc_b64 s[30:31]
 ;
+; GISEL-GFX1100-LABEL: v_mad_mix_v2f32_cvtf16imminv2pi:
+; GISEL-GFX1100:       ; %bb.0:
+; GISEL-GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-GFX1100-NEXT:    v_mov_b32_e32 v3, 0x3e230000
+; GISEL-GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GISEL-GFX1100-NEXT:    v_fma_mix_f32 v2, v0, v1, v3 op_sel_hi:[1,1,0]
+; GISEL-GFX1100-NEXT:    v_fma_mix_f32 v1, v0, v1, v3 op_sel:[1,1,0] op_sel_hi:[1,1,0]
+; GISEL-GFX1100-NEXT:    v_mov_b32_e32 v0, v2
+; GISEL-GFX1100-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GISEL-GFX900-LABEL: v_mad_mix_v2f32_cvtf16imminv2pi:
 ; GISEL-GFX900:       ; %bb.0:
 ; GISEL-GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-GFX900-NEXT:    s_mov_b32 s4, 0x3e230000
-; GISEL-GFX900-NEXT:    v_mad_mix_f32 v2, v0, v1, s4 op_sel_hi:[1,1,0]
-; GISEL-GFX900-NEXT:    v_mad_mix_f32 v1, v0, v1, s4 op_sel:[1,1,0] op_sel_hi:[1,1,0]
+; GISEL-GFX900-NEXT:    v_mov_b32_e32 v3, 0x3e230000
+; GISEL-GFX900-NEXT:    v_mad_mix_f32 v2, v0, v1, v3 op_sel_hi:[1,1,0]
+; GISEL-GFX900-NEXT:    v_mad_mix_f32 v1, v0, v1, v3 op_sel:[1,1,0] op_sel_hi:[1,1,0]
 ; GISEL-GFX900-NEXT:    v_mov_b32_e32 v0, v2
 ; GISEL-GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-GFX906-LABEL: v_mad_mix_v2f32_cvtf16imminv2pi:
 ; GISEL-GFX906:       ; %bb.0:
 ; GISEL-GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-GFX906-NEXT:    s_mov_b32 s4, 0x3e230000
-; GISEL-GFX906-NEXT:    v_fma_mix_f32 v2, v0, v1, s4 op_sel_hi:[1,1,0]
-; GISEL-GFX906-NEXT:    v_fma_mix_f32 v1, v0, v1, s4 op_sel:[1,1,0] op_sel_hi:[1,1,0]
+; GISEL-GFX906-NEXT:    v_mov_b32_e32 v3, 0x3e230000
+; GISEL-GFX906-NEXT:    v_fma_mix_f32 v2, v0, v1, v3 op_sel_hi:[1,1,0]
+; GISEL-GFX906-NEXT:    v_fma_mix_f32 v1, v0, v1, v3 op_sel:[1,1,0] op_sel_hi:[1,1,0]
 ; GISEL-GFX906-NEXT:    v_mov_b32_e32 v0, v2
 ; GISEL-GFX906-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -1210,22 +1230,22 @@ define <2 x float> @v_mad_mix_v2f32_cvtf16imminv2pi(<2 x half> %src0, <2 x half>
 ; GISEL-VI-NEXT:    v_cvt_f32_f16_e32 v2, v0
 ; GISEL-VI-NEXT:    v_cvt_f32_f16_sdwa v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; GISEL-VI-NEXT:    v_cvt_f32_f16_e32 v0, v1
-; GISEL-VI-NEXT:    v_cvt_f32_f16_sdwa v1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; GISEL-VI-NEXT:    s_mov_b32 s4, 0x3e230000
-; GISEL-VI-NEXT:    v_mad_f32 v0, v2, v0, s4
-; GISEL-VI-NEXT:    v_mad_f32 v1, v3, v1, s4
+; GISEL-VI-NEXT:    v_cvt_f32_f16_sdwa v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; GISEL-VI-NEXT:    v_mov_b32_e32 v1, 0x3e230000
+; GISEL-VI-NEXT:    v_madak_f32 v0, v2, v0, 0x3e230000
+; GISEL-VI-NEXT:    v_mac_f32_e32 v1, v3, v4
 ; GISEL-VI-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-CI-LABEL: v_mad_mix_v2f32_cvtf16imminv2pi:
 ; GISEL-CI:       ; %bb.0:
 ; GISEL-CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-CI-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; GISEL-CI-NEXT:    v_cvt_f32_f16_e32 v1, v1
+; GISEL-CI-NEXT:    v_cvt_f32_f16_e32 v4, v1
 ; GISEL-CI-NEXT:    v_cvt_f32_f16_e32 v2, v2
 ; GISEL-CI-NEXT:    v_cvt_f32_f16_e32 v3, v3
-; GISEL-CI-NEXT:    s_mov_b32 s4, 0x3e230000
-; GISEL-CI-NEXT:    v_mad_f32 v0, v0, v2, s4
-; GISEL-CI-NEXT:    v_mad_f32 v1, v1, v3, s4
+; GISEL-CI-NEXT:    v_mov_b32_e32 v1, 0x3e230000
+; GISEL-CI-NEXT:    v_madak_f32 v0, v0, v2, 0x3e230000
+; GISEL-CI-NEXT:    v_mac_f32_e32 v1, v4, v3
 ; GISEL-CI-NEXT:    s_setpc_b64 s[30:31]
   %src0.ext = fpext <2 x half> %src0 to <2 x float>
   %src1.ext = fpext <2 x half> %src1 to <2 x float>
@@ -1235,15 +1255,15 @@ define <2 x float> @v_mad_mix_v2f32_cvtf16imminv2pi(<2 x half> %src0, <2 x half>
 }
 
 define <2 x float> @v_mad_mix_v2f32_f32imminv2pi(<2 x half> %src0, <2 x half> %src1) #0 {
-; GFX1100-LABEL: v_mad_mix_v2f32_f32imminv2pi:
-; GFX1100:       ; %bb.0:
-; GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1100-NEXT:    s_mov_b32 s0, 0.15915494
-; GFX1100-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX1100-NEXT:    v_fma_mix_f32 v2, v0, v1, s0 op_sel_hi:[1,1,0]
-; GFX1100-NEXT:    v_fma_mix_f32 v1, v0, v1, s0 op_sel:[1,1,0] op_sel_hi:[1,1,0]
-; GFX1100-NEXT:    v_mov_b32_e32 v0, v2
-; GFX1100-NEXT:    s_setpc_b64 s[30:31]
+; SDAG-GFX1100-LABEL: v_mad_mix_v2f32_f32imminv2pi:
+; SDAG-GFX1100:       ; %bb.0:
+; SDAG-GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-GFX1100-NEXT:    s_mov_b32 s0, 0.15915494
+; SDAG-GFX1100-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; SDAG-GFX1100-NEXT:    v_fma_mix_f32 v2, v0, v1, s0 op_sel_hi:[1,1,0]
+; SDAG-GFX1100-NEXT:    v_fma_mix_f32 v1, v0, v1, s0 op_sel:[1,1,0] op_sel_hi:[1,1,0]
+; SDAG-GFX1100-NEXT:    v_mov_b32_e32 v0, v2
+; SDAG-GFX1100-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; SDAG-GFX900-LABEL: v_mad_mix_v2f32_f32imminv2pi:
 ; SDAG-GFX900:       ; %bb.0:
@@ -1290,21 +1310,31 @@ define <2 x float> @v_mad_mix_v2f32_f32imminv2pi(<2 x half> %src0, <2 x half> %s
 ; SDAG-CI-NEXT:    v_mac_f32_e32 v1, v4, v3
 ; SDAG-CI-NEXT:    s_setpc_b64 s[30:31]
 ;
+; GISEL-GFX1100-LABEL: v_mad_mix_v2f32_f32imminv2pi:
+; GISEL-GFX1100:       ; %bb.0:
+; GISEL-GFX1100-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-GFX1100-NEXT:    v_mov_b32_e32 v3, 0.15915494
+; GISEL-GFX1100-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GISEL-GFX1100-NEXT:    v_fma_mix_f32 v2, v0, v1, v3 op_sel_hi:[1,1,0]
+; GISEL-GFX1100-NEXT:    v_fma_mix_f32 v1, v0, v1, v3 op_sel:[1,1,0] op_sel_hi:[1,1,0]
+; GISEL-GFX1100-NEXT:    v_mov_b32_e32 v0, v2
+; GISEL-GFX1100-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GISEL-GFX900-LABEL: v_mad_mix_v2f32_f32imminv2pi:
 ; GISEL-GFX900:       ; %bb.0:
 ; GISEL-GFX900-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-GFX900-NEXT:    s_mov_b32 s4, 0.15915494
-; GISEL-GFX900-NEXT:    v_mad_mix_f32 v2, v0, v1, s4 op_sel_hi:[1,1,0]
-; GISEL-GFX900-NEXT:    v_mad_mix_f32 v1, v0, v1, s4 op_sel:[1,1,0] op_sel_hi:[1,1,0]
+; GISEL-GFX900-NEXT:    v_mov_b32_e32 v3, 0.15915494
+; GISEL-GFX900-NEXT:    v_mad_mix_f32 v2, v0, v1, v3 op_sel_hi:[1,1,0]
+; GISEL-GFX900-NEXT:    v_mad_mix_f32 v1, v0, v1, v3 op_sel:[1,1,0] op_sel_hi:[1,1,0]
 ; GISEL-GFX900-NEXT:    v_mov_b32_e32 v0, v2
 ; GISEL-GFX900-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GISEL-GFX906-LABEL: v_mad_mix_v2f32_f32imminv2pi:
 ; GISEL-GFX906:       ; %bb.0:
 ; GISEL-GFX906-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-GFX906-NEXT:    s_mov_b32 s4, 0.15915494
-; GISEL-GFX906-NEXT:    v_fma_mix_f32 v2, v0, v1, s4 op_sel_hi:[1,1,0]
-; GISEL-GFX906-NEXT:    v_fma_mix_f32 v1, v0, v1, s4 op_sel:[1,1,0] op_sel_hi:[1,1,0]
+; GISEL-GFX906-NEXT:    v_mov_b32_e32 v3, 0.15915494
+; GISEL-GFX906-NEXT:    v_fma_mix_f32 v2, v0, v1, v3 op_sel_hi:[1,1,0]
+; GISEL-GFX906-NEXT:    v_fma_mix_f32 v1, v0, v1, v3 op_sel:[1,1,0] op_sel_hi:[1,1,0]
 ; GISEL-GFX906-NEXT:    v_mov_b32_e32 v0, v2
 ; GISEL-GFX906-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -1323,12 +1353,12 @@ define <2 x float> @v_mad_mix_v2f32_f32imminv2pi(<2 x half> %src0, <2 x half> %s
 ; GISEL-CI:       ; %bb.0:
 ; GISEL-CI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-CI-NEXT:    v_cvt_f32_f16_e32 v0, v0
-; GISEL-CI-NEXT:    v_cvt_f32_f16_e32 v1, v1
+; GISEL-CI-NEXT:    v_cvt_f32_f16_e32 v4, v1
 ; GISEL-CI-NEXT:    v_cvt_f32_f16_e32 v2, v2
 ; GISEL-CI-NEXT:    v_cvt_f32_f16_e32 v3, v3
-; GISEL-CI-NEXT:    s_mov_b32 s4, 0x3e22f983
-; GISEL-CI-NEXT:    v_mad_f32 v0, v0, v2, s4
-; GISEL-CI-NEXT:    v_mad_f32 v1, v1, v3, s4
+; GISEL-CI-NEXT:    v_mov_b32_e32 v1, 0x3e22f983
+; GISEL-CI-NEXT:    v_madak_f32 v0, v0, v2, 0x3e22f983
+; GISEL-CI-NEXT:    v_mac_f32_e32 v1, v4, v3
 ; GISEL-CI-NEXT:    s_setpc_b64 s[30:31]
   %src0.ext = fpext <2 x half> %src0 to <2 x float>
   %src1.ext = fpext <2 x half> %src1 to <2 x float>
diff --git a/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll b/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll
index 3cb03099da93d51..05c4f45a9aacd91 100644
--- a/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll
+++ b/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll
@@ -2465,8 +2465,7 @@ define hidden amdgpu_kernel void @negativeoffset(ptr addrspace(1) nocapture %buf
 ; GFX9-NEXT:    v_lshlrev_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v2, v0
 ; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v1, vcc
-; GFX9-NEXT:    s_movk_i32 s0, 0x1000
-; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, s0, v2
+; GFX9-NEXT:    v_add_co_u32_e32 v0, vcc, 0x1000, v2
 ; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, -1, v3, vcc
 ; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, 0, v2
 ; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, -1, v3, vcc
diff --git a/llvm/test/CodeGen/AMDGPU/remat-fp64-constants.ll b/llvm/test/CodeGen/AMDGPU/remat-fp64-constants.ll
index 26cde1e6cd59d20..506c92a71b3babb 100644
--- a/llvm/test/CodeGen/AMDGPU/remat-fp64-constants.ll
+++ b/llvm/test/CodeGen/AMDGPU/remat-fp64-constants.ll
@@ -3,8 +3,8 @@
 
 ; GCN-LABEL: {{^}}test_remat_sgpr:
 ; GCN-NOT:     v_writelane_b32
-; GCN:         {{^}}[[LOOP:.LBB[0-9_]+]]:
 ; GCN-COUNT-6: s_mov_b32 s{{[0-9]+}}, 0x
+; GCN:         {{^}}[[LOOP:.LBB[0-9_]+]]:
 ; GCN-NOT:     v_writelane_b32
 ; GCN:         s_cbranch_{{[^ ]+}} [[LOOP]]
 ; GCN: .sgpr_spill_count: 0
diff --git a/llvm/test/CodeGen/AMDGPU/shrink-add-sub-constant.ll b/llvm/test/CodeGen/AMDGPU/shrink-add-sub-constant.ll
index 8da720d7f991cdd..a8ae8c0c38792a0 100644
--- a/llvm/test/CodeGen/AMDGPU/shrink-add-sub-constant.ll
+++ b/llvm/test/CodeGen/AMDGPU/shrink-add-sub-constant.ll
@@ -2591,11 +2591,10 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg32_neg32(ptr addrspace(1) %out,
 ; SI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-GISEL-NEXT:    s_mov_b64 s[4:5], s[2:3]
 ; SI-GISEL-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
-; SI-GISEL-NEXT:    s_movk_i32 s2, 0xffe0
 ; SI-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; SI-GISEL-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
-; SI-GISEL-NEXT:    v_add_i32_e32 v2, vcc, s2, v2
-; SI-GISEL-NEXT:    v_add_i32_e32 v3, vcc, s2, v3
+; SI-GISEL-NEXT:    v_add_i32_e32 v2, vcc, 0xffffffe0, v2
+; SI-GISEL-NEXT:    v_add_i32_e32 v3, vcc, 0xffffffe0, v3
 ; SI-GISEL-NEXT:    v_and_b32_e32 v2, 0xffff, v2
 ; SI-GISEL-NEXT:    v_and_b32_e32 v3, 0xffff, v3
 ; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
@@ -3554,11 +3553,10 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg_fpone(ptr addrspace(1) %out, p
 ; SI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-GISEL-NEXT:    s_mov_b64 s[4:5], s[2:3]
 ; SI-GISEL-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
-; SI-GISEL-NEXT:    s_movk_i32 s2, 0xc400
 ; SI-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; SI-GISEL-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
-; SI-GISEL-NEXT:    v_add_i32_e32 v2, vcc, s2, v2
-; SI-GISEL-NEXT:    v_add_i32_e32 v3, vcc, s2, v3
+; SI-GISEL-NEXT:    v_add_i32_e32 v2, vcc, 0xffffc400, v2
+; SI-GISEL-NEXT:    v_add_i32_e32 v3, vcc, 0xffffc400, v3
 ; SI-GISEL-NEXT:    v_and_b32_e32 v2, 0xffff, v2
 ; SI-GISEL-NEXT:    v_and_b32_e32 v3, 0xffff, v3
 ; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
@@ -3720,11 +3718,10 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg_negfpone(ptr addrspace(1) %out
 ; SI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-GISEL-NEXT:    s_mov_b64 s[4:5], s[2:3]
 ; SI-GISEL-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
-; SI-GISEL-NEXT:    s_movk_i32 s2, 0x4400
 ; SI-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; SI-GISEL-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
-; SI-GISEL-NEXT:    v_add_i32_e32 v2, vcc, s2, v2
-; SI-GISEL-NEXT:    v_add_i32_e32 v3, vcc, s2, v3
+; SI-GISEL-NEXT:    v_add_i32_e32 v2, vcc, 0x4400, v2
+; SI-GISEL-NEXT:    v_add_i32_e32 v3, vcc, 0x4400, v3
 ; SI-GISEL-NEXT:    v_and_b32_e32 v2, 0xffff, v2
 ; SI-GISEL-NEXT:    v_and_b32_e32 v3, 0xffff, v3
 ; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
@@ -3886,11 +3883,10 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg_fptwo(ptr addrspace(1) %out, p
 ; SI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-GISEL-NEXT:    s_mov_b64 s[4:5], s[2:3]
 ; SI-GISEL-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
-; SI-GISEL-NEXT:    s_movk_i32 s2, 0x4000
 ; SI-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; SI-GISEL-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
-; SI-GISEL-NEXT:    v_add_i32_e32 v2, vcc, s2, v2
-; SI-GISEL-NEXT:    v_add_i32_e32 v3, vcc, s2, v3
+; SI-GISEL-NEXT:    v_add_i32_e32 v2, vcc, 0x4000, v2
+; SI-GISEL-NEXT:    v_add_i32_e32 v3, vcc, 0x4000, v3
 ; SI-GISEL-NEXT:    v_and_b32_e32 v2, 0xffff, v2
 ; SI-GISEL-NEXT:    v_and_b32_e32 v3, 0xffff, v3
 ; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
@@ -4052,11 +4048,10 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg_negfptwo(ptr addrspace(1) %out
 ; SI-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; SI-GISEL-NEXT:    s_mov_b64 s[4:5], s[2:3]
 ; SI-GISEL-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
-; SI-GISEL-NEXT:    s_movk_i32 s2, 0xc000
 ; SI-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; SI-GISEL-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
-; SI-GISEL-NEXT:    v_add_i32_e32 v2, vcc, s2, v2
-; SI-GISEL-NEXT:    v_add_i32_e32 v3, vcc, s2, v3
+; SI-GISEL-NEXT:    v_add_i32_e32 v2, vcc, 0xffffc000, v2
+; SI-GISEL-NEXT:    v_add_i32_e32 v3, vcc, 0xffffc000, v3
 ; SI-GISEL-NEXT:    v_and_b32_e32 v2, 0xffff, v2
 ; SI-GISEL-NEXT:    v_and_b32_e32 v3, 0xffff, v3
 ; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
@@ -4280,11 +4275,11 @@ define amdgpu_kernel void @v_test_v2i16_x_add_undef_neg32(ptr addrspace(1) %out,
 ; GFX9-GISEL:       ; %bb.0:
 ; GFX9-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[0:1], 0x24
 ; GFX9-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-GISEL-NEXT:    v_mov_b32_e32 v2, 0xffe00000
 ; GFX9-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-GISEL-NEXT:    global_load_dword v1, v0, s[2:3]
-; GFX9-GISEL-NEXT:    s_mov_b32 s2, 0xffe00000
 ; GFX9-GISEL-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-GISEL-NEXT:    v_pk_add_u16 v1, v1, s2
+; GFX9-GISEL-NEXT:    v_pk_add_u16 v1, v1, v2
 ; GFX9-GISEL-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX9-GISEL-NEXT:    s_endpgm
 ;
@@ -4306,7 +4301,7 @@ define amdgpu_kernel void @v_test_v2i16_x_add_undef_neg32(ptr addrspace(1) %out,
 ; GFX10-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-GISEL-NEXT:    global_load_dword v1, v0, s[2:3]
 ; GFX10-GISEL-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-GISEL-NEXT:    v_pk_add_u16 v1, v1, 0xffe0 op_sel:[0,1] op_sel_hi:[1,0]
+; GFX10-GISEL-NEXT:    v_pk_add_u16 v1, 0xffe0, v1 op_sel:[1,0] op_sel_hi:[0,1]
 ; GFX10-GISEL-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX10-GISEL-NEXT:    s_endpgm
 ;
@@ -4330,7 +4325,7 @@ define amdgpu_kernel void @v_test_v2i16_x_add_undef_neg32(ptr addrspace(1) %out,
 ; GFX11-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-GISEL-NEXT:    global_load_b32 v1, v0, s[2:3]
 ; GFX11-GISEL-NEXT:    s_waitcnt vmcnt(0)
-; GFX11-GISEL-NEXT:    v_pk_add_u16 v1, v1, 0xffe0 op_sel:[0,1] op_sel_hi:[1,0]
+; GFX11-GISEL-NEXT:    v_pk_add_u16 v1, 0xffe0, v1 op_sel:[1,0] op_sel_hi:[0,1]
 ; GFX11-GISEL-NEXT:    global_store_b32 v0, v1, s[0:1]
 ; GFX11-GISEL-NEXT:    s_nop 0
 ; GFX11-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
diff --git a/llvm/test/CodeGen/AMDGPU/v_sat_pk_u8_i16.ll b/llvm/test/CodeGen/AMDGPU/v_sat_pk_u8_i16.ll
index b9cdd478090e586..e46992ccbbc57d4 100644
--- a/llvm/test/CodeGen/AMDGPU/v_sat_pk_u8_i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/v_sat_pk_u8_i16.ll
@@ -50,8 +50,8 @@ define <2 x i16> @basic_smax_smin(i16 %src0, i16 %src1) {
 ; GISEL-VI:       ; %bb.0:
 ; GISEL-VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GISEL-VI-NEXT:    v_max_i16_e32 v0, 0, v0
-; GISEL-VI-NEXT:    v_max_i16_e32 v1, 0, v1
 ; GISEL-VI-NEXT:    v_mov_b32_e32 v2, 0xff
+; GISEL-VI-NEXT:    v_max_i16_e32 v1, 0, v1
 ; GISEL-VI-NEXT:    v_min_i16_e32 v0, 0xff, v0
 ; GISEL-VI-NEXT:    v_min_i16_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; GISEL-VI-NEXT:    v_or_b32_e32 v0, v0, v1
@@ -312,9 +312,9 @@ define <2 x i16> @basic_smin_smax_combined(i16 %src0, i16 %src1) {
 ; GISEL-VI-LABEL: basic_smin_smax_combined:
 ; GISEL-VI:       ; %bb.0:
 ; GISEL-VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-VI-NEXT:    v_mov_b32_e32 v2, 0xff
 ; GISEL-VI-NEXT:    v_min_i16_e32 v0, 0xff, v0
 ; GISEL-VI-NEXT:    v_max_i16_e32 v1, 0, v1
-; GISEL-VI-NEXT:    v_mov_b32_e32 v2, 0xff
 ; GISEL-VI-NEXT:    v_max_i16_e32 v0, 0, v0
 ; GISEL-VI-NEXT:    v_min_i16_sdwa v1, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; GISEL-VI-NEXT:    v_or_b32_e32 v0, v0, v1
@@ -559,11 +559,11 @@ define <2 x i16> @vec_smin_smax(<2 x i16> %src) {
 ; GISEL-VI-LABEL: vec_smin_smax:
 ; GISEL-VI:       ; %bb.0:
 ; GISEL-VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-VI-NEXT:    v_mov_b32_e32 v2, 0xff
-; GISEL-VI-NEXT:    v_min_i16_e32 v1, 0xff, v0
-; GISEL-VI-NEXT:    v_min_i16_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GISEL-VI-NEXT:    v_mov_b32_e32 v1, 0xff
+; GISEL-VI-NEXT:    v_min_i16_e32 v2, 0xff, v0
+; GISEL-VI-NEXT:    v_min_i16_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GISEL-VI-NEXT:    v_max_i16_e32 v1, 0, v2
 ; GISEL-VI-NEXT:    v_mov_b32_e32 v2, 0
-; GISEL-VI-NEXT:    v_max_i16_e32 v1, 0, v1
 ; GISEL-VI-NEXT:    v_max_i16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; GISEL-VI-NEXT:    v_or_b32_e32 v0, v1, v0
 ; GISEL-VI-NEXT:    s_setpc_b64 s[30:31]



More information about the cfe-commits mailing list